So, I do a lot of web scraping - work fit for an episode of Dirty Jobs, I'd say - and I end up writing a lot of code to do this. My standard toolkit includes HtmlAgilityPack for HTML parsing, Fiddler for monitoring network traffic, Firebug for both, HttpWeb(Request|Response) (System.Net), and a web scraping library that I wrote to simplify my life, modifying the code as needed.
So, here's a bare-bones version of a basic HttpWebClient that stores cookies for authentication, sessions, and stuff - an excerpt of the library I use. Obviously, it doesn't process JavaScript or deal with JavaScript-set cookies; have fun with those. It uses the C# 5 async keyword, so .NET 4.5 is required.
Code:
/// <summary>
/// Minimal HTTP client built on <see cref="HttpWebRequest"/>. Every request created by an
/// instance shares that instance's <see cref="CookieContainer"/>, so cookies set by one
/// response are sent with later requests. Does not execute JavaScript.
/// </summary>
public class HttpWebClient
{
    /// <summary>Accept header sent when the caller does not supply one.</summary>
    private const string DefaultAccept =
        "application/json,text/javascript,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

    private readonly int _timeoutMilliseconds;

    /// <summary>Proxy applied to newly created requests; <c>null</c> leaves the request's default proxy in place.</summary>
    public WebProxy WebProxy { get; set; }

    /// <summary>Cookie store attached to every request created by this client.</summary>
    public CookieContainer CookieContainer { get; set; }

    /// <summary>Applies process-wide <see cref="ServicePointManager"/> settings once, before first use of the class.</summary>
    static HttpWebClient()
    {
        ServicePointManager.UseNagleAlgorithm = true;
        ServicePointManager.MaxServicePoints = 500;
        ServicePointManager.DefaultConnectionLimit = 500;
        ServicePointManager.Expect100Continue = false;
    }

    /// <summary>Creates a client with a fresh, empty cookie container.</summary>
    /// <param name="proxy">Optional proxy used for all requests created by this client.</param>
    /// <param name="timeoutMilliseconds">Value assigned to <see cref="HttpWebRequest.Timeout"/> on each created request.</param>
    public HttpWebClient(WebProxy proxy = null, int timeoutMilliseconds = 30000)
    {
        this.WebProxy = proxy;
        this.CookieContainer = new CookieContainer();
        _timeoutMilliseconds = timeoutMilliseconds;
    }

    /// <summary>Issues a GET request and returns the response.</summary>
    /// <param name="url">Absolute URL to request.</param>
    public async Task<HttpWebResponse> HttpGet(string url)
    {
        var request = ConstructHttpGetRequest(url);
        return await GetHttpResponse(request).ConfigureAwait(false);
    }

    /// <summary>Issues a POST request with an already-encoded body and returns the response.</summary>
    /// <param name="url">Absolute URL to request.</param>
    /// <param name="postData">Request body, written as-is (UTF-8).</param>
    public async Task<HttpWebResponse> HttpPost(string url, string postData)
    {
        var request = ConstructHttpPostRequest(url, postData);
        return await GetHttpResponse(request).ConfigureAwait(false);
    }

    /// <summary>Issues a form POST built from <paramref name="valueDictionary"/> and returns the response.</summary>
    /// <param name="url">Absolute URL to request.</param>
    /// <param name="valueDictionary">Raw (unencoded) form fields; see <see cref="GeneratePostBody"/>.</param>
    public async Task<HttpWebResponse> HttpPost(string url, Dictionary<string, string> valueDictionary)
    {
        var request = ConstructHttpPostRequest(url, valueDictionary);
        return await GetHttpResponse(request).ConfigureAwait(false);
    }

    /// <summary>Builds (but does not send) a GET request with this client's defaults.</summary>
    public HttpWebRequest ConstructHttpGetRequest(string url)
    {
        return CreateDefaultHttpWebRequest(url, "GET");
    }

    /// <summary>Builds a POST request whose body is the form encoding of <paramref name="valueDictionary"/>.</summary>
    /// <param name="url">Absolute URL to request.</param>
    /// <param name="valueDictionary">Raw (unencoded) form fields.</param>
    /// <param name="host">Currently ignored; retained so existing callers keep compiling.</param>
    public HttpWebRequest ConstructHttpPostRequest(string url, Dictionary<string, string> valueDictionary, string host = null)
    {
        var postData = GeneratePostBody(valueDictionary);
        return ConstructHttpPostRequest(url, postData);
    }

    /// <summary>
    /// Builds a POST request and writes <paramref name="postData"/> to its request stream.
    /// Obtaining the request stream is a synchronous call.
    /// </summary>
    public HttpWebRequest ConstructHttpPostRequest(string url, string postData)
    {
        var request = CreateDefaultHttpWebRequest(url, "POST");
        WriteToHttpWebRequestStream(request, postData);
        return request;
    }

    /// <summary>Writes <paramref name="data"/> to the request body as UTF-8.</summary>
    protected void WriteToHttpWebRequestStream(HttpWebRequest httpWebRequest, string data)
    {
        // UTF-8 yields the same bytes as ASCII for ASCII text, but does not
        // replace other characters with '?' the way Encoding.ASCII does.
        WriteToHttpWebRequestStream(httpWebRequest, Encoding.UTF8.GetBytes(data));
    }

    /// <summary>Writes raw bytes to the request body and closes the request stream.</summary>
    protected void WriteToHttpWebRequestStream(HttpWebRequest httpWebRequest, byte[] data)
    {
        using (var requestStream = httpWebRequest.GetRequestStream())
        {
            requestStream.Write(data, 0, data.Length);
        }
    }

    /// <summary>Creates a request carrying this client's cookies, timeout, proxy and default headers.</summary>
    /// <param name="url">Absolute URL to request.</param>
    /// <param name="method">HTTP method, e.g. "GET" or "POST".</param>
    /// <param name="accept">Accept header value; <c>null</c> selects the built-in default.</param>
    protected HttpWebRequest CreateDefaultHttpWebRequest(string url, string method, string accept = null)
    {
        var request = (HttpWebRequest)WebRequest.Create(url);
        request.ProtocolVersion = HttpVersion.Version10;
        request.Timeout = _timeoutMilliseconds;
        // The Host header is intentionally not set by hand: HttpWebRequest derives it
        // from the URI and keeps a non-default port, which Uri.Host alone would drop.
        request.CookieContainer = CookieContainer;
        request.Method = method;
        request.Accept = accept ?? DefaultAccept;
        request.ContentType = "application/x-www-form-urlencoded";
        request.Headers["Accept-Charset"] = "ISO-8859-1,utf-8;q=0.7,*;q=0.7";
        if (this.WebProxy != null)
        {
            request.Proxy = this.WebProxy;
        }
        return request;
    }

    /// <summary>
    /// Sends <paramref name="request"/> asynchronously and returns its response.
    /// </summary>
    /// <remarks>
    /// <see cref="HttpWebRequest.Timeout"/> is not applied by the framework to
    /// <c>BeginGetResponse</c>, so it is enforced here: when the timeout elapses first the
    /// request is aborted and the returned task faults with the resulting <see cref="WebException"/>.
    /// </remarks>
    public async Task<HttpWebResponse> GetHttpResponse(HttpWebRequest request)
    {
        var responseTask = Task<HttpWebResponse>.Factory.FromAsync(
            request.BeginGetResponse,
            asyncResult => (HttpWebResponse)request.EndGetResponse(asyncResult),
            null);

        using (var timeoutCancellation = new System.Threading.CancellationTokenSource())
        {
            var timeoutTask = Task.Delay(request.Timeout, timeoutCancellation.Token);
            var finished = await Task.WhenAny(responseTask, timeoutTask).ConfigureAwait(false);
            if (finished == responseTask)
            {
                // Stop the timer early instead of leaving it pending until it expires.
                timeoutCancellation.Cancel();
            }
            else
            {
                request.Abort();
            }
        }

        // Awaiting here surfaces either the response or the request's exception.
        return await responseTask.ConfigureAwait(false);
    }

    /// <summary>
    /// Replaces the cookie container with an empty one. Does nothing when the container is
    /// <c>null</c>. Requests created earlier keep their reference to the previous container.
    /// </summary>
    public void ClearSession()
    {
        if (CookieContainer != null)
        {
            CookieContainer = new CookieContainer();
        }
    }

    /// <summary>
    /// Builds an <c>application/x-www-form-urlencoded</c> body from raw key/value pairs.
    /// Keys and values are URL-encoded here, so callers must pass unencoded text.
    /// </summary>
    /// <param name="postValues">Form fields; a <c>null</c> value is emitted as an empty value.</param>
    /// <returns>Pairs of the form <c>key=value</c> joined with <c>&amp;</c>; empty string for no fields.</returns>
    public static string GeneratePostBody(Dictionary<string, string> postValues)
    {
        return String.Join(
            "&",
            postValues.Select(kv => WebUtility.UrlEncode(kv.Key) + "=" + WebUtility.UrlEncode(kv.Value)));
    }
}
On top of this basic class, you can build APIs for various websites and HTTP services. Nothing fancy at all, just saves a bit of typing. Change whatever properties you need and enjoy!
No comments:
Post a Comment