Saturday, December 8, 2012

Basic HttpClient class

So, I do a lot of web scraping - work fit for an episode of Dirty Jobs, I'd say - and I end up writing a lot of code to do it.  My standard toolkit includes HtmlAgilityPack for HTML parsing, Fiddler for monitoring network traffic, Firebug for both, HttpWeb(Request|Response) (System.Net), and a web scraping library that I wrote to simplify my life, modifying the code as needed.

So, here's a bare-bones version of a basic HttpWebClient that stores cookies for authentication, sessions, and so on - an excerpt of the library I use.  Obviously it doesn't process JavaScript or deal with JavaScript-set cookies; have fun with those.  It uses the C# 5 async keyword, so .NET 4.5 is required.


Code:
/// <summary>
/// Minimal HTTP client built on <see cref="HttpWebRequest"/> that shares one
/// <see cref="CookieContainer"/> across requests so that cookie-based
/// sessions and logins persist between calls.
/// </summary>
public class HttpWebClient
{
    // Accept header sent when the caller does not supply one.
    private const string DefaultAccept = "application/json,text/javascript,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

    /// <summary>Proxy applied to every request created by this client; null means "do not override".</summary>
    public WebProxy WebProxy { get; set; }

    /// <summary>Cookie store attached to every request created by this client.</summary>
    public CookieContainer CookieContainer { get; set; }

    private readonly int _timeoutMilliseconds;

    /// <summary>
    /// Applies process-wide <see cref="ServicePointManager"/> settings the first
    /// time the class is used.
    /// </summary>
    static HttpWebClient()
    {
        ServicePointManager.UseNagleAlgorithm = true;
        ServicePointManager.MaxServicePoints = 500;
        ServicePointManager.DefaultConnectionLimit = 500;
        ServicePointManager.Expect100Continue = false;
    }

    /// <summary>
    /// Creates a client with a fresh, empty cookie container.
    /// </summary>
    /// <param name="proxy">Optional proxy for all requests.</param>
    /// <param name="timeoutMilliseconds">Per-request timeout; defaults to 30 seconds.</param>
    public HttpWebClient(WebProxy proxy = null, int timeoutMilliseconds = 30000)
    {
        this.WebProxy = proxy;
        this.CookieContainer = new CookieContainer();

        _timeoutMilliseconds = timeoutMilliseconds;
    }

    /// <summary>Issues a GET request to <paramref name="url"/> and returns the response.</summary>
    public async Task<HttpWebResponse> HttpGet(string url)
    {
        var request = ConstructHttpGetRequest(url);
        return await GetHttpResponse(request);
    }

    /// <summary>
    /// Issues a POST request with a pre-built body. The body is sent as-is, so the
    /// caller is responsible for any URL-encoding it requires.
    /// </summary>
    public async Task<HttpWebResponse> HttpPost(string url, string postData)
    {
        var request = ConstructHttpPostRequest(url, postData);
        return await GetHttpResponse(request);
    }

    /// <summary>
    /// Issues a POST request whose body is the URL-encoded form built from
    /// <paramref name="valueDictionary"/> (see <see cref="GeneratePostBody"/>).
    /// </summary>
    public async Task<HttpWebResponse> HttpPost(string url, Dictionary<string, string> valueDictionary)
    {
        var postData = GeneratePostBody(valueDictionary);
        var request = ConstructHttpPostRequest(url, postData);
        return await GetHttpResponse(request);
    }

    /// <summary>Builds (but does not send) a GET request with this client's defaults.</summary>
    public HttpWebRequest ConstructHttpGetRequest(string url)
    {
        return CreateDefaultHttpWebRequest(url, "GET");
    }

    /// <summary>
    /// Builds a POST request whose body is the URL-encoded form of
    /// <paramref name="valueDictionary"/>. The body is written to the request
    /// stream before this method returns.
    /// </summary>
    /// <param name="url">Target URL.</param>
    /// <param name="valueDictionary">Form fields to send.</param>
    /// <param name="host">Currently unused; retained so existing callers keep compiling.</param>
    public HttpWebRequest ConstructHttpPostRequest(string url, Dictionary<string, string> valueDictionary, string host = null)
    {
        var postData = GeneratePostBody(valueDictionary);
        return ConstructHttpPostRequest(url, postData);
    }

    /// <summary>
    /// Builds a POST request and writes <paramref name="postData"/> to its
    /// request stream before returning.
    /// </summary>
    public HttpWebRequest ConstructHttpPostRequest(string url, string postData)
    {
        var request = CreateDefaultHttpWebRequest(url, "POST");
        WriteToHttpWebRequestStream(request, postData);
        return request;
    }

    /// <summary>Writes <paramref name="data"/> to the request body, encoded as UTF-8.</summary>
    protected void WriteToHttpWebRequestStream(HttpWebRequest httpWebRequest, string data)
    {
        // UTF-8 yields the same bytes as ASCII for ASCII-only input, but does not
        // silently replace other characters with '?'.
        WriteToHttpWebRequestStream(httpWebRequest, Encoding.UTF8.GetBytes(data));
    }

    /// <summary>Writes raw bytes to the request body and closes the request stream.</summary>
    protected void WriteToHttpWebRequestStream(HttpWebRequest httpWebRequest, byte[] data)
    {
        using (var requestStream = httpWebRequest.GetRequestStream())
        {
            requestStream.Write(data, 0, data.Length);
        }
    }

    /// <summary>
    /// Creates a request pre-populated with this client's defaults: HTTP/1.0,
    /// the configured timeout, the shared cookie container, form-urlencoded
    /// content type, and the proxy when one is set.
    /// </summary>
    /// <param name="url">Target URL.</param>
    /// <param name="method">HTTP method, e.g. "GET" or "POST".</param>
    /// <param name="accept">Accept header value; when null a broad default is used.</param>
    protected HttpWebRequest CreateDefaultHttpWebRequest(string url, string method, string accept=null)
    {
        var request = (HttpWebRequest)WebRequest.Create(url);

        // Default to HTTP 1.0
        request.ProtocolVersion = HttpVersion.Version10;

        request.Timeout = _timeoutMilliseconds;
        // Authority keeps a non-default port (host:port); Uri.Host alone would drop it.
        request.Host = request.RequestUri.Authority;
        request.CookieContainer = CookieContainer;
        request.Method = method;
        request.Accept = accept ?? DefaultAccept;
        request.ContentType = "application/x-www-form-urlencoded";
        request.Headers["Accept-Charset"] = "ISO-8859-1,utf-8;q=0.7,*;q=0.7";

        if (this.WebProxy != null)
        {
            request.Proxy = this.WebProxy;
        }

        return request;
    }

    /// <summary>
    /// Sends <paramref name="request"/> asynchronously and returns its response.
    /// </summary>
    /// <remarks>
    /// <see cref="HttpWebRequest.Timeout"/> is not applied by the framework to
    /// BeginGetResponse, so the timeout is enforced here: if no response arrives
    /// within <c>request.Timeout</c> milliseconds the request is aborted, and the
    /// resulting <see cref="WebException"/> propagates to the caller.
    /// </remarks>
    public async Task<HttpWebResponse> GetHttpResponse(HttpWebRequest request)
    {
        var responseTask = Task<HttpWebResponse>.Factory.FromAsync(
            request.BeginGetResponse,
            asyncResult => (HttpWebResponse)request.EndGetResponse(asyncResult),
            null);

        var timeout = request.Timeout;
        if (timeout != System.Threading.Timeout.Infinite)
        {
            using (var delayCancellation = new System.Threading.CancellationTokenSource())
            {
                var delayTask = Task.Delay(timeout, delayCancellation.Token);
                var finished = await Task.WhenAny(responseTask, delayTask);

                if (finished == responseTask)
                {
                    // Response won the race; release the timer promptly.
                    delayCancellation.Cancel();
                }
                else
                {
                    // Timed out: aborting makes the pending EndGetResponse throw.
                    request.Abort();
                }
            }
        }

        return await responseTask;
    }

    /// <summary>
    /// Discards all stored cookies by swapping in a new, empty container.
    /// Does nothing when <see cref="CookieContainer"/> is currently null.
    /// </summary>
    public void ClearSession()
    {
        if (CookieContainer != null)
        {
            CookieContainer = new CookieContainer();
        }
    }

    /// <summary>
    /// Builds an <c>application/x-www-form-urlencoded</c> body from the given
    /// key/value pairs. Keys and values are URL-encoded, so pass them unencoded.
    /// </summary>
    /// <param name="postValues">Form fields; a null value is sent as an empty string.</param>
    /// <returns>Pairs in the form <c>key=value</c> joined by <c>&amp;</c>; empty string for an empty dictionary.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="postValues"/> is null.</exception>
    public static string GeneratePostBody(Dictionary<string, string> postValues)
    {
        if (postValues == null)
        {
            throw new ArgumentNullException("postValues");
        }

        var encodedPairs = postValues.Select(
            kv => WebUtility.UrlEncode(kv.Key) + "=" + WebUtility.UrlEncode(kv.Value));
        return String.Join("&", encodedPairs);
    }
}


On top of this basic class, you can build APIs for various websites and HTTP services.  Nothing fancy at all, just saves a bit of typing.  Change whatever properties you need and enjoy!

No comments:

Post a Comment