2

I'm developing a web scraper, but I need to persist cookies between requests much like I can do in PHP using curl. However, it seems that if I try to use a CookieContainer object in C#, it doesn't grab all of the cookies from the response and send them to the next request.

Here's my C# class:

    /// <summary>
    /// Minimal HTTP scraper that persists cookies across requests through a shared
    /// <see cref="CookieContainer"/>, mimicking PHP/cURL's cookie-jar behaviour.
    /// </summary>
    public class Scraper
    {
        public string Username { get; set; }
        public string Password { get; set; }
        public string UserAgent { get; set; }
        public string ContentType { get; set; }
        // Cookies parsed from the most recent response. NOTE: only cookies the
        // framework managed to parse appear here; Container is the authoritative
        // store and is what actually flows to the next request.
        public CookieCollection Cookies { get; set; }
        // Shared across all requests made by this instance so Set-Cookie values
        // from one response are replayed on subsequent requests (per domain).
        public CookieContainer Container { get; set; }

        public Scraper()
        {
            UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0";
            ContentType = "application/x-www-form-urlencoded";
            Cookies = new CookieCollection();
            Container = new CookieContainer();
        }

        /// <summary>
        /// Issues a GET (or POST when <paramref name="postData"/> is non-empty)
        /// to <paramref name="uri"/> and returns the response body with all
        /// newlines, tabs and inter-tag whitespace stripped.
        /// </summary>
        /// <param name="uri">Absolute URL to request.</param>
        /// <param name="postData">URL-encoded form body; switches the request to POST when non-empty.</param>
        /// <param name="creds">Optional network credentials.</param>
        /// <param name="timeout">Request timeout in milliseconds; ignored when &lt;= 0.</param>
        /// <param name="host">Optional Host header override.</param>
        /// <param name="referer">Optional Referer header.</param>
        /// <param name="requestedwith">Optional X-Requested-With header value.</param>
        /// <returns>The whitespace-stripped response body.</returns>
        public string Load(string uri, string postData = "", NetworkCredential creds = null, int timeout = 60000, string host = "", string referer = "", string requestedwith = "")
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            // Reuse the instance-level container so cookies persist between calls.
            request.CookieContainer = Container;
            request.CookieContainer.Add(Cookies);
            request.UserAgent = UserAgent;
            request.AllowWriteStreamBuffering = true;
            request.ProtocolVersion = HttpVersion.Version11;
            request.AllowAutoRedirect = true;
            request.ContentType = ContentType;
            request.PreAuthenticate = true;

            if (requestedwith.Length > 0)
                request.Headers["X-Requested-With"] = requestedwith;

            if (host.Length > 0)
                request.Host = host;

            if (referer.Length > 0)
                request.Referer = referer;

            if (timeout > 0)
                request.Timeout = timeout;

            if (creds != null)
                request.Credentials = creds;

            if (postData.Length > 0)
            {
                request.Method = "POST";
                byte[] data = Encoding.ASCII.GetBytes(postData);
                request.ContentLength = data.Length;
                // Dispose the request stream so the body is flushed and the
                // underlying connection is not leaked on exception.
                using (Stream newStream = request.GetRequestStream())
                {
                    newStream.Write(data, 0, data.Length);
                }
            }
            else
            {
                request.Method = "GET";
            }

            StringBuilder page;
            // Dispose the response; otherwise the connection stays checked out of
            // the ServicePoint pool and subsequent requests can hang once the
            // per-host connection limit is reached.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                // Snapshot for callers; the container already holds these cookies.
                Cookies = response.Cookies;
                using (StreamReader sr = new StreamReader(response.GetResponseStream()))
                {
                    page = new StringBuilder(sr.ReadToEnd());
                }
            }

            // Strip newlines/tabs, then collapse whitespace between tags to
            // match the PHP implementation's output.
            page.Replace("\r\n", "").Replace("\r", "").Replace("\n", "").Replace("\t", "");
            return Regex.Replace(page.ToString(), @">\s+<", "><");
        }
    }

Here's my PHP code for loading and maintaining cookies in a cookie jar:

    private function load($url = 'http://www.google.com/', $postData = array(), $headers = FALSE)
    {
        $useragent = "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; " . $this->locale . "; rv:1.9.2.10) Gecko/20100914 BRI/1 Firefox/3.6.10 ( .NET CLR 3.5.30729)";

        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($curl, CURLOPT_HEADER, FALSE);
        if($headers) curl_setopt($curl, CURLOPT_HTTPHEADER, array('X-Requested-With: XMLHttpRequest'));
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE);
        curl_setopt($curl, CURLOPT_ENCODING, 'UTF-8');
        curl_setopt($curl, CURLOPT_USERAGENT, $useragent);
        curl_setopt($curl, CURLOPT_POST, !empty($postData));
        if(!empty($postData)) curl_setopt($curl, CURLOPT_POSTFIELDS, $postData);
        curl_setopt($curl, CURLOPT_COOKIEFILE, $this->cookieFile);
        curl_setopt($curl, CURLOPT_COOKIEJAR, $this->cookieFile);
        $page = curl_exec ($curl);
        $page = str_replace(array("\r\n", "\r", "\n", "\t"), "", $page); // strip all new lines and tabs
        $page = preg_replace('~>\s+<~', '><', $page);// strip all whitespace between tags
        curl_close ($curl);

        return $page;
    }

How do I successfully maintain cookies between requests?

Cameron Tinker
  • 9,634
  • 10
  • 46
  • 85
  • I've looked at your code and couldn't see anything conceptually wrong with it. I ran it in a test app to verify that it isn't working but it works as expected. The first request has no cookie data of course, all subsequent requests (using the same instance of Scraper) all contain cookie information verified with Fiddler. This may not work if you load multiple domains since cookies are usually domain specific. As soon as you request from the same domain twice, it works well. Can you elaborate more on what you are expecting? – BrutalDev Aug 24 '13 at 19:07
  • Sorry it's taken so long to get back to you. I'm attempting to login to xbox.com and I have a working implementation in PHP, but I want to move my code to C#. However, it doesn't seem to store the cookies correctly between requests in C#. In PHP, there is a cookie file that maintains all cookie information between requests. I would like a similar implementation in C#, but as far as I know, C# only does in memory cookie storage and doesn't persist them to disk. – Cameron Tinker Sep 04 '13 at 02:42
  • 1
    OK that makes sense. You could simply save the cookie container to disk and read it back in from scraper code before it starts any processing. Since CookieContainer is serializable you can read and write this quite easily in a variety of formats: http://stackoverflow.com/questions/1777203/c-writing-a-cookiecontainer-to-disk-and-loading-back-in-for-use – BrutalDev Sep 04 '13 at 05:46
  • I may end up going with manual cookie handling since CookieContainer doesn't seem to properly parse all cookies on all websites. – Cameron Tinker Sep 27 '13 at 17:17

1 Answer

2

I found a .NET wrapper for libcurl called LibCurl.NET and have been able to handle cookies in the same way as cURL with PHP from C#! Here is my code for anyone interested:

using SeasideResearch.LibCurlNet;
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace Scraping
{
    /// <summary>
    /// Scraper built on LibCurl.NET (a .NET wrapper for libcurl) so cookies can be
    /// persisted to a cookie file on disk, exactly like PHP's cURL cookie jar.
    /// <see cref="CookieFile"/> must point at a writable path before use.
    /// </summary>
    public class LibCurlScraper
    {
        // Accumulates the textual response body of the current request.
        StringBuilder sb = new StringBuilder();
        // Accumulates the binary response body of the current request.
        MemoryStream ms = new MemoryStream();
        // Path of the on-disk cookie jar read before and written after each request.
        public string CookieFile { get; set; }
        // Populated from the Location header when a 302 response is received.
        public string RedirectUrl { get; set; }
        public string UserAgent { get; set; }
        public string ContentType { get; set; }
        // When true, response headers are included in the returned page text.
        public bool DisplayHeaders { get; set; }
        public bool FollowRedirects { get; set; }

        public LibCurlScraper()
        {
            UserAgent = "useragent";
            ContentType = "application/x-www-form-urlencoded";
            // libcurl requires one-time global initialization before any Easy handle.
            Curl.GlobalInit((int)CURLinitFlag.CURL_GLOBAL_ALL);
            DisplayHeaders = false;
        }

        // libcurl write callback: appends each received byte to sb as a char.
        // NOTE(review): byte-to-char casting mangles multi-byte UTF-8 sequences;
        // acceptable for ASCII pages but worth confirming for i18n content.
        private int MyWriteFunction(byte[] buf, int size, int nmemb, Object extraData)
        {
            foreach (byte b in buf)
            {
                sb.Append((char)b);
            }

            // Returning the number of bytes consumed tells libcurl the write succeeded.
            return buf.Length;
        }

        // libcurl write callback for binary payloads: copies bytes into ms verbatim.
        private int MyWriteBinaryFunction(byte[] buf, int size, int nmemb, Object extraData)
        {
            foreach (byte b in buf)
            {
                ms.WriteByte(b);
            }

            return buf.Length;
        }

        /// <summary>
        /// Fetches <paramref name="uri"/> and returns the raw response bytes.
        /// Cookies are loaded from and saved back to <see cref="CookieFile"/>.
        /// </summary>
        public MemoryStream LoadBinary(string uri, string method = "GET", string postData = "", List<string> headers = null)
        {
            ms = new MemoryStream();
            Easy easy = new Easy();
            Easy.WriteFunction wf = MyWriteBinaryFunction;
            easy.SetOpt(CURLoption.CURLOPT_URL, uri);
            easy.SetOpt(CURLoption.CURLOPT_HEADER, false);
            easy.SetOpt(CURLoption.CURLOPT_FOLLOWLOCATION, true);

            Slist headerSlist = new Slist();

            if (headers != null)
            {
                foreach (var header in headers)
                {
                    headerSlist.Append(header);
                }
            }

            easy.SetOpt(CURLoption.CURLOPT_HTTPHEADER, headerSlist);

            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYPEER, false);
            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYHOST, false);
            easy.SetOpt(CURLoption.CURLOPT_USERAGENT, UserAgent);
            easy.SetOpt(CURLoption.CURLOPT_TIMEOUT, 10);
            easy.SetOpt(CURLoption.CURLOPT_CONNECTTIMEOUT, 3);

            if (!string.IsNullOrEmpty(postData))
            {
                easy.SetOpt(CURLoption.CURLOPT_POST, true);
                easy.SetOpt(CURLoption.CURLOPT_POSTFIELDS, postData);
            }

            // COOKIEFILE reads existing cookies; COOKIEJAR writes them back on cleanup.
            easy.SetOpt(CURLoption.CURLOPT_COOKIEFILE, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_COOKIEJAR, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_WRITEFUNCTION, wf);
            easy.Perform();
            int code = 0;
            easy.GetInfo(CURLINFO.CURLINFO_RESPONSE_CODE, ref code);
            easy.Cleanup();

            return ms;
        }

        /// <summary>
        /// Fetches <paramref name="uri"/> and returns the response body with all
        /// newlines, tabs and inter-tag whitespace stripped. On a 302 response,
        /// <see cref="RedirectUrl"/> is populated from the Location header
        /// (requires <see cref="DisplayHeaders"/> to be true so headers are captured).
        /// </summary>
        public string Load(string uri, string method = "GET", string postData = "", List<string> headers = null)
        {
            sb.Clear();
            Easy easy = new Easy();
            Easy.WriteFunction wf = MyWriteFunction;
            easy.SetOpt(CURLoption.CURLOPT_URL, uri);
            easy.SetOpt(CURLoption.CURLOPT_HEADER, DisplayHeaders);
            easy.SetOpt(CURLoption.CURLOPT_FOLLOWLOCATION, FollowRedirects);

            Slist headerSlist = new Slist();

            if (headers != null)
            {
                foreach (var header in headers)
                {
                    headerSlist.Append(header);
                }
            }

            easy.SetOpt(CURLoption.CURLOPT_HTTPHEADER, headerSlist);

            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYPEER, false);
            easy.SetOpt(CURLoption.CURLOPT_SSL_VERIFYHOST, false);
            easy.SetOpt(CURLoption.CURLOPT_USERAGENT, UserAgent);
            easy.SetOpt(CURLoption.CURLOPT_TIMEOUT, 10);
            easy.SetOpt(CURLoption.CURLOPT_CONNECTTIMEOUT, 3);

            if (!string.IsNullOrEmpty(postData))
            {
                easy.SetOpt(CURLoption.CURLOPT_POST, true);
                easy.SetOpt(CURLoption.CURLOPT_POSTFIELDS, postData);
            }

            if (method.Equals("POST"))
            {
                easy.SetOpt(CURLoption.CURLOPT_POST, true);
            }

            // COOKIEFILE reads existing cookies; COOKIEJAR writes them back on cleanup.
            easy.SetOpt(CURLoption.CURLOPT_COOKIEFILE, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_COOKIEJAR, CookieFile);
            easy.SetOpt(CURLoption.CURLOPT_WRITEFUNCTION, wf);
            easy.Perform();
            int code = 0;
            easy.GetInfo(CURLINFO.CURLINFO_RESPONSE_CODE, ref code);
            easy.Cleanup();

            if (code == 302)
            {
                RedirectUrl = FindString(sb.ToString(), "Location:(.*?)\n");
            }

            string page = sb.ToString();
            page = page.Replace("\r\n", ""); // strip all new lines and tabs
            page = page.Replace("\r", "");
            page = page.Replace("\n", "");
            page = page.Replace("\t", "");

            // Collapse whitespace between tags, matching the PHP implementation.
            page = Regex.Replace(page, @">\s+<", "><");

            return page;
        }

        // libcurl verbose/debug callback: echoes to the console and appends to a log file.
        public static void OnDebug(CURLINFOTYPE infoType, String msg, Object extraData)
        {
            Console.WriteLine(msg);
            // 'using' guarantees the writer is flushed and closed even if
            // WriteLine throws (the original leaked the StreamWriter on error).
            using (TextWriter tw = new StreamWriter(@"C:\cookies\verbose.txt", true))
            {
                tw.WriteLine(msg);
            }
        }
    }
}

I have two methods, one for returning a string and one for returning a MemoryStream. You'll need to initialize the CookieFile property and make sure the directory/file is writeable before attempting to write to the file.

I have noticed that problems arise if your cookie file contains old session data from a previous run. This can be fixed by deleting your cookie file before instantiating a new instance of LibCurlScraper and populating the cookie file.

Ideally, we could use the built-in managed classes for all HTTP cookies, but this works until a better solution is found.

EDIT:
I came across some code that properly parses a "Set-Cookie" header. It handles cookies separated by commas and extracts the name, expiration, path, value, and domain of each cookie. This, rather than LibCurl.NET, should be the preferred way to handle cookies when making HTTP requests, and it can be applied to asynchronous requests too.

This code works better than Microsoft's own cookie parser and this is really what the official cookie parser should be doing. I don't have any clue why Microsoft hasn't fixed this yet since it's a very common issue.

Here is the original code: http://snipplr.com/view/4427/

I'm posting it here in case the link goes down at some point:

/// <summary>
/// Parses a raw "Set-Cookie" header (possibly containing several comma-separated
/// cookies) into a <see cref="CookieCollection"/>, defaulting missing domains to
/// <paramref name="strHost"/>.
/// </summary>
/// <param name="strHeader">Raw Set-Cookie header value; may be null or empty.</param>
/// <param name="strHost">Fallback domain for cookies that omit one.</param>
/// <returns>The parsed cookies; empty when the header is null or empty.</returns>
public static CookieCollection GetAllCookiesFromHeader(string strHeader, string strHost)
{
    CookieCollection cc = new CookieCollection();
    // IsNullOrEmpty also guards against null (the original only compared against
    // string.Empty, so a null header crashed inside the helper).
    if (!string.IsNullOrEmpty(strHeader))
    {
        ArrayList al = ConvertCookieHeaderToArrayList(strHeader);
        cc = ConvertCookieArraysToCookieCollection(al, strHost);
    }
    return cc;
}


/// <summary>
/// Splits a raw multi-cookie header on commas into one entry per cookie,
/// re-joining the comma that legitimately appears inside an "expires=" date
/// (e.g. "expires=Mon, 01-Jan-2024 ...").
/// </summary>
/// <param name="strCookHeader">Raw Set-Cookie header text (non-null).</param>
/// <returns>One string per cookie.</returns>
private static ArrayList ConvertCookieHeaderToArrayList(string strCookHeader)
{
    strCookHeader = strCookHeader.Replace("\r", "");
    strCookHeader = strCookHeader.Replace("\n", "");
    string[] strCookTemp = strCookHeader.Split(',');
    ArrayList al = new ArrayList();
    int i = 0;
    int n = strCookTemp.Length;
    while (i < n)
    {
        // The i + 1 < n bounds check fixes an IndexOutOfRangeException in the
        // original when "expires=" appears in the final fragment with no
        // following date tail to re-join.
        if (strCookTemp[i].IndexOf("expires=", StringComparison.OrdinalIgnoreCase) > 0 && i + 1 < n)
        {
            al.Add(strCookTemp[i] + "," + strCookTemp[i + 1]);
            i = i + 1; // consume the date tail as well
        }
        else
        {
            al.Add(strCookTemp[i]);
        }
        i = i + 1;
    }
    return al;
}


/// <summary>
/// Converts the per-cookie strings produced by ConvertCookieHeaderToArrayList
/// into <see cref="Cookie"/> objects, extracting name, value, path and domain.
/// Missing paths default to "/" and missing domains to <paramref name="strHost"/>.
/// </summary>
/// <param name="al">One raw cookie string per entry.</param>
/// <param name="strHost">Fallback domain for cookies that omit one.</param>
/// <returns>The parsed cookie collection.</returns>
private static CookieCollection ConvertCookieArraysToCookieCollection(ArrayList al, string strHost)
{
    CookieCollection cc = new CookieCollection();

    int alcount = al.Count;
    string strEachCook;
    string[] strEachCookParts;
    for (int i = 0; i < alcount; i++)
    {
        strEachCook = al[i].ToString();
        strEachCookParts = strEachCook.Split(';');
        int intEachCookPartsCount = strEachCookParts.Length;
        string strCNameAndCValue = string.Empty;
        string strPNameAndPValue = string.Empty;
        string[] NameValuePairTemp;
        Cookie cookTemp = new Cookie();

        for (int j = 0; j < intEachCookPartsCount; j++)
        {
            if (j == 0)
            {
                // First part is always "name=value".
                strCNameAndCValue = strEachCookParts[j];
                if (strCNameAndCValue != string.Empty)
                {
                    int firstEqual = strCNameAndCValue.IndexOf("=");
                    // Guard: a malformed first part without '=' would make
                    // Substring throw (firstEqual == -1 in the original).
                    if (firstEqual >= 0)
                    {
                        string firstName = strCNameAndCValue.Substring(0, firstEqual);
                        string allValue = strCNameAndCValue.Substring(firstEqual + 1, strCNameAndCValue.Length - (firstEqual + 1));
                        cookTemp.Name = firstName;
                        cookTemp.Value = allValue;
                    }
                }
                continue;
            }
            if (strEachCookParts[j].IndexOf("path", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                strPNameAndPValue = strEachCookParts[j];
                if (strPNameAndPValue != string.Empty)
                {
                    NameValuePairTemp = strPNameAndPValue.Split('=');
                    // Length guard: "path" with no '=' produced a single-element
                    // array and the original's [1] access threw.
                    if (NameValuePairTemp.Length > 1 && NameValuePairTemp[1] != string.Empty)
                    {
                        cookTemp.Path = NameValuePairTemp[1];
                    }
                    else
                    {
                        cookTemp.Path = "/";
                    }
                }
                continue;
            }

            if (strEachCookParts[j].IndexOf("domain", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                strPNameAndPValue = strEachCookParts[j];
                if (strPNameAndPValue != string.Empty)
                {
                    NameValuePairTemp = strPNameAndPValue.Split('=');

                    // Same length guard as for "path" above.
                    if (NameValuePairTemp.Length > 1 && NameValuePairTemp[1] != string.Empty)
                    {
                        cookTemp.Domain = NameValuePairTemp[1];
                    }
                    else
                    {
                        cookTemp.Domain = strHost;
                    }
                }
                continue;
            }
        }

        // Apply defaults when the attributes were absent entirely.
        if (cookTemp.Path == string.Empty)
        {
            cookTemp.Path = "/";
        }
        if (cookTemp.Domain == string.Empty)
        {
            cookTemp.Domain = strHost;
        }
        cc.Add(cookTemp);
    }
    return cc;
}
Cameron Tinker
  • 9,634
  • 10
  • 46
  • 85