0

My goal is to get the status code for about 5k URLs.

Constraints:
1/ If URL A redirects to URL B, then get the status code of URL B.
2/ If the request times out, then retry up to 3 times.

This is what I implemented:

  // Checks every link in parallel (at most 64 at a time) and records the last status code
  // obtained for each one. Each link gets up to three attempts (retryTime counts 2, 1, 0).
  // A result of -1 or 0 is treated as a failed attempt and triggers the next retry.
  //
  // Gate for linkStatusCodeDic: the Add below runs on several worker threads at once, and a
  // plain Dictionary does not support concurrent writers (assumes linkStatusCodeDic is a
  // Dictionary keyed by link - confirm against its declaration).
  object linkStatusCodeDicGate = new object();
  Parallel.ForEach(
      linkList,
      new ParallelOptions() { MaxDegreeOfParallelism = 64 },
      link =>
      {
          HtmlAnalyzor htmlAnalyzor = new HtmlAnalyzor(link.URL);
          int statusCode = -1;
          for (int retryTime = 2; retryTime >= 0; retryTime--)
          {
              statusCode = htmlAnalyzor.GetDestinationURLStatusCode(link.URL, link.IdQualityPage, retryTime);
              if (statusCode != -1 && statusCode != 0)
              {
                  // A real HTTP status code was obtained; no further retry needed.
                  break;
              }
          }

          // Only the shared-dictionary write is serialized; the HTTP work above stays parallel.
          lock (linkStatusCodeDicGate)
          {
              linkStatusCodeDic.Add(link, statusCode);
          }
      });



/// <summary>
/// Sends a HEAD request to <paramref name="originalURL"/> and returns the HTTP status code of
/// the response. Redirects are followed by the request itself, so the code returned is the one
/// of the final destination URL.
/// </summary>
/// <param name="originalURL">URL to check.</param>
/// <param name="qPageId">Identifier passed by the caller; not used by this method.</param>
/// <param name="retryTime">
/// Retry counter supplied by the caller; the failure message is only printed when it is 0.
/// </param>
/// <returns>
/// The HTTP status code when the server answered (including error codes carried by a
/// protocol-error <see cref="WebException"/>); 0 when a <see cref="WebException"/> carried no
/// HTTP status (for example a timeout); -1 for any other exception.
/// </returns>
public int GetDestinationURLStatusCode(string originalURL,int qPageId, int retryTime)
{
    try
    {
        Console.WriteLine("URL:{0}",originalURL);
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(originalURL);
        request.Method = "HEAD";
        request.Timeout = 10000;
        // Let the request chase redirects itself (this is also the default). The response then
        // already belongs to the destination URL, so no second, recursive request is needed.
        // A recursive call here would overwrite _Response and leave the first response unclosed.
        request.AllowAutoRedirect = true;

        // Reported symptom: about half of the time this call threw a WebException (statusCode 0).
        int statusCode;
        using (_Response = (HttpWebResponse)request.GetResponse())
        {
            // Read the status while the response is still open; 'using' closes it on every path.
            statusCode = (int)_Response.StatusCode;
        }
        Console.WriteLine("Normal:{0}", statusCode);
        return statusCode;
    }
    catch(WebException webEx)
    {
        int statusCode = 0;
        // The response attached to the exception holds a connection too, so dispose it.
        // 'using' accepts null, which is what Response is when no answer was received.
        using (WebResponse errorResponse = webEx.Response)
        {
            HttpWebResponse httpError = errorResponse as HttpWebResponse;
            if (webEx.Status == WebExceptionStatus.ProtocolError && httpError != null)
            {
                // The server did answer, with an error status (404, 500, ...): report that code
                // instead of 0 so the caller does not retry a definitive answer.
                statusCode = (int)httpError.StatusCode;
                Console.WriteLine("WebEx:{0}", statusCode);
            }
        }
        return statusCode;
    }
    catch(Exception ex)
    {
        if(retryTime==0)
        {
            // NOTE(review): _URL and pageId are members declared outside this method, not the
            // originalURL/qPageId parameters - confirm they describe the same link.
            Console.WriteLine("Failed to get status code for URL['{1}'] on the Page[Code:{2}].{0}ErrorMessage:{3}", Environment.NewLine, _URL, pageId, ex.Message);
        }

        return -1;
    }
}

Result of my code: half of the time, it throws a WebException and gives me a status code of 0.
What I've tried to change this situation:
1/ I changed MaxDegreeOfParallelism to 40 and to 20; it didn't help.
2/ I changed request.Timeout to 20s, 30s, and even 90s; it didn't help.

Leona
  • 377
  • 1
  • 4
  • 17

1 Answers1

0

I've changed my code, now it's working. The main points that I've changed are:

  1. Deleted `new ParallelOptions() {MaxDegreeOfParallelism=64}`.

  2. Run the requests in parallel first, then use a traditional for loop to retry the ones that failed in parallel. This increases the success rate.

  3. Modified some HttpWebRequest parameters:

    request.UserAgent ="html-analyzor";
    request.KeepAlive = false;
    request.Timeout =15000;

Here's the code:

// Pass 1 checks every link in parallel; pass 2 re-checks, one at a time, every link whose
// parallel result was not 200, and overwrites its recorded status code.
List<QualityPageLink> linkListToRetrySync = new List<QualityPageLink>();
ServicePointManager.DefaultConnectionLimit = 1000;

// Gate for linkListToRetrySync and linkIdStatusCodeDic: both are written from several worker
// threads in pass 1, and List/Dictionary do not support concurrent writers (assumes
// linkIdStatusCodeDic is a plain Dictionary - confirm against its declaration).
object resultsGate = new object();

// No ParallelOptions / MaxDegreeOfParallelism here: the degree of parallelism is left uncapped.
Parallel.ForEach(
    linkList,
    link =>
    {
        HtmlAnalyzor htmlAnalyzor = new HtmlAnalyzor(link.URL);
        int statusCode = -1;
        for (int retryTime = 2; retryTime >= 0; retryTime--)
        {
            statusCode = htmlAnalyzor.GetDestinationURLStatusCode(link.URL, link.IdQualityPage, retryTime);
            if (statusCode > 0) { break; }
        }

        // Record the result once per link, after the retry loop. Adding inside the loop would
        // call Add twice for the same key on a second attempt, which throws.
        lock (resultsGate)
        {
            if (statusCode != 200) { linkListToRetrySync.Add(link); }
            linkIdStatusCodeDic.Add(link, statusCode);
        }
    });

// Pass 2: sequential retry. The list was created above, so it is never null, and an empty
// list simply skips the loop.
foreach (QualityPageLink link in linkListToRetrySync)
{
    int statusCode = -1;
    HtmlAnalyzor htmlAnalyzor = new HtmlAnalyzor(link.URL);
    for (int retryTime = 2; retryTime >= 0; retryTime--)
    {
        statusCode = htmlAnalyzor.GetDestinationURLStatusCode(link.URL, link.IdQualityPage, retryTime);
        if (statusCode > 0) { break; }
    }

    // The key already exists from pass 1, so use the indexer to overwrite it.
    linkIdStatusCodeDic[link] = statusCode;
}

 /// <summary>
 /// Requests <paramref name="originalURL"/> and returns the HTTP status code of the response.
 /// </summary>
 /// <param name="originalURL">URL to check.</param>
 /// <param name="qPageId">Identifier passed by the caller; not used by this method.</param>
 /// <param name="retryTime">Retry counter supplied by the caller; not used by this method.</param>
 /// <returns>
 /// The HTTP status code when the server answered (including error codes carried by a
 /// protocol-error <see cref="WebException"/>); 0 when a <see cref="WebException"/> carried no
 /// HTTP status (for example a timeout); -1 for any other exception.
 /// </returns>
 public int GetDestinationURLStatusCode(string originalURL, int qPageId, int retryTime)
 {
     try
     {
         Console.WriteLine("URL:{0}", Helper.ToString(originalURL));
         HttpWebRequest request = (HttpWebRequest)WebRequest.Create(originalURL);
         request.UserAgent = "html-analyzor";
         request.KeepAlive = false;
         request.Timeout = 15000;

         // No manual redirect handling: HttpWebRequest follows redirects unless
         // AllowAutoRedirect is switched off, so this is the destination's response.
         int statusCode;
         using (this._Response = (HttpWebResponse)request.GetResponse())
         {
             statusCode = (int)this._Response.StatusCode;
         }

         Console.WriteLine("Normal:{0}", statusCode);
         return statusCode;
     }
     catch (WebException webEx)
     {
         int statusCode = 0;
         // The response attached to the exception holds a connection too, so dispose it.
         // 'using' accepts null, which is what Response is when no answer was received.
         using (WebResponse errorResponse = webEx.Response)
         {
             HttpWebResponse httpError = errorResponse as HttpWebResponse;
             if (webEx.Status == WebExceptionStatus.ProtocolError && httpError != null)
             {
                 statusCode = (int)httpError.StatusCode;
                 Console.WriteLine("WebEx:{0}", statusCode);
             }
         }
         return statusCode;
     }
     catch (Exception)
     {
         // Best effort: any non-web failure is reported to the caller as -1.
         return -1;
     }
     finally
     {
         // Single cleanup point for every path: release the response held in the field, if any,
         // and clear the field so a closed response is not kept around.
         if (this._Response != null)
         {
             this._Response.Close();
             this._Response = null;
         }
     }
 }
Leona
  • 377
  • 1
  • 4
  • 17