My goal is to get status code for about 5k URL.
Constraints:
1/ if the URL A redirects to URL B, then get the status code of the URL B.
2/ If it's timed out, then retry for 3 times.
This is what I implemented:
Parallel.ForEach(
linkList,
new ParallelOptions() {MaxDegreeOfParallelism=64},
link=>
{
HtmlAnalyzor htmlAnalyzor = new HtmlAnalyzor(link.URL);
int statusCode=-1;
for (int retryTime = 2; retryTime >= 0; retryTime--)
{
statusCode = htmlAnalyzor.GetDestinationURLStatusCode(link.URL, link.IdQualityPage,retryTime);
if (statusCode!=-1 && statusCode!=0) { break; }
}
linkStatusCodeDic.Add(link, statusCode);
});
public int GetDestinationURLStatusCode(string originalURL,int qPageId, int retryTime)
{
try
{
Console.WriteLine("URL:{0}",originalURL);
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(originalURL);
request.Method = "HEAD";
request.Timeout = 10000;
//Half of the time, the line below will throw a WebException and give me a statusCode=0;
_Response = (HttpWebResponse)request.GetResponse();
string destURL = _Response.ResponseUri.ToString();
if (originalURL != destURL)
{
GetDestinationURLStatusCode(destURL,qPageId,retryTime);
}
int statusCode = (int)_Response.StatusCode;
_Response.Close();
Console.WriteLine("Normal:{0}", statusCode);
return statusCode;
}catch(WebException webEx)
{
int statusCode = 0;
if (webEx.Status == WebExceptionStatus.ProtocolError)
{
//statusCode = (int)((HttpWebResponse)webEx.Response).StatusCode;
Console.WriteLine("WebEx:{0}", statusCode);
}
if (_Response != null)
{
_Response.Close();
}
return statusCode;
}
catch(Exception ex)
{
if (_Response != null)
{
_Response.Close();
}
if(retryTime==0)
{
Console.WriteLine("Failed to get status code for URL['{1}'] on the Page[Code:{2}].{0}ErrorMessage:{3}", Environment.NewLine, _URL, pageId, ex.Message);
}
return -1;
}
}
Result Of My Code: half of the time, it will throw a WebException and give me a status code = 0.
What I've tried to change this situation:
1/ I've changed MaxDegreeOfParallelism to 40 and 20, it doesn't work.
2/ I've changed request.TimeOut to 20s, 30s, even 90s, it doesn't work.