I have written an app that goes through our own properties and scraps the data. To make sure I don't run through the same URLs, I am using a MySQL database to store the URL, flag it once its processed. All this was being done in a single thread and it's fine if I had only few thousand entries. But I have few hundred thousand entries that I need to parse so I need to make changes in the code (I am newbie in multithreading in general). I found an example and was trying to copy the style but doesn't seem to work. Anyone know what the issue is with the following code?
EDIT: Sorry didn't mean to make people guess the issue but was stupid of me to include the exception. Here is the exception "System.InValidCastException: 'Specified cast is not valid.'" When I start the process it collects the URLs from the database and then never hits DoWork method
//This will get the entries from the database
List<Mappings> items = bot.GetUrlsToProcess(100);
if (items != null)
{
var tokenSource = new CancellationTokenSource();
var token = tokenSource.Token;
Worker.Done = new Worker.DoneDelegate(WorkerDone);
foreach (var item in items)
{
urls.Add(item.Url);
WaitingTasks.Enqueue(new Task(id => new Worker().DoWork((int)id, item.Url, token), item.Url, token));
}
LaunchTasks();
}
static async void LaunchTasks()
{
// keep checking until we're done
while ((WaitingTasks.Count > 0) || (RunningTasks.Count > 0))
{
// launch tasks when there's room
while ((WaitingTasks.Count > 0) && (RunningTasks.Count < MaxRunningTasks))
{
Task task = WaitingTasks.Dequeue();
lock (RunningTasks) RunningTasks.Add((int)task.AsyncState, task);
task.Start();
}
UpdateConsole();
await Task.Delay(300); // wait before checking again
}
UpdateConsole(); // all done
}
static void UpdateConsole()
{
Console.Write(string.Format("\rwaiting: {0,3:##0} running: {1,3:##0} ", WaitingTasks.Count, RunningTasks.Count));
}
static void WorkerDone(int id)
{
lock (RunningTasks) RunningTasks.Remove(id);
}
public class Worker
{
public delegate void DoneDelegate(int taskId);
public static DoneDelegate Done { private get; set; }
public async void DoWork(object id, string url, CancellationToken token)
{
if (token.IsCancellationRequested) return;
Content obj;
try
{
int tries = 0;
bool IsUrlProcessed = true;
DateTime dtStart = DateTime.Now;
string articleDate = string.Empty;
try
{
ScrapeWeb bot = new ScrapeWeb();
SearchApi searchApi = new SearchApi();
SearchHits searchHits = searchApi.Url(url, 5, 0);
if (searchHits.Hits.Count() == 0)
{
obj = await bot.ReturnArticleObject(url);
if (obj.Code != HttpStatusCode.OK)
{
Console.WriteLine(string.Format("\r Status is {0}", obj.Code));
tries = itemfound.UrlMaxTries + 1;
IsUrlProcessed = false;
itemfound.HttpCode = obj.Code;
}
else
{
string title = obj.Title;
string content = obj.Contents;
string description = obj.Description;
Articles article = new Articles();
article.Site = url.GetSite();
article.Content = content;
article.Title = title;
article.Url = url.ToLower();
article.Description = description;
string strThumbNail = HtmlHelper.GetImageUrl(url, obj.RawResponse);
article.Author = HtmlHelper.GetAuthor(url, obj.RawResponse);
if (!string.IsNullOrEmpty(strThumbNail))
{
//This condition needs to be added to remove ?n=<number> from EP thumbnails
if (strThumbNail.Contains("?"))
{
article.ImageUrl = strThumbNail.Substring(0, strThumbNail.IndexOf("?")).Replace("http:", "https:");
}
else
article.ImageUrl = strThumbNail.Replace("http:", "https:");
}
else
{
article.ImageUrl = string.IsNullOrEmpty(strThumbNail) ? article.Url.GetDefaultImageUrls() : strThumbNail.Replace("http:", "https:");
}
articleDate = HtmlHelper.GetPublishDate(url, obj.RawResponse);
if (string.IsNullOrEmpty(articleDate))
article.Pubdate = DateTime.Now;
else
article.Pubdate = DateTime.Parse(articleDate);
var client = new Index(searchApi);
var result = client.Upsert(article);
itemfound.HttpCode = obj.Code;
if (result)
{
itemfound.DateCreated = DateTime.Parse(articleDate);
itemfound.DateModified = DateTime.Parse(articleDate);
UpdateItem(itemfound);
}
else
{
tries = itemfound.UrlMaxTries + 1;
IsUrlProcessed = false;
itemfound.DateCreated = DateTime.Parse(articleDate);
itemfound.DateModified = DateTime.Parse(articleDate) == null ? DateTime.Now : DateTime.Parse(articleDate);
UpdateItem(itemfound, tries, IsUrlProcessed);
}
}
}
else
{
tries = itemfound.UrlMaxTries + 1;
IsUrlProcessed = true;
itemfound.HttpCode = HttpStatusCode.OK;
itemfound.DateCreated = DateTime.Parse(articleDate);
itemfound.DateModified = DateTime.Parse(articleDate) == null ? DateTime.Now : DateTime.Parse(articleDate);
}
}
catch (Exception e)
{
tries = itemfound.UrlMaxTries + 1;
IsUrlProcessed = false;
itemfound.DateCreated = DateTime.Parse(articleDate);
itemfound.DateModified = DateTime.Parse(articleDate) == null ? DateTime.Now : DateTime.Parse(articleDate);
}
finally
{
DateTime dtEnd = DateTime.Now;
Console.WriteLine(string.Format("\r Total time taken to process items is {0}", (dtEnd - dtStart).TotalSeconds));
}
}
catch (Exception e)
{
Console.WriteLine(e);
}
Done((int)id);
}
}
All this code is based from Best multi-thread approach for multiple web requests this link. Can someone tell me how to get this approach running?