-1

I have written an app that goes through our own properties and scraps the data. To make sure I don't run through the same URLs, I am using a MySQL database to store the URL, flag it once its processed. All this was being done in a single thread and it's fine if I had only few thousand entries. But I have few hundred thousand entries that I need to parse so I need to make changes in the code (I am newbie in multithreading in general). I found an example and was trying to copy the style but doesn't seem to work. Anyone know what the issue is with the following code?

EDIT: Sorry didn't mean to make people guess the issue but was stupid of me to include the exception. Here is the exception "System.InValidCastException: 'Specified cast is not valid.'" When I start the process it collects the URLs from the database and then never hits DoWork method

//This will get the entries from the database

List<Mappings> items = bot.GetUrlsToProcess(100);
if (items != null)
{
    var tokenSource = new CancellationTokenSource();
    var token = tokenSource.Token;
    Worker.Done = new Worker.DoneDelegate(WorkerDone);
    foreach (var item in items)
    {
        urls.Add(item.Url);
        WaitingTasks.Enqueue(new Task(id => new Worker().DoWork((int)id, item.Url, token), item.Url, token));
     }
     LaunchTasks();

 }


static async void LaunchTasks()
{
        // keep checking until we're done
        while ((WaitingTasks.Count > 0) || (RunningTasks.Count > 0))
        {
            // launch tasks when there's room
            while ((WaitingTasks.Count > 0) && (RunningTasks.Count < MaxRunningTasks))
            {
                Task task = WaitingTasks.Dequeue();
                lock (RunningTasks) RunningTasks.Add((int)task.AsyncState, task);
                task.Start();
            }
            UpdateConsole();
            await Task.Delay(300); // wait before checking again
        }
        UpdateConsole();    // all done
 }


 static void UpdateConsole()
 {
        Console.Write(string.Format("\rwaiting: {0,3:##0}  running: {1,3:##0} ", WaitingTasks.Count, RunningTasks.Count));
 }

 static void WorkerDone(int id)
 {
        lock (RunningTasks) RunningTasks.Remove(id);
 }


 public class Worker
 {
    public delegate void DoneDelegate(int taskId);
    public static DoneDelegate Done { private get; set; }

    public async void DoWork(object id, string url, CancellationToken token)
    {
        if (token.IsCancellationRequested) return;
        Content obj;
        try
        {
            int tries = 0;
            bool IsUrlProcessed = true;

            DateTime dtStart = DateTime.Now;
            string articleDate = string.Empty;

            try
            {
                ScrapeWeb bot = new ScrapeWeb();

                SearchApi searchApi = new SearchApi();
                SearchHits searchHits = searchApi.Url(url, 5, 0);
                if (searchHits.Hits.Count() == 0)
                {
                    obj = await bot.ReturnArticleObject(url);
                    if (obj.Code != HttpStatusCode.OK)
                    {
                        Console.WriteLine(string.Format("\r Status is {0}", obj.Code));
                        tries = itemfound.UrlMaxTries + 1;
                        IsUrlProcessed = false;
                        itemfound.HttpCode = obj.Code;
                    }
                    else
                    {

                        string title = obj.Title;
                        string content = obj.Contents;
                        string description = obj.Description;

                        Articles article = new Articles();
                        article.Site = url.GetSite();
                        article.Content = content;
                        article.Title = title;
                        article.Url = url.ToLower();
                        article.Description = description;
                        string strThumbNail = HtmlHelper.GetImageUrl(url, obj.RawResponse);
                        article.Author = HtmlHelper.GetAuthor(url, obj.RawResponse);
                        if (!string.IsNullOrEmpty(strThumbNail))
                        {
                            //This condition needs to be added to remove ?n=<number> from EP thumbnails
                            if (strThumbNail.Contains("?"))
                            {
                                article.ImageUrl = strThumbNail.Substring(0, strThumbNail.IndexOf("?")).Replace("http:", "https:");
                            }
                            else
                                article.ImageUrl = strThumbNail.Replace("http:", "https:");
                        }
                        else
                        {
                            article.ImageUrl = string.IsNullOrEmpty(strThumbNail) ? article.Url.GetDefaultImageUrls() : strThumbNail.Replace("http:", "https:");
                        }

                        articleDate = HtmlHelper.GetPublishDate(url, obj.RawResponse);
                        if (string.IsNullOrEmpty(articleDate))
                            article.Pubdate = DateTime.Now;
                        else
                            article.Pubdate = DateTime.Parse(articleDate);


                        var client = new Index(searchApi);
                        var result = client.Upsert(article);
                        itemfound.HttpCode = obj.Code;
                        if (result)
                        {
                            itemfound.DateCreated = DateTime.Parse(articleDate);
                            itemfound.DateModified = DateTime.Parse(articleDate);
                            UpdateItem(itemfound);
                        }
                        else
                        {
                            tries = itemfound.UrlMaxTries + 1;
                            IsUrlProcessed = false;
                            itemfound.DateCreated = DateTime.Parse(articleDate);
                            itemfound.DateModified = DateTime.Parse(articleDate) == null ? DateTime.Now : DateTime.Parse(articleDate);
                            UpdateItem(itemfound, tries, IsUrlProcessed);
                        }
                    }
                }
                else
                {
                    tries = itemfound.UrlMaxTries + 1;
                    IsUrlProcessed = true;
                    itemfound.HttpCode = HttpStatusCode.OK;
                    itemfound.DateCreated = DateTime.Parse(articleDate);
                    itemfound.DateModified = DateTime.Parse(articleDate) == null ? DateTime.Now : DateTime.Parse(articleDate);
                }
            }
            catch (Exception e)
            {
                tries = itemfound.UrlMaxTries + 1;
                IsUrlProcessed = false;
                itemfound.DateCreated = DateTime.Parse(articleDate);
                itemfound.DateModified = DateTime.Parse(articleDate) == null ? DateTime.Now : DateTime.Parse(articleDate);

            }
            finally
            {
                DateTime dtEnd = DateTime.Now;

                Console.WriteLine(string.Format("\r Total time taken to process items is {0}", (dtEnd - dtStart).TotalSeconds));

            }
        }
        catch (Exception e)
        {
            Console.WriteLine(e);
        }




        Done((int)id);
    }

}

All this code is based from Best multi-thread approach for multiple web requests this link. Can someone tell me how to get this approach running?

d219
  • 2,707
  • 5
  • 31
  • 36
Subrato M
  • 159
  • 1
  • 12
  • 3
    Don't ask us what the issue is, you tell us the problem. We don't have a database of 100,000 URL's to run this on and that is a lot of code to look over and "try to guess the problem". Please tell us what you want to happen and what is actually happening. If there is an error, tell us what line and exactly the error you are getting. – Ron Beyer Mar 30 '18 at 18:53
  • "doesn't seem to work" isn't a sufficient description of the problem. – hatchet - done with SOverflow Mar 30 '18 at 18:55
  • I get an exception "System.InValidCastException: 'Specified cast is not valid.'" When I start the process it collects the URLs from the database and then never hits DoWork method. – Subrato M Mar 30 '18 at 18:55
  • 4
    @SubratoM Please tell us what line this exception occurs on, we can't guess. – Ron Beyer Mar 30 '18 at 18:56

2 Answers2

1

I think the problem is in the way you're creating your tasks:

new Task(id => new Worker().DoWork((int)id, item.Url, token), item.Url, token)

This Task constructor overload expected Action<object> delegate. That means id will be typed as object and you need to cast it back to something useful first.

Parameters

action

  • Type: System.Action<Object>
  • The delegate that represents the code to execute in the task.

state

  • Type: System.Object
  • An object representing data to be used by the action.

cancellationToken

  • Type: System.Threading.CancellationToken -The CancellationToken that that the new task will observe.

You decided to cast it to int by calling (int)id, but you're passing item.Url as the object itself. I can't tell you 100% what the type of Url is but I don't expect Url-named property to be of type int.

MarcinJuraszek
  • 124,003
  • 15
  • 196
  • 263
  • how would I pass string to DoWork? I noticed the exceptions on this line RunningTasks.Add((int)task.AsyncState, task); Add method only takes in while I don't have id to pass along to it. Url is basically the URL i want to scan and then work on the data that is being returned. – Subrato M Mar 31 '18 at 00:20
0

Based on what @MarcinJuraszek said I just went back to my code and added an int as I couldn't find another way to resolve it. Here is the change I made

int i=0
foreach (var item in items)
{
    urls.Add(item.Url);
    WaitingTasks.Enqueue(new Task(id => new Worker().DoWork((string)id, item.Url, token), item.Url, token));
    i++;
 }
Subrato M
  • 159
  • 1
  • 12