0

This program reads a list of web sites and then saves them. I found that it runs well for the first 2 URL requests, then becomes very slow (about 5 minutes per request).

The time spent on row 1 and row 2 is only 2 seconds. All the other rows then take about 5 minutes each.

When I debug, I see that the time is actually spent in wb.Navigate(url.ToString());

    /// <summary>
    /// Loads every URL in <c>rows</c>, one after another, through <c>load_WebStr</c> and
    /// prints a timestamped progress line for each of them.
    /// </summary>
    /// <returns>A task that yields <c>true</c> once every URL has been processed.</returns>
    public static async Task<bool> test()
    {
        long totalCnt = rows.Count();
        long procCnt = 0;
        foreach (string url in rows)
        {
            procCnt++;

            // Await instead of blocking on .Result: the method is already async, so blocking
            // only ties up the calling thread and wraps failures in an AggregateException.
            // NOTE(review): webStr is not used in the visible code; presumably the
            // "save" step mentioned in the question belongs here - confirm.
            string webStr = await load_WebStr(url);
            Console.WriteLine(DateTime.Now+ "["+procCnt + "/" + totalCnt+"]  "+url);
        }

        return true;
    }


/// <summary>
/// Runs <c>webBrowser_Async</c> for <paramref name="url"/> on a newly created STA thread
/// that hosts its own message loop, and returns the string it produces.
/// </summary>
/// <param name="url">The address handed to <c>webBrowser_Async</c>.</param>
/// <returns>
/// A task that yields the result of <c>webBrowser_Async</c>, or faults with the exception
/// that call threw.
/// </returns>
/// <remarks>
/// A new thread is created, started and joined on every call.
/// </remarks>
public static async Task<string> load_WebStr(string url)
{
    // carries the result (or exception) from the STA thread back to the caller
    var tcs = new TaskCompletionSource<string>();

    var thread = new Thread(() =>
    {
        EventHandler idleHandler = null;

        idleHandler = async (s, e) =>
        {
            // handle Application.Idle just once
            Application.Idle -= idleHandler;

            // return to the message loop
            await Task.Yield();

            // and continue asynchronously
            // propagate the result or exception
            try
            {
                var result = await webBrowser_Async(url);
                tcs.SetResult(result);
            }
            catch (Exception ex)
            {
                tcs.SetException(ex);
            }

            // signal to exit the message loop
            // Application.Run will exit at this point
            Application.ExitThread();
        };

        // handle Application.Idle just once
        // to make sure we're inside the message loop
        // and SynchronizationContext has been correctly installed
        Application.Idle += idleHandler;
        Application.Run();
    });

    // set STA model for the new thread
    thread.SetApartmentState(ApartmentState.STA);

    // start the thread and await for the task
    thread.Start();
    try
    {
        return await tcs.Task;
    }
    finally
    {
        // wait for the thread to finish before returning, whether the task succeeded or faulted
        thread.Join();
    }

}


/// <summary>
/// Navigates a temporary <c>WebBrowser</c> control to <paramref name="url"/>, waits for its
/// <c>DocumentCompleted</c> event and returns the control's <c>DocumentText</c>.
/// </summary>
/// <param name="url">The address to navigate to.</param>
/// <returns>
/// The <c>DocumentText</c> of the control, read after the wait has finished. It is also read
/// when navigation threw, after the error has been written to the console.
/// </returns>
/// <remarks>
/// A new <c>WebBrowser</c> is created and disposed on every call.
/// NOTE(review): presumably this has to run on an STA thread with a message loop, as
/// <c>load_WebStr</c> arranges - confirm before calling it from elsewhere.
/// </remarks>
public static async Task<string> webBrowser_Async(string url)
{
    using (var wb = new WebBrowser())
    {
        wb.ScriptErrorsSuppressed = true;

        // Created before the handler so the lambda never captures a null reference.
        var tcs = new TaskCompletionSource<bool>();

        // TrySetResult (not SetResult) so a repeated DocumentCompleted event does not throw.
        WebBrowserDocumentCompletedEventHandler documentCompletedHandler =
            (s, e) => tcs.TrySetResult(true);

        wb.DocumentCompleted += documentCompletedHandler;
        try
        {
            wb.Navigate(url);
            // wait for DocumentCompleted
            await tcs.Task;
        }
        catch (Exception ex)
        {
            // Still best-effort (the method goes on to return whatever text is available),
            // but the failure is reported instead of being discarded.
            Console.WriteLine("BUG! " + ex);
        }
        finally
        {
            // unsubscribe only after the wait is over
            wb.DocumentCompleted -= documentCompletedHandler;
        }

        // the DOM is ready
        return wb.DocumentText;
    }
}
noseratio
  • 59,932
  • 34
  • 208
  • 486
Benny Ae
  • 1,897
  • 7
  • 25
  • 37
  • Erm...pretty sure your tasks never complete. You remove the `DocumentCompleted` handler before it is completed. – Aron Feb 12 '15 at 02:09

1 Answer

0

I recognize a slightly modified version of the code I used to answer quite a few WebBrowser-related questions. Was it this one? It's always a good idea to include a link to the original source.

Anyhow, the major problem in how you're using it here is perhaps the fact that you create and destroy an instance of WebBrowser control for every URL from your list.

Instead, you should be re-using a single instance of WebBrowser (or a pool of WebBrowser objects). You can find both versions here.

Community
  • 1
  • 1
noseratio
  • 59,932
  • 34
  • 208
  • 486