1

I am using some scraping code by Noseratio found here: https://stackoverflow.com/a/22262976/3499115. He wrote it to scrape a list of URLs, but I am using it to render only one URL at a time, from inside the MVC controller of another web crawler that I am using. I call this code each time I find a specific type of link, and it appears that doing this many times is causing me to run out of memory. Maybe a solution would be to use a thread pool and limit the maximum number of threads, but how would I do that with this code? Here is the web crawler code that calls the WebBrowser code:

/// <summary>
/// Renders <paramref name="url"/> through WebBrowserScraper (so that page scripts run)
/// and parses the returned HTML snapshot into an HtmlDocument.
/// </summary>
/// <param name="url">Address of the page to render.</param>
/// <returns>
/// The parsed document, or null when rendering failed or was cancelled
/// (the failure message is written to the console).
/// </returns>
/// <remarks>
/// Blocks the calling thread until the scrape task completes; the overall
/// operation is cancelled after 3 minutes.
/// </remarks>
public static HtmlDocument renderJavascript(string url)
    {
        // using webBrowserScraper
        try
        {
            WebBrowserExt.SetFeatureBrowserEmulation(); // enable HTML5

            // A CancellationTokenSource created with a timeout owns a timer.
            // Dispose it deterministically so that calling this method many
            // times does not accumulate live timers until they fire.
            using (var cts = new CancellationTokenSource(TimeSpan.FromMinutes(3)))
            {
                var task = WebBrowserScraper.ScrapeSitesAsync(url, cts.Token);

                // the task is complete (or has thrown) once Wait returns,
                // so nothing uses cts.Token after the using block ends
                task.Wait();

                // allocate the document only once there is HTML to load
                var doc = new HtmlDocument();
                doc.LoadHtml(task.Result);
                return doc;
            }
        }
        catch (Exception ex)
        {
            // Wait() wraps failures in AggregateException; report the innermost cause
            while (ex is AggregateException && ex.InnerException != null)
                ex = ex.InnerException;
            Console.WriteLine(ex.Message);
        }
        return null;
    }

And here is the WebBrowser code (I only changed the parameter of the ScrapeSitesAsync function to a single string):

using System;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Microsoft.Win32;

namespace Abot.Demo
{
    /// <summary>
    /// Scrapes the rendered HTML of a single page with a WebBrowser control.
    /// Every call creates its own MessageLoopApartment and WebBrowser and
    /// tears both down before returning.
    /// </summary>
    public class WebBrowserScraper
    {
        // by Noseratio - https://stackoverflow.com/a/22262976/1768303

        /// <summary>
        /// Navigates to <paramref name="url"/> and returns the HTML produced by
        /// WebBrowserExt.NavigateAsync.
        /// </summary>
        /// <param name="url">Address of the page to load.</param>
        /// <param name="token">Cancels the whole operation when signalled.</param>
        /// <returns>The HTML snapshot of the page.</returns>
        /// <remarks>
        /// Navigation is cancelled after 10 seconds, or earlier if
        /// <paramref name="token"/> is signalled; in both cases the returned
        /// task ends up cancelled.
        /// </remarks>
        public static async Task<string> ScrapeSitesAsync(string url, CancellationToken token)
        {
            using (var apartment = new MessageLoopApartment())
            {
                // create WebBrowser inside MessageLoopApartment
                var webBrowser = apartment.Invoke(() => new WebBrowser());
                try
                {
                    Console.WriteLine("WebBrowser URL:\n" + url);

                    // Cancel after 10s or when the main token is signalled.
                    // The linked source holds a registration on the parent token
                    // and a timer, so it must be disposed once navigation ends.
                    using (var navigationCts = CancellationTokenSource.CreateLinkedTokenSource(token))
                    {
                        navigationCts.CancelAfter(TimeSpan.FromSeconds(10));
                        var navigationToken = navigationCts.Token;

                        // run the navigation task inside MessageLoopApartment
                        string html = await apartment.Run(() =>
                            webBrowser.NavigateAsync(url, navigationToken), navigationToken);

                        Console.WriteLine("Scrape complete for URL:\n" + url);
                        return html;
                    }
                }
                finally
                {
                    // dispose of WebBrowser inside MessageLoopApartment
                    apartment.Invoke(() => webBrowser.Dispose());
                }
            }
        }
    }

/// <summary>
/// WebBrowserExt - helper extensions for the WinForms WebBrowser control:
/// awaitable navigation that returns an HTML snapshot, plus the
/// FEATURE_BROWSER_EMULATION registry setup.
/// by Noseratio - https://stackoverflow.com/a/22262976/1768303
/// </summary>
public static class WebBrowserExt
{
    // interval, in milliseconds, between two HTML snapshots
    const int POLL_DELAY = 500;

    /// <summary>
    /// Navigates to <paramref name="url"/>, awaits DocumentCompleted and then
    /// polls the root element until two consecutive snapshots are identical.
    /// </summary>
    /// <param name="webBrowser">The control that performs the navigation.</param>
    /// <param name="url">Address to navigate to.</param>
    /// <param name="token">Stops the navigation and cancels the task when signalled.</param>
    /// <returns>The OuterHtml of the "html" element once it stopped changing.</returns>
    public static async Task<string> NavigateAsync(this WebBrowser webBrowser, string url, CancellationToken token)
    {
        // completed by DocumentCompleted, cancelled by the token
        var documentCompleted = new TaskCompletionSource<bool>();

        WebBrowserDocumentCompletedEventHandler onDocumentCompleted = (sender, e) =>
            documentCompleted.TrySetResult(true);

        Action onCancelled = () =>
        {
            webBrowser.Stop();
            documentCompleted.TrySetCanceled();
        };

        using (token.Register(onCancelled, useSynchronizationContext: true))
        {
            webBrowser.DocumentCompleted += onDocumentCompleted;
            try
            {
                webBrowser.Navigate(url);
                await documentCompleted.Task; // resumes after DocumentCompleted
            }
            finally
            {
                webBrowser.DocumentCompleted -= onDocumentCompleted;
            }
        }

        // the root element whose markup is sampled
        var rootElement = webBrowser.Document.GetElementsByTagName("html")[0];

        // sample the HTML asynchronously until it stops changing
        var snapshot = rootElement.OuterHtml;
        for (;;)
        {
            // asynchronous pause; throws when cancellation is requested
            await Task.Delay(POLL_DELAY, token);

            // only compare snapshots while the WebBrowser is not busy
            if (!webBrowser.IsBusy)
            {
                var latest = rootElement.OuterHtml;
                if (latest == snapshot)
                    break; // two identical snapshots in a row: stop polling

                snapshot = latest;
            }
        }

        // the page is considered fully rendered at this point
        token.ThrowIfCancellationRequested();
        return snapshot;
    }

    /// <summary>
    /// Writes the FEATURE_BROWSER_EMULATION value (10000) for the current
    /// executable under HKEY_CURRENT_USER; does nothing outside of runtime mode.
    /// </summary>
    // enable HTML5 (assuming we're running IE10+)
    // more info: https://stackoverflow.com/a/18333982/1768303
    public static void SetFeatureBrowserEmulation()
    {
        var atRuntime =
            System.ComponentModel.LicenseManager.UsageMode == System.ComponentModel.LicenseUsageMode.Runtime;
        if (!atRuntime)
            return;

        // the registry value is named after the executable's file name
        var modulePath = System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName;
        var appName = System.IO.Path.GetFileName(modulePath);

        const string featureKey =
            @"HKEY_CURRENT_USER\Software\Microsoft\Internet Explorer\Main\FeatureControl\FEATURE_BROWSER_EMULATION";
        Registry.SetValue(featureKey, appName, 10000, RegistryValueKind.DWord);
    }
}

/// <summary>
/// MessageLoopApartment
/// STA thread with message pump for serial execution of tasks
/// by Noseratio - https://stackoverflow.com/a/22262976/1768303
/// </summary>
public class MessageLoopApartment : IDisposable
{
    Thread _thread; // the STA thread

    TaskScheduler _taskScheduler; // the STA thread's task scheduler

    /// <summary>The task scheduler obtained on the STA thread; null once Dispose has run.</summary>
    public TaskScheduler TaskScheduler { get { return _taskScheduler; } }

    /// <summary>
    /// MessageLoopApartment constructor: starts the STA thread and blocks the
    /// constructing thread until that thread has handed back its task scheduler.
    /// </summary>
    public MessageLoopApartment()
    {
        // carries the scheduler from the new thread back to this constructor
        var tcs = new TaskCompletionSource<TaskScheduler>();

        // start an STA thread and gets a task scheduler
        _thread = new Thread(startArg =>
        {
            // declared before assignment so the handler can unsubscribe itself
            EventHandler idleHandler = null;

            idleHandler = (s, e) =>
            {
                // handle Application.Idle just once
                Application.Idle -= idleHandler;
                // return the task scheduler
                tcs.SetResult(TaskScheduler.FromCurrentSynchronizationContext());
            };

            // handle Application.Idle just once
            // to make sure we're inside the message loop
            // and SynchronizationContext has been correctly installed
            Application.Idle += idleHandler;
            // runs the message loop on this thread; Dispose ends it via Application.ExitThread()
            Application.Run();
        });

        // apartment state and background flag are set before the thread is started
        _thread.SetApartmentState(ApartmentState.STA);
        _thread.IsBackground = true;
        _thread.Start();
        // blocks until idleHandler has published the scheduler
        _taskScheduler = tcs.Task.Result;
    }

    /// <summary>
    /// shutdown the STA thread; does nothing when the scheduler has already
    /// been cleared by an earlier call
    /// </summary>
    public void Dispose()
    {
        if (_taskScheduler != null)
        {
            // clear the field first so a later call skips this branch
            var taskScheduler = _taskScheduler;
            _taskScheduler = null;

            // execute Application.ExitThread() on the STA thread
            Task.Factory.StartNew(
                () => Application.ExitThread(),
                CancellationToken.None,
                TaskCreationOptions.None,
                taskScheduler).Wait();

            // wait for the STA thread itself to finish
            _thread.Join();
            _thread = null;
        }
    }

    /// <summary>Task.Factory.StartNew wrappers</summary>
    // Invoke: schedule the action on the STA scheduler and block until it has completed
    public void Invoke(Action action)
    {
        Task.Factory.StartNew(action,
            CancellationToken.None, TaskCreationOptions.None, _taskScheduler).Wait();
    }

    // Invoke: as above, but blocks for and returns the function's result
    public TResult Invoke<TResult>(Func<TResult> action)
    {
        return Task.Factory.StartNew(action,
            CancellationToken.None, TaskCreationOptions.None, _taskScheduler).Result;
    }

    // Run: schedule the action on the STA scheduler without blocking; the caller gets the task
    public Task Run(Action action, CancellationToken token)
    {
        return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
    }

    // Run: non-blocking variant for a function that produces a result
    public Task<TResult> Run<TResult>(Func<TResult> action, CancellationToken token)
    {
        return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
    }

    // Run: for a task-returning delegate; Unwrap makes the returned task represent the inner task
    public Task Run(Func<Task> action, CancellationToken token)
    {
        return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler).Unwrap();
    }

    // Run: for a delegate returning Task<TResult>; Unwrap exposes the inner task's result
    public Task<TResult> Run<TResult>(Func<Task<TResult>> action, CancellationToken token)
    {
        return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler).Unwrap();
    }
}

}

JBaczuk
  • 13,886
  • 10
  • 58
  • 86
  • In the comments to the original code you linked, even Noseratio states that he limits the number of WebBrowser threads open at once. – Kevin Cook May 22 '14 at 14:56
  • You're right, but I'm not sure if this would work if I am calling his code over and over from a different source, I think his will limit the number of threads within his own code. – JBaczuk May 22 '14 at 15:11
  • Im never really sure how people who post so much code in their question expect to get a proper answer – Yuval Itzchakov May 23 '14 at 11:06
  • @YuvalItzchakov, in this particular case, the code is very much familiar to me. So I prefer to answer it here than via a private email. Someone else might benefit from this in the future. – noseratio May 23 '14 at 12:04
  • @Noseratio I understand, but if its a direct question to you it might be better to put it as a blog post imo. Even if I had the same problem, reading through so much code on SO would probably make it difficult to understand. – Yuval Itzchakov May 23 '14 at 12:27
  • 1
    @YuvalItzchakov, the OP did not contact me offline, if that's what you mean under "direct question". He posted it here on SO, and it's no surprise that I came by. Same as with his [another question](http://stackoverflow.com/q/23818529/1768303), which I voted to close as a duplicate. – noseratio May 23 '14 at 12:37

1 Answers1

2

One solution is to use SemaphoreSlim to maintain a limited pool of WebBrowser objects to scrape web sites in parallel. It also makes sense to share the common message loop for all WebBrowser instances.

Here is how it can be implemented, based on my console web scraper code you linked. The new part is the WebBrowserPool class (warning: only slightly tested):

using Microsoft.Win32;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace AsyncWebBrowserScraper
{
    /// <summary>
    /// Console test harness: scrapes a fixed list of sites through a
    /// WebBrowserPool and prints the HTML of each one.
    /// </summary>
    class Program
    {
        // by Noseratio - https://stackoverflow.com/a/23819021/1768303

        /// <summary>
        /// Starts one scrape per URL on a pool of two browsers, awaits all of
        /// them and writes each URL followed by its HTML to the console.
        /// </summary>
        /// <param name="urls">The addresses to scrape.</param>
        /// <param name="token">Cancels all scrapes when signalled.</param>
        static async Task ScrapeSitesAsync(string[] urls, CancellationToken token)
        {
            using (var pool = new WebBrowserPool(maxParallel: 2, token: token))
            {
                // per-site limit: 30s, or earlier when the main token is signalled
                int perSiteTimeout = (int)TimeSpan.FromSeconds(30).TotalMilliseconds;

                // every scrape is started here; the pool decides how many run at once
                var scrapes = urls.ToDictionary(
                    address => address,
                    address => pool.ScrapeSiteAsync(address, perSiteTimeout));

                await Task.WhenAll(scrapes.Values);

                foreach (var scrape in scrapes)
                {
                    Console.WriteLine("URL:\n" + scrape.Key);

                    // the task has already completed, so Result does not block
                    string html = scrape.Value.Result;

                    Console.WriteLine("HTML:\n" + html);
                }
            }
        }

        /// <summary>
        /// Entry point: runs the test scrape with a 3-minute overall limit and
        /// exits with code -1 after printing the message of any failure.
        /// </summary>
        static void Main(string[] args)
        {
            var sites = new[]
            {
                "http://example.com",
                "http://example.org",
                "http://example.net",
                "http://www.bing.com",
                "http://www.google.com"
            };

            try
            {
                WebBrowserExt.SetFeatureBrowserEmulation(); // enable HTML5

                var cts = new CancellationTokenSource(TimeSpan.FromMinutes(3));

                // top-level blocking wait for the whole batch
                ScrapeSitesAsync(sites, cts.Token).Wait();

                Console.WriteLine("Press Enter to exit...");
                Console.ReadLine();
            }
            catch (Exception ex)
            {
                // peel off AggregateException wrappers to reach the actual failure
                var failure = ex;
                while (failure is AggregateException && failure.InnerException != null)
                    failure = failure.InnerException;

                Console.WriteLine(failure.Message);
                Environment.Exit(-1);
            }
        }
    }

    /// <summary>
    /// WebBrowserPool - a fixed-size pool of WebBrowser objects sharing the same
    /// message loop. Create one instance, call ScrapeSiteAsync as often as
    /// needed, and call Dispose when done.
    /// </summary>
    public class WebBrowserPool : IDisposable
    {
        MessageLoopApartment _apartment; // a WinForms STA thread with message loop; null once disposed
        readonly SemaphoreSlim _semaphore; // regulate available browsers
        readonly Queue<WebBrowser> _browsers; // the pool of available browsers (touched on the STA thread only)
        readonly HashSet<Task> _pendingTasks; // keep track of pending tasks for proper cancellation (STA thread only)
        readonly CancellationTokenSource _cts; // global cancellation (for Dispose)

        /// <summary>
        /// Creates the message-loop thread and <paramref name="maxParallel"/>
        /// WebBrowser objects on it.
        /// </summary>
        /// <param name="maxParallel">Number of browsers in the pool; must be at least 1.</param>
        /// <param name="token">Cancels every scrape of this pool when signalled.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="maxParallel"/> is less than 1. (This type derives from
        /// ArgumentException, which earlier versions threw.)
        /// </exception>
        public WebBrowserPool(int maxParallel, CancellationToken token)
        {
            if (maxParallel < 1)
                throw new ArgumentOutOfRangeException(
                    "maxParallel", maxParallel, "The pool needs at least one WebBrowser.");

            _cts = CancellationTokenSource.CreateLinkedTokenSource(token);
            _apartment = new MessageLoopApartment();
            _semaphore = new SemaphoreSlim(maxParallel);
            _browsers = new Queue<WebBrowser>();
            _pendingTasks = new HashSet<Task>();

            // init the pool of WebBrowser objects on the STA thread
            _apartment.Invoke(() =>
            {
                for (var i = 0; i < maxParallel; i++)
                    _browsers.Enqueue(new WebBrowser());
            });
        }

        /// <summary>
        /// Navigate to a site and get a snapshot of its DOM HTML.
        /// </summary>
        /// <param name="url">Address of the page to scrape.</param>
        /// <param name="timeout">
        /// Navigation limit in milliseconds, counted from the moment a browser
        /// becomes available; Timeout.Infinite disables it.
        /// </param>
        /// <param name="token">Cancels this particular scrape when signalled.</param>
        /// <returns>The HTML snapshot produced by WebBrowserExt.NavigateAsync.</returns>
        /// <exception cref="ObjectDisposedException">The pool has been disposed.</exception>
        public async Task<string> ScrapeSiteAsync(string url, int timeout, CancellationToken token = default(CancellationToken))
        {
            // take a local copy so a concurrent Dispose cannot null it under us
            var apartment = _apartment;
            if (apartment == null)
                throw new ObjectDisposedException(this.GetType().Name);

            // The linked source registers callbacks on _cts.Token, which lives as
            // long as the pool. Without disposing it, every scrape would leave a
            // registration (and possibly a timer) behind.
            using (var navigationCts = CancellationTokenSource.CreateLinkedTokenSource(token, _cts.Token))
            {
                var combinedToken = navigationCts.Token;

                // we have a limited number of WebBrowser objects available, so await the semaphore
                await _semaphore.WaitAsync(combinedToken);
                try
                {
                    if (timeout != Timeout.Infinite)
                        navigationCts.CancelAfter(timeout);

                    // run the main logic on the STA thread
                    return await apartment.Run(async () =>
                    {
                        // acquire the 1st available WebBrowser from the pool
                        var webBrowser = _browsers.Dequeue();
                        try
                        {
                            var task = webBrowser.NavigateAsync(url, combinedToken);
                            _pendingTasks.Add(task); // register the pending task
                            try
                            {
                                return await task;
                            }
                            finally
                            {
                                // unregister the completed task
                                _pendingTasks.Remove(task);
                            }
                        }
                        finally
                        {
                            // return the WebBrowser to the pool
                            _browsers.Enqueue(webBrowser);
                        }
                    }, combinedToken);
                }
                finally
                {
                    _semaphore.Release();
                }
            }
        }

        /// <summary>
        /// Dispose of WebBrowserPool: cancels and waits for pending navigations,
        /// disposes the browsers and shuts the message-loop thread down.
        /// Calling it again is a no-op.
        /// </summary>
        public void Dispose()
        {
            var apartment = _apartment;
            if (apartment == null)
                return; // already disposed

            _apartment = null;

            try
            {
                // cancel and wait for all pending tasks
                _cts.Cancel();
                var task = apartment.Run(() => Task.WhenAll(_pendingTasks.ToArray()));
                try
                {
                    task.Wait();
                }
                catch
                {
                    // cancellation is the expected outcome here; anything else is a real failure
                    if (!task.IsCanceled)
                        throw;
                }
            }
            finally
            {
                // Dispose of WebBrowser objects on the STA thread. Invoke blocks until
                // this has run, so it is finished before the message loop is shut down,
                // and it still happens when the wait above failed.
                apartment.Invoke(() =>
                {
                    while (_browsers.Count > 0)
                        _browsers.Dequeue().Dispose();
                });

                apartment.Dispose();

                // releases the registration on the token passed to the constructor
                _cts.Dispose();
            }
        }
    }

    /// <summary>
    /// WebBrowserExt - helper extensions for the WinForms WebBrowser control:
    /// awaitable navigation that returns an HTML snapshot, plus the
    /// FEATURE_BROWSER_EMULATION registry setup.
    /// by Noseratio - https://stackoverflow.com/a/22262976/1768303
    /// </summary>
    public static class WebBrowserExt
    {
        // interval, in milliseconds, between two HTML snapshots
        const int POLL_DELAY = 500;

        /// <summary>
        /// Navigates to <paramref name="url"/>, awaits DocumentCompleted and then
        /// polls the root element until two consecutive snapshots are identical.
        /// </summary>
        /// <param name="webBrowser">The control that performs the navigation.</param>
        /// <param name="url">Address to navigate to.</param>
        /// <param name="token">Stops the navigation and cancels the task when signalled.</param>
        /// <returns>The OuterHtml of the "html" element once it stopped changing.</returns>
        public static async Task<string> NavigateAsync(this WebBrowser webBrowser, string url, CancellationToken token)
        {
            // completed by DocumentCompleted, cancelled by the token
            var documentCompleted = new TaskCompletionSource<bool>();

            WebBrowserDocumentCompletedEventHandler onDocumentCompleted = (sender, e) =>
                documentCompleted.TrySetResult(true);

            Action onCancelled = () =>
            {
                webBrowser.Stop();
                documentCompleted.TrySetCanceled();
            };

            using (token.Register(onCancelled, useSynchronizationContext: true))
            {
                webBrowser.DocumentCompleted += onDocumentCompleted;
                try
                {
                    webBrowser.Navigate(url);
                    await documentCompleted.Task; // resumes after DocumentCompleted
                }
                finally
                {
                    webBrowser.DocumentCompleted -= onDocumentCompleted;
                }
            }

            // the root element whose markup is sampled
            var rootElement = webBrowser.Document.GetElementsByTagName("html")[0];

            // sample the HTML asynchronously until it stops changing
            var snapshot = rootElement.OuterHtml;
            for (;;)
            {
                // asynchronous pause; throws when cancellation is requested
                await Task.Delay(POLL_DELAY, token);

                // only compare snapshots while the WebBrowser is not busy
                if (!webBrowser.IsBusy)
                {
                    var latest = rootElement.OuterHtml;
                    if (latest == snapshot)
                        break; // two identical snapshots in a row: stop polling

                    snapshot = latest;
                }
            }

            // the page is considered fully rendered at this point
            token.ThrowIfCancellationRequested();
            return snapshot;
        }

        /// <summary>
        /// Writes the FEATURE_BROWSER_EMULATION value (10000) for the current
        /// executable under HKEY_CURRENT_USER; does nothing outside of runtime mode.
        /// </summary>
        // enable HTML5 (assuming we're running IE10+)
        // more info: https://stackoverflow.com/a/18333982/1768303
        public static void SetFeatureBrowserEmulation()
        {
            var atRuntime =
                System.ComponentModel.LicenseManager.UsageMode == System.ComponentModel.LicenseUsageMode.Runtime;
            if (!atRuntime)
                return;

            // the registry value is named after the executable's file name
            var modulePath = System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName;
            var appName = System.IO.Path.GetFileName(modulePath);

            const string featureKey =
                @"HKEY_CURRENT_USER\Software\Microsoft\Internet Explorer\Main\FeatureControl\FEATURE_BROWSER_EMULATION";
            Registry.SetValue(featureKey, appName, 10000, RegistryValueKind.DWord);
        }
    }

    /// <summary>
    /// MessageLoopApartment
    /// STA thread with message pump for serial execution of tasks
    /// by Noseratio - https://stackoverflow.com/a/22262976/1768303
    /// </summary>
    public class MessageLoopApartment : IDisposable
    {
        Thread _thread; // the STA thread

        TaskScheduler _taskScheduler; // the STA thread's task scheduler

        /// <summary>The task scheduler obtained on the STA thread; null once Dispose has run.</summary>
        public TaskScheduler TaskScheduler { get { return _taskScheduler; } }

        /// <summary>
        /// MessageLoopApartment constructor: starts the STA thread and blocks the
        /// constructing thread until that thread has handed back its task scheduler.
        /// </summary>
        public MessageLoopApartment()
        {
            // carries the scheduler from the new thread back to this constructor
            var tcs = new TaskCompletionSource<TaskScheduler>();

            // start an STA thread and gets a task scheduler
            _thread = new Thread(startArg =>
            {
                // declared before assignment so the handler can unsubscribe itself
                EventHandler idleHandler = null;

                idleHandler = (s, e) =>
                {
                    // handle Application.Idle just once
                    Application.Idle -= idleHandler;
                    // return the task scheduler
                    tcs.SetResult(TaskScheduler.FromCurrentSynchronizationContext());
                };

                // handle Application.Idle just once
                // to make sure we're inside the message loop
                // and SynchronizationContext has been correctly installed
                Application.Idle += idleHandler;
                // runs the message loop on this thread; Dispose ends it via Application.ExitThread()
                Application.Run();
            });

            // apartment state and background flag are set before the thread is started
            _thread.SetApartmentState(ApartmentState.STA);
            _thread.IsBackground = true;
            _thread.Start();
            // blocks until idleHandler has published the scheduler
            _taskScheduler = tcs.Task.Result;
        }

        /// <summary>
        /// shutdown the STA thread; does nothing when the scheduler has already
        /// been cleared by an earlier call
        /// </summary>
        public void Dispose()
        {
            if (_taskScheduler != null)
            {
                // clear the field first so a later call skips this branch
                var taskScheduler = _taskScheduler;
                _taskScheduler = null;

                // execute Application.ExitThread() on the STA thread
                Task.Factory.StartNew(
                    () => Application.ExitThread(),
                    CancellationToken.None,
                    TaskCreationOptions.None,
                    taskScheduler).Wait();

                // wait for the STA thread itself to finish
                _thread.Join();
                _thread = null;
            }
        }

        /// <summary>Task.Factory.StartNew wrappers</summary>
        // Invoke: schedule the action on the STA scheduler and block until it has completed
        public void Invoke(Action action)
        {
            Task.Factory.StartNew(action,
                CancellationToken.None, TaskCreationOptions.None, _taskScheduler).Wait();
        }

        // Invoke: as above, but blocks for and returns the function's result
        public TResult Invoke<TResult>(Func<TResult> action)
        {
            return Task.Factory.StartNew(action,
                CancellationToken.None, TaskCreationOptions.None, _taskScheduler).Result;
        }

        // Run: schedule the action on the STA scheduler without blocking; the token is optional
        public Task Run(Action action, CancellationToken token = default(CancellationToken))
        {
            return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
        }

        // Run: non-blocking variant for a function that produces a result
        public Task<TResult> Run<TResult>(Func<TResult> action, CancellationToken token = default(CancellationToken))
        {
            return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler);
        }

        // Run: for a task-returning delegate; Unwrap makes the returned task represent the inner task
        public Task Run(Func<Task> action, CancellationToken token = default(CancellationToken))
        {
            return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler).Unwrap();
        }

        // Run: for a delegate returning Task<TResult>; Unwrap exposes the inner task's result
        public Task<TResult> Run<TResult>(Func<Task<TResult>> action, CancellationToken token = default(CancellationToken))
        {
            return Task.Factory.StartNew(action, token, TaskCreationOptions.None, _taskScheduler).Unwrap();
        }
    }
}
noseratio
  • 59,932
  • 34
  • 208
  • 486
  • I am calling ScrapSitesAsync multiple times (with a single URL each time) from my other crawler code, isn't this designed to be called only once (with a list of URLs)? – JBaczuk May 23 '14 at 16:56
  • 1
    @JBaczuk, it works for both cases. Create an instance of WebBrowserPool, keep the reference to it and call `WebBrowserPool.ScrapeSiteAsync(url)` as many times as you want. Call `WebBrowserPool.Dispose` when done with it. I updated the code to show how to call `ScrapeSiteAsync` in parallel. – noseratio May 24 '14 at 00:03
  • I am going to mark this as an answer if I can get it to work, I tried what you are saying and I am getting a lot of tasks cancelled. It seems to not be completing every time I tell it to ScrapSiteAsync. It's pretty involved though, so I feel bad asking you to jump into my project, I'll let you know. – JBaczuk May 25 '14 at 04:16
  • 1
    @JBaczuk, use `Timeout.Infinite` anywhere you specify a timeout. See if you still get canceled tasks. – noseratio May 25 '14 at 04:20
  • I tried and there are still cancellations, I'm trying to find out why they were cancelled. It might be something I'm doing wrong. – JBaczuk May 25 '14 at 06:38
  • I think the problem I'm having is that in order to call WebBrowserPool.ScrapSiteAsync(url), I have to be inside of an asynchronous function. So I had to create one called public static async Task renderJavascript(string url) and I call that over and over using var task = renderJavascript(e.CrawledPage.Uri.ToString()); task.Wait(); – JBaczuk May 25 '14 at 07:27
  • @JBaczuk, I suspect the fact that you use `Wait` causes a deadlock, which lasts until a time-out cancels the tasks and unblocks it. You should only use `Wait` at the topmost level (like in `Main`), if anywhere at all. More about this in Stephen Cleary's blog: http://blog.stephencleary.com/2012/07/dont-block-on-async-code.html – noseratio May 27 '14 at 01:31
  • `var cancelSource = new CancellationToken(); var scrapetask = AsyncWebBrowserScrapper.ScrapSitesAsync(new string[] { WEBSITE }, cancelSource); scrapetask.Wait();` Never returns, the web browser tasks finishes and succeeds, but for some reason it never thinks all the URL tasks are finished. – BlamKiwi Dec 18 '14 at 10:56
  • @MorphingDragon, the code I posted in this answer works just fine on my side. If you have a specific set of URLs which fails for you and is publicly available, post them here and I may give it a try. – noseratio Dec 20 '14 at 02:34