2

I'm currently trying to do some webscraping using this code:

public static User registerUser()
    {
        User toreturn = new User();

        string csrf;

        WebBrowser webcontrol = new WebBrowser();

        webcontrol.AllowNavigation = true;
        webcontrol.ScriptErrorsSuppressed = true;
        webcontrol.Navigate("https://example.com/signup");
        webcontrol.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webcontrol_DocumentCompleted);

        HtmlElementCollection forms = webcontrol.Document.GetElementById("csrf_token").GetElementsByTagName("value");

        string tosend = forms[0].InnerText;
        toreturn.apikey = tosend;
        return toreturn;
    }

    private static void webcontrol_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {

    }

However, the entire variable webcontrol.Document is null, both before and after the event.

Any idea as to why this is? Hard to scrape when blind.

EDIT: Worth noting, obviously I need to use the values I get

OK

So now I have got it to load thanks to help below, but I can't get the threads to return the value..

public static User registerUser()
    {
        Uri test = new Uri("https://www.example.com/signup");
        HtmlDocument testdoc = runBrowserThread(test);

        string tosend = "test";

        User user = new User();

        user.apikey = tosend;

        return user;

    }
    public static HtmlDocument runBrowserThread(Uri url)
    {
        HtmlDocument value = null;
        var th = new Thread(() =>
        {
            var br = new WebBrowser();
            br.DocumentCompleted += browser_DocumentCompleted;
            br.Navigate(url);
            value = br.Document;
            Application.Run();
        });
        th.SetApartmentState(ApartmentState.STA);
        th.Start();
        th.Join(8000); 
        return value;
    }

    static void browser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        var br = sender as WebBrowser;
        if (br.Url == e.Url)
        {
            Console.WriteLine("Natigated to {0}", e.Url);
            Console.WriteLine(br.Document.Body.InnerHtml);
            System.Console.ReadLine();
            Application.ExitThread();   // Stops the thread
        }
    }

The System.Console.WriteLine works - I see the HTML! Joy! (Although its cloudflare, but I should get whitelisted)

But the thread returns null..

William Dunne
  • 479
  • 1
  • 5
  • 12
  • In your code, you are possibly accessing `webcontrol.Document` before the event fires. Is `webcontrol.Document` indeed null in the body of `webcontrol_DocumentCompleted`? – lc. Apr 14 '15 at 06:13
  • Hmm doesn't seem to trigger.. – William Dunne Apr 14 '15 at 06:21
  • Is it triggered if you assign the webcontrol_DocumentCompleted handler before calling Navigate? If yes, try accessing the Document in the handler. – Szabolcs Dézsi Apr 14 '15 at 06:25
  • Nope, that doesn't seem to work either: `public static void registerUser() { WebBrowser webcontrol = new WebBrowser(); webcontrol.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webcontrol_DocumentCompleted); webcontrol.Navigate("https://coinkite.com/signup"); } private static void webcontrol_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e) { System.Console.WriteLine(((WebBrowser)sender).Document); System.Console.ReadLine(); }` – William Dunne Apr 14 '15 at 06:28
  • 1
    Is this a console app? Check the answer here: http://stackoverflow.com/questions/10720703/webbrowser-doesnt-get-to-documentcompleted-event – Szabolcs Dézsi Apr 14 '15 at 06:38
  • 1
    FWIW I was able to reproduce exactly what @SzabolcsDézsi is referring to. If I run the message pump with Application.Run, it works. (I assume the WebBrowser control is a `System.Windows.Forms.WebBrowser`) – lc. Apr 14 '15 at 06:42
  • So how would I return the response from the thread? – William Dunne Apr 14 '15 at 06:54
  • Added source to top for clarity – William Dunne Apr 14 '15 at 07:13

0 Answers0