I'm currently trying to do some webscraping using this code:
public static User registerUser()
{
User toreturn = new User();
string csrf;
WebBrowser webcontrol = new WebBrowser();
webcontrol.AllowNavigation = true;
webcontrol.ScriptErrorsSuppressed = true;
webcontrol.Navigate("https://example.com/signup");
webcontrol.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webcontrol_DocumentCompleted);
HtmlElementCollection forms = webcontrol.Document.GetElementById("csrf_token").GetElementsByTagName("value");
string tosend = forms[0].InnerText;
toreturn.apikey = tosend;
return toreturn;
}
private static void webcontrol_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
}
However, the entire variable webcontrol.Document is null, both before and after the event.
Any idea as to why this is? Hard to scrape when blind.
EDIT: Worth noting, obviously I need to use the values I get
OK
So now I have got it to load thanks to help below, but I can't get the threads to return the value..
public static User registerUser()
{
Uri test = new Uri("https://www.example.com/signup");
HtmlDocument testdoc = runBrowserThread(test);
string tosend = "test";
User user = new User();
user.apikey = tosend;
return user;
}
public static HtmlDocument runBrowserThread(Uri url)
{
HtmlDocument value = null;
var th = new Thread(() =>
{
var br = new WebBrowser();
br.DocumentCompleted += browser_DocumentCompleted;
br.Navigate(url);
value = br.Document;
Application.Run();
});
th.SetApartmentState(ApartmentState.STA);
th.Start();
th.Join(8000);
return value;
}
static void browser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
var br = sender as WebBrowser;
if (br.Url == e.Url)
{
Console.WriteLine("Natigated to {0}", e.Url);
Console.WriteLine(br.Document.Body.InnerHtml);
System.Console.ReadLine();
Application.ExitThread(); // Stops the thread
}
}
The System.Console.WriteLine works - I see the HTML! Joy! (Although its cloudflare, but I should get whitelisted)
But the thread returns null..