0

I want to get html code from website. In Browser I usually can just click on ‘View Page Source’ in context menu or something similar. But how can I automatized it? I’ve tried it with WebBrowser class but sometimes it doesn’t work. I am not web developer so I don’t really know if my approach at least make sense. I think main problem is that I sometimes get html where not all code was executed. Hence it is uncompleted. I have problem with e.g. this site: http://www.sreality.cz/en/search/for-sale/praha

My code (I’ve tried to make it small but runnable on its own):

using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Windows.Forms;

namespace WebBrowserForm
{
    internal static class Program
    {
        [STAThread]
        private static void Main()
        {
            Application.EnableVisualStyles();
            Application.SetCompatibleTextRenderingDefault(false);
            for (int i = 0; i < 10; i++)
            {
                Form1 f = new Form1();
                f.ShowDialog();
            }
            // Now I can check Form1.List and see that some html is final and some is not
        }
    }

    public class Form1 : Form
    {
        public static List<string> List = new List<string>();
        private const string Url = "http://www.sreality.cz/en/search/for-sale/praha";
        private System.Windows.Forms.WebBrowser webBrowser1;

        public Form1()
        {
            this.webBrowser1 = new System.Windows.Forms.WebBrowser();
            this.SuspendLayout();
            this.webBrowser1.Dock = System.Windows.Forms.DockStyle.Fill;
            this.webBrowser1.Name = "webBrowser1";
            this.webBrowser1.TabIndex = 0;
            this.ResumeLayout(false);

            Load += new EventHandler(Form1_Load);
            this.webBrowser1.ObjectForScripting = new MyScript();
        }

        private void Form1_Load(object sender, EventArgs e)
        {
            webBrowser1.Navigate(Url);
            webBrowser1.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser1_DocumentCompleted);
        }

        private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {

            if (webBrowser1.ReadyState == WebBrowserReadyState.Complete)
            {
                // Final html for 99% of web pages, but unfortunately not for all
                string tst = webBrowser1.Document.GetElementsByTagName("HTML")[0].OuterHtml;

                webBrowser1.DocumentCompleted -= new WebBrowserDocumentCompletedEventHandler(webBrowser1_DocumentCompleted);
                Application.DoEvents();
                webBrowser1.Navigate("javascript: window.external.CallServerSideCode();");
                Application.DoEvents();
            }
        }

        [ComVisible(true)]
        public class MyScript
        {
            public void CallServerSideCode()
            {
                HtmlDocument doc = ((Form1)Application.OpenForms[0]).webBrowser1.Document;
                string renderedHtml = doc.GetElementsByTagName("HTML")[0].OuterHtml;
                // here I sometimes get full html but sometimes the same as in webBrowser1_DocumentCompleted method
                List.Add(renderedHtml);
                ((Form1)Application.OpenForms[0]).Close();
            }
        }
    }
}

I would expect that in ‘webBrowser1_DocumentCompleted’ method I could get final html. It usually works, but with this site it doesn’t. So I’ve tried get html in my own code which should be executed in web site -> method ‘CallServerSideCode’. What is strange that sometimes I get final html (basically the same as if I do it manually via Browser) but sometimes not. I think the problem is caused because my script start before whole web site is rendered instead after. But I am not really sure since this kind of things are far from my comfort zone and I don’t really understand what I am doing. I’m just trying to apply something what I found on the internet.

So, does anyone knows what is wrong with the code? Or even more importantly how to easily get final html from the site?

Any help appreciated.

Sekory
  • 143
  • 11

2 Answers2

0

You should use WebClient class to download HTML page. No display control necessary.

You want method DownloadString

Ales Ruzicka
  • 2,770
  • 1
  • 18
  • 24
  • It doesn't work I've tried it. Yes I get some html, but not in final state. If you try it on the web site with which I have problem, then you will see that html from WebClient is different from the 'final' html which you see in browser (in this context browser=IE, Chrome...). – Sekory Apr 23 '15 at 19:13
  • You want it after all JavaScript is interpreted? – Ales Ruzicka Apr 23 '15 at 19:18
0

May be it will be helpful if you add calling of your external function to the end of the body and wrap it by Jquery "ondomready" function. I mean something like this:

private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    if (webBrowser1.ReadyState == WebBrowserReadyState.Complete)
    {
        // Final html for 99% of web pages, but unfortunately not for all
        string tst = webBrowser1.Document.GetElementsByTagName("HTML")[0].OuterHtml;

        webBrowser1.DocumentCompleted -= new WebBrowserDocumentCompletedEventHandler(webBrowser1_DocumentCompleted);

        HtmlElement body = webBrowser1.Document.GetElementsByTagName("body")[0];
        HtmlElement scriptEl = webBrowser1.Document.CreateElement("script");
        IHTMLScriptElement element = (IHTMLScriptElement)scriptEl.DomElement;
        element.text = "$(function() { window.external.CallServerSideCode(); });";
        body.AppendChild(scriptEl);
    }
}

[ComVisible(true)]
public class MyScript
{
    public void CallServerSideCode()
    {
        HtmlDocument doc = ((Form1)Application.OpenForms[0]).webBrowser1.Document;
        string renderedHtml = doc.GetElementsByTagName("HTML")[0].OuterHtml;
        // here I sometimes get full html but sometimes the same as in webBrowser1_DocumentCompleted method
        List.Add(renderedHtml);
        ((Form1)Application.OpenForms[0]).Close();
    }
}
m1burn
  • 78
  • 7