1

This is a website, https://www.wsj.com/news/types/newsplus, whose data is loaded by AJAX at runtime. I have to read the title text of every article. I have tried many pieces of code since this morning, but none of them worked because the data is loaded by AJAX.

This is the code I tried.

// Load the page in the hidden WebBrowser, then walk the resulting DOM for headline links.
HtmlDocument hd = GetHtmlAjax(new Uri("https://www.wsj.com/news/types/newsplus"), 300, true);
ParseData(hd);

// Exact class list carried by the headline <h3> elements.
// NOTE(review): the suffixes look like build-generated hashes, so this match is brittle — confirm against the live page.
const string headlineClass = "WSJTheme--headline--unZqjb45 undefined WSJTheme--heading-3--2z_phq5h typography--serif-display--ZXeuhS5E";

// Guard against a missing document so the lookup below does not throw a NullReferenceException.
HtmlElementCollection main_element = hd != null ? hd.GetElementsByTagName("h3") : null;
if (main_element != null)
{
    foreach (HtmlElement element in main_element)
    {
        // "className" is the DOM property name the WebBrowser exposes for the class attribute.
        string cls = element.GetAttribute("className");
        if (String.IsNullOrEmpty(cls) || !String.Equals(cls, headlineClass, StringComparison.Ordinal))
        {
            continue;
        }

        // Look the anchors up by TAG. The previous Children.GetElementsByName("a") matched on the
        // name/id attribute instead, so it never returned the <a> elements inside the headline.
        HtmlElementCollection anchors = element.GetElementsByTagName("a");
        foreach (HtmlElement anchor in anchors)
        {
            //grab links and other stuff same way
            string linktxt = anchor.InnerText;
        }
    }
}


WebBrowser wb = null;

/// <summary>
/// Navigates a <see cref="WebBrowser"/> to <paramref name="uri"/> (when <paramref name="loadurl"/> is true),
/// waits for the document to finish loading, then keeps the message loop running for
/// <paramref name="AjaxTimeLoadTimeOut"/> milliseconds before returning the document.
/// </summary>
/// <param name="uri">Page to load; only used when <paramref name="loadurl"/> is true.</param>
/// <param name="AjaxTimeLoadTimeOut">Extra time, in milliseconds, to wait after the page reports Complete.</param>
/// <param name="loadurl">True to create a new browser and navigate; false to re-read the existing browser.</param>
/// <returns>The browser's current <see cref="HtmlDocument"/>.</returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="loadurl"/> is false and no page has been loaded yet.
/// </exception>
public HtmlDocument GetHtmlAjax(Uri uri, int AjaxTimeLoadTimeOut,bool loadurl)
{
    if (loadurl)
    {
        wb = new WebBrowser();
        wb.ScriptErrorsSuppressed = true;
        wb.Navigate(uri);
    }
    else if (wb == null)
    {
        // Previously this fell through and crashed with a NullReferenceException on wb.ReadyState.
        throw new InvalidOperationException("No page has been loaded yet; call GetHtmlAjax with loadurl = true first.");
    }

    while (wb.ReadyState != WebBrowserReadyState.Complete)
    {
        Application.DoEvents();
    }

    // The WebBrowser control lives on this (UI) thread, so one long Thread.Sleep here would block the
    // thread the page's scripts need. Pump messages for the whole wait instead, sleeping only in
    // short slices to avoid spinning the CPU.
    System.Diagnostics.Stopwatch ajaxWait = System.Diagnostics.Stopwatch.StartNew();
    while (ajaxWait.ElapsedMilliseconds < AjaxTimeLoadTimeOut)
    {
        Application.DoEvents();
        Thread.Sleep(10);
    }

    Application.DoEvents();
    return wb.Document;
}

I followed many links to solve this issue but failed. These are the links I followed:

htmlagilitypack and dynamic content issue; Get HTML in C# from page that Loads Dynamic Data; Retrieve ajax/JavaScript return results from webpage in c#

How to extract dynamic ajax content from a web page

Could someone please tell me what to change in my code to parse the title link text? Thanks.

Post code from @aepot

private static HttpClient client = new HttpClient();

        /// <summary>
        /// Issues a GET request to <paramref name="url"/> through the shared client and
        /// deserializes the JSON response body into <typeparamref name="T"/>.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <returns>The response body deserialized by <c>JsonConvert.DeserializeObject</c>.</returns>
        private static async Task<T> GetJsonPageAsync<T>(string url)
        {
            HttpResponseMessage reply = await client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead);
            try
            {
                // Throws for any non-success status code before the body is read.
                reply.EnsureSuccessStatusCode();
                string json = await reply.Content.ReadAsStringAsync();
                T parsed = JsonConvert.DeserializeObject<T>(json);
                return parsed;
            }
            finally
            {
                if (reply != null)
                {
                    reply.Dispose();
                }
            }
        }

        /// <summary>
        /// Click handler: requests the article collection, requests every article in it concurrently,
        /// and appends each headline followed by a separator line to <c>txtData</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private async void button1_Click(object sender, EventArgs e)
        {
            try
            {
                dynamic newsList = await GetJsonPageAsync<dynamic>("https://www.wsj.com/news/types/newsplus?id={%22query%22:%22type:=\\%22NewsPlus\\%22%22,%22db%22:%22wsjie,blog,interactivemedia%22}&type=search_collection");
                List<Task<dynamic>> tasks = new List<Task<dynamic>>();
                foreach (dynamic item in newsList.collection)
                {
                    string strUrl = "https://www.wsj.com/news/types/newsplus?id=" + item.id + "&type=article";
                    tasks.Add(GetJsonPageAsync<dynamic>(strUrl));
                }

                dynamic[] newsDataList = await Task.WhenAll(tasks);

                // Build the output once instead of reassigning txtData.Text twice per headline.
                System.Text.StringBuilder output = new System.Text.StringBuilder();
                string separator = new string('-', 200);
                foreach (dynamic newItem in newsDataList)
                {
                    // Cast to object first so Convert.ToString is bound statically rather than via the DLR.
                    output.Append(Convert.ToString((object)newItem.data.headline));
                    output.Append(System.Environment.NewLine);

                    // Fix: the original had a stray ';' after new string('-', 200), which does not compile.
                    output.Append(separator);
                    output.Append(System.Environment.NewLine);
                }

                txtData.Text += output.ToString();
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
aepot
  • 4,558
  • 2
  • 12
  • 24
Indi_Rain
  • 179
  • 5
  • 17
  • `WebBrowser` is built on Internet Explorer 11, which is by now incompatible with much of the Internet. Let it rest in peace. Try `WebView`/`WebView2` or `CefSharp` – aepot Jul 11 '20 at 17:08
  • Please look at my code and tell me what is wrong with it that prevents it from parsing the AJAX content. – Indi_Rain Jul 11 '20 at 18:41

1 Answers1

0

An AJAX call is just a simple GET or POST request.

Using the regular browser dev tools, I found that the page sends a simple GET request and receives JSON data. The JSON can be deserialized or explored via a reader.

For JSON parsing I used the Newtonsoft.Json NuGet package.

Here's a simple example based on a WinForms app.

/// <summary>
/// WinForms form that, on a button click, downloads the WSJ "NewsPlus" article collection as JSON
/// and lists every article headline in <c>textBox1</c>.
/// </summary>
public partial class Form1 : Form
{
    // One shared HttpClient for every request made by the form.
    private static readonly HttpClient client = new HttpClient();

    /// <summary>
    /// Issues a GET request to <paramref name="url"/> and deserializes the JSON body into <typeparamref name="T"/>.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <returns>The response body deserialized by <c>JsonConvert.DeserializeObject</c>.</returns>
    private async Task<T> GetJsonPageAsync<T>(string url)
    {
        using (HttpResponseMessage response = await client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead))
        {
            // Throws for any non-success status code before the body is read.
            response.EnsureSuccessStatusCode();
            string text = await response.Content.ReadAsStringAsync();
            return JsonConvert.DeserializeObject<T>(text);
        }
    }

    /// <summary>
    /// Initializes the form and configures the process-wide <see cref="ServicePointManager"/> settings.
    /// </summary>
    public Form1()
    {
        InitializeComponent();
        ServicePointManager.DefaultConnectionLimit = 10; // to make it faster

        // Add TLS 1.2 to the enabled protocols instead of replacing the whole set, so any other
        // protocol the runtime already enabled stays available.
        ServicePointManager.SecurityProtocol |= SecurityProtocolType.Tls12;
    }

    /// <summary>
    /// Click handler: requests the article collection, requests every article in it concurrently,
    /// and appends each headline followed by a separator line to <c>textBox1</c>.
    /// On failure the exception message replaces the textbox content.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private async void button1_Click(object sender, EventArgs e)
    {
        try
        {
            dynamic newsList = await GetJsonPageAsync<dynamic>("https://www.wsj.com/news/types/newsplus?id={%22query%22:%22type:=\\%22NewsPlus\\%22%22,%22db%22:%22wsjie,blog,interactivemedia%22}&type=search_collection");
            List<Task<dynamic>> tasks = new List<Task<dynamic>>();
            foreach (dynamic item in newsList.collection)
            {
                tasks.Add(GetJsonPageAsync<dynamic>($"https://www.wsj.com/news/types/newsplus?id={item.id}&type=article"));
            }
            dynamic[] newsDataList = await Task.WhenAll(tasks);

            // Build the output once instead of reassigning textBox1.Text twice per headline.
            System.Text.StringBuilder output = new System.Text.StringBuilder();
            string separator = new string('-', 200);
            foreach (dynamic newItem in newsDataList)
            {
                // Cast to object first so Convert.ToString is bound statically rather than via the DLR.
                output.Append(Convert.ToString((object)newItem.data.headline));
                output.Append(Environment.NewLine);
                output.Append(separator);
                output.Append(Environment.NewLine);
            }
            textBox1.Text += output.ToString();
        }
        catch (Exception ex)
        {
            textBox1.Text = ex.Message;
        }
    }
}

enter image description here

UPD: Added fix for .NET Framework 4.5.2

aepot
  • 4,558
  • 2
  • 12
  • 24
  • Comments are not for extended discussion; this conversation has been [moved to chat](https://chat.stackoverflow.com/rooms/217678/discussion-on-answer-by-aepot-c-facing-problem-to-read-ajax-data-using-web-brow). – Samuel Liew Jul 12 '20 at 01:13
  • @aepot I found another URL that loads its initial data without AJAX and then loads the rest gradually by AJAX as we scroll down. In this case, how can I read the initial data when using `HttpClient`? – Indi_Rain Jul 17 '20 at 06:53
  • @Indi_Rain Simply send the same request as AJAX is sending when you scroll down. I already told you that at least twice. – aepot Jul 17 '20 at 07:06