9

I wanted to import some data from HTML table (here is a link http://road2paris.com/wp-content/themes/roadtoparis/api/generated_table_august.html) and display first 16 people in DataGridView in my Form application. From what I've read the best way to do it is to use HTML Agility pack, so I downloaded it and included to my project. I understand that the first thing to do is to load the content of html file. This is the code I used to do so:

        string htmlCode = "";
        using (WebClient client = new WebClient())
        {
            client.Headers.Add(HttpRequestHeader.UserAgent, "AvoidError");
            htmlCode = client.DownloadString("http://road2paris.com/wp-content/themes/roadtoparis/api/generated_table_august.html");
        }
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();

        doc.LoadHtml(htmlCode);

And then I got stuck. I don't know how to fill my datatable with data from the html table. I've tried many various solutions but nothing seems to work properly. I'd be glad if anyone could help me with that.

user2658455
  • 125
  • 1
  • 1
  • 5

2 Answers2

24
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(htmlCode);
var headers = doc.DocumentNode.SelectNodes("//tr/th");
DataTable table = new DataTable();
foreach (HtmlNode header in headers)
    table.Columns.Add(header.InnerText); // create columns from th
// select rows with td elements 
foreach (var row in doc.DocumentNode.SelectNodes("//tr[td]")) 
    table.Rows.Add(row.SelectNodes("td").Select(td => td.InnerText).ToArray());

You'll need the HTML Agility Pack library to use this code.

Justyn
  • 73
  • 7
Sergey Berezovskiy
  • 232,247
  • 41
  • 429
  • 459
  • To extend this answer a bit, if you have a column named "Price", you can now access a cell directly using for example: var cell = table.Rows[0]["Price"]; (or just use the index number if you prefer.) – Dan Randolph Jan 20 '23 at 20:15
  • I installed HTMLAgilityPack using the Nuget Package Manager in Visual Studiio. – Dan Randolph Jan 20 '23 at 20:17
0

Below I have created code that will prevent having duplicate data headers. When you create a DataTable each "Column" must have a unique name. Also, there are times when a HTML row might go out of bounds and you have to add additional columns to the data table, otherwise you will drop data. this has been my solution.

'''
public enum DuplicateHeaderReplacementStrategy
{
    AppendAlpha,
    AppendNumeric,
    Delete
}

public class HtmlServices
{
    private static readonly string[] Alpha = new[] { "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" };

    public static HtmlDocument RenameDuplicateHeaders(HtmlDocument doc, DuplicateHeaderReplacementStrategy strategy)
    {
        var index = 0;
        try
        {
            foreach (HtmlNode table in doc.DocumentNode?.SelectNodes("//table"))
            {
                var tableHeaders = table.SelectNodes("th")?
                   .GroupBy(x => x)?
                   .Where(g => g.Count() > 1)?
                   .ToList();
                tableHeaders?.ForEach(y =>
                   {
                       switch (strategy)
                       {
                           case DuplicateHeaderReplacementStrategy.AppendNumeric:
                               y.Key.InnerHtml += index++;
                               break;

                           case DuplicateHeaderReplacementStrategy.AppendAlpha:
                               y.Key.InnerHtml += Alpha[index++];
                               break;

                           case DuplicateHeaderReplacementStrategy.Delete:
                               y.Key.InnerHtml = string.Empty;
                               break;
                       }
                });
            }
            return doc;
        }
        catch
        {
            return doc;
        }


    }
}


public static DataTable GetDataTableFromHtmlTable(string url, string[] htmlIds)
    {
        ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls11 | SecurityProtocolType.Tls12;
        HtmlWeb web = new HtmlWeb();
        HtmlDocument doc = web.Load(url);
        string html = doc.DocumentNode.OuterHtml;

        doc = HtmlServices.RenameDuplicateHeaders(doc, DuplicateHeaderReplacementStrategy.AppendNumeric);

        var headers = doc.DocumentNode.SelectNodes("//tr/th");

        DataTable table = new DataTable();
        foreach (HtmlNode header in headers)
            if (!table.ColumnExists(header.InnerText))
            {
                table.Columns.Add(header.InnerText); // create columns from th
            }
            else
            {
                int columnIteration = 0;
                while (table.ColumnExists(header.InnerText + columnIteration.ToString()))
                {
                    columnIteration++;
                }
                table.Columns.Add(header.InnerText + columnIteration.ToString()); // create columns from th
            }

        // select rows with td elements
        foreach (var row in doc.DocumentNode.SelectNodes("//tr[td]"))
        {
            var addRow = row.SelectNodes("td").Select(td => td.InnerHtml.StripHtmlTables()).ToArray();

            if (addRow.Count() > table.Columns.Count)
            {
                int m_numberOfRowsToAdd = addRow.Count() - table.Columns.Count;
                for (int i = 0; i < m_numberOfRowsToAdd; i++)
                    table.Columns.Add($"ExtraColumn {i + 1}");
            }

            try
            {
                table.Rows.Add(addRow);
            }
            catch (Exception e)
            {
                debug.Print(e.Message);
            }
        }
        return table;
    }
Jonathan Hall
  • 75,165
  • 16
  • 143
  • 189