I'm using HtmlAgilityPack, installed as a NuGet package, to parse dynamic HTML tables. It's very fast and, as per this answer, you can query the results using LINQ. I've used this to store the result as a DataTable. Here's the public extension method class:
/// <summary>
/// Extension methods that read an HTML table located through a web driver,
/// parse it with HtmlAgilityPack and expose it as <see cref="HtmlTableData"/>
/// or as an untyped <see cref="DataTable"/>.
/// </summary>
public static class HtmlTableExtensions
{
    private static readonly ILog Log = LogManager.GetLogger(typeof(HtmlTableExtensions));

    /// <summary>
    /// Finds the table identified by <paramref name="tableBy"/>, parses its inner HTML and
    /// returns the cell text keyed first by row key (text of the first cell of the row)
    /// and then by column header.
    /// Based on an idea from https://stackoverflow.com/questions/655603/html-agility-pack-parsing-tables
    /// </summary>
    /// <param name="tableBy">Locator of the table element.</param>
    /// <param name="driver">Driver used to look the element up.</param>
    /// <returns>
    /// The parsed table data, or <c>null</c> when anything goes wrong; the failure is
    /// logged as a warning rather than propagated.
    /// </returns>
    public static HtmlTableData GetTableData(this By tableBy, IWebdriverCore driver)
    {
        try
        {
            var doc = tableBy.GetTableHtmlAsDoc(driver);
            var columns = doc.GetHtmlColumnNames();
            return doc.GetHtmlTableCellData(columns);
        }
        catch (Exception e)
        {
            // Deliberate best-effort: callers get null instead of an exception.
            Log.Warn(String.Format("unable to get table data from {0} using driver {1} ",tableBy ,driver),e);
            return null;
        }
    }

    /// <summary>
    /// Converts an <see cref="HtmlTableData"/> object into an untyped data table.
    /// The keys of the first row's dictionary become string columns and every row's
    /// values become one data row.
    /// </summary>
    /// <param name="htmlTableData">The parsed table; <c>null</c> yields <c>null</c>.</param>
    /// <param name="primaryKey">
    /// Name of the column to use as the sole primary key, or <c>null</c> (the default)
    /// to create the table without a primary key.
    /// </param>
    /// <param name="tableName">Optional name for the resulting table.</param>
    /// <returns>
    /// The populated table; an empty table without columns when
    /// <paramref name="htmlTableData"/> contains no rows; <c>null</c> when it is <c>null</c>.
    /// </returns>
    public static DataTable ConvertHtmlTableDataToDataTable(this HtmlTableData htmlTableData,
        string primaryKey = null, string tableName = null)
    {
        if (htmlTableData == null) return null;

        var table = new DataTable(tableName);

        // First() would throw on an empty data set; an empty table is the natural result.
        var firstRow = htmlTableData.Values.FirstOrDefault();
        if (firstRow == null) return table;

        foreach (var colName in firstRow.Keys)
        {
            table.Columns.Add(new DataColumn(colName, typeof (string)));
        }

        // The default primaryKey is null and DataColumnCollection's string indexer rejects
        // a null name, so the key is only set when one was actually requested.
        if (primaryKey != null)
        {
            table.SetPrimaryKey(new[] { primaryKey });
        }

        foreach (var row in htmlTableData.Values)
        {
            table.Rows.Add(row.Values.ToArray<object>());
        }
        return table;
    }

    /// <summary>
    /// Reads every table row after the first one and maps its cell texts onto
    /// <paramref name="columns"/> by position. The HTML-decoded text of the first cell
    /// is used as the row key; a later row with the same key replaces the earlier one.
    /// </summary>
    /// <param name="doc">Document holding the table markup.</param>
    /// <param name="columns">Column headers, in cell order.</param>
    /// <returns>The cell data; empty when the document contains no rows.</returns>
    private static HtmlTableData GetHtmlTableCellData(this HtmlDocument doc, IReadOnlyList<string> columns)
    {
        var data = new HtmlTableData();

        // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing matches.
        var rows = doc.DocumentNode.SelectNodes(XmlExpressions.AllDescendants + HtmlAttributes.TableRow);
        if (rows == null) return data;

        foreach (var row in rows.Skip(1))
        {
            var cells = row.SelectNodes(HtmlAttributes.TableCell);
            if (cells == null || cells.Count == 0) continue; // row without cells: nothing to key it by

            var rowData = cells.Select(n => WebUtility.HtmlDecode(n.InnerText)).ToList();

            var rowValues = new Dictionary<string, string>();
            for (var i = 0; i < columns.Count; i++)
            {
                // Rows can have fewer cells than there are headers (e.g. colspan);
                // pad with empty text instead of failing the whole table.
                rowValues.Add(columns[i], i < rowData.Count ? rowData[i] : string.Empty);
            }
            data[rowData[0]] = rowValues;
        }
        return data;
    }

    /// <summary>
    /// Returns the trimmed, HTML-decoded text of the header cells found via the first table row.
    /// </summary>
    /// <param name="doc">Document holding the table markup.</param>
    /// <returns>The column names in document order.</returns>
    /// <exception cref="InvalidOperationException">
    /// The document contains no rows or no header cells.
    /// </exception>
    private static List<string> GetHtmlColumnNames(this HtmlDocument doc)
    {
        var rows = doc.DocumentNode.SelectNodes(XmlExpressions.AllDescendants + HtmlAttributes.TableRow);
        if (rows == null || rows.Count == 0)
        {
            throw new InvalidOperationException("The table html contains no rows.");
        }

        // NOTE(review): XmlExpressions.AllDescendants is declared elsewhere. If it is "//"
        // this XPath is absolute and matches header cells in the whole document, not only
        // in the first row; ".//" would restrict it to the row - confirm.
        var headers = rows[0].SelectNodes(XmlExpressions.AllDescendants + HtmlAttributes.TableHeader);
        if (headers == null || headers.Count == 0)
        {
            throw new InvalidOperationException("The table html contains no header cells.");
        }

        return headers
            .Select(n => WebUtility.HtmlDecode(n.InnerText).Trim())
            .ToList();
    }

    /// <summary>
    /// Finds the table element through <paramref name="driver"/> and loads the value of its
    /// <see cref="HtmlAttributes.InnerHtml"/> attribute into a new <see cref="HtmlDocument"/>.
    /// </summary>
    /// <param name="tableBy">Locator of the table element.</param>
    /// <param name="driver">Driver used to look the element up.</param>
    /// <returns>The parsed document.</returns>
    private static HtmlDocument GetTableHtmlAsDoc(this By tableBy, IWebdriverCore driver)
    {
        var webTable = driver.FindElement(tableBy);
        var doc = new HtmlDocument();
        doc.LoadHtml(webTable.GetAttribute(HtmlAttributes.InnerHtml));
        return doc;
    }
}
The HtmlTableData object is just a subclass of Dictionary:
/// <summary>
/// Parsed HTML table content: a dictionary from a row key to that row's
/// dictionary of string keys and string values. Adds no members of its own.
/// </summary>
public class HtmlTableData : Dictionary<string, Dictionary<string, string>> { }
IWebdriverCore driver is a wrapper on IWebDriver or IRemoteWebdriver which exposes either of these interfaces as a readonly property, but you could just replace this with IWebDriver.
HtmlAttributes is a static class holding const values for common HTML attributes, to save on typos when referring to HTML elements/attributes/tags etc. in C# code:
/// <summary>
/// Config class holding common HTML attribute names, tag names and related
/// selector strings as constants, so calling code does not repeat the literals.
/// </summary>
public static class HtmlAttributes
{
/// <summary>The "innerHTML" attribute name.</summary>
public const string InnerHtml = "innerHTML";
/// <summary>The table row tag name, "tr".</summary>
public const string TableRow = "tr";
/// <summary>The table header cell tag name, "th".</summary>
public const string TableHeader = "th";
/// <summary>Expression naming both "th" and "td", joined with "|".</summary>
public const string TableCell = "th|td";
/// <summary>The "class" attribute name.</summary>
public const string Class = "class";
// Further constants are elided in this listing.
...
}
SetPrimaryKey is an extension method on DataTable which allows easy setting of the primary key for a DataTable:
/// <summary>
/// Sets the primary key of <paramref name="table"/> to the columns with the given names,
/// in the given order.
/// </summary>
/// <param name="table">The table whose primary key is set.</param>
/// <param name="primaryKeyColumns">Names of existing columns that form the key.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="table"/> or <paramref name="primaryKeyColumns"/> is <c>null</c>.
/// </exception>
/// <exception cref="System.ArgumentException">
/// An entry of <paramref name="primaryKeyColumns"/> is <c>null</c> or does not name a column of the table.
/// </exception>
public static void SetPrimaryKey(this DataTable table,string[] primaryKeyColumns)
{
    if (table == null)
    {
        throw new System.ArgumentNullException("table");
    }
    if (primaryKeyColumns == null)
    {
        throw new System.ArgumentNullException("primaryKeyColumns");
    }

    var keyColumns = new DataColumn[primaryKeyColumns.Length];
    for (int i = 0; i < keyColumns.Length; i++)
    {
        string columnName = primaryKeyColumns[i];

        // Columns[name] returns null for an unknown name, and the PrimaryKey setter
        // silently drops null entries, so a typo would otherwise produce a missing or
        // partial key without any error.
        DataColumn column = columnName == null ? null : table.Columns[columnName];
        if (column == null)
        {
            throw new System.ArgumentException(
                string.Format("Primary key column '{0}' does not exist in table '{1}'.", columnName, table.TableName),
                "primaryKeyColumns");
        }
        keyColumns[i] = column;
    }
    table.PrimaryKey = keyColumns;
}
I found this to be pretty performant - < 2 ms to parse a 30*80 table - and it's a doddle to use.