Some background:
I have an HTML file with page dimensions contained in DIV tags and data contained in SPAN tags. The SPAN tags are children of DIV tags. SPAN tags are positioned by setting each one's "left" and "top" in the style attribute.
There are several SPAN tags per row, but a row can only be determined from the tags' "top" and "left" values. The SPAN tags appear in an arbitrary order within a DIV tag.
I've managed to load all the tag properties into a List(Of Node) and sort them the way I need. How do I update the "Order" field of each Node stored in the _allData list?
' Describes one SPAN tag read from the HTML file.
' NOTE: this is a Structure (value type), so reading an item out of a
' List(Of Node) yields a copy; changing a field on that copy does not
' change the item stored in the list.
Public Structure Node
Public Underline As Boolean ' True when the tag has text-decoration: underline
Public InnerText As String ' the tag's content/data
Public Order As Integer ' position in the global ordering; initially 0 for every node
Public Top As Single ' "top" position of the tag, relative to TopOffset
Public Left As Single ' "left" position of the tag (described by the author as relative to TopOffset - TODO confirm)
Public TopOffset As Single ' the page's position from 0,0
End Structure
' All tags loaded from the HTML file, as Node values; read by SetGlobalElementsOrder.
Private _allData As List(Of Node)
' Assigns a 1-based, document-wide Order value to the nodes in _allData.
'
' Nodes are processed page by page (grouped by TopOffset, in order of first
' appearance), then column by column (pageColumns, left to right), and inside
' each column by Top and then Left. Nodes whose Left does not fall strictly
' inside any column range keep their current Order.
'
' Node is a Structure, so items read from the list are copies. Each update is
' therefore done by index: read the copy, change it, and store it back.
Private Sub SetGlobalElementsOrder()
    ' Nothing to order when the list is not set or is empty.
    If _allData Is Nothing OrElse _allData.Count = 0 Then
        Return
    End If

    Dim currentOrder As Integer = 0

    ' Not really points of anything, but PointF is a useful built-in structure
    ' to store 2 values of type Single: X = lower limit, Y = upper limit of
    ' a column's "left" range (both limits are exclusive).
    Dim pageColumns As PointF() = {
        New PointF(0.0F, 2.2F),
        New PointF(2.45F, 4.1F),
        New PointF(5.1F, 7.35F)}

    ' One entry per page; this has values like: 10.29, 20.55, 30.92, ...
    Dim topOffsets As List(Of Single) =
        _allData.Select(Function(its) its.TopOffset).Distinct().ToList()

    ' Sort each page separately.
    For Each topOffset As Single In topOffsets
        For Each columnLimit As PointF In pageColumns
            ' Local copies so the lambdas below do not capture the loop variables.
            Dim pageTop As Single = topOffset
            Dim limit As PointF = columnLimit

            ' Select the INDEXES (not copies) of this column's tags, sorted the
            ' way they must be numbered. ToList() materializes the query, so
            ' writing to _allData below cannot disturb the enumeration.
            Dim columnIndexes As List(Of Integer) =
                Enumerable.Range(0, _allData.Count).
                    Where(Function(i) _allData(i).TopOffset = pageTop AndAlso
                                      _allData(i).Left > limit.X AndAlso
                                      _allData(i).Left < limit.Y).
                    OrderBy(Function(i) _allData(i).Top).
                    ThenBy(Function(i) _allData(i).Left).
                    ToList()

            For Each index As Integer In columnIndexes
                ' Keep updating this value for all processed tags.
                currentOrder += 1

                ' Read-modify-write: the indexer returns a copy of the
                ' structure, so the changed copy must be assigned back.
                Dim updated As Node = _allData(index)
                updated.Order = currentOrder
                _allData(index) = updated
            Next
        Next
    Next
End Sub