0

I am using regex to find all pattern matches in a Word doc, which I will then manipulate.

The file I'm searching is ~330 pages long and includes copy/pasted emails. My problem is that when I use InStr(startPos, objRange.Text, match.submatches(0)) to find the starting position of each match, the result is actually offset by some amount. For the document in its original state, that offset happened to be 324 characters.

On a hunch, I decided to remove all the hyperlinks in the document to see what that would do. The RemoveHyperlinks sub found and removed 24 hyperlinks, after which the Instr() return value was off by only 20 characters (so that subtracting the magic number matchStart = matchStart - 1 - 20 gives the correct starting position). Obviously I want to avoid all magic numbers, but I cannot figure out where the last 20 characters are coming from.

I tried unlinking all fields, but there weren't any to unlink after the hyperlinks were removed.

Any thoughts on why

matchStart = InStr(startPos, objRange.Text, match.submatches(0))
matchEnd = matchStart + Len(match.submatches(0))
Set subRange0 = objDoc.Range(matchStart, matchEnd)

give me subRange0.Text different from match.submatches(0)? Or where the other hidden characters may be found (to be removed)?

Sub FixHighlightedText()
    Dim objDoc As Document
    Dim objRange As Range, subRange0 As Range
    Dim matchStart As Long, matchEnd As Long, startPos As Long
    Dim regex As Object
    Dim matches

    Set objDoc = ActiveDocument
    Set objRange = objDoc.Range(0, objDoc.Content.End)
    startPos = 1
    Set regex = CreateObject("VBScript.RegExp")

    Call RemoveHyperlinks

    With regex
        .Pattern = "((\([a-zA-Z]*?[-]?Time:.*?\})[a-zA-Z0-9]{0,3})"
        .Global = True
    End With

    If regex.test(objRange.Text) Then
        Set matches = regex.Execute(objRange.Text)

        Debug.Print "Document has " & matches.Count & " matches"
        Debug.Print "Document range is " & objRange.Start & " to " & objRange.End
        Debug.Print "FirstIndex = " & matches(0).FirstIndex

        For Each match In matches
            matchStart = InStr(startPos, objRange.Text, match.submatches(0))
            startPos = matchStart + Len(match.submatches(0))
            If matchStart > 0 Then
                matchStart = matchStart - 1
                matchEnd = matchStart + Len(match.submatches(0))
                Set subRange0 = objDoc.Range(matchStart, matchEnd)

                Debug.Print "Match starts at " & matchStart & " and ends at " & (matchStart + Len(match.submatches(1)))
                Debug.Print "   match0 text = " & match.submatches(0)
                Debug.Print "   subrange0 text = " & subRange0.Text
            Else
                Debug.Print "Match mysteriously not found in text"
            End If
        Next match
    Else
        Debug.Print "No regex matches"
    End If
End Sub

Sub RemoveHyperlinks()
    Dim link, cnt As Long, linkRange As Range, i As Long

    cnt = 0

    For i = ActiveDocument.Hyperlinks.Count To 1 Step -1
        With ActiveDocument.Hyperlinks(i)
            .TextToDisplay = .TextToDisplay & " (" & .Address & ")"
            Set linkRange = .Range
        End With

        ActiveDocument.Hyperlinks(i).Delete

        With linkRange.Font
            .Underline = wdUnderlineNone
            .ColorIndex = wdAuto
        End With

        cnt = cnt + 1
    Next i
    Debug.Print "Removed " & cnt & " link(s)"
End Sub

Sub RemoveFields()
    Dim cnt As Long, i As Long

    cnt = 0

    For i = ActiveDocument.Fields.Count To 1 Step -1
        ActiveDocument.Fields(i).Unlink

        cnt = cnt + 1
    Next i
    Debug.Print "Removed " & cnt & " field(s)"
End Sub
halfer
  • 19,824
  • 17
  • 99
  • 186
MTen
  • 1
  • 2

1 Answers1

0

I ended up finding the hint to my answer in the selected answer to this question: vbscript: replace text in activedocument with hyperlink.

Essentially, Instr() does not play well with the WYSIWYG feature of Word, but the Find method will give selections with the proper ranges. No need to remove hyperlinks nor worry about other mysterious, hidden text.

The code would look like:

Sub FixHighlightedText()
    Dim objDoc As Document
    Dim objRange As Range
    Dim startPos As Long
    Dim regex As Object
    Dim matches

    Set objDoc = ActiveDocument
    Set objRange = objDoc.Range
    startPos = 1
    Set regex = CreateObject("VBScript.RegExp")

    With regex
        .Pattern = "((\([a-zA-Z]*?[-]?Time:.*?\})[a-zA-Z0-9]{0,3})"
        .Global = True
    End With

    If regex.test(objRange.Text) Then
        Set matches = regex.Execute(objRange.Text)

        Debug.Print "Document has " & matches.Count & " matches"
        Debug.Print "Document range is " & objRange.Start & " to " & objRange.End
        Debug.Print "FirstIndex = " & matches(0).FirstIndex

        For Each match In matches
            Set objRange = objDoc.Range(startPos, objDoc.Content.End)
            With objRange.Find
                .Text = match.submatches(0)
                .MatchWholeWord = True
                .MatchCase = True
                .Wrap = wdFindStop
                .Execute
            End With
            startPos = objRange.End
            Debug.Print "Match starts at " & objRange.Start & " and ends at " & objRange.End
            Debug.Print "   match0 text = " & match.submatches(0)
            Debug.Print "   subrange text = " & objRange.Text
        Next match
    Else
        Debug.Print "No regex matches"
    End If
End Sub
halfer
  • 19,824
  • 17
  • 99
  • 186
MTen
  • 1
  • 2