1

Been wondering if it would be possible to make a simple script to check if multiple criteria are met and make necessary amendments to file.

Moving on to example of what I have and what I want to achieve.

I have an xml file with 4 lines - number, year, model and man.

If <man> is Ford or Dodge, I want no amendments to be made. But if <man> is anything other than that, then I want to check if <year> or <model> are "NA" and remove the line with "NA".

<?xml version="1.0" encoding="UTF-8"?>
<CarStuff>
    <fileName>CarExpor201217.xml</fileName>
    <numberCars>5</numberCars>
    <ref>2017XY</ref>
    <carExo id="CAR0001_01">
        <dealVen id="CAR0001_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0001_03">
            <amount>1811.10</amount>
            <lotNumber>1</lotNumber>
            <year>NA</year>             - Line must be removed
            <model>NA</model>           - Line must be removed
            <man>Acura</man>
        </soldCar>
    </carExo>
    <carExo id="CAR0002_01">
        <dealVen id="CAR0002_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0002_03">
            <amount>1811.10</amount>
            <lotNumber>1</lotNumber>
            <year>NA</year>         - Line must be kept
            <model>NA</model>       - Line must be kept
            <man>Ford</man>
        </soldCar>
    </carExo>
    <carExo id="CAR0003_01">
        <dealVen id="CAR0003_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0003_03">
            <amount>1811.10</amount>
            <lotNumber>1</lotNumber>
            <year>1997</year>       - Line must be kept
            <model>NA</model>       - Line must be removed
            <man>Bugati</man>
        </soldCar>
    </carExo>
    <carExo id="CAR0004_01">
        <dealVen id="CAR0004_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0004_03">
            <amount>1811.10</amount>
            <lotNumber>1</lotNumber>
            <year>1997</year>       - Line must be kept
            <model>NA</model>       - Line must be kept
            <man>Dodge</man>
        </soldCar>
    </carExo>
    <carExo id="CAR0005_01">
        <dealVen id="CAR0005_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0005_03">
            <amount>1811.10</amount>
            <lotNumber>2</lotNumber>
            <year>NA</year>         - Line must be kept
            <model>Charger</model>  - Line must be kept
            <man>Dodge</man>
        </soldCar>
    </carExo>
    <carExo id="CAR0005_01">
        <dealVen id="CAR0005_02">
            <name>John</name>
            <surname>Smith</surname>
        </dealVen>
        <soldCar id="CAR0005_03">
            <amount>1811.10</amount>
            <lotNumber>3</lotNumber>
            <year>NA</year>         - Line must be removed
            <model>Dot</model>      - Line must be kept
            <man>Datsun</man>
        </soldCar>
    </carExo>
</CarStuff>

Grateful for all the comments and ideas.

Mr Lister
  • 45,515
  • 15
  • 108
  • 150
user99776644
  • 23
  • 1
  • 4

3 Answers3

1

Simply use XSLT, the special-purpose language designed to do exactly what you need in transforming original XML file by removing nodes according to various criteria.

Specifically below runs the Identity Transform to copy XML as is and then excludes the nodes by your criteria for model / year / man.

XSLT (save as .xsl, a special .xml file)

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output indent="yes"/>
  <xsl:strip-space elements="*"/>

  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <xsl:template match="soldCar[man != 'Ford' and man != 'Dodge']">
    <xsl:copy>
        <xsl:copy-of select="amount|lotNumber"/>
        <xsl:if test="model != 'NA'">
            <xsl:copy-of select="model"/>
        </xsl:if>
        <xsl:if test="year != 'NA'">
            <xsl:copy-of select="year"/>
        </xsl:if>
        <xsl:copy-of select="man"/>
    </xsl:copy>
  </xsl:template>

</xsl:stylesheet>

VBA

Public Sub RunXSLT()
    Dim strFile As String, strPath As String
    ' REFERENCE MS XML, v6.0
    Dim xmlDoc As New MSXML2.DOMDocument60, xslDoc As New MSXML2.DOMDocument60 
    Dim newDoc As New MSXML2.DOMDocument60

    ' LOAD XML SOURCE
    xmlDoc.Load "C:\Path\To\Input.xml"

    ' LOAD XSL SOURCE
    xslDoc.Load "C:\Path\To\XSLT\Script.xsl"

   ' TRANSFORM SOURCE
   xmlDoc.transformNodeToObject xslDoc, newDoc
   newDoc.Save "C:\Path\To\Output.xml"

   ' RELEASE DOM OBJECTS
    Set xmlDoc = Nothing: Set xslDoc = Nothing: Set newDoc = Nothing
End Sub

Output

<?xml version="1.0" encoding="utf-8"?>
<CarStuff>
  <fileName>CarExpor201217.xml</fileName>
  <numberCars>5</numberCars>
  <ref>2017XY</ref>
  <carExo id="CAR0001_01">
    <dealVen id="CAR0001_02">
      <name>John</name>
      <surname>Smith</surname>
    </dealVen>
    <soldCar>
      <amount>1811.10</amount>
      <lotNumber>1</lotNumber>
      <man>Acura</man>
    </soldCar>
  </carExo>
  <carExo id="CAR0002_01">
    <dealVen id="CAR0002_02">
      <name>John</name>
      <surname>Smith</surname>
    </dealVen>
    <soldCar id="CAR0002_03">
      <amount>1811.10</amount>
      <lotNumber>1</lotNumber>
      <year>NA</year>
      <model>NA</model>
      <man>Ford</man>
    </soldCar>
  </carExo>
  <carExo id="CAR0003_01">
    <dealVen id="CAR0003_02">
      <name>John</name>
      <surname>Smith</surname>
    </dealVen>
    <soldCar>
      <amount>1811.10</amount>
      <lotNumber>1</lotNumber>
      <year>1997</year>
      <man>Bugati</man>
    </soldCar>
  </carExo>
  <carExo id="CAR0004_01">
    <dealVen id="CAR0004_02">
      <name>John</name>
      <surname>Smith</surname>
    </dealVen>
    <soldCar id="CAR0004_03">
      <amount>1811.10</amount>
      <lotNumber>1</lotNumber>
      <year>1997</year>
      <model>NA</model>
      <man>Dodge</man>
    </soldCar>
  </carExo>
  <carExo id="CAR0005_01">
    <dealVen id="CAR0005_02">
      <name>John</name>
      <surname>Smith</surname>
    </dealVen>
    <soldCar id="CAR0005_03">
      <amount>1811.10</amount>
      <lotNumber>2</lotNumber>
      <year>NA</year>
      <model>Charger</model>
      <man>Dodge</man>
    </soldCar>
  </carExo>
  <carExo id="CAR0005_01">
    <dealVen id="CAR0005_02">
      <name>John</name>
      <surname>Smith</surname>
    </dealVen>
    <soldCar>
      <amount>1811.10</amount>
      <lotNumber>3</lotNumber>
      <model>Dot</model>
      <man>Datsun</man>
    </soldCar>
  </carExo>
</CarStuff>
Parfait
  • 104,375
  • 17
  • 94
  • 125
  • Helpful as it shows the other way round via XSLT (and early binding) + – T.M. Dec 28 '17 at 09:20
  • Indeed. And even more XSLT is portable and not restricted to VBA. OP can use this same XSL script in other languages to transform original XML: Java, PHP, Python, even PowerShell and Bash! No `For` loops or `If...Then` logic needed. – Parfait Dec 28 '17 at 14:28
0

Sounds like you need to remove all lines that contain >NA<.

This isn't really a programming question (so it's off-topic) but here's a quick answer using Notepad++:

  • Ctrl + H to bring up the find replace dialog.

  • In the Find what: text box include your regex: .*>NA<.*\r?\n (where the \r is optional in case the file doesn't have Windows line endings).

  • Leave the Replace with: text box empty.

  • Make sure the Regular Expression radio button in the Search Mode area is selected.

  • lick Replace All and voilà! All lines containing >NA< have been removed.

NPP repl line example

(Answer adapted from this).

ashleedawg
  • 20,365
  • 9
  • 72
  • 105
  • Wish it was this simple. The Ford may also have lines with NA, but these should be kept. That is the reason I'm looking for fields where is NOT Ford or Dodge. In addition only two lines above should be checked for NA. – user99776644 Dec 27 '17 at 08:30
  • a one-time thing? or something that needs to be done repeatedly? – ashleedawg Dec 27 '17 at 08:33
  • There are a number of ways to tackle this depending on a few things like whether the file is properly formatted XML (as opposed to your sample, missing top-level, etc), as well as where this file is coming from, whether this is a 1-time change, where the data is going when you're done, etc. – ashleedawg Dec 27 '17 at 08:41
  • Two lines above should be checked when it is not Ford or Dodge. E.g. - script checks line by line, find NOT Ford or Dodge on line 4, goes up one line to 3rd and if it is NA - removes line, else leaves as is. Then goes up another line to 2nd and checks if it is NA - same logic as line 3. Resumes from the line 4 where it found NOT Ford or Dodge, and does the same operation until the end of file. – user99776644 Dec 27 '17 at 08:48
  • Unfortunately unable to share the file, but it is a well formatted xml with the fields I described being in their nodes. – user99776644 Dec 27 '17 at 08:51
  • I can't think of an easily-explained way to do that especially without seeing some data. I'm a bit of a data geek so I could likely write a quick sub to accomplish that but not without a test file... You could always make a sample one with fake data, and 10 records. (and the appropriate headers etc so its representative of the actual file). – ashleedawg Dec 27 '17 at 08:53
  • ...now I'm curious! Which lines could be "NA"? Any of the 3 besides ""? Will there always be 4 _(and only 4)_ items per record (number/year/model/man) ? – ashleedawg Dec 27 '17 at 09:00
  • I'll try to mock-up something. There's more data. but only one group requires amendments, it contains a fixed number of nodes of which and must be removed or kept based on value. – user99776644 Dec 27 '17 at 09:14
  • always just those 4 items for this node? – ashleedawg Dec 27 '17 at 09:15
  • Thanks, that's much more clear example; unfortunately I'm out of time for the moment.... in the meantime, have you tried Excel's built in XML tools? (ie, xml table or source pane options when just Ctrl+O opening the xml file in Excel?) – ashleedawg Dec 27 '17 at 10:31
0

Solution via XMLDom

You can use XMLDom and XPath to search in a so called NodeList for <man> tags not containing Dodge or Ford strings and check all siblings if they contain "NA" in order to delete them. The code below uses late binding. BTW, your xml in OP wasn't well formed (closing tag </carStuf> instead of </carStuff> - I added a little parse error routine to check this when loading.

Code

Option Explicit

Sub checkNA()
Dim xDoc        As Object    ' xml document
Dim noli, noli2 As Object    ' node list
Dim no, no2     As Object    ' node
Dim noMan       As Object    ' node <man> to check if no Dodge or Ford
Dim s           As String
Dim sFile       As String    ' xml file name

  sFile = ThisWorkbook.Path & "\xml\na_test.xml"  ' <<< change to your xml file name

' late binding xml
  Set xDoc = CreateObject("MSXML2.DOMDocument.6.0")
  xDoc.async = False: xDoc.validateOnParse = False
  xDoc.setProperty "SelectionLanguage", "XPath"
' load xml
  If xDoc.Load(sFile) Then
    Debug.Print "Loaded successfully"
  Else
    Dim xPE        As Object    ' Set xPE = CreateObject("MSXML2.IXMLDOMParseError")
    Dim strErrText As String
    Set xPE = xDoc.parseError
    With xPE
    strErrText = "Load error " & .ErrorCode & " xml file " & vbCrLf & _
                 Replace(.URL, "file:///", "") & vbCrLf & vbCrLf & _
                 xPE.reason & _
                 "Source Text: " & .srcText & vbCrLf & vbCrLf & _
                 "Line No.:    " & .Line & vbCrLf & _
                 "Line Pos.: " & .linepos & vbCrLf & _
                "File Pos.:  " & .filepos & vbCrLf & vbCrLf
    End With
    MsgBox strErrText, vbExclamation
    Set xPE = Nothing
    Exit Sub
  End If

' check items
  s = "carExo/soldCar"
  Set noli = xDoc.DocumentElement.SelectNodes(s)
  For Each no In noli
      Set noMan = no.SelectSingleNode("man")
      If Not noMan Is Nothing Then
         If InStr("Ford.Dodge" & ".", noMan.Text & ".") = 0 Then
            Debug.Print "delete", noMan.Text
            ' delete all subtags containing "NA" as text
            Set noli2 = no.SelectNodes("*")
            For Each no2 In noli2
                If no2.Text = "NA" Then
                   ' delete item
                     Debug.Print , no2.nodename & "=" & no2.Text
                     no2.ParentNode.RemoveChild no2
                End If
            Next no2

         Else
            ' Debug.Print "keep", noman.Text
         End If
      End If
  Next no

' save
  ' Debug.Print xDoc.XML
  xDoc.Save sFile      
' close
  Set xDoc = Nothing
End Sub

Edit 12/29 - Addendum

I added a second workable version of the ' check items part using some extra XPath. This alternative simply avoids two If conditions in normal code as it narrows the range of found nodes in the two node lists.

' check items
  s = "carExo/soldCar[man!='Ford'][man!='Dodge']"   ' << (1) added condition to XPath
  Set noli = xDoc.DocumentElement.SelectNodes(s)
  For Each no In noli
      Set noMan = no.SelectSingleNode("man")
      If Not noMan Is Nothing Then
         Debug.Print "delete", noMan.Text
       ' delete all subtags containing "NA" as text
         Set noli2 = no.SelectNodes("*[.='NA']")    ' << (2)added condition to XPath
         For Each no2 In noli2
           ' delete item
             Debug.Print , no2.nodename & "=" & no2.Text
             no2.ParentNode.RemoveChild no2
         Next no2
      End If
  Next no

Hint

Of course there are many streets leading to Rome, see @Parfait 's XSLT approach below.

T.M.
  • 9,436
  • 3
  • 33
  • 57
  • Great! Simple and easy to understand. I'll just need to amend it slightly, just to find node no matter the parent node. – user99776644 Dec 28 '17 at 06:31
  • 1
    In case anyone would need it, maybe there's a simpler way, but did it like so: Set noli = xDoc.DocumentElement.SelectNodes("descendant::soldCar") – user99776644 Dec 28 '17 at 06:43
  • Glad I could be helpful. I edited my answer with a 2nd workable version permitting to avoid two `If` conditions by including them into the XPath string. – T.M. Dec 28 '17 at 09:42
  • You can search nodes at any level via a double slash prefix `//`, too. BTW, it isn't necessary in this case to use `DocumentElement` designing the root node, precisely because **all** XML levels are included. – T.M. Dec 28 '17 at 10:19