0

I have a text file:

<span class="html-tag">&lt;script&gt;</span></td></tr><tr><td class="line-number" value="1431"></td><td class="line-content">            var awbManifests = {"requestId":"16d1-4451-9b12-f61a87e9cd11","errorMessage":null,"errorCode":null,"success":true,"content":[{"id":"5ec8-444e-9d5b-f7487ce592c2","storeId":"10001","createdDate":1541923869937,"createdBy":"asdf","updatedDate":1541968417296,"updatedBy":"dsa","type":"airwaybill","value":"5468468464568466","logisticTrackingID":"5468468464568466","senderName":"dasdf","senderAddress":"Batuceper","receiverName":"ATIK","receiverAddress":"JL. SRIKATON BARAT\n","manifestList":[{"logisticProviderCode":"asd","blibliAirwayBillNumber":"5468468464568466","status":"DEPARTED FROM TRANSIT [GATEWAY JAKARTA]","timestamp":1541976677000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"RECEIVED AT ORIGIN GATEWAY [GATEWAY JAKARTA]","timestamp":1541976343000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"PROCESSED AT SORTING CENTER [JAKARTA]","timestamp":1541968348000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"RECEIVED AT SORTING CENTER [JAKARTA]","timestamp":1541960930000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"SHIPMENT RECEIVED BY asdf COUNTER OFFICER AT [JAKARTA]","timestamp":1541926728000,"additionalInfo":[{"label":"Third Party Tracking 
ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]}]}],"pageMetaData":null};</td></tr><tr><td class="line-number" value="1432"></td><td class="line-content">            var ordersTracking = [{"orderItemId":"53000116530","product":null,"shipment":"asdf","airwaybillNumber":"5468468464568466","receiver":null,"receivedDate":null,"relation":null,"status":"Valid","productType":"Regular","eligibleForFeedback":false,"feedback":null,"invalidAWBJiraNumber":"","mismatchAWBJiraNumber":"","isAirwayBillValid":true,"mismatchAirwayBill":false}];

And I want to get the result from var awbManifests = until the first ; sign, so the output should only be a JSON format like this:

{"requestId":"16d1-4451-9b12-f61a87e9cd11","errorMessage":null,"errorCode":null,"success":true,"content":[{"id":"5ec8-444e-9d5b-f7487ce592c2","storeId":"10001","createdDate":1541923869937,"createdBy":"asdf","updatedDate":1541968417296,"updatedBy":"dsa","type":"airwaybill","value":"5468468464568466","logisticTrackingID":"5468468464568466","senderName":"dasdf","senderAddress":"Batuceper","receiverName":"ATIK","receiverAddress":"JL. SRIKATON BARAT\n","manifestList":[{"logisticProviderCode":"asd","blibliAirwayBillNumber":"5468468464568466","status":"DEPARTED FROM TRANSIT [GATEWAY JAKARTA]","timestamp":1541976677000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"RECEIVED AT ORIGIN GATEWAY [GATEWAY JAKARTA]","timestamp":1541976343000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"PROCESSED AT SORTING CENTER [JAKARTA]","timestamp":1541968348000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"RECEIVED AT SORTING CENTER [JAKARTA]","timestamp":1541960930000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"SHIPMENT RECEIVED BY asdf COUNTER OFFICER AT [JAKARTA]","timestamp":1541926728000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]}]}],"pageMetaData":null}

So far I have only come up with this, but the command doesn't output the whole JSON string:

grep -o -P '(?<=var awbManifests = ).*(?=pageMetaData)' test.html

How do I fix it ?

Joe
  • 791
  • 1
  • 9
  • 24

3 Answers

1

Please don't use regular expressions to parse HTML. If you ask me, you're much better off with a tool that can parse HTML/XML and JSON, like xidel.

Parse the relevant <td> element-node:

xidel -s input.htm -e '//td[@value="1431"]/following-sibling::td'
 var awbManifests = {"requestId":"16d1-4451-9b12-f61a87e9cd11",[...],"pageMetaData":null};

Isolate the JSON:

xidel -s input.htm -e '
  //td[@value="1431"]/substring-before(substring-after(following-sibling::td,"awbManifests = "),";")
'
#or
xidel -s input.htm -e '
  //td[@value="1431"]/extract(following-sibling::td,"awbManifests = (.+);",1)
'
{"requestId":"16d1-4451-9b12-f61a87e9cd11",[...],"pageMetaData":null}

Parse the JSON:

xidel -s input.htm -e '
  json(
    //td[@value="1431"]/substring-before(substring-after(following-sibling::td,"awbManifests = "),";")
  )
'
{
  "requestId": "16d1-4451-9b12-f61a87e9cd11",
  "errorMessage": null,
  "errorCode": null,
  "success": true,
  "content": [...],
  "pageMetaData": null
}
Reino
  • 3,203
  • 1
  • 13
  • 21
0

It took me a while to understand what was going on. It's interesting because you use lookbehinds ((?<=)) and lookaheads in your regular expression and those are very useful constructs that I almost never use.

The principle of a lookahead / lookbehind is that the string inside the look* group must match, but is not included in the matched string. This is very useful with grep -o. The lookbehind (?<=var awbManifests = ) is used correctly, but the lookahead (?=pageMetaData) should be (?=;). With that change, however, the problem you run into is that the regex matches too much text.

By default, the regex quantifiers +, * and {n,m} are greedy, which means they try to match as much text as they can. In Perl mode (-P), non-greedy quantifiers are available in grep. Their syntax is +? for +, *? for *, and {n,m}? for {n,m}.

Applied here, the regex to use is:

grep -o -P '(?<=var awbManifests = ).*?(?=;)' test.html

But there is still one problem: If one of the JSON strings contains a ;, the JSON won't be completely matched. To account for strings, use instead:

grep -o -P '(?<=var awbManifests = )[^"]*("([^"]|\\")*"[^"]*?)*?(?=;)' test.html

In the above regex:

  • [^"] matches anything but a double quote.
  • \\ backslashes must be escaped because they have a meaning in regular expressions.
  • "([^"]|\\")*" matches one string, taking escaped quotes into account.
  • [^"]*("([^"]|\\")*"[^"]*)* matches several strings, with text before and after it. This pattern matches the full JSON but is greedy.
  • [^"]*("([^"]|\\")*"[^"]*?)*? matches the JSON in a non-greedy manner.
Mathieu CAROFF
  • 1,230
  • 13
  • 19
0

With Perl, you can also extract the JSON using a non-greedy quantifier:

> cat regex_jsoon.dat
<span class="html-tag">&lt;script&gt;</span></td></tr><tr><td class="line-number" value="1431"></td><td class="line-content">            var awbManifests = {"requestId":"16d1-4451-9b12-f61a87e9cd11","errorMessage":null,"errorCode":null,"success":true,"content":[{"id":"5ec8-444e-9d5b-f7487ce592c2","storeId":"10001","createdDate":1541923869937,"createdBy":"asdf","updatedDate":1541968417296,"updatedBy":"dsa","type":"airwaybill","value":"5468468464568466","logisticTrackingID":"5468468464568466","senderName":"dasdf","senderAddress":"Batuceper","receiverName":"ATIK","receiverAddress":"JL. SRIKATON BARAT\n","manifestList":[{"logisticProviderCode":"asd","blibliAirwayBillNumber":"5468468464568466","status":"DEPARTED FROM TRANSIT [GATEWAY JAKARTA]","timestamp":1541976677000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"RECEIVED AT ORIGIN GATEWAY [GATEWAY JAKARTA]","timestamp":1541976343000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"PROCESSED AT SORTING CENTER [JAKARTA]","timestamp":1541968348000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"RECEIVED AT SORTING CENTER [JAKARTA]","timestamp":1541960930000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"SHIPMENT RECEIVED BY asdf COUNTER OFFICER AT [JAKARTA]","timestamp":1541926728000,"additionalInfo":[{"label":"Third Party Tracking 
ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]}]}],"pageMetaData":null};</td></tr><tr><td class="line-number" value="1432"></td><td class="line-content">            var ordersTracking = [{"orderItemId":"53000116530","product":null,"shipment":"asdf","airwaybillNumber":"5468468464568466","receiver":null,"receivedDate":null,"relation":null,"status":"Valid","productType":"Regular","eligibleForFeedback":false,"feedback":null,"invalidAWBJiraNumber":"","mismatchAWBJiraNumber":"","isAirwayBillValid":true,"mismatchAirwayBill":false}];

> perl -ne ' { s/.*var awbManifests = (.*?);.*/\1/g; print } ' regex_jsoon.dat
{"requestId":"16d1-4451-9b12-f61a87e9cd11","errorMessage":null,"errorCode":null,"success":true,"content":[{"id":"5ec8-444e-9d5b-f7487ce592c2","storeId":"10001","createdDate":1541923869937,"createdBy":"asdf","updatedDate":1541968417296,"updatedBy":"dsa","type":"airwaybill","value":"5468468464568466","logisticTrackingID":"5468468464568466","senderName":"dasdf","senderAddress":"Batuceper","receiverName":"ATIK","receiverAddress":"JL. SRIKATON BARAT\n","manifestList":[{"logisticProviderCode":"asd","blibliAirwayBillNumber":"5468468464568466","status":"DEPARTED FROM TRANSIT [GATEWAY JAKARTA]","timestamp":1541976677000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"RECEIVED AT ORIGIN GATEWAY [GATEWAY JAKARTA]","timestamp":1541976343000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"PROCESSED AT SORTING CENTER [JAKARTA]","timestamp":1541968348000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"RECEIVED AT SORTING CENTER [JAKARTA]","timestamp":1541960930000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"SHIPMENT RECEIVED BY asdf COUNTER OFFICER AT [JAKARTA]","timestamp":1541926728000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]}]}],"pageMetaData":null}

>

Or one more version, without substitution:

> perl -ne ' { print "$x\n"  if /var awbManifests = (.*?);/osg  and $x=$1 } ' regex_jsoon.dat
{"requestId":"16d1-4451-9b12-f61a87e9cd11","errorMessage":null,"errorCode":null,"success":true,"content":[{"id":"5ec8-444e-9d5b-f7487ce592c2","storeId":"10001","createdDate":1541923869937,"createdBy":"asdf","updatedDate":1541968417296,"updatedBy":"dsa","type":"airwaybill","value":"5468468464568466","logisticTrackingID":"5468468464568466","senderName":"dasdf","senderAddress":"Batuceper","receiverName":"ATIK","receiverAddress":"JL. SRIKATON BARAT\n","manifestList":[{"logisticProviderCode":"asd","blibliAirwayBillNumber":"5468468464568466","status":"DEPARTED FROM TRANSIT [GATEWAY JAKARTA]","timestamp":1541976677000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"RECEIVED AT ORIGIN GATEWAY [GATEWAY JAKARTA]","timestamp":1541976343000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"PROCESSED AT SORTING CENTER [JAKARTA]","timestamp":1541968348000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"RECEIVED AT SORTING CENTER [JAKARTA]","timestamp":1541960930000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]},{"logisticProviderCode":"asdf","blibliAirwayBillNumber":"5468468464568466","status":"SHIPMENT RECEIVED BY asdf COUNTER OFFICER AT [JAKARTA]","timestamp":1541926728000,"additionalInfo":[{"label":"Third Party Tracking ID","value":null,"type":"STRING","description":"Third Party Tracking ID"}]}]}],"pageMetaData":null}
>
stack0114106
  • 8,534
  • 3
  • 13
  • 38