0

I'm working on a project that scrapes websites and outputs their HTML as JSON. The only parts of that JSON that are useful to us are the "form" elements.

I wanted to filter for those, but the native array filter only works when I know where the element sits relative to the entire page (the DOM?), which won't always be the case. I also fear that checking every object's values until I reach the desired one isn't viable due to:

  1. some pages being humongous, and
  2. "form" also appearing as a string in other places that we don't want to match.

This is in Node.js.

Snippet of input:

[
  {
    "type": "element",
    "tagName": "p",
    "attributes": [],
    "children": [
      {
        "type": "text",
        "content": "This is how the HTML code above will be displayed in a browser:"
      }
    ]
  },
  {
    "type": "text",
    "content": "\n"
  },
  {
    "type": "element",
    "tagName": "form",
    "attributes": [
      {
        "key": "action",
        "value": "/action_page.php"
      },
      {
        "key": "target",
        "value": "_blank"
      }
    ],
    "children": [
      {
        "type": "text",
        "content": "\nFirst name:"
      },
      {
        "type": "element",
        "tagName": "br",
        "attributes": [],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      },
      {
        "type": "element",
        "tagName": "input",
        "attributes": [
          {
            "key": "type",
            "value": "text"
          },
          {
            "key": "name",
            "value": "firstname0"
          },
          {
            "key": "value",
            "value": "John"
          }
        ],
        "children": []
      },
      {
        "type": "element",
        "tagName": "br",
        "attributes": [],
        "children": []
      },
      {
        "type": "text",
        "content": "\nLast name:"
      },
      {
        "type": "element",
        "tagName": "br",
        "attributes": [],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      },
      {
        "type": "element",
        "tagName": "input",
        "attributes": [
          {
            "key": "type",
            "value": "text"
          },
          {
            "key": "name",
            "value": "lastname0"
          },
          {
            "key": "value",
            "value": "Doe"
          }
        ],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      },
      {
        "type": "element",
        "tagName": "br",
        "attributes": [],
        "children": []
      },
      {
        "type": "element",
        "tagName": "br",
        "attributes": [],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      },
      {
        "type": "element",
        "tagName": "input",
        "attributes": [
          {
            "key": "type",
            "value": "submit"
          },
          {
            "key": "value",
            "value": "Submit"
          }
        ],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      },
      {
        "type": "element",
        "tagName": "input",
        "attributes": [
          {
            "key": "type",
            "value": "reset"
          }
        ],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      }
    ]
  },
  {
    "type": "text",
    "content": "\n"
  }
]

A snippet of output:

[
  {
    "type": "element",
    "tagName": "form",
    "attributes": [
      {
        "key": "action",
        "value": "/action_page.php"
      },
      {
        "key": "target",
        "value": "_blank"
      }
    ],
    "children": [
      {
        "type": "text",
        "content": "\nFirst name:"
      },
      {
        "type": "element",
        "tagName": "br",
        "attributes": [],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      },
      {
        "type": "element",
        "tagName": "input",
        "attributes": [
          {
            "key": "type",
            "value": "text"
          },
          {
            "key": "name",
            "value": "firstname0"
          },
          {
            "key": "value",
            "value": "John"
          }
        ],
        "children": []
      },
      {
        "type": "element",
        "tagName": "br",
        "attributes": [],
        "children": []
      },
      {
        "type": "text",
        "content": "\nLast name:"
      },
      {
        "type": "element",
        "tagName": "br",
        "attributes": [],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      },
      {
        "type": "element",
        "tagName": "input",
        "attributes": [
          {
            "key": "type",
            "value": "text"
          },
          {
            "key": "name",
            "value": "lastname0"
          },
          {
            "key": "value",
            "value": "Doe"
          }
        ],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      },
      {
        "type": "element",
        "tagName": "br",
        "attributes": [],
        "children": []
      },
      {
        "type": "element",
        "tagName": "br",
        "attributes": [],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      },
      {
        "type": "element",
        "tagName": "input",
        "attributes": [
          {
            "key": "type",
            "value": "submit"
          },
          {
            "key": "value",
            "value": "Submit"
          }
        ],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      },
      {
        "type": "element",
        "tagName": "input",
        "attributes": [
          {
            "key": "type",
            "value": "reset"
          }
        ],
        "children": []
      },
      {
        "type": "text",
        "content": "\n"
      }
    ]
  }
]

TL;DR: retain only the form elements and all of their children.

Md. Abu Taher
  • 17,395
  • 5
  • 49
  • 73

1 Answer

1

First of all, this input looks incomplete; it could be an array or an object. Assuming it's an array of objects, you can use jsonpath to access any of the values. In the snippet below, `nodes` is the parsed input.

var jp = require('jsonpath');
var formNodes = jp.query(nodes, `$..[?(@.tagName=="form")]`);

You can achieve the same using vanilla JavaScript; there are several Stack Overflow questions about that. But I found jsonpath and XPath easier to use than those approaches.

Md. Abu Taher
  • 17,395
  • 5
  • 49
  • 73
  • thanks a lot, it worked, really grateful for your help – Ramii Ahmed Aug 26 '21 at 14:19
  • now i have filtered that & im working on attaching coordinates to each element, i have done the coordinates part (elementhandler boundingbox) and need only return value & type, any ideas of how i can parse these data from JSON then append them back to their respective object? all solutions i have found to this issue are too hardcoded/specific – Ramii Ahmed Aug 26 '21 at 23:52
  • kindly create a new question with details – Md. Abu Taher Aug 28 '21 at 06:29