I am trying to convert every HTML node into its XPath. Here is a sample input. Based on this HTML, I am looking for the XPath of every child node:
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title">
<b>
The Dormouse's story
</b>
</p>
<span>Hello</span>
</body>
</html>
Output I want
html
html/head
html/head/title
html/body
html/body/p
What I currently have (the JSON printed by the code below):
{
"name": "[document]",
"attr": {},
"children": [
{
"name": "html",
"attr": {},
"children": [
{
"name": "head",
"attr": {},
"children": [
{
"name": "title",
"attr": {},
"children": []
}
]
},
{
"name": "body",
"attr": {},
"children": [
{
"name": "p",
"attr": {
"class": [
"title"
]
},
"children": [
{
"name": "b",
"attr": {},
"children": []
}
]
},
{
"name": "span",
"attr": {},
"children": []
}
]
}
]
}
]
}
The code
try:
import os
import lxml.etree
from bs4 import BeautifulSoup
import json
import etree
except Exception as e:
pass
def traverse(soup):
    """Convert a parsed HTML node into a nested dict describing its tags.

    Args:
        soup: A node exposing ``name``, ``attrs`` and ``children`` (for
            example the ``BeautifulSoup`` document object or a tag in it).

    Returns:
        A dict with the keys ``"name"``, ``"attr"`` and ``"children"``, where
        ``"children"`` holds the same structure for every child whose
        ``name`` is not ``None``. Returns ``None`` when ``soup.name`` is
        ``None``.
    """
    # Guard clause: nodes without a name are not described.
    if soup.name is None:
        return None

    return {
        'name': soup.name,
        # Shallow copy so that editing the result does not silently modify
        # the attribute dict owned by the parsed node.
        'attr': dict(soup.attrs),
        'children': [
            traverse(child)
            for child in soup.children
            # Skip unnamed children up front so the list never contains None.
            if child.name is not None
        ],
    }
# Read the whole HTML document from disk; the file is closed as soon as the
# ``with`` block exits.
with open("html.txt", "r") as f:
    data = f.read()

# Parse the markup with the 'html.parser' backend and convert the resulting
# tree into nested dicts via traverse().
soup = BeautifulSoup(data, 'html.parser')
JsonDom = traverse(soup)

# Print the nested structure as JSON, indented by four spaces.
print(json.dumps(JsonDom, indent=4))
Any help would be appreciated, and it would be great if you could also point me in the right direction.
Any ideas or suggestions are welcome. I did look into lxml, bs4 and Selenium, but unfortunately had no luck.