I have a JSON that looks like this;
{
"h1": [
{
"_value": "Health Authority Updates"
}
],
"h2": [
{
"_value": "North America"
}
],
"h3": [
{
"a": [
{
"_attributes": {
"id": "_US_guidances/regulations"
}
}
],
"_value": "US guidances/regulations"
}
],
"ol": [
{
"li": [
{
"_value": "Final Guidance: 25-May-2021:",
"a": [
{
"_attributes": {
"href": "https://www.fda.gov/regulatory-information/search-fda-guidance-documents/emergency-use-authorization-vaccines-prevent-covid-19"
},
"_value": "Emergency Use Authorization for Vaccines to Prevent COVID-19: Guidance for Industry"
}
],
"ol": [
{
"li": [
{
"_value": "First",
"ol": [
{
"li": [
{
"_value": "second"
}
]
}
]
},
{
"_value": "second"
}
]
}
]
},
{
"_value": "Final Guidance: 25-May-2021:",
"a": [
{
"_attributes": {
"href": "https://www.fda.gov/regulatory-information/search-fda-guidance-documents/emergency-use-authorization-vaccines-prevent-covid-19"
},
"_value": "Emergency Use Authorization for Vaccines to Prevent COVID-19: Guidance for Industry"
}
],
"ol": [
{
"li": [
{
"_value": "hi",
"ol": [
{
"li": [
{
"_value": "second. Check this page",
"a": [
{
"_attributes": {
"href": "https://urldefense.proofpoint.com/v2/url?u=http-3A__s2027422842.t.en25.com_e_er-3Futm-5Fcampaign-3DSBIA-253A-2520CT-2520Guidance-2520Updated-26utm-5Fmedium-3Demail-26utm-5Fsource-3DEloqua-26s-3D2027422842-26lid-3D13556-26elqTrackId-3DC52ECDADDE74615811BB817590D02682-26elq-3D2a878946f6664822b1c219ea1db8e241-26elqaid-3D12812-26elqat-3D1&d=DwMGaQ&c=sb6gdlHSSEAKVs7mNNqH8g&r=JoNDtKjZ-WLYpZe6yoHY-rfZe2_aVwx3audxLDIHLIc&m=3cHBVqsEpMXfvcFT2C3A4RIlQHNpDuAjtMSavvNj-Nw&s=r7NaqUZanfVFJLNwJUC6PFSdNjI274dhL--YmK4-Cgg&e=",
"target": "_blank"
},
"_value": "COVID MyStudies App"
}
]
}
]
}
]
},
{
"_value": "hellow"
}
]
}
]
}
]
}
]
}
I need to convert it into something like this:
{
"response": {
"docs": [
{
"region_s" : "North America",
"country_s": "US",
"ArticleHeading_s": "Emergency Use Authorization for Vaccines to Prevent COVID-19: Guidance for Industry 22",
"ArticleDate_s": "Fri Jun 22 17:29:07 UTC 2021",
"refined_summary_t": "<ol><li> first list</li><li> Second</li></ol>",
"id": "http://www.minsa.gob.pa/noticia/arranca-esperado-proceso-de-vacunacion-en-chiriqui",
"source_type_s": "Press Releases/News",
"curation_date_t":"19 Jun 2021 to 25 Jun 2021"
}
]
}
}
Ol and li keys in the previous json are basically html tags that need to be added into refined_summary. I have written this code:
class BaseDocument():
def __init__(self):
self.date = ""
self.id_url = ""
self.articleHeading = ""
self.refined_summary = ""
self.hyperlinks = []
self.text = []
self.dict = []
self.dic = {}
self.out = {
"response": {
"docs": [],
},
}
def a(self,data):
if len(data) > 1:
for i in range(0, len(data)):
self.hyperlinks.append(data[i]["_attributes"]["href"])
self.text.append(data[i]["_value"])
elif (len(data) == 1) and (self.id_url =="") and (self.articleHeading == "" ) :
self.id_url = data[0]["_attributes"]["href"]
self.articleHeading= data[0]["_value"]
elif (len(data) == 1) and (len(self.id_url) > 0 ) and (len(self.articleHeading) > 0 ) :
self.hyperlinks.append(data[0]["_attributes"]["href"])
self.text.append(data[0]["_value"])
def li(self,data):
print("data")
print(data)
for i in range(0,len(data)):
print("here")
print(data[i])
if 'a' in data[i]:
self.a(data[i]["a"])
if len(self.hyperlinks) == 0:
if "_value" in data[i]:
value = data[i]["_value"]
self.date = re.search("[0-9][0-9]-.*[A-Za-z]-[0-9][0-9][0-9][0-9]", value).group(0)
elif "_values" in data[i]:
value = data[i]["_values"][0]
self.date = re.search("[0-9][0-9]-.*[A-Za-z]-[0-9][0-9][0-9][0-9]", value).group(0)
elif len(self.hyperlinks) > 0:
self.hyperlinks = []
self.text = []
elif "a" not in data[i]:
self.refined_summary += "<li>"
self.refined_summary += data[i]["_value"]
if "ol" not in data[i]:
self.refined_summary += "</li>"
if "ol" in data[i]:
for j in range(0, len(data[i]["ol"])):
self.refined_summary += "<ol>"
self.ol(data[i]["ol"][j])
self.refined_summary += "</li>"
return
self.refined_summary += "</ol>"
self.dic["id"] = self.id_url
self.dic["ArticleDate_s"] = self.date
self.dic["ArticleHeading_s"] = self.articleHeading
self.dic["refined_summary_t"] = self.refined_summary
self.dict.append(self.dic)
self.id_url = ''
self.date = ""
self.articleHeading = ""
self.refined_summary = "<ol>"
self.dic = {}
print("data")
print(data[i])
def ol(self,data):
for i in range(0, len(data)):
self.li(data["li"])
def begin_ol(self,data):
for i in range(0, len(data)):
self.li(data["li"])
self.dic["id"] = self.id_url
self.dic["ArticleDate_s"] = self.date
self.dic["ArticleHeading_s"] = self.articleHeading
self.dic["refined_summary_t"] = self.refined_summary
self.dict.append(self.dic)
self.id_url = ''
self.date = ""
self.articleHeading = ""
self.refined_summary = "<ol>"
self.dic = {}
print("printing")
print(self.dict)
This is a class that I have created and I call it by using this:
countries = []
heading_three = output_json["h3"]
for i in range(0, len(heading_three)):
ele = heading_three[i]
countries.append(ele["_value"])
ele = output_json["ol"]
dic = []
for i in range(0, len(countries)):
print("in tis ")
print(ele[i])
self.begin_ol(ele[i])
This is sort of recursive but still has a flavour of iteration. I have created a separate function for each tag. So when a list(value) corresponding to that tag is passed to it, it iterates over it and checks for tags/keys. Whatever key it finds it calls the relevant function. The problem is that I am unable to properly put a base case for this. The resultant output looks like this
[
{
"id": "https://www.fda.gov/regulatory-information/search-fda-guidance-documents/emergency-use-authorization-vaccines-prevent-covid-19",
"ArticleDate_s": "25-May-2021",
"ArticleHeading_s": "Emergency Use Authorization for Vaccines to Prevent COVID-19: Guidance for Industry",
"refined_summary_t": "<ol><li>First<ol><li>second</li></ol>"
},
{
"id": "",
"ArticleDate_s": "",
"ArticleHeading_s": "",
"refined_summary_t": "<ol></li></li>"
}
]
I am sure there is some other Pythonic way to deal with this. But I am unable to get there. Been stuck here for past 12 hours :(