0

I have a JSON that looks like this;

{
  "h1": [
    {
      "_value": "Health Authority Updates"
    }
  ],
  "h2": [
    {
      "_value": "North America"
    }
  ],
  "h3": [
    {
      "a": [
        {
          "_attributes": {
            "id": "_US_guidances/regulations"
          }
        }
      ],
      "_value": "US guidances/regulations"
    }
  ],
  "ol": [
    {
      "li": [
        {
          "_value": "Final Guidance: 25-May-2021:",
          "a": [
            {
              "_attributes": {
                "href": "https://www.fda.gov/regulatory-information/search-fda-guidance-documents/emergency-use-authorization-vaccines-prevent-covid-19"
              },
              "_value": "Emergency Use Authorization for Vaccines to Prevent COVID-19: Guidance for Industry"
            }
          ],
          "ol": [
            {
              "li": [
                {
                  "_value": "First",
                  "ol": [
                    {
                      "li": [
                        {
                          "_value": "second"
                        }
                      ]
                    }
                  ]
                },
                {
                  "_value": "second"
                }
              ]
            }
          ]
        },
        {
          "_value": "Final Guidance: 25-May-2021:",
          "a": [
            {
              "_attributes": {
                "href": "https://www.fda.gov/regulatory-information/search-fda-guidance-documents/emergency-use-authorization-vaccines-prevent-covid-19"
              },
              "_value": "Emergency Use Authorization for Vaccines to Prevent COVID-19: Guidance for Industry"
            }
          ],
          "ol": [
            {
              "li": [
                {
                  "_value": "hi",
                  "ol": [
                    {
                      "li": [
                        {
                          "_value": "second. Check this  page",
                          "a": [
                            {
                              "_attributes": {
                                "href": "https://urldefense.proofpoint.com/v2/url?u=http-3A__s2027422842.t.en25.com_e_er-3Futm-5Fcampaign-3DSBIA-253A-2520CT-2520Guidance-2520Updated-26utm-5Fmedium-3Demail-26utm-5Fsource-3DEloqua-26s-3D2027422842-26lid-3D13556-26elqTrackId-3DC52ECDADDE74615811BB817590D02682-26elq-3D2a878946f6664822b1c219ea1db8e241-26elqaid-3D12812-26elqat-3D1&d=DwMGaQ&c=sb6gdlHSSEAKVs7mNNqH8g&r=JoNDtKjZ-WLYpZe6yoHY-rfZe2_aVwx3audxLDIHLIc&m=3cHBVqsEpMXfvcFT2C3A4RIlQHNpDuAjtMSavvNj-Nw&s=r7NaqUZanfVFJLNwJUC6PFSdNjI274dhL--YmK4-Cgg&e=",
                                "target": "_blank"
                              },
                              "_value": "COVID MyStudies App"
                            }
                          ]
                        }
                      ]
                    }
                  ]
                },
                {
                  "_value": "hellow"
                }
              ]
            }
          ]
        }
      ]
    }
  ]
}

I need to convert it into something like this:

{
    "response": {
        "docs": [
            {
                "region_s" : "North America",
                "country_s": "US",
                "ArticleHeading_s": "Emergency Use Authorization for Vaccines to Prevent COVID-19: Guidance for Industry 22",
                "ArticleDate_s": "Fri Jun 22 17:29:07 UTC 2021",
                "refined_summary_t": "<ol><li> first list</li><li> Second</li></ol>",
                "id": "http://www.minsa.gob.pa/noticia/arranca-esperado-proceso-de-vacunacion-en-chiriqui",
                "source_type_s": "Press Releases/News",
                "curation_date_t":"19 Jun 2021 to 25 Jun 2021"
            }
        
            

        ]

    }

}

Ol and li keys in the previous json are basically html tags that need to be added into refined_summary. I have written this code:

class BaseDocument():


    def __init__(self):
        self.date = ""
        self.id_url = ""
        self.articleHeading = ""
        self.refined_summary = ""
        self.hyperlinks = []
        self.text = []
        self.dict = []
        self.dic = {}
        self.out = {
            "response": {
                "docs": [],
            },
        }
    def a(self,data):

        if len(data) > 1:

            for i in range(0, len(data)):
                self.hyperlinks.append(data[i]["_attributes"]["href"])
                self.text.append(data[i]["_value"])

        elif (len(data) == 1) and (self.id_url =="") and (self.articleHeading == "" ) :
            self.id_url = data[0]["_attributes"]["href"]

            self.articleHeading= data[0]["_value"]

        elif (len(data) == 1) and (len(self.id_url) > 0 ) and (len(self.articleHeading) > 0 ) :
            self.hyperlinks.append(data[0]["_attributes"]["href"])
            self.text.append(data[0]["_value"])

    def li(self,data):
        print("data")
        print(data)
        for i in range(0,len(data)):
            print("here")
            print(data[i])
            if 'a' in data[i]:
                self.a(data[i]["a"])
                if len(self.hyperlinks) == 0:
                    if "_value" in data[i]:
                        value = data[i]["_value"]
                        self.date = re.search("[0-9][0-9]-.*[A-Za-z]-[0-9][0-9][0-9][0-9]", value).group(0)
                    elif "_values" in data[i]:
                        value = data[i]["_values"][0]

                        self.date = re.search("[0-9][0-9]-.*[A-Za-z]-[0-9][0-9][0-9][0-9]", value).group(0)

                elif len(self.hyperlinks) > 0:

                    self.hyperlinks = []
                    self.text = []
            elif "a" not in data[i]:

                self.refined_summary += "<li>"
                self.refined_summary += data[i]["_value"]
                if "ol" not in data[i]:
                    self.refined_summary += "</li>"

            if "ol" in data[i]:

                for j in range(0, len(data[i]["ol"])):
                    self.refined_summary += "<ol>"
                    self.ol(data[i]["ol"][j])
                    self.refined_summary += "</li>"
                return

        self.refined_summary += "</ol>"
        self.dic["id"] = self.id_url
        self.dic["ArticleDate_s"] = self.date
        self.dic["ArticleHeading_s"] = self.articleHeading
        self.dic["refined_summary_t"] = self.refined_summary
        self.dict.append(self.dic)
        self.id_url = ''
        self.date = ""
        self.articleHeading = ""
        self.refined_summary = "<ol>"
        self.dic = {}
        print("data")
        print(data[i])



    def ol(self,data):

        for i in range(0, len(data)):
            self.li(data["li"])
    def begin_ol(self,data):

        for i in range(0, len(data)):
            self.li(data["li"])

            self.dic["id"] = self.id_url
            self.dic["ArticleDate_s"] = self.date
            self.dic["ArticleHeading_s"] = self.articleHeading
            self.dic["refined_summary_t"] = self.refined_summary
            self.dict.append(self.dic)
            self.id_url = ''
            self.date = ""
            self.articleHeading = ""
            self.refined_summary = "<ol>"
            self.dic = {}
            print("printing")
            print(self.dict)

This is a class that I have created and I call it by using this:

countries = []
        heading_three = output_json["h3"]
        for i in range(0, len(heading_three)):
            ele = heading_three[i]
            countries.append(ele["_value"])
        ele = output_json["ol"]

        dic = []
        for i in range(0, len(countries)):
            print("in tis ")
            print(ele[i])
            self.begin_ol(ele[i])

This is sort of recursive but still has a flavour of iteration. I have created a separate function for each tag. So when a list(value) corresponding to that tag is passed to it, it iterates over it and checks for tags/keys. Whatever key it finds it calls the relevant function. The problem is that I am unable to properly put a base case for this. The resultant output looks like this

[
  {
    "id": "https://www.fda.gov/regulatory-information/search-fda-guidance-documents/emergency-use-authorization-vaccines-prevent-covid-19",
    "ArticleDate_s": "25-May-2021",
    "ArticleHeading_s": "Emergency Use Authorization for Vaccines to Prevent COVID-19: Guidance for Industry",
    "refined_summary_t": "<ol><li>First<ol><li>second</li></ol>"
  },
  {
    "id": "",
    "ArticleDate_s": "",
    "ArticleHeading_s": "",
    "refined_summary_t": "<ol></li></li>"
  }
]

I am sure there is some other Pythonic way to deal with this. But I am unable to get there. Been stuck here for past 12 hours :(

0 Answers0