1

NOTES: the fragment below (from .\Anaconda3\pandas\core\indexes\base.py) raises the error "'NoneType' object has no attribute 'keys'" (see listing below).

if method is None:
    if tolerance is not None:
        raise ValueError(
            "tolerance argument only valid if using pad, "
            "backfill or nearest lookups"
        )
    casted_key = self._maybe_cast_indexer(key)
    try:
        return self._engine.get_loc(casted_key)
    except KeyError as err:
        raise KeyError(key) from err
    except TypeError:
        # If we have a listlike key, _check_indexing_error will raise
        #  InvalidIndexError. Otherwise we fall through and re-raise
        #  the TypeError.
        self._check_indexing_error(key)
        raise

When the casted_key can't be read from a dictionary of Nonetype, the code fails. (from base.py)

PROBLEM: In a json file, there can be multiple levels. When parsing a dictionary inside of a list, the code errors out because it cannot pull the casted_key from the level required as it is null at that lower level. The dictionaries are cast inside of a list or a tuple. Would a lambda be written to get around this null value while pulling the df[columns - sql_type - data]?

What would that lambda look like?

.apply(lambda x: strorNan(x))

 with open(r'D:\\meta.json', encoding='utf-16') as data_file:
        data = json.loads(data_file.read())
    df=pd.json_normalize(data)

    result=pd.concat([pd.json_normalize(df['columns'][k]['sql_type'], 'data') for k in df['columns'].keys()])

JSON STRUCTURE: SAMPLE FILE

{
  "^o":"MetaDataTable",
  "filename":"meta.csv",
  "schema":"meta",
  "tablename":"meta",
  "headers":[
    "fidped"
  ],
  "num_rows":6,
  "num_cols":3,
  "columns":{
    "fidped":{
      "^o":"MetaDataColumn",
      "name":"fidped",
      "number":27,
      "sql_type":{
        "^o":"SQLDate",
        "data":[
          {
              "^O":"Date",
              "year":2019,
              "month":12,
              "day":31,
              "start":2299161.0},
          null,
          {
              "^O":"Date",
              "year":2021,
              "month":9,
              "day":30,
              "start":2299161.0},
          {
              "^O":"Date",
              "year":2022,
              "month":9,
              "day":30,
              "start":2299161.0},
          {
              "^O":"Date",
              "year":2011,
              "month":12,
              "day":31,
              "start":2299161.0},
          {
              "^O":"Date",
              "year":2020,
              "month":6,
              "day":30,
              "start":2299161.0},
          {
              "^O":"Date",
              "year":2016,
              "month":12,
              "day":31,
              "start":2299161.0}
        ],            
        "nullable":true,
        "bytes":4,
        "type":"date"
      },
      "rows":6,
      "min":null,
      "max":null,
      "distribution":{
        "^#2d70":[
          {
              "^O":"Date",
              "year":2019,
              "month":12,
              "day":31,
              "start":2299161.0},
          12
        ],
        "^#2d71":[
          null,
          360
        ],
        "^#2d72":[
          {
              "^O":"Date",
              "year":2021,
              "month":9,
              "day":30,
              "start":2299161.0},
          43
        ],
        "^#2d73":[
          {
              "^O":"Date",
              "year":2022,
              "month":9,
              "day":30,
              "start":2299161.0},
          59
        ],
        "^#2d74":[
          {
              "^O":"Date",
              "year":2011,
              "month":12,
              "day":31,
              "start":2299161.0},
          115
        ],
        "^#2d75":[
          {
              "^O":"Date",
              "year":2010,
              "month":6,
              "day":30,
              "start":2299161.0},
          2
        ]
      },
      "num_distinct":6,
      "distinct":[ {
            "^O":"Date",
            "year":2001,
            "month":2,
            "day":28,
            "start":2299161.0},
        {
            "^O":"Date",
            "year":2003,
            "month":8,
            "day":31,
            "start":2299161.0},
        {
            "^O":"Date",
            "year":2004,
            "month":6,
            "day":30,
            "start":2299161.0},
        {
            "^O":"Date",
            "year":2004,
            "month":11,
            "day":30,
            "start":2299161.0},
        {
            "^O":"Date",
            "year":2005,
            "month":2,
            "day":28,
            "start":2299161.0},
        {
            "^O":"Date",
            "year":2005,
            "month":4,
            "day":30,
            "start":2299161.0}
        ]               
     }
  }
}

POSSIBLE SOLUTION - in need of a method to circumvent the base.py which expects the dictionary object being iterated to not be null (nonetype):

def transformMDY(x): 
    """Attempt to merge 'year'/'month'/'day' fields of *x* into a single
    'Full_Date' datetime column.

    NOTE(review): as posted this cannot run — it mixes dict-style
    iteration (x.items()) with pandas Series operations
    (.astype/.apply) on the same object, and calls fullColumnDate(),
    which is not defined anywhere in this listing.
    """
    for k,v in x.items():
        # NOTE(review): k and v are never used inside this branch; the
        # body operates on whole columns of x instead.
        if isinstance(v, dict):
            x['month'] =x['month'].astype(str)
            # fullColumnDate is undefined in this listing — TODO confirm
            # what it was meant to do (presumably zero-padding).
            x['month'] =x['month'].apply(lambda x: fullColumnDate(x))
            x['day'] =x['day'].astype(str)
            x['day'] =x['day'].apply(lambda x: fullColumnDate(x))
            dtMerge=pd.DataFrame()
            # Copy each column out element-by-element and rebuild each
            # as its own one-column DataFrame.
            dfyr=[i for i in x['year']]
            dfmo=[i for i in x['month']]
            dfdy=[i for i in x['day']]
            dfyr=pd.DataFrame(dfyr)      
            dfmo=pd.DataFrame(dfmo) 
            dfdy=pd.DataFrame(dfdy)
            dtMerge['year']=dfyr
            dtMerge['month']=dfmo
            dtMerge['day']=dfdy
            # Build "dd/mm/yyyy" strings and parse them to datetime64.
            dtMerge['Full_Date'] = pd.to_datetime(dtMerge['day'].astype("str")+"/"+dtMerge['month'].astype("str")+"/"+dtMerge["year"].astype("str"), format = "%d/%m/%Y")
            dtMerge.drop('year',axis=1,inplace=True)
            dtMerge.drop('month',axis=1,inplace=True)
            dtMerge.drop('day',axis=1,inplace=True)
            # NOTE(review): returns on the FIRST dict-valued entry,
            # silently skipping all later items.
            return dtMerge
        else:
            # NOTE(review): rebinding the loop variable v has no effect
            # on x; nulls are NOT replaced with "1970-01-01" here.
            v="1970-01-01"
            
    return (x)


main()
jamiel22
  • 117
  • 6
  • if you are reading json into a dataframe, then you can skip the `with open()` and go directly to `pd.read_json()`. This will probably save a lot of work. – D.L Dec 30 '22 at 21:16
  • Thanks D.L but your solution causes an error. I think the problem is the call to ".\Anaconda3\pandas\core\indexes\base.py". There are no exceptions built into the "base.py" code allowing for a null dictionary type within a list or tuple. I'm not familiar with Python enough to know how to write my way around it. – jamiel22 Jan 03 '23 at 15:07
  • Using Spark, the following shows: "Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the referenced columns only include the internal corrupt record column (named _corrupt_record by default). For example: spark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count() and spark.read.schema(schema).json(file).select("_corrupt_record").show(). Instead, you can cache or save the parsed results and then send the same query. For example, val df = spark.read.schema(schema).json(file).cache() and then df.filter($"_corrupt_record".isNotNull).count()." – jamiel22 Jan 05 '23 at 13:12
  • I found something similar here - https://stackoverflow.com/questions/55799314/using-python-to-create-complex-json-with-the-creation-of-some-nested-arrays-bein --- would be nice to find a way to construct a class that would read the complex structure in a serialized fashion. – jamiel22 Jan 06 '23 at 12:58
  • Also, wrapping my thoughts around this answer. https://stackoverflow.com/questions/4014621/a-python-class-that-acts-like-dict – jamiel22 Jan 09 '23 at 11:04

1 Answers1

1
def empty_date_dict():
    """Return a placeholder "Date" record pinned to the Unix epoch.

    Substituted for every null entry in the JSON "data" list so that
    downstream date parsing never receives a NoneType.
    """
    return {
        '^O': 'Date',
        'day': '01',
        'month': '01',
        'start': '2299161.0',
        'year': '1970',
    }

def transformMDY(x):
    """Collapse separate year/month/day columns into one datetime column.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain 'year', 'month' and 'day' columns holding ints or
        numeric strings (e.g. rows built from the JSON "Date" dicts).

    Returns
    -------
    pandas.DataFrame
        A new frame with a single 'Full_Date' datetime64 column.  The
        input frame is NOT modified (the original version converted the
        'month'/'day' columns in place via chained assignment, which
        triggers SettingWithCopyWarning and mutates the caller's data).
    """
    import pandas as pd

    # Vectorised "dd/mm/yyyy" string build — astype(str) replaces the
    # per-row str() loop and the element-by-element column copies.
    date_str = (
        x['day'].astype(str)
        + "/"
        + x['month'].astype(str)
        + "/"
        + x['year'].astype(str)
    )

    # Keep only the combined column, matching the original's output
    # after its year/month/day drops.
    dtMerge = pd.DataFrame()
    dtMerge['Full_Date'] = pd.to_datetime(date_str, format="%d/%m/%Y")
    return dtMerge


def main(): 
    """Load the UTF-16 meta.json file, replace null entries in the
    fidped sql_type "data" list with epoch placeholders, and collapse
    the year/month/day records into a single Full_Date column.

    Returns the resulting DataFrame (previously discarded) so callers
    can inspect it; existing callers that ignore the return value are
    unaffected.
    """
    import json
    import pandas as pd

    with open(r'D:\\meta.json', encoding='utf-16') as data_file:
        data = json.loads(data_file.read())

    # Reach down into the nested structure; entries in this list are
    # either "Date" dicts or null (the nulls are what broke
    # pandas' index lookup in base.py).
    date_records = data["columns"]['fidped']['sql_type']['data']

    # Substitute a 1970-01-01 placeholder for every null so that the
    # DataFrame constructor and the date merge never see a NoneType.
    # (Replaces the original counter loop with its bare `except:` that
    # tried to append to a dict and would itself have raised.)
    fidped_list = [
        empty_date_dict() if rec is None else rec for rec in date_records
    ]

    # One row per Date dict (columns: ^O, year, month, day, start).
    d = pd.DataFrame(fidped_list)

    # Transform month/day/year into a single datetime column.
    fidped_fixed = transformMDY(d)
    return fidped_fixed
jamiel22
  • 117
  • 6