
I have some JSON files in a directory, and I am working on parsing all of the files in that directory. I want to define the columns that the key values will fit under.

Currently, the data fits into those columns perfectly. The problem is that my script only takes the first JSON object from each file instead of parsing through the whole file and returning each indexed JSON object.


import os, json
import pandas as pd

#Read files from Log directory
path_to_json = r'C:\TestLogfiles'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
print(json_files)  

# Define Columns within Log files
jsons_data = pd.DataFrame(columns=['category', 'description', 'created_date', 'host', 'machine', 'user_id', 'error_code', 'process', 'thread'])

# we need both the json and an index number so use enumerate()
for index, js in enumerate(json_files):
    with open(os.path.join(path_to_json, js)) as json_file:
        json_text = json.load(json_file)
        for logMessages in json_text['logMessages']:

            #Nav thru the log entry to return a parsed list of these items
            category = logMessages['type']
            description = logMessages['message']
            created_date = logMessages['time']
            host = logMessages['source']
            machine = logMessages['machine']
            user_id = logMessages['user']
            error_code = logMessages['code']
            process = logMessages['process']
            thread = logMessages['thread']

            #push list of data into pandas DFrame at a row given by 'index'

            jsons_data.loc[index] = [category, description, created_date, host, machine, user_id, error_code, process, thread]
            
print(jsons_data)

The JSON files look like this:

{
    "hasMore": true,
    "startTime": 1663612608354,
    "endTime": 1662134983365,
    "logMessages": [
        {
            "type": "DEBUG",
            "message": "Health check took 50 ms.",
            "time": 1663612608354,
            "source": "Portal Admin",
            "machine": "machineName.domain.com",
            "user": "",
            "code": 9999,
            "elapsed": "",
            "process": "6966",
            "thread": "1",
            "methodName": "",
            "requestID": ""
        },
        {
            "type": "DEBUG",
            "message": "Checking the Sharing API took 12 ms.",
            "time": 1663612608354,
            "source": "Portal Admin",
            "machine": "machineName.domain.com",
            "user": "",
            "code": 9999,
            "elapsed": "",
            "process": "6966",
            "thread": "1",
            "methodName": "",
            "requestID": ""
        }
    ]
}

Result is:

['PortalLog.json', 'Testlog.json']
  category                                        description   created_date                              host             machine      user_id error_code process thread
0  WARNING   Failed to delete service with service URL 'ht...  1662134983365                           Sharing  MachineName.domain.com  portaladmin     200007   11623      1
1   SEVERE  Error executing tool. Export Web Map Task : Fa...  1657133904189  Utilities/PrintingTools.GPServer  MachineName.domain.com  portaladmin      20010   25561    161
  • You can store the key/value pairs as JSON strings in a column; you can then use the key/value pairs later by accessing the contents of the column and loading the data into a dictionary. – Golden Lion Sep 21 '22 at 23:00
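
For reference, here is a minimal sketch of that comment's idea, assuming the same logMessages layout as the sample above; the raw_entry column name and the C:\testlog.json path are only illustrative:

import json
import pandas as pd

#Keep each raw log entry as a JSON string in an extra column so the full
#key/value pairs stay available alongside the parsed fields
with open(r'C:\testlog.json') as f:
    data = json.load(f)

rows = []
for entry in data['logMessages']:
    rows.append({
        'category': entry['type'],
        'description': entry['message'],
        'raw_entry': json.dumps(entry),  #full entry preserved as a JSON string
    })

df = pd.DataFrame(rows)

#Later, recover the original key/value pairs from the stored string
first_entry = json.loads(df.loc[0, 'raw_entry'])
print(first_entry['machine'])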

1 Answer


I found a way to do this with much less code using @Trenton McKinney's answer in another post... Renaming column names in Pandas

import json
import pandas as pd

#Read a single log file from the Log directory
jsonFile = r'C:\testlog.json'

#Load data with json module
with open(jsonFile) as f:
    data = json.load(f)

#Flatten the nested logMessages list into one row per log entry
df_nested_list = pd.json_normalize(data, record_path=['logMessages'])

#Use df.set_axis() to rename the columns; set_axis returns a new DataFrame,
#so assign the result back rather than relying on inplace=True
df_nested_list = df_nested_list.set_axis(
    ['category', 'description', 'created_date', 'host', 'machine', 'user_id',
     'error_code', 'elapsed', 'process_id', 'thread', 'method name', 'requestID'],
    axis=1)

print(df_nested_list)

Result is:

    category                                        description   created_date  ... thread                                   method name                             requestID
0    WARNING       Physical output directory is not accessible.  1662620431270  ...    866  MapServiceLoader::PutPhysicalOutputDirectory
1    WARNING       Physical output directory is not accessible.  1662620416602  ...    778  MapServiceLoader::PutPhysicalOutputDirectory
2    WARNING  Failed to generate index candidates for '{0}'....  1662620405891  ...      1
3    WARNING  Failed to update license for relational data s...  1662620400883  ...      1
4    WARNING  Error during relational data store vacuum/anal...  1662620399934  ...      1
..       ...                                                ...            ...  ...    ...                                           ...                                   ...
995   SEVERE  Response already committed. Cannot forward to ...  1657136422579  ...      1                                                4d8ed104-80fa-4133-8098-facd6d7995a7
996   SEVERE  This exception was thrown after the response w...  1657136417283  ...      1                                                a8f438da-37ba-4f3a-82cf-3f8d4bdb15d4
997   SEVERE  Response already committed. Cannot forward to ...  1657136417281  ...      1                                                a8f438da-37ba-4f3a-82cf-3f8d4bdb15d4
998   SEVERE  Response already committed. Cannot forward to ...  1657136417281  ...      1                                                a8f438da-37ba-4f3a-82cf-3f8d4bdb15d4
999   SEVERE  Error executing tool. Export Web Map Task : Fa...  1657133904189  ...    161                    GPServerSync.CheckMessages  4da5288a-be72-4b54-8816-8daf30166d1e
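
To cover the original goal of parsing every file in the directory, here is a minimal sketch that applies the same json_normalize approach to each .json file in the question's C:\TestLogfiles folder (assuming every file shares the same logMessages layout) and stacks the results with pd.concat:

import os, json
import pandas as pd

#Read files from Log directory
path_to_json = r'C:\TestLogfiles'

frames = []
for file_name in os.listdir(path_to_json):
    if not file_name.endswith('.json'):
        continue
    with open(os.path.join(path_to_json, file_name)) as f:
        data = json.load(f)
    #One row per entry in this file's logMessages list
    frames.append(pd.json_normalize(data, record_path=['logMessages']))

#Stack all files into one DataFrame and rename the columns
all_logs = pd.concat(frames, ignore_index=True)
all_logs = all_logs.set_axis(
    ['category', 'description', 'created_date', 'host', 'machine', 'user_id',
     'error_code', 'elapsed', 'process_id', 'thread', 'method name', 'requestID'],
    axis=1)

print(all_logs)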