I have 100,000 very large JSON files from which I need to extract specific elements. To avoid memory overload I am using the Python library ijson, which works fine as long as I precede every pass with f.seek(0) to move the file pointer back to the start, but those repeated passes make my processing very slow. If I remove the f.seek(0) calls, I instead get this error (a small reproduction of what I believe is happening is shown right after the JSON below):
premature EOF
Partial JSON:
{
"info": {
"added": 1638217153.782366,
"started": 1638261651.130148,
"duration": 15,
"ended": 1638261666.212257,
"owner": null,
"score": 0.2,
"id": 5062,
"category": "file",
"git": {
"head": "13cbe0d9e457be3673304533043e992ead1ea9b2",
"fetch_head": "13cbe0d9e457be3673304533043e992ead1ea9b2"
},
"monitor": "2deb9ccd75d5a7a3fe05b2625b03a8639d6ee36b",
"package": "dll",
"route": "internet",
"custom": null,
"machine": {
"status": "stopped",
"name": "192.168.56.1012",
"label": "192.168.56.1012",
"manager": "VirtualBox",
"started_on": "2021-11-30 08:40:51",
"shutdown_on": "2021-11-30 08:41:06"
},
"platform": "windows",
"version": "2.0.7",
"options": "procmemdump=yes,route=internet"
},
"network": {
"domains": [
{
"ip": "87.101.200.41",
"domain": "www.msftncsi.com"
},
{
"ip": "131.107.255.255",
"domain": "dns.msftncsi.com"
}
]
},
"signatures": [
{
"families": [],
"description": "This executable has a PDB path",
"severity": 1,
"ttp": {},
"markcount": 1,
"references": [],
"marks": [
{
"category": "pdb_path",
"ioc": "MsiHnd.pdb",
"type": "ioc",
"description": null
}
],
"name": "has_pdb"
}
],
"behavior": {
"generic": [
{
"process_path": "C:\\Windows\\System32\\lsass.exe",
"process_name": "lsass.exe",
"pid": 496,
"summary": {},
"first_seen": 1638224353.328125,
"ppid": 380
}
],
"apistats": {
"2336": {
"NtQueryValueKey": 2,
"LdrUnloadDll": 1,
"NtCreateSection": 1,
"LoadStringW": 2,
"CreateActCtxW": 4,
"NtOpenKey": 2,
"NtUnmapViewOfSection": 4,
"MessageBoxTimeoutW": 1,
"SetUnhandledExceptionFilter": 1,
"SetErrorMode": 1,
"NtCreateFile": 1,
"NtClose": 17,
"GetSystemTimeAsFileTime": 1,
"LdrLoadDll": 1,
"NtTerminateProcess": 3,
"GetFileAttributesW": 2,
"NtMapViewOfSection": 1
}
},
"summary": {
"file_opened": [
"C:\\Windows\\System32\\en-US\\KERNELBASE.dll.mui"
]
}
}
}
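To illustrate what I think is going on (this is my assumption, not something I have confirmed): ijson reads the file as a forward-only stream, so once one ijson.items() call has consumed the stream, a second call on the same handle starts at EOF and fails unless I rewind first:

import io
import ijson

# Toy reproduction of my assumption: exhausting the first items() generator
# drains the stream, so a second call sees no bytes and raises
# ijson's IncompleteJSONError ("premature EOF") unless we rewind.
buf = io.BytesIO(b'{"a": [1, 2], "b": [3]}')
print(sum(1 for _ in ijson.items(buf, 'a.item')))  # 2
buf.seek(0)  # removing this rewind reproduces the premature EOF error
print(sum(1 for _ in ijson.items(buf, 'b.item')))  # 1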
Current code (with the multiple f.seek(0) calls that I don't want to use):
import glob
import os
from collections import Counter

import ijson

my_file_list = glob.glob("data/jsons/test.json")
final_result = []
i = 0
for filename in my_file_list:
    try:
        with open(filename, 'r', encoding='utf8', errors='ignore') as f:
            row = {}
            # First pass: pull the scalar fields out of "info", then stop.
            parse_events = ijson.parse(f, use_float=True)
            for prefix, event, value in parse_events:
                if prefix == 'info.added':
                    row['added'] = value
                elif prefix == 'info.started':
                    row['started'] = value
                elif prefix == 'info.duration':
                    row['duration'] = value
                elif prefix == 'info.ended':
                    row['ended'] = value
                elif prefix == 'info' and event == 'end_map':
                    break
            # Every count below needs its own pass over the file,
            # hence all the f.seek(0) calls.
            f.seek(0)
            row['AF-DomainCount'] = sum(1 for _ in ijson.items(f, 'network.domains.item'))
            f.seek(0)
            row['AG-SignatureCount'] = sum(1 for _ in ijson.items(f, 'signatures.item'))
            f.seek(0)
            row['AH-GenericCount'] = sum(1 for _ in ijson.items(f, 'behavior.generic.item'))
            f.seek(0)
            row['AI-ApistatCount'] = sum(1 for _ in ijson.items(f, 'behavior.apistats'))
            f.seek(0)
            row['AJ-ProcessCount'] = sum(1 for _ in ijson.items(f, 'behavior.processes.item'))
            f.seek(0)
            row['AK-SummaryCount'] = sum(1 for _ in ijson.items(f, 'behavior.summary'))
            f.seek(0)
            # Merge the per-PID API-call counters into the row.
            apistats_element = ijson.items(f, 'behavior.apistats')
            for inner_apistats in apistats_element:
                for index, inner_fields in inner_apistats.items():
                    row = dict(Counter(row) + Counter(inner_fields))
            row['AA-Filename'] = os.path.basename(filename)
            i += 1
            print(f"processed file {i}", end='\r')
    except Exception as e:
        print(f"Filename {filename} has issue with {e}")
        row = {}
    if row:
        final_result.append(row)
Output (the resulting row):
{
'added': 1638217153.782366,
'started': 1638261651.130148,
'duration': 15,
'ended': 1638261666.212257,
'AF-DomainCount': 2,
'AG-SignatureCount': 1,
'AH-GenericCount': 1,
'AI-ApistatCount': 1,
'AK-SummaryCount': 1,
'NtQueryValueKey': 2,
'LdrUnloadDll': 1,
'NtCreateSection': 1,
'LoadStringW': 2,
'CreateActCtxW': 4,
'NtOpenKey': 2,
'NtUnmapViewOfSection': 4,
'MessageBoxTimeoutW': 1,
'SetUnhandledExceptionFilter': 1,
'SetErrorMode': 1,
'NtCreateFile': 1,
'NtClose': 17,
'GetSystemTimeAsFileTime': 1,
'LdrLoadDll': 1,
'NtTerminateProcess': 3,
'GetFileAttributesW': 2,
'NtMapViewOfSection': 1,
'AA-Filename': 'test.json'
}
I am not sure whether the cause is the one described in "Using python ijson to read a large json file with multiple json objects", namely that ijson is not able to work on multiple JSON elements at once.
Also, please let me know of any other Python package, or any sample example, that can handle large JSON files without memory issues.
Edit: If I reuse parse_events without f.seek(0), only row['AF-DomainCount'] returns the correct value; all the other counts come back as 0:
row['AF-DomainCount'] = sum(1 for _ in ijson.items(parse_events, 'network.domains.item'))
row['AG-SignatureCount'] = sum(1 for _ in ijson.items(parse_events, 'signatures.item'))
row['AH-GenericCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.generic.item'))
row['AI-ApistatCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.apistats'))
row['AJ-ProcessCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.processes.item'))
row['AK-SummaryCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.summary'))
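I suspect (again, my assumption) that this happens because parse_events is a single generator: the first ijson.items(parse_events, ...) call consumes events through the end of the document while searching for its prefix, so all the later calls iterate an already-exhausted generator and count 0. A toy illustration:

import io
import ijson

# Toy illustration of my assumption: one event generator can only be
# consumed once, so only the first items() call sees any events.
events = ijson.parse(io.BytesIO(b'{"a": [1, 2], "b": [3, 4]}'))
print(sum(1 for _ in ijson.items(events, 'a.item')))  # 2
print(sum(1 for _ in ijson.items(events, 'b.item')))  # 0, generator exhausted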
Note: This is not an assignment; it is a real-life issue I am facing. Basically, I need some way to avoid calling f.seek(0) multiple times and to make my script faster with ijson.
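For reference, this is the direction I am currently experimenting with: deriving all the counts and sums from a single ijson.parse() pass, so each file is read exactly once and no f.seek(0) is needed. This is only a sketch under my assumptions (the prefix strings match my sample JSON above, and one_pass_row is just a name I made up); I would appreciate knowing whether this is the idiomatic way to do it with ijson.

import glob
import os
from collections import Counter

import ijson

# Array prefixes whose elements I want to count; assumes every array element
# is an object (true in my sample JSON), so counting start_map events at the
# element prefix equals counting items.
ARRAY_COUNT_KEYS = {
    'network.domains.item': 'AF-DomainCount',
    'signatures.item': 'AG-SignatureCount',
    'behavior.generic.item': 'AH-GenericCount',
    'behavior.processes.item': 'AJ-ProcessCount',
}
INFO_FIELDS = {'info.added': 'added', 'info.started': 'started',
               'info.duration': 'duration', 'info.ended': 'ended'}

def one_pass_row(filename):
    row = Counter()
    with open(filename, 'r', encoding='utf8', errors='ignore') as f:
        for prefix, event, value in ijson.parse(f, use_float=True):
            if prefix in INFO_FIELDS:
                row[INFO_FIELDS[prefix]] = value
            elif event == 'start_map' and prefix in ARRAY_COUNT_KEYS:
                row[ARRAY_COUNT_KEYS[prefix]] += 1
            elif event == 'start_map' and prefix == 'behavior.apistats':
                row['AI-ApistatCount'] += 1
            elif event == 'start_map' and prefix == 'behavior.summary':
                row['AK-SummaryCount'] += 1
            elif event == 'number' and prefix.startswith('behavior.apistats.'):
                # e.g. 'behavior.apistats.2336.NtQueryValueKey': sum the count
                # per API name, which is what my Counter merge above does.
                row[prefix.rsplit('.', 1)[1]] += value
    result = dict(row)
    result['AA-Filename'] = os.path.basename(filename)
    return result

final_result = [one_pass_row(fn) for fn in glob.glob("data/jsons/test.json")]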