
I have 100 thousand very large JSON files that I need to process for specific elements. To avoid memory overload I am using a Python library called ijson, which works fine as long as I rewind with f.seek(0) before processing each element, but that makes my processing very, very slow. If I remove the f.seek(0) calls, I get this error:

premature EOF

Partial JSON:

{
"info": {
    "added": 1638217153.782366, 
    "started": 1638261651.130148, 
    "duration": 15, 
    "ended": 1638261666.212257, 
    "owner": null, 
    "score": 0.2, 
    "id": 5062, 
    "category": "file", 
    "git": {
        "head": "13cbe0d9e457be3673304533043e992ead1ea9b2", 
        "fetch_head": "13cbe0d9e457be3673304533043e992ead1ea9b2"
    }, 
    "monitor": "2deb9ccd75d5a7a3fe05b2625b03a8639d6ee36b", 
    "package": "dll", 
    "route": "internet", 
    "custom": null, 
    "machine": {
        "status": "stopped", 
        "name": "192.168.56.1012", 
        "label": "192.168.56.1012", 
        "manager": "VirtualBox", 
        "started_on": "2021-11-30 08:40:51", 
        "shutdown_on": "2021-11-30 08:41:06"
    }, 
    "platform": "windows", 
    "version": "2.0.7", 
    "options": "procmemdump=yes,route=internet"
}, 
"network": {
    "domains": [
        {
            "ip": "87.101.200.41", 
            "domain": "www.msftncsi.com"
        }, 
        {
            "ip": "131.107.255.255", 
            "domain": "dns.msftncsi.com"
        }
    ]
}, 
"signatures": [
    {
        "families": [], 
        "description": "This executable has a PDB path", 
        "severity": 1, 
        "ttp": {}, 
        "markcount": 1, 
        "references": [], 
        "marks": [
            {
                "category": "pdb_path", 
                "ioc": "MsiHnd.pdb", 
                "type": "ioc", 
                "description": null
            }
        ], 
        "name": "has_pdb"
    }
],
"behavior": {
    "generic": [
        {
            "process_path": "C:\\Windows\\System32\\lsass.exe", 
            "process_name": "lsass.exe", 
            "pid": 496, 
            "summary": {}, 
            "first_seen": 1638224353.328125, 
            "ppid": 380
        }
    ], 
    "apistats": {
        "2336": {
            "NtQueryValueKey": 2, 
            "LdrUnloadDll": 1, 
            "NtCreateSection": 1, 
            "LoadStringW": 2, 
            "CreateActCtxW": 4, 
            "NtOpenKey": 2, 
            "NtUnmapViewOfSection": 4, 
            "MessageBoxTimeoutW": 1, 
            "SetUnhandledExceptionFilter": 1, 
            "SetErrorMode": 1, 
            "NtCreateFile": 1, 
            "NtClose": 17, 
            "GetSystemTimeAsFileTime": 1, 
            "LdrLoadDll": 1, 
            "NtTerminateProcess": 3, 
            "GetFileAttributesW": 2, 
            "NtMapViewOfSection": 1
        }
    },
    "summary": {
        "file_opened": [
            "C:\\Windows\\System32\\en-US\\KERNELBASE.dll.mui"
        ]
    }
}

}

Current code, with the multiple f.seek(0) calls that I don't want to use :)

import glob
import os
from collections import Counter

import ijson

my_file_list = glob.glob("data/jsons/test.json")
final_result = []
i = 0
for filename in my_file_list:
    try:
        with open(filename, 'r', encoding='utf8', errors='ignore') as f:
            row = {}
            parse_events = ijson.parse(f, use_float=True)
            for prefix, event, value in parse_events:
                if prefix == 'info.added':
                    row['added'] = value
                elif prefix == 'info.started':
                    row['started'] = value
                elif prefix == 'info.duration':
                    row['duration'] = value
                elif prefix == 'info.ended':
                    row['ended'] = value
                elif prefix == 'info' and event == 'end_map':
                    break
            
            f.seek(0)
            row['AF-DomainCount'] = sum(1 for _ in ijson.items(f, 'network.domains.item'))
            f.seek(0)
            row['AG-SignatureCount'] = sum(1 for _ in ijson.items(f, 'signatures.item'))
            f.seek(0)
            row['AH-GenericCount'] = sum(1 for _ in ijson.items(f, 'behavior.generic.item'))
            f.seek(0)
            row['AI-ApistatCount'] = sum(1 for _ in ijson.items(f, 'behavior.apistats'))
            f.seek(0)
            row['AJ-ProcessCount'] = sum(1 for _ in ijson.items(f, 'behavior.processes.item'))
            f.seek(0)
            row['AK-SummaryCount'] = sum(1 for _ in ijson.items(f, 'behavior.summary'))

            f.seek(0)
            # Merge the per-PID API call counts under behavior.apistats into
            # row; Counter addition sums values for keys present in both dicts.
            apistats_element = ijson.items(f, 'behavior.apistats')
            for inner_apistats in apistats_element:
                for index, inner_fields in inner_apistats.items():
                    row = dict(Counter(row) + Counter(inner_fields))
            
            row['AA-Filename'] = os.path.basename(filename)
            i += 1
            print(f"processed file {i}", end='\r')
            
    except Exception as e:
        #pass
        print(f"Filename {filename} has issue with {e}")
        row = {}
    
    if row:        
        final_result.append(row)

Output (row):

{
 'added': 1638217153.782366,
 'started': 1638261651.130148,
 'duration': 15,
 'ended': 1638261666.212257,
 'AF-DomainCount': 2,
 'AG-SignatureCount': 1,
 'AH-GenericCount': 1,
 'AI-ApistatCount': 1,
 'AK-SummaryCount': 1,
 'NtQueryValueKey': 2,
 'LdrUnloadDll': 1,
 'NtCreateSection': 1,
 'LoadStringW': 2,
 'CreateActCtxW': 4,
 'NtOpenKey': 2,
 'NtUnmapViewOfSection': 4,
 'MessageBoxTimeoutW': 1,
 'SetUnhandledExceptionFilter': 1,
 'SetErrorMode': 1,
 'NtCreateFile': 1,
 'NtClose': 17,
 'GetSystemTimeAsFileTime': 1,
 'LdrLoadDll': 1,
 'NtTerminateProcess': 3,
 'GetFileAttributesW': 2,
 'NtMapViewOfSection': 1,
 'AA-Filename': 'test.json'
}

Not sure if the reason is the one described in Using python ijson to read a large json file with multiple json objects, i.e. that ijson is not able to work on multiple JSON elements at once.

Also, please let me know of any other Python package, or any sample example, that can handle large JSON files without memory issues.

Edit: If I reuse parse_events without f.seek(0), only row['AF-DomainCount'] gets the correct value; the other counts all come back as 0:

row['AF-DomainCount'] = sum(1 for _ in ijson.items(parse_events, 'network.domains.item'))
row['AG-SignatureCount'] = sum(1 for _ in ijson.items(parse_events, 'signatures.item'))
row['AH-GenericCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.generic.item'))
row['AI-ApistatCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.apistats'))
row['AJ-ProcessCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.processes.item'))
row['AK-SummaryCount'] = sum(1 for _ in ijson.items(parse_events, 'behavior.summary'))

Note: This is not an assignment; it is a real-life issue that I am facing. Basically, I need some sort of solution that avoids calling f.seek(0) multiple times and makes my script faster with ijson.

AlwaysSunny

1 Answer


I've been having my own issues with ijson and large files. I think you are on the right track here, but if you use the ijson.items() method, it is going to read through the whole file just for those items and then stop, hence your need for the f.seek(0) calls (I think you know this, but just for clarity). I believe the only way around this is to use the ijson.parse() generator as you have, but put all of your counting/summing logic in that one iterator block.

So, you might be able to do something like this (simplified to the first two counts):

with open(filename, 'r', encoding='utf8', errors='ignore') as f:
    row = {}
    domain_count = 0
    signature_count = 0
    parse_events = ijson.parse(f, use_float=True)
    for prefix, event, value in parse_events:
        if prefix == 'info.added':
            row['added'] = value
        elif prefix == 'info.started':
            row['started'] = value
        elif prefix == 'info.duration':
            row['duration'] = value
        elif prefix == 'info.ended':
            row['ended'] = value
        elif prefix == 'network.domains.item' and event == 'end_map':
            # count each completed domain object once; matching the prefix
            # alone would also fire on its start_map and map_key events
            domain_count += 1
        elif prefix == 'signatures.item' and event == 'end_map':
            signature_count += 1

This should allow you to perform those counts in a single pass of the file, instead of having to go back to the start for each one. However, it can become challenging if your parsing logic is more complicated than just simple counts.
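
For illustration, here is a self-contained sketch of how the rest of your fields could fold into the same loop. The process_file helper is my own naming, the prefixes come straight from your question, and the apistats handling is my assumption of logic equivalent to your Counter merge:

import os
from collections import Counter

import ijson

def process_file(filename):
    row = {}
    counts = Counter()
    with open(filename, 'r', encoding='utf8', errors='ignore') as f:
        for prefix, event, value in ijson.parse(f, use_float=True):
            if prefix in ('info.added', 'info.started',
                          'info.duration', 'info.ended'):
                row[prefix.split('.', 1)[1]] = value
            # end_map fires exactly once per completed object in an array
            elif event == 'end_map' and prefix == 'network.domains.item':
                counts['AF-DomainCount'] += 1
            elif event == 'end_map' and prefix == 'signatures.item':
                counts['AG-SignatureCount'] += 1
            elif event == 'end_map' and prefix == 'behavior.generic.item':
                counts['AH-GenericCount'] += 1
            elif event == 'end_map' and prefix == 'behavior.processes.item':
                counts['AJ-ProcessCount'] += 1
            # apistats values arrive as number events with prefixes like
            # 'behavior.apistats.<pid>.<api_name>'; summing them per API
            # name reproduces your Counter merge across PIDs
            elif event == 'number' and prefix.startswith('behavior.apistats.'):
                counts[prefix.rsplit('.', 1)[1]] += value
    row.update(counts)
    row['AA-Filename'] = os.path.basename(filename)
    return row

I've deliberately left AI-ApistatCount and AK-SummaryCount out of this sketch; see the note below about the map-valued prefixes.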

Also, I think that your counts for the prefixes that don't end in .item will all return 1: when a prefix refers to a single object rather than an array of items, the generator just yields that one object, fully materialized (and it could possibly be a very large object).
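
If what you actually want for those map-valued prefixes is the number of keys they contain (e.g. one per PID under behavior.apistats), you can count ijson's map_key events at that exact prefix in the same pass. A minimal standalone illustration (the toy document here is made up so the snippet runs on its own):

import io

import ijson

# toy stand-in for one of your reports
data = b'{"behavior": {"apistats": {"2336": {"NtClose": 17}, "2480": {"NtOpenKey": 2}}, "summary": {"file_opened": ["KERNELBASE.dll.mui"]}}}'

apistat_count = summary_count = 0
for prefix, event, value in ijson.parse(io.BytesIO(data)):
    # map_key fires once per key directly under the given prefix
    if prefix == 'behavior.apistats' and event == 'map_key':
        apistat_count += 1
    elif prefix == 'behavior.summary' and event == 'map_key':
        summary_count += 1

print(apistat_count, summary_count)  # 2 1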

Derek Kaknes