I am looking for general guidance on how to quickly process a large file (millions of rows), as well as advice on how to improve the code below. I assume the last part, where the output is written to the file row by row, is not optimal, but I am not sure how to approach it, i.e. what the fastest way would be and the mechanisms behind it.
Any help is greatly appreciated.
import csv
import json

import pandas as pd

filename = 'filename.csv'


def getstuff():
    # Stream the 9th pipe-delimited column of each row, one row at a time.
    with open(filename, "rt") as csvfile:
        datareader = csv.reader(csvfile, delimiter="|")
        for row in datareader:
            yield row[8]


def get_json():
    # Parse each candidate field as JSON; log anything that fails to parse.
    with open('bad_records.txt', 'w') as f:  # open the bad records file
        for i in getstuff():
            if '"metrics"' in i:
                try:
                    yield json.loads(i)
                except json.JSONDecodeError:
                    f.write(f"{i}\n")


def make_dataframe():
    # Flatten the nested "metrics" object of each record into a one-row DataFrame.
    for i in get_json():
        yield pd.json_normalize(i["metrics"])


with open('name.txt', mode="a") as f:
    for df in make_dataframe():
        df.to_csv(f, index=False, header=False, sep='\t', escapechar='\\')
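
For what it's worth, I was also considering batching the writes instead of calling to_csv once per record, along the lines of the sketch below. The batch size of 10,000 is arbitrary, the write_in_batches name is just for illustration, and it assumes every record normalizes to the same set of columns. I am not sure whether this is the right direction:

from itertools import islice

def write_in_batches(batch_size=10_000):
    # Accumulate normalized frames and write each batch with a single to_csv call
    # instead of one call per record.
    gen = make_dataframe()
    with open('name.txt', mode="a") as out:
        while True:
            batch = list(islice(gen, batch_size))  # pull up to batch_size frames
            if not batch:
                break
            pd.concat(batch, ignore_index=True).to_csv(
                out, index=False, header=False, sep='\t', escapechar='\\'
            )

write_in_batches()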