I am running a script (in multiprocessing mode) that extracts some parameters from a bunch of JSON files, but currently it is very slow. Here is the script:
from __future__ import print_function, division
import os
from glob import glob
from os import getpid
from time import time
from sys import stdout
import resource
from multiprocessing import Pool
import subprocess
try:
    import simplejson as json
except ImportError:
    import json
path = '/data/data//*.A.1'
print("Running with PID: %d" % getpid())
def process_file(file):
    start = time()
    filename = file.split('/')[-1]
    print(file)
    with open('/data/data/A.1/%s_DI' % filename, 'w') as w:
        with open(file, 'r') as f:
            for n, line in enumerate(f):
                d = json.loads(line)
                try:
                    domain = d['rrname']
                    ips = d['rdata']
                    for i in ips:
                        print("%s|%s" % (i, domain), file=w)
                except:
                    print(d)
                    pass
if __name__ == "__main__":
    files_list = glob(path)
    cores = 12
    print("Using %d cores" % cores)
    pp = Pool(processes=cores)
    pp.imap_unordered(process_file, files_list)
    pp.close()
    pp.join()
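One thing I noticed while re-reading the code is that I never consume the iterator returned by imap_unordered, so any exception raised in a worker never surfaces in the main process, and I am not passing a chunksize. Here is a minimal sketch of how I could change the main block (same imports and process_file as above; the chunksize value is just a guess):

    if __name__ == "__main__":
        files_list = glob(path)
        cores = 12
        print("Using %d cores" % cores)
        pp = Pool(processes=cores)
        # Consume the iterator so worker exceptions are re-raised here;
        # chunksize batches tasks and reduces scheduling overhead
        # when there are many input files.
        for _ in pp.imap_unordered(process_file, files_list, chunksize=4):
            pass
        pp.close()
        pp.join()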
Does anybody know how to speed this up?
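For what it's worth, I am not yet sure whether json.loads or the per-line writes dominate, so I was planning to profile a single file outside the pool, roughly like this (process_file and files_list are the ones from the script above):

    import cProfile

    # Profile one input file outside the pool to see whether JSON parsing
    # or the output writes take most of the time.
    cProfile.run("process_file(files_list[0])", sort="cumtime")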