This is a script that calculates per-sample depth histograms, and I find that the csv module takes most of the time. How can I run it in parallel? The input file samtools.depth.gz is 14 GB and contains about 3 billion lines.
from collections import Counter

SamplesList = ('Sample_A', 'Sample_B', 'Sample_C', 'Sample_D')

cDepthCnt = {key: Counter() for key in SamplesList}   # per-sample depth histogram
cDepthStat = {key: [0, 0] for key in SamplesList}     # sum(x) and sum(x^2)

def inStat(inDepthFile):
    import gzip
    import csv
    RecordCnt = 0
    MaxDepth = 0
    with gzip.open(inDepthFile, 'rt') as tsvfin:
        tsvin = csv.DictReader(tsvfin, delimiter='\t',
                               fieldnames=('ChrID', 'Pos') + SamplesList)
        for row in tsvin:
            RecordCnt += 1
            for k in SamplesList:
                theValue = int(row[k])
                if theValue > MaxDepth:
                    MaxDepth = theValue
                cDepthCnt[k][theValue] += 1
                cDepthStat[k][0] += theValue
                cDepthStat[k][1] += theValue * theValue
    return RecordCnt, MaxDepth

RecordCnt, MaxDepth = inStat('samtools.depth.gz')
print('xxx')
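(I keep both running sums so that mean and standard deviation can be derived afterwards; here is a minimal sketch of that post-processing, where depthMeanStd is a helper name I made up:)

import math

def depthMeanStd(sumX, sumX2, n):
    # population mean and std from the running sums: var = E[x^2] - E[x]^2
    mean = sumX / n
    var = sumX2 / n - mean * mean
    return mean, math.sqrt(max(var, 0.0))

# e.g. after inStat() has run:
# mean, std = depthMeanStd(cDepthStat['Sample_A'][0], cDepthStat['Sample_A'][1], RecordCnt)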
There are ways to read a huge file in chunks and distribute them as lists of lines, like https://stackoverflow.com/a/30294434/159695:
bufsize = 65536
with open(path) as infile:
    while True:
        lines = infile.readlines(bufsize)
        if not lines:
            break
        for line in lines:
            process(line)
However, I first thought csv.DictReader only accepts file handles.
There is a way to split the input into temporary files, shown at https://gist.github.com/jbylund/c37402573a896e5b5fc8, and I wondered whether I could use a fifo to do the splitting on the fly.
Then I found that csv.DictReader accepts any object which supports the iterator protocol and returns a string each time its next() method is called; file objects and list objects are both suitable, as the quick check below shows. I have modified inStat() to accept a list of lines. Would you please help me complete statPool()?
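Quick sanity check that DictReader really parses a plain list of lines (the two rows are fabricated):

import csv

SamplesList = ('Sample_A', 'Sample_B', 'Sample_C', 'Sample_D')
lines = ['chr1\t100\t3\t5\t0\t7\n', 'chr1\t101\t4\t5\t1\t8\n']
tsvin = csv.DictReader(lines, delimiter='\t',
                       fieldnames=('ChrID', 'Pos') + SamplesList)
for row in tsvin:
    print(row['ChrID'], row['Pos'], row['Sample_A'])   # chr1 100 3, then chr1 101 4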
ChunkSize = 1024 * 1024   # size hint in bytes for readlines(); the value is a guess

def statPool(inDepthFile):
    import gzip
    from multiprocessing import Pool
    RecordCnt = 0
    MaxDepth = 0
    cDepthCnt = {key: Counter() for key in SamplesList}
    cDepthStat = {key: [0, 0, 0, 0, 0] for key in SamplesList}  # sum(x), sum(x^2); other slots unused so far
    with gzip.open(inDepthFile, 'rt') as tsvfin, Pool(processes=4) as pool:
        while True:
            lines = tsvfin.readlines(ChunkSize)
            if not lines:
                break
            res = pool.apply_async(inStat, [lines])
            # calling .get() right away blocks until this chunk is finished,
            # so only one worker is ever busy -- this is where I am stuck
            iRecordCnt, iMaxDepth, icDepthCnt, icDepthStat = res.get()
            RecordCnt += iRecordCnt
            if iMaxDepth > MaxDepth:
                MaxDepth = iMaxDepth
            for k in SamplesList:
                cDepthCnt[k].update(icDepthCnt[k])
                cDepthStat[k][0] += icDepthStat[k][0]
                cDepthStat[k][1] += icDepthStat[k][1]
    return RecordCnt, MaxDepth, cDepthCnt, cDepthStat
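Here is roughly what I imagine the completed version could look like; this is an untested sketch (statPool2, nWorkers, and window are names and values I made up), which keeps several chunks in flight so the pool is never idle and merges results as they come back:

def statPool2(inDepthFile, nWorkers=4, window=8):
    import gzip
    from multiprocessing import Pool
    RecordCnt = 0
    MaxDepth = 0
    cDepthCnt = {key: Counter() for key in SamplesList}
    cDepthStat = {key: [0, 0] for key in SamplesList}
    pending = []

    def merge(res):
        nonlocal RecordCnt, MaxDepth
        iRecordCnt, iMaxDepth, icDepthCnt, icDepthStat = res.get()
        RecordCnt += iRecordCnt
        MaxDepth = max(MaxDepth, iMaxDepth)
        for k in SamplesList:
            cDepthCnt[k].update(icDepthCnt[k])
            cDepthStat[k][0] += icDepthStat[k][0]
            cDepthStat[k][1] += icDepthStat[k][1]

    with gzip.open(inDepthFile, 'rt') as tsvfin, Pool(processes=nWorkers) as pool:
        while True:
            lines = tsvfin.readlines(ChunkSize)
            if not lines:
                break
            pending.append(pool.apply_async(inStat, [lines]))
            if len(pending) >= window:   # bound memory: drain the oldest chunk first
                merge(pending.pop(0))
        for res in pending:              # drain whatever is still in flight
            merge(res)
    return RecordCnt, MaxDepth, cDepthCnt, cDepthStat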
I also think asyncio.Queue might be a good way to pipe chunks to multiple csv.DictReader workers.
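Since the parsing is CPU-bound, I suppose the same producer/consumer idea actually needs multiprocessing rather than asyncio; below is an untested sketch with multiprocessing.Queue (worker, queuedStat, and all parameter values are my own invention), reduced to building only the histograms to keep it short:

from collections import Counter
from multiprocessing import Process, Queue

def worker(q, out):
    # each worker runs its own csv.DictReader over the chunks it receives
    import csv
    cnt = {key: Counter() for key in SamplesList}
    while True:
        lines = q.get()
        if lines is None:            # sentinel: no more chunks
            break
        tsvin = csv.DictReader(lines, delimiter='\t',
                               fieldnames=('ChrID', 'Pos') + SamplesList)
        for row in tsvin:
            for k in SamplesList:
                cnt[k][int(row[k])] += 1
    out.put(cnt)

def queuedStat(inDepthFile, nWorkers=4):
    import gzip
    q, out = Queue(maxsize=nWorkers * 2), Queue()
    procs = [Process(target=worker, args=(q, out)) for _ in range(nWorkers)]
    for p in procs:
        p.start()
    with gzip.open(inDepthFile, 'rt') as tsvfin:
        while True:
            lines = tsvfin.readlines(ChunkSize)
            if not lines:
                break
            q.put(lines)
    for _ in procs:
        q.put(None)                  # one sentinel per worker
    total = {key: Counter() for key in SamplesList}
    for _ in procs:
        part = out.get()
        for k in SamplesList:
            total[k].update(part[k])
    for p in procs:
        p.join()
    return total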