I would like to use PyMongo’s bulk write operation features, which execute write operations in batches in order to reduce the number of network round trips and increase write throughput.
I also found here that it is possible to use 5000 as a batch size.
However, I do not know whether that is the best batch size, or how to combine PyMongo’s bulk write operation features with generators in the following code:
from pymongo import MongoClient
from itertools import groupby
import csv


def iter_something(rows):
    key_names = ['type', 'name', 'sub_name', 'pos', 's_type', 'x_type']
    chr_key_names = ['letter', 'no']
    # consecutive rows sharing the first six columns are merged into one document
    for keys, group in groupby(rows, lambda row: row[:6]):
        result = dict(zip(key_names, keys))
        result['chr'] = [dict(zip(chr_key_names, row[6:])) for row in group]
        yield result


def main():
    converters = [str, str, str, int, int, int, str, int]
    with open("/home/mic/tmp/test.txt") as c:
        reader = csv.reader(c, skipinitialspace=True)
        # apply the matching converter to each column of each row
        converted = ([conv(col) for conv, col in zip(converters, row)] for row in reader)
        for object_ in iter_something(converted):
            print(object_)


if __name__ == '__main__':
    db = MongoClient().test
    sDB = db.snps
    main()
test.txt file:
Test, A, B01, 828288, 1, 7, C, 5
Test, A, B01, 828288, 1, 7, T, 6
Test, A, B01, 171878, 3, 7, C, 5
Test, A, B01, 171878, 3, 7, T, 6
Test, A, B01, 871963, 3, 9, A, 5
Test, A, B01, 871963, 3, 9, G, 6
Test, A, B01, 1932523, 1, 10, T, 4
Test, A, B01, 1932523, 1, 10, A, 5
Test, A, B01, 1932523, 1, 10, X, 6
Test, A, B01, 667214, 1, 14, T, 4
Test, A, B01, 667214, 1, 14, G, 5
Test, A, B01, 67214, 1, 14, G, 6
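
Something like the following is what I have in mind: pull documents from the generator in fixed-size batches with itertools.islice and write each batch in one bulk operation. This is an untested sketch, not a working solution: it assumes a PyMongo version that provides insert_many, the name insert_in_batches and its batch_size parameter are just ones I made up, and 5000 is only the batch size mentioned above, not a measured optimum.

from itertools import islice


def insert_in_batches(collection, documents, batch_size=5000):
    # documents can be any iterable, including a generator such as
    # iter_something(converted); islice takes at most batch_size items
    # at a time without exhausting the rest of the stream
    docs = iter(documents)
    while True:
        batch = list(islice(docs, batch_size))
        if not batch:
            break
        # one insert_many call per batch, so roughly one network
        # round trip per batch_size documents
        collection.insert_many(batch)

# e.g. instead of printing inside main():
# insert_in_batches(sDB, iter_something(converted), batch_size=5000)

Is this the right way to combine the bulk write features with a generator, and is there a rule of thumb for choosing batch_size?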