I am writing a script to time how long it takes to insert data from a CSV file into MongoDB; a 60 MB file (around 650,000 lines) took ~9.5 seconds. I know that using threads may decrease the run time, but I am new to using threads and I would love to get some help.
My code:
def timeitImportContent():
    """Time one complete CSV -> MongoDB bulk import with timeit.

    The setup and statement strings are executed by timeit in a fresh
    namespace, which is why ``pymongo`` and ``csv`` are imported inside
    SETUP_CODE rather than at module level.  ``number=1`` measures a
    single full run (connect, read, insert_many, close).
    """
    import timeit  # timeit itself is used here, in *this* namespace

    SETUP_CODE = '''
import pymongo
import csv
'''
    TEST_CODE = '''
print("Attempting to connect to MongoDB.....")
client = pymongo.MongoClient('localhost', 27017)
collection = client['db']['myCollection']
print("Connection established.....")
print("Opening file at " + "path/to/my/file" + ".....")
csvFile = open("path/to/my/file", 'r')
print("Reading file.....")
data = csv.DictReader(csvFile)
print("Reading completed.....")
print("Inserting data into MongoDB")
collection.insert_many(data)
print("Successfully inserted data into MongoDB")
print("Attempting to close connection.....")
client.close()
print("Client disconnected")
print("Attempting to close CSV file.....")
csvFile.close();
print("CSV closed.....")
'''
    # One timed execution of the whole import pipeline.
    times = timeit.timeit(setup=SETUP_CODE, stmt=TEST_CODE, number=1)
    print("It took " + str(times) + " seconds to execute")


if __name__ == "__main__":
    timeitImportContent()
My attempt at processing the file as a stream:
def getSingleRow(filename):
    """Yield each row of the CSV file *filename* as a dict, streaming.

    The ``with`` block closes the file automatically once the generator
    is exhausted (or garbage-collected), so no explicit ``close()`` is
    needed.  ``newline=''`` is the opening mode the csv module
    documentation requires, so embedded newlines inside quoted fields
    are handled correctly.
    """
    with open(filename, 'r', newline='') as csv_file:
        yield from csv.DictReader(csv_file)
def getData(filename):
    """Stream rows from *filename* into MongoDB in batches.

    The original version issued one ``insert_one()`` per row, i.e. one
    network round trip per CSV line — that, not the file reading, is
    what makes the streaming version slow.  Buffering rows and flushing
    them with ``insert_many()`` keeps the memory profile of a stream
    while getting close to the bulk-load speed.
    """
    print("Attempting to connect to MongoDB.....")
    client = pymongo.MongoClient('localhost', 27017)
    collection = client['donorschoose']['MapData']
    print("Connection established.....")
    print("Inserting data into MongoDB")
    batch = []
    for row in getSingleRow(filename):
        batch.append(row)
        if len(batch) >= 1000:  # one round trip per 1000 rows
            collection.insert_many(batch)
            batch = []
    if batch:  # flush the final partial batch
        collection.insert_many(batch)
    print("Successfully inserted data into MongoDB")
    print("Attempting to close connection.....")
    client.close()
    print("Connection disconnected")


if __name__ == "__main__":
    getData("path/to/file/name")