Your current approach is quite inefficient for this; you should use numpy to improve performance.
import numpy as np
import matplotlib.pyplot as plt
# Read only column index 4 (the 5th column) straight into a float array;
# skipping the other columns removes the memory bottleneck.
closing_prices = np.loadtxt(
    'SPY.csv', dtype=float, delimiter=',', skiprows=1, usecols=4
)
# Histogram the prices into 50 bins and display the figure.
counts, bin_edges, bars = plt.hist(closing_prices, 50)
plt.show()
I didn't test it, but it should work.
I also recommend the Intel-optimized Python distribution; it is better suited to managing this kind of numeric workload. Intel Python distribution
Adding code for testing, since some commenters are disputing this without supporting evidence: pandas uses DataFrames, which are dictionary-like structures rather than plain numpy arrays, and reading directly into a numpy array is almost twice as fast here.
import numpy as np
import pandas as pd
import random
import csv
import matplotlib.pyplot as plt
import time
# --- Create a random 4871-row, 6-column CSV, simulating the problem. ---
rows = 4871
columns = 6
fields = ['one', 'two', 'three', 'four', 'five', 'six']
# Context manager guarantees the file is flushed and closed before the
# benchmark reads it back; newline='' is the documented way to open csv
# files for writing.
with open("random.csv", "w", newline='') as csv_file:
    write_a_csv = csv.DictWriter(csv_file, fieldnames=fields)
    # BUG FIX: the header row was never written. Both readers below
    # assume a header (pd.read_csv default, np.loadtxt skiprows=1), so
    # without it each path silently discarded the first data row.
    write_a_csv.writeheader()
    for _ in range(rows):
        write_a_csv.writerow({name: random.random() for name in fields})

# --- Time the pandas route. --------------------------------------------
# BUG FIX: time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the recommended high-resolution benchmark timer.
start_old = time.perf_counter()
spy = pd.read_csv('random.csv')
print(type(spy))
# Take the last column (index 5) as the series to histogram.
stock_price_spy = spy.values[:, 5]
n, bins, patches = plt.hist(stock_price_spy, 50)
plt.show()
end_old = time.perf_counter()
total_time_old = end_old - start_old
print(total_time_old)

# --- Time the numpy route. ---------------------------------------------
start_new = time.perf_counter()
# BUG FIX: the original used usecols=4 here while the pandas path used
# column index 5, so the two timings processed different data. Load the
# same column (index 5) for a fair comparison; only that one column is
# read, which cuts the memory bottleneck.
stock_price_spy_new = np.loadtxt('random.csv', dtype=float,
                                 delimiter=',', skiprows=1, usecols=5)
print(type(stock_price_spy_new))
n, bins, patches = plt.hist(stock_price_spy_new, 50)
plt.show()
end_new = time.perf_counter()
total_time_new = end_new - start_new
print(total_time_new)