I am writing a python script that essentially does the following
- Reads a CSV file as a dataframe object.
- Selects some columns based on names and stores them in a new DF object.
- Does some math and string manipulation on the values in cells. I use the for loop and the
iterrows()
method here. - Writes the modified DF to a CSV
- Writes the CSV to json using a for loop.
This code takes forever to run. I am trying to understand why this is taking so long, and if I should do my tasks differently to speed up the execution.
import pandas
import json
import pendulum
import csv
import os
import time
start_time = time.time()
print("--- %s seconds ---" % (time.time() - start_time))
os.chdir('/home/csv_files_from_REC')
df11 = pandas.read_csv('RTP_Gap_2018-01-21.csv') ### Reads the CSV FILE
print df11.shape ### Prints the shape of the DF
### Filter the initial DF by selecting some columns based on NAME
df1 = df11[['ENODEB','DAY','HR','SITE','RTP_Gap_Length_Total_sec','RTP_Session_Duration_Total_sec','RTP_Gap_Duration_Ratio_Avg%']]
print df1.shape ## Prints Shape
#### Math and String manupulation stuff ###
for index, row in df1.iterrows():
if row['DAY'] == 'Total':
df1.drop(index, inplace=True)
else:
stamp = row['DAY'] + ' ' + str(row['HR']) + ':00:00'
sitename = str(row['ENODEB'])+'_'+row['SITE']
if row['RTP_Session_Duration_Total_sec'] == 0:
rtp_gap = 0
else:
rtp_gap = row['RTP_Gap_Length_Total_sec']/row['RTP_Session_Duration_Total_sec']
time1 = pendulum.parse(stamp,tz='America/Chicago').isoformat()
df1.loc[index,'DAY'] = time1
df1.loc[index,'SITE'] = sitename
df1.loc[index,'HR'] = rtp_gap
### Write DF to CSV ###
df1.to_csv('RTP_json.csv',index=None)
json_file_ind = 'RTP_json.json'
file = open(json_file_ind, 'w')
file.write("")
file.close()
#### Write CSV to JSON ###
with open('RTP_json.csv', 'r') as csvfile:
reader_ind = csv.DictReader(csvfile)
row=[]
for row in reader_ind:
row["RTP_Gap_Length_Total_sec"] = float(row["RTP_Gap_Length_Total_sec"])
row["RTP_Session_Duration_Total_sec"] = float(row["RTP_Session_Duration_Total_sec"])
row["RTP_Gap_Duration_Ratio_Avg%"]=float(row["RTP_Gap_Duration_Ratio_Avg%"])
row["HR"] = float(row["HR"])
with open('RTP_json.json', 'a') as json_file_ind:
json.dump(row, json_file_ind)
json_file_ind.write('\n')
end_time = time.time()
print("--- %s seconds ---" % (time.time() - end_time))
Output
--- 2018-01-23T12:25:07.411691-06:00 seconds ---### START TIME
(2055, 36) ### SIZE of initial DF
(2055, 7) ### Size of Filtered DF
--- 2018-01-23T12:31:54.480568-06:00 seconds --- --- ### END TIME