Need help optimizing the code below.

The code works for a small data set but not a large one: with files under about 50 MB it runs fine, but with files over about 200 MB it fails with "Unable to allocate array with shape ()".

The chunking code comes from this Stack Overflow question: "Split pandas dataframe in two if it has more than 10 rows".
Here is the original txt file:
id, no, julianDate
123,6683969930867371,24118
123,9.22465322574855E+015,21232
123,9.88635756801637E+015,19219
123,3422437279060279,18110
123,3340444112843298,20124
123,4059401965256193,20131
123,7419046088033512,22331
Here is the code. It pads the id to length 3 if it has some other length, converts the Julian date from YYDDD format to YYMM, and jumbles the whole df (a vectorized alternative for the date conversion is sketched at the end of the post):
import pandas as pd
from os import getcwd, listdir

def single_file(folder):
    for f in listdir(getcwd()):
        if f.endswith('.txt'):
            return f

def convert_file_to_df(file):
    colnames = ['Id', 'no', 'julianDate']
    df = pd.read_csv(file, names=colnames, header=None,
                     converters={'Id': '{:0>3}'.format})
    return df

def convert_no_to_int(df):
    df['no'] = df['no'].astype('int')
    return df

def convert_julian_to_yymm(df, ex_year):
    df['julianDate'] = df['julianDate'].astype('str')
    df['year'] = df['julianDate'].str[:2]
    df['year'] = df['year'].astype(int)
    # drop rows older than the cutoff year
    df = df.drop(df[df['year'] < ex_year].index)
    df['year'] = df['year'].astype(str)
    df['day'] = df['julianDate'].str[2:]
    df['month'] = df['day'].astype(int)
    df['month'] = df['month'] // 30
    df['month'] = df['month'].astype(str).str.zfill(2)
    df['yymm'] = df['year'] + df['month']
    df['yymm'] = df['yymm'].astype(int)
    return df

def drop_columns(df):
    # assumed definition: keep only Id, no and yymm, as in the
    # output sample below (not defined in the original snippet)
    return df.drop(columns=['julianDate', 'year', 'day', 'month'])

def split_dataframe_to_chunks(df, n):
    df_len = len(df)
    count = 0
    dfs = []
    while True:
        if count > df_len - 1:
            break
        start = count
        count += n
        dfs.append(df.iloc[start:count])
    return dfs

final_df = pd.DataFrame()
for f in listdir(getcwd()):
    if f.endswith('.txt'):
        df = convert_file_to_df(f)
        convert_no_to_int(df)
        dfs = split_dataframe_to_chunks(df, 100)
        for df in dfs:
            df_data = convert_julian_to_yymm(df, 20)
            small_chunk_df = drop_columns(df_data)
            final_df = pd.concat([final_df, small_chunk_df])
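For reference, here is a minimal sketch of a memory-friendlier variant of the same pipeline: let read_csv do the chunking via its chunksize parameter and append each processed chunk straight to disk, so neither the full input DataFrame nor final_df ever has to fit in memory at once. It reuses the helper functions above; the chunk size of 100_000 rows and the output name out.csv are placeholder assumptions:

import pandas as pd

# Sketch only: stream the file through pandas in fixed-size chunks.
# chunksize and 'out.csv' are placeholder assumptions.
def process_in_chunks(path, ex_year=20, chunksize=100_000):
    colnames = ['Id', 'no', 'julianDate']
    reader = pd.read_csv(path, names=colnames, header=None,
                         converters={'Id': '{:0>3}'.format},
                         chunksize=chunksize)
    first = True
    for chunk in reader:
        convert_no_to_int(chunk)
        chunk = convert_julian_to_yymm(chunk, ex_year)
        chunk = drop_columns(chunk)
        # append to disk instead of growing final_df in memory
        chunk.to_csv('out.csv', mode='w' if first else 'a',
                     header=first, index=False)
        first = False

Writing each chunk out also sidesteps the repeated pd.concat([final_df, ...]) in the loop above, which copies every previously accumulated row on each iteration.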
Here is the data after processing:
id, no, yymm
123,7623696460219421,2007
123,5638234312228287,2304
123,7285561860395750,2010
123,9232591144713690,2109
123,3214483466634390,2311
123,7443958909235846,2308
123,2479507924021739,2306
123,1477320189903881,2108
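Side note: the YYDDD-to-YYMM conversion itself can be done in one vectorized call instead of the string slicing above. A minimal sketch, assuming every value is a five-digit YYDDD number; note that %j parses a true day-of-year, so the resulting month can differ from the day // 30 approximation:

import pandas as pd

s = pd.Series([24118, 21232, 19219]).astype(str)
# '%y%j' = two-digit year followed by three-digit day of year
yymm = pd.to_datetime(s, format='%y%j').dt.strftime('%y%m').astype(int)
print(yymm.tolist())  # [2404, 2108, 1908]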