I currently have about 1,500 .bin.txt files that I am analysing with Excel Power Query; however, loading the data is very slow (15+ minutes), so I decided to write a short Python script to combine all the bin files first and then read them together in Power Query.
I have the following script, but concatenating all these files takes over 20 minutes, and the full dataset will be 10 or more times larger than these 1,500 files. Is there any way to speed this up?
import os
import tkinter as tk
from tkinter import filedialog

import pandas as pd

def combineBinFiles():
    root = tk.Tk()
    root.withdraw()
    # Get bin files folder
    folder_selected = filedialog.askdirectory()
    print(folder_selected)
    os.chdir(folder_selected)
    files = os.listdir(folder_selected)
    print(files)
    df = pd.DataFrame()
    temp_df = pd.DataFrame(columns=['Timestamp', 'Wind speed', 'Own consumption'])
    for file in files:
        if file.endswith('.bin.txt'):
            print("Reading file: " + file)
            # Parse file based on space delimiter
            temp_df = pd.read_csv(file, delimiter=" ", header=0)
            # Extract date from the first column name
            date = temp_df.columns[0]
            # Concatenate date to the beginning of each timestamp before adding it to the dataframe
            temp_df[date] = temp_df[date].apply(lambda x: date + ' ' + x)
            df = pd.concat([df, temp_df], axis=0, ignore_index=True)
    df.to_csv('combinedBinFile.csv', index=False)
    print(df)

combineBinFiles()
The files are formatted as Timestamp, Wind speed, and Consumption. Each file has the date as the header of the timestamp column, and the rest of that column holds the time of day (without the date), so in the code I concatenate the date to each time value before adding it to the overall dataframe.
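For illustration, a tiny self-contained sketch of that prepend step; the DataFrame is made up from the sample file shown below, and the vectorised form in the comment is an assumption about an equivalent way to write the same .apply call.

import pandas as pd

# Toy DataFrame mimicking one parsed file: the date is the column header,
# the values are times of day (taken from the sample file further down)
date = '14_07_2023'
temp_df = pd.DataFrame({date: ['17 50 00 006', '17 50 00 016', '17 50 00 026']})

# As in the script above: prepend the date to each time value
temp_df[date] = temp_df[date].apply(lambda x: date + ' ' + x)

# An equivalent vectorised form (string concatenation broadcasts over the Series):
# temp_df[date] = date + ' ' + temp_df[date]
print(temp_df)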
Edit: bin file example; each file looks like this, just with different data and dates.
14_07_2023 .WindSpeed .Power
17 50 00 006 10,53 0
17 50 00 016 10,53 0
17 50 00 026 10,53 0
17 50 00 036 10,53 0
17 50 00 046 10,53 0
17 50 00 056 10,53 0
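For reference, a minimal sketch of reading one such file with pandas; 'sample.bin.txt' is a placeholder filename, the tab delimiter is assumed from the threaded version below, and decimal=',' is assumed because the sample values use a comma as the decimal separator.

import pandas as pd

# Placeholder filename; delimiter assumed to be a tab as in the threaded version,
# decimal=',' because values like 10,53 use a comma decimal separator
temp_df = pd.read_csv('sample.bin.txt', delimiter='\t', header=0, decimal=',')

date = temp_df.columns[0]   # e.g. '14_07_2023'
print(date)
print(temp_df.dtypes)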
Modified code with multithreading and chunked writing:
import os
import queue
import threading
import tkinter as tk
from tkinter import filedialog

import pandas as pd

def worker(q, df_list):
    while not q.empty():
        file = q.get()
        if file.endswith('.bin.txt'):
            print("Reading file: " + file)
            temp_df = pd.read_csv(file, delimiter="\t", header=0, engine='python')
            date = temp_df.columns[0]
            temp_df[date] = temp_df[date].apply(lambda x: date + ' ' + x)
            # Delete the last column:
            temp_df = temp_df.iloc[:, :-1]
            temp_df.columns = ['Timestamp', 'Wind speed', 'Own consumption']
            df_list.append(temp_df)
        # Mark the item as processed even if it is not a .bin.txt file,
        # otherwise q.join() waits forever
        q.task_done()

def combineBinFilesThreaded():
    root = tk.Tk()
    root.withdraw()
    folder_selected = filedialog.askdirectory()
    print(folder_selected)
    os.chdir(folder_selected)
    files = os.listdir(folder_selected)
    print(files)
    df_list = []
    q = queue.Queue()
    # Put the files in the queue before starting the workers,
    # so the workers don't see an empty queue and exit immediately
    for file in files:
        q.put(file)
    # Create 4 worker threads
    for i in range(4):
        t = threading.Thread(target=worker, args=(q, df_list))
        t.daemon = True
        t.start()
    # Wait for all the tasks in the queue to be processed
    q.join()
    print("Done joining")
    # Combine all the dataframes
    df = pd.concat(df_list, axis=0, ignore_index=True)
    print("Done concatenating")
    chunksize = 100000
    # Remove any existing combinedBinFile.csv so that it doesn't append to itself
    if os.path.exists('combinedBinFile.csv'):
        os.remove('combinedBinFile.csv')
    for i in range(0, len(df), chunksize):
        print("Writing chunk: " + str(i))
        # Write the header only with the first chunk
        df.iloc[i:i+chunksize].to_csv('combinedBinFile.csv', index=False, mode='a',
                                      header=(i == 0))
    print("Done writing")

combineBinFilesThreaded()