I want to read a large .txt file (c.2.5GB) in chunks and then perform some operations before loading into the database.
The file only has 2 columns (the column delimiter is ¬) and values are qualified with double quotes.
. The values in the second column could span across multiple lines (sample below). I thought of using this answer but the issue would be that it might process incomplete lines as it depends on preset chunk size
. Can someone please help? I've included the sample data and code below.
Sample Data (Sample_load_file.txt)
"LINE_ID"¬"LINE_TEXT"
"C1111-G00-BC222"¬"this line is
split into
multiple lines
% All needs to be read into 1 line
% Currently that's not happening
"
"C22-f0-333"¬"2nd row. This line is
split into
multiple lines
% All needs to be read into 1 line
% Currently that's not happening
*******************************************************************
This line also includes the column delimiter within text qualifier
*******************************************************************
# !¬!¬!¬|
"
Code
import os
from datetime import datetime

import pandas as pd

from dbconnection import DBConnection
path = r'C:\Sample_load_file.txt'
db = DBConnection(server ='XXXX', database='XXXX')
def read_in_chunks(file_object, chunk_size=1024):
#Lazy load to read a file piece by piece (avoiding moemory issues)
#Default chunk size: 1k.
while True:
data = file_object.read(chunk_size)
if not data:
break
yield data
def process_chunk(data=piece):
#Build a list of lines based on ' "\n" ' as custom separator
data = data.split('"\n"')
#Split each line based on ' "¬" ' as custom separator
data = [line.split('"¬"') for line in data]
#Cleanup remaining double quotes
data = [[e.replace('"', '') for e in line] for line in data]
#Check the number of columns
number_of_cols = len(str(data[0]).split('¬'))
number_of_cols
#Load data into a dataframe
df = pd.DataFrame(data)
#Reformat dataframe
df.columns = df.iloc[0] # Set first row as column index
df = df.iloc[1:].reset_index(drop=True) # Drop first line and reset index
#Split the first column into two
try:
df[['LINE_ID', 'LINE_TEXT']] = df['LINE_ID¬LINE_TEXT'].str.split('¬',expand=True)
except:
print('Error')
del df['LINE_ID¬LINE_TEXT']
#Add metadata
df['loaded_by'] = 'XXXX'
df['file_line_number'] = range(2,len(df)+2)
df['load_date'] = pd.datetime.now()
df['source_file'] = path
df['loading_script'] = r'Load_Extracts.ipynb'
#Load in SQL db
df.to_sql('SQL_table_name', db.engine, schema='dbo', index=False, if_exists='append')
#Load text file
with open(path) as f:
for piece in read_in_chunks(f):
process_data(piece)