I need to write code that reads all the text files in the subfolders of a directory and counts the frequency of certain words in each text file. Below is my code:
import os
from os import walk
from os.path import join
from fnmatch import filter
import pandas as pd

def process_files(dir):
    # Collect the paths of all .txt files in the directory and its subfolders
    filenames = []
    for root, _, files in walk(dir):
        filenames.extend([join(root, file) for file in filter(files, '*.txt')])
    df = pd.DataFrame()
    for filename in filenames:
        virtual_currency, bitcoin, blockchain, cryptocurrency, digital_currency, litecoin, dogecoin, etherrum = 0, 0, 0, 0, 0, 0, 0, 0
        with open(filename, 'r') as f:
            contents = f.read().lower()
        # Process the content
        cryptocurrency = contents.count("cryptocurrenc")
        virtual_currency = contents.count("virtual currenc")
        digital_currency = contents.count("digital currenc")
        bitcoin = contents.count("bitcoin")
        blockchain = contents.count("blockchain")
        litecoin = contents.count("litecoin")
        dogecoin = contents.count("dogecoin")
        etherrum = contents.count("etherrum")
        # Create data row
        data = {'File Name': filename,
                'Virtual Currency': virtual_currency,
                'Bitcoin': bitcoin,
                'cryptocurrency': cryptocurrency,
                'digital currency': digital_currency,
                'litecoin': litecoin,
                'dogecoin': dogecoin,
                'etherrum': etherrum}
        # df.append(data, ignore_index=True)
        a = pd.DataFrame(data, index=[0])
        df = df.append(a, ignore_index=True, sort=False)
        # Save data to CSV
        df.to_csv(os.path.join(dir, filename), index=False)
    return df
result = process_files(r'C:\test\QTR2')
The issue is that the generated results are identical for every document. My guess is that the last result overwrites the old values.
Could someone please help me check which part went wrong? I really appreciate any help you can provide.
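For reference, a minimal standalone sketch like the one below (it only counts "bitcoin" and prints one line per file, with no DataFrame involved) should show whether the raw counts already come out identical before any appending or saving happens; the path is the same test folder as above.

import os
from fnmatch import filter

# Quick sanity check: print the raw "bitcoin" count for every .txt file,
# without building or saving any DataFrame.
for root, _, files in os.walk(r'C:\test\QTR2'):
    for name in filter(files, '*.txt'):
        path = os.path.join(root, name)
        with open(path, 'r') as f:
            text = f.read().lower()
        print(path, text.count("bitcoin"))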