I am in the middle of some textual analysis. Basically, I am trying to get the total word counts (based on a list of words) and the total phrase counts (based on a list of phrases) for each file in a certain folder. So far, I have the following. But I keep getting errors 'str' object has no attribute 'words'
. The code I have tried to write is a combination of several other codes, so I don't know which part is creating the issue. Any help would be appreciated.
import csv
import glob
import re
import string
import sys
import time
target_files = r'C:/Users/Mansoor/Documents/Files/*.*'
output_file = r'C:/Users/Mansoor/Documents/Parser.csv'
output_fields = ['file name,', 'file size,', 'words,', 'phrases,']
words = {'uncertainty', 'downturn', 'shock'}
phrases = {'economic downturn', 'political uncertainty'}
def main():
f_out = open(output_file, 'w')
wr = csv.writer(f_out, lineterminator='\n')
wr.writerow(output_fields)
file_list = glob.glob(target_files)
for file in file_list:
print(file)
with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
doc = f_in.read()
doc_len = len(doc)
doc = doc.lower()
output_data = get_data(doc)
output_data[0] = file
output_data[1] = doc_len
wr.writerow(output_data)
def get_data(doc):
vdictionary = {}
_odata = [0] * 4
tokens = re.findall('\w(?:[-\w]*\w)?', doc)
for token in tokens:
if token not in vdictionary:
vdictionary[token] = 1
if token.words: _odata[2] += 1
for w1, w2 in zip(phrases, phrases[1:]):
phrase = w1 + " " + w2
if phrase.phrases: _odata[3] += 1
return _odata
if __name__ == '__main__':
print('\n' + time.strftime('%c') + '\nUncertainty.py\n')
main()
print('\n' + time.strftime('%c') + '\nNormal termination.')