from collections import Counter
# Path of the text file to analyse (named INPUT_PATH rather than `input`
# so the builtin `input()` is not shadowed).
INPUT_PATH = 'file.txt'
# Words shorter than this many characters are ignored.
MIN_WORD_LENGTH = 6
# How many of the most frequent words to print.
N = 50
# Deletes commas, apostrophes and full stops in a single pass.
_STRIP_PUNCTUATION = str.maketrans('', '', ",'.")


def count_long_words(lines, min_length=MIN_WORD_LENGTH):
    """Count how often each sufficiently long word occurs in *lines*.

    Each line has commas, apostrophes and full stops removed, is
    lower-cased, and is split on whitespace. Words with fewer than
    *min_length* characters are skipped.

    Args:
        lines: An iterable of strings, e.g. an open text file.
        min_length: Minimum number of characters a word must have to be
            counted.

    Returns:
        A ``Counter`` mapping each retained word to its number of
        occurrences.
    """
    counts = Counter()
    for line in lines:
        word_list = line.translate(_STRIP_PUNCTUATION).lower().split()
        counts.update(word for word in word_list if len(word) >= min_length)
    return counts


def main():
    """Print the N most frequent long words of INPUT_PATH, one per line."""
    # 'utf-8-sig' transparently drops a leading byte-order mark, if any.
    with open(INPUT_PATH, 'r', encoding='utf-8-sig') as fh:
        counts = count_long_words(fh)
    for word, frequency in counts.most_common(N):
        print("%s %d" % (word, frequency))


if __name__ == '__main__':
    main()
At the moment I am able to select the most frequent words that are at least X characters long.
The program should also scan the text and count multi-word phrases such as:
"climate finance" "market failure" "Paris 2015"
The minimum number of characters per individual word should still be enforced, to prevent results such as "I and".