from glob import glob
pattern = "D:\\report\\shakeall\\*.txt"
filelist = glob(pattern)
def countwords(fp):
with open(fp) as fh:
return len(fh.read().split())
print "There are" ,sum(map(countwords, filelist)), "words in the files. " "From directory",pattern
import os
import re
import string
uniquewords = set([])
for root, dirs, files in os.walk("D:\\report\\shakeall"):
for name in files:
[uniquewords.add(x) for x in open(os.path.join(root,name)).read().split()]
wordlist = list(uniquewords)
This code counts the total number of unique and total words. However, the problem is, if I write len(uniquewords) , it shows unreasonable number because it recognizes for example, 'shake' 'shake!' 'shake,' and 'shake?' as different unique words. I've tried to remove punctuations from uniquewords by making the list and modifying it, everything failed. Can anybody help me?