I am trying to find the most frequently used words in tweets. I tokenized a text file and wrote the tokens into a JSON file, but when I call json.loads on each line it gives me an error: no JSON object could be decoded.
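Here is, I believe, a minimal reproduction of the same failure: str() of a Python list uses single quotes, and single-quoted strings are not valid JSON:

import json

line = str(['rain', 'rain', 'go', 'away'])  # "['rain', 'rain', 'go', 'away']" - same format my loop writes
json.loads(line)  # raises ValueError: single-quoted strings are not valid JSON

My code: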
import re
import json
from collections import Counter
from nltk import word_tokenize, bigrams

s_tweets.head()
print(s_tweets.iloc[:, 2])
tweets = s_tweets.iloc[:, 2]
# Step 2: remove special characters and punctuation
tlist = []
for t in tweets:
    t_new = re.sub('[^A-Za-z0-9]+', ' ', t)
    tlist.append(t_new)
    # print(t_new)
# print(tlist)

test = word_tokenize(tlist[1])
print(test)
fname = 'tokensALL.json'
ff = open(fname, 'a')
for i in range(0, 1751):
    ff.write(str(word_tokenize(tlist[i])) + "\n")
ff.close()
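For comparison, here is a sketch of the same loop using json.dumps, assuming the intent is one JSON array of tokens per line (json.dumps emits double-quoted strings that json.loads can parse back):

with open('tokensALL.json', 'w') as ff:  # 'w' instead of 'a', so re-runs don't append stale lines
    for t in tlist:
        ff.write(json.dumps(word_tokenize(t)) + "\n")  # valid JSON on each line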
###### find most frequent words
fname2 = 'tokensALL.json'
with open(fname2, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Create a list with all the terms
        # (preprocess() and the stop list are defined earlier in my script)
        terms_stop = [term for term in preprocess(tweet['text'])
                      if term not in stop]
        # Count hashtags only
        terms_hash = [term for term in preprocess(tweet['text'])
                      if term.startswith('#')]
        # Count terms only (no hashtags, no mentions)
        terms_only = [term for term in preprocess(tweet['text'])
                      if term not in stop and
                      not term.startswith(('#', '@'))]
        # mind the ((double brackets)):
        # startswith() takes a tuple (not a list)
        # when we pass several prefixes
        terms_single = set(terms_stop)
        terms_bigram = bigrams(terms_stop)
        # Update the counter
        count_all.update(terms_stop)
    # Print the first 5 most frequent words
    print(count_all.most_common(5))
That's my code. Each line of the JSON file looks like this: ['cries', 'for', 'help', 'like', 'tears', 'in', 'rain'] or ['rain', 'rain', 'go', 'away'], etc.
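One thing I noticed: each line of the file is a plain list of tokens, while the counting loop above reads tweet['text'], i.e. it assumes a JSON object with a 'text' key. Here is a sketch of the counting step adapted to the actual line format, assuming the file is written with json.dumps as sketched above:

count_all = Counter()
with open('tokensALL.json', 'r') as f:
    for line in f:
        tokens = json.loads(line)  # each line is a JSON array of tokens, not an object
        count_all.update(term for term in tokens if term not in stop)  # 'stop' is my stopword list
print(count_all.most_common(5))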
Could anybody help me solve the problem? Thank you!