I have put together the following code to create a word cloud of the terms used in tweets containing the term "happy".
import tweepy
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from time import sleep
import json
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from scipy.misc import imread  # note: scipy.misc.imread has been removed from newer SciPy releases
import time
import pandas as pd
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
searchTerm = 'happy'
MAX_TWEETS = 10
search_data = []
current_working_dir = os.path.dirname(os.path.realpath(__file__))
current_working_dir = "./"  # overrides the line above; the log file is written to the current directory
log_tweets = current_working_dir + str(time.time()) + '_searchtweets.txt'
with open(log_tweets, 'w') as outfile:
    for tweet in tweepy.Cursor(api.search, q=searchTerm).items(MAX_TWEETS):
        search_data.append(json.loads(json.dumps(tweet._json)))
        outfile.write(json.dumps(tweet._json))
        outfile.write("\n")
tweets = pd.DataFrame()
tweets['created_at'] = list(map(lambda tweet: time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')), search_data))
tweets['user'] = list(map(lambda tweet: tweet['user']['screen_name'], search_data))
tweets['text'] = list(map(lambda tweet: tweet['text'].encode('utf-8'), search_data))
tweets['lang'] = list(map(lambda tweet: tweet['lang'], search_data))
tweets['Location'] = list(map(lambda tweet: tweet['place']['country'] if tweet['place'] != None else None, search_data))
tweets['retweet_count'] = list(map(lambda tweet: tweet['retweet_count'], search_data))
tweets['favorite_count'] = list(map(lambda tweet: tweet['favorite_count'], search_data))
tweets['long'] = list(map(lambda tweet: tweet['coordinates']['coordinates'][0] if tweet['coordinates'] != None else 'NaN', search_data))
tweets['latt'] = list(map(lambda tweet: tweet['coordinates']['coordinates'][1] if tweet['coordinates'] != None else 'NaN', search_data))
words = " ".join(tweets['text'].values.astype(str))
no_urls_no_tags = " ".join([word for word in words.split()
                            if 'http' not in word
                            and not word.startswith('@')
                            and word != 'RT'
                            ])
search_mask = imread('images/twitter_mask.png', flatten=True)
wc = WordCloud(background_color="white",
               font_path="/Library/Fonts/Verdana.ttf",
               stopwords=STOPWORDS,
               width=1800,
               height=140,
               mask=search_mask)
wc.generate(no_urls_no_tags)
plt.imshow(wc)
plt.axis("off")
plt.savefig('search_term_wordcloud_print.png', dpi=300)
plt.show()
When I run it, I get the following error message:
words = " ".join(tweets['text'].values.astype(str))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 139: ordinal not in range(128)
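For context, I suspect the byte strings produced by .encode('utf-8') on the tweet text are being decoded back with Python's default ASCII codec during the astype(str)/join step. Here is a minimal sketch that reproduces the same kind of error under Python 2 (the sample string is an illustrative assumption, not one of my tweets):

# Minimal reproduction sketch (assuming Python 2): joining a unicode string with a
# UTF-8 encoded byte string forces an implicit ASCII decode of the bytes.
encoded = u"don\u2019t worry, be happy".encode('utf-8')  # the curly apostrophe encodes to bytes 0xe2 0x80 0x99
" ".join([u"happy days", encoded])
# raises: UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 3: ordinal not in range(128)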
Is there a way, or a change I can make to the code, to stop this error from occurring?