0

I have compiled the following code to create a word cloud of terms used in tweets containing the term "happy".

import tweepy
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from time import sleep
import json
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from scipy.misc import imread
import time
import pandas as pd

consumer_key =''
consumer_secret =''
access_token =''
access_token_secret =''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

searchTerm ='happy'
MAX_TWEETS = 10

search_data=[ ]
current_working_dir = os.path.dirname(os.path.realpath(__file__))
current_working_dir = "./"
log_tweets = current_working_dir  + str(time.time()) + '_searchtweets.txt'
with open(log_tweets, 'w') as outfile:

    for tweet in tweepy.Cursor(api.search,q=searchTerm).items(MAX_TWEETS):

        search_data.append(json.loads(json.dumps(tweet._json)))
        outfile.write(json.dumps(tweet._json))
        outfile.write("\n")


tweets = pd.DataFrame()
tweets['created_at'] = list(map(lambda tweet: time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')), search_data))
tweets['user'] = list(map(lambda tweet: tweet['user']['screen_name'], search_data))
tweets['text'] = list(map(lambda tweet: tweet['text'].encode('utf-8'), search_data))
tweets['lang'] = list(map(lambda tweet: tweet['lang'], search_data))
tweets['Location'] = list(map(lambda tweet: tweet['place']['country'] if tweet['place'] != None else None, search_data))
tweets['retweet_count'] = list(map(lambda tweet: tweet['retweet_count'], search_data))
tweets['favorite_count'] = list(map(lambda tweet: tweet['favorite_count'], search_data))

tweets['long'] = list(map(lambda tweet: tweet['coordinates']['coordinates'][0] if tweet['coordinates'] != None else 'NaN', search_data))

tweets['latt'] = list(map(lambda tweet: tweet['coordinates']['coordinates'][1] if tweet['coordinates'] != None else 'NaN', search_data))


words = " ".join(tweets['text'].values.astype(str))

no_urls_no_tags = " ".join([word for word in words.split()
                            if 'http' not in word
                            and not word.startswith('@')
                            and word != 'RT'
                            ])

search_mask = imread('images/twitter_mask.png', flatten=True )

wc = WordCloud(background_color="white", 
               font_path="/Library/Fonts/Verdana.ttf", 
               stopwords=STOPWORDS, 
               width=1800, 
               height=140, 
               mask=search_mask)
wc.generate(no_urls_no_tags)
plt.imshow(wc)
plt.axis("off")
plt.savefig('search_term_wordcloud_print.png', dpi =300)
plt.show()

When running, i get the following error message,

words = " ".join(tweets['text'].values.astype(str))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 139: ordinal not in range(128)

is there a way, or lines of code to change to stop this error from occurring?

A.Lona
  • 3
  • 2
  • You have to specify encoding. Please check this question http://stackoverflow.com/questions/9644099/python-ascii-codec-cant-decode-byte – taras Feb 19 '17 at 17:32
  • Why `.encode('utf8')` the tweet text? Process the strings as Unicode and you won't have to convert it back to a `str` to join it. – Mark Tolonen Feb 19 '17 at 19:55
  • Thanks, Mark. That fixed the problem. Thank you very much!! – A.Lona Feb 21 '17 at 13:07

0 Answers0