I have a list of lists
[["Due to the storms this weekend, we have rescheduled the Blumenfield Bike Ride for Feb 26. Hope to see you there.\xe2\x80\xa6 '"], ['Lots of sun this weekend, take advantage of the Beach Bus that gets you from Woodland Hills to the beach for just $\xe2\x80\xa6 '], ["RT @LHansenLA: Yesterday got a peek inside @LAPPL @EagleandBadge new rig for End of Watch Memorial Wall. Moving tribute to fallen @LAPD w/\xe2\x80\xa6'"], ["Happy to join Art Sherman and Wings Over @Wendys to honor veterans & 15 years of weekly meetings hosted by Ron and\xe2\x80\xa6 '"], ["Join me for the 4th Annual Blumenfield Bike Ride. Enjoy the West Valley on 2 wheels. RSVP:'"]]
As you can see, the lists unfortunately are displaying literal UTF-8 instead of the characters themselves. At some point in my code, I encode into UTF-8
outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
outtweets = [[stuff.replace("b\'", "")] for sublist in outtweets for stuff in sublist]
outtweets = [[stuff.replace('b\"', "")] for sublist in outtweets for stuff in sublist]
The above code is all necessary in order to remove the b prefixes. These cannot be in my tweets because I am doing machine learning analysis and having the bs affects it.
My Question
How do I replace the UTF-8 script with the actual characters?
I need to encode it somehow because I am pulling tweets from (3 cities) x (50 officials) x (12 months of tweets for each) so it would be impossibly inefficient to try to manually replace them.
Code
import tweepy #https://github.com/tweepy/tweepy
#Twitter API credentials
consumer_key = "insert key here"
consumer_secret = "insert key here"
access_key = "insert key here"
access_secret = "insert key here"
#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
#!/usr/bin/env python
# encoding: utf-8
import tweepy #https://github.com/tweepy/tweepy
import json
import csv
import datetime
from datetime import datetime
import os.path
failed_accounts = []
def get_all_tweets(screen_name,mode):
#try:
#Twitter only allows access to a users most recent 3240 tweets with this method
#initialize a list to hold all the tweepy Tweets
alltweets = []
#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name = screen_name,count=200)
#save most recent tweets
alltweets.extend(new_tweets)
#save the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
i = 0
num_req = 0
#keep grabbing tweets until there are no tweets left to grab
while len(new_tweets) > 0:
#all subsiquent requests use the max_id param to prevent duplicates
new_tweets = api.user_timeline(screen_name = screen_name,count=200,max_id=oldest)
#save most recent tweets
alltweets.extend(new_tweets)
#update the id of the oldest tweet less one
oldest = alltweets[-1].id - 1
print ("...%s tweets downloaded so far" % (len(alltweets)))
num_req = num_req + 1
# makes further requests only if batch doesn't contain tweets beyond oldest limit
oldest_limit = datetime(2016, 1, 20,0,0,0)
x = 0
for tweet in new_tweets:
raw_date = tweet.created_at
if raw_date < oldest_limit:
x = 1
else:
continue
if x == 1:
break
#BSP this script is designed to just keep going. I want it to stop.
#i = i + 1
#if i == 10:
# break
print("Number of Tweet Request Rounds: %s" %num_req)
correct_date_tweet = []
for tweet in alltweets:
raw_date = tweet.created_at
#date = datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S")
newest_limit = datetime(2017, 1, 20,0,0,0)
oldest_limit = datetime(2016, 1, 20,0,0,0)
if raw_date > oldest_limit and raw_date < newest_limit:
correct_date_tweet.append(tweet)
else:
continue
#transform the tweepy tweets into a 2D array that will populate the csv
if mode == "tweets only" or "instance file":
outtweets = [[str(tweet.text.encode("utf-8"))] for tweet in correct_date_tweet]
outtweets = [[stuff.replace("b\'", "")] for sublist in outtweets for stuff in sublist]
outtweets = [[stuff.replace('b\"', "")] for sublist in outtweets for stuff in sublist]
outtweets = [["1 ",stuff.replace('"', "")] for sublist in outtweets for stuff in sublist]
#outtweets = [["1 ",stuff] for sublist in outtweets for stuff in sublist]
else:
outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8"),tweet.retweet_count,tweet.favorite_count,len(tweet.entities.get("hashtags")),len(tweet.entities.get("urls")),len(tweet.entities.get("user_mentions"))] for tweet in correct_date_tweet]
#write the csv
if mode == "instance file":
with open(os.path.join(save_location,'%s.instance' % screen_name), mode ='w') as f:
writer = csv.writer(f)
writer.writerows(outtweets)
else:
with open(os.path.join(save_location,'%s.csv' % screen_name), 'w',encoding='utf-8') as f:
writer = csv.writer(f)
if mode != "tweets only":
writer.writerow(["id","created_at","text","retweets","favorites","hashtags","urls"])
writer.writerows(outtweets)
pass
print("Done with %s" % screen_name)
get_all_tweets("BobBlumenfield","instance file")
Update
Based on an answer, I tried changing one of the lines to outtweets = [[tweet.text] for tweet in correct_date_tweet]
But this didn't work because it yields
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-12-a864b5efe8af> in <module>()
----> 1 get_all_tweets("BobBlumenfield","instance file")
<ipython-input-9-d0b9b37c7261> in get_all_tweets(screen_name, mode)
104 with open(os.path.join(save_location,'%s.instance' % screen_name), mode ='w') as f:
105 writer = csv.writer(f)
--> 106 writer.writerows(outtweets)
107 else:
108 with open(os.path.join(save_location,'%s.csv' % screen_name), 'w',encoding='utf-8') as f:
C:\Users\Stan Shunpike\Anaconda3\lib\encodings\cp1252.py in encode(self, input, final)
17 class IncrementalEncoder(codecs.IncrementalEncoder):
18 def encode(self, input, final=False):
---> 19 return codecs.charmap_encode(input,self.errors,encoding_table)[0]
20
21 class IncrementalDecoder(codecs.IncrementalDecoder):
UnicodeEncodeError: 'charmap' codec can't encode characters in position 64-65: character maps to <undefined>