I'm working on a project that fetches tweets from Twitter using tweepy and processes the text. The problem I'm having is that the text must not contain any emojis, special characters, etc. Unfortunately, one of the libraries I'm using doesn't support Python 3, so I have to use Python 2.7. Is there any way to remove everything except the "human-readable text"? I have been using the ftfy library, but I still get output like this:
- ��
티파챗
- ��
My code:
import tweepy
from ftfy import fix_text,fix_encoding
from requests.exceptions import ConnectionError
from requests.packages.urllib3.exceptions import ProtocolError,ReadTimeoutError
import time
import exceptions
# Twitter API credentials. The real values are elided here ('...').
consumer_key = '...'
consumer_secret = '...'
access_token = '...'
access_token_secret = '...'
# Build the OAuth handler from the consumer key/secret, then attach the
# access token pair. `auth` is later passed to tweepy.Stream.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that prints the ftfy-repaired text of each status."""

    def on_connect(self):
        """Report that the stream connection has been established."""
        # Single-argument print(...) behaves like the print statement on 2.7.
        print('Connected')

    def on_status(self, status):
        """Repair ``status.text`` with ftfy and print it encoded as UTF-8.

        Always returns True (presumably this tells tweepy to keep the
        stream open -- confirm against the tweepy documentation).
        """
        decoded = fix_encoding(status.text)
        cleaned = fix_text(decoded)
        fixed_text = cleaned.encode('utf-8')
        print(fixed_text)
        return True

    def on_error(self, status):
        """Print the error value handed over by the stream; return False.

        NOTE(review): returning False presumably makes tweepy disconnect;
        verify against the tweepy documentation.
        """
        print(status)
        return False
# Reconnect loop: open the stream and, when one of the known network
# errors is raised, wait 10 seconds before connecting again.
# `running` is never cleared in this script, so the loop runs until the
# process is interrupted or an unhandled exception escapes.
running = True
while running:
    try:
        print('Connecting')
        myStreamListener = MyStreamListener()
        myStream = tweepy.Stream(auth=auth, listener=myStreamListener)
        # NOTE(review): if filter() returns without raising, the loop
        # reconnects immediately with no delay -- confirm that is intended.
        myStream.filter(track=['python'])
    except ConnectionError:
        print('Connection Error: Waiting 10 seconds before retrying')
        time.sleep(10)
    except ProtocolError:
        print('ProtocolError: Waiting 10 seconds before retrying')
        time.sleep(10)
    except ReadTimeoutError:
        print('Read Timeout Error: Waiting 10 seconds before retrying')
        # Fix: this handler announced a 10 second wait but never slept,
        # so a read timeout caused an immediate reconnect.
        time.sleep(10)
Note: this is just my test script for learning how to fetch tweets from Twitter and print them.