As a result of parsing, only English sentences are stored correctly.
When you parse (scrape), you will come across other languages.
However, my code only stores English sentences correctly.
The emoticon is not saved either. (ex. ★♬◆♥)
How should I solve it? Thank you for your advice.
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
#Twitter Authentication Information
consumerKey = "x"
consumerSecret = "x"
accessToken = "x"
accessSecret = "x"
class listener(StreamListener):
def on_data(self, tweetRaw):
try:
# Extract relevant information from the raw tweet data
tweetAuthor = tweetRaw.split('"user":{"id":')[1].split(',"id_str":')[0]
tweetText = tweetRaw.split(',"text":"')[1].split('","source":')[0]
tweetText = tweetText.replace(',', '')
tweetDate = tweetRaw.split('"created_at":"')[1].split('","id":')[0]
tweet = tweetDate + ',' + tweetAuthor + ',' + tweetText
# Check tweet for location data
try:
tweetLocation = tweetRaw.split(',"geo":{"type":"Point","coordinates":[')[1] \
.split(']},"coordinates":{"type":')[0]
tweet = tweet + ',' + tweetLocation
except:
pass
# Write tweet to file
try:
tweetToFile = open('rr.csv', 'a')
tweetToFile.write(tweet)
tweetToFile.write('\n')
tweetToFile.close()
except:
print('Error! Unable to write to file.')
return True
except:
time.sleep(5)
def on_error(self, status):
print(status.tweetText)
auth = OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessSecret)
twitterStream = Stream(auth, listener())
# Filter(s) for stream of tweets
## Geobox made using: http://boundingbox.klokantech.com/
indiaGeoBox = [[[139.5799775191,35.5341801276],[139.9369292191,35.5341801276],[139.9369292191,35.8307871111],[139.5799775191,35.8307871111],[139.5799775191,35.5341801276]]]
## Geobox contains tweets from parts of neighbouring countries
## Topic Filter
twitterStream.filter(track=['sing'or"day"])
twitterStream.filter(locations=indiaGeoBox)
indiaGeoBox = If not English speaking country(ex. korea.japan.china)
twitterStream.filter(track=['No English keyword'])
tweet : \ud83d\udcaf\ud83d\ude4f\ud83d\ udcaf\ud83d\ude4f\ud83d\udcaf\ud83d\ude4f
or
indiaGeoBox = If not English speaking country(ex. china.korea.japan)
twitterStream.filter(track=['English keyword'])
tweet : https://t.co/ppKCeiEU5n\n\ub178\ud1b5\uc758 \uc808\uce5c \ubb38\uc7ac\uc778