I'm following a little article called: Mining Twitter Data with Python
Actually, I'm in part 2 that is text pre-processing. This is the example for tokenize a tweet text.
import re
import json
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML Tags
r'(?:@[\w_]+)', # @-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
return tokens_re.findall(s)
def preprocess(s, lowercase=False):
tokens = tokenize(s)
if lowercase:
tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
return tokens
Now, it works properly when you insert a string directly like this:
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(preprocess(tweet))
But once I try to import a JSON for tokenize all tweets text in the file it occurrs an Error.
This is how it's supposed to work.
with open('tweets.json', 'r') as f:
for line in f:
tweet = json.loads(line)
tokens = preprocess(tweet['text'])
This is the error displayed:
Traceback (most recent call last):
File "C:/Users/fmigg/PycharmProjects/untitled/Data Mining/tweetTextProcessing.py", line 43, in <module>
tweet = json.loads(line)
File "C:\Program Files\Anaconda3\lib\json\__init__.py", line 319, in loads
return _default_decoder.decode(s)
File "C:\Program Files\Anaconda3\lib\json\decoder.py", line 339, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Program Files\Anaconda3\lib\json\decoder.py", line 357, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 1 (char 1)
Finally, this is the JSON file called tweets.json with Tweets on it (the number of tweets is kinda large, so I'll only put one Tweet in order to analyze the structure of it).
{"created_at":"Tue Jun 27 16:05:01 +0000 2017","id":879732307992739840,"id_str":"879732307992739840","text":"RT @PythonQnA: Python List Comprehension Vs. Map #python #list-comprehension #map-function https:\/\/t.co\/YtxeSt64pd","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":704974573985525760,"id_str":"704974573985525760","name":"UNIVERSAL TGSI","screen_name":"universaltgsi","location":"Magny-le-Hongre, France, SM","url":"http:\/\/www.tgsi.eu","description":"Find everything you want to know about business Technology by ONE TGSI","protected":false,"verified":false,"followers_count":424,"friends_count":343,"listed_count":273,"favourites_count":4250,"statuses_count":2958,"created_at":"Wed Mar 02 10:20:11 +0000 2016","utc_offset":7200,"time_zone":"Paris","geo_enabled":false,"lang":"fr","contributors_enabled":false,"is_translator":false,"profile_background_color":"1B95E0","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/705020861909225472\/psLvMIAP.jpg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/705020861909225472\/psLvMIAP.jpg","profile_background_tile":true,"profile_link_color":"0084B9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/866410987880099840\/HT8fZKLO_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/866410987880099840\/HT8fZKLO_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/704974573985525760\/1495404137","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Tue Jun 27 08:24:00 +0000 2017","id":879616290700263424,"id_str":"879616290700263424","text":"Python List Comprehension Vs. Map #python #list-comprehension #map-function https:\/\/t.co\/YtxeSt64pd","source":"\u003ca href=\"http:\/\/jarvis.ratankumar.org\/\" rel=\"nofollow\"\u003ePythonQnA\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":747460774998605825,"id_str":"747460774998605825","name":"PythonQnA","screen_name":"PythonQnA","location":"Bengaluru, India","url":null,"description":"I tweet Python questions from stackoverflow.","protected":false,"verified":false,"followers_count":632,"friends_count":64,"listed_count":277,"favourites_count":0,"statuses_count":85791,"created_at":"Mon Jun 27 16:05:10 +0000 2016","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/747461193653092352\/Mz9NjeE__normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/747461193653092352\/Mz9NjeE__normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/747460774998605825\/1467044067","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":2,"favorite_count":1,"entities":{"hashtags":[{"text":"python","indices":[34,41]},{"text":"list","indices":[42,47]},{"text":"map","indices":[62,66]}],"urls":[{"url":"https:\/\/t.co\/YtxeSt64pd","expanded_url":"https:\/\/goo.gl\/OZxWIC","display_url":"goo.gl\/OZxWIC","indices":[76,99]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"python","indices":[49,56]},{"text":"list","indices":[57,62]},{"text":"map","indices":[77,81]}],"urls":[{"url":"https:\/\/t.co\/YtxeSt64pd","expanded_url":"https:\/\/goo.gl\/OZxWIC","display_url":"goo.gl\/OZxWIC","indices":[91,114]}],"user_mentions":[{"screen_name":"PythonQnA","name":"PythonQnA","id":747460774998605825,"id_str":"747460774998605825","indices":[3,13]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":true,"filter_level":"low","lang":"en","timestamp_ms":"1498579501518"}
I'd like to know why is the reason that this is happening. Thank you so much to everyone!
P.S this is the link to the article: Mining Twitter Data with Python (Part 2: Text Pre-processing)
UPDATE:
I tried the code using one simple JSON tweet and two simple JSON tweets in a JSON file and it worked. So it seems that the problem is when I open the entire file with all Tweets on it.
If someone needs that file you can download or watch it in my Microsoft Onedrive. https://1drv.ms/f/s!AjHPHWCBEuf7ux3uLmSVEaSCPWIE