I have around 5 million tweets in the following format:
{"created_at":"Mon May 21 05:40:26 +0000 2018","id":998438346987683840,"id_str":"998438346987683840","text":"sometext","display_text_range":[0,0],"source":"u003ca href="someURL" rel="nofollow"u003eTwitter for iPhoneu003c/au003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1062745482,"id_str":"1062745482","name":"u3074u30fc","screen_name":"maga12171","location":null,"url":null,"description":"u3068u3063u3066u3082u30aau30c8u30cau3067u3059 u706bu661fu4ebauff08uff0buff09 u7652u3057u306fu95a2u30b8u30e3u30cb u6c34u66dcu3069u3046u3067u3057u3087u3046","translator_type":"none","protected":false,"verified":false,"followers_count":4,"friends_count":23,"listed_count":0,"favourites_count":977,"statuses_count":238,"created_at":"Sat Jan 05 11:09:11 +0000 2013","utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"someURL","profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","profile_background_tile":false,"profile_link_color":"0066FF","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http://pbs.twimg.com/profile_images/874952211184222209/UZ8RcGuU_normal.jpg","profile_image_url_https":"someURL","profile_banner_url":"someURL","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[],"media":[{"id":998438345532325888,"id_str":"998438345532325888","indices":[0,23],"media_url":"someURL","media_url_https":"someURL","url":"someURL","display_url":"pic.twitter.com/J1RJGazs8k","expanded_url":"someURL","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":750,"h":1334,"resize":"fit"},"small":{"w":382,"h":680,"resize":"fit"},"medium":{"w":675,"h":1200,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":998438345532325888,"id_str":"998438345532325888","indices":[0,23],"media_url":"someURL","media_url_https":"someURL","url":"someURL","display_url":"pic.twitter.com/J1RJGazs8k","expanded_url":"someURL","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":750,"h":1334,"resize":"fit"},"small":{"w":382,"h":680,"resize":"fit"},"medium":{"w":675,"h":1200,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"und","timestamp_ms":"1526881226666"}
i have imported them in a mysqldb as text datatype and now i am trying to pull them row by row and clean them up so that i can keep only the data i need.
import json
import MySQLdb.cursors
HOST = ""
USER = ""
PASSWD = ""
DB = ""
conn = MySQLdb.connect(host=HOST, port =3306, user=USER,passwd=PASSWD,db=DB , cursorclass= MySQLdb.cursors.SSCursor)
x = conn.cursor()
query = "SELECT txt_column FROM txt_data limit 1"
x.execute(query)
for row in x:
print(row)
res = json.loads(row[0])
when i run it i get : ValueError: Expecting ',' delimiter: line 1 column 185 (char 184)
, That is a "
character found at the href=
" of the attached json string. A few previous posts in stackoverflow recommend to use .replace() to get rid of the " characters, but this will ruin the json format.
I think the problem is that python expects to find a format like "attribute_name":"data". When it finds "attribute_name":"data "data in quotes" more data" it throughs the error for the unexpected " char
If i am right, is there anyway i can fix this?
Please note i had to modify the json sample attached, substituting all URLs with "someURL", stackoverflow doesn't allow urls. Thus you will not find the error at char 184. In the original data 184 is the first " in href="someURL"