I am trying to scrape submissions from WBS containing the TSLA ticker. I have the below code which is intended to take the top 25 submissions for each hour in the timeframe. I had a similar code for comments which worked really well for me, but now I can't figure out why my code is not working for submissions. I changed the base_url (I left in some of the urls I tried to use) and also change 'body' to 'selftext' in my code.
The error given is: ValueError: arrays must all be same length I will post the entire traceback if it helps.
year=2020
month = 6
start_date =1
days = 2
subreddit = "wallstreetbets"
def number_of_days_in_month(year=2020, month=6):
return monthrange(year, month)[1]
if month > 12 or days > number_of_days_in_month(year, month):
raise Exception()
submission_urls = []
# sorted by score
base_url = "https://api.pushshift.io/reddit/submission/search/?sort=desc&sort_type=num_comments&size=25&subreddit={}
"
#base_url = "https://api.pushshift.io/reddit/submission/search?limit=25&sort_type=score&sort=desc&subreddit={}
"
#base_url = "https://api.pushshift.io/reddit/search/submission/?selftext=TSLA
"
#base_url = "https://api.pushshift.io/reddit/submission/search?limit=25&sort_type=score&sort=desc&subreddit={}
"
#base_url = 'https://api.pushshift.io/reddit/search/submission/?subreddit=WallStreetBets&after=2d&before=1d&q
=TSLA'
def downloadsubmissionsFromUrl(base_url):
count = 0
submission_temp = {"id":[], "author":[], "selftext":[], "created_utc":[], "permalink":[]}
for j in range(start_date,days+1):
# for every hour in a day
for i in range(0,23,2): # 2 hour steps
count += 1
current_day_start = int(dt.datetime(year,month, j, i, 0).timestamp())
current_day_end = int(dt.datetime(year,month, j, i, 59,59).timestamp())
url = base_url + f"&after={current_day_start}&before={current_day_end}"
new_url = url.format(subreddit)
print(new_url)
issue = False
try:
json = requests.get(new_url, timeout=5)
except:
json = None
issue = True
print("request failed - skipping")
try:
json_data = json.json()
except:
print(json)
# i guess if it didnt fail above?
else:
if 'data' not in json_data:
issue = True
objects = json_data['data']
if len(objects) == 0:
issue = True
if not issue:
for submission in objects:
try:
submission_temp["id"].append(submission['id'])
submission_temp["author"].append(submission['author'])
submission_temp["selftext"].append(submission['selftext'])
submission_temp["created_utc"].append(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(submission['created_utc'])))
submission_temp["permalink"].append(f"https://reddit.com{submission['permalink']}")
except Exception as err:
print(traceback.format_exc())
else:
print("JSON request failed")
time.sleep(5)
return pd.DataFrame(submission_temp)
submissions = downloadsubmissionsFromUrl(base_url)