I'm having a peculiar problem with making graphs from several comma-separated lists in Pandas. Basically, I want to make four graphs at once. If I run the code blocks 2.1, 2.2, 2.3, and 2.4 separately, I get the correct plots.
However, if I run all of the code at once, I get this strange-looking plot that isn't correct at all:
Is there any way to fix this?
Here's the Kaggle data repo that has the input files. Download, unzip, and rename appropriately if you wish to replicate the issue: Kaggle repo
And here's the code. I'm using Spyder 4.1.5 with Python 3.8.5 (Anaconda version). Apologies in advance for the length of the code:
"""
This program will use pandas to take multiple comma-seperated list files from
https://www.kaggle.com/maksymshkliarevskyi/reddit-data-science-posts and randomly output data from them.
Filename: Poeth_Lab_11-1.py
Author: Dean Poeth
Created on 4/13/2021
"""
#%% Initialization sequence
#imports
import os
import pandas as pd
# read data. Idea for reading Windows file from https://stackoverflow.com/questions/37400974/unicode-error-unicodeescape-codec-cant-decode-bytes-in-position-2-3-trunca
input_folder = "reddit_aiml/"
# list of files
file_list = os.listdir(input_folder)
# empty list to store dataframes
df_list = []
# read in dataframes
for file in file_list:
print("loading", file)
temp_df = pd.read_csv(input_folder + file)
#add dataframe to list
df_list.append(temp_df)
#join together
# join them together
# https://stackoverflow.com/questions/32444138/concatenate-a-list-of-pandas-dataframes-together
aiml_data = pd.concat(df_list)
# get a random sample of data to see if things work without
# taking forever!
"###########------------REMOVE WHEN HOMEWORK IS COMPLETE!------------###########"
#aiml_data = aiml_data.sample(1000)
"""^^^^^^^^^^^^^^^^^^"""
"""Row check goes here"""
aiml_data['date_posted'] = pd.to_datetime(aiml_data['created_timestamp'])
aiml_data['date_posted'].head()
# explore data
aiml_data.info()
summary = aiml_data.describe(include='all')
# date formatting
aiml_data['author_created_date'] = pd.to_datetime(aiml_data['author_created_utc'], unit='s')
aiml_data['author_created_date'].head()
aiml_data['created_date'] = pd.to_datetime(aiml_data['created_date'])
#%% Q 2.1 plot total number of posts across all subreddits over time
"###########------------COMPLETE------------###########"
print("------------Question 2.1------------")
print()
#aiml_data['created_date'].dt.date
total_post_plot = aiml_data.groupby(aiml_data['created_date'].dt.year)['created_date'].count().plot(kind='line')
total_post_plot.set(xlabel="Year", ylabel="Post Count",
title = "AI/ML Total number of posts over time (all subreddits)")
total_post_plot.get_figure().savefig('totalPostPlot.png')
print("Plot drawn. See Plots area for more details.")
print()
print()
#%% Q 2.2 distribution of post scores
"###########------------COMPLETE------------###########"
print("------------Question 2.2------------")
print()
#post_score_dist = aiml_data.groupby(aiml_data['created_date'].dt.year)['score'].count().plot(kind='bar', sharex=False)
#post_score_dist.set(xlabel="Year", ylabel="Score",
# title = "AI/ML Post score distribution (all subreddits)")
#post_score_dist.get_figure().savefig('postScoreDist.png')
post_score_dist = aiml_data['score'].plot.hist()
post_score_dist.set(xlabel="Score", ylabel="Post Count", title = "AI/ML Post Score Distribution")
post_score_dist.get_figure().savefig('postScoreDistribution.png')
print("Plot drawn. See Plots area for more details.")
print()
print()
#%% Q 2.3 posts by day of week
"###########------------COMPLETE------------###########"
print("------------Question 2.3------------")
print()
# Which day of the week was something posted?
aiml_data['dow'] = aiml_data['created_date'].dt.day_name()
aiml_data['dow'] = pd.Categorical(aiml_data['dow'], categories=
['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
ordered=True)
#Print the plot directly (Minimal Formatting)
#aiml_data.groupby('dow')['created_date'].count().plot(kind='bar')
dow_plot = aiml_data.groupby('dow')['created_date'].count().plot(kind='bar')
dow_plot.set(xlabel="Day of the Week", ylabel="Total Number of Posts",
title = "AI/ML Posts per Day of the week")
#Save figure to file
dow_plot.get_figure().savefig('postsByDOW.png')
print("Plot drawn. See Plots area for more details.")
print()
print()
#%% Q 2.4 Plot the total number of posts per hour of the day.
"###########------------COMPLETE------------###########"
print("------------Question 2.4------------")
print()
aiml_data['hod'] = aiml_data['created_date'].dt.hour
#aiml_data['hod'] = pd.Categorical(aiml_data['hod'], categories=
# ['1pm', '2pm'],
# ordered=True)
#Print the plot directly (Minimal Formatting)
#aiml_data.groupby('hod')['created_date'].count().plot(kind='bar')
hod_plot = aiml_data.groupby('hod')['created_date'].count().plot(kind='bar')
hod_plot.set(xlabel="Hour of the day", ylabel="Total Number of Posts",
title = "AI/ML Posts per hour of the day")
#Save figure to file
hod_plot.get_figure().savefig('postsByHOD.png')
print("Plot drawn. See Plots area for more details.")
print()
print()