I've just had some trouble piecing together matplotlib documentation particularly for my data set. I think I have a small enough block of code that shows all of my lack of understanding despite reading documentation. The documentation i've beeen trying to use for reference was initially this for creating a line graph https://matplotlib.org/gallery/text_labels_and_annotations/date.html
I've been trying to plot a numpy array, post_records
containing two columns. I'm working with social media data, so the first column is for post_ids
and the second is for datetime_obj_col
that I managed to read from csv file using some scripting.
I managed to create a line graph with this data in matplotlib, but I don't quite know how to make a histogram.
Right now, nothing shows up when I run my program
fig, ax = plt.subplots()
hist, bins, patch_lst = ax.hist(post_records[:,1], bins=range(31)) # thought that bins could be a sequence, wanted to create 31 bins for 31 total days in a month
ax.plot(hist, bins)
ax.set_xlabel('Days')
ax.set_ylabel('frequency')
ax.set_title(r'Histogram of Time')
plt.show() # shows nothing
- What do I need to pass to ax.plot? I'm unclear about how to pass in my x dataset
- why isn't the window showing?
Edit with how to replicate this:
def create_dataframe_of_datetime_objects_and_visualize():
datetime_lst = [1521071920000000000, 1521071901000000000, 1521071844000000000, 1521071741000000000, 1521071534000000000] # to get this variable I loaded my original dataframe with 1980000, sliced the first 5 entries, then printed out the 'datetime_obj_col'. I can't exactly remember what this format is called, I think it's unix time.
id_lst = [974013, 974072, 327212, 123890, 438201]
for each in range(len(datetime_lst)):
datetime_lst[each] = pd.to_datetime(datetime_lst[each], errors='coerce')
datetime_lst[each] = datetime_lst[each].strftime("%d-%b-%y %H:%M:%S")
datetime_lst[each] = pd.to_datetime(datetime_lst[each], errors='coerce', dayfirst=True, format="%d-%b-%y %H:%M:%S")
datetime_lst = pd.Series(datetime_lst)
df = pd.DataFrame({'tweet_id':id_lst, 'datetime_obj_col': datetime_lst})
gb_var = df.groupby(df["datetime_obj_col"].dt.month)
gb_var_count = gb_var.count()
gb_var.plot(kind="bar")
plt.show()
note that I am not using histogram anymore. But there should be two errors that come up, the following:
Traceback (most recent call last): File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\core \groupby\groupby.py", line 918, in apply result = self._python_apply_general(f) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\core \groupby\groupby.py", line 936, in _python_apply_general self.axis) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\core \groupby\groupby.py", line 2273, in apply res = f(group) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\core \groupby\groupby.py", line 541, in f return self.plot(*args, **kwargs) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 2941, in call sort_columns=sort_columns, **kwds) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 1977, in plot_frame **kwds) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 1804, in _plot plot_obj.generate() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 266, in generate self._post_plot_logic_common(ax, self.data) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 405, in _post_plot_logic_common self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 478, in _apply_axis_properties labels = axis.get_majorticklabels() + axis.get_minorticklabels() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ axis.py", line 1245, in get_majorticklabels ticks = self.get_major_ticks() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ axis.py", line 1396, in get_major_ticks numticks = len(self.get_major_locator()()) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ dates.py", line 1249, in call self.refresh() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ dates.py", line 1269, in refresh dmin, dmax = self.viewlim_to_dt() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ dates.py", line 1026, in viewlim_to_dt .format(vmin)) ValueError: view limit minimum 0.0 is less than 1 and is an invalid Matplotlib d ate value. This often happens if you pass a non-datetime value to an axis that h as datetime units
Edit:
This is starting to seem like a bug related specifically to trying to use hist() to plot a column of datetime objects.
I took data from post_records
which is a loaded numpy array that stores a 2d data set of 198000+ post ids and datetime objects.
This is the code for a function called create datetime objects. It opens a csv file "tweet_time_info_preprocessed.csv," which only has three columns: 'tweet_id" "tweet_created_at_date
," and "tweet_created_at_hour.
" The following is code to combine the tweet_created_at_date
and tweet_created_at_hour
columns into formatted datetime objects using the pandas
library to_datetime()
method.
Csv file sample
def create_datetime_objects():
with open("post_time_info_preprocessed.csv", 'r', encoding='utf8') as time_csv:
mycsv = csv.reader(time_csv)
progress = 0
for row in mycsv:
progress +=1
if progress == 1: #header row
continue
if progress % 10000 == 0:
print(progress)
each_post_datetime_lst = []
each_post_datetime_lst.append(row[0])
time_str = str(row[1]) + " " + str(row[2])
a_date_object = pd.to_datetime(time_str, dayfirst=True, format="%d-%b-%y %H:%M:%S")
each_post_datetime_lst.append(a_date_object)
post_and_datetime_lst.append(each_tweet_datetime_lst)
numpy_arr_of_tweets_and_datetimes = np.array(tweets_and_datetime_objs)
np.save(np_save_path, numpy_arr_of_tweets_and_datetimes)
then I have visualize_objects_histogram()
def visualize_objects_histogram():
print("Visualizing timeplot as histogram")
post_records= np.load("tweets_and_datetime_objects.npy")
df = pd.DataFrame(data=post_records, columns=['post_id', 'datetime_obj_col'])
df_sliced = df[0:5]
print(df_sliced)
fig, ax = plt.subplots()
hist, bins, patch_lst = ax.hist(df_sliced['datetime_obj_col'], bins=range(5))
ax.plot(hist, bins)
ax.set_xlabel('Days')
ax.set_ylabel('frequency')
ax.set_title('Histogram of Time')
plt.show()
So I sliced off 5 rows of the data frame and stored them into df_slice
. I run this code, a blank white window appears. Printing df_slice
gives
tweet_id datetime_obj_col
0 974072352958042112 2018-03-14 23:58:40
1 974072272578166784 2018-03-14 23:58:21
2 974072032177598464 2018-03-14 23:57:24
3 974071601313533953 2018-03-14 23:55:41
4 974070732777914368 2018-03-14 23:52:14
And there's also an error message that comes with the blank white window. It's very long.
Exception in Tkinter callback Traceback (most recent call last): File "C:\Users\biney\AppData\Local\Programs\Python\Python36-32\lib\tkinter__i nit__.py", line 1699, in call return self.func(*args) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ backends_backend_tk.py", line 227, in resize self.draw() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ backends\backend_tkagg.py", line 12, in draw super(FigureCanvasTkAgg, self).draw() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ backends\backend_agg.py", line 433, in draw self.figure.draw(self.renderer) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ artist.py", line 55, in draw_wrapper return draw(artist, renderer, *args, **kwargs) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ figure.py", line 1475, in draw renderer, self, artists, self.suppressComposite) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ image.py", line 141, in _draw_list_compositing_images a.draw(renderer) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ artist.py", line 55, in draw_wrapper return draw(artist, renderer, *args, **kwargs) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ axes_base.py", line 2607, in draw mimage._draw_list_compositing_images(renderer, self, artists) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ image.py", line 141, in _draw_list_compositing_images a.draw(renderer) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ artist.py", line 55, in draw_wrapper return draw(artist, renderer, *args, **kwargs) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ axis.py", line 1190, in draw ticks_to_draw = self._update_ticks(renderer) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ axis.py", line 1028, in _update_ticks tick_tups = list(self.iter_ticks()) # iter_ticks calls the locator File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ axis.py", line 971, in iter_ticks majorLocs = self.major.locator() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ dates.py", line 1249, in call self.refresh() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ dates.py", line 1269, in refresh dmin, dmax = self.viewlim_to_dt() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ dates.py", line 1026, in viewlim_to_dt .format(vmin)) ValueError: view limit minimum -0.19500000000000003 is less than 1 and is an inv alid Matplotlib date value. This often happens if you pass a non-datetime value to an axis that has datetime units
This error message repeats 5 times with slightly different values for the "view limit." Possibly 5 error messages ffor my 5 records. I think the error messages are most closely related to the following online version of dates.py... Could be wrong : https://fossies.org/linux/matplotlib/lib/matplotlib/dates.py (around line 1022, I'm going to check the actual file on my computer soon).
I'm going to try stuff from this post to see if it will help: Can Pandas plot a histogram of dates?
Edit 2: The previous stackoverflow introduced me to two helpful methods, but they didn't work. I changed my visualize... function to the following
def visualize_datetime_objects_with_pandas():
tweets_and_datetime_objects = np.load("tweets_and_datetime_objects.npy") # contains python datetime objects
print("with pandas")
print(tweets_and_datetime_objects.shape)
df = pd.DataFrame(data=tweets_and_datetime_objects, columns=['tweet_id', 'datetimeobj'])
pandas_freq_dict = df['datetimeobj'].value_counts().to_dict()
#print(pandas_freq_dict)
print(len(list(pandas_freq_dict.keys())))
print(list(pandas_freq_dict.keys())[0])
print(list(pandas_freq_dict.values())[1])
plt.plot(pandas_freq_dict.keys(), pandas_freq_dict.values())
#df = df.set_index('datetimeobj')
# changing the index of this dataframe to a time index
#df['datetimeobj'].plot(kind='line', style=['--'])
plt.show()
It gives the following output/error message.
date-time temporal data visualization script
Visualizing timeplot as histogram tweet_id datetime_obj_col datetime_obj_col 14 5 5 tweet_id datetime_obj_col datetime_obj_col 14 5 5 Traceback (most recent call last): File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\core \groupby\groupby.py", line 918, in apply result = self._python_apply_general(f) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\core \groupby\groupby.py", line 936, in _python_apply_general self.axis) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\core \groupby\groupby.py", line 2273, in apply res = f(group) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\core \groupby\groupby.py", line 541, in f return self.plot(*args, **kwargs) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 2941, in call sort_columns=sort_columns, **kwds) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 1977, in plot_frame **kwds) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 1804, in _plot plot_obj.generate() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 266, in generate self._post_plot_logic_common(ax, self.data) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 405, in _post_plot_logic_common self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\pandas\plot ting_core.py", line 478, in _apply_axis_properties labels = axis.get_majorticklabels() + axis.get_minorticklabels() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ axis.py", line 1245, in get_majorticklabels ticks = self.get_major_ticks() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ axis.py", line 1396, in get_major_ticks numticks = len(self.get_major_locator()()) File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ dates.py", line 1249, in call self.refresh() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ dates.py", line 1269, in refresh dmin, dmax = self.viewlim_to_dt() File "C:\Users\biney\AppData\Roaming\Python\Python36\site-packages\matplotlib\ dates.py", line 1026, in viewlim_to_dt .format(vmin)) ValueError: view limit minimum 0.0 is less than 1 and is an invalid Matplotlib d ate value. This often happens if you pass a non-datetime value to an axis that h as datetime units