I have the following dataframe:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# Show which matplotlib version the snippets below were run with.
print(f'matplotlib: {matplotlib.__version__}')
# 3.5.3
# Sample data: the same five-row pattern occurs twice, followed by one extra
# 'Sentence' row, which gives 11 rows in total.
df = pd.DataFrame({
    'Type': ['Sentence', 'Array', 'String', '-', '-'] * 2 + ['Sentence'],
    'Length': [42, 21, 11, 6, 6] * 2 + [42],
    'label': [1, 1, 0, 0, 0] * 2 + [1],
})
print(df)
# Type Length label
#0 Sentence 42 1
#1 Array 21 1
#2 String 11 0
#3 - 6 0
#4 - 6 0
#5 Sentence 42 1
#6 Array 21 1
#7 String 11 0
#8 - 6 0
#9 - 6 0
#10 Sentence 42 1
I want to plot a stacked bar chart for an arbitrary column of the dataframe (either numerical, e.g. the `Length` column, or categorical, e.g. the `Type` column), stacked with respect to the `label` column and annotated with both count and percentage, so that small values from rare observations are also displayed. The following script gives me the wrong results:
# First attempt (produces the wrong chart).
# NOTE(review): with no x/y given, DataFrame.plot.bar draws one bar per ROW and
# stacks the numeric columns ('Length' and 'label'); nothing is counted or
# grouped by 'Type' or 'label' here.
ax = df.plot.bar(stacked=True)
#ax = df[["Type","label"]].plot.bar(stacked=True)
#ax = df.groupby('Type').size().plot(kind='bar', stacked=True)
# NOTE(review): these entries rename the two plotted series ('Length' and
# 'label'), so they do not actually correspond to the label classes 0 and 1.
ax.legend(["0: normal", "1: Anomaly"])
# Write a text annotation in the centre of every bar segment.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    # NOTE(review): `height` is the raw cell value of the row, so the '%'
    # suffix is misleading -- no percentage has been computed.
    ax.text(x + width / 2,
            y + height / 2,
            '{:.0f} %'.format(height),
            horizontalalignment='center',
            verticalalignment='center')
I imagine that I somehow need to calculate the counts of the selected column with respect to the `label` column:
## counts: value_counts() applied to every column of df; intended as the
## source of the annotation labels
counts = df.apply(pd.Series.value_counts)
## percents: each row of counts divided by that row's total; intended to
## determine the height of each bar
row_totals = counts.sum(axis=1)
percents = counts.div(row_totals, axis=0)
I tried to solve the problem by using `df.groupby(['selected column', 'label'])`, unsuccessfully. I collected all possible solutions in this Google Colab Notebook; nevertheless, I couldn't find a straightforward way to adapt them to my dataframe.
So far I have tried the following solution, inspired by this post, to solve the problem by using `df.groupby(['selected column', 'label'])`, but without success: I got `TypeError: unsupported operand type(s) for +: 'int' and 'str'` for `total = sum(dff.sum())`, and I can't figure out what the problem is — the indexing or the `df` transformation.
BTW, I collected all possible solutions in this Google Colab Notebook; nevertheless, I couldn't find a straightforward way to adapt them to my dataframe via Matplotlib. So I'm looking for an elegant way of using Seaborn or plotly.
# Count the observations of the chosen column per `label` value.
# pd.crosstab works for categorical ('Type') and numerical ('Length') columns
# alike, fills missing combinations with 0, and leaves `df` itself untouched
# (the earlier version overwrote `df` with a groupby result, after which
# `df.Type` no longer existed).
dfp_Type = pd.crosstab(df['Type'], df['label'])
#dfp_Length = pd.crosstab(df['Length'], df['label'])

# Total number of counted observations; used to express each segment as a
# share of the whole data set.
total = dfp_Type.to_numpy().sum()

# Bar heights are the raw counts, stacked by label.
ax = dfp_Type.plot(kind='bar', stacked=True, rot=0)

# iterate through each bar container (one container per label column)
for c, label_value in zip(ax.containers, dfp_Type.columns):
    # "count (percent of all rows)"; empty string for zero-height segments so
    # that no stray "0" labels are drawn
    labels = [f'{n} ({n / total:.1%})' if n > 0 else ''
              for n in dfp_Type[label_value]]
    # add the annotations (inside the loop, so every container is labelled)
    ax.bar_label(c, labels=labels, label_type='center')

# move the legend
ax.legend(title='Class', bbox_to_anchor=(1, 1.02), loc='upper left')
plt.show()
output:
Expected output: