I'm trying to include a pandas DataFrame with a multi-index in a PDF report, and I would like the table to be nicely formatted.
I have found these two approaches:
pandas.df -> HTML -> pdf
import pandas as pd
from IPython.display import HTML
import pdfkit
# df generation
# Per-cluster summary statistics, requesting the 5%, 50% and 95% percentiles.
df = pd.read_csv(path_to_csv, sep=',')
groupeddf = df.groupby('Cluster')
res = groupeddf.describe([0.05, 0.5, 0.95])
res.index.rename(['Cluster', 'stats'], inplace=True)

# Copy both index levels into ordinary columns so they can be fed back
# into set_index() below together with the new columns.
res['Cluster'] = res.index.get_level_values('Cluster')
res['stats'] = res.index.get_level_values('stats')

# Value of the first column on every 'count' row.
is_count_row = res.index.get_level_values('stats') == 'count'
populations = res.iloc[is_count_row, 0].values.tolist()
# NOTE(review): relies on the legacy MultiIndex.labels API; presumably
# labels[0] gives each row's position within the 'Cluster' level - confirm
# against the pandas version in use.
res['population'] = [populations[i] for i in res.index.labels[0].values()]
total_pop = sum(populations)
res['frequency'] = (res['population'] / total_pop).round(3)
res.set_index(['Cluster', 'population', 'frequency', 'stats'], inplace=True)

# Keep only the rows for the statistics that go into the report,
# rounded to two decimals.
stats_level = res.index.get_level_values('stats')
res1 = res.iloc[stats_level.isin(['5%', 'mean', '50%', '95%'])]
res1 = res1.round(2)
# saving the df
# Render the table as HTML and write it to disk. The context manager
# closes the file even if the write raises.
h = HTML(res1.to_html())
with open('test.html', 'w') as my_file:
    my_file.write(h.data)

# Options handed to pdfkit.from_file(); here only the page orientation.
options = {
    'orientation': 'Landscape',
}

# Convert the HTML file that was just written into out.pdf.
with open('test.html') as f:
    pdfkit.from_file(f, 'out.pdf', options=options)
But this approach depends on pdfkit,
which makes it difficult for us to use. That's why I am trying pandas.df -> tex -> pdf instead (as mentioned in "Export a Pandas dataframe as a table image"):
import pandas as pd
import os
# df generation
# Per-cluster summary statistics, requesting the 5%, 50% and 95% percentiles.
df = pd.read_csv(path_to_csv, sep=',')
groupeddf = df.groupby('Cluster')
res = groupeddf.describe([0.05, 0.5, 0.95])
res.index.rename(['Cluster', 'stats'], inplace=True)

# Copy both index levels into ordinary columns so they can be fed back
# into set_index() below together with the new columns.
res['Cluster'] = res.index.get_level_values('Cluster')
res['stats'] = res.index.get_level_values('stats')

# Value of the first column on every 'count' row.
is_count_row = res.index.get_level_values('stats') == 'count'
populations = res.iloc[is_count_row, 0].values.tolist()
# NOTE(review): relies on the legacy MultiIndex.labels API; presumably
# labels[0] gives each row's position within the 'Cluster' level - confirm
# against the pandas version in use.
res['population'] = [populations[i] for i in res.index.labels[0].values()]
total_pop = sum(populations)
res['frequency'] = (res['population'] / total_pop).round(3)
res.set_index(['Cluster', 'population', 'frequency', 'stats'], inplace=True)

# Keep only the rows for the statistics that go into the report,
# rounded to two decimals.
stats_level = res.index.get_level_values('stats')
res1 = res.iloc[stats_level.isin(['5%', 'mean', '50%', '95%'])]
res1 = res1.round(2)

# Swap underscores in the column names for spaces before the LaTeX export
# (presumably because '_' is a special character in LaTeX).
res1.rename(columns=lambda x: x.replace('_', ' '), inplace=True)
#latex
# Minimal LaTeX document wrapped around the table. The doubled braces are
# literal braces for str.format(); the single {} receives the table source.
# The standalone class and the booktabs package must be available in the
# local TeX installation for this document to compile.
template = r'''\documentclass[preview]{{standalone}}
\usepackage{{booktabs}}
\begin{{document}}
{}
\end{{document}}
'''

# Text mode: template.format(...) produces a str, which a file opened in
# binary mode ("wb") rejects with a TypeError on Python 3.
with open("outputfile.tex", "w") as afile:
    afile.write(template.format(res1.to_latex()))

# Compile the .tex file; pdflatex must be on the PATH.
os.system("pdflatex outputfile.tex")
However, I am not familiar with LaTeX, and I get this error:
! LaTeX Error: File `standalone.cls' not found.
Type X to quit or <RETURN> to proceed,
or enter a new name. (Default extension: cls)
Any idea what causes this error, or what the standard way to go from a pandas DataFrame to a PDF is?