I have 100 annual reports of different banks. All these annual reports are of same format.I want to extract profit&loss table and balance sheet table from all the 100 PDFs and store in an excel file. Is there any way to do that using python?
Below is the code that is extracting all the tables In a PDF and saving in an excel file.
import tabula
from tabula import wrapper
from tabula import *
import PyPDF2,os,time
import pandas as pd
filename=input("enter pdf name")+".pdf"
pdf=PyPDF2.PdfFileReader(open(filename,"rb"))
pag_no=pdf.getNumPages()
for i in range(0,pag_no):
pg=pdf.getPage(i)
writer=PyPDF2.PdfFileWriter()
writer.addPage(pg)
NewPDFfilename="Page_"+str(i)+".pdf"
with open(NewPDFfilename,"wb")as outputStream:
writer.write(outputStream)
for i in range(0,pag_no):
file=wrapper.convert_into('Page_'+str(i)+'.pdf,'result_'+str(i)+'.csv',output_format='csv')
file=wrapper.convert_into('Page_'+str(i)+'.pdf,'result_'+str(i)+'.csv',output_format='xml')
try:
df=pd.read_csv("result_"+str(i)+".csv", sep=" ",header='none',delimiter=r"\s+")
if(df.empty):
print("yes")
else:
print("table found in --->PAGE"+str(i+1)+"and store in --->result_"+str(i)+".csv")
except (pd.errors.EmptyDataError,FileNotFoundError):
os.remove(r'Users\Downloads\Table-extraction-from-PDF-and-Images-master'+str(i)+'.pdf')
os.remove(r'Users\Downloads\Table-extraction-from-PDF-and-Images-master'+str(i)+'.csv')
pass