I am working on extracting text from PDF and save it in .csv file. Below image shows the text I am trying to extract from the PDF:
Currently, I am able to extract text but can't get rid of the numbers that indicate page numbers and indexing (i.e., numbers at the start and end of the text 1, 5, 1.1, 5, 1.2 etc...). Below is my working code (I am working on python 3.5):
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO, BytesIO
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages = maxpages, password = password, caching=caching, check_extractable=True):
interpreter.process_page(page)
text = retstr.getvalue()
text = text.replace('\n\n', ' ').replace('\n',' ').replace('–',' ').replace('_',' ').replace('\t',' ').encode('ascii', errors='replace').decode('utf-8').replace("?","").replace("\x0c","").replace(".","").replace('\\',"").replace('/',"").replace('\r',"").replace("-"," ").replace(".......*"," ")
text = " ".join(text.split())
fp.close()
device.close()
retstr.close()
return text
content = convert_pdf_to_txt('filename.pdf')
#print (content.encode('utf-8'))
s = StringIO(content)
with open('output.csv', 'w') as f:
for line in s:
f.write(line)
Thanks in advance for the help.