I am currently working on a project where I want to extract text from a PDF and then check whether any of the words in the extracted text appears in a certain dictionary. If so, I want to use example.replace(file, x, y) to replace the word in my text with the value from my dictionary.
I'm struggling with the loop that checks all words in my text and compares them to the dictionary automatically. The goal is that I don't have to type "old" and "new" myself; instead, the program checks all words in the text, and if it finds one in the dictionary, "old" shall be the word from the text and "new" the value of that key. The manual version works.
Here is my code
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file as a single string.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` whose ``TextConverter`` device writes into an
    in-memory ``StringIO`` buffer.

    Args:
        path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The contents of the text buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # ``with`` guarantees the file is closed even if a page fails to
        # parse; previously an exception here leaked the open handle.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the ``finally`` clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
# Replacement table: each key is a word to look for in the extracted text,
# each value is the word that should take its place.
dictionary = {
    "Die": "Der",
    "Arbeitsfläche": "Platz",
}
def convert(file, old=None, new=None, mapping=None):
    """Extract the text of a PDF and apply replacements to it.

    Two independent replacement modes are supported and may be combined:

    * ``old``/``new``: plain ``str.replace`` of every occurrence of ``old``
      (substring match), exactly as ``convert(file, old, new)`` always did.
    * ``mapping``: whole-word replacement driven by a dict, so the pairs do
      not have to be typed by hand for each call.

    Args:
        file: Path of the PDF, passed on to ``convert_pdf_to_txt``.
        old: Substring to replace; only applied when ``new`` is also given.
        new: Replacement for ``old``.
        mapping: Optional dict mapping a word found in the text to the word
            that should replace it. A "word" is a maximal run of ``\\w``
            characters, so hyphenated terms are treated as separate words.

    Returns:
        The extracted text with the requested replacements applied.
    """
    import re

    translation = convert_pdf_to_txt(file)

    if old is not None and new is not None:
        translation = translation.replace(old, new)

    if mapping:
        # A single regex pass looks each word up once. Chaining
        # ``str.replace`` per key would also hit parts of longer words and
        # could replace a value again when that value is itself a key.
        translation = re.sub(
            r"\w+",
            lambda match: mapping.get(match.group(0), match.group(0)),
            translation,
        )

    return translation
# Manual example: replace "Die" with "Der" in the text of mytest.pdf and
# print the result.
print(convert(file='mytest.pdf', old='Die', new='Der'))
Thanks for help!