I have 5 PDF files that I want to convert to text files. Three of them convert fine, but the other two only produce `(cid:number)` tokens, e.g.:
(cid:47)(cid:57)(cid:3)(cid:69)(cid:72)
I wrote my code with pdfminer. Does anyone have an idea how to fix this, or how to adjust my code?
By the way: the text is in German, not CJK. I also tried converting the file on https://www.pdf2go.com, and it worked there.
Here is my code:
import sys
import io
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
# import the regex module
import re
import os
# Path of the PDF that will be converted.
filename = 'test.pdf'

# First page to extract (presumably 1-based, as displayed by a PDF viewer).
page_start_input = 24

# Page indices handed to pdfminer: everything from the start page (shifted
# down by one) up to, but not including, index 500.
pages = [*range(page_start_input - 1, 500)]
def pdfparser(data):
    """Extract text from the selected pages of a PDF and save it as UTF-8.

    The pages to extract are taken from the module-level ``pages`` sequence.
    The extracted text is written to ``test_out.txt`` in the current working
    directory and is also returned to the caller.

    NOTE(review): when the output contains ``(cid:N)`` tokens instead of
    readable text, pdfminer was unable to map those glyphs to Unicode
    (typically the font embedded in the PDF has no usable ToUnicode table).
    That cannot be repaired by changing this function; such files usually
    need a different extractor or OCR -- confirm against the pdfminer FAQ.

    Args:
        data: Path of the PDF file to read.

    Returns:
        The extracted text as a ``str``.
    """
    resource_manager = PDFResourceManager()
    laparams = LAParams()
    pagenos = set(pages)

    # Context managers make sure the PDF handle and the in-memory buffer are
    # released even if pdfminer raises while processing a page.
    with open(data, 'rb') as fp, io.StringIO() as retstr:
        device = TextConverter(
            resource_manager, retstr, codec='utf-8', laparams=laparams
        )
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            for page in PDFPage.get_pages(fp, pagenos):
                interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer before the ``with`` block closes it.
        text = retstr.getvalue()

    with open("test_out.txt", "w", encoding='utf-8') as out_file:
        out_file.write(text)

    return text
# Run the conversion only when this file is executed as a script, so that
# importing it from another module does not trigger PDF processing.
if __name__ == "__main__":
    pdfparser(filename)