I'm trying to use Python 3 to extract the text from PDF files linked on a webpage and save it as a CSV file.
First I collect the URLs of the PDF files from the webpage; then I download each file and try to extract its text with pdfminer.
Here's my code:
def PDFtoStringFromUrl(url):
    """Download the PDF at ``url`` and return its text as a single line.

    The document is fetched into memory, every page is run through
    pdfminer's ``TextConverter``, and the collected text is returned with
    newlines replaced by spaces and surrounding whitespace stripped.

    Args:
        url: Address of the PDF document to download.

    Returns:
        The extracted text as one ``str``.

    Raises:
        Exception: Any error raised while downloading or parsing is printed
            and then re-raised unchanged, so the caller sees the real cause.
    """
    try:
        # NOTE(security): certificate verification is deliberately disabled
        # here, so the download is open to man-in-the-middle tampering.
        # Prefer ssl.create_default_context() if the host's certificate is valid.
        context = ssl._create_unverified_context()
        with urlopen(url, context=context) as response:
            pdf_bytes = response.read()

        rsrcmgr = PDFResourceManager()
        # The context managers release both buffers even if a page fails.
        with BytesIO(pdf_bytes) as fp, StringIO() as retstr:
            device = TextConverter(
                rsrcmgr, retstr, codec='utf-8', laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # An empty page set with maxpages=0 is passed, as in the
                # original call; the loop handles each page it yields.
                for page in PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                ):
                    interpreter.process_page(page)
            finally:
                device.close()
            text = retstr.getvalue()
        return text.replace('\n', ' ').strip()
    except Exception as e:
        print("Read PDF Error :" + str(e))
        # Re-raise the original exception.  ``raise NotImplemented`` raised a
        # non-exception constant, which replaced the real error with an
        # unrelated TypeError.
        raise
# pdf_url collects every PDF link found on the listing pages;
# content collects the text extracted from each of those PDFs.
# (``pdf_url, content = []`` raised ValueError: an empty list cannot be
# unpacked into two names.)
pdf_url, content = [], []
url = 'https://hcss.nl/research?page={}'
for page_num in range(0, 16):
    response = requests.get(url.format(page_num))
    soup = BeautifulSoup(response.text, "html.parser")
    # Links found on this listing page (named so the builtin ``list`` is
    # not shadowed).
    page_links = []
    for link in soup.select('#visible-content > div > div.main-container.container-fluid.js-quickedit-main-content > section > div.region.region-content > div.views-element-container.form-group > div > div > div > div > div.col-sm-3.search-right.hidden-xs > div > a'):
        pdf_link = link.get('href')
        # Anchors without an href give None; skip them and any non-PDF link.
        if not pdf_link or not pdf_link.lower().endswith('pdf'):
            continue
        if pdf_link.startswith('http'):
            page_links.append(pdf_link)
        else:
            # Site-relative link: prefix the host to make it absolute.
            page_links.append('https://hcss.nl' + pdf_link)
    pdf_url.extend(page_links)

## get pdf
pdf_text = []
for file in pdf_url:
    # PDFtoStringFromUrl already returns a str, so no join is needed.
    pdf_text.append(PDFtoStringFromUrl(file))
content.extend(pdf_text)
But I get the following error message:
Read PDF Error :Literal required: <PDFStream(331): raw=68251, {'CIDSystemInfo': {'Ordering': b'Japan1', 'Registry': b'Adobe', 'Supplement': 6}, 'CMapName': /'UniJIS-UTF16-H', 'Filter': /'FlateDecode', 'Length': 68249, 'Type': /'CMap', 'WMode': 0}>