2

I want to extract text from an online PDF using pdfminer with the code below. It raises no error, but the output is empty.

from pdfminer.pdfpage import PDFPage
from urllib import request
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open

def readPDF(pdfFile):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdfFile: Binary file-like object containing the PDF. A stream that
            is not seekable (such as the HTTP response returned by
            ``urlopen``) is first read fully into memory, because the PDF
            parser has to seek within the file.

    Returns:
        The extracted text of all pages, concatenated.
    """
    # Local imports: the file's import section provides neither name.
    from io import BytesIO
    from pdfminer.pdfinterp import PDFPageInterpreter

    seekable = getattr(pdfFile, "seekable", None)
    if seekable is None or not seekable():
        pdfFile = BytesIO(pdfFile.read())

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # get_pages() takes the file object first and returns a lazy
        # generator; text only reaches `retstr` when each page is run
        # through the interpreter.
        for page in PDFPage.get_pages(pdfFile):
            interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

# Download the PDF and print its text; the `with` block closes the HTTP
# response once it has been read instead of leaking the connection.
with request.urlopen("https://www.jstage.jst.go.jp/article/cancer/9/0/9_KJ00003588219/_pdf/-char/en") as pdfFile:
    outputString = readPDF(pdfFile)
print(outputString)
Srinath Neela
  • 374
  • 1
  • 15

2 Answers2

2

The following code works in Python 3.7.4:

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
import io
import urllib.request
import requests


def pdf_to_text(pdf_file, pagenos=(0, 1, 2)):
    """Extract text from selected pages of a PDF.

    Args:
        pdf_file: Seekable binary file-like object containing the PDF
            (for example an ``io.BytesIO`` holding the downloaded bytes).
        pagenos: Zero-based indexes of the pages to extract. Defaults to
            the first three pages; pass ``None`` to extract every page.

    Returns:
        The extracted text of the selected pages as a single string.
    """
    text_memory_file = io.StringIO()
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, text_memory_file, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(pdf_file, pagenos=pagenos):
            interpreter.process_page(page)
        return text_memory_file.getvalue()
    finally:
        # Release the converter and the buffer even if parsing fails.
        device.close()
        text_memory_file.close()

# # online pdf to text by urllib
# online_pdf_file=urllib.request.urlopen('http://www.dabeaz.com/python/UnderstandingGIL.pdf')
# pdf_memory_file=io.BytesIO()
# pdf_memory_file.write(online_pdf_file.read())
# print(pdf_to_text(pdf_memory_file))


# online pdf to text by requests
response = requests.get('http://www.dabeaz.com/python/UnderstandingGIL.pdf')
# Fail loudly on an HTTP error instead of handing an error page to the PDF parser.
response.raise_for_status()
pdf_memory_file = io.BytesIO(response.content)
print(pdf_to_text(pdf_memory_file))

# extract metadata
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

# Parse the document structure from the in-memory PDF.
parser = PDFParser(pdf_memory_file)
doc = PDFDocument(parser)
# doc.info is a list of Info dictionaries and is empty when the PDF has none,
# so fall back to an empty mapping instead of raising IndexError.
metadata = doc.info[0] if doc.info else {}
for k, value in metadata.items():
    # resolve1 dereferences indirect PDF objects so the actual value is printed.
    print(k, resolve1(value))
Alpha
  • 668
  • 6
  • 11
  • This works perfectly fine for my purpose. However, I have an additional requirement: extracting some metadata from the online PDF, such as the title, the number of lines, the author's name, etc. I have found code that does this for PDFs saved on the hard drive, but it does not work with the online PDF. Could you please help? – Bitopan Gogoi Feb 28 '23 at 10:19
  • 1
    @BitopanGogoi I have added related code in the edited code, have fun – Alpha Feb 28 '23 at 14:24
0

I suggest using the pdftotext library to extract the text.

import pdftotext
# pdftotext.PDF reads the file when it is constructed, so the handle can be
# closed as soon as the object exists.
with open(document_name, 'rb') as fh:
    pdf = pdftotext.PDF(fh)
# Iterating the PDF object yields one string per page; join them in a single
# pass rather than growing a string with repeated +=.
text = "".join(pdf)
print(text)
Vaibhav Mishra
  • 227
  • 2
  • 11
  • pdftotext is not installing in windows, i have tried it – Srinath Neela Aug 21 '19 at 12:21
  • Please follow this link; it may be helpful for you: https://stackoverflow.com/questions/52336495/cannot-install-pdftotext-on-windows-because-of-poppler – Vaibhav Mishra Aug 21 '19 at 12:25
  • path = 'localpath\\pdftotext.exe' import subprocess subprocess.call([path]) fh = open("https://www.jstage.jst.go.jp/article/cancer/9/0/9_KJ00003588219/_pdf/-char/en", 'rb') pdf = subprocess.PDF(fh) text = "" for page in pdf: text += page print(text) – Srinath Neela Aug 21 '19 at 12:47
  • once check the above code it is giving options when execute, can you help me on this – Srinath Neela Aug 21 '19 at 12:49
  • Hi, sorry for the late reply. Please check whether you are actually getting the PDF file in the fh variable. – Vaibhav Mishra Aug 22 '19 at 08:36