1

I want to convert more than one PDF file from a folder and put the results in another folder using pdfminer. I initially managed to convert a single file. However, when I try to convert more than one file, it writes the content of the first PDF file into the second converted txt file. I wrote the following code.

from subprocess import Popen, PIPE

#http://stackoverflow.com/questions/5725278/python-help-using-pdfminer-as-a-library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os

import os


def convert_pdf_to_txt(path):
    """Run every entry of directory *path* through pdfminer and save the text.

    For each entry the extracted text is written twice, as ``<name>.txt`` and
    ``<name>.html``, into a ``doc3`` directory that sits next to *path*
    (same parent directory).  The function has no ``return`` statement, so it
    returns None.

    NOTE(review): this is the version that shows the problem described in the
    question; the comments below mark the statements responsible.
    """

    listing=os.listdir(path)

    # Build the list of full paths for every directory entry.
    docid=[]
    for infile in listing:
        infile=os.path.join(path,infile)

        docid.append(infile)
        # zz is only bound inside this loop, so an empty directory leaves it
        # undefined and the ``for kk in zz`` loop below raises NameError.
        zz=docid
    # The resource manager, output buffer, converter and interpreter are
    # created once here, before the per-file loop, and are shared by every
    # file.  retstr is never reset between files.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)    
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = False
    pagenos=set()

    for kk in zz:
        ass=kk
        # file() is the Python 2 built-in for opening files.
        ap=file(ass,"rb")
        for page in PDFPage.get_pages(ap, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
            interpreter.process_page(page)
            # Reads the whole shared buffer, so from the second file onwards
            # this also contains the text produced for the earlier files.
            # The name ``str`` shadows the built-in inside this function, and
            # it is only (re)assigned when the file yields at least one page.
            str = retstr.getvalue()           
        # Derive "<parent of path>//doc3//<file name without extension>".
        filename, file_extension=os.path.splitext(kk)
        splitted,files=os.path.split(filename)
        splittedd,pathd=os.path.split(splitted)
        yy=splittedd+'//'+'doc3'+'//'+files+'.'+'txt'
        # Rebinding zz here does not disturb the running ``for kk in zz``
        # loop, which keeps iterating over the original list object.
        zz=splittedd+'//'+'doc3'+'//'+files+'.'+'html'
        txtfileo=open(yy,'w')
        txtfileo.write(str)
        txtfileo.close()
        # The .html file receives exactly the same plain text as the .txt.
        txtfileo1=open(zz,'w')
        txtfileo1.write(str)
        txtfileo1.close()
    # Cleanup runs once after the loop: only the last opened PDF is closed
    # explicitly; the handles of earlier files are never closed here.
    retstr.close()            
    ap.close()
    device.close()

# Python 2 print statement.  '\d' is not an escape sequence, so the literal
# keeps its backslash; the function above has no return statement, so this
# prints None after the conversion has run.
print convert_pdf_to_txt('amharicir\docname1')

If anyone can help me with this, I would be grateful.

GLR
  • 1,070
  • 1
  • 11
  • 29
Sertse
  • 11
  • 1
  • Very useful variable names `kk`, `yy`, `zz`, `ass`, `ap`. – Anmol Singh Jaggi Mar 25 '16 at 10:28
  • Unfortunately, I made some silly mistakes by putting some statements outside the loop. I have now moved them inside it, and the code works perfectly for writing multiple PDF files from a folder into text or HTML files using pdfminer – Sertse Mar 28 '16 at 05:51

1 Answers1

0
from subprocess import Popen, PIPE
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os

import os


def _extract_pdf_text(pdf_path):
    """Return the text pdfminer extracts from the single PDF at *pdf_path*."""
    # A fresh resource manager, buffer and converter per document guarantees
    # that no text from a previously converted file leaks into this one.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() instead of the Python-2-only file(); the with-block closes
        # the handle even when pdfminer raises half-way through a document.
        with open(pdf_path, 'rb') as pdf_file:
            pages = PDFPage.get_pages(pdf_file, set(), maxpages=0,
                                      password="", caching=True,
                                      check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        # Release the converter and the buffer for every file, not just
        # once after the last one.
        device.close()
        retstr.close()


def convert_pdf_to_txt(path):
    """Convert every file in directory *path* to text using pdfminer.

    For each regular file in *path* the extracted text is written twice, as
    ``<name>.txt`` and ``<name>.html`` (both receive the same plain text),
    into a ``doc3`` directory located next to *path*, i.e. in the parent
    directory of *path*.  ``doc3`` is created when it does not exist yet.
    Sub-directories inside *path* are skipped.

    Args:
        path: Directory containing the PDF files to convert.

    Returns:
        The text extracted from the last file that was converted, or an
        empty string when *path* contains no files.

    Raises:
        OSError: If *path* cannot be listed.
    """
    text = ''
    for entry in os.listdir(path):
        source = os.path.join(path, entry)
        if not os.path.isfile(source):
            # Only regular files can be opened and parsed.
            continue

        text = _extract_pdf_text(source)

        # Output lives in "<parent of the input directory>/doc3".  Joining
        # with os.path.join keeps the path relative when the input directory
        # has no parent component, instead of producing "//doc3//...".
        parent_dir = os.path.dirname(os.path.dirname(source))
        out_dir = os.path.join(parent_dir, 'doc3')
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        stem = os.path.splitext(entry)[0]
        for extension in ('.txt', '.html'):
            with open(os.path.join(out_dir, stem + extension), 'w') as out_file:
                out_file.write(text)
    return text

print convert_pdf_to_txt('amharicir\\docname1')
Sertse
  • 11
  • 1