1

I am working on a problem that involves converting PDF files to text. After the conversion, some spaces are missing from the content that gets saved to the text file. For example, the output contains:

AMITY UNIVERSITY420 Udyog Vihar, Phase­IV,Gurugram 122016, Haryana, IndiaTel: + 91 124 391

I expect a space between "India" and "Tel" in the output above. Here is my code:

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
import pdfminer
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
import os
import tempfile, subprocess


#converting pdf to text


# NOTE(review): this value is reassigned further down, before convertMultiple
# is called, so this first assignment is never actually used.
pdfDir = "C:\\user\\IRS\\Documents\\6to11thfeb\\spi\\"

def convert(fname, pages = None):
    """Extract the text content of a PDF file using pdfminer.

    Args:
        fname: Path of the PDF file to read (opened in binary mode).
        pages: Optional iterable of page numbers to restrict extraction to.
            When it is None or empty, an empty set is handed to
            ``PDFPage.get_pages`` (presumably meaning "all pages" -- confirm
            against the pdfminer documentation).

    Returns:
        The extracted text as one string.
    """
    pagenums = set(pages) if pages else set()

    # The buffer, the converter and the input file are all released even if
    # a page fails to parse (the original never called output.close()).
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the text before the buffer is closed by the `with` exit.
        return output.getvalue()

def convertMultiple(pdfDir, txtDir):
    """Convert every PDF file in ``pdfDir`` into a text file in ``txtDir``.

    Each ``<name>.pdf`` (extension matched case-insensitively) is passed to
    ``convert`` and the result is written, followed by a tab character, to
    ``<name>.txt`` encoded as UTF-8.

    Args:
        pdfDir: Directory to scan for PDF files; a trailing path separator
            is optional.
        txtDir: Directory that receives the text files; a trailing path
            separator is optional.
    """
    for pdf in os.listdir(pdfDir):
        # splitext keeps inner dots ("a.b.pdf" -> "a.b"), so differently
        # named PDFs no longer collide on the same output file, and a file
        # literally called "pdf" is no longer mistaken for a PDF.
        baseName, fileExtension = os.path.splitext(pdf)
        if fileExtension.lower() != ".pdf":
            continue

        text = convert(os.path.join(pdfDir, pdf))
        textFilename = os.path.join(txtDir, baseName + ".txt")
        # The context manager flushes and closes the file; the original left
        # it open, so content could be lost or the handle leaked.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text + '\t')

# Input folder holding the PDFs and output folder for the text files; both
# point at the same directory here, and each path ends with a separator.
pdfDir = "C:\\Users\\IRIJFE\\Documents\\6to11thfeb\\spi\\"
txtDir = "C:\\Users\\IRIJFE\\Documents\\6to11thfeb\\spi\\"

# Runs the batch conversion at module level, i.e. whenever this file is
# executed or imported.
convertMultiple(pdfDir, txtDir)   
iLuvLogix
  • 5,920
  • 3
  • 26
  • 43

0 Answers0