0

I am trying to extract the table data in a comma-delimited format from this PDF file. It is too large to work with in Adobe Pro, so I have looked at using Python. The PDF file is located at https://www.live-military-mode-s.eu/pdf/Military%20Mode-S%20codes.pdf. The code extracts the data, but it ends up with about 360,000 lines of data. I was hoping to be able to import only the table data, with no header or bottom-of-page info data.

from tika import parser
import os
import glob
from easygui import *
from time import sleep
import random
import string
from PIL import Image

# Most recently chosen input path; nothing has been selected at start-up.
path = None

# Scale factor and height that would resize 'dlogo.jpg' to a width of
# `basewidth` pixels while keeping its aspect ratio.
basewidth = 150
img = Image.open('dlogo.jpg')
wpercent = basewidth / float(img.size[0])
hsize = int(float(img.size[1]) * wpercent)




def converter(filename,savelocation):
    """Extract the text of ``filename + '.pdf'`` with Tika and save it as .txt.

    The extracted text is written (UTF-8) into ``savelocation`` under the
    PDF's base name with a ``.txt`` extension, and blank lines are then
    stripped from that file via ``remove_empty_lines``.

    Args:
        filename: Path of the source PDF *without* its ``.pdf`` extension.
        savelocation: Directory in which the text file is created.

    Returns:
        ``filename + '.txt'`` -- the source path with the extension swapped.
        Note this is not the path of the file written inside
        ``savelocation``; it is kept as-is for existing callers.
    """
    parsed = parser.from_file(filename + '.pdf')
    # Tika is reported to return None as the content of PDFs without
    # extractable text; fall back to an empty string so the write below
    # does not raise TypeError.
    text = parsed.get("content") or ""

    new_name = filename + '.txt'
    # Build the output path with os.path so it works with either path
    # separator instead of assuming Windows backslashes.
    fname = os.path.join(savelocation, os.path.basename(new_name))
    with open(fname, 'w', encoding='utf-8', errors='ignore') as f:
        f.write(text)

    remove_empty_lines(fname)
    return new_name

def remove_empty_lines(filename, encoding='utf-8'):
    """Rewrite ``filename`` in place without empty or whitespace-only lines.

    Args:
        filename: Path of the text file to clean up.
        encoding: Encoding used for both reading and writing. Defaults to
            UTF-8 so that files written as UTF-8 are not re-decoded with
            the platform's locale encoding, which would drop or mangle
            non-ASCII characters.

    Returns:
        None. If ``filename`` is not an existing file a message is printed
        and nothing is modified.
    """
    if not os.path.isfile(filename):
        print("{} does not exist ".format(filename))
        return

    # Read everything first: the file is truncated as soon as it is
    # reopened for writing below.
    with open(filename, encoding=encoding, errors='ignore') as filehandle:
        kept = [line for line in filehandle if line.strip()]

    with open(filename, 'w', encoding=encoding, errors='ignore') as filehandle:
        filehandle.writelines(kept)




# Main GUI loop: keep prompting until the user exits. Each pass converts
# either one chosen PDF or every PDF in a chosen folder to text.
while True:

    msg = "Please Choose a File or Folder"
    title = "PDF Converter"

    choices = ["Exit", "Choose File", "Choose Folder"]

    reply = buttonbox(msg, title=title, choices=choices)
    # Strings are compared with ==; `is` tests object identity and only
    # appears to work when the interpreter happens to share the literal.
    # A reply of None (dialog dismissed without a choice) is treated as Exit.
    if reply is None or reply == 'Exit':
        break

    elif reply == 'Choose File':
        path = fileopenbox()
        if path is None:
            # File dialog cancelled: go back to the main menu.
            continue
        savelocation = buttonbox("Choose a Save location", title="Saving", choices=["Save Location", "Cancel"])
        if savelocation != 'Save Location':
            continue
        savepath = diropenbox()
        if savepath is None:
            continue
        print(savepath)
        # converter() expects the path without its extension and appends
        # '.pdf' itself.
        filename, file_extension = os.path.splitext(path)
        name = converter(filename, savepath)
        print(name)

        msgbox("File Successfully Converted to Text!!")

    elif reply == 'Choose Folder':
        path = diropenbox()
        if path is None:
            continue
        savelocation = buttonbox("Choose a Save location", title="Save Location", choices=["Save Location", "Cancel"])
        if savelocation != 'Save Location':
            continue
        savepath = diropenbox()
        if savepath is None:
            continue

        # Escape the folder name so characters such as '[' in it are not
        # interpreted as glob wildcards; join with os.path instead of a
        # hard-coded backslash.
        pattern = os.path.join(glob.escape(path), '*.pdf')

        # NOTE: this renames every PDF in the chosen folder to
        # '<n>_<5 random chars>.pdf' before converting. The listing is
        # materialised first so the directory is not scanned while it is
        # being modified.
        for n, i in enumerate(glob.glob(pattern), start=1):
            randomname = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(5))
            os.rename(i, os.path.join(path, str(n) + '_' + str(randomname) + '.pdf'))

        for f in glob.glob(pattern):
            filename, file_extension = os.path.splitext(f)
            name = converter(filename, savepath)

        msgbox("PDFS Successfully Converted to Text!!")
JDB
  • 19
  • 6
  • I see you read the PDF and save it as text, then open it again to remove blank lines. But how can you identify which lines are header or bottom-of-page info? – adrtam Apr 08 '19 at 20:42
  • Yes, this is my first attempt at using Python. I was looking for a solution, if possible, to import the PDF data in the same structure as the table data and save it as, maybe, a comma-delimited file. But I have been unable to find a solution. – JDB Apr 08 '19 at 20:44
  • Other, shorter solutions are to use pdfminer or Apache Tika. – Mihai8 Apr 08 '19 at 20:55
  • I am unable to install pdfminer; it comes up with an error. I am not sure how to use Apache Tika, as it appears to be Java? – JDB Apr 09 '19 at 10:02
  • Apache Tika gives the same results as my code. Is there any better solution that can extract the tables from the PDF in a more structured way and, if possible, comma-delimited? – JDB Apr 09 '19 at 16:40

0 Answers0