I have a 100 page pdf document. Each two pages contain unique employee data. I need a python code to extract each of the two pages and save them as separate files with filenames as the text extracted from each first page. For example
- The 100 page pdf document will be saved at 50 separate files
- The first page of each file contains the text Dear Miles Wood, Dear Kate Aaron etc,
- The first extracted filename should be Miles_Wood.pdf and second Kate_Aaron.pdf and so on..
Will be most pleased with a python solution
Thanks in advance
I have tried to adapt a seemingly similar python solution by the following but it doesn't appear to work for me
from PyPDF2 import PdfReader, PdfWriter
import re
import argparse
import os
cwd = os.getcwd()
output_dir = os.path.join(cwd, 'output')
if not os.path.exists(output_dir):
os.makedirs(output_dir)
def get_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--pdf", dest="pdf", required=True)
parser.add_argument("-c", "--count", dest="count", type=int, required=True)
parser.add_argument("-r", "--regex", dest="regex", required=True)
return parser.parse_args()
def split_pdf(file, page_count, regex):
reader = PdfReader(file)
os.chdir(output_dir)
for i in range(0, reader.numPages, page_count):
writer = PdfWriter()
if reader.numPages > 1 :
for y in range(page_count):
writer.add_page(reader.pages[i])
writer.add_page(reader.pages[i + y])
else :
writer.add_page(reader.pages[i])
text = reader.pages[i].extract_text()
search = re.search(regex, text)
newname = search.group(1) + ".pdf"
outputStream = open(newname, "wb")
writer.write(outputStream)
if __name__ == "__main__" :
arguments = get_arguments()
split_pdf(arguments.pdf, arguments.count, arguments.regex)](https://stackoverflow.com)
Credit https://pastebin.com/mDRV77pp