I am trying to write a script that splits a PDF into separate files for given page numbers/labels. The script produces the split PDFs correctly, but some information is lost and needs to be corrected:
- Bookmarks are lost in the split PDFs when the original PDF contains bookmarks.
- If the PDF has page labels that mix Roman and Arabic numbering (for example, the prelims are labelled i, ii, iii, iv, ... and the main matter then restarts at 1, 2, 3, ...), passing Arabic page numbers still splits the prelims. For example, with start = 5 and end = 10 the script extracts the pages labelled v to x instead of the pages labelled 5 to 10.
How can I correct these issues in the script below?
MWE
import re
import regex
import sys
import os
from iPython.ErrorLog import *
from iPython.LaTeX_QC_validation import *
#from pdfrw import PdfReader, PdfWriter
from PyPDF4 import PdfFileWriter,PdfFileReader
from pdfrw import PdfReader, PdfWriter
from pagelabels import PageLabels, PageLabelScheme
# Load the raw page-range description. The context manager closes the file
# even if reading fails.
with open('pageinfo.txt', 'r') as pg_info:
    pgcnt = pg_info.read()
print(pgcnt)

# Fold each two-line pair
#     <Misc="n" StartPage="x">
#     <Misc="n" EndPage="y"/>
# into a single <Misc="n" StartPage="x" EndPage="y"/> element.
#
# The flags are passed by keyword on purpose: the fourth positional argument
# of re.sub() is `count`, so passing `re.I | re.S | re.M` (== 26) there
# applies no flags at all and stops after 26 replacements.
# re.S is deliberately not used: it would let the greedy `.*` run across
# line breaks and swallow several entries in one match.
pgcnt = re.sub(
    r'<Misc="([0-9]+)" StartPage="([^">].*)">\s*<Misc="(?:[0-9]+)" EndPage="([^">].*)"/>',
    r'<Misc="\1" StartPage="\2" EndPage="\3"/>',
    pgcnt,
    flags=re.I,
)
print(pgcnt)

# Module-level accumulator of (start, end) page-number tuples.
pno = []
def value(rno):
    """Return the integer worth of a single Roman numeral symbol.

    The lookup is case-insensitive. Anything other than one of the seven
    symbols I, V, X, L, C, D, M yields -1.
    """
    symbol_values = {
        'I': 1,
        'V': 5,
        'X': 10,
        'L': 50,
        'C': 100,
        'D': 500,
        'M': 1000,
    }
    return symbol_values.get(rno.upper(), -1)
def romanToDecimal(str):
    """Convert a Roman numeral string to its integer value.

    The string is scanned left to right. When a symbol is worth less than
    the one that follows it, the two are consumed together as a subtractive
    pair (e.g. "IV" -> 5 - 1). Otherwise the symbol is added on its own.

    No validation is performed: whatever value() returns for a symbol,
    including its fallback for unrecognised characters, is folded into the
    result.
    """
    total = 0
    pos = 0
    length = len(str)
    while pos < length:
        current = value(str[pos])
        # Look one symbol ahead only when there is one.
        following = value(str[pos + 1]) if pos + 1 < length else None
        if following is not None and current < following:
            # Subtractive pair: the smaller symbol precedes the larger one.
            total += following - current
            pos += 2
        else:
            # Last symbol, or a symbol at least as large as its successor.
            total += current
            pos += 1
    return total
def get_pageInfo(pginfo):
    """Collect the (start, end) page ranges described in *pginfo*.

    Every ``<Misc="n" StartPage="a" EndPage="b"/>`` element found in the
    text contributes one tuple of two integers. Labels made up of numeric
    characters are parsed with int(); all other labels are converted with
    romanToDecimal(). Only the resulting integers are kept, so a Roman
    label such as "v" and an Arabic label "5" both end up as 5.

    The tuples are appended to the module-level list ``pno``, which is also
    the return value. Each entry is printed as it is parsed.
    """
    global pno

    def to_number(label):
        # Numeric labels are parsed directly; everything else is treated
        # as a Roman numeral.
        if label.isnumeric():
            return int(label)
        return romanToDecimal(label)

    entry_pattern = r'<Misc="([0-9]+)" StartPage="([^">].*)" EndPage="([^">].*)"/>'
    for entry in re.finditer(entry_pattern, pginfo, re.I):
        first_label, last_label = entry.group(2), entry.group(3)
        first_page = to_number(first_label)
        last_page = to_number(last_label)
        print(first_label, first_page, last_label, last_page)
        pno.append((first_page, last_page))
    return pno
# Parse the merged page-info text into a list of (start, end) integer tuples
# and echo the result for inspection.
pgdetails = get_pageInfo(pgcnt)
print(pgdetails)
def pdf_splitter(file, start, end, fcount, style="roman lowercase"):
    """Copy a run of pages from *file* into ``<fcount>.pdf`` and label them.

    Args:
        file: Path of the source PDF.
        start: 1-based position of the first page to copy.
        end: 1-based position of the last page to copy (inclusive).
        fcount: Used to build the output name, ``str(fcount) + ".pdf"``.
        style: Page-label style applied to the output pages. Defaults to
            "roman lowercase"; see ``PageLabelScheme.styles()`` for the
            accepted names.

    *start* and *end* are positions in the page sequence (they are handed to
    ``getPage``), not page labels. A document whose prelims are labelled
    i, ii, iii, ... and whose main matter restarts at 1 therefore needs its
    labels resolved to positions before calling this function.

    Only page objects are added to the output, so the source document's
    bookmarks (outline) are not carried over.

    Errors raised while copying, writing or labelling are printed rather
    than propagated, so a caller looping over several ranges keeps going.
    Failure to open or parse *file* itself is not caught.
    """
    fix_start = start
    new_file_name = str(fcount) + ".pdf"

    # The source stays open for the whole copy because the reader is still
    # consulted while pages are fetched and written. The context manager
    # guarantees the handle is released afterwards.
    with open(file, "rb") as source:
        read_file = PdfFileReader(source)  # read pdf
        new_pdf = PdfFileWriter()  # create write object
        try:
            # Gather every page first so that a bad range fails before an
            # output file is created. getPage() is 0-based, hence start - 1.
            for page_index in range(start - 1, end):
                new_pdf.addPage(read_file.getPage(page_index))
            with open(new_file_name, "wb") as f:
                new_pdf.write(f)
            print("PDF splitted Successfully")

            # Re-open the result with pdfrw to attach page labels, so the
            # first output page is numbered `fix_start` in the chosen style.
            reader = PdfReader(new_file_name)
            labels = PageLabels.from_pdf(reader)
            newlabel = PageLabelScheme(
                startpage=0,  # index of the page where this labelling starts
                style=style,
                prefix="",
                firstpagenum=fix_start,  # number given to that first page
            )
            labels.append(newlabel)  # add to whatever labels already exist
            labels.write(reader)
            writer = PdfWriter()
            writer.trailer = reader
            writer.write(new_file_name)
        except Exception as e:
            print(e)
# Split the input once per collected range. The running counter (starting
# at 1) is passed as the output file number.
for x, (Start, End) in enumerate(pgdetails, start=1):
    pdf_splitter('input.pdf', Start, End, x)
sys.exit()
The page information file (pageinfo.txt) contains the following:
<Misc="1" StartPage="i">
<Misc="1" EndPage="ii"/>
<Misc="2" StartPage="ii">
<Misc="2" EndPage="ii"/>
<Misc="3" StartPage="iv">
<Misc="3" EndPage="iv"/>
<Misc="4" StartPage="v">
<Misc="4" EndPage="vi"/>
<Misc="5" StartPage="vii">
<Misc="5" EndPage="xiv"/>
<Misc="6" StartPage="xv">
<Misc="6" EndPage="xv"/>
<Misc="7" StartPage="xvi">
<Misc="7" EndPage="xviii"/>
<Misc="8" StartPage="xix">
<Misc="8" EndPage="xx"/>
Thanks in advance.