0

I am trying to write a script that splits pages out of a PDF for given page numbers/labels. The script produces the split PDFs correctly, but some information is lost and needs to be corrected:

  1. Bookmarks are lost in the split PDF if the original PDF contains bookmarks.
  2. If the PDF has page labels that mix Roman and Arabic page numbers (e.g. the prelims are numbered i, ii, iii, iv, ... and the main matter then restarts at 1, 2, 3, ...), passing Arabic numbers for the split selects the prelim (Roman-numbered) pages instead: with start = 5, end = 10 it splits pages v to x rather than pages 5 to 10.

How can I correct these issues in the script below?

MWE

import re
import regex
import sys
import os
from iPython.ErrorLog import *
from iPython.LaTeX_QC_validation import *

#from pdfrw import PdfReader, PdfWriter
from PyPDF4 import PdfFileWriter,PdfFileReader
from pdfrw import PdfReader, PdfWriter
from pagelabels import PageLabels, PageLabelScheme

# Load the raw page-range description. Each range is expected to be given as
# a StartPage line followed by a matching EndPage line.
with open('pageinfo.txt', 'r') as pg_info:
    pgcnt = pg_info.read()

print(pgcnt)

# Fold every StartPage/EndPage pair into a single self-closing tag.
# The flags must be passed by keyword: the fourth positional argument of
# re.sub() is `count`, so passing flags there silently limits the number of
# substitutions and leaves the flags unapplied.
# The captures use [^">]+ so that a value can never run past its closing
# quote; \s* already spans the newline between the two tags.
pgcnt = re.sub(
    r'<Misc="([0-9]+)" StartPage="([^">]+)">\s*<Misc="(?:[0-9]+)" EndPage="([^">]+)"/>',
    r'<Misc="\1" StartPage="\2" EndPage="\3"/>',
    pgcnt,
    flags=re.I,
)

print(pgcnt)

# Collects the (start, end) page-number tuples appended by get_pageInfo().
pno = []

def value(rno):
    """Return the integer worth of a single Roman-numeral symbol.

    The lookup is case-insensitive. Anything that is not one of the seven
    symbols I, V, X, L, C, D, M yields -1.
    """
    symbol_worth = {
        'I': 1,
        'V': 5,
        'X': 10,
        'L': 50,
        'C': 100,
        'D': 500,
        'M': 1000,
    }
    return symbol_worth.get(rno.upper(), -1)

def romanToDecimal(str):
    """Convert a Roman-numeral string to its integer value.

    The string is scanned left to right. A symbol that is smaller than the
    one following it forms a subtractive pair (e.g. "IV"), which is consumed
    as a unit; every other symbol is simply added. Symbols are scored with
    value(), so characters it does not recognise are not rejected here.
    An empty string gives 0.
    """
    total = 0
    pos = 0
    length = len(str)

    while pos < length:
        current = value(str[pos])

        # Last symbol: nothing to compare against, just add it.
        if pos + 1 >= length:
            total += current
            pos += 1
            continue

        following = value(str[pos + 1])
        if current < following:
            # Subtractive pair: the smaller symbol precedes the larger one.
            total += following - current
            pos += 2
        else:
            # Current symbol is at least as large as the next one.
            total += current
            pos += 1

    return total


def get_pageInfo(pginfo):
    """Extract (start, end) page numbers from normalised page-info text.

    Every tag of the form
    ``<Misc="N" StartPage="..." EndPage="..."/>`` found in *pginfo*
    contributes one tuple. Values for which ``str.isnumeric()`` is true are
    converted with ``int``; all other values are passed to
    ``romanToDecimal``.

    The tuples are appended to the module-level list ``pno``, and that same
    list is returned, so repeated calls accumulate results.
    """
    global pno
    # [^">]+ keeps each capture inside its own quotes. A greedy ".*" would
    # swallow everything up to the last matching tag on the line whenever
    # several entries share one line.
    entry_pattern = r'<Misc="([0-9]+)" StartPage="([^">]+)" EndPage="([^">]+)"/>'
    for m in re.finditer(entry_pattern, pginfo, re.I):
        start_label = m.group(2)
        end_label = m.group(3)

        if start_label.isnumeric():
            Start_page = int(start_label)
        else:
            Start_page = romanToDecimal(start_label)

        if end_label.isnumeric():
            End_page = int(end_label)
        else:
            End_page = romanToDecimal(end_label)

        # Show each label next to the number it was converted to.
        print(start_label, Start_page, end_label, End_page)
        pno.append((Start_page, End_page))
    return pno
    
# Parse the normalised page-info text into a list of (start, end) tuples.
pgdetails = get_pageInfo(pgcnt)    
print(pgdetails)

def pdf_splitter(file, start, end, fcount, label_style="roman lowercase"):
    """Copy pages ``start``..``end`` of *file* into ``<fcount>.pdf``.

    Parameters:
        file: path of the source PDF.
        start: 1-based position of the first page to copy.
        end: 1-based position of the last page to copy (inclusive).
        fcount: value used to build the output file name, ``str(fcount) + ".pdf"``.
        label_style: page-label style applied to the output; the default
            keeps the previous hard-coded behaviour. See
            ``PageLabelScheme.styles()`` for the accepted values.

    After the pages are written, a page-label scheme is added to the output
    so that its first page is numbered ``start``.

    Limitations:
        * ``start`` and ``end`` are used as physical page positions in the
          source file, not as page labels.
        * Only pages are copied; the source document's bookmarks/outline are
          not transferred by this function.

    Errors raised while copying pages or writing labels are printed rather
    than propagated. Failure to open *file* is not caught.
    """
    # The output is named after the running counter, e.g. "1.pdf".
    new_file_name = str(fcount) + ".pdf"

    # Keep the source open until the output has been written, and make sure
    # the handle is released afterwards.
    with open(file, "rb") as source:
        read_file = PdfFileReader(source)  # read pdf
        new_pdf = PdfFileWriter()  # create write object

        try:
            # Convert the 1-based inclusive range to 0-based page indices.
            for page_index in range(start - 1, end):
                new_pdf.addPage(read_file.getPage(page_index))

            # Serialise the document once, after all pages were collected.
            # Writing inside the loop would dump the whole document into the
            # same file again after every page.
            with open(new_file_name, "wb") as f:
                new_pdf.write(f)
            print("PDF splitted Successfully")

            reader = PdfReader(new_file_name)
            labels = PageLabels.from_pdf(reader)
            newlabel = PageLabelScheme(
                startpage=0,  # index of the page where the labels start
                style=label_style,  # see options in PageLabelScheme.styles()
                prefix="",
                firstpagenum=start,  # number given to the first page
            )
            labels.append(newlabel)  # add our labels to the existing ones
            labels.write(reader)
            writer = PdfWriter()
            writer.trailer = reader
            writer.write(new_file_name)
        except Exception as e:
            # Best effort: report the problem and move on to the next range.
            print(e)

# Produce one output PDF per parsed page range. Outputs are numbered from 1
# in the order in which the ranges were parsed.
x = 0
for x, i in enumerate(pgdetails, start=1):
    Start, End = i
    pdf_splitter('input.pdf', Start, End, x)

# Stop the script once every range has been processed.
sys.exit()

  

The page information file (pageinfo.txt) contains the following:

<Misc="1" StartPage="i">
<Misc="1" EndPage="ii"/>
<Misc="2" StartPage="ii">
<Misc="2" EndPage="ii"/>
<Misc="3" StartPage="iv">
<Misc="3" EndPage="iv"/>
<Misc="4" StartPage="v">
<Misc="4" EndPage="vi"/>
<Misc="5" StartPage="vii">
<Misc="5" EndPage="xiv"/>
<Misc="6" StartPage="xv">
<Misc="6" EndPage="xv"/>
<Misc="7" StartPage="xvi">
<Misc="7" EndPage="xviii"/>
<Misc="8" StartPage="xix">
<Misc="8" EndPage="xx"/>

Thanks in Advance

Martin Thoma
  • 124,992
  • 159
  • 614
  • 958
TeX_learner
  • 123
  • 6
  • 1
    Use `pypdf` instead of PyPDF2/PyPDF3/PyPDF4. I am the maintainer of pypdf and PyPDF2. We improved pypdf a lot in 2022. – Martin Thoma Dec 26 '22 at 08:58

0 Answers0