I'm completely new to Python; could you help me correct this code?
I would like to add 2 things:
- perform the operation on multiple PDF files instead of just one, pasting each file's content into A2, A3, A4 and so on
- if possible, write the name of each PDF file in the adjacent column (B2, B3, B4).
Thank you in advance. This is the code I'm working with:
import PyPDF2
import openpyxl
# Copy the text of the first page of file.pdf into cell A1 of the active
# sheet of excel.xlsx, then save the workbook in place.

# Open the PDF in a context manager so the file handle is always closed,
# even if parsing the PDF raises.
with open("file.pdf", 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # Only page index 0 (the first page) is read by this script.
    pageObj = pdfReader.getPage(0)
    # Extract while the file is still open: the reader pulls data from it.
    mytext = pageObj.extractText()

# The workbook must already exist; load_workbook does not create it.
wb = openpyxl.load_workbook('excel.xlsx')
sheet = wb.active
sheet.title = 'MyPDF'
sheet['A1'] = mytext
wb.save('excel.xlsx')
print('DONE!!')
I've modified the code as suggested, and the loop now seems to read all the pages! But maybe I have to rework "sheet[f'A{row}'].value = '\n'.join(output)", because it seems to print a lot of extra spaces.
import PyPDF2
import openpyxl
import os
import glob
# Find every PDF below root_dir and write one spreadsheet row per PDF:
# column A receives the text of all its pages, column B receives its path.

root_dir = "your directory"
# First spreadsheet row to fill; set this to 2 to leave row 1 for headers.
first_row = 1

# os.path.join inserts the separator, so root_dir needs no trailing slash.
# With recursive=True the '**' component matches any depth of subfolders.
pattern = os.path.join(root_dir, '**', '*')

# glob already yields paths that include root_dir, so they can be opened
# as-is. The extension test is case-insensitive, and isfile() skips any
# directory whose name happens to end in ".pdf". Sorting keeps the row
# order stable between runs.
filenames = sorted(
    path
    for path in glob.iglob(pattern, recursive=True)
    if path.lower().endswith('.pdf') and os.path.isfile(path)
)

wb = openpyxl.load_workbook('excel.xlsx')  # your file excel
sheet = wb.active
sheet.title = 'MyPDF'

for row, filename in enumerate(filenames, start=first_row):
    # One list entry per page. Joining a list of page strings puts a
    # newline between pages; joining a single string would instead put a
    # newline between every character.
    output = []
    with open(filename, 'rb') as f:
        pdfReader = PyPDF2.PdfFileReader(f)
        for i in range(pdfReader.numPages):
            page_text = pdfReader.getPage(i).extractText()
            print(page_text)
            output.append(page_text)
    sheet[f'A{row}'].value = '\n'.join(output)
    # Full path of the PDF; use os.path.basename(filename) for the bare name.
    sheet[f'B{row}'].value = filename

# Save once, after all rows are filled.
wb.save('excel.xlsx')  # your file excel
print('DONE!!')