If your files are named like file1.pdf, file2.pdf, and so on, then you can use a for loop:
import PyPDF2
import re
# Search file1.pdf ... file99.pdf, page by page, for a fixed phrase and print
# the result of every per-page search.

# Phrase to look for. It contains no regex metacharacters, so it can be handed
# to re.search as-is. Defined once, outside the loops, because it never changes.
String = "New York State Real Property Law"

for k in range(1, 100):
    pdf_path = "C:/my_path/file%s.pdf" % k
    try:
        pdf_file = open(pdf_path, "rb")
    except FileNotFoundError:
        # The numbering may stop before 99 or have gaps; skip the missing
        # file instead of aborting the whole run.
        continue
    # Keep the file open while pages are being read from it.
    with pdf_file:
        # NOTE(review): PdfFileReader / getNumPages / getPage / extractText
        # are the names this snippet was written against -- confirm that the
        # installed PyPDF2 version still provides them.
        reader = PyPDF2.PdfFileReader(pdf_file)
        for i in range(reader.getNumPages()):
            page = reader.getPage(i)
            print("this is page " + str(i))
            Text = page.extractText()
            # Prints the match object, or None when the phrase is not on
            # this page.
            print(re.search(String, Text))
Otherwise, you can walk through your folder using the os module:
import PyPDF2
import re
import os
# Walk C:/my_path recursively and search every PDF, page by page, for a fixed
# phrase, printing the result of every per-page search.

# Phrase to look for. It contains no regex metacharacters, so it can be handed
# to re.search as-is. Defined once, outside the loops, because it never changes.
String = "New York State Real Property Law"

for foldername, subfolders, files in os.walk(r"C:/my_path"):
    for filename in files:
        # os.walk yields every file in the tree; only hand PDFs to the
        # reader so an unrelated file does not abort the run.
        if not filename.lower().endswith(".pdf"):
            continue
        # NOTE(review): PdfFileReader / getNumPages / getPage / extractText
        # are the names this snippet was written against -- confirm that the
        # installed PyPDF2 version still provides them.
        reader = PyPDF2.PdfFileReader(os.path.join(foldername, filename))
        for i in range(reader.getNumPages()):
            page = reader.getPage(i)
            print("this is page " + str(i))
            Text = page.extractText()
            # Prints the match object, or None when the phrase is not on
            # this page.
            print(re.search(String, Text))
Sorry if I have misunderstood your problem.
EDIT:
Unfortunately, I'm not familiar with the PyPDF2 module, but it seems that when you extract the contents of a PDF with this module, something odd happens to the text (such as additional newlines or formatting changes).
This page may help:
Extracting text from a PDF file using Python
However, if your file were a .txt file, then a regex would be helpful:
import re
import os
# Walk C:/Users/Mirana/Me2 recursively and print every passage that starts
# with "New York State Real Property Law" and ends with
# "common elements of the property.".

# Raw string so that "\." reaches the regex engine as an escaped literal dot
# rather than being an invalid string escape. re.DOTALL lets ".*?" span line
# breaks; the non-greedy "?" stops at the first closing phrase.
myRegex = re.compile(
    r"New York State Real Property Law.*?common elements of the property\.",
    re.DOTALL,
)

for foldername, subfolders, files in os.walk(r"C:/Users/Mirana/Me2"):
    for filename in files:
        # The context manager closes the file even if reading fails.
        with open(os.path.join(foldername, filename)) as text_file:
            Text = text_file.read()
        for subText in myRegex.findall(Text):
            print(subText)
I changed your PDF version too, but because of the problem mentioned above it doesn't work, at least for my PDFs (give it a try):
import PyPDF2
import re
import os
# Walk C:/my_path recursively and, for every page of every PDF, print each
# passage that starts with "New York State Real Property Law" and ends with
# "common elements of the property.".

# Raw string so that "\." reaches the regex engine as an escaped literal dot
# rather than being an invalid string escape. re.DOTALL lets ".*?" span line
# breaks; the non-greedy "?" stops at the first closing phrase.
myRegex = re.compile(
    r"New York State Real Property Law.*?common elements of the property\.",
    re.DOTALL,
)

for foldername, subfolders, files in os.walk(r"C:/my_path"):
    for filename in files:
        # os.walk yields every file in the tree; only hand PDFs to the
        # reader so an unrelated file does not abort the run.
        if not filename.lower().endswith(".pdf"):
            continue
        # NOTE(review): PdfFileReader / getNumPages / getPage / extractText
        # are the names this snippet was written against -- confirm that the
        # installed PyPDF2 version still provides them.
        reader = PyPDF2.PdfFileReader(os.path.join(foldername, filename))
        for i in range(reader.getNumPages()):
            page = reader.getPage(i)
            print("this is page " + str(i))
            Text = page.extractText()
            # The pattern is applied to one page at a time, so a passage that
            # continues across a page break is not matched.
            for subText in myRegex.findall(Text):
                print(subText)