I'm a complete beginner in Python. I'm trying to extract the text from multiple PDF files (stored in different subfolders of one big folder) and paste that text into an Excel file with:
- A1: Name of the file
- A2: text of the file contained in ONE cell
I've tried a solution like this one:
import pdfplumber
import pandas as pd
import os
def extract_pdf(pdf_path):
    """Return the text of the PDF at *pdf_path* as a list of lines.

    Every page is read in order and its extracted text is split on
    newline characters; the lines of all pages are collected into one
    flat list.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A list of strings, one per extracted line. The list is empty
        when no page yields any text.
    """
    linesOfFile = []
    with pdfplumber.open(pdf_path) as pdf:
        for pdf_page in pdf.pages:
            single_page_text = pdf_page.extract_text()
            # Pages without a text layer (e.g. scanned images) may give
            # None or an empty string; skip them instead of crashing
            # on .split().
            if not single_page_text:
                continue
            # Use extend() on the accumulator: the loop variable must not
            # reuse the list's name, otherwise the list is overwritten.
            linesOfFile.extend(single_page_text.split('\n'))
    return linesOfFile
# Root folder to scan. os.walk() below also visits every sub-folder,
# which os.listdir() alone would not do.
folder_with_pdfs = 'folder_path'

# One [file name, full text] row per PDF that is found.
linesOfFiles = []
for current_dir, _sub_dirs, file_names in os.walk(folder_with_pdfs):
    # Sort so the row order does not depend on the file system.
    for pdf_file in sorted(file_names):
        # Case-insensitive so that 'REPORT.PDF' is picked up as well.
        if not pdf_file.lower().endswith('.pdf'):
            continue
        pdf_file_path = os.path.join(current_dir, pdf_file)
        linesOfFile = extract_pdf(pdf_file_path)
        # Join the lines back together so the whole document ends up in
        # ONE cell instead of one column per line.
        linesOfFiles.append([pdf_file, '\n'.join(linesOfFile)])

# Column 1 holds the file name, column 2 the complete text of that file.
df = pd.DataFrame(linesOfFiles, columns=['file_name', 'text'])
# index=False keeps the pandas row numbers out of the first column.
# The CSV can be opened in Excel; for a native workbook use
# df.to_excel('test.xlsx', index=False), which needs an Excel writer
# engine such as openpyxl to be installed.
# NOTE: Excel caps a single cell at 32,767 characters, so very long
# documents may appear truncated when the file is opened there.
df.to_csv('test.csv', index=False)
Any help is appreciated