I understand you're looking for an open-source solution, but I thought I would share this approach, which uses the Adobe API; you can use the free trial from here: Adobe API Developer.
Use the Python function below. Make sure you acquire 'private.Key' from the Adobe API (https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/): after you click "Start trial", it is included in a zipped folder that is downloaded to your machine.
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
import logging
import os
import re
import zipfile
import json
import glob
import pandas as pd
def adobeAPI(base_path, file_path,
             credentials_path="/path/to/pdfservices-api-credentials.json"):
    """Extract a PDF with the Adobe PDF Extract API and flatten it to a text file.

    The PDF at ``file_path`` is sent to the Extract operation, the resulting
    ZIP is saved to ``<base_path>/output/<name>.zip`` and unpacked into
    ``<base_path>/output/<name>/``.  ``structuredData.json`` from the archive
    is then walked element by element: text elements are written verbatim,
    each top-level table element is replaced by the JSON dump of the next
    ``tables/*.xlsx`` workbook, and figures / table-internal elements are
    skipped.

    Args:
        base_path: Directory under which the ``output`` folder is created.
        file_path: Path of the source PDF (must end in ``.pdf``).
        credentials_path: Location of the Adobe service-account credentials
            JSON file.  Defaults to the placeholder path used originally.

    Returns:
        Path of the generated ``<name>.txt`` file.

    Raises:
        ValueError: If no PDF file name can be derived from ``file_path``.
        ServiceApiException, ServiceUsageException, SdkException: Re-raised
            after logging when the Adobe operation fails.
    """
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

    # Resolve the output locations before calling the service so that an
    # unusable path fails fast instead of after a billable API call.
    filename = get_filename(file_path)
    if filename is None:
        raise ValueError(
            "Cannot derive a PDF file name from path: %r" % (file_path,))
    output_dir = os.path.join(base_path, "output")
    # Path for the zipped result and for its extracted contents.
    outputzip = os.path.join(output_dir, filename + ".zip")
    outputzipextract = os.path.join(output_dir, filename)
    os.makedirs(output_dir, exist_ok=True)

    try:
        # Initial setup, create credentials instance.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file(credentials_path) \
            .build()
        # Create an ExecutionContext using credentials and a new operation.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()
        # Set operation input from the source file.
        source = FileRef.create_from_local_file(file_path)
        extract_pdf_operation.set_input(source)
        # Build ExtractPDF options and set them into the operation.
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract(
                [ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions(
                [ExtractRenditionsElementType.TABLES,
                 ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)
        # Execute the operation and save the result archive.
        result: FileRef = extract_pdf_operation.execute(execution_context)
        result.save_as(outputzip)
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")
        # Without a fresh archive the steps below would either crash on an
        # unrelated error or silently reuse a stale ZIP, so propagate.
        raise

    # Unpack the archive next to it.
    with zipfile.ZipFile(outputzip, 'r') as zip_ref:
        zip_ref.extractall(path=outputzipextract)

    with open(os.path.join(outputzipextract, "structuredData.json"),
              encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Tables are paired with workbooks by position, so the workbook order
    # must be deterministic; glob gives no ordering guarantee.  A natural
    # sort keeps "...2.xlsx" ahead of "...10.xlsx".
    xlsx_names = sorted(
        (os.path.basename(path) for path in glob.glob(
            os.path.join(outputzipextract, "tables", "*.xlsx"))),
        key=_natural_sort_key)
    remaining_xlsx = iter(xlsx_names)

    # Compile the path classifiers once instead of on every element.
    figure_re = re.compile(r"Figure$")
    table_all_re = re.compile(r"\/Table(?:\[\d+\])?$")
    table_part_re = re.compile(r"/Table(?:\[\d+\])?/")

    pieces = []
    # Visit every element (the earlier ``len(...) - 1`` bound skipped the
    # last one).
    for element in data['elements']:
        element_path = element['Path']
        if figure_re.search(element_path) or table_part_re.search(element_path):
            # Figures and the rows/cells inside a table are not emitted;
            # the table as a whole is handled below.
            continue
        if table_all_re.search(element_path):
            xlsx_file = next(remaining_xlsx, None)
            if xlsx_file is None:
                logging.warning(
                    "No .xlsx file left for table element %s; skipping",
                    element_path)
                continue
            pieces.append(
                json.dumps(get_dict_xlsx(outputzipextract, xlsx_file)))
        elif 'Text' in element:
            pieces.append(element['Text'])

    localfile = os.path.join(outputzipextract, filename + '.txt')
    with open(localfile, "w", encoding='utf-8') as file:
        # Every piece is preceded by a newline, matching the original layout.
        file.write("".join("\n" + piece for piece in pieces))
    return localfile


def _natural_sort_key(name):
    """Return a sort key that orders embedded digit runs numerically."""
    return [int(token) if token.isdigit() else token.lower()
            for token in re.split(r'(\d+)', name)]
############################ < Function to get filename out of path>##################
def get_filename(file_path):
    """Return the PDF file name in ``file_path`` without its extension.

    Both ``/`` and ``\\`` are accepted as directory separators, a path
    consisting of the bare file name is accepted as well, and the ``.pdf``
    extension is matched case-insensitively.

    Args:
        file_path: Path (or plain file name) of a PDF file.

    Returns:
        The file name without directory and ``.pdf`` suffix, or ``None`` when
        ``file_path`` does not end in a PDF file name.
    """
    # ``(?:^|[/\\])`` lets the name start either at the beginning of the
    # string or right after a separator, so "report.pdf" works too.
    pattern = r'(?:^|[/\\])([^/\\]+)\.pdf$'
    match = re.search(pattern, file_path, re.IGNORECASE)
    return match.group(1) if match else None
############################ </ Function to get filename out of path>##################
#################### < Function to get a dictionary of Excel files>##################
def get_dict_xlsx(outputzipextract, xlsx_file):
    """Load one extracted table workbook as a list of cleaned row dicts.

    Reads sheet ``Sheet1`` of ``<outputzipextract>/tables/<xlsx_file>`` and
    converts it to one dictionary per row, keyed by column header.  Escape
    sequences of the form ``_xHHHH_`` (e.g. ``_x000D_``) are removed from
    string headers and string cells, and surrounding whitespace is stripped.

    Args:
        outputzipextract: Directory into which the result ZIP was extracted.
        xlsx_file: File name of the workbook inside the ``tables`` folder.

    Returns:
        A list of dicts, one per table row.  Non-string cells are returned
        unchanged, except that missing cells become ``None`` so the result
        serialises to valid JSON.
    """
    escape_re = re.compile(r'_x[0-9a-fA-F]{4}_')

    def clean_text(value):
        # Only strings can carry escape sequences; passing a number or NaN
        # to re.sub would raise TypeError.
        if isinstance(value, str):
            return escape_re.sub('', value).strip()
        return value

    def clean_cell(value):
        if isinstance(value, str):
            return clean_text(value)
        # Empty cells are read as NaN/NaT; map them to None.
        if pd.isna(value):
            return None
        return value

    dfs = pd.read_excel(
        os.path.join(outputzipextract, "tables", xlsx_file),
        sheet_name='Sheet1', engine='openpyxl')
    # Convert the DataFrame to a list of per-row dictionaries.
    return [
        {clean_text(key): clean_cell(value) for key, value in row.items()}
        for row in dfs.to_dict(orient='records')
    ]
#################### </Function to get a dictionary of Excel files>##################
This is the file structure before running the code:

After you run it, you will have an 'output' folder containing the txt version of your PDF. When you open the txt file, you will see the tables in JSON format. GPT can read JSON and take it into account when it tries to answer your question.
In my PDF file, there is a table:

Your txt file will include this table in JSON format:
[{
"No.": "1",
"Equipment": "Pump",
"Plant": "A1",
"Tag": "P-1"
}, {
"No.": "2",
"Equipment": "Tank",
"Plant": "A2",
"Tag": "T-1"
}, {
"No.": "3",
"Equipment": "Heat Exchanger",
"Plant": "A3",
"Tag": "HE-1"
}, {
"No.": "4",
"Equipment": "Vessel",
"Plant": "A4",
"Tag": "V-1"
}]
This is the file structure after you run the code:

I hope it helps.