3
def ask(file):
    """Load a cached vector index for *file*, or build and cache a new one.

    Fixes over the original version:
    - The PDF is now only parsed when no cached index exists (the original
      ran the expensive PDFReader load even when it immediately loaded the
      index from disk instead).
    - The index is returned; the original built it and discarded it.

    Parameters
    ----------
    file : str | os.PathLike
        Path to the PDF to index.

    Returns
    -------
    GPTSimpleVectorIndex
        The index loaded from ``INDEX_FILE`` or freshly built from the PDF.
    """
    print(" Loading...")
    path = Path(file)
    print("Path: ", path)

    # Check if the index file exists
    if os.path.exists(INDEX_FILE):
        # Load the index from the file; no need to re-parse the PDF.
        logger.info("found index.json in the directory")
        index = GPTSimpleVectorIndex.load_from_disk(INDEX_FILE)
    else:
        logger.info("didnt find index.json in the directory")
        # Only now parse the PDF — building a new index actually needs it.
        PDFReader = download_loader("PDFReader")
        loader = PDFReader()
        documents = loader.load_data(file=path)

        llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-003"))
        service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, chunk_size_limit=1024)
        index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)

        # Persist so subsequent calls hit the cached branch above.
        index.save_to_disk(INDEX_FILE)

    return index

Above is my code snippet for generating an index for a PDF. I have used PDFReader from LlamaHub to extract text from the PDF. The bot answers well when asked about the text, but it fails when I ask for a value from a table present in the PDF.

I tried using different OpenAI text models, the best one being text-davinci-003. The bot is not able to answer questions about the values present in the tables in the PDF. This is because PDFReader simply converts the content of the PDF to text (it does not take any special steps to convert the table content). I want to know how I can successfully index both the text and the tables in the PDF using LangChain and LlamaIndex.

Harshit
  • 41
  • 2

1 Answers1

0

I understand you're looking for an open-source solution, but I thought I would share this approach using the Adobe PDF Extract API; you can start a free trial here: Adobe API Developer (https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/). The Python function below requires the API credentials ('private.key'): after clicking "Start trial", they are delivered inside a zipped folder that is downloaded to your machine.

from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
    ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
import logging
import os
import re
import zipfile
import json
import glob
import pandas as pd

def adobeAPI(base_path, file_path):
    """Extract text and tables from a PDF via the Adobe PDF Extract API.

    Submits *file_path* to the Extract operation, saves the resulting ZIP
    under ``<base_path>/output``, unpacks it, then walks the structured JSON
    in document order writing a flat ``.txt`` file in which each table is
    replaced by a JSON dump of its exported ``.xlsx`` workbook.

    Parameters
    ----------
    base_path : str
        Directory under which the ``output`` folder is created.
    file_path : str
        Path to the source ``.pdf``.

    Returns
    -------
    str
        Path to the generated ``.txt`` file.

    Raises
    ------
    ServiceApiException, ServiceUsageException, SdkException
        Propagated from the Adobe SDK (the original swallowed these and then
        crashed with an unrelated NameError on ``outputzip``).
    """
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

    filename = get_filename(file_path)
    # Destination for the API result ZIP and for its extracted contents.
    outputzip = os.path.join(base_path, "output", str(filename + ".zip"))
    outputzipextract = os.path.join(base_path, "output", str(filename))

    try:
        # Initial setup: build credentials from the downloaded JSON key file.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file("/path/to/pdfservices-api-credentials.json") \
            .build()

        # Create an ExecutionContext using credentials and a new operation instance.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()

        # Set operation input from a source file.
        source = FileRef.create_from_local_file(file_path)
        extract_pdf_operation.set_input(source)

        # Extract both text and tables, plus renditions for tables/figures
        # (the table renditions are what produce the .xlsx files used below).
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)

        # Execute the operation and save the resulting ZIP.
        result: FileRef = extract_pdf_operation.execute(execution_context)
        result.save_as(outputzip)
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")
        # Re-raise: continuing here would fail anyway when the ZIP is opened.
        raise

    # Unpack the API result next to the ZIP.
    with zipfile.ZipFile(outputzip, 'r') as zip_ref:
        zip_ref.extractall(path=outputzipextract)

    # structuredData.json lists every document element in reading order.
    with open(os.path.join(outputzipextract, "structuredData.json")) as json_file:
        data = json.load(json_file)

    # Queue of exported table workbooks, consumed in document order below.
    xlsx_queue = glob.glob(os.path.join(outputzipextract, "tables", "*.xlsx"))

    localfile = os.path.join(outputzipextract, str(filename + '.txt'))
    with open(localfile, "w", encoding='utf-8') as file:

        concatenated_string = ""

        # NOTE(review): the original iterated range(len(elements) - 1) and so
        # silently dropped the document's last element; all are processed here.
        for element in data['elements']:
            path = element['Path']

            if re.search(r"Figure$", path) or re.search(r"/Table(?:\[\d+\])?/", path):
                # Skip figures and the individual cells inside a table — each
                # whole table is emitted from its .xlsx export instead.
                continue

            if re.search(r"\/Table(?:\[\d+\])?$", path):
                # Replace the table element with the JSON dump of its workbook.
                # os.path.basename works on both POSIX and Windows paths; the
                # original backslash-lookbehind regex crashed on POSIX.
                xlsx_file = os.path.basename(xlsx_queue.pop(0))
                dfs_fixed_dict = get_dict_xlsx(outputzipextract, xlsx_file)
                concatenated_string = concatenated_string + "\n" + json.dumps(dfs_fixed_dict)

            elif 'Text' in element:
                concatenated_string = concatenated_string + "\n" + element['Text']

        file.write(concatenated_string)

    return localfile


############################ < Function to get filename out of path>##################


def get_filename(file_path):
    """Return the stem of a ``.pdf`` path (name without directory or
    extension), or ``None`` when the path is not a ``.pdf`` with at least
    one ``/`` or ``\\`` separator before the name."""
    found = re.search(r'[/\\]([^/\\]+)\.pdf$', file_path)
    return found.group(1) if found else None

############################ </ Function to get filename out of path>##################



#################### < Function to get a dictionary of Excel files>##################

def get_dict_xlsx(outputzipextract, xlsx_file):
    """Load one extracted table workbook and return it as a list of row dicts.

    Adobe's export encodes awkward characters as ``_xHHHH_`` escape
    sequences; these are stripped from string keys and values.

    Parameters
    ----------
    outputzipextract : str
        Root of the extracted API result (contains a ``tables`` folder).
    xlsx_file : str
        Basename of the workbook inside ``tables``.

    Returns
    -------
    list[dict]
        One dict per table row, keyed by (cleaned) column header.
    """
    dfs = pd.read_excel(os.path.join(
        outputzipextract, "tables", xlsx_file), sheet_name='Sheet1', engine='openpyxl')

    # Convert the DataFrame to a dictionary
    data_dict = dfs.to_dict(orient='records')

    def _clean(value):
        # Only strings carry the _xHHHH_ artifacts; the original applied
        # re.sub unconditionally and raised TypeError on numeric/NaN cells.
        if isinstance(value, str):
            return re.sub(r'_x[0-9a-fA-F]{4}_', '', value).strip()
        return value

    return [{_clean(k): _clean(v) for k, v in row.items()} for row in data_dict]

#################### </Function to get a dictionary of Excel files>##################

This is the file structure before running the code:

enter image description here

After you run it, you will have an 'output' folder where you can find the .txt version of your PDF. When you open the .txt file, you will notice the JSON representation of the tables inside it. GPT can read JSON and take it into account when it tries to answer your question. In my PDF file, there is a table:

Pump_Table

Your txt file includes the json format of this:

[{
    "No.": "1",
    "Equipment": "Pump",
    "Plant": "A1",
    "Tag": "P-1"
}, {
    "No.": "2",
    "Equipment": "Tank",
    "Plant": "A2",
    "Tag": "T-1"
}, {
    "No.": "3",
    "Equipment": "Heat Exchanger",
    "Plant": "A3",
    "Tag": "HE-1"
}, {
    "No.": "4",
    "Equipment": "Vessel",
    "Plant": "A4",
    "Tag": "V-1"
}]

This is the file structure after you run the code:

enter image description here

I hope it helps.

Behrouz Beheshti
  • 1,053
  • 1
  • 10
  • 14