0

I could not extract the text of all pages using tabula, so I tried running camelot on my MacBook (M2). I am getting the following error:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/ext/ghostscript/_gsprint.py", line 260, in <module>
    libgs = cdll.LoadLibrary("libgs.so")
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/ctypes/__init__.py", line 454, in LoadLibrary
    return self._dlltype(name)
           ^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: dlopen(libgs.so, 0x0006): tried: 'libgs.so' (no such file), '/System/Volumes/Preboot/Cryptexes/OSlibgs.so' (no such file), '/usr/lib/libgs.so' (no such file, not in dyld cache), 'libgs.so' (no such file), '/usr/lib/libgs.so' (no such file, not in dyld cache)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/xxx/Desktop/xx/test.py", line 117, in <module>
    extracted_data =  extract_pdf_camelot()
                      ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/xxx/Desktop/xx/test.py", line 106, in extract_pdf_camelot
    tables = camelot.read_pdf(pdf_path)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/io.py", line 113, in read_pdf
    tables = p.parse(
             ^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/handlers.py", line 173, in parse
    t = parser.extract_tables(
        ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/parsers/lattice.py", line 402, in extract_tables
    self._generate_image()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/parsers/lattice.py", line 211, in _generate_image
    from ..ext.ghostscript import Ghostscript
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/ext/ghostscript/__init__.py", line 24, in <module>
    from . import _gsprint as gs
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/ext/ghostscript/_gsprint.py", line 267, in <module>
    raise RuntimeError("Please make sure that Ghostscript is installed")
RuntimeError: Please make sure that Ghostscript is installed

The code I am running is


import os
import sys
import PyPDF2
from openpyxl import Workbook
import os
import PyPDF2
import tabula
import pandas as pd
from openpyxl import load_workbook
import camelot


# Module-level accumulators appended to by the extraction functions.
pdf_paths = []   # full path of every PDF opened by extract_pdf_data_tabula
BatchList = []   # batch label taken from the name of each processed folder

# Fixed values intended for every exported row.
university = "xx"
college = "xx"
Batch = "2022"
# Column mapping of an extracted table row (see the disabled export code):
# Program / Degree:row[3]
# Roll No,.:row[1]
# Name:row[2]
# Branch:row[4]
file_add = "xxx"
univ_res = "https://convocation.ccc.ac.in/index.php/convocation/Information_page/degree_recipients"

# These were fused into a single (syntactically invalid) line; they are two
# independent assignments.
shet = "Batch:" + Batch
excel_path = 'ExcelDatabase/Ix.xlsx'
tracker_path = "ExcelDatabase/tracker.xlsx"




def extract_pdf_data_tabula(pdf_path):
    """Extract the tables of the first PDF found under the main folder with tabula.

    The folder tree below ``main_folder_path`` is walked; the first
    sub-folder listed at each level is skipped. For the first ``.pdf`` file
    encountered, its path is appended to the module-level ``pdf_paths``, the
    last four characters of its folder name are appended to ``BatchList``,
    and every page of the document is handed to ``tabula.read_pdf``.

    Args:
        pdf_path: Ignored; kept for backward compatibility. The path actually
            read is discovered by walking ``main_folder_path``.

    Returns:
        list: The DataFrames returned by tabula for the first PDF found, or
        an empty list when no PDF exists.
    """
    # Specify the path to the main folder
    main_folder_path = "./EducationDatabase2/xxC"
    df_list = []
    for root, dirs, files in os.walk(main_folder_path):
        # Skip the first sub-folder of each level (same as the original
        # `if no==1: continue`); an empty `dirs` simply yields nothing.
        for folder in dirs[1:]:
            # Build the path from the directory currently being walked so
            # that nested folders resolve correctly.
            folder_path = root + '/' + folder
            for filename in os.listdir(folder_path):
                if not filename.endswith(".pdf"):
                    continue
                print(filename)
                pdf_path = folder_path + '/' + filename
                # The batch label is the last four characters of the folder name.
                BatchList.append(folder[-4:])
                with open(pdf_path, "rb") as pdf_file:
                    pdf_paths.append(pdf_path)
                    totalpages = len(PyPDF2.PdfReader(pdf_file).pages)
                # Use the real page count instead of a hard-coded '1-32', so
                # no page is dropped and no missing page is requested.
                # NOTE(review): stream and lattice are both enabled, as in
                # the original call — confirm which mode is intended.
                df_list.extend(
                    tabula.read_pdf(
                        pdf_path,
                        pages=f"1-{totalpages}",
                        stream=True,
                        lattice=True,
                        guess=False,
                        multiple_tables=True,
                    )
                )
                # Only the first PDF is processed, as in the original.
                return df_list
    # No PDF found: return an empty list rather than an implicit None.
    return df_list


def save_data_to_excel(data, excel_path):
    """Print a one-row preview of every DataFrame in ``data``.

    Despite its name, nothing is written to disk yet: the workbook export is
    disabled and ``excel_path`` is not used.

    Args:
        data: Iterable of pandas DataFrames.
        excel_path: Path of the target workbook (currently ignored).
    """
    # TODO: re-enable the Excel export. The disabled draft loaded the workbook
    # at excel_path, selected the sheet "Batch_2022" and, starting at
    # worksheet row 3, wrote one row per DataFrame record laid out as
    # [university, college, Batch, row[3], row[1], row[2], row[4], file_add,
    # univ_res] before saving the workbook.
    index = 0
    for frame in data:
        index += 1
        print("Dataframe:", index)
        # Keep only the columns that contain no missing values.
        complete_columns = frame.dropna(axis=1)
        preview = complete_columns.head(1)
        print(preview)
        print()




def extract_pdf_camelot():
    """Extract the tables of the first PDF found under the main folder with camelot.

    The folder tree below ``main_folder_path`` is walked; the first
    sub-folder listed at each level is skipped. For the first ``.pdf`` file
    encountered, the last four characters of its folder name are appended to
    the module-level ``BatchList`` and the file is parsed with
    ``camelot.read_pdf``. A two-row preview of each table is printed.

    Returns:
        list: One DataFrame per table camelot found in the first PDF, or an
        empty list when no PDF exists.
    """
    main_folder_path = "./EducationDatabase2/Indian Institute of Technology - Guwahati"
    df_list = []
    for root, dirs, files in os.walk(main_folder_path):
        # Skip the first sub-folder of each level (same as the original
        # `if no==1: continue`); an empty `dirs` simply yields nothing.
        for folder in dirs[1:]:
            # Build the path from the directory currently being walked so
            # that nested folders resolve correctly.
            folder_path = root + '/' + folder
            for filename in os.listdir(folder_path):
                if not filename.endswith(".pdf"):
                    continue
                print(filename)
                pdf_path = folder_path + '/' + filename
                # The batch label is the last four characters of the folder name.
                BatchList.append(folder[-4:])
                # NOTE(review): read_pdf is called with its default page
                # selection; pass pages="all" if every page must be parsed —
                # confirm against the camelot documentation.
                tables = camelot.read_pdf(pdf_path)
                for table_no, table in enumerate(tables, start=1):
                    print("Df no:", table_no)
                    df_list.append(table.df)
                    print(table.df.head(2))
                # Only the first PDF is processed, as in the original; the
                # collected tables are returned instead of None so the
                # caller can iterate over them.
                return df_list
    # No PDF found: return an empty list rather than an implicit None.
    return df_list



# Script entry point.
# Alternative tabula-based extraction (currently disabled):
# extracted_data = extract_pdf_data_tabula(pdf_path)
# Run the camelot-based extraction and hand its result, together with the
# module-level excel_path, to save_data_to_excel.
extracted_data =  extract_pdf_camelot()
# NOTE(review): this module-level cnt is not read by any code visible in this
# file (save_data_to_excel defines its own local cnt) — confirm before removing.
cnt =2
save_data_to_excel(extracted_data, excel_path)

I found what appears to be the same issue reported here (https://github.com/camelot-dev/excalibur/issues/87) and here (https://github.com/camelot-dev/camelot/issues/193), but I could not find a solution in either thread.

I have tried some of the solutions suggested on Stack Overflow (for example, "libssl.so.1.1: cannot open shared object file: No such file or directory") and searched extensively, but most of the solutions are for Linux or Windows, and the remaining ones did not work for me.

I checked the installed ghostscript Python package by running `pip3 show ghostscript`.

I got the following result:

Name: ghostscript
Version: 0.7
Summary: Interface to the Ghostscript C-API, both high- and low-level, based on ctypes
Home-page: https://gitlab.com/pdftools/python-ghostscript
Author: Hartmut Goebel
Author-email: h.goebel@crazy-compilers.com
License: GNU General Public License v3 or later (GPLv3+)
Location: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages
Requires: setuptools
Required-by:

I also tried installing Ghostscript from the official site (https://www.ghostscript.com/releases/gsdnld.html), where I downloaded the "Ghostscript 10.01.1 Source for all platforms" version, but I am still getting the same error.

  • You've downloaded the source tarball for Ghostscript from what you say. That won't help you unless you also build the source and install the resulting executable. Assuming you are running on Linux you will need to get the Ghostscript package from your distro maintainer. If you're on MacOS then you'll need to build it from source. If you're on Windows then note that the DLL is not called libgs.so, it is gsdllxx.dll where xx is 32 or 64 depending on which you install. – KenS May 26 '23 at 07:02
  • What do you mean by building from source? Can you be more specific and a little more descriptive? – MAYANK GARG May 27 '23 at 11:09
  • It's hard to see how I can be more specific or descriptive. I believe you have downloaded the source tarball because you say you downloaded "Ghostscript 10.01.1 Source for all platforms". You need to extract all the source files from the tarball, then use a C compiler to compile the source code into a binary executable. There are instructions here: https://ghostscript.readthedocs.io/en/gs10.0.0/Make.html – KenS May 28 '23 at 07:56
  • Would you mind formatting the errors as code as well? – Lover of Structure May 28 '23 at 18:51

0 Answers0