I was not able to extract the text of all pages using tabula, so I tried running camelot on my MacBook (M2). I am getting the following error log:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/ext/ghostscript/_gsprint.py", line 260, in <module>
libgs = cdll.LoadLibrary("libgs.so")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/ctypes/__init__.py", line 454, in LoadLibrary
return self._dlltype(name)
^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/ctypes/__init__.py", line 376, in __init__
self._handle = _dlopen(self._name, mode)
^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: dlopen(libgs.so, 0x0006): tried: 'libgs.so' (no such file), '/System/Volumes/Preboot/Cryptexes/OSlibgs.so' (no such file), '/usr/lib/libgs.so' (no such file, not in dyld cache), 'libgs.so' (no such file), '/usr/lib/libgs.so' (no such file, not in dyld cache)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/xxx/Desktop/xx/test.py", line 117, in <module>
extracted_data = extract_pdf_camelot()
^^^^^^^^^^^^^^^^^^^^^
File "/Users/xxx/Desktop/xx/test.py", line 106, in extract_pdf_camelot
tables = camelot.read_pdf(pdf_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/io.py", line 113, in read_pdf
tables = p.parse(
^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/handlers.py", line 173, in parse
t = parser.extract_tables(
^^^^^^^^^^^^^^^^^^^^^^
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/parsers/lattice.py", line 402, in extract_tables
self._generate_image()
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/parsers/lattice.py", line 211, in _generate_image
from ..ext.ghostscript import Ghostscript
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/ext/ghostscript/__init__.py", line 24, in <module>
from . import _gsprint as gs
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/camelot/ext/ghostscript/_gsprint.py", line 267, in <module>
raise RuntimeError("Please make sure that Ghostscript is installed")
RuntimeError: Please make sure that Ghostscript is installed
The code I am running is:
import os
import sys
import PyPDF2
from openpyxl import Workbook
import os
import PyPDF2
import tabula
import pandas as pd
from openpyxl import load_workbook
import camelot
# Accumulators filled in by the extraction functions.
pdf_paths = []
BatchList = []

# Fixed values intended for the spreadsheet rows.
university = "xx"
college = "xx"
Batch = "2022"
# Column mapping of an extracted table row:
# Program / Degree:row[3]
# Roll No,.:row[1]
# Name:row[2]
# Branch:row[4]
file_add = "xxx"
univ_res = "https://convocation.ccc.ac.in/index.php/convocation/Information_page/degree_recipients"

# These two assignments were fused onto a single line, which is a syntax
# error and left ``excel_path`` undefined for the final call of the script.
shet = "Batch:" + Batch
excel_path = 'ExcelDatabase/Ix.xlsx'
tracker_path = "ExcelDatabase/tracker.xlsx"
def extract_pdf_data_tabula(pdf_path):
    """Extract tables from the PDFs found under the main folder using tabula.

    Walks ``main_folder_path``.  At every level, each sub-directory except the
    first one listed is scanned for files ending in ``.pdf``.  For each PDF:

    * the last four characters of the directory name are appended to the
      module-level ``BatchList``;
    * the PDF path is appended to the module-level ``pdf_paths``;
    * the page count is read with PyPDF2 and tabula is asked for every page.

    Args:
        pdf_path: Not used.  The PDFs are discovered by walking the folder;
            the parameter is kept only so existing callers keep working.

    Returns:
        list: everything ``tabula.read_pdf`` returned for all PDFs, in
        discovery order.
    """
    # Specify the path to the main folder
    main_folder_path = "./EducationDatabase2/xxC"
    df_list = []
    for root, dirs, _files in os.walk(main_folder_path):
        for position, folder in enumerate(dirs, start=1):
            # The first sub-directory at each level is deliberately skipped.
            if position == 1:
                continue
            # Join against ``root`` (not the top folder) so nested
            # directories resolve to a path that actually exists.
            folder_path = os.path.join(root, folder)
            for filename in os.listdir(folder_path):
                if not filename.endswith(".pdf"):
                    continue
                print(filename)
                current_pdf = os.path.join(folder_path, filename)
                # The batch label is the trailing four characters of the
                # directory name.
                BatchList.append(folder[-4:])
                with open(current_pdf, "rb") as pdf_file:
                    pdf_paths.append(current_pdf)
                    total_pages = len(PyPDF2.PdfReader(pdf_file).pages)
                # Request every page of this document instead of a fixed
                # "1-32" range, which truncated longer PDFs.
                # NOTE(review): stream and lattice are both enabled here;
                # confirm which extraction mode tabula applies in that case.
                df_list.extend(
                    tabula.read_pdf(
                        current_pdf,
                        pages=f"1-{total_pages}",
                        stream=True,
                        lattice=True,
                        guess=False,
                        multiple_tables=True,
                    )
                )
    return df_list
def save_data_to_excel(data, excel_path):
    """Print a one-row preview of every extracted table.

    Despite its name, this function does not write anything to
    ``excel_path`` yet; it only prints each table after dropping the columns
    that contain missing values.

    Args:
        data: Iterable of pandas DataFrames, or ``None`` (treated as "no
            tables" so an extractor that found nothing does not crash here).
        excel_path: Path of the target workbook.  Currently unused.

    Returns:
        None
    """
    if data is None:
        return
    for no, df in enumerate(data, start=1):
        print("Dataframe:", no)
        # Keep only the columns without any missing value for the preview.
        preview = df.dropna(axis=1)
        print(preview.head(1))
        print()
    # TODO: write the rows to the workbook at ``excel_path`` (sheet
    # "Batch_2022"), one spreadsheet row per table row, in the order
    # university, college, Batch, row[3], row[1], row[2], row[4], file_add,
    # univ_res, and then save the workbook.
def extract_pdf_camelot(pages="1"):
    """Extract tables from the PDFs found under the main folder using camelot.

    Walks ``main_folder_path``.  At every level, each sub-directory except the
    first one listed is scanned for files ending in ``.pdf``.  For each PDF
    the last four characters of the directory name are appended to the
    module-level ``BatchList`` and the tables camelot finds are collected.

    Args:
        pages: Page selection forwarded to ``camelot.read_pdf``.  The default
            ``"1"`` matches what the call did before (no ``pages`` argument);
            pass e.g. ``"all"`` to process the whole document.

    Returns:
        list: the ``.df`` DataFrame of every table found, in discovery order.
        (Previously the function returned ``None``, which made the caller's
        loop over the result fail.)
    """
    main_folder_path = "./EducationDatabase2/Indian Institute of Technology - Guwahati"
    df_list = []
    for root, dirs, _files in os.walk(main_folder_path):
        for position, folder in enumerate(dirs, start=1):
            # The first sub-directory at each level is deliberately skipped.
            if position == 1:
                continue
            # Join against ``root`` (not the top folder) so nested
            # directories resolve to a path that actually exists.
            folder_path = os.path.join(root, folder)
            for filename in os.listdir(folder_path):
                if not filename.endswith(".pdf"):
                    continue
                print(filename)
                pdf_path = os.path.join(folder_path, filename)
                # The batch label is the trailing four characters of the
                # directory name.
                BatchList.append(folder[-4:])
                # extract the tables in the PDF file
                # NOTE: the lattice parser used by this call imports camelot's
                # Ghostscript binding, so the native Ghostscript library must
                # be loadable or this raises RuntimeError.
                tables = camelot.read_pdf(pdf_path, pages=pages)
                # Number the tables per PDF; the old counter was never
                # incremented and always printed 1.
                for table_no, table in enumerate(tables, start=1):
                    print("Df no:", table_no)
                    df_list.append(table.df)
                    print(table.df.head(2))
    return df_list
# Script entry: run the camelot-based extractor and pass whatever it returns,
# together with the workbook path, to save_data_to_excel.
# The tabula-based extractor is kept here as a disabled alternative.
# extracted_data = extract_pdf_data_tabula(pdf_path)
extracted_data = extract_pdf_camelot()
# NOTE(review): this module-level counter is not read by any code visible
# here — confirm before removing.
cnt =2
save_data_to_excel(extracted_data, excel_path)
I found a similar issue, or rather the same issue, here (https://github.com/camelot-dev/excalibur/issues/87) and here (https://github.com/camelot-dev/camelot/issues/193), but I could not find a solution in either thread.
I have tried some of the solutions provided on Stack Overflow (such as "libssl.so.1.1: cannot open shared object file: No such file or directory") and searched extensively, but most of the solutions are for Linux or Windows, and the remaining ones do not work.
I checked whether the ghostscript Python package is installed:
pip3 show ghostscript
I got the following result:
Name: ghostscript Version: 0.7 Summary: Interface to the Ghostscript C-API, both high- and low-level, based on ctypes Home-page: https://gitlab.com/pdftools/python-ghostscript Author: Hartmut Goebel Author-email: h.goebel@crazy-compilers.com License: GNU General Public License v3 or later (GPLv3+) Location: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages Requires: setuptools Required-by:
I also tried installing Ghostscript from the official site (https://www.ghostscript.com/releases/gsdnld.html), where I downloaded the "Ghostscript 10.01.1 Source for all platforms" version,
but I am still getting the same error.