I'm trying to use the OCRmyPDF library, and here is my code:
ocrmypdf.ocr("input/mypdf.pdf",
"input/mypdf_ocr.pdf",
skip_text=False,
force_ocr=True,
deskew=True,
rotate_pages=True,
remove_background=False,
rotate_pages_threshold=3,
pages="1,72",
max_image_mpixels=1_000_000_000,
keep_temporary_files=False,
pdf_renderer="sandwich",
unpaper_args="",
clean=True,
progress_bar=False,)
The above call throws PermissionError: [Errno 13] Permission denied: 'unpaper'.
I'm not sure how to debug the cause of this error.
I'm using WSL2 (Ubuntu 20.04) on Windows 11.
Here is the full traceback from the Jupyter notebook:
---------------------------------------------------------------------------
PermissionError Traceback (most recent call last)
Input In [8], in <cell line: 1>()
----> 1 ocrmypdf.ocr("input/mypdf.pdf",
2 "input/mypdf_ocr.pdf",
3 skip_text=False,
4 force_ocr=True,
5 deskew=True,
6 rotate_pages=True,
7 remove_background=False,
8 rotate_pages_threshold=3,
9 pages="1,72",
10 max_image_mpixels=1_000_000_000,
11 keep_temporary_files=False,
12 pdf_renderer="sandwich",
13 unpaper_args="",
14 clean=True,
15 progress_bar=False)
File ~/AI/nexus/aps-esg-data-scraper/venv/lib/python3.8/site-packages/ocrmypdf/api.py:339, in ocr(input_file, output_file, language, image_dpi, output_type, sidecar, jobs, use_threads, title, author, subject, keywords, rotate_pages, remove_background, deskew, clean, clean_final, unpaper_args, oversample, remove_vectors, force_ocr, skip_text, redo_ocr, skip_big, optimize, jpg_quality, png_quality, jbig2_lossy, jbig2_page_group_size, pages, max_image_mpixels, tesseract_config, tesseract_pagesegmode, tesseract_oem, tesseract_thresholding, pdf_renderer, tesseract_timeout, rotate_pages_threshold, pdfa_image_compression, user_words, user_patterns, fast_web_view, plugins, plugin_manager, keep_temporary_files, progress_bar, **kwargs)
336 warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")
338 options = create_options(**create_options_kwargs)
--> 339 check_options(options, plugin_manager)
340 return run_pipeline(options=options, plugin_manager=plugin_manager, api=True)
File ~/AI/nexus/aps-esg-data-scraper/venv/lib/python3.8/site-packages/ocrmypdf/_validation.py:245, in check_options(options, plugin_manager)
244 def check_options(options: Namespace, plugin_manager: PluginManager) -> None:
--> 245 _check_plugin_invariant_options(options)
246 _check_plugin_options(options, plugin_manager)
File ~/AI/nexus/aps-esg-data-scraper/venv/lib/python3.8/site-packages/ocrmypdf/_validation.py:232, in _check_plugin_invariant_options(options)
230 check_options_output(options)
231 check_options_sidecar(options)
--> 232 check_options_preprocessing(options)
233 check_options_ocr_behavior(options)
234 check_options_advanced(options)
File ~/AI/nexus/aps-esg-data-scraper/venv/lib/python3.8/site-packages/ocrmypdf/_validation.py:132, in check_options_preprocessing(options)
130 raise BadArgsError("--clean is required for --unpaper-args")
131 if options.clean:
--> 132 check_external_program(
133 program='unpaper',
134 package='unpaper',
135 version_checker=unpaper.version,
136 need_version='6.1',
137 required_for=['--clean, --clean-final'],
138 )
139 try:
140 if options.unpaper_args:
File ~/AI/nexus/aps-esg-data-scraper/venv/lib/python3.8/site-packages/ocrmypdf/subprocess/__init__.py:331, in check_external_program(program, package, version_checker, need_version, required_for, recommended, version_parser)
329 try:
330 if callable(version_checker):
--> 331 found_version = version_checker()
332 else: # deprecated
333 found_version = version_checker
File ~/AI/nexus/aps-esg-data-scraper/venv/lib/python3.8/site-packages/ocrmypdf/_exec/unpaper.py:69, in version()
68 def version() -> str:
---> 69 return get_version('unpaper')
File ~/AI/nexus/aps-esg-data-scraper/venv/lib/python3.8/site-packages/ocrmypdf/subprocess/__init__.py:157, in get_version(program, version_arg, regex, env)
155 args_prog = [program, version_arg]
156 try:
--> 157 proc = run(
158 args_prog,
159 close_fds=True,
160 text=True,
161 stdout=PIPE,
162 stderr=STDOUT,
163 check=True,
164 env=env,
165 )
166 output: str = proc.stdout
167 except FileNotFoundError as e:
File ~/AI/nexus/aps-esg-data-scraper/venv/lib/python3.8/site-packages/ocrmypdf/subprocess/__init__.py:58, in run(args, env, logs_errors_to_stdout, check, **kwargs)
56 stderr_name = 'stderr' if not logs_errors_to_stdout else 'stdout'
57 try:
---> 58 proc = subprocess_run(args, env=env, check=check, **kwargs)
59 except CalledProcessError as e:
60 stderr = getattr(e, stderr_name, None)
File ~/.pyenv/versions/3.8.3/lib/python3.8/subprocess.py:489, in run(input, capture_output, timeout, check, *popenargs, **kwargs)
486 kwargs['stdout'] = PIPE
487 kwargs['stderr'] = PIPE
--> 489 with Popen(*popenargs, **kwargs) as process:
490 try:
491 stdout, stderr = process.communicate(input, timeout=timeout)
File ~/.pyenv/versions/3.8.3/lib/python3.8/subprocess.py:854, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
850 if self.text_mode:
851 self.stderr = io.TextIOWrapper(self.stderr,
852 encoding=encoding, errors=errors)
--> 854 self._execute_child(args, executable, preexec_fn, close_fds,
855 pass_fds, cwd, env,
856 startupinfo, creationflags, shell,
857 p2cread, p2cwrite,
858 c2pread, c2pwrite,
859 errread, errwrite,
860 restore_signals, start_new_session)
861 except:
862 # Cleanup if the child failed starting.
863 for f in filter(None, (self.stdin, self.stdout, self.stderr)):
File ~/.pyenv/versions/3.8.3/lib/python3.8/subprocess.py:1702, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
1700 if errno_num != 0:
1701 err_msg = os.strerror(errno_num)
-> 1702 raise child_exception_type(errno_num, err_msg, err_filename)
1703 raise child_exception_type(err_msg)
PermissionError: [Errno 13] Permission denied: 'unpaper'