I have a zip file A001-C-002.zip
and a .xlsx file HUBMAP B004 codex antibodies metadata.xlsx
within a folder.
First, I want to read into the xlsx
file and convert it into a dataframe.
Next, I want to process all the files in the zip file.
from pathlib import Path
import pandas as pd
import zipfile
import os
import sys
path = "./../../"
os.chdir(path)
for filename in os.listdir(os.getcwd()):
with open(os.path.join(os.getcwd(), filename), 'r') as f:
with open("HUBMAP B004 codex antibodies metadata.xlsx", 'r') as ab:
ab_df = pd.read_excel(ab)
print(f"Antibody metadata column names:\n {ab_df.columns.values}")
# Patient A001
with zipfile.ZipFile(path / "A001-C-002.zip") as z:
for filename in z.namelist():
if not os.path.isdir(filename):
for line in z.open(filename):
print(line)
z.close()
Traceback
> --------------------------------------------------------------------------- UnicodeDecodeError Traceback (most recent call
> last) /tmp/ipykernel_3212/4008185006.py in <module>
> 2 with open(os.path.join(os.getcwd(), filename), 'r') as f:
> 3 with open("HUBMAP B004 codex antibodies metadata.xlsx", 'r') as ab:
> ----> 4 ab_df = pd.read_excel(ab)
> 5 print(f"Antibody metadata column names:\n {ab_df.columns.values}")
> 6
>
> ~/.local/lib/python3.8/site-packages/pandas/util/_decorators.py in
> wrapper(*args, **kwargs)
> 309 stacklevel=stacklevel,
> 310 )
> --> 311 return func(*args, **kwargs)
> 312
> 313 return wrapper
>
> ~/.local/lib/python3.8/site-packages/pandas/io/excel/_base.py in
> read_excel(io, sheet_name, header, names, index_col, usecols, squeeze,
> dtype, engine, converters, true_values, false_values, skiprows, nrows,
> na_values, keep_default_na, na_filter, verbose, parse_dates,
> date_parser, thousands, comment, skipfooter, convert_float,
> mangle_dupe_cols, storage_options)
> 362 if not isinstance(io, ExcelFile):
> 363 should_close = True
> --> 364 io = ExcelFile(io, storage_options=storage_options, engine=engine)
> 365 elif engine and engine != io.engine:
> 366 raise ValueError(
>
> ~/.local/lib/python3.8/site-packages/pandas/io/excel/_base.py in
> __init__(self, path_or_buffer, engine, storage_options) 1189 ext = "xls" 1190 else:
> -> 1191 ext = inspect_excel_format( 1192 content_or_path=path_or_buffer, storage_options=storage_options
> 1193 )
>
> ~/.local/lib/python3.8/site-packages/pandas/io/excel/_base.py in
> inspect_excel_format(content_or_path, storage_options) 1073
> stream = handle.handle 1074 stream.seek(0)
> -> 1075 buf = stream.read(PEEK_SIZE) 1076 if buf is None: 1077 raise ValueError("stream is empty")
>
> /usr/lib/python3.8/codecs.py in decode(self, input, final)
> 320 # decode input (taking the buffer into account)
> 321 data = self.buffer + input
> --> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
> 323 # keep undecoded input until the next call
> 324 self.buffer = data[consumed:]
>
> UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9a in position
> 15: invalid start byte