I want to create a dataframe for all the csv files within my input_path
. My code is raising UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 14: invalid start byte
error. I've also tried calling read_csv with encoding='latin1'
, encoding='iso-8859-1'
or encoding='cp1252'
.
import os
import pandas as pd
input_path = "../input_data/"
# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
for file in files:
with open(os.path.join(root, file), "r") as data:
df = pd.read_csv(data, encoding='utf_8')
Traceback:
> --------------------------------------------------------------------------- UnicodeDecodeError Traceback (most recent call
> last) /tmp/ipykernel_136/3748812978.py in <module>
> 3 for file in files:
> 4 with open(os.path.join(root, file), "r") as data:
> ----> 5 df = pd.read_csv(data, encoding='utf_8')
>
> ~/.local/lib/python3.8/site-packages/pandas/util/_decorators.py in
> wrapper(*args, **kwargs)
> 309 stacklevel=stacklevel,
> 310 )
> --> 311 return func(*args, **kwargs)
> 312
> 313 return wrapper
>
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col,
> usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters,
> true_values, false_values, skipinitialspace, skiprows, skipfooter,
> nrows, na_values, keep_default_na, na_filter, verbose,
> skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col,
> date_parser, dayfirst, cache_dates, iterator, chunksize, compression,
> thousands, decimal, lineterminator, quotechar, quoting, doublequote,
> escapechar, comment, encoding, encoding_errors, dialect,
> error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace,
> low_memory, memory_map, float_precision, storage_options)
> 584 kwds.update(kwds_defaults)
> 585
> --> 586 return _read(filepath_or_buffer, kwds)
> 587
> 588
>
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> _read(filepath_or_buffer, kwds)
> 480
> 481 # Create the parser.
> --> 482 parser = TextFileReader(filepath_or_buffer, **kwds)
> 483
> 484 if chunksize or iterator:
>
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> __init__(self, f, engine, **kwds)
> 809 self.options["has_index_names"] = kwds["has_index_names"]
> 810
> --> 811 self._engine = self._make_engine(self.engine)
> 812
> 813 def close(self):
>
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> _make_engine(self, engine) 1038 ) 1039 # error: Too many arguments for "ParserBase"
> -> 1040 return mapping[engine](self.f, **self.options) # type: ignore[call-arg] 1041 1042 def
> _failover_to_python(self):
>
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py
> in __init__(self, src, **kwds)
> 67 kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
> 68 try:
> ---> 69 self._reader = parsers.TextReader(self.handles.handle, **kwds)
> 70 except Exception:
> 71 self.handles.close()
>
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader.__cinit__()
>
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader._get_header()
>
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader._tokenize_rows()
>
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.raise_parser_error()
>
> UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position
> 14: invalid start byte