UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 14: invalid start byte

Question

I want to create a dataframe for all the csv files within my input_path. My code raises `UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 14: invalid start byte`. I've also tried calling read_csv with encoding='latin1', encoding='iso-8859-1' or encoding='cp1252'.

import os
import pandas as pd

# Root directory that is scanned for input files.
input_path = "../input_data/"


# Walk input_path and all of its subfolders; every file found is read as CSV,
# whatever its extension (there is no filename filter).
for root, dirs, files in os.walk(input_path):
    for file in files:
        # The file is opened in text mode with no explicit encoding, so the
        # bytes are decoded by open() before pandas ever sees them.
        # NOTE(review): read_csv's encoding= presumably has no effect on an
        # already-open text handle — confirm against the pandas docs.
        with open(os.path.join(root, file), "r") as data:
            # df is rebound on each iteration, so only the frame from the
            # last file read is still available after the loop.
            df = pd.read_csv(data, encoding='utf_8')

Traceback:

> --------------------------------------------------------------------------- UnicodeDecodeError                        Traceback (most recent call
> last) /tmp/ipykernel_136/3748812978.py in <module>
>       3     for file in files:
>       4         with open(os.path.join(root, file), "r") as data:
> ----> 5             df = pd.read_csv(data, encoding='utf_8')
> 
> ~/.local/lib/python3.8/site-packages/pandas/util/_decorators.py in
> wrapper(*args, **kwargs)
>     309                     stacklevel=stacklevel,
>     310                 )
> --> 311             return func(*args, **kwargs)
>     312 
>     313         return wrapper
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col,
> usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters,
> true_values, false_values, skipinitialspace, skiprows, skipfooter,
> nrows, na_values, keep_default_na, na_filter, verbose,
> skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col,
> date_parser, dayfirst, cache_dates, iterator, chunksize, compression,
> thousands, decimal, lineterminator, quotechar, quoting, doublequote,
> escapechar, comment, encoding, encoding_errors, dialect,
> error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace,
> low_memory, memory_map, float_precision, storage_options)
>     584     kwds.update(kwds_defaults)
>     585 
> --> 586     return _read(filepath_or_buffer, kwds)
>     587 
>     588 
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> _read(filepath_or_buffer, kwds)
>     480 
>     481     # Create the parser.
> --> 482     parser = TextFileReader(filepath_or_buffer, **kwds)
>     483 
>     484     if chunksize or iterator:
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> __init__(self, f, engine, **kwds)
>     809             self.options["has_index_names"] = kwds["has_index_names"]
>     810 
> --> 811         self._engine = self._make_engine(self.engine)
>     812 
>     813     def close(self):
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> _make_engine(self, engine)    1038             )    1039         # error: Too many arguments for "ParserBase"
> -> 1040         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]    1041     1042     def
> _failover_to_python(self):
> 
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py
> in __init__(self, src, **kwds)
>      67         kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
>      68         try:
> ---> 69             self._reader = parsers.TextReader(self.handles.handle, **kwds)
>      70         except Exception:
>      71             self.handles.close()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader.__cinit__()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader._get_header()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader._tokenize_rows()
> 
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.raise_parser_error()
> 
> UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position
> 14: invalid start byte

First: set the right encoding for each file (maybe your target file has `Latin` encoding, for example). Second: set `encoding_errors='ignore'` to skip the errors (not recommended). — meti, Oct 03 '21 at 07:24
Maybe see https://stackoverflow.com/questions/30462807/encoding-error-in-panda-read-csv — Stefan, Oct 03 '21 at 07:25
Also, your code will try to load **all** files found by *os.walk()*, but will only ever end up with one dataframe. Maybe it's trying to treat certain unwanted files as if they were CSV files when they're not. Try printing the filenames. That might give you a clue — , Oct 03 '21 at 07:27

score 0 · Answer 1 · answered Oct 03 '21 at 07:46

Try to determine your file encoding using the chardet package.

Demo:

# Python env: pip install chardet
# Anaconda env: conda install chardet

import chardet
import pathlib

# Directory whose *.csv files are inspected (direct children only; the glob
# pattern below does not descend into subfolders).
input_path = "../input_data/"
# A single detector is reused for all files and reset before each one.
detector = chardet.UniversalDetector()

for filename in pathlib.Path(input_path).glob('*.csv'):
    detector.reset()
    print(f"Filename: {filename}")
    # Feed raw bytes line by line; the context manager closes the handle even
    # when the loop stops early.
    with open(filename, 'rb') as raw_file:
        for line in raw_file:
            detector.feed(line)
            # Stop reading once the detector reports it has seen enough.
            if detector.done:
                break
    # close() finalizes the guess before detector.result is read.
    detector.close()
    print(f"Encoding: {detector.result['encoding']} (confidence: {detector.result['confidence']})\n")

Output:

Filename: ../input_data/file1.csv
Encoding: Windows-1252 (confidence: 0.7299263369321677)

Filename: ../input_data/file2.csv
Encoding: ascii (confidence: 1.0)

Filename: ../input_data/file3.csv
Encoding: ISO-8859-1 (confidence: 0.73)

Filename: ../input_data/file4.csv
Encoding: utf-8 (confidence: 0.99)

Filename: ../input_data/file5.csv
Encoding: ISO-8859-1 (confidence: 0.73)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 14: invalid start byte

1 Answer