With the help from these two questions I have coded a program to help me count words in csv files through given directories.
question no1: Counting elements in specified column of a .csv file
question no2: Sum of values from multiple dicts
When I pass a directory it kind of works like I want it to. Now I have stumbled across a problem. I pass a new directory. The file names are of the same pattern, I have tried moving to another disk, tried putting r' in front of the path etc. But I keep getting this error: 'utf-8' codec can't decode byte 0xb8 in position 77: invalid start byte
Here is the full code and the full error message.:
import numpy as np
import csv
import glob
import pandas as pd
import os
import collections
import functools
import operator
import matplotlib.pyplot as plt
root = input("Direktorij: ")
ext = ('.csv')
for subdir, dirs, files in os.walk(root):
seznam = [] #here i create an empty list to use in the next for loop.
for datoteka in os.scandir(subdir):
if datoteka.path.endswith(ext):
df = pd.read_csv(datoteka)
klasifikacija = df.iloc[:, 4] #this gets me all of the files from a directory with .csv extension
napake = klasifikacija.value_counts() #this counts the same words
dict_napake = dict(napake)
seznam.append(dict_napake)
napake = dict(functools.reduce(operator.add,map(collections.Counter, seznam)))
with open('petek.csv', 'a') as output: #from here on the code is use to write the results in a new csv file. This kind of works as I want it to.
writer = csv.writer(output)
for key, value in napake.items():
writer.writerow([key, value])
indeks = subdir.split("\\")[-1]
with open('petek.csv', 'a') as output:
writer = csv.writer(output)
writer.writerow(indeks)
print(f'{indeks} \n {napake}')
print("done") #Makes it easier to know if the code ran to the end
Here is the error message:
UnicodeDecodeError Traceback (most recent call last)
Input In [40], in <cell line: 1>()
4 for datoteka in os.scandir(subdir):
5 if datoteka.path.endswith(ext):
----> 6 df = pd.read_csv(datoteka)
7 klasifikacija = df.iloc[:, 4] # Gets all rows for the fourth column (index starts at 0)
8 napake = klasifikacija.value_counts()
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:680, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
665 kwds_defaults = _refine_defaults_read(
666 dialect,
667 delimiter,
(...)
676 defaults={"delimiter": ","},
677 )
678 kwds.update(kwds_defaults)
--> 680 return _read(filepath_or_buffer, kwds)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:575, in _read(filepath_or_buffer, kwds)
572 _validate_names(kwds.get("names", None))
574 # Create the parser.
--> 575 parser = TextFileReader(filepath_or_buffer, **kwds)
577 if chunksize or iterator:
578 return parser
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:933, in TextFileReader.__init__(self, f, engine, **kwds)
930 self.options["has_index_names"] = kwds["has_index_names"]
932 self.handles: IOHandles | None = None
--> 933 self._engine = self._make_engine(f, self.engine)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:1235, in TextFileReader._make_engine(self, f, engine)
1232 raise ValueError(msg)
1234 try:
-> 1235 return mapping[engine](f, **self.options)
1236 except Exception:
1237 if self.handles is not None:
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py:75, in CParserWrapper.__init__(self, src, **kwds)
72 kwds.pop(key, None)
74 kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
---> 75 self._reader = parsers.TextReader(src, **kwds)
77 self.unnamed_cols = self._reader.unnamed_cols
79 # error: Cannot determine type of 'names'
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx:544, in pandas._libs.parsers.TextReader.__cinit__()
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx:633, in pandas._libs.parsers.TextReader._get_header()
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx:847, in pandas._libs.parsers.TextReader._tokenize_rows()
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\_libs\parsers.pyx:1952, in pandas._libs.parsers.raise_parser_error()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb8 in position 77: invalid start byte