0

I have a problem. I get the following error: `ValueError: Input contains NaN, infinity or a value too large for dtype('float32').` Is there a way to get the exact column name where this error occurs?

I looked at

I tried

np.any(np.isnan(df_train))
np.all(np.isfinite(df_train))
[OUT]

False
False
%%time
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, cross_validate
df_train = copy.deepcopy(df)
#df_train = dfListings_w_outliners
X = df_train.drop(columns=['car']) # Features
y = df_train['car'] # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)


from sklearn.ensemble import (RandomForestClassifier,)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

print(rf.score(X_test, y_test))
y_pred = rf.predict(X_test)
y_pred_test = rf.predict(X_train)

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)


from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precison:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Maß:",metrics.f1_score(y_test, y_pred))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File <timed exec>:13, in <module>

File ~\Anaconda3\lib\site-packages\sklearn\ensemble\_forest.py:327, in BaseForest.fit(self, X, y, sample_weight)
    325 if issparse(y):
    326     raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 327 X, y = self._validate_data(
    328     X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
    329 )
    330 if sample_weight is not None:
    331     sample_weight = _check_sample_weight(sample_weight, X)

File ~\Anaconda3\lib\site-packages\sklearn\base.py:581, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
    579         y = check_array(y, **check_y_params)
    580     else:
--> 581         X, y = check_X_y(X, y, **check_params)
    582     out = X, y
    584 if not no_val_X and check_params.get("ensure_2d", True):

File ~\Anaconda3\lib\site-packages\sklearn\utils\validation.py:964, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
    961 if y is None:
    962     raise ValueError("y cannot be None")
--> 964 X = check_array(
    965     X,
    966     accept_sparse=accept_sparse,
    967     accept_large_sparse=accept_large_sparse,
    968     dtype=dtype,
    969     order=order,
    970     copy=copy,
    971     force_all_finite=force_all_finite,
    972     ensure_2d=ensure_2d,
    973     allow_nd=allow_nd,
    974     ensure_min_samples=ensure_min_samples,
    975     ensure_min_features=ensure_min_features,
    976     estimator=estimator,
    977 )
    979 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric)
    981 check_consistent_length(X, y)

File ~\Anaconda3\lib\site-packages\sklearn\utils\validation.py:800, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    794         raise ValueError(
    795             "Found array with dim %d. %s expected <= 2."
    796             % (array.ndim, estimator_name)
    797         )
    799     if force_all_finite:
--> 800         _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
    802 if ensure_min_samples > 0:
    803     n_samples = _num_samples(array)

File ~\Anaconda3\lib\site-packages\sklearn\utils\validation.py:114, in _assert_all_finite(X, allow_nan, msg_dtype)
    107     if (
    108         allow_nan
    109         and np.isinf(X).any()
    110         or not allow_nan
    111         and not np.isfinite(X).all()
    112     ):
    113         type_err = "infinity" if allow_nan else "NaN, infinity"
--> 114         raise ValueError(
    115             msg_err.format(
    116                 type_err, msg_dtype if msg_dtype is not None else X.dtype
    117             )
    118         )
    119 # for object dtype data, we only check for NaNs (GH-13254)
    120 elif X.dtype == np.dtype("object") and not allow_nan:

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
Vitalizzare
  • 4,496
  • 7
  • 13
  • 32
Test
  • 571
  • 13
  • 32

1 Answer

1

Assuming df_train is a pandas.DataFrame.

Would something simple like this help:

df_train.isna().sum()
df_train.isin([np.inf]).sum()

From this you get the affected columns and can investigate further with:

df_train[df["affected_col"].isna()]

to get the rows.

MaxS
  • 71
  • 3
  • I only get the columns displayed and I have over 254 columns, so not all of them are displayed. – Test Jul 02 '22 at 08:52
  • Write a for loop to iterate over all columns. – Mikko Ohtamaa Jul 02 '22 at 08:54
  • @MikkoOhtamaa `for i in df_train.isna().sum(): print(i)`, or what? – Test Jul 02 '22 at 08:55
  • 1
    @Test if you need all the column names, e.g. try `l = list(df_train.columns[df_train.isna().sum() > 0])`. Might be more efficient to see how many NaN you have and define a strategy to deal with those, e.g. fill with mean of column or something. – MaxS Jul 02 '22 at 09:09