I am currently working on an Excel datasheet where I have rows with several features for which I want to predict multiple labels.
The features are either 0 or 1 because they describe whether something occurs in that row or not. The labels are also arrays filled with values of either 0 or 1; they describe whether certain features were related to a certain label.
I am trying out different approaches and arrived at random forests for now.
To speed things up I want to try out the cuML implementation of random forests. But when I feed it my data I encounter a RuntimeError: exception occurred! file=/__w/cuml/cuml/cpp/src/randomforest/randomforest.cu line=263: More than one variable expected for classification problem.
I don't know how to interpret this error.
Here is some code with example data that reproduces the error:
from sklearn.datasets import make_multilabel_classification
from cuml.ensemble import RandomForestClassifier as cuRFC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Minimal reproduction: wrap a cuML random forest in sklearn's
# MultiOutputClassifier and train it on a tiny synthetic multi-label dataset.
#
# Make a multi-label classification dataset
# (13 samples, 79 features, 10 label columns; n_labels=10 requests a high
# average number of active labels per sample).
X, y = make_multilabel_classification(n_samples=13, n_features=79, n_classes=10, n_labels=10, random_state=0)
# Split the data into training and testing sets (20% of the 13 samples held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# cuML random forest classifier
rfc = cuRFC(random_state=0)
# The wrapper keeps one estimator per label column (`estimators_`) and calls
# each one's predict() in turn, as the traceback frames in multioutput.py show.
multi_rfc = MultiOutputClassifier(rfc)
multi_rfc.fit(X_train, y_train)
# Predict on the data
# This is the call that raises the RuntimeError ("More than one variable
# expected for classification problem") from _obtain_treelite_handle().
# NOTE(review): presumably at least one column of y_train contains only a
# single class in this small split, so that column's forest has one class --
# verify with e.g. [np.unique(y_train[:, i]) for i in range(y_train.shape[1])].
y_pred = multi_rfc.predict(X_test)
# Evaluate the performance of the model
# (not reached while predict() above raises)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred, average='weighted'))
print('Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Recall:', recall_score(y_test, y_pred, average='weighted'))
And here is the output:
/home/user/anaconda3/envs/cuml/lib/python3.10/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
return func(**kwargs)
/home/user/anaconda3/envs/cuml/lib/python3.10/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
return func(**kwargs)
/home/user/anaconda3/envs/cuml/lib/python3.10/site-packages/cuml/internals/api_decorators.py:188: UserWarning: The number of bins, `n_bins` is greater than the number of samples used for training. Changing `n_bins` to number of training samples.
ret = func(*args, **kwargs)
/home/user/anaconda3/envs/cuml/lib/python3.10/site-packages/cuml/internals/api_decorators.py:188: UserWarning: To use pickling first train using float32 data to fit the estimator
ret = func(*args, **kwargs)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[28], line 17
14 multi_rfc.fit(X_train, y_train)
16 # Predict on the data
---> 17 y_pred = multi_rfc.predict(X_test)
19 # Evaluate the performance of the model
20 print('Accuracy:', accuracy_score(y_test, y_pred))
File ~/anaconda3/envs/cuml/lib/python3.10/site-packages/sklearn/multioutput.py:305, in _MultiOutputEstimator.predict(self, X)
302 if not hasattr(self.estimators_[0], "predict"):
303 raise ValueError("The base estimator should implement a predict method")
--> 305 y = Parallel(n_jobs=self.n_jobs)(
306 delayed(e.predict)(X) for e in self.estimators_
307 )
309 return np.asarray(y).T
File ~/anaconda3/envs/cuml/lib/python3.10/site-packages/sklearn/utils/parallel.py:65, in Parallel.__call__(self, iterable)
60 config = get_config()
61 iterable_with_config = (
62 (_with_config(delayed_func, config), args, kwargs)
63 for delayed_func, args, kwargs in iterable
64 )
---> 65 return super().__call__(iterable_with_config)
File ~/anaconda3/envs/cuml/lib/python3.10/site-packages/joblib/parallel.py:1863, in Parallel.__call__(self, iterable)
1861 output = self._get_sequential_output(iterable)
1862 next(output)
-> 1863 return output if self.return_generator else list(output)
1865 # Let's create an ID that uniquely identifies the current call. If the
1866 # call is interrupted early and that the same instance is immediately
1867 # re-used, this id will be used to prevent workers that were
1868 # concurrently finalizing a task from the previous call to run the
1869 # callback.
1870 with self._lock:
File ~/anaconda3/envs/cuml/lib/python3.10/site-packages/joblib/parallel.py:1792, in Parallel._get_sequential_output(self, iterable)
1790 self.n_dispatched_batches += 1
1791 self.n_dispatched_tasks += 1
-> 1792 res = func(*args, **kwargs)
1793 self.n_completed_tasks += 1
1794 self.print_progress()
File ~/anaconda3/envs/cuml/lib/python3.10/site-packages/sklearn/utils/parallel.py:127, in _FuncWrapper.__call__(self, *args, **kwargs)
125 config = {}
126 with config_context(**config):
--> 127 return self.function(*args, **kwargs)
File ~/anaconda3/envs/cuml/lib/python3.10/site-packages/cuml/internals/api_decorators.py:188, in _make_decorator_function.<locals>.decorator_function.<locals>.decorator_closure.<locals>.wrapper(*args, **kwargs)
185 set_api_output_dtype(output_dtype)
187 if process_return:
--> 188 ret = func(*args, **kwargs)
189 else:
190 return func(*args, **kwargs)
File ~/anaconda3/envs/cuml/lib/python3.10/site-packages/nvtx/nvtx.py:101, in annotate.__call__.<locals>.inner(*args, **kwargs)
98 @wraps(func)
99 def inner(*args, **kwargs):
100 libnvtx_push_range(self.attributes, self.domain.handle)
--> 101 result = func(*args, **kwargs)
102 libnvtx_pop_range(self.domain.handle)
103 return result
File randomforestclassifier.pyx:605, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.predict()
File ~/anaconda3/envs/cuml/lib/python3.10/site-packages/cuml/internals/api_decorators.py:188, in _make_decorator_function.<locals>.decorator_function.<locals>.decorator_closure.<locals>.wrapper(*args, **kwargs)
185 set_api_output_dtype(output_dtype)
187 if process_return:
--> 188 ret = func(*args, **kwargs)
189 else:
190 return func(*args, **kwargs)
File randomforest_common.pyx:348, in cuml.ensemble.randomforest_common.BaseRandomForestModel._predict_model_on_gpu()
File randomforest_common.pyx:232, in cuml.ensemble.randomforest_common.BaseRandomForestModel._obtain_treelite_handle()
RuntimeError: exception occurred! file=/__w/cuml/cuml/cpp/src/randomforest/randomforest.cu line=263: More than one variable expected for classification problem.
Obtained 64 stack frames
#0 in /home/user/anaconda3/envs/cuml/lib/python3.10/site-packages/cuml/internals/../libcuml++.so(_ZN4raft9exception18collect_call_stackEv+0x81) [0x7f13b9460c11]
...(I didn't paste most of the stack frames here.)
#63 in /home/user/anaconda3/envs/cuml/bin/python() [0x59ea19]
I thought that the problem might be caused by the data, so I tried it with other models such as sklearn random forests and XGBoost (here I had to convert the labels with a LabelEncoder for it to work, or use it without the wrapper), and they work.
When you change n_samples in make_multilabel_classification to something other than 13, it works with the cuML implementation as well, but it fails again at, for example, 3, and so far it always fails with my real data.