This error is weird and I cant even find anything on google about it.
I'm attempting to hot encode a column in an existing sparse dataframe,
combined_cats
is a set of all the possible categories. column_name
is a generic column name.
df[column_name] = df[column_name].astype('category', categories=combined_cats,copy=False)
However, this fails with the error in the title. I figured that you cant hot encode a sparse matrix, but I can't seem to convert it back to a dense matrix by to_dense(), as it says numpy ndarray has no such method.
I attempted using as_matrix() and resetting the column:
df[column_name] = df[column_name].as_matrix()
df[column_name] = df[column_name].astype('category', categories=combined_cats,copy=False)
Which didn't work either. Is there something im doing wrong? The error occurs when I try to use combined_cats.
eg:
def hot_encode_column_in_both_datasets(column_name,df,df2,sparse=True):
col1b = set(df2[column_name].unique())
col1a = set(df[column_name].unique())
combined_cats = list(col1a.union(col1b))
df[column_name] = df[column_name].astype('category', categories=combined_cats,copy=False)
df2[column_name] = df2[column_name].astype('category', categories=combined_cats,copy=False)
df = pd.get_dummies(df, columns=[column_name],sparse=sparse)
df2 = pd.get_dummies(df2, columns=[column_name],sparse=sparse)
try:
del df[column_name]
del df2[column_name]
except:
pass
return df,df2
df = pd.DataFrame({"col1":['a','b','c','d'],"col2":["potato","tomato","potato","tomato"],"col3":[1,1,1,1]})
df2 = pd.DataFrame({"col1":['g','b','q','r'],"col2":["potato","flowers","potato","flowers"],"col3":[1,1,1,1]})
## Hot encode col1
df,df2 = hot_encode_column_in_both_datasets("col1",df,df2)
len(df.columns) #9
len(df2.columns) #9
## Hot encode col2 as well
df,df2 = hot_encode_column_in_both_datasets("col2",df,df2)
Traceback (most recent call last):
File "<ipython-input-44-d8e27874a25b>", line 1, in <module>
df,df2 = hot_encode_column_in_both_datasets("col2",df,df2)
File "<ipython-input-34-5ae1e71bbbd5>", line 331, in hot_encode_column_in_both_datasets
df[column_name] = df[column_name].astype('category', categories=combined_cats,copy=False)
File "/storage/programfiles/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 2419, in __setitem__
self._set_item(key, value)
File "/storage/programfiles/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 2485, in _set_item
value = self._sanitize_column(key, value)
File "/storage/programfiles/anaconda3/lib/python3.5/site-packages/pandas/sparse/frame.py", line 324, in _sanitize_column
clean = value.reindex(self.index).as_sparse_array(
File "/storage/programfiles/anaconda3/lib/python3.5/site-packages/pandas/sparse/series.py", line 573, in reindex
return self.copy()
File "/storage/programfiles/anaconda3/lib/python3.5/site-packages/pandas/sparse/series.py", line 555, in copy
return self._constructor(new_data, sparse_index=self.sp_index,
File "/storage/programfiles/anaconda3/lib/python3.5/site-packages/pandas/core/generic.py", line 2744, in __getattr__
return object.__getattribute__(self, name)
File "/storage/programfiles/anaconda3/lib/python3.5/site-packages/pandas/sparse/series.py", line 242, in sp_index
return self.block.sp_index
AttributeError: 'CategoricalBlock' object has no attribute 'sp_index'