I have the code below; however, the last line of the label-encoder section,
X = MultiColumnLabelEncoder(columns = ['newlyConst','balcony', 'cellar', 'lift', 'garden', ]).fit_transform(df)
adds the y column (rent) to the X numpy array.
I'm unsure how to specify the columns to be encoded in another way that prevents this issue. For instance, when I pass the X NumPy array and the specific columns instead of going via df, I receive an IndexError.
Any help would be great, thanks!
Update: I replaced the long label encoder with a much more elegant solution, as noted by @Corralien -- in-depth info can be found here: Converting Pandas Types
The substitution:
# Cast each of the listed feature columns to int in a single astype call.
df = df.astype(
    dict.fromkeys(["newlyConst", "balcony", "cellar", "lift", "garden"], int)
)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
# Load the listings and discard the columns that are not used further down.
df = pd.read_csv('immo_data.csv')
df = df.drop(columns=[
    'telekomTvOffer', 'telekomHybridUploadSpeed', 'pricetrend',
    'telekomUploadSpeed', 'scoutId', 'noParkSpaces', 'yearConstructedRange',
    'houseNumber', 'interiorQual', 'petsAllowed', 'street', 'streetPlain',
    'baseRentRange', 'geo_plz', 'geo_bln', 'geo_krs', 'thermalChar', 'floor',
    'numberOfFloors', 'noRoomsRange', 'livingSpaceRange', 'regio3',
    'description', 'facilities', 'hasKitchen', 'heatingCosts',
    'energyEfficiencyClass', 'lastRefurbish', 'electricityBasePrice',
    'electricityKwhPrice', 'date', 'condition', 'typeOfFlat', 'serviceCharge',
    'heatingType', 'firingTypes', 'yearConstructed',
])

# Inspection helpers: the first 250 rows and the per-column NaN counts.
df_head = df.head(250)
df_nan_count = df.isna().sum()

# 'firingTypes', 'yearConstructed', 'condition' and 'typeOfFlat' had 40-50%+
# NaN values and are among the columns dropped above; every row that still
# holds a NaN is removed here.
df = df.dropna()
df3 = df.count()

# Keep the model columns in a fixed order, with the target ('totalRent') last.
df = df[[
    'regio1', 'newlyConst', 'balcony', 'picturecount', 'cellar',
    'livingSpace', 'lift', 'noRooms', 'garden', 'baseRent', 'totalRent',
]]
dfcount = df.nunique()

## Regression
# Features are every column except the last one; the target is the last one.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()
#Encoding Categorical Data
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
# NOTE(review): `le` is not referenced anywhere else in the visible code; the
# MultiColumnLabelEncoder class creates its own LabelEncoder per column.
le = LabelEncoder()
class MultiColumnLabelEncoder:
    """Label-encode several columns of a pandas DataFrame in one call.

    Only the selected columns are re-encoded; every other column of the
    input is returned unchanged. Pass only the feature columns if the
    target column should not appear in the result.
    """

    def __init__(self, columns=None):
        """Store the column names to encode (``None`` means every column)."""
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        """Return ``self`` unchanged; the encoders are fitted in ``transform``."""
        return self  # not relevant here

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X. A fresh LabelEncoder is fitted for each column,
        and the input ``X`` itself is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() yields
            # the same (column name, Series) pairs.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Call ``fit`` and then ``transform`` on ``X``; ``y`` is only passed to ``fit``."""
        return self.fit(X, y).transform(X)
# Label-encode the listed columns on the feature columns only. The encoder
# returns every column it is given, so passing the whole df would carry the
# target (the last column of df) into X; slicing it off keeps X features-only.
X = MultiColumnLabelEncoder(
    columns=['newlyConst', 'balcony', 'cellar', 'lift', 'garden']
).fit_transform(df.iloc[:, :-1])
# Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the column at position 0 and pass the remaining columns
# through unchanged. sparse_threshold=0 makes the transformer always return a
# dense array; a sparse result would be wrapped by np.array() into a 0-d
# object array instead of a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))
df
regio1 newlyConst balcony picturecount cellar livingSpace lift noRooms garden totalRent
0 Nordrhein_Westfalen 0 0 6 1 86.00 0 4.0 1 840.00
2 Sachsen 1 1 8 1 83.80 1 3.0 0 1300.00
4 Bremen 0 1 19 0 84.97 0 3.0 0 903.00
6 Sachsen 0 0 9 1 62.00 0 2.0 1 380.00
7 Bremen 0 1 5 1 60.30 0 3.0 0 584.25
8 Baden_Württemberg 0 0 5 1 53.00 0 2.0 0 690.00
10 Sachsen 0 1 11 1 40.20 0 2.0 0 307.00
11 Sachsen 0 0 9 1 80.00 0 3.0 1 555.00
12 Rheinland_Pfalz 0 0 4 0 100.00 0 4.0 1 920.00
13 Nordrhein_Westfalen 0 0 3 0 123.44 0 4.0 0 1150.00