from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
df = pd.DataFrame({
'Country': ['AB', 'CD', 'EF', 'FG']*20,
'ColumnA' : [1]*20*4,'ColumnB' : [10]*20*4, 'Label': [1,0,1,0]*20
})
df['Country_Code'] = df['Country'].astype('category').cat.codes
X = df.loc[:, df.columns.drop(['Label','Country'])]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=df.Country_Code)
lm = LinearRegression()
lm.fit(X_train,y_train)
lm_predictions = lm.predict(X_test)
- Convert the string values in
country
to numbers and save it as a new column
- When creating
x
train data drop label
(y
) and also the string country
columns
Method 2
If your test data on which you will make predictions will come later, you will need a mechanism to convert their country
into code
before making predictions. The recommended way in such a cases is to use LabelEncoder
on which you can use fit
method to encode strings to labels and later use transform
to encode the country of test data.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
df = pd.DataFrame({
'Country': ['AB', 'CD', 'EF', 'FG']*20,
'ColumnA' : [1]*20*4,'ColumnB' : [10]*20*4, 'Label': [1,0,1,0]*20
})
# Train-Validation
le = preprocessing.LabelEncoder()
df['Country_Code'] = le.fit_transform(df['Country'])
X = df.loc[:, df.columns.drop(['Label','Country'])]
y = df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=df.Country_Code)
lm = LinearRegression()
lm.fit(X_train,y_train)
# Test
test_df = pd.DataFrame({'Country': ['AB'], 'ColumnA' : [1],'ColumnB' : [10] })
test_df['Country_Code'] = le.transform(test_df['Country'])
print (lm.predict(test_df.loc[:, test_df.columns.drop(['Country'])]))