I am trying to implement a lasso model for house pricing, but its predicting 0.00 for r2_score for test and train data. Can anyone please help me where I am going wrong.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
train = pd.read_csv(r"C:\train - Copy.csv")
test = pd.read_csv(r"C:\test.csv")
# CHECKING SHAPE OF THE DATA
print("Train Data shape", train.shape,"\n Test Data shape",test.shape)
# Save the 'Id' column
train_ID = train['id']
test_ID = test['id']
# Droping COLUMNS WHICH HAS NO IMPACT ON DATA
train = train.drop(['id', 'thumbnail_url'], axis=1)
test = test.drop(['id', 'thumbnail_url'], axis=1)
# Check data size after dropping no impact variables
print("\nThe train data size after dropping features is : {} ".format(train.shape))
print("The test data size after dropping featurea is : {} ".format(test.shape))
# Checking Categorical Data
C_data = train.select_dtypes(include=['object']).columns
print("Categorical Data", C_data)
# Checking Numerical Data
N_data = train.select_dtypes(include=['int64', 'float64']).columns
print("Numerical Data", N_data)
# Combining Datasets
ntrain = train.shape[0]
ntest = test.shape[0]
#y_train = train.log_price.values
y = train.log_price.values
print(ntrain)
print(ntest)
print(y)
all_data = pd.concat((train, test),sort='true').reset_index(drop=True)
print(all_data.shape)
all_data = all_data.drop(['log_price'], axis=1)
print(all_data.shape)
# Find Missing Ratio of Dataset
null_values = all_data.isnull().sum()
# print(null_values)
# IMPUTING NULL VALUES
all_data = all_data.dropna(subset=['host_since'])
all_data['bathrooms'] = all_data['bathrooms'].fillna(all_data['bathrooms'].mean())
all_data['bedrooms'] = all_data['bedrooms'].fillna(all_data['bedrooms'].mean())
all_data['beds'] = all_data['beds'].fillna(all_data['beds'].mean())
all_data['review_scores_rating'] = all_data['review_scores_rating'].fillna(all_data['review_scores_rating'].mean())
all_data['host_response_rate'] = all_data['host_response_rate'].fillna('None')
all_data['neighbourhood'] = all_data['neighbourhood'].fillna('None')
all_data['host_has_profile_pic'] = all_data['host_has_profile_pic'].fillna('f')
all_data['host_identity_verified'] = all_data['host_identity_verified'].fillna('f')
all_data['description'] = all_data['description'].fillna('None')
all_data['first_review'] = all_data['first_review'].fillna('None')
all_data['last_review'] = all_data['last_review'].fillna('None')
all_data['name'] = all_data['name'].fillna('None')
all_data['zipcode'] = all_data['zipcode'].fillna('None')
# Check if Missing values left
post_null_values = all_data.isnull().sum().sum()
print("post_null_values\n", post_null_values)
print("-----------------------------------------------------------------------------------------------")
# apply LabelEncoder to categorical features
from sklearn.preprocessing import LabelEncoder
cols = ('property_type', 'room_type', 'amenities', 'bed_type',
'cancellation_policy', 'city', 'description', 'first_review',
'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
'host_since', 'instant_bookable', 'last_review', 'name',
'neighbourhood', 'zipcode')
for c in cols:
lbl = LabelEncoder()
lbl.fit(list(all_data[c].values))
all_data[c] = lbl.transform(list(all_data[c].values))
# creating matrices for sklearn:
X = all_data[:ntrain]
test_values = all_data[ntrain:]
print("X col", X.columns, "X shape", X.shape)
# import train test split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
#clf = LinearRegression()
clf = Lasso()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)
from sklearn.metrics import r2_score
print("Train acc: " , r2_score(y_train, y_train_pred))
print("Test acc: ", r2_score(y_test, y_pred))
from sklearn.metrics import mean_squared_error
print("Train acc: " , clf.score(X_train, y_train))
print("Test acc: ", clf.score(X_test, y_test))
Output: Train acc: 0.0001732000413904311 Test acc: 0.00011093390171657003 Train acc: 0.0001732000413904311 Test acc: 0.00011093390171657004