Is this the best way to work with pandas and a vectorizer: converting a DataFrame to a dict, vectorizing it, and putting the result into a new DataFrame? Or is there a better way to do this?
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

# Load the tab-separated AmesHousing.txt data into a DataFrame.
data = pd.read_csv('AmesHousing.txt', encoding='UTF-8', delimiter='\t')
# Replace every missing value with 0 before vectorizing.
data = data.fillna(0)

# DictVectorizer consumes one {column: value} mapping per row and returns a
# dense integer matrix here (sparse=False, dtype=int).
vec = DictVectorizer(sparse=False, dtype=int)
# orient='records' yields the per-row dicts directly, in row order, without
# transposing the whole frame first.
encoded = vec.fit_transform(data.to_dict(orient='records'))

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# Pass the names as a flat sequence: wrapping them in another list makes
# pandas build a one-level MultiIndex whose labels are tuples, not strings.
df = pd.DataFrame(encoded, columns=list(feature_names))
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
# Hold out 33% of the rows for testing and train on the remaining 67%.
# train_test_split returns the training subset first, then the test subset,
# so the unpacking order must be (train, test).
train, test = train_test_split(df, test_size=0.33, random_state=42)

# Fit a linear regression on every column except the target, SalePrice.
model = LinearRegression()
model.fit(train.drop(['SalePrice'], axis=1), train[['SalePrice']])

# Evaluate on the held-out rows; mean_squared_error takes (y_true, y_pred).
predict = model.predict(test.drop(['SalePrice'], axis=1))
MSE = mean_squared_error(test[['SalePrice']], predict)
RMSE = np.sqrt(MSE)
print('MSE:', MSE, 'RMSE:', RMSE)