import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
pd.set_option("display.max_columns", None)
df = pd.read_csv(r"C:\Users\kiaab\Downloads\FuelConsumption.csv.csv")
#print(df.head(6))
cdf = df[['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_COMB',
'CO2EMISSIONS']]
#print(cdf.head(9))
#plt.scatter(cdf.ENGINESIZE, cdf.CO2EMISSIONS, color='black')
plt.xlabel("engine size")
plt.ylabel("Emission")
#plt.show()
msk = np.random.rand(len(df)) < 0.8
train = cdf[msk]
test = cdf[~msk]
#plt.scatter(train.ENGINESIZE, train.CO2EMISSIONS, color='blue')
plt.xlabel("engine size")
plt.ylabel("Emission")
#plt.show()
from sklearn import linear_model
regr = linear_model.LinearRegression()
x = np.asanyarray(train[['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_COMB']])
y = np.asanyarray(train[['CO2EMISSIONS']])
regr.fit(x, y)
#print('Coefficients: ', regr.coef_)
#print('Interceept: ', regr.intercept_)
y_hat = regr.predict(test[['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_COMB']])
y = np.asanyarray(test[['CO2EMISSIONS']])
print("Residual sum of squares: %.2f" % np.mean((y_hat - y) ** 2))
print('variance score: %.2f' % regr.score(x, y))
I am testing my model and in the last step, I got the below error and do not know how to solve it.
ValueError: Found input variables with inconsistent numbers of samples: [225, 842]