You could define a custom scorer with the signature func(estimator, X, y), as suggested in this answer. In your case the custom scorer would be:
import numpy as np

def r2_score_adj(estimator, X, y):
    y_pred = estimator.predict(X)
    if estimator.fit_intercept:
        # centered total sum of squares; one degree of freedom lost to the intercept
        rsquared = 1 - np.nansum((y - y_pred) ** 2) / np.nansum((y - np.nanmean(y)) ** 2)
        rsquared_adj = 1 - (X.shape[0] - 1) / (X.shape[0] - X.shape[1] - 1) * (1 - rsquared)
    else:
        # uncentered total sum of squares; no degree of freedom lost to the intercept
        rsquared = 1 - np.nansum((y - y_pred) ** 2) / np.nansum(y ** 2)
        rsquared_adj = 1 - X.shape[0] / (X.shape[0] - X.shape[1]) * (1 - rsquared)
    return rsquared_adj
which matches statsmodels' adjusted R-squared definition, both with and without an intercept:
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
X, y = make_regression(n_samples=100, n_features=10, noise=50, random_state=42)
# statsmodels with intercept
reg1 = sm.OLS(exog=sm.add_constant(X), endog=y).fit()
print(reg1.rsquared_adj)
# 0.9313017447410593
# scikit-learn with intercept
reg2 = LinearRegression(fit_intercept=True).fit(X, y)
print(r2_score_adj(reg2, X, y))
# 0.9313017447410593
# statsmodels without intercept
reg3 = sm.OLS(exog=X, endog=y).fit()
print(reg3.rsquared_adj)
# 0.9307276380801821
# scikit-learn without intercept
reg4 = LinearRegression(fit_intercept=False).fit(X, y)
print(r2_score_adj(reg4, X, y))
# 0.930727638080182
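Since the scorer is just a callable with the (estimator, X, y) signature, it should also plug into any scikit-learn utility that accepts such a callable. A minimal sketch with cross_val_score, reusing the X, y and r2_score_adj defined above:
from sklearn.model_selection import cross_val_score

# pass the callable directly as the scoring argument
scores = cross_val_score(LinearRegression(), X, y, scoring=r2_score_adj, cv=2)
print(scores.mean())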
You can then use the custom scorer in mlxtend's SequentialFeatureSelector as follows:
from mlxtend.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(
    estimator=LinearRegression(),
    k_features=2,
    forward=True,
    scoring=r2_score_adj,
    cv=2
)
sfs.fit(X, y)
print(sfs.subsets_)
# {1: {'feature_idx': (4,), 'cv_scores': array([0.11337299, 0.11996526]), 'avg_score': 0.1166691229065483, 'feature_names': ('4',)}, 2: {'feature_idx': (4, 9), 'cv_scores': array([0.20589701, 0.38117558]), 'avg_score': 0.2935362938943045, 'feature_names': ('4', '9')}}
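After fitting, the selected feature indices and the reduced design matrix can be pulled out in the usual mlxtend way; a short sketch based on the fit above:
# indices of the features chosen at k_features=2
print(sfs.k_feature_idx_)
# (4, 9)

# reduce X to the selected columns
X_selected = sfs.transform(X)
print(X_selected.shape)
# (100, 2)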