```python
# Import the data-manipulation library
import pandas as pd
# Import the numerical library
import numpy as np
```
Linear Regression and Variable Selection Techniques
MTCARS Dataset
The MTCARS dataset is a well-known dataset in the field of statistics and machine learning. It contains data extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973–74 models).
Variables/Columns:

- mpg: Miles/(US) gallon
- cyl: Number of cylinders
- disp: Displacement (cu.in.)
- hp: Gross horsepower
- drat: Rear axle ratio
- wt: Weight (1000 lbs)
- qsec: 1/4 mile time
- vs: Engine (0 = V-shaped, 1 = straight)
- am: Transmission (0 = automatic, 1 = manual)
- gear: Number of forward gears
- carb: Number of carburetors
This dataset is often used for regression analysis and various machine learning tasks to predict fuel efficiency (mpg) based on the other variables.

mpg measures fuel efficiency in miles per gallon. In this context, we will use mpg as the target variable (y) and the other variables as predictors (X).
Modeling Workflow
1. Import data to Python
2. Data Preprocessing
3. Training Machine Learning Models
4. Test Prediction
Import Data to Python
```python
# Create a function to read the data
def read_data(fname):
    data = pd.read_csv(fname)
    print('Data shape raw :', data.shape)
    print('Number of duplicate :', data.duplicated().sum())
    data = data.drop_duplicates()
    print('Data shape after dropping :', data.shape)
    print('Data shape final :', data.shape)
    return data

# Read the mtcars data
data = read_data(fname='mtcars.csv')
data = data.drop(columns=["model"])
data
```
Data shape raw : (32, 12)
Number of duplicate : 0
Data shape after dropping : (32, 12)
Data shape final : (32, 12)
mpg  cyl  disp  hp  drat  wt  qsec  vs  am  gear  carb
0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
11 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
12 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
13 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
14 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
15 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
16 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
20 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
21 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
22 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
23 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
24 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
26 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
27 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
28 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
29 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
Data Preprocessing
```python
# Create input & output
def split_input_output(data, target_column):
    X = data.drop(columns=target_column)
    y = data[target_column]
    return X, y

X, y = split_input_output(data=data,
                          target_column="mpg")
```
```python
# Split train & test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=123)
X_train.head()
```
|    | cyl | disp  | hp  | drat | wt    | qsec  | vs | am | gear | carb |
|----|-----|-------|-----|------|-------|-------|----|----|------|------|
| 21 | 8   | 318.0 | 150 | 2.76 | 3.520 | 16.87 | 0  | 0  | 3    | 2    |
| 11 | 8   | 275.8 | 180 | 3.07 | 4.070 | 17.40 | 0  | 0  | 3    | 3    |
| 23 | 8   | 350.0 | 245 | 3.73 | 3.840 | 15.41 | 0  | 0  | 3    | 4    |
| 18 | 4   | 75.7  | 52  | 4.93 | 1.615 | 18.52 | 1  | 1  | 4    | 2    |
| 4  | 8   | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0  | 0  | 3    | 2    |
```python
# Check for missing values
X_train.isnull().sum()
```
cyl 0
disp 0
hp 0
drat 0
wt 0
qsec 0
vs 0
am 0
gear 0
carb 0
dtype: int64
Training Machine Learning Models - Linear Regression
```python
from sklearn.dummy import DummyRegressor

# Create the object
baseline_model = DummyRegressor(strategy='mean')

# Fit the object
baseline_model.fit(X_train, y_train)

y_train_pred = baseline_model.predict(X_train)
y_train_pred
```
array([19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3,
19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3,
19.3, 19.3, 19.3])
```python
from sklearn.metrics import mean_squared_error

mse_baseline_train = mean_squared_error(y_true=y_train,
                                        y_pred=y_train_pred)
print(mse_baseline_train)
```
37.193599999999996
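Since the baseline predicts the training mean everywhere, its training MSE should equal the population variance of y_train. A quick sanity check (a sketch; np.var uses ddof=0 by default):

```python
# A mean-only baseline's training MSE equals the population variance
# (ddof=0) of the training target, so this should print ~37.19.
print(np.var(y_train))
```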
```python
# Perform cross-validation
from sklearn.model_selection import cross_val_score

scores_baseline = cross_val_score(estimator=baseline_model,
                                  X=X_train,
                                  y=y_train,
                                  cv=5,
                                  scoring='neg_mean_squared_error')

mse_baseline_cv = -np.mean(scores_baseline)
mse_baseline_cv
```
39.23434999999999
```python
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
```
LinearRegression()
```python
# Predict y_train
y_train_pred = lr.predict(X_train)

# Compute the MSE on the training data
mse_lr_train = mean_squared_error(y_true=y_train,
                                  y_pred=y_train_pred)
print(mse_lr_train)
```
3.8855506573917955
```python
# Perform cross-validation
scores_lr = cross_val_score(estimator=lr,
                            X=X_train,
                            y=y_train,
                            cv=5,
                            scoring="neg_mean_squared_error")

mse_lr_cv = -np.mean(scores_lr)
mse_lr_cv
```
23.42111033958928
```python
model_summary = pd.DataFrame({"Model Name": ['Baseline', 'LinearRegression'],
                              "Model": [baseline_model, lr],
                              "MSE Train": [mse_baseline_train, mse_lr_train],
                              "MSE CV": [mse_baseline_cv, mse_lr_cv]})
model_summary
```
|   | Model Name       | Model              | MSE Train | MSE CV   |
|---|------------------|--------------------|-----------|----------|
| 0 | Baseline         | DummyRegressor()   | 37.193600 | 39.23435 |
| 1 | LinearRegression | LinearRegression() | 3.885551  | 23.42111 |
Test Prediction
```python
# Check the test score
y_pred_test = lr.predict(X_test)

# Compute the MSE on the test data
test_score = mean_squared_error(y_true=y_test,
                                y_pred=y_pred_test)
test_score
```
16.511353509876333
```python
# Extract the model parameters
coef_ = lr.coef_
intercept_ = lr.intercept_
lr_params = np.append(coef_, intercept_)

lr_params = pd.DataFrame(lr_params,
                         index=list(X_train.columns) + ["constant"],
                         columns=["coefficient"])
lr_params
```
|          | coefficient |
|----------|-------------|
| cyl      | 1.097167    |
| disp     | 0.006168    |
| hp       | -0.005287   |
| drat     | 1.820345    |
| wt       | -3.668249   |
| qsec     | 2.139810    |
| vs       | 0.851392    |
| am       | 5.880618    |
| gear     | -1.092819   |
| carb     | 0.032545    |
| constant | -19.270044  |
```python
def fit_model(estimator, X_train, y_train):
    """Fit a model, then evaluate it on the training data and with CV."""
    # 1. Fit the model
    estimator.fit(X_train, y_train)

    # 2. Evaluate on the training data & with cross-validation
    y_pred_train = estimator.predict(X_train)
    train_score = mean_squared_error(y_true=y_train,
                                     y_pred=y_pred_train)

    valid_scores = cross_val_score(estimator=estimator,
                                   X=X_train,
                                   y=y_train,
                                   cv=5,
                                   scoring='neg_mean_squared_error')
    cv_score = -np.mean(valid_scores)

    # 3. Extract the coefficients
    coef_ = estimator.coef_
    intercept_ = estimator.intercept_
    estimator_params = np.append(coef_, intercept_)

    estimator_params_df = pd.DataFrame(estimator_params,
                                       index=list(X_train.columns) + ["constant"],
                                       columns=["coefficient"])

    return estimator, train_score, cv_score, estimator_params_df
```
The negative sign is needed because scikit-learn reports errors as negative values to stay consistent with its other metrics (where higher is better). We therefore multiply by -1 to recover the MSE as a proper (positive) value.
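A minimal illustration of this sign convention, reusing the scores_baseline array computed above:

```python
# cross_val_score with scoring='neg_mean_squared_error' returns
# non-positive values; negating their mean recovers the positive MSE.
print(scores_baseline)            # every fold score is <= 0
print(-np.mean(scores_baseline))  # positive CV MSE (~39.23)
```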
```python
from sklearn.linear_model import LinearRegression

lr, train_score, cv_score, lr_params_df = fit_model(estimator=LinearRegression(),
                                                    X_train=X_train,
                                                    y_train=y_train)

print(f"train score: {train_score:.3f}, cv score: {cv_score:.3f}")
```
train score: 3.886, cv score: 23.421
Subset Selection
```python
import statsmodels.api as sm
from itertools import combinations

column_list = list(X_train.columns)
n_column = len(column_list)

column_list
```
['cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
```python
train_column_list = []

for i in range(n_column):
    list_of_combination = combinations(column_list, i)
    for combi in list_of_combination:
        train_column_list.append(list(combi))

# add the full set of columns
train_column_list.append(column_list)
len(train_column_list)
```
1024
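As a sanity check: each of the 10 predictors is either in or out of a subset, so there are 2^10 = 1024 candidate subsets (including the empty one), matching the count above:

```python
from math import comb

# Sum of C(10, k) for k = 0..10 equals 2**10 = 1024.
print(sum(comb(10, k) for k in range(11)))
```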
```python
idx = 95
train_list_idx = train_column_list[idx]
train_list_idx
```
['disp', 'hp', 'vs']
```python
train_score = []
cv_score = []

for idx in range(len(train_column_list)):
    if idx != 0:
        # Filter the data
        train_list_idx = train_column_list[idx]
        X_train_idx = X_train[train_list_idx]

        # Fit the model
        _, train_idx, cv_idx, _ = fit_model(estimator=LinearRegression(),
                                            X_train=X_train_idx,
                                            y_train=y_train)

        # Store the results
        train_score.append(train_idx)
        cv_score.append(cv_idx)
```
```python
import matplotlib.pyplot as plt

# Plot the results
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))

ax.boxplot([train_score, cv_score])
ax.set_xticklabels(["TRAIN", "CV"])
ax.set_ylabel("MSE")
plt.show()
```
```python
# Find the best score on the validation data
best_score = np.min(cv_score)
best_idx = np.argmin(cv_score)

best_idx, best_score
```
(477, 7.989577252522926)
```python
# Best features (note: the loop above skipped index 0, the empty
# subset, so cv_score[i] corresponds to train_column_list[i + 1])
train_column_list[best_idx + 1]
```
['cyl', 'drat', 'wt', 'qsec', 'am']
```python
# Find the best model
lr_best, train_best_score, \
    cv_best_score, lr_params_best = fit_model(estimator=LinearRegression(),
                                              X_train=X_train[train_column_list[best_idx + 1]],
                                              y_train=y_train)

print('Train score :', train_best_score)
print('CV score :', cv_best_score)
```
Train score : 4.113333306973907
CV score : 7.989577252522926
```python
lr_params_best
```

|          | coefficient |
|----------|-------------|
| cyl      | 1.282553    |
| drat     | 1.639653    |
| wt       | -3.579665   |
| qsec     | 2.539308    |
| am       | 4.763395    |
| constant | -29.508518  |
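Since statsmodels was imported above but not otherwise used, an optional cross-check (a sketch, not part of the original workflow) is to refit the selected subset with sm.OLS to get standard errors and p-values for these coefficients:

```python
# Refit the selected feature subset with statsmodels OLS to obtain a
# full inferential summary (coefficient estimates, std errors, p-values).
X_best = sm.add_constant(X_train[train_column_list[best_idx + 1]])
ols_result = sm.OLS(y_train, X_best).fit()
print(ols_result.summary())
```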
Test Prediction
```python
# Check the test score
y_pred_test = lr_best.predict(X_test[train_column_list[best_idx + 1]])

# Compute the MSE on the test data
test_score = mean_squared_error(y_true=y_test,
                                y_pred=y_pred_test)
test_score
```
17.675913225570763
Conclusion 1
```python
best_scores_df = pd.DataFrame({"Model": ["Baseline", "OLS full features", "OLS best features"],
                               "CV Scores": [mse_baseline_cv, mse_lr_cv, cv_best_score]})
best_scores_df
```
|   | Model             | CV Scores |
|---|-------------------|-----------|
| 0 | Baseline          | 39.234350 |
| 1 | OLS full features | 23.421110 |
| 2 | OLS best features | 7.989577  |
The best-features model's test MSE is slightly larger than that of the OLS full-features model. Which is better? We have to weigh the trade-off between model complexity and performance: if performance does not improve significantly, we may prefer the simpler model.

Remember the bias-variance trade-off!
Shrinkage Methods: Ridge, Lasso, Elastic Net
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
Ridge
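For reference, per the scikit-learn documentation, Ridge fits ordinary least squares with an added L2 penalty on the coefficients:

$$\min_w \; \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_2^2$$

Larger alpha values shrink the coefficients more strongly toward zero; alpha = 0 would recover OLS.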
```python
alphas = [0.5, 1.0, 2.5, 5.0, 7.5, 10.0,
          12.5, 15.0, 17.5, 30.0, 50.0]
```
```python
mse_train_list = []
mse_cv_list = []
model_list = []

for alpha in alphas:
    model_i, train_score_i, \
        cv_score_i, model_param_i = fit_model(estimator=Ridge(alpha=alpha),
                                              X_train=X_train,
                                              y_train=y_train)

    mse_train_list.append(train_score_i)
    mse_cv_list.append(cv_score_i)
    model_list.append(model_param_i)
```
```python
# Plot the errors
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))

ax.plot(alphas, mse_train_list, c="r", marker=".", label="Train")
ax.plot(alphas, mse_cv_list, c="g", marker=".", label="CV")

ax.set_xlabel("alpha")
ax.set_ylabel("MSE")

plt.grid()
plt.legend()
plt.show()
```
```python
# The best parameter is where the CV MSE is smallest
best_idx = np.argmin(mse_cv_list)
best_alpha = alphas[best_idx]
best_ridge_cv = mse_cv_list[best_idx]

best_alpha, best_ridge_cv
```
(2.5, 10.789408042167233)
```python
# Best model
best_param_ridge = model_list[best_idx]
best_param_ridge
```
|          | coefficient |
|----------|-------------|
| cyl      | -0.453721   |
| disp     | -0.005147   |
| hp       | -0.007531   |
| drat     | 1.025724    |
| wt       | -1.539528   |
| qsec     | 0.624478    |
| vs       | -0.126656   |
| am       | 1.701397    |
| gear     | 0.680265    |
| carb     | -0.731590   |
| constant | 14.389671   |
A faster way: GridSearchCV
```python
# Create the model & the parameter grid to search over
ridge = Ridge()

param_space = {"alpha": alphas}
param_space
```
{'alpha': [0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 30.0, 50.0]}
```python
# Perform a grid search with CV
cv_ridge = GridSearchCV(estimator=ridge,
                        param_grid=param_space,
                        scoring="neg_mean_squared_error",
                        cv=5)

# Fit the search
cv_ridge.fit(X=X_train,
             y=y_train)
```
GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 30.0, 50.0]},
             scoring='neg_mean_squared_error')
cv_ridge.best_params_
{'alpha': 2.5}
```python
# Create a new object with the best alpha
best_ridge = Ridge(alpha=cv_ridge.best_params_["alpha"])

# Fit the model
best_ridge.fit(X=X_train,
               y=y_train)
```
Ridge(alpha=2.5)
best_ridge.coef_
array([-0.45372081, -0.00514687, -0.00753064, 1.02572436, -1.53952835,
0.62447844, -0.12665636, 1.70139692, 0.68026538, -0.73159023])
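As a side note, GridSearchCV refits the best configuration on the full training set by default (refit=True), so the manual refit above can also be skipped:

```python
# best_estimator_ is the best model already refit on all of X_train;
# its coefficients should match best_ridge above.
print(cv_ridge.best_estimator_.coef_)
```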
Lasso
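Lasso replaces the L2 penalty with an L1 penalty. Per the scikit-learn documentation, it minimizes

$$\min_w \; \frac{1}{2 n_{\text{samples}}} \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_1$$

Unlike the L2 penalty, the L1 penalty can set coefficients exactly to zero, so Lasso also performs feature selection.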
```python
alphas = [0.05, 0.10, 0.15, 0.20, 0.25, 1.00,
          1.25, 1.50, 1.75, 3.00, 5.00]

mse_train_list = []
mse_cv_list = []
model_list = []

for alpha in alphas:
    model_i, train_score_i, \
        cv_score_i, model_param_i = fit_model(estimator=Lasso(alpha=alpha),
                                              X_train=X_train,
                                              y_train=y_train)

    mse_train_list.append(train_score_i)
    mse_cv_list.append(cv_score_i)
    model_list.append(model_param_i)
```
```python
# Plot the errors
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))

ax.plot(alphas, mse_train_list, c="r", marker=".", label="Train")
ax.plot(alphas, mse_cv_list, c="g", marker=".", label="CV")

ax.set_xlabel("alpha")
ax.set_ylabel("MSE")

plt.grid()
plt.legend()
plt.show()
```
```python
# The best parameter is where the CV MSE is smallest
best_idx = np.argmin(mse_cv_list)
best_alpha = alphas[best_idx]
best_lasso_cv = mse_cv_list[best_idx]

best_alpha, best_lasso_cv
```
(0.15, 12.976374619574276)
```python
# Best model
best_param_lasso = model_list[best_idx]
best_param_lasso
```
|          | coefficient |
|----------|-------------|
| cyl      | -0.092881   |
| disp     | -0.006417   |
| hp       | -0.010095   |
| drat     | 0.668991    |
| wt       | -1.819467   |
| qsec     | 0.767749    |
| vs       | 0.000000    |
| am       | 2.808862    |
| gear     | 0.000000    |
| carb     | -0.506165   |
| constant | 13.712239   |
With GridSearchCV
```python
# Create the model & the parameter grid to search over
lasso = Lasso()
param_space = {"alpha": alphas}

# Perform a grid search with CV
cv_lasso = GridSearchCV(estimator=lasso,
                        param_grid=param_space,
                        scoring="neg_mean_squared_error",
                        cv=5)

# Fit the search
cv_lasso.fit(X=X_train,
             y=y_train)
```
GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': [0.05, 0.1, 0.15, 0.2, 0.25, 1.0, 1.25, 1.5, 1.75, 3.0, 5.0]},
             scoring='neg_mean_squared_error')
cv_lasso.best_params_
{'alpha': 0.15}
```python
# Create a new object with the best alpha
best_lasso = Lasso(alpha=cv_lasso.best_params_["alpha"])

# Fit the model
best_lasso.fit(X=X_train,
               y=y_train)
```
Lasso(alpha=0.15)
best_lasso.coef_
array([-0.09288145, -0.00641718, -0.01009455, 0.66899118, -1.81946701,
0.76774888, 0. , 2.80886172, 0. , -0.50616465])
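Note the two exact zeros in the coefficient vector: Lasso has effectively removed those predictors. A quick way to list them:

```python
# Predictors whose Lasso coefficient was shrunk exactly to zero
# (here: vs and gear).
print(list(X_train.columns[best_lasso.coef_ == 0]))
```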
Elastic Net
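ElasticNet combines both penalties. Per the scikit-learn documentation, it minimizes

$$\min_w \; \frac{1}{2 n_{\text{samples}}} \lVert y - Xw \rVert_2^2 + \alpha \rho \lVert w \rVert_1 + \frac{\alpha (1 - \rho)}{2} \lVert w \rVert_2^2$$

where $\rho$ is the l1_ratio parameter: $\rho = 1$ recovers Lasso, while $\rho = 0$ gives a Ridge-like penalty.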
Directly with GridSearchCV
```python
alphas = [0.05, 0.10, 0.15, 0.20, 0.25, 1.00, 1.25, 1.50, 1.75, 3.00, 5.00]
l1_ratios = [0.1, 0.5, 0.7, 0.9, 1.0]
```
```python
# Create the model & the parameter grid to search over
elastic = ElasticNet()
param_space = {"alpha": alphas,
               "l1_ratio": l1_ratios}

# Perform a grid search with CV
cv_elastic = GridSearchCV(estimator=elastic,
                          param_grid=param_space,
                          scoring="neg_mean_squared_error",
                          cv=5)

# Fit the search
cv_elastic.fit(X=X_train,
               y=y_train)
```
GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [0.05, 0.1, 0.15, 0.2, 0.25, 1.0, 1.25, 1.5, 1.75, 3.0, 5.0],
                         'l1_ratio': [0.1, 0.5, 0.7, 0.9, 1.0]},
             scoring='neg_mean_squared_error')
cv_elastic.best_params_
{'alpha': 0.15, 'l1_ratio': 0.1}
```python
# Create a new object with the best parameters
best_elastic = ElasticNet(alpha=cv_elastic.best_params_["alpha"],
                          l1_ratio=cv_elastic.best_params_["l1_ratio"])

# Fit the model
best_elastic.fit(X=X_train,
                 y=y_train)
```
ElasticNet(alpha=0.15, l1_ratio=0.1)
best_elastic.coef_
array([-0.52408861, -0.00769543, -0.00878001, 0.86008691, -1.28146081,
0.42166796, -0. , 1.41763367, 0.61949478, -0.74413128])
```python
# Check the best CV score from cv_elastic
best_elastic_cv = -cv_elastic.best_score_
best_elastic_cv
```
10.847311992084979
Conclusion 2
```python
best_params = pd.concat([lr_params_df,
                         lr_params_best,
                         best_param_ridge,
                         best_param_lasso,
                         pd.DataFrame(best_elastic.coef_, index=X_train.columns, columns=["ElasticNet"])],
                        axis=1)
best_params.columns = ["OLS full features", "OLS best features", "Ridge", "Lasso", "ElasticNet"]
best_params
```
|          | OLS full features | OLS best features | Ridge     | Lasso     | ElasticNet |
|----------|-------------------|-------------------|-----------|-----------|------------|
| cyl      | 1.097167          | 1.282553          | -0.453721 | -0.092881 | -0.524089  |
| disp     | 0.006168          | NaN               | -0.005147 | -0.006417 | -0.007695  |
| hp       | -0.005287         | NaN               | -0.007531 | -0.010095 | -0.008780  |
| drat     | 1.820345          | 1.639653          | 1.025724  | 0.668991  | 0.860087   |
| wt       | -3.668249         | -3.579665         | -1.539528 | -1.819467 | -1.281461  |
| qsec     | 2.139810          | 2.539308          | 0.624478  | 0.767749  | 0.421668   |
| vs       | 0.851392          | NaN               | -0.126656 | 0.000000  | -0.000000  |
| am       | 5.880618          | 4.763395          | 1.701397  | 2.808862  | 1.417634   |
| gear     | -1.092819         | NaN               | 0.680265  | 0.000000  | 0.619495   |
| carb     | 0.032545          | NaN               | -0.731590 | -0.506165 | -0.744131  |
| constant | -19.270044        | -29.508518        | 14.389671 | 13.712239 | NaN        |
```python
# MSE scores
best_scores_df = pd.DataFrame({"Model": ["Baseline", "OLS full features", "OLS best features", "Ridge", "Lasso", "ElasticNet"],
                               "MSE": [mse_baseline_cv, mse_lr_cv, cv_best_score, best_ridge_cv, best_lasso_cv, best_elastic_cv]})
best_scores_df
```
|   | Model             | MSE       |
|---|-------------------|-----------|
| 0 | Baseline          | 39.234350 |
| 1 | OLS full features | 23.421110 |
| 2 | OLS best features | 7.989577  |
| 3 | Ridge             | 10.789408 |
| 4 | Lasso             | 12.976375 |
| 5 | ElasticNet        | 10.847312 |
```python
# Check the test scores
def mse_model(estimator, X_test, y_test):
    # Predict
    y_pred = estimator.predict(X_test)

    # Compute the MSE
    mse = mean_squared_error(y_test, y_pred)

    return mse

mse_model(estimator=lr_best,
          X_test=X_test[lr_params_best.index[:-1]],
          y_test=y_test)
```
17.675913225570763
```python
mse_model(estimator=best_elastic,
          X_test=X_test,
          y_test=y_test)
```
6.514650231430356
```python
# Evaluate all models on the test set
test_scores = {
    "Baseline": mse_model(estimator=baseline_model, X_test=X_test, y_test=y_test),
    "OLS full features": mse_model(estimator=lr, X_test=X_test, y_test=y_test),
    "OLS best features": mse_model(estimator=lr_best, X_test=X_test[lr_params_best.index[:-1]], y_test=y_test),
    "Ridge": mse_model(estimator=best_ridge, X_test=X_test, y_test=y_test),
    "Lasso": mse_model(estimator=best_lasso, X_test=X_test, y_test=y_test),
    "ElasticNet": mse_model(estimator=best_elastic, X_test=X_test, y_test=y_test)
}

test_scores_df = pd.DataFrame(list(test_scores.items()), columns=["Model", "Test MSE"])
test_scores_df
```
|   | Model             | Test MSE  |
|---|-------------------|-----------|
| 0 | Baseline          | 30.887143 |
| 1 | OLS full features | 16.511354 |
| 2 | OLS best features | 17.675913 |
| 3 | Ridge             | 6.949301  |
| 4 | Lasso             | 7.244562  |
| 5 | ElasticNet        | 6.514650  |
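MSE is in squared mpg units; taking the square root (RMSE) puts the error back on the mpg scale, which is easier to interpret. A small sketch using the table above:

```python
# RMSE is in the original mpg units; e.g. the ElasticNet model is off
# by roughly 2.6 mpg on average on the test set.
test_scores_df["Test RMSE"] = np.sqrt(test_scores_df["Test MSE"])
test_scores_df
```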
References
https://scikit-learn.org/stable/modules/linear_model.html#