```python
# Import the data-manipulation library
import pandas as pd
# Import the numerical library
import numpy as np
```
Linear Regression and Variable Selection Techniques
MTCARS Dataset
The MTCARS dataset is a well-known dataset in the field of statistics and machine learning. It contains data extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973–74 models).
Variables/Columns:

- mpg: Miles/(US) gallon
- cyl: Number of cylinders
- disp: Displacement (cu.in.)
- hp: Gross horsepower
- drat: Rear axle ratio
- wt: Weight (1000 lbs)
- qsec: 1/4 mile time
- vs: Engine (0 = V-shaped, 1 = straight)
- am: Transmission (0 = automatic, 1 = manual)
- gear: Number of forward gears
- carb: Number of carburetors
This dataset is often used for regression analysis and various machine learning tasks to predict fuel efficiency (mpg) based on the other variables.

mpg measures fuel efficiency in miles per gallon. In this context, we will use mpg as the target variable (y) and the other variables as predictors (X).
Modeling Workflow
1. Import data to Python
2. Data Preprocessing
3. Training Machine Learning Models
4. Test Prediction
Import Data to Python
```python
# Create a function to read the data
def read_data(fname):
    data = pd.read_csv(fname)
    print('Data shape raw :', data.shape)
    print('Number of duplicate :', data.duplicated().sum())
    data = data.drop_duplicates()
    print('Data shape after dropping :', data.shape)
    print('Data shape final :', data.shape)
    return data

# Read the mtcars data
data = read_data(fname='mtcars.csv')
data = data.drop(columns=["model"])
data
```
Data shape raw : (32, 12)
Number of duplicate : 0
Data shape after dropping : (32, 12)
Data shape final : (32, 12)
mpg  cyl  disp  hp  drat  wt  qsec  vs  am  gear  carb
0 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
1 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
3 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
4 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
5 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
7 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
9 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
10 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
11 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
12 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
13 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
14 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
15 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
16 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
17 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
18 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
19 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
20 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
21 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
22 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
23 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
24 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
25 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
26 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
27 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
28 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
29 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
31 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
Data Preprocessing
```python
# Create input & output
def split_input_output(data, target_column):
    X = data.drop(columns=target_column)
    y = data[target_column]
    return X, y

X, y = split_input_output(data=data,
                          target_column="mpg")
```
```python
# Split train & test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=123)
X_train.head()
```
|    | cyl | disp  | hp  | drat | wt    | qsec  | vs | am | gear | carb |
|----|-----|-------|-----|------|-------|-------|----|----|------|------|
| 21 | 8   | 318.0 | 150 | 2.76 | 3.520 | 16.87 | 0  | 0  | 3    | 2    |
| 11 | 8   | 275.8 | 180 | 3.07 | 4.070 | 17.40 | 0  | 0  | 3    | 3    |
| 23 | 8   | 350.0 | 245 | 3.73 | 3.840 | 15.41 | 0  | 0  | 3    | 4    |
| 18 | 4   | 75.7  | 52  | 4.93 | 1.615 | 18.52 | 1  | 1  | 4    | 2    |
| 4  | 8   | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0  | 0  | 3    | 2    |
```python
# Check for missing values
X_train.isnull().sum()
```
cyl 0
disp 0
hp 0
drat 0
wt 0
qsec 0
vs 0
am 0
gear 0
carb 0
dtype: int64
Training Machine Learning Models - Linear Regression
```python
from sklearn.dummy import DummyRegressor

# Create the object
baseline_model = DummyRegressor(strategy='mean')

# Fit the object
baseline_model.fit(X_train, y_train)

y_train_pred = baseline_model.predict(X_train)
y_train_pred
```
array([19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3,
19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3, 19.3,
19.3, 19.3, 19.3])
```python
from sklearn.metrics import mean_squared_error

mse_baseline_train = mean_squared_error(y_true=y_train,
                                        y_pred=y_train_pred)
print(mse_baseline_train)
```
37.193599999999996
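Since the baseline predicts the training mean everywhere, its training MSE should equal the population variance of y_train. A quick sanity check (a sketch; np.var uses ddof=0 by default):

```python
# A mean-only baseline's training MSE equals the population variance
# (ddof=0) of the training target, so this should print ~37.19.
print(np.var(y_train))
```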
```python
# Perform cross-validation
from sklearn.model_selection import cross_val_score

scores_baseline = cross_val_score(estimator=baseline_model,
                                  X=X_train,
                                  y=y_train,
                                  cv=5,
                                  scoring='neg_mean_squared_error')

mse_baseline_cv = -np.mean(scores_baseline)
mse_baseline_cv
```
39.23434999999999
```python
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
```
LinearRegression()
```python
# Predict y_train
y_train_pred = lr.predict(X_train)

# Compute the MSE on the training data
mse_lr_train = mean_squared_error(y_true=y_train,
                                  y_pred=y_train_pred)
print(mse_lr_train)
```
3.8855506573917955
```python
# Perform cross-validation
scores_lr = cross_val_score(estimator=lr,
                            X=X_train,
                            y=y_train,
                            cv=5,
                            scoring="neg_mean_squared_error")

mse_lr_cv = -np.mean(scores_lr)
mse_lr_cv
```
23.42111033958928
```python
model_summary = pd.DataFrame({"Model Name": ['Baseline', 'LinearRegression'],
                              "Model": [baseline_model, lr],
                              "MSE Train": [mse_baseline_train, mse_lr_train],
                              "MSE CV": [mse_baseline_cv, mse_lr_cv]})
model_summary
```
|   | Model Name       | Model              | MSE Train | MSE CV   |
|---|------------------|--------------------|-----------|----------|
| 0 | Baseline         | DummyRegressor()   | 37.193600 | 39.23435 |
| 1 | LinearRegression | LinearRegression() | 3.885551  | 23.42111 |
Test Prediction
```python
# Check the test score
y_pred_test = lr.predict(X_test)

# Compute the MSE on the test data
test_score = mean_squared_error(y_true=y_test,
                                y_pred=y_pred_test)
test_score
```
16.511353509876333
```python
# Extract the model parameters
coef_ = lr.coef_
intercept_ = lr.intercept_
lr_params = np.append(coef_, intercept_)

lr_params = pd.DataFrame(lr_params,
                         index=list(X_train.columns) + ["constant"],
                         columns=["coefficient"])
lr_params
```
|          | coefficient |
|----------|-------------|
| cyl      | 1.097167    |
| disp     | 0.006168    |
| hp       | -0.005287   |
| drat     | 1.820345    |
| wt       | -3.668249   |
| qsec     | 2.139810    |
| vs       | 0.851392    |
| am       | 5.880618    |
| gear     | -1.092819   |
| carb     | 0.032545    |
| constant | -19.270044  |
```python
def fit_model(estimator, X_train, y_train):
    """Fit a model, then evaluate it on the training data and with CV."""
    # 1. Fit the model
    estimator.fit(X_train, y_train)

    # 2. Evaluate on the training data & with cross-validation
    y_pred_train = estimator.predict(X_train)
    train_score = mean_squared_error(y_true=y_train,
                                     y_pred=y_pred_train)

    valid_scores = cross_val_score(estimator=estimator,
                                   X=X_train,
                                   y=y_train,
                                   cv=5,
                                   scoring='neg_mean_squared_error')
    cv_score = -np.mean(valid_scores)

    # 3. Extract the coefficients
    coef_ = estimator.coef_
    intercept_ = estimator.intercept_
    estimator_params = np.append(coef_, intercept_)

    estimator_params_df = pd.DataFrame(estimator_params,
                                       index=list(X_train.columns) + ["constant"],
                                       columns=["coefficient"])

    return estimator, train_score, cv_score, estimator_params_df
```
The negative sign is needed because scikit-learn reports errors as negative values to stay consistent with its other metrics (where higher is better). We therefore multiply by -1 to recover the MSE as a proper (positive) value.
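A minimal illustration of this sign convention, reusing the scores_baseline array computed above:

```python
# cross_val_score with scoring='neg_mean_squared_error' returns
# non-positive values; negating their mean recovers the positive MSE.
print(scores_baseline)            # every fold score is <= 0
print(-np.mean(scores_baseline))  # positive CV MSE (~39.23)
```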
```python
from sklearn.linear_model import LinearRegression

lr, train_score, cv_score, lr_params_df = fit_model(estimator=LinearRegression(),
                                                    X_train=X_train,
                                                    y_train=y_train)

print(f"train score: {train_score:.3f}, cv score: {cv_score:.3f}")
```
train score: 3.886, cv score: 23.421
Subset Selection
```python
import statsmodels.api as sm
from itertools import combinations

column_list = list(X_train.columns)
n_column = len(column_list)

column_list
```
['cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
```python
train_column_list = []

for i in range(n_column):
    list_of_combination = combinations(column_list, i)
    for combi in list_of_combination:
        train_column_list.append(list(combi))

# add the full set of columns
train_column_list.append(column_list)
len(train_column_list)
```
1024
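As a sanity check: each of the 10 predictors is either in or out of a subset, so there are 2^10 = 1024 candidate subsets (including the empty one), matching the count above:

```python
from math import comb

# Sum of C(10, k) for k = 0..10 equals 2**10 = 1024.
print(sum(comb(10, k) for k in range(11)))
```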
```python
idx = 95
train_list_idx = train_column_list[idx]
train_list_idx
```
['disp', 'hp', 'vs']
```python
train_score = []
cv_score = []

for idx in range(len(train_column_list)):
    if idx != 0:
        # Filter the data
        train_list_idx = train_column_list[idx]
        X_train_idx = X_train[train_list_idx]

        # Fit the model
        _, train_idx, cv_idx, _ = fit_model(estimator=LinearRegression(),
                                            X_train=X_train_idx,
                                            y_train=y_train)

        # Store the results
        train_score.append(train_idx)
        cv_score.append(cv_idx)
```
```python
import matplotlib.pyplot as plt

# Plot the results
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))

ax.boxplot([train_score, cv_score])
ax.set_xticklabels(["TRAIN", "CV"])
ax.set_ylabel("MSE")
plt.show()
```
```python
# Find the best score on the validation data
best_score = np.min(cv_score)
best_idx = np.argmin(cv_score)

best_idx, best_score
```
(477, 7.989577252522926)
```python
# Best features (note: the loop above skipped index 0, the empty
# subset, so cv_score[i] corresponds to train_column_list[i + 1])
train_column_list[best_idx + 1]
```
['cyl', 'drat', 'wt', 'qsec', 'am']
```python
# Find the best model
lr_best, train_best_score, \
    cv_best_score, lr_params_best = fit_model(estimator=LinearRegression(),
                                              X_train=X_train[train_column_list[best_idx + 1]],
                                              y_train=y_train)

print('Train score :', train_best_score)
print('CV score :', cv_best_score)
```
Train score : 4.113333306973907
CV score : 7.989577252522926
```python
lr_params_best
```

|          | coefficient |
|----------|-------------|
| cyl      | 1.282553    |
| drat     | 1.639653    |
| wt       | -3.579665   |
| qsec     | 2.539308    |
| am       | 4.763395    |
| constant | -29.508518  |
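Since statsmodels was imported above but not otherwise used, an optional cross-check (a sketch, not part of the original workflow) is to refit the selected subset with sm.OLS to get standard errors and p-values for these coefficients:

```python
# Refit the selected feature subset with statsmodels OLS to obtain a
# full inferential summary (coefficient estimates, std errors, p-values).
X_best = sm.add_constant(X_train[train_column_list[best_idx + 1]])
ols_result = sm.OLS(y_train, X_best).fit()
print(ols_result.summary())
```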
Test Prediction
```python
# Check the test score
y_pred_test = lr_best.predict(X_test[train_column_list[best_idx + 1]])

# Compute the MSE on the test data
test_score = mean_squared_error(y_true=y_test,
                                y_pred=y_pred_test)
test_score
```
17.675913225570763
Conclusion 1
```python
best_scores_df = pd.DataFrame({"Model": ["Baseline", "OLS full features", "OLS best features"],
                               "CV Scores": [mse_baseline_cv, mse_lr_cv, cv_best_score]})
best_scores_df
```
|   | Model             | CV Scores |
|---|-------------------|-----------|
| 0 | Baseline          | 39.234350 |
| 1 | OLS full features | 23.421110 |
| 2 | OLS best features | 7.989577  |
The best-features model's test MSE is slightly larger than that of the OLS full-features model. Which is better? We have to weigh the trade-off between model complexity and performance: if performance does not improve significantly, we may prefer the simpler model.

Remember the bias-variance trade-off!
Shrinkage Methods: Ridge, Lasso, Elastic Net
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
Ridge
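For reference, per the scikit-learn documentation, Ridge fits ordinary least squares with an added L2 penalty on the coefficients:

$$\min_w \; \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_2^2$$

Larger alpha values shrink the coefficients more strongly toward zero; alpha = 0 would recover OLS.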
```python
alphas = [0.5, 1.0, 2.5, 5.0, 7.5, 10.0,
          12.5, 15.0, 17.5, 30.0, 50.0]
```
```python
mse_train_list = []
mse_cv_list = []
model_list = []

for alpha in alphas:
    model_i, train_score_i, \
        cv_score_i, model_param_i = fit_model(estimator=Ridge(alpha=alpha),
                                              X_train=X_train,
                                              y_train=y_train)

    mse_train_list.append(train_score_i)
    mse_cv_list.append(cv_score_i)
    model_list.append(model_param_i)
```
```python
# Plot the errors
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))

ax.plot(alphas, mse_train_list, c="r", marker=".", label="Train")
ax.plot(alphas, mse_cv_list, c="g", marker=".", label="CV")

ax.set_xlabel("alpha")
ax.set_ylabel("MSE")

plt.grid()
plt.legend()
plt.show()
```
```python
# The best parameter is where the CV MSE is smallest
best_idx = np.argmin(mse_cv_list)
best_alpha = alphas[best_idx]
best_ridge_cv = mse_cv_list[best_idx]

best_alpha, best_ridge_cv
```
(2.5, 10.789408042167233)
```python
# Best model
best_param_ridge = model_list[best_idx]
best_param_ridge
```
|          | coefficient |
|----------|-------------|
| cyl      | -0.453721   |
| disp     | -0.005147   |
| hp       | -0.007531   |
| drat     | 1.025724    |
| wt       | -1.539528   |
| qsec     | 0.624478    |
| vs       | -0.126656   |
| am       | 1.701397    |
| gear     | 0.680265    |
| carb     | -0.731590   |
| constant | 14.389671   |
A faster way: GridSearchCV
```python
# Create the model & the parameter grid to search over
ridge = Ridge()

param_space = {"alpha": alphas}
param_space
```
{'alpha': [0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 30.0, 50.0]}
```python
# Perform a grid search with CV
cv_ridge = GridSearchCV(estimator=ridge,
                        param_grid=param_space,
                        scoring="neg_mean_squared_error",
                        cv=5)

# Fit the search
cv_ridge.fit(X=X_train,
             y=y_train)
```
GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 30.0, 50.0]},
             scoring='neg_mean_squared_error')
cv_ridge.best_params_
{'alpha': 2.5}
```python
# Create a new object with the best alpha
best_ridge = Ridge(alpha=cv_ridge.best_params_["alpha"])

# Fit the model
best_ridge.fit(X=X_train,
               y=y_train)
```
Ridge(alpha=2.5)
best_ridge.coef_
array([-0.45372081, -0.00514687, -0.00753064, 1.02572436, -1.53952835,
0.62447844, -0.12665636, 1.70139692, 0.68026538, -0.73159023])
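As a side note, GridSearchCV refits the best configuration on the full training set by default (refit=True), so the manual refit above can also be skipped:

```python
# best_estimator_ is the best model already refit on all of X_train;
# its coefficients should match best_ridge above.
print(cv_ridge.best_estimator_.coef_)
```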
Lasso
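Lasso replaces the L2 penalty with an L1 penalty. Per the scikit-learn documentation, it minimizes

$$\min_w \; \frac{1}{2 n_{\text{samples}}} \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_1$$

Unlike the L2 penalty, the L1 penalty can set coefficients exactly to zero, so Lasso also performs feature selection.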
```python
alphas = [0.05, 0.10, 0.15, 0.20, 0.25, 1.00,
          1.25, 1.50, 1.75, 3.00, 5.00]

mse_train_list = []
mse_cv_list = []
model_list = []

for alpha in alphas:
    model_i, train_score_i, \
        cv_score_i, model_param_i = fit_model(estimator=Lasso(alpha=alpha),
                                              X_train=X_train,
                                              y_train=y_train)

    mse_train_list.append(train_score_i)
    mse_cv_list.append(cv_score_i)
    model_list.append(model_param_i)
```
```python
# Plot the errors
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))

ax.plot(alphas, mse_train_list, c="r", marker=".", label="Train")
ax.plot(alphas, mse_cv_list, c="g", marker=".", label="CV")

ax.set_xlabel("alpha")
ax.set_ylabel("MSE")

plt.grid()
plt.legend()
plt.show()
```
```python
# The best parameter is where the CV MSE is smallest
best_idx = np.argmin(mse_cv_list)
best_alpha = alphas[best_idx]
best_lasso_cv = mse_cv_list[best_idx]

best_alpha, best_lasso_cv
```
(0.15, 12.976374619574276)
```python
# Best model
best_param_lasso = model_list[best_idx]
best_param_lasso
```
|          | coefficient |
|----------|-------------|
| cyl      | -0.092881   |
| disp     | -0.006417   |
| hp       | -0.010095   |
| drat     | 0.668991    |
| wt       | -1.819467   |
| qsec     | 0.767749    |
| vs       | 0.000000    |
| am       | 2.808862    |
| gear     | 0.000000    |
| carb     | -0.506165   |
| constant | 13.712239   |
With GridSearchCV
```python
# Create the model & the parameter grid to search over
lasso = Lasso()
param_space = {"alpha": alphas}

# Perform a grid search with CV
cv_lasso = GridSearchCV(estimator=lasso,
                        param_grid=param_space,
                        scoring="neg_mean_squared_error",
                        cv=5)

# Fit the search
cv_lasso.fit(X=X_train,
             y=y_train)
```
GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': [0.05, 0.1, 0.15, 0.2, 0.25, 1.0, 1.25, 1.5, 1.75, 3.0, 5.0]},
             scoring='neg_mean_squared_error')
cv_lasso.best_params_
{'alpha': 0.15}
```python
# Create a new object with the best alpha
best_lasso = Lasso(alpha=cv_lasso.best_params_["alpha"])

# Fit the model
best_lasso.fit(X=X_train,
               y=y_train)
```
Lasso(alpha=0.15)
best_lasso.coef_
array([-0.09288145, -0.00641718, -0.01009455, 0.66899118, -1.81946701,
0.76774888, 0. , 2.80886172, 0. , -0.50616465])
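Note the two exact zeros in the coefficient vector: Lasso has effectively removed those predictors. A quick way to list them:

```python
# Predictors whose Lasso coefficient was shrunk exactly to zero
# (here: vs and gear).
print(list(X_train.columns[best_lasso.coef_ == 0]))
```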
Elastic Net
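ElasticNet combines both penalties. Per the scikit-learn documentation, it minimizes

$$\min_w \; \frac{1}{2 n_{\text{samples}}} \lVert y - Xw \rVert_2^2 + \alpha \rho \lVert w \rVert_1 + \frac{\alpha (1 - \rho)}{2} \lVert w \rVert_2^2$$

where $\rho$ is the l1_ratio parameter: $\rho = 1$ recovers Lasso, while $\rho = 0$ gives a Ridge-like penalty.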
Directly with GridSearchCV
```python
alphas = [0.05, 0.10, 0.15, 0.20, 0.25, 1.00, 1.25, 1.50, 1.75, 3.00, 5.00]
l1_ratios = [0.1, 0.5, 0.7, 0.9, 1.0]
```
```python
# Create the model & the parameter grid to search over
elastic = ElasticNet()
param_space = {"alpha": alphas,
               "l1_ratio": l1_ratios}

# Perform a grid search with CV
cv_elastic = GridSearchCV(estimator=elastic,
                          param_grid=param_space,
                          scoring="neg_mean_squared_error",
                          cv=5)

# Fit the search
cv_elastic.fit(X=X_train,
               y=y_train)
```
GridSearchCV(cv=5, estimator=ElasticNet(),
             param_grid={'alpha': [0.05, 0.1, 0.15, 0.2, 0.25, 1.0, 1.25, 1.5, 1.75, 3.0, 5.0],
                         'l1_ratio': [0.1, 0.5, 0.7, 0.9, 1.0]},
             scoring='neg_mean_squared_error')
cv_elastic.best_params_
{'alpha': 0.15, 'l1_ratio': 0.1}
```python
# Create a new object with the best parameters
best_elastic = ElasticNet(alpha=cv_elastic.best_params_["alpha"],
                          l1_ratio=cv_elastic.best_params_["l1_ratio"])

# Fit the model
best_elastic.fit(X=X_train,
                 y=y_train)
```
ElasticNet(alpha=0.15, l1_ratio=0.1)
best_elastic.coef_
array([-0.52408861, -0.00769543, -0.00878001, 0.86008691, -1.28146081,
0.42166796, -0. , 1.41763367, 0.61949478, -0.74413128])
```python
# Check the best CV score from cv_elastic
best_elastic_cv = -cv_elastic.best_score_
best_elastic_cv
```
10.847311992084979
Conclusion 2
```python
best_params = pd.concat([lr_params_df,
                         lr_params_best,
                         best_param_ridge,
                         best_param_lasso,
                         pd.DataFrame(best_elastic.coef_, index=X_train.columns, columns=["ElasticNet"])],
                        axis=1)
best_params.columns = ["OLS full features", "OLS best features", "Ridge", "Lasso", "ElasticNet"]
best_params
```
|          | OLS full features | OLS best features | Ridge     | Lasso     | ElasticNet |
|----------|-------------------|-------------------|-----------|-----------|------------|
| cyl      | 1.097167          | 1.282553          | -0.453721 | -0.092881 | -0.524089  |
| disp     | 0.006168          | NaN               | -0.005147 | -0.006417 | -0.007695  |
| hp       | -0.005287         | NaN               | -0.007531 | -0.010095 | -0.008780  |
| drat     | 1.820345          | 1.639653          | 1.025724  | 0.668991  | 0.860087   |
| wt       | -3.668249         | -3.579665         | -1.539528 | -1.819467 | -1.281461  |
| qsec     | 2.139810          | 2.539308          | 0.624478  | 0.767749  | 0.421668   |
| vs       | 0.851392          | NaN               | -0.126656 | 0.000000  | -0.000000  |
| am       | 5.880618          | 4.763395          | 1.701397  | 2.808862  | 1.417634   |
| gear     | -1.092819         | NaN               | 0.680265  | 0.000000  | 0.619495   |
| carb     | 0.032545          | NaN               | -0.731590 | -0.506165 | -0.744131  |
| constant | -19.270044        | -29.508518        | 14.389671 | 13.712239 | NaN        |
```python
# MSE scores
best_scores_df = pd.DataFrame({"Model": ["Baseline", "OLS full features", "OLS best features", "Ridge", "Lasso", "ElasticNet"],
                               "MSE": [mse_baseline_cv, mse_lr_cv, cv_best_score, best_ridge_cv, best_lasso_cv, best_elastic_cv]})
best_scores_df
```
|   | Model             | MSE       |
|---|-------------------|-----------|
| 0 | Baseline          | 39.234350 |
| 1 | OLS full features | 23.421110 |
| 2 | OLS best features | 7.989577  |
| 3 | Ridge             | 10.789408 |
| 4 | Lasso             | 12.976375 |
| 5 | ElasticNet        | 10.847312 |
```python
# Check the test scores
def mse_model(estimator, X_test, y_test):
    # Predict
    y_pred = estimator.predict(X_test)

    # Compute the MSE
    mse = mean_squared_error(y_test, y_pred)

    return mse

mse_model(estimator=lr_best,
          X_test=X_test[lr_params_best.index[:-1]],
          y_test=y_test)
```
17.675913225570763
```python
mse_model(estimator=best_elastic,
          X_test=X_test,
          y_test=y_test)
```
6.514650231430356
```python
# Evaluate all models on the test set
test_scores = {
    "Baseline": mse_model(estimator=baseline_model, X_test=X_test, y_test=y_test),
    "OLS full features": mse_model(estimator=lr, X_test=X_test, y_test=y_test),
    "OLS best features": mse_model(estimator=lr_best, X_test=X_test[lr_params_best.index[:-1]], y_test=y_test),
    "Ridge": mse_model(estimator=best_ridge, X_test=X_test, y_test=y_test),
    "Lasso": mse_model(estimator=best_lasso, X_test=X_test, y_test=y_test),
    "ElasticNet": mse_model(estimator=best_elastic, X_test=X_test, y_test=y_test)
}

test_scores_df = pd.DataFrame(list(test_scores.items()), columns=["Model", "Test MSE"])
test_scores_df
```
|   | Model             | Test MSE  |
|---|-------------------|-----------|
| 0 | Baseline          | 30.887143 |
| 1 | OLS full features | 16.511354 |
| 2 | OLS best features | 17.675913 |
| 3 | Ridge             | 6.949301  |
| 4 | Lasso             | 7.244562  |
| 5 | ElasticNet        | 6.514650  |
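MSE is in squared mpg units; taking the square root (RMSE) puts the error back on the mpg scale, which is easier to interpret. A small sketch using the table above:

```python
# RMSE is in the original mpg units; e.g. the ElasticNet model is off
# by roughly 2.6 mpg on average on the test set.
test_scores_df["Test RMSE"] = np.sqrt(test_scores_df["Test MSE"])
test_scores_df
```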
References
https://scikit-learn.org/stable/modules/linear_model.html#