```python
# Import the Pandas and NumPy libraries
import pandas as pd
import numpy as np
```
# Machine Learning Workflow (Simplified)

## Dataset Description

**Note**

- This dataset originally comes from the Uber Fares Dataset.
- We performed several edits for mentoring purposes, so please use the dataset from here.

**Description**

- We're looking to predict the fare of Uber trips.
- The dataset contains the following fields:
| Feature | Type | Description |
|---|---|---|
| `order_id` | int | a unique identifier for each trip |
| `pickup_time` | str | a class of pickup time: `04-10`, `10-16`, `16-22`, `22-04`. E.g. `04-10` means the pickup time is between 04:00 and 10:00 |
| `pickup_longitude` | float | the longitude where the meter was engaged |
| `pickup_latitude` | float | the latitude where the meter was engaged |
| `dropoff_longitude` | float | the longitude where the meter was disengaged |
| `dropoff_latitude` | float | the latitude where the meter was disengaged |
| `passenger_count` | float | the number of passengers in the vehicle (driver-entered value) |
| `fare_amount` | float | the cost of each trip in USD (our target) |
## Modeling Workflow

1. Import data to Python
2. Data Preprocessing
3. Training Machine Learning Models
4. Test Prediction
## 1. Import data to Python
```python
# Create a function to read the data
def read_data(fname):
    data = pd.read_csv(fname)
    print('Data shape raw :', data.shape)
    print('Number of duplicate order id :', data.duplicated(subset='order_id').sum())
    data = data.drop_duplicates(subset='order_id', keep='last')
    data = data.set_index('order_id')
    print('Data shape after dropping :', data.shape)
    print('Data shape final :', data.shape)
    return data

# Read the Uber data
data = read_data(fname='uber_edit.csv')
```

```
Data shape raw : (194814, 8)
Number of duplicate order id : 0
Data shape after dropping : (194814, 7)
Data shape final : (194814, 7)
```
```python
data.head()
```

| order_id | fare_amount | pickup_time | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count |
|---|---|---|---|---|---|---|---|
| 24238194 | 7.5 | 16-22 | -73.999817 | 40.738354 | -73.999512 | 40.723217 | 1.0 |
| 27835199 | 7.7 | 16-22 | -73.994355 | 40.728225 | -73.994710 | 40.750325 | 1.0 |
| 44984355 | 12.9 | 16-22 | -74.005043 | 40.740770 | -73.962565 | 40.772647 | 1.0 |
| 25894730 | 5.3 | 04-10 | -73.976124 | 40.790844 | -73.965316 | 40.803349 | 3.0 |
| 17610152 | 16.0 | 16-22 | -73.925023 | 40.744085 | -73.973082 | 40.761247 | 5.0 |
## 2. Data Preprocessing

The preprocessing pipeline:

- 2.1 Input-Output Split
- 2.2 Train-Valid-Test Split
- 2.3 Separate Numerical and Categorical Features
- 2.4 Numerical Imputation
- 2.5 Categorical Imputation
- 2.6 Preprocess Categorical Features
- 2.7 Join the Data
- 2.8 Feature Engineering the Data
- 2.9 Create a Preprocessing Function
### 2.1. Input-Output Split

- We're going to split input & output according to the modeling objective: predict `fare_amount` from the other features.
- Create a function to split the input & output.
```python
def split_input_output(data, target_col):
    X = data.drop(columns=target_col)
    y = data[target_col]
    print('X shape:', X.shape)
    print('y shape:', y.shape)
    return X, y

X, y = split_input_output(data=data,
                          target_col='fare_amount')
```

```
X shape: (194814, 6)
y shape: (194814,)
```
```python
X.head()
```

| order_id | pickup_time | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count |
|---|---|---|---|---|---|---|
| 24238194 | 16-22 | -73.999817 | 40.738354 | -73.999512 | 40.723217 | 1.0 |
| 27835199 | 16-22 | -73.994355 | 40.728225 | -73.994710 | 40.750325 | 1.0 |
| 44984355 | 16-22 | -74.005043 | 40.740770 | -73.962565 | 40.772647 | 1.0 |
| 25894730 | 04-10 | -73.976124 | 40.790844 | -73.965316 | 40.803349 | 3.0 |
| 17610152 | 16-22 | -73.925023 | 40.744085 | -73.973082 | 40.761247 | 5.0 |
```python
y.head()
```

```
order_id
24238194     7.5
27835199     7.7
44984355    12.9
25894730     5.3
17610152    16.0
Name: fare_amount, dtype: float64
```
### 2.2. Train-Valid-Test Split

- Now, we want to split the data before modeling.
- Split the data into three sets:
  - Train, for training the model
  - Validation, for choosing the best model
  - Test, for estimating the generalization error
- We first split off 20% of the data, then split that portion in half into validation and test, giving an 80/10/10 split.
```python
from sklearn.model_selection import train_test_split

def split_train_test(X, y, test_size, seed):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
    print('X train shape:', X_train.shape)
    print('y train shape:', y_train.shape)
    print('X test shape :', X_test.shape)
    print('y test shape :', y_test.shape)
    return X_train, X_test, y_train, y_test

# Split the data
# First, split the train & not-train sets
X_train, X_not_train, y_train, y_not_train = split_train_test(X, y, test_size=0.2, seed=123)

# Then, split the valid & test sets
X_valid, X_test, y_valid, y_test = split_train_test(X_not_train, y_not_train, test_size=0.5, seed=123)
```

```
X train shape: (155851, 6)
y train shape: (155851,)
X test shape : (38963, 6)
y test shape : (38963,)
X train shape: (19481, 6)
y train shape: (19481,)
X test shape : (19482, 6)
y test shape : (19482,)
```
```python
print(len(X_train)/len(X))  # should be 0.8
print(len(X_valid)/len(X))  # should be 0.1
print(len(X_test)/len(X))   # should be 0.1
```

```
0.7999989733797366
0.09999794675947314
0.1000030798607903
```
```python
X_train.head()
```

| order_id | pickup_time | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count |
|---|---|---|---|---|---|---|
| 51655713 | 16-22 | -73.979392 | 40.735734 | -73.906281 | 40.745539 | 2.0 |
| 37525839 | 16-22 | -73.986575 | 40.761473 | -73.981880 | 40.768660 | 5.0 |
| 55058970 | 16-22 | -73.972533 | 40.782260 | -73.952761 | 40.708980 | 1.0 |
| 15663447 | 10-16 | -73.979967 | 40.751612 | -73.976313 | 40.758427 | 6.0 |
| 13325650 | 16-22 | -73.976192 | 40.744026 | -73.980935 | 40.733946 | 1.0 |
### 2.3. Separate Numerical and Categorical Features

- We are now ready to perform data preprocessing.
- But first, we separate the data into numerical and categorical features.
```python
def split_num_cat(data, num_cols, cat_cols):
    data_num = data[num_cols]
    data_cat = data[cat_cols]
    print('Data num shape:', data_num.shape)
    print('Data cat shape:', data_cat.shape)
    return data_num, data_cat

num_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
cat_cols = ['pickup_time']

X_train_num, X_train_cat = split_num_cat(X_train, num_cols, cat_cols)
```

```
Data num shape: (155851, 5)
Data cat shape: (155851, 1)
```
```python
X_train_num.head()
```

| order_id | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count |
|---|---|---|---|---|---|
| 51655713 | -73.979392 | 40.735734 | -73.906281 | 40.745539 | 2.0 |
| 37525839 | -73.986575 | 40.761473 | -73.981880 | 40.768660 | 5.0 |
| 55058970 | -73.972533 | 40.782260 | -73.952761 | 40.708980 | 1.0 |
| 15663447 | -73.979967 | 40.751612 | -73.976313 | 40.758427 | 6.0 |
| 13325650 | -73.976192 | 40.744026 | -73.980935 | 40.733946 | 1.0 |
```python
X_train_cat.head()
```

| order_id | pickup_time |
|---|---|
| 51655713 | 16-22 |
| 37525839 | 16-22 |
| 55058970 | 16-22 |
| 15663447 | 10-16 |
| 13325650 | 16-22 |
### EDA before Preprocessing

- Find the percentage of missing values:

```python
100 * (X_train.isna().sum(0) / len(X_train))
```

```
pickup_time          0.000000
pickup_longitude     0.000000
pickup_latitude      0.000000
dropoff_longitude    0.000000
dropoff_latitude     0.000000
passenger_count      0.606348
dtype: float64
```
- We will impute these variables wherever values are missing.
- First, check the distribution of the numerical features:
```python
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(12, 8))
axes = ax.flatten()

for i, col in enumerate(X_train_num.columns):
    sns.kdeplot(X_train_num[col], ax=axes[i])
    axes[i].set_title(f'Distribution of {col}')

plt.tight_layout()
plt.show()
```
All the distributions are skewed, so we can impute missing values with each feature's median.

Next, explore `pickup_time`:

```python
X_train['pickup_time'].value_counts(normalize=True)
```
```
pickup_time
16-22    0.328160
10-16    0.286376
22-04    0.221648
04-10    0.157599
-        0.006217
Name: proportion, dtype: float64
```
- Missing values in `pickup_time` are marked with the symbol `'-'`.
- We can impute these missing values with `'UNKNOWN'`.

Explore the relation between `pickup_time` and the fare:
```python
train_data = pd.concat((X_train, y_train), axis=1)
train_data.head()
```

| order_id | pickup_time | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count | fare_amount |
|---|---|---|---|---|---|---|---|
| 51655713 | 16-22 | -73.979392 | 40.735734 | -73.906281 | 40.745539 | 2.0 | 16.5 |
| 37525839 | 16-22 | -73.986575 | 40.761473 | -73.981880 | 40.768660 | 5.0 | 3.7 |
| 55058970 | 16-22 | -73.972533 | 40.782260 | -73.952761 | 40.708980 | 1.0 | 18.9 |
| 15663447 | 10-16 | -73.979967 | 40.751612 | -73.976313 | 40.758427 | 6.0 | 4.1 |
| 13325650 | 16-22 | -73.976192 | 40.744026 | -73.980935 | 40.733946 | 1.0 | 5.0 |
```python
sns.boxplot(data=train_data[train_data['fare_amount'] < 50],
            x='pickup_time',
            y='fare_amount')
plt.show()
```
- There is no significant fare difference between `pickup_time` classes.
- We can one-hot encode this feature.
**Conclusion for preprocessing**

- Impute the missing `passenger_count` with its median.
- Impute the missing `pickup_time` with `'UNKNOWN'`.
- Feature-engineer the `pickup` and `dropoff` coordinates into a single distance between pickup and dropoff. We can use the Euclidean distance for simplicity, as sketched below.
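To make the feature engineering in section 2.8 concrete: treating the coordinates as points on a plane, the Euclidean distance between pickup and dropoff is

$$
\text{distance} = \sqrt{(\text{pickup\_longitude} - \text{dropoff\_longitude})^2 + (\text{pickup\_latitude} - \text{dropoff\_latitude})^2}
$$

Note that this distance is measured in degrees rather than kilometers; since all trips here are within the same city area, it still works as a relative measure of trip length.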
### 2.4. Numerical Imputation

- Now, let's perform numerical imputation.
- First, check the missing values of the numerical data:
```python
X_train_num.isna().sum(0)
```

```
pickup_longitude       0
pickup_latitude        0
dropoff_longitude      0
dropoff_latitude       0
passenger_count      945
dtype: int64
```
- Create a function to fit an imputer for the numerical features
```python
from sklearn.impute import SimpleImputer

def num_imputer_fit(data):
    imputer = SimpleImputer(strategy='median')
    imputer.fit(data)
    return imputer

def num_imputer_transform(data, imputer):
    data_imputed = imputer.transform(data)
    data_imputed = pd.DataFrame(data_imputed, columns=data.columns, index=data.index)
    return data_imputed
```
- Perform imputation
```python
# Get the numerical imputer
num_imputer = num_imputer_fit(X_train_num)

# Transform the data
X_train_num_imputed = num_imputer_transform(X_train_num, num_imputer)

X_train_num_imputed.isna().sum(0)
```

```
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64
```
### 2.5. Categorical Imputation

- Next, let's perform the categorical imputation.
```python
X_train_cat.value_counts(normalize=True)
```

```
pickup_time
16-22    0.328160
10-16    0.286376
22-04    0.221648
04-10    0.157599
-        0.006217
Name: proportion, dtype: float64
```
- Create a function to fit an imputer for the categorical features
```python
def cat_imputer_fit(data):
    # Treat '-' as the missing-value marker and replace it with 'UNKNOWN'
    imputer = SimpleImputer(missing_values='-', strategy='constant', fill_value='UNKNOWN')
    imputer.fit(data)
    return imputer

def cat_imputer_transform(data, imputer):
    data_imputed = imputer.transform(data)
    data_imputed = pd.DataFrame(data_imputed, columns=data.columns, index=data.index)
    return data_imputed
```
- Perform imputation
```python
# Fit the categorical imputer
cat_imputer = cat_imputer_fit(X_train_cat)

# Transform
X_train_cat_imputed = cat_imputer_transform(X_train_cat, cat_imputer)

X_train_cat_imputed.value_counts(normalize=True)
```

```
pickup_time
16-22      0.328160
10-16      0.286376
22-04      0.221648
04-10      0.157599
UNKNOWN    0.006217
Name: proportion, dtype: float64
```
Great!
### 2.6. Preprocess Categorical Features

- We will create a one-hot encoder for the categorical features (see the `EDA before Preprocessing` section).
- Create a function to perform one-hot encoding.
```python
from sklearn.preprocessing import OneHotEncoder

def cat_encoder_fit(data):
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(data)
    return encoder

def cat_encoder_transform(data, encoder):
    data_encoded = encoder.transform(data)
    data_encoded = pd.DataFrame(data_encoded, columns=encoder.get_feature_names_out(data.columns), index=data.index)
    return data_encoded
```
- Perform encoding

```python
# Fit the categorical encoder
cat_encoder = cat_encoder_fit(X_train_cat_imputed)

# Transform
X_train_cat_encoded = cat_encoder_transform(X_train_cat_imputed, cat_encoder)
```
```python
print('Original shape:', X_train_cat_imputed.shape)
print('Encoded shape :', X_train_cat_encoded.shape)
```

```
Original shape: (155851, 1)
Encoded shape : (155851, 5)
```
```python
X_train_cat_encoded.head()
```

| order_id | pickup_time_04-10 | pickup_time_10-16 | pickup_time_16-22 | pickup_time_22-04 | pickup_time_UNKNOWN |
|---|---|---|---|---|---|
| 51655713 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 37525839 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 55058970 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 15663447 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 13325650 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
```python
X_train_cat_imputed.head()
```

| order_id | pickup_time |
|---|---|
| 51655713 | 16-22 |
| 37525839 | 16-22 |
| 55058970 | 16-22 |
| 15663447 | 10-16 |
| 13325650 | 16-22 |
Great!
### 2.7. Join the Data

- After all features are numeric and fully imputed, we can join the data.
- Create a function to join the data.
```python
def concat_data(num_data, cat_data):
    data = pd.concat((num_data, cat_data), axis=1)
    return data
```
- Perform the concatenation
```python
X_train_concat = concat_data(X_train_num_imputed, X_train_cat_encoded)
print('Numerical data shape :', X_train_num_imputed.shape)
print('Categorical data shape:', X_train_cat_encoded.shape)
print('Concat data shape :', X_train_concat.shape)
```

```
Numerical data shape : (155851, 5)
Categorical data shape: (155851, 5)
Concat data shape : (155851, 10)
```
```python
# Validate
X_train_concat.head()
```

| order_id | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | passenger_count | pickup_time_04-10 | pickup_time_10-16 | pickup_time_16-22 | pickup_time_22-04 | pickup_time_UNKNOWN |
|---|---|---|---|---|---|---|---|---|---|---|
| 51655713 | -73.979392 | 40.735734 | -73.906281 | 40.745539 | 2.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 37525839 | -73.986575 | 40.761473 | -73.981880 | 40.768660 | 5.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 55058970 | -73.972533 | 40.782260 | -73.952761 | 40.708980 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 15663447 | -73.979967 | 40.751612 | -73.976313 | 40.758427 | 6.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 13325650 | -73.976192 | 40.744026 | -73.980935 | 40.733946 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
### 2.8. Feature Engineering the Data

- Right now, the `pickup` and `dropoff` coordinates are not explicit features.
- We can create a better feature called `distance` to summarize the `pickup` and `dropoff` coordinates.
```python
def map_distance(data):
    # Euclidean distance between pickup and dropoff coordinates
    data['distance'] = np.sqrt((data['pickup_longitude'] - data['dropoff_longitude'])**2
                               + (data['pickup_latitude'] - data['dropoff_latitude'])**2)
    data = data.drop(columns=['pickup_longitude', 'pickup_latitude',
                              'dropoff_longitude', 'dropoff_latitude'])
    return data
```
- Perform distance calculation
```python
X_train_concat_fe = map_distance(X_train_concat)
print('Original data shape:', X_train_concat.shape)
print('Mapped data shape :', X_train_concat_fe.shape)
```

```
Original data shape: (155851, 11)
Mapped data shape : (155851, 7)
```
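Note that `map_distance` assigns the `distance` column to its argument in place before dropping the coordinate columns from the returned copy, which is why the "original" frame now reports 11 columns (the 10 concatenated features plus `distance`).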
```python
X_train_concat_fe.head()
```

| order_id | passenger_count | pickup_time_04-10 | pickup_time_10-16 | pickup_time_16-22 | pickup_time_22-04 | pickup_time_UNKNOWN | distance |
|---|---|---|---|---|---|---|---|
| 51655713 | 2.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.073766 |
| 37525839 | 5.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.008585 |
| 55058970 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.075901 |
| 15663447 | 6.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.007733 |
| 13325650 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.011140 |
- And finally, we standardize the data so that it performs well during model optimization. This matters especially for distance-based models such as k-NN, which we train below.
```python
from sklearn.preprocessing import StandardScaler

def fit_scaler(data):
    scaler = StandardScaler()
    scaler.fit(data)
    return scaler

def transform_scaler(data, scaler):
    data_scaled = scaler.transform(data)
    data_scaled = pd.DataFrame(data_scaled, columns=data.columns, index=data.index)
    return data_scaled

# Fit the scaler
scaler = fit_scaler(X_train_concat_fe)

# Transform the data
X_train_clean = transform_scaler(X_train_concat_fe, scaler)
```
```python
X_train_clean.describe().round(4)
```

|  | passenger_count | pickup_time_04-10 | pickup_time_10-16 | pickup_time_16-22 | pickup_time_22-04 | pickup_time_UNKNOWN | distance |
|---|---|---|---|---|---|---|---|
| count | 155851.0000 | 155851.0000 | 155851.0000 | 155851.0000 | 155851.0000 | 155851.0000 | 155851.0000 |
| mean | -0.0000 | 0.0000 | -0.0000 | -0.0000 | -0.0000 | 0.0000 | -0.0000 |
| std | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
| min | -0.5267 | -0.4325 | -0.6335 | -0.6989 | -0.5336 | -0.0791 | -0.0383 |
| 25% | -0.5267 | -0.4325 | -0.6335 | -0.6989 | -0.5336 | -0.0791 | -0.0340 |
| 50% | -0.5267 | -0.4325 | -0.6335 | -0.6989 | -0.5336 | -0.0791 | -0.0311 |
| 75% | 0.2412 | -0.4325 | 1.5786 | 1.4308 | -0.5336 | -0.0791 | -0.0256 |
| max | 3.3130 | 2.3120 | 1.5786 | 1.4308 | 1.8739 | 12.6427 | 218.5245 |
### 2.9. Create the Preprocessing Function

- Now, let's create a function to preprocess the other sets of data (valid & test) so that we can generate predictions for them.
```python
def preprocess_data(data, num_cols, cat_cols, num_imputer, cat_imputer, cat_encoder, scaler):
    data_num, data_cat = split_num_cat(data, num_cols, cat_cols)
    data_num_imputed = num_imputer_transform(data_num, num_imputer)
    data_cat_imputed = cat_imputer_transform(data_cat, cat_imputer)
    data_cat_encoded = cat_encoder_transform(data_cat_imputed, cat_encoder)
    data_concat = concat_data(data_num_imputed, data_cat_encoded)
    data_mapped = map_distance(data_concat)
    data_scaled = transform_scaler(data_mapped, scaler)
    return data_scaled

X_train_clean = preprocess_data(data=X_train,
                                num_cols=num_cols,
                                cat_cols=cat_cols,
                                num_imputer=num_imputer,
                                cat_imputer=cat_imputer,
                                cat_encoder=cat_encoder,
                                scaler=scaler)

# Sanity-check against the step-by-step shapes above
print('Numerical data shape :', X_train_num_imputed.shape)
print('Categorical data shape:', X_train_cat_encoded.shape)
print('Concat data shape :', X_train_concat.shape)
print('Original data shape:', X_train_concat.shape)
print('Mapped data shape :', X_train_concat_fe.shape)
```

```
Data num shape: (155851, 5)
Data cat shape: (155851, 1)
Numerical data shape : (155851, 5)
Categorical data shape: (155851, 5)
Concat data shape : (155851, 11)
Original data shape: (155851, 11)
Mapped data shape : (155851, 7)
```
```python
print('Original data shape:', X_train.shape)
print('Cleaned data shape :', X_train_clean.shape)
X_train_clean.head()
```

```
Original data shape: (155851, 6)
Cleaned data shape : (155851, 7)
```

| order_id | passenger_count | pickup_time_04-10 | pickup_time_10-16 | pickup_time_16-22 | pickup_time_22-04 | pickup_time_UNKNOWN | distance |
|---|---|---|---|---|---|---|---|
| 51655713 | 0.241233 | -0.432531 | -0.633481 | 1.430838 | -0.533634 | -0.079097 | -0.013978 |
| 37525839 | 2.545080 | -0.432531 | -0.633481 | 1.430838 | -0.533634 | -0.079097 | -0.035436 |
| 55058970 | -0.526715 | -0.432531 | -0.633481 | 1.430838 | -0.533634 | -0.079097 | -0.013275 |
| 15663447 | 3.313029 | -0.432531 | 1.578579 | -0.698891 | -0.533634 | -0.079097 | -0.035716 |
| 13325650 | -0.526715 | -0.432531 | -0.633481 | 1.430838 | -0.533634 | -0.079097 | -0.034594 |
```python
X_valid_clean = preprocess_data(data=X_valid,
                                num_cols=num_cols,
                                cat_cols=cat_cols,
                                num_imputer=num_imputer,
                                cat_imputer=cat_imputer,
                                cat_encoder=cat_encoder,
                                scaler=scaler)

X_test_clean = preprocess_data(data=X_test,
                               num_cols=num_cols,
                               cat_cols=cat_cols,
                               num_imputer=num_imputer,
                               cat_imputer=cat_imputer,
                               cat_encoder=cat_encoder,
                               scaler=scaler)

print('Cleaned X_valid data shape :', X_valid_clean.shape)
print('Cleaned X_test data shape :', X_test_clean.shape)
```

```
Data num shape: (19481, 5)
Data cat shape: (19481, 1)
Data num shape: (19482, 5)
Data cat shape: (19482, 1)
Cleaned X_valid data shape : (19481, 7)
Cleaned X_test data shape : (19482, 7)
```
## 3. Training Machine Learning Models

- 3.1 Prepare train & evaluate model functions
- 3.2 Train & evaluate several models
- 3.3 Choose the best model
### 3.1. Prepare Train & Evaluate Model Functions

- Before modeling, let's prepare functions to train & evaluate a model.
```python
from sklearn.metrics import mean_squared_error

def train_model(estimator, X_train, y_train):
    estimator.fit(X_train, y_train)

def evaluate_model(estimator, X_train, y_train, X_valid, y_valid):
    y_train_pred = estimator.predict(X_train)
    y_valid_pred = estimator.predict(X_valid)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_valid = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
    return rmse_train, rmse_valid
```
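For reference, the metric computed above is the root mean squared error over the $n$ trips in a set:

$$
\mathrm{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left( y_i - \hat{y}_i \right)^2}
$$

where $y_i$ is the actual fare and $\hat{y}_i$ the predicted fare, so the score is in the same unit as the target (USD).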
### 3.2. Train and Evaluate Several Models

Now, let's train & evaluate several models.

You should check which one of the following models is the best:

- Baseline model (a `DummyRegressor`, which predicts the mean fare)
- k-NN with k=1
- k-NN with k=100
- k-NN with k=200
```python
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor

reg_1 = DummyRegressor()
reg_2 = KNeighborsRegressor(n_neighbors=1)
reg_3 = KNeighborsRegressor(n_neighbors=100)
reg_4 = KNeighborsRegressor(n_neighbors=200)

# Train the models
train_model(reg_1, X_train_clean, y_train)
train_model(reg_2, X_train_clean, y_train)
train_model(reg_3, X_train_clean, y_train)
train_model(reg_4, X_train_clean, y_train)
```
```python
import time

for reg in [reg_1, reg_2, reg_3, reg_4]:
    t0 = time.time()

    # Generate the RMSE
    rmse_train, rmse_valid = evaluate_model(estimator=reg,
                                            X_train=X_train_clean,
                                            y_train=y_train,
                                            X_valid=X_valid_clean,
                                            y_valid=y_valid)

    # Logging
    elapsed = time.time() - t0
    print(f'model : {str(reg):40s} '
          f'| RMSE train: {rmse_train:.4f} '
          f'| RMSE valid: {rmse_valid:.4f} '
          f'| Time elapsed: {elapsed*1000:.2f} ms')
```

```
model : DummyRegressor()                         | RMSE train: 8.9221 | RMSE valid: 8.8614 | Time elapsed: 2.00 ms
model : KNeighborsRegressor(n_neighbors=1)       | RMSE train: 1.4510 | RMSE valid: 5.4185 | Time elapsed: 11933.78 ms
model : KNeighborsRegressor(n_neighbors=100)     | RMSE train: 3.9589 | RMSE valid: 3.9791 | Time elapsed: 23782.22 ms
model : KNeighborsRegressor(n_neighbors=200)     | RMSE train: 4.1513 | RMSE valid: 4.1420 | Time elapsed: 22960.39 ms
```
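A note on the timings: k-NN has essentially no training phase; almost all of its cost is paid at prediction time, when each query point is compared against all ~155k training points. That is why the k-NN rows take tens of seconds while the `DummyRegressor`, which simply returns the mean fare, answers in milliseconds.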
### 3.3. Choose the Best Model

From the previous results, which one is the best model? Why do you choose that model?

k-NN with k=100 (`reg_3`) gives the lowest validation RMSE (3.9791): k=1 badly overfits (train RMSE 1.4510 vs. valid RMSE 5.4185), while k=200 smooths too much and its validation error rises again. Create a `reg_best` to store the best model:

```python
reg_best = reg_3
```
## 4. Predictions & Evaluations

- 4.1 Predict & Evaluate on the Train Data
- 4.2 Predict & Evaluate on the Test Data

### 4.1. Predict & Evaluate on Train Data
```python
# Predict
y_train_pred = reg_best.predict(X_train_clean)

plt.scatter(y_train, y_train_pred)
plt.plot([0, 200], [0, 200], c='red')
plt.xlim(0, 200); plt.ylim(0, 200)
plt.xlabel('y actual'); plt.ylabel('y predicted')
plt.title('Comparison of y actual vs y predicted on Train Data')
plt.show()
```
### 4.2. Predict & Evaluate on Test Data
```python
# Predict
y_test_pred = reg_best.predict(X_test_clean)

# Visualize & compare the predictions
plt.scatter(y_test, y_test_pred)
plt.plot([0, 200], [0, 200], c='red')
plt.xlim(0, 200); plt.ylim(0, 200)
plt.xlabel('y actual'); plt.ylabel('y predicted')
plt.title('Comparison of y actual vs y predicted on Test Data')
plt.show()

# RMSE
rmse_train, rmse_test = evaluate_model(estimator=reg_best,
                                       X_train=X_train_clean,
                                       y_train=y_train,
                                       X_valid=X_test_clean,
                                       y_valid=y_test)

print(f'| RMSE train: {rmse_train:.4f} '
      f'| RMSE test: {rmse_test:.4f} ')
```

```
| RMSE train: 3.9589 | RMSE test: 4.0970
```
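Finally, here is a minimal sketch of how the fitted pipeline could score a brand-new order end to end. The coordinates and passenger count below are hypothetical values chosen for illustration, not rows from the dataset; everything else reuses the `num_imputer`, `cat_imputer`, `cat_encoder`, `scaler`, and `reg_best` fitted above.

```python
# A hypothetical new order (made-up values, for illustration only)
new_order = pd.DataFrame({
    'pickup_time': ['16-22'],
    'pickup_longitude': [-73.98],
    'pickup_latitude': [40.75],
    'dropoff_longitude': [-73.95],
    'dropoff_latitude': [40.78],
    'passenger_count': [2.0],
}, index=pd.Index([0], name='order_id'))

# Reuse the exact preprocessing pipeline fitted on the training data
new_order_clean = preprocess_data(data=new_order,
                                  num_cols=num_cols,
                                  cat_cols=cat_cols,
                                  num_imputer=num_imputer,
                                  cat_imputer=cat_imputer,
                                  cat_encoder=cat_encoder,
                                  scaler=scaler)

# Predict the fare in USD
fare_pred = reg_best.predict(new_order_clean)
print(f'Predicted fare: ${fare_pred[0]:.2f}')
```

Because the same fitted imputers, encoder, and scaler are applied, the new order is mapped into exactly the same 7-dimensional feature space the model was trained on.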