Logistic Regression Workflow

import pandas as pd
import numpy as np

data = pd.read_csv('diabetes.csv')
data
 | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome
---|---|---|---|---|---|---|---|---|---
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
768 rows × 9 columns
# Load the data
data = pd.read_csv("diabetes.csv")
data = data[["Glucose", "BloodPressure", "Outcome"]]
data["Outcome"] = data["Outcome"].apply(lambda x: "diabetes" if x == 1 else "not diabetes")

data.head()
 | Glucose | BloodPressure | Outcome
---|---|---|---
0 | 148 | 72 | diabetes |
1 | 85 | 66 | not diabetes |
2 | 183 | 64 | diabetes |
3 | 89 | 66 | not diabetes |
4 | 137 | 40 | diabetes |
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize the data
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))

sns.scatterplot(x = "Glucose",
                y = "BloodPressure",
                hue = "Outcome",
                data = data,
                ax = ax)

plt.show()
# Drop rows where Glucose == 0 or BloodPressure == 0 (zeros here stand in for missing values)
condition_1 = data["Glucose"] > 0
condition_2 = data["BloodPressure"] > 0

data = data[condition_1 & condition_2]
# Visualize the filtered data
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))

sns.scatterplot(x = "Glucose",
                y = "BloodPressure",
                hue = "Outcome",
                data = data,
                ax = ax)

plt.show()
"Outcome"].value_counts(normalize = True) data[
Outcome
not diabetes 0.656593
diabetes 0.343407
Name: proportion, dtype: float64
# Build the input & output
X = data.drop(columns = "Outcome")
y = data["Outcome"]
# Split train & test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    stratify = y,  # stratified sampling because y is imbalanced
                                                    random_state = 123)

y_train.value_counts(), y_train.value_counts(normalize = True), y_test.value_counts(), y_test.value_counts(normalize = True)
(Outcome
not diabetes 382
diabetes 200
Name: count, dtype: int64,
Outcome
not diabetes 0.656357
diabetes 0.343643
Name: proportion, dtype: float64,
Outcome
not diabetes 96
diabetes 50
Name: count, dtype: int64,
Outcome
not diabetes 0.657534
diabetes 0.342466
Name: proportion, dtype: float64)
# Baseline model
# A DummyClassifier makes (random) guesses based on the class proportions
from sklearn.dummy import DummyClassifier

# Create the object
dummy_clf = DummyClassifier(strategy = "most_frequent")  # most_frequent = always guess the most common class

# Fit on the training data (only y_train actually matters here)
dummy_clf.fit(X = X_train,
              y = y_train)
DummyClassifier(strategy='most_frequent')
# Predict
y_pred_dummy = dummy_clf.predict(X_train)
# Show the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true = y_train,
                 y_pred = y_pred_dummy)
array([[ 0, 200],
[ 0, 382]])
# Visualize the confusion matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Build the confusion matrix
cm = confusion_matrix(y_true = y_train,
                      y_pred = y_pred_dummy)

# Build the figure
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
plt.show()
# Check which class is treated as positive
dummy_clf.classes_[1]  # not diabetes, this is class 1 (positive)
'not diabetes'
dummy_clf.classes_[0]  # diabetes, this is class 0 (negative)
'diabetes'
# Compute the score
from sklearn.metrics import accuracy_score

accuracy_score(y_true = y_train,
               y_pred = y_pred_dummy)
# Equals the proportion of the majority class
0.6563573883161512
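As a sanity check, the same number can be read off without building predictions by hand; a minimal sketch using the estimator's built-in score method:

# score() reports mean accuracy, which for the "most_frequent"
# strategy is exactly the majority-class proportion
print(dummy_clf.score(X_train, y_train))             # ~0.6564
print(y_train.value_counts(normalize = True).max())  # same value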
# DummyClassifier with the stratified strategy
dummy_clf = DummyClassifier(strategy = "stratified",
                            random_state = 123)

# Fit on the training data (only y_train actually matters here)
dummy_clf.fit(X = X_train,
              y = y_train)
DummyClassifier(random_state=123, strategy='stratified')
# Predict
y_pred_dummy = dummy_clf.predict(X_train)

# Show the confusion matrix
confusion_matrix(y_true = y_train,
                 y_pred = y_pred_dummy)
array([[ 66, 134],
[128, 254]])
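The stratified baseline lands near the expected accuracy of proportional random guessing, which is p^2 + (1 - p)^2 for a positive-class share p. A quick check, assuming p is roughly 0.344 as above:

# Expected accuracy of guessing in proportion to the class shares
p = 0.344
print(p**2 + (1 - p)**2)  # ~0.55
print((66 + 254) / 582)   # observed: ~0.5498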
How to Convert Categorical Data to Numeric
- Possible values: {not diabetes, diabetes}
- Strategy: label encoding, i.e. map not diabetes -> 0, diabetes -> 1
# Convert y_train & y_test
y_train = y_train.apply(lambda x: 1 if x == "diabetes" else 0)
y_test = y_test.apply(lambda x: 1 if x == "diabetes" else 0)
y_train.value_counts(normalize = True)
Outcome
0 0.656357
1 0.343643
Name: proportion, dtype: float64
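For reference, the same label encoding can be written without apply(); a minimal, self-contained sketch (names here are illustrative) using an explicit mapping dict:

import pandas as pd

# An explicit dict makes the 0/1 assignment unambiguous
label_map = {"not diabetes": 0, "diabetes": 1}
labels = pd.Series(["diabetes", "not diabetes", "diabetes"])
print(labels.map(label_map).tolist())  # [1, 0, 1]

Note that sklearn's LabelEncoder would sort the classes alphabetically and map diabetes -> 0, the opposite of the convention used here.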
from sklearn.preprocessing import StandardScaler

# Create the scaler and fit it on the training data only
scaler = StandardScaler()
scaler.fit(X_train)

# Transform the train & test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Import the library
from sklearn.linear_model import LogisticRegression

# Build the class weights
n_samples = len(y_train)
n_classes = len(y_train.value_counts())
n_samples_j = y_train.value_counts()

class_weight = n_samples / (n_classes * n_samples_j)

class_weight
# The minority class (1) gets the larger weight
Outcome
0 0.76178
1 1.45500
Name: count, dtype: float64
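This n_samples / (n_classes * n_samples_j) formula is the same heuristic scikit-learn implements for class_weight="balanced", so the weights can be cross-checked (or replaced entirely by passing class_weight="balanced" to the model):

from sklearn.utils.class_weight import compute_class_weight

# Should reproduce the values above, ~[0.7618, 1.4550]
weights = compute_class_weight("balanced", classes = np.unique(y_train), y = y_train)
print(weights)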
# Create the object
logreg = LogisticRegression(class_weight = dict(class_weight),
                            solver = "liblinear",
                            random_state = 123)
# Run the hyperparameter search
from sklearn.model_selection import GridSearchCV

search_params = {"penalty": ["l1", "l2"],
                 "C": np.logspace(-5, 5, 20)}

logreg_cv = GridSearchCV(estimator = logreg,
                         param_grid = search_params,
                         cv = 5)
# Fit the data
logreg_cv.fit(X = X_train_scaled,
              y = y_train)
GridSearchCV(cv=5, estimator=LogisticRegression(class_weight={0: np.float64(0.7617801047120419), 1: np.float64(1.455)}, random_state=123, solver='liblinear'), param_grid={'C': array([1.00000000e-05, 3.35981829e-05, 1.12883789e-04, 3.79269019e-04, 1.27427499e-03, 4.28133240e-03, 1.43844989e-02, 4.83293024e-02, 1.62377674e-01, 5.45559478e-01, 1.83298071e+00, 6.15848211e+00, 2.06913808e+01, 6.95192796e+01, 2.33572147e+02, 7.84759970e+02, 2.63665090e+03, 8.85866790e+03, 2.97635144e+04, 1.00000000e+05]), 'penalty': ['l1', 'l2']})
# Best params
logreg_cv.best_params_
{'C': np.float64(0.5455594781168515), 'penalty': 'l1'}
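Rebuilding the model from best_params_ is one option; since GridSearchCV refits the best configuration on the full training set by default (refit=True), the tuned model is also available directly:

# Equivalent to the manual rebuild below
best_model = logreg_cv.best_estimator_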
# Build the best model
logreg = LogisticRegression(penalty = logreg_cv.best_params_["penalty"],
                            C = logreg_cv.best_params_["C"],
                            class_weight = dict(class_weight),
                            solver = "liblinear",
                            random_state = 123)

# Fit the model
logreg.fit(X_train_scaled, y_train)
LogisticRegression(C=np.float64(0.5455594781168515), class_weight={0: np.float64(0.7617801047120419), 1: np.float64(1.455)}, penalty='l1', random_state=123, solver='liblinear')
# Predict probabilities
y_pred_train_proba = logreg_cv.predict_proba(X_train_scaled)

y_pred_train_proba
# Each row is [1 - P(y = 1), P(y = 1)]
array([[0.05962135, 0.94037865],
[0.85225979, 0.14774021],
[0.66425899, 0.33574101],
...,
[0.23035483, 0.76964517],
[0.91901121, 0.08098879],
[0.88771835, 0.11228165]], shape=(582, 2))
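For a binary model, predict() is equivalent to thresholding the positive-class probability at 0.5 (up to ties exactly at 0.5); a quick check:

# Manual thresholding reproduces predict()
manual_pred = (y_pred_train_proba[:, 1] >= 0.5).astype(int)
print(np.array_equal(manual_pred, logreg_cv.predict(X_train_scaled)))  # True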
y_pred_train = logreg.predict(X_train_scaled)
y_pred_train
array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,
0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
1, 1, 0, 1, 0, 0, 1, 1, 0, 0])
# Show the confusion matrix
confusion_matrix(y_true = y_train,
                 y_pred = y_pred_train)
array([[287, 95],
[ 66, 134]])
# Figure
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
sns.heatmap(confusion_matrix(y_true = y_train,
                             y_pred = y_pred_train),
            annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
plt.show()
# Classification report
from sklearn.metrics import classification_report

print(classification_report(y_true = y_train,
                            y_pred = y_pred_train,
                            target_names = ["not diabetes", "diabetes"]))
precision recall f1-score support
not diabetes 0.81 0.75 0.78 382
diabetes 0.59 0.67 0.62 200
accuracy 0.72 582
macro avg 0.70 0.71 0.70 582
weighted avg 0.73 0.72 0.73 582
y_pred_test = logreg.predict(X_test_scaled)

# Show the confusion matrix
confusion_matrix(y_true = y_test,
                 y_pred = y_pred_test)
array([[66, 30],
[15, 35]])
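The headline metrics can be read straight off this matrix (rows are true labels, columns are predictions); a worked check:

# tn, fp / fn, tp from the matrix above
tn, fp, fn, tp = 66, 30, 15, 35
print((tp + tn) / (tn + fp + fn + tp))  # accuracy  ~0.6918
print(tp / (tp + fn))                   # recall     0.7000
print(tp / (tp + fp))                   # precision ~0.5385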
# ROC curve
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

y_pred_test_proba = logreg.predict_proba(X_test_scaled)
fpr_lr, tpr_lr, threshold_lr = roc_curve(y_test,
                                         y_pred_test_proba[:, 1])
roc_auc_lr = auc(fpr_lr, tpr_lr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr_lr, tpr_lr, 'b', label = 'AUC = %0.2f' % roc_auc_lr)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
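The same AUC value can be obtained in one call, without building the curve first:

from sklearn.metrics import roc_auc_score

# Equivalent to auc(fpr_lr, tpr_lr) above
print(roc_auc_score(y_test, y_pred_test_proba[:, 1]))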
# Logistic regression coefficients/slopes
logreg.coef_
array([[1.19139131, 0.21313067]])
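Because the inputs were standardized, each coefficient is the change in log-odds per one standard deviation of that feature; exponentiating turns them into odds ratios:

# Odds ratios per 1 SD of [Glucose, BloodPressure]
print(np.exp(logreg.coef_))  # ~[[3.29, 1.24]]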
# Threshold tuning for the logistic regression model
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, f1_score, accuracy_score, recall_score, precision_score, confusion_matrix, classification_report
# Get the predicted probabilities
y_pred_test_proba = logreg.predict_proba(X_test_scaled)
y_scores = y_pred_test_proba[:, 1]  # probabilities for the positive class (diabetes)

# Compute precision and recall across thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
# Compute the F1 score at each threshold
f1_scores = []
for i in range(len(precision)):
    if precision[i] + recall[i] > 0:  # avoid division by zero
        f1 = 2 * (precision[i] * recall[i]) / (precision[i] + recall[i])
        f1_scores.append(f1)
    else:
        f1_scores.append(0)
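The loop can also be vectorized over the whole precision/recall arrays; a sketch that guards against zero denominators with np.divide:

# Vectorized F1 over all thresholds
denom = precision + recall
f1_scores_vec = np.divide(2 * precision * recall, denom,
                          out = np.zeros_like(denom), where = denom > 0)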
# Plot precision, recall, and F1-score vs. threshold
plt.figure(figsize=(10, 6))
plt.plot(thresholds, precision[:-1], 'b--', label='Precision')
plt.plot(thresholds, recall[:-1], 'g-', label='Recall')
plt.plot(thresholds, f1_scores[:-1], 'r-.', label='F1 Score')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision, Recall, and F1-score vs. Threshold')
plt.legend()
plt.grid(True)
plt.show()
# Find the threshold with the highest F1-score
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]
best_f1 = f1_scores[optimal_idx]

print(f"Default threshold: 0.5")
print(f"Optimal threshold by F1-score: {optimal_threshold:.4f}")
print(f"Optimal F1-score: {best_f1:.4f}")
# Evaluate the model at the optimal threshold
y_pred_test_optimal = (y_scores >= optimal_threshold).astype(int)

# Compare metrics at the default and optimal thresholds
default_pred = logreg.predict(X_test_scaled)
print("\n--- Metric Comparison ---")
print(f"Accuracy (threshold=0.5): {accuracy_score(y_test, default_pred):.4f}")
print(f"Accuracy (threshold={optimal_threshold:.4f}): {accuracy_score(y_test, y_pred_test_optimal):.4f}")
print(f"Recall (threshold=0.5): {recall_score(y_test, default_pred):.4f}")
print(f"Recall (threshold={optimal_threshold:.4f}): {recall_score(y_test, y_pred_test_optimal):.4f}")
print(f"Precision (threshold=0.5): {precision_score(y_test, default_pred):.4f}")
print(f"Precision (threshold={optimal_threshold:.4f}): {precision_score(y_test, y_pred_test_optimal):.4f}")
# Show the confusion matrices at the default and optimal thresholds
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

sns.heatmap(confusion_matrix(y_test, default_pred), annot=True, fmt="d", cmap="Blues", ax=ax1)
ax1.set_xlabel("Predicted")
ax1.set_ylabel("True")
ax1.set_title("Confusion Matrix (Threshold=0.5)")

sns.heatmap(confusion_matrix(y_test, y_pred_test_optimal), annot=True, fmt="d", cmap="Blues", ax=ax2)
ax2.set_xlabel("Predicted")
ax2.set_ylabel("True")
ax2.set_title(f"Confusion Matrix (Threshold={optimal_threshold:.4f})")

plt.tight_layout()
plt.show()
# Show the classification reports at both thresholds
print("\n--- Classification Report with Default Threshold (0.5) ---")
print(classification_report(y_test, default_pred, target_names=["not diabetes", "diabetes"]))
print("\n--- Classification Report with Optimal Threshold ---")
print(classification_report(y_test, y_pred_test_optimal, target_names=["not diabetes", "diabetes"]))
Default threshold: 0.5
Optimal threshold by F1-score: 0.3072
Optimal F1-score: 0.6486

--- Metric Comparison ---
Accuracy (threshold=0.5): 0.6918
Accuracy (threshold=0.3072): 0.6438
Recall (threshold=0.5): 0.7000
Recall (threshold=0.3072): 0.9600
Precision (threshold=0.5): 0.5385
Precision (threshold=0.3072): 0.4898
--- Classification Report with Default Threshold (0.5) ---
precision recall f1-score support
not diabetes 0.81 0.69 0.75 96
diabetes 0.54 0.70 0.61 50
accuracy 0.69 146
macro avg 0.68 0.69 0.68 146
weighted avg 0.72 0.69 0.70 146
--- Classification Report with Optimal Threshold ---
precision recall f1-score support
not diabetes 0.96 0.48 0.64 96
diabetes 0.49 0.96 0.65 50
accuracy 0.64 146
macro avg 0.72 0.72 0.64 146
weighted avg 0.80 0.64 0.64 146
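To actually use the tuned cutoff at prediction time, a small helper (hypothetical name) can stand in for the default predict():

# Apply a custom decision threshold instead of the default 0.5
def predict_with_threshold(model, X, threshold):
    return (model.predict_proba(X)[:, 1] >= threshold).astype(int)

y_pred_tuned = predict_with_threshold(logreg, X_test_scaled, optimal_threshold)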