
2. Tree based model CODE [4] Hyperparameter Tuning / Threshold

by EXUPERY 2021. 2. 9.

Hyperparameter Tuning / Threshold

Tree based model CODE 

 

 


 

1. RandomizedSearchCV

# RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

Model_xx_rcv = make_pipeline(SimpleImputer(), 
                             RandomForestClassifier(criterion='entropy',
                                                    n_jobs=-1, 
                                                    random_state=1000, 
                                                    oob_score=True,
                                                    class_weight="balanced")
  )

dists = {
    "randomforestclassifier__min_samples_leaf": [None, 9, 10, 11],
    "randomforestclassifier__max_depth": [18, 20, 22], 
    "randomforestclassifier__max_features": [0.3,0.4,0.5]
}

xx_rcv = RandomizedSearchCV(
    Model_xx_rcv, 
    param_distributions=dists, 
    n_iter=25, 
    cv=3, 
    scoring="f1",  
    verbose=1,
    n_jobs=-1
)

xx_rcv.fit(X_train_xx, y_train_xx); # fit the search on the training data

# Check the results
print("Best hyperparameters")
print(xx_rcv.best_params_)
print("f1 score: ", xx_rcv.best_score_)

 

 

2. GridSearchCV

# GridSearchCV
from sklearn.model_selection import GridSearchCV

Model_xx_gcv = make_pipeline(SimpleImputer(),
                             RandomForestClassifier(criterion='entropy',
                                                    n_jobs=-1, 
                                                    min_samples_leaf=9, 
                                                    random_state=1000, 
                                                    oob_score=True,
                                                    class_weight="balanced")
  )

dists = {
    "randomforestclassifier__max_depth": [19,20,21],
    "randomforestclassifier__max_features": ["auto", "sqrt", "log2", 0.4],
    "randomforestclassifier__min_samples_split": [2,5,10]
}

xx_gcv = GridSearchCV(Model_xx_gcv, 
                      param_grid=dists, 
                      cv=3, 
                      scoring="f1",  
                      verbose=1
)
xx_gcv.fit(X_train_xx, y_train_xx);

# Check the results
print("Best hyperparameters")
print(xx_gcv.best_params_)
print("f1 score: ", xx_gcv.best_score_)

 

3. ROC Curve

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# predicted probabilities of the positive class
# (lr, dtr and Final_Model_4 are models fitted earlier; ns_probs is a no-skill baseline)
ns_probs = [0 for _ in range(len(y_val_simp))]

lr_probs = lr.predict_proba(X_val_simp)
lr_probs = lr_probs[:, 1]

dtr_probs = dtr.predict_proba(X_val_simp)
dtr_probs = dtr_probs[:, 1]

rf_probs = Final_Model_4.predict_proba(X_val_simp)
rf_probs = rf_probs[:, 1]

#auc
ns_auc = roc_auc_score(y_val_simp, ns_probs)
lr_auc = roc_auc_score(y_val_simp, lr_probs)
dtr_auc = roc_auc_score(y_val_simp, dtr_probs)
rf_auc = roc_auc_score(y_val_simp, rf_probs)

#curve
ns_fpr, ns_tpr, _ = roc_curve(y_val_simp, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_val_simp, lr_probs)
dtr_fpr, dtr_tpr, _ = roc_curve(y_val_simp, dtr_probs)
rf_fpr, rf_tpr, thresholds = roc_curve(y_val_simp, rf_probs)

#visualize
print('Baseline: ROC AUC=%.2f' % (ns_auc))
print('Logistic: ROC AUC=%.2f' % (lr_auc))
print('Decision Tree: ROC AUC=%.2f' % (dtr_auc))
print('Random Forest: ROC AUC=%.2f' % (rf_auc))
plt.figure(figsize=(8,8))
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='Baseline')
plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
plt.plot(dtr_fpr, dtr_tpr, marker='.', label='Decision Tree')
plt.plot(rf_fpr, rf_tpr, marker='.', label='Random Forest')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

 

4. Threshold

# Threshold: pick the ROC point that maximizes Youden's J statistic (TPR - FPR)
import numpy as np

optimal_idx = np.argmax(rf_tpr - rf_fpr)
optimal_threshold = thresholds[optimal_idx]

print('idx:', optimal_idx, ', threshold:', optimal_threshold)
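
np.argmax(rf_tpr - rf_fpr) picks the ROC threshold with the largest Youden's J statistic (TPR minus FPR). Since the searches above optimize f1, another option is to choose the threshold that maximizes F1 directly from the precision-recall curve; a minimal sketch:

from sklearn.metrics import precision_recall_curve

# precision and recall have one more entry than thresholds, so drop the last point
precision, recall, pr_thresholds = precision_recall_curve(y_val_simp, rf_probs)
f1_per_threshold = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-12)

best_f1_idx = np.argmax(f1_per_threshold)
print('F1-optimal threshold:', pr_thresholds[best_f1_idx], ', F1:', f1_per_threshold[best_f1_idx])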

 

5. Plot distribution with threshold
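
The df_prob DataFrame used below is not constructed in this post. One way to build it, assuming target holds the name of the label column, is a two-column DataFrame with the true label and the random forest's positive-class probability (a sketch, not the author's exact code):

import pandas as pd

# hypothetical construction of df_prob: first column = true label,
# second column = predicted probability of the positive class
df_prob = pd.DataFrame({target: np.asarray(y_val_simp), "proba": rf_probs})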

# Distribution of predicted probabilities per class, with the chosen threshold marked
# (df_prob: true label in the column named by `target`, positive-class probability in the second column)
plt.figure(figsize=(15, 6))
plt.hist(df_prob[df_prob[target] == 0].iloc[:, 1], bins=200, alpha=0.8, label=target + str(': 0'))
plt.hist(df_prob[df_prob[target] == 1].iloc[:, 1], bins=200, alpha=0.8, label=target + str(': 1'))
plt.axvline(optimal_threshold, color="red")
plt.grid()
plt.legend()
plt.show()

 

6. Classification report

# Compare classification reports: optimal threshold vs. the default 0.5
from sklearn.metrics import f1_score, classification_report

y_pred_optimal = rf_probs >= optimal_threshold
print('★ threshold = optimal ★\n')
print('Validation f1 score : ', f1_score(y_val_simp, y_pred_optimal))
print(classification_report(y_val_simp, y_pred_optimal))

y_pred_5 = rf_probs >= 0.5
print('-' * 100)
print('★ threshold = 0.5 ★\n')
print('Validation f1 score : ', f1_score(y_val_simp, y_pred_5))
print(classification_report(y_val_simp, y_pred_5))
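
To see where the two thresholds actually differ, the confusion matrices can be compared as well, reusing the predictions computed above:

from sklearn.metrics import confusion_matrix

# rows = true class, columns = predicted class
print('Confusion matrix @ optimal threshold:')
print(confusion_matrix(y_val_simp, y_pred_optimal))
print('Confusion matrix @ threshold 0.5:')
print(confusion_matrix(y_val_simp, y_pred_5))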