Hyperparameter Tuning / Threshold
Tree based model CODE
1. RandomizedSearchCV
# RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

Model_xx_rcv = make_pipeline(
    SimpleImputer(),
    RandomForestClassifier(criterion='entropy',
                           n_jobs=-1,
                           random_state=1000,
                           oob_score=True,
                           class_weight="balanced")
)
dists = {
    "randomforestclassifier__min_samples_leaf": [9, 10, 11],  # None is not a valid min_samples_leaf
    "randomforestclassifier__max_depth": [18, 20, 22],
    "randomforestclassifier__max_features": [0.3, 0.4, 0.5]
}
xx_rcv = RandomizedSearchCV(
    Model_xx_rcv,
    param_distributions=dists,
    n_iter=25,
    cv=3,
    scoring="f1",
    verbose=1,
    n_jobs=-1
)
xx_rcv.fit(X_train_xx, y_train_xx); # fit
# Check the results
print("Best hyperparameters")
print(xx_rcv.best_params_)
print("f1 score: ", xx_rcv.best_score_)
2. GridSearchCV
from sklearn.model_selection import GridSearchCV

Model_xx_gcv = make_pipeline(
    SimpleImputer(),
    RandomForestClassifier(criterion='entropy',
                           n_jobs=-1,
                           min_samples_leaf=9,
                           random_state=1000,
                           oob_score=True,
                           class_weight="balanced")
)
dists = {
    "randomforestclassifier__max_depth": [19, 20, 21],
    "randomforestclassifier__max_features": ["sqrt", "log2", 0.4],  # "auto" was an alias for "sqrt" and is removed in recent scikit-learn
    "randomforestclassifier__min_samples_split": [2, 5, 10]
}
xx_gcv = GridSearchCV(Model_xx_gcv,
                      param_grid=dists,
                      cv=3,
                      scoring="f1",
                      verbose=1)
xx_gcv.fit(X_train_xx, y_train_xx);
# Check the results
print("Best hyperparameters")
print(xx_gcv.best_params_)
print("f1 score: ", xx_gcv.best_score_)
3. ROC Curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import numpy as np

# Positive-class probabilities (lr, dtr and Final_Model_4 are models fit in earlier posts of this series)
ns_probs = [0 for _ in range(len(y_val_simp))]  # no-skill baseline: always predict 0
lr_probs = lr.predict_proba(X_val_simp)[:, 1]
dtr_probs = dtr.predict_proba(X_val_simp)[:, 1]
rf_probs = Final_Model_4.predict_proba(X_val_simp)[:, 1]
# AUC
ns_auc = roc_auc_score(y_val_simp, ns_probs)
lr_auc = roc_auc_score(y_val_simp, lr_probs)
dtr_auc = roc_auc_score(y_val_simp, dtr_probs)
rf_auc = roc_auc_score(y_val_simp, rf_probs)

# ROC curves (the random forest thresholds are reused for threshold tuning below)
ns_fpr, ns_tpr, _ = roc_curve(y_val_simp, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_val_simp, lr_probs)
dtr_fpr, dtr_tpr, _ = roc_curve(y_val_simp, dtr_probs)
rf_fpr, rf_tpr, thresholds = roc_curve(y_val_simp, rf_probs)
# Report AUC and plot the ROC curves
print('Baseline: ROC AUC=%.2f' % (ns_auc))
print('Logistic: ROC AUC=%.2f' % (lr_auc))
print('Decision Tree: ROC AUC=%.2f' % (dtr_auc))
print('Random Forest: ROC AUC=%.2f' % (rf_auc))
plt.figure(figsize=(8,8))
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='Baseline')
plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
plt.plot(dtr_fpr, dtr_tpr, marker='.', label='Decision Tree')
plt.plot(rf_fpr, rf_tpr, marker='.', label='Random Forest')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
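As a side note, scikit-learn 1.0+ can draw the same kind of curve in a single call; a sketch for the random forest alone:

from sklearn.metrics import RocCurveDisplay

RocCurveDisplay.from_predictions(y_val_simp, rf_probs, name='Random Forest')
plt.show()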
4. Threshold
# Threshold: maximize Youden's J statistic (TPR - FPR) over the ROC thresholds
optimal_idx = np.argmax(rf_tpr - rf_fpr)
optimal_threshold = thresholds[optimal_idx]
print('idx:', optimal_idx, ', threshold:', optimal_threshold)
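np.argmax(rf_tpr - rf_fpr) picks the ROC point that maximizes Youden's J statistic, i.e. the point farthest above the no-skill diagonal. Since both searches above were scored on f1, an alternative is to pick the threshold that maximizes f1 directly; a sketch using the precision-recall curve:

from sklearn.metrics import precision_recall_curve

precision, recall, pr_thresholds = precision_recall_curve(y_val_simp, rf_probs)
# precision/recall have one more entry than pr_thresholds, so drop the last point
f1_scores = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-12)
print('F1-optimal threshold:', pr_thresholds[np.argmax(f1_scores)])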
5. Plot distribution with threshold
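df_prob is used below but never built in this post; judging by the indexing, it is a DataFrame whose first column is the true label and whose second column is the predicted positive-class probability. A minimal sketch of one way to construct it from the variables above (the 'prob' column name is hypothetical; target is the label column name):

import pandas as pd

df_prob = pd.DataFrame({target: np.asarray(y_val_simp), 'prob': rf_probs})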
# Class-wise probability distributions with the chosen threshold
plt.figure(figsize=(15, 6))
plt.hist(df_prob[df_prob[target] == 0].iloc[:, 1], bins=200, alpha=0.8, label=target + ': 0')
plt.hist(df_prob[df_prob[target] == 1].iloc[:, 1], bins=200, alpha=0.8, label=target + ': 1')
plt.axvline(optimal_threshold, color="red")
plt.grid(); plt.legend(); plt.show()
6. Classification report
# Compare classification reports across thresholds
from sklearn.metrics import f1_score, classification_report

y_pred_optimal = rf_probs >= optimal_threshold
print('★ threshold = optimal ★\n')
print('Validation f1 score: ', f1_score(y_val_simp, y_pred_optimal))
print(classification_report(y_val_simp, y_pred_optimal))

y_pred_5 = rf_probs >= 0.5
print('-' * 100)
print('★ threshold = 0.5 ★\n')
print('Validation f1 score: ', f1_score(y_val_simp, y_pred_5))
print(classification_report(y_val_simp, y_pred_5))
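One caveat: predict() always cuts binary probabilities at 0.5, so the tuned threshold has to be applied by hand at inference time. A small hypothetical helper (not from this post):

def predict_with_threshold(model, X, threshold):
    """Binary predictions from predict_proba at a custom cut-off."""
    return (model.predict_proba(X)[:, 1] >= threshold).astype(int)

y_pred = predict_with_threshold(Final_Model_4, X_val_simp, optimal_threshold)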