Linear Regression CODE: Modeling with Multivariate, Polynomial, Ridge, Lasso
0. Reference (Baseline)
## Simple Regression Reference Model (Mean or Median)
# Visualization to find the baseline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(15, 5))
plt.hist(df.target, bins=100, color='blue', alpha=0.5)
plt.axvline(df.target.mean(), color='red')     # mean
plt.axvline(df.target.median(), color='navy')  # median
plt.xlabel('target')
plt.title('Histogram of Price')
plt.grid()
plt.show()
# Baseline
pred_median = [df.target.median()]*len(y_test)
print(df.target.median(),len(pred_median))
# Visualization
plt.xlabel('feature')
plt.ylabel('target')
plt.grid()
plt.scatter(x=df['feature'], y=df['target'], alpha=0.3, label='data')
plt.axhline(df.target.median(), color='red', label='baseline (median)')
plt.legend()
plt.show()
1. The Error of the Baseline
from sklearn.metrics import mean_absolute_error, mean_squared_error

def print_errors(y_true, y_hat):
    mae = mean_absolute_error(y_true, y_hat)  # Mean Absolute Error: mean of the absolute errors
    mse = mean_squared_error(y_true, y_hat)   # Mean Squared Error: mean of the squared errors
    rmse = np.sqrt(mse)                       # Root Mean Squared Error: square root of the MSE
    print(f'mae : {mae:.2f}')
    print(f'rmse : {rmse:.2f}')
    print(f'mse : {mse:.2f}')

print_errors(y_test, pred_median)
# These are the baseline's errors: the benchmark every model below is measured against.
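For reference, the three metrics printed above follow the standard definitions (n is the number of samples, ŷ the prediction):

$$\mathrm{MAE}=\frac{1}{n}\sum_{i=1}^{n}\lvert y_i-\hat{y}_i\rvert \qquad \mathrm{MSE}=\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2 \qquad \mathrm{RMSE}=\sqrt{\mathrm{MSE}}$$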
2. Significance Testing of Variables
import statsmodels.api as sm
ols = sm.OLS(y_train, sm.add_constant(X_train))  # run a linear regression (add_constant supplies the intercept)
result = ols.fit()
result.summary()
# the summary's t-tests assess the significance of each variable
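As a follow-up, a minimal sketch for pulling out of the fitted `result` the variables whose p-values clear a 5% threshold (the cutoff is a convention, not part of the original summary):

# variables significant at the 5% level
significant = result.pvalues[result.pvalues < 0.05]
print(significant.sort_values())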
3. Multicollinearity Diagnosis
## Build the VIF table
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif["features"] = X_train.columns
vif["VIF Factor"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif
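A common rule of thumb flags VIF above 10 as serious multicollinearity; a minimal sketch for listing such columns from the table above (the cutoff of 10 is a convention):

# features whose VIF exceeds the conventional cutoff of 10
high_vif = vif[vif["VIF Factor"] > 10]["features"].tolist()
print(high_vif)  # candidates to drop or combine before refitting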
## Correlation Coefficients ##
# continuous vs. ordinal (point-biserial)
from scipy.stats import pointbiserialr
X_train[['cont_var_1', 'cont_var_2', 'cont_var_3', 'cont_var_4', 'ord_var']]  # preview the columns of interest (placeholder names)
df_pbs = pd.DataFrame()
for col in ['cont_var_1', 'cont_var_2', 'cont_var_3', 'cont_var_4']:
    df_pbs[col] = pointbiserialr(X_train['ord_var'], X_train[col])
df_pbs = df_pbs.T
df_pbs.columns = ['correlation', 'pvalue']
# continuous vs. continuous (Pearson)
df_pearson = X_train[['cont_var_1', 'cont_var_2', 'cont_var_3', 'cont_var_4']].corr(method='pearson')
# Heatmap
fig,axes = plt.subplots(1,2,figsize=(14,7))
sns.heatmap(df_pearson,ax=axes[0] ,cmap='Blues', vmax=.3,annot = True,square=True, linewidths=.5)
axes[0].set_title('Pearson Correlation',fontsize=20)
sns.heatmap(df_pbs[['correlation']],ax=axes[1] ,cmap='Blues', vmax=.3,annot = True,square=True, linewidths=.5)
axes[1].set_title('Point-Biserial Correlation',fontsize=20)
plt.show
4. Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on the training set only
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
print(f'mean : {X_train_scaled.mean():.0f}\nstd  : {X_train_scaled.std():.0f}')
5. Polynomial Features
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=3)  # all interaction and power terms up to degree 3
X_poly_train = poly_features.fit_transform(X_train)
X_poly_test = poly_features.transform(X_test)
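degree=3 grows the feature space quickly; a minimal sketch for inspecting what was generated (get_feature_names_out assumes scikit-learn >= 1.0):

# how many terms the degree-3 expansion produced, and a sample of them
print(X_poly_train.shape)                          # (n_samples, n_poly_features)
print(poly_features.get_feature_names_out()[:10])  # '1', 'x0', 'x0 x1', 'x0^2', ...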
5.5. Hyperparameter Tuning
## libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
## set the lambda (alpha) range and run
alpha = np.linspace(0, 20, 100)
ridge_alpha = []
for a in alpha:
    ridge = Ridge(alpha=a)
    # RMSE on the hold-out test set
    ridge.fit(X_train_K, y_train)
    y_test_pred = ridge.predict(X_test_K)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    # RMSE using 10-fold CV
    kf = KFold(10)
    xval_err = 0
    for train, test in kf.split(X_train_K):
        ridge.fit(X_train_K[train], y_train.values[train])
        p = ridge.predict(X_train_K[test])
        err = p - y_train.values[test]
        xval_err += np.dot(err, err)
    rmse_10cv = np.sqrt(xval_err / len(X_train_K))  # 10-fold CV RMSE
    ridge_alpha.append([a, rmse_test, rmse_10cv])   # collect the RMSEs for this alpha

ridge_lam = pd.DataFrame(ridge_alpha, columns=['lambda', 'RMSE', 'RMSE_10cv'])
ridge_lam  # show the table
## Visualization
plt.figure(figsize=(15,5))
sns.scatterplot(x='lambda',y='RMSE',data=ridge_lam)
plt.grid()
plt.show()
plt.figure(figsize=(15,5))
sns.scatterplot(x='lambda',y='RMSE_10cv',data=ridge_lam)
plt.grid()
plt.show()
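Rather than reading the elbow off the plots by eye, the best lambda can be taken straight from the `ridge_lam` table built above; a minimal sketch:

# alpha with the lowest 10-fold CV RMSE
best_row = ridge_lam.loc[ridge_lam['RMSE_10cv'].idxmin()]
best_alpha = best_row['lambda']
print(f"best alpha: {best_alpha:.3f}, CV RMSE: {best_row['RMSE_10cv']:.3f}")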
6. Building Models
## Regression
# LinearRegression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,y_train)
lr.predict(X_test)
# Ridge (linear regression + L2 penalty)
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=best_alpha)  # best_alpha: the value tuned in section 5.5
ridge.fit(X_train, y_train)
ridge.predict(X_test)
# RidgeCV
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=np.linspace(0.1, 20, 100), cv=5)  # normalize= was deprecated and later removed; scale the data beforehand
ridge_cv.fit(X_train, y_train)
ridge_cv.predict(X_test)
# Lasso (linear regression + L1 penalty)
from sklearn.linear_model import Lasso
las = Lasso(alpha=best_alpha)  # best_alpha: the tuned hyperparameter
las.fit(X_train, y_train)
las.predict(X_test)
# LassoCV
from sklearn.linear_model import LassoCV
las_cv = LassoCV(cv=5)  # picks alpha from its own grid; normalize= was deprecated and later removed
las_cv.fit(X_train, y_train)
las_cv.predict(X_test)
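Because the L1 penalty drives some coefficients exactly to zero, Lasso doubles as a feature selector; a minimal sketch (assuming `X_train` is a DataFrame and `las_cv` is the model fitted above):

# count and name the features LassoCV eliminated
zeroed = X_train.columns[las_cv.coef_ == 0]
print(f"{len(zeroed)} features zeroed out:", list(zeroed))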
## Classification
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()  # note the parentheses: instantiate before fitting
logreg.fit(X_train,y_train)
logreg.predict(X_test)
# Logistic Regression CV
from sklearn.linear_model import LogisticRegressionCV
logreg_cv = LogisticRegressionCV(cv=10, random_state=1)
logreg_cv.fit(X_train, y_train)
logreg_cv.predict(X_test)
# SVC
from sklearn.svm import SVC
support_vm = SVC(kernel='rbf')  # kernel: 'linear', 'poly', 'rbf', 'sigmoid', or 'precomputed'
support_vm.fit(X_train, y_train)
support_vm.predict(X_test)
# Nu-SVC
from sklearn.svm import NuSVC
nu_svm = NuSVC(kernel='rbf')  # kernel: 'linear', 'poly', 'rbf', 'sigmoid', or 'precomputed'
nu_svm.fit(X_train, y_train)
nu_svm.predict(X_test)
# LinearSVC
from sklearn.svm import LinearSVC
linearSV = LinearSVC(penalty='l2')  # penalty: 'l1' or 'l2'
linearSV.fit(X_train, y_train)
linearSV.predict(X_test)
# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier  # NearestNeighbors is unsupervised and has no predict()
K_nrst = KNeighborsClassifier(n_neighbors=5)  # n_neighbors: int, default=5
K_nrst.fit(X_train, y_train)
K_nrst.predict(X_test)
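The default n_neighbors=5 is rarely optimal; a minimal sketch for choosing k by cross-validation (the candidate list is illustrative):

from sklearn.model_selection import cross_val_score

# mean 5-fold CV accuracy for each candidate k
for k in [3, 5, 7, 9, 11]:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5)
    print(f"k={k}: {scores.mean():.3f}")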
7. Accuracy
# using accuracy_score
from sklearn.metrics import accuracy_score
print("training accuracy: ", accuracy_score(y_train, y_train_pred))
print("validation accuracy: ", accuracy_score(y_val, y_val_pred))
print("test accuracy: ", accuracy_score(y_test, y_test_pred))
# using model.score
print("training accuracy: ", model.score(X_train, y_train))
print("validation accuracy: ", model.score(X_val, y_val))
## Confusion Matrix (Classification)
tn, fp, fn, tp = metrics.confusion_matrix(y_test, logistic_cv.predict(X_test_scaled)).ravel()
cnf_metrix= pd.DataFrame([[tp,fp],[fn,tn]])
cnf_metrix.columns = [['Actual_Pos','Actual_Neg']]
cnf_metrix.index = [['Pred_Pos','Pred_Neg']]
print(tn,fp,fn,tp)
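From the four cells above, the usual classification metrics follow directly (standard textbook definitions):

precision = tp / (tp + fp)  # of predicted positives, how many are truly positive
recall = tp / (tp + fn)     # of actual positives, how many were caught
f1 = 2 * precision * recall / (precision + recall)
print(f"precision: {precision:.3f}, recall: {recall:.3f}, F1: {f1:.3f}")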
## ROC curve (Classification)
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
lr_probs = logreg_cv.predict_proba(X_test_scaled)
lr_probs = lr_probs[:, 1]
ns_probs = [0 for _ in range(len(y_test))]
lr_auc = roc_auc_score(y_test, lr_probs)
ns_auc = roc_auc_score(y_test, ns_probs)
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
print('No Skill: ROC AUC=%.2f' % (ns_auc))
print('Logistic: ROC AUC=%.2f' % (lr_auc))
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, marker='.', label='LogisticCV')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
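The ROC arrays can also suggest a decision threshold; a minimal sketch using Youden's J statistic (TPR - FPR), re-running roc_curve to keep its third return value this time:

# threshold that maximizes TPR - FPR (Youden's J)
lr_fpr, lr_tpr, thresholds = roc_curve(y_test, lr_probs)
best_idx = np.argmax(lr_tpr - lr_fpr)
print(f"best threshold: {thresholds[best_idx]:.3f}")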
8. Coefficient
## Build the Coefficient Table
df_coef = pd.DataFrame(data=model.coef_.reshape(1, -1), columns=X.columns, index=['coef']).T  # reshape handles both 1-D and 2-D coef_
df_coef.sort_values('coef')
## Write Out the Equation
print("log(p/(1-p)) = ", end='')
print(f"\t {model.intercept_[0]:+05f}  (intercept, b0)")
for i, index in enumerate(df_coef.index, start=1):
    print(f"\t\t {df_coef.loc[index, 'coef']:+05f} * {index}  (b{i})")
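To turn a log-odds value back into a probability, invert with the logistic function; a minimal sketch (the choice of the first test row is illustrative):

z = model.intercept_[0] + X_test.iloc[0] @ model.coef_[0]  # log-odds for one (illustrative) row
p = 1 / (1 + np.exp(-z))  # since log(p/(1-p)) = z, p = 1/(1 + e^(-z))
print(f"predicted probability: {p:.3f}")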
## Visualization
coef_sorted = df_coef.sort_values('coef', ascending=False).reset_index()
sns.barplot(y=coef_sorted['index'], x=coef_sorted['coef'], orient='h')