
1. Linear Regression CODE [2] Modeling

by EXUPERY 2021. 2. 3.

 

 Modeling

with Multivariate, Polynomial, Ridge, Lasso

Linear Regression CODE 

 

 


 

0. Reference (Baseline)

## Simple Regression Reference Model (Mean or Median)

# Common imports used throughout this post
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Visualization to find the baseline
plt.figure(figsize=(15, 5))
plt.hist(df.target, bins=100, color='blue', alpha=0.5)
plt.axvline(df.target.mean(), color='red')
plt.axvline(df.target.median(), color='navy')
plt.xlabel('target')
plt.title('Histogram of Price')
plt.grid()
plt.show()

# Baseline
pred_median = [df.target.median()]*len(y_test)
print(df.target.median(),len(pred_median))

# Visualization
plt.xlabel('feature')
plt.ylabel('target')
plt.grid()
plt.scatter(x=df['feature'], y=df['target'], alpha=0.3, label='data')
plt.axhline(df.target.median(), color='red', label='baseline (median)')
plt.legend()
plt.show()
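The heading above says mean or median; the mean variant of the baseline is built the same way:

# The mean works the same way as an alternative baseline
pred_mean = [df.target.mean()] * len(y_test)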

 

1. The Error of the Baseline

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

def print_errors(y_true, y_hat):
  y_true, y_hat = np.asarray(y_true), np.asarray(y_hat)  # accept plain lists too
  mae = mean_absolute_error(y_true, y_hat)
  mse = mean_squared_error(y_true, y_hat)
  rmse = np.sqrt(mse)
  print(f'mae : {mae:.2f}')   # Mean Absolute Error: average of the absolute errors
  print(f'rmse : {rmse:.2f}') # Root Mean Squared Error: square root of the MSE
  print(f'mse : {mse:.2f}')   # Mean Squared Error: average of the squared errors

print_errors(y_test, pred_median)
# These are the baseline's errors; every model below is judged against them
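For reference, with $y_i$ the true values and $\hat{y}_i$ the predictions over $n$ observations, the three metrics are:

$$\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n} |y_i - \hat{y}_i|, \qquad \mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n} (y_i - \hat{y}_i)^2, \qquad \mathrm{RMSE} = \sqrt{\mathrm{MSE}}$$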

 

2. Significance Testing of the Variables

import statsmodels.api as sm
ols = sm.OLS(y_train, X_train)  # fit a linear regression
result = ols.fit()
result.summary()
# the t-tests in the summary assess the significance of each variable
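One caveat worth a sketch: sm.OLS does not add an intercept on its own, so the fit above has no constant term. A minimal sketch, assuming X_train is a DataFrame:

# Add an intercept column explicitly, then flag insignificant features
X_train_const = sm.add_constant(X_train)
result = sm.OLS(y_train, X_train_const).fit()
print(result.pvalues[result.pvalues > 0.05])  # p > 0.05: candidates to drop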

 

 

3. Multicollinearity Diagnosis

## Build the VIF table
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif["features"] = X_train.columns
vif["VIF Factor"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif
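A quick follow-up to the table, using the common (but not universal) rule of thumb that a VIF above 10 signals strong multicollinearity:

# Flag features whose VIF exceeds 10
high_vif = vif.loc[vif['VIF Factor'] > 10, 'features'].tolist()
print('High-VIF features:', high_vif)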
## Correlation Coefficients ##

# Continuous vs. ordinal
from scipy.stats import pointbiserialr
# 'cont_var1'..'cont_var4' and 'ord_var' are placeholder column names
cont_vars = ['cont_var1', 'cont_var2', 'cont_var3', 'cont_var4']

df_pbs = pd.DataFrame()
for col in cont_vars:
  df_pbs[col] = pointbiserialr(X_train['ord_var'], X_train[col])
df_pbs = df_pbs.T
df_pbs.columns = ['correlation', 'pvalue']


# Continuous vs. continuous
df_pearson = X_train[cont_vars].corr(method='pearson')


# Heatmap
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
sns.heatmap(df_pearson, ax=axes[0], cmap='Blues', vmax=.3, annot=True, square=True, linewidths=.5)
axes[0].set_title('Pearson Correlation', fontsize=20)
sns.heatmap(df_pbs[['correlation']], ax=axes[1], cmap='Blues', vmax=.3, annot=True, square=True, linewidths=.5)
axes[1].set_title('Point-Biserial Correlation', fontsize=20)
plt.show()

 

4. Standardization

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
print(f'mean : {X_train_scaled.mean():.0f}\nstd : {X_train_scaled.std():.0f}')  # should be 0 and 1

 

5. Polynomial Features

 

from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=3)
X_poly_train = poly_features.fit_transform(X_train)
X_poly_test = poly_features.transform(X_test)
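degree=3 expands the feature space quickly, so it is worth checking what was generated (get_feature_names_out requires scikit-learn 1.0+; older versions use get_feature_names):

# How many features did degree=3 create, and what do they look like?
print(X_poly_train.shape)
print(poly_features.get_feature_names_out()[:10])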

 

5.5. Hyperparameter Tuning

## libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

## set the lambda (alpha) range and run
alpha = np.linspace(0, 20, 100)
ridge_alpha = []

for a in alpha:
    ridge = Ridge(alpha=a)

    # RMSE on the held-out test set
    ridge.fit(X_train_K, y_train)
    y_test_pred = ridge.predict(X_test_K)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

    # RMSE using 10-fold CV on the training set
    kf = KFold(10)
    xval_err = 0
    for train, test in kf.split(X_train_K):
        ridge.fit(X_train_K[train], y_train.values[train])
        p = ridge.predict(X_train_K[test])
        err = p - y_train.values[test]
        xval_err += np.dot(err, err)
    rmse_10cv = np.sqrt(xval_err / len(X_train_K))  # 10-fold CV RMSE

    ridge_alpha.append([a, rmse_test, rmse_10cv])  # collect both RMSEs

ridge_lam = pd.DataFrame(ridge_alpha)
ridge_lam.columns = ['lambda', 'RMSE', 'RMSE_10cv']
ridge_lam  # show the table

## Visualization
plt.figure(figsize=(15,5))
sns.scatterplot(x='lambda',y='RMSE',data=ridge_lam)
plt.grid()
plt.show()

plt.figure(figsize=(15,5))
sns.scatterplot(x='lambda',y='RMSE_10cv',data=ridge_lam)
plt.grid()
plt.show()
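Rather than reading the best lambda off the plots, it can be pulled straight from the table:

# Row with the lowest cross-validated RMSE
best = ridge_lam.loc[ridge_lam['RMSE_10cv'].idxmin()]
print(f"best lambda: {best['lambda']:.2f} (RMSE_10cv: {best['RMSE_10cv']:.4f})")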

 

 

6. Building the Models

## Regression
# LinearRegression (ordinary linear regression)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.predict(X_test)

# Ridge Regression (linear regression + L2 penalty)
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0)  # alpha is the regularization hyperparameter
ridge.fit(X_train, y_train)
ridge.predict(X_test)

# RidgeCV (picks alpha by cross-validation)
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(cv=5)  # note: normalize=True was removed from recent scikit-learn; scale features beforehand
ridge_cv.fit(X_train, y_train)
ridge_cv.predict(X_test)
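After fitting, the alpha that cross-validation selected is stored on the estimator:

print(ridge_cv.alpha_)  # alpha chosen by cross-validation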

# Lasso Regression (linear regression + L1 penalty)
from sklearn.linear_model import Lasso
las = Lasso(alpha=1.0)  # alpha is the regularization hyperparameter
las.fit(X_train, y_train)
las.predict(X_test)


# LassoCV (picks alpha by cross-validation)
from sklearn.linear_model import LassoCV
las_cv = LassoCV(cv=5)  # as with RidgeCV, scale features beforehand instead of normalize=True
las_cv.fit(X_train, y_train)
las_cv.predict(X_test)
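Unlike Ridge, Lasso can zero out coefficients entirely, which is easy to verify:

# Count the features Lasso kept (non-zero coefficients)
print('non-zero coefficients:', np.sum(las_cv.coef_ != 0))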

## Classification
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg.predict(X_test)

# Logistic Regression CV
from sklearn.linear_model import LogisticRegressionCV
logreg_cv = LogisticRegressionCV(cv=10, random_state=1)
logreg_cv.fit(X_train, y_train)
logreg_cv.predict(X_test)

# SVC
from sklearn.svm import SVC
support_vm = SVC(kernel='linear')  # kernel: 'linear', 'poly', 'rbf', 'sigmoid', or 'precomputed'
support_vm.fit(X_train, y_train)
support_vm.predict(X_test)

# Nu-SVC
from sklearn.svm import NuSVC
nu_svm = NuSVC(kernel='rbf')  # same kernel options as SVC
nu_svm.fit(X_train, y_train)
nu_svm.predict(X_test)

# LinearSVC
from sklearn.svm import LinearSVC
linearSV = LinearSVC(penalty='l2')  # penalty: 'l1' or 'l2'
linearSV.fit(X_train, y_train)
linearSV.predict(X_test)

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier  # NearestNeighbors is unsupervised and has no predict()
K_nrst = KNeighborsClassifier(n_neighbors=5)  # n_neighbors defaults to 5
K_nrst.fit(X_train, y_train)
K_nrst.predict(X_test)
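A minimal sketch tying this back to section 1: reuse print_errors to compare the fitted regressors against the baseline errors (assumes lr, ridge, and las were fit as above):

# Compare each regressor's test error with the baseline from section 1
for name, m in [('Linear', lr), ('Ridge', ridge), ('Lasso', las)]:
    print(f'--- {name} ---')
    print_errors(y_test, m.predict(X_test))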

 

7. Accuracy

# Using accuracy_score
from sklearn.metrics import accuracy_score
print("training accuracy: ", accuracy_score(y_train, y_train_pred))
print("validation accuracy: ", accuracy_score(y_val, y_val_pred))
print("test accuracy: ", accuracy_score(y_test, y_test_pred))

# Using the model's score method
print("training accuracy: ", model.score(X_train, y_train))
print("validation accuracy: ", model.score(X_val, y_val))
## Confusion Matrix (Classification)
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, logreg_cv.predict(X_test_scaled)).ravel()
cnf_matrix = pd.DataFrame([[tp, fp], [fn, tn]])
cnf_matrix.columns = ['Actual_Pos', 'Actual_Neg']
cnf_matrix.index = ['Pred_Pos', 'Pred_Neg']
print(tn, fp, fn, tp)
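The four counts above give the usual derived metrics directly:

# Precision and recall from the confusion-matrix counts
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(f'precision: {precision:.3f}, recall: {recall:.3f}')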

## ROC curve (Classification)
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

lr_probs = logreg_cv.predict_proba(X_test_scaled)
lr_probs = lr_probs[:, 1]
ns_probs = [0 for _ in range(len(y_test))]

lr_auc = roc_auc_score(y_test, lr_probs)
ns_auc = roc_auc_score(y_test, ns_probs)

ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

print('No Skill: ROC AUC=%.2f' % (ns_auc))
print('Logistic: ROC AUC=%.2f' % (lr_auc))

plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, marker='.', label='LogisticCV')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

 

8. Coefficient

## Build the coefficient table
df_coef = pd.DataFrame(data=model.coef_, columns=X.columns, index=['coef']).T
df_coef.sort_values('coef')

## Write out the equation (logistic model: log-odds as a linear combination)
print("log (p/1-p) = ", end='')
print(f"\t {model.intercept_[0]:+05f}")  # intercept (b0)
for i, index in enumerate(df_coef.index, start=1):
  print(f"\t\t {df_coef.loc[index,'coef']:+05f} * x{i} ({index})")
  
## Visualization
df_sorted = df_coef.sort_values('coef', ascending=False).reset_index()
sns.barplot(y=df_sorted['index'], x=df_sorted['coef'], orient='h')
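Since the equation above is a log-odds model, exponentiating a coefficient gives an odds ratio, which is often easier to read than the raw coefficient:

# exp(coef) = odds ratio per one-unit increase in the feature
df_coef['odds_ratio'] = np.exp(df_coef['coef'])
df_coef.sort_values('odds_ratio', ascending=False)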

 

 

 
