반응형
Preparing
Applied Predictive Modeling
1. Package
!pip install category_encoders
!pip install PublicDataReader
!pip install PublicDataReader --upgrade
!pip install finance-datareader
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import FinanceDataReader as fdr
import PublicDataReader as pdr
import pandas as pd
import numpy as np
import warnings
# Notebook-wide display configuration: silence warnings and set matplotlib /
# seaborn defaults used by every figure below.

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Use a Nanum font family (installed by the fonts-nanum apt step above).
plt.rc('font', family='NanumBarunGothic')
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus.
mpl.rc('axes', unicode_minus=False)
# Apply seaborn's theme, overriding its font family with the Nanum font.
sns.set(rc={'font.family':'NanumBarunGothic'})
# NOTE(review): this replaces the 'NanumBarunGothic' family configured on the
# three lines above with 'nanummyeongjo' -- confirm which font is intended.
plt.rcParams["font.family"] = 'nanummyeongjo'
# Show grid lines on all axes.
plt.rcParams["axes.grid"] = True
# Default figure size (width, height) in inches.
plt.rcParams["figure.figsize"] = (12,6)
# Do not factor a common offset out of tick labels.
plt.rcParams["axes.formatter.useoffset"] = False
# Same setting as the mpl.rc('axes', unicode_minus=False) call above, repeated
# after sns.set().
plt.rcParams['axes.unicode_minus'] = False
# Very wide exponent limits; presumably meant to keep tick labels out of
# scientific notation -- TODO confirm.
plt.rcParams["axes.formatter.limits"] = -10000, 10000
%matplotlib inline
%config InlineBackend.figure_format='retina'
2. Load Data
# Read the submission template and the train / test tables from /content.
sample_submission, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/sampleSubmission.csv",
        "/content/train.csv",
        "/content/test.csv",
    )
)
# Column name kept for later use (presumably the prediction label).
target = "count"
# Bare expression: shows the training table when run as a notebook cell.
train
3. Preprocessing
# Quick sanity checks on the raw tables. Bare expressions are notebook
# cell outputs.

# Column dtypes and non-null counts.
train.info()

# Missing values: does any column of train / test contain NaN?
train.isna().sum().any(), test.isna().sum().any()

# Zeros: for every column that contains at least one 0, record how many.
zeros = pd.DataFrame()
for col in train.columns:
    is_zero = train[col] == 0
    # Explicit "contains a zero" test instead of comparing the truthiness
    # reduction `.all()` against False.
    if is_zero.any():
        zeros[col] = train[col][is_zero].value_counts()
zeros

# Duplicates: transposing makes `duplicated` compare columns, so this reports
# whether two columns hold identical values.
# NOTE(review): if duplicate *rows* were intended, use train.duplicated().any().
train.T.duplicated().any()
#Preprocessing
def preprocessing(df):
    """Add calendar features derived from ``datetime`` and cast categoricals.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``datetime`` formatted as
        ``"YYYY-MM-DD HH:MM:SS"`` plus the columns ``season``, ``holiday``,
        ``workingday`` and ``weather``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``year``, ``month``, ``date``, ``weekday`` and
        ``hour`` added (all as strings cast to ``category``), the four
        existing columns above cast to ``category``, and ``datetime`` removed.
    """
    # Imported locally: the visible setup cell never imports these names, so
    # the weekday lookup below would otherwise fail with a NameError.
    import calendar
    from datetime import datetime

    # Split the timestamp once: [date part, time part].
    stamp_parts = df["datetime"].str.split(' ')
    date_part = stamp_parts.str[0]
    ymd = date_part.str.split('-')

    df["year"] = ymd.str[0]
    df["month"] = ymd.str[1]
    df["date"] = date_part
    # calendar.day_name yields the weekday name for the active locale.
    df["weekday"] = date_part.apply(
        lambda day: calendar.day_name[datetime.strptime(day, "%Y-%m-%d").weekday()]
    )
    # First two characters of the time part, e.g. "23".
    df["hour"] = stamp_parts.str[1].str[0:2]

    # Cast to category. `datetime` is dropped right after, so it is not cast.
    category_var = ["year", "month", "date", "hour", "weekday",
                    "season", "holiday", "workingday", "weather"]
    for var in category_var:
        df[var] = df[var].astype("category")

    df.drop(columns="datetime", inplace=True)
    return df
4. Data Summary
# Box plot for every numeric (non-category) column, one panel per column.
numeric_cols = finding.columns[finding.dtypes != "category"]
# Size the grid from the data instead of hard-coding 7 panels, so adding or
# removing a numeric column cannot overflow or leave empty axes.
# squeeze=False keeps `axes` an array even when there is a single column.
fig, axes = plt.subplots(1, len(numeric_cols), figsize=(20, 4), squeeze=False)
axes = axes.ravel()
for ax, column in zip(axes, numeric_cols):
    sns.boxplot(data=finding[column], ax=ax)
    ax.set_title(column)
5. Correlation Analysis
## Numeric features
# Heat map of the pairwise correlations; only the lower triangle (including
# the diagonal) is drawn.
corrMatt = finding[["temp","atemp","casual","registered","humidity","windspeed","count"]].corr()
# Explicit boolean mask: True hides a cell. The previous mask reused the
# correlation values themselves, so an upper-triangle correlation of exactly
# 0.0 would have been treated as "not masked" and drawn.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sns.heatmap(corrMatt, mask=mask, vmax=.8, square=True, annot=True, cmap="coolwarm", ax=ax);
# Scatter plots of `count` against three numeric features, coloured by season.
fig, axes = plt.subplots(1, 3, figsize=(30, 5))
for panel, feature in zip(axes, ("temp", "windspeed", "humidity")):
    sns.scatterplot(x=feature, y="count", hue="season", data=finding, ax=panel, alpha=0.4)
## 범주형
from scipy.stats import chi2_contingency
def cramers_V(var1, var2):
    """Return Cramér's V, a 0..1 measure of association between two categoricals.

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))), where chi2 is the
    chi-squared statistic of the contingency table of ``var1`` vs ``var2``
    and n is the number of observations.

    Parameters
    ----------
    var1, var2 : array-like
        Equal-length categorical observations accepted by ``pandas.crosstab``.

    Returns
    -------
    float
        Cramér's V, or NaN when either variable has a single level (the
        statistic is undefined for a one-row or one-column table).

    Notes
    -----
    ``chi2_contingency`` is called with its defaults, so Yates' continuity
    correction is applied to 2x2 tables.
    """
    # Contingency table of observed counts.
    crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))
    # Chi-squared test statistic.
    stat = chi2_contingency(crosstab)[0]
    # Total number of observations.
    obs = np.sum(crosstab)
    # Smaller table dimension minus one (the normalising factor).
    mini = min(crosstab.shape) - 1
    if mini == 0:
        # Degenerate table: avoid the 0/0 division and its runtime warning.
        return float("nan")
    # The square root was missing, so the function returned V squared.
    return np.sqrt(stat / (obs * mini))
# Pairwise association matrix over every category-typed column, filled with
# the value returned by cramers_V rounded to two decimals.
category_col = finding.columns[finding.dtypes == "category"]
rows = [
    [round(cramers_V(finding[row_var], finding[col_var]), 2) for col_var in category_col]
    for row_var in category_col
]
cramers_results = np.array(rows)
category_cor = pd.DataFrame(cramers_results, columns=category_col, index=category_col)

# Annotated heat map of the matrix.
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sns.heatmap(category_cor, center=0, square=True, annot=True, cmap="coolwarm");
6. Feature Engineering & Wrangling
# concat: check the shapes, then join the stock frames side by side.
tuple(frame.shape for frame in (stock_kr, stock_us, stock_wo, stock_fu, stock_ex))
df = pd.concat(
    [stock_kr, stock_us, stock_wo, stock_fu, stock_ex],
    axis=1,
)
df.sample(5)

# merge: join column-wise and drop the '거래년도' column.
df_final = pd.concat([df_today, df_merge], axis=1).drop(columns=['거래년도'])
df_final.sample(5)
# engineering
def engineering(df):
    """Build the modelling frame: calendar features, categoricals, pruning.

    Columns are added to and removed from ``df`` in place. The returned frame
    is a row-filtered selection of it, so callers must use the return value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``datetime`` formatted as
        ``"YYYY-MM-DD HH:MM:SS"`` plus ``season``, ``holiday``,
        ``workingday`` and ``weather``. ``casual`` and ``registered`` are
        removed when present.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose ``weather`` is not 4, with ``year``, ``month``,
        ``weekday`` and ``hour`` added, and ``year``, ``month``, ``hour``,
        ``weekday``, ``holiday`` and ``weather`` cast to ``category``.
        ``datetime``, ``workingday``, ``season``, ``casual`` and
        ``registered`` are removed.
    """
    # Imported locally: the visible setup cell never imports these names, so
    # the weekday lookup below would otherwise fail with a NameError.
    import calendar
    from datetime import datetime

    # Split the timestamp once: [date part, time part].
    stamp_parts = df["datetime"].str.split(' ')
    date_part = stamp_parts.str[0]
    ymd = date_part.str.split('-')

    df["year"] = ymd.str[0]
    df["month"] = ymd.str[1]
    # calendar.day_name yields the weekday name for the active locale.
    df["weekday"] = date_part.apply(
        lambda day: calendar.day_name[datetime.strptime(day, "%Y-%m-%d").weekday()]
    )
    # First two characters of the time part, e.g. "23".
    df["hour"] = stamp_parts.str[1].str[0:2]

    # Cast only the columns that survive. The old code also created `date`
    # and cast `datetime`, `date`, `workingday` and `season`, then dropped
    # all four.
    category_var = ["year", "month", "hour", "weekday", "holiday", "weather"]
    for var in category_var:
        df[var] = df[var].astype("category")

    # Features to remove. These three are required, so a missing one still
    # raises KeyError.
    df.drop(columns=["datetime", "workingday", "season"], inplace=True)
    # `casual` / `registered` are optional, so skip them silently when absent.
    df.drop(columns=["casual", "registered"], errors="ignore", inplace=True)

    # Outliers: discard rows with weather == 4.
    # NOTE(review): this also shortens any frame used for prediction --
    # confirm that is intended before applying it to a test set.
    df = df[df["weather"] != 4]
    return df
반응형
'기본소양 > CODE' 카테고리의 다른 글
3. Applied Predictive Modeling [2] Importance (0) | 2021.02.18 |
---|---|
3. Applied Predictive Modeling [1] Modeling(Boost) (0) | 2021.02.18 |
2. Tree based model CODE [4] Hyperparameter Tuning / Threshold (0) | 2021.02.09 |
2. Tree based model CODE [3] Model Selection (0) | 2021.02.09 |
2. Tree based model CODE [2] Tree Model (0) | 2021.02.09 |
댓글