본문 바로가기
기본소양/CODE

3. Applied Predictive Modeling [0] Preparing

by EXUPERY 2021. 2. 18.
반응형

Preparing

Applied Predictive Modeling  

 

 


 

1. Package

# Notebook shell commands: install third-party Python packages with pip.
!pip install category_encoders
!pip install PublicDataReader
# Second pass with --upgrade so an already-installed PublicDataReader is updated.
!pip install PublicDataReader --upgrade
!pip install finance-datareader
# Install the Nanum font package and rebuild the system font cache.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
# Delete matplotlib's cache directory.
# NOTE(review): presumably so the newly installed fonts are re-scanned; a runtime
# restart may be needed afterwards — confirm.
!rm ~/.cache/matplotlib -rf
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import FinanceDataReader as fdr
import PublicDataReader as pdr
import pandas as pd
import numpy as np
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Select a Nanum font family for plot text.
plt.rc('font', family='NanumBarunGothic') 
# Render minus signs with the ASCII hyphen instead of the Unicode minus glyph.
mpl.rc('axes', unicode_minus=False)
# Apply the seaborn theme, passing the font family as an rc override.
sns.set(rc={'font.family':'NanumBarunGothic'})
# NOTE(review): this replaces the 'NanumBarunGothic' family set on the lines
# above with 'nanummyeongjo' — confirm which font is actually intended.
plt.rcParams["font.family"] = 'nanummyeongjo'
# Grid on by default, with a 12x6 inch default figure size.
plt.rcParams["axes.grid"] = True
plt.rcParams["figure.figsize"] = (12,6)
# Show full tick values instead of offset notation.
plt.rcParams["axes.formatter.useoffset"] = False
# Same unicode_minus setting as above, applied again after sns.set().
plt.rcParams['axes.unicode_minus'] = False
# Exponent thresholds outside of which tick labels switch to scientific notation.
plt.rcParams["axes.formatter.limits"] = -10000, 10000
# IPython magics: draw figures inline and use the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

 

2. Load Data

# Read the sample-submission, train and test CSV files from /content.
sample_submission=pd.read_csv("/content/sampleSubmission.csv")
train=pd.read_csv("/content/train.csv")
test=pd.read_csv("/content/test.csv")
# Column name kept for later use (presumably the prediction target — confirm).
target="count"
# Bare expression: displays the training frame as the notebook cell output.
train

 

3. Preprocessing

# Column dtypes, non-null counts, etc.
train.info()

# Missing values: does any column of train / test contain NaN?
train.isna().sum().any(), test.isna().sum().any()

# Zeros: for every column that contains at least one 0, record how many.
zeros = pd.DataFrame()
for col in train.columns:
    is_zero = train[col] == 0
    # Gate on the same mask that is counted. The previous `.all() == False`
    # test fired on any falsy value, which could assign an empty Series and
    # wipe the index of `zeros` for every later column.
    if is_zero.any():
        zeros[col] = train.loc[is_zero, col].value_counts()
zeros

# Duplicates: is any column an exact copy of an earlier column?
train.T.duplicated().any()

#Preprocessing
def preprocessing(df):
    """Derive calendar features from ``datetime`` and cast categoricals.

    The ``datetime`` column must hold strings with a ``%Y-%m-%d`` date and a
    time separated by a single space. New string columns ``year``, ``month``,
    ``date``, ``weekday`` (day name) and ``hour`` (first two characters of
    the time part) are added. Those columns plus ``season``, ``holiday``,
    ``workingday`` and ``weather`` are converted to ``category`` dtype, and
    ``datetime`` is dropped.

    Args:
        df: DataFrame with ``datetime``, ``season``, ``holiday``,
            ``workingday`` and ``weather`` columns. It is modified in place.

    Returns:
        The same (mutated) DataFrame.
    """
    # Imported locally: both names are needed by the weekday lambda below and
    # are not among the imports visible at the top of the notebook.
    import calendar
    from datetime import datetime

    # Split each timestamp once and reuse the pieces.
    stamp_parts = df.datetime.str.split(' ')
    date_part = stamp_parts.str[0]
    ymd = date_part.str.split('-')

    df["year"] = ymd.str[0]
    df["month"] = ymd.str[1]
    df["date"] = date_part
    df["weekday"] = date_part.apply(
        lambda day: calendar.day_name[datetime.strptime(day, "%Y-%m-%d").weekday()]
    )
    df["hour"] = stamp_parts.str[1].str[0:2]

    # Convert to categorical dtype.
    category_var = ["datetime", "year", "month", "date", "hour", 'weekday',
                    "season", "holiday", "workingday", "weather"]
    for var in category_var:
        df[var] = df[var].astype("category")

    df.drop(columns="datetime", inplace=True)
    return df

 

4. Data Summary

# Box plot of every non-category column, one panel per column (7 panels).
fig, axes = plt.subplots(1, 7, figsize=(20, 4))
numeric_columns = finding.columns[finding.dtypes != "category"]
for i, column in enumerate(numeric_columns):
    panel = axes[i]
    sns.boxplot(data=finding[column], ax=panel)
    panel.set_title(column)

 

5. Correlation Analysis

## 수치형

# Heatmap of the pairwise correlations between the numeric columns.
corrMatt = finding[["temp","atemp","casual","registered","humidity","windspeed","count"]].corr()
# Explicit boolean mask: True cells are hidden. Hide the strict upper triangle
# and keep the diagonal and lower triangle visible. Building the mask from the
# correlation values themselves would leave any exactly-zero cell unmasked.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sns.heatmap(corrMatt, mask=mask, vmax=.8, square=True, annot=True, cmap="coolwarm", ax=ax);

# Scatter plots of count against temp, windspeed and humidity, coloured by season.
fig, axes = plt.subplots(1, 3, figsize=(30, 5))
for panel, feature in zip(axes, ("temp", "windspeed", "humidity")):
    sns.scatterplot(x=feature, y="count", hue="season", data=finding, ax=panel, alpha=0.4)


## 범주형
from scipy.stats import chi2_contingency
def cramers_V(var1, var2):
    """Return Cramér's V, a 0..1 association measure for two categorical variables.

    V = sqrt(chi2 / (n * min(r - 1, c - 1))), where chi2 is the Pearson
    chi-square statistic of the r x c contingency table and n is the number
    of observations.

    Args:
        var1: Array-like of category labels.
        var2: Array-like of category labels, same length as ``var1``.

    Returns:
        Cramér's V as a float; NaN when either variable has only one level,
        because the statistic is undefined there.
    """
    # Contingency table of joint counts.
    crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))
    obs = np.sum(crosstab)  # number of observations
    mini = min(crosstab.shape) - 1  # min(rows, columns) - 1
    if mini == 0:
        # Single row or column: chi2 is 0 and the denominator is 0.
        return float("nan")
    # Yates' continuity correction is disabled so that 2x2 tables are not
    # biased downwards (a variable compared with itself must give exactly 1).
    stat = chi2_contingency(crosstab, correction=False)[0]
    # The square root turns the normalised chi2 (which is V squared) into V.
    return np.sqrt(stat / (obs * mini))
  
# Pairwise cramers_V score (rounded to 2 decimals) for every pair of
# category-typed columns, collected into a square labelled table.
category_col = finding.columns[finding.dtypes == "category"]
rows = [
    [round(cramers_V(finding[row_name], finding[col_name]), 2) for col_name in category_col]
    for row_name in category_col
]

cramers_results = np.array(rows)
category_cor = pd.DataFrame(cramers_results, index=category_col, columns=category_col)

# Annotated heatmap of the table.
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sns.heatmap(category_cor, center=0, square=True, annot=True, cmap="coolwarm");

 

6. Feature Engineering & Wrangling

# concat: check each frame's shape, then join the frames side by side.
stock_frames = [stock_kr, stock_us, stock_wo, stock_fu, stock_ex]
tuple(frame.shape for frame in stock_frames)  # size check
df = pd.concat(stock_frames, axis=1)
df.sample(5)

# merge: join column-wise, then drop the '거래년도' column.
df_final = pd.concat([df_today, df_merge], axis=1).drop(columns=['거래년도'])
df_final.sample(5)

# engineering
def engineering(df):
    """Build the model features from a raw frame and filter out weather == 4.

    The ``datetime`` column must hold strings with a ``%Y-%m-%d`` date and a
    time separated by a single space. ``year``, ``month``, ``weekday`` (day
    name) and ``hour`` are derived from it and cast to ``category`` together
    with ``holiday`` and ``weather``. The ``datetime``, ``date``,
    ``workingday`` and ``season`` columns are dropped, as are ``casual`` and
    ``registered`` when present. Rows whose ``weather`` equals 4 are removed.

    Args:
        df: DataFrame with ``datetime``, ``season``, ``holiday``,
            ``workingday`` and ``weather`` columns. Column additions and
            drops are applied to it in place.

    Returns:
        A row-filtered DataFrame (rows with ``weather == 4`` removed).
    """
    # Imported locally: both names are needed by the weekday lambda below and
    # are not among the imports visible at the top of the notebook.
    import calendar
    from datetime import datetime

    # Split each timestamp once and reuse the pieces.
    stamp_parts = df.datetime.str.split(' ')
    date_part = stamp_parts.str[0]
    ymd = date_part.str.split('-')

    df["year"] = ymd.str[0]
    df["month"] = ymd.str[1]
    df["date"] = date_part
    df["weekday"] = date_part.apply(
        lambda day: calendar.day_name[datetime.strptime(day, "%Y-%m-%d").weekday()]
    )
    df["hour"] = stamp_parts.str[1].str[0:2]

    # Convert to categorical dtype.
    category_var = ["datetime", "year", "month", "date", "hour", 'weekday',
                    "season", "holiday", "workingday", "weather"]
    for var in category_var:
        df[var] = df[var].astype("category")

    # Features to drop.
    df.drop(columns=["datetime", "date", "workingday", "season"], inplace=True)
    # casual / registered may be absent, so missing labels are ignored.
    df.drop(columns=["casual", "registered"], errors="ignore", inplace=True)

    # Outliers: discard rows where weather is 4.
    df = df[df["weather"] != 4]
    return df
반응형

댓글