본문 바로가기
기본소양/CODE

2. Tree based model CODE [0] 시작은 언제나 EDA

by EXUPERY 2021. 2. 9.
반응형

시작은 언제나 EDA

Tree based model CODE 

 

 


0. Data Description

항상 먼저 확인 할 것

 

1. Profiling

pip install -U pandas-profiling

# Import the profiling package before asking df for a report.
# ProfileReport itself is not referenced in this snippet; presumably the import
# is what makes DataFrame.profile_report() available - confirm.
# NOTE(review): the pandas-profiling project appears to have been renamed
# ydata-profiling upstream - confirm which package this environment installs.
from pandas_profiling import ProfileReport
# Build the profile report for df (df must already be defined by the caller).
df.profile_report()

 

2. Duplicated

# Transpose so each original column becomes a row, then ask whether any row
# repeats an earlier one: True means train has at least one duplicated COLUMN.
# (Duplicate rows would be checked with train.duplicated().any() instead.)
train.T.duplicated().any()

 

3. Missing Value (Bar로 나타내기)

import matplotlib.pyplot as plt
import seaborn as sns

# Count nulls per column and keep only the columns that actually have some.
missing = train.isnull().sum()
has_nulls = missing > 0
missing = missing[has_nulls]

# One row per affected column: 'index' holds the column name, 'missing' the
# null count, ordered from most to fewest missing values.
miss = pd.DataFrame(missing, columns=['missing']).reset_index()
miss = miss.sort_values(by='missing', ascending=False)
print(miss.columns)

# Horizontal bars: null count on x, column name on y.
plt.figure(figsize=(15, 10))
sns.barplot(x=miss['missing'], y=miss['index'], palette='Blues_r');

 

4.1 Correlation ( binary )

from scipy.stats import pointbiserialr
# Target-only heatmap: point-biserial correlation of every column with the target.
train_binary = train_binary.dropna()

# Each pointbiserialr result unpacks to (coefficient, p-value); store one pair
# per column, so the frame starts out with columns = features and two rows.
bi_cor = pd.DataFrame({
    bi: tuple(pointbiserialr(train_binary[bi], train_binary[target]))
    for bi in train_binary.columns
})

# Flip to one row per feature and label the two statistics.
bi_cor = bi_cor.T
bi_cor.columns = ['Correlation', 'p-value']

plt.figure(figsize=(5, 10))
sns.heatmap(
    bi_cor.sort_values('Correlation', ascending=True),
    cmap=sns.diverging_palette(20, 220, n=200),
    annot=True,
    fmt=".2f",
    center=0,
)
plt.title('biserial correlation');
# Full heatmap: point-biserial correlation between every pair of columns.
train_binary = train_binary.dropna()
binary_cols = list(train_binary.columns)

# Build the whole matrix in one pass instead of growing a frame with
# pd.concat inside the loop. Row label = first argument (bi), column label =
# second argument (vi). Only the coefficient (element 0 of the result) is
# kept; the p-value is discarded, as before.
train_binary_cor = pd.DataFrame(
    [
        [pointbiserialr(train_binary[bi], train_binary[vi])[0] for vi in binary_cols]
        for bi in binary_cols
    ],
    index=binary_cols,
    columns=binary_cols,
)

plt.figure(figsize=(10, 8))
sns.heatmap(
    train_binary_cor,
    cmap=sns.diverging_palette(20, 220, n=200),
    annot=True,
    fmt=".2f",
    center=0,
)
plt.title('biserial correlation');

4.2 Correlation ( Ordinal )

# Spearman and Kendall rank correlations for df_oe.
# Layout: Spearman matrix (top-left), Kendall matrix (bottom-left), and a tall
# right-hand panel comparing both coefficients against the target.
fig = plt.figure(figsize=(17, 15))
ax1 = plt.subplot2grid((2, 2), (0, 0))
ax2 = plt.subplot2grid((2, 2), (0, 1), rowspan=2)
ax3 = plt.subplot2grid((2, 2), (1, 0))
plt.subplots_adjust(wspace=0.4, hspace=0.6)

# Compute each correlation matrix once and reuse it for both the full heatmap
# and the target column (previously each matrix was computed twice).
spearman_cor = df_oe.corr(method='spearman')
kendall_cor = df_oe.corr(method='kendall')

df_oe_cor_sk = pd.concat([spearman_cor[target], kendall_cor[target]], axis=1)
df_oe_cor_sk.columns = ['spearman', 'kendall']

ax1.set_title('Spearman', fontsize=20)
ax3.set_title('Kendall', fontsize=20)
ax2.set_title('Correlation', fontsize=20)  # was misspelled 'Correaltion'

diverging_cmap = sns.diverging_palette(20, 220, n=200)
sns.heatmap(spearman_cor, cmap=diverging_cmap, annot=True, fmt=".2f", center=0, ax=ax1)
sns.heatmap(kendall_cor, cmap=diverging_cmap, annot=True, fmt=".2f", center=0, ax=ax3)
sns.heatmap(
    df_oe_cor_sk.sort_values('spearman', ascending=False),
    cmap=diverging_cmap,
    annot=True,
    fmt=".2f",
    center=0,
    ax=ax2,
);

 

4.3 Correlation ( Nominal )

# Nominal data: crosstab each feature against the target, then plot (for loop).
# The last two columns of train_norminal are skipped by the [:-2] slice.
train_norminal_ratio = pd.DataFrame()  # not populated anywhere in this snippet
for nor_col in train_norminal.columns[:-2]:
    plt.figure()
    df_cro = pd.crosstab(train_norminal[target], train_norminal[nor_col])
    # After transposing, rows are categories and columns are target classes:
    # ratio = count in the second class / (count in the first + second class).
    # NOTE(review): assumes a two-class target - confirm.
    df_cro = df_cro.T.iloc[:, 1] / (df_cro.T.iloc[:, 1] + df_cro.T.iloc[:, 0])
    df_cro = df_cro.sort_values(ascending=False)

    # Title every figure, not only the horizontal ones, so each plot can be
    # traced back to its column.
    plt.title(nor_col)
    # x/y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
    if df_cro.index.dtype == 'int64':
        sns.barplot(x=df_cro.index, y=df_cro, palette="Set2")
    else:
        sns.barplot(x=df_cro, y=df_cro.index, palette="Set2", orient="h")
# Nominal data: crosstab one feature against the target, then plot (single column).
nor_col = train_norminal.columns[-2]  # second-to-last column
plt.figure(figsize=(7, 10))
df_cro_state = pd.crosstab(train_norminal[target], train_norminal[nor_col])
# After transposing, rows are categories and columns are target classes:
# ratio = count in the second class / (count in the first + second class).
# NOTE(review): assumes a two-class target - confirm.
df_cro_state = df_cro_state.T.iloc[:, 1] / (df_cro_state.T.iloc[:, 1] + df_cro_state.T.iloc[:, 0])
df_cro_state = df_cro_state.sort_values(ascending=False)
plt.title(nor_col)
# x/y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
sns.barplot(x=df_cro_state, y=df_cro_state.index, palette="Set2", orient="h");

5.1 Heatmap

# A more polished version: lower-triangle Kendall correlation heatmap.
# plt.style is matplotlib.style, so no separate `style` import is required.
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# fall back to the old name on older releases.
poster_style = 'seaborn-v0_8-poster'
if poster_style not in plt.style.available:
    poster_style = 'seaborn-poster'
plt.style.use(poster_style)
sns.set_style('ticks')
plt.subplots(figsize=(17, 14))

# Compute the matrix once; it drives both the mask shape and the heatmap.
kendall_cor = train.corr(method='kendall')

# Mask: True cells are hidden. Hiding the upper triangle (diagonal included)
# removes the mirrored half of the symmetric matrix.
# The builtin bool replaces np.bool, which was removed in NumPy 1.24.
mask = np.triu(np.ones(kendall_cor.shape, dtype=bool))

sns.heatmap(
    kendall_cor,
    cmap=sns.diverging_palette(20, 220, n=200),
    annot=True,
    fmt=".2f",
    mask=mask,
    center=0,
);
plt.title("Heatmap of all the Features", fontsize=25);
# Quick version: correlations of df_train scaled by 100 and annotated
# without decimals.
plt.figure(figsize=(15, 12))
corr_percent = df_train.corr() * 100
diverging_colors = sns.diverging_palette(20, 220, n=200)
sns.heatmap(corr_percent, cmap=diverging_colors, annot=True, fmt=".0f", center=0)
plt.title('Correlation', fontsize=20);

5.2 Pairplot

# Household-size features, totalled per state and shown as a pair plot.
people_cols = ['marital', 'n_adult_r', 'n_people_r', 'household_children']
fe_people = all_df[people_cols + ['state']].copy()

# Row-wise total of the four columns; a NaN in any of them makes the sum NaN.
fe_people['sum_of_people'] = (
    fe_people['marital']
    + fe_people['n_adult_r']
    + fe_people['n_people_r']
    + fe_people['household_children']
)

# Drop incomplete rows, then add everything up within each state.
fe_people = fe_people.dropna().groupby('state').sum()
sns.pairplot(fe_people)

6. Cardinality

# Describe only the non-numeric columns, transpose so each column is a row,
# and order the rows by their 'unique' count (ascending = lowest cardinality first).
train.describe(exclude='number').T.sort_values(by='unique')
# Regression plot of vacc_h1n1_f against n_people_r.
# NOTE(review): msa_group is not defined anywhere in this snippet - confirm
# where it is built before running this line.
sns.regplot(x="n_people_r", y='vacc_h1n1_f', data=msa_group)

 

반응형

댓글