
Linear Algebra with Python (5. Dimensionality Reduction)

by EXUPERY 2021. 1. 19.

5.0 Dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()
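The seaborn copy of iris holds 150 samples with four numeric measurements and a species label. A quick check (not part of the original post) confirms the shape and the three classes before any preprocessing:

print(iris.shape)                  # (150, 5): four measurements plus the species column
print(iris['species'].unique())    # ['setosa' 'versicolor' 'virginica']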

5.1 PCA

Normalization

label = iris[['species']]              # keep the species labels for plotting later
iris = iris.drop('species', axis=1)    # numeric features only
x = iris.values
x = StandardScaler().fit_transform(x)  # standardize each feature to zero mean, unit variance
pca = PCA(n_components = 2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, columns = ['component 1', 'component 2'])
finalDataFrame = pd.concat([principalDf, label], axis=1)
print('PC1 explains', round(pca.explained_variance_ratio_[0]*100, 2), '% of the variance')
print('PC2 explains', round(pca.explained_variance_ratio_[1]*100, 2), '% of the variance')
finalDataFrame

original_data = sns.scatterplot(x=finalDataFrame['component 1'],
                                y=finalDataFrame['component 2'],
                                hue = 'species', data = finalDataFrame)
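Because this series approaches PCA from the linear-algebra side, it is worth checking that the same components fall out of an eigendecomposition of the covariance matrix. The sketch below is an illustration, not part of the original code; the projected coordinates may differ from sklearn's output by a sign flip per component.

cov = np.cov(x, rowvar=False)                  # covariance matrix of the standardized features
eigvals, eigvecs = np.linalg.eigh(cov)         # eigh: eigendecomposition for symmetric matrices
order = np.argsort(eigvals)[::-1]              # sort eigenvalues in descending order
eigvals, eigvecs = eigvals[order], eigvecs[:, order]
manual_pcs = x @ eigvecs[:, :2]                # project onto the top two eigenvectors
print(eigvals / eigvals.sum())                 # matches pca.explained_variance_ratio_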

Scree Plot

iris = sns.load_dataset('iris')
iris = iris.drop('species', axis=1)
x = iris.values
x = StandardScaler().fit_transform(x)
pca = PCA(n_components = None)               # keep all principal components
principalComponents = pca.fit_transform(x)

per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
labels_scree = ['PC' + str(i) for i in range(1, len(per_var)+1)]
per_var = pd.DataFrame(list(zip(per_var, labels_scree))).set_index(1)   # index rows by PC name
per_var.columns = ['PC']
plt.figure(figsize = (21,8))
plt.plot(per_var, linewidth=5, c='blue', alpha=0.4)        # explained variance per component
plt.bar(labels_scree, per_var['PC'], alpha=0.3)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
for label_a in labels_scree :
  plt.axvline(x=label_a, color='r', linestyle=':', linewidth=1)


count = 0
sum_pc = 0
scree_list = (pca.explained_variance_ratio_ * 100).tolist()
scree_sum = []                               # cumulative explained variance
for pc in scree_list :
  sum_pc += pc
  count += 1
  print('Cumulative explained variance up to PC', count, ':', sum_pc)
  scree_sum.append(sum_pc)
  if count == 10 :                           # stop after 10 components (iris only has 4)
    break
plt.plot(labels_scree[:10], scree_sum, linewidth=5, c='green', alpha=0.4)
plt.axhline(y=90, color='r', linestyle=':', linewidth=1)   # 90% explained-variance threshold
for labelsscree in labels_scree[:10] :
  plt.axvline(x=labelsscree, color='r', linestyle=':', linewidth=1)
plt.grid()
plt.show()
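The loop above is equivalent to a one-line cumulative sum with NumPy; the sketch below simply reproduces the same numbers.

scree_cumsum = np.cumsum(pca.explained_variance_ratio_ * 100)
print(scree_cumsum)    # same cumulative percentages the loop prints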

5.2 t-SNE

Normalization

iris = sns.load_dataset('iris')
iris = iris.drop('species', axis=1)          # numeric features only; labels stay in `label`

import warnings
warnings.filterwarnings("ignore")

df_tsne = StandardScaler().fit_transform(iris)   # standardize the four numeric features
df_tsne = pd.DataFrame(df_tsne)
df_tsne.columns = iris.columns.values
print(df_tsne.shape)
df_tsne.head()

 

Processing (learning rate=100)

from sklearn.manifold import TSNE as TS
from sklearn.metrics.cluster import adjusted_rand_score
df_tsne = StandardScaler().fit_transform(iris)
df_tsne = pd.DataFrame(df_tsne)
df_tsne.columns = iris.columns.values
model = TS(learning_rate=100)                # t-SNE with learning rate = 100
transformed = model.fit_transform(df_tsne)   # 2-D embedding (n_components defaults to 2)
df_tsne = pd.DataFrame(transformed)
df_tsne.columns = ['tsne1','tsne2']
finalDataFrame_tsne = pd.concat([df_tsne, label], axis = 1)
finalDataFrame_tsne

original_data_tsne = sns.scatterplot(x=finalDataFrame_tsne['tsne1'],
                                     y=finalDataFrame_tsne['tsne2'],
                                     hue = 'species', data = finalDataFrame_tsne)
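adjusted_rand_score was imported above but never used. One way to put it to work, sketched here as an addition rather than the original author's code, is to cluster the 2-D t-SNE embedding (for example with KMeans and an assumed k = 3) and compare the cluster assignments to the true species labels; a score near 1 means the embedding separates the species cleanly.

from sklearn.cluster import KMeans
clusters = KMeans(n_clusters=3, random_state=0).fit_predict(transformed)   # one cluster per species (assumed k = 3)
print(adjusted_rand_score(label['species'], clusters))                     # 1.0 = perfect agreement with the labels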
