본문 바로가기
기본소양/선형대수학

파이썬으로 하는 선형대수학 (6. Clustering)

by EXUPERY 2021. 1. 19.
반응형

6.1 K-Means

Dataset

# Load iris, drop the species column, and standardise the remaining
# feature columns with StandardScaler before clustering.
iris = sns.load_dataset('iris').drop(columns='species')
scaled_values = StandardScaler().fit_transform(iris)
# fit_transform returns a bare array, so the column names are re-attached here.
df_kmeans = pd.DataFrame(scaled_values, columns=iris.columns.values)
print(df_kmeans.shape)
df_kmeans.head()

Cluster & fit

# Fit a 3-cluster K-Means model on the standardised features.
# fit() returns the estimator itself, so construction and fitting can be chained.
kmeans = KMeans(n_clusters=3).fit(df_kmeans)
kmeans

Check labels & Predict

# The labels stored during fitting and a fresh predict() call both give
# one cluster id per row; print the shape of each.
for cluster_ids in (kmeans.labels_, kmeans.predict(df_kmeans)):
    print(cluster_ids.shape)

# Attach the assigned cluster id to every sample.
df_kmeans['cluster'] = kmeans.labels_
df_kmeans

Accuracy

# Pair each sample's true label (`label`, defined outside this snippet --
# presumably the iris species column) with the cluster id K-Means gave it.
label_new = kmeans.labels_
acc = pd.concat([label, pd.DataFrame(label_new)], axis=1)
acc.columns = ['label_origin', 'label_new']

# K-Means numbers its clusters arbitrarily, and the numbering can change from
# run to run, so a fixed "1 -> setosa" style table is only right by luck.
# Instead, name every cluster after the label that occurs most often in it.
cluster_to_species = (
    acc.groupby('label_new')['label_origin']
    .agg(lambda members: members.mode().iat[0])
)
acc['label_new'] = acc['label_new'].map(cluster_to_species)
acc.tail(100)

# Accuracy = share of rows whose mapped cluster name equals the true label.
# The ratio is computed once, after all rows have been counted, so that
# non-matching rows at the end of the frame are included in the result and
# `accuracy` is always defined (even when nothing matches).
matches = acc['label_origin'] == acc['label_new']
same = int(matches.sum())
total = len(acc)
accuracy = same / total if total else 0.0
print('accuracy : ',accuracy)
# NOTE: adjusted_rand_score(true_labels, cluster_ids) is a permutation-invariant
# alternative that needs no cluster-to-label mapping at all.

Elbow Method

# Elbow method
from sklearn.cluster import KMeans
# Fit K-Means for K = 1..39 and record the inertia (sum of squared distances
# to the closest centroid) of each fit.
k_values = range(1, 40)
square_distance = [
    KMeans(n_clusters=n_k).fit(points).inertia_ for n_k in k_values
]

# Plot inertia against K on a dark background; the annotation marks the bend.
plt.figure(figsize=(15,6))
elbow_ax = plt.gca()
elbow_ax.set_facecolor('#333333')
elbow_ax.set_xlim(0, 40)
elbow_ax.set_xticks(np.arange(0, 40, step=1))
elbow_ax.set_xlabel('Number of K')
elbow_ax.set_ylabel('Sum of squared distances')
elbow_ax.annotate('←This is Elbow',(4.5,26),fontsize = 18, color = 'yellow')
elbow_ax.set_title('Elbow method to find optimal K',fontsize=22)
elbow_ax.grid()

elbow_ax.plot(k_values, square_distance, 'o-',color='yellow')
plt.show();

Silhouette Method

from sklearn.metrics import silhouette_score

# Silhouette score for every candidate K in 2..39 (the score needs at least
# two clusters, hence the range starts at 2).
points_arr = np.array(points)  # convert once instead of twice per iteration
k_candidates = range(2, 40)

sil = []
for k in k_candidates:
    # Loop-local names on purpose: rebinding `kmeans` here would overwrite the
    # fitted 3-cluster model created in the K-Means section.
    k_labels = KMeans(n_clusters = k).fit(points_arr).labels_
    sil.append(silhouette_score(points_arr, k_labels, metric = 'euclidean'))

# Locate the best K so the annotation follows the data rather than the
# coordinates of one particular run.
best_idx = int(np.argmax(sil))
best_k = k_candidates[best_idx]
best_score = sil[best_idx]

plt.figure(figsize=(15,6))
plt.gca().set_facecolor('#333333')
plt.xticks(ticks=np.arange(2, 40, step=1))
plt.xlabel('Number of K')
plt.xlim(1,40)
plt.ylabel('Silhouette_score')
plt.title('Silhouette method to find optimal K',fontsize=22)
# Text starts half a tick to the right of the maximum so the arrow points at it.
plt.annotate('←The Highest Value',(best_k + 0.5, best_score),fontsize = 18, color = 'yellow')
plt.grid()
plt.plot(k_candidates, sil, 'o-',color='yellow')
plt.show();

Eyeball Method

Visualization with PCA (pc=2)

from sklearn.cluster import KMeans 
# Cluster the two principal components with K-Means and show the result next
# to the same points coloured by their 'species' column.
df_pca = finalDataFrame[['component 1','component 2']]
K_cluster = KMeans(n_clusters=3).fit_predict(df_pca)

fig, axes = plt.subplots(1, 2, figsize=(20,8))
left_ax, right_ax = axes

# Left panel: points coloured by assigned cluster id.
left_ax.set_title("K-Means with PCA",fontsize=20)
left_ax.scatter(df_pca['component 1'], df_pca['component 2'],
                c=K_cluster, cmap='rainbow')

# Right panel: points coloured by species. The target axes is passed
# explicitly; it is the axes seaborn would otherwise pick as "current".
right_ax.set_title("Original Data",fontsize=20)
sns.scatterplot(x='component 1', y='component 2', hue='species',
                data=finalDataFrame, ax=right_ax)
plt.show()

6.2 Hierarchical

Dataset

import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering as AC
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering

# Reload iris and drop the species column so only feature columns remain.
iris = sns.load_dataset('iris').drop(columns='species')
iris.head(3)

Cluster & fit

# Agglomerative clustering into 3 groups; labels_ holds one cluster id per row
# of `iris` after fitting.
AC_cluster = AC(n_clusters=3).fit(iris).labels_
AC_cluster

Visualization

import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering as AC
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
# Collect the two principal components as a list of [pc1, pc2] pairs.
hierarchical_list = [
    [one, two]
    for one, two in zip(df_pca['component 1'], df_pca['component 2'])
]

plt.figure(figsize=(14, 8))
plt.title("hierarchical clustering",fontsize=20)
# Dotted cut line at height 10, labelled '3 Cluster' in the figure.
plt.axhline(y=10, color='purple', linestyle=':', linewidth=7)
plt.text(14,12,'3 Cluster',fontsize=25,c = 'purple')
# Ward linkage on the 2-D points, drawn as a dendrogram.
ward_linkage = shc.linkage(hierarchical_list, method='ward')
d = shc.dendrogram(ward_linkage)

Visualization with PCA (pc=2)

# Imported here because this cell uses the metric before the file's own
# import of it, which only appears in a later section.
from sklearn.metrics.cluster import adjusted_rand_score

# True labels as a one-column frame. `label` comes from outside this snippet;
# the file builds the same frame again in a later section, so this is idempotent.
label1 = pd.DataFrame(label)
label1.columns = ['label']

# Ward linkage only supports Euclidean distance, which is also the default.
# The former `affinity=` keyword was deprecated in scikit-learn 1.2 and removed
# in 1.4, so it is omitted to keep the call valid on old and new versions.
cluster = AC(n_clusters=3, linkage='ward')
cluster_label = cluster.fit_predict(hierarchical_list)

fig, axes = plt.subplots(1, 3, figsize=(20,8))

# Panel 0: K-Means on the two principal components, scored against the labels.
df_pca = finalDataFrame[['component 1','component 2']]
K_cluster = KMeans(n_clusters=3).fit_predict(df_pca)
axes[0].set_title("K-Means clustering\nAccuracy : {:.2f}".format(adjusted_rand_score(label1['label'],K_cluster)),fontsize=20)
axes[0].scatter(df_pca['component 1'],df_pca['component 2'],
                c = K_cluster ,cmap = 'rainbow')

# Panel 1: the same points coloured by their true label.
colors = {'setosa':'r', 'virginica':'g', 'versicolor':'b'}
axes[1].scatter(df_pca['component 1'],df_pca['component 2'], c = label1['label'].apply(lambda x : colors[x]) )
axes[1].set_title("Original Data\nLABEL",fontsize=20)

# Panel 2: agglomerative clustering. The score in the title is computed from
# `cluster_label`, i.e. the very labels that colour this panel.
axes[2].scatter(df_pca['component 1'],df_pca['component 2'],
                c = cluster_label ,cmap = 'rainbow')
axes[2].set_title("Agglomerative Clustering\nAccuracy : {:.2f}".format(adjusted_rand_score(label1['label'],cluster_label)),fontsize=20)

plt.show()

6.3 K-Means & Hierarchical with T-SNE

from sklearn.manifold import TSNE as TS
from sklearn.metrics.cluster import adjusted_rand_score
import sklearn.metrics as metrics
# Standardise the iris features, then embed them in two dimensions with t-SNE.
scaled_features = pd.DataFrame(StandardScaler().fit_transform(iris),
                               columns=iris.columns.values)
model = TS(learning_rate=500)
transformed = model.fit_transform(scaled_features)
xs, ys = transformed[:, 0], transformed[:, 1]
df_tsne = pd.DataFrame(transformed, columns=['tsne1', 'tsne2'])

# True labels (`label` comes from outside this snippet) as a one-column frame.
label1 = pd.DataFrame(label)
label1.columns = ['label']
true_labels = label1['label']

# Cluster the embedding with both algorithms.
K_cluster = KMeans(n_clusters=3).fit_predict(transformed)
AC_cluster = AC(n_clusters=3).fit_predict(transformed)
AC_cluster_labels = AC_cluster

fig, axes = plt.subplots(1, 3, figsize=(20,8))
tsne_x, tsne_y = df_tsne['tsne1'], df_tsne['tsne2']

# Panel 0: K-Means result with its adjusted Rand score in the title.
kmeans_score = adjusted_rand_score(true_labels, K_cluster)
axes[0].scatter(tsne_x, tsne_y, c=K_cluster, cmap='rainbow')
axes[0].set_title("K-means Clustering\nAccuracy : {:.2f}".format(kmeans_score),fontsize=20)

# Panel 1: the same embedding coloured by true label.
colors = {'setosa':'r', 'virginica':'g', 'versicolor':'b'}
axes[1].scatter(tsne_x, tsne_y, c=true_labels.apply(lambda x : colors[x]))
axes[1].set_title("Original Data\nLABEL",fontsize=20)

# Panel 2: agglomerative result (reversed colour map) with its score.
ac_score = adjusted_rand_score(true_labels, AC_cluster_labels)
axes[2].scatter(tsne_x, tsne_y, c=AC_cluster, cmap='rainbow_r')
axes[2].set_title("Agglomerative Clustering\nAccuracy : {:.2f}".format(ac_score),fontsize=20)

for panel in axes:
    panel.grid()
plt.show()

반응형

댓글