import pandas as pd
import numpy as np
# Load the dog data set from disk and preview its first rows.
DATA_PATH = "../data/dogs.csv"
df = pd.read_csv(DATA_PATH)
df.head()
| | breed | height | weight |
|---|---|---|---|
| 0 | Border Collie | 20 | 45 |
| 1 | Boston Terrier | 16 | 20 |
| 2 | Brittany Spaniel | 18 | 35 |
| 3 | Bullmastiff | 27 | 120 |
| 4 | Chihuahua | 8 | 8 |
# Column 0 is skipped; every remaining column is treated as a feature.
first_feature_pos = 1
features = df.columns[first_feature_pos:]
features
Index(['height', 'weight'], dtype='object')
# Split the frame into the feature matrix X and the breed names y,
# then print both shapes (one per line).
X, y = df[features], df["breed"]
print(X.shape, y.shape, sep="\n")
(11, 2) (11,)
from sklearn.preprocessing import MinMaxScaler
# Rescale the features with MinMaxScaler and wrap the resulting array
# back into a DataFrame that carries the original feature names.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=features)
X.head()
| | height | weight |
|---|---|---|
| 0 | 0.538462 | 0.248366 |
| 1 | 0.384615 | 0.084967 |
| 2 | 0.461538 | 0.183007 |
| 3 | 0.807692 | 0.738562 |
| 4 | 0.076923 | 0.006536 |
from sklearn.cluster import AgglomerativeClustering
# Agglomerative clustering into three clusters using single linkage.
single_link_params = {"n_clusters": 3, "linkage": "single"}
model = AgglomerativeClustering(**single_link_params)
model.fit(X)
AgglomerativeClustering(linkage='single', n_clusters=3)
Dodeljene oznake klastera za svaku od instanci:
# Cluster label assigned to each instance by the fitted model.
model.labels_
array([0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 2])
Izvršena spajanja:
# The merges that were performed, one row (pair of node ids) per merge.
# Ids below the number of samples are original instances; larger ids refer
# to the cluster created by an earlier merge (id minus the number of samples).
model.children_
array([[ 0, 8], [ 4, 10], [ 6, 5], [11, 2], [14, 9], [15, 1], [16, 13], [17, 3], [18, 12], [19, 7]])
import matplotlib.pyplot as plt
Na sledećem dijagramu je prikazana razlika između klasterovanja dobijenih različitim funkcijama rastojanja između klastera (parametar `linkage`: single, complete, average):
# Compare the clusterings produced by different linkage criteria,
# drawing one scatter subplot per criterion.
colors = ["red", "green", "blue"]
linkages = ["single", "complete", "average"]
n_clusters = 3
fig = plt.figure(figsize=(15, 4))
for i, link in enumerate(linkages):
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage=link)
    model.fit(X)
    # Labels are stored on df so that the raw (unscaled) height/weight
    # values can be plotted per cluster.
    df["label"] = model.labels_
    # The grid needs one column per linkage criterion, not per cluster.
    ax = fig.add_subplot(1, len(linkages), i + 1)
    for cluster_label in range(n_clusters):
        cluster = df[df["label"] == cluster_label]
        ax.scatter(
            cluster["height"],
            cluster["weight"],
            color=colors[cluster_label],
            marker="o",
            # Binding the label to the artist keeps the legend correct even
            # if the drawing order changes.
            label=f"Klaster {cluster_label + 1}",
        )
    ax.set_title(link)
    ax.set_xlabel("height")
    ax.set_ylabel("weight")
    ax.legend()
plt.show()
from scipy.cluster.hierarchy import dendrogram, linkage
# Use the breed name as the row index, then preview the result.
df = df.set_index("breed")
df.head()
| breed | height | weight | label |
|---|---|---|---|
| Border Collie | 20 | 45 | 1 |
| Boston Terrier | 16 | 20 | 1 |
| Brittany Spaniel | 18 | 35 | 1 |
| Bullmastiff | 27 | 120 | 0 |
| Chihuahua | 8 | 8 | 2 |
# Hierarchical clustering with SciPy; the method and metric are spelled out
# here but are the same values linkage() uses by default.
Z = linkage(X, method="single", metric="euclidean")
print(Z.shape)
(10, 4)
# Display the full linkage matrix, one row per merge. Each row holds the two
# merged node ids, the distance at which they were merged, and the number of
# original instances in the resulting cluster.
Z
array([[ 0. , 8. , 0.05047034, 2. ], [ 4. , 10. , 0.07720025, 2. ], [ 5. , 6. , 0.09301156, 2. ], [ 2. , 11. , 0.10094068, 3. ], [ 9. , 14. , 0.1246148 , 4. ], [ 1. , 15. , 0.1246148 , 5. ], [13. , 16. , 0.15167269, 7. ], [ 3. , 17. , 0.28508383, 8. ], [12. , 18. , 0.31753116, 10. ], [ 7. , 19. , 0.32454896, 11. ]])
# Dendrogram of the SciPy linkage Z, with the cut height marked in red.
# Links below the cut height are colored per cluster by color_threshold.
cut_height = 0.3
fig = plt.figure(figsize=(15, 4))
dendrogram(Z, labels=df.index.tolist(), leaf_rotation=90, color_threshold=cut_height)
# axhline spans the whole axes width, so the marker does not depend on a
# hard-coded x-range that must match the number of leaves.
plt.axhline(y=cut_height, color="red")
plt.show()
# Fit a single-linkage model with no target cluster count and a distance
# threshold of 0; its children_ and distances_ are read further down.
model = AgglomerativeClustering(
    distance_threshold=0,
    n_clusters=None,
    linkage="single",
).fit(X)
# Build a SciPy-style linkage matrix from the fitted scikit-learn model:
# columns are the two merged node ids, the merge distance, and the number of
# original instances contained in the merged cluster.
n_samples = len(model.labels_)
counts = np.zeros(model.children_.shape[0])
for merge_idx, (left, right) in enumerate(model.children_):
    # A node id below n_samples is a single instance; otherwise it points to
    # an earlier merge whose size has already been stored in counts.
    counts[merge_idx] = sum(
        1 if node < n_samples else counts[node - n_samples]
        for node in (left, right)
    )
linkage_matrix = np.column_stack([model.children_, model.distances_, counts])
# Dendrogram of the linkage matrix built from the scikit-learn model, with the
# cut height marked in red. Links below the cut height are colored per cluster.
cut_height = 0.2
fig = plt.figure(figsize=(15, 4))
# Leaves are labeled with the index of df so they can be identified.
dendrogram(
    linkage_matrix,
    labels=df.index.tolist(),
    leaf_rotation=90,
    color_threshold=cut_height,
)
# axhline spans the whole axes width, so no hard-coded x-range is needed.
plt.axhline(y=cut_height, color="red")
plt.show()