import pandas as pd
StandardScaler - klasa koja transformise numericke atribute tako da imaju srednju vrednost 0 i standardnu devijaciju 1 (standardizacija); sama transformacija ne menja oblik raspodele atributa
# Load the iris data set from a path relative to the notebook's working directory.
df = pd.read_csv("../data/iris.csv")
# Preview the first rows to check that the columns were parsed as expected.
df.head()
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | |
---|---|---|---|---|
count | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
mean | 5.843333 | 3.057333 | 3.758000 | 1.199333 |
std | 0.828066 | 0.435866 | 1.765298 | 0.762238 |
min | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
# Every column except the last one is treated as an input feature;
# the last column (Species) is the target.
features = df.columns[:-1]
features
Index(['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'], dtype='object')
# Separate the feature matrix from the target column in one step.
X, y = df[features], df["Species"]
# Show the dimensions of both parts, one per line.
print(X.shape, y.shape, sep="\n")
(150, 4) (150,)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Hold out 30% of the rows for testing. The split is stratified on the target
# so class proportions are preserved, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=89,
    stratify=y,
)
# Show the dimensions of all four parts, one per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")
(105, 4) (45, 4) (105,) (45,)
# Fit the scaler on the training split only, so the test split does not
# influence the learned per-feature statistics.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
# Per-feature statistics learned from the training split.
print(scaler.mean_)
print(scaler.var_)
print(scaler.scale_) # Standard deviation
[5.83619048 3.02952381 3.76190476 1.19238095] [0.71583311 0.18855692 3.20692971 0.58146576] [0.84606921 0.43423141 1.79079025 0.76253902]
# Apply the statistics learned from the training split to both splits.
# NOTE: the original names are rebound to the scaled data, so re-running this
# cell without re-running the split would scale the data a second time.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier (k = 4) trained on the standardized training features.
model = KNeighborsClassifier(n_neighbors=4)
model.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=4)
# Class labels known to the fitted model; used below to label the confusion matrices.
classes = model.classes_
from sklearn.metrics import confusion_matrix
# Predictions on the training split (the data the model was fitted on).
y_train_pred = model.predict(X_train)
# Rows are true classes, columns are predicted classes. `labels=classes` pins
# the matrix ordering to the same labels used for the index/columns, so the
# table stays correctly labelled even if a class is absent from this split.
pd.DataFrame(
    confusion_matrix(y_train, y_train_pred, labels=classes),
    columns=classes,
    index=classes,
)
setosa | versicolor | virginica | |
---|---|---|---|
setosa | 35 | 0 | 0 |
versicolor | 0 | 33 | 2 |
virginica | 0 | 3 | 32 |
# Predictions on the held-out test split.
y_test_pred = model.predict(X_test)
# Rows are true classes, columns are predicted classes. `labels=classes` pins
# the matrix ordering to the same labels used for the index/columns, so the
# table stays correctly labelled even if a class is absent from this split.
pd.DataFrame(
    confusion_matrix(y_test, y_test_pred, labels=classes),
    columns=classes,
    index=classes,
)
setosa | versicolor | virginica | |
---|---|---|---|
setosa | 15 | 0 | 0 |
versicolor | 0 | 15 | 0 |
virginica | 0 | 2 | 13 |