import pandas as pd
# Load the balloons dataset from the CSV file into a DataFrame.
df = pd.read_csv("../data/balloons.csv")
# Preview the first rows to check that the file was parsed as expected.
df.head()
color | size | act | age | inflated | |
---|---|---|---|---|---|
0 | YELLOW | SMALL | STRETCH | ADULT | T |
1 | YELLOW | SMALL | STRETCH | ADULT | T |
2 | YELLOW | SMALL | STRETCH | CHILD | F |
3 | YELLOW | SMALL | DIP | ADULT | F |
4 | YELLOW | SMALL | DIP | CHILD | F |
# Per-column summary of the dataset (count, unique, top, freq for these
# string-valued columns).
df.describe()
color | size | act | age | inflated | |
---|---|---|---|---|---|
count | 76 | 76 | 76 | 76 | 76 |
unique | 2 | 2 | 2 | 2 | 2 |
top | YELLOW | SMALL | STRETCH | ADULT | F |
freq | 40 | 40 | 38 | 38 | 41 |
# Every column except the last one is used as an input feature; the last
# column is left out of the feature list.
features = df.columns[:-1].tolist()
print(features)
['color', 'size', 'act', 'age']
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
OrdinalEncoder - kategoričke atribute označene simbolima pretvara u numeričke vrednosti
# Separate the input features from the target column.
X = df.loc[:, features]
y = df.loc[:, "inflated"]
# Both shapes must agree on the number of rows.
print(X.shape, y.shape, sep="\n")
(76, 4) (76,)
Uvek je korisno da se provere dimenzije karakteristika i klasa kako bismo utvrdili da su dimenzije usklađene.
# Hold out one third of the rows for testing. stratify=y keeps the class
# proportions the same in both parts, and the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    stratify=y,
    random_state=13,
)
# One shape per line: train features, test features, train labels, test labels.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")
(50, 4) (26, 4) (50,) (26,)
# Learn the category -> number mapping on the training data only, then apply
# that same mapping to the test data.
oe = OrdinalEncoder()
X_train = oe.fit_transform(X_train)
X_test = oe.transform(X_test)
# The encoder returns a plain array; wrap it in a DataFrame to inspect it
# with the original column names.
pd.DataFrame(data=X_train, columns=features).head()
color | size | act | age | |
---|---|---|---|---|
0 | 1.0 | 0.0 | 0.0 | 0.0 |
1 | 1.0 | 1.0 | 1.0 | 1.0 |
2 | 0.0 | 1.0 | 1.0 | 1.0 |
3 | 1.0 | 0.0 | 1.0 | 0.0 |
4 | 1.0 | 0.0 | 0.0 | 0.0 |
from sklearn.naive_bayes import CategoricalNB
CategoricalNB - algoritam naivnog Bajesa za kategoričke atribute
# Fit a categorical naive Bayes classifier on the ordinal-encoded training data.
model = CategoricalNB()
model.fit(X_train, y_train)
CategoricalNB()
# Class labels known to the fitted model; reused below as the row and column
# labels of the confusion matrices.
classes = model.classes_
classes
array(['F', 'T'], dtype='<U1')
# Number of training samples per class, in the same order as model.classes_.
model.class_count_
array([27., 23.])
# Per-feature count tables: one array per feature, with one row per class and
# one column per encoded category value.
model.category_count_
[array([[13., 14.], [ 7., 16.]]), array([[17., 10.], [ 7., 16.]]), array([[19., 8.], [ 6., 17.]]), array([[ 9., 18.], [17., 6.]])]
from sklearn.metrics import confusion_matrix
# Confusion matrix on the training data (rows: true class, columns: predicted class).
y_train_pred = model.predict(X_train)
pd.DataFrame(
    data=confusion_matrix(y_true=y_train, y_pred=y_train_pred),
    index=classes,
    columns=classes,
)
F | T | |
---|---|---|
F | 21 | 6 |
T | 6 | 17 |
# Confusion matrix on the held-out test data (rows: true class, columns: predicted class).
y_test_pred = model.predict(X_test)
pd.DataFrame(
    data=confusion_matrix(y_true=y_test, y_pred=y_test_pred),
    index=classes,
    columns=classes,
)
F | T | |
---|---|---|
F | 12 | 2 |
T | 2 | 10 |
Pajplajn možemo da koristimo da automatizujemo proces preprocesiranja ulaznih podataka i treniranja modela. Kada kreiramo pajplajn, navodimo niz transformacija, a poslednji elemenat niza mora da bude klasifikacioni model.
from sklearn.pipeline import Pipeline
# The pipeline owns the encoding step, so it must be fed the raw,
# string-valued features. X_train / X_test were overwritten with encoded
# arrays earlier, so recreate the same split (same parameters and
# random_state) from the untouched X and y. Otherwise the pipeline's encoder
# would only re-encode numbers that are already encoded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=13, stratify=y
)
pipe = Pipeline([("ordinal encoder", OrdinalEncoder()), ("classifier", CategoricalNB())])
# fit() runs each transformation step in order and then trains the final classifier.
pipe.fit(X_train, y_train)
Pipeline(steps=[('ordinal encoder', OrdinalEncoder()), ('classifier', CategoricalNB())])
# Individual pipeline steps can be looked up by the name given at construction.
pipe.named_steps["ordinal encoder"]
OrdinalEncoder()
# Predict with the whole pipeline (preprocessing + classifier) and show the
# confusion matrix on the test data (rows: true class, columns: predicted class).
y_test_pred = pipe.predict(X_test)
pd.DataFrame(
    data=confusion_matrix(y_true=y_test, y_pred=y_test_pred),
    index=classes,
    columns=classes,
)
F | T | |
---|---|---|
F | 12 | 2 |
T | 2 | 10 |