Istrenirati model koji klasifikuje novinske članke.
import pandas as pd
import os
Skup podataka se sastoji od datoteka koje sadrže informacije o novinskim člancima. Svakom novinskom članku odgovara jedna datoteka iz skupa podataka, a u svakoj datoteci se nalazi spisak reči i broj njihovih pojavljivanja u odgovarajućem novinskom članku. Svaka reč i broj njenih pojavljivanja se nalaze u posebnom redu. Svaka datoteka se nalazi u direktorijumu sa nazivom kategorije novinskog članka kojem ta datoteka odgovara.
Pravimo funkciju koja prolazi kroz svaki direktorijum i kroz svaku datoteku i za svaki članak čuva rečnik pojavljivanja reči i klasu kojoj taj članak pripada.
def read_date(root_dir):
    """Load a corpus of word-count files organised one directory per class.

    ``root_dir`` is expected to contain one sub-directory per class; every
    file inside a class directory describes one document as lines of the
    form ``<word> <count>`` separated by whitespace.

    Args:
        root_dir: Path of the directory holding the class sub-directories.

    Returns:
        A ``(corpus, classes)`` tuple of equally long lists: ``corpus[i]`` is
        a ``{word: count}`` dict for document ``i`` and ``classes[i]`` is the
        name of the directory the document was found in. Documents are
        ordered by class name, then by file name.

    Raises:
        ValueError: If a non-blank line is not exactly ``<word> <int>``.
        OSError: If ``root_dir`` or a file in it cannot be read.
    """
    corpus = []
    classes = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # document order (and anything derived from it) reproducible.
    for class_name in sorted(os.listdir(root_dir)):
        class_dir = os.path.join(root_dir, class_name)
        if not os.path.isdir(class_dir):
            # Stray files next to the class folders are not categories.
            continue
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)
            if not os.path.isfile(file_path):
                # Nested directories cannot be opened as documents.
                continue
            word_counts = {}
            # Explicit encoding so the result does not depend on the
            # platform's default locale.
            with open(file_path, encoding="utf-8") as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Blank (e.g. trailing) lines carry no data.
                        continue
                    try:
                        word, count = fields
                        # A repeated word keeps its last listed count.
                        word_counts[word] = int(count)
                    except ValueError as err:
                        raise ValueError(
                            f"{file_path}: malformed line {line!r}"
                        ) from err
            corpus.append(word_counts)
            classes.append(class_name)
    return corpus, classes
# Load the training split: one word-count dict and one class label per file.
X_train, y_train = read_date("../data/tekstovi/Trening/")
# Print both lengths to check that documents and labels line up.
print(len(X_train))
print(len(y_train))
3492 3492
Rečnik pojavljivanja treba pretvoriti u matricu frekvencija reči.
from sklearn.feature_extraction import DictVectorizer
# Fit the vectorizer on the training dicts only, so the feature set is
# defined by the training data.
dv = DictVectorizer()
dv.fit(X_train)
DictVectorizer()
# Peek at the first five feature names stored by the fitted vectorizer.
dv.feature_names_[:5]
['ab', 'abasu', 'abati', 'abc', 'abdul']
# Total number of features, i.e. columns of the transformed matrix.
len(dv.feature_names_)
36830
# Replace the list of word-count dicts with the vectorized matrix.
X_train = dv.transform(X_train)
# Show it as a dense array for inspection only; the result is not stored.
X_train.toarray()
array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]])
# Wrap the dense matrix in a DataFrame whose columns are the feature names.
# NOTE(review): this keeps the full dense matrix in memory; the classifiers
# below presumably accept the sparse matrix directly — confirm before changing.
X_train = pd.DataFrame(X_train.toarray(), columns=dv.feature_names_)
X_train.head()
ab | abasu | abati | abc | abdul | abdulah | abe | aberdin | abhaziji | abida | ... | zxurno | zxustel | zxustrine | zxustro | zxuticx | zxutih | zxutilovine | zxuto | zxutra | zxuzxa | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 36830 columns
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier with default settings on the
# word-count features.
model = MultinomialNB()
model.fit(X_train, y_train)
MultinomialNB()
# Keep the class labels in the order the fitted model stores them; they are
# reused later to label the confusion matrix.
class_names = model.classes_
class_names
array(['Ekonomija', 'HronikaKriminal', 'KulturaZabava', 'Politika', 'Sport'], dtype='<U15')
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Accuracy on the data the model was fitted on, kept as a baseline to compare
# against the test accuracy below.
y_train_pred = model.predict(X_train)
accuracy_score(y_train, y_train_pred)
0.9401489117983963
# Load the test split with the same reader used for the training data.
X_test, y_test = read_date("../data/tekstovi/Testing/")
# Reuse the vectorizer fitted on the training data (no refit) so the test
# matrix has exactly the training columns.
X_test = dv.transform(X_test)
X_test = pd.DataFrame(X_test.toarray(), columns=dv.feature_names_)
X_test.head()
ab | abasu | abati | abc | abdul | abdulah | abe | aberdin | abhaziji | abida | ... | zxurno | zxustel | zxustrine | zxustro | zxuticx | zxutih | zxutilovine | zxuto | zxutra | zxuzxa | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 36830 columns
# Accuracy of the naive Bayes model on the held-out test split.
y_test_pred = model.predict(X_test)
accuracy_score(y_test, y_test_pred)
0.8995983935742972
# Confusion matrix as a labelled table: rows are the true classes, columns
# the predicted ones. Passing labels explicitly pins the matrix to the order
# and size of class_names, even if a class is missing from y_test and the
# predictions.
pd.DataFrame(confusion_matrix(y_test, y_test_pred, labels=class_names), index=class_names, columns=class_names)
Ekonomija | HronikaKriminal | KulturaZabava | Politika | Sport | |
---|---|---|---|---|---|
Ekonomija | 152 | 0 | 1 | 13 | 0 |
HronikaKriminal | 10 | 226 | 6 | 66 | 1 |
KulturaZabava | 2 | 0 | 301 | 6 | 4 |
Politika | 8 | 36 | 9 | 411 | 3 |
Sport | 2 | 1 | 1 | 6 | 478 |
from sklearn.neighbors import KNeighborsClassifier
# Rebind `model` to a k-nearest-neighbours classifier with default settings,
# fitted on the same training features for comparison.
model = KNeighborsClassifier()
model.fit(X_train,y_train)
KNeighborsClassifier()
# Test accuracy of the k-nearest-neighbours model; overwrites the earlier
# predictions held in y_test_pred.
y_test_pred = model.predict(X_test)
accuracy_score(y_test, y_test_pred)
0.5783132530120482
from sklearn.tree import DecisionTreeClassifier
# Rebind `model` to a decision tree with default settings, fitted on the same
# training features for comparison.
# NOTE(review): no random_state is set, so the score may vary between runs —
# confirm whether reproducibility matters here.
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
DecisionTreeClassifier()
# Test accuracy of the decision tree; overwrites the earlier predictions held
# in y_test_pred.
y_test_pred = model.predict(X_test)
accuracy_score(y_test, y_test_pred)
0.7504302925989673