# Przykłady w R do wykładu z ZUM: ocena klasyfikatorów



      # standard preamble: attach mlbench and load the HouseVotes84 data set
library(mlbench)
data("HouseVotes84")

      # random train/test split in a ratio of roughly 2:1 -- one uniform
      # draw per row; rows whose draw falls below 0.33 form the test set,
      # all remaining rows form the training set
rhv <- runif(n = nrow(HouseVotes84))
hv.train <- HouseVotes84[!(rhv < 0.33), ]
hv.test <- HouseVotes84[rhv < 0.33, ]

      # decision trees come from the rpart package
library(rpart)
      # and the naive Bayes classifier from the e1071 package
library(e1071)

      # the caret and ROCR packages are used to evaluate the classifiers;
      # install each one only when it is missing.  requireNamespace() checks
      # a single package directly instead of scanning every installed
      # package with installed.packages()
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
if (!requireNamespace("ROCR", quietly = TRUE)) {
  install.packages("ROCR")
}
library(caret)
library(ROCR)

      # fit the baseline models on the training set
hv.tree <- rpart(Class ~ ., data = hv.train)
hv.nb <- naiveBayes(Class ~ ., data = hv.train)

      # misclassification cost matrices used when growing the trees:
      # a democrat->republican mistake costs 10x as much as the reverse one
      # NOTE(review): the cost 10 sits in row 2, column 1; presumably rpart
      # reads rows as the true class and columns as the predicted class --
      # confirm that this matches the intended direction of the costly mistake
r10 <- matrix(c(0, 1, 10, 0), nrow = 2, byrow = TRUE)
      # a democrat->republican mistake costs 100x as much as the reverse one
r100 <- matrix(c(0, 1, 100, 0), nrow = 2, byrow = TRUE)
      # trees grown with the cost matrices passed as the rpart loss parameter
hv.tree10 <- rpart(Class ~ ., data = hv.train, parms = list(loss = r10))
hv.tree100 <- rpart(Class ~ ., data = hv.train, parms = list(loss = r100))
      # print the three trees: do hv.tree, hv.tree10 and hv.tree100 differ?
hv.tree
hv.tree10
hv.tree100

      # confusion matrices of the models on the training set;
      # "democrat" is passed as the positive class in every call, and the
      # naive Bayes model receives newdata without its first column
confusionMatrix(data = predict(hv.tree, newdata = hv.train, type = "class"),
                reference = hv.train$Class, positive = "democrat")
confusionMatrix(data = predict(hv.tree10, newdata = hv.train, type = "class"),
                reference = hv.train$Class, positive = "democrat")
confusionMatrix(data = predict(hv.tree100, newdata = hv.train, type = "class"),
                reference = hv.train$Class, positive = "democrat")
confusionMatrix(data = predict(hv.nb, newdata = hv.train[, -1], type = "class"),
                reference = hv.train$Class, positive = "democrat")
      # and the same four matrices on the test set
confusionMatrix(data = predict(hv.tree, newdata = hv.test, type = "class"),
                reference = hv.test$Class, positive = "democrat")
confusionMatrix(data = predict(hv.tree10, newdata = hv.test, type = "class"),
                reference = hv.test$Class, positive = "democrat")
confusionMatrix(data = predict(hv.tree100, newdata = hv.test, type = "class"),
                reference = hv.test$Class, positive = "democrat")
confusionMatrix(data = predict(hv.nb, newdata = hv.test[, -1], type = "class"),
                reference = hv.test$Class, positive = "democrat")

      # ROC curves of the models on the test set
      # method 1 -- using functions from the caret package
      # NOTE(review): roc() and aucRoc() appear to have been removed from
      # newer caret releases -- confirm they exist in the installed version
      # the score handed to roc() is column 1 of the predict() output
hv.tree.roc <- roc(predict(hv.tree, hv.test)[,1], hv.test$Class, 
positive="democrat")
      # the curve: 1 - column 3 on the x axis against column 2 on the y axis
      # (presumably specificity and sensitivity -- confirm against roc())
plot(1-hv.tree.roc[,3], hv.tree.roc[,2], type="l")
      # area under the curve
aucRoc(hv.tree.roc)
      # same steps for the tree grown with the r10 cost matrix
hv.tree10.roc <- roc(predict(hv.tree10, hv.test)[,1], hv.test$Class, positive="democrat")
      # the curve
plot(1-hv.tree10.roc[,3], hv.tree10.roc[,2], type="l")
      # area under the curve
aucRoc(hv.tree10.roc)
      # same steps for the tree grown with the r100 cost matrix
hv.tree100.roc <- roc(predict(hv.tree100, hv.test)[,1], hv.test$Class, positive="democrat")
      # the curve
plot(1-hv.tree100.roc[,3], hv.tree100.roc[,2], type="l")
      # area under the curve
aucRoc(hv.tree100.roc)
      # same steps for naive Bayes; type="raw" is requested so that predict()
      # returns a matrix whose first column can be used as the score
hv.nb.roc <- roc(predict(hv.nb, hv.test[,-1], type="raw")[,1], hv.test$Class, positive="democrat")
      # the curve
plot(1-hv.nb.roc[,3], hv.nb.roc[,2], type="l")
      # area under the curve
aucRoc(hv.nb.roc)

      # method 2 -- using functions from the ROCR package
      # the score is column 1 of the tree's predict() output on the test set;
      # label.ordering presumably lists the negative class first and the
      # positive class second -- confirm
hv.tree.pred <- prediction(predictions = predict(hv.tree, hv.test)[, 1],
                           labels = hv.test$Class,
                           label.ordering = c("republican", "democrat"))
      # true positive rate as a function of the false positive rate
hv.tree.perf <- performance(hv.tree.pred, measure = "tpr", x.measure = "fpr")
      # the curve
plot(hv.tree.perf)
      # area under the curve
performance(hv.tree.pred, measure = "auc")@y.values
      # the remaining models are handled analogously