G Solutions chapter 8 - use case 1

Solutions to the exercises of chapter 8.

G.1 Preparation

G.1.1 Load required libraries

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(doMC)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(corrplot)
## corrplot 0.84 loaded
library(rpart.plot)
## Loading required package: rpart
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

G.1.2 Define SVM model

svmRadialE1071 <- list(
  label = "Support Vector Machines with Radial Kernel - e1071",
  library = "e1071",
  type = c("Regression", "Classification"),
  parameters = data.frame(parameter="cost",
                          class="numeric",
                          label="Cost"),
  # tuning grid for the cost parameter: powers of two for a grid search, random values otherwise
  grid = function (x, y, len = NULL, search = "grid") 
    {
      if (search == "grid") {
        out <- expand.grid(cost = 2^((1:len) - 3))
      }
      else {
        out <- data.frame(cost = 2^runif(len, min = -5, max = 10))
      }
      out
    },
  loop=NULL,
  # fit an e1071 radial-kernel SVM at the given cost (class probabilities are
  # requested for classification when classProbs = TRUE)
  fit=function (x, y, wts, param, lev, last, classProbs, ...) 
    {
      if (any(names(list(...)) == "probability") | is.numeric(y)) {
        out <- e1071::svm(x = as.matrix(x), y = y, kernel = "radial", 
                          cost = param$cost, ...)
      }
      else {
        out <- e1071::svm(x = as.matrix(x), y = y, kernel = "radial", 
                          cost = param$cost, probability = classProbs, ...)
      }
      out
    },
  # class (or numeric) predictions for new data
  predict = function (modelFit, newdata, submodels = NULL) 
    {
      predict(modelFit, newdata)
    },
  # class probabilities for new data
  prob = function (modelFit, newdata, submodels = NULL) 
    {
      out <- predict(modelFit, newdata, probability = TRUE)
      attr(out, "probabilities")
    },
  predictors = function (x, ...) 
    {
      out <- if (!is.null(x$terms)) 
        predictors.terms(x$terms)
      else x$xNames
      if (is.null(out)) 
        out <- names(attr(x, "scaling")$x.scale$`scaled:center`)
      if (is.null(out)) 
        out <- NA
      out
    },
  tags = c("Kernel Methods", "Support Vector Machines", "Regression", "Classifier", "Robust Methods"),
  levels = function(x) x$levels,
  sort = function(x)
  {
    x[order(x$cost), ]
  }
)
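
As a quick check of this custom model specification, its grid function can be called directly (illustrative only; it is not needed for the analysis). With len = 10 it returns the ten cost values 2^((1:10) - 3), i.e. 0.25, 0.5, 1, 2, 4, 8, 16, 32, 64 and 128, which are the values tuned over later in this chapter.

svmRadialE1071$grid(len = 10)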

G.1.3 Setup parallel processing

registerDoMC(detectCores())
getDoParWorkers()
## [1] 8

G.1.4 Load data

load("data/malaria/malaria.RData")

Inspect the objects that have been loaded into the R session:

ls()
## [1] "infectionStatus" "morphology"      "stage"           "svmRadialE1071"
class(morphology)
## [1] "data.frame"
dim(morphology)
## [1] 1237   23
names(morphology)
##  [1] "Area"                        "Major Axis Length"          
##  [3] "Minor Axis length"           "Eccentricity"               
##  [5] "Mean OPL"                    "Max OPL"                    
##  [7] "Median OPL"                  "Std OPL"                    
##  [9] "Skewness"                    "Kurtosis"                   
## [11] "Variance OPL"                "IQR OPL"                    
## [13] "Optical volume"              "Centroid vs. center of mass"
## [15] "Elongation"                  "Upper quartile OPL"         
## [17] "Perimeter"                   "Equivalent diameter"        
## [19] "Max gradient"                "Mean gradient"              
## [21] "Upper quartile gradient"     "Min symmetry"               
## [23] "Mean symmetry"
class(infectionStatus)
## [1] "factor"
summary(as.factor(infectionStatus))
##   infected uninfected 
##        824        413
class(stage)
## [1] "factor"
summary(as.factor(stage))
## early trophozoite  late trophozoite          schizont        uninfected 
##               173               314               337               413

G.1.5 Data splitting

Partition the data into a training set and a test set using the createDataPartition function:

set.seed(42)
trainIndex <- createDataPartition(y=stage, times=1, p=0.7, list=F)
infectionStatusTrain <- infectionStatus[trainIndex]
stageTrain <- stage[trainIndex]
morphologyTrain <- morphology[trainIndex,]
infectionStatusTest <- infectionStatus[-trainIndex]
stageTest <- stage[-trainIndex]
morphologyTest <- morphology[-trainIndex,]
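
createDataPartition samples within each class, so the class proportions should be preserved in the training and test sets. A quick check (illustrative only; output not shown):

round(prop.table(table(stage)), 3)
round(prop.table(table(stageTrain)), 3)
round(prop.table(table(stageTest)), 3)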

G.2 Assess data quality

G.2.1 Zero and near-zero variance predictors

The function nearZeroVar identifies predictors that have only one unique value (i.e. zero variance). It also diagnoses predictors having both of the following characteristics:

  • very few unique values relative to the number of samples
  • the ratio of the frequency of the most common value to the frequency of the second most common value is large.

Such zero and near zero-variance predictors have a deleterious impact on modelling and may lead to unstable fits.
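
To illustrate these metrics, consider a toy predictor (not part of the malaria data) with 1000 observations, only two unique values and a 999:1 imbalance; it is flagged as near-zero variance:

toyData <- data.frame(x = c(rep(0, 999), 1))
nearZeroVar(toyData, saveMetrics = TRUE)   # freqRatio = 999, percentUnique = 0.2, nzv = TRUE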

nearZeroVar(morphologyTrain, saveMetrics = T)
##                             freqRatio percentUnique zeroVar   nzv
## Area                         1.000000      92.51152   FALSE FALSE
## Major Axis Length            1.000000     100.00000   FALSE FALSE
## Minor Axis length            1.000000     100.00000   FALSE FALSE
## Eccentricity                 1.000000     100.00000   FALSE FALSE
## Mean OPL                     1.000000     100.00000   FALSE FALSE
## Max OPL                      1.000000     100.00000   FALSE FALSE
## Median OPL                   1.000000     100.00000   FALSE FALSE
## Std OPL                      1.000000     100.00000   FALSE FALSE
## Skewness                     1.000000     100.00000   FALSE FALSE
## Kurtosis                     1.000000     100.00000   FALSE FALSE
## Variance OPL                 1.000000     100.00000   FALSE FALSE
## IQR OPL                      1.000000     100.00000   FALSE FALSE
## Optical volume               1.000000     100.00000   FALSE FALSE
## Centroid vs. center of mass  1.000000     100.00000   FALSE FALSE
## Elongation                   1.000000     100.00000   FALSE FALSE
## Upper quartile OPL           1.000000     100.00000   FALSE FALSE
## Perimeter                    1.166667      69.12442   FALSE FALSE
## Equivalent diameter          1.000000      92.51152   FALSE FALSE
## Max gradient                 1.000000     100.00000   FALSE FALSE
## Mean gradient                1.000000     100.00000   FALSE FALSE
## Upper quartile gradient      1.000000     100.00000   FALSE FALSE
## Min symmetry                 1.000000     100.00000   FALSE FALSE
## Mean symmetry                1.000000     100.00000   FALSE FALSE

There are no zero variance or near zero variance predictors in our data set.

G.2.2 Are all predictors on the same scale?

featurePlot(x = morphologyTrain,
            y = stageTrain,
            plot = "box",
            ## Pass in options to bwplot()
            scales = list(y = list(relation="free"),
                          x = list(rot = 90)),
            layout = c(5,5))

The variables in this data set are on different scales. In this situation it is important to centre and scale each predictor. A predictor variable is centred by subtracting its mean from each value; it is scaled by dividing each value by its standard deviation. After centring and scaling, a predictor variable has a mean of 0 and a standard deviation of 1.
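
Centring and scaling will be handled within train via the preProcess argument, but the transformation itself can be sketched directly (illustrative only; the scaled data are not used below):

morphologyScaled <- scale(morphologyTrain)   # subtract column means, divide by column standard deviations
round(colMeans(morphologyScaled), 10)        # approximately 0 for every predictor
round(apply(morphologyScaled, 2, sd), 10)    # 1 for every predictor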

G.2.3 Redundancy from correlated variables

Examine pairwise correlations of the predictors to identify redundancy in the data set:

corMat <- cor(morphologyTrain)
corrplot(corMat, order="hclust", tl.cex=1)

Find highly correlated predictors

highCorr <- findCorrelation(corMat, cutoff=0.75)
length(highCorr)
## [1] 16
names(morphologyTrain)[highCorr]
##  [1] "Max OPL"                 "Area"                   
##  [3] "Minor Axis length"       "Std OPL"                
##  [5] "Equivalent diameter"     "Variance OPL"           
##  [7] "Mean gradient"           "Skewness"               
##  [9] "IQR OPL"                 "Optical volume"         
## [11] "Upper quartile gradient" "Median OPL"             
## [13] "Mean symmetry"           "Min symmetry"           
## [15] "Major Axis Length"       "Elongation"
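
The full set of predictors is retained in the analyses below, but if we wished to remove this redundancy the flagged columns could be dropped (a sketch only; not applied here):

morphologyTrainFiltered <- morphologyTrain[, -highCorr]
dim(morphologyTrainFiltered)   # 7 of the 23 predictors remain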

G.2.4 Skewness

Observations grouped by infection status:

featurePlot(x = morphologyTrain,
            y = infectionStatusTrain,
            plot = "density",
            ## Pass in options to xyplot() to
            ## make it prettier
            scales = list(x = list(relation="free"),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            layout = c(5, 5),
            auto.key = list(columns = 2))

Observations grouped by infection stage:

featurePlot(x = morphologyTrain,
            y = stageTrain,
            plot = "density",
            ## Pass in options to xyplot() to
            ## make it prettier
            scales = list(x = list(relation="free"),
                          y = list(relation="free")),
            adjust = 1.5,
            pch = "|",
            layout = c(5, 5),
            auto.key = list(columns = 2))

G.3 Infection status (two-class problem)

G.3.1 Model training and parameter tuning

All of the models we are going to use have a single tuning parameter. For each model we will use repeated cross-validation to try 10 different values of that tuning parameter.

For each model we will perform five-fold cross-validation repeated five times, giving 25 resampling iterations. To make the analysis reproducible we must supply a seed for each resampling iteration: the seeds list therefore needs 26 elements, a vector of 10 seeds (one per tuning value) for each of the 25 resamples, plus a single seed for fitting the final model.

set.seed(42)
seeds <- vector(mode = "list", length = 26)
for(i in 1:25) seeds[[i]] <- sample.int(1000, 10)
seeds[[26]] <- sample.int(1000,1)

train_ctrl_infect_status <- trainControl(method="repeatedcv",
                           number = 5,
                           repeats = 5,
                           seeds = seeds,
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE)

G.3.2 KNN

Train a KNN model:

knnFit <- train(morphologyTrain, infectionStatusTrain,
                method="knn",
                preProcess = c("center", "scale"),
                #tuneGrid=tuneParam,
                tuneLength=10,
                trControl=train_ctrl_infect_status)
## Warning in train.default(morphologyTrain, infectionStatusTrain, method =
## "knn", : The metric "Accuracy" was not in the result set. ROC will be used
## instead.
knnFit
## k-Nearest Neighbors 
## 
## 868 samples
##  23 predictors
##   2 classes: 'infected', 'uninfected' 
## 
## Pre-processing: centered (23), scaled (23) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 695, 694, 694, 694, 695, 694, ... 
## Resampling results across tuning parameters:
## 
##   k   ROC        Sens       Spec     
##    5  0.9956347  0.9726777  0.9937931
##    7  0.9966555  0.9692174  0.9931034
##    9  0.9967199  0.9692174  0.9931034
##   11  0.9965857  0.9702519  0.9924138
##   13  0.9963379  0.9692174  0.9910345
##   15  0.9964685  0.9678351  0.9875862
##   17  0.9966529  0.9664498  0.9882759
##   19  0.9967286  0.9661109  0.9868966
##   21  0.9970731  0.9650675  0.9868966
##   23  0.9969775  0.9654153  0.9875862
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 21.
plot(knnFit)

G.3.3 SVM

Train an SVM model:

svmFit <- train(morphologyTrain, infectionStatusTrain,
                method=svmRadialE1071,
                preProcess = c("center", "scale"),
                #tuneGrid=tuneParam,
                tuneLength=10,
                trControl=train_ctrl_infect_status)
## Warning in train.default(morphologyTrain, infectionStatusTrain, method =
## svmRadialE1071, : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
svmFit
## Support Vector Machines with Radial Kernel - e1071 
## 
## 868 samples
##  23 predictors
##   2 classes: 'infected', 'uninfected' 
## 
## Pre-processing: centered (23), scaled (23) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 694, 694, 695, 694, 695, 694, ... 
## Resampling results across tuning parameters:
## 
##   cost    ROC        Sens       Spec     
##     0.25  0.9977719  0.9764948  0.9848276
##     0.50  0.9979155  0.9806417  0.9931034
##     1.00  0.9980584  0.9813343  0.9917241
##     2.00  0.9983201  0.9820270  0.9931034
##     4.00  0.9983314  0.9813343  0.9931034
##     8.00  0.9981986  0.9827226  0.9931034
##    16.00  0.9982936  0.9847886  0.9931034
##    32.00  0.9980604  0.9851334  0.9889655
##    64.00  0.9974875  0.9837481  0.9882759
##   128.00  0.9970936  0.9823688  0.9841379
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cost = 4.
plot(svmFit, scales = list(x = list(log = 2)))

G.3.4 Decision tree

Train a decision tree model:

dtFit <- train(morphologyTrain, infectionStatusTrain,
                method="rpart",
                preProcess = c("center", "scale"),
                #tuneGrid=tuneParam,
                tuneLength=10,
                trControl=train_ctrl_infect_status)
## Warning in train.default(morphologyTrain, infectionStatusTrain, method =
## "rpart", : The metric "Accuracy" was not in the result set. ROC will be
## used instead.
dtFit
## CART 
## 
## 868 samples
##  23 predictors
##   2 classes: 'infected', 'uninfected' 
## 
## Pre-processing: centered (23), scaled (23) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 694, 695, 694, 695, 694, 694, ... 
## Resampling results across tuning parameters:
## 
##   cp          ROC        Sens       Spec     
##   0.00000000  0.9752903  0.9754423  0.9634483
##   0.09885057  0.9523538  0.9619490  0.9427586
##   0.19770115  0.9523538  0.9619490  0.9427586
##   0.29655172  0.9523538  0.9619490  0.9427586
##   0.39540230  0.9523538  0.9619490  0.9427586
##   0.49425287  0.9523538  0.9619490  0.9427586
##   0.59310345  0.9523538  0.9619490  0.9427586
##   0.69195402  0.9523538  0.9619490  0.9427586
##   0.79080460  0.9523538  0.9619490  0.9427586
##   0.88965517  0.8015262  0.9733973  0.6296552
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
plot(dtFit)

prp(dtFit$finalModel)

G.3.5 Random forest

Train a random forest model:

rfFit <- train(morphologyTrain, infectionStatusTrain,
                method="rf",
                preProcess = c("center", "scale"),
                #tuneGrid=tuneParam,
                tuneLength=10,
                trControl=train_ctrl_infect_status)
## Warning in train.default(morphologyTrain, infectionStatusTrain, method =
## "rf", : The metric "Accuracy" was not in the result set. ROC will be used
## instead.
rfFit
## Random Forest 
## 
## 868 samples
##  23 predictors
##   2 classes: 'infected', 'uninfected' 
## 
## Pre-processing: centered (23), scaled (23) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 694, 694, 695, 694, 695, 694, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    2    0.9981532  0.9868666  0.9813793
##    4    0.9979057  0.9879010  0.9868966
##    6    0.9977327  0.9885937  0.9855172
##    9    0.9973191  0.9892834  0.9841379
##   11    0.9973216  0.9892834  0.9820690
##   13    0.9967486  0.9889355  0.9813793
##   16    0.9968264  0.9882399  0.9793103
##   18    0.9964275  0.9878921  0.9772414
##   20    0.9959667  0.9875442  0.9772414
##   23    0.9941175  0.9868516  0.9765517
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
plot(rfFit)

G.3.6 Compare models

Make a list of our models

model_list <- list(knn=knnFit,
                   svm=svmFit,
                   decisionTree=dtFit,
                   randomForest=rfFit)

Collect resampling results for each model

resamps <- resamples(model_list)
resamps
## 
## Call:
## resamples.default(x = model_list)
## 
## Models: knn, svm, decisionTree, randomForest 
## Number of resamples: 25 
## Performance metrics: ROC, Sens, Spec 
## Time estimates for: everything, final model fit
summary(resamps)
## 
## Call:
## summary.resamples(object = resamps)
## 
## Models: knn, svm, decisionTree, randomForest 
## Number of resamples: 25 
## 
## ROC 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## knn          0.9925684 0.9952774 0.9980510 0.9970731 0.9985880 0.9998501
## svm          0.9933115 0.9965815 0.9995541 0.9983314 1.0000000 1.0000000
## decisionTree 0.9357907 0.9716855 0.9778537 0.9752903 0.9826087 0.9976962
## randomForest 0.9939061 0.9971514 0.9994003 0.9981532 0.9997027 1.0000000
##              NA's
## knn             0
## svm             0
## decisionTree    0
## randomForest    0
## 
## Sens 
##                   Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## knn          0.9224138 0.9565217 0.9655172 0.9650675 0.9826087    1    0
## svm          0.9482759 0.9741379 0.9827586 0.9813343 0.9913043    1    0
## decisionTree 0.9217391 0.9655172 0.9741379 0.9754423 0.9827586    1    0
## randomForest 0.9568966 0.9826087 0.9913043 0.9868666 1.0000000    1    0
## 
## Spec 
##                   Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
## knn          0.9482759 0.9827586 0.9827586 0.9868966 1.0000000    1    0
## svm          0.9827586 0.9827586 1.0000000 0.9931034 1.0000000    1    0
## decisionTree 0.8793103 0.9482759 0.9827586 0.9634483 0.9827586    1    0
## randomForest 0.9310345 0.9827586 0.9827586 0.9813793 1.0000000    1    0
bwplot(resamps)
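
The box-and-whisker plots suggest that the SVM and random forest perform similarly. caret can also compute paired differences in the resampling metrics between models; a sketch (output not shown):

modelDiffs <- diff(resamps)   # paired differences in ROC, Sens and Spec for each pair of models
summary(modelDiffs)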

G.3.7 Predict the test set using our best model

The SVM achieved the highest mean ROC across the resamples, so we use it to classify the test set:

test_pred <- predict(svmFit, morphologyTest)
confusionMatrix(test_pred, infectionStatusTest)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   infected uninfected
##   infected        242          3
##   uninfected        4        120
##                                           
##                Accuracy : 0.981           
##                  95% CI : (0.9613, 0.9923)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9574          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9837          
##             Specificity : 0.9756          
##          Pos Pred Value : 0.9878          
##          Neg Pred Value : 0.9677          
##              Prevalence : 0.6667          
##          Detection Rate : 0.6558          
##    Detection Prevalence : 0.6640          
##       Balanced Accuracy : 0.9797          
##                                           
##        'Positive' Class : infected        
## 

G.3.8 ROC curve

svmProbs <- predict(svmFit, morphologyTest, type="prob")
head(svmProbs)
##               infected uninfected
## normal_..4 0.019842960 0.98015704
## normal_..7 0.959900420 0.04009958
## normal_.12 0.009452970 0.99054703
## normal_.17 0.002097783 0.99790222
## normal_.18 0.003581587 0.99641841
## normal_.19 0.024682569 0.97531743
svmROC <- roc(infectionStatusTest, svmProbs[,"infected"])
auc(svmROC)
## Area under the curve: 0.9977
plot(svmROC)

G.4 Discrimination of infective stages (multi-class problem)

G.4.1 Define cross-validation procedure

We reuse the resampling seeds defined earlier, but omit twoClassSummary and classProbs because they apply only to two-class problems; accuracy and Cohen's kappa will therefore be used to select the optimal model.

train_ctrl_stage <- trainControl(method="repeatedcv",
                           number = 5,
                           repeats = 5,
                           seeds = seeds)

G.4.2 KNN

Train a KNN model with all variables:

knnFit <- train(morphologyTrain, stageTrain,
                method="knn",
                preProcess = c("center", "scale"),
                #tuneGrid=tuneParam,
                tuneLength=10,
                trControl=train_ctrl_stage)

knnFit
## k-Nearest Neighbors 
## 
## 868 samples
##  23 predictors
##   4 classes: 'early trophozoite', 'late trophozoite', 'schizont', 'uninfected' 
## 
## Pre-processing: centered (23), scaled (23) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 694, 695, 693, 695, 695, 695, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.6806513  0.5576508
##    7  0.6940101  0.5754335
##    9  0.6949123  0.5762065
##   11  0.6931076  0.5731317
##   13  0.6940246  0.5743092
##   15  0.6917084  0.5706892
##   17  0.6917110  0.5704146
##   19  0.6958385  0.5759926
##   21  0.6956085  0.5755366
##   23  0.6958385  0.5755180
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 19.
plot(knnFit)

G.4.3 SVM

Train an SVM model with all variables:

svmFit <- train(morphologyTrain, stageTrain,
                method=svmRadialE1071,
                preProcess = c("center", "scale"),
                #tuneGrid=tuneParam,
                tuneLength=10,
                trControl=train_ctrl_stage)

svmFit
## Support Vector Machines with Radial Kernel - e1071 
## 
## 868 samples
##  23 predictors
##   4 classes: 'early trophozoite', 'late trophozoite', 'schizont', 'uninfected' 
## 
## Pre-processing: centered (23), scaled (23) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 693, 695, 695, 694, 695, 694, ... 
## Resampling results across tuning parameters:
## 
##   cost    Accuracy   Kappa    
##     0.25  0.7078501  0.5940116
##     0.50  0.7156902  0.6063755
##     1.00  0.7195864  0.6124250
##     2.00  0.7198083  0.6134929
##     4.00  0.7198030  0.6136120
##     8.00  0.7207491  0.6154010
##    16.00  0.7138537  0.6068131
##    32.00  0.6991156  0.5874694
##    64.00  0.6869050  0.5717486
##   128.00  0.6763009  0.5579433
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cost = 8.
plot(svmFit, scales = list(x = list(log = 2)))

G.4.4 Decision tree

Train a decision tree model with all variables:

dtFit <- train(morphologyTrain, stageTrain,
                method="rpart",
                preProcess = c("center", "scale"),
                #tuneGrid=tuneParam,
                tuneLength=10,
                trControl=train_ctrl_stage)

dtFit
## CART 
## 
## 868 samples
##  23 predictors
##   4 classes: 'early trophozoite', 'late trophozoite', 'schizont', 'uninfected' 
## 
## Pre-processing: centered (23), scaled (23) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 693, 695, 695, 694, 695, 694, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.005190311  0.6850209  0.5667306
##   0.006920415  0.6864227  0.5691875
##   0.007785467  0.6871071  0.5706621
##   0.010380623  0.6799831  0.5607537
##   0.012110727  0.6825278  0.5637137
##   0.013840830  0.6815977  0.5624175
##   0.015570934  0.6811498  0.5609683
##   0.034602076  0.6820522  0.5613656
##   0.124567474  0.6184755  0.4610759
##   0.399653979  0.3964157  0.1019836
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.007785467.
plot(dtFit)

prp(dtFit$finalModel)

G.4.5 Random forest

Train a random forest model with all variables:

rfFit <- train(morphologyTrain, stageTrain,
                method="rf",
                preProcess = c("center", "scale"),
                #tuneGrid=tuneParam,
                tuneLength=10,
                trControl=train_ctrl_stage)

rfFit
## Random Forest 
## 
## 868 samples
##  23 predictors
##   4 classes: 'early trophozoite', 'late trophozoite', 'schizont', 'uninfected' 
## 
## Pre-processing: centered (23), scaled (23) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 693, 695, 695, 694, 695, 694, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7184462  0.6121623
##    4    0.7179838  0.6120214
##    6    0.7232845  0.6196398
##    9    0.7248884  0.6221363
##   11    0.7209591  0.6167334
##   13    0.7230175  0.6196534
##   16    0.7195890  0.6150192
##   18    0.7181885  0.6132322
##   20    0.7184422  0.6133707
##   23    0.7149754  0.6086754
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.
plot(rfFit)

G.4.6 Compare models

Make a list of our models

model_list <- list(knn=knnFit,
                   svm=svmFit,
                   decisionTree=dtFit,
                   randomForest=rfFit)

Collect resampling results for each model

resamps <- resamples(model_list)
resamps
## 
## Call:
## resamples.default(x = model_list)
## 
## Models: knn, svm, decisionTree, randomForest 
## Number of resamples: 25 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit
summary(resamps)
## 
## Call:
## summary.resamples(object = resamps)
## 
## Models: knn, svm, decisionTree, randomForest 
## Number of resamples: 25 
## 
## Accuracy 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## knn          0.6494253 0.6763006 0.6896552 0.6958385 0.7142857 0.7643678
## svm          0.6666667 0.6994220 0.7241379 0.7207491 0.7341040 0.7873563
## decisionTree 0.6494253 0.6647399 0.6820809 0.6871071 0.7011494 0.7514451
## randomForest 0.6820809 0.7109827 0.7225434 0.7248884 0.7413793 0.7745665
##              NA's
## knn             0
## svm             0
## decisionTree    0
## randomForest    0
## 
## Kappa 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## knn          0.5132532 0.5494582 0.5659846 0.5759926 0.6029405 0.6719246
## svm          0.5396405 0.5873584 0.6199144 0.6154010 0.6334746 0.7079477
## decisionTree 0.5204446 0.5381140 0.5650286 0.5706621 0.5876401 0.6588554
## randomForest 0.5631113 0.6014192 0.6164257 0.6221363 0.6424494 0.6931089
##              NA's
## knn             0
## svm             0
## decisionTree    0
## randomForest    0
bwplot(resamps)

G.4.7 Predict the test set using our best model

The random forest achieved the highest mean accuracy across the resamples, so we use it to classify the test set:

test_pred <- predict(rfFit, morphologyTest)
confusionMatrix(test_pred, stageTest)
## Confusion Matrix and Statistics
## 
##                    Reference
## Prediction          early trophozoite late trophozoite schizont uninfected
##   early trophozoite                27                3        7          3
##   late trophozoite                 16               57       28          0
##   schizont                          5               33       66          0
##   uninfected                        3                1        0        120
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7317          
##                  95% CI : (0.6834, 0.7763)
##     No Information Rate : 0.3333          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6305          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: early trophozoite Class: late trophozoite
## Sensitivity                           0.52941                  0.6064
## Specificity                           0.95912                  0.8400
## Pos Pred Value                        0.67500                  0.5644
## Neg Pred Value                        0.92705                  0.8619
## Prevalence                            0.13821                  0.2547
## Detection Rate                        0.07317                  0.1545
## Detection Prevalence                  0.10840                  0.2737
## Balanced Accuracy                     0.74427                  0.7232
##                      Class: schizont Class: uninfected
## Sensitivity                   0.6535            0.9756
## Specificity                   0.8582            0.9837
## Pos Pred Value                0.6346            0.9677
## Neg Pred Value                0.8679            0.9878
## Prevalence                    0.2737            0.3333
## Detection Rate                0.1789            0.3252
## Detection Prevalence          0.2818            0.3360
## Balanced Accuracy             0.7558            0.9797