#################################################################################################
######## Supervised modelling with R: the CARET package #########################################
#################################################################################################
# this tutorial was inspired by:
# http://will-stanton.com/machine-learning-with-r-an-irresponsibly-fast-tutorial/

library(caret)
library(randomForest)
library(rpart)
library(rpart.plot)
library(mclust)
library(e1071)
library(ROCR)
library(foreach)
library(doMC)

setwd("/pico/home/userinternal/msartori/Documents/CORSO_MILANO/DATA/TITANIC") #set working directory
#setwd("~/Google Drive/CINECA/CORSO_PARALLEL_COMPUTING/DATA/TITANIC") #set working directory
rm(list=ls()) #clear the workspace
set.seed(123) #set the seed, for reproducibility

##########################################################################################
######### parallelization (the only instruction caret needs) ############################
##########################################################################################
#registerDoMC(10) #uncomment to run caret's cross-validation folds on 10 cores

##########################################################################################
####################### load the data ####################################################
##########################################################################################
# we will use only the training set, since the test set is unlabelled
data=read.csv("train.csv",header=T,sep=",")

##########################################################################################
########### find and remove the rows that contain NAs ####################################
##########################################################################################
# dropping incomplete rows is the simplest option (at the cost of losing some data)
data$Embarked[which(data$Embarked=="")]=NA
NAs=function(i)length(which(is.na(data[i,c(1,2,3,5,6,7,8,10,12)]))) #count NAs in the columns we will use
remove=unlist(lapply(1:nrow(data),NAs))
data=data[which(remove==0),]
rm(remove,NAs)

##########################################################################################
####### coerce a) Survived (1=yes, 0=no), b) Pclass and c) Embarked to factors ###########
##########################################################################################
data$Pclass=factor(data$Pclass)
data$Survived=factor(data$Survived)
data$Embarked=factor(data$Embarked)

##########################################################################################
####### split the data into training/test sets ###########################################
##########################################################################################
trainID=sample(nrow(data),nrow(data)*(4/5))
trainSet=data[trainID,-c(1,4,9,11)]
testSet=data[-trainID,-c(1,4,9,11)]
# some variables (PassengerId, Name, Ticket, Cabin) have been excluded!
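
##########################################################################################
####### aside: a class-stratified alternative to the random split ########################
##########################################################################################
# the sample() call above ignores the class balance of Survived; caret's
# createDataPartition() samples within each class, so both sets keep the same 0/1
# proportions as the full data. a minimal sketch, not used in the rest of the script;
# the strata* names are ours, not caret objects
strataID=createDataPartition(data$Survived,p=4/5,list=FALSE) #stratified row indices
strataTrain=data[strataID,-c(1,4,9,11)]
strataTest=data[-strataID,-c(1,4,9,11)]
prop.table(table(strataTrain$Survived)) #class proportions match the full data
rm(strataID,strataTrain,strataTest)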
##########################################################################################
###### which are the most "useful" variables? ############################################
##########################################################################################
# careful! this is a very rough way to determine which variables are useful for our
# purposes: no further effort will be made to select the best subset of variables,
# since that is not the goal of this example

# passenger class (categorical)
table(trainSet$Survived,trainSet$Pclass) #it might be an interesting predictor

# age (numerical)
summary(trainSet$Age)
boxplot(trainSet$Age~trainSet$Survived) #probably not a good predictor

# fare (numerical)
summary(trainSet$Fare) #no NAs
boxplot(trainSet$Fare~trainSet$Survived) #it may be an interesting predictor

# sex (categorical)
table(trainSet$Survived,trainSet$Sex)
# it may be an interesting predictor, but it could be associated with the passenger class
table(trainSet$Sex,trainSet$Pclass)
# it could be worth inspecting the relationships among the variables

##########################################################################################
########## train and test a decision tree with the "rpart" package #######################
##########################################################################################
# important feature: the complexity parameter (cp) of the tree

# fit the model
dt=rpart(Survived~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare,data=trainSet)
dt
summary(dt)
ls(dt)

# plot the tree
rpart.plot(dt,type=4)

# predict the class probabilities of the new observations
dt_pred=predict(dt,newdata=testSet)

# ROC curve
pred=prediction(predictions=dt_pred[,2],labels=testSet$Survived,label.ordering=c("0","1"))
perf=performance(pred,"spec","sens")
plot(perf,print.cutoffs.at=c(0.5),colorize=T,main="ROC curve (decision tree)")
rm(perf,pred)

# confusion table
dt_pred=as.factor(apply(dt_pred,1,which.max)-1) #class with the highest probability
confusionMatrix(dt_pred,testSet$Survived,positive="1")
# accuracy                  = (# correctly predicted)/(# of observations)
# sensitivity               = (# true positives)/(# real positives)
# specificity               = (# true negatives)/(# real negatives)
# detection rate            = (# true positives)/(# of observations)
# positive predictive value = (# true positives)/(# predicted positives)
# negative predictive value = (# true negatives)/(# predicted negatives)
# kappa = adjusts accuracy by accounting for the possibility of a correct prediction by chance alone
# range of kappa: -1 to 1 (0 = no better than chance); see the sketch below
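
##########################################################################################
########## aside: the main confusionMatrix() statistics computed by hand #################
##########################################################################################
# a minimal illustration of the formulas listed above, assuming dt_pred and testSet as
# built in this script; tab/acc/sens/spec/kap are our own names, not caret objects
tab=table(predicted=dt_pred,observed=testSet$Survived)
TP=tab["1","1"]; TN=tab["0","0"]; FP=tab["1","0"]; FN=tab["0","1"]
n=sum(tab)
acc=(TP+TN)/n        #accuracy
sens=TP/(TP+FN)      #sensitivity: true positives over real positives
spec=TN/(TN+FP)      #specificity: true negatives over real negatives
# kappa: observed agreement (accuracy) vs the agreement expected by chance alone,
# the latter computed from the marginals of the confusion table
pe=((TP+FP)*(TP+FN)+(TN+FN)*(TN+FP))/n^2
kap=(acc-pe)/(1-pe)
c(accuracy=acc,sensitivity=sens,specificity=spec,kappa=kap)
rm(tab,TP,TN,FP,FN,n,acc,sens,spec,pe,kap)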
##########################################################################################
########## train and test a randomForest model with the "randomForest" package ###########
##########################################################################################
# important parameters:
# mtry  -> number of variables randomly sampled as candidates at each split
# ntree -> number of trees to grow: every input row must be predicted at least a few times
# let's assume that this algorithm works as a "black box"!

# fit the model
rf1=randomForest(Survived~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare,data=trainSet,ntree=500)
rf1
ls(rf1)

# visualize a single tree grown by the randomForest algorithm
getTree(rf1,k=1,labelVar=T)
table(is.na(getTree(rf1,k=1,labelVar=T)$prediction))
# the rows of getTree() are the nodes of the tree: "prediction" is NA for the internal
# split nodes and holds the predicted class for the terminal nodes (in this run the
# tree had 127 nodes, of which only the 64 terminal ones classify observations)

# predict the new observations and confusion table
rf1_pred=predict(rf1,newdata=testSet)
confusionMatrix(rf1_pred,testSet$Survived,positive="1")

##########################################################################################
########## train and test a randomForest model with CARET ################################
##########################################################################################
# fit the model (with parallelization over the CV step, if a backend is registered)
begin=Sys.time()
rf2_caret=train(Survived~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare, #model formula
                data=trainSet,
                method="rf", #method=randomForest
                trControl=trainControl(method="cv",number=20,classProbs=T)) #20-fold cross-validation
end=Sys.time()
print(end-begin)
rm(end,begin)
# NB: classProbs=T expects factor levels that are valid R variable names, so with the
# "0"/"1" labels used here caret will warn about the class probabilities
# NB: CV (cross-validation) evaluates the performance of a model using only the training data;
# caret does all of this itself!
# workflow:
# 1) split the training data into k equally sized pieces called folds
# 2) train the model on k-1 folds, and check its accuracy on the held-out fold
# 3) repeat this process so that each fold serves once as the held-out set
# 4) at the end, average the accuracy across the k held-out folds

# view the estimated model
rf2_caret
ls(rf2_caret)
rf2_caret$bestTune
rf2_caret$coef
rf2_caret$finalModel

# predict the new observations and confusion table
rf2_pred=predict(rf2_caret,newdata=testSet)
confusionMatrix(rf2_pred,testSet$Survived,positive="1")
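
##########################################################################################
########## aside: controlling the mtry search grid in caret ##############################
##########################################################################################
# by default train() tries only a few mtry values (see rf2_caret$bestTune for the winner);
# the candidates can be set explicitly through tuneGrid. a minimal sketch, assuming the
# same trainSet as above; rf_grid and rf3_caret are our own names
rf_grid=expand.grid(mtry=2:6) #candidate numbers of variables sampled at each split
rf3_caret=train(Survived~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare,
                data=trainSet,
                method="rf",
                tuneGrid=rf_grid,
                trControl=trainControl(method="cv",number=20))
rf3_caret$bestTune #the mtry value with the best cross-validated accuracy
rm(rf_grid,rf3_caret)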
##########################################################################################
########## train and test a logistic model ###############################################
##########################################################################################
# number of covariate patterns
dim(trainSet[,-1])         #all rows
dim(unique(trainSet[,-1])) #distinct covariate patterns
# some rows are duplicated!

# fit the model (cbind(successes,failures) is the binomial form of the glm response)
Survived=trainSet$Survived=="1"
log1=glm(cbind(Survived,1-Survived)~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare,
         family=binomial,data=trainSet[,-1])

# view the estimated model (we won't consider the diagnostics of the model)
summary(log1) # Pclass, Sex and Age have significant coefficients
ls(log1)

# predict the new observations and confusion table
# (confusionMatrix() expects factors, so the rounded probabilities are coerced)
log1_pred=factor(round(predict(log1,newdata=testSet,type="response")),levels=c("0","1"))
confusionMatrix(log1_pred,testSet$Survived,positive="1")

# ROC curve
pred=prediction(predictions=predict(log1,newdata=testSet,type="response"),
                labels=testSet$Survived,label.ordering=c("0","1"))
perf=performance(pred,"spec","sens")
plot(perf,print.cutoffs.at=c(0.5),colorize=T,main="ROC curve (logistic model)")
rm(perf,pred,Survived)

##########################################################################################
########## train and test a logistic model with CARET ####################################
##########################################################################################
# fit the model (with parallelization over the CV step, if a backend is registered)
begin=Sys.time()
log2_caret=train(Survived~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare, #model formula
                 data=trainSet,
                 method="glm", #method=logistic regression
                 trControl=trainControl(method="cv",number=20)) #20-fold cross-validation
end=Sys.time()
print(end-begin)
rm(end,begin)

# view the estimated model
summary(log2_caret) # Pclass, Sex and Age have significant coefficients
ls(log2_caret)
log2_caret$finalModel

# predict the new observations and confusion table
log2_pred=predict(log2_caret,newdata=testSet)
confusionMatrix(log2_pred,testSet$Survived,positive="1")
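
##########################################################################################
########## aside: comparing the two caret models on their CV folds #######################
##########################################################################################
# caret can compare models fitted with train() through their cross-validation resamples;
# a minimal sketch, assuming rf2_caret and log2_caret as fitted above. for a strictly
# paired comparison both models should be trained on identical folds (e.g. by passing
# the same index to trainControl); cv_results is our own name
cv_results=resamples(list(randomForest=rf2_caret,logistic=log2_caret))
summary(cv_results) #accuracy and kappa distributions across the 20 folds
bwplot(cv_results)  #boxplots of the resampled accuracy/kappa for the two models
rm(cv_results)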