#################################################################################################
######## Supervised modelling with R: the CARET package #########################################
#################################################################################################
# this tutorial was inspired by:
# http://will-stanton.com/machine-learning-with-r-an-irresponsibly-fast-tutorial/

library(caret)
library(randomForest)
library(rpart)
library(rpart.plot)
library(mclust)
library(e1071)
library(ROCR)
library(foreach)
library(doMC)

setwd("/pico/home/userinternal/msartori/Documents/CORSO_MILANO/DATA/TITANIC") #set working directory
#setwd("~/Google Drive/CINECA/CORSO_PARALLEL_COMPUTING/DATA/TITANIC") #set working directory
rm(list=ls()) #clear the workspace
set.seed(123) #set the seed, for reproducibility

##########################################################################################
######### parallelization (the only instruction caret needs) ############################
##########################################################################################
#registerDoMC(10) #uncomment to run caret's cross-validation folds on 10 cores

##########################################################################################
####################### load the data ####################################################
##########################################################################################
# we will use only the training set, since the test set is unlabelled
data=read.csv("train.csv",header=T,sep=",")

##########################################################################################
########### find and remove the rows that contain NAs ####################################
##########################################################################################
# dropping incomplete rows is the simplest option (at the cost of losing some data)
data$Embarked[which(data$Embarked=="")]=NA
NAs=function(i)length(which(is.na(data[i,c(1,2,3,5,6,7,8,10,12)]))) #count NAs in the columns we will use
remove=unlist(lapply(1:nrow(data),NAs))
data=data[which(remove==0),]
rm(remove,NAs)

##########################################################################################
####### coerce a) Survived (1=yes, 0=no), b) Pclass and c) Embarked to factors ###########
##########################################################################################
data$Pclass=factor(data$Pclass)
data$Survived=factor(data$Survived)
data$Embarked=factor(data$Embarked)

##########################################################################################
####### split the data into training/test sets ###########################################
##########################################################################################
trainID=sample(nrow(data),nrow(data)*(4/5))
trainSet=data[trainID,-c(1,4,9,11)]
testSet=data[-trainID,-c(1,4,9,11)]
# some variables (PassengerId, Name, Ticket, Cabin) have been excluded!
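
##########################################################################################
####### aside: a class-stratified alternative to the random split ########################
##########################################################################################
# the sample() call above ignores the class balance of Survived; caret's
# createDataPartition() samples within each class, so both sets keep the same 0/1
# proportions as the full data. a minimal sketch, not used in the rest of the script;
# the strata* names are ours, not caret objects
strataID=createDataPartition(data$Survived,p=4/5,list=FALSE) #stratified row indices
strataTrain=data[strataID,-c(1,4,9,11)]
strataTest=data[-strataID,-c(1,4,9,11)]
prop.table(table(strataTrain$Survived)) #class proportions match the full data
rm(strataID,strataTrain,strataTest)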
##########################################################################################
###### which are the most "useful" variables? ############################################
##########################################################################################
# careful! this is a very rough way to determine which variables are useful for our
# purposes: no further effort will be made to select the best subset of variables,
# since that is not the goal of this example

# passenger class (categorical)
table(trainSet$Survived,trainSet$Pclass) #it might be an interesting predictor

# age (numerical)
summary(trainSet$Age)
boxplot(trainSet$Age~trainSet$Survived) #probably not a good predictor

# fare (numerical)
summary(trainSet$Fare) #no NAs
boxplot(trainSet$Fare~trainSet$Survived) #it may be an interesting predictor

# sex (categorical)
table(trainSet$Survived,trainSet$Sex)
# it may be an interesting predictor, but it could be associated with the passenger class
table(trainSet$Sex,trainSet$Pclass)
# it could be worth inspecting the relationships among the variables

##########################################################################################
########## train and test a decision tree with the "rpart" package #######################
##########################################################################################
# important feature: the complexity parameter (cp) of the tree

# fit the model
dt=rpart(Survived~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare,data=trainSet)
dt
summary(dt)
ls(dt)

# plot the tree
rpart.plot(dt,type=4)

# predict the class probabilities of the new observations
dt_pred=predict(dt,newdata=testSet)

# ROC curve
pred=prediction(predictions=dt_pred[,2],labels=testSet$Survived,label.ordering=c("0","1"))
perf=performance(pred,"spec","sens")
plot(perf,print.cutoffs.at=c(0.5),colorize=T,main="ROC curve (decision tree)")
rm(perf,pred)

# confusion table
dt_pred=as.factor(apply(dt_pred,1,which.max)-1) #class with the highest probability
confusionMatrix(dt_pred,testSet$Survived,positive="1")
# accuracy                  = (# correctly predicted)/(# of observations)
# sensitivity               = (# true positives)/(# real positives)
# specificity               = (# true negatives)/(# real negatives)
# detection rate            = (# true positives)/(# of observations)
# positive predictive value = (# true positives)/(# predicted positives)
# negative predictive value = (# true negatives)/(# predicted negatives)
# kappa = adjusts accuracy by accounting for the possibility of a correct prediction by chance alone
# range of kappa: -1 to 1 (0 = no better than chance); see the sketch below
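
##########################################################################################
########## aside: the main confusionMatrix() statistics computed by hand #################
##########################################################################################
# a minimal illustration of the formulas listed above, assuming dt_pred and testSet as
# built in this script; tab/acc/sens/spec/kap are our own names, not caret objects
tab=table(predicted=dt_pred,observed=testSet$Survived)
TP=tab["1","1"]; TN=tab["0","0"]; FP=tab["1","0"]; FN=tab["0","1"]
n=sum(tab)
acc=(TP+TN)/n        #accuracy
sens=TP/(TP+FN)      #sensitivity: true positives over real positives
spec=TN/(TN+FP)      #specificity: true negatives over real negatives
# kappa: observed agreement (accuracy) vs the agreement expected by chance alone,
# the latter computed from the marginals of the confusion table
pe=((TP+FP)*(TP+FN)+(TN+FN)*(TN+FP))/n^2
kap=(acc-pe)/(1-pe)
c(accuracy=acc,sensitivity=sens,specificity=spec,kappa=kap)
rm(tab,TP,TN,FP,FN,n,acc,sens,spec,pe,kap)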
##########################################################################################
########## train and test a randomForest model with the "randomForest" package ###########
##########################################################################################
# important parameters:
# mtry  -> number of variables randomly sampled as candidates at each split
# ntree -> number of trees to grow: every input row must be predicted at least a few times
# let's assume that this algorithm works as a "black box"!

# fit the model
rf1=randomForest(Survived~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare,data=trainSet,ntree=500)
rf1
ls(rf1)

# visualize a single tree grown by the randomForest algorithm
getTree(rf1,k=1,labelVar=T)
table(is.na(getTree(rf1,k=1,labelVar=T)$prediction))
# the rows of getTree() are the nodes of the tree: "prediction" is NA for the internal
# split nodes and holds the predicted class for the terminal nodes (in this run the
# tree had 127 nodes, of which only the 64 terminal ones classify observations)

# predict the new observations and confusion table
rf1_pred=predict(rf1,newdata=testSet)
confusionMatrix(rf1_pred,testSet$Survived,positive="1")

##########################################################################################
########## train and test a randomForest model with CARET ################################
##########################################################################################
# fit the model (with parallelization over the CV step, if a backend is registered)
begin=Sys.time()
rf2_caret=train(Survived~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare, #model formula
                data=trainSet,
                method="rf", #method=randomForest
                trControl=trainControl(method="cv",number=20,classProbs=T)) #20-fold cross-validation
end=Sys.time()
print(end-begin)
rm(end,begin)
# NB: classProbs=T expects factor levels that are valid R variable names, so with the
# "0"/"1" labels used here caret will warn about the class probabilities
# NB: CV (cross-validation) evaluates the performance of a model using only the training data;
# caret does all of this itself!
# workflow:
# 1) split the training data into k equally sized pieces called folds
# 2) train the model on k-1 folds, and check its accuracy on the held-out fold
# 3) repeat this process so that each fold serves once as the held-out set
# 4) at the end, average the accuracy across the k held-out folds

# view the estimated model
rf2_caret
ls(rf2_caret)
rf2_caret$bestTune
rf2_caret$coef
rf2_caret$finalModel

# predict the new observations and confusion table
rf2_pred=predict(rf2_caret,newdata=testSet)
confusionMatrix(rf2_pred,testSet$Survived,positive="1")
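
##########################################################################################
########## aside: controlling the mtry search grid in caret ##############################
##########################################################################################
# by default train() tries only a few mtry values (see rf2_caret$bestTune for the winner);
# the candidates can be set explicitly through tuneGrid. a minimal sketch, assuming the
# same trainSet as above; rf_grid and rf3_caret are our own names
rf_grid=expand.grid(mtry=2:6) #candidate numbers of variables sampled at each split
rf3_caret=train(Survived~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare,
                data=trainSet,
                method="rf",
                tuneGrid=rf_grid,
                trControl=trainControl(method="cv",number=20))
rf3_caret$bestTune #the mtry value with the best cross-validated accuracy
rm(rf_grid,rf3_caret)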
##########################################################################################
########## train and test a logistic model ###############################################
##########################################################################################
# number of covariate patterns
dim(trainSet[,-1])         #all rows
dim(unique(trainSet[,-1])) #distinct covariate patterns
# some rows are duplicated!

# fit the model (cbind(successes,failures) is the binomial form of the glm response)
Survived=trainSet$Survived=="1"
log1=glm(cbind(Survived,1-Survived)~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare,
         family=binomial,data=trainSet[,-1])

# view the estimated model (we won't consider the diagnostics of the model)
summary(log1) # Pclass, Sex and Age have significant coefficients
ls(log1)

# predict the new observations and confusion table
# (confusionMatrix() expects factors, so the rounded probabilities are coerced)
log1_pred=factor(round(predict(log1,newdata=testSet,type="response")),levels=c("0","1"))
confusionMatrix(log1_pred,testSet$Survived,positive="1")

# ROC curve
pred=prediction(predictions=predict(log1,newdata=testSet,type="response"),
                labels=testSet$Survived,label.ordering=c("0","1"))
perf=performance(pred,"spec","sens")
plot(perf,print.cutoffs.at=c(0.5),colorize=T,main="ROC curve (logistic model)")
rm(perf,pred,Survived)

##########################################################################################
########## train and test a logistic model with CARET ####################################
##########################################################################################
# fit the model (with parallelization over the CV step, if a backend is registered)
begin=Sys.time()
log2_caret=train(Survived~Pclass+Sex+Age+SibSp+Embarked+Parch+Fare, #model formula
                 data=trainSet,
                 method="glm", #method=logistic regression
                 trControl=trainControl(method="cv",number=20)) #20-fold cross-validation
end=Sys.time()
print(end-begin)
rm(end,begin)

# view the estimated model
summary(log2_caret) # Pclass, Sex and Age have significant coefficients
ls(log2_caret)
log2_caret$finalModel

# predict the new observations and confusion table
log2_pred=predict(log2_caret,newdata=testSet)
confusionMatrix(log2_pred,testSet$Survived,positive="1")
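
##########################################################################################
########## aside: comparing the two caret models on their CV folds #######################
##########################################################################################
# caret can compare models fitted with train() through their cross-validation resamples;
# a minimal sketch, assuming rf2_caret and log2_caret as fitted above. for a strictly
# paired comparison both models should be trained on identical folds (e.g. by passing
# the same index to trainControl); cv_results is our own name
cv_results=resamples(list(randomForest=rf2_caret,logistic=log2_caret))
summary(cv_results) #accuracy and kappa distributions across the 20 folds
bwplot(cv_results)  #boxplots of the resampled accuracy/kappa for the two models
rm(cv_results)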