##############################################################################
######## R parallelization: model-based clustering example ###################
######## on bivariate randomly generated data ################################
##############################################################################
# This tutorial is about R parallelization and has been inspired from
# http://glennklockwood.com/di/R-para.php

rm(list = ls())  # clear workspace (tutorial convenience; avoid in production)

# set working directory and load the data-generation helper
# NOTE(review): hard-coded cluster paths -- adjust for your environment
setwd("/pico/home/userinternal/msartori/Documents/CORSO_MILANO/DATA/")
source("/pico/home/userinternal/msartori/Documents/CORSO_MILANO/CODE/R/DATA_GENERATION/genData.R")

# install.packages("e1071")
# install.packages("snow")
# install.packages("mclust")
# install.packages("doMC")
# install.packages("foreach")
library(e1071)
library(parallel)
library(foreach)
library(doMC)
library(mclust)
library(snow)

##############################################################################
######## data generation and data writing ####################################
##############################################################################

# setting the parameters
nrow <- 1000  # number of obs of each cluster (shadows base::nrow; rm'd below)
# standard deviation of each cluster (5 clusters x 2 dimensions)
sd <- matrix(c(0.5, 1, 0.7, 0.9, 0.4, 0.7, 0.2, 1.2, 0.1, 0.3), 5, 2)
# the real centers of the clusters (5 clusters x 2 dimensions)
real.centers <- matrix(c(-2.0, -1.5, 0.0, +0.7, +1.5,
                         -1.4, +1.5, 0.1, -1.3, +1.5), 5, 2)
seed <- 1234  # set seed: in this way the generated data will be replicable

# data generation -- total: 5000 bivariate obs (1000 obs for each group)
data <- genData(nrow, sd, real.centers, seed)
labels <- data$labels
data <- data$data

# plot the data, coloured by true cluster label
plot(x = data[, 1], y = data[, 2], col = labels,
     main = "Randomly generated bivariate data", xlab = "x", ylab = "y")

# write and read the data
write.csv(data, file = 'smallGenData.csv', row.names = FALSE)
data <- read.csv('smallGenData.csv')  # load the data set

# removing useless items
rm(real.centers, sd, nrow, seed)

##############################################################################
######## definition of the parallelization function ##########################
##############################################################################

# parameter grid: each row is a combination of modelNames and G
# (see the Mclust function) -- 12 combinations!
modelNames <- c("EII", "EEI", "EEE", "VVV")
G <- 4:6
param <- expand.grid(G = G, modelNames = modelNames)
rm(G, modelNames)

# how many cores do we want to exploit during the parallelization?
# one task per parameter combination, capped at 16 cores
ncores <- min(nrow(param), 16)

# fit one Mclust model for the i-th row of the parameter grid.
# NB: the parallelization will not be about splitting the data, but about
# choosing the best parameters: the data are hard-coded in the function,
# the variables here are "G" (1) and "modelNames" (2)
parallel.function <- function(i) {
  Mclust(data = data,
         G = param$G[i],  # G must be numeric (was wrongly passed as character)
         modelNames = as.character(param$modelNames[i]),  # factor -> string
         prior = priorControl())
}

##############################################################################
################## EXAMPLE 1: mclapply (shared-memory parallelism) ###########
##############################################################################

b <- Sys.time()
results_list <- mclapply(seq_len(nrow(param)), parallel.function,
                         mc.cores = ncores)
e <- Sys.time()
mclapply_time <- e - b
print(mclapply_time)

# keep the fitted model with the highest BIC
mclapply_result <- results_list[[which.max(sapply(results_list,
                                                  function(x) x$bic))]]
rm(e, b, results_list)
ls(mclapply_result)
print(mclapply_result$G)
print(mclapply_result$modelName)

# matching the results with the actual labels
# (use the full component name "classification", not partial matching "$cl")
matchClasses(table(labels, mclapply_result$classification), method = "exact")

# entropy of the result: -sum(p * log(p)) over the cluster proportions
# (the original formula lacked the minus sign, i.e. it was negative entropy)
p <- table(mclapply_result$classification) /
  length(mclapply_result$classification)
mclapply_entropy <- -sum(p * log(p))
rm(p)

# which obs have been incorrectly classified? (plotted in red: col = 2)
mis <- classError(mclapply_result$classification, truth = labels)$misclassified
errors <- rep(1, length(labels))  # robust to the case of zero errors
errors[mis] <- 2
plot(data[, 1], data[, 2], col = errors,
     main = "misclassified observations (red dots)", xlab = "x", ylab = "y")
by(errors - 1, labels, mean)  # misclassification rate per true cluster
rm(mis)

##############################################################################
######### EXAMPLE 2: running the best model only (no parallelization) ########
##############################################################################

b <- Sys.time()
Mclust(data, G = mclapply_result$G, modelNames = mclapply_result$modelName,
       prior = priorControl())
e <- Sys.time()
print(e - b)
rm(b, e)
# running this model only takes approximately the same time as the
# parallelized function

##############################################################################
######## EXAMPLE 3: doMC foreach (shared-memory parallelism) #################
##############################################################################

# registering the backend is a necessary step in order to make foreach
# work in parallel
registerDoMC(ncores)

b <- Sys.time()
results_list <- foreach(i = seq_len(nrow(param))) %dopar% parallel.function(i)
e <- Sys.time()
doMC_time <- e - b
print(doMC_time)

# keep the fitted model with the highest BIC
doMC_result <- results_list[[which.max(sapply(results_list,
                                              function(x) x$bic))]]
rm(e, b, results_list)
ls(doMC_result)
print(doMC_result$G)          # bug fix: was printing mclapply_result$G
print(doMC_result$modelName)  # bug fix: was printing mclapply_result$modelName

# matching the results with the actual labels
matchClasses(table(labels, doMC_result$classification), method = "exact")

# entropy of the result (same sign fix as above)
p <- table(doMC_result$classification) / length(doMC_result$classification)
doMC_entropy <- -sum(p * log(p))
rm(p)