##############################################################################
######## R parallelization: model-based clustering example ###################
######## on bivariate randomly generated data ################################
##############################################################################
# This tutorial is about R parallelization and has been inspired from
# http://glennklockwood.com/di/R-para.php

rm(list = ls())  # clear workspace (tutorial convenience; avoid in production)

# set working directory and load the data-generation helper
# NOTE(review): hard-coded cluster paths -- adjust for your environment
setwd("/pico/home/userinternal/msartori/Documents/CORSO_MILANO/DATA/")
source("/pico/home/userinternal/msartori/Documents/CORSO_MILANO/CODE/R/DATA_GENERATION/genData.R")

# install.packages("e1071")
# install.packages("snow")
# install.packages("mclust")
# install.packages("doMC")
# install.packages("foreach")
library(e1071)
library(parallel)
library(foreach)
library(doMC)
library(mclust)
library(snow)

##############################################################################
######## data generation and data writing ####################################
##############################################################################

# setting the parameters
nrow <- 1000  # number of obs of each cluster (shadows base::nrow; rm'd below)
# standard deviation of each cluster (5 clusters x 2 dimensions)
sd <- matrix(c(0.5, 1, 0.7, 0.9, 0.4, 0.7, 0.2, 1.2, 0.1, 0.3), 5, 2)
# the real centers of the clusters (5 clusters x 2 dimensions)
real.centers <- matrix(c(-2.0, -1.5, 0.0, +0.7, +1.5,
                         -1.4, +1.5, 0.1, -1.3, +1.5), 5, 2)
seed <- 1234  # set seed: in this way the generated data will be replicable

# data generation -- total: 5000 bivariate obs (1000 obs for each group)
data <- genData(nrow, sd, real.centers, seed)
labels <- data$labels
data <- data$data

# plot the data, coloured by true cluster label
plot(x = data[, 1], y = data[, 2], col = labels,
     main = "Randomly generated bivariate data", xlab = "x", ylab = "y")

# write and read the data
write.csv(data, file = 'smallGenData.csv', row.names = FALSE)
data <- read.csv('smallGenData.csv')  # load the data set

# removing useless items
rm(real.centers, sd, nrow, seed)

##############################################################################
######## definition of the parallelization function ##########################
##############################################################################

# parameter grid: each row is a combination of modelNames and G
# (see the Mclust function) -- 12 combinations!
modelNames <- c("EII", "EEI", "EEE", "VVV")
G <- 4:6
param <- expand.grid(G = G, modelNames = modelNames)
rm(G, modelNames)

# how many cores do we want to exploit during the parallelization?
# one task per parameter combination, capped at 16 cores
ncores <- min(nrow(param), 16)

# fit one Mclust model for the i-th row of the parameter grid.
# NB: the parallelization will not be about splitting the data, but about
# choosing the best parameters: the data are hard-coded in the function,
# the variables here are "G" (1) and "modelNames" (2)
parallel.function <- function(i) {
  Mclust(data = data,
         G = param$G[i],  # G must be numeric (was wrongly passed as character)
         modelNames = as.character(param$modelNames[i]),  # factor -> string
         prior = priorControl())
}

##############################################################################
################## EXAMPLE 1: mclapply (shared-memory parallelism) ###########
##############################################################################

b <- Sys.time()
results_list <- mclapply(seq_len(nrow(param)), parallel.function,
                         mc.cores = ncores)
e <- Sys.time()
mclapply_time <- e - b
print(mclapply_time)

# keep the fitted model with the highest BIC
mclapply_result <- results_list[[which.max(sapply(results_list,
                                                  function(x) x$bic))]]
rm(e, b, results_list)
ls(mclapply_result)
print(mclapply_result$G)
print(mclapply_result$modelName)

# matching the results with the actual labels
# (use the full component name "classification", not partial matching "$cl")
matchClasses(table(labels, mclapply_result$classification), method = "exact")

# entropy of the result: -sum(p * log(p)) over the cluster proportions
# (the original formula lacked the minus sign, i.e. it was negative entropy)
p <- table(mclapply_result$classification) /
  length(mclapply_result$classification)
mclapply_entropy <- -sum(p * log(p))
rm(p)

# which obs have been incorrectly classified? (plotted in red: col = 2)
mis <- classError(mclapply_result$classification, truth = labels)$misclassified
errors <- rep(1, length(labels))  # robust to the case of zero errors
errors[mis] <- 2
plot(data[, 1], data[, 2], col = errors,
     main = "misclassified observations (red dots)", xlab = "x", ylab = "y")
by(errors - 1, labels, mean)  # misclassification rate per true cluster
rm(mis)

##############################################################################
######### EXAMPLE 2: running the best model only (no parallelization) ########
##############################################################################

b <- Sys.time()
Mclust(data, G = mclapply_result$G, modelNames = mclapply_result$modelName,
       prior = priorControl())
e <- Sys.time()
print(e - b)
rm(b, e)
# running this model only takes approximately the same time as the
# parallelized function

##############################################################################
######## EXAMPLE 3: doMC foreach (shared-memory parallelism) #################
##############################################################################

# registering the backend is a necessary step in order to make foreach
# work in parallel
registerDoMC(ncores)

b <- Sys.time()
results_list <- foreach(i = seq_len(nrow(param))) %dopar% parallel.function(i)
e <- Sys.time()
doMC_time <- e - b
print(doMC_time)

# keep the fitted model with the highest BIC
doMC_result <- results_list[[which.max(sapply(results_list,
                                              function(x) x$bic))]]
rm(e, b, results_list)
ls(doMC_result)
print(doMC_result$G)          # bug fix: was printing mclapply_result$G
print(doMC_result$modelName)  # bug fix: was printing mclapply_result$modelName

# matching the results with the actual labels
matchClasses(table(labels, doMC_result$classification), method = "exact")

# entropy of the result (same sign fix as above)
p <- table(doMC_result$classification) / length(doMC_result$classification)
doMC_entropy <- -sum(p * log(p))
rm(p)