################################################################################
################################################################################
#### R parallelization: interactive qsub example                            ####
#### (MPI-based distributed memory)                                         ####
################################################################################
################################################################################
#
# Purpose: run several independent k-means fits on the continuous KDD Cup
# variables in parallel on an MPI (snow) cluster, keep the fit with the lowest
# total within-cluster sum of squares, and compare it with the true labels.

# NOTE(review): clearing the workspace and hard-coding a working directory make
# the script non-portable; both are kept so the script behaves as before.
rm(list = ls()) # clear workspace
setwd("/pico/home/userinternal/msartori/Documents/CORSO_MILANO/DATA/KDDCUP") # set working directory
# NOTE(review): this seeds the master process only; the k-means fits run on the
# workers, so it presumably does not make them reproducible - confirm.
set.seed(1234) # set seed

# loading the necessary R packages
library(e1071)
library(parallel)
library(foreach)
library(doMC)
library(snow)
library(Rmpi)
library(doSNOW)

# loading data: column 42 holds the class label, the others are the features
data <- read.table("kddcup.data", sep = ",", header = FALSE)
labels <- data[, 42]
data <- data[, -42]

# managing the labels
class(labels)
# Strip the trailing "." from each label. Working on the character values
# (instead of looping over factor levels) behaves the same whether read.table
# returned a factor (R < 4.0) or a character vector (R >= 4.0).
labels <- sub("\\.$", "", as.character(labels))
round(table(labels) / length(labels), 3)

# assigning names to the columns of the data frame
decodifica <- read.table("decodifica_kddcup.txt", sep = ",", header = FALSE)
colnames(decodifica) <- c("label", "description")
colnames(data) <- as.character(decodifica$label)

# discarding the non-continuous variables
# A positive selection is used because `data[, -which(cond)]` would drop every
# column when no column satisfies `cond` (negative indexing with integer(0)).
reducedData <- data[, which(decodifica$description == "continuous"),
                    drop = FALSE]
stopifnot(ncol(reducedData) > 0)

# how many groups of observations?
# (it is the only information we suppose to be known)
nClusters <- length(unique(labels))

# definition of the parallelization function
# Runs one k-means fit on `reducedData` with `nClusters` centers.
#   i: number of random starts (`nstart`) used by this fit.
# Returns the object produced by kmeans(). `reducedData` and `nClusters` are
# read from the global environment, so they must be exported to the workers.
parallel.function <- function(i) {
  kmeans(x = reducedData, centers = nClusters, nstart = i, iter.max = 20)
}

################################################################################
#### MPI-based foreach parallelization (distributed memory approach)        ####
################################################################################

# creating the cluster
cl <- makeCluster(10, type = "MPI")

# The parallel section is wrapped in tryCatch() so the MPI workers are always
# released, even when one of the fits fails.
tryCatch(
  {
    clusterExport(cl, c("reducedData", "nClusters"))
    registerDoSNOW(cl)

    begin <- Sys.time()
    # 10 tasks, each one a k-means fit with 5 random starts
    results_list <- foreach(i = rep(5, times = 10)) %dopar% parallel.function(i)
    tot.withinss <- vapply(
      results_list,
      function(fit) fit$tot.withinss,
      numeric(1)
    )
    # keep the fit with the smallest total within-cluster sum of squares
    MPI_result <- results_list[[which.min(tot.withinss)]]
    end <- Sys.time()
    MPI_time <- end - begin
    print(MPI_time)
  },
  # stopping the cluster
  finally = stopCluster(cl)
)
# rm(end, begin, results_list, tot.withinss)

ls(MPI_result)

# matching the results with the actual labels
matchClasses(table(labels, MPI_result$cluster), method = "greedy")

# entropy of the result: H = -sum(p * log(p)) over the cluster proportions
cluster_share <- table(MPI_result$cluster) / length(MPI_result$cluster)
MPI_entropy <- -sum(cluster_share * log(cluster_share))

mpi.exit()