# Start from a clean workspace: remove every object listed by ls().
rm(list=ls())

# Packages attached by the script.
# NOTE(review): nothing in this part of the script calls foreach/doParallel/
# doMC explicitly - confirm whether they are still required.
library(foreach)
library(iterators)
library(doParallel)
library(parallel)
library(doMC)


#############################################################################
#################### set working directory  #################################
#############################################################################

# All relative file names used below (reducedData.txt, treeTagger_output.txt,
# taggedData.load, cluto_matrix.mat, ...) are resolved against this directory.
setwd("/pico/home/userinternal/msartori/Documents/CORSO_MILANO/DATA/POLIMI/RESULTS")

#############################################################################
############# load treeTagger and CLUTO ################################
#############################################################################

# Before running this script, open a shell on PICO and load the
# required modules with the following commands:

# module load treetagger
# module load cluto


#############################################################################
############# load polarity and stopwords ################################
#############################################################################

# Sentiment dictionary: the first column is forced to character and exact
# duplicate rows are dropped.
polarity <- read.csv(
  "/pico/home/userinternal/msartori/Documents/CORSO_MILANO/DATA/POLIMI/polarity.csv",
  header = TRUE
)
polarity[, 1] <- as.character(polarity[, 1])
polarity <- unique(polarity)


# Stopword list: first column of the file, lower-cased and de-duplicated.
stopwords <- read.table(
  "/pico/home/userinternal/msartori/Documents/CORSO_MILANO/DATA/POLIMI/Stopwords_2015_06_04.txt",
  header = FALSE
)
stopwords <- unique(tolower(as.character(stopwords[, 1])))


#######################################################################################
##### read and manage the data ( first pre-processing steps) ##########################
#######################################################################################

# Tab-separated input with a header row; quote and comment characters are
# disabled so that they are read literally.
data <- read.table(
  "/pico/home/userinternal/msartori/Documents/CORSO_MILANO/DATA/POLIMI/polimi.txt",
  header = TRUE, comment.char = "", quote = "", sep = "\t"
) # class: data.frame
colnames(data)
# NOTE(review): row 43023 is dropped by position - the reason is not visible
# here, confirm it still applies to the current input file.
data <- data[-43023, ]
# Keep the first occurrence of every (column 1, column 18) combination.
data <- data[!duplicated(data[, c(1, 18)]), ]
# Change encoding: every column is passed through iconv(), so all columns
# come back as text.
data <- as.data.frame(sapply(data, function(column) {
  iconv(column, from = "LATIN1", to = "UTF8", sub = "byte")
}))
data$text <- as.character(data$text)

# relevel motivo_estrazione 
# Lookup table used by cambia(): ';'-separated file whose two columns are
# named "from" (label to replace) and "to" (replacement).
Uni <- read.table(
  "/pico/home/userinternal/msartori/Documents/CORSO_MILANO/DATA/POLIMI/UNIVERSITIES.txt",
  sep = ";"
)
colnames(Uni) <- c("from", "to")
# Same LATIN1 -> UTF8 conversion that is applied to the main data set.
Uni <- as.data.frame(sapply(Uni, function(column) {
  iconv(column, from = "LATIN1", to = "UTF8", sub = "byte")
}))
Uni$from <- as.character(Uni$from)
Uni$to <- as.character(Uni$to)

# Translate one extraction label through a two-column lookup table.
#
# Args:
#   motivo:    a single label (length-one vector).
#   newMotivo: lookup table; column 1 holds the labels to replace, column 2
#              the replacement. Defaults to the global `Uni`.
#
# Returns:
#   The column-2 value of the FIRST row whose column 1 equals `motivo`, or
#   `motivo` unchanged when no row matches. The result always has length one,
#   so apply()-ing this function over a column yields a plain vector even if
#   the lookup table contains duplicated keys.
cambia <- function(motivo, newMotivo = Uni) {
  # match() returns the position of the first hit, or NA when there is none.
  hit <- match(motivo, newMotivo[, 1])
  if (is.na(hit)) {
    return(motivo)
  }
  newMotivo[hit, 2]
}
# Relabel every extraction label through cambia().
# NOTE(review): the mapping is applied twice - presumably so that a label
# mapped to another key of the lookup table is resolved too; confirm.
data$motivo_estrazione <- apply(as.matrix(data$motivo_estrazione), 1, FUN = cambia)
data$motivo_estrazione <- apply(as.matrix(data$motivo_estrazione), 1, FUN = cambia)

# Quick checks: distinct labels and number of missing labels.
unique(data$motivo_estrazione)
table(is.na(data$motivo_estrazione))


# keep only UNIBO tweets
data <- data[as.character(data$motivo_estrazione) %in% "UniboMagazine", ]


## drop the retweets and split the data set into "general infos" and "data"
## (they will be merged again later)

# A logical mask is used instead of `-which(...)`: with a negative index an
# input without any retweet would select ZERO rows and silently empty both
# data frames. `%in%` never returns NA, so rows with a missing text are kept.
is_retweet <- substr(data$text, 1, 2) %in% "RT"

# Columns 2, 19 and 21 are kept as ID, motivo and isUtente (all as text).
general_infos <- data[!is_retweet, c(2, 19, 21)]
colnames(general_infos) <- c("ID", "motivo", "isUtente")
general_infos$ID <- as.character(general_infos$ID)
general_infos$motivo <- as.character(general_infos$motivo)
general_infos$isUtente <- as.character(general_infos$isUtente)

# Columns 2 and 1 are kept, in this order, as ID and MESSAGE.
data <- data[!is_retweet, c(2, 1)]
colnames(data) <- c("ID", "MESSAGE")
rm(is_retweet)

# save the original ID column (data$ID is temporarily rewritten further down)
ID <- as.character(data$ID)


# Strip URLs: in every message containing "http", delete each run that starts
# with http / https (optional ':' and slashes) up to the next whitespace.
mod <- grep("http", data$MESSAGE)
data$MESSAGE[mod] <- gsub("http(s?)(:?)(/?)(/?)\\S*", "", data$MESSAGE[mod])
rm(mod)

# Prefix a value with FIVE hashes ("#####") so that it can be recognised
# again later on.
#
# Args:
#   x: value(s) to tag; coerced to character.
# Returns:
#   Character vector "#####<x>".
hash <- function(x) {
  paste0("#####", x)
}
# Tag every ID with the "#####" prefix (see hash()); the markers are searched
# for in the tagger output below to find where each message starts.
data$ID=apply(as.data.frame(data$ID),1,hash)

####################################################################################################
########### lemmatization with treeTagger (only possible in a UNIX environment) ###################
####################################################################################################


# Dump the tagged IDs and the messages without header or row names.
write.table(data, file="reducedData.txt",row.names=F,col.names=F)
# Run the Italian TreeTagger wrapper through the shell; its standard output is
# redirected to treeTagger_output.txt.
system("tree-tagger-italian reducedData.txt > treeTagger_output.txt")


# Read the tagger output back: tab-separated, quote/comment characters taken
# literally, empty fields turned into NA. The code below uses column 1 as the
# token, column 2 as the tag and column 3 as the lemma.
taggedResults=read.table("treeTagger_output.txt",sep="\t",comment.char="",quote="",header=F,na.strings = "")
# The temporary input file is no longer needed.
system("rm reducedData.txt")
# The tagger output is kept for now; it is removed after the PERL step.
#system("rm treeTagger_output.txt")


############################################################################################################
################## keep only the rows of taggedResults that contain useful types (for SENTIMENT STEP) ######
############################################################################################################


# Keep the rows whose tag starts with one of the useful types. The rows that
# hold an ID marker (token equal to one of the "#####<id>" strings currently
# stored in data$ID) are kept whatever tag the tagger gave them: the next step
# needs exactly one marker per message, and a marker filtered out here would
# make the ID assignment fail.
tagCorrect <- c("NOM", "VER", "ADJ", "ADV")
tagged_token <- as.character(taggedResults[, 1])
tagged_type <- substr(as.character(taggedResults[, 2]), 1, 3)
keep <- which(tagged_type %in% tagCorrect | tagged_token %in% data$ID)
taggedResults <- taggedResults[keep, ]
rm(keep, tagged_token, tagged_type)


###########################################################
######## assign an ID to each word of taggedResults #######
###########################################################


# Positions of the marker rows (token starting with "#####").
where_ID <- which(substr(as.character(taggedResults[, 1]), 1, 5) == "#####")
# Each block runs from its marker up to the row before the next marker; the
# last block ends with the last row of the table.
temp <- c(where_ID[-1], nrow(taggedResults) + 1)
ID_length <- temp - where_ID
# Repeat every ID (prefix stripped) over its block. This relies on the i-th
# marker belonging to the i-th row of `data`.
taggedResults$ID <- rep(substr(data$ID, 6, nchar(data$ID)), times = ID_length)


##########################################
########### manage the data frame ########
##########################################

# Drop the marker rows and reorder the columns to ID, token, lemma, tag.
taggedResults <- taggedResults[-where_ID, c(4, 1, 3, 2)]
colnames(taggedResults) <- c("ID", "parola", "lemma", "tipo")
taggedResults$lemma <- as.character(taggedResults$lemma)
taggedResults$tipo <- as.character(taggedResults$tipo)
taggedResults$ID <- as.factor(taggedResults$ID)


#####################################################
#### resolving the hashtags and @s problem ##########
#####################################################

# Choose the lemma to use for one row of the tagged table.
#
# Args:
#   x: one row as a character vector; x[2] is the original token and x[3] the
#      lemma returned by the tagger.
#
# Returns:
#   The lower-cased token when the tagger lemma is "<unknown>" and the token
#   starts with "#" (hashtags are kept as they are); the lower-cased tagger
#   lemma otherwise. A missing lemma or token never raises an error: the
#   tagger lemma (possibly NA) is returned.
lemma <- function(x) {
  word <- x[2]
  tagged_lemma <- x[3]
  # Scalar, NA-safe test: `if (a & b)` fails with "missing value where
  # TRUE/FALSE needed" as soon as one of the two fields is NA.
  is_unknown_hashtag <- !is.na(tagged_lemma) && tagged_lemma == "<unknown>" &&
    !is.na(word) && substr(word, 1, 1) == "#"
  lemma_new <- if (is_unknown_hashtag) word else tagged_lemma
  tolower(lemma_new)
}
# Replace the tagger lemma with the one chosen by lemma(), then drop the
# original token column (column 2, "parola").
lemma_new <- apply(X = taggedResults, FUN = lemma, MARGIN = 1)
taggedResults$lemma <- lemma_new
taggedResults <- taggedResults[, -2]


######################################################################################
################## keep only the rows that are not null and not unknown ##############
######################################################################################

# A row is discarded when its lemma is missing, empty, one character long,
# "<unknown>" or a stopword. Missing lemmas are tested explicitly: the tagger
# output is read with na.strings = "", so empty lemmas arrive as NA, and an NA
# in the mask would otherwise add all-NA rows to the table.
discard <- (is.na(taggedResults$lemma) |
  taggedResults$lemma == "" |
  nchar(taggedResults$lemma) == 1 |
  taggedResults$lemma == "<unknown>" |
  (taggedResults$lemma %in% stopwords))
taggedResults <- taggedResults[!discard, ]
rm(discard, ID_length, temp, where_ID, lemma_new)
# Collapse every verb sub-tag ("VER...") into the plain "VER".
taggedResults$tipo[(substr(taggedResults$tipo, 1, 3) == "VER")] <- "VER"

#####################################################################################
################## SENTIMENT STEP ##################################################
#####################################################################################

# Every word starts with polarity 0 and "not matched".
taggedResults$SENT <- 0
taggedResults$MATCH <- 0
# Row positions of the lemmas that appear in the dictionary.
sent_pos <- which(taggedResults$lemma %in% polarity$word)

# Look one word up in the polarity dictionary.
#
# Args:
#   word:       the word to search for.
#   dictionary: table with the words in column 1 and their polarity in
#               column 2. Defaults to the global `polarity`.
#
# Returns:
#   list(pol, match): `pol` is the polarity stored for the word and `match`
#   its absolute value; both are 0 when the word is not in the dictionary.
SENT_DICT <- function(word, dictionary = polarity) {
  hits <- which(dictionary[, 1] == word)
  if (length(hits) == 0) {
    return(list(pol = 0, match = 0))
  }
  score <- dictionary[hits, 2]
  list(pol = score, match = abs(score))
}

# Score the lemmas found in the dictionary. The lookup is run once and both
# rows of its result are reused (row 1 = polarity, row 2 = match flag). It is
# skipped when nothing matched, because indexing the empty sapply() result
# with [1, ] would stop the script.
if (length(sent_pos) > 0) {
  dict_hits <- sapply(taggedResults$lemma[sent_pos], SENT_DICT)
  taggedResults$SENT[sent_pos] <- unlist(dict_hits[1, ])
  taggedResults$MATCH[sent_pos] <- unlist(dict_hits[2, ])
  rm(dict_hits)
}


# Per-message totals: summed polarity and summed match flags, one row per ID.
sentiment <- by(data = taggedResults$SENT, INDICES = as.character(taggedResults$ID), FUN = sum)
matched_words <- by(data = taggedResults$MATCH, INDICES = as.character(taggedResults$ID), FUN = sum)
ID_sentiment <- names(sentiment)
sentiment <- data.frame(as.numeric(sentiment), as.numeric(matched_words))
sentiment$ID <- ID_sentiment
colnames(sentiment) <- c("sentiment", "matched_words", "ID")

##################################################
#### return to the original IDs without hashes ###
##################################################
# Put back the original IDs (without the "#####" prefix).
data$ID <- ID

#####################################################################
################# merge the polarities with the data ################
#####################################################################
# Joined on the shared column, ID; only IDs present in both tables survive.
data <- merge(data, sentiment)

#########################################################
############### set the null polarities to NA ###########
#########################################################

# A message without any dictionary word has no polarity rather than polarity 0.
data$sentiment[data$matched_words %in% 0] <- NA
table(data$sentiment)
table(is.na(data$sentiment))


### other instructions and checks
# Drop columns 4 and 5 (SENT and MATCH), which are no longer needed.
taggedResults <- taggedResults[, -(4:5)]
# Positions of the tagged IDs that did not make it into `data`.
which(!(unique(taggedResults$ID) %in% data$ID))
rm(ID_sentiment, sent_pos, tagCorrect, sentiment, polarity, matched_words, stopwords)


##################################################################################
############# remove everything that is not ADJ and NOM (clustering step) ########
##################################################################################

# Only nouns and adjectives take part in the clustering step.
tagCorrect <- c("NOM", "ADJ")
keep <- which(substr(as.character(taggedResults$tipo), 1, 3) %in% tagCorrect)
taggedResults <- taggedResults[keep, ]
rm(keep, tagCorrect)


##################################################################################
############ save the data frame as input for the PERL script ####################
##################################################################################
# Tab-separated dump of the tagged words (ID, lemma, tag) without header or
# row names; "taggedData.load" is the output file name.
write.table(taggedResults,row.names=F,col.names=F,"taggedData.load",sep="\t")


####################################################################################
########### PERL script: create a matrix that is compatible with CLUTO #############
####################################################################################

# NB: these are UNIX commands run through the R function "system()"

# Feed the dump to the PERL script on standard input.
# NOTE(review): the script itself is not visible here; it presumably writes
# the mio.* files handled below - confirm.
system("perl /pico/home/userinternal/msartori/Documents/CORSO_MILANO/CODE/PERL/info2cluto.pl < taggedData.load")   
# First row followed by the matrix body gives the matrix file passed to CLUTO.
system("cat mio.firstrow mio.mat > cluto_matrix.mat")
# Rename the remaining outputs: wordNames.txt is passed to CLUTO as the column
# label file, IDs.txt is read back when the clusters are merged with the data.
system("mv mio.cname wordNames.txt")
system("mv mio.tran IDs.txt")
# Remove the intermediate files, including the tagger output kept earlier.
system("rm mio.firstrow mio.mat treeTagger_output.txt")


####################################################################
################### CLUTO algorithm (CLUSTERING step) ##############
####################################################################

# Run CLUTO's vcluster on cluto_matrix.mat.
#
# Args:
#   j: number of clusters; it is also used in the name of the file that
#      receives the program output (output_<j>.txt).
#
# Returns:
#   `j`, after printing it.
CLUTO <- function(j) {
  comando <- paste0(
    "vcluster -showfeatures -zscores -clabelfile=wordNames.txt cluto_matrix.mat ",
    j,
    "  > output_",
    j,
    ".txt"
  )
  system(comando)
  print(j)
}

# Number of clusters used for this run.
J <- 10
CLUTO(J)


####################################################################################
####################### merge clusters, data and general infos ###################
####################################################################################

# Never switch to scientific notation when numbers are turned into text.
options(scipen = 999)
# Add motivo and isUtente back to the messages (joined on ID).
data <- merge(data, general_infos)
rm(general_infos)
# CLUTO output for J clusters; only its first two columns are used
# (cluster and similarity).
CLUSTERS <- read.table(paste0("cluto_matrix.mat.clustering.", J))
CLUSTERS <- CLUSTERS[, 1:2]

# The IDs are read as text. With the default type conversion a numeric-looking
# ID goes through a double, which drops leading zeros and cannot hold long IDs
# exactly, so the merge on the character IDs of `data` could miss rows.
ID_CLUSTERS <- read.table("IDs.txt", colClasses = "character")
clusters <- data.frame(ID_CLUSTERS, CLUSTERS, stringsAsFactors = FALSE)
colnames(clusters) <- c("ID", "cluster", "similarity")
clusters$ID <- as.character(clusters$ID)
rm(ID_CLUSTERS, CLUSTERS)
# Inner join: only the messages that received a cluster are kept.
data <- merge(data, clusters, all = FALSE)
# Keep ID, MESSAGE, sentiment, isUtente, cluster and similarity (by position).
data <- data[, c(1, 2, 3, 6, 7, 8)]
data$cluster[which(is.na(data$cluster))] <- -1
data <- data[order(data$cluster, decreasing = FALSE), ]
rm(clusters, Uni, taggedResults, ID)

############################################################
############ remove the NA rows and unclustered obs ########
############################################################
# Drop the rows without a message, if there are any.
if (any(is.na(data$MESSAGE))) {
  data <- data[!is.na(data$MESSAGE), ]
}
# Drop the observations with a negative cluster number, if there are any.
if (any(data$cluster < 0, na.rm = TRUE)) {
  data <- data[-which(data$cluster < 0), ]
}

##########################################
##### descriptive statistics #############
##########################################

colnames(data) <- c("ID", "tweet", "polarity", "Profilo_Istituzionale", "cluster", "similarity")
# Mean polarity (missing values ignored) by cluster, by institutional profile
# and by both. The first summary groups by cluster once: listing the same
# factor twice only produced a cluster x cluster table whose off-diagonal
# cells are empty.
by(data$polarity, list(data$cluster), function(x) mean(x, na.rm = TRUE))
by(data$polarity, list(data$Profilo_Istituzionale), function(x) mean(x, na.rm = TRUE))
by(data$polarity, list(data$cluster, data$Profilo_Istituzionale), function(x) mean(x, na.rm = TRUE))

################################################################
###################### save output #############################
################################################################

# Turn every column into text, restore the column names and write the result
# with write.csv2 (no row names); the temporary copy is removed afterwards.
data2 <- apply(data, 2, as.character)
colnames(data2) <- c("ID", "tweet", "polarity", "Profilo_Istituzionale", "cluster", "similarity")
write.csv2(data2, file = "textMining_example.csv", row.names = FALSE)
rm(data2)