# PART 1: function calls and messages

## Comments

In [1]:
# This is a comment
x <- "This is a string assignement" # This is a comment, also

## Function calls

In [2]:
# This is a function call
print("Hello World!")

[1] "Hello World!"


In [3]:
# this is an error, brackets are required
print "Hello World"

ERROR: Error in parse(text = x, srcfile = src): <text>:2:7: unexpected string constant
1: # this is an error, brackets are required
2: print "Hello World"
         ^


In [4]:
# R is case sensitive
Print("") # Print function does not exist

ERROR: Error in eval(expr, envir, enclos): could not find function "Print"


In [5]:
# this is the function
print

In [6]:
# This shows the documentation for the "print" function
?print

In [7]:
# Try to print multiple values
print("Hello", "World")

“NAs introduced by coercion”

ERROR: Error in print.default("Hello", "World"): invalid 'digits' argument


### Print messages

In [8]:
# We have to print one string at a time ...
print("Hello")
print("World")

[1] "Hello"
[1] "World"


In [9]:
# ... or merge into a single string
print( paste("Hello", "World", "!") )
print( paste0("Hello ", "World", "!") )

[1] "Hello World !"
[1] "Hello World!"


In [10]:
print("Every string literal in R 
can be a multiline string") # Print with escaping

[1] "Every string literal in R \ncan be a multiline string"


In [11]:
cat("Every string literal in R 
can be a multiline string") # Print without escaping

Every string literal in R 
can be a multiline string

In [12]:
# R has two string delimiter characters " and '
cat("this is a string literal with 'quotation' insde ")
cat("\n") # newline
cat('this is a string literal with "double quotation" insde ')

this is a string literal with 'quotation' insde 
this is a string literal with "double quotation" insde 

#### <span style="color:red">Back to presentation</span>

# PART 2: Variables

In [13]:
# R has variables, of course. Both `=` and `<-` assign a value at the top level;
# `<-` is the conventional operator in R code.
greet = "Hello" # assignment using `=`
who <- "Marco" # assignment using `<-`
print( paste0(greet, " ", who, "!") )

[1] "Hello Marco!"


## R primitive data types

In [14]:
x <- 5 # Usually numbers are floating point double called 'numeric'
typeof(x)
class(x)
x

In [15]:
x <- as.numeric(5.12) # Explicitly convert a number to 'numeric' (not very useful: 5.12 is already numeric)
typeof(x)
class(x)
x

In [16]:
x <- as.integer(-3.7) # Convert a number to an integer: the decimal part is truncated,
                      # i.e. the value is rounded toward zero (-3.7 becomes -3)
typeof(x)
class(x)
x

In [17]:
x <- -1 + 0i # Complex numbers
typeof(x)
class(x)
x

In [18]:
x <- T # or F or TRUE or FALSE. 
typeof(x)
class(x)
x

In [19]:
x <- "12" # String representation of a number.
typeof(x)
class(x)
x

### A note on complex
By default, arithmetic is carried out in the real domain; if you want complex arithmetic, you have to convert your variable explicitly (for example with `as.complex()`).

In [20]:
x <- -1
sqrt( x )

“NaNs produced”

In [21]:
x <- -1
sqrt( as.complex( x )  )

### A special kind of integer: Factors

In [22]:
x1 <- as.factor( 0.000000001 )
typeof(x1)
class(x1)
x1

In [23]:
levels(x1)
typeof(levels(x1))

In [24]:
as.numeric(x1)

In [25]:
as.character(x1)

In [26]:
as.numeric(as.character(x1))

## Operators

### Arithmetic operators

In [27]:
print(paste( "4 - 3 =", 4 - 3 ))
print(paste( "4 + 3 =", 4 + 3 ))
print(paste( "4 * 3 =", 4 * 3 ))
print(paste( "4 ** 3 =", 4 ** 3 ))
print(paste( "4 ^ 3 =", 4 ^ 3 ))
print(paste( "4 / 3 =", 4 / 3 ))
print(paste( "4 %% 3 =", 4 %% 3 ))

[1] "4 - 3 = 1"
[1] "4 + 3 = 7"
[1] "4 * 3 = 12"
[1] "4 ** 3 = 64"
[1] "4 ^ 3 = 64"
[1] "4 / 3 = 1.33333333333333"
[1] "4 %% 3 = 1"


### Comparison operators

In [28]:
print(paste( "'a' < 'b' is ", 'a' < 'b' ))
print(paste( "8 > 7.4 is ", 8 > 7.4 ))
print(paste( "5 < 7.4 is ", 5 < 7.4 ))
print(paste( "5 == 5.0 is ", 5 == 5.0 ))
print(paste( "'5' != '5.0' is ", '5' != '5.0' ))
print(paste( "'a' %in% c('a', 'b') is ", 'a' %in% c('a', 'b') ))

[1] "'a' < 'b' is  TRUE"
[1] "8 > 7.4 is  TRUE"
[1] "5 < 7.4 is  TRUE"
[1] "5 == 5.0 is  TRUE"
[1] "'5' != '5.0' is  TRUE"
[1] "'a' %in% c('a', 'b') is  TRUE"


### Logical Operator

In [29]:
print(paste( "! TRUE is", ! TRUE )) # NOT
print(paste( "TRUE || TRUE is", TRUE || TRUE )) # OR
print(paste( "TRUE || FALSE is", TRUE || FALSE )) # OR
print(paste( "TRUE && TRUE is", TRUE && TRUE )) # AND
print(paste( "TRUE && FALSE is", TRUE && FALSE )) # AND

[1] "! TRUE is FALSE"
[1] "TRUE || TRUE is TRUE"
[1] "TRUE || FALSE is TRUE"
[1] "TRUE && TRUE is TRUE"
[1] "TRUE && FALSE is FALSE"


## R objects have attributes

In [30]:
attributes( as.factor( 0.000000001 ) )

## Some tricks about implicit conversions

In [31]:
num <- 0.01
str <- "0.01"

print("Num")
print(num)
print("Character")
print(str)

print( paste( "(num == str) is ", num == str ) ) # TRUE: automatic conversion from number to string representation

[1] "Num"
[1] 0.01
[1] "Character"
[1] "0.01"
[1] "(num == str) is  TRUE"


In [32]:
num <- 0.0000001
str <- "0.0000001"

print("Num")
print(num)
print("Character")
print(str)

print( paste( "(num == str) is ", num == str ) ) # FALSE: automatic conversion from number to string 
                                                 #        is not always a good idea

[1] "Num"
[1] 1e-07
[1] "Character"
[1] "0.0000001"
[1] "(num == str) is  FALSE"


In [33]:
print( as.numeric( 0.0000001 ) == as.numeric( "0.0000001" ) ) # - Explicit conversion is safer

# - The same example, step by step
a <- 0.0000001
b <- "0.0000001"
print( paste("Compare numbers: ", as.numeric( a ) == as.numeric( b ) ) )
print( paste("Compare string representations: ", as.character( a ) ==  as.character( b ) ) )


[1] TRUE
[1] "Compare numbers:  TRUE"
[1] "Compare string representations:  FALSE"


## Constructor vs. Conversion

In [34]:
# Until now we have seen conversions such as
x <- as.character(5)
print(x)

[1] "5"


In [35]:
x <- character(5) # that create a vector of 5 empty strings
x

In [36]:
list( 
    integer(1),
    complex(2),
    character(3),
    list(4,5)
)

#### <span style="color:red">Back to presentation</span>

# PART 3 : one dimensional data structures and subsetting
**R scalar values are a special case of vectors**

## Vectors

In [37]:
x <- "Hello"
length(x) # Why 1 ?

In [38]:
x <- c("Hello", "Bye", "Hi", "Welcome")
length(x) # .. because the base type of R is a vector object. An ordered set (with repetition), 
          #    of element with same class
x

In [39]:
print( x ) # Print the whole vector

[1] "Hello"   "Bye"     "Hi"      "Welcome"


In [40]:
1:3 # this is the range operator

0:11 # generate integer vector incrementing by one
class(0:11)

0.1:11.5 # it works also for numeric
class(0.1:11.5)

In [41]:
# Vector concatenation
a <- c("Blue", "Gray")
b <- c(a, "White")
b

In [42]:
# Element by element operations
c(1,2,3) / c(1,2,3)

In [43]:
# Array and "scalar" operations
c(1,2,3) + 10

In [44]:
# element by element logical operations
! c(T, F) # NOT
c(T, T, F, F) | c(T, F, T, F) # OR
c(T, T, F, F) & c(T, F, T, F) # AND
xor(c(T, T, F, F) , c(T, F, T, F))

### Which are TRUE ?

In [45]:
which( ! c(T, T, F, T, F, T) )

## Access to a single element  [[ ]]

In [46]:
# the '[[ integer ]]' notation gives access to one single value of a vector
print( x[[1]] )
print( x[[2]] )
print( x[[3]] )
print( x[[4]] )

[1] "Hello"
[1] "Bye"
[1] "Hi"
[1] "Welcome"


In [47]:
print( x[[0]] ) # Vector are 1 based. 

ERROR: Error in x[[0]]: attempt to select less than one element in get1index <real>


## Subsetting [ ]

In [48]:
indexes <- c(1:3)
x2 <- x[ indexes ]
print( x2 ) # - Select and print a subvector

[1] "Hello" "Bye"   "Hi"   


In [49]:
indexes <- c(1,3, 2,4, 1,3, 2,4, 1,3, 2,4, 1,3, 2,4, 1,3, 2,4)
x2 <- x[ indexes ] # Generate a vector from another reusing elements
print( x2 ) 

 [1] "Hello"   "Hi"      "Bye"     "Welcome" "Hello"   "Hi"      "Bye"    
 [8] "Welcome" "Hello"   "Hi"      "Bye"     "Welcome" "Hello"   "Hi"     
[15] "Bye"     "Welcome" "Hello"   "Hi"      "Bye"     "Welcome"


In [50]:
x2 == "Hi" # element by element comparison

In [51]:
# Vector filtering
x2[ x2 == "Hi" ]

# List: vector with elements of different types

In [52]:
l <- list(1.0, "one", as.integer(1), 1+0i)
class(l)
l

In [53]:
class( l[[1]] )  # the class of an element could be different from the class of a list

In [54]:
class( l[[2]] )  # the classes of two elements could be different

In [55]:
class( l[1] )  # this create a list like ...

In [56]:
class( l[1:3] ) # ... this.

### Named List

In [57]:
x <- list( Name = c("Andrea", "Giovanni"), Surname = c("Rossi", "Bianchi"), Age = c(34, 27) )
print("Names")
names(x)
print("Values")
x

[1] "Names"


[1] "Values"


In [58]:
# Subset by name
class(x['Age'])
x['Age']

In [59]:
# Element access by name
class(x[['Age']])
x[['Age']]

# Element access by name
class(x$Age)
x$Age

#### Change names

In [60]:
x <- list( Name = c("Andrea", "Giovanni"), Surname = c("Rossi", "Bianchi"), Age = c(34, 27, 789) )

# change names
names(x) # print
names(x) <- c("a", "b", "c") # change
names(x) # print new names

names(x)[ names(x) == 'a'] <- c("A") # change only one
names(x) # print new names

#### <span style="color:red">Back to presentation</span>

# PART 4: Missing Values

## Strange values

In [61]:
x <- 5 / 0 # - Infinite
typeof(x)
class(x)
x

In [62]:
x <- sqrt(-1) # - Not a Number: typical result of sqrt(-1) in real domain
typeof(x)
class(x)
x

“NaNs produced”

In [63]:
x <- NA  # - The information exist but it is unknown
typeof(x)
class(x)
x

[1] NA

In [64]:
NA + 5 # - Operations with NA give NA

[1] NA

In [65]:
x <- NULL  # - The information do not exist (not very R construct)
typeof(x)
class(x)
x

NULL

In [66]:
# NULL is absence of information
print("A")
a <- c(NA, 4)
a
is.null( a )
length( a )

print("B")
b <- c(NULL, 4)
b
is.null( b )
length( b )

print("C")
c <- c(NA)
c
is.null( c )
length( c )

print("D")
d <- c(NULL)
d
is.null( d )
length( d )

print("E")
e <- NULL
e
is.null( e )
length( e )

[1] "A"


[1] "B"


[1] "C"


[1] NA

[1] "D"


NULL

[1] "E"


NULL

In [67]:
# Special value comparison
is.na(list("", 0, NA, NaN, Inf))
is.nan(as.numeric(c("", 0, NA, NaN, Inf)))
as.numeric(c("", 0, NA, NaN, Inf)) == Inf

#### <span style="color:red">Back to presentation</span>

# PART 5: two dimensional data structures

## Matrices

In [68]:
# Matrices are two-dimensional vectors (not lists: all elements have the same type)
m <- matrix(1:6, nrow = 2, ncol = 3)
typeof(m)
class(m)
m # Matrices are constructed column-wise

0,1,2
1,3,5
2,4,6


In [69]:
# Matrices are two-dimensional vectors (not lists: all elements have the same type)
m <- matrix(as.numeric(1:6), nrow = 2, ncol = 3)
typeof(m)
class(m)
m # Matrices are constructed column-wise

0,1,2
1,3,5
2,4,6


In [70]:
dim(m) 
nrow(m) # Quantity of rows
ncol(m) # Quantity of columns

In [71]:
# Column Bind function
m1 <- cbind( c(1,2,3), c(4,5,6))
m1

0,1
1,4
2,5
3,6


In [72]:
# Row Bind function
m2 <- rbind( c(1,2,3), c(4,5,6))
m2

0,1,2
1,2,3
4,5,6


In [73]:
m2 * m2 # product element by element

0,1,2
1,4,9
16,25,36


In [74]:
m + 10 # sum elements and scalar

0,1,2
11,13,15
12,14,16


In [75]:
# Matrix element access
class( m[[1,2]] )
m[[1,2]]

In [76]:
# Sub Matrix 
sub.m <- m[c(1,2), c(2,3)]
class( sub.m )
sub.m

0,1
3,5
4,6


In [77]:
# Sub Matrix automatic conversion to vector
sub.m <- m[c(1:2), c(3)]
class( sub.m )
sub.m

# Data Frames

In [78]:
df = data.frame( Name = c("Andrea", "Giovanni", "Luca"), Surname = c("Rossi", "Bianchi", "Verdi"), Age = c(34, 27, NA)  )
df

Name,Surname,Age
Andrea,Rossi,34.0
Giovanni,Bianchi,27.0
Luca,Verdi,


In [79]:
names(df)
colnames(df)
rownames(df)

In [80]:
rownames(df) <- paste( df$Surname, df$Name)
rownames(df)

In [81]:
df

Unnamed: 0,Name,Surname,Age
Rossi Andrea,Andrea,Rossi,34.0
Bianchi Giovanni,Giovanni,Bianchi,27.0
Verdi Luca,Luca,Verdi,


In [82]:
# Add a row with factors
df <- rbind(df, list(Name = 'Mario', Surname = 'Ferrari', Age = 42 ))
df

“invalid factor level, NA generated”

Unnamed: 0,Name,Surname,Age
Rossi Andrea,Andrea,Rossi,34.0
Bianchi Giovanni,Giovanni,Bianchi,27.0
Verdi Luca,Luca,Verdi,
4,,,42.0


In [83]:
# Avoid factors in dataframes
df = data.frame( 
    Name = c("Andrea", "Giovanni", "Luca"), 
    Surname = c("Rossi", "Bianchi", "Verdi"), 
    Age = c(34, 27, NA), 
    stringsAsFactors = F  
)
rownames(df) <- paste( df$Surname, df$Name)
df <- rbind(df, list(Name = 'Mario', Surname = 'Ferrari', Age = 42 ))
df

Unnamed: 0,Name,Surname,Age
Rossi Andrea,Andrea,Rossi,34.0
Bianchi Giovanni,Giovanni,Bianchi,27.0
Verdi Luca,Luca,Verdi,
4,Mario,Ferrari,42.0


In [84]:
# Add columns
df$BirthdYear = 2017 - df$Age
df

Unnamed: 0,Name,Surname,Age,BirthdYear
Rossi Andrea,Andrea,Rossi,34.0,1983.0
Bianchi Giovanni,Giovanni,Bianchi,27.0,1990.0
Verdi Luca,Luca,Verdi,,
4,Mario,Ferrari,42.0,1975.0


In [85]:
# Subsetting on rows
df[ df$Age < 40 ,] # Problems with NA

Unnamed: 0,Name,Surname,Age,BirthdYear
Rossi Andrea,Andrea,Rossi,34.0,1983.0
Bianchi Giovanni,Giovanni,Bianchi,27.0,1990.0
,,,,


In [86]:
df$Age < 40

In [87]:
# Subsetting on rows
df[ !is.na(df$Age) & df$Age < 40 ,] # Avoid NA issue

Unnamed: 0,Name,Surname,Age,BirthdYear
Rossi Andrea,Andrea,Rossi,34,1983
Bianchi Giovanni,Giovanni,Bianchi,27,1990


In [88]:
# Subsetting on columns
df[  , c("Name", "Age")]

Unnamed: 0,Name,Age
Rossi Andrea,Andrea,34.0
Bianchi Giovanni,Giovanni,27.0
Verdi Luca,Luca,
4,Mario,42.0


In [89]:
# Subsetting on both
df[ df$Age < 40 , c("Name", "Age") ]

Unnamed: 0,Name,Age
Rossi Andrea,Andrea,34.0
Bianchi Giovanni,Giovanni,27.0
,,


In [90]:
# Aggregated values on column
sum(df$Age)
prod(df$Age, na.rm = TRUE)
mean(df$Age, na.rm = TRUE)
sd(df$Age, na.rm = TRUE) # Standard deviation

[1] NA

### Merge two data frame

In [91]:
df1 = data.frame( Name = c("Mario", "Giovanni"), Surname = c("Rossi", "Bianchi"), Company = c("company 2", "company 1") )
df2 = data.frame( Name = c("Mario", "Giovanni"), Surname = c("Rossi", "Bianchi"), JobTitle = c("job title 2", "job title 1") )

merge(df1, df2)

Name,Surname,Company,JobTitle
Giovanni,Bianchi,company 1,job title 1
Mario,Rossi,company 2,job title 2


In [92]:
?merge # For more info

### Dataframe: Long and wide format

In [93]:
# install a new package
#install.packages("reshape", lib="/opt/conda/lib/R/library", repo="http://cran.us.r-project.org")
library(reshape) # use the installed library

ERROR: Error in library(reshape): there is no package called ‘reshape’


In [94]:
long_format_df <- rbind( 
        list( Day = "2017-04-20", Variable = "temperature", value = 10),
        list( Day = "2017-04-20", Variable = "umidity", value = 70),
        list( Day = "2017-04-21", Variable = "temperature", value = 15),
        list( Day = "2017-04-21", Variable = "umidity", value = 75),
        list( Day = "2017-04-22", Variable = "umidity", value = 80)
    )

class(long_format_df)
long_format_df <- as.data.frame(long_format_df)
class(long_format_df)
long_format_df

Day,Variable,value
2017-04-20,temperature,10
2017-04-20,umidity,70
2017-04-21,temperature,15
2017-04-21,umidity,75
2017-04-22,umidity,80


In [95]:
wide_format_df = cast(long_format_df, Day~Variable, mean) # - transform from long to wide format
wide_format_df

ERROR: Error in eval(expr, envir, enclos): could not find function "cast"


In [96]:
class(long_format_df$Day)

In [97]:
long_format_df$Day = as.character(long_format_df$Day)
class(long_format_df$Day)

long_format_df$Variable = as.character(long_format_df$Variable)
long_format_df$value = as.numeric(long_format_df$value)


In [98]:
wide_format_df = cast(long_format_df, Day~Variable, mean) # - transform from long to wide format
wide_format_df

ERROR: Error in eval(expr, envir, enclos): could not find function "cast"


In [99]:
back_to_wide_format =  melt(wide_format_df, id = c("Day")) # - transform from wide to long format
back_to_wide_format

ERROR: Error in eval(expr, envir, enclos): could not find function "melt"


In [100]:
# - A slightly more accurate transformation

# Aggregation helper: yields NA when there are no values to aggregate,
# otherwise the arithmetic mean of x.
na_if_no_value_mean_otherwise <- function(x) {
    # The if/else is an expression; its value is the function result.
    if (length(x) == 0) NA else mean(x)
}

wide_format_df = cast(long_format_df, Day~Variable, na_if_no_value_mean_otherwise)
wide_format_df

ERROR: Error in eval(expr, envir, enclos): could not find function "cast"


#### <span style="color:red">Back to presentation</span>

# PART 6: Control flow

## Function declaration 

In [None]:
# Demonstrates parameter defaults: prints (and invisibly returns) a vector
# holding the three parameters.
#   parameter1                              required, no default
#   parameter_with_default                  defaults to "default value"
#   third_parameter_with_second_as_default  defaults to the value of the
#                                           second parameter
my_function <- function(parameter1,
                        parameter_with_default = "default value",
                        third_parameter_with_second_as_default = parameter_with_default) {
    print(c(parameter1, parameter_with_default, third_parameter_with_second_as_default))
}

In [None]:
my_function()

In [None]:
my_function("Value")

In [None]:
my_function("Value", "Override second parameter default")

In [None]:
my_function("Value", "Override second parameter default", "Override third param")

In [None]:
my_function("Value", third_parameter_with_second_as_default = "Override third param and leave default in second")

## Conditional branching

In [None]:
# Demonstrates conditional branching by printing three facts about x:
#   1. whether 5 or 6 occurs in x (5 takes precedence when both occur),
#   2. whether the sum of x exceeds 20,
#   3. a notice when x has no elements.
test_if <- function(x) {
    # if / else if / else used as an expression: exactly one message is selected.
    presence_message <- if (5 %in% x) {
        "Five is present into x"
    } else if (6 %in% x) {
        "Six is present into x"
    } else {
        "Neither 5 nor 6 are present into x"
    }
    print(presence_message)

    total_message <- if (sum(x) > 20) "More than 20 units" else "20 units or less"
    print(total_message)

    # An `if` without `else`: nothing is printed when the condition is FALSE.
    if (length(x) == 0) {
        print("x is empty")
    }
}

In [None]:
test_if( c(5,6) )

In [None]:
test_if( c(6,19) )

In [None]:
test_if( c() )

## Loops

In [None]:
# Print the square of i, doubling i each time, for as long as the square
# stays below 100.
i <- 1
while (i^2 < 100) {
    print(i^2)
    i <- 2 * i
}

### Loop a *known number* of times

In [None]:
for( element in c("Blue", "Gray", "Green")) {
    print(element)
}

## Function result
The result of a function is the value of the last expression evaluated

In [None]:
# Return NA for an empty input, otherwise the mean of x.
# There is no explicit return(): the value of the if/else, being the last
# expression evaluated, is the result of the function.
na_if_no_value_mean_otherwise = function(x) {
    if( length(x) == 0) {
        NA
    }
    else {
        mean(x)
    }
    # Uncommenting the next line would make print() the last expression, so the
    # function would return "Exit" (invisibly) instead of NA / the mean.
    #print("Exit")
}

In [None]:
na_if_no_value_mean_otherwise( c() )

In [None]:
na_if_no_value_mean_otherwise( c(8,10) )

In [None]:
# A possibly safer variant: the value is kept in a local variable, so extra
# statements (such as the logging below) cannot accidentally become the result.
na_if_no_value_mean_otherwise <- function(x) {
    result <- if (length(x) == 0) NA else mean(x)
    print(paste("Result", result))
    result # the last evaluated expression is what the function returns
}
na_if_no_value_mean_otherwise( c(8,10) )

In [None]:
# The "return" function evaluates the expression given as its argument and stops
# the execution of the current function.
# It is not the idiomatic form in R; explicit "return" is more at home in
# languages like Java or C.
na_if_no_value_mean_otherwise = function(x) {
    if( length(x) == 0) {
        result = NA
    }
    else {
        result = mean(x)
    }
    return(result)
    # Never reached: return() above has already left the function.
    print(paste("Result", result))
}
na_if_no_value_mean_otherwise( c(8,10) )

#### <span style="color:red">Back to presentation</span>

# PART 7: Load and Save data

## CSV

In [None]:
irisdata_df <- read.csv(url("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"), header = FALSE)
names(irisdata_df) <- c("sepal_length", "sepal_width", "petal_length", "petal_width", "type")
head(irisdata_df)

In [None]:
dir.create("data", showWarnings = F)
write.table(irisdata_df, file = "data/iris_with_header.csv", 
            append = FALSE, 
            quote = TRUE, sep = ",", eol = "\n", na = "NA", dec = ".", 
            row.names = FALSE, col.names = TRUE, 
            qmethod = c("escape", "double"),
            fileEncoding = "UTF-8"
        )

## JSON

In [None]:
library(jsonlite)

In [None]:
data_from_json <- fromJSON(url("http://api.tvmaze.com/singlesearch/shows?q=big-bang-theory&embed=episodes"))
episodes_df <- data_from_json[['_embedded']]$episodes
episodes_df[ , c("id","name", "season", "number", "airdate", "airtime", "url" )]

# R

In [None]:
dir.create("data", showWarnings = F)
dput(episodes_df, "data/data_from_r.Rdata")

In [None]:
reloaded_episodes_df = dget("data/data_from_r.Rdata")
reloaded_episodes_df[ , c("id","name", "season", "number", "airdate", "airtime", "url" )]

#### <span style="color:red">Back to presentation</span>

# PART 8: Functional loops and multicore

## Functional style for loops

In [None]:
?apply

In [None]:
lapply( list("klaus","martin","georg"), toupper)

In [None]:
lapply( c("klaus","martin","georg"), toupper)

In [None]:
df = data.frame( Name = c("Andrea", "Giovanni", "Luca"), Surname = c("Rossi", "Bianchi", "Verdi"), Age = c(34, 27, NA)  )

In [None]:
# For each column of a dataframe: display its class and its values, and
# collect its length as that column's entry in the result list.
result <- lapply(df, function(x) {
    print("========")
    # A bare `class(x)` inside a function body is evaluated and discarded,
    # so the class has to be printed explicitly to be displayed.
    print(class(x))
    print(x)
    print("========")
    length(x) # last expression: the value lapply stores for this column
})
print("# The result of lapply")
result

In [None]:
# For each row of a dataframe with automatic conversion to character
result = apply( df, 1, function(one_row_df) {
    paste(one_row_df['Name'], one_row_df['Surname'], class(one_row_df['Age']))
})
result

In [None]:
# For each row of a dataframe with global variable
result = lapply( seq( nrow(df) ), function(idx) {
    one_row_df = df[ idx, ]
    paste(one_row_df$Name, one_row_df$Surname, class(one_row_df$Age))
})
result

In [None]:
# For each row of a dataframe without global variable
result = lapply( split(df, rownames(df)), function(one_row_df) {
    paste(one_row_df$Name, one_row_df$Surname, class(one_row_df$Age))
})
result

## mclapply: linux multicore lapply

In [None]:
library(parallel)
?mclapply

In [None]:
Sys.time()
result = mclapply( c(1:3), function(i) { Sys.sleep(5); sqrt(i)  }, mc.cores = 3)
Sys.time()
result

## Foreach: multicore and multinode parallelization for windows and linux

In [3]:
# Install doParallel into the given library directory, then load it.
# The argument is spelled `repos` in full: the abbreviated `repo` only works
# through R's partial argument matching.
install.packages("doParallel", lib="/opt/conda/lib/R/library", repos="http://cran.us.r-project.org")
library(doParallel)


Updating HTML index of packages in '.Library'
Making 'packages.html' ... done
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel


In [None]:
# Iterate on a vector
Sys.time()
cl <- makeCluster(3)
registerDoParallel(cl)

result = foreach(i=1:3) %dopar% { Sys.sleep(5); sqrt(i) }

stopCluster(cl)
Sys.time()
result

In [None]:
# Iterate on a list
Sys.time()
cl <- makeCluster(3)
registerDoParallel(cl)

result = foreach(i= list( c(1,2,3), c("Hi", "Bye"), NA) ) %dopar% { Sys.sleep(5); length(i) }

stopCluster(cl)
Sys.time()
result


In [4]:
df = data.frame( Name = c("Andrea", "Giovanni", "Luca"), Surname = c("Rossi", "Bianchi", "Verdi"), Age = c(34, 27, NA)  )

In [None]:
# for each column
Sys.time()
cl <- makeCluster(3)
registerDoParallel(cl)

result = foreach(i= df ) %dopar% { Sys.sleep(5); class(i) }

stopCluster(cl)
Sys.time()
result

In [5]:
# for each row with global variable
Sys.time()
cl <- makeCluster(3)
registerDoParallel(cl)

result = foreach(i= seq( nrow(df)) ) %dopar% { 
        Sys.sleep(5); 
         one_row_df = df[ i, ]
        paste(one_row_df$Name, one_row_df$Surname, class(one_row_df$Age))
    }

stopCluster(cl)
Sys.time()
result

[1] "2017-04-27 08:19:05 UTC"

[1] "2017-04-27 08:19:11 UTC"

In [7]:
# for each row without global variable
Sys.time()
cl <- makeCluster(3)
registerDoParallel(cl)

result = foreach( one_row_df = split(df, rownames(df) ) ) %dopar% { 
        Sys.sleep(5); 
        paste(one_row_df$Name, one_row_df$Surname, class(one_row_df$Age))
    }

stopCluster(cl)
Sys.time()
result

[1] "2017-04-27 08:20:24 UTC"

[1] "2017-04-27 08:20:29 UTC"

#### <span style="color:red">Back to presentation</span>

# MORE about dataframe manipulation

### Dataframe sample

In [None]:
# Compute the per-species mean of every iris measurement, first as a list of
# one-row data frames (one per species) and then as a single data frame.
data(iris)

# split() yields one sub data frame per species; each one is reduced to a
# single row holding the column means and the species label.
iris_mean <- lapply(split(iris, iris$Species), function(species_df) {
    data.frame(
        Sepal.Length = mean(species_df$Sepal.Length),
        Sepal.Width = mean(species_df$Sepal.Width),
        Petal.Length = mean(species_df$Petal.Length),
        Petal.Width = mean(species_df$Petal.Width),
        Species = unique(species_df$Species)
    )
})
print("Iris Mean values")
iris_mean

# Stack the one-row data frames into a single data frame.
iris_mean_df <- do.call("rbind", iris_mean) # try to change to rbindlist
print("Iris Mean values as DataFrame")
iris_mean_df

In [None]:
aggregate(
        iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")], 
        by = list(iris$Species), 
        FUN = mean, 
        na.rm=TRUE
    )

# Some useful R references

### Reference Card
A concise reference covering a lot of commands
https://cran.r-project.org/doc/contrib/Short-refcard.pdf


### CRAN Views
A lot of packages, listed by topic
https://cran.r-project.org/web/views/


### Other Refs
https://www.r-bloggers.com/

https://cran.r-project.org/manuals.html

The Docker container this material is based on: https://hub.docker.com/r/jupyter/all-spark-notebook/


In [None]:
# Compute the max time distance between two consecutive Big Bang Theory episodes
# and write the episode titles

data_from_json <- fromJSON(url("http://api.tvmaze.com/singlesearch/shows?q=big-bang-theory&embed=episodes"))
data_from_json