1
MapReduce & R
"MapReduce allows us to stop thinking about fault tolerance." (Cathy O'Neil & Rachel Schutt, 2013)
2
R & Hadoop: Compute squares
3
R
# create a vector of 10 integers
ints <- 1:10   # equivalent to ints <- c(1,2,3,4,5,6,7,8,9,10)
# compute the squares
result <- sapply(ints, function(x) x^2)
result
 [1]   1   4   9  16  25  36  49  64  81 100
4
Key-value mapping

Input       Map        Reduce   Output
(null,1)    (1,1)
(null,2)    (2,4)
…
(null,10)   (10,100)

(No reduce step: the map output is written directly as the job output.)
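A base-R sketch of the same mapping, with no Hadoop involved (the input and pairs names below are illustrative, not part of rmr2):

# base-R sketch of the key-value mapping above (illustrative names, no Hadoop)
input <- 1:10                                         # values; keys are null
pairs <- lapply(input, function(v) list(key = v, value = v^2))
pairs[[10]]                                           # list(key = 10, value = 100)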
5
MapReduce (no reduce)
require(rmr2)
rmr.options(backend = "local")  # local or hadoop
# load a list of 10 integers into HDFS
hdfs.ints = to.dfs(1:10)
# mapper for the key-value pairs to compute squares
mapper <- function(k,v) {
  key <- v
  value <- key^2
  keyval(key,value)
}
# run MapReduce
out = mapreduce(input = hdfs.ints, map = mapper)
# convert to a data frame
df1 = as.data.frame(from.dfs(out))
colnames(df1) = c('n', 'n^2')
# display the results
df1
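A quick sanity check, assuming df1 from the block above is in the workspace: the MapReduce result should match the plain sapply() version from the earlier slide.

# sanity check (assumes df1 from the block above)
all.equal(sort(df1$`n^2`), sapply(1:10, function(x) x^2))   # TRUE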
6
Exercise: use the map component of mapreduce() to create the cubes of the integers from 1 to 25.
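One possible solution sketch, reusing the pattern from the squares example (object names here are illustrative):

# possible solution sketch (illustrative names)
require(rmr2)
rmr.options(backend = "local")
hdfs.ints <- to.dfs(1:25)
cube.mapper <- function(k, v) keyval(v, v^3)   # emit (n, n^3)
out <- mapreduce(input = hdfs.ints, map = cube.mapper)
df.cubes <- as.data.frame(from.dfs(out))
colnames(df.cubes) <- c('n', 'n^3')
df.cubes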
7
R & Hadoop: Tabulation
8
R
library(readr)
url <- "
t <- read_delim(url, delim=',')
# convert to Celsius and round temperature to an integer
t$temperature = round((t$temperature-32)*5/9,0)
# tabulate frequencies
table(t$temperature)
9
Key-value mapping

Input         Map       Reduce                    Output
(null, F)     (C, 1)    (C, vector of 1s)         (C, length(v))
(null,35.1)   (2,1)     (-7,c(1))                 (-7,1)
(null,37.5)   (3,1)     (-6,c(1))                 (-6,1)
…
(null,43.3)   (6,1)     (27,c(1,1,1,1,1,1,1,1))   (27,8)
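The reduce step just counts the grouped 1s per Celsius key; a base-R sketch of that grouping, using made-up Fahrenheit readings rather than the course data:

# base-R sketch of the shuffle/reduce grouping (made-up readings, not the course data)
f <- c(35.1, 37.5, 43.3, 35.1)          # hypothetical Fahrenheit readings
keys <- round((f - 32) * 5/9, 0)        # map: Celsius keys
ones <- rep(1, length(keys))            # map: emit a 1 per reading
tapply(ones, keys, length)              # reduce: count the 1s grouped by key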
10
MapReduce (1)
library(rmr2)
library(readr)
library(sqldf)
options(sqldf.driver='SQLite')
rmr.options(backend = "local")  # local or hadoop
url <- "
t <- read_delim(url, delim=',')
# save temperature in hdfs file
hdfs.temp <- to.dfs(t$temperature)
# mapper for conversion to C
mapper <- function(k,v) {
  key <- round((v-32)*5/9,0)
  value <- 1
  keyval(key,value)
}
11
MapReduce (2)
# reducer to count frequencies
reducer <- function(k,v) {
  key <- k
  value = length(v)
  keyval(key,value)
}
out = mapreduce(input = hdfs.temp, map = mapper, reduce = reducer)
df2 = as.data.frame(from.dfs(out))
colnames(df2) = c('temperature', 'count')
df3 <- sqldf('SELECT * FROM df2 ORDER BY count DESC;')
print(df3, row.names = FALSE)  # no row names
12
R & Hadoop: Basic statistics
13
R
# Basic stats
library(reshape)
library(sqldf)
options(sqldf.driver='SQLite')
url <- "
t <- read.table(url, header=T, sep=',')
stats <- sqldf('SELECT year, max(temperature) as max, round(avg(temperature),1) as mean, min(temperature) as min from t GROUP BY year;')
head(stats)
14
Key-value mapping

Input            Map                   Reduce                           Output
(null, record)   (year, temperature)   (year, vector of temperatures)   (year, max) (year, mean) (year, min)
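A base-R sketch of the shuffle stage, grouping temperatures into a vector per year (made-up values, not the course file):

# base-R sketch of the shuffle stage (made-up values, not the course file)
year <- c(2000, 2000, 2001, 2001)            # hypothetical years
temperature <- c(41, 52, 38, 70)             # hypothetical temperatures
byYear <- split(temperature, year)           # (year, vector of temperatures)
sapply(byYear, max)                          # reduce: one statistic per year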
15
MapReduce (1)
library(rmr2)
library(reshape)
library(readr)
rmr.options(backend = "local")  # local or hadoop
url <- "
t <- read_delim(url, delim=',')
# convert to hdfs file
hdfs.temp <- to.dfs(data.frame(t))
# mapper for computing temperature measures for each year
mapper <- function(k,v) {
  key <- v$year
  value <- v$temperature
  keyval(key,value)
}
16
MapReduce (2)
# reducer to report stats
reducer <- function(k,v) {
  key <- k  # year
  # v is the vector of values for a year
  value <- c(max(v,na.rm=T), round(mean(v,na.rm=T),1), min(v,na.rm=T))
  keyval(key,value)
}
out = mapreduce(input = hdfs.temp, map = mapper, reduce = reducer)
df3 = as.data.frame(from.dfs(out))
df3$measure <- c('max','mean','min')
# reshape with year, max, mean, min in one row
stats2 <- cast(df3, key ~ measure, value="val")
head(stats2)
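A sketch of the reshape step, offered as an illustration of what cast() is doing: from.dfs() returns long (key, val) rows, three per year, and cast() pivots them into one row per year (the values below are made up):

# sketch of the reshape step (made-up values)
library(reshape)
long <- data.frame(key = c(2000, 2000, 2000),
                   val = c(63, 52.0, 41),        # made-up max, mean, min
                   measure = c('max', 'mean', 'min'))
cast(long, key ~ measure, value = "val")         # one row: key, max, mean, min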
17
R & Hadoop: Word counting
18
R
library(stringr)
# read as a single character string
t <- readChar(" nchars=1e6)
t1 <- tolower(t[[1]])  # convert to lower case
t2 <- str_replace_all(t1, "[[:punct:]]", "")  # get rid of punctuation
wordList <- str_split(t2, "\\s")  # split into words
wordVector <- unlist(wordList)  # convert list to vector
table(wordVector)
19
Key-value mapping

Input          Map          Reduce           Output
(null, text)   (word,1) …   (word, vector)   (word, length(vector))
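The reduce step reproduces what table() does: group the 1s emitted per word and take each group's length. A base-R sketch with a toy character vector (not the course text):

# base-R sketch of the reduce stage (toy example, not the course text)
wordVector <- c("to", "be", "or", "not", "to", "be")
lengths(split(rep(1, length(wordVector)), wordVector))   # same counts as table(wordVector)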
20
MapReduce (1)
library(rmr2)
library(stringr)
rmr.options(backend = "local")  # local or hadoop
# read as a single character string
url <- "
t <- readChar(url, nchars=1e6)
text.hdfs <- to.dfs(t)
mapper = function(k,v) {
  t1 <- tolower(v)  # convert to lower case
  t2 <- str_replace_all(t1, "[[:punct:]]", "")  # get rid of punctuation
  wordList <- str_split(t2, "\\s")  # split into words
  wordVector <- unlist(wordList)  # convert list to vector
  keyval(wordVector, 1)
}
21
MapReduce (2)
reducer = function(k,v) {
  keyval(k, length(v))
}
out <- mapreduce(input = text.hdfs, map = mapper, reduce = reducer, combine = T)
# convert output to a data frame
df1 = as.data.frame(from.dfs(out))
colnames(df1) = c('word', 'count')
# display the results
print(df1, row.names = FALSE)  # no row names
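A caution, offered as an assumption about combine = T rather than something the slides state: when the reduce function also runs as a combiner, it can receive partial counts instead of raw 1s, so summing is the safer aggregation (sum(v) equals length(v) on raw 1s and stays correct for partial counts).

# alternative reducer (assumption: with combine = T the reducer may see partial counts)
reducer <- function(k, v) {
  keyval(k, sum(v))   # correct for raw 1s and for partial counts
}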
22
MapReduce & R
MapReduce is changing the way we process data.
rmr2 is powerful: difficult to install, but quite flexible.