Set up some shared stuff
library('microbenchmark')
library('ggplot2')
set.seed(23525) # make run more repeatable
nCol <- 10
timeSeq <- seq.int(100,2000,100)
Function to populate a row of data.
mkRow <- function(nCol) {
x <- as.list(rnorm(nCol))
x[[1]] <- ifelse(x[[1]]>0,'pos','neg')
names(x) <- paste('x',seq_len(nCol),sep='.')
x
}
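As a quick aside (not part of the timings), we can look at one generated row to see its shape: a named list whose first entry is a character label ('pos' or 'neg') and whose remaining entries are numeric.
# aside: inspect a single generated row (values depend on the random seed)
str(mkRow(4))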
The common wrong way to accumulate the rows of data into a single data frame. Each call to rbind(d,di) copies every row accumulated so far, so building n rows costs on the order of n^2 row copies, which is where the quadratic runtime comes from.
mkFrameForLoop <- function(nRow,nCol) {
d <- c()
for(i in seq_len(nRow)) {
ri <- mkRow(nCol)
di <- data.frame(ri,
stringsAsFactors=FALSE)
d <- rbind(d,di)
}
d
}
# build function to plot timings
# devtools::install_github("WinVector/WVPlots")
# library('WVPlots')
plotTimings <- function(timings) {
timings$expr <- reorder(timings$expr,-timings$time,FUN=max)
nmax <- max(timings$nRow)
tsub <- timings[timings$nRow==nmax,]
tsub$expr <- reorder(tsub$expr,tsub$time,FUN=median)
list(
ggplot(data=timings,aes(x=nRow,y=time,color=expr)) +
geom_point(alpha=0.8) + geom_smooth(alpha=0.8),
ggplot(data=timings,aes(x=nRow,y=time,color=expr)) +
geom_point(alpha=0.8) + geom_smooth(alpha=0.8) +
scale_y_log10(),
WVPlots::ScatterBoxPlot(tsub,'expr','time',
title=paste('nRow = ',nmax)) +
coord_flip()
)
}
Timings showing the quadratic runtime of the rbind-in-a-loop method.
timings <- vector("list", length(timeSeq))
for(i in seq_len(length(timeSeq))) {
nRow <- timeSeq[[i]]
ti <- microbenchmark(
mkFrameForLoop(nRow,nCol),
times=10)
ti <- data.frame(ti,
stringsAsFactors=FALSE)
ti$nRow <- nRow
ti$nCol <- nCol
timings[[i]] <- ti
}
timings <- data.table::rbindlist(timings)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]
A few roughly equivalent right ways to accumulate the rows.
# Simplest fix: collect the results in a pre-allocated list
# and fill it in place. Exploits the fact that R can mutate
# objects in place when only one reference to them is visible.
mkFrameForList <- function(nRow,nCol,verbose=FALSE) {
d <- as.list(seq_len(nRow)) # pre-alloc destination
for(i in seq_len(nRow)) {
ri <- mkRow(nCol)
di <- data.frame(ri,
stringsAsFactors=FALSE)
d[[i]] <- di
if(verbose) {
print(pryr::address(d))
}
}
do.call(rbind,d)
}
# Cleanest fix: wrap the per-row procedure in a function
# and use lapply.
mkFrameList <- function(nRow,nCol) {
d <- lapply(seq_len(nRow),function(i) {
ri <- mkRow(nCol)
data.frame(ri,
stringsAsFactors=FALSE)
})
do.call(rbind,d)
}
Confirm the value is getting altered in place (efficiency also depends on the interior columns not changing address, which is the case here as well).
mkFrameForList(10,5,TRUE)
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## x.1 x.2 x.3 x.4 x.5
## 1 neg -0.60301455 0.198773364 0.05982714 -1.2601829
## 2 pos 1.96341648 -0.564996823 0.37151297 -0.3083645
## 3 neg 0.57626331 0.028149072 -0.19453441 -3.0256527
## 4 pos 0.28755502 0.007095078 -0.05992568 -0.5024812
## 5 neg -0.16336357 -1.561558013 -0.15738111 -0.6285802
## 6 neg -1.37783842 0.830355860 -0.79339171 0.4223554
## 7 pos -0.52289654 1.663399105 -0.90919184 -1.3405057
## 8 pos -1.40676558 -1.422215092 0.37772800 -1.5121601
## 9 neg 0.14575393 -0.573764946 1.17134733 -0.4860252
## 10 pos 0.03247208 -0.756778828 -1.11823951 0.3115500
Get more timings and plots.
timingsPrev <- timings
timings <- vector("list", length(timeSeq))
for(i in seq_len(length(timeSeq))) {
nRow <- timeSeq[[i]]
ti <- microbenchmark(
mkFrameForList(nRow,nCol),
mkFrameList(nRow,nCol),
times=10)
ti <- data.frame(ti,
stringsAsFactors=FALSE)
ti$nRow <- nRow
ti$nCol <- nCol
timings[[i]] <- ti
}
timings <- data.table::rbindlist(timings)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]
timings <- rbind(timings,timingsPrev)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]
Show the in-place alteration of objects in a simpler setting.
computeSquares <- function(n,
messUpVisibility,
usePRYR=FALSE) {
# pre-allocate v
# (doesn't actually help!)
v <- 1:n
if(messUpVisibility) {
vLast <- v
}
# print details of v
.Internal(inspect(v))
if(usePRYR) {
print(pryr::address(v))
}
for(i in 1:n) {
v[[i]] <- i^2
if(messUpVisibility) {
vLast <- v
}
# print details of v
.Internal(inspect(v))
if(usePRYR) {
print(pryr::address(v))
}
}
v
}
Show how, if the value referred to by v is visible only through the single variable name “v”, R will perform in-place replacement (making the calculation much cheaper). In the inspect output below the address changes only once, when the integer vector is converted to a numeric vector on the first assignment, and stays fixed afterwards (with NAM(1)).
computeSquares(5,FALSE)
## @7f9e3d95dba8 13 INTSXP g0c3 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## @7f9e3e53ef48 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## @7f9e3e53ef48 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,3,4,5
## @7f9e3e53ef48 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,4,5
## @7f9e3e53ef48 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,5
## @7f9e3e53ef48 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,25
## [1] 1 4 9 16 25
Show how, if the value referred to by v is visible through more than one variable, R will not perform in-place replacement (making the calculation much more expensive). Each inspect line below shows NAM(2) and a new address: every assignment forces a full copy of the vector.
computeSquares(5,TRUE)
## @7f9e3c6c4280 13 INTSXP g0c3 [NAM(2)] (len=5, tl=0) 1,2,3,4,5
## @7f9e3d8b3088 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,2,3,4,5
## @7f9e3d8b6808 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,3,4,5
## @7f9e3d8b69a8 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,4,5
## @7f9e3d8b6a78 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,16,5
## @7f9e3d8b6c18 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,16,25
## [1] 1 4 9 16 25
Check that we can use pryr::address() for the addresses in this case (it reports the same locations as inspect).
computeSquares(5,FALSE,usePRYR=TRUE)
## @7f9e3d8aba48 13 INTSXP g0c3 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## [1] "0x7f9e3d8aba48"
## @7f9e407d18f0 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## [1] "0x7f9e407d18f0"
## @7f9e407d18f0 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,3,4,5
## [1] "0x7f9e407d18f0"
## @7f9e407d18f0 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,4,5
## [1] "0x7f9e407d18f0"
## @7f9e407d18f0 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,5
## [1] "0x7f9e407d18f0"
## @7f9e407d18f0 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,25
## [1] "0x7f9e407d18f0"
## [1] 1 4 9 16 25
Show the slowdown of the incremental (rbind in a loop) method relative to the others as a function of the number of rows.
timingsPrev <- timings
timings$isIncremental <- timings$expr=='mkFrameForLoop(nRow, nCol)'
agg <- aggregate(time~isIncremental+nRow,data=timings,FUN=median)
dInc <- agg[agg$isIncremental,]
dInc <- dInc[order(dInc$nRow),]
dRest <- agg[!agg$isIncremental,]
dRest <- dRest[order(dRest$nRow),]
dInc$slowDown <- dInc$time/dRest$time
ggplot(data=dInc,aes(x=nRow,y=slowDown)) +
geom_point() + geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
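As a side note, the ratio above relies on dInc and dRest holding exactly the same nRow values in the same order (which the order() calls guarantee here). A slightly more defensive sketch joins the two aggregates on nRow explicitly:
aggInc <- aggregate(time~nRow,data=timings[timings$isIncremental,],FUN=median)
aggRest <- aggregate(time~nRow,data=timings[!timings$isIncremental,],FUN=median)
dJoin <- merge(aggInc,aggRest,by='nRow',suffixes=c('Inc','Rest'))
dJoin$slowDown <- dJoin$timeInc/dJoin$timeRest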
# From Arun Srinivasan's comment
# http://www.win-vector.com/blog/2015/07/efficient-accumulation-in-r/comment-page-1/#comment-65994
# library('data.table')
mkFrameDataTableFor <- function(nRow, nCol) {
v = vector("list", nRow)
for (i in seq_len(nRow)) {
v[[i]] = mkRow(nCol)
}
data.table::rbindlist(v)
}
mkFrameDataTableLapply <- function(nRow, nCol) {
v <- lapply(seq_len(nRow),function(i) {
mkRow(nCol)
})
data.table::rbindlist(v)
}
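Both of these lean on the fact that data.table::rbindlist() accepts a list of named lists directly, treating each inner list as one row, so no per-row data.frame construction is needed. A tiny illustration (an aside):
data.table::rbindlist(list(
  list(x.1='pos',x.2=0.5),
  list(x.1='neg',x.2=-1.2)))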
# Mucking with environments fix. Environments
# are mutable and tend to be faster than lists.
mkFrameEnvDataTable <- function(nRow,nCol) {
e <- new.env(hash=TRUE,parent=emptyenv())
for(i in seq_len(nRow)) {
ri <- mkRow(nCol)
assign(as.character(i),ri,envir=e)
}
data.table::rbindlist(as.list(e))
}
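A small sketch of why the environment trick avoids copies (an aside, assuming the pryr package is available): an environment is modified by reference, so adding entries never triggers a copy of the environment itself and its address stays fixed.
e <- new.env(hash=TRUE,parent=emptyenv())
print(pryr::address(e))
assign('r1',mkRow(5),envir=e)
print(pryr::address(e)) # same address: the environment was mutated in place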
# Another possibility, working in place.
mkFrameInPlace <- function(nRow,nCol,classHack=TRUE) {
r1 <- mkRow(nCol)
d <- data.frame(r1,
stringsAsFactors=FALSE)
if(nRow>1) {
d <- d[rep.int(1,nRow),]
if(classHack) {
# lose data.frame class for a while
# changes what S3 methods implement
# assignment.
d <- as.list(d)
}
for(i in seq.int(2,nRow,1)) {
ri <- mkRow(nCol)
for(j in seq_len(nCol)) {
d[[j]][[i]] <- ri[[j]]
}
}
}
if(classHack) {
d <- data.frame(d,stringsAsFactors=FALSE)
}
d
}
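A rough illustration of what the classHack buys (a sketch, not part of the main timings): element assignment into a data.frame dispatches the comparatively expensive [[<-.data.frame method on every iteration, while the same assignment into a bare list uses a cheap primitive.
assignIntoDataFrame <- function(n) {
  d <- data.frame(x=numeric(n))
  for(i in seq_len(n)) {
    d[[1]][[i]] <- i
  }
  d
}
assignIntoList <- function(n) {
  d <- list(x=numeric(n))
  for(i in seq_len(n)) {
    d[[1]][[i]] <- i
  }
  data.frame(d,stringsAsFactors=FALSE)
}
microbenchmark(assignIntoDataFrame(1000),assignIntoList(1000),times=10)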
Still more timings.
timings <- vector("list", length(timeSeq))
for(i in seq_len(length(timeSeq))) {
nRow <- timeSeq[[i]]
ti <- microbenchmark(
mkFrameInPlace(nRow,nCol),
mkFrameDataTableFor(nRow,nCol),
times=10)
ti <- data.frame(ti,
stringsAsFactors=FALSE)
ti$nRow <- nRow
ti$nCol <- nCol
timings[[i]] <- ti
}
timings <- data.table::rbindlist(timings)
timings <- rbind(timings,timingsPrev)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]
# library('dplyr')
# Fast method from https://twitter.com/drob/status/626146450842497028
mkFrameDplyrBind <- function(nRow,nCol) {
dplyr::bind_rows(replicate(nRow,
data.frame(mkRow(nCol),
stringsAsFactors=FALSE),
simplify=FALSE))
}
# library('plyr')
# Idiomatic plyr https://www.reddit.com/r/rstats/comments/3ex31f/efficient_accumulation_in_r/ctjl9lq
mkFramePlyrLdply <- function(nRow,nCol) {
plyr::ldply(seq_len(nRow),
function(i) data.frame(mkRow(nCol),
stringsAsFactors=FALSE))
}
# try to avoid all storage re-allocation, name checking, and type checking;
# assumes every row has the same column types as the first row
# (true by construction of mkRow)
mkFrameMat <- function(nRow,nCol) {
r1 <- mkRow(nCol)
isNum <- vapply(r1,is.numeric,logical(1))
numIndices <- seq_len(nCol)[isNum]
strIndices <- seq_len(nCol)[!isNum]
dN <- matrix(data=0.0,
nrow=nRow,ncol=length(numIndices))
dC <- matrix(data="",
nrow=nRow,ncol=length(strIndices))
dN[1,] <- as.numeric(r1[numIndices])
dC[1,] <- as.character(r1[strIndices])
if(nRow>1) {
for(i in seq.int(2,nRow,1)) {
ri <- mkRow(nCol)
dN[i,] <- as.numeric(ri[numIndices])
dC[i,] <- as.character(ri[strIndices])
}
}
dN <- data.frame(dN,stringsAsFactors=FALSE)
names(dN) <- names(r1)[numIndices]
dC <- data.frame(dC,stringsAsFactors=FALSE)
names(dC) <- names(r1)[strIndices]
cbind(dC,dN)
}
# allocate the vectors independently
# seems to be avoiding some overhead by not having data.frame class
# (close to current mkFrameInPlace, but faster)
# Adapted from: https://gist.github.com/jimhester/e725e1ad50a5a62f3dee#file-accumulation-r-L43-L57
mkFrameVector <- function(nRow,nCol) {
r1 <- mkRow(nCol)
res <- lapply(r1,function(cv) {
if(is.numeric(cv)) {
col = numeric(nRow)
} else {
col = character(nRow)
}
col[[1]] = cv
col
})
if(nRow>1) {
for(i in seq.int(2,nRow,1)) {
ri <- mkRow(nCol)
for(j in seq_len(nCol)) {
res[[j]][[i]] <- ri[[j]]
}
}
}
data.frame(res,stringsAsFactors=FALSE)
}
# Efficient method that doesn't need to know how many rows are coming
# devtools::install_github("WinVector/daccum")
# library('daccum')
mkFrameDaccum <- function(nRow,nCol) {
collector <- daccum::mkColletor()
for(i in seq_len(nRow)) {
ri <- mkRow(nCol)
daccum::collectRows(collector,ri)
}
daccum::unwrapRows(collector)
}
timingsPrev <- timings
timings <- vector("list", length(timeSeq))
for(i in seq_len(length(timeSeq))) {
nRow <- timeSeq[[i]]
ti <- microbenchmark(
mkFrameDplyrBind(nRow,nCol),
mkFramePlyrLdply(nRow,nCol),
mkFrameMat(nRow,nCol),
mkFrameVector(nRow,nCol),
mkFrameDaccum(nRow,nCol),
mkFrameEnvDataTable(nRow,nCol),
mkFrameDataTableLapply(nRow,nCol),
times=10)
ti <- data.frame(ti,
stringsAsFactors=FALSE)
ti$nRow <- nRow
ti$nCol <- nCol
timings[[i]] <- ti
}
timings <- data.table::rbindlist(timings)
timings <- rbind(timings,timingsPrev)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]