Set up some shared stuff
library('microbenchmark')
library('ggplot2')
set.seed(23525) # make run more repeatable
nCol <- 10
timeSeq <- seq.int(100,2000,100)
Function to populate a row of data.
mkRow <- function(nCol) {
x <- as.list(rnorm(nCol))
x[[1]] <- ifelse(x[[1]]>0,'pos','neg')
names(x) <- paste('x',seq_len(nCol),sep='.')
x
}
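As a quick aside (not part of the timings), we can look at one generated row to see its shape: a named list whose first entry is a character label ('pos' or 'neg') and whose remaining entries are numeric.
# aside: inspect a single generated row (values depend on the random seed)
str(mkRow(4))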
The common wrong way to accumulate the rows of data into a single data frame. Each call to rbind(d,di) copies every row accumulated so far, so building n rows costs on the order of n^2 row copies, which is where the quadratic runtime comes from.
mkFrameForLoop <- function(nRow,nCol) {
d <- c()
for(i in seq_len(nRow)) {
ri <- mkRow(nCol)
di <- data.frame(ri,
stringsAsFactors=FALSE)
d <- rbind(d,di)
}
d
}
# build function to plot timings
# devtools::install_github("WinVector/WVPlots")
# library('WVPlots')
plotTimings <- function(timings) {
timings$expr <- reorder(timings$expr,-timings$time,FUN=max)
nmax <- max(timings$nRow)
tsub <- timings[timings$nRow==nmax,]
tsub$expr <- reorder(tsub$expr,tsub$time,FUN=median)
list(
ggplot(data=timings,aes(x=nRow,y=time,color=expr)) +
geom_point(alpha=0.8) + geom_smooth(alpha=0.8),
ggplot(data=timings,aes(x=nRow,y=time,color=expr)) +
geom_point(alpha=0.8) + geom_smooth(alpha=0.8) +
scale_y_log10(),
WVPlots::ScatterBoxPlot(tsub,'expr','time',
title=paste('nRow = ',nmax)) +
coord_flip()
)
}
Timings showing the quadratic runtime of the rbind-in-a-loop method.
timings <- vector("list", length(timeSeq))
for(i in seq_len(length(timeSeq))) {
nRow <- timeSeq[[i]]
ti <- microbenchmark(
mkFrameForLoop(nRow,nCol),
times=10)
ti <- data.frame(ti,
stringsAsFactors=FALSE)
ti$nRow <- nRow
ti$nCol <- nCol
timings[[i]] <- ti
}
timings <- data.table::rbindlist(timings)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]
A few roughly equivalent right ways to accumulate the rows.
# Simplest fix: collect the results in a pre-allocated list
# and fill it in place. Exploits the fact that R can mutate
# objects in place when only one reference to them is visible.
mkFrameForList <- function(nRow,nCol,verbose=FALSE) {
d <- as.list(seq_len(nRow)) # pre-alloc destination
for(i in seq_len(nRow)) {
ri <- mkRow(nCol)
di <- data.frame(ri,
stringsAsFactors=FALSE)
d[[i]] <- di
if(verbose) {
print(pryr::address(d))
}
}
do.call(rbind,d)
}
# Cleanest fix: wrap the per-row procedure in a function
# and use lapply.
mkFrameList <- function(nRow,nCol) {
d <- lapply(seq_len(nRow),function(i) {
ri <- mkRow(nCol)
data.frame(ri,
stringsAsFactors=FALSE)
})
do.call(rbind,d)
}
Confirm the value is getting altered in place (efficiency also depends on the interior columns not changing address, which is the case here as well).
mkFrameForList(10,5,TRUE)
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## x.1 x.2 x.3 x.4 x.5
## 1 neg -0.60301455 0.198773364 0.05982714 -1.2601829
## 2 pos 1.96341648 -0.564996823 0.37151297 -0.3083645
## 3 neg 0.57626331 0.028149072 -0.19453441 -3.0256527
## 4 pos 0.28755502 0.007095078 -0.05992568 -0.5024812
## 5 neg -0.16336357 -1.561558013 -0.15738111 -0.6285802
## 6 neg -1.37783842 0.830355860 -0.79339171 0.4223554
## 7 pos -0.52289654 1.663399105 -0.90919184 -1.3405057
## 8 pos -1.40676558 -1.422215092 0.37772800 -1.5121601
## 9 neg 0.14575393 -0.573764946 1.17134733 -0.4860252
## 10 pos 0.03247208 -0.756778828 -1.11823951 0.3115500
Get more timings and plots.
timingsPrev <- timings
timings <- vector("list", length(timeSeq))
for(i in seq_len(length(timeSeq))) {
nRow <- timeSeq[[i]]
ti <- microbenchmark(
mkFrameForList(nRow,nCol),
mkFrameList(nRow,nCol),
times=10)
ti <- data.frame(ti,
stringsAsFactors=FALSE)
ti$nRow <- nRow
ti$nCol <- nCol
timings[[i]] <- ti
}
timings <- data.table::rbindlist(timings)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]
timings <- rbind(timings,timingsPrev)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]
Show the in-place alteration of objects in a simpler setting.
computeSquares <- function(n,
messUpVisibility,
usePRYR=FALSE) {
# pre-allocate v
# (doesn't actually help!)
v <- 1:n
if(messUpVisibility) {
vLast <- v
}
# print details of v
.Internal(inspect(v))
if(usePRYR) {
print(pryr::address(v))
}
for(i in 1:n) {
v[[i]] <- i^2
if(messUpVisibility) {
vLast <- v
}
# print details of v
.Internal(inspect(v))
if(usePRYR) {
print(pryr::address(v))
}
}
v
}
Show how, if the value referred to by v is visible only through the single variable name “v”, R will perform in-place replacement (making the calculation much cheaper). In the inspect output below the address changes only once, when the integer vector is converted to a numeric vector on the first assignment, and stays fixed afterwards (with NAM(1)).
computeSquares(5,FALSE)
## @7f9e3d95dba8 13 INTSXP g0c3 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## @7f9e3e53ef48 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## @7f9e3e53ef48 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,3,4,5
## @7f9e3e53ef48 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,4,5
## @7f9e3e53ef48 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,5
## @7f9e3e53ef48 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,25
## [1] 1 4 9 16 25
Show how, if the value referred to by v is visible through more than one variable, R will not perform in-place replacement (making the calculation much more expensive). Each inspect line below shows NAM(2) and a new address: every assignment forces a full copy of the vector.
computeSquares(5,TRUE)
## @7f9e3c6c4280 13 INTSXP g0c3 [NAM(2)] (len=5, tl=0) 1,2,3,4,5
## @7f9e3d8b3088 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,2,3,4,5
## @7f9e3d8b6808 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,3,4,5
## @7f9e3d8b69a8 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,4,5
## @7f9e3d8b6a78 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,16,5
## @7f9e3d8b6c18 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,16,25
## [1] 1 4 9 16 25
Check that we can use pryr::address() for the addresses in this case (it reports the same locations as inspect).
computeSquares(5,FALSE,usePRYR=TRUE)
## @7f9e3d8aba48 13 INTSXP g0c3 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## [1] "0x7f9e3d8aba48"
## @7f9e407d18f0 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## [1] "0x7f9e407d18f0"
## @7f9e407d18f0 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,3,4,5
## [1] "0x7f9e407d18f0"
## @7f9e407d18f0 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,4,5
## [1] "0x7f9e407d18f0"
## @7f9e407d18f0 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,5
## [1] "0x7f9e407d18f0"
## @7f9e407d18f0 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,25
## [1] "0x7f9e407d18f0"
## [1] 1 4 9 16 25
Show the slowdown of the incremental (rbind in a loop) method relative to the others as a function of the number of rows.
timingsPrev <- timings
timings$isIncremental <- timings$expr=='mkFrameForLoop(nRow, nCol)'
agg <- aggregate(time~isIncremental+nRow,data=timings,FUN=median)
dInc <- agg[agg$isIncremental,]
dInc <- dInc[order(dInc$nRow),]
dRest <- agg[!agg$isIncremental,]
dRest <- dRest[order(dRest$nRow),]
dInc$slowDown <- dInc$time/dRest$time
ggplot(data=dInc,aes(x=nRow,y=slowDown)) +
geom_point() + geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
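As a side note, the ratio above relies on dInc and dRest holding exactly the same nRow values in the same order (which the order() calls guarantee here). A slightly more defensive sketch joins the two aggregates on nRow explicitly:
aggInc <- aggregate(time~nRow,data=timings[timings$isIncremental,],FUN=median)
aggRest <- aggregate(time~nRow,data=timings[!timings$isIncremental,],FUN=median)
dJoin <- merge(aggInc,aggRest,by='nRow',suffixes=c('Inc','Rest'))
dJoin$slowDown <- dJoin$timeInc/dJoin$timeRest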
# From Arun Srinivasan's comment
# http://www.win-vector.com/blog/2015/07/efficient-accumulation-in-r/comment-page-1/#comment-65994
# library('data.table')
mkFrameDataTableFor <- function(nRow, nCol) {
v = vector("list", nRow)
for (i in seq_len(nRow)) {
v[[i]] = mkRow(nCol)
}
data.table::rbindlist(v)
}
mkFrameDataTableLapply <- function(nRow, nCol) {
v <- lapply(seq_len(nRow),function(i) {
mkRow(nCol)
})
data.table::rbindlist(v)
}
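Both of these lean on the fact that data.table::rbindlist() accepts a list of named lists directly, treating each inner list as one row, so no per-row data.frame construction is needed. A tiny illustration (an aside):
data.table::rbindlist(list(
  list(x.1='pos',x.2=0.5),
  list(x.1='neg',x.2=-1.2)))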
# Mucking with environments fix. Environments
# are mutable and tend to be faster than lists.
mkFrameEnvDataTable <- function(nRow,nCol) {
e <- new.env(hash=TRUE,parent=emptyenv())
for(i in seq_len(nRow)) {
ri <- mkRow(nCol)
assign(as.character(i),ri,envir=e)
}
data.table::rbindlist(as.list(e))
}
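A small sketch of why the environment trick avoids copies (an aside, assuming the pryr package is available): an environment is modified by reference, so adding entries never triggers a copy of the environment itself and its address stays fixed.
e <- new.env(hash=TRUE,parent=emptyenv())
print(pryr::address(e))
assign('r1',mkRow(5),envir=e)
print(pryr::address(e)) # same address: the environment was mutated in place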
# Another possibility, working in place.
mkFrameInPlace <- function(nRow,nCol,classHack=TRUE) {
r1 <- mkRow(nCol)
d <- data.frame(r1,
stringsAsFactors=FALSE)
if(nRow>1) {
d <- d[rep.int(1,nRow),]
if(classHack) {
# lose data.frame class for a while
# changes what S3 methods implement
# assignment.
d <- as.list(d)
}
for(i in seq.int(2,nRow,1)) {
ri <- mkRow(nCol)
for(j in seq_len(nCol)) {
d[[j]][[i]] <- ri[[j]]
}
}
}
if(classHack) {
d <- data.frame(d,stringsAsFactors=FALSE)
}
d
}
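A rough illustration of what the classHack buys (a sketch, not part of the main timings): element assignment into a data.frame dispatches the comparatively expensive [[<-.data.frame method on every iteration, while the same assignment into a bare list uses a cheap primitive.
assignIntoDataFrame <- function(n) {
  d <- data.frame(x=numeric(n))
  for(i in seq_len(n)) {
    d[[1]][[i]] <- i
  }
  d
}
assignIntoList <- function(n) {
  d <- list(x=numeric(n))
  for(i in seq_len(n)) {
    d[[1]][[i]] <- i
  }
  data.frame(d,stringsAsFactors=FALSE)
}
microbenchmark(assignIntoDataFrame(1000),assignIntoList(1000),times=10)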
Still more timings.
timings <- vector("list", length(timeSeq))
for(i in seq_len(length(timeSeq))) {
nRow <- timeSeq[[i]]
ti <- microbenchmark(
mkFrameInPlace(nRow,nCol),
mkFrameDataTableFor(nRow,nCol),
times=10)
ti <- data.frame(ti,
stringsAsFactors=FALSE)
ti$nRow <- nRow
ti$nCol <- nCol
timings[[i]] <- ti
}
timings <- data.table::rbindlist(timings)
timings <- rbind(timings,timingsPrev)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]
# library('dplyr')
# Fast method from https://twitter.com/drob/status/626146450842497028
mkFrameDplyrBind <- function(nRow,nCol) {
dplyr::bind_rows(replicate(nRow,
data.frame(mkRow(nCol),
stringsAsFactors=FALSE),
simplify=FALSE))
}
# library('plyr')
# Idiomatic plyr https://www.reddit.com/r/rstats/comments/3ex31f/efficient_accumulation_in_r/ctjl9lq
mkFramePlyrLdply <- function(nRow,nCol) {
plyr::ldply(seq_len(nRow),
function(i) data.frame(mkRow(nCol),
stringsAsFactors=FALSE))
}
# try to avoid all storage re-allocation, name checking, and type checking;
# assumes every row has the same column types as the first row
# (true by construction of mkRow)
mkFrameMat <- function(nRow,nCol) {
r1 <- mkRow(nCol)
isNum <- vapply(r1,is.numeric,logical(1))
numIndices <- seq_len(nCol)[isNum]
strIndices <- seq_len(nCol)[!isNum]
dN <- matrix(data=0.0,
nrow=nRow,ncol=length(numIndices))
dC <- matrix(data="",
nrow=nRow,ncol=length(strIndices))
dN[1,] <- as.numeric(r1[numIndices])
dC[1,] <- as.character(r1[strIndices])
if(nRow>1) {
for(i in seq.int(2,nRow,1)) {
ri <- mkRow(nCol)
dN[i,] <- as.numeric(ri[numIndices])
dC[i,] <- as.character(ri[strIndices])
}
}
dN <- data.frame(dN,stringsAsFactors=FALSE)
names(dN) <- names(r1)[numIndices]
dC <- data.frame(dC,stringsAsFactors=FALSE)
names(dC) <- names(r1)[strIndices]
cbind(dC,dN)
}
# allocate the vectors independently
# seems to be avoiding some overhead by not having data.frame class
# (close to current mkFrameInPlace, but faster)
# Adapted from: https://gist.github.com/jimhester/e725e1ad50a5a62f3dee#file-accumulation-r-L43-L57
mkFrameVector <- function(nRow,nCol) {
r1 <- mkRow(nCol)
res <- lapply(r1,function(cv) {
if(is.numeric(cv)) {
col = numeric(nRow)
} else {
col = character(nRow)
}
col[[1]] = cv
col
})
if(nRow>1) {
for(i in seq.int(2,nRow,1)) {
ri <- mkRow(nCol)
for(j in seq_len(nCol)) {
res[[j]][[i]] <- ri[[j]]
}
}
}
data.frame(res,stringsAsFactors=FALSE)
}
# Efficient method that doesn't need to know how many rows are coming
# devtools::install_github("WinVector/daccum")
# library('daccum')
mkFrameDaccum <- function(nRow,nCol) {
collector <- daccum::mkColletor()
for(i in seq_len(nRow)) {
ri <- mkRow(nCol)
daccum::collectRows(collector,ri)
}
daccum::unwrapRows(collector)
}
timingsPrev <- timings
timings <- vector("list", length(timeSeq))
for(i in seq_len(length(timeSeq))) {
nRow <- timeSeq[[i]]
ti <- microbenchmark(
mkFrameDplyrBind(nRow,nCol),
mkFramePlyrLdply(nRow,nCol),
mkFrameMat(nRow,nCol),
mkFrameVector(nRow,nCol),
mkFrameDaccum(nRow,nCol),
mkFrameEnvDataTable(nRow,nCol),
mkFrameDataTableLapply(nRow,nCol),
times=10)
ti <- data.frame(ti,
stringsAsFactors=FALSE)
ti$nRow <- nRow
ti$nCol <- nCol
timings[[i]] <- ti
}
timings <- data.table::rbindlist(timings)
timings <- rbind(timings,timingsPrev)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]