Set up some shared stuff
library('microbenchmark')
library('ggplot2')
set.seed(23525) # make run more repeatable
nCol <- 10
timeSeq <- seq.int(100,2000,100)
Function to populate a row of data.
mkRow <- function(nCol) {
x <- as.list(rnorm(nCol))
x[[1]] <- ifelse(x[[1]]>0,'pos','neg')
names(x) <- paste('x',seq_len(nCol),sep='.')
x
}
The common wrong-way to accumulate the rows of data into a single data frame.
mkFrameForLoop <- function(nRow,nCol) {
d <- c()
for(i in seq_len(nRow)) {
ri <- mkRow(nCol)
di <- data.frame(ri,
stringsAsFactors=FALSE)
d <- rbind(d,di)
}
d
}
# build function to plot timings
# devtools::install_github("WinVector/WVPlots")
# library('WVPlots')
plotTimings <- function(timings) {
timings$expr <- reorder(timings$expr,-timings$time,FUN=max)
ggplot(data=timings,aes(x=nRow,y=time,color=expr)) +
geom_point(alpha=0.8) + geom_smooth(alpha=0.8)
nmax <- max(timings$nRow)
tsub <- timings[timings$nRow==nmax,]
tsub$expr <- reorder(tsub$expr,tsub$time,FUN=median)
list(
ggplot(data=timings,aes(x=nRow,y=time,color=expr)) +
geom_point(alpha=0.8) + geom_smooth(alpha=0.8),
ggplot(data=timings,aes(x=nRow,y=time,color=expr)) +
geom_point(alpha=0.8) + geom_smooth(alpha=0.8) +
scale_y_log10(),
WVPlots::ScatterBoxPlot(tsub,'expr','time',
title=paste('nRow = ',nmax)) +
coord_flip()
)
}
Timing showing the quadratic runtime.
timings <- vector("list", length(timeSeq))
for(i in seq_len(length(timeSeq))) {
nRow <- timeSeq[[i]]
ti <- microbenchmark(
mkFrameForLoop(nRow,nCol),
times=10)
ti <- data.frame(ti,
stringsAsFactors=FALSE)
ti$nRow <- nRow
ti$nCol <- nCol
timings[[i]] <- ti
}
timings <- data.table::rbindlist(timings)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[2]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
##
## [[3]]
A few roughly equivilent right ways to accumlate the rows.
# Simplest fix, collect the data in a list and
# grow the list. Exploits the fact that R can mutate
# common objects when object visibility is limited.
mkFrameForList <- function(nRow,nCol,verbose=FALSE) {
d <- as.list(seq_len(nRow)) # pre-alloc destination
for(i in seq_len(nRow)) {
ri <- mkRow(nCol)
di <- data.frame(ri,
stringsAsFactors=FALSE)
d[[i]] <- di
if(verbose) {
print(pryr::address(d))
}
}
do.call(rbind,d)
}
# Cleanest fix- wrap procedure in a function and
# use lapply.
mkFrameList <- function(nRow,nCol) {
d <- lapply(seq_len(nRow),function(i) {
ri <- mkRow(nCol)
data.frame(ri,
stringsAsFactors=FALSE)
})
do.call(rbind,d)
}
Confirm value getting altered in place (effiency depends on interior columns also not chaning address, which is also the case).
mkFrameForList(10,5,TRUE)
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## [1] "0x7f9e406c61f0"
## x.1 x.2 x.3 x.4 x.5
## 1 neg -0.60301455 0.198773364 0.05982714 -1.2601829
## 2 pos 1.96341648 -0.564996823 0.37151297 -0.3083645
## 3 neg 0.57626331 0.028149072 -0.19453441 -3.0256527
## 4 pos 0.28755502 0.007095078 -0.05992568 -0.5024812
## 5 neg -0.16336357 -1.561558013 -0.15738111 -0.6285802
## 6 neg -1.37783842 0.830355860 -0.79339171 0.4223554
## 7 pos -0.52289654 1.663399105 -0.90919184 -1.3405057
## 8 pos -1.40676558 -1.422215092 0.37772800 -1.5121601
## 9 neg 0.14575393 -0.573764946 1.17134733 -0.4860252
## 10 pos 0.03247208 -0.756778828 -1.11823951 0.3115500
Get more timings and plots.
timingsPrev <- timings
timings <- vector("list", length(timeSeq))
for(i in seq_len(length(timeSeq))) {
nRow <- timeSeq[[i]]
ti <- microbenchmark(
mkFrameForList(nRow,nCol),
mkFrameList(nRow,nCol),
times=10)
ti <- data.frame(ti,
stringsAsFactors=FALSE)
ti$nRow <- nRow
ti$nCol <- nCol
timings[[i]] <- ti
}
timings <- data.table::rbindlist(timings)
print(plotTimings(timings))
## [[1]]
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.