benchmark

library('microbenchmark')
library('pryr')
library('plyr')
library('ggplot2')


# standard naive "non-R" way to square items in a vector
f1 <- function() {
  v <- 1:n
  for(i in 1:n) {
    v[i] <- v[i]^2
  }
  v
}

Show address of value referred by by v is stable (indicating in-place alteration of vector, likely due to limited visibility of the value referred to).

# same naive loop with pryr:address() inspection
# doesn't trigger visibile value address changes, but very slow
f1p <- function(verbose=FALSE) {
  v <- 1:n
  addr <- pryr::address(v)
  if(verbose) {
    print(addr)
  }
  for(i in 1:n) {
    v[i] <- v[i]^2
    addr <- pryr::address(v)
    if(verbose) {
      print(addr)
    }
  }
  v
}

# same naive loop with .Internal(inspect(v))
f1b <- function(verbose=FALSE) {
  v <- 1:n
  if(verbose) {
    .Internal(inspect(v))
  }
  for(i in 1:n) {
    v[i] <- v[i]^2
    if(verbose) {
       .Internal(inspect(v))
    }
  }
  v
}

# even worse case: vLast shares reference to value referred to by v,
# causing address to change (evidence of more object copying)
f1c <- function(verbose=FALSE) {
  v <- 1:n
  vLast <- v
  if(verbose) {
    .Internal(inspect(v))
  }
  for(i in 1:n) {
    v[i] <- v[i]^2
    vLast <- v
    if(verbose) {
      .Internal(inspect(v))
    }
  }
  v
}



# address not changing
n <- 5
f1b(TRUE)

## @7fc6fe8afd70 13 INTSXP g0c3 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## @7fc6fdf4bb50 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## @7fc6fdf4bb50 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,3,4,5
## @7fc6fdf4bb50 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,4,5
## @7fc6fdf4bb50 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,5
## @7fc6fdf4bb50 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,25

## [1]  1  4  9 16 25

# address not changing
n <- 5
f1p(TRUE)

## [1] "0x7fc6fe133db0"
## [1] "0x7fc6fdf2c070"
## [1] "0x7fc6fdf2c070"
## [1] "0x7fc6fdf2c070"
## [1] "0x7fc6fdf2c070"
## [1] "0x7fc6fdf2c070"

## [1]  1  4  9 16 25

# address changing
f1c(TRUE)

## @7fc6fe15db28 13 INTSXP g0c3 [NAM(2)] (len=5, tl=0) 1,2,3,4,5
## @7fc6fdf29950 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,2,3,4,5
## @7fc6fdf29a20 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,3,4,5
## @7fc6fdf29af0 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,4,5
## @7fc6fdf25870 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,16,5
## @7fc6fdf259a8 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,16,25

## [1]  1  4  9 16 25

# pryr::address(v) seems uniquely slower than the other f1X() functions
microbenchmark(f1(),f1b(),f1p(),f1c())

## Unit: microseconds
##   expr    min      lq     mean  median       uq     max neval
##   f1()  4.587  5.6930  7.10846  6.1925   6.9915  29.247   100
##  f1b()  5.021  6.2815  7.56307  6.7675   7.5230  17.550   100
##  f1p() 78.611 85.6475 96.31977 93.2305 101.2380 231.254   100
##  f1c()  5.643  6.7685  8.62212  7.4920   8.2985  24.596   100

Obviously using built-ins is way faster than any interpretted loop or apply.

# vector apply way of squaring items
f2 <- function(v) {
   v <- 1:n
   v <- vapply(v,function(x) {x*x},numeric(1))
   v
}

# true vector method of squaring items
f3 <- function() {
  v <- 1:n
  v <- v^2
  v
}

n <- 1000
microbenchmark(f1(),f2(),f3())

## Unit: microseconds
##  expr     min      lq       mean   median        uq      max neval
##  f1() 769.904 849.662 1098.78994 898.6425 1236.5335 2825.948   100
##  f2() 544.032 624.841  782.76640 665.3085  832.3115 2191.192   100
##  f3()   3.574   4.852    8.38659   6.9350   10.0080   34.839   100

powseq <- function(min,max,nstep) {
  mul <- (max/min)^(1/nstep)
  sort(unique(c(min,round(min*mul^(1:nstep)),max)))
}

m <- c()
for(n in powseq(10,10000,40)) {
   mi <- as.data.frame(microbenchmark(f1(),f2(),f3()))
   mi$n <- n
   m <- rbind(m,mi)
   # si <- ddply(mi,'expr',summarize,time=mean(time))
}

qSummary = function(ycol){
  data.frame(y=median(ycol),
             ymin=quantile(ycol, 0.25),
             ymax=quantile(ycol, 0.75))
}
ggplot(data=m,aes(x=n,y=time,color=expr)) + 
  stat_summary(fun.y=mean,geom='point') +
  stat_summary(fun.data=qSummary,geom='errorbar') +
  geom_smooth()

## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

# confirm f3() moving
ggplot(data=m,aes(x=n,y=time,color=expr)) + 
  stat_summary(fun.y=mean,geom='point') +
  stat_summary(fun.data=qSummary,geom='errorbar') + 
  geom_smooth() +
  scale_y_log10() + scale_x_log10()

## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

for(expr in sort(unique(m$expr))) {
  print(expr)
  print(summary(lm(time~I(n*n)+n,data=m[m$expr==expr,])))
}

## [1] "f1()"
## 
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1685817   -75159   -10926    -5603 38743412 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.943e+03  2.489e+04   0.319   0.7496    
## I(n * n)    4.998e-03  2.888e-03   1.730   0.0836 .  
## n           9.017e+02  2.416e+01  37.322   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1209000 on 4097 degrees of freedom
## Multiple R-squared:  0.7842, Adjusted R-squared:  0.7841 
## F-statistic:  7446 on 2 and 4097 DF,  p-value: < 2.2e-16
## 
## [1] "f2()"
## 
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
##  -949245   -65851    -3894      292 34943120 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.087e+03  1.286e+04   0.240    0.810    
## I(n * n)    -1.785e-03  1.493e-03  -1.196    0.232    
## n            6.491e+02  1.249e+01  51.972   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 625000 on 4097 degrees of freedom
## Multiple R-squared:  0.8609, Adjusted R-squared:  0.8608 
## F-statistic: 1.268e+04 on 2 and 4097 DF,  p-value: < 2.2e-16
## 
## [1] "f3()"
## 
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8017   -570   -179     34 675287 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.995e+02  2.297e+02   3.917 9.13e-05 ***
## I(n * n)    -9.493e-05  2.665e-05  -3.561 0.000373 ***
## n            4.550e+00  2.230e-01  20.404  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11160 on 4097 degrees of freedom
## Multiple R-squared:  0.4108, Adjusted R-squared:  0.4106 
## F-statistic:  1429 on 2 and 4097 DF,  p-value: < 2.2e-16

m <- c()
for(n in powseq(10,1000,40)) {
   mi <- as.data.frame(microbenchmark(f1(),f1p(),f2(),f3(),f1c()))
   mi$n <- n
   m <- rbind(m,mi)
   # si <- ddply(mi,'expr',summarize,time=mean(time))
}

qSummary = function(ycol){
  data.frame(y=median(ycol),
             ymin=quantile(ycol, 0.25),
             ymax=quantile(ycol, 0.75))
}
ggplot(data=m,aes(x=n,y=time,color=expr)) + 
  stat_summary(fun.y=mean,geom='point') +
  stat_summary(fun.data=qSummary,geom='errorbar') +
  geom_smooth()

## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

for(expr in sort(unique(m$expr))) {
  print(expr)
  print(summary(lm(time~I(n*n)+n,data=m[m$expr==expr,])))
}

## [1] "f1()"
## 
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -91738   -8422   -3668    1059 3528933 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5970.98994 2102.19686    2.84  0.00453 ** 
## I(n * n)      -0.02731    0.01883   -1.45  0.14700    
## n            865.83692   16.32872   53.02  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 83550 on 4097 degrees of freedom
## Multiple R-squared:  0.8758, Adjusted R-squared:  0.8757 
## F-statistic: 1.444e+04 on 2 and 4097 DF,  p-value: < 2.2e-16
## 
## [1] "f1p()"
## 
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
##  -1363034   -193022    -48672     -8444 107026362 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.129e+04  4.495e+04   0.251    0.802    
## I(n * n)    -8.451e-02  4.027e-01  -0.210    0.834    
## n            1.465e+04  3.492e+02  41.970   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1787000 on 4097 degrees of freedom
## Multiple R-squared:  0.8218, Adjusted R-squared:  0.8217 
## F-statistic:  9448 on 2 and 4097 DF,  p-value: < 2.2e-16
## 
## [1] "f2()"
## 
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -58155   -6198   -2961    1847 1819350 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.134e+03  1.109e+03   7.334 2.67e-13 ***
## I(n * n)    -1.860e-02  9.934e-03  -1.872   0.0613 .  
## n            6.132e+02  8.615e+00  71.180  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44080 on 4097 degrees of freedom
## Multiple R-squared:  0.9272, Adjusted R-squared:  0.9271 
## F-statistic: 2.608e+04 on 2 and 4097 DF,  p-value: < 2.2e-16
## 
## [1] "f3()"
## 
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5305   -691   -324    383 584140 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.128e+03  2.347e+02   4.804 1.61e-06 ***
## I(n * n)    -1.770e-03  2.103e-03  -0.842      0.4    
## n            9.652e+00  1.823e+00   5.293 1.26e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9330 on 4097 degrees of freedom
## Multiple R-squared:  0.05078,    Adjusted R-squared:  0.05032 
## F-statistic: 109.6 on 2 and 4097 DF,  p-value: < 2.2e-16
## 
## [1] "f1c()"
## 
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
##  -1988806   -107838     -6417      3678 104332740 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3111.527  59614.396  -0.052   0.9584    
## I(n * n)        2.638      0.534   4.939 8.15e-07 ***
## n            1172.255    463.052   2.532   0.0114 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2369000 on 4097 degrees of freedom
## Multiple R-squared:  0.1262, Adjusted R-squared:  0.1258 
## F-statistic: 295.8 on 2 and 4097 DF,  p-value: < 2.2e-16

benchmark

John Mount

March 13, 2015