library('microbenchmark')
library('pryr')
library('plyr')
library('ggplot2')
# standard naive "non-R" way to square items in a vector
f1 <- function() {
v <- 1:n
for(i in 1:n) {
v[i] <- v[i]^2
}
v
}
Show address of value referred by by v is stable (indicating in-place alteration of vector, likely due to limited visibility of the value referred to).
# same naive loop with pryr:address() inspection
# doesn't trigger visibile value address changes, but very slow
f1p <- function(verbose=FALSE) {
v <- 1:n
addr <- pryr::address(v)
if(verbose) {
print(addr)
}
for(i in 1:n) {
v[i] <- v[i]^2
addr <- pryr::address(v)
if(verbose) {
print(addr)
}
}
v
}
# same naive loop with .Internal(inspect(v))
f1b <- function(verbose=FALSE) {
v <- 1:n
if(verbose) {
.Internal(inspect(v))
}
for(i in 1:n) {
v[i] <- v[i]^2
if(verbose) {
.Internal(inspect(v))
}
}
v
}
# even worse case: vLast shares reference to value referred to by v,
# causing address to change (evidence of more object copying)
f1c <- function(verbose=FALSE) {
v <- 1:n
vLast <- v
if(verbose) {
.Internal(inspect(v))
}
for(i in 1:n) {
v[i] <- v[i]^2
vLast <- v
if(verbose) {
.Internal(inspect(v))
}
}
v
}
# address not changing
n <- 5
f1b(TRUE)
## @7fc6fe8afd70 13 INTSXP g0c3 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## @7fc6fdf4bb50 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,2,3,4,5
## @7fc6fdf4bb50 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,3,4,5
## @7fc6fdf4bb50 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,4,5
## @7fc6fdf4bb50 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,5
## @7fc6fdf4bb50 14 REALSXP g0c4 [NAM(1)] (len=5, tl=0) 1,4,9,16,25
## [1] 1 4 9 16 25
# address not changing
n <- 5
f1p(TRUE)
## [1] "0x7fc6fe133db0"
## [1] "0x7fc6fdf2c070"
## [1] "0x7fc6fdf2c070"
## [1] "0x7fc6fdf2c070"
## [1] "0x7fc6fdf2c070"
## [1] "0x7fc6fdf2c070"
## [1] 1 4 9 16 25
# address changing
f1c(TRUE)
## @7fc6fe15db28 13 INTSXP g0c3 [NAM(2)] (len=5, tl=0) 1,2,3,4,5
## @7fc6fdf29950 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,2,3,4,5
## @7fc6fdf29a20 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,3,4,5
## @7fc6fdf29af0 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,4,5
## @7fc6fdf25870 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,16,5
## @7fc6fdf259a8 14 REALSXP g0c4 [NAM(2)] (len=5, tl=0) 1,4,9,16,25
## [1] 1 4 9 16 25
# pryr::address(v) seems uniquely slower than the other f1X() functions
microbenchmark(f1(),f1b(),f1p(),f1c())
## Unit: microseconds
## expr min lq mean median uq max neval
## f1() 4.587 5.6930 7.10846 6.1925 6.9915 29.247 100
## f1b() 5.021 6.2815 7.56307 6.7675 7.5230 17.550 100
## f1p() 78.611 85.6475 96.31977 93.2305 101.2380 231.254 100
## f1c() 5.643 6.7685 8.62212 7.4920 8.2985 24.596 100
Obviously using built-ins is way faster than any interpretted loop or apply.
# vector apply way of squaring items
f2 <- function(v) {
v <- 1:n
v <- vapply(v,function(x) {x*x},numeric(1))
v
}
# true vector method of squaring items
f3 <- function() {
v <- 1:n
v <- v^2
v
}
n <- 1000
microbenchmark(f1(),f2(),f3())
## Unit: microseconds
## expr min lq mean median uq max neval
## f1() 769.904 849.662 1098.78994 898.6425 1236.5335 2825.948 100
## f2() 544.032 624.841 782.76640 665.3085 832.3115 2191.192 100
## f3() 3.574 4.852 8.38659 6.9350 10.0080 34.839 100
powseq <- function(min,max,nstep) {
mul <- (max/min)^(1/nstep)
sort(unique(c(min,round(min*mul^(1:nstep)),max)))
}
m <- c()
for(n in powseq(10,10000,40)) {
mi <- as.data.frame(microbenchmark(f1(),f2(),f3()))
mi$n <- n
m <- rbind(m,mi)
# si <- ddply(mi,'expr',summarize,time=mean(time))
}
qSummary = function(ycol){
data.frame(y=median(ycol),
ymin=quantile(ycol, 0.25),
ymax=quantile(ycol, 0.75))
}
ggplot(data=m,aes(x=n,y=time,color=expr)) +
stat_summary(fun.y=mean,geom='point') +
stat_summary(fun.data=qSummary,geom='errorbar') +
geom_smooth()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
# confirm f3() moving
ggplot(data=m,aes(x=n,y=time,color=expr)) +
stat_summary(fun.y=mean,geom='point') +
stat_summary(fun.data=qSummary,geom='errorbar') +
geom_smooth() +
scale_y_log10() + scale_x_log10()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
for(expr in sort(unique(m$expr))) {
print(expr)
print(summary(lm(time~I(n*n)+n,data=m[m$expr==expr,])))
}
## [1] "f1()"
##
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1685817 -75159 -10926 -5603 38743412
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.943e+03 2.489e+04 0.319 0.7496
## I(n * n) 4.998e-03 2.888e-03 1.730 0.0836 .
## n 9.017e+02 2.416e+01 37.322 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1209000 on 4097 degrees of freedom
## Multiple R-squared: 0.7842, Adjusted R-squared: 0.7841
## F-statistic: 7446 on 2 and 4097 DF, p-value: < 2.2e-16
##
## [1] "f2()"
##
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -949245 -65851 -3894 292 34943120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.087e+03 1.286e+04 0.240 0.810
## I(n * n) -1.785e-03 1.493e-03 -1.196 0.232
## n 6.491e+02 1.249e+01 51.972 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 625000 on 4097 degrees of freedom
## Multiple R-squared: 0.8609, Adjusted R-squared: 0.8608
## F-statistic: 1.268e+04 on 2 and 4097 DF, p-value: < 2.2e-16
##
## [1] "f3()"
##
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -8017 -570 -179 34 675287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.995e+02 2.297e+02 3.917 9.13e-05 ***
## I(n * n) -9.493e-05 2.665e-05 -3.561 0.000373 ***
## n 4.550e+00 2.230e-01 20.404 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11160 on 4097 degrees of freedom
## Multiple R-squared: 0.4108, Adjusted R-squared: 0.4106
## F-statistic: 1429 on 2 and 4097 DF, p-value: < 2.2e-16
m <- c()
for(n in powseq(10,1000,40)) {
mi <- as.data.frame(microbenchmark(f1(),f1p(),f2(),f3(),f1c()))
mi$n <- n
m <- rbind(m,mi)
# si <- ddply(mi,'expr',summarize,time=mean(time))
}
qSummary = function(ycol){
data.frame(y=median(ycol),
ymin=quantile(ycol, 0.25),
ymax=quantile(ycol, 0.75))
}
ggplot(data=m,aes(x=n,y=time,color=expr)) +
stat_summary(fun.y=mean,geom='point') +
stat_summary(fun.data=qSummary,geom='errorbar') +
geom_smooth()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
for(expr in sort(unique(m$expr))) {
print(expr)
print(summary(lm(time~I(n*n)+n,data=m[m$expr==expr,])))
}
## [1] "f1()"
##
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -91738 -8422 -3668 1059 3528933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5970.98994 2102.19686 2.84 0.00453 **
## I(n * n) -0.02731 0.01883 -1.45 0.14700
## n 865.83692 16.32872 53.02 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 83550 on 4097 degrees of freedom
## Multiple R-squared: 0.8758, Adjusted R-squared: 0.8757
## F-statistic: 1.444e+04 on 2 and 4097 DF, p-value: < 2.2e-16
##
## [1] "f1p()"
##
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1363034 -193022 -48672 -8444 107026362
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.129e+04 4.495e+04 0.251 0.802
## I(n * n) -8.451e-02 4.027e-01 -0.210 0.834
## n 1.465e+04 3.492e+02 41.970 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1787000 on 4097 degrees of freedom
## Multiple R-squared: 0.8218, Adjusted R-squared: 0.8217
## F-statistic: 9448 on 2 and 4097 DF, p-value: < 2.2e-16
##
## [1] "f2()"
##
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -58155 -6198 -2961 1847 1819350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.134e+03 1.109e+03 7.334 2.67e-13 ***
## I(n * n) -1.860e-02 9.934e-03 -1.872 0.0613 .
## n 6.132e+02 8.615e+00 71.180 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44080 on 4097 degrees of freedom
## Multiple R-squared: 0.9272, Adjusted R-squared: 0.9271
## F-statistic: 2.608e+04 on 2 and 4097 DF, p-value: < 2.2e-16
##
## [1] "f3()"
##
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -5305 -691 -324 383 584140
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.128e+03 2.347e+02 4.804 1.61e-06 ***
## I(n * n) -1.770e-03 2.103e-03 -0.842 0.4
## n 9.652e+00 1.823e+00 5.293 1.26e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9330 on 4097 degrees of freedom
## Multiple R-squared: 0.05078, Adjusted R-squared: 0.05032
## F-statistic: 109.6 on 2 and 4097 DF, p-value: < 2.2e-16
##
## [1] "f1c()"
##
## Call:
## lm(formula = time ~ I(n * n) + n, data = m[m$expr == expr, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1988806 -107838 -6417 3678 104332740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3111.527 59614.396 -0.052 0.9584
## I(n * n) 2.638 0.534 4.939 8.15e-07 ***
## n 1172.255 463.052 2.532 0.0114 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2369000 on 4097 degrees of freedom
## Multiple R-squared: 0.1262, Adjusted R-squared: 0.1258
## F-statistic: 295.8 on 2 and 4097 DF, p-value: < 2.2e-16