grouped ordered apply — gapply • replyr

Partitions the data frame by the values in the grouping column, applies a generic transform to each group, and then binds the groups back together. Only advised for a moderate number of groups, and works better if the grouping column is an index. This is powerful enough to implement "The Split-Apply-Combine Strategy for Data Analysis" https://www.jstatsoft.org/article/view/v040i01

gapply(
  df,
  gcolumn,
  f,
  ...,
  ocolumn = NULL,
  decreasing = FALSE,
  partitionMethod = "split",
  bindrows = TRUE,
  maxgroups = 100,
  eagerCompute = FALSE,
  restoreGroup = FALSE,
  tempNameGenerator = mk_tmp_name_source("replyr_gapply")
)

Arguments

df	remote dplyr data item
gcolumn	grouping column
f	transform function or pipeline
...	forces later arguments to be bound by name (they cannot be passed positionally)
ocolumn	ordering column (optional)
decreasing	logical, if TRUE sort in decreasing order by ocolumn
partitionMethod	method to partition the data, one of 'group_by' (depends on f being dplyr compatible), 'split' (only works over local data frames), or 'extract'
bindrows	logical, if TRUE bind the rows back into a data item, else return split list
maxgroups	maximum number of groups to work over (intentionally not enforced if `partitionMethod=='group_by'`)
eagerCompute	logical, if TRUE call compute on split results
restoreGroup	logical, if TRUE restore group column after apply when `partitionMethod %in% c('extract', 'split')`
tempNameGenerator	temp name generator produced by `wrapr::mk_tmp_name_source`, used to record `dplyr::compute()` effects.

Value

transformed frame

Details

Note this is a fairly expensive operator, so it only makes sense to use in situations where f itself is fairly complicated and/or expensive.

Examples


# Example data: two groups (sizes 2 and 3), a column to order by within
# each group, and the values the example transforms operate on.
d <- data.frame(
  group = rep(c(1, 2), times = c(2, 3)),
  order = c(0.1, 0.2, 0.3, 0.4, 0.5),
  values = c(10, 20, 2, 4, 8)
)

# User-supplied window functions.  They rely on the column names being known
# in advance and on the data back-end providing the functions they use
# (here, cumsum).

# Append a column `cv` holding the running total of `values`, accumulated in
# the row order in which `d` is received.
cumulative_sum <- function(d) {
  with_running_total <- dplyr::mutate(d, cv = cumsum(values))
  with_running_total
}
# Number the rows of `d` in the order received (1, 2, 3, ...) in a new column
# `rank`, computed as the running sum of a temporary column of ones.
rank_in_group <- function(d) {
  with_ones <- dplyr::mutate(d, constcol = 1)
  with_rank <- dplyr::mutate(with_ones, rank = cumsum(constcol))
  # constcol only existed to feed the cumulative sum; drop it from the result.
  dplyr::select(with_rank, -constcol)
}

# Each example pairs a printed label with the transform to apply and the sort
# direction for the ordering column (FALSE is gapply's default for decreasing).
examples <- list(
  list(
    label = "cumulative sum example",
    f = cumulative_sum,
    decreasing = FALSE
  ),
  list(
    label = "ranking example",
    f = rank_in_group,
    decreasing = FALSE
  ),
  list(
    label = "ranking example (decreasing)",
    f = rank_in_group,
    decreasing = TRUE
  )
)

# Run every example under each partitioning strategy, printing the strategy
# name first, then each example's label followed by its result.
for (partitionMethod in c("group_by", "split", "extract")) {
  print(partitionMethod)
  for (ex in examples) {
    print(ex[["label"]])
    print(
      gapply(
        d,
        "group",
        ex[["f"]],
        ocolumn = "order",
        decreasing = ex[["decreasing"]],
        partitionMethod = partitionMethod
      )
    )
  }
}
#> [1] "group_by"
#> [1] "cumulative sum example"
#> # A tibble: 5 x 4
#>   group order values    cv
#>   <dbl> <dbl>  <dbl> <dbl>
#> 1     1   0.1     10    10
#> 2     1   0.2     20    30
#> 3     2   0.3      2     2
#> 4     2   0.4      4     6
#> 5     2   0.5      8    14
#> [1] "ranking example"
#> # A tibble: 5 x 4
#>   group order values  rank
#>   <dbl> <dbl>  <dbl> <dbl>
#> 1     1   0.1     10     1
#> 2     1   0.2     20     2
#> 3     2   0.3      2     1
#> 4     2   0.4      4     2
#> 5     2   0.5      8     3
#> [1] "ranking example (decreasing)"
#> # A tibble: 5 x 4
#>   group order values  rank
#>   <dbl> <dbl>  <dbl> <dbl>
#> 1     2   0.5      8     1
#> 2     2   0.4      4     2
#> 3     2   0.3      2     3
#> 4     1   0.2     20     1
#> 5     1   0.1     10     2
#> [1] "split"
#> [1] "cumulative sum example"
#>   group order values cv
#> 1     1   0.1     10 10
#> 2     1   0.2     20 30
#> 3     2   0.3      2  2
#> 4     2   0.4      4  6
#> 5     2   0.5      8 14
#> [1] "ranking example"
#>   group order values rank
#> 1     1   0.1     10    1
#> 2     1   0.2     20    2
#> 3     2   0.3      2    1
#> 4     2   0.4      4    2
#> 5     2   0.5      8    3
#> [1] "ranking example (decreasing)"
#>   group order values rank
#> 1     1   0.2     20    1
#> 2     1   0.1     10    2
#> 3     2   0.5      8    1
#> 4     2   0.4      4    2
#> 5     2   0.3      2    3
#> [1] "extract"
#> [1] "cumulative sum example"
#> Warning: `as.tbl()` is deprecated as of dplyr 1.0.0.
#> Please use `tibble::as_tibble()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_warnings()` to see where this warning was generated.
#>   group order values cv
#> 1     1   0.1     10 10
#> 2     1   0.2     20 30
#> 3     2   0.3      2  2
#> 4     2   0.4      4  6
#> 5     2   0.5      8 14
#> [1] "ranking example"
#>   group order values rank
#> 1     1   0.1     10    1
#> 2     1   0.2     20    2
#> 3     2   0.3      2    1
#> 4     2   0.4      4    2
#> 5     2   0.5      8    3
#> [1] "ranking example (decreasing)"
#>   group order values rank
#> 1     1   0.2     20    1
#> 2     1   0.1     10    2
#> 3     2   0.5      8    1
#> 4     2   0.4      4    2
#> 5     2   0.3      2    3