coalesce

replyr::coalesce() is used to fill in missing data especially for data coming from counting aggregations. The issue is that for data representing counts you often do want rows for zero-counts, and data produced by un-weighted counting can never produce such rows (please see “The Zero Bug” for some discussion).

replyr::coalesce() takes two primary arguments data (the data to be augmented) and support (the pattern of rows defining the complete desired range of the data). support is taken as a list of keys. If support can not be unique joined into data (say data has rows that are do not correspond to support key combinations) then replyr::coalesce() intentionally throws to avoid silent data loss. Otherwise an anti-join is performed to discover new rows that need to be added to data and column in this new row set are populated either by NA or by scalar values from the named-list argument fills. replyr::coalesce() works over various dplyr controlled data services (Spark 2 and above, PostgreSQL, SQLite, and local data).

Some examples are given below.

Setting up:

library("dplyr")

## Warning: replacing previous import 'vctrs::data_frame' by 'tibble::data_frame'
## when loading 'dplyr'

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library("replyr")

## Loading required package: wrapr

## 
## Attaching package: 'wrapr'

## The following object is masked from 'package:dplyr':
## 
##     coalesce

A simple single key example:

# single column key example
data <- data.frame(year = c(2005,2007,2010),
                   count = c(6,1,NA),
                   name = c('a','b','c'),
                   stringsAsFactors = FALSE)
support <- data.frame(year=2005:2010)
filled <- replyr_coalesce(data, support,
                          fills=list(count=0))
filled <- filled[order(filled$year), ]
filled

##   year count name
## 1 2005     6    a
## 4 2006     0 <NA>
## 2 2007     1    b
## 5 2008     0 <NA>
## 6 2009     0 <NA>
## 3 2010    NA    c

A cross product of keys example:

# complex key example
data <- data.frame(year = c(2005,2007,2010),
                   count = c(6,1,NA),
                   name = c('a','b','c'),
                   stringsAsFactors = FALSE)
support <- expand.grid(year=2005:2010,
                   name= c('a','b','c','d'),
                   stringsAsFactors = FALSE)
filled <- replyr_coalesce(data, support,
                          fills=list(count=0))
filled <- filled[order(filled$year, filled$name), ]
filled

##    year count name
## 1  2005     6    a
## 9  2005     0    b
## 14 2005     0    c
## 19 2005     0    d
## 4  2006     0    a
## 10 2006     0    b
## 15 2006     0    c
## 20 2006     0    d
## 5  2007     0    a
## 2  2007     1    b
## 16 2007     0    c
## 21 2007     0    d
## 6  2008     0    a
## 11 2008     0    b
## 17 2008     0    c
## 22 2008     0    d
## 7  2009     0    a
## 12 2009     0    b
## 18 2009     0    c
## 23 2009     0    d
## 8  2010     0    a
## 13 2010     0    b
## 3  2010    NA    c
## 24 2010     0    d

An irregular (cities contained in counties) example:

# cities and counties
def <- data.frame(county= c('Calaveras County',
                            'Colusa County',
                            'Colusa County'),
                  city= c('Angels Camp',
                          'Colusa',
                          'Williams'),
                  stringsAsFactors= FALSE)
counts <- data.frame(county= c('Calaveras County',
                               'Colusa County'),
                     city= c('Angels Camp',
                             'Colusa'),
                     n= c(2,3),
                     stringsAsFactors= FALSE)
replyr::replyr_coalesce(counts, def, fills = list(n=0))

##             county        city n
## 1 Calaveras County Angels Camp 2
## 2    Colusa County      Colusa 3
## 3    Colusa County    Williams 0

Re-running and example in a database:

execute_vignette <- requireNamespace("RSQLite", quietly = TRUE)

my_db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
RSQLite::initExtension(my_db)
dbData <- dplyr::copy_to(my_db, data)
dbSupport <- dplyr::copy_to(my_db, support)
replyr::replyr_coalesce(dbData, dbSupport, 
                        fills = list(count=0))

## # Source:   lazy query [?? x 3]
## # Database: sqlite 3.30.1 [:memory:]
##     year count name 
##    <dbl> <dbl> <chr>
##  1  2005     6 a    
##  2  2007     1 b    
##  3  2010    NA c    
##  4  2006     0 a    
##  5  2007     0 a    
##  6  2008     0 a    
##  7  2009     0 a    
##  8  2010     0 a    
##  9  2005     0 b    
## 10  2006     0 b    
## # … with more rows

As we mentioned replyr_coalesce only alter column that it has added. Under this convention the column “n” is not zeroed in the “c-row”.

d <- data.frame(x=c('a','c'), n=c(1,NA), 
                stringsAsFactors=FALSE)
s <- data.frame(x=c('a','b','c'), 
                stringsAsFactors=FALSE)

print(d)

##   x  n
## 1 a  1
## 2 c NA

replyr::replyr_coalesce(d,s,
                        fills= list(n=0))

##   x  n
## 1 a  1
## 2 c NA
## 3 b  0

Also replyr_coalesce intentionally throws if the join discovers new rows in data. This is to help defend against silent data loss.

d <- data.frame(x=c('a','c'), n=c(1,NA), 
                stringsAsFactors=FALSE)
s <- data.frame(x=c('a','b'), 
                stringsAsFactors=FALSE)
tryCatch(
  replyr::replyr_coalesce(d,s,
                        fills= list(n=0)),
  error= function(e) { e })

## <simpleError in replyr::replyr_coalesce(d, s, fills = list(n = 0)): replyr::replyr_coalesce support is not a unique set of keys for data>

Cleaning up:

DBI::dbDisconnect(my_db)
rm(list=ls())
gc()

##           used (Mb) gc trigger  (Mb) limit (Mb) max used (Mb)
## Ncells 1026787 54.9    2054300 109.8         NA  1715583 91.7
## Vcells 1863374 14.3    8388608  64.0      16384  2651711 20.3

John Mount

2020-09-06