Functions

library(ggplot2)
library(vtreat)


# ----
# for making data sets with both noise and weakly correlated variables
# ----
mkCoefs = function(ngood) {
  nGoodN = ceiling(ngood/2)
  nGoodC = ngood-nGoodN
  coefs = list()
  if(nGoodN>0) {
    cN = lapply(seq_len(nGoodN),function(i) {
      v = list()
      v[[paste('gn', i, sep='_')]] = rnorm(1)
      v
    })
    coefs = unlist(cN,recursive=FALSE)
  }
  if(nGoodN>0) {
    cC = lapply(seq_len(nGoodC),function(i) {
      v = list()
      v[[paste('gc', i, sep='_')]] = list('a'=rnorm(1),
                                          'b'=rnorm(1),
                                          'c'=rnorm(1))
      v
    })
    coefs = append(coefs,unlist(cC,recursive=FALSE))
  }
  coefs
}

# generate columns
genColumns = function(nrow,prefix,nNum,nCat) {
  cols = list()
  if(nNum>0) {
    numCols = lapply(seq_len(nNum),function(i) rnorm(nrow))
    names(numCols) = paste(prefix,'n_',seq_len(nNum),sep='')
    cols = append(cols,numCols)
  }
  if(nCat>0) {
    cCols = lapply(seq_len(nCat),function(i) {
      sample(c('a','b','c'),nrow,replace=TRUE)
      })
    names(cCols) = paste(prefix,'c_',seq_len(nCat),sep='')
    cols = append(cols,cCols)
  }
  data.frame(cols,stringsAsFactors=FALSE)
}

# evaluate coefs on a data frame
applyCoefs = function(coefs,d) {
  v = numeric(nrow(d))
  for(n in names(coefs)) {
    cf = coefs[[n]]
    if(is.list(cf)) {
      # categorical
      v = v + as.numeric(cf[d[,n,drop=TRUE]])
    } else {
      # numeric
      v = v + cf[[1]]*d[,n,drop=TRUE]
    }
  }
  v
}

# build a data frame with pure noise columns
# and columns weakly correlated with y
mkData = function(nrows, coefs, nnoise) {
  noiseMagnitude = 1
  d = data.frame(y = noiseMagnitude*rnorm(nrows),
                 stringsAsFactors = FALSE)
  ngood = length(coefs)
  if(ngood>0) {
    ngC = sum(vapply(coefs,is.list,numeric(1)))
    ngN = ngood - ngC
    gd = genColumns(nrows,'g',ngN,ngC)
    d = cbind(d,gd)
    d$y = d$y + applyCoefs(coefs,d)
  }
  if(nnoise > 0) {
    nnN = ceiling(nnoise/2)
    nnC = nnoise-nnN
    nd = genColumns(nrows,'n',nnN,nnC)
    d = cbind(d,nd)
  }
  d$y = d$y > 0
  d
}
# ------


# run vtreat variable scoring experiment and print/plot results
showVTreat <- function(dframe,yName,yTarget,parallelCluster) {
  varnames <- setdiff(colnames(dframe),yName)
  treatments = designTreatmentsC(dframe,
                                 varnames,yName,yTarget,
                                 verbose=FALSE,
                                 parallelCluster=parallelCluster)
  ord <- order(treatments$scoreFrame$csig)
  sf <- treatments$scoreFrame[ord,]
  print(sf[seq_len(min(20,nrow(sf))),])
  goodIndices <- grep("^g",sf$origName)
  print(goodIndices)
  print(sf[goodIndices,])
  
  sf$goodVar <- FALSE
  sf$goodVar[goodIndices] <- TRUE
  list(pd=ggplot(data=sf,aes(x=sig,color=goodVar,fill=goodVar)) +
    geom_density(adjust=0.2,alpha=0.5) + scale_x_log10(),
    ph=ggplot(data=sf,aes(x=sig,color=goodVar,fill=goodVar)) +
    geom_histogram() + facet_wrap(~goodVar,ncol=1,scale="free_y") +
    scale_x_log10())
}
nCoreEstimate <-  parallel::detectCores()
print(paste('core estimate',nCoreEstimate))
## [1] "core estimate 4"
parallelCluster = parallel::makeCluster(nCoreEstimate)

Example 1 Small example of evaluating a variable with signal, and without

set.seed(3266)
N = 1000
g1 = rnorm(N)
n1 = rnorm(N)
y = 2*g1 + rnorm(N)
dframe = data.frame(y=y>0, g1=g1, n1=n1)

showVTreat(dframe,'y',TRUE,parallelCluster)[[2]]
##    varName origName varMoves PRESSRsquared         psig          sig
## 1 g1_clean       g1     TRUE    0.55143902 4.549788e-45 1.588708e-47
## 2 n1_clean       n1     TRUE   -0.01583777 1.000000e+00 7.172873e-01
##   catPRSquared         csig
## 1 0.6053479103 1.588708e-47
## 2 0.0003784514 7.172873e-01
## [1] 1
##    varName origName varMoves PRESSRsquared         psig          sig
## 1 g1_clean       g1     TRUE      0.551439 4.549788e-45 1.588708e-47
##   catPRSquared         csig
## 1    0.6053479 1.588708e-47
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Example 2

Small example of using chi-sq significance for variable filtering

ngood = 5; nnoise = 5; N = 1000
coefs = mkCoefs(ngood)
dframe = mkData(N, coefs, nnoise)
summary(dframe)
##      y                gn_1               gn_2               gn_3         
##  Mode :logical   Min.   :-3.07992   Min.   :-2.86504   Min.   :-3.51880  
##  FALSE:370       1st Qu.:-0.58050   1st Qu.:-0.76455   1st Qu.:-0.61031  
##  TRUE :630       Median : 0.08235   Median :-0.08096   Median : 0.05888  
##  NA's :0         Mean   : 0.08442   Mean   :-0.06745   Mean   : 0.04560  
##                  3rd Qu.: 0.76994   3rd Qu.: 0.64176   3rd Qu.: 0.69444  
##                  Max.   : 2.83260   Max.   : 3.16709   Max.   : 3.86147  
##      gc_1               gc_2                nn_1         
##  Length:1000        Length:1000        Min.   :-4.06423  
##  Class :character   Class :character   1st Qu.:-0.58554  
##  Mode  :character   Mode  :character   Median : 0.02891  
##                                        Mean   : 0.02070  
##                                        3rd Qu.: 0.68636  
##                                        Max.   : 3.08184  
##       nn_2                nn_3              nc_1          
##  Min.   :-3.484511   Min.   :-3.15580   Length:1000       
##  1st Qu.:-0.663425   1st Qu.:-0.65419   Class :character  
##  Median :-0.009027   Median :-0.03368   Mode  :character  
##  Mean   : 0.018741   Mean   :-0.01567                     
##  3rd Qu.: 0.660170   3rd Qu.: 0.63787                     
##  Max.   : 3.553240   Max.   : 2.91261                     
##      nc_2          
##  Length:1000       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
print("True coefficients of signal variables")
## [1] "True coefficients of signal variables"
print(coefs)
## $gn_1
## [1] -1.148735
## 
## $gn_2
## [1] -0.01430854
## 
## $gn_3
## [1] -1.562585
## 
## $gc_1
## $gc_1$a
## [1] -0.2714472
## 
## $gc_1$b
## [1] 1.223394
## 
## $gc_1$c
## [1] 1.172623
## 
## 
## $gc_2
## $gc_2$a
## [1] -0.247957
## 
## $gc_2$b
## [1] 0.57554
## 
## $gc_2$c
## [1] 0.1020899
showVTreat(dframe,'y',TRUE,parallelCluster)
##         varName origName varMoves PRESSRsquared         psig          sig
## 3    gn_3_clean     gn_3     TRUE   0.338913933 4.445277e-24 1.606254e-25
## 4  gc_1_lev_x.a     gc_1     TRUE   0.143220127 6.197608e-10 5.315279e-10
## 7     gc_1_catB     gc_1     TRUE   0.141021079 8.579409e-10 6.916390e-10
## 5  gc_1_lev_x.b     gc_1     TRUE   0.059516435 9.736829e-05 9.886860e-06
## 1    gn_1_clean     gn_1     TRUE   0.057039159 1.375082e-04 2.011085e-05
## 6  gc_1_lev_x.c     gc_1     TRUE  -0.001586634 1.000000e+00 5.997196e-02
## 10   nn_2_clean     nn_2     TRUE  -0.010512375 1.000000e+00 2.080640e-01
## 11   nn_3_clean     nn_3     TRUE  -0.012843806 1.000000e+00 3.462239e-01
## 8     gc_2_catB     gc_2     TRUE  -0.012845158 1.000000e+00 3.708804e-01
## 9    nn_1_clean     nn_1     TRUE  -0.014424495 1.000000e+00 4.979076e-01
## 12    nc_1_catB     nc_1     TRUE  -0.014929125 1.000000e+00 5.440322e-01
## 13    nc_2_catB     nc_2     TRUE  -0.015885584 1.000000e+00 7.456541e-01
## 2    gn_2_clean     gn_2     TRUE  -0.016332985 1.000000e+00 9.940577e-01
##    catPRSquared         csig
## 3  3.445905e-01 1.606254e-25
## 4  1.218735e-01 5.315279e-10
## 7  1.202494e-01 6.916390e-10
## 5  6.174021e-02 9.886860e-06
## 1  5.745927e-02 2.011085e-05
## 6  1.118338e-02 5.997196e-02
## 10 5.009366e-03 2.080640e-01
## 11 2.804407e-03 3.462239e-01
## 8  2.530884e-03 3.708804e-01
## 9  1.452046e-03 4.979076e-01
## 12 1.163542e-03 5.440322e-01
## 13 3.325757e-04 7.456541e-01
## 2  1.753198e-07 9.940577e-01
## [1]  1  2  3  4  5  6  9 13
##        varName origName varMoves PRESSRsquared         psig          sig
## 3   gn_3_clean     gn_3     TRUE   0.338913933 4.445277e-24 1.606254e-25
## 4 gc_1_lev_x.a     gc_1     TRUE   0.143220127 6.197608e-10 5.315279e-10
## 7    gc_1_catB     gc_1     TRUE   0.141021079 8.579409e-10 6.916390e-10
## 5 gc_1_lev_x.b     gc_1     TRUE   0.059516435 9.736829e-05 9.886860e-06
## 1   gn_1_clean     gn_1     TRUE   0.057039159 1.375082e-04 2.011085e-05
## 6 gc_1_lev_x.c     gc_1     TRUE  -0.001586634 1.000000e+00 5.997196e-02
## 8    gc_2_catB     gc_2     TRUE  -0.012845158 1.000000e+00 3.708804e-01
## 2   gn_2_clean     gn_2     TRUE  -0.016332985 1.000000e+00 9.940577e-01
##   catPRSquared         csig
## 3 3.445905e-01 1.606254e-25
## 4 1.218735e-01 5.315279e-10
## 7 1.202494e-01 6.916390e-10
## 5 6.174021e-02 9.886860e-06
## 1 5.745927e-02 2.011085e-05
## 6 1.118338e-02 5.997196e-02
## 8 2.530884e-03 3.708804e-01
## 2 1.753198e-07 9.940577e-01
## $pd

## 
## $ph
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Example 3

Larger example; lots of noise

ngood = 5; nnoise = 2000; N = 2500
coefs = mkCoefs(ngood)
dframe = mkData(N, coefs, nnoise)
varnames = setdiff(colnames(dframe), "y")

print("True coefficients of signal variables")
## [1] "True coefficients of signal variables"
print(coefs)
## $gn_1
## [1] -0.7097394
## 
## $gn_2
## [1] -0.9863472
## 
## $gn_3
## [1] -1.957874
## 
## $gc_1
## $gc_1$a
## [1] 1.623468
## 
## $gc_1$b
## [1] -1.334577
## 
## $gc_1$c
## [1] -0.3808084
## 
## 
## $gc_2
## $gc_2$a
## [1] -0.7030629
## 
## $gc_2$b
## [1] 0.9048151
## 
## $gc_2$c
## [1] 0.08699959
showVTreat(dframe,'y',TRUE,parallelCluster)
##           varName origName varMoves PRESSRsquared         psig
## 3      gn_3_clean     gn_3     TRUE   0.276031186 1.216116e-45
## 7       gc_1_catB     gc_1     TRUE   0.094442262 3.889489e-15
## 4    gc_1_lev_x.a     gc_1     TRUE   0.089316652 2.318363e-14
## 2      gn_2_clean     gn_2     TRUE   0.086278674 6.652504e-14
## 5    gc_1_lev_x.b     gc_1     TRUE   0.045381742 7.576831e-08
## 8       gc_2_catB     gc_2     TRUE   0.037842865 9.560980e-07
## 1      gn_1_clean     gn_1     TRUE   0.023426773 1.223103e-04
## 214  nn_206_clean   nn_206     TRUE   0.012967263 4.366080e-03
## 1920  nc_900_catB   nc_900     TRUE   0.012383237 5.351364e-03
## 227  nn_219_clean   nn_219     TRUE   0.009947471 1.260791e-02
## 402  nn_394_clean   nn_394     TRUE   0.009533584 1.460807e-02
## 98    nn_90_clean    nn_90     TRUE   0.009339980 1.565279e-02
## 1890  nc_870_catB   nc_870     TRUE   0.008636568 2.014115e-02
## 926  nn_918_clean   nn_918     TRUE   0.008368394 2.218460e-02
## 1023   nc_15_catB    nc_15     TRUE   0.007415338 3.135809e-02
## 711  nn_703_clean   nn_703     TRUE   0.007074422 3.553082e-02
## 1673  nc_659_catB   nc_659     TRUE   0.006578245 4.266847e-02
## 858  nn_850_clean   nn_850     TRUE   0.006265118 4.793375e-02
## 1887  nc_867_catB   nc_867     TRUE   0.006129828 5.041616e-02
## 608  nn_600_clean   nn_600     TRUE   0.006092032 5.113362e-02
##               sig catPRSquared         csig
## 3    1.099456e-46  0.237668530 1.099456e-46
## 7    1.199196e-15  0.073972831 1.199196e-15
## 4    7.092569e-15  0.069931691 7.092569e-15
## 2    1.003374e-14  0.069143359 1.003374e-14
## 5    1.102327e-08  0.037697130 1.102327e-08
## 8    1.366152e-07  0.032061136 1.366152e-07
## 1    1.540759e-05  0.021573779 1.540759e-05
## 214  4.686696e-04  0.014127087 4.686696e-04
## 1920 6.190129e-04  0.013528524 6.190129e-04
## 227  1.446387e-03  0.011713370 1.446387e-03
## 402  1.645780e-03  0.011438740 1.645780e-03
## 98   1.756565e-03  0.011300381 1.756565e-03
## 1890 2.192794e-03  0.010830195 2.192794e-03
## 926  2.489297e-03  0.010562028 2.489297e-03
## 1023 3.330638e-03  0.009948316 3.330638e-03
## 711  3.780329e-03  0.009682269 3.780329e-03
## 1673 4.439460e-03  0.009345477 4.439460e-03
## 858  5.107303e-03  0.009052617 5.107303e-03
## 1887 5.177012e-03  0.009024327 5.177012e-03
## 608  5.291993e-03  0.008978503 5.291993e-03
## [1]  1  2  3  4  5  6  7 40
##        varName origName varMoves PRESSRsquared         psig          sig
## 3   gn_3_clean     gn_3     TRUE    0.27603119 1.216116e-45 1.099456e-46
## 7    gc_1_catB     gc_1     TRUE    0.09444226 3.889489e-15 1.199196e-15
## 4 gc_1_lev_x.a     gc_1     TRUE    0.08931665 2.318363e-14 7.092569e-15
## 2   gn_2_clean     gn_2     TRUE    0.08627867 6.652504e-14 1.003374e-14
## 5 gc_1_lev_x.b     gc_1     TRUE    0.04538174 7.576831e-08 1.102327e-08
## 8    gc_2_catB     gc_2     TRUE    0.03784286 9.560980e-07 1.366152e-07
## 1   gn_1_clean     gn_1     TRUE    0.02342677 1.223103e-04 1.540759e-05
## 6 gc_1_lev_x.c     gc_1     TRUE    0.00216022 2.459449e-01 2.085056e-02
##   catPRSquared         csig
## 3   0.23766853 1.099456e-46
## 7   0.07397283 1.199196e-15
## 4   0.06993169 7.092569e-15
## 2   0.06914336 1.003374e-14
## 5   0.03769713 1.102327e-08
## 8   0.03206114 1.366152e-07
## 1   0.02157378 1.540759e-05
## 6   0.00616424 2.085056e-02
## $pd

## 
## $ph
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

if(!is.null(parallelCluster)) {
  parallel::stopCluster(parallelCluster)
  parallelCluster <- NULL
}