Functions
library(ggplot2)
library(vtreat)
# ----
# for making data sets with both noise and weakly correlated variables
# ----
mkCoefs = function(ngood) {
nGoodN = ceiling(ngood/2)
nGoodC = ngood-nGoodN
coefs = list()
if(nGoodN>0) {
cN = lapply(seq_len(nGoodN),function(i) {
v = list()
v[[paste('gn', i, sep='_')]] = rnorm(1)
v
})
coefs = unlist(cN,recursive=FALSE)
}
if(nGoodN>0) {
cC = lapply(seq_len(nGoodC),function(i) {
v = list()
v[[paste('gc', i, sep='_')]] = list('a'=rnorm(1),
'b'=rnorm(1),
'c'=rnorm(1))
v
})
coefs = append(coefs,unlist(cC,recursive=FALSE))
}
coefs
}
# generate columns
genColumns = function(nrow,prefix,nNum,nCat) {
cols = list()
if(nNum>0) {
numCols = lapply(seq_len(nNum),function(i) rnorm(nrow))
names(numCols) = paste(prefix,'n_',seq_len(nNum),sep='')
cols = append(cols,numCols)
}
if(nCat>0) {
cCols = lapply(seq_len(nCat),function(i) {
sample(c('a','b','c'),nrow,replace=TRUE)
})
names(cCols) = paste(prefix,'c_',seq_len(nCat),sep='')
cols = append(cols,cCols)
}
data.frame(cols,stringsAsFactors=FALSE)
}
# evaluate coefs on a data frame
applyCoefs = function(coefs,d) {
v = numeric(nrow(d))
for(n in names(coefs)) {
cf = coefs[[n]]
if(is.list(cf)) {
# categorical
v = v + as.numeric(cf[d[,n,drop=TRUE]])
} else {
# numeric
v = v + cf[[1]]*d[,n,drop=TRUE]
}
}
v
}
# build a data frame with pure noise columns
# and columns weakly correlated with y
mkData = function(nrows, coefs, nnoise) {
noiseMagnitude = 1
d = data.frame(y = noiseMagnitude*rnorm(nrows),
stringsAsFactors = FALSE)
ngood = length(coefs)
if(ngood>0) {
ngC = sum(vapply(coefs,is.list,numeric(1)))
ngN = ngood - ngC
gd = genColumns(nrows,'g',ngN,ngC)
d = cbind(d,gd)
d$y = d$y + applyCoefs(coefs,d)
}
if(nnoise > 0) {
nnN = ceiling(nnoise/2)
nnC = nnoise-nnN
nd = genColumns(nrows,'n',nnN,nnC)
d = cbind(d,nd)
}
d$y = d$y > 0
d
}
# ------
# run vtreat variable scoring experiment and print/plot results
showVTreat <- function(dframe,yName,yTarget,parallelCluster) {
varnames <- setdiff(colnames(dframe),yName)
treatments = designTreatmentsC(dframe,
varnames,yName,yTarget,
verbose=FALSE,
parallelCluster=parallelCluster)
ord <- order(treatments$scoreFrame$csig)
sf <- treatments$scoreFrame[ord,]
print(sf[seq_len(min(20,nrow(sf))),])
goodIndices <- grep("^g",sf$origName)
print(goodIndices)
print(sf[goodIndices,])
sf$goodVar <- FALSE
sf$goodVar[goodIndices] <- TRUE
list(pd=ggplot(data=sf,aes(x=sig,color=goodVar,fill=goodVar)) +
geom_density(adjust=0.2,alpha=0.5) + scale_x_log10(),
ph=ggplot(data=sf,aes(x=sig,color=goodVar,fill=goodVar)) +
geom_histogram() + facet_wrap(~goodVar,ncol=1,scale="free_y") +
scale_x_log10())
}
nCoreEstimate <- parallel::detectCores()
print(paste('core estimate',nCoreEstimate))
## [1] "core estimate 4"
parallelCluster = parallel::makeCluster(nCoreEstimate)
Example 1 Small example of evaluating a variable with signal, and without
set.seed(3266)
N = 1000
g1 = rnorm(N)
n1 = rnorm(N)
y = 2*g1 + rnorm(N)
dframe = data.frame(y=y>0, g1=g1, n1=n1)
showVTreat(dframe,'y',TRUE,parallelCluster)[[2]]
## varName origName varMoves PRESSRsquared psig sig
## 1 g1_clean g1 TRUE 0.55143902 4.549788e-45 1.588708e-47
## 2 n1_clean n1 TRUE -0.01583777 1.000000e+00 7.172873e-01
## catPRSquared csig
## 1 0.6053479103 1.588708e-47
## 2 0.0003784514 7.172873e-01
## [1] 1
## varName origName varMoves PRESSRsquared psig sig
## 1 g1_clean g1 TRUE 0.551439 4.549788e-45 1.588708e-47
## catPRSquared csig
## 1 0.6053479 1.588708e-47
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Example 2
Small example of using chi-sq significance for variable filtering
ngood = 5; nnoise = 5; N = 1000
coefs = mkCoefs(ngood)
dframe = mkData(N, coefs, nnoise)
summary(dframe)
## y gn_1 gn_2 gn_3
## Mode :logical Min. :-3.07992 Min. :-2.86504 Min. :-3.51880
## FALSE:370 1st Qu.:-0.58050 1st Qu.:-0.76455 1st Qu.:-0.61031
## TRUE :630 Median : 0.08235 Median :-0.08096 Median : 0.05888
## NA's :0 Mean : 0.08442 Mean :-0.06745 Mean : 0.04560
## 3rd Qu.: 0.76994 3rd Qu.: 0.64176 3rd Qu.: 0.69444
## Max. : 2.83260 Max. : 3.16709 Max. : 3.86147
## gc_1 gc_2 nn_1
## Length:1000 Length:1000 Min. :-4.06423
## Class :character Class :character 1st Qu.:-0.58554
## Mode :character Mode :character Median : 0.02891
## Mean : 0.02070
## 3rd Qu.: 0.68636
## Max. : 3.08184
## nn_2 nn_3 nc_1
## Min. :-3.484511 Min. :-3.15580 Length:1000
## 1st Qu.:-0.663425 1st Qu.:-0.65419 Class :character
## Median :-0.009027 Median :-0.03368 Mode :character
## Mean : 0.018741 Mean :-0.01567
## 3rd Qu.: 0.660170 3rd Qu.: 0.63787
## Max. : 3.553240 Max. : 2.91261
## nc_2
## Length:1000
## Class :character
## Mode :character
##
##
##
print("True coefficients of signal variables")
## [1] "True coefficients of signal variables"
print(coefs)
## $gn_1
## [1] -1.148735
##
## $gn_2
## [1] -0.01430854
##
## $gn_3
## [1] -1.562585
##
## $gc_1
## $gc_1$a
## [1] -0.2714472
##
## $gc_1$b
## [1] 1.223394
##
## $gc_1$c
## [1] 1.172623
##
##
## $gc_2
## $gc_2$a
## [1] -0.247957
##
## $gc_2$b
## [1] 0.57554
##
## $gc_2$c
## [1] 0.1020899
showVTreat(dframe,'y',TRUE,parallelCluster)
## varName origName varMoves PRESSRsquared psig sig
## 3 gn_3_clean gn_3 TRUE 0.338913933 4.445277e-24 1.606254e-25
## 4 gc_1_lev_x.a gc_1 TRUE 0.143220127 6.197608e-10 5.315279e-10
## 7 gc_1_catB gc_1 TRUE 0.141021079 8.579409e-10 6.916390e-10
## 5 gc_1_lev_x.b gc_1 TRUE 0.059516435 9.736829e-05 9.886860e-06
## 1 gn_1_clean gn_1 TRUE 0.057039159 1.375082e-04 2.011085e-05
## 6 gc_1_lev_x.c gc_1 TRUE -0.001586634 1.000000e+00 5.997196e-02
## 10 nn_2_clean nn_2 TRUE -0.010512375 1.000000e+00 2.080640e-01
## 11 nn_3_clean nn_3 TRUE -0.012843806 1.000000e+00 3.462239e-01
## 8 gc_2_catB gc_2 TRUE -0.012845158 1.000000e+00 3.708804e-01
## 9 nn_1_clean nn_1 TRUE -0.014424495 1.000000e+00 4.979076e-01
## 12 nc_1_catB nc_1 TRUE -0.014929125 1.000000e+00 5.440322e-01
## 13 nc_2_catB nc_2 TRUE -0.015885584 1.000000e+00 7.456541e-01
## 2 gn_2_clean gn_2 TRUE -0.016332985 1.000000e+00 9.940577e-01
## catPRSquared csig
## 3 3.445905e-01 1.606254e-25
## 4 1.218735e-01 5.315279e-10
## 7 1.202494e-01 6.916390e-10
## 5 6.174021e-02 9.886860e-06
## 1 5.745927e-02 2.011085e-05
## 6 1.118338e-02 5.997196e-02
## 10 5.009366e-03 2.080640e-01
## 11 2.804407e-03 3.462239e-01
## 8 2.530884e-03 3.708804e-01
## 9 1.452046e-03 4.979076e-01
## 12 1.163542e-03 5.440322e-01
## 13 3.325757e-04 7.456541e-01
## 2 1.753198e-07 9.940577e-01
## [1] 1 2 3 4 5 6 9 13
## varName origName varMoves PRESSRsquared psig sig
## 3 gn_3_clean gn_3 TRUE 0.338913933 4.445277e-24 1.606254e-25
## 4 gc_1_lev_x.a gc_1 TRUE 0.143220127 6.197608e-10 5.315279e-10
## 7 gc_1_catB gc_1 TRUE 0.141021079 8.579409e-10 6.916390e-10
## 5 gc_1_lev_x.b gc_1 TRUE 0.059516435 9.736829e-05 9.886860e-06
## 1 gn_1_clean gn_1 TRUE 0.057039159 1.375082e-04 2.011085e-05
## 6 gc_1_lev_x.c gc_1 TRUE -0.001586634 1.000000e+00 5.997196e-02
## 8 gc_2_catB gc_2 TRUE -0.012845158 1.000000e+00 3.708804e-01
## 2 gn_2_clean gn_2 TRUE -0.016332985 1.000000e+00 9.940577e-01
## catPRSquared csig
## 3 3.445905e-01 1.606254e-25
## 4 1.218735e-01 5.315279e-10
## 7 1.202494e-01 6.916390e-10
## 5 6.174021e-02 9.886860e-06
## 1 5.745927e-02 2.011085e-05
## 6 1.118338e-02 5.997196e-02
## 8 2.530884e-03 3.708804e-01
## 2 1.753198e-07 9.940577e-01
## $pd
##
## $ph
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Example 3
Larger example; lots of noise
ngood = 5; nnoise = 2000; N = 2500
coefs = mkCoefs(ngood)
dframe = mkData(N, coefs, nnoise)
varnames = setdiff(colnames(dframe), "y")
print("True coefficients of signal variables")
## [1] "True coefficients of signal variables"
print(coefs)
## $gn_1
## [1] -0.7097394
##
## $gn_2
## [1] -0.9863472
##
## $gn_3
## [1] -1.957874
##
## $gc_1
## $gc_1$a
## [1] 1.623468
##
## $gc_1$b
## [1] -1.334577
##
## $gc_1$c
## [1] -0.3808084
##
##
## $gc_2
## $gc_2$a
## [1] -0.7030629
##
## $gc_2$b
## [1] 0.9048151
##
## $gc_2$c
## [1] 0.08699959
showVTreat(dframe,'y',TRUE,parallelCluster)
## varName origName varMoves PRESSRsquared psig
## 3 gn_3_clean gn_3 TRUE 0.276031186 1.216116e-45
## 7 gc_1_catB gc_1 TRUE 0.094442262 3.889489e-15
## 4 gc_1_lev_x.a gc_1 TRUE 0.089316652 2.318363e-14
## 2 gn_2_clean gn_2 TRUE 0.086278674 6.652504e-14
## 5 gc_1_lev_x.b gc_1 TRUE 0.045381742 7.576831e-08
## 8 gc_2_catB gc_2 TRUE 0.037842865 9.560980e-07
## 1 gn_1_clean gn_1 TRUE 0.023426773 1.223103e-04
## 214 nn_206_clean nn_206 TRUE 0.012967263 4.366080e-03
## 1920 nc_900_catB nc_900 TRUE 0.012383237 5.351364e-03
## 227 nn_219_clean nn_219 TRUE 0.009947471 1.260791e-02
## 402 nn_394_clean nn_394 TRUE 0.009533584 1.460807e-02
## 98 nn_90_clean nn_90 TRUE 0.009339980 1.565279e-02
## 1890 nc_870_catB nc_870 TRUE 0.008636568 2.014115e-02
## 926 nn_918_clean nn_918 TRUE 0.008368394 2.218460e-02
## 1023 nc_15_catB nc_15 TRUE 0.007415338 3.135809e-02
## 711 nn_703_clean nn_703 TRUE 0.007074422 3.553082e-02
## 1673 nc_659_catB nc_659 TRUE 0.006578245 4.266847e-02
## 858 nn_850_clean nn_850 TRUE 0.006265118 4.793375e-02
## 1887 nc_867_catB nc_867 TRUE 0.006129828 5.041616e-02
## 608 nn_600_clean nn_600 TRUE 0.006092032 5.113362e-02
## sig catPRSquared csig
## 3 1.099456e-46 0.237668530 1.099456e-46
## 7 1.199196e-15 0.073972831 1.199196e-15
## 4 7.092569e-15 0.069931691 7.092569e-15
## 2 1.003374e-14 0.069143359 1.003374e-14
## 5 1.102327e-08 0.037697130 1.102327e-08
## 8 1.366152e-07 0.032061136 1.366152e-07
## 1 1.540759e-05 0.021573779 1.540759e-05
## 214 4.686696e-04 0.014127087 4.686696e-04
## 1920 6.190129e-04 0.013528524 6.190129e-04
## 227 1.446387e-03 0.011713370 1.446387e-03
## 402 1.645780e-03 0.011438740 1.645780e-03
## 98 1.756565e-03 0.011300381 1.756565e-03
## 1890 2.192794e-03 0.010830195 2.192794e-03
## 926 2.489297e-03 0.010562028 2.489297e-03
## 1023 3.330638e-03 0.009948316 3.330638e-03
## 711 3.780329e-03 0.009682269 3.780329e-03
## 1673 4.439460e-03 0.009345477 4.439460e-03
## 858 5.107303e-03 0.009052617 5.107303e-03
## 1887 5.177012e-03 0.009024327 5.177012e-03
## 608 5.291993e-03 0.008978503 5.291993e-03
## [1] 1 2 3 4 5 6 7 40
## varName origName varMoves PRESSRsquared psig sig
## 3 gn_3_clean gn_3 TRUE 0.27603119 1.216116e-45 1.099456e-46
## 7 gc_1_catB gc_1 TRUE 0.09444226 3.889489e-15 1.199196e-15
## 4 gc_1_lev_x.a gc_1 TRUE 0.08931665 2.318363e-14 7.092569e-15
## 2 gn_2_clean gn_2 TRUE 0.08627867 6.652504e-14 1.003374e-14
## 5 gc_1_lev_x.b gc_1 TRUE 0.04538174 7.576831e-08 1.102327e-08
## 8 gc_2_catB gc_2 TRUE 0.03784286 9.560980e-07 1.366152e-07
## 1 gn_1_clean gn_1 TRUE 0.02342677 1.223103e-04 1.540759e-05
## 6 gc_1_lev_x.c gc_1 TRUE 0.00216022 2.459449e-01 2.085056e-02
## catPRSquared csig
## 3 0.23766853 1.099456e-46
## 7 0.07397283 1.199196e-15
## 4 0.06993169 7.092569e-15
## 2 0.06914336 1.003374e-14
## 5 0.03769713 1.102327e-08
## 8 0.03206114 1.366152e-07
## 1 0.02157378 1.540759e-05
## 6 0.00616424 2.085056e-02
## $pd
##
## $ph
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
if(!is.null(parallelCluster)) {
parallel::stopCluster(parallelCluster)
parallelCluster <- NULL
}