Functions
library(ggplot2)
# get the theoretical significance of a glm model (with respect to deviance)
get_significance = function(model) {
  delta_deviance = model$null.deviance - model$deviance
  df = model$df.null - model$df.residual
  sig = pchisq(delta_deviance, df, lower.tail=FALSE)
  sig
}
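# Usage sketch (hypothetical toy data, not part of the examples below):
# d_toy = data.frame(x = rnorm(100))
# d_toy$y = d_toy$x + rnorm(100) > 0
# m_toy = glm(y ~ x, data = d_toy, family = binomial(link = "logit"))
# get_significance(m_toy)  # near 0 when x genuinely predicts y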
# return a vector of the deviance scores on the permuted data
permutation_test = function(dataf, ycol, nperm) {
  nrows = dim(dataf)[1]
  y = dataf[[ycol]]
  X = dataf[, setdiff(colnames(dataf), ycol), drop=FALSE]
  varnames = colnames(X)
  fmla = paste("y", paste(varnames, collapse=" + "), sep=" ~ ")
  deviances = numeric(nperm)
  for(i in seq_len(nperm)) {
    # take a random permutation of the rows of y
    ord = sample.int(nrows, size=nrows, replace=FALSE)
    model = glm(fmla, data=cbind(y=y[ord], X),
                family=binomial(link="logit"))
    deviances[[i]] = model$deviance
  }
  deviances
}
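# Usage sketch, continuing the hypothetical toy example above (not run):
# pdevs = permutation_test(d_toy, "y", 200)
# mean(pdevs <= m_toy$deviance)  # empirical left tail area; near 0 for real signal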
score_variable = function(dframe, ycol, var, nperm,
                          title='') {
  df = data.frame(y=dframe[[ycol]], x=dframe[[var]])
  mod = glm("y~x", data=df,
            family=binomial(link="logit"))
  vdev = mod$deviance
  vperm = permutation_test(df, "y", nperm)
  print(summary(mod))
  # count how many of the permutation deviances are at or below vdev
  num = sum(vperm <= vdev)
  vscore = num/nperm
  print(ggplot(data.frame(nullperm=vperm), aes(x=nullperm)) +
          geom_density() + geom_vline(xintercept=vdev, color='red') +
          ggtitle(paste(title, "left tail area ~", vscore)))
  print(paste("Chi-sq estimate:", get_significance(mod)))
}
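# Interpretation note: for a variable with real signal, the model fit on the
# actual y should have much lower deviance than the models fit on permuted y,
# so the left tail area (the fraction of permuted deviances at or below the
# observed one) is near 0; for a noise variable it is roughly uniform on [0,1].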
# ----
# for making data sets with both noise and weakly correlated variables
# ----
mkCoefs = function(ngood) {
  nGoodN = ceiling(ngood/2)
  nGoodC = ngood - nGoodN
  coefs = list()
  if(nGoodN > 0) {
    cN = lapply(seq_len(nGoodN), function(i) {
      v = list()
      v[[paste('gn', i, sep='_')]] = rnorm(1)
      v
    })
    coefs = unlist(cN, recursive=FALSE)
  }
  if(nGoodC > 0) {
    cC = lapply(seq_len(nGoodC), function(i) {
      v = list()
      v[[paste('gc', i, sep='_')]] = list('a'=rnorm(1),
                                          'b'=rnorm(1),
                                          'c'=rnorm(1))
      v
    })
    coefs = append(coefs, unlist(cC, recursive=FALSE))
  }
  coefs
}
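# Structure sketch: mkCoefs(3) yields a named list of the form
#   list(gn_1=<num>, gn_2=<num>, gc_1=list(a=<num>, b=<num>, c=<num>))
# (two numeric coefficients, one per-level coefficient list); inspect with str().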
# generate columns
genColumns = function(nrow, prefix, nNum, nCat) {
  cols = list()
  if(nNum > 0) {
    numCols = lapply(seq_len(nNum), function(i) rnorm(nrow))
    names(numCols) = paste(prefix, 'n_', seq_len(nNum), sep='')
    cols = append(cols, numCols)
  }
  if(nCat > 0) {
    cCols = lapply(seq_len(nCat), function(i) {
      sample(c('a','b','c'), nrow, replace=TRUE)
    })
    names(cCols) = paste(prefix, 'c_', seq_len(nCat), sep='')
    cols = append(cols, cCols)
  }
  data.frame(cols, stringsAsFactors=FALSE)
}
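# Sketch: genColumns(5, 'g', 1, 1) returns a 5-row data frame with one numeric
# column 'gn_1' and one character column 'gc_1' sampled from {'a','b','c'}.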
# evaluate coefs on a data frame
applyCoefs = function(coefs, d) {
  v = numeric(nrow(d))
  for(n in names(coefs)) {
    cf = coefs[[n]]
    if(is.list(cf)) {
      # categorical: look up the per-level coefficient
      v = v + as.numeric(cf[d[,n,drop=TRUE]])
    } else {
      # numeric
      v = v + cf[[1]]*d[,n,drop=TRUE]
    }
  }
  v
}
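# Worked sketch (hypothetical values): with
#   cfs = list(gn_1 = 2, gc_1 = list(a = 1, b = 0, c = -1))
#   d = data.frame(gn_1 = c(0.5, -1), gc_1 = c('a','c'), stringsAsFactors = FALSE)
# applyCoefs(cfs, d) returns c(2*0.5 + 1, 2*(-1) + (-1)) = c(2, -3).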
# build a data frame with pure noise columns
# and columns weakly correlated with y
mkData = function(nrows, coefs, nnoise) {
  noiseMagnitude = 1
  d = data.frame(y = noiseMagnitude*rnorm(nrows),
                 stringsAsFactors = FALSE)
  ngood = length(coefs)
  if(ngood > 0) {
    ngC = sum(vapply(coefs, is.list, logical(1)))
    ngN = ngood - ngC
    gd = genColumns(nrows, 'g', ngN, ngC)
    d = cbind(d, gd)
    d$y = d$y + applyCoefs(coefs, d)
  }
  if(nnoise > 0) {
    nnN = ceiling(nnoise/2)
    nnC = nnoise - nnN
    nd = genColumns(nrows, 'n', nnN, nnC)
    d = cbind(d, nd)
  }
  d$y = d$y > 0
  d
}
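# Sketch: mkData(100, mkCoefs(2), 2) yields a 100-row frame with columns
# y (logical), gn_1, gc_1 (signal) and nn_1, nc_1 (noise).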
# ------
# get the signal scores for the variables
# in a data set
# assume the output is a binary variable named y
get_chiscores = function(dframe, varnames) {
  nvar = length(varnames)
  scores = numeric(nvar)
  for(i in seq_len(nvar)) {
    model = glm(paste("y~", varnames[i]), dframe,
                family=binomial(link="logit"))
    scores[i] = get_significance(model)
  }
  sframe = data.frame(var=varnames,
                      scores=scores, stringsAsFactors=FALSE)
  sframe
}
#
# Plot the scores of each variable
# frm has columns var and scores
# (output of get_chiscores)
#
scoreplot = function(frm, threshold, sort=1) {
  frm$var = reorder(frm$var, frm$scores*sort, FUN=sum)
  frm$goodvar = frm$scores < threshold
  ggplot(frm, aes(x=var, y=scores, ymin=0, ymax=scores, color=goodvar)) +
    geom_pointrange() +
    geom_hline(yintercept=threshold, color="red", linetype=2) +
    scale_color_manual(values=c("TRUE"="darkgreen", "FALSE"="darkgray")) +
    theme(legend.position="none")
}
Example 1
Small example of evaluating a variable with signal, and one without
set.seed(3266)
N = 1000
s1 = rnorm(N)
n1 = rnorm(N)
y = 2*s1 + rnorm(N)
dframe = data.frame(y=y>0, s1=s1, n1=n1)
nperm = 500
score_variable(dframe, "y", "s1", nperm,
               title='Signal variable deviance,')
##
## Call:
## glm(formula = "y~x", family = binomial(link = "logit"), data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.80896 -0.42049 0.01506 0.45285 2.87230
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.02815 0.09930 -0.283 0.777
## x 3.50570 0.22463 15.606 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1386.23 on 999 degrees of freedom
## Residual deviance: 643.95 on 998 degrees of freedom
## AIC: 647.95
##
## Number of Fisher Scoring iterations: 6
## [1] "Chi-sq estimate: 1.91151650110488e-163"
score_variable(dframe, "y", "n1", nperm,
title='Noise variable deviance,')
##
## Call:
## glm(formula = "y~x", family = binomial(link = "logit"), data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.256 -1.183 1.116 1.170 1.243
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.01460 0.06329 0.231 0.818
## x -0.05703 0.06575 -0.867 0.386
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1386.2 on 999 degrees of freedom
## Residual deviance: 1385.5 on 998 degrees of freedom
## AIC: 1389.5
##
## Number of Fisher Scoring iterations: 3
## [1] "Chi-sq estimate: 0.385532666108562"
Example 2
Small example of using chi-sq significance for variable filtering
ngood = 5; nnoise = 5; N = 1000
coefs = mkCoefs(ngood)
dframe = mkData(N, coefs, nnoise)
summary(dframe)
## y gn_1 gn_2 gn_3
## Mode :logical Min. :-3.44603 Min. :-2.85755 Min. :-3.082419
## FALSE:648 1st Qu.:-0.67585 1st Qu.:-0.58603 1st Qu.:-0.662868
## TRUE :352 Median : 0.01202 Median : 0.02734 Median : 0.000484
## NA's :0 Mean :-0.01574 Mean : 0.04027 Mean :-0.015867
## 3rd Qu.: 0.63183 3rd Qu.: 0.74720 3rd Qu.: 0.678202
## Max. : 3.14300 Max. : 3.21589 Max. : 3.461750
## gc_1 gc_2 nn_1
## Length:1000 Length:1000 Min. :-3.15092
## Class :character Class :character 1st Qu.:-0.68175
## Mode :character Mode :character Median : 0.04855
## Mean :-0.03193
## 3rd Qu.: 0.61428
## Max. : 3.58665
## nn_2 nn_3 nc_1
## Min. :-2.94448 Min. :-2.87404 Length:1000
## 1st Qu.:-0.66348 1st Qu.:-0.73475 Class :character
## Median :-0.01824 Median :-0.05155 Mode :character
## Mean :-0.01459 Mean :-0.05146
## 3rd Qu.: 0.63712 3rd Qu.: 0.66751
## Max. : 3.84612 Max. : 3.26796
## nc_2
## Length:1000
## Class :character
## Mode :character
##
##
##
varnames = setdiff(colnames(dframe), "y")
sframe = get_chiscores(dframe, varnames)
threshold = 0.05 # be generous in accepting a variable (1/20 false positive rate)
scoreplot(sframe, threshold)
print("Variables selected:")
## [1] "Variables selected:"
print(sframe[sframe$scores<threshold,])
## var scores
## 2 gn_2 3.909215e-99
## 3 gn_3 3.561903e-09
## 4 gc_1 3.906063e-06
## 5 gc_2 4.325259e-06
## 8 nn_3 1.459305e-02
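Note that the noise variable nn_3 slips through the filter. This is expected: with 5 noise variables and a threshold of 0.05, about 5 * 0.05 = 0.25 false positives per run occur by chance. Also note that gn_1 is not selected; its true coefficient (about 0.034, shown below) is too weak to detect at this sample size.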
print("True coefficients of signal variables")
## [1] "True coefficients of signal variables"
print(coefs)
## $gn_1
## [1] 0.03406483
##
## $gn_2
## [1] -1.457936
##
## $gn_3
## [1] 0.4138757
##
## $gc_1
## $gc_1$a
## [1] -0.504306
##
## $gc_1$b
## [1] -0.2303952
##
## $gc_1$c
## [1] -1.174718
##
##
## $gc_2
## $gc_2$a
## [1] -0.2770224
##
## $gc_2$b
## [1] -0.1947211
##
## $gc_2$c
## [1] 0.2442891
Example 3
Larger example; lots of noise
ngood = 5; nnoise = 2000; N = 2500
coefs = mkCoefs(ngood)
dframe = mkData(N, coefs, nnoise)
varnames = setdiff(colnames(dframe), "y")
print("True coefficients of signal variables")
## [1] "True coefficients of signal variables"
print(coefs)
## $gn_1
## [1] 0.8907755
##
## $gn_2
## [1] 0.09630513
##
## $gn_3
## [1] 1.072262
##
## $gc_1
## $gc_1$a
## [1] 1.371897
##
## $gc_1$b
## [1] -0.8606408
##
## $gc_1$c
## [1] -0.4705882
##
##
## $gc_2
## $gc_2$a
## [1] -2.172888
##
## $gc_2$b
## [1] -0.9766713
##
## $gc_2$c
## [1] 0.9012387
sframe = get_chiscores(dframe, varnames)
# In practice the threshold should be chosen before looking at the scores;
# here we sweep several thresholds only to illustrate their effect:
# 1/100, 1/40, and 1/20 expected false positive rates
thresholds = c(0.01, 0.025, 0.05)
for(threshold in thresholds) {
  n = sum(sframe$scores < threshold)
  print(paste(n, "variables selected, threshold =", threshold))
  print(sframe[sframe$scores < threshold,])
}
## [1] "17 variables selected, threshold = 0.01"
## var scores
## 1 gn_1 4.744091e-40
## 3 gn_3 4.426174e-72
## 4 gc_1 3.754517e-64
## 5 gc_2 2.838399e-117
## 95 nn_90 7.278843e-03
## 353 nn_348 2.702906e-03
## 470 nn_465 3.625489e-03
## 617 nn_612 8.157911e-03
## 833 nn_828 7.904690e-03
## 1006 nc_1 2.983575e-03
## 1265 nc_260 6.512229e-03
## 1290 nc_285 9.559572e-03
## 1370 nc_365 5.091275e-03
## 1490 nc_485 7.549149e-03
## 1606 nc_601 2.689314e-03
## 1650 nc_645 5.622456e-03
## 1912 nc_907 9.193532e-03
## [1] "46 variables selected, threshold = 0.025"
## var scores
## 1 gn_1 4.744091e-40
## 3 gn_3 4.426174e-72
## 4 gc_1 3.754517e-64
## 5 gc_2 2.838399e-117
## 25 nn_20 1.534896e-02
## 50 nn_45 1.054503e-02
## 95 nn_90 7.278843e-03
## 98 nn_93 1.748200e-02
## 153 nn_148 1.463944e-02
## 338 nn_333 1.652431e-02
## 353 nn_348 2.702906e-03
## 386 nn_381 1.125829e-02
## 426 nn_421 1.037611e-02
## 470 nn_465 3.625489e-03
## 478 nn_473 1.465016e-02
## 503 nn_498 1.977930e-02
## 529 nn_524 2.162455e-02
## 563 nn_558 1.895795e-02
## 596 nn_591 1.226371e-02
## 617 nn_612 8.157911e-03
## 631 nn_626 2.439658e-02
## 665 nn_660 1.843377e-02
## 781 nn_776 2.499622e-02
## 833 nn_828 7.904690e-03
## 952 nn_947 2.143719e-02
## 978 nn_973 1.328706e-02
## 1006 nc_1 2.983575e-03
## 1059 nc_54 1.426079e-02
## 1187 nc_182 2.355118e-02
## 1230 nc_225 1.576357e-02
## 1265 nc_260 6.512229e-03
## 1290 nc_285 9.559572e-03
## 1370 nc_365 5.091275e-03
## 1490 nc_485 7.549149e-03
## 1522 nc_517 1.481355e-02
## 1549 nc_544 2.044036e-02
## 1606 nc_601 2.689314e-03
## 1615 nc_610 2.097747e-02
## 1650 nc_645 5.622456e-03
## 1810 nc_805 1.363454e-02
## 1840 nc_835 1.659934e-02
## 1847 nc_842 1.051913e-02
## 1893 nc_888 1.580964e-02
## 1912 nc_907 9.193532e-03
## 1918 nc_913 1.375041e-02
## 1998 nc_993 1.545178e-02
## [1] "103 variables selected, threshold = 0.05"
## var scores
## 1 gn_1 4.744091e-40
## 2 gn_2 3.591492e-02
## 3 gn_3 4.426174e-72
## 4 gc_1 3.754517e-64
## 5 gc_2 2.838399e-117
## 21 nn_16 3.219597e-02
## 25 nn_20 1.534896e-02
## 40 nn_35 3.404971e-02
## 42 nn_37 4.844582e-02
## 50 nn_45 1.054503e-02
## 53 nn_48 2.618896e-02
## 93 nn_88 3.240945e-02
## 95 nn_90 7.278843e-03
## 98 nn_93 1.748200e-02
## 144 nn_139 4.228818e-02
## 153 nn_148 1.463944e-02
## 244 nn_239 4.254657e-02
## 249 nn_244 4.379877e-02
## 282 nn_277 3.771266e-02
## 338 nn_333 1.652431e-02
## 353 nn_348 2.702906e-03
## 371 nn_366 4.976024e-02
## 383 nn_378 3.239405e-02
## 386 nn_381 1.125829e-02
## 388 nn_383 2.694754e-02
## 424 nn_419 3.732872e-02
## 426 nn_421 1.037611e-02
## 428 nn_423 2.653770e-02
## 433 nn_428 3.461036e-02
## 445 nn_440 2.518610e-02
## 460 nn_455 3.921065e-02
## 470 nn_465 3.625489e-03
## 478 nn_473 1.465016e-02
## 503 nn_498 1.977930e-02
## 524 nn_519 3.026417e-02
## 529 nn_524 2.162455e-02
## 552 nn_547 2.778526e-02
## 563 nn_558 1.895795e-02
## 565 nn_560 3.183238e-02
## 578 nn_573 4.064631e-02
## 596 nn_591 1.226371e-02
## 617 nn_612 8.157911e-03
## 631 nn_626 2.439658e-02
## 665 nn_660 1.843377e-02
## 697 nn_692 3.818386e-02
## 724 nn_719 3.046587e-02
## 726 nn_721 3.717127e-02
## 781 nn_776 2.499622e-02
## 784 nn_779 4.837252e-02
## 785 nn_780 2.672533e-02
## 832 nn_827 4.028992e-02
## 833 nn_828 7.904690e-03
## 848 nn_843 3.172189e-02
## 851 nn_846 3.228479e-02
## 852 nn_847 3.043253e-02
## 872 nn_867 3.829050e-02
## 885 nn_880 3.935873e-02
## 912 nn_907 2.991736e-02
## 952 nn_947 2.143719e-02
## 970 nn_965 3.839762e-02
## 978 nn_973 1.328706e-02
## 985 nn_980 2.555435e-02
## 1006 nc_1 2.983575e-03
## 1007 nc_2 3.160839e-02
## 1059 nc_54 1.426079e-02
## 1060 nc_55 3.073947e-02
## 1128 nc_123 2.984885e-02
## 1187 nc_182 2.355118e-02
## 1199 nc_194 2.514109e-02
## 1218 nc_213 2.816941e-02
## 1230 nc_225 1.576357e-02
## 1250 nc_245 4.320626e-02
## 1253 nc_248 4.925703e-02
## 1265 nc_260 6.512229e-03
## 1267 nc_262 2.641738e-02
## 1290 nc_285 9.559572e-03
## 1307 nc_302 2.732476e-02
## 1330 nc_325 4.497941e-02
## 1336 nc_331 3.163276e-02
## 1370 nc_365 5.091275e-03
## 1490 nc_485 7.549149e-03
## 1522 nc_517 1.481355e-02
## 1525 nc_520 4.977668e-02
## 1549 nc_544 2.044036e-02
## 1598 nc_593 2.518194e-02
## 1606 nc_601 2.689314e-03
## 1615 nc_610 2.097747e-02
## 1632 nc_627 2.792739e-02
## 1650 nc_645 5.622456e-03
## 1672 nc_667 3.779772e-02
## 1686 nc_681 3.638010e-02
## 1697 nc_692 4.903456e-02
## 1719 nc_714 3.363250e-02
## 1810 nc_805 1.363454e-02
## 1826 nc_821 2.870117e-02
## 1840 nc_835 1.659934e-02
## 1847 nc_842 1.051913e-02
## 1893 nc_888 1.580964e-02
## 1912 nc_907 9.193532e-03
## 1918 nc_913 1.375041e-02
## 1962 nc_957 3.220855e-02
## 1975 nc_970 2.818335e-02
## 1998 nc_993 1.545178e-02
threshold = 0.01
scoreplot(sframe, threshold, sort=-1) + coord_flip()
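As a sanity check, with 2000 noise variables a threshold of 0.01 admits about 2000 * 0.01 = 20 spurious variables by chance; the 13 noise variables among the 17 selected above are consistent with that rate. Likewise, thresholds of 0.025 and 0.05 should admit roughly 50 and 100 noise variables, matching the 42 and 98 spurious selections in the longer lists.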
Now try fitting a random forest on this data.
library('randomForest')
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# devtools::install_github("WinVector/WVPlots")
library('WVPlots')
## Loading required package: grid
## Loading required package: gridExtra
## Loading required package: reshape2
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
##
## Loading required package: plyr
## Loading required package: stringr
# convert all character vectors to factors for randomForest
dframe = as.data.frame(lapply(dframe, function(col) {
  if(is.character(col)) {as.factor(col)}
  else {col}
}))
allvars = sframe$var
selvars = sframe$var[sframe$scores < 0.01]
predframeTrain = data.frame(y=dframe$y)
# model on all variables
rfmodel_full = randomForest(x=dframe[,allvars], y=as.factor(dframe$y))
predframeTrain$trainfull = predict(rfmodel_full, newdata=dframe[,allvars], type='prob')[,'TRUE']
# model on only the variables that passed the filter
rfmodel_red = randomForest(dframe[,selvars], as.factor(dframe$y))
predframeTrain$trainred = predict(rfmodel_red, newdata=dframe[,selvars], type='prob')[,'TRUE']
# generate a fresh test set from the same distribution
testframe = mkData(N, coefs, nnoise)
testframe = as.data.frame(lapply(testframe, function(col) {
  if(is.character(col)) {as.factor(col)}
  else {col}
}))
predframeTest = data.frame(y=testframe$y)
predframeTest$testfull = predict(rfmodel_full, newdata=testframe[,allvars], type='prob')[,'TRUE']
predframeTest$testred = predict(rfmodel_red, newdata=testframe[,selvars], type='prob')[,'TRUE']
printutil = function(frame, column, yname, title) {
  print(DoubleDensityPlot(frame, column, yname, title=title))
  print(ROCPlot(frame, column, yname, title=title))
}
print("performance, all variables")
## [1] "performance, all variables"
printutil(predframeTrain, "trainfull", "y", "training performance, all variables")
printutil(predframeTest, "testfull", "y", "test performance, all variables")
print("performance, reduced variables")
## [1] "performance, reduced variables"
printutil(predframeTrain, "trainred", "y", "training performance, reduced variables")
printutil(predframeTest, "testred", "y", "test performance, reduced variables")