In [1]:
%load_ext rpy2.ipython
In [2]:
# load libraries (this example is in Python 3)
import re
import math
import tarfile
import random
import numpy.random
import csv
import sys
import time
import os
import gzip
import pandas
import numpy
import sklearn.ensemble
import scipy.optimize
import contextlib
In [3]:
# load isolet data
dTrain = pandas.read_csv('isolet1+2+3+4.data.gz', compression='gzip', header=None)
dTest = pandas.read_csv('isolet5.data.gz', compression='gzip', header=None)
print(dTrain.shape)
print(dTest.shape)
targets = set([13,14]) # m and n
yColumn = 617
trainColumns = range(yColumn)
# select down to rows where class is one of our targets
dTrain = dTrain[dTrain[yColumn].isin(targets)]
dTest = dTest[dTest[yColumn].isin(targets)]
print(dTrain.shape)
print(dTest.shape)
(6238, 618)
(1559, 618)
(480, 618)
(119, 618)
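As a quick sanity check on the class codes (a sketch, not part of the original run): ISOLET labels the 26 letters 1 through 26 in alphabetical order, so 13 and 14 should correspond to 'm' and 'n'.
In [ ]:
# sketch: confirm the assumed 1..26 = 'a'..'z' label encoding
letterFor = {i + 1: chr(ord('a') + i) for i in range(26)}
print(letterFor[13], letterFor[14])  # expect: m n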

In [4]:
# train a GBM model and apply to test
model = sklearn.ensemble.GradientBoostingClassifier(random_state=numpy.random.RandomState(2362))
xvars = dTrain[list(trainColumns)].values
yvar = [y for y in dTrain[yColumn]]
model = model.fit(xvars,yvar)
xvarsTest = dTest[list(trainColumns)].values
# predict_proba() returns one column per class (in model.classes_ order);
# column 0 is the probability of class 13 ('m')
pred = [ row[0] for row in model.predict_proba(xvars) ]
predTest = [ row[0] for row in model.predict_proba(xvarsTest) ]
#print(pred)
yvarTest = [y for y in dTest[yColumn]]
#print(yvarTest)
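Taking row[0] above assumes class 13 ('m') occupies the first predict_proba() column. A small check (a sketch, not in the original notebook; it reuses model and xvarsTest from this cell):
In [ ]:
# sketch: verify which class each predict_proba() column corresponds to;
# model.classes_ should list 13 and 14 (possibly as floats), in column order
print(model.classes_)
print(model.predict_proba(xvarsTest[:3]))  # column 0 = P(class 13), column 1 = P(class 14)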
In [5]:
%%R
# load plotting fns
library('ggplot2')
library('reshape2')
library('ROCR')

yColumn <- 'isLetter'


# define some helper and reporting functions
# calculate the area under the curve of numeric vectors x,y (trapezoid rule)
# length(x)==length(y)
# y>=0, 0<=x<=1 and x increasing
areaCalc <- function(x,y) {
   # append extra points to get rid of degenerate cases
   x <- c(0,x,1)
   y <- c(0,y,1)
   n <- length(x)
   sum(0.5*(y[-1]+y[-n])*(x[-1]-x[-n]))
}


# needs ggplot2 and reshape2
# plot the gain curve: what fraction of the (numeric) target is captured
# as rows are taken in decreasing order of prediction
# example:
#  d <- data.frame(pred=runif(100))
#  d$y <- runif(100)<d$pred
#  gainCurve('test',d$y,d$pred)
gainCurve <- function(title,ycol,predcol) {
  ycol <- as.numeric(ycol)
  spearStat <- cor(predcol,ycol,method="spearman")
  d <- data.frame(predcol=predcol,ycol=ycol)
  predord <- order(d[['predcol']], decreasing=TRUE) # reorder, with highest first
  wizard <- order(d[['ycol']], decreasing=TRUE)
  npop <- dim(d)[1]
  
  results <- data.frame(pctpop= (1:npop)/npop,
                       model = cumsum(d[predord,'ycol'])/sum(d[['ycol']]),
                       wizard = cumsum(d[wizard, 'ycol'])/sum(d[['ycol']]))
  wideIDX <- which.max(results$model-results$pctpop)
  wFrame <- data.frame(wideX=results[wideIDX,'pctpop',drop=TRUE],
     wideY=results[wideIDX,'model',drop=TRUE])
  maxvgap <- wFrame$wideY-wFrame$wideX
  idealArea <- areaCalc(results$pctpop,results$wizard)
  modelArea <- areaCalc(results$pctpop,results$model)
  giniScore <- modelArea/idealArea
  
  results <- melt(results, id.vars="pctpop", measure.vars=c("model", "wizard"),
                 variable.name="sort_criterion", value.name="pct_outcome")
  
  gplot <- ggplot(data=results, aes(x=pctpop, y=pct_outcome, color=sort_criterion)) + 
    geom_point() + geom_line() + 
    geom_abline(color="gray") +
    ggtitle(paste("Gain curve,", title, '\n', 
       'relative Gini score', format(giniScore,digits=2), '\n',
       'Spearman rank correlation', format(spearStat,digits=2) , '\n',
       'maxVGap',format(maxvgap,digits=2))) +
       xlab("% items in score order") + ylab("% total category") +
       geom_segment(data=wFrame,
          aes(x=wideX,y=wideX,xend=wideX,yend=wideY),color="gray") +
       scale_x_continuous(breaks=seq(0,1,0.1)) +
       scale_y_continuous(breaks=seq(0,1,0.1))
  gplot
}
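
# in gainCurve() above: giniScore is the area under the model's gain curve
# divided by the area under the ideal ("wizard") curve, and maxvgap is the
# largest vertical gap between the model curve and the y=x diagonal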

plotROC <- function(title,outcol,predcol) {
  pred <- prediction(predcol,outcol)
  perf <- performance(pred,'tpr','fpr')
  auc <- as.numeric(performance(pred,'auc')@y.values)
  pf <- data.frame(
    FalsePositiveRate=perf@x.values[[1]],
    TruePositiveRate=perf@y.values[[1]])
  plot=ggplot() +
    geom_ribbon(data=pf,aes(x=FalsePositiveRate,ymax=TruePositiveRate,ymin=0),
      fill='blue',alpha=0.3) +
      geom_point(data=pf,aes(x=FalsePositiveRate,y=TruePositiveRate)) +
      geom_line(aes(x=c(0,1),y=c(0,1))) + coord_fixed() +
      ggtitle(paste(title,'\nAUC:',format(auc,digits=2)))
  list(pf=pf,plot=plot)
}


# binomial deviance: -2*sum(y*log(p) + (1-y)*log(1-p));
# epsilon>0 clips predictions away from 0 and 1 so the logs stay finite
deviance <- function(truth,pred,epsilon=0) {
  pred = pmax(pred, epsilon)
  pred = pmin(pred, 1-epsilon)
  S = 0.0 # assumed log-likelihood of saturated model
  -2*(sum(ifelse(truth,log(pred),log(1-pred)))-S)
}


reportStats <- function(d,test,modelName,title,epsilon) {
  dSub <- d[d$isTest==test,,drop=FALSE]
  tab <- table(truth=dSub[,yColumn],pred=dSub[,modelName]>0.5)
  accuracy <- (tab[1,1] + tab[2,2])/sum(tab)
  note = ifelse(test,'test','train')
  print(paste('\t',note,'accuracy',modelName,format(accuracy,digits=2)))
  residual.deviance <- deviance(dSub[,yColumn],dSub[,modelName],epsilon)
  #print(paste('\tresidual.deviance',residual.deviance))
  null.deviance <- deviance(dSub[,yColumn],mean(dSub[,yColumn]),epsilon)
  #print(paste('\tnull.deviance',null.deviance))
  print(paste("\tmodel explained a",
              format((1-residual.deviance/null.deviance),digits=2),
            "fraction of the variation on",note))  
}

report <- function(d,modelName,title,epsilon=1.0e-2) {
  print("***********")
  print(paste("model",modelName,title))
  reportStats(d,FALSE,modelName,title,epsilon)
  reportStats(d,TRUE,modelName,title,epsilon)
  print(ggplot(data=d[d$isTest==TRUE,,drop=FALSE],
               aes_string(x=modelName,color=yColumn)) + 
    geom_density() + 
    ggtitle(paste(title,'test')))
  print(plotROC(paste(title,'train'),
                d[d$isTest==FALSE,yColumn],
                d[d$isTest==FALSE,modelName])$plot)
  print(plotROC(paste(title,'test'),
                d[d$isTest==TRUE,yColumn],
                d[d$isTest==TRUE,modelName])$plot)
  print(gainCurve(paste(title,'train'),
                d[d$isTest==FALSE,yColumn],
                d[d$isTest==FALSE,modelName]))
  print(gainCurve(paste(title,'test'),
                d[d$isTest==TRUE,yColumn],
                d[d$isTest==TRUE,modelName]))
  print("***********")
}
Loading required package: gplots

Attaching package: ‘gplots’

The following object is masked from ‘package:stats’:

    lowess


In [6]:
%%R -i pred,yvar,predTest,yvarTest
# the -i flag above imports pred, yvar, predTest, and yvarTest from Python into R;
# assemble them into one frame and report/plot performance
dTrain <- data.frame(modelGBMpy=as.numeric(pred))
dTrain[[yColumn]] <- as.numeric(yvar)==13
dTrain$isTest <- FALSE
dTest <- data.frame(modelGBMpy=as.numeric(predTest))
dTest[[yColumn]] <- as.numeric(yvarTest)==13
dTest$isTest <- TRUE
d <- rbind(dTrain,dTest)

print(summary(d))
report(d,'modelGBMpy',"GBM")
   modelGBMpy         isLetter         isTest       
 Min.   :0.0004144   Mode :logical   Mode :logical  
 1st Qu.:0.0023632   FALSE:300       FALSE:480      
 Median :0.3880218   TRUE :299       TRUE :119      
 Mean   :0.4961387   NA's :0         NA's :0        
 3rd Qu.:0.9975971                                  
 Max.   :0.9995392                                  
[1] "***********"
[1] "model modelGBMpy GBM"
[1] "\t train accuracy modelGBMpy 1"
[1] "\tmodel explained a 0.98 fraction of the variation on train"
[1] "\t test accuracy modelGBMpy 0.89"
[1] "\tmodel explained a 0.65 fraction of the variation on test"
[1] "***********"
