In [1]:
%load_ext rpy2.ipython
In [2]:
# load libraries (this example is in Python 3)
import contextlib
import csv
import gzip
import math
import os
import random
import re
import sys
import time

import numpy
import numpy.random
import pandas
import scipy.optimize
import sklearn.ensemble
import sklearn.linear_model
import sklearn.preprocessing
In [3]:
# load the data (UCI "car evaluation" set: 6 categorical features + a rating column)
d = pandas.read_csv('car.data', header=None)  # pandas.io.parsers.read_csv is a private path; use the public pandas.read_csv
yColumn = 'rating'
vars = ['buying', 'maint', 'doors', 'persons',
                 'lug_boot', 'safety']  # NOTE: shadows the builtin vars(); kept because later cells reference this name
d.columns = vars + [yColumn]
# hold out ~20% of rows as a test set
# NOTE(review): no random seed is set, so the train/test split (and all downstream
# numbers) differ on every run — consider numpy.random.seed(...) for reproducibility
isTest = numpy.random.rand(len(d)) < 0.2
want = d[yColumn] != 'unacc'  # binary target: acceptable (True) vs unacceptable (False)
dTrain = d[~isTest]
dTest = d[isTest]
print(dTrain.shape)
print(dTest.shape)
(1371, 7)
(357, 7)

In [4]:
# try to train a logistic regression model directly on the raw string factors
# (the original comment said "GBM", but the model below is LogisticRegression)
xvarsTrain = dTrain[vars].to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
yvarTrain = want[~isTest]
model = sklearn.linear_model.LogisticRegression()

# sklearn estimators cannot fit on string-valued features, so fit() raises
# ValueError; catching and reporting it is the point of this demonstration
try:
  model = model.fit(xvarsTrain, yvarTrain)
except ValueError:
  print("caught error")
caught error

In [5]:
# encode the variables the WRONG way (integer ordinals), train and make predictions
# (ordinal codes impose an arbitrary, fake ordering on unordered categories,
# which degrades a linear model's fit)

for v in vars:
   labs = pandas.factorize(d[v])[0]  # replace each level with an arbitrary integer code
   d[v] = labs


dTrain = d[~isTest]
dTest = d[isTest]

xvarsTrain = dTrain[vars].to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0
yvarTrain = want[~isTest]
model = sklearn.linear_model.LogisticRegression()
model.fit(xvarsTrain, yvarTrain)

xvarsTest = dTest[vars].to_numpy()
predTest = model.predict(xvarsTest)
yvarTest = want[isTest]
In [6]:
%%R -i predTest,yvarTest

# Plot the density of the ordinal-encoded model's decisions split by true class,
# then print the confusion matrix at a 0.5 decision threshold.
library('ggplot2')
# boolean vectors transferred from Python via rpy2; coerce to numeric 0/1
predTest <- as.numeric(predTest)
yvarTest <- as.numeric(yvarTest)
d <- data.frame(predC=predTest,want=yvarTest>0.5)
print(ggplot(data=d) + 
  geom_density(aes(x=predC,color=want)))
print(table(truth=d[,'want'],decision=d[,'predC']>0.5))
Use suppressPackageStartupMessages to eliminate package startup messages.
       decision
truth   FALSE TRUE
  FALSE   240   20
  TRUE     23   74

In [7]:
# encode the variables the right way (dummies/indicator columns)
# handle_unknown='ignore': the encoder is fit on dTrain only, so a category
# level that happens to fall entirely in the random test split would otherwise
# make transform(dTest) raise; 'ignore' maps unseen levels to an all-zero row
enc = sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')
enc.fit(dTrain[vars])

xvarsTrain = enc.transform(dTrain[vars]).toarray()
yvarTrain = want[~isTest]
model = sklearn.linear_model.LogisticRegression()
model.fit(xvarsTrain, yvarTrain)

xvarsTest = enc.transform(dTest[vars]).toarray()
predTest = model.predict(xvarsTest)
yvarTest = want[isTest]
In [8]:
%%R -i predTest,yvarTest

# Same diagnostics as before, now for the one-hot encoded model:
# decision density by true class, then the confusion matrix at 0.5.
library('ggplot2')
# boolean vectors transferred from Python via rpy2; coerce to numeric 0/1
predTest <- as.numeric(predTest)
yvarTest <- as.numeric(yvarTest)
d <- data.frame(predC=predTest,want=yvarTest>0.5)
print(ggplot(data=d) + 
  geom_density(aes(x=predC,color=want)))
print(table(truth=d[,'want'],decision=d[,'predC']>0.5))
       decision
truth   FALSE TRUE
  FALSE   248   12
  TRUE      4   93

In [8]: