%load_ext rpy2.ipython
# load libraries (this example is in Python 3)
import re
import math
import random
import numpy.random
import csv
import sys
import time
import os
import gzip
import pandas
import numpy
import sklearn.ensemble
import sklearn.linear_model
import scipy.optimize
import contextlib
# Read car.data with header=None, then attach column names by hand.
d = pandas.read_csv('car.data', header=None)
yColumn = 'rating'
vars = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
d.columns = vars + [yColumn]
# Random hold-out mask: a row is marked as test when its uniform draw is < 0.2.
isTest = numpy.random.rand(d.shape[0]) < 0.2
# Boolean outcome: True for every rating other than 'unacc'.
want = d[yColumn].ne('unacc')
dTrain = d.loc[~isTest]
dTest = d.loc[isTest]
# Report the (rows, columns) of each split.
print(dTrain.shape)
print(dTest.shape)
# Try to train a logistic regression model on the columns exactly as loaded.
# DataFrame.as_matrix() was removed in pandas 1.0; selecting the columns and
# taking .values yields the same array on old and new pandas alike.
xvarsTrain = dTrain[vars].values
yvarTrain = want[~isTest]
model = sklearn.linear_model.LogisticRegression()
try:
    model = model.fit(xvarsTrain, yvarTrain)
except ValueError:
    # fit() rejected the input (presumably because the columns still hold
    # unencoded category labels); report it and move on to the encoded runs.
    print("caught error")
# Encode the variables the wrong way (ordinals or hashes), train and make predictions.
# Each column in `d` is overwritten in place with its pandas.factorize() integer
# codes, so the train/test frames are rebuilt from `d` afterwards.
for v in vars:
    labs = pandas.factorize(d[v])[0]
    d[v] = labs
dTrain = d[~isTest]
dTest = d[isTest]
# DataFrame.as_matrix() was removed in pandas 1.0; use column selection + .values.
xvarsTrain = dTrain[vars].values
yvarTrain = want[~isTest]
model = sklearn.linear_model.LogisticRegression()
model.fit(xvarsTrain, yvarTrain)
xvarsTest = dTest[vars].values
# predict() returns the predicted class labels for the held-out rows.
predTest = model.predict(xvarsTest)
yvarTest = want[isTest]
%%R -i predTest,yvarTest
# R cell (rpy2 cell magic): -i passes the Python objects predTest and yvarTest into R.
# No -o flag is given, so the `d` built below is an R-side object only.
library('ggplot2')
# Coerce both inputs to numeric so they can be compared against 0.5 below.
predTest <- as.numeric(predTest)
yvarTest <- as.numeric(yvarTest)
# predC holds the numeric prediction; want is the outcome turned back into a logical.
d <- data.frame(predC=predTest,want=yvarTest>0.5)
# Density of the predictions, one curve per value of want.
print(ggplot(data=d) +
geom_density(aes(x=predC,color=want)))
# Cross-tabulate the outcome (truth) against the prediction thresholded at 0.5 (decision).
print(table(truth=d[,'want'],decision=d[,'predC']>0.5))
# Encode the variables the right way (dummies/indicators).
# Import the submodule explicitly: the top of the file only imports
# sklearn.ensemble and sklearn.linear_model, so sklearn.preprocessing was
# reachable only if one of those happened to load it as a side effect.
import sklearn.preprocessing
enc = sklearn.preprocessing.OneHotEncoder()
# The encoder is fitted on the training rows only and then reused for the test rows.
enc.fit(dTrain[vars])
# transform() output is converted to a dense array with .toarray() before fitting.
xvarsTrain = enc.transform(dTrain[vars]).toarray()
yvarTrain = want[~isTest]
model = sklearn.linear_model.LogisticRegression()
model.fit(xvarsTrain, yvarTrain)
xvarsTest = enc.transform(dTest[vars]).toarray()
# predict() returns the predicted class labels for the held-out rows.
predTest = model.predict(xvarsTest)
yvarTest = want[isTest]
%%R -i predTest,yvarTest
# R cell (rpy2 cell magic): -i passes the Python objects predTest and yvarTest into R.
# No -o flag is given, so the `d` built below is an R-side object only.
library('ggplot2')
# Coerce both inputs to numeric so they can be compared against 0.5 below.
predTest <- as.numeric(predTest)
yvarTest <- as.numeric(yvarTest)
# predC holds the numeric prediction; want is the outcome turned back into a logical.
d <- data.frame(predC=predTest,want=yvarTest>0.5)
# Density of the predictions, one curve per value of want.
print(ggplot(data=d) +
geom_density(aes(x=predC,color=want)))
# Cross-tabulate the outcome (truth) against the prediction thresholded at 0.5 (decision).
print(table(truth=d[,'want'],decision=d[,'predC']>0.5))