NBDB

# load libraries
library('stringr')
library('ggplot2')
# read from a JDBC connection to an arbitrary database
options( java.parameters = "-Xmx2g" )
library('RJDBC')

## Loading required package: DBI
## Loading required package: rJava

# define somem functions

# read text as a sequence of token blocks
readText <- function(fName,firstLine=1000,lastLine=3000,chunkSize=200) {
  f <- file(fName)
  lines <- readLines(f)[firstLine:lastLine]
  close(f)
  text <- paste(str_trim(lines),collapse=' ')
  toks <- str_split(text,'[ \t\n]+')[[1]]
  groups <- split(toks, ceiling(seq_along(toks)/chunkSize))
  vapply(groups,function(s) paste(s,collapse=' '),c(''))
}

# return list of two-grams (without counts) from a single string
twoGramStr <- function(docID,s) {
  s <- tolower(s)
  s <- str_replace_all(s,'[^a-z]+',' ')
  s <- str_trim(s)
  toks <- str_split(s,'[^a-z]+')[[1]]
  ntok <- length(toks)
  data.frame(docID=docID,
             feature=unique(paste(toks[seq_len(ntok-1)],toks[1+seq_len(ntok-1)])))
}

executeDBQuery <- function(con,query) {
  res <- dbSendQuery(con,query)
  d <- fetch(res,-1)
  dbClearResult(res)
  d
}

# executeDBQuery(con,'SELECT a FROM ( SELECT 1 a UNION SELECT NULL a) s')
# executeDBQuery(con,'SELECT COUNT(a) FROM ( SELECT 1 a UNION SELECT NULL a) s')

dropTable <- function(con,tableName) {
   # dbExistsTable() seems to have different case sensitivity than dropping
   tryCatch(
      dbSendUpdate(con,paste('DROP TABLE',tableName)),
      error = function(e) {e},
      warn = function(w) {w})
}

mkTable <- function(con,tableName,columnString) {
  dropTable(con,tableName)
  msg = dbSendUpdate(con,paste('CREATE TABLE ',tableName,columnString))
}

enterDocDef <- function(con,tableName,docID,isShakespeare) {
  # remove any previous copy
  dbSendUpdate(con,
      paste('DELETE FROM ',tableName,' WHERE docID=\'',docID,'\'',sep=''))
  dbSendUpdate(con,
         paste('INSERT INTO ',tableName,' VALUES (?,?)'),
         docID,isShakespeare)
}


enterDocFeatureRelations <- function(con,tableName,docID,docText) {
  # remove any previous copy
  dbSendUpdate(con,
      paste('DELETE FROM ',tableName,' WHERE docID=\'',docID,'\'',sep=''))
  toks <- twoGramStr(docID,docText)
  for(i in seq_len(nrow(toks))) {
    dbSendUpdate(con,
        paste('INSERT INTO ',tableName,' VALUES (?,?)'),
        docID,toks[i,'feature'])
  }
}




# load text data

ShakespeareMacbeth <- readText('pg2264.txt.gz')
ShakespeareHamlet <- readText('pg1524.txt.gz')
MarloweEdwardII <- readText('pg20288.txt.gz')
MarloweFaustus <- readText('pg811.txt.gz')

head(MarloweEdwardII,n=1)

##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          1 
## "Unto the proudest peer of Britainy. Thou that compar'st him to a flying-fish, And threaten'st death whether he rise or fall, 'Tis not the hugest monster of the sea, Nor foulest harpy, that shall swallow him. _Y. Mor._ If in his absence thus he favours him, What will he do whenas he shall be present? _Lan._ That shall we see: look, where his lordship come! _Enter_ GAVESTON. _K. Edw._ My Gaveston! Welcome to Tynmouth! welcome to thy friend! Thy absence made me droop and pine away; For, as the lovers of fair Danaë, When she was lock'd up in a brazen tower, Desir'd her more, and wax'd outrageous, So did it fare with me: and now thy sight Is sweeter far than was thy parting hence Bitter and irksome to my sobbing heart. _Gav._ Sweet lord and king, your speech preventeth mine; Yet have I words left to express my joy: The shepherd, nipt with biting winter's rage, Frolics not more to see the painted spring Than I do to behold your majesty. _K. Edw._ Will none of you salute my Gaveston? _Lan._ Salute him! yes.--Welcome, Lord Chamberlain! _Y. Mor._ Welcome is the good Earl of Cornwall! _War._ Welcome, Lord"

tail(MarloweEdwardII,n=1)

##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     76 
## "I am frighted with thy words: My father's murder'd through thy treachery; And thou shalt die, and on his mournful hearse Thy hateful and accursed head shall lie, To witness to the world that by thy means His kingly body was too soon interr'd. _Q. Isab._ Weep not, sweet son. _K. Edw. Third._ Forbid not me to weep; he was my father; And had you lov'd him half so well as I, You could not bear his death thus patiently: But you, I fear, conspir'd with Mortimer. _First Lord._ Why speak you not unto my lord the king? _Y. Mor._ Because I think scorn to be accus'd. Who is the man dares say I murder'd him? _K. Edw. Third._ Traitor, in me my loving father speaks,"

# For demonstration purposes use count each block as a separate document
trainData <- rbind(
    data.frame(docText=ShakespeareMacbeth,title='Shakespeare Macbeth',
               isShakespeare=1,stringsAsFactors=FALSE),
    data.frame(docText=MarloweEdwardII,title='Marlowe EdwardII',
               isShakespeare=0,stringsAsFactors=FALSE)
)
trainData$docID <- paste('trainDoc',seq_len(nrow(trainData)))
# bring in two new plays
testData <- rbind(
    data.frame(docText=ShakespeareHamlet,title='Shakespeare Hamlet',
               isShakespeare=1,stringsAsFactors=FALSE),
    data.frame(docText=MarloweFaustus,title='Marlowe Faustus',
               isShakespeare=0,stringsAsFactors=FALSE)
)
testData$docID <- paste('testDoc',seq_len(nrow(testData)))

# load the data into a database
drv <- JDBC('org.h2.Driver','h2-1.3.176.jar',identifier.quote='"')
# remove the file NBDB.h2db.* to create new one
con <- dbConnect(drv,'jdbc:h2:NBDB.h2db')



mkTable(con,'docs','(docID VARCHAR(255), isShakespeare INTEGER)')
mkTable(con,'relationDocsAndFeatures','(docID VARCHAR(255), feature VARCHAR(255))')

enterDocDef(con,'docs',
            trainData[1,'docID'],
            trainData[1,'isShakespeare'])
enterDocFeatureRelations(con,'relationDocsAndFeatures',
                         trainData[1,'docID'],
                         trainData[1,'docText'])

head(dbReadTable(con,'docs'))

##        DOCID ISSHAKESPEARE
## 1 trainDoc 1             1

head(dbReadTable(con,'relationDocsAndFeatures'))

##        DOCID         FEATURE
## 1 trainDoc 1        we still
## 2 trainDoc 1      still haue
## 3 trainDoc 1  haue iudgement
## 4 trainDoc 1 iudgement heere
## 5 trainDoc 1      heere that
## 6 trainDoc 1         that we

# some examples of SQL

# SQL is great at aggregaton
print(head(executeDBQuery(con,'
   SELECT
      docID,
      COUNT(1)
   FROM
      relationDocsAndFeatures
   GROUP BY
      docID
')))

##        DOCID COUNT(1)
## 1 trainDoc 1      209

# bringing in data is by joins
print(head(executeDBQuery(con,'
   SELECT
      relationDocsAndFeatures.docID,
      relationDocsAndFeatures.feature,
      docs.isShakespeare
   FROM
      relationDocsAndFeatures
   JOIN
      docs
   ON
      docs.docID=relationDocsAndFeatures.docID
')))

##        DOCID         FEATURE ISSHAKESPEARE
## 1 trainDoc 1        we still             1
## 2 trainDoc 1      still haue             1
## 3 trainDoc 1  haue iudgement             1
## 4 trainDoc 1 iudgement heere             1
## 5 trainDoc 1      heere that             1
## 6 trainDoc 1         that we             1

print(executeDBQuery(con,'
SELECT
  *
FROM
  ( SELECT
      r1.docID,
      r1.feature,
      COUNT(r2.feature) ord
   FROM
      relationDocsAndFeatures r1
   LEFT JOIN
      relationDocsAndFeatures r2
   ON
      r1.docID=r2.docID
      AND r2.feature<r1.feature
   GROUP BY
      r1.docID,
      r1.feature
   ORDER BY
      r1.docID,
      r1.feature ) sub
WHERE
   ord<=5
'))

##        DOCID        FEATURE ORD
## 1 trainDoc 1        a naked   0
## 2 trainDoc 1    against his   1
## 3 trainDoc 1    against the   2
## 4 trainDoc 1    almost supt   3
## 5 trainDoc 1         am his   4
## 6 trainDoc 1 ambition which   5

#entero all of the data
for(i in seq_len(nrow(trainData))) {
  enterDocDef(con,'docs',
              trainData[i,'docID'],
              trainData[i,'isShakespeare'])
  enterDocFeatureRelations(con,'relationDocsAndFeatures',
                           trainData[i,'docID'],
                           trainData[i,'docText'])
}
dbSendUpdate(con,'CREATE INDEX DFI1 ON relationDocsAndFeatures(feature)')
dbSendUpdate(con,'CREATE INDEX DFI2 ON relationDocsAndFeatures(docID)')
dbSendUpdate(con,'CREATE UNIQUE INDEX DFU ON relationDocsAndFeatures(docID,feature)')
dbSendUpdate(con,'CREATE INDEX DDI1 ON docs(isShakespeare)')
dbSendUpdate(con,'CREATE UNIQUE INDEX DDUI ON docs(docID)')

# put in the extra data we need
# it would probably be better to do this another table, but for this demo we will just add the column
dbSendUpdate(con,'ALTER TABLE relationDocsAndFeatures ADD COLUMN isShakespeare INTEGER')
dbSendUpdate(con,'UPDATE relationDocsAndFeatures SET isShakespeare = (SELECT isShakespeare FROM docs WHERE docs.docID=relationDocsAndFeatures.docID)')
dbSendUpdate(con,'CREATE INDEX DFI3 ON relationDocsAndFeatures(isShakespeare)')

print(executeDBQuery(con,'SELECT * from relationDocsAndFeatures where feature=\'peer of\''))

##         DOCID FEATURE ISSHAKESPEARE
## 1 trainDoc 56 peer of             0

# build the naive Bayes model

# total counts
mkTable(con,'totals','(n REAL, nC REAL, isShakespeare INTEGER, prior REAL, logPrior REAL)')
dbSendUpdate(con,'
   INSERT INTO totals
   SELECT 
      *,
      (1.0+nC)/(2.0+n) prior,
      log((1.0+nC)/(2.0+n)) logPrior
   FROM (
      SELECT 
         (SELECT COUNT(1) FROM docs) n,
         COUNT(1) nC,
         docs.isShakespeare isShakespeare
      FROM
         docs
      GROUP BY
         isShakespeare
    ) innertable
   ')
head(dbReadTable(con,'totals'))

##     N NC ISSHAKESPEARE     PRIOR   LOGPRIOR
## 1 131 76             0 0.5789474 -0.5465437
## 2 131 55             1 0.4210526 -0.8649974

# counts by class and feature
mkTable(con,'countFootprint','(feature VARCHAR(255), isShakespeare INTEGER, n REAL, nc REAL)')
dbSendUpdate(con,'
   INSERT INTO countFootprint
      SELECT 
        feature,
        isShakespeare,
        MAX(n) n,
        MAX(nc) nc
      FROM
        ( SELECT feature FROM relationDocsAndFeatures GROUP by feature ) featlist
      JOIN
        totals
      ON
        1=1
      GROUP BY
        featlist.feature,
        isShakespeare
   ')
head(dbReadTable(con,'countFootprint'))

##            FEATURE ISSHAKESPEARE   N NC
## 1         swear we             0 131 76
## 2         swear we             1 131 55
## 3        who frets             0 131 76
## 4        who frets             1 131 55
## 5      you returne             0 131 76
## 6 highness quickly             0 131 76

dbSendUpdate(con,'CREATE INDEX CFI1 ON countFootprint(feature)')
dbSendUpdate(con,'CREATE INDEX CFI2 ON countFootprint(isShakespeare)')
dbSendUpdate(con,'CREATE UNIQUE INDEX CFU ON countFootprint(feature,isShakespeare)')

mkTable(con,'conditionalCounts','(feature VARCHAR(255), isShakespeare INTEGER, nc REAL, nf REAL, pCgivenY REAL, logpCgivenY REAL)')
dbSendUpdate(con,'
   INSERT INTO conditionalCounts
     SELECT
      *,
      (1.0+nf)/(2.0+nc) pCgivenY,
      log((1.0+nf)/(2.0+nc)) logpCgivenY
     FROM (
      SELECT 
        countFootprint.feature feature,
        countFootprint.isShakespeare isShakespeare,
        MAX(countFootprint.nc) nc,
        COUNT(relationDocsAndFeatures.feature) nf
      FROM
        countFootprint
      LEFT JOIN
        relationDocsAndFeatures
      ON
        relationDocsAndFeatures.feature=countFootprint.feature
        AND relationDocsAndFeatures.isShakespeare=countFootprint.isShakespeare
      GROUP BY
        countFootprint.feature,
        countFootprint.isShakespeare
      ) summaries
   ')
head(dbReadTable(con,'conditionalCounts'))

##      FEATURE ISSHAKESPEARE NC NF   PCGIVENY LOGPCGIVENY
## 1 a baboones             0 76  0 0.01282051   -4.356709
## 2 a baboones             1 55  1 0.03508772   -3.349904
## 3     a band             0 76  1 0.02564103   -3.663562
## 4     a band             1 55  0 0.01754386   -4.043051
## 5   a barren             0 76  0 0.01282051   -4.356709
## 6   a barren             1 55  1 0.03508772   -3.349904

dbSendUpdate(con,'CREATE INDEX CDI1 ON conditionalCounts(feature)')
dbSendUpdate(con,'CREATE INDEX CDI2 ON conditionalCounts(isShakespeare)')
dbSendUpdate(con,'CREATE UNIQUE INDEX CFCU ON conditionalCounts(feature,isShakespeare)')

print(executeDBQuery(con,'SELECT * from conditionalCounts where feature=\'i am\''))

##   FEATURE ISSHAKESPEARE NC NF  PCGIVENY LOGPCGIVENY
## 1    i am             0 76 11 0.1538462   -1.871802
## 2    i am             1 55 16 0.2982456   -1.209838

print(executeDBQuery(con,'SELECT * from conditionalCounts where feature=\'me la\''))

##   FEATURE ISSHAKESPEARE NC NF   PCGIVENY LOGPCGIVENY
## 1   me la             0 76  0 0.01282051   -4.356709
## 2   me la             1 55  1 0.03508772   -3.349904

print(executeDBQuery(con,'SELECT * from conditionalCounts where feature=\'peer of\''))

##   FEATURE ISSHAKESPEARE NC NF   PCGIVENY LOGPCGIVENY
## 1 peer of             0 76  1 0.02564103   -3.663562
## 2 peer of             1 55  0 0.01754386   -4.043051

mkTable(con,'scoringTable','(docID VARCHAR(255), feature VARCHAR(255))')

scoreTextDoc <- function(text) {
  enterDocFeatureRelations(con,'scoringTable','score tmp',text)
  # dbReadTable(con,'scoringTable')
  d <- executeDBQuery(con,'
     SELECT
        docID,
        totals.isShakespeare,
        MAX(totals.logPrior) logPrior,
        SUM(COALESCE(logpCgivenY,0.0)) logpCgivenY
     FROM
        scoringTable
     JOIN
        totals
     ON
        1=1
     LEFT JOIN
        conditionalCounts
     ON
        scoringTable.feature=conditionalCounts.feature
        AND totals.isShakespeare=conditionalCounts.isShakespeare
     WHERE
        scoringTable.docID=\'score tmp\'
     GROUP BY
        docID,
        totals.isShakespeare
  ')
  logPY <- d[d$ISSHAKESPEARE==1,'LOGPRIOR'] + d[d$ISSHAKESPEARE==1,'LOGPCGIVENY']
  logPN <- d[d$ISSHAKESPEARE==0,'LOGPRIOR'] + d[d$ISSHAKESPEARE==0,'LOGPCGIVENY']
  shift <- max(logPY,logPN)
  PY <- exp(logPY-shift)
  PN <- exp(logPN-shift)
  Z <- PY + PN
  PY/Z
}

# work some examples
scoreTextDoc(ShakespeareMacbeth[[1]])

## [1] 1

scoreTextDoc(MarloweEdwardII[[1]])

## [1] 1.029912e-42

scoreTextDoc(ShakespeareHamlet[[1]])

## [1] 0.9999831

scoreTextDoc(MarloweFaustus[[1]])

## [1] 0.9948094

scoreTextDoc(MarloweFaustus[[2]])

## [1] 2.626084e-05

# Neil Armstrong
scoreTextDoc("One small step for a man, one large step for mankind.")

## [1] 0.798879

# Alfred Tennyson
scoreTextDoc("Tis better to have loved and lost than never to have loved at all.")

## [1] 0.3831751

# Francis Bacon
scoreTextDoc("Hope is a good breakfast, but it is a bad supper.")

## [1] 0.937742

dbDisconnect(con)

## [1] TRUE