```{r}
library('stringr')
library('ggplot2')
```

```{r}
# read text as a sequence of token blocks
readText <- function(fName,firstLine=1000,lastLine=3000,chunkSize=200) {
  f <- file(fName)
  lines <- readLines(f)[firstLine:lastLine]
  close(f)
  text <- paste(str_trim(lines),collapse=' ')
  toks <- str_split(text,'[ \t\n]+')[[1]]
  groups <- split(toks, ceiling(seq_along(toks)/chunkSize))
  vapply(groups,function(s) paste(s,collapse=' '),c(''))
}

# return the unique 2-grams (without counts) from a single string
twoGramStr <- function(s) {
  s <- tolower(s)
  s <- str_replace_all(s,'[^a-z]+',' ')
  s <- str_trim(s)
  toks <- str_split(s,'[^a-z]+')[[1]]
  ntok <- length(toks)
  unique(paste(toks[seq_len(ntok-1)],toks[1+seq_len(ntok-1)]))
}

# from a list of documents get 2-gram counts
getCounts <- function(docList) {
  table(unlist(lapply(docList,twoGramStr)))
}

# vectorized lookup mapping novel values to zero
lookupVals <- function(stats,key) {
  v <- as.numeric(stats[key])
  v[is.na(v)] <- 0
  v
}
```

```{r}
# load text data
ShakespeareMacbeth <- readText('pg2264.txt.gz')
ShakespeareHamlet <- readText('pg1524.txt.gz')
MarloweEdwardII <- readText('pg20288.txt.gz')
MarloweFaustus <- readText('pg811.txt.gz')
head(MarloweEdwardII,n=1)
tail(MarloweEdwardII,n=1)
```

```{r}
# For demonstration purposes treat each block as a separate document
trainData <- rbind(
  data.frame(docs=ShakespeareMacbeth,title='Shakespeare Macbeth',
             isShakespeare=1,stringsAsFactors=FALSE),
  data.frame(docs=MarloweEdwardII,title='Marlowe EdwardII',
             isShakespeare=0,stringsAsFactors=FALSE)
)
# View(head(trainData))

# form all the estimates as in the last lecture
n <- nrow(trainData)
nY <- sum(trainData$isShakespeare)
nN <- n-nY
toks <- names(getCounts(trainData[,'docs']))
print(length(toks))

# This block is essentially the slide
# "The Bernoulli model implementation"
# of the presentation

# compute the isShakespeare==1 or y/yes statistics
yeStats <- double(length=length(toks))
names(yeStats) <- toks
yTab <- getCounts(trainData[trainData$isShakespeare==1,'docs'])
# copy non-zero counts into the overall vector; there may be 2-grams that
# only occur in isShakespeare==1 or isShakespeare==0
yeStats[names(yTab)] <- yTab

# compute the isShakespeare==0 or n/no statistics
# yeStats and neStats have the exact same length and indices
neStats <- double(length=length(toks))
names(neStats) <- toks
nTab <- getCounts(trainData[trainData$isShakespeare==0,'docs'])
# copy non-zero counts into the overall vector; there may be 2-grams that
# only occur in isShakespeare==1 or isShakespeare==0
neStats[names(nTab)] <- nTab

# This block is essentially the slide
# "Define smoothed estimates"

# compute the log-probability terms
# prior log-probabilities of class
lPriorY <- log((nY+1)/(n+2))
lPriorN <- log((nN+1)/(n+2))
# conditional log-probabilities of evidence given class
lpeY <- log((yeStats+1)/(nY+2))
lpeN <- log((neStats+1)/(nN+2))
# notice I didn't have to cast numbers to double to get double arithmetic,
# so I didn't have to write log((nY+1.0)/(n+2.0));
# even integer division returns a double in R
as.integer(1)/as.integer(2)

# This function is essentially the slide
# "To estimate a probability"
# return an estimate of P(isShakespeare|evidence)
# notice this function is stealing its data from
# the surrounding environment (I don't have to pass
# it in as parameters).
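# (added note, not from the original lecture slides) scoreTextDoc() below
# works on the log scale and subtracts shift = max(logPY,logPN) before
# exponentiating; this keeps exp() from underflowing when the summed
# log-probabilities are very negative, and the shift cancels in the
# final ratio PY/(PY+PN).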
scoreTextDoc <- function(text) {
  twoGrams <- twoGramStr(text)
  logPY <- lPriorY + sum(lookupVals(lpeY,twoGrams))
  logPN <- lPriorN + sum(lookupVals(lpeN,twoGrams))
  shift <- max(logPY,logPN)
  PY <- exp(logPY-shift)
  PN <- exp(logPN-shift)
  Z <- PY + PN
  PY/Z
}

# work some examples
scoreTextDoc(ShakespeareMacbeth[[1]])
scoreTextDoc(MarloweEdwardII[[1]])
scoreTextDoc(ShakespeareHamlet[[1]])
scoreTextDoc(MarloweFaustus[[1]])
scoreTextDoc(MarloweFaustus[[2]])
# Neil Armstrong
scoreTextDoc("One small step for a man, one large step for mankind.")
# Alfred Tennyson
scoreTextDoc("Tis better to have loved and lost than never to have loved at all.")
# Francis Bacon
scoreTextDoc("Hope is a good breakfast, but it is a bad supper.")

# score and plot data we trained on
trainData$pred <- vapply(trainData$docs,scoreTextDoc,c(0.0))
ggplot(data=trainData,
       aes(x=pred,color=as.factor(isShakespeare),fill=as.factor(isShakespeare))) +
  geom_histogram() +
  facet_wrap(~title,ncol=1) +
  ggtitle("train scores")
aggregate(pred~isShakespeare,data=trainData,FUN=mean)
aggregate(pred~isShakespeare,data=trainData,FUN=median)

# bring in two new plays
testData <- rbind(
  data.frame(docs=ShakespeareHamlet,title='Shakespeare Hamlet',
             isShakespeare=1,stringsAsFactors=FALSE),
  data.frame(docs=MarloweFaustus,title='Marlowe Faustus',
             isShakespeare=0,stringsAsFactors=FALSE)
)

# score and plot new data
testData$pred <- vapply(testData$docs,scoreTextDoc,c(0.0))
ggplot(data=testData,
       aes(x=pred,color=as.factor(isShakespeare),fill=as.factor(isShakespeare))) +
  geom_histogram() +
  facet_wrap(~title,ncol=1) +
  ggtitle("test scores")
aggregate(pred~isShakespeare,data=testData,FUN=mean)
aggregate(pred~isShakespeare,data=testData,FUN=median)
```
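As a quick follow-up (an added sketch, not part of the original walkthrough), we can turn the scores into hard classifications by thresholding at 0.5 and tabulate them against the known labels. The 0.5 cutoff is only an illustrative choice.

```{r}
# added sketch: confusion matrices for a hard 0.5 score cutoff
# (the 0.5 threshold is an illustrative assumption, not from the lecture)
table(truth=trainData$isShakespeare, calledShakespeare=trainData$pred>0.5)
table(truth=testData$isShakespeare, calledShakespeare=testData$pred>0.5)
```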