library('stringr')
library('ggplot2')
# read text as a sequence of token blocks
readText <- function(fName,firstLine=1000,lastLine=3000,chunkSize=200) {
f <- file(fName)
lines <- readLines(f)[firstLine:lastLine]
close(f)
text <- paste(str_trim(lines),collapse=' ')
toks <- str_split(text,'[ \t\n]+')[[1]]
groups <- split(toks, ceiling(seq_along(toks)/chunkSize))
vapply(groups,function(s) paste(s,collapse=' '),c(''))
}
# return list of two-grams (without counts) from a single string
twoGramStr <- function(s) {
s <- tolower(s)
s <- str_replace_all(s,'[^a-z]+',' ')
s <- str_trim(s)
toks <- str_split(s,'[^a-z]+')[[1]]
ntok <- length(toks)
unique(paste(toks[seq_len(ntok-1)],toks[1+seq_len(ntok-1)]))
}
# from a list of documents get 2-gram counts
getCounts <- function(docList) {
table(unlist(lapply(docList,twoGramStr)))
}
# vectorized lookup mapping novel values to zero
lookupVals <- function(stats,key) {
v <- as.numeric(stats[key])
v[is.na(v)] <- 0
v
}
# load text data
ShakespeareMacbeth <- readText('pg2264.txt.gz')
ShakespeareHamlet <- readText('pg1524.txt.gz')
MarloweEdwardII <- readText('pg20288.txt.gz')
MarloweFaustus <- readText('pg811.txt.gz')
head(MarloweEdwardII,n=1)
## 1
## "Unto the proudest peer of Britainy. Thou that compar'st him to a flying-fish, And threaten'st death whether he rise or fall, 'Tis not the hugest monster of the sea, Nor foulest harpy, that shall swallow him. _Y. Mor._ If in his absence thus he favours him, What will he do whenas he shall be present? _Lan._ That shall we see: look, where his lordship come! _Enter_ GAVESTON. _K. Edw._ My Gaveston! Welcome to Tynmouth! welcome to thy friend! Thy absence made me droop and pine away; For, as the lovers of fair Danaë, When she was lock'd up in a brazen tower, Desir'd her more, and wax'd outrageous, So did it fare with me: and now thy sight Is sweeter far than was thy parting hence Bitter and irksome to my sobbing heart. _Gav._ Sweet lord and king, your speech preventeth mine; Yet have I words left to express my joy: The shepherd, nipt with biting winter's rage, Frolics not more to see the painted spring Than I do to behold your majesty. _K. Edw._ Will none of you salute my Gaveston? _Lan._ Salute him! yes.--Welcome, Lord Chamberlain! _Y. Mor._ Welcome is the good Earl of Cornwall! _War._ Welcome, Lord"
tail(MarloweEdwardII,n=1)
## 76
## "I am frighted with thy words: My father's murder'd through thy treachery; And thou shalt die, and on his mournful hearse Thy hateful and accursed head shall lie, To witness to the world that by thy means His kingly body was too soon interr'd. _Q. Isab._ Weep not, sweet son. _K. Edw. Third._ Forbid not me to weep; he was my father; And had you lov'd him half so well as I, You could not bear his death thus patiently: But you, I fear, conspir'd with Mortimer. _First Lord._ Why speak you not unto my lord the king? _Y. Mor._ Because I think scorn to be accus'd. Who is the man dares say I murder'd him? _K. Edw. Third._ Traitor, in me my loving father speaks,"
# For demonstration purposes use count each block as a separate document
trainData <- rbind(
data.frame(docs=ShakespeareMacbeth,title='Shakespeare Macbeth',
isShakespeare=1,stringsAsFactors=FALSE),
data.frame(docs=MarloweEdwardII,title='Marlowe EdwardII',
isShakespeare=0,stringsAsFactors=FALSE)
)
# View(head(trainData))
# form all the estimates as in the last lecture
n <- nrow(trainData)
nY <- sum(trainData$isShakespeare)
nN <- n-nY
toks <- names(getCounts(trainData[,'docs']))
print(length(toks))
## [1] 18932
# This block is essentially the slide
# "The Bernoulli model implementation"
# of the presentation
# compute isShakespeare==1 or y/yes statistics
yeStats <- double(length=length(toks))
names(yeStats) <- toks
yTab <- getCounts(trainData[trainData$isShakespeare==1,'docs'])
# copy non-zero counts into oveall vector, there may be 2-grams that
# only occur in isShakespeare==1 or isShakespeare==0
yeStats[names(yTab)] <- yTab
#
# compute isShakespeare==1 or n/no statistics
# yStats and nStats have the exact same lenght and indices
neStats <- double(length=length(toks))
names(neStats) <- toks
nTab <- getCounts(trainData[trainData$isShakespeare==0,'docs'])
# copy non-zero counts into oveall vector, there may be 2-grams that
# only occur in isShakespeare==1 or isShakespeare==0
neStats[names(nTab)] <- nTab
# This block is essentailly the slide
# "Define smoothed estimates"
# compute the log-probability terms
# prior log-probabilites of class
lPriorY <- log((nY+1)/(n+2))
lPriorN <- log((nN+1)/(n+2))
# conditional log-probabilties of evidence given class
lpeY <- log((yeStats+1)/(nY+2))
lpeN <- log((neStats+1)/(nN+2))
# notice I didn't have to cast numbers to double to get
# double arithmetic
# so I didn't have to write log((nY+1.0)/(n+2.0))
as.integer(1)/as.integer(2)
## [1] 0.5
# This function is essentially the slide
# "To estimate a probability"
# return an estimate of p(isShakespeare|evidence)
# notice this funciton is stealing its data from
# the surrounding environment (don't have to pass
# them in as parameters).
scoreTextDoc <- function(text) {
twoGrams <- twoGramStr(text)
logPY <- lPriorY + sum(lookupVals(lpeY,twoGrams))
logPN <- lPriorN + sum(lookupVals(lpeN,twoGrams))
shift <- max(logPY,logPN)
PY <- exp(logPY-shift)
PN <- exp(logPN-shift)
Z <- PY + PN
PY/Z
}
# work some examples
scoreTextDoc(ShakespeareMacbeth[[1]])
## [1] 1
scoreTextDoc(MarloweEdwardII[[1]])
## [1] 1.029921e-42
scoreTextDoc(ShakespeareHamlet[[1]])
## [1] 0.9999831
scoreTextDoc(MarloweFaustus[[1]])
## [1] 0.9948094
scoreTextDoc(MarloweFaustus[[2]])
## [1] 2.626085e-05
# Neil Armstrong
scoreTextDoc("One small step for a man, one large step for mankind.")
## [1] 0.798879
# Alfred Tennyson
scoreTextDoc("Tis better to have loved and lost than never to have loved at all.")
## [1] 0.383175
# Francis Bacon
scoreTextDoc("Hope is a good breakfast, but it is a bad supper.")
## [1] 0.937742
# score and plot data we trained on
trainData$pred <- vapply(trainData$docs,scoreTextDoc,c(0.0))
ggplot(data=trainData,aes(x=pred,color=as.factor(isShakespeare),fill=as.factor(isShakespeare))) +
geom_histogram() + facet_wrap(~title,ncol=1) + ggtitle("train scores")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
aggregate(pred~isShakespeare,data=trainData,FUN=mean)
## isShakespeare pred
## 1 0 2.048874e-30
## 2 1 1.000000e+00
aggregate(pred~isShakespeare,data=trainData,FUN=median)
## isShakespeare pred
## 1 0 4.100032e-47
## 2 1 1.000000e+00
# bring in two new plays
testData <- rbind(
data.frame(docs=ShakespeareHamlet,title='Shakespeare Hamlet',
isShakespeare=1,stringsAsFactors=FALSE),
data.frame(docs=MarloweFaustus,title='Marlowe Faustus',
isShakespeare=0,stringsAsFactors=FALSE)
)
# score and plot new data
testData$pred <- vapply(testData$docs,scoreTextDoc,c(0.0))
ggplot(data=testData,aes(x=pred,color=as.factor(isShakespeare),fill=as.factor(isShakespeare))) +
geom_histogram() + facet_wrap(~title,ncol=1) + ggtitle("test scores")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
aggregate(pred~isShakespeare,data=testData,FUN=mean)
## isShakespeare pred
## 1 0 0.1682885
## 2 1 0.5926906
aggregate(pred~isShakespeare,data=testData,FUN=median)
## isShakespeare pred
## 1 0 0.003132988
## 2 1 0.913684556