Support materials for Win-Vector LLC blog article Relative error distributions, without the heavy tail theatrics.
knitr::opts_chunk$set(echo = TRUE)
library('ggplot2')
# draw item i with probability pi
draw <- function(universe,n) {
u <- sort(runif(n))
idxs <- findInterval(u,c(0,universe$cumsum),rightmost.closed = TRUE)
idxs <- pmin(idxs,nrow(universe))
# permute
idxs <- idxs[sample.int(n,n,replace=FALSE)]
idxs
}
nRanks = 200
sampleSize = 1000
contrainPosition = 20
# generate population of items with Zipfian probabilities
set.seed(23234)
zipfianUniverse <- data.frame(theoreticalRank=seq_len(nRanks))
zipfianUniverse$theoreticalProb <- 1/zipfianUniverse$theoreticalRank^1.1
zipfianUniverse$theoreticalProb <- zipfianUniverse$theoreticalProb/sum(zipfianUniverse$theoreticalProb)
zipfianUniverse <- zipfianUniverse[order(zipfianUniverse$theoreticalProb,decreasing=TRUE),,
drop=FALSE]
zipfianUniverse$cumsum <- cumsum(zipfianUniverse$theoreticalProb)
zipfianUniverse$theoreticalProbConstrained <- zipfianUniverse$theoreticalProb*
ifelse(zipfianUniverse$theoreticalRank>=contrainPosition,0.01,1.0)
zipfianUniverse$missedMass = zipfianUniverse$theoreticalProb -
zipfianUniverse$theoreticalProbConstrained
zipfianUniverse$missedMass[zipfianUniverse$missedMass<=0] <- NA
zipfianUniverse$theoreticalProbConstrained <- zipfianUniverse$theoreticalProbConstrained/
sum(zipfianUniverse$theoreticalProbConstrained)
observedZ <- data.frame(theoreticalRank=draw(zipfianUniverse,sampleSize))
countsZ <- as.data.frame(table(theoreticalRank=observedZ$theoreticalRank))
countsZ$theoreticalRank <- as.integer(as.character(countsZ$theoreticalRank))
countsZ$Freq <- countsZ$Freq/sum(countsZ$Freq)
countsZ$observedRank <- rank(-countsZ$Freq)
observedZ$Freq <- countsZ$Freq[match(observedZ$theoreticalRank,
countsZ$theoreticalRank)]
ggplot(data=zipfianUniverse,aes(x=theoreticalRank,y=theoreticalProb,
ymin=0,ymax=theoreticalProb)) +
geom_line() +
ggtitle("plot of Zipfian original population\ntheoretical item probability as a function of rank\nZipfian original universe")
ggplot(data=zipfianUniverse,aes(x=theoreticalRank,y=theoreticalProb,
ymin=0,ymax=theoreticalProb)) +
geom_ribbon(alpha=0.2) +
geom_ribbon(aes(ymax=missedMass),fill='blue') +
geom_line() +
ggtitle("plot of Zipfian original population\ntheoretical item probability as a function of rank\nZipfian original universe")
sum(zipfianUniverse$missedMass,na.rm=TRUE)
## [1] 0.3254028
ggplot(data=zipfianUniverse,aes(x=theoreticalRank,y=theoreticalProb)) +
geom_point() + scale_y_log10() + scale_x_log10() +
geom_smooth(method='lm',se=FALSE) +
ggtitle("log log plot of Zipfian original population\ntheoretical item probability as a function of rank\nZipfian original universe")