FalseZipf

Support materials for Win-Vector LLC blog article Relative error distributions, without the heavy tail theatrics.

knitr::opts_chunk$set(echo = TRUE)
library('ggplot2')

# draw item i with probability pi
draw <- function(universe,n) {
  u <- sort(runif(n))
  idxs <- findInterval(u,c(0,universe$cumsum),rightmost.closed = TRUE)
  idxs <- pmin(idxs,nrow(universe))
  # permute
  idxs <- idxs[sample.int(n,n,replace=FALSE)]
  idxs
}

nRanks = 200
sampleSize = 1000
contrainPosition = 20

# generate population of items with Zipfian probabilities
set.seed(23234)
zipfianUniverse <- data.frame(theoreticalRank=seq_len(nRanks))
zipfianUniverse$theoreticalProb <- 1/zipfianUniverse$theoreticalRank^1.1
zipfianUniverse$theoreticalProb <- zipfianUniverse$theoreticalProb/sum(zipfianUniverse$theoreticalProb)
zipfianUniverse <- zipfianUniverse[order(zipfianUniverse$theoreticalProb,decreasing=TRUE),,
                                       drop=FALSE]
zipfianUniverse$cumsum <- cumsum(zipfianUniverse$theoreticalProb)

zipfianUniverse$theoreticalProbConstrained <- zipfianUniverse$theoreticalProb*
  ifelse(zipfianUniverse$theoreticalRank>=contrainPosition,0.01,1.0)
zipfianUniverse$missedMass =  zipfianUniverse$theoreticalProb -
  zipfianUniverse$theoreticalProbConstrained
zipfianUniverse$missedMass[zipfianUniverse$missedMass<=0] <- NA
zipfianUniverse$theoreticalProbConstrained <- zipfianUniverse$theoreticalProbConstrained/
  sum(zipfianUniverse$theoreticalProbConstrained)


observedZ <- data.frame(theoreticalRank=draw(zipfianUniverse,sampleSize))
countsZ <- as.data.frame(table(theoreticalRank=observedZ$theoreticalRank))
countsZ$theoreticalRank <- as.integer(as.character(countsZ$theoreticalRank))
countsZ$Freq <- countsZ$Freq/sum(countsZ$Freq)
countsZ$observedRank <- rank(-countsZ$Freq)
observedZ$Freq <- countsZ$Freq[match(observedZ$theoreticalRank,
                                     countsZ$theoreticalRank)]

ggplot(data=zipfianUniverse,aes(x=theoreticalRank,y=theoreticalProb,
                                ymin=0,ymax=theoreticalProb)) + 
  geom_line() + 
  ggtitle("plot of Zipfian original population\ntheoretical item probability as a function of rank\nZipfian original universe")

ggplot(data=zipfianUniverse,aes(x=theoreticalRank,y=theoreticalProb,
                                ymin=0,ymax=theoreticalProb)) + 
  geom_ribbon(alpha=0.2) +
  geom_ribbon(aes(ymax=missedMass),fill='blue') +
  geom_line() + 
  ggtitle("plot of Zipfian original population\ntheoretical item probability as a function of rank\nZipfian original universe")

sum(zipfianUniverse$missedMass,na.rm=TRUE)

## [1] 0.3254028

ggplot(data=zipfianUniverse,aes(x=theoreticalRank,y=theoreticalProb)) + 
  geom_point() + scale_y_log10() + scale_x_log10() +
  geom_smooth(method='lm',se=FALSE) +
  ggtitle("log log plot of Zipfian original population\ntheoretical item probability as a function of rank\nZipfian original universe")

lm(log(theoreticalProb)~log(theoreticalRank),data=zipfianUniverse,
   weights=zipfianUniverse$theoreticalProb)

## 
## Call:
## lm(formula = log(theoreticalProb) ~ log(theoreticalRank), data = zipfianUniverse, 
##     weights = zipfianUniverse$theoreticalProb)
## 
## Coefficients:
##          (Intercept)  log(theoreticalRank)  
##               -1.547                -1.100

ggplot(data=zipfianUniverse,aes(x=theoreticalRank,y=theoreticalProbConstrained)) + 
  geom_point() +
  ggtitle("log log plot of constrained population\ntheoretical item probability as a function of rank\nconstrained Zipfian original universe")

ggplot(data=zipfianUniverse,aes(x=theoreticalRank,y=theoreticalProbConstrained)) + 
  geom_point() + scale_y_log10() + scale_x_log10() +
  ggtitle("log log plot of constrained population\ntheoretical item probability as a function of rank\nconstrained Zipfian original universe")

ggplot(data=countsZ,aes(x=observedRank,y=Freq)) + 
  geom_point() + scale_y_log10() + scale_x_log10() +
  geom_smooth(method='lm',se=FALSE) +
  ggtitle("log log plot of sample\nestimated frequency as a function of estimated rank\nZipfian original universe")

lm(log(Freq)~log(observedRank),data=countsZ,
   weights=countsZ$Freq)

## 
## Call:
## lm(formula = log(Freq) ~ log(observedRank), data = countsZ, weights = countsZ$Freq)
## 
## Coefficients:
##       (Intercept)  log(observedRank)  
##            -1.547             -1.078

ggplot(data=countsZ,aes(x=theoreticalRank,y=Freq)) + 
  geom_point() + scale_y_log10() + scale_x_log10() +
  geom_smooth(method='lm',se=FALSE) +
  ggtitle("log log plot of sample\nestimated frequency as a function of theoretical rank\nZipfian original universe")

lm(log(Freq)~log(theoreticalRank),data=countsZ,
   weights=countsZ$Freq)

## 
## Call:
## lm(formula = log(Freq) ~ log(theoreticalRank), data = countsZ, 
##     weights = countsZ$Freq)
## 
## Coefficients:
##          (Intercept)  log(theoreticalRank)  
##               -1.634                -1.003

# generate population of items with empirical log-normal probabilities
set.seed(23234)
logNormalUniverse <- data.frame(theoreticalProb=rlnorm(nRanks))
logNormalUniverse$theoreticalProb <- logNormalUniverse$theoreticalProb/sum(logNormalUniverse$theoreticalProb)
logNormalUniverse <- logNormalUniverse[order(logNormalUniverse$theoreticalProb,decreasing=TRUE),,
                                       drop=FALSE]
logNormalUniverse$theoreticalRank <- seq_len(nrow(logNormalUniverse))
logNormalUniverse$cumsum <- cumsum(logNormalUniverse$theoreticalProb)


observedLN <- data.frame(theoreticalRank=draw(logNormalUniverse,sampleSize))
countsLN <- as.data.frame(table(theoreticalRank=observedLN$theoreticalRank))
countsLN$theoreticalRank <-  as.integer(as.character(countsLN$theoreticalRank))
countsLN$Freq <- countsLN$Freq/sum(countsLN$Freq)
countsLN$observedRank <- rank(-countsLN$Freq)
observedLN$Freq <- countsLN$Freq[match(observedLN$theoreticalRank,
                                       countsLN$theoreticalRank)]

ggplot(data=logNormalUniverse,aes(x=theoreticalRank,y=theoreticalProb)) + 
  geom_point() + scale_y_log10() + scale_x_log10() +
  geom_smooth(method='lm',se=FALSE) +
  ggtitle("log log plot of log normal original population\ntheoretical item probability as a function of rank\nlog normal original universe")

lm(log(theoreticalProb)~log(theoreticalRank),data=logNormalUniverse,
   weights=logNormalUniverse$theoreticalProb)

## 
## Call:
## lm(formula = log(theoreticalProb) ~ log(theoreticalRank), data = logNormalUniverse, 
##     weights = logNormalUniverse$theoreticalProb)
## 
## Coefficients:
##          (Intercept)  log(theoreticalRank)  
##              -2.4665               -0.7377

ggplot(data=countsLN,aes(x=observedRank,y=Freq)) + 
  geom_point() + scale_y_log10() + scale_x_log10() +
  geom_smooth(method='lm',se=FALSE) +
  ggtitle("log log plot of sample\nestimated frequency as a function of estimated rank\nlog normal original universe")

lm(log(Freq)~log(observedRank),data=countsLN,
   weights=countsLN$Freq)

## 
## Call:
## lm(formula = log(Freq) ~ log(observedRank), data = countsLN, 
##     weights = countsLN$Freq)
## 
## Coefficients:
##       (Intercept)  log(observedRank)  
##           -2.4118            -0.7416

ggplot(data=countsLN,aes(x=theoreticalRank,y=Freq)) + 
  geom_point() + scale_y_log10() + scale_x_log10() +
  geom_smooth(method='lm',se=FALSE) +
  ggtitle("log log plot of sample\nestimated frequency as a function of theoretical rank\nlog normal original universe")

lm(log(Freq)~log(theoreticalRank),data=countsLN,
   weights=countsLN$Freq)

## 
## Call:
## lm(formula = log(Freq) ~ log(theoreticalRank), data = countsLN, 
##     weights = countsLN$Freq)
## 
## Coefficients:
##          (Intercept)  log(theoreticalRank)  
##              -2.5130               -0.6907

FalseZipf

Win-Vector LLC

September 18, 2016