library('ggplot2')
# Galton's data 
dAll <- read.table('Galton.csv',header=TRUE,sep=',',stringsAsFactors=FALSE) 
print(summary(dAll))
##      Child        Midparent   
##  Min.   :61.7   Min.   :64.0  
##  1st Qu.:66.2   1st Qu.:67.5  
##  Median :68.2   Median :68.5  
##  Mean   :68.1   Mean   :68.3  
##  3rd Qu.:70.2   3rd Qu.:69.5  
##  Max.   :73.7   Max.   :73.0
print(var(dAll$Child))
## [1] 6.34
print(var(dAll$Midparent))
## [1] 3.195
print(summary(lm(Child~Midparent,data=dAll)))
## 
## Call:
## lm(formula = Child ~ Midparent, data = dAll)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.805 -1.366  0.049  1.634  5.926 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  23.9415     2.8109    8.52   <2e-16 ***
## Midparent     0.6463     0.0411   15.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.24 on 926 degrees of freedom
## Multiple R-squared:  0.21,   Adjusted R-squared:  0.21 
## F-statistic:  247 on 1 and 926 DF,  p-value: <2e-16
mH <- mean(c(dAll$Child,dAll$Midparent))
dAll$ChildExcess <- dAll$Child - mH
dAll$MidparentExcess <- dAll$Midparent - mH
print(summary(lm(dAll$ChildExcess~dAll$MidparentExcess,data=dAll)))
## 
## Call:
## lm(formula = dAll$ChildExcess ~ dAll$MidparentExcess, data = dAll)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.805 -1.366  0.049  1.634  5.926 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -0.1809     0.0736   -2.46    0.014 *  
## dAll$MidparentExcess   0.6463     0.0411   15.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.24 on 926 degrees of freedom
## Multiple R-squared:  0.21,   Adjusted R-squared:  0.21 
## F-statistic:  247 on 1 and 926 DF,  p-value: <2e-16
set.seed(15961)
d <- dAll[sample.int(nrow(dAll),20),,drop=FALSE]
# complicated code to number repeated points
dSort <- data.frame(idx=seq_len(nrow(d)),key=paste(d$Midparent,d$Child),one=1,total=0,id=0,offset=0)
for(ki in unique(dSort$key)) {
  posns <- dSort$key==ki
  n <- sum(posns)
  dSort[posns,'total'] <- n
  dSort[posns,'id'] <- seq_len(n)
}
hasRep <- dSort$total>1
dSort[hasRep,'offset'] <- 2*(dSort[hasRep,'id']-1)/(dSort[hasRep,'total']-1) -1
d$offset <- dSort$offset
model <- lm(Child~Midparent,data=d)
print(summary(model))
## 
## Call:
## lm(formula = Child ~ Midparent, data = d)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.697 -0.846 -0.097  1.387  3.837 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   30.840     17.653    1.75    0.098 .
## Midparent      0.533      0.256    2.08    0.052 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.12 on 18 degrees of freedom
## Multiple R-squared:  0.194,  Adjusted R-squared:  0.149 
## F-statistic: 4.33 on 1 and 18 DF,  p-value: 0.0521
d$yEst <- predict(model,newdata=d)
ggplot(data=d,aes(x=Midparent)) +
  geom_line(aes(y=yEst),size=2,color='blue') +
  scale_size() + geom_point(aes(x=(Midparent+0.1*offset),y=Child),size=3) +
  geom_segment(aes(xend=Midparent,y=Child,yend=yEst),color='red') +
  scale_y_continuous('Child Height')

plot of chunk unnamed-chunk-2