library('ggplot2')
# Galton's data
dAll <- read.table('Galton.csv',header=TRUE,sep=',',stringsAsFactors=FALSE)
print(summary(dAll))
## Child Midparent
## Min. :61.7 Min. :64.0
## 1st Qu.:66.2 1st Qu.:67.5
## Median :68.2 Median :68.5
## Mean :68.1 Mean :68.3
## 3rd Qu.:70.2 3rd Qu.:69.5
## Max. :73.7 Max. :73.0
print(var(dAll$Child))
## [1] 6.34
print(var(dAll$Midparent))
## [1] 3.195
print(summary(lm(Child~Midparent,data=dAll)))
##
## Call:
## lm(formula = Child ~ Midparent, data = dAll)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.805 -1.366 0.049 1.634 5.926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.9415 2.8109 8.52 <2e-16 ***
## Midparent 0.6463 0.0411 15.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.24 on 926 degrees of freedom
## Multiple R-squared: 0.21, Adjusted R-squared: 0.21
## F-statistic: 247 on 1 and 926 DF, p-value: <2e-16
mH <- mean(c(dAll$Child,dAll$Midparent))
dAll$ChildExcess <- dAll$Child - mH
dAll$MidparentExcess <- dAll$Midparent - mH
print(summary(lm(dAll$ChildExcess~dAll$MidparentExcess,data=dAll)))
##
## Call:
## lm(formula = dAll$ChildExcess ~ dAll$MidparentExcess, data = dAll)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.805 -1.366 0.049 1.634 5.926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1809 0.0736 -2.46 0.014 *
## dAll$MidparentExcess 0.6463 0.0411 15.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.24 on 926 degrees of freedom
## Multiple R-squared: 0.21, Adjusted R-squared: 0.21
## F-statistic: 247 on 1 and 926 DF, p-value: <2e-16
set.seed(15961)
d <- dAll[sample.int(nrow(dAll),20),,drop=FALSE]
# complicated code to number repeated points
dSort <- data.frame(idx=seq_len(nrow(d)),key=paste(d$Midparent,d$Child),one=1,total=0,id=0,offset=0)
for(ki in unique(dSort$key)) {
posns <- dSort$key==ki
n <- sum(posns)
dSort[posns,'total'] <- n
dSort[posns,'id'] <- seq_len(n)
}
hasRep <- dSort$total>1
dSort[hasRep,'offset'] <- 2*(dSort[hasRep,'id']-1)/(dSort[hasRep,'total']-1) -1
d$offset <- dSort$offset
model <- lm(Child~Midparent,data=d)
print(summary(model))
##
## Call:
## lm(formula = Child ~ Midparent, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.697 -0.846 -0.097 1.387 3.837
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.840 17.653 1.75 0.098 .
## Midparent 0.533 0.256 2.08 0.052 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.12 on 18 degrees of freedom
## Multiple R-squared: 0.194, Adjusted R-squared: 0.149
## F-statistic: 4.33 on 1 and 18 DF, p-value: 0.0521
d$yEst <- predict(model,newdata=d)
ggplot(data=d,aes(x=Midparent)) +
geom_line(aes(y=yEst),size=2,color='blue') +
scale_size() + geom_point(aes(x=(Midparent+0.1*offset),y=Child),size=3) +
geom_segment(aes(xend=Midparent,y=Child,yend=yEst),color='red') +
scale_y_continuous('Child Height')