## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## xbarU --- population mean of auxilliary variable
## the output is the estimate mean or total (est.total=TRUE)
<- function (ydata, xdata, xbarU, N = Inf, est.total = FALSE)
srs_reg_est
{<- length (ydata)
n <- lm (ydata ~ xdata)
lmfit <- lmfit$coefficients
Bhat <- lmfit$residuals
efit <- sum (efit^2) / (n - 2)
SSe <- Bhat[1] + Bhat[2] * xbarU
yhat_reg <- sqrt ((1-n/N) * SSe / n)
se_yhat_reg <- qt (0.975, df = n - 2) * se_yhat_reg
mem <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
output
if (est.total) {
if(!is.finite(N)) stop("N must be finite for estimating population total" )
<- output * N
output
}
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
}
## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## the output is the ratio of ybarU/xbarU
<- function (ydata, xdata, N = Inf)
srs_ratio_est
{ <- length (xdata)
n <- mean (xdata)
xbar <- mean (ydata)
ybar <- ybar / xbar
B_hat <- ydata - B_hat * xdata
d <- sum (d^2) / (n - 1)
var_d <- sqrt ((1 - n/N) * var_d / n) / xbar
sd_B_hat <- qt (0.975, df = n - 1) * sd_B_hat
mem <- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
output
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
}
## sdata --- a vector of original survey data in a domain
## N --- population size
## n --- total sample size (not the sample size in the domain)
## to find total, multiply domain size N_d to the estimate returned by this function
<- function (sdata, n, N = Inf)
srs_domain_mean_est
{<- length (sdata)
n_d <- mean (sdata)
ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n_d)
se.ybar <- qt (0.975, df = n_d - 1) * se.ybar
mem c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function
<- function (sdata, N = Inf)
srs_mean_est
{<- length (sdata)
n <- mean (sdata)
ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)
se.ybar <- qt (0.975, df = n - 1) * se.ybar
mem c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
## to find total, multiply N to the estimate returned by this function
## for poststratification, use nh = n * Nh/N
<- function (ybarh, sh, nh, Nh)
str_mean_estimate
{<- sum (Nh)
N <- Nh/N
Pi_h <- sum(ybarh * Pi_h)
ybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
seybar <- 1.96 * seybar
mem c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
<- read.csv ("data/cherry.csv", header = T)
cherry cherry
plot (cherry$volume ~ cherry$diameter)
summary (lm(cherry$volume ~ 0+cherry$diameter))
##
## Call:
## lm(formula = cherry$volume ~ 0 + cherry$diameter)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.104 -8.470 -6.199 1.883 27.129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## cherry$diameter 2.4209 0.1253 19.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.493 on 30 degrees of freedom
## Multiple R-squared: 0.9256, Adjusted R-squared: 0.9231
## F-statistic: 373.1 on 1 and 30 DF, p-value: < 2.2e-16
<- 2967
N ## estimating the mean of volume
<- srs_mean_est(cherry$volume, N = N)
srs_mean_volume srs_mean_volume
## Est. S.E. ci.low ci.upp
## 30.170968 2.936861 24.173098 36.168837
## estimating the total of volume
<- srs_mean_est(cherry$volume, N = N) * N
srs_total_volume srs_total_volume
## Est. S.E. ci.low ci.upp
## 89517.261 8713.665 71721.583 107312.940
## input
<- cherry$volume
ydata <- cherry$diameter
xdata <- 2967
N
## calculation
<- length (xdata)
n <- mean (xdata)
xbar <- mean (ydata)
ybar <- ybar / xbar ## ratio estimate
B_hat
plot (cherry$volume ~ cherry$diameter)
abline (a = 0, b = B_hat)
<- ydata - B_hat * xdata ## errors
d data.frame (cherry, d = d)
## estimating S^2_e
<- sum (d^2) / (n - 1) ## variance of errors
var_d <- sqrt ((1 - n/N) * var_d / n) / xbar ## SE for B
sd_B_hat <- qt (0.975, df = n - 1) * sd_B_hat ## margin error for B
mem
## output
<- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
output_B names (output_B) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output_B
## Est. S.E. ci.low ci.upp
## 2.277331 0.130786 2.010231 2.544432
<- 41835/N
mean_diameters * mean_diameters output_B
## Est. S.E. ci.low ci.upp
## 32.110603 1.844097 28.344455 35.876750
<- 41835
t_diameters * t_diameters output_B
## Est. S.E. ci.low ci.upp
## 95272.159 5471.434 84097.999 106446.318
<- srs_ratio_est (ydata = cherry$volume, xdata = cherry$diameter, N = 2967)
B_v2d B_v2d
## Est. S.E. ci.low ci.upp
## 2.277331 0.130786 2.010231 2.544432
<- 41835/N
xbarU <- srs_ratio_est (ydata = cherry$volume, xdata = cherry$diameter, N = 2967) * xbarU
ratio_mean_volume ratio_mean_volume
## Est. S.E. ci.low ci.upp
## 32.110603 1.844097 28.344455 35.876750
<- 41835
total_diameters srs_ratio_est (ydata = cherry$volume, xdata = cherry$diameter, N = 2967) * total_diameters
## Est. S.E. ci.low ci.upp
## 95272.159 5471.434 84097.999 106446.318
cat(1-(ratio_mean_volume[2]/srs_mean_volume[2])^2)
## 0.6057237
<- read.csv ("data/agpop.csv")
agpop <- agpop[agpop$acres92 != -99, ] ## remove those counties with na
agpop
# sample size
<- 300
n # population size
<- nrow (agpop)
N
# true values that we want to estimate
<- sum (agpop [,"acres92"])
tyU # suppose known for ratio estimate
<- sum (agpop [,"acres87"])
txU <- tyU/txU
B plot (agpop [, "acres87"], agpop [, "acres92"])
abline(a=0,b=B)
# expected reduction of variance of ratio estimate to srs
1-var(agpop$acres92-B*agpop$acres87)/var(agpop$acres92)
## [1] 0.9917355
# linear model output
summary(lm(agpop$acres92 ~ agpop$acres87))
##
## Call:
## lm(formula = agpop$acres92 ~ agpop$acres87)
##
## Residuals:
## Min 1Q Median 3Q Max
## -642399 -8256 -692 5593 656458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.121e+03 8.648e+02 -2.453 0.0142 *
## agpop$acres87 9.878e-01 1.626e-03 607.388 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38560 on 3057 degrees of freedom
## Multiple R-squared: 0.9918, Adjusted R-squared: 0.9918
## F-statistic: 3.689e+05 on 1 and 3057 DF, p-value: < 2.2e-16
<- 2000
nsim <- data.frame (
sim_rat simple = rep (0, nsim), ratio = rep (0, nsim), B = rep (0, nsim))
for (i in 1:nsim)
{<- sample (N,n)
srs <- agpop[srs, "acres92"]
y_srs <- agpop[srs, "acres87"]
x_srs
# SRS estimate
$simple [i] <- mean (y_srs) * N
sim_rat# ratio estimate
$B [i] <- mean (y_srs) / mean (x_srs)
sim_rat$ratio [i] <- sim_rat$B [i] * txU
sim_rat
}
1:20,] sim_rat[
boxplot (sim_rat [, 1:2])
abline (h = tyU, col = "red")
<- sapply (sim_rat[,1:2], var)
sim_var
<- sim_var/sim_var[1]
ratio_sim_var <- 1-ratio_sim_var
percentage_reduction data.frame (sim_var, ratio_sim_var,percentage_reduction)
<- read.csv("data/agsrs.csv")
agsrs <- nrow (agsrs)
n n
## [1] 300
## estimating domain means
<- data.frame(rbind(
region_agsrs_domain NC = srs_domain_mean_est(agsrs$acres92[agsrs$region=="NC"], n=n, N = 3078),
NE = srs_domain_mean_est(agsrs$acres92[agsrs$region=="NE"], n=n,N = 3078),
S = srs_domain_mean_est(agsrs$acres92[agsrs$region=="S"], n=n,N = 3078),
W = srs_domain_mean_est(agsrs$acres92[agsrs$region=="W"], n=n,N = 3078))
) region_agsrs_domain
## Compared to the results of naively applying SRS estimation to each domain
<- data.frame(rbind(
region_agsrs_srs NC = srs_mean_est(agsrs$acres92[agsrs$region=="NC"], N = 1054),
NE = srs_mean_est(agsrs$acres92[agsrs$region=="NE"], N = 220),
S = srs_mean_est(agsrs$acres92[agsrs$region=="S"], N = 1382),
W = srs_mean_est(agsrs$acres92[agsrs$region=="W"], N = 422))
) region_agsrs_srs
<- tapply (1:nrow(agsrs), agsrs[,"region"], length)
nh <- sum (nh)
n n
## [1] 300
<- tapply (agsrs[, "acres92"], agsrs[,"region"], sd)
sh <- tapply (agsrs[, "acres92"], agsrs[,"region"], mean)
ybarh # create a vector with external information
<- c(NC = 1054, NE = 220, S= 1382, W = 422) Nh
Create nh proportional to Nh, instead of using observed nh
<- Nh/sum(Nh)* n
nh_post
## regional summary for stratified sampling estimation
data.frame(cbind(ybarh, sh, nh, prop_obs=nh/sum(nh), Nh, nh_post, prop_post = nh_post/sum(nh_post)))
## strata mean estimate
str_mean_estimate (ybarh, sh, nh_post, Nh)
## Est. S.E. ci.low ci.upp
## 300051.69 17760.26 265241.58 334861.79
## compare with SRS estimate
srs_mean_est(agsrs$acres92, N = 3087)
## Est. S.E. ci.low ci.upp
## 297897.05 18901.41 260700.40 335093.69
## true mean
mean(agpop$acres92)
## [1] 308582.4
<- read.csv ("data/college_teacher.csv")
teacher head(teacher)
table(teacher[,2:1])
## teacher
## gender 0 1
## 1 156 84
## 2 120 40
addmargins(table(teacher[,2:1]), 2)
## teacher
## gender 0 1 Sum
## 1 156 84 240
## 2 120 40 160
prop.table(table(teacher[,2:1]), margin = "gender")
## teacher
## gender 0 1
## 1 0.65 0.35
## 2 0.75 0.25
<- nrow (teacher)
n <- tapply (teacher$teacher, INDEX = teacher$gender, FUN = mean); yh yh
## 1 2
## 0.35 0.25
<- tapply (teacher$teacher, INDEX = teacher$gender, FUN = sd); sh sh
## 1 2
## 0.4779664 0.4343722
<- tapply (teacher$teacher, INDEX = teacher$gender, FUN = length)
nh_obs <- nh_obs/sum (nh_obs); prop_h_obs prop_h_obs
## 1 2
## 0.6 0.4
data.frame(yh, sh, nh_obs, prop_h_obs)
<- c(3000, 1000)
Nh <- nrow(teacher)
n data.frame(rbind(
female=srs_domain_mean_est(subset(teacher, gender==1)$teacher, n=n, N=4000),
male = srs_domain_mean_est(subset(teacher, gender==2)$teacher, n=n, N=4000)
))
<- c(3000, 1000)
Nh <- Nh/sum (Nh); prop_h_post prop_h_post
## [1] 0.75 0.25
Create nh proportional to Nh, instead of using observed nh
<- Nh/sum (Nh) * n
nh_post
## show the differences
data.frame(p_hat = yh, sh, nh_obs, prop_obs=prop_h_obs, nh_post, prop_post=prop_h_post)
## poststratification estimation of mean
str_mean_estimate (yh, sh, nh_post, Nh)
## Est. S.E. ci.low ci.upp
## 0.32500000 0.02217306 0.28154080 0.36845920
srs_mean_est (teacher$teacher, N=sum(Nh))
## Est. S.E. ci.low ci.upp
## 0.31000000 0.02196545 0.26681751 0.35318249
<- read.csv ("data/cherry.csv", header = T)
cherry <- cherry$volume
ydata <- cherry$diameter
xdata <- 41835
t_diameters <- t_diameters/2967
xbarU <- 2967 N
<- length (ydata)
n <- lm (ydata ~ xdata)
lmfit summary (lmfit)
##
## Call:
## lm(formula = ydata ~ xdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.065 -3.107 0.152 3.495 9.587
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.9435 3.3651 -10.98 7.62e-12 ***
## xdata 5.0659 0.2474 20.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.252 on 29 degrees of freedom
## Multiple R-squared: 0.9353, Adjusted R-squared: 0.9331
## F-statistic: 419.4 on 1 and 29 DF, p-value: < 2.2e-16
plot (xdata, ydata)
abline (lmfit)
<- lmfit$coefficients
Bhat <- ydata - (Bhat[1] + Bhat[2] * xdata)
efit data.frame (cherry, residual=efit) ## for visualization
<- sum (efit^2) / (n - 2)
SSe SSe
## [1] 18.0794
## SSe can be obtained directly from lm fitting output
anova(lmfit)
<- Bhat[1] + Bhat[2] * xbarU
yhat_reg <- sqrt ((1-n/N) * SSe / n)
se_yhat_reg <- qt (0.975, df = n - 2) * se_yhat_reg
mem <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
output names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
## Est. S.E. ci.low ci.upp
## 34.4856287 0.7596795 32.9319097 36.0393476
* N output
## Est. S.E. ci.low ci.upp
## 102318.860 2253.969 97708.976 106928.744
<- srs_reg_est(ydata = cherry$volume, xdata = cherry$diameter,
reg_mean_volume xbarU=t_diameters/2967, N = 2967)
reg_mean_volume
## Est. S.E. ci.low ci.upp
## 34.4856287 0.7596795 32.9319097 36.0393476
<- 41835
t_diameters srs_reg_est(ydata = cherry$volume, xdata = cherry$diameter,
xbarU=t_diameters/2967, N = 2967, est.total = TRUE)
## Est. S.E. ci.low ci.upp
## 102318.860 2253.969 97708.976 106928.744
cat(1-(reg_mean_volume[2]/srs_mean_volume[2])^2)
## 0.9330895
To estimate the number of dead trees in an area, we divide the area into 100 square plots and count the number of dead trees on a photograph of each plot. Photo counts can be made quickly, but sometimes a tree is misclassified or not detected. So we select an SRS of 25 of the plots for field counts of dead trees. We know that the population mean number of dead trees per plot from the photo count is 11.3.
<- c (10,12,7, 13,13, 6,17, 16, 15, 10, 14, 12, 10, 5,12, 10,
photocounts 10, 9, 6, 11, 7, 9, 11, 10, 10)
<- c (15, 14, 9, 14, 8, 5, 18, 15, 13, 15, 11, 15, 12, 8, 13,
fieldcounts 9, 11, 12, 9, 12, 13, 11, 10, 9, 8)
<- lm (fieldcounts ~ photocounts)
lmfit summary (lmfit)
##
## Call:
## lm(formula = fieldcounts ~ photocounts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0319 -1.8053 0.1947 1.4212 3.8080
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.0593 1.7635 2.869 0.008676 **
## photocounts 0.6133 0.1601 3.832 0.000854 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.406 on 23 degrees of freedom
## Multiple R-squared: 0.3896, Adjusted R-squared: 0.3631
## F-statistic: 14.68 on 1 and 23 DF, p-value: 0.0008538
plot (photocounts, fieldcounts)
abline (lmfit)
# estimate for the mean number of dead trees per plot
srs_reg_est (ydata = fieldcounts, xdata = photocounts, xbarU = 11.3, N = 100)
## Est. S.E. ci.low ci.upp
## 11.9892920 0.4167579 11.1271625 12.8514215
# estimate for the total number of dead trees in the area
srs_reg_est (ydata = fieldcounts, xdata = photocounts, xbarU = 11.3, N = 100, est.total = TRUE)
## Est. S.E. ci.low ci.upp
## 1198.92920 41.67579 1112.71625 1285.14215
# compare to simple estimate
srs_mean_est (sdata = fieldcounts, N = 100)
## Est. S.E. ci.low ci.upp
## 11.5600000 0.5222069 10.4822180 12.6377820
# compare to ratio estimate
srs_ratio_est (ydata = fieldcounts, xdata = photocounts, N = 100) * 11.3
## Est. S.E. ci.low ci.upp
## 12.3233962 0.5121485 11.2663736 13.3804189