## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## xbarU --- population mean of auxilliary variable
## the output is the estimate mean or total (est.total=TRUE)
srs_reg_est <- function (ydata, xdata, xbarU, N = Inf, est.total = FALSE)
{
n <- length (ydata)
lmfit <- lm (ydata ~ xdata)
Bhat <- lmfit$coefficients
efit <- lmfit$residuals
SSe <- sum (efit^2) / (n - 2)
yhat_reg <- Bhat[1] + Bhat[2] * xbarU
se_yhat_reg <- sqrt ((1-n/N) * SSe / n)
mem <- qt (0.975, df = n - 2) * se_yhat_reg
output <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
if (est.total) {
if(!is.finite(N)) stop("N must be finite for estimating population total" )
output <- output * N
}
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
}
## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## the output is the ratio of ybarU/xbarU
srs_ratio_est <- function (ydata, xdata, N = Inf)
{
n <- length (xdata)
xbar <- mean (xdata)
ybar <- mean (ydata)
B_hat <- ybar / xbar
d <- ydata - B_hat * xdata
var_d <- sum (d^2) / (n - 1)
sd_B_hat <- sqrt ((1 - n/N) * var_d / n) / xbar
mem <- qt (0.975, df = n - 1) * sd_B_hat
output <- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
}
## sdata --- a vector of original survey data in a domain
## N --- population size
## n --- total sample size (not the sample size in the domain)
## to find total, multiply domain size N_d to the estimate returned by this function
srs_domain_mean_est <- function (sdata, n, N = Inf)
{
n_d <- length (sdata)
ybar <- mean (sdata)
se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n_d)
mem <- qt (0.975, df = n_d - 1) * se.ybar
c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function
srs_mean_est <- function (sdata, N = Inf)
{
n <- length (sdata)
ybar <- mean (sdata)
se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)
mem <- qt (0.975, df = n - 1) * se.ybar
c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
For poststratification, plug in \(n_h = n * {N_h \over N}\) in the estimator for a real stratified sample
## to find total, multiply N to the estimate returned by this function
str_mean_estimate <- function (ybarh, sh, nh, Nh)
{
N <- sum (Nh)
Pi_h <- Nh/N
ybar <- sum(ybarh * Pi_h)
seybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
mem <- 1.96 * seybar
c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
##
## Call:
## lm(formula = cherry$volume ~ 0 + cherry$diameter)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.104 -8.470 -6.199 1.883 27.129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## cherry$diameter 2.4209 0.1253 19.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.493 on 30 degrees of freedom
## Multiple R-squared: 0.9256, Adjusted R-squared: 0.9231
## F-statistic: 373.1 on 1 and 30 DF, p-value: < 2.2e-16
N <- 2967
## estimating the mean of volume
srs_mean_volume <- srs_mean_est(cherry$volume, N = N)
srs_mean_volume
## Est. S.E. ci.low ci.upp
## 30.170968 2.936861 24.173098 36.168837
## estimating the total of volume
srs_total_volume <- srs_mean_est(cherry$volume, N = N) * N
srs_total_volume
## Est. S.E. ci.low ci.upp
## 89517.261 8713.665 71721.583 107312.940
## input
ydata <- cherry$volume
xdata <- cherry$diameter
N <- 2967
## calculation
n <- length (xdata)
xbar <- mean (xdata)
ybar <- mean (ydata)
B_hat <- ybar / xbar ## ratio estimate
plot (cherry$volume ~ cherry$diameter)
abline (a = 0, b = B_hat)
d <- ydata - B_hat * xdata ## errors
knitr::kable(data.frame ("$y_i$"=cherry$volume,
"$x_i$"=cherry$diameter,
"$\\hat y_i=\\hat B\\times x_i$"=B_hat*xdata,
"$e_i=y_i - \\hat y_i$" = d,
check.names = FALSE))
\(y_i\) | \(x_i\) | \(\hat y_i=\hat B\times x_i\) | \(e_i=y_i - \hat y_i\) |
---|---|---|---|
10.3 | 8.3 | 18.90185 | -8.6018505 |
10.3 | 8.6 | 19.58505 | -9.2850499 |
10.2 | 8.8 | 20.04052 | -9.8405162 |
16.4 | 10.5 | 23.91198 | -7.5119795 |
18.8 | 10.7 | 24.36745 | -5.5674458 |
19.7 | 10.8 | 24.59518 | -4.8951790 |
15.6 | 11.0 | 25.05065 | -9.4506452 |
18.2 | 11.0 | 25.05065 | -6.8506452 |
22.6 | 11.1 | 25.27838 | -2.6783784 |
19.9 | 11.2 | 25.50611 | -5.6061115 |
24.2 | 11.3 | 25.73384 | -1.5338447 |
21.0 | 11.4 | 25.96158 | -4.9615778 |
21.4 | 11.4 | 25.96158 | -4.5615778 |
21.3 | 11.7 | 26.64478 | -5.3447772 |
19.1 | 12.0 | 27.32798 | -8.2279766 |
22.2 | 12.9 | 29.37757 | -7.1775749 |
33.8 | 12.9 | 29.37757 | 4.4224251 |
27.4 | 13.3 | 30.28851 | -2.8885074 |
25.7 | 13.7 | 31.19944 | -5.4994400 |
24.9 | 13.8 | 31.42717 | -6.5271731 |
34.5 | 14.0 | 31.88264 | 2.6173606 |
31.7 | 14.2 | 32.33811 | -0.6381057 |
36.3 | 14.5 | 33.02131 | 3.2786949 |
38.3 | 16.0 | 36.43730 | 1.8626978 |
42.6 | 16.3 | 37.12050 | 5.4794984 |
55.4 | 17.3 | 39.39783 | 16.0021670 |
55.7 | 17.5 | 39.85330 | 15.8467008 |
58.3 | 17.9 | 40.76423 | 17.5357682 |
51.5 | 18.0 | 40.99196 | 10.5080351 |
51.0 | 18.0 | 40.99196 | 10.0080351 |
77.0 | 20.6 | 46.91303 | 30.0869735 |
## estimating S^2_e
var_d <- sum (d^2) / (n - 1) ## variance of errors
sd_B_hat <- sqrt ((1 - n/N) * var_d / n) / xbar ## SE for B
mem <- qt (0.975, df = n - 1) * sd_B_hat ## margin error for B
## output
output_B <- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
names (output_B) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output_B
## Est. S.E. ci.low ci.upp
## 2.277331 0.130786 2.010231 2.544432
## Est. S.E. ci.low ci.upp
## 32.110603 1.844097 28.344455 35.876750
## Est. S.E. ci.low ci.upp
## 2.277331 0.130786 2.010231 2.544432
xbarU <- 41835/N
ratio_mean_volume <- srs_ratio_est (ydata = cherry$volume, xdata = cherry$diameter, N = 2967) * xbarU
ratio_mean_volume
## Est. S.E. ci.low ci.upp
## 32.110603 1.844097 28.344455 35.876750
agpop <- read.csv ("data/agpop.csv")
agpop <- agpop[agpop$acres92 != -99, ] ## remove those counties with na
# sample size
n <- 300
# population size
N <- nrow (agpop)
# true values that we want to estimate
tyU <- sum (agpop [,"acres92"])
# suppose known for ratio estimate
txU <- sum (agpop [,"acres87"])
B <- tyU/txU
plot (agpop [, "acres87"], agpop [, "acres92"])
abline(a=0,b=B)
# expected reduction of variance of ratio estimate to srs
1-var(agpop$acres92-B*agpop$acres87)/var(agpop$acres92)
## [1] 0.9917355
##
## Call:
## lm(formula = agpop$acres92 ~ agpop$acres87)
##
## Residuals:
## Min 1Q Median 3Q Max
## -642399 -8256 -692 5593 656458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.121e+03 8.648e+02 -2.453 0.0142 *
## agpop$acres87 9.878e-01 1.626e-03 607.388 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38560 on 3057 degrees of freedom
## Multiple R-squared: 0.9918, Adjusted R-squared: 0.9918
## F-statistic: 3.689e+05 on 1 and 3057 DF, p-value: < 2.2e-16
nsim <- 2000
sim_rat <- data.frame (
simple = rep (0, nsim), ratio = rep (0, nsim), B = rep (0, nsim))
for (i in 1:nsim)
{
srs <- sample (N,n)
y_srs <- agpop[srs, "acres92"]
x_srs <- agpop[srs, "acres87"]
# SRS estimate
sim_rat$simple [i] <- mean (y_srs) * N
# ratio estimate
sim_rat$B [i] <- mean (y_srs) / mean (x_srs)
sim_rat$ratio [i] <- sim_rat$B [i] * txU
}
sim_rat[1:20,]
sim_var <- sapply (sim_rat[,1:2], var)
ratio_sim_var <- sim_var/sim_var[1]
percentage_reduction <- 1-ratio_sim_var
data.frame (sim_var, ratio_sim_var,percentage_reduction)
## [1] 300
## estimating domain means
region_agsrs_domain <- data.frame(rbind(
NC = srs_domain_mean_est(agsrs$acres92[agsrs$region=="NC"], n=n, N = 3078),
NE = srs_domain_mean_est(agsrs$acres92[agsrs$region=="NE"], n=n,N = 3078),
S = srs_domain_mean_est(agsrs$acres92[agsrs$region=="S"], n=n,N = 3078),
W = srs_domain_mean_est(agsrs$acres92[agsrs$region=="W"], n=n,N = 3078))
)
region_agsrs_domain
## Compared to the results of naively applying SRS estimation to each domain (wrong analysis)
region_agsrs_srs <- data.frame(rbind(
NC = srs_mean_est(agsrs$acres92[agsrs$region=="NC"], N = 1054),
NE = srs_mean_est(agsrs$acres92[agsrs$region=="NE"], N = 220),
S = srs_mean_est(agsrs$acres92[agsrs$region=="S"], N = 1382),
W = srs_mean_est(agsrs$acres92[agsrs$region=="W"], N = 422))
)
region_agsrs_srs
## [1] 300
sh <- tapply (agsrs[, "acres92"], agsrs[,"region"], sd)
ybarh <- tapply (agsrs[, "acres92"], agsrs[,"region"], mean)
# create a vector with external information
Nh <- c(NC = 1054, NE = 220, S= 1382, W = 422)
**Create \(n_h^{\mbox{post-strat}}=n\times \frac{N_h}{N}\), instead of using observed \(n_h\)
nh_post <- Nh/sum(Nh)* n
## regional summary for stratified sampling estimation
knitr::kable(
data.frame(
"${\\bar y}_h$"=ybarh,
"$s_h$"=sh,
"$n_h$"=nh,
"$\\pi^{\\mbox{obs}}_h=n_h/n$"=nh/n,
"$N_h$"=Nh,
"$\\pi^{\\mbox{pop}}_h=N_h/N$"=Nh/sum(Nh),
"$n_h^{\\mbox{post-strat}}=n\\times N_h/N$"=nh_post,
check.names = FALSE ),
)
\({\bar y}_h\) | \(s_h\) | \(n_h\) | \(\pi^{\mbox{obs}}_h=n_h/n\) | \(N_h\) | \(\pi^{\mbox{pop}}_h=N_h/N\) | \(n_h^{\mbox{post-strat}}=n\times N_h/N\) | |
---|---|---|---|---|---|---|---|
NC | 323416.4 | 278970.0 | 78 | 0.2600000 | 1054 | 0.3424301 | 102.7290 |
NE | 106549.0 | 129320.2 | 18 | 0.0600000 | 220 | 0.0714750 | 21.4425 |
S | 246185.8 | 312941.3 | 160 | 0.5333333 | 1382 | 0.4489929 | 134.6979 |
W | 518977.6 | 490842.0 | 44 | 0.1466667 | 422 | 0.1371020 | 41.1306 |
## Est. S.E. ci.low ci.upp
## 300051.69 17760.26 265241.58 334861.79
## Est. S.E. ci.low ci.upp
## 297897.05 18901.41 260700.40 335093.69
## [1] 308582.4
## teacher
## gender 0 1
## 1 156 84
## 2 120 40
## teacher
## gender 0 1 Sum
## 1 156 84 240
## 2 120 40 160
# Proportions of students who want to become a teacher in two domains (female and male)
prop.table(table(teacher[,2:1]), margin = "gender")
## teacher
## gender 0 1
## 1 0.65 0.35
## 2 0.75 0.25
## 1 2
## 0.35 0.25
## 1 2
## 0.4779664 0.4343722
nh <- tapply (teacher$teacher, INDEX = teacher$gender, FUN = length)
prop_h_obs <- nh/sum (nh); prop_h_obs
## 1 2
## 0.6 0.4
knitr::kable(
data.frame('$\\bar y_h$'=ybarh,
"$s_h$"=sh,
"$n^{\\mbox{obs}}_h$"=nh,
"$\\pi^{\\mbox{obs}}_h$"=prop_h_obs,
check.names = FALSE)
)
\(\bar y_h\) | \(s_h\) | \(n^{\mbox{obs}}_h\) | \(\pi^{\mbox{obs}}_h\) |
---|---|---|---|
0.35 | 0.4779664 | 240 | 0.6 |
0.25 | 0.4343722 | 160 | 0.4 |
Nh <- c(3000, 1000)
n <- nrow(teacher)
data.frame(rbind(
female=srs_domain_mean_est(subset(teacher, gender==1)$teacher, n=n, N=4000),
male = srs_domain_mean_est(subset(teacher, gender==2)$teacher, n=n, N=4000)
))
## [1] 0.75 0.25
**Create \(n_h^{\mbox{post-strat}}=n\times \frac{N_h}{N}\), instead of using observed \(n_h\)
nh_post <- Nh/sum (Nh) * n
## show the differences
knitr::kable(
data.frame(
"${\\hat p}_h$"=ybarh,
"$s_h$"=sh,
"$n_h$"=nh,
"$\\pi^{\\mbox{obs}}_h=n_h/n$"=nh/n,
"$N_h$"=Nh,
"$\\pi^{\\mbox{pop}}_h=N_h/N$"=Nh/sum(Nh),
"$n_h^{\\mbox{post-strat}}=n\\times N_h/N$"=nh_post,
check.names = FALSE )
)
\({\hat p}_h\) | \(s_h\) | \(n_h\) | \(\pi^{\mbox{obs}}_h=n_h/n\) | \(N_h\) | \(\pi^{\mbox{pop}}_h=N_h/N\) | \(n_h^{\mbox{post-strat}}=n\times N_h/N\) |
---|---|---|---|---|---|---|
0.35 | 0.4779664 | 240 | 0.6 | 3000 | 0.75 | 300 |
0.25 | 0.4343722 | 160 | 0.4 | 1000 | 0.25 | 100 |
## Est. S.E. ci.low ci.upp
## 0.32500000 0.02217306 0.28154080 0.36845920
This post-stratification estimate can be also calculated with:
## [1] 0.325
## Est. S.E. ci.low ci.upp
## 0.31000000 0.02196545 0.26681751 0.35318249
This answer will be the same as using the observed (sample) gender proportions (0.6 and 0.4) to weight the \(\bar y_h\), that is,
## [1] 0.31
It is also equal to naive sample proportion:
## [1] 0.31
##
## Call:
## lm(formula = ydata ~ xdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.065 -3.107 0.152 3.495 9.587
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.9435 3.3651 -10.98 7.62e-12 ***
## xdata 5.0659 0.2474 20.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.252 on 29 degrees of freedom
## Multiple R-squared: 0.9353, Adjusted R-squared: 0.9331
## F-statistic: 419.4 on 1 and 29 DF, p-value: < 2.2e-16
Calculation of \(\hat{y}_i=\hat{B}_0 + \hat{B}_1\times x_i\) and \(e_i=y_i-\hat{y}_i\):
Bhat <- lmfit$coefficients
yhat <- Bhat[1] + Bhat[2] * xdata
residuals <- ydata - yhat
knitr::kable(data.frame ("$x_i$"=cherry$diameter,
"$y_i$"=cherry$volume,
"$\\hat y_i=\\hat B_0+\\hat B_1\\times x_i$"=yhat,
"$e_i=y_i-\\hat y_i$"=residuals,
check.names=FALSE)) ## for visualization
\(x_i\) | \(y_i\) | \(\hat y_i=\hat B_0+\hat B_1\times x_i\) | \(e_i=y_i-\hat y_i\) |
---|---|---|---|
8.3 | 10.3 | 5.103149 | 5.1968508 |
8.6 | 10.3 | 6.622906 | 3.6770939 |
8.8 | 10.2 | 7.636077 | 2.5639226 |
10.5 | 16.4 | 16.248033 | 0.1519667 |
10.7 | 18.8 | 17.261205 | 1.5387954 |
10.8 | 19.7 | 17.767790 | 1.9322098 |
11.0 | 15.6 | 18.780962 | -3.1809615 |
11.0 | 18.2 | 18.780962 | -0.5809615 |
11.1 | 22.6 | 19.287547 | 3.3124528 |
11.2 | 19.9 | 19.794133 | 0.1058672 |
11.3 | 24.2 | 20.300718 | 3.8992815 |
11.4 | 21.0 | 20.807304 | 0.1926959 |
11.4 | 21.4 | 20.807304 | 0.5926959 |
11.7 | 21.3 | 22.327061 | -1.0270610 |
12.0 | 19.1 | 23.846818 | -4.7468179 |
12.9 | 22.2 | 28.406089 | -6.2060887 |
12.9 | 33.8 | 28.406089 | 5.3939113 |
13.3 | 27.4 | 30.432431 | -3.0324313 |
13.7 | 25.7 | 32.458774 | -6.7587739 |
13.8 | 24.9 | 32.965359 | -8.0653595 |
14.0 | 34.5 | 33.978531 | 0.5214692 |
14.2 | 31.7 | 34.991702 | -3.2917021 |
14.5 | 36.3 | 36.511459 | -0.2114590 |
16.0 | 38.3 | 44.110244 | -5.8102436 |
16.3 | 42.6 | 45.630001 | -3.0300006 |
17.3 | 55.4 | 50.695857 | 4.7041430 |
17.5 | 55.7 | 51.709028 | 3.9909717 |
17.9 | 58.3 | 53.735371 | 4.5646292 |
18.0 | 51.5 | 54.241957 | -2.7419565 |
18.0 | 51.0 | 54.241957 | -3.2419565 |
20.6 | 77.0 | 67.413183 | 9.5868168 |
We can then estimate \(s^2_e\) by this expression:
## [1] 18.0794
\(e_i\) and \(s^2_e\) can be obtained directly from lm fitting output
## 1 2 3 4 5 6 7
## 5.1968508 3.6770939 2.5639226 0.1519667 1.5387954 1.9322098 -3.1809615
## 8 9 10 11 12 13 14
## -0.5809615 3.3124528 0.1058672 3.8992815 0.1926959 0.5926959 -1.0270610
## 15 16 17 18 19 20 21
## -4.7468179 -6.2060887 5.3939113 -3.0324313 -6.7587739 -8.0653595 0.5214692
## 22 23 24 25 26 27 28
## -3.2917021 -0.2114590 -5.8102436 -3.0300006 4.7041430 3.9909717 4.5646292
## 29 30 31
## -2.7419565 -3.2419565 9.5868168
The Mean Sq for “Residuals” is the \(s^2_e\).
yhat_reg <- Bhat[1] + Bhat[2] * xbarU
se_yhat_reg <- sqrt ((1-n/N) * SSe / n)
mem <- qt (0.975, df = n - 2) * se_yhat_reg
output <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
## Est. S.E. ci.low ci.upp
## 34.4856287 0.7596795 32.9319097 36.0393476
reg_mean_volume <- srs_reg_est(ydata = cherry$volume, xdata = cherry$diameter,
xbarU=t_diameters/2967, N = 2967)
reg_mean_volume
## Est. S.E. ci.low ci.upp
## 34.4856287 0.7596795 32.9319097 36.0393476
To estimate the number of dead trees in an area, we divide the area into 100 square plots and count the number of dead trees on a photograph of each plot. Photo counts can be made quickly, but sometimes a tree is misclassified or not detected. So we select an SRS of 25 of the plots for field counts of dead trees. We know that the population mean number of dead trees per plot from the photo count is 11.3.
photocounts <- c (10,12,7, 13,13, 6,17, 16, 15, 10, 14, 12, 10, 5,12, 10,
10, 9, 6, 11, 7, 9, 11, 10, 10)
fieldcounts <- c (15, 14, 9, 14, 8, 5, 18, 15, 13, 15, 11, 15, 12, 8, 13,
9, 11, 12, 9, 12, 13, 11, 10, 9, 8)
lmfit <- lm (fieldcounts ~ photocounts)
summary (lmfit)
##
## Call:
## lm(formula = fieldcounts ~ photocounts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0319 -1.8053 0.1947 1.4212 3.8080
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.0593 1.7635 2.869 0.008676 **
## photocounts 0.6133 0.1601 3.832 0.000854 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.406 on 23 degrees of freedom
## Multiple R-squared: 0.3896, Adjusted R-squared: 0.3631
## F-statistic: 14.68 on 1 and 23 DF, p-value: 0.0008538
# estimate for the mean number of dead trees per plot
srs_reg_est (ydata = fieldcounts, xdata = photocounts, xbarU = 11.3, N = 100)
## Est. S.E. ci.low ci.upp
## 11.9892920 0.4167579 11.1271625 12.8514215
# estimate for the total number of dead trees in the area
srs_reg_est (ydata = fieldcounts, xdata = photocounts, xbarU = 11.3, N = 100, est.total = TRUE)
## Est. S.E. ci.low ci.upp
## 1198.92920 41.67579 1112.71625 1285.14215
## Est. S.E. ci.low ci.upp
## 11.5600000 0.5222069 10.4822180 12.6377820
# compare to ratio estimate
srs_ratio_est (ydata = fieldcounts, xdata = photocounts, N = 100) * 11.3
## Est. S.E. ci.low ci.upp
## 12.3233962 0.5121485 11.2663736 13.3804189