1 Functions and packages for Analyzing Data

## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## xbarU --- population mean of auxilliary variable

## the output is the estimate mean or total (est.total=TRUE)
srs_reg_est <- function (ydata, xdata, xbarU, N = Inf, est.total = FALSE)
{
  n <- length (ydata)
  lmfit <- lm (ydata ~ xdata)
  Bhat <- lmfit$coefficients
  efit <- lmfit$residuals
  SSe <- sum (efit^2) / (n - 2) 
  yhat_reg <- Bhat[1] + Bhat[2] * xbarU
  se_yhat_reg <- sqrt ((1-n/N) * SSe / n)
  mem <- qt (0.975, df = n - 2) * se_yhat_reg
  output <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
  
  if (est.total) {
      if(!is.finite(N)) stop("N must be finite for estimating population total" )
      output <- output * N
  }

  names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
  output
}


## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size

## the output is the ratio of ybarU/xbarU
srs_ratio_est <- function (ydata, xdata, N = Inf)
{   
  n <- length (xdata)
  xbar <- mean (xdata)
  ybar <- mean (ydata)
  B_hat <- ybar / xbar
  d <- ydata - B_hat * xdata
  var_d <- sum (d^2) / (n - 1)
  sd_B_hat <- sqrt ((1 - n/N) * var_d / n) / xbar
  mem <- qt (0.975, df = n - 1) * sd_B_hat
  output <- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
  
  
  names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
  output
}

## sdata --- a vector of original survey data in a domain
## N --- population size
## n --- total sample size (not the sample size in the domain)
## to find total, multiply domain size N_d to the estimate returned by this function
srs_domain_mean_est <- function (sdata, n, N = Inf)
{
    n_d <- length (sdata)
    ybar <- mean (sdata)
    se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n_d)  
    mem <- qt (0.975, df = n_d - 1) * se.ybar
    c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}

## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function

srs_mean_est <- function (sdata, N = Inf)
{
    n <- length (sdata)
    ybar <- mean (sdata)
    se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)  
    mem <- qt (0.975, df = n - 1) * se.ybar
    c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}

## to find total, multiply N to the estimate returned by this function
## for poststratification, use nh = n * Nh/N
str_mean_estimate <- function (ybarh, sh, nh, Nh)
{
    N <- sum (Nh)
    Pi_h <- Nh/N
    ybar <- sum(ybarh * Pi_h)
    seybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
    mem <- 1.96 * seybar
    c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}

2 Ratio Estimation for cherry.csv dataset

2.1 Importing data

cherry <- read.csv ("data/cherry.csv", header = T)
cherry
plot (cherry$volume ~ cherry$diameter)

summary (lm(cherry$volume ~ 0+cherry$diameter))
## 
## Call:
## lm(formula = cherry$volume ~ 0 + cherry$diameter)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.104  -8.470  -6.199   1.883  27.129 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## cherry$diameter   2.4209     0.1253   19.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.493 on 30 degrees of freedom
## Multiple R-squared:  0.9256, Adjusted R-squared:  0.9231 
## F-statistic: 373.1 on 1 and 30 DF,  p-value: < 2.2e-16

2.2 SRS estimate

N <- 2967
## estimating the mean of volume
srs_mean_volume <- srs_mean_est(cherry$volume, N = N)
srs_mean_volume
##      Est.      S.E.    ci.low    ci.upp 
## 30.170968  2.936861 24.173098 36.168837
## estimating the total of volume
srs_total_volume <- srs_mean_est(cherry$volume, N = N) * N
srs_total_volume
##       Est.       S.E.     ci.low     ci.upp 
##  89517.261   8713.665  71721.583 107312.940

2.3 Step-by-step calculation for Ratio Estimation

2.3.1 Estimating B and calculating residuals

## input
ydata <- cherry$volume
xdata <- cherry$diameter
N <- 2967

## calculation
n <- length (xdata)
xbar <- mean (xdata)
ybar <- mean (ydata)
B_hat <- ybar / xbar ## ratio estimate


plot (cherry$volume ~ cherry$diameter)
abline (a = 0, b = B_hat)

d <- ydata - B_hat * xdata ## errors
data.frame (cherry, d = d)

2.3.2 Estimating SE of B

## estimating S^2_e
var_d <- sum (d^2) / (n - 1) ## variance of errors
sd_B_hat <- sqrt ((1 - n/N) * var_d / n) / xbar ## SE for B
mem <- qt (0.975, df = n - 1) * sd_B_hat ## margin error for B

## output
output_B <- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
names (output_B) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output_B
##     Est.     S.E.   ci.low   ci.upp 
## 2.277331 0.130786 2.010231 2.544432

2.3.3 Estimating the mean volume of wood

mean_diameters <- 41835/N
output_B * mean_diameters
##      Est.      S.E.    ci.low    ci.upp 
## 32.110603  1.844097 28.344455 35.876750

2.3.4 Estimating the total volume of wood

t_diameters <- 41835
output_B * t_diameters
##       Est.       S.E.     ci.low     ci.upp 
##  95272.159   5471.434  84097.999 106446.318

2.4 Ratio estimation with a function

2.4.1 Estimating the ratio of volume to diameter

B_v2d <- srs_ratio_est (ydata = cherry$volume, xdata = cherry$diameter,  N = 2967)
B_v2d
##     Est.     S.E.   ci.low   ci.upp 
## 2.277331 0.130786 2.010231 2.544432

2.4.2 Estimating the mean of volume

xbarU <- 41835/N
ratio_mean_volume <- srs_ratio_est (ydata = cherry$volume, xdata = cherry$diameter, N = 2967) * xbarU
ratio_mean_volume
##      Est.      S.E.    ci.low    ci.upp 
## 32.110603  1.844097 28.344455 35.876750

2.4.3 Estimating the total of volume

total_diameters <- 41835
srs_ratio_est (ydata = cherry$volume, xdata = cherry$diameter, N = 2967) * total_diameters
##       Est.       S.E.     ci.low     ci.upp 
##  95272.159   5471.434  84097.999 106446.318

2.4.4 Percentage of Variance Reduction

cat(1-(ratio_mean_volume[2]/srs_mean_volume[2])^2)
## 0.6057237

3 Simulation study with agpop.csv

3.1 Information of population data

agpop <- read.csv ("data/agpop.csv")
agpop <- agpop[agpop$acres92 != -99, ] ## remove those counties with na

# sample size
n <- 300
# population size
N <- nrow (agpop)

# true values that we want to estimate
tyU <- sum (agpop [,"acres92"])
# suppose known for ratio estimate
txU <- sum (agpop [,"acres87"]) 
B <- tyU/txU
plot (agpop [, "acres87"], agpop [, "acres92"])
abline(a=0,b=B)

# expected reduction of variance of ratio estimate to srs

1-var(agpop$acres92-B*agpop$acres87)/var(agpop$acres92)
## [1] 0.9917355
# linear model output

summary(lm(agpop$acres92 ~ agpop$acres87))
## 
## Call:
## lm(formula = agpop$acres92 ~ agpop$acres87)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -642399   -8256    -692    5593  656458 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2.121e+03  8.648e+02  -2.453   0.0142 *  
## agpop$acres87  9.878e-01  1.626e-03 607.388   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38560 on 3057 degrees of freedom
## Multiple R-squared:  0.9918, Adjusted R-squared:  0.9918 
## F-statistic: 3.689e+05 on 1 and 3057 DF,  p-value: < 2.2e-16

3.2 Simulation Studies

nsim <- 2000
sim_rat <- data.frame (
    simple = rep (0, nsim), ratio = rep (0, nsim), B = rep (0, nsim))

for (i in 1:nsim)
{
  srs <- sample (N,n)
  y_srs <- agpop[srs, "acres92"]
  x_srs <- agpop[srs, "acres87"]

  # SRS estimate
  sim_rat$simple [i] <- mean (y_srs) * N 
  # ratio estimate
  sim_rat$B [i] <- mean (y_srs) / mean (x_srs)
  sim_rat$ratio [i] <-  sim_rat$B [i] * txU
}

sim_rat[1:20,]
boxplot (sim_rat [, 1:2])
abline (h = tyU, col = "red")

sim_var <- sapply (sim_rat[,1:2], var)

ratio_sim_var <- sim_var/sim_var[1]
percentage_reduction <- 1-ratio_sim_var
data.frame (sim_var, ratio_sim_var,percentage_reduction)

4 Estimating Domain Means and Post-stratification with agsrs.csv example

4.1 Estimating Domain Means

agsrs <- read.csv("data/agsrs.csv")
n <- nrow (agsrs)
n
## [1] 300
## estimating domain means
region_agsrs_domain <- data.frame(rbind(
    NC = srs_domain_mean_est(agsrs$acres92[agsrs$region=="NC"], n=n, N = 3078),
    NE = srs_domain_mean_est(agsrs$acres92[agsrs$region=="NE"], n=n,N = 3078),
    S = srs_domain_mean_est(agsrs$acres92[agsrs$region=="S"], n=n,N = 3078),
    W = srs_domain_mean_est(agsrs$acres92[agsrs$region=="W"], n=n,N = 3078))
)
region_agsrs_domain
## Compared to the results of naively applying SRS estimation to each domain
region_agsrs_srs <- data.frame(rbind(
    NC = srs_mean_est(agsrs$acres92[agsrs$region=="NC"], N = 1054),
    NE = srs_mean_est(agsrs$acres92[agsrs$region=="NE"], N = 220),
    S = srs_mean_est(agsrs$acres92[agsrs$region=="S"], N = 1382),
    W = srs_mean_est(agsrs$acres92[agsrs$region=="W"], N = 422))
)
region_agsrs_srs

4.2 Post-stratification Analysis

nh <- tapply (1:nrow(agsrs), agsrs[,"region"], length)
n <- sum (nh)
n
## [1] 300
sh <- tapply (agsrs[, "acres92"], agsrs[,"region"], sd)
ybarh <- tapply (agsrs[, "acres92"], agsrs[,"region"], mean)
# create a vector with external information
Nh <- c(NC = 1054, NE = 220, S= 1382, W = 422)

Create nh proportional to Nh, instead of using observed nh

nh_post <- Nh/sum(Nh)* n

## regional summary for stratified sampling estimation
data.frame(cbind(ybarh, sh, nh, prop_obs=nh/sum(nh), Nh, nh_post, prop_post = nh_post/sum(nh_post)))
## strata mean estimate
str_mean_estimate (ybarh, sh, nh_post, Nh)
##      Est.      S.E.    ci.low    ci.upp 
## 300051.69  17760.26 265241.58 334861.79
## compare with SRS estimate

srs_mean_est(agsrs$acres92, N = 3087)
##      Est.      S.E.    ci.low    ci.upp 
## 297897.05  18901.41 260700.40 335093.69
## true mean
mean(agpop$acres92)
## [1] 308582.4

5 Estimating Domain Means and Post-stratification with teacher example

5.1 Importing Data

teacher <- read.csv ("data/college_teacher.csv")
head(teacher)
table(teacher[,2:1])
##       teacher
## gender   0   1
##      1 156  84
##      2 120  40
addmargins(table(teacher[,2:1]), 2)
##       teacher
## gender   0   1 Sum
##      1 156  84 240
##      2 120  40 160
prop.table(table(teacher[,2:1]), margin = "gender")
##       teacher
## gender    0    1
##      1 0.65 0.35
##      2 0.75 0.25

5.2 Domain Summary

n <- nrow (teacher)
yh <- tapply (teacher$teacher, INDEX = teacher$gender, FUN = mean); yh
##    1    2 
## 0.35 0.25
sh <- tapply (teacher$teacher, INDEX = teacher$gender, FUN = sd); sh
##         1         2 
## 0.4779664 0.4343722
nh_obs <- tapply (teacher$teacher, INDEX = teacher$gender, FUN = length)
prop_h_obs <- nh_obs/sum (nh_obs); prop_h_obs
##   1   2 
## 0.6 0.4
data.frame(yh, sh, nh_obs, prop_h_obs)

5.3 Estimating Domain Means

Nh <- c(3000, 1000)
n <- nrow(teacher)
data.frame(rbind(
    female=srs_domain_mean_est(subset(teacher, gender==1)$teacher, n=n, N=4000),
    male = srs_domain_mean_est(subset(teacher, gender==2)$teacher, n=n, N=4000)
))

5.4 Post-stratification Analysis

Nh <- c(3000, 1000)
prop_h_post <- Nh/sum (Nh); prop_h_post
## [1] 0.75 0.25

Create nh proportional to Nh, instead of using observed nh

nh_post <- Nh/sum (Nh) * n

## show the differences
data.frame(p_hat = yh, sh, nh_obs, prop_obs=prop_h_obs, nh_post, prop_post=prop_h_post)
## poststratification estimation of mean
str_mean_estimate (yh, sh, nh_post, Nh)
##       Est.       S.E.     ci.low     ci.upp 
## 0.32500000 0.02217306 0.28154080 0.36845920

5.5 Comparing to the Analysis with SRS

srs_mean_est (teacher$teacher, N=sum(Nh))
##       Est.       S.E.     ci.low     ci.upp 
## 0.31000000 0.02196545 0.26681751 0.35318249

6 Regression Estimation for the cherry.csv dataset

6.1 Importing data

cherry <- read.csv ("data/cherry.csv", header = T)
ydata <- cherry$volume 
xdata <- cherry$diameter
t_diameters <- 41835
xbarU <- t_diameters/2967 
N <- 2967

6.2 Step-by-step calculation

6.2.1 Fitting a linear regression model

n <- length (ydata)
lmfit <- lm (ydata ~ xdata)
summary (lmfit)
## 
## Call:
## lm(formula = ydata ~ xdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.065 -3.107  0.152  3.495  9.587 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -36.9435     3.3651  -10.98 7.62e-12 ***
## xdata         5.0659     0.2474   20.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.252 on 29 degrees of freedom
## Multiple R-squared:  0.9353, Adjusted R-squared:  0.9331 
## F-statistic: 419.4 on 1 and 29 DF,  p-value: < 2.2e-16
plot (xdata, ydata)
abline (lmfit)

Bhat <- lmfit$coefficients
efit <- ydata - (Bhat[1] + Bhat[2] * xdata)
data.frame (cherry, residual=efit) ## for visualization
SSe <- sum (efit^2) / (n - 2) 
SSe
## [1] 18.0794
## SSe can be obtained directly from lm fitting output
anova(lmfit)

6.2.2 Estimate the mean

yhat_reg <- Bhat[1] + Bhat[2] * xbarU
se_yhat_reg <- sqrt ((1-n/N) * SSe / n)
mem <- qt (0.975, df = n - 2) * se_yhat_reg
output <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
##       Est.       S.E.     ci.low     ci.upp 
## 34.4856287  0.7596795 32.9319097 36.0393476

6.2.3 Estimate the total

output * N 
##       Est.       S.E.     ci.low     ci.upp 
## 102318.860   2253.969  97708.976 106928.744

6.3 Regression estimation Using the function

6.3.1 Estimating the mean

reg_mean_volume <- srs_reg_est(ydata = cherry$volume, xdata = cherry$diameter, 
                 xbarU=t_diameters/2967, N = 2967) 
reg_mean_volume
##       Est.       S.E.     ci.low     ci.upp 
## 34.4856287  0.7596795 32.9319097 36.0393476

6.3.2 Estimating the total

t_diameters <- 41835
srs_reg_est(ydata = cherry$volume, xdata = cherry$diameter, 
                 xbarU=t_diameters/2967, N = 2967, est.total = TRUE)
##       Est.       S.E.     ci.low     ci.upp 
## 102318.860   2253.969  97708.976 106928.744

6.3.3 Percentage of Variance Reduction

cat(1-(reg_mean_volume[2]/srs_mean_volume[2])^2)
## 0.9330895

7 Regression estimation for photo counts of dead trees

To estimate the number of dead trees in an area, we divide the area into 100 square plots and count the number of dead trees on a photograph of each plot. Photo counts can be made quickly, but sometimes a tree is misclassified or not detected. So we select an SRS of 25 of the plots for field counts of dead trees. We know that the population mean number of dead trees per plot from the photo count is 11.3.

photocounts <- c (10,12,7, 13,13, 6,17, 16, 15, 10, 14, 12, 10, 5,12, 10,
10, 9, 6, 11, 7, 9, 11, 10, 10)

fieldcounts <- c (15, 14, 9, 14, 8, 5, 18, 15, 13, 15, 11, 15, 12, 8, 13,
9, 11, 12, 9, 12, 13, 11, 10, 9, 8)


lmfit <- lm (fieldcounts ~ photocounts)
summary (lmfit)
## 
## Call:
## lm(formula = fieldcounts ~ photocounts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0319 -1.8053  0.1947  1.4212  3.8080 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.0593     1.7635   2.869 0.008676 ** 
## photocounts   0.6133     0.1601   3.832 0.000854 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.406 on 23 degrees of freedom
## Multiple R-squared:  0.3896, Adjusted R-squared:  0.3631 
## F-statistic: 14.68 on 1 and 23 DF,  p-value: 0.0008538
plot (photocounts, fieldcounts)
abline (lmfit)

# estimate for the mean number of dead trees per plot
srs_reg_est (ydata = fieldcounts, xdata = photocounts, xbarU = 11.3, N = 100)
##       Est.       S.E.     ci.low     ci.upp 
## 11.9892920  0.4167579 11.1271625 12.8514215
# estimate for the total number of dead trees in the area
srs_reg_est (ydata = fieldcounts, xdata = photocounts, xbarU = 11.3, N = 100, est.total = TRUE)
##       Est.       S.E.     ci.low     ci.upp 
## 1198.92920   41.67579 1112.71625 1285.14215
# compare to simple estimate
srs_mean_est (sdata = fieldcounts, N = 100)
##       Est.       S.E.     ci.low     ci.upp 
## 11.5600000  0.5222069 10.4822180 12.6377820
# compare to ratio estimate 
srs_ratio_est (ydata = fieldcounts, xdata = photocounts, N = 100) * 11.3
##       Est.       S.E.     ci.low     ci.upp 
## 12.3233962  0.5121485 11.2663736 13.3804189