1 Functions and packages for Analyzing Data
- 1.1 General functions for ratio and regression estimation
- 1.2 Post-stratification
2 Ratio Estimation for cherry.csv dataset
3 Simulation study with agpop.csv
- 3.1 Information of population data
- 3.2 Simulation Studies
4 Estimating Domain Means and Post-stratification with agsrs.csv example
- 4.1 Estimating Domain Means
- 4.2 Post-stratification Analysis
5 Estimating Domain Means and Post-stratification with teacher example
6 Regression Estimation for the cherry.csv dataset
7 Regression estimation for photo counts of dead trees

1 Functions and packages for Analyzing Data

1.1 General functions for ratio and regression estimation

## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## xbarU --- population mean of auxilliary variable

## the output is the estimate mean or total (est.total=TRUE)
srs_reg_est <- function (ydata, xdata, xbarU, N = Inf, est.total = FALSE)
{
  n <- length (ydata)
  lmfit <- lm (ydata ~ xdata)
  Bhat <- lmfit$coefficients
  efit <- lmfit$residuals
  SSe <- sum (efit^2) / (n - 2) 
  yhat_reg <- Bhat[1] + Bhat[2] * xbarU
  se_yhat_reg <- sqrt ((1-n/N) * SSe / n)
  mem <- qt (0.975, df = n - 2) * se_yhat_reg
  output <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
  
  if (est.total) {
      if(!is.finite(N)) stop("N must be finite for estimating population total" )
      output <- output * N
  }

  names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
  output
}


## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size

## the output is the ratio of ybarU/xbarU
srs_ratio_est <- function (ydata, xdata, N = Inf)
{   
  n <- length (xdata)
  xbar <- mean (xdata)
  ybar <- mean (ydata)
  B_hat <- ybar / xbar
  d <- ydata - B_hat * xdata
  var_d <- sum (d^2) / (n - 1)
  sd_B_hat <- sqrt ((1 - n/N) * var_d / n) / xbar
  mem <- qt (0.975, df = n - 1) * sd_B_hat
  output <- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
  
  
  names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
  output
}

## sdata --- a vector of original survey data in a domain
## N --- population size
## n --- total sample size (not the sample size in the domain)
## to find total, multiply domain size N_d to the estimate returned by this function
srs_domain_mean_est <- function (sdata, n, N = Inf)
{
    n_d <- length (sdata)
    ybar <- mean (sdata)
    se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n_d)  
    mem <- qt (0.975, df = n_d - 1) * se.ybar
    c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}

## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function

srs_mean_est <- function (sdata, N = Inf)
{
    n <- length (sdata)
    ybar <- mean (sdata)
    se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)  
    mem <- qt (0.975, df = n - 1) * se.ybar
    c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}

1.2 Post-stratification

For poststratification, plug in \(n_h = n * {N_h \over N}\) in the estimator for a real stratified sample

## to find total, multiply N to the estimate returned by this function
str_mean_estimate <- function (ybarh, sh, nh, Nh)
{
    N <- sum (Nh)
    Pi_h <- Nh/N
    ybar <- sum(ybarh * Pi_h)
    seybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
    mem <- 1.96 * seybar
    c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}

2 Ratio Estimation for cherry.csv dataset

2.1 Importing data

cherry <- read.csv ("data/cherry.csv", header = T)
cherry

plot (cherry$volume ~ cherry$diameter)

summary (lm(cherry$volume ~ 0+cherry$diameter))

## 
## Call:
## lm(formula = cherry$volume ~ 0 + cherry$diameter)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.104  -8.470  -6.199   1.883  27.129 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## cherry$diameter   2.4209     0.1253   19.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.493 on 30 degrees of freedom
## Multiple R-squared:  0.9256, Adjusted R-squared:  0.9231 
## F-statistic: 373.1 on 1 and 30 DF,  p-value: < 2.2e-16

2.2 SRS estimate

N <- 2967
## estimating the mean of volume
srs_mean_volume <- srs_mean_est(cherry$volume, N = N)
srs_mean_volume

##      Est.      S.E.    ci.low    ci.upp 
## 30.170968  2.936861 24.173098 36.168837

## estimating the total of volume
srs_total_volume <- srs_mean_est(cherry$volume, N = N) * N
srs_total_volume

##       Est.       S.E.     ci.low     ci.upp 
##  89517.261   8713.665  71721.583 107312.940

2.3 Step-by-step calculation for Ratio Estimation

2.3.1 Estimating B and calculating residuals

## input
ydata <- cherry$volume
xdata <- cherry$diameter
N <- 2967

## calculation
n <- length (xdata)
xbar <- mean (xdata)
ybar <- mean (ydata)
B_hat <- ybar / xbar ## ratio estimate


plot (cherry$volume ~ cherry$diameter)
abline (a = 0, b = B_hat)

d <- ydata - B_hat * xdata ## errors
knitr::kable(data.frame ("$y_i$"=cherry$volume, 
                         "$x_i$"=cherry$diameter,
                         "$\\hat y_i=\\hat B\\times x_i$"=B_hat*xdata,
                         "$e_i=y_i - \\hat y_i$" = d,
                         check.names = FALSE))

\(y_i\)	\(x_i\)	\(\hat y_i=\hat B\times x_i\)	\(e_i=y_i - \hat y_i\)
10.3	8.3	18.90185	-8.6018505
10.3	8.6	19.58505	-9.2850499
10.2	8.8	20.04052	-9.8405162
16.4	10.5	23.91198	-7.5119795
18.8	10.7	24.36745	-5.5674458
19.7	10.8	24.59518	-4.8951790
15.6	11.0	25.05065	-9.4506452
18.2	11.0	25.05065	-6.8506452
22.6	11.1	25.27838	-2.6783784
19.9	11.2	25.50611	-5.6061115
24.2	11.3	25.73384	-1.5338447
21.0	11.4	25.96158	-4.9615778
21.4	11.4	25.96158	-4.5615778
21.3	11.7	26.64478	-5.3447772
19.1	12.0	27.32798	-8.2279766
22.2	12.9	29.37757	-7.1775749
33.8	12.9	29.37757	4.4224251
27.4	13.3	30.28851	-2.8885074
25.7	13.7	31.19944	-5.4994400
24.9	13.8	31.42717	-6.5271731
34.5	14.0	31.88264	2.6173606
31.7	14.2	32.33811	-0.6381057
36.3	14.5	33.02131	3.2786949
38.3	16.0	36.43730	1.8626978
42.6	16.3	37.12050	5.4794984
55.4	17.3	39.39783	16.0021670
55.7	17.5	39.85330	15.8467008
58.3	17.9	40.76423	17.5357682
51.5	18.0	40.99196	10.5080351
51.0	18.0	40.99196	10.0080351
77.0	20.6	46.91303	30.0869735

2.3.2 Estimating SE of B

## estimating S^2_e
var_d <- sum (d^2) / (n - 1) ## variance of errors
sd_B_hat <- sqrt ((1 - n/N) * var_d / n) / xbar ## SE for B
mem <- qt (0.975, df = n - 1) * sd_B_hat ## margin error for B

## output
output_B <- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
names (output_B) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output_B

##     Est.     S.E.   ci.low   ci.upp 
## 2.277331 0.130786 2.010231 2.544432

2.3.3 Estimating the mean volume of wood

mean_diameters <- 41835/N
output_B * mean_diameters

##      Est.      S.E.    ci.low    ci.upp 
## 32.110603  1.844097 28.344455 35.876750

2.3.4 Estimating the total volume of wood

t_diameters <- 41835
output_B * t_diameters

##       Est.       S.E.     ci.low     ci.upp 
##  95272.159   5471.434  84097.999 106446.318

2.4 Ratio estimation with a function

2.4.1 Estimating the ratio of volume to diameter

B_v2d <- srs_ratio_est (ydata = cherry$volume, xdata = cherry$diameter,  N = 2967)
B_v2d

##     Est.     S.E.   ci.low   ci.upp 
## 2.277331 0.130786 2.010231 2.544432

2.4.2 Estimating the mean of volume

xbarU <- 41835/N
ratio_mean_volume <- srs_ratio_est (ydata = cherry$volume, xdata = cherry$diameter, N = 2967) * xbarU
ratio_mean_volume

##      Est.      S.E.    ci.low    ci.upp 
## 32.110603  1.844097 28.344455 35.876750

2.4.3 Estimating the total of volume

total_diameters <- 41835
srs_ratio_est (ydata = cherry$volume, xdata = cherry$diameter, N = 2967) * total_diameters

##       Est.       S.E.     ci.low     ci.upp 
##  95272.159   5471.434  84097.999 106446.318

2.4.4 Percentage of Variance Reduction

cat(1-(ratio_mean_volume[2]/srs_mean_volume[2])^2)

## 0.6057237

3 Simulation study with agpop.csv

3.1 Information of population data

agpop <- read.csv ("data/agpop.csv")
agpop <- agpop[agpop$acres92 != -99, ] ## remove those counties with na

# sample size
n <- 300
# population size
N <- nrow (agpop)

# true values that we want to estimate
tyU <- sum (agpop [,"acres92"])
# suppose known for ratio estimate
txU <- sum (agpop [,"acres87"]) 
B <- tyU/txU
plot (agpop [, "acres87"], agpop [, "acres92"])
abline(a=0,b=B)

# expected reduction of variance of ratio estimate to srs

1-var(agpop$acres92-B*agpop$acres87)/var(agpop$acres92)

## [1] 0.9917355

# linear model output

summary(lm(agpop$acres92 ~ agpop$acres87))

## 
## Call:
## lm(formula = agpop$acres92 ~ agpop$acres87)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -642399   -8256    -692    5593  656458 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2.121e+03  8.648e+02  -2.453   0.0142 *  
## agpop$acres87  9.878e-01  1.626e-03 607.388   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38560 on 3057 degrees of freedom
## Multiple R-squared:  0.9918, Adjusted R-squared:  0.9918 
## F-statistic: 3.689e+05 on 1 and 3057 DF,  p-value: < 2.2e-16

3.2 Simulation Studies

nsim <- 2000
sim_rat <- data.frame (
    simple = rep (0, nsim), ratio = rep (0, nsim), B = rep (0, nsim))

for (i in 1:nsim)
{
  srs <- sample (N,n)
  y_srs <- agpop[srs, "acres92"]
  x_srs <- agpop[srs, "acres87"]

  # SRS estimate
  sim_rat$simple [i] <- mean (y_srs) * N 
  # ratio estimate
  sim_rat$B [i] <- mean (y_srs) / mean (x_srs)
  sim_rat$ratio [i] <-  sim_rat$B [i] * txU
}

sim_rat[1:20,]

boxplot (sim_rat [, 1:2])
abline (h = tyU, col = "red")

sim_var <- sapply (sim_rat[,1:2], var)

ratio_sim_var <- sim_var/sim_var[1]
percentage_reduction <- 1-ratio_sim_var
data.frame (sim_var, ratio_sim_var,percentage_reduction)

4 Estimating Domain Means and Post-stratification with agsrs.csv example

4.1 Estimating Domain Means

agsrs <- read.csv("data/agsrs.csv")
n <- nrow (agsrs)
n

## [1] 300

## estimating domain means
region_agsrs_domain <- data.frame(rbind(
    NC = srs_domain_mean_est(agsrs$acres92[agsrs$region=="NC"], n=n, N = 3078),
    NE = srs_domain_mean_est(agsrs$acres92[agsrs$region=="NE"], n=n,N = 3078),
    S = srs_domain_mean_est(agsrs$acres92[agsrs$region=="S"], n=n,N = 3078),
    W = srs_domain_mean_est(agsrs$acres92[agsrs$region=="W"], n=n,N = 3078))
)
region_agsrs_domain

## Compared to the results of naively applying SRS estimation to each domain (wrong analysis)
region_agsrs_srs <- data.frame(rbind(
    NC = srs_mean_est(agsrs$acres92[agsrs$region=="NC"], N = 1054),
    NE = srs_mean_est(agsrs$acres92[agsrs$region=="NE"], N = 220),
    S = srs_mean_est(agsrs$acres92[agsrs$region=="S"], N = 1382),
    W = srs_mean_est(agsrs$acres92[agsrs$region=="W"], N = 422))
)
region_agsrs_srs

4.2 Post-stratification Analysis

nh <- tapply (agsrs[, "acres92"], agsrs[,"region"], length)
n <- sum (nh)
n

## [1] 300

sh <- tapply (agsrs[, "acres92"], agsrs[,"region"], sd)
ybarh <- tapply (agsrs[, "acres92"], agsrs[,"region"], mean)
# create a vector with external information
Nh <- c(NC = 1054, NE = 220, S= 1382, W = 422)

**Create \(n_h^{\mbox{post-strat}}=n\times \frac{N_h}{N}\), instead of using observed \(n_h\)

nh_post <- Nh/sum(Nh)* n

## regional summary for stratified sampling estimation
knitr::kable(
    data.frame(
    "${\\bar y}_h$"=ybarh, 
    "$s_h$"=sh, 
    "$n_h$"=nh, 
    "$\\pi^{\\mbox{obs}}_h=n_h/n$"=nh/n, 
    "$N_h$"=Nh, 
    "$\\pi^{\\mbox{pop}}_h=N_h/N$"=Nh/sum(Nh),  
     "$n_h^{\\mbox{post-strat}}=n\\times N_h/N$"=nh_post,
    check.names = FALSE ),
    )

	\({\bar y}_h\)	\(s_h\)	\(n_h\)	\(\pi^{\mbox{obs}}_h=n_h/n\)	\(N_h\)	\(\pi^{\mbox{pop}}_h=N_h/N\)	\(n_h^{\mbox{post-strat}}=n\times N_h/N\)
NC	323416.4	278970.0	78	0.2600000	1054	0.3424301	102.7290
NE	106549.0	129320.2	18	0.0600000	220	0.0714750	21.4425
S	246185.8	312941.3	160	0.5333333	1382	0.4489929	134.6979
W	518977.6	490842.0	44	0.1466667	422	0.1371020	41.1306

## strata mean estimate
str_mean_estimate (ybarh, sh, nh_post, Nh)

##      Est.      S.E.    ci.low    ci.upp 
## 300051.69  17760.26 265241.58 334861.79

## compare with SRS estimate

srs_mean_est(agsrs$acres92, N = 3087)

##      Est.      S.E.    ci.low    ci.upp 
## 297897.05  18901.41 260700.40 335093.69

## compare with true mean
mean(agpop$acres92)

## [1] 308582.4

5 Estimating Domain Means and Post-stratification with teacher example

5.1 Importing Data and Calculating Contingency Table

teacher <- read.csv ("data/college_teacher.csv"); teacher

table(teacher[,2:1])

##       teacher
## gender   0   1
##      1 156  84
##      2 120  40

addmargins(table(teacher[,2:1]), 2)

##       teacher
## gender   0   1 Sum
##      1 156  84 240
##      2 120  40 160

# Proportions of students who want to become a teacher in two domains (female and male)
prop.table(table(teacher[,2:1]), margin = "gender")

##       teacher
## gender    0    1
##      1 0.65 0.35
##      2 0.75 0.25

5.2 Domain Summary

n <- nrow (teacher)
ybarh <- tapply (teacher$teacher, INDEX = teacher$gender, FUN = mean); ybarh

##    1    2 
## 0.35 0.25

sh <- tapply (teacher$teacher, INDEX = teacher$gender, FUN = sd); sh

##         1         2 
## 0.4779664 0.4343722

nh <- tapply (teacher$teacher, INDEX = teacher$gender, FUN = length)
prop_h_obs <- nh/sum (nh); prop_h_obs

##   1   2 
## 0.6 0.4

knitr::kable(
    data.frame('$\\bar y_h$'=ybarh, 
               "$s_h$"=sh, 
               "$n^{\\mbox{obs}}_h$"=nh, 
               "$\\pi^{\\mbox{obs}}_h$"=prop_h_obs,
               check.names = FALSE)
)

\(\bar y_h\)	\(s_h\)	\(n^{\mbox{obs}}_h\)	\(\pi^{\mbox{obs}}_h\)
0.35	0.4779664	240	0.6
0.25	0.4343722	160	0.4

5.3 Estimating Domain Means

Nh <- c(3000, 1000)
n <- nrow(teacher)
data.frame(rbind(
    female=srs_domain_mean_est(subset(teacher, gender==1)$teacher, n=n, N=4000),
    male = srs_domain_mean_est(subset(teacher, gender==2)$teacher, n=n, N=4000)
))

5.4 Post-stratification Analysis

Nh <- c(3000, 1000)
prop_h_pop <- Nh/sum (Nh); prop_h_pop

## [1] 0.75 0.25

**Create \(n_h^{\mbox{post-strat}}=n\times \frac{N_h}{N}\), instead of using observed \(n_h\)

nh_post <- Nh/sum (Nh) * n

## show the differences
knitr::kable(
    data.frame(
    "${\\hat p}_h$"=ybarh, 
    "$s_h$"=sh, 
    "$n_h$"=nh, 
    "$\\pi^{\\mbox{obs}}_h=n_h/n$"=nh/n, 
    "$N_h$"=Nh, 
    "$\\pi^{\\mbox{pop}}_h=N_h/N$"=Nh/sum(Nh),  
     "$n_h^{\\mbox{post-strat}}=n\\times N_h/N$"=nh_post,
    check.names = FALSE )
    )

\({\hat p}_h\)	\(s_h\)	\(n_h\)	\(\pi^{\mbox{obs}}_h=n_h/n\)	\(N_h\)	\(\pi^{\mbox{pop}}_h=N_h/N\)	\(n_h^{\mbox{post-strat}}=n\times N_h/N\)
0.35	0.4779664	240	0.6	3000	0.75	300
0.25	0.4343722	160	0.4	1000	0.25	100

## poststratification estimation of mean
str_mean_estimate (ybarh, sh, nh_post, Nh)

##       Est.       S.E.     ci.low     ci.upp 
## 0.32500000 0.02217306 0.28154080 0.36845920

This post-stratification estimate can be also calculated with:

0.35*0.75+0.25*0.25

## [1] 0.325

5.5 Comparing to the Analysis with SRS

srs_mean_est (teacher$teacher, N=sum(Nh))

##       Est.       S.E.     ci.low     ci.upp 
## 0.31000000 0.02196545 0.26681751 0.35318249

This answer will be the same as using the observed (sample) gender proportions (0.6 and 0.4) to weight the \(\bar y_h\), that is,

0.35*0.6+0.25*0.4

## [1] 0.31

It is also equal to naive sample proportion:

mean (teacher$teacher==1)

## [1] 0.31

6 Regression Estimation for the cherry.csv dataset

6.1 Importing data

cherry <- read.csv ("data/cherry.csv", header = T)
ydata <- cherry$volume 
xdata <- cherry$diameter
t_diameters <- 41835
xbarU <- t_diameters/2967 
N <- 2967

6.2 Step-by-step calculation

6.2.1 Fitting a linear regression model

n <- length (ydata)
lmfit <- lm (ydata ~ xdata)
summary (lmfit)

## 
## Call:
## lm(formula = ydata ~ xdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.065 -3.107  0.152  3.495  9.587 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -36.9435     3.3651  -10.98 7.62e-12 ***
## xdata         5.0659     0.2474   20.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.252 on 29 degrees of freedom
## Multiple R-squared:  0.9353, Adjusted R-squared:  0.9331 
## F-statistic: 419.4 on 1 and 29 DF,  p-value: < 2.2e-16

plot (xdata, ydata)
abline (lmfit)

Calculation of \(\hat{y}_i=\hat{B}_0 + \hat{B}_1\times x_i\) and \(e_i=y_i-\hat{y}_i\):

Bhat <- lmfit$coefficients
yhat <- Bhat[1] + Bhat[2] * xdata
residuals <- ydata - yhat
knitr::kable(data.frame ("$x_i$"=cherry$diameter, 
                         "$y_i$"=cherry$volume,
                         "$\\hat y_i=\\hat B_0+\\hat B_1\\times x_i$"=yhat,
                         "$e_i=y_i-\\hat y_i$"=residuals,
                         check.names=FALSE)) ## for visualization

\(x_i\)	\(y_i\)	\(\hat y_i=\hat B_0+\hat B_1\times x_i\)	\(e_i=y_i-\hat y_i\)
8.3	10.3	5.103149	5.1968508
8.6	10.3	6.622906	3.6770939
8.8	10.2	7.636077	2.5639226
10.5	16.4	16.248033	0.1519667
10.7	18.8	17.261205	1.5387954
10.8	19.7	17.767790	1.9322098
11.0	15.6	18.780962	-3.1809615
11.0	18.2	18.780962	-0.5809615
11.1	22.6	19.287547	3.3124528
11.2	19.9	19.794133	0.1058672
11.3	24.2	20.300718	3.8992815
11.4	21.0	20.807304	0.1926959
11.4	21.4	20.807304	0.5926959
11.7	21.3	22.327061	-1.0270610
12.0	19.1	23.846818	-4.7468179
12.9	22.2	28.406089	-6.2060887
12.9	33.8	28.406089	5.3939113
13.3	27.4	30.432431	-3.0324313
13.7	25.7	32.458774	-6.7587739
13.8	24.9	32.965359	-8.0653595
14.0	34.5	33.978531	0.5214692
14.2	31.7	34.991702	-3.2917021
14.5	36.3	36.511459	-0.2114590
16.0	38.3	44.110244	-5.8102436
16.3	42.6	45.630001	-3.0300006
17.3	55.4	50.695857	4.7041430
17.5	55.7	51.709028	3.9909717
17.9	58.3	53.735371	4.5646292
18.0	51.5	54.241957	-2.7419565
18.0	51.0	54.241957	-3.2419565
20.6	77.0	67.413183	9.5868168

We can then estimate \(s^2_e\) by this expression:

SSe <- sum (residuals^2) / (n - 2); SSe

## [1] 18.0794

\(e_i\) and \(s^2_e\) can be obtained directly from lm fitting output

# e_i can be obtained from lm() fitting outputs too
lmfit$residuals # for e_i

##          1          2          3          4          5          6          7 
##  5.1968508  3.6770939  2.5639226  0.1519667  1.5387954  1.9322098 -3.1809615 
##          8          9         10         11         12         13         14 
## -0.5809615  3.3124528  0.1058672  3.8992815  0.1926959  0.5926959 -1.0270610 
##         15         16         17         18         19         20         21 
## -4.7468179 -6.2060887  5.3939113 -3.0324313 -6.7587739 -8.0653595  0.5214692 
##         22         23         24         25         26         27         28 
## -3.2917021 -0.2114590 -5.8102436 -3.0300006  4.7041430  3.9909717  4.5646292 
##         29         30         31 
## -2.7419565 -3.2419565  9.5868168

anova(lmfit)

The Mean Sq for “Residuals” is the \(s^2_e\).

6.2.2 Estimate the mean

yhat_reg <- Bhat[1] + Bhat[2] * xbarU
se_yhat_reg <- sqrt ((1-n/N) * SSe / n)
mem <- qt (0.975, df = n - 2) * se_yhat_reg
output <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output

##       Est.       S.E.     ci.low     ci.upp 
## 34.4856287  0.7596795 32.9319097 36.0393476

6.2.3 Estimate the total

output * N

##       Est.       S.E.     ci.low     ci.upp 
## 102318.860   2253.969  97708.976 106928.744

6.3 Regression estimation Using the function

6.3.1 Estimating the mean

reg_mean_volume <- srs_reg_est(ydata = cherry$volume, xdata = cherry$diameter, 
                 xbarU=t_diameters/2967, N = 2967) 
reg_mean_volume

##       Est.       S.E.     ci.low     ci.upp 
## 34.4856287  0.7596795 32.9319097 36.0393476

6.3.2 Estimating the total

t_diameters <- 41835
srs_reg_est(ydata = cherry$volume, xdata = cherry$diameter, 
                 xbarU=t_diameters/2967, N = 2967, est.total = TRUE)

##       Est.       S.E.     ci.low     ci.upp 
## 102318.860   2253.969  97708.976 106928.744

6.3.3 Percentage of Variance Reduction

cat(1-(reg_mean_volume[2]/srs_mean_volume[2])^2)

## 0.9330895

7 Regression estimation for photo counts of dead trees

To estimate the number of dead trees in an area, we divide the area into 100 square plots and count the number of dead trees on a photograph of each plot. Photo counts can be made quickly, but sometimes a tree is misclassiﬁed or not detected. So we select an SRS of 25 of the plots for ﬁeld counts of dead trees. We know that the population mean number of dead trees per plot from the photo count is 11.3.

photocounts <- c (10,12,7, 13,13, 6,17, 16, 15, 10, 14, 12, 10, 5,12, 10,
10, 9, 6, 11, 7, 9, 11, 10, 10)

fieldcounts <- c (15, 14, 9, 14, 8, 5, 18, 15, 13, 15, 11, 15, 12, 8, 13,
9, 11, 12, 9, 12, 13, 11, 10, 9, 8)


lmfit <- lm (fieldcounts ~ photocounts)
summary (lmfit)

## 
## Call:
## lm(formula = fieldcounts ~ photocounts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0319 -1.8053  0.1947  1.4212  3.8080 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.0593     1.7635   2.869 0.008676 ** 
## photocounts   0.6133     0.1601   3.832 0.000854 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.406 on 23 degrees of freedom
## Multiple R-squared:  0.3896, Adjusted R-squared:  0.3631 
## F-statistic: 14.68 on 1 and 23 DF,  p-value: 0.0008538

plot (photocounts, fieldcounts)
abline (lmfit)

# estimate for the mean number of dead trees per plot
srs_reg_est (ydata = fieldcounts, xdata = photocounts, xbarU = 11.3, N = 100)

##       Est.       S.E.     ci.low     ci.upp 
## 11.9892920  0.4167579 11.1271625 12.8514215

# estimate for the total number of dead trees in the area
srs_reg_est (ydata = fieldcounts, xdata = photocounts, xbarU = 11.3, N = 100, est.total = TRUE)

##       Est.       S.E.     ci.low     ci.upp 
## 1198.92920   41.67579 1112.71625 1285.14215

# compare to simple estimate
srs_mean_est (sdata = fieldcounts, N = 100)

##       Est.       S.E.     ci.low     ci.upp 
## 11.5600000  0.5222069 10.4822180 12.6377820

# compare to ratio estimate 
srs_ratio_est (ydata = fieldcounts, xdata = photocounts, N = 100) * 11.3

##       Est.       S.E.     ci.low     ci.upp 
## 12.3233962  0.5121485 11.2663736 13.3804189

Data Analysis and Simulation for Ratio and Regression Estimation

Longhai Li

November 2024