1 Functions and packages for Analyzing Data
2 Analysis of “agstrat.csv” data
3 Allocation of stratum sample size
- 3.1 Analyzing seals.csv collected with stratified sampling
- 3.2 Neyman allocation of stratum sample size
4 Simulation to Study the Efficiency of Stratified Sampling with Different Allocation
- 4.1 Using “Region” to Stratify
- 4.2 Using “acres82” to Define Strata

1 Functions and packages for Analyzing Data

library(latex2exp)
library(sampling)
library(plyr)
## this function finds statistical estimates given a dataset with sampling weight
# stratdata --- data.frame containing stratified sample
# y --- name of variable for which we want to estiamte population mean
# stratum --- name of variable that will be used as stratum variable
# weight --- name of variable indicating sampling weight
# post = TRUE is to indicate that we will do post-stratification analysis (ignoring it for the moment)
# note: from weights we can find Nh (see the code for formula)
str_mean_estimate_data <- function (stratdata, y, stratum, weight)
{
    ## compute stratum-wise data
    n <- nrow (stratdata)
    sh <- tapply (stratdata[, y], stratdata[,stratum], sd)
    ybarh <- tapply (stratdata[, y], stratdata[,stratum], mean)
    ## find population stratum size using sampling weight included in the data set
    Nh <- tapply (stratdata[, weight], stratdata[,stratum], sum)
    nh <- tapply (1:nrow(stratdata), stratdata[,stratum], length)
    ## find mean estimates
    N <- sum (Nh)
    Pi_h <- Nh/N
    ybar <- sum(ybarh * Pi_h)
    seybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
    mem <- 1.96 * seybar
    c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
    
}


## to find total, multiply N to the estimate returned by this function
## for poststratification, use nh = n * Nh/N
str_mean_estimate <- function (ybarh, sh, nh, Nh)
{
    N <- sum (Nh)
    Pi_h <- Nh/N
    ybar <- sum(ybarh * Pi_h)
    seybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
    mem <- 1.96 * seybar
    c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}



## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function

srs_mean_est <- function (sdata, N = Inf)
{
    n <- length (sdata)
    ybar <- mean (sdata)
    se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)  
    mem <- qt (0.975, df = n - 1) * se.ybar
    c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}

2 Analysis of “agstrat.csv” data

2.1 Importing agstrat.csv data

agstrat <- read.csv("data/agstrat.csv")
agstrat

2.2 Spreadsheet calculation

2.2.1 Summarizing acre92 in each stratum

nh <- tapply (agstrat[, "acres92"], agstrat[,"region"], length)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
ybarh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
# create a vector with external information
Nh <- c(NC = 1054, NE = 220, S= 1382, W = 422)

data.frame (Nh, nh, ybarh, sh)

2.2.2 Estimates

N <- sum (Nh)
pi_h <- Nh/N
weighted_ybar_h <- ybarh * pi_h
var_ybarh <- (1-nh/Nh)*sh^2/nh
var_h=var_ybarh*pi_h^2

workingtable <- data.frame ("$N_h$"=Nh, 
                            "$n_h$"=nh,  
                            "$\\bar y_h$"= ybarh, 
                            "$s_h$"= sh, 
                            "$s^2_h$" = sh^2, 
                            "$\\pi_h={N_h/N}$"= pi_h,
                            "$\\bar y_h\\cdot \\pi_h$"= weighted_ybar_h,
                            "${\\hat V} (\\bar y_h)=(1-n_h/N_h)s^2_h/n_h$"=var_ybarh,
                            "${\\hat V} (\\bar y_h)\\cdot \\pi^2_h$"=var_h,
                            check.names = FALSE)

workingtable <- rbind (workingtable, colSums(workingtable))
row.names(workingtable)[5] <- "Sum"
library(knitr)
library(kableExtra)
kableExtra::kable(workingtable, escape=FALSE)

	\(N_h\)	\(n_h\)	\(\bar y_h\)	\(s_h\)	\(s^2_h\)	\(\pi_h={N_h/N}\)	\(\bar y_h\cdot \pi_h\)	\({\hat V} (\bar y_h)=(1-n_h/N_h)s^2_h/n_h\)	\({\hat V} (\bar y_h)\cdot \pi^2_h\)
NC	1054	103	300504.16	172099.34	29618183543	0.3424301	102901.683	259454437	30423214
NE	220	21	97629.81	87449.83	7647472708	0.0714750	6978.089	329404127	1682818
S	1382	135	211315.04	231489.71	53587487856	0.4489929	94878.945	358169038	72204937
W	422	41	662295.51	629433.04	396185950266	0.1371020	90802.049	8724242692	163989261
Sum	3078	300	1271744.52	1120471.92	487039094374	1.0000000	295560.765	9671270293	268300231

ybar <- sum(weighted_ybar_h); ybar

## [1] 295560.8

var_ybar <- sum((1-nh/Nh)*pi_h^2*sh^2/nh); var_ybar

## [1] 268300231

seybar <- sqrt(var_ybar)
mem <- 1.96 * seybar
c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)

##      Est.      S.E.    ci.low    ci.upp 
## 295560.77  16379.87 263456.21 327665.32

2.3 Using the function “str_mean_estimate”

nh <- tapply (agstrat[, "acres92"], agstrat[,"region"], length)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
ybarh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
# create a vector with external information
Nh <- c(NC = 1054, NE = 220, S= 1382, W = 422)

## strata mean estimate
str_mean_estimate (ybarh, sh, nh, Nh)

##      Est.      S.E.    ci.low    ci.upp 
## 295560.77  16379.87 263456.21 327665.32

## population total estimate 
str_mean_estimate (ybarh, sh, nh, Nh) * sum (Nh)

##       Est.       S.E.     ci.low     ci.upp 
##  909736035   50417248  810918229 1008553842

2.4 Using the function “str_mean_estimate_data”

## In the fucntion "str_mean_estimate_data", we can find the stratum size with the variable "weights":
tapply (agstrat$weight, agstrat$region, sum)

##   NC   NE    S    W 
## 1054  220 1382  422

## if the dataset contains a variable "weight"
str_mean_estimate_data (agstrat, y="acres92", stratum="region", weight="weight")

##      Est.      S.E.    ci.low    ci.upp 
## 295560.77  16379.87 263456.21 327665.32

## estimating the mean of the number of small farms in 1992
str_mean_estimate_data (agstrat, y="smallf92", stratum="region", weight="weight")

##      Est.      S.E.    ci.low    ci.upp 
## 56.862794  7.201417 42.748016 70.977572

2.5 Comparing with SRS estimate

agsrs <- read.csv ("data/agsrs.csv")
srs_mean_est(agsrs[, "acres92"], N=3078)

##      Est.      S.E.    ci.low    ci.upp 
## 297897.05  18898.43 260706.26 335087.84

## the ratio of estimated variance
(16379.87/18898.43)^2

## [1] 0.751224

## percentage of reduction of variance of str estimates from that of SRS estimate
1- (16379.87/18898.43)^2

## [1] 0.248776

3 Allocation of stratum sample size

3.1 Analyzing seals.csv collected with stratified sampling

Lydersen and Ryg (1991) used stratification techniques to estimate ringed seal pop- ulations in Svalbard fjords. The 200 km2 study area was divided into three zones: Zone 1, outer Sassenfjorden, was covered with relatively new ice during the study period in March, 1990, and had little snow cover; Zone 3, Tempelfjorden, had a stable ice cover throughout the year; Zone 2, inner Sassenfjorden, was intermediate between the stable Zone 3 and the unstable Zone 1. Ringed seals need good ice to establish territories with breathing holes, and snow cover enables females to dig out birth lairs. Thus, it was thought that the three zones would have different seal densities. The investigators took a stratified random sample of 20% of the 200 1-km2 areas.

###### ########

## load data
seals <- read.csv("data/seals.csv")
seals

# survey data summary in each stratum
nh <- as.vector(table(seals[,"zone"]))
sh <- tapply (seals[, "holes"], seals[,"zone"], sd)
ybarh <- tapply (seals[, "holes"], seals[,"zone"], mean)
Nh <- c(68,84,48)
data.frame (Nh,nh, ybarh, sh)

str_mean_estimate (ybarh, sh, nh, Nh)

##      Est.      S.E.    ci.low    ci.upp 
## 4.9859091 0.5901322 3.8292499 6.1425683

3.2 Neyman allocation of stratum sample size

## assuming the cost of surving each zone is the same
Ch <- rep (1, 3)
Sh <- sh ## estimate Sh with sh
NhShDCh <- Nh * Sh / sqrt (Ch)

## the optimal allocation scheme for estimating the population total or means holes:
Lh <- NhShDCh / sum (NhShDCh)
data.frame(Ch,Nh, Sh,NhShDCh, Lh)

4 Simulation to Study the Efficiency of Stratified Sampling with Different Allocation

4.1 Using “Region” to Stratify

4.1.1 Read Population Data

agpop <- read.csv ("data/agpop.csv")
agpop <- agpop[agpop$acres92 != -99, ] ## remove those counties with na
N <- nrow(agpop)

4.1.2 Define Stratum Variable

agpop$stratum <- agpop$region
## reorder agpop for the ease of using strata of "sampling" (very important)
agpop <- agpop[order (agpop$stratum), ]

Look at variance decomposition

boxplot (acres92 ~ stratum, data = agpop)

anova(lm (agpop$acres92~agpop$stratum))

Look at the \(R^2\) of predicting “acres92” with “region”.

summary(lm (agpop$acres92~agpop$stratum))$r.squared

## [1] 0.1821031

4.1.3 Stratified Sampling with P2S allocation

# doing one stratified sampling
Nh <- tapply (1:nrow(agpop), agpop$stratum, length) 
nh <- round(Nh/sum (Nh)*300)
data.frame(Nh,nh)

nh

##  NC  NE   S   W 
## 103  21 135  41

## doing stratified sampling
strsample <- strata (agpop, "stratum", size = nh,  method = "srswor")
strsample

## checking sampling results
table (strsample [,1])

## 
##  NC  NE   S   W 
## 103  21 135  41

# collecting data on sampled counties
agstrat <- agpop [strsample$ID_unit, ]
agstrat$weight <- 1/strsample$Prob

str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")

##      Est.      S.E.    ci.low    ci.upp 
## 308959.85  17061.56 275519.20 342400.50

Estimate with spreadsheet method for a double checking

## double checking the estimates
nh <- tapply (agstrat[, "acres92"], agstrat[,"region"], length)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
ybarh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
# create a vector with external information
Nh <- tapply (1:nrow(agpop), agpop[,"region"], length)
## strata mean estimate
str_mean_estimate (ybarh, sh, nh, Nh)

##      Est.      S.E.    ci.low    ci.upp 
## 308959.85  17061.56 275519.20 342400.50

4.1.4 Repeat stratified sampling with P2S allocation 2000 times

Nh <- tapply (1:nrow (agpop), agpop$stratum, length) 
nh <- round(Nh/sum (Nh)*300) ## make sure the order matches Nh

nres <- 2000
str_p2s_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
    ## doing stratified sampling
    strsample <- strata (agpop, "stratum", size = nh,  method = "srswor")
    agstrat <- agpop [strsample$ID_unit, ]
    agstrat$weight <- 1/strsample$Prob
    ## analyziing data
    str_p2s_simulated[i,] <- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
    
}

4.1.5 Repeat stratified sampling with optimal allocation 2000 times

Nh <- tapply (agpop$acres92, agpop$stratum, length) 

Sh <- tapply (agpop$acres92, agpop$stratum, sd)
nh_opt <- round((Nh*Sh)/sum (Nh*Sh) * 300);nh_opt

##  NC  NE   S   W 
##  87   5 102 106

data.frame(Nh,Sh,nh_opt)

nres <- 2000
str_neyman_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
    ## doing stratified sampling
    strsample <- strata (agpop, "stratum", size = nh_opt,  method = "srswor")
    # collecting data on sampled counties
    agstrat <- agpop [strsample$ID_unit, ]
    agstrat$weight <- 1/strsample$Prob
    
    str_neyman_simulated[i,] <- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
    
}

4.1.6 Repeat simple random sampling 2000 times

nres <- 2000
srs_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
    ## doing stratified sampling
    srs <- sample (sum(Nh), sum (nh))
    # collecting data on sampled counties
    agsrs <- agpop [srs, ]
    srs_simulated[i,] <- srs_mean_est (agsrs[, "acres92"], N= sum(Nh))
    
}

4.1.7 Compare the efficiency of different methods

sim_results_str_region <- data.frame("SRS"=srs_simulated[,1], 
                      "Prop2size"=str_p2s_simulated[,1], 
                      "Neyman"=str_neyman_simulated[,1])
boxplot (sim_results_str_region)
abline (h = mean (agpop$acres92), col = "red")

sapply (sim_results_str_region, mean) -> sim_means
sapply (sim_results_str_region, var) -> sim_var

sim_var/sim_var[1] -> sim_var_relative

#Percentage of reduction of variances of estimates compared to SRS

1-sim_var/sim_var[1] -> sim_var_reduction

data.frame("Mean"=sim_means, "Variance"=sim_var, "Relative Variance"= sim_var_relative, "Percentage of Variance Reduction"=sim_var_reduction, check.names = FALSE)

4.2 Using “acres82” to Define Strata

4.2.1 Read Population Data

agpop <- read.csv ("data/agpop.csv")
agpop <- agpop[agpop$acres92 != -99, ] ## remove those counties with na
N <- nrow(agpop)

4.2.2 Define Stratum Variable with Quantiles of “acres82”

plot (acres92~acres82, data = agpop)

summary (lm(acres92~acres82, data = agpop))

## 
## Call:
## lm(formula = acres92 ~ acres82, data = agpop)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -786951  -11680    -548    8907  804356 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.142e+03  1.159e+03  -6.165 7.99e-10 ***
## acres82      9.813e-01  2.157e-03 455.015  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51310 on 3057 degrees of freedom
## Multiple R-squared:  0.9854, Adjusted R-squared:  0.9854 
## F-statistic: 2.07e+05 on 1 and 3057 DF,  p-value: < 2.2e-16

agpop$stratum <- cut(agpop$acres82,
                     breaks = quantile (agpop$acres82, 
                                        probs = c(0,0.25,0.75,0.95,1)),
                     include.lowest =  T)
agpop$stratum <- mapvalues(agpop$stratum, 
                          from = (sort(unique (agpop$stratum))), 
                          to= paste0("acres82",c("Q1", "Q2", "Q3", "Q4")))
agpop <- agpop[order(agpop$stratum), ]