1 Functions and packages for Analyzing Data

library(latex2exp)
library(sampling)
library(plyr)
## this function finds statistical estimates given a dataset with sampling weight
# stratdata --- data.frame containing stratified sample
# y --- name of variable for which we want to estiamte population mean
# stratum --- name of variable that will be used as stratum variable
# weight --- name of variable indicating sampling weight
# post = TRUE is to indicate that we will do post-stratification analysis (ignoring it for the moment)
# note: from weights we can find Nh (see the code for formula)
str_mean_estimate_data <- function (stratdata, y, stratum, weight)
{
    ## compute stratum-wise data
    n <- nrow (stratdata)
    sh <- tapply (stratdata[, y], stratdata[,stratum], sd)
    ybarh <- tapply (stratdata[, y], stratdata[,stratum], mean)
    ## find population stratum size using sampling weight included in the data set
    Nh <- tapply (stratdata[, weight], stratdata[,stratum], sum)
    nh <- tapply (1:nrow(stratdata), stratdata[,stratum], length)
    ## find mean estimates
    N <- sum (Nh)
    Pi_h <- Nh/N
    ybar <- sum(ybarh * Pi_h)
    seybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
    mem <- 1.96 * seybar
    c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
    
}


## to find total, multiply N to the estimate returned by this function
## for poststratification, use nh = n * Nh/N
str_mean_estimate <- function (ybarh, sh, nh, Nh)
{
    N <- sum (Nh)
    Pi_h <- Nh/N
    ybar <- sum(ybarh * Pi_h)
    seybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
    mem <- 1.96 * seybar
    c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}



## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function

srs_mean_est <- function (sdata, N = Inf)
{
    n <- length (sdata)
    ybar <- mean (sdata)
    se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)  
    mem <- qt (0.975, df = n - 1) * se.ybar
    c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}

2 Analysis of “agstrat.csv” data

2.1 Importing agstrat.csv data

agstrat <- read.csv("data/agstrat.csv")
head (agstrat) 

2.2 Spreadsheet calculation

2.2.1 Summarizing acre92 in each stratum

nh <- tapply (1:nrow(agstrat), agstrat[,"region"], length)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
ybarh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
# create a vector with external information
Nh <- c(NC = 1054, NE = 220, S= 1382, W = 422)

data.frame (Nh, nh, ybarh, sh)

2.2.2 Estimates

N <- sum (Nh)
pi_h <- Nh/N
ybar <- sum(ybarh * pi_h)
data.frame (Nh, nh=nh,pi_h, ybarh, sh=sh, "var component"=(1-nh/Nh)*pi_h^2*sh^2/nh)
seybar <- sqrt(sum((1-nh/Nh)*pi_h^2*sh^2/nh))
mem <- 1.96 * seybar
c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
##      Est.      S.E.    ci.low    ci.upp 
## 295560.77  16379.87 263456.21 327665.32

2.3 Using the function “str_mean_estimate”

nh <- tapply (1:nrow(agstrat), agstrat[,"region"], length)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
ybarh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
# create a vector with external information
Nh <- c(NC = 1054, NE = 220, S= 1382, W = 422)

## strata mean estimate
str_mean_estimate (ybarh, sh, nh, Nh)
##      Est.      S.E.    ci.low    ci.upp 
## 295560.77  16379.87 263456.21 327665.32
## population total estimate 
str_mean_estimate (ybarh, sh, nh, Nh) * sum (Nh)
##       Est.       S.E.     ci.low     ci.upp 
##  909736035   50417248  810918229 1008553842

2.4 Using the function “str_mean_estimate_data”

## In the fucntion "str_mean_estimate_data", we can find the stratum size with the variable "weights":
tapply (agstrat$weight, agstrat$region, sum)
##   NC   NE    S    W 
## 1054  220 1382  422
## if the dataset contains a variable "weight"
str_mean_estimate_data (agstrat, y="acres92", stratum="region", weight="weight")
##      Est.      S.E.    ci.low    ci.upp 
## 295560.77  16379.87 263456.21 327665.32
## estimating the mean of the number of small farms in 1992
str_mean_estimate_data (agstrat, y="smallf92", stratum="region", weight="weight")
##      Est.      S.E.    ci.low    ci.upp 
## 56.862794  7.201417 42.748016 70.977572

2.5 Comparing with SRS estimate

agsrs <- read.csv ("data/agsrs.csv")
srs_mean_est(agsrs[, "acres92"], N=3078)
##      Est.      S.E.    ci.low    ci.upp 
## 297897.05  18898.43 260706.26 335087.84
## the ratio of estimated variance
(16379.87/18898.43)^2
## [1] 0.751224
## percentage of reduction of variance of str estimates from that of SRS estimate
1- (16379.87/18898.43)^2
## [1] 0.248776

3 Allocation of stratum sample size

3.1 Analyzing seals.csv collected with stratified sampling

Lydersen and Ryg (1991) used stratification techniques to estimate ringed seal pop- ulations in Svalbard fjords. The 200 km2 study area was divided into three zones: Zone 1, outer Sassenfjorden, was covered with relatively new ice during the study period in March, 1990, and had little snow cover; Zone 3, Tempelfjorden, had a stable ice cover throughout the year; Zone 2, inner Sassenfjorden, was intermediate between the stable Zone 3 and the unstable Zone 1. Ringed seals need good ice to establish territories with breathing holes, and snow cover enables females to dig out birth lairs. Thus, it was thought that the three zones would have different seal densities. The investigators took a stratified random sample of 20% of the 200 1-km2 areas.

###### ########

## load data
seals <- read.csv("data/seals.csv")
head (seals)
# survey data summary in each stratum
nh <- as.vector(table(seals[,"zone"]))
sh <- tapply (seals[, "holes"], seals[,"zone"], sd)
ybarh <- tapply (seals[, "holes"], seals[,"zone"], mean)
Nh <- c(68,84,48)
data.frame (Nh,nh, ybarh, sh)
str_mean_estimate (ybarh, sh, nh, Nh)
##      Est.      S.E.    ci.low    ci.upp 
## 4.9859091 0.5901322 3.8292499 6.1425683

3.2 Neyman allocation of stratum sample size

## assuming the cost of surving each zone is the same
Ch <- rep (1, 3)
Sh <- sh ## estimate Sh with sh
NhShDCh <- Nh * Sh / sqrt (Ch)

## the optimal allocation scheme for estimating the population total or means holes:
Lh <- NhShDCh / sum (NhShDCh)
data.frame(Ch,Nh, Sh,NhShDCh, Lh)

4 Simulation to Study the Efficiency of Stratified Sampling with Different Allocation

4.1 Using “Region” to Stratify

4.1.1 Read Population Data

agpop <- read.csv ("data/agpop.csv")
agpop <- agpop[agpop$acres92 != -99, ] ## remove those counties with na
N <- nrow(agpop)

4.1.2 Define Stratum Variable

agpop$stratum <- agpop$region
## reorder agpop for the ease of using strata of "sampling" (very important)
agpop <- agpop[order (agpop$stratum), ]

Look at variance decomposition

boxplot (acres92 ~ stratum, data = agpop)

anova(lm (agpop$acres92~agpop$stratum))

Look at the \(R^2\) of predicting “acres92” with “region”.

summary(lm (agpop$acres92~agpop$stratum))$r.squared
## [1] 0.1821031

4.1.3 Stratified Sampling with P2S allocation

# doing one stratified sampling
Nh <- tapply (1:nrow(agpop), agpop$stratum, length) 
nh <- round(Nh/sum (Nh)*300)
data.frame(Nh,nh)
nh
##  NC  NE   S   W 
## 103  21 135  41
## doing stratified sampling
strsample <- strata (agpop, "stratum", size = nh,  method = "srswor")
head(strsample)
## checking sampling results
table (strsample [,1]) 
## 
##  NC  NE   S   W 
## 103  21 135  41
# collecting data on sampled counties
agstrat <- agpop [strsample$ID_unit, ]
agstrat$weight <- 1/strsample$Prob

str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
##      Est.      S.E.    ci.low    ci.upp 
## 339562.26  26301.57 288011.19 391113.33

Estimate with spreadsheet method for a double checking

## double checking the estimates
nh <- tapply (1:nrow(agstrat), agstrat[,"region"], length)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
ybarh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
# create a vector with external information
Nh <- tapply (1:nrow(agpop), agpop[,"region"], length)
## strata mean estimate
str_mean_estimate (ybarh, sh, nh, Nh)
##      Est.      S.E.    ci.low    ci.upp 
## 339562.26  26301.57 288011.19 391113.33

4.1.4 Repeat stratified sampling with P2S allocation 2000 times

Nh <- tapply (1:nrow (agpop), agpop$stratum, length) 
nh <- round(Nh/sum (Nh)*300) ## make sure the order matches Nh

nres <- 2000
str_p2s_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
    ## doing stratified sampling
    strsample <- strata (agpop, "stratum", size = nh,  method = "srswor")
    agstrat <- agpop [strsample$ID_unit, ]
    agstrat$weight <- 1/strsample$Prob
    ## analyziing data
    str_p2s_simulated[i,] <- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
    
}

4.1.5 Repeat stratified sampling with optimal allocation 2000 times

Nh <- tapply (1:nrow (agpop), agpop$stratum, length) 

Sh <- tapply (agpop$acres92, agpop$stratum, sd)
nh_opt <- round((Nh*Sh)/sum (Nh*Sh) * 300);nh_opt
##  NC  NE   S   W 
##  87   5 102 106
data.frame(Nh,Sh,nh_opt)
nres <- 2000
str_neyman_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
    ## doing stratified sampling
    strsample <- strata (agpop, "stratum", size = nh_opt,  method = "srswor")
    # collecting data on sampled counties
    agstrat <- agpop [strsample$ID_unit, ]
    agstrat$weight <- 1/strsample$Prob
    
    str_neyman_simulated[i,] <- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
    
}

4.1.6 Repeat simple random sampling 2000 times

nres <- 2000
srs_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
    ## doing stratified sampling
    srs <- sample (sum(Nh), sum (nh))
    # collecting data on sampled counties
    agsrs <- agpop [srs, ]
    srs_simulated[i,] <- srs_mean_est (agsrs[, "acres92"], N= sum(Nh))
    
}

4.1.7 Compare the efficiency of different methods

sim_results_str_region <- data.frame("SRS"=srs_simulated[,1], 
                      "Prop2size"=str_p2s_simulated[,1], 
                      "Neyman"=str_neyman_simulated[,1])
boxplot (sim_results_str_region)
abline (h = mean (agpop$acres92), col = "red")

sapply (sim_results_str_region, mean) -> sim_means
sapply (sim_results_str_region, var) -> sim_var

sim_var/sim_var[1] -> sim_var_relative

#Percentage of reduction of variances of estimates compared to SRS

1-sim_var/sim_var[1] -> sim_var_reduction

cbind("Mean"=sim_means, "Variance"=sim_var, "Relative Variance"= sim_var_relative, "Percentage of Variance Reduction"=sim_var_reduction)
##               Mean  Variance Relative Variance Percentage of Variance Reduction
## SRS       308584.0 523283890         1.0000000                        0.0000000
## Prop2size 309078.9 439085739         0.8390966                        0.1609034
## Neyman    308684.2 302887132         0.5788199                        0.4211801

4.2 Using “acres82” to Define Strata

4.2.1 Read Population Data

agpop <- read.csv ("data/agpop.csv")
agpop <- agpop[agpop$acres92 != -99, ] ## remove those counties with na
N <- nrow(agpop)

4.2.2 Define Stratum Variable with Quantiles of “acres82”

plot (acres92~acres82, data = agpop)

summary (lm(acres92~acres82, data = agpop))
## 
## Call:
## lm(formula = acres92 ~ acres82, data = agpop)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -786951  -11680    -548    8907  804356 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.142e+03  1.159e+03  -6.165 7.99e-10 ***
## acres82      9.813e-01  2.157e-03 455.015  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51310 on 3057 degrees of freedom
## Multiple R-squared:  0.9854, Adjusted R-squared:  0.9854 
## F-statistic: 2.07e+05 on 1 and 3057 DF,  p-value: < 2.2e-16
agpop$stratum <- cut(agpop$acres82,
                     breaks = quantile (agpop$acres82, 
                                        probs = c(0,0.25,0.75,0.95,1)),
                     include.lowest =  T)
agpop$stratum <- mapvalues(agpop$stratum, 
                          from = (sort(unique (agpop$stratum))), 
                          to= paste0("acres82",c("Q1", "Q2", "Q3", "Q4")))
agpop <- agpop[order(agpop$stratum), ]

Look at variance decomposition

boxplot (acres92 ~ stratum, data = agpop)

anova(lm (agpop$acres92~agpop$stratum))

Look at the \(R^2\) of predicting “acres92” with “acres82” quantile.

summary(lm (agpop$acres92~agpop$stratum))$r.squared
## [1] 0.7393047

4.2.3 Stratified sampling with P2S allocation

# doing one stratified sampling
Nh <- tapply (1:nrow(agpop), agpop$stratum, length) 
nh <- round(Nh/sum (Nh)*300)
nh
## acres82Q1 acres82Q2 acres82Q3 acres82Q4 
##        75       150        60        15
## doing stratified sampling
strsample <- strata (agpop, "stratum", size = nh,  method = "srswor")
head(strsample)
## checking sampling results
table (strsample [,1]) 
## 
## acres82Q1 acres82Q2 acres82Q3 acres82Q4 
##        75       150        60        15
# collecting data on sampled counties
agstrat <- agpop [strsample$ID_unit, ]
agstrat$weight <- 1/strsample$Prob

str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
##       Est.       S.E.     ci.low     ci.upp 
## 302920.277   9063.152 285156.499 320684.056

4.2.4 Repeat stratified sampling with P2S allocation 2000 times

Nh <- tapply (1:nrow (agpop), agpop$stratum, length) 
nh <- round(Nh/sum (Nh)*300) ## make sure the order matches Nh
data.frame(Nh, nh)
##             Nh  nh
## acres82Q1  765  75
## acres82Q2 1529 150
## acres82Q3  612  60
## acres82Q4  153  15
nres <- 2000
str_p2s_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
    ## doing stratified sampling
    strsample <- strata (agpop, "stratum", size = nh,  method = "srswor")
    agstrat <- agpop [strsample$ID_unit, ]
    agstrat$weight <- 1/strsample$Prob
    ## analyziing data
    str_p2s_simulated[i,] <- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
    
}

4.2.5 Repeat stratified sampling with optimal allocation 2000 times

Nh <- tapply (1:nrow (agpop), agpop$stratum, length) 
# find order of stratum
# unique (agpop$stratum)
Sh <- tapply (agpop$acres92, agpop$stratum, sd)
nh_opt <- round((Nh*Sh)/sum (Nh*Sh) * 300);nh_opt
## acres82Q1 acres82Q2 acres82Q3 acres82Q4 
##        22        98        77       103
data.frame(Nh,Sh,nh_opt)
nres <- 2000
str_neyman_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
    ## doing stratified sampling
    strsample <- strata (agpop, "stratum", size = nh_opt,  method = "srswor")
    # collecting data on sampled counties
    agstrat <- agpop [strsample$ID_unit, ]
    agstrat$weight <- 1/strsample$Prob
    
    str_neyman_simulated[i,] <- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
    
}

4.2.6 Repeat simple random sampling 2000 times

nres <- 2000
srs_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
    ## doing stratified sampling
    srs <- sample (sum(Nh), sum (nh))
    # collecting data on sampled counties
    agsrs <- agpop [srs, ]
    srs_simulated[i,] <- srs_mean_est (agsrs[, "acres92"], N= sum(Nh))
    
}

4.2.7 Compare the efficiency of different methods

sim_results_str_acres82 <- data.frame("SRS"=srs_simulated[,1], 
                      "Prop2size"=str_p2s_simulated[,1], 
                      "Neyman"=str_neyman_simulated[,1])
boxplot (sim_results_str_acres82)
abline (h = mean (agpop$acres92), col = "red")

sapply (sim_results_str_acres82, mean) -> sim_means
sapply (sim_results_str_acres82, var) -> sim_var

sim_var/sim_var[1] -> sim_var_relative

#Percentage of reduction of variances of estimates compared to SRS

1-sim_var/sim_var[1] -> sim_var_reduction

cbind("Mean"=sim_means, "Variance"=sim_var, "Relative Variance"= sim_var_relative, "Percentage of Variance Reduction"=sim_var_reduction)
##               Mean  Variance Relative Variance Percentage of Variance Reduction
## SRS       308386.0 539231495        1.00000000                        0.0000000
## Prop2size 308405.7 137506779        0.25500509                        0.7449949
## Neyman    308632.5  36946519        0.06851699                        0.9314830