library(latex2exp)
library(sampling)
library(plyr)
## this function finds statistical estimates given a dataset with sampling weight
# stratdata --- data.frame containing stratified sample
# y --- name of variable for which we want to estiamte population mean
# stratum --- name of variable that will be used as stratum variable
# weight --- name of variable indicating sampling weight
# post = TRUE is to indicate that we will do post-stratification analysis (ignoring it for the moment)
# note: from weights we can find Nh (see the code for formula)
str_mean_estimate_data <- function (stratdata, y, stratum, weight)
{
## compute stratum-wise data
n <- nrow (stratdata)
sh <- tapply (stratdata[, y], stratdata[,stratum], sd)
ybarh <- tapply (stratdata[, y], stratdata[,stratum], mean)
## find population stratum size using sampling weight included in the data set
Nh <- tapply (stratdata[, weight], stratdata[,stratum], sum)
nh <- tapply (1:nrow(stratdata), stratdata[,stratum], length)
## find mean estimates
N <- sum (Nh)
Pi_h <- Nh/N
ybar <- sum(ybarh * Pi_h)
seybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
mem <- 1.96 * seybar
c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
## to find total, multiply N to the estimate returned by this function
## for poststratification, use nh = n * Nh/N
str_mean_estimate <- function (ybarh, sh, nh, Nh)
{
N <- sum (Nh)
Pi_h <- Nh/N
ybar <- sum(ybarh * Pi_h)
seybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
mem <- 1.96 * seybar
c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function
srs_mean_est <- function (sdata, N = Inf)
{
n <- length (sdata)
ybar <- mean (sdata)
se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)
mem <- qt (0.975, df = n - 1) * se.ybar
c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
nh <- tapply (agstrat[, "acres92"], agstrat[,"region"], length)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
ybarh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
# create a vector with external information
Nh <- c(NC = 1054, NE = 220, S= 1382, W = 422)
data.frame (Nh, nh, ybarh, sh)
N <- sum (Nh)
pi_h <- Nh/N
weighted_ybar_h <- ybarh * pi_h
var_ybarh <- (1-nh/Nh)*sh^2/nh
var_h=var_ybarh*pi_h^2
workingtable <- data.frame ("$N_h$"=Nh,
"$n_h$"=nh,
"$\\bar y_h$"= ybarh,
"$s_h$"= sh,
"$s^2_h$" = sh^2,
"$\\pi_h={N_h/N}$"= pi_h,
"$\\bar y_h\\cdot \\pi_h$"= weighted_ybar_h,
"${\\hat V} (\\bar y_h)=(1-n_h/N_h)s^2_h/n_h$"=var_ybarh,
"${\\hat V} (\\bar y_h)\\cdot \\pi^2_h$"=var_h,
check.names = FALSE)
workingtable <- rbind (workingtable, colSums(workingtable))
row.names(workingtable)[5] <- "Sum"
library(knitr)
library(kableExtra)
kableExtra::kable(workingtable, escape=FALSE)
\(N_h\) | \(n_h\) | \(\bar y_h\) | \(s_h\) | \(s^2_h\) | \(\pi_h={N_h/N}\) | \(\bar y_h\cdot \pi_h\) | \({\hat V} (\bar y_h)=(1-n_h/N_h)s^2_h/n_h\) | \({\hat V} (\bar y_h)\cdot \pi^2_h\) | |
---|---|---|---|---|---|---|---|---|---|
NC | 1054 | 103 | 300504.16 | 172099.34 | 29618183543 | 0.3424301 | 102901.683 | 259454437 | 30423214 |
NE | 220 | 21 | 97629.81 | 87449.83 | 7647472708 | 0.0714750 | 6978.089 | 329404127 | 1682818 |
S | 1382 | 135 | 211315.04 | 231489.71 | 53587487856 | 0.4489929 | 94878.945 | 358169038 | 72204937 |
W | 422 | 41 | 662295.51 | 629433.04 | 396185950266 | 0.1371020 | 90802.049 | 8724242692 | 163989261 |
Sum | 3078 | 300 | 1271744.52 | 1120471.92 | 487039094374 | 1.0000000 | 295560.765 | 9671270293 | 268300231 |
## [1] 295560.8
## [1] 268300231
seybar <- sqrt(var_ybar)
mem <- 1.96 * seybar
c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
## Est. S.E. ci.low ci.upp
## 295560.77 16379.87 263456.21 327665.32
nh <- tapply (agstrat[, "acres92"], agstrat[,"region"], length)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
ybarh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
# create a vector with external information
Nh <- c(NC = 1054, NE = 220, S= 1382, W = 422)
## strata mean estimate
str_mean_estimate (ybarh, sh, nh, Nh)
## Est. S.E. ci.low ci.upp
## 295560.77 16379.87 263456.21 327665.32
## Est. S.E. ci.low ci.upp
## 909736035 50417248 810918229 1008553842
## In the fucntion "str_mean_estimate_data", we can find the stratum size with the variable "weights":
tapply (agstrat$weight, agstrat$region, sum)
## NC NE S W
## 1054 220 1382 422
## if the dataset contains a variable "weight"
str_mean_estimate_data (agstrat, y="acres92", stratum="region", weight="weight")
## Est. S.E. ci.low ci.upp
## 295560.77 16379.87 263456.21 327665.32
## estimating the mean of the number of small farms in 1992
str_mean_estimate_data (agstrat, y="smallf92", stratum="region", weight="weight")
## Est. S.E. ci.low ci.upp
## 56.862794 7.201417 42.748016 70.977572
Lydersen and Ryg (1991) used stratification techniques to estimate ringed seal pop- ulations in Svalbard fjords. The 200 km2 study area was divided into three zones: Zone 1, outer Sassenfjorden, was covered with relatively new ice during the study period in March, 1990, and had little snow cover; Zone 3, Tempelfjorden, had a stable ice cover throughout the year; Zone 2, inner Sassenfjorden, was intermediate between the stable Zone 3 and the unstable Zone 1. Ringed seals need good ice to establish territories with breathing holes, and snow cover enables females to dig out birth lairs. Thus, it was thought that the three zones would have different seal densities. The investigators took a stratified random sample of 20% of the 200 1-km2 areas.
# survey data summary in each stratum
nh <- as.vector(table(seals[,"zone"]))
sh <- tapply (seals[, "holes"], seals[,"zone"], sd)
ybarh <- tapply (seals[, "holes"], seals[,"zone"], mean)
Nh <- c(68,84,48)
data.frame (Nh,nh, ybarh, sh)
## Est. S.E. ci.low ci.upp
## 4.9859091 0.5901322 3.8292499 6.1425683
## assuming the cost of surving each zone is the same
Ch <- rep (1, 3)
Sh <- sh ## estimate Sh with sh
NhShDCh <- Nh * Sh / sqrt (Ch)
## the optimal allocation scheme for estimating the population total or means holes:
Lh <- NhShDCh / sum (NhShDCh)
data.frame(Ch,Nh, Sh,NhShDCh, Lh)
agpop$stratum <- agpop$region
## reorder agpop for the ease of using strata of "sampling" (very important)
agpop <- agpop[order (agpop$stratum), ]
Look at variance decomposition
Look at the \(R^2\) of predicting “acres92” with “region”.
## [1] 0.1821031
# doing one stratified sampling
Nh <- tapply (1:nrow(agpop), agpop$stratum, length)
nh <- round(Nh/sum (Nh)*300)
data.frame(Nh,nh)
## NC NE S W
## 103 21 135 41
## doing stratified sampling
strsample <- strata (agpop, "stratum", size = nh, method = "srswor")
strsample
##
## NC NE S W
## 103 21 135 41
# collecting data on sampled counties
agstrat <- agpop [strsample$ID_unit, ]
agstrat$weight <- 1/strsample$Prob
str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
## Est. S.E. ci.low ci.upp
## 308959.85 17061.56 275519.20 342400.50
Estimate with spreadsheet method for a double checking
## double checking the estimates
nh <- tapply (agstrat[, "acres92"], agstrat[,"region"], length)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
ybarh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
# create a vector with external information
Nh <- tapply (1:nrow(agpop), agpop[,"region"], length)
## strata mean estimate
str_mean_estimate (ybarh, sh, nh, Nh)
## Est. S.E. ci.low ci.upp
## 308959.85 17061.56 275519.20 342400.50
Nh <- tapply (1:nrow (agpop), agpop$stratum, length)
nh <- round(Nh/sum (Nh)*300) ## make sure the order matches Nh
nres <- 2000
str_p2s_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
## doing stratified sampling
strsample <- strata (agpop, "stratum", size = nh, method = "srswor")
agstrat <- agpop [strsample$ID_unit, ]
agstrat$weight <- 1/strsample$Prob
## analyziing data
str_p2s_simulated[i,] <- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
}
Nh <- tapply (agpop$acres92, agpop$stratum, length)
Sh <- tapply (agpop$acres92, agpop$stratum, sd)
nh_opt <- round((Nh*Sh)/sum (Nh*Sh) * 300);nh_opt
## NC NE S W
## 87 5 102 106
nres <- 2000
str_neyman_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
## doing stratified sampling
strsample <- strata (agpop, "stratum", size = nh_opt, method = "srswor")
# collecting data on sampled counties
agstrat <- agpop [strsample$ID_unit, ]
agstrat$weight <- 1/strsample$Prob
str_neyman_simulated[i,] <- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
}
sim_results_str_region <- data.frame("SRS"=srs_simulated[,1],
"Prop2size"=str_p2s_simulated[,1],
"Neyman"=str_neyman_simulated[,1])
boxplot (sim_results_str_region)
abline (h = mean (agpop$acres92), col = "red")
sapply (sim_results_str_region, mean) -> sim_means
sapply (sim_results_str_region, var) -> sim_var
sim_var/sim_var[1] -> sim_var_relative
#Percentage of reduction of variances of estimates compared to SRS
1-sim_var/sim_var[1] -> sim_var_reduction
data.frame("Mean"=sim_means, "Variance"=sim_var, "Relative Variance"= sim_var_relative, "Percentage of Variance Reduction"=sim_var_reduction, check.names = FALSE)
##
## Call:
## lm(formula = acres92 ~ acres82, data = agpop)
##
## Residuals:
## Min 1Q Median 3Q Max
## -786951 -11680 -548 8907 804356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.142e+03 1.159e+03 -6.165 7.99e-10 ***
## acres82 9.813e-01 2.157e-03 455.015 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51310 on 3057 degrees of freedom
## Multiple R-squared: 0.9854, Adjusted R-squared: 0.9854
## F-statistic: 2.07e+05 on 1 and 3057 DF, p-value: < 2.2e-16
agpop$stratum <- cut(agpop$acres82,
breaks = quantile (agpop$acres82,
probs = c(0,0.25,0.75,0.95,1)),
include.lowest = T)
agpop$stratum <- mapvalues(agpop$stratum,
from = (sort(unique (agpop$stratum))),
to= paste0("acres82",c("Q1", "Q2", "Q3", "Q4")))
agpop <- agpop[order(agpop$stratum), ]
Look at variance decomposition
Look at the \(R^2\) of predicting “acres92” with “acres82” quantile.
## [1] 0.7393047
# doing one stratified sampling
Nh <- tapply (1:nrow(agpop), agpop$stratum, length)
nh <- round(Nh/sum (Nh)*300)
nh
## acres82Q1 acres82Q2 acres82Q3 acres82Q4
## 75 150 60 15
## doing stratified sampling
strsample <- strata (agpop, "stratum", size = nh, method = "srswor")
strsample
##
## acres82Q1 acres82Q2 acres82Q3 acres82Q4
## 75 150 60 15
# collecting data on sampled counties
agstrat <- agpop [strsample$ID_unit, ]
agstrat$weight <- 1/strsample$Prob
str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
## Est. S.E. ci.low ci.upp
## 302920.277 9063.152 285156.499 320684.056
Nh <- tapply (1:nrow (agpop), agpop$stratum, length)
nh <- round(Nh/sum (Nh)*300) ## make sure the order matches Nh
data.frame(Nh, nh)
## Nh nh
## acres82Q1 765 75
## acres82Q2 1529 150
## acres82Q3 612 60
## acres82Q4 153 15
nres <- 2000
str_p2s_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
## doing stratified sampling
strsample <- strata (agpop, "stratum", size = nh, method = "srswor")
agstrat <- agpop [strsample$ID_unit, ]
agstrat$weight <- 1/strsample$Prob
## analyziing data
str_p2s_simulated[i,] <- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
}
Nh <- tapply (1:nrow (agpop), agpop$stratum, length)
# find order of stratum
# unique (agpop$stratum)
Sh <- tapply (agpop$acres92, agpop$stratum, sd)
nh_opt <- round((Nh*Sh)/sum (Nh*Sh) * 300);nh_opt
## acres82Q1 acres82Q2 acres82Q3 acres82Q4
## 22 98 77 103
nres <- 2000
str_neyman_simulated <- matrix (0, nres, 4)
for (i in 1:nres)
{
## doing stratified sampling
strsample <- strata (agpop, "stratum", size = nh_opt, method = "srswor")
# collecting data on sampled counties
agstrat <- agpop [strsample$ID_unit, ]
agstrat$weight <- 1/strsample$Prob
str_neyman_simulated[i,] <- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
}
sim_results_str_acres82 <- data.frame("SRS"=srs_simulated[,1],
"Prop2size"=str_p2s_simulated[,1],
"Neyman"=str_neyman_simulated[,1])
boxplot (sim_results_str_acres82)
abline (h = mean (agpop$acres92), col = "red")
sapply (sim_results_str_acres82, mean) -> sim_means
sapply (sim_results_str_acres82, var) -> sim_var
sim_var/sim_var[1] -> sim_var_relative
#Percentage of reduction of variances of estimates compared to SRS
1-sim_var/sim_var[1] -> sim_var_reduction
data.frame("Mean"=sim_means, "Variance"=sim_var, "Relative Variance"= sim_var_relative, "Percentage of Variance Reduction"=sim_var_reduction, check.names = FALSE)