library(latex2exp)
library(sampling)
library(plyr)
## this function finds statistical estimates given a dataset with sampling weight
# stratdata --- data.frame containing stratified sample
# y --- name of variable for which we want to estiamte population mean
# stratum --- name of variable that will be used as stratum variable
# weight --- name of variable indicating sampling weight
# post = TRUE is to indicate that we will do post-stratification analysis (ignoring it for the moment)
# note: from weights we can find Nh (see the code for formula)
<- function (stratdata, y, stratum, weight)
str_mean_estimate_data
{## compute stratum-wise data
<- nrow (stratdata)
n <- tapply (stratdata[, y], stratdata[,stratum], sd)
sh <- tapply (stratdata[, y], stratdata[,stratum], mean)
ybarh ## find population stratum size using sampling weight included in the data set
<- tapply (stratdata[, weight], stratdata[,stratum], sum)
Nh <- tapply (1:nrow(stratdata), stratdata[,stratum], length)
nh ## find mean estimates
<- sum (Nh)
N <- Nh/N
Pi_h <- sum(ybarh * Pi_h)
ybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
seybar <- 1.96 * seybar
mem c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
## to find total, multiply N to the estimate returned by this function
## for poststratification, use nh = n * Nh/N
<- function (ybarh, sh, nh, Nh)
str_mean_estimate
{<- sum (Nh)
N <- Nh/N
Pi_h <- sum(ybarh * Pi_h)
ybar <- sqrt(sum((1-nh/Nh)*Pi_h^2*sh^2/nh))
seybar <- 1.96 * seybar
mem c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function
<- function (sdata, N = Inf)
srs_mean_est
{<- length (sdata)
n <- mean (sdata)
ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)
se.ybar <- qt (0.975, df = n - 1) * se.ybar
mem c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
<- read.csv("data/agstrat.csv")
agstrat head (agstrat)
<- tapply (1:nrow(agstrat), agstrat[,"region"], length)
nh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
ybarh # create a vector with external information
<- c(NC = 1054, NE = 220, S= 1382, W = 422)
Nh
data.frame (Nh, nh, ybarh, sh)
<- sum (Nh)
N <- Nh/N
pi_h <- sum(ybarh * pi_h)
ybar data.frame (Nh, nh=nh,pi_h, ybarh, sh=sh, "var component"=(1-nh/Nh)*pi_h^2*sh^2/nh)
<- sqrt(sum((1-nh/Nh)*pi_h^2*sh^2/nh))
seybar <- 1.96 * seybar
mem c(Est. = ybar, S.E. = seybar, ci.low = ybar - mem, ci.upp = ybar + mem)
## Est. S.E. ci.low ci.upp
## 295560.77 16379.87 263456.21 327665.32
<- tapply (1:nrow(agstrat), agstrat[,"region"], length)
nh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
ybarh # create a vector with external information
<- c(NC = 1054, NE = 220, S= 1382, W = 422)
Nh
## strata mean estimate
str_mean_estimate (ybarh, sh, nh, Nh)
## Est. S.E. ci.low ci.upp
## 295560.77 16379.87 263456.21 327665.32
## population total estimate
str_mean_estimate (ybarh, sh, nh, Nh) * sum (Nh)
## Est. S.E. ci.low ci.upp
## 909736035 50417248 810918229 1008553842
## In the fucntion "str_mean_estimate_data", we can find the stratum size with the variable "weights":
tapply (agstrat$weight, agstrat$region, sum)
## NC NE S W
## 1054 220 1382 422
## if the dataset contains a variable "weight"
str_mean_estimate_data (agstrat, y="acres92", stratum="region", weight="weight")
## Est. S.E. ci.low ci.upp
## 295560.77 16379.87 263456.21 327665.32
## estimating the mean of the number of small farms in 1992
str_mean_estimate_data (agstrat, y="smallf92", stratum="region", weight="weight")
## Est. S.E. ci.low ci.upp
## 56.862794 7.201417 42.748016 70.977572
<- read.csv ("data/agsrs.csv")
agsrs srs_mean_est(agsrs[, "acres92"], N=3078)
## Est. S.E. ci.low ci.upp
## 297897.05 18898.43 260706.26 335087.84
## the ratio of estimated variance
16379.87/18898.43)^2 (
## [1] 0.751224
## percentage of reduction of variance of str estimates from that of SRS estimate
1- (16379.87/18898.43)^2
## [1] 0.248776
Lydersen and Ryg (1991) used stratification techniques to estimate ringed seal pop- ulations in Svalbard fjords. The 200 km2 study area was divided into three zones: Zone 1, outer Sassenfjorden, was covered with relatively new ice during the study period in March, 1990, and had little snow cover; Zone 3, Tempelfjorden, had a stable ice cover throughout the year; Zone 2, inner Sassenfjorden, was intermediate between the stable Zone 3 and the unstable Zone 1. Ringed seals need good ice to establish territories with breathing holes, and snow cover enables females to dig out birth lairs. Thus, it was thought that the three zones would have different seal densities. The investigators took a stratified random sample of 20% of the 200 1-km2 areas.
###### ########
## load data
<- read.csv("data/seals.csv")
seals head (seals)
# survey data summary in each stratum
<- as.vector(table(seals[,"zone"]))
nh <- tapply (seals[, "holes"], seals[,"zone"], sd)
sh <- tapply (seals[, "holes"], seals[,"zone"], mean)
ybarh <- c(68,84,48)
Nh data.frame (Nh,nh, ybarh, sh)
str_mean_estimate (ybarh, sh, nh, Nh)
## Est. S.E. ci.low ci.upp
## 4.9859091 0.5901322 3.8292499 6.1425683
## assuming the cost of surving each zone is the same
<- rep (1, 3)
Ch <- sh ## estimate Sh with sh
Sh <- Nh * Sh / sqrt (Ch)
NhShDCh
## the optimal allocation scheme for estimating the population total or means holes:
<- NhShDCh / sum (NhShDCh)
Lh data.frame(Ch,Nh, Sh,NhShDCh, Lh)
<- read.csv ("data/agpop.csv")
agpop <- agpop[agpop$acres92 != -99, ] ## remove those counties with na
agpop <- nrow(agpop) N
$stratum <- agpop$region
agpop## reorder agpop for the ease of using strata of "sampling" (very important)
<- agpop[order (agpop$stratum), ] agpop
Look at variance decomposition
boxplot (acres92 ~ stratum, data = agpop)
anova(lm (agpop$acres92~agpop$stratum))
Look at the \(R^2\) of predicting “acres92” with “region”.
summary(lm (agpop$acres92~agpop$stratum))$r.squared
## [1] 0.1821031
# doing one stratified sampling
<- tapply (1:nrow(agpop), agpop$stratum, length)
Nh <- round(Nh/sum (Nh)*300)
nh data.frame(Nh,nh)
nh
## NC NE S W
## 103 21 135 41
## doing stratified sampling
<- strata (agpop, "stratum", size = nh, method = "srswor")
strsample head(strsample)
## checking sampling results
table (strsample [,1])
##
## NC NE S W
## 103 21 135 41
# collecting data on sampled counties
<- agpop [strsample$ID_unit, ]
agstrat $weight <- 1/strsample$Prob
agstrat
str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
## Est. S.E. ci.low ci.upp
## 339562.26 26301.57 288011.19 391113.33
Estimate with spreadsheet method for a double checking
## double checking the estimates
<- tapply (1:nrow(agstrat), agstrat[,"region"], length)
nh <- tapply (agstrat[, "acres92"], agstrat[,"region"], sd)
sh <- tapply (agstrat[, "acres92"], agstrat[,"region"], mean)
ybarh # create a vector with external information
<- tapply (1:nrow(agpop), agpop[,"region"], length)
Nh ## strata mean estimate
str_mean_estimate (ybarh, sh, nh, Nh)
## Est. S.E. ci.low ci.upp
## 339562.26 26301.57 288011.19 391113.33
<- tapply (1:nrow (agpop), agpop$stratum, length)
Nh <- round(Nh/sum (Nh)*300) ## make sure the order matches Nh
nh
<- 2000
nres <- matrix (0, nres, 4)
str_p2s_simulated for (i in 1:nres)
{## doing stratified sampling
<- strata (agpop, "stratum", size = nh, method = "srswor")
strsample <- agpop [strsample$ID_unit, ]
agstrat $weight <- 1/strsample$Prob
agstrat## analyziing data
<- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
str_p2s_simulated[i,]
}
<- tapply (1:nrow (agpop), agpop$stratum, length)
Nh
<- tapply (agpop$acres92, agpop$stratum, sd)
Sh <- round((Nh*Sh)/sum (Nh*Sh) * 300);nh_opt nh_opt
## NC NE S W
## 87 5 102 106
data.frame(Nh,Sh,nh_opt)
<- 2000
nres <- matrix (0, nres, 4)
str_neyman_simulated for (i in 1:nres)
{## doing stratified sampling
<- strata (agpop, "stratum", size = nh_opt, method = "srswor")
strsample # collecting data on sampled counties
<- agpop [strsample$ID_unit, ]
agstrat $weight <- 1/strsample$Prob
agstrat
<- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
str_neyman_simulated[i,]
}
<- 2000
nres <- matrix (0, nres, 4)
srs_simulated for (i in 1:nres)
{## doing stratified sampling
<- sample (sum(Nh), sum (nh))
srs # collecting data on sampled counties
<- agpop [srs, ]
agsrs <- srs_mean_est (agsrs[, "acres92"], N= sum(Nh))
srs_simulated[i,]
}
<- data.frame("SRS"=srs_simulated[,1],
sim_results_str_region "Prop2size"=str_p2s_simulated[,1],
"Neyman"=str_neyman_simulated[,1])
boxplot (sim_results_str_region)
abline (h = mean (agpop$acres92), col = "red")
sapply (sim_results_str_region, mean) -> sim_means
sapply (sim_results_str_region, var) -> sim_var
/sim_var[1] -> sim_var_relative
sim_var
#Percentage of reduction of variances of estimates compared to SRS
1-sim_var/sim_var[1] -> sim_var_reduction
cbind("Mean"=sim_means, "Variance"=sim_var, "Relative Variance"= sim_var_relative, "Percentage of Variance Reduction"=sim_var_reduction)
## Mean Variance Relative Variance Percentage of Variance Reduction
## SRS 308584.0 523283890 1.0000000 0.0000000
## Prop2size 309078.9 439085739 0.8390966 0.1609034
## Neyman 308684.2 302887132 0.5788199 0.4211801
<- read.csv ("data/agpop.csv")
agpop <- agpop[agpop$acres92 != -99, ] ## remove those counties with na
agpop <- nrow(agpop) N
plot (acres92~acres82, data = agpop)
summary (lm(acres92~acres82, data = agpop))
##
## Call:
## lm(formula = acres92 ~ acres82, data = agpop)
##
## Residuals:
## Min 1Q Median 3Q Max
## -786951 -11680 -548 8907 804356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.142e+03 1.159e+03 -6.165 7.99e-10 ***
## acres82 9.813e-01 2.157e-03 455.015 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51310 on 3057 degrees of freedom
## Multiple R-squared: 0.9854, Adjusted R-squared: 0.9854
## F-statistic: 2.07e+05 on 1 and 3057 DF, p-value: < 2.2e-16
$stratum <- cut(agpop$acres82,
agpopbreaks = quantile (agpop$acres82,
probs = c(0,0.25,0.75,0.95,1)),
include.lowest = T)
$stratum <- mapvalues(agpop$stratum,
agpopfrom = (sort(unique (agpop$stratum))),
to= paste0("acres82",c("Q1", "Q2", "Q3", "Q4")))
<- agpop[order(agpop$stratum), ] agpop
Look at variance decomposition
boxplot (acres92 ~ stratum, data = agpop)
anova(lm (agpop$acres92~agpop$stratum))
Look at the \(R^2\) of predicting “acres92” with “acres82” quantile.
summary(lm (agpop$acres92~agpop$stratum))$r.squared
## [1] 0.7393047
# doing one stratified sampling
<- tapply (1:nrow(agpop), agpop$stratum, length)
Nh <- round(Nh/sum (Nh)*300)
nh nh
## acres82Q1 acres82Q2 acres82Q3 acres82Q4
## 75 150 60 15
## doing stratified sampling
<- strata (agpop, "stratum", size = nh, method = "srswor")
strsample head(strsample)
## checking sampling results
table (strsample [,1])
##
## acres82Q1 acres82Q2 acres82Q3 acres82Q4
## 75 150 60 15
# collecting data on sampled counties
<- agpop [strsample$ID_unit, ]
agstrat $weight <- 1/strsample$Prob
agstrat
str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
## Est. S.E. ci.low ci.upp
## 302920.277 9063.152 285156.499 320684.056
<- tapply (1:nrow (agpop), agpop$stratum, length)
Nh <- round(Nh/sum (Nh)*300) ## make sure the order matches Nh
nh data.frame(Nh, nh)
## Nh nh
## acres82Q1 765 75
## acres82Q2 1529 150
## acres82Q3 612 60
## acres82Q4 153 15
<- 2000
nres <- matrix (0, nres, 4)
str_p2s_simulated for (i in 1:nres)
{## doing stratified sampling
<- strata (agpop, "stratum", size = nh, method = "srswor")
strsample <- agpop [strsample$ID_unit, ]
agstrat $weight <- 1/strsample$Prob
agstrat## analyziing data
<- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
str_p2s_simulated[i,]
}
<- tapply (1:nrow (agpop), agpop$stratum, length)
Nh # find order of stratum
# unique (agpop$stratum)
<- tapply (agpop$acres92, agpop$stratum, sd)
Sh <- round((Nh*Sh)/sum (Nh*Sh) * 300);nh_opt nh_opt
## acres82Q1 acres82Q2 acres82Q3 acres82Q4
## 22 98 77 103
data.frame(Nh,Sh,nh_opt)
<- 2000
nres <- matrix (0, nres, 4)
str_neyman_simulated for (i in 1:nres)
{## doing stratified sampling
<- strata (agpop, "stratum", size = nh_opt, method = "srswor")
strsample # collecting data on sampled counties
<- agpop [strsample$ID_unit, ]
agstrat $weight <- 1/strsample$Prob
agstrat
<- str_mean_estimate_data (agstrat, "acres92", "stratum", "weight")
str_neyman_simulated[i,]
}
<- 2000
nres <- matrix (0, nres, 4)
srs_simulated for (i in 1:nres)
{## doing stratified sampling
<- sample (sum(Nh), sum (nh))
srs # collecting data on sampled counties
<- agpop [srs, ]
agsrs <- srs_mean_est (agsrs[, "acres92"], N= sum(Nh))
srs_simulated[i,]
}
<- data.frame("SRS"=srs_simulated[,1],
sim_results_str_acres82 "Prop2size"=str_p2s_simulated[,1],
"Neyman"=str_neyman_simulated[,1])
boxplot (sim_results_str_acres82)
abline (h = mean (agpop$acres92), col = "red")
sapply (sim_results_str_acres82, mean) -> sim_means
sapply (sim_results_str_acres82, var) -> sim_var
/sim_var[1] -> sim_var_relative
sim_var
#Percentage of reduction of variances of estimates compared to SRS
1-sim_var/sim_var[1] -> sim_var_reduction
cbind("Mean"=sim_means, "Variance"=sim_var, "Relative Variance"= sim_var_relative, "Percentage of Variance Reduction"=sim_var_reduction)
## Mean Variance Relative Variance Percentage of Variance Reduction
## SRS 308386.0 539231495 1.00000000 0.0000000
## Prop2size 308405.7 137506779 0.25500509 0.7449949
## Neyman 308632.5 36946519 0.06851699 0.9314830