library(latex2exp)
## read survey data
<- read.csv ("data/agsrs.csv")
agsrs head(agsrs)
## extract the variable of interest
<- agsrs$acres92
sdata <- 3078
N
## do calculation
<- length (sdata)
n <- mean (sdata)
ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)
se.ybar <- qt (0.975, df = n - 1) * se.ybar
mem ## return estimate vector for pop mean
c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
## Est. S.E. ci.low ci.upp
## 297897.05 18898.43 260706.26 335087.84
## return estimate vector for pop total
c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem) * N
## Est. S.E. ci.low ci.upp
## 916927110 58169381 802453859 1031400361
#
# sdata -- a vector of sampling survey data
# N -- population size
# to find total, multiply N to the estimate returned by this function
<- function (sdata, N)
srs_mean_est
{<- length (sdata)
n <- mean (sdata)
ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)
se.ybar <- qt (0.975, df = n - 1) * se.ybar
mem c (ybar = ybar, se = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
Import Data
<- read.csv ("data/agsrs.csv") agsrs
Estimating the mean of acre92
srs_mean_est (agsrs[,"acres92"], N = 3078)
## ybar se ci.low ci.upp
## 297897.05 18898.43 260706.26 335087.84
Estimating the total of acre92
srs_mean_est (agsrs[,"acres92"], N = 3078) * 3078
## ybar se ci.low ci.upp
## 916927110 58169381 802453859 1031400361
Estimating the proportion of counties with fewer than 200K acres for farming in 1992
.200k <- as.numeric (agsrs[,"acres92"] < 200000)
acres92.is.fewerhead(acres92.is.fewer.200k)
## [1] 1 1 1 1 1 1
srs_mean_est (acres92.is.fewer.200k, N = 3078)
## ybar se ci.low ci.upp
## 0.51000000 0.02746498 0.45595084 0.56404916
Estimating the total number of counties with fewer than 200K acres for farming in 1992
srs_mean_est (acres92.is.fewer.200k, N = 3078) * 3078
## ybar se ci.low ci.upp
## 1569.78000 84.53722 1403.41670 1736.14330
<- read.csv ("data/agpop.csv", na = "-99")
agpop #true mean
mean (agpop[, "acres92"], na.rm = T)
## [1] 308582.4
# true total
sum (agpop[, "acres92"], na.rm = T)
## [1] 943953599
# true proportion of counties with less than 200K acres for farming
mean (agpop[, "acres92"] < 200000, na.rm = T)
## [1] 0.5145472
# true number of counties with less than 200K acres for farming
sum (agpop[, "acres92"] < 200000, na.rm = T)
## [1] 1574
# read population data
<- read.csv ("data/agpop.csv")
agpop # remove those counties with na
<- subset( agpop, acres92 != -99) agpop
# sample size
<- 300
n # population size
<- nrow (agpop); N N
## [1] 3059
# true value of population mean
<- mean (agpop[,"acres92"]); ybarU ybarU
## [1] 308582.4
# true value of deviation of sample mean
<- sqrt (1- n/N) * sd (agpop[,"acres92"]) / sqrt (n); true.se.ybar true.se.ybar
## [1] 23320.29
##
# srs sampling
<- sample (1:N,n)
srs head(agpop [srs, ])
# get data of variable "acres92"
<- agpop [srs, "acres92"]
sdata # analysis
srs_mean_est (sdata, N)
## ybar se ci.low ci.upp
## 289270.96 18708.93 252453.10 326088.82
<- 5000 # number of repeated sampling
nres <- matrix (0, nres, 4) # matrix recording repeated results
simulation.results colnames(simulation.results) <- c( "Est.", "S.E.", "ci.low", "ci.upp")
for (i in 1:nres)
{<- sample (N, n)
srs <- agpop [srs, "acres92"]
sdata <- srs_mean_est (sdata, N)
simulation.results [i,]
}
head(simulation.results)
## Est. S.E. ci.low ci.upp
## [1,] 266764.8 17930.94 231478.0 302051.7
## [2,] 282238.1 20017.83 242844.4 321631.8
## [3,] 323510.3 24537.12 275223.0 371797.6
## [4,] 300429.8 20338.81 260404.5 340455.2
## [5,] 306957.1 20887.15 265852.6 348061.5
## [6,] 340185.7 27555.73 285957.9 394413.4
# look at the distribution of sample mean
par (mfrow= c(2,2))
hist (agpop$acres92,main = "Population Distribution of acre92")
hist (simulation.results[,1], main = "Sampling Distribution of Sample Mean for acre92")
abline (v = ybarU, col = "red")
qqnorm (simulation.results[,1], main="QQ plot of Sample Mean"); qqline(simulation.results[,1])
boxplot (simulation.results[,1], main = "Boxplot of Sample Mean")
abline (h = ybarU, col = "red")
mean (simulation.results[,1])
## [1] 308889.3
ybarU
## [1] 308582.4
sd (simulation.results [,1])
## [1] 23436.76
true.se.ybar
## [1] 23320.29
<- cbind (simulation.results, (simulation.results[,3] < ybarU) * (ybarU < simulation.results[,4] ))
simulation.results colnames(simulation.results)[5] <- "Covered?"
head(simulation.results)
## Est. S.E. ci.low ci.upp Covered?
## [1,] 266764.8 17930.94 231478.0 302051.7 0
## [2,] 282238.1 20017.83 242844.4 321631.8 1
## [3,] 323510.3 24537.12 275223.0 371797.6 1
## [4,] 300429.8 20338.81 260404.5 340455.2 1
## [5,] 306957.1 20887.15 265852.6 348061.5 1
## [6,] 340185.7 27555.73 285957.9 394413.4 1
library("plotrix")
par(mfrow=c(1,1))
plotCI(x=1:100,
y=simulation.results[1:100,1],
li = simulation.results[1:100,3],
ui = simulation.results[1:100,4],
col = 2-simulation.results[,5])
abline(h=ybarU, col = "blue")
# Empirical coverage rate
mean (simulation.results[,"Covered?"])
## [1] 0.9348