## extract the variable of interest
sdata <- agsrs$acres92
N <- 3078
## do calculation
n <- length (sdata)
ybar <- mean (sdata)
se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)
mem <- qt (0.975, df = n - 1) * se.ybar
## return estimate vector for pop mean
c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
## Est. S.E. ci.low ci.upp
## 297897.05 18898.43 260706.26 335087.84
## return estimate vector for pop total
c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem) * N
## Est. S.E. ci.low ci.upp
## 916927110 58169381 802453859 1031400361
#
# sdata -- a vector of sampling survey data
# N -- population size
# to find total, multiply N to the estimate returned by this function
srs_mean_est <- function (sdata, N)
{
n <- length (sdata)
ybar <- mean (sdata)
se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)
mem <- qt (0.975, df = n - 1) * se.ybar
c (ybar = ybar, se = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
Import Data
Estimating the mean of acre92
## ybar se ci.low ci.upp
## 297897.05 18898.43 260706.26 335087.84
Estimating the total of acre92
## ybar se ci.low ci.upp
## 916927110 58169381 802453859 1031400361
Estimating the proportion of counties with fewer than 200K acres for farming in 1992
## [1] 1 1 1 1 1 1
## ybar se ci.low ci.upp
## 0.51000000 0.02746498 0.45595084 0.56404916
Estimating the total number of counties with fewer than 200K acres for farming in 1992
## ybar se ci.low ci.upp
## 1569.78000 84.53722 1403.41670 1736.14330
## [1] 308582.4
## [1] 943953599
# true proportion of counties with less than 200K acres for farming
mean (agpop[, "acres92"] < 200000, na.rm = T)
## [1] 0.5145472
# true number of counties with less than 200K acres for farming
sum (agpop[, "acres92"] < 200000, na.rm = T)
## [1] 1574
# read population data
agpop <- read.csv ("data/agpop.csv")
# remove those counties with na
agpop <- subset( agpop, acres92 != -99)
## [1] 3059
## [1] 308582.4
# true value of deviation of sample mean
true.se.ybar <- sqrt (1- n/N) * sd (agpop[,"acres92"]) / sqrt (n); true.se.ybar
## [1] 23320.29
## ybar se ci.low ci.upp
## 307750.05 20833.52 266751.15 348748.95
nres <- 5000 # number of repeated sampling
simulation.results <- matrix (0, nres, 4) # matrix recording repeated results
colnames(simulation.results) <- c( "Est.", "S.E.", "ci.low", "ci.upp")
for (i in 1:nres)
{
srs <- sample (N, n)
sdata <- agpop [srs, "acres92"]
simulation.results [i,] <- srs_mean_est (sdata, N)
}
head(simulation.results)
## Est. S.E. ci.low ci.upp
## [1,] 360057.2 24473.34 311895.3 408219.0
## [2,] 290263.1 20708.05 249511.1 331015.0
## [3,] 298222.0 28890.38 241367.7 355076.2
## [4,] 310999.4 19461.42 272700.7 349298.1
## [5,] 319907.5 20418.64 279725.1 360090.0
## [6,] 275013.4 17327.38 240914.4 309112.5
# look at the distribution of sample mean
par (mfrow= c(2,2))
hist (agpop$acres92,main = "Population Distribution of acre92")
hist (simulation.results[,1], main = "Sampling Distribution of Sample Mean for acre92")
abline (v = ybarU, col = "red")
qqnorm (simulation.results[,1], main="QQ plot of Sample Mean"); qqline(simulation.results[,1])
boxplot (simulation.results[,1], main = "Boxplot of Sample Mean")
abline (h = ybarU, col = "red")
## [1] 308808.5
## [1] 308582.4
## [1] 23523.86
## [1] 23320.29
simulation.results <- cbind (simulation.results, (simulation.results[,3] < ybarU) * (ybarU < simulation.results[,4] ))
colnames(simulation.results)[5] <- "Covered?"
head(simulation.results)
## Est. S.E. ci.low ci.upp Covered?
## [1,] 360057.2 24473.34 311895.3 408219.0 0
## [2,] 290263.1 20708.05 249511.1 331015.0 1
## [3,] 298222.0 28890.38 241367.7 355076.2 1
## [4,] 310999.4 19461.42 272700.7 349298.1 1
## [5,] 319907.5 20418.64 279725.1 360090.0 1
## [6,] 275013.4 17327.38 240914.4 309112.5 1
library("plotrix")
par(mfrow=c(1,1))
plotCI(x=1:100,
y=simulation.results[1:100,1],
li = simulation.results[1:100,3],
ui = simulation.results[1:100,4],
col = 2-simulation.results[,5])
abline(h=ybarU, col = "blue")
## [1] 0.931