library(latex2exp)

1 Analysis of agsrs.csv Data

1.1 Step by step calculation without using a function

## read survey data
agsrs <- read.csv ("data/agsrs.csv")
head(agsrs)

## extract the variable of interest
sdata <- agsrs$acres92
N <- 3078

## do calculation
n <- length (sdata)
ybar <- mean (sdata)
se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)  
mem <- qt (0.975, df = n - 1) * se.ybar
## return estimate vector for pop mean
c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)

##      Est.      S.E.    ci.low    ci.upp 
## 297897.05  18898.43 260706.26 335087.84

## return estimate vector for pop total
c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem) * N

##       Est.       S.E.     ci.low     ci.upp 
##  916927110   58169381  802453859 1031400361

1.2 Write a function for repeated use

1.2.1 A function for doing data analysis for srs sample

# 
# sdata -- a vector of sampling survey data
# N -- population size 
# to find total, multiply N to the estimate returned by this function
srs_mean_est <- function (sdata, N)
{
    n <- length (sdata)
    ybar <- mean (sdata)
    se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)  
    mem <- qt (0.975, df = n - 1) * se.ybar
    c (ybar = ybar, se = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}

1.2.2 Apply srs_mean_est to agsrs.csv data

Import Data

agsrs <- read.csv ("data/agsrs.csv")

Estimating the mean of acre92

srs_mean_est (agsrs[,"acres92"], N = 3078)

##      ybar        se    ci.low    ci.upp 
## 297897.05  18898.43 260706.26 335087.84

Estimating the total of acre92

srs_mean_est (agsrs[,"acres92"], N = 3078) * 3078

##       ybar         se     ci.low     ci.upp 
##  916927110   58169381  802453859 1031400361

Estimating the proportion of counties with fewer than 200K acres for farming in 1992

acres92.is.fewer.200k <- as.numeric (agsrs[,"acres92"] < 200000)
head(acres92.is.fewer.200k)

## [1] 1 1 1 1 1 1

srs_mean_est (acres92.is.fewer.200k, N = 3078)

##       ybar         se     ci.low     ci.upp 
## 0.51000000 0.02746498 0.45595084 0.56404916

Estimating the total number of counties with fewer than 200K acres for farming in 1992

srs_mean_est (acres92.is.fewer.200k, N = 3078) * 3078

##       ybar         se     ci.low     ci.upp 
## 1569.78000   84.53722 1403.41670 1736.14330

1.3 Comparing with true value

agpop <- read.csv ("data/agpop.csv", na = "-99")
#true mean
mean (agpop[, "acres92"], na.rm = T)

## [1] 308582.4

# true total
sum (agpop[, "acres92"], na.rm = T)

## [1] 943953599

# true proportion of counties with less than 200K acres for farming
mean (agpop[, "acres92"] < 200000, na.rm = T)

## [1] 0.5145472

# true number of counties with less than 200K acres for farming
sum (agpop[, "acres92"] < 200000, na.rm = T)

## [1] 1574

2 A Simulation Demonstration of SRS Inference

# read population data 
agpop <- read.csv ("data/agpop.csv")
# remove those counties with na
agpop <- subset( agpop, acres92 != -99)

2.1 True Values

# sample size
n <- 300
# population size
N <- nrow (agpop); N

## [1] 3059

# true value of population mean
ybarU <- mean (agpop[,"acres92"]); ybarU

## [1] 308582.4

# true value of deviation of sample mean
true.se.ybar <- sqrt (1- n/N) * sd (agpop[,"acres92"]) / sqrt (n); true.se.ybar

## [1] 23320.29

2.2 One SRS sampling

## 
# srs sampling
srs <- sample (1:N,n)
head(agpop [srs, ])

# get data of variable "acres92"
sdata <- agpop [srs, "acres92"]
# analysis
srs_mean_est (sdata, N)

##      ybar        se    ci.low    ci.upp 
## 307750.05  20833.52 266751.15 348748.95

2.3 Repeating SRS sampling 5000 times

nres <- 5000 # number of repeated sampling
simulation.results <- matrix (0, nres, 4) # matrix recording repeated results
colnames(simulation.results) <- c( "Est.",   "S.E.",   "ci.low", "ci.upp")

for (i in 1:nres)
{
    srs <- sample (N, n)
    sdata <- agpop [srs, "acres92"]
    simulation.results [i,] <- srs_mean_est (sdata, N)
}

head(simulation.results)

##          Est.     S.E.   ci.low   ci.upp
## [1,] 360057.2 24473.34 311895.3 408219.0
## [2,] 290263.1 20708.05 249511.1 331015.0
## [3,] 298222.0 28890.38 241367.7 355076.2
## [4,] 310999.4 19461.42 272700.7 349298.1
## [5,] 319907.5 20418.64 279725.1 360090.0
## [6,] 275013.4 17327.38 240914.4 309112.5

# look at the distribution of sample mean
par (mfrow= c(2,2))
hist (agpop$acres92,main = "Population Distribution of acre92")
hist (simulation.results[,1], main = "Sampling Distribution of Sample Mean for acre92")
abline (v = ybarU, col = "red")
qqnorm (simulation.results[,1], main="QQ plot of Sample Mean"); qqline(simulation.results[,1])
boxplot (simulation.results[,1], main = "Boxplot of Sample Mean")
abline (h = ybarU, col = "red")

mean (simulation.results[,1])

## [1] 308808.5

ybarU

## [1] 308582.4

sd (simulation.results [,1])

## [1] 23523.86

true.se.ybar

## [1] 23320.29

2.4 Empirical Coverage Rate of CIs

simulation.results <- cbind (simulation.results, (simulation.results[,3] < ybarU) * (ybarU < simulation.results[,4] ))
colnames(simulation.results)[5] <- "Covered?"
head(simulation.results)

##          Est.     S.E.   ci.low   ci.upp Covered?
## [1,] 360057.2 24473.34 311895.3 408219.0        0
## [2,] 290263.1 20708.05 249511.1 331015.0        1
## [3,] 298222.0 28890.38 241367.7 355076.2        1
## [4,] 310999.4 19461.42 272700.7 349298.1        1
## [5,] 319907.5 20418.64 279725.1 360090.0        1
## [6,] 275013.4 17327.38 240914.4 309112.5        1

library("plotrix") 
par(mfrow=c(1,1))
plotCI(x=1:100,
       y=simulation.results[1:100,1],
       li = simulation.results[1:100,3],
       ui = simulation.results[1:100,4],
       col = 2-simulation.results[,5])
abline(h=ybarU, col = "blue")

# Empirical coverage rate 
mean (simulation.results[,"Covered?"])

## [1] 0.931

Data Analysis and Simulation for Simple Random Sampling

Longhai Li

September 2024