## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## the output is the ratio of ybarU/xbarU
srs_ratio_est <- function (ydata, xdata, N = Inf)
{
n <- length (xdata)
xbar <- mean (xdata)
ybar <- mean (ydata)
B_hat <- ybar / xbar
d <- ydata - B_hat * xdata
var_d <- sum (d^2) / (n - 1)
sd_B_hat <- sqrt ((1 - n/N) * var_d / n) / xbar
mem <- qt (0.975, df = n - 1) * sd_B_hat
output <- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
}
## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## xbarU --- population mean of auxilliary variable
## the output is the estimate mean or total (est.total=TRUE)
srs_reg_est <- function (ydata, xdata, xbarU, N = Inf, est.total = FALSE)
{
n <- length (ydata)
lmfit <- lm (ydata ~ xdata)
Bhat <- lmfit$coefficients
efit <- lmfit$residuals
SSe <- sum (efit^2) / (n - 2)
yhat_reg <- Bhat[1] + Bhat[2] * xbarU
se_yhat_reg <- sqrt ((1-n/N) * SSe / n)
mem <- qt (0.975, df = n - 2) * se_yhat_reg
output <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
if (est.total) {
if(!is.finite(N)) stop("N must be finite for estimating population total" )
output <- output * N
}
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
}
## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function
srs_mean_est <- function (sdata, N = Inf)
{
n <- length (sdata)
ybar <- mean (sdata)
se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)
mem <- qt (0.975, df = n - 1) * se.ybar
c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
# total estimation for datasets collected with UPS with Replacement
upswr_total_est <- function (total, psi)
{
srs_mean_est (total/psi, N = Inf)
}
# ratio estimation for datasets collected with UPS with Replacement
upswr_ratio_est <- function (total, M, psi)
{
srs_ratio_est (total/psi, M/psi, N = Inf)
}
psi <- c(24, 100, 100, 76, 44) / 647
Mi <- c(24, 100, 100, 76, 44)
hrs <- c(75, 203, 203, 191, 168)
plot(hrs~Mi)
##
## Call:
## lm(formula = hrs ~ Mi)
##
## Residuals:
## 1 2 3 4 5
## -29.825 -8.997 -8.997 12.847 34.972
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.9820 31.5534 2.250 0.1100
## Mi 1.4101 0.4195 3.362 0.0437 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.52 on 3 degrees of freedom
## Multiple R-squared: 0.7902, Adjusted R-squared: 0.7203
## F-statistic: 11.3 on 1 and 3 DF, p-value: 0.04368
## Est. S.E. ci.low ci.upp
## 1749.0144 222.4218 1131.4724 2366.5563
## Est. S.E. ci.low ci.upp
## 2.7032679 0.3437741 1.7487981 3.6577378
The file “statepop.csv” is a dataset from an unequal-probability sample of 100 counties in the United States. Counties were chosen using the cumulative-size method from the listings in the County and City Data Book (U.S. Census Bureau, 1994) with probabilities proportional to their populations. The total population for all counties is \(M_0 = \sum_{i=1} M_i = 255,077,536\). Sampling was done with replacement, so very large counties occur multiple times in the sample. For example, Los Angeles County, with the largest population in the United States, occurs four times.
M0 <- 255077536 ## pop of USA in 1992
popn <- statepop$popn ## pop per county in 1992
psi <- popn / M0
phys <- statepop$phys
plot (popn, phys, log="xy")
##
## Call:
## lm(formula = phys ~ psi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2241.1 -564.9 -256.7 -125.8 9884.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 152.0 185.3 0.821 0.414
## psi 687788.5 21708.5 31.683 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1623 on 98 degrees of freedom
## Multiple R-squared: 0.9111, Adjusted R-squared: 0.9101
## F-statistic: 1004 on 1 and 98 DF, p-value: < 2.2e-16
## Est. S.E. ci.low ci.upp
## 570304.30 41401.23 488155.27 652453.32
## Est. S.E. ci.low ci.upp
## 0.0022358076 0.0001623084 0.0019137525 0.0025578627
Compare with an estimate based on an SRS sample (Assignment 3 Question 3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 5 y values <= 0 omitted from
## logarithmic plot
##
## Call:
## lm(formula = physician ~ totpop, data = counties)
##
## Residuals:
## Min 1Q Median 3Q Max
## -767.24 -43.44 4.84 32.93 3105.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.423e+01 3.490e+01 -1.554 0.123
## totpop 2.965e-03 6.519e-05 45.477 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 340.3 on 98 degrees of freedom
## Multiple R-squared: 0.9548, Adjusted R-squared: 0.9543
## F-statistic: 2068 on 1 and 98 DF, p-value: < 2.2e-16
totpop <- 255077536
N <- 3141 # total number of counties
# naive estimate
srs_mean_est (counties$physician, N = 3141) * 3141
## Est. S.E. ci.low ci.upp
## 933410.97 491982.79 -42789.62 1909611.56
## Est. S.E. ci.low ci.upp
## 639506.03 87885.27 465122.60 813889.46
## Est. S.E. ci.low ci.upp
## 585870.6 105177.4 377149.4 794591.8
studenthrs <- read.table ("data/studentshrs.txt", header = T)
## add sampling probability to the data
studenthrs$cpsi <- studenthrs$class_size/647
studenthrs
## data --- original data frame for holding data
## sid --- variable recording sample id of psu, note that,
## sid must be different when the same psu is surveyed multiple times
## csize --- variable recording cluster (psu) population size (not sample size)
## cpik --- sampling probability for each cluster
## yvar --- variable of interest
## N --- total number of clusters (psus)
cluster_upswr_ratio_est <- function (data, sid, csize, cpsi, yvar,show.cluster.summary=TRUE)
{
clust <- data[, sid]
ydata <- data [, yvar]
## cluster-wise summary
ybari <- tapply (ydata, clust, mean)
Mi <- tapply (data [,csize], clust, function (x) x[1])
psi <- tapply (data [,cpsi], clust, function (x) x[1])
## the same as total in cluster if Mi = mi
t_hat_cls <- ybari * Mi
if(show.cluster.summary){
cat ("Cluster level Summary:\n")
print(data.frame(Mi,psi,ybari, ti=t_hat_cls, ti_psi=t_hat_cls/psi,Mi_psi=Mi/psi))
}
## apply ratio estimate to t_hat_cls and Mi
srs_ratio_est (t_hat_cls/psi, Mi/psi)
}
cluster_upswr_ratio_est (studenthrs, "sid" , "class_size", "cpsi", "hrs")
## Cluster level Summary:
## Mi psi ybari ti ti_psi Mi_psi
## 1 44 0.06800618 3.7 162.8 2393.9 647
## 5 76 0.11746522 2.8 212.8 1811.6 647
## 12 24 0.03709428 2.4 57.6 1552.8 647
## 141 100 0.15455951 1.6 160.0 1035.2 647
## 142 100 0.15455951 2.0 200.0 1294.0 647
## Est. S.E. ci.low ci.upp
## 2.5000000 0.3605551 1.4989385 3.5010615
# Read population data
agpop <- read.csv ("data/agpop.csv", na = "-99")
## removing the counties with NA in either acres92 or acres87
agpop <- subset(agpop, subset = !is.na(agpop$acres92) & !is.na(agpop$acres87))
N <- nrow (agpop); N
## [1] 3044
##
## Call:
## lm(formula = agpop$acres92 ~ agpop$acres87)
##
## Residuals:
## Min 1Q Median 3Q Max
## -642580 -8012 -453 5738 656073
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.451e+03 8.517e+02 -2.878 0.00404 **
## agpop$acres87 9.882e-01 1.598e-03 618.428 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37840 on 3042 degrees of freedom
## Multiple R-squared: 0.9921, Adjusted R-squared: 0.9921
## F-statistic: 3.825e+05 on 1 and 3042 DF, p-value: < 2.2e-16
acres87 <- agpop[, "acres87"]
est_srs300 <- est_ups300_acres87 <- est_srs300_reg_acres87 <- rep (0, 2000)
for (i in 1:2000)
{
## srs sample
srs300 <- sample (1:N, size = 300)
## survey
acres92_srs300 <- agpop[srs300, "acres92"]
acres87_srs300 <- agpop[srs300, "acres87"]
## estimate
est_srs300 [i] <- (srs_mean_est (acres92_srs300, N = N) * N) [1]
est_srs300_reg_acres87[i] <- srs_reg_est(
acres92_srs300, acres87_srs300, total_acres87/N, N = N, est.total = TRUE )[1]
## upswr sample
ups300 <- sample (1:N, size = 300, prob = acres87, replace = TRUE)
## survey (obtaining measurements on acres92)
acres92_ups300 <- agpop[ups300, "acres92"]
## estimate
psi <- (acres87 / sum (acres87)) [ups300]
est_ups300_acres87 [i] <- upswr_total_est (acres92_ups300, psi) [1]
}
##
## Call:
## lm(formula = agpop$acres92 ~ agpop$largef87)
##
## Residuals:
## Min 1Q Median 3Q Max
## -837145 -85139 -46310 18214 6770696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95759.61 7151.10 13.39 <2e-16 ***
## agpop$largef87 3863.08 77.35 49.94 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 315700 on 3042 degrees of freedom
## Multiple R-squared: 0.4505, Adjusted R-squared: 0.4503
## F-statistic: 2494 on 1 and 3042 DF, p-value: < 2.2e-16
est_ups300_largef87 <- rep (0, 2000)
for (i in 1:2000)
{
## ups sample
ups300 <- sample (1:N, size = 300, prob = largef87, replace = TRUE)
## survey
acres92_ups300 <- agpop[ups300, "acres92"]
## estimate
psi <- (largef87 / sum (largef87)) [ups300]
est_ups300_largef87 [i] <- upswr_total_est (acres92_ups300, psi) [1]
}
boxplot (data.frame(est_srs300,est_srs300_reg_acres87, est_ups300_acres87, est_ups300_largef87))
abline(h=total_acres92, col="red")
## data --- original data frame for holding data
## cname --- variable recording cluster (psu) identity
## csize --- variable recording cluster (psu) population size (not sample size)
## cpik --- sampling probability for each cluster
## yvar --- variable of interest
## N --- total number of clusters (psus)
cluster_upswo_ratio_est <- function (data, cname, csize, cpik, yvar)
{
clust <- data[,cname]
ydata <- data [, yvar]
## cluster-wise summary
ybari <- tapply (ydata, clust, mean)
Mi <- tapply (data [,csize], clust, function (x) x[1])
pik <- tapply (data [,cpik], clust, function (x) x[1])
## the same as total in cluster if Mi = mi
t_hat_cls <- ybari * Mi
## apply ratio estimate to t_hat_cls and Mi
srs_ratio_est (t_hat_cls/pik, Mi/pik)
}
classpps <- read.csv ("data/classpps.csv")
classpps
## Est. S.E. ci.low ci.upp
## 3.4500000 0.4818584 2.1121467 4.7878533