## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## the output is the ratio of ybarU/xbarU
<- function (ydata, xdata, N = Inf)
srs_ratio_est
{ <- length (xdata)
n <- mean (xdata)
xbar <- mean (ydata)
ybar <- ybar / xbar
B_hat <- ydata - B_hat * xdata
d <- sum (d^2) / (n - 1)
var_d <- sqrt ((1 - n/N) * var_d / n) / xbar
sd_B_hat <- qt (0.975, df = n - 1) * sd_B_hat
mem <- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
output
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
}
## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## xbarU --- population mean of auxilliary variable
## the output is the estimate mean or total (est.total=TRUE)
<- function (ydata, xdata, xbarU, N = Inf, est.total = FALSE)
srs_reg_est
{<- length (ydata)
n <- lm (ydata ~ xdata)
lmfit <- lmfit$coefficients
Bhat <- lmfit$residuals
efit <- sum (efit^2) / (n - 2)
SSe <- Bhat[1] + Bhat[2] * xbarU
yhat_reg <- sqrt ((1-n/N) * SSe / n)
se_yhat_reg <- qt (0.975, df = n - 2) * se_yhat_reg
mem <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
output
if (est.total) {
if(!is.finite(N)) stop("N must be finite for estimating population total" )
<- output * N
output
}
names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
output
}
## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function
<- function (sdata, N = Inf)
srs_mean_est
{<- length (sdata)
n <- mean (sdata)
ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)
se.ybar <- qt (0.975, df = n - 1) * se.ybar
mem c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}
# total estimation for datasets collected with UPS with Replacement
<- function (total, psi)
upswr_total_est
{ srs_mean_est (total/psi, N = Inf)
}# ratio estimation for datasets collected with UPS with Replacement
<- function (total, M, psi)
upswr_ratio_est
{srs_ratio_est (total/psi, M/psi, N = Inf)
}
<- c(24, 100, 100, 76, 44) / 647
psi <- c(24, 100, 100, 76, 44)
Mi <- c(75, 203, 203, 191, 168)
hrs
plot(hrs~Mi)
summary (lm(hrs~Mi))
##
## Call:
## lm(formula = hrs ~ Mi)
##
## Residuals:
## 1 2 3 4 5
## -29.825 -8.997 -8.997 12.847 34.972
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.9820 31.5534 2.250 0.1100
## Mi 1.4101 0.4195 3.362 0.0437 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.52 on 3 degrees of freedom
## Multiple R-squared: 0.7902, Adjusted R-squared: 0.7203
## F-statistic: 11.3 on 1 and 3 DF, p-value: 0.04368
## total hrs spent by all students
upswr_total_est (total = hrs, psi = psi)
## Est. S.E. ci.low ci.upp
## 1749.0144 222.4218 1131.4724 2366.5563
## mean hrs spent by each student
upswr_ratio_est (total = hrs, M = Mi, psi = psi)
## Est. S.E. ci.low ci.upp
## 2.7032679 0.3437741 1.7487981 3.6577378
The file “statepop.csv” is a dataset from an unequal-probability sample of 100 counties in the United States. Counties were chosen using the cumulative-size method from the listings in the County and City Data Book (U.S. Census Bureau, 1994) with probabilities proportional to their populations. The total population for all counties is \(M_0 = \sum_{i=1} M_i = 255,077,536\). Sampling was done with replacement, so very large counties occur multiple times in the sample. For example, Los Angeles County, with the largest population in the United States, occurs four times.
<- read.csv ("data/statepop.csv")
statepop statepop
<- 255077536 ## pop of USA in 1992
M0 <- statepop$popn ## pop per county in 1992
popn <- popn / M0
psi <- statepop$phys
phys
plot (popn, phys, log="xy")
summary(lm (phys~psi))
##
## Call:
## lm(formula = phys ~ psi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2241.1 -564.9 -256.7 -125.8 9884.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 152.0 185.3 0.821 0.414
## psi 687788.5 21708.5 31.683 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1623 on 98 degrees of freedom
## Multiple R-squared: 0.9111, Adjusted R-squared: 0.9101
## F-statistic: 1004 on 1 and 98 DF, p-value: < 2.2e-16
# estimate the total number of physicians in USA
upswr_total_est (phys, psi)
## Est. S.E. ci.low ci.upp
## 570304.30 41401.23 488155.27 652453.32
# estimate number of physicians per capita
upswr_ratio_est (phys, popn, psi)
## Est. S.E. ci.low ci.upp
## 0.0022358076 0.0001623084 0.0019137525 0.0025578627
Compare with an estimate based on an SRS sample (Assignment 3 Question 3)
# read srs sample of size 100
<- read.csv ("data/counties.csv", na = "-99")
counties counties
plot(physician~totpop, data=counties, log="xy")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 5 y values <= 0 omitted from
## logarithmic plot
summary (lm(physician~totpop, data=counties))
##
## Call:
## lm(formula = physician ~ totpop, data = counties)
##
## Residuals:
## Min 1Q Median 3Q Max
## -767.24 -43.44 4.84 32.93 3105.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.423e+01 3.490e+01 -1.554 0.123
## totpop 2.965e-03 6.519e-05 45.477 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 340.3 on 98 degrees of freedom
## Multiple R-squared: 0.9548, Adjusted R-squared: 0.9543
## F-statistic: 2068 on 1 and 98 DF, p-value: < 2.2e-16
<- 255077536
totpop <- 3141 # total number of counties
N # naive estimate
srs_mean_est (counties$physician, N = 3141) * 3141
## Est. S.E. ci.low ci.upp
## 933410.97 491982.79 -42789.62 1909611.56
# ratio estimate
srs_ratio_est (counties$physician, counties$totpop, N = 3141) * totpop
## Est. S.E. ci.low ci.upp
## 639506.03 87885.27 465122.60 813889.46
# regression estimate
srs_reg_est (counties$physician, counties$totpop, totpop/N, N = N) * N
## Est. S.E. ci.low ci.upp
## 585870.6 105177.4 377149.4 794591.8
# From the textbook by Lohr, the true value is:
<- 532638 total_physician
<- read.table ("data/studentshrs.txt", header = T)
studenthrs
## add sampling probability to the data
$cpsi <- studenthrs$class_size/647
studenthrs studenthrs
## data --- original data frame for holding data
## sid --- variable recording sample id of psu, note that,
## sid must be different when the same psu is surveyed multiple times
## csize --- variable recording cluster (psu) population size (not sample size)
## cpik --- sampling probability for each cluster
## yvar --- variable of interest
## N --- total number of clusters (psus)
<- function (data, sid, csize, cpsi, yvar,show.cluster.summary=TRUE)
cluster_upswr_ratio_est
{<- data[, sid]
clust <- data [, yvar]
ydata
## cluster-wise summary
<- tapply (ydata, clust, mean)
ybari <- tapply (data [,csize], clust, function (x) x[1])
Mi <- tapply (data [,cpsi], clust, function (x) x[1])
psi ## the same as total in cluster if Mi = mi
<- ybari * Mi
t_hat_cls
if(show.cluster.summary){
cat ("Cluster level Summary:\n")
print(data.frame(Mi,psi,ybari, ti=t_hat_cls, ti_psi=t_hat_cls/psi,Mi_psi=Mi/psi))
}
## apply ratio estimate to t_hat_cls and Mi
srs_ratio_est (t_hat_cls/psi, Mi/psi)
}
cluster_upswr_ratio_est (studenthrs, "sid" , "class_size", "cpsi", "hrs")
## Cluster level Summary:
## Mi psi ybari ti ti_psi Mi_psi
## 1 44 0.06800618 3.7 162.8 2393.9 647
## 5 76 0.11746522 2.8 212.8 1811.6 647
## 12 24 0.03709428 2.4 57.6 1552.8 647
## 141 100 0.15455951 1.6 160.0 1035.2 647
## 142 100 0.15455951 2.0 200.0 1294.0 647
## Est. S.E. ci.low ci.upp
## 2.5000000 0.3605551 1.4989385 3.5010615
# Read population data
<- read.csv ("data/agpop.csv", na = "-99")
agpop ## removing the counties with NA in either acres92 or acres87
<- subset(agpop, subset = !is.na(agpop$acres92) & !is.na(agpop$acres87))
agpop
<- nrow (agpop); N N
## [1] 3044
<- sum (agpop$acres92)
total_acres92<- sum (agpop$acres87) total_acres87
plot (agpop$acres87, agpop$acres92)
summary(lm(agpop$acres92~agpop$acres87))
##
## Call:
## lm(formula = agpop$acres92 ~ agpop$acres87)
##
## Residuals:
## Min 1Q Median 3Q Max
## -642580 -8012 -453 5738 656073
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.451e+03 8.517e+02 -2.878 0.00404 **
## agpop$acres87 9.882e-01 1.598e-03 618.428 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37840 on 3042 degrees of freedom
## Multiple R-squared: 0.9921, Adjusted R-squared: 0.9921
## F-statistic: 3.825e+05 on 1 and 3042 DF, p-value: < 2.2e-16
<- agpop[, "acres87"]
acres87 <- est_ups300_acres87 <- est_srs300_reg_acres87 <- rep (0, 2000)
est_srs300 for (i in 1:2000)
{## srs sample
<- sample (1:N, size = 300)
srs300 ## survey
<- agpop[srs300, "acres92"]
acres92_srs300 <- agpop[srs300, "acres87"]
acres87_srs300 ## estimate
<- (srs_mean_est (acres92_srs300, N = N) * N) [1]
est_srs300 [i] <- srs_reg_est(
est_srs300_reg_acres87[i] /N, N = N, est.total = TRUE )[1]
acres92_srs300, acres87_srs300, total_acres87
## upswr sample
<- sample (1:N, size = 300, prob = acres87, replace = TRUE)
ups300
## survey (obtaining measurements on acres92)
<- agpop[ups300, "acres92"]
acres92_ups300
## estimate
<- (acres87 / sum (acres87)) [ups300]
psi <- upswr_total_est (acres92_ups300, psi) [1]
est_ups300_acres87 [i]
}
par (mfrow = c(2,1))
boxplot (data.frame(est_srs300, est_srs300_reg_acres87, est_ups300_acres87))
abline(h=total_acres92, col="red")
boxplot (data.frame(est_srs300_reg_acres87, est_ups300_acres87))
abline(h=total_acres92, col="red")
plot (agpop$largef92, agpop$acres92)
<- agpop[, "largef87"]
largef87 summary(lm(agpop$acres92~agpop$largef87))
##
## Call:
## lm(formula = agpop$acres92 ~ agpop$largef87)
##
## Residuals:
## Min 1Q Median 3Q Max
## -837145 -85139 -46310 18214 6770696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95759.61 7151.10 13.39 <2e-16 ***
## agpop$largef87 3863.08 77.35 49.94 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 315700 on 3042 degrees of freedom
## Multiple R-squared: 0.4505, Adjusted R-squared: 0.4503
## F-statistic: 2494 on 1 and 3042 DF, p-value: < 2.2e-16
<- rep (0, 2000)
est_ups300_largef87
for (i in 1:2000)
{
## ups sample
<- sample (1:N, size = 300, prob = largef87, replace = TRUE)
ups300 ## survey
<- agpop[ups300, "acres92"]
acres92_ups300
## estimate
<- (largef87 / sum (largef87)) [ups300]
psi <- upswr_total_est (acres92_ups300, psi) [1]
est_ups300_largef87 [i]
}
boxplot (data.frame(est_srs300,est_srs300_reg_acres87, est_ups300_acres87, est_ups300_largef87))
abline(h=total_acres92, col="red")
## data --- original data frame for holding data
## cname --- variable recording cluster (psu) identity
## csize --- variable recording cluster (psu) population size (not sample size)
## cpik --- sampling probability for each cluster
## yvar --- variable of interest
## N --- total number of clusters (psus)
<- function (data, cname, csize, cpik, yvar)
cluster_upswo_ratio_est
{<- data[,cname]
clust <- data [, yvar]
ydata
## cluster-wise summary
<- tapply (ydata, clust, mean)
ybari <- tapply (data [,csize], clust, function (x) x[1])
Mi <- tapply (data [,cpik], clust, function (x) x[1])
pik ## the same as total in cluster if Mi = mi
<- ybari * Mi
t_hat_cls
## apply ratio estimate to t_hat_cls and Mi
srs_ratio_est (t_hat_cls/pik, Mi/pik)
}
<- read.csv ("data/classpps.csv")
classpps
classpps
cluster_upswo_ratio_est (classpps, "class", "clssize", "clssize", "hours")
## Est. S.E. ci.low ci.upp
## 3.4500000 0.4818584 2.1121467 4.7878533