1 Functions and Packages
2 Analysis of a dataset (student study hours) collected by UPSWR
3 Analysis of a dataset (statepop.csv) collected with one-stage UPSWR
4 Analysis of a dataset (studenthrs.csv) collected with two-stage UPSWR
5 Simulation Studies for UPSWR using agpop data
- 5.1 Using acres87 to construct sampling probabilities
- 5.2 Using largef87 to construct sampling probabilities
6 Analysis of a dataset (student study hours) collected by UPSWO (without replacement)

1 Functions and Packages

## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size

## the output is the ratio of ybarU/xbarU
srs_ratio_est <- function (ydata, xdata, N = Inf)
{   
  n <- length (xdata)
  xbar <- mean (xdata)
  ybar <- mean (ydata)
  B_hat <- ybar / xbar
  d <- ydata - B_hat * xdata
  var_d <- sum (d^2) / (n - 1)
  sd_B_hat <- sqrt ((1 - n/N) * var_d / n) / xbar
  mem <- qt (0.975, df = n - 1) * sd_B_hat
  output <- c (B_hat, sd_B_hat, B_hat - mem, B_hat + mem )
  
  
  names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
  output
}

## ydata --- observations of the variable of interest
## xdata --- observations of the auxilliary variable
## N --- population size
## xbarU --- population mean of auxilliary variable

## the output is the estimate mean or total (est.total=TRUE)
srs_reg_est <- function (ydata, xdata, xbarU, N = Inf, est.total = FALSE)
{
  n <- length (ydata)
  lmfit <- lm (ydata ~ xdata)
  Bhat <- lmfit$coefficients
  efit <- lmfit$residuals
  SSe <- sum (efit^2) / (n - 2) 
  yhat_reg <- Bhat[1] + Bhat[2] * xbarU
  se_yhat_reg <- sqrt ((1-n/N) * SSe / n)
  mem <- qt (0.975, df = n - 2) * se_yhat_reg
  output <- c(yhat_reg, se_yhat_reg, yhat_reg - mem, yhat_reg + mem)
  
  if (est.total) {
      if(!is.finite(N)) stop("N must be finite for estimating population total" )
      output <- output * N
  }

  names (output) <- c("Est.", "S.E.", "ci.low", "ci.upp" )
  output
}

## sdata --- a vector of original survey data
## N --- population size
## to find total, multiply N to the estimate returned by this function

srs_mean_est <- function (sdata, N = Inf)
{
    n <- length (sdata)
    ybar <- mean (sdata)
    se.ybar <- sqrt((1 - n / N)) * sd (sdata) / sqrt(n)  
    mem <- qt (0.975, df = n - 1) * se.ybar
    c (Est. = ybar, S.E. = se.ybar, ci.low = ybar - mem, ci.upp = ybar + mem)
}

# total estimation for datasets collected with UPS with Replacement
upswr_total_est <- function (total, psi)
{   
  srs_mean_est (total/psi, N = Inf)
}
# ratio estimation for datasets collected with UPS with Replacement
upswr_ratio_est <- function (total, M, psi)
{
  srs_ratio_est (total/psi, M/psi, N = Inf)
}

2 Analysis of a dataset (student study hours) collected by UPSWR

psi <- c(24, 100, 100, 76, 44) / 647
Mi <- c(24, 100, 100, 76, 44)
hrs <- c(75, 203, 203, 191, 168)

plot(hrs~Mi)

summary (lm(hrs~Mi))

## 
## Call:
## lm(formula = hrs ~ Mi)
## 
## Residuals:
##       1       2       3       4       5 
## -29.825  -8.997  -8.997  12.847  34.972 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  70.9820    31.5534   2.250   0.1100  
## Mi            1.4101     0.4195   3.362   0.0437 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.52 on 3 degrees of freedom
## Multiple R-squared:  0.7902, Adjusted R-squared:  0.7203 
## F-statistic:  11.3 on 1 and 3 DF,  p-value: 0.04368

## total hrs spent by all students

upswr_total_est (total = hrs, psi = psi)

##      Est.      S.E.    ci.low    ci.upp 
## 1749.0144  222.4218 1131.4724 2366.5563

## mean hrs spent by each student
upswr_ratio_est (total = hrs, M = Mi, psi = psi)

##      Est.      S.E.    ci.low    ci.upp 
## 2.7032679 0.3437741 1.7487981 3.6577378

3 Analysis of a dataset (statepop.csv) collected with one-stage UPSWR

The file “statepop.csv” is a dataset from an unequal-probability sample of 100 counties in the United States. Counties were chosen using the cumulative-size method from the listings in the County and City Data Book (U.S. Census Bureau, 1994) with probabilities proportional to their populations. The total population for all counties is \(M_0 = \sum_{i=1} M_i = 255,077,536\). Sampling was done with replacement, so very large counties occur multiple times in the sample. For example, Los Angeles County, with the largest population in the United States, occurs four times.

statepop <- read.csv ("data/statepop.csv")
statepop

M0 <- 255077536 ## pop of USA in 1992
popn <- statepop$popn ## pop per county in 1992
psi <- popn / M0
phys <- statepop$phys

plot (popn, phys, log="xy")

summary(lm (phys~psi))

## 
## Call:
## lm(formula = phys ~ psi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2241.1  -564.9  -256.7  -125.8  9884.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    152.0      185.3   0.821    0.414    
## psi         687788.5    21708.5  31.683   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1623 on 98 degrees of freedom
## Multiple R-squared:  0.9111, Adjusted R-squared:  0.9101 
## F-statistic:  1004 on 1 and 98 DF,  p-value: < 2.2e-16

# estimate the total number of physicians in USA
upswr_total_est (phys, psi)

##      Est.      S.E.    ci.low    ci.upp 
## 570304.30  41401.23 488155.27 652453.32

# estimate number of physicians per capita
upswr_ratio_est (phys, popn, psi)

##         Est.         S.E.       ci.low       ci.upp 
## 0.0022358076 0.0001623084 0.0019137525 0.0025578627

Compare with an estimate based on an SRS sample (Assignment 3 Question 3)

# read srs sample of size 100
counties <- read.csv ("data/counties.csv", na = "-99")
counties

plot(physician~totpop, data=counties, log="xy")

## Warning in xy.coords(x, y, xlabel, ylabel, log): 5 y values <= 0 omitted from
## logarithmic plot

summary (lm(physician~totpop, data=counties))

## 
## Call:
## lm(formula = physician ~ totpop, data = counties)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -767.24  -43.44    4.84   32.93 3105.60 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.423e+01  3.490e+01  -1.554    0.123    
## totpop       2.965e-03  6.519e-05  45.477   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 340.3 on 98 degrees of freedom
## Multiple R-squared:  0.9548, Adjusted R-squared:  0.9543 
## F-statistic:  2068 on 1 and 98 DF,  p-value: < 2.2e-16

totpop <-  255077536
N <- 3141 # total number of counties
# naive estimate 
srs_mean_est (counties$physician, N = 3141) * 3141

##       Est.       S.E.     ci.low     ci.upp 
##  933410.97  491982.79  -42789.62 1909611.56

# ratio estimate
srs_ratio_est (counties$physician, counties$totpop, N = 3141) * totpop

##      Est.      S.E.    ci.low    ci.upp 
## 639506.03  87885.27 465122.60 813889.46

# regression estimate
srs_reg_est (counties$physician, counties$totpop, totpop/N, N = N) * N

##     Est.     S.E.   ci.low   ci.upp 
## 585870.6 105177.4 377149.4 794591.8

# From the textbook by Lohr, the true value is: 
total_physician <- 532638

4 Analysis of a dataset (studenthrs.csv) collected with two-stage UPSWR

studenthrs <- read.table ("data/studentshrs.txt", header = T)


## add sampling probability to the data
studenthrs$cpsi <- studenthrs$class_size/647
studenthrs

## data --- original data frame for holding data
## sid --- variable recording sample id of psu, note that, 
## sid must be different when the same psu is surveyed multiple times
## csize --- variable recording cluster (psu) population size (not sample size)
## cpik --- sampling probability for each cluster
## yvar --- variable of interest
## N --- total number of clusters (psus)

cluster_upswr_ratio_est <- function (data, sid, csize, cpsi, yvar,show.cluster.summary=TRUE)
{
    clust <- data[, sid]
    ydata <- data [, yvar]  
    
    ## cluster-wise summary
    ybari <- tapply (ydata, clust, mean)
    Mi <- tapply (data [,csize], clust, function (x) x[1])
    psi <- tapply (data [,cpsi], clust, function (x) x[1])
    ## the same as total in cluster if Mi = mi
    t_hat_cls <- ybari * Mi 
    
    if(show.cluster.summary){
        cat ("Cluster level Summary:\n")
        print(data.frame(Mi,psi,ybari, ti=t_hat_cls, ti_psi=t_hat_cls/psi,Mi_psi=Mi/psi))
    }
    
    ## apply ratio estimate to t_hat_cls and Mi
    
    srs_ratio_est (t_hat_cls/psi, Mi/psi)
}

cluster_upswr_ratio_est (studenthrs, "sid" , "class_size", "cpsi", "hrs")

## Cluster level Summary:
##      Mi        psi ybari    ti ti_psi Mi_psi
## 1    44 0.06800618   3.7 162.8 2393.9    647
## 5    76 0.11746522   2.8 212.8 1811.6    647
## 12   24 0.03709428   2.4  57.6 1552.8    647
## 141 100 0.15455951   1.6 160.0 1035.2    647
## 142 100 0.15455951   2.0 200.0 1294.0    647

##      Est.      S.E.    ci.low    ci.upp 
## 2.5000000 0.3605551 1.4989385 3.5010615

5 Simulation Studies for UPSWR using agpop data

# Read population data
agpop <- read.csv ("data/agpop.csv", na = "-99")
## removing the counties with NA in either acres92 or acres87
agpop <- subset(agpop, subset = !is.na(agpop$acres92) & !is.na(agpop$acres87))

N <- nrow (agpop); N

## [1] 3044

total_acres92<- sum (agpop$acres92)
total_acres87 <- sum (agpop$acres87)

5.1 Using acres87 to construct sampling probabilities

plot (agpop$acres87, agpop$acres92)

summary(lm(agpop$acres92~agpop$acres87))

## 
## Call:
## lm(formula = agpop$acres92 ~ agpop$acres87)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -642580   -8012    -453    5738  656073 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2.451e+03  8.517e+02  -2.878  0.00404 ** 
## agpop$acres87  9.882e-01  1.598e-03 618.428  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37840 on 3042 degrees of freedom
## Multiple R-squared:  0.9921, Adjusted R-squared:  0.9921 
## F-statistic: 3.825e+05 on 1 and 3042 DF,  p-value: < 2.2e-16

acres87 <- agpop[, "acres87"]
est_srs300 <- est_ups300_acres87 <- est_srs300_reg_acres87 <- rep (0, 2000)
for (i in 1:2000)
{
  ## srs sample
  srs300 <- sample (1:N, size = 300)
  ## survey
  acres92_srs300 <- agpop[srs300, "acres92"]
  acres87_srs300 <- agpop[srs300, "acres87"]
  ## estimate
  est_srs300 [i] <-  (srs_mean_est (acres92_srs300, N = N) * N) [1] 
  est_srs300_reg_acres87[i] <- srs_reg_est(
      acres92_srs300, acres87_srs300, total_acres87/N, N = N, est.total = TRUE )[1]
  
  ## upswr sample
  ups300 <- sample (1:N, size = 300, prob = acres87, replace = TRUE)

  ## survey (obtaining measurements on acres92)
  acres92_ups300 <- agpop[ups300, "acres92"]

  ## estimate
  psi <- (acres87 / sum (acres87)) [ups300]
  est_ups300_acres87 [i] <-  upswr_total_est (acres92_ups300, psi) [1] 

}

par (mfrow = c(2,1))
boxplot (data.frame(est_srs300, est_srs300_reg_acres87, est_ups300_acres87))
abline(h=total_acres92, col="red")
boxplot (data.frame(est_srs300_reg_acres87, est_ups300_acres87))

abline(h=total_acres92, col="red")

5.2 Using largef87 to construct sampling probabilities

plot (agpop$largef92, agpop$acres92)

largef87 <- agpop[, "largef87"]
summary(lm(agpop$acres92~agpop$largef87))

## 
## Call:
## lm(formula = agpop$acres92 ~ agpop$largef87)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -837145  -85139  -46310   18214 6770696 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    95759.61    7151.10   13.39   <2e-16 ***
## agpop$largef87  3863.08      77.35   49.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 315700 on 3042 degrees of freedom
## Multiple R-squared:  0.4505, Adjusted R-squared:  0.4503 
## F-statistic:  2494 on 1 and 3042 DF,  p-value: < 2.2e-16

est_ups300_largef87 <- rep (0, 2000)

for (i in 1:2000)
{

    ## ups sample
    ups300 <- sample (1:N, size = 300, prob = largef87, replace = TRUE)
    ## survey
    acres92_ups300 <- agpop[ups300, "acres92"]
    
    ## estimate
    psi <- (largef87 / sum (largef87)) [ups300]
    est_ups300_largef87 [i] <-  upswr_total_est (acres92_ups300, psi) [1] 
    
}

boxplot (data.frame(est_srs300,est_srs300_reg_acres87, est_ups300_acres87, est_ups300_largef87))
abline(h=total_acres92, col="red")

6 Analysis of a dataset (student study hours) collected by UPSWO (without replacement)

## data --- original data frame for holding data
## cname --- variable recording cluster (psu) identity
## csize --- variable recording cluster (psu) population size (not sample size)
## cpik --- sampling probability for each cluster
## yvar --- variable of interest
## N --- total number of clusters (psus)

cluster_upswo_ratio_est <- function (data, cname, csize, cpik, yvar)
{
  clust <- data[,cname]
  ydata <- data [, yvar]  
  
  ## cluster-wise summary
  ybari <- tapply (ydata, clust, mean)
  Mi <- tapply (data [,csize], clust, function (x) x[1])
  pik <- tapply (data [,cpik], clust, function (x) x[1])
  ## the same as total in cluster if Mi = mi
  t_hat_cls <- ybari * Mi 
  
  ## apply ratio estimate to t_hat_cls and Mi
  srs_ratio_est (t_hat_cls/pik, Mi/pik)
}

classpps <- read.csv ("data/classpps.csv")

classpps

cluster_upswo_ratio_est (classpps, "class", "clssize", "clssize", "hours")

##      Est.      S.E.    ci.low    ci.upp 
## 3.4500000 0.4818584 2.1121467 4.7878533

Unequal Probability Sampling

Longhai Li

September 2024