1 Estimate Normal Probability P(a < X < b) Using Importance Sampling

## a function computing the sum of numbers represented with logarithm
## lx     --- a vector of numbers, which are the log of another vector x.
## the log of sum of x is returned
log_sum_exp <- function(lx)
{   mlx <- max(lx)
    mlx + log(sum(exp(lx-mlx)))
}
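
## quick sanity check (illustrative numbers assumed here, not from the original
## notes): for values that do not overflow, log_sum_exp should agree with
## summing on the ordinary scale and then taking the log
stopifnot(isTRUE(all.equal(log_sum_exp(log(c(1, 2, 3))), log(6))))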

## estimating the probability P(X in A) for X ~ N(0,1) 
## by sampling from N(0,1)
est_normprob_mc <- function(A,iters_mc)
{
     X <- rnorm(iters_mc)
     mean((X > A[1]) * (X<A[2]))
}

## estimating the probability P(X in A) for X ~ N(0,1) 
## by sampling from Unif(A[1],A[2])
est_normprob_imps <- function(A, iters_mc)
{
     X <- runif(iters_mc,A[1],A[2])
     mean(dnorm(X))*(A[2]-A[1])
}

## estimating the log of the probability P(X in A) for X ~ N(0,1)
## by sampling from Unif(A[1],A[2]), working on the log scale to avoid underflow
est_log_normprob_imps <- function(A, iters_mc)
{
     X <- runif(iters_mc,A[1],A[2])
     log_sum_exp(dnorm(X,log = TRUE))-log(iters_mc)+ log(A[2]-A[1])
}
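
As a quick consistency check (a hedged aside with an interval and sample size chosen here, not part of the original tests), the log-scale estimator should agree with the plain importance-sampling estimator, up to Monte Carlo noise, whenever the probability does not underflow:

set.seed(1)
A_check <- c(-1, 1)
p_log <- exp(est_log_normprob_imps(A_check, 100000))
p_lin <- est_normprob_imps(A_check, 100000)
## both should be close to pnorm(1) - pnorm(-1), about 0.683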

1.1 Test 1

A <- c(-2,2)
tp <- pnorm (A[2]) - pnorm (A[1])

probs_mc <- replicate(1000,est_normprob_mc(A,100))
probs_imps <- replicate(1000,est_normprob_imps(A,100))
par(mfrow = c(1,2))
ylim <- range (probs_mc, probs_imps)
plot(probs_mc, ylim = ylim); abline (h=tp)
plot(probs_imps, ylim = ylim); abline (h=tp)

mean((probs_mc-tp)^2)
## [1] 0.0004084102
mean((probs_imps-tp)^2)
## [1] 0.002170898

1.2 Test 2

A <- c(5,6)
tp <- pnorm (A[2]) - pnorm (A[1])

probs_mc <- replicate(1000,est_normprob_mc(A,100))
probs_imps <- replicate(1000,est_normprob_imps(A,100))
par(mfrow = c(1,2))
ylim <- range (probs_mc, probs_imps)
plot(probs_mc, ylim = ylim); abline (h=tp)
plot(probs_imps, ylim = ylim); abline (h=tp)

mean((probs_mc-tp)^2)
## [1] 8.160448e-14
mean((probs_imps-tp)^2)
## [1] 1.287765e-15

1.3 Test 3

A <- c(-2,2)
tp <- pnorm (A[2]) - pnorm (A[1])

probs_mc <- replicate(1000,est_normprob_mc(A,100))
probs_imps <- replicate(1000,est_normprob_imps(A,100))
par(mfrow = c(1,2))
ylim <- range (probs_mc, probs_imps)
plot(probs_mc, ylim = ylim); abline (h=tp)
plot(probs_imps, ylim = ylim); abline (h=tp)

mean((probs_mc-tp)^2)
## [1] 0.0004322001
mean((probs_imps-tp)^2)
## [1] 0.002086488

1.4 Test 4: An example of computing a probability that underflows

A <- c(50,100)
#A naive method that cannot compute such a small probability
tp1 <- pnorm (A[2]) - pnorm (A[1]); log(tp1)
## [1] -Inf
#The reason is that pnorm(A[1]) is rounded to 1, so the difference is computed as 0; equivalently, pnorm(-A[1]) underflows
pnorm (A[1]) 
## [1] 1
log(pnorm(A[1]))
## [1] 0
#Instead, we can compute the log probability this way, working entirely on the log scale
log_minus_exp <- function (la, lb) la + log (1-exp(lb-la))
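## quick sanity check (illustrative values, not from the original run):
## on the log scale, log(5 - 2) should equal log(3)
stopifnot(isTRUE(all.equal(log_minus_exp(log(5), log(2)), log(3))))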
la <- pnorm(-A[1], log=TRUE); la
## [1] -1254.831
lb <- pnorm(-A[2], log=TRUE); lb
## [1] -5005.524
log_tp2 <- log_minus_exp(la, lb); log_tp2
## [1] -1254.831
#We still use the same importance sampling procedure, but take care of the underflow of dnorm for values in (A[1],A[2]) by working on the log scale
log_probs_imps <- replicate(100,est_log_normprob_imps(A,1000)); log_probs_imps
##   [1] -1256.219 -1253.717 -1256.840 -1254.937 -1256.541 -1253.724 -1255.119
##   [8] -1256.416 -1256.960 -1254.609 -1257.707 -1254.813 -1259.334 -1253.898
##  [15] -1256.832 -1254.067 -1253.740 -1255.728 -1260.397 -1254.268 -1254.212
##  [22] -1255.655 -1255.716 -1254.137 -1253.946 -1255.643 -1254.760 -1259.863
##  [29] -1256.737 -1260.544 -1254.382 -1255.791 -1254.248 -1255.785 -1255.147
##  [36] -1255.416 -1256.365 -1259.139 -1255.745 -1255.261 -1255.384 -1257.428
##  [43] -1257.536 -1257.853 -1263.320 -1263.491 -1256.170 -1253.923 -1258.896
##  [50] -1254.380 -1256.178 -1254.778 -1258.196 -1254.844 -1253.831 -1254.172
##  [57] -1256.840 -1261.347 -1258.454 -1256.153 -1257.195 -1256.215 -1254.023
##  [64] -1253.650 -1258.758 -1254.485 -1253.927 -1254.819 -1253.525 -1253.864
##  [71] -1259.330 -1254.258 -1256.158 -1260.621 -1254.152 -1254.813 -1255.764
##  [78] -1255.049 -1256.081 -1254.544 -1254.217 -1254.992 -1254.146 -1255.051
##  [85] -1255.003 -1257.130 -1253.178 -1255.417 -1255.867 -1254.125 -1253.724
##  [92] -1255.978 -1256.471 -1253.932 -1255.597 -1254.092 -1261.953 -1256.690
##  [99] -1258.961 -1256.369
plot (log_probs_imps); abline (h = log_tp2)
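
The replicates can also be pooled into one combined estimate while staying on the log scale; a minimal sketch reusing log_sum_exp (the name log_pooled_imps is introduced here for illustration):

log_pooled_imps <- log_sum_exp(log_probs_imps) - log(length(log_probs_imps))
## this averages the probabilities (not their logs) and should be close to log_tp2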

2 Estimate E(a(X)) of Truncated Normal Using Importance Sampling

2.1 Estimating Function

## compute E(a) with importance sampling
est_tnorm_imps <- function(a, A, iters_mc)
{
    X <- runif(iters_mc,A[1],A[2])
    W <- dnorm (X)
    ahat <- sum (a(X) * W) / sum (W)
    attr(ahat, "effective sample size") <- 1/sum((W/sum(W))^2)
    ahat
}
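
## A minimal usage sketch (function and interval chosen here for illustration):
## the attribute stores Kish's effective sample size,
## n_eff = (sum W)^2 / sum(W^2), which shrinks when a few weights dominate
ahat_check <- est_tnorm_imps(function(x) x^2, c(1, 2), 10000)
ess_check <- attr(ahat_check, "effective sample size")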


## a generic function for approximating 1-D integral with midpoint rule
## the logarithms of the function values are passed in
## the log of the integral result is returned
## log_f  --- a function computing the logarithm of the integrand function
## range  --- the range of the integration variable, a vector of two elements
## n      --- the number of points at which the integrand is evaluated
## ...    --- other parameters needed by log_f
log_int_mid <- function(log_f, range, n,...)
{   if(range[1] >= range[2]) 
    stop("Wrong ranges")
    h <- (range[2]-range[1]) / n
    v_log_f <- sapply(range[1] + (1:n - 0.5) * h, log_f,...)
    log_sum_exp(v_log_f) + log(h)       
}
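
## quick check of the midpoint rule (interval and n assumed here): the log of
## the integral of the standard normal density over (-2, 2) should be close to
## log(pnorm(2) - pnorm(-2)), about -0.0466
log_check_mid <- log_int_mid(function(x) dnorm(x, log = TRUE), c(-2, 2), 1000)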

## compute E(a) with midpoint rule
est_tnorm_mid <- function (a, A, iters_mc)
{
    log_f <- function (x) dnorm (x, log = T) + log (a(x))
    exp(log_int_mid (log_f, A, iters_mc) ) / (pnorm (A[2]) - pnorm (A[1]))
}

library (ars) 
## Warning: package 'ars' was built under R version 3.4.4
## direct rejection sampling for the truncated normal: draw from N(0,1) and keep only draws inside [lb, ub]
sample_tnorm_drs <- function (n, lb = -Inf, ub = Inf)
{
    x <- rep (0, n)
    for (i in 1:n)
    {
        rej <- TRUE
        while (rej)
        {
            x[i] <- rnorm (1)
            if (x[i] >= lb & x[i] <= ub) rej <- FALSE
        }
    }
    x
}

## sample from truncated normal using ars package
sample_tnorm_ars <- function (n, lb, ub)
{
    logf <- function (x) dnorm (x, log = TRUE) ## define log density
    fprima <- function (x) -x ## define derivative of log density
    
    ars (n, f = logf, fprima = fprima, 
         x = c(lb, (lb + ub )/2, ub), # starting points
         lb = TRUE, ub = TRUE, xlb = lb, xub = ub) # boundary of log density
}
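
As a quick hedged check of the two samplers (interval, sample size, and seed chosen here for illustration), both should produce draws confined to the interval and, by symmetry, similar sample means:

set.seed(1)
x_drs <- sample_tnorm_drs(2000, -1, 1)
x_ars <- sample_tnorm_ars(2000, -1, 1)
stopifnot(all(x_drs >= -1 & x_drs <= 1), all(x_ars >= -1 & x_ars <= 1))
stopifnot(abs(mean(x_drs) - mean(x_ars)) < 0.1)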

2.2 Test 1

## define the function a
a <- function (x) x^2
interval <- c(1,2)
A <- est_tnorm_mid (a, interval, 100000) ## midpoint rule

## estimate E(a) with rejection sampling
system.time(
    {
        rn_tnorm_ars <- sample_tnorm_ars (1000, interval[1],interval[2]) # draw samples from tnorm
        mean (a (rn_tnorm_ars)) 
    }
)
##    user  system elapsed 
##   0.088   0.003   0.091
system.time(
est_tnorm_imps (a, interval, 1000000) ## importance sampling
)
##    user  system elapsed 
##   0.116   0.010   0.128
## simulation comparison of importance sampling and rejection sampling
times.imps <- system.time(
EA_imps <- replicate (100, est_tnorm_imps (a, interval, 1000000)) 
)

times.imps
##    user  system elapsed 
##   6.431   0.242   7.089
times.rej <- system.time (
EA_rej <- replicate (100,
    {   rn_tnorm_ars <- sample_tnorm_ars (1000, interval[1],interval[2])
        mean (a (rn_tnorm_ars))
    }
    )
)
times.rej
##    user  system elapsed 
##   6.987   0.894   8.032
par (mfrow = c(1,2))
ylim <- range (EA_imps, EA_rej)
plot (EA_imps, ylim = ylim, main = "Importance Sampling") 
abline (h = A)
plot (EA_rej, ylim = ylim, main = "Adaptive Rejection Sampling") 
abline (h = A)

mean ((EA_rej-A)^2)
## [1] 6.529003e-05
mean ((EA_imps-A)^2)
## [1] 5.649387e-07

2.3 Test 2

## define the function a
a <- function (x) x^2
interval <- c(-1,1)
A <- est_tnorm_mid (a, interval, 100000) ## midpoint rule

## estimate E(a) with rejection sampling
system.time(
    {
        rn_tnorm_ars <- sample_tnorm_ars (1000, interval[1],interval[2]) # draw samples from tnorm
        mean (a (rn_tnorm_ars)) 
    }
)
##    user  system elapsed 
##   0.071   0.012   0.083
system.time(
est_tnorm_imps (a, interval, 1000000) ## importance sampling
)
##    user  system elapsed 
##   0.081   0.004   0.091
## simulation comparison of importance sampling and rejection sampling
times.imps <- system.time(
EA_imps <- replicate (100, est_tnorm_imps (a, interval, 1000000)) 
)

times.imps
##    user  system elapsed 
##   6.256   0.225   6.598
times.rej <- system.time (
EA_rej <- replicate (100,
    {   rn_tnorm_ars <- sample_tnorm_ars (1000, interval[1],interval[2])
        mean (a (rn_tnorm_ars))
    }
    )
)
times.rej
##    user  system elapsed 
##   7.345   0.656   8.328
par (mfrow = c(1,2))
ylim <- range (EA_imps, EA_rej)
plot (EA_imps, ylim = ylim, main = "Importance Sampling") 
abline (h = A)
plot (EA_rej, ylim = ylim, main = "Adaptive Rejection Sampling") 
abline (h = A)

mean ((EA_rej-A)^2)
## [1] 7.350695e-06
mean ((EA_imps-A)^2)
## [1] 7.0304e-08

3 Computing Log Marginalized Likelihood for Normal Models with Importance Sampling

3.1 Functions

## a function computing the sum of numbers represented with logarithm
## lx     --- a vector of numbers, which are the log of another vector x.
## the log of sum of x is returned
log_sum_exp <- function(lx)
{   mlx <- max(lx)
    mlx + log(sum(exp(lx-mlx)))
}
## computing the log probability density function of multivariate normal
## x     --- a vector, the p.d.f at x will be computed
## mu   --- the mean vector of multivariate normal distribution
## A     --- the inverse covariance matrix of multivariate normal distribution
log_pdf_mnormal <- function(x, mu, A)
{   0.5 * ( -length(mu)*log(2*pi) + sum(log(svd(A)$d)) - t(x-mu) %*% A %*% (x-mu) ) 

}
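## quick sanity check (values assumed here): in one dimension, with A equal to
## the inverse variance, the density should match dnorm; e.g. with variance 4
stopifnot(isTRUE(all.equal(
    as.numeric(log_pdf_mnormal(0.3, 0, matrix(1/4))),
    dnorm(0.3, 0, 2, log = TRUE))))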
## the function for computing log likelihood of normal data
log_lik <- function(x,mu,w)
{   sum(dnorm(x,mu,exp(w),log=TRUE))
}

## the function for computing log prior
log_prior <- function(mu,w, mu_0,sigma_mu,w_0,sigma_w)
{   dnorm(mu,mu_0,sigma_mu,log=TRUE) + dnorm(w,w_0,sigma_w,log=TRUE)
}

## the function for computing the negative log of likelihood * prior 
neg_log_post <- function(x, theta, mu_0,sigma_mu,w_0,sigma_w)
{   - log_lik(x,theta[1], theta[2]) - 
    log_prior(theta[1],theta[2],mu_0,sigma_mu,w_0,sigma_w)
}


## computing the log marginal likelihood using importance sampling with 
## the posterior distribution approximated by the Gaussian distribution at
## its mode
log_mar_gaussian_imps <- function(x,mu_0,sigma_mu,w_0,sigma_w,iters_mc)
{   result_min <- nlm(f=neg_log_post,p=c(mean(x),log(sqrt(var(x)))), 
                      hessian=TRUE,
                      x=x,mu_0=mu_0,sigma_mu=sigma_mu,w_0=w_0,sigma_w=sigma_w)
    hessian <- result_min$hessian
    mu <- result_min$estimate
    
    ## Cholesky factor of the covariance (inverse Hessian), the multiplier for sampling from the multivariate normal
    Sigma <- t( chol(solve(hessian)) )
    ## draw samples from N(mu, Sigma %*% Sigma')
    thetas <- Sigma %*% matrix(rnorm(2*iters_mc),2,iters_mc) + mu
     
    ## values of log approximate p.d.f. at samples
    log_pdf_mnormal_thetas <- apply(thetas,2,log_pdf_mnormal,mu=mu,A=hessian)
    ## values of log true p.d.f. at samples
    log_post_thetas <- - apply(thetas,2,neg_log_post,x=x, mu_0=mu_0,
                               sigma_mu=sigma_mu,w_0=w_0,sigma_w=sigma_w)
  
    ## averaging the importance weights and returning the log of the average
    log_sum_exp(log_post_thetas-log_pdf_mnormal_thetas) - log(iters_mc)
}
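
## Note on the estimator above: the proposal q is the Gaussian approximation
## N(mode, hessian^{-1}), and the last line evaluates, on the log scale,
##     p(x) ~= (1/n) * sum_i p(x | theta_i) p(theta_i) / q(theta_i),
## with log_sum_exp guarding against underflow of the individual weights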


## we use the naive Monte Carlo method (sampling from the priors) to check the above function
log_mar_gaussian_mc <- function(x,mu_0,sigma_mu,w_0,sigma_w,iters_mc)
{
    ## draw samples from the priors
    mus <- rnorm(iters_mc,mu_0,sigma_mu)
    ws <- rnorm(iters_mc,w_0,sigma_w)
    one_log_lik <- function(i)
    {  log_lik(x,mus[i],ws[i])
    }
    v_log_lik <- sapply(1:iters_mc,one_log_lik)
    log_sum_exp(v_log_lik) - log(iters_mc)
}
## a generic function for finding the Laplace approximation to the integral of 'f'
## neg_log_f    --- the negative log of the integrand function
## p0           --- initial value in searching mode
## ...          --- other arguments needed by neg_log_f
bayes_inference_lap <- function(neg_log_f,p0,...)
{   ## looking for the mode of f and the Hessian of neg_log_f at the mode
    result_min <- nlm(f=neg_log_f,p=p0, hessian=TRUE,...)
    hessian <- result_min$hessian
    neg_log_like_mode <- result_min$minimum
    
    
    estimates <- result_min$estimate ## posterior mode
    SIGMA <- solve(result_min$hessian) ## approximate posterior covariance (inverse Hessian at the mode)
    sds <- sqrt (diag(SIGMA)) ## standard errors of each estimate
    log_mar_lik <- ## log marginalized likelihood
        - neg_log_like_mode + 0.5 * ( sum(log(2*pi) - log(svd(hessian)$d) ))
    
    list (estimates = estimates, sds = sds, SIGMA = SIGMA, log_mar_lik = log_mar_lik)
}
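
## A quick hedged check of the Laplace formula (example integrand chosen here,
## not from the original notes): for the pure Gaussian integrand
## exp(-0.5 * sum(theta^2)) in two dimensions the approximation is exact,
## so log_mar_lik should be very close to log(2 * pi)
lap_check <- bayes_inference_lap(function(theta) 0.5 * sum(theta^2), p0 = c(1, 1))
stopifnot(abs(lap_check$log_mar_lik - log(2 * pi)) < 1e-3)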


## approximating the log of integral of likelihood * prior
bayes_inference_lap_gaussian <- function(x,mu_0,sigma_mu,w_0,sigma_w)
{   bayes_inference_lap(
                neg_log_post,p0=c(mean(x),log(sqrt(var(x)))),
                x=x,mu_0=mu_0,sigma_mu=sigma_mu,w_0=w_0,sigma_w=sigma_w
    )
}

3.2 Testing

## debugging the program
x <- rnorm(50)
log_mar_gaussian_imps(x,0,1,0,5,100)
## [1] -79.60525
log_mar_gaussian_mc(x,0,1,0,5,10000)
## [1] -79.75541
bayes_inference_lap_gaussian(x,0,1,0,5)
## $estimates
## [1] -0.05266754  0.05650374
## 
## $sds
## [1] 0.14799462 0.09998787
## 
## $SIGMA
##              [,1]         [,2]
## [1,] 0.0219024079 0.0000240405
## [2,] 0.0000240405 0.0099975749
## 
## $log_mar_lik
## [1] -79.59742
x <- rnorm(10) # another debug
log_mar_gaussian_imps(x,0,1,0,5,100)
## [1] -17.45898
log_mar_gaussian_mc(x,0,1,0,5,10000)
## [1] -17.38112
bayes_inference_lap_gaussian(x,0,1,0,5)
## $estimates
## [1]  0.04303722 -0.11027164
## 
## $sds
## [1] 0.2724970 0.2234578
## 
## $SIGMA
##               [,1]          [,2]
## [1,]  0.0742545976 -0.0003145331
## [2,] -0.0003145331  0.0499333888
## 
## $log_mar_lik
## [1] -17.49375
## comparing importance sampling (with the Gaussian approximation as proposal) against naive Monte Carlo
x <- rnorm(200)
bayes_inference_lap_gaussian(x,0,1,0,5)
## $estimates
## [1]  0.04912102 -0.02271822
## 
## $sds
## [1] 0.06895783 0.05000259
## 
## $SIGMA
##               [,1]          [,2]
## [1,]  4.755182e-03 -9.215509e-07
## [2,] -9.215509e-07  2.500259e-03
## 
## $log_mar_lik
## [1] -286.5243
v_log_mar_imps <- replicate(1000, log_mar_gaussian_imps(x,0,1,0,5,100))
v_log_mar_mc <- replicate(1000, log_mar_gaussian_mc(x,0,1,0,5,100))


par(mfcol=c(2,3))
xlim <- c(min(c(v_log_mar_imps,v_log_mar_mc)),max(c(v_log_mar_imps,v_log_mar_mc)))

plot(v_log_mar_imps, main="Importance Sampling")
hist(v_log_mar_imps,main="Importance Sampling")

plot(v_log_mar_imps, ylim = xlim, main="Importance Sampling")
hist(v_log_mar_imps,main="Importance Sampling",xlim=xlim)

plot(v_log_mar_mc,main="Naive Monte Carlo",ylim=xlim)
hist(v_log_mar_mc,main="Naive Monte Carlo",xlim=xlim)

## comparing variance of importance sampling and naive sampling
var (v_log_mar_imps)
## [1] 0.0001848107
var (v_log_mar_mc)
## [1] 184.0005

Comparing the importance sampling estimate (averaged over replicates) with the Laplace approximation:

mean (v_log_mar_imps)
## [1] -286.5195
bayes_inference_lap_gaussian(x,0,1,0,5)
## $estimates
## [1]  0.04912102 -0.02271822
## 
## $sds
## [1] 0.06895783 0.05000259
## 
## $SIGMA
##               [,1]          [,2]
## [1,]  4.755182e-03 -9.215509e-07
## [2,] -9.215509e-07  2.500259e-03
## 
## $log_mar_lik
## [1] -286.5243

We see that the Laplace approximation is very accurate for this problem, but it may not be as good for other problems in which the posterior cannot be well approximated by a Gaussian.