1 Estimate Normal Probability P(a < X < b) Using Importance Sampling

## a function computing the sum of numbers represented with logarithm
## lx     --- a vector of numbers, which are the log of another vector x.
## the log of sum of x is returned
log_sum_exp <- function(lx)
{   mlx <- max(lx)
    mlx + log(sum(exp(lx-mlx)))
}
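
## quick sanity check (illustrative numbers assumed here, not from the original
## notes): for values that do not overflow, log_sum_exp should agree with
## summing on the ordinary scale and then taking the log
stopifnot(isTRUE(all.equal(log_sum_exp(log(c(1, 2, 3))), log(6))))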

## estimating the probability P(X in A) for X ~ N(0,1) 
## by sampling from N(0,1)
est_normprob_mc <- function(A,iters_mc)
{
     X <- rnorm(iters_mc)
     mean((X > A[1]) * (X<A[2]))
}

## estimating the probability P(X in A) for X ~ N(0,1) 
## by sampling from Unif(A[1],A[2])
est_normprob_imps <- function(A, iters_mc)
{
     X <- runif(iters_mc,A[1],A[2])
     mean(dnorm(X))*(A[2]-A[1])
}

## estimating the log of the probability P(X in A) for X ~ N(0,1)
## by sampling from Unif(A[1],A[2]), working on the log scale to avoid underflow
est_log_normprob_imps <- function(A, iters_mc)
{
     X <- runif(iters_mc,A[1],A[2])
     log_sum_exp(dnorm(X,log = TRUE))-log(iters_mc)+ log(A[2]-A[1])
}
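
As a quick consistency check (a hedged aside with an interval and sample size chosen here, not part of the original tests), the log-scale estimator should agree with the plain importance-sampling estimator, up to Monte Carlo noise, whenever the probability does not underflow:

set.seed(1)
A_check <- c(-1, 1)
p_log <- exp(est_log_normprob_imps(A_check, 100000))
p_lin <- est_normprob_imps(A_check, 100000)
## both should be close to pnorm(1) - pnorm(-1), about 0.683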

1.1 Test 1

A <- c(-2,2)
tp <- pnorm (A[2]) - pnorm (A[1])

probs_mc <- replicate(1000,est_normprob_mc(A,100))
probs_imps <- replicate(1000,est_normprob_imps(A,100))
par(mfrow = c(1,2))
ylim <- range (probs_mc, probs_imps)
plot(probs_mc, ylim = ylim); abline (h=tp)
plot(probs_imps, ylim = ylim); abline (h=tp)

mean((probs_mc-tp)^2)
## [1] 0.0004084102
mean((probs_imps-tp)^2)
## [1] 0.002170898

1.2 Test 2

A <- c(5,6)
tp <- pnorm (A[2]) - pnorm (A[1])

probs_mc <- replicate(1000,est_normprob_mc(A,100))
probs_imps <- replicate(1000,est_normprob_imps(A,100))
par(mfrow = c(1,2))
ylim <- range (probs_mc, probs_imps)
plot(probs_mc, ylim = ylim); abline (h=tp)
plot(probs_imps, ylim = ylim); abline (h=tp)

mean((probs_mc-tp)^2)
## [1] 8.160448e-14
mean((probs_imps-tp)^2)
## [1] 1.287765e-15

1.3 Test 3

A <- c(-2,2)
tp <- pnorm (A[2]) - pnorm (A[1])

probs_mc <- replicate(1000,est_normprob_mc(A,100))
probs_imps <- replicate(1000,est_normprob_imps(A,100))
par(mfrow = c(1,2))
ylim <- range (probs_mc, probs_imps)
plot(probs_mc, ylim = ylim); abline (h=tp)
plot(probs_imps, ylim = ylim); abline (h=tp)

mean((probs_mc-tp)^2)
## [1] 0.0004322001
mean((probs_imps-tp)^2)
## [1] 0.002086488

1.4 Test 4: An example of computing a probability that underflows

A <- c(50,100)
#A naive method that cannot compute such a small probability
tp1 <- pnorm (A[2]) - pnorm (A[1]); log(tp1)
## [1] -Inf
#The reason is that pnorm(A[1]) is rounded to 1, so the difference is computed as 0; equivalently, pnorm(-A[1]) underflows
pnorm (A[1]) 
## [1] 1
log(pnorm(A[1]))
## [1] 0
#Instead, we can compute the log probability this way, working entirely on the log scale
log_minus_exp <- function (la, lb) la + log (1-exp(lb-la))
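## quick sanity check (illustrative values, not from the original run):
## on the log scale, log(5 - 2) should equal log(3)
stopifnot(isTRUE(all.equal(log_minus_exp(log(5), log(2)), log(3))))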
la <- pnorm(-A[1], log=TRUE); la
## [1] -1254.831
lb <- pnorm(-A[2], log=TRUE); lb
## [1] -5005.524
log_tp2 <- log_minus_exp(la, lb); log_tp2
## [1] -1254.831
#We still use the same importance sampling procedure, but take care of the underflow of dnorm for values in (A[1],A[2]) by working on the log scale
log_probs_imps <- replicate(100,est_log_normprob_imps(A,1000)); log_probs_imps
##   [1] -1256.219 -1253.717 -1256.840 -1254.937 -1256.541 -1253.724 -1255.119
##   [8] -1256.416 -1256.960 -1254.609 -1257.707 -1254.813 -1259.334 -1253.898
##  [15] -1256.832 -1254.067 -1253.740 -1255.728 -1260.397 -1254.268 -1254.212
##  [22] -1255.655 -1255.716 -1254.137 -1253.946 -1255.643 -1254.760 -1259.863
##  [29] -1256.737 -1260.544 -1254.382 -1255.791 -1254.248 -1255.785 -1255.147
##  [36] -1255.416 -1256.365 -1259.139 -1255.745 -1255.261 -1255.384 -1257.428
##  [43] -1257.536 -1257.853 -1263.320 -1263.491 -1256.170 -1253.923 -1258.896
##  [50] -1254.380 -1256.178 -1254.778 -1258.196 -1254.844 -1253.831 -1254.172
##  [57] -1256.840 -1261.347 -1258.454 -1256.153 -1257.195 -1256.215 -1254.023
##  [64] -1253.650 -1258.758 -1254.485 -1253.927 -1254.819 -1253.525 -1253.864
##  [71] -1259.330 -1254.258 -1256.158 -1260.621 -1254.152 -1254.813 -1255.764
##  [78] -1255.049 -1256.081 -1254.544 -1254.217 -1254.992 -1254.146 -1255.051
##  [85] -1255.003 -1257.130 -1253.178 -1255.417 -1255.867 -1254.125 -1253.724
##  [92] -1255.978 -1256.471 -1253.932 -1255.597 -1254.092 -1261.953 -1256.690
##  [99] -1258.961 -1256.369
plot (log_probs_imps); abline (h = log_tp2)
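
The replicates can also be pooled into one combined estimate while staying on the log scale; a minimal sketch reusing log_sum_exp (the name log_pooled_imps is introduced here for illustration):

log_pooled_imps <- log_sum_exp(log_probs_imps) - log(length(log_probs_imps))
## this averages the probabilities (not their logs) and should be close to log_tp2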

2 Estimate E(a(X)) of Truncated Normal Using Importance Sampling

2.1 Estimating Function

## compute E(a) with importance sampling
est_tnorm_imps <- function(a, A, iters_mc)
{
    X <- runif(iters_mc,A[1],A[2])
    W <- dnorm (X)
    ahat <- sum (a(X) * W) / sum (W)
    attr(ahat, "effective sample size") <- 1/sum((W/sum(W))^2)
    ahat
}
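
## A minimal usage sketch (function and interval chosen here for illustration):
## the attribute stores Kish's effective sample size,
## n_eff = (sum W)^2 / sum(W^2), which shrinks when a few weights dominate
ahat_check <- est_tnorm_imps(function(x) x^2, c(1, 2), 10000)
ess_check <- attr(ahat_check, "effective sample size")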


## a generic function for approximating 1-D integral with midpoint rule
## the logarithms of the function values are passed in
## the log of the integral result is returned
## log_f  --- a function computing the logarithm of the integrand function
## range  --- the range of the integration variable, a vector of two elements
## n      --- the number of points at which the integrand is evaluated
## ...    --- other parameters needed by log_f
log_int_mid <- function(log_f, range, n,...)
{   if(range[1] >= range[2]) 
    stop("Wrong ranges")
    h <- (range[2]-range[1]) / n
    v_log_f <- sapply(range[1] + (1:n - 0.5) * h, log_f,...)
    log_sum_exp(v_log_f) + log(h)       
}
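
## quick check of the midpoint rule (interval and n assumed here): the log of
## the integral of the standard normal density over (-2, 2) should be close to
## log(pnorm(2) - pnorm(-2)), about -0.0466
log_check_mid <- log_int_mid(function(x) dnorm(x, log = TRUE), c(-2, 2), 1000)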

## compute E(a) with midpoint rule
est_tnorm_mid <- function (a, A, iters_mc)
{
    log_f <- function (x) dnorm (x, log = T) + log (a(x))
    exp(log_int_mid (log_f, A, iters_mc) ) / (pnorm (A[2]) - pnorm (A[1]))
}

library (ars) 
## Warning: package 'ars' was built under R version 3.4.4
## direct rejection sampling for the truncated normal: draw from N(0,1) and keep only draws inside [lb, ub]
sample_tnorm_drs <- function (n, lb = -Inf, ub = Inf)
{
    x <- rep (0, n)
    for (i in 1:n)
    {
        rej <- TRUE
        while (rej)
        {
            x[i] <- rnorm (1)
            if (x[i] >= lb & x[i] <= ub) rej <- FALSE
        }
    }
    x
}

## sample from truncated normal using ars package
sample_tnorm_ars <- function (n, lb, ub)
{
    logf <- function (x) dnorm (x, log = TRUE) ## define log density
    fprima <- function (x) -x ## define derivative of log density
    
    ars (n, f = logf, fprima = fprima, 
         x = c(lb, (lb + ub )/2, ub), # starting points
         lb = TRUE, ub = TRUE, xlb = lb, xub = ub) # boundary of log density
}
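
As a quick hedged check of the two samplers (interval, sample size, and seed chosen here for illustration), both should produce draws confined to the interval and, by symmetry, similar sample means:

set.seed(1)
x_drs <- sample_tnorm_drs(2000, -1, 1)
x_ars <- sample_tnorm_ars(2000, -1, 1)
stopifnot(all(x_drs >= -1 & x_drs <= 1), all(x_ars >= -1 & x_ars <= 1))
stopifnot(abs(mean(x_drs) - mean(x_ars)) < 0.1)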

2.2 Test 1

## define the function a
a <- function (x) x^2
interval <- c(1,2)
A <- est_tnorm_mid (a, interval, 100000) ## midpoint rule

## estimate E(a) with rejection sampling
system.time(
    {
        rn_tnorm_ars <- sample_tnorm_ars (1000, interval[1],interval[2]) # draw samples from tnorm
        mean (a (rn_tnorm_ars)) 
    }
)
##    user  system elapsed 
##   0.088   0.003   0.091
system.time(
est_tnorm_imps (a, interval, 1000000) ## importance sampling
)
##    user  system elapsed 
##   0.116   0.010   0.128
## simulation comparison of importance sampling and rejection sampling
times.imps <- system.time(
EA_imps <- replicate (100, est_tnorm_imps (a, interval, 1000000)) 
)

times.imps
##    user  system elapsed 
##   6.431   0.242   7.089
times.rej <- system.time (
EA_rej <- replicate (100,
    {   rn_tnorm_ars <- sample_tnorm_ars (1000, interval[1],interval[2])
        mean (a (rn_tnorm_ars))
    }
    )
)
times.rej
##    user  system elapsed 
##   6.987   0.894   8.032
par (mfrow = c(1,2))
ylim <- range (EA_imps, EA_rej)
plot (EA_imps, ylim = ylim, main = "Importance Sampling") 
abline (h = A)
plot (EA_rej, ylim = ylim, main = "Adaptive Rejection Sampling") 
abline (h = A)

mean ((EA_rej-A)^2)
## [1] 6.529003e-05
mean ((EA_imps-A)^2)
## [1] 5.649387e-07

2.3 Test 2

## define the function a
a <- function (x) x^2
interval <- c(-1,1)
A <- est_tnorm_mid (a, interval, 100000) ## midpoint rule

## estimate E(a) with rejection sampling
system.time(
    {
        rn_tnorm_ars <- sample_tnorm_ars (1000, interval[1],interval[2]) # draw samples from tnorm
        mean (a (rn_tnorm_ars)) 
    }
)
##    user  system elapsed 
##   0.071   0.012   0.083
system.time(
est_tnorm_imps (a, interval, 1000000) ## importance sampling
)
##    user  system elapsed 
##   0.081   0.004   0.091
## simulation comparison of importance sampling and rejection sampling
times.imps <- system.time(
EA_imps <- replicate (100, est_tnorm_imps (a, interval, 1000000)) 
)

times.imps
##    user  system elapsed 
##   6.256   0.225   6.598
times.rej <- system.time (
EA_rej <- replicate (100,
    {   rn_tnorm_ars <- sample_tnorm_ars (1000, interval[1],interval[2])
        mean (a (rn_tnorm_ars))
    }
    )
)
times.rej
##    user  system elapsed 
##   7.345   0.656   8.328
par (mfrow = c(1,2))
ylim <- range (EA_imps, EA_rej)
plot (EA_imps, ylim = ylim, main = "Importance Sampling") 
abline (h = A)
plot (EA_rej, ylim = ylim, main = "Adaptive Rejection Sampling") 
abline (h = A)

mean ((EA_rej-A)^2)
## [1] 7.350695e-06
mean ((EA_imps-A)^2)
## [1] 7.0304e-08

3 Computing Log Marginalized Likelihood for Normal Models with Importance Sampling

3.1 Functions

## a function computing the sum of numbers represented with logarithm
## lx     --- a vector of numbers, which are the log of another vector x.
## the log of sum of x is returned
log_sum_exp <- function(lx)
{   mlx <- max(lx)
    mlx + log(sum(exp(lx-mlx)))
}
## computing the log probability density function of multivariate normal
## x     --- a vector, the p.d.f at x will be computed
## mu   --- the mean vector of multivariate normal distribution
## A     --- the inverse covariance matrix of multivariate normal distribution
log_pdf_mnormal <- function(x, mu, A)
{   0.5 * ( -length(mu)*log(2*pi) + sum(log(svd(A)$d)) - t(x-mu) %*% A %*% (x-mu) ) 

}
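## quick sanity check (values assumed here): in one dimension, with A equal to
## the inverse variance, the density should match dnorm; e.g. with variance 4
stopifnot(isTRUE(all.equal(
    as.numeric(log_pdf_mnormal(0.3, 0, matrix(1/4))),
    dnorm(0.3, 0, 2, log = TRUE))))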
## the function for computing log likelihood of normal data
log_lik <- function(x,mu,w)
{   sum(dnorm(x,mu,exp(w),log=TRUE))
}

## the function for computing log prior
log_prior <- function(mu,w, mu_0,sigma_mu,w_0,sigma_w)
{   dnorm(mu,mu_0,sigma_mu,log=TRUE) + dnorm(w,w_0,sigma_w,log=TRUE)
}

## the function for computing the negative log of likelihood * prior 
neg_log_post <- function(x, theta, mu_0,sigma_mu,w_0,sigma_w)
{   - log_lik(x,theta[1], theta[2]) - 
    log_prior(theta[1],theta[2],mu_0,sigma_mu,w_0,sigma_w)
}


## computing the log marginal likelihood using importance sampling with 
## the posterior distribution approximated by the Gaussian distribution at
## its mode
log_mar_gaussian_imps <- function(x,mu_0,sigma_mu,w_0,sigma_w,iters_mc)
{   result_min <- nlm(f=neg_log_post,p=c(mean(x),log(sqrt(var(x)))), 
                      hessian=TRUE,
                      x=x,mu_0=mu_0,sigma_mu=sigma_mu,w_0=w_0,sigma_w=sigma_w)
    hessian <- result_min$hessian
    mu <- result_min$estimate
    
    ## Cholesky factor of the covariance (inverse Hessian), the multiplier for sampling from the multivariate normal
    Sigma <- t( chol(solve(hessian)) )
    ## draw samples from N(mu, Sigma %*% Sigma')
    thetas <- Sigma %*% matrix(rnorm(2*iters_mc),2,iters_mc) + mu
     
    ## values of log approximate p.d.f. at samples
    log_pdf_mnormal_thetas <- apply(thetas,2,log_pdf_mnormal,mu=mu,A=hessian)
    ## values of log true p.d.f. at samples
    log_post_thetas <- - apply(thetas,2,neg_log_post,x=x, mu_0=mu_0,
                               sigma_mu=sigma_mu,w_0=w_0,sigma_w=sigma_w)
  
    ## averaging the importance weights and returning the log of the average
    log_sum_exp(log_post_thetas-log_pdf_mnormal_thetas) - log(iters_mc)
}
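
## Note on the estimator above: the proposal q is the Gaussian approximation
## N(mode, hessian^{-1}), and the last line evaluates, on the log scale,
##     p(x) ~= (1/n) * sum_i p(x | theta_i) p(theta_i) / q(theta_i),
## with log_sum_exp guarding against underflow of the individual weights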


## we use the naive Monte Carlo method (sampling from the priors) to check the above function
log_mar_gaussian_mc <- function(x,mu_0,sigma_mu,w_0,sigma_w,iters_mc)
{
    ## draw samples from the priors
    mus <- rnorm(iters_mc,mu_0,sigma_mu)
    ws <- rnorm(iters_mc,w_0,sigma_w)
    one_log_lik <- function(i)
    {  log_lik(x,mus[i],ws[i])
    }
    v_log_lik <- sapply(1:iters_mc,one_log_lik)
    log_sum_exp(v_log_lik) - log(iters_mc)
}
## a generic function for finding the Laplace approximation to the integral of 'f'
## neg_log_f    --- the negative log of the integrand function
## p0           --- initial value in searching mode
## ...          --- other arguments needed by neg_log_f
bayes_inference_lap <- function(neg_log_f,p0,...)
{   ## looking for the mode of f and the Hessian of neg_log_f at the mode
    result_min <- nlm(f=neg_log_f,p=p0, hessian=TRUE,...)
    hessian <- result_min$hessian
    neg_log_like_mode <- result_min$minimum
    
    
    estimates <- result_min$estimate ## posterior mode
    SIGMA <- solve(result_min$hessian) ## approximate posterior covariance (inverse Hessian at the mode)
    sds <- sqrt (diag(SIGMA)) ## standard errors of each estimate
    log_mar_lik <- ## log marginalized likelihood
        - neg_log_like_mode + 0.5 * ( sum(log(2*pi) - log(svd(hessian)$d) ))
    
    list (estimates = estimates, sds = sds, SIGMA = SIGMA, log_mar_lik = log_mar_lik)
}
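
## A quick hedged check of the Laplace formula (example integrand chosen here,
## not from the original notes): for the pure Gaussian integrand
## exp(-0.5 * sum(theta^2)) in two dimensions the approximation is exact,
## so log_mar_lik should be very close to log(2 * pi)
lap_check <- bayes_inference_lap(function(theta) 0.5 * sum(theta^2), p0 = c(1, 1))
stopifnot(abs(lap_check$log_mar_lik - log(2 * pi)) < 1e-3)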


## approximating the log of integral of likelihood * prior
bayes_inference_lap_gaussian <- function(x,mu_0,sigma_mu,w_0,sigma_w)
{   bayes_inference_lap(
                neg_log_post,p0=c(mean(x),log(sqrt(var(x)))),
                x=x,mu_0=mu_0,sigma_mu=sigma_mu,w_0=w_0,sigma_w=sigma_w
    )
}

3.2 Testing

## debugging the program
x <- rnorm(50)
log_mar_gaussian_imps(x,0,1,0,5,100)
## [1] -79.60525
log_mar_gaussian_mc(x,0,1,0,5,10000)
## [1] -79.75541
bayes_inference_lap_gaussian(x,0,1,0,5)
## $estimates
## [1] -0.05266754  0.05650374
## 
## $sds
## [1] 0.14799462 0.09998787
## 
## $SIGMA
##              [,1]         [,2]
## [1,] 0.0219024079 0.0000240405
## [2,] 0.0000240405 0.0099975749
## 
## $log_mar_lik
## [1] -79.59742
x <- rnorm(10) # another debug
log_mar_gaussian_imps(x,0,1,0,5,100)
## [1] -17.45898
log_mar_gaussian_mc(x,0,1,0,5,10000)
## [1] -17.38112
bayes_inference_lap_gaussian(x,0,1,0,5)
## $estimates
## [1]  0.04303722 -0.11027164
## 
## $sds
## [1] 0.2724970 0.2234578
## 
## $SIGMA
##               [,1]          [,2]
## [1,]  0.0742545976 -0.0003145331
## [2,] -0.0003145331  0.0499333888
## 
## $log_mar_lik
## [1] -17.49375
## comparing importance sampling (with the Gaussian approximation as proposal) against naive Monte Carlo
x <- rnorm(200)
bayes_inference_lap_gaussian(x,0,1,0,5)
## $estimates
## [1]  0.04912102 -0.02271822
## 
## $sds
## [1] 0.06895783 0.05000259
## 
## $SIGMA
##               [,1]          [,2]
## [1,]  4.755182e-03 -9.215509e-07
## [2,] -9.215509e-07  2.500259e-03
## 
## $log_mar_lik
## [1] -286.5243
v_log_mar_imps <- replicate(1000, log_mar_gaussian_imps(x,0,1,0,5,100))
v_log_mar_mc <- replicate(1000, log_mar_gaussian_mc(x,0,1,0,5,100))


par(mfcol=c(2,3))
xlim <- c(min(c(v_log_mar_imps,v_log_mar_mc)),max(c(v_log_mar_imps,v_log_mar_mc)))

plot(v_log_mar_imps, main="Importance Sampling")
hist(v_log_mar_imps,main="Importance Sampling")

plot(v_log_mar_imps, ylim = xlim, main="Importance Sampling")
hist(v_log_mar_imps,main="Importance Sampling",xlim=xlim)

plot(v_log_mar_mc,main="Naive Monte Carlo",ylim=xlim)
hist(v_log_mar_mc,main="Naive Monte Carlo",xlim=xlim)

## comparing variance of importance sampling and naive sampling
var (v_log_mar_imps)
## [1] 0.0001848107
var (v_log_mar_mc)
## [1] 184.0005

Comparing the importance sampling estimate (averaged over replicates) with the Laplace approximation:

mean (v_log_mar_imps)
## [1] -286.5195
bayes_inference_lap_gaussian(x,0,1,0,5)
## $estimates
## [1]  0.04912102 -0.02271822
## 
## $sds
## [1] 0.06895783 0.05000259
## 
## $SIGMA
##               [,1]          [,2]
## [1,]  4.755182e-03 -9.215509e-07
## [2,] -9.215509e-07  2.500259e-03
## 
## $log_mar_lik
## [1] -286.5243

We see that the Laplace approximation is very accurate for this problem, but it may not be as good for other problems in which the posterior cannot be well approximated by a Gaussian.