1 Finding MLE for Logistic Regression with Multivariate NR Method
2 Fit a non-linear model to sunspots data: using “nlm”
3 Fit a non-linear model to sunspots data: using “nls”
4 Other Optimization Methods

1 Finding MLE for Logistic Regression with Multivariate NR Method

# function to compute logistic regression -log likelihood, -score and
# information. b=parameters, r=binary response, z=covariate
lik_score_info <- function(b,r,z) {
        u <- b[1]+b[2]*z
        u2 <- exp(u)
        l <- -sum(u*r-log(1+u2))
        p <- u2/(1+u2)
        s <- -c(sum(r-p),sum(z*(r-p)))
        v <- matrix(c(sum(p*(1-p)),sum(z*p*(1-p)),0,sum(z*z*p*(1-p))),2,2)
        v[1,2] <- v[2,1]
        list(neg.loglik=l,neg.score=s,inf=v)
}

neg_logp_logistic <- function (b, z, r)
{
    u <- b[1]+b[2]*z
    u2 <- exp(u)
    -sum(u*r-log(1+u2))
}

####### a function for finding MLE with newton method
mle_logistic_nr <- function(b0, no_iter, r, z, debug=FALSE)
{
    result_nr <- matrix(0,no_iter+1, 3)
    colnames(result_nr) <- c('beta0','beta1','neg_loglike')
    result_nr[1,] <- c(b0, 0)

    for( i in 1:no_iter + 1)
    {
         q <- lik_score_info(result_nr[i-1,1:2],r,z)
         result_nr[i,1:2] <-   result_nr[i-1,1:2] - solve(q$inf,q$neg.score)
         result_nr[i-1,3] <- q$neg.loglik
         if(debug) print(result_nr[i-1,])
    }
    result_nr[-(no_iter+1),]
}


## generate a data set

gen_logistic_data <- function(b,n)
{
    z <- sort(runif(n, -2,2))
    emu <- exp(b[1]+z*b[2])
    p <- emu/(1+emu)
    r <- (runif(n)<p)*1
    plot (z, p, type = "l",ylim=c(0,1))
    points (z, r, col = r+1)
    list(z=z,r=r)
}

data <- gen_logistic_data(c(0,1.5),200)

# using self-programmed newton method
mle_logistic_nr(c(0,3),15,data$r,data$z)

##            beta0     beta1 neg_loglike
##  [1,]  0.0000000  3.000000   119.05563
##  [2,]  0.3124563 -0.792032   208.32250
##  [3,] -0.1505522  1.711167   101.59214
##  [4,]  0.2325889  1.333272    98.94906
##  [5,]  0.1870832  1.415984    98.80625
##  [6,]  0.1856721  1.420173    98.80598
##  [7,]  0.1856700  1.420182    98.80598
##  [8,]  0.1856700  1.420182    98.80598
##  [9,]  0.1856700  1.420182    98.80598
## [10,]  0.1856700  1.420182    98.80598
## [11,]  0.1856700  1.420182    98.80598
## [12,]  0.1856700  1.420182    98.80598
## [13,]  0.1856700  1.420182    98.80598
## [14,]  0.1856700  1.420182    98.80598
## [15,]  0.1856700  1.420182    98.80598

mle_logistic_nr(c(0,5),15,data$r,data$z)

##             beta0     beta1 neg_loglike
##  [1,]   0.0000000    5.0000    165.0582
##  [2,]   0.4211062  -16.1995   2577.7598
##  [3,] -58.2626433 2892.3818         Inf
##  [4,]         NaN       NaN         NaN
##  [5,]         NaN       NaN         NaN
##  [6,]         NaN       NaN         NaN
##  [7,]         NaN       NaN         NaN
##  [8,]         NaN       NaN         NaN
##  [9,]         NaN       NaN         NaN
## [10,]         NaN       NaN         NaN
## [11,]         NaN       NaN         NaN
## [12,]         NaN       NaN         NaN
## [13,]         NaN       NaN         NaN
## [14,]         NaN       NaN         NaN
## [15,]         NaN       NaN         NaN

## look at contour of bivariate log likelihood

B0 <- seq (-2, 2, by = 0.1)
B1 <- seq (-10, 10, by = 0.1)
loglike_values <- matrix (0, length (B0), length (B1))
for (i in 1:length (B0)) {
    for (j in 1:length (B1))
    {
        loglike_values[i,j] <- 
            neg_logp_logistic (c(B0[i], B1[j]),z = data$z, r = data$r )
    }
}
contour (B0, B1, loglike_values, nlevels = 100)
points(mle_logistic_nr(c(0,3),15,data$r,data$z), col = 1, type = "b") 
points(mle_logistic_nr(c(0,5),15,data$r,data$z), col = 2, type = "b")

###### Find MLE using nlm function
nlm (neg_logp_logistic, c(0,0), z = data$z, r = data$r, hessian = T) -> logit_nlm
nlm (neg_logp_logistic, c(0,5), z = data$z, r = data$r, hessian = T)

## $minimum
## [1] 98.80598
## 
## $estimate
## [1] 0.1856695 1.4201817
## 
## $gradient
## [1] -3.836931e-07  4.702991e-06
## 
## $hessian
##           [,1]      [,2]
## [1,] 32.414326 -2.154326
## [2,] -2.154326 24.980483
## 
## $code
## [1] 1
## 
## $iterations
## [1] 8

nlm (neg_logp_logistic, c(-10, -10), z = data$z, r = data$r, hessian = T)

## Warning in nlm(neg_logp_logistic, c(-10, -10), z = data$z, r = data$r,
## hessian = T): NA/Inf replaced by maximum positive value

## $minimum
## [1] 98.80598
## 
## $estimate
## [1] 0.1856695 1.4201813
## 
## $gradient
## [1]  3.524292e-06 -5.453470e-06
## 
## $hessian
##           [,1]      [,2]
## [1,] 32.414330 -2.154329
## [2,] -2.154329 24.980492
## 
## $code
## [1] 1
## 
## $iterations
## [1] 16

# find MLE standard errors using hessian of negative log likelihood
sds <- sqrt (diag (solve(logit_nlm$hessian))); sds

## [1] 0.1761488 0.2006540

###### find MLE using glm function

logitfit_glm <- glm (r ~ z, family = binomial(), data = data)
summary (logitfit_glm)

## 
## Call:
## glm(formula = r ~ z, family = binomial(), data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2888  -0.7409   0.3421   0.7389   2.2902  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.1857     0.1761   1.054    0.292    
## z             1.4202     0.2006   7.078 1.46e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 276.28  on 199  degrees of freedom
## Residual deviance: 197.61  on 198  degrees of freedom
## AIC: 201.61
## 
## Number of Fisher Scoring iterations: 4

logitfit_glm <- glm (r ~ z, family = binomial(), data = data, start = c(0,0))
summary (logitfit_glm)

## 
## Call:
## glm(formula = r ~ z, family = binomial(), data = data, start = c(0, 
##     0))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2888  -0.7409   0.3421   0.7389   2.2902  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.1857     0.1761   1.054    0.292    
## z             1.4202     0.2006   7.078 1.46e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 276.28  on 199  degrees of freedom
## Residual deviance: 197.61  on 198  degrees of freedom
## AIC: 201.61
## 
## Number of Fisher Scoring iterations: 5

logitfit_glm <- glm (r ~ z, family = binomial(), data = data, start = c(0,2))
summary (logitfit_glm)

## 
## Call:
## glm(formula = r ~ z, family = binomial(), data = data, start = c(0, 
##     2))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2888  -0.7409   0.3421   0.7389   2.2902  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.1857     0.1761   1.054    0.292    
## z             1.4202     0.2006   7.078 1.46e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 276.28  on 199  degrees of freedom
## Residual deviance: 197.61  on 198  degrees of freedom
## AIC: 201.61
## 
## Number of Fisher Scoring iterations: 5

logitfit_glm <- glm (r ~ z, family = binomial(), data = data, start = c(0,3))
summary (logitfit_glm)

## 
## Call:
## glm(formula = r ~ z, family = binomial(), data = data, start = c(0, 
##     3))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2888  -0.7409   0.3421   0.7389   2.2902  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.1857     0.1761   1.054    0.292    
## z             1.4202     0.2006   7.078 1.46e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 276.28  on 199  degrees of freedom
## Residual deviance: 197.61  on 198  degrees of freedom
## AIC: 201.61
## 
## Number of Fisher Scoring iterations: 6

2 Fit a non-linear model to sunspots data: using “nlm”

#
# The model is for the data on sunspot counts supplied with R.  We
# hypothesize that the counts (actually integers, but modelled as
# non-negative continuous values) come from taking the absolute value
# of a normal variate whose mean varies with time according to a sine 
# wave.  The model parameters are a, b, f, and M, which define the
# sine wave, as M*(a+sin(b+f*t), and the log of the standard deviation
# of the observation, lsigma.
#
# We use "nlm" to find the maximum likelihood estimates

logl <- function (p, x, t)
{
    a <- p[1]
    b <- p[2]
    f <- p[3]
    M <- p[4]
    lsigma <- p[5]
    - sum(log(dnorm ( x, M*(a+sin(b+f*t)), exp (lsigma)) + 
              dnorm (-x, M*(a+sin(b+f*t)), exp (lsigma))
      )) 
}


## get the data.

x <- sunspots [1:500]
t <- 1:length(x)

# Estimation, from a carefully chosen starting point.

est <- nlm (logl, 
            p = c(a= 0, b = -1, f = 2*pi/200, M=80,lsigma = 4), 
            x = x, t = t, hessian = T)
print(est)

## $minimum
## [1] 2334.028
## 
## $estimate
## [1]  0.01113550  0.87341653  0.02759776 90.46730054  3.43956442
## 
## $gradient
## [1]  1.991998e-04  8.534471e-05  6.612604e-03  1.952099e-07 -2.453170e-04
## 
## $hessian
##               [,1]          [,2]          [,3]        [,4]          [,5]
## [1,]  3.817365e+03 -9.407540e+00  9.867027e+04   4.4115695  3.091087e-02
## [2,] -9.407540e+00  1.614385e+03  4.831372e+05   0.5154039 -5.408146e+01
## [3,]  9.867027e+04  4.831372e+05  1.714552e+08 335.5790898 -3.208757e+04
## [4,]  4.411569e+00  5.154039e-01  3.355791e+02   0.2701867  9.972335e-01
## [5,]  3.091087e-02 -5.408146e+01 -3.208757e+04   0.9972335  8.184459e+02
## 
## $code
## [1] 3
## 
## $iterations
## [1] 85

mle <- est$estimate
sds <- sqrt(diag(solve(est$hessian)))

# Plot the mean from the model with the MLE parameter estimates together
# with the data points.

a <- mle[1]
b <- mle[2]
f <- mle[3]
M <- mle[4]

plot(t,x,pch=20)
lines(t,abs(M*(a+sin(b+f*t))),col="red")

3 Fit a non-linear model to sunspots data: using “nls”

x <- sunspots [1:500]
t <- 1:length(x)
ss_data <- data.frame (sunspots = x, t = t)

nlsfit_ss <- summary (
                nls (sunspots ~ abs(M*(a+sin(b+f*t))), 
                     data = ss_data,
                     start = c(a= 0, b = -1, f = 2*pi/200, M = 80))
                )
## note that this model is different from the model given in sunspots_nlm.r

names (nlsfit_ss)

##  [1] "formula"      "residuals"    "sigma"        "df"          
##  [5] "cov.unscaled" "call"         "convInfo"     "control"     
##  [9] "na.action"    "coefficients" "parameters"

nlsfit_coef <- nlsfit_ss$coef

a2 <- nlsfit_coef[1,1]
b2 <- nlsfit_coef[2,1]
f2 <- nlsfit_coef[3,1]
M2 <- nlsfit_coef[4,1]

plot(t,x,pch=20)
lines(t,abs(M2*(a2+sin(b2+f2*t))),col="red")

4 Other Optimization Methods

4.1 An overview

https://scipy-lectures.org/advanced/mathematical_optimization/index.html#knowing-your-problem

4.2 Nelder-Mead Method (A Gradient-Free Method)

https://codesachin.wordpress.com/2016/01/16/nelder-mead-optimization/

4.3 Gradient Descent Methods (Hessian-Free Methods)

https://www.benfrederickson.com/numerical-optimization/

STAT 812: Computational Statistics

Multivariate Optimization Methods for Finding MLE

Longhai Li

Feb 2020

1 Finding MLE for Logistic Regression with Multivariate NR Method

2 Fit a non-linear model to sunspots data: using “nlm”

3 Fit a non-linear model to sunspots data: using “nls”

4 Other Optimization Methods

4.1 An overview

4.2 Nelder-Mead Method (A Gradient-Free Method)

4.3 Gradient Descent Methods (Hessian-Free Methods)