Let \(T_i\) be a possibly right-censored observation, \(d_i\) be the indicator for being uncensored, and \(S_i(\cdot)\) be the survival function of original uncensored \(T^*_i\) given covariate \(x_i\) and parameters. The randomized survival probability (RSP) for \(T_{i}\) is defined as: \[\begin{equation} S_i^{R}(T_i, d_{i}, U_{i}) = \left\{ \begin{array}{rl} S_{i}(T_{i}), & \text{if $T_i$ is uncensored, i.e., $d_{i}=1$,}\\ U_{i}\,S_{i}(T_{i}), & \text{if $T_i$ is censored, i.e., $d_i=0$.} \end{array} \right. \label{rsp} \end{equation}\] where \(U_i\) is a uniform random number on \((0, 1)\). In Wu, Feng, and Li (2019+), we show that RSPs are uniformly distributed on \((0,1)\) under the true model for \(T^*_i\). Therefore, they can be transformed into residuals with any desired distribution. NRSP residuals are transformed RSPs with the standard normal quantile: \[\begin{equation} r_{i}^{\mbox{nrsp}}(T_i, d_{i}, U_i)=\Phi^{-1} (S_{i}^R(T_i, d_{i}, U_i)) \end{equation}\]
NRSP residuals are approximately standard normally distributed under the true model with estimated parameters. The approximate normality holds for any censoring distribution. NRSP residuals can be used to check the goodness-of-fit of \(S_i(\cdot)\) quantitatively by applying SW normality test to NRSP residuals, and graphically by visualizing their QQplot. Scatterplots of NRSP residuals may reveal the model departure nature.
survreg
and coxph
Objects#input: survreg_fit is a survreg object
resid_survreg<-function(survreg_fit)
{
distr<-survreg_fit$dist
y<- survreg_fit$y
m <- nrow (y)
parms<- as.numeric(survreg_fit[["parms"]])
alpha_hat<-1/survreg_fit$scale
if (distr %in% c("weibull","exponential","logistic","lognormal",
"loglogistic","gaussian", "loggaussian","rayleigh"))
{
SP<-1-(psurvreg(as.data.frame(as.matrix(y))[,-2],
survreg_fit[["linear.predictors"]],
scale=1/alpha_hat, distribution=distr))
haz <- dsurvreg(as.data.frame(as.matrix(y))[,-2],
survreg_fit[["linear.predictors"]],
scale=1/alpha_hat, distribution=distr)/SP
}else if (distr %in% "t")
{
SP<-1-(psurvreg(as.data.frame(as.matrix(y))[,-2],
survreg_fit[["linear.predictors"]],
scale=1/alpha_hat, distribution=distr,
parms = parms))
haz <- dsurvreg(as.data.frame(as.matrix(y))[,-2],
survreg_fit[["linear.predictors"]],
scale=1/alpha_hat, distribution=distr,
parms = parms)/SP
}else stop ("The distribution is not supported")
censored <- which(as.data.frame(as.matrix(y))[,-1]==0)
n.censored <- length(censored)
# NRSP
RSP <- SP
RSP[censored] <- RSP[censored]*runif(n.censored)
nrsp <- qnorm(RSP)
# Unmodified CS residual
ucs<- -log(SP)
# Modified CS residual
MSP<- SP
MSP[censored] <- SP[censored]/exp(1)
mcs <- -log(MSP)
nmsp <- qnorm (MSP)
#Martingale Residual
martg<- as.data.frame(as.matrix(survreg_fit$y))[,-1]- ucs
#Deviance Residual
dev<- sign(martg)* sqrt((-2)*(martg+as.data.frame(as.matrix(survreg_fit$y))[,-1]*
log(as.data.frame(as.matrix(survreg_fit$y))[,-1]-martg)))
nrsp.sw.pvalue<-shapiro.test(nrsp)$p.value
#hazard function
haz_fn<- haz
list(RSP=RSP,nrsp=nrsp,nrsp.sw.pvalue=nrsp.sw.pvalue,
ucs=ucs, mcs=mcs,martg=martg, dev=dev, nmsp=nmsp,haz_fn=haz_fn)
}
# outputs:
## RSP --- Randomized Survival Probabilities
## nrsp --- Normalized RSP
## nrsp.sw.pvalue --- GOF test p-values by applying SW test to NRSP
## ucs --- unmodified CS residuals
## mcs --- modified CS residuals
## nmsp --- normalized modified SP
## martg --- Martingale residuals
## dev --- Deviance residuals
## haz_fn --- hazard function of cs residuals
#input: coxfit_fit is a coxph object
resid_coxph<-function(coxfit_fit)
{
y<- coxfit_fit$y
m <- nrow (y)
mre <- resid(coxfit_fit, type="martingale")
dre <- resid(coxfit_fit, type="deviance")
#Unmodified Cox-Snell residual
ucs <- as.data.frame(as.matrix(y))[,-1] - mre
#Survival Function
SP<- exp(-ucs)
censored <- which(as.data.frame(as.matrix(y))[,-1]==0)
n.censored <- length(censored)
#NMSP residual (normal-transformed modified survival prob)
MSP<- SP
MSP[censored] <- SP[censored]/exp(1)
mcs <- -log(MSP)
nmsp<- qnorm(MSP)
#NRSP residual
RSP <- SP
RSP[censored] <- RSP[censored]*runif(n.censored)
nrsp <- qnorm(RSP)
nrsp.sw.pvalue<-shapiro.test(nrsp)$p.value
list(nrsp=nrsp,RSP=RSP,nrsp.sw.pvalue=nrsp.sw.pvalue,
ucs=ucs,mcs=mcs,nmsp=nmsp)
}
# outputs:
## RSP --- Randomized Survival Probabilities
## nrsp --- Normalized RSP
## nrsp.sw.pvalue --- GOF test p-values by applying SW test to NRSP
## ucs --- unmodified CS residuals
## mcs --- modified CS residuals
## nmsp --- normalized modified SP
## martg --- Martingale residuals
## dev --- Deviance residuals
## haz_fn --- hazard function of cs residuals
Copy the above R functions into your R enviroment.
Li, L., Wu, T., Feng, C., 2021. Model Diagnostics for Censored Regression via Randomized Survival Probabilities. Statistics in Medicine, 2020+. https://doi.org/10.1002/sim.8852; https://onlinelibrary.wiley.com/share/author/F8DKBTX7IT7UT2WTSZP3?target=10.1002/sim.8852
#simulated data:t from weibull distrbution,c from exponentail distrbutions
set.seed(1)
rexp2 <- function(n, rate){ if (rate==0) rep(Inf,n) else rexp(n=n, rate = rate)}
simulated_data<- function(n,beta0 , beta1 , alpha, mean.censor)
{
x <- rbinom(n, size = 1, p = 0.5)
t0<- rexp2(n, rate= 1/mean.censor)
survreg_sim_data <- rsurvreg( n, mean = beta0 + beta1 * x,
scale=1/alpha, distribution='weibull')
t <- pmin(survreg_sim_data, t0)
d <- as.numeric(t0>= survreg_sim_data )
data_form<- data.frame(survreg_sim_data,t0,t,d,x)
out_r<-list(data_form=data_form, alpha=alpha, beta0=beta0, beta1=beta1)
return (out_r)
}
library("foreach")
library("survival")
n<-2000
beta0<-2
beta1<-1
alpha<-1.7
mean.censor<-14
#nrep is preset to a number
foreach (j = 1:nrep) %do%
{
# simulating a dataset
out_r<- simulated_data(n=n,beta0=beta0,beta1=beta1,
alpha=alpha, mean.censor=mean.censor)
simulated_data_random<-out_r$data_form
#checking censoring rate
table(simulated_data_random$d)
#fit AFT model
survreg_fit_t <- survreg(Surv(out_r[[1]]$t,
out_r[[1]]$d) ~ out_r[[1]]$x,dist="weibull")
survreg_fit_w <- survreg(Surv(out_r[[1]]$t,
out_r[[1]]$d) ~ out_r[[1]]$x,dist="lognormal")
# compute residuals
true_model <- resid_survreg(survreg_fit_t)
wrong_model<- resid_survreg(survreg_fit_w)
rsp.t <- true_model$RSP
nrsp.t<- true_model$nrsp
rsp.w <- wrong_model$RSP
nrsp.w<- wrong_model$nrsp
t<-seq(0, 50, length=n)
gp1<- which(out_r[[1]]$x==1&out_r[[1]]$d==1)
gp2<- which(out_r[[1]]$x==1&out_r[[1]]$d==0)
gp3<- which(out_r[[1]]$x==0&out_r[[1]]$d==1)
gp4<- which(out_r[[1]]$x==0&out_r[[1]]$d==0)
s1.t<-1-(psurvreg(t, mean=sum(survreg_fit_t$coefficients),
scale=survreg_fit_t$scale, distribution="weibull"))
s0.t<-1-(psurvreg(t, mean=survreg_fit_t$coefficients[[1]],
scale=survreg_fit_t$scale, distribution="weibull"))
s1.w<-1-(psurvreg(t, mean=sum(survreg_fit_w$coefficients),
scale=survreg_fit_w$scale, distribution="lognormal"))
s0.w<-1-(psurvreg(t, mean=survreg_fit_w$coefficients[[1]],
scale=survreg_fit_w$scale, distribution="lognormal"))
par(mfrow = c(2,2),mar=c(4,4,2,2))
#The plot of true model
plot(t, s1.t,type="l", ylab="S(t)",xlab="Time",
main="RSPs, True Model",ylim=c(0,1.0),xlim=c(0,40))
lines(t, s0.t)
## choose a random sample of observations for display
sp <- sample(1:n,400)
s.gp1 <- gp1[which(gp1 %in% sp)]
s.gp2 <- gp2[which(gp2 %in% sp)]
s.gp3 <- gp3[which(gp3 %in% sp)]
s.gp4 <- gp4[which(gp4 %in% sp)]
points(out_r[[1]]$t[s.gp1], rsp.t[s.gp1],col="green",pch=0)
points(out_r[[1]]$t[s.gp2],rsp.t[s.gp2],col="red",pch=8)
points(out_r[[1]]$t[s.gp3],rsp.t[s.gp3],pch=2,col="green")
points(out_r[[1]]$t[s.gp4],rsp.t[s.gp4],pch=1,col="red")
legend ("topright",
legend = c(expression('Uncensored,x'['i']*"=1"),
expression('Censored,x'['i']*'=1'),
expression('Uncensored,x'['i']*'=0'),
expression('Censored,x'['i']*'=0')),
pch=c(0,8,2,1), col = c(3,2,3,2))
hist(rsp.t,xlab="Randomized Survival Probability",main="Histogram of RSPs, True Model")
#The plot of wrong model
plot(t, s1.w,type="l",ylab="S(t)",xlab="Time",main="RSPs, Wrong Model",
ylim=c(0,1.0),xlim=c(0,40))
lines(t, s0.w)
sp <- sample(1:n,400)
s.gp1 <- gp1[which(gp1 %in% sp)]
s.gp2 <- gp2[which(gp2 %in% sp)]
s.gp3 <- gp3[which(gp3 %in% sp)]
s.gp4 <- gp4[which(gp4 %in% sp)]
points(out_r[[1]]$t[s.gp1],rsp.w[s.gp1],col="green",pch=0)
points(out_r[[1]]$t[s.gp2],rsp.w[s.gp2],col="red",pch=8)
points(out_r[[1]]$t[s.gp3],rsp.w[s.gp3],pch=2,col="green")
points(out_r[[1]]$t[s.gp4],rsp.w[s.gp4],pch=1,col="red")
legend ("topright", legend = c(expression('Uncensored,x'['i']*"=1"),
expression('Censored,x'['i']*'=1'),
expression('Uncensored,x'['i']*'=0'),
expression('Censored,x'['i']*'=0')),
pch=c(0,8,2,1), col = c(3,2,3,2)
)
hist(rsp.w,xlab="Randomized Survival Probability",
main="Histogram of RSPs, Wrong Model")
}
#simulated data:t from weibull distrbution,c from exponentail distrbutions
set.seed(1)
rexp2 <- function(n, rate){ if (rate==0) rep(Inf,n) else rexp(n=n, rate = rate)}
simulated_data<- function(n,beta0 , beta1 , alpha, mean.censor)
{
x <- rbinom(n, size = 1, p = 0.5)
t0<- rexp2(n, rate= 1/mean.censor)
survreg_sim_data <- rsurvreg( n, mean = beta0 + beta1 * x,
scale=1/alpha, distribution='weibull')
t <- pmin(survreg_sim_data, t0)
d <- as.numeric(t0>= survreg_sim_data )
data_form<- data.frame(survreg_sim_data,t0,t,d,x)
out_r<-list(data_form=data_form, alpha=alpha, beta0=beta0, beta1=beta1)
return (out_r)
}
library("foreach")
library("survival")
n<- 800
beta0<-2
beta1<-1
alpha<-2
mean.censor<-14.6
#nrep is preset to a number
foreach (j = 1:nrep) %do%
{
# simulating a dataset
out_r<- simulated_data(n=n,beta0=beta0,beta1=beta1,
alpha=alpha, mean.censor=mean.censor)
simulated_data_random<-out_r$data_form
#checking censoring rate
table(simulated_data_random$d)
#fit AFT model
true_model <- survreg(Surv(out_r[[1]]$t, out_r[[1]]$d) ~ out_r[[1]]$x,dist="weibull")
wrong_model <- survreg(Surv(out_r[[1]]$t, out_r[[1]]$d) ~ out_r[[1]]$x,dist="lognormal")
nrsp.t<-resid_survreg(true_model)
nrsp.w<-resid_survreg(wrong_model)
#the cumulative hazard function estimated by Kaplan-Meier method of CS residuals
km.ucs.t <- survfit(Surv(nrsp.t$ucs, out_r[[1]]$d)~1,type='fleming')
id.ucs.t<-order(nrsp.t$ucs)
km.ucs.w <- survfit(Surv(nrsp.w$ucs, out_r[[1]]$d)~1,type='fleming')
id.ucs.w<-order(nrsp.w$ucs)
#The plot of true model
par(mfrow = c(2,3),mar=c(4,4,2,2))
resid.lim <- c(-1,1) * max(range(abs(nrsp.t$nrsp),abs(nrsp.w$nrsp)),5)
cs.lim <- c(0,max(6,range(km.ucs.t$cumhaz,km.ucs.w$cumhaz)))
plot(nrsp.t$nrsp,ylab="NRSP",col=out_r[[1]]$d+2,pch=out_r[[1]]$d+1,
ylim=resid.lim, main="True Model, NRSP Residual plot")
legend ("topleft", pch = c(2,1), legend = c("Uncensored", "Censored"),
col = c(3,2), cex=1)
qqnorm(nrsp.t$nrsp,xlab="Theoretical Quantiles", ylab="Sample Quantiles",
main= paste0("True Model, QQ plot, SW p-value=",
sprintf("%4.3f",nrsp.t$nrsp.sw.pvalue)),
xlim=resid.lim, ylim = resid.lim)
abline(a=0, b=1)
plot(km.ucs.t, fun="cumhaz", xlab="Unmodified Cox-Snell Residuals",
ylab="Cumulative Hazard Function",
main="True Model, Cum. Hazard of CS Residuals",
xlim=c(0,6), ylim = c(0,6))
abline(a=0, b=1, col="blue", lty=2)
points(km.ucs.t$time, -log(km.ucs.t$surv), col =out_r[[1]]$d[id.ucs.t]+2,
pch=out_r[[1]]$d[id.ucs.t]+1)
legend ("topleft", pch = c(2,1), legend = c("Uncensored", "Censored"),
col = c(3,2), cex=1)
#The plot of wrong model
plot(nrsp.w$nrsp,ylab="NRSP",col=out_r[[1]]$d+2,pch=out_r[[1]]$d+1,
ylim=resid.lim, main="Wrong Model, NRSP Residual plot")
legend ("topleft", pch = c(2,1), legend = c("Uncensored", "Censored"),col = c(3,2), cex=1)
qqnorm(nrsp.w$nrsp,xlab="Theoretical Quantiles", ylab="Sample Quantiles",
main=paste0("Wrong Model, QQ plot, SW p-value=",
sprintf("%4.3f",nrsp.w$nrsp.sw.pvalue)),
xlim=resid.lim, ylim = resid.lim)
abline(a=0,b=1)
plot(km.ucs.w, fun="cumhaz", xlab=("Unmodified Cox-Snell Residuals"),
ylab=("Cumulative Hazard Function"),
main="Wrong Model, Cum. Hazard of CS Residuals",
xlim=c(0,6), ylim = c(0,6))
abline(a=0, b=1, col="blue", lty=2)
points(km.ucs.w$time, -log(km.ucs.w$surv), col =out_r[[1]]$d[id.ucs.w]+2,
pch=out_r[[1]]$d[id.ucs.w]+1)
legend ("topleft", pch = c(2,1), legend = c("Uncensored", "Censored"),col = c(3,2), cex=1)
}