|
| 1 | +klUniform <- function(v, numPriorDraws, samplesPerPrior) { |
| 2 | + # D_{KL}(v || uniform) |
| 3 | + # https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence |
| 4 | + expectedPr <- 1.0/samplesPerPrior |
| 5 | + observedPr <- table(v) / numPriorDraws |
| 6 | + sum(observedPr * log(observedPr/expectedPr)) |
| 7 | +} |
| 8 | + |
| 9 | +#' Histograms for Simulation Based Calibration |
| 10 | +#' |
| 11 | +#' @param ranks A list of sampling realizations. |
| 12 | +#' @param thin An integer vector of length one indicating the thinning interval |
| 13 | +#' when plotting |
| 14 | +#' @param perBin Number of histogram entries to combine into a single bar. |
| 15 | +#' @param worst If NA, plots all parameters. Otherwise how many parameters to show. |
| 16 | +#' Parameters are ordered by the degree of non-uniformity. |
| 17 | +#' @param alpha Uncertainty interval probability for a false positive (alpha level). |
| 18 | +#' @param hideAxes Whether to hide the plot axes. |
| 19 | +#' |
| 20 | +#' Each list element of \code{ranks} should be a matrix of rank |
| 21 | +#' comparison results (encoded as 0 or 1) associated with a single |
| 22 | +#' draw from the prior distribution. Each draw from the posterior is |
| 23 | +#' in the row and parameters are in columns. The matrix should have |
| 24 | +#' column names to correctly label the parameters. |
| 25 | +#' |
| 26 | +#' So that the histograms consist of independent realizations, |
| 27 | +#' draws from the posterior should be thinned to remove |
| 28 | +#' autocorrelation. Set \code{thin} such that the number of |
| 29 | +#' draws approximately matches the effective sample size. |
| 30 | +#' |
| 31 | +#' For best results, one plus the number of draws from the posterior |
| 32 | +#' should be evenly divisible by the number of histogram bins after |
| 33 | +#' thinning. For example, 511 draws after thinning results in 128 |
| 34 | +#' draws. If perBin is set to 4 then 32 histogram bars are drawn. |
| 35 | +#' |
| 36 | +#' @template return-ggplot |
| 37 | +#' |
| 38 | +#' @references |
| 39 | +#' Talts, S., Betancourt, M., Simpson, D., Vehtari, A., and Gelman, A. (2018). |
| 40 | +#' Validating Bayesian Inference Algorithms with Simulation-Based Calibration. |
| 41 | +#' arXiv preprint arXiv:1804.06788. \url{https://arxiv.org/abs/1804.06788} |
| 42 | +#' @seealso |
| 43 | +#' \link[rstan]{sbc} |
| 44 | +#' @examples |
| 45 | +#' pars <- paste0('parameter',1:2) |
| 46 | +#' samplesPerPrior <- 511 |
| 47 | +#' ranks <- list() |
| 48 | +#' for (px in 1:500) { |
| 49 | +#' r1 <- matrix(0, nrow=samplesPerPrior, ncol=length(pars), |
| 50 | +#' dimnames=list(NULL, pars)) |
| 51 | +#' for (p1 in 1:length(pars)) { |
| 52 | +#' r1[sample.int(samplesPerPrior, |
| 53 | +#' floor(runif(1, 0, samplesPerPrior))), p1] <- 1 |
| 54 | +#' } |
| 55 | +#' ranks[[px]] <- r1 |
| 56 | +#' } |
| 57 | +#' sbc_hist(ranks) |
| 58 | +#' @export |
| 59 | + |
| 60 | +sbc_hist <- function(ranks, thin = 4, perBin=4, worst=16, ..., |
| 61 | + alpha = 0.01, hideAxes=TRUE) { |
| 62 | + numPriorDraws <- length(ranks) |
| 63 | + thinner <- seq(from = 1, to = nrow(ranks[[1]]), by = thin) |
| 64 | + samplesPerPrior <- length(thinner) |
| 65 | + u <- t(sapply(ranks, FUN = function(r) 1 + colSums(r[thinner, , drop = FALSE]))) |
| 66 | + if (ncol(ranks[[1]]) == 1) { |
| 67 | + u <- t(u) |
| 68 | + dimnames(u) <- list(NULL, colnames(ranks[[1]])) |
| 69 | + } |
| 70 | + |
| 71 | + if (!is.na(worst)) { |
| 72 | + kl <- apply(u, 2, function(v) klUniform(v, numPriorDraws, samplesPerPrior)) |
| 73 | + filter <- order(-kl)[1:min(worst,ncol(u))] |
| 74 | +# print(filter) |
| 75 | + u <- u[, filter, drop=FALSE ] |
| 76 | + } |
| 77 | + |
| 78 | + parameter <- ordered(rep(colnames(u), each = nrow(u)), |
| 79 | + levels=colnames(u)) |
| 80 | + d <- data.frame(u = c(u), parameter) |
| 81 | + if (samplesPerPrior %% perBin != 0) { |
| 82 | + warning(paste("perBin (", perBin, ") does not evenly divide the", |
| 83 | + "number of samples per prior (",samplesPerPrior,")")) |
| 84 | + } |
| 85 | + numBins <- samplesPerPrior/perBin |
| 86 | + CI <- qbinom(c(alpha/2,0.5,1-alpha/2), numPriorDraws, numBins^-1) + c(-.5,0,.5) |
| 87 | + offset <- perBin*2 |
| 88 | + pl <- ggplot(d, aes(x = u)) + |
| 89 | + geom_polygon(data=data.frame(x=c(-offset,0,-offset,samplesPerPrior + offset, |
| 90 | + samplesPerPrior, samplesPerPrior + offset,-offset), |
| 91 | + y=c(CI[1],CI[2],CI[3],CI[3],CI[2],CI[1],CI[1])), |
| 92 | + aes(x=x,y=y),fill="grey45",color="grey25",alpha=0.5) + |
| 93 | + geom_histogram(bins=numBins, na.rm=TRUE) + |
| 94 | +# xlim(1,samplesPerPrior) + |
| 95 | + # https://github.com/tidyverse/ggplot2/issues/3332 |
| 96 | + facet_wrap("parameter") + |
| 97 | + geom_hline(yintercept=CI[1], color='green', linetype="dotted", alpha=.5) + |
| 98 | + geom_hline(yintercept=CI[3], color='green', linetype="dotted", alpha=.5) |
| 99 | + if (hideAxes) { |
| 100 | + pl <- pl + theme(axis.text.x=element_blank(), |
| 101 | + axis.text.y=element_blank(),axis.ticks=element_blank(), |
| 102 | + axis.title.x=element_blank(), |
| 103 | + axis.title.y=element_blank()) |
| 104 | + } |
| 105 | + pl |
| 106 | +} |
0 commit comments