
Commit 16f12b8

Author: cmgoold (committed)
docs: add comparison vignette
Add a comparison notebook using simulated data. Addresses #10
1 parent eefca59 · commit 16f12b8

File tree

6 files changed: +406 / -155 lines changed


bayesblend/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1,8 +1,10 @@
 from .models import BayesStacking, HierarchicalBayesStacking, MleStacking, PseudoBma
+from .io import Draws

 __all__ = [
     "MleStacking",
     "PseudoBma",
     "BayesStacking",
     "HierarchicalBayesStacking",
+    "Draws",
 ]
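Since this change re-exports `Draws` at the package top level (it is now listed in `__all__`), it can be imported alongside the stacking classes. A minimal check:

# Draws is now importable from the package top level rather than from bayesblend.io
from bayesblend import Draws, MleStacking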

bayesblend/models.py

Lines changed: 9 additions & 13 deletions
@@ -215,6 +215,8 @@ def _blend(
     for model, draws in model_draws.items():
         blend_id = blend_idx[model]
         for par, samples in draws:
+            if samples is None:
+                continue
             blended_list = [
                 list(samples[draws_idx == blend_id, idx])
                 for idx, draws_idx in enumerate(draws_idx_list)
@@ -271,27 +273,23 @@ def __init__(
     def _obj_fun(self, w, *args):
         """Negative sum of the weighted log predictive densities"""
         Y = args[0]
-        w = np.concatenate([w, [1 - sum(w)]])
         log_scores = np.log(Y @ w)
         return -sum(log_scores)

     def _grad(self, w, *args):
         """Jacobian of the objective function.

         The gradient of log(Y @ w) wrt w is 1/(Y @ w) Y, using
-        the chain rule. Since we are only estimating K - 1
-        weights, we can correct the gradient by multiplying
-        by Y[:,:K - 1] - Y[:,-1].
+        the chain rule.
         """
         Y = args[0]
-        w = np.concatenate([w, [1 - sum(w)]])
         N, K = Y.shape
-        grad = np.diag(np.ones(N) / (Y @ w)) @ (Y[:,:K - 1] - Y[:,-1].reshape((N, 1)))
+        grad = np.diag(np.ones(N) / (Y @ w)) @ Y
         return -grad.sum(axis=0)

     def _constraint(self, w):
         # -sum(w) + 1 > 0
-        return -sum(w) + 1
+        return sum(w) - 1

     def fit(self) -> MleStacking:
         lpd_points = np.array([draws.lpd for draws in self.model_draws.values()]).T
@@ -302,18 +300,16 @@ def fit(self) -> MleStacking:
             fun=self._obj_fun,
             jac=self._grad,
             args=(exp_lpd),
-            x0=np.repeat(1 / K, K - 1),
+            x0=np.repeat(1 / K, K),
             method="SLSQP",
-            constraints=dict(type="ineq", fun=self._constraint),
-            bounds=[(0, 1) for _ in range(K - 1)],
+            constraints=dict(type="eq", fun=self._constraint),
+            bounds=[(0, 1) for _ in range(K)],
             options=self.optimizer_options,
         )
-        _weights = np.concatenate([res.x, [1 - sum(res.x)]])
-        breakpoint()

         self._weights = {
             model: np.atleast_2d(weight)
-            for model, weight in zip(self.model_draws, _weights)
+            for model, weight in zip(self.model_draws, res.x)
         }
         self._model_info = res
         return self
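For context, this change switches `MleStacking` from optimising K - 1 free weights (with the last weight recovered as 1 - sum(w)) to optimising all K weights under an equality constraint. The sketch below reproduces that setup in isolation; the variable names and toy data are illustrative only, not the library's internals.

import numpy as np
from scipy.optimize import minimize

# Standalone sketch of the revised optimisation: all K weights are estimated
# directly, with the simplex enforced by sum(w) = 1 and bounds 0 <= w_k <= 1.

def obj_fun(w, Y):
    # negative sum of the weighted log predictive densities
    return -np.sum(np.log(Y @ w))

def grad(w, Y):
    # d/dw_k of -sum_i log((Y @ w)_i) = -sum_i Y[i, k] / (Y @ w)_i, by the chain rule
    return -(Y.T @ (1.0 / (Y @ w)))

def constraint(w):
    # equality constraint: sum(w) - 1 == 0
    return np.sum(w) - 1

# Y: N x K matrix of exponentiated pointwise log predictive densities (toy data)
rng = np.random.default_rng(1)
Y = np.exp(rng.normal(-1.0, 0.1, size=(100, 3)))
K = Y.shape[1]

res = minimize(
    fun=obj_fun,
    jac=grad,
    args=(Y,),
    x0=np.repeat(1 / K, K),
    method="SLSQP",
    constraints=dict(type="eq", fun=constraint),
    bounds=[(0, 1) for _ in range(K)],
)
print(res.x)  # stacking weights, one per model, summing to 1

With all K weights free, the Jacobian reduces to -Y.T @ (1 / (Y @ w)), which is why the K - 1 correction term and the re-assembly of the final weight from 1 - sum(res.x) could be removed.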

docs/user-guide/blending.md

Lines changed: 6 additions & 3 deletions
@@ -25,6 +25,9 @@ marginal likelihood or evidence for each model in $\mathcal{M}$.
 Because this quantity is often difficult to calculate,
 *pseudo Bayesian model averaging* (pseudo-BMA) has been introduced
 as a method of approximating BMA using information criteria.
+In addition, *pseudo Bayesian model averaging plus* (pseudo-BMA+)
+accounts for uncertainty in the information criteria by applying
+the Bayesian bootstrap to log likelihood vectors.

 [Stacking](
 https://en.wikipedia.org/wiki/Ensemble_learning#Stacking
@@ -45,8 +48,6 @@ represents any scoring rule
 used to evaluate data point $y_{i}$ from
 the posterior distribution of model $k$, $p(\Theta_{k} \mid \mathbf{y})$,
 with parameters $\Theta_{k}$.
-In practice, we only need to estimate $K - 1$ weights as the final
-weight is known due to $\sum_{k=1} w = 1$.

 The appeal of stacking, apart from its reported improved predictive
 accuracy over other procedures (see e.g.
@@ -61,7 +62,9 @@ The weights can also be a function of covariates,
 and/or can be estimated hierarchically.
 These extensions have generally been referred to
 as hierarchical Bayesian stacking (see
-[Yao *et al.*, 2021](https://arxiv.org/abs/2101.08954)).
+[Yao *et al.*, 2021](https://arxiv.org/abs/2101.08954))
+because of their use of a fully Bayesian model
+to estimate the weights.

 ## Blending
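The pseudo-BMA+ passage added above (Bayesian bootstrap over the log likelihood vectors) can be made concrete with a short sketch. This is a generic illustration of the idea, not BayesBlend's `PseudoBma` implementation; the function name and toy data are hypothetical.

import numpy as np

def pseudo_bma_plus_weights(lpd_points, n_boot=1000, seed=1):
    """Illustrative pseudo-BMA+ weights from an N x K matrix of pointwise
    log predictive densities (one column per model)."""
    rng = np.random.default_rng(seed)
    n, k = lpd_points.shape
    boot_weights = np.zeros((n_boot, k))
    for b in range(n_boot):
        alpha = rng.dirichlet(np.ones(n))      # Bayesian bootstrap draw over observations
        elpd = n * (alpha @ lpd_points)        # re-weighted total elpd for each model
        z = elpd - elpd.max()                  # stabilise the exponentiation
        boot_weights[b] = np.exp(z) / np.exp(z).sum()
    return boot_weights.mean(axis=0)           # average the weights over bootstrap draws

# toy example: pointwise log densities for three models
rng = np.random.default_rng(0)
lpd = rng.normal(-1.0, 0.5, size=(200, 3))
print(pseudo_bma_plus_weights(lpd))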

docs/user-guide/index.md

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@ that illustrate the use of BayesBlend.

 * [Getting started](getting-started.md)
 * [Model averaging, stacking and blending](blending.md)
-* [Recovering blending weights using simulated data](simulation.md)
+* [Comparing mixture modelling to pseudo-BMA+ and stacking](simulation.md)
 * [Integration with Arviz](arviz.md)
