
Commit 362f13e

Generalized Ising implementation (#1)
* Dump before implementation of proper cavity var optimization
* Dirty but more or less works
* Update README
* Update README #2
* Clean up tests folder
* Saturday dump
* Remove dirty example
* Clean up solvers
* Clean solvers and utils
* Update setup.py
* Update README and tests
* Clean up deq
* Clean modules
* Update README
* Simplify single layer example and remove plot from debug in deq
1 parent f18b679 commit 362f13e

9 files changed: +394 -219 lines changed

README.md (+21 -7)
@@ -2,15 +2,27 @@
 
 ## Deep Implicit Attention
 
-_The return of the Boltzmann machine_
+Experimental implementation of deep implicit attention in PyTorch.
 
----
+**Summary:** Using deep equilibrium networks to implicitly solve a set of self-consistent mean-field equations of a random Ising model implements attention as a collective response 🤗 and provides insight into the transformer architecture, connecting it to mean-field theory, message-passing algorithms, and Boltzmann machines.
 
-Experimental implementation of deep implicit attention in PyTorch.
+**Blog post (in preparation): _Deep Implicit Attention: A Mean-Field Theory Perspective on Attention Mechanisms_**
+
+## To-do
 
-**Key idea:** Use deep equilibrium networks to implicitly solve a set of self-consistent mean-field equations of a random Ising model: attention as a collective response 🤗.
+### Modules
+- [x] Add a `GeneralizedIsingGaussianAdaTAP` module implementing the adaptive TAP mean-field equations for an Ising-like vector model with standard multivariate Gaussian priors over spins
+- [ ] Figure out the analytical Gibbs free energy for `GeneralizedIsingGaussianAdaTAP` and implement it to be able to use it as a stand-alone loss function
+- [ ] Look into making the parameters of the multivariate Gaussian priors in `GeneralizedIsingGaussianAdaTAP` trainable
+- [ ] Add a `VanillaSoftmaxAttention` module which reproduces vanilla softmax attention, i.e. implementing coupling weights between spins which depend solely on linear transformations of the external sources (queries/keys) and replacing the self-correction term with a parametrized position-wise feed-forward network
 
-**Blog post (in preparation):** <a href="https://mcbal.github.io/">Deep Implicit Attention: A Mean-Field Theory Perspective on Attention Mechanisms</a>
+### Models
+- [ ] Add a `DeepImplicitAttentionTransformer` model
+- [ ] Add a `DeepImplicitAttentionViT` model
+
+### Miscellaneous
+- [ ] Add additional fixed-point / root solvers (e.g. Broyden)
+- [ ] Add examples (MNIST, sequence tasks, ...)
 
 ## Setup
 
@@ -30,7 +42,9 @@ $ python -m unittest
 
 See `tests` for now until `examples` folder is populated.
 
-## Selection of references
+## References
+
+### Selection of literature
 On variational inference, iterative approximation algorithms, expectation propagation, mean-field methods and belief propagation:
 - [Expectation Propagation](https://arxiv.org/abs/1409.6179) (2014) by Jack Raymond, Andre Manoel, Manfred Opper
 
@@ -48,7 +62,7 @@ On deep equilibrium networks:
 - [Chapter 4: Deep Equilibrium Models](https://implicit-layers-tutorial.org/deep_equilibrium_models/) of the [Deep Implicit Layers - Neural ODEs, Deep Equilibrium Models, and Beyond](http://implicit-layers-tutorial.org/), created by Zico Kolter, David Duvenaud, and Matt Johnson
 
 
-## Code inspiration
+### Code inspiration
 
 - http://implicit-layers-tutorial.org/
 - https://github.com/locuslab/deq
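The README summary above frames attention as the collective response of a random Ising model whose mean-field equations are solved implicitly by a deep equilibrium network. As a rough schematic only (scalar spins and a generic prior mean function f; the `GeneralizedIsingGaussianAdaTAP` module works with vector spins and multivariate Gaussian priors, so the actual equations differ in detail), adaptive-TAP-style self-consistency reads:

```latex
% Schematic scalar-spin form, not the exact equations implemented in the repo.
m_i = f\Big( x_i + \sum_{j \neq i} J_{ij}\, m_j - V_i\, m_i \Big)
```

Here the external sources x_i play the role of the input tokens, the couplings J_ij mediate the attention-like interaction between sites, and V_i m_i is the Onsager/cavity self-correction; the deep equilibrium solver iterates this update to a fixed point m*.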

deep_implicit_attention/deq.py (+38 -24)
@@ -11,44 +11,48 @@
 class _DEQModule(nn.Module, metaclass=ABCMeta):
     def __init__(self):
         super().__init__()
-        self.state_shape = None
+        self.shapes = None
 
     def pack_state(self, z_list):
-        """Transform list of batched tensors into batch of vectors."""
-        self.state_shape = [t.shape[1:] for t in z_list]
+        """
+        Transform list of batched tensors into batch of vectors.
+        """
+        self.shapes = [t.shape[1:] for t in z_list]
         bsz = z_list[0].shape[0]
-        z = torch.cat([elem.reshape(bsz, -1) for elem in z_list], dim=1)
+        z = torch.cat([t.reshape(bsz, -1) for t in z_list], dim=1)
         return z
 
     def unpack_state(self, z):
-        """Transform batch of vectors into list of batched tensors according to `state_shape`."""
-        assert self.state_shape is not None
+        """
+        Transform batch of vectors into list of batched tensors.
+        """
+        assert self.shapes is not None
         bsz, z_list = z.shape[0], []
-        start_idx, end_idx = 0, reduce(lambda x, y: x * y, self.state_shape[0])
-        for i in range(len(self.state_shape)):
-            z_list.append(z[:, start_idx:end_idx].view(bsz, *self.state_shape[i]))
-            if i < len(self.state_shape) - 1:
+        start_idx, end_idx = 0, reduce(lambda x, y: x * y, self.shapes[0])
+        for i in range(len(self.shapes)):
+            z_list.append(z[:, start_idx:end_idx].view(bsz, *self.shapes[i]))
+            if i < len(self.shapes) - 1:
                 start_idx = end_idx
-                end_idx += reduce(lambda x, y: x * y, self.state_shape[i + 1])
+                end_idx += reduce(lambda x, y: x * y, self.shapes[i + 1])
         return z_list
 
     @abstractmethod
-    def get_initial_guess(self, x):
+    def _initial_guess(self, x):
         """Return an initial guess for the fixed-point state based on shape of `x`."""
         pass
 
     @abstractmethod
     def forward(self, z, x, *args):
-        """Implement (z_{n}, x) -> z_{n+1}."""
+        """Implement f(z_{n}, x) -> z_{n+1}."""
         pass
 
 
 class DEQFixedPoint(nn.Module):
     _default_kwargs = {
-        "solver_fwd_max_iter": 30,
-        "solver_fwd_tol": 1e-4,
-        "solver_bwd_max_iter": 30,
-        "solver_bwd_tol": 1e-4,
+        'solver_fwd_max_iter': 30,
+        'solver_fwd_tol': 1e-4,
+        'solver_bwd_max_iter': 30,
+        'solver_bwd_tol': 1e-4,
     }
 
     def __init__(self, fun, solver, output_elements=[0], **kwargs):
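As a quick illustration of the packing helpers changed in this hunk, the sketch below mirrors what `pack_state`/`unpack_state` do, using made-up tensor shapes (standalone code, not taken from the repo):

```python
import torch

# Two state tensors per batch element (shapes are invented for the example).
bsz = 4
z_list = [torch.randn(bsz, 8, 16), torch.randn(bsz, 8)]

# pack_state: remember the per-tensor shapes and flatten into one vector per batch element.
shapes = [t.shape[1:] for t in z_list]
z = torch.cat([t.reshape(bsz, -1) for t in z_list], dim=1)  # shape (4, 8 * 16 + 8) = (4, 136)

# unpack_state: slice the packed vector back into tensors using the remembered shapes.
unpacked, start = [], 0
for shape in shapes:
    numel = 1
    for d in shape:
        numel *= d
    unpacked.append(z[:, start:start + numel].view(bsz, *shape))
    start += numel

assert all(torch.equal(a, b) for a, b in zip(z_list, unpacked))
```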
@@ -60,14 +64,20 @@ def __init__(self, fun, solver, output_elements=[0], **kwargs):
         self.kwargs.update(**kwargs)
 
     def _fixed_point(self, z0, x, *args, **kwargs):
+        """Find fixed-point of `fun` given `z0` and `x`."""
+
         # Compute forward pass: find equilibrium state
         with torch.no_grad():
             out = self.solver(
                 lambda z: self.fun(z, x, *args),
                 z0,
-                **filter_kwargs(kwargs, "solver_fwd_"),
+                **filter_kwargs(kwargs, 'solver_fwd_'),
             )
-            z, _ = out["result"], out["rel_trace"]
+            z = out['result']
+            if kwargs.get('debug', False):
+                print(f"{out['rel_trace'][0]} -> {out['rel_trace'][-1]}")
+                # from .utils import log_plot
+                # log_plot(out['rel_trace'])
 
         if self.training:
             # Re-engage autograd tape at equilibrium state
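`DEQFixedPoint` treats the injected `solver` as a callable returning a dict with `'result'` and `'rel_trace'` entries, since that is how `_fixed_point` consumes its output. Below is a minimal sketch of a solver obeying that contract, using plain fixed-point iteration and assuming that `filter_kwargs` strips the `solver_fwd_`/`solver_bwd_` prefixes before forwarding `max_iter`/`tol`; the solvers actually shipped in the repo (and the planned Broyden solver) may look different:

```python
import torch

def toy_solver(fun, z0, max_iter=30, tol=1e-4):
    """Naive fixed-point iteration returning the dict layout read by `_fixed_point`."""
    z, rel_trace = z0, []
    for _ in range(max_iter):
        z_next = fun(z)
        # Relative residual used both as stopping criterion and as the debug trace.
        rel = ((z_next - z).norm() / (1e-8 + z_next.norm())).item()
        rel_trace.append(rel)
        z = z_next
        if rel < tol:
            break
    return {'result': z, 'rel_trace': rel_trace}
```

Passing `debug=True` at call time then prints the first and last relative residuals of the forward solve, a cheap convergence check.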
@@ -78,23 +88,27 @@ def _fixed_point(self, z0, x, *args, **kwargs):
 
             def backward_hook(grad):
                 out = self.solver(
-                    lambda y: autograd.grad(fun_bwd, z_bwd, y, retain_graph=True)[0]
+                    lambda y: autograd.grad(
+                        fun_bwd, z_bwd, y, retain_graph=True)[0]
                     + grad,
                     torch.zeros_like(grad),
-                    **filter_kwargs(kwargs, "solver_bwd_"),
+                    **filter_kwargs(kwargs, 'solver_bwd_'),
                 )
-                g, _ = out["result"], out["rel_trace"]
+                g = out['result']
+                # [DEBUG] insert statements here for backward pass inspection
                 return g
 
             z.register_hook(backward_hook)
 
         return z
 
     def forward(self, x, *args, **kwargs):
+        # Merge default kwargs with incoming runtime kwargs.
+        kwargs = {**self.kwargs, **kwargs}
         # Get list of initial guess tensors and reshape into a batch of vectors
-        z0 = self.fun.pack_state(kwargs.get("z0", self.fun.get_initial_guess(x)))
+        z0 = self.fun.pack_state(self.fun._initial_guess(x))
         # Find equilibrium vectors
-        z_star = self._fixed_point(z0, x, *args, **self.kwargs)
+        z_star = self._fixed_point(z0, x, *args, **kwargs)
         # Return (subset of) list of tensors of original input shapes
         out = [self.fun.unpack_state(z_star)[i] for i in self.output_elements]
         return out[0] if len(out) == 1 else out
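Putting the pieces together, here is a hedged end-to-end sketch of how a `_DEQModule` subclass is wrapped by `DEQFixedPoint`. The toy update rule is invented for illustration, and `toy_solver` refers to the sketch above, not to a solver shipped in the repo:

```python
import torch
from deep_implicit_attention.deq import DEQFixedPoint, _DEQModule

class ToyModule(_DEQModule):
    """Toy contraction map z <- 0.5 * tanh(z) + x, purely for illustration."""

    def _initial_guess(self, x):
        # Single state tensor, initialized at zero, with the same shape as the input.
        return [torch.zeros_like(x)]

    def forward(self, z, x, *args):
        (z,) = self.unpack_state(z)
        z_next = 0.5 * torch.tanh(z) + x
        return self.pack_state([z_next])

# Override a default solver kwarg at construction; pass `debug=True` per call.
deq = DEQFixedPoint(ToyModule(), toy_solver, solver_fwd_tol=1e-6)
x = torch.randn(4, 10)
z_star = deq(x, debug=True)  # equilibrium state; implicit-gradient hook attached in training mode
```

In the actual repo the wrapped module would be something like `GeneralizedIsingGaussianAdaTAP`, with the attention-like couplings living inside the fixed-point update.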

deep_implicit_attention/models.py (+5 -1)

@@ -1 +1,5 @@
-"""TODO: BoltzmannAgent, BoltzmannTransformer"""
+"""
+TODO:
+- Add a `DeepImplicitAttentionTransformer` model
+- Add a `DeepImplicitAttentionViT` model
+"""
