pymc-labs
diff --git a/causalpy/pymc_experiments.py
+14-9 b/causalpy/pymc_experiments.py
+14-9
diff --git a/causalpy/skl_experiments.py
+50-8 b/causalpy/skl_experiments.py
+50-8
diff --git a/causalpy/tests/test_integration_pymc_examples.py
+21-10 b/causalpy/tests/test_integration_pymc_examples.py
+21-10
diff --git a/causalpy/tests/test_integration_skl_examples.py
+4-1 b/causalpy/tests/test_integration_skl_examples.py
+4-1
diff --git a/docs/notebooks/did_pymc.ipynb
+12-17 b/docs/notebooks/did_pymc.ipynb
+12-17
@@ -340,7 +340,8 @@ def __init__(
         (new_x,) = build_design_matrices([self._x_design_info], self.x_pred_treatment)
         self.y_pred_treatment = self.model.predict(np.asarray(new_x))
 
-        # predicted outcome for counterfactual
+        # predicted outcome for counterfactual. This is given by removing the influence
+        # of the interaction term between the group and the post_treatment variable
         self.x_pred_counterfactual = (
             self.data
             # just the treated group
@@ -349,24 +350,28 @@ def __init__(
             .query("post_treatment == True")
             # drop the outcome variable
             .drop(self.outcome_variable_name, axis=1)
-            # DO AN INTERVENTION. Set the post_treatment variable to False
-            .assign(post_treatment=False)
             # We may have multiple units per time point, we only want one time point
             .groupby(self.time_variable_name)
             .first()
             .reset_index()
         )
         assert not self.x_pred_counterfactual.empty
         (new_x,) = build_design_matrices(
-            [self._x_design_info], self.x_pred_counterfactual
+            [self._x_design_info], self.x_pred_counterfactual, return_type="dataframe"
         )
+        # INTERVENTION: set the interaction term between the group and the
+        # post_treatment variable to zero. This is the counterfactual.
+        for i, label in enumerate(self.labels):
+            if "post_treatment" in label and self.group_variable_name in label:
+                new_x.iloc[:, i] = 0
         self.y_pred_counterfactual = self.model.predict(np.asarray(new_x))
 
-        # calculate causal impact
-        self.causal_impact = (
-            self.y_pred_treatment["posterior_predictive"].mu.isel({"obs_ind": 1})
-            - self.y_pred_counterfactual["posterior_predictive"].mu.squeeze()
-        )
+        # calculate causal impact.
+        # This is the coefficient on the interaction term
+        coeff_names = self.idata.posterior.coords["coeffs"].data
+        for i, label in enumerate(coeff_names):
+            if "post_treatment" in label and self.group_variable_name in label:
+                self.causal_impact = self.idata.posterior["beta"].isel({"coeffs": i})
 
     def plot(self):
         """Plot the results.
 
@@ -176,13 +176,21 @@ def __init__(
         data: pd.DataFrame,
         formula: str,
         time_variable_name: str,
+        group_variable_name: str,
+        treated: str,
+        untreated: str,
         model=None,
         **kwargs,
     ):
         super().__init__(model=model, **kwargs)
         self.data = data
         self.formula = formula
         self.time_variable_name = time_variable_name
+        self.group_variable_name = group_variable_name
+        self.treated = treated  # level of the group_variable_name that was treated
+        self.untreated = (
+            untreated  # level of the group_variable_name that was untreated
+        )
         y, X = dmatrices(formula, self.data)
         self._y_design_info = y.design_info
         self._x_design_info = X.design_info
@@ -194,32 +202,66 @@ def __init__(
         self.model.fit(X=self.X, y=self.y)
 
         # predicted outcome for control group
-        self.x_pred_control = pd.DataFrame(
-            {"group": [0, 0], "t": [0.0, 1.0], "post_treatment": [0, 0]}
+        self.x_pred_control = (
+            self.data
+            # just the untreated group
+            .query(f"{self.group_variable_name} == @self.untreated")
+            # drop the outcome variable
+            .drop(self.outcome_variable_name, axis=1)
+            # We may have multiple units per time point, we only want one time point
+            .groupby(self.time_variable_name)
+            .first()
+            .reset_index()
         )
         assert not self.x_pred_control.empty
         (new_x,) = build_design_matrices([self._x_design_info], self.x_pred_control)
         self.y_pred_control = self.model.predict(np.asarray(new_x))
 
         # predicted outcome for treatment group
-        self.x_pred_treatment = pd.DataFrame(
-            {"group": [1, 1], "t": [0.0, 1.0], "post_treatment": [0, 1]}
+        self.x_pred_treatment = (
+            self.data
+            # just the treated group
+            .query(f"{self.group_variable_name} == @self.treated")
+            # drop the outcome variable
+            .drop(self.outcome_variable_name, axis=1)
+            # We may have multiple units per time point, we only want one time point
+            .groupby(self.time_variable_name)
+            .first()
+            .reset_index()
         )
         assert not self.x_pred_treatment.empty
         (new_x,) = build_design_matrices([self._x_design_info], self.x_pred_treatment)
         self.y_pred_treatment = self.model.predict(np.asarray(new_x))
 
-        # predicted outcome for counterfactual
-        self.x_pred_counterfactual = pd.DataFrame(
-            {"group": [1], "t": [1.0], "post_treatment": [0]}
+        # predicted outcome for counterfactual. This is given by removing the influence
+        # of the interaction term between the group and the post_treatment variable
+        self.x_pred_counterfactual = (
+            self.data
+            # just the treated group
+            .query(f"{self.group_variable_name} == @self.treated")
+            # just the treatment period(s)
+            .query("post_treatment == True")
+            # drop the outcome variable
+            .drop(self.outcome_variable_name, axis=1)
+            # We may have multiple units per time point, we only want one time point
+            .groupby(self.time_variable_name)
+            .first()
+            .reset_index()
         )
         assert not self.x_pred_counterfactual.empty
         (new_x,) = build_design_matrices(
-            [self._x_design_info], self.x_pred_counterfactual
+            [self._x_design_info], self.x_pred_counterfactual, return_type="dataframe"
         )
+        # INTERVENTION: set the interaction term between the group and the
+        # post_treatment variable to zero. This is the counterfactual.
+        for i, label in enumerate(self.labels):
+            if "post_treatment" in label and self.group_variable_name in label:
+                new_x.iloc[:, i] = 0
         self.y_pred_counterfactual = self.model.predict(np.asarray(new_x))
 
         # calculate causal impact
+        # Intended to be the coefficient on the interaction term.
+        # TODO: not yet correct -- currently a difference of two predictions
         self.causal_impact = self.y_pred_treatment[1] - self.y_pred_counterfactual[0]
 
     def plot(self):
 
@@ -11,7 +11,7 @@ def test_did():
     df = cp.load_data("did")
     result = cp.pymc_experiments.DifferenceInDifferences(
         df,
-        formula="y ~ 1 + group + t + group:post_treatment",
+        formula="y ~ 1 + group*post_treatment",
         time_variable_name="t",
         group_variable_name="group",
         treated=1,
@@ -37,6 +37,10 @@ def test_did_banks_simple():
         .groupby("year")
         .median()
     )
+    # SET TREATMENT TIME TO ZERO =========
+    df.index = df.index - treatment_time
+    treatment_time = 0
+    # ====================================
     df.reset_index(level=0, inplace=True)
     df_long = pd.melt(
         df,
@@ -45,16 +49,18 @@ def test_did_banks_simple():
         var_name="district",
         value_name="bib",
     ).sort_values("year")
-    df_long["district"] = df_long["district"].astype("category")
     df_long["unit"] = df_long["district"]
     df_long["post_treatment"] = df_long.year >= treatment_time
+    df_long = df_long.replace({"district": {"Sixth District": 1, "Eighth District": 0}})
+
     result = cp.pymc_experiments.DifferenceInDifferences(
-        df_long[df_long.year.isin([1930, 1931])],
-        formula="bib ~ 1 + district + year + district:post_treatment",
+        # df_long[df_long.year.isin([1930, 1931])],
+        df_long[df_long.year.isin([-0.5, 0.5])],
+        formula="bib ~ 1 + district * post_treatment",
         time_variable_name="year",
         group_variable_name="district",
-        treated="Sixth District",
-        untreated="Eighth District",
+        treated=1,
+        untreated=0,
         model=cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs),
     )
     assert isinstance(df, pd.DataFrame)
@@ -73,6 +79,10 @@ def test_did_banks_multi():
         .groupby("year")
         .median()
     )
+    # SET TREATMENT TIME TO ZERO =========
+    df.index = df.index - treatment_time
+    treatment_time = 0
+    # ====================================
     df.reset_index(level=0, inplace=True)
     df_long = pd.melt(
         df,
@@ -81,16 +91,17 @@ def test_did_banks_multi():
         var_name="district",
         value_name="bib",
     ).sort_values("year")
-    df_long["district"] = df_long["district"].astype("category")
     df_long["unit"] = df_long["district"]
     df_long["post_treatment"] = df_long.year >= treatment_time
+    df_long = df_long.replace({"district": {"Sixth District": 1, "Eighth District": 0}})
+
     result = cp.pymc_experiments.DifferenceInDifferences(
         df_long,
-        formula="bib ~ 1 + district + year + district:post_treatment",
+        formula="bib ~ 1 + year + district + post_treatment + district:post_treatment",
         time_variable_name="year",
         group_variable_name="district",
-        treated="Sixth District",
-        untreated="Eighth District",
+        treated=1,
+        untreated=0,
         model=cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs),
     )
     assert isinstance(df, pd.DataFrame)
 
@@ -12,8 +12,11 @@ def test_did():
     data = cp.load_data("did")
     result = cp.skl_experiments.DifferenceInDifferences(
         data,
-        formula="y ~ 1 + group + t + group:post_treatment",
+        formula="y ~ 1 + group*post_treatment",
         time_variable_name="t",
+        group_variable_name="group",
+        treated=1,
+        untreated=0,
         model=LinearRegression(),
     )
     assert isinstance(data, pd.DataFrame)