fix up skl DiD code + rerun DiD notebooks + update integration tests

drbenvincent · drbenvincent · commit 37c7f1a9e5af · 2023-01-04T19:15:48.000Z
diff --git a/causalpy/skl_experiments.py b/causalpy/skl_experiments.py
@@ -176,13 +176,21 @@ def __init__(
         data: pd.DataFrame,
         formula: str,
         time_variable_name: str,
+        group_variable_name: str,
+        treated: str,
+        untreated: str,
         model=None,
         **kwargs,
     ):
         super().__init__(model=model, **kwargs)
         self.data = data
         self.formula = formula
         self.time_variable_name = time_variable_name
+        self.group_variable_name = group_variable_name
+        self.treated = treated  # level of the group_variable_name that was treated
+        self.untreated = (
+            untreated  # level of the group_variable_name that was untreated
+        )
         y, X = dmatrices(formula, self.data)
         self._y_design_info = y.design_info
         self._x_design_info = X.design_info
@@ -194,32 +202,66 @@ def __init__(
         self.model.fit(X=self.X, y=self.y)
 
         # predicted outcome for control group
-        self.x_pred_control = pd.DataFrame(
-            {"group": [0, 0], "t": [0.0, 1.0], "post_treatment": [0, 0]}
+        self.x_pred_control = (
+            self.data
+            # just the untreated group
+            .query(f"{self.group_variable_name} == @self.untreated")
+            # drop the outcome variable
+            .drop(self.outcome_variable_name, axis=1)
+            # There may be multiple units per time point; keep one row per time point
+            .groupby(self.time_variable_name)
+            .first()
+            .reset_index()
         )
         assert not self.x_pred_control.empty
         (new_x,) = build_design_matrices([self._x_design_info], self.x_pred_control)
         self.y_pred_control = self.model.predict(np.asarray(new_x))
 
         # predicted outcome for treatment group
-        self.x_pred_treatment = pd.DataFrame(
-            {"group": [1, 1], "t": [0.0, 1.0], "post_treatment": [0, 1]}
+        self.x_pred_treatment = (
+            self.data
+            # just the treated group
+            .query(f"{self.group_variable_name} == @self.treated")
+            # drop the outcome variable
+            .drop(self.outcome_variable_name, axis=1)
+            # There may be multiple units per time point; keep one row per time point
+            .groupby(self.time_variable_name)
+            .first()
+            .reset_index()
         )
         assert not self.x_pred_treatment.empty
         (new_x,) = build_design_matrices([self._x_design_info], self.x_pred_treatment)
         self.y_pred_treatment = self.model.predict(np.asarray(new_x))
 
-        # predicted outcome for counterfactual
-        self.x_pred_counterfactual = pd.DataFrame(
-            {"group": [1], "t": [1.0], "post_treatment": [0]}
+        # predicted outcome for counterfactual. This is given by removing the influence
+        # of the interaction term between the group and the post_treatment variable
+        self.x_pred_counterfactual = (
+            self.data
+            # just the treated group
+            .query(f"{self.group_variable_name} == @self.treated")
+            # just the treatment period(s)
+            .query("post_treatment == True")
+            # drop the outcome variable
+            .drop(self.outcome_variable_name, axis=1)
+            # There may be multiple units per time point; keep one row per time point
+            .groupby(self.time_variable_name)
+            .first()
+            .reset_index()
         )
         assert not self.x_pred_counterfactual.empty
         (new_x,) = build_design_matrices(
-            [self._x_design_info], self.x_pred_counterfactual
+            [self._x_design_info], self.x_pred_counterfactual, return_type="dataframe"
         )
+        # INTERVENTION: set the interaction term between the group and the
+        # post_treatment variable to zero. This is the counterfactual.
+        for i, label in enumerate(self.labels):
+            if "post_treatment" in label and self.group_variable_name in label:
+                new_x.iloc[:, i] = 0
         self.y_pred_counterfactual = self.model.predict(np.asarray(new_x))
 
         # calculate causal impact
+        # This is the coefficient on the interaction term
+        # TODO: THIS IS NOT YET CORRECT
         self.causal_impact = self.y_pred_treatment[1] - self.y_pred_counterfactual[0]
 
     def plot(self):
diff --git a/causalpy/tests/test_integration_pymc_examples.py b/causalpy/tests/test_integration_pymc_examples.py
@@ -11,7 +11,7 @@ def test_did():
     df = cp.load_data("did")
     result = cp.pymc_experiments.DifferenceInDifferences(
         df,
-        formula="y ~ 1 + group + t + group:post_treatment",
+        formula="y ~ 1 + group*post_treatment",
         time_variable_name="t",
         group_variable_name="group",
         treated=1,
@@ -37,6 +37,10 @@ def test_did_banks_simple():
         .groupby("year")
         .median()
     )
+    # SET TREATMENT TIME TO ZERO =========
+    df.index = df.index - treatment_time
+    treatment_time = 0
+    # ====================================
     df.reset_index(level=0, inplace=True)
     df_long = pd.melt(
         df,
@@ -45,16 +49,18 @@ def test_did_banks_simple():
         var_name="district",
         value_name="bib",
     ).sort_values("year")
-    df_long["district"] = df_long["district"].astype("category")
     df_long["unit"] = df_long["district"]
     df_long["post_treatment"] = df_long.year >= treatment_time
+    df_long = df_long.replace({"district": {"Sixth District": 1, "Eighth District": 0}})
+
     result = cp.pymc_experiments.DifferenceInDifferences(
-        df_long[df_long.year.isin([1930, 1931])],
-        formula="bib ~ 1 + district + year + district:post_treatment",
+        # df_long[df_long.year.isin([1930, 1931])],
+        df_long[df_long.year.isin([-0.5, 0.5])],
+        formula="bib ~ 1 + district * post_treatment",
         time_variable_name="year",
         group_variable_name="district",
-        treated="Sixth District",
-        untreated="Eighth District",
+        treated=1,
+        untreated=0,
         model=cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs),
     )
     assert isinstance(df, pd.DataFrame)
@@ -73,6 +79,10 @@ def test_did_banks_multi():
         .groupby("year")
         .median()
     )
+    # SET TREATMENT TIME TO ZERO =========
+    df.index = df.index - treatment_time
+    treatment_time = 0
+    # ====================================
     df.reset_index(level=0, inplace=True)
     df_long = pd.melt(
         df,
@@ -81,16 +91,17 @@ def test_did_banks_multi():
         var_name="district",
         value_name="bib",
     ).sort_values("year")
-    df_long["district"] = df_long["district"].astype("category")
     df_long["unit"] = df_long["district"]
     df_long["post_treatment"] = df_long.year >= treatment_time
+    df_long = df_long.replace({"district": {"Sixth District": 1, "Eighth District": 0}})
+
     result = cp.pymc_experiments.DifferenceInDifferences(
         df_long,
-        formula="bib ~ 1 + district + year + district:post_treatment",
+        formula="bib ~ 1 + year + district + post_treatment + district:post_treatment",
         time_variable_name="year",
         group_variable_name="district",
-        treated="Sixth District",
-        untreated="Eighth District",
+        treated=1,
+        untreated=0,
         model=cp.pymc_models.LinearRegression(sample_kwargs=sample_kwargs),
     )
     assert isinstance(df, pd.DataFrame)
diff --git a/causalpy/tests/test_integration_skl_examples.py b/causalpy/tests/test_integration_skl_examples.py
@@ -12,8 +12,11 @@ def test_did():
     data = cp.load_data("did")
     result = cp.skl_experiments.DifferenceInDifferences(
         data,
-        formula="y ~ 1 + group + t + group:post_treatment",
+        formula="y ~ 1 + group*post_treatment",
         time_variable_name="t",
+        group_variable_name="group",
+        treated=1,
+        untreated=0,
         model=LinearRegression(),
     )
     assert isinstance(data, pd.DataFrame)
diff --git a/docs/notebooks/did_pymc_banks.ipynb b/docs/notebooks/did_pymc_banks.ipynb
diff --git a/docs/notebooks/did_skl.ipynb b/docs/notebooks/did_skl.ipynb
@@ -25,33 +25,55 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data = cp.load_data(\"did\")"
+    "%load_ext autoreload\n",
+    "%autoreload 2"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "data = cp.load_data(\"did\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/benjamv/git/CausalPy/causalpy/skl_experiments.py:259: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`\n",
+      "  new_x.iloc[:, i] = 0\n"
+     ]
+    }
+   ],
    "source": [
     "result = cp.skl_experiments.DifferenceInDifferences(\n",
     "    data,\n",
-    "    formula=\"y ~ 1 + group + t + group:post_treatment\",\n",
+    "    formula=\"y ~ 1 + group*post_treatment\",\n",
     "    time_variable_name=\"t\",\n",
+    "    group_variable_name=\"group\",\n",
+    "    treated=1,\n",
+    "    untreated=0,\n",
     "    model=LinearRegression(),\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/benjamv/opt/mambaforge/envs/CausalPy/lib/python3.10/site-packages/numpy/core/_methods.py:164: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "/Users/benjamv/mambaforge/envs/CausalPy/lib/python3.10/site-packages/numpy/core/_methods.py:164: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
       "  arr = asanyarray(a)\n"
      ]
     },
@@ -94,12 +116,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.8"
   },
   "orig_nbformat": 4,
   "vscode": {
    "interpreter": {
-    "hash": "02f5385db19eab57520277c5168790c7855381ee953bdbb5c89c321e1f17586e"
+    "hash": "46d31859cc45aa26a1223a391e7cf3023d69984b498bed11e66c690302b7e251"
    }
   }
  },