fix: random sampling of consecutive rows always selected the first row of each sequence for sequential data (#120)

mplatzer · web-flow · commit bb8b91089edb · 2025-03-12T14:03:50.000+01:00
diff --git a/mostlyai/qa/_sampling.py b/mostlyai/qa/_sampling.py
@@ -152,7 +152,7 @@ def sample_two_consecutive_rows(
     seq_lens = df.groupby(col_by).size()
 
     # make random draw from [0, seq_len-1]
-    sel_idx = (seq_lens - 1) * np.random.random(len(seq_lens)).astype("int")
+    sel_idx = ((seq_lens - 1) * np.random.random(len(seq_lens))).astype("int")
     sel_idx_df = pd.Series(sel_idx).to_frame("__IDX").reset_index()
 
     # filter to randomly selected indices
diff --git a/tests/unit/test_accuracy.py b/tests/unit/test_accuracy.py
@@ -340,8 +340,14 @@ def test_plot_univariate_distribution_numeric():
 
 
 def test_sample_two_consecutive_rows():
-    df = pd.DataFrame({"id": [1, 1, 1, 1, 2, 2, 2, 3, 3, 4], "x": [1, 2, 3, 5, 1, 2, 3, 1, 2, 1]})
+    df = pd.DataFrame(
+        {
+            "id": [1] * 1000 + [2] * 500 + [3] * 2 + [4] * 1,
+            "x": list(range(1000)) + list(range(500)) + list(range(2)) + list(range(1)),
+        }
+    )
     first_rows, second_rows = sample_two_consecutive_rows(df=df, col_by="id")
+    assert not (first_rows["x"] == 0).all()
     assert len(first_rows) == 4
     assert len(second_rows) == 3
     assert (first_rows["x"][0:2] == second_rows["x"][0:2] - 1).all()