feat: reproducibility (#197)

lukaszkolodziejczyk · web-flow · commit 7058be30a65b · 2025-05-14T13:06:29.000+02:00
diff --git a/mostlyai/qa/_common.py b/mostlyai/qa/_common.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import logging
+import os
+import struct
 from typing import Protocol
 
 import pandas as pd
@@ -122,3 +124,21 @@ def determine_data_size(
         return len(tgt_keys)
     else:
         return len(tgt_data)
+
+
+def set_random_state(random_state: int | None = None):  # seed `random` and `numpy.random` with one value
+    def get_random_int_from_os() -> int:
+        # 32-bit, cryptographically secure random int from os ("I" = unsigned int, 4 bytes from os.urandom)
+        return int(struct.unpack("I", os.urandom(4))[0])
+
+    if random_state is not None:  # only a caller-supplied seed is logged; a generated one is not
+        _LOG.info(f"Global random_state set to `{random_state}`")
+
+    if random_state is None:  # no seed supplied: draw one from the OS instead
+        random_state = get_random_int_from_os()
+
+    import random  # function-scope imports; both modules are seeded below
+    import numpy as np
+
+    random.seed(random_state)  # stdlib module-level generator
+    np.random.seed(random_state)  # NumPy legacy global generator, same seed
diff --git a/mostlyai/qa/reporting.py b/mostlyai/qa/reporting.py
@@ -62,6 +62,7 @@
     TGT_COLUMN_PREFIX,
     REPORT_CREDITS,
     ProgressCallbackWrapper,
+    set_random_state,
 )
 from mostlyai.qa._filesystem import Statistics, TemporaryWorkspace
 
@@ -87,6 +88,7 @@ def report(
     max_sample_size_embeddings: int | None = None,
     statistics_path: str | Path | None = None,
     update_progress: ProgressCallback | None = None,
+    random_state: int | None = None,
 ) -> tuple[Path, ModelMetrics | None]:
     """
     Generate an HTML report and metrics for assessing synthetic data quality.
@@ -121,12 +123,15 @@ def report(
         max_sample_size_embeddings: The maximum sample size for embedding calculations.
         statistics_path: The path of where to store the statistics to be used by `report_from_statistics`
         update_progress: The progress callback.
+        random_state: Seed for the random number generators.
 
     Returns:
         The path to the generated HTML report.
         Metrics instance with accuracy, similarity, and distances metrics.
     """
 
+    set_random_state(random_state)
+
     if syn_ctx_data is not None:
         if ctx_primary_key is None:
             raise ValueError("If syn_ctx_data is provided, then ctx_primary_key must also be provided.")
diff --git a/mostlyai/qa/reporting_from_statistics.py b/mostlyai/qa/reporting_from_statistics.py
@@ -33,6 +33,7 @@
     determine_data_size,
     REPORT_CREDITS,
     ProgressCallbackWrapper,
+    set_random_state,
 )
 from mostlyai.qa._filesystem import Statistics, TemporaryWorkspace
 
@@ -53,6 +54,7 @@ def report_from_statistics(
     max_sample_size_accuracy: int | None = None,
     max_sample_size_coherence: int | None = None,
     update_progress: ProgressCallback | None = None,
+    random_state: int | None = None,
 ) -> Path:
     """
     Generate an HTML report based on previously generated statistics and newly provided synthetic data samples.
@@ -70,11 +72,14 @@ def report_from_statistics(
         max_sample_size_accuracy: The maximum sample size for accuracy calculations.
         max_sample_size_coherence: The maximum sample size for coherence calculations.
         update_progress: The progress callback.
+        random_state: Seed for the random number generators.
 
     Returns:
         The path to the generated HTML report.
     """
 
+    set_random_state(random_state)
+
     with (
         TemporaryWorkspace() as workspace,
         ProgressCallbackWrapper(update_progress) as progress,