
Commit ed50129

chore: drop FAISS; cap compute (AUC, Contours, SeqLen, Subsets); handle empty tgt_data (#194)
Parent: 324e3b4

6 files changed: +47 −81 lines


mostlyai/qa/_distances.py

Lines changed: 9 additions & 23 deletions

```diff
@@ -13,11 +13,12 @@
 # limitations under the License.
 
 import logging
-import platform
 import time
 import numpy as np
 import networkx as nx
 import xxhash
+from sklearn.neighbors import NearestNeighbors
+from joblib import cpu_count
 
 from mostlyai.qa._common import (
     CHARTS_COLORS,
@@ -42,22 +43,9 @@ def calculate_dcrs_nndrs(
     t0 = time.time()
     data = data[data[:, 0].argsort()]  # sort data by first dimension to enforce deterministic results
 
-    if platform.system() == "Linux":
-        # use FAISS on Linux for best performance
-        import faiss  # type: ignore
-
-        index = faiss.IndexFlatL2(data.shape[1])
-        index.add(data)
-        dcrs, _ = index.search(query, 2)
-        dcrs = np.sqrt(dcrs)  # FAISS returns squared distances
-    else:
-        # use sklearn as a fallback on non-Linux systems to avoid segfaults; these occurred when using QA as part of SDK
-        from sklearn.neighbors import NearestNeighbors  # type: ignore
-        from joblib import cpu_count  # type: ignore
-
-        index = NearestNeighbors(n_neighbors=2, algorithm="auto", metric="l2", n_jobs=min(16, max(1, cpu_count() - 1)))
-        index.fit(data)
-        dcrs, _ = index.kneighbors(query)
+    index = NearestNeighbors(n_neighbors=2, algorithm="auto", metric="l2", n_jobs=min(16, max(1, cpu_count() - 1)))
+    index.fit(data)
+    dcrs, _ = index.kneighbors(query)
     dcr = dcrs[:, 0]
     nndr = (dcrs[:, 0] + 1e-8) / (dcrs[:, 1] + 1e-8)
     _LOG.info(f"calculated DCRs for {data.shape=} and {query.shape=} in {time.time() - t0:.2f}s")
```
```diff
@@ -85,14 +73,12 @@ def calculate_distances(
     groups = []
     # check all columns together
     groups += [np.arange(ori_embeds.shape[1])]
-    # check subsets of correlated columns together
+    # check 3 correlated subsets of columns
     if ori_embeds.shape[1] > 10:
-        k = max(3, ori_embeds.shape[1] // 10)
-        groups += split_columns_into_correlated_groups(ori_embeds, k=k)
-    # check random subsets of columns
+        groups += split_columns_into_correlated_groups(ori_embeds, k=3)
+    # check 3 random subsets of columns
     if ori_embeds.shape[1] > 10:
-        k = max(3, ori_embeds.shape[1] // 10)
-        groups += split_columns_into_random_groups(ori_embeds, k=3)
+        groups += split_columns_into_random_groups(ori_embeds, k=3)
     dcr_share = 0.0
     nndr_ratio = 1.0
     for columns in groups:
```
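
The subset checks are now fixed at three correlated and three random column groups, rather than scaling the group count with embedding width via `max(3, n_cols // 10)`; this caps the number of distance passes on wide embeddings. A toy sketch of the capped group construction; the two `split_columns_into_*` helpers are repo-internal, so a hypothetical random splitter stands in for both:

```python
# Sketch of the capped group assembly: one full-width group, plus 3 + 3
# column subsets when the embedding has more than 10 columns.
import numpy as np

def split_random(n_cols: int, k: int, rng: np.random.Generator) -> list[np.ndarray]:
    # stand-in for split_columns_into_correlated_groups / _random_groups
    return [np.asarray(g) for g in np.array_split(rng.permutation(n_cols), k)]

ori_embeds = np.random.rand(100, 32)
rng = np.random.default_rng(0)

groups = [np.arange(ori_embeds.shape[1])]  # check all columns together
if ori_embeds.shape[1] > 10:
    groups += split_random(ori_embeds.shape[1], k=3, rng=rng)  # "correlated" subsets
    groups += split_random(ori_embeds.shape[1], k=3, rng=rng)  # random subsets
print(len(groups))  # 7: one full group + 3 + 3 subsets
```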

mostlyai/qa/_sampling.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -300,11 +300,13 @@ def prepare_data_for_embeddings(
 
     # cap to Q95 sequence length of original to avoid excessive samples per group distorting results
     if tgt_context_key is not None:
+        cap_sequence_length = 100
         q95_sequence_length = trn_tgt_data.groupby(key).size().quantile(0.95)
-        syn_tgt_data = syn_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=q95_sequence_length)
-        trn_tgt_data = trn_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=q95_sequence_length)
+        max_sequence_length = min(q95_sequence_length, cap_sequence_length)
+        syn_tgt_data = syn_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=max_sequence_length)
+        trn_tgt_data = trn_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=max_sequence_length)
         hol_tgt_data = (
-            hol_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=q95_sequence_length) if hol else None
+            hol_tgt_data.groupby(key).sample(frac=1).groupby(key).head(n=max_sequence_length) if hol else None
         )
 
     # drop key from data as its not relevant for embeddings
```
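
Per-group sequence lengths are now capped at `min(Q95, 100)`, so a handful of extremely long sequences can no longer inflate the embedding sample. A toy illustration with hypothetical `key`/`value` columns:

```python
# Toy illustration of the sequence-length cap: shuffle rows within each
# group, then keep at most min(Q95 of group sizes, 100) rows per group.
import pandas as pd

df = pd.DataFrame({"key": [1] * 5 + [2] * 200, "value": range(205)})

cap_sequence_length = 100
q95_sequence_length = df.groupby("key").size().quantile(0.95)       # ~190 here
max_sequence_length = int(min(q95_sequence_length, cap_sequence_length))  # 100
capped = df.groupby("key").sample(frac=1).groupby("key").head(n=max_sequence_length)
print(capped.groupby("key").size())  # group 1 keeps 5 rows, group 2 is cut to 100
```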

mostlyai/qa/_similarity.py

Lines changed: 9 additions & 0 deletions

```diff
@@ -69,6 +69,10 @@ def calculate_mean_auc(embeds1, embeds2):
     for a ML model to discriminate between two embedding arrays.
     """
 
+    # limit the number of samples to 10000
+    embeds1 = embeds1[:10000]
+    embeds2 = embeds2[:10000]
+
     # create labels for the data
     labels1 = np.zeros(embeds1.shape[0])
     labels2 = np.ones(embeds2.shape[0])
@@ -195,6 +199,11 @@ def plot_store_similarity_contours(
     if trn_embeds.shape[1] < 3:
         return
 
+    # limit the number of samples to 10000
+    syn_embeds = syn_embeds[:10000]
+    trn_embeds = trn_embeds[:10000]
+    hol_embeds = hol_embeds[:10000] if hol_embeds is not None else None
+
     # perform PCA on trn embeddings
     pca_model = PCA(n_components=3)
     pca_model.fit(trn_embeds)
```
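
Both the discriminator AUC and the similarity contour plots now operate on at most 10,000 rows per embedding array, bounding model fitting and PCA cost. As a hedged mini-version of the AUC check (the actual classifier and cross-validation scheme in `calculate_mean_auc` may differ from this sketch): label the two sets 0/1, fit a model, and measure how well it tells them apart; an AUC near 0.5 means the sets are hard to distinguish:

```python
# Hedged mini-version of the capped discriminator AUC check.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def mean_discriminator_auc(embeds1: np.ndarray, embeds2: np.ndarray) -> float:
    # limit the number of samples to 10000 (the cap introduced in this commit)
    embeds1, embeds2 = embeds1[:10_000], embeds2[:10_000]
    X = np.vstack([embeds1, embeds2])
    y = np.concatenate([np.zeros(len(embeds1)), np.ones(len(embeds2))])
    aucs = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=3, scoring="roc_auc")
    return float(aucs.mean())

print(mean_discriminator_auc(np.random.rand(500, 8), np.random.rand(500, 8)))  # ~0.5
```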

mostlyai/qa/reporting.py

Lines changed: 11 additions & 7 deletions

```diff
@@ -181,6 +181,8 @@ def report(
         check_min_sample_size(trn_sample_size, 90, "training")
         if hol_tgt_data is not None:
             check_min_sample_size(hol_sample_size, 10, "holdout")
+        if trn_tgt_data.shape[1] == 0 or syn_tgt_data.shape[1] == 0:
+            raise PrerequisiteNotMetError("Provided data has no columns.")
     except PrerequisiteNotMetError as err:
         _LOG.info(err)
         statistics.mark_early_exit()
```
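
The new guard routes zero-column inputs into the same early-exit path that undersized samples already take, instead of letting them fail deeper in the pipeline. A self-contained sketch of the check; `PrerequisiteNotMetError` is stubbed here since it lives inside the library:

```python
# Sketch of the empty-data guard: a DataFrame with zero columns raises
# immediately rather than failing later during report generation.
import pandas as pd

class PrerequisiteNotMetError(Exception):  # stub for the library's exception
    pass

def check_has_columns(trn_tgt_data: pd.DataFrame, syn_tgt_data: pd.DataFrame) -> None:
    if trn_tgt_data.shape[1] == 0 or syn_tgt_data.shape[1] == 0:
        raise PrerequisiteNotMetError("Provided data has no columns.")

check_has_columns(pd.DataFrame({"a": [1]}), pd.DataFrame({"b": [2]}))  # passes
try:
    check_has_columns(pd.DataFrame(), pd.DataFrame({"b": [2]}))
except PrerequisiteNotMetError as err:
    print(err)  # Provided data has no columns.
```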
```diff
@@ -205,7 +207,6 @@ def report(
     else:
         setup = "1:1"
 
-    _LOG.info("prepare training data for accuracy")
     trn = prepare_data_for_accuracy(
         df_tgt=trn_tgt_data,
         df_ctx=trn_ctx_data,
@@ -214,8 +215,8 @@ def report(
         max_sample_size=max_sample_size_accuracy,
         setup=setup,
     )
+    _LOG.info(f"prepared training data for accuracy: {trn.shape}")
     if hol_tgt_data is not None:
-        _LOG.info("prepare holdout data for accuracy")
         hol = prepare_data_for_accuracy(
             df_tgt=hol_tgt_data,
             df_ctx=hol_ctx_data,
@@ -225,13 +226,13 @@ def report(
             setup=setup,
             ori_dtypes=trn.dtypes.to_dict(),
         )
+        _LOG.info(f"prepared holdout data for accuracy: {hol.shape}")
         ori = pd.concat([trn, hol], axis=0, ignore_index=True)
     else:
         hol = None
         ori = trn
     progress.update(completed=5, total=100)
 
-    _LOG.info("prepare synthetic data for accuracy")
     syn = prepare_data_for_accuracy(
         df_tgt=syn_tgt_data,
         df_ctx=syn_ctx_data,
@@ -241,29 +242,29 @@ def report(
         setup=setup,
         ori_dtypes=trn.dtypes.to_dict(),
     )
+    _LOG.info(f"prepared synthetic data for accuracy: {syn.shape}")
     progress.update(completed=10, total=100)
 
     # do coherence analysis only if there are non-fk columns in the target data
     do_coherence = setup == "1:N" and len(trn_tgt_data.columns) > 1
     if do_coherence:
-        _LOG.info("prepare original data for coherence started")
         ori_coh, ori_coh_bins = prepare_data_for_coherence(
             df_tgt=pd.concat([trn_tgt_data, hol_tgt_data]) if hol_tgt_data is not None else trn_tgt_data,
             tgt_context_key=tgt_context_key,
             max_sample_size=max_sample_size_coherence,
         )
-        _LOG.info("prepare synthetic data for coherence started")
+        _LOG.info(f"prepared original data for coherence: {ori_coh.shape}")
         syn_coh, _ = prepare_data_for_coherence(
             df_tgt=syn_tgt_data,
             tgt_context_key=tgt_context_key,
             bins=ori_coh_bins,
             max_sample_size=max_sample_size_coherence,
         )
-        _LOG.info("store bins used for training data for coherence")
+        _LOG.info(f"prepared synthetic data for coherence: {syn_coh.shape}")
         statistics.store_coherence_bins(bins=ori_coh_bins)
+        _LOG.info("stored bins used for training data for coherence")
     progress.update(completed=15, total=100)
 
-    _LOG.info("calculate embeddings")
     syn_embeds, trn_embeds, hol_embeds = prepare_data_for_embeddings(
         syn_tgt_data=syn_tgt_data,
         trn_tgt_data=trn_tgt_data,
@@ -275,6 +276,9 @@ def report(
         tgt_context_key=tgt_context_key,
         max_sample_size=max_sample_size_embeddings,
     )
+    _LOG.info(
+        f"calculated embeddings: syn={syn_embeds.shape}, trn={trn_embeds.shape}, hol={hol_embeds.shape if hol_embeds is not None else None}"
+    )
     progress.update(completed=20, total=100)
 
     ## 1. ACCURACY ##
```
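
The remaining hunks shift logging from announcing a step beforehand to confirming it afterwards, including the shape of what was produced, so the logs double as a sanity check on sample sizes. A minimal sketch of the pattern; `prepare` is a stand-in for `prepare_data_for_accuracy` and friends:

```python
# Sketch of the log-after-with-shape pattern used throughout report().
import logging
import pandas as pd

logging.basicConfig(level=logging.INFO)
_LOG = logging.getLogger(__name__)

def prepare(df: pd.DataFrame) -> pd.DataFrame:  # stand-in preparation step
    return df.dropna()

trn = prepare(pd.DataFrame({"a": [1, None, 3]}))
_LOG.info(f"prepared training data for accuracy: {trn.shape}")  # logs (2, 1)
```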

pyproject.toml

Lines changed: 0 additions & 1 deletion

```diff
@@ -39,7 +39,6 @@ dependencies = [
     "accelerate>=1.5.0",
     "torch>=2.6.0",
     "xxhash>=3.5.0",
-    "faiss-cpu>=1.7.0",
 ]
 
 [project.urls]
```
