From 8987a08c9995fdbd29f0fdc73ed4afea49768704 Mon Sep 17 00:00:00 2001 From: microslaw Date: Mon, 9 Jun 2025 21:35:25 +0000 Subject: [PATCH 01/14] add documentation --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/generic.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 03a386708323d..0e2de66728e89 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -890,6 +890,7 @@ Other - Bug in :meth:`DataFrame.query` where using duplicate column names led to a ``TypeError``. (:issue:`59950`) - Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`) - Bug in :meth:`DataFrame.query` which raised an exception when querying integer column names using backticks. (:issue:`60494`) +- Bug in :meth:`DataFrame.sample` where using ``replace=False`` with weights such that ``(n * max(weights) / sum(weights)) > 1`` returned biased results; this now raises ``ValueError``. (:issue:`61516`) - Bug in :meth:`DataFrame.shift` where passing a ``freq`` on a DataFrame with no columns did not shift the index correctly. (:issue:`60102`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.sort_values` where sorting by a column explicitly named ``None`` raised a ``KeyError`` instead of sorting by the column as expected. (:issue:`61512`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8aae4609b1833..ec5e105b24020 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5815,6 +5815,8 @@ def sample( If weights do not sum to 1, they will be normalized to sum to 1. 
Missing values in the weights column will be treated as zero. Infinite values not allowed. + When ``replace=False``, ``(n * max(weights) / sum(weights)) > 1`` is not + allowed, in order to avoid biased results. random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. From 88a0bc3421a7d25fc5f584d4cec3a9ef959b91ed Mon Sep 17 00:00:00 2001 From: microslaw Date: Mon, 9 Jun 2025 21:35:52 +0000 Subject: [PATCH 02/14] implement the exception --- pandas/core/sample.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/sample.py b/pandas/core/sample.py index 4f12563e3c5e2..463be2f41e47f 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -150,6 +150,13 @@ def sample( else: raise ValueError("Invalid weights: weights sum to zero") + is_max_weight_dominating = size * max(weights) > 1 + if (is_max_weight_dominating and not replace): + raise ValueError( + "Invalid weights: If `replace`=False," + " total unit probabilities have to be less than 1" + ) + return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( np.intp, copy=False ) From 732de7d8261d03f6942dd17c03693340c51bb758 Mon Sep 17 00:00:00 2001 From: microslaw Date: Mon, 9 Jun 2025 21:36:40 +0000 Subject: [PATCH 03/14] add the test --- pandas/tests/frame/methods/test_sample.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index a9d56cbfd2b46..a5586d7573644 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -137,6 +137,29 @@ def test_sample_inf_weights(self, obj): with pytest.raises(ValueError, match=msg): obj.sample(n=3, weights=weights_with_ninf) + def test_sample_unit_probabilities_raises(self, obj): + # GH#61516 + 
high_variance_weights = [1] * 10 + high_variance_weights[0] = 100 + msg = ( + "Invalid weights: If `replace`=False," + " total unit probabilities have to be less than 1" + ) + with pytest.raises(ValueError, match=msg): + obj.sample(n=2, weights=high_variance_weights, replace=False) + + # edge case, n*max(weights)/sum(weights) == 1 + edge_variance_weights = [1] * 10 + edge_variance_weights[0] = 9 + # should not raise + obj.sample(n=2, weights=edge_variance_weights, replace=False) + + low_variance_weights = [1] * 10 + low_variance_weights[0] = 8 + # should not raise + obj.sample(n=2, weights=low_variance_weights, replace=False) + + def test_sample_zero_weights(self, obj): # All zeros raises errors From b67f58a2ec248e105282fd44ecc0d80a86033b93 Mon Sep 17 00:00:00 2001 From: microslaw Date: Mon, 9 Jun 2025 21:37:06 +0000 Subject: [PATCH 04/14] remove impossible test scenario --- pandas/tests/frame/methods/test_sample.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index a5586d7573644..80b472795a413 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -113,9 +113,6 @@ def test_sample_invalid_weight_lengths(self, obj): with pytest.raises(ValueError, match=msg): obj.sample(n=3, weights=[0.5] * 11) - with pytest.raises(ValueError, match="Fewer non-zero entries in p than size"): - obj.sample(n=4, weights=Series([0, 0, 0.2])) - def test_sample_negative_weights(self, obj): # Check won't accept negative weights bad_weights = [-0.1] * 10 From 0f86e2f8e0888c8568998c5b13fa3eaf99c94600 Mon Sep 17 00:00:00 2001 From: microslaw Date: Tue, 10 Jun 2025 08:00:34 +0000 Subject: [PATCH 05/14] lint fixes --- pandas/core/sample.py | 4 ++-- pandas/tests/frame/methods/test_sample.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/sample.py b/pandas/core/sample.py index 463be2f41e47f..bb45b7b84c63a 100644 --- 
a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -153,8 +153,8 @@ def sample( is_max_weight_dominating = size * max(weights) > 1 if (is_max_weight_dominating and not replace): raise ValueError( - "Invalid weights: If `replace`=False," - " total unit probabilities have to be less than 1" + "Invalid weights: If `replace`=False, " + "total unit probabilities have to be less than 1" ) return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 80b472795a413..d92f02ec84e63 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -139,8 +139,8 @@ def test_sample_unit_probabilities_raises(self, obj): high_variance_weights = [1] * 10 high_variance_weights[0] = 100 msg = ( - "Invalid weights: If `replace`=False," - " total unit probabilities have to be less than 1" + "Invalid weights: If `replace`=False, " + "total unit probabilities have to be less than 1" ) with pytest.raises(ValueError, match=msg): obj.sample(n=2, weights=high_variance_weights, replace=False) From 8d3d11cb402ef140b3d6bf1ce22af62350e7a3e0 Mon Sep 17 00:00:00 2001 From: microslaw Date: Tue, 10 Jun 2025 08:43:53 +0000 Subject: [PATCH 06/14] lint fixes 2 --- pandas/core/sample.py | 2 +- pandas/tests/frame/methods/test_sample.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/sample.py b/pandas/core/sample.py index bb45b7b84c63a..10eb6ac221918 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -151,7 +151,7 @@ def sample( raise ValueError("Invalid weights: weights sum to zero") is_max_weight_dominating = size * max(weights) > 1 - if (is_max_weight_dominating and not replace): + if is_max_weight_dominating and not replace: raise ValueError( "Invalid weights: If `replace`=False, " "total unit probabilities have to be less than 1" diff --git a/pandas/tests/frame/methods/test_sample.py 
b/pandas/tests/frame/methods/test_sample.py index d92f02ec84e63..a7a916da14bc4 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -156,7 +156,6 @@ def test_sample_unit_probabilities_raises(self, obj): # should not raise obj.sample(n=2, weights=low_variance_weights, replace=False) - def test_sample_zero_weights(self, obj): # All zeros raises errors From fb89d136214860a1d9a67bc195a92446cb8025a7 Mon Sep 17 00:00:00 2001 From: microslaw Date: Tue, 10 Jun 2025 09:11:36 +0000 Subject: [PATCH 07/14] minor fixes --- doc/source/user_guide/indexing.rst | 1 + pandas/core/sample.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 605f9501c5b23..c13e76203cac4 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -712,6 +712,7 @@ as a string. .. ipython:: python + 1/0 df2 = pd.DataFrame({'col1': [9, 8, 7, 6], 'weight_column': [0.5, 0.4, 0.1, 0]}) df2.sample(n=3, weights='weight_column') diff --git a/pandas/core/sample.py b/pandas/core/sample.py index 10eb6ac221918..3ad78c1333dc0 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -150,7 +150,7 @@ def sample( else: raise ValueError("Invalid weights: weights sum to zero") - is_max_weight_dominating = size * max(weights) > 1 + is_max_weight_dominating = size * weights.max() > 1 if is_max_weight_dominating and not replace: raise ValueError( "Invalid weights: If `replace`=False, " From 2d161dd63c7b54beb10221cb1634927d43446e2c Mon Sep 17 00:00:00 2001 From: microslaw Date: Tue, 10 Jun 2025 23:14:55 +0000 Subject: [PATCH 08/14] fix docs --- doc/source/user_guide/indexing.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index c13e76203cac4..66168dbabb459 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst 
@@ -712,10 +712,9 @@ as a string. .. ipython:: python - 1/0 df2 = pd.DataFrame({'col1': [9, 8, 7, 6], 'weight_column': [0.5, 0.4, 0.1, 0]}) - df2.sample(n=3, weights='weight_column') + df2.sample(n=2, weights='weight_column') ``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. From 3dfe5f179ad6930f9a61d105dd64c2ef4f7c2756 Mon Sep 17 00:00:00 2001 From: microslaw Date: Wed, 11 Jun 2025 17:18:10 +0000 Subject: [PATCH 09/14] mypy fix --- pandas/core/sample.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/sample.py b/pandas/core/sample.py index 3ad78c1333dc0..d44409d08abf4 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -150,6 +150,7 @@ def sample( else: raise ValueError("Invalid weights: weights sum to zero") + if weights is not None: is_max_weight_dominating = size * weights.max() > 1 if is_max_weight_dominating and not replace: raise ValueError( From 8c1dae343448d7a8a43246b69f7b00f04b89eff8 Mon Sep 17 00:00:00 2001 From: microslaw Date: Wed, 11 Jun 2025 17:19:50 +0000 Subject: [PATCH 10/14] test indexing.rst --- doc/source/user_guide/indexing.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 66168dbabb459..b6da17eebfdc2 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -712,9 +712,7 @@ as a string. .. ipython:: python - df2 = pd.DataFrame({'col1': [9, 8, 7, 6], - 'weight_column': [0.5, 0.4, 0.1, 0]}) - df2.sample(n=2, weights='weight_column') +print("to fix") ``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. 
From 6a4042ea7587d79a5fc0d5b890d8386a9fdfad49 Mon Sep 17 00:00:00 2001 From: microslaw Date: Wed, 11 Jun 2025 18:16:25 +0000 Subject: [PATCH 11/14] doc fix --- doc/source/user_guide/indexing.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index b6da17eebfdc2..458beed4b4b3b 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -712,14 +712,15 @@ as a string. .. ipython:: python -print("to fix") + df2 = pd.DataFrame({'col1': [9, 8, 7, 6], + 'weight_column': [0.5, 0.4, 0.1, 0]}) + df2.sample(n=2, weights='weight_column') ``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. .. ipython:: python - df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]}) - df3.sample(n=1, axis=1) +print("a") Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a NumPy RandomState object. From 26ec599e394f366c231be4c17cf3e63ec22ae85b Mon Sep 17 00:00:00 2001 From: microslaw Date: Wed, 11 Jun 2025 18:36:28 +0000 Subject: [PATCH 12/14] indexing tab fix --- doc/source/user_guide/indexing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 458beed4b4b3b..6964c28727c68 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -720,7 +720,7 @@ as a string. .. ipython:: python -print("a") + print("a") Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a NumPy RandomState object. 
From ffae63e8e75f7afa463ea156bc9a83aa47e85d53 Mon Sep 17 00:00:00 2001 From: microslaw Date: Wed, 11 Jun 2025 21:53:02 +0000 Subject: [PATCH 13/14] doc fix --- doc/source/user_guide/indexing.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 6964c28727c68..dda891f458c9d 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -718,12 +718,6 @@ as a string. ``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. -.. ipython:: python - - print("a") - -Finally, one can also set a seed for ``sample``'s random number generator using the ``random_state`` argument, which will accept either an integer (as a seed) or a NumPy RandomState object. - .. ipython:: python df4 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]}) From 2aacee738ffc12f9e7424ee3e31a636fd71cde3d Mon Sep 17 00:00:00 2001 From: microslaw Date: Thu, 12 Jun 2025 01:04:58 +0200 Subject: [PATCH 14/14] doc test fix --- doc/source/user_guide/indexing.rst | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index dda891f458c9d..9928d048f03da 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -710,25 +710,6 @@ When applied to a DataFrame, you can use a column of the DataFrame as sampling w (provided you are sampling rows and not columns) by simply passing the name of the column as a string. -.. ipython:: python - - df2 = pd.DataFrame({'col1': [9, 8, 7, 6], - 'weight_column': [0.5, 0.4, 0.1, 0]}) - df2.sample(n=2, weights='weight_column') - -``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. - -.. ipython:: python - - df4 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]}) - - # With a given seed, the sample will always draw the same rows. 
- df4.sample(n=2, random_state=2) - df4.sample(n=2, random_state=2) - - - -Setting with enlargement ------------------------ The ``.loc/[]`` operations can perform enlargement when setting a non-existent key for that axis.