Merge pull request #270 from mherrmann3/fix-spatial-binning-precision

pabloitu · web-flow · commit 2085b31e8264 · 2025-04-03T16:37:26.000+02:00
Fix and optimize `bin1d_vec` + extend units tests + fix imprecise bin edge creation
diff --git a/csep/core/catalogs.py b/csep/core/catalogs.py
@@ -395,7 +395,7 @@ def to_dataframe(self, with_datetime=False):
     def get_mag_idx(self):
         """ Return magnitude index from region magnitudes """
         try:
-            return bin1d_vec(self.get_magnitudes(), self.region.magnitudes, tol=0.00001, right_continuous=True)
+            return bin1d_vec(self.get_magnitudes(), self.region.magnitudes, right_continuous=True)
         except AttributeError:
             raise CSEPCatalogException("Cannot return magnitude index without self.region.magnitudes")
 
@@ -699,16 +699,21 @@ def spatial_event_probability(self):
         event_flag[idx] = 1
         return event_flag
 
-    def magnitude_counts(self, mag_bins=None, tol=0.00001, retbins=False):
-        """ Computes the count of events within mag_bins
-
+    def magnitude_counts(self, mag_bins=None, tol=None, retbins=False):
+        """ Computes the count of events within magnitude bins
 
         Args:
-            mag_bins: uses csep.utils.constants.CSEP_MW_BINS as default magnitude bins
-            retbins (bool): if this is true, return the bins used
+            mag_bins (array-like): magnitude bin edges, by default tries to use magnitude bin edges
+                                   associated with region, otherwise :data:`csep.utils.constants.CSEP_MW_BINS`
+            tol (float): overwrite numerical tolerance, by default determined automatically from the
+                         magnitudes' dtype to account for the limited precision of floating-point values.
+                         Only necessary to specify if the magnitudes were subject to some
+                         floating-point operations after loading or generating them
+                         (increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`).
+            retbins (bool): if true, return the used bins in a tuple together with the counts.
 
         Returns:
-            numpy.ndarray: showing the counts of hte events in each magnitude bin
+            numpy.ndarray: events counts in each magnitude bin
         """
         # todo: keep track of events that are ignored
         if mag_bins is None:
@@ -734,18 +739,24 @@ def magnitude_counts(self, mag_bins=None, tol=0.00001, retbins=False):
         else:
             return out
 
-    def spatial_magnitude_counts(self, mag_bins=None, tol=0.00001):
-        """ Return counts of events in space-magnitude region.
+    def spatial_magnitude_counts(self, mag_bins=None, tol=None):
+        """ Return counts of events in space-magnitude bins.
 
-        We figure out the index of the polygons and create a map that relates the spatial coordinate in the
-        Cartesian grid with the polygon in region.
+        It figures out the index of the polygons and maps the spatial coordinates in the Cartesian
+        grid with the polygon in region.
 
         Args:
-            mag_bins (list, numpy.array): magnitude bins (optional), if empty tries to use magnitude bins associated with region
-            tol (float): tolerance for comparisons within magnitude bins
+            mag_bins (array-like): magnitude bin edges (optional), by default uses magnitude bin edges
+                                   associated with region.
+            tol (float): overwrite numerical tolerance, by default determined automatically from the
+                         magnitudes' dtype to account for the limited precision of floating-point values.
+                         Only necessary to specify if the magnitudes were subject to some
+                         floating-point operations after loading or generating them
+                         (increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`).
 
         Returns:
-            output: unnormalized event count in each bin, 1d ndarray where index corresponds to midpoints
+            numpy.ndarray: unnormalized event count in each space-magnitude bin (2d, with indices
+                           corresponding to spatial midpoints and magnitude bin, respectively)
 
         """
         # make sure region is specified with catalog
diff --git a/csep/core/forecasts.py b/csep/core/forecasts.py
@@ -213,16 +213,21 @@ def magnitude_counts(self):
         """ Returns counts of events in magnitude bins """
         return numpy.sum(self.data, axis=0)
 
-    def get_magnitude_index(self, mags, tol=0.00001):
+    def get_magnitude_index(self, mags, tol=None):
         """ Returns the indices into the magnitude bins of selected magnitudes
 
         Note: the right-most bin is treated as extending to infinity.
 
         Args:
-            mags (array-like): list of magnitudes
+            mags (array-like): magnitudes bin edges.
+            tol (float): overwrite numerical tolerance, by default determined automatically from the
+                         magnitudes' dtype to account for the limited precision of floating-point values.
+                         Only necessary to specify if the magnitudes were subject to some
+                         floating-point operations after loading or generating them
+                         (increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`).
 
         Returns:
-            idm (array-like): indices corresponding to mags
+            numpy.ndarray: indices corresponding to the magnitude bins
 
         Raises:
             ValueError
diff --git a/csep/core/regions.py b/csep/core/regions.py
@@ -313,12 +313,7 @@ def magnitude_bins(start_magnitude, end_magnitude, dmw):
     Returns:
         bin_edges (numpy.ndarray)
     """
-    # convert to integers to prevent accumulating floating point errors
-    const = 10000
-    start = numpy.floor(const * start_magnitude)
-    end = numpy.floor(const * end_magnitude)
-    d = const * dmw
-    return numpy.arange(start, end + d / 2, d) / const
+    return cleaner_range(start_magnitude, end_magnitude, dmw)
 
 def create_space_magnitude_region(region, magnitudes):
     """Simple wrapper to create space-magnitude region """
@@ -495,7 +490,7 @@ def compute_vertices(origin_points, dh, tol=numpy.finfo(float).eps):
     """
     return list(map(lambda x: compute_vertex(x, dh, tol=tol), origin_points))
 
-def _bin_catalog_spatio_magnitude_counts(lons, lats, mags, n_poly, mask, idx_map, binx, biny, mag_bins, tol=0.00001):
+def _bin_catalog_spatio_magnitude_counts(lons, lats, mags, n_poly, mask, idx_map, binx, biny, mag_bins, tol=None):
     """
     Returns a list of event counts as ndarray with shape (n_poly, n_cat) where each value
     represents the event counts within the polygon.
@@ -625,8 +620,8 @@ def get_index_of(self, lons, lats):
         Returns:
             idx: ndarray-like
         """
-        idx = bin1d_vec(numpy.array(lons), self.xs)
-        idy = bin1d_vec(numpy.array(lats), self.ys)
+        idx = bin1d_vec(lons, self.xs)
+        idy = bin1d_vec(lats, self.ys)
         if numpy.any(idx == -1) or numpy.any(idy == -1):
             raise ValueError("at least one lon and lat pair contain values that are outside of the valid region.")
         if numpy.any(self.bbox_mask[idy, idx] == 1):
@@ -1061,7 +1056,7 @@ def get_cell_area(self):
         self.cell_area = cell_area
         return self.cell_area
 
-    def get_index_of(self, lons, lats): 
+    def get_index_of(self, lons, lats):
         """ Returns the index of lons, lats in self.polygons
 
         Args:
@@ -1182,7 +1177,7 @@ def _get_spatial_magnitude_counts(self, catalog, mag_bins=None):
         out = numpy.zeros([len(self.quadkeys), len(mag_bins)])
 
         idx_loc = self.get_index_of(lon, lat)
-        idx_mag = bin1d_vec(mag, mag_bins, tol=0.00001, right_continuous=True)
+        idx_mag = bin1d_vec(mag, mag_bins, right_continuous=True)
 
         numpy.add.at(out, (idx_loc, idx_mag), 1)
 
diff --git a/csep/utils/calc.py b/csep/utils/calc.py
@@ -51,67 +51,75 @@ def discretize(data, bin_edges, right_continuous=False):
     x_new = bin_edges[idx]
     return x_new
 
+def _get_tolerance(v):
+    """Determine numerical tolerance due to limited precision of floating-point values.
+
+    ... to account for roundoff error.
+
+    In other words, returns a maximum possible difference that can be considered negligible.
+    Only relevant for floating-point values.
+    """
+    if issubclass(v.dtype.type, numpy.floating):
+        return numpy.abs(v) * numpy.finfo(v.dtype).eps
+    return 0  # assuming it's an int
+
 def bin1d_vec(p, bins, tol=None, right_continuous=False):
     """Efficient implementation of binning routine on 1D Cartesian Grid.
 
-    Returns the indices of the points into bins. Bins are inclusive on the lower bound
+    Bins are inclusive on the lower bound
     and exclusive on the upper bound. In the case where a point does not fall within the bins a -1
     will be returned. The last bin extends to infinity when right_continuous is set as true.
 
+    To correctly bin points that are practically on a bin edge, this function accounts for the
+    limited precision of floating-point numbers (the roundoff error) with a numerical tolerance.
+    If the provided points were subject to some floating-point operations after loading or
+    generating them, the roundoff error increases (which is not accounted for) and requires
+    overwriting the `tol` argument.
+
     Args:
-        p (array-like): Point(s) to be placed into b
-        bins (array-like): bins to considering for binning, must be monotonically increasing
-        right_continuous (bool): if true, consider last bin extending to infinity
+        p (array-like): Point(s) to be placed into bins.
+        bins (array-like): bin edges; must be sorted (monotonically increasing)
+        tol (float): overwrite numerical tolerance, by default determined automatically from the
+                     points' dtype to account for the roundoff error.
+        right_continuous (bool): if true, consider last bin extending to infinity.
 
     Returns:
-        idx (array-like): indexes hashed into grid
+        numpy.ndarray: indices for the points corresponding to the bins.
 
     Raises:
         ValueError:
     """
-    bins = numpy.array(bins)
-    p = numpy.array(p)
-    a0 = numpy.min(bins)
-    # if user supplies only a single bin, do 2 things: 1) fix right continuous to true, and use of h is arbitrary
+    bins = numpy.asarray(bins)
+    p = numpy.asarray(p)
+
+    # if not np.all(bins[:-1] <= bins[1:]):  # check for sorted bins, which is a requirement
+    #     raise ValueError("Bin edges are not sorted.")  # (pyCSEP always passes sorted bins)
+    a0 = bins[0]
     if bins.size == 1:
+        # for a single bin, set `right_continuous` to true; h is now arbitrary
         right_continuous = True
-        h = 1
+        h = 1.
     else:
         h = bins[1] - bins[0]
 
-    a0_tol = numpy.abs(a0) * numpy.finfo(numpy.float64).eps
-    h_tol = numpy.abs(h) * numpy.finfo(numpy.float64).eps
-    p_tol = numpy.abs(p) * numpy.finfo(numpy.float64).eps
-
-    # absolute tolerance
-    if tol is None:
-        idx = numpy.floor((p + (p_tol + a0_tol) - a0) / (h - h_tol))
-    else:
-        idx = numpy.floor((p + (tol + a0_tol) - a0) / (h - h_tol))
     if h < 0:
         raise ValueError("grid spacing must be positive and monotonically increasing.")
-    # account for floating point uncertainties by considering extreme case
+
+    # account for floating point precision
+    a0_tol = _get_tolerance(a0)
+    h_tol = a0_tol  # must be based on *involved* numbers
+    p_tol = tol or _get_tolerance(p)
+
+    idx = numpy.floor((p - a0 + p_tol + a0_tol) / (h - h_tol))
+    idx = numpy.asarray(idx)  # assure idx is an array
 
     if right_continuous:
         # set upper bin index to last
-        try:
-            idx[(idx < 0)] = -1
-            idx[(idx >= len(bins) - 1)] = len(bins) - 1
-        except TypeError:
-            if idx >= len(bins) - 1:
-                idx = len(bins) - 1
-            if idx < 0:
-                idx = -1
+        idx[idx < 0] = -1
+        idx[idx >= len(bins) - 1] = len(bins) - 1
     else:
-        try:
-            idx[((idx < 0) | (idx >= len(bins)))] = -1
-        except TypeError:
-            if idx < 0 or idx >= len(bins):
-                idx = -1
-    try:
-        idx = idx.astype(numpy.int64)
-    except AttributeError:
-        idx = int(idx)
+        idx[(idx < 0) | (idx >= len(bins))] = -1
+    idx = idx.astype(numpy.int64)
     return idx
 
 def _compute_likelihood(gridded_data, apprx_rate_density, expected_cond_count, n_obs):
@@ -215,12 +223,13 @@ def cleaner_range(start, end, h):
     Returns:
         bin_edges (numpy.ndarray)
     """
-    # convert to integers to prevent accumulating floating point errors
-    const = 100000
-    start = numpy.floor(const * start)
-    end = numpy.floor(const * end)
-    d = const * h
-    return numpy.arange(start, end + d / 2, d) / const
+    # determine required scaling to account for decimal places of bin edges and stepping
+    num_decimals_bins = len(str(float(start)).split('.')[1])
+    scale = max(10**num_decimals_bins, 1 / h)
+    start = numpy.round(scale * start)
+    end = numpy.round(scale * end)
+    d = scale * h
+    return numpy.arange(start, end + d / 2, d) / scale
 
 def first_nonnan(arr, axis=0, invalid_val=-1):
     mask = arr==arr
diff --git a/tests/test_calc.py b/tests/test_calc.py
@@ -106,6 +106,13 @@ def test_bin1d_single_bin1(self):
         expected = [-1, -1, 0, 0, 0, 0, 0, -1]
         self.assertListEqual(test.tolist(), expected)
 
+    def test_bin1d_single_bin2(self):
+        data = [4.0]
+        bin_edges = [3.0]
+        test = bin1d_vec(data, bin_edges)
+        expected = [0]
+        self.assertListEqual(test.tolist(), expected)
+
     def test_upper_limit_right_continuous(self):
         data = [40, 40, 40]
         bin_edges = [0, 10, 20, 30]
@@ -134,18 +141,27 @@ def test_less_and_greater_than(self):
         expected = [-1, 3, -1]
         self.assertListEqual(test.tolist(), expected)
 
-    def test_scalar_outside(self):
-        from csep.utils.calc import bin1d_vec
-        mbins = numpy.arange(5.95, 9, 0.1)  # This gives bins from 5.95 to 8.95
-        idx = bin1d_vec(5.95, mbins, tol=0.00001, right_continuous=True)
-        self.assertEqual(idx, 0)
+    def test_scalar_inside(self):
+        mbins = numpy.arange(5.95, 9, 0.1)  # (magnitude) bins from 5.95 to 8.95
 
-        idx = bin1d_vec(6, mbins, tol=0.00001, right_continuous=True)  # This would give 0: Which is fine.
-        self.assertEqual(idx, 0)
+        for i, m in enumerate(mbins):
+            idx = bin1d_vec(m, mbins, right_continuous=True)  # corner cases
+            self.assertEqual(idx, i)
 
-        idx = bin1d_vec(5, mbins, tol=0.00001, right_continuous=True)
-        self.assertEqual(idx, -1)
+            idx = bin1d_vec(m + 0.05, mbins, right_continuous=True)  # center bins
+            self.assertEqual(idx, i)
 
-        idx = bin1d_vec(4, mbins, tol=0.00001, right_continuous=True)
+            idx = bin1d_vec(m + 0.099999999, mbins, right_continuous=True)  # end of bins
+            self.assertEqual(idx, i)
+
+        idx = bin1d_vec(10, mbins, right_continuous=True)  # larger than last bin edge
+        self.assertEqual(idx, mbins.size - 1)
+
+    def test_scalar_outside(self):
+        mbins = numpy.arange(5.95, 9, 0.1)  # (magnitude) bins from 5.95 to 8.95
+
+        idx = bin1d_vec(5, mbins, right_continuous=True)
         self.assertEqual(idx, -1)
 
+        idx = bin1d_vec(4, mbins, right_continuous=True)
+        self.assertEqual(idx, -1)
diff --git a/tests/test_regions.py b/tests/test_regions.py