Skip to content

Commit 2085b31

Browse files
authored
Merge pull request #270 from mherrmann3/fix-spatial-binning-precision
Fix and optimize `bin1d_vec` + extend units tests + fix imprecise bin edge creation
2 parents 1315e8e + bde4f87 commit 2085b31

File tree

6 files changed

+195
-82
lines changed

6 files changed

+195
-82
lines changed

csep/core/catalogs.py

+25-14
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,7 @@ def to_dataframe(self, with_datetime=False):
395395
def get_mag_idx(self):
396396
""" Return magnitude index from region magnitudes """
397397
try:
398-
return bin1d_vec(self.get_magnitudes(), self.region.magnitudes, tol=0.00001, right_continuous=True)
398+
return bin1d_vec(self.get_magnitudes(), self.region.magnitudes, right_continuous=True)
399399
except AttributeError:
400400
raise CSEPCatalogException("Cannot return magnitude index without self.region.magnitudes")
401401

@@ -699,16 +699,21 @@ def spatial_event_probability(self):
699699
event_flag[idx] = 1
700700
return event_flag
701701

702-
def magnitude_counts(self, mag_bins=None, tol=0.00001, retbins=False):
703-
""" Computes the count of events within mag_bins
704-
702+
def magnitude_counts(self, mag_bins=None, tol=None, retbins=False):
703+
""" Computes the count of events within magnitude bins
705704
706705
Args:
707-
mag_bins: uses csep.utils.constants.CSEP_MW_BINS as default magnitude bins
708-
retbins (bool): if this is true, return the bins used
706+
mag_bins (array-like): magnitude bin edges, by default tries to use magnitude bin edges
707+
associated with region, otherwise :data:`csep.utils.constants.CSEP_MW_BINS`
708+
tol (float): overwrite numerical tolerance, by default determined automatically from the
709+
magnitudes' dtype to account for the limited precision of floating-point values.
710+
Only necessary to specify if the magnitudes were subject to some
711+
floating-point operations after loading or generating them
712+
(increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`).
713+
retbins (bool): if true, return the used bins in a tuple together with the counts.
709714
710715
Returns:
711-
numpy.ndarray: showing the counts of hte events in each magnitude bin
716+
numpy.ndarray: events counts in each magnitude bin
712717
"""
713718
# todo: keep track of events that are ignored
714719
if mag_bins is None:
@@ -734,18 +739,24 @@ def magnitude_counts(self, mag_bins=None, tol=0.00001, retbins=False):
734739
else:
735740
return out
736741

737-
def spatial_magnitude_counts(self, mag_bins=None, tol=0.00001):
738-
""" Return counts of events in space-magnitude region.
742+
def spatial_magnitude_counts(self, mag_bins=None, tol=None):
743+
""" Return counts of events in space-magnitude bins.
739744
740-
We figure out the index of the polygons and create a map that relates the spatial coordinate in the
741-
Cartesian grid with the polygon in region.
745+
It figures out the index of the polygons and maps the spatial coordinates in the Cartesian
746+
grid with the polygon in region.
742747
743748
Args:
744-
mag_bins (list, numpy.array): magnitude bins (optional), if empty tries to use magnitude bins associated with region
745-
tol (float): tolerance for comparisons within magnitude bins
749+
mag_bins (array-like): magnitude bin edges (optional), by default uses magnitude bin edges
750+
associated with region.
751+
tol (float): overwrite numerical tolerance, by default determined automatically from the
752+
magnitudes' dtype to account for the limited precision of floating-point values.
753+
Only necessary to specify if the magnitudes were subject to some
754+
floating-point operations after loading or generating them
755+
(increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`).
746756
747757
Returns:
748-
output: unnormalized event count in each bin, 1d ndarray where index corresponds to midpoints
758+
numpy.ndarray: unnormalized event count in each space-magnitude bin (2d, with indices
759+
corresponding to spatial midpoints and magnitude bin, respectively)
749760
750761
"""
751762
# make sure region is specified with catalog

csep/core/forecasts.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -213,16 +213,21 @@ def magnitude_counts(self):
213213
""" Returns counts of events in magnitude bins """
214214
return numpy.sum(self.data, axis=0)
215215

216-
def get_magnitude_index(self, mags, tol=0.00001):
216+
def get_magnitude_index(self, mags, tol=None):
217217
""" Returns the indices into the magnitude bins of selected magnitudes
218218
219219
Note: the right-most bin is treated as extending to infinity.
220220
221221
Args:
222-
mags (array-like): list of magnitudes
222+
mags (array-like): magnitudes bin edges.
223+
tol (float): overwrite numerical tolerance, by default determined automatically from the
224+
magnitudes' dtype to account for the limited precision of floating-point values.
225+
Only necessary to specify if the magnitudes were subject to some
226+
floating-point operations after loading or generating them
227+
(increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`).
223228
224229
Returns:
225-
idm (array-like): indices corresponding to mags
230+
numpy.ndarray: indices corresponding to the magnitude bins
226231
227232
Raises:
228233
ValueError

csep/core/regions.py

+6-11
Original file line numberDiff line numberDiff line change
@@ -313,12 +313,7 @@ def magnitude_bins(start_magnitude, end_magnitude, dmw):
313313
Returns:
314314
bin_edges (numpy.ndarray)
315315
"""
316-
# convert to integers to prevent accumulating floating point errors
317-
const = 10000
318-
start = numpy.floor(const * start_magnitude)
319-
end = numpy.floor(const * end_magnitude)
320-
d = const * dmw
321-
return numpy.arange(start, end + d / 2, d) / const
316+
return cleaner_range(start_magnitude, end_magnitude, dmw)
322317

323318
def create_space_magnitude_region(region, magnitudes):
324319
"""Simple wrapper to create space-magnitude region """
@@ -495,7 +490,7 @@ def compute_vertices(origin_points, dh, tol=numpy.finfo(float).eps):
495490
"""
496491
return list(map(lambda x: compute_vertex(x, dh, tol=tol), origin_points))
497492

498-
def _bin_catalog_spatio_magnitude_counts(lons, lats, mags, n_poly, mask, idx_map, binx, biny, mag_bins, tol=0.00001):
493+
def _bin_catalog_spatio_magnitude_counts(lons, lats, mags, n_poly, mask, idx_map, binx, biny, mag_bins, tol=None):
499494
"""
500495
Returns a list of event counts as ndarray with shape (n_poly, n_cat) where each value
501496
represents the event counts within the polygon.
@@ -625,8 +620,8 @@ def get_index_of(self, lons, lats):
625620
Returns:
626621
idx: ndarray-like
627622
"""
628-
idx = bin1d_vec(numpy.array(lons), self.xs)
629-
idy = bin1d_vec(numpy.array(lats), self.ys)
623+
idx = bin1d_vec(lons, self.xs)
624+
idy = bin1d_vec(lats, self.ys)
630625
if numpy.any(idx == -1) or numpy.any(idy == -1):
631626
raise ValueError("at least one lon and lat pair contain values that are outside of the valid region.")
632627
if numpy.any(self.bbox_mask[idy, idx] == 1):
@@ -1061,7 +1056,7 @@ def get_cell_area(self):
10611056
self.cell_area = cell_area
10621057
return self.cell_area
10631058

1064-
def get_index_of(self, lons, lats):
1059+
def get_index_of(self, lons, lats):
10651060
""" Returns the index of lons, lats in self.polygons
10661061
10671062
Args:
@@ -1182,7 +1177,7 @@ def _get_spatial_magnitude_counts(self, catalog, mag_bins=None):
11821177
out = numpy.zeros([len(self.quadkeys), len(mag_bins)])
11831178

11841179
idx_loc = self.get_index_of(lon, lat)
1185-
idx_mag = bin1d_vec(mag, mag_bins, tol=0.00001, right_continuous=True)
1180+
idx_mag = bin1d_vec(mag, mag_bins, right_continuous=True)
11861181

11871182
numpy.add.at(out, (idx_loc, idx_mag), 1)
11881183

csep/utils/calc.py

+52-43
Original file line numberDiff line numberDiff line change
@@ -51,67 +51,75 @@ def discretize(data, bin_edges, right_continuous=False):
5151
x_new = bin_edges[idx]
5252
return x_new
5353

54+
def _get_tolerance(v):
55+
"""Determine numerical tolerance due to limited precision of floating-point values.
56+
57+
... to account for roundoff error.
58+
59+
In other words, returns a maximum possible difference that can be considered negligible.
60+
Only relevant for floating-point values.
61+
"""
62+
if issubclass(v.dtype.type, numpy.floating):
63+
return numpy.abs(v) * numpy.finfo(v.dtype).eps
64+
return 0 # assuming it's an int
65+
5466
def bin1d_vec(p, bins, tol=None, right_continuous=False):
5567
"""Efficient implementation of binning routine on 1D Cartesian Grid.
5668
57-
Returns the indices of the points into bins. Bins are inclusive on the lower bound
69+
Bins are inclusive on the lower bound
5870
and exclusive on the upper bound. In the case where a point does not fall within the bins a -1
5971
will be returned. The last bin extends to infinity when right_continuous is set as true.
6072
73+
To correctly bin points that are practically on a bin edge, this function accounts for the
74+
limited precision of floating-point numbers (the roundoff error) with a numerical tolerance.
75+
If the provided points were subject to some floating-point operations after loading or
76+
generating them, the roundoff error increases (which is not accounted for) and requires
77+
overwriting the `tol` argument.
78+
6179
Args:
62-
p (array-like): Point(s) to be placed into b
63-
bins (array-like): bins to considering for binning, must be monotonically increasing
64-
right_continuous (bool): if true, consider last bin extending to infinity
80+
p (array-like): Point(s) to be placed into bins.
81+
bins (array-like): bin edges; must be sorted (monotonically increasing)
82+
tol (float): overwrite numerical tolerance, by default determined automatically from the
83+
points' dtype to account for the roundoff error.
84+
right_continuous (bool): if true, consider last bin extending to infinity.
6585
6686
Returns:
67-
idx (array-like): indexes hashed into grid
87+
numpy.ndarray: indices for the points corresponding to the bins.
6888
6989
Raises:
7090
ValueError:
7191
"""
72-
bins = numpy.array(bins)
73-
p = numpy.array(p)
74-
a0 = numpy.min(bins)
75-
# if user supplies only a single bin, do 2 things: 1) fix right continuous to true, and use of h is arbitrary
92+
bins = numpy.asarray(bins)
93+
p = numpy.asarray(p)
94+
95+
# if not np.all(bins[:-1] <= bins[1:]): # check for sorted bins, which is a requirement
96+
# raise ValueError("Bin edges are not sorted.") # (pyCSEP always passes sorted bins)
97+
a0 = bins[0]
7698
if bins.size == 1:
99+
# for a single bin, set `right_continuous` to true; h is now arbitrary
77100
right_continuous = True
78-
h = 1
101+
h = 1.
79102
else:
80103
h = bins[1] - bins[0]
81104

82-
a0_tol = numpy.abs(a0) * numpy.finfo(numpy.float64).eps
83-
h_tol = numpy.abs(h) * numpy.finfo(numpy.float64).eps
84-
p_tol = numpy.abs(p) * numpy.finfo(numpy.float64).eps
85-
86-
# absolute tolerance
87-
if tol is None:
88-
idx = numpy.floor((p + (p_tol + a0_tol) - a0) / (h - h_tol))
89-
else:
90-
idx = numpy.floor((p + (tol + a0_tol) - a0) / (h - h_tol))
91105
if h < 0:
92106
raise ValueError("grid spacing must be positive and monotonically increasing.")
93-
# account for floating point uncertainties by considering extreme case
107+
108+
# account for floating point precision
109+
a0_tol = _get_tolerance(a0)
110+
h_tol = a0_tol # must be based on *involved* numbers
111+
p_tol = tol or _get_tolerance(p)
112+
113+
idx = numpy.floor((p - a0 + p_tol + a0_tol) / (h - h_tol))
114+
idx = numpy.asarray(idx) # assure idx is an array
94115

95116
if right_continuous:
96117
# set upper bin index to last
97-
try:
98-
idx[(idx < 0)] = -1
99-
idx[(idx >= len(bins) - 1)] = len(bins) - 1
100-
except TypeError:
101-
if idx >= len(bins) - 1:
102-
idx = len(bins) - 1
103-
if idx < 0:
104-
idx = -1
118+
idx[idx < 0] = -1
119+
idx[idx >= len(bins) - 1] = len(bins) - 1
105120
else:
106-
try:
107-
idx[((idx < 0) | (idx >= len(bins)))] = -1
108-
except TypeError:
109-
if idx < 0 or idx >= len(bins):
110-
idx = -1
111-
try:
112-
idx = idx.astype(numpy.int64)
113-
except AttributeError:
114-
idx = int(idx)
121+
idx[(idx < 0) | (idx >= len(bins))] = -1
122+
idx = idx.astype(numpy.int64)
115123
return idx
116124

117125
def _compute_likelihood(gridded_data, apprx_rate_density, expected_cond_count, n_obs):
@@ -215,12 +223,13 @@ def cleaner_range(start, end, h):
215223
Returns:
216224
bin_edges (numpy.ndarray)
217225
"""
218-
# convert to integers to prevent accumulating floating point errors
219-
const = 100000
220-
start = numpy.floor(const * start)
221-
end = numpy.floor(const * end)
222-
d = const * h
223-
return numpy.arange(start, end + d / 2, d) / const
226+
# determine required scaling to account for decimal places of bin edges and stepping
227+
num_decimals_bins = len(str(float(start)).split('.')[1])
228+
scale = max(10**num_decimals_bins, 1 / h)
229+
start = numpy.round(scale * start)
230+
end = numpy.round(scale * end)
231+
d = scale * h
232+
return numpy.arange(start, end + d / 2, d) / scale
224233

225234
def first_nonnan(arr, axis=0, invalid_val=-1):
226235
mask = arr==arr

tests/test_calc.py

+26-10
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,13 @@ def test_bin1d_single_bin1(self):
106106
expected = [-1, -1, 0, 0, 0, 0, 0, -1]
107107
self.assertListEqual(test.tolist(), expected)
108108

109+
def test_bin1d_single_bin2(self):
110+
data = [4.0]
111+
bin_edges = [3.0]
112+
test = bin1d_vec(data, bin_edges)
113+
expected = [0]
114+
self.assertListEqual(test.tolist(), expected)
115+
109116
def test_upper_limit_right_continuous(self):
110117
data = [40, 40, 40]
111118
bin_edges = [0, 10, 20, 30]
@@ -134,18 +141,27 @@ def test_less_and_greater_than(self):
134141
expected = [-1, 3, -1]
135142
self.assertListEqual(test.tolist(), expected)
136143

137-
def test_scalar_outside(self):
138-
from csep.utils.calc import bin1d_vec
139-
mbins = numpy.arange(5.95, 9, 0.1) # This gives bins from 5.95 to 8.95
140-
idx = bin1d_vec(5.95, mbins, tol=0.00001, right_continuous=True)
141-
self.assertEqual(idx, 0)
144+
def test_scalar_inside(self):
145+
mbins = numpy.arange(5.95, 9, 0.1) # (magnitude) bins from 5.95 to 8.95
142146

143-
idx = bin1d_vec(6, mbins, tol=0.00001, right_continuous=True) # This would give 0: Which is fine.
144-
self.assertEqual(idx, 0)
147+
for i, m in enumerate(mbins):
148+
idx = bin1d_vec(m, mbins, right_continuous=True) # corner cases
149+
self.assertEqual(idx, i)
145150

146-
idx = bin1d_vec(5, mbins, tol=0.00001, right_continuous=True)
147-
self.assertEqual(idx, -1)
151+
idx = bin1d_vec(m + 0.05, mbins, right_continuous=True) # center bins
152+
self.assertEqual(idx, i)
148153

149-
idx = bin1d_vec(4, mbins, tol=0.00001, right_continuous=True)
154+
idx = bin1d_vec(m + 0.099999999, mbins, right_continuous=True) # end of bins
155+
self.assertEqual(idx, i)
156+
157+
idx = bin1d_vec(10, mbins, right_continuous=True) # larger than last bin edge
158+
self.assertEqual(idx, mbins.size - 1)
159+
160+
def test_scalar_outside(self):
161+
mbins = numpy.arange(5.95, 9, 0.1) # (magnitude) bins from 5.95 to 8.95
162+
163+
idx = bin1d_vec(5, mbins, right_continuous=True)
150164
self.assertEqual(idx, -1)
151165

166+
idx = bin1d_vec(4, mbins, right_continuous=True)
167+
self.assertEqual(idx, -1)

0 commit comments

Comments
 (0)