@@ -51,67 +51,75 @@ def discretize(data, bin_edges, right_continuous=False):
     x_new = bin_edges[idx]
     return x_new
 
+def _get_tolerance(v):
+    """Determine numerical tolerance due to limited precision of floating-point values.
+
+    Returns a maximum possible difference that can be considered negligible, to account
+    for roundoff error. Only relevant for floating-point values.
+    """
+    if issubclass(v.dtype.type, numpy.floating):
+        return numpy.abs(v) * numpy.finfo(v.dtype).eps
+    return 0  # assuming it's an integer dtype
+
 def bin1d_vec(p, bins, tol=None, right_continuous=False):
     """Efficient implementation of binning routine on 1D Cartesian Grid.
 
-    Returns the indices of the points into bins. Bins are inclusive on the lower bound
+    Bins are inclusive on the lower bound
     and exclusive on the upper bound. In the case where a point does not fall within the bins a -1
     will be returned. The last bin extends to infinity when right_continuous is set as true.
 
+    To correctly bin points that are practically on a bin edge, this function accounts for the
+    limited precision of floating-point numbers (the roundoff error) with a numerical tolerance.
+    If the provided points were subject to some floating-point operations after loading or
+    generating them, the roundoff error increases (which is not accounted for) and requires
+    overwriting the `tol` argument.
+
     Args:
-        p (array-like): Point(s) to be placed into b
-        bins (array-like): bins to considering for binning, must be monotonically increasing
-        right_continuous (bool): if true, consider last bin extending to infinity
+        p (array-like): Point(s) to be placed into bins.
+        bins (array-like): bin edges; must be sorted (monotonically increasing).
+        tol (float): overwrite numerical tolerance; by default, it is determined automatically
+            from the points' dtype to account for the roundoff error.
+        right_continuous (bool): if true, consider last bin extending to infinity.
 
     Returns:
-        idx (array-like): indexes hashed into grid
+        numpy.ndarray: indices for the points corresponding to the bins.
 
     Raises:
         ValueError:
     """
-    bins = numpy.array(bins)
-    p = numpy.array(p)
-    a0 = numpy.min(bins)
-    # if user supplies only a single bin, do 2 things: 1) fix right continuous to true, and use of h is arbitrary
+    bins = numpy.asarray(bins)
+    p = numpy.asarray(p)
+
+    # if not np.all(bins[:-1] <= bins[1:]):  # check for sorted bins, which is a requirement
+    #     raise ValueError("Bin edges are not sorted.")  # (pyCSEP always passes sorted bins)
+    a0 = bins[0]
     if bins.size == 1:
+        # for a single bin, set `right_continuous` to true; h is now arbitrary
         right_continuous = True
-        h = 1
+        h = 1.
     else:
         h = bins[1] - bins[0]
 
-    a0_tol = numpy.abs(a0) * numpy.finfo(numpy.float64).eps
-    h_tol = numpy.abs(h) * numpy.finfo(numpy.float64).eps
-    p_tol = numpy.abs(p) * numpy.finfo(numpy.float64).eps
-
-    # absolute tolerance
-    if tol is None:
-        idx = numpy.floor((p + (p_tol + a0_tol) - a0) / (h - h_tol))
-    else:
-        idx = numpy.floor((p + (tol + a0_tol) - a0) / (h - h_tol))
     if h < 0:
         raise ValueError("grid spacing must be positive and monotonically increasing.")
-    # account for floating point uncertainties by considering extreme case
+
+    # account for floating-point precision
+    a0_tol = _get_tolerance(a0)
+    h_tol = a0_tol  # must be based on the *involved* numbers
+    p_tol = tol or _get_tolerance(p)
+
+    idx = numpy.floor((p - a0 + p_tol + a0_tol) / (h - h_tol))
+    idx = numpy.asarray(idx)  # ensure idx is an array
 
     if right_continuous:
         # set upper bin index to last
-        try:
-            idx[(idx < 0)] = -1
-            idx[(idx >= len(bins) - 1)] = len(bins) - 1
-        except TypeError:
-            if idx >= len(bins) - 1:
-                idx = len(bins) - 1
-            if idx < 0:
-                idx = -1
+        idx[idx < 0] = -1
+        idx[idx >= len(bins) - 1] = len(bins) - 1
     else:
-        try:
-            idx[((idx < 0) | (idx >= len(bins)))] = -1
-        except TypeError:
-            if idx < 0 or idx >= len(bins):
-                idx = -1
-    try:
-        idx = idx.astype(numpy.int64)
-    except AttributeError:
-        idx = int(idx)
+        idx[(idx < 0) | (idx >= len(bins))] = -1
+    idx = idx.astype(numpy.int64)
     return idx
 
 def _compute_likelihood(gridded_data, apprx_rate_density, expected_cond_count, n_obs):
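For context, a minimal sketch of how the tolerance-aware binning above behaves for a point that sits on a bin edge only up to roundoff error. It is not part of the diff; the import path is assumed from pyCSEP's module layout, and the values are illustrative.

import numpy

from csep.utils.calc import bin1d_vec  # assumed import path, not confirmed by this diff

bins = numpy.arange(0.0, 2.0, 0.1)  # bin edges 0.0, 0.1, ..., 1.9 (spacing h = 0.1)
p = 0.1 + 0.7                       # 0.7999999999999999, just below the 0.8 edge after roundoff

idx = bin1d_vec(p, bins)
print(idx, bins[idx])               # 8 0.8 -- a plain floor((p - bins[0]) / 0.1) would give 7

# For points with larger accumulated error (beyond the dtype-based tolerance),
# the docstring suggests overriding `tol` explicitly, e.g.:
idx = bin1d_vec(p, bins, tol=1e-12)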
@@ -215,12 +223,13 @@ def cleaner_range(start, end, h):
     Returns:
         bin_edges (numpy.ndarray)
     """
-    # convert to integers to prevent accumulating floating point errors
-    const = 100000
-    start = numpy.floor(const * start)
-    end = numpy.floor(const * end)
-    d = const * h
-    return numpy.arange(start, end + d / 2, d) / const
+    # determine the required scaling to account for the decimal places of the bin edges and the step
+    num_decimals_bins = len(str(float(start)).split('.')[1])
+    scale = max(10 ** num_decimals_bins, 1 / h)
+    start = numpy.round(scale * start)
+    end = numpy.round(scale * end)
+    d = scale * h
+    return numpy.arange(start, end + d / 2, d) / scale
 
 def first_nonnan(arr, axis=0, invalid_val=-1):
     mask = arr == arr
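And a minimal sketch of how the rewritten cleaner_range() picks its scale factor; again not part of the diff. The values mimic CSEP-style magnitude bin edges and the import path is an assumption.

from csep.utils.calc import cleaner_range  # assumed import path, not confirmed by this diff

start, end, h = 3.95, 8.95, 0.1
# str(float(start)) -> '3.95' -> 2 decimal places, so scale = max(10**2, 1/h) = 100 and the
# edges are built as numpy.arange(395, 895 + 5, 10) / 100, avoiding accumulated roundoff.
edges = cleaner_range(start, end, h)
print(edges[:3], edges[-1])  # [3.95 4.05 4.15] 8.95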