Skip to content

Commit 5a51f46

Browse files
authored
Merge pull request #34 from marshallpierce/count-saturates
Count saturates
2 parents 62e531f + 566e2e0 commit 5a51f46

File tree

5 files changed

+377
-27
lines changed

5 files changed

+377
-27
lines changed

benches/record.rs

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#![feature(test)]
2+
3+
extern crate hdrsample;
4+
extern crate rand;
5+
extern crate test;
6+
7+
use hdrsample::*;
8+
use self::rand::Rng;
9+
use self::test::Bencher;
10+
11+
#[bench]
12+
fn record_precalc_random_values_with_1_count_u64(b: &mut Bencher) {
13+
let mut h = Histogram::<u64>::new_with_bounds(1, u64::max_value(), 3).unwrap();
14+
let mut indices = Vec::<u64>::new();
15+
let mut rng = rand::weak_rng();
16+
17+
// same value approach as record_precalc_random_values_with_max_count_u64 so that
18+
// they are comparable
19+
20+
for _ in 0..1000_000 {
21+
indices.push(rng.gen());
22+
}
23+
24+
b.iter(|| {
25+
for i in indices.iter() {
26+
// u64 counts, won't overflow
27+
h.record(*i).unwrap()
28+
}
29+
})
30+
}
31+
32+
#[bench]
33+
fn record_precalc_random_values_with_max_count_u64(b: &mut Bencher) {
34+
let mut h = Histogram::<u64>::new_with_bounds(1, u64::max_value(), 3).unwrap();
35+
let mut indices = Vec::<u64>::new();
36+
let mut rng = rand::weak_rng();
37+
38+
// store values in an array and re-use so we can be sure to hit the overflow case
39+
40+
for _ in 0..1000_000 {
41+
let r = rng.gen();
42+
indices.push(r);
43+
h.record_n(r, u64::max_value()).unwrap();
44+
}
45+
46+
b.iter(|| {
47+
for i in indices.iter() {
48+
// all values are already at u64
49+
h.record(*i).unwrap()
50+
}
51+
})
52+
}
53+
54+
#[bench]
55+
fn record_random_values_with_1_count_u64(b: &mut Bencher) {
56+
let mut h = Histogram::<u64>::new_with_bounds(1, u64::max_value(), 3).unwrap();
57+
let mut rng = rand::weak_rng();
58+
59+
// This should be *slower* than the benchmarks above where we pre-calculate the values
60+
// outside of the hot loop. If it isn't, then those measurements are likely spurious.
61+
62+
b.iter(|| {
63+
for _ in 0..1000_000 {
64+
h.record(rng.gen()).unwrap()
65+
}
66+
})
67+
}
68+
69+
#[bench]
70+
fn add_precalc_random_value_1_count_same_dimensions_u64(b: &mut Bencher) {
71+
do_add_benchmark(b, 1, || { Histogram::<u64>::new_with_bounds(1, u64::max_value(), 3).unwrap() })
72+
}
73+
74+
#[bench]
75+
fn add_precalc_random_value_max_count_same_dimensions_u64(b: &mut Bencher) {
76+
do_add_benchmark(b, u64::max_value(), || { Histogram::<u64>::new_with_bounds(1, u64::max_value(), 3).unwrap() })
77+
}
78+
79+
#[bench]
80+
fn add_precalc_random_value_1_count_different_precision_u64(b: &mut Bencher) {
81+
do_add_benchmark(b, 1, || { Histogram::<u64>::new_with_bounds(1, u64::max_value(), 2).unwrap() })
82+
}
83+
84+
#[bench]
85+
fn add_precalc_random_value_max_count_different_precision_u64(b: &mut Bencher) {
86+
do_add_benchmark(b, u64::max_value(), || { Histogram::<u64>::new_with_bounds(1, u64::max_value(), 2).unwrap() })
87+
}
88+
89+
#[bench]
90+
fn subtract_precalc_random_value_1_count_same_dimensions_u64(b: &mut Bencher) {
91+
do_subtract_benchmark(b, 1, || { Histogram::<u64>::new_with_bounds(1, u64::max_value(), 3).unwrap() })
92+
}
93+
94+
// can't do subtraction with max count because it will error after the first iteration because
95+
// subtrahend count exceeds minuend. Similarly, when subtracting a different precision, the same
96+
// issue happens because the smallest equivalent value in the lower precision can map to a different
97+
// bucket in higher precision so we cannot easily pre-populate.
98+
99+
fn do_subtract_benchmark<F: Fn() -> Histogram<u64>>(b: &mut Bencher, count_at_each_addend_value: u64, addend_factory: F) {
100+
let mut accum = Histogram::<u64>::new_with_bounds(1, u64::max_value(), 3).unwrap();
101+
let mut subtrahends = Vec::new();
102+
let mut rng = rand::weak_rng();
103+
104+
for _ in 0..1000 {
105+
let mut h = addend_factory();
106+
107+
for _ in 0..1000 {
108+
let r = rng.gen();
109+
h.record_n(r, count_at_each_addend_value).unwrap();
110+
// ensure there's a count to subtract from
111+
accum.record_n(r, u64::max_value()).unwrap();
112+
}
113+
114+
subtrahends.push(h);
115+
}
116+
117+
b.iter(|| {
118+
for h in subtrahends.iter() {
119+
accum.subtract(h).unwrap();
120+
}
121+
})
122+
}
123+
124+
fn do_add_benchmark<F: Fn() -> Histogram<u64>>(b: &mut Bencher, count_at_each_addend_value: u64, addend_factory: F) {
125+
let mut accum = Histogram::<u64>::new_with_bounds(1, u64::max_value(), 3).unwrap();
126+
let mut addends = Vec::new();
127+
let mut rng = rand::weak_rng();
128+
129+
for _ in 0..1000 {
130+
let mut h = addend_factory();
131+
132+
for _ in 0..1000 {
133+
let r = rng.gen();
134+
h.record_n(r, count_at_each_addend_value).unwrap();
135+
}
136+
137+
addends.push(h);
138+
}
139+
140+
b.iter(|| {
141+
for h in addends.iter() {
142+
accum.add(h).unwrap();
143+
}
144+
})
145+
}

src/iterators/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ impl<'a, T: 'a, P> Iterator for HistogramIterator<'a, T, P>
159159
}
160160

161161
// maintain total count so we can yield percentiles
162+
// TODO overflow
162163
self.total_count_to_index = self.total_count_to_index + count.to_u64().unwrap();
163164

164165
// make sure we don't add this index again

src/lib.rs

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -171,12 +171,14 @@ const ORIGINAL_MAX: u64 = 0;
171171
/// into an integer count. Partial ordering is used for thresholding, also usually in the context
172172
/// of percentiles.
173173
pub trait Counter
174-
: num::Num + num::ToPrimitive + num::FromPrimitive + Copy + PartialOrd<Self> {
174+
: num::Num + num::ToPrimitive + num::FromPrimitive + num::Saturating + num::CheckedSub
175+
+ num::CheckedAdd + Copy + PartialOrd<Self>{
175176
}
176177

177178
// auto-implement marker trait
178179
impl<T> Counter for T
179-
where T: num::Num + num::ToPrimitive + num::FromPrimitive + Copy + PartialOrd<T>
180+
where T: num::Num + num::ToPrimitive + num::FromPrimitive + num::Saturating + num::CheckedSub
181+
+ num::CheckedAdd + Copy + PartialOrd<T>
180182
{
181183
}
182184

@@ -455,13 +457,14 @@ impl<T: Counter> Histogram<T> {
455457
for i in 0..source.len() {
456458
let other_count = source[i];
457459
if other_count != T::zero() {
458-
self[i] = self[i] + other_count;
460+
self[i] = self[i].saturating_add(other_count);
459461
// TODO unwrapping .to_u64()
460-
observed_other_total_count = observed_other_total_count + other_count.to_u64().unwrap();
462+
observed_other_total_count = observed_other_total_count
463+
.saturating_add(other_count.to_u64().unwrap());
461464
}
462465
}
463466

464-
self.total_count = self.total_count + observed_other_total_count;
467+
self.total_count = self.total_count.saturating_add(observed_other_total_count);
465468
let mx = source.max();
466469
if mx > self.max() {
467470
self.update_max(mx);
@@ -552,6 +555,9 @@ impl<T: Counter> Histogram<T> {
552555
if other_count != T::zero() {
553556
let other_value = other.value_for(i);
554557
if self.count_at(other_value).unwrap() < other_count {
558+
// TODO Perhaps we should saturating sub here? Or expose some form of
559+
// pluggability so users could choose to error or saturate? Both seem useful.
560+
// It's also sort of inconsistent with overflow, which now saturates.
555561
return Err(SubtractionError::SubtrahendCountExceedsMinuendCount);
556562
}
557563
self.alter_n(other_value, other_count, false).expect("value should fit by now");
@@ -777,11 +783,19 @@ impl<T: Counter> Histogram<T> {
777783
}
778784

779785
fn alter_n(&mut self, value: u64, count: T, add: bool) -> Result<(), ()> {
786+
// TODO consider split out addition and subtraction cases; this isn't gaining much by
787+
// unifying since we have to test all the cases anyway, and the TODO below is marking a case
788+
// that might well be impossible but seems needed because of the (possibly false) symmetry
789+
// with addition
790+
791+
// add=false is used by subtract(), which should have already aborted if underflow was
792+
// possible
793+
780794
let success = if let Some(c) = self.mut_at(value) {
781795
if add {
782-
*c = *c + count;
796+
*c = (*c).saturating_add(count);
783797
} else {
784-
*c = *c - count;
798+
*c = (*c).checked_sub(&count).expect("count underflow on subtraction");
785799
}
786800
true
787801
} else {
@@ -798,9 +812,14 @@ impl<T: Counter> Histogram<T> {
798812
{
799813
let c = self.mut_at(value).expect("value should fit after resize");
800814
if add {
801-
*c = *c + count;
815+
// after resize, should be no possibility of overflow because this is a new slot
816+
*c = (*c).checked_add(&count).expect("count overflow after resize");
802817
} else {
803-
*c = *c - count;
818+
// TODO Not sure this code path can ever be hit: if subtraction requires minuend
819+
// count to exceed subtrahend count for a given value, we shouldn't ever need
820+
// to resize to subtract.
821+
// Anyway, at the very least, we know it shouldn't underflow.
822+
*c = (*c).checked_sub(&count).expect("count underflow after resize");
804823
}
805824
}
806825

@@ -809,9 +828,10 @@ impl<T: Counter> Histogram<T> {
809828

810829
self.update_min_max(value);
811830
if add {
812-
self.total_count = self.total_count + count.to_u64().unwrap();
831+
self.total_count = self.total_count.saturating_add(count.to_u64().unwrap());
813832
} else {
814-
self.total_count = self.total_count - count.to_u64().unwrap();
833+
self.total_count = self.total_count.checked_sub(count.to_u64().unwrap())
834+
.expect("total count underflow on subtraction");
815835
}
816836
Ok(())
817837
}
@@ -1113,6 +1133,7 @@ impl<T: Counter> Histogram<T> {
11131133

11141134
let mut total_to_current_index: u64 = 0;
11151135
for i in 0..self.len() {
1136+
// TODO overflow
11161137
total_to_current_index = total_to_current_index + self[i].to_u64().unwrap();
11171138
if total_to_current_index >= count_at_percentile {
11181139
let value_at_index = self.value_for(i);
@@ -1139,6 +1160,7 @@ impl<T: Counter> Histogram<T> {
11391160
}
11401161

11411162
let target_index = cmp::min(self.index_for(value), self.last());
1163+
// TODO overflow
11421164
let total_to_current_index =
11431165
(0..(target_index + 1)).map(|i| self[i]).fold(T::zero(), |t, v| t + v);
11441166
100.0 * total_to_current_index.to_f64().unwrap() / self.total_count as f64
@@ -1157,6 +1179,7 @@ impl<T: Counter> Histogram<T> {
11571179
pub fn count_between(&self, low: u64, high: u64) -> Result<T, ()> {
11581180
let low_index = self.index_for(low);
11591181
let high_index = cmp::min(self.index_for(high), self.last());
1182+
// TODO overflow
11601183
Ok((low_index..(high_index + 1)).map(|i| self[i]).fold(T::zero(), |t, v| t + v))
11611184
}
11621185

@@ -1229,6 +1252,7 @@ impl<T: Counter> Histogram<T> {
12291252
///
12301253
/// Note that the return value is capped at `u64::max_value()`.
12311254
pub fn median_equivalent(&self, value: u64) -> u64 {
1255+
// TODO isn't this just saturating?
12321256
match self.lowest_equivalent(value).overflowing_add(self.equivalent_range(value) >> 1) {
12331257
(_, of) if of => u64::max_value(),
12341258
(v, _) => v,
@@ -1444,8 +1468,7 @@ impl <T: Counter> RestatState<T> {
14441468
fn on_nonzero_count(&mut self, index: usize, count: T) {
14451469
// TODO don't unwrap here; weird user Counter types may not work.
14461470
// Fix Counter types to just be u8-64?
1447-
// TODO this can wrap, but not sure there's much we can do about that. Saturating add maybe?
1448-
self.total_count += count.to_u64().unwrap();
1471+
self.total_count = self.total_count.saturating_add(count.to_u64().unwrap());
14491472

14501473
self.max_index = Some(index);
14511474

@@ -1533,6 +1556,7 @@ impl<T: Counter, F: Counter> PartialEq<Histogram<F>> for Histogram<T>
15331556
if self.min_nz() != other.min_nz() {
15341557
return false;
15351558
}
1559+
// TODO may panic? Does the above guarantee that the other array is at least as long?
15361560
(0..self.len()).all(|i| self[i] == other[i])
15371561
}
15381562
}

src/serialization/tests.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ fn serialize_roundtrip_all_zeros() {
7070

7171
assert_eq!(orig.total_count, deser.total_count);
7272
assert_eq!(orig.counts, deser.counts);
73-
7473
}
7574

7675
#[test]

0 commit comments

Comments
 (0)