Move UniformInt SIMD implementations to new module

dhardy · dhardy · commit f080e472903a · 2022-02-28T10:44:14.000Z
diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
@@ -120,6 +120,7 @@ use crate::distributions::utils::Float;
 mod uniform_float;
 mod uniform_int;
 mod uniform_other;
+#[cfg(feature = "simd_support")] mod uniform_simd;
 
 pub use uniform_float::UniformFloat;
 pub use uniform_int::UniformInt;
diff --git a/src/distributions/uniform/uniform_int.rs b/src/distributions/uniform/uniform_int.rs
@@ -9,7 +9,6 @@
 use super::{SampleBorrow, SampleUniform, UniformSampler};
 use crate::distributions::utils::WideningMultiply;
 use crate::Rng;
-#[cfg(feature = "simd_support")] use packed_simd::*;
 #[cfg(feature = "serde1")] use serde::{Deserialize, Serialize};
 
 /// The back-end implementing [`UniformSampler`] for integer types.
@@ -49,9 +48,10 @@ use crate::Rng;
 #[derive(Clone, Copy, Debug, PartialEq)]
 #[cfg_attr(feature = "serde1", derive(Serialize, Deserialize))]
 pub struct UniformInt<X> {
-    low: X,
-    range: X,
-    z: X, // either ints_to_reject or zone depending on implementation
+    // HACK: fields are pub(crate) so the sibling `uniform_simd` module can access them
+    pub(crate) low: X,
+    pub(crate) range: X,
+    pub(crate) z: X, // either ints_to_reject or zone depending on implementation
 }
 
 macro_rules! uniform_int_impl {
@@ -202,151 +202,6 @@ uniform_int_impl! { u64, u64, u64 }
 uniform_int_impl! { usize, usize, usize }
 uniform_int_impl! { u128, u128, u128 }
 
-#[cfg(feature = "simd_support")]
-macro_rules! uniform_simd_int_impl {
-    ($ty:ident, $unsigned:ident, $u_scalar:ident) => {
-        // The "pick the largest zone that can fit in an `u32`" optimization
-        // is less useful here. Multiple lanes complicate things, we don't
-        // know the PRNG's minimal output size, and casting to a larger vector
-        // is generally a bad idea for SIMD performance. The user can still
-        // implement it manually.
-
-        // TODO: look into `Uniform::<u32x4>::new(0u32, 100)` functionality
-        //       perhaps `impl SampleUniform for $u_scalar`?
-        impl SampleUniform for $ty {
-            type Sampler = UniformInt<$ty>;
-        }
-
-        impl UniformSampler for UniformInt<$ty> {
-            type X = $ty;
-
-            #[inline] // if the range is constant, this helps LLVM to do the
-                      // calculations at compile-time.
-            fn new<B1, B2>(low_b: B1, high_b: B2) -> Self
-                where B1: SampleBorrow<Self::X> + Sized,
-                      B2: SampleBorrow<Self::X> + Sized
-            {
-                let low = *low_b.borrow();
-                let high = *high_b.borrow();
-                assert!(low.lt(high).all(), "Uniform::new called with `low >= high`");
-                UniformSampler::new_inclusive(low, high - 1)
-            }
-
-            #[inline] // if the range is constant, this helps LLVM to do the
-                      // calculations at compile-time.
-            fn new_inclusive<B1, B2>(low_b: B1, high_b: B2) -> Self
-                where B1: SampleBorrow<Self::X> + Sized,
-                      B2: SampleBorrow<Self::X> + Sized
-            {
-                let low = *low_b.borrow();
-                let high = *high_b.borrow();
-                assert!(low.le(high).all(),
-                        "Uniform::new_inclusive called with `low > high`");
-                let unsigned_max = ::core::$u_scalar::MAX;
-
-                // NOTE: these may need to be replaced with explicitly
-                // wrapping operations if `packed_simd` changes
-                let range: $unsigned = ((high - low) + 1).cast();
-                // `% 0` will panic at runtime.
-                let not_full_range = range.gt($unsigned::splat(0));
-                // replacing 0 with `unsigned_max` allows a faster `select`
-                // with bitwise OR
-                let modulo = not_full_range.select(range, $unsigned::splat(unsigned_max));
-                // wrapping addition
-                let ints_to_reject = (unsigned_max - range + 1) % modulo;
-                // When `range` is 0, `lo` of `v.wmul(range)` will always be
-                // zero which means only one sample is needed.
-                let zone = unsigned_max - ints_to_reject;
-
-                UniformInt {
-                    low,
-                    // These are really $unsigned values, but store as $ty:
-                    range: range.cast(),
-                    z: zone.cast(),
-                }
-            }
-
-            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
-                let range: $unsigned = self.range.cast();
-                let zone: $unsigned = self.z.cast();
-
-                // This might seem very slow, generating a whole new
-                // SIMD vector for every sample rejection. For most uses
-                // though, the chance of rejection is small and provides good
-                // general performance. With multiple lanes, that chance is
-                // multiplied. To mitigate this, we replace only the lanes of
-                // the vector which fail, iteratively reducing the chance of
-                // rejection. The replacement method does however add a little
-                // overhead. Benchmarking or calculating probabilities might
-                // reveal contexts where this replacement method is slower.
-                let mut v: $unsigned = rng.gen();
-                loop {
-                    let (hi, lo) = v.wmul(range);
-                    let mask = lo.le(zone);
-                    if mask.all() {
-                        let hi: $ty = hi.cast();
-                        // wrapping addition
-                        let result = self.low + hi;
-                        // `select` here compiles to a blend operation
-                        // When `range.eq(0).none()` the compare and blend
-                        // operations are avoided.
-                        let v: $ty = v.cast();
-                        return range.gt($unsigned::splat(0)).select(result, v);
-                    }
-                    // Replace only the failing lanes
-                    v = mask.select(v, rng.gen());
-                }
-            }
-        }
-    };
-
-    // bulk implementation
-    ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => {
-        $(
-            uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar);
-            uniform_simd_int_impl!($signed, $unsigned, $u_scalar);
-        )+
-    };
-}
-
-#[cfg(feature = "simd_support")]
-uniform_simd_int_impl! {
-    (u64x2, i64x2),
-    (u64x4, i64x4),
-    (u64x8, i64x8),
-    u64
-}
-
-#[cfg(feature = "simd_support")]
-uniform_simd_int_impl! {
-    (u32x2, i32x2),
-    (u32x4, i32x4),
-    (u32x8, i32x8),
-    (u32x16, i32x16),
-    u32
-}
-
-#[cfg(feature = "simd_support")]
-uniform_simd_int_impl! {
-    (u16x2, i16x2),
-    (u16x4, i16x4),
-    (u16x8, i16x8),
-    (u16x16, i16x16),
-    (u16x32, i16x32),
-    u16
-}
-
-#[cfg(feature = "simd_support")]
-uniform_simd_int_impl! {
-    (u8x2, i8x2),
-    (u8x4, i8x4),
-    (u8x8, i8x8),
-    (u8x16, i8x16),
-    (u8x32, i8x32),
-    (u8x64, i8x64),
-    u8
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -441,34 +296,8 @@ mod tests {
                     |x, y| x < y
                 );)*
             }};
-
-            // simd bulk
-            ($($ty:ident),* => $scalar:ident) => {{
-                $(t!(
-                    $ty,
-                    [
-                        ($ty::splat(0), $ty::splat(10)),
-                        ($ty::splat(10), $ty::splat(127)),
-                        ($ty::splat($scalar::MIN), $ty::splat($scalar::MAX)),
-                    ],
-                    |x: $ty, y| x.le(y).all(),
-                    |x: $ty, y| x.lt(y).all()
-                );)*
-            }};
         }
         t!(i8, i16, i32, i64, isize, u8, u16, u32, u64, usize, i128, u128);
-
-        #[cfg(feature = "simd_support")]
-        {
-            t!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64 => u8);
-            t!(i8x2, i8x4, i8x8, i8x16, i8x32, i8x64 => i8);
-            t!(u16x2, u16x4, u16x8, u16x16, u16x32 => u16);
-            t!(i16x2, i16x4, i16x8, i16x16, i16x32 => i16);
-            t!(u32x2, u32x4, u32x8, u32x16 => u32);
-            t!(i32x2, i32x4, i32x8, i32x16 => i32);
-            t!(u64x2, u64x4, u64x8 => u64);
-            t!(i64x2, i64x4, i64x8 => i64);
-        }
     }
 
     #[test]
diff --git a/src/distributions/uniform/uniform_simd.rs b/src/distributions/uniform/uniform_simd.rs