diff --git a/Cargo.toml b/Cargo.toml
index 3f1abd73519..9802386e456 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,5 +2,6 @@
 
 members = [
     "crates/core_simd",
+    "crates/std_float",
     "crates/test_helpers",
 ]
diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml
index a103ef115a5..d2ff5f3b1b1 100644
--- a/crates/core_simd/Cargo.toml
+++ b/crates/core_simd/Cargo.toml
@@ -26,3 +26,6 @@ features = ["alloc"]
 
 [dev-dependencies.test_helpers]
 path = "../test_helpers"
+
+[dev-dependencies]
+std_float = { path = "../std_float/", features = ["as_crate"] }
diff --git a/crates/core_simd/examples/nbody.rs b/crates/core_simd/examples/nbody.rs
index 43280feebbd..7b1e6840f64 100644
--- a/crates/core_simd/examples/nbody.rs
+++ b/crates/core_simd/examples/nbody.rs
@@ -1,11 +1,13 @@
-#![cfg_attr(feature = "std", feature(portable_simd))]
+#![feature(portable_simd)]
+extern crate std_float;
 
 /// Benchmarks game nbody code
 /// Taken from the `packed_simd` crate
 /// Run this benchmark with `cargo test --example nbody`
-#[cfg(feature = "std")]
 mod nbody {
-    use core_simd::*;
+    use core_simd::simd::*;
+    #[allow(unused)] // False positive?
+    use std_float::StdFloat;
 
     use std::f64::consts::PI;
     const SOLAR_MASS: f64 = 4.0 * PI * PI;
@@ -167,7 +169,6 @@ mod nbody {
     }
 }
 
-#[cfg(feature = "std")]
 #[cfg(test)]
 mod tests {
     // Good enough for demonstration purposes, not going for strictness here.
@@ -184,7 +185,6 @@ mod tests {
 }
 
 fn main() {
-    #[cfg(feature = "std")]
     {
         let (energy_before, energy_after) = nbody::run(1000);
         println!("Energy before: {}", energy_before);
diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index 6a6d26d10a7..0bc241af1f1 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -87,29 +87,3 @@ extern "platform-intrinsic" {
     #[allow(unused)]
     pub(crate) fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T;
 }
-
-#[cfg(feature = "std")]
-mod std {
-    extern "platform-intrinsic" {
-        // ceil
-        pub(crate) fn simd_ceil<T>(x: T) -> T;
-
-        // floor
-        pub(crate) fn simd_floor<T>(x: T) -> T;
-
-        // round
-        pub(crate) fn simd_round<T>(x: T) -> T;
-
-        // trunc
-        pub(crate) fn simd_trunc<T>(x: T) -> T;
-
-        // fsqrt
-        pub(crate) fn simd_fsqrt<T>(x: T) -> T;
-
-        // fma
-        pub(crate) fn simd_fma<T>(x: T, y: T, z: T) -> T;
-    }
-}
-
-#[cfg(feature = "std")]
-pub(crate) use crate::simd::intrinsics::std::*;
diff --git a/crates/core_simd/src/round.rs b/crates/core_simd/src/round.rs
index 09789e11492..06ccab3ec49 100644
--- a/crates/core_simd/src/round.rs
+++ b/crates/core_simd/src/round.rs
@@ -5,47 +5,6 @@ macro_rules! implement {
     {
         $type:ty, $int_type:ty
     } => {
-        #[cfg(feature = "std")]
-        impl<const LANES: usize> Simd<$type, LANES>
-        where
-            LaneCount<LANES>: SupportedLaneCount,
-        {
-            /// Returns the smallest integer greater than or equal to each lane.
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[inline]
-            pub fn ceil(self) -> Self {
-                unsafe { intrinsics::simd_ceil(self) }
-            }
-
-            /// Returns the largest integer value less than or equal to each lane.
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[inline]
-            pub fn floor(self) -> Self {
-                unsafe { intrinsics::simd_floor(self) }
-            }
-
-            /// Rounds to the nearest integer value. Ties round toward zero.
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[inline]
-            pub fn round(self) -> Self {
-                unsafe { intrinsics::simd_round(self) }
-            }
-
-            /// Returns the floating point's integer value, with its fractional part removed.
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[inline]
-            pub fn trunc(self) -> Self {
-                unsafe { intrinsics::simd_trunc(self) }
-            }
-
-            /// Returns the floating point's fractional value, with its integer part removed.
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[inline]
-            pub fn fract(self) -> Self {
-                self - self.trunc()
-            }
-        }
-
         impl<const LANES: usize> Simd<$type, LANES>
         where
             LaneCount<LANES>: SupportedLaneCount,
diff --git a/crates/core_simd/src/vector/float.rs b/crates/core_simd/src/vector/float.rs
index 4a4b23238c4..3528a420351 100644
--- a/crates/core_simd/src/vector/float.rs
+++ b/crates/core_simd/src/vector/float.rs
@@ -38,29 +38,6 @@ macro_rules! impl_float_vector {
                 unsafe { intrinsics::simd_fabs(self) }
             }
 
-            /// Fused multiply-add.  Computes `(self * a) + b` with only one rounding error,
-            /// yielding a more accurate result than an unfused multiply-add.
-            ///
-            /// Using `mul_add` *may* be more performant than an unfused multiply-add if the target
-            /// architecture has a dedicated `fma` CPU instruction.  However, this is not always
-            /// true, and will be heavily dependent on designing algorithms with specific target
-            /// hardware in mind.
-            #[cfg(feature = "std")]
-            #[inline]
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            pub fn mul_add(self, a: Self, b: Self) -> Self {
-                unsafe { intrinsics::simd_fma(self, a, b) }
-            }
-
-            /// Produces a vector where every lane has the square root value
-            /// of the equivalently-indexed lane in `self`
-            #[inline]
-            #[must_use = "method returns a new vector and does not mutate the original value"]
-            #[cfg(feature = "std")]
-            pub fn sqrt(self) -> Self {
-                unsafe { intrinsics::simd_fsqrt(self) }
-            }
-
             /// Takes the reciprocal (inverse) of each lane, `1/x`.
             #[inline]
             #[must_use = "method returns a new vector and does not mutate the original value"]
diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index 43ddde4c55e..4fb9de198ee 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -546,6 +546,8 @@ macro_rules! impl_float_tests {
 
             #[cfg(feature = "std")]
             mod std {
+                use std_float::StdFloat;
+
                 use super::*;
                 test_helpers::test_lanes! {
                     fn sqrt<const LANES: usize>() {
diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs
index 11d617a6c2c..1a1bc9ebca7 100644
--- a/crates/core_simd/tests/round.rs
+++ b/crates/core_simd/tests/round.rs
@@ -3,6 +3,8 @@
 macro_rules! float_rounding_test {
     { $scalar:tt, $int_scalar:tt } => {
         mod $scalar {
+            use std_float::StdFloat;
+
             type Vector<const LANES: usize> = core_simd::Simd<$scalar, LANES>;
             type Scalar = $scalar;
             type IntScalar = $int_scalar;
diff --git a/crates/std_float/Cargo.toml b/crates/std_float/Cargo.toml
new file mode 100644
index 00000000000..82f66b8dcb7
--- /dev/null
+++ b/crates/std_float/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "std_float"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+core_simd = { path = "../core_simd" }
+
+[features]
+default = ["as_crate"]
+as_crate = []
diff --git a/crates/std_float/src/lib.rs b/crates/std_float/src/lib.rs
new file mode 100644
index 00000000000..4bd4d4c05e3
--- /dev/null
+++ b/crates/std_float/src/lib.rs
@@ -0,0 +1,165 @@
+#![cfg_attr(feature = "as_crate", no_std)] // We are std!
+#![cfg_attr(
+    feature = "as_crate",
+    feature(platform_intrinsics),
+    feature(portable_simd)
+)]
+#[cfg(not(feature = "as_crate"))]
+use core::simd;
+#[cfg(feature = "as_crate")]
+use core_simd::simd;
+
+use simd::{LaneCount, Simd, SupportedLaneCount};
+
+#[cfg(feature = "as_crate")]
+mod experimental {
+    pub trait Sealed {}
+}
+
+#[cfg(feature = "as_crate")]
+use experimental as sealed;
+
+use crate::sealed::Sealed;
+
+// "platform intrinsics" are essentially "codegen intrinsics"
+// each of these may be scalarized and lowered to a libm call
+extern "platform-intrinsic" {
+    // ceil: backs `StdFloat::ceil`
+    fn simd_ceil<T>(x: T) -> T;
+
+    // floor: backs `StdFloat::floor`
+    fn simd_floor<T>(x: T) -> T;
+
+    // round: backs `StdFloat::round`
+    fn simd_round<T>(x: T) -> T;
+
+    // trunc: backs `StdFloat::trunc` (and, through it, `fract`)
+    fn simd_trunc<T>(x: T) -> T;
+
+    // fsqrt: backs `StdFloat::sqrt`
+    fn simd_fsqrt<T>(x: T) -> T;
+
+    // fma: backs `StdFloat::mul_add`, i.e. `(x * y) + z`
+    fn simd_fma<T>(x: T, y: T, z: T) -> T;
+}
+
+/// This trait provides a possibly-temporary implementation of float functions
+/// that may, in the absence of hardware support, canonicalize to calling an
+/// operating system's `math.h` dynamically-loaded library (also known as a
+/// shared object). As these conditionally require runtime support, they
+/// should only appear in binaries built assuming OS support: `std`.
+///
+/// However, there is no reason SIMD types, in general, need OS support,
+/// as for many architectures an embedded binary may simply configure that
+/// support itself. This means these types must be visible in `core`
+/// but have these functions available in `std`.
+///
+/// [`f32`] and [`f64`] achieve a similar trick by using "lang items", but
+/// due to compiler limitations, it is harder to implement this approach for
+/// abstract data types like [`Simd`]. From that need, this trait is born.
+///
+/// It is possible this trait will be replaced in some manner in the future,
+/// when either the compiler or its supporting runtime functions are improved.
+/// For now this trait is available to permit experimentation with SIMD float
+/// operations that may lack hardware support, such as `mul_add`.
+pub trait StdFloat: Sealed + Sized {
+    /// Fused multiply-add.  Computes `(self * a) + b` with only one rounding error,
+    /// yielding a more accurate result than an unfused multiply-add.
+    ///
+    /// Using `mul_add` *may* be more performant than an unfused multiply-add if the target
+    /// architecture has a dedicated `fma` CPU instruction.  However, this is not always
+    /// true, and will be heavily dependent on designing algorithms with specific target
+    /// hardware in mind.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn mul_add(self, a: Self, b: Self) -> Self {
+        unsafe { simd_fma(self, a, b) }
+    }
+
+    /// Produces a vector where every lane has the square root value
+    /// of the equivalently-indexed lane in `self`.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn sqrt(self) -> Self {
+        unsafe { simd_fsqrt(self) }
+    }
+
+    /// Returns the smallest integer greater than or equal to each lane.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn ceil(self) -> Self {
+        unsafe { simd_ceil(self) }
+    }
+
+    /// Returns the largest integer value less than or equal to each lane.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn floor(self) -> Self {
+        unsafe { simd_floor(self) }
+    }
+
+    /// Rounds to the nearest integer value. Ties round away from zero.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn round(self) -> Self {
+        unsafe { simd_round(self) }
+    }
+
+    /// Returns the floating point's integer value, with its fractional part removed.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn trunc(self) -> Self {
+        unsafe { simd_trunc(self) }
+    }
+
+    /// Returns the floating point's fractional value, with its integer part removed.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn fract(self) -> Self;
+}
+
+impl<const N: usize> Sealed for Simd<f32, N> where LaneCount<N>: SupportedLaneCount {}
+impl<const N: usize> Sealed for Simd<f64, N> where LaneCount<N>: SupportedLaneCount {}
+
+// We can safely use the defaults for everything except `fract`, which has no default body.
+impl<const N: usize> StdFloat for Simd<f32, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    /// Returns the floating point's fractional value, with its integer part removed.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn fract(self) -> Self {
+        self - self.trunc()
+    }
+}
+
+impl<const N: usize> StdFloat for Simd<f64, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    /// Returns the floating point's fractional value, with its integer part removed.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[inline]
+    fn fract(self) -> Self {
+        self - self.trunc()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use simd::*;
+
+    #[test]
+    fn everything_works() {
+        let x = f32x4::from_array([0.1, 0.5, 0.6, -1.5]);
+        let x2 = x + x;
+        let _xc = x.ceil();
+        let _xf = x.floor();
+        let _xr = x.round();
+        let _xt = x.trunc();
+        let _xfma = x.mul_add(x, x);
+        let _xsqrt = x.sqrt();
+        let _ = x2.abs() * x2;
+    }
+}