From b506127649c52b1264af4f777ac03d0eb36acced Mon Sep 17 00:00:00 2001
From: Juniper Tyree <juniper.tyree@helsinki.fi>
Date: Thu, 26 Sep 2024 09:15:50 +0000
Subject: [PATCH 01/10] Try switching from rustacuda to cust

---
 .github/workflows/rustdoc.yml             |  5 ++-
 Cargo.toml                                | 26 ++++++------
 examples/print/src/main.rs                | 20 +++++-----
 rust-cuda-derive/src/rust_to_cuda/impl.rs |  8 ++--
 src/deps.rs                               |  4 +-
 src/host/mod.rs                           | 33 +++++++++-------
 src/kernel/mod.rs                         | 33 +++++++++-------
 src/kernel/param.rs                       | 48 +++++++++++------------
 src/lend/impls/arc.rs                     | 18 ++++-----
 src/lend/impls/arced_slice.rs             | 26 +++++++-----
 src/lend/impls/box.rs                     | 26 ++++++------
 src/lend/impls/boxed_slice.rs             | 16 ++++----
 src/lend/impls/final.rs                   |  8 ++--
 src/lend/impls/option.rs                  |  2 +-
 src/lend/impls/ref.rs                     | 18 ++++-----
 src/lend/impls/ref_mut.rs                 |  8 ++--
 src/lend/impls/slice_ref.rs               | 10 ++---
 src/lend/impls/slice_ref_mut.rs           |  8 ++--
 src/lend/mod.rs                           | 22 +++++------
 src/utils/adapter.rs                      | 18 ++++-----
 src/utils/aliasing/const.rs               |  8 ++--
 src/utils/aliasing/dynamic.rs             |  8 ++--
 src/utils/async.rs                        | 10 ++---
 src/utils/exchange/buffer/host.rs         | 26 ++++++------
 src/utils/exchange/buffer/mod.rs          | 16 ++++----
 src/utils/exchange/wrapper.rs             | 20 +++++-----
 26 files changed, 229 insertions(+), 216 deletions(-)

diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml
index 046a89cee..f0d7b683b 100644
--- a/.github/workflows/rustdoc.yml
+++ b/.github/workflows/rustdoc.yml
@@ -37,8 +37,9 @@ jobs:
             --enable-index-page \
             --extern-html-root-url const_type_layout=https://docs.rs/const-type-layout/0.3.2/ \
             --extern-html-root-url final=https://docs.rs/final/0.1.1/ \
-            --extern-html-root-url rustacuda=https://docs.rs/rustacuda/0.1.3/ \
-            --extern-html-root-url rustacuda_core=https://docs.rs/rustacuda_core/0.1.2/ \
+            --extern-html-root-url cust=https://docs.rs/cust/0.3.2/ \
+            --extern-html-root-url cust_core=https://docs.rs/cust_core/0.1/ \
+            --extern-html-root-url cust_derive=https://docs.rs/cust_derive/0.2/ \
             -Zunstable-options \
           " cargo doc \
             --all-features \
diff --git a/Cargo.toml b/Cargo.toml
index f4d43727a..450c0a989 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -89,22 +89,22 @@ rust-version = { workspace = true }
 
 [features]
 default = []
-derive = ["dep:rust-cuda-derive"]
+derive = ["dep:cust_derive", "dep:rust-cuda-derive"]
 device = []
 final = ["dep:final"]
-host = ["dep:rustacuda", "dep:regex", "dep:oneshot", "dep:safer_owning_ref"]
+host = ["dep:cust", "dep:regex", "dep:oneshot", "dep:safer_owning_ref"]
 kernel = ["dep:rust-cuda-kernel"]
 
 [dependencies]
-const-type-layout = { workspace = true, features = ["derive"] }
-final = { workspace = true, optional = true }
-oneshot = { workspace = true, features = ["std", "async"], optional = true }
-regex = { workspace = true, optional = true }
-rustacuda = { workspace = true, optional = true }
-rustacuda_core = { workspace = true }
-rust-cuda-derive = { workspace = true, optional = true }
-rust-cuda-kernel = { workspace = true, optional = true }
-safer_owning_ref = { workspace = true, optional = true }
+const-type-layout = { version = "0.3.2", default-features = false, features = ["derive"] }
+# FIXME: cust fails to compile without the `bytemuck` feature
+cust = { version = "0.3.2", default-features = false, features = ["bytemuck"], optional = true }
+cust_core = { version = "0.1", default-features = false }
+cust_derive = { version = "0.2", default-features = false, optional = true }
+final = { version = "0.1.1", default-features = false, optional = true }
+oneshot = { version = "0.1", default-features = false, features = ["std", "async"], optional = true }
+regex = { version = "1.10", default-features = false, optional = true }
+safer_owning_ref = { version = "0.5", default-features = false, optional = true }
 
-[lints]
-workspace = true
+rust-cuda-derive = { path = "rust-cuda-derive", default-features = false, optional = true }
+rust-cuda-kernel = { path = "rust-cuda-kernel", default-features = false, optional = true }
diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs
index 008b39f5b..1998a7057 100644
--- a/examples/print/src/main.rs
+++ b/examples/print/src/main.rs
@@ -2,38 +2,38 @@
 
 use print::{kernel, link, Action};
 
-fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> {
+fn main() -> rust_cuda::deps::cust::error::CudaResult<()> {
     // Link the non-generic CUDA kernel
     struct KernelPtx;
     link! { impl kernel for KernelPtx }
 
     // Initialize the CUDA API
-    rust_cuda::deps::rustacuda::init(rust_cuda::deps::rustacuda::CudaFlags::empty())?;
+    rust_cuda::deps::cust::init(rust_cuda::deps::cust::CudaFlags::empty())?;
 
     // Get the first CUDA GPU device
-    let device = rust_cuda::deps::rustacuda::device::Device::get_device(0)?;
+    let device = rust_cuda::deps::cust::device::Device::get_device(0)?;
 
     // Create a CUDA context associated to this device
     let _context = rust_cuda::host::CudaDropWrapper::from(
-        rust_cuda::deps::rustacuda::context::Context::create_and_push(
-            rust_cuda::deps::rustacuda::context::ContextFlags::MAP_HOST
-                | rust_cuda::deps::rustacuda::context::ContextFlags::SCHED_AUTO,
+        rust_cuda::deps::cust::context::Context::create_and_push(
+            rust_cuda::deps::cust::context::ContextFlags::MAP_HOST
+                | rust_cuda::deps::cust::context::ContextFlags::SCHED_AUTO,
             device,
         )?,
     );
 
     // Create a new CUDA stream to submit kernels to
     let mut stream =
-        rust_cuda::host::CudaDropWrapper::from(rust_cuda::deps::rustacuda::stream::Stream::new(
-            rust_cuda::deps::rustacuda::stream::StreamFlags::NON_BLOCKING,
+        rust_cuda::host::CudaDropWrapper::from(rust_cuda::deps::cust::stream::Stream::new(
+            rust_cuda::deps::cust::stream::StreamFlags::NON_BLOCKING,
             None,
         )?);
 
     // Create a new instance of the CUDA kernel and prepare the launch config
     let mut kernel = rust_cuda::kernel::TypedPtxKernel::<kernel>::new::<KernelPtx>(None);
     let config = rust_cuda::kernel::LaunchConfig {
-        grid: rust_cuda::deps::rustacuda::function::GridSize::x(1),
-        block: rust_cuda::deps::rustacuda::function::BlockSize::x(4),
+        grid: rust_cuda::deps::cust::function::GridSize::x(1),
+        block: rust_cuda::deps::cust::function::BlockSize::x(4),
         ptx_jit: false,
     };
 
diff --git a/rust-cuda-derive/src/rust_to_cuda/impl.rs b/rust-cuda-derive/src/rust_to_cuda/impl.rs
index 55b0948d7..56dc3dcca 100644
--- a/rust-cuda-derive/src/rust_to_cuda/impl.rs
+++ b/rust-cuda-derive/src/rust_to_cuda/impl.rs
@@ -84,7 +84,7 @@ pub fn rust_to_cuda_trait(
             unsafe fn borrow<CudaAllocType: #crate_path::alloc::CudaAlloc>(
                 &self,
                 alloc: CudaAllocType,
-            ) -> #crate_path::deps::rustacuda::error::CudaResult<(
+            ) -> #crate_path::deps::cust::error::CudaResult<(
                 #crate_path::utils::ffi::DeviceAccessible<Self::CudaRepresentation>,
                 #crate_path::alloc::CombinedCudaAlloc<Self::CudaAllocation, CudaAllocType>
             )> {
@@ -107,7 +107,7 @@ pub fn rust_to_cuda_trait(
                 alloc: #crate_path::alloc::CombinedCudaAlloc<
                     Self::CudaAllocation, CudaAllocType
                 >,
-            ) -> #crate_path::deps::rustacuda::error::CudaResult<CudaAllocType> {
+            ) -> #crate_path::deps::cust::error::CudaResult<CudaAllocType> {
                 let (alloc_front, alloc_tail) = alloc.split();
 
                 #(#r2c_field_destructors)*
@@ -192,7 +192,7 @@ pub fn rust_to_cuda_async_trait(
                 &self,
                 alloc: CudaAllocType,
                 stream: #crate_path::host::Stream<'stream>,
-            ) -> #crate_path::deps::rustacuda::error::CudaResult<(
+            ) -> #crate_path::deps::cust::error::CudaResult<(
                 #crate_path::utils::r#async::Async<
                     '_, 'stream,
                     #crate_path::utils::ffi::DeviceAccessible<Self::CudaRepresentation>,
@@ -220,7 +220,7 @@ pub fn rust_to_cuda_async_trait(
                     Self::CudaAllocationAsync, CudaAllocType
                 >,
                 stream: #crate_path::host::Stream<'stream>,
-            ) -> #crate_path::deps::rustacuda::error::CudaResult<(
+            ) -> #crate_path::deps::cust::error::CudaResult<(
                 #crate_path::utils::r#async::Async<
                     'a, 'stream,
                     #crate_path::deps::owning_ref::BoxRefMut<'a, CudaRestoreOwner, Self>,
diff --git a/src/deps.rs b/src/deps.rs
index 50fd38f3f..8521ed267 100644
--- a/src/deps.rs
+++ b/src/deps.rs
@@ -7,6 +7,6 @@ pub extern crate const_type_layout;
 pub extern crate owning_ref;
 
 #[cfg(feature = "host")]
-pub extern crate rustacuda;
+pub extern crate cust;
 
-pub extern crate rustacuda_core;
+pub extern crate cust_core;
diff --git a/src/host/mod.rs b/src/host/mod.rs
index c2d0558c4..782b589f8 100644
--- a/src/host/mod.rs
+++ b/src/host/mod.rs
@@ -5,13 +5,14 @@ use std::{
 };
 
 use const_type_layout::TypeGraphLayout;
-use rustacuda::{
+use cust::{
     context::Context,
     error::CudaError,
     event::Event,
     memory::{CopyDestination, DeviceBox, DeviceBuffer, LockedBox, LockedBuffer},
     module::Module,
 };
+use cust_core::DeviceCopy;
 
 use crate::{
     safety::PortableBitSemantics,
@@ -30,12 +31,12 @@ type InvariantLifetime<'brand> = PhantomData<fn(&'brand ()) -> &'brand ()>;
 #[derive(Copy, Clone)]
 #[repr(transparent)]
 pub struct Stream<'stream> {
-    stream: &'stream rustacuda::stream::Stream,
+    stream: &'stream cust::stream::Stream,
     _brand: InvariantLifetime<'stream>,
 }
 
 impl<'stream> Deref for Stream<'stream> {
-    type Target = rustacuda::stream::Stream;
+    type Target = cust::stream::Stream;
 
     fn deref(&self) -> &Self::Target {
         self.stream
@@ -65,7 +66,7 @@ impl<'stream> Stream<'stream> {
     /// }
     /// ```
     pub fn with<O>(
-        stream: &mut rustacuda::stream::Stream,
+        stream: &mut cust::stream::Stream,
         inner: impl for<'new_stream> FnOnce(Stream<'new_stream>) -> O,
     ) -> O {
         inner(Stream {
@@ -77,7 +78,7 @@ impl<'stream> Stream<'stream> {
 
 pub trait CudaDroppable: Sized {
     #[expect(clippy::missing_errors_doc)]
-    fn drop(val: Self) -> Result<(), (rustacuda::error::CudaError, Self)>;
+    fn drop(val: Self) -> Result<(), (cust::error::CudaError, Self)>;
 }
 
 #[repr(transparent)]
@@ -112,25 +113,27 @@ impl<C: CudaDroppable> DerefMut for CudaDropWrapper<C> {
     }
 }
 
-impl<T> CudaDroppable for DeviceBox<T> {
+impl<T: DeviceCopy> CudaDroppable for DeviceBox<T> {
     fn drop(val: Self) -> Result<(), (CudaError, Self)> {
         Self::drop(val)
     }
 }
 
-impl<T: rustacuda_core::DeviceCopy> CudaDroppable for DeviceBuffer<T> {
+impl<T: DeviceCopy> CudaDroppable for DeviceBuffer<T> {
     fn drop(val: Self) -> Result<(), (CudaError, Self)> {
         Self::drop(val)
     }
 }
 
-impl<T> CudaDroppable for LockedBox<T> {
+impl<T: DeviceCopy> CudaDroppable for LockedBox<T> {
     fn drop(val: Self) -> Result<(), (CudaError, Self)> {
-        Self::drop(val)
+        // FIXME: cust's LockedBox no longer has a fallible drop
+        std::mem::drop(val);
+        Ok(())
     }
 }
 
-impl<T: rustacuda_core::DeviceCopy> CudaDroppable for LockedBuffer<T> {
+impl<T: DeviceCopy> CudaDroppable for LockedBuffer<T> {
     fn drop(val: Self) -> Result<(), (CudaError, Self)> {
         Self::drop(val)
     }
@@ -147,7 +150,7 @@ macro_rules! impl_sealed_drop_value {
 }
 
 impl_sealed_drop_value!(Module);
-impl_sealed_drop_value!(rustacuda::stream::Stream);
+impl_sealed_drop_value!(cust::stream::Stream);
 impl_sealed_drop_value!(Context);
 impl_sealed_drop_value!(Event);
 
@@ -207,7 +210,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> {
         'a: 'b,
     {
         DeviceMutRef {
-            pointer: DeviceMutPointer(self.device_box.as_device_ptr().as_raw_mut().cast()),
+            pointer: DeviceMutPointer(self.device_box.as_device_ptr().as_mut_ptr().cast()),
             reference: PhantomData,
         }
     }
@@ -322,10 +325,10 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T>
     where
         'a: 'b,
     {
-        let mut hack = ManuallyDrop::new(unsafe { std::ptr::read(self.device_box) });
+        let hack = ManuallyDrop::new(unsafe { std::ptr::read(self.device_box) });
 
         DeviceConstRef {
-            pointer: DeviceConstPointer(hack.as_device_ptr().as_raw().cast()),
+            pointer: DeviceConstPointer(hack.as_device_ptr().as_ptr().cast()),
             reference: PhantomData,
         }
     }
@@ -390,7 +393,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceOwned<'a, T> {
     #[must_use]
     pub(crate) fn for_device(self) -> DeviceOwnedRef<'a, T> {
         DeviceOwnedRef {
-            pointer: DeviceOwnedPointer(self.device_box.as_device_ptr().as_raw_mut().cast()),
+            pointer: DeviceOwnedPointer(self.device_box.as_device_ptr().as_mut_ptr().cast()),
             marker: PhantomData::<T>,
             reference: PhantomData::<&'a mut ()>,
         }
diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs
index 44b4c6216..95d21457a 100644
--- a/src/kernel/mod.rs
+++ b/src/kernel/mod.rs
@@ -6,8 +6,8 @@ use std::{
     ptr::NonNull,
 };
 
 #[cfg(feature = "host")]
-use rustacuda::{
+use cust::{
     error::{CudaError, CudaResult},
     function::Function,
-    module::Module,
+    module::{Module, ModuleJitOption, OptLevel},
@@ -42,12 +42,7 @@ mod sealed {
 
 #[cfg(all(feature = "host", not(doc)))]
 #[doc(hidden)]
-pub trait WithNewAsync<
-    'stream,
-    P: ?Sized + CudaKernelParameter,
-    O,
-    E: From<rustacuda::error::CudaError>,
->
+pub trait WithNewAsync<'stream, P: ?Sized + CudaKernelParameter, O, E: From<cust::error::CudaError>>
 {
     fn with<'b>(self, param: P::AsyncHostType<'stream, 'b>) -> Result<O, E>
     where
@@ -59,7 +54,7 @@ impl<
         'stream,
         P: ?Sized + CudaKernelParameter,
         O,
-        E: From<rustacuda::error::CudaError>,
+        E: From<cust::error::CudaError>,
         F: for<'b> FnOnce(P::AsyncHostType<'stream, 'b>) -> Result<O, E>,
     > WithNewAsync<'stream, P, O, E> for F
 {
@@ -109,7 +104,7 @@ pub trait CudaKernelParameter: sealed::Sealed {
 
     #[cfg(feature = "host")]
     #[expect(clippy::missing_errors_doc)] // FIXME
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl WithNewAsync<'stream, Self, O, E>,
@@ -139,7 +134,7 @@ pub trait CudaKernelParameter: sealed::Sealed {
 
     #[doc(hidden)]
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -286,8 +281,8 @@ impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> {
 #[cfg(feature = "host")]
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub struct LaunchConfig {
-    pub grid: rustacuda::function::GridSize,
-    pub block: rustacuda::function::BlockSize,
+    pub grid: cust::function::GridSize,
+    pub block: cust::function::BlockSize,
     pub ptx_jit: bool,
 }
 
@@ -305,9 +300,20 @@ impl RawPtxKernel {
     /// Returns a [`CudaError`] if `ptx` is not a valid PTX source, or it does
     ///  not contain an entry point named `entry_point`.
     pub fn new(ptx: &CStr, entry_point: &CStr) -> CudaResult<Self> {
-        let module: Box<Module> = Box::new(Module::load_from_string(ptx)?);
-
-        let function = unsafe { &*std::ptr::from_ref(module.as_ref()) }.get_function(entry_point);
+        let module: Box<Module> = Box::new(Module::from_ptx_cstr(
+            ptx,
+            &[ModuleJitOption::OptLevel(OptLevel::O4)],
+        )?);
+
+        // cust's Module::get_function takes a &str (and turns it back into a
+        //  CString immediately), so the entry point name is validated as
+        //  UTF-8 here; a non-UTF-8 name is reported as not found
+        let function = entry_point
+            .to_str()
+            .map_err(|_| CudaError::NotFound)
+            .and_then(|entry_point| {
+                unsafe { &*std::ptr::from_ref(module.as_ref()) }.get_function(entry_point)
+            });
 
         let function = match function {
             Ok(function) => function,
diff --git a/src/kernel/param.rs b/src/kernel/param.rs
index 2ad1b0bf8..6d95224dc 100644
--- a/src/kernel/param.rs
+++ b/src/kernel/param.rs
@@ -88,7 +88,7 @@ impl<
     type SyncHostType = T;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         _stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -124,7 +124,7 @@ impl<
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         _token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -180,7 +180,7 @@ impl<
     type SyncHostType = &'a T;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -219,7 +219,7 @@ impl<
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         _token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -273,7 +273,7 @@ impl<
     type SyncHostType = <&'a PerThreadShallowCopy<T> as CudaKernelParameter>::SyncHostType;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -315,7 +315,7 @@ impl<
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -403,7 +403,7 @@ impl<
     type SyncHostType = &'a mut T;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -442,7 +442,7 @@ impl<
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         _token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -549,7 +549,7 @@ impl<
     type SyncHostType = T;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -585,7 +585,7 @@ impl<
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         _token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -644,7 +644,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow<T
     type SyncHostType = &'a T;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -683,7 +683,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow<T
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         _token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -737,7 +737,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
     type SyncHostType = &'a mut T;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -781,7 +781,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         mut param: Self::AsyncHostType<'stream, 'b>,
         _token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -835,7 +835,7 @@ impl<
     type SyncHostType = <DeepPerThreadBorrow<T> as CudaKernelParameter>::SyncHostType;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -862,7 +862,7 @@ impl<
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -926,7 +926,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit<DeepPerThreadB
     type SyncHostType = <&'a DeepPerThreadBorrow<T> as CudaKernelParameter>::SyncHostType;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -968,7 +968,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit<DeepPerThreadB
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -1017,7 +1017,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
     type SyncHostType = <&'a mut DeepPerThreadBorrow<T> as CudaKernelParameter>::SyncHostType;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -1064,7 +1064,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -1154,7 +1154,7 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa
     type SyncHostType = Self;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         _stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -1190,7 +1190,7 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         _param: Self::AsyncHostType<'stream, 'b>,
         _token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
@@ -1241,7 +1241,7 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete
     type SyncHostType = Self;
 
     #[cfg(feature = "host")]
-    fn with_new_async<'stream, 'b, O, E: From<rustacuda::error::CudaError>>(
+    fn with_new_async<'stream, 'b, O, E: From<cust::error::CudaError>>(
         param: Self::SyncHostType,
         _stream: crate::host::Stream<'stream>,
         #[cfg(not(doc))] inner: impl super::WithNewAsync<'stream, Self, O, E>,
@@ -1277,7 +1277,7 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete
     }
 
     #[cfg(feature = "host")]
-    fn async_to_ffi<'stream, 'b, E: From<rustacuda::error::CudaError>>(
+    fn async_to_ffi<'stream, 'b, E: From<cust::error::CudaError>>(
         param: Self::AsyncHostType<'stream, 'b>,
         _token: sealed::Token,
     ) -> Result<Self::FfiType<'stream, 'b>, E>
diff --git a/src/lend/impls/arc.rs b/src/lend/impls/arc.rs
index 4d59837ff..9bb3e1cb0 100644
--- a/src/lend/impls/arc.rs
+++ b/src/lend/impls/arc.rs
@@ -5,7 +5,7 @@ use std::mem::ManuallyDrop;
 use const_type_layout::{TypeGraphLayout, TypeLayout};
 
 #[cfg(feature = "host")]
-use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox};
+use cust::{error::CudaResult, memory::DeviceBox, memory::LockedBox};
 
 use crate::{
     deps::alloc::sync::Arc,
@@ -65,13 +65,13 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Arc<T> {
         let offset = std::mem::offset_of!(_ArcInner<T>, data);
         let arc_ptr: *const _ArcInner<T> = data_ptr.byte_sub(offset).cast();
 
-        let mut device_box = CudaDropWrapper::from(DeviceBox::new(
+        let device_box = CudaDropWrapper::from(DeviceBox::new(
             DeviceCopyWithPortableBitSemantics::from_ref(&*arc_ptr),
         )?);
 
         Ok((
             DeviceAccessible::from(ArcCudaRepresentation(DeviceOwnedPointer(
-                device_box.as_device_ptr().as_raw_mut().cast(),
+                device_box.as_device_ptr().as_mut_ptr().cast(),
             ))),
             CombinedCudaAlloc::new(device_box, alloc),
         ))
@@ -101,11 +101,11 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Arc<T
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )> {
-        use rustacuda::memory::AsyncCopyDestination;
+        use cust::memory::AsyncCopyDestination;
 
         let locked_box = unsafe {
             let inner = ManuallyDrop::new(_ArcInner {
@@ -114,12 +114,12 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Arc<T
                 data: std::ptr::read(&**self),
             });
 
-            let mut uninit = CudaDropWrapper::from(LockedBox::<
+            let uninit = CudaDropWrapper::from(LockedBox::<
                 DeviceCopyWithPortableBitSemantics<ManuallyDrop<_ArcInner<T>>>,
             >::uninitialized()?);
             std::ptr::copy_nonoverlapping(
                 std::ptr::from_ref(DeviceCopyWithPortableBitSemantics::from_ref(&inner)),
-                uninit.as_mut_ptr(),
+                uninit.as_raw(),
                 1,
             );
 
@@ -129,12 +129,12 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Arc<T
         let mut device_box = CudaDropWrapper::from(DeviceBox::<
             DeviceCopyWithPortableBitSemantics<ManuallyDrop<_ArcInner<T>>>,
         >::uninitialized()?);
-        device_box.async_copy_from(&*locked_box, &stream)?;
+        device_box.async_copy_from(&**locked_box, &stream)?;
 
         Ok((
             Async::pending(
                 DeviceAccessible::from(ArcCudaRepresentation(DeviceOwnedPointer(
-                    device_box.as_device_ptr().as_raw_mut().cast(),
+                    device_box.as_device_ptr().as_mut_ptr().cast(),
                 ))),
                 stream,
                 NoCompletion,
diff --git a/src/lend/impls/arced_slice.rs b/src/lend/impls/arced_slice.rs
index cce12b3cd..1fc334589 100644
--- a/src/lend/impls/arced_slice.rs
+++ b/src/lend/impls/arced_slice.rs
@@ -5,12 +5,12 @@ use std::mem::{ManuallyDrop, MaybeUninit};
 use const_type_layout::{TypeGraphLayout, TypeLayout};
 
 #[cfg(feature = "host")]
-use rustacuda::{
+use cust::{
     error::CudaResult,
     memory::LockedBuffer,
     memory::{DeviceBox, DeviceBuffer},
 };
-use rustacuda_core::DeviceCopy;
+use cust_core::DeviceCopy;
 
 use crate::{
     deps::alloc::sync::Arc,
@@ -51,10 +51,17 @@ pub struct _ArcInner<T: ?Sized> {
     data: T,
 }
 
+#[derive(Copy, Clone)]
 #[repr(C)]
 struct _ArcInnerHeader {
-    strong: AtomicUsize,
-    weak: AtomicUsize,
+    strong: _AtomicUsize,
+    weak: _AtomicUsize,
+}
+
+#[derive(Copy, Clone)]
+#[repr(C, align(8))]
+struct _AtomicUsize {
+    v: usize,
 }
 
 unsafe impl DeviceCopy for _ArcInnerHeader {}
@@ -74,8 +81,7 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Arc<[T]> {
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
-        use rustacuda::memory::{CopyDestination, DeviceSlice};
-        use rustacuda_core::DevicePointer;
+        use cust::memory::{CopyDestination, DevicePointer, DeviceSlice};
 
         let data_ptr: *const T = std::ptr::from_ref(&**self).as_ptr();
         let offset = std::mem::offset_of!(_ArcInner<[T; 42]>, data);
@@ -105,7 +111,7 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Arc<[T]> {
 
         Ok((
             DeviceAccessible::from(ArcedSliceCudaRepresentation {
-                data: DeviceOwnedPointer(header.as_device_ptr().as_raw_mut().cast()),
+                data: DeviceOwnedPointer(header.as_device_ptr().as_mut_ptr().cast()),
                 len: self.len(),
             }),
             CombinedCudaAlloc::new(device_buffer, alloc),
@@ -136,11 +142,11 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Arc<[
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )> {
-        use rustacuda::memory::AsyncCopyDestination;
+        use cust::memory::AsyncCopyDestination;
 
         let data_ptr: *const T = std::ptr::from_ref(&**self).as_ptr();
         let offset = std::mem::offset_of!(_ArcInner<[T; 42]>, data);
@@ -187,7 +193,7 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Arc<[
                     data: DeviceOwnedPointer(
                         device_buffer
                             .as_device_ptr()
-                            .as_raw_mut()
+                            .as_mut_ptr()
                             .byte_add(header_len * std::mem::size_of::<T>() - offset)
                             .cast(),
                     ),
diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs
index 305072a34..9c16f07a4 100644
--- a/src/lend/impls/box.rs
+++ b/src/lend/impls/box.rs
@@ -4,7 +4,7 @@ use std::mem::ManuallyDrop;
 use const_type_layout::{TypeGraphLayout, TypeLayout};
 
 #[cfg(feature = "host")]
-use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox};
+use cust::{error::CudaResult, memory::DeviceBox, memory::LockedBox};
 
 use crate::{
     deps::alloc::boxed::Box,
@@ -47,13 +47,13 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Box<T> {
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
-        let mut device_box = CudaDropWrapper::from(DeviceBox::new(
+        let device_box = CudaDropWrapper::from(DeviceBox::new(
             DeviceCopyWithPortableBitSemantics::from_ref(&**self),
         )?);
 
         Ok((
             DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer(
-                device_box.as_device_ptr().as_raw_mut().cast(),
+                device_box.as_device_ptr().as_mut_ptr().cast(),
             ))),
             CombinedCudaAlloc::new(device_box, alloc),
         ))
@@ -64,7 +64,7 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Box<T> {
         &mut self,
         alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
     ) -> CudaResult<A> {
-        use rustacuda::memory::CopyDestination;
+        use cust::memory::CopyDestination;
 
         let (alloc_front, alloc_tail) = alloc.split();
 
@@ -90,20 +90,20 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<T
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )> {
-        use rustacuda::memory::AsyncCopyDestination;
+        use cust::memory::AsyncCopyDestination;
 
         let locked_box = unsafe {
-            let mut uninit = CudaDropWrapper::from(LockedBox::<
+            let uninit = CudaDropWrapper::from(LockedBox::<
                 DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
             >::uninitialized()?);
             std::ptr::copy_nonoverlapping(
                 std::ptr::from_ref::<T>(&**self)
                     .cast::<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>(),
-                uninit.as_mut_ptr(),
+                uninit.as_raw(),
                 1,
             );
             uninit
@@ -112,12 +112,12 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<T
         let mut device_box = CudaDropWrapper::from(DeviceBox::<
             DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
         >::uninitialized()?);
-        device_box.async_copy_from(&*locked_box, &stream)?;
+        device_box.async_copy_from(&**locked_box, &stream)?;
 
         Ok((
             Async::pending(
                 DeviceAccessible::from(BoxCudaRepresentation(DeviceOwnedPointer(
-                    device_box.as_device_ptr().as_raw_mut().cast(),
+                    device_box.as_device_ptr().as_mut_ptr().cast(),
                 ))),
                 stream,
                 NoCompletion,
@@ -135,12 +135,12 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<T
         Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
         A,
     )> {
-        use rustacuda::memory::AsyncCopyDestination;
+        use cust::memory::AsyncCopyDestination;
 
         let (alloc_front, alloc_tail) = alloc.split();
         let (mut locked_box, device_box) = alloc_front.split();
 
-        device_box.async_copy_to(&mut *locked_box, &stream)?;
+        device_box.async_copy_to(&mut **locked_box, &stream)?;
 
         let r#async = crate::utils::r#async::Async::<_, CompletionFnMut<'a, Self>>::pending(
             this,
@@ -151,7 +151,7 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<T
                 // Safety: equivalent to *data = *locked_box since
                 //         LockedBox<ManuallyDrop<T>> doesn't drop T
                 unsafe {
-                    std::ptr::copy_nonoverlapping(locked_box.as_ptr().cast::<T>(), data, 1);
+                    std::ptr::copy_nonoverlapping(locked_box.as_raw().cast::<T>(), data, 1);
                 }
                 std::mem::drop(locked_box);
                 Ok(())
diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs
index b2c22765c..8b0937b06 100644
--- a/src/lend/impls/boxed_slice.rs
+++ b/src/lend/impls/boxed_slice.rs
@@ -7,7 +7,7 @@ use crate::{deps::alloc::boxed::Box, lend::RustToCudaAsync, utils::ffi::DeviceOw
 use const_type_layout::{TypeGraphLayout, TypeLayout};
 
 #[cfg(feature = "host")]
-use rustacuda::{error::CudaResult, memory::DeviceBuffer, memory::LockedBuffer};
+use cust::{error::CudaResult, memory::DeviceBuffer, memory::LockedBuffer};
 
 use crate::{
     lend::{CudaAsRust, RustToCuda},
@@ -51,13 +51,13 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Box<[T]> {
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
-        let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice(
+        let device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice(
             DeviceCopyWithPortableBitSemantics::from_slice(self),
         )?);
 
         Ok((
             DeviceAccessible::from(BoxedSliceCudaRepresentation {
-                data: DeviceOwnedPointer(device_buffer.as_mut_ptr().cast()),
+                data: DeviceOwnedPointer(device_buffer.as_device_ptr().as_mut_ptr().cast()),
                 len: device_buffer.len(),
                 _marker: PhantomData::<T>,
             }),
@@ -70,7 +70,7 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Box<[T]> {
         &mut self,
         alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
     ) -> CudaResult<A> {
-        use rustacuda::memory::CopyDestination;
+        use cust::memory::CopyDestination;
 
         let (alloc_front, alloc_tail) = alloc.split();
 
@@ -96,11 +96,11 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<[
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )> {
-        use rustacuda::memory::AsyncCopyDestination;
+        use cust::memory::AsyncCopyDestination;
 
         let locked_buffer = unsafe {
             let mut uninit = CudaDropWrapper::from(LockedBuffer::<
@@ -124,7 +124,7 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<[
         Ok((
             Async::pending(
                 DeviceAccessible::from(BoxedSliceCudaRepresentation {
-                    data: DeviceOwnedPointer(device_buffer.as_mut_ptr().cast()),
+                    data: DeviceOwnedPointer(device_buffer.as_device_ptr().as_mut_ptr().cast()),
                     len: device_buffer.len(),
                     _marker: PhantomData::<T>,
                 }),
@@ -144,7 +144,7 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Box<[
         Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
         A,
     )> {
-        use rustacuda::memory::AsyncCopyDestination;
+        use cust::memory::AsyncCopyDestination;
 
         let (alloc_front, alloc_tail) = alloc.split();
         let (mut locked_buffer, device_buffer) = alloc_front.split();
diff --git a/src/lend/impls/final.rs b/src/lend/impls/final.rs
index fa83de5a2..51b228c24 100644
--- a/src/lend/impls/final.rs
+++ b/src/lend/impls/final.rs
@@ -19,7 +19,7 @@ unsafe impl<T: RustToCuda> RustToCuda for Final<T> {
     unsafe fn borrow<A: crate::alloc::CudaAlloc>(
         &self,
         alloc: A,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         DeviceAccessible<Self::CudaRepresentation>,
         crate::alloc::CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
@@ -35,7 +35,7 @@ unsafe impl<T: RustToCuda> RustToCuda for Final<T> {
     unsafe fn restore<A: crate::alloc::CudaAlloc>(
         &mut self,
         alloc: crate::alloc::CombinedCudaAlloc<Self::CudaAllocation, A>,
-    ) -> rustacuda::error::CudaResult<A> {
+    ) -> cust::error::CudaResult<A> {
         let (_alloc_front, alloc_tail) = alloc.split();
         Ok(alloc_tail)
     }
@@ -49,7 +49,7 @@ unsafe impl<T: RustToCudaAsync> RustToCudaAsync for Final<T> {
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         crate::utils::r#async::Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         crate::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )> {
@@ -76,7 +76,7 @@ unsafe impl<T: RustToCudaAsync> RustToCudaAsync for Final<T> {
         this: owning_ref::BoxRefMut<'a, O, Self>,
         alloc: crate::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         crate::utils::r#async::Async<
             'a,
             'stream,
diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs
index 1997822a2..931c7e952 100644
--- a/src/lend/impls/option.rs
+++ b/src/lend/impls/option.rs
@@ -3,7 +3,7 @@ use core::mem::MaybeUninit;
 use const_type_layout::{TypeGraphLayout, TypeLayout};
 
 #[cfg(feature = "host")]
-use rustacuda::error::CudaResult;
+use cust::error::CudaResult;
 
 use crate::{
     lend::{CudaAsRust, RustToCuda, RustToCudaAsync, RustToCudaProxy},
diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs
index 43358c546..99318e055 100644
--- a/src/lend/impls/ref.rs
+++ b/src/lend/impls/ref.rs
@@ -5,7 +5,7 @@ use std::mem::ManuallyDrop;
 use const_type_layout::{TypeGraphLayout, TypeLayout};
 
 #[cfg(feature = "host")]
-use rustacuda::{error::CudaResult, memory::DeviceBox, memory::LockedBox};
+use cust::{error::CudaResult, memory::DeviceBox, memory::LockedBox};
 
 use crate::{
     lend::{CudaAsRust, RustToCuda, RustToCudaAsync},
@@ -48,13 +48,13 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
-        let mut device_box = CudaDropWrapper::from(DeviceBox::new(
+        let device_box = CudaDropWrapper::from(DeviceBox::new(
             DeviceCopyWithPortableBitSemantics::from_ref(&**self),
         )?);
 
         Ok((
             DeviceAccessible::from(RefCudaRepresentation {
-                data: DeviceConstPointer(device_box.as_device_ptr().as_raw().cast()),
+                data: DeviceConstPointer(device_box.as_device_ptr().as_ptr().cast()),
                 _marker: PhantomData::<&'a T>,
             }),
             CombinedCudaAlloc::new(device_box, alloc),
@@ -85,20 +85,20 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )> {
-        use rustacuda::memory::AsyncCopyDestination;
+        use cust::memory::AsyncCopyDestination;
 
         let locked_box = unsafe {
-            let mut uninit = CudaDropWrapper::from(LockedBox::<
+            let uninit = CudaDropWrapper::from(LockedBox::<
                 DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
             >::uninitialized()?);
             std::ptr::copy_nonoverlapping(
                 std::ptr::from_ref::<T>(&**self)
                     .cast::<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>(),
-                uninit.as_mut_ptr(),
+                uninit.as_raw(),
                 1,
             );
             uninit
@@ -107,12 +107,12 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &
         let mut device_box = CudaDropWrapper::from(DeviceBox::<
             DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>,
         >::uninitialized()?);
-        device_box.async_copy_from(&*locked_box, &stream)?;
+        device_box.async_copy_from(&**locked_box, &stream)?;
 
         Ok((
             Async::pending(
                 DeviceAccessible::from(RefCudaRepresentation {
-                    data: DeviceConstPointer(device_box.as_device_ptr().as_raw().cast()),
+                    data: DeviceConstPointer(device_box.as_device_ptr().as_ptr().cast()),
                     _marker: PhantomData::<&T>,
                 }),
                 stream,
diff --git a/src/lend/impls/ref_mut.rs b/src/lend/impls/ref_mut.rs
index ca9830c75..6945c9bc2 100644
--- a/src/lend/impls/ref_mut.rs
+++ b/src/lend/impls/ref_mut.rs
@@ -3,7 +3,7 @@ use core::marker::PhantomData;
 use const_type_layout::{TypeGraphLayout, TypeLayout};
 
 #[cfg(feature = "host")]
-use rustacuda::{error::CudaResult, memory::DeviceBox};
+use cust::{error::CudaResult, memory::DeviceBox};
 
 use crate::{
     lend::{CudaAsRust, RustToCuda},
@@ -45,13 +45,13 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
-        let mut device_box = CudaDropWrapper::from(DeviceBox::new(
+        let device_box = CudaDropWrapper::from(DeviceBox::new(
             DeviceCopyWithPortableBitSemantics::from_ref(&**self),
         )?);
 
         Ok((
             DeviceAccessible::from(RefMutCudaRepresentation {
-                data: DeviceMutPointer(device_box.as_device_ptr().as_raw_mut().cast()),
+                data: DeviceMutPointer(device_box.as_device_ptr().as_mut_ptr().cast()),
                 _marker: PhantomData::<&'a mut T>,
             }),
             CombinedCudaAlloc::new(device_box, alloc),
@@ -63,7 +63,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu
         &mut self,
         alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
     ) -> CudaResult<A> {
-        use rustacuda::memory::CopyDestination;
+        use cust::memory::CopyDestination;
 
         let (alloc_front, alloc_tail) = alloc.split();
 
diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs
index 0a97b673f..53558a17c 100644
--- a/src/lend/impls/slice_ref.rs
+++ b/src/lend/impls/slice_ref.rs
@@ -5,7 +5,7 @@ use std::mem::ManuallyDrop;
 use const_type_layout::{TypeGraphLayout, TypeLayout};
 
 #[cfg(feature = "host")]
-use rustacuda::{error::CudaResult, memory::DeviceBuffer, memory::LockedBuffer};
+use cust::{error::CudaResult, memory::DeviceBuffer, memory::LockedBuffer};
 
 use crate::{
     lend::{CudaAsRust, RustToCuda, RustToCudaAsync},
@@ -56,7 +56,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T
 
         Ok((
             DeviceAccessible::from(SliceRefCudaRepresentation {
-                data: DeviceConstPointer(device_buffer.as_ptr().cast()),
+                data: DeviceConstPointer(device_buffer.as_device_ptr().as_ptr().cast()),
                 len: device_buffer.len(),
                 _marker: PhantomData::<&'a [T]>,
             }),
@@ -88,11 +88,11 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )> {
-        use rustacuda::memory::AsyncCopyDestination;
+        use cust::memory::AsyncCopyDestination;
 
         let locked_buffer = unsafe {
             let mut uninit = CudaDropWrapper::from(LockedBuffer::<
@@ -116,7 +116,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &
         Ok((
             Async::pending(
                 DeviceAccessible::from(SliceRefCudaRepresentation {
-                    data: DeviceConstPointer(device_buffer.as_ptr().cast()),
+                    data: DeviceConstPointer(device_buffer.as_device_ptr().as_ptr().cast()),
                     len: device_buffer.len(),
                     _marker: PhantomData::<&'a [T]>,
                 }),
diff --git a/src/lend/impls/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs
index 0300735cd..59d9eeff5 100644
--- a/src/lend/impls/slice_ref_mut.rs
+++ b/src/lend/impls/slice_ref_mut.rs
@@ -3,7 +3,7 @@ use core::marker::PhantomData;
 use const_type_layout::{TypeGraphLayout, TypeLayout};
 
 #[cfg(feature = "host")]
-use rustacuda::{error::CudaResult, memory::DeviceBuffer};
+use cust::{error::CudaResult, memory::DeviceBuffer};
 
 use crate::{
     lend::{CudaAsRust, RustToCuda},
@@ -47,13 +47,13 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
-        let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice(
+        let device_buffer = CudaDropWrapper::from(DeviceBuffer::from_slice(
             DeviceCopyWithPortableBitSemantics::from_slice(self),
         )?);
 
         Ok((
             DeviceAccessible::from(SliceRefMutCudaRepresentation {
-                data: DeviceMutPointer(device_buffer.as_mut_ptr().cast()),
+                data: DeviceMutPointer(device_buffer.as_device_ptr().as_mut_ptr().cast()),
                 len: device_buffer.len(),
                 _marker: PhantomData::<&'a mut [T]>,
             }),
@@ -66,7 +66,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a mu
         &mut self,
         alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
     ) -> CudaResult<A> {
-        use rustacuda::memory::CopyDestination;
+        use cust::memory::CopyDestination;
 
         let (alloc_front, alloc_tail) = alloc.split();
 
diff --git a/src/lend/mod.rs b/src/lend/mod.rs
index 3bca11f75..7296473e9 100644
--- a/src/lend/mod.rs
+++ b/src/lend/mod.rs
@@ -1,6 +1,6 @@
 use const_type_layout::TypeGraphLayout;
 #[cfg(feature = "host")]
-use rustacuda::error::CudaError;
+use cust::error::CudaError;
 
 #[cfg(feature = "derive")]
 #[expect(clippy::module_name_repetitions)]
@@ -34,7 +34,7 @@ pub unsafe trait RustToCuda {
     #[cfg(feature = "host")]
     /// # Errors
     ///
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     ///
     /// # Safety
@@ -46,7 +46,7 @@ pub unsafe trait RustToCuda {
     unsafe fn borrow<A: CudaAlloc>(
         &self,
         alloc: A,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )>;
@@ -55,7 +55,7 @@ pub unsafe trait RustToCuda {
     #[cfg(feature = "host")]
     /// # Errors
     ///
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     ///
     /// # Safety
@@ -64,7 +64,7 @@ pub unsafe trait RustToCuda {
     unsafe fn restore<A: CudaAlloc>(
         &mut self,
         alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
-    ) -> rustacuda::error::CudaResult<A>;
+    ) -> cust::error::CudaResult<A>;
 }
 
 /// # Safety
@@ -78,7 +78,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda {
     #[cfg(feature = "host")]
     /// # Errors
     ///
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     ///
     /// # Safety
@@ -101,7 +101,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda {
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )>;
@@ -110,7 +110,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda {
     #[cfg(feature = "host")]
     /// # Errors
     ///
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     ///
     /// # Safety
@@ -127,7 +127,7 @@ pub unsafe trait RustToCudaAsync: RustToCuda {
         this: owning_ref::BoxRefMut<'a, O, Self>,
         alloc: CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
         A,
     )>;
@@ -187,7 +187,7 @@ pub trait LendToCuda: RustToCuda {
     ///
     /// # Errors
     ///
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a `cust::error::CudaError` iff an error occurs inside CUDA
     fn lend_to_cuda_mut<
         O,
         E: From<CudaError>,
@@ -339,7 +339,7 @@ pub trait LendToCudaAsync: RustToCudaAsync {
     ///
     /// # Errors
     ///
-    /// Returns a `rustacuda::errors::CudaError` iff an error occurs inside CUDA
+    /// Returns a `cust::error::CudaError` iff an error occurs inside CUDA
     fn lend_to_cuda_mut_async<
         'a,
         'stream,
diff --git a/src/utils/adapter.rs b/src/utils/adapter.rs
index bc8bd161b..c8a533d80 100644
--- a/src/utils/adapter.rs
+++ b/src/utils/adapter.rs
@@ -124,7 +124,7 @@ unsafe impl<T: Copy + PortableBitSemantics + TypeGraphLayout> RustToCuda
     unsafe fn borrow<A: CudaAlloc>(
         &self,
         alloc: A,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
@@ -136,7 +136,7 @@ unsafe impl<T: Copy + PortableBitSemantics + TypeGraphLayout> RustToCuda
     unsafe fn restore<A: CudaAlloc>(
         &mut self,
         alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
-    ) -> rustacuda::error::CudaResult<A> {
+    ) -> cust::error::CudaResult<A> {
         let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split();
 
         Ok(alloc_tail)
@@ -153,7 +153,7 @@ unsafe impl<T: Copy + PortableBitSemantics + TypeGraphLayout> RustToCudaAsync
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         crate::utils::r#async::Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
@@ -169,7 +169,7 @@ unsafe impl<T: Copy + PortableBitSemantics + TypeGraphLayout> RustToCudaAsync
         this: owning_ref::BoxRefMut<'a, O, Self>,
         alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         crate::utils::r#async::Async<
             'a,
             'stream,
@@ -312,7 +312,7 @@ unsafe impl<T: Clone + PortableBitSemantics + TypeGraphLayout> RustToCuda
     unsafe fn borrow<A: CudaAlloc>(
         &self,
         alloc: A,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
@@ -324,7 +324,7 @@ unsafe impl<T: Clone + PortableBitSemantics + TypeGraphLayout> RustToCuda
     unsafe fn restore<A: CudaAlloc>(
         &mut self,
         alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
-    ) -> rustacuda::error::CudaResult<A> {
+    ) -> cust::error::CudaResult<A> {
         let (_alloc_front, alloc_tail): (NoCudaAlloc, A) = alloc.split();
 
         Ok(alloc_tail)
@@ -341,7 +341,7 @@ unsafe impl<T: Clone + PortableBitSemantics + TypeGraphLayout> RustToCudaAsync
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         crate::utils::r#async::Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
@@ -357,7 +357,7 @@ unsafe impl<T: Clone + PortableBitSemantics + TypeGraphLayout> RustToCudaAsync
         this: owning_ref::BoxRefMut<'a, O, Self>,
         alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         crate::utils::r#async::Async<
             'a,
             'stream,
@@ -394,7 +394,7 @@ unsafe impl<T: Clone + PortableBitSemantics + TypeGraphLayout> CudaAsRust
 #[repr(transparent)]
 pub struct DeviceCopyWithPortableBitSemantics<T: PortableBitSemantics + TypeGraphLayout>(T);
 
-unsafe impl<T: PortableBitSemantics + TypeGraphLayout> rustacuda_core::DeviceCopy
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> cust_core::DeviceCopy
     for DeviceCopyWithPortableBitSemantics<T>
 {
 }
diff --git a/src/utils/aliasing/const.rs b/src/utils/aliasing/const.rs
index 4cd6eb228..624aa1ea5 100644
--- a/src/utils/aliasing/const.rs
+++ b/src/utils/aliasing/const.rs
@@ -193,7 +193,7 @@ unsafe impl<T: RustToCuda, const STRIDE: usize> RustToCuda
     unsafe fn borrow<A: crate::alloc::CudaAlloc>(
         &self,
         alloc: A,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         DeviceAccessible<Self::CudaRepresentation>,
         crate::alloc::CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
@@ -209,7 +209,7 @@ unsafe impl<T: RustToCuda, const STRIDE: usize> RustToCuda
     unsafe fn restore<A: crate::alloc::CudaAlloc>(
         &mut self,
         alloc: crate::alloc::CombinedCudaAlloc<Self::CudaAllocation, A>,
-    ) -> rustacuda::error::CudaResult<A> {
+    ) -> cust::error::CudaResult<A> {
         self.0.restore(alloc)
     }
 }
@@ -224,7 +224,7 @@ unsafe impl<T: RustToCudaAsync, const STRIDE: usize> RustToCudaAsync
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         crate::utils::r#async::Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         crate::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )> {
@@ -252,7 +252,7 @@ unsafe impl<T: RustToCudaAsync, const STRIDE: usize> RustToCudaAsync
         this: owning_ref::BoxRefMut<'a, O, Self>,
         alloc: crate::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         crate::utils::r#async::Async<
             'a,
             'stream,
diff --git a/src/utils/aliasing/dynamic.rs b/src/utils/aliasing/dynamic.rs
index 2e16bf42e..2fd8c3646 100644
--- a/src/utils/aliasing/dynamic.rs
+++ b/src/utils/aliasing/dynamic.rs
@@ -170,7 +170,7 @@ unsafe impl<T: RustToCuda> RustToCuda for SplitSliceOverCudaThreadsDynamicStride
     unsafe fn borrow<A: crate::alloc::CudaAlloc>(
         &self,
         alloc: A,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         DeviceAccessible<Self::CudaRepresentation>,
         crate::alloc::CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
@@ -189,7 +189,7 @@ unsafe impl<T: RustToCuda> RustToCuda for SplitSliceOverCudaThreadsDynamicStride
     unsafe fn restore<A: crate::alloc::CudaAlloc>(
         &mut self,
         alloc: crate::alloc::CombinedCudaAlloc<Self::CudaAllocation, A>,
-    ) -> rustacuda::error::CudaResult<A> {
+    ) -> cust::error::CudaResult<A> {
         self.inner.restore(alloc)
     }
 }
@@ -202,7 +202,7 @@ unsafe impl<T: RustToCudaAsync> RustToCudaAsync for SplitSliceOverCudaThreadsDyn
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         crate::utils::r#async::Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         crate::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )> {
@@ -232,7 +232,7 @@ unsafe impl<T: RustToCudaAsync> RustToCudaAsync for SplitSliceOverCudaThreadsDyn
         this: owning_ref::BoxRefMut<'a, O, Self>,
         alloc: crate::alloc::CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         crate::utils::r#async::Async<
             'a,
             'stream,
diff --git a/src/utils/async.rs b/src/utils/async.rs
index 6221447a8..899e94cdb 100644
--- a/src/utils/async.rs
+++ b/src/utils/async.rs
@@ -2,7 +2,7 @@
 use std::{borrow::BorrowMut, future::Future, future::IntoFuture, marker::PhantomData, task::Poll};
 
 #[cfg(feature = "host")]
-use rustacuda::{
+use cust::{
     error::CudaError, error::CudaResult, event::Event, event::EventFlags,
     stream::StreamWaitEventFlags,
 };
@@ -136,7 +136,7 @@ impl<'a, 'stream, T: BorrowMut<C::Completed>, C: Completion<T>> Async<'a, 'strea
     /// such that its computation can be synchronised on.
     ///
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA.
     pub fn pending(value: T, stream: Stream<'stream>, completion: C) -> CudaResult<Self> {
         let (sender, receiver) = oneshot::channel();
@@ -160,11 +160,11 @@ impl<'a, 'stream, T: BorrowMut<C::Completed>, C: Completion<T>> Async<'a, 'strea
     /// operations.
     ///
     /// Calling `synchronize` after the computation has completed, e.g. after
-    /// calling [`rustacuda::stream::Stream::synchronize`], should be very
+    /// calling [`cust::stream::Stream::synchronize`], should be very
     /// cheap.
     ///
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA.
     pub fn synchronize(self) -> CudaResult<T> {
         let (_stream, mut value, status) = self.destructure_into_parts();
@@ -198,7 +198,7 @@ impl<'a, 'stream, T: BorrowMut<C::Completed>, C: Completion<T>> Async<'a, 'strea
     /// used on the new one.
     ///
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA.
     pub fn move_to_stream<'stream_new>(
         self,
diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs
index f5a3e5308..5c766fcae 100644
--- a/src/utils/exchange/buffer/host.rs
+++ b/src/utils/exchange/buffer/host.rs
@@ -4,7 +4,7 @@ use std::{
 };
 
 use const_type_layout::TypeGraphLayout;
-use rustacuda::{
+use cust::{
     error::CudaResult,
     memory::{DeviceBuffer, LockedBuffer},
 };
@@ -45,7 +45,7 @@ impl<
     > CudaExchangeBufferHost<T, M2D, M2H>
 {
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     pub fn new(elem: &T, capacity: usize) -> CudaResult<Self> {
         // Safety: CudaExchangeItem is a `repr(transparent)` wrapper around T
@@ -70,7 +70,7 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
     CudaExchangeBufferHost<T, M2D, M2H>
 {
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     pub fn from_vec(vec: Vec<T>) -> CudaResult<Self> {
         let host_buffer = unsafe {
@@ -127,7 +127,7 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
     pub unsafe fn borrow<A: CudaAlloc>(
         &self,
         alloc: A,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         DeviceAccessible<CudaExchangeBufferCudaRepresentation<T, M2D, M2H>>,
         CombinedCudaAlloc<NoCudaAlloc, A>,
     )> {
@@ -138,7 +138,7 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
         if M2D {
             // Only move the buffer contents to the device if needed
 
-            rustacuda::memory::CopyDestination::copy_from(
+            cust::memory::CopyDestination::copy_from(
                 &mut ***device_buffer,
                 self.host_buffer.as_slice(),
             )?;
@@ -146,7 +146,7 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
 
         Ok((
             DeviceAccessible::from(CudaExchangeBufferCudaRepresentation(
-                DeviceMutPointer(device_buffer.as_mut_ptr().cast()),
+                DeviceMutPointer(device_buffer.as_device_ptr().as_mut_ptr().cast()),
                 device_buffer.len(),
             )),
             CombinedCudaAlloc::new(NoCudaAlloc, alloc),
@@ -156,13 +156,13 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
     pub unsafe fn restore<A: CudaAlloc>(
         &mut self,
         alloc: CombinedCudaAlloc<NoCudaAlloc, A>,
-    ) -> rustacuda::error::CudaResult<A> {
+    ) -> cust::error::CudaResult<A> {
         let (_alloc_front, alloc_tail) = alloc.split();
 
         if M2H {
             // Only move the buffer contents back to the host if needed
 
-            rustacuda::memory::CopyDestination::copy_to(
+            cust::memory::CopyDestination::copy_to(
                 &***self.device_buffer.get_mut(),
                 self.host_buffer.as_mut_slice(),
             )?;
@@ -180,7 +180,7 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'_, 'stream, DeviceAccessible<CudaExchangeBufferCudaRepresentation<T, M2D, M2H>>>,
         CombinedCudaAlloc<NoCudaAlloc, A>,
     )> {
@@ -191,7 +191,7 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
         if M2D {
             // Only move the buffer contents to the device if needed
 
-            rustacuda::memory::AsyncCopyDestination::async_copy_from(
+            cust::memory::AsyncCopyDestination::async_copy_from(
                 &mut ***device_buffer,
                 self.host_buffer.as_slice(),
                 &stream,
@@ -199,7 +199,7 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
         }
 
         let cuda_repr = DeviceAccessible::from(CudaExchangeBufferCudaRepresentation(
-            DeviceMutPointer(device_buffer.as_mut_ptr().cast()),
+            DeviceMutPointer(device_buffer.as_device_ptr().as_mut_ptr().cast()),
             device_buffer.len(),
         ));
 
@@ -217,7 +217,7 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
         mut this: owning_ref::BoxRefMut<'a, O, Self>,
         alloc: CombinedCudaAlloc<NoCudaAlloc, A>,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
         A,
     )> {
@@ -228,7 +228,7 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
 
             let this: &mut Self = &mut this;
 
-            rustacuda::memory::AsyncCopyDestination::async_copy_to(
+            cust::memory::AsyncCopyDestination::async_copy_to(
                 &***this.device_buffer.get_mut(),
                 this.host_buffer.as_mut_slice(),
                 &stream,
diff --git a/src/utils/exchange/buffer/mod.rs b/src/utils/exchange/buffer/mod.rs
index fe4bec276..782dbab56 100644
--- a/src/utils/exchange/buffer/mod.rs
+++ b/src/utils/exchange/buffer/mod.rs
@@ -63,9 +63,9 @@ impl<
     > CudaExchangeBuffer<T, M2D, M2H>
 {
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
-    pub fn new(elem: &T, capacity: usize) -> rustacuda::error::CudaResult<Self> {
+    pub fn new(elem: &T, capacity: usize) -> cust::error::CudaResult<Self> {
         Ok(Self {
             inner: host::CudaExchangeBufferHost::new(elem, capacity)?,
         })
@@ -77,9 +77,9 @@ impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bool, con
     CudaExchangeBuffer<T, M2D, M2H>
 {
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
-    pub fn from_vec(vec: Vec<T>) -> rustacuda::error::CudaResult<Self> {
+    pub fn from_vec(vec: Vec<T>) -> cust::error::CudaResult<Self> {
         Ok(Self {
             inner: host::CudaExchangeBufferHost::from_vec(vec)?,
         })
@@ -117,7 +117,7 @@ unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bo
     unsafe fn borrow<A: CudaAlloc>(
         &self,
         alloc: A,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
@@ -128,7 +128,7 @@ unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bo
     unsafe fn restore<A: CudaAlloc>(
         &mut self,
         alloc: CombinedCudaAlloc<Self::CudaAllocation, A>,
-    ) -> rustacuda::error::CudaResult<A> {
+    ) -> cust::error::CudaResult<A> {
         self.inner.restore(alloc)
     }
 }
@@ -144,7 +144,7 @@ unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bo
         &self,
         alloc: A,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'_, 'stream, DeviceAccessible<Self::CudaRepresentation>>,
         CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
     )> {
@@ -156,7 +156,7 @@ unsafe impl<T: StackOnly + PortableBitSemantics + TypeGraphLayout, const M2D: bo
         this: owning_ref::BoxRefMut<'a, O, Self>,
         alloc: CombinedCudaAlloc<Self::CudaAllocationAsync, A>,
         stream: crate::host::Stream<'stream>,
-    ) -> rustacuda::error::CudaResult<(
+    ) -> cust::error::CudaResult<(
         Async<'a, 'stream, owning_ref::BoxRefMut<'a, O, Self>, CompletionFnMut<'a, Self>>,
         A,
     )> {
diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs
index ed15c63de..36bd68614 100644
--- a/src/utils/exchange/wrapper.rs
+++ b/src/utils/exchange/wrapper.rs
@@ -1,6 +1,6 @@
 use std::ops::{Deref, DerefMut};
 
-use rustacuda::{
+use cust::{
     error::CudaResult,
     memory::{AsyncCopyDestination, CopyDestination, DeviceBox, LockedBox},
 };
@@ -55,7 +55,7 @@ pub struct ExchangeWrapperOnDevice<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>
 
 impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> ExchangeWrapperOnHost<T> {
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     pub fn new(value: T) -> CudaResult<Self> {
         // Safety: The uninitialised memory is never exposed
@@ -65,13 +65,13 @@ impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> ExchangeWrapperOnHost<T> {
 
         let (cuda_repr, _null_alloc) = unsafe { value.borrow(NoCudaAlloc) }?;
         let locked_cuda_repr = unsafe {
-            let mut uninit = CudaDropWrapper::from(LockedBox::<
+            let uninit = CudaDropWrapper::from(LockedBox::<
                 DeviceCopyWithPortableBitSemantics<
                     DeviceAccessible<<T as RustToCuda>::CudaRepresentation>,
                 >,
             >::uninitialized()?);
             uninit
-                .as_mut_ptr()
+                .as_raw()
                 .write(DeviceCopyWithPortableBitSemantics::from(cuda_repr));
             uninit
         };
@@ -88,7 +88,7 @@ impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> ExchangeWrapperOnHost<T> {
     /// via [`ExchangeWrapperOnDevice::as_mut_async`](Async::as_mut_async).
     ///
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     pub fn move_to_device(mut self) -> CudaResult<ExchangeWrapperOnDevice<T>> {
         let (cuda_repr, null_alloc) = unsafe { self.value.borrow(NoCudaAlloc) }?;
@@ -113,7 +113,7 @@ impl<T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: Emp
     /// Moves the data asynchronously to the CUDA device.
     ///
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     pub fn move_to_device_async<'stream>(
         mut self,
@@ -130,7 +130,7 @@ impl<T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: Emp
         // - the kernel is launched on the passed-in [`Stream`]
         unsafe {
             self.device_box
-                .async_copy_from(&*self.locked_cuda_repr, &stream)
+                .async_copy_from(&**self.locked_cuda_repr, &stream)
         }?;
 
         Async::pending(
@@ -163,7 +163,7 @@ impl<T: RustToCuda<CudaAllocation: EmptyCudaAlloc>> ExchangeWrapperOnDevice<T> {
     /// Moves the data synchronously back to the host CPU device.
     ///
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     pub fn move_to_host(mut self) -> CudaResult<ExchangeWrapperOnHost<T>> {
         let null_alloc = NoCudaAlloc.into();
@@ -201,7 +201,7 @@ impl<T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: Emp
     /// Moves the data asynchronously back to the host CPU device.
     ///
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     pub fn move_to_host_async<'stream>(
         self,
@@ -259,7 +259,7 @@ impl<
     /// Moves the data asynchronously back to the host CPU device.
     ///
     /// # Errors
-    /// Returns a [`rustacuda::error::CudaError`] iff an error occurs inside
+    /// Returns a [`cust::error::CudaError`] iff an error occurs inside
     /// CUDA
     pub fn move_to_host_async(
         self,

From bfc332a917fb8d3c7ddbfbbc938d564de6f04498 Mon Sep 17 00:00:00 2001
From: Juniper Tyree <juniper.tyree@helsinki.fi>
Date: Sat, 8 Feb 2025 21:41:43 +0000
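Subject: [PATCH 02/10] Update to experimental cust fork

Point cust, cust_core and cust_derive at the juntyr/Rust-GPU-CUDA git
fork (rev 5365c14), use `cust::context::legacy::Context` in the print
example, and add a sealed drop impl for it.

Also bump the pinned toolchain to nightly-2025-01-03, drop the explicit
x86_64-unknown-linux-gnu target, and apply clippy-style cleanups
(elided lifetimes, `is_some_and`).

Note: the `arced_slice` lend impl and the
`const_intrinsic_compare_bytes` feature gate are commented out in this
commit.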

---
 Cargo.toml                              |  6 ++--
 examples/print/src/main.rs              |  6 ++--
 rust-cuda-kernel/src/kernel/link/mod.rs | 33 ++++++++--------------
 rust-cuda-kernel/src/kernel/lints.rs    |  2 +-
 rust-toolchain                          |  6 ++--
 src/host/mod.rs                         |  9 +++---
 src/kernel/mod.rs                       |  5 ++--
 src/kernel/param.rs                     | 37 +++++++++++++------------
 src/lend/impls/mod.rs                   |  2 +-
 src/lend/impls/ref.rs                   |  2 +-
 src/lend/impls/slice_ref.rs             |  1 +
 src/lib.rs                              |  2 +-
 src/safety/aliasing.rs                  |  6 ++--
 src/utils/async.rs                      | 16 +++++------
 src/utils/exchange/wrapper.rs           |  3 +-
 src/utils/ffi.rs                        | 14 +++++-----
 16 files changed, 69 insertions(+), 81 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 450c0a989..c0e307352 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -98,9 +98,9 @@ kernel = ["dep:rust-cuda-kernel"]
 [dependencies]
 const-type-layout = { version = "0.3.2", default-features = false, features = ["derive"] }
 # FIXME: cust fails to compile without the `bytemuck` feature
-cust = { version = "0.3.2", default-features = false, features = ["bytemuck"], optional = true }
-cust_core = { version = "0.1", default-features = false }
-cust_derive = { version = "0.2", default-features = false, optional = true }
+cust = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.3.2", default-features = false, features = ["bytemuck"], optional = true }
+cust_core = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.1", default-features = false }
+cust_derive = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.2", default-features = false, optional = true }
 final = { version = "0.1.1", default-features = false, optional = true }
 oneshot = { version = "0.1", default-features = false, features = ["std", "async"], optional = true }
 regex = { version = "1.10", default-features = false, optional = true }
diff --git a/examples/print/src/main.rs b/examples/print/src/main.rs
index 1998a7057..e4ae250d0 100644
--- a/examples/print/src/main.rs
+++ b/examples/print/src/main.rs
@@ -15,9 +15,9 @@ fn main() -> rust_cuda::deps::cust::error::CudaResult<()> {
 
     // Create a CUDA context associated to this device
     let _context = rust_cuda::host::CudaDropWrapper::from(
-        rust_cuda::deps::cust::context::Context::create_and_push(
-            rust_cuda::deps::cust::context::ContextFlags::MAP_HOST
-                | rust_cuda::deps::cust::context::ContextFlags::SCHED_AUTO,
+        rust_cuda::deps::cust::context::legacy::Context::create_and_push(
+            rust_cuda::deps::cust::context::legacy::ContextFlags::MAP_HOST
+                | rust_cuda::deps::cust::context::legacy::ContextFlags::SCHED_AUTO,
             device,
         )?,
     );
diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs
index 49db5c264..00f33edd3 100644
--- a/rust-cuda-kernel/src/kernel/link/mod.rs
+++ b/rust-cuda-kernel/src/kernel/link/mod.rs
@@ -189,6 +189,7 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> proc_macro2::TokenStrea
             );
         }
 
+        #[allow(clippy::literal_string_with_formatting_args)] // false positive
         if type_layout_metas
             .insert(String::from(param), bytes)
             .is_some()
@@ -320,8 +321,7 @@ fn check_kernel_ptx_and_report(
         Ok(None) => (),
         Ok(Some(binary)) => {
             if ptx_lint_levels
-                .get(&PtxLint::DumpAssembly)
-                .map_or(false, |level| *level > LintLevel::Allow)
+                .get(&PtxLint::DumpAssembly).is_some_and(|level| *level > LintLevel::Allow)
             {
                 const HEX: [char; 16] = [
                     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
@@ -335,8 +335,7 @@ fn check_kernel_ptx_and_report(
                 }
 
                 if ptx_lint_levels
-                    .get(&PtxLint::DumpAssembly)
-                    .map_or(false, |level| *level > LintLevel::Warn)
+                    .get(&PtxLint::DumpAssembly).is_some_and(|level| *level > LintLevel::Warn)
                 {
                     emit_call_site_error!(
                         "{} compiled binary:\n{}\n\n{}",
@@ -458,27 +457,22 @@ fn check_kernel_ptx(
             let mut options = options.clone();
 
             if ptx_lint_levels
-                .get(&PtxLint::Verbose)
-                .map_or(false, |level| *level > LintLevel::Warn)
+                .get(&PtxLint::Verbose).is_some_and(|level| *level > LintLevel::Warn)
             {
                 options.push(c"--verbose");
             }
             if ptx_lint_levels
-                .get(&PtxLint::DoublePrecisionUse)
-                .map_or(false, |level| *level > LintLevel::Warn)
+                .get(&PtxLint::DoublePrecisionUse).is_some_and(|level| *level > LintLevel::Warn)
             {
                 options.push(c"--warn-on-double-precision-use");
             }
             if ptx_lint_levels
-                .get(&PtxLint::LocalMemoryUse)
-                .map_or(false, |level| *level > LintLevel::Warn)
+                .get(&PtxLint::LocalMemoryUse).is_some_and(|level| *level > LintLevel::Warn)
             {
                 options.push(c"--warn-on-local-memory-usage");
             }
             if ptx_lint_levels
-                .get(&PtxLint::RegisterSpills)
-                .map_or(false, |level| *level > LintLevel::Warn)
-            {
+                .get(&PtxLint::RegisterSpills).is_some_and(|level| *level > LintLevel::Warn) {
                 options.push(c"--warn-on-spills");
             }
             if ptx_lint_levels
@@ -504,26 +498,21 @@ fn check_kernel_ptx(
         };
 
         if ptx_lint_levels
-            .get(&PtxLint::Verbose)
-            .map_or(false, |level| *level > LintLevel::Allow)
+            .get(&PtxLint::Verbose).is_some_and(|level| *level > LintLevel::Allow)
         {
             options.push(c"--verbose");
         }
         if ptx_lint_levels
-            .get(&PtxLint::DoublePrecisionUse)
-            .map_or(false, |level| *level > LintLevel::Allow)
-        {
+            .get(&PtxLint::DoublePrecisionUse).is_some_and(|level| *level > LintLevel::Allow) {
             options.push(c"--warn-on-double-precision-use");
         }
         if ptx_lint_levels
-            .get(&PtxLint::LocalMemoryUse)
-            .map_or(false, |level| *level > LintLevel::Allow)
+            .get(&PtxLint::LocalMemoryUse).is_some_and(|level| *level > LintLevel::Allow)
         {
             options.push(c"--warn-on-local-memory-usage");
         }
         if ptx_lint_levels
-            .get(&PtxLint::RegisterSpills)
-            .map_or(false, |level| *level > LintLevel::Allow)
+            .get(&PtxLint::RegisterSpills).is_some_and(|level| *level > LintLevel::Allow)
         {
             options.push(c"--warn-on-spills");
         }
diff --git a/rust-cuda-kernel/src/kernel/lints.rs b/rust-cuda-kernel/src/kernel/lints.rs
index dd85a289f..c5d05704d 100644
--- a/rust-cuda-kernel/src/kernel/lints.rs
+++ b/rust-cuda-kernel/src/kernel/lints.rs
@@ -180,7 +180,7 @@ pub trait NestedMetaParser {
     ) -> syn::Result<()>;
 }
 
-impl<'a> NestedMetaParser for syn::meta::ParseNestedMeta<'a> {
+impl NestedMetaParser for syn::meta::ParseNestedMeta<'_> {
     fn path(&self) -> &syn::Path {
         &self.path
     }
diff --git a/rust-toolchain b/rust-toolchain
index 071c4ebfe..2404f256b 100644
--- a/rust-toolchain
+++ b/rust-toolchain
@@ -1,5 +1,5 @@
 [toolchain]
-# Pin to final 1.81.0 nightly
-channel = "nightly-2024-07-21"
+# Pin to final 1.85.0 nightly
+channel = "nightly-2025-01-03"
 components = [ "cargo", "rustfmt", "clippy", "llvm-bitcode-linker", "llvm-tools" ]
-targets = [ "x86_64-unknown-linux-gnu", "nvptx64-nvidia-cuda" ]
+targets = [ "nvptx64-nvidia-cuda" ]
diff --git a/src/host/mod.rs b/src/host/mod.rs
index 782b589f8..c97452438 100644
--- a/src/host/mod.rs
+++ b/src/host/mod.rs
@@ -35,7 +35,7 @@ pub struct Stream<'stream> {
     _brand: InvariantLifetime<'stream>,
 }
 
-impl<'stream> Deref for Stream<'stream> {
+impl Deref for Stream<'_> {
     type Target = cust::stream::Stream;
 
     fn deref(&self) -> &Self::Target {
@@ -43,7 +43,7 @@ impl<'stream> Deref for Stream<'stream> {
     }
 }
 
-impl<'stream> Stream<'stream> {
+impl Stream<'_> {
     /// Create a new uniquely branded [`Stream`], which can bind async
     /// operations to the [`Stream`] that they are computed on.
     ///
@@ -152,6 +152,7 @@ macro_rules! impl_sealed_drop_value {
 impl_sealed_drop_value!(Module);
 impl_sealed_drop_value!(cust::stream::Stream);
 impl_sealed_drop_value!(Context);
+impl_sealed_drop_value!(cust::context::legacy::Context);
 impl_sealed_drop_value!(Event);
 
 #[expect(clippy::module_name_repetitions)]
@@ -271,13 +272,13 @@ pub struct HostAndDeviceConstRef<'a, T: PortableBitSemantics + TypeGraphLayout>
     host_ref: &'a T,
 }
 
-impl<'a, T: PortableBitSemantics + TypeGraphLayout> Clone for HostAndDeviceConstRef<'a, T> {
+impl<T: PortableBitSemantics + TypeGraphLayout> Clone for HostAndDeviceConstRef<'_, T> {
     fn clone(&self) -> Self {
         *self
     }
 }
 
-impl<'a, T: PortableBitSemantics + TypeGraphLayout> Copy for HostAndDeviceConstRef<'a, T> {}
+impl<T: PortableBitSemantics + TypeGraphLayout> Copy for HostAndDeviceConstRef<'_, T> {}
 
 impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceConstRef<'a, T> {
     /// # Errors
diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs
index 95d21457a..43f8fb563 100644
--- a/src/kernel/mod.rs
+++ b/src/kernel/mod.rs
@@ -7,12 +7,11 @@ use std::{
     ptr::NonNull,
 };
 
-use cust::module::{ModuleJitOption, OptLevel};
 #[cfg(feature = "host")]
 use cust::{
     error::{CudaError, CudaResult},
     function::Function,
-    module::Module,
+    module::{Module, ModuleJitOption, OptLevel},
 };
 
 #[cfg(feature = "kernel")]
@@ -226,7 +225,7 @@ macro_rules! impl_launcher_launch {
 }
 
 #[cfg(feature = "host")]
-impl<'stream, 'kernel, Kernel> Launcher<'stream, 'kernel, Kernel> {
+impl<'stream, Kernel> Launcher<'stream, '_, Kernel> {
     impl_launcher_launch! { launch0() => with0_async => launch0_async }
 
     impl_launcher_launch! { launch1(
diff --git a/src/kernel/param.rs b/src/kernel/param.rs
index 6d95224dc..a54044e2f 100644
--- a/src/kernel/param.rs
+++ b/src/kernel/param.rs
@@ -157,6 +157,7 @@ impl<
 {
 }
 
+#[cfg_attr(feature = "device", expect(clippy::needless_lifetimes))]
 impl<
         'a,
         T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
@@ -244,9 +245,8 @@ impl<
     }
 }
 impl<
-        'a,
         T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
-    > sealed::Sealed for &'a PerThreadShallowCopy<T>
+    > sealed::Sealed for &PerThreadShallowCopy<T>
 {
 }
 
@@ -342,9 +342,8 @@ impl<
     }
 }
 impl<
-        'a,
         T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
-    > sealed::Sealed for &'a PtxJit<PerThreadShallowCopy<T>>
+    > sealed::Sealed for &PtxJit<PerThreadShallowCopy<T>>
 {
 }
 
@@ -374,6 +373,7 @@ impl<
     }
 }
 
+#[cfg_attr(feature = "device", expect(clippy::needless_lifetimes))]
 impl<
         'a,
         T: Sync
@@ -467,13 +467,12 @@ impl<
     }
 }
 impl<
-        'a,
         T: crate::safety::StackOnly
             + Sync
             + crate::safety::PortableBitSemantics
             + TypeGraphLayout
             + InteriorMutableSync,
-    > sealed::Sealed for &'a ShallowInteriorMutable<T>
+    > sealed::Sealed for &ShallowInteriorMutable<T>
 {
 }
 
@@ -618,6 +617,7 @@ impl<
 {
 }
 
+#[cfg_attr(feature = "device", expect(clippy::needless_lifetimes))]
 impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow<T> {
     #[cfg(feature = "host")]
     type AsyncHostType<'stream, 'b>
@@ -707,8 +707,9 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow<T
         }
     }
 }
-impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a DeepPerThreadBorrow<T> {}
+impl<T: Sync + RustToCuda> sealed::Sealed for &DeepPerThreadBorrow<T> {}
 
+#[cfg_attr(feature = "device", expect(clippy::needless_lifetimes))]
 impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
     for &'a mut DeepPerThreadBorrow<T>
 {
@@ -806,8 +807,8 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
         }
     }
 }
-impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed
-    for &'a mut DeepPerThreadBorrow<T>
+impl<T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed
+    for &mut DeepPerThreadBorrow<T>
 {
 }
 
@@ -994,7 +995,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a PtxJit<DeepPerThreadB
         }
     }
 }
-impl<'a, T: Sync + RustToCuda> sealed::Sealed for &'a PtxJit<DeepPerThreadBorrow<T>> {}
+impl<T: Sync + RustToCuda> sealed::Sealed for &PtxJit<DeepPerThreadBorrow<T>> {}
 
 impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
     for &'a mut PtxJit<DeepPerThreadBorrow<T>>
@@ -1090,8 +1091,8 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
         }
     }
 }
-impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed
-    for &'a mut PtxJit<DeepPerThreadBorrow<T>>
+impl<T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed
+    for &mut PtxJit<DeepPerThreadBorrow<T>>
 {
 }
 
@@ -1135,7 +1136,7 @@ mod private_shared {
     }
 }
 
-impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::ThreadBlockShared<T> {
+impl<T: 'static> CudaKernelParameter for &mut crate::utils::shared::ThreadBlockShared<T> {
     #[cfg(feature = "host")]
     type AsyncHostType<'stream, 'b>
         = &'b mut crate::utils::shared::ThreadBlockShared<T>
@@ -1218,10 +1219,10 @@ impl<'a, T: 'static> CudaKernelParameter for &'a mut crate::utils::shared::Threa
         inner.with(&mut param)
     }
 }
-impl<'a, T: 'static> sealed::Sealed for &'a mut crate::utils::shared::ThreadBlockShared<T> {}
+impl<T: 'static> sealed::Sealed for &mut crate::utils::shared::ThreadBlockShared<T> {}
 
-impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter
-    for &'a mut crate::utils::shared::ThreadBlockSharedSlice<T>
+impl<T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParameter
+    for &mut crate::utils::shared::ThreadBlockSharedSlice<T>
 {
     #[cfg(feature = "host")]
     type AsyncHostType<'stream, 'b>
@@ -1307,7 +1308,7 @@ impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> CudaKernelParamete
         }
     }
 }
-impl<'a, T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed
-    for &'a mut crate::utils::shared::ThreadBlockSharedSlice<T>
+impl<T: 'static + PortableBitSemantics + TypeGraphLayout> sealed::Sealed
+    for &mut crate::utils::shared::ThreadBlockSharedSlice<T>
 {
 }
diff --git a/src/lend/impls/mod.rs b/src/lend/impls/mod.rs
index 7f7af6ad2..13ee7d6a0 100644
--- a/src/lend/impls/mod.rs
+++ b/src/lend/impls/mod.rs
@@ -1,5 +1,5 @@
 mod arc;
-mod arced_slice;
+// mod arced_slice;
 mod r#box;
 mod boxed_slice;
 #[cfg(feature = "final")]
diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs
index 99318e055..d49f31cc6 100644
--- a/src/lend/impls/ref.rs
+++ b/src/lend/impls/ref.rs
@@ -71,7 +71,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a T
     }
 }
 
-unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a T {
+unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &T {
     #[cfg(all(feature = "host", not(doc)))]
     type CudaAllocationAsync = CombinedCudaAlloc<
         CudaDropWrapper<LockedBox<DeviceCopyWithPortableBitSemantics<ManuallyDrop<T>>>>,
diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs
index 53558a17c..08f1c8418 100644
--- a/src/lend/impls/slice_ref.rs
+++ b/src/lend/impls/slice_ref.rs
@@ -74,6 +74,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T
     }
 }
 
+#[cfg_attr(feature = "device", expect(clippy::needless_lifetimes))]
 unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a [T] {
     #[cfg(all(feature = "host", not(doc)))]
     type CudaAllocationAsync = CombinedCudaAlloc<
diff --git a/src/lib.rs b/src/lib.rs
index 5605ad612..c065da2e0 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -48,7 +48,7 @@
 #![feature(generic_const_exprs)]
 #![expect(internal_features)]
 #![feature(core_intrinsics)]
-#![feature(const_intrinsic_compare_bytes)]
+// #![feature(const_intrinsic_compare_bytes)]
 #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")]
 
 #[cfg(all(feature = "host", feature = "device", not(doc)))]
diff --git a/src/safety/aliasing.rs b/src/safety/aliasing.rs
index 3a9cb8442..fefe7be0c 100644
--- a/src/safety/aliasing.rs
+++ b/src/safety/aliasing.rs
@@ -38,23 +38,21 @@
 pub unsafe trait SafeMutableAliasing {}
 
 unsafe impl<
-        'a,
         T: crate::safety::StackOnly
             + crate::safety::PortableBitSemantics
             + const_type_layout::TypeGraphLayout,
         const STRIDE: usize,
     > SafeMutableAliasing
-    for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride<&'a mut [T], STRIDE>
+    for crate::utils::aliasing::SplitSliceOverCudaThreadsConstStride<&mut [T], STRIDE>
 {
 }
 
 unsafe impl<
-        'a,
         T: crate::safety::StackOnly
             + crate::safety::PortableBitSemantics
             + const_type_layout::TypeGraphLayout,
     > SafeMutableAliasing
-    for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride<&'a mut [T]>
+    for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride<&mut [T]>
 {
 }
 
diff --git a/src/utils/async.rs b/src/utils/async.rs
index 899e94cdb..1dfb79745 100644
--- a/src/utils/async.rs
+++ b/src/utils/async.rs
@@ -55,7 +55,7 @@ impl<T: ?Sized> Completion<T> for NoCompletion {
 impl sealed::Sealed for NoCompletion {}
 
 #[cfg(feature = "host")]
-impl<'a, T: ?Sized + BorrowMut<B>, B: ?Sized> Completion<T> for CompletionFnMut<'a, B> {
+impl<T: ?Sized + BorrowMut<B>, B: ?Sized> Completion<T> for CompletionFnMut<'_, B> {
     type Completed = B;
 
     #[inline]
@@ -74,7 +74,7 @@ impl<'a, T: ?Sized + BorrowMut<B>, B: ?Sized> Completion<T> for CompletionFnMut<
     }
 }
 #[cfg(feature = "host")]
-impl<'a, T: ?Sized> sealed::Sealed for CompletionFnMut<'a, T> {}
+impl<T: ?Sized> sealed::Sealed for CompletionFnMut<'_, T> {}
 
 #[cfg(feature = "host")]
 impl<T: ?Sized + BorrowMut<C::Completed>, C: Completion<T>> Completion<T> for Option<C> {
@@ -87,7 +87,7 @@ impl<T: ?Sized + BorrowMut<C::Completed>, C: Completion<T>> Completion<T> for Op
 
     #[inline]
     fn synchronize_on_drop(&self) -> bool {
-        self.as_ref().map_or(false, Completion::synchronize_on_drop)
+        self.as_ref().is_some_and(Completion::synchronize_on_drop)
     }
 
     #[inline]
@@ -407,7 +407,7 @@ where
 }
 
 #[cfg(feature = "host")]
-impl<'a, 'stream, T: BorrowMut<C::Completed>, C: Completion<T>> Drop for Async<'a, 'stream, T, C> {
+impl<T: BorrowMut<C::Completed>, C: Completion<T>> Drop for Async<'_, '_, T, C> {
     fn drop(&mut self) {
         let AsyncStatus::Processing {
             receiver,
@@ -434,8 +434,8 @@ struct AsyncFuture<'a, 'stream, T: BorrowMut<C::Completed>, C: Completion<T>> {
 }
 
 #[cfg(feature = "host")]
-impl<'a, 'stream, T: BorrowMut<C::Completed>, C: Completion<T>> Future
-    for AsyncFuture<'a, 'stream, T, C>
+impl<T: BorrowMut<C::Completed>, C: Completion<T>> Future
+    for AsyncFuture<'_, '_, T, C>
 {
     type Output = CudaResult<T>;
 
@@ -517,8 +517,8 @@ impl<'a, 'stream, T: BorrowMut<C::Completed>, C: Completion<T>> IntoFuture
 }
 
 #[cfg(feature = "host")]
-impl<'a, 'stream, T: BorrowMut<C::Completed>, C: Completion<T>> Drop
-    for AsyncFuture<'a, 'stream, T, C>
+impl<T: BorrowMut<C::Completed>, C: Completion<T>> Drop
+    for AsyncFuture<'_, '_, T, C>
 {
     fn drop(&mut self) {
         let Some(mut value) = self.value.take() else {
diff --git a/src/utils/exchange/wrapper.rs b/src/utils/exchange/wrapper.rs
index 36bd68614..3c56ebfc1 100644
--- a/src/utils/exchange/wrapper.rs
+++ b/src/utils/exchange/wrapper.rs
@@ -251,10 +251,9 @@ impl<T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: Emp
 }
 
 impl<
-        'a,
         'stream,
         T: RustToCudaAsync<CudaAllocationAsync: EmptyCudaAlloc, CudaAllocation: EmptyCudaAlloc>,
-    > Async<'a, 'stream, ExchangeWrapperOnDevice<T>, NoCompletion>
+    > Async<'_, 'stream, ExchangeWrapperOnDevice<T>, NoCompletion>
 {
     /// Moves the data asynchronously back to the host CPU device.
     ///
diff --git a/src/utils/ffi.rs b/src/utils/ffi.rs
index 9566a0c40..f94af17d8 100644
--- a/src/utils/ffi.rs
+++ b/src/utils/ffi.rs
@@ -66,16 +66,16 @@ pub struct DeviceConstRef<'r, T: PortableBitSemantics + 'r> {
     pub(crate) reference: PhantomData<&'r T>,
 }
 
-impl<'r, T: PortableBitSemantics> Copy for DeviceConstRef<'r, T> {}
+impl<T: PortableBitSemantics> Copy for DeviceConstRef<'_, T> {}
 
-impl<'r, T: PortableBitSemantics> Clone for DeviceConstRef<'r, T> {
+impl<T: PortableBitSemantics> Clone for DeviceConstRef<'_, T> {
     fn clone(&self) -> Self {
         *self
     }
 }
 
 #[cfg(feature = "device")]
-impl<'r, T: PortableBitSemantics> AsRef<T> for DeviceConstRef<'r, T> {
+impl<T: PortableBitSemantics> AsRef<T> for DeviceConstRef<'_, T> {
     fn as_ref(&self) -> &T {
         unsafe { &*self.pointer.0 }
     }
@@ -90,14 +90,14 @@ pub struct DeviceMutRef<'r, T: PortableBitSemantics + 'r> {
 }
 
 #[cfg(feature = "device")]
-impl<'r, T: PortableBitSemantics> AsRef<T> for DeviceMutRef<'r, T> {
+impl<T: PortableBitSemantics> AsRef<T> for DeviceMutRef<'_, T> {
     fn as_ref(&self) -> &T {
         unsafe { &*self.pointer.0 }
     }
 }
 
 #[cfg(feature = "device")]
-impl<'r, T: PortableBitSemantics> AsMut<T> for DeviceMutRef<'r, T> {
+impl<T: PortableBitSemantics> AsMut<T> for DeviceMutRef<'_, T> {
     fn as_mut(&mut self) -> &mut T {
         unsafe { &mut *self.pointer.0 }
     }
@@ -113,14 +113,14 @@ pub struct DeviceOwnedRef<'r, T: PortableBitSemantics> {
 }
 
 #[cfg(feature = "device")]
-impl<'r, T: PortableBitSemantics> AsRef<T> for DeviceOwnedRef<'r, T> {
+impl<T: PortableBitSemantics> AsRef<T> for DeviceOwnedRef<'_, T> {
     fn as_ref(&self) -> &T {
         unsafe { &*self.pointer.0 }
     }
 }
 
 #[cfg(feature = "device")]
-impl<'r, T: PortableBitSemantics> AsMut<T> for DeviceOwnedRef<'r, T> {
+impl<T: PortableBitSemantics> AsMut<T> for DeviceOwnedRef<'_, T> {
     fn as_mut(&mut self) -> &mut T {
         unsafe { &mut *self.pointer.0 }
     }

From 9a53d25e320d219bfa649bf17e20c6b051cd4714 Mon Sep 17 00:00:00 2001
From: Juniper Tyree <juniper.tyree@helsinki.fi>
Date: Sun, 9 Feb 2025 05:25:27 +0000
Subject: [PATCH 03/10] Fix clippy lints

---
 examples/lifetime/src/main.rs                 | 20 ++++++------
 rust-cuda-derive/src/rust_to_cuda/field_ty.rs |  1 -
 rust-cuda-derive/src/rust_to_cuda/mod.rs      |  2 +-
 rust-cuda-kernel/src/kernel/link/mod.rs       | 32 +++++++++++++------
 src/kernel/mod.rs                             |  3 +-
 src/kernel/param.rs                           |  5 +--
 src/kernel/ptx_jit/regex.rs                   |  4 ---
 src/lend/impls/arc.rs                         |  1 -
 src/lend/impls/box.rs                         |  1 -
 src/lend/impls/boxed_slice.rs                 |  1 -
 src/lend/impls/final.rs                       |  1 -
 src/lend/impls/option.rs                      |  1 -
 src/lend/impls/ref.rs                         |  1 -
 src/lend/impls/ref_mut.rs                     |  1 -
 src/lend/impls/slice_ref.rs                   |  1 -
 src/lend/impls/slice_ref_mut.rs               |  1 -
 src/lib.rs                                    |  2 --
 src/safety/aliasing.rs                        |  1 -
 src/safety/portable.rs                        |  1 -
 src/utils/async.rs                            |  8 ++---
 src/utils/exchange/buffer/device.rs           |  1 -
 src/utils/exchange/buffer/host.rs             |  1 -
 src/utils/shared/slice.rs                     |  1 -
 23 files changed, 37 insertions(+), 54 deletions(-)

diff --git a/examples/lifetime/src/main.rs b/examples/lifetime/src/main.rs
index 78cbe943d..212d5de45 100644
--- a/examples/lifetime/src/main.rs
+++ b/examples/lifetime/src/main.rs
@@ -2,30 +2,30 @@
 
 use lifetime::{kernel, link};
 
-fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> {
+fn main() -> rust_cuda::deps::cust::error::CudaResult<()> {
     // Link the lifetime-only-generic CUDA kernel
     struct KernelPtx<'a, 'b>(core::marker::PhantomData<(&'a (), &'b ())>);
     link! { impl kernel<'a, 'b> for KernelPtx }
 
     // Initialize the CUDA API
-    rust_cuda::deps::rustacuda::init(rust_cuda::deps::rustacuda::CudaFlags::empty())?;
+    rust_cuda::deps::cust::init(rust_cuda::deps::cust::CudaFlags::empty())?;
 
     // Get the first CUDA GPU device
-    let device = rust_cuda::deps::rustacuda::device::Device::get_device(0)?;
+    let device = rust_cuda::deps::cust::device::Device::get_device(0)?;
 
     // Create a CUDA context associated to this device
     let _context = rust_cuda::host::CudaDropWrapper::from(
-        rust_cuda::deps::rustacuda::context::Context::create_and_push(
-            rust_cuda::deps::rustacuda::context::ContextFlags::MAP_HOST
-                | rust_cuda::deps::rustacuda::context::ContextFlags::SCHED_AUTO,
+        rust_cuda::deps::cust::context::legacy::Context::create_and_push(
+            rust_cuda::deps::cust::context::legacy::ContextFlags::MAP_HOST
+                | rust_cuda::deps::cust::context::legacy::ContextFlags::SCHED_AUTO,
             device,
         )?,
     );
 
     // Create a new CUDA stream to submit kernels to
     let mut stream =
-        rust_cuda::host::CudaDropWrapper::from(rust_cuda::deps::rustacuda::stream::Stream::new(
-            rust_cuda::deps::rustacuda::stream::StreamFlags::NON_BLOCKING,
+        rust_cuda::host::CudaDropWrapper::from(rust_cuda::deps::cust::stream::Stream::new(
+            rust_cuda::deps::cust::stream::StreamFlags::NON_BLOCKING,
             None,
         )?);
 
@@ -34,8 +34,8 @@ fn main() -> rust_cuda::deps::rustacuda::error::CudaResult<()> {
     // Create a new instance of the CUDA kernel and prepare the launch config
     let mut kernel = rust_cuda::kernel::TypedPtxKernel::<kernel>::new::<KernelPtx>(None);
     let config = rust_cuda::kernel::LaunchConfig {
-        grid: rust_cuda::deps::rustacuda::function::GridSize::x(1),
-        block: rust_cuda::deps::rustacuda::function::BlockSize::x(4),
+        grid: rust_cuda::deps::cust::function::GridSize::x(1),
+        block: rust_cuda::deps::cust::function::BlockSize::x(4),
         ptx_jit: false,
     };
 
diff --git a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs
index 4278d308c..8e167a626 100644
--- a/rust-cuda-derive/src/rust_to_cuda/field_ty.rs
+++ b/rust-cuda-derive/src/rust_to_cuda/field_ty.rs
@@ -1,6 +1,5 @@
 use syn::{parse_quote, spanned::Spanned};
 
-#[expect(clippy::module_name_repetitions)]
 pub enum CudaReprFieldTy {
     SafeDeviceCopy,
     RustToCuda {
diff --git a/rust-cuda-derive/src/rust_to_cuda/mod.rs b/rust-cuda-derive/src/rust_to_cuda/mod.rs
index 800c58fa7..099f97a1d 100644
--- a/rust-cuda-derive/src/rust_to_cuda/mod.rs
+++ b/rust-cuda-derive/src/rust_to_cuda/mod.rs
@@ -10,7 +10,7 @@ fn get_cuda_repr_ident(rust_repr_ident: &proc_macro2::Ident) -> proc_macro2::Ide
     format_ident!("{}CudaRepresentation", rust_repr_ident)
 }
 
-#[expect(clippy::module_name_repetitions, clippy::too_many_lines)]
+#[expect(clippy::too_many_lines)]
 pub fn impl_rust_to_cuda(ast: &syn::DeriveInput) -> proc_macro::TokenStream {
     let (mut struct_fields_cuda, struct_semi_cuda) = if let syn::Data::Struct(s) = &ast.data {
         (s.fields.clone(), s.semi_token)
diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs
index 00f33edd3..8b4a549bb 100644
--- a/rust-cuda-kernel/src/kernel/link/mod.rs
+++ b/rust-cuda-kernel/src/kernel/link/mod.rs
@@ -321,7 +321,8 @@ fn check_kernel_ptx_and_report(
         Ok(None) => (),
         Ok(Some(binary)) => {
             if ptx_lint_levels
-                .get(&PtxLint::DumpAssembly).is_some_and(|level| *level > LintLevel::Allow)
+                .get(&PtxLint::DumpAssembly)
+                .is_some_and(|level| *level > LintLevel::Allow)
             {
                 const HEX: [char; 16] = [
                     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
@@ -335,7 +336,8 @@ fn check_kernel_ptx_and_report(
                 }
 
                 if ptx_lint_levels
-                    .get(&PtxLint::DumpAssembly).is_some_and(|level| *level > LintLevel::Warn)
+                    .get(&PtxLint::DumpAssembly)
+                    .is_some_and(|level| *level > LintLevel::Warn)
                 {
                     emit_call_site_error!(
                         "{} compiled binary:\n{}\n\n{}",
@@ -457,22 +459,27 @@ fn check_kernel_ptx(
             let mut options = options.clone();
 
             if ptx_lint_levels
-                .get(&PtxLint::Verbose).is_some_and(|level| *level > LintLevel::Warn)
+                .get(&PtxLint::Verbose)
+                .is_some_and(|level| *level > LintLevel::Warn)
             {
                 options.push(c"--verbose");
             }
             if ptx_lint_levels
-                .get(&PtxLint::DoublePrecisionUse).is_some_and(|level| *level > LintLevel::Warn)
+                .get(&PtxLint::DoublePrecisionUse)
+                .is_some_and(|level| *level > LintLevel::Warn)
             {
                 options.push(c"--warn-on-double-precision-use");
             }
             if ptx_lint_levels
-                .get(&PtxLint::LocalMemoryUse).is_some_and(|level| *level > LintLevel::Warn)
+                .get(&PtxLint::LocalMemoryUse)
+                .is_some_and(|level| *level > LintLevel::Warn)
             {
                 options.push(c"--warn-on-local-memory-usage");
             }
             if ptx_lint_levels
-                .get(&PtxLint::RegisterSpills).is_some_and(|level| *level > LintLevel::Warn) {
+                .get(&PtxLint::RegisterSpills)
+                .is_some_and(|level| *level > LintLevel::Warn)
+            {
                 options.push(c"--warn-on-spills");
             }
             if ptx_lint_levels
@@ -498,21 +505,26 @@ fn check_kernel_ptx(
         };
 
         if ptx_lint_levels
-            .get(&PtxLint::Verbose).is_some_and(|level| *level > LintLevel::Allow)
+            .get(&PtxLint::Verbose)
+            .is_some_and(|level| *level > LintLevel::Allow)
         {
             options.push(c"--verbose");
         }
         if ptx_lint_levels
-            .get(&PtxLint::DoublePrecisionUse).is_some_and(|level| *level > LintLevel::Allow) {
+            .get(&PtxLint::DoublePrecisionUse)
+            .is_some_and(|level| *level > LintLevel::Allow)
+        {
             options.push(c"--warn-on-double-precision-use");
         }
         if ptx_lint_levels
-            .get(&PtxLint::LocalMemoryUse).is_some_and(|level| *level > LintLevel::Allow)
+            .get(&PtxLint::LocalMemoryUse)
+            .is_some_and(|level| *level > LintLevel::Allow)
         {
             options.push(c"--warn-on-local-memory-usage");
         }
         if ptx_lint_levels
-            .get(&PtxLint::RegisterSpills).is_some_and(|level| *level > LintLevel::Allow)
+            .get(&PtxLint::RegisterSpills)
+            .is_some_and(|level| *level > LintLevel::Allow)
         {
             options.push(c"--warn-on-spills");
         }
diff --git a/src/kernel/mod.rs b/src/kernel/mod.rs
index 43f8fb563..a6134af30 100644
--- a/src/kernel/mod.rs
+++ b/src/kernel/mod.rs
@@ -1,4 +1,3 @@
-use core::str;
 #[cfg(feature = "host")]
 use std::{
     ffi::{CStr, CString},
@@ -309,7 +308,7 @@ impl RawPtxKernel {
         // FIXME: cust's Module::get_function takes a str and turns it back into
         //        a CString immediately
         let function = unsafe { &*std::ptr::from_ref(module.as_ref()) }
-            .get_function(unsafe { str::from_utf8_unchecked(entry_point.to_bytes()) });
+            .get_function(unsafe { core::str::from_utf8_unchecked(entry_point.to_bytes()) });
 
         let function = match function {
             Ok(function) => function,
diff --git a/src/kernel/param.rs b/src/kernel/param.rs
index a54044e2f..34cd03f6b 100644
--- a/src/kernel/param.rs
+++ b/src/kernel/param.rs
@@ -807,10 +807,7 @@ impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
         }
     }
 }
-impl<T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed
-    for &mut DeepPerThreadBorrow<T>
-{
-}
+impl<T: Sync + RustToCuda + SafeMutableAliasing> sealed::Sealed for &mut DeepPerThreadBorrow<T> {}
 
 impl<
         T: Send
diff --git a/src/kernel/ptx_jit/regex.rs b/src/kernel/ptx_jit/regex.rs
index f07f64fa5..d5237475e 100644
--- a/src/kernel/ptx_jit/regex.rs
+++ b/src/kernel/ptx_jit/regex.rs
@@ -2,7 +2,6 @@ use std::sync::OnceLock;
 
 use regex::bytes::Regex;
 
-#[expect(clippy::module_name_repetitions)]
 pub fn const_marker_regex() -> &'static Regex {
     static CONST_MARKER_REGEX: OnceLock<Regex> = OnceLock::new();
     #[allow(clippy::unwrap_used)]
@@ -12,7 +11,6 @@ pub fn const_marker_regex() -> &'static Regex {
     })
 }
 
-#[expect(clippy::module_name_repetitions)]
 pub fn const_base_register_regex() -> &'static Regex {
     static CONST_BASE_REGISTER_REGEX: OnceLock<Regex> = OnceLock::new();
     #[allow(clippy::unwrap_used)]
@@ -22,7 +20,6 @@ pub fn const_base_register_regex() -> &'static Regex {
     })
 }
 
-#[expect(clippy::module_name_repetitions)]
 pub fn const_load_instruction_regex() -> &'static Regex {
     static CONST_LOAD_INSTRUCTION_REGEX: OnceLock<Regex> = OnceLock::new();
     #[allow(clippy::unwrap_used)]
@@ -54,7 +51,6 @@ pub fn const_load_instruction_regex() -> &'static Regex {
     })
 }
 
-#[expect(clippy::module_name_repetitions)]
 pub fn register_regex() -> &'static Regex {
     static REGISTER_REGEX: OnceLock<Regex> = OnceLock::new();
     #[allow(clippy::unwrap_used)]
diff --git a/src/lend/impls/arc.rs b/src/lend/impls/arc.rs
index 9bb3e1cb0..ec5527330 100644
--- a/src/lend/impls/arc.rs
+++ b/src/lend/impls/arc.rs
@@ -30,7 +30,6 @@ use crate::{
 #[doc(hidden)]
 #[repr(transparent)]
 #[derive(TypeLayout)]
-#[expect(clippy::module_name_repetitions)]
 pub struct ArcCudaRepresentation<T: PortableBitSemantics + TypeGraphLayout>(
     DeviceOwnedPointer<_ArcInner<T>>,
 );
diff --git a/src/lend/impls/box.rs b/src/lend/impls/box.rs
index 9c16f07a4..2bd7ec78c 100644
--- a/src/lend/impls/box.rs
+++ b/src/lend/impls/box.rs
@@ -29,7 +29,6 @@ use crate::{
 #[doc(hidden)]
 #[repr(transparent)]
 #[derive(TypeLayout)]
-#[expect(clippy::module_name_repetitions)]
 pub struct BoxCudaRepresentation<T: PortableBitSemantics + TypeGraphLayout>(DeviceOwnedPointer<T>);
 
 unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Box<T> {
diff --git a/src/lend/impls/boxed_slice.rs b/src/lend/impls/boxed_slice.rs
index 8b0937b06..8d00e49e8 100644
--- a/src/lend/impls/boxed_slice.rs
+++ b/src/lend/impls/boxed_slice.rs
@@ -26,7 +26,6 @@ use crate::{
 };
 
 #[doc(hidden)]
-#[expect(clippy::module_name_repetitions)]
 #[derive(TypeLayout)]
 #[repr(C)]
 pub struct BoxedSliceCudaRepresentation<T: PortableBitSemantics + TypeGraphLayout> {
diff --git a/src/lend/impls/final.rs b/src/lend/impls/final.rs
index 51b228c24..68569d7a4 100644
--- a/src/lend/impls/final.rs
+++ b/src/lend/impls/final.rs
@@ -6,7 +6,6 @@ use crate::{
 };
 
 #[doc(hidden)]
-#[expect(clippy::module_name_repetitions)]
 #[derive(const_type_layout::TypeLayout)]
 #[repr(transparent)]
 pub struct FinalCudaRepresentation<T: CudaAsRust>(DeviceAccessible<T>);
diff --git a/src/lend/impls/option.rs b/src/lend/impls/option.rs
index 931c7e952..bca51faf3 100644
--- a/src/lend/impls/option.rs
+++ b/src/lend/impls/option.rs
@@ -18,7 +18,6 @@ use crate::{
 };
 
 #[doc(hidden)]
-#[expect(clippy::module_name_repetitions)]
 #[derive(TypeLayout)]
 #[repr(C)]
 pub struct OptionCudaRepresentation<T: CudaAsRust> {
diff --git a/src/lend/impls/ref.rs b/src/lend/impls/ref.rs
index d49f31cc6..4224f51a5 100644
--- a/src/lend/impls/ref.rs
+++ b/src/lend/impls/ref.rs
@@ -27,7 +27,6 @@ use crate::{
 #[doc(hidden)]
 #[repr(transparent)]
 #[derive(TypeLayout)]
-#[expect(clippy::module_name_repetitions)]
 pub struct RefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> {
     data: DeviceConstPointer<T>,
     _marker: PhantomData<&'a T>,
diff --git a/src/lend/impls/ref_mut.rs b/src/lend/impls/ref_mut.rs
index 6945c9bc2..3ade45276 100644
--- a/src/lend/impls/ref_mut.rs
+++ b/src/lend/impls/ref_mut.rs
@@ -24,7 +24,6 @@ use crate::{
 #[doc(hidden)]
 #[repr(transparent)]
 #[derive(TypeLayout)]
-#[expect(clippy::module_name_repetitions)]
 pub struct RefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> {
     data: DeviceMutPointer<T>,
     _marker: PhantomData<&'a mut T>,
diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs
index 08f1c8418..062058668 100644
--- a/src/lend/impls/slice_ref.rs
+++ b/src/lend/impls/slice_ref.rs
@@ -25,7 +25,6 @@ use crate::{
 };
 
 #[doc(hidden)]
-#[expect(clippy::module_name_repetitions)]
 #[derive(TypeLayout)]
 #[repr(C)]
 pub struct SliceRefCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> {
diff --git a/src/lend/impls/slice_ref_mut.rs b/src/lend/impls/slice_ref_mut.rs
index 59d9eeff5..c98ae3111 100644
--- a/src/lend/impls/slice_ref_mut.rs
+++ b/src/lend/impls/slice_ref_mut.rs
@@ -22,7 +22,6 @@ use crate::{
 };
 
 #[doc(hidden)]
-#[expect(clippy::module_name_repetitions)]
 #[derive(TypeLayout)]
 #[repr(C)]
 pub struct SliceRefMutCudaRepresentation<'a, T: 'a + PortableBitSemantics + TypeGraphLayout> {
diff --git a/src/lib.rs b/src/lib.rs
index c065da2e0..0511e0191 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,7 +29,6 @@
 #![feature(negative_impls)]
 #![cfg_attr(all(feature = "device", not(doc)), feature(stdarch_nvptx))]
 #![cfg_attr(feature = "device", feature(asm_experimental_arch))]
-#![cfg_attr(feature = "device", feature(asm_const))]
 #![feature(doc_auto_cfg)]
 #![feature(doc_cfg)]
 #![feature(marker_trait_attr)]
@@ -48,7 +47,6 @@
 #![feature(generic_const_exprs)]
 #![expect(internal_features)]
 #![feature(core_intrinsics)]
-// #![feature(const_intrinsic_compare_bytes)]
 #![doc(html_root_url = "https://juntyr.github.io/rust-cuda/")]
 
 #[cfg(all(feature = "host", feature = "device", not(doc)))]
diff --git a/src/safety/aliasing.rs b/src/safety/aliasing.rs
index fefe7be0c..100dbbbd8 100644
--- a/src/safety/aliasing.rs
+++ b/src/safety/aliasing.rs
@@ -1,4 +1,3 @@
-#[expect(clippy::module_name_repetitions)]
 /// Types for which mutable references can be safely shared with each CUDA
 /// thread without breaking Rust's no-mutable-aliasing memory safety
 /// guarantees.
diff --git a/src/safety/portable.rs b/src/safety/portable.rs
index 6013b7d74..9e81d2cfc 100644
--- a/src/safety/portable.rs
+++ b/src/safety/portable.rs
@@ -36,7 +36,6 @@ macro_rules! portable_bit_semantics_docs {
 
 #[cfg(not(doc))]
 portable_bit_semantics_docs! {
-    #[expect(clippy::module_name_repetitions)]
     pub trait PortableBitSemantics: sealed::PortableBitSemantics {}
 }
 #[cfg(doc)]
diff --git a/src/utils/async.rs b/src/utils/async.rs
index 1dfb79745..a6791b313 100644
--- a/src/utils/async.rs
+++ b/src/utils/async.rs
@@ -434,9 +434,7 @@ struct AsyncFuture<'a, 'stream, T: BorrowMut<C::Completed>, C: Completion<T>> {
 }
 
 #[cfg(feature = "host")]
-impl<T: BorrowMut<C::Completed>, C: Completion<T>> Future
-    for AsyncFuture<'_, '_, T, C>
-{
+impl<T: BorrowMut<C::Completed>, C: Completion<T>> Future for AsyncFuture<'_, '_, T, C> {
     type Output = CudaResult<T>;
 
     fn poll(
@@ -517,9 +515,7 @@ impl<'a, 'stream, T: BorrowMut<C::Completed>, C: Completion<T>> IntoFuture
 }
 
 #[cfg(feature = "host")]
-impl<T: BorrowMut<C::Completed>, C: Completion<T>> Drop
-    for AsyncFuture<'_, '_, T, C>
-{
+impl<T: BorrowMut<C::Completed>, C: Completion<T>> Drop for AsyncFuture<'_, '_, T, C> {
     fn drop(&mut self) {
         let Some(mut value) = self.value.take() else {
             return;
diff --git a/src/utils/exchange/buffer/device.rs b/src/utils/exchange/buffer/device.rs
index 760fe4d35..047652186 100644
--- a/src/utils/exchange/buffer/device.rs
+++ b/src/utils/exchange/buffer/device.rs
@@ -9,7 +9,6 @@ use crate::{
 
 use super::CudaExchangeItem;
 
-#[expect(clippy::module_name_repetitions)]
 pub struct CudaExchangeBufferDevice<
     T: StackOnly + PortableBitSemantics + TypeGraphLayout,
     const M2D: bool,
diff --git a/src/utils/exchange/buffer/host.rs b/src/utils/exchange/buffer/host.rs
index 5c766fcae..05b8d1ead 100644
--- a/src/utils/exchange/buffer/host.rs
+++ b/src/utils/exchange/buffer/host.rs
@@ -22,7 +22,6 @@ use crate::{
 
 use super::{common::CudaExchangeBufferCudaRepresentation, CudaExchangeItem};
 
-#[expect(clippy::module_name_repetitions)]
 pub struct CudaExchangeBufferHost<
     T: StackOnly + PortableBitSemantics + TypeGraphLayout,
     const M2D: bool,
diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs
index a3df82d06..f239ce7a8 100644
--- a/src/utils/shared/slice.rs
+++ b/src/utils/shared/slice.rs
@@ -2,7 +2,6 @@ use core::alloc::Layout;
 
 use const_type_layout::TypeGraphLayout;
 
-#[expect(clippy::module_name_repetitions)]
 #[repr(transparent)]
 pub struct ThreadBlockSharedSlice<T: 'static + TypeGraphLayout> {
     shared: *mut [T],

From ccad446f4e81d59b131d347f378ec4fe6a98aa78 Mon Sep 17 00:00:00 2001
From: Juniper Tyree <juniper.tyree@helsinki.fi>
Date: Sun, 9 Feb 2025 06:17:42 +0000
Subject: [PATCH 04/10] Add back Arc<[T]> lending

---
 src/lend/impls/arced_slice.rs | 24 ++++++++++++------------
 src/lend/impls/mod.rs         |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/lend/impls/arced_slice.rs b/src/lend/impls/arced_slice.rs
index 1fc334589..672cc2a6b 100644
--- a/src/lend/impls/arced_slice.rs
+++ b/src/lend/impls/arced_slice.rs
@@ -33,7 +33,6 @@ use crate::{
 };
 
 #[doc(hidden)]
-#[expect(clippy::module_name_repetitions)]
 #[derive(TypeLayout)]
 #[repr(C)]
 pub struct ArcedSliceCudaRepresentation<T: PortableBitSemantics + TypeGraphLayout> {
@@ -81,31 +80,32 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Arc<[T]> {
         DeviceAccessible<Self::CudaRepresentation>,
         CombinedCudaAlloc<Self::CudaAllocation, A>,
     )> {
-        use cust::memory::{CopyDestination, DevicePointer, DeviceSlice};
+        use cust::memory::{CopyDestination, DeviceSlice};
 
         let data_ptr: *const T = std::ptr::from_ref(&**self).as_ptr();
         let offset = std::mem::offset_of!(_ArcInner<[T; 42]>, data);
         let arc_ptr: *const _ArcInner<[T; 42]> = data_ptr.byte_sub(offset).cast();
 
-        let header_len = (offset + (std::mem::align_of::<T>() - 1)) / std::mem::align_of::<T>();
+        let header_len = offset.div_ceil(std::mem::align_of::<T>());
 
-        let mut device_buffer = CudaDropWrapper::from(DeviceBuffer::<
+        let device_buffer = CudaDropWrapper::from(DeviceBuffer::<
             DeviceCopyWithPortableBitSemantics<T>,
         >::uninitialized(
             header_len + self.len()
         )?);
-        let (header, buffer): (&mut DeviceSlice<_>, &mut DeviceSlice<_>) =
-            device_buffer.split_at_mut(header_len);
+
+        let mut buffer: DeviceSlice<_> = device_buffer.index(header_len..);
         buffer.copy_from(std::slice::from_raw_parts(self.as_ptr().cast(), self.len()))?;
+
+        let header: DeviceSlice<_> = device_buffer.index(..header_len);
         let header = DeviceSlice::from_raw_parts_mut(
-            DevicePointer::wrap(header.as_mut_ptr().cast::<u8>()),
+            header.as_device_ptr().cast::<u8>(),
             header.len() * std::mem::size_of::<T>(),
         );
-        let (_, header) = header.split_at_mut(header.len() - offset);
-        let (header, _) = header.split_at_mut(std::mem::size_of::<_ArcInnerHeader>());
-        #[expect(clippy::cast_ptr_alignment)]
+        let header = header.index((header.len() - offset)..);
+        let header = header.index(..std::mem::size_of::<_ArcInnerHeader>());
         let mut header: ManuallyDrop<DeviceBox<_ArcInnerHeader>> = ManuallyDrop::new(
-            DeviceBox::from_raw(header.as_mut_ptr().cast::<_ArcInnerHeader>()),
+            DeviceBox::from_device(header.as_device_ptr().cast::<_ArcInnerHeader>()),
         );
         header.copy_from(&*arc_ptr.cast::<_ArcInnerHeader>())?;
 
@@ -152,7 +152,7 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Arc<[
         let offset = std::mem::offset_of!(_ArcInner<[T; 42]>, data);
         let arc_ptr: *const _ArcInner<[T; 42]> = data_ptr.byte_sub(offset).cast();
 
-        let header_len = (offset + (std::mem::align_of::<T>() - 1)) / std::mem::align_of::<T>();
+        let header_len = offset.div_ceil(std::mem::align_of::<T>());
 
         let locked_buffer = unsafe {
             let mut locked_buffer =
diff --git a/src/lend/impls/mod.rs b/src/lend/impls/mod.rs
index 13ee7d6a0..7f7af6ad2 100644
--- a/src/lend/impls/mod.rs
+++ b/src/lend/impls/mod.rs
@@ -1,5 +1,5 @@
 mod arc;
-// mod arced_slice;
+mod arced_slice;
 mod r#box;
 mod boxed_slice;
 #[cfg(feature = "final")]

From e04c308e9677a2ad24a86f1bd014521fb7224395 Mon Sep 17 00:00:00 2001
From: Juniper Tyree <juniper.tyree@helsinki.fi>
Date: Sun, 9 Feb 2025 09:36:54 +0000
Subject: [PATCH 05/10] Fix clippy lints

---
 src/kernel/param.rs         | 8 ++++----
 src/lend/impls/slice_ref.rs | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/kernel/param.rs b/src/kernel/param.rs
index 34cd03f6b..9ab27fa1b 100644
--- a/src/kernel/param.rs
+++ b/src/kernel/param.rs
@@ -157,7 +157,7 @@ impl<
 {
 }
 
-#[cfg_attr(feature = "device", expect(clippy::needless_lifetimes))]
+#[cfg_attr(not(feature = "host"), expect(clippy::needless_lifetimes))]
 impl<
         'a,
         T: Sync + crate::safety::StackOnly + crate::safety::PortableBitSemantics + TypeGraphLayout,
@@ -373,7 +373,7 @@ impl<
     }
 }
 
-#[cfg_attr(feature = "device", expect(clippy::needless_lifetimes))]
+#[cfg_attr(not(feature = "host"), expect(clippy::needless_lifetimes))]
 impl<
         'a,
         T: Sync
@@ -617,7 +617,7 @@ impl<
 {
 }
 
-#[cfg_attr(feature = "device", expect(clippy::needless_lifetimes))]
+#[cfg_attr(not(feature = "host"), expect(clippy::needless_lifetimes))]
 impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow<T> {
     #[cfg(feature = "host")]
     type AsyncHostType<'stream, 'b>
@@ -709,7 +709,7 @@ impl<'a, T: Sync + RustToCuda> CudaKernelParameter for &'a DeepPerThreadBorrow<T
 }
 impl<T: Sync + RustToCuda> sealed::Sealed for &DeepPerThreadBorrow<T> {}
 
-#[cfg_attr(feature = "device", expect(clippy::needless_lifetimes))]
+#[cfg_attr(not(feature = "host"), expect(clippy::needless_lifetimes))]
 impl<'a, T: Sync + RustToCuda + SafeMutableAliasing> CudaKernelParameter
     for &'a mut DeepPerThreadBorrow<T>
 {
diff --git a/src/lend/impls/slice_ref.rs b/src/lend/impls/slice_ref.rs
index 062058668..400ef0669 100644
--- a/src/lend/impls/slice_ref.rs
+++ b/src/lend/impls/slice_ref.rs
@@ -73,7 +73,7 @@ unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCuda for &'a [T
     }
 }
 
-#[cfg_attr(feature = "device", expect(clippy::needless_lifetimes))]
+#[cfg_attr(not(feature = "host"), expect(clippy::needless_lifetimes))]
 unsafe impl<'a, T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for &'a [T] {
     #[cfg(all(feature = "host", not(doc)))]
     type CudaAllocationAsync = CombinedCudaAlloc<

From 5b850130c5efeabe8d8dd818f44f38cb00538bbf Mon Sep 17 00:00:00 2001
From: Juniper Tyree <juniper.tyree@helsinki.fi>
Date: Tue, 11 Feb 2025 08:07:12 +0000
Subject: [PATCH 06/10] Remove extraneous cust/bytemuck feature

---
 Cargo.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index c0e307352..ac41c8150 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -97,8 +97,7 @@ kernel = ["dep:rust-cuda-kernel"]
 
 [dependencies]
 const-type-layout = { version = "0.3.2", default-features = false, features = ["derive"] }
-# FIXME: cust fails to compile without the `bytemuck` feature
-cust = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.3.2", default-features = false, features = ["bytemuck"], optional = true }
+cust = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.3.2", default-features = false, optional = true }
 cust_core = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.1", default-features = false }
 cust_derive = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.2", default-features = false, optional = true }
 final = { version = "0.1.1", default-features = false, optional = true }

From a3fcdaad182034ad4d8f5007463e6a86b6e84c0a Mon Sep 17 00:00:00 2001
From: Juniper Tyree <juniper.tyree@helsinki.fi>
Date: Tue, 11 Feb 2025 08:12:51 +0000
Subject: [PATCH 07/10] Update dependencies

---
 Cargo.toml | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index ac41c8150..4afadb7b3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,12 +24,13 @@ rust-cuda-derive = { version = "0.1", path = "rust-cuda-derive", default-feature
 rust-cuda-kernel = { version = "0.1", path = "rust-cuda-kernel", default-features = false }
 
 # third-party dependencies with unpublished patches
-rustacuda = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", default-features = false }
-rustacuda_core = { git = "https://github.com/juntyr/RustaCUDA", rev = "c6ea7cc", default-features = false }
+cust = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.3.2", default-features = false }
+cust_core = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.1", default-features = false }
+cust_derive = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.2", default-features = false }
 
 # crates.io third-party dependencies
 cargo_metadata = { version = "0.19", default-features = false }
-cargo-util = { version = "=0.2.16", default-features = false }  # TODO: keep in sync with toolchain
+cargo-util = { version = "=0.2.17", default-features = false }  # TODO: keep in sync with toolchain
 colored = { version = "3.0", default-features = false }
 const-type-layout = { version = "0.3.2", default-features = false }
 final = { version = "0.1.1", default-features = false }
@@ -96,14 +97,13 @@ host = ["dep:cust", "dep:regex", "dep:oneshot", "dep:safer_owning_ref"]
 kernel = ["dep:rust-cuda-kernel"]
 
 [dependencies]
-const-type-layout = { version = "0.3.2", default-features = false, features = ["derive"] }
-cust = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.3.2", default-features = false, optional = true }
-cust_core = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.1", default-features = false }
-cust_derive = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.2", default-features = false, optional = true }
-final = { version = "0.1.1", default-features = false, optional = true }
-oneshot = { version = "0.1", default-features = false, features = ["std", "async"], optional = true }
-regex = { version = "1.10", default-features = false, optional = true }
-safer_owning_ref = { version = "0.5", default-features = false, optional = true }
-
-rust-cuda-derive = { path = "rust-cuda-derive", default-features = false, optional = true }
-rust-cuda-kernel = { path = "rust-cuda-kernel", default-features = false, optional = true }
+const-type-layout = { workspace = true, features = ["derive"] }
+cust = { workspace = true, optional = true }
+cust_core = { workspace = true }
+cust_derive = { workspace = true, optional = true }
+final = { workspace = true, optional = true }
+oneshot = { workspace = true, features = ["std", "async"], optional = true }
+regex = { workspace = true, optional = true }
+rust-cuda-derive = { workspace = true, optional = true }
+rust-cuda-kernel = { workspace = true, optional = true }
+safer_owning_ref = { workspace = true, optional = true }

From 3d2bf614a3dce39be1519f1dbcfdfc3438dae4fc Mon Sep 17 00:00:00 2001
From: Juniper Tyree <juniper.tyree@helsinki.fi>
Date: Tue, 11 Feb 2025 08:23:55 +0000
Subject: [PATCH 08/10] Remove the extraneous cust-derive dependency

---
 .github/workflows/rustdoc.yml |  1 -
 Cargo.toml                    |  7 ++++---
 src/host/mod.rs               | 10 ++++------
 src/lend/impls/arc.rs         |  1 +
 src/safety/portable.rs        |  9 ++++++---
 5 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/rustdoc.yml b/.github/workflows/rustdoc.yml
index f0d7b683b..f62503604 100644
--- a/.github/workflows/rustdoc.yml
+++ b/.github/workflows/rustdoc.yml
@@ -39,7 +39,6 @@ jobs:
             --extern-html-root-url final=https://docs.rs/final/0.1.1/ \
             --extern-html-root-url cust=https://docs.rs/cust/0.3.2/ \
             --extern-html-root-url cust_core=https://docs.rs/cust_core/0.1/ \
-            --extern-html-root-url cust_derive=https://docs.rs/cust_derive/0.2/ \
             -Zunstable-options \
           " cargo doc \
             --all-features \
diff --git a/Cargo.toml b/Cargo.toml
index 4afadb7b3..79705b409 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,7 +26,6 @@ rust-cuda-kernel = { version = "0.1", path = "rust-cuda-kernel", default-feature
 # third-party dependencies with unpublished patches
 cust = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.3.2", default-features = false }
 cust_core = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.1", default-features = false }
-cust_derive = { git = "https://github.com/juntyr/Rust-GPU-CUDA.git", rev = "5365c14", version = "0.2", default-features = false }
 
 # crates.io third-party dependencies
 cargo_metadata = { version = "0.19", default-features = false }
@@ -90,7 +89,7 @@ rust-version = { workspace = true }
 
 [features]
 default = []
-derive = ["dep:cust_derive", "dep:rust-cuda-derive"]
+derive = ["dep:rust-cuda-derive"]
 device = []
 final = ["dep:final"]
 host = ["dep:cust", "dep:regex", "dep:oneshot", "dep:safer_owning_ref"]
@@ -100,10 +99,12 @@ kernel = ["dep:rust-cuda-kernel"]
 const-type-layout = { workspace = true, features = ["derive"] }
 cust = { workspace = true, optional = true }
 cust_core = { workspace = true }
-cust_derive = { workspace = true, optional = true }
 final = { workspace = true, optional = true }
 oneshot = { workspace = true, features = ["std", "async"], optional = true }
 regex = { workspace = true, optional = true }
 rust-cuda-derive = { workspace = true, optional = true }
 rust-cuda-kernel = { workspace = true, optional = true }
 safer_owning_ref = { workspace = true, optional = true }
+
+[lints]
+workspace = true
diff --git a/src/host/mod.rs b/src/host/mod.rs
index c97452438..8c42d80cf 100644
--- a/src/host/mod.rs
+++ b/src/host/mod.rs
@@ -205,6 +205,7 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> {
         }
     }
 
+    #[allow(clippy::needless_pass_by_ref_mut)]
     #[must_use]
     pub(crate) fn for_device<'b>(&'b mut self) -> DeviceMutRef<'a, T>
     where
@@ -244,18 +245,15 @@ impl<'a, T: PortableBitSemantics + TypeGraphLayout> HostAndDeviceMutRef<'a, T> {
     }
 
     #[must_use]
-    pub fn into_mut<'b>(self) -> HostAndDeviceMutRef<'b, T>
+    pub const fn into_mut<'b>(self) -> HostAndDeviceMutRef<'b, T>
     where
         'a: 'b,
     {
-        HostAndDeviceMutRef {
-            device_box: self.device_box,
-            host_ref: self.host_ref,
-        }
+        self
     }
 
     #[must_use]
-    pub fn into_async<'b, 'stream>(
+    pub const fn into_async<'b, 'stream>(
         self,
         stream: Stream<'stream>,
     ) -> Async<'b, 'stream, HostAndDeviceMutRef<'b, T>, NoCompletion>
diff --git a/src/lend/impls/arc.rs b/src/lend/impls/arc.rs
index ec5527330..b08ba6342 100644
--- a/src/lend/impls/arc.rs
+++ b/src/lend/impls/arc.rs
@@ -107,6 +107,7 @@ unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCudaAsync for Arc<T
         use cust::memory::AsyncCopyDestination;
 
         let locked_box = unsafe {
+            #[allow(clippy::used_underscore_items)]
             let inner = ManuallyDrop::new(_ArcInner {
                 strong: AtomicUsize::new(1),
                 weak: AtomicUsize::new(1),
diff --git a/src/safety/portable.rs b/src/safety/portable.rs
index 9e81d2cfc..74e42c144 100644
--- a/src/safety/portable.rs
+++ b/src/safety/portable.rs
@@ -1,8 +1,11 @@
 macro_rules! portable_bit_semantics_docs {
     ($item:item) => {
-        /// Types whose in-memory bit representation on the CPU host is safe to copy
-        /// to and read back on the GPU device while maintaining the same semantics,
-        /// iff the type layout on the CPU matches the type layout on the GPU.
+        /// Types with a CPU-GPU-compatible memory representation.
+        ///
+        /// More specifically, types whose in-memory bit representation on the
+        /// CPU host is safe to copy to and read back on the GPU device while
+        /// maintaining the same semantics, iff the type layout on the CPU
+        /// matches the type layout on the GPU.
         ///
         /// For a type to implement [`PortableBitSemantics`], it
         ///

From 8dc39c8a6bdc817cefd197300e537bad7bb7b882 Mon Sep 17 00:00:00 2001
From: Juniper Tyree <juniper.tyree@helsinki.fi>
Date: Tue, 11 Feb 2025 08:42:24 +0000
Subject: [PATCH 09/10] Clean up the cust_core dependency

---
 src/host/mod.rs               |  7 +++----
 src/lend/impls/arced_slice.rs | 11 +++++------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/host/mod.rs b/src/host/mod.rs
index 8c42d80cf..3ae4f0353 100644
--- a/src/host/mod.rs
+++ b/src/host/mod.rs
@@ -9,10 +9,9 @@ use cust::{
     context::Context,
     error::CudaError,
     event::Event,
-    memory::{CopyDestination, DeviceBox, DeviceBuffer, LockedBox, LockedBuffer},
+    memory::{CopyDestination, DeviceBox, DeviceBuffer, DeviceCopy, LockedBox, LockedBuffer},
     module::Module,
 };
-use cust_core::DeviceCopy;
 
 use crate::{
     safety::PortableBitSemantics,
@@ -119,7 +118,7 @@ impl<T: DeviceCopy> CudaDroppable for DeviceBox<T> {
     }
 }
 
-impl<T: cust_core::DeviceCopy> CudaDroppable for DeviceBuffer<T> {
+impl<T: DeviceCopy> CudaDroppable for DeviceBuffer<T> {
     fn drop(val: Self) -> Result<(), (CudaError, Self)> {
         Self::drop(val)
     }
@@ -133,7 +132,7 @@ impl<T: DeviceCopy> CudaDroppable for LockedBox<T> {
     }
 }
 
-impl<T: cust_core::DeviceCopy> CudaDroppable for LockedBuffer<T> {
+impl<T: DeviceCopy> CudaDroppable for LockedBuffer<T> {
     fn drop(val: Self) -> Result<(), (CudaError, Self)> {
         Self::drop(val)
     }
diff --git a/src/lend/impls/arced_slice.rs b/src/lend/impls/arced_slice.rs
index 672cc2a6b..76d3f15cd 100644
--- a/src/lend/impls/arced_slice.rs
+++ b/src/lend/impls/arced_slice.rs
@@ -8,9 +8,8 @@ use const_type_layout::{TypeGraphLayout, TypeLayout};
 use cust::{
     error::CudaResult,
     memory::LockedBuffer,
-    memory::{DeviceBox, DeviceBuffer},
+    memory::{DeviceBox, DeviceBuffer, DeviceCopy},
 };
-use cust_core::DeviceCopy;
 
 use crate::{
     deps::alloc::sync::Arc,
@@ -50,21 +49,21 @@ pub struct _ArcInner<T: ?Sized> {
     data: T,
 }
 
-#[derive(Copy, Clone)]
+#[cfg(feature = "host")]
+#[derive(Copy, Clone, DeviceCopy)]
 #[repr(C)]
 struct _ArcInnerHeader {
     strong: _AtomicUsize,
     weak: _AtomicUsize,
 }
 
-#[derive(Copy, Clone)]
+#[cfg(feature = "host")]
+#[derive(Copy, Clone, DeviceCopy)]
 #[repr(C, align(8))]
 struct _AtomicUsize {
     v: usize,
 }
 
-unsafe impl DeviceCopy for _ArcInnerHeader {}
-
 unsafe impl<T: PortableBitSemantics + TypeGraphLayout> RustToCuda for Arc<[T]> {
     #[cfg(all(feature = "host", not(doc)))]
     type CudaAllocation = CudaDropWrapper<DeviceBuffer<DeviceCopyWithPortableBitSemantics<T>>>;

From 43937b0a57090de54f325de1535544f91cd4dcb6 Mon Sep 17 00:00:00 2001
From: Juniper Tyree <juniper.tyree@helsinki.fi>
Date: Tue, 11 Feb 2025 09:14:20 +0000
Subject: [PATCH 10/10] Bump MSRV to 1.84-nightly

---
 Cargo.toml                              | 2 +-
 README.md                               | 2 +-
 rust-cuda-derive/src/lib.rs             | 2 +-
 rust-cuda-kernel/build.rs               | 2 +-
 rust-cuda-kernel/src/kernel/link/mod.rs | 5 ++---
 rust-cuda-kernel/src/lib.rs             | 2 +-
 rust-toolchain                          | 4 ++--
 src/lib.rs                              | 2 +-
 src/utils/shared/slice.rs               | 6 +++---
 9 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 79705b409..6ec09e455 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,7 +15,7 @@ edition = "2021"
 authors = ["Juniper Tyree <juniper.tyree@helsinki.fi>"]
 repository = "https://github.com/juntyr/rust-cuda"
 license = "MIT OR Apache-2.0"
-rust-version = "1.81" # nightly
+rust-version = "1.84" # nightly
 
 [workspace.dependencies]
 # workspace-internal crates
diff --git a/README.md b/README.md
index 4140f9e4b..ebc3a5a0b 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main
 [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain
 
-[MSRV]: https://img.shields.io/badge/MSRV-1.81.0--nightly-orange
+[MSRV]: https://img.shields.io/badge/MSRV-1.84.0--nightly-orange
 [repo]: https://github.com/juntyr/rust-cuda
 
 [Rust Doc]: https://img.shields.io/badge/docs-main-blue
diff --git a/rust-cuda-derive/src/lib.rs b/rust-cuda-derive/src/lib.rs
index cc371f18c..2cfb62949 100644
--- a/rust-cuda-derive/src/lib.rs
+++ b/rust-cuda-derive/src/lib.rs
@@ -5,7 +5,7 @@
 //! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main
 //! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain
 //!
-//! [MSRV]: https://img.shields.io/badge/MSRV-1.81.0--nightly-orange
+//! [MSRV]: https://img.shields.io/badge/MSRV-1.84.0--nightly-orange
 //! [repo]: https://github.com/juntyr/rust-cuda
 //!
 //! [Rust Doc]: https://img.shields.io/badge/docs-main-blue
diff --git a/rust-cuda-kernel/build.rs b/rust-cuda-kernel/build.rs
index ecd3b29cb..65b149df9 100644
--- a/rust-cuda-kernel/build.rs
+++ b/rust-cuda-kernel/build.rs
@@ -5,7 +5,7 @@
 //! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main
 //! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain
 //!
-//! [MSRV]: https://img.shields.io/badge/MSRV-1.81.0--nightly-orange
+//! [MSRV]: https://img.shields.io/badge/MSRV-1.84.0--nightly-orange
 //! [repo]: https://github.com/juntyr/rust-cuda
 //!
 //! [Rust Doc]: https://img.shields.io/badge/docs-main-blue
diff --git a/rust-cuda-kernel/src/kernel/link/mod.rs b/rust-cuda-kernel/src/kernel/link/mod.rs
index 8b4a549bb..f0a6bd154 100644
--- a/rust-cuda-kernel/src/kernel/link/mod.rs
+++ b/rust-cuda-kernel/src/kernel/link/mod.rs
@@ -189,7 +189,6 @@ fn extract_ptx_kernel_layout(kernel_ptx: &mut String) -> proc_macro2::TokenStrea
             );
         }
 
-        #[allow(clippy::literal_string_with_formatting_args)] // false positive
         if type_layout_metas
             .insert(String::from(param), bytes)
             .is_some()
@@ -484,7 +483,7 @@ fn check_kernel_ptx(
             }
             if ptx_lint_levels
                 .get(&PtxLint::DynamicStackSize)
-                .map_or(true, |level| *level <= LintLevel::Warn)
+                .is_none_or(|level| *level <= LintLevel::Warn)
             {
                 options.push(c"--suppress-stack-size-warning");
             }
@@ -530,7 +529,7 @@ fn check_kernel_ptx(
         }
         if ptx_lint_levels
             .get(&PtxLint::DynamicStackSize)
-            .map_or(true, |level| *level < LintLevel::Warn)
+            .is_none_or(|level| *level < LintLevel::Warn)
         {
             options.push(c"--suppress-stack-size-warning");
         }
diff --git a/rust-cuda-kernel/src/lib.rs b/rust-cuda-kernel/src/lib.rs
index 1714bddcd..e2b198153 100644
--- a/rust-cuda-kernel/src/lib.rs
+++ b/rust-cuda-kernel/src/lib.rs
@@ -5,7 +5,7 @@
 //! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main
 //! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain
 //!
-//! [MSRV]: https://img.shields.io/badge/MSRV-1.81.0--nightly-orange
+//! [MSRV]: https://img.shields.io/badge/MSRV-1.84.0--nightly-orange
 //! [repo]: https://github.com/juntyr/rust-cuda
 //!
 //! [Rust Doc]: https://img.shields.io/badge/docs-main-blue
diff --git a/rust-toolchain b/rust-toolchain
index 2404f256b..5b8ab7dea 100644
--- a/rust-toolchain
+++ b/rust-toolchain
@@ -1,5 +1,5 @@
 [toolchain]
-# Pin to final 1.85.0 nightly
-channel = "nightly-2025-01-03"
+# Pin to final 1.84.0 nightly
+channel = "nightly-2024-11-22"
 components = [ "cargo", "rustfmt", "clippy", "llvm-bitcode-linker", "llvm-tools" ]
 targets = [ "nvptx64-nvidia-cuda" ]
diff --git a/src/lib.rs b/src/lib.rs
index 0511e0191..1a1bebd63 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,7 +5,7 @@
 //! [CI Status]: https://img.shields.io/github/actions/workflow/status/juntyr/rust-cuda/ci.yml?branch=main
 //! [workflow]: https://github.com/juntyr/rust-cuda/actions/workflows/ci.yml?query=branch%3Amain
 //!
-//! [MSRV]: https://img.shields.io/badge/MSRV-1.81.0--nightly-orange
+//! [MSRV]: https://img.shields.io/badge/MSRV-1.84.0--nightly-orange
 //! [repo]: https://github.com/juntyr/rust-cuda
 //!
 //! [Rust Doc]: https://img.shields.io/badge/docs-main-blue
diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs
index f239ce7a8..a691bd2ea 100644
--- a/src/utils/shared/slice.rs
+++ b/src/utils/shared/slice.rs
@@ -10,7 +10,7 @@ pub struct ThreadBlockSharedSlice<T: 'static + TypeGraphLayout> {
 impl<T: 'static + TypeGraphLayout> ThreadBlockSharedSlice<T> {
     #[cfg(feature = "host")]
     #[must_use]
-    pub fn new_uninit_with_len(len: usize) -> Self {
+    pub const fn new_uninit_with_len(len: usize) -> Self {
         Self {
             shared: Self::dangling_slice_with_len(len),
         }
@@ -18,7 +18,7 @@ impl<T: 'static + TypeGraphLayout> ThreadBlockSharedSlice<T> {
 
     #[cfg(feature = "host")]
     #[must_use]
-    pub fn with_len(mut self, len: usize) -> Self {
+    pub const fn with_len(mut self, len: usize) -> Self {
         self.shared = Self::dangling_slice_with_len(len);
         self
     }
@@ -31,7 +31,7 @@ impl<T: 'static + TypeGraphLayout> ThreadBlockSharedSlice<T> {
     }
 
     #[cfg(feature = "host")]
-    fn dangling_slice_with_len(len: usize) -> *mut [T] {
+    const fn dangling_slice_with_len(len: usize) -> *mut [T] {
         core::ptr::slice_from_raw_parts_mut(core::ptr::NonNull::dangling().as_ptr(), len)
     }