diff --git a/lib/opte/src/ddi/mblk.rs b/lib/opte/src/ddi/mblk.rs index 22db5a7d..c20ea9e1 100644 --- a/lib/opte/src/ddi/mblk.rs +++ b/lib/opte/src/ddi/mblk.rs @@ -44,6 +44,34 @@ pub trait AsMblk { /// Consume `self`, returning the underlying `mblk_t`. The caller of this /// function now owns the underlying segment chain. fn unwrap_mblk(self) -> Option>; + + /// Consume `self`, returning the underlying `mblk_t` and a pointer to the tail + /// element. The caller of this function now owns the underlying segment chain. + /// + /// If the chain contains a single element, then the tail will be equal to the + /// head. + fn unwrap_head_and_tail(self) -> Option<(NonNull, NonNull)> + where + Self: Sized, + { + // SAFETY: `v`, if present, is a valid mblk_t. + self.unwrap_mblk().map(|v| unsafe { (v, find_mblk_tail(v)) }) + } +} + +/// Find the last element in an `mblk_t` chain. +/// +/// # SAFETY +/// `head` must point to a valid mblk_t, which must not contain any loops +/// in its `b_next` chain. +unsafe fn find_mblk_tail(head: NonNull) -> NonNull { + let mut tail = head; + unsafe { + while let Some(next_ptr) = NonNull::new((*tail.as_ptr()).b_next) { + tail = next_ptr; + } + } + tail } /// The head and tail of an mblk_t list. @@ -86,12 +114,7 @@ impl MsgBlkChain { let head = NonNull::new(mp).ok_or(WrapError::NullPtr)?; // Walk the chain to find the tail, and support faster append. 
- let mut tail = head; - unsafe { - while let Some(next_ptr) = NonNull::new((*tail.as_ptr()).b_next) { - tail = next_ptr; - } - } + let tail = unsafe { find_mblk_tail(head) }; Ok(Self(Some(MsgBlkChainInner { head, tail }))) } @@ -164,6 +187,12 @@ impl AsMblk for MsgBlkChain { fn unwrap_mblk(mut self) -> Option> { self.0.take().map(|v| v.head) } + + fn unwrap_head_and_tail( + mut self, + ) -> Option<(NonNull, NonNull)> { + self.0.take().map(|v| (v.head, v.tail)) + } } impl Drop for MsgBlkChain { @@ -830,6 +859,15 @@ impl AsMblk for MsgBlk { _ = ManuallyDrop::new(self); Some(ptr_out) } + + fn unwrap_head_and_tail( + self, + ) -> Option<(NonNull, NonNull)> { + // MsgBlk represents a single `mblk_t` with NULL `b_next`. + // Thus, tail is just the head. + let out = self.unwrap_mblk(); + Some((out, out)) + } } /// An interior node of an [`MsgBlk`]'s chain, accessed via iterator. diff --git a/xde/src/mac/mod.rs b/xde/src/mac/mod.rs index 42f70af3..2a68d4e8 100644 --- a/xde/src/mac/mod.rs +++ b/xde/src/mac/mod.rs @@ -377,6 +377,49 @@ impl

Drop for MacPromiscHandle

{ } } +/// Safe wrapper around `mac_siphon_set`/`mac_siphon_clear`. +#[derive(Debug)] +pub struct MacSiphon { + /// The MAC client this siphon callback is attached to. + parent: *const P, +} + +impl MacSiphon

{ + /// Register a siphon callback to receive packets on the underlying MAC. + pub fn new( + parent: Arc

, + siphon_fn: mac_siphon_fn, + ) -> Result { + let mch = parent.mac_client_handle()?; + let parent = Arc::into_raw(parent); + let arg = parent as *mut c_void; + + // SAFETY: `MacSiphon` keeps a reference to this `P` until it is removed, + // and so we can safely access it from the callback via the `arg` + // pointer. + unsafe { + mac_siphon_set(mch, siphon_fn, arg); + } + + Ok(Self { parent }) + } +} + +impl Drop for MacSiphon

{ + fn drop(&mut self) { + // Safety: the parent MAC we've attached this siphon to is guaranteed + // to live long enough to access again, since we have a refcount hold + // on it. + unsafe { + let parent = Arc::from_raw(self.parent); + let mac_client = parent + .mac_client_handle() + .expect("FATAL: cannot remove mac siphon from client"); + mac_siphon_clear(mac_client); + }; + } +} + /// Safe wrapper around a `mac_unicast_handle_t`. #[derive(Debug)] pub struct MacUnicastHandle { diff --git a/xde/src/mac/sys.rs b/xde/src/mac/sys.rs index 62ce2f9c..4ec048c6 100644 --- a/xde/src/mac/sys.rs +++ b/xde/src/mac/sys.rs @@ -76,6 +76,13 @@ pub type mac_rx_fn = unsafe extern "C" fn( *mut mblk_t, boolean_t, ); +pub type mac_siphon_fn = unsafe extern "C" fn( + *mut c_void, + *mut mblk_t, + *mut *mut mblk_t, + *mut c_uint, + *mut usize, +) -> *mut mblk_t; unsafe extern "C" { pub type mac_handle; @@ -133,6 +140,12 @@ unsafe extern "C" { arg: *mut c_void, ); pub fn mac_rx_clear(mch: *const mac_client_handle); + pub fn mac_siphon_set( + mch: *const mac_client_handle, + rx_fn: mac_siphon_fn, + arg: *mut c_void, + ); + pub fn mac_siphon_clear(mch: *const mac_client_handle); pub fn mac_tx( mch: *const mac_client_handle, mp_chain: *mut mblk_t, diff --git a/xde/src/route.rs b/xde/src/route.rs index 9b3474cd..ba481315 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -8,7 +8,7 @@ use crate::ip; use crate::sys; use crate::xde::DropRef; use crate::xde::XdeDev; -use crate::xde::xde_underlay_port; +use crate::xde::XdeUnderlayPort; use alloc::collections::BTreeMap; use alloc::collections::btree_map::Entry; use alloc::sync::Arc; @@ -273,7 +273,7 @@ fn netstack_rele(ns: *mut ip::netstack_t) { fn next_hop<'a>( key: &RouteKey, ustate: &'a XdeDev, -) -> Result, &'a xde_underlay_port> { +) -> Result, &'a XdeUnderlayPort> { let RouteKey { dst: ip6_dst, l4_hash } = key; unsafe { // Use the GZ's routing table. 
@@ -657,20 +657,20 @@ impl CachedRoute { pub struct Route<'a> { pub src: EtherAddr, pub dst: EtherAddr, - pub underlay_dev: &'a xde_underlay_port, + pub underlay_dev: &'a XdeUnderlayPort, } impl<'a> Route<'a> { fn cached(&self, xde: &XdeDev, timestamp: Moment) -> CachedRoute { // As unfortunate as `into_route`. - let port_0: &xde_underlay_port = &xde.u1; + let port_0: &XdeUnderlayPort = &xde.u1; let underlay_idx = if core::ptr::eq(self.underlay_dev, port_0) { 0 } else { 1 }; CachedRoute { src: self.src, dst: self.dst, underlay_idx, timestamp } } - fn zero_addr(underlay_dev: &'a xde_underlay_port) -> Route<'a> { + fn zero_addr(underlay_dev: &'a XdeUnderlayPort) -> Route<'a> { Self { src: EtherAddr::zero(), dst: EtherAddr::zero(), underlay_dev } } } diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 4ebc306f..7bc8d091 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -21,7 +21,7 @@ use crate::mac; use crate::mac::ChecksumOffloadCapabs; use crate::mac::MacEmul; use crate::mac::MacHandle; -use crate::mac::MacPromiscHandle; +use crate::mac::MacSiphon; use crate::mac::MacTxFlags; use crate::mac::OffloadInfo; use crate::mac::TcpLsoFlags; @@ -48,6 +48,7 @@ use alloc::vec::Vec; use core::ffi::CStr; use core::num::NonZeroU32; use core::ptr; +use core::ptr::NonNull; use core::ptr::addr_of; use core::ptr::addr_of_mut; use core::time::Duration; @@ -232,7 +233,7 @@ fn bad_packet_probe( /// Underlay port state. #[derive(Debug)] -pub struct xde_underlay_port { +pub struct XdeUnderlayPort { /// Name of the link being used for this underlay port. pub name: String, @@ -243,7 +244,7 @@ pub struct xde_underlay_port { pub mtu: u32, /// MAC promiscuous handle for receiving packets on the underlay link. - mph: MacPromiscHandle, + siphon: MacSiphon, /// DLS-level handle on a device for promiscuous registration and /// packet Tx. 
@@ -263,8 +264,8 @@ struct XdeState { struct UnderlayState { // each xde driver has a handle to two underlay ports that are used for I/O // onto the underlay network - u1: Arc, - u2: Arc, + u1: Arc, + u2: Arc, shared_props: OffloadInfo, } @@ -331,8 +332,8 @@ pub struct XdeDev { // These are clones of the underlay ports initialized by the // driver. - pub u1: Arc, - pub u2: Arc, + pub u1: Arc, + pub u2: Arc, underlay_capab: OffloadInfo, // We make this a per-port cache rather than sharing between all @@ -962,19 +963,20 @@ fn clear_xde_underlay() -> Result { }; for u in [u1, u2] { - // We have a chain of refs here: `MacPromiscHandle` holds a ref to - // `DldStream`. We explicitly drop them in order here to ensure + // We have a chain of refs here: `MacSiphon` holds a ref to + // `DlsStream`. We explicitly drop them in order here to ensure // there are no outstanding refs. - // 1. Remove promisc callback. - drop(u.mph); + // 1. Remove packet rx callback. + drop(u.siphon); // Although `xde_rx` can be called into without any running ports - // via the promisc handle, illumos guarantees that this callback won't - // be running here. `mac_promisc_remove` will either remove the callback - // immediately (if there are no walkers) or will mark the callback as - // condemned and await all active walkers finishing. Accordingly, no one - // else will have or try to clone the Stream handle. + // via the siphon handle, illumos guarantees that this callback won't + // be running here. `mac_siphon_clear` performs the moral equivalent of + // `mac_rx_barrier` -- the client's SRS is quiesced, and then restarted + // after the callback is removed. + // Because there are no ports and we hold the write/management lock, no + // one else will have or try to clone the Stream handle. // 2. Close the open stream handle. if Arc::into_inner(u.stream).is_none() { @@ -1098,7 +1100,7 @@ fn create_underlay_port( link_name: String, // This parameter is likely to be used as part of the flows work. 
_mc_name: &str, -) -> Result<(xde_underlay_port, OffloadInfo), OpteError> { +) -> Result<(XdeUnderlayPort, OffloadInfo), OpteError> { let link_cstr = CString::new(link_name.as_str()).unwrap(); let link_id = @@ -1113,19 +1115,12 @@ fn create_underlay_port( msg: format!("failed to grab open stream for {link_name}: {e}"), })?); - // Setup promiscuous callback to receive all packets on this link. - // - // We specify `MAC_PROMISC_FLAGS_NO_TX_LOOP` here to skip receiving copies - // of outgoing packets we sent ourselves. - let mph = MacPromiscHandle::new( - stream.clone(), - mac::mac_client_promisc_type_t::MAC_CLIENT_PROMISC_ALL, - xde_rx, - mac::MAC_PROMISC_FLAGS_NO_TX_LOOP, - ) - .map_err(|e| OpteError::System { - errno: EFAULT, - msg: format!("mac_promisc_add failed for {link_name}: {e}"), + // Bind a packet handler to the MAC client underlying `stream`. + let siphon = MacSiphon::new(stream.clone(), xde_rx).map_err(|e| { + OpteError::System { + errno: EFAULT, + msg: format!("failed to set MAC siphon on {link_name}: {e}"), + } })?; // Grab mac handle for underlying link, to retrieve its MAC address. @@ -1141,11 +1136,11 @@ fn create_underlay_port( let lso_state = mh.get_lso_capabs(); Ok(( - xde_underlay_port { + XdeUnderlayPort { name: link_name, mac: mh.get_mac_addr(), mtu, - mph, + siphon, stream, }, OffloadInfo { lso_state, cso_state, mtu }, @@ -2048,49 +2043,88 @@ fn new_port( #[unsafe(no_mangle)] unsafe extern "C" fn xde_rx( arg: *mut c_void, - mrh: *mut mac::mac_resource_handle, mp_chain: *mut mblk_t, - _is_loopback: boolean_t, -) { + out_mp_tail: *mut *mut mblk_t, + out_count: *mut c_uint, + out_len: *mut usize, +) -> *mut mblk_t { __dtrace_probe_rx(mp_chain as uintptr_t); // Safety: This arg comes from `Arc::from_ptr()` on the `MacClientHandle` - // corresponding to the underlay port we're receiving on. Being - // here in the callback means the `MacPromiscHandle` hasn't been - // dropped yet and thus our `MacClientHandle` is also still valid. 
- let stream: Arc = unsafe { - let mch_ptr = arg as *const DlsStream; - Arc::increment_strong_count(mch_ptr); - Arc::from_raw(mch_ptr) + // corresponding to the underlay port we're receiving on (derived from + // `DlsStream`). Being here in the callback means the `MacSiphon` hasn't + // been dropped yet, and thus our `MacClientHandle` is also still valid. + let stream = unsafe { + (arg as *const DlsStream) + .as_ref() + .expect("packet was received from siphon with a NULL argument") }; - let Ok(mut chain) = (unsafe { MsgBlkChain::new(mp_chain) }) else { + let mut chain = if let Ok(chain) = unsafe { MsgBlkChain::new(mp_chain) } { + chain + } else { bad_packet_probe( None, - Direction::Out, + Direction::In, mp_chain as uintptr_t, c"rx'd packet chain was null", ); - return; + + // Continue processing on an empty chain to uphold the contract with + // MAC for the three `out_` pointer values. + MsgBlkChain::empty() }; + let mut out_chain = MsgBlkChain::empty(); + let mut count = 0; + let mut len = 0; + // TODO: In future we may want to batch packets for further tx // by the mch they're being targeted to. E.g., either build a list // of chains (port0, port1, ...), or hold tx until another // packet breaks the run targeting the same dest. while let Some(pkt) = chain.pop_front() { unsafe { - xde_rx_one(&stream, mrh, pkt); + if let Some(pkt) = xde_rx_one(&stream, pkt) { + count += 1; + len += pkt.byte_len(); + out_chain.append(pkt); + } + } + } + + let (head, tail) = out_chain + .unwrap_head_and_tail() + .map(|v| (v.0.as_ptr(), v.1.as_ptr())) + .unwrap_or((ptr::null_mut(), ptr::null_mut())); + + if let Some(ptr) = NonNull::new(out_len) { + unsafe { + ptr.write(len); } } + + if let Some(ptr) = NonNull::new(out_count) { + unsafe { + ptr.write(count); + } + } + + if let Some(ptr) = NonNull::new(out_mp_tail) { + unsafe { + ptr.write(tail); + } + } + + head } +/// Processes an individual packet receiver on the underlay device `stream`. 
+/// +/// This function returns any input `pkt` which is not of interest to XDE (e.g., +/// the packet is not Geneve over v6, or no matching OPTE port could be found). #[inline] -unsafe fn xde_rx_one( - stream: &DlsStream, - mrh: *mut mac::mac_resource_handle, - mut pkt: MsgBlk, -) { +unsafe fn xde_rx_one(stream: &DlsStream, mut pkt: MsgBlk) -> Option { let mblk_addr = pkt.mblk_addr(); // We must first parse the packet in order to determine where it @@ -2109,7 +2143,7 @@ unsafe fn xde_rx_one( opte::engine::dbg!("Tx bad packet: {:?}", e); bad_packet_parse_probe(None, Direction::In, mblk_addr, &e); - return; + return Some(pkt); } }; @@ -2121,7 +2155,7 @@ unsafe fn xde_rx_one( Ok(ulp_meoi) => ulp_meoi, Err(e) => { opte::engine::dbg!("{}", e); - return; + return None; } }; @@ -2139,7 +2173,7 @@ unsafe fn xde_rx_one( vni, ether_dst ); - return; + return Some(pkt); }; let is_tcp = matches!(meta.inner_ulp, ValidUlp::Tcp(_)); @@ -2150,9 +2184,9 @@ unsafe fn xde_rx_one( if dev.passthrough { drop(parsed_pkt); unsafe { - mac::mac_rx(dev.mh, mrh, pkt.unwrap_mblk().as_ptr()); + mac::mac_rx(dev.mh, ptr::null_mut(), pkt.unwrap_mblk().as_ptr()); } - return; + return None; } let port = &dev.port; @@ -2161,7 +2195,7 @@ unsafe fn xde_rx_one( match res { Ok(ProcessResult::Bypass) => unsafe { - mac::mac_rx(dev.mh, mrh, pkt.unwrap_mblk().as_ptr()); + mac::mac_rx(dev.mh, ptr::null_mut(), pkt.unwrap_mblk().as_ptr()); }, Ok(ProcessResult::Modified(emit_spec)) => { let mut npkt = emit_spec.apply(pkt); @@ -2186,7 +2220,11 @@ unsafe fn xde_rx_one( } unsafe { - mac::mac_rx(dev.mh, mrh, npkt.unwrap_mblk().as_ptr()); + mac::mac_rx( + dev.mh, + ptr::null_mut(), + npkt.unwrap_mblk().as_ptr(), + ); } } Ok(ProcessResult::Hairpin(hppkt)) => { @@ -2194,6 +2232,8 @@ unsafe fn xde_rx_one( } _ => {} } + + None } #[unsafe(no_mangle)]