diff --git a/Cargo.lock b/Cargo.lock index 26f5741a..7bb2995c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -893,6 +893,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "illumos-sys-hdrs" version = "0.1.0" +dependencies = [ + "bitflags 2.9.0", +] [[package]] name = "indexmap" diff --git a/crates/illumos-sys-hdrs/Cargo.toml b/crates/illumos-sys-hdrs/Cargo.toml index 3fe76454..f6196a93 100644 --- a/crates/illumos-sys-hdrs/Cargo.toml +++ b/crates/illumos-sys-hdrs/Cargo.toml @@ -8,4 +8,7 @@ repository.workspace = true [features] default = [] -kernel = [] \ No newline at end of file +kernel = [] + +[dependencies] +bitflags.workspace = true diff --git a/crates/illumos-sys-hdrs/src/kernel.rs b/crates/illumos-sys-hdrs/src/kernel.rs index 36a66757..d8b38a73 100644 --- a/crates/illumos-sys-hdrs/src/kernel.rs +++ b/crates/illumos-sys-hdrs/src/kernel.rs @@ -370,6 +370,9 @@ pub const DDI_PSEUDO: *const c_char = c"ddi_pseudo".as_ptr(); pub const KM_SLEEP: i32 = 0x0000; pub const KM_NOSLEEP: i32 = 0x0001; +pub const MAXNAMELEN: c_int = 256; +pub const MAXLINKNAMELEN: c_int = 32; +pub const MAXPATHLEN: usize = 1024; pub const MODREV_1: c_int = 1; pub const S_IFCHR: c_int = 0x2000; diff --git a/crates/illumos-sys-hdrs/src/lib.rs b/crates/illumos-sys-hdrs/src/lib.rs index dcd52d40..f20eb65c 100644 --- a/crates/illumos-sys-hdrs/src/lib.rs +++ b/crates/illumos-sys-hdrs/src/lib.rs @@ -12,6 +12,8 @@ pub mod kernel; #[cfg(feature = "kernel")] pub use kernel::*; +pub mod mac; + use core::ptr; use core::sync::atomic::AtomicI32; use core::sync::atomic::AtomicI64; @@ -246,6 +248,8 @@ pub struct dblk_t { pub db_struioun: u64, // imprecise pub db_fthdr: *const c_void, // imprecise pub db_credp: *const c_void, // imprecise + + pub db_meoi: [u8; 16], // imprecise } impl Default for dblk_t { @@ -269,6 +273,8 @@ impl Default for dblk_t { db_struioun: 0, db_fthdr: ptr::null(), db_credp: ptr::null(), + + db_meoi: Default::default(), } } } diff --git 
a/crates/illumos-sys-hdrs/src/mac.rs b/crates/illumos-sys-hdrs/src/mac.rs new file mode 100644 index 00000000..4dcee14b --- /dev/null +++ b/crates/illumos-sys-hdrs/src/mac.rs @@ -0,0 +1,190 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company + +#[cfg(feature = "kernel")] +use crate::mblk_t; +use bitflags::bitflags; + +// ====================================================================== +// uts/common/sys/mac_provider.h +// ====================================================================== + +bitflags! { +#[repr(C)] +#[derive(Clone, Copy, Debug, Default)] +/// Flags which denote the valid fields of a `mac_ether_offload_info_t` +/// or `mac_ether_tun_info_t`. +/// +/// These are derived from `mac_ether_offload_flags_t` (mac_provider.h, +/// omitting the `MEOI_` prefix). +pub struct MacEtherOffloadFlags: u32 { + /// `l2hlen` and `l3proto` are set. + const L2INFO_SET = 1 << 0; + /// `l3hlen` and `l4proto` are set. + const L3INFO_SET = 1 << 1; + /// `l4hlen` is set. + const L4INFO_SET = 1 << 2; + /// `tunhlen` is set. + const TUNINFO_SET = 1 << 3; + /// The ethernet header contains a VLAN tag. + const VLAN_TAGGED = 1 << 4; + /// The packet is fragmented at L3. + const L3_FRAGMENT = 1 << 5; +} +} + +#[repr(C)] +#[derive(Copy, Clone, Debug, Default, Eq, PartialEq)] +/// The type of tunnel in use for a packet's outermost layer. +/// +/// These are derived from `mac_ether_tun_type_t` (mac_provider.h, +/// omitting the `METT_` prefix). 
+pub struct MacTunType(u32); + +impl MacTunType { + pub const NONE: Self = Self(0); + pub const GENEVE: Self = Self(1); + pub const VXLAN: Self = Self(2); +} + +#[repr(C)] +#[derive(Clone, Copy, Debug, Default)] +pub struct mac_ether_offload_info_t { + pub meoi_flags: MacEtherOffloadFlags, + pub meoi_tuntype: MacTunType, + pub meoi_len: u32, + pub meoi_l2hlen: u8, + pub meoi_l3proto: u16, + pub meoi_l3hlen: u16, + pub meoi_l4proto: u8, + pub meoi_l4hlen: u8, + pub meoi_tunhlen: u16, +} + +#[cfg(feature = "kernel")] +unsafe extern "C" { + pub fn lso_info_set(mp: *mut mblk_t, mss: u32, flags: u32); + pub fn lso_info_cleanup(mp: *mut mblk_t); + pub fn mac_hcksum_set( + mp: *mut mblk_t, + start: u32, + stuff: u32, + end: u32, + value: u32, + flags: u32, + ); + pub fn mac_hcksum_get( + mp: *mut mblk_t, + start: *mut u32, + stuff: *mut u32, + end: *mut u32, + value: *mut u32, + flags: *mut u32, + ); + pub fn mac_lso_get(mp: *mut mblk_t, mss: *mut u32, flags: *mut u32); + pub fn mac_ether_set_pktinfo( + mp: *mut mblk_t, + outer_info: *const mac_ether_offload_info_t, + inner_info: *const mac_ether_offload_info_t, + ); +} + +// ====================================================================== +// uts/common/sys/pattr.h +// ====================================================================== + +bitflags! { +/// Flags which denote checksum and LSO state for an `mblk_t`. +/// +/// These are derived from `#define`s in pattr.h. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct MblkOffloadFlags: u32 { + /// Tx: IPv4 header checksum must be computed by hardware. + const HCK_IPV4_HDRCKSUM = 1 << 0; + /// Rx: IPv4 header checksum was verified correct by hardware. + const HCK_IPV4_HDRCKSUM_OK = Self::HCK_IPV4_HDRCKSUM.bits(); + /// * Tx: Compute partial checksum based on start/stuff/end offsets. + /// * Rx: Partial checksum computed and attached. + const HCK_PARTIALCKSUM = 1 << 1; + /// * Tx: Compute full (pseudo + l4 + payload) cksum for this packet. 
+ /// * Rx: Full checksum was computed in hardware, and is attached. + const HCK_FULLCKSUM = 1 << 2; + /// Rx: Hardware has verified that L3/L4 checksums are correct. + const HCK_FULLCKSUM_OK = 1 << 3; + /// Tx: Hardware must perform LSO. + const HW_LSO = 1 << 4; + /// Tx: The inner frame's IPv4 header checksum must be computed by + /// hardware. + const HCK_INNER_V4CKSUM = 1 << 5; + /// Rx: The inner frame's IPv4 header checksum was verified correct by + /// hardware. + const HCK_INNER_V4CKSUM_OK = 1 << 6; + /// * Tx: Compute inner L4 partial checksum based on MEOI parse offsets. + const HCK_INNER_PARTIAL = 1 << 7; + /// * Tx: Compute full (pseudo + l4 + payload) cksum for this packet's + /// inner L4. + const HCK_INNER_FULL = 1 << 8; + /// Rx: Hardware has verified that inner L3/L4 checksums are correct. + const HCK_INNER_FULL_OK = 1 << 9; + /// The union of all checksum-related flags. + const HCK_FLAGS = Self::HCK_IPV4_HDRCKSUM.bits() | + Self::HCK_PARTIALCKSUM.bits() | Self::HCK_FULLCKSUM.bits() | + Self::HCK_FULLCKSUM_OK.bits() | Self::HCK_INNER_V4CKSUM.bits() | + Self::HCK_INNER_V4CKSUM_OK.bits() | Self::HCK_INNER_PARTIAL.bits() | + Self::HCK_INNER_FULL.bits() | Self::HCK_INNER_FULL_OK.bits(); + /// The union of all checksum-related flags used in the transmit path + /// (i.e., indicating missing checksums). + const HCK_TX_FLAGS = Self::HCK_IPV4_HDRCKSUM.bits() | + Self::HCK_PARTIALCKSUM.bits() | Self::HCK_FULLCKSUM.bits() | + Self::HCK_INNER_V4CKSUM.bits() | Self::HCK_INNER_PARTIAL.bits() | + Self::HCK_INNER_FULL.bits(); + /// The union of all checksum-related flags used in the transmit path + /// for outer headers (untunnelled packets and encap layers). + const HCK_OUTER_TX_FLAGS = Self::HCK_IPV4_HDRCKSUM.bits() | + Self::HCK_PARTIALCKSUM.bits() | Self::HCK_FULLCKSUM.bits(); + /// The union of all checksum-related flags for outer headers (untunnelled + /// packets and encap layers). 
+ const HCK_OUTER_FLAGS = Self::HCK_OUTER_TX_FLAGS.bits() | + Self::HCK_IPV4_HDRCKSUM_OK.bits() | Self::HCK_FULLCKSUM_OK.bits(); + /// The union of all checksum-related flags used in the transmit path + /// for inner headers (tunnelled packets). + const HCK_INNER_TX_FLAGS = Self::HCK_INNER_V4CKSUM.bits() | + Self::HCK_INNER_PARTIAL.bits() | Self::HCK_INNER_FULL.bits(); + /// The union of all checksum-related flags for inner headers (tunnelled + /// packets). + const HCK_INNER_FLAGS = Self::HCK_INNER_TX_FLAGS.bits() | + Self::HCK_INNER_V4CKSUM_OK.bits() | Self::HCK_INNER_FULL_OK.bits(); + /// The union of all LSO-related flags. + const HW_LSO_FLAGS = Self::HW_LSO.bits(); +} +} + +impl MblkOffloadFlags { + /// Move any outer offload flags to the inner layer, as part of + /// encapsulation. + pub fn shift_in(self) -> Self { + let mut out = + self.difference(Self::HCK_INNER_FLAGS.union(Self::HCK_OUTER_FLAGS)); + + if self.contains(Self::HCK_IPV4_HDRCKSUM) { + out |= Self::HCK_INNER_V4CKSUM; + } + + if self.contains(Self::HCK_PARTIALCKSUM) { + out |= Self::HCK_INNER_PARTIAL; + } + + if self.contains(Self::HCK_FULLCKSUM) { + out |= Self::HCK_INNER_FULL; + } + + if self.contains(Self::HCK_FULLCKSUM_OK) { + out |= Self::HCK_INNER_FULL_OK; + } + + out + } +} diff --git a/lib/opte/Cargo.toml b/lib/opte/Cargo.toml index dcc77f47..4aa8fc9a 100644 --- a/lib/opte/Cargo.toml +++ b/lib/opte/Cargo.toml @@ -29,7 +29,7 @@ opte-api.workspace = true ingot.workspace = true -bitflags.workspace = true +bitflags = { workspace = true , features = ["serde"] } cfg-if.workspace = true crc32fast = { workspace = true, optional = true } dyn-clone.workspace = true diff --git a/lib/opte/src/ddi/mblk.rs b/lib/opte/src/ddi/mblk.rs index 023130dd..820e7660 100644 --- a/lib/opte/src/ddi/mblk.rs +++ b/lib/opte/src/ddi/mblk.rs @@ -27,6 +27,8 @@ use illumos_sys_hdrs as ddi; use illumos_sys_hdrs::c_uchar; #[cfg(any(feature = "std", test))] use illumos_sys_hdrs::dblk_t; +use 
illumos_sys_hdrs::mac::MblkOffloadFlags; +use illumos_sys_hdrs::mac::mac_ether_offload_info_t; use illumos_sys_hdrs::mblk_t; use illumos_sys_hdrs::uintptr_t; use ingot::types::Emit; @@ -36,6 +38,14 @@ use ingot::types::Read; pub static MBLK_MAX_SIZE: usize = u16::MAX as usize; +/// Abstractions over an `mblk_t` which can be returned to their +/// raw pointer representation. +pub trait AsMblk { + /// Consume `self`, returning the underlying `mblk_t`. The caller of this + /// function now owns the underlying segment chain. + fn unwrap_mblk(self) -> Option>; +} + /// The head and tail of an mblk_t list. struct MsgBlkChainInner { head: NonNull, @@ -148,11 +158,10 @@ impl MsgBlkChain { self.0 = Some(MsgBlkChainInner { head: pkt, tail: pkt }); } } +} - /// Return the head of the underlying `mblk_t` packet chain and - /// consume `self`. The caller of this function now owns the - /// `mblk_t` segment chain. - pub fn unwrap_mblk(mut self) -> Option> { +impl AsMblk for MsgBlkChain { + fn unwrap_mblk(mut self) -> Option> { self.0.take().map(|v| v.head) } } @@ -619,9 +628,9 @@ impl MsgBlk { /// consume `self`. The caller of this function now owns the /// `mblk_t` segment chain. pub fn unwrap_mblk(self) -> NonNull { - let ptr_out = self.0; - _ = ManuallyDrop::new(self); - ptr_out + // SAFETY: this type's `AsMblk` always returns `Some` + AsMblk::unwrap_mblk(self) + .expect("unwrapping a single mblk is always infallible") } /// Wrap the `mblk_t` packet in a [`MsgBlk`], taking ownership of @@ -713,6 +722,105 @@ impl MsgBlk { self.0 = head; } + + /// Copies the offload information from this message block to + /// another, including checksum/LSO flags and TCP MSS (if set). + pub fn copy_offload_info_to(&self, other: &mut Self) { + unsafe { + let info = offload_info(self.0); + set_offload_info(other.0, info); + } + } + + /// Return the number of active [`MsgBlk`]a referring to the underlying + /// data. 
+ pub fn ref_count(&self) -> usize { + (unsafe { (*(*self.0.as_ptr()).b_datap).db_ref }) as usize + } + + /// Sets a packet's offload flags, and sets MSS if `HW_LSO` is enabled. + #[cfg_attr(any(feature = "std", test), allow(unused))] + pub fn request_offload(&mut self, flags: MblkOffloadFlags, mss: u32) { + let ckflags = flags & MblkOffloadFlags::HCK_FLAGS; + + #[cfg(all(not(feature = "std"), not(test)))] + unsafe { + illumos_sys_hdrs::mac::mac_hcksum_set( + self.0.as_ptr(), + 0, + 0, + 0, + 0, + ckflags.bits(), + ); + if flags.contains(MblkOffloadFlags::HW_LSO) { + illumos_sys_hdrs::mac::lso_info_set( + self.0.as_ptr(), + mss, + MblkOffloadFlags::HW_LSO.bits(), + ); + } + } + } + + /// Set parse information attached to a packet to enable tunnel-aware + /// offloads, and to help NIC drivers correctly program offloads without + /// a reparse. + #[cfg_attr(any(feature = "std", test), allow(unused))] + pub fn fill_parse_info( + &mut self, + outer_meoi: &mac_ether_offload_info_t, + inner_meoi: Option<&mac_ether_offload_info_t>, + ) -> Result<(), PktInfoError> { + if self.ref_count() > 1 { + return Err(PktInfoError::PacketShared); + } + + #[cfg(all(not(feature = "std"), not(test)))] + unsafe { + illumos_sys_hdrs::mac::mac_ether_set_pktinfo( + self.0.as_ptr(), + outer_meoi, + inner_meoi.map(|v| v as *const _).unwrap_or_else(ptr::null), + ) + } + + Ok(()) + } + + /// Return the offloads currently requested by a packet. 
+ #[cfg_attr(any(feature = "std", test), allow(unused))] + pub fn offload_flags(&self) -> MblkOffloadFlags { + let mut cso_out = 0u32; + let mut lso_out = 0u32; + + #[cfg(all(not(feature = "std"), not(test)))] + unsafe { + illumos_sys_hdrs::mac::mac_hcksum_get( + self.0.as_ptr(), + ptr::null_mut(), + ptr::null_mut(), + ptr::null_mut(), + ptr::null_mut(), + &raw mut cso_out, + ); + illumos_sys_hdrs::mac::mac_lso_get( + self.0.as_ptr(), + ptr::null_mut(), + &raw mut lso_out, + ); + }; + + MblkOffloadFlags::from_bits_retain(cso_out | lso_out) + } +} + +impl AsMblk for MsgBlk { + fn unwrap_mblk(self) -> Option> { + let ptr_out = self.0; + _ = ManuallyDrop::new(self); + Some(ptr_out) + } } /// An interior node of an [`MsgBlk`]'s chain, accessed via iterator. @@ -852,6 +960,23 @@ impl Pullup for MsgBlkIterMut<'_> { } } +/// Reasons a [`MsgBlk`] could not have its parse information set. +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Hash)] +pub enum PktInfoError { + /// The underlying `dblk_t` is pointed to by more than one [`MsgBlk`]. + PacketShared, +} + +impl core::error::Error for PktInfoError {} + +impl core::fmt::Display for PktInfoError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.write_str(match self { + Self::PacketShared => "packet has a reference count > 1", + }) + } +} + /// Counts the number of segments in an `mblk_t` from `head`, linked /// via `b_cont`. unsafe fn count_mblk_chain(mut head: Option>) -> usize { @@ -1056,6 +1181,8 @@ pub fn mock_desballoc(buf: Vec) -> *mut mblk_t { db_struioun: 0, db_fthdr: ptr::null(), db_credp: ptr::null(), + + ..Default::default() }); let dbp = Box::into_raw(dblk); diff --git a/lib/opte/src/engine/ip/mod.rs b/lib/opte/src/engine/ip/mod.rs index adc55594..0971420b 100644 --- a/lib/opte/src/engine/ip/mod.rs +++ b/lib/opte/src/engine/ip/mod.rs @@ -2,7 +2,7 @@ // License, v. 2.0. 
If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2025 Oxide Computer Company pub mod v4; pub mod v6; @@ -81,6 +81,7 @@ impl L3 { } impl ValidL3 { + #[inline] pub fn pseudo_header(&self) -> Checksum { match self { ValidL3::Ipv4(v4) => { @@ -110,6 +111,7 @@ impl ValidL3 { } } + #[inline] pub fn csum(&self) -> [u8; 2] { match self { ValidL3::Ipv4(i4) => i4.checksum(), @@ -118,6 +120,16 @@ impl ValidL3 { .to_be_bytes() } + /// Return whether the IP layer has a checksum both structurally + /// and that it is non-zero (i.e., not offloaded). + #[inline] + pub fn has_ip_csum(&self) -> bool { + match self { + ValidL3::Ipv4(i4) => i4.checksum() != 0, + _ => false, + } + } + #[inline] pub fn validate(&self, bytes_after: usize) -> Result<(), ParseError> { match self { diff --git a/lib/opte/src/engine/mod.rs b/lib/opte/src/engine/mod.rs index e56d7e42..b1437db6 100644 --- a/lib/opte/src/engine/mod.rs +++ b/lib/opte/src/engine/mod.rs @@ -288,7 +288,9 @@ pub trait LightweightMeta: Into> { fn encap_len(&self) -> u16; /// Recalculate checksums within inner headers, derived from a pre-computed `body_csum`. - fn update_inner_checksums(&mut self, body_csum: Checksum); + /// + /// The body checksum may not be present if L4 headers have been offloaded. + fn update_inner_checksums(&mut self, body_csum: Option); /// Provide a view of internal TCP state. fn inner_tcp(&self) -> Option<&impl TcpRef>; diff --git a/lib/opte/src/engine/packet.rs b/lib/opte/src/engine/packet.rs index 1688889d..1a9396a4 100644 --- a/lib/opte/src/engine/packet.rs +++ b/lib/opte/src/engine/packet.rs @@ -2,14 +2,9 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2025 Oxide Computer Company //! Types for creating, reading, and writing network packets. -//! -//! 
TODO -//! -//! * Add hardware offload information to [`Packet`]. -//! use super::Direction; use super::LightweightMeta; @@ -38,6 +33,7 @@ use super::ip::v6::Ipv6Ref; use super::parse::NoEncap; use super::parse::Ulp; use super::parse::UlpRepr; +use super::port::meta::ActionMeta; use super::rule::CompiledEncap; use super::rule::CompiledTransform; use super::rule::HdrTransform; @@ -964,7 +960,10 @@ impl Packet> { #[inline] /// Convert a packet's metadata into a set of instructions /// needed to serialize all its changes to the wire. - pub fn emit_spec(&mut self) -> Result + pub fn emit_spec( + &mut self, + action_meta: &ActionMeta, + ) -> Result where T::Chunk: ByteSliceMut, { @@ -1169,6 +1168,7 @@ impl Packet> { ulp_len: encapped_len as u32, prepend: PushSpec::Slowpath(push_spec.into()), l4_hash, + mtu_unrestricted: action_meta.is_internal_target(), }) } @@ -1586,11 +1586,18 @@ pub struct EmitSpec { pub(crate) l4_hash: u32, pub(crate) rewind: u16, pub(crate) ulp_len: u32, + pub(crate) mtu_unrestricted: bool, } impl Default for EmitSpec { fn default() -> Self { - Self { prepend: PushSpec::NoOp, l4_hash: 0, rewind: 0, ulp_len: 0 } + Self { + prepend: PushSpec::NoOp, + l4_hash: 0, + rewind: 0, + ulp_len: 0, + mtu_unrestricted: false, + } } } @@ -1602,6 +1609,14 @@ impl EmitSpec { self.l4_hash } + /// Return whether this packet's route allows the use of a full jumbo frame + /// MSS. + #[inline] + #[must_use] + pub fn mtu_unrestricted(&self) -> bool { + self.mtu_unrestricted + } + /// Perform final structural transformations to a packet (removal of /// existing headers, and copying in new/replacement headers). 
#[inline] @@ -1736,6 +1751,7 @@ impl EmitSpec { } if let Some(mut prepend) = prepend { + pkt.copy_offload_info_to(&mut prepend); prepend.append(pkt); prepend } else { diff --git a/lib/opte/src/engine/parse.rs b/lib/opte/src/engine/parse.rs index 3fe22feb..17863aa4 100644 --- a/lib/opte/src/engine/parse.rs +++ b/lib/opte/src/engine/parse.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2025 Oxide Computer Company //! Constructs used in packet parsing, such as choices over protocol //! and complete packet definitions. @@ -83,6 +83,7 @@ pub enum Ulp { } impl ValidUlp { + #[inline] pub fn csum(&self) -> [u8; 2] { match self { ValidUlp::Tcp(t) => t.checksum(), @@ -92,9 +93,24 @@ impl ValidUlp { } .to_be_bytes() } + + /// Return whether the ULP layer has a checksum both structurally + /// and that it is non-zero (i.e., not offloaded). + #[inline] + pub fn has_ulp_csum(&self) -> bool { + let csum = match self { + ValidUlp::Tcp(t) => t.checksum(), + ValidUlp::Udp(u) => u.checksum(), + ValidUlp::IcmpV4(i4) => i4.checksum(), + ValidUlp::IcmpV6(i6) => i6.checksum(), + }; + + csum != 0 + } } impl ValidUlp { + #[inline] pub fn compute_checksum( &mut self, mut body_csum: Checksum, @@ -288,12 +304,18 @@ impl LightweightMeta for ValidNoEncap { } #[inline] - fn update_inner_checksums(&mut self, body_csum: Checksum) { + fn update_inner_checksums(&mut self, body_csum: Option) { if let Some(l3) = self.inner_l3.as_mut() { - if let Some(ulp) = self.inner_ulp.as_mut() { - ulp.compute_checksum(body_csum, l3); + if let (Some(ulp), Some(body_csum)) = + (self.inner_ulp.as_mut(), body_csum) + { + if ulp.has_ulp_csum() { + ulp.compute_checksum(body_csum, l3); + } + } + if l3.has_ip_csum() { + l3.compute_checksum(); } - l3.compute_checksum(); } } @@ -415,9 +437,15 @@ impl LightweightMeta for ValidGeneveOverV6 { } #[inline] - fn 
update_inner_checksums(&mut self, body_csum: Checksum) { - self.inner_ulp.compute_checksum(body_csum, &self.inner_l3); - self.inner_l3.compute_checksum(); + fn update_inner_checksums(&mut self, body_csum: Option) { + if let Some(body_csum) = body_csum { + if self.inner_ulp.has_ulp_csum() { + self.inner_ulp.compute_checksum(body_csum, &self.inner_l3); + } + } + if self.inner_l3.has_ip_csum() { + self.inner_l3.compute_checksum(); + } } #[inline] diff --git a/lib/opte/src/engine/port/meta.rs b/lib/opte/src/engine/port/meta.rs new file mode 100644 index 00000000..5b12189e --- /dev/null +++ b/lib/opte/src/engine/port/meta.rs @@ -0,0 +1,124 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2025 Oxide Computer Company +use alloc::collections::BTreeMap; +use alloc::string::String; +use alloc::string::ToString; + +/// A value meant to be used in the [`ActionMeta`] map. +/// +/// The purpose of this trait is to define the value's key as well +/// as serialization to/from strings. These are like Display and +/// FromStr; but here their focus is on unambiguous parsing. That +/// is, we can't necessarily rely on a type's Display impl being +/// good for serializing to a metadata string, but at the same +/// time we don't want to force its Display to have to work in +/// this constraint. +/// +/// A value doesn't have to implement this type; there is nothing +/// that enforces the strings stored in [`ActionMeta`] are strings +/// generated by this trait impl. It's just a convenient way to +/// mark and implement values meant to be used as action metadata. +pub trait ActionMetaValue: Sized { + const KEY: &'static str; + + fn key(&self) -> String { + Self::KEY.to_string() + } + + /// Create a representation of the value to be used in + /// [`ActionMeta`]. 
+ fn as_meta(&self) -> String; + + /// Attempt to create a value assuming that `s` was created + /// with [`Self::as_meta()`]. + fn from_meta(s: &str) -> Result; +} + +/// The action metadata map. +/// +/// This metadata is accessible by all actions during layer +/// processing and acts as a form of inter-action communication. +/// Given that packets and their metadata are immutable (outside of +/// reified header transforms), this also allows actions to inform +/// OPTE of facts about a path or destination (e.g., MTU). +/// +/// Action metadata is nothing more than a map of string keys +/// to string values. It is up to the actions to decide what these strings +/// mean. However, *all keys prefaced with "opte:" are reserved for use by +/// operations on `ActionMeta`*, and map to functionality in OPTE itself +/// rather than a given dataplane design. +#[derive(Default)] +pub struct ActionMeta { + inner: BTreeMap, +} + +impl ActionMeta { + pub fn new() -> Self { + Self::default() + } + + /// Clear all entries. + pub fn clear(&mut self) { + self.inner.clear(); + } + + /// Insert the key-value pair into the map, replacing any + /// existing key-value pair. Return the value being replaced, + /// or `None`. + pub fn insert(&mut self, key: String, val: String) -> Option { + self.inner.insert(key, val) + } + + /// Remove the key-value pair with the specified key. Return + /// the value, or `None` if no such entry exists. + pub fn remove(&mut self, key: &str) -> Option { + self.inner.remove(key) + } + + /// Get a reference to the value with the given key, or `None` + /// if no such entry exists. + pub fn get(&self, key: &str) -> Option<&String> { + self.inner.get(key) + } + + /// Records whether this packet's destination can be reached using only + /// internal/private paths. + /// + /// The dataplane may use this to choose a larger (jumbo-frame) MSS for + /// TCP segmentation, or rely on other aspects of its internal network. 
+ pub fn set_internal_target(&mut self, val: bool) { + _ = self + .insert(InternalTarget::KEY.into(), InternalTarget(val).as_meta()); + } + + /// Returns whether this packet's destination can be reached using only + /// internal/private paths. + pub fn is_internal_target(&self) -> bool { + self.get(InternalTarget::KEY) + .and_then(|v| InternalTarget::from_meta(v).ok()) + .unwrap_or_default() + .0 + } +} + +#[derive(Copy, Clone, Default)] +struct InternalTarget(bool); + +impl ActionMetaValue for InternalTarget { + const KEY: &'static str = "opte:internal-target"; + + fn as_meta(&self) -> String { + (if self.0 { "1" } else { "0" }).into() + } + + fn from_meta(s: &str) -> Result { + match s { + "1" => Ok(Self(true)), + "0" => Ok(Self(false)), + s => Err(format!("value `{s}` is illegal for InternalTarget")), + } + } +} diff --git a/lib/opte/src/engine/port.rs b/lib/opte/src/engine/port/mod.rs similarity index 96% rename from lib/opte/src/engine/port.rs rename to lib/opte/src/engine/port/mod.rs index ff2841c6..78ed89e3 100644 --- a/lib/opte/src/engine/port.rs +++ b/lib/opte/src/engine/port/mod.rs @@ -6,7 +6,6 @@ //! A virtual switch port. -use self::meta::ActionMeta; use super::HdlPktAction; use super::LightweightMeta; use super::NetworkImpl; @@ -49,6 +48,7 @@ use super::rule::Finalized; use super::rule::HdrTransform; use super::rule::HdrTransformError; use super::rule::Rule; +use super::rule::TransformFlags; use super::tcp::KEEPALIVE_EXPIRE_TTL; use super::tcp::TIME_WAIT_EXPIRE_TTL; use super::tcp::TcpState; @@ -95,12 +95,16 @@ use ingot::types::Emit; use ingot::types::HeaderLen; use ingot::types::Read; use ingot::udp::Udp; +use meta::ActionMeta; use opte_api::Direction; use opte_api::MacAddr; use opte_api::OpteError; use zerocopy::ByteSlice; use zerocopy::ByteSliceMut; +/// Metadata for inter-action communication. 
+pub mod meta; + pub type Result = result::Result; #[derive(Debug)] @@ -1455,14 +1459,13 @@ impl Port { let len = pkt.len(); let meta = pkt.meta_mut(); - let body_csum = if tx.checksums_dirty { - meta.compute_body_csum() - } else { - None - }; + let csum_dirty = tx.checksums_dirty(); + + let body_csum = + if csum_dirty { meta.compute_body_csum() } else { None }; meta.run_compiled_transform(&tx); - if let Some(csum) = body_csum { - meta.update_inner_checksums(csum); + if csum_dirty { + meta.update_inner_checksums(body_csum); } let encap_len = meta.encap_len(); let ulp_len = (len - (encap_len as usize)) as u32; @@ -1471,6 +1474,7 @@ impl Port { _ => 0, }; let out = EmitSpec { + mtu_unrestricted: tx.internal_destination(), prepend: PushSpec::Fastpath(tx), l4_hash, rewind, @@ -1567,7 +1571,7 @@ impl Port { } InternalProcessResult::Hairpin(v) => Ok(ProcessResult::Hairpin(v)), InternalProcessResult::Modified => pkt - .emit_spec() + .emit_spec(&ameta) .map_err(|_| ProcessError::BadEmitSpec) .map(ProcessResult::Modified), }); @@ -1762,7 +1766,7 @@ impl Transforms { } #[inline] - fn compile(mut self, checksums_dirty: bool) -> Arc { + fn compile(mut self, flags: TransformFlags) -> Arc { // Compile to a fasterpath transform iff. no body transform. 
if self.body.is_empty() { let mut still_permissable = true; @@ -1925,7 +1929,7 @@ impl Transforms { inner_ether: inner_ether.cloned(), inner_ip: inner_ip.cloned(), inner_ulp: inner_ulp.cloned(), - checksums_dirty, + flags, } .into(), ); @@ -2353,10 +2357,18 @@ impl Port { Err(e) => return Err(ProcessError::Layer(e)), } + let mut flags = TransformFlags::empty(); + if pkt.checksums_dirty() { + flags |= TransformFlags::CSUM_DIRTY; + } + if ameta.is_internal_target() { + flags |= TransformFlags::INTERNAL_DESTINATION; + } + let ufid_out = pkt.flow().mirror(); let mut hte = UftEntry { pair: KMutex::new(Some(ufid_out)), - xforms: xforms.compile(pkt.checksums_dirty()), + xforms: xforms.compile(flags), epoch, l4_hash: ufid_in.crc32(), tcp_flow: None, @@ -2577,9 +2589,17 @@ impl Port { let flow_before = *pkt.flow(); let res = self.layers_process(data, Out, pkt, &mut xforms, ameta); + let mut flags = TransformFlags::empty(); + if pkt.checksums_dirty() { + flags |= TransformFlags::CSUM_DIRTY; + } + if ameta.is_internal_target() { + flags |= TransformFlags::INTERNAL_DESTINATION; + } + let hte = UftEntry { pair: KMutex::new(None), - xforms: xforms.compile(pkt.checksums_dirty()), + xforms: xforms.compile(flags), epoch, l4_hash: flow_before.crc32(), tcp_flow, @@ -3068,82 +3088,3 @@ unsafe extern "C" { ifid: *const InnerFlowId, ); } - -/// Metadata for inter-action communication. -pub mod meta { - use alloc::collections::BTreeMap; - use alloc::string::String; - use alloc::string::ToString; - - /// A value meant to be used in the [`ActionMeta`] map. - /// - /// The purpose of this trait is to define the value's key as well - /// as serialization to/from strings. These are like Display and - /// FromStr; but here their focus is on unambiguous parsing. That - /// is, we can't necessarily rely on a type's Display impl being - /// good for serializing to a metadata string, but at the same - /// time we don't want to force its Display to have to work in - /// this constraint. 
- /// - /// A value doesn't have to implement this type; there is nothing - /// that enforces the strings stored in [`ActionMeta`] are strings - /// generated by this trait impl. It's just a convenient way to - /// mark and implement values meant to be used as action metadata. - pub trait ActionMetaValue: Sized { - const KEY: &'static str; - - fn key(&self) -> String { - Self::KEY.to_string() - } - - /// Create a representation of the value to be used in - /// [`ActionMeta`]. - fn as_meta(&self) -> String; - - /// Attempt to create a value assuming that `s` was created - /// with [`Self::as_meta()`]. - fn from_meta(s: &str) -> Result; - } - - /// The action metadata map. - /// - /// This metadata is accessible by all actions during layer - /// processing and acts as a form of inter-action communication. - /// The action metadata is nothing more than a map of string keys - /// to string values -- their meaning is opaque to OPTE itself. It - /// is up to the actions to decide what these strings mean. - #[derive(Default)] - pub struct ActionMeta { - inner: BTreeMap, - } - - impl ActionMeta { - pub fn new() -> Self { - Self::default() - } - - /// Clear all entries. - pub fn clear(&mut self) { - self.inner.clear(); - } - - /// Insert the key-value pair into the map, replacing any - /// existing key-value pair. Return the value being replaced, - /// or `None`. - pub fn insert(&mut self, key: String, val: String) -> Option { - self.inner.insert(key, val) - } - - /// Remove the key-value pair with the specified key. Return - /// the value, or `None` if no such entry exists. - pub fn remove(&mut self, key: &str) -> Option { - self.inner.remove(key) - } - - /// Get a reference to the value with the given key, or `None` - /// if no such entry exists. 
- pub fn get(&self, key: &str) -> Option<&String> { - self.inner.get(key) - } - } -} diff --git a/lib/opte/src/engine/rule.rs b/lib/opte/src/engine/rule.rs index 4e9ca351..d3695a99 100644 --- a/lib/opte/src/engine/rule.rs +++ b/lib/opte/src/engine/rule.rs @@ -45,6 +45,7 @@ use alloc::string::String; use alloc::string::ToString; use alloc::sync::Arc; use alloc::vec::Vec; +use bitflags::bitflags; use core::ffi::CStr; use core::fmt; use core::fmt::Debug; @@ -319,6 +320,14 @@ impl Display for HdrTransform { } } +bitflags! { + #[derive(Copy, Clone, Debug, Deserialize, Serialize)] + pub struct TransformFlags: u8 { + const CSUM_DIRTY = 1; + const INTERNAL_DESTINATION = 1 << 1; + } +} + /// Header transformations matching a simple format, amenable /// to fastpath compilation: /// * Encap is either pushed or popped in its entirety, @@ -331,10 +340,24 @@ pub struct CompiledTransform { pub inner_ether: Option, pub inner_ip: Option, pub inner_ulp: Option, - pub checksums_dirty: bool, + pub flags: TransformFlags, } impl CompiledTransform { + /// Does this transform modify any fields which factor into the + /// inner frame's L3/L4 checksums? + #[inline] + pub fn checksums_dirty(&self) -> bool { + self.flags.contains(TransformFlags::CSUM_DIRTY) + } + + /// Can the remote side of this flow be accessed purely using + /// internal/private paths? 
+ #[inline] + pub fn internal_destination(&self) -> bool { + self.flags.contains(TransformFlags::INTERNAL_DESTINATION) + } + #[inline(always)] pub fn transform_ether( &self, @@ -463,13 +486,14 @@ impl CompiledEncap { return pkt; }; - let mut prepend = if pkt.head_capacity() < bytes.len() { - let mut pkt = MsgBlk::new_ethernet(bytes.len()); - pkt.pop_all(); - Some(pkt) - } else { - None - }; + let mut prepend = + if pkt.ref_count() > 1 || pkt.head_capacity() < bytes.len() { + let mut pkt = MsgBlk::new_ethernet(bytes.len()); + pkt.pop_all(); + Some(pkt) + } else { + None + }; let target = if let Some(prepend) = prepend.as_mut() { prepend @@ -498,6 +522,7 @@ impl CompiledEncap { *l4_len_slot = (l4_len as u16).to_be_bytes(); if let Some(mut prepend) = prepend { + pkt.copy_offload_info_to(&mut prepend); prepend.append(pkt); prepend } else { diff --git a/lib/oxide-vpc/src/engine/overlay.rs b/lib/oxide-vpc/src/engine/overlay.rs index 27425ab3..396fbc8d 100644 --- a/lib/oxide-vpc/src/engine/overlay.rs +++ b/lib/oxide-vpc/src/engine/overlay.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2025 Oxide Computer Company //! The Oxide Network VPC Overlay. //! @@ -237,6 +237,7 @@ impl StaticAction for EncapAction { let phys_target = match target { RouterTargetInternal::InternetGateway(_) => { + action_meta.set_internal_target(false); match self.v2b.get(&flow_id.dst_ip()) { Some(phys) => { // Hash the packet onto a route target. 
This is a very @@ -259,6 +260,7 @@ impl StaticAction for EncapAction { RouterTargetInternal::Ip(virt_ip) => match self.v2p.get(&virt_ip) { Some(phys) => { + action_meta.set_internal_target(true); PhysNet { ether: phys.ether, ip: phys.ip, vni: self.vni } } diff --git a/xde-tests/src/lib.rs b/xde-tests/src/lib.rs index 4ed72be0..a92c720d 100644 --- a/xde-tests/src/lib.rs +++ b/xde-tests/src/lib.rs @@ -4,6 +4,7 @@ // Copyright 2024 Oxide Computer Company +use anyhow::Context; use anyhow::Result; use opteadm::OpteAdm; use oxide_vpc::api::AddRouterEntryReq; @@ -355,6 +356,27 @@ pub fn two_node_topology() -> Result { println!("setup zone b"); b.setup(&opte1.name, opte1.ip())?; + // We now need to establish an NDP cache entry both ways, otherwise + // we'll write zero for both MAC addrs and the packet *will* be dropped -- + // we're not in promisc anymore :). One ping from the global zone will suffice. + let ping_res = Command::new("ping") + .args([ + "-A", + "inet6", + "-i", + &sim.end_b, + &format!("{}%{}", ll0.ip, sim.end_a), + ]) + .output() + .with_context(|| "calling 'ping' over simnet")?; + + if !ping_res.status.success() { + anyhow::bail!( + "Failed to ping over simnet links!\nstderr:{:?}", + std::str::from_utf8(&ping_res.stderr) + ); + } + Ok(Topology { xde, lls: vec![ll0, ll1], @@ -468,9 +490,9 @@ pub fn single_node_over_real_nic( // Create any null ports before our actual one, to get worst-case // lookups in the linear case. let underlay_addr = my_info.underlay_addr.to_string(); - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); while null_ports.len() as u32 != null_port_count { - let i = rng.gen_range(0..usable_macs.len()); + let i = rng.random_range(0..usable_macs.len()); let taken_mac = usable_macs.swap_remove(i).to_string(); // VIP reuse is not an issue, we aren't using these ports for communication. 
diff --git a/xde/src/dls/mod.rs b/xde/src/dls/mod.rs index 319cb7e9..694739fe 100644 --- a/xde/src/dls/mod.rs +++ b/xde/src/dls/mod.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2025 Oxide Computer Company //! Safe abstractions around DLS public and private functions. @@ -21,7 +21,7 @@ use illumos_sys_hdrs::ENOENT; use illumos_sys_hdrs::c_int; use illumos_sys_hdrs::datalink_id_t; use illumos_sys_hdrs::uintptr_t; -use opte::ddi::mblk::MsgBlk; +use opte::ddi::mblk::AsMblk; pub use sys::*; /// An integer ID used by DLS to refer to a given link. @@ -204,7 +204,7 @@ impl DlsStream { /// but for now we pass only a single packet at a time. pub fn tx_drop_on_no_desc( &self, - pkt: MsgBlk, + pkt: impl AsMblk, hint: uintptr_t, flags: MacTxFlags, ) { @@ -215,13 +215,16 @@ impl DlsStream { // We must unwrap the raw `mblk_t` out of the `pkt` here, // otherwise the mblk_t would be dropped at the end of this // function along with `pkt`. + let Some(mblk) = pkt.unwrap_mblk() else { + return; + }; let mut raw_flags = flags.bits(); raw_flags |= MAC_DROP_ON_NO_DESC; unsafe { // mac_tx(self.mch, pkt.unwrap_mblk(), hint, raw_flags, &mut ret_mp) str_mdata_fastpath_put( inner.dld_str.as_ptr(), - pkt.unwrap_mblk().as_ptr(), + mblk.as_ptr(), hint, raw_flags, ) diff --git a/xde/src/mac/mod.rs b/xde/src/mac/mod.rs index 45f3dfb2..7433ee32 100644 --- a/xde/src/mac/mod.rs +++ b/xde/src/mac/mod.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2024 Oxide Computer Company +// Copyright 2025 Oxide Computer Company //! Safe abstractions for the mac client API. //! 
@@ -18,9 +18,15 @@ use alloc::sync::Arc; use bitflags::bitflags; use core::ffi::CStr; use core::fmt; +use core::mem; +use core::mem::MaybeUninit; +use core::ops::RangeInclusive; use core::ptr; use illumos_sys_hdrs::*; +use ingot::ip::IpProtocol; +use opte::ddi::mblk::AsMblk; use opte::ddi::mblk::MsgBlk; +use opte::ddi::mblk::MsgBlkChain; use opte::engine::ether::EtherAddr; pub use sys::*; @@ -63,6 +69,7 @@ impl MacHandle { Ok(Self(mh)) } + /// Get the primary MAC address associated with this device. pub fn get_mac_addr(&self) -> [u8; 6] { let mut mac = [0u8; 6]; unsafe { @@ -70,6 +77,44 @@ impl MacHandle { } mac } + + /// Get the range of valid MTUs supported by this device. + pub fn get_valid_mtus(&self) -> RangeInclusive { + let (mut min, mut max) = (0, 0); + + unsafe { + mac_sdu_get(self.0, &raw mut min, &raw mut max); + } + + min..=max + } + + /// Query this device's supported checksum offload capabilities. + pub fn get_cso_capabs(&self) -> mac_capab_cso_t { + let mut cso = mac_capab_cso_t::default(); + unsafe { + mac_capab_get( + self.0, + mac_capab_t::MAC_CAPAB_HCKSUM, + (&raw mut cso) as *mut _, + ); + } + cso + } + + /// Query this device's supported large send offload capabilities. + pub fn get_lso_capabs(&self) -> mac_capab_lso_t { + let mut lso = MaybeUninit::::zeroed(); + unsafe { + mac_capab_get( + self.0, + mac_capab_t::MAC_CAPAB_LSO, + (&raw mut lso) as *mut _, + ); + + lso.assume_init() + } + } } impl Drop for MacHandle { @@ -106,6 +151,12 @@ bitflags! { // For now we only include flags currently used by consumers. 
pub struct MacOpenFlags: u16 { const NONE = 0; + const IS_VNIC = MAC_OPEN_FLAGS_IS_VNIC; + const EXCLUSIVE = MAC_OPEN_FLAGS_EXCLUSIVE; + const IS_AGGR_PORT = MAC_OPEN_FLAGS_IS_AGGR_PORT; + const SHARES_DESIRED = MAC_OPEN_FLAGS_SHARES_DESIRED; + const USE_DATALINK_NAME = MAC_OPEN_FLAGS_USE_DATALINK_NAME; + const MULTI_PRIMARY = MAC_OPEN_FLAGS_MULTI_PRIMARY; const NO_UNICAST_ADDR = MAC_OPEN_FLAGS_NO_UNICAST_ADDR; } } @@ -187,6 +238,7 @@ impl MacClientHandle { self.mch, ether.as_mut_ptr(), 0, + // MAC_UNICAST_PRIMARY | MAC_UNICAST_NODUPCHECK, &mut muh, 0, &mut diag, @@ -207,7 +259,7 @@ impl MacClientHandle { /// but for now we pass only a single packet at a time. pub fn tx( &self, - pkt: MsgBlk, + pkt: impl AsMblk, hint: uintptr_t, flags: MacTxFlags, ) -> Option { @@ -215,14 +267,9 @@ impl MacClientHandle { // otherwise the mblk_t would be dropped at the end of this // function along with `pkt`. let mut ret_mp = ptr::null_mut(); + let mblk = pkt.unwrap_mblk()?; unsafe { - mac_tx( - self.mch, - pkt.unwrap_mblk().as_ptr(), - hint, - flags.bits(), - &mut ret_mp, - ) + mac_tx(self.mch, mblk.as_ptr(), hint, flags.bits(), &mut ret_mp) }; if !ret_mp.is_null() { // Unwrap: We know the ret_mp is valid because we gave @@ -248,7 +295,7 @@ impl MacClientHandle { /// but for now we pass only a single packet at a time. pub fn tx_drop_on_no_desc( &self, - pkt: MsgBlk, + pkt: impl AsMblk, hint: uintptr_t, flags: MacTxFlags, ) { @@ -258,17 +305,44 @@ impl MacClientHandle { let mut raw_flags = flags.bits(); raw_flags |= MAC_DROP_ON_NO_DESC; let mut ret_mp = ptr::null_mut(); + + let Some(mblk) = pkt.unwrap_mblk() else { + return; + }; + unsafe { - mac_tx( - self.mch, - pkt.unwrap_mblk().as_ptr(), - hint, - raw_flags, - &mut ret_mp, - ) + mac_tx(self.mch, mblk.as_ptr(), hint, raw_flags, &mut ret_mp) }; debug_assert_eq!(ret_mp, ptr::null_mut()); } + + /// TODO: document what's happening here. + /// TODO: error conditions? 
+ pub fn rx_set(self: &Arc, promisc_fn: mac_rx_fn) { + let mch = self.clone(); + let arg = Arc::as_ptr(&mch) as *mut c_void; + unsafe { mac_rx_set(self.mch, Some(promisc_fn), arg) } + // unsafe { mac_rx_set(self.mch, None, arg) } + } + + pub fn set_flow_cb(self: &Arc, promisc_fn: mac_rx_fn) { + let mch = self.clone(); + let arg = Arc::as_ptr(&mch) as *mut c_void; + unsafe { mac_client_set_flow_cb(self.mch, Some(promisc_fn), arg) } + // unsafe { mac_client_set_flow_cb(self.mch, None, arg) } + } + + /// TODO: document what's happening here. + /// TODO: error conditions? + pub fn rx_bypass_disable(&self) { + unsafe { mac_rx_bypass_disable(self.mch) } + } + + /// TODO: document what's happening here. + /// TODO: error conditions? + pub fn rx_bypass_enable(&self) { + unsafe { mac_rx_bypass_enable(self.mch) } + } } impl Drop for MacClientHandle { @@ -386,3 +460,391 @@ impl Drop for MacPerimeterHandle { } } } + +bitflags! { +/// Flagset for requesting emulation on any packets marked +/// with the given offloads. +/// +/// Derived from `mac_emul_t` (mac.h). +pub struct MacEmul: u32 { + /// Calculate the L3/L4 checksums. + const HWCKSUM_EMUL = MAC_HWCKSUM_EMUL; + /// Calculate the IPv4 checksum, ignoring L4. + const IPCKSUM_EMUL = MAC_IPCKSUM_EMUL; + /// Segment TCP packets into MSS-sized chunks. + const LSO_EMUL = MAC_LSO_EMUL; +} +} + +/// Emulates various offloads (checksum, LSO) for packets on loopback paths. +/// +/// Specific offloads within `flags` must be requested using +/// [`MsgBlk::request_offload`]. 
+pub fn mac_hw_emul(msg: impl AsMblk, flags: MacEmul) -> Option { + let mut chain = msg.unwrap_mblk()?.as_ptr(); + unsafe { + sys::mac_hw_emul( + &raw mut chain, + ptr::null_mut(), + ptr::null_mut(), + flags.bits(), + ); + } + + (!chain.is_null()).then(|| unsafe { MsgBlkChain::new(chain).unwrap() }) +} + +#[derive(Copy, Clone, Debug)] +pub struct OffloadInfo { + pub cso_state: mac_capab_cso_t, + pub lso_state: mac_capab_lso_t, + pub mtu: u32, +} + +impl OffloadInfo { + /// Forwards the underlay's tunnel checksum offload capabilities into + /// standard capabilities. + pub fn upstream_csum(&self) -> mac_capab_cso_t { + let base_capabs = self.cso_state.cso_flags; + let mut out = mac_capab_cso_t::default(); + + if base_capabs.contains(ChecksumOffloadCapabs::TUNNEL_VALID) + && self.cso_state.cso_tunnel.ct_types.contains(TunnelType::GENEVE) + { + let tsco_flags = self.cso_state.cso_tunnel.ct_flags; + if tsco_flags.contains(TunnelCsoFlags::INNER_IPHDR) { + out.cso_flags |= ChecksumOffloadCapabs::INET_HDRCKSUM; + } + if tsco_flags.contains( + TunnelCsoFlags::INNER_TCP_PARTIAL + | TunnelCsoFlags::INNER_UDP_PARTIAL, + ) { + out.cso_flags |= ChecksumOffloadCapabs::INET_PARTIAL; + } + if tsco_flags.contains( + TunnelCsoFlags::INNER_TCP_FULL | TunnelCsoFlags::INNER_UDP_FULL, + ) { + out.cso_flags |= ChecksumOffloadCapabs::INET_FULL_V4 + | ChecksumOffloadCapabs::INET_FULL_V6; + } + } + + out + } + + /// Forwards the underlay's tunnel TCP LSO capabilities into + /// standard LSO capabilities. 
+ pub fn upstream_lso(&self) -> mac_capab_lso_t { + let mut out = mac_capab_lso_t::default(); + + if self.lso_state.lso_flags.contains(TcpLsoFlags::TUNNEL_TCP) + && self + .lso_state + .lso_tunnel_tcp + .tun_types + .contains(TunnelType::GENEVE) + { + out.lso_flags |= TcpLsoFlags::BASIC_IPV4 | TcpLsoFlags::BASIC_IPV6; + out.lso_basic_tcp_ipv4 = lso_basic_tcp_ipv4_t { + lso_max: self.lso_state.lso_tunnel_tcp.tun_pay_max, + }; + out.lso_basic_tcp_ipv6 = lso_basic_tcp_ipv6_t { + lso_max: self.lso_state.lso_tunnel_tcp.tun_pay_max, + }; + } + + out + } + + /// Return the set of capabilities and MTUs compatible across one or more + /// underlay devices. + pub fn mutual_capabs(&self, other: &Self) -> Self { + Self { + cso_state: mac_capab_cso_t { + cso_flags: self.cso_state.cso_flags & other.cso_state.cso_flags, + cso_tunnel: cso_tunnel_t { + ct_flags: self.cso_state.cso_tunnel.ct_flags + & other.cso_state.cso_tunnel.ct_flags, + ct_encap_max: self + .cso_state + .cso_tunnel + .ct_encap_max + .min(other.cso_state.cso_tunnel.ct_encap_max), + ct_types: self.cso_state.cso_tunnel.ct_types + & other.cso_state.cso_tunnel.ct_types, + }, + }, + lso_state: mac_capab_lso_t { + lso_flags: self.lso_state.lso_flags & other.lso_state.lso_flags, + lso_basic_tcp_ipv4: lso_basic_tcp_ipv4_t { + lso_max: self + .lso_state + .lso_basic_tcp_ipv4 + .lso_max + .min(other.lso_state.lso_basic_tcp_ipv4.lso_max), + }, + lso_basic_tcp_ipv6: lso_basic_tcp_ipv6_t { + lso_max: self + .lso_state + .lso_basic_tcp_ipv6 + .lso_max + .min(other.lso_state.lso_basic_tcp_ipv6.lso_max), + }, + lso_tunnel_tcp: lso_tunnel_tcp_t { + tun_pay_max: self + .lso_state + .lso_tunnel_tcp + .tun_pay_max + .min(other.lso_state.lso_tunnel_tcp.tun_pay_max), + tun_encap_max: self + .lso_state + .lso_tunnel_tcp + .tun_encap_max + .min(other.lso_state.lso_tunnel_tcp.tun_encap_max), + tun_flags: self.lso_state.lso_tunnel_tcp.tun_flags + & other.lso_state.lso_tunnel_tcp.tun_flags, + tun_types: 
self.lso_state.lso_tunnel_tcp.tun_types + & other.lso_state.lso_tunnel_tcp.tun_types, + tun_pad: [0; 2], + }, + }, + mtu: self.mtu.min(other.mtu), + } + } +} + +#[derive(Clone, Default)] +pub struct MacFlowDesc { + mask: flow_mask_t, + ip_ver: Option, + proto: Option, + local_port: Option, +} + +// TODO At the moment the flow system only allows six different +// combinations of attributes: +// +// local_ip=address[/prefixlen] +// remote_ip=address[/prefixlen] +// transport={tcp|udp|sctp|icmp|icmpv6} +// transport={tcp|udp|sctp},local_port=port +// transport={tcp|udp|sctp},remote_port=port +// dsfield=val[:dsfield_mask] +// +// Update this type to enforce the above. +// +// For now my best bet is to use +// transport={tcp|udp|sctp},local_port=port to classify on Geneve +// packets. +impl MacFlowDesc { + pub fn new() -> Self { + Default::default() + } + + /// TODO create an IpVersion type to avoid invalid descriptors. + pub fn set_ipver(&mut self, ver: u8) -> &mut Self { + self.mask |= FLOW_IP_VERSION; + self.ip_ver = Some(ver); + self + } + + pub fn set_proto(&mut self, proto: IpProtocol) -> &mut Self { + self.mask |= FLOW_IP_PROTOCOL; + self.proto = Some(proto); + self + } + + pub fn set_local_port(&mut self, port: u16) -> &mut Self { + self.mask |= FLOW_ULP_PORT_LOCAL; + self.local_port = Some(port); + self + } + + pub fn to_desc(&self) -> flow_desc_t { + flow_desc_t::from(self.clone()) + } + + pub fn new_flow<'a, P>( + &'a self, + flow_name: &'a str, + link_id: LinkId, + ) -> Result, FlowCreateError<'a>> { + let name = CString::new(flow_name) + .map_err(|_| FlowCreateError::InvalidFlowName(flow_name))?; + let desc = self.to_desc(); + + match unsafe { + mac_link_flow_add( + link_id.into(), + name.as_ptr(), + &desc, + &MAC_RESOURCE_PROPS_DEF, + ) + } { + 0 => {} + err => return Err(FlowCreateError::CreateFailed(flow_name, err)), + } + + let mut flent = ptr::null_mut(); + match unsafe { mac_flow_lookup_byname(name.as_ptr(), &mut flent) } { + 0 => Ok(MacFlow { name, 
flent, parent: None }), + err => Err(FlowCreateError::CreateFailed(flow_name, err)), + } + } +} + +impl From for flow_desc_t { + fn from(mf: MacFlowDesc) -> Self { + let no_addr = IP_NO_ADDR; + let fd_ipversion = mf.ip_ver.unwrap_or(0); + + // The mac flow subsystem uses 0 as sentinel to indicate no + // filtering on protocol. + let fd_protocol = match mf.proto { + Some(p) => p.0, + None => 0, + }; + + // Apparently mac flow wants this in network order. + let fd_local_port = mf.local_port.unwrap_or(0).to_be(); + + Self { + // The mask controls options like priority and bandwidth, + // which we are not using at the moment. + fd_mask: mf.mask, + fd_mac_len: 0, + fd_dst_mac: [0u8; MAXMACADDR], + fd_src_mac: [0u8; MAXMACADDR], + fd_vid: 0, + // XXX Typically I would saw the SAP is the EtherType + // (when talking about Ethernet, but this stuff always + // confuses me. This doesn't seem to be used to filter + // anything. + fd_sap: 0, + fd_ipversion, + fd_protocol, + fd_local_addr: no_addr, + fd_local_netmask: no_addr, + fd_remote_addr: no_addr, + fd_remote_netmask: no_addr, + fd_local_port, + fd_remote_port: 0, + fd_dsfield: 0, + fd_dsfield_mask: 0, + } + } +} + +/// Errors while opening a MAC handle. +#[derive(Debug)] +pub enum FlowCreateError<'a> { + InvalidFlowName(&'a str), + CreateFailed(&'a str, i32), + LookupFailed(&'a str, i32), +} + +impl fmt::Display for FlowCreateError<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + FlowCreateError::InvalidFlowName(flow) => { + write!(f, "invalid flow name: {flow}") + } + FlowCreateError::CreateFailed(flow, err) => { + write!(f, "mac_link_flow_add failed for {flow}: {err}") + } + FlowCreateError::LookupFailed(flow, err) => { + write!(f, "mac_flow_lookup_byname failed for {flow}: {err}") + } + } + } +} + +/// Resource handle for a created Mac flow. +#[derive(Debug)] +pub struct MacFlow

{ + name: CString, + flent: *mut flow_entry_t, + parent: Option<*const P>, +} + +impl

MacFlow

{ + /// Forcibly wrench this flow from the clutches of DLS. + pub fn set_flow_cb(&mut self, new_fn: mac_rx_fn, parent: Arc

) { + let parent = Arc::into_raw(parent); + let arg = parent as *mut c_void; + + unsafe { + crate::warn!( + "Some potential offsets {:x} {:x} {:x} {:x} {:x} {:x}", + mem::offset_of!(flow_entry_t, fe_next), + mem::offset_of!(flow_entry_t, fe_link_id), + mem::offset_of!(flow_entry_t, fe_resource_props), + mem::offset_of!(flow_entry_t, fe_effective_props), + mem::offset_of!(flow_entry_t, fe_lock), + mem::offset_of!(flow_entry_t, fe_rx_srs) + ); + + // flow_cb_pre_srs(self.flent, new_fn); + flow_cb_post_srs(self.flent, new_fn, arg); + } + self.parent = Some(parent); + } +} + +/// Insert a flow callback which replaces `mac_rx_srs_subflow_process`. +#[unsafe(no_mangle)] +pub unsafe fn flow_cb_pre_srs(flent: *mut flow_entry_t, new_fn: mac_rx_fn) { + // TODO: locks, negative handling, ... + unsafe { + (*flent).fe_cb_fn = new_fn; + } +} + +/// Insert a flow callback accessed via `mac_rx_srs_subflow_process`. +/// +/// This will usually replace `mac_rx_deliver`, which will call into the mac_rx +/// callback on the *parent device* (i.e., i_dls_link_rx). +#[unsafe(no_mangle)] +pub unsafe fn flow_cb_post_srs( + flent: *mut flow_entry_t, + new_fn: mac_rx_fn, + arg: *mut c_void, +) { + // TODO: locks, negative handling, ... + unsafe { + let n_srs = (*flent).fe_rx_srs_cnt; + for srs in (*flent).fe_rx_srs.iter().take(n_srs as usize) { + mutex_enter(&mut (**srs).srs_lock); + (**srs).srs_rx.sr_func = new_fn; + (**srs).srs_rx.sr_arg1 = arg; + mutex_exit(&mut (**srs).srs_lock); + } + } +} + +unsafe fn flow_user_refrele(flent: *mut flow_entry_t) { + unsafe { + // TODO: use the existing KMutex code... + mutex_enter(&mut (*flent).fe_lock); + // ASSERT((flent)->fe_user_refcnt != 0); \ + (*flent).fe_user_refcnt -= 1; + if (*flent).fe_user_refcnt == 0 && (*flent).fe_flags & FE_WAITER != 0 { + crate::ip::cv_signal((&mut (*flent).fe_cv) as *mut _); + } + + mutex_exit(&mut (*flent).fe_lock); + } +} + +impl

Drop for MacFlow

{ + fn drop(&mut self) { + unsafe { + // TODO: need to reimplement FLOW_USER_REFRELE in here... + flow_user_refrele(self.flent); + mac_link_flow_remove(self.name.as_ptr()); + if let Some(parent) = self.parent { + Arc::from_raw(parent); // dropped immediately + } + } + } +} diff --git a/xde/src/mac/sys.rs b/xde/src/mac/sys.rs index f2bfca79..daa7f213 100644 --- a/xde/src/mac/sys.rs +++ b/xde/src/mac/sys.rs @@ -6,21 +6,36 @@ // stuff we need from mac +use crate::ip::clock_t; +use crate::ip::in6_addr__bindgen_ty_1; +use crate::ip::in6_addr_t; +use crate::ip::kcondvar_t; +use crate::ip::kthread_t; +use crate::ip::pri_t; +use crate::ip::processorid_t; +use crate::ip::timeout_id_t; +use illumos_sys_hdrs::MAXPATHLEN; use illumos_sys_hdrs::boolean_t; use illumos_sys_hdrs::c_char; use illumos_sys_hdrs::c_int; +use illumos_sys_hdrs::c_uchar; use illumos_sys_hdrs::c_uint; use illumos_sys_hdrs::c_void; use illumos_sys_hdrs::datalink_id_t; use illumos_sys_hdrs::ddi_info_cmd_t; use illumos_sys_hdrs::dev_info; use illumos_sys_hdrs::dev_ops; +use illumos_sys_hdrs::kmutex_t; use illumos_sys_hdrs::mblk_t; use illumos_sys_hdrs::minor_t; use illumos_sys_hdrs::queue_t; use illumos_sys_hdrs::size_t; use illumos_sys_hdrs::uintptr_t; +pub const MAX_RINGS_PER_GROUP: usize = 128; + +pub const MAXFLOWNAMELEN: usize = 128; + pub const MAC_DROP_ON_NO_DESC: u16 = 0x01; pub const MAC_TX_NO_ENQUEUE: u16 = 0x02; pub const MAC_TX_NO_HOLD: u16 = 0x04; @@ -67,6 +82,8 @@ pub enum link_state_t { #[allow(unused_imports)] use mac_client_promisc_type_t::*; +use crate::ip::t_uscalar_t; + pub type mac_tx_cookie_t = uintptr_t; pub type mac_rx_fn = unsafe extern "C" fn( *mut c_void, @@ -75,6 +92,14 @@ pub type mac_rx_fn = unsafe extern "C" fn( boolean_t, ); +// typedef boolean_t (*flow_match_fn_t)(flow_tab_t *, flow_entry_t *, +// flow_state_t *); +pub type flow_match_fn = + unsafe extern "C" fn(*mut c_void, *mut flow_entry_t, *mut c_void); + +// TODO: actual. 
+pub type mac_srs_drain_proc_fn = *mut c_void; + unsafe extern "C" { pub type mac_handle; pub type mac_client_handle; @@ -111,6 +136,10 @@ unsafe extern "C" { pub fn mac_client_close(mch: *const mac_client_handle, flags: u16); pub fn mac_client_name(mch: *const mac_client_handle) -> *const c_char; pub fn mac_close(mh: *mut mac_handle); + pub fn mac_open_by_linkid( + linkid: datalink_id_t, + mhp: *mut *mut mac_handle, + ) -> c_int; pub fn mac_open_by_linkname( link: *const c_char, mhp: *mut *mut mac_handle, @@ -127,7 +156,7 @@ unsafe extern "C" { pub fn mac_rx_barrier(mch: *const mac_client_handle); pub fn mac_rx_set( mch: *const mac_client_handle, - rx_fn: mac_rx_fn, + rx_fn: Option, arg: *mut c_void, ); pub fn mac_rx_clear(mch: *const mac_client_handle); @@ -159,6 +188,33 @@ unsafe extern "C" { mp_chain: *mut mblk_t, ); pub fn mac_private_minor() -> minor_t; + + pub fn mac_sdu_get( + mh: *mut mac_handle, + min_sdu: *mut c_uint, + max_sdu: *mut c_uint, + ); + + pub fn mac_link_flow_add( + linkid: datalink_id_t, + flow_name: *const c_char, + flow_desc: *const flow_desc_t, + mrp: *const mac_resource_props_t, + ) -> c_int; + pub fn mac_link_flow_remove(flow_name: *const c_char) -> c_int; + + pub fn mac_rx_bypass_enable(mch: *mut mac_client_handle); + pub fn mac_rx_bypass_disable(mch: *mut mac_client_handle); + + pub fn mac_client_set_flow_cb( + mch: *mut mac_client_handle, + rx_fn: Option, + arg: *mut c_void, + ); + pub fn mac_flow_lookup_byname( + flow_name: *const c_char, + flent: *mut *mut flow_entry_t, + ) -> c_int; } // Private MAC functions needed to get us a Tx path. 
@@ -177,6 +233,158 @@ unsafe extern "C" { ) -> c_int; pub fn mac_perim_exit(mph: mac_perim_handle); pub fn mac_perim_held(mh: mac_handle) -> boolean_t; + + pub fn mac_hw_emul( + mp_chain: *mut *mut mblk_t, + otail: *mut *mut mblk_t, + ocount: *mut c_uint, + mac_emul: u32, + ); + + pub fn mac_capab_get( + mh: *mut mac_handle, + capab: mac_capab_t, + data: *mut c_void, + ) -> boolean_t; +} + +// ====================================================================== +// uts/common/sys/mac_provider.h +// ====================================================================== + +#[repr(C)] +#[derive(Clone, Copy, Default, Debug)] +pub struct cso_tunnel_t { + pub ct_flags: TunnelCsoFlags, + pub ct_encap_max: u32, + pub ct_types: TunnelType, +} + +#[derive(Clone, Copy, Default, Debug)] +pub struct mac_capab_cso_t { + pub cso_flags: ChecksumOffloadCapabs, + pub cso_tunnel: cso_tunnel_t, +} + +#[repr(C)] +#[derive(Clone, Copy, Default, Debug)] +pub struct lso_basic_tcp_ipv4_t { + pub lso_max: t_uscalar_t, +} + +#[repr(C)] +#[derive(Clone, Copy, Default, Debug)] +pub struct lso_basic_tcp_ipv6_t { + pub lso_max: t_uscalar_t, +} + +#[repr(C)] +#[derive(Clone, Copy, Default, Debug)] +pub struct lso_tunnel_tcp_t { + pub tun_pay_max: u32, + pub tun_encap_max: u32, + pub tun_flags: TunnelTcpLsoFlags, + pub tun_types: TunnelType, + pub tun_pad: [u32; 2], +} + +#[repr(C)] +#[derive(Clone, Copy, Default, Debug)] +pub struct mac_capab_lso_t { + pub lso_flags: TcpLsoFlags, + pub lso_basic_tcp_ipv4: lso_basic_tcp_ipv4_t, + pub lso_basic_tcp_ipv6: lso_basic_tcp_ipv6_t, + + pub lso_tunnel_tcp: lso_tunnel_tcp_t, +} + +bitflags::bitflags! { +/// Classes of TCP segmentation offload supported by a MAC provider. +/// +/// These are derived from `#define LSO_TX_*` statements in +/// mac_provider.h, omitting the enum prefix. +#[derive(Clone, Copy, Debug, Default)] +pub struct TcpLsoFlags: u32 { + /// The device supports TCP LSO over IPv4. 
+ const BASIC_IPV4 = 1 << 0;
+ /// The device supports TCP LSO over IPv6.
+ const BASIC_IPV6 = 1 << 1;
+ /// The device supports LSO of TCP packets within IP-based tunnels.
+ const TUNNEL_TCP = 1 << 2;
+}
+
+/// Supported LSO use specific to [`TcpLsoFlags::TUNNEL_TCP`].
+///
+/// These are derived from `#define LSO_TX_TUNNEL_*` statements in
+/// mac_provider.h, omitting the enum prefix.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct TunnelTcpLsoFlags: u32 {
+ /// The device can fill the outer L4 (e.g., UDP) checksum
+ /// on generated tunnel packets.
+ const FILL_OUTER_CSUM = 1 << 0;
+}
+
+/// Classes of checksum offload supported by a MAC provider.
+///
+/// These are derived from `#define HCKSUM_*` statements in
+/// dlpi.h, omitting the enum prefix.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct ChecksumOffloadCapabs: u32 {
+ /// Legacy definition -- CSO is enabled on the device.
+ const ENABLE = 1 << 0;
+
+ /// Device can finalize packet checksum when provided with a partial
+ /// (pseudoheader) checksum.
+ const INET_PARTIAL = 1 << 1;
+ /// Device can compute full (L3+L4) checksum of TCP/UDP over IPv4.
+ const INET_FULL_V4 = 1 << 2;
+ /// Device can compute full (L4) checksum of TCP/UDP over IPv6.
+ const INET_FULL_V6 = 1 << 3;
+ /// Device can compute IPv4 header checksum.
+ const INET_HDRCKSUM = 1 << 4;
+
+ const NON_TUN_CAPABS =
+ Self::ENABLE.bits() | Self::INET_PARTIAL.bits() |
+ Self::INET_FULL_V4.bits() | Self::INET_FULL_V6.bits() |
+ Self::INET_HDRCKSUM.bits();
+
+ /// The `cso_tunnel` field has been filled by the driver.
+ const TUNNEL_VALID = 1 << 5;
+}
+
+/// Classes of checksum offload supported for tunnelled packets by a
+/// MAC provider.
+///
+/// These are derived from `#define MAC_CSO_TUN_*` statements in
+/// mac_provider.h, omitting the enum prefix.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct TunnelCsoFlags: u32 {
+ /// The inner IPv4 checksum can be entirely computed in hardware. 
+ const INNER_IPHDR = 1 << 0;
+ /// The inner TCP checksum must contain the IPv4/v6 pseudoheader.
+ const INNER_TCP_PARTIAL = 1 << 1;
+ /// The inner TCP checksum can be entirely computed in hardware.
+ const INNER_TCP_FULL = 1 << 2;
+ /// The inner UDP checksum must contain the IPv4/v6 pseudoheader.
+ const INNER_UDP_PARTIAL = 1 << 3;
+ /// The inner UDP checksum can be entirely computed in hardware.
+ const INNER_UDP_FULL = 1 << 4;
+ /// The outer IPv4 checksum can be entirely computed in hardware.
+ const OUTER_IPHDR = 1 << 5;
+ /// When requested, the outer UDP checksum (e.g., in Geneve/VXLAN) must
+ /// contain the IPv4/v6 pseudoheader.
+ const OUTER_UDP_PARTIAL = 1 << 6;
+ /// When requested, the outer UDP checksum (e.g., in Geneve/VXLAN) can be
+ /// entirely computed in hardware.
+ const OUTER_UDP_FULL = 1 << 7;
+}
+
+/// Classes of tunnel supported by a MAC provider.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct TunnelType: u32 {
+ const GENEVE = 1 << 0;
+ const VXLAN = 1 << 1;
+}
}

#[repr(C)]
@@ -461,3 +669,606 @@ pub struct mac_register_t {
 pub m_v12n: u32,
 pub m_multicast_sdu: c_uint,
}
+
+// ======================================================================
+// uts/common/sys/mac.h
+// ======================================================================
+pub const MAC_HWCKSUM_EMUL: u32 = 1 << 0;
+pub const MAC_IPCKSUM_EMUL: u32 = 1 << 1;
+pub const MAC_LSO_EMUL: u32 = 1 << 2;
+
+// ======================================================================
+// uts/common/sys/mac_flow.h
+// ======================================================================
+pub type flow_mask_t = u64;
+pub const MAXMACADDR: usize = 20;
+
+// mac_flow.h:
+// #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+// #pragma pack(4)
+// #endif
+
+#[repr(C, packed(4))]
+pub struct flow_desc_t {
+ pub fd_mask: flow_mask_t,
+ pub fd_mac_len: u32,
+ pub fd_dst_mac: [u8; MAXMACADDR],
+ pub fd_src_mac: [u8; MAXMACADDR],
+ pub fd_vid: u16,
+ pub fd_sap: u32,
+ 
pub fd_ipversion: u8, + pub fd_protocol: u8, + pub fd_local_addr: crate::ip::in6_addr_t, + pub fd_local_netmask: crate::ip::in6_addr_t, + pub fd_remote_addr: crate::ip::in6_addr_t, + pub fd_remote_netmask: crate::ip::in6_addr_t, + pub fd_local_port: crate::ip::in_port_t, + pub fd_remote_port: crate::ip::in_port_t, + pub fd_dsfield: u8, + pub fd_dsfield_mask: u8, +} + +pub const IP_NO_ADDR: in6_addr_t = + in6_addr_t { _S6_un: in6_addr__bindgen_ty_1 { _S6_u16: [0u16; 8] } }; + +#[repr(C, packed(4))] +pub struct mac_resource_props_t { + mrp_mask: u32, + mrp_maxbw: u64, + mrp_priority: mac_priority_level_t, + mrp_cpus: mac_cpus_t, + mrp_protect: mac_protect_t, + mrp_nrxings: u32, + mrp_ntxrings: u32, + mrp_pool: [c_char; MAXPATHLEN], +} + +pub const FLOW_LINK_DST: u64 = 0x00000001; +pub const FLOW_LINK_SRC: u64 = 0x00000002; +pub const FLOW_LINK_VID: u64 = 0x00000004; +pub const FLOW_LINK_SAP: u64 = 0x00000008; + +pub const FLOW_IP_VERSION: u64 = 0x00000010; +pub const FLOW_IP_PROTOCOL: u64 = 0x00000020; +pub const FLOW_IP_LOCAL: u64 = 0x00000040; +pub const FLOW_IP_REMOTE: u64 = 0x00000080; +pub const FLOW_IP_DSFIELD: u64 = 0x00000100; + +pub const FLOW_ULP_PORT_LOCAL: u64 = 0x00001000; +pub const FLOW_ULP_PORT_REMOTE: u64 = 0x00002000; + +pub const MPT_MACNOSPOOF: c_int = 0x00000001; +pub const MPT_RESTRICTED: c_int = 0x00000002; +pub const MPT_IPNOSPOOF: c_int = 0x00000004; +pub const MPT_DHCPNOSPOOF: c_int = 0x00000008; +pub const MPT_ALL: c_int = 0x0000000f; +pub const MPT_RESET: c_int = -1; +pub const MPT_MAXCNT: usize = 32; +pub const MPT_MAXIPADDR: usize = MPT_MAXCNT; +pub const MPT_MAXCID: usize = MPT_MAXCNT; +pub const MPT_MAXCIDLEN: usize = 256; + +pub const MRP_MAXBW: c_int = 0x00000001; +pub const MRP_CPUS: c_int = 0x00000002; +pub const MRP_CPUS_USERSPEC: c_int = 0x00000004; +pub const MRP_PRIORITY: c_int = 0x00000008; +pub const MRP_PROTECT: c_int = 0x00000010; +pub const MRP_RX_RINGS: c_int = 0x00000020; +pub const MRP_TX_RINGS: c_int = 0x00000040; +pub 
const MRP_RXRINGS_UNSPEC: c_int = 0x00000080; +pub const MRP_TXRINGS_UNSPEC: c_int = 0x00000100; +pub const MRP_RINGS_RESET: c_int = 0x00000200; +pub const MRP_POOL: c_int = 0x00000400; + +pub const MRP_NCPUS: usize = 256; + +#[repr(C)] +#[derive(Clone, Copy)] +pub enum mac_priority_level_t { + MPL_LOW, + MPL_MEDIUM, + MPL_HIGH, + MPL_RESET, +} + +#[repr(C)] +#[derive(Clone, Copy)] +pub enum mac_cpu_mode_t { + MCM_FANOUT = 1, + MCM_CPUS, +} + +pub const MAC_RESOURCE_PROPS_DEF: mac_resource_props_t = mac_resource_props_t { + mrp_mask: 0, + mrp_maxbw: 0, + mrp_priority: mac_priority_level_t::MPL_HIGH, + mrp_cpus: MAC_CPUS_DEF, + mrp_protect: MAC_PROTECT_DEF, + mrp_nrxings: 0, + mrp_ntxrings: 0, + mrp_pool: [0; MAXPATHLEN], +}; + +pub const MAC_CPUS_DEF: mac_cpus_t = mac_cpus_t { + mc_ncpus: 0, + mc_cpus: [0; MRP_NCPUS], + mc_rx_fanout_cnt: 0, + mc_rx_fanout_cpus: [0; MRP_NCPUS], + mc_rx_pollid: 0, + mc_rx_workerid: 0, + mc_rx_intr_cpu: 0, + mc_tx_fanout_cpus: [0; MRP_NCPUS], + mc_tx_intr_cpus: mac_tx_intr_cpu_t { + mtc_intr_cpu: [0; MRP_NCPUS], + mtc_retargeted_cpu: [0; MRP_NCPUS], + }, + mc_fanout_mode: mac_cpu_mode_t::MCM_FANOUT, +}; + +#[repr(C, packed(4))] +pub struct mac_cpus_t { + mc_ncpus: u32, + mc_cpus: [u32; MRP_NCPUS], + mc_rx_fanout_cnt: u32, + mc_rx_fanout_cpus: [u32; MRP_NCPUS], + mc_rx_pollid: u32, + mc_rx_workerid: u32, + mc_rx_intr_cpu: i32, + mc_tx_fanout_cpus: [i32; MRP_NCPUS], + mc_tx_intr_cpus: mac_tx_intr_cpu_t, + mc_fanout_mode: mac_cpu_mode_t, +} + +#[repr(C, packed(4))] +pub struct mac_tx_intr_cpu_t { + mtc_intr_cpu: [i32; MRP_NCPUS], + mtc_retargeted_cpu: [i32; MRP_NCPUS], +} + +#[repr(C, packed(4))] +pub struct mac_protect_t { + mp_types: u32, + mp_ipaddrcnt: u32, + mp_ipaddrs: [mac_ipaddr_t; MPT_MAXIPADDR as usize], + mp_cidcnt: u32, + mp_cids: [mac_dhcpcid_t; MPT_MAXCID as usize], +} + +pub const MAC_PROTECT_DEF: mac_protect_t = mac_protect_t { + mp_types: 0, + mp_ipaddrcnt: 0, + mp_ipaddrs: [MAC_IPADDR_DEF; MPT_MAXIPADDR], + mp_cidcnt: 
0, + mp_cids: [MAC_DHCPCID_DEF; MPT_MAXCID], +}; + +#[repr(C, packed(4))] +pub struct mac_ipaddr_t { + ip_version: u32, + ip_addr: in6_addr_t, + ip_netmask: u8, +} + +pub const MAC_IPADDR_DEF: mac_ipaddr_t = + mac_ipaddr_t { ip_version: 0, ip_addr: IP_NO_ADDR, ip_netmask: 0 }; + +#[repr(C, packed(4))] +pub struct mac_dhcpcid_t { + dc_id: [c_uchar; MPT_MAXCIDLEN as usize], + dc_len: u32, + dc_form: mac_dhcpcid_form_t, +} + +pub const MAC_DHCPCID_DEF: mac_dhcpcid_t = mac_dhcpcid_t { + dc_id: [0; MPT_MAXCIDLEN], + dc_len: 0, + dc_form: mac_dhcpcid_form_t::CIDFORM_TYPED, +}; + +#[repr(C)] +#[derive(Clone, Copy)] +pub enum mac_dhcpcid_form_t { + CIDFORM_TYPED = 1, + CIDFORM_HEX, + CIDFORM_STR, +} + +// typedef mac_resource_handle_t (*mac_resource_add_t)(void *, mac_resource_t *); +pub type mac_resource_handle_t = *mut c_void; + +// have 0 8 10 3c50 7890 ... 79d8 +// want 0 8 0c 3c48 7888 ... 79d0 + +#[repr(C)] +pub struct flow_entry_t { + pub fe_next: *mut flow_entry_t, /* ft_lock */ + + pub fe_link_id: datalink_id_t, /* WO */ + + /* Properties as specified for this flow */ + pub fe_resource_props: mac_resource_props_t, /* SL */ + + /* Properties actually effective at run time for this flow */ + pub fe_effective_props: mac_resource_props_t, /* SL */ + + pub fe_lock: kmutex_t, + fe_flow_name: [c_char; MAXFLOWNAMELEN], /* fe_lock */ + fe_flow_desc: flow_desc_t, /* fe_lock */ + pub fe_cv: kcondvar_t, /* fe_lock */ + /* + * Initial flow ref is 1 on creation. A thread that lookups the + * flent typically by a mac_flow_lookup() dynamically holds a ref. + * If the ref is 1, it means there arent' any upcalls from the driver + * or downcalls from the stack using this flent. Structures pointing + * to the flent or flent inserted in lists don't count towards this + * refcnt. Instead they are tracked using fe_flags. Only a control + * thread doing a teardown operation deletes the flent, after waiting + * for upcalls to finish synchronously. 
The fe_refcnt tracks + * the number of upcall refs + */ + fe_refcnt: u32, /* fe_lock */ + + /* + * This tracks lookups done using the global hash list for user + * generated flows. This refcnt only protects the flent itself + * from disappearing and helps walkers to read the flent info such + * as flow spec. However the flent may be quiesced and the SRS could + * be deleted. The fe_user_refcnt tracks the number of global flow + * has refs. + */ + pub fe_user_refcnt: u32, /* fe_lock */ + pub fe_flags: c_uint, /* fe_lock */ + + //uint_t + + /* + * Function/args to invoke for delivering matching packets + * Only the function ff_fn may be changed dynamically and atomically. + * The ff_arg1 and ff_arg2 are set at creation time and may not + * be changed. + */ + pub fe_cb_fn: mac_rx_fn, /* fe_lock */ + pub fe_cb_arg1: *mut c_void, /* fe_lock */ + pub fe_cb_arg2: *mut c_void, /* fe_lock */ + + fe_client_cookie: *mut c_void, /* WO */ + fe_rx_ring_group: *mut c_void, /* SL */ + pub fe_rx_srs: [*mut mac_soft_ring_set_t; MAX_RINGS_PER_GROUP], /* fe_lock */ + pub fe_rx_srs_cnt: c_int, /* fe_lock */ + //int?? + fe_tx_ring_group: *mut c_void, + fe_tx_srs: *mut c_void, /* WO */ + fe_tx_ring_cnt: c_int, + + /* + * This is a unicast flow, and is a mac_client_impl_t + */ + fe_mcip: *mut c_void, /* WO */ + + /* + * Used by mci_flent_list of mac_client_impl_t to track flows sharing + * the same mac_client_impl_t. + */ + fe_client_next: *mut flow_entry_t, + + /* + * This is a broadcast or multicast flow and is a mac_bcast_grp_t + */ + fe_mbg: *mut c_void, /* WO */ + fe_type: c_uint, /* WO */ + + /* + * BW control info. + */ + fe_tx_bw: mac_bw_ctl_t, + fe_rx_bw: mac_bw_ctl_t, + + /* + * Used by flow table lookup code + */ + fe_match: flow_match_fn, + + /* + * Used by mac_flow_remove(). 
+ */ + fe_index: c_int, + fe_flow_tab: *mut c_void, //flow_tab_t + + fe_ksp: *mut c_void, // kstat_t + fe_misc_stat_ksp: *mut c_void, // kstat_t + + fe_desc_logged: boolean_t, + fe_nic_speed: u64, +} + +#[repr(C)] +pub struct mac_bw_ctl_t { + mac_bw_lock: kmutex_t, + mac_bw_state: u32, + mac_bw_sz: size_t, /* ?? Is it needed */ + mac_bw_limit: size_t, /* Max bytes to process per tick */ + mac_bw_used: size_t, /* Bytes processed in current tick */ + mac_bw_drop_threshold: size_t, /* Max queue length */ + mac_bw_drop_bytes: size_t, + mac_bw_polled: size_t, + mac_bw_intr: size_t, + mac_bw_curr_time: clock_t, +} + +pub type mac_soft_ring_t = c_void; + +#[repr(C)] +pub struct mac_soft_ring_set_t { + /* + * Common elements, common to both Rx and Tx SRS type. + * The following block of fields are protected by srs_lock + */ + pub srs_lock: kmutex_t, + srs_type: u32, + srs_state: u32, /* state flags */ + srs_count: u32, + srs_first: *mut mblk_t, /* first mblk chain or NULL */ + srs_last: *mut mblk_t, /* last mblk chain or NULL */ + srs_async: kcondvar_t, /* cv for worker thread */ + srs_cv: kcondvar_t, /* cv for poll thread */ + srs_quiesce_done_cv: kcondvar_t, /* cv for removal */ + srs_tid: timeout_id_t, /* timeout id for pending timeout */ + + /* + * List of soft rings & processing function. + * The following block is protected by Rx quiescence. + * i.e. they can be changed only after quiescing the SRS + * Protected by srs_lock. 
+ */ + srs_soft_ring_head: *mut mac_soft_ring_t, + srs_soft_ring_tail: *mut mac_soft_ring_t, + srs_soft_ring_count: c_int, + srs_soft_ring_quiesced_count: c_int, + srs_soft_ring_condemned_count: c_int, + srs_tcp_soft_rings: *mut *mut mac_soft_ring_t, + srs_tcp_ring_count: c_int, + srs_udp_soft_rings: *mut *mut mac_soft_ring_t, + srs_udp_ring_count: c_int, + srs_tcp6_soft_rings: *mut *mut mac_soft_ring_t, + srs_tcp6_ring_count: c_int, + srs_udp6_soft_rings: *mut *mut mac_soft_ring_t, + srs_udp6_ring_count: c_int, + srs_oth_soft_rings: *mut *mut mac_soft_ring_t, + srs_oth_ring_count: c_int, + /* + * srs_tx_soft_rings is used by tx_srs in + * when operating in multi tx ring mode. + */ + srs_tx_soft_rings: *mut *mut mac_soft_ring_t, + srs_tx_ring_count: c_int, + + /* + * Bandwidth control related members. + * They are common to both Rx- and Tx-side. + * Following protected by srs_lock + */ + srs_bw: *mut mac_bw_ctl_t, + srs_size: size_t, /* Size of packets queued in bytes */ + srs_pri: pri_t, + + srs_next: *mut mac_soft_ring_set_t, /* mac_srs_g_lock */ + srs_prev: *mut mac_soft_ring_set_t, /* mac_srs_g_lock */ + + /* Attribute specific drain func (BW ctl vs non-BW ctl) */ + srs_drain_func: mac_srs_drain_proc_fn, /* Write once (WO) */ + + /* + * If the associated ring is exclusively used by a mac client, e.g., + * an aggregation, this fields is used to keep a reference to the + * MAC client's pseudo ring. + */ + srs_mrh: mac_resource_handle_t, + /* + * The following blocks are write once (WO) and valid for the life + * of the SRS + */ + srs_mcip: *mut c_void, /* back ptr to mac client */ + // mac_client_impl_s* + srs_flent: *mut c_void, /* back ptr to flent */ + srs_ring: *mut c_void, /* Ring Descriptor */ + + // mac_ring_t* + // NOTE: srs_ring->mr_srs->sr_lower_proc is called into if + // we have hardware classification (mr->mr_classify_type == + // MAC_HW_CLASSIFIER). Might be worth keeping in mind. + // otherwise go through mac_rx_flow. 
+ + /* Teardown, disable control ops */ + srs_client_cv: kcondvar_t, /* Client wait for the control op */ + + srs_worker: *mut kthread_t, /* WO, worker thread */ + srs_poll_thr: *mut kthread_t, /* WO, poll thread */ + + srs_ind: c_uint, /* Round Robin indx for picking up SR */ + srs_worker_cpuid: processorid_t, /* processor to bind to */ + srs_worker_cpuid_save: processorid_t, /* saved cpuid during offline */ + srs_poll_cpuid: processorid_t, /* processor to bind to */ + srs_poll_cpuid_save: processorid_t, /* saved cpuid during offline */ + srs_fanout_state: c_uint, + srs_cpu: mac_cpus_t, + + pub srs_rx: mac_srs_rx_t, + srs_tx: mac_srs_tx_t, + srs_ksp: *mut c_void, //kstat_t +} + +/* Transmit side Soft Ring Set */ +#[repr(C)] +pub struct mac_srs_tx_t { + /* Members for Tx size processing */ + st_mode: u32, + st_func: *mut c_void, //mac_tx_func_t + st_arg1: *mut c_void, + st_arg2: *mut c_void, + st_group: *mut c_void, /* TX group for share */ + // mac_group_t + st_woken_up: boolean_t, + + /* + * st_max_q_cnt is the queue depth threshold to limit + * outstanding packets on the Tx SRS. Once the limit + * is reached, Tx SRS will drop packets until the + * limit goes below the threshold. + */ + st_max_q_cnt: u32, /* max. outstanding packets */ + /* + * st_hiwat is used Tx serializer and bandwidth mode. + * This is the queue depth threshold upto which + * packets will get buffered with no flow-control + * back pressure applied to the caller. Once this + * threshold is reached, back pressure will be + * applied to the caller of mac_tx() (mac_tx() starts + * returning a cookie to indicate a blocked SRS). + * st_hiwat should always be lesser than or equal to + * st_max_q_cnt. 
+ */ + st_hiwat: u32, /* mblk cnt to apply flow control */ + st_lowat: u32, /* mblk cnt to relieve flow control */ + st_hiwat_cnt: u32, /* times blocked for Tx descs */ + st_stat: mac_tx_stats_t, + st_capab_aggr: mac_capab_aggr_t, + /* + * st_soft_rings is used as an array to store aggr Tx soft + * rings. When aggr_find_tx_ring() returns a pseudo ring, + * the associated soft ring has to be found. st_soft_rings + * array stores the soft ring associated with a pseudo Tx + * ring and it can be accessed using the pseudo ring + * index (mr_index). Note that the ring index is unique + * for each ring in a group. + */ + st_soft_rings: *mut *mut mac_soft_ring_t, +} + +/* Receive side Soft Ring Set */ +#[repr(C)] +pub struct mac_srs_rx_t { + /* + * Upcall Function for fanout, Rx processing etc. Perhaps + * the same 3 members below can be used for Tx + * processing, but looking around, mac_rx_func_t has + * proliferated too much into various files at different + * places. I am leaving the consolidation battle for + * another day. + */ + pub sr_func: mac_rx_fn, /* srs_lock */ + pub sr_arg1: *mut c_void, /* srs_lock */ + pub sr_arg2: mac_resource_handle_t, /* srs_lock */ + // mac_resource_handle_t + sr_lower_proc: mac_rx_fn, /* Atomically changed */ + // should be subflow srs + sr_poll_pkt_cnt: u32, + sr_poll_thres: u32, + + /* mblk cnt to apply flow control */ + sr_hiwat: u32, + /* mblk cnt to relieve flow control */ + sr_lowat: u32, + sr_stat: mac_rx_stats_t, + + /* Times polling was enabled */ + sr_poll_on: u32, + /* Times polling was enabled by worker thread */ + sr_worker_poll_on: u32, + /* Times polling was disabled */ + sr_poll_off: u32, + /* Poll thread signalled count */ + sr_poll_thr_sig: u32, + /* Poll thread busy */ + sr_poll_thr_busy: u32, + /* SRS drains, stays in poll mode but doesn't poll */ + sr_poll_drain_no_poll: u32, + /* + * SRS has nothing to do and no packets in H/W but + * there is a backlog in softrings. 
SRS stays in + * poll mode but doesn't do polling. + */ + sr_poll_no_poll: u32, + /* Active polling restarted */ + sr_below_hiwat: u32, + /* Found packets in last poll so try and poll again */ + sr_poll_again: u32, + /* + * Packets in queue but poll thread not allowed to process so + * signal the worker thread. + */ + sr_poll_sig_worker: u32, + /* + * Poll thread has nothing to do and H/W has nothing so + * reenable the interrupts. + */ + sr_poll_intr_enable: u32, + /* + * Poll thread has nothing to do and worker thread was already + * running so it can decide to reenable interrupt or poll again. + */ + sr_poll_goto_sleep: u32, + /* Worker thread goes back to draining the queue */ + sr_drain_again: u32, + /* More Packets in queue so signal the poll thread to drain */ + sr_drain_poll_sig: u32, + /* More Packets in queue so signal the worker thread to drain */ + sr_drain_worker_sig: u32, + /* Poll thread is already running so worker has nothing to do */ + sr_drain_poll_running: u32, + /* We have packets already queued so keep polling */ + sr_drain_keep_polling: u32, + /* Drain is done and interrupts are reenabled */ + sr_drain_finish_intr: u32, + /* Polling thread needs to schedule worker wakeup */ + sr_poll_worker_wakeup: u32, +} + +#[repr(C)] +pub struct mac_rx_stats_t { + mrs_lclbytes: u64, + mrs_lclcnt: u64, + mrs_pollcnt: u64, + mrs_pollbytes: u64, + mrs_intrcnt: u64, + mrs_intrbytes: u64, + mrs_sdrops: u64, + mrs_chaincntundr10: u64, + mrs_chaincnt10to50: u64, + mrs_chaincntover50: u64, + mrs_ierrors: u64, +} + +#[repr(C)] +pub struct mac_tx_stats_t { + mts_obytes: u64, + mts_opackets: u64, + mts_oerrors: u64, + /* + * Number of times the srs gets blocked due to lack of Tx + * desc is noted down. Corresponding wakeup from driver + * to unblock is also noted down. They should match in a + * correctly working setup. If there is less unblocks + * than blocks, then Tx side waits forever for a wakeup + * from below. The following protected by srs_lock. 
+ */ + mts_blockcnt: u64, /* times blocked for Tx descs */ + mts_unblockcnt: u64, /* unblock calls from driver */ + mts_sdrops: u64, +} + +#[repr(C)] +pub struct mac_capab_aggr_t { + mca_rename_fn: *mut c_void, + mca_unicst: *mut c_void, + mca_find_tx_ring_fn: *mut c_void, + mca_arg: *mut c_void, +} + +pub const FE_QUIESCE: u32 = 0x01; /* Quiesce the flow */ +pub const FE_WAITER: u32 = 0x02; /* Flow has a waiter */ +pub const FE_FLOW_TAB: u32 = 0x04; /* Flow is in the flow tab list */ +pub const FE_G_FLOW_HASH: u32 = 0x08; /* Flow is in the global flow hash */ +pub const FE_INCIPIENT: u32 = 0x10; /* Being setup */ +pub const FE_CONDEMNED: u32 = 0x20; /* Being deleted */ +pub const FE_UF_NO_DATAPATH: u32 = 0x40; /* No datapath setup for User flow */ +pub const FE_MC_NO_DATAPATH: u32 = 0x80; /* No datapath setup for mac client */ diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 503fce8e..1fe55db5 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -18,17 +18,25 @@ use crate::dls::DlsStream; use crate::dls::LinkId; use crate::ioctl::IoctlEnvelope; use crate::mac; +use crate::mac::ChecksumOffloadCapabs; +use crate::mac::MacEmul; +use crate::mac::MacFlow; use crate::mac::MacHandle; -use crate::mac::MacPromiscHandle; use crate::mac::MacTxFlags; +use crate::mac::OffloadInfo; +use crate::mac::TcpLsoFlags; +use crate::mac::lso_basic_tcp_ipv4_t; +use crate::mac::lso_basic_tcp_ipv6_t; +use crate::mac::mac_capab_cso_t; +use crate::mac::mac_capab_lso_t; use crate::mac::mac_getinfo; +use crate::mac::mac_hw_emul; use crate::mac::mac_private_minor; use crate::route::Route; use crate::route::RouteCache; use crate::route::RouteKey; use crate::secpolicy; use crate::stats::XdeStats; -use crate::sys; use crate::warn; use alloc::borrow::ToOwned; use alloc::boxed::Box; @@ -43,8 +51,17 @@ use core::ptr; use core::ptr::addr_of; use core::ptr::addr_of_mut; use core::time::Duration; +use illumos_sys_hdrs::mac::MacEtherOffloadFlags; +use illumos_sys_hdrs::mac::MacTunType; +use 
illumos_sys_hdrs::mac::MblkOffloadFlags; +use illumos_sys_hdrs::mac::mac_ether_offload_info_t; use illumos_sys_hdrs::*; +use ingot::ethernet::Ethertype; +use ingot::geneve::Geneve; use ingot::geneve::GeneveRef; +use ingot::ip::IpProtocol; +use ingot::types::HeaderLen; +use ingot::udp::Udp; use opte::ExecCtx; use opte::api::ClearXdeUnderlayReq; use opte::api::CmdOk; @@ -58,6 +75,7 @@ use opte::api::XDE_IOC_OPTE_CMD; use opte::d_error::LabelBlock; use opte::ddi::kstat::KStatNamed; use opte::ddi::kstat::KStatProvider; +use opte::ddi::mblk::AsMblk; use opte::ddi::mblk::MsgBlk; use opte::ddi::mblk::MsgBlkChain; use opte::ddi::sync::KMutex; @@ -67,14 +85,17 @@ use opte::ddi::sync::KRwLockType; use opte::ddi::time::Interval; use opte::ddi::time::Periodic; use opte::engine::NetworkImpl; +use opte::engine::ether::Ethernet; use opte::engine::ether::EthernetRef; use opte::engine::geneve::Vni; use opte::engine::headers::IpAddr; use opte::engine::ioctl::{self as api}; +use opte::engine::ip::v6::Ipv6; use opte::engine::ip::v6::Ipv6Addr; use opte::engine::packet::InnerFlowId; use opte::engine::packet::Packet; use opte::engine::packet::ParseError; +use opte::engine::parse::ValidUlp; use opte::engine::port::Port; use opte::engine::port::PortBuilder; use opte::engine::port::ProcessResult; @@ -109,6 +130,8 @@ use oxide_vpc::engine::nat; use oxide_vpc::engine::overlay; use oxide_vpc::engine::router; +const ETHERNET_MTU: u16 = 1500; + // Entry limits for the various flow tables. const FW_FT_LIMIT: NonZeroU32 = NonZeroU32::new(8096).unwrap(); const FT_LIMIT_ONE: NonZeroU32 = NonZeroU32::new(1).unwrap(); @@ -216,8 +239,11 @@ pub struct xde_underlay_port { /// The MAC address associated with this underlay port. pub mac: [u8; 6], - /// MAC promiscuous handle for receiving packets on the underlay link. - mph: MacPromiscHandle, + /// Handles to created MacFlow instances. + flows: Vec>, + + /// The MTU of this link. 
+ pub mtu: u32, /// DLS-level handle on a device for promiscuous registration and /// packet Tx. @@ -239,6 +265,7 @@ struct UnderlayState { // onto the underlay network u1: Arc, u2: Arc, + shared_props: OffloadInfo, } fn get_xde_state() -> &'static XdeState { @@ -306,6 +333,7 @@ pub struct XdeDev { // driver. pub u1: Arc, pub u2: Arc, + underlay_capab: OffloadInfo, // We make this a per-port cache rather than sharing between all // ports to theoretically reduce contention around route expiry @@ -745,6 +773,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { passthrough: req.passthrough, u1: underlay.u1.clone(), u2: underlay.u2.clone(), + underlay_capab: underlay.shared_props, routes: RouteCache::default(), }); drop(underlay_); @@ -766,9 +795,9 @@ fn create_xde(req: &CreateXdeReq) -> Result { mreg.m_priv_props = core::ptr::null_mut(); mreg.m_instance = c_uint::MAX; // let mac handle this mreg.m_min_sdu = 1; - mreg.m_max_sdu = 1500; // TODO hardcode + mreg.m_max_sdu = u32::from(ETHERNET_MTU); // TODO hardcode mreg.m_multicast_sdu = 0; - mreg.m_margin = sys::VLAN_TAGSZ; + mreg.m_margin = crate::sys::VLAN_TAGSZ; mreg.m_v12n = mac::MAC_VIRT_NONE as u32; unsafe { @@ -933,12 +962,12 @@ fn clear_xde_underlay() -> Result { }; for u in [u1, u2] { - // We have a chain of refs here: `MacPromiscHandle` holds a ref to + // We have a chain of refs here: `MacFlow` holds a ref to // `DldStream`. We explicitly drop them in order here to ensure // there are no outstanding refs. - // 1. Remove promisc callback. - drop(u.mph); + // 1. Remove flow callback. + drop(u.flows); // Although `xde_rx` can be called into without any running ports // via the promisc handle, illumos guarantees that this callback won't @@ -1069,7 +1098,7 @@ fn create_underlay_port( link_name: String, // This parameter is likely to be used as part of the flows work. 
_mc_name: &str, -) -> Result { +) -> Result<(xde_underlay_port, OffloadInfo), OpteError> { let link_cstr = CString::new(link_name.as_str()).unwrap(); let link_id = @@ -1084,20 +1113,35 @@ fn create_underlay_port( msg: format!("failed to grab open stream for {link_name}: {e}"), })?); - // Setup promiscuous callback to receive all packets on this link. + // Use a link flow to steer only IPv6 + Geneve traffic to our Rx + // handler. + // + // XXX The mac flow mechanism is currently not sophisticated + // enough to understand encapsulated packets. Each xde instance + // will receive all client traffic. It is up to each xde instance + // to further filter the traffic so that only packets destined for + // its client are sent upstream. The plan is to expand mac's flow + // classification to handle encapsulated packets; at which point + // we should be able to setup a distinct flow per xde instance. + let mut flow_desc = mac::MacFlowDesc::new(); + flow_desc + // .set_ipver(6) + .set_proto(IpProtocol::UDP) + .set_local_port(opte::engine::geneve::GENEVE_PORT) + .to_desc(); + + // TODO I'm able to remove these flows via flowadm while the + // driver is still attached -- that's no good. Either find a way + // to add a hold or I'll need to implement such a mechanism. // - // We specify `MAC_PROMISC_FLAGS_NO_TX_LOOP` here to skip receiving copies - // of outgoing packets we sent ourselves. - let mph = MacPromiscHandle::new( - stream.clone(), - mac::mac_client_promisc_type_t::MAC_CLIENT_PROMISC_ALL, - xde_rx, - mac::MAC_PROMISC_FLAGS_NO_TX_LOOP, - ) - .map_err(|e| OpteError::System { - errno: EFAULT, - msg: format!("mac_promisc_add failed for {link_name}: {e}"), - })?; + // There is a fe_refcnt and fe_user_refcnt, I'll have to look at + // those. + let mut fkey = flow_desc + .new_flow(format!("{link_name}_xde").as_str(), link_id) + .map_err(|e| OpteError::System { + errno: EFAULT, + msg: format!("{e}"), + })?; // Grab mac handle for underlying link, to retrieve its MAC address. 
let mh = MacHandle::open_by_link_name(&link_name).map(Arc::new).map_err( @@ -1107,12 +1151,33 @@ fn create_underlay_port( }, )?; - Ok(xde_underlay_port { - name: link_name, - mac: mh.get_mac_addr(), - mph, - stream, - }) + /* + * TODO: this - curently promisc RX is needed to get packets into the xde + * device. Maybehapps setting something like MAC_OPEN_FLAGS_MULTI_PRIMARY + * and doing a mac_unicast_add with MAC_UNICAST_PRIMARY would work?. + */ + // How does this compare to the above? + // mch.rx_set(xde_rx); + // mch.rx_bypass_disable(); + // mch.set_flow_cb(xde_rx); + + let mtu = *mh.get_valid_mtus().end(); + let cso_state = mh.get_cso_capabs(); + let lso_state = mh.get_lso_capabs(); + + fkey.set_flow_cb(xde_rx, stream.clone()); + let flows = vec![fkey]; + + Ok(( + xde_underlay_port { + name: link_name, + mac: mh.get_mac_addr(), + mtu, + flows, + stream, + }, + OffloadInfo { lso_state, cso_state, mtu }, + )) } #[unsafe(no_mangle)] @@ -1120,9 +1185,13 @@ unsafe fn init_underlay_ingress_handlers( u1_name: String, u2_name: String, ) -> Result { - let u1 = Arc::new(create_underlay_port(u1_name, "xdeu0")?); - let u2 = Arc::new(create_underlay_port(u2_name, "xdeu1")?); - Ok(UnderlayState { u1, u2 }) + let (u1, i1) = create_underlay_port(u1_name, "xdeu0")?; + let (u2, i2) = create_underlay_port(u2_name, "xdeu1")?; + Ok(UnderlayState { + u1: u1.into(), + u2: u2.into(), + shared_props: i1.mutual_capabs(&i2), + }) } #[unsafe(no_mangle)] @@ -1478,13 +1547,29 @@ fn guest_loopback( match dest_dev.port.process(In, parsed_pkt) { Ok(ProcessResult::Modified(emit_spec)) => { let pkt = emit_spec.apply(pkt); - unsafe { - mac::mac_rx( - dest_dev.mh, - ptr::null_mut(), - pkt.unwrap_mblk().as_ptr(), - ) + + // Having advertised offloads to our guest, looped back + // packets are liable to have zero-checksums. Fill these + // if necessary. 
+ let pkt = if pkt + .offload_flags() + .intersects(MblkOffloadFlags::HCK_TX_FLAGS) + { + mac_hw_emul(pkt, MacEmul::HWCKSUM_EMUL) + .and_then(|v| v.unwrap_mblk()) + } else { + Some(pkt.unwrap_mblk()) }; + + if let Some(pkt) = pkt { + unsafe { + mac::mac_rx( + dest_dev.mh, + ptr::null_mut(), + pkt.as_ptr(), + ) + }; + } } Ok(ProcessResult::Drop { reason }) => { @@ -1582,6 +1667,7 @@ unsafe extern "C" fn xde_mc_tx( unsafe fn xde_mc_tx_one(src_dev: &XdeDev, mut pkt: MsgBlk) -> *mut mblk_t { let parser = src_dev.port.network().parser(); let mblk_addr = pkt.mblk_addr(); + let offload_flags = pkt.offload_flags(); let parsed_pkt = match Packet::parse_outbound(pkt.iter_mut(), parser) { Ok(pkt) => pkt, Err(e) => { @@ -1600,6 +1686,53 @@ unsafe fn xde_mc_tx_one(src_dev: &XdeDev, mut pkt: MsgBlk) -> *mut mblk_t { return ptr::null_mut(); } }; + let old_len = parsed_pkt.len(); + + let meta = parsed_pkt.meta(); + let Ok(non_eth_payl_bytes) = + u32::try_from((&meta.inner_l3, &meta.inner_ulp).packet_length()) + else { + opte::engine::dbg!("sum of packet L3/L4 exceeds u32::MAX"); + return ptr::null_mut(); + }; + + let (l4_flag, l4_ty) = match &meta.inner_ulp { + Some(ValidUlp::Tcp(_)) => { + (MacEtherOffloadFlags::L4INFO_SET, IpProtocol::TCP) + } + Some(ValidUlp::Udp(_)) => { + (MacEtherOffloadFlags::L4INFO_SET, IpProtocol::UDP) + } + _ => (MacEtherOffloadFlags::empty(), IpProtocol(0)), + }; + + // If L3 headers are too long to represent in the MEOI API, then + // drop the packet (e.g., >u16::MAX on v6 extensions). 
+ let Ok(meoi_l3hlen) = u16::try_from(meta.inner_l3.packet_length()) else { + opte::engine::dbg!("packet L3 exceeds u16::MAX"); + return ptr::null_mut(); + }; + + let Ok(meoi_len) = u32::try_from(old_len) else { + opte::engine::dbg!("packet exceeds u32::MAX"); + return ptr::null_mut(); + }; + + let ulp_meoi = mac_ether_offload_info_t { + meoi_flags: MacEtherOffloadFlags::L2INFO_SET + | MacEtherOffloadFlags::L3INFO_SET + | l4_flag, + meoi_len, + meoi_l2hlen: u8::try_from(meta.inner_eth.packet_length()) + .expect("L2 should never exceed ~22B (QinQ)"), + meoi_l3proto: meta.inner_eth.ethertype().0, + meoi_l3hlen, + meoi_l4proto: l4_ty.0, + meoi_l4hlen: u8::try_from(meta.inner_ulp.packet_length()) + .expect("L4 should never exceed 60B (max TCP options)"), + + ..Default::default() + }; // Choose u1 as a starting point. This may be changed in the next_hop // function when we are actually able to determine what interface should be @@ -1652,8 +1785,10 @@ unsafe fn xde_mc_tx_one(src_dev: &XdeDev, mut pkt: MsgBlk) -> *mut mblk_t { } }; + let mtu_unrestricted = emit_spec.mtu_unrestricted(); let l4_hash = emit_spec.l4_hash(); - let out_pkt = emit_spec.apply(pkt); + let mut out_pkt = emit_spec.apply(pkt); + let new_len = out_pkt.byte_len(); if ip6_src == ip6_dst { let devs = xde_devs().read(); @@ -1661,6 +1796,64 @@ unsafe fn xde_mc_tx_one(src_dev: &XdeDev, mut pkt: MsgBlk) -> *mut mblk_t { return ptr::null_mut(); } + let Ok(encap_len) = u32::try_from(new_len.saturating_sub(old_len)) + else { + opte::engine::err!( + "tried to push encap_len greater than u32::MAX" + ); + return ptr::null_mut(); + }; + + // Boost MSS to use full jumbo frames if we know our path + // can be served purely on internal links. 
+ // Recall that SDU does not include L2 size, hence 'non_eth_payl' + let mut flags = offload_flags; + let inner_mtu = if mtu_unrestricted { + src_dev.underlay_capab.mtu - encap_len + } else { + u32::from(ETHERNET_MTU) + }; + let mss = inner_mtu - non_eth_payl_bytes; + + // As underlay devices may need to emulate tunnelled LSO, then we + // need to strip the flag to prevent a drop, in cases where we'd + // ask to split a packet back into... 1 segment. + // Hardware tends to handle this without issue. + if meoi_len.saturating_sub( + u32::try_from(Ethernet::MINIMUM_LENGTH) + .expect("14B < u32::MAX"), + ) <= inner_mtu + { + flags.remove(MblkOffloadFlags::HW_LSO); + } + + out_pkt.request_offload(flags.shift_in(), mss); + + let tun_meoi = mac_ether_offload_info_t { + meoi_flags: MacEtherOffloadFlags::L2INFO_SET + | MacEtherOffloadFlags::L3INFO_SET + | MacEtherOffloadFlags::L4INFO_SET + | MacEtherOffloadFlags::TUNINFO_SET, + meoi_l2hlen: u8::try_from(Ethernet::MINIMUM_LENGTH) + .expect("14B < u8::MAX"), + meoi_l3proto: Ethertype::IPV6.0, + meoi_l3hlen: u16::try_from(Ipv6::MINIMUM_LENGTH) + .expect("40B < u16::MAX"), + meoi_l4proto: IpProtocol::UDP.0, + meoi_l4hlen: u8::try_from(Udp::MINIMUM_LENGTH) + .expect("8B < u8::MAX"), + meoi_tuntype: MacTunType::GENEVE, + meoi_tunhlen: u16::try_from(Geneve::MINIMUM_LENGTH) + .expect("8B < u16::MAX"), + // meoi_len will be recomputed by consumers. + meoi_len: u32::try_from(new_len).unwrap_or(u32::MAX), + }; + + if let Err(e) = out_pkt.fill_parse_info(&tun_meoi, Some(&ulp_meoi)) + { + opte::engine::err!("failed to set offload info: {}", e); + } + // Currently the overlay layer leaves the outer frame // destination and source zero'd. Ask IRE for the route // associated with the underlay destination. 
Then ask NCE @@ -1762,11 +1955,63 @@ where #[unsafe(no_mangle)] unsafe extern "C" fn xde_mc_getcapab( - _arg: *mut c_void, - _cap: mac::mac_capab_t, - _capb_data: *mut c_void, + arg: *mut c_void, + cap: mac::mac_capab_t, + capb_data: *mut c_void, ) -> boolean_t { - boolean_t::B_FALSE + let dev = arg as *mut XdeDev; + + let shared_underlay_caps = unsafe { (*dev).underlay_capab }; + + // XDE's approach to the capabilities we advertise is to always say + // that we support LSO/CSO, using tunnelled LSO/CSO if the underlay + // supports it or having MAC emulate offloads when it does not. + // We know in actuality what the intersection of our two underlay ports' + // capabilities is, which we use to limit the `lso_max` when tunnelled + // LSO hardware support over Geneve is present. + match cap { + // TODO: work out a safer interface for this. + mac::mac_capab_t::MAC_CAPAB_HCKSUM => { + let capab = capb_data as *mut mac_capab_cso_t; + + unsafe { + (*capab).cso_flags = ChecksumOffloadCapabs::NON_TUN_CAPABS + .difference(ChecksumOffloadCapabs::INET_PARTIAL); + } + + boolean_t::B_TRUE + } + mac::mac_capab_t::MAC_CAPAB_LSO => { + let capab = capb_data as *mut mac_capab_lso_t; + let upstream_lso = shared_underlay_caps.upstream_lso(); + + // Geneve TSO support in the underlay has been converted to basic TSO + // in `upstream_lso`, use the values there if possible. 
+ let (v4_lso_max, v6_lso_max) = if upstream_lso + .lso_flags + .contains(TcpLsoFlags::BASIC_IPV4 | TcpLsoFlags::BASIC_IPV6) + { + ( + upstream_lso.lso_basic_tcp_ipv4.lso_max, + upstream_lso.lso_basic_tcp_ipv6.lso_max, + ) + } else { + (u32::from(u16::MAX), u32::from(u16::MAX)) + }; + + unsafe { + (*capab).lso_flags = + TcpLsoFlags::BASIC_IPV4 | TcpLsoFlags::BASIC_IPV6; + (*capab).lso_basic_tcp_ipv4 = + lso_basic_tcp_ipv4_t { lso_max: v4_lso_max }; + (*capab).lso_basic_tcp_ipv6 = + lso_basic_tcp_ipv6_t { lso_max: v6_lso_max }; + } + + boolean_t::B_TRUE + } + _ => boolean_t::B_FALSE, + } } #[unsafe(no_mangle)] @@ -1854,7 +2099,7 @@ unsafe extern "C" fn xde_rx( // Safety: This arg comes from `Arc::from_ptr()` on the `MacClientHandle` // corresponding to the underlay port we're receiving on. Being - // here in the callback means the `MacPromiscHandle` hasn't been + // here in the callback means the `MacFlow` hasn't been // dropped yet and thus our `MacClientHandle` is also still valid. let stream: Arc = unsafe { let mch_ptr = arg as *const DlsStream; @@ -1931,6 +2176,10 @@ unsafe fn xde_rx_one( return; }; + let is_tcp = matches!(meta.inner_ulp, ValidUlp::Tcp(_)); + let mss_estimate = usize::from(ETHERNET_MTU) + - (&meta.inner_l3, &meta.inner_ulp).packet_length(); + // We are in passthrough mode, skip OPTE processing. if dev.passthrough { drop(parsed_pkt); @@ -1948,11 +2197,28 @@ unsafe fn xde_rx_one( Ok(ProcessResult::Bypass) => unsafe { mac::mac_rx(dev.mh, mrh, pkt.unwrap_mblk().as_ptr()); }, - Ok(ProcessResult::Modified(emit_spec)) => unsafe { - let npkt = emit_spec.apply(pkt); + Ok(ProcessResult::Modified(emit_spec)) => { + let mut npkt = emit_spec.apply(pkt); + + // Due to possible pseudo-GRO, we need to inform mac/viona on how + // it can split up this packet, if the guest cannot receive it + // (e.g., no GRO/large frame support). + // HW_LSO will cause viona to treat this packet as though it were + // a locally delivered segment making use of LSO. 
+ if is_tcp + && npkt.len() + > usize::from(ETHERNET_MTU) + Ethernet::MINIMUM_LENGTH + { + npkt.request_offload( + MblkOffloadFlags::HW_LSO, + mss_estimate as u32, + ); + } - mac::mac_rx(dev.mh, mrh, npkt.unwrap_mblk().as_ptr()); - }, + unsafe { + mac::mac_rx(dev.mh, mrh, npkt.unwrap_mblk().as_ptr()); + } + } Ok(ProcessResult::Hairpin(hppkt)) => { stream.tx_drop_on_no_desc(hppkt, 0, MacTxFlags::empty()); }