diff --git a/Cargo.lock b/Cargo.lock index 38fb857826d..6a74b4267fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1053,6 +1053,20 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pci" +version = "0.1.0" +dependencies = [ + "byteorder", + "libc", + "log", + "serde", + "thiserror 2.0.12", + "vm-allocator", + "vm-device", + "vm-memory", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1650,12 +1664,12 @@ checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" [[package]] name = "vm-allocator" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e4ce718bd4e8d74b1747363e27f715a6b1bd6971597cb21425dadbf4e712241" +checksum = "3c2fce39487bd03b5b0ab176f584682e9eaab7875254bafd3d188c69c85fce6e" dependencies = [ "libc", - "thiserror 1.0.69", + "thiserror 2.0.12", ] [[package]] @@ -1717,6 +1731,7 @@ dependencies = [ "log-instrument", "memfd", "micro_http", + "pci", "proptest", "semver", "serde", @@ -1726,6 +1741,7 @@ dependencies = [ "timerfd", "userfaultfd", "utils", + "uuid", "vhost", "vm-allocator", "vm-device", diff --git a/resources/chroot.sh b/resources/chroot.sh index e7177d7e2ca..82061700b4a 100755 --- a/resources/chroot.sh +++ b/resources/chroot.sh @@ -11,7 +11,7 @@ PS4='+\t ' cp -ruv $rootfs/* / -packages="udev systemd-sysv openssh-server iproute2 curl socat python3-minimal iperf3 iputils-ping fio kmod tmux hwloc-nox vim-tiny trace-cmd linuxptp strace" +packages="udev systemd-sysv openssh-server iproute2 curl socat python3-minimal iperf3 iputils-ping fio kmod tmux hwloc-nox vim-tiny trace-cmd linuxptp strace pciutils" # msr-tools is only supported on x86-64. 
arch=$(uname -m) diff --git a/resources/guest_configs/pcie.config b/resources/guest_configs/pcie.config new file mode 100644 index 00000000000..b7262f7ae73 --- /dev/null +++ b/resources/guest_configs/pcie.config @@ -0,0 +1,8 @@ +CONFIG_BLK_MQ_PCI=y +CONFIG_PCI=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_MSI=y +CONFIG_PCIEPORTBUS=y +CONFIG_VIRTIO_PCI=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y diff --git a/resources/overlay/etc/systemd/system/fcnet.service b/resources/overlay/etc/systemd/system/fcnet.service index 26d3af1dc20..ace1c8322e1 100644 --- a/resources/overlay/etc/systemd/system/fcnet.service +++ b/resources/overlay/etc/systemd/system/fcnet.service @@ -1,5 +1,6 @@ [Service] Type=oneshot +ExecStartPre=/usr/bin/udevadm settle ExecStart=/usr/local/bin/fcnet-setup.sh [Install] WantedBy=sshd.service diff --git a/resources/rebuild.sh b/resources/rebuild.sh index 56afd1bdbac..dabffa8c2ae 100755 --- a/resources/rebuild.sh +++ b/resources/rebuild.sh @@ -223,15 +223,16 @@ function build_al_kernels { clone_amazon_linux_repo CI_CONFIG="$PWD/guest_configs/ci.config" + PCIE_CONFIG="$PWD/guest_configs/pcie.config" if [[ "$KERNEL_VERSION" == @(all|5.10) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config "$CI_CONFIG" "$PCIE_CONFIG" fi if [[ $ARCH == "x86_64" && "$KERNEL_VERSION" == @(all|5.10-no-acpi) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10-no-acpi.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10-no-acpi.config "$CI_CONFIG" "$PCIE_CONFIG" fi if [[ "$KERNEL_VERSION" == @(all|6.1) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config "$CI_CONFIG" "$PCIE_CONFIG" fi # Build debug kernels @@ -240,11 +241,11 @@ function build_al_kernels { OUTPUT_DIR=$OUTPUT_DIR/debug mkdir -pv 
$OUTPUT_DIR if [[ "$KERNEL_VERSION" == @(all|5.10) ]]; then - build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" + build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config" "$CI_CONFIG" "$PCIE_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" vmlinux_split_debuginfo $OUTPUT_DIR/vmlinux-5.10.* fi if [[ "$KERNEL_VERSION" == @(all|6.1) ]]; then - build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" + build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config" "$CI_CONFIG" "$PCIE_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" vmlinux_split_debuginfo $OUTPUT_DIR/vmlinux-6.1.* fi } diff --git a/src/acpi-tables/src/lib.rs b/src/acpi-tables/src/lib.rs index 321328047ed..d3b7df0791e 100644 --- a/src/acpi-tables/src/lib.rs +++ b/src/acpi-tables/src/lib.rs @@ -10,6 +10,7 @@ pub mod aml; pub mod dsdt; pub mod fadt; pub mod madt; +pub mod mcfg; pub mod rsdp; pub mod xsdt; @@ -17,6 +18,7 @@ pub use aml::Aml; pub use dsdt::Dsdt; pub use fadt::Fadt; pub use madt::Madt; +pub use mcfg::Mcfg; pub use rsdp::Rsdp; pub use xsdt::Xsdt; use zerocopy::little_endian::{U32, U64}; @@ -89,7 +91,7 @@ pub struct SdtHeader { pub oem_table_id: [u8; 8], pub oem_revision: U32, pub creator_id: [u8; 4], - pub creator_revison: U32, + pub creator_revision: U32, } impl SdtHeader { @@ -110,7 +112,7 @@ impl SdtHeader { oem_table_id, oem_revision: U32::new(oem_revision), creator_id: FC_ACPI_CREATOR_ID, - creator_revison: U32::new(FC_ACPI_CREATOR_REVISION), + creator_revision: U32::new(FC_ACPI_CREATOR_REVISION), } } } diff --git a/src/acpi-tables/src/mcfg.rs b/src/acpi-tables/src/mcfg.rs new file mode 100644 index 00000000000..a5dd8b9d227 --- /dev/null +++ b/src/acpi-tables/src/mcfg.rs @@ -0,0 +1,77 @@ +// Copyright © 2019 Intel Corporation +// Copyright © 2023 Rivos, Inc. +// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +use std::mem::size_of; + +use vm_memory::{Bytes, GuestAddress, GuestMemory}; +use zerocopy::{Immutable, IntoBytes}; + +use crate::{Result, Sdt, SdtHeader, checksum}; + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Default, Debug, IntoBytes, Clone, Copy, Immutable)] +struct PciRangeEntry { + pub base_address: u64, + pub segment: u16, + pub start: u8, + pub end: u8, + _reserved: u32, +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Debug, Default, IntoBytes, Immutable)] +pub struct Mcfg { + header: SdtHeader, + _reserved: u64, + pci_range_entry: PciRangeEntry, +} + +impl Mcfg { + pub fn new( + oem_id: [u8; 6], + oem_table_id: [u8; 8], + oem_revision: u32, + pci_mmio_config_addr: u64, + ) -> Self { + let header = SdtHeader::new( + *b"MCFG", + size_of::().try_into().unwrap(), + 1, + oem_id, + oem_table_id, + oem_revision, + ); + + let mut mcfg = Mcfg { + header, + pci_range_entry: PciRangeEntry { + base_address: pci_mmio_config_addr, + segment: 0, + start: 0, + end: 0, + ..Default::default() + }, + ..Default::default() + }; + + mcfg.header.checksum = checksum(&[mcfg.as_bytes()]); + + mcfg + } +} + +impl Sdt for Mcfg { + fn len(&self) -> usize { + self.as_bytes().len() + } + + fn write_to_guest(&mut self, mem: &M, address: GuestAddress) -> Result<()> { + mem.write_slice(self.as_bytes(), address)?; + Ok(()) + } +} diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index 173ef298265..f597a5f7db9 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -143,6 +143,7 @@ pub(crate) fn run_with_api( instance_info: InstanceInfo, process_time_reporter: ProcessTimeReporter, boot_timer_enabled: bool, + pci_enabled: bool, api_payload_limit: usize, mmds_size_limit: usize, metadata_json: Option<&str>, @@ -212,6 +213,7 @@ pub(crate) fn run_with_api( json, instance_info, boot_timer_enabled, + pci_enabled, 
mmds_size_limit, metadata_json, ) @@ -224,6 +226,7 @@ pub(crate) fn run_with_api( &to_api, &api_event_fd, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 6b01f776729..3e6ad35d6a9 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -260,6 +260,11 @@ fn main_exec() -> Result<(), MainError> { Argument::new("mmds-size-limit") .takes_value(true) .help("Mmds data store limit, in bytes."), + ) + .arg( + Argument::new("enable-pci") + .takes_value(false) + .help("Enables PCIe support."), ); arg_parser.parse_from_cmdline()?; @@ -369,6 +374,7 @@ fn main_exec() -> Result<(), MainError> { .map(|x| x.expect("Unable to open or read from the mmds content file")); let boot_timer_enabled = arguments.flag_present("boot-timer"); + let pci_enabled = arguments.flag_present("enable-pci"); let api_enabled = !arguments.flag_present("no-api"); let api_payload_limit = arg_parser .arguments() @@ -422,6 +428,7 @@ fn main_exec() -> Result<(), MainError> { instance_info, process_time_reporter, boot_timer_enabled, + pci_enabled, api_payload_limit, mmds_size_limit, metadata_json.as_deref(), @@ -437,6 +444,7 @@ fn main_exec() -> Result<(), MainError> { vmm_config_json, instance_info, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json.as_deref(), ) @@ -449,7 +457,7 @@ fn main_exec() -> Result<(), MainError> { /// the default the jailer would set). /// /// We do this resizing because the kernel default is 64, with a reallocation happening whenever -/// the tabel fills up. This was happening for some larger microVMs, and reallocating the +/// the table fills up. This was happening for some larger microVMs, and reallocating the /// fdtable while a lot of file descriptors are active (due to being eventfds/timerfds registered /// to epoll) incurs a penalty of 30ms-70ms on the snapshot restore path. 
fn resize_fdtable() -> Result<(), ResizeFdTableError> { @@ -554,12 +562,14 @@ pub enum BuildFromJsonError { } // Configure and start a microVM as described by the command-line JSON. +#[allow(clippy::too_many_arguments)] fn build_microvm_from_json( seccomp_filters: &BpfThreadMap, event_manager: &mut EventManager, config_json: String, instance_info: InstanceInfo, boot_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(VmResources, Arc>), BuildFromJsonError> { @@ -567,6 +577,7 @@ fn build_microvm_from_json( VmResources::from_json(&config_json, &instance_info, mmds_size_limit, metadata_json) .map_err(BuildFromJsonError::ParseFromJson)?; vm_resources.boot_timer = boot_timer_enabled; + vm_resources.pci_enabled = pci_enabled; let vmm = vmm::builder::build_and_boot_microvm( &instance_info, &vm_resources, @@ -593,6 +604,7 @@ fn run_without_api( config_json: Option, instance_info: InstanceInfo, bool_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(), RunWithoutApiError> { @@ -610,6 +622,7 @@ fn run_without_api( config_json.unwrap(), instance_info, bool_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml new file mode 100644 index 00000000000..c88cd270b23 --- /dev/null +++ b/src/pci/Cargo.toml @@ -0,0 +1,25 @@ +[package] +authors = ["Samuel Ortiz "] +edition = "2021" +name = "pci" +version = "0.1.0" +license = "Apache-2.0 AND BSD-3-Clause" + +[lib] +bench = false + +[features] +default = [] + +[dependencies] +byteorder = "1.5.0" +libc = "0.2.172" +log = "0.4.27" +serde = { version = "1.0.219", features = ["derive"] } +thiserror = "2.0.12" +vm-allocator = "0.1.2" +vm-device = { path = "../vm-device" } +vm-memory = { version = "0.16.1", features = [ + "backend-mmap", + "backend-bitmap", +] } diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs new file mode 100644 index 00000000000..cb42b4ee9c5 --- 
/dev/null +++ b/src/pci/src/bus.rs @@ -0,0 +1,477 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::collections::HashMap; +use std::ops::DerefMut; +use std::sync::{Arc, Barrier, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use vm_device::{Bus, BusDevice, BusDeviceSync}; + +use crate::configuration::{ + PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, +}; +use crate::device::{DeviceRelocation, Error as PciDeviceError, PciDevice}; +use crate::PciBarConfiguration; + +const VENDOR_ID_INTEL: u16 = 0x8086; +const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; +const NUM_DEVICE_IDS: usize = 32; + +/// Errors for device manager. +#[derive(Debug)] +pub enum PciRootError { + /// Could not allocate device address space for the device. + AllocateDeviceAddrs(PciDeviceError), + /// Could not allocate an IRQ number. + AllocateIrq, + /// Could not add a device to the port io bus. + PioInsert(vm_device::BusError), + /// Could not add a device to the mmio bus. + MmioInsert(vm_device::BusError), + /// Could not find an available device slot on the PCI bus. + NoPciDeviceSlotAvailable, + /// Invalid PCI device identifier provided. + InvalidPciDeviceSlot(usize), + /// Valid PCI device identifier but already used. + AlreadyInUsePciDeviceSlot(usize), +} +pub type Result = std::result::Result; + +/// Emulates the PCI Root bridge device. +pub struct PciRoot { + /// Configuration space. + config: PciConfiguration, +} + +impl PciRoot { + /// Create an empty PCI root bridge. 
+ pub fn new(config: Option) -> Self { + if let Some(config) = config { + PciRoot { config } + } else { + PciRoot { + config: PciConfiguration::new( + VENDOR_ID_INTEL, + DEVICE_ID_INTEL_VIRT_PCIE_HOST, + 0, + PciClassCode::BridgeDevice, + &PciBridgeSubclass::HostBridge, + None, + PciHeaderType::Device, + 0, + 0, + None, + None, + ), + } + } + } +} + +impl BusDevice for PciRoot {} + +impl PciDevice for PciRoot { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + self.config.write_config_register(reg_idx, offset, data); + None + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.config.read_reg(reg_idx) + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn id(&self) -> Option { + None + } +} + +pub struct PciBus { + /// Devices attached to this bus. + /// Device 0 is host bridge. + devices: HashMap>>, + device_reloc: Arc, + device_ids: Vec, +} + +impl PciBus { + pub fn new(pci_root: PciRoot, device_reloc: Arc) -> Self { + let mut devices: HashMap>> = HashMap::new(); + let mut device_ids: Vec = vec![false; NUM_DEVICE_IDS]; + + devices.insert(0, Arc::new(Mutex::new(pci_root))); + device_ids[0] = true; + + PciBus { + devices, + device_reloc, + device_ids, + } + } + + pub fn register_mapping( + &self, + dev: Arc, + io_bus: &Bus, + mmio_bus: &Bus, + bars: Vec, + ) -> Result<()> { + for bar in bars { + match bar.region_type() { + PciBarRegionType::IoRegion => { + io_bus + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(PciRootError::PioInsert)?; + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + mmio_bus + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(PciRootError::MmioInsert)?; + } + } + } + Ok(()) + } + + pub fn add_device(&mut self, device_id: u32, device: Arc>) -> Result<()> { + self.devices.insert(device_id, device); + Ok(()) + } + + pub fn remove_by_device(&mut self, device: &Arc>) -> Result<()> { + 
self.devices.retain(|_, dev| !Arc::ptr_eq(dev, device)); + Ok(()) + } + + pub fn next_device_id(&mut self) -> Result { + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if !(*device_id) { + *device_id = true; + return Ok(idx as u32); + } + } + + Err(PciRootError::NoPciDeviceSlotAvailable) + } + + pub fn get_device_id(&mut self, id: usize) -> Result<()> { + if id < NUM_DEVICE_IDS { + if !self.device_ids[id] { + self.device_ids[id] = true; + Ok(()) + } else { + Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) + } + } else { + Err(PciRootError::InvalidPciDeviceSlot(id)) + } + } + + pub fn put_device_id(&mut self, id: usize) -> Result<()> { + if id < NUM_DEVICE_IDS { + self.device_ids[id] = false; + Ok(()) + } else { + Err(PciRootError::InvalidPciDeviceSlot(id)) + } + } +} + +pub struct PciConfigIo { + /// Config space register. + config_address: u32, + pci_bus: Arc>, +} + +impl PciConfigIo { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigIo { + config_address: 0, + pci_bus, + } + } + + pub fn config_space_read(&self) -> u32 { + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return 0xffff_ffff; + } + + let (bus, device, function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + // Don't support multi-function devices. + if function > 0 { + return 0xffff_ffff; + } + + self.pci_bus + .as_ref() + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().read_config_register(register) + }) + } + + pub fn config_space_write(&mut self, offset: u64, data: &[u8]) -> Option> { + if offset as usize + data.len() > 4 { + return None; + } + + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return None; + } + + let (bus, device, _function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + // Only support one bus. 
+ if bus != 0 { + return None; + } + + let pci_bus = self.pci_bus.as_ref().lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. + if let Some(params) = device.detect_bar_reprogramming(register, data) { + if let Err(e) = pci_bus.device_reloc.move_bar( + params.old_base, + params.new_base, + params.len, + device.deref_mut(), + params.region_type, + ) { + error!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + e, params.old_base, params.new_base, params.len + ); + } + } + + // Update the register value + device.write_config_register(register, offset, data) + } else { + None + } + } + + fn set_config_address(&mut self, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + let (mask, value): (u32, u32) = match data.len() { + 1 => ( + 0x0000_00ff << (offset * 8), + u32::from(data[0]) << (offset * 8), + ), + 2 => ( + 0x0000_ffff << (offset * 16), + ((u32::from(data[1]) << 8) | u32::from(data[0])) << (offset * 16), + ), + 4 => (0xffff_ffff, LittleEndian::read_u32(data)), + _ => return, + }; + self.config_address = (self.config_address & !mask) | value; + } +} + +impl BusDevice for PciConfigIo { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // `offset` is relative to 0xcf8 + let value = match offset { + 0..=3 => self.config_address, + 4..=7 => self.config_space_read(), + _ => 0xffff_ffff, + }; + + // Only allow reads to the register boundary. 
+ let start = offset as usize % 4; + let end = start + data.len(); + if end <= 4 { + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } else { + for d in data { + *d = 0xff; + } + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + // `offset` is relative to 0xcf8 + match offset { + o @ 0..=3 => { + self.set_config_address(o, data); + None + } + o @ 4..=7 => self.config_space_write(o - 4, data), + _ => None, + } + } +} + +/// Emulates PCI memory-mapped configuration access mechanism. +pub struct PciConfigMmio { + pci_bus: Arc>, +} + +impl PciConfigMmio { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigMmio { pci_bus } + } + + fn config_space_read(&self, config_address: u32) -> u32 { + let (bus, device, _function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + self.pci_bus + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().read_config_register(register) + }) + } + + fn config_space_write(&mut self, config_address: u32, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + + let (bus, device, _function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. + if bus != 0 { + return; + } + + let pci_bus = self.pci_bus.lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. 
+ if let Some(params) = device.detect_bar_reprogramming(register, data) { + if let Err(e) = pci_bus.device_reloc.move_bar( + params.old_base, + params.new_base, + params.len, + device.deref_mut(), + params.region_type, + ) { + error!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + e, params.old_base, params.new_base, params.len + ); + } + } + + // Update the register value + device.write_config_register(register, offset, data); + } + } +} + +impl BusDevice for PciConfigMmio { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // Only allow reads to the register boundary. + let start = offset as usize % 4; + let end = start + data.len(); + if end > 4 || offset > u64::from(u32::MAX) { + for d in data { + *d = 0xff; + } + return; + } + + let value = self.config_space_read(offset as u32); + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + if offset > u64::from(u32::MAX) { + return None; + } + self.config_space_write(offset as u32, offset % 4, data); + + None + } +} + +fn shift_and_mask(value: u32, offset: usize, mask: u32) -> usize { + ((value >> offset) & mask) as usize +} + +// Parse the MMIO address offset to a (bus, device, function, register) tuple. +// See section 7.2.2 PCI Express Enhanced Configuration Access Mechanism (ECAM) +// from the Pci Express Base Specification Revision 5.0 Version 1.0. 
+fn parse_mmio_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 20; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 15; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 12; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3ff; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} + +// Parse the CONFIG_ADDRESS register to a (bus, device, function, register) tuple. +fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 16; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 11; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 8; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3f; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs new file mode 100644 index 00000000000..3a53167148c --- /dev/null +++ b/src/pci/src/configuration.rs @@ -0,0 +1,1252 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::fmt::{self, Display}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_device::PciBarType; + +use crate::device::BarReprogrammingParams; +use crate::{MsixConfig, PciInterruptPin}; + +// The number of 32bit registers in the config space, 4096 bytes. +const NUM_CONFIGURATION_REGISTERS: usize = 1024; + +const STATUS_REG: usize = 1; +const STATUS_REG_CAPABILITIES_USED_MASK: u32 = 0x0010_0000; +const BAR0_REG: usize = 4; +const ROM_BAR_REG: usize = 12; +const ROM_BAR_IDX: usize = 6; +const BAR_IO_ADDR_MASK: u32 = 0xffff_fffc; +const BAR_MEM_ADDR_MASK: u32 = 0xffff_fff0; +const ROM_BAR_ADDR_MASK: u32 = 0xffff_f800; +const MSI_CAPABILITY_REGISTER_MASK: u32 = 0x0071_0000; +const MSIX_CAPABILITY_REGISTER_MASK: u32 = 0xc000_0000; +const NUM_BAR_REGS: usize = 6; +const CAPABILITY_LIST_HEAD_OFFSET: usize = 0x34; +const FIRST_CAPABILITY_OFFSET: usize = 0x40; +const CAPABILITY_MAX_OFFSET: usize = 192; + +const INTERRUPT_LINE_PIN_REG: usize = 15; + +pub const PCI_CONFIGURATION_ID: &str = "pci_configuration"; + +/// Represents the types of PCI headers allowed in the configuration registers. +#[derive(Copy, Clone)] +pub enum PciHeaderType { + Device, + Bridge, +} + +/// Classes of PCI nodes. 
+#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciClassCode { + TooOld, + MassStorage, + NetworkController, + DisplayController, + MultimediaController, + MemoryController, + BridgeDevice, + SimpleCommunicationController, + BaseSystemPeripheral, + InputDevice, + DockingStation, + Processor, + SerialBusController, + WirelessController, + IntelligentIoController, + EncryptionController, + DataAcquisitionSignalProcessing, + Other = 0xff, +} + +impl PciClassCode { + pub fn get_register_value(self) -> u8 { + self as u8 + } +} + +/// A PCI subclass. Each class in `PciClassCode` can specify a unique set of subclasses. This trait +/// is implemented by each subclass. It allows use of a trait object to generate configurations. +pub trait PciSubclass { + /// Convert this subclass to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Subclasses of the MultimediaController class. +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMultimediaSubclass { + VideoController = 0x00, + AudioController = 0x01, + TelephonyDevice = 0x02, + AudioDevice = 0x03, + Other = 0x80, +} + +impl PciSubclass for PciMultimediaSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclasses of the BridgeDevice +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciBridgeSubclass { + HostBridge = 0x00, + IsaBridge = 0x01, + EisaBridge = 0x02, + McaBridge = 0x03, + PciToPciBridge = 0x04, + PcmciaBridge = 0x05, + NuBusBridge = 0x06, + CardBusBridge = 0x07, + RacEwayBridge = 0x08, + PciToPciSemiTransparentBridge = 0x09, + InfiniBrandToPciHostBridge = 0x0a, + OtherBridgeDevice = 0x80, +} + +impl PciSubclass for PciBridgeSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclass of the SerialBus +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciSerialBusSubClass { + Firewire = 0x00, + Accessbus = 0x01, + Ssa = 0x02, + Usb = 0x03, +} + +impl PciSubclass for PciSerialBusSubClass { + fn 
get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Mass Storage Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMassStorageSubclass { + ScsiStorage = 0x00, + IdeInterface = 0x01, + FloppyController = 0x02, + IpiController = 0x03, + RaidController = 0x04, + AtaController = 0x05, + SataController = 0x06, + SerialScsiController = 0x07, + NvmController = 0x08, + MassStorage = 0x80, +} + +impl PciSubclass for PciMassStorageSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Network Controller Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciNetworkControllerSubclass { + EthernetController = 0x00, + TokenRingController = 0x01, + FddiController = 0x02, + AtmController = 0x03, + IsdnController = 0x04, + WorldFipController = 0x05, + PicmgController = 0x06, + InfinibandController = 0x07, + FabricController = 0x08, + NetworkController = 0x80, +} + +impl PciSubclass for PciNetworkControllerSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Trait to define a PCI class programming interface +/// +/// Each combination of `PciClassCode` and `PciSubclass` can specify a +/// set of register-level programming interfaces. +/// This trait is implemented by each programming interface. +/// It allows use of a trait object to generate configurations. +pub trait PciProgrammingInterface { + /// Convert this programming interface to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Types of PCI capabilities. 
+#[derive(PartialEq, Eq, Copy, Clone)] +#[allow(dead_code)] +#[allow(non_camel_case_types)] +#[repr(u8)] +pub enum PciCapabilityId { + ListId = 0, + PowerManagement = 0x01, + AcceleratedGraphicsPort = 0x02, + VitalProductData = 0x03, + SlotIdentification = 0x04, + MessageSignalledInterrupts = 0x05, + CompactPciHotSwap = 0x06, + PciX = 0x07, + HyperTransport = 0x08, + VendorSpecific = 0x09, + Debugport = 0x0A, + CompactPciCentralResourceControl = 0x0B, + PciStandardHotPlugController = 0x0C, + BridgeSubsystemVendorDeviceId = 0x0D, + AgpTargetPciPcibridge = 0x0E, + SecureDevice = 0x0F, + PciExpress = 0x10, + MsiX = 0x11, + SataDataIndexConf = 0x12, + PciAdvancedFeatures = 0x13, + PciEnhancedAllocation = 0x14, +} + +impl From for PciCapabilityId { + fn from(c: u8) -> Self { + match c { + 0 => PciCapabilityId::ListId, + 0x01 => PciCapabilityId::PowerManagement, + 0x02 => PciCapabilityId::AcceleratedGraphicsPort, + 0x03 => PciCapabilityId::VitalProductData, + 0x04 => PciCapabilityId::SlotIdentification, + 0x05 => PciCapabilityId::MessageSignalledInterrupts, + 0x06 => PciCapabilityId::CompactPciHotSwap, + 0x07 => PciCapabilityId::PciX, + 0x08 => PciCapabilityId::HyperTransport, + 0x09 => PciCapabilityId::VendorSpecific, + 0x0A => PciCapabilityId::Debugport, + 0x0B => PciCapabilityId::CompactPciCentralResourceControl, + 0x0C => PciCapabilityId::PciStandardHotPlugController, + 0x0D => PciCapabilityId::BridgeSubsystemVendorDeviceId, + 0x0E => PciCapabilityId::AgpTargetPciPcibridge, + 0x0F => PciCapabilityId::SecureDevice, + 0x10 => PciCapabilityId::PciExpress, + 0x11 => PciCapabilityId::MsiX, + 0x12 => PciCapabilityId::SataDataIndexConf, + 0x13 => PciCapabilityId::PciAdvancedFeatures, + 0x14 => PciCapabilityId::PciEnhancedAllocation, + _ => PciCapabilityId::ListId, + } + } +} + +/// Types of PCI Express capabilities. 
/// Types of PCI Express extended capabilities.
#[derive(PartialEq, Eq, Copy, Clone, Debug)]
#[allow(dead_code)]
#[repr(u16)]
pub enum PciExpressCapabilityId {
    NullCapability = 0x0000,
    AdvancedErrorReporting = 0x0001,
    VirtualChannelMultiFunctionVirtualChannelNotPresent = 0x0002,
    DeviceSerialNumber = 0x0003,
    PowerBudgeting = 0x0004,
    RootComplexLinkDeclaration = 0x0005,
    RootComplexInternalLinkControl = 0x0006,
    RootComplexEventCollectorEndpointAssociation = 0x0007,
    MultiFunctionVirtualChannel = 0x0008,
    VirtualChannelMultiFunctionVirtualChannelPresent = 0x0009,
    RootComplexRegisterBlock = 0x000a,
    VendorSpecificExtendedCapability = 0x000b,
    ConfigurationAccessCorrelation = 0x000c,
    AccessControlServices = 0x000d,
    AlternativeRoutingIdentificationInterpretation = 0x000e,
    AddressTranslationServices = 0x000f,
    SingleRootIoVirtualization = 0x0010,
    DeprecatedMultiRootIoVirtualization = 0x0011,
    Multicast = 0x0012,
    PageRequestInterface = 0x0013,
    ReservedForAmd = 0x0014,
    ResizeableBar = 0x0015,
    DynamicPowerAllocation = 0x0016,
    ThpRequester = 0x0017,
    LatencyToleranceReporting = 0x0018,
    SecondaryPciExpress = 0x0019,
    ProtocolMultiplexing = 0x001a,
    ProcessAddressSpaceId = 0x001b,
    LnRequester = 0x001c,
    DownstreamPortContainment = 0x001d,
    L1PmSubstates = 0x001e,
    PrecisionTimeMeasurement = 0x001f,
    PciExpressOverMphy = 0x0020,
    FRSQueueing = 0x0021,
    ReadinessTimeReporting = 0x0022,
    DesignatedVendorSpecificExtendedCapability = 0x0023,
    VfResizeableBar = 0x0024,
    DataLinkFeature = 0x0025,
    PhysicalLayerSixteenGts = 0x0026,
    LaneMarginingAtTheReceiver = 0x0027,
    HierarchyId = 0x0028,
    NativePcieEnclosureManagement = 0x0029,
    PhysicalLayerThirtyTwoGts = 0x002a,
    AlternateProtocol = 0x002b,
    SystemFirmwareIntermediary = 0x002c,
    ShadowFunctions = 0x002d,
    DataObjectExchange = 0x002e,
    Reserved = 0x002f,
    ExtendedCapabilitiesAbsence = 0xffff,
}

impl From<u16> for PciExpressCapabilityId {
    /// Decodes a raw extended capability ID read from PCIe extended config
    /// space. Unrecognized values map to `Reserved`.
    fn from(c: u16) -> Self {
        match c {
            0x0000 => PciExpressCapabilityId::NullCapability,
            0x0001 => PciExpressCapabilityId::AdvancedErrorReporting,
            0x0002 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelNotPresent,
            0x0003 => PciExpressCapabilityId::DeviceSerialNumber,
            0x0004 => PciExpressCapabilityId::PowerBudgeting,
            0x0005 => PciExpressCapabilityId::RootComplexLinkDeclaration,
            0x0006 => PciExpressCapabilityId::RootComplexInternalLinkControl,
            0x0007 => PciExpressCapabilityId::RootComplexEventCollectorEndpointAssociation,
            0x0008 => PciExpressCapabilityId::MultiFunctionVirtualChannel,
            0x0009 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelPresent,
            0x000a => PciExpressCapabilityId::RootComplexRegisterBlock,
            0x000b => PciExpressCapabilityId::VendorSpecificExtendedCapability,
            0x000c => PciExpressCapabilityId::ConfigurationAccessCorrelation,
            0x000d => PciExpressCapabilityId::AccessControlServices,
            0x000e => PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation,
            0x000f => PciExpressCapabilityId::AddressTranslationServices,
            0x0010 => PciExpressCapabilityId::SingleRootIoVirtualization,
            0x0011 => PciExpressCapabilityId::DeprecatedMultiRootIoVirtualization,
            0x0012 => PciExpressCapabilityId::Multicast,
            0x0013 => PciExpressCapabilityId::PageRequestInterface,
            0x0014 => PciExpressCapabilityId::ReservedForAmd,
            0x0015 => PciExpressCapabilityId::ResizeableBar,
            0x0016 => PciExpressCapabilityId::DynamicPowerAllocation,
            0x0017 => PciExpressCapabilityId::ThpRequester,
            0x0018 => PciExpressCapabilityId::LatencyToleranceReporting,
            0x0019 => PciExpressCapabilityId::SecondaryPciExpress,
            0x001a => PciExpressCapabilityId::ProtocolMultiplexing,
            0x001b => PciExpressCapabilityId::ProcessAddressSpaceId,
            0x001c => PciExpressCapabilityId::LnRequester,
            0x001d => PciExpressCapabilityId::DownstreamPortContainment,
            0x001e => PciExpressCapabilityId::L1PmSubstates,
            0x001f => PciExpressCapabilityId::PrecisionTimeMeasurement,
            0x0020 => PciExpressCapabilityId::PciExpressOverMphy,
            0x0021 => PciExpressCapabilityId::FRSQueueing,
            0x0022 => PciExpressCapabilityId::ReadinessTimeReporting,
            0x0023 => PciExpressCapabilityId::DesignatedVendorSpecificExtendedCapability,
            0x0024 => PciExpressCapabilityId::VfResizeableBar,
            0x0025 => PciExpressCapabilityId::DataLinkFeature,
            0x0026 => PciExpressCapabilityId::PhysicalLayerSixteenGts,
            0x0027 => PciExpressCapabilityId::LaneMarginingAtTheReceiver,
            0x0028 => PciExpressCapabilityId::HierarchyId,
            0x0029 => PciExpressCapabilityId::NativePcieEnclosureManagement,
            0x002a => PciExpressCapabilityId::PhysicalLayerThirtyTwoGts,
            0x002b => PciExpressCapabilityId::AlternateProtocol,
            0x002c => PciExpressCapabilityId::SystemFirmwareIntermediary,
            0x002d => PciExpressCapabilityId::ShadowFunctions,
            0x002e => PciExpressCapabilityId::DataObjectExchange,
            0xffff => PciExpressCapabilityId::ExtendedCapabilitiesAbsence,
            _ => PciExpressCapabilityId::Reserved,
        }
    }
}

// (The doc comment "A PCI capability list. Devices can optionally specify
// capabilities in their configuration space." belongs to the `PciCapability`
// trait defined next.)
+pub trait PciCapability { + fn bytes(&self) -> &[u8]; + fn id(&self) -> PciCapabilityId; +} + +fn encode_32_bits_bar_size(bar_size: u32) -> Option { + if bar_size > 0 { + return Some(!(bar_size - 1)); + } + None +} + +fn decode_32_bits_bar_size(bar_size: u32) -> Option { + if bar_size > 0 { + return Some(!bar_size + 1); + } + None +} + +fn encode_64_bits_bar_size(bar_size: u64) -> Option<(u32, u32)> { + if bar_size > 0 { + let result = !(bar_size - 1); + let result_hi = (result >> 32) as u32; + let result_lo = (result & 0xffff_ffff) as u32; + return Some((result_hi, result_lo)); + } + None +} + +fn decode_64_bits_bar_size(bar_size_hi: u32, bar_size_lo: u32) -> Option { + let bar_size: u64 = ((bar_size_hi as u64) << 32) | (bar_size_lo as u64); + if bar_size > 0 { + return Some(!bar_size + 1); + } + None +} + +#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] +struct PciBar { + addr: u32, + size: u32, + used: bool, + r#type: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct PciConfigurationState { + registers: Vec, + writable_bits: Vec, + bars: Vec, + rom_bar_addr: u32, + rom_bar_size: u32, + rom_bar_used: bool, + last_capability: Option<(usize, usize)>, + msix_cap_reg_idx: Option, +} + +/// Contains the configuration space of a PCI node. +/// +/// See the [specification](https://en.wikipedia.org/wiki/PCI_configuration_space). +/// The configuration space is accessed with DWORD reads and writes from the guest. +pub struct PciConfiguration { + registers: [u32; NUM_CONFIGURATION_REGISTERS], + writable_bits: [u32; NUM_CONFIGURATION_REGISTERS], // writable bits for each register. + bars: [PciBar; NUM_BAR_REGS], + rom_bar_addr: u32, + rom_bar_size: u32, + rom_bar_used: bool, + // Contains the byte offset and size of the last capability. 
+ last_capability: Option<(usize, usize)>, + msix_cap_reg_idx: Option, + msix_config: Option>>, +} + +/// See pci_regs.h in kernel +#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +pub enum PciBarRegionType { + Memory32BitRegion = 0, + IoRegion = 0x01, + Memory64BitRegion = 0x04, +} + +impl From for PciBarRegionType { + fn from(type_: PciBarType) -> Self { + match type_ { + PciBarType::Io => PciBarRegionType::IoRegion, + PciBarType::Mmio32 => PciBarRegionType::Memory32BitRegion, + PciBarType::Mmio64 => PciBarRegionType::Memory64BitRegion, + } + } +} + +impl From for PciBarType { + fn from(val: PciBarRegionType) -> Self { + match val { + PciBarRegionType::IoRegion => PciBarType::Io, + PciBarRegionType::Memory32BitRegion => PciBarType::Mmio32, + PciBarRegionType::Memory64BitRegion => PciBarType::Mmio64, + } + } +} + +#[derive(Copy, Clone)] +pub enum PciBarPrefetchable { + NotPrefetchable = 0, + Prefetchable = 0x08, +} + +impl From for bool { + fn from(val: PciBarPrefetchable) -> Self { + match val { + PciBarPrefetchable::NotPrefetchable => false, + PciBarPrefetchable::Prefetchable => true, + } + } +} + +#[derive(Copy, Clone)] +pub struct PciBarConfiguration { + addr: u64, + size: u64, + idx: usize, + region_type: PciBarRegionType, + prefetchable: PciBarPrefetchable, +} + +#[derive(Debug)] +pub enum Error { + BarAddressInvalid(u64, u64), + BarInUse(usize), + BarInUse64(usize), + BarInvalid(usize), + BarInvalid64(usize), + BarSizeInvalid(u64), + CapabilityEmpty, + CapabilityLengthInvalid(usize), + CapabilitySpaceFull(usize), + Decode32BarSize, + Decode64BarSize, + Encode32BarSize, + Encode64BarSize, + RomBarAddressInvalid(u64, u64), + RomBarInUse(usize), + RomBarInvalid(usize), + RomBarSizeInvalid(u64), +} +pub type Result = std::result::Result; + +impl std::error::Error for Error {} + +impl Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + match self { + BarAddressInvalid(a, s) => write!(f, 
"address {a} size {s} too big"), + BarInUse(b) => write!(f, "bar {b} already used"), + BarInUse64(b) => write!(f, "64bit bar {b} already used(requires two regs)"), + BarInvalid(b) => write!(f, "bar {} invalid, max {}", b, NUM_BAR_REGS - 1), + BarInvalid64(b) => write!( + f, + "64bitbar {} invalid, requires two regs, max {}", + b, + NUM_BAR_REGS - 1 + ), + BarSizeInvalid(s) => write!(f, "bar address {s} not a power of two"), + CapabilityEmpty => write!(f, "empty capabilities are invalid"), + CapabilityLengthInvalid(l) => write!(f, "Invalid capability length {l}"), + CapabilitySpaceFull(s) => write!(f, "capability of size {s} doesn't fit"), + Decode32BarSize => write!(f, "failed to decode 32 bits BAR size"), + Decode64BarSize => write!(f, "failed to decode 64 bits BAR size"), + Encode32BarSize => write!(f, "failed to encode 32 bits BAR size"), + Encode64BarSize => write!(f, "failed to encode 64 bits BAR size"), + RomBarAddressInvalid(a, s) => write!(f, "address {a} size {s} too big"), + RomBarInUse(b) => write!(f, "rom bar {b} already used"), + RomBarInvalid(b) => write!(f, "rom bar {} invalid, max {}", b, NUM_BAR_REGS - 1), + RomBarSizeInvalid(s) => write!(f, "rom bar address {s} not a power of two"), + } + } +} + +impl PciConfiguration { + #[allow(clippy::too_many_arguments)] + pub fn new( + vendor_id: u16, + device_id: u16, + revision_id: u8, + class_code: PciClassCode, + subclass: &dyn PciSubclass, + programming_interface: Option<&dyn PciProgrammingInterface>, + header_type: PciHeaderType, + subsystem_vendor_id: u16, + subsystem_id: u16, + msix_config: Option>>, + state: Option, + ) -> Self { + let ( + registers, + writable_bits, + bars, + rom_bar_addr, + rom_bar_size, + rom_bar_used, + last_capability, + msix_cap_reg_idx, + ) = if let Some(state) = state { + ( + state.registers.try_into().unwrap(), + state.writable_bits.try_into().unwrap(), + state.bars.try_into().unwrap(), + state.rom_bar_addr, + state.rom_bar_size, + state.rom_bar_used, + 
state.last_capability, + state.msix_cap_reg_idx, + ) + } else { + let mut registers = [0u32; NUM_CONFIGURATION_REGISTERS]; + let mut writable_bits = [0u32; NUM_CONFIGURATION_REGISTERS]; + registers[0] = (u32::from(device_id) << 16) | u32::from(vendor_id); + // TODO(dverkamp): Status should be write-1-to-clear + writable_bits[1] = 0x0000_ffff; // Status (r/o), command (r/w) + let pi = if let Some(pi) = programming_interface { + pi.get_register_value() + } else { + 0 + }; + registers[2] = (u32::from(class_code.get_register_value()) << 24) + | (u32::from(subclass.get_register_value()) << 16) + | (u32::from(pi) << 8) + | u32::from(revision_id); + writable_bits[3] = 0x0000_00ff; // Cacheline size (r/w) + match header_type { + PciHeaderType::Device => { + registers[3] = 0x0000_0000; // Header type 0 (device) + writable_bits[15] = 0x0000_00ff; // Interrupt line (r/w) + } + PciHeaderType::Bridge => { + registers[3] = 0x0001_0000; // Header type 1 (bridge) + writable_bits[9] = 0xfff0_fff0; // Memory base and limit + writable_bits[15] = 0xffff_00ff; // Bridge control (r/w), interrupt line (r/w) + } + }; + registers[11] = (u32::from(subsystem_id) << 16) | u32::from(subsystem_vendor_id); + + ( + registers, + writable_bits, + [PciBar::default(); NUM_BAR_REGS], + 0, + 0, + false, + None, + None, + ) + }; + + PciConfiguration { + registers, + writable_bits, + bars, + rom_bar_addr, + rom_bar_size, + rom_bar_used, + last_capability, + msix_cap_reg_idx, + msix_config, + } + } + + pub fn state(&self) -> PciConfigurationState { + PciConfigurationState { + registers: self.registers.to_vec(), + writable_bits: self.writable_bits.to_vec(), + bars: self.bars.to_vec(), + rom_bar_addr: self.rom_bar_addr, + rom_bar_size: self.rom_bar_size, + rom_bar_used: self.rom_bar_used, + last_capability: self.last_capability, + msix_cap_reg_idx: self.msix_cap_reg_idx, + } + } + + /// Reads a 32bit register from `reg_idx` in the register map. 
+ pub fn read_reg(&self, reg_idx: usize) -> u32 { + *(self.registers.get(reg_idx).unwrap_or(&0xffff_ffff)) + } + + /// Writes a 32bit register to `reg_idx` in the register map. + pub fn write_reg(&mut self, reg_idx: usize, value: u32) { + let mut mask = self.writable_bits[reg_idx]; + + if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(®_idx) { + // Handle very specific case where the BAR is being written with + // all 1's to retrieve the BAR size during next BAR reading. + if value == 0xffff_ffff { + mask &= self.bars[reg_idx - 4].size; + } + } else if reg_idx == ROM_BAR_REG { + // Handle very specific case where the BAR is being written with + // all 1's on bits 31-11 to retrieve the BAR size during next BAR + // reading. + if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK { + mask &= self.rom_bar_size; + } + } + + if let Some(r) = self.registers.get_mut(reg_idx) { + *r = (*r & !self.writable_bits[reg_idx]) | (value & mask); + } else { + warn!("bad PCI register write {}", reg_idx); + } + } + + /// Writes a 16bit word to `offset`. `offset` must be 16bit aligned. + pub fn write_word(&mut self, offset: usize, value: u16) { + let shift = match offset % 4 { + 0 => 0, + 2 => 16, + _ => { + warn!("bad PCI config write offset {}", offset); + return; + } + }; + let reg_idx = offset / 4; + + if let Some(r) = self.registers.get_mut(reg_idx) { + let writable_mask = self.writable_bits[reg_idx]; + let mask = (0xffffu32 << shift) & writable_mask; + let shifted_value = (u32::from(value) << shift) & writable_mask; + *r = *r & !mask | shifted_value; + } else { + warn!("bad PCI config write offset {}", offset); + } + } + + /// Writes a byte to `offset`. + pub fn write_byte(&mut self, offset: usize, value: u8) { + self.write_byte_internal(offset, value, true); + } + + /// Writes a byte to `offset`, optionally enforcing read-only bits. 
+ fn write_byte_internal(&mut self, offset: usize, value: u8, apply_writable_mask: bool) { + let shift = (offset % 4) * 8; + let reg_idx = offset / 4; + + if let Some(r) = self.registers.get_mut(reg_idx) { + let writable_mask = if apply_writable_mask { + self.writable_bits[reg_idx] + } else { + 0xffff_ffff + }; + let mask = (0xffu32 << shift) & writable_mask; + let shifted_value = (u32::from(value) << shift) & writable_mask; + *r = *r & !mask | shifted_value; + } else { + warn!("bad PCI config write offset {}", offset); + } + } + + /// Adds a region specified by `config`. Configures the specified BAR(s) to + /// report this region and size to the guest kernel. Enforces a few constraints + /// (i.e, region size must be power of two, register not already used). + pub fn add_pci_bar(&mut self, config: &PciBarConfiguration) -> Result<()> { + let bar_idx = config.idx; + let reg_idx = BAR0_REG + bar_idx; + + if self.bars[bar_idx].used { + return Err(Error::BarInUse(bar_idx)); + } + + if !config.size.is_power_of_two() { + return Err(Error::BarSizeInvalid(config.size)); + } + + if bar_idx >= NUM_BAR_REGS { + return Err(Error::BarInvalid(bar_idx)); + } + + let end_addr = config + .addr + .checked_add(config.size - 1) + .ok_or(Error::BarAddressInvalid(config.addr, config.size))?; + match config.region_type { + PciBarRegionType::Memory32BitRegion | PciBarRegionType::IoRegion => { + if end_addr > u64::from(u32::MAX) { + return Err(Error::BarAddressInvalid(config.addr, config.size)); + } + + // Encode the BAR size as expected by the software running in + // the guest. + self.bars[bar_idx].size = + encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; + } + PciBarRegionType::Memory64BitRegion => { + if bar_idx + 1 >= NUM_BAR_REGS { + return Err(Error::BarInvalid64(bar_idx)); + } + + if self.bars[bar_idx + 1].used { + return Err(Error::BarInUse64(bar_idx)); + } + + // Encode the BAR size as expected by the software running in + // the guest. 
+ let (bar_size_hi, bar_size_lo) = + encode_64_bits_bar_size(config.size).ok_or(Error::Encode64BarSize)?; + + self.registers[reg_idx + 1] = (config.addr >> 32) as u32; + self.writable_bits[reg_idx + 1] = 0xffff_ffff; + self.bars[bar_idx + 1].addr = self.registers[reg_idx + 1]; + self.bars[bar_idx].size = bar_size_lo; + self.bars[bar_idx + 1].size = bar_size_hi; + self.bars[bar_idx + 1].used = true; + } + } + + let (mask, lower_bits) = match config.region_type { + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => ( + BAR_MEM_ADDR_MASK, + config.prefetchable as u32 | config.region_type as u32, + ), + PciBarRegionType::IoRegion => (BAR_IO_ADDR_MASK, config.region_type as u32), + }; + + self.registers[reg_idx] = ((config.addr as u32) & mask) | lower_bits; + self.writable_bits[reg_idx] = mask; + self.bars[bar_idx].addr = self.registers[reg_idx]; + self.bars[bar_idx].used = true; + self.bars[bar_idx].r#type = Some(config.region_type); + + Ok(()) + } + + /// Adds rom expansion BAR. 
+ pub fn add_pci_rom_bar(&mut self, config: &PciBarConfiguration, active: u32) -> Result<()> { + let bar_idx = config.idx; + let reg_idx = ROM_BAR_REG; + + if self.rom_bar_used { + return Err(Error::RomBarInUse(bar_idx)); + } + + if !config.size.is_power_of_two() { + return Err(Error::RomBarSizeInvalid(config.size)); + } + + if bar_idx != ROM_BAR_IDX { + return Err(Error::RomBarInvalid(bar_idx)); + } + + let end_addr = config + .addr + .checked_add(config.size - 1) + .ok_or(Error::RomBarAddressInvalid(config.addr, config.size))?; + + if end_addr > u64::from(u32::MAX) { + return Err(Error::RomBarAddressInvalid(config.addr, config.size)); + } + + self.registers[reg_idx] = (config.addr as u32) | active; + self.writable_bits[reg_idx] = ROM_BAR_ADDR_MASK; + self.rom_bar_addr = self.registers[reg_idx]; + self.rom_bar_size = + encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; + self.rom_bar_used = true; + + Ok(()) + } + + /// Returns the address of the given BAR region. + pub fn get_bar_addr(&self, bar_num: usize) -> u64 { + let bar_idx = BAR0_REG + bar_num; + + let mut addr = u64::from(self.bars[bar_num].addr & self.writable_bits[bar_idx]); + + if let Some(bar_type) = self.bars[bar_num].r#type { + if bar_type == PciBarRegionType::Memory64BitRegion { + addr |= u64::from(self.bars[bar_num + 1].addr) << 32; + } + } + + addr + } + + /// Configures the IRQ line and pin used by this device. + pub fn set_irq(&mut self, line: u8, pin: PciInterruptPin) { + // `pin` is 1-based in the pci config space. + let pin_idx = (pin as u32) + 1; + self.registers[INTERRUPT_LINE_PIN_REG] = (self.registers[INTERRUPT_LINE_PIN_REG] + & 0xffff_0000) + | (pin_idx << 8) + | u32::from(line); + } + + /// Adds the capability `cap_data` to the list of capabilities. + /// `cap_data` should include the two-byte PCI capability header (type, next), + /// but not populate it. Correct values will be generated automatically based + /// on `cap_data.id()`. 
+ pub fn add_capability(&mut self, cap_data: &dyn PciCapability) -> Result { + let total_len = cap_data.bytes().len(); + // Check that the length is valid. + if cap_data.bytes().is_empty() { + return Err(Error::CapabilityEmpty); + } + let (cap_offset, tail_offset) = match self.last_capability { + Some((offset, len)) => (Self::next_dword(offset, len), offset + 1), + None => (FIRST_CAPABILITY_OFFSET, CAPABILITY_LIST_HEAD_OFFSET), + }; + let end_offset = cap_offset + .checked_add(total_len) + .ok_or(Error::CapabilitySpaceFull(total_len))?; + if end_offset > CAPABILITY_MAX_OFFSET { + return Err(Error::CapabilitySpaceFull(total_len)); + } + self.registers[STATUS_REG] |= STATUS_REG_CAPABILITIES_USED_MASK; + self.write_byte_internal(tail_offset, cap_offset as u8, false); + self.write_byte_internal(cap_offset, cap_data.id() as u8, false); + self.write_byte_internal(cap_offset + 1, 0, false); // Next pointer. + for (i, byte) in cap_data.bytes().iter().enumerate() { + self.write_byte_internal(cap_offset + i + 2, *byte, false); + } + self.last_capability = Some((cap_offset, total_len)); + + match cap_data.id() { + PciCapabilityId::MessageSignalledInterrupts => { + self.writable_bits[cap_offset / 4] = MSI_CAPABILITY_REGISTER_MASK; + } + PciCapabilityId::MsiX => { + self.msix_cap_reg_idx = Some(cap_offset / 4); + self.writable_bits[self.msix_cap_reg_idx.unwrap()] = MSIX_CAPABILITY_REGISTER_MASK; + } + _ => {} + } + + Ok(cap_offset) + } + + // Find the next aligned offset after the one given. 
+ fn next_dword(offset: usize, len: usize) -> usize { + let next = offset + len; + (next + 3) & !3 + } + + pub fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + + // Handle potential write to MSI-X message control register + if let Some(msix_cap_reg_idx) = self.msix_cap_reg_idx { + if let Some(msix_config) = &self.msix_config { + if msix_cap_reg_idx == reg_idx && offset == 2 && data.len() == 2 { + msix_config + .lock() + .unwrap() + .set_msg_ctl(LittleEndian::read_u16(data)); + } else if msix_cap_reg_idx == reg_idx && offset == 0 && data.len() == 4 { + msix_config + .lock() + .unwrap() + .set_msg_ctl((LittleEndian::read_u32(data) >> 16) as u16); + } + } + } + + match data.len() { + 1 => self.write_byte(reg_idx * 4 + offset as usize, data[0]), + 2 => self.write_word( + reg_idx * 4 + offset as usize, + u16::from(data[0]) | (u16::from(data[1]) << 8), + ), + 4 => self.write_reg(reg_idx, LittleEndian::read_u32(data)), + _ => (), + } + } + + pub fn read_config_register(&self, reg_idx: usize) -> u32 { + self.read_reg(reg_idx) + } + + pub fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + if data.len() != 4 { + return None; + } + + let value = LittleEndian::read_u32(data); + + let mask = self.writable_bits[reg_idx]; + if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(®_idx) { + // Ignore the case where the BAR size is being asked for. + if value == 0xffff_ffff { + return None; + } + + let bar_idx = reg_idx - 4; + // Handle special case where the address being written is + // different from the address initially provided. This is a + // BAR reprogramming case which needs to be properly caught. + if let Some(bar_type) = self.bars[bar_idx].r#type { + // In case of 64 bits memory BAR, we don't do anything until + // the upper BAR is modified, otherwise we would be moving the + // BAR to a wrong location in memory. 
+ if bar_type == PciBarRegionType::Memory64BitRegion { + return None; + } + + // Ignore the case where the value is unchanged. + if (value & mask) == (self.bars[bar_idx].addr & mask) { + return None; + } + + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.bars[bar_idx].addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.bars[bar_idx].size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = bar_type; + + self.bars[bar_idx].addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } else if (reg_idx > BAR0_REG) + && ((self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]) + != (self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]) + || (value & mask) != (self.bars[bar_idx].addr & mask)) + { + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = (u64::from(self.bars[bar_idx].addr & mask) << 32) + | u64::from(self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]); + let new_base = (u64::from(value & mask) << 32) + | u64::from(self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]); + let len = + decode_64_bits_bar_size(self.bars[bar_idx].size, self.bars[bar_idx - 1].size) + .ok_or(Error::Decode64BarSize) + .unwrap(); + let region_type = PciBarRegionType::Memory64BitRegion; + + self.bars[bar_idx].addr = value; + self.bars[bar_idx - 1].addr = self.registers[reg_idx - 1]; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + } else if reg_idx == ROM_BAR_REG && (value & mask) != (self.rom_bar_addr & mask) { + // Ignore the case where the BAR size is being asked for. 
+ if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK { + return None; + } + + info!( + "Detected ROM BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.rom_bar_addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.rom_bar_size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = PciBarRegionType::Memory32BitRegion; + + self.rom_bar_addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + + None + } +} + +impl Default for PciBarConfiguration { + fn default() -> Self { + PciBarConfiguration { + idx: 0, + addr: 0, + size: 0, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::NotPrefetchable, + } + } +} + +impl PciBarConfiguration { + pub fn new( + idx: usize, + size: u64, + region_type: PciBarRegionType, + prefetchable: PciBarPrefetchable, + ) -> Self { + PciBarConfiguration { + idx, + addr: 0, + size, + region_type, + prefetchable, + } + } + + #[must_use] + pub fn set_index(mut self, idx: usize) -> Self { + self.idx = idx; + self + } + + #[must_use] + pub fn set_address(mut self, addr: u64) -> Self { + self.addr = addr; + self + } + + #[must_use] + pub fn set_size(mut self, size: u64) -> Self { + self.size = size; + self + } + + #[must_use] + pub fn set_region_type(mut self, region_type: PciBarRegionType) -> Self { + self.region_type = region_type; + self + } + + #[must_use] + pub fn set_prefetchable(mut self, prefetchable: PciBarPrefetchable) -> Self { + self.prefetchable = prefetchable; + self + } + + pub fn idx(&self) -> usize { + self.idx + } + + pub fn addr(&self) -> u64 { + self.addr + } + + pub fn size(&self) -> u64 { + self.size + } + + pub fn region_type(&self) -> PciBarRegionType { + self.region_type + } + + pub fn prefetchable(&self) -> PciBarPrefetchable { + self.prefetchable + } +} + +#[cfg(test)] +mod tests { + use 
vm_memory::ByteValued; + + use super::*; + + #[repr(C, packed)] + #[derive(Clone, Copy, Default)] + #[allow(dead_code)] + struct TestCap { + len: u8, + foo: u8, + } + + // SAFETY: All members are simple numbers and any value is valid. + unsafe impl ByteValued for TestCap {} + + impl PciCapability for TestCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } + } + + #[test] + fn add_capability() { + let mut cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + None, + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + // Add two capabilities with different contents. + let cap1 = TestCap { len: 4, foo: 0xAA }; + let cap1_offset = cfg.add_capability(&cap1).unwrap(); + assert_eq!(cap1_offset % 4, 0); + + let cap2 = TestCap { + len: 0x04, + foo: 0x55, + }; + let cap2_offset = cfg.add_capability(&cap2).unwrap(); + assert_eq!(cap2_offset % 4, 0); + + // The capability list head should be pointing to cap1. + let cap_ptr = cfg.read_reg(CAPABILITY_LIST_HEAD_OFFSET / 4) & 0xFF; + assert_eq!(cap1_offset, cap_ptr as usize); + + // Verify the contents of the capabilities. 
+ let cap1_data = cfg.read_reg(cap1_offset / 4); + assert_eq!(cap1_data & 0xFF, 0x09); // capability ID + assert_eq!((cap1_data >> 8) & 0xFF, cap2_offset as u32); // next capability pointer + assert_eq!((cap1_data >> 16) & 0xFF, 0x04); // cap1.len + assert_eq!((cap1_data >> 24) & 0xFF, 0xAA); // cap1.foo + + let cap2_data = cfg.read_reg(cap2_offset / 4); + assert_eq!(cap2_data & 0xFF, 0x09); // capability ID + assert_eq!((cap2_data >> 8) & 0xFF, 0x00); // next capability pointer + assert_eq!((cap2_data >> 16) & 0xFF, 0x04); // cap2.len + assert_eq!((cap2_data >> 24) & 0xFF, 0x55); // cap2.foo + } + + #[derive(Copy, Clone)] + enum TestPi { + Test = 0x5a, + } + + impl PciProgrammingInterface for TestPi { + fn get_register_value(&self) -> u8 { + *self as u8 + } + } + + #[test] + fn class_code() { + let cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + Some(&TestPi::Test), + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + let class_reg = cfg.read_reg(2); + let class_code = (class_reg >> 24) & 0xFF; + let subclass = (class_reg >> 16) & 0xFF; + let prog_if = (class_reg >> 8) & 0xFF; + assert_eq!(class_code, 0x04); + assert_eq!(subclass, 0x01); + assert_eq!(prog_if, 0x5a); + } +} diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs new file mode 100644 index 00000000000..d3bd3056a36 --- /dev/null +++ b/src/pci/src/device.rs @@ -0,0 +1,136 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::fmt::{self, Display}; +use std::sync::{Arc, Barrier}; +use std::{io, result}; + +use vm_allocator::AddressAllocator; +use vm_device::Resource; + +use crate::configuration::{self, PciBarRegionType}; +use crate::PciBarConfiguration; + +#[derive(Debug)] +pub enum Error { + /// Setup of the device capabilities failed. + CapabilitiesSetup(configuration::Error), + /// Allocating space for an IO BAR failed. + IoAllocationFailed(u64), + /// Registering an IO BAR failed. + IoRegistrationFailed(u64, configuration::Error), + /// Expected resource not found. + MissingResource, + /// Invalid resource. + InvalidResource(Resource), +} +pub type Result = std::result::Result; + +impl Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + + match self { + CapabilitiesSetup(e) => write!(f, "failed to add capability {e}"), + IoAllocationFailed(size) => { + write!(f, "failed to allocate space for an IO BAR, size={size}") + } + IoRegistrationFailed(addr, e) => { + write!(f, "failed to register an IO BAR, addr={addr} err={e}") + } + MissingResource => write!(f, "failed to find expected resource"), + InvalidResource(r) => write!(f, "invalid resource {r:?}"), + } + } +} + +#[derive(Clone, Copy)] +pub struct BarReprogrammingParams { + pub old_base: u64, + pub new_base: u64, + pub len: u64, + pub region_type: PciBarRegionType, +} + +pub trait PciDevice: Send { + /// Allocates the needed PCI BARs space using the `allocate` function which takes a size and + /// returns an address. Returns a Vec of (GuestAddress, GuestUsize) tuples. + fn allocate_bars( + &mut self, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + _resources: Option>, + ) -> Result> { + Ok(Vec::new()) + } + + /// Frees the PCI BARs previously allocated with a call to allocate_bars(). 
+ fn free_bars( + &mut self, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + ) -> Result<()> { + Ok(()) + } + + /// Sets a register in the configuration space. + /// * `reg_idx` - The index of the config register to modify. + /// * `offset` - Offset into the register. + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option>; + /// Gets a register from the configuration space. + /// * `reg_idx` - The index of the config register to read. + fn read_config_register(&mut self, reg_idx: usize) -> u32; + /// Detects if a BAR is being reprogrammed. + fn detect_bar_reprogramming( + &mut self, + _reg_idx: usize, + _data: &[u8], + ) -> Option { + None + } + /// Reads from a BAR region mapped into the device. + /// * `addr` - The guest address inside the BAR. + /// * `data` - Filled with the data from `addr`. + fn read_bar(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} + /// Writes to a BAR region mapped into the device. + /// * `addr` - The guest address inside the BAR. + /// * `data` - The data to write. + fn write_bar(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { + None + } + /// Relocates the BAR to a different address in guest address space. + fn move_bar(&mut self, _old_base: u64, _new_base: u64) -> result::Result<(), io::Error> { + Ok(()) + } + /// Provides a mutable reference to the Any trait. This is useful to let + /// the caller have access to the underlying type behind the trait. + fn as_any_mut(&mut self) -> &mut dyn Any; + + /// Optionally returns a unique identifier. + fn id(&self) -> Option; +} + +/// This trait defines a set of functions which can be triggered whenever a +/// PCI device is modified in any way. +pub trait DeviceRelocation: Send + Sync { + /// The BAR needs to be moved to a different location in the guest address + /// space. This follows a decision from the software running in the guest. 
+ fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + pci_dev: &mut dyn PciDevice, + region_type: PciBarRegionType, + ) -> result::Result<(), io::Error>; +} diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs new file mode 100644 index 00000000000..2672159e474 --- /dev/null +++ b/src/pci/src/lib.rs @@ -0,0 +1,198 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Implements pci devices and busses. +#[macro_use] +extern crate log; + +mod bus; +mod configuration; +mod device; +mod msi; +mod msix; + +use std::fmt::{self, Debug, Display}; +use std::num::ParseIntError; +use std::str::FromStr; + +use serde::de::Visitor; + +pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; +pub use self::configuration::{ + PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, + PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, + PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, + PCI_CONFIGURATION_ID, +}; +pub use self::device::{ + BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, +}; +pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; +pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE}; + +/// PCI has four interrupt pins A->D. 
+#[derive(Copy, Clone)] +pub enum PciInterruptPin { + IntA, + IntB, + IntC, + IntD, +} + +impl PciInterruptPin { + pub fn to_mask(self) -> u32 { + self as u32 + } +} + +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT: u64 = 0xcf8; +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT_SIZE: u64 = 0x8; + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub struct PciBdf(u32); + +struct PciBdfVisitor; + +impl Visitor<'_> for PciBdfVisitor { + type Value = PciBdf; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("struct PciBdf") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(v.into()) + } +} + +impl<'de> serde::Deserialize<'de> for PciBdf { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_str(PciBdfVisitor) + } +} + +impl serde::Serialize for PciBdf { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.collect_str(&self.to_string()) + } +} + +impl PciBdf { + pub fn segment(&self) -> u16 { + ((self.0 >> 16) & 0xffff) as u16 + } + + pub fn bus(&self) -> u8 { + ((self.0 >> 8) & 0xff) as u8 + } + + pub fn device(&self) -> u8 { + ((self.0 >> 3) & 0x1f) as u8 + } + + pub fn function(&self) -> u8 { + (self.0 & 0x7) as u8 + } + + pub fn new(segment: u16, bus: u8, device: u8, function: u8) -> Self { + Self( + ((segment as u32) << 16) + | ((bus as u32) << 8) + | (((device & 0x1f) as u32) << 3) + | (function & 0x7) as u32, + ) + } +} + +impl From for PciBdf { + fn from(bdf: u32) -> Self { + Self(bdf) + } +} + +impl From for u32 { + fn from(bdf: PciBdf) -> Self { + bdf.0 + } +} + +impl From<&PciBdf> for u32 { + fn from(bdf: &PciBdf) -> Self { + bdf.0 + } +} + +impl From for u16 { + fn from(bdf: PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl From<&PciBdf> for u16 { + fn from(bdf: &PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl Debug for PciBdf 
{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:01x}", + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } +} + +impl Display for PciBdf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:01x}", + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } +} + +impl FromStr for PciBdf { + type Err = ParseIntError; + + fn from_str(s: &str) -> Result { + let items: Vec<&str> = s.split('.').collect(); + assert_eq!(items.len(), 2); + let function = u8::from_str_radix(items[1], 16)?; + let items: Vec<&str> = items[0].split(':').collect(); + assert_eq!(items.len(), 3); + let segment = u16::from_str_radix(items[0], 16)?; + let bus = u8::from_str_radix(items[1], 16)?; + let device = u8::from_str_radix(items[2], 16)?; + Ok(PciBdf::new(segment, bus, device, function)) + } +} + +impl From<&str> for PciBdf { + fn from(bdf: &str) -> Self { + Self::from_str(bdf).unwrap() + } +} diff --git a/src/pci/src/msi.rs b/src/pci/src/msi.rs new file mode 100644 index 00000000000..16d593cd115 --- /dev/null +++ b/src/pci/src/msi.rs @@ -0,0 +1,282 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use std::io; +use std::sync::Arc; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; + +// MSI control masks +const MSI_CTL_ENABLE: u16 = 0x1; +const MSI_CTL_MULTI_MSG_ENABLE: u16 = 0x70; +const MSI_CTL_64_BITS: u16 = 0x80; +const MSI_CTL_PER_VECTOR: u16 = 0x100; + +// MSI message offsets +const MSI_MSG_CTL_OFFSET: u64 = 0x2; +const MSI_MSG_ADDR_LO_OFFSET: u64 = 0x4; + +// MSI message masks +const MSI_MSG_ADDR_LO_MASK: u32 = 0xffff_fffc; + +pub fn msi_num_enabled_vectors(msg_ctl: u16) -> usize { + let field = 
(msg_ctl >> 4) & 0x7; + + if field > 5 { + return 0; + } + + 1 << field +} + +#[derive(Error, Debug)] +pub enum Error { + #[error("Failed enabling the interrupt route: {0}")] + EnableInterruptRoute(io::Error), + #[error("Failed updating the interrupt route: {0}")] + UpdateInterruptRoute(io::Error), +} + +#[derive(Clone, Copy, Default, Serialize, Deserialize)] +pub struct MsiCap { + // Message Control Register + // 0: MSI enable. + // 3-1; Multiple message capable. + // 6-4: Multiple message enable. + // 7: 64 bits address capable. + // 8: Per-vector masking capable. + // 15-9: Reserved. + pub msg_ctl: u16, + // Message Address (LSB) + // 1-0: Reserved. + // 31-2: Message address. + pub msg_addr_lo: u32, + // Message Upper Address (MSB) + // 31-0: Message address. + pub msg_addr_hi: u32, + // Message Data + // 15-0: Message data. + pub msg_data: u16, + // Mask Bits + // 31-0: Mask bits. + pub mask_bits: u32, + // Pending Bits + // 31-0: Pending bits. + pub pending_bits: u32, +} + +impl MsiCap { + fn addr_64_bits(&self) -> bool { + self.msg_ctl & MSI_CTL_64_BITS == MSI_CTL_64_BITS + } + + fn per_vector_mask(&self) -> bool { + self.msg_ctl & MSI_CTL_PER_VECTOR == MSI_CTL_PER_VECTOR + } + + fn enabled(&self) -> bool { + self.msg_ctl & MSI_CTL_ENABLE == MSI_CTL_ENABLE + } + + fn num_enabled_vectors(&self) -> usize { + msi_num_enabled_vectors(self.msg_ctl) + } + + fn vector_masked(&self, vector: usize) -> bool { + if !self.per_vector_mask() { + return false; + } + + (self.mask_bits >> vector) & 0x1 == 0x1 + } + + fn size(&self) -> u64 { + let mut size: u64 = 0xa; + + if self.addr_64_bits() { + size += 0x4; + } + if self.per_vector_mask() { + size += 0xa; + } + + size + } + + fn update(&mut self, offset: u64, data: &[u8]) { + // Calculate message data offset depending on the address being 32 or + // 64 bits. + // Calculate upper address offset if the address is 64 bits. 
+ // Calculate mask bits offset based on the address being 32 or 64 bits + // and based on the per vector masking being enabled or not. + let (msg_data_offset, addr_hi_offset, mask_bits_offset): (u64, Option, Option) = + if self.addr_64_bits() { + let mask_bits = if self.per_vector_mask() { + Some(0x10) + } else { + None + }; + (0xc, Some(0x8), mask_bits) + } else { + let mask_bits = if self.per_vector_mask() { + Some(0xc) + } else { + None + }; + (0x8, None, mask_bits) + }; + + // Update cache without overriding the read-only bits. + match data.len() { + 2 => { + let value = LittleEndian::read_u16(data); + match offset { + MSI_MSG_CTL_OFFSET => { + self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + | (value & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + } + x if x == msg_data_offset => self.msg_data = value, + _ => error!("invalid offset"), + } + } + 4 => { + let value = LittleEndian::read_u32(data); + match offset { + 0x0 => { + self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + | ((value >> 16) as u16 & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + } + MSI_MSG_ADDR_LO_OFFSET => self.msg_addr_lo = value & MSI_MSG_ADDR_LO_MASK, + x if x == msg_data_offset => self.msg_data = value as u16, + x if addr_hi_offset.is_some() && x == addr_hi_offset.unwrap() => { + self.msg_addr_hi = value + } + x if mask_bits_offset.is_some() && x == mask_bits_offset.unwrap() => { + self.mask_bits = value + } + _ => error!("invalid offset"), + } + } + _ => error!("invalid data length"), + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct MsiConfigState { + cap: MsiCap, +} + +pub struct MsiConfig { + pub cap: MsiCap, + interrupt_source_group: Arc, +} + +impl MsiConfig { + pub fn new( + msg_ctl: u16, + interrupt_source_group: Arc, + state: Option, + ) -> Result { + let cap = if let Some(state) = state { + if state.cap.enabled() { + for idx in 0..state.cap.num_enabled_vectors() { + let config = MsiIrqSourceConfig { + 
high_addr: state.cap.msg_addr_hi, + low_addr: state.cap.msg_addr_lo, + data: state.cap.msg_data as u32, + devid: 0, + }; + + interrupt_source_group + .update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + state.cap.vector_masked(idx), + false, + ) + .map_err(Error::UpdateInterruptRoute)?; + } + + interrupt_source_group + .set_gsi() + .map_err(Error::EnableInterruptRoute)?; + + interrupt_source_group + .enable() + .map_err(Error::EnableInterruptRoute)?; + } + + state.cap + } else { + MsiCap { + msg_ctl, + ..Default::default() + } + }; + + Ok(MsiConfig { + cap, + interrupt_source_group, + }) + } + + pub fn state(&self) -> MsiConfigState { + MsiConfigState { cap: self.cap } + } + + pub fn enabled(&self) -> bool { + self.cap.enabled() + } + + pub fn size(&self) -> u64 { + self.cap.size() + } + + pub fn num_enabled_vectors(&self) -> usize { + self.cap.num_enabled_vectors() + } + + pub fn update(&mut self, offset: u64, data: &[u8]) { + let old_enabled = self.cap.enabled(); + + self.cap.update(offset, data); + + if self.cap.enabled() { + for idx in 0..self.num_enabled_vectors() { + let config = MsiIrqSourceConfig { + high_addr: self.cap.msg_addr_hi, + low_addr: self.cap.msg_addr_lo, + data: self.cap.msg_data as u32, + devid: 0, + }; + + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + self.cap.vector_masked(idx), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + + if !old_enabled { + if let Err(e) = self.interrupt_source_group.enable() { + error!("Failed enabling irq_fd: {:?}", e); + } + } + } else if old_enabled { + if let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); + } + } + } +} diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs new file mode 100644 index 00000000000..4b3cf688980 --- /dev/null +++ b/src/pci/src/msix.rs @@ -0,0 +1,552 @@ +// Copyright © 2019 Intel Corporation +// +// 
SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use std::sync::Arc; +use std::{io, result}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; +use vm_memory::ByteValued; + +use crate::{PciCapability, PciCapabilityId}; + +const MAX_MSIX_VECTORS_PER_DEVICE: u16 = 2048; +const MSIX_TABLE_ENTRIES_MODULO: u64 = 16; +const MSIX_PBA_ENTRIES_MODULO: u64 = 8; +const BITS_PER_PBA_ENTRY: usize = 64; +const FUNCTION_MASK_BIT: u8 = 14; +const MSIX_ENABLE_BIT: u8 = 15; +const FUNCTION_MASK_MASK: u16 = (1 << FUNCTION_MASK_BIT) as u16; +const MSIX_ENABLE_MASK: u16 = (1 << MSIX_ENABLE_BIT) as u16; +pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; +pub const MSIX_CONFIG_ID: &str = "msix_config"; + +#[derive(Debug)] +pub enum Error { + /// Failed enabling the interrupt route. + EnableInterruptRoute(io::Error), + /// Failed updating the interrupt route. 
+ UpdateInterruptRoute(io::Error), +} + +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +pub struct MsixTableEntry { + pub msg_addr_lo: u32, + pub msg_addr_hi: u32, + pub msg_data: u32, + pub vector_ctl: u32, +} + +impl MsixTableEntry { + pub fn masked(&self) -> bool { + self.vector_ctl & 0x1 == 0x1 + } +} + +impl Default for MsixTableEntry { + fn default() -> Self { + MsixTableEntry { + msg_addr_lo: 0, + msg_addr_hi: 0, + msg_data: 0, + vector_ctl: 0x1, + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct MsixConfigState { + table_entries: Vec, + pba_entries: Vec, + masked: bool, + enabled: bool, +} + +pub struct MsixConfig { + pub table_entries: Vec, + pub pba_entries: Vec, + pub devid: u32, + interrupt_source_group: Arc, + masked: bool, + enabled: bool, +} + +impl MsixConfig { + pub fn new( + msix_vectors: u16, + interrupt_source_group: Arc, + devid: u32, + state: Option, + ) -> result::Result { + assert!(msix_vectors <= MAX_MSIX_VECTORS_PER_DEVICE); + + let (table_entries, pba_entries, masked, enabled) = if let Some(state) = state { + if state.enabled && !state.masked { + for (idx, table_entry) in state.table_entries.iter().enumerate() { + if table_entry.masked() { + continue; + } + + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid, + }; + + interrupt_source_group + .update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + state.masked, + true, + ) + .map_err(Error::UpdateInterruptRoute)?; + + interrupt_source_group + .enable() + .map_err(Error::EnableInterruptRoute)?; + } + } + + ( + state.table_entries, + state.pba_entries, + state.masked, + state.enabled, + ) + } else { + let mut table_entries: Vec = Vec::new(); + table_entries.resize_with(msix_vectors as usize, Default::default); + let mut pba_entries: Vec = Vec::new(); + let num_pba_entries: usize = ((msix_vectors as usize) / BITS_PER_PBA_ENTRY) + 1; + 
pba_entries.resize_with(num_pba_entries, Default::default); + + (table_entries, pba_entries, true, false) + }; + + Ok(MsixConfig { + table_entries, + pba_entries, + devid, + interrupt_source_group, + masked, + enabled, + }) + } + + pub fn state(&self) -> MsixConfigState { + MsixConfigState { + table_entries: self.table_entries.clone(), + pba_entries: self.pba_entries.clone(), + masked: self.masked, + enabled: self.enabled, + } + } + + pub fn masked(&self) -> bool { + self.masked + } + + pub fn enabled(&self) -> bool { + self.enabled + } + + pub fn set_msg_ctl(&mut self, reg: u16) { + let old_masked = self.masked; + let old_enabled = self.enabled; + + self.masked = ((reg >> FUNCTION_MASK_BIT) & 1u16) == 1u16; + self.enabled = ((reg >> MSIX_ENABLE_BIT) & 1u16) == 1u16; + + // Update interrupt routing + if old_masked != self.masked || old_enabled != self.enabled { + if self.enabled && !self.masked { + debug!("MSI-X enabled for device 0x{:x}", self.devid); + for (idx, table_entry) in self.table_entries.iter().enumerate() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + } else if old_enabled || !old_masked { + debug!("MSI-X disabled for device 0x{:x}", self.devid); + if let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); + } + } + } + + // If the Function Mask bit was set, and has just been cleared, it's + // important to go through the entire PBA to check if there was any + // pending MSI-X message to inject, given that the vector is not + // masked. 
+ if old_masked && !self.masked { + for (index, entry) in self.table_entries.clone().iter().enumerate() { + if !entry.masked() && self.get_pba_bit(index as u16) == 1 { + self.inject_msix_and_clear_pba(index); + } + } + } + } + + pub fn read_table(&self, offset: u64, data: &mut [u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + if index >= self.table_entries.len() { + debug!("Invalid MSI-X table entry index {index}"); + data.copy_from_slice(&[0xff; 8][..data.len()]); + return; + } + + match data.len() { + 4 => { + let value = match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo, + 0x4 => self.table_entries[index].msg_addr_hi, + 0x8 => self.table_entries[index].msg_data, + 0xc => self.table_entries[index].vector_ctl, + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u32(data, value); + } + 8 => { + let value = match modulo_offset { + 0x0 => { + (u64::from(self.table_entries[index].msg_addr_hi) << 32) + | u64::from(self.table_entries[index].msg_addr_lo) + } + 0x8 => { + (u64::from(self.table_entries[index].vector_ctl) << 32) + | u64::from(self.table_entries[index].msg_data) + } + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u64(data, value); + } + _ => { + error!("invalid data length"); + } + } + } + + pub fn write_table(&mut self, offset: u64, data: &[u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + if index >= self.table_entries.len() { + debug!("Invalid MSI-X table entry index {index}"); + return; + } + + // Store the value of the entry before modification + let old_entry = 
self.table_entries[index].clone(); + + match data.len() { + 4 => { + let value = LittleEndian::read_u32(data); + match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo = value, + 0x4 => self.table_entries[index].msg_addr_hi = value, + 0x8 => self.table_entries[index].msg_data = value, + 0xc => { + self.table_entries[index].vector_ctl = value; + } + _ => error!("invalid offset"), + }; + + debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); + } + 8 => { + let value = LittleEndian::read_u64(data); + match modulo_offset { + 0x0 => { + self.table_entries[index].msg_addr_lo = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].msg_addr_hi = (value >> 32) as u32; + } + 0x8 => { + self.table_entries[index].msg_data = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].vector_ctl = (value >> 32) as u32; + } + _ => error!("invalid offset"), + }; + + debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); + } + _ => error!("invalid data length"), + }; + + let table_entry = &self.table_entries[index]; + + // Optimisation to avoid excessive updates + if &old_entry == table_entry { + return; + } + + // Update interrupt routes + // Optimisation: only update routes if the entry is not masked; + // this is safe because if the entry is masked (starts masked as per spec) + // in the table then it won't be triggered. (See: #4273) + if self.enabled && !self.masked && !table_entry.masked() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + index as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + + // After the MSI-X table entry has been updated, it is necessary to + // check if the vector control masking bit has changed. 
In case the + // bit has been flipped from 1 to 0, we need to inject a MSI message + // if the corresponding pending bit from the PBA is set. Once the MSI + // has been injected, the pending bit in the PBA needs to be cleared. + // All of this is valid only if MSI-X has not been masked for the whole + // device. + + // Check if bit has been flipped + if !self.masked() + && self.enabled() + && old_entry.masked() + && !table_entry.masked() + && self.get_pba_bit(index as u16) == 1 + { + self.inject_msix_and_clear_pba(index); + } + } + + pub fn read_pba(&mut self, offset: u64, data: &mut [u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO; + + if index >= self.pba_entries.len() { + debug!("Invalid MSI-X PBA entry index {index}"); + data.copy_from_slice(&[0xff; 8][..data.len()]); + return; + } + + match data.len() { + 4 => { + let value: u32 = match modulo_offset { + 0x0 => (self.pba_entries[index] & 0xffff_ffffu64) as u32, + 0x4 => (self.pba_entries[index] >> 32) as u32, + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u32(data, value); + } + 8 => { + let value: u64 = match modulo_offset { + 0x0 => self.pba_entries[index], + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u64(data, value); + } + _ => { + error!("invalid data length"); + } + } + } + + pub fn write_pba(&mut self, _offset: u64, _data: &[u8]) { + error!("Pending Bit Array is read only"); + } + + pub fn set_pba_bit(&mut self, vector: u16, reset: bool) { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + let mut mask: u64 = (1 << shift) as u64; + + if reset { + mask = !mask; + 
self.pba_entries[index] &= mask; + } else { + self.pba_entries[index] |= mask; + } + } + + fn get_pba_bit(&self, vector: u16) -> u8 { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + + ((self.pba_entries[index] >> shift) & 0x0000_0001u64) as u8 + } + + fn inject_msix_and_clear_pba(&mut self, vector: usize) { + // Inject the MSI message + match self + .interrupt_source_group + .trigger(vector as InterruptIndex) + { + Ok(_) => debug!("MSI-X injected on vector control flip"), + Err(e) => error!("failed to inject MSI-X: {}", e), + } + + // Clear the bit from PBA + self.set_pba_bit(vector as u16, true); + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default, Serialize, Deserialize)] +pub struct MsixCap { + // Message Control Register + // 10-0: MSI-X Table size + // 13-11: Reserved + // 14: Mask. Mask all MSI-X when set. + // 15: Enable. Enable all MSI-X when set. + pub msg_ctl: u16, + // Table. Contains the offset and the BAR indicator (BIR) + // 2-0: Table BAR indicator (BIR). Can be 0 to 5. + // 31-3: Table offset in the BAR pointed by the BIR. + pub table: u32, + // Pending Bit Array. Contains the offset and the BAR indicator (BIR) + // 2-0: PBA BAR indicator (BIR). Can be 0 to 5. + // 31-3: PBA offset in the BAR pointed by the BIR. + pub pba: u32, +} + +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for MsixCap {} + +impl PciCapability for MsixCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::MsiX + } +} + +impl MsixCap { + pub fn new( + table_pci_bar: u8, + table_size: u16, + table_off: u32, + pba_pci_bar: u8, + pba_off: u32, + ) -> Self { + assert!(table_size < MAX_MSIX_VECTORS_PER_DEVICE); + + // Set the table size and enable MSI-X. 
+ let msg_ctl: u16 = 0x8000u16 + table_size - 1; + + MsixCap { + msg_ctl, + table: (table_off & 0xffff_fff8u32) | u32::from(table_pci_bar & 0x7u8), + pba: (pba_off & 0xffff_fff8u32) | u32::from(pba_pci_bar & 0x7u8), + } + } + + pub fn set_msg_ctl(&mut self, data: u16) { + self.msg_ctl = (self.msg_ctl & !(FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)) + | (data & (FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)); + } + + pub fn masked(&self) -> bool { + (self.msg_ctl >> FUNCTION_MASK_BIT) & 0x1 == 0x1 + } + + pub fn enabled(&self) -> bool { + (self.msg_ctl >> MSIX_ENABLE_BIT) & 0x1 == 0x1 + } + + pub fn table_offset(&self) -> u32 { + self.table & 0xffff_fff8 + } + + pub fn pba_offset(&self) -> u32 { + self.pba & 0xffff_fff8 + } + + pub fn table_set_offset(&mut self, addr: u32) { + self.table &= 0x7; + self.table += addr; + } + + pub fn pba_set_offset(&mut self, addr: u32) { + self.pba &= 0x7; + self.pba += addr; + } + + pub fn table_bir(&self) -> u32 { + self.table & 0x7 + } + + pub fn pba_bir(&self) -> u32 { + self.pba & 0x7 + } + + pub fn table_size(&self) -> u16 { + (self.msg_ctl & 0x7ff) + 1 + } + + pub fn table_range(&self) -> (u64, u64) { + // The table takes 16 bytes per entry. + let size = self.table_size() as u64 * 16; + (self.table_offset() as u64, size) + } + + pub fn pba_range(&self) -> (u64, u64) { + // The table takes 1 bit per entry modulo 8 bytes. 
+ let size = ((self.table_size() as u64 / 64) + 1) * 8; + (self.pba_offset() as u64, size) + } +} diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 2b2763d8d65..b95d23bf1b1 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -38,6 +38,7 @@ log = { version = "0.4.27", features = ["std", "serde"] } log-instrument = { path = "../log-instrument", optional = true } memfd = "0.6.3" micro_http = { git = "https://github.com/firecracker-microvm/micro-http" } +pci = { path = "../pci" } semver = { version = "1.0.26", features = ["serde"] } serde = { version = "1.0.219", features = ["derive", "rc"] } serde_json = "1.0.140" @@ -46,8 +47,9 @@ thiserror = "2.0.12" timerfd = "1.5.0" userfaultfd = "0.8.1" utils = { path = "../utils" } +uuid = "1.16.0" vhost = { version = "0.13.0", features = ["vhost-user-frontend"] } -vm-allocator = "0.1.0" +vm-allocator = "0.1.2" vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.1", features = [ "backend-mmap", diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 542e53409b7..a3e471aed9e 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::fadt::{FADT_F_HW_REDUCED_ACPI, FADT_F_PWR_BUTTON, FADT_F_SLP_BUTTON}; -use acpi_tables::{Aml, Dsdt, Fadt, Madt, Rsdp, Sdt, Xsdt, aml}; +use acpi_tables::{Aml, Dsdt, Fadt, Madt, Mcfg, Rsdp, Sdt, Xsdt, aml}; use log::{debug, error}; use vm_allocator::AllocPolicy; @@ -10,6 +10,7 @@ use crate::Vcpu; use crate::acpi::x86_64::{ apic_addr, rsdp_addr, setup_arch_dsdt, setup_arch_fadt, setup_interrupt_controllers, }; +use crate::arch::x86_64::layout; use crate::device_manager::DeviceManager; use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -53,7 +54,7 @@ impl AcpiTableWriter<'_> { /// buffer. It returns the address in which it wrote the table. 
fn write_acpi_table( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, table: &mut S, ) -> Result where @@ -90,11 +91,15 @@ impl AcpiTableWriter<'_> { .acpi_devices .append_aml_bytes(&mut dsdt_data)?; + if let Some(pci_segment) = &device_manager.pci_devices.pci_segment { + pci_segment.append_aml_bytes(&mut dsdt_data)?; + } + // Architecture specific DSDT data setup_arch_dsdt(&mut dsdt_data)?; let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); - self.write_acpi_table(&mut device_manager.resource_allocator, &mut dsdt) + self.write_acpi_table(&device_manager.resource_allocator, &mut dsdt) } /// Build the FADT table for the guest @@ -102,7 +107,7 @@ impl AcpiTableWriter<'_> { /// This includes a pointer with the location of the DSDT in guest memory fn build_fadt( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, dsdt_addr: u64, ) -> Result { let mut fadt = Fadt::new(OEM_ID, *b"FCVMFADT", OEM_REVISION); @@ -120,7 +125,7 @@ impl AcpiTableWriter<'_> { /// This includes information about the interrupt controllers supported in the platform fn build_madt( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, nr_vcpus: u8, ) -> Result { let mut madt = Madt::new( @@ -138,19 +143,30 @@ impl AcpiTableWriter<'_> { /// Currently, we pass to the guest just FADT and MADT tables. fn build_xsdt( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, fadt_addr: u64, madt_addr: u64, + mcfg_addr: u64, ) -> Result { let mut xsdt = Xsdt::new( OEM_ID, *b"FCMVXSDT", OEM_REVISION, - vec![fadt_addr, madt_addr], + vec![fadt_addr, madt_addr, mcfg_addr], ); self.write_acpi_table(resource_allocator, &mut xsdt) } + /// Build the MCFG table for the guest. 
+ fn build_mcfg( + &mut self, + resource_allocator: &ResourceAllocator, + pci_mmio_config_addr: u64, + ) -> Result { + let mut mcfg = Mcfg::new(OEM_ID, *b"FCMVMCFG", OEM_REVISION, pci_mmio_config_addr); + self.write_acpi_table(resource_allocator, &mut mcfg) + } + /// Build the RSDP pointer for the guest. /// /// This will build the RSDP pointer which points to the XSDT table and write it in guest @@ -180,15 +196,23 @@ pub(crate) fn create_acpi_tables( vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { mem }; - let dsdt_addr = writer.build_dsdt(device_manager)?; - let fadt_addr = writer.build_fadt(&mut device_manager.resource_allocator, dsdt_addr)?; + + let fadt_addr = writer.build_fadt(&device_manager.resource_allocator, dsdt_addr)?; let madt_addr = writer.build_madt( - &mut device_manager.resource_allocator, + &device_manager.resource_allocator, vcpus.len().try_into().unwrap(), )?; - let xsdt_addr = - writer.build_xsdt(&mut device_manager.resource_allocator, fadt_addr, madt_addr)?; + let mcfg_addr = writer.build_mcfg( + &device_manager.resource_allocator, + layout::PCI_MMCONFIG_START, + )?; + let xsdt_addr = writer.build_xsdt( + &device_manager.resource_allocator, + fadt_addr, + madt_addr, + mcfg_addr, + )?; writer.build_rsdp(xsdt_addr) } @@ -227,7 +251,7 @@ mod tests { #[test] fn test_write_acpi_table_memory_allocation() { // A mocke Vmm object with 128MBs of memory - let mut vmm = default_vmm(); + let vmm = default_vmm(); let mut writer = AcpiTableWriter { mem: vmm.vm.guest_memory(), }; @@ -235,14 +259,14 @@ mod tests { // This should succeed let mut sdt = MockSdt(vec![0; 4096]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); let err = writer - 
.write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( @@ -257,27 +281,27 @@ mod tests { // succeed. let mut sdt = MockSdt(vec![0; 5]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } @@ -294,11 +318,11 @@ mod tests { let mut writer = AcpiTableWriter { mem: vm.guest_memory(), }; - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE).unwrap()]); let err = writer - .write_acpi_table(&mut resource_allocator, &mut sdt) + .write_acpi_table(&resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( diff --git a/src/vmm/src/arch/aarch64/fdt.rs 
b/src/vmm/src/arch/aarch64/fdt.rs index be53ef6993d..8e67a50bd64 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -13,8 +13,13 @@ use vm_memory::GuestMemoryError; use super::cache_info::{CacheEntry, read_cache_config}; use super::gic::GICDevice; +use crate::arch::{ + MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, MEM_64BIT_DEVICES_SIZE, + MEM_64BIT_DEVICES_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, +}; use crate::device_manager::DeviceManager; use crate::device_manager::mmio::MMIODeviceInfo; +use crate::device_manager::pci_mngr::PciDevices; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; @@ -90,6 +95,7 @@ pub fn create_fdt( create_psci_node(&mut fdt_writer)?; create_devices_node(&mut fdt_writer, device_manager)?; create_vmgenid_node(&mut fdt_writer, &device_manager.acpi_devices.vmgenid)?; + create_pci_nodes(&mut fdt_writer, &device_manager.pci_devices)?; // End Header node. fdt_writer.end_node(root)?; @@ -431,6 +437,63 @@ fn create_devices_node( Ok(()) } +fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), FdtError> { + if pci_devices.pci_segment.is_none() { + return Ok(()); + } + + // Fine to unwrap here, we just checked it's not `None`. + let segment = pci_devices.pci_segment.as_ref().unwrap(); + + let pci_node_name = format!("pci@{:x}", segment.mmio_config_address); + // Each range here is a thruple of `(PCI address, CPU address, PCI size)`. 
+ // + // More info about the format can be found here: + // https://elinux.org/Device_Tree_Usage#PCI_Address_Translation + let ranges = [ + // 32bit addresses + 0x200_0000u32, + (MEM_32BIT_DEVICES_START >> 32) as u32, // PCI address + (MEM_32BIT_DEVICES_START & 0xffff_ffff) as u32, + (MEM_32BIT_DEVICES_START >> 32) as u32, // CPU address + (MEM_32BIT_DEVICES_START & 0xffff_ffff) as u32, + (MEM_32BIT_DEVICES_SIZE >> 32) as u32, // Range size + (MEM_32BIT_DEVICES_SIZE & 0xffff_ffff) as u32, + // 64bit addresses + 0x300_0000u32, + // PCI address + (MEM_64BIT_DEVICES_START >> 32) as u32, // PCI address + (MEM_64BIT_DEVICES_START & 0xffff_ffff) as u32, + // CPU address + (MEM_64BIT_DEVICES_START >> 32) as u32, // CPU address + (MEM_64BIT_DEVICES_START & 0xffff_ffff) as u32, + // Range size + (MEM_64BIT_DEVICES_SIZE >> 32) as u32, // Range size + (MEM_64BIT_DEVICES_SIZE & 0xffff_ffff) as u32, + ]; + let pci_node = fdt.begin_node(&pci_node_name)?; + + fdt.property_string("compatible", "pci-host-ecam-generic")?; + fdt.property_string("device_type", "pci")?; + fdt.property_array_u32("ranges", &ranges)?; + fdt.property_array_u32("bus-range", &[0, 0])?; + fdt.property_u32("linux,pci-domain", segment.id.into())?; + fdt.property_u32("#address-cells", 3)?; + fdt.property_u32("#size-cells", 2)?; + fdt.property_array_u64( + "reg", + &[ + segment.mmio_config_address, + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + ], + )?; + fdt.property_u32("#interrupt-cells", 1)?; + fdt.property_null("interrupt-map")?; + fdt.property_null("interrupt-map-mask")?; + fdt.property_null("dma-coherent")?; + Ok(fdt.end_node(pci_node)?)
+} + #[cfg(test)] mod tests { use std::ffi::CString; @@ -477,7 +540,7 @@ mod tests { .register_virtio_test_device( &vm, mem.clone(), - &mut device_manager.resource_allocator, + &device_manager.resource_allocator, dummy, &mut cmdline, "dummy", diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs index 22aaa4b4b74..c4b9208a0a6 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs @@ -30,7 +30,7 @@ impl GICv2 { /// Get the address of the GICv2 distributor. const fn get_dist_addr() -> u64 { - super::layout::MAPPED_IO_START - GICv2::KVM_VGIC_V2_DIST_SIZE + super::layout::MMIO32_MEM_START - GICv2::KVM_VGIC_V2_DIST_SIZE } /// Get the size of the GIC_v2 distributor. diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs index 558b47ab065..39c4e5ce148 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs @@ -30,7 +30,7 @@ impl GICv3 { /// Get the address of the GIC distributor. fn get_dist_addr() -> u64 { - super::layout::MAPPED_IO_START - GICv3::KVM_VGIC_V3_DIST_SIZE + super::layout::MMIO32_MEM_START - GICv3::KVM_VGIC_V3_DIST_SIZE } /// Get the size of the GIC distributor. 
diff --git a/src/vmm/src/arch/aarch64/layout.rs b/src/vmm/src/arch/aarch64/layout.rs index 922cfbb66e6..bdecc712696 100644 --- a/src/vmm/src/arch/aarch64/layout.rs +++ b/src/vmm/src/arch/aarch64/layout.rs @@ -4,51 +4,53 @@ // ==== Address map in use in ARM development systems today ==== // // - 32-bit - - 36-bit - - 40-bit - -// 1024GB + + +-------------------+ <- 40-bit +// 1024GB + + +-------------------+ <- 40-bit // | | DRAM | // ~ ~ ~ ~ // | | | // | | | // | | | // | | | -// 544GB + + +-------------------+ +// 544GB + + +-------------------+ // | | Hole or DRAM | // | | | -// 512GB + + +-------------------+ +// 512GB + + +-------------------+ // | | Mapped | // | | I/O | // ~ ~ ~ ~ // | | | -// 256GB + + +-------------------+ +// 256GB + + +-------------------+ // | | Reserved | // ~ ~ ~ ~ // | | | -// 64GB + +-----------------------+-------------------+ <- 36-bit +// 64GB + +-----------------------+-------------------+ <- 36-bit // | | DRAM | // ~ ~ ~ ~ // | | | // | | | -// 34GB + +-----------------------+-------------------+ +// 34GB + +-----------------------+-------------------+ // | | Hole or DRAM | -// 32GB + +-----------------------+-------------------+ +// 32GB + +-----------------------+-------------------+ // | | Mapped I/O | // ~ ~ ~ ~ // | | | -// 16GB + +-----------------------+-------------------+ +// 16GB + +-----------------------+-------------------+ // | | Reserved | // ~ ~ ~ ~ -// 4GB +-------------------+-----------------------+-------------------+ <- 32-bit +// 4GB +-------------------+-----------------------+-------------------+ <- 32-bit // | 2GB of DRAM | // | | -// 2GB +-------------------+-----------------------+-------------------+ +// 2GB +-------------------+-----------------------+-------------------+ // | Mapped I/O | -// 1GB +-------------------+-----------------------+-------------------+ +// 1GB +-------------------+-----------------------+-------------------+ // | ROM & RAM & I/O | -// 0GB 
+-------------------+-----------------------+-------------------+ 0 +// 0GB +-------------------+-----------------------+-------------------+ 0 // - 32-bit - - 36-bit - - 40-bit - // // Taken from (http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf). +use crate::device_manager::mmio::MMIO_LEN; + /// Start of RAM on 64 bit ARM. pub const DRAM_MEM_START: u64 = 0x8000_0000; // 2 GB. /// The maximum RAM size. @@ -80,5 +82,46 @@ pub const IRQ_MAX: u32 = 128; /// First usable interrupt on aarch64. pub const IRQ_BASE: u32 = 32; +/// The start of the memory area reserved for MMIO 32-bit accesses. /// Below this address will reside the GIC, above this address will reside the MMIO devices. -pub const MAPPED_IO_START: u64 = 1 << 30; // 1 GB +pub const MMIO32_MEM_START: u64 = 1 << 30; // 1GiB +/// The size of the memory area reserved for MMIO 32-bit accesses (1GiB). +pub const MMIO32_MEM_SIZE: u64 = DRAM_MEM_START - MMIO32_MEM_START; + +// The rest of the MMIO address space (256 MiB) we dedicate to PCIe for memory-mapped access to +// configuration. +/// Size of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; +/// Start of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_START: u64 = DRAM_MEM_START - PCI_MMCONFIG_SIZE; +/// MMIO space per PCIe segment +pub const PCI_MMIO_CONFIG_SIZE_PER_SEGMENT: u64 = 4096 * 256; + +// We reserve 768 MiB for devices at the beginning of the MMIO region. This includes space both for +// pure MMIO and PCIe devices. + +/// Memory region start for boot device. +pub const BOOT_DEVICE_MEM_START: u64 = MMIO32_MEM_START; +/// Memory region start for RTC device. +pub const RTC_MEM_START: u64 = BOOT_DEVICE_MEM_START + MMIO_LEN; +/// Memory region start for Serial device. 
+pub const SERIAL_MEM_START: u64 = RTC_MEM_START + MMIO_LEN; + +/// Beginning of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_START: u64 = SERIAL_MEM_START + MMIO_LEN; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_SIZE: u64 = PCI_MMCONFIG_START - MEM_32BIT_DEVICES_START; + +// 64-bits region for MMIO accesses +/// The start of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_START: u64 = 256 << 30; +/// The size of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_SIZE: u64 = 256 << 30; + +// At the moment, all of this region goes to devices +/// Beginning of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_START: u64 = MMIO64_MEM_START; +/// Size of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_SIZE: u64 = MMIO64_MEM_SIZE; +/// First address past the 64-bit MMIO gap +pub const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + MMIO64_MEM_SIZE; diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index 6d1d0e26359..df6e712dcf5 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -24,11 +24,11 @@ use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; use vm_memory::GuestMemoryError; -use crate::arch::{BootProtocol, EntryPoint}; +use crate::arch::{BootProtocol, EntryPoint, arch_memory_regions_with_gap}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; use crate::cpu_config::templates::CustomCpuTemplate; use crate::initrd::InitrdConfig; -use crate::utils::{align_up, usize_to_u64}; +use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; use crate::vstate::vcpu::KvmVcpuError; @@ -51,42 +51,34 @@ pub enum ConfigurationError {
VcpuConfigure(#[from] KvmVcpuError), } -/// The start of the memory area reserved for MMIO devices. -pub const MMIO_MEM_START: u64 = layout::MAPPED_IO_START; -/// The size of the memory area reserved for MMIO devices. -pub const MMIO_MEM_SIZE: u64 = layout::DRAM_MEM_START - layout::MAPPED_IO_START; //>> 1GB - /// Returns a Vec of the valid memory addresses for aarch64. /// See [`layout`](layout) module for a drawing of the specific memory model for this platform. -/// -/// The `offset` parameter specified the offset from [`layout::DRAM_MEM_START`]. -pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usize)> { +pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { assert!(size > 0, "Attempt to allocate guest memory of length 0"); - assert!( - offset.checked_add(size).is_some(), - "Attempt to allocate guest memory such that the address space would wrap around" - ); - assert!( - offset < layout::DRAM_MEM_MAX_SIZE, - "offset outside allowed DRAM range" - ); - let dram_size = min(size, layout::DRAM_MEM_MAX_SIZE - offset); + let dram_size = min(size, layout::DRAM_MEM_MAX_SIZE); if dram_size != size { logger::warn!( - "Requested offset/memory size {}/{} exceeds architectural maximum (1022GiB). Size has \ - been truncated to {}", - offset, + "Requested memory size {} exceeds architectural maximum (1022GiB). Size has been \ + truncated to {}", size, dram_size ); } - vec![( - GuestAddress(layout::DRAM_MEM_START + offset as u64), + let mut regions = vec![]; + if let Some((offset, remaining)) = arch_memory_regions_with_gap( + &mut regions, + u64_to_usize(layout::DRAM_MEM_START), dram_size, - )] + u64_to_usize(layout::MMIO64_MEM_START), + u64_to_usize(layout::MMIO64_MEM_SIZE), + ) { + regions.push((GuestAddress(offset as u64), remaining)); + } + + regions } /// Configures the system for booting Linux. 
@@ -211,39 +203,66 @@ pub fn load_kernel( #[cfg(kani)] mod verification { - use vm_memory::GuestAddress; - - use crate::arch::aarch64::layout; + use crate::arch::aarch64::layout::{ + DRAM_MEM_MAX_SIZE, DRAM_MEM_START, FIRST_ADDR_PAST_64BITS_MMIO, MMIO64_MEM_START, + }; use crate::arch::arch_memory_regions; #[kani::proof] #[kani::unwind(3)] fn verify_arch_memory_regions() { - let offset: u64 = kani::any::(); - let len: u64 = kani::any::(); - + let len: usize = kani::any::(); kani::assume(len > 0); - kani::assume(offset.checked_add(len).is_some()); - kani::assume(offset < layout::DRAM_MEM_MAX_SIZE as u64); - let regions = arch_memory_regions(offset as usize, len as usize); + let regions = arch_memory_regions(len); - // No MMIO gap on ARM - assert_eq!(regions.len(), 1); + for region in ®ions { + println!( + "region: [{:x}:{:x})", + region.0.0, + region.0.0 + region.1 as u64 + ); + } - let (GuestAddress(start), actual_len) = regions[0]; - let actual_len = actual_len as u64; + // On Arm we have one MMIO gap that might fall within addressable ranges, + // so we can get either 1 or 2 regions. + assert!(regions.len() >= 1); + assert!(regions.len() <= 2); - assert_eq!(start, layout::DRAM_MEM_START + offset); - assert!(actual_len <= layout::DRAM_MEM_MAX_SIZE as u64); + // The total length of all regions cannot exceed DRAM_MEM_MAX_SIZE + let actual_len = regions.iter().map(|&(_, len)| len).sum::(); + assert!(actual_len <= DRAM_MEM_MAX_SIZE); + // The total length is smaller or equal to the length we asked assert!(actual_len <= len); + // If it's smaller, it's because we asked more than the the maximum possible. 
+ if (actual_len) < len { + assert!(len > DRAM_MEM_MAX_SIZE); + } - if actual_len < len { - assert_eq!( - start + actual_len, - layout::DRAM_MEM_START + layout::DRAM_MEM_MAX_SIZE as u64 - ); - assert!(offset + len >= layout::DRAM_MEM_MAX_SIZE as u64); + // No region overlaps the 64-bit MMIO gap + assert!( + regions + .iter() + .all(|&(start, len)| start.0 >= FIRST_ADDR_PAST_64BITS_MMIO + || start.0 + len as u64 <= MMIO64_MEM_START) + ); + + // All regions start after our DRAM_MEM_START + assert!(regions.iter().all(|&(start, _)| start.0 >= DRAM_MEM_START)); + + // All regions have non-zero length + assert!(regions.iter().all(|&(_, len)| len > 0)); + + // If there's two regions, they perfectly snuggle up the 64bit MMIO gap + if regions.len() == 2 { + kani::cover!(); + + // The very first address should be DRAM_MEM_START + assert_eq!(regions[0].0.0, DRAM_MEM_START); + // The first region ends at the beginning of the 64 bits gap. + assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO64_MEM_START); + // The second region starts exactly after the 64 bits gap. 
+ assert_eq!(regions[1].0.0, FIRST_ADDR_PAST_64BITS_MMIO); } } } @@ -251,33 +270,42 @@ mod verification { #[cfg(test)] mod tests { use super::*; + use crate::arch::aarch64::layout::{ + DRAM_MEM_MAX_SIZE, DRAM_MEM_START, FDT_MAX_SIZE, FIRST_ADDR_PAST_64BITS_MMIO, + MMIO64_MEM_START, + }; use crate::test_utils::arch_mem; #[test] fn test_regions_lt_1024gb() { - let regions = arch_memory_regions(0, 1usize << 29); + let regions = arch_memory_regions(1usize << 29); assert_eq!(1, regions.len()); - assert_eq!(GuestAddress(super::layout::DRAM_MEM_START), regions[0].0); + assert_eq!(GuestAddress(DRAM_MEM_START), regions[0].0); assert_eq!(1usize << 29, regions[0].1); } #[test] fn test_regions_gt_1024gb() { - let regions = arch_memory_regions(0, 1usize << 41); - assert_eq!(1, regions.len()); - assert_eq!(GuestAddress(super::layout::DRAM_MEM_START), regions[0].0); - assert_eq!(super::layout::DRAM_MEM_MAX_SIZE, regions[0].1); + let regions = arch_memory_regions(1usize << 41); + assert_eq!(2, regions.len()); + assert_eq!(GuestAddress(DRAM_MEM_START), regions[0].0); + assert_eq!(MMIO64_MEM_START - DRAM_MEM_START, regions[0].1 as u64); + assert_eq!(GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO), regions[1].0); + assert_eq!( + DRAM_MEM_MAX_SIZE as u64 - MMIO64_MEM_START + DRAM_MEM_START, + regions[1].1 as u64 + ); } #[test] fn test_get_fdt_addr() { - let mem = arch_mem(layout::FDT_MAX_SIZE - 0x1000); - assert_eq!(get_fdt_addr(&mem), layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE - 0x1000); + assert_eq!(get_fdt_addr(&mem), DRAM_MEM_START); - let mem = arch_mem(layout::FDT_MAX_SIZE); - assert_eq!(get_fdt_addr(&mem), layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE); + assert_eq!(get_fdt_addr(&mem), DRAM_MEM_START); - let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - assert_eq!(get_fdt_addr(&mem), 0x1000 + layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE + 0x1000); + assert_eq!(get_fdt_addr(&mem), 0x1000 + DRAM_MEM_START); } } diff --git 
a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 61d65fea1a5..3693feed04b 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -20,10 +20,13 @@ pub use aarch64::vcpu::*; pub use aarch64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "aarch64")] pub use aarch64::{ - ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, - configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::CMDLINE_MAX_SIZE, - layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, - load_kernel, + ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, + initrd_load_addr, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, layout::IRQ_BASE, + layout::IRQ_MAX, layout::MEM_32BIT_DEVICES_SIZE, layout::MEM_32BIT_DEVICES_START, + layout::MEM_64BIT_DEVICES_SIZE, layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, + layout::MMIO32_MEM_START, layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::RTC_MEM_START, layout::SERIAL_MEM_START, + layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, }; /// Module for x86_64 related functionality. 
@@ -39,10 +42,14 @@ pub use x86_64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "x86_64")] pub use crate::arch::x86_64::{ - ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, - configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::APIC_ADDR, - layout::CMDLINE_MAX_SIZE, layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX, - layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, + ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, + initrd_load_addr, layout::APIC_ADDR, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, + layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX, layout::MEM_32BIT_DEVICES_SIZE, + layout::MEM_32BIT_DEVICES_START, layout::MEM_64BIT_DEVICES_SIZE, + layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, layout::MMIO32_MEM_START, + layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, + load_kernel, }; /// Types of devices that can get attached to this platform. @@ -114,3 +121,32 @@ pub struct EntryPoint { /// Specifies which boot protocol to use pub protocol: BootProtocol, } + +/// Adds in [`regions`] the valid memory regions suitable for RAM taking into account a gap in the +/// available address space and returns the remaining region (if any) past this gap +fn arch_memory_regions_with_gap( + regions: &mut Vec<(GuestAddress, usize)>, + region_start: usize, + region_size: usize, + gap_start: usize, + gap_size: usize, +) -> Option<(usize, usize)> { + // 0-sized gaps don't really make sense. We should never receive such a gap. 
+ assert!(gap_size > 0); + + let first_addr_past_gap = gap_start + gap_size; + match (region_start + region_size).checked_sub(gap_start) { + // case0: region fits all before gap + None | Some(0) => { + regions.push((GuestAddress(region_start as u64), region_size)); + None + } + // case1: region starts before the gap and goes past it + Some(remaining) if region_start < gap_start => { + regions.push((GuestAddress(region_start as u64), gap_start - region_start)); + Some((first_addr_past_gap, remaining)) + } + // case2: region starts past the gap + Some(_) => Some((first_addr_past_gap.max(region_start), region_size)), + } +} diff --git a/src/vmm/src/arch/x86_64/layout.rs b/src/vmm/src/arch/x86_64/layout.rs index 18d718a49b8..8ae558e91c3 100644 --- a/src/vmm/src/arch/x86_64/layout.rs +++ b/src/vmm/src/arch/x86_64/layout.rs @@ -7,6 +7,9 @@ //! Magic addresses externally used to lay out x86_64 VMs. +use crate::device_manager::mmio::MMIO_LEN; +use crate::utils::mib_to_bytes; + /// Initial stack for the boot CPU. pub const BOOT_STACK_POINTER: u64 = 0x8ff0; @@ -77,3 +80,45 @@ pub const SYSTEM_MEM_START: u64 = 0x9fc00; /// 257KiB is more than we need, however we reserve this space for potential future use of /// ACPI features (new tables and/or devices). pub const SYSTEM_MEM_SIZE: u64 = RSDP_ADDR - SYSTEM_MEM_START; + +/// First address that cannot be addressed using 32 bit anymore. +pub const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32; + +/// The size of the memory area reserved for MMIO 32-bit accesses. +pub const MMIO32_MEM_SIZE: u64 = mib_to_bytes(1024) as u64; +/// The start of the memory area reserved for MMIO 32-bit accesses. +pub const MMIO32_MEM_START: u64 = FIRST_ADDR_PAST_32BITS - MMIO32_MEM_SIZE; + +// We dedicate the last 256 MiB of the 32-bit MMIO address space PCIe for memory-mapped access to +// configuration. +/// Size of MMIO region for PCIe configuration accesses. 
+pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; +/// Start of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_START: u64 = IOAPIC_ADDR as u64 - PCI_MMCONFIG_SIZE; +/// MMIO space per PCIe segment +pub const PCI_MMIO_CONFIG_SIZE_PER_SEGMENT: u64 = 4096 * 256; + +// We reserve 768 MiB for devices at the beginning of the MMIO region. This includes space both for +// pure MMIO and PCIe devices. + +/// Memory region start for boot device. +pub const BOOT_DEVICE_MEM_START: u64 = MMIO32_MEM_START; + +/// Beginning of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_START: u64 = BOOT_DEVICE_MEM_START + MMIO_LEN; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_SIZE: u64 = PCI_MMCONFIG_START - MEM_32BIT_DEVICES_START; + +// 64-bits region for MMIO accesses +/// The start of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_START: u64 = 256 << 30; +/// The size of the memory area reserved for MMIO 64-bit accesses. 
+pub const MMIO64_MEM_SIZE: u64 = 256 << 30; + +// At the moment, all of this region goes to devices +/// Beginning of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_START: u64 = MMIO64_MEM_START; +/// Size of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_SIZE: u64 = MMIO64_MEM_SIZE; +/// First address past the 64-bit MMIO gap +pub const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + MMIO64_MEM_SIZE; diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index c54ec46c987..fe1296e5d1c 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -33,7 +33,10 @@ pub mod generated; use std::fs::File; -use layout::CMDLINE_START; +use layout::{ + CMDLINE_START, FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, + MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, +}; use linux_loader::configurator::linux::LinuxBootConfigurator; use linux_loader::configurator::pvh::PvhBootConfigurator; use linux_loader::configurator::{BootConfigurator, BootParams}; @@ -47,17 +50,17 @@ use log::debug; use super::EntryPoint; use crate::acpi::create_acpi_tables; -use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START}; +use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, arch_memory_regions_with_gap}; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; use crate::cpu_config::x86_64::CpuConfiguration; use crate::initrd::InitrdConfig; -use crate::utils::{align_down, mib_to_bytes, u64_to_usize, usize_to_u64}; +use crate::utils::{align_down, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{ Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, }; use crate::vstate::vcpu::KvmVcpuConfigureError; -use crate::{Vcpu, VcpuConfig, Vmm}; +use crate::{Vcpu, VcpuConfig, Vmm, logger}; // Value taken from
https://elixir.bootlin.com/linux/v5.10.68/source/arch/x86/include/uapi/asm/e820.h#L31 // Usable normal RAM @@ -96,48 +99,53 @@ pub enum ConfigurationError { Acpi(#[from] crate::acpi::AcpiError), } -/// First address that cannot be addressed using 32 bit anymore. -pub const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32; - -/// Size of MMIO gap at top of 32-bit address space. -pub const MEM_32BIT_GAP_SIZE: u64 = mib_to_bytes(768) as u64; -/// The start of the memory area reserved for MMIO devices. -pub const MMIO_MEM_START: u64 = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE; -/// The size of the memory area reserved for MMIO devices. -pub const MMIO_MEM_SIZE: u64 = MEM_32BIT_GAP_SIZE; - /// Returns a Vec of the valid memory addresses. /// These should be used to configure the GuestMemoryMmap structure for the platform. -/// For x86_64 all addresses are valid from the start of the kernel except a -/// carve out at the end of 32bit address space. -pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usize)> { +/// For x86_64 all addresses are valid from the start of the kernel except an 1GB +/// carve out at the end of 32bit address space and a second 256GB one at the 256GB limit. +pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { // If we get here with size == 0 something has seriously gone wrong. Firecracker should never // try to allocate guest memory of size 0 assert!(size > 0, "Attempt to allocate guest memory of length 0"); - assert!( - offset.checked_add(size).is_some(), - "Attempt to allocate guest memory such that the address space would wrap around" + + let dram_size = std::cmp::min( + usize::MAX - u64_to_usize(MMIO32_MEM_SIZE) - u64_to_usize(MMIO64_MEM_SIZE), + size, ); - // It's safe to cast MMIO_MEM_START to usize because it fits in a u32 variable - // (It points to an address in the 32 bit space). 
- match (size + offset).checked_sub(u64_to_usize(MMIO_MEM_START)) { - // case1: guest memory fits before the gap - None | Some(0) => vec![(GuestAddress(offset as u64), size)], - // case2: starts before the gap, but doesn't completely fit - Some(remaining) if (offset as u64) < MMIO_MEM_START => vec![ - ( - GuestAddress(offset as u64), - u64_to_usize(MMIO_MEM_START) - offset, - ), - (GuestAddress(FIRST_ADDR_PAST_32BITS), remaining), - ], - // case3: guest memory start after the gap - Some(_) => vec![( - GuestAddress(FIRST_ADDR_PAST_32BITS.max(offset as u64)), + if dram_size != size { + logger::warn!( + "Requested memory size {} exceeds architectural maximum (1022GiB). Size has been \ + truncated to {}", size, - )], + dram_size + ); } + + let mut regions = vec![]; + + if let Some((start_past_32bit_gap, remaining_past_32bit_gap)) = arch_memory_regions_with_gap( + &mut regions, + 0, + dram_size, + u64_to_usize(MMIO32_MEM_START), + u64_to_usize(MMIO32_MEM_SIZE), + ) { + if let Some((start_past_64bit_gap, remaining_past_64bit_gap)) = arch_memory_regions_with_gap( + &mut regions, + start_past_32bit_gap, + remaining_past_32bit_gap, + u64_to_usize(MMIO64_MEM_START), + u64_to_usize(MMIO64_MEM_SIZE), + ) { + regions.push(( + GuestAddress(start_past_64bit_gap as u64), + remaining_past_64bit_gap, + )); + } + } + + regions } /// Returns the memory address where the kernel could be loaded. 
@@ -205,7 +213,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vmm.vm.guest_memory(), - &mut vmm.device_manager.resource_allocator, + &vmm.device_manager.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -237,7 +245,9 @@ fn configure_pvh( ) -> Result<(), ConfigurationError> { const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336e_c578; let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); + let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); + let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); // Vector to hold modules (currently either empty or holding initrd). @@ -269,32 +279,42 @@ fn configure_pvh( type_: E820_RESERVED, ..Default::default() }); + memmap.push(hvm_memmap_table_entry { + addr: PCI_MMCONFIG_START, + size: PCI_MMCONFIG_SIZE, + type_: E820_RESERVED, + ..Default::default() + }); let last_addr = guest_mem.last_addr(); - if last_addr < end_32bit_gap_start { + + if last_addr > first_addr_past_64bits { memmap.push(hvm_memmap_table_entry { - addr: himem_start.raw_value(), - size: last_addr.unchecked_offset_from(himem_start) + 1, + addr: first_addr_past_64bits.raw_value(), + size: last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, type_: MEMMAP_TYPE_RAM, ..Default::default() }); - } else { + } + + if last_addr > first_addr_past_32bits { memmap.push(hvm_memmap_table_entry { - addr: himem_start.raw_value(), - size: end_32bit_gap_start.unchecked_offset_from(himem_start), + addr: first_addr_past_32bits.raw_value(), + size: (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) + .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), type_: MEMMAP_TYPE_RAM, ..Default::default() }); - - if last_addr 
> first_addr_past_32bits { - memmap.push(hvm_memmap_table_entry { - addr: first_addr_past_32bits.raw_value(), - size: last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, - type_: MEMMAP_TYPE_RAM, - ..Default::default() - }); - } } + memmap.push(hvm_memmap_table_entry { + addr: himem_start.raw_value(), + size: end_32bit_gap_start + .unchecked_offset_from(himem_start) + .min(last_addr.unchecked_offset_from(himem_start) + 1), + type_: MEMMAP_TYPE_RAM, + ..Default::default() + }); + // Construct the hvm_start_info structure and serialize it into // boot_params. This will be stored at PVH_INFO_START address, and %rbx // will be initialized to contain PVH_INFO_START prior to starting the @@ -340,7 +360,9 @@ fn configure_64bit_boot( const KERNEL_LOADER_OTHER: u8 = 0xff; const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x0100_0000; // Must be non-zero. let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); + let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); + let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); @@ -371,39 +393,42 @@ fn configure_64bit_boot( layout::SYSTEM_MEM_SIZE, E820_RESERVED, )?; + add_e820_entry( + &mut params, + PCI_MMCONFIG_START, + PCI_MMCONFIG_SIZE, + E820_RESERVED, + )?; let last_addr = guest_mem.last_addr(); - if last_addr < end_32bit_gap_start { + + if last_addr > first_addr_past_64bits { add_e820_entry( &mut params, - himem_start.raw_value(), - // it's safe to use unchecked_offset_from because - // mem_end > himem_start - last_addr.unchecked_offset_from(himem_start) + 1, + first_addr_past_64bits.raw_value(), + last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, E820_RAM, )?; - } else { + } + + if last_addr > first_addr_past_32bits { add_e820_entry( &mut params, - himem_start.raw_value(), - // it's safe to use 
unchecked_offset_from because - // end_32bit_gap_start > himem_start - end_32bit_gap_start.unchecked_offset_from(himem_start), + first_addr_past_32bits.raw_value(), + (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) + .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), E820_RAM, )?; - - if last_addr > first_addr_past_32bits { - add_e820_entry( - &mut params, - first_addr_past_32bits.raw_value(), - // it's safe to use unchecked_offset_from because - // mem_end > first_addr_past_32bits - last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, - E820_RAM, - )?; - } } + add_e820_entry( + &mut params, + himem_start.raw_value(), + (last_addr.unchecked_offset_from(himem_start) + 1) + .min(end_32bit_gap_start.unchecked_offset_from(himem_start)), + E820_RAM, + )?; + LinuxBootConfigurator::write_bootparams( &BootParams::new(¶ms, GuestAddress(layout::ZERO_PAGE_START)), guest_mem, @@ -468,51 +493,69 @@ pub fn load_kernel( #[cfg(kani)] mod verification { - use crate::arch::x86_64::FIRST_ADDR_PAST_32BITS; - use crate::arch::{MMIO_MEM_START, arch_memory_regions}; + + use crate::arch::arch_memory_regions; + use crate::arch::x86_64::layout::{ + FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, MMIO32_MEM_START, + MMIO64_MEM_SIZE, MMIO64_MEM_START, + }; + use crate::utils::u64_to_usize; #[kani::proof] - #[kani::unwind(3)] + #[kani::unwind(4)] fn verify_arch_memory_regions() { - let offset: u64 = kani::any::(); let len: u64 = kani::any::(); kani::assume(len > 0); - kani::assume(offset.checked_add(len).is_some()); - let regions = arch_memory_regions(offset as usize, len as usize); + let regions = arch_memory_regions(len as usize); - // There's only one MMIO gap, so we can get either 1 or 2 regions - assert!(regions.len() <= 2); + // There are two MMIO gaps, so we can get either 1, 2 or 3 regions + assert!(regions.len() <= 3); assert!(regions.len() >= 1); + // The first address is always 0 + assert_eq!(regions[0].0.0, 0); + // 
The total length of all regions is what we requested - assert_eq!( - regions.iter().map(|&(_, len)| len).sum::(), - len as usize - ); + let actual_size = regions.iter().map(|&(_, len)| len).sum::(); + assert!(actual_size <= len as usize); + if actual_size < u64_to_usize(len) { + assert_eq!( + actual_size, + usize::MAX - u64_to_usize(MMIO32_MEM_SIZE) - u64_to_usize(MMIO64_MEM_SIZE) + ); + } // No region overlaps the MMIO gap assert!( regions .iter() - .all(|&(start, len)| start.0 >= FIRST_ADDR_PAST_32BITS - || start.0 + len as u64 <= MMIO_MEM_START) + .all(|&(start, len)| (start.0 >= FIRST_ADDR_PAST_32BITS + || start.0 + len as u64 <= MMIO32_MEM_START) + && (start.0 >= FIRST_ADDR_PAST_64BITS_MMIO + || start.0 + len as u64 <= MMIO64_MEM_START)) ); - // All regions start after our specified offset - assert!(regions.iter().all(|&(start, _)| start.0 >= offset as u64)); - // All regions have non-zero length assert!(regions.iter().all(|&(_, len)| len > 0)); - // If there's two regions, they perfectly snuggle up to the MMIO gap - if regions.len() == 2 { + // If there's at least two regions, they perfectly snuggle up to one of the two MMIO gaps + if regions.len() >= 2 { kani::cover!(); - assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO_MEM_START); + assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO32_MEM_START); assert_eq!(regions[1].0.0, FIRST_ADDR_PAST_32BITS); } + + // If there are three regions, the last two perfectly snuggle up to the 64bit + // MMIO gap + if regions.len() == 3 { + kani::cover!(); + + assert_eq!(regions[1].0.0 + regions[1].1 as u64, MMIO64_MEM_START); + assert_eq!(regions[2].0.0, FIRST_ADDR_PAST_64BITS_MMIO); + } } } @@ -523,37 +566,25 @@ mod tests { use super::*; use crate::device_manager::resources::ResourceAllocator; use crate::test_utils::{arch_mem, single_region_mem}; + use crate::utils::mib_to_bytes; #[test] fn regions_lt_4gb() { - let regions = arch_memory_regions(0, 1usize << 29); + let regions = arch_memory_regions(1usize << 29); 
assert_eq!(1, regions.len()); assert_eq!(GuestAddress(0), regions[0].0); assert_eq!(1usize << 29, regions[0].1); - - let regions = arch_memory_regions(1 << 28, 1 << 29); - assert_eq!(1, regions.len()); - assert_eq!(regions[0], (GuestAddress(1 << 28), 1 << 29)); } #[test] fn regions_gt_4gb() { const MEMORY_SIZE: usize = (1 << 32) + 0x8000; - let regions = arch_memory_regions(0, MEMORY_SIZE); + let regions = arch_memory_regions(MEMORY_SIZE); assert_eq!(2, regions.len()); assert_eq!(GuestAddress(0), regions[0].0); assert_eq!(GuestAddress(1u64 << 32), regions[1].0); - let regions = arch_memory_regions(1 << 31, MEMORY_SIZE); - assert_eq!(2, regions.len()); - assert_eq!( - regions[0], - ( - GuestAddress(1 << 31), - u64_to_usize(MMIO_MEM_START) - (1 << 31) - ) - ); assert_eq!( regions[1], ( @@ -567,8 +598,8 @@ mod tests { fn test_system_configuration() { let no_vcpus = 4; let gm = single_region_mem(0x10000); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - let err = mptable::setup_mptable(&gm, &mut resource_allocator, 1); + let resource_allocator = ResourceAllocator::new().unwrap(); + let err = mptable::setup_mptable(&gm, &resource_allocator, 1); assert!(matches!( err.unwrap_err(), mptable::MptableError::NotEnoughMemory @@ -577,24 +608,24 @@ mod tests { // Now assigning some memory that falls before the 32bit memory hole. let mem_size = mib_to_bytes(128); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that is equal to the start of the 32bit memory hole. 
let mem_size = mib_to_bytes(3328); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that falls after the 32bit memory hole. let mem_size = mib_to_bytes(3330); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); } diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index 6646c17e282..c397290c23e 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -116,7 +116,7 @@ fn compute_mp_size(num_cpus: u8) -> usize { /// Performs setup of the MP table for the given `num_cpus`. 
pub fn setup_mptable( mem: &GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, num_cpus: u8, ) -> Result<(), MptableError> { if num_cpus > MAX_SUPPORTED_CPUS { @@ -334,27 +334,27 @@ mod tests { fn bounds_check() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); } #[test] fn bounds_check_fails() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus) - 1); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap_err(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap_err(); } #[test] fn mpf_intel_checksum() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -365,9 +365,9 @@ mod tests { fn mpc_table_checksum() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = 
mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -388,9 +388,9 @@ mod tests { fn mpc_entry_count() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -419,8 +419,8 @@ mod tests { let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(MAX_SUPPORTED_CPUS)); for i in 0..MAX_SUPPORTED_CPUS { - let mut resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, i).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + setup_mptable(&mem, &resource_allocator, i).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -450,9 +450,9 @@ mod tests { fn cpu_entry_count_max() { let cpus = MAX_SUPPORTED_CPUS + 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - let result = setup_mptable(&mem, &mut resource_allocator, cpus).unwrap_err(); + let result = setup_mptable(&mem, &resource_allocator, cpus).unwrap_err(); assert_eq!(result, MptableError::TooManyCpus); } } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 48590201f2d..2c037fc529f 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -24,6 +24,7 @@ use crate::cpu_config::templates::{ }; #[cfg(target_arch = "aarch64")] use crate::device_manager::AttachLegacyMmioDeviceError; +use 
crate::device_manager::pci_mngr::PciManagerError; use crate::device_manager::{ AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DevicePersistError, DeviceRestoreArgs, }; @@ -71,6 +72,8 @@ pub enum StartMicrovmError { CreateLegacyDevice(device_manager::legacy::LegacyDeviceError), /// Error creating VMGenID device: {0} CreateVMGenID(VmGenIdError), + /// Error enabling PCIe support: {0} + EnablePciDevices(#[from] PciManagerError), /// Error enabling pvtime on vcpu: {0} #[cfg(target_arch = "aarch64")] EnablePVTime(crate::arch::VcpuArchError), @@ -214,6 +217,12 @@ pub fn build_microvm_for_boot( .map(|vcpu| vcpu.copy_kvm_vcpu_fd(vmm.vm())) .collect::, _>>()?; + if vm_resources.pci_enabled { + vmm.device_manager.enable_pci()?; + } else { + boot_cmdline.insert("pci", "off")?; + } + // The boot timer device needs to be the first device attached in order // to maintain the same MMIO address referenced in the documentation // and tests. @@ -1038,8 +1047,8 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5 virtio_mmio.device=4K@0xd0001000:6 \ - virtio_mmio.device=4K@0xd0002000:7" + "virtio_mmio.device=4K@0xc0001000:5 virtio_mmio.device=4K@0xc0002000:6 \ + virtio_mmio.device=4K@0xc0003000:7" )); } @@ -1137,7 +1146,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } @@ -1154,7 +1163,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } @@ -1173,7 +1182,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } } diff --git 
a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 398f97bc6ab..f730dd5be0d 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -21,6 +21,9 @@ use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use super::resources::ResourceAllocator; +use crate::arch::BOOT_DEVICE_MEM_START; +#[cfg(target_arch = "aarch64")] +use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::pseudo::BootTimer; @@ -166,7 +169,7 @@ impl MMIODeviceManager { /// Allocates resources for a new device to be added. fn allocate_mmio_resources( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, irq_count: u32, ) -> Result { let irq = match resource_allocator.allocate_gsi(irq_count)?[..] { @@ -176,7 +179,7 @@ impl MMIODeviceManager { }; let device_info = MMIODeviceInfo { - addr: resource_allocator.allocate_mmio_memory( + addr: resource_allocator.allocate_32bit_mmio_memory( MMIO_LEN, MMIO_LEN, AllocPolicy::FirstMatch, @@ -250,9 +253,8 @@ impl MMIODeviceManager { pub fn register_mmio_virtio_for_boot( &mut self, vm: &VmFd, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, device_id: String, - mmio_bus: &vm_device::Bus, mmio_device: MmioTransport, _cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { @@ -273,7 +275,7 @@ impl MMIODeviceManager { device.resources.irq.unwrap().get(), )?; } - self.register_mmio_virtio(vm, device_id, mmio_bus, device)?; + self.register_mmio_virtio(vm, device_id, &resource_allocator.mmio_bus, device)?; Ok(()) } @@ -283,8 +285,7 @@ impl MMIODeviceManager { pub fn register_mmio_serial( &mut self, vm: &VmFd, - mmio_bus: &vm_device::Bus, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, serial: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -293,7 
+294,12 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - self.allocate_mmio_resources(resource_allocator, 1)? + let gsi = resource_allocator.allocate_gsi(1)?; + MMIODeviceInfo { + addr: SERIAL_MEM_START, + len: MMIO_LEN, + irq: NonZeroU32::new(gsi[0]), + } }; vm.register_irqfd( @@ -307,7 +313,7 @@ impl MMIODeviceManager { inner: serial, }; - mmio_bus.insert( + resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -338,8 +344,7 @@ impl MMIODeviceManager { /// given as parameter, otherwise allocate a new MMIO resources for it. pub fn register_mmio_rtc( &mut self, - mmio_bus: &vm_device::Bus, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, rtc: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -348,7 +353,12 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - self.allocate_mmio_resources(resource_allocator, 1)? + let gsi = resource_allocator.allocate_gsi(1)?; + MMIODeviceInfo { + addr: RTC_MEM_START, + len: MMIO_LEN, + irq: NonZeroU32::new(gsi[0]), + } }; let device = MMIODevice { @@ -356,7 +366,7 @@ impl MMIODeviceManager { inner: rtc, }; - mmio_bus.insert( + resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -369,11 +379,15 @@ impl MMIODeviceManager { pub fn register_mmio_boot_timer( &mut self, mmio_bus: &vm_device::Bus, - resource_allocator: &mut ResourceAllocator, boot_timer: Arc>, ) -> Result<(), MmioError> { // Attach a new boot timer device. 
- let device_info = self.allocate_mmio_resources(resource_allocator, 0)?; + let device_info = MMIODeviceInfo { + addr: BOOT_DEVICE_MEM_START, + len: MMIO_LEN, + irq: None, + }; + let device = MMIODevice { resources: device_info, inner: boot_timer, @@ -554,19 +568,17 @@ pub(crate) mod tests { &mut self, vm: &VmFd, guest_mem: GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, device: Arc>, cmdline: &mut kernel_cmdline::Cmdline, dev_id: &str, ) -> Result { let interrupt = Arc::new(IrqTrigger::new()); - let mmio_bus = vm_device::Bus::new(); let mmio_device = MmioTransport::new(guest_mem, interrupt, device.clone(), false); self.register_mmio_virtio_for_boot( vm, resource_allocator, dev_id.to_string(), - &mmio_bus, mmio_device, cmdline, )?; @@ -676,7 +688,7 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -689,7 +701,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, dummy, &mut cmdline, "dummy", @@ -698,7 +710,7 @@ pub(crate) mod tests { assert!(device_manager.get_virtio_device(0, "foo").is_none()); let dev = device_manager.get_virtio_device(0, "dummy").unwrap(); - assert_eq!(dev.resources.addr, arch::MMIO_MEM_START); + assert_eq!(dev.resources.addr, arch::MEM_32BIT_DEVICES_START); assert_eq!(dev.resources.len, MMIO_LEN); assert_eq!( dev.resources.irq, @@ -709,7 +721,7 @@ pub(crate) mod tests { .for_each_virtio_device(|virtio_type, device_id, mmio_device| { assert_eq!(*virtio_type, 0); assert_eq!(device_id, "dummy"); - assert_eq!(mmio_device.resources.addr, 
arch::MMIO_MEM_START); + assert_eq!(mmio_device.resources.addr, arch::MEM_32BIT_DEVICES_START); assert_eq!(mmio_device.resources.len, MMIO_LEN); assert_eq!( mmio_device.resources.irq, @@ -730,7 +742,7 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); #[cfg(target_arch = "x86_64")] @@ -743,7 +755,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy1", @@ -757,7 +769,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy2" @@ -792,7 +804,7 @@ pub(crate) mod tests { vm.setup_irqchip(1).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -802,7 +814,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, dummy, &mut cmdline, &id, @@ -833,7 +845,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, dummy2, &mut cmdline, &id2, @@ -859,10 +871,10 @@ pub(crate) mod tests { #[test] fn test_no_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator 
= ResourceAllocator::new().unwrap(); let device_info = device_manager - .allocate_mmio_resources(&mut resource_allocator, 0) + .allocate_mmio_resources(&resource_allocator, 0) .unwrap(); assert!(device_info.irq.is_none()); } @@ -870,10 +882,10 @@ pub(crate) mod tests { #[test] fn test_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let device_info = device_manager - .allocate_mmio_resources(&mut resource_allocator, 1) + .allocate_mmio_resources(&resource_allocator, 1) .unwrap(); assert_eq!(device_info.irq.unwrap().get(), crate::arch::IRQ_BASE); } @@ -881,12 +893,12 @@ pub(crate) mod tests { #[test] fn test_allocation_failure() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); assert_eq!( format!( "{}", device_manager - .allocate_mmio_resources(&mut resource_allocator, 2) + .allocate_mmio_resources(&resource_allocator, 2) .unwrap_err() ), "Invalid MMIO IRQ configuration.".to_string() diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 3e3f0f0ffda..2922060bb13 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -16,6 +16,7 @@ use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; use log::error; use mmio::{MMIODeviceManager, MmioError}; +use pci_mngr::{PciDevices, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; use resources::ResourceAllocator; use serde::{Deserialize, Serialize}; @@ -43,6 +44,8 @@ pub mod acpi; pub mod legacy; /// Memory Mapped I/O Manager. pub mod mmio; +/// PCIe device manager +pub mod pci_mngr; /// Device managers (de)serialization support. pub mod persist; /// Resource manager for devices. 
@@ -96,19 +99,16 @@ pub enum AttachLegacyMmioDeviceError { /// A manager of all peripheral devices of Firecracker pub struct DeviceManager { /// Allocator for system memory and interrupt numbers - pub resource_allocator: ResourceAllocator, - /// MMIO bus - pub mmio_bus: Arc, + pub resource_allocator: Arc, /// MMIO devices pub mmio_devices: MMIODeviceManager, #[cfg(target_arch = "x86_64")] - /// Port IO bus - pub pio_bus: Arc, - #[cfg(target_arch = "x86_64")] /// Legacy devices pub legacy_devices: PortIODeviceManager, /// ACPI devices pub acpi_devices: ACPIDeviceManager, + /// PCIe devices + pub pci_devices: PciDevices, } impl DeviceManager { @@ -145,10 +145,7 @@ impl DeviceManager { vcpu_exit_evt: &EventFd, vmfd: &VmFd, ) -> Result { - let mmio_bus = Arc::new(vm_device::Bus::new()); - - #[cfg(target_arch = "x86_64")] - let pio_bus = Arc::new(vm_device::Bus::new()); + let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] let legacy_devices = { Self::set_stdout_nonblocking(); @@ -163,19 +160,17 @@ impl DeviceManager { // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&pio_bus, vmfd)?; + legacy_devices.register_devices(&resource_allocator.pio_bus, vmfd)?; legacy_devices }; Ok(DeviceManager { - resource_allocator: ResourceAllocator::new()?, - mmio_bus, + resource_allocator, mmio_devices: MMIODeviceManager::new(), #[cfg(target_arch = "x86_64")] - pio_bus, - #[cfg(target_arch = "x86_64")] legacy_devices, acpi_devices: ACPIDeviceManager::new(), + pci_devices: PciDevices::new(), }) } @@ -194,9 +189,8 @@ impl DeviceManager { let device = MmioTransport::new(mem.clone(), interrupt, device, is_vhost_user); self.mmio_devices.register_mmio_virtio_for_boot( vmfd, - &mut self.resource_allocator, + &self.resource_allocator, id, - &self.mmio_bus, device, cmdline, )?; @@ -211,11 +205,8 @@ impl DeviceManager { ) -> Result<(), AttachMmioDeviceError> { 
let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); - self.mmio_devices.register_mmio_boot_timer( - &self.mmio_bus, - &mut self.resource_allocator, - boot_timer, - )?; + self.mmio_devices + .register_mmio_boot_timer(&self.resource_allocator.mmio_bus, boot_timer)?; Ok(()) } @@ -225,7 +216,7 @@ impl DeviceManager { mem: &GuestMemoryMmap, vmfd: &VmFd, ) -> Result<(), AttachVmgenidError> { - let vmgenid = VmGenId::new(mem, &mut self.resource_allocator)?; + let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; self.acpi_devices.attach_vmgenid(vmgenid, vmfd)?; Ok(()) } @@ -249,25 +240,22 @@ impl DeviceManager { // Make stdout non-blocking. Self::set_stdout_nonblocking(); let serial = Self::setup_serial_device(event_manager)?; - self.mmio_devices.register_mmio_serial( - vmfd, - &self.mmio_bus, - &mut self.resource_allocator, - serial, - None, - )?; + self.mmio_devices + .register_mmio_serial(vmfd, &self.resource_allocator, serial, None)?; self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; } let rtc = Arc::new(Mutex::new(RTCDevice::new())); - self.mmio_devices.register_mmio_rtc( - &self.mmio_bus, - &mut self.resource_allocator, - rtc, - None, - )?; + self.mmio_devices + .register_mmio_rtc(&self.resource_allocator, rtc, None)?; Ok(()) } + + /// Enables PCIe support for Firecracker devices + pub fn enable_pci(&mut self) -> Result<(), PciManagerError> { + self.pci_devices + .attach_pci_segment(&self.resource_allocator) + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] @@ -277,6 +265,8 @@ pub struct DevicesState { pub mmio_state: persist::DeviceStates, /// ACPI devices state pub acpi_state: persist::ACPIDeviceManagerState, + /// PCI devices state + pub pci_state: pci_mngr::PciDevicesState, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -285,6 +275,8 @@ pub enum DevicePersistError { MmioRestore(#[from] persist::DevicePersistError), /// Error restoring ACPI devices: {0} AcpiRestore(#[from] 
persist::ACPIDeviceManagerRestoreError), + /// Error restoring PCI devices: {0} + PciRestore(#[from] PciManagerError), /// Error notifying VMGenID device: {0} VmGenidUpdate(#[from] std::io::Error), /// Error resetting serial console: {0} @@ -307,6 +299,7 @@ impl DeviceManager { DevicesState { mmio_state: self.mmio_devices.save(), acpi_state: self.acpi_devices.save(), + pci_state: self.pci_devices.save(), } } @@ -355,11 +348,10 @@ impl DeviceManager { ) -> Result<(), DevicePersistError> { // Restore MMIO devices let mmio_ctor_args = MMIODevManagerConstructorArgs { - mmio_bus: &self.mmio_bus, mem: restore_args.mem, vm: restore_args.vm, event_manager: restore_args.event_manager, - resource_allocator: &mut self.resource_allocator, + resource_allocator: &self.resource_allocator, vm_resources: restore_args.vm_resources, instance_id: restore_args.instance_id, restored_from_file: restore_args.restored_from_file, @@ -373,12 +365,16 @@ impl DeviceManager { // Restore ACPI devices let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { mem: restore_args.mem, - resource_allocator: &mut self.resource_allocator, + resource_allocator: &self.resource_allocator, vm: restore_args.vm, }; self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; self.acpi_devices.notify_vmgenid()?; + // Restore PCI devices + self.pci_devices + .restore(&state.pci_state, &self.resource_allocator)?; + Ok(()) } } @@ -390,12 +386,10 @@ pub(crate) mod tests { use crate::builder::tests::default_vmm; pub(crate) fn default_device_manager() -> DeviceManager { - let mmio_bus = Arc::new(vm_device::Bus::new()); - #[cfg(target_arch = "x86_64")] - let pio_bus = Arc::new(vm_device::Bus::new()); let mmio_devices = MMIODeviceManager::new(); let acpi_devices = ACPIDeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); + let pci_devices = PciDevices::new(); + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); #[cfg(target_arch = "x86_64")] let 
legacy_devices = PortIODeviceManager::new( @@ -410,13 +404,11 @@ pub(crate) mod tests { DeviceManager { resource_allocator, - mmio_bus, mmio_devices, #[cfg(target_arch = "x86_64")] - pio_bus, - #[cfg(target_arch = "x86_64")] legacy_devices, acpi_devices, + pci_devices, } } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs new file mode 100644 index 00000000000..e9ada60cc1f --- /dev/null +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -0,0 +1,69 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use vm_device::BusError; + +use super::resources::ResourceAllocator; +use crate::devices::pci::PciSegment; + +#[derive(Debug, Default)] +pub struct PciDevices { + /// PCIe segment of the VMM, if PCI is enabled. We currently support a single PCIe segment. + pub pci_segment: Option, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PciManagerError { + /// Resource allocation error: {0} + ResourceAllocation(#[from] vm_allocator::Error), + /// Bus error: {0} + Bus(#[from] BusError), +} + +impl PciDevices { + pub fn new() -> Self { + Default::default() + } + + pub fn attach_pci_segment( + &mut self, + resource_allocator: &Arc, + ) -> Result<(), PciManagerError> { + // We only support a single PCIe segment. Calling this function twice is a Firecracker + // internal error. + assert!(self.pci_segment.is_none()); + + // Currently we don't assign any IRQs to PCI devices. We will be using MSI-X interrupts + // only. 
+ let pci_segment = PciSegment::new(0, resource_allocator, &[0u8; 32])?; + self.pci_segment = Some(pci_segment); + + Ok(()) + } + + pub fn save(&self) -> PciDevicesState { + PciDevicesState { + pci_enabled: self.pci_segment.is_some(), + } + } + + pub fn restore( + &mut self, + state: &PciDevicesState, + resource_allocator: &Arc, + ) -> Result<(), PciManagerError> { + if state.pci_enabled { + self.attach_pci_segment(resource_allocator)?; + } + + Ok(()) + } +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct PciDevicesState { + pci_enabled: bool, +} diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 99216ec77e7..e3c7d2a8475 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -214,11 +214,10 @@ pub enum SharedDeviceType { } pub struct MMIODevManagerConstructorArgs<'a> { - pub mmio_bus: &'a vm_device::Bus, pub mem: &'a GuestMemoryMmap, pub vm: &'a VmFd, pub event_manager: &'a mut EventManager, - pub resource_allocator: &'a mut ResourceAllocator, + pub resource_allocator: &'a ResourceAllocator, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -243,7 +242,7 @@ pub struct ACPIDeviceManagerState { pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a mut ResourceAllocator, + pub resource_allocator: &'a ResourceAllocator, pub vm: &'a VmFd, } @@ -433,20 +432,8 @@ impl<'a> Persist<'a> for MMIODeviceManager { .event_manager .add_subscriber(serial.clone()); - constructor_args - .resource_allocator - .allocate_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(state.device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; - dev_manager.register_mmio_serial( vm, - constructor_args.mmio_bus, constructor_args.resource_allocator, serial, Some(state.device_info), @@ -454,18 +441,7 @@ impl<'a> 
Persist<'a> for MMIODeviceManager { } if state.type_ == DeviceType::Rtc { let rtc = Arc::new(Mutex::new(RTCDevice::new())); - constructor_args - .resource_allocator - .allocate_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(state.device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; dev_manager.register_mmio_rtc( - constructor_args.mmio_bus, constructor_args.resource_allocator, rtc, Some(state.device_info), @@ -507,7 +483,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { constructor_args .resource_allocator - .allocate_mmio_memory( + .allocate_32bit_mmio_memory( MMIO_LEN, MMIO_LEN, AllocPolicy::ExactMatch(device_info.addr), @@ -553,7 +529,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &balloon_state.transport_state, interrupt, &balloon_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -580,7 +556,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &block_state.transport_state, interrupt, &block_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -629,7 +605,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &net_state.transport_state, interrupt, &net_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -661,7 +637,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &vsock_state.transport_state, interrupt, &vsock_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -687,7 +663,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &entropy_state.transport_state, interrupt, &entropy_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -776,7 +752,7 @@ mod tests { // These 
need to survive so the restored blocks find them. let _block_files; let mut tmp_sock_file = TempFile::new().unwrap(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. { @@ -839,11 +815,10 @@ mod tests { let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { - mmio_bus: &vmm.device_manager.mmio_bus, mem: vmm.vm.guest_memory(), vm: vmm.vm.fd(), event_manager: &mut event_manager, - resource_allocator: &mut resource_allocator, + resource_allocator: &resource_allocator, vm_resources, instance_id: "microvm-id", restored_from_file: true, diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/device_manager/resources.rs index 821148794ec..249d0507ba8 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/device_manager/resources.rs @@ -1,8 +1,12 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::sync::{Arc, Mutex}; + +use pci::DeviceRelocation; pub use vm_allocator::AllocPolicy; use vm_allocator::{AddressAllocator, IdAllocator}; +use vm_device::Bus; use crate::arch; @@ -16,20 +20,40 @@ use crate::arch; #[derive(Debug)] pub struct ResourceAllocator { // Allocator for device interrupt lines - gsi_allocator: IdAllocator, - // Allocator for memory in the MMIO address space - mmio_memory: AddressAllocator, + pub gsi_allocator: Arc>, + // Allocator for memory in the 32-bit MMIO address space + pub mmio32_memory: Arc>, + // Allocator for memory in the 64-bit MMIO address space + pub mmio64_memory: Arc>, // Memory allocator for system data - system_memory: AddressAllocator, + pub system_memory: Arc>, + /// MMIO bus + pub mmio_bus: Arc, + #[cfg(target_arch = "x86_64")] + /// Port IO bus + pub pio_bus: Arc, } impl ResourceAllocator { /// Create a new resource allocator for Firecracker devices pub fn new() -> Result { Ok(Self { - gsi_allocator: IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX)?, - mmio_memory: AddressAllocator::new(arch::MMIO_MEM_START, arch::MMIO_MEM_SIZE)?, - system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE)?, + gsi_allocator: Arc::new(Mutex::new(IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX)?)), + mmio32_memory: Arc::new(Mutex::new(AddressAllocator::new( + arch::MEM_32BIT_DEVICES_START, + arch::MEM_32BIT_DEVICES_SIZE, + )?)), + mmio64_memory: Arc::new(Mutex::new(AddressAllocator::new( + arch::MEM_64BIT_DEVICES_START, + arch::MEM_64BIT_DEVICES_SIZE, + )?)), + system_memory: Arc::new(Mutex::new(AddressAllocator::new( + arch::SYSTEM_MEM_START, + arch::SYSTEM_MEM_SIZE, + )?)), + mmio_bus: Arc::new(Bus::new()), + #[cfg(target_arch = "x86_64")] + pio_bus: Arc::new(Bus::new()), }) } @@ -38,16 +62,17 @@ impl ResourceAllocator { /// # Arguments /// /// * `gsi_count` - The number of GSIs to allocate - pub fn allocate_gsi(&mut self, gsi_count: u32) -> Result, 
vm_allocator::Error> { + pub fn allocate_gsi(&self, gsi_count: u32) -> Result, vm_allocator::Error> { + let mut gsi_allocator = self.gsi_allocator.lock().expect("Poisoned lock"); let mut gsis = Vec::with_capacity(gsi_count as usize); for _ in 0..gsi_count { - match self.gsi_allocator.allocate_id() { + match gsi_allocator.allocate_id() { Ok(gsi) => gsis.push(gsi), Err(err) => { // It is ok to unwrap here, we just allocated the GSI gsis.into_iter().for_each(|gsi| { - self.gsi_allocator.free_id(gsi).unwrap(); + gsi_allocator.free_id(gsi).unwrap(); }); return Err(err); } @@ -57,7 +82,30 @@ impl ResourceAllocator { Ok(gsis) } - /// Allocate a memory range in MMIO address space + /// Allocate a memory range in 32-bit MMIO address space + /// + /// If it succeeds, it returns the first address of the allocated range + /// + /// # Arguments + /// + /// * `size` - The size in bytes of the memory to allocate + /// * `alignment` - The alignment of the address of the first byte + /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy + pub fn allocate_32bit_mmio_memory( + &self, + size: u64, + alignment: u64, + policy: AllocPolicy, + ) -> Result { + Ok(self + .mmio32_memory + .lock() + .expect("Poisoned lock") + .allocate(size, alignment, policy)? 
+ .start()) + } + + /// Allocate a memory range in 64-bit MMIO address space /// /// If it succeeds, it returns the first address of the allocated range /// @@ -66,13 +114,18 @@ impl ResourceAllocator { /// * `size` - The size in bytes of the memory to allocate /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy - pub fn allocate_mmio_memory( - &mut self, + pub fn allocate_64bit_mmio_memory( + &self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { - Ok(self.mmio_memory.allocate(size, alignment, policy)?.start()) + Ok(self + .mmio64_memory + .lock() + .expect("Poisoned lock") + .allocate(size, alignment, policy)? + .start()) } /// Allocate a memory range for system data @@ -85,18 +138,33 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_system_memory( - &mut self, + &self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .system_memory + .lock() + .expect("Poisoned lock") .allocate(size, alignment, policy)? 
.start()) } } +impl DeviceRelocation for ResourceAllocator { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + todo!() + } +} + #[cfg(test)] mod tests { use super::ResourceAllocator; @@ -106,7 +174,7 @@ mod tests { #[test] fn test_allocate_gsi() { - let mut allocator = ResourceAllocator::new().unwrap(); + let allocator = ResourceAllocator::new().unwrap(); // asking for 0 IRQs should return us an empty vector assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); // We cannot allocate more GSIs than available @@ -127,7 +195,7 @@ mod tests { // But we should be able to ask for 0 GSIs assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); - let mut allocator = ResourceAllocator::new().unwrap(); + let allocator = ResourceAllocator::new().unwrap(); // We should be able to allocate 1 GSI assert_eq!(allocator.allocate_gsi(1), Ok(vec![arch::IRQ_BASE])); // We can't allocate MAX_IRQS any more diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 31dbf64ec39..df0656bfbcc 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -86,7 +86,7 @@ impl VmGenId { /// Allocate memory and a GSI for sending notifications and build the device pub fn new( mem: &GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, ) -> Result { let gsi = resource_allocator.allocate_gsi(1)?; // The generation ID needs to live in an 8-byte aligned buffer @@ -133,7 +133,7 @@ pub struct VMGenIDState { #[derive(Debug)] pub struct VMGenIdConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a mut ResourceAllocator, + pub resource_allocator: &'a ResourceAllocator, } impl<'a> Persist<'a> for VmGenId { diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 15d78e04907..cf6bef40c0d 100644 --- 
a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -7,10 +7,13 @@ //! Emulates virtual and hardware devices. +#![allow(unused)] + use std::io; pub mod acpi; pub mod legacy; +pub mod pci; pub mod pseudo; pub mod virtio; diff --git a/src/vmm/src/devices/pci/mod.rs b/src/vmm/src/devices/pci/mod.rs new file mode 100644 index 00000000000..e365b481893 --- /dev/null +++ b/src/vmm/src/devices/pci/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod pci_segment; + +pub use pci_segment::*; diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs new file mode 100644 index 00000000000..169ffdcba3b --- /dev/null +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -0,0 +1,464 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+// +// Copyright © 2019 - 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +// + +use std::sync::{Arc, Mutex}; + +#[cfg(target_arch = "x86_64")] +use acpi_tables::{Aml, aml}; +use log::info; +#[cfg(target_arch = "x86_64")] +use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE, PciConfigIo}; +use pci::{PciBdf, PciBus, PciConfigMmio, PciRoot, PciRootError}; +use uuid::Uuid; +use vm_allocator::AddressAllocator; +use vm_device::{BusDeviceSync, BusError}; + +use crate::arch::{PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; +use crate::device_manager::resources::ResourceAllocator; + +pub struct PciSegment { + pub(crate) id: u16, + pub(crate) pci_bus: Arc>, + pub(crate) pci_config_mmio: Arc>, + pub(crate) mmio_config_address: u64, + pub(crate) proximity_domain: u32, + + #[cfg(target_arch = "x86_64")] + pub(crate) pci_config_io: Option>>, + + // Bitmap of PCI devices to hotplug. + pub(crate) pci_devices_up: u32, + // Bitmap of PCI devices to hotunplug. + pub(crate) pci_devices_down: u32, + // List of allocated IRQs for each PCI slot. 
+ pub(crate) pci_irq_slots: [u8; 32], + + // Device memory covered by this segment + pub(crate) start_of_mem32_area: u64, + pub(crate) end_of_mem32_area: u64, + + pub(crate) start_of_mem64_area: u64, + pub(crate) end_of_mem64_area: u64, +} + +impl std::fmt::Debug for PciSegment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciSegment") + .field("id", &self.id) + .field("mmio_config_address", &self.mmio_config_address) + .field("proximity_domain", &self.proximity_domain) + .field("pci_devices_up", &self.pci_devices_up) + .field("pci_devices_down", &self.pci_devices_down) + .field("pci_irq_slots", &self.pci_irq_slots) + .field("start_of_mem32_area", &self.start_of_mem32_area) + .field("end_of_mem32_area", &self.end_of_mem32_area) + .field("start_of_mem64_area", &self.start_of_mem64_area) + .field("end_of_mem64_area", &self.end_of_mem64_area) + .finish() + } +} + +impl PciSegment { + fn build( + id: u16, + resource_allocator: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + let pci_root = PciRoot::new(None); + let pci_bus = Arc::new(Mutex::new(PciBus::new( + pci_root, + resource_allocator.clone(), + ))); + + let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); + let mmio_config_address = PCI_MMCONFIG_START + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; + + resource_allocator.mmio_bus.insert( + Arc::clone(&pci_config_mmio) as Arc, + mmio_config_address, + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + )?; + + let mem32_allocator = resource_allocator.mmio32_memory.clone(); + let mem64_allocator = resource_allocator.mmio64_memory.clone(); + + let start_of_mem32_area = mem32_allocator.lock().unwrap().base(); + let end_of_mem32_area = mem32_allocator.lock().unwrap().end(); + + let start_of_mem64_area = mem64_allocator.lock().unwrap().base(); + let end_of_mem64_area = mem64_allocator.lock().unwrap().end(); + + let segment = PciSegment { + id, + pci_bus, + pci_config_mmio, + mmio_config_address, + 
proximity_domain: 0, + pci_devices_up: 0, + pci_devices_down: 0, + #[cfg(target_arch = "x86_64")] + pci_config_io: None, + start_of_mem32_area, + end_of_mem32_area, + start_of_mem64_area, + end_of_mem64_area, + pci_irq_slots: *pci_irq_slots, + }; + + Ok(segment) + } + + #[cfg(target_arch = "x86_64")] + pub(crate) fn new( + id: u16, + resource_allocator: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + let mut segment = Self::build(id, resource_allocator, pci_irq_slots)?; + let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus)))); + + resource_allocator.pio_bus.insert( + pci_config_io.clone(), + PCI_CONFIG_IO_PORT, + PCI_CONFIG_IO_PORT_SIZE, + )?; + + segment.pci_config_io = Some(pci_config_io); + + info!( + "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ + [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}] IO area: [{PCI_CONFIG_IO_PORT:#x}-{:#x}]", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area, + PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE - 1 + ); + + Ok(segment) + } + + #[cfg(target_arch = "aarch64")] + pub(crate) fn new( + id: u16, + resource_allocator: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + let segment = Self::build(id, resource_allocator, pci_irq_slots)?; + info!( + "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ + [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}]", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area, + ); + + Ok(segment) + } + + pub(crate) fn next_device_bdf(&self) -> Result { + Ok(PciBdf::new( + self.id, + 0, + self.pci_bus + .lock() + .unwrap() + .next_device_id()? 
+ .try_into() + .unwrap(), + 0, + )) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlot { + device_id: u8, +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlot { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let sun = self.device_id; + let adr: u32 = (self.device_id as u32) << 16; + aml::Device::new( + format!("S{:03}", self.device_id).as_str().try_into()?, + vec![ + &aml::Name::new("_SUN".try_into()?, &sun)?, + &aml::Name::new("_ADR".try_into()?, &adr)?, + &aml::Method::new( + "_EJ0".try_into()?, + 1, + true, + vec![&aml::MethodCall::new( + "\\_SB_.PHPR.PCEJ".try_into()?, + vec![&aml::Path::new("_SUN")?, &aml::Path::new("_SEG")?], + )], + ), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlotNotify { + device_id: u8, +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlotNotify { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let device_id_mask: u32 = 1 << self.device_id; + let object = aml::Path::new(&format!("S{:03}", self.device_id))?; + aml::And::new(&aml::Local(0), &aml::Arg(0), &device_id_mask).append_aml_bytes(v)?; + aml::If::new( + &aml::Equal::new(&aml::Local(0), &device_id_mask), + vec![&aml::Notify::new(&object, &aml::Arg(1))], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlotMethods {} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlotMethods { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let mut device_notifies = Vec::new(); + for device_id in 0..32 { + device_notifies.push(PciDevSlotNotify { device_id }); + } + + let mut device_notifies_refs: Vec<&dyn Aml> = Vec::new(); + for device_notify in device_notifies.iter() { + device_notifies_refs.push(device_notify); + } + + aml::Method::new("DVNT".try_into()?, 2, true, device_notifies_refs).append_aml_bytes(v)?; + aml::Method::new( + "PCNT".try_into()?, + 0, + true, + vec![ + 
&aml::Acquire::new("\\_SB_.PHPR.BLCK".try_into()?, 0xffff), + &aml::Store::new( + &aml::Path::new("\\_SB_.PHPR.PSEG")?, + &aml::Path::new("_SEG")?, + ), + &aml::MethodCall::new( + "DVNT".try_into()?, + vec![&aml::Path::new("\\_SB_.PHPR.PCIU")?, &aml::ONE], + ), + &aml::MethodCall::new( + "DVNT".try_into()?, + vec![&aml::Path::new("\\_SB_.PHPR.PCID")?, &3usize], + ), + &aml::Release::new("\\_SB_.PHPR.BLCK".try_into()?), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDsmMethod {} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDsmMethod { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + // Refer to ACPI spec v6.3 Ch 9.1.1 and PCI Firmware spec v3.3 Ch 4.6.1 + // _DSM (Device Specific Method), the following is the implementation in ASL. + + // Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method + // { + // If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling + // Interface */)) { + // If ((Arg2 == Zero)) + // { + // Return (Buffer (One) { 0x21 }) + // } + // If ((Arg2 == 0x05)) + // { + // Return (Zero) + // } + // } + // + // Return (Buffer (One) { 0x00 }) + // } + // + // As per ACPI v6.3 Ch 19.6.142, the UUID is required to be in mixed endian: + // Among the fields of a UUID: + // {d1 (8 digits)} - {d2 (4 digits)} - {d3 (4 digits)} - {d4 (16 digits)} + // d1 ~ d3 need to be little endian, d4 be big endian. + // See https://en.wikipedia.org/wiki/Universally_unique_identifier#Encoding . 
+ let uuid = Uuid::parse_str("E5C937D0-3553-4D7A-9117-EA4D19C3434D").unwrap(); + let (uuid_d1, uuid_d2, uuid_d3, uuid_d4) = uuid.as_fields(); + let mut uuid_buf = vec![]; + uuid_buf.extend(uuid_d1.to_le_bytes()); + uuid_buf.extend(uuid_d2.to_le_bytes()); + uuid_buf.extend(uuid_d3.to_le_bytes()); + uuid_buf.extend(uuid_d4); + aml::Method::new( + "_DSM".try_into()?, + 4, + false, + vec![ + &aml::If::new( + &aml::Equal::new(&aml::Arg(0), &aml::Buffer::new(uuid_buf)), + vec![ + &aml::If::new( + &aml::Equal::new(&aml::Arg(2), &aml::ZERO), + vec![&aml::Return::new(&aml::Buffer::new(vec![0x21]))], + ), + &aml::If::new( + &aml::Equal::new(&aml::Arg(2), &0x05u8), + vec![&aml::Return::new(&aml::ZERO)], + ), + ], + ), + &aml::Return::new(&aml::Buffer::new(vec![0])), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciSegment { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let mut pci_dsdt_inner_data: Vec<&dyn Aml> = Vec::new(); + let hid = aml::Name::new("_HID".try_into()?, &aml::EisaName::new("PNP0A08")?)?; + pci_dsdt_inner_data.push(&hid); + let cid = aml::Name::new("_CID".try_into()?, &aml::EisaName::new("PNP0A03")?)?; + pci_dsdt_inner_data.push(&cid); + let adr = aml::Name::new("_ADR".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&adr); + let seg = aml::Name::new("_SEG".try_into()?, &self.id)?; + pci_dsdt_inner_data.push(&seg); + let uid = aml::Name::new("_UID".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&uid); + let cca = aml::Name::new("_CCA".try_into()?, &aml::ONE)?; + pci_dsdt_inner_data.push(&cca); + let supp = aml::Name::new("SUPP".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&supp); + + let proximity_domain = self.proximity_domain; + let pxm_return = aml::Return::new(&proximity_domain); + let pxm = aml::Method::new("_PXM".try_into()?, 0, false, vec![&pxm_return]); + pci_dsdt_inner_data.push(&pxm); + + let pci_dsm = PciDsmMethod {}; + pci_dsdt_inner_data.push(&pci_dsm); + + 
#[allow(clippy::if_same_then_else)] + let crs = if self.id == 0 { + aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![ + &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16)?, + &aml::Io::new(0xcf8, 0xcf8, 1, 0x8), + &aml::Memory32Fixed::new( + true, + self.mmio_config_address.try_into().unwrap(), + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT.try_into().unwrap(), + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem32_area, + self.end_of_mem32_area, + )?, + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem64_area, + self.end_of_mem64_area, + )?, + &aml::AddressSpace::new_io(0u16, 0x0cf7u16)?, + &aml::AddressSpace::new_io(0x0d00u16, 0xffffu16)?, + ]), + )? + } else { + aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![ + &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16)?, + &aml::Memory32Fixed::new( + true, + self.mmio_config_address.try_into().unwrap(), + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT.try_into().unwrap(), + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem32_area, + self.end_of_mem32_area, + )?, + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem64_area, + self.end_of_mem64_area, + )?, + ]), + )? + }; + pci_dsdt_inner_data.push(&crs); + + let mut pci_devices = Vec::new(); + for device_id in 0..32 { + let pci_device = PciDevSlot { device_id }; + pci_devices.push(pci_device); + } + for pci_device in pci_devices.iter() { + pci_dsdt_inner_data.push(pci_device); + } + + let pci_device_methods = PciDevSlotMethods {}; + pci_dsdt_inner_data.push(&pci_device_methods); + + // Build PCI routing table, listing IRQs assigned to PCI devices. 
+ let prt_package_list: Vec<(u32, u32)> = self + .pci_irq_slots + .iter() + .enumerate() + .map(|(i, irq)| { + ( + ((((u32::try_from(i).unwrap()) & 0x1fu32) << 16) | 0xffffu32), + *irq as u32, + ) + }) + .collect(); + let prt_package_list: Vec = prt_package_list + .iter() + .map(|(bdf, irq)| aml::Package::new(vec![bdf, &0u8, &0u8, irq])) + .collect(); + let prt_package_list: Vec<&dyn Aml> = prt_package_list + .iter() + .map(|item| item as &dyn Aml) + .collect(); + let prt = aml::Name::new("_PRT".try_into()?, &aml::Package::new(prt_package_list))?; + pci_dsdt_inner_data.push(&prt); + + aml::Device::new( + format!("_SB_.PC{:02X}", self.id).as_str().try_into()?, + pci_dsdt_inner_data, + ) + .append_aml_bytes(v) + } +} diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index e1b2876a0f3..47eb2640837 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -474,8 +474,8 @@ mod tests { #[cfg(target_arch = "x86_64")] #[allow(clippy::cast_possible_truncation)] /* casting of constants we know fit into u32 */ fn test_vsock_bof() { - use crate::arch::MMIO_MEM_START; - use crate::arch::x86_64::{FIRST_ADDR_PAST_32BITS, MEM_32BIT_GAP_SIZE}; + use crate::arch::x86_64::layout::FIRST_ADDR_PAST_32BITS; + use crate::arch::{MMIO32_MEM_SIZE, MMIO32_MEM_START}; use crate::devices::virtio::vsock::packet::VSOCK_PKT_HDR_SIZE; use crate::test_utils::multi_region_mem; use crate::utils::mib_to_bytes; @@ -486,7 +486,7 @@ mod tests { let mut test_ctx = TestContext::new(); test_ctx.mem = multi_region_mem(&[ (GuestAddress(0), 8 * MIB), - (GuestAddress(MMIO_MEM_START - MIB as u64), MIB), + (GuestAddress(MMIO32_MEM_START - MIB as u64), MIB), (GuestAddress(FIRST_ADDR_PAST_32BITS), MIB), ]); @@ -509,15 +509,15 @@ mod tests { } // Let's check what happens when the header descriptor is right before the gap. 
- vsock_bof_helper(&mut test_ctx, 0, MMIO_MEM_START - 1, VSOCK_PKT_HDR_SIZE); + vsock_bof_helper(&mut test_ctx, 0, MMIO32_MEM_START - 1, VSOCK_PKT_HDR_SIZE); // Let's check what happens when the buffer descriptor crosses into the gap, but does // not go past its right edge. vsock_bof_helper( &mut test_ctx, 1, - MMIO_MEM_START - 4, - MEM_32BIT_GAP_SIZE as u32 + 4, + MMIO32_MEM_START - 4, + MMIO32_MEM_SIZE as u32 + 4, ); // Let's modify the buffer descriptor addr and len such that it crosses over the MMIO gap, @@ -525,8 +525,8 @@ mod tests { vsock_bof_helper( &mut test_ctx, 1, - MMIO_MEM_START - 4, - MEM_32BIT_GAP_SIZE as u32 + 100, + MMIO32_MEM_START - 4, + MMIO32_MEM_SIZE as u32 + 100, ); } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 30104890e7d..01ef9547d82 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -371,10 +371,10 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) { - vcpu.set_mmio_bus(self.device_manager.mmio_bus.clone()); + vcpu.set_mmio_bus(self.device_manager.resource_allocator.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] vcpu.kvm_vcpu - .set_pio_bus(self.device_manager.pio_bus.clone()); + .set_pio_bus(self.device_manager.resource_allocator.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 70c317bb1e1..365355dfc2d 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -114,6 +114,8 @@ pub struct VmResources { pub mmds_size_limit: usize, /// Whether or not to load boot timer device. pub boot_timer: bool, + /// Whether or not to use PCIe transport for VirtIO devices. + pub pci_enabled: bool, } impl VmResources { @@ -472,7 +474,7 @@ impl VmResources { // a single way of backing guest memory for vhost-user and non-vhost-user cases, // that would not be worth the effort. 
let regions = - crate::arch::arch_memory_regions(0, mib_to_bytes(self.machine_config.mem_size_mib)); + crate::arch::arch_memory_regions(mib_to_bytes(self.machine_config.mem_size_mib)); if vhost_user_device_used { memory::memfd_backed( regions.as_ref(), @@ -613,6 +615,7 @@ mod tests { boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, entropy: Default::default(), + pci_enabled: false, } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index d868c022dd2..e79468ffb91 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -327,18 +327,16 @@ impl<'a> PrebootApiController<'a> { to_api: &std::sync::mpsc::Sender, api_event_fd: &vmm_sys_util::eventfd::EventFd, boot_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(VmResources, Arc>), BuildMicrovmFromRequestsError> { - let mut vm_resources = VmResources::default(); - // Silence false clippy warning. Clippy suggests using - // VmResources { boot_timer: boot_timer_enabled, ..Default::default() }; but this will - // generate build errors because VmResources contains private fields. - #[allow(clippy::field_reassign_with_default)] - { - vm_resources.mmds_size_limit = mmds_size_limit; - vm_resources.boot_timer = boot_timer_enabled; - } + let mut vm_resources = VmResources { + boot_timer: boot_timer_enabled, + mmds_size_limit, + pci_enabled, + ..Default::default() + }; // Init the data store from file, if present. 
if let Some(data) = metadata_json { diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index 7cb16a2a213..3a45ce1118d 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -58,17 +58,18 @@ pub fn multi_region_mem_raw(regions: &[(GuestAddress, usize)]) -> Vec GuestMemoryMmap { - multi_region_mem(&crate::arch::arch_memory_regions(0, mem_size_bytes)) + multi_region_mem(&crate::arch::arch_memory_regions(mem_size_bytes)) } pub fn arch_mem_raw(mem_size_bytes: usize) -> Vec { - multi_region_mem_raw(&crate::arch::arch_memory_regions(0, mem_size_bytes)) + multi_region_mem_raw(&crate::arch::arch_memory_regions(mem_size_bytes)) } pub fn create_vmm( _kernel_image: Option<&str>, is_diff: bool, boot_microvm: bool, + pci_enabled: bool, ) -> (Arc>, EventManager) { let mut event_manager = EventManager::new().unwrap(); let empty_seccomp_filters = get_empty_filters(); @@ -82,7 +83,7 @@ pub fn create_vmm( None => boot_source_cfg.into(), }; let mock_vm_res = MockVmResources::new().with_boot_source(boot_source_cfg); - let resources: VmResources = if is_diff { + let mut resources: VmResources = if is_diff { mock_vm_res .with_vm_config(MockVmConfig::new().with_dirty_page_tracking().into()) .into() @@ -90,6 +91,8 @@ pub fn create_vmm( mock_vm_res.into() }; + resources.pci_enabled = pci_enabled; + let vmm = build_microvm_for_boot( &InstanceInfo::default(), &resources, @@ -106,16 +109,24 @@ pub fn create_vmm( } pub fn default_vmm(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, false, true) + create_vmm(kernel_image, false, true, false) } pub fn default_vmm_no_boot(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, false, false) + create_vmm(kernel_image, false, false, false) +} + +pub fn default_vmm_pci_no_boot(kernel_image: Option<&str>) -> (Arc>, EventManager) { + create_vmm(kernel_image, false, false, true) } #[cfg(target_arch = "x86_64")] pub fn 
dirty_tracking_vmm(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, true, true) + create_vmm(kernel_image, true, true, false) +} + +pub fn default_vmm_pci(kernel_image: Option<&str>) -> (Arc>, EventManager) { + create_vmm(kernel_image, false, true, true) } #[allow(clippy::undocumented_unsafe_blocks)] diff --git a/src/vmm/src/vmm_config/boot_source.rs b/src/vmm/src/vmm_config/boot_source.rs index 37ba08be449..297f8abff04 100644 --- a/src/vmm/src/vmm_config/boot_source.rs +++ b/src/vmm/src/vmm_config/boot_source.rs @@ -9,14 +9,13 @@ use serde::{Deserialize, Serialize}; /// Default guest kernel command line: /// - `reboot=k` shut down the guest on reboot, instead of well... rebooting; /// - `panic=1` on panic, reboot after 1 second; -/// - `pci=off` do not scan for PCI devices (save boot time); /// - `nomodule` disable loadable kernel module support; /// - `8250.nr_uarts=0` disable 8250 serial interface; /// - `i8042.noaux` do not probe the i8042 controller for an attached mouse (save boot time); /// - `i8042.nomux` do not probe i8042 for a multiplexing controller (save boot time); /// - `i8042.dumbkbd` do not attempt to control kbd state via the i8042 (save boot time). pub const DEFAULT_KERNEL_CMDLINE: &str = - "reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux i8042.dumbkbd"; + "reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux i8042.dumbkbd"; /// Strongly typed data structure used to configure the boot source of the /// microvm. 
diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 6982bf08c5b..88738599917 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::io::{Seek, SeekFrom}; +use std::sync::{Arc, Mutex}; use std::thread; use std::time::Duration; @@ -15,7 +16,9 @@ use vmm::rpc_interface::{ use vmm::seccomp::get_empty_filters; use vmm::snapshot::Snapshot; use vmm::test_utils::mock_resources::{MockVmResources, NOISY_KERNEL_IMAGE}; -use vmm::test_utils::{create_vmm, default_vmm, default_vmm_no_boot}; +use vmm::test_utils::{ + create_vmm, default_vmm, default_vmm_no_boot, default_vmm_pci, default_vmm_pci_no_boot, +}; use vmm::vmm_config::balloon::BalloonDeviceConfig; use vmm::vmm_config::boot_source::BootSourceConfig; use vmm::vmm_config::drive::BlockDeviceConfig; @@ -26,9 +29,23 @@ use vmm::vmm_config::snapshot::{ CreateSnapshotParams, LoadSnapshotParams, MemBackendConfig, MemBackendType, SnapshotType, }; use vmm::vmm_config::vsock::VsockDeviceConfig; -use vmm::{DumpCpuConfigError, EventManager, FcExitCode}; +use vmm::{DumpCpuConfigError, EventManager, FcExitCode, Vmm}; use vmm_sys_util::tempfile::TempFile; +fn check_booted_microvm(vmm: Arc>, mut evmgr: EventManager) { + // On x86_64, the vmm should exit once its workload completes and signals the exit event. + // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. + #[cfg(target_arch = "x86_64")] + evmgr.run_with_timeout(500).unwrap(); + #[cfg(target_arch = "aarch64")] + vmm.lock().unwrap().stop(FcExitCode::Ok); + + assert_eq!( + vmm.lock().unwrap().shutdown_exit_code(), + Some(FcExitCode::Ok) + ); +} + #[test] fn test_build_and_boot_microvm() { // Error case: no boot source configured. @@ -47,25 +64,16 @@ fn test_build_and_boot_microvm() { } // Success case. 
- let (vmm, mut _evmgr) = default_vmm(None); + let (vmm, evmgr) = default_vmm(None); + check_booted_microvm(vmm, evmgr); - // On x86_64, the vmm should exit once its workload completes and signals the exit event. - // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. - #[cfg(target_arch = "x86_64")] - _evmgr.run_with_timeout(500).unwrap(); - #[cfg(target_arch = "aarch64")] - vmm.lock().unwrap().stop(FcExitCode::Ok); - - assert_eq!( - vmm.lock().unwrap().shutdown_exit_code(), - Some(FcExitCode::Ok) - ); + // microVM with PCI + let (vmm, evmgr) = default_vmm_pci(None); + check_booted_microvm(vmm, evmgr); } -#[test] -fn test_build_microvm() { +fn check_build_microvm(vmm: Arc>, mut evmgr: EventManager) { // The built microVM should be in the `VmState::Paused` state here. - let (vmm, mut _evtmgr) = default_vmm_no_boot(None); assert_eq!(vmm.lock().unwrap().instance_info().state, VmState::Paused); // The microVM should be able to resume and exit successfully. @@ -73,7 +81,7 @@ fn test_build_microvm() { // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. vmm.lock().unwrap().resume_vm().unwrap(); #[cfg(target_arch = "x86_64")] - _evtmgr.run_with_timeout(500).unwrap(); + evmgr.run_with_timeout(500).unwrap(); #[cfg(target_arch = "aarch64")] vmm.lock().unwrap().stop(FcExitCode::Ok); assert_eq!( @@ -83,10 +91,14 @@ fn test_build_microvm() { } #[test] -fn test_pause_resume_microvm() { - // Tests that pausing and resuming a microVM work as expected. 
- let (vmm, _) = default_vmm(None); +fn test_build_microvm() { + let (vmm, evtmgr) = default_vmm_no_boot(None); + check_build_microvm(vmm, evtmgr); + let (vmm, evtmgr) = default_vmm_pci_no_boot(None); + check_build_microvm(vmm, evtmgr); +} +fn pause_resume_microvm(vmm: Arc>) { let mut api_controller = RuntimeApiController::new(VmResources::default(), vmm.clone()); // There's a race between this thread and the vcpu thread, but this thread @@ -100,6 +112,17 @@ fn test_pause_resume_microvm() { vmm.lock().unwrap().stop(FcExitCode::Ok); } +#[test] +fn test_pause_resume_microvm() { + // Tests that pausing and resuming a microVM work as expected. + let (vmm, _) = default_vmm(None); + + pause_resume_microvm(vmm); + + let (vmm, _) = default_vmm_pci(None); + pause_resume_microvm(vmm); +} + #[test] fn test_dirty_bitmap_error() { // Error case: dirty tracking disabled. @@ -185,11 +208,11 @@ fn test_disallow_dump_cpu_config_without_pausing() { vmm.lock().unwrap().stop(FcExitCode::Ok); } -fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { +fn verify_create_snapshot(is_diff: bool, pci_enabled: bool) -> (TempFile, TempFile) { let snapshot_file = TempFile::new().unwrap(); let memory_file = TempFile::new().unwrap(); - let (vmm, _) = create_vmm(Some(NOISY_KERNEL_IMAGE), is_diff, true); + let (vmm, _) = create_vmm(Some(NOISY_KERNEL_IMAGE), is_diff, true, pci_enabled); let resources = VmResources { machine_config: MachineConfig { mem_size_mib: 1, @@ -296,29 +319,27 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { #[test] fn test_create_and_load_snapshot() { - // Create diff snapshot. - let (snapshot_file, memory_file) = verify_create_snapshot(true); - // Create a new microVm from snapshot. This only tests code-level logic; it verifies - // that a microVM can be built with no errors from given snapshot. - // It does _not_ verify that the guest is actually restored properly. We're using - // python integration tests for that. 
- verify_load_snapshot(snapshot_file, memory_file); - - // Create full snapshot. - let (snapshot_file, memory_file) = verify_create_snapshot(false); - // Create a new microVm from snapshot. This only tests code-level logic; it verifies - // that a microVM can be built with no errors from given snapshot. - // It does _not_ verify that the guest is actually restored properly. We're using - // python integration tests for that. - verify_load_snapshot(snapshot_file, memory_file); + for (diff_snap, pci_enabled) in [(false, false), (false, true), (true, false), (true, true)] { + // Create snapshot. + let (snapshot_file, memory_file) = verify_create_snapshot(diff_snap, pci_enabled); + // Create a new microVm from snapshot. This only tests code-level logic; it verifies + // that a microVM can be built with no errors from given snapshot. + // It does _not_ verify that the guest is actually restored properly. We're using + // python integration tests for that. + verify_load_snapshot(snapshot_file, memory_file); + } } #[test] fn test_snapshot_load_sanity_checks() { - use vmm::persist::SnapShotStateSanityCheckError; - - let mut microvm_state = get_microvm_state_from_snapshot(); + let microvm_state = get_microvm_state_from_snapshot(false); + check_snapshot(microvm_state); + let microvm_state = get_microvm_state_from_snapshot(true); + check_snapshot(microvm_state); +} +fn check_snapshot(mut microvm_state: MicrovmState) { + use vmm::persist::SnapShotStateSanityCheckError; snapshot_state_sanity_check(µvm_state).unwrap(); // Remove memory regions. @@ -331,9 +352,9 @@ fn test_snapshot_load_sanity_checks() { ); } -fn get_microvm_state_from_snapshot() -> MicrovmState { +fn get_microvm_state_from_snapshot(pci_enabled: bool) -> MicrovmState { // Create a diff snapshot - let (snapshot_file, _) = verify_create_snapshot(true); + let (snapshot_file, _) = verify_create_snapshot(true, pci_enabled); // Deserialize the microVM state. 
let snapshot_file_metadata = snapshot_file.as_file().metadata().unwrap(); @@ -344,7 +365,7 @@ fn get_microvm_state_from_snapshot() -> MicrovmState { } fn verify_load_snap_disallowed_after_boot_resources(res: VmmAction, res_name: &str) { - let (snapshot_file, memory_file) = verify_create_snapshot(false); + let (snapshot_file, memory_file) = verify_create_snapshot(false, false); let mut event_manager = EventManager::new().unwrap(); let empty_seccomp_filters = get_empty_filters(); diff --git a/tests/conftest.py b/tests/conftest.py index 3464fbc4d62..477f5c9b65c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -534,12 +534,24 @@ def mem_size_mib(): return 256 +@pytest.fixture(params=[True, False]) +def pci_enabled(request): + """Fixture that allows configuring whether a microVM will have PCI enabled or not""" + yield request.param + + def uvm_booted( - microvm_factory, guest_kernel, rootfs, cpu_template, vcpu_count=2, mem_size_mib=256 + microvm_factory, + guest_kernel, + rootfs, + cpu_template, + pci_enabled, + vcpu_count=2, + mem_size_mib=256, ): """Return a booted uvm""" uvm = microvm_factory.build(guest_kernel, rootfs) - uvm.spawn() + uvm.spawn(pci=pci_enabled) uvm.basic_config(vcpu_count=vcpu_count, mem_size_mib=mem_size_mib) uvm.set_cpu_template(cpu_template) uvm.add_net_iface() @@ -547,9 +559,13 @@ def uvm_booted( return uvm -def uvm_restored(microvm_factory, guest_kernel, rootfs, cpu_template, **kwargs): +def uvm_restored( + microvm_factory, guest_kernel, rootfs, cpu_template, pci_enabled, **kwargs +): """Return a restored uvm""" - uvm = uvm_booted(microvm_factory, guest_kernel, rootfs, cpu_template, **kwargs) + uvm = uvm_booted( + microvm_factory, guest_kernel, rootfs, cpu_template, pci_enabled, **kwargs + ) snapshot = uvm.snapshot_full() uvm.kill() uvm2 = microvm_factory.build_from_snapshot(snapshot) @@ -570,6 +586,7 @@ def uvm_any( guest_kernel, rootfs, cpu_template_any, + pci_enabled, vcpu_count, mem_size_mib, ): @@ -579,6 +596,7 @@ def uvm_any( 
guest_kernel, rootfs, cpu_template_any, + pci_enabled, vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, ) @@ -586,7 +604,13 @@ def uvm_any( @pytest.fixture def uvm_any_booted( - microvm_factory, guest_kernel, rootfs, cpu_template_any, vcpu_count, mem_size_mib + microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + pci_enabled, + vcpu_count, + mem_size_mib, ): """Return booted uvms""" return uvm_booted( @@ -594,6 +618,51 @@ def uvm_any_booted( guest_kernel, rootfs, cpu_template_any, + pci_enabled, + vcpu_count=vcpu_count, + mem_size_mib=mem_size_mib, + ) + + +@pytest.fixture +def uvm_any_with_pci( + uvm_ctor, + microvm_factory, + guest_kernel_acpi, + rootfs, + cpu_template_any, + vcpu_count, + mem_size_mib, +): + """Return booted uvms with PCI enabled""" + return uvm_ctor( + microvm_factory, + guest_kernel_acpi, + rootfs, + cpu_template_any, + True, + vcpu_count=vcpu_count, + mem_size_mib=mem_size_mib, + ) + + +@pytest.fixture +def uvm_any_without_pci( + uvm_ctor, + microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + vcpu_count, + mem_size_mib, +): + """Return booted uvms with PCI disabled""" + return uvm_ctor( + microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + False, vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, ) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 3f1aceccd20..2d812d3b488 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -198,6 +198,7 @@ def __init__( assert microvm_id is not None self._microvm_id = microvm_id + self.pci_enabled = False self.kernel_file = None self.rootfs_file = None self.ssh_key = None @@ -600,6 +601,7 @@ def spawn( log_show_origin=False, metrics_path="fc.ndjson", emit_metrics: bool = False, + pci: bool = False, ): """Start a microVM as a daemon or in a screen session.""" # pylint: disable=subprocess-run-check @@ -640,6 +642,10 @@ def spawn( # Checking the timings requires DEBUG level log messages self.time_api_requests = False + if pci: 
+            self.pci_enabled = True + self.jailer.extra_args["enable-pci"] = None + cmd = [ *self._pre_cmd, str(self.jailer_binary_path), diff --git a/tests/host_tools/memory.py b/tests/host_tools/memory.py index 93380a9321d..eacc14ac48a 100644 --- a/tests/host_tools/memory.py +++ b/tests/host_tools/memory.py @@ -8,6 +8,8 @@ import psutil +from framework.properties import global_props + class MemoryUsageExceededError(Exception): """A custom exception containing details on excessive memory usage.""" @@ -15,8 +17,8 @@ class MemoryUsageExceededError(Exception): def __init__(self, usage, threshold, *args): """Compose the error message containing the memory consumption.""" super().__init__( - f"Memory usage ({usage / 2**20:.2f} MiB) exceeded maximum threshold " - f"({threshold / 2**20} MiB)", + f"Memory usage ({usage / (1 << 20):.2f} MiB) exceeded maximum threshold " + f"({threshold / (1 << 20)} MiB)", *args, ) @@ -28,10 +30,20 @@ class MemoryMonitor(Thread): VMM memory usage. """ - # If guest memory is >3328MB, it is split in a 2nd region - X86_MEMORY_GAP_START = 3328 * 2**20 - - def __init__(self, vm, threshold=5 * 2**20, period_s=0.05): + # If guest memory is >3GiB, it is split in a 2nd region + # Gap starts at 3GiBs and is 1GiB long + X86_32BIT_MEMORY_GAP_START = 3 << 30 + X86_32BIT_MEMORY_GAP_SIZE = 1 << 30 + # If guest memory is >255GiB, it is split in a 3rd region + # Gap starts at 256 GiB and is 256GiB long + X86_64BIT_MEMORY_GAP_START = 256 << 30 + # On ARM64 we just have a single gap, but memory starts at an offset + # Gap starts at 256 GiB and is GiB long + # Memory starts at 2GiB + ARM64_64BIT_MEMORY_GAP_START = 256 << 30 + ARM64_MEMORY_START = 2 << 30 + + def __init__(self, vm, threshold=5 << 20, period_s=0.01): """Initialize monitor attributes.""" Thread.__init__(self) self._vm = vm @@ -72,7 +84,9 @@ def run(self): mem_total = 0 for mmap in mmaps: if self.is_guest_mem(mmap.size, guest_mem_bytes): + print(f"Region {mmap} is guest memory") continue + mem_total += mmap.rss 
self._current_rss = mem_total if mem_total > self.threshold: @@ -81,24 +95,55 @@ def run(self): time.sleep(self._period_s) - def is_guest_mem(self, size, guest_mem_bytes): + def is_guest_mem_x86(self, size, guest_mem_bytes): """ - If the address is recognised as a guest memory region, - return True, otherwise return False. + Checks if a region is a guest memory region based on + x86_64 physical memory layout """ + return size in ( + # memory fits before the first gap + guest_mem_bytes, + # guest memory spans at least two regions & memory fits before the second gap + self.X86_32BIT_MEMORY_GAP_START, + # guest memory spans exactly two regions + guest_mem_bytes - self.X86_32BIT_MEMORY_GAP_START, + # guest memory fills the space between the two gaps + self.X86_64BIT_MEMORY_GAP_START + - self.X86_32BIT_MEMORY_GAP_START + - self.X86_32BIT_MEMORY_GAP_SIZE, + # guest memory spans 3 regions, this is what remains past the second gap + guest_mem_bytes + - self.X86_64BIT_MEMORY_GAP_START + + self.X86_32BIT_MEMORY_GAP_SIZE, + ) - # If x86_64 guest memory exceeds 3328M, it will be split - # in 2 regions: 3328M and the rest. We have 3 cases here - # to recognise a guest memory region: - # - its size matches the guest memory exactly - # - its size is 3328M - # - its size is guest memory minus 3328M. 
+ def is_guest_mem_arch64(self, size, guest_mem_bytes): + """ + Checks if a region is a guest memory region based on + ARM64 physical memory layout + """ return size in ( + # guest memory fits before the gap guest_mem_bytes, - self.X86_MEMORY_GAP_START, - guest_mem_bytes - self.X86_MEMORY_GAP_START, + # guest memory fills the space before the gap + self.ARM64_64BIT_MEMORY_GAP_START - self.ARM64_MEMORY_START, + # guest memory spans 2 regions, this is what remains past the gap + guest_mem_bytes + - self.ARM64_64BIT_MEMORY_GAP_START + + self.ARM64_MEMORY_START, ) + def is_guest_mem(self, size, guest_mem_bytes): + """ + If the address is recognised as a guest memory region, + return True, otherwise return False. + """ + + if global_props.cpu_architecture == "x86_64": + return self.is_guest_mem_x86(size, guest_mem_bytes) + + return self.is_guest_mem_arch64(size, guest_mem_bytes) + def check_samples(self): """Check that there are no samples over the threshold.""" if self._exceeded is not None: diff --git a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py index 4b66b077839..bd0f640fe21 100644 --- a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py +++ b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py @@ -91,7 +91,6 @@ "cqm_occup_llc", "decodeassists", "extapic", - "extd_apicid", "flushbyasid", "hw_pstate", "ibs", diff --git a/tests/integration_tests/functional/test_net.py b/tests/integration_tests/functional/test_net.py index 7abf23406d5..10467affac8 100644 --- a/tests/integration_tests/functional/test_net.py +++ b/tests/integration_tests/functional/test_net.py @@ -85,9 +85,9 @@ def test_multi_queue_unsupported(uvm_plain): @pytest.fixture -def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs): +def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, pci_enabled): """Return booted and restored uvm with no CPU templates""" - 
return uvm_ctor(microvm_factory, guest_kernel, rootfs, None) + return uvm_ctor(microvm_factory, guest_kernel, rootfs, None, pci_enabled) def test_tap_offload(uvm_any): diff --git a/tests/integration_tests/functional/test_pci.py b/tests/integration_tests/functional/test_pci.py new file mode 100644 index 00000000000..dc0827b1aae --- /dev/null +++ b/tests/integration_tests/functional/test_pci.py @@ -0,0 +1,28 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for the PCI devices""" + + +def test_pci_root_present(uvm_any_with_pci): + """ + Test that a guest with PCI enabled has a PCI root device. + """ + + vm = uvm_any_with_pci + devices = vm.ssh.run("lspci").stdout.strip().split("\n") + print(devices) + assert devices[0].startswith( + "00:00.0 Host bridge: Intel Corporation Device" + ), "PCI root not found in guest" + + +def test_pci_disabled(uvm_any_without_pci): + """ + Test that a guest with PCI disabled does not have a PCI root device but still works. 
+    """ + + vm = uvm_any_without_pci + stdout = vm.ssh.run("lspci").stdout + assert ( + "00:00.0 Host bridge: Intel Corporation Device" not in stdout + ), "PCI root unexpectedly found in guest" diff --git a/tests/integration_tests/functional/test_rng.py b/tests/integration_tests/functional/test_rng.py index 1893230c51a..f2acf96735a 100644 --- a/tests/integration_tests/functional/test_rng.py +++ b/tests/integration_tests/functional/test_rng.py @@ -8,10 +8,12 @@ from host_tools.network import SSHConnection -def uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter): +def uvm_with_rng_booted( + microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled +): """Return a booted microvm with virtio-rng configured""" uvm = microvm_factory.build(guest_kernel, rootfs) - uvm.spawn(log_level="INFO") + uvm.spawn(log_level="INFO", pci=pci_enabled) uvm.basic_config(vcpu_count=2, mem_size_mib=256) uvm.add_net_iface() uvm.api.entropy.put(rate_limiter=rate_limiter) @@ -21,9 +23,13 @@ def uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter): return uvm -def uvm_with_rng_restored(microvm_factory, guest_kernel, rootfs, rate_limiter): +def uvm_with_rng_restored( + microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled +): """Return a restored uvm with virtio-rng configured""" - uvm = uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter) + uvm = uvm_with_rng_booted( + microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled + ) snapshot = uvm.snapshot_full() uvm.kill() uvm2 = microvm_factory.build_from_snapshot(snapshot) @@ -44,9 +50,9 @@ def rate_limiter(request): @pytest.fixture -def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, rate_limiter): +def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, rate_limiter, pci_enabled): """Return booted and restored uvms""" - return uvm_ctor(microvm_factory, guest_kernel, rootfs, rate_limiter) + return uvm_ctor(microvm_factory, guest_kernel, rootfs, 
rate_limiter, pci_enabled) def list_rng_available(ssh_connection: SSHConnection) -> list[str]: diff --git a/tests/integration_tests/security/test_vulnerabilities.py b/tests/integration_tests/security/test_vulnerabilities.py index 0e530123255..0e6eae283ab 100644 --- a/tests/integration_tests/security/test_vulnerabilities.py +++ b/tests/integration_tests/security/test_vulnerabilities.py @@ -222,12 +222,12 @@ def uvm_any_a(microvm_factory_a, uvm_ctor, guest_kernel, rootfs, cpu_template_an Since pytest caches fixtures, this guarantees uvm_any_a will match a vm from uvm_any. See https://docs.pytest.org/en/stable/how-to/fixtures.html#fixtures-can-be-requested-more-than-once-per-test-return-values-are-cached """ - return uvm_ctor(microvm_factory_a, guest_kernel, rootfs, cpu_template_any) + return uvm_ctor(microvm_factory_a, guest_kernel, rootfs, cpu_template_any, False) -def test_check_vulnerability_files_ab(request, uvm_any): +def test_check_vulnerability_files_ab(request, uvm_any_without_pci): """Test vulnerability files on guests""" - res_b = check_vulnerabilities_files_on_guest(uvm_any) + res_b = check_vulnerabilities_files_on_guest(uvm_any_without_pci) if global_props.buildkite_pr: # we only get the uvm_any_a fixtures if we need it uvm_a = request.getfixturevalue("uvm_any_a") @@ -239,11 +239,11 @@ def test_check_vulnerability_files_ab(request, uvm_any): def test_spectre_meltdown_checker_on_guest( request, - uvm_any, + uvm_any_without_pci, spectre_meltdown_checker, ): """Test with the spectre / meltdown checker on any supported guest.""" - res_b = spectre_meltdown_checker.get_report_for_guest(uvm_any) + res_b = spectre_meltdown_checker.get_report_for_guest(uvm_any_without_pci) if global_props.buildkite_pr: # we only get the uvm_any_a fixtures if we need it uvm_a = request.getfixturevalue("uvm_any_a") @@ -251,5 +251,5 @@ def test_spectre_meltdown_checker_on_guest( assert res_b <= res_a else: assert res_b == spectre_meltdown_checker.expected_vulnerabilities( - 
uvm_any.cpu_template_name + uvm_any_without_pci.cpu_template_name ) diff --git a/tools/devtool b/tools/devtool index 9c8b086e65f..bf6030c844f 100755 --- a/tools/devtool +++ b/tools/devtool @@ -569,8 +569,9 @@ ensure_ci_artifacts() { # Fetch all the artifacts so they are local say "Fetching CI artifacts from S3" - FC_VERSION=$(cmd_sh "cd src/firecracker/src; cargo pkgid | cut -d# -f2 | cut -d. -f1-2") - S3_URL=s3://spec.ccfc.min/firecracker-ci/v$FC_VERSION/$(uname -m) + # FC_VERSION=$(cmd_sh "cd src/firecracker/src; cargo pkgid | cut -d# -f2 | cut -d. -f1-2") + # S3_URL=s3://spec.ccfc.min/firecracker-ci/v$FC_VERSION/$(uname -m) + S3_URL=s3://spec.ccfc.min/firecracker-ci/v1.13-pcie/$(uname -m) ARTIFACTS=$MICROVM_IMAGES_DIR/$(uname -m) if [ ! -d "$ARTIFACTS" ]; then mkdir -pv $ARTIFACTS