Skip to content

feat: introduce useful util functions for cc and multi-gpu ppcie #90

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 182 additions & 0 deletions nvml-wrapper/src/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,188 @@
}
}

/**
Reports, in a single call, whether the system is fully set up for confidential compute.

The result is `true` only when all three of the following hold:

* the confidential compute feature is enabled,
* the confidential compute environment is the production environment, and
* the GPUs are marked as accepting client requests.

Both underlying NVML queries are always issued, so an error from either one is
returned even if the first query alone would already make the answer `false`.
Neither query is passed this `Device`'s handle.

# Errors
* `Uninitialized`, if the library has not been successfully initialized
* `NotSupported`, if this query is not supported by the device
* `InvalidArg`, if confidential compute state is invalid
*/
pub fn check_confidential_compute_status(&self) -> Result<bool, NvmlError> {
    // Resolve both symbols up front so a missing symbol fails before any NVML call is made.
    let lib = &self.nvml.lib;
    let query_system_state = nvml_sym(lib.nvmlSystemGetConfComputeState.as_ref())?;
    let query_gpus_ready = nvml_sym(lib.nvmlSystemGetConfComputeGpusReadyState.as_ref())?;

    let mut system_state: nvmlConfComputeSystemState_t = unsafe { mem::zeroed() };
    let mut gpus_ready: std::os::raw::c_uint = 0;

    unsafe {
        nvml_try(query_system_state(&mut system_state))?;
        nvml_try(query_gpus_ready(&mut gpus_ready))?;
    }

    let requirements = [
        system_state.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED,
        system_state.environment == NVML_CC_SYSTEM_ENVIRONMENT_PROD,
        gpus_ready == NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE,
    ];

    Ok(requirements.iter().all(|&satisfied| satisfied))
}

/**
Reports whether the GPUs are currently marked as ready to accept confidential
compute client requests.

Returns `true` when NVML reports `NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE`.

Note that the underlying NVML call is not passed this `Device`'s handle, so the
value is not specific to this `Device`.

# Errors
* `Uninitialized`, if the library has not been successfully initialized
* `InvalidArg`, if device is invalid or memory is NULL
* `NotSupported`, if this query is not supported by the device
*/
#[doc(alias = "nvmlSystemGetConfComputeGpusReadyState")]
pub fn get_confidential_compute_state(&self) -> Result<bool, NvmlError> {
    let sym = nvml_sym(
        self.nvml
            .lib
            .nvmlSystemGetConfComputeGpusReadyState
            .as_ref(),
    )?;

    unsafe {
        let mut is_accepting_work: u32 = 0;
        nvml_try(sym(&mut is_accepting_work))?;
        Ok(is_accepting_work == NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE)
    }
}

/**
Marks the GPUs as ready (`true`) or not ready (`false`) to accept confidential
compute client requests.

Note that the underlying NVML call is not passed this `Device`'s handle, so the
setting is not specific to this `Device`.

# Errors
* `Uninitialized`, if the library has not been successfully initialized
* `InvalidArg`, if device is invalid or memory is NULL
* `NotSupported`, if this operation is not supported by the device
*/
#[doc(alias = "nvmlSystemSetConfComputeGpusReadyState")]
pub fn set_confidential_compute_state(&self, is_accepting_work: bool) -> Result<(), NvmlError> {
    let sym = nvml_sym(
        self.nvml
            .lib
            .nvmlSystemSetConfComputeGpusReadyState
            .as_ref(),
    )?;

    // `u32::from(bool)` is lossless: `false` becomes 0 and `true` becomes 1.
    let ready_state = u32::from(is_accepting_work);

    unsafe {
        nvml_try(sym(ready_state))?;
    }

    Ok(())
}

/**
Checks whether the confidential compute feature is enabled in the system's
confidential compute settings.

Returns `true` when the reported `ccFeature` equals `NVML_CC_SYSTEM_FEATURE_ENABLED`.

Note that the underlying NVML call is not passed this `Device`'s handle, so the
value is not specific to this `Device`.

# Errors

* `Uninitialized`, if the library has not been successfully initialized
* `InvalidArg`, if device is invalid or counters is NULL
* `NotSupported`, if the device does not support this feature
* `GpuLost`, if the target GPU has fallen off the bus or is otherwise inaccessible
* `ArgumentVersionMismatch`, if the provided version is invalid/unsupported
* `Unknown`, on any unexpected error
*/
#[doc(alias = "nvmlSystemGetConfComputeSettings")]
pub fn is_cc_enabled(&self) -> Result<bool, NvmlError> {
    let sym = nvml_sym(self.nvml.lib.nvmlSystemGetConfComputeSettings.as_ref())?;

    unsafe {
        let mut settings: nvmlSystemConfComputeSettings_t = mem::zeroed();
        // The settings struct is versioned and NVML validates the `version` field; a
        // zeroed version is rejected with `ArgumentVersionMismatch`. This mirrors
        // `NVML_STRUCT_VERSION(SystemConfComputeSettings, 1)` from `nvml.h`: the struct
        // size in the low bits and the version number shifted left by 24. The struct is
        // a handful of `u32`s, so the size cast cannot truncate.
        settings.version = (mem::size_of::<nvmlSystemConfComputeSettings_t>() as u32) | (1 << 24);

        nvml_try(sym(&mut settings))?;
        Ok(settings.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED)
    }
}

/**
Checks whether multi-GPU protected PCIe mode is selected in the system's
confidential compute settings.

Returns `true` when the reported `multiGpuMode` equals
`NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE`.

Note that the underlying NVML call is not passed this `Device`'s handle, so the
value is not specific to this `Device`.

# Errors

* `Uninitialized`, if the library has not been successfully initialized
* `InvalidArg`, if device is invalid or counters is NULL
* `NotSupported`, if the device does not support this feature
* `GpuLost`, if the target GPU has fallen off the bus or is otherwise inaccessible
* `ArgumentVersionMismatch`, if the provided version is invalid/unsupported
* `Unknown`, on any unexpected error
*/
#[doc(alias = "nvmlSystemGetConfComputeSettings")]
pub fn is_multi_gpu_protected_pcie_enabled(&self) -> Result<bool, NvmlError> {
    let sym = nvml_sym(self.nvml.lib.nvmlSystemGetConfComputeSettings.as_ref())?;

    unsafe {
        let mut settings: nvmlSystemConfComputeSettings_t = mem::zeroed();
        // The settings struct is versioned and NVML validates the `version` field; a
        // zeroed version is rejected with `ArgumentVersionMismatch`. This mirrors
        // `NVML_STRUCT_VERSION(SystemConfComputeSettings, 1)` from `nvml.h`: the struct
        // size in the low bits and the version number shifted left by 24. The struct is
        // a handful of `u32`s, so the size cast cannot truncate.
        settings.version = (mem::size_of::<nvmlSystemConfComputeSettings_t>() as u32) | (1 << 24);

        nvml_try(sym(&mut settings))?;
        Ok(settings.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE)
    }
}

/**
Checks whether confidential compute developer-tools mode is switched on in the
system's confidential compute settings.

Returns `true` when the reported `devToolsMode` equals
`NVML_CC_SYSTEM_DEVTOOLS_MODE_ON`.

Note that the underlying NVML call is not passed this `Device`'s handle, so the
value is not specific to this `Device`.

# Errors

* `Uninitialized`, if the library has not been successfully initialized
* `InvalidArg`, if device is invalid or counters is NULL
* `NotSupported`, if the device does not support this feature
* `GpuLost`, if the target GPU has fallen off the bus or is otherwise inaccessible
* `ArgumentVersionMismatch`, if the provided version is invalid/unsupported
* `Unknown`, on any unexpected error
*/
#[doc(alias = "nvmlSystemGetConfComputeSettings")]
pub fn is_cc_dev_mode_enabled(&self) -> Result<bool, NvmlError> {
    let sym = nvml_sym(self.nvml.lib.nvmlSystemGetConfComputeSettings.as_ref())?;

    unsafe {
        let mut settings: nvmlSystemConfComputeSettings_t = mem::zeroed();
        // The settings struct is versioned and NVML validates the `version` field; a
        // zeroed version is rejected with `ArgumentVersionMismatch`. This mirrors
        // `NVML_STRUCT_VERSION(SystemConfComputeSettings, 1)` from `nvml.h`: the struct
        // size in the low bits and the version number shifted left by 24. The struct is
        // a handful of `u32`s, so the size cast cannot truncate.
        settings.version = (mem::size_of::<nvmlSystemConfComputeSettings_t>() as u32) | (1 << 24);

        nvml_try(sym(&mut settings))?;
        Ok(settings.devToolsMode == NVML_CC_SYSTEM_DEVTOOLS_MODE_ON)
    }
}

/**
Gets the confidential compute capabilities reported by NVML, split into the
CPU-side capability and the GPU-side capability.

Note that the underlying NVML call is not passed this `Device`'s handle, so the
value is not specific to this `Device`.

# Errors
* `Uninitialized`, if the library has not been successfully initialized
* `InvalidArg`, if device is invalid or memory is NULL
* `NotSupported`, if this query is not supported by the device
* `Unknown`, if NVML reports a CPU or GPU capability value that this wrapper
  does not recognize
*/
#[doc(alias = "nvmlSystemGetConfComputeCapabilities")]
pub fn get_confidential_compute_capabilities(
    &self,
) -> Result<ConfidentialComputeCapabilities, NvmlError> {
    let sym = nvml_sym(self.nvml.lib.nvmlSystemGetConfComputeCapabilities.as_ref())?;

    unsafe {
        let mut capabilities: nvmlConfComputeSystemCaps_t = mem::zeroed();
        nvml_try(sym(&mut capabilities))?;

        // Translate the raw CPU capability constant into its typed equivalent.
        let cpu_caps = match capabilities.cpuCaps {
            NVML_CC_SYSTEM_CPU_CAPS_NONE => ConfidentialComputeCpuCapabilities::None,
            NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV => ConfidentialComputeCpuCapabilities::AmdSev,
            NVML_CC_SYSTEM_CPU_CAPS_INTEL_TDX => ConfidentialComputeCpuCapabilities::IntelTdx,
            // A value outside the known set cannot be represented by the enum.
            _ => return Err(NvmlError::Unknown),
        };

        // Translate the raw GPU capability constant into its typed equivalent.
        let gpus_caps = match capabilities.gpusCaps {
            NVML_CC_SYSTEM_GPUS_CC_CAPABLE => ConfidentialComputeGpuCapabilities::Capable,
            NVML_CC_SYSTEM_GPUS_CC_NOT_CAPABLE => {
                ConfidentialComputeGpuCapabilities::NotCapable
            }
            // A value outside the known set cannot be represented by the enum.
            _ => return Err(NvmlError::Unknown),
        };

        Ok(ConfidentialComputeCapabilities {
            cpu_caps,
            gpus_caps,
        })
    }
}

/**
Fetches the confidential compute attestation report for this [`Device`].

Expand Down Expand Up @@ -955,7 +1137,7 @@

* `UnexpectedVariant`, for which you can read the docs for
* `IncorrectBits`, if bits are found in a session's info flags that don't
match the flags in this wrapper

Check warning on line 1140 in nvml-wrapper/src/device.rs

View workflow job for this annotation

GitHub Actions / Clippy

doc list item overindented
* `Uninitialized`, if the library has not been successfully initialized
* `NotSupported`, if this `Device` does not support this feature
* `GpuLost`, if this `Device` has fallen off the bus or is otherwise inaccessible
Expand Down Expand Up @@ -3242,7 +3424,7 @@

* `Uninitialized`, if the library has not been successfully initialized
* `IncorrectBits`, if NVML returns any bits that do not correspond to flags in
`ThrottleReasons`

Check warning on line 3427 in nvml-wrapper/src/device.rs

View workflow job for this annotation

GitHub Actions / Clippy

doc list item without indentation
* `GpuLost`, if this `Device` has fallen off the bus or is otherwise inaccessible
* `Unknown`, on any unexpected error

Expand Down Expand Up @@ -4289,7 +4471,7 @@
* `Uninitialized`, if the library has not been successfully initialized
* `InvalidArg`, if the `Device` is invalid
* `NotSupported`, if this `Device` does not support this feature or accounting mode
is disabled

Check warning on line 4474 in nvml-wrapper/src/device.rs

View workflow job for this annotation

GitHub Actions / Clippy

doc list item without indentation
* `Unknown`, on any unexpected error

# Device Support
Expand Down Expand Up @@ -4351,7 +4533,7 @@
* `Uninitialized`, if the library has not been successfully initialized
* `InvalidArg`, if the `Device` is invalid
* `NotSupported`, if this `Device` does not support this feature or accounting
mode is disabled

Check warning on line 4536 in nvml-wrapper/src/device.rs

View workflow job for this annotation

GitHub Actions / Clippy

doc list item without indentation
* `Unknown`, on any unexpected error
*/
// Checked against local
Expand Down Expand Up @@ -4408,9 +4590,9 @@
Note:
* Accounting mode needs to be on. See `.is_accounting_enabled()`.
* Only compute and graphics applications stats can be queried. Monitoring
applications can't be queried since they don't contribute to GPU utilization.

Check warning on line 4593 in nvml-wrapper/src/device.rs

View workflow job for this annotation

GitHub Actions / Clippy

doc list item without indentation
* If a PID collision occurs, the stats of the latest process (the one that
terminated last) will be reported.

Check warning on line 4595 in nvml-wrapper/src/device.rs

View workflow job for this annotation

GitHub Actions / Clippy

doc list item without indentation

# Errors

Expand All @@ -4418,7 +4600,7 @@
* `InvalidArg`, if the `Device` is invalid
* `NotFound`, if the process stats were not found
* `NotSupported`, if this `Device` does not support this feature or accounting
mode is disabled

Check warning on line 4603 in nvml-wrapper/src/device.rs

View workflow job for this annotation

GitHub Actions / Clippy

doc list item without indentation
* `Unknown`, on any unexpected error

# Device Support
Expand Down Expand Up @@ -4452,8 +4634,8 @@

Note:
* This setting is not persistent and will default to disabled after the driver
unloads. Enable persistence mode to be sure the setting doesn't switch off

Check warning on line 4637 in nvml-wrapper/src/device.rs

View workflow job for this annotation

GitHub Actions / Clippy

doc list item without indentation
to disabled.

Check warning on line 4638 in nvml-wrapper/src/device.rs

View workflow job for this annotation

GitHub Actions / Clippy

doc list item without indentation
* Enabling accounting mode has no negative impact on GPU performance.
* Disabling accounting clears accounting information for all PIDs

Expand Down Expand Up @@ -4526,7 +4708,7 @@
* `Uninitialized`, if the library has not been successfully initialized
* `InvalidArg`, if the `Device` is invalid or `api_type` is invalid (shouldn't occur?)
* `NotSupported`, if this `Device` does not support changing API restrictions or
this `Device` does not support the feature that API restrictions are being set for

Check warning on line 4711 in nvml-wrapper/src/device.rs

View workflow job for this annotation

GitHub Actions / Clippy

doc list item without indentation
(e.g. enabling/disabling auto boosted clocks is not supported by this `Device`).
* `NoPermission`, if the user doesn't have permission to perform this operation
* `GpuLost`, if this `Device` has fallen off the bus or is otherwise inaccessible
Expand Down
32 changes: 32 additions & 0 deletions nvml-wrapper/src/structs/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,38 @@ use crate::enum_wrappers::device::OperationMode;
#[cfg(feature = "serde")]
use serde_derive::{Deserialize, Serialize};

/// Returned from `Device.get_confidential_compute_capabilities()`.
///
/// Pairs the CPU-side and GPU-side confidential compute capabilities.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct ConfidentialComputeCapabilities {
    /// The CPU confidential compute capability (none, AMD SEV or Intel TDX).
    pub cpu_caps: ConfidentialComputeCpuCapabilities,
    /// The GPU confidential compute capability (capable or not capable).
    pub gpus_caps: ConfidentialComputeGpuCapabilities,
}

/// The possible CPU capabilities for confidential compute (either None, AMD SEV or Intel TDX).
///
/// This is a fieldless enum, so it is `Copy` and can be passed around by value.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum ConfidentialComputeCpuCapabilities {
    /// No CPU capabilities.
    None,
    /// AMD SEV confidential compute capabilities.
    AmdSev,
    /// Intel TDX confidential compute capabilities.
    IntelTdx,
}

/// The possible GPU capabilities for confidential compute (either not capable or capable).
///
/// This is a fieldless enum, so it is `Copy` and can be passed around by value.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum ConfidentialComputeGpuCapabilities {
    /// Capable.
    Capable,
    /// Not capable.
    NotCapable,
}

/// Returned from `Device.confidential_compute_gpu_attestation_report_bytes()`
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
Expand Down
Loading