Device Plugin Resource Naming Strategy

sriram-30 · sriram-30 · commit 970fceb63194 · 2025-04-17T05:23:52.000Z
The resource_naming_strategy is a new flag which can be passed to the
device plugin daemonset. The supported values for the flag are
"single" and "mixed"

Terms to understand before viewing the changes in this commit:

Homogeneous Node:
   If all GPUs in a node are following the same compute and memory
   partition style, the node is considered homogeneous

Heterogeneous Node:
   If the GPUs on a node have different compute and memory
   partition styles, the node is considered heterogeneous (put simply,
   if the node is not homogeneous)

Behaviour of Resource Naming Strategy in different node types:

Homogeneous Node:

-> If node is homogeneous and resource naming strategy is "single",
   one plugin is started using the DevicePluginManager with the last
   name as “gpu”.

   If node is homogeneous and resource naming strategy is "mixed",
   one plugin is started using the DevicePluginManager with the last
   name as the partition style present on the node.

-> The ListAndWatch function remains almost the same as it was before.
   It reports resources under a single resource name (the name will
   be either "gpu" or the partition style present on the node, e.g.
   cpx_nps1, depending on the strategy)

Heterogeneous:

-> If node is heterogeneous and resource naming strategy is "mixed", we
   invoke the DevicePluginManager to start multiple plugins for
   different partitionTypes under the names “spx_nps1”, “cpx_nps1”, etc.
   We use the devicesCount map to start plugins for the partitionTypes
   that are present in the map

-> ListAndWatch sends the devices to the plugin for their respective
   resource type depending on its partitionType. Each device has
   computePartition and memoryPartition fields in its object as shown
   before, which are used to identify which plugin to report the
   resource under (amd.com/spx_nps1, amd.com/cpx_nps1, etc.)

Note:
-> If node is heterogeneous, "single" strategy is not supported, because
   reporting multiple partition types under a single resource name
   would not accurately reflect how many GPUs of each type are
   available

-> For nodes where partitioning is not supported (MI200), irrespective
   of strategy, the resources will get reported under "amd.com/gpu"

-> If the flag is not set by the user, the default value is "single".
   This maintains backwards compatibility with the resource name used
   before the strategy was introduced (amd.com/gpu)
diff --git a/cmd/k8s-device-plugin/main.go b/cmd/k8s-device-plugin/main.go
@@ -32,25 +32,68 @@ import (
 
 var gitDescribe string
 
-func getResourceList() []string {
+type ResourceNamingStrategy string
+
+const (
+	StrategySingle ResourceNamingStrategy = "single"
+	StrategyMixed  ResourceNamingStrategy = "mixed"
+)
+
+func ParseStrategy(s string) (ResourceNamingStrategy, error) {
+	switch s {
+	case string(StrategySingle):
+		return StrategySingle, nil
+	case string(StrategyMixed):
+		return StrategyMixed, nil
+	default:
+		return "", fmt.Errorf("invalid resource naming strategy: %s", s)
+	}
+}
+
+func getResourceList(resourceNamingStrategy ResourceNamingStrategy) ([]string, error) {
 	var resources []string
 
 	// Check if the node is homogeneous
 	isHomogeneous := amdgpu.IsHomogeneous()
+	devices, deviceCountMap := amdgpu.GetAMDGPUs()
+	if len(devices) == 0 {
+		return resources, nil
+	}
 	if isHomogeneous {
-		// Homogeneous node will report only "gpu" resource
-		resources = []string{"gpu"}
+		// Homogeneous node will report only "gpu" resource if strategy is single. If strategy is mixed, it will report resources under the partition type name
+		if resourceNamingStrategy == StrategySingle {
+			resources = []string{"gpu"}
+		} else if resourceNamingStrategy == StrategyMixed {
+			if len(deviceCountMap) == 0 {
+				// If partitioning is not supported on the node, we should report resources under "gpu" regardless of the strategy
+				resources = []string{"gpu"}
+			} else {
+				for partitionType, count := range deviceCountMap {
+					if count > 0 {
+						resources = append(resources, partitionType)
+					}
+				}
+			}
+		}
 	} else {
 		// Heterogeneous node reports resources based on partition types
 		gpus := amdgpu.GetAMDGPUs()
 		deviceCountMap := amdgpu.GetAMDDeviceCountMap(gpus)
 		for partitionType, count := range deviceCountMap {
 			if count > 0 {
 				resources = append(resources, partitionType)
+		// Heterogeneous node reports resources based on partition types if strategy is mixed. Heterogeneous is not allowed if Strategy is single
+		if resourceNamingStrategy == StrategySingle {
+			return resources, fmt.Errorf("Partitions of different styles across GPUs in a node is not supported with single strategy. Please start device plugin with mixed strategy")
+		} else if resourceNamingStrategy == StrategyMixed {
+			for partitionType, count := range deviceCountMap {
+				if count > 0 {
+					resources = append(resources, partitionType)
+				}
 			}
 		}
 	}
-	return resources
+	return resources, nil
 }
 
 func main() {
@@ -68,9 +111,16 @@ func main() {
 		flag.PrintDefaults()
 	}
 	var pulse int
+	var resourceNamingStrategy string
 	flag.IntVar(&pulse, "pulse", 0, "time between health check polling in seconds.  Set to 0 to disable.")
+	flag.StringVar(&resourceNamingStrategy, "resource_naming_strategy", "single", "Resource strategy to be used: single or mixed")
 	// this is also needed to enable glog usage in dpm
 	flag.Parse()
+	strategy, err := ParseStrategy(resourceNamingStrategy)
+	if err != nil {
+		glog.Errorf("%v", err)
+		os.Exit(1)
+	}
 
 	for _, v := range versions {
 		glog.Infof("%s", v)
@@ -96,8 +146,14 @@ func main() {
 		// /sys/class/kfd only exists if ROCm kernel/driver is installed
 		var path = "/sys/class/kfd"
 		if _, err := os.Stat(path); err == nil {
-			resources := getResourceList()
-			l.ResUpdateChan <- resources
+			resources, err := getResourceList(strategy)
+			if err != nil {
+				glog.Errorf("Error occured: %v", err)
+				os.Exit(1)
+			}
+			if len(resources) > 0 {
+				l.ResUpdateChan <- resources
+			}
 		}
 	}()
 	manager.Run()
diff --git a/docs/user-guide/configuration.md b/docs/user-guide/configuration.md
@@ -29,6 +29,7 @@ The device plugin supports the following command-line flags:
 |-----|------|-------------|
 | `--kubelet-url` | `http://localhost:10250` | The URL of the kubelet for device plugin registration |
 | `--pulse` | `0` | Time between health check polling in seconds. Set to 0 to disable. |
+| `--resource_naming_strategy` | `single` | Resource naming strategy used for k8s resource reporting: `single` or `mixed`. |
 
 ## Configuration File
 
@@ -139,16 +140,71 @@ The node labeller can expose labels such as:
 
 [Download link](https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-labeller.yaml)
 
-## Resource Naming
+## Resource Naming Strategy
 
-The device plugin advertises AMD GPUs as the `amd.com/gpu` resource type. Pods can request this resource in their specifications to access AMD GPUs:
+To customize how the device plugin reports GPU resources to Kubernetes as allocatable resources, set the `--resource_naming_strategy` flag described above to either `single` or `mixed`.
+
+Before understanding each strategy, please note the definition of homogeneous and heterogeneous nodes
+
+Homogeneous node: A node whose GPUs all follow the same compute-memory partition style
+    -> Example: A node of 8 GPUs where all 8 GPUs are following CPX-NPS4 partition style
+
+Heterogeneous node: A node whose GPUs follow different compute-memory partition styles
+    -> Example: A node of 8 GPUs where 5 GPUs are following SPX-NPS1 and 3 GPUs are following CPX-NPS1
+
+### Single
+
+In `single` mode, the device plugin reports all GPUs (regardless of whether they are whole GPUs or partitions of a GPU) under the resource name `amd.com/gpu`.
+This mode is supported for homogeneous nodes but not for heterogeneous nodes.
+
+A node which has 8 GPUs, none of which are partitioned, will report its resources as:
+
+```bash
+amd.com/gpu: 8
+```
+
+A node which has 8 GPUs where all GPUs are partitioned using CPX-NPS4 style will report its resources as:
+
+```bash
+amd.com/gpu: 64
+```
+
+### Mixed
+
+In `mixed` mode, the device plugin reports each GPU under a resource name which matches its partition style.
+This mode is supported for both homogeneous and heterogeneous nodes.
+
+A node which has 8 GPUs which are all partitioned using CPX-NPS4 style will report its resources as:
+
+```bash
+amd.com/cpx_nps4: 64
+```
+
+A node which has 8 GPUs where 5 GPUs are following SPX-NPS1 and 3 GPUs are following CPX-NPS1 will report its resources as:
+
+```bash
+amd.com/spx_nps1: 5
+amd.com/cpx_nps1: 24
+``` 
+
+- If `resource_naming_strategy` is not passed using the flag, the device plugin defaults to the `single` resource naming strategy. This maintains backwards compatibility with earlier releases of the device plugin, which reported resources under the name `amd.com/gpu`.
+
+- If a node has GPUs which do not support partitioning, such as MI210, then the GPUs are reported under resource name `amd.com/gpu` regardless of the resource naming strategy
+
+Pods can request the resource as per the naming style in their specifications to access AMD GPUs:
 
 ```yaml
 resources:
   limits:
     amd.com/gpu: 1
 ```
 
+```yaml
+resources:
+  limits:
+    amd.com/cpx_nps4: 1
+```
+
 ## Security and Access Control
 
 ### Non-Privileged GPU Access
diff --git a/internal/pkg/plugin/plugin.go b/internal/pkg/plugin/plugin.go
@@ -307,9 +307,7 @@ loop:
 			// update with per device GPU health status
 			if isHomogeneous {
 				exporter.PopulatePerGPUDHealth(devs, health)
-				if p.Resource == "gpu" {
-					s.Send(&pluginapi.ListAndWatchResponse{Devices: devs})
-				}
+				s.Send(&pluginapi.ListAndWatchResponse{Devices: devs})
 			} else {
 				if devList, exists := resourceTypeDevs[p.Resource]; exists {
 					exporter.PopulatePerGPUDHealth(devList, health)