Skip to content

Commit c6f7ff4

Browse files
committed
Switch gpu-operator-validator to distroless base image
Signed-off-by: Christopher Desiniotis <[email protected]>
1 parent 707a29f commit c6f7ff4

File tree

2 files changed

+11
-25
lines changed

2 files changed

+11
-25
lines changed

validator/Dockerfile

Lines changed: 6 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -51,22 +51,16 @@ FROM ${CUDA_SAMPLE_IMAGE} AS sample-builder
5151
RUN mkdir /artifacts
5252
RUN cp /cuda-samples/vectorAdd /artifacts/vectorAdd
5353

54-
FROM nvcr.io/nvidia/cuda:12.8.1-base-ubi9
54+
# The C/C++ distroless image is used as a base since the CUDA vectorAdd
55+
# sample application depends on C/C++ libraries.
56+
FROM nvcr.io/nvidia/distroless/cc:v3.1.7-dev
5557

56-
# Remove CUDA libs(compat etc) in favor of libs installed by the NVIDIA driver
57-
RUN dnf remove -y cuda-*
58+
USER 0:0
5859

59-
RUN dnf install -y \
60-
kmod \
61-
pciutils && \
62-
rm -rf /var/cache/yum/*
63-
64-
RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE
6560
COPY --from=build /artifacts/nvidia-validator /usr/bin/nvidia-validator
6661
COPY --from=sample-builder /artifacts/vectorAdd /usr/bin/vectorAdd
67-
RUN mkdir -p /var/nvidia/manifests
68-
COPY --from=build /artifacts/plugin-workload-validation.yaml /var/nvidia/manifests
69-
COPY --from=build /artifacts/cuda-workload-validation.yaml /var/nvidia/manifests
62+
COPY --from=build /artifacts/plugin-workload-validation.yaml /var/nvidia/manifests/plugin-workload-validation.yaml
63+
COPY --from=build /artifacts/cuda-workload-validation.yaml /var/nvidia/manifests/cuda-workload-validation.yaml
7064

7165
ENV NVIDIA_DISABLE_REQUIRE="true"
7266
ENV NVIDIA_VISIBLE_DEVICES=void
@@ -83,12 +77,3 @@ LABEL name="NVIDIA Validator for the GPU Operator"
8377
LABEL summary="NVIDIA Validator for the GPU Operator"
8478
LABEL description="See summary"
8579
LABEL vsc-ref=${GIT_COMMIT}
86-
87-
# Install / upgrade packages here that are required to resolve CVEs
88-
ARG CVE_UPDATES
89-
RUN if [ -n "${CVE_UPDATES}" ]; then \
90-
dnf update -y ${CVE_UPDATES} && \
91-
rm -rf /var/cache/yum/*; \
92-
fi
93-
94-
ENTRYPOINT ["/bin/bash"]

validator/main.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,7 @@ func runCommandWithWait(command string, args []string, sleepSeconds int, silent
616616
fmt.Printf("running command %s with args %v\n", command, args)
617617
err := cmd.Run()
618618
if err != nil {
619+
log.Warningf("error running command: %v", err)
619620
fmt.Printf("command failed, retrying after %d seconds\n", sleepSeconds)
620621
time.Sleep(time.Duration(sleepSeconds) * time.Second)
621622
continue
@@ -649,7 +650,7 @@ func setEnvVar(envvars []string, key, value string) []string {
649650
// For driver container installs, check existence of .driver-ctr-ready to confirm running driver
650651
// container has completed and is in Ready state.
651652
func assertDriverContainerReady(silent bool) error {
652-
command := "bash"
653+
command := "sh"
653654
args := []string{"-c", "stat /run/nvidia/validations/.driver-ctr-ready"}
654655

655656
if withWaitFlag {
@@ -932,7 +933,7 @@ func (n *NvidiaFs) validate() error {
932933

933934
func (n *NvidiaFs) runValidation(silent bool) error {
934935
// check for nvidia_fs module to be loaded
935-
command := "bash"
936+
command := "sh"
936937
args := []string{"-c", "lsmod | grep nvidia_fs"}
937938

938939
if withWaitFlag {
@@ -1067,7 +1068,7 @@ func (m *MOFED) validate() error {
10671068

10681069
func (m *MOFED) runValidation(silent bool) error {
10691070
// check for mlx5_core module to be loaded
1070-
command := "bash"
1071+
command := "sh"
10711072
args := []string{"-c", "lsmod | grep mlx5_core"}
10721073

10731074
// If MOFED container is running then use readiness flag set by the driver container instead
@@ -1632,7 +1633,7 @@ func (c *CCManager) setKubeClient(kubeClient kubernetes.Interface) {
16321633

16331634
// Check that the ccManager container is ready after applying required ccMode
16341635
func assertCCManagerContainerReady(silent, withWaitFlag bool) error {
1635-
command := "bash"
1636+
command := "sh"
16361637
args := []string{"-c", "stat /run/nvidia/validations/.cc-manager-ctr-ready"}
16371638

16381639
if withWaitFlag {

0 commit comments

Comments
 (0)