Skip to content

Commit 45b733c

Browse files
committed
test: add test for DCGM basic functionality
This commit introduces a new test script `templates/test-dcgm.sh.j2` to verify the installation and basic functionality of NVIDIA Data Center GPU Manager (DCGM). The script checks for the `dcgmi` binary, the `nvidia-dcgm` service status, GPU discovery, and quick diagnostics. It is intended to run only on systems with NVIDIA GPUs. Signed-off-by: Yaju Cao <yacao@redhat.com>
1 parent 9321b60 commit 45b733c

2 files changed

Lines changed: 164 additions & 0 deletions

File tree

tasks/main.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,14 @@
616616
name: nvidia-dcgm
617617
enabled: true
618618

619+
# Renders the test-dcgm.sh.j2 template to test-dcgm.sh under
# __hpc_azure_tests_dir, owned by root:root with mode 0755 (executable).
- name: Install Diagnostics test script
620+
template:
621+
src: test-dcgm.sh.j2
622+
dest: "{{ __hpc_azure_tests_dir }}/test-dcgm.sh"
623+
owner: root
624+
group: root
625+
mode: '0755'
626+
619627
- name: Install RDMA packages
620628
when: hpc_install_rdma
621629
block:

templates/test-dcgm.sh.j2

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
#!/usr/bin/env bash
2+
# shellcheck disable=all
3+
{{ ansible_managed | comment }}
4+
{{ "system_role:hpc" | comment(prefix="", postfix="") }}
5+
# SPDX-License-Identifier: MIT
6+
#
7+
# Test Script: DCGM (NVIDIA Data Center GPU Manager) installation and basic functionality
8+
#
9+
10+
# Strict mode: abort on an unhandled command failure (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
11+
12+
# Verbosity flag; set to 1 by the -v option and read by verbose_log.
VERBOSE=0
13+
# Number of checks that have passed so far; incremented by pass().
PASSED=0
14+
15+
# ------------------------------------------------------------------------------
16+
# Helper Functions
17+
# ------------------------------------------------------------------------------
18+
19+
#######################################
# Report a successful check and count it.
# Globals:   PASSED (incremented by one)
# Arguments: $1 - description of the check that passed
# Outputs:   "[PASS] <description>" on stdout
#######################################
pass() {
  printf '[PASS] %s\n' "$1"
  PASSED=$(( PASSED + 1 ))
}
23+
24+
#######################################
# Report a failed check and terminate the script.
# Arguments: $1 - description of the failure
# Outputs:   "[FAIL] <description>" on stdout
# Returns:   never returns; exits the shell with status 1
#######################################
fail() {
  printf '[FAIL] %s\n' "$1"
  exit 1
}
28+
29+
#######################################
# Print a message prefixed with the current local timestamp.
# Arguments: $* - message words, joined into a single line
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
#######################################
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$*"
}
32+
33+
#######################################
# Print a timestamped message, but only when verbose mode is enabled.
# Globals:   VERBOSE (read; output is produced only when it equals 1)
# Arguments: $* - message words, joined into a single line
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout when verbose
#######################################
verbose_log() {
  # Quiet mode: nothing to print.
  [[ $VERBOSE -eq 1 ]] || return 0

  local stamp
  stamp=$(date "+%Y-%m-%d %H:%M:%S")
  printf '[%s] %s\n' "$stamp" "$*"
}
38+
39+
#######################################
# Print the help text and terminate the script.
# Arguments: $1 - exit status to terminate with (optional, default 0)
# Outputs:   help text on stdout (callers may redirect it)
# Returns:   never returns; exits the shell with the requested status
#######################################
usage() {
  local status=${1:-0}
  cat <<EOF
Usage: ${0##*/} [OPTIONS]
Test NVIDIA DCGM GPU support presence and functionality

OPTIONS:
-v Verbose output
-h Show this help message

EOF
  exit "$status"
}

# ------------------------------------------------------------------------------
# Parse Arguments
# ------------------------------------------------------------------------------

#######################################
# Parse the command-line options.
# Globals:   VERBOSE (set to 1 when -v is given)
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for -h; an error plus help text on stderr
#            for an unknown option
# Returns:   0 when parsing succeeds; exits 0 for -h and 2 for an unknown
#            option, so a mistyped invocation is never mistaken for a
#            successful test run
#######################################
parse_args() {
  local opt
  # getopts keeps its position in OPTIND; a local copy makes the function
  # safe to call more than once.
  local OPTIND=1

  # The leading ':' selects silent error reporting so we print our own message.
  while getopts ":vh" opt; do
    case "$opt" in
      v) VERBOSE=1 ;;
      h) usage ;;
      *)
        printf 'Unknown option: -%s\n' "$OPTARG" >&2
        usage 2 >&2
        ;;
    esac
  done
}

parse_args "$@"
63+
64+
# ------------------------------------------------------------------------------
65+
# Test: DCGM Binary
66+
# ------------------------------------------------------------------------------
67+
68+
#######################################
# Check that the dcgmi command is available on PATH.
# Arguments: none
# Outputs:   progress via log/verbose_log, result via pass/fail, then a
#            blank separator line
# Returns:   0 when dcgmi is found; otherwise fail terminates the script
#######################################
test_dcgmi_binary() {
  # Locals keep the description and path out of the global namespace.
  local check_desc="Checking for dcgmi binary"
  local dcgmi_path

  log "Test: $check_desc"
  # A single lookup both tests for presence and captures the resolved path.
  if dcgmi_path=$(command -v dcgmi 2>/dev/null); then
    pass "$check_desc"
    verbose_log "dcgmi path: $dcgmi_path"
  else
    fail "$check_desc: dcgmi command not found"
  fi
  echo ""
}
79+
80+
# ------------------------------------------------------------------------------
81+
# Test: DCGM Service Active
82+
# ------------------------------------------------------------------------------
83+
84+
#######################################
# Check that the nvidia-dcgm systemd unit is active.
# Arguments: none
# Outputs:   progress via log, result via pass/fail, then a blank separator
#            line; on failure the unit's `systemctl status` text is logged
# Returns:   fail terminates the script when the unit is not active
#######################################
test_dcgm_service() {
  local unit_status

  log "Test: Checking DCGM service status"
  if ! systemctl is-active --quiet nvidia-dcgm; then
    # `systemctl status` exits non-zero for an inactive unit; keep its text
    # for the log without tripping `set -e`.
    unit_status=$(systemctl status nvidia-dcgm || true)
    log "$unit_status"
    fail "DCGM service is not active"
  else
    pass "DCGM service is active"
  fi
  printf '\n'
}
94+
95+
# ------------------------------------------------------------------------------
96+
# Test: DCGM Discovery
97+
# ------------------------------------------------------------------------------
98+
99+
#######################################
# Check that `dcgmi discovery -l` reports at least one GPU.
# Arguments: none
# Outputs:   progress via log/verbose_log, result via pass/fail, then a
#            blank separator line
# Returns:   0 when one or more GPUs are reported; otherwise fail terminates
#            the script
#######################################
test_dcgm_discovery() {
  local discovery_out
  local gpu_count=0
  local line
  # NOTE(review): assumes the summary line looks like "N GPU(s) found." —
  # confirm against the deployed dcgmi version.
  local -r count_re='^([0-9]+) GPU'

  log "Test: Running 'dcgmi discovery -l'"
  # A failing dcgmi is tolerated here on purpose: the missing GPU count below
  # turns it into an explicit [FAIL] instead of a silent `set -e` abort.
  discovery_out=$(dcgmi discovery -l || true)
  verbose_log "$discovery_out"

  # Read the number the tool reports rather than counting matching lines:
  # `grep -c` exits 1 on zero matches (which aborts the script under
  # `set -e -o pipefail` before fail can run) and would count a
  # "0 GPUs found." line as one GPU.
  while IFS= read -r line; do
    if [[ "$line" =~ $count_re ]]; then
      gpu_count=${BASH_REMATCH[1]}
      break
    fi
  done <<<"$discovery_out"

  # 10# forces base 10 so a zero-padded count is not parsed as octal.
  if (( 10#$gpu_count > 0 )); then
    pass "Discovery found $gpu_count GPU(s)"
  else
    fail "Discovery did not report any GPUs"
  fi
  echo ""
}
113+
114+
# ------------------------------------------------------------------------------
115+
# Test: DCGM Diagnostic (quick)
116+
# ------------------------------------------------------------------------------
117+
118+
#######################################
# Run the quick DCGM diagnostic (`dcgmi diag -r 1`) and check its result.
# Arguments: none
# Outputs:   progress via log/verbose_log, result via pass/fail, then a
#            blank separator line
# Returns:   0 when dcgmi exits successfully and its output contains neither
#            "fail" nor "error" (case-insensitive); otherwise fail
#            terminates the script
#######################################
test_dcgm_diag() {
  local diag_out
  local diag_rc=0

  log "Test: Running 'dcgmi diag -r 1' (quick diagnostic)"
  # Capture the exit status instead of discarding it, so a run that dies
  # without printing "fail"/"error" is not reported as a pass.
  diag_out=$(dcgmi diag -r 1) || diag_rc=$?
  verbose_log "$diag_out"

  if (( diag_rc != 0 )); then
    fail "Diagnostic returned error/failure (dcgmi exit status $diag_rc)"
  # A here-string avoids `echo | grep -q`: under pipefail, grep exiting early
  # can SIGPIPE the writer and make a successful match look like no match.
  elif grep -qiE "fail|error" <<<"$diag_out"; then
    fail "Diagnostic returned error/failure"
  else
    pass "Diagnostic returned no obvious failures"
  fi
  echo ""
}
130+
131+
# ------------------------------------------------------------------------------
132+
# Main
133+
# ------------------------------------------------------------------------------
134+
135+
#######################################
# Run every DCGM check in order and print a summary.
# Globals:   PASSED (read for the summary line)
# Arguments: none are read
# Outputs:   banner and summary lines via log, plus each check's own output
# Returns:   exits 0 after all checks have run; a failing check terminates
#            the script earlier through fail
#######################################
main() {
  local rule="========================================"
  local check

  log "$rule"
  log "NVIDIA DCGM Test"
  log "$rule"

  for check in \
    test_dcgmi_binary \
    test_dcgm_service \
    test_dcgm_discovery \
    test_dcgm_diag; do
    "$check"
  done

  # Reaching this point means no check called fail.
  log "$rule"
  log "All tests passed ($PASSED)"
  log "$rule"
  exit 0
}
155+
156+
# Entry point: forward the script's command-line arguments to main.
main "$@"

0 commit comments

Comments
 (0)