|
| 1 | +#!/usr/bin/env bash |
| 2 | +# shellcheck disable=all |
| 3 | +{{ ansible_managed | comment }} |
| 4 | +{{ "system_role:hpc" | comment(prefix="", postfix="") }} |
| 5 | +# SPDX-License-Identifier: MIT |
| 6 | +# |
| 7 | +# Test Script: DCGM (NVIDIA Data Center GPU Manager) installation and basic functionality |
| 8 | +# |
| 9 | + |
# Strict mode: abort on an unhandled command failure, on use of an unset
# variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Script-wide state.
VERBOSE=0 # switched to 1 by the -v command-line option
PASSED=0  # running count of checks recorded by pass()
| 14 | + |
| 15 | +# ------------------------------------------------------------------------------ |
| 16 | +# Helper Functions |
| 17 | +# ------------------------------------------------------------------------------ |
| 18 | + |
# Record a passing check.
# Globals:   PASSED (incremented by one)
# Arguments: $1 - description of the check that passed
# Outputs:   "[PASS] <description>" on stdout
pass() {
  printf '[PASS] %s\n' "$1"
  # The ':' builtin always succeeds, so the increment can never trip errexit.
  : "$((PASSED += 1))"
}
| 23 | + |
# Report a failed check and terminate the script.
# Arguments: $1 - description of the failure
# Outputs:   "[FAIL] <description>" on stdout
# Exits:     always, with status 1
fail() {
  printf '[FAIL] %s\n' "$1"
  exit 1
}
| 28 | + |
# Print a message prefixed with the current local date and time.
# Arguments: $@ - words of the message (joined into a single string)
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
| 32 | + |
# Print a timestamped message, but only when verbose mode is enabled.
# Globals:   VERBOSE (read; output is produced only when it equals 1)
# Arguments: $@ - words of the message (joined into a single string)
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout, or nothing
verbose_log() {
  # Quiet mode: succeed without printing anything.
  [[ $VERBOSE -eq 1 ]] || return 0
  printf '[%s] %s\n' "$(date "+%Y-%m-%d %H:%M:%S")" "$*"
}
| 38 | + |
# Print the command-line help and terminate the script.
# Outputs: usage text on stdout
# Exits:   always, with status 0
usage() {
  printf 'Usage: %s [OPTIONS]\n' "$(basename "$0")"
  # One argument per output line; empty arguments produce the blank lines.
  printf '%s\n' \
    'Test NVIDIA DCGM GPU support presence and functionality' \
    '' \
    'OPTIONS:' \
    ' -v Verbose output' \
    ' -h Show this help message' \
    ''
  exit 0
}
| 51 | + |
| 52 | +# ------------------------------------------------------------------------------ |
| 53 | +# Parse Arguments |
| 54 | +# ------------------------------------------------------------------------------ |
| 55 | + |
# Parse the command-line options into the script's global settings.
# Globals:   VERBOSE (set to 1 by -v)
# Arguments: $@ - the script's command-line arguments
# Outputs:   usage text on stdout for -h, on stderr for an unknown option
# Exits:     0 after -h (via usage); 2 on an unrecognized option
parse_args() {
  local opt
  local OPTIND=1 # start scanning at the first argument of this call

  while getopts "vh" opt; do
    case "$opt" in
      v) VERBOSE=1 ;;
      h) usage ;;
      *)
        # getopts has already reported the offending option. usage ends with
        # `exit 0`, so run it in a subshell and then exit non-zero ourselves:
        # a mistyped flag must not look like a successful test run.
        (usage) >&2 || true
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
| 63 | + |
| 64 | +# ------------------------------------------------------------------------------ |
| 65 | +# Test: DCGM Binary |
| 66 | +# ------------------------------------------------------------------------------ |
| 67 | + |
# Check that the dcgmi command can be resolved by the shell.
# Outputs: a log line, then a [PASS]/[FAIL] line, then a blank line; the
#          resolved command is logged as well when verbose mode is on
# Exits:   via fail (status 1) when dcgmi cannot be found
test_dcgmi_binary() {
  # Keep both helpers local so they do not leak into the script's globals.
  local log_msg="Checking for dcgmi binary"
  local dcgmi_path

  log "Test: $log_msg"
  # Resolve the command once and reuse the result for the verbose message.
  if dcgmi_path=$(command -v dcgmi 2>/dev/null); then
    pass "$log_msg"
    verbose_log "dcgmi path: $dcgmi_path"
  else
    fail "$log_msg: dcgmi command not found"
  fi
  echo ""
}
| 79 | + |
| 80 | +# ------------------------------------------------------------------------------ |
| 81 | +# Test: DCGM Service Active |
| 82 | +# ------------------------------------------------------------------------------ |
| 83 | + |
# Check that the nvidia-dcgm systemd unit is currently active.
# Outputs: a log line, then a [PASS]/[FAIL] line, then a blank line; when
#          the unit is not active its `systemctl status` text is logged first
# Exits:   via fail (status 1) when the unit is not active
test_dcgm_service() {
  local -r unit="nvidia-dcgm"

  log "Test: Checking DCGM service status"
  if ! systemctl is-active --quiet "$unit"; then
    # `systemctl status` can return non-zero for a non-running unit; the
    # text is wanted regardless, so its exit status is deliberately ignored.
    log "$(systemctl status "$unit" || true)"
    fail "DCGM service is not active"
  else
    pass "DCGM service is active"
  fi
  printf '\n'
}
| 94 | + |
| 95 | +# ------------------------------------------------------------------------------ |
| 96 | +# Test: DCGM Discovery |
| 97 | +# ------------------------------------------------------------------------------ |
| 98 | + |
# Check that `dcgmi discovery -l` reports at least one GPU.
# Globals: DISCOVERY_OUT (written: captured stdout of the discovery command)
#          GPU_COUNT     (written: GPU count parsed from that output)
# Outputs: a log line, then a [PASS]/[FAIL] line, then a blank line; the raw
#          discovery output is logged as well when verbose mode is on
# Exits:   via fail (status 1) when no GPU is reported
test_dcgm_discovery() {
  local line
  local -r summary_re='^([0-9]+) GPU'

  log "Test: Running 'dcgmi discovery -l'"
  # Best effort: a failing dcgmi yields no summary line, which is reported
  # as a failure below, so its exit status is deliberately ignored here.
  DISCOVERY_OUT=$(dcgmi discovery -l || true)
  verbose_log "$DISCOVERY_OUT"

  # Take the number from the first line that starts with "<N> GPU" (dcgmi is
  # expected to print a summary such as "4 GPUs found."). Counting matching
  # lines instead would report 1 even for "0 GPUs found.", and grep's
  # non-zero status on no match would abort the script under errexit/pipefail
  # before the failure could be reported.
  GPU_COUNT=0
  while IFS= read -r line; do
    if [[ "$line" =~ $summary_re ]]; then
      GPU_COUNT=${BASH_REMATCH[1]}
      break
    fi
  done <<<"$DISCOVERY_OUT"

  # 10# forces base 10 so a zero-padded count is not parsed as octal.
  if ((10#$GPU_COUNT > 0)); then
    pass "Discovery found $GPU_COUNT GPU(s)"
  else
    fail "Discovery did not report any GPUs"
  fi
  echo ""
}
| 113 | + |
| 114 | +# ------------------------------------------------------------------------------ |
| 115 | +# Test: DCGM Diagnostic (quick) |
| 116 | +# ------------------------------------------------------------------------------ |
| 117 | + |
# Run the quick DCGM diagnostic (`dcgmi diag -r 1`) and check its result.
# Globals: DIAG_OUT (written: captured stdout of the diagnostic)
# Outputs: a log line, then a [PASS]/[FAIL] line, then a blank line; the raw
#          diagnostic output is logged as well when verbose mode is on
# Exits:   via fail (status 1) when dcgmi exits non-zero or its output
#          mentions "fail" or "error" (case-insensitive)
test_dcgm_diag() {
  local diag_rc=0

  log "Test: Running 'dcgmi diag -r 1' (quick diagnostic)"
  # Keep the exit status instead of discarding it: a diagnostic that crashes
  # or prints nothing must not be reported as a pass.
  DIAG_OUT=$(dcgmi diag -r 1) || diag_rc=$?
  verbose_log "$DIAG_OUT"

  if ((diag_rc != 0)); then
    fail "Diagnostic returned error/failure (dcgmi exit code $diag_rc)"
  # A here-string instead of `echo | grep -q`: with pipefail, grep -q closing
  # the pipe early can make the pipeline fail and hide a real match.
  elif grep -qiE "fail|error" <<<"$DIAG_OUT"; then
    fail "Diagnostic returned error/failure"
  else
    pass "Diagnostic returned no obvious failures"
  fi
  echo ""
}
| 130 | + |
| 131 | +# ------------------------------------------------------------------------------ |
| 132 | +# Main |
| 133 | +# ------------------------------------------------------------------------------ |
| 134 | + |
# Run every DCGM check in order and print a summary.
# Globals: PASSED (read for the final summary line)
# Outputs: banner, each check's own output, and the summary, on stdout
# Exits:   0 once all checks have run; a failing check ends the script
#          earlier through fail
main() {
  local -r rule="========================================"
  local check

  log "$rule"
  log "NVIDIA DCGM Test"
  log "$rule"

  for check in \
    test_dcgmi_binary \
    test_dcgm_service \
    test_dcgm_discovery \
    test_dcgm_diag; do
    "$check"
  done

  # Reaching this point means none of the checks terminated the script.
  log "$rule"
  log "All tests passed ($PASSED)"
  log "$rule"
  exit 0
}
| 155 | + |
| 156 | +main "$@" |
0 commit comments