#!/bin/bash

# MongoDB Health Checker
# Reports whether MongoDB is struggling under migration load.

# ANSI colour escapes. They are stored as literal backslash sequences and
# only become real escape characters when printed with `echo -e`.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
CYAN="\033[0;36m"
NC="\033[0m" # reset / "no colour"

# Where to find MongoDB: namespace, pod, container, and the keyfile (a path
# inside the container) whose contents are used as the __system password.
NAMESPACE_MONGODB='mongodb'
MONGO_POD='app-mongodb-0'
MONGO_CONTAINER='mongod'
KEYFILE='/var/lib/mongodb-mms-automation/authentication/keyfile'

# Alerting thresholds
QUEUE_WARN=5    # queued ops before warning
QUEUE_CRIT=20   # queued ops before critical
DIRTY_WARN=20   # WiredTiger dirty cache % before warning
DIRTY_CRIT=40   # WiredTiger dirty cache % before critical
SLOW_OPS_WARN=3 # long-running ops (>5s) before warning

#######################################
# Run a JavaScript snippet with mongosh inside the MongoDB container and
# print the LAST non-empty line of its output.
#
# Because only the last line survives, callers must make their script print
# its whole result on a single line.
#
# Globals:   NAMESPACE_MONGODB, MONGO_POD, MONGO_CONTAINER, KEYFILE (read)
# Arguments: $1 - JavaScript source to execute
# Outputs:   last non-empty output line (mongosh stdout+stderr merged, lines
#            containing "Could not access" dropped); nothing if kubectl fails
# Returns:   exit status of the final `tail` in the pipeline
#######################################
mongosh_exec() {
  local script="$1"
  local encoded

  # Base64-encode so the JS never has to survive shell quoting on its way
  # into the container. `base64 -w0` is GNU-only, so strip the line wraps
  # with tr instead; that works with both GNU and BSD base64.
  encoded=$(printf '%s' "$script" | base64 | tr -d '\n')

  # The remote script is single-quoted and receives the payload and keyfile
  # path as positional parameters ($1, $2), so neither is re-parsed by the
  # remote shell. The script is written into a private mktemp directory
  # rather than a fixed /tmp name, and keeps a .js suffix.
  # NOTE(review): the keyfile contents are still passed to mongosh via -p and
  # are therefore visible in the container's process list while it runs.
  kubectl exec -n "$NAMESPACE_MONGODB" "$MONGO_POD" -c "$MONGO_CONTAINER" -- \
    bash -c '
      workdir=$(mktemp -d) || exit 1
      if printf "%s" "$1" | base64 -d > "$workdir/mhc.js"; then
        mongosh --authenticationDatabase local -u __system \
          -p "$(cat "$2")" --quiet --norc "$workdir/mhc.js" 2>&1
      fi
      rm -rf -- "$workdir"
    ' _ "$encoded" "$KEYFILE" \
    2>/dev/null | grep -v "Could not access" | grep -v "^$" | tail -1
}

# Report banner: blank line, rule, title, rule, blank line.
# printf reuses the format for every argument, so each one becomes its own
# line with backslash escapes expanded (same rendering as `echo -e`).
printf '%b\n' \
  "" \
  "${CYAN}========================================${NC}" \
  "${CYAN} MONGODB HEALTH CHECK${NC}" \
  "${CYAN}========================================${NC}" \
  ""

# ── 1. serverStatus: lock queue, connections, WiredTiger cache, opcounters ──

#######################################
# Render the serverStatus summary from a single CSV line.
#
# Globals:   RED, GREEN, YELLOW, NC, QUEUE_WARN, QUEUE_CRIT, DIRTY_WARN,
#            DIRTY_CRIT (read); QUEUE_READERS, QUEUE_WRITERS, CONNS_CURRENT,
#            CONNS_AVAILABLE, DIRTY_PCT, CACHE_PCT, OPS_INSERT, OPS_QUERY,
#            OPS_UPDATE, OPS_DELETE, QUEUE_TOTAL, DIRTY_INT (written)
# Arguments: $1 - "readers,writers,connCurrent,connAvailable,dirtyPct,
#                  cachePct,inserts,queries,updates,deletes"
# Outputs:   the formatted report, or a failure line, on stdout
# Returns:   0 if the report was rendered, 1 if $1 is not in that shape
#######################################
report_server_status() {
  local raw="$1"
  local int_re='^[0-9]+$'
  local pct_re='^[0-9]+(\.[0-9]+)?$'

  # One read replaces ten `echo | cut` pipelines.
  IFS=',' read -r QUEUE_READERS QUEUE_WRITERS CONNS_CURRENT CONNS_AVAILABLE \
    DIRTY_PCT CACHE_PCT OPS_INSERT OPS_QUERY OPS_UPDATE OPS_DELETE <<< "$raw"

  # Every value that is used in arithmetic below must really be a number;
  # an error message that merely starts with a digit must not get through.
  if ! [[ "$QUEUE_READERS" =~ $int_re && "$QUEUE_WRITERS" =~ $int_re \
    && "$DIRTY_PCT" =~ $pct_re ]]; then
    echo -e "${RED}[X] Failed to get serverStatus${NC}"
    return 1
  fi

  QUEUE_TOTAL=$((QUEUE_READERS + QUEUE_WRITERS))

  # Pick the severity once, then print readers/writers in that colour.
  local queue_color="$GREEN"
  local queue_note=""
  if ((QUEUE_TOTAL >= QUEUE_CRIT)); then
    queue_color="$RED"
    queue_note="CRITICAL: $QUEUE_TOTAL operations queued — MongoDB is choking"
  elif ((QUEUE_TOTAL >= QUEUE_WARN)); then
    queue_color="$YELLOW"
    queue_note="WARNING: $QUEUE_TOTAL operations queued"
  fi

  echo -e "${YELLOW}Global Lock Queue:${NC}"
  echo -e " Readers waiting: ${queue_color}$QUEUE_READERS${NC}"
  echo -e " Writers waiting: ${queue_color}$QUEUE_WRITERS${NC}"
  if [[ -n "$queue_note" ]]; then
    echo -e " ${queue_color}${queue_note}${NC}"
  fi

  echo ""
  echo -e "${YELLOW}Connections:${NC}"
  echo -e " Current: $CONNS_CURRENT"
  echo -e " Available: $CONNS_AVAILABLE"

  echo ""
  echo -e "${YELLOW}WiredTiger Cache:${NC}"
  # Truncate "12.3" to "12" with parameter expansion. This needs no external
  # tool, so a missing `bc` can no longer silently turn the value into 0.
  DIRTY_INT=${DIRTY_PCT%%.*}
  echo -e " Used: ${CACHE_PCT}%"
  if ((DIRTY_INT >= DIRTY_CRIT)); then
    echo -e " Dirty: ${RED}${DIRTY_PCT}%${NC} ← ${RED}CRITICAL: write pressure, eviction may stall operations${NC}"
  elif ((DIRTY_INT >= DIRTY_WARN)); then
    echo -e " Dirty: ${YELLOW}${DIRTY_PCT}%${NC} ← ${YELLOW}WARNING: elevated write pressure${NC}"
  else
    echo -e " Dirty: ${GREEN}${DIRTY_PCT}%${NC}"
  fi

  echo ""
  echo -e "${YELLOW}Operation Counters (cumulative since start):${NC}"
  echo -e " Inserts: $OPS_INSERT"
  echo -e " Queries: $OPS_QUERY"
  echo -e " Updates: $OPS_UPDATE"
  echo -e " Deletes: $OPS_DELETE"
  return 0
}

# The script prints exactly one comma-separated line, in the field order
# report_server_status expects.
HEALTH_RAW=$(mongosh_exec "
var s = db.serverStatus();
var q = s.globalLock.currentQueue;
var c = s.connections;
var wt = s.wiredTiger.cache;
var maxCache = wt['maximum bytes configured'];
var dirtyPct = maxCache > 0
  ? (wt['tracked dirty bytes in the cache'] * 100 / maxCache).toFixed(1) : '0';
var cachePct = maxCache > 0
  ? (wt['bytes currently in the cache'] * 100 / maxCache).toFixed(1) : '0';
var opcnt = s.opcounters;
print([q.readers, q.writers, c.current, c.available, dirtyPct, cachePct,
  opcnt.insert, opcnt.query, opcnt.update, opcnt.delete].join(','));
")

report_server_status "$HEALTH_RAW"

# ── 2. Long-running operations on countly_drill ────────────────────────────

#######################################
# Render the slow-operation summary from a single tab-separated line.
#
# Globals:   RED, GREEN, YELLOW, NC, SLOW_OPS_WARN (read); SLOW_COUNT (written)
# Arguments: $1 - "<count>" optionally followed by tab-separated records,
#                 each of the form "<secs>s|<op>|<namespace>"
# Outputs:   the summary, one line per operation, on stdout
# Returns:   0 if rendered, 1 if $1 is empty or does not start with a count
#######################################
report_slow_ops() {
  local raw="$1"
  local -a fields=()
  IFS=$'\t' read -r -a fields <<< "$raw"

  # Anything that is not a plain count (empty reply, error text) is a failure,
  # not "<error text> slow op(s) running".
  local count="${fields[0]:-}"
  if ! [[ "$count" =~ ^[0-9]+$ ]]; then
    echo -e " ${RED}Unable to get currentOp${NC}"
    return 1
  fi
  SLOW_COUNT=$count

  if ((count == 0)); then
    echo -e " ${GREEN}None${NC}"
    return 0
  fi

  if ((count >= SLOW_OPS_WARN)); then
    echo -e " ${YELLOW}WARNING: $count slow ops running${NC}"
  else
    echo -e " $count slow op(s) running:"
  fi

  # Everything after the count is one record per operation.
  local record secs op ns
  for record in "${fields[@]:1}"; do
    IFS='|' read -r secs op ns <<< "$record"
    if [[ -n "$secs" ]]; then
      echo -e " ${secs} ${op} ${ns}"
    fi
  done
  return 0
}

echo ""
echo -e "${YELLOW}Slow Operations on countly_drill (>5s):${NC}"
# mongosh_exec keeps only the last output line, so the count and every record
# are joined with tabs onto ONE line instead of one line per operation.
SLOW_RAW=$(mongosh_exec "
var ops = db.currentOp({ secs_running: { \$gt: 5 }, ns: /^countly_drill/ });
var recs = ops.inprog.map(function(op){
  return op.secs_running + 's|' + op.op + '|' + op.ns;
});
print([ops.inprog.length].concat(recs).join('\t'));
")

report_slow_ops "$SLOW_RAW"

# Report footer: blank line, rule, generation timestamp, rule, blank line.
# One printf call; the format is applied to each argument in turn and %b
# expands the colour escapes the same way `echo -e` does.
printf '%b\n' \
  "" \
  "${CYAN}========================================${NC}" \
  "${BLUE}Report generated at: $(date '+%Y-%m-%d %H:%M:%S')${NC}" \
  "${CYAN}========================================${NC}" \
  ""