Skip to content

Commit be05816

Browse files
Merge pull request #3 from Countly/tier1
Tier1
2 parents 1b237a0 + c71701a commit be05816

File tree

11 files changed

+751
-94
lines changed

11 files changed

+751
-94
lines changed

charts/countly/values.schema.json

Lines changed: 350 additions & 94 deletions
Large diffs are not rendered by default.
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#!/bin/bash
#
# MongoDB Health Checker
# Detects whether MongoDB is choking under migration load.
#
# Requires: kubectl access to the cluster hosting the MongoDB pod.
# Output:   colorized health report on stdout.

# ANSI color codes for report output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'  # reset

# Configuration — where the MongoDB instance lives in the cluster
readonly NAMESPACE_MONGODB="mongodb"
readonly MONGO_POD="app-mongodb-0"
readonly MONGO_CONTAINER="mongod"
# Agent keyfile inside the container; used to authenticate as __system
readonly KEYFILE="/var/lib/mongodb-mms-automation/authentication/keyfile"

# Alert thresholds
readonly QUEUE_WARN=5     # queued ops before warning
readonly QUEUE_CRIT=20    # queued ops before critical
readonly DIRTY_WARN=20    # WiredTiger dirty cache % before warning
readonly DIRTY_CRIT=40    # WiredTiger dirty cache % before critical
readonly SLOW_OPS_WARN=3  # long-running ops (>5s) before warning
#######################################
# Run a mongosh JS snippet inside the MongoDB container and print its
# last non-empty output line.
# Globals:   NAMESPACE_MONGODB, MONGO_POD, MONGO_CONTAINER, KEYFILE (read)
# Arguments: $1 - JavaScript source to execute
# Outputs:   last meaningful line of mongosh output on stdout
#######################################
mongosh_exec() {
  local script="$1"
  # Base64-encode to avoid all shell quoting issues when passing JS to the
  # container. base64 output is [A-Za-z0-9+/=] only, so the unquoted
  # expansion inside the remote command is safe.
  # NOTE(review): `base64 -w0` is GNU coreutils; on BSD/macOS use `base64` alone.
  local encoded
  encoded=$(printf '%s' "$script" | base64 -w0)
  kubectl exec -n "$NAMESPACE_MONGODB" "$MONGO_POD" -c "$MONGO_CONTAINER" -- \
    bash -c "echo $encoded | base64 -d > /tmp/_mhc.js && \
      mongosh --authenticationDatabase local -u __system -p \"\$(cat $KEYFILE)\" --quiet --norc /tmp/_mhc.js 2>&1; \
      rm -f /tmp/_mhc.js" \
    2>/dev/null | grep -vE 'Could not access|^$' | tail -1
}
echo ""
echo -e "${CYAN}========================================${NC}"
echo -e "${CYAN} MONGODB HEALTH CHECK${NC}"
echo -e "${CYAN}========================================${NC}"
echo ""

# ── 1. Server status: lock queue, connections, cache, op counters ──────────
# A single mongosh round-trip prints one CSV line carrying all ten metrics.
HEALTH_RAW=$(mongosh_exec "
var s = db.serverStatus();
var q = s.globalLock.currentQueue;
var c = s.connections;
var wt = s.wiredTiger.cache;
var maxCache = wt['maximum bytes configured'];
var dirtyPct = maxCache > 0
    ? (wt['tracked dirty bytes in the cache'] * 100 / maxCache).toFixed(1) : '0';
var cachePct = maxCache > 0
    ? (wt['bytes currently in the cache'] * 100 / maxCache).toFixed(1) : '0';
var opcnt = s.opcounters;
print([q.readers, q.writers, c.current, c.available, dirtyPct, cachePct,
       opcnt.insert, opcnt.query, opcnt.update, opcnt.delete].join(','));
")

if [ -z "$HEALTH_RAW" ] || ! echo "$HEALTH_RAW" | grep -qE '^[0-9]'; then
  echo -e "${RED}[X] Failed to get serverStatus${NC}"
else
  # Split the CSV produced above in one read instead of ten cut(1) calls.
  IFS=',' read -r QUEUE_READERS QUEUE_WRITERS CONNS_CURRENT CONNS_AVAILABLE \
      DIRTY_PCT CACHE_PCT OPS_INSERT OPS_QUERY OPS_UPDATE OPS_DELETE <<<"$HEALTH_RAW"
  QUEUE_TOTAL=$((QUEUE_READERS + QUEUE_WRITERS))

  echo -e "${YELLOW}Global Lock Queue:${NC}"
  if [ "$QUEUE_TOTAL" -ge "$QUEUE_CRIT" ]; then
    echo -e "  Readers waiting: ${RED}$QUEUE_READERS${NC}"
    echo -e "  Writers waiting: ${RED}$QUEUE_WRITERS${NC}"
    echo -e "  ${RED}CRITICAL: $QUEUE_TOTAL operations queued — MongoDB is choking${NC}"
  elif [ "$QUEUE_TOTAL" -ge "$QUEUE_WARN" ]; then
    echo -e "  Readers waiting: ${YELLOW}$QUEUE_READERS${NC}"
    echo -e "  Writers waiting: ${YELLOW}$QUEUE_WRITERS${NC}"
    echo -e "  ${YELLOW}WARNING: $QUEUE_TOTAL operations queued${NC}"
  else
    echo -e "  Readers waiting: ${GREEN}$QUEUE_READERS${NC}"
    echo -e "  Writers waiting: ${GREEN}$QUEUE_WRITERS${NC}"
  fi

  echo ""
  echo -e "${YELLOW}Connections:${NC}"
  echo -e "  Current: $CONNS_CURRENT"
  echo -e "  Available: $CONNS_AVAILABLE"

  echo ""
  echo -e "${YELLOW}WiredTiger Cache:${NC}"
  # Integer part of the dirty % for threshold comparison; parameter
  # expansion avoids the previous bc(1) dependency. Guard against a
  # non-numeric value so the report never dies mid-way.
  DIRTY_INT=${DIRTY_PCT%%.*}
  [[ "$DIRTY_INT" =~ ^[0-9]+$ ]] || DIRTY_INT=0
  if [ "$DIRTY_INT" -ge "$DIRTY_CRIT" ]; then
    echo -e "  Used: ${CACHE_PCT}%"
    echo -e "  Dirty: ${RED}${DIRTY_PCT}%${NC} ${RED}CRITICAL: write pressure, eviction may stall operations${NC}"
  elif [ "$DIRTY_INT" -ge "$DIRTY_WARN" ]; then
    echo -e "  Used: ${CACHE_PCT}%"
    echo -e "  Dirty: ${YELLOW}${DIRTY_PCT}%${NC} ${YELLOW}WARNING: elevated write pressure${NC}"
  else
    echo -e "  Used: ${CACHE_PCT}%"
    echo -e "  Dirty: ${GREEN}${DIRTY_PCT}%${NC}"
  fi

  echo ""
  echo -e "${YELLOW}Operation Counters (cumulative since start):${NC}"
  echo -e "  Inserts: $OPS_INSERT"
  echo -e "  Queries: $OPS_QUERY"
  echo -e "  Updates: $OPS_UPDATE"
  echo -e "  Deletes: $OPS_DELETE"
fi
# ── 2. Long-running operations on countly_drill ────────────────────────────
echo ""
echo -e "${YELLOW}Slow Operations on countly_drill (>5s):${NC}"
# First output line is the count; subsequent lines are "secs|op|ns".
SLOW_RAW=$(mongosh_exec "
var ops = db.currentOp({ secs_running: { \$gt: 5 }, ns: /^countly_drill/ });
var lines = ops.inprog.map(function(op){
  return op.secs_running + 's|' + op.op + '|' + op.ns;
}).join('\n');
print(ops.inprog.length + '|' + lines);
")

# Print one indented line per slow op (lines 2..N of $SLOW_RAW).
print_slow_ops() {
  echo "$SLOW_RAW" | tail -n +2 | while IFS='|' read -r secs op ns; do
    [ -n "$secs" ] && echo -e "    ${secs} ${op} ${ns}"
  done
}

if [ -z "$SLOW_RAW" ]; then
  echo -e "  ${RED}Unable to get currentOp${NC}"
else
  SLOW_COUNT=$(echo "$SLOW_RAW" | head -1 | cut -d'|' -f1)
  # Validate explicitly instead of suppressing test(1) errors with 2>/dev/null;
  # a garbled count is treated as "none" rather than as raw output.
  [[ "$SLOW_COUNT" =~ ^[0-9]+$ ]] || SLOW_COUNT=0
  if [ "$SLOW_COUNT" -eq 0 ]; then
    echo -e "  ${GREEN}None${NC}"
  elif [ "$SLOW_COUNT" -ge "$SLOW_OPS_WARN" ]; then
    echo -e "  ${YELLOW}WARNING: $SLOW_COUNT slow ops running${NC}"
    print_slow_ops
  else
    echo -e "  $SLOW_COUNT slow op(s) running:"
    print_slow_ops
  fi
fi
# ── Report footer ──────────────────────────────────────────────────────────
echo ""
echo -e "${CYAN}========================================${NC}"
echo -e "${BLUE}Report generated at: $(date '+%Y-%m-%d %H:%M:%S')${NC}"
echo -e "${CYAN}========================================${NC}"
echo ""
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Throwaway debug pod that mounts the MongoDB data PVC for offline
# inspection/repair. Delete it when done so the PVC can be re-attached.
apiVersion: v1
kind: Pod
metadata:
  name: mongodb-debug
  namespace: mongodb
spec:
  restartPolicy: Never
  containers:
    - name: shell
      image: ubuntu:22.04
      # Keep the pod alive for a day so an engineer can `kubectl exec` in.
      command: ["/bin/bash", "-c", "sleep 1d"]
      volumeMounts:
        - mountPath: /data
          name: data
  volumes:
    - name: data
      persistentVolumeClaim:
        claimName: data-volume-countly-mongodb-0

devops/mongodb-debug/new-pv.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Static PersistentVolume template for re-attaching an existing GCE PD.
# Replace every <placeholder> before applying.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-<customer name>-data
spec:
  capacity:
    storage: 900Gi  # MUST match PVC requested storage size
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain  # keep the disk if the claim is deleted
  # MUST match the PVC's storageClassName for the static bind (via
  # PVC.spec.volumeName) to succeed.
  storageClassName: "premium-rwo"
  csi:
    driver: pd.csi.storage.gke.io
    # REQUIRED — was missing. volumeHandle format:
    # projects/PROJECT/zones/ZONE/disks/DISKNAME
    volumeHandle: projects/<PROJECT>/zones/<ZONE>/disks/<DISKNAME>
    fsType: ext4  # MUST match the filesystem type expected by the application

devops/mongodb-debug/new-pvc.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# PVC that statically binds to the pre-created PV (see new-pv.yaml).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: data-volume-countly-mongodb-0  # MUST match operator expected PVC name
  namespace: mongodb
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 900Gi  # MUST match the PV's capacity
  volumeName: pv-<customer name>-data  # static bind the PV we created
  storageClassName: "premium-rwo"      # MUST match the PV's storageClassName
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Countly TLS Certificate Configuration - Template
# Copy this file to countly-tls.env and update with real values.
# Do NOT commit the filled-in copy — it contains private key material.

# Base64 encoded TLS certificate (full chain)
TLS_CRT=
# Base64 encoded TLS private key
TLS_KEY=
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Profile: tier1 — ClickHouse chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)

# Single-node layout: no sharding, no replication.
shards: 1
replicas: 1

server:
  resources:
    requests: { cpu: "1500m", memory: "11Gi" }
    limits: { cpu: "2", memory: "11Gi" }
  persistence:
    size: 200Gi
  scheduling:
    antiAffinity:
      enabled: false  # single node — pods may co-schedule

keeper:
  replicas: 1
  resources:
    requests: { cpu: "1", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
  persistence:
    size: 10Gi

profiles/sizing/tier1/countly.yaml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Profile: tier1 — Countly chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)
# Validated: yes — Perf Test #2 (2026-Jan)
# Capacity: ~90 DP/s / 233M DP/month

# Node.js heap limits per component; keep below the container memory limits.
nodeOptions:
  aggregator: "--max-old-space-size=2048 --max-semi-space-size=256"
  api: "--max-old-space-size=2048 --max-semi-space-size=256"
  frontend: "--max-old-space-size=1024"

aggregator:
  replicaCount: 1
  resources:
    requests: { cpu: "1", memory: "4Gi" }
    limits: { cpu: "1500m", memory: "4.5Gi" }
  hpa:
    minReplicas: 1
    maxReplicas: 1  # pinned to a single replica on this tier
  pdb:
    enabled: false
  scheduling:
    antiAffinity:
      enabled: false

api:
  replicaCount: 1
  resources:
    requests: { cpu: "500m", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
  hpa:
    maxReplicas: 1

frontend:
  replicaCount: 1
  resources:
    requests: { cpu: "500m", memory: "1.5Gi" }
    limits: { cpu: "1", memory: "1.5Gi" }

ingestor:
  replicaCount: 1
  resources:
    requests: { cpu: "750m", memory: "3.5Gi" }
    limits: { cpu: "1", memory: "4Gi" }
  hpa:
    maxReplicas: 1

jobserver:
  replicaCount: 1
  resources:
    requests: { cpu: "500m", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }

profiles/sizing/tier1/kafka.yaml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Profile: tier1 — Kafka chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)
#
# 2 brokers + 1 controller. Replication factor 2, min.insync.replicas 1
# (allows writes when one broker is unavailable on a 2-broker cluster).
#
# NOTE: kafkaConnect resources here will be overridden by the kafka-connect
# dimension profile (throughput/balanced/low-latency). The values below
# reflect the tier1-validated connect worker sizing; pair with a matching
# kafka-connect profile if you need to honour them exactly.

brokers:
  replicas: 2
  resources:
    requests: { cpu: "500m", memory: "4.5Gi" }
    limits: { cpu: "750m", memory: "4.5Gi" }
  jvmOptions:
    # Fixed heap (xms == xmx) well under the container limit.
    xms: "2560m"
    xmx: "2560m"
  persistence:
    volumes:
      - id: 0
        size: 100Gi
  config:
    default.replication.factor: 2
    min.insync.replicas: 1
    offsets.topic.replication.factor: 2
    transaction.state.log.replication.factor: 2
    transaction.state.log.min.isr: 1

controllers:
  replicas: 1
  resources:
    requests: { cpu: "500m", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
  persistence:
    size: 10Gi

cruiseControl:
  enabled: false

kafkaConnect:
  replicas: 1
  resources:
    requests: { cpu: "1", memory: "2Gi" }
    limits: { cpu: "1", memory: "2Gi" }
  jvmOptions:
    xms: "1g"
    xmx: "1g"
  workerConfig:
    config.storage.replication.factor: 2
    offset.storage.replication.factor: 2
    status.storage.replication.factor: 2

profiles/sizing/tier1/mongodb.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Profile: tier1 — MongoDB chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)

mongodb:
  members: 1  # single-member replica set on this tier
  resources:
    requests: { cpu: "1500m", memory: "11Gi" }
    limits: { cpu: "2", memory: "11Gi" }
  persistence:
    size: 200Gi
  scheduling:
    antiAffinity:
      enabled: false  # single node — pods may co-schedule

0 commit comments

Comments
 (0)