
Commit fce880d

Implementing inode full detection support, closes #18
1 parent 7c538b3 commit fce880d

4 files changed (+149 lines, -12 lines)

Makefile

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+.PHONY: deps run run-hot start lint test-local help
+.DEFAULT_GOAL := help
+
+SHELL = bash
+
+# Install dependencies
+deps:
+	pip3 install -r requirements.txt
+
+# This is our default logic for "make run" or "make start". It is dry-run'd to prevent it from doing anything while developing
+run: deps
+	@echo -e "\n----- Starting service locally -----"
+	# NOTE: Put your secrets and such into this file to avoid committing them
+	touch unused-local-envs.sh
+	source unused-local-envs.sh
+	DRY_RUN=true \
+	VERBOSE=true \
+	python3 main.py
+
+# Warning: this will run it "hot", with no dry-run in place
+run-hot: deps
+	@echo -e "\n----- Starting service locally -----"
+	# NOTE: Put your secrets and such into this file to avoid committing them
+	touch unused-local-envs.sh
+	source unused-local-envs.sh
+	python3 main.py
+
+# Alternate for "run"
+start: run
+
+# Lint our code
+lint: deps
+	black .
+
+test-local:
+	@echo -e "TODO - Add tests"
+
+help:
+	@echo -e "Makefile options possible\n------------------------------"
+	@echo -e "make deps   # Install dependencies"
+	@echo -e "make run    # Run service locally"
+	@echo -e "make start  # (alternate) Run service locally"
examples/simple-pod-with-pvc.yaml

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+# The example below will create a PVC using the default StorageClass, which you should
+# have configured with allowVolumeExpansion set to true before using this. When the pod
+# boots up it will automatically fill up the PVC (this example exhausts inodes by endlessly creating files),
+# and if you have the Volume Autoscaler installed it should automatically expand the volume based on the default parameters
+#
+# Simply run: kubectl apply -f examples/simple-pod-with-pvc.yaml
+---
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: test-claim1
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 3G
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: test-claim1
+spec:
+  containers:
+    - name: write
+      image: alpine:latest
+      command: ["/bin/sh"]
+      args: ["-c", "cd /mnt/pv; i=0; while true; do touch \"file_$((i++))\"; done"]
+      volumeMounts:
+        - mountPath: "/mnt/pv"
+          name: test-volume
+      stdin: true
+      stdinOnce: true
+      tty: true
+  volumes:
+    - name: test-volume
+      persistentVolumeClaim:
+        claimName: test-claim1
+  restartPolicy: Never

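The example above assumes the cluster's default StorageClass permits expansion. Below is a minimal, hypothetical pre-flight check for that assumption using the official kubernetes Python client; the client import and kubeconfig loading are not part of this commit.

# Hypothetical pre-flight check (not part of this commit): warn if the default
# StorageClass does not have allowVolumeExpansion enabled before applying the example.
from kubernetes import client, config

config.load_kube_config()  # or config.load_incluster_config() when running inside a pod
for sc in client.StorageV1Api().list_storage_class().items:
    annotations = sc.metadata.annotations or {}
    is_default = annotations.get("storageclass.kubernetes.io/is-default-class") == "true"
    if is_default and not sc.allow_volume_expansion:
        print("WARNING: default StorageClass {} does not allow volume expansion".format(sc.metadata.name))
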
helpers.py

Lines changed: 33 additions & 4 deletions
@@ -37,7 +37,7 @@ def detectPrometheusURL():
 DRY_RUN = True if getenv('DRY_RUN', "false").lower() == "true" else False  # If we want to dry-run this
 PROMETHEUS_LABEL_MATCH = getenv('PROMETHEUS_LABEL_MATCH') or ''  # A PromQL label query to restrict volumes for this to see and scale, without braces. eg: 'namespace="dev"'
 HTTP_TIMEOUT = int(getenv('HTTP_TIMEOUT', "15")) or 15  # Allows to set the timeout for calls to Prometheus and Kubernetes. This might be needed if your Prometheus or Kubernetes is over a remote WAN link with high latency and/or is heavily loaded
-PROMETHEUS_VERSION = "Unknown"  # Used to detect the availability of a new function called present_over_time only available on Prometheus v2.30.0 or newer, this is auto-detected and updated, not set by a user
+PROMETHEUS_VERSION = "0.0.0"  # Used to detect the availability of a new function called present_over_time only available on Prometheus v2.30.0 or newer, this is auto-detected and updated, not set by a user
 VERBOSE = True if getenv('VERBOSE', "false").lower() == "true" else False  # If we want to verbose mode
 VICTORIAMETRICS_COMPAT = True if getenv('VICTORIAMETRICS_MODE', "false").lower() == "true" else False  # Whether to skip the prometheus check and assume victoriametrics
 SCOPE_ORGID_AUTH_HEADER = getenv('SCOPE_ORGID_AUTH_HEADER') or ''  # If we want to use Mimir or AgentMode which requires an orgid header. See: https://grafana.com/docs/mimir/latest/references/http-api/#authentication

@@ -489,7 +489,36 @@ def fetch_pvcs_from_prometheus(url, label_match=PROMETHEUS_LABEL_MATCH):
         print("Prometheus Error: {}".format(response_object['error']))
         exit(-1)

-    return response_object['data']['result']
+    #TODO: Inject here "trying" to get inode percentage usage also
+    try:
+        if version.parse(PROMETHEUS_VERSION) >= version.parse("2.30.0"):
+            inodes_response = requests.get(url + '/api/v1/query', params={'query': "ceil((1 - kubelet_volume_stats_inodes_free{{ {} }} / kubelet_volume_stats_inodes)*100) and present_over_time(kubelet_volume_stats_inodes_free{{ {} }}[1h])".format(label_match,label_match)}, timeout=HTTP_TIMEOUT, headers=headers)
+        else:
+            inodes_response = requests.get(url + '/api/v1/query', params={'query': "ceil((1 - kubelet_volume_stats_inodes_free{{ {} }} / kubelet_volume_stats_inodes)*100)".format(label_match,label_match)}, timeout=HTTP_TIMEOUT, headers=headers)
+        inodes_response_object = inodes_response.json()
+
+        # Prepare values to merge/inject with our first response_object list/array above
+        inject_values = {}
+        for item in inodes_response_object['data']['result']:
+            ourkey = "{}_{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
+            inject_values[ourkey] = item['value'][1]
+
+        output_response_object = []
+        # Inject/merge them...
+        for item in response_object['data']['result']:
+            try:
+                ourkey = "{}_{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
+                if ourkey in inject_values:
+                    item['value_inodes'] = inject_values[ourkey]
+            except Exception as e:
+                print("Caught exception while trying to inject, please report me...")
+                print(e)
+            output_response_object.append(item)
+    except Exception as e:
+        print("Caught exception while trying to inject inode usage, please report me...")
+        print(e)
+
+    return output_response_object


 # Describe an specific PVC
@@ -550,13 +579,13 @@ def send_kubernetes_event(namespace, name, reason, message, type="Normal"):
 # Print a sexy human readable dict for volume
 def print_human_readable_volume_dict(input_dict):
     for key in input_dict:
-        print(" {}: {}".format(key.rjust(24), input_dict[key]), end='')
+        print(" {}: {}".format(key.rjust(25), input_dict[key]), end='')
         if key in ['volume_size_spec','volume_size_spec_bytes','volume_size_status','volume_size_status_bytes','scale_up_min_increment','scale_up_max_increment','scale_up_max_size'] and is_integer_or_float(input_dict[key]):
             print(" ({})".format(convert_bytes_to_storage(input_dict[key])), end='')
         if key in ['scale_cooldown_time']:
            print(" ({})".format(time.strftime('%H:%M:%S', time.gmtime(input_dict[key]))), end='')
        if key in ['last_resized_at']:
            print(" ({})".format(time.strftime('%Y-%m-%d %H:%M:%S %Z %z', time.localtime(input_dict[key]))), end='')
-        if key in ['scale_up_percent','scale_above_percent']:
+        if key in ['scale_up_percent','scale_above_percent','volume_used_percent','volume_used_inode_percent']:
            print("%", end='')
        print("") # Newline

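For reference, the inode query above renders as follows for a hypothetical PROMETHEUS_LABEL_MATCH of namespace="dev"; the doubled braces in the .format() template collapse to single braces in the final PromQL. A small sketch, not part of the commit:

# Illustration only: render the inode-usage query the same way fetch_pvcs_from_prometheus does.
# label_match here is a hypothetical example; the real value comes from PROMETHEUS_LABEL_MATCH.
label_match = 'namespace="dev"'
query = ("ceil((1 - kubelet_volume_stats_inodes_free{{ {} }} / kubelet_volume_stats_inodes)*100)"
         " and present_over_time(kubelet_volume_stats_inodes_free{{ {} }}[1h])").format(label_match, label_match)
print(query)
# ceil((1 - kubelet_volume_stats_inodes_free{ namespace="dev" } / kubelet_volume_stats_inodes)*100) and present_over_time(kubelet_volume_stats_inodes_free{ namespace="dev" }[1h])
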
main.py

Lines changed: 35 additions & 8 deletions
@@ -87,25 +87,37 @@
         volume_namespace = str(item['metric']['namespace'])
         volume_description = "{}.{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
         volume_used_percent = int(item['value'][1])
+        pvcs_in_kubernetes[volume_description]['volume_used_percent'] = volume_used_percent
+        try:
+            volume_used_inode_percent = int(item['value_inodes'])
+        except:
+            volume_used_inode_percent = -1
+        pvcs_in_kubernetes[volume_description]['volume_used_inode_percent'] = volume_used_inode_percent

         # Precursor check to ensure we have info for this pvc in kubernetes object
         if volume_description not in pvcs_in_kubernetes:
             print("ERROR: The volume {} was not found in Kubernetes but had metrics in Prometheus. This may be an old volume, was just deleted, or some random jitter is occurring. If this continues to occur, please report an bug. You might also be using an older version of Prometheus, please make sure you're using v2.30.0 or newer before reporting a bug for this.".format(volume_description))
             continue

         if VERBOSE:
-            print("Volume {} is {}% in-use of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status']))
             print(" VERBOSE DETAILS:")
             print("-------------------------------------------------------------------------------------------------------------")
             print_human_readable_volume_dict(pvcs_in_kubernetes[volume_description])
             print("-------------------------------------------------------------------------------------------------------------")
+            print("Volume {} has {}% disk space used of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status']))
+            if volume_used_inode_percent > -1:
+                print("Volume {} has {}% inodes used".format(volume_description,volume_used_inode_percent))

         # Check if we are NOT in an alert condition
-        if volume_used_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent']:
+        if volume_used_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent'] and volume_used_inode_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent']:
             PROMETHEUS_METRICS['num_pvcs_below_threshold'].inc()
             cache.unset(volume_description)
             if VERBOSE:
-                print(" and is not above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
+                print(" and is not above {}% used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
+                if volume_used_inode_percent > -1:
+                    print(" and is not above {}% inodes used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
+            if VERBOSE:
+                print("=============================================================================================================")
             continue
         else:
             PROMETHEUS_METRICS['num_pvcs_above_threshold'].inc()

@@ -115,22 +127,30 @@
             cache.set(volume_description, cache.get(volume_description) + 1)
         else:
             cache.set(volume_description, 1)
+
         # Incase we aren't verbose, and didn't print this above, now that we're in alert we will print this
         if not VERBOSE:
             print("Volume {} is {}% in-use of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status']))
-        # Print the alert status
-        print(" BECAUSE it is above {}% used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
+            print("Volume {} is {}% inode in-use".format(volume_description,volume_used_inode_percent))
+
+        # Print the alert status and reason
+        if volume_used_percent >= pvcs_in_kubernetes[volume_description]['scale_above_percent']:
+            print(" BECAUSE it has space used above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
+        elif volume_used_inode_percent >= pvcs_in_kubernetes[volume_description]['scale_above_percent']:
+            print(" BECAUSE it has inodes used above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent']))
         print(" ALERT has been for {} period(s) which needs to at least {} period(s) to scale".format(cache.get(volume_description), pvcs_in_kubernetes[volume_description]['scale_after_intervals']))

         # Check if we are NOT in a possible scale condition
         if cache.get(volume_description) < pvcs_in_kubernetes[volume_description]['scale_after_intervals']:
             print(" BUT need to wait for {} intervals in alert before considering to scale".format( pvcs_in_kubernetes[volume_description]['scale_after_intervals'] ))
             print(" FYI this has desired_size {} and current size {}".format( convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_spec_bytes']), convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes'])))
+            print("=============================================================================================================")
             continue

         # If we are in a possible scale condition, check if we recently scaled it and handle accordingly
         if pvcs_in_kubernetes[volume_description]['last_resized_at'] + pvcs_in_kubernetes[volume_description]['scale_cooldown_time'] >= int(time.mktime(time.gmtime())):
             print(" BUT need to wait {} seconds to scale since the last scale time {} seconds ago".format( abs(pvcs_in_kubernetes[volume_description]['last_resized_at'] + pvcs_in_kubernetes[volume_description]['scale_cooldown_time']) - int(time.mktime(time.gmtime())), abs(pvcs_in_kubernetes[volume_description]['last_resized_at'] - int(time.mktime(time.gmtime()))) ))
+            print("=============================================================================================================")
             continue

         # If we reach this far then we will be scaling the disk, all preconditions were passed from above

@@ -155,7 +175,7 @@
             print(" Error/Exception while trying to determine what to resize to, volume causing failure:")
             print("-------------------------------------------------------------------------------------------------------------")
             print(pvcs_in_kubernetes[volume_description])
-            print("-------------------------------------------------------------------------------------------------------------")
+            print("=============================================================================================================")
             continue

         # If our resize bytes is less than our original size (because the user set the max-bytes to something too low)

@@ -169,33 +189,37 @@
             print("-------------------------------------------------------------------------------------------------------------")
             print(" Volume causing failure:")
             print_human_readable_volume_dict(pvcs_in_kubernetes[volume_description])
-            print("-------------------------------------------------------------------------------------------------------------")
+            print("=============================================================================================================")
             continue

         # Check if we are already at the max volume size (either globally, or this-volume specific)
         if resize_to_bytes == pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']:
             print(" SKIPPING scaling this because we are at the maximum size of {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['scale_up_max_size'])))
+            print("=============================================================================================================")
             continue

         # Check if we set on this PV we want to ignore the volume autoscaler
         if pvcs_in_kubernetes[volume_description]['ignore']:
             print(" IGNORING scaling this because the ignore annotation was set to true")
+            print("=============================================================================================================")
             continue

         # Lets debounce this incase we did this resize last interval(s)
         if cache.get(f"{volume_description}-has-been-resized"):
             print(" DEBOUNCING and skipping this scaling, we resized within recent intervals")
+            print("=============================================================================================================")
             continue

         # Check if we are DRY-RUN-ing and won't do anything
         if DRY_RUN:
             print(" DRY RUN was set, but we would have resized this disk from {} to {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']), convert_bytes_to_storage(resize_to_bytes)))
+            print("=============================================================================================================")
             continue

         # If we aren't dry-run, lets resize
         PROMETHEUS_METRICS['resize_attempted'].inc()
         print(" RESIZING disk from {} to {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']), convert_bytes_to_storage(resize_to_bytes)))
-        status_output = "to scale up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk space over the last `{} seconds`".format(
+        status_output = "to scale up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk or inode space over the last `{} seconds`".format(
             volume_description,
             pvcs_in_kubernetes[volume_description]['scale_up_percent'],
             convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']),

@@ -241,6 +265,9 @@
             print(item)
             traceback.print_exc()

+    if VERBOSE:
+        print("=============================================================================================================")
+
     # Wait until our next interval
     time.sleep(MAIN_LOOP_TIME)

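The changed alert condition in main.py treats a volume as healthy only when both disk usage and inode usage are below scale_above_percent, with -1 standing in for a missing inode metric so it never triggers on its own. A standalone sketch of that check (the helper name is illustrative, not from the codebase):

# Illustrative helper (not in the repo): the combined disk/inode threshold check
# introduced by this commit; -1 means "no inode metric was available for this PVC".
def is_below_threshold(volume_used_percent, volume_used_inode_percent, scale_above_percent):
    return (volume_used_percent < scale_above_percent
            and volume_used_inode_percent < scale_above_percent)

assert is_below_threshold(50, -1, 80)       # only the disk metric exists and it is under the threshold
assert not is_below_threshold(50, 95, 80)   # inodes nearly exhausted, so the volume goes into alert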