|
87 | 87 | volume_namespace = str(item['metric']['namespace'])
|
88 | 88 | volume_description = "{}.{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
|
89 | 89 | volume_used_percent = int(item['value'][1])
|
| 90 | + try: |
| 91 | + volume_used_inode_percent = int(item['value_inodes']) |
| 92 | + except Exception: |
| 93 | + volume_used_inode_percent = -1  # -1 means inode metrics were unavailable for this volume |
90 | 94
|
91 | 95 | # Precursor check to ensure we have info for this PVC in the kubernetes object
|
92 | 96 | if volume_description not in pvcs_in_kubernetes:
|
93 | 99 | print("ERROR: The volume {} was not found in Kubernetes but had metrics in Prometheus. This may be an old volume, was just deleted, or some random jitter is occurring. If this continues to occur, please report an bug. You might also be using an older version of Prometheus, please make sure you're using v2.30.0 or newer before reporting a bug for this.".format(volume_description))
|
94 | 98 | continue
|
95 | 99
|
| 100 | + pvcs_in_kubernetes[volume_description]['volume_used_percent'] = volume_used_percent |
| 101 | + pvcs_in_kubernetes[volume_description]['volume_used_inode_percent'] = volume_used_inode_percent |
96 | 102 | if VERBOSE:
|
97 |
| - print("Volume {} is {}% in-use of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status'])) |
98 | 103 | print(" VERBOSE DETAILS:")
|
99 | 104 | print("-------------------------------------------------------------------------------------------------------------")
|
100 | 105 | print_human_readable_volume_dict(pvcs_in_kubernetes[volume_description])
|
101 | 106 | print("-------------------------------------------------------------------------------------------------------------")
|
| 107 | + print("Volume {} has {}% disk space used of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status'])) |
| 108 | + if volume_used_inode_percent > -1: |
| 109 | + print("Volume {} has {}% inodes used".format(volume_description,volume_used_inode_percent)) |
102 | 110 |
|
103 | 111 | # Check if we are NOT in an alert condition
|
104 |
| - if volume_used_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent']: |
| 112 | + if volume_used_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent'] and volume_used_inode_percent < pvcs_in_kubernetes[volume_description]['scale_above_percent']: |
105 | 113 | PROMETHEUS_METRICS['num_pvcs_below_threshold'].inc()
|
106 | 114 | cache.unset(volume_description)
|
107 | 115 | if VERBOSE:
|
108 |
| - print(" and is not above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent'])) |
| 116 | + print(" and is not above {}% used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent'])) |
| 117 | + if volume_used_inode_percent > -1: |
| 118 | + print(" and is not above {}% inodes used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent'])) |
| 119 | + if VERBOSE: |
| 120 | + print("=============================================================================================================") |
109 | 121 | continue
|
110 | 122 | else:
|
111 | 123 | PROMETHEUS_METRICS['num_pvcs_above_threshold'].inc()
|
|
115 | 127 | cache.set(volume_description, cache.get(volume_description) + 1)
|
116 | 128 | else:
|
117 | 129 | cache.set(volume_description, 1)
|
| 130 | + |
118 | 131 | # In case we aren't verbose and didn't print this above: now that we're in alert, print it
|
119 | 132 | if not VERBOSE:
|
120 | 133 | print("Volume {} is {}% in-use of the {} available".format(volume_description,volume_used_percent,pvcs_in_kubernetes[volume_description]['volume_size_status']))
|
121 |
| - # Print the alert status |
122 |
| - print(" BECAUSE it is above {}% used".format(pvcs_in_kubernetes[volume_description]['scale_above_percent'])) |
| 134 | + print("Volume {} is {}% inode in-use".format(volume_description,volume_used_inode_percent)) |
| 135 | + |
| 136 | + # Print the alert status and reason |
| 137 | + if volume_used_percent >= pvcs_in_kubernetes[volume_description]['scale_above_percent']: |
| 138 | + print(" BECAUSE its disk space usage is above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent'])) |
| 139 | + elif volume_used_inode_percent >= pvcs_in_kubernetes[volume_description]['scale_above_percent']: |
| 140 | + print(" BECAUSE its inode usage is above {}%".format(pvcs_in_kubernetes[volume_description]['scale_above_percent'])) |
123 | 141 | print(" ALERT has been for {} period(s) which needs to at least {} period(s) to scale".format(cache.get(volume_description), pvcs_in_kubernetes[volume_description]['scale_after_intervals']))
|
124 | 142 |
|
125 | 143 | # Check if we are NOT in a possible scale condition
|
126 | 144 | if cache.get(volume_description) < pvcs_in_kubernetes[volume_description]['scale_after_intervals']:
|
127 | 145 | print(" BUT need to wait for {} intervals in alert before considering to scale".format( pvcs_in_kubernetes[volume_description]['scale_after_intervals'] ))
|
128 | 146 | print(" FYI this has desired_size {} and current size {}".format( convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_spec_bytes']), convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes'])))
|
| 147 | + print("=============================================================================================================") |
129 | 148 | continue
|
130 | 149 |
|
131 | 150 | # If we are in a possible scale condition, check if we recently scaled it and handle accordingly
|
132 | 151 | if pvcs_in_kubernetes[volume_description]['last_resized_at'] + pvcs_in_kubernetes[volume_description]['scale_cooldown_time'] >= int(time.mktime(time.gmtime())):
|
133 | 152 | print(" BUT need to wait {} seconds to scale since the last scale time {} seconds ago".format( abs(pvcs_in_kubernetes[volume_description]['last_resized_at'] + pvcs_in_kubernetes[volume_description]['scale_cooldown_time']) - int(time.mktime(time.gmtime())), abs(pvcs_in_kubernetes[volume_description]['last_resized_at'] - int(time.mktime(time.gmtime()))) ))
|
| 153 | + print("=============================================================================================================") |
134 | 154 | continue
|
135 | 155 |
|
136 | 156 | # If we reach this far then we will scale the disk; all preconditions above have passed
|
|
155 | 175 | print(" Error/Exception while trying to determine what to resize to, volume causing failure:")
|
156 | 176 | print("-------------------------------------------------------------------------------------------------------------")
|
157 | 177 | print(pvcs_in_kubernetes[volume_description])
|
158 |
| - print("-------------------------------------------------------------------------------------------------------------") |
| 178 | + print("=============================================================================================================") |
159 | 179 | continue
|
160 | 180 |
|
161 | 181 | # If our resize target in bytes is less than our original size (because the user set max-bytes too low)
|
|
169 | 189 | print("-------------------------------------------------------------------------------------------------------------")
|
170 | 190 | print(" Volume causing failure:")
|
171 | 191 | print_human_readable_volume_dict(pvcs_in_kubernetes[volume_description])
|
172 |
| - print("-------------------------------------------------------------------------------------------------------------") |
| 192 | + print("=============================================================================================================") |
173 | 193 | continue
|
174 | 194 |
|
175 | 195 | # Check if we are already at the max volume size (either globally, or this-volume specific)
|
176 | 196 | if resize_to_bytes == pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']:
|
177 | 197 | print(" SKIPPING scaling this because we are at the maximum size of {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['scale_up_max_size'])))
|
| 198 | + print("=============================================================================================================") |
178 | 199 | continue
|
179 | 200 |
|
180 | 201 | # Check if we set on this PV we want to ignore the volume autoscaler
|
181 | 202 | if pvcs_in_kubernetes[volume_description]['ignore']:
|
182 | 203 | print(" IGNORING scaling this because the ignore annotation was set to true")
|
| 204 | + print("=============================================================================================================") |
183 | 205 | continue
|
184 | 206 |
|
185 | 207 | # Let's debounce this in case we already resized within the last interval(s)
|
186 | 208 | if cache.get(f"{volume_description}-has-been-resized"):
|
187 | 209 | print(" DEBOUNCING and skipping this scaling, we resized within recent intervals")
|
| 210 | + print("=============================================================================================================") |
188 | 211 | continue
|
189 | 212 |
|
190 | 213 | # Check if we are DRY-RUN-ing and won't do anything
|
191 | 214 | if DRY_RUN:
|
192 | 215 | print(" DRY RUN was set, but we would have resized this disk from {} to {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']), convert_bytes_to_storage(resize_to_bytes)))
|
| 216 | + print("=============================================================================================================") |
193 | 217 | continue
|
194 | 218 |
|
195 | 219 | # If we aren't dry-run, lets resize
|
196 | 220 | PROMETHEUS_METRICS['resize_attempted'].inc()
|
197 | 221 | print(" RESIZING disk from {} to {}".format(convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']), convert_bytes_to_storage(resize_to_bytes)))
|
198 |
| - status_output = "to scale up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk space over the last `{} seconds`".format( |
| 222 | + status_output = "to scale up `{}` by `{}%` from `{}` to `{}`, it was using more than `{}%` disk or inode space over the last `{} seconds`".format( |
199 | 223 | volume_description,
|
200 | 224 | pvcs_in_kubernetes[volume_description]['scale_up_percent'],
|
201 | 225 | convert_bytes_to_storage(pvcs_in_kubernetes[volume_description]['volume_size_status_bytes']),
|
|
241 | 265 | print(item)
|
242 | 266 | traceback.print_exc()
|
243 | 267 |
|
| 268 | + if VERBOSE: |
| 269 | + print("=============================================================================================================") |
| 270 | + |
244 | 271 | # Wait until our next interval
|
245 | 272 | time.sleep(MAIN_LOOP_TIME)
|
246 | 273 |
|
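The hunk above consumes `item['value_inodes']` without showing where it is populated. Below is a minimal sketch of how the inode percentage might be merged into the per-PVC Prometheus results. It assumes the standard kubelet metrics `kubelet_volume_stats_inodes_used` and `kubelet_volume_stats_inodes` are scraped into Prometheus; the helper names (`fetch_inode_usage_percent`, `merge_inode_metrics`) and the `prometheus_url` parameter are illustrative only, not part of this change.

```python
# Hedged sketch: how value_inodes might be attached to each disk-usage item.
# Assumes kubelet_volume_stats_inodes_used / kubelet_volume_stats_inodes are
# available in Prometheus; all names here are illustrative, not from this PR.
import requests

def fetch_inode_usage_percent(prometheus_url):
    # ceil() keeps the serialized value integral so int() parses it cleanly
    query = 'ceil(kubelet_volume_stats_inodes_used / kubelet_volume_stats_inodes * 100)'
    response = requests.get(prometheus_url + '/api/v1/query', params={'query': query})
    response.raise_for_status()
    inode_percents = {}
    for item in response.json()['data']['result']:
        # Key the same way as volume_description: "namespace.persistentvolumeclaim"
        key = "{}.{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
        inode_percents[key] = item['value'][1]  # Prometheus value pairs are [timestamp, "value"]
    return inode_percents

def merge_inode_metrics(disk_items, inode_percents):
    # Attach value_inodes where we have data; items without it simply lack the
    # key, so the item['value_inodes'] lookup in the loop above raises KeyError
    # and the except falls back to -1.
    for item in disk_items:
        key = "{}.{}".format(item['metric']['namespace'], item['metric']['persistentvolumeclaim'])
        if key in inode_percents:
            item['value_inodes'] = inode_percents[key]
    return disk_items
```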
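The decision at new line 112 treats -1 as an "inode metrics unknown" sentinel: since -1 is below any sane `scale_above_percent`, missing inode data can never trigger or suppress an alert on its own. A quick pure-function restatement of that condition with sanity checks (the function name is illustrative, not from this change):

```python
def is_below_threshold(used_percent, inode_percent, scale_above_percent):
    """True when the volume is NOT in an alert condition.

    inode_percent of -1 means inode metrics were unavailable; because
    -1 < scale_above_percent always holds, missing inode data cannot
    trigger (or block) an alert by itself.
    """
    return used_percent < scale_above_percent and inode_percent < scale_above_percent

# Quick sanity checks of the sentinel behavior:
assert is_below_threshold(50, -1, 80) is True    # only disk data, healthy
assert is_below_threshold(90, -1, 80) is False   # disk over threshold
assert is_below_threshold(50, 95, 80) is False   # inodes over threshold
assert is_below_threshold(50, 60, 80) is True    # both healthy
```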