
Commit 5c4fc18

devoncrouse authored and hyder committed
feat: Implement worker pool drain
Signed-off-by: Devon Crouse <[email protected]>
1 parent 3dd3bdf commit 5c4fc18

File tree

12 files changed: +146 -14 lines changed

docs/src/guide/workers_drain.md

Lines changed: 59 additions & 0 deletions
@@ -5,3 +5,62 @@
 ```javascript
 {{#include ../../../examples/workers/vars-workers-drain.auto.tfvars:4:}}
 ```
+
+## Example
+
+```
+Terraform will perform the following actions:
+
+  # module.workers_only.module.utilities[0].null_resource.drain_workers[0] will be created
+  + resource "null_resource" "drain_workers" {
+      + id       = (known after apply)
+      + triggers = {
+          + "drain_commands" = jsonencode(
+                [
+                  + "kubectl drain --timeout=900s --ignore-daemonsets=true --delete-emptydir-data=true -l oke.oraclecloud.com/pool.name=oke-vm-draining",
+                ]
+            )
+          + "drain_pools"    = jsonencode(
+                [
+                  + "oke-vm-draining",
+                ]
+            )
+        }
+    }
+
+Plan: 1 to add, 0 to change, 0 to destroy.
+```
+
+```
+module.workers_only.module.utilities[0].null_resource.drain_workers[0] (remote-exec): node/10.200.220.157 cordoned
+module.workers_only.module.utilities[0].null_resource.drain_workers[0] (remote-exec): WARNING: ignoring DaemonSet-managed Pods: kube-system/csi-oci-node-99x74, kube-system/kube-flannel-ds-spvsp, kube-system/kube-proxy-6m2kk, ...
+module.workers_only.module.utilities[0].null_resource.drain_workers[0] (remote-exec): node/10.200.220.157 drained
+module.workers_only.module.utilities[0].null_resource.drain_workers[0]: Creation complete after 18s [id=7686343707387113624]
+
+Apply complete! Resources: 1 added, 0 changed, 0 destroyed.
+```
+
+Observe that the node(s) are now disabled for scheduling, and free of workloads other than DaemonSet-managed Pods when `worker_drain_ignore_daemonsets = true` (default):
+```shell
+kubectl get nodes -l oke.oraclecloud.com/pool.name=oke-vm-draining
+NAME             STATUS                     ROLES   AGE   VERSION
+10.200.220.157   Ready,SchedulingDisabled   node    24m   v1.26.2
+
+kubectl get pods --all-namespaces --field-selector spec.nodeName=10.200.220.157
+NAMESPACE     NAME                    READY   STATUS    RESTARTS   AGE
+kube-system   csi-oci-node-99x74      1/1     Running   0          50m
+kube-system   kube-flannel-ds-spvsp   1/1     Running   0          50m
+kube-system   kube-proxy-6m2kk        1/1     Running   0          50m
+kube-system   proxymux-client-2r6lk   1/1     Running   0          50m
+```
+
+Run the following command to uncordon a previously drained worker pool. The `drain = true` setting should be removed from the `worker_pools` entry to avoid re-draining the pool when running Terraform in the future.
+```shell
+kubectl uncordon -l oke.oraclecloud.com/pool.name=oke-vm-draining
+node/10.200.220.157 uncordoned
+```
+
+## References
+* [Safely Drain a Node](https://kubernetes.io/docs/tasks/administer-cluster/safely-drain-node/)
+* [`kubectl drain`](https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#drain)
+* [Deleting a Worker Node](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengdeletingworkernodes.htm)
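For reference, the pool drained in the example above is selected by setting `drain = true` on its `worker_pools` entry. A minimal sketch, reusing the pool name from the example (the other attribute values are illustrative):

```hcl
worker_pools = {
  oke-vm-draining = {
    description = "Node pool with draining workers",
    size        = 1,
    drain       = true # remove after uncordoning to avoid re-draining the pool on later applies
  }
}
```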

examples/profiles/workers-only/main.tf

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ module "workers_only" {
   vcn_id              = var.vcn_id
   bastion_public_ip   = var.bastion_public_ip
   cluster_id          = var.cluster_id
+  operator_private_ip = var.operator_private_ip
   ssh_public_key_path = var.ssh_public_key_path
 
   create_vcn = false

examples/profiles/workers-only/variables.tf

Lines changed: 5 additions & 0 deletions
@@ -14,6 +14,11 @@ variable "config_file_profile" {
   type    = string
 }
 
+variable "operator_private_ip" {
+  default = null
+  type    = string
+}
+
 variable "worker_nsg_ids" {
   default = []
   type    = list(string)

examples/workers/vars-workers-drain.auto.tfvars

Lines changed: 4 additions & 0 deletions
@@ -4,6 +4,10 @@
 worker_pool_mode = "node-pool"
 worker_pool_size = 1
 
+worker_drain_ignore_daemonsets = true
+worker_drain_delete_local_data = true
+worker_drain_timeout_seconds   = 900
+
 worker_pools = {
   oke-vm-active = {
     description = "Node pool with active workers",

module-utilities.tf

Lines changed: 5 additions & 3 deletions
@@ -25,7 +25,9 @@ module "utilities" {
   ocir_secret_namespace = var.ocir_secret_namespace
   ocir_username         = var.ocir_username
 
-  providers = {
-    oci.home = oci.home
-  }
+  # Worker pool draining
+  expected_drain_count           = local.worker_drain_expected
+  worker_drain_delete_local_data = var.worker_drain_delete_local_data
+  worker_drain_ignore_daemonsets = var.worker_drain_ignore_daemonsets
+  worker_drain_timeout_seconds   = var.worker_drain_timeout_seconds
 }

module-workers.tf

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 
 locals {
   worker_count_expected = coalesce(one(module.workers[*].worker_count_expected), 0)
+  worker_drain_expected = coalesce(one(module.workers[*].worker_drain_expected), 0)
 
   # Distinct list of compartments for enabled worker pools
   worker_compartments = distinct(compact([

modules/utilities/drain.tf

Lines changed: 17 additions & 8 deletions
@@ -2,16 +2,28 @@
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
 
 locals {
-  drain_enabled = var.expected_node_count > 0 && var.worker_pools != null
-  worker_pools_draining = (local.drain_enabled
-    ? { for k, v in var.worker_pools : k => v if tobool(lookup(v, "drain", false)) } : {}
+  drain_enabled = var.expected_drain_count > 0
+  drain_pools = (local.drain_enabled
+    ? tolist([for k, v in var.worker_pools : k if tobool(lookup(v, "drain", false))]) : []
+  )
+
+  drain_commands = formatlist(
+    format(
+      "kubectl drain %v %v %v %v",
+      format("--timeout=%vs", var.worker_drain_timeout_seconds),
+      format("--ignore-daemonsets=%v", var.worker_drain_ignore_daemonsets),
+      format("--delete-emptydir-data=%v", var.worker_drain_delete_local_data),
+      "-l oke.oraclecloud.com/pool.name=%v" # interpolation deferred to formatlist
+    ),
+    local.drain_pools
   )
 }
 
 resource "null_resource" "drain_workers" {
   count = local.drain_enabled ? 1 : 0
   triggers = {
-    drain_workers = jsonencode(sort(keys(local.worker_pools_draining)))
+    drain_pools    = jsonencode(sort(local.drain_pools))
+    drain_commands = jsonencode(local.drain_commands)
   }
 
   connection {
@@ -26,9 +38,6 @@ resource "null_resource" "drain_workers" {
   }
 
   provisioner "remote-exec" {
-    inline = [
-      "echo kubectl get nodes ...",             # TODO List nodes by label for draining pools
-      "echo kubectl drain --ignore-daemonsets", # TODO Drain nodes for draining pools
-    ]
+    inline = local.drain_commands
  }
 }
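To illustrate the deferred interpolation noted in the comment above: the inner `format()` resolves the flag values but leaves the label selector's `%v` untouched (format verbs are only interpreted in the format string, not in its arguments), so `formatlist()` then fills it once per pool. A minimal standalone sketch with an illustrative pool list:

```hcl
locals {
  # Illustrative input; in the module this is derived from the worker_pools map.
  drain_pools = ["oke-vm-draining"]

  # The inner format() has already produced a template ending in "pool.name=%v";
  # formatlist() substitutes each pool name into that remaining verb.
  drain_commands = formatlist(
    "kubectl drain --timeout=900s --ignore-daemonsets=true --delete-emptydir-data=true -l oke.oraclecloud.com/pool.name=%v",
    local.drain_pools
  )
  # Result:
  # ["kubectl drain --timeout=900s --ignore-daemonsets=true --delete-emptydir-data=true -l oke.oraclecloud.com/pool.name=oke-vm-draining"]
}
```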

modules/utilities/variables.tf

Lines changed: 5 additions & 1 deletion
@@ -22,6 +22,10 @@ variable "ocir_secret_name" { type = string }
 variable "ocir_secret_namespace" { type = string }
 variable "ocir_username" { type = string }
 
-# Node readiness check
+# Node readiness check, drain
 variable "await_node_readiness" { type = string }
+variable "expected_drain_count" { type = number }
 variable "expected_node_count" { type = number }
+variable "worker_drain_ignore_daemonsets" { type = bool }
+variable "worker_drain_delete_local_data" { type = bool }
+variable "worker_drain_timeout_seconds" { type = number }

modules/utilities/versions.tf

Lines changed: 0 additions & 1 deletion
This file was deleted.

modules/utilities/versions.tf

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+# Copyright (c) 2017, 2023 Oracle Corporation and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
+
+terraform {
+  required_version = ">= 1.2.0"
+
+  required_providers {
+    null = {
+      source  = "hashicorp/null"
+      version = ">= 3.2.1"
+    }
+
+    oci = {
+      source  = "oracle/oci"
+      version = ">= 4.115.0"
+    }
+  }
+}

modules/workers/locals.tf

Lines changed: 6 additions & 1 deletion
@@ -138,7 +138,12 @@ locals {
 
   # Number of nodes expected from enabled worker pools
   expected_node_count = length(local.enabled_worker_pools) == 0 ? 0 : sum([
-    for k, v in local.enabled_worker_pools : lookup(v, "size", 0)
+    for k, v in local.enabled_worker_pools : lookup(v, "size", var.worker_pool_size)
+  ])
+
+  # Number of nodes expected to be draining in worker pools
+  expected_drain_count = length(local.enabled_worker_pools) == 0 ? 0 : sum([
+    for k, v in local.enabled_worker_pools : tobool(v.drain) ? lookup(v, "size", var.worker_pool_size) : 0
   ])
 
   # Enabled worker_pool map entries for node pools
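To make the new count concrete, a minimal sketch with a hypothetical pool map. Here `drain` is read through `lookup()` with a default so the sketch runs even when a pool omits the attribute; the module's own expression assumes the attribute is already present on each enabled pool:

```hcl
locals {
  worker_pool_size = 1 # stand-in for var.worker_pool_size

  enabled_worker_pools = {
    oke-vm-active   = { size = 2 }               # not draining: contributes 0 to the drain count
    oke-vm-draining = { size = 1, drain = true } # draining: contributes its size
  }

  expected_node_count = sum([
    for k, v in local.enabled_worker_pools : lookup(v, "size", local.worker_pool_size)
  ]) # = 3

  expected_drain_count = sum([
    for k, v in local.enabled_worker_pools :
    tobool(lookup(v, "drain", false)) ? lookup(v, "size", local.worker_pool_size) : 0
  ]) # = 1
}
```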

modules/workers/outputs.tf

Lines changed: 5 additions & 0 deletions
@@ -20,3 +20,8 @@ output "worker_count_expected" {
   description = "# of nodes expected from created worker pools"
   value       = local.expected_node_count
 }
+
+output "worker_drain_expected" {
+  description = "# of nodes expected to be draining in worker pools"
+  value       = local.expected_drain_count
+}

variables-utilities.tf

Lines changed: 20 additions & 0 deletions
@@ -43,3 +43,23 @@ variable "ocir_username" {
   description = "A username with access to the OCI Vault secret for OCIR access. Required when 'ocir_secret_id' is provided."
   type        = string
 }
+
+# Worker pool draining
+
+variable "worker_drain_ignore_daemonsets" {
+  default     = true
+  description = "Whether to ignore DaemonSet-managed Pods when draining worker pools. See <a href=https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#drain>kubectl drain</a> for more information."
+  type        = bool
+}
+
+variable "worker_drain_delete_local_data" {
+  default     = true
+  description = "Whether to accept removal of data stored locally on draining worker pools. See <a href=https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#drain>kubectl drain</a> for more information."
+  type        = bool
+}
+
+variable "worker_drain_timeout_seconds" {
+  default     = 900
+  description = "The length of time to wait before giving up on draining nodes in a pool. See <a href=https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#drain>kubectl drain</a> for more information."
+  type        = number
+}
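These root-level defaults can be overridden per deployment in a tfvars file; a minimal sketch (the values here are illustrative, not recommendations):

```hcl
worker_drain_ignore_daemonsets = true
worker_drain_delete_local_data = false # fail the drain rather than delete Pods' emptyDir data
worker_drain_timeout_seconds   = 300   # give up on a pool after 5 minutes
```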

0 commit comments
