Skip to content
This repository was archived by the owner on Apr 17, 2025. It is now read-only.

Commit 5851b38

Browse files
authored
Merge pull request #240 from adrianludwin/hrq-drift
Add periodic HRQ resync
2 parents 3dac447 + 3d135ee commit 5851b38

File tree

5 files changed

+114
-41
lines changed

5 files changed

+114
-41
lines changed

cmd/manager/main.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"net/http"
2222
"os"
2323
"strings"
24+
"time"
2425

2526
"contrib.go.opencensus.io/exporter/prometheus"
2627
"contrib.go.opencensus.io/exporter/stackdriver"
@@ -75,6 +76,7 @@ var (
7576
webhooksOnly bool
7677
enableHRQ bool
7778
hncNamespace string
79+
hrqSyncInterval time.Duration
7880
)
7981

8082
// init preloads some global vars before main() starts. Since this is the top-level module, I'm not
@@ -153,6 +155,7 @@ func parseFlags() {
153155
flag.BoolVar(&webhooksOnly, "webhooks-only", false, "Disables the controllers so HNC can be run in HA webhook mode")
154156
flag.BoolVar(&enableHRQ, "enable-hrq", false, "Enables hierarchical resource quotas")
155157
flag.StringVar(&hncNamespace, "namespace", "hnc-system", "Namespace where hnc-manager and hnc resources deployed")
158+
flag.DurationVar(&hrqSyncInterval, "hrq-sync-interval", 1*time.Minute, "Frequency to double-check that all HRQ usages are up-to-date (shouldn't be needed)")
156159
flag.Parse()
157160

158161
// Assign the array args to the configuration variables after the args are parsed.
@@ -307,9 +310,10 @@ func startControllers(mgr ctrl.Manager, certsReady chan struct{}) {
307310
f := forest.NewForest()
308311

309312
opts := setup.Options{
310-
NoWebhooks: noWebhooks,
311-
MaxReconciles: maxReconciles,
312-
HRQ: enableHRQ,
313+
NoWebhooks: noWebhooks,
314+
MaxReconciles: maxReconciles,
315+
HRQ: enableHRQ,
316+
HRQSyncInterval: hrqSyncInterval,
313317
}
314318
setup.Create(setupLog, mgr, f, opts)
315319

internal/forest/namespacehrq.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
package forest
22

33
import (
4+
"errors"
45
"fmt"
56

7+
"github.com/go-logr/logr"
68
v1 "k8s.io/api/core/v1"
9+
"k8s.io/apimachinery/pkg/types"
710

811
"sigs.k8s.io/hierarchical-namespaces/internal/hrq/utils"
912
)
@@ -246,3 +249,52 @@ func (n *Namespace) UpdateLimits(nm string, l v1.ResourceList) bool {
246249
n.quotas.limits[nm] = l
247250
return true
248251
}
252+
253+
// RectifySubtreeUsages ensures that the subtree usages of every namespaces is in sync with all of
254+
// its descendants. This should be ensured by the logic in UseResources, but bugs happen, so this is
255+
// an added level of safety. If any discrepancies are found, this function logs an error, updates
256+
// the corrected usages in-memory, and returns a list of affected HRQ objects so that they can be
257+
// re-reconciled to show the corrected usages.
258+
//
259+
// The forest lock must be held when calling this function.
260+
func (f *Forest) RectifySubtreeUsages(log logr.Logger) []types.NamespacedName {
261+
// Recalculate all usages from scratch
262+
usages := map[string]v1.ResourceList{}
263+
for _, ns := range f.namespaces {
264+
local := ns.quotas.used.local
265+
// NB: AncestryNames includes the namespace itself
266+
for _, anc := range ns.AncestryNames() {
267+
if existing, ok := usages[anc]; ok {
268+
usages[anc] = utils.Add(existing, local)
269+
} else {
270+
usages[anc] = local
271+
}
272+
}
273+
}
274+
275+
// Look for any out-of-date HRQ usages
276+
updated := []types.NamespacedName{}
277+
for nm, ns := range f.namespaces {
278+
have := ns.quotas.used.subtree
279+
actual := utils.FilterUnlimited(usages[nm], ns.Limits())
280+
if utils.Equals(have, actual) {
281+
continue
282+
}
283+
284+
// Oopsies.
285+
err := errors.New("HRQ correctness error")
286+
log.Error(err, "incrementally calculated usages are incorrect", "ns", nm, "incremental", have, "actual", actual)
287+
288+
// Update and return info so the reconciler can write back the corrected usages.
289+
ns.quotas.used.subtree = actual
290+
for hrq := range ns.quotas.limits {
291+
// These are the names of the actual objects
292+
updated = append(updated, types.NamespacedName{
293+
Namespace: nm,
294+
Name: hrq,
295+
})
296+
}
297+
}
298+
299+
return updated
300+
}

internal/hrq/hrqreconciler_test.go

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -122,35 +122,33 @@ var _ = Describe("HRQ reconciler tests", func() {
122122
Eventually(getRQSpec(ctx, barName)).Should(equalRL("secrets", "100", "cpu", "50"))
123123
})
124124

125-
XIt("should recover if the subtree usages are out of sync in the forest and in reality", func() {
126-
setHRQ(ctx, fooHRQName, fooName, "secrets", "6", "pods", "3")
127-
setHRQ(ctx, barHRQName, barName, "secrets", "100", "cpu", "50")
128-
// Simulate the K8s ResourceQuota controller to update usages.
129-
updateRQUsage(ctx, fooName, "secrets", "0", "pods", "0")
130-
updateRQUsage(ctx, barName, "secrets", "0", "cpu", "0", "pods", "0")
125+
It("should recover if the subtree usages are out of sync in the forest and in reality", func() {
126+
setHRQ(ctx, fooHRQName, fooName, "secrets", "100")
127+
setHRQ(ctx, barHRQName, barName, "secrets", "100")
131128

132-
// Consume 10 secrets in bar.
129+
// Consume 10 secrets in bar and verified that they're used
133130
updateRQUsage(ctx, barName, "secrets", "10")
131+
Eventually(getHRQUsed(ctx, fooName, fooHRQName)).Should(equalRL("secrets", "10"))
132+
Eventually(getHRQUsed(ctx, barName, barHRQName)).Should(equalRL("secrets", "10"))
134133

135-
// Get the expected bar local usages and foo subtree usages in the forest.
136-
Eventually(forestGetLocalUsages(barName)).Should(equalRL("secrets", "10", "cpu", "0", "pods", "0"))
137-
Eventually(forestGetSubtreeUsages(fooName)).Should(equalRL("secrets", "10", "pods", "0"))
134+
// If we do the check now, it shouldn't find anything
135+
Expect(TestCheckHRQDrift()).Should(Equal(false))
138136

139-
// Introduce a bug to make the foo subtree usages in the forest out-of-sync.
140-
// 10 secrets are consumed in reality while the forest has 9.
141-
forestSetSubtreeUsages(fooName, "secrets", "9", "pods", "0")
142-
// Verify the subtree secret usages is updated from 10 to 9.
143-
Eventually(forestGetSubtreeUsages(fooName)).Should(equalRL("secrets", "9", "pods", "0"))
137+
// Introduce a bug to make the foo subtree usages in the forest out-of-sync. Pretend that we
138+
// somehow dropped a secret.
139+
forestOverrideSubtreeUsages(fooName, "secrets", "9")
144140

145-
// Now consume 1 more secret (11 compared with previous 10).
141+
// Now use an additional secret in the child. The parent should still be out-of-sync by one.
146142
updateRQUsage(ctx, barName, "secrets", "11")
143+
Eventually(getHRQUsed(ctx, fooName, fooHRQName)).Should(equalRL("secrets", "10"))
144+
Eventually(getHRQUsed(ctx, barName, barHRQName)).Should(equalRL("secrets", "11"))
147145

148-
// We should eventually get 11 secrets subtree usages in the forest.
149-
// Note: without the recalculation of subtree usages from scratch in the HRQ
150-
// reconciler, we would get an incremental update to 10 according to the
151-
// calculation in "UseResources()": 9 + (11 - 10), which is still out-of-sync.
152-
Eventually(forestGetSubtreeUsages(fooName), HRQSyncInterval).Should(equalRL("secrets", "11", "pods", "0"))
153-
Eventually(getHRQUsed(ctx, fooName, fooHRQName)).Should(equalRL("secrets", "11", "pods", "0"))
146+
// Run the checker and make sure it finds something
147+
Expect(TestCheckHRQDrift()).Should(Equal(true))
148+
149+
// And everything should be in sync again
150+
Eventually(getHRQUsed(ctx, fooName, fooHRQName)).Should(equalRL("secrets", "11"))
151+
Eventually(getHRQUsed(ctx, barName, barHRQName)).Should(equalRL("secrets", "11"))
154152
})
155153

156154
It("should enqueue subtree even if the newly created HRQ has the same `spec.hard` and `status.hard`", func() {
@@ -227,7 +225,7 @@ var _ = Describe("HRQ reconciler tests", func() {
227225
})
228226
})
229227

230-
func forestSetSubtreeUsages(ns string, args ...string) {
228+
func forestOverrideSubtreeUsages(ns string, args ...string) {
231229
TestForest.Lock()
232230
defer TestForest.Unlock()
233231
TestForest.Get(ns).TestOnlySetSubtreeUsage(argsToResourceList(0, args...))

internal/integtest/setup.go

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,6 @@ var (
5151
TestForest *forest.Forest
5252
)
5353

54-
const (
55-
HRQSyncInterval = 5 * time.Second
56-
)
57-
5854
func HNCRun(t *testing.T, title string) {
5955
RunSpecs(t, title)
6056
}
@@ -107,11 +103,10 @@ func HNCBeforeSuite() {
107103

108104
By("creating reconcilers")
109105
opts := setup.Options{
110-
MaxReconciles: 100,
111-
UseFakeClient: true,
112-
HNCCfgRefresh: 1 * time.Second, // so we don't have to wait as long
113-
HRQ: true,
114-
HRQSyncInterval: HRQSyncInterval,
106+
MaxReconciles: 100,
107+
UseFakeClient: true,
108+
HNCCfgRefresh: 1 * time.Second, // so we don't have to wait as long
109+
HRQ: true,
115110
}
116111
TestForest = forest.NewForest()
117112
err = setup.CreateReconcilers(k8sManager, TestForest, opts)
@@ -140,3 +135,7 @@ func HNCAfterSuite() {
140135
err := testEnv.Stop()
141136
Expect(err).ToNot(HaveOccurred())
142137
}
138+
139+
func TestCheckHRQDrift() bool {
140+
return setup.TestOnlyCheckHRQDrift(TestForest)
141+
}

internal/setup/reconcilers.go

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ import (
1616
"sigs.k8s.io/hierarchical-namespaces/internal/hrq"
1717
)
1818

19+
var (
20+
// hrqrForTesting is used by TestOnlyCheckHRQDrift
21+
hrqrForTesting *hrq.HierarchicalResourceQuotaReconciler
22+
)
23+
1924
type Options struct {
2025
MaxReconciles int
2126
UseFakeClient bool
@@ -89,6 +94,7 @@ func CreateReconcilers(mgr ctrl.Manager, f *forest.Forest, opts Options) error {
8994
RQR: rqr,
9095
}
9196
rqr.HRQR = hrqr
97+
hrqrForTesting = hrqr
9298
f.AddListener(hrqr)
9399

94100
if err := rqr.SetupWithManager(mgr); err != nil {
@@ -101,7 +107,7 @@ func CreateReconcilers(mgr ctrl.Manager, f *forest.Forest, opts Options) error {
101107

102108
// Create a periodic checker to make sure the forest is not out-of-sync.
103109
if opts.HRQSyncInterval != 0 {
104-
go detectIncrementalHRQUsageDrift(f, opts.HRQSyncInterval, hrqr)
110+
go watchHRQDrift(f, opts.HRQSyncInterval, hrqr)
105111
}
106112
}
107113

@@ -118,13 +124,27 @@ func CreateReconcilers(mgr ctrl.Manager, f *forest.Forest, opts Options) error {
118124
return nil
119125
}
120126

121-
func detectIncrementalHRQUsageDrift(f *forest.Forest, forestSyncInterval time.Duration, hrqr *hrq.HierarchicalResourceQuotaReconciler) {
127+
func watchHRQDrift(f *forest.Forest, forestSyncInterval time.Duration, hrqr *hrq.HierarchicalResourceQuotaReconciler) {
122128
syncTicker := time.NewTicker(forestSyncInterval)
123129
for {
124130
<-syncTicker.C
125-
// If there's any out-of-sync, enqueue the affected HRQs to update usages.
126-
// If not, nothing will be enqueued.
127-
//hrqr.Enqueue(hrqr.Log, "recover out-of-sync usages", f.CheckSubtreeUsages())
128-
hrqr.Log.Info("TODO: recover out-of-sync usages")
131+
checkHRQDrift(f, hrqr)
129132
}
130133
}
134+
135+
func checkHRQDrift(f *forest.Forest, hrqr *hrq.HierarchicalResourceQuotaReconciler) bool {
136+
f.Lock()
137+
defer f.Unlock()
138+
found := false
139+
for _, nsnm := range f.RectifySubtreeUsages(hrqr.Log) {
140+
found = true
141+
hrqr.Enqueue(hrqr.Log, "usage out-of-sync", nsnm.Namespace, nsnm.Name)
142+
}
143+
return found
144+
}
145+
146+
// TestOnlyCheckHRQDrift is used in the integ tests to invoke the checker at exactly the right time
147+
// to verify that it works as expected.
148+
func TestOnlyCheckHRQDrift(f *forest.Forest) bool {
149+
return checkHRQDrift(f, hrqrForTesting)
150+
}

0 commit comments

Comments
 (0)