diff --git a/test/clusterloader2/overrides/scheduler_throughput.yaml b/test/clusterloader2/overrides/scheduler_throughput.yaml new file mode 100644 index 00000000000..625cb215c5b --- /dev/null +++ b/test/clusterloader2/overrides/scheduler_throughput.yaml @@ -0,0 +1,4 @@ +CL2_DEFAULT_QPS: 10000 # Default 500 +CL2_DEFAULT_BURST: 20000 # Default 1000 +CL2_UNIFORM_QPS: 10000 # Default 500 +CL2_SCHEDULER_THROUGHPUT_PODS_PER_DEPLOYMENT: 50000 \ No newline at end of file diff --git a/test/clusterloader2/testing/access-tokens/config.yaml b/test/clusterloader2/testing/access-tokens/config.yaml new file mode 100644 index 00000000000..82827c904bb --- /dev/null +++ b/test/clusterloader2/testing/access-tokens/config.yaml @@ -0,0 +1,181 @@ +# Stress testing access token validation +# +# Targeting 2 000 tokens with 5 000 total QPS for a 5k node cluster, so it's 2.5 +# QPS per token. +# +# For this test, the number of tokens does not change with the number of nodes. +# By default, those 2 000 tokens are assigned to 80 service accounts, with +# 25 tokens each. There is a 1:1 mapping between deployments and service +# accounts, so 80 deployments are generated, each with one pod. +# +# For smaller clusters, we scale down the QPS per token linearly to +# 2.5 * (Number of nodes)/(5 000). This results in 1 QPS per node if there are +# 2 000 tokens. +# +# Structure and mapping: +# * For each namespace (by default 1), we are generating service accounts and +# deployments (by default 80). +# * For each service account we are generating tokens (by default 25). +# * For each deployment we are creating pods (by default 1) and for those pods +# we are mounting all tokens generated from the linked service account. +# * Each pod is running a number of clients equal to the number of assigned tokens. +# +# When defining your own parameters: +# Number of tokens = ${namespaces} * ${serviceAccounts} * ${tokensPerServiceAccount} +# Total QPS = Number of tokens * ${replicas} * ${qpsPerWorker} +# +# For the default values in a 5k cluster this means: +# Number of tokens = 1 * 80 * 25 = 2000 +# Total QPS = 2000 * 1 * 2.5 = 5000 + +# Size of test variables +{{$namespaces := DefaultParam .CL2_ACCESS_TOKENS_NAMESPACES 1}} +{{$serviceAccounts := DefaultParam .CL2_ACCESS_TOKENS_SERVICE_ACCOUNTS 80}} +{{$tokensPerServiceAccount := DefaultParam .CL2_ACCESS_TOKENS_TOKENS_PER_SERVICE_ACCOUNT 25}} +{{$replicas := DefaultParam .CL2_ACCESS_TOKENS_REPLICAS 1}} +{{$qpsPerWorker := DefaultParam .CL2_ACCESS_TOKENS_QPS (MultiplyFloat 2.5 (DivideFloat .Nodes 5000))}} + +# TestMetrics measurement variables +{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK true}} +{{$RESTART_COUNT_THRESHOLD_OVERRIDES:= DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}} + +# Configs +{{$ALLOWED_SLOW_API_CALLS := DefaultParam .CL2_ALLOWED_SLOW_API_CALLS 0}} + +name: access-tokens +namespace: + number: {{$namespaces}} +tuningSets: + - name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 +steps: +- name: Starting measurements + measurements: + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} + +- name: Creating
ServiceAccounts + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: service-account-getter + objectTemplatePath: role.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$serviceAccounts}} + tuningSet: Sequence + objectBundle: + - basename: account + objectTemplatePath: serviceAccount.yaml + - basename: account + objectTemplatePath: roleBinding.yaml + templateFillMap: + RoleName: service-account-getter + +- name: Creating Tokens + phases: + {{range $i := Loop $serviceAccounts}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$tokensPerServiceAccount}} + tuningSet: Sequence + objectBundle: + - basename: account-{{$i}} + objectTemplatePath: token.yaml + {{end}} + + +- name: Starting measurement for waiting for pods + measurements: + - Identifier: WaitForRunningPods + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = access-tokens + operationTimeout: 15m + +- name: Creating pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$serviceAccounts}} + tuningSet: Sequence + objectBundle: + - basename: account + objectTemplatePath: deployment.yaml + templateFillMap: + QpsPerWorker: {{$qpsPerWorker}} + Replicas: {{$replicas}} + Tokens: {{$tokensPerServiceAccount}} + +- name: Waiting for pods to be running + measurements: + - Identifier: WaitForRunningPods + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Wait 5min + measurements: + - Identifier: Wait + Method: Sleep + Params: + duration: 5m + +- name: Deleting pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: account + objectTemplatePath: deployment.yaml + templateFillMap: + QpsPerWorker: {{$qpsPerWorker}} + Replicas: {{$replicas}} + Tokens: {{$tokensPerServiceAccount}} + +- name: Waiting for pods to be deleted + measurements: + - Identifier: WaitForRunningPods + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collecting measurements + measurements: + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: true + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: gather + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} diff --git a/test/clusterloader2/testing/access-tokens/deployment.yaml b/test/clusterloader2/testing/access-tokens/deployment.yaml new file mode 100644 index 00000000000..38df88878f2 --- /dev/null +++ b/test/clusterloader2/testing/access-tokens/deployment.yaml @@ -0,0 +1,45 @@ +{{$name := .Name}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: access-tokens +spec: + selector: + matchLabels: + group: access-tokens + name: {{.Name}} + replicas: {{.Replicas}} + template: + metadata: + labels: + group: access-tokens + name: {{.Name}} + spec: + imagePullPolicy: Always + containers: + - name: access-tokens + image: gcr.io/k8s-testimages/perf-tests-util/access-tokens:v0.0.6 + args: + {{range $tokenId := Loop .Tokens}} + - --access-token-dirs=/var/tokens/{{$name}}-{{$tokenId}} + {{end}} + - --namespace={{.Namespace}} + - --qps-per-worker={{.QpsPerWorker}} + 
resources: + requests: + cpu: {{AddInt 10 (MultiplyFloat .Tokens .QpsPerWorker)}}m # 1mCpu per Token * per QPS + memory: {{AddInt 50 (MultiplyInt .Tokens 5)}}Mi + volumeMounts: + {{range $j := Loop .Tokens}} + - name: {{$name}}-{{$j}} + mountPath: /var/tokens/{{$name}}-{{$j}} + {{end}} + volumes: + {{range $j := Loop .Tokens}} + - name: {{$name}}-{{$j}} + secret: + secretName: {{$name}}-{{$j}} + {{end}} diff --git a/test/clusterloader2/testing/access-tokens/role.yaml b/test/clusterloader2/testing/access-tokens/role.yaml new file mode 100644 index 00000000000..f79f3853297 --- /dev/null +++ b/test/clusterloader2/testing/access-tokens/role.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{.Name}} +rules: + - apiGroups: + - "" + resources: + - serviceaccounts + verbs: + - get diff --git a/test/clusterloader2/testing/access-tokens/roleBinding.yaml b/test/clusterloader2/testing/access-tokens/roleBinding.yaml new file mode 100644 index 00000000000..cb0dad325a1 --- /dev/null +++ b/test/clusterloader2/testing/access-tokens/roleBinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{.Name}} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{.RoleName}}-0 +subjects: + - kind: ServiceAccount + name: {{.Name}} + namespace: {{.Namespace}} diff --git a/test/clusterloader2/testing/access-tokens/serviceAccount.yaml b/test/clusterloader2/testing/access-tokens/serviceAccount.yaml new file mode 100644 index 00000000000..c931b8d95b9 --- /dev/null +++ b/test/clusterloader2/testing/access-tokens/serviceAccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{.Name}} diff --git a/test/clusterloader2/testing/access-tokens/token.yaml b/test/clusterloader2/testing/access-tokens/token.yaml new file mode 100644 index 00000000000..34adf037951 --- /dev/null +++ b/test/clusterloader2/testing/access-tokens/token.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{.Name}} + annotations: + kubernetes.io/service-account.name: {{.BaseName}} +type: kubernetes.io/service-account-token diff --git a/test/clusterloader2/testing/batch/config.yaml b/test/clusterloader2/testing/batch/config.yaml new file mode 100644 index 00000000000..e1459f64e99 --- /dev/null +++ b/test/clusterloader2/testing/batch/config.yaml @@ -0,0 +1,94 @@ +{{$MODE := DefaultParam .MODE "Indexed"}} +{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .NODES_PER_NAMESPACE 100)}} +{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}} +{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}} + +{{$totalPods := MultiplyInt $PODS_PER_NODE .Nodes}} +{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}} +{{$podsPerNamespace := DivideInt $totalPods $namespaces}} + +# small_job: 1/2 of namespace pods should be in small Jobs. +{{$smallJobSize := 5}} +{{$smallJobsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 2 $smallJobSize)}} +# medium_job: 1/4 of namespace pods should be in medium Jobs. +{{$mediumJobSize := 20}} +{{$mediumJobsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 4 $mediumJobSize)}} +# Large_job: 1/4 of namespace pods should be in large Jobs. 
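+# For illustration, assuming the defaults above (PODS_PER_NODE=30,
+# NODES_PER_NAMESPACE=100) on a cluster of 100+ nodes: each namespace holds
+# 3000 pods, which works out to 300 small, 37 medium and 1 large Job per
+# namespace (integer division rounds the medium and large counts down).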
+{{$largeJobSize := 400}} +{{$largeJobsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 4 $largeJobSize)}} + +{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}} + +name: batch + +namespace: + number: {{$namespaces}} + +tuningSets: +- name: UniformQPS + qpsLoad: + qps: {{$LOAD_TEST_THROUGHPUT}} + +steps: +- name: Start measurements + measurements: + - Identifier: WaitForFinishedJobs + Method: WaitForFinishedJobs + Params: + action: start + labelSelector: group = test-job + - Identifier: JobLifecycleLatency + Method: JobLifecycleLatency + Params: + action: start + labelSelector: group = test-job +- name: Create {{$MODE}} jobs + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$smallJobsPerNamespace}} + tuningSet: UniformQPS + objectBundle: + - basename: small + objectTemplatePath: "job.yaml" + templateFillMap: + Replicas: {{$smallJobSize}} + Mode: {{$MODE}} + Sleep: {{$jobRunningTime}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$mediumJobsPerNamespace}} + tuningSet: UniformQPS + objectBundle: + - basename: medium + objectTemplatePath: "job.yaml" + templateFillMap: + Replicas: {{$mediumJobSize}} + Mode: {{$MODE}} + Sleep: {{$jobRunningTime}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$largeJobsPerNamespace}} + tuningSet: UniformQPS + objectBundle: + - basename: large + objectTemplatePath: "job.yaml" + templateFillMap: + Replicas: {{$largeJobSize}} + Mode: {{$MODE}} + Sleep: {{$jobRunningTime}} +- name: Wait for {{$MODE}} jobs to finish + measurements: + - Identifier: JobLifecycleLatency + Method: JobLifecycleLatency + Params: + action: gather + timeout: 10m + - Identifier: WaitForFinishedJobs + Method: WaitForFinishedJobs + Params: + action: gather + timeout: 10m diff --git a/test/clusterloader2/testing/batch/job.yaml b/test/clusterloader2/testing/batch/job.yaml new file mode 100644 index 00000000000..aa77426df97 --- /dev/null +++ b/test/clusterloader2/testing/batch/job.yaml @@ -0,0 +1,21 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{.Name}} + labels: + group: test-job +spec: + parallelism: {{.Replicas}} + completions: {{.Replicas}} + completionMode: {{.Mode}} + template: + metadata: + labels: + group: test-pod + spec: + containers: + - name: {{.Name}} + image: gcr.io/k8s-staging-perf-tests/sleep:v0.0.3 + args: + - {{.Sleep}} + restartPolicy: Never diff --git a/test/clusterloader2/testing/chaosmonkey/ignore_node_killer_container_restarts_100.yaml b/test/clusterloader2/testing/chaosmonkey/ignore_node_killer_container_restarts_100.yaml new file mode 100644 index 00000000000..f94b340602a --- /dev/null +++ b/test/clusterloader2/testing/chaosmonkey/ignore_node_killer_container_restarts_100.yaml @@ -0,0 +1,10 @@ +RESTART_COUNT_THRESHOLD_OVERRIDES: | + # Main purpose of this check is detection crashlooping pods. 
+ # With enabled node killer pods running on a killed node crash, and this is expected + coredns: 1 + fluentd-gcp: 1 + kube-proxy: 1 + konnectivity-agent: 1 + metadata-proxy: 1 + prometheus-to-sd-exporter: 1 + volume-snapshot-controller: 1 diff --git a/test/clusterloader2/testing/chaosmonkey/override.yaml b/test/clusterloader2/testing/chaosmonkey/override.yaml new file mode 100644 index 00000000000..84ab3a28aa8 --- /dev/null +++ b/test/clusterloader2/testing/chaosmonkey/override.yaml @@ -0,0 +1 @@ +ENABLE_CHAOSMONKEY: true diff --git a/test/clusterloader2/testing/density/config.yaml b/test/clusterloader2/testing/density/config.yaml new file mode 100644 index 00000000000..07cdd7c1a51 --- /dev/null +++ b/test/clusterloader2/testing/density/config.yaml @@ -0,0 +1,268 @@ +# ASSUMPTIONS: +# - Underlying cluster should have 100+ nodes. +# - Number of nodes should be divisible by NODES_PER_NAMESPACE (default 100). + +#Constants +{{$DENSITY_RESOURCE_CONSTRAINTS_FILE := DefaultParam .DENSITY_RESOURCE_CONSTRAINTS_FILE ""}} +# Cater for the case where the number of nodes is less than nodes per namespace. See https://github.com/kubernetes/perf-tests/issues/887 +{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .NODES_PER_NAMESPACE 100)}} +{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}} +{{$DENSITY_TEST_THROUGHPUT := DefaultParam .DENSITY_TEST_THROUGHPUT 20}} +{{$SCHEDULER_THROUGHPUT_THRESHOLD := DefaultParam .CL2_SCHEDULER_THROUGHPUT_THRESHOLD 0}} +# LATENCY_POD_MEMORY and LATENCY_POD_CPU are calculated for 1-core 4GB node. +# Increasing allocation of both memory and cpu by 10% +# decreases the value of priority function in scheduler by one point. +# This results in decreased probability of choosing the same node again. +{{$LATENCY_POD_CPU := DefaultParam .LATENCY_POD_CPU 100}} +{{$LATENCY_POD_MEMORY := DefaultParam .LATENCY_POD_MEMORY 350}} +{{$MIN_LATENCY_PODS := DefaultParam .MIN_LATENCY_PODS 500}} +{{$MIN_SATURATION_PODS_TIMEOUT := 180}} +{{$ENABLE_CHAOSMONKEY := DefaultParam .ENABLE_CHAOSMONKEY false}} +{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$ENABLE_CLUSTER_OOMS_TRACKER := DefaultParam .CL2_ENABLE_CLUSTER_OOMS_TRACKER true}} +{{$CLUSTER_OOMS_IGNORED_PROCESSES := DefaultParam .CL2_CLUSTER_OOMS_IGNORED_PROCESSES ""}} +{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}} +{{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK true}} +{{$RESTART_COUNT_THRESHOLD_OVERRIDES:= DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}} +{{$ALLOWED_SLOW_API_CALLS := DefaultParam .CL2_ALLOWED_SLOW_API_CALLS 0}} +{{$ENABLE_VIOLATIONS_FOR_SCHEDULING_THROUGHPUT := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_SCHEDULING_THROUGHPUT true}} +#Variables +{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}} +{{$podsPerNamespace := MultiplyInt $PODS_PER_NODE $NODES_PER_NAMESPACE}} +{{$totalPods := MultiplyInt $podsPerNamespace $namespaces}} +{{$latencyReplicas := DivideInt (MaxInt $MIN_LATENCY_PODS .Nodes) $namespaces}} +{{$totalLatencyPods := MultiplyInt $namespaces $latencyReplicas}} +{{$saturationDeploymentTimeout := DivideFloat $totalPods $DENSITY_TEST_THROUGHPUT | AddInt $MIN_SATURATION_PODS_TIMEOUT}} +# saturationDeploymentHardTimeout must be at least 20m to make sure that ~10m node +# failure won't fail the test. 
See https://github.com/kubernetes/kubernetes/issues/73461#issuecomment-467338711 +{{$saturationDeploymentHardTimeout := MaxInt $saturationDeploymentTimeout 1200}} + +{{$saturationDeploymentSpec := DefaultParam .SATURATION_DEPLOYMENT_SPEC "deployment.yaml"}} +{{$latencyDeploymentSpec := DefaultParam .LATENCY_DEPLOYMENT_SPEC "deployment.yaml"}} + +# Probe measurements shared parameter +{{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT := DefaultParam .CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT "15m"}} + +name: density +namespace: + number: {{$namespaces}} +tuningSets: +- name: Uniform5qps + qpsLoad: + qps: 5 +{{if $ENABLE_CHAOSMONKEY}} +chaosMonkey: + nodeFailure: + failureRate: 0.01 + interval: 1m + jitterFactor: 10.0 + simulatedDowntime: 10m +{{end}} +steps: +- name: Starting measurements + measurements: + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: start + # TODO(oxddr): figure out how many probers to run in function of cluster + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: start + checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}} + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: start + checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}} + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + resourceConstraints: {{$DENSITY_RESOURCE_CONSTRAINTS_FILE}} + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}} + clusterOOMsIgnoredProcesses: {{$CLUSTER_OOMS_IGNORED_PROCESSES}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} + +- name: Starting saturation pod measurements + measurements: + - Identifier: SaturationPodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = saturation + threshold: {{$saturationDeploymentTimeout}}s + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = saturation + operationTimeout: {{$saturationDeploymentHardTimeout}}s + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: start + labelSelector: group = saturation + +- name: Creating saturation pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: Uniform5qps + objectBundle: + - basename: saturation-deployment + objectTemplatePath: {{$saturationDeploymentSpec}} + templateFillMap: + Replicas: {{$podsPerNamespace}} + Group: saturation + CpuRequest: 1m + MemoryRequest: 10M + +- name: Waiting for saturation pods to be running + measurements: + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collecting saturation pod measurements + measurements: + - Identifier: SaturationPodStartupLatency + Method: PodStartupLatency + Params: + action: gather + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: gather + enableViolations: {{$ENABLE_VIOLATIONS_FOR_SCHEDULING_THROUGHPUT}} + 
threshold: {{$SCHEDULER_THROUGHPUT_THRESHOLD}} + +- name: Starting latency pod measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = latency + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = latency + operationTimeout: 15m + +- name: Creating latency pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$latencyReplicas}} + tuningSet: Uniform5qps + objectBundle: + - basename: latency-deployment + objectTemplatePath: {{$latencyDeploymentSpec}} + templateFillMap: + Replicas: 1 + Group: latency + CpuRequest: {{$LATENCY_POD_CPU}}m + MemoryRequest: {{$LATENCY_POD_MEMORY}}M + +- name: Waiting for latency pods to be running + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Deleting latency pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Uniform5qps + objectBundle: + - basename: latency-deployment + objectTemplatePath: {{$latencyDeploymentSpec}} + +- name: Waiting for latency pods to be deleted + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collecting pod startup latency + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + +- name: Deleting saturation pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Uniform5qps + objectBundle: + - basename: saturation-deployment + objectTemplatePath: {{$saturationDeploymentSpec}} + +- name: Waiting for saturation pods to be deleted + measurements: + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collecting measurements + measurements: + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: true + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}} + {{if not $USE_SIMPLE_LATENCY_QUERY}} + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: gather + allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}} + {{end}} + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: gather + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: gather + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: gather + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} diff --git a/test/clusterloader2/testing/density/deployment.yaml b/test/clusterloader2/testing/density/deployment.yaml new file mode 100644 index 00000000000..bf02c5ac369 --- /dev/null +++ b/test/clusterloader2/testing/density/deployment.yaml @@ -0,0 +1,37 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + 
name: {{.Name}} + group: {{.Group}} + spec: + containers: + - image: registry.k8s.io/pause:3.9 + imagePullPolicy: IfNotPresent + name: {{.Name}} + ports: + resources: + requests: + cpu: {{.CpuRequest}} + memory: {{.MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/density/high-density-config.yaml b/test/clusterloader2/testing/density/high-density-config.yaml new file mode 100644 index 00000000000..eba8cb879d8 --- /dev/null +++ b/test/clusterloader2/testing/density/high-density-config.yaml @@ -0,0 +1,253 @@ +# TODO(https://github.com/kubernetes/perf-tests/issues/1007): Make it possible to run high density as part of the load test. +# ASSUMPTIONS: +# - Underlying cluster should have 100+ nodes. +# - Number of nodes should be divisible by NODES_PER_NAMESPACE (default 100). + +#Constants +{{$DENSITY_RESOURCE_CONSTRAINTS_FILE := DefaultParam .DENSITY_RESOURCE_CONSTRAINTS_FILE ""}} +# Cater for the case where the number of nodes is less than nodes per namespace. See https://github.com/kubernetes/perf-tests/issues/887 +{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .NODES_PER_NAMESPACE 100)}} +{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}} +{{$DENSITY_TEST_THROUGHPUT := DefaultParam .DENSITY_TEST_THROUGHPUT 20}} +{{$SCHEDULER_THROUGHPUT_THRESHOLD := DefaultParam .CL2_SCHEDULER_THROUGHPUT_THRESHOLD 0}} +# LATENCY_POD_MEMORY and LATENCY_POD_CPU are calculated for 1-core 4GB node. +# Increasing allocation of both memory and cpu by 10% +# decreases the value of priority function in scheduler by one point. +# This results in decreased probability of choosing the same node again. +{{$LATENCY_POD_CPU := DefaultParam .LATENCY_POD_CPU 100}} +{{$LATENCY_POD_MEMORY := DefaultParam .LATENCY_POD_MEMORY 350}} +{{$MIN_LATENCY_PODS := 500}} +{{$MIN_SATURATION_PODS_TIMEOUT := 180}} +{{$ENABLE_CHAOSMONKEY := DefaultParam .ENABLE_CHAOSMONKEY false}} +{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}} +{{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK false}} +{{$RESTART_COUNT_THRESHOLD_OVERRIDES:= DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}} +#Variables +{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}} +{{$podsPerNamespace := MultiplyInt $PODS_PER_NODE $NODES_PER_NAMESPACE}} +{{$totalPods := MultiplyInt $podsPerNamespace $namespaces}} +{{$latencyReplicas := DivideInt (MaxInt $MIN_LATENCY_PODS .Nodes) $namespaces}} +{{$totalLatencyPods := MultiplyInt $namespaces $latencyReplicas}} +{{$saturationDeploymentTimeout := DivideFloat $totalPods $DENSITY_TEST_THROUGHPUT | AddInt $MIN_SATURATION_PODS_TIMEOUT}} +# saturationDeploymentHardTimeout must be at least 20m to make sure that ~10m node +# failure won't fail the test. 
See https://github.com/kubernetes/kubernetes/issues/73461#issuecomment-467338711 +{{$saturationDeploymentHardTimeout := MaxInt $saturationDeploymentTimeout 1200}} + +{{$saturationDeploymentSpec := DefaultParam .SATURATION_DEPLOYMENT_SPEC "deployment.yaml"}} +{{$latencyDeploymentSpec := DefaultParam .LATENCY_DEPLOYMENT_SPEC "deployment.yaml"}} + +name: density +namespace: + number: {{$namespaces}} +tuningSets: +- name: Uniform5qps + qpsLoad: + qps: 5 +{{if $ENABLE_CHAOSMONKEY}} +chaosMonkey: + nodeFailure: + failureRate: 0.01 + interval: 1m + jitterFactor: 10.0 + simulatedDowntime: 10m +{{end}} +steps: +- name: Starting measurements + measurements: + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: start + # TODO(oxddr): figure out how many probers to run in function of cluster + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: start + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: start + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + resourceConstraints: {{$DENSITY_RESOURCE_CONSTRAINTS_FILE}} + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} + +- name: Starting saturation pod measurements + measurements: + - Identifier: SaturationPodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = saturation + threshold: {{$saturationDeploymentTimeout}}s + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = saturation + operationTimeout: {{$saturationDeploymentHardTimeout}}s + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: start + labelSelector: group = saturation + +- name: Creating saturation pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 1 + tuningSet: Uniform5qps + objectBundle: + - basename: saturation-deployment + objectTemplatePath: {{$saturationDeploymentSpec}} + templateFillMap: + Replicas: {{$podsPerNamespace}} + Group: saturation + CpuRequest: 1m + MemoryRequest: 10M + +- name: Collecting saturation pod measurements + measurements: + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- measurements: + - Identifier: SaturationPodStartupLatency + Method: PodStartupLatency + Params: + action: gather +- measurements: + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: gather + threshold: {{$SCHEDULER_THROUGHPUT_THRESHOLD}} + +- name: Starting latency pod measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = latency + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = latency + operationTimeout: 15m + +- name: Creating latency pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + 
replicasPerNamespace: {{$latencyReplicas}} + tuningSet: Uniform5qps + objectBundle: + - basename: latency-deployment + objectTemplatePath: {{$latencyDeploymentSpec}} + templateFillMap: + Replicas: 1 + Group: latency + CpuRequest: {{$LATENCY_POD_CPU}}m + MemoryRequest: {{$LATENCY_POD_MEMORY}}M + +- name: Waiting for latency pods to be running + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Deleting latency pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Uniform5qps + objectBundle: + - basename: latency-deployment + objectTemplatePath: {{$latencyDeploymentSpec}} + +- name: Waiting for latency pods to be deleted + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collecting pod startup latency + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + +- name: Deleting saturation pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: Uniform5qps + objectBundle: + - basename: saturation-deployment + objectTemplatePath: {{$saturationDeploymentSpec}} + +- name: Waiting for saturation pods to be deleted + measurements: + - Identifier: WaitForRunningSaturationDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- name: Collecting measurements + measurements: + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: true + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + {{if not $USE_SIMPLE_LATENCY_QUERY}} + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: gather + {{end}} + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: gather + - Identifier: DnsLookupLatency + Method: DnsLookupLatency + Params: + action: gather + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: gather + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} diff --git a/test/clusterloader2/testing/density/scheduler-suite.yaml b/test/clusterloader2/testing/density/scheduler-suite.yaml new file mode 100644 index 00000000000..d0733c27837 --- /dev/null +++ b/test/clusterloader2/testing/density/scheduler-suite.yaml @@ -0,0 +1,18 @@ +- identifier: vanilla + configPath: testing/density/config.yaml + overridePaths: [] + +- identifier: pod-affinity + configPath: testing/density/config.yaml + overridePaths: + - testing/density/scheduler/pod-affinity/overrides.yaml + +- identifier: pod-anti-affinity + configPath: testing/density/config.yaml + overridePaths: + - testing/density/scheduler/pod-anti-affinity/overrides.yaml + +- identifier: pod-topology-spread + configPath: testing/density/config.yaml + overridePaths: + - testing/density/scheduler/pod-topology-spread/overrides.yaml diff --git a/test/clusterloader2/testing/density/scheduler/custom-scheduler-metrics/config-custom-schd-sample.yaml b/test/clusterloader2/testing/density/scheduler/custom-scheduler-metrics/config-custom-schd-sample.yaml new file mode 100644 index 00000000000..7e731c62585 --- /dev/null +++ 
b/test/clusterloader2/testing/density/scheduler/custom-scheduler-metrics/config-custom-schd-sample.yaml @@ -0,0 +1,51 @@ +name: test + +namespace: + number: 1 + +tuningSets: +- name: Uniform1qps + qpsLoad: + qps: 1 + +steps: +- name: Start measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = test-pod-default + threshold: 180s + - Identifier: WaitForControlledPodsRunning + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = test-deployment + operationTimeout: 180s +- name: Create deployment + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 1 + tuningSet: Uniform1qps + objectBundle: + - basename: test-deployment + objectTemplatePath: "deployment-sample.yaml" + templateFillMap: + Replicas: 7 +- name: Wait for pods to be running + measurements: + - Identifier: WaitForControlledPodsRunning + Method: WaitForControlledPodsRunning + Params: + action: gather +- name: Measure pod startup latency + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather \ No newline at end of file diff --git a/test/clusterloader2/testing/density/scheduler/custom-scheduler-metrics/deployment-custom-schd-sample.yaml b/test/clusterloader2/testing/density/scheduler/custom-scheduler-metrics/deployment-custom-schd-sample.yaml new file mode 100644 index 00000000000..31b55d99842 --- /dev/null +++ b/test/clusterloader2/testing/density/scheduler/custom-scheduler-metrics/deployment-custom-schd-sample.yaml @@ -0,0 +1,20 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: test-deployment +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + group: test-pod-default + template: + metadata: + labels: + group: test-pod-default + spec: + schedulerName: default-scheduler + containers: + - image: registry.k8s.io/pause:3.9 + name: {{.Name}} diff --git a/test/clusterloader2/testing/density/scheduler/pod-affinity/deployment.yaml b/test/clusterloader2/testing/density/scheduler/pod-affinity/deployment.yaml new file mode 100644 index 00000000000..dc40475471f --- /dev/null +++ b/test/clusterloader2/testing/density/scheduler/pod-affinity/deployment.yaml @@ -0,0 +1,46 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} + spec: + affinity: + podAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + group: {{.Group}} + topologyKey: "kubernetes.io/hostname" + weight: 1 + containers: + - image: registry.k8s.io/pause:3.9 + imagePullPolicy: IfNotPresent + name: {{.Name}} + ports: + resources: + requests: + cpu: {{.CpuRequest}} + memory: {{.MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/density/scheduler/pod-affinity/overrides.yaml b/test/clusterloader2/testing/density/scheduler/pod-affinity/overrides.yaml new file mode 100644 index 00000000000..76a707ebce0 --- /dev/null +++ b/test/clusterloader2/testing/density/scheduler/pod-affinity/overrides.yaml @@ -0,0 +1,2 @@ +SATURATION_DEPLOYMENT_SPEC: scheduler/pod-affinity/deployment.yaml +LATENCY_DEPLOYMENT_SPEC: scheduler/pod-affinity/deployment.yaml diff --git a/test/clusterloader2/testing/density/scheduler/pod-anti-affinity/deployment.yaml b/test/clusterloader2/testing/density/scheduler/pod-anti-affinity/deployment.yaml new file mode 100644 index 00000000000..89f00eb8c88 --- /dev/null +++ b/test/clusterloader2/testing/density/scheduler/pod-anti-affinity/deployment.yaml @@ -0,0 +1,46 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + group: {{.Group}} + topologyKey: "kubernetes.io/hostname" + weight: 1 + containers: + - image: registry.k8s.io/pause:3.9 + imagePullPolicy: IfNotPresent + name: {{.Name}} + ports: + resources: + requests: + cpu: {{.CpuRequest}} + memory: {{.MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/density/scheduler/pod-anti-affinity/overrides.yaml b/test/clusterloader2/testing/density/scheduler/pod-anti-affinity/overrides.yaml new file mode 100644 index 00000000000..f4890161e77 --- /dev/null +++ b/test/clusterloader2/testing/density/scheduler/pod-anti-affinity/overrides.yaml @@ -0,0 +1,2 @@ +SATURATION_DEPLOYMENT_SPEC: scheduler/pod-anti-affinity/deployment.yaml +LATENCY_DEPLOYMENT_SPEC: scheduler/pod-anti-affinity/deployment.yaml diff --git a/test/clusterloader2/testing/density/scheduler/pod-topology-spread/deployment.yaml b/test/clusterloader2/testing/density/scheduler/pod-topology-spread/deployment.yaml new file mode 100644 index 00000000000..62b90f95d42 --- /dev/null +++ b/test/clusterloader2/testing/density/scheduler/pod-topology-spread/deployment.yaml @@ -0,0 +1,48 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} + spec: + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + # Cannot be DoNotSchedule because the there's no way to differentiate + # the master node from a hollow node; as a result, the global minimum + # matching number will always be zero (since pods cannot be scheduled + # on to the master). 
+ whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + group: {{.Group}} + containers: + - image: registry.k8s.io/pause:3.9 + imagePullPolicy: IfNotPresent + name: {{.Name}} + ports: + resources: + requests: + cpu: {{.CpuRequest}} + memory: {{.MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/density/scheduler/pod-topology-spread/overrides.yaml b/test/clusterloader2/testing/density/scheduler/pod-topology-spread/overrides.yaml new file mode 100644 index 00000000000..d3577cc1fc6 --- /dev/null +++ b/test/clusterloader2/testing/density/scheduler/pod-topology-spread/overrides.yaml @@ -0,0 +1,2 @@ +SATURATION_DEPLOYMENT_SPEC: scheduler/pod-topology-spread/deployment.yaml +LATENCY_DEPLOYMENT_SPEC: scheduler/pod-topology-spread/deployment.yaml diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/README.md b/test/clusterloader2/testing/experimental/storage/pod-startup/README.md new file mode 100644 index 00000000000..12e304ff7ca --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/README.md @@ -0,0 +1,26 @@ +**Test Cases** + +- 1 pod with X volumes and 1 node + - run on a single node cluster + - use override file in `max_volumes_per_node` + - tries to stress the max number of volumes a single node can have +- X pods with 1 volume each on 1 node in parallel + - run on a single node cluster + - use override file in `max_volumes_per_pod` + - tries to stress the max number of volumes a single pod can have +- X pods with 1 volume each on cluster in parallel + - run on a cluster with a number of nodes that is a multiple of `NODES_PER_NAMESPACE` (default 100) + - use override file in `cluster_load_scale_by_nodes` +- X PVs, no pods + - use override file in `volume_binding` and `volume-types/persistentvolume` + - measures how long it takes to create, bind and delete volumes +- X PVs, no pods, create volumes directly + - use override files `volume-types/persistentvolume` and `volume_creation` (in this order!) + - set PROVISIONER in a custom override file to the name of the external provisioner + - measures how long it takes to create and delete volumes; only deletion involves + the PV controller + +**Volume Type** + +Each test must use a type of volume. Use an override in `volume-types` to set +a specific type of volume to test. diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/cluster_load_scale_by_nodes/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/cluster_load_scale_by_nodes/override.yaml new file mode 100644 index 00000000000..7be92cd8782 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/cluster_load_scale_by_nodes/override.yaml @@ -0,0 +1,2 @@ +PODS_PER_NODE: 10 +VOLUMES_PER_POD: 1 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/config.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/config.yaml new file mode 100644 index 00000000000..adb64d07c5e --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/config.yaml @@ -0,0 +1,209 @@ +# ASSUMPTIONS: +# - Number of nodes should be divisible by NODES_PER_NAMESPACE (default 100).
+# - If using Persistent Volumes, the default storage class must have volumeBindingMode: Immediate + +# Cluster Variables + # Cater for the case where the number of nodes is less than nodes per namespace. See https://github.com/kubernetes/perf-tests/issues/887 + {{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .NODES_PER_NAMESPACE 100)}} + +# Test Variables + {{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 1}} + {{$DEPLOYMENT_TEMPLATE_PATH := .DEPLOYMENT_TEMPLATE_PATH }} + {{$VOLUMES_PER_POD := DefaultParam .VOLUMES_PER_POD 1}} + {{$VOLUME_TEMPLATE_PATH := .VOLUME_TEMPLATE_PATH}} + # Set this to false if metrics data is not needed. + {{$GATHER_METRICS := DefaultParam .GATHER_METRICS true}} + {{$START_PODS := DefaultParam .START_PODS true}} + {{$PROVISION_VOLUME := DefaultParam .PROVISION_VOLUME false}} + {{$STORAGE_CLASS := DefaultParam .STORAGE_CLASS "csi-gce-pd"}} + # Shortcut for provisioning volumes with some external provisioner. Without it, + # the PV controller must update each PVC before the external provisioner + # starts to work on it, which limits the volume creation rate. + # With it, volume creation starts immediately. This is best + # used together with WAIT_FOR_PVS_CREATED=true and + # WAIT_FOR_PVS_BOUND=false because then the test does not need to wait for the + # PV controller during volume creation at all. + {{$PROVISIONER := DefaultParam .PROVISIONER ""}} + # When checking PVs, the test by default considers *all* PVs in the cluster, + # whether they were created by the test or not. In other words, the test + # only works in clusters with no pre-existing PVs. To make it a bit more flexible, + # the value of the expected provisioner can be set here and then only + # PVs with that annotation will be counted. + {{$EXPECTED_PROVISIONER := DefaultParam .EXPECTED_PROVISIONER ""}} + {{$VOL_SIZE := DefaultParam .VOL_SIZE "8Gi"}} + {{$WAIT_FOR_PVS_CREATED := DefaultParam .WAIT_FOR_PVS_CREATED false}} + {{$WAIT_FOR_PVS_BOUND := DefaultParam .WAIT_FOR_PVS_BOUND false}} + {{$WAIT_FOR_PVS_DELETED := DefaultParam .WAIT_FOR_PVS_DELETED false}} + {{$POD_THROUGHPUT := DefaultParam .POD_THROUGHPUT 10}} + # TODO(hantaowang): remove knob after deciding on right values + {{$POD_STARTUP_TIMEOUT := DefaultParam .POD_STARTUP_TIMEOUT "15m"}} + {{$POD_STARTUP_SLO := DefaultParam .POD_STARTUP_SLO 300}} + +# Computed Variables + {{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE | MaxInt 1}} + {{$podsPerNamespace := MultiplyInt $NODES_PER_NAMESPACE $PODS_PER_NODE}} + {{$volumesPerNamespace := MultiplyInt $podsPerNamespace $VOLUMES_PER_POD}} + {{$totalVols := MultiplyInt $volumesPerNamespace $namespaces}} + {{$guessedStepTime := MultiplyInt $totalVols 2 | MaxInt 60}} + +# This is a very conservative estimate of 2 seconds per volume.
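+# For illustration, with the defaults above (1 pod per node, 1 volume per pod,
+# 100 nodes per namespace), a 100-node cluster creates 100 volumes in total,
+# so the guessed step time is max(100 * 2, 60) = 200 seconds unless
+# STEP_TIME_SECONDS is set explicitly.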
+ {{$StepTimeSeconds := DefaultParam .STEP_TIME_SECONDS $guessedStepTime}} + + +name: storage +namespace: + number: {{$namespaces}} +tuningSets: +- name: UniformQPS + qpsLoad: + qps: {{$POD_THROUGHPUT}} +steps: +{{ if $GATHER_METRICS }} +# Start measurements +- name: Starting measurement for the entire test + measurements: + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: PodWithVolumesStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = volume-test + threshold: {{$POD_STARTUP_SLO}}s +{{ end }} +{{ if $PROVISION_VOLUME }} +# Provision volumes +- name: Provisioning volumes + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$volumesPerNamespace}} + tuningSet: UniformQPS + objectBundle: + - basename: vol + objectTemplatePath: {{$VOLUME_TEMPLATE_PATH}} + templateFillMap: + Group: volume-test + VolSize: {{$VOL_SIZE}} + StorageClass: {{$STORAGE_CLASS}} + Provisioner: {{$PROVISIONER}} +{{ end }} +{{if $WAIT_FOR_PVS_CREATED }} +- name: Waiting for volume creation + measurements: + - Identifier: WaitForPVsToBeCreated + Method: WaitForAvailablePVs + Params: + desiredPVCount: {{$totalVols}} + apiVersion: v1 + provisioner: {{$EXPECTED_PROVISIONER}} + timeout: {{$StepTimeSeconds}}s +{{ end }} +{{ if $WAIT_FOR_PVS_BOUND }} +- name: Waiting for PVs to be bound + measurements: + - Identifier: WaitForPVCsToBeBound + Method: WaitForBoundPVCs + Params: + desiredPVCCount: {{$totalVols}} + apiVersion: v1 + labelSelector: group = volume-test + timeout: {{$StepTimeSeconds}}s +{{ end }} +{{ if $START_PODS }} +- name: Starting measurement for waiting for deployments + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = volume-test + operationTimeout: {{$POD_STARTUP_TIMEOUT}} +# Create deployments +- name: Creating deployments + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$podsPerNamespace}} + tuningSet: UniformQPS + objectBundle: + - basename: deployment + objectTemplatePath: {{$DEPLOYMENT_TEMPLATE_PATH}} + templateFillMap: + Group: volume-test + VolumesPerPod: {{$VOLUMES_PER_POD}} + VolSize: {{$VOL_SIZE}} + StorageClass: {{$STORAGE_CLASS}} + Provisioner: {{$PROVISIONER}} +- name: Waiting for deployments to be running + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +# Delete deployments +- name: Deleting deployments + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: UniformQPS + objectBundle: + - basename: deployment + objectTemplatePath: {{$DEPLOYMENT_TEMPLATE_PATH}} +{{ end }} +{{ if $PROVISION_VOLUME }} +# Delete volumes +- name: Deleting volumes + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: UniformQPS + objectBundle: + - basename: vol + objectTemplatePath: {{$VOLUME_TEMPLATE_PATH}} +{{ end }} +{{if $WAIT_FOR_PVS_DELETED }} +- name: Waiting for volume deletion + measurements: + - Identifier: WaitForPVsToBeDeleted + Method: WaitForAvailablePVs + Params: + desiredPVCount: 0 + apiVersion: v1 + provisioner: {{$EXPECTED_PROVISIONER}} + timeout: {{$StepTimeSeconds}}s +{{ end }} +{{ if $GATHER_METRICS }} 
+# Collect measurements +- name: Gather test measurements + measurements: + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: true + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: gather + - Identifier: PodWithVolumesStartupLatency + Method: PodStartupLatency + Params: + action: gather +{{ end }} diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/max_volumes_per_node/25_pods_per_node/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/max_volumes_per_node/25_pods_per_node/override.yaml new file mode 100644 index 00000000000..91b815311b1 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/max_volumes_per_node/25_pods_per_node/override.yaml @@ -0,0 +1,3 @@ +PODS_PER_NODE: 25 +VOLUMES_PER_POD: 1 +NODES_PER_NAMESPACE: 1 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/max_volumes_per_node/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/max_volumes_per_node/override.yaml new file mode 100644 index 00000000000..151ba209eba --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/max_volumes_per_node/override.yaml @@ -0,0 +1,3 @@ +PODS_PER_NODE: 100 +VOLUMES_PER_POD: 1 +NODES_PER_NAMESPACE: 1 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/max_volumes_per_pod/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/max_volumes_per_pod/override.yaml new file mode 100644 index 00000000000..be42ee471ee --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/max_volumes_per_pod/override.yaml @@ -0,0 +1,3 @@ +PODS_PER_NODE: 1 +VOLUMES_PER_POD: 100 +NODES_PER_NAMESPACE: 1 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/suite.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/suite.yaml new file mode 100644 index 00000000000..7cdd60a8352 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/suite.yaml @@ -0,0 +1,59 @@ +- identifier: emptydir-vol-per-pod + configPath: testing/experimental/storage/pod-startup/config.yaml + overridePaths: + - testing/experimental/storage/pod-startup/volume-types/emptydir/override.yaml + - testing/experimental/storage/pod-startup/max_volumes_per_pod/override.yaml + +- identifier: configmap-vol-per-pod + configPath: testing/experimental/storage/pod-startup/config.yaml + overridePaths: + - testing/experimental/storage/pod-startup/volume-types/configmap/override.yaml + - testing/experimental/storage/pod-startup/max_volumes_per_pod/override.yaml + +- identifier: downwardapi-vol-per-pod + configPath: testing/experimental/storage/pod-startup/config.yaml + overridePaths: + - testing/experimental/storage/pod-startup/volume-types/downwardapi/override.yaml + - testing/experimental/storage/pod-startup/max_volumes_per_pod/override.yaml + +- identifier: secret-vol-per-pod + configPath: testing/experimental/storage/pod-startup/config.yaml + overridePaths: + - testing/experimental/storage/pod-startup/volume-types/secret/override.yaml + - testing/experimental/storage/pod-startup/max_volumes_per_pod/override.yaml + +#- identifier: persistent-vol-per-pod +# configPath: testing/experimental/storage/pod-startup/config.yaml +# overridePaths: +# - 
testing/experimental/storage/pod-startup/volume-types/persistentvolume/override.yaml +# - testing/experimental/storage/pod-startup/max_volumes_per_pod/override.yaml + +- identifier: emptydir-vol-per-node + configPath: testing/experimental/storage/pod-startup/config.yaml + overridePaths: + - testing/experimental/storage/pod-startup/volume-types/emptydir/override.yaml + - testing/experimental/storage/pod-startup/max_volumes_per_node/override.yaml + +- identifier: configmap-vol-per-node + configPath: testing/experimental/storage/pod-startup/config.yaml + overridePaths: + - testing/experimental/storage/pod-startup/volume-types/configmap/override.yaml + - testing/experimental/storage/pod-startup/max_volumes_per_node/override.yaml + +- identifier: downwardapi-vol-per-node + configPath: testing/experimental/storage/pod-startup/config.yaml + overridePaths: + - testing/experimental/storage/pod-startup/volume-types/downwardapi/override.yaml + - testing/experimental/storage/pod-startup/max_volumes_per_node/override.yaml + +- identifier: secret-vol-per-node + configPath: testing/experimental/storage/pod-startup/config.yaml + overridePaths: + - testing/experimental/storage/pod-startup/volume-types/secret/override.yaml + - testing/experimental/storage/pod-startup/max_volumes_per_node/override.yaml + +#- identifier: persistent-vol-per-node +# configPath: testing/experimental/storage/pod-startup/config.yaml +# overridePaths: +# - testing/experimental/storage/pod-startup/volume-types/persistentvolume/override.yaml +# - testing/experimental/storage/pod-startup/max_volumes_per_node/override.yaml diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/test-result/volume_podstartup_time_results.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/test-result/volume_podstartup_time_results.yaml new file mode 100644 index 00000000000..d3275c7c9bd --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/test-result/volume_podstartup_time_results.yaml @@ -0,0 +1,175 @@ +pod-with-volumes-startup-latency-scenarios: +- volume-type: configmap + test-scenarios: + - test-artifacts: https://gcsweb.k8s.io/gcs/kubernetes-jenkins/logs/ci-kubernetes-storage-scalability-max-configmap-vol-per-pod/ + test-cases: + - number-of-volumes-per-pod: 100 + number-of-pods-per-node: 1 + number-of-node: 1 + kube-api-qps-limit: 5 + qps-pod-throughput: 100 + pod-startup: + perc50: 22240.349315 + perc90: 22240.349315 + perc99: 22240.349315 + - number-of-volumes-per-pod: 100 + number-of-pods-per-node: 1 + number-of-node: 1 + kube-api-qps-limit: 100 + qps-pod-throughput: 10 + pod-startup: + perc50: 2205.065107 + perc90: 2205.065107 + perc99: 2205.065107 + - test-artifacts: https://gcsweb.k8s.io/gcs/kubernetes-jenkins/logs/ci-kubernetes-storage-scalability-max-configmap-vol-per-node/ + test-cases: + - number-of-volumes-per-pod: 1 + number-of-pods-per-node: 100 + number-of-node: 1 + kube-api-qps-limit: 5 + qps-pod-throughput: 100 + pod-startup: + perc50: 74109.157201 + perc90: 89109.291651 + perc99: 93112.022213 + - number-of-volumes-per-pod: 1 + number-of-pods-per-node: 100 + number-of-node: 1 + kube-api-qps-limit: 100 + qps-pod-throughput: 10 + pod-startup: + perc50: 27144.21273 + perc90: 39171.178912 + perc99: 41234.220941 +- volume-type: downwardapi + test-scenarios: + - test-artifacts: https://gcsweb.k8s.io/gcs/kubernetes-jenkins/logs/ci-kubernetes-storage-scalability-max-downwardapi-vol-per-pod/ + test-cases: + - number-of-volumes-per-pod: 100 + number-of-pods-per-node: 1 + 
number-of-node: 1 + kube-api-qps-limit: 100 + qps-pod-throughput: 10 + pod-startup: + perc50: 2926.402409 + perc90: 2926.402409 + perc99: 2926.402409 + - test-artifacts: https://gcsweb.k8s.io/gcs/kubernetes-jenkins/logs/ci-kubernetes-storage-scalability-max-downwardapi-vol-per-node + test-cases: + - number-of-volumes-per-pod: 1 + number-of-pods-per-node: 100 + number-of-node: 1 + kube-api-qps-limit: 5 + qps-pod-throughput: 100 + pod-startup: + perc50: 62488.608972 + perc90: 78490.525426 + perc99: 82090.972003 + - number-of-volumes-per-pod: 1 + number-of-pods-per-node: 100 + number-of-node: 1 + kube-api-qps-limit: 100 + qps-pod-throughput: 10 + pod-startup: + perc50: 30039.645985 + perc90: 46560.1259 + perc99: 48975.619831 +- volume-type: emptydir + test-scenarios: + - test-artifacts: https://gcsweb.k8s.io/gcs/kubernetes-jenkins/logs/ci-kubernetes-storage-scalability-max-emptydir-vol-per-pod/ + test-cases: + - number-of-volumes-per-pod: 100 + number-of-pods-per-node: 1 + number-of-node: 1 + kube-api-qps-limit: 100 + qps-pod-throughput: 10 + pod-startup: + perc50: 2379.871531 + perc90: 2379.871531 + perc99: 2379.871531 + - test-artifacts: https://gcsweb.k8s.io/gcs/kubernetes-jenkins/logs/ci-kubernetes-storage-scalability-max-emptydir-vol-per-node/ + test-cases: + - number-of-volumes-per-pod: 1 + number-of-pods-per-node: 100 + number-of-node: 1 + kube-api-qps-limit: 5 + qps-pod-throughput: 100 + pod-startup: + perc50: 62845.550695 + perc90: 78844.040038 + perc99: 82445.521004 + - number-of-volumes-per-pod: 1 + number-of-pods-per-node: 100 + number-of-node: 1 + kube-api-qps-limit: 100 + qps-pod-throughput: 10 + pod-startup: + perc50: 40675.398759, + perc90: 43163.652082, + perc99: 45556.746457 +- volume-type: persistentvolume + test-scenarios: + - test-artifacts: https://gcsweb.k8s.io/gcs/kubernetes-jenkins/logs/ci-kubernetes-storage-scalability-max-persistent-vol-per-pod/ + test-cases: + - number-of-volumes-per-pod: 100 + number-of-pods-per-node: 1 + number-of-node: 1 + kube-api-qps-limit: 100 + qps-pod-throughput: 10 + pod-startup: + perc50: 505217.599907 + perc90: 505217.599907 + perc99: 505217.599907 + - test-artifacts: https://gcsweb.k8s.io/gcs/kubernetes-jenkins/logs/ci-kubernetes-storage-scalability-max-persistent-vol-per-node/ + test-cases: + - number-of-volumes-per-pod: 1 + number-of-pods-per-node: 100 + number-of-node: 1 + kube-api-qps-limit: 100 + qps-pod-throughput: 10 + pod-startup: + perc50: 285279.686245 + perc90: 397544.539025 + perc99: 499895.606574 +- volume-type: secret + test-scenarios: + - test-artifacts: https://gcsweb.k8s.io/gcs/kubernetes-jenkins/logs/ci-kubernetes-storage-scalability-max-secret-vol-per-pod/ + test-cases: + - number-of-volumes-per-pod: 100 + number-of-pods-per-node: 1 + number-of-node: 1 + kube-api-qps-limit: 5 + qps-pod-throughput: 100 + pod-startup: + perc50: 23779.322997 + perc90: 23779.322997 + perc99: 23779.322997 + - number-of-volumes-per-pod: 100 + number-of-pods-per-node: 1 + number-of-node: 1 + kube-api-qps-limit: 100 + qps-pod-throughput: 10 + pod-startup: + perc50: 2668.958672 + perc90: 2668.958672 + perc99: 2668.958672 + - test-artifacts: https://gcsweb.k8s.io/gcs/kubernetes-jenkins/logs/ci-kubernetes-storage-scalability-max-secret-vol-per-node/ + test-cases: + - number-of-volumes-per-pod: 1 + number-of-pods-per-node: 100 + number-of-node: 1 + kube-api-qps-limit: 5 + qps-pod-throughput: 100 + pod-startup: + perc50: 72712.815797 + perc90: 86111.974209 + perc99: 91911.36158 + - number-of-volumes-per-pod: 1 + number-of-pods-per-node: 100 + 
number-of-node: 1 + kube-api-qps-limit: 100 + qps-pod-throughput: 10 + pod-startup: + perc50: 28949.459883 + perc90: 43166.320477 + perc99: 45558.275832 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/configmap/configmap.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/configmap/configmap.yaml new file mode 100644 index 00000000000..970ae9c7fdb --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/configmap/configmap.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{.Name}} + labels: + app: {{.Name}} + group: {{.Group}} +data: + TEST_KEY1: testData1 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/configmap/deployment_with_configmap.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/configmap/deployment_with_configmap.yaml new file mode 100644 index 00000000000..bb8cf0eafe2 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/configmap/deployment_with_configmap.yaml @@ -0,0 +1,45 @@ +{{$index := .Index}} +{{$volumesPerPod := .VolumesPerPod}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: 1 + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: {{.Group}} + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: registry.k8s.io/pause:3.9 + imagePullPolicy: IfNotPresent + volumeMounts: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + mountPath: /usr/share/{{$volumeIndex}} + {{ end }} + volumes: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + configMap: + name: vol-{{AddInt $volumeIndex (MultiplyInt $index $volumesPerPod)}} + {{ end }} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
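As an aside on the indexing in the deployment template above: each pod mounts `VolumesPerPod` volumes, and the referenced ConfigMap name is computed as `volumeIndex + Index * VolumesPerPod`, which maps every (deployment, volume) pair to a distinct ConfigMap name. A sketch of the rendered `volumes` section, assuming `Index=2` and `VolumesPerPod=3` (illustrative values, not defaults from this change):

```yaml
# Illustrative rendering for the deployment with Index=2 when VolumesPerPod=3.
volumes:
- name: vol-0
  configMap:
    name: vol-6   # 0 + 2*3
- name: vol-1
  configMap:
    name: vol-7   # 1 + 2*3
- name: vol-2
  configMap:
    name: vol-8   # 2 + 2*3
```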
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/configmap/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/configmap/override.yaml new file mode 100644 index 00000000000..92261d6c204 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/configmap/override.yaml @@ -0,0 +1,3 @@ +PROVISION_VOLUME: true +DEPLOYMENT_TEMPLATE_PATH: "volume-types/configmap/deployment_with_configmap.yaml" +VOLUME_TEMPLATE_PATH: "volume-types/configmap/configmap.yaml" diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/downwardapi/deployment_with_downwardapi.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/downwardapi/deployment_with_downwardapi.yaml new file mode 100644 index 00000000000..611731bbdc5 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/downwardapi/deployment_with_downwardapi.yaml @@ -0,0 +1,51 @@ +{{$index := .Index}} +{{$volumesPerPod := .VolumesPerPod}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: 1 + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: {{.Group}} + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: registry.k8s.io/pause:3.9 + imagePullPolicy: IfNotPresent + volumeMounts: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + mountPath: /usr/share/{{$volumeIndex}} + {{ end }} + volumes: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + downwardAPI: + items: + - path: "pod_name" + fieldRef: + fieldPath: metadata.name + - path: "pod_namespace" + fieldRef: + fieldPath: metadata.namespace + {{ end }} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/downwardapi/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/downwardapi/override.yaml new file mode 100644 index 00000000000..9fcc114be54 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/downwardapi/override.yaml @@ -0,0 +1 @@ +DEPLOYMENT_TEMPLATE_PATH: "volume-types/downwardapi/deployment_with_downwardapi.yaml" diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/emptydir/deployment_with_emptydir.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/emptydir/deployment_with_emptydir.yaml new file mode 100644 index 00000000000..52645fb36a2 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/emptydir/deployment_with_emptydir.yaml @@ -0,0 +1,44 @@ +{{$index := .Index}} +{{$volumesPerPod := .VolumesPerPod}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: 1 + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: {{.Group}} + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: registry.k8s.io/pause:3.9 + imagePullPolicy: IfNotPresent + volumeMounts: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + mountPath: /usr/share/{{$volumeIndex}} + {{ end }} + volumes: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + emptyDir: {} + {{ end }} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/emptydir/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/emptydir/override.yaml new file mode 100644 index 00000000000..93060b8757a --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/emptydir/override.yaml @@ -0,0 +1 @@ +DEPLOYMENT_TEMPLATE_PATH: "volume-types/emptydir/deployment_with_emptydir.yaml" diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/genericephemeralinline/deployment_with_inline.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/genericephemeralinline/deployment_with_inline.yaml new file mode 100644 index 00000000000..cda5dc8183d --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/genericephemeralinline/deployment_with_inline.yaml @@ -0,0 +1,67 @@ +{{$index := .Index}} +{{$volumesPerPod := .VolumesPerPod}} +{{$name := .Name}} +{{$group := .Group}} +{{$provisioner := .Provisioner}} +{{$storageclass := .StorageClass}} +{{$volsize := .VolSize}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: 1 + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: {{.Group}} + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: registry.k8s.io/pause:3.9 + imagePullPolicy: IfNotPresent + volumeMounts: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + mountPath: /usr/share/{{$volumeIndex}} + {{ end }} + volumes: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + ephemeral: + volumeClaimTemplate: + metadata: + labels: + app: {{$name}} + group: {{$group}} + {{ if $provisioner }} + annotations: + volume.beta.kubernetes.io/storage-provisioner: {{$provisioner}} + {{ end }} + spec: + accessModes: + - ReadWriteOnce + {{ if $storageclass }} + storageClassName: {{$storageclass}} + {{ end }} + resources: + requests: + storage: {{$volsize}} + {{ end }} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
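A note on the generic ephemeral volumes above: the PVCs are created by Kubernetes itself, one per pod and volume, and named `<pod name>-<volume name>`, which is consistent with the matching override that follows disabling CL2-side volume provisioning and the waits for PVs to be bound or deleted. A minimal sketch of such an auto-created claim (the pod name is hypothetical):

```yaml
# PVC generated automatically for volume vol-0 of a pod named "deployment-0-abc12" (hypothetical).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: deployment-0-abc12-vol-0
  # spec is taken from the volumeClaimTemplate embedded in the deployment above
```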
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/genericephemeralinline/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/genericephemeralinline/override.yaml new file mode 100644 index 00000000000..636128acc30 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/genericephemeralinline/override.yaml @@ -0,0 +1,4 @@ +PROVISION_VOLUME: false +WAIT_FOR_PVS_BOUND: false +WAIT_FOR_PVS_DELETED: false +DEPLOYMENT_TEMPLATE_PATH: "volume-types/genericephemeralinline/deployment_with_inline.yaml" diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/deployment_with_pvc.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/deployment_with_pvc.yaml new file mode 100644 index 00000000000..f4bb623b08e --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/deployment_with_pvc.yaml @@ -0,0 +1,46 @@ +{{$index := .Index}} +{{$volumesPerPod := .VolumesPerPod}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: 1 + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: {{.Group}} + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: registry.k8s.io/pause:3.9 + imagePullPolicy: IfNotPresent + volumeMounts: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + mountPath: /usr/share/{{$volumeIndex}} + {{ end }} + volumes: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + persistentVolumeClaim: + claimName: vol-{{AddInt $volumeIndex (MultiplyInt $index $volumesPerPod)}} + readOnly: false + {{ end }} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
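The claims referenced by `claimName` above come from the separate pvc.yaml template further down, wired in via the persistentvolume override's `VOLUME_TEMPLATE_PATH`. Rendered for the first claim with illustrative inputs — the storage class and size below are assumptions, not values defined in this change — it would look roughly like:

```yaml
# Illustrative rendering of pvc.yaml for Name=vol-0; StorageClass and VolSize are assumed inputs.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: vol-0
  labels:
    app: vol-0
    group: volume-test   # the Group value is supplied by the test config (assumed here)
spec:
  accessModes:
  - ReadWriteOnce
  storageClassName: standard   # assumed
  resources:
    requests:
      storage: 1Gi             # assumed
```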
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/override.yaml new file mode 100644 index 00000000000..8d77a5f3527 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/override.yaml @@ -0,0 +1,5 @@ +PROVISION_VOLUME: true +WAIT_FOR_PVS_BOUND: true +WAIT_FOR_PVS_DELETED: true +DEPLOYMENT_TEMPLATE_PATH: "volume-types/persistentvolume/deployment_with_pvc.yaml" +VOLUME_TEMPLATE_PATH: "volume-types/persistentvolume/pvc.yaml" diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/pvc.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/pvc.yaml new file mode 100644 index 00000000000..136130547b7 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/pvc.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{.Name}} + labels: + app: {{.Name}} + group: {{.Group}} +{{ if .Provisioner }} + annotations: + volume.beta.kubernetes.io/storage-provisioner: {{.Provisioner}} +{{ end }} +spec: + accessModes: + - ReadWriteOnce + {{ if .StorageClass }} + storageClassName: {{.StorageClass}} + {{ end }} + resources: + requests: + storage: {{.VolSize}} diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/usecsi.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/usecsi.yaml new file mode 100644 index 00000000000..a4e96e94ec5 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/persistentvolume/usecsi.yaml @@ -0,0 +1 @@ +USE_CSI: true diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/secret/deployment_with_secret.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/secret/deployment_with_secret.yaml new file mode 100644 index 00000000000..345242958a8 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/secret/deployment_with_secret.yaml @@ -0,0 +1,45 @@ +{{$index := .Index}} +{{$volumesPerPod := .VolumesPerPod}} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: 1 + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: {{.Group}} + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: registry.k8s.io/pause:3.9 + imagePullPolicy: IfNotPresent + volumeMounts: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + mountPath: /usr/share/{{$volumeIndex}} + {{ end }} + volumes: + {{ range $volumeIndex := Loop .VolumesPerPod }} + - name: vol-{{$volumeIndex}} + secret: + secretName: vol-{{AddInt $volumeIndex (MultiplyInt $index $volumesPerPod)}} + {{ end }} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/secret/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/secret/override.yaml new file mode 100644 index 00000000000..debe08f10d6 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/secret/override.yaml @@ -0,0 +1,3 @@ +PROVISION_VOLUME: true +DEPLOYMENT_TEMPLATE_PATH: "volume-types/secret/deployment_with_secret.yaml" +VOLUME_TEMPLATE_PATH: "volume-types/secret/secret.yaml" diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/secret/secret.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/secret/secret.yaml new file mode 100644 index 00000000000..c86f74b3d5a --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume-types/secret/secret.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{.Name}} + labels: + app: {{.Name}} + group: {{.Group}} +type: Opaque +data: + username: dXNlcm5hbWVfCg== + password: cGFzc3dvcmRfCg== diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume_binding/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume_binding/override.yaml new file mode 100644 index 00000000000..afc6fd604fa --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume_binding/override.yaml @@ -0,0 +1,7 @@ +# Total number of volumes will be one per node. +PODS_PER_NODE: 1 +VOLUMES_PER_POD: 1 +NODES_PER_NAMESPACE: 1 + +# Disable pod creation. +START_PODS: false diff --git a/test/clusterloader2/testing/experimental/storage/pod-startup/volume_creation/override.yaml b/test/clusterloader2/testing/experimental/storage/pod-startup/volume_creation/override.yaml new file mode 100644 index 00000000000..4921014c261 --- /dev/null +++ b/test/clusterloader2/testing/experimental/storage/pod-startup/volume_creation/override.yaml @@ -0,0 +1,11 @@ +# Total number of volumes will be one per node. +PODS_PER_NODE: 1 +VOLUMES_PER_POD: 1 +NODES_PER_NAMESPACE: 1 + +# Disable pod creation. +START_PODS: false + +# Wait for creation instead of volume binding. +WAIT_FOR_PVS_CREATED: true +WAIT_FOR_PVS_BOUND: false diff --git a/test/clusterloader2/testing/experiments/enable_restart_count_check.yaml b/test/clusterloader2/testing/experiments/enable_restart_count_check.yaml new file mode 100644 index 00000000000..353ca87bfa8 --- /dev/null +++ b/test/clusterloader2/testing/experiments/enable_restart_count_check.yaml @@ -0,0 +1 @@ +ENABLE_RESTART_COUNT_CHECK: true diff --git a/test/clusterloader2/testing/experiments/ignore_known_gce_container_restarts.yaml b/test/clusterloader2/testing/experiments/ignore_known_gce_container_restarts.yaml new file mode 100644 index 00000000000..d1fcc91b852 --- /dev/null +++ b/test/clusterloader2/testing/experiments/ignore_known_gce_container_restarts.yaml @@ -0,0 +1,19 @@ +RESTART_COUNT_THRESHOLD_OVERRIDES: | + # To be investigated in https://github.com/kubernetes/perf-tests/issues/872. + fluentd-gcp: 999 + + # Main purpose of this check is detection crashlooping pods. + # It was extended to check whether master pods were restarted even once. 
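These experiment files are small override snippets: `RESTART_COUNT_THRESHOLD_OVERRIDES` is a multi-line string mapping container names to the number of restarts tolerated by the TestMetrics measurement (see `restartCountThresholdOverrides` in the measurement modules elsewhere in this change). A custom allowance can be expressed in the same format; the container name below is hypothetical:

```yaml
# Hypothetical extra override file, not part of this change.
RESTART_COUNT_THRESHOLD_OVERRIDES: |
  my-sidecar-container: 2
```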
+ # For components that we run multiple instances of we should be less aggressive and tolerate restarts e.g. due to node restarts. + kube-proxy: 2 + metadata-proxy: 2 + prometheus-to-sd-exporter: 2 + coredns: 2 + konnectivity-agent: 2 + node-problem-detector: 2 + + # Allow for a single restart of master components. + # As long as tests are passing, a single restart shouldn't be a problem + kube-scheduler: 1 + kube-controller-manager: 1 + l7-lb-controller: 1 diff --git a/test/clusterloader2/testing/experiments/ignore_known_kubemark_container_restarts.yaml b/test/clusterloader2/testing/experiments/ignore_known_kubemark_container_restarts.yaml new file mode 100644 index 00000000000..d7f7dbe9359 --- /dev/null +++ b/test/clusterloader2/testing/experiments/ignore_known_kubemark_container_restarts.yaml @@ -0,0 +1,7 @@ +RESTART_COUNT_THRESHOLD_OVERRIDES: | + # To be fixed by https://github.com/kubernetes/perf-tests/issues/871. + kubernetes-dashboard: 999 + + # To be fixed by https://github.com/kubernetes/perf-tests/issues/874. + kube-scheduler: 5 + kube-controller-manager: 5 diff --git a/test/clusterloader2/testing/experiments/use_simple_latency_query.yaml b/test/clusterloader2/testing/experiments/use_simple_latency_query.yaml new file mode 100644 index 00000000000..df8a14fb44e --- /dev/null +++ b/test/clusterloader2/testing/experiments/use_simple_latency_query.yaml @@ -0,0 +1 @@ +USE_SIMPLE_LATENCY_QUERY: true diff --git a/test/clusterloader2/testing/huge-service/config.yaml b/test/clusterloader2/testing/huge-service/config.yaml new file mode 100644 index 00000000000..01d6af1473e --- /dev/null +++ b/test/clusterloader2/testing/huge-service/config.yaml @@ -0,0 +1,36 @@ +# Huge service test config +{{$HUGE_SERVICE_HEADLESS := DefaultParam .CL2_HUGE_SERVICE_HEADLESS false}} +{{$HUGE_SERVICE_ENDPOINTS := DefaultParam .CL2_HUGE_SERVICE_ENDPOINTS 1000}} +{{$STATEFULSET_ENDPOINTS := DefaultParam .CL2_STATEFULSET_ENDPOINTS 100}} +{{$DELETE_NAMESPACE_TIMEOUT := DefaultParam .CL2_DELETE_NAMESPACE_TIMEOUT "10m"}} + +name: huge-service +namespace: + number: 1 + deleteNamespaceTimeout: {{$DELETE_NAMESPACE_TIMEOUT}} +tuningSets: +- name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 +steps: +- module: + path: modules/measurements.yaml + params: + action: start + namespaceIdx: 1 + serviceName: huge-service-statefulset-0 + statefulsetEndpoints: {{$STATEFULSET_ENDPOINTS}} +- module: + path: modules/service.yaml + params: + endpoints: {{$HUGE_SERVICE_ENDPOINTS}} + statefulsetEndpoints: {{$STATEFULSET_ENDPOINTS}} + isHeadless: {{$HUGE_SERVICE_HEADLESS}} + serviceName: huge-service +- module: + path: modules/measurements.yaml + params: + action: gather + namespaceIdx: 1 + serviceName: huge-service-statefulset-0 + statefulsetEndpoints: {{$STATEFULSET_ENDPOINTS}} diff --git a/test/clusterloader2/testing/huge-service/modules/measurements.yaml b/test/clusterloader2/testing/huge-service/modules/measurements.yaml new file mode 100644 index 00000000000..9e7db784982 --- /dev/null +++ b/test/clusterloader2/testing/huge-service/modules/measurements.yaml @@ -0,0 +1,99 @@ +# Valid actions: "start", "gather" +{{$action := .action}} +{{$statefulsetEndpoints := .statefulsetEndpoints}} +{{$serviceName := .serviceName}} +{{$namespaceIdx := .namespaceIdx}} + +{{$ALLOWED_SLOW_API_CALLS := DefaultParam .CL2_ALLOWED_SLOW_API_CALLS 0}} +{{$HUGE_SERVICE_ALLOWED_SLOW_API_CALLS := DefaultParam .CL2_HUGE_SERVICE_ALLOWED_SLOW_API_CALLS 2}} +{{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT := DefaultParam 
.CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT "15m"}} +{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS false}} +{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}} + +{{$CUSTOM_API_CALL_THRESHOLDS := DefaultParam .CUSTOM_API_CALL_THRESHOLDS ""}} +{{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}} +{{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK true}} +{{$ENABLE_SYSTEM_POD_METRICS := DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$RESTART_COUNT_THRESHOLD_OVERRIDES := DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}} +{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}} +{{$CLUSTER_OOMS_IGNORED_PROCESSES := DefaultParam .CL2_CLUSTER_OOMS_IGNORED_PROCESSES ""}} +{{$ENABLE_CLUSTER_OOMS_TRACKER := DefaultParam .CL2_ENABLE_CLUSTER_OOMS_TRACKER true}} +{{$ENABLE_CONTAINER_RESTARTS_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESTARTS_MEASUREMENT false}} +{{$ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_ALLOWED_CONTAINER_RESTARTS 1}} +{{$CUSTOM_ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_CUSTOM_ALLOWED_CONTAINER_RESTARTS ""}} +{{$PROBE_MEASUREMENTS_PING_SLEEP_DURATION := DefaultParam .CL2_PROBE_MEASUREMENTS_PING_SLEEP_DURATION "1s"}} + +{{$allowedSlowCalls := AddInt $ALLOWED_SLOW_API_CALLS $HUGE_SERVICE_ALLOWED_SLOW_API_CALLS}} + +# DNS propagation configs, used to measure DNS propagation latency for statefulset in statefulset.yaml +# Flag to enable/disable the DNS propagation measurement +{{$ENABLE_DNS_PROPAGATION_MEASUREMENT := DefaultParam .CL2_ENABLE_DNS_PROPAGATION_MEASUREMENT false}} +# Time threshold for the DNS propagation measurement +{{$DNS_PROPAGATION_THRESHOLD := DefaultParam .CL2_DNS_PROPAGATION_THRESHOLD "10s"}} + + +steps: +- name: {{$action}}ing measurements + measurements: + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: {{$action}} +{{if not $USE_SIMPLE_LATENCY_QUERY}} + enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS}} + allowedSlowCalls: {{$allowedSlowCalls}} + customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}} +{{end}} + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: {{$action}} + enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE}} + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + allowedSlowCalls: {{$allowedSlowCalls}} + customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}} + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: {{$action}} + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} + clusterOOMsIgnoredProcesses: {{YamlQuote $CLUSTER_OOMS_IGNORED_PROCESSES 4}} + clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}} +{{if $ENABLE_IN_CLUSTER_NETWORK_LATENCY}} + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: {{$action}} + checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}} + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + pingSleepDuration: {{$PROBE_MEASUREMENTS_PING_SLEEP_DURATION}} +{{end}} +{{if $ENABLE_CONTAINER_RESTARTS_MEASUREMENT}} + - Identifier: ContainerRestarts + 
Method: ContainerRestarts + Params: + action: {{$action}} + enableViolations: true + defaultAllowedRestarts: {{$ALLOWED_CONTAINER_RESTARTS}} + customAllowedRestarts: {{YamlQuote $CUSTOM_ALLOWED_CONTAINER_RESTARTS 4}} +{{end}} +{{if $ENABLE_DNS_PROPAGATION_MEASUREMENT}} + - Identifier: DnsPropagation + Method: DnsPropagation + Params: + action: {{$action}} + DNSPropagationProbeStatefulSet: {{$serviceName}} + DNSPropagationProbeService: {{$serviceName}} + DNSPropagationProbeNamespaceIndex: {{$namespaceIdx}} + DNSPropagationProbePodCount: {{$statefulsetEndpoints}} + DNSPropagationProbeSampleCount: {{MinInt 25 (AddInt 15 (DivideInt $statefulsetEndpoints 1000))}} + replicasPerProbe: {{MinInt 10 (AddInt 2 (DivideInt .Nodes 100))}} + threshold: {{$DNS_PROPAGATION_THRESHOLD}} +{{end}} +- module: + path: ../load/modules/dns-performance-metrics.yaml + params: + action: {{$action}} diff --git a/test/clusterloader2/testing/huge-service/modules/service.yaml b/test/clusterloader2/testing/huge-service/modules/service.yaml new file mode 100644 index 00000000000..c9fdf0653e1 --- /dev/null +++ b/test/clusterloader2/testing/huge-service/modules/service.yaml @@ -0,0 +1,137 @@ +{{$endpoints := .endpoints}} +{{$statefulsetEndpoints := .statefulsetEndpoints}} +{{$isHeadless := .isHeadless}} +{{$serviceName := .serviceName}} + +## CL2 params +{{$CHECK_IF_PODS_ARE_UPDATED := DefaultParam .CL2_CHECK_IF_PODS_ARE_UPDATED true}} +{{$ENABLE_LARGE_STATEFULSET := DefaultParam .CL2_ENABLE_LARGE_STATEFULSET false}} + +steps: +{{if $ENABLE_LARGE_STATEFULSET}} +- module: + path: modules/statefulset.yaml + params: + action: "create" + replicasPerNamespace: 1 + endpoints: {{$statefulsetEndpoints}} + serviceName: {{$serviceName}}-statefulset +{{end}} + +- name: Create {{$serviceName}} + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: {{$serviceName}} + objectTemplatePath: service.yaml + templateFillMap: + HeadlessService: {{$isHeadless}} +- name: Creating {{$serviceName}} measurements + measurements: + - Identifier: WaitForHugeServiceDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + checkIfPodsAreUpdated: {{$CHECK_IF_PODS_ARE_UPDATED}} + kind: Deployment + labelSelector: group = {{$serviceName}} + operationTimeout: 30m +- name: Creating {{$serviceName}} pods + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: huge-service-deployment + objectTemplatePath: simple-deployment.yaml + templateFillMap: + Replicas: {{$endpoints}} + EnvVar: a + Group: huge-service + CpuRequest: 1m + MemoryRequest: 10M + SvcName: {{$serviceName}} +- name: Waiting for {{$serviceName}} pods to be created + measurements: + - Identifier: WaitForHugeServiceDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +- module: + path: ../load/modules/dns-k8s-hostnames.yaml + +- name: Updating {{$serviceName}} pods + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: huge-service-deployment + objectTemplatePath: simple-deployment.yaml + templateFillMap: + Replicas: {{$endpoints}} + EnvVar: b + Group: huge-service + CpuRequest: 1m + MemoryRequest: 10M + SvcName: {{$serviceName}} +- name: Waiting for {{$serviceName}} pods to be updated + measurements: + - Identifier: WaitForHugeServiceDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + +{{if 
$ENABLE_LARGE_STATEFULSET}} +- module: + path: modules/statefulset.yaml + params: + action: "delete" + replicasPerNamespace: 0 + serviceName: {{$serviceName}}-statefulset +{{end}} + +- name: Deleting {{$serviceName}} pods + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: huge-service-deployment + objectTemplatePath: simple-deployment.yaml + templateFillMap: + Replicas: {{$endpoints}} + Group: {{$serviceName}} + CpuRequest: 1m + MemoryRequest: 10M + SvcName: {{$serviceName}} +- name: Waiting for {{$serviceName}} pods to be deleted + measurements: + - Identifier: WaitForHugeServiceDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- name: Delete {{$serviceName}} + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: {{$serviceName}} + objectTemplatePath: service.yaml + templateFillMap: + HeadlessService: {{$isHeadless}} diff --git a/test/clusterloader2/testing/huge-service/modules/statefulset.yaml b/test/clusterloader2/testing/huge-service/modules/statefulset.yaml new file mode 100644 index 00000000000..3a6cc02b0e1 --- /dev/null +++ b/test/clusterloader2/testing/huge-service/modules/statefulset.yaml @@ -0,0 +1,47 @@ +# Valid actions: "create", "delete" +{{$action := .action}} + +{{$replicasPerNamespace := .replicasPerNamespace}} +{{$endpoints := DefaultParam .endpoints 100}} +{{$serviceName := .serviceName}} + +steps: +- name: {{$action}} {{$serviceName}} + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: Sequence + objectBundle: + - basename: {{$serviceName}} + objectTemplatePath: statefulset_service.yaml +- name: Creating {{$serviceName}} measurements + measurements: + - Identifier: WaitForHugeServiceStatefulSet + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: StatefulSet + labelSelector: group = load + operationTimeout: 30m +- name: {{$action}} {{$serviceName}} pods + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: Sequence + objectBundle: + - basename: {{$serviceName}} + objectTemplatePath: statefulset.yaml + templateFillMap: + ReplicasMin: {{$endpoints}} + ReplicasMax: {{$endpoints}} +- name: Waiting for {{$serviceName}} pods to be {{$action}}d + measurements: + - Identifier: WaitForHugeServiceStatefulSet + Method: WaitForControlledPodsRunning + Params: + action: gather diff --git a/test/clusterloader2/testing/huge-service/service.yaml b/test/clusterloader2/testing/huge-service/service.yaml new file mode 120000 index 00000000000..81ead7b05c2 --- /dev/null +++ b/test/clusterloader2/testing/huge-service/service.yaml @@ -0,0 +1 @@ +../load/service.yaml \ No newline at end of file diff --git a/test/clusterloader2/testing/huge-service/simple-deployment.yaml b/test/clusterloader2/testing/huge-service/simple-deployment.yaml new file mode 120000 index 00000000000..b8d885b9e92 --- /dev/null +++ b/test/clusterloader2/testing/huge-service/simple-deployment.yaml @@ -0,0 +1 @@ +../load/simple-deployment.yaml \ No newline at end of file diff --git a/test/clusterloader2/testing/huge-service/statefulset.yaml b/test/clusterloader2/testing/huge-service/statefulset.yaml new file mode 120000 index 00000000000..ab4acbb1976 --- /dev/null +++ b/test/clusterloader2/testing/huge-service/statefulset.yaml @@ -0,0 +1 @@ +../load/statefulset.yaml \ No newline at end of file diff 
--git a/test/clusterloader2/testing/huge-service/statefulset_service.yaml b/test/clusterloader2/testing/huge-service/statefulset_service.yaml new file mode 120000 index 00000000000..86d600e3434 --- /dev/null +++ b/test/clusterloader2/testing/huge-service/statefulset_service.yaml @@ -0,0 +1 @@ +../load/statefulset_service.yaml \ No newline at end of file diff --git a/test/clusterloader2/testing/l4ilb/config-ilb-recovery.yaml b/test/clusterloader2/testing/l4ilb/config-ilb-recovery.yaml new file mode 100644 index 00000000000..73c691d9b6a --- /dev/null +++ b/test/clusterloader2/testing/l4ilb/config-ilb-recovery.yaml @@ -0,0 +1,130 @@ +# Timer measurement is used to measure the time between the start of an action executed by the +# Exec command and when the ILB services are running and reachable. +# ServiceCreationLatency measurement is used only for determining the moment when all of the +# ILBs are reachable. Since it reports time based on the initial creation of the service, and +# not from the start of the action, the measured time will not be shown. + +# Constants +{{$LB_REPLICAS_PER_NS := DefaultParam .CL2_LB_REPLICAS_PER_NS 5}} +{{$LB_BACKEND_SIZE := DefaultParam .CL2_LB_BACKEND_SIZE 10}} +{{$EXTERNAL_TRAFFIC_POLICY := DefaultParam .CL2_EXTERNAL_TRAFFIC_POLICY "Cluster"}} +{{$ilbWaitTimeout := DefaultParam .CL2_ILB_WAIT_TIMEOUT "10m"}} +{{$ilbQPS := DefaultParam .CL2_ILB_TEST_QPS 20}} +{{$ILB_RECOVERY_LABEL := "ilb-recovery"}} +{{$namespaces := 1}} + +# Command to be executed +{{$EXEC_COMMAND := DefaultParam .CL2_EXEC_COMMAND nil}} +{{$EXEC_TIMEOUT := DefaultParam .CL2_EXEC_TIMEOUT "60m"}} +{{$EXEC_ADDITIONAL_ARGUMENT := DefaultParam .CL2_EXEC_ADDITIONAL_ARGUMENT ""}} + +name: l4ilb-recovery +namespace: + number: {{$namespaces}} +tuningSets: +- name: ILBConstantQPS + qpsLoad: + qps: {{$ilbQPS}} +steps: +- name: Start measurement for running pods + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = ilb-load + operationTimeout: 15m +- name: Creating ILBs + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$LB_REPLICAS_PER_NS}} + tuningSet: ILBConstantQPS + objectBundle: + - basename: ilb-service + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: ilb-dep + ExternalTrafficPolicy: {{$EXTERNAL_TRAFFIC_POLICY}} + ILBSizeLabel: {{$ILB_RECOVERY_LABEL}} + - basename: ilb-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{$LB_BACKEND_SIZE}} +- name: Waiting for objects creation to be completed + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +{{if $EXEC_COMMAND}} +- name: Starting measurements + measurements: + - Identifier: ServiceCreationLatency + Method: ServiceCreationLatency + Params: + action: start + waitTimeout: {{$ilbWaitTimeout}} + labelSelector: size = {{$ILB_RECOVERY_LABEL}} +- name: Starting ILB recovery timer + measurements: + - Identifier: ILBRecoveryTimer + Method: Timer + Params: + action: start + label: {{$ILB_RECOVERY_LABEL}} +- name: Execute command + measurements: + - Identifier: ExecCommand + Method: Exec + Params: + timeout: {{$EXEC_TIMEOUT}} + command: + {{range $EXEC_COMMAND}} + - {{.}} + {{end}} + {{if $EXEC_ADDITIONAL_ARGUMENT}} + - {{$EXEC_ADDITIONAL_ARGUMENT}} + {{end}} +- name: Waiting for ILBs to be reachable after the command is executed + measurements: + - Identifier: 
ServiceCreationLatency + Method: ServiceCreationLatency + Params: + action: waitForReady +- name: Stopping ILB recovery timer + measurements: + - Identifier: ILBRecoveryTimer + Method: Timer + Params: + action: stop + label: {{$ILB_RECOVERY_LABEL}} +{{end}} +- name: Deleting ILBs + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: ILBConstantQPS + objectBundle: + - basename: ilb-service + objectTemplatePath: service.yaml + - basename: ilb-dep + objectTemplatePath: dep.yaml +- name: Waiting for objects deletion to be completed + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- name: Gathering measurements + measurements: + - Identifier: ILBRecoveryTimer + Method: Timer + Params: + action: gather + label: {{$ILB_RECOVERY_LABEL}} diff --git a/test/clusterloader2/testing/l4ilb/config.yaml b/test/clusterloader2/testing/l4ilb/config.yaml new file mode 100644 index 00000000000..65126087536 --- /dev/null +++ b/test/clusterloader2/testing/l4ilb/config.yaml @@ -0,0 +1,66 @@ +{{$LARGE_BACKEND_LB_SERVICE_COUNT := DefaultParam .CL2_LARGE_BACKEND_LB_SERVICE_COUNT 2}} +{{$MEDIUM_BACKEND_LB_SERVICE_COUNT := DefaultParam .CL2_MEDIUM_BACKEND_LB_SERVICE_COUNT 2}} +{{$SMALL_BACKEND_LB_SERVICE_COUNT := DefaultParam .CL2_SMALL_BACKEND_LB_SERVICE_COUNT 2}} +{{$ilbQPS := DefaultParam .CL2_ILB_TEST_QPS 20}} + +# Test +name: ilbload +namespace: + number: 1 +tuningSets: +- name: ILBConstantQPS + qpsLoad: + qps: {{$ilbQPS}} +steps: +# Mesure each of the ILB services separately, this will provide insight on how long programming +# ILB takes as a function of number of backends. +- module: + path: /modules/measurements.yaml + params: + action: start +- name: Start measurement for running pods + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = ilb-load + operationTimeout: 15m +# Create ILBs +- module: + path: /modules/services.yaml + params: + actionName: Create + largeBackendLbServiceCount: {{$LARGE_BACKEND_LB_SERVICE_COUNT}} + mediumBackendLbServiceCount: {{$MEDIUM_BACKEND_LB_SERVICE_COUNT}} + smallBackendLbServiceCount: {{$SMALL_BACKEND_LB_SERVICE_COUNT}} +- module: + path: /modules/measurements.yaml + params: + action: waitForReady +- name: Waiting for objects creation to be completed + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +# Delete ILBs +- module: + path: /modules/services.yaml + params: + actionName: Delete + largeBackendLbServiceCount: 0 + mediumBackendLbServiceCount: 0 + smallBackendLbServiceCount: 0 +- name: Waiting for objects deletion to be completed + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- module: + path: /modules/measurements.yaml + params: + action: gather diff --git a/test/clusterloader2/testing/l4ilb/dep.yaml b/test/clusterloader2/testing/l4ilb/dep.yaml new file mode 100644 index 00000000000..8a2f505b284 --- /dev/null +++ b/test/clusterloader2/testing/l4ilb/dep.yaml @@ -0,0 +1,22 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: ilb-load +spec: + replicas: {{.NumReplicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: ilb-load + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: 
nginx + ports: + - containerPort: 8080 diff --git a/test/clusterloader2/testing/l4ilb/modules/measurements.yaml b/test/clusterloader2/testing/l4ilb/modules/measurements.yaml new file mode 100644 index 00000000000..1f30b66b917 --- /dev/null +++ b/test/clusterloader2/testing/l4ilb/modules/measurements.yaml @@ -0,0 +1,25 @@ +# Valid actions: "start", "waitForReady" and "gather" +{{$action := .action}} +{{$ilbWaitTimeout := DefaultParam .CL2_ILB_WAIT_TIMEOUT "10m"}} + +steps: +- name: Service creation latency measurements - '{{$action}}' + measurements: + - Identifier: ServiceCreationLatencyLarge + Method: ServiceCreationLatency + Params: + action: {{$action}} + waitTimeout: {{$ilbWaitTimeout}} + labelSelector: size = ilb-large + - Identifier: ServiceCreationLatencyMedium + Method: ServiceCreationLatency + Params: + action: {{$action}} + waitTimeout: {{$ilbWaitTimeout}} + labelSelector: size = ilb-medium + - Identifier: ServiceCreationLatencySmall + Method: ServiceCreationLatency + Params: + action: {{$action}} + waitTimeout: {{$ilbWaitTimeout}} + labelSelector: size = ilb-small diff --git a/test/clusterloader2/testing/l4ilb/modules/services.yaml b/test/clusterloader2/testing/l4ilb/modules/services.yaml new file mode 100644 index 00000000000..df737bd2a21 --- /dev/null +++ b/test/clusterloader2/testing/l4ilb/modules/services.yaml @@ -0,0 +1,61 @@ +{{$EXTERNAL_TRAFFIC_POLICY := DefaultParam .CL2_EXTERNAL_TRAFFIC_POLICY "Cluster"}} +{{$LARGE_BACKEND_SIZE := DefaultParam .CL2_LARGE_BACKEND_SIZE 300}} +{{$MEDIUM_BACKEND_SIZE := DefaultParam .CL2_MEDIUM_BACKEND_SIZE 150}} +{{$SMALL_BACKEND_SIZE := DefaultParam .CL2_SMALL_BACKEND_SIZE 10}} +{{$LARGE_BACKEND_LB_SERVICE_COUNT := .largeBackendLbServiceCount}} +{{$MEDIUM_BACKEND_LB_SERVICE_COUNT := .mediumBackendLbServiceCount}} +{{$SMALL_BACKEND_LB_SERVICE_COUNT := .smallBackendLbServiceCount}} +{{$actionName := .actionName}} +{{$namespaces := 1}} + +steps: +- name: {{$actionName}} ILBs + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$LARGE_BACKEND_LB_SERVICE_COUNT}} + tuningSet: ILBConstantQPS + objectBundle: + - basename: large-backends-service + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: large-backends-dep + ExternalTrafficPolicy: {{$EXTERNAL_TRAFFIC_POLICY}} + ILBSizeLabel: ilb-large + - basename: large-backends-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{$LARGE_BACKEND_SIZE}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$MEDIUM_BACKEND_LB_SERVICE_COUNT}} + tuningSet: ILBConstantQPS + objectBundle: + - basename: medium-backends-service + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: medium-backends-dep + ExternalTrafficPolicy: {{$EXTERNAL_TRAFFIC_POLICY}} + ILBSizeLabel: ilb-medium + - basename: medium-backends-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{$MEDIUM_BACKEND_SIZE}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$SMALL_BACKEND_LB_SERVICE_COUNT}} + tuningSet: ILBConstantQPS + objectBundle: + - basename: small-backends-service + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: small-backends-dep + ExternalTrafficPolicy: {{$EXTERNAL_TRAFFIC_POLICY}} + ILBSizeLabel: ilb-small + - basename: small-backends-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{$SMALL_BACKEND_SIZE}} diff --git a/test/clusterloader2/testing/l4ilb/service.yaml b/test/clusterloader2/testing/l4ilb/service.yaml 
new file mode 100644
index 00000000000..0a105b36f79
--- /dev/null
+++ b/test/clusterloader2/testing/l4ilb/service.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{.Name}}
+  labels:
+    size: {{.ILBSizeLabel}}
+  annotations:
+    cloud.google.com/load-balancer-type: "Internal"
+spec:
+  externalTrafficPolicy: {{.ExternalTrafficPolicy}}
+  type: LoadBalancer
+  selector:
+    name: {{.DeploymentBaseName}}-{{.Index}}
+  ports:
+  - port: 8080
+    targetPort: 80
diff --git a/test/clusterloader2/testing/l4lb/config.yaml b/test/clusterloader2/testing/l4lb/config.yaml
new file mode 100644
index 00000000000..d50ac094d4a
--- /dev/null
+++ b/test/clusterloader2/testing/l4lb/config.yaml
@@ -0,0 +1,105 @@
+# LOAD_BALANCER_BACKEND_SIZE specifies the number of backend pods behind each LB
+{{$LOAD_BALANCER_BACKEND_SIZE := DefaultParam .CL2_LOAD_BALANCER_BACKEND_SIZE 5}}
+# LOAD_BALANCER_REPLICAS specifies the number of LoadBalancer-type Services
+{{$LOAD_BALANCER_REPLICAS := DefaultParam .CL2_LOAD_BALANCER_REPLICAS 3}}
+# LOAD_BALANCER_TYPE specifies the type of L4 LB created. Valid values are "INTERNAL" and "EXTERNAL"
+{{$LOAD_BALANCER_TYPE := DefaultParam .CL2_LOAD_BALANCER_TYPE "EXTERNAL"}}
+# EXTERNAL_TRAFFIC_POLICY specifies the externalTrafficPolicy on the LB-type Service. Valid values are "Cluster" and "Local"
+{{$EXTERNAL_TRAFFIC_POLICY := DefaultParam .CL2_EXTERNAL_TRAFFIC_POLICY "Cluster"}}
+# NODE_SYNC_TIMEOUT specifies the timeout to wait for nodesync to complete
+{{$NODE_SYNC_TIMEOUT := DefaultParam .CL2_NODE_SYNC_TIMEOUT "30m"}}
+# L4LB_SYNC_TIMEOUT specifies the timeout to wait for LB creation or deletion to complete
+{{$L4LB_SYNC_TIMEOUT := DefaultParam .CL2_L4LB_SYNC_TIMEOUT "30m"}}
+
+# A fixed QPS value is used in this first version of the test; the rate of pod creation is not a concern yet.
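Since all of the knobs above are wrapped in DefaultParam, a run against a different target can supply its own override file in the usual key/value form; the values below are purely illustrative, not recommended settings:

```yaml
# Hypothetical override file for a larger l4lb run (illustrative values only).
CL2_LOAD_BALANCER_REPLICAS: 10
CL2_LOAD_BALANCER_BACKEND_SIZE: 20
CL2_LOAD_BALANCER_TYPE: "INTERNAL"
CL2_EXTERNAL_TRAFFIC_POLICY: "Local"
CL2_NODE_SYNC_TIMEOUT: "45m"
```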
+{{$lbQPS := 20}} +{{$namespaces := 1}} + +name: l4lbload +namespace: + number: {{$namespaces}} +tuningSets: +- name: LBConstantQPS + qpsLoad: + qps: {{$lbQPS}} +steps: +- name: Initialize Measurements + measurements: + - Identifier: LBServiceCreationLatency + Method: ServiceCreationLatency + Params: + action: start + labelSelector: test = l4lb-load + waitTimeout: {{$L4LB_SYNC_TIMEOUT}} + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: test = l4lb-load + operationTimeout: 15m +- name: Creating LBs + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$LOAD_BALANCER_REPLICAS}} + tuningSet: LBConstantQPS + objectBundle: + - basename: lb-service + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: lb-dep + ExternalTrafficPolicy: {{$EXTERNAL_TRAFFIC_POLICY}} + LoadBalancerType: {{$LOAD_BALANCER_TYPE}} + - basename: lb-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{$LOAD_BALANCER_BACKEND_SIZE}} +- name: Wait for LBs to be ready + measurements: + - Identifier: LBServiceCreationLatency + Method: ServiceCreationLatency + Params: + action: waitForReady + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- name: Measure NodeSync latency + measurements: + - Identifier: NodeSyncLatency + Method: LoadBalancerNodeSyncLatency + Params: + action: measure + labelSelector: test = l4lb-load + waitTimeout: {{$NODE_SYNC_TIMEOUT}} +- name: Deleting LBs + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: LBConstantQPS + objectBundle: + - basename: lb-service + objectTemplatePath: service.yaml + - basename: lb-dep + objectTemplatePath: dep.yaml +- name: Wait for LBs to be deleted + measurements: + - Identifier: LBServiceCreationLatency + Method: ServiceCreationLatency + Params: + action: waitForDeletion +- name: Gather Measurements + measurements: + - Identifier: LBServiceCreationLatency + Method: ServiceCreationLatency + Params: + action: gather + - Identifier: NodeSyncLatency + Method: LoadBalancerNodeSyncLatency + Params: + action: gather diff --git a/test/clusterloader2/testing/l4lb/dep.yaml b/test/clusterloader2/testing/l4lb/dep.yaml new file mode 100644 index 00000000000..1a145702747 --- /dev/null +++ b/test/clusterloader2/testing/l4lb/dep.yaml @@ -0,0 +1,22 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + test: l4lb-load +spec: + replicas: {{.NumReplicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + test: l4lb-load + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: nginx + ports: + - containerPort: 80 diff --git a/test/clusterloader2/testing/l4lb/service.yaml b/test/clusterloader2/testing/l4lb/service.yaml new file mode 100644 index 00000000000..468c9a47acb --- /dev/null +++ b/test/clusterloader2/testing/l4lb/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} + labels: + test: l4lb-load + annotations: + networking.gke.io/load-balancer-type: {{.LoadBalancerType}} +spec: + externalTrafficPolicy: {{.ExternalTrafficPolicy}} + type: LoadBalancer + selector: + name: {{.DeploymentBaseName}}-{{.Index}} + ports: + - port: 80 + targetPort: 80 diff --git a/test/clusterloader2/testing/list/README.md b/test/clusterloader2/testing/list/README.md new file mode 100644 index 00000000000..d20c975055e 
--- /dev/null
+++ b/test/clusterloader2/testing/list/README.md
@@ -0,0 +1,73 @@
+# List load test
+
+The list load test performs the following steps:
+- Create configmaps.
+  - The namespaces used here are managed by CL2.
+  - The size and number of configmaps can be specified using `CL2_LIST_CONFIG_MAP_BYTES` and `CL2_LIST_CONFIG_MAP_NUMBER`.
+- Create RBAC rules to allow lister pods to access these configmaps.
+- Create lister pods using a deployment in a separate namespace to list configmaps and secrets.
+  - The namespace is created using `namespace.yaml` as a template, but its lifecycle is managed by CL2.
+  - The number of replicas for the lister pods can be specified using `CL2_LIST_BENCHMARK_PODS`.
+  - Each lister pod will maintain `CL2_LIST_CONCURRENCY` inflight list requests.
+- Measurement uses [`APIResponsivenessPrometheusSimple`](https://github.com/kubernetes/perf-tests/blob/master/clusterloader2/README.md) to measure API latency for list configmaps and secrets calls.
+
+The lister pods leverage https://github.com/kubernetes/perf-tests/tree/master/util-images/request-benchmark to create in-cluster list load.
+
+## Running Tests Locally with Kind
+
+**Prerequisites:**
+
+* **Hardware:** To ensure smooth operation, it's recommended to have at least 4 CPU cores and 16GB of RAM free.
+* **Kind:** Install Kind if you haven't already. (See: [https://kind.sigs.k8s.io/](https://kind.sigs.k8s.io/))
+
+**Steps:**
+
+1. **Create the Kind Cluster:**
+   * Execute the following command:
+   ```bash
+   kind create cluster --config ./kind.yaml
+   ```
+2. **Export Kubeconfig:**
+   * Run:
+   ```bash
+   kind export kubeconfig
+   ```
+3. **Wait for Nodes:**
+   * Allow approximately 30 seconds for the cluster node to become schedulable.
+4. **Run the Test:**
+   * Execute the following command:
+   ```bash
+   export CL2_ENABLE_CONTAINER_RESOURCES_MEASUREMENT=true
+   export CL2_PROMETHEUS_TOLERATE_MASTER=true
+   go run ../../cmd/clusterloader.go --provider kind -v=4 \
+     --testconfig ./config.yaml \
+     --kubeconfig $HOME/.kube/config \
+     --enable-prometheus-server=true \
+     --prometheus-scrape-kube-proxy=false \
+     --prometheus-apiserver-scrape-port=6443 \
+     --prometheus-scrape-master-kubelets \
+     --report-dir=report
+   ```
+
+5. **Check results:**
+   * See the `report` directory for the report.
+6. **Delete cluster before running another test:**
+   * Execute the following command:
+   ```bash
+   kind delete cluster
+   ```
+
+**Local Development and Testing:**
+
+Kind's speed makes it ideal for rapid Kubernetes development and benchmarking. To build and test local changes:
+
+1. **Build a Node Image:**
+   * From the `kubernetes/kubernetes` directory, run:
+   ```bash
+   kind build node-image
+   ```
+2. 
**Create Cluster with Custom Image:** + * When creating the Kind cluster, add the `--image kindest/node:latest` flag to the `kind create cluster` command: + ```bash + kind create cluster --config ./kind.yaml --image kindest/node:latest + ``` diff --git a/test/clusterloader2/testing/list/clusterrole.yaml b/test/clusterloader2/testing/list/clusterrole.yaml new file mode 100644 index 00000000000..e8cb88a7231 --- /dev/null +++ b/test/clusterloader2/testing/list/clusterrole.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: list-clusterrole +rules: +- apiGroups: + - "" + resources: + - "*" + verbs: + - list diff --git a/test/clusterloader2/testing/list/clusterrolebinding.yaml b/test/clusterloader2/testing/list/clusterrolebinding.yaml new file mode 100644 index 00000000000..871a480b772 --- /dev/null +++ b/test/clusterloader2/testing/list/clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{.Name}} +subjects: +- kind: ServiceAccount + name: default + namespace: list-benchmark-0 +roleRef: + kind: ClusterRole + name: list-clusterrole-0 + apiGroup: rbac.authorization.k8s.io diff --git a/test/clusterloader2/testing/list/config.yaml b/test/clusterloader2/testing/list/config.yaml new file mode 100644 index 00000000000..7fcd7a5d4d9 --- /dev/null +++ b/test/clusterloader2/testing/list/config.yaml @@ -0,0 +1,85 @@ +# List benchmark +{{$configMapBytes := DefaultParam .CL2_LIST_CONFIG_MAP_BYTES 100000}} +{{$configMapNumber := DefaultParam .CL2_LIST_CONFIG_MAP_NUMBER 10000}} +{{$configMapGroup := DefaultParam .CL2_LIST_CONFIG_MAP_GROUP "list-configmap"}} + +{{$listReplicas := DefaultParam .CL2_LIST_BENCHMARK_PODS 1}} +{{$contentType := DefaultParam .CL2_LIST_BENCHMARK_CONTENT_TYPE "json"}} + +name: list-benchmark +namespace: + number: 1 +tuningSets: +- name: Sequence + parallelismLimitedLoad: + parallelismLimit: 10 +steps: +- name: Setup namespace for list benchmark pods + phases: + - replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: list-benchmark + objectTemplatePath: namespace.yaml +- name: Setup permissions + phases: + - replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: list-clusterrole + objectTemplatePath: clusterrole.yaml + - replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: list-clusterrolebinding + objectTemplatePath: clusterrolebinding.yaml + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: list-rolebinding + objectTemplatePath: rolebinding.yaml + +- name: Create configmaps + phases: + - namespaceRange: + min: 1 + max: 1 + tuningSet: Sequence + replicasPerNamespace: {{$configMapNumber}} + objectBundle: + - basename: {{$configMapGroup}} + objectTemplatePath: configmap.yaml + templateFillMap: + bytes: {{$configMapBytes}} + group: {{$configMapGroup}} + +- module: + path: modules/list-benchmark.yaml + params: + namePrefix: "list-configmaps-" + replicas: {{$listReplicas}} + uri: /api/v1/configmaps?resourceVersion=0 + namespaced: false + contentType: {{$contentType}} +- module: + path: /modules/measurements.yaml + params: + action: start +- name: Wait 5 minutes + measurements: + - Identifier: Wait + Method: Sleep + Params: + duration: 5m +- module: + path: /modules/measurements.yaml + params: + action: gather +- module: + path: modules/list-benchmark.yaml + params: + namePrefix: "list-configmaps-" + replicas: 0 diff --git 
a/test/clusterloader2/testing/list/configmap.yaml b/test/clusterloader2/testing/list/configmap.yaml new file mode 100644 index 00000000000..6e0582cb9d0 --- /dev/null +++ b/test/clusterloader2/testing/list/configmap.yaml @@ -0,0 +1,11 @@ +{{$bytes := .bytes}} +{{$group := DefaultParam .group .Name}} + +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{.Name}} + labels: + group: {{$group}} +data: + key: "{{RandData $bytes}}" diff --git a/test/clusterloader2/testing/list/deployment.yaml b/test/clusterloader2/testing/list/deployment.yaml new file mode 100644 index 00000000000..799bd21ae14 --- /dev/null +++ b/test/clusterloader2/testing/list/deployment.yaml @@ -0,0 +1,37 @@ +# The source of the image is at https://github.com/kubernetes/perf-tests/tree/master/util-images/request-benchmark +{{$image := DefaultParam .CL2_BENCHMARK_IMAGE "gcr.io/k8s-testimages/perf-tests-util/request-benchmark:latest"}} +{{$cpu := DefaultParam .cpu "1250m"}} +{{$memory := DefaultParam .memory "32Mi"}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: benchmark +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: benchmark + spec: + containers: + - name: {{.Name}} + image: {{$image}} + imagePullPolicy: Always + args: + - --namespace={{.Namespace}} + - --uri={{.Uri}} + - --content-type={{.ContentType}} + resources: + requests: + cpu: {{$cpu}} + memory: {{$memory}} + limits: + cpu: {{$cpu}} + memory: {{$memory}} diff --git a/test/clusterloader2/testing/list/kind.yaml b/test/clusterloader2/testing/list/kind.yaml new file mode 100644 index 00000000000..07750797bdf --- /dev/null +++ b/test/clusterloader2/testing/list/kind.yaml @@ -0,0 +1,13 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: +- role: control-plane + kubeadmConfigPatches: + - | + kind: ClusterConfiguration + controllerManager: + extraArgs: + bind-address: "0.0.0.0" + scheduler: + extraArgs: + bind-address: "0.0.0.0" diff --git a/test/clusterloader2/testing/list/modules/list-benchmark.yaml b/test/clusterloader2/testing/list/modules/list-benchmark.yaml new file mode 100644 index 00000000000..193f3d3660c --- /dev/null +++ b/test/clusterloader2/testing/list/modules/list-benchmark.yaml @@ -0,0 +1,38 @@ +{{$namePrefix := DefaultParam .namePrefix "list-benchmark-"}} +{{$replicas := DefaultParam .replicas 0}} +{{$uri := DefaultParam .uri "/"}} +{{$contentType := DefaultParam .contentType "json"}} + +steps: +- name: Creating WaitForControlledPodsRunning measurement + measurements: + - Identifier: WaitForListBenchmarkDeployment + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + checkIfPodsAreUpdated: true + kind: Deployment + labelSelector: group = benchmark + operationTimeout: 5m +- name: Deploying {{$namePrefix}}deployment + phases: + - tuningSet: Sequence + replicasPerNamespace: 1 + namespaceRange: + min: 0 + max: 0 + basename: list-benchmark + objectBundle: + - basename: {{$namePrefix}}deploy + objectTemplatePath: deployment.yaml + templateFillMap: + Replicas: {{$replicas}} + Uri: {{$uri}} + ContentType: {{$contentType}} +- name: Waiting for WaitForControlledPodsRunning gather + measurements: + - Identifier: WaitForListBenchmarkDeployment + Method: WaitForControlledPodsRunning + Params: + action: gather diff --git a/test/clusterloader2/testing/list/modules/measurements.yaml b/test/clusterloader2/testing/list/modules/measurements.yaml new file mode 100644 index 00000000000..c28f5af7003 --- 
/dev/null +++ b/test/clusterloader2/testing/list/modules/measurements.yaml @@ -0,0 +1,20 @@ +## Measurement module defines test scoped measurement. + +## Input params +# Valid actions: "start", "gather" +{{$action := .action}} + + +steps: +- name: "{{$action}}ing measurements" + measurements: + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: {{$action}} + enableViolations: false + useSimpleLatencyQuery: true + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: {{$action}} \ No newline at end of file diff --git a/test/clusterloader2/testing/list/namespace.yaml b/test/clusterloader2/testing/list/namespace.yaml new file mode 100644 index 00000000000..7543a92f490 --- /dev/null +++ b/test/clusterloader2/testing/list/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: list-benchmark diff --git a/test/clusterloader2/testing/list/rolebinding.yaml b/test/clusterloader2/testing/list/rolebinding.yaml new file mode 100644 index 00000000000..947e1785697 --- /dev/null +++ b/test/clusterloader2/testing/list/rolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{.Name}} +subjects: +- kind: ServiceAccount + name: default + namespace: {{.Namespace}} +roleRef: + kind: ClusterRole + name: list-clusterrole-0 + apiGroup: rbac.authorization.k8s.io diff --git a/test/clusterloader2/testing/load/config.yaml b/test/clusterloader2/testing/load/config.yaml new file mode 100644 index 00000000000..f20d519fa6d --- /dev/null +++ b/test/clusterloader2/testing/load/config.yaml @@ -0,0 +1,464 @@ +# ASSUMPTIONS: +# - Underlying cluster should have 100+ nodes. +# - Number of nodes should be divisible by NODES_PER_NAMESPACE (default 100). +# - The number of created SVCs is half the number of created Deployments. +# - Only half of Deployments will be assigned 1-1 to existing SVCs. + +#Constants +# Cater for the case where the number of nodes is less than nodes per namespace. 
See https://github.com/kubernetes/perf-tests/issues/887 +{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .NODES_PER_NAMESPACE 100)}} +# See https://github.com/kubernetes/perf-tests/pull/1667#issuecomment-769642266 +{{$IS_SMALL_CLUSTER := lt .Nodes 100}} +{{$PODS_PER_NODE := DefaultParam .PODS_PER_NODE 30}} +{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}} +{{$DELETE_TEST_THROUGHPUT := DefaultParam .CL2_DELETE_TEST_THROUGHPUT $LOAD_TEST_THROUGHPUT}} +{{$RATE_LIMIT_POD_CREATION := DefaultParam .CL2_RATE_LIMIT_POD_CREATION true}} +{{$BIG_GROUP_SIZE := DefaultParam .BIG_GROUP_SIZE 250}} +{{$MEDIUM_GROUP_SIZE := DefaultParam .MEDIUM_GROUP_SIZE 30}} +{{$SMALL_GROUP_SIZE := DefaultParam .SMALL_GROUP_SIZE 5}} +{{$SMALL_STATEFUL_SETS_PER_NAMESPACE := DefaultParam .SMALL_STATEFUL_SETS_PER_NAMESPACE 1}} +{{$MEDIUM_STATEFUL_SETS_PER_NAMESPACE := DefaultParam .MEDIUM_STATEFUL_SETS_PER_NAMESPACE 1}} +{{$ENABLE_CHAOSMONKEY := DefaultParam .ENABLE_CHAOSMONKEY false}} +{{$ENABLE_API_AVAILABILITY_MEASUREMENT := DefaultParam .CL2_ENABLE_API_AVAILABILITY_MEASUREMENT false}} +{{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}} +{{$RANDOM_SCALE_FACTOR := 0.5}} +#Variables +{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}} +{{$totalPods := MultiplyInt $namespaces $NODES_PER_NAMESPACE $PODS_PER_NODE}} +{{$podsPerNamespace := DivideInt $totalPods $namespaces}} +{{$saturationTime := DivideInt $totalPods $LOAD_TEST_THROUGHPUT}} +{{$deletionTime := DivideInt $totalPods $DELETE_TEST_THROUGHPUT}} +# bigDeployments - 1/4 of namespace pods should be in big Deployments. +{{$bigDeploymentsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 4 $BIG_GROUP_SIZE)}} +# mediumDeployments - 1/4 of namespace pods should be in medium Deployments. +{{$mediumDeploymentsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 4 $MEDIUM_GROUP_SIZE)}} +# smallDeployments - 1/2 of namespace pods should be in small Deployments. +{{$smallDeploymentsPerNamespace := DivideInt $podsPerNamespace (MultiplyInt 2 $SMALL_GROUP_SIZE)}} + +# Stateful sets are enabled. Reduce the number of small and medium deployments per namespace +# See https://github.com/kubernetes/perf-tests/issues/1036#issuecomment-607631768 +# Ensure non zero or negative after subtraction. +{{$smallDeploymentsPerNamespace := MaxInt 0 (SubtractInt $smallDeploymentsPerNamespace $SMALL_STATEFUL_SETS_PER_NAMESPACE)}} +{{$mediumDeploymentsPerNamespace := MaxInt 0 (SubtractInt $mediumDeploymentsPerNamespace $MEDIUM_STATEFUL_SETS_PER_NAMESPACE)}} + +# Jobs are enabled. Reduce the number of small, medium, big deployments per namespace. +# Ensure non zero or negative after subtraction. +{{$smallDeploymentsPerNamespace := MaxInt 0 (SubtractInt $smallDeploymentsPerNamespace 1)}} +{{$mediumDeploymentsPerNamespace := MaxInt 0 (SubtractInt $mediumDeploymentsPerNamespace 1)}} +{{$bigDeploymentsPerNamespace := MaxInt 0 (SubtractInt $bigDeploymentsPerNamespace 1)}} + +# Disable big jobs on small clusters. +{{$bigJobsPerNamespace := IfThenElse $IS_SMALL_CLUSTER 0 1}} + +# The minimal number of pods to be used to measure various things like +# pod-startup-latency or scheduler-throughput. The purpose of it is to avoid +# problems in small clusters where we wouldn't have enough samples (pods) to +# measure things accurately. 
+{{$MIN_PODS_IN_SMALL_CLUSTERS := 500}} + +# BEGIN scheduler-throughput section +# TODO( https://github.com/kubernetes/perf-tests/issues/1027): Lower the number of "min-pods" once we fix the scheduler throughput measurement. +{{$totalSchedulerThroughputPods := MaxInt (MultiplyInt 2 $MIN_PODS_IN_SMALL_CLUSTERS) .Nodes}} +# Determines number of pods per deployment. Should be a divider of $totalSchedulerThroughputPods. +{{$schedulerThroughputPodsPerDeployment := DefaultParam .CL2_SCHEDULER_THROUGHPUT_PODS_PER_DEPLOYMENT $totalSchedulerThroughputPods}} +{{$schedulerThroughputNamespaces := DivideInt $totalSchedulerThroughputPods $schedulerThroughputPodsPerDeployment}} + +# Set schedulerThroughputNamespaces to 1 on small clusters otherwise it will result +# in an unnecessary number of namespaces. +{{$schedulerThroughputNamespaces := IfThenElse $IS_SMALL_CLUSTER 1 $schedulerThroughputNamespaces}} +# END scheduler-throughput section + +# Command to be executed +{{$EXEC_COMMAND := DefaultParam .CL2_EXEC_COMMAND nil}} +{{$EXIT_AFTER_EXEC := DefaultParam .CL2_EXIT_AFTER_EXEC false}} +{{$EXEC_TIMEOUT := DefaultParam .CL2_EXEC_TIMEOUT "3600s"}} +{{$SLEEP_AFTER_EXEC_DURATION := DefaultParam .CL2_SLEEP_AFTER_EXEC_DURATION "0s"}} + +{{$registry := DefaultParam .CL2_LATENCY_POD_REGISTRY "registry.k8s.io"}} +{{$latencyPodImage := DefaultParam .CL2_LATENCY_POD_IMAGE (Concat $registry "/pause:3.9")}} +{{$defaultQps := DefaultParam .CL2_DEFAULT_QPS (IfThenElse (le .Nodes 500) 10 100)}} + +{{$ADDITIONAL_MEASUREMENT_MODULES := DefaultParam .CL2_ADDITIONAL_MEASUREMENT_MODULES nil}} +{{$ADDITIONAL_PHASES_MODULES := DefaultParam .CL2_ADDITIONAL_PHASES_MODULES nil}} + +name: load +namespace: + number: {{AddInt $namespaces $schedulerThroughputNamespaces}} +tuningSets: +- name: Sequence + parallelismLimitedLoad: + parallelismLimit: 1 +# TODO(https://github.com/kubernetes/perf-tests/issues/1024): This TuningSet is used only for pod-startup-latency, get rid of it +# Uniform5qps: for each running phase, use 5 qps. +- name: Uniform5qps + qpsLoad: + qps: 50000 + burst: 10000 +# default is a tuningset that is meant to be used when we don't have any specific requirements on pace of operations. +- name: default + globalQPSLoad: + qps: {{$defaultQps}} + burst: 10000 +- name: RandomizedSaturationTimeLimited + RandomizedTimeLimitedLoad: + timeLimit: {{$saturationTime}}s +- name: RandomizedScalingTimeLimited + RandomizedTimeLimitedLoad: + # The expected number of created/deleted pods is totalPods/4 when scaling, + # as each RS changes its size from X to a uniform random value in [X/2, 3X/2]. + # To match 10 [pods/s] requirement, we need to divide saturationTime by 4. 
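+    # For illustration, with the defaults on a 100-node cluster (30 pods/node,
+    # 10 pods/s): totalPods = 3000 and saturationTime = 300s, so this scaling
+    # phase is limited to 300/4 = 75s.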
+ timeLimit: {{DivideInt $saturationTime 4}}s +- name: RandomizedDeletionTimeLimited + RandomizedTimeLimitedLoad: + timeLimit: {{$deletionTime}}s +{{if $ENABLE_CHAOSMONKEY}} +chaosMonkey: + nodeFailure: + failureRate: 0.01 + interval: 5m + jitterFactor: 2.0 + simulatedDowntime: 10m +{{end}} +steps: +- module: + path: /modules/measurements.yaml + params: + action: start + +{{if $ADDITIONAL_MEASUREMENT_MODULES}} +{{range $ADDITIONAL_MEASUREMENT_MODULES}} +- module: + path: {{.}} + params: + action: start +{{end}} +{{end}} + +{{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}} +- module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + setup: true + run: true + testType: "pod-creation" +{{end}} + +- module: + path: modules/services.yaml + params: + actionName: "Creating" + namespaces: {{$namespaces}} + smallServicesPerNamespace: {{DivideInt (AddInt $smallDeploymentsPerNamespace 1) 2}} + mediumServicesPerNamespace: {{DivideInt (AddInt $mediumDeploymentsPerNamespace 1) 2}} + bigServicesPerNamespace: {{DivideInt (AddInt $bigDeploymentsPerNamespace 1) 2}} + +- name: Creating PriorityClass for DaemonSets + phases: + - replicasPerNamespace: 1 + tuningSet: Sequence + objectBundle: + - basename: daemonset-priorityclass + objectTemplatePath: daemonset-priorityclass.yaml + +# Moved from reconcile-objects.yaml to mitigate https://github.com/kubernetes/kubernetes/issues/96635. +# TODO(https://github.com/kubernetes/perf-tests/issues/1823): Merge back to reconcile-objects.yaml once the k/k bug is fixed. +- module: + path: /modules/configmaps-secrets.yaml + params: + actionName: create + tuningSet: default + namespaces: {{$namespaces}} + bigDeploymentsPerNamespace: {{$bigDeploymentsPerNamespace}} + mediumDeploymentsPerNamespace: {{$mediumDeploymentsPerNamespace}} + smallDeploymentsPerNamespace: {{$smallDeploymentsPerNamespace}} + +- module: + path: /modules/reconcile-objects.yaml + params: + actionName: "create" + namespaces: {{$namespaces}} + {{if $RATE_LIMIT_POD_CREATION}} + tuningSet: RandomizedSaturationTimeLimited + operationTimeout: 15m + {{else}} + tuningSet: default + operationTimeout: {{AddInt $saturationTime 900}}s + {{end}} + testMaxReplicaFactor: {{$RANDOM_SCALE_FACTOR}} + # We rely on the fact that daemonset is using the same image as the 'pod-startup-latency' module. + # The goal is to cache the image to all nodes before we start any latency pod, + # so that when we measure pod startup latency, the image is already present on all nodes. + # This way, the pod startup latency we measure excludes (or limits impact) of image pulling, + # whuch matches our SLO definition: https://github.com/kubernetes/community/blob/master/sig-scalability/slos/pod_startup_latency.md. 
+ daemonSetImage: {{$latencyPodImage}} + deploymentImage: {{$latencyPodImage}} + jobImage: {{$latencyPodImage}} + statefulSetImage: {{$latencyPodImage}} + daemonSetEnv: "before update" + daemonSetReplicas: 1 + bigDeploymentSize: {{$BIG_GROUP_SIZE}} + bigDeploymentsPerNamespace: {{$bigDeploymentsPerNamespace}} + mediumDeploymentSize: {{$MEDIUM_GROUP_SIZE}} + mediumDeploymentsPerNamespace: {{$mediumDeploymentsPerNamespace}} + smallDeploymentSize: {{$SMALL_GROUP_SIZE}} + smallDeploymentsPerNamespace: {{$smallDeploymentsPerNamespace}} + smallStatefulSetSize: {{$SMALL_GROUP_SIZE}} + smallStatefulSetsPerNamespace: {{$SMALL_STATEFUL_SETS_PER_NAMESPACE}} + mediumStatefulSetSize: {{$MEDIUM_GROUP_SIZE}} + mediumStatefulSetsPerNamespace: {{$MEDIUM_STATEFUL_SETS_PER_NAMESPACE}} + bigJobSize: {{$BIG_GROUP_SIZE}} + bigJobsPerNamespace: {{$bigJobsPerNamespace}} + mediumJobSize: {{$MEDIUM_GROUP_SIZE}} + mediumJobsPerNamespace: 1 + smallJobSize: {{$SMALL_GROUP_SIZE}} + smallJobsPerNamespace: 1 + +{{if $ADDITIONAL_PHASES_MODULES}} +{{range $ADDITIONAL_PHASES_MODULES}} +- module: + path: {{.}} + params: + action: "create" +{{end}} +{{end}} + +{{if not $IS_SMALL_CLUSTER}} +# BEGIN scheduler throughput +- module: + path: modules/scheduler-throughput.yaml + params: + action: create + namespaces: {{$namespaces}} + replicasPerNamespace: 1 + schedulerThroughputNamespaces: {{$schedulerThroughputNamespaces}} + schedulerThroughputPodsPerDeployment: {{$schedulerThroughputPodsPerDeployment}} + deploymentImage: {{$latencyPodImage}} +{{end}} + +- module: + path: modules/dns-k8s-hostnames.yaml + +{{if $EXEC_COMMAND}} + +{{if $ENABLE_API_AVAILABILITY_MEASUREMENT}} +- name: Pausing APIAvailability measurement + measurements: + - Identifier: APIAvailability + Method: APIAvailability + Params: + action: pause +{{end}} + +- name: Exec command + measurements: + - Identifier: ExecCommand + Method: Exec + Params: + timeout: {{$EXEC_TIMEOUT}} + command: + {{range $EXEC_COMMAND}} + - {{.}} + {{end}} + +{{if $ENABLE_API_AVAILABILITY_MEASUREMENT}} +- name: Unpausing APIAvailability measurement + measurements: + - Identifier: APIAvailability + Method: APIAvailability + Params: + action: unpause +{{end}} + +- name: Sleep + measurements: + - Identifier: WaitAfterExec + Method: Sleep + Params: + duration: {{$SLEEP_AFTER_EXEC_DURATION}} +{{end}} + +{{if not $EXIT_AFTER_EXEC}} + +{{if not $IS_SMALL_CLUSTER}} +- module: + path: modules/scheduler-throughput.yaml + params: + action: delete + namespaces: {{$namespaces}} + replicasPerNamespace: 0 + schedulerThroughputNamespaces: {{$schedulerThroughputNamespaces}} + schedulerThroughputPodsPerDeployment: {{$schedulerThroughputPodsPerDeployment}} +# END scheduler throughput +{{end}} + +{{if not $IS_SMALL_CLUSTER}} +# TODO(kubernetes/perf-tests/issues/1024): We shouldn't have a dedicated module for measuring pod-startup-latency. 
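+# For illustration, on a 5000-node cluster with the defaults, namespaces = 50,
+# so the module below creates Ceil(500/50) = 10 single-pod latency deployments
+# per namespace, i.e. 500 latency pods in total.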
+- module: + path: modules/pod-startup-latency.yaml + params: + namespaces: {{$namespaces}} + minPodsInSmallCluster: {{$MIN_PODS_IN_SMALL_CLUSTERS}} + image: {{$latencyPodImage}} +{{end}} + +{{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}} +- module: + path: modules/network-policy/net-policy-metrics.yaml + params: + action: gather + usePolicyCreationMetrics: false + +- module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + complete: true + testType: "pod-creation" + +- module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + run: true + testType: "policy-creation" +{{end}} + +- module: + path: /modules/reconcile-objects.yaml + params: + actionName: "scale and update" + namespaces: {{$namespaces}} + {{if $RATE_LIMIT_POD_CREATION}} + tuningSet: RandomizedScalingTimeLimited + operationTimeout: 15m + {{else}} + tuningSet: default + operationTimeout: {{AddInt (DivideInt $saturationTime 4) 900}}s + {{end}} + randomScaleFactor: {{$RANDOM_SCALE_FACTOR}} + testMaxReplicaFactor: {{$RANDOM_SCALE_FACTOR}} + daemonSetImage: {{$latencyPodImage}} + deploymentImage: {{$latencyPodImage}} + jobImage: {{$latencyPodImage}} + statefulSetImage: {{$latencyPodImage}} + daemonSetEnv: "after update" + daemonSetReplicas: 1 + bigDeploymentSize: {{$BIG_GROUP_SIZE}} + bigDeploymentsPerNamespace: {{$bigDeploymentsPerNamespace}} + mediumDeploymentSize: {{$MEDIUM_GROUP_SIZE}} + mediumDeploymentsPerNamespace: {{$mediumDeploymentsPerNamespace}} + smallDeploymentSize: {{$SMALL_GROUP_SIZE}} + smallDeploymentsPerNamespace: {{$smallDeploymentsPerNamespace}} + smallStatefulSetSize: {{$SMALL_GROUP_SIZE}} + smallStatefulSetsPerNamespace: {{$SMALL_STATEFUL_SETS_PER_NAMESPACE}} + mediumStatefulSetSize: {{$MEDIUM_GROUP_SIZE}} + mediumStatefulSetsPerNamespace: {{$MEDIUM_STATEFUL_SETS_PER_NAMESPACE}} + bigJobSize: {{$BIG_GROUP_SIZE}} + bigJobsPerNamespace: {{$bigJobsPerNamespace}} + mediumJobSize: {{$MEDIUM_GROUP_SIZE}} + mediumJobsPerNamespace: 1 + smallJobSize: {{$SMALL_GROUP_SIZE}} + smallJobsPerNamespace: 1 + +{{if $ADDITIONAL_PHASES_MODULES}} +{{range $ADDITIONAL_PHASES_MODULES}} +- module: + path: {{.}} + params: + action: "scale and update" +{{end}} +{{end}} + +- module: + path: /modules/reconcile-objects.yaml + params: + actionName: "delete" + namespaces: {{$namespaces}} + {{if $RATE_LIMIT_POD_CREATION}} + tuningSet: RandomizedDeletionTimeLimited + operationTimeout: 15m + {{else}} + tuningSet: default + operationTimeout: {{AddInt $deletionTime 900}}s + {{end}} + testMaxReplicaFactor: {{$RANDOM_SCALE_FACTOR}} + daemonSetImage: {{$latencyPodImage}} + deploymentImage: {{$latencyPodImage}} + jobImage: {{$latencyPodImage}} + statefulSetImage: {{$latencyPodImage}} + daemonSetReplicas: 0 + bigDeploymentSize: {{$BIG_GROUP_SIZE}} + bigDeploymentsPerNamespace: 0 + mediumDeploymentSize: {{$MEDIUM_GROUP_SIZE}} + mediumDeploymentsPerNamespace: 0 + smallDeploymentSize: {{$SMALL_GROUP_SIZE}} + smallDeploymentsPerNamespace: 0 + smallStatefulSetSize: {{$SMALL_GROUP_SIZE}} + smallStatefulSetsPerNamespace: 0 + mediumStatefulSetSize: {{$MEDIUM_GROUP_SIZE}} + mediumStatefulSetsPerNamespace: 0 + bigJobSize: {{$BIG_GROUP_SIZE}} + bigJobsPerNamespace: 0 + mediumJobSize: {{$MEDIUM_GROUP_SIZE}} + mediumJobsPerNamespace: 0 + smallJobSize: {{$SMALL_GROUP_SIZE}} + smallJobsPerNamespace: 0 + pvSmallStatefulSetSize: {{$SMALL_STATEFUL_SETS_PER_NAMESPACE}} + pvMediumStatefulSetSize: {{$MEDIUM_STATEFUL_SETS_PER_NAMESPACE}} + +{{if $ADDITIONAL_PHASES_MODULES}} +{{range 
$ADDITIONAL_PHASES_MODULES}} +- module: + path: {{.}} + params: + action: "delete" +{{end}} +{{end}} + +- module: + path: /modules/configmaps-secrets.yaml + params: + actionName: delete + tuningSet: default + namespaces: {{$namespaces}} + bigDeploymentsPerNamespace: 0 + mediumDeploymentsPerNamespace: 0 + smallDeploymentsPerNamespace: 0 + +- name: Deleting PriorityClass for DaemonSets + phases: + - replicasPerNamespace: 0 + tuningSet: Sequence + objectBundle: + - basename: daemonset-priorityclass + objectTemplatePath: daemonset-priorityclass.yaml + +- module: + path: modules/services.yaml + params: + actionName: "Deleting" + namespaces: {{$namespaces}} + smallServicesPerNamespace: 0 + mediumServicesPerNamespace: 0 + bigServicesPerNamespace: 0 +{{end}} # not EXIT_AFTER_EXEC + +- module: + path: /modules/measurements.yaml + params: + action: gather + +{{if $ADDITIONAL_MEASUREMENT_MODULES}} +{{range $ADDITIONAL_MEASUREMENT_MODULES}} +- module: + path: {{.}} + params: + action: gather +{{end}} +{{end}} + +{{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}} +- module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + complete: true + testType: "policy-creation" +{{end}} diff --git a/test/clusterloader2/testing/load/configmap.yaml b/test/clusterloader2/testing/load/configmap.yaml new file mode 100644 index 00000000000..a8386140e5c --- /dev/null +++ b/test/clusterloader2/testing/load/configmap.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{.Name}} +{{if not (eq (Mod .Index 20) 0 19) }} # .Index % 20 in {0,19} - only 10% deployments will have non-immutable ConfigMap. +immutable: true +{{end}} +# Every pod that needs its own configmap entry should be unconditionally +# added below. That allows us to avoid complicating it with ifs. +data: + # all-queries is used by DNS tests. Since dnsperfgo counts NXDOMAINs as errors, this config contains only valid names. + # kubernetes.default results in 4 queries - kubernetes.default.default.svc.cluster.local(A, AAAA), kubernetes.default.svc.cluster.local(A, AAAA) + # metadata.google.internal is subject to 5 searchpaths(A, AAAA) + original query (A, AAAA) - 12 queries. + # google.com also results in 12 queries. + all-queries: | + kubernetes.default + metadata.google.internal + google.com diff --git a/test/clusterloader2/testing/load/daemonset-priorityclass.yaml b/test/clusterloader2/testing/load/daemonset-priorityclass.yaml new file mode 100644 index 00000000000..e264a740d56 --- /dev/null +++ b/test/clusterloader2/testing/load/daemonset-priorityclass.yaml @@ -0,0 +1,9 @@ +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: {{.Name}} +value: 1000000 +globalDefault: false +description: "Designated priority class to be used for DaemonSet pods. This is + to make sure they have higher priority than other test pods and there is always + place for them on each node, see kubernetes/kubernetes#82818." 
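+# CL2 appends an index to the basename, so the object rendered from this
+# template is named daemonset-priorityclass-0, which is the name referenced by
+# priorityClassName in daemonset.yaml below.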
diff --git a/test/clusterloader2/testing/load/daemonset.yaml b/test/clusterloader2/testing/load/daemonset.yaml new file mode 100644 index 00000000000..5c693976935 --- /dev/null +++ b/test/clusterloader2/testing/load/daemonset.yaml @@ -0,0 +1,56 @@ +{{$HostNetworkMode := DefaultParam .CL2_USE_HOST_NETWORK_PODS false}} +{{$Image := DefaultParam .Image "registry.k8s.io/pause:3.9"}} +{{$Env := DefaultParam .Env ""}} +{{$DaemonSetSurge := DefaultParam .CL2_DS_SURGE (MaxInt 10 (DivideInt .Nodes 20))}} # 5% of nodes, but not less than 10 +{{$RUN_ON_ARM_NODES := DefaultParam .CL2_RUN_ON_ARM_NODES false}} + +{{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE "net-policy-client"}} + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{.Name}} + labels: + group: load +spec: + updateStrategy: + rollingUpdate: + maxUnavailable: {{$DaemonSetSurge}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: load + name: {{.Name}} + spec: + hostNetwork: {{$HostNetworkMode}} + containers: + - name: {{.Name}} + image: {{$Image}} + env: + - name: TEST_ENV + value: {{$Env}} + resources: + # Keep the CpuRequest/MemoryRequest request equal percentage of 1-core, 4GB node. + # For now we're setting it to 0.5%. + requests: + cpu: 5m + memory: "20M" + priorityClassName: daemonset-priorityclass-0 # Name is autogenerated, hence the -0 prefix. + terminationGracePeriodSeconds: 1 + tolerations: + {{if $RUN_ON_ARM_NODES}} + - key: "kubernetes.io/arch" + operator: Equal + value: arm64 + effect: NoSchedule + {{end}} + {{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}} + - key: test-np + operator: Equal + value: {{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE}} + effect: NoSchedule + {{end}} diff --git a/test/clusterloader2/testing/load/deployment.yaml b/test/clusterloader2/testing/load/deployment.yaml new file mode 100644 index 00000000000..f665c09b7c2 --- /dev/null +++ b/test/clusterloader2/testing/load/deployment.yaml @@ -0,0 +1,114 @@ +{{$HostNetworkMode := DefaultParam .CL2_USE_HOST_NETWORK_PODS false}} +# Keep the CpuRequest/MemoryRequest request equal percentage of 1-core, 4GB node. +# For now we're setting it to 0.5%. +{{$CpuRequest := DefaultParam .CpuRequest "5m"}} +{{$MemoryRequest := DefaultParam .MemoryRequest "20M"}} +{{$dnsQPSPerClient := DefaultParam .CL2_DNS_QPS_PER_CLIENT 1}} +# Guard the new DNS tests. Remove it once it's confirmed that it works on a subset of tests. +{{$USE_ADVANCED_DNSTEST := DefaultParam .CL2_USE_ADVANCED_DNSTEST false}} +{{$RUN_ON_ARM_NODES := DefaultParam .CL2_RUN_ON_ARM_NODES false}} + +{{$EnableNetworkPolicyEnforcementLatencyTest := DefaultParam .EnableNetworkPolicyEnforcementLatencyTest false}} +{{$TargetLabelValue := DefaultParam .TargetLabelValue "enforcement-latency"}} +# Run a server pod for network policy enforcement latency test only on every Nth pod. +# Default every third pod. 
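+# For illustration: with the default of 3, and the enforcement latency test
+# enabled, deployments whose .Index is 0, 3, 6, ... run an nginx server
+# (port 80) and carry the policy-target label instead of the pause container.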
+{{$NetPolServerOnEveryNthPod := DefaultParam .NetPolServerOnEveryNthPod 3}} +{{$RunNetPolicyTest := and $EnableNetworkPolicyEnforcementLatencyTest (eq (Mod .Index $NetPolServerOnEveryNthPod) 0)}} + +{{$Image := DefaultParam .Image "registry.k8s.io/pause:3.9"}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: load + svc: {{.SvcName}}-{{.Index}} +spec: + replicas: {{RandIntRange .ReplicasMin .ReplicasMax}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: load + name: {{.Name}} + svc: {{.SvcName}}-{{.Index}} +{{if and .EnableDNSTests $USE_ADVANCED_DNSTEST}} + dns-test: dnsperfgo +{{else}} + {{if $RunNetPolicyTest}} + net-pol-test: {{$TargetLabelValue}} + {{end}} +{{end}} + spec: + hostNetwork: {{$HostNetworkMode}} + containers: +{{if .EnableDNSTests}} +{{if $USE_ADVANCED_DNSTEST}} + - image: gcr.io/k8s-staging-perf-tests/dnsperfgo:v1.4.0 + ports: + - containerPort: 9153 + name: dnsperfmetrics + protocol: TCP + {{else}} + - image: gcr.io/k8s-staging-perf-tests/dnsperfgo:v1.2.0 + {{end}} + # Fetches the dns server from /etc/resolv.conf and + # sends 1 query per second. + # With searchpath expansion, this is upto 12 queries per second. + # dnsperfgo has a default client timeout of 5s. It sends queries for 60s, + # then sleeps for 10s, to mimic bursts of DNS queries. + command: + - sh + - -c + - server=$(cat /etc/resolv.conf | grep nameserver | cut -d ' ' -f 2); echo + "Using nameserver ${server}"; + ./dnsperfgo -duration 60s -idle-duration 10s -inputfile /var/configmap/all-queries -qps {{$dnsQPSPerClient}}; + name: {{.Name}}-dnsperf +{{else}} + {{if $RunNetPolicyTest}} + - image: nginx + name: nginx-server + ports: + - containerPort: 80 + {{else}} + - image: {{$Image}} + name: {{.Name}} + {{end}} +{{end}} + resources: + requests: + cpu: {{$CpuRequest}} + memory: {{$MemoryRequest}} + volumeMounts: + - name: configmap + mountPath: /var/configmap + - name: secret + mountPath: /var/secret + terminationGracePeriodSeconds: 1 + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + {{if $RUN_ON_ARM_NODES}} + - key: "kubernetes.io/arch" + operator: Equal + value: arm64 + effect: NoSchedule + {{end}} + volumes: + - name: configmap + configMap: + name: {{.BaseName}}-{{.Index}} + - name: secret + secret: + secretName: {{.BaseName}}-{{.Index}} diff --git a/test/clusterloader2/testing/load/golang/custom_api_call_thresholds.yaml b/test/clusterloader2/testing/load/golang/custom_api_call_thresholds.yaml new file mode 100644 index 00000000000..a5a35f264ca --- /dev/null +++ b/test/clusterloader2/testing/load/golang/custom_api_call_thresholds.yaml @@ -0,0 +1,11 @@ +CUSTOM_API_CALL_THRESHOLDS: | + - verb: PUT + resource: leases + subresource: '' + scope: namespace + threshold: 500ms + - verb: DELETE + resource: pods + subresource: '' + scope: namespace + threshold: 700ms diff --git a/test/clusterloader2/testing/load/job.yaml b/test/clusterloader2/testing/load/job.yaml new file mode 100644 index 00000000000..338dc6f4837 --- /dev/null +++ b/test/clusterloader2/testing/load/job.yaml @@ -0,0 +1,54 @@ +{{$HostNetworkMode := DefaultParam .CL2_USE_HOST_NETWORK_PODS false}} +{{$RUN_ON_ARM_NODES := DefaultParam .CL2_RUN_ON_ARM_NODES false}} +{{$Image := DefaultParam .Image "registry.k8s.io/pause:3.9"}} + +apiVersion: batch/v1 +kind: Job +metadata: + name: {{.Name}} + labels: + group: load +spec: + backoffLimit: {{.ReplicasMin}} + manualSelector: true + parallelism: {{RandIntRange .ReplicasMin .ReplicasMax}} + completions: {{.Completions}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: load + name: {{.Name}} + spec: + hostNetwork: {{$HostNetworkMode}} + containers: + - name: {{.Name}} + # TODO(#799): We should test the "run-to-completion" workflow and hence don't use pause pods. + image: {{$Image}} + resources: + # Keep the CpuRequest/MemoryRequest request equal percentage of 1-core, 4GB node. + # For now we're setting it to 0.5%. + requests: + cpu: 5m + memory: "20M" + restartPolicy: OnFailure + terminationGracePeriodSeconds: 1 + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + {{if $RUN_ON_ARM_NODES}} + - key: "kubernetes.io/arch" + operator: Equal + value: arm64 + effect: NoSchedule + {{end}} diff --git a/test/clusterloader2/testing/load/modules/configmaps-secrets.yaml b/test/clusterloader2/testing/load/modules/configmaps-secrets.yaml new file mode 100644 index 00000000000..1aac578af31 --- /dev/null +++ b/test/clusterloader2/testing/load/modules/configmaps-secrets.yaml @@ -0,0 +1,45 @@ +# To mitigate https://github.com/kubernetes/kubernetes/issues/96635 we have to delete +# configmaps and secrets only AFTER the pods using that configmap/secret are deleted. +# TODO(mborsz): Merge this back to reconcile-objects.yaml once the issue above is fixed. 
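+# For reference, config.yaml invokes this module with the real per-namespace
+# counts before the pod-creating reconcile-objects phase, and with counts of 0
+# only after the reconcile-objects delete phase has removed the pods that
+# mount these objects.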
+ +{{$actionName := printf "%s objects" .actionName}} +{{$namespaces := .namespaces}} +{{$tuningSet := .tuningSet}} + +{{$bigDeploymentsPerNamespace := .bigDeploymentsPerNamespace}} +{{$mediumDeploymentsPerNamespace := .mediumDeploymentsPerNamespace}} +{{$smallDeploymentsPerNamespace := .smallDeploymentsPerNamespace}} + +steps: +- name: {{$actionName}} configmaps and secrets + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$bigDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: big-deployment + objectTemplatePath: configmap.yaml + - basename: big-deployment + objectTemplatePath: secret.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$mediumDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: medium-deployment + objectTemplatePath: configmap.yaml + - basename: medium-deployment + objectTemplatePath: secret.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$smallDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: small-deployment + objectTemplatePath: configmap.yaml + - basename: small-deployment + objectTemplatePath: secret.yaml diff --git a/test/clusterloader2/testing/load/modules/dns-k8s-hostnames.yaml b/test/clusterloader2/testing/load/modules/dns-k8s-hostnames.yaml new file mode 100644 index 00000000000..5d5289f7e73 --- /dev/null +++ b/test/clusterloader2/testing/load/modules/dns-k8s-hostnames.yaml @@ -0,0 +1,29 @@ +{{$ENABLE_DNSTESTS := DefaultParam .CL2_ENABLE_DNSTESTS false}} +# Guard the new DNS tests. Remove it once it's confirmed that it works on a subset of tests. +{{$USE_ADVANCED_DNSTEST := DefaultParam .CL2_USE_ADVANCED_DNSTEST false}} + +# The number of DNS client pods for every 100 nodes. +{{$dnsClientPodsFactor := DefaultParam .CL2_DNS_K8S_HOSTNAMES_CLIENT_PODS_FACTOR 1}} +# Create 5 DNS client pods, plus additional based on the number of nodes. +{{$dnsClientPods := AddInt 5 (MultiplyInt $dnsClientPodsFactor (DivideInt .Nodes 100))}} +{{$qpsPerClient := DefaultParam .CL2_DNS_K8S_HOSTNAMES_PER_CLIENT_QPS 10}} +{{$testDurationMinutes := DefaultParam .CL2_DNS_K8S_HOSTNAMES_TEST_MINUTES 10}} + +{{if and $ENABLE_DNSTESTS $USE_ADVANCED_DNSTEST}} +steps: +- name: Run DNS performance test for k8s hostnames + measurements: + - Identifier: DNSPerformanceK8sHostnames + Method: DNSPerformanceK8sHostnames + Params: + podReplicas: {{$dnsClientPods}} + qpsPerClient: {{$qpsPerClient}} + testDurationMinutes: {{$testDurationMinutes}} + +- name: Wait 1m for DNS test to complete + measurements: + - Identifier: Wait + Method: Sleep + Params: + duration: 1m +{{end}} diff --git a/test/clusterloader2/testing/load/modules/dns-performance-metrics.yaml b/test/clusterloader2/testing/load/modules/dns-performance-metrics.yaml new file mode 100644 index 00000000000..a8297385278 --- /dev/null +++ b/test/clusterloader2/testing/load/modules/dns-performance-metrics.yaml @@ -0,0 +1,40 @@ +# Valid actions: "start", "gather" +{{$action := .action}} + +{{$ENABLE_DNSTESTS := DefaultParam .CL2_ENABLE_DNSTESTS false}} +# Guard the new DNS tests. Remove it once it's confirmed that it works on a subset of tests. +{{$USE_ADVANCED_DNSTEST := DefaultParam .CL2_USE_ADVANCED_DNSTEST false}} +# DNS test threshold parameters. 
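+# The lookup-latency thresholds below are in seconds (the measurement unit is
+# "s"); the error threshold is a percentage, since its query multiplies the
+# error ratio by 100.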
+{{$DNS_ERROR_PERC_THRESHOLD := DefaultParam .CL2_DNS_ERROR_PERC_THRESHOLD 0.1}} +{{$DNS_LOOKUP_LATENCY_50_THRESHOLD := DefaultParam .CL2_DNS_LOOKUP_LATENCY_50_THRESHOLD 0.02}} +{{$DNS_LOOKUP_LATENCY_99_THRESHOLD := DefaultParam .CL2_DNS_LOOKUP_LATENCY_99_THRESHOLD 0.1}} + +{{if and $ENABLE_DNSTESTS $USE_ADVANCED_DNSTEST}} +steps: +- name: "{{$action}}ing measurements" + measurements: + - Identifier: DNSPerformanceMetrics + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: DNS Performance + metricVersion: v1 + unit: s + enableViolations: true + queries: + - name: DNS Lookup Count + query: sum(increase(dns_lookups_total[%v])) + - name: DNS Timeout Count + query: sum(increase(dns_timeouts_total[%v])) + - name: DNS Error Count + query: sum(increase(dns_errors_total[%v])) + - name: DNS Error Percentage + query: sum(increase(dns_errors_total[%v])) / sum(increase(dns_lookups_total[%v])) * 100 + threshold: {{$DNS_ERROR_PERC_THRESHOLD}} + - name: DNS Lookup Latency - Perc50 + query: histogram_quantile(0.5, sum(rate(dns_lookup_latency_bucket[%v])) by (le)) + threshold: {{$DNS_LOOKUP_LATENCY_50_THRESHOLD}} + - name: DNS Lookup Latency - Perc99 + query: histogram_quantile(0.99, sum(rate(dns_lookup_latency_bucket[%v])) by (le)) + threshold: {{$DNS_LOOKUP_LATENCY_99_THRESHOLD}} +{{end}} diff --git a/test/clusterloader2/testing/load/modules/measurements.yaml b/test/clusterloader2/testing/load/modules/measurements.yaml new file mode 100644 index 00000000000..ac5b0a298b6 --- /dev/null +++ b/test/clusterloader2/testing/load/modules/measurements.yaml @@ -0,0 +1,260 @@ +## Measurement module defines test scoped measurement. + +## Input params +# Valid actions: "start", "gather" +{{$action := .action}} + +## Feature-gates and configs: +{{$ALLOWED_SLOW_API_CALLS := DefaultParam .CL2_ALLOWED_SLOW_API_CALLS 0}} +{{$API_AVAILABILITY_PERCENTAGE_THRESHOLD := DefaultParam .CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD 99.5}} +{{$CLUSTER_OOMS_IGNORED_PROCESSES := DefaultParam .CL2_CLUSTER_OOMS_IGNORED_PROCESSES ""}} +{{$CUSTOM_API_CALL_THRESHOLDS := DefaultParam .CUSTOM_API_CALL_THRESHOLDS ""}} +{{$ENABLE_API_AVAILABILITY_MEASUREMENT := DefaultParam .CL2_ENABLE_API_AVAILABILITY_MEASUREMENT false}} +{{$API_AVAILABILITY_MEASUREMENT_IPS_CONFIGURED := DefaultParam .CL2_API_AVAILABILITY_MEASUREMENT_IPS_CONFIGURED false}} +{{$API_AVAILABILITY_MEASUREMENT_USE_INTERNAL_IPS := DefaultParam .CL2_API_AVAILABILITY_MEASUREMENT_USE_INTERNAL_IPS false}} +{{$API_AVAILABILITY_MEASUREMENT_USE_PUBLIC_IPS := DefaultParam .CL2_API_AVAILABILITY_MEASUREMENT_USE_PUBLIC_IPS false}} +{{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}} +{{$ENABLE_SLO_MEASUREMENT := DefaultParam .CL2_ENABLE_SLO_MEASUREMENT true}} +{{$ENABLE_CLUSTER_OOMS_TRACKER := DefaultParam .CL2_ENABLE_CLUSTER_OOMS_TRACKER true}} +{{$ENABLE_NODE_LOCAL_DNS_LATENCY := DefaultParam .CL2_ENABLE_NODE_LOCAL_DNS_LATENCY false}} +{{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK true}} +{{$ENABLE_SYSTEM_POD_METRICS:= DefaultParam .ENABLE_SYSTEM_POD_METRICS true}} +{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS false}} +{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}} +{{$ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT := DefaultParam .CL2_ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT false}} +{{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam 
.CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}} +{{$CEP_PROPAGATION_DELAY_SLO_BUCKET := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_BUCKET 600}} +{{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_PERCENTILE 95.0}} +{{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}} +{{$ENABLE_CONTAINER_RESTARTS_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESTARTS_MEASUREMENT false}} +{{$ENABLE_CONTAINER_RESOURCES_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESOURCES_MEASUREMENT false}} +{{$ENABLE_TERMINATED_WATCHES_MEASUREMENT := DefaultParam .CL2_ENABLE_TERMINATED_WATCHES_MEASUREMENT false}} +{{$ENABLE_QUOTAS_USAGE_MEASUREMENT := DefaultParam .CL2_ENABLE_QUOTAS_USAGE_MEASUREMENT false}} +{{$ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_ALLOWED_CONTAINER_RESTARTS 1}} +{{$CUSTOM_ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_CUSTOM_ALLOWED_CONTAINER_RESTARTS ""}} +{{$NODE_LOCAL_DNS_LATENCY_THRESHOLD := DefaultParam .CL2_NODE_LOCAL_DNS_LATENCY_THRESHOLD "5s"}} +{{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}} +{{$PROMETHEUS_SCRAPE_KUBE_STATE_METRICS := DefaultParam .PROMETHEUS_SCRAPE_KUBE_STATE_METRICS false}} +{{$PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS := DefaultParam .PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS false}} +{{$PROBE_MEASUREMENTS_PING_SLEEP_DURATION := DefaultParam .CL2_PROBE_MEASUREMENTS_PING_SLEEP_DURATION "1s"}} +{{$RESTART_COUNT_THRESHOLD_OVERRIDES:= DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}} +{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}} +{{$ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES false}} +{{$NETWORK_PROGRAMMING_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_PROGRAMMING_LATENCY_THRESHOLD "30s"}} +{{$NETWORK_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_LATENCY_THRESHOLD "0s"}} + +# Probe measurements shared parameter +{{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT := DefaultParam .CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT "15m"}} + +steps: +- name: "{{$action}}ing measurements" + measurements: + - Identifier: APIResponsivenessPrometheus + Method: APIResponsivenessPrometheus + Params: + action: {{$action}} +{{if not $USE_SIMPLE_LATENCY_QUERY}} + enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS}} + allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}} + customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}} +{{end}} + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: {{$action}} + enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE}} + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}} + customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}} + - Identifier: CreatePhasePodStartupLatency + Method: PodStartupLatency + Params: + action: {{$action}} + labelSelector: group = load + threshold: 1h # TODO(https://github.com/kubernetes/perf-tests/issues/1024): Ideally, this should be 5s +{{if $ENABLE_IN_CLUSTER_NETWORK_LATENCY}} + - Identifier: InClusterNetworkLatency + Method: InClusterNetworkLatency + Params: + action: {{$action}} + checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}} + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} + pingSleepDuration: {{$PROBE_MEASUREMENTS_PING_SLEEP_DURATION}} + 
threshold: {{$NETWORK_LATENCY_THRESHOLD}} +{{end}} +{{if $ENABLE_NODE_LOCAL_DNS_LATENCY}} + - Identifier: NodeLocalDNSLatency + Method: NodeLocalDNSLatencyPrometheus + Params: + action: {{$action}} + enableViolations: true + threshold: {{$NODE_LOCAL_DNS_LATENCY_THRESHOLD}} +{{end}} +{{if $ENABLE_SLO_MEASUREMENT}} + - Identifier: SLOMeasurement + Method: SLOMeasurement + Params: + action: {{$action}} + checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}} + replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}} +{{end}} +{{if $PROMETHEUS_SCRAPE_KUBE_PROXY}} + - Identifier: NetworkProgrammingLatency + Method: NetworkProgrammingLatency + Params: + action: {{$action}} + enableViolations: {{$ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES}} + threshold: {{$NETWORK_PROGRAMMING_LATENCY_THRESHOLD}} + - Identifier: Kube-proxy partial iptables restore failures + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: KubeProxyIptablesRestoreFailures + metricVersion: v1alpha1 + unit: failures + queries: + - name: Total + query: sum(kubeproxy_sync_proxy_rules_iptables_partial_restore_failures_total) + requireSamples: false # It is a feature gate and may not be enabled + threshold: 0 +{{end}} +{{if $PROMETHEUS_SCRAPE_KUBE_STATE_METRICS}} + - Identifier: KubeStateMetricsLatency + Method: KubeStateMetricsLatency + Params: + action: {{$action}} +{{end}} +{{if $PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS}} + - Identifier: MetricsServerPrometheus + Method: MetricsServerPrometheus + Params: + action: {{$action}} +{{end}} + +{{if $ENABLE_API_AVAILABILITY_MEASUREMENT}} + - Identifier: APIAvailability + Method: APIAvailability + Params: + action: {{$action}} + pollFrequency: "5s" + hostPollTimeoutSeconds: 5 + threshold: {{$API_AVAILABILITY_PERCENTAGE_THRESHOLD}} + {{if $API_AVAILABILITY_MEASUREMENT_IPS_CONFIGURED}} + useHostInternalIPs: {{$API_AVAILABILITY_MEASUREMENT_USE_INTERNAL_IPS}} + useHostPublicIPs: {{$API_AVAILABILITY_MEASUREMENT_USE_PUBLIC_IPS}} + {{end}} +{{end}} +{{if $ENABLE_CONTAINER_RESTARTS_MEASUREMENT}} + - Identifier: ContainerRestarts + Method: ContainerRestarts + Params: + action: {{$action}} + enableViolations: true + defaultAllowedRestarts: {{$ALLOWED_CONTAINER_RESTARTS}} + customAllowedRestarts: {{YamlQuote $CUSTOM_ALLOWED_CONTAINER_RESTARTS 4}} +{{end}} +{{if $ENABLE_CONTAINER_RESOURCES_MEASUREMENT}} + - Identifier: ContainerCPU + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Container CPU + metricVersion: v1 + unit: cores + dimensions: + - container + queries: + - name: Perc99 + query: quantile_over_time(0.99, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) + - name: Perc90 + query: quantile_over_time(0.90, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) + - name: Perc50 + query: quantile_over_time(0.50, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) + - Identifier: ContainerMemory + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Container Memory + metricVersion: v1 + unit: MiB + dimensions: + - container + queries: + - name: Perc99 + query: quantile_over_time(0.99, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:]) + - name: Perc90 + query: quantile_over_time(0.90, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:]) + - name: Perc50 + query: quantile_over_time(0.50, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:]) +{{end}} +{{if 
$ENABLE_TERMINATED_WATCHES_MEASUREMENT}} + - Identifier: TerminatedWatchesMetrics + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Terminated Watches + metricVersion: v1 + dimensions: + - resource + queries: + - name: Terminated watches + query: sum(increase(apiserver_terminated_watchers_total[%v:])) by (resource) + - Identifier: WatchCacheInitializations + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Watch Cache Initializations + metricVersion: v1 + dimensions: + - resource + queries: + - name: Watch cache reinitializations + query: sum(increase(apiserver_watch_cache_initializations_total[%v:])) by (resource) +{{end}} +{{if $ENABLE_QUOTAS_USAGE_MEASUREMENT}} + - Identifier: Quotas total usage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: Quota usage + metricVersion: v1 + prometheusClient: managed + unit: QPMs + dimensions: + - quota_metric + queries: + - name: perc99 + query: quantile_over_time(0.99, sum by (quota_metric) (irate(serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}[1m]))[%v:]) * 60 + - name: max + query: max_over_time(sum by (quota_metric) (irate(serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}[1m]))[%v:]) * 60 +{{end}} +{{if $ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT}} + - Identifier: CiliumEndpointPropagationDelay + Method: CiliumEndpointPropagationDelay + Params: + action: {{$action}} + bucketSLO: {{$CEP_PROPAGATION_DELAY_SLO_BUCKET}} + percentileSLO: {{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE}} + enableViolations: true +{{end}} + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: {{$action}} + systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}} + clusterOOMsIgnoredProcesses: {{YamlQuote $CLUSTER_OOMS_IGNORED_PROCESSES 4}} + clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}} + restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}} + enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}} +- module: + path: modules/dns-performance-metrics.yaml + params: + action: {{$action}} + +{{if $ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}} +- module: + path: modules/network-policy/net-policy-metrics.yaml + params: + action: {{$action}} +{{end}} diff --git a/test/clusterloader2/testing/load/modules/network-policy/net-policy-enforcement-latency.yaml b/test/clusterloader2/testing/load/modules/network-policy/net-policy-enforcement-latency.yaml new file mode 100644 index 00000000000..5f0be5af3bd --- /dev/null +++ b/test/clusterloader2/testing/load/modules/network-policy/net-policy-enforcement-latency.yaml @@ -0,0 +1,55 @@ +{{$NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE := DefaultParam .CL2_NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE false}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY "net-pol-test"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE "enforcement-latency"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE "net-policy-client"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS 100}} +{{$NET_POLICY_ENFORCEMENT_LOAD_COUNT := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LOAD_COUNT 1000}} +{{$NET_POLICY_ENFORCEMENT_LOAD_QPS := DefaultParam 
.CL2_NET_POLICY_ENFORCEMENT_LOAD_QPS 10}} +{{$NET_POLICY_ENFORCEMENT_LOAD_TARGET_NAME := DefaultParam .CL2_POLICY_ENFORCEMENT_LOAD_TARGET_NAME "small-deployment"}} + +{{$setup := DefaultParam .setup false}} +{{$run := DefaultParam .run false}} +{{$complete := DefaultParam .complete false}} +{{$testType := DefaultParam .testType "policy-creation"}} +# Target port needs to match the server container port of target pods that have +# "targetLabelKey: targetLabelValue" label selector. +{{$targetPort := 80}} + +steps: + {{if $setup}} +- name: Setup network policy enforcement latency measurement + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: setup + targetLabelKey: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY}} + targetLabelValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE}} + baseline: {{$NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE}} + testClientNodeSelectorValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE}} + {{end}} + + {{if $run}} +- name: "Run pod creation network policy enforcement latency measurement (testType={{$testType}})" + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: run + testType: {{$testType}} + targetPort: {{$targetPort}} + maxTargets: {{$NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS}} + policyLoadCount: {{$NET_POLICY_ENFORCEMENT_LOAD_COUNT}} + policyLoadQPS: {{$NET_POLICY_ENFORCEMENT_LOAD_QPS}} + policyLoadTargetBaseName: {{$NET_POLICY_ENFORCEMENT_LOAD_TARGET_NAME}} + {{end}} + + {{if $complete}} +- name: "Complete pod creation network policy enforcement latency measurement (testType={{$testType}})" + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: complete + testType: {{$testType}} + {{end}} diff --git a/test/clusterloader2/testing/load/modules/network-policy/net-policy-metrics.yaml b/test/clusterloader2/testing/load/modules/network-policy/net-policy-metrics.yaml new file mode 100644 index 00000000000..25fee71a41e --- /dev/null +++ b/test/clusterloader2/testing/load/modules/network-policy/net-policy-metrics.yaml @@ -0,0 +1,122 @@ +# Valid actions: "start", "gather" +{{$action := .action}} +{{$usePolicyCreationMetrics := DefaultParam .usePolicyCreationMetrics true}} +{{$usePodCreationMetrics := DefaultParam .usePodCreationMetrics true}} +{{$useCiliumMetrics := DefaultParam .useCiliumMetrics true}} + +# CL2 params +# Negative default values are used to turn thresholds off if not overridden. Thresholds are only enabled with values of zero or higher. 
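+# For example, setting CL2_NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS to 2 in
+# an overrides file enforces a 2s Perc99 SLO on pod-creation reachability
+# latency; the default of -1 leaves that query report-only.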
+{{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS -1}} +{{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS -1}} +{{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS -1}} +{{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD 0}} +{{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD 0.01}} +{{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_REGEN_TIME_99_THRESHOLD -1}} +{{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD -1}} + +steps: +- name: "{{$action}}ing network policy metrics" + measurements: + - Identifier: NetworkPolicyEnforcementLatency + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: "Network Policy Enforcement Latency" + metricVersion: v1 + unit: s + queries: + # Network policy enforcement metrics gathered from the test clients. + {{if $usePolicyCreationMetrics}} + - name: PolicyCreation - TargetCount + query: sum(policy_enforcement_latency_policy_creation_seconds_count) + - name: PolicyCreation - Perc50 + query: histogram_quantile(0.5, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc90 + query: histogram_quantile(0.9, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc95 + query: histogram_quantile(0.95, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc99 + query: histogram_quantile(0.99, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + {{if ge $NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS}} + {{end}} + {{end}} + {{if $usePodCreationMetrics}} + - name: PodCreation - TargetCount + query: sum(pod_creation_reachability_latency_seconds_count) + - name: PodCreation - Perc50 + query: histogram_quantile(0.5, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc90 + query: histogram_quantile(0.9, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc95 + query: histogram_quantile(0.95, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc99 + query: histogram_quantile(0.99, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + {{if ge $NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS}} + {{end}} + - name: PodIpAssignedLatency - TargetCount + query: sum(pod_ip_address_assigned_latency_seconds_count) + - name: PodIpAssignedLatency - Perc50 + query: histogram_quantile(0.50, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc90 + query: histogram_quantile(0.90, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc95 + query: histogram_quantile(0.95, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc99 + query: histogram_quantile(0.99, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + {{if ge 
$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS}} + {{end}} + {{end}} + + {{if $useCiliumMetrics}} + - Identifier: NetworkPolicyMetrics + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: "Network Policy Performance" + metricVersion: v1 + unit: s + queries: + # Cilium agent metrics that are related to network policies. + - name: Number of times a policy import has failed + # To be replaced with the new Cilium metric that counts all policy changes, not just import errors. + # With that, this can be a percentage of failed imports. + # https://github.com/cilium/cilium/pull/23349 + query: sum(cilium_policy_import_errors_total) + threshold: {{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD}} + - name: Failed endpoint regenerations percentage + query: sum(cilium_endpoint_regenerations_total{outcome="fail"}) / sum(cilium_endpoint_regenerations_total) * 100 + threshold: {{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD}} + - name: Policy regeneration time - Perc50 + query: histogram_quantile(0.50, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + - name: Policy regeneration time - Perc99 + query: histogram_quantile(0.99, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + {{if ge $CILIUM_POLICY_REGEN_TIME_99_THRESHOLD 0}} + threshold: {{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD}} + {{end}} + - name: Time between a policy change and it being fully deployed into the datapath - Perc50 + query: histogram_quantile(0.50, sum(cilium_policy_implementation_delay_bucket) by (le)) + - name: Time between a policy change and it being fully deployed into the datapath - Perc99 + query: histogram_quantile(0.99, sum(cilium_policy_implementation_delay_bucket) by (le)) + - name: Latency of policy update trigger - Perc50 + query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le)) + - name: Latency of policy update trigger - Perc99 + query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le)) + - name: Duration of policy update trigger - Perc50 + query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le)) + - name: Duration of policy update trigger - Perc99 + query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le)) + - name: Endpoint regeneration latency - Perc50 + query: histogram_quantile(0.50, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + - name: Endpoint regeneration latency - Perc99 + query: histogram_quantile(0.99, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + {{if ge $CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD 0}} + threshold: {{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD}} + {{end}} + - name: Number of policies currently loaded + query: avg(cilium_policy) + - name: Number of endpoints labeled by policy enforcement status + query: sum(cilium_policy_endpoint_enforcement_status) + {{end}} diff --git a/test/clusterloader2/testing/load/modules/pod-startup-latency.yaml b/test/clusterloader2/testing/load/modules/pod-startup-latency.yaml new file mode 100644 index 00000000000..3f0fa57104c --- /dev/null +++ b/test/clusterloader2/testing/load/modules/pod-startup-latency.yaml @@ -0,0 +1,86 @@ +## Pod-startup-latency module provides a module for latency pod 
measurements + +## Input Params +{{$namespaces := .namespaces}} +{{$minPodsInSmallCluster := .minPodsInSmallCluster}} +{{$image := .image}} + +## CL2 Params +# LATENCY_POD_MEMORY and LATENCY_POD_CPU are calculated for 1-core 4GB node. +# Increasing allocation of both memory and cpu by 5% +# decreases the value of priority function in scheduler by one 5 points. +# This results in decreased probability of choosing the same node again. +# TODO(https://github.com/kubernetes/perf-tests/issues/1024): See whether we can get rid of this +{{$LATENCY_POD_CPU := DefaultParam .CL2_LATENCY_POD_CPU 50}} +{{$LATENCY_POD_MEMORY := DefaultParam .CL2_LATENCY_POD_MEMORY 200}} +{{$LATENCY_POD_COUNT := DefaultParam .CL2_LATENCY_POD_COUNT $minPodsInSmallCluster}} +{{$SCHEDULER_NAME := DefaultParam .CL2_SCHEDULER_NAME "default-scheduler"}} + +## Variables +{{$latencyReplicas := Ceil (DivideFloat $LATENCY_POD_COUNT $namespaces)}} +{{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "5s"}} +{{$CHECK_IF_PODS_ARE_UPDATED := DefaultParam .CL2_CHECK_IF_PODS_ARE_UPDATED true}} + +steps: +- name: Starting latency pod measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = latency + threshold: {{$podStartupLatencyThreshold}} + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + checkIfPodsAreUpdated: {{$CHECK_IF_PODS_ARE_UPDATED}} + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = latency + operationTimeout: 15m +- name: Creating latency pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$latencyReplicas}} + tuningSet: Uniform5qps + objectBundle: + - basename: latency-deployment + objectTemplatePath: simple-deployment.yaml + templateFillMap: + Replicas: 1 + Group: latency + CpuRequest: {{$LATENCY_POD_CPU}}m + MemoryRequest: {{$LATENCY_POD_MEMORY}}M + Image: {{$image}} +- name: Waiting for latency pods to be running + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- name: Deleting latency pods + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: default + objectBundle: + - basename: latency-deployment + objectTemplatePath: simple-deployment.yaml +- name: Waiting for latency pods to be deleted + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- name: Collecting pod startup latency + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather + schedulerName: {{$SCHEDULER_NAME}} diff --git a/test/clusterloader2/testing/load/modules/reconcile-objects.yaml b/test/clusterloader2/testing/load/modules/reconcile-objects.yaml new file mode 100644 index 00000000000..e96ef37657e --- /dev/null +++ b/test/clusterloader2/testing/load/modules/reconcile-objects.yaml @@ -0,0 +1,278 @@ +## Input params +# Valid actions: "create", "scale and update", "delete" +{{$actionName := printf "%s objects" .actionName}} +{{$namespaces := .namespaces}} +{{$tuningSet := .tuningSet}} + +# Derivative variables +{{$is_deleting := (eq .actionName "delete")}} +{{$randomScaleFactor := DefaultParam .randomScaleFactor 0}} +{{$minReplicaFactor := SubtractFloat 1 $randomScaleFactor}} +{{$maxReplicaFactor := AddFloat 1 $randomScaleFactor}} +{{$testMaxReplicaFactor := AddFloat 1 
.testMaxReplicaFactor}} +{{$operationTimeout := .operationTimeout}} + +# DaemonSets +{{$daemonSetImage := DefaultParam .daemonSetImage "registry.k8s.io/pause:3.9"}} +{{$daemonSetReplicas := .daemonSetReplicas}} +{{$daemonSetEnv := .daemonSetEnv}} + +# Deployments +{{$deploymentImage := DefaultParam .deploymentImage "registry.k8s.io/pause:3.9"}} +{{$bigDeploymentSize := .bigDeploymentSize}} +{{$bigDeploymentsPerNamespace := .bigDeploymentsPerNamespace}} +{{$mediumDeploymentSize := .mediumDeploymentSize}} +{{$mediumDeploymentsPerNamespace := .mediumDeploymentsPerNamespace}} +{{$smallDeploymentSize := .smallDeploymentSize}} +{{$smallDeploymentsPerNamespace := .smallDeploymentsPerNamespace}} + +# StatefulSets +{{$statefulSetImage := DefaultParam .statefulSetImage "registry.k8s.io/pause:3.9"}} +{{$smallStatefulSetSize := .smallStatefulSetSize}} +{{$smallStatefulSetsPerNamespace := .smallStatefulSetsPerNamespace}} +{{$mediumStatefulSetSize := .mediumStatefulSetSize}} +{{$mediumStatefulSetsPerNamespace := .mediumStatefulSetsPerNamespace}} + +# Jobs +{{$jobImage := DefaultParam .jobImage "registry.k8s.io/pause:3.10"}} +{{$bigJobSize := .bigJobSize}} +{{$bigJobsPerNamespace := .bigJobsPerNamespace}} +{{$mediumJobSize := .mediumJobSize}} +{{$mediumJobsPerNamespace := .mediumJobsPerNamespace}} +{{$smallJobSize := .smallJobSize}} +{{$smallJobsPerNamespace := .smallJobsPerNamespace}} +{{$completionsFactor := MultiplyFloat 2 $testMaxReplicaFactor}} + +# PV +{{$pvSmallStatefulSetSize := DefaultParam .pvSmallStatefulSetSize 0}} +{{$pvMediumStatefulSetSize := DefaultParam .pvMediumStatefulSetSize 0}} + +## CL2 params +{{$CHECK_IF_PODS_ARE_UPDATED := DefaultParam .CL2_CHECK_IF_PODS_ARE_UPDATED true}} +{{$DISABLE_DAEMONSETS := DefaultParam .CL2_DISABLE_DAEMONSETS false}} +{{$ENABLE_DNSTESTS := DefaultParam .CL2_ENABLE_DNSTESTS false}} +{{$ENABLE_NETWORKPOLICIES := DefaultParam .CL2_ENABLE_NETWORKPOLICIES false}} +{{$ENABLE_PVS := DefaultParam .CL2_ENABLE_PVS true}} +{{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY "net-pol-test"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE "enforcement-latency"}} +{{$NET_POLICY_SERVER_EVERY_NTH_POD := DefaultParam .CL2_NET_POLICY_SERVER_EVERY_NTH_POD 3}} + +steps: +- name: Starting measurement for '{{$actionName}}' + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForRunningDeployments + Params: + apiVersion: apps/v1 + kind: Deployment + - Identifier: WaitForRunningStatefulSets + Params: + apiVersion: apps/v1 + kind: StatefulSet + - Identifier: WaitForRunningDaemonSets + Params: + apiVersion: apps/v1 + kind: DaemonSet + - Identifier: WaitForRunningJobs + Params: + apiVersion: batch/v1 + kind: Job + Params: + action: start + checkIfPodsAreUpdated: {{$CHECK_IF_PODS_ARE_UPDATED}} + labelSelector: group = load + operationTimeout: {{$operationTimeout}} + +- name: {{$actionName}} + phases: +{{if not $DISABLE_DAEMONSETS}} + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: {{$daemonSetReplicas}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: daemonset + objectTemplatePath: daemonset.yaml + templateFillMap: + Image: {{$daemonSetImage}} + Env: {{$daemonSetEnv}} +{{end}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 
{{$bigDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: +{{if $ENABLE_NETWORKPOLICIES}} + - basename: big-deployment + objectTemplatePath: networkpolicy.yaml +{{end}} + - basename: big-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + # DNS Test clients are enabled only in the medium-size deployment. + EnableDNSTests: false + ReplicasMin: {{MultiplyInt $bigDeploymentSize $minReplicaFactor}} + ReplicasMax: {{MultiplyInt $bigDeploymentSize $maxReplicaFactor}} + SvcName: big-service + Image: {{$deploymentImage}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$mediumDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: +{{if $ENABLE_NETWORKPOLICIES}} + - basename: medium-deployment + objectTemplatePath: networkpolicy.yaml +{{end}} + - basename: medium-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + EnableDNSTests: {{$ENABLE_DNSTESTS}} + ReplicasMin: {{MultiplyInt $mediumDeploymentSize $minReplicaFactor}} + ReplicasMax: {{MultiplyInt $mediumDeploymentSize $maxReplicaFactor}} + SvcName: medium-service + Image: {{$deploymentImage}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$smallDeploymentsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: +{{if $ENABLE_NETWORKPOLICIES}} + - basename: small-deployment + objectTemplatePath: networkpolicy.yaml +{{end}} + - basename: small-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + # DNS Test clients are enabled only in the medium-size deployment. + EnableDNSTests: false + ReplicasMin: {{MultiplyInt $smallDeploymentSize $minReplicaFactor}} + ReplicasMax: {{MultiplyInt $smallDeploymentSize $maxReplicaFactor}} + SvcName: small-service + EnableNetworkPolicyEnforcementLatencyTest: {{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST}} + TargetLabelKey: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY}} + TargetLabelValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE}} + NetPolServerOnEveryNthPod: {{$NET_POLICY_SERVER_EVERY_NTH_POD}} + Image: {{$deploymentImage}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$smallStatefulSetsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: small-statefulset + objectTemplatePath: statefulset_service.yaml + - basename: small-statefulset + objectTemplatePath: statefulset.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $smallStatefulSetSize $minReplicaFactor}} + ReplicasMax: {{MultiplyInt $smallStatefulSetSize $maxReplicaFactor}} + Image: {{$statefulSetImage}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$mediumStatefulSetsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: medium-statefulset + objectTemplatePath: statefulset_service.yaml + - basename: medium-statefulset + objectTemplatePath: statefulset.yaml + templateFillMap: + ReplicasMin: {{MultiplyInt $mediumStatefulSetSize $minReplicaFactor}} + ReplicasMax: {{MultiplyInt $mediumStatefulSetSize $maxReplicaFactor}} + Image: {{$statefulSetImage}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$smallJobsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: small-job + objectTemplatePath: job.yaml + templateFillMap: + Completions: {{MultiplyInt $smallJobSize $completionsFactor}} + ReplicasMin: {{MultiplyInt $smallJobSize $minReplicaFactor}} + ReplicasMax: {{MultiplyInt $smallJobSize $maxReplicaFactor}} + Image: {{$jobImage}} + - namespaceRange: 
+ min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$mediumJobsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: medium-job + objectTemplatePath: job.yaml + templateFillMap: + Completions: {{MultiplyInt $mediumJobSize $completionsFactor}} + ReplicasMin: {{MultiplyInt $mediumJobSize $minReplicaFactor}} + ReplicasMax: {{MultiplyInt $mediumJobSize $maxReplicaFactor}} + Image: {{$jobImage}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$bigJobsPerNamespace}} + tuningSet: {{$tuningSet}} + objectBundle: + - basename: big-job + objectTemplatePath: job.yaml + templateFillMap: + Completions: {{MultiplyInt $bigJobSize $completionsFactor}} + ReplicasMin: {{MultiplyInt $bigJobSize $minReplicaFactor}} + ReplicasMax: {{MultiplyInt $bigJobSize $maxReplicaFactor}} + Image: {{$jobImage}} +{{if and $is_deleting $ENABLE_PVS}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: {{$tuningSet}} + objectBundle: + {{range $ssIndex := Loop $pvSmallStatefulSetSize}} + - basename: pv-small-statefulset-{{$ssIndex}} + objectTemplatePath: pvc.yaml + listUnknownObjectOptions: + labelSelector: + matchLabels: + name: small-statefulset-{{$ssIndex}} + {{end}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: 0 + tuningSet: {{$tuningSet}} + objectBundle: + {{range $ssIndex := Loop $pvMediumStatefulSetSize}} + - basename: pv-medium-statefulset-{{$ssIndex}} + objectTemplatePath: pvc.yaml + listUnknownObjectOptions: + labelSelector: + matchLabels: + name: medium-statefulset-{{$ssIndex}} + {{end}} +{{end}} + +- name: Waiting for '{{$actionName}}' to be completed + measurements: + - Method: WaitForControlledPodsRunning + Instances: + - Identifier: WaitForRunningDeployments + - Identifier: WaitForRunningStatefulSets + - Identifier: WaitForRunningDaemonSets + - Identifier: WaitForRunningJobs + Params: + action: gather +{{if and $is_deleting $ENABLE_PVS}} + - Identifier: WaitForPVCsToBeDeleted + Method: WaitForBoundPVCs + Params: + desiredPVCCount: 0 + labelSelector: group = load + timeout: {{$operationTimeout}} +{{end}} diff --git a/test/clusterloader2/testing/load/modules/scheduler-throughput.yaml b/test/clusterloader2/testing/load/modules/scheduler-throughput.yaml new file mode 100644 index 00000000000..41e778cc9f6 --- /dev/null +++ b/test/clusterloader2/testing/load/modules/scheduler-throughput.yaml @@ -0,0 +1,80 @@ +## Input params +# Valid actions: "create", "delete" +{{$action := .action}} +{{$namespaces := .namespaces}} +{{$replicasPerNamespace := .replicasPerNamespace}} +{{$schedulerThroughputNamespaces := .schedulerThroughputNamespaces}} +{{$schedulerThroughputPodsPerDeployment := .schedulerThroughputPodsPerDeployment}} + +## Derivative variables +{{$is_creating := (eq .action "create")}} + +## CL2 params +{{$SCHEDULER_THROUGHPUT_THRESHOLD := DefaultParam .CL2_SCHEDULER_THROUGHPUT_THRESHOLD 100}} +{{$CHECK_IF_PODS_ARE_UPDATED := DefaultParam .CL2_CHECK_IF_PODS_ARE_UPDATED true}} + + +{{$deploymentImage := DefaultParam .deploymentImage "registry.k8s.io/pause:3.9"}} + +steps: +{{if $is_creating}} +- name: Creating scheduler throughput measurements + measurements: + - Identifier: HighThroughputPodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = scheduler-throughput + threshold: 1h # TODO(https://github.com/kubernetes/perf-tests/issues/1024): Ideally, this should be 5s + - Identifier: WaitForSchedulerThroughputDeployments + Method: 
WaitForControlledPodsRunning + Params: + action: start + checkIfPodsAreUpdated: {{$CHECK_IF_PODS_ARE_UPDATED}} + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = scheduler-throughput + # The operation timeout shouldn't be less than 20m to make sure that ~10m node + # failure won't fail the test. See https://github.com/kubernetes/kubernetes/issues/73461#issuecomment-467338711 + operationTimeout: 20m + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: start + labelSelector: group = scheduler-throughput + measurmentInterval: 1s +{{end}} +- name: {{$action}} scheduler throughput pods + phases: + - namespaceRange: + min: {{AddInt $namespaces 1}} + max: {{AddInt $namespaces $schedulerThroughputNamespaces}} + replicasPerNamespace: {{$replicasPerNamespace}} + tuningSet: default + objectBundle: + - basename: scheduler-throughput-deployment + objectTemplatePath: simple-deployment.yaml + templateFillMap: + Replicas: {{$schedulerThroughputPodsPerDeployment}} + Group: scheduler-throughput + Image: {{$deploymentImage}} +- name: Waiting for scheduler throughput pods to be {{$action}}d + measurements: + - Identifier: WaitForSchedulerThroughputDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +{{if $is_creating}} +- name: Collecting scheduler throughput measurements + measurements: + - Identifier: HighThroughputPodStartupLatency + Method: PodStartupLatency + Params: + action: gather + - Identifier: SchedulingThroughput + Method: SchedulingThroughput + Params: + action: gather + enableViolations: true + threshold: {{$SCHEDULER_THROUGHPUT_THRESHOLD}} +{{end}} diff --git a/test/clusterloader2/testing/load/modules/services.yaml b/test/clusterloader2/testing/load/modules/services.yaml new file mode 100644 index 00000000000..5acb85f9bc8 --- /dev/null +++ b/test/clusterloader2/testing/load/modules/services.yaml @@ -0,0 +1,36 @@ +## Services module provides a module for creating / deleting services. 
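+##
+## Illustrative usage from a parent test config (path and values are examples
+## only; the importing config is expected to define the "Sequence" tuningSet
+## referenced by the phases below):
+##
+##   - module:
+##       path: modules/services.yaml
+##       params:
+##         actionName: "Creating"
+##         namespaces: 1
+##         bigServicesPerNamespace: 1
+##         mediumServicesPerNamespace: 2
+##         smallServicesPerNamespace: 5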
+ +## Input params +{{$actionName := .actionName}} +{{$namespaces := .namespaces}} +{{$smallServicesPerNamespace := .smallServicesPerNamespace}} +{{$mediumServicesPerNamespace := .mediumServicesPerNamespace}} +{{$bigServicesPerNamespace := .bigServicesPerNamespace}} + +steps: +- name: "{{$actionName}} k8s services" + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$bigServicesPerNamespace}} + tuningSet: Sequence + objectBundle: + - basename: big-service + objectTemplatePath: service.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$mediumServicesPerNamespace}} + tuningSet: Sequence + objectBundle: + - basename: medium-service + objectTemplatePath: service.yaml + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$smallServicesPerNamespace}} + tuningSet: Sequence + objectBundle: + - basename: small-service + objectTemplatePath: service.yaml diff --git a/test/clusterloader2/testing/load/networkpolicy.yaml b/test/clusterloader2/testing/load/networkpolicy.yaml new file mode 100644 index 00000000000..1aae9b23c0f --- /dev/null +++ b/test/clusterloader2/testing/load/networkpolicy.yaml @@ -0,0 +1,19 @@ +{{if eq (Mod .Index 10) 0}} # Create for only 10% of deployments +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{.Name}} +spec: + podSelector: + matchLabels: + name: {{.BaseName}}-{{.Index}} + policyTypes: + - Egress + egress: + - to: + - ipBlock: + cidr: 10.0.0.0/24 + ports: + - protocol: TCP + port: 8080 +{{end}} diff --git a/test/clusterloader2/testing/load/pvc.yaml b/test/clusterloader2/testing/load/pvc.yaml new file mode 100644 index 00000000000..d19d23053e6 --- /dev/null +++ b/test/clusterloader2/testing/load/pvc.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{.Name}} diff --git a/test/clusterloader2/testing/load/secret.yaml b/test/clusterloader2/testing/load/secret.yaml new file mode 100644 index 00000000000..a06a6e83f44 --- /dev/null +++ b/test/clusterloader2/testing/load/secret.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{.Name}} +{{if not (eq (Mod .Index 20) 10 19) }} # .Index % 20 in {10,19} - only 10% deployments will have non-immutable Secret. +immutable: true +{{end}} +type: Opaque +data: + password: c2NhbGFiaWxpdHkK diff --git a/test/clusterloader2/testing/load/service.yaml b/test/clusterloader2/testing/load/service.yaml new file mode 100644 index 00000000000..fc8cf5b7ce5 --- /dev/null +++ b/test/clusterloader2/testing/load/service.yaml @@ -0,0 +1,19 @@ +{{$SetServiceProxyLabel := DefaultParam .SetServiceProxyLabel false}} + +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} +{{if and $SetServiceProxyLabel (eq (Mod .Index 2) 0)}} + labels: + service.kubernetes.io/service-proxy-name: foo +{{end}} +spec: +{{if .HeadlessService}} + clusterIP: None +{{end}} + selector: + svc: {{.Name}} + ports: + - port: 80 + targetPort: 80 diff --git a/test/clusterloader2/testing/load/simple-deployment.yaml b/test/clusterloader2/testing/load/simple-deployment.yaml new file mode 100644 index 00000000000..92685b93e73 --- /dev/null +++ b/test/clusterloader2/testing/load/simple-deployment.yaml @@ -0,0 +1,59 @@ +{{$HostNetworkMode := DefaultParam .CL2_USE_HOST_NETWORK_PODS false}} +# Keep the CpuRequest/MemoryRequest request equal percentage of 1-core, 4GB node. +# For now we're setting it to 0.5%. 
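+# For reference, 0.5% of a 1-core / 4GB node works out to the defaults below:
+#   cpu:    0.005 * 1000m  = 5m
+#   memory: 0.005 * 4000M ~= 20M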
+{{$CpuRequest := DefaultParam .CpuRequest "5m"}} +{{$EnvVar := DefaultParam .EnvVar "a"}} +{{$MemoryRequest := DefaultParam .MemoryRequest "20M"}} +{{$Image := DefaultParam .Image "registry.k8s.io/pause:3.9"}} +{{$RUN_ON_ARM_NODES := DefaultParam .CL2_RUN_ON_ARM_NODES false}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} +{{if .SvcName}} + svc: {{.SvcName}}-{{.Index}} +{{end}} + spec: + hostNetwork: {{$HostNetworkMode}} + containers: + - env: + - name: ENV_VAR + value: {{$EnvVar}} + image: {{$Image}} + imagePullPolicy: IfNotPresent + name: {{.Name}} + ports: + resources: + requests: + cpu: {{$CpuRequest}} + memory: {{$MemoryRequest}} + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + {{if $RUN_ON_ARM_NODES}} + - key: "kubernetes.io/arch" + operator: Equal + value: arm64 + effect: NoSchedule + {{end}} diff --git a/test/clusterloader2/testing/load/statefulset.yaml b/test/clusterloader2/testing/load/statefulset.yaml new file mode 100644 index 00000000000..174d3c66cbc --- /dev/null +++ b/test/clusterloader2/testing/load/statefulset.yaml @@ -0,0 +1,73 @@ +{{$HostNetworkMode := DefaultParam .CL2_USE_HOST_NETWORK_PODS false}} +{{$EnablePVs := DefaultParam .CL2_ENABLE_PVS true}} +{{$RUN_ON_ARM_NODES := DefaultParam .CL2_RUN_ON_ARM_NODES false}} +{{$Image := DefaultParam .Image "registry.k8s.io/pause:3.9"}} + +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{.Name}} + labels: + group: load +spec: + podManagementPolicy: Parallel + selector: + matchLabels: + group: load + name: {{.Name}} + serviceName: {{.Name}} + replicas: {{RandIntRange .ReplicasMin .ReplicasMax}} + template: + metadata: + labels: + group: load + name: {{.Name}} + spec: + hostNetwork: {{$HostNetworkMode}} + containers: + - name: {{.Name}} + image: {{$Image}} + ports: + - containerPort: 80 + name: web + resources: + # Keep the CpuRequest/MemoryRequest request equal percentage of 1-core, 4GB node. + # For now we're setting it to 0.5%. + requests: + cpu: 5m + memory: "20M" + {{if $EnablePVs}} + volumeMounts: + - name: pv + mountPath: /var/pv + {{end}} + terminationGracePeriodSeconds: 1 + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + {{if $RUN_ON_ARM_NODES}} + - key: "kubernetes.io/arch" + operator: Equal + value: arm64 + effect: NoSchedule + {{end}} + {{if $EnablePVs}} + # NOTE: PVs created this way should be cleaned-up manually, as deleting the StatefulSet doesn't automatically delete PVs. + # To avoid deleting all the PVs at once during namespace deletion, they should be deleted explicitly via Phase. 
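+  # For example, the delete action in modules/reconcile-objects.yaml lists the
+  # PVCs via listUnknownObjectOptions with a "name: <statefulset basename>-<index>"
+  # label selector and removes them by setting replicasPerNamespace to 0.
+  # A hand-run equivalent (namespace and label value are illustrative only):
+  #   kubectl delete pvc -n <test-namespace> -l name=small-statefulset-0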
+ volumeClaimTemplates: + - metadata: + name: pv + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 100Mi + {{end}} diff --git a/test/clusterloader2/testing/load/statefulset_service.yaml b/test/clusterloader2/testing/load/statefulset_service.yaml new file mode 100644 index 00000000000..5e16a47a19a --- /dev/null +++ b/test/clusterloader2/testing/load/statefulset_service.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} + labels: + name: {{.Name}} +spec: + clusterIP: None + selector: + name: {{.Name}} diff --git a/test/clusterloader2/testing/neg/config.yaml b/test/clusterloader2/testing/neg/config.yaml new file mode 100644 index 00000000000..5c21d5f6cc4 --- /dev/null +++ b/test/clusterloader2/testing/neg/config.yaml @@ -0,0 +1,127 @@ +{{$negQPS := DefaultParam .CL2_NEG_TEST_QPS 20}} +{{$smallBackendLbServiceCount := DefaultParam .CL2_SMALL_BACKEND_LB_SERVICE_COUNT 5}} +{{$mediumBackendLbServiceCount := DefaultParam .CL2_MEDIUM_BACKEND_LB_SERVICE_COUNT 3}} +{{$largeBackendLbServiceCount := DefaultParam .CL2_LARGE_BACKEND_LB_SERVICE_COUNT 1}} +{{$waitAfterDeletion := DefaultParam .CL2_WAIT_AFTER_DELETION “15m”}} + +# Test +name: neg-latency +namespace: + number: 1 +tuningSets: +- name: NegConstantQPS + qpsLoad: + qps: {{$negQPS}} +steps: +- module: + path: /modules/ingress-measurements.yaml + params: + action: start +- name: Start measurement for running pods + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = neg-load + operationTimeout: 15m +- module: + path: /modules/services.yaml + params: + actionName: Create + smallBackendLbServiceCount: {{$smallBackendLbServiceCount}} + mediumBackendLbServiceCount: {{$mediumBackendLbServiceCount}} + largeBackendLbServiceCount: {{$largeBackendLbServiceCount}} + smallBackendIngDeploymentCount: 1 + mediumBackendIngDeploymentCount: 1 + largeBackendIngDeploymentCount: 1 + standardBackendSize: true +- name: Waiting for objects creation to be completed + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- module: + path: /modules/ingress-measurements.yaml + params: + action: waitForReady +- module: + path: /modules/measurements.yaml + params: + action: start +- module: + path: /modules/services.yaml + params: + actionName: Scale up + smallBackendLbServiceCount: {{AddInt 1 $smallBackendLbServiceCount}} + mediumBackendLbServiceCount: {{AddInt 1 $mediumBackendLbServiceCount}} + largeBackendLbServiceCount: {{AddInt 1 $largeBackendLbServiceCount}} + smallBackendIngDeploymentCount: 1 + mediumBackendIngDeploymentCount: 1 + largeBackendIngDeploymentCount: 1 + standardBackendSize: true +- name: Waiting for objects scaling up to be completed + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- module: + path: /modules/ingress-measurements.yaml + params: + action: waitForReady +- module: + path: /modules/services.yaml + params: + actionName: Scale down + smallBackendLbServiceCount: {{AddInt 1 $smallBackendLbServiceCount}} + mediumBackendLbServiceCount: {{AddInt 1 $mediumBackendLbServiceCount}} + largeBackendLbServiceCount: {{AddInt 1 $largeBackendLbServiceCount}} + smallBackendIngDeploymentCount: 1 + mediumBackendIngDeploymentCount: 1 + largeBackendIngDeploymentCount: 1 + standardBackendSize: false +- name: Waiting for objects 
creation to be completed + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- module: + path: /modules/ingress-measurements.yaml + params: + action: waitForReady +- module: + path: /modules/ingress-measurements.yaml + params: + action: gather +- module: + path: /modules/measurements.yaml + params: + action: gather +- module: + path: /modules/services.yaml + params: + actionName: Delete + smallBackendLbServiceCount: 0 + mediumBackendLbServiceCount: 0 + largeBackendLbServiceCount: 0 + smallBackendIngDeploymentCount: 0 + mediumBackendIngDeploymentCount: 0 + largeBackendIngDeploymentCount: 0 + standardBackendSize: true +- name: Waiting for objects deletion to be completed + measurements: + - Identifier: WaitForRunningDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather +- name: Wait after deletion + measurements: + - Identifier: Wait + Method: Sleep + Params: + duration: {{$waitAfterDeletion}} diff --git a/test/clusterloader2/testing/neg/dep.yaml b/test/clusterloader2/testing/neg/dep.yaml new file mode 100644 index 00000000000..6d7ddf65ae1 --- /dev/null +++ b/test/clusterloader2/testing/neg/dep.yaml @@ -0,0 +1,23 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: neg-load +spec: + minReadySeconds: 60 + replicas: {{.NumReplicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + group: neg-load + name: {{.Name}} + spec: + containers: + - name: {{.Name}} + image: nginx + ports: + - containerPort: 8080 diff --git a/test/clusterloader2/testing/neg/ing.yaml b/test/clusterloader2/testing/neg/ing.yaml new file mode 100644 index 00000000000..046bf8ea3ae --- /dev/null +++ b/test/clusterloader2/testing/neg/ing.yaml @@ -0,0 +1,24 @@ +{{$serviceBaseName := .ServiceBaseName}} + +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{.Name}} + labels: + size: {{.NegSizeLabel}} + annotations: + kubernetes.io/ingress.class: "gce" +spec: + rules: + - http: + paths: + {{ range $idx := Loop .NumServices }} + - path: /neg-load-custom-{{$idx}} + pathType: ImplementationSpecific + backend: + service: + name: {{$serviceBaseName}}-{{$idx}} + port: + number: 80 + {{end}} + host: "example.com" diff --git a/test/clusterloader2/testing/neg/modules/ingress-measurements.yaml b/test/clusterloader2/testing/neg/modules/ingress-measurements.yaml new file mode 100644 index 00000000000..74fc97faf65 --- /dev/null +++ b/test/clusterloader2/testing/neg/modules/ingress-measurements.yaml @@ -0,0 +1,27 @@ +{{$action := .action}} +{{$ingWaitTimeout := DefaultParam .CL2_ING_WAIT_TIMEOUT "30m"}} + +steps: +- name: Ingress creation latency measurements - '{{$action}}' + measurements: + - Identifier: ServiceCreationLatencySmall + Method: ServiceCreationLatency + Params: + action: {{$action}} + waitTimeout: {{$ingWaitTimeout}} + checkIngress: true + labelSelector: size = neg-small + - Identifier: ServiceCreationLatencyMedium + Method: ServiceCreationLatency + Params: + action: {{$action}} + waitTimeout: {{$ingWaitTimeout}} + checkIngress: true + labelSelector: size = neg-medium + - Identifier: ServiceCreationLatencyLarge + Method: ServiceCreationLatency + Params: + action: {{$action}} + waitTimeout: {{$ingWaitTimeout}} + checkIngress: true + labelSelector: size = neg-large diff --git a/test/clusterloader2/testing/neg/modules/measurements.yaml b/test/clusterloader2/testing/neg/modules/measurements.yaml new file mode 100644 index 00000000000..57e3b341493 --- 
/dev/null +++ b/test/clusterloader2/testing/neg/modules/measurements.yaml @@ -0,0 +1,10 @@ +{{$action := .action}} + +steps: +- name: Neg latency measurements - '{{$action}}' + measurements: + - Identifier: NegLatency + Method: NegLatency + Params: + action: {{$action}} + enableViolations: true \ No newline at end of file diff --git a/test/clusterloader2/testing/neg/modules/services.yaml b/test/clusterloader2/testing/neg/modules/services.yaml new file mode 100644 index 00000000000..62cfa6cd74b --- /dev/null +++ b/test/clusterloader2/testing/neg/modules/services.yaml @@ -0,0 +1,100 @@ +{{$SMALL_BACKEND_SIZE := DefaultParam .CL2_SMALL_BACKEND_SIZE 10}} +{{$MEDIUM_BACKEND_SIZE := DefaultParam .CL2_MEDIUM_BACKEND_SIZE 50}} +{{$LARGE_BACKEND_SIZE := DefaultParam .CL2_LARGE_BACKEND_SIZE 100}} +{{$SMALL_BACKEND_SIZE_REDUCED := DefaultParam .CL2_SMALL_BACKEND_SIZE_REDUCED 5}} +{{$MEDIUM_BACKEND_SIZE_REDUCED := DefaultParam .CL2_MEDIUM_BACKEND_SIZE_REDUCED 25}} +{{$LARGE_BACKEND_SIZE_REDUCED := DefaultParam .CL2_LARGE_BACKEND_SIZE_REDUCED 50}} +{{$SMALL_BACKEND_LB_SERVICE_COUNT := .smallBackendLbServiceCount}} +{{$MEDIUM_BACKEND_LB_SERVICE_COUNT := .mediumBackendLbServiceCount}} +{{$LARGE_BACKEND_LB_SERVICE_COUNT := .largeBackendLbServiceCount}} +{{$SMALL_BACKEND_ING_DEPLOYMENT_COUNT := .smallBackendIngDeploymentCount}} +{{$MEDIUM_BACKEND_ING_DEPLOYMENT_COUNT := .mediumBackendIngDeploymentCount}} +{{$LARGE_BACKEND_ING_DEPLOYMENT_COUNT := .largeBackendIngDeploymentCount}} +{{$standardBackendSize := .standardBackendSize}} +{{$actionName := .actionName}} +{{$namespaces := 1}} + +steps: +- name: {{$actionName}} services + phases: + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$SMALL_BACKEND_LB_SERVICE_COUNT}} + tuningSet: NegConstantQPS + objectBundle: + - basename: small-backends-svc + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: small-backends-dep + NegSizeLabel: neg-small + - basename: small-backends-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{IfThenElse $standardBackendSize $SMALL_BACKEND_SIZE $SMALL_BACKEND_SIZE_REDUCED}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$MEDIUM_BACKEND_LB_SERVICE_COUNT}} + tuningSet: NegConstantQPS + objectBundle: + - basename: medium-backends-svc + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: medium-backends-dep + NegSizeLabel: neg-medium + - basename: medium-backends-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{IfThenElse $standardBackendSize $MEDIUM_BACKEND_SIZE $MEDIUM_BACKEND_SIZE_REDUCED}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$LARGE_BACKEND_LB_SERVICE_COUNT}} + tuningSet: NegConstantQPS + objectBundle: + - basename: large-backends-svc + objectTemplatePath: service.yaml + templateFillMap: + DeploymentBaseName: large-backends-dep + NegSizeLabel: neg-large + - basename: large-backends-dep + objectTemplatePath: dep.yaml + templateFillMap: + NumReplicas: {{IfThenElse $standardBackendSize $LARGE_BACKEND_SIZE $LARGE_BACKEND_SIZE_REDUCED}} + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$SMALL_BACKEND_ING_DEPLOYMENT_COUNT}} + tuningSet: NegConstantQPS + objectBundle: + - basename: small-backends-ing + objectTemplatePath: ing.yaml + templateFillMap: + ServiceBaseName: small-backends-svc + NumServices: {{$SMALL_BACKEND_LB_SERVICE_COUNT}} + NegSizeLabel: neg-small + - namespaceRange: + min: 1 + max: {{$namespaces}} + 
replicasPerNamespace: {{$MEDIUM_BACKEND_ING_DEPLOYMENT_COUNT}} + tuningSet: NegConstantQPS + objectBundle: + - basename: medium-backends-ing + objectTemplatePath: ing.yaml + templateFillMap: + ServiceBaseName: medium-backends-svc + NumServices: {{$MEDIUM_BACKEND_LB_SERVICE_COUNT}} + NegSizeLabel: neg-medium + - namespaceRange: + min: 1 + max: {{$namespaces}} + replicasPerNamespace: {{$LARGE_BACKEND_ING_DEPLOYMENT_COUNT}} + tuningSet: NegConstantQPS + objectBundle: + - basename: large-backends-ing + objectTemplatePath: ing.yaml + templateFillMap: + ServiceBaseName: large-backends-svc + NumServices: {{$LARGE_BACKEND_LB_SERVICE_COUNT}} + NegSizeLabel: neg-large diff --git a/test/clusterloader2/testing/neg/service.yaml b/test/clusterloader2/testing/neg/service.yaml new file mode 100644 index 00000000000..30820c5cc5f --- /dev/null +++ b/test/clusterloader2/testing/neg/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} + labels: + size: {{.NegSizeLabel}} + annotations: + cloud.google.com/neg: '{"ingress": true}' +spec: + type: ClusterIP + selector: + name: {{.DeploymentBaseName}}-{{.Index}} + ports: + - port: 80 + protocol: TCP + targetPort: 80 diff --git a/test/clusterloader2/testing/network/1_1_ratio_override.yaml b/test/clusterloader2/testing/network/1_1_ratio_override.yaml new file mode 100644 index 00000000000..615abbd2b6d --- /dev/null +++ b/test/clusterloader2/testing/network/1_1_ratio_override.yaml @@ -0,0 +1,2 @@ +CL2_NUMBER_OF_SERVERS: 1 +CL2_NUMBER_OF_CLIENTS: 1 diff --git a/test/clusterloader2/testing/network/50_50_ratio_override.yaml b/test/clusterloader2/testing/network/50_50_ratio_override.yaml new file mode 100644 index 00000000000..e4392c9d9ae --- /dev/null +++ b/test/clusterloader2/testing/network/50_50_ratio_override.yaml @@ -0,0 +1,2 @@ +CL2_NUMBER_OF_SERVERS: 50 +CL2_NUMBER_OF_CLIENTS: 50 diff --git a/test/clusterloader2/testing/network/config.yaml b/test/clusterloader2/testing/network/config.yaml new file mode 100644 index 00000000000..1cc2aa2da2d --- /dev/null +++ b/test/clusterloader2/testing/network/config.yaml @@ -0,0 +1,24 @@ +{{$PROTOCOL := .CL2_PROTOCOL}} +{{$NUMBER_OF_SERVERS := .CL2_NUMBER_OF_SERVERS}} +{{$NUMBER_OF_CLIENTS := .CL2_NUMBER_OF_CLIENTS}} + +name: network_performance +namespace: + number: 1 +steps: +- name: Start network performance measurement + measurements: + - Identifier: NetworkPerformanceMetrics + Method: NetworkPerformanceMetrics + Params: + action: start + duration: 10s + protocol: {{$PROTOCOL}} + numberOfServers: {{$NUMBER_OF_SERVERS}} + numberOfClients: {{$NUMBER_OF_CLIENTS}} +- name: Gather network performance measurement + measurements: + - Identifier: NetworkPerformanceMetrics + Method: NetworkPerformanceMetrics + Params: + action: gather diff --git a/test/clusterloader2/testing/network/http_protocol_override.yaml b/test/clusterloader2/testing/network/http_protocol_override.yaml new file mode 100644 index 00000000000..bc6d16204c0 --- /dev/null +++ b/test/clusterloader2/testing/network/http_protocol_override.yaml @@ -0,0 +1 @@ +CL2_PROTOCOL: HTTP diff --git a/test/clusterloader2/testing/network/suite.yaml b/test/clusterloader2/testing/network/suite.yaml new file mode 100644 index 00000000000..421a8e039d8 --- /dev/null +++ b/test/clusterloader2/testing/network/suite.yaml @@ -0,0 +1,30 @@ +- identifier: tcp-1:1 + configPath: testing/network/config.yaml + overridePaths: + - testing/network/tcp_protocol_override.yaml + - testing/network/1_1_ratio_override.yaml +- identifier: udp-1:1 + configPath: 
testing/network/config.yaml + overridePaths: + - testing/network/udp_protocol_override.yaml + - testing/network/1_1_ratio_override.yaml +- identifier: http-1:1 + configPath: testing/network/config.yaml + overridePaths: + - testing/network/http_protocol_override.yaml + - testing/network/1_1_ratio_override.yaml +- identifier: tcp-50:50 + configPath: testing/network/config.yaml + overridePaths: + - testing/network/tcp_protocol_override.yaml + - testing/network/50_50_ratio_override.yaml +- identifier: udp-50:50 + configPath: testing/network/config.yaml + overridePaths: + - testing/network/udp_protocol_override.yaml + - testing/network/50_50_ratio_override.yaml +- identifier: http-50:50 + configPath: testing/network/config.yaml + overridePaths: + - testing/network/http_protocol_override.yaml + - testing/network/50_50_ratio_override.yaml diff --git a/test/clusterloader2/testing/network/tcp_protocol_override.yaml b/test/clusterloader2/testing/network/tcp_protocol_override.yaml new file mode 100644 index 00000000000..d95b2f629db --- /dev/null +++ b/test/clusterloader2/testing/network/tcp_protocol_override.yaml @@ -0,0 +1 @@ +CL2_PROTOCOL: TCP diff --git a/test/clusterloader2/testing/network/udp_protocol_override.yaml b/test/clusterloader2/testing/network/udp_protocol_override.yaml new file mode 100644 index 00000000000..0580b05a72d --- /dev/null +++ b/test/clusterloader2/testing/network/udp_protocol_override.yaml @@ -0,0 +1 @@ +CL2_PROTOCOL: UDP diff --git a/test/clusterloader2/testing/node-throughput/config.yaml b/test/clusterloader2/testing/node-throughput/config.yaml new file mode 100644 index 00000000000..c6a6b3ba057 --- /dev/null +++ b/test/clusterloader2/testing/node-throughput/config.yaml @@ -0,0 +1,92 @@ +# ASSUMPTIONS: +# - This test is designed for 1-node cluster. 
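+# With the defaults below (POD_COUNT=100, POD_THROUGHPUT=5), each pod gets its
+# own namespace and creation is paced at ~5 pods/s, i.e. roughly 100 / 5 = 20s
+# of create calls before the wait-for-running step.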
+ +#Constants +{{$POD_COUNT := DefaultParam .POD_COUNT 100}} +{{$POD_THROUGHPUT := DefaultParam .POD_THROUGHPUT 5}} +{{$CONTAINER_IMAGE := DefaultParam .CONTAINER_IMAGE "registry.k8s.io/pause:3.9"}} +{{$POD_STARTUP_LATENCY_THRESHOLD := DefaultParam .POD_STARTUP_LATENCY_THRESHOLD "5s"}} +{{$OPERATION_TIMEOUT := DefaultParam .OPERATION_TIMEOUT "15m"}} + +name: node-throughput +namespace: + number: {{$POD_COUNT}} +tuningSets: +- name: UniformQPS + qpsLoad: + qps: {{$POD_THROUGHPUT}} +steps: +- name: Starting measurements + measurements: + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = latency + threshold: {{$POD_STARTUP_LATENCY_THRESHOLD}} +- name: Starting pods measurements + measurements: + - Identifier: WaitForRunningLatencyRCs + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: v1 + kind: ReplicationController + labelSelector: group = latency + operationTimeout: {{$OPERATION_TIMEOUT}} +- name: Creating pods + phases: + - namespaceRange: + min: 1 + max: {{$POD_COUNT}} + replicasPerNamespace: 1 + tuningSet: UniformQPS + objectBundle: + - basename: latency-pod-rc + objectTemplatePath: rc.yaml + templateFillMap: + Replicas: 1 + Group: latency + Image: {{$CONTAINER_IMAGE}} +- name: Waiting for pods to be running + measurements: + - Identifier: WaitForRunningLatencyRCs + Method: WaitForControlledPodsRunning + Params: + action: gather +- name: Deleting pods + phases: + - namespaceRange: + min: 1 + max: {{$POD_COUNT}} + replicasPerNamespace: 0 + tuningSet: UniformQPS + objectBundle: + - basename: latency-pod-rc + objectTemplatePath: rc.yaml +- name: Waiting for pods to be deleted + measurements: + - Identifier: WaitForRunningLatencyRCs + Method: WaitForControlledPodsRunning + Params: + action: gather +# Collect measurements +- name: Collecting pods measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather +- name: Collecting measurements + measurements: + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: true + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple diff --git a/test/clusterloader2/testing/node-throughput/rc.yaml b/test/clusterloader2/testing/node-throughput/rc.yaml new file mode 100644 index 00000000000..e0688f338c3 --- /dev/null +++ b/test/clusterloader2/testing/node-throughput/rc.yaml @@ -0,0 +1,34 @@ +apiVersion: v1 +kind: ReplicationController +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} + spec: + # Do not automount default service account, to eliminate its impact. + automountServiceAccountToken: false + containers: + - image: {{.Image}} + imagePullPolicy: IfNotPresent + name: {{.Name}} + ports: + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. 
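+      # (The DefaultTolerationSeconds admission plugin would otherwise add the
+      # same tolerations with a 300s timeout by default; 900s keeps the pods
+      # around through a longer node outage.)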
+ tolerations: + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/overrides/2000_nodes.yaml b/test/clusterloader2/testing/overrides/2000_nodes.yaml new file mode 100644 index 00000000000..e97b547b2f9 --- /dev/null +++ b/test/clusterloader2/testing/overrides/2000_nodes.yaml @@ -0,0 +1 @@ +NODE_MODE: masterandnondaemons diff --git a/test/clusterloader2/testing/overrides/5000_nodes.yaml b/test/clusterloader2/testing/overrides/5000_nodes.yaml new file mode 100644 index 00000000000..e97b547b2f9 --- /dev/null +++ b/test/clusterloader2/testing/overrides/5000_nodes.yaml @@ -0,0 +1 @@ +NODE_MODE: masterandnondaemons diff --git a/test/clusterloader2/testing/overrides/600_nodes_high_density.yaml b/test/clusterloader2/testing/overrides/600_nodes_high_density.yaml new file mode 100644 index 00000000000..56d78a0775b --- /dev/null +++ b/test/clusterloader2/testing/overrides/600_nodes_high_density.yaml @@ -0,0 +1 @@ +PODS_PER_NODE: 95 diff --git a/test/clusterloader2/testing/overrides/kubemark_5000_nodes.yaml b/test/clusterloader2/testing/overrides/kubemark_5000_nodes.yaml new file mode 100644 index 00000000000..4df36dc9bb3 --- /dev/null +++ b/test/clusterloader2/testing/overrides/kubemark_5000_nodes.yaml @@ -0,0 +1,6 @@ +CUSTOM_API_CALL_THRESHOLDS: | + - verb: PATCH + resource: nodes + subresource: status + scope: resource + threshold: 3s diff --git a/test/clusterloader2/testing/overrides/kubemark_500_nodes.yaml b/test/clusterloader2/testing/overrides/kubemark_500_nodes.yaml new file mode 100644 index 00000000000..a5e7e5de606 --- /dev/null +++ b/test/clusterloader2/testing/overrides/kubemark_500_nodes.yaml @@ -0,0 +1 @@ +CL2_LOAD_TEST_THROUGHPUT: 50 diff --git a/test/clusterloader2/testing/overrides/kubemark_load_throughput.yaml b/test/clusterloader2/testing/overrides/kubemark_load_throughput.yaml new file mode 100644 index 00000000000..69c06d1f955 --- /dev/null +++ b/test/clusterloader2/testing/overrides/kubemark_load_throughput.yaml @@ -0,0 +1 @@ +CL2_LOAD_TEST_THROUGHPUT: 20 diff --git a/test/clusterloader2/testing/overrides/load_throughput.yaml b/test/clusterloader2/testing/overrides/load_throughput.yaml new file mode 100644 index 00000000000..04a2c97f8d7 --- /dev/null +++ b/test/clusterloader2/testing/overrides/load_throughput.yaml @@ -0,0 +1,2 @@ +CL2_LOAD_TEST_THROUGHPUT: 20 +CL2_DELETE_TEST_THROUGHPUT: 14 diff --git a/test/clusterloader2/testing/overrides/load_throughput_pre_1_23.yaml b/test/clusterloader2/testing/overrides/load_throughput_pre_1_23.yaml new file mode 100644 index 00000000000..4143a70e9ab --- /dev/null +++ b/test/clusterloader2/testing/overrides/load_throughput_pre_1_23.yaml @@ -0,0 +1,2 @@ +CL2_LOAD_TEST_THROUGHPUT: 20 +CL2_DELETE_TEST_THROUGHPUT: 7 diff --git a/test/clusterloader2/testing/overrides/node_containerd.yaml b/test/clusterloader2/testing/overrides/node_containerd.yaml new file mode 100644 index 00000000000..fb75750767d --- /dev/null +++ b/test/clusterloader2/testing/overrides/node_containerd.yaml @@ -0,0 +1 @@ +POD_THROUGHPUT: 5 diff --git a/test/clusterloader2/testing/overrides/node_docker.yaml b/test/clusterloader2/testing/overrides/node_docker.yaml new file mode 100644 index 00000000000..5b70e144110 --- /dev/null +++ b/test/clusterloader2/testing/overrides/node_docker.yaml @@ -0,0 +1 @@ +POD_THROUGHPUT: 2 diff --git 
a/test/clusterloader2/testing/overrides/watch_list_off.yaml b/test/clusterloader2/testing/overrides/watch_list_off.yaml new file mode 100644 index 00000000000..0d282a75d00 --- /dev/null +++ b/test/clusterloader2/testing/overrides/watch_list_off.yaml @@ -0,0 +1,8 @@ +# a higher latency is expected +# due to the size of the total payload +CUSTOM_API_CALL_THRESHOLDS: | + - verb: LIST + resource: secrets + subresource: '' + scope: namespace + threshold: 60s diff --git a/test/clusterloader2/testing/prometheus/not-scrape-kube-proxy.yaml b/test/clusterloader2/testing/prometheus/not-scrape-kube-proxy.yaml new file mode 100644 index 00000000000..72b16ee590b --- /dev/null +++ b/test/clusterloader2/testing/prometheus/not-scrape-kube-proxy.yaml @@ -0,0 +1 @@ +PROMETHEUS_SCRAPE_KUBE_PROXY: false diff --git a/test/clusterloader2/testing/prometheus/scrape-anet.yaml b/test/clusterloader2/testing/prometheus/scrape-anet.yaml new file mode 100644 index 00000000000..982387991ba --- /dev/null +++ b/test/clusterloader2/testing/prometheus/scrape-anet.yaml @@ -0,0 +1 @@ +PROMETHEUS_SCRAPE_ANET: true diff --git a/test/clusterloader2/testing/prometheus/scrape-etcd.yaml b/test/clusterloader2/testing/prometheus/scrape-etcd.yaml new file mode 100644 index 00000000000..1af5348ac0a --- /dev/null +++ b/test/clusterloader2/testing/prometheus/scrape-etcd.yaml @@ -0,0 +1 @@ +PROMETHEUS_SCRAPE_ETCD: true diff --git a/test/clusterloader2/testing/prometheus/scrape-kube-network-policies.yaml b/test/clusterloader2/testing/prometheus/scrape-kube-network-policies.yaml new file mode 100644 index 00000000000..42cf84a4beb --- /dev/null +++ b/test/clusterloader2/testing/prometheus/scrape-kube-network-policies.yaml @@ -0,0 +1 @@ +PROMETHEUS_SCRAPE_KUBE_NETWORK_POLICIES: true diff --git a/test/clusterloader2/testing/prometheus/scrape-node-exporter.yaml b/test/clusterloader2/testing/prometheus/scrape-node-exporter.yaml new file mode 100644 index 00000000000..7798bbc4c39 --- /dev/null +++ b/test/clusterloader2/testing/prometheus/scrape-node-exporter.yaml @@ -0,0 +1 @@ +PROMETHEUS_SCRAPE_NODE_EXPORTER: true diff --git a/test/clusterloader2/testing/request-benchmark/cluster-role-binding.yaml b/test/clusterloader2/testing/request-benchmark/cluster-role-binding.yaml new file mode 100644 index 00000000000..f614bf77dde --- /dev/null +++ b/test/clusterloader2/testing/request-benchmark/cluster-role-binding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{.Name}} + namespace: {{$.Namespace}} +subjects: +- kind: ServiceAccount + name: default + namespace: {{$.Namespace}} +roleRef: + kind: Role + name: benchmark-role-0 + apiGroup: rbac.authorization.k8s.io diff --git a/test/clusterloader2/testing/request-benchmark/cluster-role.yaml b/test/clusterloader2/testing/request-benchmark/cluster-role.yaml new file mode 100644 index 00000000000..6cbbfa8a6dc --- /dev/null +++ b/test/clusterloader2/testing/request-benchmark/cluster-role.yaml @@ -0,0 +1,9 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: benchmark-role + namespace: {{$.Namespace}} +rules: +- apiGroups: [""] + resources: ["*"] + verbs: ["*"] diff --git a/test/clusterloader2/testing/request-benchmark/config.yaml b/test/clusterloader2/testing/request-benchmark/config.yaml new file mode 100644 index 00000000000..e8cb6eeb5f3 --- /dev/null +++ b/test/clusterloader2/testing/request-benchmark/config.yaml @@ -0,0 +1,68 @@ +# Request benchmark +{{$configMapBytes := DefaultParam .CL2_BENCHMARK_CONFIG_MAP_BYTES 1000}} 
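+# The inflight/uri/qps values below are passed to the benchmark pods as flags
+# (--inflight, --uri, --qps; see deployment.yaml). Example override for a
+# configmap LIST benchmark, with illustrative values:
+#   CL2_BENCHMARK_CONFIG_MAP_NUMBER: 100
+#   CL2_BENCHMARK_INFLIGHT: 10
+#   CL2_BENCHMARK_URI: /api/v1/namespaces/%namespace%/configmaps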
+{{$configMapGroup := DefaultParam .CL2_BENCHMARK_CONFIG_MAP_GROUP "benchmark-config-map"}} +{{$configMapNumber := DefaultParam .CL2_BENCHMARK_CONFIG_MAP_NUMBER 1}} +{{$benchmarkReplicas := DefaultParam .CL2_BENCHMARK_PODS 1}} + +{{$inflight := DefaultParam .CL2_BENCHMARK_INFLIGHT 10}} +{{$qps := DefaultParam .CL2_BENCHMARK_QPS -1}} +{{$uri := DefaultParam .CL2_BENCHMARK_URI ""}} # URI example: /api/v1/namespaces/%namespace%/pods + +name: Request benchmark +namespace: + number: 1 +tuningSets: +- name: Sequence + parallelismLimitedLoad: + parallelismLimit: 10 +steps: +- name: Setup permissions + phases: + - namespaceRange: + min: 1 + max: 1 + tuningSet: Sequence + replicasPerNamespace: 1 + objectBundle: + - basename: benchmark-role + objectTemplatePath: cluster-role.yaml + - namespaceRange: + min: 1 + max: 1 + tuningSet: Sequence + replicasPerNamespace: 1 + objectBundle: + - basename: benchmark-role-binding + objectTemplatePath: cluster-role-binding.yaml +- name: Create config map(s) + phases: + - namespaceRange: + min: 1 + max: 1 + tuningSet: Sequence + replicasPerNamespace: {{$configMapNumber}} + objectBundle: + - basename: {{$configMapGroup}} + objectTemplatePath: configmap.yaml + templateFillMap: + bytes: {{$configMapBytes}} + group: {{$configMapGroup}} +- module: + path: modules/measurements.yaml + params: + name: baseline +- module: + path: modules/benchmark-deployment.yaml + params: + replicas: {{$benchmarkReplicas}} + inflight: {{$inflight}} + uri: {{$uri}} + qps: {{$qps}} +- module: + path: modules/measurements.yaml + params: + name: benchmark +- module: + path: modules/benchmark-deployment.yaml + params: + replicas: 0 diff --git a/test/clusterloader2/testing/request-benchmark/configmap.yaml b/test/clusterloader2/testing/request-benchmark/configmap.yaml new file mode 100644 index 00000000000..6e0582cb9d0 --- /dev/null +++ b/test/clusterloader2/testing/request-benchmark/configmap.yaml @@ -0,0 +1,11 @@ +{{$bytes := .bytes}} +{{$group := DefaultParam .group .Name}} + +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{.Name}} + labels: + group: {{$group}} +data: + key: "{{RandData $bytes}}" diff --git a/test/clusterloader2/testing/request-benchmark/deployment.yaml b/test/clusterloader2/testing/request-benchmark/deployment.yaml new file mode 100644 index 00000000000..fb907150020 --- /dev/null +++ b/test/clusterloader2/testing/request-benchmark/deployment.yaml @@ -0,0 +1,37 @@ +{{$image := DefaultParam .CL2_BENCHMARK_IMAGE "gcr.io/k8s-testimages/perf-tests-util/request-benchmark:latest"}} +{{$cpu := DefaultParam .CL2_BENCHMARK_POD_CPU (AddInt .Inflight 1)}} +{{$memory := DefaultParam .CL2_BENCHMARK_POD_MEMORY "100Mi"}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: benchmark +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: benchmark + spec: + containers: + - name: {{.Name}} + image: {{$image}} + imagePullPolicy: Always + args: + - --inflight={{.Inflight}} + - --namespace={{.Namespace}} + - --uri={{.Uri}} + - --qps={{.QPS}} + resources: + requests: + cpu: {{$cpu}} + memory: {{$memory}} + limits: + cpu: {{$cpu}} + memory: {{$memory}} diff --git a/test/clusterloader2/testing/request-benchmark/modules/benchmark-deployment.yaml b/test/clusterloader2/testing/request-benchmark/modules/benchmark-deployment.yaml new file mode 100644 index 00000000000..16bfd209e33 --- /dev/null +++ b/test/clusterloader2/testing/request-benchmark/modules/benchmark-deployment.yaml @@ 
-0,0 +1,38 @@ +{{$replicas := DefaultParam .replicas 0}} +{{$inflight := DefaultParam .inflight 0}} +{{$uri := DefaultParam .uri "/"}} +{{$qps := DefaultParam .qps -1}} + +steps: +- name: Creating WaitForControlledPodsRunning measurement + measurements: + - Identifier: WaitForBenchmarkDeployment + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + checkIfPodsAreUpdated: true + kind: Deployment + labelSelector: group = benchmark + operationTimeout: 5m +- name: Deploying benchmark + phases: + - namespaceRange: + min: 1 + max: 1 + tuningSet: Sequence + replicasPerNamespace: 1 + objectBundle: + - basename: benchmark-deployment + objectTemplatePath: deployment.yaml + templateFillMap: + Replicas: {{$replicas}} + Inflight: {{$inflight}} + Uri: {{$uri}} + QPS: {{$qps}} +- name: Waiting for WaitForControlledPodsRunning gather + measurements: + - Identifier: WaitForBenchmarkDeployment + Method: WaitForControlledPodsRunning + Params: + action: gather diff --git a/test/clusterloader2/testing/request-benchmark/modules/measurements.yaml b/test/clusterloader2/testing/request-benchmark/modules/measurements.yaml new file mode 100644 index 00000000000..3a0d57518aa --- /dev/null +++ b/test/clusterloader2/testing/request-benchmark/modules/measurements.yaml @@ -0,0 +1,63 @@ +## Measurement module defines test scoped measurement. + +steps: +- name: Wait 1 minute + measurements: + - Identifier: Wait + Method: Sleep + Params: + duration: 1m +- name: "Starting measurement - {{.name}}" + measurements: + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: start + enableViolations: false + useSimpleLatencyQuery: true + - Identifier: ContainerCPU-{{.name}} + Method: GenericPrometheusQuery + Params: + action: start + metricName: Container CPU + metricVersion: v1 + unit: cores + dimensions: + - container + queries: + - name: Perc99 + query: quantile_over_time(0.99, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) + - name: Perc90 + query: quantile_over_time(0.90, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) + - name: Perc50 + query: quantile_over_time(0.50, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) +- name: Wait 5 minutes + measurements: + - Identifier: Wait + Method: Sleep + Params: + duration: 5m +- name: "Gathering measurement - {{.name}}" + measurements: + - Identifier: ContainerCPU-{{.name}} + Method: GenericPrometheusQuery + Params: + action: gather + metricName: Container CPU + metricVersion: v1 + unit: cores + dimensions: + - container + queries: + - name: Perc99 + query: quantile_over_time(0.99, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) + - name: Perc90 + query: quantile_over_time(0.90, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) + - name: Perc50 + query: quantile_over_time(0.50, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:]) + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: false + useSimpleLatencyQuery: true diff --git a/test/clusterloader2/testing/scheduler-throughput/config.yaml b/test/clusterloader2/testing/scheduler-throughput/config.yaml new file mode 100644 index 00000000000..ff98a335ad7 --- /dev/null +++ b/test/clusterloader2/testing/scheduler-throughput/config.yaml @@ -0,0 +1,80 @@ +{{$totalSchedulerThroughputPods := DefaultParam .CL2_SCHEDULER_THROUGHPUT_PODS 5000}} 
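+# With the defaults here (5000 pods created through the UniformQPS tuning set
+# at 500 QPS), the create phase alone issues roughly 5000 / 500 = 10s worth of
+# pod-create calls; the gathered SchedulingThroughput is then checked against
+# the 400 pods/s threshold below.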
+{{$defaultQps := DefaultParam .CL2_DEFAULT_QPS 500}} +{{$defaultBurst := DefaultParam .CL2_DEFAULT_BURST 1000}} +{{$uniformQps := DefaultParam .CL2_UNIFORM_QPS 500}} + +{{$SCHEDULER_THROUGHPUT_THRESHOLD := DefaultParam .CL2_SCHEDULER_THROUGHPUT_THRESHOLD 400}} + +name: direct-scheduler-throughput +namespace: + number: 1 +tuningSets: +# default is a tuningset that is meant to be used when we don't have any specific requirements on pace of operations. +- name: default + globalQPSLoad: + qps: {{$defaultQps}} + burst: {{$defaultBurst}} +- name: UniformQPS + qpsLoad: + qps: {{$uniformQps}} +steps: +- name: Creating scheduler throughput measurements + measurements: + - Identifier: DirectSchedulerThroughputPodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = direct-scheduler-throughput + threshold: 5s + - Identifier: DirectSchedulingThroughput +# TODO: Move to SchedulingThroughputPrometheus which requires cl2 prom stack setup as pre-req + Method: SchedulingThroughput + Params: + action: start + labelSelector: group = direct-scheduler-throughput + measurmentInterval: 1s +- name: create scheduler throughput pods + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: {{$totalSchedulerThroughputPods}} + tuningSet: UniformQPS + objectBundle: + - basename: direct-scheduler-throughput-pod + objectTemplatePath: pod-default.yaml + templateFillMap: + Group: direct-scheduler-throughput +- name: Waiting for scheduler throughput pods to be created + measurements: + - Identifier: WaitForDirectSchedulerThroughputPods + Method: WaitForRunningPods + Params: + action: gather + timeout: 5m + desiredPodCount: {{$totalSchedulerThroughputPods}} + labelSelector: group = direct-scheduler-throughput +- name: Collecting scheduler throughput measurements + measurements: + - Identifier: DirectSchedulerThroughputPodStartupLatency + Method: PodStartupLatency + Params: + action: gather + - Identifier: DirectSchedulingThroughput + Method: SchedulingThroughput + Params: + action: gather + enableViolations: true + threshold: {{$SCHEDULER_THROUGHPUT_THRESHOLD}} +- name: Delete scheduler throughput pods + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 0 + tuningSet: default + objectBundle: + - basename: direct-scheduler-throughput-pod + objectTemplatePath: pod-default.yaml + templateFillMap: + Group: direct-scheduler-throughput \ No newline at end of file diff --git a/test/clusterloader2/testing/scheduler-throughput/pod-default.yaml b/test/clusterloader2/testing/scheduler-throughput/pod-default.yaml new file mode 100644 index 00000000000..4885d7ec8d1 --- /dev/null +++ b/test/clusterloader2/testing/scheduler-throughput/pod-default.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: Pod +metadata: + generateName: pod-churn- + labels: + group: {{.Group}} +spec: + containers: + - image: registry.k8s.io/pause:3.9 + name: pause \ No newline at end of file diff --git a/test/clusterloader2/testing/watch-list/config.yaml b/test/clusterloader2/testing/watch-list/config.yaml new file mode 100644 index 00000000000..74c7dee4ea9 --- /dev/null +++ b/test/clusterloader2/testing/watch-list/config.yaml @@ -0,0 +1,77 @@ +{{$enableWatchListFeature := DefaultParam .CL2_ENABLE_WATCH_LIST_FEATURE false}} +{{$testDuration := "5m"}} +{{$customApiCallThresholds := DefaultParam .CUSTOM_API_CALL_THRESHOLDS ""}} +name: watch-list +namespace: + number: 1 + prefix: "watch-list" +tuningSets: +- name: Uniform10qps + qpsLoad: + qps: 10 +steps: +- name: Create secrets + phases: + - 
namespaceRange: + min: 1 + max: 1 + basename: watch-list + replicasPerNamespace: 400 + tuningSet: Uniform10qps + objectBundle: + - basename: huge-secret + objectTemplatePath: "secret.yaml" +- name: Start measurements + measurements: + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: start + - Identifier: WaitForFinishedJobs + Method: WaitForFinishedJobs + Params: + action: start + labelSelector: group = watch-list + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: start +- name: Start the secret informers + phases: + - namespaceRange: + min: 1 + max: 1 + basename: watch-list + replicasPerNamespace: 2 + tuningSet: Uniform10qps + objectBundle: + - basename: watch-list-secret + objectTemplatePath: role.yaml + - basename: watch-list-secret + objectTemplatePath: roleBinding.yaml + - basename: watch-list + objectTemplatePath: "job.yaml" + templateFillMap: + Duration: {{$testDuration}} + EnableWatchListFeature: {{$enableWatchListFeature}} +- name: Wait for the secret informer job to finish + measurements: + - Identifier: WaitForFinishedJobs + Method: WaitForFinishedJobs + Params: + action: gather + timeout: 10m +- name: Collecting test measurements + measurements: + - Identifier: TestMetrics + Method: TestMetrics + Params: + action: gather + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: true + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple + customThresholds: {{YamlQuote $customApiCallThresholds 4}} diff --git a/test/clusterloader2/testing/watch-list/job.yaml b/test/clusterloader2/testing/watch-list/job.yaml new file mode 100644 index 00000000000..b1b947022eb --- /dev/null +++ b/test/clusterloader2/testing/watch-list/job.yaml @@ -0,0 +1,25 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{.Name}} + labels: + group: watch-list +spec: + template: + metadata: + labels: + group: watch-list + spec: + containers: + - name: {{.Name}} + image: gcr.io/k8s-staging-perf-tests/watch-list:v0.0.1 + resources: + requests: + memory: "16Gi" + cpu: "6" + limits: + memory: "16Gi" + cpu: "6" + command: [ "watch-list" ] + args: [ "--alsologtostderr=true", "--v=4", "--timeout={{.Duration}}", "--count=16", "--namespace=watch-list-1", "--enableWatchListFeature={{.EnableWatchListFeature}}"] + restartPolicy: Never diff --git a/test/clusterloader2/testing/watch-list/role.yaml b/test/clusterloader2/testing/watch-list/role.yaml new file mode 100644 index 00000000000..f036f075013 --- /dev/null +++ b/test/clusterloader2/testing/watch-list/role.yaml @@ -0,0 +1,8 @@ +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{.Name}} +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch"] diff --git a/test/clusterloader2/testing/watch-list/roleBinding.yaml b/test/clusterloader2/testing/watch-list/roleBinding.yaml new file mode 100644 index 00000000000..09f2d57d22a --- /dev/null +++ b/test/clusterloader2/testing/watch-list/roleBinding.yaml @@ -0,0 +1,11 @@ +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{.Name}} +subjects: + - kind: ServiceAccount + name: default +roleRef: + kind: Role + name: {{.Name}} + apiGroup: rbac.authorization.k8s.io diff --git a/test/clusterloader2/testing/watch-list/secret.yaml b/test/clusterloader2/testing/watch-list/secret.yaml new file mode 100644 index 00000000000..9493d3d1acf --- /dev/null +++ 
b/test/clusterloader2/testing/watch-list/secret.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{.Name}} +type: Opaque +stringData: +# To get ~1MB of payload after base64 serialisation we need roughly: +# 1,000,000 bytes / 1.33 ≈ 750K bytes of raw data (base64 adds ~33%), rounded up to 760K here + password: {{RandData 760000}} diff --git a/test/clusterloader2/testing/windows-tests/OWNERS b/test/clusterloader2/testing/windows-tests/OWNERS new file mode 100644 index 00000000000..c1ae79437d6 --- /dev/null +++ b/test/clusterloader2/testing/windows-tests/OWNERS @@ -0,0 +1,3 @@ +approvers: +- pjh +- YangLu1031 diff --git a/test/clusterloader2/testing/windows-tests/config.yaml b/test/clusterloader2/testing/windows-tests/config.yaml new file mode 100644 index 00000000000..90034e0d0e3 --- /dev/null +++ b/test/clusterloader2/testing/windows-tests/config.yaml @@ -0,0 +1,92 @@ +# ASSUMPTIONS: +# - This test is designed for a 1-node cluster. + +# Constants +{{$POD_COUNT := DefaultParam .CL2_POD_COUNT 80}} +{{$POD_THROUGHPUT := DefaultParam .CL2_POD_THROUGHPUT 0.03}} +{{$CONTAINER_IMAGE := DefaultParam .CL2_CONTAINER_IMAGE "registry.k8s.io/pause:3.9"}} +{{$POD_STARTUP_LATENCY_THRESHOLD := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "60m"}} +{{$OPERATION_TIMEOUT := DefaultParam .CL2_OPERATION_TIMEOUT "90m"}} + +name: node-throughput +namespace: + number: {{$POD_COUNT}} +tuningSets: +- name: UniformQPS + qpsLoad: + qps: {{$POD_THROUGHPUT}} +steps: +- measurements: + - Identifier: WindowsResourceUsagePrometheus + Method: WindowsResourceUsagePrometheus + Params: + action: start + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: start + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: group = latency + threshold: {{$POD_STARTUP_LATENCY_THRESHOLD}} +- measurements: + - Identifier: WaitForRunningLatencyRCs + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: v1 + kind: ReplicationController + labelSelector: group = latency + operationTimeout: {{$OPERATION_TIMEOUT}} +- phases: + - namespaceRange: + min: 1 + max: {{$POD_COUNT}} + replicasPerNamespace: 1 + tuningSet: UniformQPS + objectBundle: + - basename: latency-pod-rc + objectTemplatePath: rc.yaml + templateFillMap: + Replicas: 1 + Group: latency + Image: {{$CONTAINER_IMAGE}} +- measurements: + - Identifier: WaitForRunningLatencyRCs + Method: WaitForControlledPodsRunning + Params: + action: gather + - Identifier: WindowsResourceUsagePrometheus + Method: WindowsResourceUsagePrometheus + Params: + action: gather +- phases: + - namespaceRange: + min: 1 + max: {{$POD_COUNT}} + replicasPerNamespace: 0 + tuningSet: UniformQPS + objectBundle: + - basename: latency-pod-rc + objectTemplatePath: rc.yaml +- measurements: + - Identifier: WaitForRunningLatencyRCs + Method: WaitForControlledPodsRunning + Params: + action: gather +# Collect measurements +- measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather +- measurements: + - Identifier: APIResponsivenessPrometheusSimple + Method: APIResponsivenessPrometheus + Params: + action: gather + enableViolations: true + useSimpleLatencyQuery: true + summaryName: APIResponsivenessPrometheus_simple diff --git a/test/clusterloader2/testing/windows-tests/rc.yaml b/test/clusterloader2/testing/windows-tests/rc.yaml new file mode 100644 index 00000000000..9104c028ea4 --- /dev/null +++ b/test/clusterloader2/testing/windows-tests/rc.yaml @@ -0,0 +1,40 @@ +apiVersion: v1 +kind: 
ReplicationController +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} + spec: + # Do not automount default service account, to eliminate its impact. + automountServiceAccountToken: false + nodeSelector: + kubernetes.io/os: windows + containers: + - image: {{.Image}} + imagePullPolicy: IfNotPresent + name: {{.Name}} + ports: + - containerPort: 80 + tolerations: + - key: "node.kubernetes.io/os" + operator: "Exists" + effect: "NoSchedule" + # Add not-ready/unreachable tolerations for 15 minutes so that node + # failure doesn't trigger pod deletion. + - key: "node.kubernetes.io/not-ready" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 + - key: "node.kubernetes.io/unreachable" + operator: "Exists" + effect: "NoExecute" + tolerationSeconds: 900 diff --git a/test/clusterloader2/testing/windows-tests/windows_override.yaml b/test/clusterloader2/testing/windows-tests/windows_override.yaml new file mode 100644 index 00000000000..985083ce08a --- /dev/null +++ b/test/clusterloader2/testing/windows-tests/windows_override.yaml @@ -0,0 +1,2 @@ +WINDOWS_NODE_TEST: true +PROMETHEUS_SCRAPE_WINDOWS_NODE_EXPORTER: true
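Note on test/clusterloader2/testing/request-benchmark/config.yaml: the "Deploying benchmark" phase fills Replicas, Inflight, Uri and QPS into deployment.yaml, which is not part of the hunks shown here. A minimal sketch of what such a template could look like, assuming a hypothetical benchmark image and flag names (only the group: benchmark label and the four templateFillMap keys come from the config; everything else is illustrative):

apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{.Name}}
  labels:
    group: benchmark            # matched by the WaitForControlledPodsRunning labelSelector
spec:
  replicas: {{.Replicas}}
  selector:
    matchLabels:
      name: {{.Name}}
  template:
    metadata:
      labels:
        name: {{.Name}}
        group: benchmark
    spec:
      containers:
      - name: benchmark
        image: example.invalid/request-benchmark:latest   # placeholder image, not the real one
        args:                                              # hypothetical flags for illustration
        - --inflight={{.Inflight}}
        - --uri={{.Uri}}
        - --qps={{.QPS}}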
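Note on test/clusterloader2/testing/request-benchmark/modules/measurements.yaml: the %v placeholder in the GenericPrometheusQuery queries is substituted by the measurement with the window between its start and gather actions, so with the 5-minute wait in this module the Perc99 entry should resolve to roughly the following (the exact window is an assumption about how GenericPrometheusQuery fills %v):

queries:
- name: Perc99
  # 99th percentile, over the ~5m measurement window, of the per-container 1m CPU usage rate
  query: quantile_over_time(0.99, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[5m:])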
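Note on test/clusterloader2/testing/scheduler-throughput/config.yaml: the throughput pods are created directly from pod-default.yaml with no controller, so with the default 5000 pods and a UniformQPS of 500 the create calls alone are issued over roughly 5000 / 500 = 10 seconds. After templateFillMap substitution, each created object should look roughly like this (exact metadata depends on how clusterloader2 handles generateName):

apiVersion: v1
kind: Pod
metadata:
  generateName: pod-churn-              # the API server appends a random suffix
  labels:
    group: direct-scheduler-throughput  # matched by the throughput and startup-latency measurements
spec:
  containers:
  - image: registry.k8s.io/pause:3.9
    name: pause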
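Note on the test/clusterloader2/testing/watch-list files: the informer jobs run under the default ServiceAccount of the watch-list-1 namespace, and each Role/RoleBinding pair grants that account read access to the 400 roughly 1MB secrets, so a single full list is on the order of 400MB and each job runs 16 informers (--count=16), which is what the 16Gi memory request is sized for. Assuming clusterloader2's usual basename-index naming, the first rendered binding would look roughly like:

kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: watch-list-secret-0        # assumed basename-index naming
subjects:
- kind: ServiceAccount
  name: default                    # the account the informer job pods run as
roleRef:
  kind: Role
  name: watch-list-secret-0
  apiGroup: rbac.authorization.k8s.io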
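Note on test/clusterloader2/testing/windows-tests/config.yaml: with the defaults, 80 ReplicationControllers are created at 0.03 qps, so merely issuing the create calls takes about 80 / 0.03 ≈ 2700s (~45 minutes), which is why operationTimeout is 90m and the pod startup threshold is 60m. The CL2_* parameters can be tuned through an overrides file in the same way as windows_override.yaml; a sketch with illustrative values (only the keys come from the config above):

CL2_POD_COUNT: 40            # default 80
CL2_POD_THROUGHPUT: 0.06     # default 0.03 pods/s
CL2_OPERATION_TIMEOUT: 45m   # default 90m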