Skip to content

Commit 047d6af

Browse files
authored
Move TF Operator e2e tests to AWS Prow (kubeflow#1204)
1 parent 62f0e0a commit 047d6af

23 files changed

+457
-156
lines changed

manifests/crd.yaml

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,47 @@
1+
apiVersion: apiextensions.k8s.io/v1beta1
2+
kind: CustomResourceDefinition
3+
metadata:
4+
name: tfjobs.kubeflow.org
5+
spec:
6+
group: kubeflow.org
7+
scope: Namespaced
8+
names:
9+
kind: TFJob
10+
singular: tfjob
11+
plural: tfjobs
12+
versions:
13+
- name: v1
14+
served: true
15+
storage: true
16+
subresources:
17+
status: {}
18+
validation:
19+
openAPIV3Schema:
20+
properties:
21+
spec:
22+
properties:
23+
tfReplicaSpecs:
24+
properties:
25+
# Validation is applied only to the replica types listed below:
26+
# `Worker`, `PS`, `Chief` and `Evaluator`. Any other replica type is not validated.
27+
Worker:
28+
properties:
29+
replicas:
30+
type: integer
31+
minimum: 1
32+
PS:
33+
properties:
34+
replicas:
35+
type: integer
36+
minimum: 1
37+
Chief:
38+
properties:
39+
replicas:
40+
type: integer
41+
minimum: 1
42+
maximum: 1
43+
Evaluator:
44+
properties:
45+
replicas:
46+
type: integer
47+
minimum: 0

manifests/deployment.yaml

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,30 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: tf-job-operator
5+
namespace: kubeflow
6+
spec:
7+
replicas: 1
8+
selector:
9+
matchLabels:
10+
name: tf-job-operator
11+
template:
12+
metadata:
13+
labels:
14+
name: tf-job-operator
15+
spec:
16+
containers:
17+
- args:
18+
- --monitoring-port=8443
19+
env:
20+
- name: MY_POD_NAMESPACE
21+
valueFrom:
22+
fieldRef:
23+
fieldPath: metadata.namespace
24+
- name: MY_POD_NAME
25+
valueFrom:
26+
fieldRef:
27+
fieldPath: metadata.name
28+
image: gcr.io/kubeflow-images-public/tf-operator:v0.6.0
29+
name: tf-job-operator
30+
serviceAccountName: tf-job-operator

manifests/kustomization.yaml

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,15 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
namespace: kubeflow
4+
resources:
5+
- crd.yaml
6+
- namespace.yaml
7+
- rbac.yaml
8+
- deployment.yaml
9+
- service.yaml
10+
commonLabels:
11+
kustomize.component: tf-job-operator
12+
images:
13+
- name: gcr.io/kubeflow-images-public/tf-operator
14+
newName: 809251082950.dkr.ecr.us-west-2.amazonaws.com/tf-operator
15+
newTag: "0.1"

manifests/namespace.yaml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
apiVersion: v1
2+
kind: Namespace
3+
metadata:
4+
name: kubeflow

manifests/podgroup.yaml

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,39 @@
1+
apiVersion: apiextensions.k8s.io/v1beta1
2+
kind: CustomResourceDefinition
3+
metadata:
4+
name: podgroups.scheduling.incubator.k8s.io
5+
spec:
6+
group: scheduling.incubator.k8s.io
7+
names:
8+
kind: PodGroup
9+
plural: podgroups
10+
scope: Namespaced
11+
validation:
12+
openAPIV3Schema:
13+
properties:
14+
apiVersion:
15+
type: string
16+
kind:
17+
type: string
18+
metadata:
19+
type: object
20+
spec:
21+
properties:
22+
minMember:
23+
format: int32
24+
type: integer
25+
type: object
26+
status:
27+
properties:
28+
succeeded:
29+
format: int32
30+
type: integer
31+
failed:
32+
format: int32
33+
type: integer
34+
running:
35+
format: int32
36+
type: integer
37+
type: object
38+
type: object
39+
version: v1alpha1

manifests/rbac.yaml

Lines changed: 61 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,61 @@
1+
apiVersion: v1
2+
kind: ServiceAccount
3+
metadata:
4+
labels:
5+
app: tf-job-operator
6+
name: tf-job-operator
7+
namespace: kubeflow
8+
---
9+
apiVersion: rbac.authorization.k8s.io/v1beta1
10+
kind: ClusterRole
11+
metadata:
12+
labels:
13+
app: tf-job-operator
14+
name: tf-job-operator
15+
rules:
16+
- apiGroups:
17+
- kubeflow.org
18+
resources:
19+
- tfjobs
20+
- tfjobs/status
21+
- tfjobs/finalizers
22+
verbs:
23+
- '*'
24+
- apiGroups:
25+
- apiextensions.k8s.io
26+
resources:
27+
- customresourcedefinitions
28+
verbs:
29+
- '*'
30+
- apiGroups:
31+
- ""
32+
resources:
33+
- pods
34+
- services
35+
- endpoints
36+
- events
37+
verbs:
38+
- '*'
39+
- apiGroups:
40+
- apps
41+
- extensions
42+
resources:
43+
- deployments
44+
verbs:
45+
- '*'
46+
---
47+
apiVersion: rbac.authorization.k8s.io/v1beta1
48+
kind: ClusterRoleBinding
49+
metadata:
50+
labels:
51+
app: tf-job-operator
52+
name: tf-job-operator
53+
roleRef:
54+
apiGroup: rbac.authorization.k8s.io
55+
kind: ClusterRole
56+
name: tf-job-operator
57+
subjects:
58+
- kind: ServiceAccount
59+
name: tf-job-operator
60+
namespace: kubeflow
61+
---

manifests/service.yaml

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,19 @@
1+
apiVersion: v1
2+
kind: Service
3+
metadata:
4+
annotations:
5+
prometheus.io/path: /metrics
6+
prometheus.io/scrape: "true"
7+
prometheus.io/port: "8443"
8+
labels:
9+
app: tf-job-operator
10+
name: tf-job-operator
11+
namespace: kubeflow
12+
spec:
13+
ports:
14+
- name: monitoring-port
15+
port: 8443
16+
targetPort: 8443
17+
selector:
18+
name: tf-job-operator
19+
type: ClusterIP

py/kubeflow/tf_operator/cleanpod_policy_tests.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33

44
from kubeflow.testing import ks_util, test_util, util
55
from kubeflow.tf_operator import k8s_util, test_runner, tf_job_client
6+
from kubeflow.tf_operator import util as tf_operator_util
67
from kubernetes import client as k8s_client
78

89
CLEANPOD_ALL_COMPONENT_NAME = "clean_pod_all"
@@ -23,11 +24,12 @@ def __init__(self, args):
2324
class_name="CleanPodPolicyTests", name=name)
2425

2526
def run_tfjob_with_cleanpod_policy(self, component, clean_pod_policy):
27+
tf_operator_util.load_kube_config()
2628
api_client = k8s_client.ApiClient()
2729

2830
# Setup the ksonnet app
2931
ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
30-
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
32+
tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
3133
self.params)
3234

3335
# Create the TF job

py/kubeflow/tf_operator/deploy.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,7 @@ def ks_deploy(app_dir, component, params, env=None, account=None):
9090
if not re.search(".*environment.*already exists.*", e.output):
9191
raise
9292

93-
for k, v in params.iteritems():
93+
for k, v in params.items():
9494
util.run([ks_cmd, "param", "set", "--env=" + env, component, k, v],
9595
cwd=app_dir)
9696

py/kubeflow/tf_operator/distributed_training_tests.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33

44
from kubeflow.testing import ks_util, test_util, util
55
from kubeflow.tf_operator import test_runner, tf_job_client
6+
from kubeflow.tf_operator import util as tf_operator_util
67
from kubernetes import client as k8s_client
78

89
TFJOB_COMPONENT_NAME = "distributed_training"
@@ -23,10 +24,11 @@ def __init__(self, args):
2324
# Run a distributed training TFJob, wait for it to complete, and check for pod/service
2425
# creation errors.
2526
def run_distributed_training_job(self, component):
27+
tf_operator_util.load_kube_config()
2628
api_client = k8s_client.ApiClient()
2729

2830
# Setup the ksonnet app
29-
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
31+
tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
3032
self.params)
3133

3234
# Create the TF job

0 commit comments

Comments
 (0)