
[Feature][RayCluster]: Deprecate the RayCluster .Status.State field #2288

Merged
2 changes: 2 additions & 0 deletions ray-operator/apis/ray/v1/raycluster_types.go
@@ -121,6 +121,8 @@ type RayClusterStatus struct {
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
// Important: Run "make" to regenerate code after modifying this file
// Status reflects the status of the cluster
+ //
+ // Deprecated: the State field is replaced by the Conditions field.
Review thread on the new Deprecated comment:

Member: Should we include the names of the conditions that replace the existing states here?

Member: Or will there be public docs on the new conditions? If so, we can just link to that in a follow-up PR.

Contributor (Author): Yes, we plan to publish a public doc introducing the new conditions and the changes to .status.state in the coming weeks.

State ClusterState `json:"state,omitempty"`
// DesiredCPU indicates total desired CPUs for the cluster
DesiredCPU resource.Quantity `json:"desiredCPU,omitempty"`
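For consumers migrating off the deprecated field, the replacement is the standard Kubernetes conditions pattern: Status.Conditions holds []metav1.Condition entries queried through the apimachinery meta helpers. Below is a minimal, self-contained sketch of the consumer-side idiom; the condition type name "HeadPodReady" is assumed for illustration only, since the authoritative names are deferred to the public docs mentioned in the review thread above.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Conditions as a RayCluster status might report them once State is gone.
	conditions := []metav1.Condition{
		{
			Type:               "HeadPodReady", // assumed name, for illustration
			Status:             metav1.ConditionTrue,
			Reason:             "HeadPodRunningAndReady",
			LastTransitionTime: metav1.Now(),
		},
	}

	// FindStatusCondition/IsStatusConditionTrue replace ad-hoc State checks.
	if c := meta.FindStatusCondition(conditions, "HeadPodReady"); c != nil {
		fmt.Printf("%s=%s since %s (reason: %s)\n",
			c.Type, c.Status, c.LastTransitionTime.Format("2006-01-02"), c.Reason)
	}
}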
2 changes: 2 additions & 0 deletions ray-operator/apis/ray/v1alpha1/raycluster_types.go
@@ -133,6 +133,8 @@ type RayClusterStatus struct {
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
// Important: Run "make" to regenerate code after modifying this file
// Status reflects the status of the cluster
+ //
+ // Deprecated: the State field is replaced by the Conditions field.
State ClusterState `json:"state,omitempty"`
// Reason provides more information about current State
Reason string `json:"reason,omitempty"`
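The "Deprecated:" doc-comment prefix is the convention that staticcheck's SA1019 check keys on, which is why every remaining read or write of State in the hunks below carries a //nolint:staticcheck directive (golangci-lint syntax) pointing back to this PR. A minimal sketch of the pattern, with illustrative names rather than KubeRay's real types:

package main

import "fmt"

type status struct {
	// Deprecated: State is replaced by Conditions.
	State string

	Conditions []string
}

func main() {
	s := status{State: "ready"}
	// staticcheck (SA1019) reports uses of identifiers whose doc comment
	// starts with "Deprecated:"; the inline directive silences that one site.
	fmt.Println(s.State) //nolint:staticcheck // tracked in the deprecation PR
}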
13 changes: 7 additions & 6 deletions ray-operator/controllers/ray/raycluster_controller.go
@@ -364,10 +364,11 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, request
// this field should be used to determine whether to update this CR or not.
func (r *RayClusterReconciler) inconsistentRayClusterStatus(ctx context.Context, oldStatus rayv1.RayClusterStatus, newStatus rayv1.RayClusterStatus) bool {
logger := ctrl.LoggerFrom(ctx)
- if oldStatus.State != newStatus.State || oldStatus.Reason != newStatus.Reason {
+
+ if oldStatus.State != newStatus.State || oldStatus.Reason != newStatus.Reason { //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
logger.Info("inconsistentRayClusterStatus", "detect inconsistency", fmt.Sprintf(
"old State: %s, new State: %s, old Reason: %s, new Reason: %s",
- oldStatus.State, newStatus.State, oldStatus.Reason, newStatus.Reason))
+ oldStatus.State, newStatus.State, oldStatus.Reason, newStatus.Reason)) //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
return true
}
if oldStatus.ReadyWorkerReplicas != newStatus.ReadyWorkerReplicas ||
@@ -1197,7 +1198,7 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
newInstance.Status.DesiredTPU = totalResources[corev1.ResourceName("google.com/tpu")]

if utils.CheckAllPodsRunning(ctx, runtimePods) {
- newInstance.Status.State = rayv1.Ready
+ newInstance.Status.State = rayv1.Ready //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
}

// Check if the head node is running and ready by checking the head pod's status.
@@ -1243,7 +1244,7 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
}

if newInstance.Spec.Suspend != nil && *newInstance.Spec.Suspend && len(runtimePods.Items) == 0 {
- newInstance.Status.State = rayv1.Suspended
+ newInstance.Status.State = rayv1.Suspended //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
}

if err := r.updateEndpoints(ctx, newInstance); err != nil {
@@ -1257,11 +1258,11 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
timeNow := metav1.Now()
newInstance.Status.LastUpdateTime = &timeNow

- if instance.Status.State != newInstance.Status.State {
+ if instance.Status.State != newInstance.Status.State { //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
if newInstance.Status.StateTransitionTimes == nil {
newInstance.Status.StateTransitionTimes = make(map[rayv1.ClusterState]*metav1.Time)
}
- newInstance.Status.StateTransitionTimes[newInstance.Status.State] = &timeNow
+ newInstance.Status.StateTransitionTimes[newInstance.Status.State] = &timeNow //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
}

return newInstance, nil
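Two patterns in this file are worth spelling out: inconsistentRayClusterStatus compares the old and new status and only triggers an update when they differ, and calculateStatus stamps StateTransitionTimes when a new state is entered. A self-contained sketch of the combined flow, using simplified stand-in types rather than KubeRay's actual API:

package main

import (
	"fmt"
	"time"
)

type clusterStatus struct {
	State                string
	Reason               string
	StateTransitionTimes map[string]time.Time
}

func inconsistent(oldS, newS clusterStatus) bool {
	// Mirrors inconsistentRayClusterStatus: State and Reason (plus the
	// replica counts elided here) decide whether an update is worth issuing.
	return oldS.State != newS.State || oldS.Reason != newS.Reason
}

func main() {
	oldS := clusterStatus{}
	newS := clusterStatus{State: "ready"}

	if inconsistent(oldS, newS) {
		// On a real transition, record when the new state was entered,
		// as calculateStatus does with StateTransitionTimes.
		if newS.StateTransitionTimes == nil {
			newS.StateTransitionTimes = make(map[string]time.Time)
		}
		newS.StateTransitionTimes[newS.State] = time.Now()
		fmt.Printf("status changed, would issue an Update: %+v\n", newS)
	}
}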
10 changes: 5 additions & 5 deletions ray-operator/controllers/ray/raycluster_controller_unit_test.go
@@ -1533,7 +1533,7 @@ func TestReconcile_UpdateClusterState(t *testing.T) {
cluster := rayv1.RayCluster{}
err := fakeClient.Get(ctx, namespacedName, &cluster)
assert.Nil(t, err, "Fail to get RayCluster")
- assert.Empty(t, cluster.Status.State, "Cluster state should be empty")
+ assert.Empty(t, cluster.Status.State, "Cluster state should be empty") //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288

testRayClusterReconciler := &RayClusterReconciler{
Client: fakeClient,
@@ -1543,13 +1543,13 @@

state := rayv1.Ready
newTestRayCluster := testRayCluster.DeepCopy()
- newTestRayCluster.Status.State = state
+ newTestRayCluster.Status.State = state //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
err = testRayClusterReconciler.updateRayClusterStatus(ctx, testRayCluster, newTestRayCluster)
assert.Nil(t, err, "Fail to update cluster state")

err = fakeClient.Get(ctx, namespacedName, &cluster)
assert.Nil(t, err, "Fail to get RayCluster after updating state")
- assert.Equal(t, cluster.Status.State, state, "Cluster state should be updated")
+ assert.Equal(t, cluster.Status.State, state, "Cluster state should be updated") //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
}

func TestInconsistentRayClusterStatus(t *testing.T) {
@@ -1593,7 +1593,7 @@ func TestInconsistentRayClusterStatus(t *testing.T) {

// Case 1: `State` is different => return true
newStatus := oldStatus.DeepCopy()
- newStatus.State = rayv1.Suspended
+ newStatus.State = rayv1.Suspended //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
assert.True(t, r.inconsistentRayClusterStatus(ctx, oldStatus, *newStatus))

// Case 2: `Reason` is different => return true
@@ -1884,7 +1884,7 @@ func TestStateTransitionTimes_NoStateChange(t *testing.T) {
}

preUpdateTime := metav1.Now()
- testRayCluster.Status.State = rayv1.Ready
+ testRayCluster.Status.State = rayv1.Ready //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
testRayCluster.Status.StateTransitionTimes = map[rayv1.ClusterState]*metav1.Time{rayv1.Ready: &preUpdateTime}
newInstance, err := r.calculateStatus(ctx, testRayCluster, nil)
assert.Nil(t, err)
4 changes: 2 additions & 2 deletions ray-operator/controllers/ray/rayjob_controller.go
@@ -165,8 +165,8 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)

// Check the current status of RayCluster before submitting.
if clientURL := rayJobInstance.Status.DashboardURL; clientURL == "" {
- if rayClusterInstance.Status.State != rayv1.Ready {
- logger.Info("Wait for the RayCluster.Status.State to be ready before submitting the job.", "RayCluster", rayClusterInstance.Name, "State", rayClusterInstance.Status.State)
+ if rayClusterInstance.Status.State != rayv1.Ready { //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
+ logger.Info("Wait for the RayCluster.Status.State to be ready before submitting the job.", "RayCluster", rayClusterInstance.Name, "State", rayClusterInstance.Status.State) //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
}

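This hunk keeps the existing control flow: until the cluster reports ready, the reconciler returns a result asking controller-runtime to requeue after a fixed delay. A compilable sketch of that gate, rewritten against conditions rather than State; the condition type "RayClusterProvisioned", the helper shape, and the stand-in duration are assumptions, not part of this PR:

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	ctrl "sigs.k8s.io/controller-runtime"
)

const requeueDuration = 3 * time.Second // stand-in for RayJobDefaultRequeueDuration

func waitForCluster(conds []metav1.Condition) (ctrl.Result, bool) {
	// "RayClusterProvisioned" is an assumed condition type, not defined here.
	if !meta.IsStatusConditionTrue(conds, "RayClusterProvisioned") {
		return ctrl.Result{RequeueAfter: requeueDuration}, false
	}
	return ctrl.Result{}, true
}

func main() {
	res, ready := waitForCluster(nil) // no conditions set => not ready yet
	fmt.Println(ready, res.RequeueAfter)
}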
8 changes: 4 additions & 4 deletions ray-operator/controllers/ray/rayjob_controller_test.go
@@ -239,7 +239,7 @@ var _ = Context("RayJob in K8sJobMode", func() {

It("Make RayCluster.Status.State to be rayv1.Ready", func() {
// The RayCluster is not 'Ready' yet because Pods are not running and ready.
- Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready))
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288

updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace)
updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace)
@@ -364,7 +364,7 @@ var _ = Context("RayJob in K8sJobMode", func() {

It("Make RayCluster.Status.State to be rayv1.Ready", func() {
// The RayCluster is not 'Ready' yet because Pods are not running and ready.
- Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready))
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288

updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace)
updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace)
@@ -526,7 +526,7 @@ var _ = Context("RayJob in K8sJobMode", func() {

It("Make RayCluster.Status.State to be rayv1.Ready", func() {
// The RayCluster is not 'Ready' yet because Pods are not running and ready.
- Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready))
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288

updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace)
updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace)
@@ -619,7 +619,7 @@ var _ = Context("RayJob in K8sJobMode", func() {

It("Make RayCluster.Status.State to be rayv1.Ready (attempt 2)", func() {
// The RayCluster is not 'Ready' yet because Pods are not running and ready.
- Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready))
+ Expect(rayCluster.Status.State).NotTo(Equal(rayv1.Ready)) //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288

updateHeadPodToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace)
updateWorkerPodsToRunningAndReady(ctx, rayJob.Status.RayClusterName, namespace)
2 changes: 1 addition & 1 deletion ray-operator/controllers/ray/suite_helpers_test.go
@@ -46,7 +46,7 @@ func getClusterState(ctx context.Context, namespace string, clusterName string)
if err := k8sClient.Get(ctx, client.ObjectKey{Namespace: namespace, Name: clusterName}, &cluster); err != nil {
log.Fatal(err)
}
- return cluster.Status.State
+ return cluster.Status.State //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
}
}

2 changes: 1 addition & 1 deletion ray-operator/test/support/ray.go
@@ -75,7 +75,7 @@ func GetRayCluster(t Test, namespace, name string) *rayv1.RayCluster {
}

func RayClusterState(cluster *rayv1.RayCluster) rayv1.ClusterState {
- return cluster.Status.State
+ return cluster.Status.State //nolint:staticcheck // https://github.com/ray-project/kuberay/pull/2288
}

func RayClusterDesiredWorkerReplicas(cluster *rayv1.RayCluster) int32 {
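Once State is removed, support helpers such as RayClusterState will need a conditions-based counterpart. One possible shape, assuming the imports below and a Conditions field of type []metav1.Condition on the status; the condition type names passed in would be the ones from the forthcoming docs, so treat any specific name as hypothetical:

import (
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

// RayClusterCondition returns the status of the named condition, or
// ConditionUnknown when it is absent. This is a sketch only: the PR
// deprecates State but does not itself define the replacement conditions.
func RayClusterCondition(cluster *rayv1.RayCluster, conditionType string) metav1.ConditionStatus {
	if c := meta.FindStatusCondition(cluster.Status.Conditions, conditionType); c != nil {
		return c.Status
	}
	return metav1.ConditionUnknown
}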