Add a new experimental restart policy for large-scale model training

manav-a · facebook-github-bot · commit 0622a97bf868 · 2024-06-17T11:43:43.000-07:00
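Summary: TSIA (title says it all) — adds the experimental `QUORUM` value to `RetryPolicy` and documents it in the `RetryPolicy` and `Role` docstrings.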

Differential Revision: D58684341
diff --git a/torchx/specs/api.py b/torchx/specs/api.py
@@ -219,11 +219,13 @@ class RetryPolicy(str, Enum):
                 application to deal with failed replica departures and
                 replacement replica admittance.
     2. APPLICATION: Restarts the entire application.
-
+    3. QUORUM: Restarts the replicas for a role as long as the quorum is not
+               violated. (EXPERIMENTAL)
     """
 
     REPLICA = "REPLICA"
     APPLICATION = "APPLICATION"
+    QUORUM = "QUORUM"
 
 
 class MountType(str, Enum):
@@ -322,6 +324,8 @@ class Role:
                 and num_replicas depending on the cluster resources and
                 policies. If the scheduler doesn't support auto scaling this
                 field is ignored and the job size will be num_replicas.
+                EXPERIMENTAL: For the QUORUM retry policy, this field indicates the
+                quorum required for the job to run.
             max_retries: max number of retries before giving up
             retry_policy: retry behavior upon replica failures
             resource: Resource requirement for the role. The role should be scheduled