diff --git a/torchx/schedulers/kubernetes_scheduler.py b/torchx/schedulers/kubernetes_scheduler.py
index e1e73b91b..72ad7f04e 100644
--- a/torchx/schedulers/kubernetes_scheduler.py
+++ b/torchx/schedulers/kubernetes_scheduler.py
@@ -446,6 +446,7 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    in_cluster: Optional[bool]
 
 
 class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
@@ -553,10 +554,18 @@ def _api_client(self) -> "ApiClient":
         c = self._client
         if c is None:
             configuration = client.Configuration()
-            try:
-                config.load_kube_config(client_configuration=configuration)
-            except config.ConfigException as e:
-                warnings.warn(f"failed to load kube config: {e}")
+            # `_in_cluster` is assigned in schedule(); fall back to kubeconfig
+            # when the client is built before any schedule() call.
+            if getattr(self, "_in_cluster", False):
+                try:
+                    config.load_incluster_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load incluster config: {e}")
+            else:
+                try:
+                    config.load_kube_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load kube config: {e}")
 
             c = self._client = client.ApiClient(configuration)
 
@@ -586,6 +595,13 @@ def schedule(self, dryrun_info: AppDryRunInfo[KubernetesJob]) -> str:
         cfg = dryrun_info._cfg
         assert cfg is not None, f"{dryrun_info} missing cfg"
         namespace = cfg.get("namespace") or "default"
+
+        in_cluster = cfg.get("in_cluster") or False
+        if not isinstance(in_cluster, bool):
+            raise TypeError(f"config value 'in_cluster' must be a bool, got {in_cluster}")
+
+        # Consumed by _api_client() when it lazily builds the ApiClient.
+        self._in_cluster = in_cluster
 
         images_to_push = dryrun_info.request.images_to_push
         self.push_images(images_to_push)
@@ -675,6 +691,11 @@ def _run_opts(self) -> runopts:
             type_=str,
             help="The name of the PriorityClass to set on the job specs",
         )
+        opts.add(
+            "in_cluster",
+            type_=bool,
+            help="Use the in-cluster Kubernetes config instead of the local kubeconfig",
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]: