Skip to content

Commit f9fd26c

Browse files
zackycao authored and facebook-github-bot committed
Add appdef metadata to torchx event (#947)
Summary: Pull Request resolved: #947 Add appdef metadata to the torchx event class. Scuba logs are not modified yet. The motivation for adding this is that we currently need to log distributed_ai_stack, which is part of the App Metadata, into the AI Instrumentation framework, where it can be used for post-processing the data of training jobs from various training stacks, such as MVAI, Pyper, Conda_On_MAST, etc. Reviewed By: andywag Differential Revision: D61632621
1 parent 86f4344 commit f9fd26c

File tree

4 files changed

+29
-5
lines changed

4 files changed

+29
-5
lines changed

torchx/runner/api.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,12 @@ def run_component(
198198
parent_run_id=parent_run_id,
199199
)
200200
handle = self.schedule(dryrun_info)
201+
app = none_throws(dryrun_info._app)
201202
ctx._torchx_event.workspace = workspace
202203
ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler)
203-
ctx._torchx_event.app_image = none_throws(dryrun_info._app).roles[0].image
204+
ctx._torchx_event.app_image = app.roles[0].image
204205
ctx._torchx_event.app_id = parse_app_handle(handle)[2]
206+
ctx._torchx_event.app_metadata = app.metadata
205207
return handle
206208

207209
def dryrun_component(
@@ -263,6 +265,7 @@ def run(
263265
ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler)
264266
ctx._torchx_event.app_image = none_throws(dryrun_info._app).roles[0].image
265267
ctx._torchx_event.app_id = parse_app_handle(handle)[2]
268+
ctx._torchx_event.app_metadata = app.metadata
266269
return handle
267270

268271
def schedule(self, dryrun_info: AppDryRunInfo) -> AppHandle:

torchx/runner/events/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import time
2525
import traceback
2626
from types import TracebackType
27-
from typing import Optional, Type
27+
from typing import Dict, Optional, Type
2828

2929
from torchx.runner.events.handlers import get_logging_handler
3030

@@ -84,6 +84,7 @@ def __init__(
8484
scheduler: Optional[str] = None,
8585
app_id: Optional[str] = None,
8686
app_image: Optional[str] = None,
87+
app_metadata: Optional[Dict[str, str]] = None,
8788
runcfg: Optional[str] = None,
8889
workspace: Optional[str] = None,
8990
) -> None:
@@ -92,6 +93,7 @@ def __init__(
9293
scheduler or "",
9394
app_id,
9495
app_image=app_image,
96+
app_metadata=app_metadata,
9597
runcfg=runcfg,
9698
workspace=workspace,
9799
)
@@ -128,6 +130,7 @@ def _generate_torchx_event(
128130
scheduler: str,
129131
app_id: Optional[str] = None,
130132
app_image: Optional[str] = None,
133+
app_metadata: Optional[Dict[str, str]] = None,
131134
runcfg: Optional[str] = None,
132135
source: SourceType = SourceType.UNKNOWN,
133136
workspace: Optional[str] = None,
@@ -138,6 +141,7 @@ def _generate_torchx_event(
138141
api=api,
139142
app_id=app_id,
140143
app_image=app_image,
144+
app_metadata=app_metadata,
141145
runcfg=runcfg,
142146
source=source,
143147
workspace=workspace,

torchx/runner/events/api.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import json
1111
from dataclasses import asdict, dataclass
1212
from enum import Enum
13-
from typing import Optional, Union
13+
from typing import Dict, Optional, Union
1414

1515

1616
class SourceType(str, Enum):
@@ -30,17 +30,21 @@ class TorchxEvent:
3030
api: Api name
3131
app_id: Unique id that is set by the underlying scheduler
3232
image: Image/container bundle that is used to execute request.
33+
app_metadata: metadata to the app (treatment of metadata is scheduler dependent)
3334
runcfg: Run config that was used to schedule app.
3435
source: Type of source the event is generated.
3536
cpu_time_usec: CPU time spent in usec
3637
wall_time_usec: Wall time spent in usec
38+
start_epoch_time_usec: Epoch time in usec when runner event starts
39+
Workspace: Track how different workspaces/no workspace affects build and scheduler
3740
"""
3841

3942
session: str
4043
scheduler: str
4144
api: str
4245
app_id: Optional[str] = None
4346
app_image: Optional[str] = None
47+
app_metadata: Optional[Dict[str, str]] = None
4448
runcfg: Optional[str] = None
4549
raw_exception: Optional[str] = None
4650
source: SourceType = SourceType.UNKNOWN

torchx/runner/events/test/lib_test.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ def assert_event(
3131
self.assertEqual(actual_event.app_image, expected_event.app_image)
3232
self.assertEqual(actual_event.runcfg, expected_event.runcfg)
3333
self.assertEqual(actual_event.source, expected_event.source)
34+
self.assertEqual(actual_event.app_metadata, expected_event.app_metadata)
3435

3536
@patch("torchx.runner.events.get_logging_handler")
3637
def test_get_or_create_logger(self, logging_handler_mock: MagicMock) -> None:
@@ -41,11 +42,13 @@ def test_get_or_create_logger(self, logging_handler_mock: MagicMock) -> None:
4142
self.assertIsInstance(logger.handlers[0], logging.NullHandler)
4243

4344
def test_event_created(self) -> None:
45+
test_metadata = {"test_key": "test_value"}
4446
event = TorchxEvent(
4547
session="test_session",
4648
scheduler="test_scheduler",
4749
api="test_api",
4850
app_image="test_app_image",
51+
app_metadata=test_metadata,
4952
workspace="test_workspace",
5053
)
5154
self.assertEqual("test_session", event.session)
@@ -54,13 +57,16 @@ def test_event_created(self) -> None:
5457
self.assertEqual("test_app_image", event.app_image)
5558
self.assertEqual(SourceType.UNKNOWN, event.source)
5659
self.assertEqual("test_workspace", event.workspace)
60+
self.assertEqual(test_metadata, event.app_metadata)
5761

5862
def test_event_deser(self) -> None:
63+
test_metadata = {"test_key": "test_value"}
5964
event = TorchxEvent(
6065
session="test_session",
6166
scheduler="test_scheduler",
6267
api="test_api",
6368
app_image="test_app_image",
69+
app_metadata=test_metadata,
6470
workspace="test_workspace",
6571
source=SourceType.EXTERNAL,
6672
)
@@ -78,14 +84,17 @@ def assert_torchx_event(self, expected: TorchxEvent, actual: TorchxEvent) -> Non
7884
self.assertEqual(expected.app_image, actual.app_image)
7985
self.assertEqual(expected.source, actual.source)
8086
self.assertEqual(expected.workspace, actual.workspace)
87+
self.assertEqual(expected.app_metadata, actual.app_metadata)
8188

8289
def test_create_context(self, _) -> None:
83-
cfg = json.dumps({"test_key": "test_value"})
90+
test_dict = {"test_key": "test_value"}
91+
cfg = json.dumps(test_dict)
8492
context = log_event(
8593
"test_call",
8694
"local",
8795
"test_app_id",
8896
app_image="test_app_image_id",
97+
app_metadata=test_dict,
8998
runcfg=cfg,
9099
workspace="test_workspace",
91100
)
@@ -95,19 +104,22 @@ def test_create_context(self, _) -> None:
95104
"test_call",
96105
"test_app_id",
97106
app_image="test_app_image_id",
107+
app_metadata=test_dict,
98108
runcfg=cfg,
99109
workspace="test_workspace",
100110
)
101111

102112
self.assert_torchx_event(expected_torchx_event, context._torchx_event)
103113

104114
def test_record_event(self, record_mock: MagicMock) -> None:
105-
cfg = json.dumps({"test_key": "test_value"})
115+
test_dict = {"test_key": "test_value"}
116+
cfg = json.dumps(test_dict)
106117
with log_event(
107118
"test_call",
108119
"local",
109120
"test_app_id",
110121
app_image="test_app_image_id",
122+
app_metadata=test_dict,
111123
runcfg=cfg,
112124
workspace="test_workspace",
113125
) as ctx:
@@ -119,6 +131,7 @@ def test_record_event(self, record_mock: MagicMock) -> None:
119131
"test_call",
120132
"test_app_id",
121133
app_image="test_app_image_id",
134+
app_metadata=test_dict,
122135
runcfg=cfg,
123136
workspace="test_workspace",
124137
cpu_time_usec=ctx._torchx_event.cpu_time_usec,

0 commit comments

Comments
 (0)