Skip to content

Commit 4f0bafd

Browse files
committed
DataFrame: Flatten MultiIndex; Set include_wall_time=False (#3605)
* Motivation for features / changes * Improve DataFrame API based on the following user feedback * The MultiIndex DataFrame is hard to handle and not nice in terms of CSV round-trip compatibility * The wall_time column is usually not useful * Technical description of changes * When pivoting the DataFrame using `pivot_table()`, follow it with `reset_index()` to flatten the 2-level MultiIndex to a single level of index. * Then, call `.columns.name` to remove the `tag` columns name, which is an artifact of the tensorboard data model and not directly relevant to users. * Add `include_wall_time=False` kwarg to get_scalars() * Change the default value of `pivot` from `None` to `False` as per more standard and expected default value scheme.
1 parent 83bde95 commit 4f0bafd

File tree

4 files changed

+139
-57
lines changed

4 files changed

+139
-57
lines changed

tensorboard/data/experimental/base_experiment.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,13 @@ class BaseExperiment(metaclass=abc.ABCMeta):
2828
# TODO(cais): Add list_scalar_tags().
2929

3030
@abc.abstractmethod
31-
def get_scalars(self, runs_filter=None, tags_filter=None, pivot=None):
31+
def get_scalars(
32+
self,
33+
runs_filter=None,
34+
tags_filter=None,
35+
pivot=True,
36+
include_wall_time=False,
37+
):
3238
"""Export scalar data as a pandas.DataFrame.
3339
3440
Args:
@@ -40,6 +46,9 @@ def get_scalars(self, runs_filter=None, tags_filter=None, pivot=None):
4046
`pivot_data()` method to a “wide” format wherein the tags of a
4147
given run and a given step are all collected in a single row.
4248
If not provided, defaults to `True`.
49+
include_wall_time: Include wall_time (timestamps in nanoseconds since
50+
the epoch in float64) as a column in the returned DataFrame.
51+
If not provided, defaults to `False`.
4352
4453
Returns:
4554
If `pivot` (default):

tensorboard/data/experimental/experiment_from_dev.py

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,13 @@ def __init__(self, experiment_id, api_endpoint=None):
7070
self._experiment_id = experiment_id
7171
self._api_client = get_api_client(api_endpoint=api_endpoint)
7272

73-
def get_scalars(self, runs_filter=None, tags_filter=None, pivot=None):
73+
def get_scalars(
74+
self,
75+
runs_filter=None,
76+
tags_filter=None,
77+
pivot=True,
78+
include_wall_time=False,
79+
):
7480
if runs_filter is not None:
7581
raise NotImplementedError(
7682
"runs_filter support for get_scalars() is not implemented yet."
@@ -79,7 +85,6 @@ def get_scalars(self, runs_filter=None, tags_filter=None, pivot=None):
7985
raise NotImplementedError(
8086
"tags_filter support for get_scalars() is not implemented yet."
8187
)
82-
pivot = True if pivot is None else pivot
8388

8489
request = export_service_pb2.StreamExperimentDataRequest()
8590
request.experiment_id = self._experiment_id
@@ -107,33 +112,48 @@ def get_scalars(self, runs_filter=None, tags_filter=None, pivot=None):
107112
)
108113
values.extend(list(response.points.values))
109114

110-
dataframe = pandas.DataFrame(
111-
{
112-
"run": runs,
113-
"tag": tags,
114-
"step": steps,
115-
"wall_time": wall_times,
116-
"value": values,
117-
}
118-
)
115+
data = {
116+
"run": runs,
117+
"tag": tags,
118+
"step": steps,
119+
"value": values,
120+
}
121+
if include_wall_time:
122+
data["wall_time"] = wall_times
123+
dataframe = pandas.DataFrame(data)
119124
if pivot:
120125
dataframe = self._pivot_dataframe(dataframe)
121126
return dataframe
122127

123128
def _pivot_dataframe(self, dataframe):
124129
num_missing_0 = np.count_nonzero(dataframe.isnull().values)
125130
dataframe = dataframe.pivot_table(
126-
["value", "wall_time"], ["run", "step"], "tag",
131+
values=(
132+
["value", "wall_time"]
133+
if "wall_time" in dataframe.columns
134+
else "value"
135+
),
136+
index=["run", "step"],
137+
columns="tag",
138+
dropna=False,
127139
)
128140
num_missing_1 = np.count_nonzero(dataframe.isnull().values)
129141
if num_missing_1 > num_missing_0:
130142
raise ValueError(
131-
"pivoted DataFrame contains %d missing value(s). "
143+
"pivoted DataFrame contains missing value(s). "
132144
"This is likely due to two timeseries having different "
133145
"sets of steps in your experiment. "
134146
"You can avoid this error by calling `get_scalars()` with "
135147
"`pivot=False` to disable the DataFrame pivoting."
136148
)
149+
# `reset_index()` removes the MultiIndex structure of the pivoted
150+
# DataFrame. Before the call, the DataFrame consists of two levels
151+
# of index: "run" and "step". After the call, the index becomes a
152+
# single range index (e.g., `dataframe[:2]` works).
153+
dataframe = dataframe.reset_index()
154+
# Remove the columns name "tag".
155+
dataframe.columns.name = None
156+
dataframe.columns.names = [None for name in dataframe.columns.names]
137157
return dataframe
138158

139159

tensorboard/data/experimental/experiment_from_dev_test.py

Lines changed: 90 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -79,45 +79,94 @@ def stream_experiment_data(request, **kwargs):
7979
lambda api_endpoint: mock_api_client,
8080
):
8181
experiment = experiment_from_dev.ExperimentFromDev("789")
82-
for pivot in (None, False):
83-
with self.subTest("pivot=%s" % pivot):
84-
dataframe = experiment.get_scalars(pivot=pivot)
85-
86-
expected = pandas.DataFrame(
87-
{
88-
"run": ["train"] * 20 + ["test"] * 20,
89-
"tag": (["accuracy"] * 10 + ["loss"] * 10) * 2,
90-
"step": list(np.arange(0, 10)) * 4,
91-
"wall_time": np.concatenate(
92-
[
93-
2.0 * np.arange(0, 10),
94-
1.0 * np.arange(0, 10),
95-
600.0 + 2.0 * np.arange(0, 10),
96-
600.0 + np.arange(0, 10),
97-
]
98-
),
99-
"value": np.concatenate(
100-
[
101-
1.0 / (10.0 - np.arange(0, 10)),
102-
1.0 / (1.0 + np.arange(0, 10)),
103-
-1.0 / (10.0 - np.arange(0, 10)),
104-
-1.0 / (1.0 + np.arange(0, 10)),
105-
]
106-
),
107-
}
108-
)
109-
110-
if pivot is None: # Default behavior: pivot_table.
111-
pandas.testing.assert_frame_equal(
112-
dataframe,
113-
expected.pivot_table(
114-
["value", "wall_time"], ["run", "step"], "tag"
115-
),
116-
check_names=True,
82+
for pivot in (True, False):
83+
for include_wall_time in (False, True):
84+
with self.subTest(
85+
"pivot=%s; include_wall_time=%s"
86+
% (pivot, include_wall_time)
87+
):
88+
dataframe = experiment.get_scalars(
89+
pivot=pivot, include_wall_time=include_wall_time
11790
)
118-
else: # pivot == False
91+
92+
if pivot:
93+
run_key = (
94+
("run", "") if include_wall_time else "run"
95+
)
96+
step_key = (
97+
("step", "") if include_wall_time else "step"
98+
)
99+
accuracy_value_key = (
100+
("value", "accuracy")
101+
if include_wall_time
102+
else "accuracy"
103+
)
104+
loss_value_key = (
105+
("value", "loss")
106+
if include_wall_time
107+
else "loss"
108+
)
109+
data = {
110+
run_key: ["test"] * 10 + ["train"] * 10,
111+
step_key: np.concatenate(
112+
[np.arange(0, 10), np.arange(0, 10)]
113+
),
114+
accuracy_value_key: np.concatenate(
115+
[
116+
-1.0 / (10.0 - np.arange(0, 10)),
117+
1.0 / (10.0 - np.arange(0, 10)),
118+
],
119+
),
120+
loss_value_key: np.concatenate(
121+
[
122+
-1.0 / (1.0 + np.arange(0, 10)),
123+
1.0 / (1.0 + np.arange(0, 10)),
124+
],
125+
),
126+
}
127+
if include_wall_time:
128+
data[
129+
("wall_time", "accuracy")
130+
] = np.concatenate(
131+
[
132+
600.0 + 2.0 * np.arange(0, 10),
133+
2.0 * np.arange(0, 10),
134+
]
135+
)
136+
data[("wall_time", "loss")] = np.concatenate(
137+
[
138+
600.0 + np.arange(0, 10),
139+
1.0 * np.arange(0, 10),
140+
]
141+
)
142+
expected = pandas.DataFrame(data)
143+
else: # No pivot_table.
144+
data = {
145+
"run": ["train"] * 20 + ["test"] * 20,
146+
"tag": (["accuracy"] * 10 + ["loss"] * 10) * 2,
147+
"step": list(np.arange(0, 10)) * 4,
148+
"value": np.concatenate(
149+
[
150+
1.0 / (10.0 - np.arange(0, 10)),
151+
1.0 / (1.0 + np.arange(0, 10)),
152+
-1.0 / (10.0 - np.arange(0, 10)),
153+
-1.0 / (1.0 + np.arange(0, 10)),
154+
]
155+
),
156+
}
157+
if include_wall_time:
158+
data["wall_time"] = np.concatenate(
159+
[
160+
2.0 * np.arange(0, 10),
161+
1.0 * np.arange(0, 10),
162+
600.0 + 2.0 * np.arange(0, 10),
163+
600.0 + np.arange(0, 10),
164+
]
165+
)
166+
expected = pandas.DataFrame(data)
167+
119168
pandas.testing.assert_frame_equal(
120-
dataframe, expected, check_names=True
169+
dataframe, expected, check_names=True,
121170
)
122171

123172
def test_get_scalars_with_pivot_table_with_missing_value(self):
@@ -156,7 +205,8 @@ def stream_experiment_data(request, **kwargs):
156205
experiment = experiment_from_dev.ExperimentFromDev("789")
157206
with self.assertRaisesRegexp(
158207
ValueError,
159-
r"missing value\(s\).*different sets of steps.*pivot=False",
208+
r"contains missing value\(s\).*different sets of "
209+
r"steps.*pivot=False",
160210
):
161211
experiment.get_scalars()
162212

@@ -193,12 +243,10 @@ def stream_experiment_data(request, **kwargs):
193243
expected = pandas.DataFrame(
194244
{
195245
"run": ["train"] * 2,
196-
"tag": ["batch_loss"] * 2,
197246
"step": [0, 1],
198-
"value": [np.nan, np.inf],
199-
"wall_time": [0.0, 10.0],
247+
"batch_loss": [np.nan, np.inf],
200248
}
201-
).pivot_table(["value", "wall_time"], ["run", "step"], "tag")
249+
)
202250
pandas.testing.assert_frame_equal(dataframe, expected, check_names=True)
203251

204252

tensorboard/data/experimental/test_binary.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,19 @@ def parse_args():
3737
default=None,
3838
help="Optional API endpoint used to override the default",
3939
)
40+
parser.add_argument(
41+
"--include_wall_time",
42+
action="store_true",
43+
help="Include wall_time column(s) in the DataFrame",
44+
)
4045
return parser.parse_args()
4146

4247

4348
def main(args):
4449
experiment = experiment_from_dev.ExperimentFromDev(
4550
args.experiment_id, api_endpoint=args.api_endpoint
4651
)
47-
dataframe = experiment.get_scalars()
52+
dataframe = experiment.get_scalars(include_wall_time=args.include_wall_time)
4853
print(dataframe)
4954

5055

0 commit comments

Comments
 (0)