@@ -190,9 +190,9 @@ def train(data):
190
190
#
191
191
# The overview shows a high-level summary of model performance.
192
192
#
193
- # The "GPU Summary" panel shows the GPU configuration and the GPU usage.
193
+ # The "GPU Summary" panel shows the GPU configuration, GPU usage and Tensor Cores usage.
194
194
# In this example, the GPU Utilization is low.
195
- # The details of these metrics are `here <https://github.com/guyang3532/kineto/blob/readme/tb_plugin/docs/gpu_utilization.md>`_.
195
+ # The details of these metrics are `here <https://github.com/pytorch/kineto/blob/main/tb_plugin/docs/gpu_utilization.md>`_.
196
196
#
197
197
# The "Step Time Breakdown" shows the distribution of time spent in each step over different categories of execution.
198
198
# In this example, you can see the ``DataLoader`` overhead is significant.
@@ -236,6 +236,9 @@ def train(data):
236
236
#
237
237
# .. image:: ../../_static/img/profiler_kernel_view.png
238
238
# :scale: 25 %
239
+ # Tensor Cores Used:
240
+ # Whether this kernel uses Tensor Cores.
241
+ #
239
242
# Mean Blocks per SM:
240
243
# Blocks per SM = Blocks of this kernel / SM number of this GPU.
241
244
# If this number is less than 1, it indicates the GPU multiprocessors are not fully utilized.
@@ -260,6 +263,12 @@ def train(data):
260
263
# and the ‘a’ and ‘d’ keys move the timeline left and right.
261
264
# You can hit these keys multiple times until you see a readable representation.
262
265
#
266
+ # If a backward operator's "Incoming Flow" field has the value "forward correspond to backward",
267
+ # you can click the text to get its launching forward operator.
268
+ #
269
+ # .. image:: ../../_static/img/profiler_trace_view_fwd_bwd.png
270
+ # :scale: 25 %
271
+ #
263
272
# In this example, we can see the event prefixed with ``enumerate(DataLoader)`` costs a lot of time.
264
273
# And during most of this period, the GPU is idle.
265
274
# Because this function is loading data and transforming data on host side,
@@ -289,7 +298,7 @@ def train(data):
289
298
# .. image:: ../../_static/img/profiler_overview2.png
290
299
# :scale: 25 %
291
300
#
292
- # From the above view, we can find the step time is reduced to about 58ms comparing with previous run's 121ms ,
301
+ # From the above view, we can see the step time is reduced to about 76ms, compared with the previous run's 132ms,
293
302
# and the time reduction of ``DataLoader`` is the main contributor.
294
303
#
295
304
# .. image:: ../../_static/img/profiler_trace_view2.png
0 commit comments