
Commit d107533

banitag1 authored and facebook-github-bot committed
Add support of request tracking (#891)
Summary:
Pull Request resolved: #891

This diff adds support for request tracking, so we can visualize request processing across threads and also monitor tail-latency behavior.

Reviewed By: zyan0

Differential Revision: D42002684

fbshipit-source-id: 07cfff683c9de092d54547704e41f680b9c3e1e6
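
The mechanism behind this change is folly's per-thread RequestContext: the thread that accepts a request saves the current context, each worker thread that later handles the request installs that saved context before doing work, and clears it afterwards so subsequent work is not misattributed. A minimal sketch of the pattern, assuming only that folly is available; the thread setup here is illustrative, not from this diff:

#include <folly/io/async/Request.h>

#include <memory>
#include <thread>

int main() {
  // Request thread: install a fresh context for this request and save it.
  folly::RequestContextScopeGuard guard;
  std::shared_ptr<folly::RequestContext> saved =
      folly::RequestContext::saveContext();

  // Worker thread: adopt the saved context so tracking tools attribute
  // this work to the same request, then unset it when the work is done.
  std::thread worker([saved] {
    folly::RequestContext::setContext(saved);
    // ... batch, pin memory, or run the model here ...
    folly::RequestContext::setContext(nullptr);
  });
  worker.join();
  return 0;
}

This is the shape of every change below: BatchingQueue::add saves the context; createBatch, pinMemory, and GPUExecutor::process adopt it; and each clears it before moving on to unrelated work.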
1 parent 5ca2a14 commit d107533

File tree

3 files changed: +23 −1 lines changed

torchrec/inference/include/torchrec/inference/Types.h

Lines changed: 2 additions & 0 deletions
@@ -62,6 +62,8 @@ struct PredictionResponse {
 struct RequestContext {
   uint32_t batchSize;
   folly::Promise<std::unique_ptr<PredictionResponse>> promise;
+  // folly request context for request tracking in crochet
+  std::shared_ptr<folly::RequestContext> follyRequestContext;
 };

 using PredictionException = std::runtime_error;
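
Storing the saved folly::RequestContext on RequestContext is what lets tracking tools follow a request across threads. For illustration, a hedged sketch of how per-request data could be attached to and read back from that context; the TrackingData type and the "request_id" token are hypothetical, not part of this diff:

#include <folly/io/async/Request.h>

#include <memory>
#include <string>

// Hypothetical payload carried on the request's folly context.
struct TrackingData : public folly::RequestData {
  explicit TrackingData(std::string id) : requestId(std::move(id)) {}
  bool hasCallback() override {
    return false; // no onSet/onUnset thread callbacks needed
  }
  std::string requestId;
};

static const folly::RequestToken kRequestIdToken("request_id");

// Called on the request thread, before the context is saved.
void tagCurrentRequest(std::string id) {
  folly::RequestContext::get()->setContextData(
      kRequestIdToken, std::make_unique<TrackingData>(std::move(id)));
}

// Called on any worker thread after setContext has installed the context.
TrackingData* currentRequestTag() {
  return dynamic_cast<TrackingData*>(
      folly::RequestContext::get()->getContextData(kRequestIdToken));
}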

torchrec/inference/src/BatchingQueue.cpp

Lines changed: 13 additions & 1 deletion
@@ -101,7 +101,10 @@ void BatchingQueue::add(
     const auto batchSize = request->batch_size;
     queue.push(QueryQueueEntry{
         std::move(request),
-        RequestContext{batchSize, std::move(promise)},
+        RequestContext{
+            batchSize,
+            std::move(promise),
+            folly::RequestContext::saveContext()},
         addedTime});
   });
 }
@@ -150,6 +153,7 @@ void BatchingQueue::createBatch() {
     }

     auto& context = contexts.emplace_back(std::move(front.context));
+    folly::RequestContext::setContext(context.follyRequestContext);
     requests.push_back(std::move(front.request));
     batchSize += requests.back()->batch_size;
     queue.pop();
@@ -178,6 +182,8 @@ void BatchingQueue::createBatch() {
       contexts.clear();
     }

+    folly::RequestContext::setContext(nullptr);
+
    if (!full) {
      /* sleep override */
      std::this_thread::sleep_for(std::chrono::milliseconds(1));
@@ -207,6 +213,9 @@ void BatchingQueue::pinMemory(int gpuIdx) {
     if (!requests.empty() || !contexts.empty()) {
       RECORD_USER_SCOPE("PinMemory");

+      if (!contexts.empty()) {
+        folly::RequestContext::setContext(contexts[0].follyRequestContext);
+      }
       // Combine data.
       size_t combinedBatchSize = 0;
       for (auto i : c10::irange(requests.size())) {
@@ -323,6 +332,9 @@ void BatchingQueue::pinMemory(int gpuIdx) {
       observer_->observeBatchCompletion(batch->size(), batch->batchSize);

       cbs_[gpuIdx](batch);
+
+      // unset request tracking
+      folly::RequestContext::setContext(nullptr);
     }
   } catch (const std::exception& ex) {
     LOG(FATAL) << "Error batching requests, ex: " << folly::exceptionStr(ex);
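
The diff pairs each setContext with a later setContext(nullptr) so the long-lived batching thread never leaks one request's context into the next iteration. Folly also provides an RAII guard that restores the previous context on scope exit, including on early return or exception; a sketch of the equivalent shape, where the function and its body are illustrative rather than from this diff:

#include <folly/io/async/Request.h>

#include <memory>

void runUnderRequestContext(
    const std::shared_ptr<folly::RequestContext>& ctx) {
  // Installs ctx now; restores the previous context when the guard
  // goes out of scope, even if an exception is thrown.
  folly::RequestContextScopeGuard guard(ctx);
  // ... combine tensors, pin memory, invoke the callback ...
}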

torchrec/inference/src/GPUExecutor.cpp

Lines changed: 8 additions & 0 deletions
@@ -20,6 +20,7 @@
 #include <folly/executors/CPUThreadPoolExecutor.h>
 #include <folly/futures/Future.h>
 #include <folly/io/IOBuf.h>
+#include <folly/io/async/Request.h>
 #include <folly/stop_watch.h>
 #include <gflags/gflags.h>
 #include <glog/logging.h>
@@ -204,6 +205,10 @@ void GPUExecutor::process(int idx) {
       continue;
     }

+    if (!batch->contexts.empty()) {
+      folly::RequestContext::setContext(batch->contexts[0].follyRequestContext);
+    }
+
     auto timeInQueue = getTimeElapsedMS(batch->enqueueTime);
     observer_->recordQueueLatency(timeInQueue.count());

@@ -324,6 +329,9 @@ void GPUExecutor::process(int idx) {
         observer->recordTotalLatency(
             getTimeElapsedMS(batch->enqueueTime).count());
       });
+
+    // reset request tracking
+    folly::RequestContext::setContext(nullptr);
   }
 }
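
Two details are worth noting in the hunks above. First, a batch merges many requests, and the executor adopts only batch->contexts[0], so the first request's context stands in for the whole batch. Second, folly futures generally carry the active RequestContext into their callbacks, which is why work chained onto another executor (like the latency observer above) keeps the request attribution. A small sketch of that propagation, assuming folly; the single-thread executor is illustrative:

#include <folly/executors/CPUThreadPoolExecutor.h>
#include <folly/futures/Future.h>
#include <folly/io/async/Request.h>

#include <iostream>

int main() {
  folly::CPUThreadPoolExecutor executor(1);

  folly::RequestContextScopeGuard guard; // install a fresh request context
  auto* ctx = folly::RequestContext::get();

  // The callback runs on the pool thread, but folly restores the
  // context that was active when the callback was attached.
  folly::via(&executor, [ctx] {
    std::cout << (folly::RequestContext::get() == ctx) << '\n'; // prints 1
  }).get();
  return 0;
}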
