Skip to content

Commit 709686e

Browse files
committed
Turn BytesContext into FromTensorContext
1 parent fe61d91 commit 709686e

File tree

4 files changed

+62
-96
lines changed

4 files changed

+62
-96
lines changed

src/torchcodec/_core/AVIOBytesContext.cpp

Lines changed: 51 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -9,117 +9,87 @@
99

1010
namespace facebook::torchcodec {
1111

12-
AVIOBytesContext::AVIOBytesContext(const void* data, int64_t dataSize)
13-
: dataContext_{static_cast<const uint8_t*>(data), dataSize, 0} {
14-
TORCH_CHECK(data != nullptr, "Video data buffer cannot be nullptr!");
15-
TORCH_CHECK(dataSize > 0, "Video data size must be positive");
16-
createAVIOContext(&read, nullptr, &seek, &dataContext_);
17-
}
12+
namespace {
13+
14+
constexpr int64_t INITIAL_TENSOR_SIZE = 10'000'000; // 10 MB
15+
constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB
16+
//
1817

1918
// The signature of this function is defined by FFMPEG.
20-
int AVIOBytesContext::read(void* opaque, uint8_t* buf, int buf_size) {
21-
auto dataContext = static_cast<DataContext*>(opaque);
19+
int read(void* opaque, uint8_t* buf, int buf_size) {
20+
auto tensorContext = static_cast<TensorContext*>(opaque);
2221
TORCH_CHECK(
23-
dataContext->current <= dataContext->size,
22+
tensorContext->current <= tensorContext->data.numel(),
2423
"Tried to read outside of the buffer: current=",
25-
dataContext->current,
24+
tensorContext->current,
2625
", size=",
27-
dataContext->size);
26+
tensorContext->data.numel());
2827

2928
int64_t numBytesRead = std::min(
30-
static_cast<int64_t>(buf_size), dataContext->size - dataContext->current);
29+
static_cast<int64_t>(buf_size),
30+
tensorContext->data.numel() - tensorContext->current);
3131

3232
TORCH_CHECK(
3333
numBytesRead >= 0,
3434
"Tried to read negative bytes: numBytesRead=",
3535
numBytesRead,
3636
", size=",
37-
dataContext->size,
37+
tensorContext->data.numel(),
3838
", current=",
39-
dataContext->current);
39+
tensorContext->current);
4040

4141
if (numBytesRead == 0) {
4242
return AVERROR_EOF;
4343
}
4444

45-
std::memcpy(buf, dataContext->data + dataContext->current, numBytesRead);
46-
dataContext->current += numBytesRead;
45+
std::memcpy(
46+
buf,
47+
tensorContext->data.data_ptr<uint8_t>() + tensorContext->current,
48+
numBytesRead);
49+
tensorContext->current += numBytesRead;
4750
return numBytesRead;
4851
}
4952

5053
// The signature of this function is defined by FFMPEG.
51-
int64_t AVIOBytesContext::seek(void* opaque, int64_t offset, int whence) {
52-
auto dataContext = static_cast<DataContext*>(opaque);
53-
int64_t ret = -1;
54-
55-
switch (whence) {
56-
case AVSEEK_SIZE:
57-
ret = dataContext->size;
58-
break;
59-
case SEEK_SET:
60-
dataContext->current = offset;
61-
ret = offset;
62-
break;
63-
default:
64-
break;
65-
}
66-
67-
return ret;
68-
}
69-
70-
AVIOToTensorContext::AVIOToTensorContext()
71-
: dataContext_{
72-
torch::empty(
73-
{AVIOToTensorContext::INITIAL_TENSOR_SIZE},
74-
{torch::kUInt8}),
75-
0} {
76-
createAVIOContext(nullptr, &write, &seek, &dataContext_);
77-
}
78-
79-
// The signature of this function is defined by FFMPEG.
80-
int AVIOToTensorContext::write(void* opaque, const uint8_t* buf, int buf_size) {
81-
auto dataContext = static_cast<DataContext*>(opaque);
54+
int write(void* opaque, const uint8_t* buf, int buf_size) {
55+
auto tensorContext = static_cast<TensorContext*>(opaque);
8256

8357
int64_t bufSize = static_cast<int64_t>(buf_size);
84-
if (dataContext->current + bufSize > dataContext->outputTensor.numel()) {
58+
if (tensorContext->current + bufSize > tensorContext->data.numel()) {
8559
TORCH_CHECK(
86-
dataContext->outputTensor.numel() * 2 <=
87-
AVIOToTensorContext::MAX_TENSOR_SIZE,
60+
tensorContext->data.numel() * 2 <= MAX_TENSOR_SIZE,
8861
"We tried to allocate an output encoded tensor larger than ",
89-
AVIOToTensorContext::MAX_TENSOR_SIZE,
62+
MAX_TENSOR_SIZE,
9063
" bytes. If you think this should be supported, please report.");
9164

9265
// We double the size of the output tensor. Calling cat() may not be the
9366
// most efficient, but it's simple.
94-
dataContext->outputTensor =
95-
torch::cat({dataContext->outputTensor, dataContext->outputTensor});
67+
tensorContext->data =
68+
torch::cat({tensorContext->data, tensorContext->data});
9669
}
9770

9871
TORCH_CHECK(
99-
dataContext->current + bufSize <= dataContext->outputTensor.numel(),
72+
tensorContext->current + bufSize <= tensorContext->data.numel(),
10073
"Re-allocation of the output tensor didn't work. ",
10174
"This should not happen, please report on TorchCodec bug tracker");
10275

103-
uint8_t* outputTensorData = dataContext->outputTensor.data_ptr<uint8_t>();
104-
std::memcpy(outputTensorData + dataContext->current, buf, bufSize);
105-
dataContext->current += bufSize;
76+
uint8_t* outputTensorData = tensorContext->data.data_ptr<uint8_t>();
77+
std::memcpy(outputTensorData + tensorContext->current, buf, bufSize);
78+
tensorContext->current += bufSize;
10679
return buf_size;
10780
}
10881

10982
// The signature of this function is defined by FFMPEG.
110-
// Note: This `seek()` implementation is very similar to that of
111-
// AVIOBytesContext. We could consider merging both classes, or do some kind of
112-
// refac, but this doesn't seem worth it ATM.
113-
int64_t AVIOToTensorContext::seek(void* opaque, int64_t offset, int whence) {
114-
auto dataContext = static_cast<DataContext*>(opaque);
83+
int64_t seek(void* opaque, int64_t offset, int whence) {
84+
auto tensorContext = static_cast<TensorContext*>(opaque);
11585
int64_t ret = -1;
11686

11787
switch (whence) {
11888
case AVSEEK_SIZE:
119-
ret = dataContext->outputTensor.numel();
89+
ret = tensorContext->data.numel();
12090
break;
12191
case SEEK_SET:
122-
dataContext->current = offset;
92+
tensorContext->current = offset;
12393
ret = offset;
12494
break;
12595
default:
@@ -129,9 +99,24 @@ int64_t AVIOToTensorContext::seek(void* opaque, int64_t offset, int whence) {
12999
return ret;
130100
}
131101

102+
} // namespace
103+
104+
AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
105+
: tensorContext_{data, 0} {
106+
TORCH_CHECK(data.numel() > 0, "data must not be empty");
107+
TORCH_CHECK(data.is_contiguous(), "data must be contiguous");
108+
TORCH_CHECK(data.scalar_type() == torch::kUInt8, "data must be kUInt8");
109+
createAVIOContext(&read, nullptr, &seek, &tensorContext_);
110+
}
111+
112+
AVIOToTensorContext::AVIOToTensorContext()
113+
: tensorContext_{torch::empty({INITIAL_TENSOR_SIZE}, {torch::kUInt8}), 0} {
114+
createAVIOContext(nullptr, &write, &seek, &tensorContext_);
115+
}
116+
132117
torch::Tensor AVIOToTensorContext::getOutputTensor() {
133-
return dataContext_.outputTensor.narrow(
134-
/*dim=*/0, /*start=*/0, /*length=*/dataContext_.current);
118+
return tensorContext_.data.narrow(
119+
/*dim=*/0, /*start=*/0, /*length=*/tensorContext_.current);
135120
}
136121

137122
} // namespace facebook::torchcodec

src/torchcodec/_core/AVIOBytesContext.h

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,23 +11,19 @@
1111

1212
namespace facebook::torchcodec {
1313

14+
struct TensorContext {
15+
torch::Tensor data;
16+
int64_t current;
17+
};
18+
1419
// For Decoding: enables users to pass in the entire video or audio as bytes.
1520
// Our read and seek functions then traverse the bytes in memory.
16-
class AVIOBytesContext : public AVIOContextHolder {
21+
class AVIOFromTensorContext : public AVIOContextHolder {
1722
public:
18-
explicit AVIOBytesContext(const void* data, int64_t dataSize);
23+
explicit AVIOFromTensorContext(torch::Tensor data);
1924

2025
private:
21-
struct DataContext {
22-
const uint8_t* data;
23-
int64_t size;
24-
int64_t current;
25-
};
26-
27-
static int read(void* opaque, uint8_t* buf, int buf_size);
28-
static int64_t seek(void* opaque, int64_t offset, int whence);
29-
30-
DataContext dataContext_;
26+
TensorContext tensorContext_;
3127
};
3228

3329
// For Encoding: used to encode into an output uint8 (bytes) tensor.
@@ -37,18 +33,7 @@ class AVIOToTensorContext : public AVIOContextHolder {
3733
torch::Tensor getOutputTensor();
3834

3935
private:
40-
struct DataContext {
41-
torch::Tensor outputTensor;
42-
int64_t current;
43-
};
44-
45-
static constexpr int64_t INITIAL_TENSOR_SIZE = 10'000'000; // 10MB
46-
static constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB
47-
static int write(void* opaque, const uint8_t* buf, int buf_size);
48-
// We need to expose seek() for some formats like mp3.
49-
static int64_t seek(void* opaque, int64_t offset, int whence);
50-
51-
DataContext dataContext_;
36+
TensorContext tensorContext_;
5237
};
5338

5439
} // namespace facebook::torchcodec

src/torchcodec/_core/custom_ops.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -196,15 +196,13 @@ at::Tensor create_from_tensor(
196196
TORCH_CHECK(
197197
video_tensor.scalar_type() == torch::kUInt8,
198198
"video_tensor must be kUInt8");
199-
void* data = video_tensor.mutable_data_ptr();
200-
size_t length = video_tensor.numel();
201199

202200
SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
203201
if (seek_mode.has_value()) {
204202
realSeek = seekModeFromString(seek_mode.value());
205203
}
206204

207-
auto contextHolder = std::make_unique<AVIOBytesContext>(data, length);
205+
auto contextHolder = std::make_unique<AVIOFromTensorContext>(video_tensor);
208206

209207
std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
210208
std::make_unique<SingleStreamDecoder>(std::move(contextHolder), realSeek);

test/test_decoders.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ def seek(self, offset: int, whence: int) -> bytes:
9393
decoder = Decoder(source)
9494
assert isinstance(decoder.metadata, _core._metadata.StreamMetadata)
9595

96-
9796
@pytest.mark.parametrize("Decoder", (VideoDecoder, AudioDecoder))
9897
def test_create_fails(self, Decoder):
9998
with pytest.raises(TypeError, match="Unknown source type"):
@@ -139,10 +138,9 @@ def test_create_bytes_ownership(self):
139138
decoder = VideoDecoder(f.read())
140139

141140
assert decoder[0] is not None
142-
assert decoder[len(decoder)//2] is not None
141+
assert decoder[len(decoder) // 2] is not None
143142
assert decoder[-1] is not None
144143

145-
146144
def test_create_fails(self):
147145
with pytest.raises(ValueError, match="Invalid seek mode"):
148146
VideoDecoder(NASA_VIDEO.path, seek_mode="blah")

0 commit comments

Comments
 (0)