
Commit 02580c6

sayanshaw24 and Sayan Shaw authored
Add Support For Tokenizer Options (#1785)
### Updates

This PR introduces support for the ORT Extensions changes introduced in [this PR](microsoft/onnxruntime-extensions#998), which allow passing an **options map** with `OrtxCreateTokenizerWithOptions` when creating a tokenizer, or using the new `OrtxUpdateTokenizerOptions` method to update the options map on an existing tokenizer object (including those created with `OrtxCreateTokenizer`), enabling more flexible configuration.

It additionally removes the previously added `OrtxTokenizeWithOptions` and `OrtxDetokenize1DWithOptions` functions, which are now redundant. With the new design, **options are set once on the tokenizer object itself**, so there is no longer a need to pass ad-hoc option sets into individual tokenize/detokenize calls, reducing API clutter and simplifying the C interface.

In addition to the C API updates, it also adds bindings for C++, C#, and Python.

### Sample Usage

C++

```
auto tokenizer = OgaTokenizer::Create(*model);

// Define tokenizer options as C-style arrays
const char* keys[] = {"add_bos_token", "trim_offsets"};
const char* values[] = {"true", "false"};

// Update tokenizer options
tokenizer->UpdateOptions(keys, values, 2);
```

C#

```
var tokenizer = new Tokenizer(model);

// Update tokenizer options using a dictionary
var options = new Dictionary<string, string>
{
    { "add_bos_token", "true" },
    { "trim_offsets", "false" }
};
tokenizer.UpdateOptions(options);
```

Python

```
tokenizer = Tokenizer(model)

options = {
    "add_bos_token": "true",
    "trim_offsets": "false"
}
tokenizer.update_options(**options)
```

---

Co-authored-by: Sayan Shaw <[email protected]>
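The samples above cover the language bindings; a minimal sketch of the same call through the raw C API is shown below. This is not part of the PR itself: it assumes an `OgaTokenizer*` created elsewhere (e.g., via `OgaCreateTokenizer`) and uses the existing `OgaResultGetError`/`OgaDestroyResult` helpers to honor the documented error contract of `OgaUpdateTokenizerOptions` (a non-null `OgaResult*` on failure, which must be freed).

```
#include "ort_genai_c.h"

#include <cstdio>

// Sketch only: `tokenizer` is assumed to be a valid OgaTokenizer* created elsewhere.
bool UpdateTokenizerOptions(OgaTokenizer* tokenizer) {
  const char* keys[] = {"add_special_tokens", "skip_special_tokens"};
  const char* values[] = {"true", "false"};

  OgaResult* result = OgaUpdateTokenizerOptions(tokenizer, keys, values, 2);
  if (result != nullptr) {
    // A non-null result signals failure; it carries the error message and must be freed.
    std::fprintf(stderr, "OgaUpdateTokenizerOptions failed: %s\n", OgaResultGetError(result));
    OgaDestroyResult(result);
    return false;
  }
  return true;  // A nullptr return means success.
}
```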
1 parent 246df2e commit 02580c6

File tree

10 files changed: 188 additions, 7 deletions


cmake/deps.txt

Lines changed: 1 addition & 1 deletion
```
@@ -14,7 +14,7 @@ pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f78029
 googletest;https://github.com/google/googletest/archive/530d5c8c84abd2a46f38583ee817743c9b3a42b4.zip;5e3a61db2aa975cfd0f97ba92c818744e7fa7034
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
-onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;bd8fb6d86e98c17e397c42fc001913cc2e035597
+onnxruntime_extensions;https://github.com/microsoft/onnxruntime-extensions.git;9790faf2838d72cb229475cd2b5edc6fc779b5aa

 # These two dependencies are for the optional constrained decoding feature (USE_GUIDANCE)
 llguidance;https://github.com/microsoft/llguidance.git;2d2f1de3c87e3289528affc346f734f7471216d9
```

src/csharp/NativeMethods.cs

Lines changed: 7 additions & 0 deletions
```
@@ -217,6 +217,13 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq
         [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)]
         public static extern void OgaDestroyTokenizer(IntPtr /* OgaTokenizer* */ tokenizer);

+        [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)]
+        public static extern IntPtr /* OgaResult* */ OgaUpdateTokenizerOptions(
+            IntPtr /* const OgaTokenizer* */ tokenizer,
+            string[] /* const char*[] */ keys,
+            string[] /* const char*[] */ values,
+            UIntPtr /* size_t */ numOptions);
+
         [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)]
         public static extern IntPtr /* OgaResult* */ OgaTokenizerEncode(IntPtr /* const OgaTokenizer* */ tokenizer,
                                                                         byte[] /* const char* */ strings,
```

src/csharp/Tokenizer.cs

Lines changed: 26 additions & 0 deletions
```
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.

 using System;
+using System.Collections.Generic;

 namespace Microsoft.ML.OnnxRuntimeGenAI
 {
@@ -45,6 +46,31 @@ public string[] DecodeBatch(Sequences sequences)
             return result;
         }

+        public void UpdateOptions(Dictionary<string, string> options)
+        {
+            if (options == null)
+                throw new ArgumentNullException(nameof(options));
+
+            // Prepare native arrays
+            string[] keys = new string[options.Count];
+            string[] values = new string[options.Count];
+            int i = 0;
+            foreach (var kvp in options)
+            {
+                keys[i] = kvp.Key;
+                values[i] = kvp.Value;
+                i++;
+            }
+
+            // Call native function
+            Result.VerifySuccess(
+                NativeMethods.OgaUpdateTokenizerOptions(
+                    _tokenizerHandle,
+                    keys,
+                    values,
+                    (UIntPtr)options.Count));
+        }
+
         public Sequences Encode(string str)
         {
             Result.VerifySuccess(NativeMethods.OgaCreateSequences(out IntPtr nativeSequences));
```

src/models/model.cpp

Lines changed: 12 additions & 3 deletions
```
@@ -259,16 +259,25 @@ const std::string& TokenizerStream::Decode(int32_t token) {
 }

 Tokenizer::Tokenizer(Config& config) : pad_token_id_{config.model.pad_token_id} {
-  CheckResult(OrtxCreateTokenizer(tokenizer_.Address(), config.config_path.string().c_str()));
+  // Default tokenizer options
+  const char* keys[] = {"add_special_tokens", "skip_special_tokens"};
+  const char* values[] = {"false", "true"};
+
+  CheckResult(OrtxCreateTokenizerWithOptions(tokenizer_.Address(), config.config_path.string().c_str(), keys, values, 2));
 }

 std::unique_ptr<TokenizerStream> Tokenizer::CreateStream() const {
   return std::make_unique<TokenizerStream>(*this);
 }

+void Tokenizer::UpdateOptions(const char* const* keys, const char* const* values, size_t num_options) {
+  // Tap into ORT Extensions API
+  CheckResult(OrtxUpdateTokenizerOptions(tokenizer_, const_cast<const char**>(keys), const_cast<const char**>(values), num_options));
+}
+
 std::vector<int32_t> Tokenizer::Encode(const char* text) const {
   OrtxPtr<OrtxTokenId2DArray> ids;
-  CheckResult(OrtxTokenizeWithOptions(tokenizer_, &text, 1, ids.Address(), false /* add_special_tokens */));
+  CheckResult(OrtxTokenize(tokenizer_, &text, 1, ids.Address()));

   const extTokenId_t* tokens;
   size_t count;
@@ -278,7 +287,7 @@ std::vector<int32_t> Tokenizer::Encode(const char* text) const {

 std::string Tokenizer::Decode(std::span<const int32_t> tokens) const {
   OrtxPtr<OrtxStringArray> ortx_string_array;
-  CheckResult(OrtxDetokenize1DWithOptions(tokenizer_, reinterpret_cast<const uint32_t*>(tokens.data()), tokens.size(), ortx_string_array.Address(), true /* skip_special_tokens */));
+  CheckResult(OrtxDetokenize1D(tokenizer_, reinterpret_cast<const uint32_t*>(tokens.data()), tokens.size(), ortx_string_array.Address()));

   const char* string;
   CheckResult(OrtxStringArrayGetItem(ortx_string_array, 0, &string));
```
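Because the defaults are now baked into the constructor and the per-call flags are gone from `Encode`/`Decode`, callers that need different behavior flip the options on the tokenizer object instead. A minimal sketch (not from this PR), assuming a `Tokenizer` instance named `tokenizer` built from a `Config` as above:

```
// Override the constructor's "add_special_tokens" default before encoding;
// the key/value strings mirror those set in Tokenizer::Tokenizer above.
const char* keys[] = {"add_special_tokens"};
const char* values[] = {"true"};
tokenizer.UpdateOptions(keys, values, 1);

// Subsequent Encode/Decode calls pick up the updated options; no per-call flags
// are passed now that OrtxTokenizeWithOptions/OrtxDetokenize1DWithOptions are removed.
std::vector<int32_t> ids = tokenizer.Encode("She sells sea shells by the sea shore.");
std::string text = tokenizer.Decode(ids);
```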

src/models/model.h

Lines changed: 1 addition & 0 deletions
```
@@ -85,6 +85,7 @@ struct Tokenizer : std::enable_shared_from_this<Tokenizer>, LeakChecked<Tokenize

   std::unique_ptr<TokenizerStream> CreateStream() const;

+  void UpdateOptions(const char* const* keys, const char* const* values, size_t num_options);
   std::vector<int32_t> Encode(const char* text) const;
   std::string Decode(std::span<const int32_t> tokens) const;
   std::string ApplyChatTemplate(const char* template_str, const char* messages, const char* tools, bool add_generation_prompt) const;
```

src/ort_genai.h

Lines changed: 4 additions & 0 deletions
```
@@ -305,6 +305,10 @@ struct OgaTokenizer : OgaAbstract {
     return std::unique_ptr<OgaTokenizer>(p);
   }

+  void UpdateOptions(const char* const* keys, const char* const* values, size_t num_options) {
+    OgaCheckResult(OgaUpdateTokenizerOptions(this, keys, values, num_options));
+  }
+
   void Encode(const char* str, OgaSequences& sequences) const {
     OgaCheckResult(OgaTokenizerEncode(this, str, &sequences));
   }
```
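The C++ wrapper takes raw C-style arrays, so callers that keep options in a `std::map` need a small adapter. A sketch below; the `UpdateFromMap` helper is hypothetical and not part of the header, but it mirrors the key/value flattening the Python binding does:

```
#include <map>
#include <string>
#include <vector>

// Hypothetical helper: flattens a std::map into the parallel key/value arrays
// that OgaTokenizer::UpdateOptions expects. The string storage must outlive the call.
void UpdateFromMap(OgaTokenizer& tokenizer, const std::map<std::string, std::string>& options) {
  std::vector<const char*> keys;
  std::vector<const char*> values;
  keys.reserve(options.size());
  values.reserve(options.size());
  for (const auto& [key, value] : options) {
    keys.push_back(key.c_str());
    values.push_back(value.c_str());
  }
  tokenizer.UpdateOptions(keys.data(), values.data(), options.size());
}
```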

src/ort_genai_c.cpp

Lines changed: 17 additions & 0 deletions
```
@@ -581,6 +581,23 @@ OgaResult* OGA_API_CALL OgaCreateTokenizer(const OgaModel* model, OgaTokenizer**
   OGA_CATCH
 }

+OgaResult* OGA_API_CALL OgaUpdateTokenizerOptions(
+    OgaTokenizer* tokenizer,
+    const char* const* keys,
+    const char* const* values,
+    size_t num_options) {
+  OGA_TRY
+
+  if (!tokenizer)
+    throw std::runtime_error("Tokenizer pointer is null");
+
+  tokenizer->UpdateOptions(keys, values, num_options);
+
+  return nullptr;
+
+  OGA_CATCH
+}
+
 OgaResult* OGA_API_CALL OgaTokenizerEncode(const OgaTokenizer* tokenizer, const char* str, OgaSequences* sequences) {
   OGA_TRY
   sequences->emplace_back(tokenizer->Encode(str));
```

src/ort_genai_c.h

Lines changed: 37 additions & 0 deletions
```
@@ -572,6 +572,43 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateMultiModalProcessor(const OgaModel*

 OGA_EXPORT void OGA_API_CALL OgaDestroyMultiModalProcessor(OgaMultiModalProcessor* processor);

+/**
+ * Updates tokenizer options for the given OgaTokenizer instance.
+ * The provided keys and values must be null-terminated UTF-8 strings.
+ *
+ * This function allows updating tokenizer behavior at runtime by passing
+ * key/value string pairs. Each key corresponds to a configurable tokenizer
+ * option. Both keys and values must remain valid for the duration of this call.
+ *
+ * @param tokenizer Pointer to the OgaTokenizer whose options will be updated.
+ * @param keys Array of option key strings.
+ * @param values Array of corresponding option value strings (same length as keys).
+ * @param num_options Number of key/value pairs provided.
+ *
+ * @return nullptr on success, or an OgaResult* describing the error.
+ *         The returned OgaResult* (if not null) must be freed with OgaDestroyResult.
+ *
+ * Supported options:
+ *
+ * - `add_special_tokens`
+ *   - Purpose: Controls whether to add special tokens (e.g., BOS/EOS) during tokenization.
+ *   - Values: `"true"` / `"false"` or `"1"` / `"0"`.
+ *   - Default: `"false"`. This is the default value set by ORT GenAI prior to any options updating.
+ *
+ * - `skip_special_tokens`
+ *   - Purpose: Controls whether to remove special tokens during detokenization.
+ *   - Values: `"true"` / `"false"` or `"1"` / `"0"`.
+ *   - Default: `"true"`. This is the default value set by ORT GenAI prior to any options updating.
+ *
+ * Future tokenizer options may be added without changing this API signature.
+ * Passing unknown keys will result in an error.
+ */
+OGA_EXPORT OgaResult* OGA_API_CALL OgaUpdateTokenizerOptions(
+    OgaTokenizer* tokenizer,
+    const char* const* keys,
+    const char* const* values,
+    size_t num_options);
+
 /**
  * Encodes a single string and adds the encoded sequence of tokens to the OgaSequences. The OgaSequences must be freed with OgaDestroySequences
  * when it is no longer needed.
```
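The comment above also fixes the failure contract: unknown keys are rejected rather than silently ignored. A brief sketch of that path (the key name `not_a_real_option` is deliberately invalid and hypothetical):

```
// Assumes an existing OgaTokenizer* named `tokenizer`.
const char* keys[] = {"not_a_real_option"};  // not in the supported list above
const char* values[] = {"true"};

OgaResult* result = OgaUpdateTokenizerOptions(tokenizer, keys, values, 1);
if (result != nullptr) {
  // Expected: a non-null OgaResult describing the unknown-key error,
  // which must be released with OgaDestroyResult per the comment above.
  OgaDestroyResult(result);
}
```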

src/python/python.cpp

Lines changed: 20 additions & 2 deletions
```
@@ -361,11 +361,29 @@ PYBIND11_MODULE(onnxruntime_genai, m) {

   pybind11::class_<OgaTokenizer>(m, "Tokenizer")
       .def(pybind11::init([](const OgaModel& model) { return OgaTokenizer::Create(model); }))
+      .def("update_options", [](OgaTokenizer& t, pybind11::kwargs kwargs) {
+        std::vector<std::string> key_storage;
+        std::vector<std::string> value_storage;
+        key_storage.reserve(kwargs.size());
+        value_storage.reserve(kwargs.size());
+
+        std::vector<const char*> keys;
+        std::vector<const char*> values;
+        keys.reserve(kwargs.size());
+        values.reserve(kwargs.size());
+
+        for (auto& item : kwargs) {
+          key_storage.emplace_back(pybind11::str(item.first));
+          value_storage.emplace_back(pybind11::str(item.second));
+          keys.push_back(key_storage.back().c_str());
+          values.push_back(value_storage.back().c_str());
+        }
+
+        t.UpdateOptions(keys.data(), values.data(), kwargs.size()); })
       .def("encode", [](const OgaTokenizer& t, std::string s) -> pybind11::array_t<int32_t> {
         auto sequences = OgaSequences::Create();
         t.Encode(s.c_str(), *sequences);
-        return ToPython(sequences->Get(0));
-      })
+        return ToPython(sequences->Get(0)); })
       .def("to_token_id", &OgaTokenizer::ToTokenId)
       .def("decode", [](const OgaTokenizer& t, pybind11::array_t<int32_t> tokens) -> std::string { return t.Decode(ToSpan(tokens)).p_; })
       .def("apply_chat_template", [](const OgaTokenizer& t, const char* messages, const char* template_str, const char* tools, bool add_generation_prompt) -> std::string { return t.ApplyChatTemplate(template_str, messages, tools, add_generation_prompt).p_; }, pybind11::arg("messages"), pybind11::kw_only(), pybind11::arg("template_str") = nullptr, pybind11::arg("tools") = nullptr, pybind11::arg("add_generation_prompt") = true)
```

test/c_api_tests.cpp

Lines changed: 63 additions & 1 deletion
```
@@ -113,6 +113,68 @@ TEST(CAPITests, TokenizerCAPI) {
 #endif
 }

+TEST(CAPITests, TokenizerUpdateOptions) {
+#if TEST_PHI2
+  auto config = OgaConfig::Create(PHI2_PATH);
+  auto model = OgaModel::Create(*config);
+  auto tokenizer = OgaTokenizer::Create(*model);
+
+  // Update tokenizer options
+  // Note: This simply tests the UpdateOptions API; these options are already set as default.
+  {
+    const char* keys[] = {"add_special_tokens", "skip_special_tokens"};
+    const char* values[] = {"false", "true"};
+    tokenizer->UpdateOptions(keys, values, 2);
+  }
+
+  // Encode single decode single
+  {
+    const char* input_string = "She sells sea shells by the sea shore.";
+    auto input_sequences = OgaSequences::Create();
+    tokenizer->Encode(input_string, *input_sequences);
+
+    auto out_string = tokenizer->Decode(input_sequences->SequenceData(0), input_sequences->SequenceCount(0));
+    ASSERT_STREQ(input_string, out_string);
+  }
+
+  const char* input_strings[] = {
+      "This is a test.",
+      "Rats are awesome pets!",
+      "The quick brown fox jumps over the lazy dog.",
+  };
+
+  auto sequences = OgaSequences::Create();
+
+  // Encode all strings
+  {
+    for (auto& string : input_strings)
+      tokenizer->Encode(string, *sequences);
+  }
+
+  // Decode one at a time
+  for (size_t i = 0; i < sequences->Count(); i++) {
+    auto out_string = tokenizer->Decode(sequences->SequenceData(i), sequences->SequenceCount(i));
+    std::cout << "Decoded string:" << out_string << std::endl;
+    if (strcmp(input_strings[i], out_string) != 0)
+      throw std::runtime_error("Token decoding mismatch");
+  }
+
+  // Stream Decode one at a time
+  for (size_t i = 0; i < sequences->Count(); i++) {
+    auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+
+    auto* sequence = sequences->SequenceData(i);
+    std::string stream_result;
+    for (size_t j = 0; j < sequences->SequenceCount(i); j++) {
+      stream_result += tokenizer_stream->Decode(sequence[j]);
+    }
+    std::cout << "Stream decoded string:" << stream_result << std::endl;
+    if (strcmp(input_strings[i], stream_result.c_str()) != 0)
+      throw std::runtime_error("Stream token decoding mismatch");
+  }
+#endif
+}
+
 TEST(CAPITests, ChatTemplate) {
 #if TEST_PHI2
   // We load the phi-2 model just to get a tokenizer (phi-2 does not have a chat template)
@@ -1281,4 +1343,4 @@ TEST(CAPITests, SetGuidance) {

 #endif
 }
-#endif
+#endif
```
