Merge pull request #11 from character-tech/tanuj/accum

tanujtiwari1998 · web-flow · commit 6c7451c2ca87 · 2025-07-02T12:03:43.000-07:00
add support for accumulate in vllm
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
@@ -855,6 +855,12 @@ class CompletionRequest(OpenAIBaseModel):
             " as strings of the form 'token_id:{token_id}' so that tokens "
             "that are not JSON-encodable can be identified."))
 
+    accumulate: Optional[bool] = Field(
+        default=None,
+        description=(
+            "Special kind of echo where in the response instead of delta "
+            "we return the accumulated text"),
+    )
     # doc: end-completion-extra-params
 
     # Default sampling parameters for completion requests
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
@@ -262,6 +262,9 @@ async def completion_stream_generator(
         previous_num_tokens = [0] * num_choices * num_prompts
         has_echoed = [False] * num_choices * num_prompts
         num_prompt_tokens = [0] * num_prompts
+        accumulated_text = [""] * num_choices * num_prompts
+        accumulated_tokens = [[] for _ in range(num_choices * num_prompts)]
+        accumulated_logprobs = [[] for _ in range(num_choices * num_prompts)]
 
         stream_options = request.stream_options
         if stream_options:
@@ -309,6 +312,16 @@ async def completion_stream_generator(
                                 *(output.logprobs or []),
                             ]
                         has_echoed[i] = True
+                    elif request.accumulate:
+                        i = output.index + prompt_idx * num_choices
+                        # accumulate mode: send everything generated so far
+                        accumulated_text[i] += output.text
+                        accumulated_tokens[i].extend(output.token_ids)
+                        accumulated_logprobs[i].extend(output.logprobs or [])
+
+                        delta_text = accumulated_text[i]
+                        delta_token_ids = accumulated_tokens[i]  # live list
+                        out_logprobs = accumulated_logprobs[i]  # live list
                     else:
                         # return just the delta
                         delta_text = output.text