Skip to content

Commit 2985f65

Browse files
committed
fix
Signed-off-by: Woosuk Kwon <[email protected]>
1 parent b22ee16 commit 2985f65

File tree

5 files changed

+12
-16
lines changed

5 files changed

+12
-16
lines changed

vllm/v1/worker/gpu/cudagraph_utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,7 @@ def capture_graph(
7878
kv_cache_config: KVCacheConfig,
7979
) -> None:
8080
num_reqs = min(num_tokens, self.max_num_reqs)
81-
input_ids = input_buffers.input_ids.gpu[:num_tokens]
81+
input_ids = input_buffers.input_ids[:num_tokens]
8282
positions = input_buffers.positions[:num_tokens]
8383
attn_metadata = prepare_inputs_to_capture(
8484
num_reqs,

vllm/v1/worker/gpu/input_batch.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ def __init__(
2929
self.pin_memory = pin_memory
3030

3131
self.idx_mapping = self._make_buffer(max_num_reqs, dtype=torch.int32)
32-
self.input_ids = self._make_buffer(max_num_tokens, dtype=torch.int32)
32+
self.input_ids = torch.zeros(max_num_tokens, dtype=torch.int32, device=device)
3333
self.positions = torch.zeros(max_num_tokens, dtype=torch.int64, device=device)
3434
self.query_start_loc = self._make_buffer(max_num_reqs + 1, dtype=torch.int32)
3535
self.seq_lens = torch.zeros(max_num_reqs, dtype=torch.int32, device=device)
@@ -116,7 +116,7 @@ def make_dummy(
116116
input_buffers.seq_lens[num_reqs:] = 0
117117
seq_lens = input_buffers.seq_lens[:num_reqs]
118118

119-
input_ids = input_buffers.input_ids.copy_to_gpu(num_tokens)
119+
input_ids = input_buffers.input_ids[:num_tokens]
120120
positions = input_buffers.positions[:num_tokens]
121121
# attn_metadata = defaultdict(lambda: None)
122122
logits_indices = query_start_loc[1:] - 1

vllm/v1/worker/gpu/model_runner.py

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -410,9 +410,6 @@ def update_states(self, scheduler_output: SchedulerOutput) -> None:
410410
cu_num_new_blocks[i].append(x + len(block_ids))
411411
new_block_ids[i].extend(block_ids)
412412
overwrite.append(True)
413-
# Update the GPU tensors for request states.
414-
if scheduler_output.scheduled_new_reqs:
415-
self.req_states.prefill_len.copy_to_gpu()
416413

417414
# Add new blocks for the existing requests.
418415
cached_reqs = scheduler_output.scheduled_cached_reqs
@@ -507,7 +504,7 @@ def prepare_inputs(
507504

508505
# Get prefill tokens.
509506
prepare_prefill_inputs(
510-
self.input_buffers.input_ids.gpu,
507+
self.input_buffers.input_ids,
511508
self.req_states.next_prefill_tokens,
512509
idx_mapping,
513510
query_start_loc_gpu,
@@ -529,7 +526,7 @@ def prepare_inputs(
529526
# Some input token ids are directly read from the last sampled tokens
530527
# and draft tokens. Also, get the logits indices to sample tokens from.
531528
logits_indices = combine_sampled_and_draft_tokens(
532-
self.input_buffers.input_ids.gpu,
529+
self.input_buffers.input_ids,
533530
idx_mapping,
534531
self.req_states.last_sampled_tokens,
535532
query_start_loc_gpu,
@@ -570,7 +567,7 @@ def prepare_inputs(
570567
kv_cache_config=self.kv_cache_config,
571568
)
572569

573-
input_ids = self.input_buffers.input_ids.gpu[:num_tokens_after_padding]
570+
input_ids = self.input_buffers.input_ids[:num_tokens_after_padding]
574571
positions = self.input_buffers.positions[:num_tokens_after_padding]
575572
return InputBatch(
576573
req_ids=req_ids,

vllm/v1/worker/gpu/spec_decode/eagle.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -121,7 +121,7 @@ def run_model(
121121
num_tokens_across_dp=num_tokens_across_dp,
122122
):
123123
ret_hidden_states = self.model(
124-
input_ids=self.input_buffers.input_ids.gpu[:num_tokens],
124+
input_ids=self.input_buffers.input_ids[:num_tokens],
125125
positions=self.input_buffers.positions[:num_tokens],
126126
hidden_states=self.hidden_states[:num_tokens],
127127
)
@@ -139,7 +139,7 @@ def generate_draft(
139139
num_tokens_across_dp: torch.Tensor | None,
140140
) -> None:
141141
pos = self.input_buffers.positions[:num_reqs]
142-
query_start_loc = self.input_buffers.query_start_loc.gpu[: num_reqs + 1]
142+
query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
143143
for step in range(1, self.num_speculative_steps):
144144
# Run the eagle model.
145145
last_hidden_states, hidden_states = self.run_model(
@@ -379,7 +379,7 @@ def prepare_eagle_inputs(
379379
)
380380
_prepare_eagle_inputs_kernel[(num_reqs,)](
381381
last_token_indices,
382-
input_buffers.input_ids.gpu,
382+
input_buffers.input_ids,
383383
input_buffers.positions,
384384
input_batch.input_ids,
385385
input_batch.positions,
@@ -482,7 +482,7 @@ def prepare_eagle_decode(
482482
last_token_indices,
483483
target_seq_lens,
484484
num_rejected,
485-
input_buffers.input_ids.gpu,
485+
input_buffers.input_ids,
486486
input_buffers.positions,
487487
input_hidden_states,
488488
input_hidden_states.stride(0),
@@ -550,7 +550,7 @@ def update_eagle_inputs(
550550
):
551551
num_reqs, hidden_size = output_hidden_states.shape
552552
_update_eagle_inputs_kernel[(num_reqs,)](
553-
input_buffers.input_ids.gpu,
553+
input_buffers.input_ids,
554554
input_buffers.positions,
555555
hidden_states,
556556
hidden_states.stride(0),

vllm/v1/worker/gpu/states.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -117,8 +117,7 @@ def __init__(
117117
self.prefill_token_ids = UvaBuffer(
118118
self.max_num_reqs, self.max_model_len, dtype=torch.int32
119119
)
120-
self.prefill_len = self._make_buffer(self.max_num_reqs, dtype=torch.int32)
121-
120+
self.prefill_len = UvaBuffer(self.max_num_reqs, dtype=torch.int32)
122121
# Number of computed tokens.
123122
self.num_computed_prefill_tokens = np.zeros(self.max_num_reqs, dtype=np.int32)
124123
self.num_computed_tokens = torch.zeros(

0 commit comments

Comments (0)