vllm-project · njhill · Jun 5, 2025 · May 23, 2025 · May 23, 2025 · May 23, 2025
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -116,6 +116,8 @@ def update_state_after_alloc(self, request: "Request",
         """
         Update KVConnector state after block allocation.
         """
+        if num_external_tokens == 0:
+            return
         self._lmcache_engine.update_state_after_alloc(request,
                                                       num_external_tokens)
 

@@ -238,9 +238,14 @@ def update_state_after_alloc(self, request: "Request",
             if params.get("remote_block_ids"):
                 if all(p in params for p in ("remote_engine_id", "remote_host",
                                              "remote_port")):
+                    # If remote_block_ids is set and num_external_tokens == 0,
+                    # we have a full prefix cache hit on the D worker. We still
+                    # call send_notif in _read_blocks to free P worker memory.
+                    local_block_ids = (blocks.get_unhashed_block_ids()
+                                       if num_external_tokens > 0 else [])
                     # Get unhashed blocks to pull from remote.
                     self._reqs_need_recv[request.request_id] = (
-                        request, blocks.get_unhashed_block_ids())
+                        request, local_block_ids)
 # NOTE: if count is 0 here, we have less than block_size 
 # tokens to pull after subtracting the local prefix cache hit. 
 # The remote only sends fully computed blocks, so there is 
 # nothing to transfer but we still need to notify the 
 # prefill worker so that the remote blocks are freed. 
 if all(p in params for p in ("remote_engine_id", "remote_host", 
                              "remote_port")): 
     self._reqs_need_recv[request.request_id] = (request, []) 
 # NOTE: if count is 0 here, we have less than block_size 
 # tokens to pull after subtracting the local prefix cache hit. 
 # The remote only sends fully computed blocks, so there is 
 # nothing to transfer but we still need to notify the 
 # prefill worker so that the remote blocks are freed. 
 if all(p in params for p in ("remote_engine_id", "remote_host", 
                              "remote_port")): 
     self._reqs_need_recv[request.request_id] = (request, []) 
                 else:
                     logger.warning(
                         "Got invalid KVTransferParams: %s. This "
@@ -259,15 +264,6 @@ def build_connector_meta(
         # Loop through scheduled reqs and convert to ReqMeta.
         for req_id, (req, block_ids) in self._reqs_need_recv.items():
             assert req.kv_transfer_params is not None
-            # For the case where there are no remote blocks to pull
-            # (block_ids is empty), we don't need to schedule
-            # an async read on the worker side.
-            if not block_ids:
-                logger.debug(
-                    "Skipping adding request %s to NixlConnectorMetadata, "
-                    "as there are no remote blocks to pull", req_id)
-                continue
-
             meta.add_new_req(
                 request_id=req_id,
                 local_block_ids=block_ids,
@@ -731,7 +727,7 @@ def _read_blocks(
         # just notify P worker that we have the blocks we need.
         num_local_blocks = len(local_block_ids)
         if num_local_blocks == 0:
-            self.nixl_wrapper.send_notif(dst_engine_id,
+            self.nixl_wrapper.send_notif(self._remote_agents[dst_engine_id],
                                          notif_msg=request_id.encode("utf-8"))
             return
 

@@ -414,11 +414,11 @@ def schedule(self) -> SchedulerOutput:
                     # The request cannot be scheduled.
                     break
 
-                # KVConnector: update internal state after allocation.
+                # KVTransfer: update the connector whenever one is set,
+                # even if there are no external computed tokens.
                 # This information is used to determine if a load is
                 # needed for this request.
-                if num_external_computed_tokens:
-                    assert self.connector is not None
+                if self.connector is not None:
                     self.connector.update_state_after_alloc(
                         request,
                         new_computed_blocks + new_blocks,