Skip to content

Commit 219fea7

Browse files
committed
fix qwen3.5 moe reader
1 parent 7d6ae4f commit 219fea7

3 files changed

Lines changed: 104 additions & 81 deletions

File tree

lmdeploy/turbomind/deploy/source_model/qwen.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
# Copyright (c) OpenMMLab. All rights reserved.
22
import json
33
import os.path as osp
4+
import re
45

56
import torch
67

78
from ..config import RopeParam
9+
from ..loader import create_loader
810
from .base import INPUT_MODELS
911
from .llama import LlamaModel, LlamaReader
1012

@@ -383,13 +385,58 @@ def model_info(self):
383385

384386

385387
class Qwen3_5MoeReader(Qwen3_5ReaderMixin, Qwen3MoeReader):
    """Weight reader for Qwen3.5 MoE checkpoints.

    Supports the "packed" checkpoint layout, where the weights of all
    experts of a layer are stacked along the first dimension of a single
    tensor, in addition to the per-expert layout handled by the base class.
    """

    def _unpacked_moe_expert(self, e: int, i: int, kind: str):
        """Slice expert ``e`` of layer ``i`` out of a packed checkpoint.

        Returns a ``(gate, down, up)`` tuple, or ``None`` when the packed
        tensors are not present (i.e. the checkpoint uses the per-expert
        layout and the caller should fall back to the base class).
        """
        base = f'{self.attn_layer_prefix}.{i}.mlp.experts'
        packed_gate_up = self.params.get(f'{base}.gate_up_proj.{kind}')
        packed_down = self.params.get(f'{base}.down_proj.{kind}')
        if packed_gate_up is None or packed_down is None:
            return None

        # Packed Qwen3.5 MoE checkpoints stack all experts along dim 0.
        # Slice a single expert out *before* applying the transform so that
        # quantized policies still operate on a 2-D tensor.
        expert_gate_up = self.transform(packed_gate_up[e], kind)
        expert_down = self.transform(packed_down[e], kind)
        gate, up = expert_gate_up.chunk(2, dim=0)
        return gate, expert_down, up

    def moe_ffn_expert(self, e=None, i=None, kind=None):
        """Return the FFN weights of expert ``e`` in layer ``i``.

        Prefers the packed layout; delegates to the base class when the
        packed tensors are absent.
        """
        if not kind:
            return self.filter(r'experts', i)
        result = self._unpacked_moe_expert(e, i, kind)
        if result is None:
            result = super().moe_ffn_expert(e, i, kind)
        return result
387412

388413

389414
@INPUT_MODELS.register_module(name='qwen3_5-moe')
390415
class Qwen3_5MoeModel(Qwen3MoeModel):
391416
Reader = Qwen3_5MoeReader
392417

418+
@staticmethod
419+
def map_packed_qwen35_experts(name: str):
420+
"""Map packed expert names to weight names, i.e.,
421+
"mlp.experts.gate_up_proj" -> "mlp.experts.gate_up_proj.weight" so that
422+
class Weight in parameter.py can classify them."""
423+
s = re.sub(r'(mlp\.experts\.(?:gate_up|down)_proj)$', r'\1.weight', name)
424+
return s
425+
426+
def readers(self):
    """Yield ``(shard_index, reader)`` pairs over the checkpoint shards.

    When the checkpoint stores experts in the packed layout (all experts
    stacked inside ``mlp.experts.{gate_up,down}_proj``), a name mapping is
    installed on the loader so those entries gain a ``.weight`` suffix and
    can be classified downstream.
    """
    # Some Reader classes still expose the misspelled 'attn_layer_patten';
    # prefer the corrected attribute name when available.
    pattern = getattr(self.Reader, 'attn_layer_pattern', self.Reader.attn_layer_patten)
    loader = create_loader(self.model_path, pattern, [])

    index_keys = loader.index.keys()
    packed_gate_up = any('mlp.experts.gate_up_proj' in key for key in index_keys)
    packed_down = any('mlp.experts.down_proj' in key for key in index_keys)
    if packed_gate_up and packed_down:
        loader.mappings = [self.map_packed_qwen35_experts]

    for shard_idx, param in loader.items():
        yield shard_idx, self.Reader(param, {}, False, self.model_config, policy=self.policy, fp8_quant=self.fp8_quant)
    torch.cuda.empty_cache()
393440
def model_info(self):
394441
if 'text_config' in self.model_config:
395442
self.model_config = self.model_config['text_config']

lmdeploy/utils.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from contextlib import contextmanager
99
from dataclasses import dataclass
1010
from logging import Logger, LogRecord
11-
from typing import List, Optional, Tuple, Union
1211

1312
import torch
1413
from transformers import PretrainedConfig
@@ -26,7 +25,7 @@ class _ASNI_COLOR:
2625

2726
# copy from: https://github.com/termcolor/termcolor
2827
@functools.cache
29-
def can_colorize(*, no_color: Optional[bool] = None, force_color: Optional[bool] = None) -> bool:
28+
def can_colorize(*, no_color: bool | None = None, force_color: bool | None = None) -> bool:
3029
"""Check env vars and for tty/dumb terminal."""
3130
import io
3231
if no_color is not None and no_color:
@@ -110,8 +109,8 @@ def filter(self, record: LogRecord) -> bool:
110109
' - %(message)s'
111110

112111

113-
def get_logger(name: Optional[str] = None,
114-
log_file: Optional[str] = None,
112+
def get_logger(name: str | None = None,
113+
log_file: str | None = None,
115114
log_level: int = logging.INFO,
116115
file_mode: str = 'a',
117116
log_formatter: str = _FORMAT) -> Logger:
@@ -178,7 +177,7 @@ def get_logger(name: Optional[str] = None,
178177
return logger
179178

180179

181-
def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
180+
def filter_suffix(response: str, suffixes: list[str] | None = None) -> str:
182181
"""Filter response with suffixes.
183182
184183
Args:
@@ -197,12 +196,12 @@ def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
197196

198197

199198
# TODO remove stop_word_offsets stuff and make it clean
200-
def _stop_words(stop_words: List[Union[int, str]], tokenizer: object):
199+
def _stop_words(stop_words: list[int | str], tokenizer: object):
201200
"""Return list of stop-words to numpy.ndarray."""
202201
import numpy as np
203202
if stop_words is None:
204203
return None
205-
assert isinstance(stop_words, List) and \
204+
assert isinstance(stop_words, list) and \
206205
all(isinstance(elem, (str, int)) for elem in stop_words), \
207206
f'stop_words must be a list but got {type(stop_words)}'
208207
stop_indexes = []
@@ -211,7 +210,7 @@ def _stop_words(stop_words: List[Union[int, str]], tokenizer: object):
211210
stop_indexes += tokenizer.indexes_containing_token(stop_word)
212211
elif isinstance(stop_word, int):
213212
stop_indexes.append(stop_word)
214-
assert isinstance(stop_indexes, List) and all(isinstance(elem, int) for elem in stop_indexes), 'invalid stop_words'
213+
assert isinstance(stop_indexes, list) and all(isinstance(elem, int) for elem in stop_indexes), 'invalid stop_words'
215214
# each id in stop_indexes represents a stop word
216215
# refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
217216
# detailed explanation about fastertransformer's stop_indexes
@@ -297,7 +296,7 @@ async def __tmp():
297296
# modified from https://github.com/vllm-project/vllm/blob/0650e5935b0f6af35fb2acf71769982c47b804d7/vllm/config.py#L1082-L1150 # noqa
298297
def _get_and_verify_max_len(
299298
hf_config: PretrainedConfig,
300-
max_model_len: Optional[int],
299+
max_model_len: int | None,
301300
) -> int:
302301
"""Get and verify the model's maximum length."""
303302

@@ -326,7 +325,11 @@ def _get_and_verify_max_len(
326325
]
327326
max_len_key = None
328327
for key in possible_keys:
329-
max_len = getattr(hf_config, key, None)
328+
max_len = None
329+
if hasattr(hf_config, key):
330+
max_len = getattr(hf_config, key)
331+
elif key in hf_config:
332+
max_len = hf_config[key]
330333
if max_len is not None:
331334
max_len_key = key if max_len < derived_max_model_len \
332335
else max_len_key
@@ -503,9 +506,9 @@ class FlattenedTensorBucket:
503506

504507
def __init__(
505508
self,
506-
named_tensors: List[Tuple[str, torch.Tensor]] = None,
509+
named_tensors: list[tuple[str, torch.Tensor]] | None = None,
507510
flattened_tensor: torch.Tensor = None,
508-
metadata: List[FlattenedTensorMetadata] = None,
511+
metadata: list[FlattenedTensorMetadata] | None = None,
509512
):
510513
"""Initialize a tensor bucket from a list of named tensors or from pre-
511514
flattened data.
@@ -548,11 +551,11 @@ def get_flattened_tensor(self) -> torch.Tensor:
548551
"""Get the flattened tensor containing multiple tensors."""
549552
return self.flattened_tensor
550553

551-
def get_metadata(self) -> List[FlattenedTensorMetadata]:
554+
def get_metadata(self) -> list[FlattenedTensorMetadata]:
552555
"""Get all metadatas for all tensors in the bucket."""
553556
return self.metadata
554557

555-
def reconstruct_tensors(self) -> List[Tuple[str, torch.Tensor]]:
558+
def reconstruct_tensors(self) -> list[tuple[str, torch.Tensor]]:
556559
"""Reconstruct original tensors."""
557560
# preallocate the result list
558561
reconstructed = [None] * len(self.metadata)

tests/test_lmdeploy/test_qwen3coder_parser.py

Lines changed: 39 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,10 @@
77
import shortuuid
88

99
from lmdeploy.serve.openai.api_server import VariableInterface
10-
from lmdeploy.serve.openai.protocol import (
11-
ChatCompletionRequest, ChatCompletionResponse,
12-
ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
13-
ChatCompletionStreamResponse, ChatMessage, DeltaMessage, DeltaToolCall,
14-
UsageInfo)
15-
from lmdeploy.serve.openai.tool_parser.qwen3coder_parser import (
16-
Qwen3CoderToolParser)
10+
from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice,
11+
ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse,
12+
ChatMessage, DeltaMessage, DeltaToolCall, UsageInfo)
13+
from lmdeploy.serve.openai.tool_parser.qwen3coder_parser import Qwen3CoderToolParser
1714

1815
TestExpects = collections.namedtuple('TestExpects', 'func_name kwargs')
1916

@@ -57,30 +54,26 @@ def encode(self, text: str) -> List[int]:
5754

5855

5956
def _chat_completion_v1(
60-
request: ChatCompletionRequest, text_sequence: List[str]
61-
) -> Union[ChatCompletionResponse, Generator[ChatCompletionStreamResponse,
62-
None, None]]:
57+
request: ChatCompletionRequest,
58+
text_sequence: List[str]) -> Union[ChatCompletionResponse, Generator[ChatCompletionStreamResponse, None, None]]:
6359
request_id = f'chat-{shortuuid.random()}'
6460
created_time = int(time.time())
6561
model_name = request.model
6662
if request.stream:
6763

68-
def completion_stream_generator(
69-
) -> Generator[ChatCompletionStreamResponse, None, None]:
64+
def completion_stream_generator() -> Generator[ChatCompletionStreamResponse, None, None]:
7065
previous_text = ''
7166
current_text = ''
7267
finish_reason = 'stop'
73-
has_parser = (VariableInterface.tool_parser is not None
74-
or VariableInterface.reasoning_parser is not None)
68+
has_parser = (VariableInterface.tool_parser is not None or VariableInterface.reasoning_parser is not None)
7569
for text in text_sequence:
7670
logprobs, usage = None, None
7771
delta_message = DeltaMessage(role='assistant', content=text)
7872
if has_parser:
7973
current_text = current_text + text
8074
has_tool = VariableInterface.tool_parser is not None
8175
if request.tool_choice != 'none' and has_tool:
82-
tool_delta = VariableInterface.tool_parser\
83-
.extract_tool_calls_streaming(
76+
tool_delta = VariableInterface.tool_parser.extract_tool_calls_streaming(
8477
previous_text=previous_text,
8578
current_text=current_text,
8679
delta_text=delta_message.content,
@@ -93,25 +86,22 @@ def completion_stream_generator(
9386
delta_message.content = tool_delta.content or ''
9487
if VariableInterface.reasoning_parser is not None:
9588
parser = VariableInterface.reasoning_parser
96-
reasoning_delta = parser.extract_reasoning_content_streaming(
97-
previous_text=previous_text,
98-
current_text=current_text,
99-
delta_text=delta_message.content,
100-
previous_token_ids=[],
101-
current_token_ids=[],
102-
delta_token_ids=[])
89+
reasoning_delta = parser.extract_reasoning_content_streaming(previous_text=previous_text,
90+
current_text=current_text,
91+
delta_text=delta_message.content,
92+
previous_token_ids=[],
93+
current_token_ids=[],
94+
delta_token_ids=[])
10395
if reasoning_delta is not None:
104-
delta_message.reasoning_content = (
105-
reasoning_delta.reasoning_content)
96+
delta_message.reasoning_content = (reasoning_delta.reasoning_content)
10697
delta_message.content = reasoning_delta.content or ''
10798
if has_parser:
10899
previous_text = current_text
109100

110-
choice_data = ChatCompletionResponseStreamChoice(
111-
index=0,
112-
delta=delta_message,
113-
finish_reason=finish_reason,
114-
logprobs=logprobs)
101+
choice_data = ChatCompletionResponseStreamChoice(index=0,
102+
delta=delta_message,
103+
finish_reason=finish_reason,
104+
logprobs=logprobs)
115105
response = ChatCompletionStreamResponse(
116106
id=request_id,
117107
created=created_time,
@@ -129,25 +119,20 @@ def completion_stream_generator(
129119
finish_reason = 'stop'
130120
has_tool = VariableInterface.tool_parser is not None
131121
if request.tool_choice != 'none' and has_tool:
132-
tool_call_info = VariableInterface.tool_parser.extract_tool_calls(
133-
text, request=request)
122+
tool_call_info = VariableInterface.tool_parser.extract_tool_calls(text, request=request)
134123
text, tool_calls = tool_call_info.content, tool_call_info.tool_calls
135124
if isinstance(tool_calls, List) and len(tool_calls):
136125
if finish_reason == 'stop':
137126
finish_reason = 'tool_calls'
138127

139128
if VariableInterface.reasoning_parser is not None:
140129
parser = VariableInterface.reasoning_parser
141-
reasoning_content, text = parser.extract_reasoning_content(
142-
text, request)
130+
reasoning_content, text = parser.extract_reasoning_content(text, request)
143131

144132
choices = []
145133
choice_data = ChatCompletionResponseChoice(
146134
index=0,
147-
message=ChatMessage(role='assistant',
148-
content=text,
149-
tool_calls=tool_calls,
150-
reasoning_content=reasoning_content),
135+
message=ChatMessage(role='assistant', content=text, tool_calls=tool_calls, reasoning_content=reasoning_content),
151136
finish_reason=finish_reason,
152137
)
153138
choices.append(choice_data)
@@ -161,9 +146,7 @@ def completion_stream_generator(
161146
)
162147

163148

164-
def _stream_parse(
165-
request: ChatCompletionRequest,
166-
text_sequence: List[str]) -> Tuple[str, str, List[DeltaToolCall]]:
149+
def _stream_parse(request: ChatCompletionRequest, text_sequence: List[str]) -> Tuple[str, str, List[DeltaToolCall]]:
167150
content = ''
168151
reasoning_content = ''
169152
tool_calls = {}
@@ -184,19 +167,16 @@ def _stream_parse(
184167
if c.function.name:
185168
existing_call.function.name = c.function.name
186169
if c.function.arguments:
187-
existing_call.function.arguments = (
188-
existing_call.function.arguments or '')
170+
existing_call.function.arguments = (existing_call.function.arguments or '')
189171
existing_call.function.arguments += c.function.arguments
190-
return content, reasoning_content, list(
191-
sorted(tool_calls.values(), key=lambda x: x.index))
172+
return content, reasoning_content, list(sorted(tool_calls.values(), key=lambda x: x.index))
192173

193174

194175
@pytest.mark.parametrize(('text_sequence', 'expects'), [
195-
(DELTA_TEXT_SEQUENCE,
196-
[TestExpects('get_weather', {
197-
'location': '北京',
198-
'unit': 'celsius'
199-
})]),
176+
(DELTA_TEXT_SEQUENCE, [TestExpects('get_weather', {
177+
'location': '北京',
178+
'unit': 'celsius'
179+
})]),
200180
(DELTA_TEXT_SEQUENCE_MULTIPLE_CALLS, [
201181
TestExpects('get_weather', {
202182
'location': '北京',
@@ -209,11 +189,8 @@ def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]):
209189
tokenizer = DummyTokenizer()
210190
VariableInterface.tool_parser = Qwen3CoderToolParser(tokenizer=tokenizer)
211191
VariableInterface.reasoning_parser = None
212-
request = ChatCompletionRequest(model='qwen3coder',
213-
messages=[],
214-
stream=True)
215-
content, reasoning_content, tool_calls = _stream_parse(
216-
request, text_sequence)
192+
request = ChatCompletionRequest(model='qwen3coder', messages=[], stream=True)
193+
content, reasoning_content, tool_calls = _stream_parse(request, text_sequence)
217194
assert len(tool_calls) == len(expects)
218195
for parsed_call, expected_call in zip(tool_calls, expects):
219196
assert parsed_call.function.name == expected_call.func_name
@@ -223,11 +200,10 @@ def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]):
223200

224201

225202
@pytest.mark.parametrize(('text_sequence', 'expects'), [
226-
(DELTA_TEXT_SEQUENCE,
227-
[TestExpects('get_weather', {
228-
'location': '北京',
229-
'unit': 'celsius'
230-
})]),
203+
(DELTA_TEXT_SEQUENCE, [TestExpects('get_weather', {
204+
'location': '北京',
205+
'unit': 'celsius'
206+
})]),
231207
(DELTA_TEXT_SEQUENCE_MULTIPLE_CALLS, [
232208
TestExpects('get_weather', {
233209
'location': '北京',
@@ -236,14 +212,12 @@ def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]):
236212
TestExpects('get_weather', {'location': '上海'})
237213
]),
238214
])
239-
def test_parser_nonstream(text_sequence: List[str],
240-
expects: List[TestExpects]):
215+
def test_parser_nonstream(text_sequence: List[str], expects: List[TestExpects]):
241216
tokenizer = DummyTokenizer()
242217
VariableInterface.tool_parser = Qwen3CoderToolParser(tokenizer=tokenizer)
243218
VariableInterface.reasoning_parser = None
244219
resp: ChatCompletionResponse = _chat_completion_v1(
245-
ChatCompletionRequest(model='qwen3coder', messages=[], stream=False),
246-
text_sequence)
220+
ChatCompletionRequest(model='qwen3coder', messages=[], stream=False), text_sequence)
247221

248222
assert len(resp.choices) == 1
249223
first_message = resp.choices[0].message
@@ -273,8 +247,7 @@ def test_no_think_nonstream():
273247
VariableInterface.tool_parser = Qwen3CoderToolParser(tokenizer=tokenizer)
274248
VariableInterface.reasoning_parser = None
275249
resp: ChatCompletionResponse = _chat_completion_v1(
276-
ChatCompletionRequest(model='qwen3coder', messages=[], stream=False),
277-
text_sequence)
250+
ChatCompletionRequest(model='qwen3coder', messages=[], stream=False), text_sequence)
278251

279252
assert len(resp.choices) == 1
280253
first_message = resp.choices[0].message

0 commit comments

Comments
 (0)