From 990f02e5892a423e9672c6b87f7b68cd1940af75 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 22 Jun 2026 16:48:00 +0800
Subject: [PATCH 1/8] chore: remove deprecated model support

---
 docs/en/inference/load_hf.md                  |  18 +-
 docs/en/llm/api_server.md                     |   2 +-
 docs/en/llm/api_server_anthropic.md           |   6 +-
 docs/en/multi_modal/internvl.md               |   1 -
 docs/en/multi_modal/qwen2_vl.md               |   1 -
 docs/en/supported_models/supported_models.md  |  16 +-
 docs/zh_cn/inference/load_hf.md               |  18 +-
 docs/zh_cn/llm/api_server.md                  |   2 +-
 docs/zh_cn/llm/api_server_anthropic.md        |   6 +-
 docs/zh_cn/multi_modal/internvl.md            |   1 -
 docs/zh_cn/multi_modal/qwen2_vl.md            |   1 -
 .../supported_models/supported_models.md      |  16 +-
 lmdeploy/api.py                               |   7 +-
 lmdeploy/archs.py                             |  15 +-
 lmdeploy/cli/cli.py                           |   6 +-
 lmdeploy/cli/serve.py                         |   6 +-
 lmdeploy/lite/apis/calibrate.py               |  30 +-
 lmdeploy/lite/apis/smooth_quant.py            |   8 -
 lmdeploy/lite/quantization/calibration.py     |   2 +-
 lmdeploy/model.py                             |  63 ---
 lmdeploy/pytorch/engine/engine.py             |   5 +-
 lmdeploy/pytorch/models/baichuan.py           | 414 ------------------
 lmdeploy/pytorch/models/internlm.py           | 414 ------------------
 lmdeploy/pytorch/models/internlm2_ve.py       | 322 --------------
 lmdeploy/pytorch/models/internvl.py           |  82 +---
 lmdeploy/pytorch/models/internvl_patch.py     |  73 ---
 lmdeploy/pytorch/models/module_map.py         |  35 +-
 lmdeploy/pytorch/models/patch.py              |  13 +-
 lmdeploy/pytorch/models/qwen.py               | 413 -----------------
 lmdeploy/pytorch/models/starcoder2.py         | 386 ----------------
 lmdeploy/serve/core/async_engine.py           |   9 +-
 lmdeploy/serve/openai/api_server.py           |   5 +-
 lmdeploy/serve/proxy/proxy.py                 |   4 +-
 lmdeploy/turbomind/supported_models.py        |   5 +-
 lmdeploy/turbomind/tokenizer_info.py          |   4 +-
 lmdeploy/turbomind/turbomind.py               |   5 +-
 lmdeploy/vl/model/builder.py                  |   2 -
 lmdeploy/vl/model/mllama.py                   |  67 ---
 lmdeploy/vl/model/qwen.py                     | 138 ------
 tests/pytorch/test_removed_models.py          |  21 +
 tests/test_lmdeploy/test_messages.py          |   2 +-
 tests/test_lmdeploy/test_model.py             |  34 +-
 tests/test_lmdeploy/test_tokenizer.py         |   6 +-
 43 files changed, 130 insertions(+), 2554 deletions(-)
 delete mode 100644 lmdeploy/pytorch/models/baichuan.py
 delete mode 100644 lmdeploy/pytorch/models/internlm.py
 delete mode 100644 lmdeploy/pytorch/models/internlm2_ve.py
 delete mode 100644 lmdeploy/pytorch/models/internvl_patch.py
 delete mode 100644 lmdeploy/pytorch/models/qwen.py
 delete mode 100644 lmdeploy/pytorch/models/starcoder2.py
 delete mode 100644 lmdeploy/vl/model/mllama.py
 delete mode 100644 lmdeploy/vl/model/qwen.py
 create mode 100644 tests/pytorch/test_removed_models.py

diff --git a/docs/en/inference/load_hf.md b/docs/en/inference/load_hf.md
index db72bbfd4a..46dddab987 100644
--- a/docs/en/inference/load_hf.md
+++ b/docs/en/inference/load_hf.md
@@ -6,18 +6,18 @@ Starting from v0.1.0, Turbomind adds the ability to pre-process the model parame
 
 Currently, Turbomind support loading three types of model:
 
-1. A lmdeploy-quantized model hosted on huggingface.co, such as [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit), [internlm-chat-20b-4bit](https://huggingface.co/internlm/internlm-chat-20b-4bit), etc.
-2. Other LM models on huggingface.co like Qwen/Qwen-7B-Chat
+1. A lmdeploy-quantized model hosted on huggingface.co, such as [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit), etc.
+2. Other LM models on huggingface.co like Qwen/Qwen2.5-7B-Instruct
 
 ## Usage
 
 ### 1) A lmdeploy-quantized model
 
-For models quantized by `lmdeploy.lite` such as [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit), [internlm-chat-20b-4bit](https://huggingface.co/internlm/internlm-chat-20b-4bit), etc.
+For models quantized by `lmdeploy.lite`, such as [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit):
 
 ```
-repo_id=internlm/internlm-chat-20b-4bit
-model_name=internlm-chat-20b
+repo_id=lmdeploy/llama2-chat-70b-4bit
+model_name=llama2-chat-70b
 # or
 # repo_id=/path/to/downloaded_model
 
@@ -30,13 +30,13 @@ lmdeploy serve api_server $repo_id --model-name $model_name --tp 1
 
 ### 2) Other LM models
 
-For other LM models such as Qwen/Qwen-7B-Chat or baichuan-inc/Baichuan2-7B-Chat. LMDeploy supported models can be viewed through `lmdeploy list`.
+Other LM models, such as Qwen/Qwen2.5-7B-Instruct or internlm/internlm2-chat-7b, can be loaded in the same way. The models supported by LMDeploy can be listed with `lmdeploy list`.
 
 ```
-repo_id=Qwen/Qwen-7B-Chat
-model_name=qwen-7b
+repo_id=Qwen/Qwen2.5-7B-Instruct
+model_name=qwen2.5-7b
 # or
-# repo_id=/path/to/Qwen-7B-Chat/local_path
+# repo_id=/path/to/Qwen2.5-7B-Instruct/local_path
 
 # Inference by TurboMind
 lmdeploy chat $repo_id --model-name $model_name
diff --git a/docs/en/llm/api_server.md b/docs/en/llm/api_server.md
index 4af78d8600..4f70d4d6fe 100644
--- a/docs/en/llm/api_server.md
+++ b/docs/en/llm/api_server.md
@@ -187,7 +187,7 @@ curl http://{server_ip}:{server_port}/v1/models
 curl http://{server_ip}:{server_port}/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "internlm-chat-7b",
+    "model": "intern-s2-preview",
     "messages": [{"role": "user", "content": "Hello! How are you?"}]
   }'
 ```
diff --git a/docs/en/llm/api_server_anthropic.md b/docs/en/llm/api_server_anthropic.md
index a728ba0ac3..d8f58b4e58 100644
--- a/docs/en/llm/api_server_anthropic.md
+++ b/docs/en/llm/api_server_anthropic.md
@@ -29,7 +29,7 @@ curl http://{server_ip}:{server_port}/v1/messages \
   -H "content-type: application/json" \
   -H "anthropic-version: 2023-06-01" \
   -d '{
-    "model": "internlm-chat-7b",
+    "model": "intern-s2-preview",
     "max_tokens": 128,
     "messages": [{"role": "user", "content": "Hello from Anthropic client"}]
   }'
@@ -42,7 +42,7 @@ curl http://{server_ip}:{server_port}/v1/messages \
   -H "content-type: application/json" \
   -H "anthropic-version: 2023-06-01" \
   -d '{
-    "model": "internlm-chat-7b",
+    "model": "intern-s2-preview",
     "max_tokens": 128,
     "messages": [{"role": "user", "content": "Find lmdeploy docs"}],
     "tools": [{
@@ -78,7 +78,7 @@ curl http://{server_ip}:{server_port}/v1/messages/count_tokens \
   -H "content-type: application/json" \
   -H "anthropic-version: 2023-06-01" \
   -d '{
-    "model": "internlm-chat-7b",
+    "model": "intern-s2-preview",
     "system": "You are a helpful assistant.",
     "messages": [{"role": "user", "content": "Count these tokens"}]
   }'
diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md
index a592ba224e..896db2e5d9 100644
--- a/docs/en/multi_modal/internvl.md
+++ b/docs/en/multi_modal/internvl.md
@@ -9,7 +9,6 @@ LMDeploy supports the following InternVL series of models, which are detailed in
 |       InternVL2       |      4B       |          PyTorch           |
 |       InternVL2       | 1B-2B, 8B-76B |     TurboMind, PyTorch     |
 | InternVL2.5/2.5-MPO/3 |    1B-78B     |     TurboMind, PyTorch     |
-|     Mono-InternVL     |      2B       |          PyTorch           |
 
 The next chapter demonstrates how to deploy an InternVL model using LMDeploy, with [InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B) as an example.
 
diff --git a/docs/en/multi_modal/qwen2_vl.md b/docs/en/multi_modal/qwen2_vl.md
index fd9f02abaa..5c7ddae402 100644
--- a/docs/en/multi_modal/qwen2_vl.md
+++ b/docs/en/multi_modal/qwen2_vl.md
@@ -4,7 +4,6 @@ LMDeploy supports the following Qwen-VL series of models, which are detailed in
 
 |    Model     |  Size  | Supported Inference Engine |
 | :----------: | :----: | :------------------------: |
-| Qwen-VL-Chat |   -    |         TurboMind          |
 |   Qwen2-VL   | 2B, 7B |          PyTorch           |
 
 The next chapter demonstrates how to deploy an Qwen-VL model using LMDeploy, with [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example.
diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md
index 8222c08c63..3694328f24 100644
--- a/docs/en/supported_models/supported_models.md
+++ b/docs/en/supported_models/supported_models.md
@@ -11,7 +11,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 |              Llama3              |     8B, 70B      | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |             Llama3.1             |     8B, 70B      | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |     Llama3.2<sup>\[2\]</sup>     |      1B, 3B      | LLM  |    Yes    |  Yes\*  |  Yes\*  |  Yes  |
-|             InternLM             |     7B - 20B     | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |            InternLM2             |     7B - 20B     | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |           InternLM2.5            |        7B        | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |            InternLM3             |        8B        | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
@@ -19,7 +18,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 |      InternLM-XComposer2.5       |        7B        | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
 |            Intern-S1             |       241B       | MLLM |    Yes    |   Yes   |   Yes   |  No   |
 |          Intern-S1-mini          |       8.3B       | MLLM |    Yes    |   Yes   |   Yes   |  No   |
-|               Qwen               |    1.8B - 72B    | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |     Qwen1.5<sup>\[1\]</sup>      |   1.8B - 110B    | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |      Qwen2<sup>\[2\]</sup>       |    0.5B - 72B    | LLM  |    Yes    |  Yes\*  |  Yes\*  |  Yes  |
 |            Qwen2-MoE             |     57BA14B      | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
@@ -30,10 +28,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 |             Mixtral              |   8x7B, 8x22B    | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |           DeepSeek-V2            |    16B, 236B     | LLM  |    Yes    |   Yes   |   Yes   |  No   |
 |          DeepSeek-V2.5           |       236B       | LLM  |    Yes    |   Yes   |   Yes   |  No   |
-|             Qwen-VL              |        7B        | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
 |           DeepSeek-VL            |        7B        | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
-|             Baichuan             |        7B        | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
-|            Baichuan2             |        7B        | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |            Code Llama            |     7B - 34B     | LLM  |    Yes    |   Yes   |   Yes   |  No   |
 |                YI                |     6B - 34B     | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |          LLaVA(1.5,1.6)          |     7B - 34B     | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
@@ -69,7 +64,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 |            Llama3.1            |     8B, 70B     | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |            Llama3.2            |     1B, 3B      | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |             Llama4             | Scout, Maverick | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
-|            InternLM            |    7B - 20B     | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |           InternLM2            |    7B - 20B     | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |          InternLM2.5           |       7B        | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |           InternLM3            |       8B        | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
@@ -77,13 +71,10 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 |         Intern-S1-mini         |      8.3B       | MLLM |    Yes    |   Yes   |   Yes   | Yes  |   -   |
 |         Intern-S1-Pro          |       1TB       | MLLM |    Yes    |    -    |    -    |  -   |  No   |
 |       Intern-S2-Preview        |     35B-A3B     | MLLM |    Yes    |   No    |   No    |  No  |  No   |
-|           Baichuan2            |       7B        | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  No   |
-|           Baichuan2            |       13B       | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
 |            ChatGLM2            |       6B        | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
 |               YI               |    6B - 34B     | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |            Mistral             |       7B        | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |            Mixtral             |   8x7B, 8x22B   | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
-|              QWen              |   1.8B - 72B    | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |            QWen1.5             |   0.5B - 110B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |          QWen1.5-MoE           |      A2.7B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
 |             QWen2              |   0.5B - 72B    | LLM  |    Yes    |   Yes   |   No    | Yes  |  Yes  |
@@ -104,19 +95,17 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 |            MiniCPM3            |       4B        | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
 |         MiniCPM-V-2_6          |       8B        | LLM  |    Yes    |   No    |   No    |  No  |  Yes  |
 |             Gemma              |      2B-7B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
-|           StarCoder2           |     3B-15B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
 |           Phi-3-mini           |      3.8B       | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |          Phi-3-vision          |      4.2B       | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |           Phi-4-mini           |      3.8B       | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |          CogVLM-Chat           |       17B       | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |          CogVLM2-Chat          |       19B       | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
-| LLaVA(1.5,1.6)<sup>\[2\]</sup> |     7B-34B      | MLLM |    No     |   No    |   No    |  No  |  No   |
+| LLaVA(1.5,1.6)<sup>\[1\]</sup> |     7B-34B      | MLLM |    No     |   No    |   No    |  No  |  No   |
 |         InternVL(v1.5)         |     2B-26B      | MLLM |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
 |           InternVL2            |     1B-76B      | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |        InternVL2.5(MPO)        |     1B-78B      | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |           InternVL3            |     1B-78B      | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |          InternVL3.5           |   1B-241BA28B   | MLLM |    Yes    |   Yes   |   Yes   |  No  |  No   |
-| Mono-InternVL<sup>\[1\]</sup>  |       2B        | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |            ChemVLM             |     8B-26B      | MLLM |    Yes    |   Yes   |   No    |  -   |   -   |
 |             Gemma2             |     9B-27B      | LLM  |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |             Gemma3             |     1B-27B      | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
@@ -135,8 +124,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 |             GLM-5              |      754B       | LLM  |    Yes    |   No    |   No    |  No  |  No   |
 
 ```{note}
-* [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
-* [2] PyTorch engine removes the support of original llava models after v0.6.4. Please use their corresponding transformers models instead, which can be found in https://huggingface.co/llava-hf
+* [1] The PyTorch engine removed support for the original llava models after v0.6.4. Please use the corresponding transformers models instead, which can be found at https://huggingface.co/llava-hf
 Starting from version 0.11.1, PytorchEngine no longer provides support for mllama.
 ```
 
diff --git a/docs/zh_cn/inference/load_hf.md b/docs/zh_cn/inference/load_hf.md
index c538bf564f..b448208971 100644
--- a/docs/zh_cn/inference/load_hf.md
+++ b/docs/zh_cn/inference/load_hf.md
@@ -6,18 +6,18 @@
 
 目前，TurboMind 支持加载三种类型的模型：
 
-1. 在 huggingface.co 上面通过 lmdeploy 量化的模型，如 [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit), [internlm-chat-20b-4bit](https://huggingface.co/internlm/internlm-chat-20b-4bit)
-2. huggingface.co 上面其他 LM 模型，如Qwen/Qwen-7B-Chat
+1. 在 huggingface.co 上面通过 lmdeploy 量化的模型，如 [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit)
+2. huggingface.co 上面其他 LM 模型，如 Qwen/Qwen2.5-7B-Instruct
 
 ## 使用方式
 
 ### 1) 通过 lmdeploy 量化的模型
 
-对于通过 `lmdeploy.lite` 量化的模型，TurboMind 可以直接加载，比如 [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit), [internlm-chat-20b-4bit](https://huggingface.co/internlm/internlm-chat-20b-4bit).
+对于通过 `lmdeploy.lite` 量化的模型，TurboMind 可以直接加载，比如 [llama2-70b-4bit](https://huggingface.co/lmdeploy/llama2-chat-70b-4bit).
 
 ```
-repo_id=internlm/internlm-chat-20b-4bit
-model_name=internlm-chat-20b
+repo_id=lmdeploy/llama2-chat-70b-4bit
+model_name=llama2-chat-70b
 
 # or
 # repo_id=/path/to/downloaded_model
@@ -32,13 +32,13 @@ lmdeploy serve api_server $repo_id --model-name $model_name --tp 1
 
 ### 2) 其他的 LM 模型
 
-其他 LM 模型比如 Qwen/Qwen-7B-Chat, baichuan-inc/Baichuan2-7B-Chat。LMDeploy 模型支持情况可通过 `lmdeploy list` 查看。
+其他 LM 模型比如 Qwen/Qwen2.5-7B-Instruct, internlm/internlm2-chat-7b。LMDeploy 模型支持情况可通过 `lmdeploy list` 查看。
 
 ```
-repo_id=Qwen/Qwen-7B-Chat
-model_name=qwen-7b
+repo_id=Qwen/Qwen2.5-7B-Instruct
+model_name=qwen2.5-7b
 # or
-# repo_id=/path/to/Qwen-7B-Chat/local_path
+# repo_id=/path/to/Qwen2.5-7B-Instruct/local_path
 
 # Inference by TurboMind
 lmdeploy chat $repo_id --model-name $model_name
diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md
index 1dd26db80b..a8f8cd0cce 100644
--- a/docs/zh_cn/llm/api_server.md
+++ b/docs/zh_cn/llm/api_server.md
@@ -200,7 +200,7 @@ curl http://{server_ip}:{server_port}/v1/models
 curl http://{server_ip}:{server_port}/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "internlm-chat-7b",
+    "model": "qwen2.5-7b",
     "messages": [{"role": "user", "content": "Hello! How are you?"}]
   }'
 ```
diff --git a/docs/zh_cn/llm/api_server_anthropic.md b/docs/zh_cn/llm/api_server_anthropic.md
index 5a2e605799..c4451bf7bf 100644
--- a/docs/zh_cn/llm/api_server_anthropic.md
+++ b/docs/zh_cn/llm/api_server_anthropic.md
@@ -29,7 +29,7 @@ curl http://{server_ip}:{server_port}/v1/messages \
   -H "content-type: application/json" \
   -H "anthropic-version: 2023-06-01" \
   -d '{
-    "model": "internlm-chat-7b",
+    "model": "qwen2.5-7b",
     "max_tokens": 128,
     "messages": [{"role": "user", "content": "Hello from Anthropic client"}]
   }'
@@ -42,7 +42,7 @@ curl http://{server_ip}:{server_port}/v1/messages \
   -H "content-type: application/json" \
   -H "anthropic-version: 2023-06-01" \
   -d '{
-    "model": "internlm-chat-7b",
+    "model": "qwen2.5-7b",
     "max_tokens": 128,
     "messages": [{"role": "user", "content": "Find lmdeploy docs"}],
     "tools": [{
@@ -78,7 +78,7 @@ curl http://{server_ip}:{server_port}/v1/messages/count_tokens \
   -H "content-type: application/json" \
   -H "anthropic-version: 2023-06-01" \
   -d '{
-    "model": "internlm-chat-7b",
+    "model": "qwen2.5-7b",
     "system": "You are a helpful assistant.",
     "messages": [{"role": "user", "content": "Count these tokens"}]
   }'
diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md
index 3207550d82..6af6f45b0f 100644
--- a/docs/zh_cn/multi_modal/internvl.md
+++ b/docs/zh_cn/multi_modal/internvl.md
@@ -9,7 +9,6 @@ LMDeploy 支持 InternVL 系列模型，具体如下：
 |       InternVL2       |      4B       |          PyTorch           |
 |       InternVL2       | 1B-2B, 8B-76B |     TurboMind, PyTorch     |
 | InternVL2.5/2.5-MPO/3 |    1B-78B     |     TurboMind, PyTorch     |
-|     Mono-InternVL     |      2B       |          PyTorch           |
 
 本文将以[InternVL2-8B](https://huggingface.co/OpenGVLab/InternVL2-8B)为例，演示使用 LMDeploy 部署 InternVL 系列模型的方法。
 
diff --git a/docs/zh_cn/multi_modal/qwen2_vl.md b/docs/zh_cn/multi_modal/qwen2_vl.md
index 7cb7efe93b..bcd7982192 100644
--- a/docs/zh_cn/multi_modal/qwen2_vl.md
+++ b/docs/zh_cn/multi_modal/qwen2_vl.md
@@ -4,7 +4,6 @@ LMDeploy 支持 Qwen-VL 系列模型，具体如下：
 
 |    Model     |  Size  | Supported Inference Engine |
 | :----------: | :----: | :------------------------: |
-| Qwen-VL-Chat |   -    |         TurboMind          |
 |   Qwen2-VL   | 2B, 7B |          PyTorch           |
 
 本文将以[Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)为例，演示使用 LMDeploy 部署 Qwen2-VL 系列模型的方法
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md
index 8f889c050b..5e29dfd0b8 100644
--- a/docs/zh_cn/supported_models/supported_models.md
+++ b/docs/zh_cn/supported_models/supported_models.md
@@ -11,7 +11,6 @@
 |              Llama3              |    8B, 70B     | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |             Llama3.1             |    8B, 70B     | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |     Llama3.2<sup>\[2\]</sup>     |     1B, 3B     | LLM  |    Yes    |  Yes\*  |  Yes\*  |  Yes  |
-|             InternLM             |    7B - 20B    | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |            InternLM2             |    7B - 20B    | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |           InternLM2.5            |       7B       | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |            InternLM3             |       8B       | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
@@ -20,7 +19,6 @@
 |            Intern-S1             |      241B      | MLLM |    Yes    |   Yes   |   Yes   |  No   |
 |          Intern-S1-mini          |      8.3B      | MLLM |    Yes    |   Yes   |   Yes   |  No   |
 |          Intern-S1-Pro           |      1TB       | MLLM |    Yes    |    -    |    -    |  No   |
-|               Qwen               |   1.8B - 72B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |     Qwen1.5<sup>\[1\]</sup>      |  1.8B - 110B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |      Qwen2<sup>\[2\]</sup>       |   0.5B - 72B   | LLM  |    Yes    |  Yes\*  |  Yes\*  |  Yes  |
 |            Qwen2-MoE             |    57BA14B     | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
@@ -31,10 +29,7 @@
 |             Mixtral              |  8x7B, 8x22B   | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |           DeepSeek-V2            |   16B, 236B    | LLM  |    Yes    |   Yes   |   Yes   |  No   |
 |          DeepSeek-V2.5           |      236B      | LLM  |    Yes    |   Yes   |   Yes   |  No   |
-|             Qwen-VL              |       7B       | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
 |           DeepSeek-VL            |       7B       | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
-|             Baichuan             |       7B       | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
-|            Baichuan2             |       7B       | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |            Code Llama            |    7B - 34B    | LLM  |    Yes    |   Yes   |   Yes   |  No   |
 |                YI                |    6B - 34B    | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |          LLaVA(1.5,1.6)          |    7B - 34B    | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
@@ -70,7 +65,6 @@
 |            Llama3.1            |     8B, 70B     | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |            Llama3.2            |     1B, 3B      | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |             Llama4             | Scout, Maverick | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
-|            InternLM            |    7B - 20B     | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |           InternLM2            |    7B - 20B     | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |          InternLM2.5           |       7B        | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |           InternLM3            |       8B        | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
@@ -78,13 +72,10 @@
 |         Intern-S1-mini         |      8.3B       | MLLM |    Yes    |   Yes   |   Yes   | Yes  |   -   |
 |         Intern-S1-Pro          |       1TB       | MLLM |    Yes    |    -    |    -    |  -   |  No   |
 |       Intern-S2-Preview        |     35B-A3B     | MLLM |    Yes    |   No    |   No    |  No  |  No   |
-|           Baichuan2            |       7B        | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  No   |
-|           Baichuan2            |       13B       | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
 |            ChatGLM2            |       6B        | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
 |               YI               |    6B - 34B     | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |            Mistral             |       7B        | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |            Mixtral             |   8x7B, 8x22B   | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
-|              QWen              |   1.8B - 72B    | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |            QWen1.5             |   0.5B - 110B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |          QWen1.5-MoE           |      A2.7B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
 |             QWen2              |   0.5B - 72B    | LLM  |    Yes    |   Yes   |   No    | Yes  |  Yes  |
@@ -105,19 +96,17 @@
 |            MiniCPM3            |       4B        | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
 |         MiniCPM-V-2_6          |       8B        | LLM  |    Yes    |   No    |   No    |  No  |  Yes  |
 |             Gemma              |      2B-7B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
-|           StarCoder2           |     3B-15B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
 |           Phi-3-mini           |      3.8B       | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |          Phi-3-vision          |      4.2B       | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |           Phi-4-mini           |      3.8B       | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
 |          CogVLM-Chat           |       17B       | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |          CogVLM2-Chat          |       19B       | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
-| LLaVA(1.5,1.6)<sup>\[2\]</sup> |     7B-34B      | MLLM |    No     |   No    |   No    |  No  |  No   |
+| LLaVA(1.5,1.6)<sup>\[1\]</sup> |     7B-34B      | MLLM |    No     |   No    |   No    |  No  |  No   |
 |         InternVL(v1.5)         |     2B-26B      | MLLM |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
 |           InternVL2            |     1B-76B      | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |        InternVL2.5(MPO)        |     1B-78B      | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |           InternVL3            |     1B-78B      | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |          InternVL3.5           |   1B-241BA28B   | MLLM |    Yes    |   Yes   |   Yes   |  No  |  No   |
-| Mono-InternVL<sup>\[1\]</sup>  |       2B        | MLLM |   Yes\*   |   Yes   |   Yes   |  -   |   -   |
 |            ChemVLM             |     8B-26B      | MLLM |    Yes    |   Yes   |   No    |  -   |   -   |
 |             Gemma2             |     9B-27B      | LLM  |    Yes    |   Yes   |   Yes   |  -   |   -   |
 |             Gemma3             |     1B-27B      | MLLM |    Yes    |   Yes   |   Yes   |  -   |   -   |
@@ -136,8 +125,7 @@
 |              SDAR              |    1.7B-30B     | LLM  |    Yes    |   Yes   |   No    |  -   |   -   |
 
 ```{note}
-* [1] 目前，Mono-InternVL不支持FP16，因为数值不稳定。请改用BF16
-* [2] 自 0.6.4 之后，PyTorch 引擎移除了对 llava 模型原始格式的支持。我们建议使用它们对应的 transformers 格式的模型。这些模型可以在 https://huggingface.co/llava-hf 中找到
+* [1] 自 0.6.4 之后，PyTorch 引擎移除了对 llava 模型原始格式的支持。我们建议使用它们对应的 transformers 格式的模型。这些模型可以在 https://huggingface.co/llava-hf 中找到
 自 0.11.1 起，PytorchEngine 移除了 mllama 的支持
 ```
 
diff --git a/lmdeploy/api.py b/lmdeploy/api.py
index d674166ddf..6db813b9cc 100644
--- a/lmdeploy/api.py
+++ b/lmdeploy/api.py
@@ -30,11 +30,10 @@ def pipeline(model_path: str,
               ii) and iii).
             - ii) The model_id of a lmdeploy-quantized model hosted
               inside a model repo on huggingface.co, such as
-              ``InternLM/internlm-chat-20b-4bit``,
               ``lmdeploy/llama2-chat-70b-4bit``, etc.
             - iii) The model_id of a model hosted inside a model repo
-              on huggingface.co, such as ``internlm/internlm-chat-7b``,
-              ``Qwen/Qwen-7B-Chat``, ``baichuan-inc/Baichuan2-7B-Chat``
+              on huggingface.co, such as ``internlm/internlm2-chat-7b``,
+              ``Qwen/Qwen2.5-7B-Instruct``
               and so on.
         backend_config: backend config instance. Default to None.
         chat_template_config: chat template configuration. Default to None.
@@ -55,7 +54,7 @@ def pipeline(model_path: str,
 
             # LLM
             import lmdeploy
-            pipe = lmdeploy.pipeline('internlm/internlm-chat-7b')
+            pipe = lmdeploy.pipeline('internlm/internlm2-chat-7b')
             response = pipe(['hi','say this is a test'])
             print(response)
 
diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py
index b99e834228..9545ba2d68 100644
--- a/lmdeploy/archs.py
+++ b/lmdeploy/archs.py
@@ -20,11 +20,10 @@ def autoget_backend(model_path: str, trust_remote_code: bool = False):
                     ii) and iii).
                 - ii) The model_id of a lmdeploy-quantized model hosted
                     inside a model repo on huggingface.co, such as
-                    "InternLM/internlm-chat-20b-4bit",
                     "lmdeploy/llama2-chat-70b-4bit", etc.
                 - iii) The model_id of a model hosted inside a model repo
-                    on huggingface.co, such as "internlm/internlm-chat-7b",
-                    "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
+                    on huggingface.co, such as "internlm/internlm2-chat-7b",
+                    "Qwen/Qwen2.5-7B-Instruct"
                     and so on.
 
     Returns:
@@ -110,16 +109,14 @@ def check_vl_llm(backend: str, config: dict) -> bool:
         'InternVLChatModel', 'MiniCPMV', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration',
         'Phi3VForCausalLM', 'Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration',
         'Qwen3VLForConditionalGeneration', 'Qwen3VLMoeForConditionalGeneration', 'Qwen3_5ForConditionalGeneration',
-        'Qwen3_5MoeForConditionalGeneration', 'Qwen3OmniMoeForConditionalGeneration', 'MllamaForConditionalGeneration',
-        'MolmoForCausalLM', 'Gemma3ForConditionalGeneration', 'Llama4ForConditionalGeneration',
-        'InternVLForConditionalGeneration', 'InternS1ForConditionalGeneration', 'InternS1ProForConditionalGeneration',
+        'Qwen3_5MoeForConditionalGeneration', 'Qwen3OmniMoeForConditionalGeneration', 'MolmoForCausalLM',
+        'Gemma3ForConditionalGeneration', 'Llama4ForConditionalGeneration', 'InternVLForConditionalGeneration',
+        'InternS1ForConditionalGeneration', 'InternS1ProForConditionalGeneration',
         'InternS1_1_ForConditionalGeneration', 'Glm4vForConditionalGeneration',
         'InternS2PreviewForConditionalGeneration', 'InternS2PreviewForCausalLM',
     ])
     turbomind_unsupported_archs = []
-    if arch == 'QWenLMHeadModel' and 'visual' in config:
-        return True
-    elif arch == 'MultiModalityCausalLM' and 'language_config' in config:
+    if arch == 'MultiModalityCausalLM' and 'language_config' in config:
         return True
     elif arch in ['ChatGLMModel', 'ChatGLMForConditionalGeneration'] and 'vision_config' in config:
         return True
diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py
index 26c412bffb..8e6c97dcd6 100644
--- a/lmdeploy/cli/cli.py
+++ b/lmdeploy/cli/cli.py
@@ -34,11 +34,11 @@ def add_parser_chat():
                             ' which is converted by `lmdeploy convert` command or '
                             'download from ii) and iii). - ii) the model_id of a '
                             'lmdeploy-quantized model hosted inside a model repo on '
-                            'huggingface.co, such as "internlm/internlm-chat-20b-4bit",'
+                            'huggingface.co, such as'
                             ' "lmdeploy/llama2-chat-70b-4bit", etc. - iii) the model_id'
                             ' of a model hosted inside a model repo on huggingface.co,'
-                            ' such as "internlm/internlm-chat-7b", "qwen/qwen-7b-chat "'
-                            ', "baichuan-inc/baichuan2-7b-chat" and so on')
+                            ' such as "internlm/internlm2-chat-7b", "qwen/qwen2.5-7b-instruct"'
+                            ' and so on')
         # common args
         ArgumentHelper.backend(parser)
         # chat template args
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index 01ac1d44f1..a6097fe280 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -39,11 +39,11 @@ def add_parser_api_server():
                             ' which is converted by `lmdeploy convert` command or '
                             'download from ii) and iii). - ii) the model_id of a '
                             'lmdeploy-quantized model hosted inside a model repo on '
-                            'huggingface.co, such as "internlm/internlm-chat-20b-4bit",'
+                            'huggingface.co, such as'
                             ' "lmdeploy/llama2-chat-70b-4bit", etc. - iii) the model_id'
                             ' of a model hosted inside a model repo on huggingface.co,'
-                            ' such as "internlm/internlm-chat-7b", "qwen/qwen-7b-chat "'
-                            ', "baichuan-inc/baichuan2-7b-chat" and so on')
+                            ' such as "internlm/internlm2-chat-7b", "qwen/qwen2.5-7b-instruct"'
+                            ' and so on')
         parser.add_argument('--server-name', type=str, default='0.0.0.0', help='Host ip for serving')
         parser.add_argument('--server-port', type=int, default=23333, help='Server port')
         parser.add_argument('--allow-origins',
diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py
index 8d873c5081..59e04db2fe 100644
--- a/lmdeploy/lite/apis/calibrate.py
+++ b/lmdeploy/lite/apis/calibrate.py
@@ -12,17 +12,13 @@
 from lmdeploy.vl.model.builder import load_vl_model
 
 LAYER_TYPE_MAP = {
-    'InternLMForCausalLM': 'InternLMDecoderLayer',
     'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
     'InternLM3ForCausalLM': 'InternLM3DecoderLayer',
-    'QWenLMHeadModel': 'QWenBlock',
     'Qwen2ForCausalLM': 'Qwen2DecoderLayer',
     'Qwen3ForCausalLM': 'Qwen3DecoderLayer',
     'Qwen3MoeForCausalLM': 'Qwen3MoeDecoderLayer',
     'Qwen3_5ForConditionalGeneration': 'Qwen3_5DecoderLayer',
     'Qwen3_5MoeForConditionalGeneration': 'Qwen3_5MoeDecoderLayer',
-    'BaiChuanForCausalLM': 'DecoderLayer',  # Baichuan 7B
-    'BaichuanForCausalLM': 'DecoderLayer',  # Baichuan2 7B
     'LlamaForCausalLM': 'LlamaDecoderLayer',
     'LlavaLlamaForCausalLM': 'LlamaDecoderLayer',
     'MGMLlamaForCausalLM': 'LlamaDecoderLayer',  # mini gemini
@@ -37,17 +33,13 @@
 }
 
 NORM_TYPE_MAP = {
-    'InternLMForCausalLM': 'InternLMRMSNorm',
     'InternLM2ForCausalLM': 'InternLM2RMSNorm',
     'InternLM3ForCausalLM': 'InternLM3RMSNorm',
-    'QWenLMHeadModel': 'RMSNorm',
     'Qwen2ForCausalLM': 'Qwen2RMSNorm',
     'Qwen3ForCausalLM': 'Qwen3RMSNorm',
     'Qwen3MoeForCausalLM': 'Qwen3MoeRMSNorm',
     'Qwen3_5ForConditionalGeneration': 'Qwen3_5RMSNorm',
     'Qwen3_5MoeForConditionalGeneration': 'Qwen3_5MoeRMSNorm',
-    'BaiChuanForCausalLM': 'RMSNorm',  # Baichuan 7B
-    'BaichuanForCausalLM': 'RMSNorm',  # Baichuan2 7B
     'LlamaForCausalLM': 'LlamaRMSNorm',
     'LlavaLlamaForCausalLM': 'LlamaRMSNorm',
     'MGMLlamaForCausalLM': 'LlamaRMSNorm',  # mini gemini
@@ -62,17 +54,13 @@
 }
 
 HEAD_NAME_MAP = {
-    'InternLMForCausalLM': 'lm_head',
     'InternLM2ForCausalLM': 'output',
     'InternLM3ForCausalLM': 'output',
-    'QWenLMHeadModel': 'lm_head',
     'Qwen2ForCausalLM': 'lm_head',
     'Qwen3ForCausalLM': 'lm_head',
     'Qwen3MoeForCausalLM': 'lm_head',
     'Qwen3_5ForConditionalGeneration': 'lm_head',
     'Qwen3_5MoeForConditionalGeneration': 'lm_head',
-    'BaiChuanForCausalLM': 'lm_head',  # Baichuan 7B
-    'BaichuanForCausalLM': 'lm_head',  # Baichuan2 7B
     'LlamaForCausalLM': 'lm_head',
     'LlavaLlamaForCausalLM': 'lm_head',
     'MGMLlamaForCausalLM': 'lm_head',  # mini gemini
@@ -104,15 +92,13 @@ def check_vl_llm(backend: str, config: dict) -> bool:
         'InternVLChatModel', 'MiniCPMV', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration',
         'Phi3VForCausalLM', 'Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration',
         'Qwen3VLForConditionalGeneration', 'Qwen3VLMoeForConditionalGeneration', 'Qwen3_5ForConditionalGeneration',
-        'Qwen3_5MoeForConditionalGeneration', 'MllamaForConditionalGeneration', 'MolmoForCausalLM',
-        'Gemma3ForConditionalGeneration', 'Llama4ForConditionalGeneration', 'InternVLForConditionalGeneration',
-        'InternS1ForConditionalGeneration', 'InternS1ProForConditionalGeneration',
+        'Qwen3_5MoeForConditionalGeneration', 'MolmoForCausalLM', 'Gemma3ForConditionalGeneration',
+        'Llama4ForConditionalGeneration', 'InternVLForConditionalGeneration', 'InternS1ForConditionalGeneration',
+        'InternS1ProForConditionalGeneration',
         'InternS1_1_ForConditionalGeneration', 'Glm4vForConditionalGeneration',
         'InternS2PreviewForConditionalGeneration'
     ])
-    if arch == 'QWenLMHeadModel' and 'visual' in config:
-        return True
-    elif arch == 'MultiModalityCausalLM' and 'language_config' in config:
+    if arch == 'MultiModalityCausalLM' and 'language_config' in config:
         return True
     elif arch in ['ChatGLMModel', 'ChatGLMForConditionalGeneration'] and 'vision_config' in config:
         return True
@@ -355,14 +341,6 @@ def calibrate(model: str,
     if model_type in ['MixtralForCausalLM']:
         update_moe_mapping(model, model_type)
 
-    if model_type == 'QWenLMHeadModel':
-        try:
-            import flash_attn  # noqa: F401
-        except ImportError:
-            raise RuntimeError('When using Qwen, you need to `pip install flash-attn` first, '
-                               'otherwise calibration and quantification will not work '
-                               'properly.')
-
     layer_type = LAYER_TYPE_MAP[type(model).__name__]
     norm_type = NORM_TYPE_MAP[type(model).__name__]
 
diff --git a/lmdeploy/lite/apis/smooth_quant.py b/lmdeploy/lite/apis/smooth_quant.py
index a7d4365044..f6beec1beb 100644
--- a/lmdeploy/lite/apis/smooth_quant.py
+++ b/lmdeploy/lite/apis/smooth_quant.py
@@ -70,14 +70,6 @@ def smooth_quant(model: str,
                            f'not supported. The supported model types are '
                            f"{', '.join(LAYER_TYPE_MAP.keys())}.")
 
-    if model_type == 'QWenLMHeadModel':
-        try:
-            import flash_attn  # noqa: F401
-        except ImportError:
-            raise RuntimeError('When using Qwen, you need to `pip install flash-attn` first, '
-                               'otherwise calibration and quantification will not work '
-                               'properly.')
-
     layer_type = LAYER_TYPE_MAP[type(model).__name__]
     norm_type = NORM_TYPE_MAP[type(model).__name__]
     fc2fcs = FC_FCS_MAP[layer_type]
diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py
index bdff1eb1b7..a895b11e0c 100644
--- a/lmdeploy/lite/quantization/calibration.py
+++ b/lmdeploy/lite/quantization/calibration.py
@@ -226,7 +226,7 @@ def export(self, out_dir):
     def calibrate(self, data):
         """Forward pass through the model in inference mode with given data."""
 
-        if type(self.model).__name__ in ('QWenLMHeadModel', 'ChatGLMForConditionalGeneration'):
+        if type(self.model).__name__ == 'ChatGLMForConditionalGeneration':
             model = self.model.transformer
         else:
             model = self.model.model
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index d2394fec4c..07074e119a 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -310,69 +310,6 @@ def match(cls, model_path: str, **kwargs) -> str | None:
             return 'llava-v1'
 
 
-@MODELS.register_module(name='internlm')
-class InternLMChat7B(BaseChatTemplate):
-    """Chat template of InternLM model."""
-
-    def __init__(
-            self,
-            system='<|System|>:',
-            meta_instruction="""You are an AI assistant whose name is InternLM (书生·浦语).
-- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
-- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
-""",  # noqa: E501
-            eosys='\n',
-            user='<|User|>:',
-            eoh='\n',
-            assistant='<|Bot|>:',
-            eoa='<eoa>',
-            separator='\n',
-            stop_words=['<eoa>'],
-            **kwargs):
-        super().__init__(system=system,
-                         meta_instruction=meta_instruction,
-                         eosys=eosys,
-                         user=user,
-                         eoh=eoh,
-                         assistant=assistant,
-                         eoa=eoa,
-                         separator=separator,
-                         stop_words=stop_words,
-                         **kwargs)
-
-    @classmethod
-    def match(cls, model_path: str, **kwargs) -> str | None:
-        """Return the model_name that was registered to MODELS.
-
-        Args:
-            model_path (str): the model path used for matching.
-        """
-        path = model_path.lower()
-        if all([c not in path for c in ['internlm3', 'internlm2', '8k']]) and \
-                all([c in path for c in ['internlm', 'chat']]):
-            return 'internlm'
-
-
-@MODELS.register_module(name='baichuan2')
-class Baichuan2(BaseChatTemplate):
-    """Chat template and generation parameters of Baichuan2-7B-Base and
-    Baichuan2-7B-Chat models."""
-
-    def __init__(self, user='<reserved_106>', assistant='<reserved_107>', **kwargs):
-        super().__init__(user=user, assistant=assistant, **kwargs)
-
-    @classmethod
-    def match(cls, model_path: str, **kwargs) -> str | None:
-        """Return the model_name that was registered to MODELS.
-
-        Args:
-            model_path (str): the model path used for matching.
-        """
-        path = model_path.lower()
-        if 'baichuan2' in path and 'chat' in path:
-            return 'baichuan2'
-
-
 @MODELS.register_module(name='llama2')
 class Llama2(BaseChatTemplate):
     """Chat template of LLaMA2 model."""
diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
index 0489422a4b..e345c85190 100644
--- a/lmdeploy/pytorch/engine/engine.py
+++ b/lmdeploy/pytorch/engine/engine.py
@@ -230,11 +230,10 @@ def from_pretrained(cls,
                 It could be one of the following options:
                     - i) The model_id of a lmdeploy-quantized model hosted
                       inside a model repo on huggingface.co, such as
-                      "InternLM/internlm-chat-20b-4bit",
                       "lmdeploy/llama2-chat-70b-4bit", etc.
                     - ii) The model_id of a model hosted inside a model repo
-                      on huggingface.co, such as "InternLM/internlm-chat-7b",
-                      "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
+                      on huggingface.co, such as "internlm/internlm2-chat-7b",
+                      "Qwen/Qwen2.5-7B-Instruct"
                       and so on.
             engine_config (PytorchEngineConfig): Pytorch engine config.
             trust_remote_code (bool): Trust remote code
diff --git a/lmdeploy/pytorch/models/baichuan.py b/lmdeploy/pytorch/models/baichuan.py
deleted file mode 100644
index 5ef0aedf59..0000000000
--- a/lmdeploy/pytorch/models/baichuan.py
+++ /dev/null
@@ -1,414 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-from collections.abc import Iterable
-from typing import Any
-
-import torch
-from torch import nn
-
-from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
-from lmdeploy.pytorch.nn.linear import (
-    build_down_linear,
-    build_gateup_linear,
-    build_o_proj,
-    build_qkv_proj,
-    build_rowwise_linear,
-)
-from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
-
-from .utils.cudagraph import CudaGraphMixin
-
-
-def _is_baichuan_13b(config: Any):
-    """Is baichuan 13b."""
-    return config.num_hidden_layers == 40
-
-
-class BaichuanAttention(nn.Module):
-    """Rewrite module of Attention."""
-
-    def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        quantization_config = getattr(config, 'quantization_config', None)
-        num_heads = config.num_attention_heads
-        num_key_value_heads = num_heads
-        hidden_size = config.hidden_size
-        head_dim = hidden_size // num_heads
-        self.is_13b = _is_baichuan_13b(config)
-
-        # packed qkv
-        self.W_pack = build_qkv_proj(
-            hidden_size,
-            num_q_heads=num_heads,
-            num_kv_heads=num_key_value_heads,
-            head_size=head_dim,
-            bias=False,
-            quant_config=quantization_config,
-            dtype=dtype,
-            device=device,
-        )
-
-        # rotary embedding
-        self.apply_rotary_pos_emb = ApplyRotaryEmb()
-
-        # attention
-        self.attn_fwd = Attention(
-            num_heads,
-            head_dim,
-            num_kv_heads=num_key_value_heads,
-            v_head_size=head_dim,
-            alibi=self.is_13b,
-        )
-
-        # o_proj
-        self.o_proj = build_o_proj(num_heads * head_dim,
-                                   hidden_size,
-                                   bias=False,
-                                   quant_config=quantization_config,
-                                   dtype=dtype,
-                                   device=device,
-                                   is_tp=True)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: tuple[torch.Tensor] | None = None,
-        attn_metadata: Any = None,
-    ):
-        """Rewrite of LlamaAttention.forward."""
-        # qkv proj
-        qkv_states = self.W_pack(hidden_states)
-        # (-1, heads, head_dim)
-        qkv_states = qkv_states.flatten(0, -2)
-        query_states, key_states, value_states = self.W_pack.split_qkv(qkv_states)
-
-        # apply rotary embedding
-        if not self.is_13b:
-            cos, sin = rotary_pos_emb
-            query_states, key_states = self.apply_rotary_pos_emb(
-                query_states,
-                key_states,
-                cos,
-                sin,
-                inplace=True,
-            )
-
-        # attention
-        attn_output = self.attn_fwd(
-            query_states,
-            key_states,
-            value_states,
-            past_key_value[0],
-            past_key_value[1],
-            attn_metadata,
-            k_scales_zeros=None if len(past_key_value) == 2 else past_key_value[2],
-            v_scales_zeros=None if len(past_key_value) == 2 else past_key_value[3],
-            inplace=True,
-        )
-        attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1)
-
-        # o proj
-        attn_output = self.o_proj(attn_output)
-        return attn_output
-
-
-class MLP(nn.Module):
-
-    def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        quantization_config = getattr(config, 'quantization_config', None)
-        # gate up
-        self.gate_up_proj = build_gateup_linear(
-            config.hidden_size,
-            [config.intermediate_size, config.intermediate_size],
-            bias=False,
-            dtype=dtype,
-            device=device,
-            quant_config=quantization_config,
-            is_tp=True,
-        )
-
-        # silu and mul
-        self.act_fn = SiluAndMul(inplace=True)
-
-        # down
-        self.down_proj = build_down_linear(config.intermediate_size,
-                                           config.hidden_size,
-                                           bias=False,
-                                           quant_config=quantization_config,
-                                           dtype=dtype,
-                                           device=device,
-                                           is_tp=True)
-
-    def forward(self, x):
-        """forward."""
-        gate_up = self.gate_up_proj(x)
-        act = self.act_fn(gate_up)
-        return self.down_proj(act)
-
-
-class DecoderLayer(nn.Module):
-    """Baichuan decoder layer."""
-
-    def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        self.layer_idx = layer_idx
-        quantization_config = getattr(config, 'quantization_config', None)
-
-        # build attention layer
-        self.self_attn = BaichuanAttention(config, dtype=dtype, device=device)
-
-        # build MLP
-        self.mlp = MLP(config, dtype=dtype, device=device)
-
-        # build input layer norm
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       config.rms_norm_eps,
-                                       quant_config=quantization_config,
-                                       dtype=dtype,
-                                       device=device)
-
-        # build attention layer norm
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                config.rms_norm_eps,
-                                                quant_config=quantization_config,
-                                                dtype=dtype,
-                                                device=device)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: list[torch.FloatTensor] | None,
-        residual: torch.Tensor | None = None,
-        attn_metadata: Any = None,
-    ):
-        """forward."""
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
-        else:
-            hidden_states, residual = self.input_layernorm(hidden_states, residual)
-
-        # Self Attention
-        hidden_states = self.self_attn(
-            hidden_states=hidden_states,
-            rotary_pos_emb=rotary_pos_emb,
-            past_key_value=past_key_value,
-            attn_metadata=attn_metadata,
-        )
-
-        # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
-
-        outputs = (hidden_states, residual)
-        return outputs
-
-
-class BaichuanModel(nn.Module):
-    """Baichuan model."""
-
-    def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = nn.Embedding(config.vocab_size,
-                                         config.hidden_size,
-                                         self.padding_idx,
-                                         dtype=dtype,
-                                         device=device)
-
-        # build all decode layers
-        self.layers = nn.ModuleList([
-            DecoderLayer(config, layer_idx, dtype=dtype, device=device) for layer_idx in range(config.num_hidden_layers)
-        ])
-
-        # build norm
-        self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps, dtype=dtype, device=device)
-
-        self.is_13b = _is_baichuan_13b(config)
-        if not self.is_13b:
-            # build rotary embedding in LlamaModel
-            emb_type = RopeType.LinearScaling
-            rope_dim = config.hidden_size // config.num_attention_heads
-            rope_max_pos_emb = config.max_position_embeddings
-            rope_base = 10000
-            scaling_factor = 1.0
-            self.rotary_emb = build_rotary_embedding(
-                rope_dim,
-                rope_max_pos_emb,
-                rope_base,
-                scaling_factor,
-                emb_type=emb_type,
-            )
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: list[torch.FloatTensor] | None = None,
-        attn_metadata: Any = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-    ):
-        """forward."""
-
-        # token embedding
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        hidden_states = inputs_embeds
-
-        # rotary embedding
-        rotary_pos_emb = (None, None)
-        if not self.is_13b:
-            cos, sin = self.rotary_emb(hidden_states, position_ids)
-            cos, sin = cos[0], sin[0]
-            rotary_pos_emb = (cos, sin)
-
-        # decoding
-        residual = None
-        for idx, decoder_layer in enumerate(self.layers):
-            past_key_value = past_key_values[idx]
-            hidden_states, residual = decoder_layer(
-                hidden_states,
-                rotary_pos_emb=rotary_pos_emb,
-                past_key_value=past_key_value,
-                residual=residual,
-                attn_metadata=attn_metadata,
-            )
-
-        # norm
-        hidden_states, _ = self.norm(hidden_states, residual)
-
-        return hidden_states
-
-    def get_input_embeddings(self):
-        """Get input embeddings."""
-        return self.embed_tokens
-
-
-class BaichuanForCausalLM(nn.Module, CudaGraphMixin):
-    """Rewrote model of LlamaForCausalLM."""
-
-    packed_modules_mapping = {
-        'gate_up_proj': [
-            'gate_proj',
-            'up_proj',
-        ],
-    }
-
-    def __init__(self,
-                 config: Any,
-                 ctx_mgr: StepContextManager,
-                 dtype: torch.dtype = None,
-                 device: torch.device = None):
-        super().__init__()
-        self.config = config
-        self.ctx_mgr = ctx_mgr
-        # build BaichuanModel
-        self.model = BaichuanModel(config, dtype=dtype, device=device)
-        # build lm_head
-        self.lm_head = build_rowwise_linear(config.hidden_size,
-                                            config.vocab_size,
-                                            bias=False,
-                                            dtype=dtype,
-                                            device=device)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        past_key_values: list[list[torch.Tensor]],
-        attn_metadata: Any = None,
-        inputs_embeds: torch.Tensor = None,
-        **kwargs,
-    ):
-        """Model forward, return logits."""
-        hidden_states = self.model(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            attn_metadata=attn_metadata,
-            inputs_embeds=inputs_embeds,
-        )
-        return hidden_states
-
-    def get_logits(self, hidden_states: torch.Tensor):
-        """Compute logits of the model output."""
-        return self.lm_head(hidden_states)
-
-    def get_input_embeddings(self):
-        """Get input embeddings."""
-        return self.model.get_input_embeddings()
-
-    def prepare_inputs_for_generation(
-        self,
-        past_key_values: list[list[torch.Tensor]],
-        inputs_embeds: torch.Tensor | None = None,
-        context: StepContext = None,
-    ):
-        """Prepare input."""
-        # get input_ids, position_ids and attention metadatas
-        input_ids = context.input_ids
-        position_ids = context.position_ids
-        attn_metadata = context.attn_metadata
-
-        # process vision embeddings
-        vision_embeddings = context.input_embeddings
-        vision_embedding_indexing = context.input_embedding_indexing
-        if vision_embeddings is not None and len(vision_embeddings) > 0:
-            if inputs_embeds is None:
-                inputs_embeds = self.get_input_embeddings()(input_ids)
-            inputs_embeds[:, vision_embedding_indexing, :] = vision_embeddings.to(inputs_embeds)
-
-        # inputs of forward
-        return dict(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            attn_metadata=attn_metadata,
-            inputs_embeds=inputs_embeds,
-        )
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        """Load weights."""
-        # modify from vllm
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ('.gate_up_proj', '.gate_proj', 0),
-            ('.gate_up_proj', '.up_proj', 1),
-        ]
-
-        params_dict = dict(self.named_parameters())
-        for name, loaded_weight in weights:
-            if 'rotary_emb.inv_freq' in name:
-                continue
-            if ('rotary_emb.cos_cached' in name or 'rotary_emb.sin_cached' in name):
-                continue
-            if self.config.tie_word_embeddings and 'lm_head.weight' in name:
-                continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                param = params_dict[name]
-                load_weight(param, loaded_weight, shard_id=shard_id)
-                break
-            else:
-                if '.W_pack' in name:
-                    param = params_dict[name]
-                    q, k, v = param.weight_spliter(loaded_weight)
-                    load_weight(param, q, shard_id='q')
-                    load_weight(param, k, shard_id='k')
-                    load_weight(param, v, shard_id='v')
-                elif 'lm_head' in name:
-                    loaded_weight = nn.functional.normalize(loaded_weight)
-                    param = params_dict[name]
-                    load_weight(param, loaded_weight)
-                else:
-                    param = params_dict[name]
-                    load_weight(param, loaded_weight)
diff --git a/lmdeploy/pytorch/models/internlm.py b/lmdeploy/pytorch/models/internlm.py
deleted file mode 100644
index ad53d6970b..0000000000
--- a/lmdeploy/pytorch/models/internlm.py
+++ /dev/null
@@ -1,414 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-from collections.abc import Iterable
-from typing import Any
-
-import torch
-from torch import nn
-from transformers.configuration_utils import PretrainedConfig
-
-from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
-from lmdeploy.pytorch.nn.linear import (
-    build_down_linear,
-    build_gateup_linear,
-    build_o_proj,
-    build_qkv_proj,
-    build_rowwise_linear,
-)
-from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
-
-from .utils.cudagraph import CudaGraphMixin
-
-
-class InternLMAttention(nn.Module):
-    """Rewrite module of LlamaAttention."""
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        quantization_config = getattr(config, 'quantization_config', None)
-        num_heads = config.num_attention_heads
-        num_key_value_heads = config.num_key_value_heads
-        hidden_size = config.hidden_size
-        head_dim = getattr(config, 'head_dim', hidden_size // num_heads)
-        num_replicate_kv_heads = getattr(config, 'num_replicate_key_value_heads', 1)
-        # packed qkv
-        self.qkv_proj = build_qkv_proj(hidden_size,
-                                       num_q_heads=num_heads,
-                                       num_kv_heads=num_key_value_heads,
-                                       head_size=head_dim,
-                                       bias=config.bias,
-                                       quant_config=quantization_config,
-                                       dtype=dtype,
-                                       device=device,
-                                       num_replicate_kv_heads=num_replicate_kv_heads)
-
-        # rotary embedding
-        self.apply_rotary_pos_emb = ApplyRotaryEmb()
-
-        # attention
-        self.attn_fwd = Attention(
-            num_heads,
-            head_dim,
-            num_kv_heads=num_key_value_heads,
-            v_head_size=head_dim,
-        )
-
-        # o_proj
-        self.o_proj = build_o_proj(num_heads * head_dim,
-                                   hidden_size,
-                                   bias=config.bias,
-                                   quant_config=quantization_config,
-                                   dtype=dtype,
-                                   device=device,
-                                   is_tp=True)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: tuple[torch.Tensor] | None = None,
-        attn_metadata: Any = None,
-    ):
-        """Rewrite of LlamaAttention.forward."""
-        # qkv proj
-        qkv_states = self.qkv_proj(hidden_states)
-        # (-1, heads, head_dim)
-        qkv_states = qkv_states.flatten(0, -2)
-        query_states, key_states, value_states = self.qkv_proj.split_qkv(qkv_states)
-
-        # apply rotary embedding
-        cos, sin = rotary_pos_emb
-        query_states, key_states = self.apply_rotary_pos_emb(
-            query_states,
-            key_states,
-            cos,
-            sin,
-            inplace=True,
-        )
-
-        # attention
-        attn_output = self.attn_fwd(
-            query_states,
-            key_states,
-            value_states,
-            past_key_value[0],
-            past_key_value[1],
-            attn_metadata,
-            k_scales_zeros=None if len(past_key_value) == 2 else past_key_value[2],
-            v_scales_zeros=None if len(past_key_value) == 2 else past_key_value[3],
-            inplace=True,
-        )
-        attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1)
-
-        # o proj
-        attn_output = self.o_proj(attn_output)
-        return attn_output
-
-
-class InternLMMLP(nn.Module):
-    """mlp."""
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        quantization_config = getattr(config, 'quantization_config', None)
-        # gate up
-        self.gate_up_proj = build_gateup_linear(
-            config.hidden_size,
-            [config.intermediate_size, config.intermediate_size],
-            bias=config.bias,
-            dtype=dtype,
-            device=device,
-            quant_config=quantization_config,
-            is_tp=True,
-        )
-
-        # silu and mul
-        self.act_fn = SiluAndMul(inplace=True)
-
-        # down
-        self.down_proj = build_down_linear(config.intermediate_size,
-                                           config.hidden_size,
-                                           bias=config.bias,
-                                           quant_config=quantization_config,
-                                           dtype=dtype,
-                                           device=device,
-                                           is_tp=True)
-
-    def forward(self, x):
-        """forward."""
-        gate_up = self.gate_up_proj(x)
-        act = self.act_fn(gate_up)
-        return self.down_proj(act)
-
-
-class InternLMDecoderLayer(nn.Module):
-    """Decoder layer."""
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 layer_idx: int,
-                 dtype: torch.dtype = None,
-                 device: torch.device = None):
-        super().__init__()
-        self.layer_idx = layer_idx
-        quantization_config = getattr(config, 'quantization_config', None)
-
-        # build attention layer
-        self.self_attn = InternLMAttention(config, dtype=dtype, device=device)
-
-        # build MLP
-        self.mlp = InternLMMLP(config, dtype=dtype, device=device)
-
-        # build input layer norm
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       config.rms_norm_eps,
-                                       quant_config=quantization_config,
-                                       dtype=dtype,
-                                       device=device)
-
-        # build attention layer norm
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                config.rms_norm_eps,
-                                                quant_config=quantization_config,
-                                                dtype=dtype,
-                                                device=device)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: list[torch.FloatTensor] | None,
-        residual: torch.Tensor | None = None,
-        attn_metadata: Any = None,
-    ):
-
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
-        else:
-            hidden_states, residual = self.input_layernorm(hidden_states, residual)
-
-        # Self Attention
-        hidden_states = self.self_attn(
-            hidden_states=hidden_states,
-            rotary_pos_emb=rotary_pos_emb,
-            past_key_value=past_key_value,
-            attn_metadata=attn_metadata,
-        )
-
-        # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
-
-        outputs = (hidden_states, residual)
-        return outputs
-
-
-class InternLMModel(nn.Module):
-    """model."""
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = nn.Embedding(config.vocab_size,
-                                         config.hidden_size,
-                                         self.padding_idx,
-                                         dtype=dtype,
-                                         device=device)
-
-        # build all decode layers
-        self.layers = nn.ModuleList([
-            InternLMDecoderLayer(config, layer_idx, dtype=dtype, device=device)
-            for layer_idx in range(config.num_hidden_layers)
-        ])
-
-        # build norm
-        self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps, dtype=dtype, device=device)
-
-        # build rotary embedding in LlamaModel
-        rope_dim = config.hidden_size // config.num_attention_heads
-        rope_max_pos_emb = config.max_position_embeddings
-        scaling_factor = 1.0
-        rope_scaling = config.rotary
-        rope_base = rope_scaling['base']
-        rope_type = rope_scaling['type']
-        if rope_type == 'dynamic':
-            emb_type = RopeType.DynamicNTKScaling
-            scaling_factor = rope_scaling.get('scaling_factor', 1.0)
-        elif rope_type == 'origin':
-            emb_type = RopeType.LinearScaling
-        else:
-            raise RuntimeError(f'Unsupported rope type: {rope_type}')
-
-        self.rotary_emb = build_rotary_embedding(
-            rope_dim,
-            rope_max_pos_emb,
-            rope_base,
-            scaling_factor,
-            emb_type=emb_type,
-        )
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: list[torch.FloatTensor] | None = None,
-        attn_metadata: Any = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-    ):
-        """Rewrite of LlamaModel.forward."""
-
-        # token embedding
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        hidden_states = inputs_embeds
-
-        # rotary embedding
-        cos, sin = self.rotary_emb(hidden_states, position_ids)
-        cos, sin = cos[0], sin[0]
-        rotary_pos_emb = (cos, sin)
-
-        # decoding
-        residual = None
-        for idx, decoder_layer in enumerate(self.layers):
-            past_key_value = past_key_values[idx]
-            hidden_states, residual = decoder_layer(
-                hidden_states,
-                rotary_pos_emb=rotary_pos_emb,
-                past_key_value=past_key_value,
-                residual=residual,
-                attn_metadata=attn_metadata,
-            )
-
-        # norm
-        hidden_states, _ = self.norm(hidden_states, residual)
-
-        return hidden_states
-
-    def get_input_embeddings(self):
-        """Get input embeddings."""
-        return self.embed_tokens
-
-
-class InternLMForCausalLM(nn.Module, CudaGraphMixin):
-    """Rewrote model of LlamaForCausalLM."""
-
-    packed_modules_mapping = {
-        'qkv_proj': [
-            'q_proj',
-            'k_proj',
-            'v_proj',
-        ],
-        'gate_up_proj': [
-            'gate_proj',
-            'up_proj',
-        ],
-    }
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 ctx_mgr: StepContextManager,
-                 dtype: torch.dtype = None,
-                 device: torch.device = None):
-        super().__init__()
-        self.config = config
-        self.ctx_mgr = ctx_mgr
-        # build LLamaModel
-        self.model = InternLMModel(config, dtype=dtype, device=device)
-        # build lm_head
-        self.lm_head = build_rowwise_linear(config.hidden_size,
-                                            config.vocab_size,
-                                            bias=False,
-                                            dtype=dtype,
-                                            device=device)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        past_key_values: list[list[torch.Tensor]],
-        attn_metadata: Any = None,
-        inputs_embeds: torch.Tensor = None,
-        **kwargs,
-    ):
-        """Model forward, return logits."""
-        hidden_states = self.model(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            attn_metadata=attn_metadata,
-            inputs_embeds=inputs_embeds,
-        )
-        return hidden_states
-
-    def get_logits(self, hidden_states: torch.Tensor):
-        """Compute logits of the model output."""
-        return self.lm_head(hidden_states)
-
-    def get_input_embeddings(self):
-        """Get input embeddings."""
-        return self.model.get_input_embeddings()
-
-    def prepare_inputs_for_generation(
-        self,
-        past_key_values: list[list[torch.Tensor]],
-        inputs_embeds: torch.Tensor | None = None,
-        context: StepContext = None,
-    ):
-        """Prepare input."""
-        # get input_ids, position_ids and attention metadatas
-        input_ids = context.input_ids
-        position_ids = context.position_ids
-        attn_metadata = context.attn_metadata
-
-        # process vision embeddings
-        vision_embeddings = context.input_embeddings
-        vision_embedding_indexing = context.input_embedding_indexing
-        if vision_embeddings is not None and len(vision_embeddings) > 0:
-            if inputs_embeds is None:
-                inputs_embeds = self.get_input_embeddings()(input_ids)
-            inputs_embeds[:, vision_embedding_indexing, :] = vision_embeddings.to(inputs_embeds)
-
-        # inputs of forward
-        return dict(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            attn_metadata=attn_metadata,
-            inputs_embeds=inputs_embeds,
-        )
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        """Load weights."""
-        # modify from vllm
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ('.qkv_proj', '.q_proj', 'q'),
-            ('.qkv_proj', '.k_proj', 'k'),
-            ('.qkv_proj', '.v_proj', 'v'),
-            ('.gate_up_proj', '.gate_proj', 0),
-            ('.gate_up_proj', '.up_proj', 1),
-        ]
-
-        params_dict = dict(self.named_parameters())
-        for name, loaded_weight in weights:
-            if 'rotary_emb.inv_freq' in name:
-                continue
-            if ('rotary_emb.cos_cached' in name or 'rotary_emb.sin_cached' in name):
-                continue
-            if self.config.tie_word_embeddings and 'lm_head.weight' in name:
-                continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                param = params_dict[name]
-                load_weight(param, loaded_weight, shard_id=shard_id)
-                break
-            else:
-                param = params_dict[name]
-                load_weight(param, loaded_weight)
diff --git a/lmdeploy/pytorch/models/internlm2_ve.py b/lmdeploy/pytorch/models/internlm2_ve.py
deleted file mode 100644
index 011c917194..0000000000
--- a/lmdeploy/pytorch/models/internlm2_ve.py
+++ /dev/null
@@ -1,322 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from collections.abc import Iterable
-from typing import Any
-
-import torch
-from torch import nn
-from transformers.configuration_utils import PretrainedConfig
-
-from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager, get_step_ctx_manager
-from lmdeploy.pytorch.models.internlm2 import InternLM2Attention, InternLM2MLP
-from lmdeploy.pytorch.nn import RMSNorm, RopeType, build_rotary_embedding
-from lmdeploy.pytorch.nn.linear import build_rowwise_linear
-from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters, get_rope_theta
-from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
-
-from .utils.cudagraph import CudaGraphMixin
-
-
-class InternLM2VEDecoderLayer(nn.Module):
-    """Decoder layer with visual expert."""
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 layer_idx: int,
-                 dtype: torch.dtype = None,
-                 device: torch.device = None):
-        super().__init__()
-        self.layer_idx = layer_idx
-        self.hidden_size = config.hidden_size
-        quantization_config = getattr(config, 'quantization_config', None)
-
-        # build attention layer
-        self.attention = InternLM2Attention(config, dtype=dtype, device=device)
-
-        # build MLP
-        self.feed_forward = InternLM2MLP(config, dtype=dtype, device=device)
-
-        # build visual expert
-        self.feed_forward_ve = InternLM2MLP(config, dtype=dtype, device=device)
-
-        # build input layer norm
-        self.attention_norm = RMSNorm(config.hidden_size,
-                                      config.rms_norm_eps,
-                                      quant_config=quantization_config,
-                                      dtype=dtype,
-                                      device=device)
-
-        # build attention layer norm
-        self.ffn_norm = RMSNorm(config.hidden_size,
-                                config.rms_norm_eps,
-                                quant_config=quantization_config,
-                                dtype=dtype,
-                                device=device)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: list[torch.FloatTensor] | None,
-        residual: torch.Tensor | None = None,
-        attn_metadata: Any = None,
-        vision_embedding_indexing: torch.Tensor | None = None,
-        text_embedding_indexing: torch.Tensor | None = None,
-    ):
-
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.attention_norm(hidden_states)
-        else:
-            hidden_states, residual = self.attention_norm(hidden_states, residual)
-
-        # Self Attention
-        hidden_states = self.attention(
-            hidden_states=hidden_states,
-            rotary_pos_emb=rotary_pos_emb,
-            past_key_value=past_key_value,
-            attn_metadata=attn_metadata,
-        )
-
-        # Fully Connected
-        hidden_states, residual = self.ffn_norm(hidden_states, residual)
-        if vision_embedding_indexing is not None:
-            hidden_states[:, vision_embedding_indexing, :] = self.feed_forward_ve(
-                hidden_states[:, vision_embedding_indexing, :].reshape(-1, self.hidden_size)).unsqueeze(0)
-            if text_embedding_indexing is not None:
-                hidden_states[:, text_embedding_indexing, :] = self.feed_forward(
-                    hidden_states[:, text_embedding_indexing, :].reshape(-1, self.hidden_size)).unsqueeze(0)
-        else:
-            hidden_states = self.feed_forward(hidden_states)
-
-        outputs = (hidden_states, residual)
-        return outputs
-
-
-class InternLM2VEModel(nn.Module):
-    """Internlm2 model with visual expert."""
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.tok_embeddings = nn.Embedding(config.vocab_size,
-                                           config.hidden_size,
-                                           self.padding_idx,
-                                           dtype=dtype,
-                                           device=device)
-
-        # build all decode layers
-        self.layers = nn.ModuleList([
-            InternLM2VEDecoderLayer(config, layer_idx, dtype=dtype, device=device)
-            for layer_idx in range(config.num_hidden_layers)
-        ])
-
-        # build norm
-        self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps, dtype=dtype, device=device)
-
-        # build rotary embedding in Model
-        rope_scaling = get_rope_parameters(config)
-        scaling_factor = 1.0
-        emb_type = RopeType.LinearScaling
-        if rope_scaling is not None:
-            scaling_factor = rope_scaling.get('factor', scaling_factor)
-            rope_type = rope_scaling['type']
-            if rope_type == 'linear':
-                emb_type = RopeType.LinearScaling
-            if rope_type == 'dynamic':
-                emb_type = RopeType.DynamicNTKScaling
-            else:
-                raise RuntimeError(f'Unsupported rope type: {rope_type}')
-        rope_dim = config.hidden_size // config.num_attention_heads
-        rope_max_pos_emb = config.max_position_embeddings
-        rope_base = get_rope_theta(config)
-        self.rotary_emb = build_rotary_embedding(
-            rope_dim,
-            rope_max_pos_emb,
-            rope_base,
-            scaling_factor,
-            emb_type=emb_type,
-        )
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: list[torch.FloatTensor] | None = None,
-        attn_metadata: Any = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-        vision_embedding_indexing: torch.Tensor | None = None,
-        text_embedding_indexing: torch.Tensor | None = None,
-    ):
-        """Rewrite of forward."""
-
-        # token embedding
-        if inputs_embeds is None:
-            inputs_embeds = self.tok_embeddings(input_ids)
-
-        hidden_states = inputs_embeds
-
-        # rotary embedding
-        cos, sin = self.rotary_emb(hidden_states, position_ids)
-        cos, sin = cos[0], sin[0]
-        rotary_pos_emb = (cos, sin)
-
-        # decoding
-        residual = None
-        for idx, decoder_layer in enumerate(self.layers):
-            past_key_value = past_key_values[idx]
-            hidden_states, residual = decoder_layer(
-                hidden_states,
-                rotary_pos_emb=rotary_pos_emb,
-                past_key_value=past_key_value,
-                residual=residual,
-                attn_metadata=attn_metadata,
-                vision_embedding_indexing=vision_embedding_indexing,
-                text_embedding_indexing=text_embedding_indexing,
-            )
-
-        # norm
-        hidden_states, _ = self.norm(hidden_states, residual)
-
-        return hidden_states
-
-    def get_input_embeddings(self):
-        """Get input embeddings."""
-        return self.tok_embeddings
-
-
-class InternLM2VEForCausalLM(nn.Module, CudaGraphMixin):
-    """Rewrote model of InternLM2ForCausalLM with visual expert."""
-
-    packed_modules_mapping = {
-        'gate_up_proj': [
-            'w1',
-            'w3',
-        ],
-    }
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 ctx_mgr: StepContextManager,
-                 dtype: torch.dtype = None,
-                 device: torch.device = None):
-        super().__init__()
-        self.config = config
-        self.ctx_mgr = ctx_mgr
-        # build Model
-        self.model = InternLM2VEModel(config, dtype=dtype, device=device)
-        # build lm_head
-        self.output = build_rowwise_linear(config.hidden_size,
-                                           config.vocab_size,
-                                           bias=False,
-                                           dtype=dtype,
-                                           device=device)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        past_key_values: list[list[torch.Tensor]],
-        attn_metadata: Any = None,
-        inputs_embeds: torch.Tensor = None,
-        vision_embedding_indexing: torch.Tensor | None = None,
-        text_embedding_indexing: torch.Tensor | None = None,
-        **kwargs,
-    ):
-        """Model forward, return logits."""
-        hidden_states = self.model(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            attn_metadata=attn_metadata,
-            inputs_embeds=inputs_embeds,
-            vision_embedding_indexing=vision_embedding_indexing,
-            text_embedding_indexing=text_embedding_indexing,
-        )
-        return hidden_states
-
-    def get_logits(self, hidden_states: torch.Tensor):
-        """Compute logits of the model output."""
-        return self.output(hidden_states)
-
-    def support_cuda_graph(
-        self,
-        input_ids: torch.Tensor,
-        attn_metadata: Any = None,
-        **kwargs,
-    ):
-        """Support cudagraph."""
-        context = get_step_ctx_manager().current_context()
-        if not context.global_is_decoding():
-            return False
-        seq_lens = input_ids.size(1)
-        if seq_lens <= 512:
-            return True
-        return False
-
-    def get_input_embeddings(self):
-        """Get input embeddings."""
-        return self.model.get_input_embeddings()
-
-    def prepare_inputs_for_generation(
-        self,
-        past_key_values: list[list[torch.Tensor]],
-        inputs_embeds: torch.Tensor | None = None,
-        context: StepContext = None,
-    ):
-        """Prepare input."""
-        # get input_ids, position_ids and attention metadatas
-        input_ids = context.input_ids
-        position_ids = context.position_ids
-        attn_metadata = context.attn_metadata
-
-        # process vision embeddings
-        vision_embeddings = context.input_embeddings
-        vision_embedding_indexing = context.input_embedding_indexing
-        if vision_embeddings is not None and len(vision_embeddings) > 0:
-            if inputs_embeds is None:
-                inputs_embeds = self.get_input_embeddings()(input_ids)
-            inputs_embeds[:, vision_embedding_indexing, :] = vision_embeddings.to(inputs_embeds)
-
-        # inputs of forward
-        return dict(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            attn_metadata=attn_metadata,
-            inputs_embeds=inputs_embeds,
-        )
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        """Load weights."""
-        # modify from vllm
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ('.gate_up_proj', '.w1', 0),
-            ('.gate_up_proj', '.w3', 1),
-        ]
-
-        params_dict = dict(self.named_parameters())
-        for name, loaded_weight in weights:
-            if 'rotary_emb.inv_freq' in name:
-                continue
-            if ('rotary_emb.cos_cached' in name or 'rotary_emb.sin_cached' in name):
-                continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                param = params_dict[name]
-                load_weight(param, loaded_weight, shard_id=shard_id)
-                break
-            else:
-                if '.wqkv' in name:
-                    param = params_dict[name]
-                    q, k, v = param.weight_spliter(loaded_weight, layout='hgd')
-                    load_weight(param, q, shard_id='q')
-                    load_weight(param, k, shard_id='k')
-                    load_weight(param, v, shard_id='v')
-                else:
-                    param = params_dict[name]
-                    load_weight(param, loaded_weight)
diff --git a/lmdeploy/pytorch/models/internvl.py b/lmdeploy/pytorch/models/internvl.py
index d889f49bd7..d87bb44e3b 100644
--- a/lmdeploy/pytorch/models/internvl.py
+++ b/lmdeploy/pytorch/models/internvl.py
@@ -459,18 +459,9 @@ def __init__(self,
 
         llm_config = config.llm_config
         self.llm_arch_name = llm_config.architectures[0]
-        self.is_mono = self.llm_arch_name == 'InternLM2VEForCausalLM'
 
         vision_config = config.vision_config
-        if self.is_mono:
-            from .internvl_patch import InternVisionPatchModel
-            self.vision_model = InternVisionPatchModel(
-                vision_config,
-                dtype=dtype,
-                device=device,
-            )
-        else:
-            self.vision_model = InternVisionModel(vision_config, dtype=dtype, device=device)
+        self.vision_model = InternVisionModel(vision_config, dtype=dtype, device=device)
 
         self.language_model = build_model_from_hf_config(llm_config, dtype=dtype, device=device)
         self.lm_head = self.language_model.lm_head
@@ -486,11 +477,6 @@ def __init__(self,
                       device=device), nn.GELU(),
             nn.Linear(llm_hidden_size, llm_hidden_size, bias=True, dtype=dtype, device=device))
 
-        # for Mono-InternVL
-        if self.is_mono:
-            assert dtype != torch.float16, ('Currently Mono-InternVL does not support FP16 due to'
-                                            'numerical instability. Please use BF16 instead.')
-
         self.input_processor = InternVLInputProcessor(self.config, dtype)
 
         self.compile_vit = False
@@ -554,11 +540,7 @@ def extract_feature(self, pixel_values):
         """Extract vision feature."""
         assert self.select_layer == -1
         vit_embeds = self.vision_model(pixel_values)
-        if self.is_mono:
-            if int(vit_embeds.shape[1]**0.5)**2 != vit_embeds.shape[1]:
-                vit_embeds = vit_embeds[:, 1:, :]
-        else:
-            vit_embeds = vit_embeds[:, 1:, :]
+        vit_embeds = vit_embeds[:, 1:, :]
 
         h = w = int(vit_embeds.shape[1]**0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
@@ -788,20 +770,11 @@ def forward(
 
             inputs_embeds = lang_embeds
 
-        if self.is_mono:
-            return self.language_model.forward(input_ids=input_ids,
-                                               inputs_embeds=inputs_embeds,
-                                               past_key_values=past_key_values,
-                                               position_ids=position_ids,
-                                               attn_metadata=attn_metadata,
-                                               vision_embedding_indexing=vision_embedding_indexing,
-                                               text_embedding_indexing=text_embedding_indexing)
-        else:
-            return self.language_model.forward(input_ids=input_ids,
-                                               inputs_embeds=inputs_embeds,
-                                               past_key_values=past_key_values,
-                                               position_ids=position_ids,
-                                               attn_metadata=attn_metadata)
+        return self.language_model.forward(input_ids=input_ids,
+                                           inputs_embeds=inputs_embeds,
+                                           past_key_values=past_key_values,
+                                           position_ids=position_ids,
+                                           attn_metadata=attn_metadata)
 
     def get_input_embeddings(self):
         """Get input embeddings."""
@@ -836,10 +809,6 @@ def prepare_inputs_for_generation(
                 pixel_values = None
                 image_mask = None
 
-        if self.is_mono and pixel_values is not None:
-            vision_embedding_indexing = torch.arange(input_ids.shape[1], device=input_ids.device)
-            vision_embedding_indexing = vision_embedding_indexing[image_mask[0]]
-
         # get inputs from context
         if vision_embeddings is not None and len(vision_embeddings) > 0:
             vision_embedding_indexing = context.input_embedding_indexing
@@ -893,34 +862,15 @@ def prepare_inputs_for_generation(
                     # init model metas
                     context.model_metas = [{'new_seqlen': seqlen} for seqlen in seq_lengths.tolist()]
 
-        if self.is_mono and vision_embedding_indexing is not None:
-            all_indices = torch.arange(input_ids.shape[1]).to(input_ids)
-            text_embedding_indexing = all_indices[~torch.isin(all_indices, vision_embedding_indexing)]
-            if vision_embedding_indexing.numel() == 0:
-                vision_embedding_indexing = None
-            if text_embedding_indexing.numel() == 0:
-                text_embedding_indexing = None
-            return dict(input_ids=input_ids,
-                        position_ids=position_ids,
-                        past_key_values=past_key_values,
-                        attn_metadata=attn_metadata,
-                        pixel_values=pixel_values,
-                        image_mask=image_mask,
-                        inputs_embeds=inputs_embeds,
-                        vision_embedding_indexing=vision_embedding_indexing,
-                        text_embedding_indexing=text_embedding_indexing,
-                        image_token_id=image_token_id,
-                        context=context)
-        else:
-            return dict(input_ids=input_ids,
-                        position_ids=position_ids,
-                        past_key_values=past_key_values,
-                        attn_metadata=attn_metadata,
-                        pixel_values=pixel_values,
-                        image_mask=image_mask,
-                        inputs_embeds=inputs_embeds,
-                        image_token_id=image_token_id,
-                        context=context)
+        return dict(input_ids=input_ids,
+                    position_ids=position_ids,
+                    past_key_values=past_key_values,
+                    attn_metadata=attn_metadata,
+                    pixel_values=pixel_values,
+                    image_mask=image_mask,
+                    inputs_embeds=inputs_embeds,
+                    image_token_id=image_token_id,
+                    context=context)
 
     def load_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int):
         """Load lora weights."""
diff --git a/lmdeploy/pytorch/models/internvl_patch.py b/lmdeploy/pytorch/models/internvl_patch.py
deleted file mode 100644
index 1a53bc68ce..0000000000
--- a/lmdeploy/pytorch/models/internvl_patch.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-from transformers.configuration_utils import PretrainedConfig
-
-
-class InternVisionEmbeddings(nn.Module):
-    """Mono vision."""
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        self.config = config
-        self.embed_dim = config.hidden_size
-        self.image_size = config.image_size
-        self.patch_size = config.patch_size
-
-        self.class_embedding = nn.Parameter(torch.empty(1, 1, self.embed_dim, dtype=dtype, device=device), )
-
-        self.patch_embedding = nn.Conv2d(in_channels=3,
-                                         out_channels=self.embed_dim,
-                                         kernel_size=self.patch_size,
-                                         stride=self.patch_size,
-                                         dtype=dtype,
-                                         device=device)
-
-        self.num_patches = (self.image_size // self.patch_size)**2
-        self.num_positions = self.num_patches + 1
-
-        self.position_embedding = nn.Parameter(
-            torch.empty(1, self.num_positions, self.embed_dim, dtype=dtype, device=device))
-
-    def _get_pos_embed(self, pos_embed, H, W):
-        target_dtype = pos_embed.dtype
-        pos_embed = pos_embed.float().reshape(1, self.image_size // self.patch_size, self.image_size // self.patch_size,
-                                              -1).permute(0, 3, 1, 2)
-        pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False)
-        pos_embed = pos_embed.reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
-        return pos_embed
-
-    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
-        target_dtype = self.patch_embedding.weight.dtype
-        patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, channel, width, height]
-        batch_size, _, height, width = patch_embeds.shape
-        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
-        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
-        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
-        position_embedding = torch.cat(
-            [self.position_embedding[:, :1, :],
-             self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)],
-            dim=1)
-        embeddings = embeddings + position_embedding.to(target_dtype)
-        return embeddings
-
-
-class InternVisionPatchModel(nn.Module):
-    """Mono vision."""
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        self.config = config
-        self.embeddings = InternVisionEmbeddings(config, dtype=dtype, device=device)
-
-    def forward(
-        self,
-        pixel_values: torch.FloatTensor | None = None,
-    ):
-        if len(pixel_values.shape) != 4:
-            raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
-
-        hidden_states = self.embeddings(pixel_values)[:, 1:]
-        return hidden_states
diff --git a/lmdeploy/pytorch/models/module_map.py b/lmdeploy/pytorch/models/module_map.py
index c0bdf5779a..d8ea847c3e 100644
--- a/lmdeploy/pytorch/models/module_map.py
+++ b/lmdeploy/pytorch/models/module_map.py
@@ -10,6 +10,16 @@
 
 DEVICE_SPECIAL_MODULE_MAP = dict(ascend=ASCEND_MODULE_MAP, maca=MACA_MODULE_MAP, camb=CAMB_MODULE_MAP)
 
+REMOVED_MODEL_MAP = {
+    'InternLMForCausalLM': 'InternLM',
+    'QWenLMHeadModel': 'Qwen and Qwen-VL',
+    'BaiChuanForCausalLM': 'Baichuan',
+    'BaichuanForCausalLM': 'Baichuan and Baichuan2',
+    'Starcoder2ForCausalLM': 'StarCoder2',
+    'InternLM2VEForCausalLM': 'Mono-InternVL',
+    'MllamaForConditionalGeneration': 'mllama',
+}
+
 # llama
 MODULE_MAP.update({
     'LlamaForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.llama.LlamaForCausalLM',
@@ -21,11 +31,6 @@
     f'{LMDEPLOY_PYTORCH_MODEL_PATH}.llama4.Llama4ForConditionalGeneration',
 })
 
-# baichuan
-MODULE_MAP.update({
-    'BaichuanForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.baichuan.BaichuanForCausalLM',
-})
-
 # chatglm
 MODULE_MAP.update({
     'ChatGLMForConditionalGeneration':
@@ -60,11 +65,6 @@
 # glm5
 MODULE_MAP.update({'GlmMoeDsaForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.deepseek_v32.DeepseekV32ForCausalLM'})
 
-# internlm
-MODULE_MAP.update({
-    'InternLMForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm.InternLMForCausalLM',
-})
-
 # internlm2
 MODULE_MAP.update({
     'InternLM2ForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm2.InternLM2ForCausalLM',
@@ -125,11 +125,6 @@
     f'{LMDEPLOY_PYTORCH_MODEL_PATH}.llava.LlavaNextForConditionalGeneration'  # noqa: E501
 })
 
-# qwen
-MODULE_MAP.update({
-    'QWenLMHeadModel': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen.QWenLMHeadModel',
-})
-
 # qwen1.5
 MODULE_MAP.update({
     'Qwen2ForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen2.Qwen2ForCausalLM',
@@ -207,11 +202,6 @@
     f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen3_omni_moe_thinker.Qwen3OmniMoeThinkerForConditionalGeneration',
 })
 
-# starcoder2
-MODULE_MAP.update({
-    'Starcoder2ForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.starcoder2.Starcoder2ForCausalLM',
-})
-
 # phi-3
 MODULE_MAP.update({
     'Phi3ForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.phi3.Phi3ForCausalLM',
@@ -247,11 +237,6 @@
     f'{LMDEPLOY_PYTORCH_MODEL_PATH}.interns1_pro.InternS1ProForConditionalGeneration',
 })
 
-# mono-internvl
-MODULE_MAP.update({
-    'InternLM2VEForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm2_ve.InternLM2VEForCausalLM',
-})
-
 # phi3 vision
 MODULE_MAP.update({
     'Phi3VForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.phi3_v.Phi3VForCausalLM',
diff --git a/lmdeploy/pytorch/models/patch.py b/lmdeploy/pytorch/models/patch.py
index 92d29e1d13..c2f1d60120 100644
--- a/lmdeploy/pytorch/models/patch.py
+++ b/lmdeploy/pytorch/models/patch.py
@@ -16,7 +16,7 @@
 
 from ..config import ModelConfig
 from ..devices import get_device_manager
-from .module_map import CUSTOM_MODULE_MAP, DEVICE_SPECIAL_MODULE_MAP, MODULE_MAP
+from .module_map import CUSTOM_MODULE_MAP, DEVICE_SPECIAL_MODULE_MAP, MODULE_MAP, REMOVED_MODEL_MAP
 
 logger = get_logger('lmdeploy')
 
@@ -153,6 +153,15 @@ def update_custom_module_map(module_map_path: str):
     CUSTOM_MODULE_MAP.update(new_mod_map)
 
 
+def _raise_if_removed_model(arch: str):
+    """Raise a clear error for intentionally removed model families."""
+    family = REMOVED_MODEL_MAP.get(arch)
+    if family is None:
+        return
+    raise RuntimeError(f'{family} ({arch}) support has been removed from LMDeploy. '
+                       'Please use an older LMDeploy release or migrate to a newer supported model family.')
+
+
 def _get_model_class(config, module_map):
     """Get model class."""
     auto_map = getattr(config, 'auto_map', dict())
@@ -160,6 +169,7 @@ def _get_model_class(config, module_map):
         mapname = auto_map['AutoModelForCausalLM']
         if '.' in mapname:
             mapname = mapname.split('.')[-1]
+        _raise_if_removed_model(mapname)
         if mapname in module_map:
             qualname = module_map[mapname]
             module_cls = _class_from_qualname(qualname)
@@ -177,6 +187,7 @@ def _get_model_class(config, module_map):
         return module_cls
 
     for arch in architectures:
+        _raise_if_removed_model(arch)
         if arch in module_map:
             qualname = module_map[arch]
             module_cls = _class_from_qualname(qualname)
diff --git a/lmdeploy/pytorch/models/qwen.py b/lmdeploy/pytorch/models/qwen.py
deleted file mode 100644
index 650222c4b3..0000000000
--- a/lmdeploy/pytorch/models/qwen.py
+++ /dev/null
@@ -1,413 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-from collections.abc import Iterable
-from typing import Any
-
-import torch
-from torch import nn
-from transformers.configuration_utils import PretrainedConfig
-
-from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
-from lmdeploy.pytorch.nn.linear import (
-    build_down_linear,
-    build_gateup_linear,
-    build_o_proj,
-    build_qkv_proj,
-    build_rowwise_linear,
-)
-from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
-
-from .utils.cudagraph import CudaGraphMixin
-
-
-class QWenAttention(torch.nn.Module):
-    """Parallel self-attention layer abstract class.
-
-    Self-attention layer takes input with size [s, b, h] and returns output of the same size.
-    """
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        quantization_config = getattr(config, 'quantization_config', None)
-
-        self.hidden_size = config.hidden_size
-        self.split_size = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.projection_size = config.kv_channels * config.num_attention_heads
-        self.num_attention_heads = config.num_attention_heads
-        self.num_kv_heads = self.num_attention_heads
-        self.head_dim = self.hidden_size // self.num_heads
-        self.c_attn = build_qkv_proj(
-            config.hidden_size,
-            num_q_heads=self.num_attention_heads,
-            num_kv_heads=self.num_kv_heads,
-            head_size=self.head_dim,
-            bias=True,
-            quant_config=quantization_config,
-            dtype=dtype,
-            device=device,
-        )
-
-        # apply rotary
-        self.apply_rotary_pos_emb = ApplyRotaryEmb()
-
-        # attention
-        self.attn_fwd = Attention(
-            self.num_attention_heads,
-            self.head_dim,
-            num_kv_heads=self.num_kv_heads,
-        )
-
-        # o_proj
-        self.c_proj = build_o_proj(self.projection_size,
-                                   config.hidden_size,
-                                   bias=not config.no_bias,
-                                   quant_config=quantization_config,
-                                   dtype=dtype,
-                                   device=device,
-                                   is_tp=True)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: tuple[torch.Tensor] | None = None,
-        attn_metadata: Any = None,
-    ):
-        """Rewrite of LlamaAttention.forward."""
-        # qkv proj
-        qkv_states = self.c_attn(hidden_states)
-        # (-1, heads, head_dim)
-        qkv_states = qkv_states.flatten(0, -2)
-        (query_states, key_states, value_states) = self.c_attn.split_qkv(qkv_states)
-
-        # apply rotary embedding
-        cos, sin = rotary_pos_emb
-        query_states, key_states = self.apply_rotary_pos_emb(
-            query_states,
-            key_states,
-            cos,
-            sin,
-            inplace=True,
-        )
-
-        # attention
-        attn_output = self.attn_fwd(
-            query_states,
-            key_states,
-            value_states,
-            past_key_value[0],
-            past_key_value[1],
-            attn_metadata,
-            k_scales_zeros=None if len(past_key_value) == 2 else past_key_value[2],
-            v_scales_zeros=None if len(past_key_value) == 2 else past_key_value[3],
-            inplace=True,
-        )
-        attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1)
-
-        # o proj
-        attn_output = self.c_proj(attn_output)
-        return attn_output
-
-
-class QWenMLP(nn.Module):
-    """mlp."""
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        quantization_config = getattr(config, 'quantization_config', None)
-        ff_dim_in = config.intermediate_size // 2
-        # gate up
-        self.gate_up_proj = build_gateup_linear(
-            config.hidden_size,
-            [ff_dim_in, ff_dim_in],
-            bias=not config.no_bias,
-            dtype=dtype,
-            device=device,
-            quant_config=quantization_config,
-            is_tp=True,
-        )
-
-        # silu and mul
-        self.act_fn = SiluAndMul(inplace=True)
-
-        # down
-        self.c_proj = build_down_linear(ff_dim_in,
-                                        config.hidden_size,
-                                        bias=not config.no_bias,
-                                        quant_config=quantization_config,
-                                        dtype=dtype,
-                                        device=device,
-                                        is_tp=True)
-
-    def forward(self, x):
-        """forward."""
-        gate_up = self.gate_up_proj(x)
-        act = self.act_fn(gate_up)
-        return self.c_proj(act)
-
-
-class QWenBlock(torch.nn.Module):
-    """A single transformer layer.
-
-    Transformer layer takes input with size [s, b, h] and returns an output of the same size.
-    """
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 layer_number: int,
-                 dtype: torch.dtype = None,
-                 device: torch.device = None):
-        super().__init__()
-        self.layer_number = layer_number
-        hidden_size = config.hidden_size
-        self.bf16 = config.bf16
-
-        quantization_config = getattr(config, 'quantization_config', None)
-
-        # build attention layer
-        self.attn = QWenAttention(config, dtype=dtype, device=device)
-
-        # build MLP
-        self.mlp = QWenMLP(config, dtype=dtype, device=device)
-
-        # build input layer norm
-        self.ln_1 = RMSNorm(hidden_size,
-                            config.layer_norm_epsilon,
-                            quant_config=quantization_config,
-                            dtype=dtype,
-                            device=device)
-
-        # build attention layer norm
-        self.ln_2 = RMSNorm(hidden_size,
-                            config.layer_norm_epsilon,
-                            quant_config=quantization_config,
-                            dtype=dtype,
-                            device=device)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: list[torch.FloatTensor] | None,
-        residual: torch.Tensor | None = None,
-        attn_metadata: Any = None,
-    ):
-
-        if residual is None:
-            residual = hidden_states
-            layernorm_output = self.ln_1(hidden_states)
-        else:
-            layernorm_output, residual = self.ln_1(hidden_states, residual)
-
-        # Self Attention
-        layernorm_input = self.attn(
-            hidden_states=layernorm_output,
-            rotary_pos_emb=rotary_pos_emb,
-            past_key_value=past_key_value,
-            attn_metadata=attn_metadata,
-        )
-
-        # Fully Connected
-        layernorm_output, residual = self.ln_2(layernorm_input, residual)
-        mlp_output = self.mlp(layernorm_output)
-
-        outputs = (mlp_output, residual)
-        return outputs
-
-
-class QWenModel(nn.Module):
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        self.vocab_size = config.vocab_size
-        self.embed_dim = config.hidden_size
-        self.wte = nn.Embedding(self.vocab_size, self.embed_dim, dtype=dtype, device=device)
-
-        # build all decode layers
-        self.h = nn.ModuleList(
-            [QWenBlock(config, layer_idx, dtype=dtype, device=device) for layer_idx in range(config.num_hidden_layers)])
-
-        # build rotary embedding
-        emb_type = RopeType.LinearScaling
-        if config.rotary_pct == 1.0:
-            self.rotary_ndims = None
-        else:
-            assert config.rotary_pct < 1
-            self.rotary_ndims = int(config.kv_channels * config.rotary_pct)
-        rope_dim = (self.rotary_ndims if self.rotary_ndims is not None else config.kv_channels)
-        rope_max_pos_emb = getattr(config, 'max_position_embeddings', 4096)
-        rope_base = config.rotary_emb_base
-        self.rotary_emb = build_rotary_embedding(
-            rope_dim,
-            rope_max_pos_emb,
-            rope_base,
-            emb_type=emb_type,
-        )
-
-        self.ln_f = RMSNorm(self.embed_dim, eps=config.layer_norm_epsilon, dtype=dtype, device=device)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: list[torch.FloatTensor] | None = None,
-        attn_metadata: Any = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-    ):
-        """forward."""
-
-        # token embedding
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
-
-        hidden_states = inputs_embeds
-
-        # rotary embedding
-        cos, sin = self.rotary_emb(hidden_states, position_ids)
-        cos, sin = cos[0], sin[0]
-        rotary_pos_emb = (cos, sin)
-
-        # decoding
-        residual = None
-        for idx, decoder_layer in enumerate(self.h):
-            past_key_value = past_key_values[idx]
-            hidden_states, residual = decoder_layer(
-                hidden_states,
-                rotary_pos_emb=rotary_pos_emb,
-                past_key_value=past_key_value,
-                residual=residual,
-                attn_metadata=attn_metadata,
-            )
-
-        # norm
-        hidden_states, residual = self.ln_f(hidden_states, residual)
-
-        return hidden_states
-
-    def get_input_embeddings(self):
-        """Get input embeddings."""
-        return self.wte
-
-
-class QWenLMHeadModel(nn.Module, CudaGraphMixin):
-    """Rewrote model."""
-
-    packed_modules_mapping = {
-        'gate_up_proj': [
-            'w2',
-            'w1',
-        ],
-    }
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 ctx_mgr: StepContextManager,
-                 dtype: torch.dtype = None,
-                 device: torch.device = None):
-        super().__init__()
-        self.config = config
-        self.ctx_mgr = ctx_mgr
-        # build Model
-        self.transformer = QWenModel(config, dtype=dtype, device=device)
-
-        # output_layers
-        self.lm_head = build_rowwise_linear(config.hidden_size,
-                                            config.vocab_size,
-                                            bias=False,
-                                            dtype=dtype,
-                                            device=device)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        past_key_values: list[list[torch.Tensor]],
-        attn_metadata: Any = None,
-        inputs_embeds: torch.Tensor = None,
-        **kwargs,
-    ):
-        """Model forward, return logits."""
-        hidden_states = self.transformer(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            attn_metadata=attn_metadata,
-            inputs_embeds=inputs_embeds,
-        )
-        return hidden_states
-
-    def get_logits(self, hidden_states: torch.Tensor):
-        """Compute logits of the model output."""
-        return self.lm_head(hidden_states)
-
-    def get_input_embeddings(self):
-        """Get input embeddings."""
-        return self.transformer.get_input_embeddings()
-
-    def prepare_inputs_for_generation(
-        self,
-        past_key_values: list[list[torch.Tensor]],
-        inputs_embeds: torch.Tensor | None = None,
-        context: StepContext = None,
-    ):
-        """Prepare input."""
-        # get input_ids, position_ids and attention metadatas
-        input_ids = context.input_ids
-        position_ids = context.position_ids
-        attn_metadata = context.attn_metadata
-
-        # process vision embeddings
-        vision_embeddings = context.input_embeddings
-        vision_embedding_indexing = context.input_embedding_indexing
-        if vision_embeddings is not None and len(vision_embeddings) > 0:
-            if inputs_embeds is None:
-                inputs_embeds = self.get_input_embeddings()(input_ids)
-            inputs_embeds[:, vision_embedding_indexing, :] = vision_embeddings.to(inputs_embeds)
-
-        # inputs of forward
-        return dict(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            attn_metadata=attn_metadata,
-            inputs_embeds=inputs_embeds,
-        )
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        """Load weights."""
-        # modify from vllm
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ('.gate_up_proj', '.w2', 0),
-            ('.gate_up_proj', '.w1', 1),
-        ]
-
-        params_dict = dict(self.named_parameters())
-        for name, loaded_weight in weights:
-            if 'visual' in name:
-                continue
-            if 'rotary_pos_emb.inv_freq' in name:
-                continue
-            if ('rotary_pos_emb.cos_cached' in name or 'rotary_pos_emb.sin_cached' in name):
-                continue
-            if (self.config.tie_word_embeddings and 'lm_head.weight' in name):
-                continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                param = params_dict[name]
-                load_weight(param, loaded_weight, shard_id=shard_id)
-                break
-            else:
-                if '.c_attn' in name:
-                    param = params_dict[name]
-                    q, k, v = param.weight_spliter(loaded_weight)
-                    load_weight(param, q, shard_id='q')
-                    load_weight(param, k, shard_id='k')
-                    load_weight(param, v, shard_id='v')
-                else:
-                    param = params_dict[name]
-                    load_weight(param, loaded_weight)
diff --git a/lmdeploy/pytorch/models/starcoder2.py b/lmdeploy/pytorch/models/starcoder2.py
deleted file mode 100644
index 43c7c9bc6f..0000000000
--- a/lmdeploy/pytorch/models/starcoder2.py
+++ /dev/null
@@ -1,386 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-from collections.abc import Iterable
-from typing import Any
-
-import torch
-from torch import nn
-from transformers.configuration_utils import PretrainedConfig
-
-from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, LayerNorm, build_rotary_embedding_from_config
-from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_qkv_proj, build_rowwise_linear
-from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
-
-from .utils.cudagraph import CudaGraphMixin
-
-
-class Starcoder2Attention(nn.Module):
-    """Rewrite module of Starcoder2Attention."""
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        quantization_config = getattr(config, 'quantization_config', None)
-        num_heads = config.num_attention_heads
-        num_key_value_heads = config.num_key_value_heads
-        hidden_size = config.hidden_size
-        head_dim = getattr(config, 'head_dim', hidden_size // num_heads)
-
-        # packed qkv
-        self.qkv_proj = build_qkv_proj(
-            hidden_size,
-            num_q_heads=num_heads,
-            num_kv_heads=num_key_value_heads,
-            head_size=head_dim,
-            bias=config.use_bias,
-            quant_config=quantization_config,
-            dtype=dtype,
-            device=device,
-        )
-
-        # rotary embedding
-        self.apply_rotary_pos_emb = ApplyRotaryEmb()
-
-        # attention
-        sliding_window = getattr(config, 'sliding_window', None)
-        self.attn_fwd = Attention(
-            num_heads,
-            head_dim,
-            num_kv_heads=num_key_value_heads,
-            v_head_size=head_dim,
-            sliding_window=sliding_window,
-        )
-
-        # o_proj
-        self.o_proj = build_rowwise_linear(num_heads * head_dim,
-                                           hidden_size,
-                                           bias=config.use_bias,
-                                           quant_config=quantization_config,
-                                           dtype=dtype,
-                                           device=device,
-                                           is_tp=True)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: tuple[torch.Tensor] | None = None,
-        attn_metadata: Any = None,
-    ):
-        """Rewrite of LlamaAttention.forward."""
-        # qkv proj
-        qkv_states = self.qkv_proj(hidden_states)
-        # (-1, heads, head_dim)
-        qkv_states = qkv_states.flatten(0, -2)
-        query_states, key_states, value_states = self.qkv_proj.split_qkv(qkv_states)
-
-        # apply rotary embedding
-        cos, sin = rotary_pos_emb
-        query_states, key_states = self.apply_rotary_pos_emb(
-            query_states,
-            key_states,
-            cos,
-            sin,
-            inplace=True,
-        )
-
-        # attention
-        attn_output = self.attn_fwd(
-            query_states,
-            key_states,
-            value_states,
-            past_key_value[0],
-            past_key_value[1],
-            attn_metadata,
-            k_scales_zeros=None if len(past_key_value) == 2 else past_key_value[2],
-            v_scales_zeros=None if len(past_key_value) == 2 else past_key_value[3],
-            inplace=True,
-        )
-        attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1)
-
-        # o proj
-        attn_output = self.o_proj(attn_output)
-        return attn_output
-
-
-class Starcoder2MLP(nn.Module):
-    """mlp."""
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        quantization_config = getattr(config, 'quantization_config', None)
-        # gate up
-        self.c_fc = build_colwise_linear(
-            config.hidden_size,
-            config.intermediate_size,
-            bias=config.use_bias,
-            dtype=dtype,
-            device=device,
-            quant_config=quantization_config,
-            is_tp=True,
-        )
-
-        # silu and mul
-        hidden_act = config.hidden_act
-        if hidden_act is None:
-            hidden_act = 'gelu_pytorch_tanh'
-            assert hidden_act == 'gelu_pytorch_tanh'
-        self.act_fn = nn.GELU(approximate='tanh')
-
-        # down
-        self.c_proj = build_rowwise_linear(config.intermediate_size,
-                                           config.hidden_size,
-                                           bias=config.use_bias,
-                                           quant_config=quantization_config,
-                                           dtype=dtype,
-                                           device=device,
-                                           is_tp=True)
-
-    def forward(self, x):
-        """forward."""
-        gate_up = self.c_fc(x)
-        act = self.act_fn(gate_up)
-        return self.c_proj(act)
-
-
-class Starcoder2DecoderLayer(nn.Module):
-    """Decoder layer."""
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 layer_idx: int,
-                 dtype: torch.dtype = None,
-                 device: torch.device = None):
-        super().__init__()
-        self.layer_idx = layer_idx
-
-        # build attention layer
-        self.self_attn = Starcoder2Attention(config, dtype=dtype, device=device)
-
-        # build MLP
-        self.mlp = Starcoder2MLP(config, dtype=dtype, device=device)
-
-        # build input layer norm
-        self.input_layernorm = LayerNorm(config.hidden_size, eps=config.norm_epsilon, dtype=dtype, device=device)
-
-        # build attention layer norm
-        self.post_attention_layernorm = LayerNorm(config.hidden_size,
-                                                  eps=config.norm_epsilon,
-                                                  dtype=dtype,
-                                                  device=device)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: list[torch.FloatTensor] | None,
-        residual: torch.Tensor | None = None,
-        attn_metadata: Any = None,
-    ):
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
-        else:
-            hidden_states, residual = self.input_layernorm(hidden_states, residual)
-
-        # Self Attention
-        hidden_states = self.self_attn(
-            hidden_states=hidden_states,
-            rotary_pos_emb=rotary_pos_emb,
-            past_key_value=past_key_value,
-            attn_metadata=attn_metadata,
-        )
-
-        # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
-
-        outputs = (hidden_states, residual)
-        return outputs
-
-
-class Starcoder2Model(nn.Module):
-    """model."""
-
-    def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: torch.device = None):
-        super().__init__()
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = nn.Embedding(config.vocab_size,
-                                         config.hidden_size,
-                                         self.padding_idx,
-                                         dtype=dtype,
-                                         device=device)
-
-        # build all decode layers
-        self.layers = nn.ModuleList([
-            Starcoder2DecoderLayer(config, layer_idx, dtype=dtype, device=device)
-            for layer_idx in range(config.num_hidden_layers)
-        ])
-
-        # build norm
-        self.norm = LayerNorm(config.hidden_size, eps=config.norm_epsilon, dtype=dtype, device=device)
-
-        # build rotary embedding
-        self.rotary_emb = build_rotary_embedding_from_config(config)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        position_ids: torch.LongTensor | None = None,
-        past_key_values: list[torch.FloatTensor] | None = None,
-        attn_metadata: Any = None,
-        inputs_embeds: torch.FloatTensor | None = None,
-    ):
-        """Rewrite of LlamaModel.forward."""
-
-        # token embedding
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        hidden_states = inputs_embeds
-
-        # rotary embedding
-        cos, sin = self.rotary_emb(hidden_states, position_ids)
-        cos, sin = cos[0], sin[0]
-        rotary_pos_emb = (cos, sin)
-
-        # decoding
-        residual = None
-        for idx, decoder_layer in enumerate(self.layers):
-            past_key_value = past_key_values[idx]
-            hidden_states, residual = decoder_layer(
-                hidden_states,
-                rotary_pos_emb=rotary_pos_emb,
-                past_key_value=past_key_value,
-                residual=residual,
-                attn_metadata=attn_metadata,
-            )
-
-        # norm
-        hidden_states, _ = self.norm(hidden_states, residual)
-
-        return hidden_states
-
-    def get_input_embeddings(self):
-        """Get input embeddings."""
-        return self.embed_tokens
-
-
-class Starcoder2ForCausalLM(nn.Module, CudaGraphMixin):
-    """ModelForCausalLM."""
-
-    packed_modules_mapping = {
-        'qkv_proj': [
-            'q_proj',
-            'k_proj',
-            'v_proj',
-        ],
-    }
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 ctx_mgr: StepContextManager,
-                 dtype: torch.dtype = None,
-                 device: torch.device = None):
-        super().__init__()
-        self.config = config
-        self.ctx_mgr = ctx_mgr
-        # build model
-        self.model = Starcoder2Model(config, dtype=dtype, device=device)
-        # build lm_head
-        self.lm_head = build_rowwise_linear(config.hidden_size,
-                                            config.vocab_size,
-                                            bias=False,
-                                            dtype=dtype,
-                                            device=device)
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor,
-        past_key_values: list[list[torch.Tensor]],
-        attn_metadata: Any = None,
-        inputs_embeds: torch.Tensor = None,
-        **kwargs,
-    ):
-        """Model forward, return logits."""
-        hidden_states = self.model(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            attn_metadata=attn_metadata,
-            inputs_embeds=inputs_embeds,
-        )
-        return hidden_states
-
-    def get_logits(self, hidden_states: torch.Tensor):
-        """Compute logits of the model output."""
-        return self.lm_head(hidden_states)
-
-    def update_weights(self):
-        """Update weights."""
-        self.lm_head.weight = self.model.embed_tokens.weight
-
-    def get_input_embeddings(self):
-        """Get input embeddings."""
-        return self.model.get_input_embeddings()
-
-    def prepare_inputs_for_generation(
-        self,
-        past_key_values: list[list[torch.Tensor]],
-        inputs_embeds: torch.Tensor | None = None,
-        context: StepContext = None,
-    ):
-        """Prepare input."""
-        # get input_ids, position_ids and attention metadatas
-        input_ids = context.input_ids
-        position_ids = context.position_ids
-        attn_metadata = context.attn_metadata
-
-        # process vision embeddings
-        vision_embeddings = context.input_embeddings
-        vision_embedding_indexing = context.input_embedding_indexing
-        if vision_embeddings is not None and len(vision_embeddings) > 0:
-            if inputs_embeds is None:
-                inputs_embeds = self.get_input_embeddings()(input_ids)
-            inputs_embeds[:, vision_embedding_indexing, :] = vision_embeddings.to(inputs_embeds)
-
-        # inputs of forward
-        return dict(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            attn_metadata=attn_metadata,
-            inputs_embeds=inputs_embeds,
-        )
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
-        """Load weights."""
-        # modify from vllm
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ('.qkv_proj', '.q_proj', 'q'),
-            ('.qkv_proj', '.k_proj', 'k'),
-            ('.qkv_proj', '.v_proj', 'v'),
-        ]
-
-        params_dict = dict(self.named_parameters())
-        for name, loaded_weight in weights:
-            if 'rotary_emb.inv_freq' in name:
-                continue
-            if ('rotary_emb.cos_cached' in name or 'rotary_emb.sin_cached' in name):
-                continue
-            if self.config.tie_word_embeddings and 'lm_head.weight' in name:
-                continue
-            for (param_name, weight_name, shard_id) in stacked_params_mapping:
-                if weight_name not in name:
-                    continue
-                name = name.replace(weight_name, param_name)
-                param = params_dict[name]
-                load_weight(param, loaded_weight, shard_id=shard_id)
-                break
-            else:
-                param = params_dict[name]
-                load_weight(param, loaded_weight)
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index 68d4c51f85..dba72a7e8b 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -87,15 +87,14 @@ class AsyncEngine:
                     ii) and iii).
                 - ii) The model_id of a lmdeploy-quantized model hosted
                     inside a model repo on huggingface.co, such as
-                    "InternLM/internlm-chat-20b-4bit",
                     "lmdeploy/llama2-chat-70b-4bit", etc.
                 - iii) The model_id of a model hosted inside a model repo
-                    on huggingface.co, such as "internlm/internlm-chat-7b",
-                    "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
+                    on huggingface.co, such as "internlm/internlm2-chat-7b",
+                    "Qwen/Qwen2.5-7B-Instruct"
                     and so on.
         model_name (str): needed when model_path is a pytorch model on
-            huggingface.co, such as "internlm/internlm-chat-7b",
-            "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
+            huggingface.co, such as "internlm/internlm2-chat-7b",
+            "Qwen/Qwen2.5-7B-Instruct" and so on.
         backend (str): either `turbomind` or `pytorch` backend. Default to
             `turbomind` backend.
         backend_config (TurbomindEngineConfig | PytorchEngineConfig): beckend
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index e8a3ca6ecc..37fb86d7bc 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -1558,11 +1558,10 @@ def serve(model_path: str,
                     ii) and iii).
                 - ii) The model_id of a lmdeploy-quantized model hosted
                     inside a model repo on huggingface.co, such as
-                    "InternLM/internlm-chat-20b-4bit",
                     "lmdeploy/llama2-chat-70b-4bit", etc.
                 - iii) The model_id of a model hosted inside a model repo
-                    on huggingface.co, such as "internlm/internlm-chat-7b",
-                    "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
+                    on huggingface.co, such as "internlm/internlm2-chat-7b",
+                    "Qwen/Qwen2.5-7B-Instruct"
                     and so on.
         model_name (str): the name of the served model. It can be accessed
             by the RESTful API `/v1/models`. If it is not specified,
diff --git a/lmdeploy/serve/proxy/proxy.py b/lmdeploy/serve/proxy/proxy.py
index 667886273e..8653e56632 100644
--- a/lmdeploy/serve/proxy/proxy.py
+++ b/lmdeploy/serve/proxy/proxy.py
@@ -156,7 +156,7 @@ def add(self, node_url: str, status: Status | None = None):
             node_url (str): A http url. Can be the url generated by
                 `lmdeploy serve api_server`.
             description (dict): The description of the node. An example:
-                {'http://0.0.0.0:23333': {models: ['internlm-chat-7b]},
+                {'http://0.0.0.0:23333': {models: ['qwen2.5-7b']},
                 speed: -1}. The speed here can be RPM or other metric. All the
                 values of nodes should be the same metric.
         """
@@ -496,7 +496,7 @@ def add_node(node: Node, raw_request: Request = None):
     - **url** (str): A http url. Can be the url generated by
       `lmdeploy serve api_server`.
     - **status** (dict): The description of the node. An example:
-      ``{models: ['internlm-chat-7b],  speed: 1}``. The speed here can be
+      ``{models: ['qwen2.5-7b'],  speed: 1}``. The speed here can be
       RPM or other metric. All the values of nodes should be the same metric.
     """
     try:
diff --git a/lmdeploy/turbomind/supported_models.py b/lmdeploy/turbomind/supported_models.py
index 95f6f24d32..62f9a69ed9 100644
--- a/lmdeploy/turbomind/supported_models.py
+++ b/lmdeploy/turbomind/supported_models.py
@@ -43,11 +43,10 @@ def is_supported(model_path: str, trust_remote_code: bool = False):
                     ii) and iii).
                 - ii) The model_id of a lmdeploy-quantized model hosted
                     inside a model repo on huggingface.co, such as
-                    "InternLM/internlm-chat-20b-4bit",
                     "lmdeploy/llama2-chat-70b-4bit", etc.
                 - iii) The model_id of a model hosted inside a model repo
-                    on huggingface.co, such as "internlm/internlm-chat-7b",
-                    "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
+                    on huggingface.co, such as "internlm/internlm2-chat-7b",
+                    "Qwen/Qwen2.5-7B-Instruct"
                     and so on.
     Returns:
         support_by_turbomind (bool): Whether input model is supported by turbomind engine
diff --git a/lmdeploy/turbomind/tokenizer_info.py b/lmdeploy/turbomind/tokenizer_info.py
index d03eca023b..94558efd58 100644
--- a/lmdeploy/turbomind/tokenizer_info.py
+++ b/lmdeploy/turbomind/tokenizer_info.py
@@ -34,7 +34,7 @@ class VocabType(Enum):
     """The vocabulary is in the raw format.
 
     The tokens in the vocabulary are kept in their original form without any processing. This kind of tokenizer includes
-    the tiktoken tokenizer, e.g. microsoft/Phi-3-small-8k-instruct, Qwen/Qwen-7B-Chat, etc.
+    the tiktoken tokenizer, e.g. microsoft/Phi-3-small-8k-instruct, etc.
     """
 
     BYTE_FALLBACK = 1
@@ -232,7 +232,7 @@ def from_huggingface(
 
         elif TokenizerInfo._is_tiktoken_tokenizer(tokenizer):
             # tiktoken tokenizer
-            # e.g. Phi-3-small-8k-instruct, Qwen-7B-Chat, stablelm-2-12b-chat (previously)
+            # e.g. Phi-3-small-8k-instruct, stablelm-2-12b-chat (previously)
             if stop_token_ids is None:
                 if hasattr(tokenizer, 'eos_token_id') and tokenizer.eos_token_id is not None:
                     stop_token_ids = [tokenizer.eos_token_id]
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index 2df03cf340..b8e44cdc9d 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -353,11 +353,10 @@ def from_pretrained(cls,
                       ii) and iii)
                     - ii) The model_id of a lmdeploy-quantized model hosted
                       inside a model repo on huggingface.co, such as
-                      "InternLM/internlm-chat-20b-4bit",
                       "lmdeploy/llama2-chat-70b-4bit", etc.
                     - iii) The model_id of a model hosted inside a model repo
-                      on huggingface.co, such as "internlm/internlm-chat-7b",
-                      "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
+                      on huggingface.co, such as "internlm/internlm2-chat-7b",
+                      "Qwen/Qwen2.5-7B-Instruct"
                       and so on.
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update configuration when initialize the engine.
diff --git a/lmdeploy/vl/model/builder.py b/lmdeploy/vl/model/builder.py
index 9b4e4f429e..89e6c3e7cf 100644
--- a/lmdeploy/vl/model/builder.py
+++ b/lmdeploy/vl/model/builder.py
@@ -23,10 +23,8 @@
 from .llava_hf import LlavaHfVisionModel  # noqa F401
 from .llava_next import LlavaNextVisionModel  # noqa F401
 from .minicpmv import MiniCPMVModel  # noqa F401
-from .mllama import MllamaVLModel  # noqa F401
 from .molmo import MolmoVisionModel  # noqa F401
 from .phi3_vision import Phi3VisionModel  # noqa F401
-from .qwen import QwenVisionModel  # noqa F401
 from .qwen2 import Qwen2VLModel  # noqa F401
 from .qwen3 import Qwen3VLModel  # noqa F401
 from .qwen3_5 import Qwen3_5Model  # noqa F401
diff --git a/lmdeploy/vl/model/mllama.py b/lmdeploy/vl/model/mllama.py
deleted file mode 100644
index b2801b2726..0000000000
--- a/lmdeploy/vl/model/mllama.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
-
-
-def check_transformers():
-    try:
-        from transformers import MllamaForConditionalGeneration  # noqa: F401
-    except ImportError:
-        raise ImportError('please install latest transformers by '
-                          'pip install git+https://github.com/huggingface/transformers.git')
-
-
-@VISION_MODELS.register_module()
-class MllamaVLModel(VisionModel):
-    """llama3.2 model."""
-
-    _arch = 'MllamaForConditionalGeneration'
-
-    def build_preprocessor(self, trust_remote_code: bool = False):
-        from transformers import AutoProcessor
-        self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=trust_remote_code)
-        self.image_token_id = 128256
-
-    def preprocess(self, messages: list[dict]) -> list[dict]:
-        """Refer to the spec of `super().preprocess`"""
-        images = self.collect_multimodal_items(messages)
-        outputs = []
-        for modality, image, params in images:
-            results = self.processor.image_processor(images=image, return_tensors='pt')
-            results.update(image_size=image.size, image_tokens=1, image_token_id=self.image_token_id)
-            outputs.append(results)
-        messages.append(dict(role='preprocess', content=outputs))
-        return messages
-
-    def build_model(self, trust_remote_code: bool = False):
-        check_transformers()
-        if self.with_llm:
-            from transformers import MllamaForConditionalGeneration
-            model = MllamaForConditionalGeneration.from_pretrained(self.model_path, device_map='cpu',
-                                                                   trust_remote_code=trust_remote_code)
-            self.vl_model = model
-        else:
-            raise NotImplementedError('turbomind has not supported mllama yet')
-
-    @staticmethod
-    def proc_messages(messages, chat_template, sequence_start):
-        """Apply chat template to get the prompt."""
-        prompt_messages = []
-        IMAGE_TOKEN = '<|image|>'
-        for message in messages:
-            if isinstance(message['content'], str):
-                prompt_messages.append(message)
-                continue
-            elif message['role'] in ['images', 'preprocess', 'forward']:
-                continue
-            n_images = len([1 for x in message['content'] if x['type'] == 'image'])
-            content = [item['text'] for item in message['content'] if item['type'] == 'text']
-            prompt = (IMAGE_TOKEN) * n_images + content[0]
-            prompt_messages.append(dict(role='user', content=prompt))
-        prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
-        return prompt, IMAGE_TOKEN
-
-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
-        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
diff --git a/lmdeploy/vl/model/qwen.py b/lmdeploy/vl/model/qwen.py
deleted file mode 100644
index 0402294ff4..0000000000
--- a/lmdeploy/vl/model/qwen.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-
-import torch
-from transformers import AutoModelForCausalLM
-
-from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
-from lmdeploy.vl.model.utils import disable_logging
-
-logger = get_logger('lmdeploy')
-
-
-@VISION_MODELS.register_module()
-class QwenVisionModel(VisionModel):
-    """Qwen vision model."""
-
-    _arch = 'QWenLMHeadModel'
-
-    def build_preprocessor(self, trust_remote_code: bool = False):
-        from torchvision import transforms
-        from torchvision.transforms import InterpolationMode
-        mean = (0.48145466, 0.4578275, 0.40821073)
-        std = (0.26862954, 0.26130258, 0.27577711)
-        image_size = self.hf_config.visual['image_size']
-        self.image_transform = transforms.Compose([
-            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=mean, std=std),
-        ])
-
-    def build_model(self, trust_remote_code: bool = False):
-        """Build the vision part of a VLM model when backend is turbomind, or
-        load the whole VLM model when `self.with_llm==True`"""
-        from accelerate import init_empty_weights
-        with init_empty_weights():
-            config = self.hf_config
-            config.quantization_config = {}  # disable vision part quantization
-            model = AutoModelForCausalLM.from_config(config, trust_remote_code=trust_remote_code)
-            self.vl_model = model
-            if not self.with_llm:
-                del model.lm_head
-                for key in ['wte', 'h', 'ln_f']:
-                    setattr(model.transformer, key, None)
-
-        from accelerate.utils import get_balanced_memory, infer_auto_device_map
-        max_memory = get_balanced_memory(model,
-                                         max_memory=self.max_memory,
-                                         dtype=torch.half,
-                                         no_split_module_classes=['VisualAttentionBlock', 'Resampler'])
-        device_map = infer_auto_device_map(model,
-                                           no_split_module_classes=['VisualAttentionBlock', 'Resampler'],
-                                           max_memory=max_memory,
-                                           dtype=torch.half)
-        same_device_keys = [('transformer.visual.conv1', 'transformer.visual.positional_embedding'),
-                            ('transformer.visual.ln_post', 'transformer.visual.proj')]
-        for (a, b) in same_device_keys:
-            if a in device_map and b in device_map:
-                device_map[b] = device_map[a]
-
-        from accelerate import load_checkpoint_and_dispatch
-        with disable_logging():
-            load_checkpoint_and_dispatch(model=model,
-                                         checkpoint=self.model_path,
-                                         device_map=device_map if not self.with_llm else {'': 'cpu'},
-                                         no_split_module_classes=['VisualAttentionBlock'],
-                                         dtype=torch.half)
-
-        self.model = model.transformer.visual.eval()
-
-    def preprocess(self, messages: list[dict]) -> list[dict]:
-        """Refers to `super.preprocess() for spec."""
-        images = self.collect_multimodal_items(messages)
-        outputs = []
-        for modality, image, params in images:
-            pixel_values = self.image_transform(image)
-            outputs.append(
-                dict(pixel_values=pixel_values,
-                     image_size=image.size,
-                     image_tokens=256,
-                     image_token_id=self.image_token_id))
-        messages.append(dict(role='preprocess', content=outputs))
-        return messages
-
-    @torch.no_grad()
-    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
-        """Extract image feature. ONLY implement it when the backend is
-        turbomind engine.
-
-        Args:
-            messages(list[dict]): the outputs of `preprocess`
-            max_batch_size(int): the max batch size when forwarding vision
-                model
-        Return:
-            the message list with forwarding results included
-        """
-        inputs = [x['content'] for x in messages if x['role'] == 'preprocess']
-        inputs = inputs[0]
-        outputs = []
-        for idx in range(0, len(inputs), max_batch_size):
-            pixel_values = [x['pixel_values'] for x in inputs[idx:idx + max_batch_size]]
-            pixel_values = torch.stack(pixel_values, dim=0)
-            logger.info(f'vision forward shape: {pixel_values.shape}')
-            feats = self.model(pixel_values)
-            feats = torch.split(feats, 1, dim=0)
-            outputs.extend([x.squeeze() for x in feats])
-        messages.append(dict(role='forward', content=outputs))
-        return messages
-
-    @staticmethod
-    def proc_messages(messages, chat_template, sequence_start):
-        """Apply chat template to get the prompt."""
-        prompt_messages = []
-        IMAGE_TOKEN = '<IMAGE_TOKEN>'
-        for message in messages:
-            if isinstance(message['content'], str):
-                prompt_messages.append(message)
-                continue
-            elif message['role'] in ['images', 'preprocess', 'forward']:
-                continue
-            n_images = len([1 for x in message['content'] if x['type'] == 'image'])
-            content = [x.get('text', '') for x in message['content'] if x['type'] == 'text']
-            prompt = content[0]
-            if IMAGE_TOKEN in prompt:
-                pass
-            else:
-                prompt = ''.join([f'Picture {str(i)}:{IMAGE_TOKEN}\n' for i in range(n_images)]) + prompt
-            prompt_messages.append(dict(role='user', content=prompt))
-        prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
-        return prompt, IMAGE_TOKEN
-
-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
-        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
-
-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start)
-        return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
diff --git a/tests/pytorch/test_removed_models.py b/tests/pytorch/test_removed_models.py
new file mode 100644
index 0000000000..8d15a16897
--- /dev/null
+++ b/tests/pytorch/test_removed_models.py
@@ -0,0 +1,21 @@
+import pytest
+import torch
+from transformers import PretrainedConfig
+
+from lmdeploy.pytorch.models.patch import build_model_from_hf_config
+
+
+@pytest.mark.parametrize('arch', [
+    'InternLMForCausalLM',
+    'QWenLMHeadModel',
+    'BaiChuanForCausalLM',
+    'BaichuanForCausalLM',
+    'Starcoder2ForCausalLM',
+    'InternLM2VEForCausalLM',
+    'MllamaForConditionalGeneration',
+])
+def test_removed_model_error(arch):
+    config = PretrainedConfig(architectures=[arch])
+
+    with pytest.raises(RuntimeError, match='support has been removed from LMDeploy'):
+        build_model_from_hf_config(config, device=torch.device('cpu'))
diff --git a/tests/test_lmdeploy/test_messages.py b/tests/test_lmdeploy/test_messages.py
index cb57079f07..fc6dd81e70 100644
--- a/tests/test_lmdeploy/test_messages.py
+++ b/tests/test_lmdeploy/test_messages.py
@@ -23,7 +23,7 @@ def test_chat_completion_request_repetition_ngram_ge_zero():
 
 
 def test_engine_generation_config():
-    tokenizer = Tokenizer('internlm/internlm-chat-7b', trust_remote_code=True)
+    tokenizer = Tokenizer('internlm/internlm2-chat-7b', trust_remote_code=True)
     config = GenerationConfig(n=3, stop_words=['<eoa>'])
     stop_token_ids = tokenizer.encode('<eoa>', add_bos=False)
     config.convert_stop_bad_words_to_ids(tokenizer)
diff --git a/tests/test_lmdeploy/test_model.py b/tests/test_lmdeploy/test_model.py
index 6fc2ecd089..0fbdf6dc0d 100644
--- a/tests/test_lmdeploy/test_model.py
+++ b/tests/test_lmdeploy/test_model.py
@@ -81,7 +81,7 @@ def test_HFChatTemplate_message2prompt_sequence_start_True(model_path):
 
 
 def test_base_model():
-    model = MODELS.get('internlm')(capability='completion')
+    model = MODELS.get('base')(capability='completion')
     assert model.capability == 'completion'
     assert model.get_prompt('hi') == 'hi'
     assert model.messages2prompt('test') == 'test'
@@ -112,38 +112,6 @@ def test_prefix_response():
     assert prompt[-len('prefix test'):] == 'prefix test'
 
 
-def test_internlm_chat():
-    prompt = 'hello, can u introduce yourself'
-    model = MODELS.get('internlm')(capability='completion')
-    assert model.get_prompt(prompt, sequence_start=True) == prompt
-    assert model.get_prompt(prompt, sequence_start=False) == prompt
-    assert model.stop_words is not None
-    assert model.system == '<|System|>:'
-
-    model = MODELS.get('internlm')(capability='chat', system='Provide answers in Python')
-    assert model.get_prompt(prompt, sequence_start=True) != prompt
-    assert model.get_prompt(prompt, sequence_start=False) != prompt
-    assert model.system == 'Provide answers in Python'
-
-    model = MODELS.get('internlm')(capability='voice')
-    _prompt = None
-    with pytest.raises(AssertionError):
-        _prompt = model.get_prompt(prompt, sequence_start=True)
-        assert _prompt is None
-
-
-def test_baichuan():
-    prompt = 'hello, can u introduce yourself'
-    model = MODELS.get('baichuan2')(capability='completion')
-    assert model.get_prompt(prompt, sequence_start=True) == prompt
-    assert model.get_prompt(prompt, sequence_start=False) == prompt
-    assert model.stop_words is None
-
-    model = MODELS.get('baichuan2')(capability='chat')
-    _prompt = model.get_prompt(prompt, sequence_start=True)
-    assert _prompt == '<reserved_106>' + prompt + '<reserved_107>'
-
-
 def test_llama2():
     prompt = 'hello, can u introduce yourself'
     model = MODELS.get('llama2')(capability='completion')
diff --git a/tests/test_lmdeploy/test_tokenizer.py b/tests/test_lmdeploy/test_tokenizer.py
index 5eb659de5e..3a3cbdbd19 100644
--- a/tests/test_lmdeploy/test_tokenizer.py
+++ b/tests/test_lmdeploy/test_tokenizer.py
@@ -6,8 +6,8 @@
 
 
 @pytest.mark.parametrize('model_path', [
-    'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat', 'baichuan-inc/Baichuan2-7B-Chat', 'upstage/SOLAR-0-70b-16bit',
-    'baichuan-inc/Baichuan-7B', 'codellama/CodeLlama-7b-hf', 'THUDM/chatglm2-6b', '01-ai/Yi-6B-200k',
+    'internlm/internlm2-chat-7b', 'Qwen/Qwen2.5-7B-Instruct', 'upstage/SOLAR-0-70b-16bit',
+    'codellama/CodeLlama-7b-hf', 'THUDM/chatglm2-6b', '01-ai/Yi-6B-200k',
     '01-ai/Yi-34B-Chat', '01-ai/Yi-6B-Chat', 'WizardLM/WizardLM-70B-V1.0', 'codellama/CodeLlama-34b-Instruct-hf'
 ])
 @pytest.mark.parametrize('input', [' hi, this is a test 😆😆! 為什麼我還在用繁體字 😆😆       ' * 5])
@@ -32,7 +32,7 @@ def test_tokenizer(model_path, input, interval, add_special_tokens, skip_special
 
 
 @pytest.mark.parametrize('model_path', [
-    'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat', 'baichuan-inc/Baichuan2-7B-Chat', 'codellama/CodeLlama-7b-hf',
+    'internlm/internlm2-chat-7b', 'Qwen/Qwen2.5-7B-Instruct', 'codellama/CodeLlama-7b-hf',
     'upstage/SOLAR-0-70b-16bit'
 ])
 @pytest.mark.parametrize('stop_words', ['.', ' ', '?', ''])

From 45c3bd161d20fbda04b8d0ad477054a72399baab Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 22 Jun 2026 17:20:52 +0800
Subject: [PATCH 2/8] chore: remove legacy vl adapters

---
 lmdeploy/vl/model/builder.py        |   2 -
 lmdeploy/vl/model/internvl_llava.py | 161 ----------------------------
 lmdeploy/vl/model/llava.py          |   8 --
 lmdeploy/vl/model/yi.py             | 133 -----------------------
 4 files changed, 304 deletions(-)
 delete mode 100644 lmdeploy/vl/model/internvl_llava.py
 delete mode 100644 lmdeploy/vl/model/yi.py

diff --git a/lmdeploy/vl/model/builder.py b/lmdeploy/vl/model/builder.py
index 89e6c3e7cf..58b29a7e86 100644
--- a/lmdeploy/vl/model/builder.py
+++ b/lmdeploy/vl/model/builder.py
@@ -17,7 +17,6 @@
 from .interns1_pro import InternS1ProVisionModel  # noqa F401
 from .internvl import InternVLVisionModel  # noqa F401
 from .internvl3_hf import InternVL3VisionModel  # noqa F401
-from .internvl_llava import InternVLLlavaVisionModel  # noqa F401
 from .llama4 import LLama4VisionModel  # noqa F401
 from .llava import LlavaVisionModel  # noqa F401
 from .llava_hf import LlavaHfVisionModel  # noqa F401
@@ -30,7 +29,6 @@
 from .qwen3_5 import Qwen3_5Model  # noqa F401
 from .qwen3_omni import Qwen3OmniModel  # noqa F401
 from .xcomposer2 import Xcomposer2VisionModel  # noqa F401
-from .yi import YiVisionModel  # noqa F401
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/internvl_llava.py b/lmdeploy/vl/model/internvl_llava.py
deleted file mode 100644
index 5288328bd8..0000000000
--- a/lmdeploy/vl/model/internvl_llava.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import warnings
-from contextlib import contextmanager
-
-import torch
-from transformers import AutoConfig, AutoModelForCausalLM
-
-from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.llava import VISION_MODELS, LlavaVisionModel
-from lmdeploy.vl.model.utils import rewrite_ctx
-
-from .utils import disable_logging, disable_transformers_logging
-
-logger = get_logger('lmdeploy')
-
-
-def check_llava_install():
-    try:
-        from llava.model.multimodal_encoder.clip_encoder import InternVisionModel  # noqa: F401
-    except ImportError:
-        raise ImportError(
-            'To use LlavaVLModel, please install llava by '
-            '`pip install git+https://github.com/OpenGVLab/InternVL#subdirectory=internvl_chat_llava --no-deps`')
-
-
-def _intern_vision_model__from_pretrained(vision_tower_name: str):
-    logger.info(f'init empty InternVisionModel: {vision_tower_name}')
-    from llava.model.multimodal_encoder.intern_vit_6b.modeling_intern_vit import InternVisionConfig, InternVisionModel
-    config = InternVisionConfig.from_pretrained(vision_tower_name)
-    model = InternVisionModel._from_config(config)
-    model.requires_grad_(False)
-    return model
-
-
-def _intern_vl_model__from_pretrained(vision_tower_name: str):
-    logger.info(f'init empty InternVLModel: {vision_tower_name}')
-
-    from llava.model.multimodal_encoder.internvl_14b.modeling_internvl import InternVLConfig, InternVLModel
-
-    config = InternVLConfig.from_pretrained(vision_tower_name)
-    model = InternVLModel._from_config(config)
-    model.requires_grad_(False)
-    return model
-
-
-@contextmanager
-def init_empty_vit():
-    """Skip download vision model if possible."""
-    origin_func_path = [
-        'llava.model.multimodal_encoder.intern_vit_6b.modeling_intern_vit.InternVisionModel.from_pretrained',  # noqa: E501
-        'llava.model.multimodal_encoder.internvl_14b.modeling_internvl.InternVLModel.from_pretrained',  # noqa: E501
-    ]
-    rewrite_func = [_intern_vision_model__from_pretrained, _intern_vl_model__from_pretrained]
-    with rewrite_ctx(origin_func_path, rewrite_func):
-        yield
-
-
-@VISION_MODELS.register_module()
-class InternVLLlavaVisionModel(LlavaVisionModel):
-    """Llava visual model."""
-
-    @classmethod
-    def match(cls, config: AutoConfig):
-        """Check whether the config match the model."""
-        arch = config.architectures[0] if config.architectures else None
-        if arch == 'LlavaLlamaForCausalLM':
-            mm_vision_tower = getattr(config, 'mm_vision_tower', '')
-            if 'OpenGVLab' in mm_vision_tower:
-                return True
-        return False
-
-    def build_preprocessor(self, trust_remote_code: bool = False):
-        return super().build_preprocessor(trust_remote_code=trust_remote_code)
-
-    def build_model(self, trust_remote_code: bool = False):
-        """Build the vision part of a VLM model when backend is turbomind, or
-        load the whole VLM model when `self.with_llm==True`"""
-        check_llava_install()
-        # currently, only support llava llama
-        from llava.model.language_model.llava_llama import LlavaConfig, LlavaLlamaForCausalLM  # noqa
-        self.config = LlavaConfig.from_pretrained(self.model_path, trust_remote_code=trust_remote_code)
-        assert self.config.model_type in ['llava', 'llava_llama'], \
-            'currently, only support llava llama'
-
-        # init empty model, skip layer initialization
-        from accelerate import init_empty_weights
-        with init_empty_weights(), warnings.catch_warnings(), \
-                disable_transformers_logging():
-            warnings.simplefilter('ignore')
-            self.config.quantization_config = {}  # disable vision part quantization
-            model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=trust_remote_code)
-            self.vl_model = model
-            if not self.with_llm:
-                del model.lm_head
-                del model.model.embed_tokens
-                del model.model.layers
-                del model.model.norm
-
-            with init_empty_vit():
-                vision_tower = model.get_vision_tower()
-                vision_tower.is_loaded = False
-                vision_tower.load_model()
-            crop_size = vision_tower.image_processor.crop_size['height']
-            image_size = vision_tower.config.image_size
-            patch_size = vision_tower.config.patch_size
-            if crop_size != image_size:
-                vision_tower.vision_tower.resize_pos_embeddings(image_size, crop_size, patch_size)
-                vision_tower.vision_tower.embeddings.image_size = crop_size
-                vision_tower.config.image_size = crop_size
-                vision_tower.image_processor.crop_size = dict(height=crop_size, width=crop_size)
-                vision_tower.image_processor.size = dict(shortest_edge=crop_size)
-
-        from accelerate import load_checkpoint_and_dispatch
-        with disable_logging():
-            load_checkpoint_and_dispatch(model=model,
-                                         max_memory=self.max_memory,
-                                         checkpoint=self.model_path,
-                                         device_map='auto' if not self.with_llm else {'': 'cpu'},
-                                         no_split_module_classes=['InternVisionEncoderLayer'],
-                                         dtype=torch.half)
-
-        self.model = model.model.eval()
-        self.vision_tower = model.model.vision_tower.eval()
-        self.mm_projector = model.model.mm_projector.eval()
-
-    def preprocess(self, messages: list[dict]) -> list[dict]:
-        """Refer to `super().preprocess() for spec."""
-        return super().preprocess(messages)
-
-    @torch.no_grad()
-    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
-        """Extract image feature. ONLY implement it when the backend is
-        turbomind engine.
-
-        Args:
-            messages(list[dict]): the outputs of `preprocess`
-            max_batch_size(int): the max batch size when forwarding vision
-                model
-        Return:
-            the message list with forwarding results included
-        """
-        inputs = [x['content'] for x in messages if x['role'] == 'preprocess']
-        inputs = inputs[0]
-        outputs = []
-        for idx in range(0, len(inputs), max_batch_size):
-            pixel_values = [x['pixel_values'] for x in inputs[idx:idx + max_batch_size]]
-            split_sizes = [x.shape[0] for x in pixel_values]
-            pixel_values = torch.cat(pixel_values, dim=0)
-            pixel_values = pixel_values.to(device=self.vision_tower.device, dtype=torch.float16)
-            logger.info(f'vision forward shape: {pixel_values.shape}')
-            if pixel_values.ndim == 5:
-                feats = self.encode_images(pixel_values)
-                feats = torch.split(feats, split_sizes, dim=0)
-                feats = [x.flatten(0, 1) for x in feats]
-            else:
-                feats = self.encode_images(pixel_values)
-                feats = [x for x in feats]
-            outputs.extend(feats)
-        messages.append(dict(role='forward', content=outputs))
-        return messages
diff --git a/lmdeploy/vl/model/llava.py b/lmdeploy/vl/model/llava.py
index 3a8485e845..803b2067cb 100644
--- a/lmdeploy/vl/model/llava.py
+++ b/lmdeploy/vl/model/llava.py
@@ -209,14 +209,6 @@ def match(cls, config: AutoConfig):
         """Check whether the config match the model."""
         arch = config.architectures[0] if config.architectures else None
         if arch in ['LlavaLlamaForCausalLM', 'LlavaMistralForCausalLM']:
-            # internvl-llava has vision_tower of OpenGVLab/xxx
-            mm_vision_tower = getattr(config, 'mm_vision_tower', '')
-            # yi-vl has projector type of xxx_Norm
-            projector_type = getattr(config, 'mm_projector_type', 'linear')
-            if '_Norm' in projector_type:
-                return False
-            if 'OpenGVLab' in mm_vision_tower:
-                return False
             return True
         return False
 
diff --git a/lmdeploy/vl/model/yi.py b/lmdeploy/vl/model/yi.py
deleted file mode 100644
index 03d66bce2e..0000000000
--- a/lmdeploy/vl/model/yi.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import os
-from contextlib import contextmanager
-from os import path as osp
-
-import torch.nn as nn
-from transformers import AutoConfig
-
-from lmdeploy.vl.model.base import VISION_MODELS
-from lmdeploy.vl.model.llava import LlavaVisionModel, check_llava_install, process_images
-
-from .utils import disable_transformers_logging, rewrite_ctx
-
-_model_path = None
-
-
-def _build_vision_projector(config, delay_load=False, **kwargs):
-    """Build yi projector."""
-    # copy from https://github.com/01-ai/Yi/blob/main/VL/llava/model/multimodal_projector/builder.py # noqa: E501
-    projector_type = getattr(config, 'mm_projector_type', 'linear')
-
-    if projector_type == 'linear':
-        return nn.Linear(config.mm_hidden_size, config.hidden_size)
-
-    import re
-    use_norm = False
-    if '_Norm' in projector_type:
-        use_norm = True
-        projector_type = projector_type.replace('_Norm', '')
-    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
-    if mlp_gelu_match:
-        mlp_depth = int(mlp_gelu_match.group(1))
-        if use_norm:
-            modules = [
-                nn.Linear(config.mm_hidden_size, config.hidden_size),
-                nn.LayerNorm(config.hidden_size),
-            ]
-        else:
-            modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
-        for _ in range(1, mlp_depth):
-            modules.append(nn.GELU())
-            if use_norm:
-                modules.append(nn.Linear(config.hidden_size, config.hidden_size))
-                modules.append(nn.LayerNorm(config.hidden_size))
-            else:
-                modules.append(nn.Linear(config.hidden_size, config.hidden_size))
-        return nn.Sequential(*modules)
-
-    if projector_type == 'identity':
-        return nn.Identity()
-
-    raise ValueError(f'Unknown projector type: {projector_type}')
-
-
-def _build_vision_tower(vision_tower_cfg, **kwargs):
-    """Build yi vision tower."""
-    cfg = vision_tower_cfg
-    vision_tower = getattr(cfg, 'mm_vision_tower', getattr(cfg, 'vision_tower', None))
-    if os.path.exists(os.path.join(_model_path, vision_tower)):
-        vision_tower = os.path.join(_model_path, vision_tower)
-
-    from llava.model.multimodal_encoder.clip_encoder import CLIPVisionTower
-    is_absolute_path_exists = os.path.exists(vision_tower)
-    if is_absolute_path_exists or vision_tower.startswith('openai') or vision_tower.startswith(
-            'laion') or 'ShareGPT4V' in vision_tower:
-        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
-
-    raise ValueError(f'Unknown vision tower: {vision_tower}')
-
-
-@contextmanager
-def init_yi_model():
-    origin_func_path = [
-        'llava.model.multimodal_projector.builder.build_vision_projector',
-        'llava.model.multimodal_encoder.builder.build_vision_tower'
-    ]
-    rewrite_func = [_build_vision_projector, _build_vision_tower]
-    with rewrite_ctx(origin_func_path, rewrite_func):
-        yield
-
-
-@VISION_MODELS.register_module()
-class YiVisionModel(LlavaVisionModel):
-    """Yi visual model."""
-
-    @classmethod
-    def match(cls, config: AutoConfig):
-        """Check whether the config match the model."""
-        arch = config.architectures[0] if config.architectures else None
-        if arch == 'LlavaLlamaForCausalLM':
-            projector_type = getattr(config, 'mm_projector_type', 'linear')
-            if '_Norm' in projector_type:
-                return True
-        return False
-
-    def build_preprocessor(self, trust_remote_code: bool = False):
-        from transformers import CLIPImageProcessor
-        vision_tower_name = osp.join(self.model_path, self.hf_config.mm_vision_tower)
-        self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name,
-                                                                  trust_remote_code=trust_remote_code)
-        config = AutoConfig.from_pretrained(vision_tower_name,
-                                            trust_remote_code=trust_remote_code)
-        image_size = config.image_size
-        patch_size = config.patch_size
-        self.n_token_per_image = (image_size // patch_size)**2
-        if self.hf_config.mm_vision_select_feature == 'cls_patch':
-            self.n_token_per_image += 1
-
-    def build_model(self, trust_remote_code: bool = False):
-        """Build the vision part of a VLM model when backend is turbomind, or
-        load the whole VLM model when `self.with_llm==True`"""
-        check_llava_install()
-
-        global _model_path
-        _model_path = self.model_path
-
-        with init_yi_model(), disable_transformers_logging():
-            super().build_model(trust_remote_code=trust_remote_code)
-
-    def preprocess(self, messages: list[dict]) -> list[dict]:
-        """Refer to `super().preprocess() for spec."""
-        images = self.collect_multimodal_items(messages)
-        outputs = []
-        for modality, image, params in images:
-            pixel_values = process_images([image], self.image_processor, self.config)
-            outputs.append(
-                dict(pixel_values=pixel_values,
-                     image_size=image.size,
-                     image_tokens=self.n_token_per_image,
-                     image_token_id=self.image_token_id))
-        messages.append(dict(role='preprocess', content=outputs))
-        return messages

From fbd4c8572a1b9326cd4cc053a3768d996d0e2117 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 22 Jun 2026 17:26:39 +0800
Subject: [PATCH 3/8] test: drop coverage for removed models

---
 tests/pytorch/test_removed_models.py | 21 ---------------------
 1 file changed, 21 deletions(-)
 delete mode 100644 tests/pytorch/test_removed_models.py

diff --git a/tests/pytorch/test_removed_models.py b/tests/pytorch/test_removed_models.py
deleted file mode 100644
index 8d15a16897..0000000000
--- a/tests/pytorch/test_removed_models.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import pytest
-import torch
-from transformers import PretrainedConfig
-
-from lmdeploy.pytorch.models.patch import build_model_from_hf_config
-
-
-@pytest.mark.parametrize('arch', [
-    'InternLMForCausalLM',
-    'QWenLMHeadModel',
-    'BaiChuanForCausalLM',
-    'BaichuanForCausalLM',
-    'Starcoder2ForCausalLM',
-    'InternLM2VEForCausalLM',
-    'MllamaForConditionalGeneration',
-])
-def test_removed_model_error(arch):
-    config = PretrainedConfig(architectures=[arch])
-
-    with pytest.raises(RuntimeError, match='support has been removed from LMDeploy'):
-        build_model_from_hf_config(config, device=torch.device('cpu'))

From ab10bd385c1d6cdac0d2be4caa3cac18e4e4dca8 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 22 Jun 2026 17:33:58 +0800
Subject: [PATCH 4/8] docs: apply markdown formatting

---
 docs/en/multi_modal/qwen2_vl.md    | 6 +++---
 docs/zh_cn/multi_modal/qwen2_vl.md | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/en/multi_modal/qwen2_vl.md b/docs/en/multi_modal/qwen2_vl.md
index 5c7ddae402..425d5d8f28 100644
--- a/docs/en/multi_modal/qwen2_vl.md
+++ b/docs/en/multi_modal/qwen2_vl.md
@@ -2,9 +2,9 @@
 
 LMDeploy supports the following Qwen-VL series of models, which are detailed in the table below:
 
-|    Model     |  Size  | Supported Inference Engine |
-| :----------: | :----: | :------------------------: |
-|   Qwen2-VL   | 2B, 7B |          PyTorch           |
+|  Model   |  Size  | Supported Inference Engine |
+| :------: | :----: | :------------------------: |
+| Qwen2-VL | 2B, 7B |          PyTorch           |
 
 The next chapter demonstrates how to deploy an Qwen-VL model using LMDeploy, with [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example.
 
diff --git a/docs/zh_cn/multi_modal/qwen2_vl.md b/docs/zh_cn/multi_modal/qwen2_vl.md
index bcd7982192..1d2d18f30a 100644
--- a/docs/zh_cn/multi_modal/qwen2_vl.md
+++ b/docs/zh_cn/multi_modal/qwen2_vl.md
@@ -2,9 +2,9 @@
 
 LMDeploy 支持 Qwen-VL 系列模型，具体如下：
 
-|    Model     |  Size  | Supported Inference Engine |
-| :----------: | :----: | :------------------------: |
-|   Qwen2-VL   | 2B, 7B |          PyTorch           |
+|  Model   |  Size  | Supported Inference Engine |
+| :------: | :----: | :------------------------: |
+| Qwen2-VL | 2B, 7B |          PyTorch           |
 
 本文将以[Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)为例，演示使用 LMDeploy 部署 Qwen2-VL 系列模型的方法
 

From e9ec2a597ddd33d4e2c785fdc4117d02344430a0 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 22 Jun 2026 17:35:05 +0800
Subject: [PATCH 5/8] docs: align zh api server examples

---
 docs/zh_cn/llm/api_server.md           | 6 +++++-
 docs/zh_cn/llm/api_server_anthropic.md | 6 +++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md
index a8f8cd0cce..131e845a02 100644
--- a/docs/zh_cn/llm/api_server.md
+++ b/docs/zh_cn/llm/api_server.md
@@ -161,6 +161,10 @@ for item in api_client.completions_v1(model=model_name, prompt='hi'):
 
 参考 [api_server_tools](./api_server_tools.md)。
 
+### Anthropic 兼容接口
+
+参考 [api_server_anthropic](./api_server_anthropic.md)。
+
 ### OpenAI Responses 兼容接口
 
 参考 [api_server_responses](./api_server_responses.md)。
@@ -200,7 +204,7 @@ curl http://{server_ip}:{server_port}/v1/models
 curl http://{server_ip}:{server_port}/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "qwen2.5-7b",
+    "model": "intern-s2-preview",
     "messages": [{"role": "user", "content": "Hello! How are you?"}]
   }'
 ```
diff --git a/docs/zh_cn/llm/api_server_anthropic.md b/docs/zh_cn/llm/api_server_anthropic.md
index c4451bf7bf..88a8bc247d 100644
--- a/docs/zh_cn/llm/api_server_anthropic.md
+++ b/docs/zh_cn/llm/api_server_anthropic.md
@@ -29,7 +29,7 @@ curl http://{server_ip}:{server_port}/v1/messages \
   -H "content-type: application/json" \
   -H "anthropic-version: 2023-06-01" \
   -d '{
-    "model": "qwen2.5-7b",
+    "model": "intern-s2-preview",
     "max_tokens": 128,
     "messages": [{"role": "user", "content": "Hello from Anthropic client"}]
   }'
@@ -42,7 +42,7 @@ curl http://{server_ip}:{server_port}/v1/messages \
   -H "content-type: application/json" \
   -H "anthropic-version: 2023-06-01" \
   -d '{
-    "model": "qwen2.5-7b",
+    "model": "intern-s2-preview",
     "max_tokens": 128,
     "messages": [{"role": "user", "content": "Find lmdeploy docs"}],
     "tools": [{
@@ -78,7 +78,7 @@ curl http://{server_ip}:{server_port}/v1/messages/count_tokens \
   -H "content-type: application/json" \
   -H "anthropic-version: 2023-06-01" \
   -d '{
-    "model": "qwen2.5-7b",
+    "model": "intern-s2-preview",
     "system": "You are a helpful assistant.",
     "messages": [{"role": "user", "content": "Count these tokens"}]
   }'

From 03d4f44f160c97ec1bb3dd3acc663da1557f0ffd Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 22 Jun 2026 17:39:46 +0800
Subject: [PATCH 6/8] docs: address copilot review comments

---
 lmdeploy/cli/cli.py           | 7 +++----
 lmdeploy/cli/serve.py         | 7 +++----
 lmdeploy/serve/proxy/proxy.py | 8 ++++----
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py
index 8e6c97dcd6..e55918dc25 100644
--- a/lmdeploy/cli/cli.py
+++ b/lmdeploy/cli/cli.py
@@ -35,10 +35,9 @@ def add_parser_chat():
                             'download from ii) and iii). - ii) the model_id of a '
                             'lmdeploy-quantized model hosted inside a model repo on '
                             'huggingface.co, such as "lmdeploy/llama2-chat-70b-4bit",'
-                            ' "lmdeploy/llama2-chat-70b-4bit", etc. - iii) the model_id'
-                            ' of a model hosted inside a model repo on huggingface.co,'
-                            ' such as "internlm/internlm2-chat-7b", "qwen/qwen2.5-7b-instruct"'
-                            ' and so on')
+                            ' etc. - iii) the model_id of a model hosted inside a model'
+                            ' repo on huggingface.co, such as "internlm/internlm2_5-7b-chat",'
+                            ' "internlm/Intern-S2-Preview" and so on')
         # common args
         ArgumentHelper.backend(parser)
         # chat template args
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index a6097fe280..2085579acb 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -40,10 +40,9 @@ def add_parser_api_server():
                             'download from ii) and iii). - ii) the model_id of a '
                             'lmdeploy-quantized model hosted inside a model repo on '
                             'huggingface.co, such as "lmdeploy/llama2-chat-70b-4bit",'
-                            ' "lmdeploy/llama2-chat-70b-4bit", etc. - iii) the model_id'
-                            ' of a model hosted inside a model repo on huggingface.co,'
-                            ' such as "internlm/internlm2-chat-7b", "qwen/qwen2.5-7b-instruct"'
-                            ' and so on')
+                            ' etc. - iii) the model_id of a model hosted inside a model'
+                            ' repo on huggingface.co, such as "internlm/internlm2_5-7b-chat",'
+                            ' "internlm/Intern-S2-Preview" and so on')
         parser.add_argument('--server-name', type=str, default='0.0.0.0', help='Host ip for serving')
         parser.add_argument('--server-port', type=int, default=23333, help='Server port')
         parser.add_argument('--allow-origins',
diff --git a/lmdeploy/serve/proxy/proxy.py b/lmdeploy/serve/proxy/proxy.py
index 8653e56632..bb5231bb44 100644
--- a/lmdeploy/serve/proxy/proxy.py
+++ b/lmdeploy/serve/proxy/proxy.py
@@ -155,9 +155,9 @@ def add(self, node_url: str, status: Status | None = None):
         Args:
             node_url (str): A http url. Can be the url generated by
                 `lmdeploy serve api_server`.
-            description (dict): The description of the node. An example:
-                {'http://0.0.0.0:23333': {models: ['qwen2.5-7b']},
-                speed: -1}. The speed here can be RPM or other metric. All the
+            status (Status | None): The status of the node. An example:
+                {'models': ['intern-s2-preview'], 'speed': -1}.
+                The speed here can be RPM or other metric. All the
                 values of nodes should be the same metric.
         """
         if status is None:
@@ -496,7 +496,7 @@ def add_node(node: Node, raw_request: Request = None):
     - **url** (str): A http url. Can be the url generated by
       `lmdeploy serve api_server`.
     - **status** (dict): The description of the node. An example:
-      ``{models: ['qwen2.5-7b],  speed: 1}``. The speed here can be
+      ``{"models": ["intern-s2-preview"], "speed": 1}``. The speed here can be
       RPM or other metric. All the values of nodes should be the same metric.
     """
     try:

From 62406370c09dc61358eac09fe6ef9980f4e29db5 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Wed, 24 Jun 2026 11:15:46 +0800
Subject: [PATCH 7/8] chore: remove xcomposer support

---
 README.md                                     |   8 +-
 README_ja.md                                  |   8 +-
 README_zh-CN.md                               |   8 +-
 autotest/utils/get_run_config.py              |   4 -
 docs/en/multi_modal/index.rst                 |   1 -
 docs/en/multi_modal/xcomposer2d5.md           | 157 ----------
 docs/en/supported_models/supported_models.md  |   2 -
 docs/zh_cn/multi_modal/index.rst              |   1 -
 docs/zh_cn/multi_modal/xcomposer2d5.md        | 157 ----------
 .../supported_models/supported_models.md      |   2 -
 lmdeploy/archs.py                             |  17 +-
 lmdeploy/lite/apis/calibrate.py               |  14 +-
 lmdeploy/lite/quantization/calibration.py     |   6 -
 lmdeploy/vl/model/builder.py                  |   1 -
 lmdeploy/vl/model/xcomposer2.py               | 290 ------------------
 lmdeploy/vl/tools/merge_xcomposer2d5_task.py  |  65 ----
 16 files changed, 16 insertions(+), 725 deletions(-)
 delete mode 100644 docs/en/multi_modal/xcomposer2d5.md
 delete mode 100644 docs/zh_cn/multi_modal/xcomposer2d5.md
 delete mode 100644 lmdeploy/vl/model/xcomposer2.py
 delete mode 100644 lmdeploy/vl/tools/merge_xcomposer2d5_task.py

diff --git a/README.md b/README.md
index 5604f819b3..97b53a805a 100644
--- a/README.md
+++ b/README.md
@@ -52,11 +52,11 @@ ______________________________________________________________________
 - \[2024/09\] LMDeploy PyTorchEngine achieves 1.3x faster on Llama3-8B inference by introducing CUDA graph
 - \[2024/08\] LMDeploy is integrated into [modelscope/swift](https://github.com/modelscope/swift) as the default accelerator for VLMs inference
 - \[2024/07\] Support Llama3.1 8B, 70B and its TOOLS CALLING
-- \[2024/07\] Support [InternVL2](docs/en/multi_modal/internvl.md) full-series models, [InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md) and [function call](docs/en/llm/api_server_tools.md) of InternLM2.5
+- \[2024/07\] Support [InternVL2](docs/en/multi_modal/internvl.md) full-series models and [function call](docs/en/llm/api_server_tools.md) of InternLM2.5
 - \[2024/06\] PyTorch engine support DeepSeek-V2 and several VLMs, such as CogVLM2, Mini-InternVL, LlaVA-Next
 - \[2024/05\] Balance vision model when deploying VLMs with multiple GPUs
-- \[2024/05\] Support 4-bits weight-only quantization and inference on VLMs, such as InternVL v1.5, LLaVa, InternLMXComposer2
-- \[2024/04\] Support Llama3 and more VLMs, such as InternVL v1.1, v1.2, MiniGemini, InternLMXComposer2.
+- \[2024/05\] Support 4-bit weight-only quantization and inference on VLMs, such as InternVL v1.5 and LLaVa
+- \[2024/04\] Support Llama3 and more VLMs, such as InternVL v1.1, v1.2 and MiniGemini.
 - \[2024/04\] TurboMind adds online int8/int4 KV cache quantization and inference for all supported devices. Refer [here](docs/en/quantization/kv_quant.md) for detailed guide
 - \[2024/04\] TurboMind latest upgrade boosts GQA, rocketing the [internlm2-20b](https://huggingface.co/internlm/internlm2-20b) model inference to 16+ RPS, about 1.8x faster than vLLM.
 - \[2024/04\] Support Qwen1.5-MOE and dbrx.
@@ -171,8 +171,6 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
 <td>
 <ul>
   <li>LLaVA(1.5,1.6) (7B-34B)</li>
-  <li>InternLM-XComposer2 (7B, 4khd-7B)</li>
-  <li>InternLM-XComposer2.5 (7B)</li>
   <li>Qwen-VL (7B)</li>
   <li>Qwen2-VL (2B, 7B, 72B)</li>
   <li>Qwen2.5-VL (3B, 7B, 72B)</li>
diff --git a/README_ja.md b/README_ja.md
index 9a1eda7b3e..6de08b33da 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -37,11 +37,11 @@ ______________________________________________________________________
 
 - \[2024/08\] 🔥🔥 LMDeployは[modelscope/swift](https://github.com/modelscope/swift)に統合され、VLMs推論のデフォルトアクセラレータとなりました
 - \[2024/07\] 🎉🎉 Llama3.1 8B、70Bおよびそのツールコールをサポート
-- \[2024/07\] [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e)全シリーズモデル、[InternLM-XComposer2.5](docs/en/multi_modal/xcomposer2d5.md)およびInternLM2.5の[ファンクションコール](docs/en/llm/api_server_tools.md)をサポート
+- \[2024/07\] [InternVL2](https://huggingface.co/collections/OpenGVLab/internvl-20-667d3961ab5eb12c7ed1463e)全シリーズモデルおよびInternLM2.5の[ファンクションコール](docs/en/llm/api_server_tools.md)をサポート
 - \[2024/06\] PyTorchエンジンはDeepSeek-V2およびいくつかのVLMs、例えばCogVLM2、Mini-InternVL、LlaVA-Nextをサポート
 - \[2024/05\] 複数のGPUでVLMsをデプロイする際にビジョンモデルをバランスさせる
-- \[2024/05\] InternVL v1.5、LLaVa、InternLMXComposer2などのVLMsで4ビットの重みのみの量子化と推論をサポート
-- \[2024/04\] Llama3およびInternVL v1.1、v1.2、MiniGemini、InternLMXComposer2などのVLMモデルをサポート
+- \[2024/05\] InternVL v1.5、LLaVaなどのVLMsで4ビットの重みのみの量子化と推論をサポート
+- \[2024/04\] Llama3およびInternVL v1.1、v1.2、MiniGeminiなどのVLMモデルをサポート
 - \[2024/04\] TurboMindはすべてのサポートされているデバイスでのオンラインint8/int4 KVキャッシュ量子化と推論を追加しました。詳細なガイドは[こちら](docs/en/quantization/kv_quant.md)を参照してください
 - \[2024/04\] TurboMindの最新アップグレードによりGQAが強化され、[internlm2-20b](https://huggingface.co/internlm/internlm2-20b)モデルの推論が16+ RPSに達し、vLLMの約1.8倍の速さになりました
 - \[2024/04\] Qwen1.5-MOEおよびdbrxをサポート
@@ -158,8 +158,6 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
 <td>
 <ul>
   <li>LLaVA(1.5,1.6) (7B-34B)</li>
-  <li>InternLM-XComposer2 (7B, 4khd-7B)</li>
-  <li>InternLM-XComposer2.5 (7B)</li>
   <li>Qwen-VL (7B)</li>
   <li>Qwen2-VL (2B, 7B, 72B)</li>
   <li>Qwen2.5-VL (3B, 7B, 72B)</li>
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 0eb8da4c39..7971019413 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -52,11 +52,11 @@ ______________________________________________________________________
 - \[2024/09\] 通过引入 CUDA Graph，LMDeploy PyTorchEngine 在 Llama3-8B 推理上实现了 1.3 倍的加速
 - \[2024/08\] LMDeploy现已集成至 [modelscope/swift](https://github.com/modelscope/swift)，成为 VLMs 推理的默认加速引擎
 - \[2024/07\] 支持 Llama3.1 8B 和 70B 模型，以及工具调用功能
-- \[2024/07\] 支持 [InternVL2](docs/zh_cn/multi_modal/internvl.md) 全系列模型，[InternLM-XComposer2.5](docs/zh_cn/multi_modal/xcomposer2d5.md) 模型和 InternLM2.5 的 [function call 功能](docs/zh_cn/llm/api_server_tools.md)
+- \[2024/07\] 支持 [InternVL2](docs/zh_cn/multi_modal/internvl.md) 全系列模型和 InternLM2.5 的 [function call 功能](docs/zh_cn/llm/api_server_tools.md)
 - \[2024/06\] PyTorch engine 支持了 DeepSeek-V2 和若干 VLM 模型推理, 比如 CogVLM2，Mini-InternVL，LlaVA-Next
 - \[2024/05\] 在多 GPU 上部署 VLM 模型时，支持把视觉部分的模型均分到多卡上
-- \[2024/05\] 支持InternVL v1.5, LLaVa, InternLMXComposer2 等 VLMs 模型的 4bit 权重量化和推理
-- \[2024/04\] 支持 Llama3 和 InternVL v1.1, v1.2，MiniGemini，InternLM-XComposer2 等 VLM 模型
+- \[2024/05\] 支持 InternVL v1.5 和 LLaVa 等 VLMs 模型的 4bit 权重量化和推理
+- \[2024/04\] 支持 Llama3 和 InternVL v1.1, v1.2，MiniGemini 等 VLM 模型
 - \[2024/04\] TurboMind 支持 kv cache int4/int8 在线量化和推理，适用已支持的所有型号显卡。详情请参考[这里](docs/zh_cn/quantization/kv_quant.md)
 - \[2024/04\] TurboMind 引擎升级，优化 GQA 推理。[internlm2-20b](https://huggingface.co/internlm/internlm2-20b) 推理速度达 16+ RPS，约是 vLLM 的 1.8 倍
 - \[2024/04\] 支持 Qwen1.5-MOE 和 dbrx.
@@ -173,8 +173,6 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力，在各种规模的模型
 <td>
 <ul>
   <li>LLaVA(1.5,1.6) (7B-34B)</li>
-  <li>InternLM-XComposer2 (7B, 4khd-7B)</li>
-  <li>InternLM-XComposer2.5 (7B)</li>
   <li>Qwen-VL (7B)</li>
   <li>Qwen2-VL (2B, 7B, 72B)</li>
   <li>Qwen2.5-VL (3B, 7B, 72B)</li>
diff --git a/autotest/utils/get_run_config.py b/autotest/utils/get_run_config.py
index c17e30440b..e8ba2b230e 100644
--- a/autotest/utils/get_run_config.py
+++ b/autotest/utils/get_run_config.py
@@ -37,10 +37,6 @@ def get_model_name(model):
         return 'internvl-internlm2'
     if ('internlm2') in model_name:
         return 'internlm2'
-    if ('internlm-xcomposer2d5') in model_name:
-        return 'internlm-xcomposer2d5'
-    if ('internlm-xcomposer2') in model_name:
-        return 'internlm-xcomposer2'
     if ('glm-4') in model_name:
         return 'glm4'
     if len(model_name.split('-')) > 2 and '-'.join(model_name.split('-')[0:2]) in model_names:
diff --git a/docs/en/multi_modal/index.rst b/docs/en/multi_modal/index.rst
index a041172edb..92356eec16 100644
--- a/docs/en/multi_modal/index.rst
+++ b/docs/en/multi_modal/index.rst
@@ -8,7 +8,6 @@ Vision-Language Models
    deepseek_vl2.md
    llava.md
    internvl.md
-   xcomposer2d5.md
    cogvlm.md
    minicpmv.md
    phi3.md
diff --git a/docs/en/multi_modal/xcomposer2d5.md b/docs/en/multi_modal/xcomposer2d5.md
deleted file mode 100644
index 2f56b65ea1..0000000000
--- a/docs/en/multi_modal/xcomposer2d5.md
+++ /dev/null
@@ -1,157 +0,0 @@
-# InternLM-XComposer-2.5
-
-## Introduction
-
-[InternLM-XComposer-2.5](https://github.com/InternLM/InternLM-XComposer) excels in various text-image comprehension and composition applications, achieving GPT-4V level capabilities with merely 7B LLM backend. IXC-2.5 is trained with 24K interleaved image-text contexts, it can seamlessly extend to 96K long contexts via RoPE extrapolation. This long-context capability allows IXC-2.5 to perform exceptionally well in tasks requiring extensive input and output contexts. LMDeploy supports model [internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b)  in TurboMind engine.
-
-## Quick Start
-
-### Installation
-
-Please install LMDeploy by following the [installation guide](../get_started/installation.md), and install other packages that InternLM-XComposer-2.5 needs
-
-```shell
-pip install decord
-```
-
-### Offline inference pipeline
-
-The following sample code shows the basic usage of VLM pipeline. For more examples, please refer to [VLM Offline Inference Pipeline](./vl_pipeline.md)
-
-```python
-from lmdeploy import pipeline
-from lmdeploy.vl import load_image
-from lmdeploy.vl.constants import IMAGE_TOKEN
-
-pipe = pipeline('internlm/internlm-xcomposer2d5-7b')
-
-image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
-response = pipe((f'describe this image', image))
-print(response)
-```
-
-## Lora Model
-
-InternLM-XComposer-2.5 trained the LoRA weights for webpage creation and article writing. As TurboMind backend doesn't support slora, only one LoRA model can be deployed at a time, and the LoRA weights need to be merged when deploying the model. LMDeploy provides the corresponding conversion script, which is used as follows:
-
-```
-export HF_MODEL=internlm/internlm-xcomposer2d5-7b
-export WORK_DIR=internlm/internlm-xcomposer2d5-7b-web
-export TASK=web
-python -m lmdeploy.vl.tools.merge_xcomposer2d5_task $HF_MODEL $WORK_DIR --task $TASK
-```
-
-## Quantization
-
-The following takes the base model as an example to show the quantization method. If you want to use the LoRA model, please merge the LoRA model according to the previous section.
-
-```shell
-
-export HF_MODEL=internlm/internlm-xcomposer2d5-7b
-export WORK_DIR=internlm/internlm-xcomposer2d5-7b-4bit
-
-lmdeploy lite auto_awq \
-   $HF_MODEL \
-  --work-dir $WORK_DIR
-```
-
-## More examples
-
-<details>
-  <summary>
-    <b>Video Understanding</b>
-  </summary>
-
-The following uses the `pipeline.chat` interface api as an example to demonstrate its usage. Other interfaces apis also support inference but require manually splicing of conversation content.
-
-```python
-from lmdeploy import pipeline, GenerationConfig
-from transformers.dynamic_module_utils import get_class_from_dynamic_module
-
-HF_MODEL = 'internlm/internlm-xcomposer2d5-7b'
-load_video = get_class_from_dynamic_module('ixc_utils.load_video', HF_MODEL)
-frame2img = get_class_from_dynamic_module('ixc_utils.frame2img', HF_MODEL)
-Video_transform = get_class_from_dynamic_module('ixc_utils.Video_transform', HF_MODEL)
-get_font = get_class_from_dynamic_module('ixc_utils.get_font', HF_MODEL)
-
-video = load_video('liuxiang.mp4') # https://github.com/InternLM/InternLM-XComposer/raw/main/examples/liuxiang.mp4
-img = frame2img(video, get_font())
-img = Video_transform(img)
-
-pipe = pipeline(HF_MODEL)
-gen_config = GenerationConfig(top_k=50, top_p=0.8, temperature=1.0)
-query = 'Here are some frames of a video. Describe this video in detail'
-sess = pipe.chat((query, img), gen_config=gen_config)
-print(sess.response.text)
-
-query = 'tell me the athlete code of Liu Xiang'
-sess = pipe.chat(query, session=sess, gen_config=gen_config)
-print(sess.response.text)
-```
-
-</details>
-
-<details>
-  <summary>
-    <b>Multi-Image</b>
-  </summary>
-
-```python
-from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl import load_image
-
-query = f'Image1 {IMAGE_TOKEN}; Image2 {IMAGE_TOKEN}; Image3 {IMAGE_TOKEN}; I want to buy a car from the three given cars, analyze their advantages and weaknesses one by one'
-
-urls = ['https://raw.githubusercontent.com/InternLM/InternLM-XComposer/main/examples/cars1.jpg',
-        'https://raw.githubusercontent.com/InternLM/InternLM-XComposer/main/examples/cars2.jpg',
-        'https://raw.githubusercontent.com/InternLM/InternLM-XComposer/main/examples/cars3.jpg']
-images = [load_image(url) for url in urls]
-
-pipe = pipeline('internlm/internlm-xcomposer2d5-7b', log_level='INFO')
-output = pipe((query, images), gen_config=GenerationConfig(top_k=0, top_p=0.8, random_seed=89247526689433939))
-```
-
-Since LMDeploy does not support beam search, the generated results will be quite different from those using beam search with transformers. It is recommended to turn off top_k or use a larger top_k sampling to increase diversity.
-
-</details>
-
-<details>
-  <summary>
-    <b>Instruction to Webpage</b>
-  </summary>
-
-Please first convert the web model using the instructions above.
-
-```python
-from lmdeploy import pipeline, GenerationConfig
-
-pipe = pipeline('/nvme/shared/internlm-xcomposer2d5-7b-web', log_level='INFO')
-pipe.chat_template.meta_instruction = None
-
-query = 'A website for Research institutions. The name is Shanghai AI lab. Top Navigation Bar is blue.Below left, an image shows the logo of the lab. In the right, there is a passage of text below that describes the mission of the laboratory.There are several images to show the research projects of Shanghai AI lab.'
-output = pipe(query, gen_config=GenerationConfig(max_new_tokens=2048))
-```
-
-When using transformers for testing, it is found that if repetition_penalty is set, there is a high probability that the decode phase will not stop if `num_beams` is set to 1. As LMDeploy does not support beam search, it is recommended to turn off repetition_penalty when using LMDeploy for inference.
-
-</details>
-
-<details>
-  <summary>
-    <b>Write Article</b>
-  </summary>
-
-Please first convert the write model using the instructions above.
-
-```python
-from lmdeploy import pipeline, GenerationConfig
-
-pipe = pipeline('/nvme/shared/internlm-xcomposer2d5-7b-write', log_level='INFO')
-pipe.chat_template.meta_instruction = None
-
-query = 'Please write a blog based on the title: French Pastries: A Sweet Indulgence'
-output = pipe(query, gen_config=GenerationConfig(max_new_tokens=8192))
-```
-
-</details>
diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md
index 3694328f24..e32e6f0f9a 100644
--- a/docs/en/supported_models/supported_models.md
+++ b/docs/en/supported_models/supported_models.md
@@ -14,8 +14,6 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 |            InternLM2             |     7B - 20B     | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |           InternLM2.5            |        7B        | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |            InternLM3             |        8B        | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
-|       InternLM-XComposer2        |   7B, 4khd-7B    | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
-|      InternLM-XComposer2.5       |        7B        | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
 |            Intern-S1             |       241B       | MLLM |    Yes    |   Yes   |   Yes   |  No   |
 |          Intern-S1-mini          |       8.3B       | MLLM |    Yes    |   Yes   |   Yes   |  No   |
 |     Qwen1.5<sup>\[1\]</sup>      |   1.8B - 110B    | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
diff --git a/docs/zh_cn/multi_modal/index.rst b/docs/zh_cn/multi_modal/index.rst
index 9a61f6efdb..0344770a3e 100644
--- a/docs/zh_cn/multi_modal/index.rst
+++ b/docs/zh_cn/multi_modal/index.rst
@@ -8,7 +8,6 @@
    deepseek_vl2.md
    llava.md
    internvl.md
-   xcomposer2d5.md
    cogvlm.md
    minicpmv.md
    phi3.md
diff --git a/docs/zh_cn/multi_modal/xcomposer2d5.md b/docs/zh_cn/multi_modal/xcomposer2d5.md
deleted file mode 100644
index 033d25c8ac..0000000000
--- a/docs/zh_cn/multi_modal/xcomposer2d5.md
+++ /dev/null
@@ -1,157 +0,0 @@
-# InternLM-XComposer-2.5
-
-## 简介
-
-[InternLM-XComposer-2.5](https://github.com/InternLM/InternLM-XComposer) 是基于书生·浦语2大语言模型研发的突破性的图文多模态大模型，仅使用 7B LLM 后端就达到了 GPT-4V 级别的能力。浦语·灵笔2.5使用24K交错的图像-文本上下文进行训练，通过RoPE外推可以无缝扩展到96K长的上下文。这种长上下文能力使浦语·灵笔2.5在需要广泛输入和输出上下文的任务中表现出色。 LMDeploy 支持了 [internlm/internlm-xcomposer2d5-7b](https://huggingface.co/internlm/internlm-xcomposer2d5-7b) 模型，通过 TurboMind 引擎推理。
-
-## 快速开始
-
-### 安装
-
-请参考[安装文档](../get_started/installation.md)安装 LMDeploy，并安装上游模型库 InternLM-XComposer-2.5 所需的依赖。
-
-```shell
-pip install decord
-```
-
-### 离线推理 pipeline
-
-以下是使用pipeline进行离线推理的示例，更多用法参考[VLM离线推理 pipeline](./vl_pipeline.md)
-
-```python
-from lmdeploy import pipeline
-from lmdeploy.vl import load_image
-from lmdeploy.vl.constants import IMAGE_TOKEN
-
-pipe = pipeline('internlm/internlm-xcomposer2d5-7b')
-
-image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
-response = pipe((f'describe this image', image))
-print(response)
-```
-
-## Lora 模型
-
-InternLM-XComposer-2.5 针对网页制作和文章创作训练了 LoRA 模型，由于 TurboMind 不支持 slora 特性，所以需要同时只能部署一个 LoRA 模型，需要先对权重进行合并。LMDeploy 提供相关的转换脚本，使用方式为:
-
-```
-export HF_MODEL=internlm/internlm-xcomposer2d5-7b
-export WORK_DIR=internlm/internlm-xcomposer2d5-7b-web
-export TASK=web
-python -m lmdeploy.vl.tools.merge_xcomposer2d5_task $HF_MODEL $WORK_DIR --task $TASK
-```
-
-## 量化
-
-下面以 base 模型为例，展示量化的方式，若要使用 LoRA 模型，请先按照上一章节提取 LoRA 模型。
-
-```shell
-
-export HF_MODEL=internlm/internlm-xcomposer2d5-7b
-export WORK_DIR=internlm/internlm-xcomposer2d5-7b-4bit
-
-lmdeploy lite auto_awq \
-   $HF_MODEL \
-  --work-dir $WORK_DIR
-```
-
-## 更多使用例子
-
-<details>
-  <summary>
-    <b>Video Understanding</b>
-  </summary>
-
-下面以 `pipeline.chat` 为例展示用法，其它接口同样支持推理，需要手动拼接对话内容。
-
-```python
-from lmdeploy import pipeline, GenerationConfig
-from transformers.dynamic_module_utils import get_class_from_dynamic_module
-
-HF_MODEL = 'internlm/internlm-xcomposer2d5-7b'
-load_video = get_class_from_dynamic_module('ixc_utils.load_video', HF_MODEL)
-frame2img = get_class_from_dynamic_module('ixc_utils.frame2img', HF_MODEL)
-Video_transform = get_class_from_dynamic_module('ixc_utils.Video_transform', HF_MODEL)
-get_font = get_class_from_dynamic_module('ixc_utils.get_font', HF_MODEL)
-
-video = load_video('liuxiang.mp4') # https://github.com/InternLM/InternLM-XComposer/raw/main/examples/liuxiang.mp4
-img = frame2img(video, get_font())
-img = Video_transform(img)
-
-pipe = pipeline(HF_MODEL)
-gen_config = GenerationConfig(top_k=50, top_p=0.8, temperature=1.0)
-query = 'Here are some frames of a video. Describe this video in detail'
-sess = pipe.chat((query, img), gen_config=gen_config)
-print(sess.response.text)
-
-query = 'tell me the athlete code of Liu Xiang'
-sess = pipe.chat(query, session=sess, gen_config=gen_config)
-print(sess.response.text)
-```
-
-</details>
-
-<details>
-  <summary>
-    <b>Multi-Image</b>
-  </summary>
-
-```python
-from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl import load_image
-
-query = f'Image1 {IMAGE_TOKEN}; Image2 {IMAGE_TOKEN}; Image3 {IMAGE_TOKEN}; I want to buy a car from the three given cars, analyze their advantages and weaknesses one by one'
-
-urls = ['https://raw.githubusercontent.com/InternLM/InternLM-XComposer/main/examples/cars1.jpg',
-        'https://raw.githubusercontent.com/InternLM/InternLM-XComposer/main/examples/cars2.jpg',
-        'https://raw.githubusercontent.com/InternLM/InternLM-XComposer/main/examples/cars3.jpg']
-images = [load_image(url) for url in urls]
-
-pipe = pipeline('internlm/internlm-xcomposer2d5-7b', log_level='INFO')
-output = pipe((query, images), gen_config=GenerationConfig(top_k=0, top_p=0.8, random_seed=89247526689433939))
-```
-
-由于 LMDeploy 不支持 beam search，生成的结果与使用 transformers 的 beam search 相比，会有较大的差异，建议关闭 top_k 或者使用较大的 top_k 采样来增加多样性。
-
-</details>
-
-<details>
-  <summary>
-    <b>Instruction to Webpage</b>
-  </summary>
-
-请先使用使用上述说明，转化 web 模型。
-
-```python
-from lmdeploy import pipeline, GenerationConfig
-
-pipe = pipeline('/nvme/shared/internlm-xcomposer2d5-7b-web', log_level='INFO')
-pipe.chat_template.meta_instruction = None
-
-query = 'A website for Research institutions. The name is Shanghai AI lab. Top Navigation Bar is blue.Below left, an image shows the logo of the lab. In the right, there is a passage of text below that describes the mission of the laboratory.There are several images to show the research projects of Shanghai AI lab.'
-output = pipe(query, gen_config=GenerationConfig(max_new_tokens=2048))
-```
-
-使用 transformers 测试时，发现如果设置了 repetition_penalty，beam search 为1时有较大概率停不下来，因为 LMDeploy 不支持 beam search，建议使用 LMDeploy 推理时关闭 repetition_penalty。
-
-</details>
-
-<details>
-  <summary>
-    <b>Write Article</b>
-  </summary>
-
-请先使用使用上述说明，转化 write 模型。
-
-```python
-from lmdeploy import pipeline, GenerationConfig
-
-pipe = pipeline('/nvme/shared/internlm-xcomposer2d5-7b-write', log_level='INFO')
-pipe.chat_template.meta_instruction = None
-
-query = 'Please write a blog based on the title: French Pastries: A Sweet Indulgence'
-output = pipe(query, gen_config=GenerationConfig(max_new_tokens=8192))
-```
-
-</details>
diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md
index 5e29dfd0b8..d565b1125c 100644
--- a/docs/zh_cn/supported_models/supported_models.md
+++ b/docs/zh_cn/supported_models/supported_models.md
@@ -14,8 +14,6 @@
 |            InternLM2             |    7B - 20B    | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |           InternLM2.5            |       7B       | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
 |            InternLM3             |       8B       | LLM  |    Yes    |   Yes   |   Yes   |  Yes  |
-|       InternLM-XComposer2        |  7B, 4khd-7B   | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
-|      InternLM-XComposer2.5       |       7B       | MLLM |    Yes    |   Yes   |   Yes   |  Yes  |
 |            Intern-S1             |      241B      | MLLM |    Yes    |   Yes   |   Yes   |  No   |
 |          Intern-S1-mini          |      8.3B      | MLLM |    Yes    |   Yes   |   Yes   |  No   |
 |          Intern-S1-Pro           |      1TB       | MLLM |    Yes    |    -    |    -    |  No   |
diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py
index 9545ba2d68..2e2aa2615d 100644
--- a/lmdeploy/archs.py
+++ b/lmdeploy/archs.py
@@ -94,21 +94,16 @@ def autoget_backend_config(
 
 def check_vl_llm(backend: str, config: dict) -> bool:
     """Check if the model is a vl model from model config."""
-    if 'auto_map' in config:
-        for _, v in config['auto_map'].items():
-            if 'InternLMXComposer2ForCausalLM' in v:
-                return True
-
     if 'language_config' in config and 'vision_config' in config and config['language_config'].get(
             'architectures', [None])[0] == 'DeepseekV2ForCausalLM':
         return True
 
     arch = config['architectures'][0]
     supported_archs = set([
-        'LlavaLlamaForCausalLM', 'LlavaMistralForCausalLM', 'CogVLMForCausalLM', 'InternLMXComposer2ForCausalLM',
-        'InternVLChatModel', 'MiniCPMV', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration',
-        'Phi3VForCausalLM', 'Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration',
-        'Qwen3VLForConditionalGeneration', 'Qwen3VLMoeForConditionalGeneration', 'Qwen3_5ForConditionalGeneration',
+        'LlavaLlamaForCausalLM', 'LlavaMistralForCausalLM', 'CogVLMForCausalLM', 'InternVLChatModel', 'MiniCPMV',
+        'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration', 'Phi3VForCausalLM',
+        'Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration', 'Qwen3VLForConditionalGeneration',
+        'Qwen3VLMoeForConditionalGeneration', 'Qwen3_5ForConditionalGeneration',
         'Qwen3_5MoeForConditionalGeneration', 'Qwen3OmniMoeForConditionalGeneration', 'MolmoForCausalLM',
         'Gemma3ForConditionalGeneration', 'Llama4ForConditionalGeneration', 'InternVLForConditionalGeneration',
         'InternS1ForConditionalGeneration', 'InternS1ProForConditionalGeneration',
@@ -160,10 +155,6 @@ def get_model_arch(model_path: str, trust_remote_code: bool = False):
     _cfg = cfg.to_dict()
     if _cfg.get('architectures', None):
         arch = _cfg['architectures'][0]
-        if _cfg.get('auto_map'):
-            for _, v in _cfg['auto_map'].items():
-                if 'InternLMXComposer2ForCausalLM' in v:
-                    arch = 'InternLMXComposer2ForCausalLM'
     elif _cfg.get('auto_map', None) and 'AutoModelForCausalLM' in _cfg['auto_map']:
         arch = _cfg['auto_map']['AutoModelForCausalLM'].split('.')[-1]
     elif _cfg.get('language_config', None) and _cfg['language_config'].get(
diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py
index 59e04db2fe..4b09fca9d4 100644
--- a/lmdeploy/lite/apis/calibrate.py
+++ b/lmdeploy/lite/apis/calibrate.py
@@ -22,7 +22,6 @@
     'LlamaForCausalLM': 'LlamaDecoderLayer',
     'LlavaLlamaForCausalLM': 'LlamaDecoderLayer',
     'MGMLlamaForCausalLM': 'LlamaDecoderLayer',  # mini gemini
-    'InternLMXComposer2ForCausalLM': 'InternLM2DecoderLayer',
     'InternS2PreviewForConditionalGeneration': 'InternS2PreviewDecoderLayer',
     'Phi3ForCausalLM': 'Phi3DecoderLayer',
     'ChatGLMForConditionalGeneration': 'GLMBlock',
@@ -43,7 +42,6 @@
     'LlamaForCausalLM': 'LlamaRMSNorm',
     'LlavaLlamaForCausalLM': 'LlamaRMSNorm',
     'MGMLlamaForCausalLM': 'LlamaRMSNorm',  # mini gemini
-    'InternLMXComposer2ForCausalLM': 'InternLM2RMSNorm',
     'InternS2PreviewForConditionalGeneration': 'InternS2PreviewRMSNorm',
     'Phi3ForCausalLM': 'Phi3RMSNorm',
     'ChatGLMForConditionalGeneration': 'RMSNorm',
@@ -64,7 +62,6 @@
     'LlamaForCausalLM': 'lm_head',
     'LlavaLlamaForCausalLM': 'lm_head',
     'MGMLlamaForCausalLM': 'lm_head',  # mini gemini
-    'InternLMXComposer2ForCausalLM': 'output',
     'InternS2PreviewForConditionalGeneration': 'lm_head',
     'Phi3ForCausalLM': 'lm_head',
     'ChatGLMForConditionalGeneration': 'output_layer',
@@ -77,20 +74,15 @@
 
 def check_vl_llm(backend: str, config: dict) -> bool:
     """Check if the model is a vl model from model config."""
-    if 'auto_map' in config:
-        for _, v in config['auto_map'].items():
-            if 'InternLMXComposer2ForCausalLM' in v:
-                return True
-
     if 'language_config' in config and 'vision_config' in config and config['language_config'].get(
             'architectures', [None])[0] == 'DeepseekV2ForCausalLM':
         return True
 
     arch = config['architectures'][0]
     supported_archs = set([
-        'LlavaLlamaForCausalLM', 'LlavaMistralForCausalLM', 'CogVLMForCausalLM', 'InternLMXComposer2ForCausalLM',
-        'InternVLChatModel', 'MiniCPMV', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration',
-        'Phi3VForCausalLM', 'Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration',
+        'LlavaLlamaForCausalLM', 'LlavaMistralForCausalLM', 'CogVLMForCausalLM', 'InternVLChatModel', 'MiniCPMV',
+        'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration', 'Phi3VForCausalLM',
+        'Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration',
         'Qwen3VLForConditionalGeneration', 'Qwen3VLMoeForConditionalGeneration', 'Qwen3_5ForConditionalGeneration',
         'Qwen3_5MoeForConditionalGeneration', 'MolmoForCausalLM', 'Gemma3ForConditionalGeneration',
         'Llama4ForConditionalGeneration', 'InternVLForConditionalGeneration', 'InternS1ForConditionalGeneration',
diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py
index a895b11e0c..394279ecf1 100644
--- a/lmdeploy/lite/quantization/calibration.py
+++ b/lmdeploy/lite/quantization/calibration.py
@@ -323,12 +323,6 @@ def _auto_get_scale(layers, inp, module2inspect=None, kwargs=None):
         if module2inspect is None:
             assert len(layers) == 1
             module2inspect = layers[0]
-        # internlm-xcomposer2-vl applies plora, which requires im_mask arg
-        if module2inspect._get_name() == 'InternLM2MLP':
-            from inspect import signature
-            if 'im_mask' in signature(module2inspect.forward).parameters:
-                kwargs['im_mask'] = None
-
         best_ratio = _search_module_scale(module2inspect, layers, inp.value, kwargs)
         inp.save_ratio(best_ratio)
 
diff --git a/lmdeploy/vl/model/builder.py b/lmdeploy/vl/model/builder.py
index 58b29a7e86..6b2152bc09 100644
--- a/lmdeploy/vl/model/builder.py
+++ b/lmdeploy/vl/model/builder.py
@@ -28,7 +28,6 @@
 from .qwen3 import Qwen3VLModel  # noqa F401
 from .qwen3_5 import Qwen3_5Model  # noqa F401
 from .qwen3_omni import Qwen3OmniModel  # noqa F401
-from .xcomposer2 import Xcomposer2VisionModel  # noqa F401
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py
deleted file mode 100644
index 92d036c50d..0000000000
--- a/lmdeploy/vl/model/xcomposer2.py
+++ /dev/null
@@ -1,290 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import enum
-import os
-import sys
-import warnings
-from contextlib import contextmanager
-from typing import Any
-
-import torch
-from PIL.Image import Image
-from transformers import AutoConfig, AutoModelForCausalLM
-
-from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
-from lmdeploy.vl.model.utils import add_device_hook, disable_logging, rewrite_ctx
-
-logger = get_logger('lmdeploy')
-
-
-def check_xcomposer_install():
-    try:
-        # WARNING! we have to do this otherwise the model_type is wrong for
-        # xcomposer2d5
-        import decord  # noqa: F401
-    except ImportError:
-        raise ImportError("No module named 'decord'. Please install decord by `pip install decord`"  # noqa
-                          )
-
-
-class ModelType(enum.Enum):
-    """Request type."""
-    XCOMPOSER2 = enum.auto()
-    XCOMPOSER2_4KHD = enum.auto()
-    XCOMPOSER2D5 = enum.auto()
-
-
-def get_xcomposer_type(model_path: str) -> tuple[ModelType, Any]:
-    """Get xcomposer type."""
-    from transformers.dynamic_module_utils import get_class_from_dynamic_module
-    match_modules = {
-        'ixc_utils.Image_transform': ModelType.XCOMPOSER2D5,
-        'ixc_utils.HD_transform': ModelType.XCOMPOSER2_4KHD
-    }
-    for key, value in match_modules.items():
-        try:
-            module = get_class_from_dynamic_module(key, model_path)
-            return value, module
-        except Exception:
-            pass
-    return ModelType.XCOMPOSER2, None
-
-
-def _CLIPVisionModel_from_pretrained(vision_tower_name):
-    from transformers import CLIPVisionConfig, CLIPVisionModel
-    config = CLIPVisionConfig.from_pretrained(vision_tower_name)
-    model = CLIPVisionModel._from_config(config)
-    return model
-
-
-@contextmanager
-def init_empty_vit(model_path):
-    """Skip download vision model."""
-    origin_func_path = [
-        'transformers.CLIPVisionModel.from_pretrained',
-    ]
-    rewrite_func = [
-        _CLIPVisionModel_from_pretrained,
-    ]
-
-    model_type, _ = get_xcomposer_type(model_path)
-    if model_type == ModelType.XCOMPOSER2D5:
-        from transformers.dynamic_module_utils import get_class_from_dynamic_module
-        from transformers.utils import TRANSFORMERS_DYNAMIC_MODULE_NAME
-        _ = get_class_from_dynamic_module('modeling_internlm_xcomposer2.get_font', model_path)
-        folder = model_path.rstrip(os.sep).split(os.sep)[-1]
-        module_path = '.'.join([TRANSFORMERS_DYNAMIC_MODULE_NAME, folder, 'modeling_internlm_xcomposer2'])
-        origin_get_font_func = getattr(sys.modules[module_path], 'get_font')
-        origin_func_path.append(origin_get_font_func)
-        rewrite_func.append(lambda: None)
-
-    with rewrite_ctx(origin_func_path, rewrite_func):
-        yield
-
-
-@VISION_MODELS.register_module()
-class Xcomposer2VisionModel(VisionModel):
-    """InternLM-Xcomposer2 vision model."""
-
-    def __init__(self,
-                 model_path: str,
-                 with_llm: bool = False,
-                 max_memory: dict[int, int] = None,
-                 hf_config: AutoConfig = None,
-                 backend: str = '',
-                 trust_remote_code: bool = False):
-        model_path = model_path.rstrip(os.sep)
-        super().__init__(model_path, with_llm, max_memory, hf_config, backend, trust_remote_code=trust_remote_code)
-        check_xcomposer_install()
-        self.model_type, self.module = get_xcomposer_type(self.model_path)
-        logger.info(f'matching type of {self.model_type}')
-
-    @classmethod
-    def match(cls, config: AutoConfig):
-        """Check whether the config match the model."""
-        arch = config.architectures[0] if config.architectures else None
-        target = 'InternLMXComposer2ForCausalLM'
-        if arch == target:
-            return True
-        for _, v in getattr(config, 'auto_map', {}).items():
-            if target in v:
-                return True
-        return False
-
-    def build_preprocessor(self, trust_remote_code: bool = False):
-
-        import torchvision.transforms as transforms
-        from torchvision.transforms.functional import InterpolationMode
-
-        if self.model_type in [ModelType.XCOMPOSER2D5, ModelType.XCOMPOSER2_4KHD]:
-            self.HD_transform = self.module
-            self.vis_processor = transforms.Compose([
-                transforms.ToTensor(),
-                transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
-            ])
-            self.preprocess_func = (self._preprocess_2d5
-                                    if self.model_type == ModelType.XCOMPOSER2D5 else self._preprocess_4khd_7b)
-        else:
-            self.vis_processor = transforms.Compose([
-                transforms.Resize((self.hf_config.img_size, self.hf_config.img_size),
-                                  interpolation=InterpolationMode.BICUBIC),
-                transforms.ToTensor(),
-                transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
-            ])
-            self.preprocess_func = self._preprocess_7b
-
-    def build_model(self, trust_remote_code: bool = False):
-        """Build the vision part of a VLM model when backend is turbomind, or
-        load the whole VLM model when `self.with_llm==True`"""
-        from accelerate import init_empty_weights
-        with init_empty_weights(), warnings.catch_warnings(), \
-                init_empty_vit(self.model_path):
-            warnings.simplefilter('ignore')
-            config = self.hf_config
-            model = AutoModelForCausalLM.from_config(config, trust_remote_code=trust_remote_code)
-            model.vit.load_model()
-            model.vit.resize_pos()
-            if hasattr(self.hf_config, 'img_size'):
-                model.vit.vision_tower.vision_model.embeddings.image_size = \
-                    self.hf_config.img_size
-            model.vit.vision_tower.vision_model.post_layernorm.to_empty(device='cpu').half()
-            self.vl_model = model
-            if not self.with_llm:
-                del model.model
-                del model.output
-
-        from accelerate.utils import get_balanced_memory, infer_auto_device_map
-        max_memory = get_balanced_memory(model,
-                                         max_memory=self.max_memory,
-                                         dtype=torch.half,
-                                         no_split_module_classes=['CLIPEncoderLayer'])
-        device_map = infer_auto_device_map(model,
-                                           no_split_module_classes=['CLIPEncoderLayer'],
-                                           max_memory=max_memory,
-                                           dtype=torch.half)
-        # make all tensor on same device for postprocess
-        if 'plora_glb_GN' in device_map:
-            device_map['plora_sub_GN'] = device_map['plora_glb_GN']
-
-        from accelerate import load_checkpoint_and_dispatch
-        with disable_logging():
-            load_checkpoint_and_dispatch(model=model,
-                                         checkpoint=self.model_path,
-                                         device_map=device_map if not self.with_llm else {'': 'cpu'},
-                                         no_split_module_classes=['CLIPEncoderLayer'],
-                                         dtype=torch.half)
-
-        if 'plora_glb_GN' in device_map:
-            add_device_hook(model.vit.vision_tower.vision_model.encoder.layers[-1], device_map['plora_glb_GN'],
-                            lambda x: (x[0].to(device=device_map['plora_glb_GN']), ))
-
-        self.model = model.eval()
-
-    def _preprocess_2d5(self, image: Image, params: dict) -> dict:
-        """Image preprocessing for internlm-xcomposer2d5-7b."""
-        hd_num = params.get('hd_num', 24)
-        image = self.HD_transform(image, hd_num=hd_num)
-        pixel_values = self.vis_processor(image).unsqueeze(0).half()
-        w, h = image.size
-        w, h = w // 560, h // 560
-        n_token_per_image = int((h * w + 1) * 400 + 1 + (h + 1) * 20)
-        return pixel_values, n_token_per_image
-
-    def _preprocess_7b(self, image: Image, params: dict) -> dict:
-        """Image preprocessing for internlm-xcomposer2-7b."""
-        pixel_values = self.vis_processor(image).unsqueeze(0).half()
-        return pixel_values, 256
-
-    def _preprocess_4khd_7b(self, image: Image, params: dict) -> dict:
-        """Image preprocessing for internlm-xcomposer2-4khd-7b."""
-        image = self.HD_transform(image, hd_num=25)
-        pixel_values = self.vis_processor(image).unsqueeze(0).half()
-        w, h = image.size
-        w, h = w // 336, h // 336
-        n_token_per_image = int((h * w + 1) * 144 + 1 + (h + 1) * 12)
-        return pixel_values, n_token_per_image
-
-    def preprocess(self, messages: list[dict]) -> list[dict]:
-        """Refer to `super().preprocess() for spec."""
-        images = self.collect_multimodal_items(messages)
-        outputs = []
-        for modality, image, params in images:
-            pixel_values, n_token = self.preprocess_func(image, params)
-            outputs.append(
-                dict(pixel_values=pixel_values,
-                     image_size=image.size,
-                     image_tokens=n_token,
-                     image_token_id=self.image_token_id))
-        messages.append(dict(role='preprocess', content=outputs))
-        return messages
-
-    @torch.no_grad()
-    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
-        """Extract image feature. ONLY implement it when the backend is
-        turbomind engine.
-
-        Args:
-            messages(list[dict]): the outputs of `preprocess`
-            max_batch_size(int): the max batch size when forwarding vision
-                model
-        Return:
-            the message list with forwarding results included
-        """
-        inputs = [x['content'] for x in messages if x['role'] == 'preprocess']
-        inputs = inputs[0]
-        outputs = []
-        for idx in range(0, len(inputs), max_batch_size):
-            if self.model_type in [ModelType.XCOMPOSER2D5, ModelType.XCOMPOSER2_4KHD]:
-                pixel_values = [x['pixel_values'] for x in inputs[idx:idx + max_batch_size]]
-                embeds, split = self.model.vit(pixel_values, self.model.plora_glb_GN, self.model.plora_sub_GN)
-                embeds = self.model.vision_proj(embeds)
-                embeds = torch.split(embeds, split, dim=1)
-                embeds = [x.squeeze() for x in embeds]
-            else:
-                pixel_values = [x['pixel_values'] for x in inputs[idx:idx + max_batch_size]]
-                pixel_values = torch.cat(pixel_values, dim=0)
-                logger.info(f'vision forward shape: {pixel_values.shape}')
-                embeds = self.model.vit(pixel_values)
-                embeds = self.model.vision_proj(embeds)
-                embeds = torch.split(embeds, 1, dim=0)
-                embeds = [x.squeeze() for x in embeds]
-            outputs.extend(embeds)
-        messages.append(dict(role='forward', content=outputs))
-        return messages
-
-    @staticmethod
-    def proc_messages(messages, chat_template, sequence_start, model_type):
-        """Apply chat template to get the prompt."""
-        prompt_messages = []
-        IMAGE_TOKEN = '<IMAGE_TOKEN>'
-        prefix_image_token = ''
-        for message in messages:
-            if isinstance(message['content'], str):
-                prompt_messages.append(message)
-                continue
-            elif message['role'] in ['images', 'preprocess', 'forward']:
-                continue
-            n_images = len([1 for x in message['content'] if x['type'] == 'image'])
-            content = [item['text'] for item in message['content'] if item['type'] == 'text']
-            if IMAGE_TOKEN not in content[0]:
-                if model_type == ModelType.XCOMPOSER2D5:
-                    if n_images == 1:
-                        prefix_image_token, prompt = IMAGE_TOKEN, content[0]
-                    else:
-                        prompt = ''.join([f'Image{i+1} {IMAGE_TOKEN}; ' for i in range(n_images)]) + content[0]
-                else:
-                    prompt = ''.join([IMAGE_TOKEN] * n_images) + content[0]
-            else:
-                prompt = content[0]
-            prompt_messages.append(dict(role='user', content=prompt))
-        prompt = prefix_image_token + chat_template.messages2prompt(prompt_messages, sequence_start)
-        return prompt, IMAGE_TOKEN
-
-    def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start, self.model_type)
-        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
-
-    def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwargs):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start, self.model_type)
-        return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
diff --git a/lmdeploy/vl/tools/merge_xcomposer2d5_task.py b/lmdeploy/vl/tools/merge_xcomposer2d5_task.py
deleted file mode 100644
index 9ba1c88a12..0000000000
--- a/lmdeploy/vl/tools/merge_xcomposer2d5_task.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import os
-import shutil
-
-import fire
-import torch
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-
-def main(src_path: str, dst_path: str, task: str):
-    """Merge internlm-xcomposer2d5-7b LoRA model weights.
-
-    Args:
-        src_path (str): the source model path of internlm-xcomposer2d5-7b
-        dst_path (str): the target model path of merged model
-        task (str): the task of source model, should choose from
-            ['web', 'write']
-    """
-    if os.path.exists(dst_path):
-        shutil.rmtree(dst_path)
-
-    to_merged = dict(web=['lora_web'], write=['lora_sft', 'lora_dpo'])
-    keys = to_merged[task]
-
-    # load model
-    model = AutoModelForCausalLM.from_pretrained(src_path, trust_remote_code=True)
-    tokenizer = AutoTokenizer.from_pretrained(src_path, trust_remote_code=True)
-
-    # merge lora weight to base model
-    @torch.inference_mode
-    def _merge(module: torch.nn.Module, lora_weights):
-        # merge lora weight first to reduce precision loss
-        mw = None
-        for wa, wb in lora_weights:
-            if mw is None:
-                mw = (wb.float() @ wa.float())
-            else:
-                mw += (wb.float() @ wa.float())
-        ow = module.weight
-        mw += ow.float()
-        module.weight.data = mw.half()
-
-    def _extract_lora(module: torch.nn.Module, keys: str):
-        lora_weights = []
-        for key in keys:
-            lora_a_key = f'{key}_A'
-            lora_b_key = f'{key}_B'
-            wa = getattr(module, lora_a_key).weight
-            wb = getattr(module, lora_b_key).weight
-            lora_weights.append((wa, wb))
-        return lora_weights
-
-    for _, module in tqdm(model.named_modules()):
-        if type(module).__name__ == 'PLoRA':
-            lora_weights = _extract_lora(module, keys)
-            _merge(module, lora_weights)
-
-    # save model
-    model.save_pretrained(dst_path, torch_dtype=torch.half)
-    tokenizer.save_pretrained(dst_path)
-
-
-if __name__ == '__main__':
-    fire.Fire(main)

From 7016dd650634940dff91946bf458efd90b2d37fe Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Fri, 26 Jun 2026 14:05:48 +0800
Subject: [PATCH 8/8] refactor: rename vl package to multimodal

---
 docs/en/faq.md                                |  2 +-
 docs/en/get_started/ascend/get_started.md     |  2 +-
 docs/en/get_started/camb/get_started.md       |  2 +-
 docs/en/get_started/get_started.md            |  4 +--
 docs/en/get_started/maca/get_started.md       |  2 +-
 docs/en/multi_modal/cogvlm.md                 |  2 +-
 docs/en/multi_modal/deepseek_vl2.md           |  2 +-
 docs/en/multi_modal/gemma3.md                 |  2 +-
 docs/en/multi_modal/index.rst                 |  5 +++-
 docs/en/multi_modal/internvl.md               | 10 +++----
 docs/en/multi_modal/llava.md                  |  2 +-
 docs/en/multi_modal/minicpmv.md               |  4 +--
 docs/en/multi_modal/molmo.md                  |  2 +-
 docs/en/multi_modal/multimodal_inputs.md      | 10 +++----
 docs/en/multi_modal/phi3.md                   |  2 +-
 docs/en/multi_modal/qwen2_5_vl.md             |  6 ++---
 docs/en/multi_modal/qwen2_vl.md               |  2 +-
 docs/en/multi_modal/vl_pipeline.md            | 26 +++++++++----------
 docs/zh_cn/faq.md                             |  2 +-
 docs/zh_cn/get_started/ascend/get_started.md  |  2 +-
 docs/zh_cn/get_started/camb/get_started.md    |  2 +-
 docs/zh_cn/get_started/get_started.md         |  4 +--
 docs/zh_cn/get_started/maca/get_started.md    |  2 +-
 docs/zh_cn/multi_modal/cogvlm.md              |  2 +-
 docs/zh_cn/multi_modal/deepseek_vl2.md        |  2 +-
 docs/zh_cn/multi_modal/gemma3.md              |  2 +-
 docs/zh_cn/multi_modal/index.rst              |  4 ++-
 docs/zh_cn/multi_modal/internvl.md            | 10 +++----
 docs/zh_cn/multi_modal/llava.md               |  2 +-
 docs/zh_cn/multi_modal/minicpmv.md            |  4 +--
 docs/zh_cn/multi_modal/molmo.md               |  2 +-
 docs/zh_cn/multi_modal/multimodal_inputs.md   | 10 +++----
 docs/zh_cn/multi_modal/phi3.md                |  2 +-
 docs/zh_cn/multi_modal/qwen2_5_vl.md          |  6 ++---
 docs/zh_cn/multi_modal/qwen2_vl.md            |  2 +-
 docs/zh_cn/multi_modal/vl_pipeline.md         | 26 +++++++++----------
 lmdeploy/api.py                               |  2 +-
 lmdeploy/lite/apis/calibrate.py               |  2 +-
 lmdeploy/multimodal/__init__.py               | 22 ++++++++++++++++
 lmdeploy/{vl => multimodal}/constants.py      |  0
 lmdeploy/{vl => multimodal}/engine.py         |  2 +-
 lmdeploy/{vl => multimodal}/media/__init__.py |  0
 lmdeploy/{vl => multimodal}/media/audio.py    |  0
 lmdeploy/{vl => multimodal}/media/base.py     |  0
 .../{vl => multimodal}/media/connection.py    |  0
 lmdeploy/{vl => multimodal}/media/image.py    |  0
 .../{vl => multimodal}/media/time_series.py   |  0
 lmdeploy/{vl => multimodal}/media/video.py    |  0
 .../{vl => multimodal}/media/video_loader.py  |  0
 lmdeploy/{vl => multimodal}/model/__init__.py |  0
 lmdeploy/{vl => multimodal}/model/base.py     |  4 +--
 lmdeploy/{vl => multimodal}/model/builder.py  |  2 +-
 lmdeploy/{vl => multimodal}/model/cogvlm.py   |  2 +-
 lmdeploy/{vl => multimodal}/model/deepseek.py |  4 +--
 .../{vl => multimodal}/model/deepseek_vl2.py  |  2 +-
 .../{vl => multimodal}/model/gemma3_vl.py     |  2 +-
 lmdeploy/{vl => multimodal}/model/glm4_1v.py  |  2 +-
 lmdeploy/{vl => multimodal}/model/glm4_v.py   |  2 +-
 .../{vl => multimodal}/model/interns1_pro.py  |  4 +--
 lmdeploy/{vl => multimodal}/model/internvl.py |  4 +--
 .../{vl => multimodal}/model/internvl3_hf.py  |  4 +--
 lmdeploy/{vl => multimodal}/model/llama4.py   |  2 +-
 lmdeploy/{vl => multimodal}/model/llava.py    |  4 +--
 lmdeploy/{vl => multimodal}/model/llava_hf.py |  4 +--
 .../{vl => multimodal}/model/llava_next.py    |  4 +--
 lmdeploy/{vl => multimodal}/model/minicpmv.py |  4 +--
 lmdeploy/{vl => multimodal}/model/molmo.py    |  4 +--
 .../{vl => multimodal}/model/phi3_vision.py   |  2 +-
 .../model/preprocess_utils.py                 |  4 +--
 lmdeploy/{vl => multimodal}/model/qwen2.py    |  4 +--
 lmdeploy/{vl => multimodal}/model/qwen3.py    |  2 +-
 lmdeploy/{vl => multimodal}/model/qwen3_5.py  |  6 ++---
 .../{vl => multimodal}/model/qwen3_omni.py    |  2 +-
 lmdeploy/{vl => multimodal}/model/utils.py    |  0
 lmdeploy/{vl => multimodal}/tools/__init__.py |  0
 lmdeploy/{vl => multimodal}/utils.py          |  0
 lmdeploy/pytorch/messages.py                  |  2 +-
 lmdeploy/pytorch/models/interns1_pro.py       |  2 +-
 lmdeploy/pytorch/models/qwen3_5.py            |  2 +-
 .../pytorch/models/qwen3_omni_moe_thinker.py  |  2 +-
 lmdeploy/pytorch/models/qwen3_vl.py           |  2 +-
 lmdeploy/pytorch/models/utils/model.py        |  2 +-
 lmdeploy/pytorch/multimodal/data_type.py      |  2 +-
 lmdeploy/serve/core/vl_async_engine.py        |  2 +-
 lmdeploy/serve/openai/api_server.py           |  2 +-
 lmdeploy/serve/processors/multimodal.py       | 14 +++++-----
 lmdeploy/turbomind/models/qwen3_5.py          |  2 +-
 lmdeploy/vl/__init__.py                       | 23 ++--------------
 tests/pytorch/paging/test_block_trie.py       |  2 +-
 tests/test_lmdeploy/test_content_merge.py     |  2 +-
 .../test_hf_chat_template.py                  |  2 +-
 .../test_multimodal_encode.py}                |  2 +-
 .../test_nonhf_chat_template.py               |  2 +-
 .../test_preprocess_utils.py                  |  4 +--
 .../test_qwen3_omni_processor.py              |  8 +++---
 .../test_qwen3vl_processor.py                 |  6 ++---
 .../test_safe_url.py                          |  4 +--
 97 files changed, 186 insertions(+), 178 deletions(-)
 create mode 100644 lmdeploy/multimodal/__init__.py
 rename lmdeploy/{vl => multimodal}/constants.py (100%)
 rename lmdeploy/{vl => multimodal}/engine.py (99%)
 rename lmdeploy/{vl => multimodal}/media/__init__.py (100%)
 rename lmdeploy/{vl => multimodal}/media/audio.py (100%)
 rename lmdeploy/{vl => multimodal}/media/base.py (100%)
 rename lmdeploy/{vl => multimodal}/media/connection.py (100%)
 rename lmdeploy/{vl => multimodal}/media/image.py (100%)
 rename lmdeploy/{vl => multimodal}/media/time_series.py (100%)
 rename lmdeploy/{vl => multimodal}/media/video.py (100%)
 rename lmdeploy/{vl => multimodal}/media/video_loader.py (100%)
 rename lmdeploy/{vl => multimodal}/model/__init__.py (100%)
 rename lmdeploy/{vl => multimodal}/model/base.py (99%)
 rename lmdeploy/{vl => multimodal}/model/builder.py (98%)
 rename lmdeploy/{vl => multimodal}/model/cogvlm.py (98%)
 rename lmdeploy/{vl => multimodal}/model/deepseek.py (98%)
 rename lmdeploy/{vl => multimodal}/model/deepseek_vl2.py (99%)
 rename lmdeploy/{vl => multimodal}/model/gemma3_vl.py (98%)
 rename lmdeploy/{vl => multimodal}/model/glm4_1v.py (92%)
 rename lmdeploy/{vl => multimodal}/model/glm4_v.py (98%)
 rename lmdeploy/{vl => multimodal}/model/interns1_pro.py (96%)
 rename lmdeploy/{vl => multimodal}/model/internvl.py (99%)
 rename lmdeploy/{vl => multimodal}/model/internvl3_hf.py (97%)
 rename lmdeploy/{vl => multimodal}/model/llama4.py (99%)
 rename lmdeploy/{vl => multimodal}/model/llava.py (99%)
 rename lmdeploy/{vl => multimodal}/model/llava_hf.py (97%)
 rename lmdeploy/{vl => multimodal}/model/llava_next.py (98%)
 rename lmdeploy/{vl => multimodal}/model/minicpmv.py (98%)
 rename lmdeploy/{vl => multimodal}/model/molmo.py (98%)
 rename lmdeploy/{vl => multimodal}/model/phi3_vision.py (95%)
 rename lmdeploy/{vl => multimodal}/model/preprocess_utils.py (98%)
 rename lmdeploy/{vl => multimodal}/model/qwen2.py (98%)
 rename lmdeploy/{vl => multimodal}/model/qwen3.py (94%)
 rename lmdeploy/{vl => multimodal}/model/qwen3_5.py (97%)
 rename lmdeploy/{vl => multimodal}/model/qwen3_omni.py (94%)
 rename lmdeploy/{vl => multimodal}/model/utils.py (100%)
 rename lmdeploy/{vl => multimodal}/tools/__init__.py (100%)
 rename lmdeploy/{vl => multimodal}/utils.py (100%)
 rename tests/test_lmdeploy/{test_vl => test_multimodal}/test_hf_chat_template.py (99%)
 rename tests/test_lmdeploy/{test_vl/test_vl_encode.py => test_multimodal/test_multimodal_encode.py} (99%)
 rename tests/test_lmdeploy/{test_vl => test_multimodal}/test_nonhf_chat_template.py (99%)
 rename tests/test_lmdeploy/{test_vl => test_multimodal}/test_preprocess_utils.py (95%)
 rename tests/test_lmdeploy/{test_vl => test_multimodal}/test_qwen3_omni_processor.py (97%)
 rename tests/test_lmdeploy/{test_vl => test_multimodal}/test_qwen3vl_processor.py (97%)
 rename tests/test_lmdeploy/{test_vl => test_multimodal}/test_safe_url.py (92%)

diff --git a/docs/en/faq.md b/docs/en/faq.md
index 39563d5d01..345f508ce8 100644
--- a/docs/en/faq.md
+++ b/docs/en/faq.md
@@ -94,7 +94,7 @@ lmdeploy serve api_server internlm/internlm2_5-7b-chat --cache-max-entry-count 0
 ### Api Server Fetch Timeout
 
 The image URL fetch timeout for the API server can be configured via the environment variable `LMDEPLOY_FETCH_TIMEOUT`.
-By default, requests may take up to 10 seconds before timing out. See [lmdeploy/vl/utils.py](https://github.com/InternLM/lmdeploy/blob/7b6876eafcb842633e0efe8baabe5906d7beeeea/lmdeploy/vl/utils.py#L31) for usage.
+By default, requests may take up to 10 seconds before timing out. See [lmdeploy/multimodal/utils.py](https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/multimodal/utils.py) for usage.
 
 ## Quantization
 
diff --git a/docs/en/get_started/ascend/get_started.md b/docs/en/get_started/ascend/get_started.md
index 376548b0dc..e88e02e865 100644
--- a/docs/en/get_started/ascend/get_started.md
+++ b/docs/en/get_started/ascend/get_started.md
@@ -50,7 +50,7 @@ Set `device_type="ascend"` in the `PytorchEngineConfig`:
 
 ```python
 from lmdeploy import pipeline, PytorchEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 pipe = pipeline('OpenGVLab/InternVL2-2B',
         backend_config=PytorchEngineConfig(tp=1, device_type='ascend'))
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
diff --git a/docs/en/get_started/camb/get_started.md b/docs/en/get_started/camb/get_started.md
index 5b6e622667..0c1c35d302 100644
--- a/docs/en/get_started/camb/get_started.md
+++ b/docs/en/get_started/camb/get_started.md
@@ -43,7 +43,7 @@ Set `device_type="camb"` in the `PytorchEngineConfig`:
 
 ```python
 from lmdeploy import pipeline, PytorchEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 pipe = pipeline('OpenGVLab/InternVL2-2B',
         backend_config=PytorchEngineConfig(tp=1, device_type='camb'))
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
diff --git a/docs/en/get_started/get_started.md b/docs/en/get_started/get_started.md
index 8650858d12..b588f3a6b5 100644
--- a/docs/en/get_started/get_started.md
+++ b/docs/en/get_started/get_started.md
@@ -83,7 +83,7 @@ For example, you can utilize the following code snippet to perform the inference
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2-8B')
 
@@ -96,7 +96,7 @@ In VLM pipeline, the default image processing batch size is 1. This can be adjus
 
 ```python
 from lmdeploy import pipeline, VisionConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2-8B',
                 vision_config=VisionConfig(
diff --git a/docs/en/get_started/maca/get_started.md b/docs/en/get_started/maca/get_started.md
index 5c647a379e..47b622de7a 100644
--- a/docs/en/get_started/maca/get_started.md
+++ b/docs/en/get_started/maca/get_started.md
@@ -33,7 +33,7 @@ Set `device_type="maca"` in the `PytorchEngineConfig`:
 
 ```python
 from lmdeploy import pipeline, PytorchEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 pipe = pipeline('OpenGVLab/InternVL2-2B',
         backend_config=PytorchEngineConfig(tp=1, device_type='maca'))
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
diff --git a/docs/en/multi_modal/cogvlm.md b/docs/en/multi_modal/cogvlm.md
index 0a4e7849c7..a02cc62ba5 100644
--- a/docs/en/multi_modal/cogvlm.md
+++ b/docs/en/multi_modal/cogvlm.md
@@ -26,7 +26,7 @@ The following sample code shows the basic usage of VLM pipeline. For more exampl
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 
 if __name__ == "__main__":
diff --git a/docs/en/multi_modal/deepseek_vl2.md b/docs/en/multi_modal/deepseek_vl2.md
index c0c2129bea..71a592c889 100644
--- a/docs/en/multi_modal/deepseek_vl2.md
+++ b/docs/en/multi_modal/deepseek_vl2.md
@@ -30,7 +30,7 @@ To construct valid DeepSeek-VL2 prompts with image inputs, users should insert `
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 
 if __name__ == "__main__":
diff --git a/docs/en/multi_modal/gemma3.md b/docs/en/multi_modal/gemma3.md
index 20905bfe97..5e644b6ea0 100644
--- a/docs/en/multi_modal/gemma3.md
+++ b/docs/en/multi_modal/gemma3.md
@@ -18,7 +18,7 @@ The following sample code shows the basic usage of VLM pipeline. For more exampl
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 
 if __name__ == "__main__":
diff --git a/docs/en/multi_modal/index.rst b/docs/en/multi_modal/index.rst
index 92356eec16..db3df4bf3e 100644
--- a/docs/en/multi_modal/index.rst
+++ b/docs/en/multi_modal/index.rst
@@ -1,6 +1,9 @@
-Vision-Language Models
+Multimodal Models
 =================================
 
+Use ``lmdeploy.multimodal`` for multimodal helper APIs such as media loading
+and local-file encoding.
+
 .. toctree::
    :maxdepth: 2
    :caption: Examples
diff --git a/docs/en/multi_modal/internvl.md b/docs/en/multi_modal/internvl.md
index 896db2e5d9..8ffed56616 100644
--- a/docs/en/multi_modal/internvl.md
+++ b/docs/en/multi_modal/internvl.md
@@ -42,7 +42,7 @@ The following sample code shows the basic usage of VLM pipeline. For detailed in
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2-8B')
 
@@ -60,7 +60,7 @@ More examples are listed below:
 
 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl.constants import IMAGE_TOKEN
+from lmdeploy.multimodal.constants import IMAGE_TOKEN
 
 pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')
 messages = [
@@ -86,7 +86,7 @@ out = pipe(messages, gen_config=GenerationConfig(top_k=1))
 
 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl.constants import IMAGE_TOKEN
+from lmdeploy.multimodal.constants import IMAGE_TOKEN
 
 pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')
 messages = [
@@ -114,8 +114,8 @@ out = pipe(messages, gen_config=GenerationConfig(top_k=1))
 import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
-from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl import encode_image_base64
+from lmdeploy.multimodal.constants import IMAGE_TOKEN
+from lmdeploy.multimodal import encode_image_base64
 from PIL import Image
 pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')
 
diff --git a/docs/en/multi_modal/llava.md b/docs/en/multi_modal/llava.md
index c374b67121..d63d22a912 100644
--- a/docs/en/multi_modal/llava.md
+++ b/docs/en/multi_modal/llava.md
@@ -33,7 +33,7 @@ The following sample code shows the basic usage of VLM pipeline. For detailed in
 
 ```python
 from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 
 pipe = pipeline("llava-hf/llava-interleave-qwen-7b-hf", backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5),
diff --git a/docs/en/multi_modal/minicpmv.md b/docs/en/multi_modal/minicpmv.md
index 0f2bf176b9..f7da7238e8 100644
--- a/docs/en/multi_modal/minicpmv.md
+++ b/docs/en/multi_modal/minicpmv.md
@@ -19,7 +19,7 @@ The following sample code shows the basic usage of VLM pipeline. For detailed in
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('openbmb/MiniCPM-V-2_6')
 
@@ -97,7 +97,7 @@ print(out.text)
 
 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl import encode_image_base64
+from lmdeploy.multimodal import encode_image_base64
 import torch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer
diff --git a/docs/en/multi_modal/molmo.md b/docs/en/multi_modal/molmo.md
index dfff43dc64..8c98f38098 100644
--- a/docs/en/multi_modal/molmo.md
+++ b/docs/en/multi_modal/molmo.md
@@ -19,7 +19,7 @@ The following sample code shows the basic usage of VLM pipeline. For detailed in
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('allenai/Molmo-7B-D-0924')
 
diff --git a/docs/en/multi_modal/multimodal_inputs.md b/docs/en/multi_modal/multimodal_inputs.md
index 4f78ad2504..2b14addb86 100644
--- a/docs/en/multi_modal/multimodal_inputs.md
+++ b/docs/en/multi_modal/multimodal_inputs.md
@@ -398,7 +398,7 @@ In addition to HTTP URLs, lmdeploy accepts:
 - **Local file paths** via `file://` scheme: `file:///absolute/path/to/file.jpg`
 - **Base64-encoded data** via data URLs: `data:<mime>;base64,<encoded_data>`
 
-Use the helpers in `lmdeploy.vl.utils` to encode local files:
+Use the helpers in `lmdeploy.multimodal.utils` to encode local files:
 
 <details>
 <summary>Local file path example</summary>
@@ -434,7 +434,7 @@ print(response.choices[0].message.content)
 
 ```python
 from openai import OpenAI
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.multimodal.utils import encode_image_base64
 
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
@@ -465,7 +465,7 @@ print(response.choices[0].message.content)
 
 ```python
 from openai import OpenAI
-from lmdeploy.vl.utils import encode_video_base64
+from lmdeploy.multimodal.utils import encode_video_base64
 
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
@@ -497,7 +497,7 @@ print(response.choices[0].message.content)
 
 ```python
 from openai import OpenAI
-from lmdeploy.vl.utils import encode_audio_base64
+from lmdeploy.multimodal.utils import encode_audio_base64
 
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
@@ -528,7 +528,7 @@ print(response.choices[0].message.content)
 
 ```python
 from openai import OpenAI
-from lmdeploy.vl.utils import encode_time_series_base64
+from lmdeploy.multimodal.utils import encode_time_series_base64
 
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
diff --git a/docs/en/multi_modal/phi3.md b/docs/en/multi_modal/phi3.md
index a7ad0237e2..8767b88254 100644
--- a/docs/en/multi_modal/phi3.md
+++ b/docs/en/multi_modal/phi3.md
@@ -26,7 +26,7 @@ The following sample code shows the basic usage of VLM pipeline. For more exampl
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('microsoft/Phi-3.5-vision-instruct')
 
diff --git a/docs/en/multi_modal/qwen2_5_vl.md b/docs/en/multi_modal/qwen2_5_vl.md
index ac2ffa2ce6..d42c9d9552 100644
--- a/docs/en/multi_modal/qwen2_5_vl.md
+++ b/docs/en/multi_modal/qwen2_5_vl.md
@@ -25,7 +25,7 @@ The following sample code shows the basic usage of the VLM pipeline. For detaile
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct')
 
@@ -98,8 +98,8 @@ out = pipe(messages, gen_config=GenerationConfig(top_k=1))
 import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
-from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl import encode_image_base64
+from lmdeploy.multimodal.constants import IMAGE_TOKEN
+from lmdeploy.multimodal import encode_image_base64
 from PIL import Image
 pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO')
 
diff --git a/docs/en/multi_modal/qwen2_vl.md b/docs/en/multi_modal/qwen2_vl.md
index 425d5d8f28..6a4d78ad0e 100644
--- a/docs/en/multi_modal/qwen2_vl.md
+++ b/docs/en/multi_modal/qwen2_vl.md
@@ -36,7 +36,7 @@ The following sample code shows the basic usage of VLM pipeline. For detailed in
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('Qwen/Qwen2-VL-2B-Instruct')
 
diff --git a/docs/en/multi_modal/vl_pipeline.md b/docs/en/multi_modal/vl_pipeline.md
index 4972ba91d5..6b3f506255 100644
--- a/docs/en/multi_modal/vl_pipeline.md
+++ b/docs/en/multi_modal/vl_pipeline.md
@@ -16,7 +16,7 @@ Using the pipeline interface to infer other VLM models is similar, with the main
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B')
 
@@ -53,7 +53,7 @@ Tensor paramllelism can be activated by setting the engine parameter `tp`
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(tp=2))
@@ -69,7 +69,7 @@ When creating the pipeline, you can customize the size of the context window by
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -85,7 +85,7 @@ You can change the default sampling parameters of pipeline by passing `Generatio
 
 ```python
 from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(tp=2, session_len=8192))
@@ -101,8 +101,8 @@ By default, LMDeploy inserts the special image token into the user prompt follow
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
-from lmdeploy.vl.constants import IMAGE_TOKEN
+from lmdeploy.multimodal import load_image
+from lmdeploy.multimodal.constants import IMAGE_TOKEN
 
 pipe = pipeline('deepseek-ai/deepseek-vl-1.3b-chat')
 
@@ -117,7 +117,7 @@ While performing inference, LMDeploy identifies an appropriate chat template fro
 
 ```python
 from lmdeploy import pipeline, ChatTemplateConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 pipe = pipeline('local_model_folder',
                 chat_template_config=ChatTemplateConfig(model_name='llava-v1'))
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
@@ -133,7 +133,7 @@ The default parameters of the visual model can be modified by setting `VisionCon
 
 ```python
 from lmdeploy import pipeline, VisionConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 vision_config=VisionConfig(max_batch_size=16)
 pipe = pipeline('liuhaotian/llava-v1.5-7b', vision_config=vision_config)
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
@@ -145,7 +145,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 pipe = pipeline('OpenGVLab/InternVL2_5-8B')
 
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
@@ -162,7 +162,7 @@ When dealing with multiple images, you can put them all in one list. Keep in min
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -183,7 +183,7 @@ Conducting inference with batch prompts is quite straightforward; just place the
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -203,7 +203,7 @@ There are two ways to do the multi-turn conversations with the pipeline. One is
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -224,7 +224,7 @@ You can release the pipeline explicitly by calling its `close()` method, or alte
 from lmdeploy import pipeline
 
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 with pipeline('OpenGVLab/InternVL2_5-8B') as pipe:
     image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
diff --git a/docs/zh_cn/faq.md b/docs/zh_cn/faq.md
index 1822d079e6..9853729350 100644
--- a/docs/zh_cn/faq.md
+++ b/docs/zh_cn/faq.md
@@ -94,7 +94,7 @@ lmdeploy serve api_server internlm/internlm2_5-7b-chat --cache-max-entry-count 0
 
 API 服务器的图像 URL 获取超时可通过环境变量 `LMDEPLOY_FETCH_TIMEOUT` 进行配置。默认情况下，请求可能需要长达 10 秒才会超时。
 
-请参阅 [lmdeploy/vl/utils.py](https://github.com/InternLM/lmdeploy/blob/7b6876eafcb842633e0efe8baabe5906d7beeeea/lmdeploy/vl/utils.py#L31) 了解用法。
+请参阅 [lmdeploy/multimodal/utils.py](https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/multimodal/utils.py) 了解用法。
 
 ## 量化
 
diff --git a/docs/zh_cn/get_started/ascend/get_started.md b/docs/zh_cn/get_started/ascend/get_started.md
index bae1503470..a9559ed944 100644
--- a/docs/zh_cn/get_started/ascend/get_started.md
+++ b/docs/zh_cn/get_started/ascend/get_started.md
@@ -43,7 +43,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, PytorchEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 pipe = pipeline('OpenGVLab/InternVL2-2B',
      backend_config=PytorchEngineConfig(tp=1, device_type='ascend'))
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
diff --git a/docs/zh_cn/get_started/camb/get_started.md b/docs/zh_cn/get_started/camb/get_started.md
index 4f3043ccce..393330b08a 100644
--- a/docs/zh_cn/get_started/camb/get_started.md
+++ b/docs/zh_cn/get_started/camb/get_started.md
@@ -40,7 +40,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, PytorchEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 pipe = pipeline('OpenGVLab/InternVL2-2B',
      backend_config=PytorchEngineConfig(tp=1, device_type='camb'))
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
diff --git a/docs/zh_cn/get_started/get_started.md b/docs/zh_cn/get_started/get_started.md
index 51d5f0ff81..8e8de9b135 100644
--- a/docs/zh_cn/get_started/get_started.md
+++ b/docs/zh_cn/get_started/get_started.md
@@ -81,7 +81,7 @@ VLM 推理 pipeline 与 LLM 类似，但增加了使用 pipeline 处理图像数
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2-8B')
 
@@ -94,7 +94,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, VisionConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2-8B',
                 vision_config=VisionConfig(
diff --git a/docs/zh_cn/get_started/maca/get_started.md b/docs/zh_cn/get_started/maca/get_started.md
index bbe57caf7f..2466fffcb1 100644
--- a/docs/zh_cn/get_started/maca/get_started.md
+++ b/docs/zh_cn/get_started/maca/get_started.md
@@ -31,7 +31,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, PytorchEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 pipe = pipeline('OpenGVLab/InternVL2-2B',
      backend_config=PytorchEngineConfig(tp=1, device_type='maca'))
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
diff --git a/docs/zh_cn/multi_modal/cogvlm.md b/docs/zh_cn/multi_modal/cogvlm.md
index 5101f249f7..41acaf0303 100644
--- a/docs/zh_cn/multi_modal/cogvlm.md
+++ b/docs/zh_cn/multi_modal/cogvlm.md
@@ -25,7 +25,7 @@ huggingface-cli download lmsys/vicuna-7b-v1.5 special_tokens_map.json tokenizer.
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 
 if __name__ == "__main__":
diff --git a/docs/zh_cn/multi_modal/deepseek_vl2.md b/docs/zh_cn/multi_modal/deepseek_vl2.md
index 3e21288c55..86a4489e24 100644
--- a/docs/zh_cn/multi_modal/deepseek_vl2.md
+++ b/docs/zh_cn/multi_modal/deepseek_vl2.md
@@ -30,7 +30,7 @@ pip install attrdict timm 'transformers<4.48.0'
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 
 if __name__ == "__main__":
diff --git a/docs/zh_cn/multi_modal/gemma3.md b/docs/zh_cn/multi_modal/gemma3.md
index 5ad624424e..106a9076bd 100644
--- a/docs/zh_cn/multi_modal/gemma3.md
+++ b/docs/zh_cn/multi_modal/gemma3.md
@@ -18,7 +18,7 @@ Gemma 是 Google 推出的轻量级、最先进的开放模型系列，采用与
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 
 if __name__ == "__main__":
diff --git a/docs/zh_cn/multi_modal/index.rst b/docs/zh_cn/multi_modal/index.rst
index 0344770a3e..d6e1f7cde1 100644
--- a/docs/zh_cn/multi_modal/index.rst
+++ b/docs/zh_cn/multi_modal/index.rst
@@ -1,6 +1,8 @@
-视觉语言模型
+多模态模型
 =================================
 
+请使用 ``lmdeploy.multimodal`` 访问多模态辅助 API，例如媒体加载和本地文件编码。
+
 .. toctree::
    :maxdepth: 2
    :caption: 示例
diff --git a/docs/zh_cn/multi_modal/internvl.md b/docs/zh_cn/multi_modal/internvl.md
index 6af6f45b0f..30f8d97a28 100644
--- a/docs/zh_cn/multi_modal/internvl.md
+++ b/docs/zh_cn/multi_modal/internvl.md
@@ -42,7 +42,7 @@ docker build --build-arg CUDA_VERSION=cu11 -t openmmlab/lmdeploy:internvl . -f .
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2-8B')
 
@@ -60,7 +60,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl.constants import IMAGE_TOKEN
+from lmdeploy.multimodal.constants import IMAGE_TOKEN
 
 pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')
 messages = [
@@ -86,7 +86,7 @@ out = pipe(messages, gen_config=GenerationConfig(top_k=1))
 
 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl.constants import IMAGE_TOKEN
+from lmdeploy.multimodal.constants import IMAGE_TOKEN
 
 pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')
 messages = [
@@ -114,8 +114,8 @@ out = pipe(messages, gen_config=GenerationConfig(top_k=1))
 import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
-from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl import encode_image_base64
+from lmdeploy.multimodal.constants import IMAGE_TOKEN
+from lmdeploy.multimodal import encode_image_base64
 from PIL import Image
 pipe = pipeline('OpenGVLab/InternVL2-8B', log_level='INFO')
 
diff --git a/docs/zh_cn/multi_modal/llava.md b/docs/zh_cn/multi_modal/llava.md
index 6538d1b861..857030828f 100644
--- a/docs/zh_cn/multi_modal/llava.md
+++ b/docs/zh_cn/multi_modal/llava.md
@@ -33,7 +33,7 @@ docker pull openmmlab/lmdeploy:latest
 
 ```python
 from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline("llava-hf/llava-interleave-qwen-7b-hf", backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5),
     gen_config=GenerationConfig(max_new_tokens=512))
diff --git a/docs/zh_cn/multi_modal/minicpmv.md b/docs/zh_cn/multi_modal/minicpmv.md
index eb2a168cdb..784f3404ad 100644
--- a/docs/zh_cn/multi_modal/minicpmv.md
+++ b/docs/zh_cn/multi_modal/minicpmv.md
@@ -19,7 +19,7 @@ LMDeploy 支持 MiniCPM-V 系列模型，具体如下：
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('openbmb/MiniCPM-V-2_6')
 
@@ -97,7 +97,7 @@ print(out.text)
 
 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl import encode_image_base64
+from lmdeploy.multimodal import encode_image_base64
 import torch
 from PIL import Image
 from transformers import AutoModel, AutoTokenizer
diff --git a/docs/zh_cn/multi_modal/molmo.md b/docs/zh_cn/multi_modal/molmo.md
index 1dc8f8f79b..5e5f93a83f 100644
--- a/docs/zh_cn/multi_modal/molmo.md
+++ b/docs/zh_cn/multi_modal/molmo.md
@@ -19,7 +19,7 @@ LMDeploy 支持 Molmo 系列模型，具体如下：
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('allenai/Molmo-7B-D-0924')
 
diff --git a/docs/zh_cn/multi_modal/multimodal_inputs.md b/docs/zh_cn/multi_modal/multimodal_inputs.md
index d87b19e068..a9a2114eb3 100644
--- a/docs/zh_cn/multi_modal/multimodal_inputs.md
+++ b/docs/zh_cn/multi_modal/multimodal_inputs.md
@@ -397,7 +397,7 @@ ______________________________________________________________________
 - **本地文件路径**，使用 `file://` 协议：`file:///absolute/path/to/file.jpg`
 - **Base64 编码数据**，使用 data URL：`data:<mime>;base64,<encoded_data>`
 
-可使用 `lmdeploy.vl.utils` 中的工具函数对本地文件进行编码：
+可使用 `lmdeploy.multimodal.utils` 中的工具函数对本地文件进行编码：
 
 <details>
 <summary>本地文件路径示例</summary>
@@ -433,7 +433,7 @@ print(response.choices[0].message.content)
 
 ```python
 from openai import OpenAI
-from lmdeploy.vl.utils import encode_image_base64
+from lmdeploy.multimodal.utils import encode_image_base64
 
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
@@ -464,7 +464,7 @@ print(response.choices[0].message.content)
 
 ```python
 from openai import OpenAI
-from lmdeploy.vl.utils import encode_video_base64
+from lmdeploy.multimodal.utils import encode_video_base64
 
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
@@ -496,7 +496,7 @@ print(response.choices[0].message.content)
 
 ```python
 from openai import OpenAI
-from lmdeploy.vl.utils import encode_audio_base64
+from lmdeploy.multimodal.utils import encode_audio_base64
 
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
@@ -527,7 +527,7 @@ print(response.choices[0].message.content)
 
 ```python
 from openai import OpenAI
-from lmdeploy.vl.utils import encode_time_series_base64
+from lmdeploy.multimodal.utils import encode_time_series_base64
 
 client = OpenAI(api_key='EMPTY', base_url='http://localhost:23333/v1')
 model_name = client.models.list().data[0].id
diff --git a/docs/zh_cn/multi_modal/phi3.md b/docs/zh_cn/multi_modal/phi3.md
index b5545d30b6..880fe6f2ef 100644
--- a/docs/zh_cn/multi_modal/phi3.md
+++ b/docs/zh_cn/multi_modal/phi3.md
@@ -26,7 +26,7 @@ pip install flash-attn
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('microsoft/Phi-3.5-vision-instruct')
 
diff --git a/docs/zh_cn/multi_modal/qwen2_5_vl.md b/docs/zh_cn/multi_modal/qwen2_5_vl.md
index 2b1d81c0a4..91f0f2d4ba 100644
--- a/docs/zh_cn/multi_modal/qwen2_5_vl.md
+++ b/docs/zh_cn/multi_modal/qwen2_5_vl.md
@@ -25,7 +25,7 @@ pip install qwen-vl-utils[decord]==0.0.8
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct')
 
@@ -98,8 +98,8 @@ out = pipe(messages, gen_config=GenerationConfig(top_k=1))
 import numpy as np
 from lmdeploy import pipeline, GenerationConfig
 from decord import VideoReader, cpu
-from lmdeploy.vl.constants import IMAGE_TOKEN
-from lmdeploy.vl import encode_image_base64
+from lmdeploy.multimodal.constants import IMAGE_TOKEN
+from lmdeploy.multimodal import encode_image_base64
 from PIL import Image
 pipe = pipeline('Qwen/Qwen2.5-VL-7B-Instruct', log_level='INFO')
 
diff --git a/docs/zh_cn/multi_modal/qwen2_vl.md b/docs/zh_cn/multi_modal/qwen2_vl.md
index 1d2d18f30a..10af3cc66f 100644
--- a/docs/zh_cn/multi_modal/qwen2_vl.md
+++ b/docs/zh_cn/multi_modal/qwen2_vl.md
@@ -36,7 +36,7 @@ docker build --build-arg CUDA_VERSION=cu11 -t openmmlab/lmdeploy:qwen2vl . -f ./
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('Qwen/Qwen2-VL-2B-Instruct')
 
diff --git a/docs/zh_cn/multi_modal/vl_pipeline.md b/docs/zh_cn/multi_modal/vl_pipeline.md
index 9662bcc569..d2d2c54c89 100644
--- a/docs/zh_cn/multi_modal/vl_pipeline.md
+++ b/docs/zh_cn/multi_modal/vl_pipeline.md
@@ -16,7 +16,7 @@ LMDeploy 把视觉-语言模型（VLM）复杂的推理过程，抽象为简单
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B')
 
@@ -53,7 +53,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(tp=2))
@@ -69,7 +69,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -85,7 +85,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(tp=2, session_len=8192))
@@ -101,8 +101,8 @@ print(response)
 
 ```python
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
-from lmdeploy.vl.constants import IMAGE_TOKEN
+from lmdeploy.multimodal import load_image
+from lmdeploy.multimodal.constants import IMAGE_TOKEN
 
 pipe = pipeline('deepseek-ai/deepseek-vl-1.3b-chat')
 
@@ -117,7 +117,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, ChatTemplateConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 pipe = pipeline('local_model_folder',
                 chat_template_config=ChatTemplateConfig(model_name='llava-v1'))
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
@@ -133,7 +133,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, VisionConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 vision_config=VisionConfig(max_batch_size=16)
 pipe = pipeline('liuhaotian/llava-v1.5-7b', vision_config=vision_config)
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
@@ -145,7 +145,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, GenerationConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 pipe = pipeline('OpenGVLab/InternVL2_5-8B')
 
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
@@ -162,7 +162,7 @@ print(logits)
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -183,7 +183,7 @@ print(response)
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -203,7 +203,7 @@ pipeline 进行多轮对话有两种方式，一种是按照 openai 的格式来
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
@@ -224,7 +224,7 @@ print(sess.response.text)
 from lmdeploy import pipeline
 
 from lmdeploy import pipeline
-from lmdeploy.vl import load_image
+from lmdeploy.multimodal import load_image
 
 with pipeline('OpenGVLab/InternVL2_5-8B') as pipe:
     image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
diff --git a/lmdeploy/api.py b/lmdeploy/api.py
index 6db813b9cc..66a66a13c3 100644
--- a/lmdeploy/api.py
+++ b/lmdeploy/api.py
@@ -59,7 +59,7 @@ def pipeline(model_path: str,
             print(response)
 
             # VLM
-            from lmdeploy.vl import load_image
+            from lmdeploy.multimodal import load_image
             from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
             pipe = pipeline('liuhaotian/llava-v1.5-7b',
                             backend_config=TurbomindEngineConfig(session_len=8192),
diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py
index 4b09fca9d4..ff23f39ba2 100644
--- a/lmdeploy/lite/apis/calibrate.py
+++ b/lmdeploy/lite/apis/calibrate.py
@@ -9,7 +9,7 @@
 from lmdeploy.archs import get_model_arch
 from lmdeploy.lite.quantization import CalibrationContext, CalibrationContextV2
 from lmdeploy.lite.utils import collect_target_modules, get_calib_loaders, load_hf_from_pretrained
-from lmdeploy.vl.model.builder import load_vl_model
+from lmdeploy.multimodal.model.builder import load_vl_model
 
 LAYER_TYPE_MAP = {
     'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
diff --git a/lmdeploy/multimodal/__init__.py b/lmdeploy/multimodal/__init__.py
new file mode 100644
index 0000000000..5dab7b9a83
--- /dev/null
+++ b/lmdeploy/multimodal/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .utils import (
+    encode_audio_base64,
+    encode_image_base64,
+    encode_time_series_base64,
+    encode_video_base64,
+    load_audio,
+    load_image,
+    load_time_series,
+    load_video,
+)
+
+__all__ = [
+    'load_image',
+    'load_video',
+    'load_audio',
+    'load_time_series',
+    'encode_image_base64',
+    'encode_video_base64',
+    'encode_audio_base64',
+    'encode_time_series_base64',
+]
diff --git a/lmdeploy/vl/constants.py b/lmdeploy/multimodal/constants.py
similarity index 100%
rename from lmdeploy/vl/constants.py
rename to lmdeploy/multimodal/constants.py
diff --git a/lmdeploy/vl/engine.py b/lmdeploy/multimodal/engine.py
similarity index 99%
rename from lmdeploy/vl/engine.py
rename to lmdeploy/multimodal/engine.py
index f2a5f62ccf..3a664595fa 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/multimodal/engine.py
@@ -9,7 +9,7 @@
 
 from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig, VisionConfig
 from lmdeploy.utils import is_bf16_supported
-from lmdeploy.vl.model.builder import load_vl_model
+from lmdeploy.multimodal.model.builder import load_vl_model
 
 
 def _get_hf_config_mm_feature_dtype(hf_config) -> torch.dtype | None:
diff --git a/lmdeploy/vl/media/__init__.py b/lmdeploy/multimodal/media/__init__.py
similarity index 100%
rename from lmdeploy/vl/media/__init__.py
rename to lmdeploy/multimodal/media/__init__.py
diff --git a/lmdeploy/vl/media/audio.py b/lmdeploy/multimodal/media/audio.py
similarity index 100%
rename from lmdeploy/vl/media/audio.py
rename to lmdeploy/multimodal/media/audio.py
diff --git a/lmdeploy/vl/media/base.py b/lmdeploy/multimodal/media/base.py
similarity index 100%
rename from lmdeploy/vl/media/base.py
rename to lmdeploy/multimodal/media/base.py
diff --git a/lmdeploy/vl/media/connection.py b/lmdeploy/multimodal/media/connection.py
similarity index 100%
rename from lmdeploy/vl/media/connection.py
rename to lmdeploy/multimodal/media/connection.py
diff --git a/lmdeploy/vl/media/image.py b/lmdeploy/multimodal/media/image.py
similarity index 100%
rename from lmdeploy/vl/media/image.py
rename to lmdeploy/multimodal/media/image.py
diff --git a/lmdeploy/vl/media/time_series.py b/lmdeploy/multimodal/media/time_series.py
similarity index 100%
rename from lmdeploy/vl/media/time_series.py
rename to lmdeploy/multimodal/media/time_series.py
diff --git a/lmdeploy/vl/media/video.py b/lmdeploy/multimodal/media/video.py
similarity index 100%
rename from lmdeploy/vl/media/video.py
rename to lmdeploy/multimodal/media/video.py
diff --git a/lmdeploy/vl/media/video_loader.py b/lmdeploy/multimodal/media/video_loader.py
similarity index 100%
rename from lmdeploy/vl/media/video_loader.py
rename to lmdeploy/multimodal/media/video_loader.py
diff --git a/lmdeploy/vl/model/__init__.py b/lmdeploy/multimodal/model/__init__.py
similarity index 100%
rename from lmdeploy/vl/model/__init__.py
rename to lmdeploy/multimodal/model/__init__.py
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/multimodal/model/base.py
similarity index 99%
rename from lmdeploy/vl/model/base.py
rename to lmdeploy/multimodal/model/base.py
index 47de994b5a..10ac159861 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/multimodal/model/base.py
@@ -10,8 +10,8 @@
 from transformers import AutoConfig, AutoTokenizer
 
 from lmdeploy.archs import get_model_arch
-from lmdeploy.vl.constants import Modality
-from lmdeploy.vl.model.preprocess_utils import (
+from lmdeploy.multimodal.constants import Modality
+from lmdeploy.multimodal.model.preprocess_utils import (
     get_expanded_input_ids,
     get_expanded_mm_items,
     get_mm_items_offset,
diff --git a/lmdeploy/vl/model/builder.py b/lmdeploy/multimodal/model/builder.py
similarity index 98%
rename from lmdeploy/vl/model/builder.py
rename to lmdeploy/multimodal/model/builder.py
index 6b2152bc09..91c8235ebf 100644
--- a/lmdeploy/vl/model/builder.py
+++ b/lmdeploy/multimodal/model/builder.py
@@ -6,7 +6,7 @@
 from lmdeploy.archs import get_model_arch
 from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig
 from lmdeploy.utils import get_logger, get_model
-from lmdeploy.vl.model.base import VISION_MODELS
+from lmdeploy.multimodal.model.base import VISION_MODELS
 
 from .cogvlm import CogVLMVisionModel  # noqa F401
 from .deepseek import DeepSeekVisionModel  # noqa F401
diff --git a/lmdeploy/vl/model/cogvlm.py b/lmdeploy/multimodal/model/cogvlm.py
similarity index 98%
rename from lmdeploy/vl/model/cogvlm.py
rename to lmdeploy/multimodal/model/cogvlm.py
index dd35c907b7..587bdf85bb 100644
--- a/lmdeploy/vl/model/cogvlm.py
+++ b/lmdeploy/multimodal/model/cogvlm.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/deepseek.py b/lmdeploy/multimodal/model/deepseek.py
similarity index 98%
rename from lmdeploy/vl/model/deepseek.py
rename to lmdeploy/multimodal/model/deepseek.py
index bd0f68c2e3..081d0a300b 100644
--- a/lmdeploy/vl/model/deepseek.py
+++ b/lmdeploy/multimodal/model/deepseek.py
@@ -5,8 +5,8 @@
 from transformers import AutoModelForCausalLM
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
-from lmdeploy.vl.model.utils import disable_logging
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.utils import disable_logging
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/deepseek_vl2.py b/lmdeploy/multimodal/model/deepseek_vl2.py
similarity index 99%
rename from lmdeploy/vl/model/deepseek_vl2.py
rename to lmdeploy/multimodal/model/deepseek_vl2.py
index c17b02c954..0f592cafdc 100644
--- a/lmdeploy/vl/model/deepseek_vl2.py
+++ b/lmdeploy/multimodal/model/deepseek_vl2.py
@@ -6,7 +6,7 @@
 from transformers import AutoConfig
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/gemma3_vl.py b/lmdeploy/multimodal/model/gemma3_vl.py
similarity index 98%
rename from lmdeploy/vl/model/gemma3_vl.py
rename to lmdeploy/multimodal/model/gemma3_vl.py
index d957184621..a554c1135b 100644
--- a/lmdeploy/vl/model/gemma3_vl.py
+++ b/lmdeploy/multimodal/model/gemma3_vl.py
@@ -5,7 +5,7 @@
 from transformers.processing_utils import ImagesKwargs, ProcessingKwargs
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/glm4_1v.py b/lmdeploy/multimodal/model/glm4_1v.py
similarity index 92%
rename from lmdeploy/vl/model/glm4_1v.py
rename to lmdeploy/multimodal/model/glm4_1v.py
index f534a592aa..acd4edf88e 100644
--- a/lmdeploy/vl/model/glm4_1v.py
+++ b/lmdeploy/multimodal/model/glm4_1v.py
@@ -3,7 +3,7 @@
 from transformers import AutoConfig
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, MultimodalSpecialTokens, VisionModel
+from lmdeploy.multimodal.model.base import VISION_MODELS, MultimodalSpecialTokens, VisionModel
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/glm4_v.py b/lmdeploy/multimodal/model/glm4_v.py
similarity index 98%
rename from lmdeploy/vl/model/glm4_v.py
rename to lmdeploy/multimodal/model/glm4_v.py
index 0892b02dc2..cd4ce1ac2c 100644
--- a/lmdeploy/vl/model/glm4_v.py
+++ b/lmdeploy/multimodal/model/glm4_v.py
@@ -3,7 +3,7 @@
 from transformers import AutoConfig
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/interns1_pro.py b/lmdeploy/multimodal/model/interns1_pro.py
similarity index 96%
rename from lmdeploy/vl/model/interns1_pro.py
rename to lmdeploy/multimodal/model/interns1_pro.py
index d1f18e1828..4851fc4810 100644
--- a/lmdeploy/vl/model/interns1_pro.py
+++ b/lmdeploy/multimodal/model/interns1_pro.py
@@ -5,8 +5,8 @@
 import torch
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, MultimodalSpecialTokens
-from lmdeploy.vl.model.qwen3 import Qwen3VLModel
+from lmdeploy.multimodal.model.base import VISION_MODELS, MultimodalSpecialTokens
+from lmdeploy.multimodal.model.qwen3 import Qwen3VLModel
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/multimodal/model/internvl.py
similarity index 99%
rename from lmdeploy/vl/model/internvl.py
rename to lmdeploy/multimodal/model/internvl.py
index 1534ad3388..3ecb853db5 100644
--- a/lmdeploy/vl/model/internvl.py
+++ b/lmdeploy/multimodal/model/internvl.py
@@ -6,8 +6,8 @@
 from transformers import AutoConfig, AutoModel, AutoTokenizer, CLIPImageProcessor
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
-from lmdeploy.vl.model.utils import disable_logging
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.utils import disable_logging
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/internvl3_hf.py b/lmdeploy/multimodal/model/internvl3_hf.py
similarity index 97%
rename from lmdeploy/vl/model/internvl3_hf.py
rename to lmdeploy/multimodal/model/internvl3_hf.py
index 3816cfe491..fed5605688 100644
--- a/lmdeploy/vl/model/internvl3_hf.py
+++ b/lmdeploy/multimodal/model/internvl3_hf.py
@@ -5,8 +5,8 @@
 from transformers.processing_utils import ImagesKwargs, ProcessingKwargs
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.internvl import VISION_MODELS, InternVLVisionModel
-from lmdeploy.vl.model.utils import disable_logging
+from lmdeploy.multimodal.model.internvl import VISION_MODELS, InternVLVisionModel
+from lmdeploy.multimodal.model.utils import disable_logging
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/llama4.py b/lmdeploy/multimodal/model/llama4.py
similarity index 99%
rename from lmdeploy/vl/model/llama4.py
rename to lmdeploy/multimodal/model/llama4.py
index 92199f458c..023861e61e 100644
--- a/lmdeploy/vl/model/llama4.py
+++ b/lmdeploy/multimodal/model/llama4.py
@@ -4,7 +4,7 @@
 from transformers import AutoConfig
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/llava.py b/lmdeploy/multimodal/model/llava.py
similarity index 99%
rename from lmdeploy/vl/model/llava.py
rename to lmdeploy/multimodal/model/llava.py
index 803b2067cb..30842b82d3 100644
--- a/lmdeploy/vl/model/llava.py
+++ b/lmdeploy/multimodal/model/llava.py
@@ -11,8 +11,8 @@
 from transformers import AutoConfig, AutoModelForCausalLM
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.llava_hf import VISION_MODELS, LlavaHfVisionModel
-from lmdeploy.vl.model.utils import disable_logging, rewrite_ctx
+from lmdeploy.multimodal.model.llava_hf import VISION_MODELS, LlavaHfVisionModel
+from lmdeploy.multimodal.model.utils import disable_logging, rewrite_ctx
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/llava_hf.py b/lmdeploy/multimodal/model/llava_hf.py
similarity index 97%
rename from lmdeploy/vl/model/llava_hf.py
rename to lmdeploy/multimodal/model/llava_hf.py
index 08dfa3dbc6..ca83d94659 100644
--- a/lmdeploy/vl/model/llava_hf.py
+++ b/lmdeploy/multimodal/model/llava_hf.py
@@ -5,8 +5,8 @@
 from transformers import AutoProcessor
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
-from lmdeploy.vl.model.utils import disable_logging
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.utils import disable_logging
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/llava_next.py b/lmdeploy/multimodal/model/llava_next.py
similarity index 98%
rename from lmdeploy/vl/model/llava_next.py
rename to lmdeploy/multimodal/model/llava_next.py
index e55bbe6f25..aa7c9636a0 100644
--- a/lmdeploy/vl/model/llava_next.py
+++ b/lmdeploy/multimodal/model/llava_next.py
@@ -5,8 +5,8 @@
 import torch
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.llava_hf import VISION_MODELS, LlavaHfVisionModel
-from lmdeploy.vl.model.utils import disable_logging
+from lmdeploy.multimodal.model.llava_hf import VISION_MODELS, LlavaHfVisionModel
+from lmdeploy.multimodal.model.utils import disable_logging
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/minicpmv.py b/lmdeploy/multimodal/model/minicpmv.py
similarity index 98%
rename from lmdeploy/vl/model/minicpmv.py
rename to lmdeploy/multimodal/model/minicpmv.py
index e64278244a..cd34d4271f 100644
--- a/lmdeploy/vl/model/minicpmv.py
+++ b/lmdeploy/multimodal/model/minicpmv.py
@@ -7,8 +7,8 @@
 from transformers import AutoConfig, AutoModelForCausalLM
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
-from lmdeploy.vl.model.utils import disable_logging
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.utils import disable_logging
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/molmo.py b/lmdeploy/multimodal/model/molmo.py
similarity index 98%
rename from lmdeploy/vl/model/molmo.py
rename to lmdeploy/multimodal/model/molmo.py
index 1bad7474cb..cea6bb7e62 100644
--- a/lmdeploy/vl/model/molmo.py
+++ b/lmdeploy/multimodal/model/molmo.py
@@ -5,8 +5,8 @@
 from transformers import AutoModelForCausalLM, AutoProcessor
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
-from lmdeploy.vl.model.utils import disable_logging
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.utils import disable_logging
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/phi3_vision.py b/lmdeploy/multimodal/model/phi3_vision.py
similarity index 95%
rename from lmdeploy/vl/model/phi3_vision.py
rename to lmdeploy/multimodal/model/phi3_vision.py
index 56ce69ce40..ae2fe2ace4 100644
--- a/lmdeploy/vl/model/phi3_vision.py
+++ b/lmdeploy/multimodal/model/phi3_vision.py
@@ -3,7 +3,7 @@
 
 from transformers import AutoProcessor
 
-from lmdeploy.vl.model.llava_hf import VISION_MODELS, LlavaHfVisionModel
+from lmdeploy.multimodal.model.llava_hf import VISION_MODELS, LlavaHfVisionModel
 
 
 @VISION_MODELS.register_module()
diff --git a/lmdeploy/vl/model/preprocess_utils.py b/lmdeploy/multimodal/model/preprocess_utils.py
similarity index 98%
rename from lmdeploy/vl/model/preprocess_utils.py
rename to lmdeploy/multimodal/model/preprocess_utils.py
index f267722ff7..e342cf7845 100644
--- a/lmdeploy/vl/model/preprocess_utils.py
+++ b/lmdeploy/multimodal/model/preprocess_utils.py
@@ -5,10 +5,10 @@
 import torch.nn.functional as F
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 if TYPE_CHECKING:
-    from lmdeploy.vl.model.base import MultimodalSpecialTokens
+    from lmdeploy.multimodal.model.base import MultimodalSpecialTokens
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/multimodal/model/qwen2.py
similarity index 98%
rename from lmdeploy/vl/model/qwen2.py
rename to lmdeploy/multimodal/model/qwen2.py
index 98e3f8cd09..af80807d81 100644
--- a/lmdeploy/vl/model/qwen2.py
+++ b/lmdeploy/multimodal/model/qwen2.py
@@ -1,8 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import torch
 
-from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
-from lmdeploy.vl.model.utils import disable_logging
+from lmdeploy.multimodal.model.base import VISION_MODELS, VisionModel
+from lmdeploy.multimodal.model.utils import disable_logging
 
 
 def check_qwen_vl_deps_install():
diff --git a/lmdeploy/vl/model/qwen3.py b/lmdeploy/multimodal/model/qwen3.py
similarity index 94%
rename from lmdeploy/vl/model/qwen3.py
rename to lmdeploy/multimodal/model/qwen3.py
index 733fd54750..484996baa9 100644
--- a/lmdeploy/vl/model/qwen3.py
+++ b/lmdeploy/multimodal/model/qwen3.py
@@ -3,7 +3,7 @@
 from transformers import AutoProcessor
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, MultimodalSpecialTokens, VisionModel
+from lmdeploy.multimodal.model.base import VISION_MODELS, MultimodalSpecialTokens, VisionModel
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/qwen3_5.py b/lmdeploy/multimodal/model/qwen3_5.py
similarity index 97%
rename from lmdeploy/vl/model/qwen3_5.py
rename to lmdeploy/multimodal/model/qwen3_5.py
index efcc971ac3..5aa810068e 100644
--- a/lmdeploy/vl/model/qwen3_5.py
+++ b/lmdeploy/multimodal/model/qwen3_5.py
@@ -5,9 +5,9 @@
 import torch
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, MultimodalSpecialTokens
-from lmdeploy.vl.model.qwen3 import Qwen3VLModel
-from lmdeploy.vl.model.utils import disable_logging
+from lmdeploy.multimodal.model.base import VISION_MODELS, MultimodalSpecialTokens
+from lmdeploy.multimodal.model.qwen3 import Qwen3VLModel
+from lmdeploy.multimodal.model.utils import disable_logging
 
 logger = get_logger('lmdeploy')
 
diff --git a/lmdeploy/vl/model/qwen3_omni.py b/lmdeploy/multimodal/model/qwen3_omni.py
similarity index 94%
rename from lmdeploy/vl/model/qwen3_omni.py
rename to lmdeploy/multimodal/model/qwen3_omni.py
index c654b375d0..6b2481dac4 100644
--- a/lmdeploy/vl/model/qwen3_omni.py
+++ b/lmdeploy/multimodal/model/qwen3_omni.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from transformers import AutoProcessor
 
-from lmdeploy.vl.model.base import VISION_MODELS, MultimodalSpecialTokens, VisionModel
+from lmdeploy.multimodal.model.base import VISION_MODELS, MultimodalSpecialTokens, VisionModel
 
 
 def check_transformers():
diff --git a/lmdeploy/vl/model/utils.py b/lmdeploy/multimodal/model/utils.py
similarity index 100%
rename from lmdeploy/vl/model/utils.py
rename to lmdeploy/multimodal/model/utils.py
diff --git a/lmdeploy/vl/tools/__init__.py b/lmdeploy/multimodal/tools/__init__.py
similarity index 100%
rename from lmdeploy/vl/tools/__init__.py
rename to lmdeploy/multimodal/tools/__init__.py
diff --git a/lmdeploy/vl/utils.py b/lmdeploy/multimodal/utils.py
similarity index 100%
rename from lmdeploy/vl/utils.py
rename to lmdeploy/multimodal/utils.py
diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
index 00888bfdbb..246bf380ec 100644
--- a/lmdeploy/pytorch/messages.py
+++ b/lmdeploy/pytorch/messages.py
@@ -12,7 +12,7 @@
 from lmdeploy.pytorch.disagg.conn.protocol import MigrationRequest
 from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs, make_multimodal_content_hash
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 from .block import LogicalTokenBlocks
 
diff --git a/lmdeploy/pytorch/models/interns1_pro.py b/lmdeploy/pytorch/models/interns1_pro.py
index 9a03f30640..29b7be2edf 100644
--- a/lmdeploy/pytorch/models/interns1_pro.py
+++ b/lmdeploy/pytorch/models/interns1_pro.py
@@ -10,7 +10,7 @@
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 from .interns1_pro_time_series import InternS1ProTimeSeriesModel
 from .patch import add_prefix, get_build_model_context
diff --git a/lmdeploy/pytorch/models/qwen3_5.py b/lmdeploy/pytorch/models/qwen3_5.py
index 732d9b1166..1ce581d164 100644
--- a/lmdeploy/pytorch/models/qwen3_5.py
+++ b/lmdeploy/pytorch/models/qwen3_5.py
@@ -25,7 +25,7 @@
 )
 from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters
 from lmdeploy.pytorch.weight_loader.model_weight_loader import default_weight_loader, load_weight
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 from .patch import add_prefix, get_build_model_context
 from .qwen2_5_vl import Qwen2_5_VisionRotaryEmbedding as Qwen3_5VisionRotaryEmbedding
diff --git a/lmdeploy/pytorch/models/qwen3_omni_moe_thinker.py b/lmdeploy/pytorch/models/qwen3_omni_moe_thinker.py
index 733c323dbf..eaf068f897 100644
--- a/lmdeploy/pytorch/models/qwen3_omni_moe_thinker.py
+++ b/lmdeploy/pytorch/models/qwen3_omni_moe_thinker.py
@@ -18,7 +18,7 @@
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, FlashAttention, LayerNorm
 from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_qkv_proj, build_rowwise_linear
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 from .qwen3_vl import Qwen3VLVisionBlock, Qwen3VLVisionPatchEmbed, Qwen3VLVisionRotaryEmbedding
 from .qwen3_vl_moe import Qwen3VLMoeTextModel
diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py
index 9a0185e13b..5f082a5baa 100644
--- a/lmdeploy/pytorch/models/qwen3_vl.py
+++ b/lmdeploy/pytorch/models/qwen3_vl.py
@@ -17,7 +17,7 @@
 from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_rowwise_linear
 from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 from .patch import add_prefix
 from .qwen2_5_vl import Qwen2_5_VisionRotaryEmbedding as Qwen3VLVisionRotaryEmbedding
diff --git a/lmdeploy/pytorch/models/utils/model.py b/lmdeploy/pytorch/models/utils/model.py
index 4b140fa692..0a01b7aeb4 100644
--- a/lmdeploy/pytorch/models/utils/model.py
+++ b/lmdeploy/pytorch/models/utils/model.py
@@ -11,7 +11,7 @@
 from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.nn.embedding import ParallelEmbedding
 from lmdeploy.pytorch.nn.linear import build_rowwise_linear
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 
 class BaseModelMetaProcessor:
diff --git a/lmdeploy/pytorch/multimodal/data_type.py b/lmdeploy/pytorch/multimodal/data_type.py
index f778c2aeb5..4bc75dbddc 100644
--- a/lmdeploy/pytorch/multimodal/data_type.py
+++ b/lmdeploy/pytorch/multimodal/data_type.py
@@ -8,7 +8,7 @@
 import torch
 from torch import Tensor
 
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 NestedTensor = Tensor | list[Tensor]
 
diff --git a/lmdeploy/serve/core/vl_async_engine.py b/lmdeploy/serve/core/vl_async_engine.py
index d246a20f75..3700b51d36 100644
--- a/lmdeploy/serve/core/vl_async_engine.py
+++ b/lmdeploy/serve/core/vl_async_engine.py
@@ -21,7 +21,7 @@ def __init__(self,
                  **kwargs) -> None:
         from lmdeploy.serve.processors import MultimodalProcessor
         from lmdeploy.utils import try_import_deeplink
-        from lmdeploy.vl.engine import ImageEncoder
+        from lmdeploy.multimodal.engine import ImageEncoder
 
         if backend == 'pytorch':
             try_import_deeplink(backend_config.device_type)
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 37fb86d7bc..e6037ca17c 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -1392,7 +1392,7 @@ def dummy_get_device_id():
         return 0
 
     if int(os.environ.get('LOCAL_RANK', -1)) > 0:
-        from lmdeploy.vl.model.utils import _set_func
+        from lmdeploy.multimodal.model.utils import _set_func
 
         # the replacement can't be recovered
         _set_func('mmengine.logging.logger._get_device_id', dummy_get_device_id)
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index cf2452935e..4b4c67a85a 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -7,12 +7,12 @@
 from lmdeploy.model import MODELS, BaseChatTemplate
 from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.constants import Modality
-from lmdeploy.vl.media.audio import AudioMediaIO
-from lmdeploy.vl.media.connection import load_from_url
-from lmdeploy.vl.media.image import ImageMediaIO
-from lmdeploy.vl.media.time_series import TimeSeriesMediaIO
-from lmdeploy.vl.media.video import VideoMediaIO
+from lmdeploy.multimodal.constants import Modality
+from lmdeploy.multimodal.media.audio import AudioMediaIO
+from lmdeploy.multimodal.media.connection import load_from_url
+from lmdeploy.multimodal.media.image import ImageMediaIO
+from lmdeploy.multimodal.media.time_series import TimeSeriesMediaIO
+from lmdeploy.multimodal.media.video import VideoMediaIO
 
 logger = get_logger('lmdeploy')
 
@@ -299,7 +299,7 @@ def _is_image_list(obj) -> bool:
     @staticmethod
     def _re_format_prompt_images_pair(prompt: tuple) -> dict:
         """Reformat the prompt to openai message format."""
-        from lmdeploy.vl import load_image
+        from lmdeploy.multimodal import load_image
 
         messages = {'role': 'user', 'content': []}
         prompt, images = prompt
diff --git a/lmdeploy/turbomind/models/qwen3_5.py b/lmdeploy/turbomind/models/qwen3_5.py
index 0d13f372ae..b87ad8bb03 100644
--- a/lmdeploy/turbomind/models/qwen3_5.py
+++ b/lmdeploy/turbomind/models/qwen3_5.py
@@ -26,7 +26,7 @@
 import _turbomind as _tm
 import torch
 
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 from ..builders import (
     AttentionBuilder,
diff --git a/lmdeploy/vl/__init__.py b/lmdeploy/vl/__init__.py
index 5dab7b9a83..63dd255fe8 100644
--- a/lmdeploy/vl/__init__.py
+++ b/lmdeploy/vl/__init__.py
@@ -1,22 +1,3 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from .utils import (
-                    encode_audio_base64,
-                    encode_image_base64,
-                    encode_time_series_base64,
-                    encode_video_base64,
-                    load_audio,
-                    load_image,
-                    load_time_series,
-                    load_video,
-)
-
-__all__ = [
-    'load_image',
-    'load_video',
-    'load_audio',
-    'load_time_series',
-    'encode_image_base64',
-    'encode_video_base64',
-    'encode_audio_base64',
-    'encode_time_series_base64',
-]
+raise ImportError('lmdeploy.vl has been renamed to lmdeploy.multimodal. '
+                  'Please update imports to use lmdeploy.multimodal.')
diff --git a/tests/pytorch/paging/test_block_trie.py b/tests/pytorch/paging/test_block_trie.py
index 8e5cf21033..9fa95fe50f 100644
--- a/tests/pytorch/paging/test_block_trie.py
+++ b/tests/pytorch/paging/test_block_trie.py
@@ -7,7 +7,7 @@
 from lmdeploy.pytorch.messages import SamplingParam, SequenceMeta, UpdateTokenMode
 from lmdeploy.pytorch.multimodal.data_type import MultiModalData
 from lmdeploy.pytorch.paging import Scheduler
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 
 class TestBlockTire:
diff --git a/tests/test_lmdeploy/test_content_merge.py b/tests/test_lmdeploy/test_content_merge.py
index d494a8fcbd..49767f5a2a 100644
--- a/tests/test_lmdeploy/test_content_merge.py
+++ b/tests/test_lmdeploy/test_content_merge.py
@@ -5,7 +5,7 @@
 from PIL import Image
 
 from lmdeploy.serve.processors import MultimodalProcessor
-from lmdeploy.vl.constants import Modality
+from lmdeploy.multimodal.constants import Modality
 
 multimodal_module = sys.modules[MultimodalProcessor.__module__]
 
diff --git a/tests/test_lmdeploy/test_vl/test_hf_chat_template.py b/tests/test_lmdeploy/test_multimodal/test_hf_chat_template.py
similarity index 99%
rename from tests/test_lmdeploy/test_vl/test_hf_chat_template.py
rename to tests/test_lmdeploy/test_multimodal/test_hf_chat_template.py
index b6eabf8a54..e09b767287 100644
--- a/tests/test_lmdeploy/test_vl/test_hf_chat_template.py
+++ b/tests/test_lmdeploy/test_multimodal/test_hf_chat_template.py
@@ -3,7 +3,7 @@
 import pytest
 
 from lmdeploy.model import MODELS
-from lmdeploy.vl.model.builder import load_vl_model
+from lmdeploy.multimodal.model.builder import load_vl_model
 
 
 def get_model_and_chat_template(model_path, trust_remote_code=False):
diff --git a/tests/test_lmdeploy/test_vl/test_vl_encode.py b/tests/test_lmdeploy/test_multimodal/test_multimodal_encode.py
similarity index 99%
rename from tests/test_lmdeploy/test_vl/test_vl_encode.py
rename to tests/test_lmdeploy/test_multimodal/test_multimodal_encode.py
index 276d3cb936..82da21b95f 100644
--- a/tests/test_lmdeploy/test_vl/test_vl_encode.py
+++ b/tests/test_lmdeploy/test_multimodal/test_multimodal_encode.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from lmdeploy.vl import (
+from lmdeploy.multimodal import (
     encode_audio_base64,
     encode_image_base64,
     encode_time_series_base64,
diff --git a/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py b/tests/test_lmdeploy/test_multimodal/test_nonhf_chat_template.py
similarity index 99%
rename from tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
rename to tests/test_lmdeploy/test_multimodal/test_nonhf_chat_template.py
index bf6399647b..73327743ed 100644
--- a/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
+++ b/tests/test_lmdeploy/test_multimodal/test_nonhf_chat_template.py
@@ -3,7 +3,7 @@
 import pytest
 
 from lmdeploy.model import MODELS
-from lmdeploy.vl.model.builder import load_vl_model
+from lmdeploy.multimodal.model.builder import load_vl_model
 
 
 def get_model_and_chat_template(model_path, trust_remote_code=False):
diff --git a/tests/test_lmdeploy/test_vl/test_preprocess_utils.py b/tests/test_lmdeploy/test_multimodal/test_preprocess_utils.py
similarity index 95%
rename from tests/test_lmdeploy/test_vl/test_preprocess_utils.py
rename to tests/test_lmdeploy/test_multimodal/test_preprocess_utils.py
index 22084f0e88..a060f4ae28 100644
--- a/tests/test_lmdeploy/test_vl/test_preprocess_utils.py
+++ b/tests/test_lmdeploy/test_multimodal/test_preprocess_utils.py
@@ -2,8 +2,8 @@
 
 import torch
 
-from lmdeploy.vl.constants import Modality
-from lmdeploy.vl.model.preprocess_utils import get_expanded_mm_items
+from lmdeploy.multimodal.constants import Modality
+from lmdeploy.multimodal.model.preprocess_utils import get_expanded_mm_items
 
 
 class _Tokens:
diff --git a/tests/test_lmdeploy/test_vl/test_qwen3_omni_processor.py b/tests/test_lmdeploy/test_multimodal/test_qwen3_omni_processor.py
similarity index 97%
rename from tests/test_lmdeploy/test_vl/test_qwen3_omni_processor.py
rename to tests/test_lmdeploy/test_multimodal/test_qwen3_omni_processor.py
index 596467c1a2..0b88300672 100644
--- a/tests/test_lmdeploy/test_vl/test_qwen3_omni_processor.py
+++ b/tests/test_lmdeploy/test_multimodal/test_qwen3_omni_processor.py
@@ -5,10 +5,10 @@
 from lmdeploy.pytorch.models.qwen3_omni_moe_thinker import Qwen3OmniInputProcessor
 from lmdeploy.pytorch.models.utils.model import DeployModelMixinV1
 from lmdeploy.pytorch.multimodal.data_type import MultiModalData
-from lmdeploy.vl.constants import Modality
-from lmdeploy.vl.model.base import MultimodalSpecialTokens
-from lmdeploy.vl.model.preprocess_utils import get_expanded_mm_items
-from lmdeploy.vl.model.qwen3_omni import Qwen3OmniModel
+from lmdeploy.multimodal.constants import Modality
+from lmdeploy.multimodal.model.base import MultimodalSpecialTokens
+from lmdeploy.multimodal.model.preprocess_utils import get_expanded_mm_items
+from lmdeploy.multimodal.model.qwen3_omni import Qwen3OmniModel
 
 
 class FakeQwen3OmniProcessor:
diff --git a/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py b/tests/test_lmdeploy/test_multimodal/test_qwen3vl_processor.py
similarity index 97%
rename from tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py
rename to tests/test_lmdeploy/test_multimodal/test_qwen3vl_processor.py
index 5e13d4b4bf..59bd74d1c9 100644
--- a/tests/test_lmdeploy/test_vl/test_qwen3vl_processor.py
+++ b/tests/test_lmdeploy/test_multimodal/test_qwen3vl_processor.py
@@ -1,8 +1,8 @@
 import pytest
 
-from lmdeploy.vl import load_image, load_video
-from lmdeploy.vl.constants import Modality
-from lmdeploy.vl.model.qwen3 import Qwen3VLModel
+from lmdeploy.multimodal import load_image, load_video
+from lmdeploy.multimodal.constants import Modality
+from lmdeploy.multimodal.model.qwen3 import Qwen3VLModel
 
 QWEN3VL_MODELS = [
     'Qwen/Qwen3-VL-4B-Instruct',
diff --git a/tests/test_lmdeploy/test_vl/test_safe_url.py b/tests/test_lmdeploy/test_multimodal/test_safe_url.py
similarity index 92%
rename from tests/test_lmdeploy/test_vl/test_safe_url.py
rename to tests/test_lmdeploy/test_multimodal/test_safe_url.py
index f919b3f0b9..1f168f01dd 100644
--- a/tests/test_lmdeploy/test_vl/test_safe_url.py
+++ b/tests/test_lmdeploy/test_multimodal/test_safe_url.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from lmdeploy.vl.media.connection import _is_safe_url, _load_http_url
+from lmdeploy.multimodal.media.connection import _is_safe_url, _load_http_url
 
 
 @pytest.mark.parametrize(
@@ -35,7 +35,7 @@ def test_is_safe_url(url, expected_safe, mock_ips):
 
 
 @patch('requests.Session.get')
-@patch('lmdeploy.vl.media.connection._is_safe_url', return_value=(True, ''))
+@patch('lmdeploy.multimodal.media.connection._is_safe_url', return_value=(True, ''))
 def test_load_http_url_logic(mock_safe, mock_get):
     media_io = MagicMock()
     url_spec = urlparse('https://example.com/img.jpg')