
Commit e1db341
Phi3 runner uses TextLLMRunner (#12482)
This PR was created by the merge bot to help merge the original PR into the main branch.

ghstack PR number: #12477 by @larryliu0820
^ Please use this as the source of truth for the PR details, comments, and reviews
ghstack PR base: https://github.com/pytorch/executorch/tree/gh/larryliu0820/69/base
ghstack PR head: https://github.com/pytorch/executorch/tree/gh/larryliu0820/69/head
Merge bot PR base: https://github.com/pytorch/executorch/tree/main
Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/larryliu0820/69/orig
@diff-train-skip-merge

Co-authored-by: Mengwei Liu <[email protected]>
1 parent 00491fd commit e1db341

11 files changed, +147 -269 lines

.ci/scripts/test_phi_3_mini.sh
Lines changed: 5 additions & 22 deletions

```diff
@@ -22,31 +22,14 @@ NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
 cmake_install_executorch_libraries() {
-  cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-    -DEXECUTORCH_ENABLE_LOGGING=1 \
-    -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -B${BUILD_DIR} .
-
-  cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE}
+  rm -rf cmake-out
+  cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
+  cmake --build cmake-out -j16 --target install --config ${BUILD_TYPE}
 }
 
 cmake_build_phi_3_mini() {
-  cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
-    -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+  cmake -DCMAKE_PREFIX_PATH=${BUILD_DIR} \
     -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -B${BUILD_DIR}/${MODEL_DIR} \
     ${MODEL_DIR}
 
@@ -81,7 +64,7 @@ run_and_verify() {
   ${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \
     --model_path=phi-3-mini.pte \
     --tokenizer_path=tokenizer.bin \
-    --seq_len=128 \
+    --seq_len=60 \
    --temperature=0 \
    --prompt="<|system|>
 You are a helpful assistant.<|end|>
```

.github/workflows/pull.yml
Lines changed: 1 addition & 1 deletion

```diff
@@ -603,7 +603,7 @@ jobs:
           bash examples/models/phi-3-mini/install_requirements.sh
 
           # run e2e (export, tokenizer and runner)
-          PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh
+          PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh Release
 
   test-eval_llama-wikitext-linux:
     name: test-eval_llama-wikitext-linux
```

examples/models/phi-3-mini/CMakeLists.txt
Lines changed: 20 additions & 23 deletions

```diff
@@ -13,43 +13,40 @@
 # It should also be cmake-lint clean.
 #
 
-cmake_minimum_required(VERSION 3.19)
+cmake_minimum_required(VERSION 3.24)
+cmake_policy(SET CMP0144 NEW)
 project(phi_3_mini_runner)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)
-set(CMAKE_BUILD_TYPE Release)
 
-# Set options for executorch build.
-option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "" ON)
-option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON)
-option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
-option(EXECUTORCH_BUILD_XNNPACK "" ON)
+set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
+set(_common_include_directories
+    ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10
+)
+set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch)
+find_package(executorch CONFIG REQUIRED)
+
+target_link_options_shared_lib(executorch)
 
+set(BUILD_TESTING OFF)
 add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../.. ${CMAKE_BINARY_DIR}/../../..
+  ${EXECUTORCH_ROOT}/extension/llm/runner
+  ${CMAKE_BINARY_DIR}/../../../extension/llm/runner
 )
+
 if(NOT TARGET gflags)
   add_subdirectory(
     ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags
     ${CMAKE_BINARY_DIR}/gflags
   )
 endif()
 
-add_executable(
-  phi_3_mini_runner
-  main.cpp runner.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/sampler/sampler.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/src/llama2c_tokenizer.cpp
-)
-target_include_directories(
-  phi_3_mini_runner
-  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src
-         ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/include
-)
+add_executable(phi_3_mini_runner main.cpp)
+
+target_link_directories(phi_3_mini_runner PUBLIC ${_common_include_directories})
+
 target_link_libraries(
-  phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor
-                            optimized_native_cpu_ops_lib xnnpack_backend gflags
+  phi_3_mini_runner PUBLIC executorch optimized_native_cpu_ops_lib
+                           xnnpack_backend gflags extension_llm_runner
 )
```

examples/models/phi-3-mini/README.md
Lines changed: 13 additions & 29 deletions

````diff
@@ -4,9 +4,9 @@ This example demonstrates how to run a [Phi-3-mini](https://huggingface.co/micro
 # Instructions
 ## Step 1: Setup
 1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_executorch.sh`
-2. Currently, we support transformers v4.44.2. Install transformers with the following command:
+2. Currently, we support transformers v4.53.1. Install transformers with the following command:
 ```
-pip uninstall -y transformers ; pip install transformers==4.44.2
+pip uninstall -y transformers ; pip install transformers==4.53.1
 ```
 ## Step 2: Prepare and run the model
 1. Download the `tokenizer.model` from HuggingFace and create `tokenizer.bin`.
@@ -17,41 +17,25 @@ python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokeniz
 ```
 2. Export the model. This step will take a few minutes to finish.
 ```
-python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte
+python -m executorch.examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-mini.pte
 ```
 3. Build and run the model.
-- Build executorch with optimized CPU performance as follows. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59).
-```
-cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DEXECUTORCH_ENABLE_LOGGING=1 \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -Bcmake-out .
+- Build executorch with LLM preset:
+```
+cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out
 
-cmake --build cmake-out -j16 --target install --config Release
-```
+cmake --build cmake-out -j16 --target install --config Release
+```
 - Build Phi-3-mini runner.
 ```
-cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -Bcmake-out/examples/models/phi-3-mini \
-    examples/models/phi-3-mini
+cmake -DCMAKE_PREFIX_PATH=cmake-out \
+    -DCMAKE_BUILD_TYPE=Release \
+    -Bcmake-out/examples/models/phi-3-mini \
+    examples/models/phi-3-mini
 
 cmake --build cmake-out/examples/models/phi-3-mini -j16 --config Release
 ```
-- Run model. Options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/phi-3-mini/main.cpp#L13-L30)
+- Run model. Options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/phi-3-mini/main.cpp#L16-L33)
 ```
 cmake-out/examples/models/phi-3-mini/phi_3_mini_runner \
     --model_path=phi-3-mini.pte \
````

examples/models/phi-3-mini/export_phi-3-mini.py
Lines changed: 81 additions & 33 deletions

```diff
@@ -19,13 +19,42 @@
     XNNPACKQuantizer,
 )
 from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
-from executorch.exir import to_edge
+from executorch.exir import to_edge_transform_and_lower
+from executorch.exir.capture._config import ExecutorchBackendConfig
+from executorch.exir.passes import MemoryPlanningPass
+from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 from torch.export import export_for_training
+from torch.nn.attention import SDPBackend
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
 
 from transformers import Phi3ForCausalLM
+from transformers.cache_utils import StaticCacheConfig
 
-from .phi_3_mini import Phi3Mini
+from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM
+
+
+def _prepare_export_inputs(max_seq_len: int, sliding_window: int):
+    """
+    Prepare example inputs and configurations for export.
+
+    Returns:
+        example_input_ids (torch.Tensor): Example input IDs tensor.
+        example_cache_position (torch.Tensor): Example cache position tensor.
+        dynamic_shapes (dict or None): Dynamic shape specifications for export.
+        strict (bool): Whether to use strict export mode.
+    """
+    # Prepare inputs with dynamic shapes
+    seq_length = 3  # Sequence length > 1 to avoid specialization issues
+    example_input_ids = torch.zeros((1, seq_length), dtype=torch.long)
+    example_cache_position = torch.arange(seq_length, dtype=torch.long)
+    max_dim = min(max_seq_len, sliding_window) - 1
+    seq_len_dim = torch.export.Dim("seq_length_dim", max=max_dim)
+    dynamic_shapes = {
+        "input_ids": {1: seq_len_dim},
+        "cache_position": {0: seq_len_dim},
+    }
+
+    return example_input_ids, example_cache_position, dynamic_shapes
 
 
 def export(args) -> None:
@@ -40,51 +69,70 @@ def export(args) -> None:
             f"Invalid context length {args.context_length}. Should be either 4k or 128k"
         )
 
-    with torch.no_grad():
-        model = Phi3Mini(
-            # pyre-ignore: Undefined attribute [16]: Module `transformers` has no attribute `Phi3ForCausalLM`
-            model=Phi3ForCausalLM.from_pretrained(model_name),
+    with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
+        model = Phi3ForCausalLM.from_pretrained(model_name)
+        model.generation_config.cache_implementation = "static"
+        model.generation_config.cache_config = StaticCacheConfig(
+            batch_size=1, max_cache_len=model.config.max_position_embeddings
+        )
+
+        exportable_module = TorchExportableModuleForDecoderOnlyLM(
+            model,
             max_batch_size=1,
-            max_seq_len=args.seq_len,
+            max_cache_len=model.config.max_position_embeddings,
         )
-        example_inputs = (
-            torch.tensor(
-                [[1048, 263, 931, 746]], dtype=torch.long, requires_grad=False
-            ),
+        input_ids, cache_position, dynamic_shapes = _prepare_export_inputs(
+            model.config.max_position_embeddings, model.config.sliding_window
+        )
+        example_inputs = (input_ids, cache_position)
+        exported_program = exportable_module.export(
+            input_ids, cache_position, dynamic_shapes, strict=False
+        )
+        # Apply RemoveTransposes pass to remove
+        # any back-to-back transpose ops that are not needed
+        # e.g. output of update_cache is transposed and
+        # input to custom_sdpa is transposed.
+        from executorch.extension.llm.export.export_passes import (
+            RemoveRedundantTransposes,
         )
-        dynamic_shapes = {
-            "input_ids": {
-                1: torch.export.Dim("sequence_length", min=1, max=args.seq_len)
-            }
-        }
+
+        mutated_gm = RemoveRedundantTransposes()(exported_program.module())[0]
 
         xnnpack_quant_config = get_symmetric_quantization_config(
             is_per_channel=True, is_dynamic=True
         )
         xnnpack_quantizer = XNNPACKQuantizer()
         xnnpack_quantizer.set_global(xnnpack_quant_config)
 
-        model = export_for_training(
-            model, example_inputs, dynamic_shapes=dynamic_shapes, strict=True
-        ).module()
-        model = prepare_pt2e(model, xnnpack_quantizer)  # pyre-fixme[6]
-        model(*example_inputs)
-        model = convert_pt2e(model)
-        DuplicateDynamicQuantChainPass()(model)
-        # TODO(lunwenh): update it to use export once
-        # https://github.com/pytorch/pytorch/issues/128394 is resolved.
-        model = torch.export._trace._export(
-            model,
-            example_inputs,
-            dynamic_shapes=dynamic_shapes,
-            strict=False,
-            pre_dispatch=False,
+        gm = prepare_pt2e(mutated_gm, xnnpack_quantizer)  # pyre-fixme[6]
+        gm(*example_inputs)
+        gm = convert_pt2e(gm)
+        DuplicateDynamicQuantChainPass()(gm)
+        exported_program = export_for_training(
+            gm, example_inputs, dynamic_shapes=dynamic_shapes, strict=False
        )
 
         edge_config = get_xnnpack_edge_compile_config()
-        edge_manager = to_edge(model, compile_config=edge_config)
+        edge_manager = to_edge_transform_and_lower(
+            exported_program,
+            partitioner=[XnnpackPartitioner()],
+            compile_config=edge_config,
+            constant_methods={
+                "get_eos_ids": [32000],
+                "use_kv_cache": True,
+                "enable_dynamic_shape": True,
+                "get_max_seq_len": model.config.max_position_embeddings - 1,
+            },
+        )
         edge_manager = edge_manager.to_backend(XnnpackPartitioner())
-        et_program = edge_manager.to_executorch()
+        et_program = edge_manager.to_executorch(
+            ExecutorchBackendConfig(
+                extract_delegate_segments=True,
+                do_quant_fusion_and_const_prop=True,
+                memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
+                sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
+            )
+        )
 
     with open(args.output_name, "wb") as file:
         file.write(et_program.buffer)
```
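The `constant_methods` baked in here are how the exported `.pte` carries its own metadata: the runner can query methods such as `get_max_seq_len`, `use_kv_cache`, and `get_eos_ids` at load time instead of hard-coding them. Below is a minimal sketch of reading such a baked-in constant from C++, assuming the standard `executorch::extension::Module` API; the method names come from the diff above, while the file name and the printing around the calls are illustrative only.

```cpp
#include <executorch/extension/module/module.h>

#include <iostream>

using executorch::extension::Module;

int main() {
  // The .pte produced by export_phi-3-mini.py (file name assumed here).
  Module module("phi-3-mini.pte");

  // Each entry in constant_methods becomes a zero-argument method that
  // returns the value baked in at export time.
  auto max_seq_len = module.execute("get_max_seq_len");
  if (max_seq_len.ok()) {
    // Exported as max_position_embeddings - 1 in the diff above.
    std::cout << "max_seq_len: " << max_seq_len.get()[0].toInt() << std::endl;
  }

  auto use_kv_cache = module.execute("use_kv_cache");
  if (use_kv_cache.ok()) {
    std::cout << "use_kv_cache: " << use_kv_cache.get()[0].toBool()
              << std::endl;
  }
  return 0;
}
```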

examples/models/phi-3-mini/install_requirements.sh
Lines changed: 0 additions & 2 deletions

```diff
@@ -7,8 +7,6 @@
 
 set -x
 
-pip install transformers==4.44.2
-
 pip install sentencepiece
 
 pip list
```

examples/models/phi-3-mini/main.cpp
Lines changed: 13 additions & 3 deletions

```diff
@@ -6,9 +6,12 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/extension/llm/runner/text_llm_runner.h>
 #include <gflags/gflags.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
+#include <iostream>
 
-#include <executorch/examples/models/phi-3-mini/runner.h>
+using executorch::extension::llm::TextLLMRunner;
 
 DEFINE_string(
     model_path,
@@ -42,9 +45,16 @@ int main(int32_t argc, char** argv) {
 
   int32_t seq_len = FLAGS_seq_len;
 
-  example::Runner runner(model_path, tokenizer_path, temperature);
+  std::unique_ptr<tokenizers::Tokenizer> tokenizer =
+      std::make_unique<tokenizers::Llama2cTokenizer>();
+  tokenizer->load(tokenizer_path);
 
-  runner.generate(prompt, seq_len);
+  auto runner = executorch::extension::llm::create_text_llm_runner(
+      model_path, std::move(tokenizer));
+
+  runner->generate(
+      prompt,
+      {.seq_len = seq_len, .temperature = static_cast<float>(temperature)});
 
   return 0;
 }
```
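Put together, the new runner flow is only a few calls. Here is a self-contained sketch of the same pattern, assuming the headers and factory shown in the diff; the model and tokenizer paths, the prompt, and the null check are illustrative additions, not part of the change.

```cpp
#include <executorch/extension/llm/runner/text_llm_runner.h>
#include <pytorch/tokenizers/llama2c_tokenizer.h>

#include <iostream>
#include <memory>
#include <utility>

int main() {
  // Load the tokenizer first; the runner takes ownership of it.
  auto tokenizer = std::make_unique<tokenizers::Llama2cTokenizer>();
  tokenizer->load("tokenizer.bin");  // path assumed

  // create_text_llm_runner replaces the hand-rolled example::Runner and
  // wires the model, tokenizer, and sampling loop together.
  auto runner = executorch::extension::llm::create_text_llm_runner(
      "phi-3-mini.pte", std::move(tokenizer));  // path assumed
  if (!runner) {  // defensive check, assumed rather than shown in the diff
    std::cerr << "failed to create runner" << std::endl;
    return 1;
  }

  // GenerationConfig is passed inline; .seq_len caps the total sequence
  // length and .temperature = 0 makes decoding greedy.
  runner->generate(
      "Tell me a short joke.", {.seq_len = 60, .temperature = 0.0f});
  return 0;
}
```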

examples/models/phi-3-mini/phi_3_mini.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -30,7 +30,7 @@ def __init__(self, model: Phi3ForCausalLM, max_batch_size: int, max_seq_len: int
     def forward(
         self,
         # pyre-fixme[9]: input_ids has type `LongTensor`; used as `None`.
-        input_ids: torch.LongTensor = None,
+        input_ids: torch.LongTensor,
     ) -> torch.FloatTensor:
         # pyre-fixme[16]: `Phi3ForCausalLM` has no attribute `forward`.
         return self.model.forward(
```
