
Commit 3ae735d

Merge branch 'upstream_main' into v1_embeddings_runner
Signed-off-by: Max de Bayser <[email protected]>
2 parents 364ec25 + 02658c2 commit 3ae735d

106 files changed (+2071 / -509 lines)


.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 5 additions & 8 deletions
@@ -6,6 +6,7 @@ set -ex
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
 export CMAKE_BUILD_PARALLEL_LEVEL=32
@@ -23,10 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
@@ -56,7 +55,7 @@ function cpu_tests() {
   # Run AWQ test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    VLLM_USE_V1=0 pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
@@ -68,8 +67,6 @@ function cpu_tests() {
   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-   export VLLM_CPU_KVCACHE_SPACE=10
-   export VLLM_CPU_OMP_THREADS_BIND=$1
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
    python3 benchmarks/benchmark_serving.py \
@@ -89,4 +86,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ run_and_track_test 9 "test_multimodal.py" \
 run_and_track_test 10 "test_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 run_and_track_test 11 "test_struct_output_generate.py" \
-  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k 'not test_structured_output_with_reasoning_matrices'"
 run_and_track_test 12 "test_moe_pallas.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 13 "test_lora.py" \

.buildkite/test-pipeline.yaml

Lines changed: 3 additions & 0 deletions
@@ -145,6 +145,7 @@ steps:
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   - tests/v1/test_async_llm_dp.py
+  - tests/v1/engine/test_engine_core_client.py
   commands:
   # test with tp=2 and external_dp=2
   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@@ -154,6 +155,7 @@ steps:
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
@@ -318,6 +320,7 @@ steps:
   # these tests need to be separated, cannot combine
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
+  - pytest -v -s compile/piecewise/test_full_cudagraph.py
 
 - label: PyTorch Fullgraph Test # 18min
   mirror_hardwares: [amdexperimental, amdproduction]

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -242,6 +242,7 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/layernorm_quant_kernels.cu"
+  "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"

benchmarks/benchmark_dataset.py

Lines changed: 9 additions & 1 deletion
@@ -865,7 +865,15 @@ def sample(
         for item in self.data:
             if len(sampled_requests) >= num_requests:
                 break
-            prompt = f"{item['instruction']}:\n{item['input']}"
+            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
+the code, do not include any explanation."
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
             prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
                 SampleRequest(
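For context, the reworked sampling path above wraps the instruction/input pair in the model's chat template before measuring the prompt length. A minimal standalone sketch of that flow is shown here; the model name and the item fields are placeholders, not part of this commit.

from transformers import AutoTokenizer

# Placeholder chat-tuned model; any tokenizer with a chat template works.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

item = {"input": "def add(a, b):\n    return a - b", "instruction": "Fix the bug."}
prompt = f"{item['input']}\n\n{item['instruction']} Just output \
the code, do not include any explanation."

# Render the chat template as text (tokenize=False) so the prompt length
# can still be measured with an ordinary tokenizer call afterwards.
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    add_generation_prompt=True,
    tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)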

csrc/attention/mla/cutlass_mla_kernels.cu

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
   {static_cast<ElementOut*>(out.data_ptr()), stride_O,
    static_cast<ElementAcc*>(nullptr), stride_LSE},
   hw_info,
-  -1, // split_kv
+  1, // split_kv
   nullptr, // is_var_split_kv
   };
   // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute

csrc/ops.h

Lines changed: 5 additions & 0 deletions
@@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                         torch::Tensor& weight, double epsilon);
 
+void apply_repetition_penalties_(torch::Tensor& logits,
+                                 const torch::Tensor& prompt_mask,
+                                 const torch::Tensor& output_mask,
+                                 const torch::Tensor& repetition_penalties);
+
 void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                                torch::Tensor& weight, torch::Tensor& scale,
                                double epsilon);
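Only the declaration of apply_repetition_penalties_ appears here (presumably backed by the csrc/sampler.cu kernel added in CMakeLists.txt above). As a rough reference for the intended semantics, the sketch below reproduces the standard repetition-penalty update in plain PyTorch; the tensor shapes and the helper name are assumptions for illustration, not part of this commit.

import torch

def apply_repetition_penalties_ref(logits: torch.Tensor,
                                   prompt_mask: torch.Tensor,
                                   output_mask: torch.Tensor,
                                   repetition_penalties: torch.Tensor) -> torch.Tensor:
    # logits:               [num_seqs, vocab_size] float tensor
    # prompt_mask/output_mask: [num_seqs, vocab_size] bool tensors marking tokens
    #                       that appeared in the prompt / generated output
    # repetition_penalties: [num_seqs] float tensor, typically >= 1.0
    penalties = repetition_penalties.unsqueeze(1).expand_as(logits)
    seen = prompt_mask | output_mask
    # Penalize only tokens already seen: divide positive logits, multiply
    # negative logits, leave unseen tokens untouched.
    penalized = torch.where(logits > 0, logits / penalties, logits * penalties)
    return torch.where(seen, penalized, logits)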

csrc/quantization/fp8/common.cu

Lines changed: 19 additions & 16 deletions
@@ -39,33 +39,33 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
   fp8_type* __restrict__ token_output = &out[offset];
 
   // For vectorization, token_input and token_output pointers need to be
-  // aligned at 8-byte and 4-byte addresses respectively.
-  bool const can_vectorize = hidden_size % 4 == 0;
+  // aligned at 32-byte and 16-byte addresses respectively.
+  bool const can_vectorize = hidden_size % 16 == 0;
 
   float absmax_val = 0.0f;
   if (can_vectorize) {
     absmax_val = thread_max_vec(token_input, hidden_size, tid, blockDim.x);
   } else {
     for (int i = tid; i < hidden_size; i += blockDim.x) {
       float const x = static_cast<float>(token_input[i]);
-      absmax_val = max(absmax_val, fabs(x));
+      absmax_val = fmaxf(absmax_val, fabsf(x));
     }
   }
 
-  using BlockReduce = cub::BlockReduce<float, 1024>;
+  using BlockReduce = cub::BlockReduce<float, 256>;
   __shared__ typename BlockReduce::TempStorage reduceStorage;
   float const block_absmax_val_maybe =
       BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
   __shared__ float token_scale;
   if (tid == 0) {
     if (scale_ub) {
-      token_scale = min(block_absmax_val_maybe, *scale_ub);
+      token_scale = fminf(block_absmax_val_maybe, *scale_ub);
     } else {
       token_scale = block_absmax_val_maybe;
     }
     // token scale computation
-    token_scale = max(token_scale / quant_type_max_v<fp8_type>,
-                      min_scaling_factor<fp8_type>::val());
+    token_scale = fmaxf(token_scale / quant_type_max_v<fp8_type>,
+                        min_scaling_factor<fp8_type>::val());
     scale[token_idx] = token_scale;
   }
   __syncthreads();
@@ -88,10 +88,11 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d]
                              torch::Tensor const& input, // [..., d]
                              torch::Tensor const& scale) // [1]
 {
-  int64_t num_tokens = input.numel() / input.size(-1);
-  int64_t num_elems = input.numel();
-  dim3 grid(num_tokens);
-  dim3 block(1024);
+  int const block_size = 256;
+  int const num_tokens = input.numel() / input.size(-1);
+  int const num_elems = input.numel();
+  dim3 const grid(num_tokens);
+  dim3 const block(block_size);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
@@ -110,10 +111,11 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d]
                               torch::Tensor const& input, // [..., d]
                               torch::Tensor& scale) // [1]
 {
-  int64_t num_tokens = input.numel() / input.size(-1);
-  int64_t num_elems = input.numel();
-  dim3 grid(num_tokens);
-  dim3 block(1024);
+  int const block_size = 256;
+  int const num_tokens = input.numel() / input.size(-1);
+  int const num_elems = input.numel();
+  dim3 const grid(num_tokens);
+  dim3 const block(block_size);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
@@ -141,8 +143,9 @@ void dynamic_per_token_scaled_fp8_quant(
 
   int const hidden_size = input.size(-1);
   int const num_tokens = input.numel() / hidden_size;
+  int const block_size = 256;
   dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, 1024));
+  dim3 const block(std::min(hidden_size, block_size));
 
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
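These launch changes cap the block size at 256 threads, and the kernel uses cub::BlockReduce over that block to find each token's absolute maximum before scaling. For intuition only, here is a minimal PyTorch sketch of the dynamic per-token FP8 scaling the kernel implements; the function and variable names are illustrative, the minimum-scale guard mirrors the kernel's min_scaling_factor as an assumption, and the fused CUDA path additionally vectorizes loads by 16 elements.

import torch

def dynamic_per_token_fp8_quant_ref(x: torch.Tensor,
                                    scale_ub: float | None = None,
                                    fp8_max: float = 448.0,               # E4M3 max
                                    min_scale: float = 1.0 / (448.0 * 512.0)):
    # x: [num_tokens, hidden_size]; returns (fp8 tensor, per-token scales).
    absmax = x.abs().amax(dim=-1, keepdim=True).float()   # per-token absmax
    if scale_ub is not None:
        absmax = absmax.clamp(max=scale_ub)                # optional upper bound
    scale = (absmax / fp8_max).clamp(min=min_scale)        # guard against all-zero rows
    q = (x / scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return q, scale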

csrc/quantization/fp8/common.cuh

Lines changed: 35 additions & 33 deletions
@@ -46,7 +46,7 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
   }
 
   float r =
-      fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
+      fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>));
 #ifndef USE_ROCM
   return static_cast<fp8_type>(r);
 #else
@@ -65,15 +65,15 @@ template <typename scalar_t, typename fp8_type>
 __global__ void segmented_max_reduction(float* __restrict__ scale,
                                         const scalar_t* __restrict__ input,
                                         int64_t num_elems) {
-  __shared__ float cache[1024];
+  __shared__ float cache[256];
   int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
 
   // First store maximum for all values processes by
   // the current thread in cache[threadIdx.x]
   scalar_t tmp = 0.0;
   while (i < num_elems) {
     float x = static_cast<float>(input[i]);
-    tmp = max(tmp, fabs(x));
+    tmp = fmaxf(tmp, fabsf(x));
     i += blockDim.x * gridDim.x;
   }
   cache[threadIdx.x] = tmp;
@@ -100,25 +100,27 @@ template <typename scalar_t>
 __device__ float thread_max_vec(scalar_t const* __restrict__ input,
                                 int64_t const num_elems, int const tid,
                                 int const step) {
+  constexpr size_t VEC_SIZE = 16;
+  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
   // Vectorized input/output to better utilize memory bandwidth.
-  vec4_t<scalar_t> const* vectorized_in =
-      reinterpret_cast<vec4_t<scalar_t> const*>(input);
+  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
 
-  int64_t const num_vec_elems = num_elems >> 2;
+  // num_elems / VEC_SIZE (which is 16)
+  int64_t const num_vec_elems = num_elems >> 4;
   float absmax_val = 0.0f;
 
-#pragma unroll 4
+#pragma unroll
   for (int64_t i = tid; i < num_vec_elems; i += step) {
-    vec4_t<scalar_t> in_vec = vectorized_in[i];
-    absmax_val = max(absmax_val, fabs(in_vec.x));
-    absmax_val = max(absmax_val, fabs(in_vec.y));
-    absmax_val = max(absmax_val, fabs(in_vec.z));
-    absmax_val = max(absmax_val, fabs(in_vec.w));
+    scalarxN_t in_vec = vectorized_in[i];
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j]));
+    }
   }
 
-  // Handle the remaining elements if num_elems is not divisible by 4
-  for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
-    absmax_val = max(absmax_val, fabs(input[i]));
+  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
+  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
+    absmax_val = fmaxf(absmax_val, fabsf(input[i]));
   }
 
   return absmax_val;
@@ -130,31 +132,31 @@ __device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out,
                                           float const scale,
                                           int64_t const num_elems,
                                           int const tid, int const step) {
-  using float8x4_t = q8x4_t<fp8_type>;
+  constexpr size_t VEC_SIZE = 16;
+  using scalarxN_t = vec_n_t<scalar_t, VEC_SIZE>;
+  using float8xN_t = q8_n_t<fp8_type, VEC_SIZE>;
   // Vectorized input/output to better utilize memory bandwidth.
-  auto const* vectorized_in = reinterpret_cast<vec4_t<scalar_t> const*>(input);
-  auto* vectorized_out = reinterpret_cast<float8x4_t*>(out);
+  auto const* vectorized_in = reinterpret_cast<scalarxN_t const*>(input);
+  auto* vectorized_out = reinterpret_cast<float8xN_t*>(out);
 
-  int64_t const num_vec_elems = num_elems >> 2;
+  // num_elems / VEC_SIZE (which is 16)
+  int64_t const num_vec_elems = num_elems >> 4;
 
-#pragma unroll 4
+#pragma unroll
   for (int64_t i = tid; i < num_vec_elems; i += step) {
-    vec4_t<scalar_t> in_vec = vectorized_in[i];
-    float8x4_t out_vec;
-
-    out_vec.x = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.x), scale);
-    out_vec.y = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.y), scale);
-    out_vec.z = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.z), scale);
-    out_vec.w = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
-        static_cast<float>(in_vec.w), scale);
+    scalarxN_t in_vec = vectorized_in[i];
+    float8xN_t out_vec;
+
+#pragma unroll
+    for (int j = 0; j < VEC_SIZE; ++j) {
+      out_vec.val[j] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
+          static_cast<float>(in_vec.val[j]), scale);
+    }
     vectorized_out[i] = out_vec;
   }
 
-  // Handle the remaining elements if num_elems is not divisible by 4
-  for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) {
+  // Handle the remaining elements if num_elems is not divisible by VEC_SIZE
+  for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) {
     out[i] = scaled_fp8_conversion<is_scale_inverted, fp8_type>(
         static_cast<float>(input[i]), scale);
   }
