Skip to content
This repository was archived by the owner on Jan 13, 2025. It is now read-only.

Commit 48a5361

Browse files
OuadiElfarouki, s-Nick, pgorlani
authored
Added OmatAdd operator support (#429)
This patch adds support for OmatAdd BLAS extension operator along with the relevant tests and benchmarks. Co-authored-by: s-Nick <[email protected]> Co-authored-by: pgorlani <[email protected]>
1 parent 26b8c4b commit 48a5361

File tree

22 files changed

+1479
-10
lines changed

22 files changed

+1479
-10
lines changed

README.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ For all these operations:
294294

295295
### EXTENSION
296296

297-
The following table sums up the interface that cab be found in
297+
The following table sums up the interface that can be found in
298298
[extension_interface.h](include/interface/extension_interface.h).
299299

300300
For all these operations:
@@ -304,19 +304,26 @@ For all these operations:
304304
(cf BLAS 2). The leading dimension of a matrix must be greater than or equal
305305
to its number of rows. In the case of in-place transpose, the same matrix `A`
306306
is used with two different leading dimensions for input & output.
307+
* `stride_a`, `stride_b` and `stride_c` are the striding size between consecutive
308+
matrices in a batched entry for inputs/outputs A, B and C.
309+
* `inc_a` and `inc_b` are the distance between consecutive elements in A & B matrices.
307310
* `transa` and `transb` are the transpose modes of the matrices A and B
308311
(cf BLAS 2).
309312
* `M` and `N` are the dimensions of the matrices.
310-
* `alpha` and `beta` are scalars.
311-
* `batch_size` is an integer.
312-
* `inc_a` and `inc_b` are integers. The distance between element in the same column.
313+
* `alpha` and `beta` are scaling scalars.
314+
* `batch_size` is the number of batch matrices.
313315

314316
| operation | arguments | description |
315317
|---|---|---|
316318
| `_omatcopy` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `B`, `ldb` | Perform an out-of-place scaled matrix transpose or copy operation using a general dense matrix. |
317319
| `_omatcopy2`| `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `inc_a`, `B`, `ldb`, `inc_b` | Computes two-strided scaling and out-of-place transposition or copying of general dense matrices. |
320+
| `_omatadd`| `sb_handle`, `transa`, `transb`, `M`, `N`, `alpha`, `A`, `lda`, `beta`, `B`, `ldb`, `C`,`ldc` | Computes scaled general dense matrix addition with possibly transposed arguments. |
321+
322+
Other non-official extension operators:
323+
| operation | arguments | description |
324+
|---|---|---|
318325
| `_transpose` | `sb_handle`, `M`, `N`, `A`, `lda`, `B`, `ldb` | Computes an out-of-place matrix transpose operation using a general dense matrix. |
319-
| `_transpose` | `sb_handle`, `M`, `N`, `A`, `ld_in`, `ld_out` | Computes an in-place matrix transpose operation using a general dense matrix. |
326+
| `_transpose*` | `sb_handle`, `M`, `N`, `A`, `lda`, `ldb` | Computes an in-place matrix transpose operation using a general dense matrix, lda & ldb being input and output leading dimensions of A respectively _(*Not implemented)_. |
320327
### Experimental Joint Matrix Support
321328

322329
SYCL-BLAS now supports sub-group based collective GEMM operation using the experimental

benchmark/cublas/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ set(sources
7171
blas3/trmm.cpp
7272
# extension blas
7373
extension/omatcopy.cpp
74+
extension/omatadd.cpp
7475
)
7576

7677
# Add individual benchmarks for each method
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
/* *************************************************************************
2+
*
3+
* @license
4+
* Copyright (C) Codeplay Software Limited
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* For your convenience, a copy of the License has been included in this
12+
* repository.
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*
20+
* SYCL-BLAS: BLAS implementation using SYCL
21+
*
22+
* @filename omatadd.cpp
23+
*
24+
**************************************************************************/
25+
26+
#include "../../../test/unittest/extension/extension_reference.hpp"
27+
#include "../utils.hpp"
28+
29+
template <typename scalar_t>
30+
std::string get_name(std::string ts_a, std::string ts_b, int m, int n,
31+
scalar_t alpha, scalar_t beta, index_t lda_mul,
32+
index_t ldb_mul, index_t ldc_mul) {
33+
std::ostringstream str{};
34+
str << "BM_omatadd<" << blas_benchmark::utils::get_type_name<scalar_t>()
35+
<< ">/" << ts_a << "/" << ts_b << "/" << m << "/" << n << "/" << alpha
36+
<< "/" << beta << "/" << lda_mul << "/" << ldb_mul << "/" << ldc_mul;
37+
return str.str();
38+
}
39+
40+
template <typename scalar_t, typename... args_t>
41+
static inline void cublas_routine(args_t&&... args) {
42+
if constexpr (std::is_same_v<scalar_t, float>) {
43+
CUBLAS_CHECK(cublasSgeam(std::forward<args_t>(args)...));
44+
} else if constexpr (std::is_same_v<scalar_t, double>) {
45+
CUBLAS_CHECK(cublasDgeam(std::forward<args_t>(args)...));
46+
}
47+
return;
48+
}
49+
50+
template <typename scalar_t>
51+
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti_a,
52+
int ti_b, index_t m, index_t n, scalar_t alpha, scalar_t beta,
53+
index_t lda_mul, index_t ldb_mul, index_t ldc_mul, bool* success) {
54+
// initialize the state label
55+
blas_benchmark::utils::set_benchmark_label<scalar_t>(state);
56+
57+
// Standard test setup.
58+
std::string ts_a = blas_benchmark::utils::from_transpose_enum(
59+
static_cast<blas_benchmark::utils::Transposition>(ti_a));
60+
const char* t_str_a = ts_a.c_str();
61+
std::string ts_b = blas_benchmark::utils::from_transpose_enum(
62+
static_cast<blas_benchmark::utils::Transposition>(ti_b));
63+
const char* t_str_b = ts_b.c_str();
64+
65+
const auto lda = (*t_str_a == 't') ? lda_mul * n : lda_mul * m;
66+
const auto ldb = (*t_str_b == 't') ? ldb_mul * n : ldb_mul * m;
67+
const auto ldc = ldc_mul * m;
68+
69+
const auto size_a = lda * ((*t_str_a == 't') ? m : n);
70+
const auto size_b = ldb * ((*t_str_b == 't') ? m : n);
71+
const auto size_c = ldc * n;
72+
73+
blas_benchmark::utils::init_extension_counters<
74+
blas_benchmark::utils::ExtensionOP::omatadd, scalar_t>(
75+
state, t_str_a, t_str_b, m, n, lda_mul, ldb_mul, ldc_mul);
76+
77+
cublasHandle_t& cuda_handle = *cuda_handle_ptr;
78+
79+
// Input matrix/vector, output vector.
80+
std::vector<scalar_t> m_a =
81+
blas_benchmark::utils::random_data<scalar_t>(size_a);
82+
std::vector<scalar_t> m_b =
83+
blas_benchmark::utils::random_data<scalar_t>(size_b);
84+
std::vector<scalar_t> m_c =
85+
blas_benchmark::utils::random_data<scalar_t>(size_c);
86+
87+
blas_benchmark::utils::CUDAVector<scalar_t> m_a_gpu(size_a, m_a.data());
88+
blas_benchmark::utils::CUDAVector<scalar_t> m_b_gpu(size_b, m_b.data());
89+
blas_benchmark::utils::CUDAVector<scalar_t> m_c_gpu(size_c, m_c.data());
90+
91+
cublasOperation_t c_t_a = (*t_str_a == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
92+
cublasOperation_t c_t_b = (*t_str_b == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
93+
94+
#ifdef BLAS_VERIFY_BENCHMARK
95+
// Run a first time with a verification of the results
96+
std::vector<scalar_t> m_c_ref = m_c;
97+
98+
reference_blas::ext_omatadd(*t_str_a, *t_str_b, m, n, alpha, m_a, lda, beta,
99+
m_b, ldb, m_c_ref, ldc);
100+
101+
std::vector<scalar_t> m_c_temp = m_c;
102+
{
103+
blas_benchmark::utils::CUDAVector<scalar_t, true> m_c_temp_gpu(
104+
size_c, m_c_temp.data());
105+
106+
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
107+
lda, &beta, m_b_gpu, ldb, m_c_temp_gpu, ldc);
108+
}
109+
110+
std::ostringstream err_stream;
111+
if (!utils::compare_vectors(m_c_temp, m_c_ref, err_stream, "")) {
112+
const std::string& err_str = err_stream.str();
113+
state.SkipWithError(err_str.c_str());
114+
*success = false;
115+
};
116+
#endif
117+
auto blas_warmup = [&]() -> void {
118+
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
119+
lda, &beta, m_b_gpu, ldb, m_c_gpu, ldc);
120+
return;
121+
};
122+
123+
cudaEvent_t start;
124+
cudaEvent_t stop;
125+
CUDA_CHECK(cudaEventCreate(&start));
126+
CUDA_CHECK(cudaEventCreate(&stop));
127+
128+
auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
129+
CUDA_CHECK(cudaEventRecord(start));
130+
cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
131+
lda, &beta, m_b_gpu, ldb, m_c_gpu, ldc);
132+
CUDA_CHECK(cudaEventRecord(stop));
133+
CUDA_CHECK(cudaEventSynchronize(stop));
134+
return std::vector{start, stop};
135+
};
136+
137+
// Warmup
138+
blas_benchmark::utils::warmup(blas_warmup);
139+
CUDA_CHECK(cudaStreamSynchronize(NULL));
140+
141+
blas_benchmark::utils::init_counters(state);
142+
143+
// Measure
144+
for (auto _ : state) {
145+
// Run
146+
std::tuple<double, double> times =
147+
blas_benchmark::utils::timef_cuda(blas_method_def);
148+
149+
// Report
150+
blas_benchmark::utils::update_counters(state, times);
151+
}
152+
153+
state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
154+
state.SetBytesProcessed(state.iterations() *
155+
state.counters["bytes_processed"]);
156+
157+
blas_benchmark::utils::calc_avg_counters(state);
158+
159+
CUDA_CHECK(cudaEventDestroy(start));
160+
CUDA_CHECK(cudaEventDestroy(stop));
161+
};
162+
163+
template <typename scalar_t>
164+
void register_benchmark(blas_benchmark::Args& args,
165+
cublasHandle_t* cublas_handle_ptr, bool* success) {
166+
auto omatadd_params =
167+
blas_benchmark::utils::get_omatadd_params<scalar_t>(args);
168+
169+
for (auto p : omatadd_params) {
170+
std::string ts_a, ts_b;
171+
index_t m, n, lda_mul, ldb_mul, ldc_mul;
172+
scalar_t alpha, beta;
173+
std::tie(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul) = p;
174+
int t_a = static_cast<int>(blas_benchmark::utils::to_transpose_enum(ts_a));
175+
int t_b = static_cast<int>(blas_benchmark::utils::to_transpose_enum(ts_b));
176+
177+
auto BM_lambda =
178+
[&](benchmark::State& st, cublasHandle_t* cublas_handle_ptr, int t_a,
179+
int t_b, index_t m, index_t n, scalar_t alpha, scalar_t beta,
180+
index_t lda_mul, index_t ldb_mul, index_t ldc_mul, bool* success) {
181+
run<scalar_t>(st, cublas_handle_ptr, t_a, t_b, m, n, alpha, beta,
182+
lda_mul, ldb_mul, ldc_mul, success);
183+
};
184+
benchmark::RegisterBenchmark(
185+
get_name<scalar_t>(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul,
186+
ldc_mul)
187+
.c_str(),
188+
BM_lambda, cublas_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul,
189+
ldb_mul, ldc_mul, success)
190+
->UseRealTime();
191+
}
192+
}
193+
194+
namespace blas_benchmark {
195+
void create_benchmark(blas_benchmark::Args& args,
196+
cublasHandle_t* cuda_handle_ptr, bool* success) {
197+
BLAS_REGISTER_BENCHMARK(args, cuda_handle_ptr, success);
198+
}
199+
} // namespace blas_benchmark

benchmark/rocblas/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ set(sources
7373

7474
# Extension blas
7575
extension/omatcopy.cpp
76+
extension/omatadd.cpp
7677

7778
)
7879

0 commit comments

Comments
 (0)