Skip to content
This repository was archived by the owner on Jan 13, 2025. It is now read-only.

Commit 731695c

Browse files
s-Nick, OuadiElfarouki, and pgorlani
authored
Add omatcopy and omatcopy2 operators support (#428)
This PR adds two BLAS extension operators: omatcopy and omatcopy2. The operators are quite similar and the implementation is split into two cases, one for a non-transposed input matrix and one for a transposed input matrix. Signed-off-by: s-Nick <[email protected]> Co-authored-by: Ouadie EL FAROUKI <[email protected]> Co-authored-by: pgorlani <[email protected]>
1 parent ec9ccdb commit 731695c

37 files changed

+2697
-249
lines changed

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,31 @@ For all these operations:
292292
| `_gemm_strided_batched` | `sb_handle`, `transa`, `transb`, `M`, `N`, `K`, `alpha`, `A`, `lda`, `stridea`, `B`, `ldb`, `strideb`, `beta`, `C`, `ldc`, `stridec`, `batch_size` | Same as `_gemm` but the containers contain `batch_size` end-to-end matrices. GEMM operations are performed independently with matching matrices.
293293
| `_trsm` | `sb_handle`, `side`, `uplo`, `trans`, `diag`, `M`, `N`, `alpha`, `A`, `lda`, `B`, `ldb` | Triangular solve with Multiple Right-Hand Sides. |
294294

295+
### EXTENSION
296+
297+
The following table sums up the interface that can be found in
298+
[extension_interface.h](include/interface/extension_interface.h).
299+
300+
For all these operations:
301+
302+
* `A`, `B` and `C` are containers for the column-major matrices A, B and C.
303+
* `lda`, `ldb` and `ldc` are the leading dimensions of the matrices A, B and C
304+
(cf BLAS 2). The leading dimension of a matrix must be greater than or equal
305+
to its number of rows. In the case of in-place transpose, the same matrix `A`
306+
is used with two different leading dimensions for input & output.
307+
* `transa` and `transb` are the transpose modes of the matrices A and B
308+
(cf BLAS 2).
309+
* `M` and `N` are the dimensions of the matrices.
310+
* `alpha` and `beta` are scalars.
311+
* `batch_size` is an integer.
312+
* `inc_a` and `inc_b` are integers. The distance between elements in the same column.
313+
314+
| operation | arguments | description |
315+
|---|---|---|
316+
| `_omatcopy` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `B`, `ldb` | Perform an out-of-place scaled matrix transpose or copy operation using a general dense matrix. |
317+
| `_omatcopy2`| `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `inc_a`, `B`, `ldb`, `inc_b` | Computes two-strided scaling and out-of-place transposition or copying of general dense matrices. |
318+
| `_transpose` | `sb_handle`, `M`, `N`, `A`, `lda`, `B`, `ldb` | Computes an out-of-place matrix transpose operation using a general dense matrix. |
319+
| `_transpose` | `sb_handle`, `M`, `N`, `A`, `ld_in`, `ld_out` | Computes an in-place matrix transpose operation using a general dense matrix. |
295320
### Experimental Joint Matrix Support
296321

297322
SYCL-BLAS now supports sub-group based collective GEMM operation using the experimental

benchmark/cublas/CMakeLists.txt

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,10 @@ set(sources
6969
blas3/trsm.cpp
7070
blas3/trsm_batched.cpp
7171
blas3/trmm.cpp
72+
# extension blas
73+
extension/omatcopy.cpp
7274
)
7375

74-
#if(${BLAS_ENABLE_EXTENSIONS})
75-
# list(APPEND sources "extension/reduction.cpp")
76-
#endif()
77-
7876
# Add individual benchmarks for each method
7977
foreach(cublas_bench ${sources})
8078
get_filename_component(bench_cublas_exec ${cublas_bench} NAME_WE)
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
/* *************************************************************************
2+
*
3+
* @license
4+
* Copyright (C) Codeplay Software Limited
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* For your convenience, a copy of the License has been included in this
12+
* repository.
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*
20+
* SYCL-BLAS: BLAS implementation using SYCL
21+
*
22+
* @filename omatcopy.cpp
23+
*
24+
**************************************************************************/
25+
26+
#include "../../../test/unittest/extension/extension_reference.hpp"
27+
#include "../utils.hpp"
28+
29+
template <typename scalar_t>
30+
std::string get_name(std::string ts_a, int m, int n, scalar_t alpha,
31+
index_t lda_mul, index_t ldb_mul) {
32+
std::ostringstream str{};
33+
str << "BM_omatcopy<" << blas_benchmark::utils::get_type_name<scalar_t>()
34+
<< ">/" << ts_a << "/" << m << "/" << n << "/" << alpha << "/" << lda_mul
35+
<< "/" << ldb_mul;
36+
return str.str();
37+
}
38+
39+
/// Forwards all arguments to the cuBLAS geam routine matching the scalar
/// precision: cublasDgeam for double, cublasSgeam for float. Any other
/// scalar_t compiles to an empty function (no dispatch is performed).
template <typename scalar_t, typename... args_t>
static inline void cublas_routine(args_t&&... args) {
  if constexpr (std::is_same_v<scalar_t, double>) {
    CUBLAS_CHECK(cublasDgeam(std::forward<args_t>(args)...));
  } else if constexpr (std::is_same_v<scalar_t, float>) {
    CUBLAS_CHECK(cublasSgeam(std::forward<args_t>(args)...));
  }
}
48+
49+
/**
 * Benchmarks one omatcopy configuration against cuBLAS, using cublasTgeam
 * with beta == 0 (so B = alpha * op(A), i.e. an out-of-place scaled
 * copy/transpose).
 *
 * @param state           google-benchmark state; counters/labels are set here.
 * @param cuda_handle_ptr cuBLAS handle shared across all registered cases.
 * @param ti              Transposition enum value for A (cast from int).
 * @param m, n            Matrix dimensions.
 * @param alpha           Scaling factor applied to A.
 * @param lda_mul,ldb_mul Leading-dimension multipliers for A and B.
 * @param success         Set to false if verification fails (benchmark is
 *                        then skipped with an error).
 */
template <typename scalar_t>
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti,
         index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul,
         bool* success) {
  // initialize the state label
  blas_benchmark::utils::set_benchmark_label<scalar_t>(state);

  // Standard test setup.
  std::string ts = blas_benchmark::utils::from_transpose_enum(
      static_cast<blas_benchmark::utils::Transposition>(ti));
  const char* t_str = ts.c_str();

  // Sizes and leading dimensions follow the cuBLAS convention (output-shape
  // based) rather than the oneMKL specification: when A is transposed, its
  // leading dimension is derived from n instead of m.
  const auto cuda_lda = (*t_str == 't') ? lda_mul * n : lda_mul * m;
  const auto cuda_ldb = ldb_mul * m;
  const auto cuda_size_a = cuda_lda * ((*t_str == 't') ? m : n);
  const auto cuda_size_b = cuda_ldb * n;

  blas_benchmark::utils::init_extension_counters<
      blas_benchmark::utils::ExtensionOP::omatcopy, scalar_t>(
      state, t_str, m, n, lda_mul, ldb_mul);

  cublasHandle_t& cuda_handle = *cuda_handle_ptr;

  // Host-side input matrix A and output matrix B, filled with random data.
  std::vector<scalar_t> m_a =
      blas_benchmark::utils::random_data<scalar_t>(cuda_size_a);
  std::vector<scalar_t> m_b =
      blas_benchmark::utils::random_data<scalar_t>(cuda_size_b);

  blas_benchmark::utils::CUDAVector<scalar_t> m_a_gpu(cuda_size_a, m_a.data());
  blas_benchmark::utils::CUDAVector<scalar_t> m_b_gpu(cuda_size_b, m_b.data());

  cublasOperation_t c_t_a = (*t_str == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;

  // beta set to zero so cublasTgeam computes B = alpha * op(A) only
  const scalar_t beta = static_cast<scalar_t>(0.0);
  // placeholder transpose mode for the unused second input of cublasTgeam
  cublasOperation_t c_t_b = CUBLAS_OP_N;

#ifdef BLAS_VERIFY_BENCHMARK
  // Run a first time and verify the result against the host reference.
  std::vector<scalar_t> m_b_ref = m_b;

  reference_blas::ext_omatcopy<false>(*t_str, m, n, alpha, m_a, cuda_lda,
                                      m_b_ref, cuda_ldb);

  std::vector<scalar_t> m_b_temp = m_b;
  {
    // Scoped so the device copy is written back to m_b_temp on destruction.
    blas_benchmark::utils::CUDAVector<scalar_t, true> m_b_temp_gpu(
        cuda_size_b, m_b_temp.data());

    cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
                             cuda_lda, &beta, nullptr, cuda_ldb, m_b_temp_gpu,
                             cuda_ldb);
  }

  std::ostringstream err_stream;
  if (!utils::compare_vectors(m_b_temp, m_b_ref, err_stream, "")) {
    const std::string& err_str = err_stream.str();
    state.SkipWithError(err_str.c_str());
    *success = false;
  };
#endif
  auto blas_warmup = [&]() -> void {
    cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
                             cuda_lda, &beta, nullptr, cuda_ldb, m_b_gpu,
                             cuda_ldb);
    return;
  };

  // CUDA events used to time each iteration on the device.
  cudaEvent_t start;
  cudaEvent_t stop;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));

  auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
    CUDA_CHECK(cudaEventRecord(start));
    cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
                             cuda_lda, &beta, nullptr, cuda_ldb, m_b_gpu,
                             cuda_ldb);
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    return std::vector{start, stop};
  };

  // Warmup runs (not timed) before measurement starts.
  blas_benchmark::utils::warmup(blas_warmup);
  CUDA_CHECK(cudaStreamSynchronize(NULL));

  blas_benchmark::utils::init_counters(state);

  // Measure
  for (auto _ : state) {
    // Run
    std::tuple<double, double> times =
        blas_benchmark::utils::timef_cuda(blas_method_def);

    // Report
    blas_benchmark::utils::update_counters(state, times);
  }

  state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
  state.SetBytesProcessed(state.iterations() *
                          state.counters["bytes_processed"]);

  blas_benchmark::utils::calc_avg_counters(state);

  CUDA_CHECK(cudaEventDestroy(start));
  CUDA_CHECK(cudaEventDestroy(stop));
};
161+
162+
template <typename scalar_t>
163+
void register_benchmark(blas_benchmark::Args& args,
164+
cublasHandle_t* cublas_handle_ptr, bool* success) {
165+
auto omatcopy_params =
166+
blas_benchmark::utils::get_matcopy_params<scalar_t>(args);
167+
168+
for (auto p : omatcopy_params) {
169+
std::string ts_a;
170+
index_t m, n, lda_mul, ldb_mul;
171+
scalar_t alpha;
172+
std::tie(ts_a, m, n, alpha, lda_mul, ldb_mul) = p;
173+
int t_a = static_cast<int>(blas_benchmark::utils::to_transpose_enum(ts_a));
174+
175+
auto BM_lambda = [&](benchmark::State& st,
176+
cublasHandle_t* cublas_handle_ptr, int t_a, index_t m,
177+
index_t n, scalar_t alpha, index_t lda_mul,
178+
index_t ldb_mul, bool* success) {
179+
run<scalar_t>(st, cublas_handle_ptr, t_a, m, n, alpha, lda_mul, ldb_mul,
180+
success);
181+
};
182+
benchmark::RegisterBenchmark(
183+
get_name<scalar_t>(ts_a, m, n, alpha, lda_mul, ldb_mul).c_str(),
184+
BM_lambda, cublas_handle_ptr, t_a, m, n, alpha, lda_mul, ldb_mul,
185+
success)
186+
->UseRealTime();
187+
}
188+
}
189+
190+
namespace blas_benchmark {
// Entry point called by the benchmark driver. BLAS_REGISTER_BENCHMARK
// presumably expands to register_benchmark<T> for each enabled scalar
// type — confirm against the macro's definition in the benchmark utils.
void create_benchmark(blas_benchmark::Args& args,
                      cublasHandle_t* cuda_handle_ptr, bool* success) {
  BLAS_REGISTER_BENCHMARK(args, cuda_handle_ptr, success);
}
}  // namespace blas_benchmark

benchmark/rocblas/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ set(sources
7171
blas3/gemm_batched.cpp
7272
blas3/gemm_batched_strided.cpp
7373

74+
# Extension blas
75+
extension/omatcopy.cpp
76+
7477
)
7578

7679
# Add individual benchmarks for each method

0 commit comments

Comments
 (0)