/* *************************************************************************
 *
 * @license
 * Copyright (C) Codeplay Software Limited
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * For your convenience, a copy of the License has been included in this
 * repository.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SYCL-BLAS: BLAS implementation using SYCL
 *
 * @filename omatadd.cpp
 *
 **************************************************************************/

#include "../../../test/unittest/extension/extension_reference.hpp"
#include "../utils.hpp"

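// Builds the benchmark name reported by Google Benchmark, encoding the scalar
// type and every omatadd parameter so each configuration appears as a distinct
// entry in the results.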
template <typename scalar_t>
std::string get_name(std::string ts_a, std::string ts_b, int m, int n,
                     scalar_t alpha, scalar_t beta, index_t lda_mul,
                     index_t ldb_mul, index_t ldc_mul) {
  std::ostringstream str{};
  str << "BM_omatadd<" << blas_benchmark::utils::get_type_name<scalar_t>()
      << ">/" << ts_a << "/" << ts_b << "/" << m << "/" << n << "/" << alpha
      << "/" << beta << "/" << lda_mul << "/" << ldb_mul << "/" << ldc_mul;
  return str.str();
}

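// cuBLAS has no dedicated omatadd entry point, so the extension operation
// C = alpha * op(A) + beta * op(B) is benchmarked against cublas{S,D}geam,
// which computes the same thing. The dispatch on scalar_t is resolved at
// compile time via `if constexpr`; unsupported types compile to a no-op.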
template <typename scalar_t, typename... args_t>
static inline void cublas_routine(args_t&&... args) {
  if constexpr (std::is_same_v<scalar_t, float>) {
    CUBLAS_CHECK(cublasSgeam(std::forward<args_t>(args)...));
  } else if constexpr (std::is_same_v<scalar_t, double>) {
    CUBLAS_CHECK(cublasDgeam(std::forward<args_t>(args)...));
  }
  return;
}

template <typename scalar_t>
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti_a,
         int ti_b, index_t m, index_t n, scalar_t alpha, scalar_t beta,
         index_t lda_mul, index_t ldb_mul, index_t ldc_mul, bool* success) {
  // Initialize the state label.
  blas_benchmark::utils::set_benchmark_label<scalar_t>(state);

  // Standard test setup.
  std::string ts_a = blas_benchmark::utils::from_transpose_enum(
      static_cast<blas_benchmark::utils::Transposition>(ti_a));
  const char* t_str_a = ts_a.c_str();
  std::string ts_b = blas_benchmark::utils::from_transpose_enum(
      static_cast<blas_benchmark::utils::Transposition>(ti_b));
  const char* t_str_b = ts_b.c_str();

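  // op(A) and op(B) are m x n, so a transposed input is stored as n x m and
  // its leading dimension scales with n instead of m. The *_mul factors pad
  // the leading dimensions to exercise non-contiguous (strided) layouts.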
  const auto lda = (*t_str_a == 't') ? lda_mul * n : lda_mul * m;
  const auto ldb = (*t_str_b == 't') ? ldb_mul * n : ldb_mul * m;
  const auto ldc = ldc_mul * m;

  const auto size_a = lda * ((*t_str_a == 't') ? m : n);
  const auto size_b = ldb * ((*t_str_b == 't') ? m : n);
  const auto size_c = ldc * n;

  blas_benchmark::utils::init_extension_counters<
      blas_benchmark::utils::ExtensionOP::omatadd, scalar_t>(
      state, t_str_a, t_str_b, m, n, lda_mul, ldb_mul, ldc_mul);

  cublasHandle_t& cuda_handle = *cuda_handle_ptr;

  // Input matrices A and B, output matrix C.
  std::vector<scalar_t> m_a =
      blas_benchmark::utils::random_data<scalar_t>(size_a);
  std::vector<scalar_t> m_b =
      blas_benchmark::utils::random_data<scalar_t>(size_b);
  std::vector<scalar_t> m_c =
      blas_benchmark::utils::random_data<scalar_t>(size_c);

  blas_benchmark::utils::CUDAVector<scalar_t> m_a_gpu(size_a, m_a.data());
  blas_benchmark::utils::CUDAVector<scalar_t> m_b_gpu(size_b, m_b.data());
  blas_benchmark::utils::CUDAVector<scalar_t> m_c_gpu(size_c, m_c.data());

  cublasOperation_t c_t_a = (*t_str_a == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t c_t_b = (*t_str_b == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;

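  // When BLAS_VERIFY_BENCHMARK is defined, a single geam call is checked
  // against the host reference omatadd, writing into a scratch copy of C so
  // the benchmark inputs stay untouched; a mismatch skips the benchmark
  // instead of reporting timings for wrong results.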
#ifdef BLAS_VERIFY_BENCHMARK
  // Run a first time with a verification of the results
  std::vector<scalar_t> m_c_ref = m_c;

  reference_blas::ext_omatadd(*t_str_a, *t_str_b, m, n, alpha, m_a, lda, beta,
                              m_b, ldb, m_c_ref, ldc);

  std::vector<scalar_t> m_c_temp = m_c;
  {
    blas_benchmark::utils::CUDAVector<scalar_t, true> m_c_temp_gpu(
        size_c, m_c_temp.data());

    cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
                             lda, &beta, m_b_gpu, ldb, m_c_temp_gpu, ldc);
  }

  std::ostringstream err_stream;
  if (!utils::compare_vectors(m_c_temp, m_c_ref, err_stream, "")) {
    const std::string& err_str = err_stream.str();
    state.SkipWithError(err_str.c_str());
    *success = false;
  }
#endif
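  // The warmup lambda issues the same geam call as the timed lambda below,
  // but without event recording, so that device clocks and caches can settle
  // before any measurement is taken.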
  auto blas_warmup = [&]() -> void {
    cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
                             lda, &beta, m_b_gpu, ldb, m_c_gpu, ldc);
    return;
  };

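  // The timed lambda brackets a single geam launch between two CUDA events;
  // blas_benchmark::utils::timef_cuda is expected to convert the returned
  // event pair into the elapsed-time tuple consumed by update_counters in the
  // measurement loop below.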
  cudaEvent_t start;
  cudaEvent_t stop;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));

  auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
    CUDA_CHECK(cudaEventRecord(start));
    cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
                             lda, &beta, m_b_gpu, ldb, m_c_gpu, ldc);
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    return std::vector{start, stop};
  };

  // Warmup
  blas_benchmark::utils::warmup(blas_warmup);
  CUDA_CHECK(cudaStreamSynchronize(NULL));

  blas_benchmark::utils::init_counters(state);

  // Measure
  for (auto _ : state) {
    // Run
    std::tuple<double, double> times =
        blas_benchmark::utils::timef_cuda(blas_method_def);

    // Report
    blas_benchmark::utils::update_counters(state, times);
  }

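  // Scale the per-iteration flop and byte counters (presumably populated by
  // init_extension_counters above) so Google Benchmark reports aggregate
  // throughput over the whole run.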
  state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
  state.SetBytesProcessed(state.iterations() *
                          state.counters["bytes_processed"]);

  blas_benchmark::utils::calc_avg_counters(state);

  CUDA_CHECK(cudaEventDestroy(start));
  CUDA_CHECK(cudaEventDestroy(stop));
}

template <typename scalar_t>
void register_benchmark(blas_benchmark::Args& args,
                        cublasHandle_t* cublas_handle_ptr, bool* success) {
  auto omatadd_params =
      blas_benchmark::utils::get_omatadd_params<scalar_t>(args);

  for (auto p : omatadd_params) {
    std::string ts_a, ts_b;
    index_t m, n, lda_mul, ldb_mul, ldc_mul;
    scalar_t alpha, beta;
    std::tie(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul) = p;
    int t_a = static_cast<int>(blas_benchmark::utils::to_transpose_enum(ts_a));
    int t_b = static_cast<int>(blas_benchmark::utils::to_transpose_enum(ts_b));

    auto BM_lambda =
        [&](benchmark::State& st, cublasHandle_t* cublas_handle_ptr, int t_a,
            int t_b, index_t m, index_t n, scalar_t alpha, scalar_t beta,
            index_t lda_mul, index_t ldb_mul, index_t ldc_mul, bool* success) {
          run<scalar_t>(st, cublas_handle_ptr, t_a, t_b, m, n, alpha, beta,
                        lda_mul, ldb_mul, ldc_mul, success);
        };
    benchmark::RegisterBenchmark(
        get_name<scalar_t>(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul,
                           ldc_mul)
            .c_str(),
        BM_lambda, cublas_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul,
        ldb_mul, ldc_mul, success)
        ->UseRealTime();
  }
}

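// BLAS_REGISTER_BENCHMARK is assumed to expand to one register_benchmark<T>
// call per scalar type enabled at configure time (float, and double when
// supported), wiring this translation unit into the benchmark binary.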
namespace blas_benchmark {
void create_benchmark(blas_benchmark::Args& args,
                      cublasHandle_t* cuda_handle_ptr, bool* success) {
  BLAS_REGISTER_BENCHMARK(args, cuda_handle_ptr, success);
}
}  // namespace blas_benchmark