Skip to content
This repository was archived by the owner on Jan 13, 2025. It is now read-only.

Commit 731695c

Browse files
s-Nick, OuadiElfarouki, and pgorlani
authored
Add omatcopy and omatcopy2 operators support (#428)
This PR adds two BLAS extension operators: omatcopy and omatcopy2. The operators are quite similar and the implementation is split into two cases, one for a non-transposed input matrix and one for a transposed input matrix. Signed-off-by: s-Nick <[email protected]> Co-authored-by: Ouadie EL FAROUKI <[email protected]> Co-authored-by: pgorlani <[email protected]>
1 parent ec9ccdb commit 731695c

37 files changed

+2697
-249
lines changed

README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,31 @@ For all these operations:
292292
| `_gemm_strided_batched` | `sb_handle`, `transa`, `transb`, `M`, `N`, `K`, `alpha`, `A`, `lda`, `stridea`, `B`, `ldb`, `strideb`, `beta`, `C`, `ldc`, `stridec`, `batch_size` | Same as `_gemm` but the containers contain `batch_size` end-to-end matrices. GEMM operations are performed independently with matching matrices.
293293
| `_trsm` | `sb_handle`, `side`, `uplo`, `trans`, `diag`, `M`, `N`, `alpha`, `A`, `lda`, `B`, `ldb` | Triangular solve with Multiple Right-Hand Sides. |
294294

295+
### EXTENSION
296+
297+
The following table sums up the interface that can be found in
298+
[extension_interface.h](include/interface/extension_interface.h).
299+
300+
For all these operations:
301+
302+
* `A`, `B` and `C` are containers for the column-major matrices A, B and C.
303+
* `lda`, `ldb` and `ldc` are the leading dimensions of the matrices A, B and C
304+
(cf BLAS 2). The leading dimension of a matrix must be greater than or equal
305+
to its number of rows. In the case of in-place transpose, the same matrix `A`
306+
is used with two different leading dimensions for input & output.
307+
* `transa` and `transb` are the transpose modes of the matrices A and B
308+
(cf BLAS 2).
309+
* `M` and `N` are the dimensions of the matrices.
310+
* `alpha` and `beta` are scalars.
311+
* `batch_size` is an integer.
312+
* `inc_a` and `inc_b` are integers. The distance between elements in the same column.
313+
314+
| operation | arguments | description |
315+
|---|---|---|
316+
| `_omatcopy` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `B`, `ldb` | Perform an out-of-place scaled matrix transpose or copy operation using a general dense matrix. |
317+
| `_omatcopy2`| `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `inc_a`, `B`, `ldb`, `inc_b` | Computes two-strided scaling and out-of-place transposition or copying of general dense matrices. |
318+
| `_transpose` | `sb_handle`, `M`, `N`, `A`, `lda`, `B`, `ldb` | Computes an out-of-place matrix transpose operation using a general dense matrix. |
319+
| `_transpose` | `sb_handle`, `M`, `N`, `A`, `ld_in`, `ld_out` | Computes an in-place matrix transpose operation using a general dense matrix. |
295320
### Experimental Joint Matrix Support
296321

297322
SYCL-BLAS now supports sub-group based collective GEMM operation using the experimental

benchmark/cublas/CMakeLists.txt

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,10 @@ set(sources
6969
blas3/trsm.cpp
7070
blas3/trsm_batched.cpp
7171
blas3/trmm.cpp
72+
# extension blas
73+
extension/omatcopy.cpp
7274
)
7375

74-
#if(${BLAS_ENABLE_EXTENSIONS})
75-
# list(APPEND sources "extension/reduction.cpp")
76-
#endif()
77-
7876
# Add individual benchmarks for each method
7977
foreach(cublas_bench ${sources})
8078
get_filename_component(bench_cublas_exec ${cublas_bench} NAME_WE)
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
/* *************************************************************************
2+
*
3+
* @license
4+
* Copyright (C) Codeplay Software Limited
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* For your convenience, a copy of the License has been included in this
12+
* repository.
13+
*
14+
* Unless required by applicable law or agreed to in writing, software
15+
* distributed under the License is distributed on an "AS IS" BASIS,
16+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17+
* See the License for the specific language governing permissions and
18+
* limitations under the License.
19+
*
20+
* SYCL-BLAS: BLAS implementation using SYCL
21+
*
22+
* @filename omatcopy.cpp
23+
*
24+
**************************************************************************/
25+
26+
#include "../../../test/unittest/extension/extension_reference.hpp"
27+
#include "../utils.hpp"
28+
29+
template <typename scalar_t>
30+
std::string get_name(std::string ts_a, int m, int n, scalar_t alpha,
31+
index_t lda_mul, index_t ldb_mul) {
32+
std::ostringstream str{};
33+
str << "BM_omatcopy<" << blas_benchmark::utils::get_type_name<scalar_t>()
34+
<< ">/" << ts_a << "/" << m << "/" << n << "/" << alpha << "/" << lda_mul
35+
<< "/" << ldb_mul;
36+
return str.str();
37+
}
38+
39+
/// Forwards all arguments to the cuBLAS geam routine matching the scalar
/// precision: cublasDgeam for double, cublasSgeam for float. Any other
/// scalar_t compiles to an empty function (no dispatch is performed).
template <typename scalar_t, typename... args_t>
static inline void cublas_routine(args_t&&... args) {
  if constexpr (std::is_same_v<scalar_t, double>) {
    CUBLAS_CHECK(cublasDgeam(std::forward<args_t>(args)...));
  } else if constexpr (std::is_same_v<scalar_t, float>) {
    CUBLAS_CHECK(cublasSgeam(std::forward<args_t>(args)...));
  }
}
48+
49+
/**
 * Benchmarks one omatcopy configuration against cuBLAS, using cublasTgeam
 * with beta == 0 (so B = alpha * op(A), i.e. an out-of-place scaled
 * copy/transpose).
 *
 * @param state           google-benchmark state; counters/labels are set here.
 * @param cuda_handle_ptr cuBLAS handle shared across all registered cases.
 * @param ti              Transposition enum value for A (cast from int).
 * @param m, n            Matrix dimensions.
 * @param alpha           Scaling factor applied to A.
 * @param lda_mul,ldb_mul Leading-dimension multipliers for A and B.
 * @param success         Set to false if verification fails (benchmark is
 *                        then skipped with an error).
 */
template <typename scalar_t>
void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti,
         index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul,
         bool* success) {
  // initialize the state label
  blas_benchmark::utils::set_benchmark_label<scalar_t>(state);

  // Standard test setup.
  std::string ts = blas_benchmark::utils::from_transpose_enum(
      static_cast<blas_benchmark::utils::Transposition>(ti));
  const char* t_str = ts.c_str();

  // Sizes and leading dimensions follow the cuBLAS convention (output-shape
  // based) rather than the oneMKL specification: when A is transposed, its
  // leading dimension is derived from n instead of m.
  const auto cuda_lda = (*t_str == 't') ? lda_mul * n : lda_mul * m;
  const auto cuda_ldb = ldb_mul * m;
  const auto cuda_size_a = cuda_lda * ((*t_str == 't') ? m : n);
  const auto cuda_size_b = cuda_ldb * n;

  blas_benchmark::utils::init_extension_counters<
      blas_benchmark::utils::ExtensionOP::omatcopy, scalar_t>(
      state, t_str, m, n, lda_mul, ldb_mul);

  cublasHandle_t& cuda_handle = *cuda_handle_ptr;

  // Host-side input matrix A and output matrix B, filled with random data.
  std::vector<scalar_t> m_a =
      blas_benchmark::utils::random_data<scalar_t>(cuda_size_a);
  std::vector<scalar_t> m_b =
      blas_benchmark::utils::random_data<scalar_t>(cuda_size_b);

  blas_benchmark::utils::CUDAVector<scalar_t> m_a_gpu(cuda_size_a, m_a.data());
  blas_benchmark::utils::CUDAVector<scalar_t> m_b_gpu(cuda_size_b, m_b.data());

  cublasOperation_t c_t_a = (*t_str == 'n') ? CUBLAS_OP_N : CUBLAS_OP_T;

  // beta set to zero so cublasTgeam computes B = alpha * op(A) only
  const scalar_t beta = static_cast<scalar_t>(0.0);
  // placeholder transpose mode for the unused second input of cublasTgeam
  cublasOperation_t c_t_b = CUBLAS_OP_N;

#ifdef BLAS_VERIFY_BENCHMARK
  // Run a first time and verify the result against the host reference.
  std::vector<scalar_t> m_b_ref = m_b;

  reference_blas::ext_omatcopy<false>(*t_str, m, n, alpha, m_a, cuda_lda,
                                      m_b_ref, cuda_ldb);

  std::vector<scalar_t> m_b_temp = m_b;
  {
    // Scoped so the device copy is written back to m_b_temp on destruction.
    blas_benchmark::utils::CUDAVector<scalar_t, true> m_b_temp_gpu(
        cuda_size_b, m_b_temp.data());

    cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
                             cuda_lda, &beta, nullptr, cuda_ldb, m_b_temp_gpu,
                             cuda_ldb);
  }

  std::ostringstream err_stream;
  if (!utils::compare_vectors(m_b_temp, m_b_ref, err_stream, "")) {
    const std::string& err_str = err_stream.str();
    state.SkipWithError(err_str.c_str());
    *success = false;
  };
#endif
  auto blas_warmup = [&]() -> void {
    cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
                             cuda_lda, &beta, nullptr, cuda_ldb, m_b_gpu,
                             cuda_ldb);
    return;
  };

  // CUDA events used to time each iteration on the device.
  cudaEvent_t start;
  cudaEvent_t stop;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));

  auto blas_method_def = [&]() -> std::vector<cudaEvent_t> {
    CUDA_CHECK(cudaEventRecord(start));
    cublas_routine<scalar_t>(cuda_handle, c_t_a, c_t_b, m, n, &alpha, m_a_gpu,
                             cuda_lda, &beta, nullptr, cuda_ldb, m_b_gpu,
                             cuda_ldb);
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    return std::vector{start, stop};
  };

  // Warmup runs (not timed) before measurement starts.
  blas_benchmark::utils::warmup(blas_warmup);
  CUDA_CHECK(cudaStreamSynchronize(NULL));

  blas_benchmark::utils::init_counters(state);

  // Measure
  for (auto _ : state) {
    // Run
    std::tuple<double, double> times =
        blas_benchmark::utils::timef_cuda(blas_method_def);

    // Report
    blas_benchmark::utils::update_counters(state, times);
  }

  state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
  state.SetBytesProcessed(state.iterations() *
                          state.counters["bytes_processed"]);

  blas_benchmark::utils::calc_avg_counters(state);

  CUDA_CHECK(cudaEventDestroy(start));
  CUDA_CHECK(cudaEventDestroy(stop));
};
161+
162+
template <typename scalar_t>
163+
void register_benchmark(blas_benchmark::Args& args,
164+
cublasHandle_t* cublas_handle_ptr, bool* success) {
165+
auto omatcopy_params =
166+
blas_benchmark::utils::get_matcopy_params<scalar_t>(args);
167+
168+
for (auto p : omatcopy_params) {
169+
std::string ts_a;
170+
index_t m, n, lda_mul, ldb_mul;
171+
scalar_t alpha;
172+
std::tie(ts_a, m, n, alpha, lda_mul, ldb_mul) = p;
173+
int t_a = static_cast<int>(blas_benchmark::utils::to_transpose_enum(ts_a));
174+
175+
auto BM_lambda = [&](benchmark::State& st,
176+
cublasHandle_t* cublas_handle_ptr, int t_a, index_t m,
177+
index_t n, scalar_t alpha, index_t lda_mul,
178+
index_t ldb_mul, bool* success) {
179+
run<scalar_t>(st, cublas_handle_ptr, t_a, m, n, alpha, lda_mul, ldb_mul,
180+
success);
181+
};
182+
benchmark::RegisterBenchmark(
183+
get_name<scalar_t>(ts_a, m, n, alpha, lda_mul, ldb_mul).c_str(),
184+
BM_lambda, cublas_handle_ptr, t_a, m, n, alpha, lda_mul, ldb_mul,
185+
success)
186+
->UseRealTime();
187+
}
188+
}
189+
190+
namespace blas_benchmark {
// Entry point called by the benchmark driver. BLAS_REGISTER_BENCHMARK
// presumably expands to register_benchmark<T> for each enabled scalar
// type — confirm against the macro's definition in the benchmark utils.
void create_benchmark(blas_benchmark::Args& args,
                      cublasHandle_t* cuda_handle_ptr, bool* success) {
  BLAS_REGISTER_BENCHMARK(args, cuda_handle_ptr, success);
}
}  // namespace blas_benchmark

benchmark/rocblas/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ set(sources
7171
blas3/gemm_batched.cpp
7272
blas3/gemm_batched_strided.cpp
7373

74+
# Extension blas
75+
extension/omatcopy.cpp
76+
7477
)
7578

7679
# Add individual benchmarks for each method

0 commit comments

Comments
 (0)