Skip to content
This repository was archived by the owner on Jan 13, 2025. It is now read-only.

Commit fec888f

Browse files
HipSYCL to AdaptiveCpp update & fixes (#493)
* Added scal operation for matrices as well
1 parent 861b310 commit fec888f

28 files changed

+466
-153
lines changed

CMakeLists.txt

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,19 @@ if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU"))
115115
message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled")
116116
endif()
117117

118+
if (SYCL_COMPILER MATCHES "adaptivecpp")
119+
if(BLAS_ENABLE_COMPLEX)
120+
message(STATUS "SYCL Complex data is not supported on AdaptiveCpp/hipSYCL. Complex
121+
data type is disabled")
122+
set(BLAS_ENABLE_COMPLEX OFF)
123+
endif()
124+
if(BLAS_MEMPOOL_BENCHMARK)
125+
message(STATUS "Memory pool feature is not supported on AdaptiveCpp/hipSYCL. Corresponding
126+
benchmarks are disabled")
127+
set(BLAS_MEMPOOL_BENCHMARK OFF)
128+
endif()
129+
endif()
130+
118131
# CmakeFunctionHelper has to be included after any options that it depends on are declared.
119132
# These include:
120133
# * TARGET
@@ -145,17 +158,17 @@ else()
145158
target_link_libraries(portblas PUBLIC ComputeCpp::ComputeCpp)
146159
elseif(is_dpcpp)
147160
target_link_libraries(portblas PUBLIC DPCPP::DPCPP)
148-
elseif(is_hipsycl)
149-
target_link_libraries(portblas PUBLIC hipSYCL::hipSYCL-rt)
161+
elseif(is_adaptivecpp)
162+
target_link_libraries(portblas PUBLIC AdaptiveCpp::acpp-rt)
150163
endif()
151164
endif()
152165
if(is_computecpp)
153166
set(sycl_impl ComputeCpp::ComputeCpp)
154167
elseif(is_dpcpp)
155168
set(sycl_impl DPCPP::DPCPP)
156169
add_sycl_to_target(TARGET portblas SOURCES)
157-
elseif(is_hipsycl)
158-
set(sycl_impl hipSYCL::hipSYCL-rt)
170+
elseif(is_adaptivecpp)
171+
set(sycl_impl AdaptiveCpp::acpp-rt)
159172
add_sycl_to_target(TARGET portblas SOURCES)
160173
endif()
161174
if(IMGDNN_DIR)

README.md

Lines changed: 36 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ the project.
3131
- [Requirements](#requirements)
3232
- [Setup](#setup)
3333
- [Compile with DPC++](#compile-with-dpc)
34-
- [Compile with hipSYCL](#compile-with-hipsycl)
34+
- [Compile with AdaptiveCpp *(Formerly hipSYCL)*](#compile-with-adaptivecpp)
3535
- [Instaling portBLAS](#instaling-portBLAS)
3636
- [Doxygen](#doxygen)
3737
- [CMake options](#cmake-options)
@@ -390,9 +390,9 @@ added to the `CMAKE_PREFIX_PATH` when building portBLAS (see
390390

391391
**IMPORTANT NOTE:** The `TARGET` CMake variable is no longer supported. It has
392392
been replaced by `TUNING_TARGET`, which accepts the same options.
393-
`TUNING_TARGET` affects only the tuning configuration, applicable for some operators such
394-
as GEMM, and has no effect on the target triplet for DPC++ or the hipSYCL target. Please
395-
refer to the sections below for setting them.
393+
`TUNING_TARGET` affects only the tuning configuration and has no effect on the target
394+
triplet for DPC++ or the AdaptiveCpp/hipSYCL target. Please refer to the sections
395+
below for setting them.
396396

397397
1. Clone the portBLAS repository, making sure to pass the `--recursive` option, in order
398398
to clone submodule(s).
@@ -417,13 +417,41 @@ advisable for NVIDIA and **mandatory for AMD** to provide the specific device
417417
architecture through `-DDPCPP_SYCL_ARCH=<arch>`, e.g., `<arch>` can be `sm_80`
418418
for NVIDIA or `gfx908` for AMD.
419419

420-
### Compile with hipSYCL
420+
### Compile with AdaptiveCpp *(Formerly hipSYCL)*
421+
The following instructions concern the **generic** *(clang-based)* flow supported
422+
by AdaptiveCpp.
423+
421424
```bash
422425
cd build
423-
cmake -GNinja ../ -DhipSYCL_DIR=/path/to/hipSYCL/install/lib/cmake/hipSYCL -DSYCL_COMPILER=hipsycl
426+
export CC=[path/to/system/clang]
427+
export CXX=[path/to/AdaptiveCpp/install/bin/acpp]
428+
export ACPP_TARGETS=[compilation_flow:target] # (e.g. cuda:sm_75)
429+
cmake -GNinja ../ -DAdaptiveCpp_DIR=/path/to/AdaptiveCpp/install/lib/cmake/AdaptiveCpp \
430+
-DSYCL_COMPILER=adaptivecpp -DACPP_TARGETS=$ACPP_TARGETS
424431
ninja
425432
```
426-
To build for other than the default devices (`omp`), set the `HIPSYCL_TARGETS` environment variable or specify `-DHIPSYCL_TARGETS` as [documented](https://github.com/illuhad/hipSYCL/blob/develop/doc/using-hipsycl.md).
433+
To build for other than the default backend *(host cpu through `omp`*)*, set the `ACPP_TARGETS` environment
434+
variable or specify `-DACPP_TARGETS` as
435+
[documented](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/using-hipsycl.md).
436+
The available backends are the ones built with AdaptiveCpp in the first place.
437+
438+
Similarly to DPCPP's `sycl-ls`, AdaptiveCpp's `acpp-info` helps display the available
439+
backends informations. In case of building AdaptiveCpp against llvm *(generic-flow)*,
440+
the `llvm-to-xxx.so` library files should be visible by the runtime to target the
441+
appropriate device, which can be ensured by setting the ENV variable :
442+
443+
```bash
444+
export LD_LIBRARY_PATH=[path/to/AdaptiveCpp/install/lib/hipSYCL:$LD_LIBRARY_PATH]
445+
export LD_LIBRARY_PATH=[path/to/AdaptiveCpp/install/lib/hipSYCL/llvm-to-backend:$LD_LIBRARY_PATH]
446+
```
447+
448+
*Notes :*
449+
- Some operator kernels are implemented using extensions / SYCL 2020 features not yet implemented
450+
in AdaptiveCpp and are not supported when portBLAS is built with it. These operators include
451+
`asum`, `nrm2`, `dot`, `sdsdot`, `rot`, `trsv`, `tbsv` and `tpsv`.
452+
- The default `omp` host CPU backend *(as well as its optimized variant `omp.accelerated`)* hasn't been
453+
not been fully integrated into the library and currently causes some tests to fail *(interleaved batched
454+
gemm in particular)*. It's thus advised to use the llvm/OpenCL generic flow when targetting CPUs.
427455

428456
### Installing portBLAS
429457
To install the portBLAS library (see `CMAKE_INSTALL_PREFIX` below)
@@ -452,7 +480,7 @@ Some of the supported options are:
452480
|---|---|---|
453481
| `BLAS_ENABLE_TESTING` | `ON`/`OFF` | Set it to `OFF` to avoid building the tests (`ON` is the default value) |
454482
| `BLAS_ENABLE_BENCHMARK` | `ON`/`OFF` | Set it to `OFF` to avoid building the benchmarks (`ON` is the default value) |
455-
| `SYCL_COMPILER` | name | Used to determine which SYCL implementation to use. By default, the first implementation found is used. Supported values are: `dpcpp`, `hipsycl` and `computecpp`*(deprecated)*. |
483+
| `SYCL_COMPILER` | name | Used to determine which SYCL implementation to use. By default, the first implementation found is used. Supported values are: `dpcpp`, `adaptivecpp` and `computecpp`*(deprecated)*. |
456484
| `TUNING_TARGET` | name | By default, this flag is set to `DEFAULT_CPU` to restrict any device specific compiler optimizations. Use this flag to tune the code for a target (**highly recommended** for performance). The supported targets are: `INTEL_GPU`, `NVIDIA_GPU`, `AMD_GPU` |
457485
| `CMAKE_PREFIX_PATH` | path | List of paths to check when searching for dependencies |
458486
| `CMAKE_INSTALL_PREFIX` | path | Specify the install location, used when invoking `ninja install` |

benchmark/portblas/CMakeLists.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,21 @@ if(${BLAS_ENABLE_EXTENSIONS})
8585
list(APPEND sources extension/reduction.cpp)
8686
endif()
8787

88+
# Skip these benchmarks for AdaptiveCpp for SPIRV/OpenCL targets
89+
# that use SYCL 2020 features like group reduction or hang
90+
# during execution (https://github.com/AdaptiveCpp/AdaptiveCpp/issues/1309)
91+
set(ADAPTIVE_CPP_SKIP
92+
blas1/asum.cpp
93+
blas1/dot.cpp
94+
blas1/sdsdot.cpp
95+
blas1/nrm2.cpp
96+
blas2/trsv.cpp
97+
blas2/tbsv.cpp
98+
blas2/tpsv.cpp
99+
# Hang during execution (without failing)
100+
blas3/trsm.cpp
101+
)
102+
88103
# Operators supporting COMPLEX types benchmarking
89104
set(CPLX_OPS "gemm"
90105
"gemm_batched"
@@ -101,6 +116,9 @@ set(HALF_DATA_OPS "axpy"
101116
# Add individual benchmarks for each method
102117
foreach(portblas_bench ${sources})
103118
get_filename_component(bench_exec ${portblas_bench} NAME_WE)
119+
if(is_adaptivecpp AND ${portblas_bench} IN_LIST ADAPTIVE_CPP_SKIP)
120+
continue()
121+
endif()
104122
add_executable(bench_${bench_exec} ${portblas_bench} main.cpp)
105123
target_link_libraries(bench_${bench_exec} PRIVATE benchmark Clara::Clara portblas bench_info)
106124
target_compile_definitions(bench_${bench_exec} PRIVATE -DBLAS_INDEX_T=${BLAS_BENCHMARK_INDEX_TYPE})

cmake/Modules/SYCL.cmake

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,36 +25,39 @@
2525
include(CheckCXXCompilerFlag)
2626
include(ConfigurePORTBLAS)
2727

28-
# find_package(hipSYCL) requires HIPSYCL_TARGETS to be set, so set it to a default value before find_package(hipSYCL)
29-
if(SYCL_COMPILER MATCHES "hipsycl" AND NOT HIPSYCL_TARGETS AND NOT ENV{HIPSYCL_TARGETS})
30-
message(STATUS "Using `omp` as HIPSYCL_TARGETS")
31-
set(HIPSYCL_TARGETS "omp")
28+
# find_package(AdaptiveCpp) requires ACPP_TARGETS to be set, so set it to a default value before find_package(AdaptiveCpp)
29+
if(SYCL_COMPILER MATCHES "adaptivecpp" AND NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS})
30+
message(STATUS "Using `omp` as ACPP_TARGETS")
31+
set(ACPP_TARGETS "omp")
32+
else()
33+
message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS")
3234
endif()
3335

36+
check_cxx_compiler_flag("--acpp-targets" has_acpp)
3437
check_cxx_compiler_flag("-fsycl" has_fsycl)
3538

3639
if(NOT SYCL_COMPILER)
37-
if(has_fsycl)
40+
if(has_acpp)
41+
find_package(AdaptiveCpp QUIET)
42+
set(is_adaptivecpp ${AdaptiveCpp_FOUND})
43+
set(SYCL_COMPILER "adaptivecpp")
44+
else()
3845
set(is_dpcpp ON)
3946
set(SYCL_COMPILER "dpcpp")
40-
else()
41-
find_package(hipSYCL QUIET)
42-
set(is_hipsycl ${hipSYCL_FOUND})
43-
set(SYCL_COMPILER "hipsycl")
44-
if(NOT is_hipsycl)
45-
set(is_computecpp ON)
46-
set(SYCL_COMPILER "computecpp")
47-
endif()
4847
endif()
4948
else()
5049
if(SYCL_COMPILER MATCHES "dpcpp")
5150
set(is_dpcpp ON)
5251
if(NOT has_fsycl)
5352
message(WARNING "Selected DPC++ as backend, but -fsycl not supported")
5453
endif()
55-
elseif(SYCL_COMPILER MATCHES "hipsycl")
56-
find_package(hipSYCL REQUIRED CONFIG)
57-
set(is_hipsycl ON)
54+
elseif(SYCL_COMPILER MATCHES "adaptivecpp")
55+
find_package(AdaptiveCpp CONFIG REQUIRED)
56+
set(is_adaptivecpp ${AdaptiveCpp_FOUND})
57+
if(NOT has_acpp)
58+
message(WARNING "Selected AdaptiveCpp as backend, but the compiler is not
59+
fully supported")
60+
endif()
5861
elseif(SYCL_COMPILER MATCHES "computecpp")
5962
set(is_computecpp ON)
6063
else()
@@ -88,8 +91,14 @@ elseif(is_dpcpp)
8891
endif()
8992
find_package(DPCPP REQUIRED)
9093
get_target_property(SYCL_INCLUDE_DIRS DPCPP::DPCPP INTERFACE_INCLUDE_DIRECTORIES)
91-
elseif(is_hipsycl)
94+
elseif(is_adaptivecpp)
9295
set(CMAKE_CXX_STANDARD 17)
9396
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
94-
get_target_property(SYCL_INCLUDE_DIRS hipSYCL::hipSYCL-rt INTERFACE_INCLUDE_DIRECTORIES)
97+
get_target_property(SYCL_INCLUDE_DIRS AdaptiveCpp::acpp-rt INTERFACE_INCLUDE_DIRECTORIES)
98+
set(HIP_BENCH_UNSUPPORTED_TARGETS "INTEL_GPU" "DEFAULT_CPU")
99+
if((${BLAS_ENABLE_BENCHMARK}) AND (${TUNING_TARGET} IN_LIST HIP_BENCH_UNSUPPORTED_TARGETS))
100+
message(STATUS "Benchmarks are not supported when targetting OpenCL/LevelZero backend
101+
devices. portBLAS Benchmarks are disabled.")
102+
set(BLAS_ENABLE_BENCHMARK OFF)
103+
endif()
95104
endif()

include/container/sycl_iterator.h

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -194,27 +194,51 @@ template <cl::sycl::access::mode acc_md_t>
194194
inline typename BufferIterator<element_t>::template accessor_t<acc_md_t>
195195
BufferIterator<element_t>::get_range_accessor(cl::sycl::handler& cgh,
196196
size_t size) {
197-
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
198-
buffer_, cgh, cl::sycl::range<1>(size),
199-
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
197+
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
198+
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
199+
buffer_, cgh, cl::sycl::range<1>(size),
200+
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
201+
} else {
202+
// Skip data initialization if not accessing in read mode only
203+
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
204+
buffer_, cgh, cl::sycl::range<1>(size),
205+
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()),
206+
cl::sycl::property::no_init{});
207+
}
200208
}
201209

202210
template <typename element_t>
203211
template <cl::sycl::access::mode acc_md_t>
204212
inline typename BufferIterator<element_t>::template accessor_t<acc_md_t>
205213
BufferIterator<element_t>::get_range_accessor(cl::sycl::handler& cgh) {
206-
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
207-
cgh, BufferIterator<element_t>::get_size());
214+
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
215+
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
216+
cgh, BufferIterator<element_t>::get_size());
217+
} else {
218+
// Skip data initialization if not accessing in read mode only
219+
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
220+
cgh, BufferIterator<element_t>::get_size(),
221+
cl::sycl::property::no_init{});
222+
}
208223
}
209224

210225
template <typename element_t>
211226
template <cl::sycl::access::mode acc_md_t>
212227
inline typename BufferIterator<element_t>::template placeholder_accessor_t<
213228
acc_md_t>
214229
BufferIterator<element_t>::get_range_accessor(size_t size) {
215-
return typename BufferIterator<element_t>::template placeholder_accessor_t<
216-
acc_md_t>(buffer_, cl::sycl::range<1>(size),
217-
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
230+
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
231+
return typename BufferIterator<element_t>::template placeholder_accessor_t<
232+
acc_md_t>(buffer_, cl::sycl::range<1>(size),
233+
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
234+
235+
} else {
236+
// Skip data initialization if not accessing in read mode only
237+
return typename BufferIterator<element_t>::template placeholder_accessor_t<
238+
acc_md_t>(buffer_, cl::sycl::range<1>(size),
239+
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()),
240+
cl::sycl::property::no_init{});
241+
}
218242
}
219243

220244
template <typename element_t>

include/interface/blas1_interface.h

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ typename sb_handle_t::event_t _swap(
196196
const typename sb_handle_t::event_t &_dependencies);
197197

198198
/**
199-
* \brief SCALAR operation on a vector
199+
* \brief SCALAR operation on a vector
200200
* @param sb_handle_t sb_handle
201201
* @param _vx BufferIterator or USM pointer
202202
* @param _incx Increment for the vector X
@@ -208,6 +208,37 @@ typename sb_handle_t::event_t _scal(
208208
sb_handle_t &sb_handle, index_t _N, element_t _alpha, container_0_t _vx,
209209
increment_t _incx, const typename sb_handle_t::event_t &_dependencies);
210210

211+
/**
212+
* \brief SCALAR operation on a matrix. (this is a generalization of
213+
* vector-based _scal operator meant for internal use within the library, namely
214+
* for GEMM and inplace-Matcopy operators)
215+
* @param sb_handle_t sb_handle
216+
* @param _A Input/Output BufferIterator or USM pointer
217+
* @param _incA Increment for the matrix A
218+
* @param _lda Leading dimension for the matrix A
219+
* @param _M number of rows
220+
* @param _N number of columns
221+
* @param alpha scaling scalar
222+
* @param _dependencies Vector of events
223+
*/
224+
template <typename sb_handle_t, typename element_t, typename container_0_t,
225+
typename index_t, typename increment_t>
226+
typename sb_handle_t::event_t _scal_matrix(
227+
sb_handle_t &sb_handle, index_t _M, index_t _N, element_t _alpha,
228+
container_0_t _A, index_t _lda, increment_t _incA,
229+
const typename sb_handle_t::event_t &_dependencies);
230+
231+
/*!
232+
* \brief Prototype for the internal implementation of the _scal_matrix
233+
* operator.
234+
*/
235+
template <bool has_inc, typename sb_handle_t, typename element_t,
236+
typename container_0_t, typename index_t, typename increment_t>
237+
typename sb_handle_t::event_t _scal_matrix_impl(
238+
sb_handle_t &sb_handle, index_t _M, index_t _N, element_t _alpha,
239+
container_0_t _A, index_t _lda, increment_t _incA,
240+
const typename sb_handle_t::event_t &_dependencies);
241+
211242
/**
212243
* \brief NRM2 Returns the euclidian norm of a vector
213244
* @param sb_handle SB_Handle

include/operations/blas_constants.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,16 +263,18 @@ struct constant_pair {
263263

264264
} // namespace blas
265265

266+
#ifndef __ADAPTIVECPP__
266267
template <typename ind_t, typename val_t>
267-
struct sycl::is_device_copyable<blas::IndexValueTuple<ind_t, val_t>>
268+
struct cl::sycl::is_device_copyable<blas::IndexValueTuple<ind_t, val_t>>
268269
: std::true_type {};
269270

270271
template <typename ind_t, typename val_t>
271-
struct sycl::is_device_copyable<const blas::IndexValueTuple<ind_t, val_t>>
272+
struct cl::sycl::is_device_copyable<const blas::IndexValueTuple<ind_t, val_t>>
272273
: std::true_type {};
273274

274275
template <typename ind_t, typename val_t>
275276
struct std::is_trivially_copyable<blas::IndexValueTuple<ind_t, val_t>>
276277
: std::true_type {};
278+
#endif
277279

278280
#endif // BLAS_CONSTANTS_H

include/sb_handle/portblas_handle.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,18 +49,24 @@ class SB_Handle {
4949
public:
5050
using event_t = std::vector<cl::sycl::event>;
5151
inline SB_Handle(queue_t q)
52-
: tempMemPool_(nullptr),
52+
:
53+
#ifndef __ADAPTIVECPP__
54+
tempMemPool_(nullptr),
55+
#endif
5356
q_(q),
5457
workGroupSize_(helper::get_work_group_size(q)),
5558
localMemorySupport_(helper::has_local_memory(q)),
56-
computeUnits_(helper::get_num_compute_units(q)) {}
59+
computeUnits_(helper::get_num_compute_units(q)) {
60+
}
5761

62+
#ifndef __ADAPTIVECPP__
5863
inline SB_Handle(Temp_Mem_Pool* tmp)
5964
: tempMemPool_(tmp),
6065
q_(tmp->get_queue()),
6166
workGroupSize_(helper::get_work_group_size(q_)),
6267
localMemorySupport_(helper::has_local_memory(q_)),
6368
computeUnits_(helper::get_num_compute_units(q_)) {}
69+
#endif
6470

6571
template <helper::AllocType alloc, typename value_t>
6672
typename std::enable_if<
@@ -191,7 +197,9 @@ class SB_Handle {
191197
const size_t workGroupSize_;
192198
const bool localMemorySupport_;
193199
const size_t computeUnits_;
200+
#ifndef __ADAPTIVECPP__
194201
Temp_Mem_Pool* tempMemPool_;
202+
#endif
195203
};
196204

197205
} // namespace blas

0 commit comments

Comments
 (0)