|
| 1 | +//! Shared memory handling. Currently only macros. |
| 2 | +
|
/// Statically allocates a buffer large enough for `len` elements of `array_type`, yielding
/// a `*mut array_type` that points to uninitialized shared memory. `len` must be a constant
/// expression.
///
/// Note that this allocates the memory __statically__: it expands to a static in the `shared`
/// address space. Therefore, evaluating the same macro invocation multiple times (for example
/// inside a loop) always yields a pointer to the same buffer. Separate invocations of the macro,
/// however, each declare their own static and therefore yield different buffers.
///
/// The data is uninitialized by default, so you must be careful not to read it before it has
/// been written. The semantics of what "uninitialized" actually means on the GPU (i.e. whether
/// reading yields unknown data or is UB outright) are not well known, so even if the type is
/// valid for any backing memory, make sure to never read uninitialized data.
///
/// The expansion refers to `core` items by absolute path, so callers do not need to import
/// `MaybeUninit` (or anything else) for the macro to compile.
///
/// # Safety
///
/// Shared memory usage is fundamentally extremely unsafe and impossible to statically prove
/// correct, therefore the burden of correctness is on the user. Some of the things you must
/// ensure in your usage of shared memory are:
/// - Shared memory is only shared between the threads of a single __thread block__, not across
///   the entire device, therefore it is unsound to rely on it for sharing data across more than
///   one block.
/// - You must write to the shared buffer before reading from it, as the data is uninitialized
///   by default.
/// - [`thread::sync_threads`](crate::thread::sync_threads) must be called before relying on the
///   results of other threads; this ensures every thread has reached that point before going
///   on. For example, it is required before reading another thread's data after writing to the
///   buffer.
/// - No access may be out of bounds. This usually means making sure the number of threads and
///   their dimensions match the length of the buffer.
///
/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of shared
/// memory are right.
///
/// # Examples
///
/// ```no_run
/// #[kernel]
/// pub unsafe fn reverse_array(d: *mut i32, n: usize) {
///     let s = shared_array![i32; 64];
///     let t = thread::thread_idx_x() as usize;
///     let tr = n - t - 1;
///     *s.add(t) = *d.add(t);
///     thread::sync_threads();
///     *d.add(t) = *s.add(tr);
/// }
/// ```
#[macro_export]
macro_rules! shared_array {
    ($array_type:ty; $len:expr) => {{
        // The initializer is discarded when declaring shared globals, so it is unimportant.
        // `MaybeUninit` is named by absolute path so the expansion does not depend on what the
        // call site happens to have imported.
        #[$crate::address_space(shared)]
        static mut SHARED: ::core::mem::MaybeUninit<[$array_type; $len]> =
            ::core::mem::MaybeUninit::uninit();
        // Take the address directly instead of going through `as_mut_ptr(&mut self)`: every
        // thread evaluates this expression, and materializing a `&mut` to the same `static mut`
        // from each of them would create aliasing mutable references.
        ::core::ptr::addr_of_mut!(SHARED) as *mut $array_type
    }};
}
0 commit comments