Rust-GPU · LegNeato · Jul 12, 2025 · Apr 1, 2025 · Apr 24, 2025 · Jul 12, 2025
diff --git a/crates/cuda_std/Cargo.toml b/crates/cuda_std/Cargo.toml
@@ -8,6 +8,7 @@ repository = "https://github.com/Rust-GPU/Rust-CUDA"
 readme = "../../README.md"
 
 [dependencies]
+glam = { version = "0.30", default-features = false, features = ["libm", "cuda", "bytemuck"] }
 vek = { version = "0.17.1", default-features = false, features = ["libm"] }
 cuda_std_macros = { version = "0.2", path = "../cuda_std_macros" }
 half = "2.4.1"

diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
@@ -49,7 +49,13 @@ mod float_ext;
 pub use cuda_std_macros::*;
 pub use float::GpuFloat;
 pub use float_ext::*;
+pub use glam;
 pub use half;
-pub use vek;
+// `#[deprecated]` has no effect on a plain `pub use` re-export, so wrap the
+// crate in a module that can actually carry the attribute.
+#[deprecated(note = "The `vek` module is deprecated, use `glam` instead.")]
+pub mod vek {
+    pub use ::vek::*;
+}
 
 pub use half::{bf16, f16};

diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs
@@ -152,23 +152,23 @@ impl<'a> From<&'a GridSize> for GridSize {
         other.clone()
     }
 }
-impl From<vek::Vec2<u32>> for GridSize {
-    fn from(vec: vek::Vec2<u32>) -> Self {
+impl From<glam::UVec2> for GridSize {
+    fn from(vec: glam::UVec2) -> Self {
         GridSize::xy(vec.x, vec.y)
     }
 }
-impl From<vek::Vec3<u32>> for GridSize {
-    fn from(vec: vek::Vec3<u32>) -> Self {
+impl From<glam::UVec3> for GridSize {
+    fn from(vec: glam::UVec3) -> Self {
         GridSize::xyz(vec.x, vec.y, vec.z)
     }
 }
-impl From<vek::Vec2<usize>> for GridSize {
-    fn from(vec: vek::Vec2<usize>) -> Self {
+impl From<glam::USizeVec2> for GridSize {
+    fn from(vec: glam::USizeVec2) -> Self {
         GridSize::xy(vec.x as u32, vec.y as u32)
     }
 }
-impl From<vek::Vec3<usize>> for GridSize {
-    fn from(vec: vek::Vec3<usize>) -> Self {
+impl From<glam::USizeVec3> for GridSize {
+    fn from(vec: glam::USizeVec3) -> Self {
         GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
     }
 }
@@ -228,23 +228,23 @@ impl<'a> From<&'a BlockSize> for BlockSize {
         other.clone()
     }
 }
-impl From<vek::Vec2<u32>> for BlockSize {
-    fn from(vec: vek::Vec2<u32>) -> Self {
+impl From<glam::UVec2> for BlockSize {
+    fn from(vec: glam::UVec2) -> Self {
         BlockSize::xy(vec.x, vec.y)
     }
 }
-impl From<vek::Vec3<u32>> for BlockSize {
-    fn from(vec: vek::Vec3<u32>) -> Self {
+impl From<glam::UVec3> for BlockSize {
+    fn from(vec: glam::UVec3) -> Self {
         BlockSize::xyz(vec.x, vec.y, vec.z)
     }
 }
-impl From<vek::Vec2<usize>> for BlockSize {
-    fn from(vec: vek::Vec2<usize>) -> Self {
+impl From<glam::USizeVec2> for BlockSize {
+    fn from(vec: glam::USizeVec2) -> Self {
         BlockSize::xy(vec.x as u32, vec.y as u32)
     }
 }
-impl From<vek::Vec3<usize>> for BlockSize {
-    fn from(vec: vek::Vec3<usize>) -> Self {
+impl From<glam::USizeVec3> for BlockSize {
+    fn from(vec: glam::USizeVec3) -> Self {
         BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
     }
 }
diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs
@@ -19,7 +19,7 @@
 // TODO: write some docs about the terms used in this module.
 
 use cuda_std_macros::gpu_only;
-use vek::{Vec2, Vec3};
+use glam::{UVec2, UVec3};
 
 // different calling conventions dont exist in nvptx, so we just use C as a placeholder.
 extern "C" {
@@ -152,9 +152,9 @@ pub fn grid_dim_z() -> u32 {
 /// Gets the 3d index of the thread currently executing the kernel.
 #[gpu_only]
 #[inline(always)]
-pub fn thread_idx() -> Vec3<u32> {
+pub fn thread_idx() -> UVec3 {
     unsafe {
-        Vec3::new(
+        UVec3::new(
             __nvvm_thread_idx_x(),
             __nvvm_thread_idx_y(),
             __nvvm_thread_idx_z(),
@@ -165,9 +165,9 @@ pub fn thread_idx() -> Vec3<u32> {
 /// Gets the 3d index of the block that the thread currently executing the kernel is located in.
 #[gpu_only]
 #[inline(always)]
-pub fn block_idx() -> Vec3<u32> {
+pub fn block_idx() -> UVec3 {
     unsafe {
-        Vec3::new(
+        UVec3::new(
             __nvvm_block_idx_x(),
             __nvvm_block_idx_y(),
             __nvvm_block_idx_z(),
@@ -179,9 +179,9 @@ pub fn block_idx() -> Vec3<u32> {
 /// how many threads exist in each thread block in every direction.
 #[gpu_only]
 #[inline(always)]
-pub fn block_dim() -> Vec3<u32> {
+pub fn block_dim() -> UVec3 {
     unsafe {
-        Vec3::new(
+        UVec3::new(
             __nvvm_block_dim_x(),
             __nvvm_block_dim_y(),
             __nvvm_block_dim_z(),
@@ -193,9 +193,9 @@ pub fn block_dim() -> Vec3<u32> {
 /// how many thread blocks exist in each grid in every direction.
 #[gpu_only]
 #[inline(always)]
-pub fn grid_dim() -> Vec3<u32> {
+pub fn grid_dim() -> UVec3 {
     unsafe {
-        Vec3::new(
+        UVec3::new(
             __nvvm_grid_dim_x(),
             __nvvm_grid_dim_y(),
             __nvvm_grid_dim_z(),
@@ -206,7 +206,7 @@ pub fn grid_dim() -> Vec3<u32> {
 /// Gets the overall thread index, accounting for 1d/2d/3d block/grid dimensions. This
 /// value is most commonly used for indexing into data and this index is guaranteed to
 /// be unique for every single thread executing this kernel no matter the launch configuration.
-/// 
+///
 /// For very simple kernels it may be faster to use a more simple index calculation, however,
 /// it will be unsound if the kernel launches in a 2d/3d configuration.
 #[gpu_only]
@@ -218,10 +218,10 @@ pub fn index() -> u32 {
     let block_dim = block_dim();
     let thread_idx = thread_idx();
 
-    let block_id = block_idx.x + block_idx.y * grid_dim.x 
+    let block_id = block_idx.x + block_idx.y * grid_dim.x
                        + grid_dim.x * grid_dim.y * block_idx.z;
 
-    block_id * block_dim.product()
+    block_id * block_dim.element_product()
     + (thread_idx.z * (block_dim.x * block_dim.y))
     + (thread_idx.y * block_dim.x) + thread_idx.x
 }
@@ -232,26 +232,26 @@ pub fn index_1d() -> u32 {
 }
 
 #[inline(always)]
-pub fn index_2d() -> Vec2<u32> {
+pub fn index_2d() -> UVec2 {
     let i = thread_idx_x() + block_idx_x() * block_dim_x();
     let j = thread_idx_y() + block_idx_y() * block_dim_y();
-    Vec2::new(i, j)
+    UVec2::new(i, j)
 }
 
 #[inline(always)]
-pub fn index_3d() -> Vec3<u32> {
+pub fn index_3d() -> UVec3 {
     let i = thread_idx_x() + block_idx_x() * block_dim_x();
     let j = thread_idx_y() + block_idx_y() * block_dim_y();
     let k = thread_idx_z() + block_idx_z() * block_dim_z();
-    Vec3::new(i, j, k)
+    UVec3::new(i, j, k)
 }
 
 /// Whether this is the first thread (not the first thread to be executing). This function is guaranteed
 /// to only return true in a single thread that is invoking it. This is useful for only doing something
 /// once.
 #[inline(always)]
 pub fn first() -> bool {
-    block_idx() == Vec3::zero() && thread_idx() == Vec3::zero()
+    block_idx() == UVec3::ZERO && thread_idx() == UVec3::ZERO
 }
 
 /// Gets the number of threads inside of a warp. Currently 32 threads on every GPU architecture.

diff --git a/crates/cust/CHANGELOG.md b/crates/cust/CHANGELOG.md
@@ -4,6 +4,7 @@ Notable changes to this project will be documented in this file.
 
 ## Unreleased
 
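+- `cuda_std::vek` is now deprecated; use `cuda_std::glam` instead. `GridSize` and `BlockSize` now implement `From` for glam's `UVec2`, `UVec3`, `USizeVec2`, and `USizeVec3` when the `glam` feature is enabled.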
 - Add `memory::memcpy_dtoh` to allow copying from device to host.
 - `DeviceSlice` is represented as a slice again, but as `[()]` instead of `[T]`.
 - Reimplemented `Index` and `IndexMut` for `DeviceSlice` and removed `DeviceSlice::index`.

diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml
@@ -17,7 +17,7 @@ cust_core = { path = "../cust_core", version = "0.1.0"}
 cust_raw = { path = "../cust_raw", default-features = false, features = ["driver"] }
 bitflags = "2.8"
 cust_derive = { path = "../cust_derive", version = "0.2" }
-glam = { version = "0.29.2", features=["cuda"], optional = true }
+glam = { version = "0.30", features=["cuda"], optional = true }
 mint = { version = "^0.5", optional = true }
 num-complex = { version = "0.4.6", optional = true }
 vek = { version = "0.17.1", optional = true, default-features = false }

diff --git a/crates/cust/src/function.rs b/crates/cust/src/function.rs
@@ -88,6 +88,31 @@ impl From<vek::Vec3<usize>> for GridSize {
     }
 }
 
+#[cfg(feature = "glam")]
+impl From<glam::UVec2> for GridSize {
+    fn from(vec: glam::UVec2) -> Self {
+        GridSize::xy(vec.x, vec.y)
+    }
+}
+#[cfg(feature = "glam")]
+impl From<glam::UVec3> for GridSize {
+    fn from(vec: glam::UVec3) -> Self {
+        GridSize::xyz(vec.x, vec.y, vec.z)
+    }
+}
+#[cfg(feature = "glam")]
+impl From<glam::USizeVec2> for GridSize {
+    fn from(vec: glam::USizeVec2) -> Self {
+        GridSize::xy(vec.x as u32, vec.y as u32)
+    }
+}
+#[cfg(feature = "glam")]
+impl From<glam::USizeVec3> for GridSize {
+    fn from(vec: glam::USizeVec3) -> Self {
+        GridSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
+    }
+}
+
 /// Dimensions of a thread block, or the number of threads in a block.
 ///
 /// Each component of a `BlockSize` must be at least 1. The maximum size depends on your device's
@@ -168,6 +193,31 @@ impl From<vek::Vec3<usize>> for BlockSize {
     }
 }
 
+#[cfg(feature = "glam")]
+impl From<glam::UVec2> for BlockSize {
+    fn from(vec: glam::UVec2) -> Self {
+        BlockSize::xy(vec.x, vec.y)
+    }
+}
+#[cfg(feature = "glam")]
+impl From<glam::UVec3> for BlockSize {
+    fn from(vec: glam::UVec3) -> Self {
+        BlockSize::xyz(vec.x, vec.y, vec.z)
+    }
+}
+#[cfg(feature = "glam")]
+impl From<glam::USizeVec2> for BlockSize {
+    fn from(vec: glam::USizeVec2) -> Self {
+        BlockSize::xy(vec.x as u32, vec.y as u32)
+    }
+}
+#[cfg(feature = "glam")]
+impl From<glam::USizeVec3> for BlockSize {
+    fn from(vec: glam::USizeVec3) -> Self {
+        BlockSize::xyz(vec.x as u32, vec.y as u32, vec.z as u32)
+    }
+}
+
 /// All supported function attributes for [Function::get_attribute](struct.Function.html#method.get_attribute)
 #[repr(u32)]
 #[non_exhaustive]

diff --git a/crates/cust_core/Cargo.toml b/crates/cust_core/Cargo.toml
@@ -9,7 +9,7 @@ readme = "../../README.md"
 
 [dependencies]
 vek = { version = "0.17.1", default-features=false, features=["libm"], optional = true }
-glam = { version = "0.29.2", features=["cuda", "libm"], default-features=false, optional=true }
+glam = { version = "0.30", features=["cuda", "libm"], default-features=false, optional=true }
 mint = { version = "^0.5", optional = true }
 half = { version = "2.4.1", optional = true }
 num-complex = { version = "0.4.6", optional = true }

diff --git a/crates/cust_core/src/lib.rs b/crates/cust_core/src/lib.rs
@@ -1,5 +1,3 @@
-#![no_std]
-
 pub use _hidden::*;
 pub use cust_derive::DeviceCopyCore as DeviceCopy;
 
@@ -143,6 +141,7 @@ pub mod _hidden {
     {
     }
 
+    #[allow(unused_macros)]
     macro_rules! impl_device_copy_generic {
     ($($($strukt:ident)::+),* $(,)?) => {
         $(
@@ -151,6 +150,7 @@ pub mod _hidden {
     }
 }
 
+    #[allow(unused_macros)]
     macro_rules! impl_device_copy {
     ($($strukt:ty),* $(,)?) => {
         $(
@@ -172,7 +172,22 @@ pub mod _hidden {
 
     #[cfg(feature = "glam")]
     impl_device_copy! {
-        glam::Vec2, glam::Vec3, glam::Vec4, glam::IVec2, glam::IVec3, glam::IVec4,
+        glam::BVec2, glam::BVec3, glam::BVec3A, glam::BVec4, glam::BVec4A,
+        glam::U8Vec2, glam::U8Vec3, glam::U8Vec4,
+        glam::I8Vec2, glam::I8Vec3, glam::I8Vec4,
+        glam::U16Vec2, glam::U16Vec3, glam::U16Vec4,
+        glam::I16Vec2, glam::I16Vec3, glam::I16Vec4,
+        glam::UVec2, glam::UVec3, glam::UVec4,
+        glam::IVec2, glam::IVec3, glam::IVec4,
+        glam::U64Vec2, glam::U64Vec3, glam::U64Vec4,
+        glam::I64Vec2, glam::I64Vec3, glam::I64Vec4,
+        glam::USizeVec2, glam::USizeVec3, glam::USizeVec4,
+        glam::Vec2, glam::Vec3, glam::Vec3A, glam::Vec4,
+        glam::DVec2, glam::DVec3, glam::DVec4,
+        glam::Mat2, glam::Mat3, glam::Mat3A, glam::Mat4,
+        glam::DMat2, glam::DMat3, glam::DMat4,
+        glam::Quat, glam::DQuat,
+        glam::Affine2, glam::Affine3A,
     }
 
     #[cfg(feature = "mint")]

diff --git a/crates/gpu_rand/src/lib.rs b/crates/gpu_rand/src/lib.rs
@@ -12,7 +12,6 @@
 #![deny(missing_docs)]
 #![deny(missing_debug_implementations)]
 #![allow(clippy::unreadable_literal)]
-#![cfg_attr(target_os = "cuda", no_std)]
 #![feature(doc_cfg)]
 
 pub mod xoroshiro;

diff --git a/crates/optix/Cargo.toml b/crates/optix/Cargo.toml
@@ -12,7 +12,7 @@ cust = { version = "0.3", path = "../cust", features=["impl_mint"] }
 cust_raw = { path = "../cust_raw", features=["driver"] }
 cfg-if = "1.0.0"
 bitflags = "2.9.0"
-glam = { version = "0.29", features=["cuda", "libm"], default-features=false, optional=true }
+glam = { version = "0.30", features=["cuda", "libm"], default-features=false, optional=true }
 half = { version = "2.4.1", optional = true }
 memoffset = "0.9.1"
 mint = "0.5.9"

diff --git a/crates/optix/examples/ex03_window/Cargo.toml b/crates/optix/examples/ex03_window/Cargo.toml
@@ -13,3 +13,4 @@ anyhow = "1.0.44"
 glfw = "0.42.0"
 gl = "0.14.0"
 num-traits = "0.2.14"
+glam = { version = "0.30", features = ["bytemuck"] }
diff --git a/crates/optix/examples/ex03_window/src/gl_util.rs b/crates/optix/examples/ex03_window/src/gl_util.rs
@@ -2,7 +2,7 @@ use gl;
 use gl::types::{GLchar, GLenum, GLint, GLsizeiptr, GLuint, GLvoid};
 use std::ffi::{CStr, CString};
 
-use crate::vector::*;
+use glam::Vec4;
 
 pub struct Shader {
     id: GLuint,
@@ -516,7 +516,7 @@ impl FullscreenQuad {
         self.vertex_array.unbind();
     }
 
-    pub fn update_texture(&self, data: &[V4f32]) {
+    pub fn update_texture(&self, data: &[Vec4]) {
         unsafe {
             gl::BindTexture(gl::TEXTURE_2D, self.texture_id);
             gl::TexSubImage2D(

diff --git a/crates/optix/examples/ex03_window/src/main.rs b/crates/optix/examples/ex03_window/src/main.rs
@@ -3,8 +3,7 @@
 mod renderer;
 use renderer::Renderer;
 
-mod vector;
-pub use vector::*;
+use glam::{IVec2, Vec4};
 mod gl_util;
 use gl_util::FullscreenQuad;
 use glfw::{Action, Context, Key};
@@ -42,7 +41,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     let mut fsq = FullscreenQuad::new(width, height).unwrap();
 
-    let mut image_data = vec![v4f32(0.0, 0.0, 0.0, 0.0); (width * height) as usize];
+    let mut image_data = vec![Vec4::new(0.0, 0.0, 0.0, 0.0); (width * height) as usize];
 
     unsafe {
         gl::Viewport(0, 0, fb_width, fb_height);
@@ -62,7 +61,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
             renderer.resize(w, h)?;
             width = w;
             height = h;
-            image_data.resize((width * height) as usize, v4f32(0.0, 0.0, 0.0, 0.0));
+            image_data.resize((width * height) as usize, Vec4::new(0.0, 0.0, 0.0, 0.0));
         }
 
         renderer.render()?;