Commit bd80dc3

Fix some compilation errors, warnings, and clippy issues

1 parent: 8fc34d5

32 files changed: +391 -287 lines

crates/cuda_std/src/shared.rs

Lines changed: 29 additions & 19 deletions
@@ -2,31 +2,41 @@
 
 use crate::gpu_only;
 
-/// Statically allocates a buffer large enough for `len` elements of `array_type`, yielding
-/// a `*mut array_type` that points to uninitialized shared memory. `len` must be a constant expression.
+/// Statically allocates a buffer large enough for `len` elements of `array_type`,
+/// yielding a `*mut array_type` that points to uninitialized shared memory. `len` must
+/// be a constant expression.
 ///
-/// Note that this allocates the memory __statically__, it expands to a static in the `shared` address space.
-/// Therefore, calling this macro multiple times in a loop will always yield the same data. However, separate
-/// invocations of the macro will yield different buffers.
+/// Note that this allocates the memory __statically__, it expands to a static in the
+/// `shared` address space. Therefore, calling this macro multiple times in a loop will
+/// always yield the same data. However, separate invocations of the macro will yield
+/// different buffers.
 ///
-/// The data is uninitialized by default, therefore, you must be careful to not read the data before it is written to.
-/// The semantics of what "uninitialized" actually means on the GPU (i.e. if it yields unknown data or if it is UB to read it whatsoever)
-/// are not well known, so even if the type is valid for any backing memory, make sure to not read uninitialized data.
+/// The data is uninitialized by default, therefore, you must be careful to not read the
+/// data before it is written to. The semantics of what "uninitialized" actually means
+/// on the GPU (i.e. if it yields unknown data or if it is UB to read it whatsoever) are
+/// not well known, so even if the type is valid for any backing memory, make sure to
+/// not read uninitialized data.
 ///
 /// # Safety
 ///
-/// Shared memory usage is fundamentally extremely unsafe and impossible to statically prove, therefore
-/// the burden of correctness is on the user. Some of the things you must ensure in your usage of
-/// shared memory are:
-/// - Shared memory is only shared across __thread blocks__, not the entire device, therefore it is
-/// unsound to try and rely on sharing data across more than one block.
-/// - You must write to the shared buffer before reading from it as the data is uninitialized by default.
-/// - [`thread::sync_threads`](crate::thread::sync_threads) must be called before relying on the results of other
-/// threads, this ensures every thread has reached that point before going on. For example, reading another thread's
-/// data after writing to the buffer.
-/// - No access may be out of bounds, this usually means making sure the amount of threads and their dimensions are correct.
+/// Shared memory usage is fundamentally extremely unsafe and impossible to statically
+/// prove, therefore the burden of correctness is on the user. Some of the things you
+/// must ensure in your usage of shared memory are:
 ///
-/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of shared memory are right.
+/// - Shared memory is only shared across __thread blocks__, not the entire device,
+/// therefore it is unsound to try and rely on sharing data across more than one
+/// block.
+/// - You must write to the shared buffer before reading from it as the data is
+/// uninitialized by default.
+/// - [`thread::sync_threads`](crate::thread::sync_threads) must be called before
+/// relying on the results of other threads, this ensures every thread has reached
+/// that point before going on. For example, reading another thread's data after
+/// writing to the buffer.
+/// - No access may be out of bounds, this usually means making sure the amount of
+/// threads and their dimensions are correct.
+///
+/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of
+/// shared memory are right.
 ///
 /// # Examples
 ///
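The doc comment above spells out the ordering rules in the abstract; the device-side sketch below shows how they play out in practice. This is a sketch only: the macro name (`shared_array!`), the `#[kernel]` attribute, and the `thread::thread_idx_x` helper are assumed to be cuda_std's public API, since the excerpt shows only the macro's documentation.

```rust
// Sketch: reverse a block-sized slice through shared memory.
// Assumes cuda_std exposes `shared_array!`, `#[kernel]`, and the `thread`
// module as used below; only the macro's doc comment appears in the diff above.
use cuda_std::{shared_array, thread};

#[cuda_std::kernel]
pub unsafe fn reverse_block(data: *mut u32, n: usize) {
    // One statically allocated, uninitialized buffer per macro invocation.
    let buf: *mut u32 = shared_array![u32; 256];
    let tid = thread::thread_idx_x() as usize;
    if tid < n {
        // Write before any thread reads: the buffer starts uninitialized.
        *buf.add(tid) = *data.add(tid);
    }
    // Every thread in the block must reach this point before the reads below.
    thread::sync_threads();
    if tid < n {
        *data.add(tid) = *buf.add(n - 1 - tid);
    }
}
```

The single `sync_threads()` call is what separates the write phase from the read phase, which is exactly the ordering requirement the safety list above describes.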

crates/cuda_std/src/warp.rs

Lines changed: 126 additions & 85 deletions
Large diffs are not rendered by default.

crates/cust/src/context/legacy.rs

Lines changed: 43 additions & 24 deletions
@@ -601,7 +601,11 @@ impl CurrentContext
     pub fn get_resource_limit(resource: ResourceLimit) -> CudaResult<usize> {
         unsafe {
             let mut limit: usize = 0;
-            cuda::cuCtxGetLimit(&mut limit as *mut usize, transmute(resource)).to_result()?;
+            cuda::cuCtxGetLimit(
+                &mut limit as *mut usize,
+                transmute::<ResourceLimit, cust_raw::CUlimit_enum>(resource),
+            )
+            .to_result()?;
             Ok(limit)
         }
     }
@@ -696,33 +700,38 @@ impl CurrentContext
     /// # }
     /// ```
     pub fn set_cache_config(cfg: CacheConfig) -> CudaResult<()> {
-        unsafe { cuda::cuCtxSetCacheConfig(transmute(cfg)).to_result() }
+        unsafe {
+            cuda::cuCtxSetCacheConfig(transmute::<CacheConfig, cust_raw::CUfunc_cache_enum>(cfg))
+                .to_result()
+        }
     }
 
     /// Sets a requested resource limit for the current context.
     ///
-    /// Note that this is only a request; the driver is free to modify the requested value to meet
-    /// hardware requirements. Each limit has some specific restrictions.
+    /// Note that this is only a request; the driver is free to modify the requested
+    /// value to meet hardware requirements. Each limit has some specific restrictions.
     ///
     /// * `StackSize`: Controls the stack size in bytes for each GPU thread
-    /// * `PrintfFifoSize`: Controls the size in bytes of the FIFO used by the `printf()` device
-    /// system call. This cannot be changed after a kernel has been launched which uses the
-    /// `printf()` function.
-    /// * `MallocHeapSize`: Controls the size in bytes of the heap used by the `malloc()` and `free()`
-    /// device system calls. This cannot be changed aftr a kernel has been launched which uses the
-    /// `malloc()` and `free()` system calls.
-    /// * `DeviceRuntimeSyncDepth`: Controls the maximum nesting depth of a grid at which a thread
-    /// can safely call `cudaDeviceSynchronize()`. This cannot be changed after a kernel has been
-    /// launched which uses the device runtime. When setting this limit, keep in mind that
-    /// additional levels of sync depth require the driver to reserve large amounts of device
-    /// memory which can no longer be used for device allocations.
-    /// * `DeviceRuntimePendingLaunchCount`: Controls the maximum number of outstanding device
-    /// runtime launches that can be made from the current context. A grid is outstanding from
-    /// the point of the launch up until the grid is known to have completed. Keep in mind that
-    /// increasing this limit will require the driver to reserve larger amounts of device memory
-    /// which can no longer be used for device allocations.
-    /// * `MaxL2FetchGranularity`: Controls the L2 fetch granularity. This is purely a performance
-    /// hint and it can be ignored or clamped depending on the platform.
+    /// * `PrintfFifoSize`: Controls the size in bytes of the FIFO used by the
+    /// `printf()` device system call. This cannot be changed after a kernel has been
+    /// launched which uses the `printf()` function.
+    /// * `MallocHeapSize`: Controls the size in bytes of the heap used by the
+    /// `malloc()` and `free()` device system calls. This cannot be changed aftr a
+    /// kernel has been launched which uses the `malloc()` and `free()` system calls.
+    /// * `DeviceRuntimeSyncDepth`: Controls the maximum nesting depth of a grid at
+    /// which a thread can safely call `cudaDeviceSynchronize()`. This cannot be
+    /// changed after a kernel has been launched which uses the device runtime. When
+    /// setting this limit, keep in mind that additional levels of sync depth require
+    /// the driver to reserve large amounts of device memory which can no longer be
+    /// used for device allocations.
+    /// * `DeviceRuntimePendingLaunchCount`: Controls the maximum number of outstanding
+    /// device runtime launches that can be made from the current context. A grid is
+    /// outstanding from the point of the launch up until the grid is known to have
+    /// completed. Keep in mind that increasing this limit will require the driver to
+    /// reserve larger amounts of device memory which can no longer be used for device
+    /// allocations.
+    /// * `MaxL2FetchGranularity`: Controls the L2 fetch granularity. This is purely a
+    /// performance hint and it can be ignored or clamped depending on the platform.
     ///
     /// # Example
     ///
@@ -741,7 +750,11 @@ impl CurrentContext
     /// ```
     pub fn set_resource_limit(resource: ResourceLimit, limit: usize) -> CudaResult<()> {
         unsafe {
-            cuda::cuCtxSetLimit(transmute(resource), limit).to_result()?;
+            cuda::cuCtxSetLimit(
+                transmute::<ResourceLimit, cust_raw::CUlimit_enum>(resource),
+                limit,
+            )
+            .to_result()?;
             Ok(())
         }
     }
@@ -767,7 +780,13 @@ impl CurrentContext
     /// # }
     /// ```
     pub fn set_shared_memory_config(cfg: SharedMemoryConfig) -> CudaResult<()> {
-        unsafe { cuda::cuCtxSetSharedMemConfig(transmute(cfg)).to_result() }
+        unsafe {
+            cuda::cuCtxSetSharedMemConfig(transmute::<
+                SharedMemoryConfig,
+                cust_raw::CUsharedconfig_enum,
+            >(cfg))
+            .to_result()
+        }
     }
 
     /// Returns a non-owning handle to the current context.
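The recurring change in this file (and in the files below) is mechanical: every bare `transmute(x)` gains explicit source and target types, `transmute::<Src, Dst>(x)`. This looks like a fix for clippy's `missing_transmute_annotations` lint, though that is an assumption; the commit message only says "clippy issues". A minimal, self-contained sketch of the before/after shape, using hypothetical `Flag`/`RawFlag` enums rather than the cust types:

```rust
use std::mem;

// Hypothetical stand-ins for the cust/cust_raw enum pairs being converted.
#[repr(u32)]
#[derive(Clone, Copy, Debug)]
enum Flag {
    A = 0,
    B = 1,
}

// Allow dead_code because the variants are only ever produced via transmute.
#[allow(dead_code)]
#[repr(u32)]
#[derive(Debug)]
enum RawFlag {
    RawA = 0,
    RawB = 1,
}

fn to_raw(flag: Flag) -> RawFlag {
    // Before: `unsafe { mem::transmute(flag) }` — the target type is inferred
    // from the return type, which makes it easy to change silently.
    // After: both type parameters are spelled out at the call site.
    unsafe { mem::transmute::<Flag, RawFlag>(flag) }
}

fn main() {
    for f in [Flag::A, Flag::B] {
        println!("{:?} -> {:?}", f, to_raw(f));
    }
}
```

The generated code is identical; the annotation only pins down what the conversion is between, so a later change to the surrounding types turns into a compile error instead of a silent reinterpretation.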

crates/cust/src/context/mod.rs

Lines changed: 46 additions & 26 deletions
@@ -422,7 +422,11 @@ impl CurrentContext
     pub fn get_resource_limit(resource: ResourceLimit) -> CudaResult<usize> {
         unsafe {
             let mut limit: usize = 0;
-            cuda::cuCtxGetLimit(&mut limit as *mut usize, transmute(resource)).to_result()?;
+            cuda::cuCtxGetLimit(
+                &mut limit as *mut usize,
+                transmute::<ResourceLimit, cust_raw::CUlimit_enum>(resource),
+            )
+            .to_result()?;
             Ok(limit)
         }
     }
@@ -517,33 +521,39 @@ impl CurrentContext
     /// # }
     /// ```
     pub fn set_cache_config(cfg: CacheConfig) -> CudaResult<()> {
-        unsafe { cuda::cuCtxSetCacheConfig(transmute(cfg)).to_result() }
+        unsafe {
+            cuda::cuCtxSetCacheConfig(transmute::<CacheConfig, cust_raw::CUfunc_cache_enum>(cfg))
+                .to_result()
+        }
     }
 
     /// Sets a requested resource limit for the current context.
     ///
-    /// Note that this is only a request; the driver is free to modify the requested value to meet
-    /// hardware requirements. Each limit has some specific restrictions.
-    ///
-    /// * `StackSize`: Controls the stack size in bytes for each GPU thread
-    /// * `PrintfFifoSize`: Controls the size in bytes of the FIFO used by the `printf()` device
-    /// system call. This cannot be changed after a kernel has been launched which uses the
-    /// `printf()` function.
-    /// * `MallocHeapSize`: Controls the size in bytes of the heap used by the `malloc()` and `free()`
-    /// device system calls. This cannot be changed aftr a kernel has been launched which uses the
-    /// `malloc()` and `free()` system calls.
-    /// * `DeviceRuntimeSyncDepth`: Controls the maximum nesting depth of a grid at which a thread
-    /// can safely call `cudaDeviceSynchronize()`. This cannot be changed after a kernel has been
-    /// launched which uses the device runtime. When setting this limit, keep in mind that
-    /// additional levels of sync depth require the driver to reserve large amounts of device
-    /// memory which can no longer be used for device allocations.
-    /// * `DeviceRuntimePendingLaunchCount`: Controls the maximum number of outstanding device
-    /// runtime launches that can be made from the current context. A grid is outstanding from
-    /// the point of the launch up until the grid is known to have completed. Keep in mind that
-    /// increasing this limit will require the driver to reserve larger amounts of device memory
-    /// which can no longer be used for device allocations.
-    /// * `MaxL2FetchGranularity`: Controls the L2 fetch granularity. This is purely a performance
-    /// hint and it can be ignored or clamped depending on the platform.
+    /// Note that this is only a request; the driver is free to modify the requested
+    /// value to meet hardware requirements. Each limit has some specific restrictions.
+    ///
+    /// * `StackSize`: Controls the stack size in bytes for each GPU thread
+    /// * `PrintfFifoSize`: Controls the size in bytes of the FIFO used by the
+    /// `printf()` device system call. This cannot be changed after a kernel has
+    /// been launched which uses the `printf()` function.
+    /// * `MallocHeapSize`: Controls the size in bytes of the heap used by the
+    /// `malloc()` and `free()` device system calls. This cannot be changed aftr a
+    /// kernel has been launched which uses the `malloc()` and `free()` system
+    /// calls.
+    /// * `DeviceRuntimeSyncDepth`: Controls the maximum nesting depth of a grid at
+    /// which a thread can safely call `cudaDeviceSynchronize()`. This cannot be
+    /// changed after a kernel has been launched which uses the device runtime. When
+    /// setting this limit, keep in mind that additional levels of sync depth
+    /// require the driver to reserve large amounts of device memory which can no
+    /// longer be used for device allocations.
+    /// * `DeviceRuntimePendingLaunchCount`: Controls the maximum number of
+    /// outstanding device runtime launches that can be made from the current
+    /// context. A grid is outstanding from the point of the launch up until the
+    /// grid is known to have completed. Keep in mind that increasing this limit
+    /// will require the driver to reserve larger amounts of device memory which can
+    /// no longer be used for device allocations.
+    /// * `MaxL2FetchGranularity`: Controls the L2 fetch granularity. This is purely a
+    /// performance hint and it can be ignored or clamped depending on the platform.
     ///
     /// # Example
     ///
@@ -562,7 +572,11 @@ impl CurrentContext
     /// ```
     pub fn set_resource_limit(resource: ResourceLimit, limit: usize) -> CudaResult<()> {
         unsafe {
-            cuda::cuCtxSetLimit(transmute(resource), limit).to_result()?;
+            cuda::cuCtxSetLimit(
+                transmute::<ResourceLimit, cust_raw::CUlimit_enum>(resource),
+                limit,
+            )
+            .to_result()?;
             Ok(())
         }
     }
@@ -588,7 +602,13 @@ impl CurrentContext
     /// # }
     /// ```
     pub fn set_shared_memory_config(cfg: SharedMemoryConfig) -> CudaResult<()> {
-        unsafe { cuda::cuCtxSetSharedMemConfig(transmute(cfg)).to_result() }
+        unsafe {
+            cuda::cuCtxSetSharedMemConfig(transmute::<
+                SharedMemoryConfig,
+                cust_raw::CUsharedconfig_enum,
+            >(cfg))
+            .to_result()
+        }
     }
 
     /// Set the given context as the current context for this thread.
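For reference, a host-side sketch of the two limit accessors touched in this hunk. The diff itself confirms the signatures of `CurrentContext::get_resource_limit` and `CurrentContext::set_resource_limit` and the `ResourceLimit` variants named in the doc comment; the `cust::quick_init()` setup call and the exact import paths are assumptions.

```rust
use cust::context::{CurrentContext, ResourceLimit};
use cust::error::CudaResult;

fn main() -> CudaResult<()> {
    // Create a context and make it current for this thread (assumed helper).
    let _ctx = cust::quick_init()?;

    // Request a 2 KiB stack per GPU thread; this is only a request, the driver
    // may round or clamp the value to meet hardware requirements.
    CurrentContext::set_resource_limit(ResourceLimit::StackSize, 2048)?;

    // Read back what the driver actually granted.
    let stack = CurrentContext::get_resource_limit(ResourceLimit::StackSize)?;
    println!("stack size per GPU thread: {} bytes", stack);
    Ok(())
}
```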

crates/cust/src/device.rs

Lines changed: 1 addition & 1 deletion
@@ -371,7 +371,7 @@ impl Device
             cuDeviceGetAttribute(
                 &mut val as *mut i32,
                 // This should be safe, as the repr and values of DeviceAttribute should match.
-                ::std::mem::transmute(attr),
+                ::std::mem::transmute::<DeviceAttribute, cust_raw::CUdevice_attribute_enum>(attr),
                 self.device,
             )
             .to_result()?;

crates/cust/src/error.rs

Lines changed: 6 additions & 3 deletions
@@ -96,9 +96,12 @@ impl fmt::Display for CudaError
             let value = other as u32;
             let mut ptr: *const c_char = ptr::null();
             unsafe {
-                cuda::cuGetErrorString(mem::transmute(value), &mut ptr as *mut *const c_char)
-                    .to_result()
-                    .map_err(|_| fmt::Error)?;
+                cuda::cuGetErrorString(
+                    mem::transmute::<u32, cust_raw::cudaError_enum>(value),
+                    &mut ptr as *mut *const c_char,
+                )
+                .to_result()
+                .map_err(|_| fmt::Error)?;
                 let cstr = CStr::from_ptr(ptr);
                 write!(f, "{:?}", cstr)
             }

crates/cust/src/function.rs

Lines changed: 20 additions & 6 deletions
@@ -55,7 +55,7 @@ impl From<(u32, u32, u32)> for GridSize
         GridSize::xyz(x, y, z)
     }
 }
-impl<'a> From<&'a GridSize> for GridSize {
+impl From<&GridSize> for GridSize {
     fn from(other: &GridSize) -> GridSize {
         *other
     }
@@ -135,7 +135,7 @@ impl From<(u32, u32, u32)> for BlockSize
         BlockSize::xyz(x, y, z)
     }
 }
-impl<'a> From<&'a BlockSize> for BlockSize {
+impl From<&BlockSize> for BlockSize {
     fn from(other: &BlockSize) -> BlockSize {
         *other
     }
@@ -209,7 +209,7 @@ pub struct Function<'a>
 unsafe impl Send for Function<'_> {}
 unsafe impl Sync for Function<'_> {}
 
-impl<'a> Function<'a> {
+impl Function<'_> {
     pub(crate) fn new(inner: CUfunction, _module: &Module) -> Function {
         Function {
             inner,
@@ -243,7 +243,9 @@ impl<'a> Function<'a> {
             cuda::cuFuncGetAttribute(
                 &mut val as *mut i32,
                 // This should be safe, as the repr and values of FunctionAttribute should match.
-                ::std::mem::transmute(attr),
+                ::std::mem::transmute::<FunctionAttribute, cust_raw::CUfunction_attribute_enum>(
+                    attr,
+                ),
                 self.inner,
             )
             .to_result()?;
@@ -280,7 +282,13 @@ impl<'a> Function<'a> {
     /// # }
     /// ```
     pub fn set_cache_config(&mut self, config: CacheConfig) -> CudaResult<()> {
-        unsafe { cuda::cuFuncSetCacheConfig(self.inner, transmute(config)).to_result() }
+        unsafe {
+            cuda::cuFuncSetCacheConfig(
+                self.inner,
+                transmute::<CacheConfig, cust_raw::CUfunc_cache_enum>(config),
+            )
+            .to_result()
+        }
     }
 
     /// Sets the preferred shared memory configuration for this function.
@@ -307,7 +315,13 @@ impl<'a> Function<'a> {
     /// # }
     /// ```
     pub fn set_shared_memory_config(&mut self, cfg: SharedMemoryConfig) -> CudaResult<()> {
-        unsafe { cuda::cuFuncSetSharedMemConfig(self.inner, transmute(cfg)).to_result() }
+        unsafe {
+            cuda::cuFuncSetSharedMemConfig(
+                self.inner,
+                transmute::<SharedMemoryConfig, cust_raw::CUsharedconfig_enum>(cfg),
+            )
+            .to_result()
+        }
     }
 
     /// Retrieves a raw handle to this function.
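Besides the transmute annotations, the changes in this file are lifetime cleanups: `impl<'a> From<&'a GridSize> for GridSize` becomes `impl From<&GridSize> for GridSize`, and `impl<'a> Function<'a>` becomes `impl Function<'_>`. A small sketch of both shapes with hypothetical types; clippy's `needless_lifetimes` lint is assumed to be the trigger.

```rust
// `Dim3` mirrors `GridSize`/`BlockSize` (a plain Copy struct) and `Borrowed<'a>`
// mirrors `Function<'a>` (a type that borrows something). Both names are
// hypothetical, not part of cust.
#[derive(Clone, Copy, Debug)]
struct Dim3 {
    x: u32,
    y: u32,
    z: u32,
}

// Before: `impl<'a> From<&'a Dim3> for Dim3` — the named lifetime adds nothing.
impl From<&Dim3> for Dim3 {
    fn from(other: &Dim3) -> Dim3 {
        *other
    }
}

struct Borrowed<'a> {
    text: &'a str,
}

// Before: `impl<'a> Borrowed<'a>` — when no method needs to name the lifetime,
// the anonymous `'_` placeholder is enough.
impl Borrowed<'_> {
    fn text_len(&self) -> usize {
        self.text.len()
    }
}

fn main() {
    let d = Dim3 { x: 1, y: 2, z: 3 };
    let b = Borrowed { text: "kernel" };
    println!("{:?} {}", Dim3::from(&d), b.text_len());
}
```

Both rewrites are purely syntactic; borrow checking and the generated code are unchanged.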
