Commit bd80dc3

Fix some compilation errors, warnings, and clippy issues

1 parent: 8fc34d5

32 files changed: +391 -287 lines

crates/cuda_std/src/shared.rs

Lines changed: 29 additions & 19 deletions
@@ -2,31 +2,41 @@
 
 use crate::gpu_only;
 
-/// Statically allocates a buffer large enough for `len` elements of `array_type`, yielding
-/// a `*mut array_type` that points to uninitialized shared memory. `len` must be a constant expression.
+/// Statically allocates a buffer large enough for `len` elements of `array_type`,
+/// yielding a `*mut array_type` that points to uninitialized shared memory. `len` must
+/// be a constant expression.
 ///
-/// Note that this allocates the memory __statically__, it expands to a static in the `shared` address space.
-/// Therefore, calling this macro multiple times in a loop will always yield the same data. However, separate
-/// invocations of the macro will yield different buffers.
+/// Note that this allocates the memory __statically__, it expands to a static in the
+/// `shared` address space. Therefore, calling this macro multiple times in a loop will
+/// always yield the same data. However, separate invocations of the macro will yield
+/// different buffers.
 ///
-/// The data is uninitialized by default, therefore, you must be careful to not read the data before it is written to.
-/// The semantics of what "uninitialized" actually means on the GPU (i.e. if it yields unknown data or if it is UB to read it whatsoever)
-/// are not well known, so even if the type is valid for any backing memory, make sure to not read uninitialized data.
+/// The data is uninitialized by default, therefore, you must be careful to not read the
+/// data before it is written to. The semantics of what "uninitialized" actually means
+/// on the GPU (i.e. if it yields unknown data or if it is UB to read it whatsoever) are
+/// not well known, so even if the type is valid for any backing memory, make sure to
+/// not read uninitialized data.
 ///
 /// # Safety
 ///
-/// Shared memory usage is fundamentally extremely unsafe and impossible to statically prove, therefore
-/// the burden of correctness is on the user. Some of the things you must ensure in your usage of
-/// shared memory are:
-/// - Shared memory is only shared across __thread blocks__, not the entire device, therefore it is
-/// unsound to try and rely on sharing data across more than one block.
-/// - You must write to the shared buffer before reading from it as the data is uninitialized by default.
-/// - [`thread::sync_threads`](crate::thread::sync_threads) must be called before relying on the results of other
-/// threads, this ensures every thread has reached that point before going on. For example, reading another thread's
-/// data after writing to the buffer.
-/// - No access may be out of bounds, this usually means making sure the amount of threads and their dimensions are correct.
+/// Shared memory usage is fundamentally extremely unsafe and impossible to statically
+/// prove, therefore the burden of correctness is on the user. Some of the things you
+/// must ensure in your usage of shared memory are:
 ///
-/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of shared memory are right.
+/// - Shared memory is only shared across __thread blocks__, not the entire device,
+/// therefore it is unsound to try and rely on sharing data across more than one
+/// block.
+/// - You must write to the shared buffer before reading from it as the data is
+/// uninitialized by default.
+/// - [`thread::sync_threads`](crate::thread::sync_threads) must be called before
+/// relying on the results of other threads, this ensures every thread has reached
+/// that point before going on. For example, reading another thread's data after
+/// writing to the buffer.
+/// - No access may be out of bounds, this usually means making sure the amount of
+/// threads and their dimensions are correct.
+///
+/// It is suggested to run your executable in `cuda-memcheck` to make sure usages of
+/// shared memory are right.
 ///
 /// # Examples
 ///
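The doc comment above spells out the ordering rules in the abstract; the device-side sketch below shows how they play out in practice. This is a sketch only: the macro name (`shared_array!`), the `#[kernel]` attribute, and the `thread::thread_idx_x` helper are assumed to be cuda_std's public API, since the excerpt shows only the macro's documentation.

```rust
// Sketch: reverse a block-sized slice through shared memory.
// Assumes cuda_std exposes `shared_array!`, `#[kernel]`, and the `thread`
// module as used below; only the macro's doc comment appears in the diff above.
use cuda_std::{shared_array, thread};

#[cuda_std::kernel]
pub unsafe fn reverse_block(data: *mut u32, n: usize) {
    // One statically allocated, uninitialized buffer per macro invocation.
    let buf: *mut u32 = shared_array![u32; 256];
    let tid = thread::thread_idx_x() as usize;
    if tid < n {
        // Write before any thread reads: the buffer starts uninitialized.
        *buf.add(tid) = *data.add(tid);
    }
    // Every thread in the block must reach this point before the reads below.
    thread::sync_threads();
    if tid < n {
        *data.add(tid) = *buf.add(n - 1 - tid);
    }
}
```

The single `sync_threads()` call is what separates the write phase from the read phase, which is exactly the ordering requirement the safety list above describes.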

crates/cuda_std/src/warp.rs

Lines changed: 126 additions & 85 deletions
Large diffs are not rendered by default.

crates/cust/src/context/legacy.rs

Lines changed: 43 additions & 24 deletions
@@ -601,7 +601,11 @@ impl CurrentContext
     pub fn get_resource_limit(resource: ResourceLimit) -> CudaResult<usize> {
         unsafe {
             let mut limit: usize = 0;
-            cuda::cuCtxGetLimit(&mut limit as *mut usize, transmute(resource)).to_result()?;
+            cuda::cuCtxGetLimit(
+                &mut limit as *mut usize,
+                transmute::<ResourceLimit, cust_raw::CUlimit_enum>(resource),
+            )
+            .to_result()?;
             Ok(limit)
         }
     }
@@ -696,33 +700,38 @@ impl CurrentContext
     /// # }
     /// ```
     pub fn set_cache_config(cfg: CacheConfig) -> CudaResult<()> {
-        unsafe { cuda::cuCtxSetCacheConfig(transmute(cfg)).to_result() }
+        unsafe {
+            cuda::cuCtxSetCacheConfig(transmute::<CacheConfig, cust_raw::CUfunc_cache_enum>(cfg))
+                .to_result()
+        }
     }
 
     /// Sets a requested resource limit for the current context.
     ///
-    /// Note that this is only a request; the driver is free to modify the requested value to meet
-    /// hardware requirements. Each limit has some specific restrictions.
+    /// Note that this is only a request; the driver is free to modify the requested
+    /// value to meet hardware requirements. Each limit has some specific restrictions.
     ///
     /// * `StackSize`: Controls the stack size in bytes for each GPU thread
-    /// * `PrintfFifoSize`: Controls the size in bytes of the FIFO used by the `printf()` device
-    /// system call. This cannot be changed after a kernel has been launched which uses the
-    /// `printf()` function.
-    /// * `MallocHeapSize`: Controls the size in bytes of the heap used by the `malloc()` and `free()`
-    /// device system calls. This cannot be changed aftr a kernel has been launched which uses the
-    /// `malloc()` and `free()` system calls.
-    /// * `DeviceRuntimeSyncDepth`: Controls the maximum nesting depth of a grid at which a thread
-    /// can safely call `cudaDeviceSynchronize()`. This cannot be changed after a kernel has been
-    /// launched which uses the device runtime. When setting this limit, keep in mind that
-    /// additional levels of sync depth require the driver to reserve large amounts of device
-    /// memory which can no longer be used for device allocations.
-    /// * `DeviceRuntimePendingLaunchCount`: Controls the maximum number of outstanding device
-    /// runtime launches that can be made from the current context. A grid is outstanding from
-    /// the point of the launch up until the grid is known to have completed. Keep in mind that
-    /// increasing this limit will require the driver to reserve larger amounts of device memory
-    /// which can no longer be used for device allocations.
-    /// * `MaxL2FetchGranularity`: Controls the L2 fetch granularity. This is purely a performance
-    /// hint and it can be ignored or clamped depending on the platform.
+    /// * `PrintfFifoSize`: Controls the size in bytes of the FIFO used by the
+    /// `printf()` device system call. This cannot be changed after a kernel has been
+    /// launched which uses the `printf()` function.
+    /// * `MallocHeapSize`: Controls the size in bytes of the heap used by the
+    /// `malloc()` and `free()` device system calls. This cannot be changed aftr a
+    /// kernel has been launched which uses the `malloc()` and `free()` system calls.
+    /// * `DeviceRuntimeSyncDepth`: Controls the maximum nesting depth of a grid at
+    /// which a thread can safely call `cudaDeviceSynchronize()`. This cannot be
+    /// changed after a kernel has been launched which uses the device runtime. When
+    /// setting this limit, keep in mind that additional levels of sync depth require
+    /// the driver to reserve large amounts of device memory which can no longer be
+    /// used for device allocations.
+    /// * `DeviceRuntimePendingLaunchCount`: Controls the maximum number of outstanding
+    /// device runtime launches that can be made from the current context. A grid is
+    /// outstanding from the point of the launch up until the grid is known to have
+    /// completed. Keep in mind that increasing this limit will require the driver to
+    /// reserve larger amounts of device memory which can no longer be used for device
+    /// allocations.
+    /// * `MaxL2FetchGranularity`: Controls the L2 fetch granularity. This is purely a
+    /// performance hint and it can be ignored or clamped depending on the platform.
     ///
     /// # Example
     ///
@@ -741,7 +750,11 @@ impl CurrentContext
     /// ```
     pub fn set_resource_limit(resource: ResourceLimit, limit: usize) -> CudaResult<()> {
         unsafe {
-            cuda::cuCtxSetLimit(transmute(resource), limit).to_result()?;
+            cuda::cuCtxSetLimit(
+                transmute::<ResourceLimit, cust_raw::CUlimit_enum>(resource),
+                limit,
+            )
+            .to_result()?;
             Ok(())
         }
     }
@@ -767,7 +780,13 @@ impl CurrentContext
     /// # }
     /// ```
     pub fn set_shared_memory_config(cfg: SharedMemoryConfig) -> CudaResult<()> {
-        unsafe { cuda::cuCtxSetSharedMemConfig(transmute(cfg)).to_result() }
+        unsafe {
+            cuda::cuCtxSetSharedMemConfig(transmute::<
+                SharedMemoryConfig,
+                cust_raw::CUsharedconfig_enum,
+            >(cfg))
+            .to_result()
+        }
     }
 
     /// Returns a non-owning handle to the current context.
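The recurring change in this file (and in the files below) is mechanical: every bare `transmute(x)` gains explicit source and target types, `transmute::<Src, Dst>(x)`. This looks like a fix for clippy's `missing_transmute_annotations` lint, though that is an assumption; the commit message only says "clippy issues". A minimal, self-contained sketch of the before/after shape, using hypothetical `Flag`/`RawFlag` enums rather than the cust types:

```rust
use std::mem;

// Hypothetical stand-ins for the cust/cust_raw enum pairs being converted.
#[repr(u32)]
#[derive(Clone, Copy, Debug)]
enum Flag {
    A = 0,
    B = 1,
}

// Allow dead_code because the variants are only ever produced via transmute.
#[allow(dead_code)]
#[repr(u32)]
#[derive(Debug)]
enum RawFlag {
    RawA = 0,
    RawB = 1,
}

fn to_raw(flag: Flag) -> RawFlag {
    // Before: `unsafe { mem::transmute(flag) }` — the target type is inferred
    // from the return type, which makes it easy to change silently.
    // After: both type parameters are spelled out at the call site.
    unsafe { mem::transmute::<Flag, RawFlag>(flag) }
}

fn main() {
    for f in [Flag::A, Flag::B] {
        println!("{:?} -> {:?}", f, to_raw(f));
    }
}
```

The generated code is identical; the annotation only pins down what the conversion is between, so a later change to the surrounding types turns into a compile error instead of a silent reinterpretation.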

crates/cust/src/context/mod.rs

Lines changed: 46 additions & 26 deletions
@@ -422,7 +422,11 @@ impl CurrentContext
     pub fn get_resource_limit(resource: ResourceLimit) -> CudaResult<usize> {
         unsafe {
             let mut limit: usize = 0;
-            cuda::cuCtxGetLimit(&mut limit as *mut usize, transmute(resource)).to_result()?;
+            cuda::cuCtxGetLimit(
+                &mut limit as *mut usize,
+                transmute::<ResourceLimit, cust_raw::CUlimit_enum>(resource),
+            )
+            .to_result()?;
             Ok(limit)
         }
     }
@@ -517,33 +521,39 @@ impl CurrentContext
     /// # }
     /// ```
     pub fn set_cache_config(cfg: CacheConfig) -> CudaResult<()> {
-        unsafe { cuda::cuCtxSetCacheConfig(transmute(cfg)).to_result() }
+        unsafe {
+            cuda::cuCtxSetCacheConfig(transmute::<CacheConfig, cust_raw::CUfunc_cache_enum>(cfg))
+                .to_result()
+        }
     }
 
     /// Sets a requested resource limit for the current context.
     ///
-    /// Note that this is only a request; the driver is free to modify the requested value to meet
-    /// hardware requirements. Each limit has some specific restrictions.
-    ///
-    /// * `StackSize`: Controls the stack size in bytes for each GPU thread
-    /// * `PrintfFifoSize`: Controls the size in bytes of the FIFO used by the `printf()` device
-    /// system call. This cannot be changed after a kernel has been launched which uses the
-    /// `printf()` function.
-    /// * `MallocHeapSize`: Controls the size in bytes of the heap used by the `malloc()` and `free()`
-    /// device system calls. This cannot be changed aftr a kernel has been launched which uses the
-    /// `malloc()` and `free()` system calls.
-    /// * `DeviceRuntimeSyncDepth`: Controls the maximum nesting depth of a grid at which a thread
-    /// can safely call `cudaDeviceSynchronize()`. This cannot be changed after a kernel has been
-    /// launched which uses the device runtime. When setting this limit, keep in mind that
-    /// additional levels of sync depth require the driver to reserve large amounts of device
-    /// memory which can no longer be used for device allocations.
-    /// * `DeviceRuntimePendingLaunchCount`: Controls the maximum number of outstanding device
-    /// runtime launches that can be made from the current context. A grid is outstanding from
-    /// the point of the launch up until the grid is known to have completed. Keep in mind that
-    /// increasing this limit will require the driver to reserve larger amounts of device memory
-    /// which can no longer be used for device allocations.
-    /// * `MaxL2FetchGranularity`: Controls the L2 fetch granularity. This is purely a performance
-    /// hint and it can be ignored or clamped depending on the platform.
+    /// Note that this is only a request; the driver is free to modify the requested
+    /// value to meet hardware requirements. Each limit has some specific restrictions.
+    ///
+    /// * `StackSize`: Controls the stack size in bytes for each GPU thread
+    /// * `PrintfFifoSize`: Controls the size in bytes of the FIFO used by the
+    /// `printf()` device system call. This cannot be changed after a kernel has
+    /// been launched which uses the `printf()` function.
+    /// * `MallocHeapSize`: Controls the size in bytes of the heap used by the
+    /// `malloc()` and `free()` device system calls. This cannot be changed aftr a
+    /// kernel has been launched which uses the `malloc()` and `free()` system
+    /// calls.
+    /// * `DeviceRuntimeSyncDepth`: Controls the maximum nesting depth of a grid at
+    /// which a thread can safely call `cudaDeviceSynchronize()`. This cannot be
+    /// changed after a kernel has been launched which uses the device runtime. When
+    /// setting this limit, keep in mind that additional levels of sync depth
+    /// require the driver to reserve large amounts of device memory which can no
+    /// longer be used for device allocations.
+    /// * `DeviceRuntimePendingLaunchCount`: Controls the maximum number of
+    /// outstanding device runtime launches that can be made from the current
+    /// context. A grid is outstanding from the point of the launch up until the
+    /// grid is known to have completed. Keep in mind that increasing this limit
+    /// will require the driver to reserve larger amounts of device memory which can
+    /// no longer be used for device allocations.
+    /// * `MaxL2FetchGranularity`: Controls the L2 fetch granularity. This is purely a
+    /// performance hint and it can be ignored or clamped depending on the platform.
     ///
     /// # Example
     ///
@@ -562,7 +572,11 @@ impl CurrentContext
     /// ```
     pub fn set_resource_limit(resource: ResourceLimit, limit: usize) -> CudaResult<()> {
         unsafe {
-            cuda::cuCtxSetLimit(transmute(resource), limit).to_result()?;
+            cuda::cuCtxSetLimit(
+                transmute::<ResourceLimit, cust_raw::CUlimit_enum>(resource),
+                limit,
+            )
+            .to_result()?;
             Ok(())
         }
     }
@@ -588,7 +602,13 @@ impl CurrentContext
     /// # }
     /// ```
     pub fn set_shared_memory_config(cfg: SharedMemoryConfig) -> CudaResult<()> {
-        unsafe { cuda::cuCtxSetSharedMemConfig(transmute(cfg)).to_result() }
+        unsafe {
+            cuda::cuCtxSetSharedMemConfig(transmute::<
+                SharedMemoryConfig,
+                cust_raw::CUsharedconfig_enum,
+            >(cfg))
+            .to_result()
+        }
     }
 
     /// Set the given context as the current context for this thread.
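For reference, a host-side sketch of the two limit accessors touched in this hunk. The diff itself confirms the signatures of `CurrentContext::get_resource_limit` and `CurrentContext::set_resource_limit` and the `ResourceLimit` variants named in the doc comment; the `cust::quick_init()` setup call and the exact import paths are assumptions.

```rust
use cust::context::{CurrentContext, ResourceLimit};
use cust::error::CudaResult;

fn main() -> CudaResult<()> {
    // Create a context and make it current for this thread (assumed helper).
    let _ctx = cust::quick_init()?;

    // Request a 2 KiB stack per GPU thread; this is only a request, the driver
    // may round or clamp the value to meet hardware requirements.
    CurrentContext::set_resource_limit(ResourceLimit::StackSize, 2048)?;

    // Read back what the driver actually granted.
    let stack = CurrentContext::get_resource_limit(ResourceLimit::StackSize)?;
    println!("stack size per GPU thread: {} bytes", stack);
    Ok(())
}
```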

crates/cust/src/device.rs

Lines changed: 1 addition & 1 deletion
@@ -371,7 +371,7 @@ impl Device
             cuDeviceGetAttribute(
                 &mut val as *mut i32,
                 // This should be safe, as the repr and values of DeviceAttribute should match.
-                ::std::mem::transmute(attr),
+                ::std::mem::transmute::<DeviceAttribute, cust_raw::CUdevice_attribute_enum>(attr),
                 self.device,
             )
             .to_result()?;

crates/cust/src/error.rs

Lines changed: 6 additions & 3 deletions
@@ -96,9 +96,12 @@ impl fmt::Display for CudaError
             let value = other as u32;
             let mut ptr: *const c_char = ptr::null();
             unsafe {
-                cuda::cuGetErrorString(mem::transmute(value), &mut ptr as *mut *const c_char)
-                    .to_result()
-                    .map_err(|_| fmt::Error)?;
+                cuda::cuGetErrorString(
+                    mem::transmute::<u32, cust_raw::cudaError_enum>(value),
+                    &mut ptr as *mut *const c_char,
+                )
+                .to_result()
+                .map_err(|_| fmt::Error)?;
                 let cstr = CStr::from_ptr(ptr);
                 write!(f, "{:?}", cstr)
             }

crates/cust/src/function.rs

Lines changed: 20 additions & 6 deletions
@@ -55,7 +55,7 @@ impl From<(u32, u32, u32)> for GridSize
         GridSize::xyz(x, y, z)
     }
 }
-impl<'a> From<&'a GridSize> for GridSize {
+impl From<&GridSize> for GridSize {
     fn from(other: &GridSize) -> GridSize {
         *other
     }
@@ -135,7 +135,7 @@ impl From<(u32, u32, u32)> for BlockSize
         BlockSize::xyz(x, y, z)
     }
 }
-impl<'a> From<&'a BlockSize> for BlockSize {
+impl From<&BlockSize> for BlockSize {
     fn from(other: &BlockSize) -> BlockSize {
         *other
     }
@@ -209,7 +209,7 @@ pub struct Function<'a>
 unsafe impl Send for Function<'_> {}
 unsafe impl Sync for Function<'_> {}
 
-impl<'a> Function<'a> {
+impl Function<'_> {
     pub(crate) fn new(inner: CUfunction, _module: &Module) -> Function {
         Function {
             inner,
@@ -243,7 +243,9 @@ impl<'a> Function<'a> {
             cuda::cuFuncGetAttribute(
                 &mut val as *mut i32,
                 // This should be safe, as the repr and values of FunctionAttribute should match.
-                ::std::mem::transmute(attr),
+                ::std::mem::transmute::<FunctionAttribute, cust_raw::CUfunction_attribute_enum>(
+                    attr,
+                ),
                 self.inner,
             )
             .to_result()?;
@@ -280,7 +282,13 @@ impl<'a> Function<'a> {
     /// # }
     /// ```
     pub fn set_cache_config(&mut self, config: CacheConfig) -> CudaResult<()> {
-        unsafe { cuda::cuFuncSetCacheConfig(self.inner, transmute(config)).to_result() }
+        unsafe {
+            cuda::cuFuncSetCacheConfig(
+                self.inner,
+                transmute::<CacheConfig, cust_raw::CUfunc_cache_enum>(config),
+            )
+            .to_result()
+        }
     }
 
     /// Sets the preferred shared memory configuration for this function.
@@ -307,7 +315,13 @@ impl<'a> Function<'a> {
     /// # }
     /// ```
     pub fn set_shared_memory_config(&mut self, cfg: SharedMemoryConfig) -> CudaResult<()> {
-        unsafe { cuda::cuFuncSetSharedMemConfig(self.inner, transmute(cfg)).to_result() }
+        unsafe {
+            cuda::cuFuncSetSharedMemConfig(
+                self.inner,
+                transmute::<SharedMemoryConfig, cust_raw::CUsharedconfig_enum>(cfg),
+            )
+            .to_result()
+        }
     }
 
     /// Retrieves a raw handle to this function.
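Besides the transmute annotations, the changes in this file are lifetime cleanups: `impl<'a> From<&'a GridSize> for GridSize` becomes `impl From<&GridSize> for GridSize`, and `impl<'a> Function<'a>` becomes `impl Function<'_>`. A small sketch of both shapes with hypothetical types; clippy's `needless_lifetimes` lint is assumed to be the trigger.

```rust
// `Dim3` mirrors `GridSize`/`BlockSize` (a plain Copy struct) and `Borrowed<'a>`
// mirrors `Function<'a>` (a type that borrows something). Both names are
// hypothetical, not part of cust.
#[derive(Clone, Copy, Debug)]
struct Dim3 {
    x: u32,
    y: u32,
    z: u32,
}

// Before: `impl<'a> From<&'a Dim3> for Dim3` — the named lifetime adds nothing.
impl From<&Dim3> for Dim3 {
    fn from(other: &Dim3) -> Dim3 {
        *other
    }
}

struct Borrowed<'a> {
    text: &'a str,
}

// Before: `impl<'a> Borrowed<'a>` — when no method needs to name the lifetime,
// the anonymous `'_` placeholder is enough.
impl Borrowed<'_> {
    fn text_len(&self) -> usize {
        self.text.len()
    }
}

fn main() {
    let d = Dim3 { x: 1, y: 2, z: 3 };
    let b = Borrowed { text: "kernel" };
    println!("{:?} {}", Dim3::from(&d), b.text_len());
}
```

Both rewrites are purely syntactic; borrow checking and the generated code are unchanged.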
