
Commit 955e52c

nlutsenko authored and meta-codesync[bot] committed
clang-format | Format fbsource with clang-format 21.
Reviewed By: ChristianK275
Differential Revision: D85317706
fbshipit-source-id: b399c5c4b75252999442b7d7d2778e7a241b0025
1 parent b636955 commit 955e52c

15 files changed, +104 -144 lines

fx2ait/fx2ait/csrc/AITModel.cpp

Lines changed: 17 additions & 12 deletions
@@ -39,11 +39,15 @@ std::string AITModel::serialize() const {
     pick_output_names.push_back(picojson::value(entry));
   }
   var[OUTPUT_NAMES_STR] = picojson::value(pick_output_names);
-  var[FLOATING_POINT_INPUT_DTYPE_STR] = picojson::value(std::to_string(
-      static_cast<int16_t>(aitModelImpl_.floatingPointInputDtype().value())));
+  var[FLOATING_POINT_INPUT_DTYPE_STR] = picojson::value(
+      std::to_string(
+          static_cast<int16_t>(
+              aitModelImpl_.floatingPointInputDtype().value())));

-  var[FLOATING_POINT_OUTPUT_DTYPE_STR] = picojson::value(std::to_string(
-      static_cast<int16_t>(aitModelImpl_.floatingPointOutputDtype().value())));
+  var[FLOATING_POINT_OUTPUT_DTYPE_STR] = picojson::value(
+      std::to_string(
+          static_cast<int16_t>(
+              aitModelImpl_.floatingPointOutputDtype().value())));

   result = picojson::value(var).serialize();
   return result;

@@ -58,14 +62,15 @@ void AITModel::loadAsTorchClass() {

 static auto registerAITModel =
     torch::class_<AITModel>("ait", "AITModel")
-        .def(torch::init<
-            std::string,
-            std::vector<std::string>,
-            std::vector<std::string>,
-            std::optional<at::ScalarType>,
-            std::optional<at::ScalarType>,
-            int64_t,
-            bool>())
+        .def(
+            torch::init<
+                std::string,
+                std::vector<std::string>,
+                std::vector<std::string>,
+                std::optional<at::ScalarType>,
+                std::optional<at::ScalarType>,
+                int64_t,
+                bool>())
         .def("forward", &AITModel::forward)
         .def("profile", &AITModel::profile)
         .def("get_library_path", &AITModel::libraryPath)

python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h

Lines changed: 3 additions & 3 deletions
@@ -217,7 +217,7 @@ inline __device__ void device_block_1xN_(
   float p_prev_lse[Mma_tile_p::MMAS_M * 2];
   if (!(Is_first || mask_val % 2 == 1)) {
     gmem_softmax_lse.load(
-        reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
+        reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
   }

   // Commit the data for Q and V to shared memory.

@@ -348,7 +348,7 @@ inline __device__ void device_block_1xN_(
   // if (!Is_first) {
   if (!(Is_first || mask_val_next % 2 == 1)) {
     gmem_softmax_lse.load_next(
-        reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse),
+        reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse),
         block_row_idx_to_move);
   }
 }

@@ -526,7 +526,7 @@ inline __device__ void device_block_1xN_(
   if ((tidx % Gmem_tile_o::THREADS_PER_ROW == 0) &&
       (tidx / Gmem_tile_o::THREADS_PER_ROW < Gmem_tile_o::ROWS)) {
     gmem_softmax_lse.store_row(
-        reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]),
+        reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]),
         rows[jj]);
   }
 }
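
The only change in this file (and in fmha_fprop_kernel_1xN.h below) is clang-format 21 inserting a space before (&) in casts to a reference-to-array type. For readers unfamiliar with that cast, here is a standalone sketch of the same construct, using a hypothetical 4-element buffer in place of the kernel's MMA-tile arrays; it is illustrative only and mirrors the kernel's type-punning pattern:

#include <cstdint>
#include <cstdio>

int main() {
  float p[4] = {1.0f, 2.0f, 3.0f, 4.0f};

  // Reinterpret the float array as a reference to an array of 4 uint32_t,
  // so the raw bit patterns can be handed to an API that expects words.
  // Like the kernel, this relies on type punning via reinterpret_cast.
  auto& words = reinterpret_cast<std::uint32_t (&)[4]>(p);

  for (std::uint32_t w : words) {
    std::printf("0x%08x\n", static_cast<unsigned>(w));
  }
  return 0;
}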

python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h

Lines changed: 3 additions & 3 deletions
@@ -376,7 +376,7 @@ inline __device__ void device_1xN_(
   float p_prev_lse[Mma_tile_p::MMAS_M * 2];
   if (!Is_first) {
     gmem_softmax_lse.load(
-        reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
+        reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
   }

   // Commit the data for Q and V to shared memory.

@@ -489,7 +489,7 @@ inline __device__ void device_1xN_(
   if (l < steps - 1) {
     if (!Is_first) {
       gmem_softmax_lse.load_next(
-          reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
+          reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
     }
   }

@@ -674,7 +674,7 @@ inline __device__ void device_1xN_(
   if ((tidx % Gmem_tile_o::THREADS_PER_ROW == 0) &&
       (tidx / Gmem_tile_o::THREADS_PER_ROW < Gmem_tile_o::ROWS)) {
     gmem_softmax_lse.store_row(
-        reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]),
+        reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]),
         rows[jj]);
   }
 }

python/aitemplate/backend/cuda/elementwise/custom_math.cuh

Lines changed: 10 additions & 40 deletions
@@ -25,50 +25,20 @@

 #define NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)

-#define CUDA_FP16_ZERO \
-  __half { \
-    0x0u \
-  }
-#define CUDA_BF16_ZERO \
-  __nv_bfloat16 { \
-    0x0u \
-  }
-#define CUDA_FP162_ZERO \
-  __half2 { \
-    0x0u, 0x0u \
-  }
-#define CUDA_BF162_ZERO \
-  __nv_bfloat162 { \
-    0x0u, 0x0u \
-  }
-#define CUDA_FP16_ONE \
-  __half_raw { \
-    0x3c00u \
-  }
-#define CUDA_BF16_ONE \
-  __nv_bfloat16_raw { \
-    0x3f80u \
-  }
-#define CUDA_FP16_ONE_HALF \
-  __half_raw { \
-    0x3800u \
-  }
-#define CUDA_BF16_ONE_HALF \
-  __nv_bfloat16_raw { \
-    0x3f00u \
-  }
+#define CUDA_FP16_ZERO __half{0x0u}
+#define CUDA_BF16_ZERO __nv_bfloat16{0x0u}
+#define CUDA_FP162_ZERO __half2{0x0u, 0x0u}
+#define CUDA_BF162_ZERO __nv_bfloat162{0x0u, 0x0u}
+#define CUDA_FP16_ONE __half_raw{0x3c00u}
+#define CUDA_BF16_ONE __nv_bfloat16_raw{0x3f80u}
+#define CUDA_FP16_ONE_HALF __half_raw{0x3800u}
+#define CUDA_BF16_ONE_HALF __nv_bfloat16_raw{0x3f00u}

 // sqrt(2 / pi)
-#define CUDA_BF16_K1 \
-  __nv_bfloat16_raw { \
-    0x3f4c \
-  }
+#define CUDA_BF16_K1 __nv_bfloat16_raw{0x3f4c}

 // 2/(3*pi) - 1/6
-#define CUDA_BF16_K3 \
-  __nv_bfloat16_raw { \
-    0x3d3a \
-  }
+#define CUDA_BF16_K3 __nv_bfloat16_raw{0x3d3a}

 template <typename T>
 __device__ T sign_custom(const T a) {
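
The collapsed macros above build constants directly from IEEE bit patterns: 0x3c00 is 1.0 and 0x3800 is 0.5 in binary16, while 0x3f80 and 0x3f00 are 1.0 and 0.5 in bfloat16. A small host-side sketch that decodes a binary16 pattern makes the encoding explicit; the decode_half helper here is illustrative and not part of this header:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode an IEEE binary16 bit pattern (1 sign, 5 exponent, 10 mantissa bits).
// Handles normal numbers only, which is enough to check these constants.
float decode_half(std::uint16_t bits) {
  int sign = (bits >> 15) & 0x1;
  int exponent = (bits >> 10) & 0x1f;
  int mantissa = bits & 0x3ff;
  float value = (1.0f + mantissa / 1024.0f) * std::ldexp(1.0f, exponent - 15);
  return sign ? -value : value;
}

int main() {
  std::printf("0x3c00 -> %g\n", decode_half(0x3c00)); // 1.0 (CUDA_FP16_ONE)
  std::printf("0x3800 -> %g\n", decode_half(0x3800)); // 0.5 (CUDA_FP16_ONE_HALF)
  return 0;
}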

python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh

Lines changed: 4 additions & 16 deletions
@@ -68,22 +68,10 @@ __device__ bfloat16 fast_tanh(bfloat16 x) {
 #endif
 }

-#define CUDA_FP16_ONE_HALF \
-  __half_raw { \
-    0x3800u \
-  }
-#define CUDA_FP16_ONE \
-  __half_raw { \
-    0x3c00u \
-  }
-#define CUDA_BF16_ONE_HALF \
-  __nv_bfloat16_raw { \
-    0x3f00u \
-  }
-#define CUDA_BF16_ONE \
-  __nv_bfloat16_raw { \
-    0x3f80u \
-  }
+#define CUDA_FP16_ONE_HALF __half_raw{0x3800u}
+#define CUDA_FP16_ONE __half_raw{0x3c00u}
+#define CUDA_BF16_ONE_HALF __nv_bfloat16_raw{0x3f00u}
+#define CUDA_BF16_ONE __nv_bfloat16_raw{0x3f80u}

 __device__ float sigmoid(const float a) {
   return (cutlass::fast_tanh(a * 0.5f) + 1.0f) * 0.5f;

python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh

Lines changed: 1 addition & 2 deletions
@@ -50,8 +50,7 @@ struct MaxOp {
 };

 template <
-    template <typename>
-    class ReductionOp,
+    template <typename> class ReductionOp,
     typename T,
     int thread_group_width = kWarpSize>
 __inline__ __device__ T WarpAllReduce(T val) {
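
template <typename> class ReductionOp, now kept on one line, is a template template parameter: WarpAllReduce is parameterized over a family of reduction functors such as MaxOp. A simplified CPU-only sketch of the same idea follows, with hypothetical SumOp/MaxOp functors and a sequential loop standing in for the warp shuffle:

#include <algorithm>
#include <cstdio>

template <typename T>
struct SumOp {
  T operator()(const T& a, const T& b) const { return a + b; }
};

template <typename T>
struct MaxOp {
  T operator()(const T& a, const T& b) const { return std::max(a, b); }
};

// The reduction *template* itself is passed as a parameter, mirroring
// WarpAllReduce<ReductionOp, T, width>(val) in layer_norm.cuh.
template <template <typename> class ReductionOp, typename T, int N>
T Reduce(const T (&vals)[N]) {
  ReductionOp<T> op;
  T acc = vals[0];
  for (int i = 1; i < N; ++i) {
    acc = op(acc, vals[i]);
  }
  return acc;
}

int main() {
  float v[4] = {1.f, 4.f, 2.f, 3.f};
  std::printf("sum = %g, max = %g\n", Reduce<SumOp>(v), Reduce<MaxOp>(v));
  return 0;
}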

python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh

Lines changed: 1 addition & 2 deletions
@@ -50,8 +50,7 @@ struct MaxOp {
 };

 template <
-    template <typename>
-    class ReductionOp,
+    template <typename> class ReductionOp,
     typename T,
     int thread_group_width = kWarpSize>
 __inline__ __device__ T WarpAllReduce(T val) {

python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_welford.cuh

Lines changed: 4 additions & 16 deletions
@@ -60,22 +60,10 @@ __device__ bfloat16 fast_tanh(bfloat16 x) {
 #endif
 }

-#define CUDA_FP16_ONE_HALF \
-  __half_raw { \
-    0x3800u \
-  }
-#define CUDA_FP16_ONE \
-  __half_raw { \
-    0x3c00u \
-  }
-#define CUDA_BF16_ONE_HALF \
-  __nv_bfloat16_raw { \
-    0x3f00u \
-  }
-#define CUDA_BF16_ONE \
-  __nv_bfloat16_raw { \
-    0x3f80u \
-  }
+#define CUDA_FP16_ONE_HALF __half_raw{0x3800u}
+#define CUDA_FP16_ONE __half_raw{0x3c00u}
+#define CUDA_BF16_ONE_HALF __nv_bfloat16_raw{0x3f00u}
+#define CUDA_BF16_ONE __nv_bfloat16_raw{0x3f80u}

 __device__ float sigmoid(const float a) {
   return (cutlass::fast_tanh(a * 0.5f) + 1.0f) * 0.5f;

python/aitemplate/backend/cuda/softmax/softmax.cuh

Lines changed: 3 additions & 2 deletions
@@ -459,8 +459,9 @@ __global__ void softmax_block_smem(

   const int m_idx = blockIdx.x;
   const int tid = threadIdx.x;
-  extern __shared__ __align__(sizeof(
-      float)) unsigned char shared_buf[]; // size_t smem = n*sizeof(float)
+  extern __shared__ __align__(
+      sizeof(
+          float)) unsigned char shared_buf[]; // size_t smem = n*sizeof(float)
   auto* buf = reinterpret_cast<float*>(shared_buf);
   const int num_packs = (n + pack_size - 1) / pack_size;
   for (int64_t row = m_idx; row < m; row += gridDim.x) {
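
The reflowed declaration is the usual CUDA idiom for dynamically sized shared memory: one extern, aligned byte array whose size is supplied at launch time and which the kernel then reinterprets as the element type. A minimal sketch under that assumption, using a hypothetical reverse_rows kernel rather than the softmax code itself:

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical kernel: each block reverses one row of n floats, staging the
// row in dynamically sized shared memory declared the same way as in
// softmax_block_smem.
__global__ void reverse_rows(const float* in, float* out, int n) {
  extern __shared__ __align__(sizeof(float)) unsigned char shared_buf[];
  auto* buf = reinterpret_cast<float*>(shared_buf);

  const float* row_in = in + blockIdx.x * n;
  float* row_out = out + blockIdx.x * n;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    buf[i] = row_in[i];
  }
  __syncthreads();
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    row_out[i] = buf[n - 1 - i];
  }
}

int main() {
  const int m = 2, n = 256;
  float *in, *out;
  cudaMallocManaged(&in, m * n * sizeof(float));
  cudaMallocManaged(&out, m * n * sizeof(float));
  for (int i = 0; i < m * n; ++i) in[i] = static_cast<float>(i);

  // The third launch parameter is the dynamic shared-memory size in bytes,
  // matching the "size_t smem = n*sizeof(float)" comment in the diff.
  reverse_rows<<<m, 128, n * sizeof(float)>>>(in, out, n);
  cudaDeviceSynchronize();

  std::printf("out[0] = %g (expected %g)\n", out[0], in[n - 1]);
  cudaFree(in);
  cudaFree(out);
  return 0;
}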

static/csrc/windll.cpp

Lines changed: 4 additions & 3 deletions
@@ -55,9 +55,10 @@ void GetConstantsBin(void** address, size_t* size) {
   if (!hResourceData) {
     // could not load a resource
     auto errorCode = GetLastError();
-    TRIGGER_ERROR(std::string(
-        "LoadResource() call in GetConstantsBin() has failed with error " +
-        std::to_string(errorCode)));
+    TRIGGER_ERROR(
+        std::string(
+            "LoadResource() call in GetConstantsBin() has failed with error " +
+            std::to_string(errorCode)));
   }

   DWORD resourceSize = SizeofResource(SavedDllHandle, hResource);
