
Commit 955e52c

nlutsenko authored and meta-codesync[bot] committed
clang-format | Format fbsource with clang-format 21.
Reviewed By: ChristianK275
Differential Revision: D85317706
fbshipit-source-id: b399c5c4b75252999442b7d7d2778e7a241b0025
1 parent b636955 commit 955e52c

15 files changed, +104 -144 lines

fx2ait/fx2ait/csrc/AITModel.cpp

Lines changed: 17 additions & 12 deletions
@@ -39,11 +39,15 @@ std::string AITModel::serialize() const {
     pick_output_names.push_back(picojson::value(entry));
   }
   var[OUTPUT_NAMES_STR] = picojson::value(pick_output_names);
-  var[FLOATING_POINT_INPUT_DTYPE_STR] = picojson::value(std::to_string(
-      static_cast<int16_t>(aitModelImpl_.floatingPointInputDtype().value())));
+  var[FLOATING_POINT_INPUT_DTYPE_STR] = picojson::value(
+      std::to_string(
+          static_cast<int16_t>(
+              aitModelImpl_.floatingPointInputDtype().value())));

-  var[FLOATING_POINT_OUTPUT_DTYPE_STR] = picojson::value(std::to_string(
-      static_cast<int16_t>(aitModelImpl_.floatingPointOutputDtype().value())));
+  var[FLOATING_POINT_OUTPUT_DTYPE_STR] = picojson::value(
+      std::to_string(
+          static_cast<int16_t>(
+              aitModelImpl_.floatingPointOutputDtype().value())));

   result = picojson::value(var).serialize();
   return result;

@@ -58,14 +62,15 @@ void AITModel::loadAsTorchClass() {

 static auto registerAITModel =
     torch::class_<AITModel>("ait", "AITModel")
-        .def(torch::init<
-            std::string,
-            std::vector<std::string>,
-            std::vector<std::string>,
-            std::optional<at::ScalarType>,
-            std::optional<at::ScalarType>,
-            int64_t,
-            bool>())
+        .def(
+            torch::init<
+                std::string,
+                std::vector<std::string>,
+                std::vector<std::string>,
+                std::optional<at::ScalarType>,
+                std::optional<at::ScalarType>,
+                int64_t,
+                bool>())
         .def("forward", &AITModel::forward)
         .def("profile", &AITModel::profile)
         .def("get_library_path", &AITModel::libraryPath)

python/aitemplate/backend/cuda/attention/src/fmha_block_fprop_kernel_1xN.h

Lines changed: 3 additions & 3 deletions
@@ -217,7 +217,7 @@ inline __device__ void device_block_1xN_(
   float p_prev_lse[Mma_tile_p::MMAS_M * 2];
   if (!(Is_first || mask_val % 2 == 1)) {
     gmem_softmax_lse.load(
-        reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
+        reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
   }

   // Commit the data for Q and V to shared memory.

@@ -348,7 +348,7 @@ inline __device__ void device_block_1xN_(
   // if (!Is_first) {
   if (!(Is_first || mask_val_next % 2 == 1)) {
     gmem_softmax_lse.load_next(
-        reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse),
+        reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse),
         block_row_idx_to_move);
   }
 }

@@ -526,7 +526,7 @@ inline __device__ void device_block_1xN_(
   if ((tidx % Gmem_tile_o::THREADS_PER_ROW == 0) &&
       (tidx / Gmem_tile_o::THREADS_PER_ROW < Gmem_tile_o::ROWS)) {
     gmem_softmax_lse.store_row(
-        reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]),
+        reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]),
         rows[jj]);
   }
 }
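
The only change in this file (and in fmha_fprop_kernel_1xN.h below) is clang-format 21 inserting a space before (&) in casts to a reference-to-array type. For readers unfamiliar with that cast, here is a standalone sketch of the same construct, using a hypothetical 4-element buffer in place of the kernel's MMA-tile arrays; it is illustrative only and mirrors the kernel's type-punning pattern:

#include <cstdint>
#include <cstdio>

int main() {
  float p[4] = {1.0f, 2.0f, 3.0f, 4.0f};

  // Reinterpret the float array as a reference to an array of 4 uint32_t,
  // so the raw bit patterns can be handed to an API that expects words.
  // Like the kernel, this relies on type punning via reinterpret_cast.
  auto& words = reinterpret_cast<std::uint32_t (&)[4]>(p);

  for (std::uint32_t w : words) {
    std::printf("0x%08x\n", static_cast<unsigned>(w));
  }
  return 0;
}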

python/aitemplate/backend/cuda/attention/src/fmha_fprop_kernel_1xN.h

Lines changed: 3 additions & 3 deletions
@@ -376,7 +376,7 @@ inline __device__ void device_1xN_(
   float p_prev_lse[Mma_tile_p::MMAS_M * 2];
   if (!Is_first) {
     gmem_softmax_lse.load(
-        reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
+        reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
   }

   // Commit the data for Q and V to shared memory.

@@ -489,7 +489,7 @@ inline __device__ void device_1xN_(
   if (l < steps - 1) {
     if (!Is_first) {
       gmem_softmax_lse.load_next(
-          reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
+          reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M * 2]>(p_prev_lse));
     }
   }

@@ -674,7 +674,7 @@ inline __device__ void device_1xN_(
   if ((tidx % Gmem_tile_o::THREADS_PER_ROW == 0) &&
       (tidx / Gmem_tile_o::THREADS_PER_ROW < Gmem_tile_o::ROWS)) {
     gmem_softmax_lse.store_row(
-        reinterpret_cast<uint32_t(&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]),
+        reinterpret_cast<uint32_t (&)[Mma_tile_p::MMAS_M]>(p_sum_log[jj]),
         rows[jj]);
   }
 }

python/aitemplate/backend/cuda/elementwise/custom_math.cuh

Lines changed: 10 additions & 40 deletions
@@ -25,50 +25,20 @@

 #define NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)

-#define CUDA_FP16_ZERO \
-  __half { \
-    0x0u \
-  }
-#define CUDA_BF16_ZERO \
-  __nv_bfloat16 { \
-    0x0u \
-  }
-#define CUDA_FP162_ZERO \
-  __half2 { \
-    0x0u, 0x0u \
-  }
-#define CUDA_BF162_ZERO \
-  __nv_bfloat162 { \
-    0x0u, 0x0u \
-  }
-#define CUDA_FP16_ONE \
-  __half_raw { \
-    0x3c00u \
-  }
-#define CUDA_BF16_ONE \
-  __nv_bfloat16_raw { \
-    0x3f80u \
-  }
-#define CUDA_FP16_ONE_HALF \
-  __half_raw { \
-    0x3800u \
-  }
-#define CUDA_BF16_ONE_HALF \
-  __nv_bfloat16_raw { \
-    0x3f00u \
-  }
+#define CUDA_FP16_ZERO __half{0x0u}
+#define CUDA_BF16_ZERO __nv_bfloat16{0x0u}
+#define CUDA_FP162_ZERO __half2{0x0u, 0x0u}
+#define CUDA_BF162_ZERO __nv_bfloat162{0x0u, 0x0u}
+#define CUDA_FP16_ONE __half_raw{0x3c00u}
+#define CUDA_BF16_ONE __nv_bfloat16_raw{0x3f80u}
+#define CUDA_FP16_ONE_HALF __half_raw{0x3800u}
+#define CUDA_BF16_ONE_HALF __nv_bfloat16_raw{0x3f00u}

 // sqrt(2 / pi)
-#define CUDA_BF16_K1 \
-  __nv_bfloat16_raw { \
-    0x3f4c \
-  }
+#define CUDA_BF16_K1 __nv_bfloat16_raw{0x3f4c}

 // 2/(3*pi) - 1/6
-#define CUDA_BF16_K3 \
-  __nv_bfloat16_raw { \
-    0x3d3a \
-  }
+#define CUDA_BF16_K3 __nv_bfloat16_raw{0x3d3a}

 template <typename T>
 __device__ T sign_custom(const T a) {
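
The collapsed macros above build constants directly from IEEE bit patterns: 0x3c00 is 1.0 and 0x3800 is 0.5 in binary16, while 0x3f80 and 0x3f00 are 1.0 and 0.5 in bfloat16. A small host-side sketch that decodes a binary16 pattern makes the encoding explicit; the decode_half helper here is illustrative and not part of this header:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decode an IEEE binary16 bit pattern (1 sign, 5 exponent, 10 mantissa bits).
// Handles normal numbers only, which is enough to check these constants.
float decode_half(std::uint16_t bits) {
  int sign = (bits >> 15) & 0x1;
  int exponent = (bits >> 10) & 0x1f;
  int mantissa = bits & 0x3ff;
  float value = (1.0f + mantissa / 1024.0f) * std::ldexp(1.0f, exponent - 15);
  return sign ? -value : value;
}

int main() {
  std::printf("0x3c00 -> %g\n", decode_half(0x3c00)); // 1.0 (CUDA_FP16_ONE)
  std::printf("0x3800 -> %g\n", decode_half(0x3800)); // 0.5 (CUDA_FP16_ONE_HALF)
  return 0;
}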

python/aitemplate/backend/cuda/groupnorm/groupnorm_kernel.cuh

Lines changed: 4 additions & 16 deletions
@@ -68,22 +68,10 @@ __device__ bfloat16 fast_tanh(bfloat16 x) {
 #endif
 }

-#define CUDA_FP16_ONE_HALF \
-  __half_raw { \
-    0x3800u \
-  }
-#define CUDA_FP16_ONE \
-  __half_raw { \
-    0x3c00u \
-  }
-#define CUDA_BF16_ONE_HALF \
-  __nv_bfloat16_raw { \
-    0x3f00u \
-  }
-#define CUDA_BF16_ONE \
-  __nv_bfloat16_raw { \
-    0x3f80u \
-  }
+#define CUDA_FP16_ONE_HALF __half_raw{0x3800u}
+#define CUDA_FP16_ONE __half_raw{0x3c00u}
+#define CUDA_BF16_ONE_HALF __nv_bfloat16_raw{0x3f00u}
+#define CUDA_BF16_ONE __nv_bfloat16_raw{0x3f80u}

 __device__ float sigmoid(const float a) {
   return (cutlass::fast_tanh(a * 0.5f) + 1.0f) * 0.5f;

python/aitemplate/backend/cuda/groupnorm/layer_norm.cuh

Lines changed: 1 addition & 2 deletions
@@ -50,8 +50,7 @@ struct MaxOp {
 };

 template <
-    template <typename>
-    class ReductionOp,
+    template <typename> class ReductionOp,
     typename T,
     int thread_group_width = kWarpSize>
 __inline__ __device__ T WarpAllReduce(T val) {
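
template <typename> class ReductionOp, now kept on one line, is a template template parameter: WarpAllReduce is parameterized over a family of reduction functors such as MaxOp. A simplified CPU-only sketch of the same idea follows, with hypothetical SumOp/MaxOp functors and a sequential loop standing in for the warp shuffle:

#include <algorithm>
#include <cstdio>

template <typename T>
struct SumOp {
  T operator()(const T& a, const T& b) const { return a + b; }
};

template <typename T>
struct MaxOp {
  T operator()(const T& a, const T& b) const { return std::max(a, b); }
};

// The reduction *template* itself is passed as a parameter, mirroring
// WarpAllReduce<ReductionOp, T, width>(val) in layer_norm.cuh.
template <template <typename> class ReductionOp, typename T, int N>
T Reduce(const T (&vals)[N]) {
  ReductionOp<T> op;
  T acc = vals[0];
  for (int i = 1; i < N; ++i) {
    acc = op(acc, vals[i]);
  }
  return acc;
}

int main() {
  float v[4] = {1.f, 4.f, 2.f, 3.f};
  std::printf("sum = %g, max = %g\n", Reduce<SumOp>(v), Reduce<MaxOp>(v));
  return 0;
}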

python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layer_norm.cuh

Lines changed: 1 addition & 2 deletions
@@ -50,8 +50,7 @@ struct MaxOp {
 };

 template <
-    template <typename>
-    class ReductionOp,
+    template <typename> class ReductionOp,
     typename T,
     int thread_group_width = kWarpSize>
 __inline__ __device__ T WarpAllReduce(T val) {

python/aitemplate/backend/cuda/layernorm_sigmoid_mul/layernorm_welford.cuh

Lines changed: 4 additions & 16 deletions
@@ -60,22 +60,10 @@ __device__ bfloat16 fast_tanh(bfloat16 x) {
 #endif
 }

-#define CUDA_FP16_ONE_HALF \
-  __half_raw { \
-    0x3800u \
-  }
-#define CUDA_FP16_ONE \
-  __half_raw { \
-    0x3c00u \
-  }
-#define CUDA_BF16_ONE_HALF \
-  __nv_bfloat16_raw { \
-    0x3f00u \
-  }
-#define CUDA_BF16_ONE \
-  __nv_bfloat16_raw { \
-    0x3f80u \
-  }
+#define CUDA_FP16_ONE_HALF __half_raw{0x3800u}
+#define CUDA_FP16_ONE __half_raw{0x3c00u}
+#define CUDA_BF16_ONE_HALF __nv_bfloat16_raw{0x3f00u}
+#define CUDA_BF16_ONE __nv_bfloat16_raw{0x3f80u}

 __device__ float sigmoid(const float a) {
   return (cutlass::fast_tanh(a * 0.5f) + 1.0f) * 0.5f;

python/aitemplate/backend/cuda/softmax/softmax.cuh

Lines changed: 3 additions & 2 deletions
@@ -459,8 +459,9 @@ __global__ void softmax_block_smem(

   const int m_idx = blockIdx.x;
   const int tid = threadIdx.x;
-  extern __shared__ __align__(sizeof(
-      float)) unsigned char shared_buf[]; // size_t smem = n*sizeof(float)
+  extern __shared__ __align__(
+      sizeof(
+          float)) unsigned char shared_buf[]; // size_t smem = n*sizeof(float)
   auto* buf = reinterpret_cast<float*>(shared_buf);
   const int num_packs = (n + pack_size - 1) / pack_size;
   for (int64_t row = m_idx; row < m; row += gridDim.x) {
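
The reflowed declaration is the usual CUDA idiom for dynamically sized shared memory: one extern, aligned byte array whose size is supplied at launch time and which the kernel then reinterprets as the element type. A minimal sketch under that assumption, using a hypothetical reverse_rows kernel rather than the softmax code itself:

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical kernel: each block reverses one row of n floats, staging the
// row in dynamically sized shared memory declared the same way as in
// softmax_block_smem.
__global__ void reverse_rows(const float* in, float* out, int n) {
  extern __shared__ __align__(sizeof(float)) unsigned char shared_buf[];
  auto* buf = reinterpret_cast<float*>(shared_buf);

  const float* row_in = in + blockIdx.x * n;
  float* row_out = out + blockIdx.x * n;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    buf[i] = row_in[i];
  }
  __syncthreads();
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    row_out[i] = buf[n - 1 - i];
  }
}

int main() {
  const int m = 2, n = 256;
  float *in, *out;
  cudaMallocManaged(&in, m * n * sizeof(float));
  cudaMallocManaged(&out, m * n * sizeof(float));
  for (int i = 0; i < m * n; ++i) in[i] = static_cast<float>(i);

  // The third launch parameter is the dynamic shared-memory size in bytes,
  // matching the "size_t smem = n*sizeof(float)" comment in the diff.
  reverse_rows<<<m, 128, n * sizeof(float)>>>(in, out, n);
  cudaDeviceSynchronize();

  std::printf("out[0] = %g (expected %g)\n", out[0], in[n - 1]);
  cudaFree(in);
  cudaFree(out);
  return 0;
}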

static/csrc/windll.cpp

Lines changed: 4 additions & 3 deletions
@@ -55,9 +55,10 @@ void GetConstantsBin(void** address, size_t* size) {
   if (!hResourceData) {
     // could not load a resource
     auto errorCode = GetLastError();
-    TRIGGER_ERROR(std::string(
-        "LoadResource() call in GetConstantsBin() has failed with error " +
-        std::to_string(errorCode)));
+    TRIGGER_ERROR(
+        std::string(
+            "LoadResource() call in GetConstantsBin() has failed with error " +
+            std::to_string(errorCode)));
   }

   DWORD resourceSize = SizeofResource(SavedDllHandle, hResource);
