PaddlePaddle · co63oc · Apr 11, 2026
diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu
@@ -28,25 +28,23 @@ template <typename T, typename Enable = void>
 struct CudaAbsFunctor;
 
 template <typename T>
-struct CudaAbsFunctor<T, funcs::Complex<T, phi::dtype::Real<T>>> {
-  __device__ __forceinline__ phi::dtype::Real<T> operator()(const T x) const {
+struct CudaAbsFunctor<T, funcs::Complex<T, dtype::Real<T>>> {
+  __device__ __forceinline__ dtype::Real<T> operator()(const T x) const {
     return abs(x);
   }
 };
 
 template <typename T>
-struct CudaAbsFunctor<
-    T,
-    std::enable_if_t<std::is_same<T, phi::dtype::Real<T>>::value &&
-                     std::is_same<T, phi::bfloat16>::value>> {
+struct CudaAbsFunctor<T,
+                      std::enable_if_t<std::is_same<T, dtype::Real<T>>::value &&
+                                       std::is_same<T, bfloat16>::value>> {
   __device__ __forceinline__ T operator()(const T x) const { return abs(x); }
 };
 
 template <typename T>
-struct CudaAbsFunctor<
-    T,
-    std::enable_if_t<std::is_same<T, phi::dtype::Real<T>>::value &&
-                     !std::is_same<T, phi::bfloat16>::value>> {
+struct CudaAbsFunctor<T,
+                      std::enable_if_t<std::is_same<T, dtype::Real<T>>::value &&
+                                       !std::is_same<T, bfloat16>::value>> {
   __device__ __forceinline__ T operator()(const T x) const {
     return std::abs(x);
   }
@@ -56,12 +54,12 @@ template <typename T, typename Context>
 PADDLE_API void AbsKernel(const Context& dev_ctx,
                           const DenseTensor& x,
                           DenseTensor* out) {
-  dev_ctx.template Alloc<phi::dtype::Real<T>>(out);
+  dev_ctx.template Alloc<dtype::Real<T>>(out);
   std::vector<const DenseTensor*> ins = {&x};
   std::vector<DenseTensor*> outs = {out};
   auto functor = CudaAbsFunctor<T>();
 
-  funcs::ElementwiseKernel<phi::dtype::Real<T>>(dev_ctx, ins, &outs, functor);
+  funcs::ElementwiseKernel<dtype::Real<T>>(dev_ctx, ins, &outs, functor);
 }
 
 }  // namespace phi

diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu
@@ -24,7 +24,6 @@
 #include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
-using phi::PADDLE_CUDA_NUM_THREADS;
 
 template <int BlockSize, typename T>
 __global__ void AccuracyCudaKernel(const int64_t N,
@@ -34,7 +33,7 @@ __global__ void AccuracyCudaKernel(const int64_t N,
                                    int* correct_data,
                                    T* accuracy,
                                    int* total_data) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   int count = 0;
   __shared__ int total[BlockSize];
 
@@ -98,7 +97,7 @@ void AccuracyKernel(const Context& dev_ctx,
   int64_t num_samples = inference.dims()[0];
   size_t infer_width = inference.dims()[1];
   auto stream = dev_ctx.stream();
-  phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream);
+  backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream);
 
   PADDLE_ENFORCE_GT(label.dims().size(),
                     0,

diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -371,8 +371,7 @@ void PowGradKernel(const Context& dev_ctx,
                    DenseTensor* dx) {
   if (factor.to<double>() == 0) {
     std::vector<int64_t> vec_dims = vectorize(dx->dims());
-    phi::Full<T, Context>(
-        dev_ctx, phi::IntArray(vec_dims), static_cast<T>(0), dx);
+    Full<T, Context>(dev_ctx, IntArray(vec_dims), static_cast<T>(0), dx);
     return;
   }
   if (factor.to<double>() == 1) {

diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -283,8 +283,7 @@ void PowKernel(const Context& dev_ctx,
   }
   if (factor.to<double>() == 0) {
     std::vector<int64_t> vec_dims = vectorize(out->dims());
-    phi::Full<T, Context>(
-        dev_ctx, phi::IntArray(vec_dims), static_cast<T>(1), out);
+    Full<T, Context>(dev_ctx, IntArray(vec_dims), static_cast<T>(1), out);
     return;
   }
   if (factor.to<double>() == 1) {

diff --git a/paddle/phi/kernels/gpu/adagrad_kernel.cu b/paddle/phi/kernels/gpu/adagrad_kernel.cu
@@ -70,7 +70,7 @@ struct DenseAdagradFunctor<GPUContext, T> {
                   DenseTensor* param_out_tensor,
                   DenseTensor* moment_out_tensor,
                   DenseTensor* master_param_outs) {
-    using MT = typename phi::dtype::template MPTypeTrait<T>::Type;
+    using MT = typename dtype::template MPTypeTrait<T>::Type;
     T* param_out_data = dev_ctx.template Alloc<T>(param_out_tensor);
     MT* moment_out_data = dev_ctx.template Alloc<MT>(moment_out_tensor);
     const MT* master_in_data =
@@ -82,7 +82,7 @@ struct DenseAdagradFunctor<GPUContext, T> {
     MT epsilon = static_cast<MT>(epsilon_t);
 
     int64_t numel = param_t.numel();
-    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
+    auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
     int grid = config.block_per_grid.x;
     int block = config.thread_per_block.x;
     auto stream = dev_ctx.stream();
@@ -180,7 +180,7 @@ struct SparseAdagradFunctor<GPUContext, T> {
     const int block_size = 256;
     dim3 threads(block_size, 1);
     dim3 grid2(1, merge_rows.size());
-    phi::MixVector<int64_t> mixv_merge_rows(&merge_rows);
+    MixVector<int64_t> mixv_merge_rows(&merge_rows);
     SparseAdagradFunctorKernel<T, 256>
         <<<grid2,
            threads,
@@ -201,7 +201,7 @@ template struct SparseAdagradFunctor<GPUContext, float>;
 template struct SparseAdagradFunctor<GPUContext, double>;
 template struct DenseAdagradFunctor<GPUContext, float>;
 template struct DenseAdagradFunctor<GPUContext, double>;
-template struct DenseAdagradFunctor<GPUContext, phi::float16>;
+template struct DenseAdagradFunctor<GPUContext, float16>;
 
 }  // namespace phi
 

diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu
@@ -187,7 +187,7 @@ PADDLE_API void AdamDenseKernel(const Context& dev_ctx,
                                 DenseTensor* beta1_pow_out,
                                 DenseTensor* beta2_pow_out,
                                 DenseTensor* master_param_outs) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   const auto grad_type = grad.dtype();
 
   VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
@@ -394,7 +394,7 @@ void MergedAdamKernel(
     std::vector<DenseTensor*> beta1_pow_out,
     std::vector<DenseTensor*> beta2_pow_out,
     std::vector<DenseTensor*> master_param_out) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
   MT beta1_ = beta1.to<MT>();
   MT beta2_ = beta2.to<MT>();

diff --git a/paddle/phi/kernels/gpu/adamax_kernel.cu b/paddle/phi/kernels/gpu/adamax_kernel.cu
@@ -84,7 +84,7 @@ void AdamaxKernel(const Context& dev_ctx,
                   DenseTensor* moment_out,
                   DenseTensor* inf_norm_out,
                   DenseTensor* master_param_outs) {
-  using MT = typename phi::dtype::template MPTypeTrait<T>::Type;
+  using MT = typename dtype::template MPTypeTrait<T>::Type;
   T* param_out_data = dev_ctx.template Alloc<T>(param_out);
   MT* moment_out_data = dev_ctx.template Alloc<MT>(moment_out);
   MT* inf_norm_out_data = dev_ctx.template Alloc<MT>(inf_norm_out);
@@ -104,7 +104,7 @@ void AdamaxKernel(const Context& dev_ctx,
   MT epsilon_ = static_cast<MT>(epsilon);
 
   int64_t numel = param.numel();
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
+  auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
   int grid = config.block_per_grid.x;
   int block = config.thread_per_block.x;
   auto stream = dev_ctx.stream();

diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu
@@ -168,7 +168,7 @@ PADDLE_API void AdamwDenseKernel(const Context& dev_ctx,
                                  DenseTensor* beta1_pow_out,
                                  DenseTensor* beta2_pow_out,
                                  DenseTensor* master_param_outs) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   MT coeff_ = static_cast<MT>(coeff);
   MT lr_ratio_ = static_cast<MT>(lr_ratio);
 

diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu
@@ -25,7 +25,7 @@ namespace phi {
 template <class T>
 __global__ void SumArrayCUDAKernel(
     T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
     MPType total(read_dst ? static_cast<MPType>(out[idx])
                           : static_cast<MPType>(0));
@@ -46,7 +46,7 @@ __global__ void SumArrayMixedTypeCUDAKernel(const T *in_0,
                                             int64_t N,
                                             size_t in_others_size,
                                             bool read_dst) {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
     MPType total(read_dst ? static_cast<MPType>(out[idx])
                           : static_cast<MPType>(0));
@@ -128,7 +128,7 @@ void AddNKernel(const Context &dev_ctx,
     int64_t length_0 = in_0.numel();
     int64_t length_1 = in_1.numel();
     if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
-      using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+      using MPType = typename dtype::MPTypeTrait<T>::Type;
       auto result = EigenVector<T>::Flatten(*out);
       auto &place = *dev_ctx.eigen_device();
       auto in_0_e = EigenVector<T>::Flatten(in_0).template cast<MPType>();
@@ -281,7 +281,7 @@ void AddNKernel(const Context &dev_ctx,
       }
     }
     if (!sr_in_out_data.empty()) {
-      auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
+      auto tmp_sr_in_out_array = memory_utils::Alloc(
           dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
 
       size_t nbytes_sr = sr_in_out_data.size() * sizeof(T *);
@@ -306,8 +306,8 @@ void AddNKernel(const Context &dev_ctx,
   }
   // if indata not null, merge into one kernel call.
   if (!in_data.empty()) {
-    auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
-                                                 in_data.size() * sizeof(T *));
+    auto tmp_in_array =
+        memory_utils::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *));
 
     size_t nbytes_in2 = in_data.size() * sizeof(T *);
     const void *stable_in2 =

diff --git a/paddle/phi/kernels/gpu/all_to_all_kernel.cu b/paddle/phi/kernels/gpu/all_to_all_kernel.cu
@@ -63,10 +63,9 @@ void AllToAllKernel(const Context& dev_ctx,
   const auto* send_buf = x.data<T>();
   auto* recv_buf = out->data<T>();
   for (auto i = 0; i < nranks; ++i) {
-    auto send_buf = phi::distributed::GetPartialTensor(x, offset, send_numel);
+    auto send_buf = distributed::GetPartialTensor(x, offset, send_numel);
     comm_ctx->Send(send_buf, send_numel, i, stream);
-    auto recv_buf =
-        phi::distributed::GetPartialTensor(*out, offset, send_numel);
+    auto recv_buf = distributed::GetPartialTensor(*out, offset, send_numel);
     comm_ctx->Recv(&recv_buf, send_numel, i, stream);
     offset += send_numel;
   }

diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu
@@ -36,7 +36,7 @@ __global__ void AllcloseCUDAKernel(const T* in_data,
                                    bool* out_data) {
   unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
   bool val;
-  using BaseMPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using BaseMPType = typename dtype::MPTypeTrait<T>::Type;
 
   using MPType =
       typename std::conditional<std::is_same<T, int32_t>::value ||
@@ -98,8 +98,7 @@ void AllCloseKernel(const Context& dev_ctx,
 
   int64_t num = x.numel();
   const int vec_size = 4;
-  auto config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, num, vec_size);
+  auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, num, vec_size);
   uint32_t grid = config.block_per_grid.x;
   uint32_t block = config.thread_per_block.x;
 

diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu
@@ -159,13 +159,13 @@ class LazyZeros<GPUContext, T> {
     const auto& cpu_place = CPUPlace();
     // alloc each tensor's start index and copy to device
     auto h_in_starts_mem =
-        phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
+        memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
     int64_t* h_starts = reinterpret_cast<int64_t*>(h_in_starts_mem->ptr());
 
-    auto d_in_starts_mem = phi::memory_utils::Alloc(
+    auto d_in_starts_mem = memory_utils::Alloc(
         dev_ctx.GetPlace(),
         (xs_size + 1) * sizeof(int64_t),
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+        Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
     int64_t* d_starts = reinterpret_cast<int64_t*>(d_in_starts_mem->ptr());
 
     // the start index value of each tensor is
@@ -186,14 +186,13 @@ class LazyZeros<GPUContext, T> {
                        dev_ctx.stream());
 
     // copy each tensor of "outs" data address array to device
-    auto h_out_addrs_mem =
-        phi::memory_utils::Alloc(cpu_place, xs_size * sizeof(T*));
+    auto h_out_addrs_mem = memory_utils::Alloc(cpu_place, xs_size * sizeof(T*));
     T** h_out_addrs = reinterpret_cast<T**>(h_out_addrs_mem->ptr());
 
-    auto d_out_addrs_mem = phi::memory_utils::Alloc(
+    auto d_out_addrs_mem = memory_utils::Alloc(
         dev_ctx.GetPlace(),
         xs_size * sizeof(T*),
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+        Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
     T** d_out_addrs = reinterpret_cast<T**>(d_out_addrs_mem->ptr());
 
     for (size_t i = 0; i < xs_size; ++i) {
@@ -277,7 +276,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
                                  const DenseTensor& scale,
                                  std::vector<DenseTensor*> outs,
                                  DenseTensor* found_infinite) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
 
   const MT* scale_data = scale.data<MT>();
   bool* found_inf_data = dev_ctx.template Alloc<bool>(found_infinite);
@@ -294,13 +293,13 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
   const auto& cpu_place = CPUPlace();
   // calculate each tensor's start index and copy to device
   auto h_starts_tensor =
-      phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
+      memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
   int64_t* h_starts = reinterpret_cast<int64_t*>(h_starts_tensor->ptr());
 
-  auto d_starts_tensor = phi::memory_utils::Alloc(
-      dev_ctx.GetPlace(),
-      (xs_size + 1) * sizeof(int64_t),
-      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  auto d_starts_tensor =
+      memory_utils::Alloc(dev_ctx.GetPlace(),
+                          (xs_size + 1) * sizeof(int64_t),
+                          Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
   int64_t* d_starts = reinterpret_cast<int64_t*>(d_starts_tensor->ptr());
 
   // the start index value of each tensor is
@@ -322,14 +321,14 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
                      dev_ctx.stream());
 
   // copy each tensor's data address to device
-  auto h_mem = phi::memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
+  auto h_mem = memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
   const T** h_xs = reinterpret_cast<const T**>(h_mem->ptr());
   T** h_outs = reinterpret_cast<T**>(h_mem->ptr()) + xs_size;
 
-  auto d_mem = phi::memory_utils::Alloc(
-      dev_ctx.GetPlace(),
-      2 * xs_size * sizeof(T*),
-      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  auto d_mem =
+      memory_utils::Alloc(dev_ctx.GetPlace(),
+                          2 * xs_size * sizeof(T*),
+                          Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
   const T** d_xs = reinterpret_cast<const T**>(d_mem->ptr());
   T** d_outs = reinterpret_cast<T**>(d_mem->ptr()) + xs_size;
 

diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu
@@ -37,7 +37,7 @@ void ArangeTensorKernel(const Context& dev_ctx,
                         const DenseTensor& end,
                         const DenseTensor& step,
                         DenseTensor* out) {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   MPType start_value =
       static_cast<MPType>(GetValue<T, Context>(dev_ctx, start));
   MPType end_value = static_cast<MPType>(GetValue<T, Context>(dev_ctx, end));
@@ -64,7 +64,7 @@ void ArangeNullaryKernel(const Context& dev_ctx,
                          const T end_value,
                          const T step_value,
                          DenseTensor* out) {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   MPType start_value_mpt = static_cast<MPType>(start_value);
   MPType end_value_mpt = static_cast<MPType>(end_value);
   MPType step_value_mpt = static_cast<MPType>(step_value);

diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
@@ -232,7 +232,7 @@ void ArgMinMaxOpCUDAKernel(const Context& dev_ctx,
             dev_ctx, x, axis.to<int64_t>(), keepdims, flatten, out));
     return;
   }
-  phi::VisitDataTypeTiny(
+  VisitDataTypeTiny(
       dtype,
       VisitDataCudaArgMinMaxFunctor<Context, T, Reducer>(
           dev_ctx, x, axis.to<int64_t>(), keepdims, flatten, out));

diff --git a/paddle/phi/kernels/gpu/argsort_kernel.cu b/paddle/phi/kernels/gpu/argsort_kernel.cu
@@ -375,7 +375,7 @@ void ArgsortKernel(const Context& dev_ctx,
         PerSort<T, int64_t>(
             dev_ctx, out_data, ids_data, start, end, stable, descending);
         if (start != 0) {
-          auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, end);
+          auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, end);
           merge_kernel<<<config.block_per_grid.x,
                          config.thread_per_block.x,
                          0,

diff --git a/paddle/phi/kernels/gpu/asgd_kernel.cu b/paddle/phi/kernels/gpu/asgd_kernel.cu
@@ -69,7 +69,7 @@ void ASGDKernel(const Context& dev_ctx,
                 DenseTensor* d_out,
                 DenseTensor* y_out,
                 DenseTensor* master_param_out) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   const MT* master_in_data =
       multi_precision ? master_param->data<MT>() : nullptr;
   MT* master_out_data =

diff --git a/paddle/phi/kernels/gpu/auc_kernel.cu b/paddle/phi/kernels/gpu/auc_kernel.cu
@@ -18,8 +18,6 @@
 
 namespace phi {
 
-using phi::PADDLE_CUDA_NUM_THREADS;
-
 __global__ void ClearObsoleteDataKernel(int64_t *pos,
                                         int64_t *neg,
                                         const int bucket_length,