diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu
index ef69c22275b808..ed3c1882223390 100644
--- a/paddle/phi/kernels/gpu/abs_kernel.cu
+++ b/paddle/phi/kernels/gpu/abs_kernel.cu
@@ -28,25 +28,23 @@ template <typename T, typename Enable = void>
 struct CudaAbsFunctor;
 
 template <typename T>
-struct CudaAbsFunctor<T, funcs::Complex<T, phi::dtype::Real<T>>> {
-  __device__ __forceinline__ phi::dtype::Real<T> operator()(const T x) const {
+struct CudaAbsFunctor<T, funcs::Complex<T, dtype::Real<T>>> {
+  __device__ __forceinline__ dtype::Real<T> operator()(const T x) const {
     return abs(x);
   }
 };
 
 template <typename T>
-struct CudaAbsFunctor<
-    T,
-    std::enable_if_t<std::is_same<T, phi::dtype::Real<T>>::value &&
-                     std::is_same<T, phi::bfloat16>::value>> {
+struct CudaAbsFunctor<T,
+                      std::enable_if_t<std::is_same<T, dtype::Real<T>>::value &&
+                                       std::is_same<T, bfloat16>::value>> {
   __device__ __forceinline__ T operator()(const T x) const { return abs(x); }
 };
 
 template <typename T>
-struct CudaAbsFunctor<
-    T,
-    std::enable_if_t<std::is_same<T, phi::dtype::Real<T>>::value &&
-                     !std::is_same<T, phi::bfloat16>::value>> {
+struct CudaAbsFunctor<T,
+                      std::enable_if_t<std::is_same<T, dtype::Real<T>>::value &&
+                                       !std::is_same<T, bfloat16>::value>> {
   __device__ __forceinline__ T operator()(const T x) const {
     return std::abs(x);
   }
@@ -56,12 +54,12 @@ template <typename T, typename Context>
 PADDLE_API void AbsKernel(const Context& dev_ctx,
                           const DenseTensor& x,
                           DenseTensor* out) {
-  dev_ctx.template Alloc<phi::dtype::Real<T>>(out);
+  dev_ctx.template Alloc<dtype::Real<T>>(out);
   std::vector<const DenseTensor*> ins = {&x};
   std::vector<DenseTensor*> outs = {out};
   auto functor = CudaAbsFunctor<T>();
 
-  funcs::ElementwiseKernel<phi::dtype::Real<T>>(dev_ctx, ins, &outs, functor);
+  funcs::ElementwiseKernel<dtype::Real<T>>(dev_ctx, ins, &outs, functor);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu
index b1478e842f4cfa..f97d49d717144b 100644
--- a/paddle/phi/kernels/gpu/accuracy_kernel.cu
+++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu
@@ -24,7 +24,6 @@
 #include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
-using phi::PADDLE_CUDA_NUM_THREADS;
 
 template <int BlockSize, typename T>
 __global__ void AccuracyCudaKernel(const int64_t N,
@@ -34,7 +33,7 @@ __global__ void AccuracyCudaKernel(const int64_t N,
                                    int* correct_data,
                                    T* accuracy,
                                    int* total_data) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   int count = 0;
   __shared__ int total[BlockSize];
 
@@ -98,7 +97,7 @@ void AccuracyKernel(const Context& dev_ctx,
   int64_t num_samples = inference.dims()[0];
   size_t infer_width = inference.dims()[1];
   auto stream = dev_ctx.stream();
-  phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream);
+  backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream);
 
   PADDLE_ENFORCE_GT(label.dims().size(),
                     0,
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index 35ed74c551ec06..4e867f387e9082 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -371,8 +371,7 @@ void PowGradKernel(const Context& dev_ctx,
                    DenseTensor* dx) {
   if (factor.to<double>() == 0) {
     std::vector<int64_t> vec_dims = vectorize(dx->dims());
-    phi::Full<T, Context>(
-        dev_ctx, phi::IntArray(vec_dims), static_cast<T>(0), dx);
+    Full<T, Context>(dev_ctx, IntArray(vec_dims), static_cast<T>(0), dx);
     return;
   }
   if (factor.to<double>() == 1) {
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index 2fb25ec6151085..5da1b755ee8cf6 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -283,8 +283,7 @@ void PowKernel(const Context& dev_ctx,
   }
   if (factor.to<double>() == 0) {
     std::vector<int64_t> vec_dims = vectorize(out->dims());
-    phi::Full<T, Context>(
-        dev_ctx, phi::IntArray(vec_dims), static_cast<T>(1), out);
+    Full<T, Context>(dev_ctx, IntArray(vec_dims), static_cast<T>(1), out);
     return;
   }
   if (factor.to<double>() == 1) {
diff --git a/paddle/phi/kernels/gpu/adagrad_kernel.cu b/paddle/phi/kernels/gpu/adagrad_kernel.cu
index 303b180244feb5..69431f94844837 100644
--- a/paddle/phi/kernels/gpu/adagrad_kernel.cu
+++ b/paddle/phi/kernels/gpu/adagrad_kernel.cu
@@ -70,7 +70,7 @@ struct DenseAdagradFunctor<GPUContext, T> {
                   DenseTensor* param_out_tensor,
                   DenseTensor* moment_out_tensor,
                   DenseTensor* master_param_outs) {
-    using MT = typename phi::dtype::template MPTypeTrait<T>::Type;
+    using MT = typename dtype::template MPTypeTrait<T>::Type;
     T* param_out_data = dev_ctx.template Alloc<T>(param_out_tensor);
     MT* moment_out_data = dev_ctx.template Alloc<MT>(moment_out_tensor);
     const MT* master_in_data =
@@ -82,7 +82,7 @@ struct DenseAdagradFunctor<GPUContext, T> {
     MT epsilon = static_cast<MT>(epsilon_t);
 
     int64_t numel = param_t.numel();
-    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
+    auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
     int grid = config.block_per_grid.x;
     int block = config.thread_per_block.x;
     auto stream = dev_ctx.stream();
@@ -180,7 +180,7 @@ struct SparseAdagradFunctor<GPUContext, T> {
     const int block_size = 256;
     dim3 threads(block_size, 1);
     dim3 grid2(1, merge_rows.size());
-    phi::MixVector<int64_t> mixv_merge_rows(&merge_rows);
+    MixVector<int64_t> mixv_merge_rows(&merge_rows);
     SparseAdagradFunctorKernel<T, 256>
         <<<grid2,
            threads,
@@ -201,7 +201,7 @@ template struct SparseAdagradFunctor<GPUContext, float>;
 template struct SparseAdagradFunctor<GPUContext, double>;
 template struct DenseAdagradFunctor<GPUContext, float>;
 template struct DenseAdagradFunctor<GPUContext, double>;
-template struct DenseAdagradFunctor<GPUContext, phi::float16>;
+template struct DenseAdagradFunctor<GPUContext, float16>;
 
 }  // namespace phi
 
diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu
index 8cbde3b60d716d..c3c50207d09b96 100644
--- a/paddle/phi/kernels/gpu/adam_kernel.cu
+++ b/paddle/phi/kernels/gpu/adam_kernel.cu
@@ -187,7 +187,7 @@ PADDLE_API void AdamDenseKernel(const Context& dev_ctx,
                                 DenseTensor* beta1_pow_out,
                                 DenseTensor* beta2_pow_out,
                                 DenseTensor* master_param_outs) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   const auto grad_type = grad.dtype();
 
   VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
@@ -394,7 +394,7 @@ void MergedAdamKernel(
     std::vector<DenseTensor*> beta1_pow_out,
     std::vector<DenseTensor*> beta2_pow_out,
     std::vector<DenseTensor*> master_param_out) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
   MT beta1_ = beta1.to<MT>();
   MT beta2_ = beta2.to<MT>();
diff --git a/paddle/phi/kernels/gpu/adamax_kernel.cu b/paddle/phi/kernels/gpu/adamax_kernel.cu
index e6c60d485528c1..dda8d4c45ba1d6 100644
--- a/paddle/phi/kernels/gpu/adamax_kernel.cu
+++ b/paddle/phi/kernels/gpu/adamax_kernel.cu
@@ -84,7 +84,7 @@ void AdamaxKernel(const Context& dev_ctx,
                   DenseTensor* moment_out,
                   DenseTensor* inf_norm_out,
                   DenseTensor* master_param_outs) {
-  using MT = typename phi::dtype::template MPTypeTrait<T>::Type;
+  using MT = typename dtype::template MPTypeTrait<T>::Type;
   T* param_out_data = dev_ctx.template Alloc<T>(param_out);
   MT* moment_out_data = dev_ctx.template Alloc<MT>(moment_out);
   MT* inf_norm_out_data = dev_ctx.template Alloc<MT>(inf_norm_out);
@@ -104,7 +104,7 @@ void AdamaxKernel(const Context& dev_ctx,
   MT epsilon_ = static_cast<MT>(epsilon);
 
   int64_t numel = param.numel();
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
+  auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
   int grid = config.block_per_grid.x;
   int block = config.thread_per_block.x;
   auto stream = dev_ctx.stream();
diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu
index 0d11bc4ac8e932..7eee4fb0a4461f 100644
--- a/paddle/phi/kernels/gpu/adamw_kernel.cu
+++ b/paddle/phi/kernels/gpu/adamw_kernel.cu
@@ -168,7 +168,7 @@ PADDLE_API void AdamwDenseKernel(const Context& dev_ctx,
                                  DenseTensor* beta1_pow_out,
                                  DenseTensor* beta2_pow_out,
                                  DenseTensor* master_param_outs) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   MT coeff_ = static_cast<MT>(coeff);
   MT lr_ratio_ = static_cast<MT>(lr_ratio);
 
diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu
index 9aa073279319de..d2cc2b6e2b02a0 100644
--- a/paddle/phi/kernels/gpu/add_n_kernel.cu
+++ b/paddle/phi/kernels/gpu/add_n_kernel.cu
@@ -25,7 +25,7 @@ namespace phi {
 template <class T>
 __global__ void SumArrayCUDAKernel(
     T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
     MPType total(read_dst ? static_cast<MPType>(out[idx])
                           : static_cast<MPType>(0));
@@ -46,7 +46,7 @@ __global__ void SumArrayMixedTypeCUDAKernel(const T *in_0,
                                             int64_t N,
                                             size_t in_others_size,
                                             bool read_dst) {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
     MPType total(read_dst ? static_cast<MPType>(out[idx])
                           : static_cast<MPType>(0));
@@ -128,7 +128,7 @@ void AddNKernel(const Context &dev_ctx,
     int64_t length_0 = in_0.numel();
     int64_t length_1 = in_1.numel();
     if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
-      using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+      using MPType = typename dtype::MPTypeTrait<T>::Type;
       auto result = EigenVector<T>::Flatten(*out);
       auto &place = *dev_ctx.eigen_device();
       auto in_0_e = EigenVector<T>::Flatten(in_0).template cast<MPType>();
@@ -281,7 +281,7 @@ void AddNKernel(const Context &dev_ctx,
       }
     }
     if (!sr_in_out_data.empty()) {
-      auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
+      auto tmp_sr_in_out_array = memory_utils::Alloc(
           dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
 
       size_t nbytes_sr = sr_in_out_data.size() * sizeof(T *);
@@ -306,8 +306,8 @@ void AddNKernel(const Context &dev_ctx,
   }
   // if indata not null, merge into one kernel call.
   if (!in_data.empty()) {
-    auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
-                                                 in_data.size() * sizeof(T *));
+    auto tmp_in_array =
+        memory_utils::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *));
 
     size_t nbytes_in2 = in_data.size() * sizeof(T *);
     const void *stable_in2 =
diff --git a/paddle/phi/kernels/gpu/all_to_all_kernel.cu b/paddle/phi/kernels/gpu/all_to_all_kernel.cu
index 2ac99ee46b58a6..091662d90ea60c 100644
--- a/paddle/phi/kernels/gpu/all_to_all_kernel.cu
+++ b/paddle/phi/kernels/gpu/all_to_all_kernel.cu
@@ -63,10 +63,9 @@ void AllToAllKernel(const Context& dev_ctx,
   const auto* send_buf = x.data<T>();
   auto* recv_buf = out->data<T>();
   for (auto i = 0; i < nranks; ++i) {
-    auto send_buf = phi::distributed::GetPartialTensor(x, offset, send_numel);
+    auto send_buf = distributed::GetPartialTensor(x, offset, send_numel);
     comm_ctx->Send(send_buf, send_numel, i, stream);
-    auto recv_buf =
-        phi::distributed::GetPartialTensor(*out, offset, send_numel);
+    auto recv_buf = distributed::GetPartialTensor(*out, offset, send_numel);
     comm_ctx->Recv(&recv_buf, send_numel, i, stream);
     offset += send_numel;
   }
diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu
index dba76d4b1eac86..409d09c52e7bb1 100644
--- a/paddle/phi/kernels/gpu/allclose_kernel.cu
+++ b/paddle/phi/kernels/gpu/allclose_kernel.cu
@@ -36,7 +36,7 @@ __global__ void AllcloseCUDAKernel(const T* in_data,
                                    bool* out_data) {
   unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
   bool val;
-  using BaseMPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using BaseMPType = typename dtype::MPTypeTrait<T>::Type;
 
   using MPType =
       typename std::conditional<std::is_same<T, int32_t>::value ||
@@ -98,8 +98,7 @@ void AllCloseKernel(const Context& dev_ctx,
 
   int64_t num = x.numel();
   const int vec_size = 4;
-  auto config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, num, vec_size);
+  auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, num, vec_size);
   uint32_t grid = config.block_per_grid.x;
   uint32_t block = config.thread_per_block.x;
 
diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu
index e5880c74a4f501..22875642d68075 100644
--- a/paddle/phi/kernels/gpu/amp_kernel.cu
+++ b/paddle/phi/kernels/gpu/amp_kernel.cu
@@ -159,13 +159,13 @@ class LazyZeros<GPUContext, T> {
     const auto& cpu_place = CPUPlace();
     // alloc each tensor's start index and copy to device
     auto h_in_starts_mem =
-        phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
+        memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
     int64_t* h_starts = reinterpret_cast<int64_t*>(h_in_starts_mem->ptr());
 
-    auto d_in_starts_mem = phi::memory_utils::Alloc(
+    auto d_in_starts_mem = memory_utils::Alloc(
         dev_ctx.GetPlace(),
         (xs_size + 1) * sizeof(int64_t),
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+        Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
     int64_t* d_starts = reinterpret_cast<int64_t*>(d_in_starts_mem->ptr());
 
     // the start index value of each tensor is
@@ -186,14 +186,13 @@ class LazyZeros<GPUContext, T> {
                        dev_ctx.stream());
 
     // copy each tensor of "outs" data address array to device
-    auto h_out_addrs_mem =
-        phi::memory_utils::Alloc(cpu_place, xs_size * sizeof(T*));
+    auto h_out_addrs_mem = memory_utils::Alloc(cpu_place, xs_size * sizeof(T*));
     T** h_out_addrs = reinterpret_cast<T**>(h_out_addrs_mem->ptr());
 
-    auto d_out_addrs_mem = phi::memory_utils::Alloc(
+    auto d_out_addrs_mem = memory_utils::Alloc(
         dev_ctx.GetPlace(),
         xs_size * sizeof(T*),
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+        Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
     T** d_out_addrs = reinterpret_cast<T**>(d_out_addrs_mem->ptr());
 
     for (size_t i = 0; i < xs_size; ++i) {
@@ -277,7 +276,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
                                  const DenseTensor& scale,
                                  std::vector<DenseTensor*> outs,
                                  DenseTensor* found_infinite) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
 
   const MT* scale_data = scale.data<MT>();
   bool* found_inf_data = dev_ctx.template Alloc<bool>(found_infinite);
@@ -294,13 +293,13 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
   const auto& cpu_place = CPUPlace();
   // calculate each tensor's start index and copy to device
   auto h_starts_tensor =
-      phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
+      memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
   int64_t* h_starts = reinterpret_cast<int64_t*>(h_starts_tensor->ptr());
 
-  auto d_starts_tensor = phi::memory_utils::Alloc(
-      dev_ctx.GetPlace(),
-      (xs_size + 1) * sizeof(int64_t),
-      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  auto d_starts_tensor =
+      memory_utils::Alloc(dev_ctx.GetPlace(),
+                          (xs_size + 1) * sizeof(int64_t),
+                          Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
   int64_t* d_starts = reinterpret_cast<int64_t*>(d_starts_tensor->ptr());
 
   // the start index value of each tensor is
@@ -322,14 +321,14 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
                      dev_ctx.stream());
 
   // copy each tensor's data address to device
-  auto h_mem = phi::memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
+  auto h_mem = memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
   const T** h_xs = reinterpret_cast<const T**>(h_mem->ptr());
   T** h_outs = reinterpret_cast<T**>(h_mem->ptr()) + xs_size;
 
-  auto d_mem = phi::memory_utils::Alloc(
-      dev_ctx.GetPlace(),
-      2 * xs_size * sizeof(T*),
-      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  auto d_mem =
+      memory_utils::Alloc(dev_ctx.GetPlace(),
+                          2 * xs_size * sizeof(T*),
+                          Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
   const T** d_xs = reinterpret_cast<const T**>(d_mem->ptr());
   T** d_outs = reinterpret_cast<T**>(d_mem->ptr()) + xs_size;
 
diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu
index 35e53055533460..ee7873e0110f4e 100644
--- a/paddle/phi/kernels/gpu/arange_kernel.cu
+++ b/paddle/phi/kernels/gpu/arange_kernel.cu
@@ -37,7 +37,7 @@ void ArangeTensorKernel(const Context& dev_ctx,
                         const DenseTensor& end,
                         const DenseTensor& step,
                         DenseTensor* out) {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   MPType start_value =
       static_cast<MPType>(GetValue<T, Context>(dev_ctx, start));
   MPType end_value = static_cast<MPType>(GetValue<T, Context>(dev_ctx, end));
@@ -64,7 +64,7 @@ void ArangeNullaryKernel(const Context& dev_ctx,
                          const T end_value,
                          const T step_value,
                          DenseTensor* out) {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   MPType start_value_mpt = static_cast<MPType>(start_value);
   MPType end_value_mpt = static_cast<MPType>(end_value);
   MPType step_value_mpt = static_cast<MPType>(step_value);
diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
index 868257e9edbc64..3f98b957d60162 100644
--- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
+++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu
@@ -232,7 +232,7 @@ void ArgMinMaxOpCUDAKernel(const Context& dev_ctx,
             dev_ctx, x, axis.to<int64_t>(), keepdims, flatten, out));
     return;
   }
-  phi::VisitDataTypeTiny(
+  VisitDataTypeTiny(
       dtype,
       VisitDataCudaArgMinMaxFunctor<Context, T, Reducer>(
           dev_ctx, x, axis.to<int64_t>(), keepdims, flatten, out));
diff --git a/paddle/phi/kernels/gpu/argsort_kernel.cu b/paddle/phi/kernels/gpu/argsort_kernel.cu
index b351fe22a13104..7d73826d561c6e 100644
--- a/paddle/phi/kernels/gpu/argsort_kernel.cu
+++ b/paddle/phi/kernels/gpu/argsort_kernel.cu
@@ -375,7 +375,7 @@ void ArgsortKernel(const Context& dev_ctx,
         PerSort<T, int64_t>(
             dev_ctx, out_data, ids_data, start, end, stable, descending);
         if (start != 0) {
-          auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, end);
+          auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, end);
           merge_kernel<<<config.block_per_grid.x,
                          config.thread_per_block.x,
                          0,
diff --git a/paddle/phi/kernels/gpu/asgd_kernel.cu b/paddle/phi/kernels/gpu/asgd_kernel.cu
index 95a993fd3e5488..db66d3716e91c3 100644
--- a/paddle/phi/kernels/gpu/asgd_kernel.cu
+++ b/paddle/phi/kernels/gpu/asgd_kernel.cu
@@ -69,7 +69,7 @@ void ASGDKernel(const Context& dev_ctx,
                 DenseTensor* d_out,
                 DenseTensor* y_out,
                 DenseTensor* master_param_out) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   const MT* master_in_data =
       multi_precision ? master_param->data<MT>() : nullptr;
   MT* master_out_data =
diff --git a/paddle/phi/kernels/gpu/auc_kernel.cu b/paddle/phi/kernels/gpu/auc_kernel.cu
index 0d3d3f2545866d..4da580f474912e 100644
--- a/paddle/phi/kernels/gpu/auc_kernel.cu
+++ b/paddle/phi/kernels/gpu/auc_kernel.cu
@@ -18,8 +18,6 @@
 
 namespace phi {
 
-using phi::PADDLE_CUDA_NUM_THREADS;
-
 __global__ void ClearObsoleteDataKernel(int64_t *pos,
                                         int64_t *neg,
                                         const int bucket_length,
diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu
index fd639434f8193e..fe17bb4eb7b39e 100644
--- a/paddle/phi/kernels/gpu/barrier_kernel.cu
+++ b/paddle/phi/kernels/gpu/barrier_kernel.cu
@@ -27,8 +27,8 @@ void BarrierKernel(const Context &dev_ctx,
                    DenseTensor *out) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   auto in = &x;
-  auto comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
-      dev_ctx.GetCommContext());
+  auto comm_ctx =
+      static_cast<distributed::NCCLCommContext *>(dev_ctx.GetCommContext());
   PADDLE_ENFORCE_NE(comm_ctx,
                     nullptr,
                     common::errors::Unavailable(
@@ -37,7 +37,7 @@ void BarrierKernel(const Context &dev_ctx,
   auto stream = comm_ctx->GetStream();
   ncclRedOp_t nccl_red_type = ncclSum;
   comm_ctx->AllReduce(out, *in, nccl_red_type, stream);
-  phi::backends::gpu::GpuStreamSync(stream);
+  backends::gpu::GpuStreamSync(stream);
 #else
   PADDLE_THROW(
       common::errors::Unavailable("PaddlePaddle should compile with NCCL."));
diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
index cc81ffcc682bcf..abb34236991aa2 100644
--- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
@@ -42,7 +42,7 @@ COMMON_DECLARE_bool(batch_norm_use_miopen);
 namespace phi {
 
 template <typename T>
-using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
+using CudnnDataType = backends::gpu::CudnnDataType<T>;
 template <typename T>
 using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
@@ -611,7 +611,7 @@ void BatchNormGradFunctor(const Context &dev_ctx,
           C,
           new_scale.dims()[0]));
 
-  auto dtype = phi::backends::gpu::CudnnDataType<T>::type;
+  auto dtype = backends::gpu::CudnnDataType<T>::type;
 #ifdef PADDLE_WITH_HIP
   auto compute_format =
       data_layout == DataLayout::NHWC
diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
index 37b7cdfb1f0534..2500b2a553022d 100644
--- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu
@@ -42,7 +42,7 @@ COMMON_DECLARE_bool(batch_norm_use_miopen);
 namespace phi {
 
 template <typename T>
-using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
+using CudnnDataType = backends::gpu::CudnnDataType<T>;
 template <typename T>
 using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
@@ -588,7 +588,7 @@ void BatchNormKernel(const Context &dev_ctx,
   int N, C, H, W, D;
   funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
 
-  auto dtype = phi::backends::gpu::CudnnDataType<T>::type;
+  auto dtype = backends::gpu::CudnnDataType<T>::type;
 
   auto *Scale = scale.get_ptr();
   auto *Bias = bias.get_ptr();
@@ -599,13 +599,13 @@ void BatchNormKernel(const Context &dev_ctx,
   if (Scale) {
     new_scale = scale.get();
   } else {
-    new_scale = phi::Full<T, Context>(dev_ctx, {C}, static_cast<T>(1));
+    new_scale = Full<T, Context>(dev_ctx, {C}, static_cast<T>(1));
   }
 
   if (Bias) {
     new_bias = bias.get();
   } else {
-    new_bias = phi::Full<T, Context>(dev_ctx, {C}, static_cast<T>(0));
+    new_bias = Full<T, Context>(dev_ctx, {C}, static_cast<T>(0));
   }
 
 #ifdef PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu
index 3b05ea51a32151..3211a29d6cf365 100644
--- a/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu
@@ -27,7 +27,7 @@ namespace phi {
 
 template <typename T>
 struct BCELossGradFunctor {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   MT one = static_cast<MT>(1.0f);
   MT eps = static_cast<MT>(1e-12);
 
diff --git a/paddle/phi/kernels/gpu/bce_loss_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_kernel.cu
index 05866af47c6130..4cbb343223ae77 100644
--- a/paddle/phi/kernels/gpu/bce_loss_kernel.cu
+++ b/paddle/phi/kernels/gpu/bce_loss_kernel.cu
@@ -28,7 +28,7 @@ namespace phi {
 
 template <typename T>
 struct BCELossFunctor {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   MT zero = static_cast<MT>(0);
   MT one = static_cast<MT>(1.0f);
   MT neg_100 = static_cast<MT>(-100.);
@@ -42,8 +42,8 @@ struct BCELossFunctor {
         "Input is expected to be within the interval [0, 1], but received %f.",
         x_mt);
 
-    MT term1 = max(phi::kps::details::Log(x_mt), neg_100);
-    MT term2 = max(phi::kps::details::Log(one - x_mt), neg_100);
+    MT term1 = max(kps::details::Log(x_mt), neg_100);
+    MT term2 = max(kps::details::Log(one - x_mt), neg_100);
     return static_cast<T>((label_mt - one) * term2 - label_mt * term1);
   }
 };
diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
index e83cc590a1b310..9adf7c7cb4fe1c 100644
--- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu
+++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu
@@ -53,7 +53,7 @@ __global__ void bernoulli_cuda_kernel(
   for (size_t i = 4 * thread_idx; i < size; i += total_thread * 4) {
     funcs::uniform_distribution<float> dist;
     float4 rand = dist(&state);
-    using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+    using MPType = typename dtype::MPTypeTrait<T>::Type;
 #pragma unroll
     for (size_t j = 0; j < 4; j++) {
       size_t idx = i + j;
@@ -82,7 +82,7 @@ void BernoulliKernel(const Context& dev_ctx,
   uint64_t seed = seed_offset.first;
   uint64_t offset = seed_offset.second;
 
-  auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 4);
+  auto gpu_config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 4);
   size_t grid_size = gpu_config.GetGridSize();
   size_t block_size = gpu_config.GetBlockSize();
 
diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu
index a770eaa4562079..7a1f5d7d8427e0 100644
--- a/paddle/phi/kernels/gpu/bincount_kernel.cu
+++ b/paddle/phi/kernels/gpu/bincount_kernel.cu
@@ -22,8 +22,6 @@
 #include "paddle/phi/kernels/funcs/math_function.h"
 namespace phi {
 
-using phi::PADDLE_CUDA_NUM_THREADS;
-
 inline int64_t GET_BLOCKS(const int64_t N) {
   return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
 }
@@ -62,8 +60,8 @@ __global__ void KernelReduceMinMax(const T* input,
   }
 
   if (tid == 0) {
-    phi::CudaAtomicMin(min_out, smin[0]);
-    phi::CudaAtomicMax(max_out, smax[0]);
+    CudaAtomicMin(min_out, smin[0]);
+    CudaAtomicMax(max_out, smax[0]);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/binomial_kernel.cu b/paddle/phi/kernels/gpu/binomial_kernel.cu
index a5547c9f9aa4f9..67e5a012ea6e35 100644
--- a/paddle/phi/kernels/gpu/binomial_kernel.cu
+++ b/paddle/phi/kernels/gpu/binomial_kernel.cu
@@ -145,7 +145,7 @@ __global__ void BinomialSampling(const T* n,
                                  const int N,
                                  unsigned int seed,
                                  unsigned int offset) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
     MT nt = static_cast<MT>(n[idx]);
     MT pt = static_cast<MT>(p[idx]);
@@ -189,7 +189,7 @@ void BinomialKernel(const Context& dev_ctx,
   int block_size = std::min(kMaxBlockDim, dev_ctx.GetMaxThreadsPerBlock());
   dim3 dim_block(block_size);
   dim3 dim_grid((size + block_size - 1) / block_size);
-  phi::backends::gpu::LimitGridDim(dev_ctx, &dim_grid);
+  backends::gpu::LimitGridDim(dev_ctx, &dim_grid);
 
   auto gen_cuda = dev_ctx.GetGenerator();
   auto seed_offset = gen_cuda->IncrementOffset(20);
diff --git a/paddle/phi/kernels/gpu/box_clip_kernel.cu b/paddle/phi/kernels/gpu/box_clip_kernel.cu
index 0668965403e7ce..315934df1c6842 100644
--- a/paddle/phi/kernels/gpu/box_clip_kernel.cu
+++ b/paddle/phi/kernels/gpu/box_clip_kernel.cu
@@ -63,7 +63,7 @@ void GPUBoxClipKernel(const Context &dev_ctx,
   auto stream = dev_ctx.stream();
   const size_t batch_size = lod.back().size() - 1;
   T *output_data = dev_ctx.template Alloc<T>(output);
-  phi::MixVector<size_t> mix_vector(&abs_offset_lod[0]);
+  MixVector<size_t> mix_vector(&abs_offset_lod[0]);
   GPUBoxClip<T, 512><<<batch_size, 512, 0, stream>>>(
       input_p->data<T>(),
       mix_vector.CUDAMutableData(dev_ctx.GetPlace()),
diff --git a/paddle/phi/kernels/gpu/box_coder_kernel.cu b/paddle/phi/kernels/gpu/box_coder_kernel.cu
index 5ff86cac661ca1..bc2ed13f307649 100644
--- a/paddle/phi/kernels/gpu/box_coder_kernel.cu
+++ b/paddle/phi/kernels/gpu/box_coder_kernel.cu
@@ -209,10 +209,10 @@ void BoxCoderKernel(const Context &dev_ctx,
   int grid = (row * col + block - 1) / block;
 
   int64_t bytes = var_size * sizeof(float);
-  auto dev_var = phi::memory_utils::Alloc(
-      dev_ctx.GetPlace(),
-      bytes,
-      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  auto dev_var =
+      memory_utils::Alloc(dev_ctx.GetPlace(),
+                          bytes,
+                          Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
   float *dev_var_data = reinterpret_cast<float *>(dev_var->ptr());
   auto cplace = CPUPlace();
   const auto gplace = dev_ctx.GetPlace();
diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
index 518a8567ecd9f5..cb6cb3eb72f9a9 100644
--- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
@@ -89,12 +89,12 @@ void BroadcastTensorsGradKernel(const Context& dev_ctx,
       Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, output_tensor);
     } else {
       // reduce_sum implementation on CUDA
-      phi::SumKernel<T, Context>(dev_ctx,
-                                 *input_tensor,
-                                 reduce_dims_vec,
-                                 output_tensor->dtype(),
-                                 false,
-                                 output_tensor);
+      SumKernel<T, Context>(dev_ctx,
+                            *input_tensor,
+                            reduce_dims_vec,
+                            output_tensor->dtype(),
+                            false,
+                            output_tensor);
     }
   }
 }
diff --git a/paddle/phi/kernels/gpu/c_concat_kernel.cu b/paddle/phi/kernels/gpu/c_concat_kernel.cu
index 1cf9f41072708c..763915f70f1b49 100644
--- a/paddle/phi/kernels/gpu/c_concat_kernel.cu
+++ b/paddle/phi/kernels/gpu/c_concat_kernel.cu
@@ -69,13 +69,13 @@ void CConcatKernel(const Context& dev_ctx,
   gpuStream_t stream = nullptr;
 
 #if defined(PADDLE_WITH_FLAGCX) && defined(PADDLE_KERNEL_WITH_FLAGCX)
-  phi::distributed::FlagcxCommContext* comm_ctx = nullptr;
-  comm_ctx = static_cast<phi::distributed::FlagcxCommContext*>(
-      dev_ctx.GetCommContext());
+  distributed::FlagcxCommContext* comm_ctx = nullptr;
+  comm_ctx =
+      static_cast<distributed::FlagcxCommContext*>(dev_ctx.GetCommContext());
 #else
-  phi::distributed::NCCLCommContext* comm_ctx = nullptr;
+  distributed::NCCLCommContext* comm_ctx = nullptr;
   comm_ctx =
-      static_cast<phi::distributed::NCCLCommContext*>(dev_ctx.GetCommContext());
+      static_cast<distributed::NCCLCommContext*>(dev_ctx.GetCommContext());
 #endif
   PADDLE_ENFORCE_NE(comm_ctx,
                     nullptr,
diff --git a/paddle/phi/kernels/gpu/c_scatter_kernel.cu b/paddle/phi/kernels/gpu/c_scatter_kernel.cu
index e8e1f37137fe3c..19aa361d57dadb 100644
--- a/paddle/phi/kernels/gpu/c_scatter_kernel.cu
+++ b/paddle/phi/kernels/gpu/c_scatter_kernel.cu
@@ -35,12 +35,12 @@ void CScatterOpCUDAKernel(const Context& dev_ctx,
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   auto x = &input;
   int64_t numel = x->numel();
-  ncclDataType_t dtype = phi::ToNCCLDataType(x->dtype());
+  ncclDataType_t dtype = ToNCCLDataType(x->dtype());
 
   int root_id = root;
   auto place = dev_ctx.GetPlace();
   gpuStream_t stream = nullptr;
-  phi::distributed::NCCLCommContext* comm_ctx = nullptr;
+  distributed::NCCLCommContext* comm_ctx = nullptr;
   PADDLE_ENFORCE_GE(
       root_id,
       0,
@@ -53,7 +53,7 @@ void CScatterOpCUDAKernel(const Context& dev_ctx,
           "The ring_id (%d) for c_scatter_op must be non-negative.", ring_id));
 
   comm_ctx =
-      static_cast<phi::distributed::NCCLCommContext*>(dev_ctx.GetCommContext());
+      static_cast<distributed::NCCLCommContext*>(dev_ctx.GetCommContext());
   PADDLE_ENFORCE_NE(comm_ctx,
                     nullptr,
                     common::errors::Unavailable(
diff --git a/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu
index 2512503a102e3e..3df577e5d5a2da 100644
--- a/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu
+++ b/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu
@@ -189,10 +189,10 @@ struct CSoftmaxWithCrossEntropyFunctor<GPUContext, T> {
     const DenseTensor* labels = &label_in;
 
     gpuStream_t stream = nullptr;
-    phi::distributed::NCCLCommContext* comm_ctx = nullptr;
+    distributed::NCCLCommContext* comm_ctx = nullptr;
 
-    comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
-        dev_ctx.GetCommContext());
+    comm_ctx =
+        static_cast<distributed::NCCLCommContext*>(dev_ctx.GetCommContext());
     PADDLE_ENFORCE_NE(comm_ctx,
                       nullptr,
                       common::errors::Unavailable(
@@ -223,7 +223,7 @@ struct CSoftmaxWithCrossEntropyFunctor<GPUContext, T> {
     logits_max.Resize({N, 1});
     dev_ctx.template Alloc<T>(&logits_max);
 
-    phi::MaxKernel<T, GPUContext>(dev_ctx, logits_2d, {-1}, true, &logits_max);
+    MaxKernel<T, GPUContext>(dev_ctx, logits_2d, {-1}, true, &logits_max);
 
     comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream);
 
@@ -305,14 +305,14 @@ struct CSoftmaxWithCrossEntropyFunctor<GPUContext, T> {
     comm_ctx->AllReduce(&predicted_logits, predicted_logits, ncclSum, stream);
 
     // step 4, obtain exp(logit)
-    phi::ExpKernel<T, GPUContext>(dev_ctx, softmax_2d, &softmax_2d);
+    ExpKernel<T, GPUContext>(dev_ctx, softmax_2d, &softmax_2d);
 
     // step 5, obtain sum_exp_logits
     DenseTensor sum_exp_logits;
     sum_exp_logits.Resize({N, 1});
     dev_ctx.template Alloc<T>(&sum_exp_logits);
 
-    phi::SumKernel<T, GPUContext>(
+    SumKernel<T, GPUContext>(
         dev_ctx, softmax_2d, {-1}, softmax_2d.dtype(), true, &sum_exp_logits);
 
     comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream);
@@ -358,8 +358,7 @@ struct CSoftmaxWithCrossEntropyFunctor<GPUContext, T> {
       }
     }
 
-    phi::ReciprocalKernel<T, GPUContext>(
-        dev_ctx, sum_exp_logits, &sum_exp_logits);
+    ReciprocalKernel<T, GPUContext>(dev_ctx, sum_exp_logits, &sum_exp_logits);
 
     inputs = std::vector<const DenseTensor*>{&softmax_2d, &sum_exp_logits};
     outputs = std::vector<DenseTensor*>{&softmax_2d};
diff --git a/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu
index 4e34885514ae97..9de70f0a25491f 100644
--- a/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu
+++ b/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu
@@ -159,10 +159,10 @@ struct CSoftmaxWithMultiLabelCrossEntropyFunctor<GPUContext, T> {
     const DenseTensor* smooth_weight = &smooth_weight_in;
 
     gpuStream_t stream = nullptr;
-    phi::distributed::NCCLCommContext* comm_ctx = nullptr;
+    distributed::NCCLCommContext* comm_ctx = nullptr;
 
-    comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
-        dev_ctx.GetCommContext());
+    comm_ctx =
+        static_cast<distributed::NCCLCommContext*>(dev_ctx.GetCommContext());
     PADDLE_ENFORCE_NE(comm_ctx,
                       nullptr,
                       common::errors::Unavailable(
@@ -266,7 +266,7 @@ struct CSoftmaxWithMultiLabelCrossEntropyFunctor<GPUContext, T> {
     sum_exp_logits.Resize({N, 1});
     dev_ctx.template Alloc<T>(&sum_exp_logits);
 
-    phi::SumKernel<T, GPUContext>(
+    SumKernel<T, GPUContext>(
         dev_ctx, softmax_2d, {-1}, softmax_2d.dtype(), true, &sum_exp_logits);
 
     comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream);
diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu
index a300efb15e7306..b635faf6d6b9ef 100644
--- a/paddle/phi/kernels/gpu/cast_kernel.cu
+++ b/paddle/phi/kernels/gpu/cast_kernel.cu
@@ -25,7 +25,7 @@ void CastKernel(const Context& dev_ctx,
                 DataType out_dtype,
                 DenseTensor* out) {
   if (x.dtype() == out_dtype) {
-    if (x.dims() == phi::make_ddim({-1})) {
+    if (x.dims() == make_ddim({-1})) {
       *out = x;
       return;
     }
@@ -53,8 +53,8 @@ INSTANTIATE_CAST_KERNEL(uint32_t, GPUContext)
 INSTANTIATE_CAST_KERNEL(uint64_t, GPUContext)
 INSTANTIATE_CAST_KERNEL(bool, GPUContext)
 INSTANTIATE_CAST_KERNEL(int16_t, GPUContext)
-INSTANTIATE_CAST_KERNEL(phi::float16, GPUContext)
-INSTANTIATE_CAST_KERNEL(phi::bfloat16, GPUContext)
+INSTANTIATE_CAST_KERNEL(float16, GPUContext)
+INSTANTIATE_CAST_KERNEL(bfloat16, GPUContext)
 #endif
 }  // namespace phi
 
diff --git a/paddle/phi/kernels/gpu/check_numerics_kernel.cu b/paddle/phi/kernels/gpu/check_numerics_kernel.cu
index b7fa564d9c3dea..fab45447fb5c95 100644
--- a/paddle/phi/kernels/gpu/check_numerics_kernel.cu
+++ b/paddle/phi/kernels/gpu/check_numerics_kernel.cu
@@ -28,11 +28,9 @@ namespace phi {
 static std::once_flag init_multi_gpu_op_var_map_flag;
 
 // lazy init
-static std::vector<
-    std::unordered_map<std::string, phi::Allocator::AllocationPtr>>&
+static std::vector<std::unordered_map<std::string, Allocator::AllocationPtr>>&
 multi_op_var2gpu_str() {
-  static std::vector<
-      std::unordered_map<std::string, phi::Allocator::AllocationPtr>>
+  static std::vector<std::unordered_map<std::string, Allocator::AllocationPtr>>
       _multi_op_var2gpu_str;
   return _multi_op_var2gpu_str;
 }
@@ -43,14 +41,14 @@ static std::vector<std::mutex>& multi_op_var2gpu_str_mutex() {
 }
 
 static void InitMultiGPUOpVarMap() {
-  int dev_count = phi::backends::gpu::GetGPUDeviceCount();
+  int dev_count = backends::gpu::GetGPUDeviceCount();
   PADDLE_ENFORCE_GT(dev_count,
                     0,
                     common::errors::NotFound(
                         "cuda device must > 0, now dev_count=%d", dev_count));
 
   // https://stackoverflow.com/questions/16465633/how-can-i-use-something-like-stdvectorstdmutex
-  std::vector<std::unordered_map<std::string, phi::Allocator::AllocationPtr>>
+  std::vector<std::unordered_map<std::string, Allocator::AllocationPtr>>
       tmp_multi(dev_count);
   std::vector<std::mutex> tmp_multi_mutex(dev_count);
 
@@ -111,8 +109,8 @@ __device__ void BlockReduceNumNanInfAndWrite(const int64_t num_nan,
 }
 
 template <typename T,
-          std::enable_if_t<std::is_same<T, phi::complex64>::value ||
-                               std::is_same<T, phi::complex128>::value,
+          std::enable_if_t<std::is_same<T, complex64>::value ||
+                               std::is_same<T, complex128>::value,
                            bool> = true>
 __device__ void BlockReduceMaxMinAndWrite(const T max_value,
                                           const T min_value,
@@ -125,8 +123,8 @@ __device__ void BlockReduceMaxMinAndWrite(const T max_value,
 }
 
 template <typename T,
-          std::enable_if_t<!std::is_same<T, phi::complex64>::value &&
-                               !std::is_same<T, phi::complex128>::value,
+          std::enable_if_t<!std::is_same<T, complex64>::value &&
+                               !std::is_same<T, complex128>::value,
                            bool> = true>
 __device__ void BlockReduceMaxMinAndWrite(const T max_value,
                                           const T min_value,
@@ -272,7 +270,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr,
 template <typename T>
 inline std::string GetHintString(const std::string& op_type,
                                  const std::string& var_name,
-                                 const phi::Place& place,
+                                 const Place& place,
                                  int dev_id = -1) {
   std::string op_var =
       funcs::GetCpuHintString<T>(op_type, var_name, place, dev_id);
@@ -301,10 +299,10 @@ static char* GetGpuHintStringPtr(const GPUContext& dev_ctx,
 
     std::lock_guard<std::mutex> guard(op_var2gpu_str_mutex);
     if (op_var2gpu_str.find(op_var) == op_var2gpu_str.end()) {  // insert
-      auto gpu_str_tensor = phi::memory_utils::Alloc(
+      auto gpu_str_tensor = memory_utils::Alloc(
           dev_ctx.GetPlace(),
           op_var.length() + 1,
-          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+          Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
       gpu_str_ptr = reinterpret_cast<char*>(gpu_str_tensor->ptr());
 
       op_var2gpu_str.emplace(op_var, std::move(gpu_str_tensor));
@@ -353,14 +351,14 @@ static void PrintStack(const GPUContext& dev_ctx,
                        const std::string& op_type,
                        const std::string& var_name,
                        int dev_id) {
-  auto cpu_stats = phi::memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 3);
+  auto cpu_stats = memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 3);
   int64_t* cpu_stats_ptr = reinterpret_cast<int64_t*>(cpu_stats->ptr());
-  phi::memory_utils::Copy(CPUPlace(),
-                          cpu_stats_ptr,
-                          stats.place(),
-                          stats.data(),
-                          3 * sizeof(int64_t),
-                          dev_ctx.stream());
+  memory_utils::Copy(CPUPlace(),
+                     cpu_stats_ptr,
+                     stats.place(),
+                     stats.data(),
+                     3 * sizeof(int64_t),
+                     dev_ctx.stream());
   dev_ctx.Wait();
   if (cpu_stats_ptr[0] > 0 || cpu_stats_ptr[1] > 0) {
     const std::string debug_info =
@@ -437,7 +435,7 @@ void CheckNumericsKernel(const Context& dev_ctx,
       std::min(static_cast<size_t>(128),
                static_cast<size_t>((tensor.numel() + threads - 1) / threads));
 
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
 
   int64_t numel_max_min = blocks;
 
@@ -505,12 +503,12 @@ void CheckNumericsKernel(const Context& dev_ctx,
 #ifdef _WIN32
 INSTANTIATE_CHECKNUMBERICS_KERNEL(float, GPUContext)
 INSTANTIATE_CHECKNUMBERICS_KERNEL(double, GPUContext)
-INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::float16, GPUContext)
-INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::bfloat16, GPUContext)
-INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::complex64, GPUContext)
-INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::complex128, GPUContext)
-INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::float8_e4m3fn, GPUContext)
-INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::float8_e5m2, GPUContext)
+INSTANTIATE_CHECKNUMBERICS_KERNEL(float16, GPUContext)
+INSTANTIATE_CHECKNUMBERICS_KERNEL(bfloat16, GPUContext)
+INSTANTIATE_CHECKNUMBERICS_KERNEL(complex64, GPUContext)
+INSTANTIATE_CHECKNUMBERICS_KERNEL(complex128, GPUContext)
+INSTANTIATE_CHECKNUMBERICS_KERNEL(float8_e4m3fn, GPUContext)
+INSTANTIATE_CHECKNUMBERICS_KERNEL(float8_e5m2, GPUContext)
 #endif
 }  // namespace phi
 
diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu
index 95e009785ee999..a74998897135e4 100644
--- a/paddle/phi/kernels/gpu/cholesky_kernel.cu
+++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu
@@ -82,10 +82,10 @@ struct MatrixBandPartFunctor {
     int workspace_size = 0;                                              \
     PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \
         handle, uplo, n, A, lda, &workspace_size));                      \
-    auto workspace = phi::memory_utils::Alloc(                           \
+    auto workspace = memory_utils::Alloc(                                \
         dev_ctx.GetPlace(),                                              \
         workspace_size * sizeof(T),                                      \
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); \
+        Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));           \
     T* workspace_ptr = reinterpret_cast<T*>(workspace->ptr());           \
     PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf(            \
         handle, uplo, n, A, lda, workspace_ptr, workspace_size, info));  \
@@ -119,12 +119,12 @@ FUNC_WITH_TYPES(POTRF_INSTANCE);
                                              data_type,                   \
                                              &workspace_device_size,      \
                                              &workspace_host_size));      \
-    auto workspace_device = phi::memory_utils::Alloc(                     \
+    auto workspace_device = memory_utils::Alloc(                          \
         dev_ctx.GetPlace(),                                               \
         workspace_device_size,                                            \
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));  \
+        Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));            \
     auto workspace_host =                                                 \
-        phi::memory_utils::Alloc(CPUPlace(), workspace_host_size);        \
+        memory_utils::Alloc(CPUPlace(), workspace_host_size);             \
     PADDLE_ENFORCE_GPU_SUCCESS(                                           \
         dynload::cusolverDnXpotrf(handle,                                 \
                                   params,                                 \
@@ -209,10 +209,10 @@ void CholeskyKernel(const Context& dev_ctx,
     for_range(matrix_band_part_functor);
   }
 
-  auto info = phi::memory_utils::Alloc(
-      dev_ctx.GetPlace(),
-      sizeof(int) * batch_count,
-      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  auto info =
+      memory_utils::Alloc(dev_ctx.GetPlace(),
+                          sizeof(int) * batch_count,
+                          Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
   auto* info_ptr = reinterpret_cast<int*>(info->ptr());
 
 #if CUDA_VERSION >= 9020 && !defined(_WIN32)
diff --git a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
index 609378cc3b224f..bc97987f15347b 100644
--- a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
+++ b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu
@@ -37,7 +37,7 @@ void rocsolver_potrs(const solverHandle_t &handle,
                      T *Bdata,
                      int ldb);
 
-using phi::dtype::complex;
+using dtype::complex;
 #define FUNC_WITH_TYPES(m)                        \
   m(float, s, float) m(double, d, double)         \
       m(complex<float>, c, rocblas_float_complex) \
@@ -107,15 +107,15 @@ void cusolver_potrs<double>(const solverHandle_t &handle,
 }
 
 template <>
-void cusolver_potrs<phi::complex64>(const solverHandle_t &handle,
-                                    cublasFillMode_t uplo,
-                                    int M,
-                                    int N,
-                                    phi::complex64 *Adata,
-                                    int lda,
-                                    phi::complex64 *Bdata,
-                                    int ldb,
-                                    int *devInfo) {
+void cusolver_potrs<complex64>(const solverHandle_t &handle,
+                               cublasFillMode_t uplo,
+                               int M,
+                               int N,
+                               complex64 *Adata,
+                               int lda,
+                               complex64 *Bdata,
+                               int ldb,
+                               int *devInfo) {
   PADDLE_ENFORCE_GPU_SUCCESS(
       dynload::cusolverDnCpotrs(handle,
                                 uplo,
@@ -129,15 +129,15 @@ void cusolver_potrs<phi::complex64>(const solverHandle_t &handle,
 }
 
 template <>
-void cusolver_potrs<phi::complex128>(const cusolverDnHandle_t &handle,
-                                     cublasFillMode_t uplo,
-                                     int M,
-                                     int N,
-                                     phi::complex128 *Adata,
-                                     int lda,
-                                     phi::complex128 *Bdata,
-                                     int ldb,
-                                     int *devInfo) {
+void cusolver_potrs<complex128>(const cusolverDnHandle_t &handle,
+                                cublasFillMode_t uplo,
+                                int M,
+                                int N,
+                                complex128 *Adata,
+                                int lda,
+                                complex128 *Bdata,
+                                int ldb,
+                                int *devInfo) {
   PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnZpotrs(
       handle,
       uplo,
diff --git a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu
index fab500ff2991d7..bcf7986c1f8d1c 100644
--- a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu
@@ -350,9 +350,9 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   if (nranks > 1) {
     auto stream = dev_ctx.stream();
-    phi::distributed::NCCLCommContext* comm_ctx = nullptr;
-    comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
-        dev_ctx.GetCommContext());
+    distributed::NCCLCommContext* comm_ctx = nullptr;
+    comm_ctx =
+        static_cast<distributed::NCCLCommContext*>(dev_ctx.GetCommContext());
     PADDLE_ENFORCE_NE(comm_ctx,
                       nullptr,
                       common::errors::Unavailable(
@@ -361,7 +361,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
 
     comm_ctx->AllReduce(
         &num_classes_per_device, num_classes_per_device, ncclSum, stream);
-    phi::backends::gpu::GpuStreamSync(stream);
+    backends::gpu::GpuStreamSync(stream);
   }
 #endif
 
@@ -446,7 +446,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
                      (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) +
                  1) *
                 vec_size;
-  // auto gen_cuda = phi::DefaultCUDAGenerator(device_id);
+  // auto gen_cuda = DefaultCUDAGenerator(device_id);
   auto gen_cuda = dev_ctx.GetGenerator();
   if (!fix_seed) {
     auto seed_offset = gen_cuda->IncrementOffset(offset);
diff --git a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu
index 47783fa3c5a093..a5ef2390d0d911 100644
--- a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu
+++ b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu
@@ -111,18 +111,18 @@ void GPUCollectFpnProposalsOpKernel(
       }
     }
 
-    phi::memory_utils::Copy(place,
-                            concat_rois_data + roi_offset,
-                            place,
-                            roi_in->data<T>(),
-                            roi_in->numel() * sizeof(T),
-                            dev_ctx.stream());
-    phi::memory_utils::Copy(place,
-                            concat_scores_data + score_offset,
-                            place,
-                            score_in->data<T>(),
-                            score_in->numel() * sizeof(T),
-                            dev_ctx.stream());
+    memory_utils::Copy(place,
+                       concat_rois_data + roi_offset,
+                       place,
+                       roi_in->data<T>(),
+                       roi_in->numel() * sizeof(T),
+                       dev_ctx.stream());
+    memory_utils::Copy(place,
+                       concat_scores_data + score_offset,
+                       place,
+                       score_in->data<T>(),
+                       score_in->numel() * sizeof(T),
+                       dev_ctx.stream());
     roi_offset += roi_in->numel();
     score_offset += score_in->numel();
   }
@@ -161,7 +161,7 @@ void GPUCollectFpnProposalsOpKernel(
                                                     sizeof(T) * 8,
                                                     dev_ctx.stream());
   // Allocate temporary storage
-  auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
+  auto d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes);
 
   // Run sorting operation
   // sort score to get corresponding index
@@ -208,7 +208,7 @@ void GPUCollectFpnProposalsOpKernel(
                                             sizeof(int) * 8,
                                             dev_ctx.stream());
   // Allocate temporary storage
-  d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
+  d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes);
 
   // Run sorting operation
   // sort batch_id to get corresponding index
@@ -246,12 +246,12 @@ void GPUCollectFpnProposalsOpKernel(
           "address into the graph; on replay the vector is re-created at a "
           "different address, causing a dangling-pointer write."));
   std::vector<int> length_lod_cpu(lod_size);
-  phi::memory_utils::Copy(CPUPlace(),
-                          length_lod_cpu.data(),
-                          place,
-                          length_lod_data,
-                          sizeof(int) * lod_size,
-                          dev_ctx.stream());
+  memory_utils::Copy(CPUPlace(),
+                     length_lod_cpu.data(),
+                     place,
+                     length_lod_data,
+                     sizeof(int) * lod_size,
+                     dev_ctx.stream());
   dev_ctx.Wait();
 
   std::vector<size_t> offset(1, 0);
@@ -263,12 +263,12 @@ void GPUCollectFpnProposalsOpKernel(
     auto* rois_num = rois_num_out;
     rois_num->Resize({lod_size});
     int* rois_num_data = dev_ctx.template Alloc<int>(rois_num);
-    phi::memory_utils::Copy(place,
-                            rois_num_data,
-                            place,
-                            length_lod_data,
-                            lod_size * sizeof(int),
-                            dev_ctx.stream());
+    memory_utils::Copy(place,
+                       rois_num_data,
+                       place,
+                       length_lod_data,
+                       lod_size * sizeof(int),
+                       dev_ctx.stream());
   }
 
   LegacyLoD lod;
diff --git a/paddle/phi/kernels/gpu/comm_init_all_kernel.cu b/paddle/phi/kernels/gpu/comm_init_all_kernel.cu
index 9f759fdc0f4ddf..485bead29d228c 100644
--- a/paddle/phi/kernels/gpu/comm_init_all_kernel.cu
+++ b/paddle/phi/kernels/gpu/comm_init_all_kernel.cu
@@ -29,7 +29,7 @@ void CommInitAllKernel(const Context& dev_ctx,
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   std::vector<int> devices = devices_input;
   if (devices.empty()) {
-    devices = phi::backends::gpu::GetSelectedDevices();
+    devices = backends::gpu::GetSelectedDevices();
   }
 
   paddle::platform::NCCLCommContext::Instance().CreateAllNCCLComms(devices,
diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu
index 9618a4683c424a..d921d9badc021b 100644
--- a/paddle/phi/kernels/gpu/concat_kernel.cu
+++ b/paddle/phi/kernels/gpu/concat_kernel.cu
@@ -72,8 +72,8 @@ void ConcatKernel(const Context& dev_ctx,
     if (lod_size) {
       auto* out_lod = out->mutable_lod();
       for (size_t i = 1; i < x.size(); ++i) {
-        auto in_lod = phi::ConvertToLengthBasedLegacyLoD(x[i]->lod());
-        phi::AppendLegacyLoD(out_lod, in_lod);
+        auto in_lod = ConvertToLengthBasedLegacyLoD(x[i]->lod());
+        AppendLegacyLoD(out_lod, in_lod);
       }
     }
   }
diff --git a/paddle/phi/kernels/gpu/contiguous_kernel.cu b/paddle/phi/kernels/gpu/contiguous_kernel.cu
index 90a734eddf76ac..3feb65323286f5 100644
--- a/paddle/phi/kernels/gpu/contiguous_kernel.cu
+++ b/paddle/phi/kernels/gpu/contiguous_kernel.cu
@@ -167,8 +167,8 @@ __global__ void ContiguousCaseOneFunc(
 template <typename T, size_t N>
 __global__ void ContiguousDefaultFunc(
     const T* input_data,
-    phi::Array<int64_t, DDim::kMaxRank + 1> input_stride,
-    phi::Array<int64_t, DDim::kMaxRank + 1> dims,
+    Array<int64_t, DDim::kMaxRank + 1> input_stride,
+    Array<int64_t, DDim::kMaxRank + 1> dims,
     const int64_t numel,
     T* out_data) {
   CUDA_KERNEL_LOOP_TYPE(i, numel, int64_t) {
@@ -233,8 +233,8 @@ template <typename T, typename Context>
 bool LaunchContiguousCaseZeroKernel(
     const Context& dev_ctx,
     const T* input_data,
-    const phi::Array<int64_t, DDim::kMaxRank + 1>& input_stride,
-    const phi::Array<int64_t, DDim::kMaxRank + 1>& input_dims,
+    const Array<int64_t, DDim::kMaxRank + 1>& input_stride,
+    const Array<int64_t, DDim::kMaxRank + 1>& input_dims,
     int rank,
     T* output_data) {
   if (rank > 6) {
@@ -305,13 +305,13 @@ template <typename T, typename Context>
 bool LaunchContiguousCaseOneKernel(
     const Context& dev_ctx,
     const T* input_data,
-    const phi::Array<int64_t, DDim::kMaxRank + 1>& input_stride,
-    const phi::Array<int64_t, DDim::kMaxRank + 1>& input_dims,
+    const Array<int64_t, DDim::kMaxRank + 1>& input_stride,
+    const Array<int64_t, DDim::kMaxRank + 1>& input_dims,
     int rank,
     int64_t numel,
     T* output_data) {
   Dim3 grid(1, 1, 1), block(1, 1, 1);
-  phi::Array<int64_t, 6> cur_input_dims;
+  Array<int64_t, 6> cur_input_dims;
   block.x = 512;
 
   if (rank >= 1) {
@@ -445,8 +445,8 @@ template <typename T, typename Context>
 void LaunchContiguousDefaultKernel(
     const Context& dev_ctx,
     const T* input_data,
-    const phi::Array<int64_t, DDim::kMaxRank + 1>& input_stride,
-    const phi::Array<int64_t, DDim::kMaxRank + 1>& input_dims,
+    const Array<int64_t, DDim::kMaxRank + 1>& input_stride,
+    const Array<int64_t, DDim::kMaxRank + 1>& input_dims,
     int rank,
     int64_t numel,
     T* output_data) {
diff --git a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
index b26fa327e01cc8..909fa7ef3a2a1b 100644
--- a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu
@@ -111,7 +111,7 @@ void DepthwiseConv2dTransposeGradKernel(const Context& dev_ctx,
       &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize);
 
   if (dx) {
-    phi::math::DepthwiseConvFunctor<Context, T> depthwiseConv;
+    math::DepthwiseConvFunctor<Context, T> depthwiseConv;
     depthwiseConv(dev_ctx,
                   dout,
                   filter_,
@@ -128,8 +128,7 @@ void DepthwiseConv2dTransposeGradKernel(const Context& dev_ctx,
     dev_ctx.template Alloc<T>(dfilter);
     set_zero(dev_ctx, dfilter, static_cast<T>(0));
 
-    phi::math::DepthwiseConvFilterGradFunctor<Context, T>
-        depthwiseConvFilterGrad;
+    math::DepthwiseConvFilterGradFunctor<Context, T> depthwiseConvFilterGrad;
     depthwiseConvFilterGrad(
         dev_ctx,
         dout,
diff --git a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
index bdca3cc64f667d..cb8f5d6eada59c 100644
--- a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
+++ b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu
@@ -85,7 +85,7 @@ void DepthwiseConv2dTransposeKernel(const Context& dev_ctx,
   funcs::SetConstant<Context, T> set_zero;
   set_zero(dev_ctx, out, static_cast<T>(0));
 
-  phi::math::DepthwiseConvInputGradFunctor<Context, T> depthwiseConvInputGrad;
+  math::DepthwiseConvInputGradFunctor<Context, T> depthwiseConvInputGrad;
   depthwiseConvInputGrad(
       dev_ctx,
       *out,
diff --git a/paddle/phi/kernels/gpu/cross_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_grad_kernel.cu
index 10f88d90317a3f..21f55aa59f702f 100644
--- a/paddle/phi/kernels/gpu/cross_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_grad_kernel.cu
@@ -40,7 +40,7 @@ __global__ void CrossGrad(const T* x,
     int64_t pos1 = offset + 1 * stride;
     int64_t pos2 = offset + 2 * stride;
 
-    using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+    using MPType = typename dtype::MPTypeTrait<T>::Type;
 
     MPType x_pos0_mp = static_cast<MPType>(x[pos0]);
     MPType x_pos1_mp = static_cast<MPType>(x[pos1]);
diff --git a/paddle/phi/kernels/gpu/cross_kernel.cu b/paddle/phi/kernels/gpu/cross_kernel.cu
index f93e92b343caff..3d491b5445d2bd 100644
--- a/paddle/phi/kernels/gpu/cross_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_kernel.cu
@@ -37,7 +37,7 @@ __global__ void Cross(const T* x,
     int64_t pos1 = offset + 1 * stride;
     int64_t pos2 = offset + 2 * stride;
 
-    using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+    using MPType = typename dtype::MPTypeTrait<T>::Type;
 
     MPType x_pos0_mp = static_cast<MPType>(x[pos0]);
     MPType x_pos1_mp = static_cast<MPType>(x[pos1]);
diff --git a/paddle/phi/kernels/gpu/ctc_align_kernel.cu b/paddle/phi/kernels/gpu/ctc_align_kernel.cu
index ca256a9f91b7a0..841965463ed124 100644
--- a/paddle/phi/kernels/gpu/ctc_align_kernel.cu
+++ b/paddle/phi/kernels/gpu/ctc_align_kernel.cu
@@ -125,7 +125,7 @@ void CTCAlignOpCUDAKernel(const Context& dev_ctx,
     output->Resize({num_tokens, 1});
     T* output_data = dev_ctx.template Alloc<T>(output);
 
-    phi::MixVector<size_t> mixv_input_lod(&input_lod[level]);
+    MixVector<size_t> mixv_input_lod(&input_lod[level]);
     MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
         num_tokens,
         tokens,
diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_cache.h b/paddle/phi/kernels/gpu/cudnn_lstm_cache.h
index aeb3af5350441b..e3439ed4264127 100644
--- a/paddle/phi/kernels/gpu/cudnn_lstm_cache.h
+++ b/paddle/phi/kernels/gpu/cudnn_lstm_cache.h
@@ -49,13 +49,13 @@ class ScopedRNNBase {
 
   template <typename T>
   void Create(const cudnnHandle_t& handle,
-              const phi::Place& place,
+              const Place& place,
               const std::vector<int>& sequence_length,
               size_t* workspace_size,
               size_t* reserve_size,
               DenseTensor* dropout_state) {
     int numDirections = is_bidirec_ ? 2 : 1;
-    cudnnDataType_t cudnn_type = phi::backends::gpu::CudnnDataType<T>::type;
+    cudnnDataType_t cudnn_type = backends::gpu::CudnnDataType<T>::type;
 
     // ------------------- cudnn x, y descriptors ---------------------
     std::vector<int> dims_x = {batch_size_, input_size_, 1};
@@ -238,19 +238,19 @@ class ScopedRNNBase {
   std::vector<cudnnTensorDescriptor_t> x_descs_;
   std::vector<cudnnTensorDescriptor_t> y_descs_;
 
-  phi::backends::gpu::ScopedTensorDescriptor x_desc_;
-  phi::backends::gpu::ScopedTensorDescriptor y_desc_;
+  backends::gpu::ScopedTensorDescriptor x_desc_;
+  backends::gpu::ScopedTensorDescriptor y_desc_;
 #if CUDNN_VERSION >= 7201
-  phi::backends::gpu::ScopedRNNTensorDescriptor x_seq_desc_;
-  phi::backends::gpu::ScopedRNNTensorDescriptor y_seq_desc_;
+  backends::gpu::ScopedRNNTensorDescriptor x_seq_desc_;
+  backends::gpu::ScopedRNNTensorDescriptor y_seq_desc_;
 #endif
-  phi::backends::gpu::ScopedTensorDescriptor init_h_desc_;
-  phi::backends::gpu::ScopedTensorDescriptor init_c_desc_;
-  phi::backends::gpu::ScopedTensorDescriptor last_h_desc_;
-  phi::backends::gpu::ScopedTensorDescriptor last_c_desc_;
-  phi::backends::gpu::ScopedDropoutDescriptor dropout_desc_;
-  phi::backends::gpu::ScopedFilterDescriptor weight_desc_;
-  phi::backends::gpu::ScopedRNNDescriptor rnn_desc_;
+  backends::gpu::ScopedTensorDescriptor init_h_desc_;
+  backends::gpu::ScopedTensorDescriptor init_c_desc_;
+  backends::gpu::ScopedTensorDescriptor last_h_desc_;
+  backends::gpu::ScopedTensorDescriptor last_c_desc_;
+  backends::gpu::ScopedDropoutDescriptor dropout_desc_;
+  backends::gpu::ScopedFilterDescriptor weight_desc_;
+  backends::gpu::ScopedRNNDescriptor rnn_desc_;
 };
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu
index 845f3d17784479..877ffad5c46620 100644
--- a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu
@@ -112,7 +112,7 @@ void CudnnLSTMGradKernel(
   bool has_seq_length = running_seq_length != nullptr;
   std::vector<int> SequenceLength;
   if (has_seq_length) {
-    SequenceLength = phi::GetVectorFromTensor<int>(running_seq_length);
+    SequenceLength = GetVectorFromTensor<int>(running_seq_length);
   }
 
   int seq_length = input_dims[0];
diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu
index bb3b9fc6dd5086..76dd3ad716dc7b 100644
--- a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu
+++ b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu
@@ -186,7 +186,7 @@ void CudnnLSTMKernel(
     if (seed == 0) {
       // If not specify seed, use global Generator to generate seed.
       int device_id = dev_ctx.GetPlace().GetDeviceId();
-      auto gen_cuda = phi::DefaultCUDAGenerator(device_id);
+      auto gen_cuda = DefaultCUDAGenerator(device_id);
       seed = static_cast<int>(gen_cuda->Random64());
     }
   }
@@ -195,7 +195,7 @@ void CudnnLSTMKernel(
   bool has_seq_length = running_sequence_length != nullptr;
   std::vector<int> SequenceLength;
   if (has_seq_length) {
-    SequenceLength = phi::GetVectorFromTensor<int>(running_sequence_length);
+    SequenceLength = GetVectorFromTensor<int>(running_sequence_length);
   }
 
   auto handle = dev_ctx.cudnn_handle();
diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_utils.h b/paddle/phi/kernels/gpu/cudnn_lstm_utils.h
index 182bc21af18bde..32a4738831bc1d 100644
--- a/paddle/phi/kernels/gpu/cudnn_lstm_utils.h
+++ b/paddle/phi/kernels/gpu/cudnn_lstm_utils.h
@@ -53,7 +53,7 @@ inline int size_sum(const std::vector<const DenseTensor *> &weight_list) {
 
 template <typename T>
 inline void weight_to_tensor(
-    const phi::Place &place,
+    const Place &place,
     gpuStream_t stream,
     const std::vector<const DenseTensor *> &weight_list,
     DenseTensor *weight) {
diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu
index 0a7d31e5c22860..a120262e3f20c1 100644
--- a/paddle/phi/kernels/gpu/cum_kernel.cu
+++ b/paddle/phi/kernels/gpu/cum_kernel.cu
@@ -203,7 +203,7 @@ __global__ void BlockScanKernel(T* d_out,
                                 int64_t scan_size,
                                 bool exclusive,
                                 Op op) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   using CallbackOp = BlockPrefixCallbackOp<MT, Op>;
 
   // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types
@@ -263,13 +263,13 @@ void ThrustCumsumKernel(const Context& dev_ctx,
                         int64_t size,
                         bool reverse,
                         bool exclusive) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
 
 #ifdef __HIPCC__
   const auto& policy = thrust::hip::par.on(dev_ctx.stream());
 #else
-  phi::memory_utils::ThrustAllocator<cudaStream_t> allocator(dev_ctx.GetPlace(),
-                                                             dev_ctx.stream());
+  memory_utils::ThrustAllocator<cudaStream_t> allocator(dev_ctx.GetPlace(),
+                                                        dev_ctx.stream());
   const auto& policy = thrust::cuda::par(allocator).on(dev_ctx.stream());
 #endif
 
@@ -459,11 +459,10 @@ void CumsumKernel(const Context& dev_ctx,
                   bool exclusive,
                   bool reverse,
                   DenseTensor* out) {
-  using Op =
-      typename std::conditional<std::is_same<T, phi::complex64>::value ||
-                                    std::is_same<T, phi::complex128>::value,
-                                ComplexSum,
-                                cub::Sum>::type;
+  using Op = typename std::conditional<std::is_same<T, complex64>::value ||
+                                           std::is_same<T, complex128>::value,
+                                       ComplexSum,
+                                       cub::Sum>::type;
   if (FLAGS_use_accuracy_compatible_kernel && !exclusive) {
     if (out && out->numel() == 0) {
       dev_ctx.template Alloc<T>(out);
diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
index 10a11e1f1e16ce..a33d7e34a66538 100644
--- a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu
@@ -169,15 +169,15 @@ void ReversedCumsum(const Context &dev_ctx,
   flipped_input.Resize(input.dims());
   dev_ctx.template Alloc<T>(&flipped_input);
   std::vector<int> axis = {dim};
-  phi::FlipKernel<T, Context>(dev_ctx, input, axis, &flipped_input);
+  FlipKernel<T, Context>(dev_ctx, input, axis, &flipped_input);
 
   DenseTensor cumsum_out;
   cumsum_out.Resize(input.dims());
   dev_ctx.template Alloc<T>(&cumsum_out);
-  phi::CumsumKernel<T, Context>(
+  CumsumKernel<T, Context>(
       dev_ctx, flipped_input, dim, false, false, false, &cumsum_out);
 
-  phi::FlipKernel<T, Context>(dev_ctx, cumsum_out, axis, output);
+  FlipKernel<T, Context>(dev_ctx, cumsum_out, axis, output);
 }
 
 template <typename T, typename Context>
@@ -195,7 +195,7 @@ bool CumprodGradCompatible(const Context &dev_ctx,
   bool is_trivial = (x.numel() <= 1) || (x_dims[wrap_dim] == 1);
   if (is_trivial) {
     dev_ctx.template Alloc<T>(dx);
-    phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
+    Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
 
     return true;
   }
@@ -203,42 +203,40 @@ bool CumprodGradCompatible(const Context &dev_ctx,
   DenseTensor x_conj_tensor;
   DenseTensor out_conj_tensor;
 
-  if (phi::IsComplexType(x.dtype())) {
+  if (IsComplexType(x.dtype())) {
     x_conj_tensor.Resize(x.dims());
     out_conj_tensor.Resize(out.dims());
     dev_ctx.template Alloc<T>(&x_conj_tensor);
     dev_ctx.template Alloc<T>(&out_conj_tensor);
 
-    phi::ConjKernel<T, Context>(dev_ctx, x, &x_conj_tensor);
-    phi::ConjKernel<T, Context>(dev_ctx, out, &out_conj_tensor);
+    ConjKernel<T, Context>(dev_ctx, x, &x_conj_tensor);
+    ConjKernel<T, Context>(dev_ctx, out, &out_conj_tensor);
   }
 
-  const DenseTensor &x_ref = phi::IsComplexType(x.dtype()) ? x_conj_tensor : x;
-  const DenseTensor &out_ref =
-      phi::IsComplexType(x.dtype()) ? out_conj_tensor : out;
+  const DenseTensor &x_ref = IsComplexType(x.dtype()) ? x_conj_tensor : x;
+  const DenseTensor &out_ref = IsComplexType(x.dtype()) ? out_conj_tensor : out;
 
   DenseTensor zero_val;
   zero_val.Resize({1});
   dev_ctx.template Alloc<T>(&zero_val);
-  phi::FullKernel<T, Context>(
-      dev_ctx, {1}, static_cast<T>(0), x.dtype(), &zero_val);
+  FullKernel<T, Context>(dev_ctx, {1}, static_cast<T>(0), x.dtype(), &zero_val);
 
   DenseTensor is_zero_mask;
   is_zero_mask.Resize(x.dims());
   dev_ctx.template Alloc<bool>(&is_zero_mask);
-  phi::EqualKernel<T, Context>(dev_ctx, x, zero_val, &is_zero_mask);
+  EqualKernel<T, Context>(dev_ctx, x, zero_val, &is_zero_mask);
 
   DenseTensor any_zero;
   any_zero.Resize({1});
   dev_ctx.template Alloc<bool>(&any_zero);
 
-  phi::AnyKernel<bool, Context>(
+  AnyKernel<bool, Context>(
       dev_ctx, is_zero_mask, std::vector<int64_t>(), false, &any_zero);
 
   bool has_zero = false;
 #ifdef PADDLE_WITH_CUDA
   DenseTensor any_zero_cpu;
-  phi::Copy(dev_ctx, any_zero, CPUPlace(), true, &any_zero_cpu);
+  Copy(dev_ctx, any_zero, CPUPlace(), true, &any_zero_cpu);
   has_zero = *any_zero_cpu.data<bool>();
 #else
   has_zero = *any_zero.data<bool>();
@@ -253,7 +251,7 @@ bool CumprodGradCompatible(const Context &dev_ctx,
   DenseTensor w;
   w.Resize(out_ref.dims());
   dev_ctx.template Alloc<T>(&w);
-  phi::MultiplyKernel<T, Context>(dev_ctx, out_ref, dout, &w);
+  MultiplyKernel<T, Context>(dev_ctx, out_ref, dout, &w);
 
   DenseTensor w_flipped, w_cum, rc_w;
   w_flipped.Resize(w.dims());
@@ -265,14 +263,14 @@ bool CumprodGradCompatible(const Context &dev_ctx,
   dev_ctx.template Alloc<T>(&rc_w);
 
   std::vector<int> axis = {dim};
-  phi::FlipKernel<T, Context>(dev_ctx, w, axis, &w_flipped);
+  FlipKernel<T, Context>(dev_ctx, w, axis, &w_flipped);
 
-  phi::CumsumKernel<T, Context>(
+  CumsumKernel<T, Context>(
       dev_ctx, w_flipped, dim, false, false, false, &w_cum);
 
-  phi::FlipKernel<T, Context>(dev_ctx, w_cum, axis, &rc_w);
+  FlipKernel<T, Context>(dev_ctx, w_cum, axis, &rc_w);
 
-  phi::DivideKernel<T, Context>(dev_ctx, rc_w, x_ref, dx);
+  DivideKernel<T, Context>(dev_ctx, rc_w, x_ref, dx);
 
   return true;
 }
@@ -324,7 +322,7 @@ void CumprodGradKernel(const Context &dev_ctx,
   const T *y_data_deal;
   Allocator::AllocationPtr x_conj;
   Allocator::AllocationPtr y_conj;
-  if (phi::IsComplexType(x.dtype())) {
+  if (IsComplexType(x.dtype())) {
     x_conj = const_cast<Allocator &>(dev_ctx.GetAllocator())
                  .Allocate(numel * sizeof(T));
     auto *x_data_conj = reinterpret_cast<T *>(x_conj->ptr());
diff --git a/paddle/phi/kernels/gpu/cvm_grad_kernel.cu b/paddle/phi/kernels/gpu/cvm_grad_kernel.cu
index 957349834e461c..1d14e131203f5a 100644
--- a/paddle/phi/kernels/gpu/cvm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/cvm_grad_kernel.cu
@@ -22,8 +22,6 @@
 
 namespace phi {
 
-using phi::PADDLE_CUDA_NUM_THREADS;
-
 template <typename T>
 __global__ void CvmGradComputeKernel(const bool use_cvm,
                                      const int64_t item_width,
@@ -105,7 +103,7 @@ void CVMGradCUDAKernel(const Context& dev_ctx,
         lod[lod.size() - 1],
         common::errors::PreconditionNotMet(
             "Output(X@GRAD)'s dim[0] must be equal to last element of lod"));
-    phi::MixVector<size_t> mixv_lod(&lod);
+    MixVector<size_t> mixv_lod(&lod);
     CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) /
                                PADDLE_CUDA_NUM_THREADS,
                            PADDLE_CUDA_NUM_THREADS,
diff --git a/paddle/phi/kernels/gpu/cvm_kernel.cu b/paddle/phi/kernels/gpu/cvm_kernel.cu
index 597ecfb92b818b..204ed5cb2dd81b 100644
--- a/paddle/phi/kernels/gpu/cvm_kernel.cu
+++ b/paddle/phi/kernels/gpu/cvm_kernel.cu
@@ -21,8 +21,6 @@
 
 namespace phi {
 
-using phi::PADDLE_CUDA_NUM_THREADS;
-
 template <typename T>
 __global__ void CvmComputeKernel(const bool use_cvm,
                                  const int64_t item_width,
diff --git a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu
index d8d3577491e1a0..1594b4030e2a07 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu
@@ -54,27 +54,21 @@ __device__ __forceinline__ T WARP_SHFL_DOWN(T value,
 }
 
 template <>
-__device__ __forceinline__ phi::dtype::float16
-WARP_SHFL_DOWN<phi::dtype::float16>(phi::dtype::float16 value,
-                                    unsigned int delta,
-                                    int width,
-                                    unsigned int mask) {
+__device__ __forceinline__ dtype::float16 WARP_SHFL_DOWN<dtype::float16>(
+    dtype::float16 value, unsigned int delta, int width, unsigned int mask) {
   uint16_t val_as_ushort = *reinterpret_cast<uint16_t*>(&value);
   uint16_t shuffled =
       WARP_SHFL_DOWN<uint16_t>(val_as_ushort, delta, width, mask);
-  return *reinterpret_cast<phi::dtype::float16*>(&shuffled);
+  return *reinterpret_cast<dtype::float16*>(&shuffled);
 }
 
 template <>
-__device__ __forceinline__ phi::dtype::bfloat16
-WARP_SHFL_DOWN<phi::dtype::bfloat16>(phi::dtype::bfloat16 value,
-                                     unsigned int delta,
-                                     int width,
-                                     unsigned int mask) {
+__device__ __forceinline__ dtype::bfloat16 WARP_SHFL_DOWN<dtype::bfloat16>(
+    dtype::bfloat16 value, unsigned int delta, int width, unsigned int mask) {
   uint16_t val_as_ushort = *reinterpret_cast<uint16_t*>(&value);
   uint16_t shuffled =
       WARP_SHFL_DOWN<uint16_t>(val_as_ushort, delta, width, mask);
-  return *reinterpret_cast<phi::dtype::bfloat16*>(&shuffled);
+  return *reinterpret_cast<dtype::bfloat16*>(&shuffled);
 }
 
 template <typename T>
@@ -124,7 +118,7 @@ __global__ void DWConv2dBwdInputKernel(const T* __restrict__ grad_output,
                                        const int padHeight,
                                        const int dilationWidth,
                                        const int dilationHeight) {
-  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using AccT = typename dtype::MPTypeTrait<T>::Type;
   const int KW_LIMIT = (kSize != 0) ? kSize : kernelWidth;
   const int KH_LIMIT = (kSize != 0) ? kSize : kernelHeight;
   const int strideW = (stride != 0) ? stride : strideWidth;
@@ -194,7 +188,7 @@ __global__ void DWConv2dBwdWeightKernel(const T* __restrict__ grad_output,
                                         const int padHeight,
                                         const int dilationWidth,
                                         const int dilationHeight) {
-  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using AccT = typename dtype::MPTypeTrait<T>::Type;
   const int channelStride = kernelWidth * kernelHeight;
 
   int bidx = blockIdx.x;
@@ -316,7 +310,7 @@ void LaunchDepthwiseConv2dBackwardCompatible(const Context& dev_ctx,
 
   // Launch Filter Gradient Kernel (grad_weight)
   if (filter_grad_nchw_ptr) {
-    phi::funcs::SetConstant<Context, T> set_zero;
+    funcs::SetConstant<Context, T> set_zero;
     set_zero(dev_ctx, filter_grad_nchw_ptr, static_cast<T>(0));
 
     int blocks = outputChannels * kH * kW;
@@ -324,7 +318,7 @@ void LaunchDepthwiseConv2dBackwardCompatible(const Context& dev_ctx,
     dim3 block(GetGradParamsNumThreads(batchSize));
 
     size_t smem = (block.x / CUDA_WARP_SIZE) *
-                  sizeof(typename phi::dtype::MPTypeTrait<T>::Type);
+                  sizeof(typename dtype::MPTypeTrait<T>::Type);
 
     DWConv2dBwdWeightKernel<T, int>
         <<<grid, block, smem, stream>>>(out_grad_nchw.data<T>(),
@@ -420,12 +414,12 @@ void LaunchDepthwiseConv2dBackwardCompatible(const Context& dev_ctx,
     // Reduce over N(0), H(2), W(3) to get [C]
     std::vector<int64_t> reduce_dims = {0, 2, 3};
 
-    phi::SumKernel<T, Context>(dev_ctx,
-                               out_grad_nchw,
-                               phi::IntArray(reduce_dims),
-                               CppTypeToDataType<T>::Type(),
-                               false,
-                               bias_grad);
+    SumKernel<T, Context>(dev_ctx,
+                          out_grad_nchw,
+                          IntArray(reduce_dims),
+                          CppTypeToDataType<T>::Type(),
+                          false,
+                          bias_grad);
   }
 
   if (input_grad && channel_last) {
diff --git a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu
index 557717c821b8be..a9d022e2ab01ce 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu
@@ -52,7 +52,7 @@ __global__ void DWConv2dFwdKernel(const T* __restrict__ input,
                                   const int padHeight,
                                   const int dilationWidth,
                                   const int dilationHeight) {
-  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using AccT = typename dtype::MPTypeTrait<T>::Type;
   const int KW_LIMIT = (kSize != 0) ? kSize : kernelWidth;
   const int KH_LIMIT = (kSize != 0) ? kSize : kernelHeight;
 
@@ -119,7 +119,7 @@ __global__ void DWConv2dFwdKernelGeneric(const T* __restrict__ input,
                                          const int padHeight,
                                          const int dilationWidth,
                                          const int dilationHeight) {
-  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using AccT = typename dtype::MPTypeTrait<T>::Type;
 
   for (IndexT linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
        linearIndex < totalElements;
diff --git a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu
index 58515d8a415a03..2c7913a17f5083 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu
@@ -353,7 +353,7 @@ void LaunchDepthwiseConv3dBackwardCompatible(const Context& dev_ctx,
   }
 
   auto stream = dev_ctx.stream();
-  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using AccT = typename dtype::MPTypeTrait<T>::Type;
 
   const T* input_ptr = input_ncdhw.data<T>();
   const T* grad_output_ptr = out_grad_ncdhw.data<T>();
@@ -600,12 +600,12 @@ void LaunchDepthwiseConv3dBackwardCompatible(const Context& dev_ctx,
     dev_ctx.template Alloc<T>(bias_grad);
     // Reduce N(0), D(2), H(3), W(4) -> C(1) for NCDHW
     std::vector<int64_t> reduce_dims = {0, 2, 3, 4};
-    phi::SumKernel<T, Context>(dev_ctx,
-                               out_grad_ncdhw,
-                               phi::IntArray(reduce_dims),
-                               CppTypeToDataType<T>::Type(),
-                               false,
-                               bias_grad);
+    SumKernel<T, Context>(dev_ctx,
+                          out_grad_ncdhw,
+                          IntArray(reduce_dims),
+                          CppTypeToDataType<T>::Type(),
+                          false,
+                          bias_grad);
   }
 
   if (input_grad && channel_last) {
diff --git a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu
index f3a0e1bf85f491..ef16c89fbc6888 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu
@@ -190,7 +190,7 @@ void LaunchDepthwiseConv3dCompatible(const Context& dev_ctx,
   int grid = std::min((num_outputs - 1) / block + 1, (int64_t)65536);
   auto stream = dev_ctx.stream();
 
-  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using AccT = typename dtype::MPTypeTrait<T>::Type;
 
   const T* input_ptr = input_ncdhw.data<T>();
   T* output_ptr = out_ncdhw.data<T>();
diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
index 058f2d34cdf577..4be5bea2db8c20 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu
@@ -76,18 +76,18 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
       dev_ctx.template Alloc<T>(filter_grad);
       set_zero(dev_ctx, filter_grad, static_cast<T>(0));
     }
-    phi::DepthwiseConvCudnnGradKernel<T>(dev_ctx,
-                                         input,
-                                         filter,
-                                         *output_grad,
-                                         strides_t,
-                                         paddings_t,
-                                         padding_algorithm,
-                                         groups,
-                                         dilations_t,
-                                         data_format,
-                                         input_grad,
-                                         filter_grad);
+    DepthwiseConvCudnnGradKernel<T>(dev_ctx,
+                                    input,
+                                    filter,
+                                    *output_grad,
+                                    strides_t,
+                                    paddings_t,
+                                    padding_algorithm,
+                                    groups,
+                                    dilations_t,
+                                    data_format,
+                                    input_grad,
+                                    filter_grad);
     return;
   }
 #endif
@@ -121,7 +121,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
     set_zero(dev_ctx, input_grad, static_cast<T>(0));
 
     if (fuse_relu) {
-      phi::math::DepthwiseConvInputGradFunctor<Context, T, true>
+      math::DepthwiseConvInputGradFunctor<Context, T, true>
           depthwiseConvInputGrad;
       depthwiseConvInputGrad(dev_ctx,
                              input,
@@ -133,7 +133,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
                              input_grad,
                              data_layout);
     } else {
-      phi::math::DepthwiseConvInputGradFunctor<Context, T, false>
+      math::DepthwiseConvInputGradFunctor<Context, T, false>
           depthwiseConvInputGrad;
       depthwiseConvInputGrad(dev_ctx,
                              input,
@@ -151,7 +151,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
     dev_ctx.template Alloc<T>(filter_grad);
     set_zero(dev_ctx, filter_grad, static_cast<T>(0));
     if (fuse_relu) {
-      phi::math::DepthwiseConvFilterGradFunctor<Context, T, true>
+      math::DepthwiseConvFilterGradFunctor<Context, T, true>
           depthwiseConvFilterGrad;
       depthwiseConvFilterGrad(dev_ctx,
                               input,
@@ -162,7 +162,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx,
                               filter_grad,
                               data_layout);
     } else {
-      phi::math::DepthwiseConvFilterGradFunctor<Context, T, false>
+      math::DepthwiseConvFilterGradFunctor<Context, T, false>
           depthwiseConvFilterGrad;
       depthwiseConvFilterGrad(dev_ctx,
                               input,
diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
index 27529b8cd707bc..6c1bfa0596a4c8 100644
--- a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu
@@ -80,16 +80,16 @@ void DepthwiseConvKernel(const Context& dev_ctx,
     !defined(PADDLE_WITH_HIP)
   DWConvParams params(has_fuse_relu, data_format, strides, dilations);
   if (params.UseCudnnDepthwise<Context>(dev_ctx, input, filter)) {
-    phi::DepthwiseConvCudnnKernel<T>(dev_ctx,
-                                     input,
-                                     filter,
-                                     strides_t,
-                                     paddings_t,
-                                     padding_algorithm,
-                                     groups,
-                                     dilations_t,
-                                     data_format,
-                                     out);
+    DepthwiseConvCudnnKernel<T>(dev_ctx,
+                                input,
+                                filter,
+                                strides_t,
+                                paddings_t,
+                                padding_algorithm,
+                                groups,
+                                dilations_t,
+                                data_format,
+                                out);
     return;
   }
 #endif
@@ -119,7 +119,7 @@ void DepthwiseConvKernel(const Context& dev_ctx,
   }
 
   if (fuse_relu) {
-    phi::math::DepthwiseConvFunctor<Context, T, true> depthwiseConv;
+    math::DepthwiseConvFunctor<Context, T, true> depthwiseConv;
     depthwiseConv(dev_ctx,
                   input,
                   filter,
@@ -129,7 +129,7 @@ void DepthwiseConvKernel(const Context& dev_ctx,
                   output,
                   data_layout);
   } else {
-    phi::math::DepthwiseConvFunctor<Context, T, false> depthwiseConv;
+    math::DepthwiseConvFunctor<Context, T, false> depthwiseConv;
     depthwiseConv(dev_ctx,
                   input,
                   filter,
diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu
index b377f83f59a49e..60bb1b68f9038a 100644
--- a/paddle/phi/kernels/gpu/determinant_kernel.cu
+++ b/paddle/phi/kernels/gpu/determinant_kernel.cu
@@ -36,10 +36,9 @@ template <typename T>
 class EigenMatrix {};
 
 template <>
-class EigenMatrix<phi::float16> {
+class EigenMatrix<float16> {
  public:
-  using MatrixType =
-      Eigen::Matrix<phi::float16, Eigen::Dynamic, Eigen::Dynamic>;
+  using MatrixType = Eigen::Matrix<float16, Eigen::Dynamic, Eigen::Dynamic>;
 };
 
 template <>
@@ -85,7 +84,7 @@ struct DeterminantCudaFunctor {
     std::vector<T> input_vec;
     std::vector<T> output_vec;
     TensorToVector(input, dev_ctx, &input_vec);
-    using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+    using MPType = typename dtype::MPTypeTrait<T>::Type;
     for (int64_t i = 0; i < batch_count; ++i) {  // maybe can be parallel
       auto begin_iter = input_vec.begin() + i * rank * rank;
       auto end_iter = input_vec.begin() + (i + 1) * rank * rank;
@@ -129,31 +128,31 @@ __global__ void GetDetFromLUComplex(const T* lu_data,
 }
 
 template <typename T, typename Context>
-struct DeterminantCudaFunctor<phi::dtype::complex<T>, Context> {
+struct DeterminantCudaFunctor<dtype::complex<T>, Context> {
   void operator()(const Context& dev_ctx,
                   const DenseTensor& a,
                   int64_t n,
                   int64_t batch_size,
                   DenseTensor* output) {
 #ifndef PADDLE_WITH_HIP
-    phi::Allocator::AllocationPtr tmp_gpu_mat_data;
-    const phi::dtype::complex<T>* gpu_mat = a.data<phi::dtype::complex<T>>();
+    Allocator::AllocationPtr tmp_gpu_mat_data;
+    const dtype::complex<T>* gpu_mat = a.data<dtype::complex<T>>();
     // Copy all elements of input matrix A to a temporary memory space to
     // avoid being overridden by getrf.
-    tmp_gpu_mat_data = phi::memory_utils::Alloc(
+    tmp_gpu_mat_data = memory_utils::Alloc(
         dev_ctx.GetPlace(),
-        a.numel() * sizeof(phi::dtype::complex<T>),
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+        a.numel() * sizeof(dtype::complex<T>),
+        Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
     memory_utils::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_mat_data->ptr(),
                        dev_ctx.GetPlace(),
                        a.data(),
-                       a.numel() * sizeof(phi::dtype::complex<T>),
+                       a.numel() * sizeof(dtype::complex<T>),
                        dev_ctx.stream());
-    gpu_mat = reinterpret_cast<const phi::dtype::complex<T>*>(
-        tmp_gpu_mat_data->ptr());
+    gpu_mat =
+        reinterpret_cast<const dtype::complex<T>*>(tmp_gpu_mat_data->ptr());
 
-    std::vector<const phi::dtype::complex<T>*> cpu_ptrs(batch_size);
+    std::vector<const dtype::complex<T>*> cpu_ptrs(batch_size);
     for (int i = 0; i < batch_size; ++i) {
       cpu_ptrs[i] = gpu_mat + i * n * n;
     }
@@ -161,45 +160,45 @@ struct DeterminantCudaFunctor<phi::dtype::complex<T>, Context> {
     int num_ints = batch_size * (n + 1);
     // num_ints is for pivot (n * batch_size) and info (batch_size)
     size_t total_bytes =
-        batch_size * sizeof(phi::dtype::complex<T>*) + num_ints * sizeof(int);
-    phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
+        batch_size * sizeof(dtype::complex<T>*) + num_ints * sizeof(int);
+    Allocator::AllocationPtr tmp_gpu_ptrs_data = memory_utils::Alloc(
         dev_ctx.GetPlace(),
         total_bytes,
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+        Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
     memory_utils::Copy(dev_ctx.GetPlace(),
                        tmp_gpu_ptrs_data->ptr(),
                        CPUPlace(),
                        static_cast<void*>(cpu_ptrs.data()),
-                       cpu_ptrs.size() * sizeof(phi::dtype::complex<T>*),
+                       cpu_ptrs.size() * sizeof(dtype::complex<T>*),
                        dev_ctx.stream());
 
-    phi::dtype::complex<T>** gpu_mat_ptr =
-        reinterpret_cast<phi::dtype::complex<T>**>(tmp_gpu_ptrs_data->ptr());
+    dtype::complex<T>** gpu_mat_ptr =
+        reinterpret_cast<dtype::complex<T>**>(tmp_gpu_ptrs_data->ptr());
     int* gpu_info_ptr = reinterpret_cast<int*>(gpu_mat_ptr + cpu_ptrs.size());
     int* pivot_data = gpu_info_ptr + batch_size;
 
-    auto blas = funcs::GetBlas<Context, phi::dtype::complex<T>>(dev_ctx);
+    auto blas = funcs::GetBlas<Context, dtype::complex<T>>(dev_ctx);
     // This function performs the LU factorization of each matrix A by the
     // equation P * A = L * U. L and U are written back to original matrix A,
     // and diagonal elements of L are discarded.
     blas.BatchedGETRF(n, gpu_mat_ptr, pivot_data, gpu_info_ptr, batch_size);
-    phi::dtype::complex<T>* out_data =
-        dev_ctx.template Alloc<phi::dtype::complex<T>>(output);
+    dtype::complex<T>* out_data =
+        dev_ctx.template Alloc<dtype::complex<T>>(output);
     int block_size = std::min(256, dev_ctx.GetMaxThreadsPerBlock());
     dim3 dim_block(block_size);
     dim3 num_blocks((batch_size + block_size - 1) / block_size);
-    GetDetFromLUComplex<phi::dtype::complex<T>><<<num_blocks, dim_block>>>(
+    GetDetFromLUComplex<dtype::complex<T>><<<num_blocks, dim_block>>>(
         gpu_mat, pivot_data, n, batch_size, out_data);
 #else
     using MatrixType =
         Eigen::Matrix<std::complex<T>, Eigen::Dynamic, Eigen::Dynamic>;
-    std::vector<phi::dtype::complex<T>> input_vec;
-    std::vector<phi::dtype::complex<T>> output_vec;
+    std::vector<dtype::complex<T>> input_vec;
+    std::vector<dtype::complex<T>> output_vec;
     TensorToVector(a, dev_ctx, &input_vec);
     for (int64_t i = 0; i < batch_size; ++i) {  // maybe can be parallel
       auto begin_iter = input_vec.begin() + i * n * n;
       auto end_iter = input_vec.begin() + (i + 1) * n * n;
-      std::vector<phi::dtype::complex<T>> sub_vec(
+      std::vector<dtype::complex<T>> sub_vec(
           begin_iter,
           end_iter);  // get every square matrix data
       MatrixType matrix(n, n);
@@ -209,7 +208,7 @@ struct DeterminantCudaFunctor<phi::dtype::complex<T>, Context> {
         }
       }
       output_vec.push_back(
-          static_cast<phi::dtype::complex<T>>(matrix.determinant()));
+          static_cast<dtype::complex<T>>(matrix.determinant()));
     }
     TensorFromVector(output_vec, dev_ctx, output);
 #endif
diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu
index f52882ef5a2327..5877cbdbe54089 100644
--- a/paddle/phi/kernels/gpu/dgc_kernel.cu
+++ b/paddle/phi/kernels/gpu/dgc_kernel.cu
@@ -186,18 +186,18 @@ void DGCKernel(const Context& dev_ctx,
   dev_ctx.template Alloc<T>(gather_buff);
 
   int buf_size = paddle::communication::dgc::get_buffer_size(k);
-  phi::Allocator::AllocationPtr tmp_ious_data;
+  Allocator::AllocationPtr tmp_ious_data;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (dev_ctx.GetPlace().GetType() == AllocationType::GPU ||
       dev_ctx.GetPlace().GetType() == AllocationType::CUSTOM) {
-    tmp_ious_data = phi::memory_utils::Alloc(
+    tmp_ious_data = memory_utils::Alloc(
         dev_ctx.GetPlace(),
         buf_size,
-        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+        Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
   }
 #endif
   if (dev_ctx.GetPlace().GetType() == AllocationType::CPU) {
-    tmp_ious_data = phi::memory_utils::Alloc(dev_ctx.GetPlace(), buf_size);
+    tmp_ious_data = memory_utils::Alloc(dev_ctx.GetPlace(), buf_size);
   }
 
   void* buf = reinterpret_cast<void*>(tmp_ious_data->ptr());
diff --git a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu
index 7657c6c432935d..581e0f2f00e784 100644
--- a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu
@@ -21,8 +21,6 @@
 
 namespace phi {
 
-using phi::PADDLE_CUDA_NUM_THREADS;
-
 template <typename T, typename Context>
 void DiagonalGradKernel(const Context& dev_ctx,
                         const DenseTensor& x,
@@ -66,7 +64,7 @@ void DiagonalGradKernel(const Context& dev_ctx,
   int blocks = std::min((numel + threads - 1) / threads, blocks_max);
 
   int64_t dout_numel = out_grad.numel();
-  phi::backends::gpu::GpuMemsetAsync(
+  backends::gpu::GpuMemsetAsync(
       dx_data, 0, numel * sizeof(T), dev_ctx.stream());
 
   switch (dx_dim_size) {
diff --git a/paddle/phi/kernels/gpu/diagonal_kernel.cu b/paddle/phi/kernels/gpu/diagonal_kernel.cu
index 70770c90215b85..c2e10ecc809046 100644
--- a/paddle/phi/kernels/gpu/diagonal_kernel.cu
+++ b/paddle/phi/kernels/gpu/diagonal_kernel.cu
@@ -20,7 +20,6 @@
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/diagonal.h"
 namespace phi {
-using phi::PADDLE_CUDA_NUM_THREADS;
 template <typename T, typename Context>
 void DiagonalKernel(const Context& dev_ctx,
                     const DenseTensor& x,
diff --git a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/dist_kernel.cu
index 414cec8b37d883..4df46426834338 100644
--- a/paddle/phi/kernels/gpu/dist_kernel.cu
+++ b/paddle/phi/kernels/gpu/dist_kernel.cu
@@ -77,7 +77,7 @@ struct PowFunctorHighPrecision {
 template <typename T, typename Functor>
 __global__ void ReduceSumWithSubtract(
     const T* x, const T* y, T* out, int64_t N, Functor func) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   MT sum_val(0.0);
   CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) { sum_val += func(x[i], y[i]); }
 
@@ -92,7 +92,7 @@ __global__ void ReduceMaxWithSubtract(const T* x,
                                       const T* y,
                                       T* out,
                                       int64_t N) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   MT max_val = std::numeric_limits<MT>::min();
   CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) {
     max_val = max(max_val, abs(static_cast<MT>(x[i]) - static_cast<MT>(y[i])));
@@ -109,7 +109,7 @@ __global__ void ReduceMinWithSubtract(const T* x,
                                       const T* y,
                                       T* out,
                                       int64_t N) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   MT min_val = std::numeric_limits<MT>::max();
   CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) {
     min_val = min(min_val, abs(static_cast<MT>(x[i]) - static_cast<MT>(y[i])));
@@ -132,7 +132,7 @@ void DistKernel(const Context& dev_ctx,
     return;
   }
 
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   DenseTensor intermediate;
   const T* x_ptr = x.data<T>();
   const T* y_ptr = y.data<T>();
@@ -144,7 +144,7 @@ void DistKernel(const Context& dev_ctx,
   if (xdim == y.dims()) {  // same shape
     int64_t n = x.numel();
 
-    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n);
+    auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n);
     intermediate.Resize({config.block_per_grid.x});
     T* i_ptr = dev_ctx.template Alloc<T>(&intermediate);
     std::vector<int64_t> axis_dims = {static_cast<int64_t>(-1)};
@@ -161,7 +161,7 @@ void DistKernel(const Context& dev_ctx,
       ReduceMaxWithSubtract<T>
           <<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
               x_ptr, y_ptr, i_ptr, n);
-      phi::MaxRawKernel<T, Context>(
+      MaxRawKernel<T, Context>(
           dev_ctx, intermediate, reduce_axis, true, true, out);
 
     } else if (p == -INFINITY) {
@@ -169,7 +169,7 @@ void DistKernel(const Context& dev_ctx,
           <<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
               x_ptr, y_ptr, i_ptr, n);
 
-      phi::MinRawKernel<T, Context>(
+      MinRawKernel<T, Context>(
           dev_ctx, intermediate, reduce_axis, true, true, out);
 
     } else {
diff --git a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu
index f082253897a7c0..3c6d6a74e299dc 100644
--- a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu
+++ b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu
@@ -196,7 +196,7 @@ void DistributeFpnProposalsKernel(
                                             sizeof(int) * 8,
                                             dev_ctx.stream());
   // Allocate temporary storage
-  auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
+  auto d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes);
 
   // Run sorting operation
   // sort target level to get corresponding index
diff --git a/paddle/phi/kernels/gpu/edit_distance_kernel.cu b/paddle/phi/kernels/gpu/edit_distance_kernel.cu
index c12789577642a1..af3c25ef1251ed 100644
--- a/paddle/phi/kernels/gpu/edit_distance_kernel.cu
+++ b/paddle/phi/kernels/gpu/edit_distance_kernel.cu
@@ -27,8 +27,6 @@
 
 namespace phi {
 
-using phi::PADDLE_CUDA_NUM_THREADS;
-
 template <typename T>
 __global__ void FillFirstRow(T* dist, const int N) {
   int64_t idx =
diff --git a/paddle/phi/kernels/gpu/eig_grad_kernel.cu b/paddle/phi/kernels/gpu/eig_grad_kernel.cu
index e79f4500e299b5..89ee3f6a347ff3 100644
--- a/paddle/phi/kernels/gpu/eig_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/eig_grad_kernel.cu
@@ -55,13 +55,13 @@ void SolveLinearSystemGPU(const GPUContext& dev_ctx,
 
 #ifdef PADDLE_WITH_CUDA
 template <>
-void SolveLinearSystemGPU<phi::dtype::complex<float>>(
+void SolveLinearSystemGPU<dtype::complex<float>>(
     const GPUContext& dev_ctx,
-    const phi::dtype::complex<float>*
+    const dtype::complex<float>*
         matrix_data,  // device ptr, row-major, size batch*order*order
-    const phi::dtype::complex<float>*
+    const dtype::complex<float>*
         rhs_data,  // device ptr, row-major, size batch*order*rhs_cols
-    phi::dtype::complex<float>*
+    dtype::complex<float>*
         out_data,  // device ptr, row-major, size batch*order*rhs_cols
     int order,
     int rhs_cols,
@@ -69,7 +69,7 @@ void SolveLinearSystemGPU<phi::dtype::complex<float>>(
   // handles
   cublasHandle_t cublas_handle = dev_ctx.cublas_handle();
   cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle();
-  auto stream = phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()));
+  auto stream = Stream(reinterpret_cast<StreamId>(dev_ctx.stream()));
 
   // cuComplex constants
   const cuComplex kAlpha = make_cuFloatComplex(1.0f, 0.0f);
@@ -88,22 +88,22 @@ void SolveLinearSystemGPU<phi::dtype::complex<float>>(
   cuComplex* X_row_all = reinterpret_cast<cuComplex*>(out_data);
 
   auto dA_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
   auto dB_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
   cuComplex* dA_col = reinterpret_cast<cuComplex*>(dA_col_alloc->ptr());
   cuComplex* dB_col = reinterpret_cast<cuComplex*>(dB_col_alloc->ptr());
 
-  auto d_pivots_alloc = phi::memory_utils::Alloc(
+  auto d_pivots_alloc = memory_utils::Alloc(
       dev_ctx.GetPlace(),
       static_cast<size_t>(batch_count) * order * sizeof(int),
       stream);
   int* d_pivots = reinterpret_cast<int*>(d_pivots_alloc->ptr());
 
   auto d_info_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(),
-                               static_cast<size_t>(batch_count) * sizeof(int),
-                               stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(),
+                          static_cast<size_t>(batch_count) * sizeof(int),
+                          stream);
   int* d_info = reinterpret_cast<int*>(d_info_alloc->ptr());
 
   //    A_row layout: row-major (order x order), B_row layout: row-major (order
@@ -157,7 +157,7 @@ void SolveLinearSystemGPU<phi::dtype::complex<float>>(
 
   size_t work_bytes = static_cast<size_t>(lwork) * sizeof(cuComplex);
   auto d_work_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream);
   cuComplex* d_work = reinterpret_cast<cuComplex*>(d_work_alloc->ptr());
 
   for (int i = 0; i < batch_count; ++i) {
@@ -238,13 +238,13 @@ void SolveLinearSystemGPU<phi::dtype::complex<float>>(
 }
 
 template <>
-void SolveLinearSystemGPU<phi::dtype::complex<double>>(
+void SolveLinearSystemGPU<dtype::complex<double>>(
     const GPUContext& dev_ctx,
-    const phi::dtype::complex<double>*
+    const dtype::complex<double>*
         matrix_data,  // device ptr, row-major, size batch*order*order
-    const phi::dtype::complex<double>*
+    const dtype::complex<double>*
         rhs_data,  // device ptr, row-major, size batch*order*rhs_cols
-    phi::dtype::complex<double>*
+    dtype::complex<double>*
         out_data,  // device ptr, row-major, size batch*order*rhs_cols
     int order,
     int rhs_cols,
@@ -252,7 +252,7 @@ void SolveLinearSystemGPU<phi::dtype::complex<double>>(
   // handles
   cublasHandle_t cublas_handle = dev_ctx.cublas_handle();
   cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle();
-  auto stream = phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()));
+  auto stream = Stream(reinterpret_cast<StreamId>(dev_ctx.stream()));
 
   // cuDoubleComplex constants
   const cuDoubleComplex kAlpha = make_cuDoubleComplex(1.0f, 0.0f);
@@ -273,24 +273,24 @@ void SolveLinearSystemGPU<phi::dtype::complex<double>>(
   cuDoubleComplex* X_row_all = reinterpret_cast<cuDoubleComplex*>(out_data);
 
   auto dA_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
   auto dB_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
   cuDoubleComplex* dA_col =
       reinterpret_cast<cuDoubleComplex*>(dA_col_alloc->ptr());
   cuDoubleComplex* dB_col =
       reinterpret_cast<cuDoubleComplex*>(dB_col_alloc->ptr());
 
-  auto d_pivots_alloc = phi::memory_utils::Alloc(
+  auto d_pivots_alloc = memory_utils::Alloc(
       dev_ctx.GetPlace(),
       static_cast<size_t>(batch_count) * order * sizeof(int),
       stream);
   int* d_pivots = reinterpret_cast<int*>(d_pivots_alloc->ptr());
 
   auto d_info_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(),
-                               static_cast<size_t>(batch_count) * sizeof(int),
-                               stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(),
+                          static_cast<size_t>(batch_count) * sizeof(int),
+                          stream);
   int* d_info = reinterpret_cast<int*>(d_info_alloc->ptr());
 
   //    A_row layout: row-major (order x order), B_row layout: row-major (order
@@ -345,7 +345,7 @@ void SolveLinearSystemGPU<phi::dtype::complex<double>>(
 
   size_t work_bytes = static_cast<size_t>(lwork) * sizeof(cuDoubleComplex);
   auto d_work_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream);
   cuDoubleComplex* d_work =
       reinterpret_cast<cuDoubleComplex*>(d_work_alloc->ptr());
 
@@ -429,20 +429,20 @@ void SolveLinearSystemGPU<phi::dtype::complex<double>>(
 
 #ifdef PADDLE_WITH_HIP
 template <>
-void SolveLinearSystemGPU<phi::dtype::complex<float>>(
+void SolveLinearSystemGPU<dtype::complex<float>>(
     const GPUContext& dev_ctx,
-    const phi::dtype::complex<float>*
+    const dtype::complex<float>*
         matrix_data,  // device ptr, row-major, size batch*order*order
-    const phi::dtype::complex<float>*
+    const dtype::complex<float>*
         rhs_data,  // device ptr, row-major, size batch*order*rhs_cols
-    phi::dtype::complex<float>*
+    dtype::complex<float>*
         out_data,  // device ptr, row-major, size batch*order*rhs_cols
     int order,
     int rhs_cols,
     int batch_count) {
   // handles
   rocblas_handle rocblas_handle = dev_ctx.cusolver_dn_handle();
-  auto stream = phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()));
+  auto stream = Stream(reinterpret_cast<StreamId>(dev_ctx.stream()));
 
   // rocblas_float_complex constants
   const rocblas_float_complex kAlpha = rocblas_float_complex{1.0f, 0.0f};
@@ -464,21 +464,21 @@ void SolveLinearSystemGPU<phi::dtype::complex<float>>(
       reinterpret_cast<rocblas_float_complex*>(out_data);
 
   auto dA_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
   auto dB_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
   rocblas_float_complex* dA_col =
       reinterpret_cast<rocblas_float_complex*>(dA_col_alloc->ptr());
   rocblas_float_complex* dB_col =
       reinterpret_cast<rocblas_float_complex*>(dB_col_alloc->ptr());
 
-  auto d_pivots_alloc = phi::memory_utils::Alloc(
+  auto d_pivots_alloc = memory_utils::Alloc(
       dev_ctx.GetPlace(),
       static_cast<size_t>(batch_count) * order * sizeof(rocblas_int),
       stream);
   rocblas_int* d_pivots = reinterpret_cast<rocblas_int*>(d_pivots_alloc->ptr());
 
-  auto d_info_alloc = phi::memory_utils::Alloc(
+  auto d_info_alloc = memory_utils::Alloc(
       dev_ctx.GetPlace(),
       static_cast<size_t>(batch_count) * sizeof(rocblas_int),
       stream);
@@ -587,13 +587,12 @@ void SolveLinearSystemGPU<phi::dtype::complex<float>>(
   auto* cpu_ctx = static_cast<CPUContext*>(pool.Get(cpu_place));
 
   std::vector<rocblas_int> h_info(batch_count, 0);
-  phi::memory_utils::Copy(
-      CPUPlace(),
-      h_info.data(),
-      dev_ctx.GetPlace(),
-      d_info,
-      static_cast<size_t>(batch_count) * sizeof(rocblas_int),
-      reinterpret_cast<void*>(dev_ctx.stream()));
+  memory_utils::Copy(CPUPlace(),
+                     h_info.data(),
+                     dev_ctx.GetPlace(),
+                     d_info,
+                     static_cast<size_t>(batch_count) * sizeof(rocblas_int),
+                     reinterpret_cast<void*>(dev_ctx.stream()));
   dev_ctx.Wait();
 
   for (int i = 0; i < batch_count; ++i) {
@@ -607,20 +606,20 @@ void SolveLinearSystemGPU<phi::dtype::complex<float>>(
 }
 
 template <>
-void SolveLinearSystemGPU<phi::dtype::complex<double>>(
+void SolveLinearSystemGPU<dtype::complex<double>>(
     const GPUContext& dev_ctx,
-    const phi::dtype::complex<double>*
+    const dtype::complex<double>*
         matrix_data,  // device ptr, row-major, size batch*order*order
-    const phi::dtype::complex<double>*
+    const dtype::complex<double>*
         rhs_data,  // device ptr, row-major, size batch*order*rhs_cols
-    phi::dtype::complex<double>*
+    dtype::complex<double>*
         out_data,  // device ptr, row-major, size batch*order*rhs_cols
     int order,
     int rhs_cols,
     int batch_count) {
   // handles
   rocblas_handle rocblas_handle = dev_ctx.cusolver_dn_handle();
-  auto stream = phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()));
+  auto stream = Stream(reinterpret_cast<StreamId>(dev_ctx.stream()));
 
   // rocblas_double_complex constants
   const rocblas_double_complex kAlpha = rocblas_double_complex{1.0, 0.0};
@@ -642,21 +641,21 @@ void SolveLinearSystemGPU<phi::dtype::complex<double>>(
       reinterpret_cast<rocblas_double_complex*>(out_data);
 
   auto dA_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
   auto dB_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
+      memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
   rocblas_double_complex* dA_col =
       reinterpret_cast<rocblas_double_complex*>(dA_col_alloc->ptr());
   rocblas_double_complex* dB_col =
       reinterpret_cast<rocblas_double_complex*>(dB_col_alloc->ptr());
 
-  auto d_pivots_alloc = phi::memory_utils::Alloc(
+  auto d_pivots_alloc = memory_utils::Alloc(
       dev_ctx.GetPlace(),
       static_cast<size_t>(batch_count) * order * sizeof(rocblas_int),
       stream);
   rocblas_int* d_pivots = reinterpret_cast<rocblas_int*>(d_pivots_alloc->ptr());
 
-  auto d_info_alloc = phi::memory_utils::Alloc(
+  auto d_info_alloc = memory_utils::Alloc(
       dev_ctx.GetPlace(),
       static_cast<size_t>(batch_count) * sizeof(rocblas_int),
       stream);
@@ -763,13 +762,12 @@ void SolveLinearSystemGPU<phi::dtype::complex<double>>(
   auto* cpu_ctx = static_cast<CPUContext*>(pool.Get(cpu_place));
 
   std::vector<rocblas_int> h_info(batch_count, 0);
-  phi::memory_utils::Copy(
-      CPUPlace(),
-      h_info.data(),
-      dev_ctx.GetPlace(),
-      d_info,
-      static_cast<size_t>(batch_count) * sizeof(rocblas_int),
-      reinterpret_cast<void*>(dev_ctx.stream()));
+  memory_utils::Copy(CPUPlace(),
+                     h_info.data(),
+                     dev_ctx.GetPlace(),
+                     d_info,
+                     static_cast<size_t>(batch_count) * sizeof(rocblas_int),
+                     reinterpret_cast<void*>(dev_ctx.stream()));
   dev_ctx.Wait();
 
   for (int i = 0; i < batch_count; ++i) {
@@ -806,13 +804,12 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L,
     gV_safe = Fill<T, Context>(dev_ctx, vectorize<int64_t>(V.dims()), T(0));
   }
   DenseTensor trans_v = TransposeLast2Dim<T>(dev_ctx, V);
-  DenseTensor Vh = phi::Conj<T>(dev_ctx, trans_v);
-  DenseTensor Lconj = phi::Conj<T>(dev_ctx, L);
-  DenseTensor Econj = phi::Subtract<T>(dev_ctx,
-                                       phi::funcs::Unsqueeze(Lconj, -2),
-                                       phi::funcs::Unsqueeze(Lconj, -1));
-  DenseTensor VhgV = phi::Matmul<T>(dev_ctx, Vh, gV_safe);
-  DenseTensor diag_real = phi::Real<T>(dev_ctx, VhgV);
+  DenseTensor Vh = Conj<T>(dev_ctx, trans_v);
+  DenseTensor Lconj = Conj<T>(dev_ctx, L);
+  DenseTensor Econj = Subtract<T>(
+      dev_ctx, funcs::Unsqueeze(Lconj, -2), funcs::Unsqueeze(Lconj, -1));
+  DenseTensor VhgV = Matmul<T>(dev_ctx, Vh, gV_safe);
+  DenseTensor diag_real = Real<T>(dev_ctx, VhgV);
 
   auto cpu_place = CPUPlace();
   DeviceContextPool& pool = DeviceContextPool::Instance();
@@ -823,13 +820,13 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L,
   Copy(dev_ctx, diag_real, cpu_place, false, &diag_real_cpu);
 
   DenseTensor diag_res_cpu =
-      phi::funcs::BatchDiag<T>((*cpu_ctx), diag_real_cpu, batch_count);
+      funcs::BatchDiag<T>((*cpu_ctx), diag_real_cpu, batch_count);
 
   DenseTensor diag_res;
   dev_ctx.template Alloc<T>(&diag_res);
   Copy(dev_ctx, diag_res_cpu, GPUPlace(), false, &diag_res);
 
-  DenseTensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2);
+  DenseTensor diag_unsqueezed = funcs::Unsqueeze(diag_res, -2);
 
   auto numel = diag_unsqueezed.numel();
   DenseTensor diag_unsqueezed_complex;
@@ -838,21 +835,20 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L,
   auto* data_diag_un_com = dev_ctx.template Alloc<T>(
       &diag_unsqueezed_complex, static_cast<size_t>(numel * sizeof(T)));
 
-  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
-  phi::funcs::RealToComplexFunctor<T> functor(
-      data_diag_un, data_diag_un_com, numel);
+  funcs::ForRange<Context> for_range(dev_ctx, numel);
+  funcs::RealToComplexFunctor<T> functor(data_diag_un, data_diag_un_com, numel);
   for_range(functor);
   // real tensor multiply complex tensor in broadcast manner
-  DenseTensor res1 = phi::Multiply<T>(dev_ctx, V, diag_unsqueezed_complex);
-  DenseTensor res2 = phi::Matmul<T>(dev_ctx, Vh, res1);
-  DenseTensor result = phi::Subtract<T>(dev_ctx, VhgV, res2);
+  DenseTensor res1 = Multiply<T>(dev_ctx, V, diag_unsqueezed_complex);
+  DenseTensor res2 = Matmul<T>(dev_ctx, Vh, res1);
+  DenseTensor result = Subtract<T>(dev_ctx, VhgV, res2);
 
   result.Resize(V.dims());
   dev_ctx.template Alloc<T>(&result);
-  result = phi::Divide<T>(dev_ctx, result, Econj);
-  result = phi::funcs::DiagFill<T, T>(
-      dev_ctx, order, order, order, 0, gL_safe, result);
-  DenseTensor rhs = phi::Matmul<T>(dev_ctx, result, Vh);
+  result = Divide<T>(dev_ctx, result, Econj);
+  result =
+      funcs::DiagFill<T, T>(dev_ctx, order, order, order, 0, gL_safe, result);
+  DenseTensor rhs = Matmul<T>(dev_ctx, result, Vh);
 
   // solve linear system
   // solve(Vh, rhs, out, m, k)
@@ -877,7 +873,7 @@ void EigGradKernel(const Context& dev_ctx,
                    const optional<DenseTensor>& dout_w,
                    const optional<DenseTensor>& dout_v,
                    DenseTensor* dx) {
-  auto* dx_data = dev_ctx.template Alloc<phi::dtype::Complex<T>>(dx);
+  auto* dx_data = dev_ctx.template Alloc<dtype::Complex<T>>(dx);
   if (dx->numel() == 0) {
     return;
   }
@@ -885,7 +881,7 @@ void EigGradKernel(const Context& dev_ctx,
   int batch_count = BatchCount(out_v);
   const int64_t order = out_v.dims(-1);
 
-  ComputeBackwardForComplexInputGPU<phi::dtype::Complex<T>, Context>(
+  ComputeBackwardForComplexInputGPU<dtype::Complex<T>, Context>(
       out_w, out_v, dout_w, dout_v, dx_data, batch_count, order, dev_ctx);
 }
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpu/eig_kernel.cu b/paddle/phi/kernels/gpu/eig_kernel.cu
index e1322d11d1ee90..6578a4c054c41c 100644
--- a/paddle/phi/kernels/gpu/eig_kernel.cu
+++ b/paddle/phi/kernels/gpu/eig_kernel.cu
@@ -25,8 +25,8 @@ void EigKernel(const Context& dev_ctx,
                const DenseTensor& x,
                DenseTensor* out_w,
                DenseTensor* out_v) {
-  dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_w);
-  dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_v);
+  dev_ctx.template Alloc<dtype::Complex<T>>(out_w);
+  dev_ctx.template Alloc<dtype::Complex<T>>(out_v);
 
   if (x.numel() == 0) {
     return;
@@ -39,9 +39,9 @@ void EigKernel(const Context& dev_ctx,
   // prepare cpu Tensor here, since magma requires output on cpu
   DenseTensor out_w_cpu, out_v_cpu;
   out_w_cpu.Resize(out_w->dims());
-  (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_w_cpu);
+  (*cpu_ctx).template Alloc<dtype::Complex<T>>(&out_w_cpu);
   out_v_cpu.Resize(x.dims());
-  (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_v_cpu);
+  (*cpu_ctx).template Alloc<dtype::Complex<T>>(&out_v_cpu);
 
   if (!IsComplexType(x.dtype())) {
     // output still be complex though input is real
@@ -55,55 +55,53 @@ void EigKernel(const Context& dev_ctx,
     real_w_cpu.Resize(real_w_dim);
     (*cpu_ctx).template Alloc<phi::dtype::Real<T>>(&real_w_cpu);
     real_v_cpu.Resize(x.dims());
-    (*cpu_ctx).template Alloc<phi::dtype::Real<T>>(&real_v_cpu);
+    (*cpu_ctx).template Alloc<dtype::Real<T>>(&real_v_cpu);
 
-    phi::ApplyEigKernelMagma<phi::dtype::Real<T>, Context>(
+    ApplyEigKernelMagma<dtype::Real<T>, Context>(
         dev_ctx, x, &real_w_cpu, &real_v_cpu);
 
     // 1. extract real part & imag part from real_w_cpu
-    DenseTensor real_part_cpu = phi::funcs::Slice<phi::dtype::Real<T>>(
+    DenseTensor real_part_cpu = funcs::Slice<dtype::Real<T>>(
         (*cpu_ctx), real_w_cpu, {-1}, {0}, {order});
-    DenseTensor imag_part_cpu = phi::funcs::Slice<phi::dtype::Real<T>>(
+    DenseTensor imag_part_cpu = funcs::Slice<dtype::Real<T>>(
         (*cpu_ctx), real_w_cpu, {-1}, {order}, {order * 2});
 
     // 2. construct complex values
-    auto* real_part_data = real_part_cpu.data<phi::dtype::Real<T>>();
-    auto* imag_part_data = imag_part_cpu.data<phi::dtype::Real<T>>();
+    auto* real_part_data = real_part_cpu.data<dtype::Real<T>>();
+    auto* imag_part_data = imag_part_cpu.data<dtype::Real<T>>();
     int64_t out_w_numel = static_cast<int64_t>(out_w->numel());
 
-    phi::funcs::ForRange<CPUContext> for_range((*cpu_ctx), out_w_numel);
-    phi::funcs::RealImagToComplexFunctor<phi::dtype::Complex<T>> functor(
+    funcs::ForRange<CPUContext> for_range((*cpu_ctx), out_w_numel);
+    funcs::RealImagToComplexFunctor<dtype::Complex<T>> functor(
         real_part_data,
         imag_part_data,
-        out_w_cpu.data<phi::dtype::Complex<T>>(),
+        out_w_cpu.data<dtype::Complex<T>>(),
         out_w_numel);
     for_range(functor);
 
     // 3. construct complex vectors
     DenseTensor real_v_trans_cpu =
-        TransposeLast2Dim<phi::dtype::Real<T>, CPUContext>((*cpu_ctx),
-                                                           real_v_cpu);
+        TransposeLast2Dim<dtype::Real<T>, CPUContext>((*cpu_ctx), real_v_cpu);
     DenseTensor out_v_trans_cpu;
     out_v_trans_cpu.Resize(x.dims());
-    (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_v_trans_cpu);
-
-    phi::ConstructComplexVectors<phi::dtype::Real<T>,
-                                 phi::dtype::Complex<T>,
-                                 CPUContext>(&out_v_trans_cpu,
-                                             out_w_cpu,
-                                             real_v_trans_cpu,
-                                             (*cpu_ctx),
-                                             batch_count,
-                                             order);
-
-    TransposeTwoAxis<phi::dtype::Complex<T>, CPUContext>(out_v_trans_cpu,
-                                                         &out_v_cpu,
-                                                         x.dims().size() - 1,
-                                                         x.dims().size() - 2,
-                                                         (*cpu_ctx));
+    (*cpu_ctx).template Alloc<dtype::Complex<T>>(&out_v_trans_cpu);
+
+    ConstructComplexVectors<dtype::Real<T>, dtype::Complex<T>, CPUContext>(
+        &out_v_trans_cpu,
+        out_w_cpu,
+        real_v_trans_cpu,
+        (*cpu_ctx),
+        batch_count,
+        order);
+
+    TransposeTwoAxis<dtype::Complex<T>, CPUContext>(out_v_trans_cpu,
+                                                    &out_v_cpu,
+                                                    x.dims().size() - 1,
+                                                    x.dims().size() - 2,
+                                                    (*cpu_ctx));
 
   } else {
-    phi::ApplyEigKernelMagma<T, Context>(dev_ctx, x, &out_w_cpu, &out_v_cpu);
+    ApplyEigKernelMagma<T, Context>(dev_ctx, x, &out_w_cpu, &out_v_cpu);
   }
 
   // copy result from cpu to gpu tensor
diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h
index 29fd53f88d6f57..52fc578a478f51 100644
--- a/paddle/phi/kernels/gpu/elementwise_grad.h
+++ b/paddle/phi/kernels/gpu/elementwise_grad.h
@@ -33,7 +33,7 @@ void ReduceWrapper(const GPUContext &dev_ctx,
                    DenseTensor *dst) {
   std::vector<int> reduce_dims =
       funcs::GetReduceDim(dst->dims(), src->dims(), axis);
-  phi::SumKernel<T, GPUContext>(
+  SumKernel<T, GPUContext>(
       dev_ctx, *src, reduce_dims, src->dtype(), false, dst);
 }
 
@@ -228,7 +228,7 @@ void DefaultMixedPrecisionAddGrad(const GPUContext &dev_ctx,
       }
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(x.dims(), dout.dims(), axis);
-      phi::SumKernel<T_dout, GPUContext>(
+      SumKernel<T_dout, GPUContext>(
           dev_ctx, dout, reduce_dims, dout.dtype(), false, dx);
     }
   }
@@ -244,7 +244,7 @@ void DefaultMixedPrecisionAddGrad(const GPUContext &dev_ctx,
       dev_ctx.template Alloc<float>(&dy_fp32);
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(y.dims(), dout.dims(), axis);
-      phi::SumKernel<float, GPUContext>(
+      SumKernel<float, GPUContext>(
           dev_ctx, dout, reduce_dims, dout.dtype(), false, &dy_fp32);
       CastKernel<float>(dev_ctx, dy_fp32, dy->dtype(), dy);
     }
@@ -309,7 +309,7 @@ void DefaultElementwiseAddGrad(const GPUContext &dev_ctx,
       }
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(x.dims(), out.dims(), axis);
-      phi::SumKernel<T, GPUContext>(
+      SumKernel<T, GPUContext>(
           dev_ctx, dout, reduce_dims, dout.dtype(), false, dx);
     }
   }
@@ -323,7 +323,7 @@ void DefaultElementwiseAddGrad(const GPUContext &dev_ctx,
     } else {
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(y.dims(), out.dims(), axis);
-      phi::SumKernel<T, GPUContext>(
+      SumKernel<T, GPUContext>(
           dev_ctx, dout, reduce_dims, dout.dtype(), false, dy);
     }
   }
@@ -432,7 +432,7 @@ void default_elementwise_sub_grad(const GPUContext &dev_ctx,
       }
       std::vector<int> reduce_dims =
           funcs::GetReduceDim(x.dims(), out.dims(), axis);
-      phi::SumKernel<T, GPUContext>(
+      SumKernel<T, GPUContext>(
           dev_ctx, dout, reduce_dims, dout.dtype(), false, dx);
     }
   }
diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
index 2e09dca7a9648a..54a64e344acd34 100644
--- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
@@ -67,7 +67,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx,
                               const optional<DenseTensor>& ddy,
                               int axis,
                               DenseTensor* ddout) {
-  phi::SubtractDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
+  SubtractDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
 }
 
 template <typename T, typename Context>
@@ -184,12 +184,12 @@ void AddGradKernel(const Context& dev_ctx,
 #ifdef PADDLE_WITH_CUDA
   if (x.dtype() == DataType::FLOAT32 &&
       (y.dtype() == DataType::FLOAT16 || y.dtype() == DataType::BFLOAT16)) {
-    phi::MixedPrecisionAddGradImpl<float>(
+    MixedPrecisionAddGradImpl<float>(
         dev_ctx, x, y, dout, axis, dx, dy, MixedPrecisionAddGradFunc<float>);
     return;
   }
 #endif
-  phi::AddGradImpl<T>(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc<T>);
+  AddGradImpl<T>(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc<T>);
 }
 
 template <typename T, typename Context>
@@ -200,7 +200,7 @@ void AddDoubleGradKernel(const Context& dev_ctx,
                          const optional<DenseTensor>& ddy,
                          int axis,
                          DenseTensor* ddout) {
-  phi::AddDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
+  AddDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
 }
 
 template <typename T, typename Context>
@@ -211,7 +211,7 @@ void AddTripleGradKernel(const Context& dev_ctx,
                          int axis,
                          DenseTensor* d_ddx,
                          DenseTensor* d_ddy) {
-  phi::AddGradImpl<T>(
+  AddGradImpl<T>(
       dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc<T>);
 }
 
diff --git a/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu
index 1a23afd87566c0..e68b3acbddddd9 100644
--- a/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu
+++ b/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu
@@ -33,7 +33,7 @@ namespace phi {
 
 template <typename T, typename IndexT>
 __global__ void EmbeddingGradAddTo(T* main_grad_out,
-                                   const phi::bfloat16* out_grad,
+                                   const bfloat16* out_grad,
                                    const IndexT* token_indices,
                                    const int64_t num_tokens,
                                    const int64_t token_length) {
@@ -44,7 +44,7 @@ __global__ void EmbeddingGradAddTo(T* main_grad_out,
 
   while (idy < num_tokens) {
     auto id = static_cast<int64_t>(token_indices[idy]);
-    const phi::bfloat16* token_out_grad = out_grad + idy * token_length;
+    const bfloat16* token_out_grad = out_grad + idy * token_length;
     T* token_main_grad = main_grad_out + id * token_length;
     for (int64_t i = idx; i < token_length; i += blockDim.x) {
       CudaAtomicAdd(&token_main_grad[i], static_cast<T>(token_out_grad[i]));
@@ -77,8 +77,8 @@ struct EmbeddingGradAddToCUDAFunctor {
       auto main_grad_out_t = main_grad_out_;
       const auto* token_indices = token_indices_.template data<IndexT>();
       T* main_grad_out = dev_ctx_.template Alloc<T>(main_grad_out_t);
-      const phi::bfloat16* out_grad = reinterpret_cast<const phi::bfloat16*>(
-          out_grad_.template data<phi::bfloat16>());
+      const bfloat16* out_grad = reinterpret_cast<const bfloat16*>(
+          out_grad_.template data<bfloat16>());
 
       const int gridx = 2 * dev_ctx_.GetSMCount();
       dim3 threads(128, 8);
diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
index 259f108adedf53..31617e4bd4d329 100644
--- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
@@ -187,7 +187,7 @@ struct EmbeddingSparseGradCUDAFunctor {
     new_rows.resize(ids_num);
     auto gpu_place = dev_ctx_.GetPlace();
 
-    phi::MixVector<int64_t> mixv_new_rows(&new_rows);
+    MixVector<int64_t> mixv_new_rows(&new_rows);
     if (!std::is_same<IdT, int64_t>::value) {
       InputTypeConvert<<<grids, threads, 0, stream>>>(
           ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
diff --git a/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu
index 8fb4098f395065..af278b5f8ac7e7 100644
--- a/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu
@@ -102,7 +102,7 @@ __global__ void ScaleGradKernel(const int* count_data,
                                 int64_t num_weights,
                                 int64_t num_weight_dim,
                                 T* table) {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < num_weights) {
     MPType freq = static_cast<MPType>(count_data[idx]);
diff --git a/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu
index 7542d8be1bba24..f8112a8ef8963c 100644
--- a/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu
@@ -51,7 +51,7 @@ void ExpandAsGradKernel(const Context& dev_ctx,
   } else {
     std::vector<int> reduce_dims = funcs::GetReduceDim(in_dims, out_dims, -1);
 
-    phi::SumKernel<T, Context>(
+    SumKernel<T, Context>(
         dev_ctx, out_grad, reduce_dims, out_grad.dtype(), false, in_grad);
   }
 }
diff --git a/paddle/phi/kernels/gpu/expand_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_grad_kernel.cu
index e4fa8f88c35724..465c0db628f9d4 100644
--- a/paddle/phi/kernels/gpu/expand_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/expand_grad_kernel.cu
@@ -44,7 +44,7 @@ void ExpandGradKernel(const Context& dev_ctx,
   } else {
     std::vector<int> reduce_dims =
         funcs::GetReduceDim(x_grad->dims(), out_grad.dims(), -1);
-    phi::SumKernel<T, Context>(
+    SumKernel<T, Context>(
         dev_ctx, out_grad, reduce_dims, out_grad.dtype(), false, x_grad);
   }
 }
diff --git a/paddle/phi/kernels/gpu/exponential_kernel.cu b/paddle/phi/kernels/gpu/exponential_kernel.cu
index 11747e7adefc1f..9e1694d5e1df2d 100644
--- a/paddle/phi/kernels/gpu/exponential_kernel.cu
+++ b/paddle/phi/kernels/gpu/exponential_kernel.cu
@@ -25,7 +25,7 @@ void ExponentialKernel(const Context &dev_ctx,
                        const DenseTensor &x,
                        float lambda,
                        DenseTensor *out) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   funcs::uniform_distribution<MT> dist;
   funcs::exponential_transform<MT> trans(lambda);
   funcs::distribution_and_transform<T>(dev_ctx, out, dist, trans);
diff --git a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
index 059468345ead84..31f56972ff0ba8 100644
--- a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
@@ -386,13 +386,13 @@ void FlashAttnUnpaddedGradBaseKernel(const Context& dev_ctx,
   if (!is_mha) {
     if (dk) {
       if (dk->meta().is_contiguous())
-        phi::SumKernel<T, Context>(dev_ctx, dk_tmp, {2}, dk->type(), false, dk);
+        SumKernel<T, Context>(dev_ctx, dk_tmp, {2}, dk->type(), false, dk);
       else
         kvReduceForGQA<T, Context>(dev_ctx, dk_tmp, dk);
     }
     if (dv) {
       if (dv->meta().is_contiguous())
-        phi::SumKernel<T, Context>(dev_ctx, dv_tmp, {2}, dv->type(), false, dv);
+        SumKernel<T, Context>(dev_ctx, dv_tmp, {2}, dv->type(), false, dv);
       else
         kvReduceForGQA<T, Context>(dev_ctx, dv_tmp, dv);
     }
@@ -693,27 +693,27 @@ void FlashAttnGradBaseKernel(const Context& dev_ctx,
     dev_ctx.template Alloc<T>(&flashmask_maxmin);
 
     downstart_row_indices =
-        phi::Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {0}, {1});
+        Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {0}, {1});
     downstart_row_indices_data = downstart_row_indices.data();
     if (startend_row_indices->dims()[3] == 2) {
       if (!causal) {
-        upend_row_indices = phi::Slice<int32_t>(
-            dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
+        upend_row_indices =
+            Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
         upend_row_indices_data = upend_row_indices.data();
       } else {
-        downend_row_indices = phi::Slice<int32_t>(
-            dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
+        downend_row_indices =
+            Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
         downend_row_indices_data = downend_row_indices.data();
       }
     } else if (startend_row_indices->dims()[3] == 4) {
-      upend_row_indices = phi::Slice<int32_t>(
-          dev_ctx, startend_row_indices.get(), {3}, {3}, {4});
+      upend_row_indices =
+          Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {3}, {4});
       upend_row_indices_data = upend_row_indices.data();
-      downend_row_indices = phi::Slice<int32_t>(
-          dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
+      downend_row_indices =
+          Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
       downend_row_indices_data = downend_row_indices.data();
-      upstart_row_indices = phi::Slice<int32_t>(
-          dev_ctx, startend_row_indices.get(), {3}, {2}, {3});
+      upstart_row_indices =
+          Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {2}, {3});
       upstart_row_indices_data = upstart_row_indices.data();
     }
   }
@@ -888,16 +888,14 @@ void FlashAttnGradBaseKernel(const Context& dev_ctx,
     if (!is_mha) {
       if (dk) {
         if (dk->meta().is_contiguous())
-          phi::SumKernel<T, Context>(
-              dev_ctx, dk_tmp, {3}, dk->type(), false, dk);
+          SumKernel<T, Context>(dev_ctx, dk_tmp, {3}, dk->type(), false, dk);
         else
           kvReduceBatchedForGQA<T, Context>(dev_ctx, dk_tmp, dk);
       }
 
       if (dv) {
         if (dv->meta().is_contiguous())
-          phi::SumKernel<T, Context>(
-              dev_ctx, dv_tmp, {3}, dv->type(), false, dv);
+          SumKernel<T, Context>(dev_ctx, dv_tmp, {3}, dv->type(), false, dv);
         else
           kvReduceBatchedForGQA<T, Context>(dev_ctx, dv_tmp, dv);
       }
diff --git a/paddle/phi/kernels/gpu/flash_attn_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_kernel.cu
index 127ccb256c6285..07dc127414e624 100644
--- a/paddle/phi/kernels/gpu/flash_attn_kernel.cu
+++ b/paddle/phi/kernels/gpu/flash_attn_kernel.cu
@@ -451,27 +451,27 @@ void FlashAttnBaseKernel(const Context& dev_ctx,
     dev_ctx.template Alloc<T>(&flashmask_maxmin);
 
     downstart_row_indices =
-        phi::Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {0}, {1});
+        Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {0}, {1});
     downstart_row_indices_data = downstart_row_indices.data();
     if (startend_row_indices->dims()[3] == 2) {
       if (!causal) {
-        upend_row_indices = phi::Slice<int32_t>(
-            dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
+        upend_row_indices =
+            Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
         upend_row_indices_data = upend_row_indices.data();
       } else {
-        downend_row_indices = phi::Slice<int32_t>(
-            dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
+        downend_row_indices =
+            Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
         downend_row_indices_data = downend_row_indices.data();
       }
     } else if (startend_row_indices->dims()[3] == 4) {
-      upend_row_indices = phi::Slice<int32_t>(
-          dev_ctx, startend_row_indices.get(), {3}, {3}, {4});
+      upend_row_indices =
+          Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {3}, {4});
       upend_row_indices_data = upend_row_indices.data();
-      downend_row_indices = phi::Slice<int32_t>(
-          dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
+      downend_row_indices =
+          Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {1}, {2});
       downend_row_indices_data = downend_row_indices.data();
-      upstart_row_indices = phi::Slice<int32_t>(
-          dev_ctx, startend_row_indices.get(), {3}, {2}, {3});
+      upstart_row_indices =
+          Slice<int32_t>(dev_ctx, startend_row_indices.get(), {3}, {2}, {3});
       upstart_row_indices_data = upstart_row_indices.data();
     }
   }
diff --git a/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu
index c50ef02a78294c..c542e9da10b4b7 100644
--- a/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu
@@ -1032,22 +1032,22 @@ void FlashMaskV2GradBaseKernel(
     dev_ctx.template Alloc<int32_t>(&flashmask_maxmin);
 
     lt_start_row_indices =
-        phi::Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {0}, {1});
+        Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {0}, {1});
     if (startend_row_indices.dims()[3] == 2) {
       if (!is_causal) {
         ut_end_row_indices =
-            phi::Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {1}, {2});
+            Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {1}, {2});
       } else {
         lt_end_row_indices =
-            phi::Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {1}, {2});
+            Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {1}, {2});
       }
     } else if (startend_row_indices.dims()[3] == 4) {
       ut_end_row_indices =
-          phi::Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {3}, {4});
+          Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {3}, {4});
       lt_end_row_indices =
-          phi::Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {1}, {2});
+          Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {1}, {2});
       ut_start_row_indices =
-          phi::Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {2}, {3});
+          Slice<int32_t>(dev_ctx, startend_row_indices, {3}, {2}, {3});
     }
   }
 
@@ -1393,7 +1393,7 @@ void FlashMaskV2GradBaseKernel(
                    // different from hdim_qk for now
   DenseTensor tile_count_semaphore;
   if (arch >= 90) {
-    tile_count_semaphore = phi::Full<int32_t, Context>(dev_ctx, {1}, 0);
+    tile_count_semaphore = Full<int32_t, Context>(dev_ctx, {1}, 0);
     dynload::flashmaskv2_bwd_params_set_tile_count_semaphore(
         params_handle, tile_count_semaphore.data<int>());
   } else {
diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu
index 5330c135233760..9298e0c965788c 100644
--- a/paddle/phi/kernels/gpu/flip_kernel.cu
+++ b/paddle/phi/kernels/gpu/flip_kernel.cu
@@ -24,9 +24,9 @@ namespace phi {
 template <typename T>
 __global__ void FlipCudaKernel(const T* in_data,
                                T* out_data,
-                               phi::Array<int64_t, DDim::kMaxRank> shape,
-                               phi::Array<int64_t, DDim::kMaxRank> stride,
-                               phi::Array<int, DDim::kMaxRank> flip_dims,
+                               Array<int64_t, DDim::kMaxRank> shape,
+                               Array<int64_t, DDim::kMaxRank> stride,
+                               Array<int, DDim::kMaxRank> flip_dims,
                                const int rank,
                                const int64_t numel,
                                const int flip_dims_size) {
@@ -79,9 +79,9 @@ void FlipKernel(const Context& dev_ctx,
   size_t flip_dims_size = axis.size();
   auto x_stride = common::stride(x_dims);
 
-  phi::Array<int64_t, DDim::kMaxRank> stride_array;
-  phi::Array<int64_t, DDim::kMaxRank> shape_array;
-  phi::Array<int, DDim::kMaxRank> flip_dims_array;
+  Array<int64_t, DDim::kMaxRank> stride_array;
+  Array<int64_t, DDim::kMaxRank> shape_array;
+  Array<int, DDim::kMaxRank> flip_dims_array;
 
   for (int i = 0; i < rank; ++i) {
     stride_array[i] = x_stride[i];
@@ -93,7 +93,7 @@ void FlipKernel(const Context& dev_ctx,
     }
   }
 
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel);
+  auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel);
   FlipCudaKernel<T>
       <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
           in_data,
diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu
index 71f022b58b4377..828a0a7c298ad7 100644
--- a/paddle/phi/kernels/gpu/full_kernel.cu
+++ b/paddle/phi/kernels/gpu/full_kernel.cu
@@ -71,14 +71,13 @@ void FullLikeKernel(const Context& dev_ctx,
   // the operator is 0
   int64_t numel = out->numel();
 
-  if (!std::is_same<T, phi::complex64>::value &&
-      !std::is_same<T, phi::complex128>::value &&
-      !std::is_same<T, int64_t>::value) {
+  if (!std::is_same<T, complex64>::value &&
+      !std::is_same<T, complex128>::value && !std::is_same<T, int64_t>::value) {
     auto value = val.to<double>();
     using CommonType = typename std::common_type<
         float,
-        typename std::conditional<std::is_same<T, phi::float16>::value ||
-                                      std::is_same<T, phi::bfloat16>::value,
+        typename std::conditional<std::is_same<T, float16>::value ||
+                                      std::is_same<T, bfloat16>::value,
                                   float,
                                   T>::type>::type;
     auto common_type_value = static_cast<CommonType>(value);
diff --git a/paddle/phi/kernels/gpu/fused_adam_kernel.cu b/paddle/phi/kernels/gpu/fused_adam_kernel.cu
index f4261ffcbfb9ac..371632204d6e91 100644
--- a/paddle/phi/kernels/gpu/fused_adam_kernel.cu
+++ b/paddle/phi/kernels/gpu/fused_adam_kernel.cu
@@ -31,7 +31,7 @@ namespace phi {
 
 template <typename T, bool CPUBetaPows /*=true*/>
 struct FusedAdamBetaPowInfo {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   FusedAdamBetaPowInfo(const MT* beta1pow, const MT* beta2pow) {
     beta1pow_ = *beta1pow;
     beta2pow_ = *beta2pow;
@@ -48,7 +48,7 @@ struct FusedAdamBetaPowInfo {
 
 template <typename T>
 struct FusedAdamBetaPowInfo<T, /*CPUBetaPows=*/false> {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
   FusedAdamBetaPowInfo(const MT* beta1pow, const MT* beta2pow) {
     beta1pow_ = beta1pow;
     beta2pow_ = beta2pow;
@@ -286,7 +286,7 @@ static void CopyTensorIfDifferent(const Context& dev_ctx,
   for (size_t i = 0; i < src.size(); ++i) {
     if (src[i] != dst[i]) {
       VLOG(10) << "Copy Tensor " << i;
-      phi::Place place = (use_src_place ? src[i]->place() : dev_ctx.GetPlace());
+      Place place = (use_src_place ? src[i]->place() : dev_ctx.GetPlace());
       Copy<Context>(dev_ctx, *(src[i]), place, false, dst[i]);
     }
   }
@@ -330,7 +330,7 @@ PADDLE_API void FusedAdamKernel(
     std::vector<DenseTensor*> beta1_pows_out,
     std::vector<DenseTensor*> beta2_pows_out,
     std::vector<DenseTensor*> master_params_out) {
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MT = typename dtype::MPTypeTrait<T>::Type;
 
   auto n = params.size();
   auto beta1_pow_first = beta1_pows[0];
diff --git a/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu b/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu
index f6493b89b46ecb..0014b33005f170 100644
--- a/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu
+++ b/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu
@@ -1056,7 +1056,7 @@ struct AffineQuantStore {
       float normalized_val =
           normalized_i * static_cast<float>(gamma_pack.elem[i]) +
           static_cast<float>(beta_pack.elem[i]);
-      if constexpr (std::is_same_v<OutType, phi::float8_e4m3fn>) {
+      if constexpr (std::is_same_v<OutType, float8_e4m3fn>) {
         y_pack.elem[i] = FP8QuantHelperFunc<float, OutType>(normalized_val,
                                                             quant_out_scale,
                                                             quant_round_type,
@@ -1138,7 +1138,7 @@ void RmsNormQuantKernel(const Context& dev_ctx,
                           quant_scale));
   }
 
-  using ComputeType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using ComputeType = typename dtype::MPTypeTrait<T>::Type;
 
   const T* x_data = x.data<T>();
   const T* norm_weight_data = norm_weight.data<T>();
@@ -1181,9 +1181,8 @@ void RmsNormQuantKernel(const Context& dev_ctx,
           dev_ctx.stream(), load, store, rows, cols, epsilon, inv_var_data);
     } else if (out->dtype() == DataType::FLOAT8_E4M3FN) {
       // Quantize and output float8_e4m3fn.
-      phi::float8_e4m3fn* out_data =
-          dev_ctx.template Alloc<phi::float8_e4m3fn>(out);
-      AffineQuantStore<phi::float8_e4m3fn, ComputeType, T, true, true> store(
+      float8_e4m3fn* out_data = dev_ctx.template Alloc<float8_e4m3fn>(out);
+      AffineQuantStore<float8_e4m3fn, ComputeType, T, true, true> store(
           out_data,
           cols,
           norm_weight_data,
@@ -1220,9 +1219,8 @@ void RmsNormQuantKernel(const Context& dev_ctx,
           dev_ctx.stream(), load, store, rows, cols, epsilon, inv_var_data);
     } else if (out->dtype() == DataType::FLOAT8_E4M3FN) {
       // Quantize and output float8_e4m3fn.
-      phi::float8_e4m3fn* out_data =
-          dev_ctx.template Alloc<phi::float8_e4m3fn>(out);
-      AffineQuantStore<phi::float8_e4m3fn, ComputeType, T, true, true> store(
+      float8_e4m3fn* out_data = dev_ctx.template Alloc<float8_e4m3fn>(out);
+      AffineQuantStore<float8_e4m3fn, ComputeType, T, true, true> store(
           out_data,
           cols,
           norm_weight_data,
@@ -1256,7 +1254,7 @@ void ResidualAddRmsNormWrapper(const Context& dev_ctx,
                                const int cols,
                                T* residual_output,
                                T* output) {
-  using ComputeType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using ComputeType = typename dtype::MPTypeTrait<T>::Type;
   ResidualAddBiasLoad<T, ComputeType> load(
       x, residual, bias, residual_output, cols);
   AffineStore<ComputeType, T> store(output, cols, norm_weight, norm_bias);
@@ -1265,28 +1263,28 @@ void ResidualAddRmsNormWrapper(const Context& dev_ctx,
 }
 
 template void ResidualAddRmsNormWrapper(const GPUContext& dev_ctx,
-                                        const phi::float16* x,
-                                        const phi::float16* residual,
-                                        const phi::float16* bias,
-                                        const phi::float16* norm_weight,
-                                        const phi::float16* norm_bias,
+                                        const float16* x,
+                                        const float16* residual,
+                                        const float16* bias,
+                                        const float16* norm_weight,
+                                        const float16* norm_bias,
                                         const float epsilon,
                                         const int rows,
                                         const int cols,
-                                        phi::float16* residual_output,
-                                        phi::float16* output);
+                                        float16* residual_output,
+                                        float16* output);
 
 template void ResidualAddRmsNormWrapper(const GPUContext& dev_ctx,
-                                        const phi::bfloat16* x,
-                                        const phi::bfloat16* residual,
-                                        const phi::bfloat16* bias,
-                                        const phi::bfloat16* norm_weight,
-                                        const phi::bfloat16* norm_bias,
+                                        const bfloat16* x,
+                                        const bfloat16* residual,
+                                        const bfloat16* bias,
+                                        const bfloat16* norm_weight,
+                                        const bfloat16* norm_bias,
                                         const float epsilon,
                                         const int rows,
                                         const int cols,
-                                        phi::bfloat16* residual_output,
-                                        phi::bfloat16* output);
+                                        bfloat16* residual_output,
+                                        bfloat16* output);
 
 template void ResidualAddRmsNormWrapper(const GPUContext& dev_ctx,
                                         const float* x,
@@ -1309,7 +1307,7 @@ void RmsNormWrapper(const Context& dev_ctx,
                     const int rows,
                     const int cols,
                     T* output) {
-  using ComputeType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using ComputeType = typename dtype::MPTypeTrait<T>::Type;
 
   DirectLoad<T, ComputeType> load(x, cols);
   AffineStore<ComputeType, T> store(output, cols, weight, bias);
@@ -1318,22 +1316,22 @@ void RmsNormWrapper(const Context& dev_ctx,
 }
 
 template void RmsNormWrapper(const GPUContext& dev_ctx,
-                             const phi::float16* x,
-                             const phi::float16* weight,
-                             const phi::float16* bias,
+                             const float16* x,
+                             const float16* weight,
+                             const float16* bias,
                              const float epsilon,
                              const int rows,
                              const int cols,
-                             phi::float16* output);
+                             float16* output);
 
 template void RmsNormWrapper(const GPUContext& dev_ctx,
-                             const phi::bfloat16* x,
-                             const phi::bfloat16* weight,
-                             const phi::bfloat16* bias,
+                             const bfloat16* x,
+                             const bfloat16* weight,
+                             const bfloat16* bias,
                              const float epsilon,
                              const int rows,
                              const int cols,
-                             phi::bfloat16* output);
+                             bfloat16* output);
 
 template void RmsNormWrapper(const GPUContext& dev_ctx,
                              const float* x,
diff --git a/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu b/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu
index 2c6feb325ac789..10c3145f6f0309 100644
--- a/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu
+++ b/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu
@@ -148,8 +148,8 @@ void FusedTokenPruneOpCUDAKernel(const Context& dev_ctx,
       dev_ctx, attn_tmp, false, reduce_dims, attn_accu.dtype(), &attn_accu);
 
   // 3. Prepare token indices
-  phi::backends::gpu::GpuLaunchConfig config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * max_seq_len);
+  backends::gpu::GpuLaunchConfig config =
+      backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * max_seq_len);
   FillIndex<<<config.block_per_grid,
               config.thread_per_block,
               0,
@@ -158,7 +158,7 @@ void FusedTokenPruneOpCUDAKernel(const Context& dev_ctx,
   // 4. Sort token indices by attn
   if (keep_first_token) {
     T max = std::numeric_limits<T>::max();
-    config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz);
+    config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz);
     MaximumFirst<T>
         <<<config.block_per_grid,
            config.thread_per_block,
@@ -258,8 +258,7 @@ void FusedTokenPruneOpCUDAKernel(const Context& dev_ctx,
          slimmed_indices);
   }
   // 7. Get slimmed X by indices
-  config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * slimmed_x_len);
+  config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * slimmed_x_len);
   TakeAlongAxis<T>
       <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
           x.data<T>(),
diff --git a/paddle/phi/kernels/gpu/gather_nd_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_kernel.cu
index 8cb5c5028415f6..18f0b880a55793 100644
--- a/paddle/phi/kernels/gpu/gather_nd_kernel.cu
+++ b/paddle/phi/kernels/gpu/gather_nd_kernel.cu
@@ -40,7 +40,7 @@ void GatherNdKernel(const Context &dev_ctx,
     for (int i = 0; i < x.dims().size(); ++i) {
       out_dims.emplace_back(1);
     }
-    phi::TileKernel<T, Context>(dev_ctx, x, phi::IntArray(out_dims), out);
+    TileKernel<T, Context>(dev_ctx, x, IntArray(out_dims), out);
     return;
   }
   if (index.dims()[0] == 0 && index.numel() == 0) return;
diff --git a/paddle/phi/kernels/gpu/gaussian_kernel.cu b/paddle/phi/kernels/gpu/gaussian_kernel.cu
index d105035fe1f28a..7eb7857015c95e 100644
--- a/paddle/phi/kernels/gpu/gaussian_kernel.cu
+++ b/paddle/phi/kernels/gpu/gaussian_kernel.cu
@@ -29,7 +29,7 @@
 namespace phi {
 
 template <typename T>
-using ComplexType = phi::dtype::complex<T>;
+using ComplexType = dtype::complex<T>;
 
 template <typename T>
 struct GaussianGenerator {
@@ -46,7 +46,7 @@ struct GaussianGenerator {
   __host__ __device__ T operator()(const unsigned int n) const {
     thrust::minstd_rand rng;
     rng.seed(seed_);
-    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+    using MT = typename dtype::MPTypeTrait<T>::Type;
     thrust::normal_distribution<MT> dist(static_cast<MT>(mean_),
                                          static_cast<MT>(std_));
     unsigned int new_n = n + offset_;
@@ -86,8 +86,8 @@ struct GaussianGenerator<ComplexType<T>> {
 // If T is not complex
 template <typename T,
           typename Context,
-          std::enable_if_t<!std::is_same<T, phi::complex64>::value &&
-                               !std::is_same<T, phi::complex128>::value,
+          std::enable_if_t<!std::is_same<T, complex64>::value &&
+                               !std::is_same<T, complex128>::value,
                            bool> = true>
 void GaussianRandom(const Context& dev_ctx,
                     const IntArray& shape,
@@ -100,7 +100,7 @@ void GaussianRandom(const Context& dev_ctx,
   dev_ctx.template Alloc<T>(out);
   if (seed == 0) {
     // use global Generator seed
-    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+    using MT = typename dtype::MPTypeTrait<T>::Type;
     funcs::normal_distribution<MT> dist;
     funcs::normal_transform<MT> trans(static_cast<MT>(mean),
                                       static_cast<MT>(std));
@@ -116,8 +116,8 @@ void GaussianRandom(const Context& dev_ctx,
 // If T is complex
 template <typename T,
           typename Context,
-          std::enable_if_t<std::is_same<T, phi::complex64>::value ||
-                               std::is_same<T, phi::complex128>::value,
+          std::enable_if_t<std::is_same<T, complex64>::value ||
+                               std::is_same<T, complex128>::value,
                            bool> = true>
 void GaussianRandom(const Context& dev_ctx,
                     const IntArray& shape,
@@ -137,15 +137,14 @@ void GaussianRandom(const Context& dev_ctx,
     out_imag.Resize(shape.GetData());
     dev_ctx.template Alloc<T>(&out_real);
     dev_ctx.template Alloc<T>(&out_imag);
-    funcs::normal_distribution<phi::dtype::Real<T>> dist;
-    funcs::normal_distribution<phi::dtype::Real<T>> dist_imag;
-    funcs::normal_transform<phi::dtype::Real<T>> trans(mean,
-                                                       std_of_real_or_imag);
-    funcs::distribution_and_transform<phi::dtype::Real<T>>(
+    funcs::normal_distribution<dtype::Real<T>> dist;
+    funcs::normal_distribution<dtype::Real<T>> dist_imag;
+    funcs::normal_transform<dtype::Real<T>> trans(mean, std_of_real_or_imag);
+    funcs::distribution_and_transform<dtype::Real<T>>(
         dev_ctx, &out_real, dist, trans);
-    funcs::distribution_and_transform<phi::dtype::Real<T>>(
+    funcs::distribution_and_transform<dtype::Real<T>>(
         dev_ctx, &out_imag, dist_imag, trans);
-    phi::ComplexKernel<phi::dtype::Real<T>>(dev_ctx, out_real, out_imag, out);
+    ComplexKernel<dtype::Real<T>>(dev_ctx, out_real, out_imag, out);
   } else {
     // use OP seed
     auto func = GaussianGenerator<T>(mean, std_of_real_or_imag, seed);
@@ -156,8 +155,8 @@ void GaussianRandom(const Context& dev_ctx,
 // If T is not complex
 template <typename T,
           typename Context,
-          std::enable_if_t<!std::is_same<T, phi::complex64>::value &&
-                               !std::is_same<T, phi::complex128>::value,
+          std::enable_if_t<!std::is_same<T, complex64>::value &&
+                               !std::is_same<T, complex128>::value,
                            bool> = true>
 void GaussianRandomInplace(const Context& dev_ctx,
                            const DenseTensor& x,
@@ -168,7 +167,7 @@ void GaussianRandomInplace(const Context& dev_ctx,
   dev_ctx.template Alloc<T>(out);
   if (seed == 0) {
     // use global Generator seed
-    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+    using MT = typename dtype::MPTypeTrait<T>::Type;
     funcs::normal_distribution<MT> dist;
     funcs::normal_transform<MT> trans(static_cast<MT>(mean),
                                       static_cast<MT>(std));
@@ -184,8 +183,8 @@ void GaussianRandomInplace(const Context& dev_ctx,
 // If T is complex
 template <typename T,
           typename Context,
-          std::enable_if_t<std::is_same<T, phi::complex64>::value ||
-                               std::is_same<T, phi::complex128>::value,
+          std::enable_if_t<std::is_same<T, complex64>::value ||
+                               std::is_same<T, complex128>::value,
                            bool> = true>
 void GaussianRandomInplace(const Context& dev_ctx,
                            const DenseTensor& x,
@@ -203,15 +202,14 @@ void GaussianRandomInplace(const Context& dev_ctx,
     out_imag.Resize(x.dims());
     dev_ctx.template Alloc<T>(&out_real);
     dev_ctx.template Alloc<T>(&out_imag);
-    funcs::normal_distribution<phi::dtype::Real<T>> dist;
-    funcs::normal_distribution<phi::dtype::Real<T>> dist_imag;
-    funcs::normal_transform<phi::dtype::Real<T>> trans(mean,
-                                                       std_of_real_or_imag);
-    funcs::distribution_and_transform<phi::dtype::Real<T>>(
+    funcs::normal_distribution<dtype::Real<T>> dist;
+    funcs::normal_distribution<dtype::Real<T>> dist_imag;
+    funcs::normal_transform<dtype::Real<T>> trans(mean, std_of_real_or_imag);
+    funcs::distribution_and_transform<dtype::Real<T>>(
         dev_ctx, &out_real, dist, trans);
-    funcs::distribution_and_transform<phi::dtype::Real<T>>(
+    funcs::distribution_and_transform<dtype::Real<T>>(
         dev_ctx, &out_imag, dist_imag, trans);
-    phi::ComplexKernel<phi::dtype::Real<T>>(dev_ctx, out_real, out_imag, out);
+    ComplexKernel<dtype::Real<T>>(dev_ctx, out_real, out_imag, out);
   } else {
     // use OP seed
     auto func = GaussianGenerator<T>(mean, std_of_real_or_imag, seed);
diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
index a2da0717bba5d7..bdc2265e636c27 100644
--- a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
@@ -28,7 +28,7 @@ namespace phi {
 
 template <typename T>
 struct GeluWithApproximateGradFunctor {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   inline HOSTDEVICE T operator()(T arg_x, T arg_dout) {
     MPType x = static_cast<MPType>(arg_x);
     MPType dout = static_cast<MPType>(arg_dout);
@@ -54,7 +54,7 @@ struct GeluWithApproximateGradFunctor {
 
 template <typename T>
 struct GeluWithoutApproximateGradFunctor {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   inline HOSTDEVICE T operator()(T arg_x, T arg_dout) {
     MPType x = static_cast<MPType>(arg_x);
     MPType dout = static_cast<MPType>(arg_dout);
diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu
index b276ca277311df..a9058c157774cf 100644
--- a/paddle/phi/kernels/gpu/gelu_kernel.cu
+++ b/paddle/phi/kernels/gpu/gelu_kernel.cu
@@ -32,7 +32,7 @@ namespace phi {
 
 template <typename T>
 struct GeluWithApproximateFunctor {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   inline HOSTDEVICE T operator()(T arg_x) {
     // this function is tanh approximation of gelu
     MPType x = static_cast<MPType>(arg_x);
@@ -48,7 +48,7 @@ struct GeluWithApproximateFunctor {
 
 template <typename T>
 struct GeluWithoutApproximateFunctor {
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using MPType = typename dtype::MPTypeTrait<T>::Type;
   inline HOSTDEVICE T operator()(T arg_x) {
     // actual gelu with approximation = false
     MPType x = static_cast<MPType>(arg_x);
diff --git a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu
index 7015d483d2188f..bed31526f6ccb7 100644
--- a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu
+++ b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu
@@ -68,7 +68,7 @@ static void SortDescending(const GPUContext &dev_ctx,
                                                     dev_ctx.stream());
   // Allocate temporary storage
   auto place = dev_ctx.GetPlace();
-  auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
+  auto d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes);
 
   // Run sorting operation
   cub::DeviceRadixSort::SortPairsDescending<T, int>(d_temp_storage->ptr(),
@@ -293,10 +293,10 @@ static void NMS(const GPUContext &dev_ctx,
 
   const T *boxes = proposals.data<T>();
   auto place = dev_ctx.GetPlace();
-  auto mask_ptr = phi::memory_utils::Alloc(
-      place,
-      boxes_num * col_blocks * sizeof(uint64_t),
-      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  auto mask_ptr =
+      memory_utils::Alloc(place,
+                          boxes_num * col_blocks * sizeof(uint64_t),
+                          Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
   uint64_t *mask_dev = reinterpret_cast<uint64_t *>(mask_ptr->ptr());
 
   NMSKernel<<<blocks, threads, 0, dev_ctx.stream()>>>(
diff --git a/paddle/phi/kernels/gpu/global_gather_kernel.cu b/paddle/phi/kernels/gpu/global_gather_kernel.cu
index a7e31903e24daf..0663075345a1d0 100644
--- a/paddle/phi/kernels/gpu/global_gather_kernel.cu
+++ b/paddle/phi/kernels/gpu/global_gather_kernel.cu
@@ -76,15 +76,15 @@ struct GlobalGatherFunctor<GPUContext, T> {
       cpu_global_count_data = cpu_global_count.data<int64_t>();
     }
 
-    ncclDataType_t dtype = phi::ToNCCLDataType(x->dtype());
+    ncclDataType_t dtype = ToNCCLDataType(x->dtype());
     gpuStream_t stream = nullptr;
     stream = dev_ctx.stream();
 
-    phi::distributed::NCCLCommContext *comm_ctx = nullptr;
+    distributed::NCCLCommContext *comm_ctx = nullptr;
     int nranks = 0;
 
-    comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
-        dev_ctx.GetCommContext());
+    comm_ctx =
+        static_cast<distributed::NCCLCommContext *>(dev_ctx.GetCommContext());
     PADDLE_ENFORCE_NE(comm_ctx,
                       nullptr,
                       common::errors::Unavailable(
diff --git a/paddle/phi/kernels/gpu/global_scatter_kernel.cu b/paddle/phi/kernels/gpu/global_scatter_kernel.cu
index f14b3eea87517d..f1d4a4add322cf 100644
--- a/paddle/phi/kernels/gpu/global_scatter_kernel.cu
+++ b/paddle/phi/kernels/gpu/global_scatter_kernel.cu
@@ -75,14 +75,14 @@ struct GlobalScatterFunctor<GPUContext, T> {
       global_count_len = cpu_global_count.numel();
     }
 
-    ncclDataType_t dtype = phi::ToNCCLDataType(x->dtype());
+    ncclDataType_t dtype = ToNCCLDataType(x->dtype());
     gpuStream_t stream = nullptr;
     stream = dev_ctx.stream();
-    phi::distributed::NCCLCommContext* comm_ctx = nullptr;
+    distributed::NCCLCommContext* comm_ctx = nullptr;
     int nranks = 0;
 
-    comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
-        dev_ctx.GetCommContext());
+    comm_ctx =
+        static_cast<distributed::NCCLCommContext*>(dev_ctx.GetCommContext());
     PADDLE_ENFORCE_NE(comm_ctx,
                       nullptr,
                       common::errors::Unavailable(
diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
index 6cbdb172e0a9f6..e08075b69facc8 100644
--- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu
@@ -40,14 +40,14 @@ __global__ void InitializeHashTable(T* tensor, int len) {
 }
 
 template <typename T, typename Context>
-std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
-                                               const T* input,
-                                               int num_input,
-                                               int64_t len_hashtable,
-                                               T* keys,
-                                               int* values,
-                                               int* key_index,
-                                               int* final_nodes_len) {
+std::shared_ptr<Allocation> FillHashTable(const Context& dev_ctx,
+                                          const T* input,
+                                          int num_input,
+                                          int64_t len_hashtable,
+                                          T* keys,
+                                          int* values,
+                                          int* key_index,
+                                          int* final_nodes_len) {
   const auto place = dev_ctx.GetPlace();
 
   int block = 1024;
@@ -73,7 +73,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
 
   int total_unique_items = item_count[num_input];
   auto unique_items =
-      phi::memory_utils::AllocShared(place, total_unique_items * sizeof(T));
+      memory_utils::AllocShared(place, total_unique_items * sizeof(T));
   T* unique_items_data = reinterpret_cast<T*>(unique_items->ptr());
   *final_nodes_len = total_unique_items;
 
@@ -176,12 +176,11 @@ void Reindex(const Context& dev_ctx,
   int64_t log_num = 1 << static_cast<size_t>(1 + std::log2(num >> 1));
   int64_t table_size = log_num << 1;
 
-  auto keys =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T));
+  auto keys = memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T));
   auto values =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
+      memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
   auto key_index =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
+      memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
   T* keys_ptr = reinterpret_cast<T*>(keys->ptr());
   int* values_ptr = reinterpret_cast<int*>(values->ptr());
   int* key_index_ptr = reinterpret_cast<int*>(key_index->ptr());
@@ -197,7 +196,7 @@ void Reindex(const Context& dev_ctx,
           key_index_ptr, table_size);
 
   int unique_len = 0;
-  std::shared_ptr<phi::Allocation> unique_items =
+  std::shared_ptr<Allocation> unique_items =
       FillHashTable<T, Context>(dev_ctx,
                                 thrust::raw_pointer_cast(out_nodes->data()),
                                 out_nodes->size(),
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h
index 71c915ef916192..32d6ac6a5947c1 100644
--- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h
+++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h
@@ -42,7 +42,7 @@ struct GraphSendRecvMaxCUDAFunctor {
                                 T* output,
                                 const IndexT& in_i,
                                 const IndexT& out_i) {
-    phi::CudaAtomicMax(output + out_i, *(params + in_i));
+    CudaAtomicMax(output + out_i, *(params + in_i));
   }
 };
 
@@ -52,7 +52,7 @@ struct GraphSendRecvMinCUDAFunctor {
                                 T* output,
                                 const IndexT& in_i,
                                 const IndexT& out_i) {
-    phi::CudaAtomicMin(output + out_i, *(params + in_i));
+    CudaAtomicMin(output + out_i, *(params + in_i));
   }
 };
 
diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h
index 3d6eb173d10f47..30afc8569778f7 100644
--- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h
+++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h
@@ -109,14 +109,14 @@ struct GraphSendUERecvSumCUDAFunctor {
 template <typename T>
 struct GraphSendUERecvMaxCUDAFunctor {
   DEVICE inline void operator()(T* output, T val) {
-    phi::CudaAtomicMax(output, val);
+    CudaAtomicMax(output, val);
   }
 };
 
 template <typename T>
 struct GraphSendUERecvMinCUDAFunctor {
   DEVICE inline void operator()(T* output, T val) {
-    phi::CudaAtomicMin(output, val);
+    CudaAtomicMin(output, val);
   }
 };
 
diff --git a/paddle/phi/kernels/gpu/grid_sample_utils.h b/paddle/phi/kernels/gpu/grid_sample_utils.h
index 8097e6f7007837..66ee5d5fd50a18 100644
--- a/paddle/phi/kernels/gpu/grid_sample_utils.h
+++ b/paddle/phi/kernels/gpu/grid_sample_utils.h
@@ -62,14 +62,14 @@ inline bool cudnnIsAvailable() {
   return false;
 #elif defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // cuDNN/MIOpen version > 0 means DNN lib loaded; require v7+ for sampler
-  return phi::backends::gpu::DnnVersion() >= 7000;
+  return backends::gpu::DnnVersion() >= 7000;
 #else
   return false;
 #endif
 }
 
 inline bool isGpuTensor(const DenseTensor& x) {
-  return phi::is_gpu_place(x.place());
+  return is_gpu_place(x.place());
 }
 
 inline bool canUse32bitIndexMath(const DenseTensor& x) {
diff --git a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu
index 5bb61893098ef9..1931e8042256f3 100644
--- a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu
@@ -160,7 +160,7 @@ template <typename AccT>
 __device__ __forceinline__ AccT GradWarpReduceSum(AccT val) {
 #pragma unroll
   for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
-    val += phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val, offset);
+    val += backends::gpu::CudaShuffleDownSync(0xffffffff, val, offset);
   }
   return val;
 }
@@ -752,7 +752,7 @@ void GroupNormGradKernel(const Context& dev_ctx,
     }
     return;
   }
-  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using AccT = typename dtype::MPTypeTrait<T>::Type;
   const DataLayout data_layout = StringToDataLayout(data_layout_str);
   const auto scale_ptr = scale.get_ptr();
   const auto bias_ptr = bias.get_ptr();
diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu
index 9d94f6d36c1d6c..ebbb4ad20199c0 100644
--- a/paddle/phi/kernels/gpu/group_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu
@@ -125,7 +125,7 @@ static int64_t findMaxDivisor(int64_t n, int64_t maxAllowedDivisor) {
 
 template <typename T, int THREADS_PER_CHANNEL>
 inline __device__ void UpdateSum(const T* srcX, float* sum, float* sumSq) {
-  float src_data = phi::__2float<T>(*srcX);
+  float src_data = __2float<T>(*srcX);
   *sum += src_data;
   *sumSq += src_data * src_data;
 }
@@ -135,8 +135,8 @@ inline __device__ void UpdateSum(const T* srcX,
                                  const T* srcR,
                                  float* sum,
                                  float* sumSq) {
-  float src_data = phi::__2float<T>(*srcX);
-  float srcy_data = phi::__2float<T>(*srcR);
+  float src_data = __2float<T>(*srcX);
+  float srcy_data = __2float<T>(*srcR);
   *sum += src_data + srcy_data;
   *sumSq += (src_data + srcy_data) * (src_data + srcy_data);
 }
@@ -166,9 +166,9 @@ inline __device__ void UpdateSum<__half, 2>(const __half* srcX,
 }
 
 template <>
-inline __device__ void UpdateSum<phi::float16, 2>(const phi::float16* srcX,
-                                                  float* sum,
-                                                  float* sumSq) {
+inline __device__ void UpdateSum<float16, 2>(const float16* srcX,
+                                             float* sum,
+                                             float* sumSq) {
   __half2 h2 = *reinterpret_cast<__half2 const*>(srcX);
   float2 f2 = __half22float2(h2);
   *sum += f2.x + f2.y;
@@ -176,10 +176,10 @@ inline __device__ void UpdateSum<phi::float16, 2>(const phi::float16* srcX,
 }
 
 template <>
-inline __device__ void UpdateSum<phi::float16, 2>(const phi::float16* srcX,
-                                                  const phi::float16* srcR,
-                                                  float* sum,
-                                                  float* sumSq) {
+inline __device__ void UpdateSum<float16, 2>(const float16* srcX,
+                                             const float16* srcR,
+                                             float* sum,
+                                             float* sumSq) {
   __half2 h2 = *reinterpret_cast<__half2 const*>(srcX);
   __half2 h2_r = *reinterpret_cast<__half2 const*>(srcR);
   float2 f2 = __half22float2(h2);
@@ -191,24 +191,24 @@ inline __device__ void UpdateSum<phi::float16, 2>(const phi::float16* srcX,
 
 #ifdef PADDLE_CUDA_BF16
 template <>
-inline __device__ void UpdateSum<phi::bfloat16, 2>(const phi::bfloat16* srcX,
-                                                   float* sum,
-                                                   float* sumSq) {
+inline __device__ void UpdateSum<bfloat16, 2>(const bfloat16* srcX,
+                                              float* sum,
+                                              float* sumSq) {
   __nv_bfloat162 h2 = *reinterpret_cast<__nv_bfloat162 const*>(srcX);
-  float2 f2 = phi::bfloat1622float2(h2);
+  float2 f2 = bfloat1622float2(h2);
   *sum += f2.x + f2.y;
   *sumSq += f2.x * f2.x + f2.y * f2.y;
 }
 
 template <>
-inline __device__ void UpdateSum<phi::bfloat16, 2>(const phi::bfloat16* srcX,
-                                                   const phi::bfloat16* srcR,
-                                                   float* sum,
-                                                   float* sumSq) {
+inline __device__ void UpdateSum<bfloat16, 2>(const bfloat16* srcX,
+                                              const bfloat16* srcR,
+                                              float* sum,
+                                              float* sumSq) {
   __nv_bfloat162 h2 = *reinterpret_cast<__nv_bfloat162 const*>(srcX);
   __nv_bfloat162 h2_r = *reinterpret_cast<__nv_bfloat162 const*>(srcR);
-  float2 f2 = phi::bfloat1622float2(h2);
-  float2 f2_r = phi::bfloat1622float2(h2_r);
+  float2 f2 = bfloat1622float2(h2);
+  float2 f2_r = bfloat1622float2(h2_r);
   *sum += f2.x + f2_r.x + f2.y + f2_r.y;
   *sumSq +=
       (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y);
@@ -418,22 +418,20 @@ inline __device__ void GroupNormCompute(int64_t dhwBegin,
                                         const GroupNormNDHWCParams<T>& params,
                                         float mean,
                                         float invStdDev) {
-  float gamma =
-      phi::__2float<T>(*(reinterpret_cast<T const*>(params.gamma) + ci));
-  float beta =
-      phi::__2float<T>(*(reinterpret_cast<T const*>(params.beta) + ci));
+  float gamma = __2float<T>(*(reinterpret_cast<T const*>(params.gamma) + ci));
+  float beta = __2float<T>(*(reinterpret_cast<T const*>(params.beta) + ci));
   for (int64_t dhwi = dhwBegin; dhwi < dhwEnd; ++dhwi) {
     // The src/dst offset.
     int64_t offset =
         static_cast<int64_t>(blockIdx.z) * params.dhwc + dhwi * params.c + ci;
-    float src_data = phi::__2float<T>(params.srcX[offset]);
+    float src_data = __2float<T>(params.srcX[offset]);
     if (params.srcR != nullptr) {
       auto gi = ci / params.cPerGroup;
       auto gj = ci % params.cPerGroup;
       int64_t g_offset =
           params.y_same_with_x ? offset : gi * params.cPerGroup + gj;
-      src_data += phi::__2float<T>(params.srcR[g_offset]);
-      *reinterpret_cast<T*>(&params.eleOut[offset]) = phi::__2dst<T>(src_data);
+      src_data += __2float<T>(params.srcR[g_offset]);
+      *reinterpret_cast<T*>(&params.eleOut[offset]) = __2dst<T>(src_data);
     }
     // Normalize the channels.
     float dst_data = (src_data - mean) * invStdDev;
@@ -446,16 +444,16 @@ inline __device__ void GroupNormCompute(int64_t dhwBegin,
     }
 
     // Store the scaled values.
-    *reinterpret_cast<T*>(&params.dst[offset]) = phi::__2dst<T>(dst_data);
+    *reinterpret_cast<T*>(&params.dst[offset]) = __2dst<T>(dst_data);
   }
 }
 
 template <>
-inline __device__ void GroupNormCompute<phi::float16, 2>(
+inline __device__ void GroupNormCompute<float16, 2>(
     int64_t dhwBegin,
     int64_t dhwEnd,
     int32_t ci,
-    const GroupNormNDHWCParams<phi::float16>& params,
+    const GroupNormNDHWCParams<float16>& params,
     float mean,
     float invStdDev) {
   float2 gammaF2, betaF2;
@@ -563,17 +561,17 @@ inline __device__ void GroupNormCompute<__half, 2>(
 
 #ifdef PADDLE_CUDA_BF16
 template <>
-inline __device__ void GroupNormCompute<phi::bfloat16, 2>(
+inline __device__ void GroupNormCompute<bfloat16, 2>(
     int64_t dhwBegin,
     int64_t dhwEnd,
     int32_t ci,
-    const GroupNormNDHWCParams<phi::bfloat16>& params,
+    const GroupNormNDHWCParams<bfloat16>& params,
     float mean,
     float invStdDev) {
   float2 gammaF2, betaF2;
-  gammaF2 = phi::bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>(
+  gammaF2 = bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>(
       reinterpret_cast<__nv_bfloat16 const*>(params.gamma) + ci));
-  betaF2 = phi::bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>(
+  betaF2 = bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>(
       reinterpret_cast<__nv_bfloat16 const*>(params.beta) + ci));
 
   // Iterate over the activations to compute the sums.
@@ -587,7 +585,7 @@ inline __device__ void GroupNormCompute<phi::bfloat16, 2>(
         *reinterpret_cast<__nv_bfloat162 const*>(&params.srcX[offset]);
 
     // Extract the two half values.
-    float2 f2 = phi::bfloat1622float2(h2);
+    float2 f2 = bfloat1622float2(h2);
 
     if (params.srcR != nullptr) {
       auto gi = ci / params.cPerGroup;
@@ -596,11 +594,11 @@ inline __device__ void GroupNormCompute<phi::bfloat16, 2>(
           params.y_same_with_x ? offset : gi * params.cPerGroup + gj;
       __nv_bfloat162 r2 =
           *reinterpret_cast<__nv_bfloat162 const*>(&params.srcR[g_offset]);
-      float2 r_f2 = phi::bfloat1622float2(r2);
+      float2 r_f2 = bfloat1622float2(r2);
       f2.x += r_f2.x;
       f2.y += r_f2.y;
       *reinterpret_cast<__nv_bfloat162*>(&params.eleOut[offset]) =
-          phi::float22bfloat162_rn(f2);
+          float22bfloat162_rn(f2);
     }
     // Normalize the channels.
     f2.x = (f2.x - mean) * invStdDev;
@@ -617,7 +615,7 @@ inline __device__ void GroupNormCompute<phi::bfloat16, 2>(
     }
     // Store the scaled values.
     *reinterpret_cast<__nv_bfloat162*>(&params.dst[offset]) =
-        phi::float22bfloat162_rn(f2);
+        float22bfloat162_rn(f2);
   }
 }
 #endif
@@ -736,7 +734,7 @@ void GroupNormNDHWCKernel(const Context& dev_ctx,
   if (data_layout != DataLayout::NHWC) {
     PD_THROW("data_layout only supports NHWC and NDHWC");
   }
-  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using AccT = typename dtype::MPTypeTrait<T>::Type;
   GroupNormNDHWCParams<T> params_;
   params_.withSilu = activation == "silu" ? true : false;
 
@@ -886,17 +884,17 @@ void GroupNormNDHWCKernel(const Context& dev_ctx,
   groupNormNDHWCScale<T> ndhwc_scale;
   ndhwc_scale(params_, stream);
 #ifdef PADDLE_WITH_HIP
-  phi::backends::gpu::GpuMemcpyAsync(mean_data,
-                                     params_.redBuffer,
-                                     params_.n * groups * sizeof(float),
-                                     hipMemcpyDeviceToHost,
-                                     stream);
+  backends::gpu::GpuMemcpyAsync(mean_data,
+                                params_.redBuffer,
+                                params_.n * groups * sizeof(float),
+                                hipMemcpyDeviceToHost,
+                                stream);
 #else
-  phi::backends::gpu::GpuMemcpyAsync(mean_data,
-                                     params_.redBuffer,
-                                     params_.n * groups * sizeof(float),
-                                     cudaMemcpyDeviceToHost,
-                                     stream);
+  backends::gpu::GpuMemcpyAsync(mean_data,
+                                params_.redBuffer,
+                                params_.n * groups * sizeof(float),
+                                cudaMemcpyDeviceToHost,
+                                stream);
 #endif
 }
 
@@ -952,13 +950,10 @@ __device__ __forceinline__ WelfordData<AccT> WelfordWarpReduce(
   for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
     WelfordData<AccT> other;
     other.mean =
-        phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.mean, offset);
-    other.m2 =
-        phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.m2, offset);
-    other.n =
-        phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.n, offset);
-    other.nf =
-        phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.nf, offset);
+        backends::gpu::CudaShuffleDownSync(0xffffffff, val.mean, offset);
+    other.m2 = backends::gpu::CudaShuffleDownSync(0xffffffff, val.m2, offset);
+    other.n = backends::gpu::CudaShuffleDownSync(0xffffffff, val.n, offset);
+    other.nf = backends::gpu::CudaShuffleDownSync(0xffffffff, val.nf, offset);
     val = WelfordCombine(val, other);
   }
   return val;
@@ -1291,7 +1286,7 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x,
       CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
 #endif
 #ifdef __HIPCC__
-      if (blockDim.x < phi::kps::details::kWarpSize) {
+      if (blockDim.x < kps::details::kWarpSize) {
         CudaAtomicAdd(&mean[bid * groups + gid], x_mean);
         CudaAtomicAdd(&var[bid * groups + gid], x_var);
       } else {
@@ -1477,12 +1472,12 @@ void GroupNormDirectCUDAFunctor<T, AccT>::operator()(
       while (block_size_nchw < max_block_size) {
         block_size_nchw *= 2;
       }
-      block_size_nchw = std::max(block_size_nchw, phi::kps::details::kWarpSize);
+      block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
       int64_t n_groups = input_ddim[0] * static_cast<int64_t>(groups);
       dim3 grids(std::min(max_grid_x, n_groups));
       dim3 blocks(block_size_nchw);
       if (size < vec_size * block_size_nchw) {
-        phi::ScalarGetMeanAndVarNCHW<T, AccT><<<grids, blocks, 0, stream>>>(
+        ScalarGetMeanAndVarNCHW<T, AccT><<<grids, blocks, 0, stream>>>(
             input, mean, temp_variance, size, n_groups);
       } else {
         VectorizedGetMeanAndVarNCHW<T, AccT, vec_size>
@@ -1515,7 +1510,7 @@ void GroupNormDirectCUDAFunctor<T, AccT>::operator()(
     cudaMemset(temp_variance, 0, sizeof(AccT) * input_ddim[0] * groups);
 #endif
 
-    phi::GroupNormForwardGetMeanAndVar<T, AccT>
+    GroupNormForwardGetMeanAndVar<T, AccT>
         <<<grid, threads, 0, stream>>>(input,
                                        input_ddim[0],
                                        C,
@@ -1559,7 +1554,7 @@ void GroupNormGeneralCaseKernel(const Context& dev_ctx,
                                 DenseTensor* y,
                                 DenseTensor* mean,
                                 DenseTensor* var) {
-  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  using AccT = typename dtype::MPTypeTrait<T>::Type;
   const DataLayout data_layout = StringToDataLayout(data_layout_str);
   const auto scale_ptr = scale.get_ptr();
   const auto bias_ptr = bias.get_ptr();
@@ -1843,42 +1838,42 @@ void GroupNormKernel(const Context& dev_ctx,
     return;
   }
   using std::is_same;
-  if (is_same<T, phi::float16>::value && data_layout_str == "NHWC") {
+  if (is_same<T, float16>::value && data_layout_str == "NHWC") {
     const optional<DenseTensor>& residual = optional<DenseTensor>(paddle::none);
     DenseTensor empty_tensor;
-    GroupNormNDHWCKernel<phi::float16, Context>(dev_ctx,
-                                                x,
-                                                residual,
-                                                scale,
-                                                bias,
-                                                epsilon,
-                                                groups,
-                                                data_layout_str,
-                                                "",
-                                                y,
-                                                &empty_tensor,
-                                                mean,
-                                                var);
+    GroupNormNDHWCKernel<float16, Context>(dev_ctx,
+                                           x,
+                                           residual,
+                                           scale,
+                                           bias,
+                                           epsilon,
+                                           groups,
+                                           data_layout_str,
+                                           "",
+                                           y,
+                                           &empty_tensor,
+                                           mean,
+                                           var);
     return;
   }
 
 #ifdef PADDLE_CUDA_BF16
-  if (is_same<T, phi::bfloat16>::value && data_layout_str == "NHWC") {
+  if (is_same<T, bfloat16>::value && data_layout_str == "NHWC") {
     const optional<DenseTensor>& residual = optional<DenseTensor>(paddle::none);
     DenseTensor empty_tensor;
-    GroupNormNDHWCKernel<phi::bfloat16, Context>(dev_ctx,
-                                                 x,
-                                                 residual,
-                                                 scale,
-                                                 bias,
-                                                 epsilon,
-                                                 groups,
-                                                 data_layout_str,
-                                                 "",
-                                                 y,
-                                                 &empty_tensor,
-                                                 mean,
-                                                 var);
+    GroupNormNDHWCKernel<bfloat16, Context>(dev_ctx,
+                                            x,
+                                            residual,
+                                            scale,
+                                            bias,
+                                            epsilon,
+                                            groups,
+                                            data_layout_str,
+                                            "",
+                                            y,
+                                            &empty_tensor,
+                                            mean,
+                                            var);
     return;
   }
 #endif
diff --git a/paddle/phi/kernels/gpu/group_norm_utils.h b/paddle/phi/kernels/gpu/group_norm_utils.h
index 6fb6d155398ead..c4929a4506f5f5 100644
--- a/paddle/phi/kernels/gpu/group_norm_utils.h
+++ b/paddle/phi/kernels/gpu/group_norm_utils.h
@@ -49,7 +49,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
 }
 
 template <typename T, typename AccT, int VecSize, int Num>
-__device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
+__device__ __forceinline__ void ThreadReduce(Array<const T*, Num> arrs,
                                              int64_t size,
                                              const int offset,
                                              AccT* out_mean,
@@ -168,7 +168,7 @@ __global__ void VectorizedGetMeanAndVarNCHW(
     AccT x_var = static_cast<AccT>(0);
     x += i * size;
     const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
-    phi::Array<const T*, 1> ins;
+    Array<const T*, 1> ins;
     ins[0] = x;
     ThreadReduce<T, AccT, VecSize, 1>(ins, size, input_offset, &x_mean, &x_var);
     ReduceMeanAndVar<AccT>(mean, var, x_mean, x_var, size, i);
diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
index 39d15f32cd10d1..eff113fa9e975c 100644
--- a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
+++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu
@@ -137,7 +137,7 @@ struct GumbleNoiseGenerator<GPUContext, T> {
     DenseTensor random_tensor;
     int64_t size = size_to_axis * size_from_axis;
     random_tensor.Resize({size});
-    using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+    using MPType = typename dtype::MPTypeTrait<T>::Type;
     MPType* random_data = dev_ctx.template Alloc<MPType>(&random_tensor);
 
     // generate gumbel noise