diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index ef69c22275b808..ed3c1882223390 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -28,25 +28,23 @@ template struct CudaAbsFunctor; template -struct CudaAbsFunctor>> { - __device__ __forceinline__ phi::dtype::Real operator()(const T x) const { +struct CudaAbsFunctor>> { + __device__ __forceinline__ dtype::Real operator()(const T x) const { return abs(x); } }; template -struct CudaAbsFunctor< - T, - std::enable_if_t>::value && - std::is_same::value>> { +struct CudaAbsFunctor>::value && + std::is_same::value>> { __device__ __forceinline__ T operator()(const T x) const { return abs(x); } }; template -struct CudaAbsFunctor< - T, - std::enable_if_t>::value && - !std::is_same::value>> { +struct CudaAbsFunctor>::value && + !std::is_same::value>> { __device__ __forceinline__ T operator()(const T x) const { return std::abs(x); } @@ -56,12 +54,12 @@ template PADDLE_API void AbsKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { - dev_ctx.template Alloc>(out); + dev_ctx.template Alloc>(out); std::vector ins = {&x}; std::vector outs = {out}; auto functor = CudaAbsFunctor(); - funcs::ElementwiseKernel>(dev_ctx, ins, &outs, functor); + funcs::ElementwiseKernel>(dev_ctx, ins, &outs, functor); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu index b1478e842f4cfa..f97d49d717144b 100644 --- a/paddle/phi/kernels/gpu/accuracy_kernel.cu +++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu @@ -24,7 +24,6 @@ #include "paddle/phi/core/kernel_registry.h" namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void AccuracyCudaKernel(const int64_t N, @@ -34,7 +33,7 @@ __global__ void AccuracyCudaKernel(const int64_t N, int* correct_data, T* accuracy, int* total_data) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; int count = 0; __shared__ int total[BlockSize]; @@ -98,7 +97,7 @@ void AccuracyKernel(const Context& dev_ctx, int64_t num_samples = inference.dims()[0]; size_t infer_width = inference.dims()[1]; auto stream = dev_ctx.stream(); - phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream); + backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream); PADDLE_ENFORCE_GT(label.dims().size(), 0, diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 35ed74c551ec06..4e867f387e9082 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -371,8 +371,7 @@ void PowGradKernel(const Context& dev_ctx, DenseTensor* dx) { if (factor.to() == 0) { std::vector vec_dims = vectorize(dx->dims()); - phi::Full( - dev_ctx, phi::IntArray(vec_dims), static_cast(0), dx); + Full(dev_ctx, IntArray(vec_dims), static_cast(0), dx); return; } if (factor.to() == 1) { diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 2fb25ec6151085..5da1b755ee8cf6 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -283,8 +283,7 @@ void PowKernel(const Context& dev_ctx, } if (factor.to() == 0) { std::vector vec_dims = vectorize(out->dims()); - phi::Full( - dev_ctx, phi::IntArray(vec_dims), static_cast(1), out); + Full(dev_ctx, IntArray(vec_dims), static_cast(1), out); return; } if (factor.to() == 1) { diff --git a/paddle/phi/kernels/gpu/adagrad_kernel.cu b/paddle/phi/kernels/gpu/adagrad_kernel.cu index 303b180244feb5..69431f94844837 100644 --- a/paddle/phi/kernels/gpu/adagrad_kernel.cu +++ b/paddle/phi/kernels/gpu/adagrad_kernel.cu @@ -70,7 +70,7 @@ struct DenseAdagradFunctor { DenseTensor* param_out_tensor, DenseTensor* moment_out_tensor, DenseTensor* master_param_outs) { - using MT = typename phi::dtype::template MPTypeTrait::Type; + using MT = typename dtype::template MPTypeTrait::Type; T* param_out_data = dev_ctx.template Alloc(param_out_tensor); MT* moment_out_data = dev_ctx.template Alloc(moment_out_tensor); const MT* master_in_data = @@ -82,7 +82,7 @@ struct DenseAdagradFunctor { MT epsilon = static_cast(epsilon_t); int64_t numel = param_t.numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1); int grid = config.block_per_grid.x; int block = config.thread_per_block.x; auto stream = dev_ctx.stream(); @@ -180,7 +180,7 @@ struct SparseAdagradFunctor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid2(1, merge_rows.size()); - phi::MixVector mixv_merge_rows(&merge_rows); + MixVector mixv_merge_rows(&merge_rows); SparseAdagradFunctorKernel <<; template struct SparseAdagradFunctor; template struct DenseAdagradFunctor; template struct DenseAdagradFunctor; -template struct DenseAdagradFunctor; +template struct DenseAdagradFunctor; } // namespace phi diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index 8cbde3b60d716d..c3c50207d09b96 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -187,7 +187,7 @@ PADDLE_API void AdamDenseKernel(const Context& dev_ctx, DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; const auto grad_type = grad.dtype(); VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; @@ -394,7 +394,7 @@ void MergedAdamKernel( std::vector beta1_pow_out, std::vector beta2_pow_out, std::vector master_param_out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; MT beta1_ = beta1.to(); MT beta2_ = beta2.to(); diff --git a/paddle/phi/kernels/gpu/adamax_kernel.cu b/paddle/phi/kernels/gpu/adamax_kernel.cu index e6c60d485528c1..dda8d4c45ba1d6 100644 --- a/paddle/phi/kernels/gpu/adamax_kernel.cu +++ b/paddle/phi/kernels/gpu/adamax_kernel.cu @@ -84,7 +84,7 @@ void AdamaxKernel(const Context& dev_ctx, DenseTensor* moment_out, DenseTensor* inf_norm_out, DenseTensor* master_param_outs) { - using MT = typename phi::dtype::template MPTypeTrait::Type; + using MT = typename dtype::template MPTypeTrait::Type; T* param_out_data = dev_ctx.template Alloc(param_out); MT* moment_out_data = dev_ctx.template Alloc(moment_out); MT* inf_norm_out_data = dev_ctx.template Alloc(inf_norm_out); @@ -104,7 +104,7 @@ void AdamaxKernel(const Context& dev_ctx, MT epsilon_ = static_cast(epsilon); int64_t numel = param.numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1); int grid = config.block_per_grid.x; int block = config.thread_per_block.x; auto stream = dev_ctx.stream(); diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index 0d11bc4ac8e932..7eee4fb0a4461f 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -168,7 +168,7 @@ PADDLE_API void AdamwDenseKernel(const Context& dev_ctx, DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT coeff_ = static_cast(coeff); MT lr_ratio_ = static_cast(lr_ratio); diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu index 9aa073279319de..d2cc2b6e2b02a0 100644 --- a/paddle/phi/kernels/gpu/add_n_kernel.cu +++ b/paddle/phi/kernels/gpu/add_n_kernel.cu @@ -25,7 +25,7 @@ namespace phi { template __global__ void SumArrayCUDAKernel( T **in, T *out, int64_t N, size_t in_size, bool read_dst) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { MPType total(read_dst ? static_cast(out[idx]) : static_cast(0)); @@ -46,7 +46,7 @@ __global__ void SumArrayMixedTypeCUDAKernel(const T *in_0, int64_t N, size_t in_others_size, bool read_dst) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { MPType total(read_dst ? static_cast(out[idx]) : static_cast(0)); @@ -128,7 +128,7 @@ void AddNKernel(const Context &dev_ctx, int64_t length_0 = in_0.numel(); int64_t length_1 = in_1.numel(); if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; auto result = EigenVector::Flatten(*out); auto &place = *dev_ctx.eigen_device(); auto in_0_e = EigenVector::Flatten(in_0).template cast(); @@ -281,7 +281,7 @@ void AddNKernel(const Context &dev_ctx, } } if (!sr_in_out_data.empty()) { - auto tmp_sr_in_out_array = phi::memory_utils::Alloc( + auto tmp_sr_in_out_array = memory_utils::Alloc( dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *)); size_t nbytes_sr = sr_in_out_data.size() * sizeof(T *); @@ -306,8 +306,8 @@ void AddNKernel(const Context &dev_ctx, } // if indata not null, merge into one kernel call. if (!in_data.empty()) { - auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(), - in_data.size() * sizeof(T *)); + auto tmp_in_array = + memory_utils::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *)); size_t nbytes_in2 = in_data.size() * sizeof(T *); const void *stable_in2 = diff --git a/paddle/phi/kernels/gpu/all_to_all_kernel.cu b/paddle/phi/kernels/gpu/all_to_all_kernel.cu index 2ac99ee46b58a6..091662d90ea60c 100644 --- a/paddle/phi/kernels/gpu/all_to_all_kernel.cu +++ b/paddle/phi/kernels/gpu/all_to_all_kernel.cu @@ -63,10 +63,9 @@ void AllToAllKernel(const Context& dev_ctx, const auto* send_buf = x.data(); auto* recv_buf = out->data(); for (auto i = 0; i < nranks; ++i) { - auto send_buf = phi::distributed::GetPartialTensor(x, offset, send_numel); + auto send_buf = distributed::GetPartialTensor(x, offset, send_numel); comm_ctx->Send(send_buf, send_numel, i, stream); - auto recv_buf = - phi::distributed::GetPartialTensor(*out, offset, send_numel); + auto recv_buf = distributed::GetPartialTensor(*out, offset, send_numel); comm_ctx->Recv(&recv_buf, send_numel, i, stream); offset += send_numel; } diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu index dba76d4b1eac86..409d09c52e7bb1 100644 --- a/paddle/phi/kernels/gpu/allclose_kernel.cu +++ b/paddle/phi/kernels/gpu/allclose_kernel.cu @@ -36,7 +36,7 @@ __global__ void AllcloseCUDAKernel(const T* in_data, bool* out_data) { unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; bool val; - using BaseMPType = typename phi::dtype::MPTypeTrait::Type; + using BaseMPType = typename dtype::MPTypeTrait::Type; using MPType = typename std::conditional::value || @@ -98,8 +98,7 @@ void AllCloseKernel(const Context& dev_ctx, int64_t num = x.numel(); const int vec_size = 4; - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, num, vec_size); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, num, vec_size); uint32_t grid = config.block_per_grid.x; uint32_t block = config.thread_per_block.x; diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu index e5880c74a4f501..22875642d68075 100644 --- a/paddle/phi/kernels/gpu/amp_kernel.cu +++ b/paddle/phi/kernels/gpu/amp_kernel.cu @@ -159,13 +159,13 @@ class LazyZeros { const auto& cpu_place = CPUPlace(); // alloc each tensor's start index and copy to device auto h_in_starts_mem = - phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); - auto d_in_starts_mem = phi::memory_utils::Alloc( + auto d_in_starts_mem = memory_utils::Alloc( dev_ctx.GetPlace(), (xs_size + 1) * sizeof(int64_t), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); // the start index value of each tensor is @@ -186,14 +186,13 @@ class LazyZeros { dev_ctx.stream()); // copy each tensor of "outs" data address array to device - auto h_out_addrs_mem = - phi::memory_utils::Alloc(cpu_place, xs_size * sizeof(T*)); + auto h_out_addrs_mem = memory_utils::Alloc(cpu_place, xs_size * sizeof(T*)); T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); - auto d_out_addrs_mem = phi::memory_utils::Alloc( + auto d_out_addrs_mem = memory_utils::Alloc( dev_ctx.GetPlace(), xs_size * sizeof(T*), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); for (size_t i = 0; i < xs_size; ++i) { @@ -277,7 +276,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, const DenseTensor& scale, std::vector outs, DenseTensor* found_infinite) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; const MT* scale_data = scale.data(); bool* found_inf_data = dev_ctx.template Alloc(found_infinite); @@ -294,13 +293,13 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, const auto& cpu_place = CPUPlace(); // calculate each tensor's start index and copy to device auto h_starts_tensor = - phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); - auto d_starts_tensor = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - (xs_size + 1) * sizeof(int64_t), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto d_starts_tensor = + memory_utils::Alloc(dev_ctx.GetPlace(), + (xs_size + 1) * sizeof(int64_t), + Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); // the start index value of each tensor is @@ -322,14 +321,14 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, dev_ctx.stream()); // copy each tensor's data address to device - auto h_mem = phi::memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); + auto h_mem = memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); const T** h_xs = reinterpret_cast(h_mem->ptr()); T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; - auto d_mem = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - 2 * xs_size * sizeof(T*), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto d_mem = + memory_utils::Alloc(dev_ctx.GetPlace(), + 2 * xs_size * sizeof(T*), + Stream(reinterpret_cast(dev_ctx.stream()))); const T** d_xs = reinterpret_cast(d_mem->ptr()); T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; diff --git a/paddle/phi/kernels/gpu/arange_kernel.cu b/paddle/phi/kernels/gpu/arange_kernel.cu index 35e53055533460..ee7873e0110f4e 100644 --- a/paddle/phi/kernels/gpu/arange_kernel.cu +++ b/paddle/phi/kernels/gpu/arange_kernel.cu @@ -37,7 +37,7 @@ void ArangeTensorKernel(const Context& dev_ctx, const DenseTensor& end, const DenseTensor& step, DenseTensor* out) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType start_value = static_cast(GetValue(dev_ctx, start)); MPType end_value = static_cast(GetValue(dev_ctx, end)); @@ -64,7 +64,7 @@ void ArangeNullaryKernel(const Context& dev_ctx, const T end_value, const T step_value, DenseTensor* out) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType start_value_mpt = static_cast(start_value); MPType end_value_mpt = static_cast(end_value); MPType step_value_mpt = static_cast(step_value); diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu index 868257e9edbc64..3f98b957d60162 100644 --- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu +++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu @@ -232,7 +232,7 @@ void ArgMinMaxOpCUDAKernel(const Context& dev_ctx, dev_ctx, x, axis.to(), keepdims, flatten, out)); return; } - phi::VisitDataTypeTiny( + VisitDataTypeTiny( dtype, VisitDataCudaArgMinMaxFunctor( dev_ctx, x, axis.to(), keepdims, flatten, out)); diff --git a/paddle/phi/kernels/gpu/argsort_kernel.cu b/paddle/phi/kernels/gpu/argsort_kernel.cu index b351fe22a13104..7d73826d561c6e 100644 --- a/paddle/phi/kernels/gpu/argsort_kernel.cu +++ b/paddle/phi/kernels/gpu/argsort_kernel.cu @@ -375,7 +375,7 @@ void ArgsortKernel(const Context& dev_ctx, PerSort( dev_ctx, out_data, ids_data, start, end, stable, descending); if (start != 0) { - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, end); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, end); merge_kernel<<::Type; + using MT = typename dtype::MPTypeTrait::Type; const MT* master_in_data = multi_precision ? master_param->data() : nullptr; MT* master_out_data = diff --git a/paddle/phi/kernels/gpu/auc_kernel.cu b/paddle/phi/kernels/gpu/auc_kernel.cu index 0d3d3f2545866d..4da580f474912e 100644 --- a/paddle/phi/kernels/gpu/auc_kernel.cu +++ b/paddle/phi/kernels/gpu/auc_kernel.cu @@ -18,8 +18,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - __global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg, const int bucket_length, diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu index fd639434f8193e..fe17bb4eb7b39e 100644 --- a/paddle/phi/kernels/gpu/barrier_kernel.cu +++ b/paddle/phi/kernels/gpu/barrier_kernel.cu @@ -27,8 +27,8 @@ void BarrierKernel(const Context &dev_ctx, DenseTensor *out) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = &x; - auto comm_ctx = static_cast( - dev_ctx.GetCommContext()); + auto comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( @@ -37,7 +37,7 @@ void BarrierKernel(const Context &dev_ctx, auto stream = comm_ctx->GetStream(); ncclRedOp_t nccl_red_type = ncclSum; comm_ctx->AllReduce(out, *in, nccl_red_type, stream); - phi::backends::gpu::GpuStreamSync(stream); + backends::gpu::GpuStreamSync(stream); #else PADDLE_THROW( common::errors::Unavailable("PaddlePaddle should compile with NCCL.")); diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index cc81ffcc682bcf..abb34236991aa2 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -42,7 +42,7 @@ COMMON_DECLARE_bool(batch_norm_use_miopen); namespace phi { template -using CudnnDataType = phi::backends::gpu::CudnnDataType; +using CudnnDataType = backends::gpu::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; @@ -611,7 +611,7 @@ void BatchNormGradFunctor(const Context &dev_ctx, C, new_scale.dims()[0])); - auto dtype = phi::backends::gpu::CudnnDataType::type; + auto dtype = backends::gpu::CudnnDataType::type; #ifdef PADDLE_WITH_HIP auto compute_format = data_layout == DataLayout::NHWC diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 37b7cdfb1f0534..2500b2a553022d 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -42,7 +42,7 @@ COMMON_DECLARE_bool(batch_norm_use_miopen); namespace phi { template -using CudnnDataType = phi::backends::gpu::CudnnDataType; +using CudnnDataType = backends::gpu::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; @@ -588,7 +588,7 @@ void BatchNormKernel(const Context &dev_ctx, int N, C, H, W, D; funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - auto dtype = phi::backends::gpu::CudnnDataType::type; + auto dtype = backends::gpu::CudnnDataType::type; auto *Scale = scale.get_ptr(); auto *Bias = bias.get_ptr(); @@ -599,13 +599,13 @@ void BatchNormKernel(const Context &dev_ctx, if (Scale) { new_scale = scale.get(); } else { - new_scale = phi::Full(dev_ctx, {C}, static_cast(1)); + new_scale = Full(dev_ctx, {C}, static_cast(1)); } if (Bias) { new_bias = bias.get(); } else { - new_bias = phi::Full(dev_ctx, {C}, static_cast(0)); + new_bias = Full(dev_ctx, {C}, static_cast(0)); } #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu index 3b05ea51a32151..3211a29d6cf365 100644 --- a/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu @@ -27,7 +27,7 @@ namespace phi { template struct BCELossGradFunctor { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT one = static_cast(1.0f); MT eps = static_cast(1e-12); diff --git a/paddle/phi/kernels/gpu/bce_loss_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_kernel.cu index 05866af47c6130..4cbb343223ae77 100644 --- a/paddle/phi/kernels/gpu/bce_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/bce_loss_kernel.cu @@ -28,7 +28,7 @@ namespace phi { template struct BCELossFunctor { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT zero = static_cast(0); MT one = static_cast(1.0f); MT neg_100 = static_cast(-100.); @@ -42,8 +42,8 @@ struct BCELossFunctor { "Input is expected to be within the interval [0, 1], but received %f.", x_mt); - MT term1 = max(phi::kps::details::Log(x_mt), neg_100); - MT term2 = max(phi::kps::details::Log(one - x_mt), neg_100); + MT term1 = max(kps::details::Log(x_mt), neg_100); + MT term2 = max(kps::details::Log(one - x_mt), neg_100); return static_cast((label_mt - one) * term2 - label_mt * term1); } }; diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index e83cc590a1b310..9adf7c7cb4fe1c 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -53,7 +53,7 @@ __global__ void bernoulli_cuda_kernel( for (size_t i = 4 * thread_idx; i < size; i += total_thread * 4) { funcs::uniform_distribution dist; float4 rand = dist(&state); - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; #pragma unroll for (size_t j = 0; j < 4; j++) { size_t idx = i + j; @@ -82,7 +82,7 @@ void BernoulliKernel(const Context& dev_ctx, uint64_t seed = seed_offset.first; uint64_t offset = seed_offset.second; - auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 4); + auto gpu_config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 4); size_t grid_size = gpu_config.GetGridSize(); size_t block_size = gpu_config.GetBlockSize(); diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu index a770eaa4562079..7a1f5d7d8427e0 100644 --- a/paddle/phi/kernels/gpu/bincount_kernel.cu +++ b/paddle/phi/kernels/gpu/bincount_kernel.cu @@ -22,8 +22,6 @@ #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - inline int64_t GET_BLOCKS(const int64_t N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; } @@ -62,8 +60,8 @@ __global__ void KernelReduceMinMax(const T* input, } if (tid == 0) { - phi::CudaAtomicMin(min_out, smin[0]); - phi::CudaAtomicMax(max_out, smax[0]); + CudaAtomicMin(min_out, smin[0]); + CudaAtomicMax(max_out, smax[0]); } } diff --git a/paddle/phi/kernels/gpu/binomial_kernel.cu b/paddle/phi/kernels/gpu/binomial_kernel.cu index a5547c9f9aa4f9..67e5a012ea6e35 100644 --- a/paddle/phi/kernels/gpu/binomial_kernel.cu +++ b/paddle/phi/kernels/gpu/binomial_kernel.cu @@ -145,7 +145,7 @@ __global__ void BinomialSampling(const T* n, const int N, unsigned int seed, unsigned int offset) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) { MT nt = static_cast(n[idx]); MT pt = static_cast(p[idx]); @@ -189,7 +189,7 @@ void BinomialKernel(const Context& dev_ctx, int block_size = std::min(kMaxBlockDim, dev_ctx.GetMaxThreadsPerBlock()); dim3 dim_block(block_size); dim3 dim_grid((size + block_size - 1) / block_size); - phi::backends::gpu::LimitGridDim(dev_ctx, &dim_grid); + backends::gpu::LimitGridDim(dev_ctx, &dim_grid); auto gen_cuda = dev_ctx.GetGenerator(); auto seed_offset = gen_cuda->IncrementOffset(20); diff --git a/paddle/phi/kernels/gpu/box_clip_kernel.cu b/paddle/phi/kernels/gpu/box_clip_kernel.cu index 0668965403e7ce..315934df1c6842 100644 --- a/paddle/phi/kernels/gpu/box_clip_kernel.cu +++ b/paddle/phi/kernels/gpu/box_clip_kernel.cu @@ -63,7 +63,7 @@ void GPUBoxClipKernel(const Context &dev_ctx, auto stream = dev_ctx.stream(); const size_t batch_size = lod.back().size() - 1; T *output_data = dev_ctx.template Alloc(output); - phi::MixVector mix_vector(&abs_offset_lod[0]); + MixVector mix_vector(&abs_offset_lod[0]); GPUBoxClip<<>>( input_p->data(), mix_vector.CUDAMutableData(dev_ctx.GetPlace()), diff --git a/paddle/phi/kernels/gpu/box_coder_kernel.cu b/paddle/phi/kernels/gpu/box_coder_kernel.cu index 5ff86cac661ca1..bc2ed13f307649 100644 --- a/paddle/phi/kernels/gpu/box_coder_kernel.cu +++ b/paddle/phi/kernels/gpu/box_coder_kernel.cu @@ -209,10 +209,10 @@ void BoxCoderKernel(const Context &dev_ctx, int grid = (row * col + block - 1) / block; int64_t bytes = var_size * sizeof(float); - auto dev_var = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto dev_var = + memory_utils::Alloc(dev_ctx.GetPlace(), + bytes, + Stream(reinterpret_cast(dev_ctx.stream()))); float *dev_var_data = reinterpret_cast(dev_var->ptr()); auto cplace = CPUPlace(); const auto gplace = dev_ctx.GetPlace(); diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 518a8567ecd9f5..cb6cb3eb72f9a9 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -89,12 +89,12 @@ void BroadcastTensorsGradKernel(const Context& dev_ctx, Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, output_tensor); } else { // reduce_sum implementation on CUDA - phi::SumKernel(dev_ctx, - *input_tensor, - reduce_dims_vec, - output_tensor->dtype(), - false, - output_tensor); + SumKernel(dev_ctx, + *input_tensor, + reduce_dims_vec, + output_tensor->dtype(), + false, + output_tensor); } } } diff --git a/paddle/phi/kernels/gpu/c_concat_kernel.cu b/paddle/phi/kernels/gpu/c_concat_kernel.cu index 1cf9f41072708c..763915f70f1b49 100644 --- a/paddle/phi/kernels/gpu/c_concat_kernel.cu +++ b/paddle/phi/kernels/gpu/c_concat_kernel.cu @@ -69,13 +69,13 @@ void CConcatKernel(const Context& dev_ctx, gpuStream_t stream = nullptr; #if defined(PADDLE_WITH_FLAGCX) && defined(PADDLE_KERNEL_WITH_FLAGCX) - phi::distributed::FlagcxCommContext* comm_ctx = nullptr; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + distributed::FlagcxCommContext* comm_ctx = nullptr; + comm_ctx = + static_cast(dev_ctx.GetCommContext()); #else - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; comm_ctx = - static_cast(dev_ctx.GetCommContext()); + static_cast(dev_ctx.GetCommContext()); #endif PADDLE_ENFORCE_NE(comm_ctx, nullptr, diff --git a/paddle/phi/kernels/gpu/c_scatter_kernel.cu b/paddle/phi/kernels/gpu/c_scatter_kernel.cu index e8e1f37137fe3c..19aa361d57dadb 100644 --- a/paddle/phi/kernels/gpu/c_scatter_kernel.cu +++ b/paddle/phi/kernels/gpu/c_scatter_kernel.cu @@ -35,12 +35,12 @@ void CScatterOpCUDAKernel(const Context& dev_ctx, #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto x = &input; int64_t numel = x->numel(); - ncclDataType_t dtype = phi::ToNCCLDataType(x->dtype()); + ncclDataType_t dtype = ToNCCLDataType(x->dtype()); int root_id = root; auto place = dev_ctx.GetPlace(); gpuStream_t stream = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; PADDLE_ENFORCE_GE( root_id, 0, @@ -53,7 +53,7 @@ void CScatterOpCUDAKernel(const Context& dev_ctx, "The ring_id (%d) for c_scatter_op must be non-negative.", ring_id)); comm_ctx = - static_cast(dev_ctx.GetCommContext()); + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( diff --git a/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu index 2512503a102e3e..3df577e5d5a2da 100644 --- a/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_kernel.cu @@ -189,10 +189,10 @@ struct CSoftmaxWithCrossEntropyFunctor { const DenseTensor* labels = &label_in; gpuStream_t stream = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( @@ -223,7 +223,7 @@ struct CSoftmaxWithCrossEntropyFunctor { logits_max.Resize({N, 1}); dev_ctx.template Alloc(&logits_max); - phi::MaxKernel(dev_ctx, logits_2d, {-1}, true, &logits_max); + MaxKernel(dev_ctx, logits_2d, {-1}, true, &logits_max); comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream); @@ -305,14 +305,14 @@ struct CSoftmaxWithCrossEntropyFunctor { comm_ctx->AllReduce(&predicted_logits, predicted_logits, ncclSum, stream); // step 4, obtain exp(logit) - phi::ExpKernel(dev_ctx, softmax_2d, &softmax_2d); + ExpKernel(dev_ctx, softmax_2d, &softmax_2d); // step 5, obtain sum_exp_logits DenseTensor sum_exp_logits; sum_exp_logits.Resize({N, 1}); dev_ctx.template Alloc(&sum_exp_logits); - phi::SumKernel( + SumKernel( dev_ctx, softmax_2d, {-1}, softmax_2d.dtype(), true, &sum_exp_logits); comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); @@ -358,8 +358,7 @@ struct CSoftmaxWithCrossEntropyFunctor { } } - phi::ReciprocalKernel( - dev_ctx, sum_exp_logits, &sum_exp_logits); + ReciprocalKernel(dev_ctx, sum_exp_logits, &sum_exp_logits); inputs = std::vector{&softmax_2d, &sum_exp_logits}; outputs = std::vector{&softmax_2d}; diff --git a/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu index 4e34885514ae97..9de70f0a25491f 100644 --- a/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/c_softmax_with_multi_label_cross_entropy_kernel.cu @@ -159,10 +159,10 @@ struct CSoftmaxWithMultiLabelCrossEntropyFunctor { const DenseTensor* smooth_weight = &smooth_weight_in; gpuStream_t stream = nullptr; - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( @@ -266,7 +266,7 @@ struct CSoftmaxWithMultiLabelCrossEntropyFunctor { sum_exp_logits.Resize({N, 1}); dev_ctx.template Alloc(&sum_exp_logits); - phi::SumKernel( + SumKernel( dev_ctx, softmax_2d, {-1}, softmax_2d.dtype(), true, &sum_exp_logits); comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index a300efb15e7306..b635faf6d6b9ef 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -25,7 +25,7 @@ void CastKernel(const Context& dev_ctx, DataType out_dtype, DenseTensor* out) { if (x.dtype() == out_dtype) { - if (x.dims() == phi::make_ddim({-1})) { + if (x.dims() == make_ddim({-1})) { *out = x; return; } @@ -53,8 +53,8 @@ INSTANTIATE_CAST_KERNEL(uint32_t, GPUContext) INSTANTIATE_CAST_KERNEL(uint64_t, GPUContext) INSTANTIATE_CAST_KERNEL(bool, GPUContext) INSTANTIATE_CAST_KERNEL(int16_t, GPUContext) -INSTANTIATE_CAST_KERNEL(phi::float16, GPUContext) -INSTANTIATE_CAST_KERNEL(phi::bfloat16, GPUContext) +INSTANTIATE_CAST_KERNEL(float16, GPUContext) +INSTANTIATE_CAST_KERNEL(bfloat16, GPUContext) #endif } // namespace phi diff --git a/paddle/phi/kernels/gpu/check_numerics_kernel.cu b/paddle/phi/kernels/gpu/check_numerics_kernel.cu index b7fa564d9c3dea..fab45447fb5c95 100644 --- a/paddle/phi/kernels/gpu/check_numerics_kernel.cu +++ b/paddle/phi/kernels/gpu/check_numerics_kernel.cu @@ -28,11 +28,9 @@ namespace phi { static std::once_flag init_multi_gpu_op_var_map_flag; // lazy init -static std::vector< - std::unordered_map>& +static std::vector>& multi_op_var2gpu_str() { - static std::vector< - std::unordered_map> + static std::vector> _multi_op_var2gpu_str; return _multi_op_var2gpu_str; } @@ -43,14 +41,14 @@ static std::vector& multi_op_var2gpu_str_mutex() { } static void InitMultiGPUOpVarMap() { - int dev_count = phi::backends::gpu::GetGPUDeviceCount(); + int dev_count = backends::gpu::GetGPUDeviceCount(); PADDLE_ENFORCE_GT(dev_count, 0, common::errors::NotFound( "cuda device must > 0, now dev_count=%d", dev_count)); // https://stackoverflow.com/questions/16465633/how-can-i-use-something-like-stdvectorstdmutex - std::vector> + std::vector> tmp_multi(dev_count); std::vector tmp_multi_mutex(dev_count); @@ -111,8 +109,8 @@ __device__ void BlockReduceNumNanInfAndWrite(const int64_t num_nan, } template ::value || - std::is_same::value, + std::enable_if_t::value || + std::is_same::value, bool> = true> __device__ void BlockReduceMaxMinAndWrite(const T max_value, const T min_value, @@ -125,8 +123,8 @@ __device__ void BlockReduceMaxMinAndWrite(const T max_value, } template ::value && - !std::is_same::value, + std::enable_if_t::value && + !std::is_same::value, bool> = true> __device__ void BlockReduceMaxMinAndWrite(const T max_value, const T min_value, @@ -272,7 +270,7 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr, template inline std::string GetHintString(const std::string& op_type, const std::string& var_name, - const phi::Place& place, + const Place& place, int dev_id = -1) { std::string op_var = funcs::GetCpuHintString(op_type, var_name, place, dev_id); @@ -301,10 +299,10 @@ static char* GetGpuHintStringPtr(const GPUContext& dev_ctx, std::lock_guard guard(op_var2gpu_str_mutex); if (op_var2gpu_str.find(op_var) == op_var2gpu_str.end()) { // insert - auto gpu_str_tensor = phi::memory_utils::Alloc( + auto gpu_str_tensor = memory_utils::Alloc( dev_ctx.GetPlace(), op_var.length() + 1, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); gpu_str_ptr = reinterpret_cast(gpu_str_tensor->ptr()); op_var2gpu_str.emplace(op_var, std::move(gpu_str_tensor)); @@ -353,14 +351,14 @@ static void PrintStack(const GPUContext& dev_ctx, const std::string& op_type, const std::string& var_name, int dev_id) { - auto cpu_stats = phi::memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 3); + auto cpu_stats = memory_utils::Alloc(CPUPlace(), sizeof(int64_t) * 3); int64_t* cpu_stats_ptr = reinterpret_cast(cpu_stats->ptr()); - phi::memory_utils::Copy(CPUPlace(), - cpu_stats_ptr, - stats.place(), - stats.data(), - 3 * sizeof(int64_t), - dev_ctx.stream()); + memory_utils::Copy(CPUPlace(), + cpu_stats_ptr, + stats.place(), + stats.data(), + 3 * sizeof(int64_t), + dev_ctx.stream()); dev_ctx.Wait(); if (cpu_stats_ptr[0] > 0 || cpu_stats_ptr[1] > 0) { const std::string debug_info = @@ -437,7 +435,7 @@ void CheckNumericsKernel(const Context& dev_ctx, std::min(static_cast(128), static_cast((tensor.numel() + threads - 1) / threads)); - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; int64_t numel_max_min = blocks; @@ -505,12 +503,12 @@ void CheckNumericsKernel(const Context& dev_ctx, #ifdef _WIN32 INSTANTIATE_CHECKNUMBERICS_KERNEL(float, GPUContext) INSTANTIATE_CHECKNUMBERICS_KERNEL(double, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::float16, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::bfloat16, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::complex64, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::complex128, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::float8_e4m3fn, GPUContext) -INSTANTIATE_CHECKNUMBERICS_KERNEL(phi::float8_e5m2, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(float16, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(bfloat16, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(complex64, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(complex128, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(float8_e4m3fn, GPUContext) +INSTANTIATE_CHECKNUMBERICS_KERNEL(float8_e5m2, GPUContext) #endif } // namespace phi diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu index 95e009785ee999..a74998897135e4 100644 --- a/paddle/phi/kernels/gpu/cholesky_kernel.cu +++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu @@ -82,10 +82,10 @@ struct MatrixBandPartFunctor { int workspace_size = 0; \ PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \ handle, uplo, n, A, lda, &workspace_size)); \ - auto workspace = phi::memory_utils::Alloc( \ + auto workspace = memory_utils::Alloc( \ dev_ctx.GetPlace(), \ workspace_size * sizeof(T), \ - phi::Stream(reinterpret_cast(dev_ctx.stream()))); \ + Stream(reinterpret_cast(dev_ctx.stream()))); \ T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf( \ handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ @@ -119,12 +119,12 @@ FUNC_WITH_TYPES(POTRF_INSTANCE); data_type, \ &workspace_device_size, \ &workspace_host_size)); \ - auto workspace_device = phi::memory_utils::Alloc( \ + auto workspace_device = memory_utils::Alloc( \ dev_ctx.GetPlace(), \ workspace_device_size, \ - phi::Stream(reinterpret_cast(dev_ctx.stream()))); \ + Stream(reinterpret_cast(dev_ctx.stream()))); \ auto workspace_host = \ - phi::memory_utils::Alloc(CPUPlace(), workspace_host_size); \ + memory_utils::Alloc(CPUPlace(), workspace_host_size); \ PADDLE_ENFORCE_GPU_SUCCESS( \ dynload::cusolverDnXpotrf(handle, \ params, \ @@ -209,10 +209,10 @@ void CholeskyKernel(const Context& dev_ctx, for_range(matrix_band_part_functor); } - auto info = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - sizeof(int) * batch_count, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto info = + memory_utils::Alloc(dev_ctx.GetPlace(), + sizeof(int) * batch_count, + Stream(reinterpret_cast(dev_ctx.stream()))); auto* info_ptr = reinterpret_cast(info->ptr()); #if CUDA_VERSION >= 9020 && !defined(_WIN32) diff --git a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu index 609378cc3b224f..bc97987f15347b 100644 --- a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu +++ b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu @@ -37,7 +37,7 @@ void rocsolver_potrs(const solverHandle_t &handle, T *Bdata, int ldb); -using phi::dtype::complex; +using dtype::complex; #define FUNC_WITH_TYPES(m) \ m(float, s, float) m(double, d, double) \ m(complex, c, rocblas_float_complex) \ @@ -107,15 +107,15 @@ void cusolver_potrs(const solverHandle_t &handle, } template <> -void cusolver_potrs(const solverHandle_t &handle, - cublasFillMode_t uplo, - int M, - int N, - phi::complex64 *Adata, - int lda, - phi::complex64 *Bdata, - int ldb, - int *devInfo) { +void cusolver_potrs(const solverHandle_t &handle, + cublasFillMode_t uplo, + int M, + int N, + complex64 *Adata, + int lda, + complex64 *Bdata, + int ldb, + int *devInfo) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnCpotrs(handle, uplo, @@ -129,15 +129,15 @@ void cusolver_potrs(const solverHandle_t &handle, } template <> -void cusolver_potrs(const cusolverDnHandle_t &handle, - cublasFillMode_t uplo, - int M, - int N, - phi::complex128 *Adata, - int lda, - phi::complex128 *Bdata, - int ldb, - int *devInfo) { +void cusolver_potrs(const cusolverDnHandle_t &handle, + cublasFillMode_t uplo, + int M, + int N, + complex128 *Adata, + int lda, + complex128 *Bdata, + int ldb, + int *devInfo) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnZpotrs( handle, uplo, diff --git a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu index fab500ff2991d7..bcf7986c1f8d1c 100644 --- a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu @@ -350,9 +350,9 @@ void ClassCenterSampleKernel(const Context& dev_ctx, #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { auto stream = dev_ctx.stream(); - phi::distributed::NCCLCommContext* comm_ctx = nullptr; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + distributed::NCCLCommContext* comm_ctx = nullptr; + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( @@ -361,7 +361,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, comm_ctx->AllReduce( &num_classes_per_device, num_classes_per_device, ncclSum, stream); - phi::backends::gpu::GpuStreamSync(stream); + backends::gpu::GpuStreamSync(stream); } #endif @@ -446,7 +446,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, (NumBlocks(num_classes) * kNumCUDAThreads * vec_size) + 1) * vec_size; - // auto gen_cuda = phi::DefaultCUDAGenerator(device_id); + // auto gen_cuda = DefaultCUDAGenerator(device_id); auto gen_cuda = dev_ctx.GetGenerator(); if (!fix_seed) { auto seed_offset = gen_cuda->IncrementOffset(offset); diff --git a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu index 47783fa3c5a093..a5ef2390d0d911 100644 --- a/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/collect_fpn_proposals_kernel.cu @@ -111,18 +111,18 @@ void GPUCollectFpnProposalsOpKernel( } } - phi::memory_utils::Copy(place, - concat_rois_data + roi_offset, - place, - roi_in->data(), - roi_in->numel() * sizeof(T), - dev_ctx.stream()); - phi::memory_utils::Copy(place, - concat_scores_data + score_offset, - place, - score_in->data(), - score_in->numel() * sizeof(T), - dev_ctx.stream()); + memory_utils::Copy(place, + concat_rois_data + roi_offset, + place, + roi_in->data(), + roi_in->numel() * sizeof(T), + dev_ctx.stream()); + memory_utils::Copy(place, + concat_scores_data + score_offset, + place, + score_in->data(), + score_in->numel() * sizeof(T), + dev_ctx.stream()); roi_offset += roi_in->numel(); score_offset += score_in->numel(); } @@ -161,7 +161,7 @@ void GPUCollectFpnProposalsOpKernel( sizeof(T) * 8, dev_ctx.stream()); // Allocate temporary storage - auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes); + auto d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes); // Run sorting operation // sort score to get corresponding index @@ -208,7 +208,7 @@ void GPUCollectFpnProposalsOpKernel( sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage - d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes); + d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes); // Run sorting operation // sort batch_id to get corresponding index @@ -246,12 +246,12 @@ void GPUCollectFpnProposalsOpKernel( "address into the graph; on replay the vector is re-created at a " "different address, causing a dangling-pointer write.")); std::vector length_lod_cpu(lod_size); - phi::memory_utils::Copy(CPUPlace(), - length_lod_cpu.data(), - place, - length_lod_data, - sizeof(int) * lod_size, - dev_ctx.stream()); + memory_utils::Copy(CPUPlace(), + length_lod_cpu.data(), + place, + length_lod_data, + sizeof(int) * lod_size, + dev_ctx.stream()); dev_ctx.Wait(); std::vector offset(1, 0); @@ -263,12 +263,12 @@ void GPUCollectFpnProposalsOpKernel( auto* rois_num = rois_num_out; rois_num->Resize({lod_size}); int* rois_num_data = dev_ctx.template Alloc(rois_num); - phi::memory_utils::Copy(place, - rois_num_data, - place, - length_lod_data, - lod_size * sizeof(int), - dev_ctx.stream()); + memory_utils::Copy(place, + rois_num_data, + place, + length_lod_data, + lod_size * sizeof(int), + dev_ctx.stream()); } LegacyLoD lod; diff --git a/paddle/phi/kernels/gpu/comm_init_all_kernel.cu b/paddle/phi/kernels/gpu/comm_init_all_kernel.cu index 9f759fdc0f4ddf..485bead29d228c 100644 --- a/paddle/phi/kernels/gpu/comm_init_all_kernel.cu +++ b/paddle/phi/kernels/gpu/comm_init_all_kernel.cu @@ -29,7 +29,7 @@ void CommInitAllKernel(const Context& dev_ctx, #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) std::vector devices = devices_input; if (devices.empty()) { - devices = phi::backends::gpu::GetSelectedDevices(); + devices = backends::gpu::GetSelectedDevices(); } paddle::platform::NCCLCommContext::Instance().CreateAllNCCLComms(devices, diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index 9618a4683c424a..d921d9badc021b 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -72,8 +72,8 @@ void ConcatKernel(const Context& dev_ctx, if (lod_size) { auto* out_lod = out->mutable_lod(); for (size_t i = 1; i < x.size(); ++i) { - auto in_lod = phi::ConvertToLengthBasedLegacyLoD(x[i]->lod()); - phi::AppendLegacyLoD(out_lod, in_lod); + auto in_lod = ConvertToLengthBasedLegacyLoD(x[i]->lod()); + AppendLegacyLoD(out_lod, in_lod); } } } diff --git a/paddle/phi/kernels/gpu/contiguous_kernel.cu b/paddle/phi/kernels/gpu/contiguous_kernel.cu index 90a734eddf76ac..3feb65323286f5 100644 --- a/paddle/phi/kernels/gpu/contiguous_kernel.cu +++ b/paddle/phi/kernels/gpu/contiguous_kernel.cu @@ -167,8 +167,8 @@ __global__ void ContiguousCaseOneFunc( template __global__ void ContiguousDefaultFunc( const T* input_data, - phi::Array input_stride, - phi::Array dims, + Array input_stride, + Array dims, const int64_t numel, T* out_data) { CUDA_KERNEL_LOOP_TYPE(i, numel, int64_t) { @@ -233,8 +233,8 @@ template bool LaunchContiguousCaseZeroKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, - const phi::Array& input_dims, + const Array& input_stride, + const Array& input_dims, int rank, T* output_data) { if (rank > 6) { @@ -305,13 +305,13 @@ template bool LaunchContiguousCaseOneKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, - const phi::Array& input_dims, + const Array& input_stride, + const Array& input_dims, int rank, int64_t numel, T* output_data) { Dim3 grid(1, 1, 1), block(1, 1, 1); - phi::Array cur_input_dims; + Array cur_input_dims; block.x = 512; if (rank >= 1) { @@ -445,8 +445,8 @@ template void LaunchContiguousDefaultKernel( const Context& dev_ctx, const T* input_data, - const phi::Array& input_stride, - const phi::Array& input_dims, + const Array& input_stride, + const Array& input_dims, int rank, int64_t numel, T* output_data) { diff --git a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu index b26fa327e01cc8..909fa7ef3a2a1b 100644 --- a/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu @@ -111,7 +111,7 @@ void DepthwiseConv2dTransposeGradKernel(const Context& dev_ctx, &paddings_, &dilations_, padding_algorithm, in_data_dims, strides, ksize); if (dx) { - phi::math::DepthwiseConvFunctor depthwiseConv; + math::DepthwiseConvFunctor depthwiseConv; depthwiseConv(dev_ctx, dout, filter_, @@ -128,8 +128,7 @@ void DepthwiseConv2dTransposeGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(dfilter); set_zero(dev_ctx, dfilter, static_cast(0)); - phi::math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; + math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad( dev_ctx, dout, diff --git a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu index bdca3cc64f667d..cb8f5d6eada59c 100644 --- a/paddle/phi/kernels/gpu/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/conv_transpose_kernel.cu @@ -85,7 +85,7 @@ void DepthwiseConv2dTransposeKernel(const Context& dev_ctx, funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); - phi::math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; + math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad( dev_ctx, *out, diff --git a/paddle/phi/kernels/gpu/cross_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_grad_kernel.cu index 10f88d90317a3f..21f55aa59f702f 100644 --- a/paddle/phi/kernels/gpu/cross_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_grad_kernel.cu @@ -40,7 +40,7 @@ __global__ void CrossGrad(const T* x, int64_t pos1 = offset + 1 * stride; int64_t pos2 = offset + 2 * stride; - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType x_pos0_mp = static_cast(x[pos0]); MPType x_pos1_mp = static_cast(x[pos1]); diff --git a/paddle/phi/kernels/gpu/cross_kernel.cu b/paddle/phi/kernels/gpu/cross_kernel.cu index f93e92b343caff..3d491b5445d2bd 100644 --- a/paddle/phi/kernels/gpu/cross_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_kernel.cu @@ -37,7 +37,7 @@ __global__ void Cross(const T* x, int64_t pos1 = offset + 1 * stride; int64_t pos2 = offset + 2 * stride; - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType x_pos0_mp = static_cast(x[pos0]); MPType x_pos1_mp = static_cast(x[pos1]); diff --git a/paddle/phi/kernels/gpu/ctc_align_kernel.cu b/paddle/phi/kernels/gpu/ctc_align_kernel.cu index ca256a9f91b7a0..841965463ed124 100644 --- a/paddle/phi/kernels/gpu/ctc_align_kernel.cu +++ b/paddle/phi/kernels/gpu/ctc_align_kernel.cu @@ -125,7 +125,7 @@ void CTCAlignOpCUDAKernel(const Context& dev_ctx, output->Resize({num_tokens, 1}); T* output_data = dev_ctx.template Alloc(output); - phi::MixVector mixv_input_lod(&input_lod[level]); + MixVector mixv_input_lod(&input_lod[level]); MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( num_tokens, tokens, diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_cache.h b/paddle/phi/kernels/gpu/cudnn_lstm_cache.h index aeb3af5350441b..e3439ed4264127 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_cache.h +++ b/paddle/phi/kernels/gpu/cudnn_lstm_cache.h @@ -49,13 +49,13 @@ class ScopedRNNBase { template void Create(const cudnnHandle_t& handle, - const phi::Place& place, + const Place& place, const std::vector& sequence_length, size_t* workspace_size, size_t* reserve_size, DenseTensor* dropout_state) { int numDirections = is_bidirec_ ? 2 : 1; - cudnnDataType_t cudnn_type = phi::backends::gpu::CudnnDataType::type; + cudnnDataType_t cudnn_type = backends::gpu::CudnnDataType::type; // ------------------- cudnn x, y descriptors --------------------- std::vector dims_x = {batch_size_, input_size_, 1}; @@ -238,19 +238,19 @@ class ScopedRNNBase { std::vector x_descs_; std::vector y_descs_; - phi::backends::gpu::ScopedTensorDescriptor x_desc_; - phi::backends::gpu::ScopedTensorDescriptor y_desc_; + backends::gpu::ScopedTensorDescriptor x_desc_; + backends::gpu::ScopedTensorDescriptor y_desc_; #if CUDNN_VERSION >= 7201 - phi::backends::gpu::ScopedRNNTensorDescriptor x_seq_desc_; - phi::backends::gpu::ScopedRNNTensorDescriptor y_seq_desc_; + backends::gpu::ScopedRNNTensorDescriptor x_seq_desc_; + backends::gpu::ScopedRNNTensorDescriptor y_seq_desc_; #endif - phi::backends::gpu::ScopedTensorDescriptor init_h_desc_; - phi::backends::gpu::ScopedTensorDescriptor init_c_desc_; - phi::backends::gpu::ScopedTensorDescriptor last_h_desc_; - phi::backends::gpu::ScopedTensorDescriptor last_c_desc_; - phi::backends::gpu::ScopedDropoutDescriptor dropout_desc_; - phi::backends::gpu::ScopedFilterDescriptor weight_desc_; - phi::backends::gpu::ScopedRNNDescriptor rnn_desc_; + backends::gpu::ScopedTensorDescriptor init_h_desc_; + backends::gpu::ScopedTensorDescriptor init_c_desc_; + backends::gpu::ScopedTensorDescriptor last_h_desc_; + backends::gpu::ScopedTensorDescriptor last_c_desc_; + backends::gpu::ScopedDropoutDescriptor dropout_desc_; + backends::gpu::ScopedFilterDescriptor weight_desc_; + backends::gpu::ScopedRNNDescriptor rnn_desc_; }; } // namespace phi diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu index 845f3d17784479..877ffad5c46620 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cudnn_lstm_grad_kernel.cu @@ -112,7 +112,7 @@ void CudnnLSTMGradKernel( bool has_seq_length = running_seq_length != nullptr; std::vector SequenceLength; if (has_seq_length) { - SequenceLength = phi::GetVectorFromTensor(running_seq_length); + SequenceLength = GetVectorFromTensor(running_seq_length); } int seq_length = input_dims[0]; diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu index bb3b9fc6dd5086..76dd3ad716dc7b 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu +++ b/paddle/phi/kernels/gpu/cudnn_lstm_kernel.cu @@ -186,7 +186,7 @@ void CudnnLSTMKernel( if (seed == 0) { // If not specify seed, use global Generator to generate seed. int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = phi::DefaultCUDAGenerator(device_id); + auto gen_cuda = DefaultCUDAGenerator(device_id); seed = static_cast(gen_cuda->Random64()); } } @@ -195,7 +195,7 @@ void CudnnLSTMKernel( bool has_seq_length = running_sequence_length != nullptr; std::vector SequenceLength; if (has_seq_length) { - SequenceLength = phi::GetVectorFromTensor(running_sequence_length); + SequenceLength = GetVectorFromTensor(running_sequence_length); } auto handle = dev_ctx.cudnn_handle(); diff --git a/paddle/phi/kernels/gpu/cudnn_lstm_utils.h b/paddle/phi/kernels/gpu/cudnn_lstm_utils.h index 182bc21af18bde..32a4738831bc1d 100644 --- a/paddle/phi/kernels/gpu/cudnn_lstm_utils.h +++ b/paddle/phi/kernels/gpu/cudnn_lstm_utils.h @@ -53,7 +53,7 @@ inline int size_sum(const std::vector &weight_list) { template inline void weight_to_tensor( - const phi::Place &place, + const Place &place, gpuStream_t stream, const std::vector &weight_list, DenseTensor *weight) { diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu index 0a7d31e5c22860..a120262e3f20c1 100644 --- a/paddle/phi/kernels/gpu/cum_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_kernel.cu @@ -203,7 +203,7 @@ __global__ void BlockScanKernel(T* d_out, int64_t scan_size, bool exclusive, Op op) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; using CallbackOp = BlockPrefixCallbackOp; // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types @@ -263,13 +263,13 @@ void ThrustCumsumKernel(const Context& dev_ctx, int64_t size, bool reverse, bool exclusive) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; #ifdef __HIPCC__ const auto& policy = thrust::hip::par.on(dev_ctx.stream()); #else - phi::memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), - dev_ctx.stream()); + memory_utils::ThrustAllocator allocator(dev_ctx.GetPlace(), + dev_ctx.stream()); const auto& policy = thrust::cuda::par(allocator).on(dev_ctx.stream()); #endif @@ -459,11 +459,10 @@ void CumsumKernel(const Context& dev_ctx, bool exclusive, bool reverse, DenseTensor* out) { - using Op = - typename std::conditional::value || - std::is_same::value, - ComplexSum, - cub::Sum>::type; + using Op = typename std::conditional::value || + std::is_same::value, + ComplexSum, + cub::Sum>::type; if (FLAGS_use_accuracy_compatible_kernel && !exclusive) { if (out && out->numel() == 0) { dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu index 10a11e1f1e16ce..a33d7e34a66538 100644 --- a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu @@ -169,15 +169,15 @@ void ReversedCumsum(const Context &dev_ctx, flipped_input.Resize(input.dims()); dev_ctx.template Alloc(&flipped_input); std::vector axis = {dim}; - phi::FlipKernel(dev_ctx, input, axis, &flipped_input); + FlipKernel(dev_ctx, input, axis, &flipped_input); DenseTensor cumsum_out; cumsum_out.Resize(input.dims()); dev_ctx.template Alloc(&cumsum_out); - phi::CumsumKernel( + CumsumKernel( dev_ctx, flipped_input, dim, false, false, false, &cumsum_out); - phi::FlipKernel(dev_ctx, cumsum_out, axis, output); + FlipKernel(dev_ctx, cumsum_out, axis, output); } template @@ -195,7 +195,7 @@ bool CumprodGradCompatible(const Context &dev_ctx, bool is_trivial = (x.numel() <= 1) || (x_dims[wrap_dim] == 1); if (is_trivial) { dev_ctx.template Alloc(dx); - phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx); + Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx); return true; } @@ -203,42 +203,40 @@ bool CumprodGradCompatible(const Context &dev_ctx, DenseTensor x_conj_tensor; DenseTensor out_conj_tensor; - if (phi::IsComplexType(x.dtype())) { + if (IsComplexType(x.dtype())) { x_conj_tensor.Resize(x.dims()); out_conj_tensor.Resize(out.dims()); dev_ctx.template Alloc(&x_conj_tensor); dev_ctx.template Alloc(&out_conj_tensor); - phi::ConjKernel(dev_ctx, x, &x_conj_tensor); - phi::ConjKernel(dev_ctx, out, &out_conj_tensor); + ConjKernel(dev_ctx, x, &x_conj_tensor); + ConjKernel(dev_ctx, out, &out_conj_tensor); } - const DenseTensor &x_ref = phi::IsComplexType(x.dtype()) ? x_conj_tensor : x; - const DenseTensor &out_ref = - phi::IsComplexType(x.dtype()) ? out_conj_tensor : out; + const DenseTensor &x_ref = IsComplexType(x.dtype()) ? x_conj_tensor : x; + const DenseTensor &out_ref = IsComplexType(x.dtype()) ? out_conj_tensor : out; DenseTensor zero_val; zero_val.Resize({1}); dev_ctx.template Alloc(&zero_val); - phi::FullKernel( - dev_ctx, {1}, static_cast(0), x.dtype(), &zero_val); + FullKernel(dev_ctx, {1}, static_cast(0), x.dtype(), &zero_val); DenseTensor is_zero_mask; is_zero_mask.Resize(x.dims()); dev_ctx.template Alloc(&is_zero_mask); - phi::EqualKernel(dev_ctx, x, zero_val, &is_zero_mask); + EqualKernel(dev_ctx, x, zero_val, &is_zero_mask); DenseTensor any_zero; any_zero.Resize({1}); dev_ctx.template Alloc(&any_zero); - phi::AnyKernel( + AnyKernel( dev_ctx, is_zero_mask, std::vector(), false, &any_zero); bool has_zero = false; #ifdef PADDLE_WITH_CUDA DenseTensor any_zero_cpu; - phi::Copy(dev_ctx, any_zero, CPUPlace(), true, &any_zero_cpu); + Copy(dev_ctx, any_zero, CPUPlace(), true, &any_zero_cpu); has_zero = *any_zero_cpu.data(); #else has_zero = *any_zero.data(); @@ -253,7 +251,7 @@ bool CumprodGradCompatible(const Context &dev_ctx, DenseTensor w; w.Resize(out_ref.dims()); dev_ctx.template Alloc(&w); - phi::MultiplyKernel(dev_ctx, out_ref, dout, &w); + MultiplyKernel(dev_ctx, out_ref, dout, &w); DenseTensor w_flipped, w_cum, rc_w; w_flipped.Resize(w.dims()); @@ -265,14 +263,14 @@ bool CumprodGradCompatible(const Context &dev_ctx, dev_ctx.template Alloc(&rc_w); std::vector axis = {dim}; - phi::FlipKernel(dev_ctx, w, axis, &w_flipped); + FlipKernel(dev_ctx, w, axis, &w_flipped); - phi::CumsumKernel( + CumsumKernel( dev_ctx, w_flipped, dim, false, false, false, &w_cum); - phi::FlipKernel(dev_ctx, w_cum, axis, &rc_w); + FlipKernel(dev_ctx, w_cum, axis, &rc_w); - phi::DivideKernel(dev_ctx, rc_w, x_ref, dx); + DivideKernel(dev_ctx, rc_w, x_ref, dx); return true; } @@ -324,7 +322,7 @@ void CumprodGradKernel(const Context &dev_ctx, const T *y_data_deal; Allocator::AllocationPtr x_conj; Allocator::AllocationPtr y_conj; - if (phi::IsComplexType(x.dtype())) { + if (IsComplexType(x.dtype())) { x_conj = const_cast(dev_ctx.GetAllocator()) .Allocate(numel * sizeof(T)); auto *x_data_conj = reinterpret_cast(x_conj->ptr()); diff --git a/paddle/phi/kernels/gpu/cvm_grad_kernel.cu b/paddle/phi/kernels/gpu/cvm_grad_kernel.cu index 957349834e461c..1d14e131203f5a 100644 --- a/paddle/phi/kernels/gpu/cvm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cvm_grad_kernel.cu @@ -22,8 +22,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void CvmGradComputeKernel(const bool use_cvm, const int64_t item_width, @@ -105,7 +103,7 @@ void CVMGradCUDAKernel(const Context& dev_ctx, lod[lod.size() - 1], common::errors::PreconditionNotMet( "Output(X@GRAD)'s dim[0] must be equal to last element of lod")); - phi::MixVector mixv_lod(&lod); + MixVector mixv_lod(&lod); CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, diff --git a/paddle/phi/kernels/gpu/cvm_kernel.cu b/paddle/phi/kernels/gpu/cvm_kernel.cu index 597ecfb92b818b..204ed5cb2dd81b 100644 --- a/paddle/phi/kernels/gpu/cvm_kernel.cu +++ b/paddle/phi/kernels/gpu/cvm_kernel.cu @@ -21,8 +21,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void CvmComputeKernel(const bool use_cvm, const int64_t item_width, diff --git a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu index d8d3577491e1a0..1594b4030e2a07 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_grad_kernel.cu @@ -54,27 +54,21 @@ __device__ __forceinline__ T WARP_SHFL_DOWN(T value, } template <> -__device__ __forceinline__ phi::dtype::float16 -WARP_SHFL_DOWN(phi::dtype::float16 value, - unsigned int delta, - int width, - unsigned int mask) { +__device__ __forceinline__ dtype::float16 WARP_SHFL_DOWN( + dtype::float16 value, unsigned int delta, int width, unsigned int mask) { uint16_t val_as_ushort = *reinterpret_cast(&value); uint16_t shuffled = WARP_SHFL_DOWN(val_as_ushort, delta, width, mask); - return *reinterpret_cast(&shuffled); + return *reinterpret_cast(&shuffled); } template <> -__device__ __forceinline__ phi::dtype::bfloat16 -WARP_SHFL_DOWN(phi::dtype::bfloat16 value, - unsigned int delta, - int width, - unsigned int mask) { +__device__ __forceinline__ dtype::bfloat16 WARP_SHFL_DOWN( + dtype::bfloat16 value, unsigned int delta, int width, unsigned int mask) { uint16_t val_as_ushort = *reinterpret_cast(&value); uint16_t shuffled = WARP_SHFL_DOWN(val_as_ushort, delta, width, mask); - return *reinterpret_cast(&shuffled); + return *reinterpret_cast(&shuffled); } template @@ -124,7 +118,7 @@ __global__ void DWConv2dBwdInputKernel(const T* __restrict__ grad_output, const int padHeight, const int dilationWidth, const int dilationHeight) { - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const int KW_LIMIT = (kSize != 0) ? kSize : kernelWidth; const int KH_LIMIT = (kSize != 0) ? kSize : kernelHeight; const int strideW = (stride != 0) ? stride : strideWidth; @@ -194,7 +188,7 @@ __global__ void DWConv2dBwdWeightKernel(const T* __restrict__ grad_output, const int padHeight, const int dilationWidth, const int dilationHeight) { - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const int channelStride = kernelWidth * kernelHeight; int bidx = blockIdx.x; @@ -316,7 +310,7 @@ void LaunchDepthwiseConv2dBackwardCompatible(const Context& dev_ctx, // Launch Filter Gradient Kernel (grad_weight) if (filter_grad_nchw_ptr) { - phi::funcs::SetConstant set_zero; + funcs::SetConstant set_zero; set_zero(dev_ctx, filter_grad_nchw_ptr, static_cast(0)); int blocks = outputChannels * kH * kW; @@ -324,7 +318,7 @@ void LaunchDepthwiseConv2dBackwardCompatible(const Context& dev_ctx, dim3 block(GetGradParamsNumThreads(batchSize)); size_t smem = (block.x / CUDA_WARP_SIZE) * - sizeof(typename phi::dtype::MPTypeTrait::Type); + sizeof(typename dtype::MPTypeTrait::Type); DWConv2dBwdWeightKernel <<>>(out_grad_nchw.data(), @@ -420,12 +414,12 @@ void LaunchDepthwiseConv2dBackwardCompatible(const Context& dev_ctx, // Reduce over N(0), H(2), W(3) to get [C] std::vector reduce_dims = {0, 2, 3}; - phi::SumKernel(dev_ctx, - out_grad_nchw, - phi::IntArray(reduce_dims), - CppTypeToDataType::Type(), - false, - bias_grad); + SumKernel(dev_ctx, + out_grad_nchw, + IntArray(reduce_dims), + CppTypeToDataType::Type(), + false, + bias_grad); } if (input_grad && channel_last) { diff --git a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu index 557717c821b8be..a9d022e2ab01ce 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv2d_bias_kernel.cu @@ -52,7 +52,7 @@ __global__ void DWConv2dFwdKernel(const T* __restrict__ input, const int padHeight, const int dilationWidth, const int dilationHeight) { - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const int KW_LIMIT = (kSize != 0) ? kSize : kernelWidth; const int KH_LIMIT = (kSize != 0) ? kSize : kernelHeight; @@ -119,7 +119,7 @@ __global__ void DWConv2dFwdKernelGeneric(const T* __restrict__ input, const int padHeight, const int dilationWidth, const int dilationHeight) { - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; for (IndexT linearIndex = blockIdx.x * blockDim.x + threadIdx.x; linearIndex < totalElements; diff --git a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu index 58515d8a415a03..2c7913a17f5083 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_grad_kernel.cu @@ -353,7 +353,7 @@ void LaunchDepthwiseConv3dBackwardCompatible(const Context& dev_ctx, } auto stream = dev_ctx.stream(); - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const T* input_ptr = input_ncdhw.data(); const T* grad_output_ptr = out_grad_ncdhw.data(); @@ -600,12 +600,12 @@ void LaunchDepthwiseConv3dBackwardCompatible(const Context& dev_ctx, dev_ctx.template Alloc(bias_grad); // Reduce N(0), D(2), H(3), W(4) -> C(1) for NCDHW std::vector reduce_dims = {0, 2, 3, 4}; - phi::SumKernel(dev_ctx, - out_grad_ncdhw, - phi::IntArray(reduce_dims), - CppTypeToDataType::Type(), - false, - bias_grad); + SumKernel(dev_ctx, + out_grad_ncdhw, + IntArray(reduce_dims), + CppTypeToDataType::Type(), + false, + bias_grad); } if (input_grad && channel_last) { diff --git a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu index f3a0e1bf85f491..ef16c89fbc6888 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv3d_bias_kernel.cu @@ -190,7 +190,7 @@ void LaunchDepthwiseConv3dCompatible(const Context& dev_ctx, int grid = std::min((num_outputs - 1) / block + 1, (int64_t)65536); auto stream = dev_ctx.stream(); - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const T* input_ptr = input_ncdhw.data(); T* output_ptr = out_ncdhw.data(); diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu index 058f2d34cdf577..4be5bea2db8c20 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu @@ -76,18 +76,18 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(filter_grad); set_zero(dev_ctx, filter_grad, static_cast(0)); } - phi::DepthwiseConvCudnnGradKernel(dev_ctx, - input, - filter, - *output_grad, - strides_t, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - input_grad, - filter_grad); + DepthwiseConvCudnnGradKernel(dev_ctx, + input, + filter, + *output_grad, + strides_t, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + input_grad, + filter_grad); return; } #endif @@ -121,7 +121,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, set_zero(dev_ctx, input_grad, static_cast(0)); if (fuse_relu) { - phi::math::DepthwiseConvInputGradFunctor + math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad(dev_ctx, input, @@ -133,7 +133,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, input_grad, data_layout); } else { - phi::math::DepthwiseConvInputGradFunctor + math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad(dev_ctx, input, @@ -151,7 +151,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(filter_grad); set_zero(dev_ctx, filter_grad, static_cast(0)); if (fuse_relu) { - phi::math::DepthwiseConvFilterGradFunctor + math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad(dev_ctx, input, @@ -162,7 +162,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, filter_grad, data_layout); } else { - phi::math::DepthwiseConvFilterGradFunctor + math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad(dev_ctx, input, diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu index 27529b8cd707bc..6c1bfa0596a4c8 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu @@ -80,16 +80,16 @@ void DepthwiseConvKernel(const Context& dev_ctx, !defined(PADDLE_WITH_HIP) DWConvParams params(has_fuse_relu, data_format, strides, dilations); if (params.UseCudnnDepthwise(dev_ctx, input, filter)) { - phi::DepthwiseConvCudnnKernel(dev_ctx, - input, - filter, - strides_t, - paddings_t, - padding_algorithm, - groups, - dilations_t, - data_format, - out); + DepthwiseConvCudnnKernel(dev_ctx, + input, + filter, + strides_t, + paddings_t, + padding_algorithm, + groups, + dilations_t, + data_format, + out); return; } #endif @@ -119,7 +119,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, } if (fuse_relu) { - phi::math::DepthwiseConvFunctor depthwiseConv; + math::DepthwiseConvFunctor depthwiseConv; depthwiseConv(dev_ctx, input, filter, @@ -129,7 +129,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, output, data_layout); } else { - phi::math::DepthwiseConvFunctor depthwiseConv; + math::DepthwiseConvFunctor depthwiseConv; depthwiseConv(dev_ctx, input, filter, diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu index b377f83f59a49e..60bb1b68f9038a 100644 --- a/paddle/phi/kernels/gpu/determinant_kernel.cu +++ b/paddle/phi/kernels/gpu/determinant_kernel.cu @@ -36,10 +36,9 @@ template class EigenMatrix {}; template <> -class EigenMatrix { +class EigenMatrix { public: - using MatrixType = - Eigen::Matrix; + using MatrixType = Eigen::Matrix; }; template <> @@ -85,7 +84,7 @@ struct DeterminantCudaFunctor { std::vector input_vec; std::vector output_vec; TensorToVector(input, dev_ctx, &input_vec); - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel auto begin_iter = input_vec.begin() + i * rank * rank; auto end_iter = input_vec.begin() + (i + 1) * rank * rank; @@ -129,31 +128,31 @@ __global__ void GetDetFromLUComplex(const T* lu_data, } template -struct DeterminantCudaFunctor, Context> { +struct DeterminantCudaFunctor, Context> { void operator()(const Context& dev_ctx, const DenseTensor& a, int64_t n, int64_t batch_size, DenseTensor* output) { #ifndef PADDLE_WITH_HIP - phi::Allocator::AllocationPtr tmp_gpu_mat_data; - const phi::dtype::complex* gpu_mat = a.data>(); + Allocator::AllocationPtr tmp_gpu_mat_data; + const dtype::complex* gpu_mat = a.data>(); // Copy all elements of input matrix A to a temporary memory space to // avoid being overridden by getrf. - tmp_gpu_mat_data = phi::memory_utils::Alloc( + tmp_gpu_mat_data = memory_utils::Alloc( dev_ctx.GetPlace(), - a.numel() * sizeof(phi::dtype::complex), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + a.numel() * sizeof(dtype::complex), + Stream(reinterpret_cast(dev_ctx.stream()))); memory_utils::Copy(dev_ctx.GetPlace(), tmp_gpu_mat_data->ptr(), dev_ctx.GetPlace(), a.data(), - a.numel() * sizeof(phi::dtype::complex), + a.numel() * sizeof(dtype::complex), dev_ctx.stream()); - gpu_mat = reinterpret_cast*>( - tmp_gpu_mat_data->ptr()); + gpu_mat = + reinterpret_cast*>(tmp_gpu_mat_data->ptr()); - std::vector*> cpu_ptrs(batch_size); + std::vector*> cpu_ptrs(batch_size); for (int i = 0; i < batch_size; ++i) { cpu_ptrs[i] = gpu_mat + i * n * n; } @@ -161,45 +160,45 @@ struct DeterminantCudaFunctor, Context> { int num_ints = batch_size * (n + 1); // num_ints is for pivot (n * batch_size) and info (batch_size) size_t total_bytes = - batch_size * sizeof(phi::dtype::complex*) + num_ints * sizeof(int); - phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc( + batch_size * sizeof(dtype::complex*) + num_ints * sizeof(int); + Allocator::AllocationPtr tmp_gpu_ptrs_data = memory_utils::Alloc( dev_ctx.GetPlace(), total_bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); memory_utils::Copy(dev_ctx.GetPlace(), tmp_gpu_ptrs_data->ptr(), CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(phi::dtype::complex*), + cpu_ptrs.size() * sizeof(dtype::complex*), dev_ctx.stream()); - phi::dtype::complex** gpu_mat_ptr = - reinterpret_cast**>(tmp_gpu_ptrs_data->ptr()); + dtype::complex** gpu_mat_ptr = + reinterpret_cast**>(tmp_gpu_ptrs_data->ptr()); int* gpu_info_ptr = reinterpret_cast(gpu_mat_ptr + cpu_ptrs.size()); int* pivot_data = gpu_info_ptr + batch_size; - auto blas = funcs::GetBlas>(dev_ctx); + auto blas = funcs::GetBlas>(dev_ctx); // This function performs the LU factorization of each matrix A by the // equation P * A = L * U. L and U are written back to original matrix A, // and diagonal elements of L are discarded. blas.BatchedGETRF(n, gpu_mat_ptr, pivot_data, gpu_info_ptr, batch_size); - phi::dtype::complex* out_data = - dev_ctx.template Alloc>(output); + dtype::complex* out_data = + dev_ctx.template Alloc>(output); int block_size = std::min(256, dev_ctx.GetMaxThreadsPerBlock()); dim3 dim_block(block_size); dim3 num_blocks((batch_size + block_size - 1) / block_size); - GetDetFromLUComplex><<>>( + GetDetFromLUComplex><<>>( gpu_mat, pivot_data, n, batch_size, out_data); #else using MatrixType = Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic>; - std::vector> input_vec; - std::vector> output_vec; + std::vector> input_vec; + std::vector> output_vec; TensorToVector(a, dev_ctx, &input_vec); for (int64_t i = 0; i < batch_size; ++i) { // maybe can be parallel auto begin_iter = input_vec.begin() + i * n * n; auto end_iter = input_vec.begin() + (i + 1) * n * n; - std::vector> sub_vec( + std::vector> sub_vec( begin_iter, end_iter); // get every square matrix data MatrixType matrix(n, n); @@ -209,7 +208,7 @@ struct DeterminantCudaFunctor, Context> { } } output_vec.push_back( - static_cast>(matrix.determinant())); + static_cast>(matrix.determinant())); } TensorFromVector(output_vec, dev_ctx, output); #endif diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu index f52882ef5a2327..5877cbdbe54089 100644 --- a/paddle/phi/kernels/gpu/dgc_kernel.cu +++ b/paddle/phi/kernels/gpu/dgc_kernel.cu @@ -186,18 +186,18 @@ void DGCKernel(const Context& dev_ctx, dev_ctx.template Alloc(gather_buff); int buf_size = paddle::communication::dgc::get_buffer_size(k); - phi::Allocator::AllocationPtr tmp_ious_data; + Allocator::AllocationPtr tmp_ious_data; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (dev_ctx.GetPlace().GetType() == AllocationType::GPU || dev_ctx.GetPlace().GetType() == AllocationType::CUSTOM) { - tmp_ious_data = phi::memory_utils::Alloc( + tmp_ious_data = memory_utils::Alloc( dev_ctx.GetPlace(), buf_size, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + Stream(reinterpret_cast(dev_ctx.stream()))); } #endif if (dev_ctx.GetPlace().GetType() == AllocationType::CPU) { - tmp_ious_data = phi::memory_utils::Alloc(dev_ctx.GetPlace(), buf_size); + tmp_ious_data = memory_utils::Alloc(dev_ctx.GetPlace(), buf_size); } void* buf = reinterpret_cast(tmp_ious_data->ptr()); diff --git a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu index 7657c6c432935d..581e0f2f00e784 100644 --- a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu @@ -21,8 +21,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template void DiagonalGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -66,7 +64,7 @@ void DiagonalGradKernel(const Context& dev_ctx, int blocks = std::min((numel + threads - 1) / threads, blocks_max); int64_t dout_numel = out_grad.numel(); - phi::backends::gpu::GpuMemsetAsync( + backends::gpu::GpuMemsetAsync( dx_data, 0, numel * sizeof(T), dev_ctx.stream()); switch (dx_dim_size) { diff --git a/paddle/phi/kernels/gpu/diagonal_kernel.cu b/paddle/phi/kernels/gpu/diagonal_kernel.cu index 70770c90215b85..c2e10ecc809046 100644 --- a/paddle/phi/kernels/gpu/diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_kernel.cu @@ -20,7 +20,6 @@ #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/diagonal.h" namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; template void DiagonalKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/dist_kernel.cu index 414cec8b37d883..4df46426834338 100644 --- a/paddle/phi/kernels/gpu/dist_kernel.cu +++ b/paddle/phi/kernels/gpu/dist_kernel.cu @@ -77,7 +77,7 @@ struct PowFunctorHighPrecision { template __global__ void ReduceSumWithSubtract( const T* x, const T* y, T* out, int64_t N, Functor func) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT sum_val(0.0); CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) { sum_val += func(x[i], y[i]); } @@ -92,7 +92,7 @@ __global__ void ReduceMaxWithSubtract(const T* x, const T* y, T* out, int64_t N) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT max_val = std::numeric_limits::min(); CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) { max_val = max(max_val, abs(static_cast(x[i]) - static_cast(y[i]))); @@ -109,7 +109,7 @@ __global__ void ReduceMinWithSubtract(const T* x, const T* y, T* out, int64_t N) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; MT min_val = std::numeric_limits::max(); CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) { min_val = min(min_val, abs(static_cast(x[i]) - static_cast(y[i]))); @@ -132,7 +132,7 @@ void DistKernel(const Context& dev_ctx, return; } - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; DenseTensor intermediate; const T* x_ptr = x.data(); const T* y_ptr = y.data(); @@ -144,7 +144,7 @@ void DistKernel(const Context& dev_ctx, if (xdim == y.dims()) { // same shape int64_t n = x.numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); intermediate.Resize({config.block_per_grid.x}); T* i_ptr = dev_ctx.template Alloc(&intermediate); std::vector axis_dims = {static_cast(-1)}; @@ -161,7 +161,7 @@ void DistKernel(const Context& dev_ctx, ReduceMaxWithSubtract <<>>( x_ptr, y_ptr, i_ptr, n); - phi::MaxRawKernel( + MaxRawKernel( dev_ctx, intermediate, reduce_axis, true, true, out); } else if (p == -INFINITY) { @@ -169,7 +169,7 @@ void DistKernel(const Context& dev_ctx, <<>>( x_ptr, y_ptr, i_ptr, n); - phi::MinRawKernel( + MinRawKernel( dev_ctx, intermediate, reduce_axis, true, true, out); } else { diff --git a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu index f082253897a7c0..3c6d6a74e299dc 100644 --- a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu @@ -196,7 +196,7 @@ void DistributeFpnProposalsKernel( sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage - auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes); + auto d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes); // Run sorting operation // sort target level to get corresponding index diff --git a/paddle/phi/kernels/gpu/edit_distance_kernel.cu b/paddle/phi/kernels/gpu/edit_distance_kernel.cu index c12789577642a1..af3c25ef1251ed 100644 --- a/paddle/phi/kernels/gpu/edit_distance_kernel.cu +++ b/paddle/phi/kernels/gpu/edit_distance_kernel.cu @@ -27,8 +27,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template __global__ void FillFirstRow(T* dist, const int N) { int64_t idx = diff --git a/paddle/phi/kernels/gpu/eig_grad_kernel.cu b/paddle/phi/kernels/gpu/eig_grad_kernel.cu index e79f4500e299b5..89ee3f6a347ff3 100644 --- a/paddle/phi/kernels/gpu/eig_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/eig_grad_kernel.cu @@ -55,13 +55,13 @@ void SolveLinearSystemGPU(const GPUContext& dev_ctx, #ifdef PADDLE_WITH_CUDA template <> -void SolveLinearSystemGPU>( +void SolveLinearSystemGPU>( const GPUContext& dev_ctx, - const phi::dtype::complex* + const dtype::complex* matrix_data, // device ptr, row-major, size batch*order*order - const phi::dtype::complex* + const dtype::complex* rhs_data, // device ptr, row-major, size batch*order*rhs_cols - phi::dtype::complex* + dtype::complex* out_data, // device ptr, row-major, size batch*order*rhs_cols int order, int rhs_cols, @@ -69,7 +69,7 @@ void SolveLinearSystemGPU>( // handles cublasHandle_t cublas_handle = dev_ctx.cublas_handle(); cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle(); - auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); + auto stream = Stream(reinterpret_cast(dev_ctx.stream())); // cuComplex constants const cuComplex kAlpha = make_cuFloatComplex(1.0f, 0.0f); @@ -88,22 +88,22 @@ void SolveLinearSystemGPU>( cuComplex* X_row_all = reinterpret_cast(out_data); auto dA_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); auto dB_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); cuComplex* dA_col = reinterpret_cast(dA_col_alloc->ptr()); cuComplex* dB_col = reinterpret_cast(dB_col_alloc->ptr()); - auto d_pivots_alloc = phi::memory_utils::Alloc( + auto d_pivots_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * order * sizeof(int), stream); int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); auto d_info_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), - static_cast(batch_count) * sizeof(int), - stream); + memory_utils::Alloc(dev_ctx.GetPlace(), + static_cast(batch_count) * sizeof(int), + stream); int* d_info = reinterpret_cast(d_info_alloc->ptr()); // A_row layout: row-major (order x order), B_row layout: row-major (order @@ -157,7 +157,7 @@ void SolveLinearSystemGPU>( size_t work_bytes = static_cast(lwork) * sizeof(cuComplex); auto d_work_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); cuComplex* d_work = reinterpret_cast(d_work_alloc->ptr()); for (int i = 0; i < batch_count; ++i) { @@ -238,13 +238,13 @@ void SolveLinearSystemGPU>( } template <> -void SolveLinearSystemGPU>( +void SolveLinearSystemGPU>( const GPUContext& dev_ctx, - const phi::dtype::complex* + const dtype::complex* matrix_data, // device ptr, row-major, size batch*order*order - const phi::dtype::complex* + const dtype::complex* rhs_data, // device ptr, row-major, size batch*order*rhs_cols - phi::dtype::complex* + dtype::complex* out_data, // device ptr, row-major, size batch*order*rhs_cols int order, int rhs_cols, @@ -252,7 +252,7 @@ void SolveLinearSystemGPU>( // handles cublasHandle_t cublas_handle = dev_ctx.cublas_handle(); cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle(); - auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); + auto stream = Stream(reinterpret_cast(dev_ctx.stream())); // cuDoubleComplex constants const cuDoubleComplex kAlpha = make_cuDoubleComplex(1.0f, 0.0f); @@ -273,24 +273,24 @@ void SolveLinearSystemGPU>( cuDoubleComplex* X_row_all = reinterpret_cast(out_data); auto dA_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); auto dB_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); cuDoubleComplex* dA_col = reinterpret_cast(dA_col_alloc->ptr()); cuDoubleComplex* dB_col = reinterpret_cast(dB_col_alloc->ptr()); - auto d_pivots_alloc = phi::memory_utils::Alloc( + auto d_pivots_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * order * sizeof(int), stream); int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); auto d_info_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), - static_cast(batch_count) * sizeof(int), - stream); + memory_utils::Alloc(dev_ctx.GetPlace(), + static_cast(batch_count) * sizeof(int), + stream); int* d_info = reinterpret_cast(d_info_alloc->ptr()); // A_row layout: row-major (order x order), B_row layout: row-major (order @@ -345,7 +345,7 @@ void SolveLinearSystemGPU>( size_t work_bytes = static_cast(lwork) * sizeof(cuDoubleComplex); auto d_work_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); cuDoubleComplex* d_work = reinterpret_cast(d_work_alloc->ptr()); @@ -429,20 +429,20 @@ void SolveLinearSystemGPU>( #ifdef PADDLE_WITH_HIP template <> -void SolveLinearSystemGPU>( +void SolveLinearSystemGPU>( const GPUContext& dev_ctx, - const phi::dtype::complex* + const dtype::complex* matrix_data, // device ptr, row-major, size batch*order*order - const phi::dtype::complex* + const dtype::complex* rhs_data, // device ptr, row-major, size batch*order*rhs_cols - phi::dtype::complex* + dtype::complex* out_data, // device ptr, row-major, size batch*order*rhs_cols int order, int rhs_cols, int batch_count) { // handles rocblas_handle rocblas_handle = dev_ctx.cusolver_dn_handle(); - auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); + auto stream = Stream(reinterpret_cast(dev_ctx.stream())); // rocblas_float_complex constants const rocblas_float_complex kAlpha = rocblas_float_complex{1.0f, 0.0f}; @@ -464,21 +464,21 @@ void SolveLinearSystemGPU>( reinterpret_cast(out_data); auto dA_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); auto dB_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); rocblas_float_complex* dA_col = reinterpret_cast(dA_col_alloc->ptr()); rocblas_float_complex* dB_col = reinterpret_cast(dB_col_alloc->ptr()); - auto d_pivots_alloc = phi::memory_utils::Alloc( + auto d_pivots_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * order * sizeof(rocblas_int), stream); rocblas_int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); - auto d_info_alloc = phi::memory_utils::Alloc( + auto d_info_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * sizeof(rocblas_int), stream); @@ -587,13 +587,12 @@ void SolveLinearSystemGPU>( auto* cpu_ctx = static_cast(pool.Get(cpu_place)); std::vector h_info(batch_count, 0); - phi::memory_utils::Copy( - CPUPlace(), - h_info.data(), - dev_ctx.GetPlace(), - d_info, - static_cast(batch_count) * sizeof(rocblas_int), - reinterpret_cast(dev_ctx.stream())); + memory_utils::Copy(CPUPlace(), + h_info.data(), + dev_ctx.GetPlace(), + d_info, + static_cast(batch_count) * sizeof(rocblas_int), + reinterpret_cast(dev_ctx.stream())); dev_ctx.Wait(); for (int i = 0; i < batch_count; ++i) { @@ -607,20 +606,20 @@ void SolveLinearSystemGPU>( } template <> -void SolveLinearSystemGPU>( +void SolveLinearSystemGPU>( const GPUContext& dev_ctx, - const phi::dtype::complex* + const dtype::complex* matrix_data, // device ptr, row-major, size batch*order*order - const phi::dtype::complex* + const dtype::complex* rhs_data, // device ptr, row-major, size batch*order*rhs_cols - phi::dtype::complex* + dtype::complex* out_data, // device ptr, row-major, size batch*order*rhs_cols int order, int rhs_cols, int batch_count) { // handles rocblas_handle rocblas_handle = dev_ctx.cusolver_dn_handle(); - auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); + auto stream = Stream(reinterpret_cast(dev_ctx.stream())); // rocblas_double_complex constants const rocblas_double_complex kAlpha = rocblas_double_complex{1.0, 0.0}; @@ -642,21 +641,21 @@ void SolveLinearSystemGPU>( reinterpret_cast(out_data); auto dA_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); auto dB_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); + memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); rocblas_double_complex* dA_col = reinterpret_cast(dA_col_alloc->ptr()); rocblas_double_complex* dB_col = reinterpret_cast(dB_col_alloc->ptr()); - auto d_pivots_alloc = phi::memory_utils::Alloc( + auto d_pivots_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * order * sizeof(rocblas_int), stream); rocblas_int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); - auto d_info_alloc = phi::memory_utils::Alloc( + auto d_info_alloc = memory_utils::Alloc( dev_ctx.GetPlace(), static_cast(batch_count) * sizeof(rocblas_int), stream); @@ -763,13 +762,12 @@ void SolveLinearSystemGPU>( auto* cpu_ctx = static_cast(pool.Get(cpu_place)); std::vector h_info(batch_count, 0); - phi::memory_utils::Copy( - CPUPlace(), - h_info.data(), - dev_ctx.GetPlace(), - d_info, - static_cast(batch_count) * sizeof(rocblas_int), - reinterpret_cast(dev_ctx.stream())); + memory_utils::Copy(CPUPlace(), + h_info.data(), + dev_ctx.GetPlace(), + d_info, + static_cast(batch_count) * sizeof(rocblas_int), + reinterpret_cast(dev_ctx.stream())); dev_ctx.Wait(); for (int i = 0; i < batch_count; ++i) { @@ -806,13 +804,12 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L, gV_safe = Fill(dev_ctx, vectorize(V.dims()), T(0)); } DenseTensor trans_v = TransposeLast2Dim(dev_ctx, V); - DenseTensor Vh = phi::Conj(dev_ctx, trans_v); - DenseTensor Lconj = phi::Conj(dev_ctx, L); - DenseTensor Econj = phi::Subtract(dev_ctx, - phi::funcs::Unsqueeze(Lconj, -2), - phi::funcs::Unsqueeze(Lconj, -1)); - DenseTensor VhgV = phi::Matmul(dev_ctx, Vh, gV_safe); - DenseTensor diag_real = phi::Real(dev_ctx, VhgV); + DenseTensor Vh = Conj(dev_ctx, trans_v); + DenseTensor Lconj = Conj(dev_ctx, L); + DenseTensor Econj = Subtract( + dev_ctx, funcs::Unsqueeze(Lconj, -2), funcs::Unsqueeze(Lconj, -1)); + DenseTensor VhgV = Matmul(dev_ctx, Vh, gV_safe); + DenseTensor diag_real = Real(dev_ctx, VhgV); auto cpu_place = CPUPlace(); DeviceContextPool& pool = DeviceContextPool::Instance(); @@ -823,13 +820,13 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L, Copy(dev_ctx, diag_real, cpu_place, false, &diag_real_cpu); DenseTensor diag_res_cpu = - phi::funcs::BatchDiag((*cpu_ctx), diag_real_cpu, batch_count); + funcs::BatchDiag((*cpu_ctx), diag_real_cpu, batch_count); DenseTensor diag_res; dev_ctx.template Alloc(&diag_res); Copy(dev_ctx, diag_res_cpu, GPUPlace(), false, &diag_res); - DenseTensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2); + DenseTensor diag_unsqueezed = funcs::Unsqueeze(diag_res, -2); auto numel = diag_unsqueezed.numel(); DenseTensor diag_unsqueezed_complex; @@ -838,21 +835,20 @@ void ComputeBackwardForComplexInputGPU(const DenseTensor& L, auto* data_diag_un_com = dev_ctx.template Alloc( &diag_unsqueezed_complex, static_cast(numel * sizeof(T))); - phi::funcs::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor( - data_diag_un, data_diag_un_com, numel); + funcs::ForRange for_range(dev_ctx, numel); + funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, numel); for_range(functor); // real tensor multiply complex tensor in broadcast manner - DenseTensor res1 = phi::Multiply(dev_ctx, V, diag_unsqueezed_complex); - DenseTensor res2 = phi::Matmul(dev_ctx, Vh, res1); - DenseTensor result = phi::Subtract(dev_ctx, VhgV, res2); + DenseTensor res1 = Multiply(dev_ctx, V, diag_unsqueezed_complex); + DenseTensor res2 = Matmul(dev_ctx, Vh, res1); + DenseTensor result = Subtract(dev_ctx, VhgV, res2); result.Resize(V.dims()); dev_ctx.template Alloc(&result); - result = phi::Divide(dev_ctx, result, Econj); - result = phi::funcs::DiagFill( - dev_ctx, order, order, order, 0, gL_safe, result); - DenseTensor rhs = phi::Matmul(dev_ctx, result, Vh); + result = Divide(dev_ctx, result, Econj); + result = + funcs::DiagFill(dev_ctx, order, order, order, 0, gL_safe, result); + DenseTensor rhs = Matmul(dev_ctx, result, Vh); // solve linear system // solve(Vh, rhs, out, m, k) @@ -877,7 +873,7 @@ void EigGradKernel(const Context& dev_ctx, const optional& dout_w, const optional& dout_v, DenseTensor* dx) { - auto* dx_data = dev_ctx.template Alloc>(dx); + auto* dx_data = dev_ctx.template Alloc>(dx); if (dx->numel() == 0) { return; } @@ -885,7 +881,7 @@ void EigGradKernel(const Context& dev_ctx, int batch_count = BatchCount(out_v); const int64_t order = out_v.dims(-1); - ComputeBackwardForComplexInputGPU, Context>( + ComputeBackwardForComplexInputGPU, Context>( out_w, out_v, dout_w, dout_v, dx_data, batch_count, order, dev_ctx); } #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/eig_kernel.cu b/paddle/phi/kernels/gpu/eig_kernel.cu index e1322d11d1ee90..6578a4c054c41c 100644 --- a/paddle/phi/kernels/gpu/eig_kernel.cu +++ b/paddle/phi/kernels/gpu/eig_kernel.cu @@ -25,8 +25,8 @@ void EigKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out_w, DenseTensor* out_v) { - dev_ctx.template Alloc>(out_w); - dev_ctx.template Alloc>(out_v); + dev_ctx.template Alloc>(out_w); + dev_ctx.template Alloc>(out_v); if (x.numel() == 0) { return; @@ -39,9 +39,9 @@ void EigKernel(const Context& dev_ctx, // prepare cpu Tensor here, since magma requires output on cpu DenseTensor out_w_cpu, out_v_cpu; out_w_cpu.Resize(out_w->dims()); - (*cpu_ctx).template Alloc>(&out_w_cpu); + (*cpu_ctx).template Alloc>(&out_w_cpu); out_v_cpu.Resize(x.dims()); - (*cpu_ctx).template Alloc>(&out_v_cpu); + (*cpu_ctx).template Alloc>(&out_v_cpu); if (!IsComplexType(x.dtype())) { // output still be complex though input is real @@ -55,55 +55,53 @@ void EigKernel(const Context& dev_ctx, real_w_cpu.Resize(real_w_dim); (*cpu_ctx).template Alloc>(&real_w_cpu); real_v_cpu.Resize(x.dims()); - (*cpu_ctx).template Alloc>(&real_v_cpu); + (*cpu_ctx).template Alloc>(&real_v_cpu); - phi::ApplyEigKernelMagma, Context>( + ApplyEigKernelMagma, Context>( dev_ctx, x, &real_w_cpu, &real_v_cpu); // 1. extract real part & imag part from real_w_cpu - DenseTensor real_part_cpu = phi::funcs::Slice>( + DenseTensor real_part_cpu = funcs::Slice>( (*cpu_ctx), real_w_cpu, {-1}, {0}, {order}); - DenseTensor imag_part_cpu = phi::funcs::Slice>( + DenseTensor imag_part_cpu = funcs::Slice>( (*cpu_ctx), real_w_cpu, {-1}, {order}, {order * 2}); // 2. construct complex values - auto* real_part_data = real_part_cpu.data>(); - auto* imag_part_data = imag_part_cpu.data>(); + auto* real_part_data = real_part_cpu.data>(); + auto* imag_part_data = imag_part_cpu.data>(); int64_t out_w_numel = static_cast(out_w->numel()); - phi::funcs::ForRange for_range((*cpu_ctx), out_w_numel); - phi::funcs::RealImagToComplexFunctor> functor( + funcs::ForRange for_range((*cpu_ctx), out_w_numel); + funcs::RealImagToComplexFunctor> functor( real_part_data, imag_part_data, - out_w_cpu.data>(), + out_w_cpu.data>(), out_w_numel); for_range(functor); // 3. construct complex vectors DenseTensor real_v_trans_cpu = - TransposeLast2Dim, CPUContext>((*cpu_ctx), - real_v_cpu); + TransposeLast2Dim, CPUContext>((*cpu_ctx), real_v_cpu); DenseTensor out_v_trans_cpu; out_v_trans_cpu.Resize(x.dims()); - (*cpu_ctx).template Alloc>(&out_v_trans_cpu); - - phi::ConstructComplexVectors, - phi::dtype::Complex, - CPUContext>(&out_v_trans_cpu, - out_w_cpu, - real_v_trans_cpu, - (*cpu_ctx), - batch_count, - order); - - TransposeTwoAxis, CPUContext>(out_v_trans_cpu, - &out_v_cpu, - x.dims().size() - 1, - x.dims().size() - 2, - (*cpu_ctx)); + (*cpu_ctx).template Alloc>(&out_v_trans_cpu); + + ConstructComplexVectors, dtype::Complex, CPUContext>( + &out_v_trans_cpu, + out_w_cpu, + real_v_trans_cpu, + (*cpu_ctx), + batch_count, + order); + + TransposeTwoAxis, CPUContext>(out_v_trans_cpu, + &out_v_cpu, + x.dims().size() - 1, + x.dims().size() - 2, + (*cpu_ctx)); } else { - phi::ApplyEigKernelMagma(dev_ctx, x, &out_w_cpu, &out_v_cpu); + ApplyEigKernelMagma(dev_ctx, x, &out_w_cpu, &out_v_cpu); } // copy result from cpu to gpu tensor diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index 29fd53f88d6f57..52fc578a478f51 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -33,7 +33,7 @@ void ReduceWrapper(const GPUContext &dev_ctx, DenseTensor *dst) { std::vector reduce_dims = funcs::GetReduceDim(dst->dims(), src->dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, *src, reduce_dims, src->dtype(), false, dst); } @@ -228,7 +228,7 @@ void DefaultMixedPrecisionAddGrad(const GPUContext &dev_ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), dout.dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, dout, reduce_dims, dout.dtype(), false, dx); } } @@ -244,7 +244,7 @@ void DefaultMixedPrecisionAddGrad(const GPUContext &dev_ctx, dev_ctx.template Alloc(&dy_fp32); std::vector reduce_dims = funcs::GetReduceDim(y.dims(), dout.dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, dout, reduce_dims, dout.dtype(), false, &dy_fp32); CastKernel(dev_ctx, dy_fp32, dy->dtype(), dy); } @@ -309,7 +309,7 @@ void DefaultElementwiseAddGrad(const GPUContext &dev_ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, dout, reduce_dims, dout.dtype(), false, dx); } } @@ -323,7 +323,7 @@ void DefaultElementwiseAddGrad(const GPUContext &dev_ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, dout, reduce_dims, dout.dtype(), false, dy); } } @@ -432,7 +432,7 @@ void default_elementwise_sub_grad(const GPUContext &dev_ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); - phi::SumKernel( + SumKernel( dev_ctx, dout, reduce_dims, dout.dtype(), false, dx); } } diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 2e09dca7a9648a..54a64e344acd34 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -67,7 +67,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, const optional& ddy, int axis, DenseTensor* ddout) { - phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); + SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -184,12 +184,12 @@ void AddGradKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_CUDA if (x.dtype() == DataType::FLOAT32 && (y.dtype() == DataType::FLOAT16 || y.dtype() == DataType::BFLOAT16)) { - phi::MixedPrecisionAddGradImpl( + MixedPrecisionAddGradImpl( dev_ctx, x, y, dout, axis, dx, dy, MixedPrecisionAddGradFunc); return; } #endif - phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); + AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); } template @@ -200,7 +200,7 @@ void AddDoubleGradKernel(const Context& dev_ctx, const optional& ddy, int axis, DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); + AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -211,7 +211,7 @@ void AddTripleGradKernel(const Context& dev_ctx, int axis, DenseTensor* d_ddx, DenseTensor* d_ddy) { - phi::AddGradImpl( + AddGradImpl( dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); } diff --git a/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu index 1a23afd87566c0..e68b3acbddddd9 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu @@ -33,7 +33,7 @@ namespace phi { template __global__ void EmbeddingGradAddTo(T* main_grad_out, - const phi::bfloat16* out_grad, + const bfloat16* out_grad, const IndexT* token_indices, const int64_t num_tokens, const int64_t token_length) { @@ -44,7 +44,7 @@ __global__ void EmbeddingGradAddTo(T* main_grad_out, while (idy < num_tokens) { auto id = static_cast(token_indices[idy]); - const phi::bfloat16* token_out_grad = out_grad + idy * token_length; + const bfloat16* token_out_grad = out_grad + idy * token_length; T* token_main_grad = main_grad_out + id * token_length; for (int64_t i = idx; i < token_length; i += blockDim.x) { CudaAtomicAdd(&token_main_grad[i], static_cast(token_out_grad[i])); @@ -77,8 +77,8 @@ struct EmbeddingGradAddToCUDAFunctor { auto main_grad_out_t = main_grad_out_; const auto* token_indices = token_indices_.template data(); T* main_grad_out = dev_ctx_.template Alloc(main_grad_out_t); - const phi::bfloat16* out_grad = reinterpret_cast( - out_grad_.template data()); + const bfloat16* out_grad = reinterpret_cast( + out_grad_.template data()); const int gridx = 2 * dev_ctx_.GetSMCount(); dim3 threads(128, 8); diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index 259f108adedf53..31617e4bd4d329 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -187,7 +187,7 @@ struct EmbeddingSparseGradCUDAFunctor { new_rows.resize(ids_num); auto gpu_place = dev_ctx_.GetPlace(); - phi::MixVector mixv_new_rows(&new_rows); + MixVector mixv_new_rows(&new_rows); if (!std::is_same::value) { InputTypeConvert<<>>( ids_data, ids_num, mixv_new_rows.MutableData(gpu_place)); diff --git a/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu index 8fb4098f395065..af278b5f8ac7e7 100644 --- a/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_with_scaled_gradient_grad_kernel.cu @@ -102,7 +102,7 @@ __global__ void ScaleGradKernel(const int* count_data, int64_t num_weights, int64_t num_weight_dim, T* table) { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < num_weights) { MPType freq = static_cast(count_data[idx]); diff --git a/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu index 7542d8be1bba24..f8112a8ef8963c 100644 --- a/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_as_grad_kernel.cu @@ -51,7 +51,7 @@ void ExpandAsGradKernel(const Context& dev_ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(in_dims, out_dims, -1); - phi::SumKernel( + SumKernel( dev_ctx, out_grad, reduce_dims, out_grad.dtype(), false, in_grad); } } diff --git a/paddle/phi/kernels/gpu/expand_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_grad_kernel.cu index e4fa8f88c35724..465c0db628f9d4 100644 --- a/paddle/phi/kernels/gpu/expand_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_grad_kernel.cu @@ -44,7 +44,7 @@ void ExpandGradKernel(const Context& dev_ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(x_grad->dims(), out_grad.dims(), -1); - phi::SumKernel( + SumKernel( dev_ctx, out_grad, reduce_dims, out_grad.dtype(), false, x_grad); } } diff --git a/paddle/phi/kernels/gpu/exponential_kernel.cu b/paddle/phi/kernels/gpu/exponential_kernel.cu index 11747e7adefc1f..9e1694d5e1df2d 100644 --- a/paddle/phi/kernels/gpu/exponential_kernel.cu +++ b/paddle/phi/kernels/gpu/exponential_kernel.cu @@ -25,7 +25,7 @@ void ExponentialKernel(const Context &dev_ctx, const DenseTensor &x, float lambda, DenseTensor *out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; funcs::uniform_distribution dist; funcs::exponential_transform trans(lambda); funcs::distribution_and_transform(dev_ctx, out, dist, trans); diff --git a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu index 059468345ead84..31f56972ff0ba8 100644 --- a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu @@ -386,13 +386,13 @@ void FlashAttnUnpaddedGradBaseKernel(const Context& dev_ctx, if (!is_mha) { if (dk) { if (dk->meta().is_contiguous()) - phi::SumKernel(dev_ctx, dk_tmp, {2}, dk->type(), false, dk); + SumKernel(dev_ctx, dk_tmp, {2}, dk->type(), false, dk); else kvReduceForGQA(dev_ctx, dk_tmp, dk); } if (dv) { if (dv->meta().is_contiguous()) - phi::SumKernel(dev_ctx, dv_tmp, {2}, dv->type(), false, dv); + SumKernel(dev_ctx, dv_tmp, {2}, dv->type(), false, dv); else kvReduceForGQA(dev_ctx, dv_tmp, dv); } @@ -693,27 +693,27 @@ void FlashAttnGradBaseKernel(const Context& dev_ctx, dev_ctx.template Alloc(&flashmask_maxmin); downstart_row_indices = - phi::Slice(dev_ctx, startend_row_indices.get(), {3}, {0}, {1}); + Slice(dev_ctx, startend_row_indices.get(), {3}, {0}, {1}); downstart_row_indices_data = downstart_row_indices.data(); if (startend_row_indices->dims()[3] == 2) { if (!causal) { - upend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + upend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); upend_row_indices_data = upend_row_indices.data(); } else { - downend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + downend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); downend_row_indices_data = downend_row_indices.data(); } } else if (startend_row_indices->dims()[3] == 4) { - upend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {3}, {4}); + upend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {3}, {4}); upend_row_indices_data = upend_row_indices.data(); - downend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + downend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); downend_row_indices_data = downend_row_indices.data(); - upstart_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {2}, {3}); + upstart_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {2}, {3}); upstart_row_indices_data = upstart_row_indices.data(); } } @@ -888,16 +888,14 @@ void FlashAttnGradBaseKernel(const Context& dev_ctx, if (!is_mha) { if (dk) { if (dk->meta().is_contiguous()) - phi::SumKernel( - dev_ctx, dk_tmp, {3}, dk->type(), false, dk); + SumKernel(dev_ctx, dk_tmp, {3}, dk->type(), false, dk); else kvReduceBatchedForGQA(dev_ctx, dk_tmp, dk); } if (dv) { if (dv->meta().is_contiguous()) - phi::SumKernel( - dev_ctx, dv_tmp, {3}, dv->type(), false, dv); + SumKernel(dev_ctx, dv_tmp, {3}, dv->type(), false, dv); else kvReduceBatchedForGQA(dev_ctx, dv_tmp, dv); } diff --git a/paddle/phi/kernels/gpu/flash_attn_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_kernel.cu index 127ccb256c6285..07dc127414e624 100644 --- a/paddle/phi/kernels/gpu/flash_attn_kernel.cu +++ b/paddle/phi/kernels/gpu/flash_attn_kernel.cu @@ -451,27 +451,27 @@ void FlashAttnBaseKernel(const Context& dev_ctx, dev_ctx.template Alloc(&flashmask_maxmin); downstart_row_indices = - phi::Slice(dev_ctx, startend_row_indices.get(), {3}, {0}, {1}); + Slice(dev_ctx, startend_row_indices.get(), {3}, {0}, {1}); downstart_row_indices_data = downstart_row_indices.data(); if (startend_row_indices->dims()[3] == 2) { if (!causal) { - upend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + upend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); upend_row_indices_data = upend_row_indices.data(); } else { - downend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + downend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); downend_row_indices_data = downend_row_indices.data(); } } else if (startend_row_indices->dims()[3] == 4) { - upend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {3}, {4}); + upend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {3}, {4}); upend_row_indices_data = upend_row_indices.data(); - downend_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); + downend_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {1}, {2}); downend_row_indices_data = downend_row_indices.data(); - upstart_row_indices = phi::Slice( - dev_ctx, startend_row_indices.get(), {3}, {2}, {3}); + upstart_row_indices = + Slice(dev_ctx, startend_row_indices.get(), {3}, {2}, {3}); upstart_row_indices_data = upstart_row_indices.data(); } } diff --git a/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu index c50ef02a78294c..c542e9da10b4b7 100644 --- a/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/flash_attn_v3_grad_kernel.cu @@ -1032,22 +1032,22 @@ void FlashMaskV2GradBaseKernel( dev_ctx.template Alloc(&flashmask_maxmin); lt_start_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {0}, {1}); + Slice(dev_ctx, startend_row_indices, {3}, {0}, {1}); if (startend_row_indices.dims()[3] == 2) { if (!is_causal) { ut_end_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); + Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); } else { lt_end_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); + Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); } } else if (startend_row_indices.dims()[3] == 4) { ut_end_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {3}, {4}); + Slice(dev_ctx, startend_row_indices, {3}, {3}, {4}); lt_end_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); + Slice(dev_ctx, startend_row_indices, {3}, {1}, {2}); ut_start_row_indices = - phi::Slice(dev_ctx, startend_row_indices, {3}, {2}, {3}); + Slice(dev_ctx, startend_row_indices, {3}, {2}, {3}); } } @@ -1393,7 +1393,7 @@ void FlashMaskV2GradBaseKernel( // different from hdim_qk for now DenseTensor tile_count_semaphore; if (arch >= 90) { - tile_count_semaphore = phi::Full(dev_ctx, {1}, 0); + tile_count_semaphore = Full(dev_ctx, {1}, 0); dynload::flashmaskv2_bwd_params_set_tile_count_semaphore( params_handle, tile_count_semaphore.data()); } else { diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index 5330c135233760..9298e0c965788c 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -24,9 +24,9 @@ namespace phi { template __global__ void FlipCudaKernel(const T* in_data, T* out_data, - phi::Array shape, - phi::Array stride, - phi::Array flip_dims, + Array shape, + Array stride, + Array flip_dims, const int rank, const int64_t numel, const int flip_dims_size) { @@ -79,9 +79,9 @@ void FlipKernel(const Context& dev_ctx, size_t flip_dims_size = axis.size(); auto x_stride = common::stride(x_dims); - phi::Array stride_array; - phi::Array shape_array; - phi::Array flip_dims_array; + Array stride_array; + Array shape_array; + Array flip_dims_array; for (int i = 0; i < rank; ++i) { stride_array[i] = x_stride[i]; @@ -93,7 +93,7 @@ void FlipKernel(const Context& dev_ctx, } } - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); FlipCudaKernel <<>>( in_data, diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index 71f022b58b4377..828a0a7c298ad7 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -71,14 +71,13 @@ void FullLikeKernel(const Context& dev_ctx, // the operator is 0 int64_t numel = out->numel(); - if (!std::is_same::value && - !std::is_same::value && - !std::is_same::value) { + if (!std::is_same::value && + !std::is_same::value && !std::is_same::value) { auto value = val.to(); using CommonType = typename std::common_type< float, - typename std::conditional::value || - std::is_same::value, + typename std::conditional::value || + std::is_same::value, float, T>::type>::type; auto common_type_value = static_cast(value); diff --git a/paddle/phi/kernels/gpu/fused_adam_kernel.cu b/paddle/phi/kernels/gpu/fused_adam_kernel.cu index f4261ffcbfb9ac..371632204d6e91 100644 --- a/paddle/phi/kernels/gpu/fused_adam_kernel.cu +++ b/paddle/phi/kernels/gpu/fused_adam_kernel.cu @@ -31,7 +31,7 @@ namespace phi { template struct FusedAdamBetaPowInfo { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; FusedAdamBetaPowInfo(const MT* beta1pow, const MT* beta2pow) { beta1pow_ = *beta1pow; beta2pow_ = *beta2pow; @@ -48,7 +48,7 @@ struct FusedAdamBetaPowInfo { template struct FusedAdamBetaPowInfo { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; FusedAdamBetaPowInfo(const MT* beta1pow, const MT* beta2pow) { beta1pow_ = beta1pow; beta2pow_ = beta2pow; @@ -286,7 +286,7 @@ static void CopyTensorIfDifferent(const Context& dev_ctx, for (size_t i = 0; i < src.size(); ++i) { if (src[i] != dst[i]) { VLOG(10) << "Copy Tensor " << i; - phi::Place place = (use_src_place ? src[i]->place() : dev_ctx.GetPlace()); + Place place = (use_src_place ? src[i]->place() : dev_ctx.GetPlace()); Copy(dev_ctx, *(src[i]), place, false, dst[i]); } } @@ -330,7 +330,7 @@ PADDLE_API void FusedAdamKernel( std::vector beta1_pows_out, std::vector beta2_pows_out, std::vector master_params_out) { - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; auto n = params.size(); auto beta1_pow_first = beta1_pows[0]; diff --git a/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu b/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu index f6493b89b46ecb..0014b33005f170 100644 --- a/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu +++ b/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu @@ -1056,7 +1056,7 @@ struct AffineQuantStore { float normalized_val = normalized_i * static_cast(gamma_pack.elem[i]) + static_cast(beta_pack.elem[i]); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { y_pack.elem[i] = FP8QuantHelperFunc(normalized_val, quant_out_scale, quant_round_type, @@ -1138,7 +1138,7 @@ void RmsNormQuantKernel(const Context& dev_ctx, quant_scale)); } - using ComputeType = typename phi::dtype::MPTypeTrait::Type; + using ComputeType = typename dtype::MPTypeTrait::Type; const T* x_data = x.data(); const T* norm_weight_data = norm_weight.data(); @@ -1181,9 +1181,8 @@ void RmsNormQuantKernel(const Context& dev_ctx, dev_ctx.stream(), load, store, rows, cols, epsilon, inv_var_data); } else if (out->dtype() == DataType::FLOAT8_E4M3FN) { // Quantize and output float8_e4m3fn. - phi::float8_e4m3fn* out_data = - dev_ctx.template Alloc(out); - AffineQuantStore store( + float8_e4m3fn* out_data = dev_ctx.template Alloc(out); + AffineQuantStore store( out_data, cols, norm_weight_data, @@ -1220,9 +1219,8 @@ void RmsNormQuantKernel(const Context& dev_ctx, dev_ctx.stream(), load, store, rows, cols, epsilon, inv_var_data); } else if (out->dtype() == DataType::FLOAT8_E4M3FN) { // Quantize and output float8_e4m3fn. - phi::float8_e4m3fn* out_data = - dev_ctx.template Alloc(out); - AffineQuantStore store( + float8_e4m3fn* out_data = dev_ctx.template Alloc(out); + AffineQuantStore store( out_data, cols, norm_weight_data, @@ -1256,7 +1254,7 @@ void ResidualAddRmsNormWrapper(const Context& dev_ctx, const int cols, T* residual_output, T* output) { - using ComputeType = typename phi::dtype::MPTypeTrait::Type; + using ComputeType = typename dtype::MPTypeTrait::Type; ResidualAddBiasLoad load( x, residual, bias, residual_output, cols); AffineStore store(output, cols, norm_weight, norm_bias); @@ -1265,28 +1263,28 @@ void ResidualAddRmsNormWrapper(const Context& dev_ctx, } template void ResidualAddRmsNormWrapper(const GPUContext& dev_ctx, - const phi::float16* x, - const phi::float16* residual, - const phi::float16* bias, - const phi::float16* norm_weight, - const phi::float16* norm_bias, + const float16* x, + const float16* residual, + const float16* bias, + const float16* norm_weight, + const float16* norm_bias, const float epsilon, const int rows, const int cols, - phi::float16* residual_output, - phi::float16* output); + float16* residual_output, + float16* output); template void ResidualAddRmsNormWrapper(const GPUContext& dev_ctx, - const phi::bfloat16* x, - const phi::bfloat16* residual, - const phi::bfloat16* bias, - const phi::bfloat16* norm_weight, - const phi::bfloat16* norm_bias, + const bfloat16* x, + const bfloat16* residual, + const bfloat16* bias, + const bfloat16* norm_weight, + const bfloat16* norm_bias, const float epsilon, const int rows, const int cols, - phi::bfloat16* residual_output, - phi::bfloat16* output); + bfloat16* residual_output, + bfloat16* output); template void ResidualAddRmsNormWrapper(const GPUContext& dev_ctx, const float* x, @@ -1309,7 +1307,7 @@ void RmsNormWrapper(const Context& dev_ctx, const int rows, const int cols, T* output) { - using ComputeType = typename phi::dtype::MPTypeTrait::Type; + using ComputeType = typename dtype::MPTypeTrait::Type; DirectLoad load(x, cols); AffineStore store(output, cols, weight, bias); @@ -1318,22 +1316,22 @@ void RmsNormWrapper(const Context& dev_ctx, } template void RmsNormWrapper(const GPUContext& dev_ctx, - const phi::float16* x, - const phi::float16* weight, - const phi::float16* bias, + const float16* x, + const float16* weight, + const float16* bias, const float epsilon, const int rows, const int cols, - phi::float16* output); + float16* output); template void RmsNormWrapper(const GPUContext& dev_ctx, - const phi::bfloat16* x, - const phi::bfloat16* weight, - const phi::bfloat16* bias, + const bfloat16* x, + const bfloat16* weight, + const bfloat16* bias, const float epsilon, const int rows, const int cols, - phi::bfloat16* output); + bfloat16* output); template void RmsNormWrapper(const GPUContext& dev_ctx, const float* x, diff --git a/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu b/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu index 2c6feb325ac789..10c3145f6f0309 100644 --- a/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu +++ b/paddle/phi/kernels/gpu/fused_token_prune_kernel.cu @@ -148,8 +148,8 @@ void FusedTokenPruneOpCUDAKernel(const Context& dev_ctx, dev_ctx, attn_tmp, false, reduce_dims, attn_accu.dtype(), &attn_accu); // 3. Prepare token indices - phi::backends::gpu::GpuLaunchConfig config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * max_seq_len); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * max_seq_len); FillIndex<<::max(); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz); + config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz); MaximumFirst << <<>>( x.data(), diff --git a/paddle/phi/kernels/gpu/gather_nd_kernel.cu b/paddle/phi/kernels/gpu/gather_nd_kernel.cu index 8cb5c5028415f6..18f0b880a55793 100644 --- a/paddle/phi/kernels/gpu/gather_nd_kernel.cu +++ b/paddle/phi/kernels/gpu/gather_nd_kernel.cu @@ -40,7 +40,7 @@ void GatherNdKernel(const Context &dev_ctx, for (int i = 0; i < x.dims().size(); ++i) { out_dims.emplace_back(1); } - phi::TileKernel(dev_ctx, x, phi::IntArray(out_dims), out); + TileKernel(dev_ctx, x, IntArray(out_dims), out); return; } if (index.dims()[0] == 0 && index.numel() == 0) return; diff --git a/paddle/phi/kernels/gpu/gaussian_kernel.cu b/paddle/phi/kernels/gpu/gaussian_kernel.cu index d105035fe1f28a..7eb7857015c95e 100644 --- a/paddle/phi/kernels/gpu/gaussian_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_kernel.cu @@ -29,7 +29,7 @@ namespace phi { template -using ComplexType = phi::dtype::complex; +using ComplexType = dtype::complex; template struct GaussianGenerator { @@ -46,7 +46,7 @@ struct GaussianGenerator { __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed_); - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; thrust::normal_distribution dist(static_cast(mean_), static_cast(std_)); unsigned int new_n = n + offset_; @@ -86,8 +86,8 @@ struct GaussianGenerator> { // If T is not complex template ::value && - !std::is_same::value, + std::enable_if_t::value && + !std::is_same::value, bool> = true> void GaussianRandom(const Context& dev_ctx, const IntArray& shape, @@ -100,7 +100,7 @@ void GaussianRandom(const Context& dev_ctx, dev_ctx.template Alloc(out); if (seed == 0) { // use global Generator seed - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; funcs::normal_distribution dist; funcs::normal_transform trans(static_cast(mean), static_cast(std)); @@ -116,8 +116,8 @@ void GaussianRandom(const Context& dev_ctx, // If T is complex template ::value || - std::is_same::value, + std::enable_if_t::value || + std::is_same::value, bool> = true> void GaussianRandom(const Context& dev_ctx, const IntArray& shape, @@ -137,15 +137,14 @@ void GaussianRandom(const Context& dev_ctx, out_imag.Resize(shape.GetData()); dev_ctx.template Alloc(&out_real); dev_ctx.template Alloc(&out_imag); - funcs::normal_distribution> dist; - funcs::normal_distribution> dist_imag; - funcs::normal_transform> trans(mean, - std_of_real_or_imag); - funcs::distribution_and_transform>( + funcs::normal_distribution> dist; + funcs::normal_distribution> dist_imag; + funcs::normal_transform> trans(mean, std_of_real_or_imag); + funcs::distribution_and_transform>( dev_ctx, &out_real, dist, trans); - funcs::distribution_and_transform>( + funcs::distribution_and_transform>( dev_ctx, &out_imag, dist_imag, trans); - phi::ComplexKernel>(dev_ctx, out_real, out_imag, out); + ComplexKernel>(dev_ctx, out_real, out_imag, out); } else { // use OP seed auto func = GaussianGenerator(mean, std_of_real_or_imag, seed); @@ -156,8 +155,8 @@ void GaussianRandom(const Context& dev_ctx, // If T is not complex template ::value && - !std::is_same::value, + std::enable_if_t::value && + !std::is_same::value, bool> = true> void GaussianRandomInplace(const Context& dev_ctx, const DenseTensor& x, @@ -168,7 +167,7 @@ void GaussianRandomInplace(const Context& dev_ctx, dev_ctx.template Alloc(out); if (seed == 0) { // use global Generator seed - using MT = typename phi::dtype::MPTypeTrait::Type; + using MT = typename dtype::MPTypeTrait::Type; funcs::normal_distribution dist; funcs::normal_transform trans(static_cast(mean), static_cast(std)); @@ -184,8 +183,8 @@ void GaussianRandomInplace(const Context& dev_ctx, // If T is complex template ::value || - std::is_same::value, + std::enable_if_t::value || + std::is_same::value, bool> = true> void GaussianRandomInplace(const Context& dev_ctx, const DenseTensor& x, @@ -203,15 +202,14 @@ void GaussianRandomInplace(const Context& dev_ctx, out_imag.Resize(x.dims()); dev_ctx.template Alloc(&out_real); dev_ctx.template Alloc(&out_imag); - funcs::normal_distribution> dist; - funcs::normal_distribution> dist_imag; - funcs::normal_transform> trans(mean, - std_of_real_or_imag); - funcs::distribution_and_transform>( + funcs::normal_distribution> dist; + funcs::normal_distribution> dist_imag; + funcs::normal_transform> trans(mean, std_of_real_or_imag); + funcs::distribution_and_transform>( dev_ctx, &out_real, dist, trans); - funcs::distribution_and_transform>( + funcs::distribution_and_transform>( dev_ctx, &out_imag, dist_imag, trans); - phi::ComplexKernel>(dev_ctx, out_real, out_imag, out); + ComplexKernel>(dev_ctx, out_real, out_imag, out); } else { // use OP seed auto func = GaussianGenerator(mean, std_of_real_or_imag, seed); diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu index a2da0717bba5d7..bdc2265e636c27 100644 --- a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu @@ -28,7 +28,7 @@ namespace phi { template struct GeluWithApproximateGradFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { MPType x = static_cast(arg_x); MPType dout = static_cast(arg_dout); @@ -54,7 +54,7 @@ struct GeluWithApproximateGradFunctor { template struct GeluWithoutApproximateGradFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { MPType x = static_cast(arg_x); MPType dout = static_cast(arg_dout); diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu index b276ca277311df..a9058c157774cf 100644 --- a/paddle/phi/kernels/gpu/gelu_kernel.cu +++ b/paddle/phi/kernels/gpu/gelu_kernel.cu @@ -32,7 +32,7 @@ namespace phi { template struct GeluWithApproximateFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T arg_x) { // this function is tanh approximation of gelu MPType x = static_cast(arg_x); @@ -48,7 +48,7 @@ struct GeluWithApproximateFunctor { template struct GeluWithoutApproximateFunctor { - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T arg_x) { // actual gelu with approximation = false MPType x = static_cast(arg_x); diff --git a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu index 7015d483d2188f..bed31526f6ccb7 100644 --- a/paddle/phi/kernels/gpu/generate_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/generate_proposals_kernel.cu @@ -68,7 +68,7 @@ static void SortDescending(const GPUContext &dev_ctx, dev_ctx.stream()); // Allocate temporary storage auto place = dev_ctx.GetPlace(); - auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes); + auto d_temp_storage = memory_utils::Alloc(place, temp_storage_bytes); // Run sorting operation cub::DeviceRadixSort::SortPairsDescending(d_temp_storage->ptr(), @@ -293,10 +293,10 @@ static void NMS(const GPUContext &dev_ctx, const T *boxes = proposals.data(); auto place = dev_ctx.GetPlace(); - auto mask_ptr = phi::memory_utils::Alloc( - place, - boxes_num * col_blocks * sizeof(uint64_t), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto mask_ptr = + memory_utils::Alloc(place, + boxes_num * col_blocks * sizeof(uint64_t), + Stream(reinterpret_cast(dev_ctx.stream()))); uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); NMSKernel<<>>( diff --git a/paddle/phi/kernels/gpu/global_gather_kernel.cu b/paddle/phi/kernels/gpu/global_gather_kernel.cu index a7e31903e24daf..0663075345a1d0 100644 --- a/paddle/phi/kernels/gpu/global_gather_kernel.cu +++ b/paddle/phi/kernels/gpu/global_gather_kernel.cu @@ -76,15 +76,15 @@ struct GlobalGatherFunctor { cpu_global_count_data = cpu_global_count.data(); } - ncclDataType_t dtype = phi::ToNCCLDataType(x->dtype()); + ncclDataType_t dtype = ToNCCLDataType(x->dtype()); gpuStream_t stream = nullptr; stream = dev_ctx.stream(); - phi::distributed::NCCLCommContext *comm_ctx = nullptr; + distributed::NCCLCommContext *comm_ctx = nullptr; int nranks = 0; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( diff --git a/paddle/phi/kernels/gpu/global_scatter_kernel.cu b/paddle/phi/kernels/gpu/global_scatter_kernel.cu index f14b3eea87517d..f1d4a4add322cf 100644 --- a/paddle/phi/kernels/gpu/global_scatter_kernel.cu +++ b/paddle/phi/kernels/gpu/global_scatter_kernel.cu @@ -75,14 +75,14 @@ struct GlobalScatterFunctor { global_count_len = cpu_global_count.numel(); } - ncclDataType_t dtype = phi::ToNCCLDataType(x->dtype()); + ncclDataType_t dtype = ToNCCLDataType(x->dtype()); gpuStream_t stream = nullptr; stream = dev_ctx.stream(); - phi::distributed::NCCLCommContext* comm_ctx = nullptr; + distributed::NCCLCommContext* comm_ctx = nullptr; int nranks = 0; - comm_ctx = static_cast( - dev_ctx.GetCommContext()); + comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE(comm_ctx, nullptr, common::errors::Unavailable( diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu index 6cbdb172e0a9f6..e08075b69facc8 100644 --- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu @@ -40,14 +40,14 @@ __global__ void InitializeHashTable(T* tensor, int len) { } template -std::shared_ptr FillHashTable(const Context& dev_ctx, - const T* input, - int num_input, - int64_t len_hashtable, - T* keys, - int* values, - int* key_index, - int* final_nodes_len) { +std::shared_ptr FillHashTable(const Context& dev_ctx, + const T* input, + int num_input, + int64_t len_hashtable, + T* keys, + int* values, + int* key_index, + int* final_nodes_len) { const auto place = dev_ctx.GetPlace(); int block = 1024; @@ -73,7 +73,7 @@ std::shared_ptr FillHashTable(const Context& dev_ctx, int total_unique_items = item_count[num_input]; auto unique_items = - phi::memory_utils::AllocShared(place, total_unique_items * sizeof(T)); + memory_utils::AllocShared(place, total_unique_items * sizeof(T)); T* unique_items_data = reinterpret_cast(unique_items->ptr()); *final_nodes_len = total_unique_items; @@ -176,12 +176,11 @@ void Reindex(const Context& dev_ctx, int64_t log_num = 1 << static_cast(1 + std::log2(num >> 1)); int64_t table_size = log_num << 1; - auto keys = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T)); + auto keys = memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T)); auto values = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int)); + memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int)); auto key_index = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int)); + memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int)); T* keys_ptr = reinterpret_cast(keys->ptr()); int* values_ptr = reinterpret_cast(values->ptr()); int* key_index_ptr = reinterpret_cast(key_index->ptr()); @@ -197,7 +196,7 @@ void Reindex(const Context& dev_ctx, key_index_ptr, table_size); int unique_len = 0; - std::shared_ptr unique_items = + std::shared_ptr unique_items = FillHashTable(dev_ctx, thrust::raw_pointer_cast(out_nodes->data()), out_nodes->size(), diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h index 71c915ef916192..32d6ac6a5947c1 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -42,7 +42,7 @@ struct GraphSendRecvMaxCUDAFunctor { T* output, const IndexT& in_i, const IndexT& out_i) { - phi::CudaAtomicMax(output + out_i, *(params + in_i)); + CudaAtomicMax(output + out_i, *(params + in_i)); } }; @@ -52,7 +52,7 @@ struct GraphSendRecvMinCUDAFunctor { T* output, const IndexT& in_i, const IndexT& out_i) { - phi::CudaAtomicMin(output + out_i, *(params + in_i)); + CudaAtomicMin(output + out_i, *(params + in_i)); } }; diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index 3d6eb173d10f47..30afc8569778f7 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -109,14 +109,14 @@ struct GraphSendUERecvSumCUDAFunctor { template struct GraphSendUERecvMaxCUDAFunctor { DEVICE inline void operator()(T* output, T val) { - phi::CudaAtomicMax(output, val); + CudaAtomicMax(output, val); } }; template struct GraphSendUERecvMinCUDAFunctor { DEVICE inline void operator()(T* output, T val) { - phi::CudaAtomicMin(output, val); + CudaAtomicMin(output, val); } }; diff --git a/paddle/phi/kernels/gpu/grid_sample_utils.h b/paddle/phi/kernels/gpu/grid_sample_utils.h index 8097e6f7007837..66ee5d5fd50a18 100644 --- a/paddle/phi/kernels/gpu/grid_sample_utils.h +++ b/paddle/phi/kernels/gpu/grid_sample_utils.h @@ -62,14 +62,14 @@ inline bool cudnnIsAvailable() { return false; #elif defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // cuDNN/MIOpen version > 0 means DNN lib loaded; require v7+ for sampler - return phi::backends::gpu::DnnVersion() >= 7000; + return backends::gpu::DnnVersion() >= 7000; #else return false; #endif } inline bool isGpuTensor(const DenseTensor& x) { - return phi::is_gpu_place(x.place()); + return is_gpu_place(x.place()); } inline bool canUse32bitIndexMath(const DenseTensor& x) { diff --git a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu index 5bb61893098ef9..1931e8042256f3 100644 --- a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu @@ -160,7 +160,7 @@ template __device__ __forceinline__ AccT GradWarpReduceSum(AccT val) { #pragma unroll for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - val += phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val, offset); + val += backends::gpu::CudaShuffleDownSync(0xffffffff, val, offset); } return val; } @@ -752,7 +752,7 @@ void GroupNormGradKernel(const Context& dev_ctx, } return; } - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const DataLayout data_layout = StringToDataLayout(data_layout_str); const auto scale_ptr = scale.get_ptr(); const auto bias_ptr = bias.get_ptr(); diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu index 9d94f6d36c1d6c..ebbb4ad20199c0 100644 --- a/paddle/phi/kernels/gpu/group_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu @@ -125,7 +125,7 @@ static int64_t findMaxDivisor(int64_t n, int64_t maxAllowedDivisor) { template inline __device__ void UpdateSum(const T* srcX, float* sum, float* sumSq) { - float src_data = phi::__2float(*srcX); + float src_data = __2float(*srcX); *sum += src_data; *sumSq += src_data * src_data; } @@ -135,8 +135,8 @@ inline __device__ void UpdateSum(const T* srcX, const T* srcR, float* sum, float* sumSq) { - float src_data = phi::__2float(*srcX); - float srcy_data = phi::__2float(*srcR); + float src_data = __2float(*srcX); + float srcy_data = __2float(*srcR); *sum += src_data + srcy_data; *sumSq += (src_data + srcy_data) * (src_data + srcy_data); } @@ -166,9 +166,9 @@ inline __device__ void UpdateSum<__half, 2>(const __half* srcX, } template <> -inline __device__ void UpdateSum(const phi::float16* srcX, - float* sum, - float* sumSq) { +inline __device__ void UpdateSum(const float16* srcX, + float* sum, + float* sumSq) { __half2 h2 = *reinterpret_cast<__half2 const*>(srcX); float2 f2 = __half22float2(h2); *sum += f2.x + f2.y; @@ -176,10 +176,10 @@ inline __device__ void UpdateSum(const phi::float16* srcX, } template <> -inline __device__ void UpdateSum(const phi::float16* srcX, - const phi::float16* srcR, - float* sum, - float* sumSq) { +inline __device__ void UpdateSum(const float16* srcX, + const float16* srcR, + float* sum, + float* sumSq) { __half2 h2 = *reinterpret_cast<__half2 const*>(srcX); __half2 h2_r = *reinterpret_cast<__half2 const*>(srcR); float2 f2 = __half22float2(h2); @@ -191,24 +191,24 @@ inline __device__ void UpdateSum(const phi::float16* srcX, #ifdef PADDLE_CUDA_BF16 template <> -inline __device__ void UpdateSum(const phi::bfloat16* srcX, - float* sum, - float* sumSq) { +inline __device__ void UpdateSum(const bfloat16* srcX, + float* sum, + float* sumSq) { __nv_bfloat162 h2 = *reinterpret_cast<__nv_bfloat162 const*>(srcX); - float2 f2 = phi::bfloat1622float2(h2); + float2 f2 = bfloat1622float2(h2); *sum += f2.x + f2.y; *sumSq += f2.x * f2.x + f2.y * f2.y; } template <> -inline __device__ void UpdateSum(const phi::bfloat16* srcX, - const phi::bfloat16* srcR, - float* sum, - float* sumSq) { +inline __device__ void UpdateSum(const bfloat16* srcX, + const bfloat16* srcR, + float* sum, + float* sumSq) { __nv_bfloat162 h2 = *reinterpret_cast<__nv_bfloat162 const*>(srcX); __nv_bfloat162 h2_r = *reinterpret_cast<__nv_bfloat162 const*>(srcR); - float2 f2 = phi::bfloat1622float2(h2); - float2 f2_r = phi::bfloat1622float2(h2_r); + float2 f2 = bfloat1622float2(h2); + float2 f2_r = bfloat1622float2(h2_r); *sum += f2.x + f2_r.x + f2.y + f2_r.y; *sumSq += (f2.x + f2_r.x) * (f2.x + f2_r.x) + (f2.y + f2_r.y) * (f2.y + f2_r.y); @@ -418,22 +418,20 @@ inline __device__ void GroupNormCompute(int64_t dhwBegin, const GroupNormNDHWCParams& params, float mean, float invStdDev) { - float gamma = - phi::__2float(*(reinterpret_cast(params.gamma) + ci)); - float beta = - phi::__2float(*(reinterpret_cast(params.beta) + ci)); + float gamma = __2float(*(reinterpret_cast(params.gamma) + ci)); + float beta = __2float(*(reinterpret_cast(params.beta) + ci)); for (int64_t dhwi = dhwBegin; dhwi < dhwEnd; ++dhwi) { // The src/dst offset. int64_t offset = static_cast(blockIdx.z) * params.dhwc + dhwi * params.c + ci; - float src_data = phi::__2float(params.srcX[offset]); + float src_data = __2float(params.srcX[offset]); if (params.srcR != nullptr) { auto gi = ci / params.cPerGroup; auto gj = ci % params.cPerGroup; int64_t g_offset = params.y_same_with_x ? offset : gi * params.cPerGroup + gj; - src_data += phi::__2float(params.srcR[g_offset]); - *reinterpret_cast(¶ms.eleOut[offset]) = phi::__2dst(src_data); + src_data += __2float(params.srcR[g_offset]); + *reinterpret_cast(¶ms.eleOut[offset]) = __2dst(src_data); } // Normalize the channels. float dst_data = (src_data - mean) * invStdDev; @@ -446,16 +444,16 @@ inline __device__ void GroupNormCompute(int64_t dhwBegin, } // Store the scaled values. - *reinterpret_cast(¶ms.dst[offset]) = phi::__2dst(dst_data); + *reinterpret_cast(¶ms.dst[offset]) = __2dst(dst_data); } } template <> -inline __device__ void GroupNormCompute( +inline __device__ void GroupNormCompute( int64_t dhwBegin, int64_t dhwEnd, int32_t ci, - const GroupNormNDHWCParams& params, + const GroupNormNDHWCParams& params, float mean, float invStdDev) { float2 gammaF2, betaF2; @@ -563,17 +561,17 @@ inline __device__ void GroupNormCompute<__half, 2>( #ifdef PADDLE_CUDA_BF16 template <> -inline __device__ void GroupNormCompute( +inline __device__ void GroupNormCompute( int64_t dhwBegin, int64_t dhwEnd, int32_t ci, - const GroupNormNDHWCParams& params, + const GroupNormNDHWCParams& params, float mean, float invStdDev) { float2 gammaF2, betaF2; - gammaF2 = phi::bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>( + gammaF2 = bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>( reinterpret_cast<__nv_bfloat16 const*>(params.gamma) + ci)); - betaF2 = phi::bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>( + betaF2 = bfloat1622float2(*reinterpret_cast<__nv_bfloat162 const*>( reinterpret_cast<__nv_bfloat16 const*>(params.beta) + ci)); // Iterate over the activations to compute the sums. @@ -587,7 +585,7 @@ inline __device__ void GroupNormCompute( *reinterpret_cast<__nv_bfloat162 const*>(¶ms.srcX[offset]); // Extract the two half values. - float2 f2 = phi::bfloat1622float2(h2); + float2 f2 = bfloat1622float2(h2); if (params.srcR != nullptr) { auto gi = ci / params.cPerGroup; @@ -596,11 +594,11 @@ inline __device__ void GroupNormCompute( params.y_same_with_x ? offset : gi * params.cPerGroup + gj; __nv_bfloat162 r2 = *reinterpret_cast<__nv_bfloat162 const*>(¶ms.srcR[g_offset]); - float2 r_f2 = phi::bfloat1622float2(r2); + float2 r_f2 = bfloat1622float2(r2); f2.x += r_f2.x; f2.y += r_f2.y; *reinterpret_cast<__nv_bfloat162*>(¶ms.eleOut[offset]) = - phi::float22bfloat162_rn(f2); + float22bfloat162_rn(f2); } // Normalize the channels. f2.x = (f2.x - mean) * invStdDev; @@ -617,7 +615,7 @@ inline __device__ void GroupNormCompute( } // Store the scaled values. *reinterpret_cast<__nv_bfloat162*>(¶ms.dst[offset]) = - phi::float22bfloat162_rn(f2); + float22bfloat162_rn(f2); } } #endif @@ -736,7 +734,7 @@ void GroupNormNDHWCKernel(const Context& dev_ctx, if (data_layout != DataLayout::NHWC) { PD_THROW("data_layout only supports NHWC and NDHWC"); } - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; GroupNormNDHWCParams params_; params_.withSilu = activation == "silu" ? true : false; @@ -886,17 +884,17 @@ void GroupNormNDHWCKernel(const Context& dev_ctx, groupNormNDHWCScale ndhwc_scale; ndhwc_scale(params_, stream); #ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync(mean_data, - params_.redBuffer, - params_.n * groups * sizeof(float), - hipMemcpyDeviceToHost, - stream); + backends::gpu::GpuMemcpyAsync(mean_data, + params_.redBuffer, + params_.n * groups * sizeof(float), + hipMemcpyDeviceToHost, + stream); #else - phi::backends::gpu::GpuMemcpyAsync(mean_data, - params_.redBuffer, - params_.n * groups * sizeof(float), - cudaMemcpyDeviceToHost, - stream); + backends::gpu::GpuMemcpyAsync(mean_data, + params_.redBuffer, + params_.n * groups * sizeof(float), + cudaMemcpyDeviceToHost, + stream); #endif } @@ -952,13 +950,10 @@ __device__ __forceinline__ WelfordData WelfordWarpReduce( for (int offset = warpSize / 2; offset > 0; offset >>= 1) { WelfordData other; other.mean = - phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.mean, offset); - other.m2 = - phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.m2, offset); - other.n = - phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.n, offset); - other.nf = - phi::backends::gpu::CudaShuffleDownSync(0xffffffff, val.nf, offset); + backends::gpu::CudaShuffleDownSync(0xffffffff, val.mean, offset); + other.m2 = backends::gpu::CudaShuffleDownSync(0xffffffff, val.m2, offset); + other.n = backends::gpu::CudaShuffleDownSync(0xffffffff, val.n, offset); + other.nf = backends::gpu::CudaShuffleDownSync(0xffffffff, val.nf, offset); val = WelfordCombine(val, other); } return val; @@ -1291,7 +1286,7 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); #endif #ifdef __HIPCC__ - if (blockDim.x < phi::kps::details::kWarpSize) { + if (blockDim.x < kps::details::kWarpSize) { CudaAtomicAdd(&mean[bid * groups + gid], x_mean); CudaAtomicAdd(&var[bid * groups + gid], x_var); } else { @@ -1477,12 +1472,12 @@ void GroupNormDirectCUDAFunctor::operator()( while (block_size_nchw < max_block_size) { block_size_nchw *= 2; } - block_size_nchw = std::max(block_size_nchw, phi::kps::details::kWarpSize); + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); int64_t n_groups = input_ddim[0] * static_cast(groups); dim3 grids(std::min(max_grid_x, n_groups)); dim3 blocks(block_size_nchw); if (size < vec_size * block_size_nchw) { - phi::ScalarGetMeanAndVarNCHW<<>>( + ScalarGetMeanAndVarNCHW<<>>( input, mean, temp_variance, size, n_groups); } else { VectorizedGetMeanAndVarNCHW @@ -1515,7 +1510,7 @@ void GroupNormDirectCUDAFunctor::operator()( cudaMemset(temp_variance, 0, sizeof(AccT) * input_ddim[0] * groups); #endif - phi::GroupNormForwardGetMeanAndVar + GroupNormForwardGetMeanAndVar <<>>(input, input_ddim[0], C, @@ -1559,7 +1554,7 @@ void GroupNormGeneralCaseKernel(const Context& dev_ctx, DenseTensor* y, DenseTensor* mean, DenseTensor* var) { - using AccT = typename phi::dtype::MPTypeTrait::Type; + using AccT = typename dtype::MPTypeTrait::Type; const DataLayout data_layout = StringToDataLayout(data_layout_str); const auto scale_ptr = scale.get_ptr(); const auto bias_ptr = bias.get_ptr(); @@ -1843,42 +1838,42 @@ void GroupNormKernel(const Context& dev_ctx, return; } using std::is_same; - if (is_same::value && data_layout_str == "NHWC") { + if (is_same::value && data_layout_str == "NHWC") { const optional& residual = optional(paddle::none); DenseTensor empty_tensor; - GroupNormNDHWCKernel(dev_ctx, - x, - residual, - scale, - bias, - epsilon, - groups, - data_layout_str, - "", - y, - &empty_tensor, - mean, - var); + GroupNormNDHWCKernel(dev_ctx, + x, + residual, + scale, + bias, + epsilon, + groups, + data_layout_str, + "", + y, + &empty_tensor, + mean, + var); return; } #ifdef PADDLE_CUDA_BF16 - if (is_same::value && data_layout_str == "NHWC") { + if (is_same::value && data_layout_str == "NHWC") { const optional& residual = optional(paddle::none); DenseTensor empty_tensor; - GroupNormNDHWCKernel(dev_ctx, - x, - residual, - scale, - bias, - epsilon, - groups, - data_layout_str, - "", - y, - &empty_tensor, - mean, - var); + GroupNormNDHWCKernel(dev_ctx, + x, + residual, + scale, + bias, + epsilon, + groups, + data_layout_str, + "", + y, + &empty_tensor, + mean, + var); return; } #endif diff --git a/paddle/phi/kernels/gpu/group_norm_utils.h b/paddle/phi/kernels/gpu/group_norm_utils.h index 6fb6d155398ead..c4929a4506f5f5 100644 --- a/paddle/phi/kernels/gpu/group_norm_utils.h +++ b/paddle/phi/kernels/gpu/group_norm_utils.h @@ -49,7 +49,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { } template -__device__ __forceinline__ void ThreadReduce(phi::Array arrs, +__device__ __forceinline__ void ThreadReduce(Array arrs, int64_t size, const int offset, AccT* out_mean, @@ -168,7 +168,7 @@ __global__ void VectorizedGetMeanAndVarNCHW( AccT x_var = static_cast(0); x += i * size; const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); - phi::Array ins; + Array ins; ins[0] = x; ThreadReduce(ins, size, input_offset, &x_mean, &x_var); ReduceMeanAndVar(mean, var, x_mean, x_var, size, i); diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu index 39d15f32cd10d1..eff113fa9e975c 100644 --- a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu @@ -137,7 +137,7 @@ struct GumbleNoiseGenerator { DenseTensor random_tensor; int64_t size = size_to_axis * size_from_axis; random_tensor.Resize({size}); - using MPType = typename phi::dtype::MPTypeTrait::Type; + using MPType = typename dtype::MPTypeTrait::Type; MPType* random_data = dev_ctx.template Alloc(&random_tensor); // generate gumbel noise