Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 10 additions & 12 deletions paddle/phi/kernels/gpu/abs_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,25 +28,23 @@ template <typename T, typename Enable = void>
struct CudaAbsFunctor;

template <typename T>
struct CudaAbsFunctor<T, funcs::Complex<T, phi::dtype::Real<T>>> {
__device__ __forceinline__ phi::dtype::Real<T> operator()(const T x) const {
struct CudaAbsFunctor<T, funcs::Complex<T, dtype::Real<T>>> {
__device__ __forceinline__ dtype::Real<T> operator()(const T x) const {
return abs(x);
}
};

template <typename T>
struct CudaAbsFunctor<
T,
std::enable_if_t<std::is_same<T, phi::dtype::Real<T>>::value &&
std::is_same<T, phi::bfloat16>::value>> {
struct CudaAbsFunctor<T,
std::enable_if_t<std::is_same<T, dtype::Real<T>>::value &&
std::is_same<T, bfloat16>::value>> {
__device__ __forceinline__ T operator()(const T x) const { return abs(x); }
};

template <typename T>
struct CudaAbsFunctor<
T,
std::enable_if_t<std::is_same<T, phi::dtype::Real<T>>::value &&
!std::is_same<T, phi::bfloat16>::value>> {
struct CudaAbsFunctor<T,
std::enable_if_t<std::is_same<T, dtype::Real<T>>::value &&
!std::is_same<T, bfloat16>::value>> {
__device__ __forceinline__ T operator()(const T x) const {
return std::abs(x);
}
Expand All @@ -56,12 +54,12 @@ template <typename T, typename Context>
PADDLE_API void AbsKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
dev_ctx.template Alloc<phi::dtype::Real<T>>(out);
dev_ctx.template Alloc<dtype::Real<T>>(out);
std::vector<const DenseTensor*> ins = {&x};
std::vector<DenseTensor*> outs = {out};
auto functor = CudaAbsFunctor<T>();

funcs::ElementwiseKernel<phi::dtype::Real<T>>(dev_ctx, ins, &outs, functor);
funcs::ElementwiseKernel<dtype::Real<T>>(dev_ctx, ins, &outs, functor);
}

} // namespace phi
Expand Down
5 changes: 2 additions & 3 deletions paddle/phi/kernels/gpu/accuracy_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include "paddle/phi/core/kernel_registry.h"

namespace phi {
using phi::PADDLE_CUDA_NUM_THREADS;

template <int BlockSize, typename T>
__global__ void AccuracyCudaKernel(const int64_t N,
Expand All @@ -34,7 +33,7 @@ __global__ void AccuracyCudaKernel(const int64_t N,
int* correct_data,
T* accuracy,
int* total_data) {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
using MT = typename dtype::MPTypeTrait<T>::Type;
int count = 0;
__shared__ int total[BlockSize];

Expand Down Expand Up @@ -98,7 +97,7 @@ void AccuracyKernel(const Context& dev_ctx,
int64_t num_samples = inference.dims()[0];
size_t infer_width = inference.dims()[1];
auto stream = dev_ctx.stream();
phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream);
backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream);

PADDLE_ENFORCE_GT(label.dims().size(),
0,
Expand Down
3 changes: 1 addition & 2 deletions paddle/phi/kernels/gpu/activation_grad_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -371,8 +371,7 @@ void PowGradKernel(const Context& dev_ctx,
DenseTensor* dx) {
if (factor.to<double>() == 0) {
std::vector<int64_t> vec_dims = vectorize(dx->dims());
phi::Full<T, Context>(
dev_ctx, phi::IntArray(vec_dims), static_cast<T>(0), dx);
Full<T, Context>(dev_ctx, IntArray(vec_dims), static_cast<T>(0), dx);
return;
}
if (factor.to<double>() == 1) {
Expand Down
3 changes: 1 addition & 2 deletions paddle/phi/kernels/gpu/activation_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,7 @@ void PowKernel(const Context& dev_ctx,
}
if (factor.to<double>() == 0) {
std::vector<int64_t> vec_dims = vectorize(out->dims());
phi::Full<T, Context>(
dev_ctx, phi::IntArray(vec_dims), static_cast<T>(1), out);
Full<T, Context>(dev_ctx, IntArray(vec_dims), static_cast<T>(1), out);
return;
}
if (factor.to<double>() == 1) {
Expand Down
8 changes: 4 additions & 4 deletions paddle/phi/kernels/gpu/adagrad_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ struct DenseAdagradFunctor<GPUContext, T> {
DenseTensor* param_out_tensor,
DenseTensor* moment_out_tensor,
DenseTensor* master_param_outs) {
using MT = typename phi::dtype::template MPTypeTrait<T>::Type;
using MT = typename dtype::template MPTypeTrait<T>::Type;
T* param_out_data = dev_ctx.template Alloc<T>(param_out_tensor);
MT* moment_out_data = dev_ctx.template Alloc<MT>(moment_out_tensor);
const MT* master_in_data =
Expand All @@ -82,7 +82,7 @@ struct DenseAdagradFunctor<GPUContext, T> {
MT epsilon = static_cast<MT>(epsilon_t);

int64_t numel = param_t.numel();
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
int grid = config.block_per_grid.x;
int block = config.thread_per_block.x;
auto stream = dev_ctx.stream();
Expand Down Expand Up @@ -180,7 +180,7 @@ struct SparseAdagradFunctor<GPUContext, T> {
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid2(1, merge_rows.size());
phi::MixVector<int64_t> mixv_merge_rows(&merge_rows);
MixVector<int64_t> mixv_merge_rows(&merge_rows);
SparseAdagradFunctorKernel<T, 256>
<<<grid2,
threads,
Expand All @@ -201,7 +201,7 @@ template struct SparseAdagradFunctor<GPUContext, float>;
template struct SparseAdagradFunctor<GPUContext, double>;
template struct DenseAdagradFunctor<GPUContext, float>;
template struct DenseAdagradFunctor<GPUContext, double>;
template struct DenseAdagradFunctor<GPUContext, phi::float16>;
template struct DenseAdagradFunctor<GPUContext, float16>;

} // namespace phi

Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/kernels/gpu/adam_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ PADDLE_API void AdamDenseKernel(const Context& dev_ctx,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs) {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
using MT = typename dtype::MPTypeTrait<T>::Type;
const auto grad_type = grad.dtype();

VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
Expand Down Expand Up @@ -394,7 +394,7 @@ void MergedAdamKernel(
std::vector<DenseTensor*> beta1_pow_out,
std::vector<DenseTensor*> beta2_pow_out,
std::vector<DenseTensor*> master_param_out) {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
using MT = typename dtype::MPTypeTrait<T>::Type;
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
MT beta1_ = beta1.to<MT>();
MT beta2_ = beta2.to<MT>();
Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/kernels/gpu/adamax_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ void AdamaxKernel(const Context& dev_ctx,
DenseTensor* moment_out,
DenseTensor* inf_norm_out,
DenseTensor* master_param_outs) {
using MT = typename phi::dtype::template MPTypeTrait<T>::Type;
using MT = typename dtype::template MPTypeTrait<T>::Type;
T* param_out_data = dev_ctx.template Alloc<T>(param_out);
MT* moment_out_data = dev_ctx.template Alloc<MT>(moment_out);
MT* inf_norm_out_data = dev_ctx.template Alloc<MT>(inf_norm_out);
Expand All @@ -104,7 +104,7 @@ void AdamaxKernel(const Context& dev_ctx,
MT epsilon_ = static_cast<MT>(epsilon);

int64_t numel = param.numel();
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, 1);
int grid = config.block_per_grid.x;
int block = config.thread_per_block.x;
auto stream = dev_ctx.stream();
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/adamw_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ PADDLE_API void AdamwDenseKernel(const Context& dev_ctx,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs) {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
using MT = typename dtype::MPTypeTrait<T>::Type;
MT coeff_ = static_cast<MT>(coeff);
MT lr_ratio_ = static_cast<MT>(lr_ratio);

Expand Down
12 changes: 6 additions & 6 deletions paddle/phi/kernels/gpu/add_n_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace phi {
template <class T>
__global__ void SumArrayCUDAKernel(
T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
using MPType = typename dtype::MPTypeTrait<T>::Type;
CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
MPType total(read_dst ? static_cast<MPType>(out[idx])
: static_cast<MPType>(0));
Expand All @@ -46,7 +46,7 @@ __global__ void SumArrayMixedTypeCUDAKernel(const T *in_0,
int64_t N,
size_t in_others_size,
bool read_dst) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
using MPType = typename dtype::MPTypeTrait<T>::Type;
CUDA_KERNEL_LOOP_TYPE(idx, N, int64_t) {
MPType total(read_dst ? static_cast<MPType>(out[idx])
: static_cast<MPType>(0));
Expand Down Expand Up @@ -128,7 +128,7 @@ void AddNKernel(const Context &dev_ctx,
int64_t length_0 = in_0.numel();
int64_t length_1 = in_1.numel();
if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
using MPType = typename dtype::MPTypeTrait<T>::Type;
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
auto in_0_e = EigenVector<T>::Flatten(in_0).template cast<MPType>();
Expand Down Expand Up @@ -281,7 +281,7 @@ void AddNKernel(const Context &dev_ctx,
}
}
if (!sr_in_out_data.empty()) {
auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
auto tmp_sr_in_out_array = memory_utils::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));

size_t nbytes_sr = sr_in_out_data.size() * sizeof(T *);
Expand All @@ -306,8 +306,8 @@ void AddNKernel(const Context &dev_ctx,
}
// if indata not null, merge into one kernel call.
if (!in_data.empty()) {
auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
in_data.size() * sizeof(T *));
auto tmp_in_array =
memory_utils::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *));

size_t nbytes_in2 = in_data.size() * sizeof(T *);
const void *stable_in2 =
Expand Down
5 changes: 2 additions & 3 deletions paddle/phi/kernels/gpu/all_to_all_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,9 @@ void AllToAllKernel(const Context& dev_ctx,
const auto* send_buf = x.data<T>();
auto* recv_buf = out->data<T>();
for (auto i = 0; i < nranks; ++i) {
auto send_buf = phi::distributed::GetPartialTensor(x, offset, send_numel);
auto send_buf = distributed::GetPartialTensor(x, offset, send_numel);
comm_ctx->Send(send_buf, send_numel, i, stream);
auto recv_buf =
phi::distributed::GetPartialTensor(*out, offset, send_numel);
auto recv_buf = distributed::GetPartialTensor(*out, offset, send_numel);
comm_ctx->Recv(&recv_buf, send_numel, i, stream);
offset += send_numel;
}
Expand Down
5 changes: 2 additions & 3 deletions paddle/phi/kernels/gpu/allclose_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ __global__ void AllcloseCUDAKernel(const T* in_data,
bool* out_data) {
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
bool val;
using BaseMPType = typename phi::dtype::MPTypeTrait<T>::Type;
using BaseMPType = typename dtype::MPTypeTrait<T>::Type;

using MPType =
typename std::conditional<std::is_same<T, int32_t>::value ||
Expand Down Expand Up @@ -98,8 +98,7 @@ void AllCloseKernel(const Context& dev_ctx,

int64_t num = x.numel();
const int vec_size = 4;
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, num, vec_size);
auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, num, vec_size);
uint32_t grid = config.block_per_grid.x;
uint32_t block = config.thread_per_block.x;

Expand Down
35 changes: 17 additions & 18 deletions paddle/phi/kernels/gpu/amp_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,13 @@ class LazyZeros<GPUContext, T> {
const auto& cpu_place = CPUPlace();
// alloc each tensor's start index and copy to device
auto h_in_starts_mem =
phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
int64_t* h_starts = reinterpret_cast<int64_t*>(h_in_starts_mem->ptr());

auto d_in_starts_mem = phi::memory_utils::Alloc(
auto d_in_starts_mem = memory_utils::Alloc(
dev_ctx.GetPlace(),
(xs_size + 1) * sizeof(int64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
int64_t* d_starts = reinterpret_cast<int64_t*>(d_in_starts_mem->ptr());

// the start index value of each tensor is
Expand All @@ -186,14 +186,13 @@ class LazyZeros<GPUContext, T> {
dev_ctx.stream());

// copy each tensor of "outs" data address array to device
auto h_out_addrs_mem =
phi::memory_utils::Alloc(cpu_place, xs_size * sizeof(T*));
auto h_out_addrs_mem = memory_utils::Alloc(cpu_place, xs_size * sizeof(T*));
T** h_out_addrs = reinterpret_cast<T**>(h_out_addrs_mem->ptr());

auto d_out_addrs_mem = phi::memory_utils::Alloc(
auto d_out_addrs_mem = memory_utils::Alloc(
dev_ctx.GetPlace(),
xs_size * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
T** d_out_addrs = reinterpret_cast<T**>(d_out_addrs_mem->ptr());

for (size_t i = 0; i < xs_size; ++i) {
Expand Down Expand Up @@ -277,7 +276,7 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
const DenseTensor& scale,
std::vector<DenseTensor*> outs,
DenseTensor* found_infinite) {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
using MT = typename dtype::MPTypeTrait<T>::Type;

const MT* scale_data = scale.data<MT>();
bool* found_inf_data = dev_ctx.template Alloc<bool>(found_infinite);
Expand All @@ -294,13 +293,13 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
const auto& cpu_place = CPUPlace();
// calculate each tensor's start index and copy to device
auto h_starts_tensor =
phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
int64_t* h_starts = reinterpret_cast<int64_t*>(h_starts_tensor->ptr());

auto d_starts_tensor = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
(xs_size + 1) * sizeof(int64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
auto d_starts_tensor =
memory_utils::Alloc(dev_ctx.GetPlace(),
(xs_size + 1) * sizeof(int64_t),
Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
int64_t* d_starts = reinterpret_cast<int64_t*>(d_starts_tensor->ptr());

// the start index value of each tensor is
Expand All @@ -322,14 +321,14 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
dev_ctx.stream());

// copy each tensor's data address to device
auto h_mem = phi::memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
auto h_mem = memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
const T** h_xs = reinterpret_cast<const T**>(h_mem->ptr());
T** h_outs = reinterpret_cast<T**>(h_mem->ptr()) + xs_size;

auto d_mem = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
2 * xs_size * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
auto d_mem =
memory_utils::Alloc(dev_ctx.GetPlace(),
2 * xs_size * sizeof(T*),
Stream(reinterpret_cast<StreamId>(dev_ctx.stream())));
const T** d_xs = reinterpret_cast<const T**>(d_mem->ptr());
T** d_outs = reinterpret_cast<T**>(d_mem->ptr()) + xs_size;

Expand Down
4 changes: 2 additions & 2 deletions paddle/phi/kernels/gpu/arange_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ void ArangeTensorKernel(const Context& dev_ctx,
const DenseTensor& end,
const DenseTensor& step,
DenseTensor* out) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
using MPType = typename dtype::MPTypeTrait<T>::Type;
MPType start_value =
static_cast<MPType>(GetValue<T, Context>(dev_ctx, start));
MPType end_value = static_cast<MPType>(GetValue<T, Context>(dev_ctx, end));
Expand All @@ -64,7 +64,7 @@ void ArangeNullaryKernel(const Context& dev_ctx,
const T end_value,
const T step_value,
DenseTensor* out) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
using MPType = typename dtype::MPTypeTrait<T>::Type;
MPType start_value_mpt = static_cast<MPType>(start_value);
MPType end_value_mpt = static_cast<MPType>(end_value);
MPType step_value_mpt = static_cast<MPType>(step_value);
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/arg_min_max_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ void ArgMinMaxOpCUDAKernel(const Context& dev_ctx,
dev_ctx, x, axis.to<int64_t>(), keepdims, flatten, out));
return;
}
phi::VisitDataTypeTiny(
VisitDataTypeTiny(
dtype,
VisitDataCudaArgMinMaxFunctor<Context, T, Reducer>(
dev_ctx, x, axis.to<int64_t>(), keepdims, flatten, out));
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/argsort_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ void ArgsortKernel(const Context& dev_ctx,
PerSort<T, int64_t>(
dev_ctx, out_data, ids_data, start, end, stable, descending);
if (start != 0) {
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, end);
auto config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, end);
merge_kernel<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/asgd_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ void ASGDKernel(const Context& dev_ctx,
DenseTensor* d_out,
DenseTensor* y_out,
DenseTensor* master_param_out) {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
using MT = typename dtype::MPTypeTrait<T>::Type;
const MT* master_in_data =
multi_precision ? master_param->data<MT>() : nullptr;
MT* master_out_data =
Expand Down
2 changes: 0 additions & 2 deletions paddle/phi/kernels/gpu/auc_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@

namespace phi {

using phi::PADDLE_CUDA_NUM_THREADS;

__global__ void ClearObsoleteDataKernel(int64_t *pos,
int64_t *neg,
const int bucket_length,
Expand Down
Loading
Loading