Unverified commit f71241b9, authored by Yiqun Liu, committed by GitHub

Rename partial function name TensorReduceFunctorImpl to TensorReduceImpl. (#39388)

Parent e4d475ea
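The change is a pure rename of the reduce wrapper in the operators namespace; its template parameters, arguments, and behavior are unchanged. For reference, the renamed declaration as it appears in the @@ -39,11 +39,11 @@ hunk below, shown here without the function body:

    template <typename Tx, typename Ty, template <typename> class ReduceOp,
              typename TransformOp>
    void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
                          const framework::Tensor& x, framework::Tensor* y,
                          const TransformOp& transform,
                          const std::vector<int>& origin_reduce_dims,
                          gpuStream_t stream);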
@@ -89,7 +89,7 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
     } else {
       // reduce_sum implementation on CUDA
       auto stream = context.cuda_device_context().stream();
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          context.cuda_device_context(), *input_tensor, output_tensor,
          kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
     }
......
@@ -114,7 +114,7 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
       }
     }
     gpuStream_t stream = ctx.cuda_device_context().stream();
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
        out_reduce_dims, stream);
   }
......
@@ -75,8 +75,8 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
     }
     Tensor tmp = context.AllocateTmpTensor<float, platform::CUDADeviceContext>(
         {1}, dev_ctx);
-    TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
-                            kps::SquareFunctor<platform::float16, float>>(
+    TensorReduceImpl<platform::float16, float, kps::AddFunctor,
+                     kps::SquareFunctor<platform::float16, float>>(
        dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
        reduce_dims, dev_ctx.stream());
     auto tmp_eigen = EigenVector<float>::Flatten(tmp);
......
@@ -63,8 +63,7 @@ class CompareReduceOpKernel
     reduce_dims.resize(tmp.dims().size());
     for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
     auto stream = context.cuda_device_context().stream();
-    TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
-                            kps::IdentityFunctor<bool>>(
+    TensorReduceImpl<bool, bool, BitwiseAdd, kps::IdentityFunctor<bool>>(
        context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
        reduce_dims, stream);
   }
......
@@ -1188,7 +1188,7 @@ template <typename T>
 void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
                    framework::Tensor *src, framework::Tensor *dst) {
   std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
-  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+  TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
      dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
      dev_ctx.stream());
 }
......
@@ -165,7 +165,7 @@ class AttnMatMul {
                          (input_dims[2] == output_dims[0]));
     if (support_case_1 || support_case_2) {
       gpuStream_t stream = dev_ctx_.stream();
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
          stream);
     } else {
......
@@ -305,11 +305,11 @@ struct KronGradOpFunctor {
 #if defined(__NVCC__) || defined(__HIPCC__)
     auto stream = dev_ctx.stream();  // it is a cuda device_context
     if (dx) {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
     }
     if (dy) {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
     }
 #else
......
@@ -298,7 +298,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
     logits_max =
         ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
     T* logits_max_buff = logits_max.mutable_data<T>(place);
-    TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
        dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
        dev_ctx.stream());
@@ -320,7 +320,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
     sum_exp_logits =
         ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
     T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
        dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
        dev_ctx.stream());
......
@@ -65,7 +65,7 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
     for (decltype(rank) i = 0; i < rank; ++i) {
       reduce_dims.push_back(i);
     }
-    TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>(
+    TensorReduceImpl<T, T, kernel_primitives::AddFunctor, Div>(
        context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
        stream);
   }
......
@@ -105,19 +105,19 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
     using MT = typename details::MPTypeTrait<T>::Type;
     if (porder == 0) {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
          ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
          reduce_axis, stream);
     } else if (porder == INFINITY) {
-      TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
+      TensorReduceImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
          ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
          reduce_axis, stream);
     } else if (porder == -INFINITY) {
-      TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
+      TensorReduceImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
          ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
          reduce_axis, stream);
     } else {
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
          ctx.cuda_device_context(), *in_x, out_norm,
          UnsignedPowFunctor<T>(porder), reduce_axis, stream);
......
@@ -206,8 +206,7 @@ class PoolKernel : public framework::OpKernel<T> {
                  adaptive) {  // for adaptive_avg_pool2d && output_size == 1
 #if defined(__HIPCC__) || defined(__NVCC__)
         auto stream = dev_ctx.stream();
-        TensorReduceFunctorImpl<T, T, kps::AddFunctor,
-                                kps::DivideFunctor<T>>(
+        TensorReduceImpl<T, T, kps::AddFunctor, kps::DivideFunctor<T>>(
            dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
            reduce_dim, stream);
 #else  // for cpu
......
@@ -185,7 +185,7 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
       reduce_dims.push_back(i);
     }
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        context.cuda_device_context(), dalpha_tmp, dalpha,
        kps::IdentityFunctor<T>(), reduce_dims, stream);
   }
......
@@ -39,11 +39,11 @@ namespace operators {
 template <typename Tx, typename Ty, template <typename> class ReduceOp,
           typename TransformOp>
-void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
-                             const framework::Tensor& x, framework::Tensor* y,
-                             const TransformOp& transform,
-                             const std::vector<int>& origin_reduce_dims,
-                             gpuStream_t stream) {
+void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
+                      const framework::Tensor& x, framework::Tensor* y,
+                      const TransformOp& transform,
+                      const std::vector<int>& origin_reduce_dims,
+                      gpuStream_t stream) {
   y->mutable_data<Ty>(x.place());
   pten::kernels::TensorReduceFunctorImpl<Tx, Ty, ReduceOp, TransformOp>(
......
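At every call site only the function name changes; the template arguments and runtime arguments stay the same. A minimal before/after sketch using the variables from the ReduceWrapper hunk above (a sum reduction with an identity transform):

    // Before the rename:
    // TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
    //     dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());

    // After the rename (identical template and runtime arguments):
    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims, dev_ctx.stream());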
@@ -155,7 +155,7 @@ class CUDARenormKernel : public framework::OpKernel<T> {
         ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
         cuda_ctx, ins, &outs, func);
     std::vector<int> reduce_axis = {0, 2};
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
        stream);
     RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
@@ -213,10 +213,10 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
         mul_value.mutable_data<T>(ctx.GetPlace()), numel, dimension_each, p,
         dim_divisor);
     std::vector<int> reduce_axis = {0, 2};
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx.cuda_device_context(), pow_value, &dim_value,
        kps::IdentityFunctor<T>(), reduce_axis, stream);
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx.cuda_device_context(), mul_value, &weight_derivative,
        kps::IdentityFunctor<T>(), reduce_axis, stream);
     RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
......
@@ -45,7 +45,7 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
                        const paddle::framework::ExecutionContext& ctx) {
 #if defined(__NVCC__) || defined(__HIPCC__)
   auto stream = ctx.cuda_device_context().stream();
-  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+  TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
      ctx.cuda_device_context(), *input, output, kps::IdentityFunctor<T>(),
      reduce_dims, stream);
 #else
......
@@ -39,7 +39,7 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
       auto stream = context.cuda_device_context().stream();
       std::vector<int> reduce_dims;
       reduce_dims.push_back(out->dims().size());
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
          context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
          reduce_dims, stream);
     } else {
......
@@ -44,7 +44,7 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
       }
     }
     gpuStream_t stream = ctx.cuda_device_context().stream();
-    TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+    TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
        out_reduce_dims, stream);
   }
......