Unverified commit f71241b9, authored by Yiqun Liu, committed by GitHub

Partially rename the function TensorReduceFunctorImpl to TensorReduceImpl. (#39388)

Parent e4d475ea
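
For quick reference, below is a minimal sketch of the helper's declaration after the rename, assembled from the hunk further down that contains the declaration itself (@@ -39,7 +39,7 @@ namespace operators {). The trailing stream parameter is truncated in that hunk, so its name and type here are an assumption inferred from the call sites.

    // Sketch only: the renamed reduce helper as declared in the hunk below.
    // The last parameter is not visible in the truncated hunk and is assumed
    // from the call sites, which all pass a CUDA stream as the final argument.
    template <typename Tx, typename Ty, template <typename> class ReduceOp,
              typename TransformOp>
    void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
                          const framework::Tensor& x, framework::Tensor* y,
                          const TransformOp& transform,
                          const std::vector<int>& origin_reduce_dims,
                          gpuStream_t stream);

Call sites keep the same argument list (device context, input tensor, output tensor, transform functor, reduce dims, stream); only the function name changes, as the hunks below show.
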
@@ -89,7 +89,7 @@ class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
} else {
// reduce_sum implementation on CUDA
auto stream = context.cuda_device_context().stream();
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
context.cuda_device_context(), *input_tensor, output_tensor,
kps::IdentityFunctor<T>(), reduce_dims_vec, stream);
}
......
@@ -114,7 +114,7 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
}
gpuStream_t stream = ctx.cuda_device_context().stream();
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
}
......
@@ -75,7 +75,7 @@ class ClipByNormKernel<platform::CUDADeviceContext, platform::float16>
}
Tensor tmp = context.AllocateTmpTensor<float, platform::CUDADeviceContext>(
{1}, dev_ctx);
-TensorReduceFunctorImpl<platform::float16, float, kps::AddFunctor,
+TensorReduceImpl<platform::float16, float, kps::AddFunctor,
kps::SquareFunctor<platform::float16, float>>(
dev_ctx, *input, &tmp, kps::SquareFunctor<platform::float16, float>(),
reduce_dims, dev_ctx.stream());
......
@@ -63,8 +63,7 @@ class CompareReduceOpKernel
reduce_dims.resize(tmp.dims().size());
for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
auto stream = context.cuda_device_context().stream();
-TensorReduceFunctorImpl<bool, bool, BitwiseAdd,
-kps::IdentityFunctor<bool>>(
+TensorReduceImpl<bool, bool, BitwiseAdd, kps::IdentityFunctor<bool>>(
context.cuda_device_context(), tmp, z, kps::IdentityFunctor<bool>(),
reduce_dims, stream);
}
......
@@ -1188,7 +1188,7 @@ template <typename T>
void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
framework::Tensor *src, framework::Tensor *dst) {
std::vector<int> reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis);
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dev_ctx, *src, dst, kps::IdentityFunctor<T>(), reduce_dims,
dev_ctx.stream());
}
......
@@ -165,7 +165,7 @@ class AttnMatMul {
(input_dims[2] == output_dims[0]));
if (support_case_1 || support_case_2) {
gpuStream_t stream = dev_ctx_.stream();
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dev_ctx_, *d_output, d_bias, kps::IdentityFunctor<T>(), {0, 1},
stream);
} else {
......
@@ -305,11 +305,11 @@ struct KronGradOpFunctor {
#if defined(__NVCC__) || defined(__HIPCC__)
auto stream = dev_ctx.stream(); // it is a cuda device_context
if (dx) {
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dev_ctx, dout_x, dx, kps::IdentityFunctor<T>(), {1}, stream);
}
if (dy) {
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
dev_ctx, dout_y, dy, kps::IdentityFunctor<T>(), {1}, stream);
}
#else
......
@@ -298,7 +298,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
logits_max =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* logits_max_buff = logits_max.mutable_data<T>(place);
-TensorReduceFunctorImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::MaxFunctor, kps::IdentityFunctor<T>>(
dev_ctx, softmax_2d, &logits_max, kps::IdentityFunctor<T>(), {1},
dev_ctx.stream());
@@ -320,7 +320,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
sum_exp_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::ExpFunctor<T>>(
dev_ctx, softmax_2d, &sum_exp_logits, kps::ExpFunctor<T>(), {1},
dev_ctx.stream());
......
@@ -65,7 +65,7 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
for (decltype(rank) i = 0; i < rank; ++i) {
reduce_dims.push_back(i);
}
-TensorReduceFunctorImpl<T, T, kernel_primitives::AddFunctor, Div>(
+TensorReduceImpl<T, T, kernel_primitives::AddFunctor, Div>(
context.cuda_device_context(), *input, output, Div(numel), reduce_dims,
stream);
}
......
@@ -105,19 +105,19 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
using MT = typename details::MPTypeTrait<T>::Type;
if (porder == 0) {
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
ctx.cuda_device_context(), *in_x, out_norm, NonzeroFunctor<T>(),
reduce_axis, stream);
} else if (porder == INFINITY) {
-TensorReduceFunctorImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
+TensorReduceImpl<T, T, kps::MaxFunctor, AbsFunctor<T>>(
ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else if (porder == -INFINITY) {
-TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
+TensorReduceImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
ctx.cuda_device_context(), *in_x, out_norm, AbsFunctor<T>(),
reduce_axis, stream);
} else {
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
ctx.cuda_device_context(), *in_x, out_norm,
UnsignedPowFunctor<T>(porder), reduce_axis, stream);
......
@@ -206,8 +206,7 @@ class PoolKernel : public framework::OpKernel<T> {
adaptive) { // for adaptive_avg_pool2d && output_size == 1
#if defined(__HIPCC__) || defined(__NVCC__)
auto stream = dev_ctx.stream();
-TensorReduceFunctorImpl<T, T, kps::AddFunctor,
-kps::DivideFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::DivideFunctor<T>>(
dev_ctx, *in_x, out, kps::DivideFunctor<T>(reduce_num),
reduce_dim, stream);
#else // for cpu
......
@@ -185,7 +185,7 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
reduce_dims.push_back(i);
}
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
context.cuda_device_context(), dalpha_tmp, dalpha,
kps::IdentityFunctor<T>(), reduce_dims, stream);
}
......
@@ -39,7 +39,7 @@ namespace operators {
template <typename Tx, typename Ty, template <typename> class ReduceOp,
typename TransformOp>
-void TensorReduceFunctorImpl(const platform::CUDADeviceContext& dev_ctx,
+void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor& x, framework::Tensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims,
......
@@ -155,7 +155,7 @@ class CUDARenormKernel : public framework::OpKernel<T> {
ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
cuda_ctx, ins, &outs, func);
std::vector<int> reduce_axis = {0, 2};
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
cuda_ctx, pow_value, &dim_value, kps::IdentityFunctor<T>(), reduce_axis,
stream);
RenormKernelFunc3<T><<<grid2, block2, 0, stream>>>(
@@ -213,10 +213,10 @@ class CUDAGradRenormKernel : public framework::OpKernel<T> {
mul_value.mutable_data<T>(ctx.GetPlace()), numel, dimension_each, p,
dim_divisor);
std::vector<int> reduce_axis = {0, 2};
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx.cuda_device_context(), pow_value, &dim_value,
kps::IdentityFunctor<T>(), reduce_axis, stream);
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx.cuda_device_context(), mul_value, &weight_derivative,
kps::IdentityFunctor<T>(), reduce_axis, stream);
RenormGradKernelFunc2<T><<<grid, block, 0, stream>>>(
......
@@ -45,7 +45,7 @@ void ReduceSumForSolve(const Tensor* input, Tensor* output,
const paddle::framework::ExecutionContext& ctx) {
#if defined(__NVCC__) || defined(__HIPCC__)
auto stream = ctx.cuda_device_context().stream();
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx.cuda_device_context(), *input, output, kps::IdentityFunctor<T>(),
reduce_dims, stream);
#else
......
@@ -39,7 +39,7 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
auto stream = context.cuda_device_context().stream();
std::vector<int> reduce_dims;
reduce_dims.push_back(out->dims().size());
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
context.cuda_device_context(), diag, out, kps::IdentityFunctor<T>(),
reduce_dims, stream);
} else {
......
@@ -44,7 +44,7 @@ class MatrixReduceSumFunctor<platform::CUDADeviceContext, T> {
}
}
gpuStream_t stream = ctx.cuda_device_context().stream();
-TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
ctx.cuda_device_context(), in, out, kps::IdentityFunctor<T>(),
out_reduce_dims, stream);
}
......