// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include "paddle/fluid/operators/reduce_ops/cub_reduce.h" #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" namespace paddle { namespace operators { template struct DivideFunctor { HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } private: T n_inv; }; template class ReduceMeanKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool reduce_all = context.Attr("reduce_all"); auto* input = context.Input("X"); auto* output = context.Output("Out"); auto dims = context.Attr>("dim"); bool keep_dim = context.Attr("keep_dim"); std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); } } int reduce_num = 1; for (int i = 0; i < reduce_dims.size(); ++i) { reduce_num *= input->dims()[reduce_dims[i]]; } auto stream = context.cuda_device_context().stream(); #ifdef PADDLE_WITH_HIP TensorReduce>( *input, output, reduce_dims, static_cast(0), hipcub::Sum(), DivideFunctor(reduce_num), stream); #else TensorReduce>( *input, output, reduce_dims, static_cast(0), cub::Sum(), DivideFunctor(reduce_num), stream); #endif } }; } // namespace operators } // namespace paddle REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, ops::ReduceMeanKernel);