未验证 提交 480b284c 编写于 作者: N niuliling123 提交者: GitHub

modified reduce_max, reduce_min, reduce_prod to higher_performance implementation. (#32974)

上级 20eafd79
......@@ -13,46 +13,98 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/device_context.h"
#include <cmath>
#include <limits>
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/macros.h"
#ifdef __HIPCC__
#include <hip/hip_runtime.h>
#endif
namespace paddle {
namespace operators {
template <typename T>
template <typename Tx, typename Ty = Tx>
struct CustomMin {
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
using Transformer = detail::IdentityFunctor<Tx>;
inline Ty initial() {
return static_cast<Ty>(std::numeric_limits<Ty>::max());
}
__device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
return (b < a) ? b : a;
}
};
template <typename T>
template <typename Tx, typename Ty = Tx>
struct CustomMax {
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
using Transformer = detail::IdentityFunctor<Tx>;
inline Ty initial() {
return static_cast<Ty>(std::numeric_limits<Ty>::lowest());
}
__device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
return (b > a) ? b : a;
}
};
template <typename T>
// for cub::Reduce
template <typename Tx, typename Ty = Tx>
struct CustomSum {
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
using Transformer = detail::IdentityFunctor<Tx, Ty>;
inline Ty initial() { return static_cast<Ty>(0.0f); }
__device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
return b + a;
}
};
template <typename T>
template <typename Tx, typename Ty = Tx>
struct CustomMean {
using Transformer = detail::DivideFunctor<Tx>;
inline Ty initial() { return static_cast<Ty>(0.0f); }
__device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
return b + a;
}
};
template <typename Tx, typename Ty = Tx>
struct CustomMul {
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
using Transformer = detail::IdentityFunctor<Tx>;
inline Ty initial() { return static_cast<Ty>(1.0f); }
__device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
return b * a;
}
};
template <typename Tx, typename Ty = Tx>
struct CustomLogicalOr {
using Transformer = detail::IdentityFunctor<Tx>;
inline Ty initial() { return static_cast<Ty>(false); }
__device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
return b || a;
}
};
template <typename Tx, typename Ty = Tx>
struct CustomLogicalAnd {
using Transformer = detail::IdentityFunctor<Tx>;
inline Ty initial() { return static_cast<Ty>(true); }
__device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const {
return b && a;
}
};
} // namespace operators
} // namespace paddle
......@@ -11,15 +11,13 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_max,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MaxFunctor>);
// reduce_max
REGISTER_OP_CUDA_KERNEL(
reduce_max, ops::ReduceCudaKernel<float, paddle::operators::CustomMax>,
ops::ReduceCudaKernel<double, paddle::operators::CustomMax>,
ops::ReduceCudaKernel<int, paddle::operators::CustomMax>,
ops::ReduceCudaKernel<int64_t, paddle::operators::CustomMax>);
......@@ -11,15 +11,13 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_min,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MinFunctor>);
// reduce_min
REGISTER_OP_CUDA_KERNEL(
reduce_min, ops::ReduceCudaKernel<float, paddle::operators::CustomMin>,
ops::ReduceCudaKernel<double, paddle::operators::CustomMin>,
ops::ReduceCudaKernel<int, paddle::operators::CustomMin>,
ops::ReduceCudaKernel<int64_t, paddle::operators::CustomMin>);
......@@ -12,26 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
// reduce_prod
#ifdef __HIPCC__
// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
// do not support double in HIPCC platform (Eigen3 to be fixed)
REGISTER_OP_CUDA_KERNEL(reduce_prod,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::ProdFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_prod, ops::ReduceCudaKernel<float, paddle::operators::CustomMul>,
ops::ReduceCudaKernel<int, paddle::operators::CustomMul>,
ops::ReduceCudaKernel<int64_t, paddle::operators::CustomMul>);
#else
REGISTER_OP_CUDA_KERNEL(reduce_prod,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
double, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::ProdFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_prod, ops::ReduceCudaKernel<float, paddle::operators::CustomMul>,
ops::ReduceCudaKernel<int, paddle::operators::CustomMul>,
ops::ReduceCudaKernel<double, paddle::operators::CustomMul>,
ops::ReduceCudaKernel<int64_t, paddle::operators::CustomMul>);
#endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册