Unverified commit 7c020c71, authored by YuanRisheng, committed by GitHub

[Pten] Move CPU implementation of elementwise kernel to new directory (#38651)

* change 'math' to 'math_kernel'

* fix compile bugs

* merge develop

* fix compile bugs

* move cpu_impl of elementwise kernel to new directory
Parent 6e9714a2
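The net effect visible throughout the hunks below is that the shared elementwise helpers move from paddle/pten/kernels/hybird/general/ (namespace pten::general) to paddle/pten/kernels/funcs/ (namespace pten::funcs), and the fluid-side functors become aliases of the pten ones. The following is a minimal sketch of a call site after the move; the function name BroadcastDimsExample and the shapes it receives are illustrative only, while the include path and the GetBroadcastDimsArrays signature come from this diff.

#include <algorithm>
#include <cstdlib>
#include <vector>

#include "paddle/fluid/framework/ddim.h"
#include "paddle/pten/kernels/funcs/elementwise_base.h"  // was hybird/general/elementwise_base.h

void BroadcastDimsExample(const paddle::framework::DDim &x_dims,
                          const paddle::framework::DDim &y_dims) {
  const int max_dim = std::max(x_dims.size(), y_dims.size());
  // Align trailing dimensions, as the kernels do when axis == -1.
  const int axis = std::abs(x_dims.size() - y_dims.size());
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  // Same helper as before the move, only the namespace changes:
  // pten::general::GetBroadcastDimsArrays -> pten::funcs::GetBroadcastDimsArrays
  pten::funcs::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
                                      y_dims_array.data(),
                                      out_dims_array.data(), max_dim, axis);
}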
......@@ -23,6 +23,9 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/pten/include/core.h"
namespace paddle {
namespace framework {
......@@ -73,9 +76,12 @@ class TestKernel : public OpKernel<float> {
output->Resize(input->dims());
output->mutable_data<T>(ctx.GetPlace());
operators::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
input, input, output, ctx.template device_context<DeviceContext>(),
AddFunctor<T>());
auto pt_input = paddle::experimental::MakePtenDenseTensor(*input);
auto pt_out = paddle::experimental::MakePtenDenseTensor(*output);
pten::funcs::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
*pt_input, *pt_input, pt_out.get(),
ctx.template device_context<DeviceContext>(), AddFunctor<T>());
functor.Run();
}
};
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/pten/kernels/funcs/elementwise_functor.h"
namespace paddle {
namespace operators {
......@@ -25,58 +26,31 @@ namespace operators {
// Add
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; }
};
using AddFunctor = pten::funcs::AddFunctor<T>;
template <typename T>
struct InverseAddFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; }
};
using InverseAddFunctor = pten::funcs::InverseAddFunctor<T>;
// Subtract
template <typename T>
struct SubFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; }
};
using SubFunctor = pten::funcs::SubtractFunctor<T>;
template <typename T>
struct InverseSubFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; }
};
using InverseSubFunctor = pten::funcs::InverseSubtractFunctor<T>;
// Multiply
template <typename T>
struct MulFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
};
using MulFunctor = pten::funcs::MultiplyFunctor<T>;
template <typename T>
struct InverseMulFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; }
};
using InverseMulFunctor = pten::funcs::InverseMultiplyFunctor<T>;
// Divide
#define DIV_ERROR_INFO \
"InvalidArgumentError: Integer division by zero encountered in " \
"(floor) divide. Please check the input value."
template <typename T, typename Enable = void>
struct DivFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; }
};
template <typename T>
struct DivFunctor<T,
typename std::enable_if<std::is_integral<T>::value>::type> {
inline HOSTDEVICE T operator()(const T& a, const T& b) const {
// For int32/int64, we need to check whether the divisor is zero.
PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO);
return a / b;
}
};
using DivFunctor = pten::funcs::DivideFunctor<T>;
template <typename T, typename Enable = void>
struct InverseDivFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; }
};
template <typename T>
using InverseDivFunctor = pten::funcs::InverseDivideFunctor<T>;
// Floor Divide
template <typename T>
......
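Worth noting for the Divide case above: the deleted fluid-side DivFunctor carried a specialization that rejects integer division by zero, and the pten::funcs::DivideFunctor it now aliases keeps the same guard (the DIV_ERROR_INFO block reappears in funcs/elementwise_functor.h further down in this diff). A minimal reconstruction of that pattern follows; the namespace name `example` is purely illustrative.

#include <type_traits>

#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h"

namespace example {

// Generic case: plain division, no runtime check.
template <typename T, typename Enable = void>
struct DivideFunctor {
  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; }
};

// Integral types: a zero divisor would be undefined behavior, so surface it
// as an InvalidArgument error instead.
template <typename T>
struct DivideFunctor<
    T, typename std::enable_if<std::is_integral<T>::value>::type> {
  inline HOSTDEVICE T operator()(const T& a, const T& b) const {
    PADDLE_ENFORCE(b != 0,
                   "InvalidArgumentError: Integer division by zero encountered "
                   "in (floor) divide. Please check the input value.");
    return a / b;
  }
};

}  // namespace example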
......@@ -31,8 +31,7 @@ limitations under the License. */
// only can include the headers in paddle/pten/include dirs
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/kernels/hybird/cpu/elementwise.h"
#include "paddle/pten/kernels/hybird/general/elementwise_base.h"
#include "paddle/pten/kernels/cpu/elementwise_impl.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#ifdef __NVCC__
......@@ -151,9 +150,9 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims,
int *x_dims_array, int *y_dims_array,
int *out_dims_array, const int max_dim,
const int axis) {
pten::general::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array,
y_dims_array, out_dims_array, max_dim,
axis);
pten::funcs::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array,
y_dims_array, out_dims_array, max_dim,
axis);
}
template <typename Functor, typename T, typename OutType = T>
......@@ -1073,71 +1072,9 @@ void CommonGradBroadcastCUDA(
inline framework::DDim trim_trailing_singular_dims(
const framework::DDim &dims) {
return pten::general::trim_trailing_singular_dims(dims);
return pten::funcs::trim_trailing_singular_dims(dims);
}
template <typename Functor, typename T, typename DeviceContext,
typename OutType = T>
class TransformFunctor {
public:
TransformFunctor(const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z, const DeviceContext &ctx, Functor func,
const bool is_xsize_larger = true)
: x_(x->data<T>()),
y_(y->data<T>()),
z_(z->mutable_data<OutType>(ctx.GetPlace())),
nx_(x->numel()),
ctx_(ctx),
func_(func),
is_xsize_larger_(is_xsize_larger) {
if (is_xsize_larger_ == false) {
nx_ = y->numel();
}
}
inline void Run() const {
platform::Transform<DeviceContext> trans;
trans(ctx_, x_, x_ + nx_, y_, z_, func_);
}
inline void RunRowWise(int n, int pre) const {
platform::Transform<DeviceContext> trans;
if (is_xsize_larger_) {
trans(ctx_, x_, x_ + nx_,
pten::general::RowwiseTransformIterator<T, DeviceContext>(y_, n),
z_, func_);
} else {
trans(ctx_, y_, y_ + nx_,
pten::general::RowwiseTransformIterator<T, DeviceContext>(x_, n),
z_, func_);
}
}
inline void RunMidWise(int n, int pre, int post) const {
platform::Transform<DeviceContext> trans;
if (is_xsize_larger_) {
trans(ctx_, x_, x_ + nx_,
pten::general::MidWiseTransformIterator<T, DeviceContext>(y_, n,
post),
z_, func_);
} else {
trans(ctx_, y_, y_ + nx_,
pten::general::MidWiseTransformIterator<T, DeviceContext>(x_, n,
post),
z_, func_);
}
}
private:
const T *x_;
const T *y_;
OutType *z_;
int64_t nx_;
const DeviceContext &ctx_;
Functor func_;
bool is_xsize_larger_;
};
template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
struct ElemwiseGradNoBroadcast {
const T *x_;
......@@ -1457,13 +1394,13 @@ void ElemwiseGradComputeWithBroadcast(
if (is_xsize_larger) {
auto y_dims_trimed = trim_trailing_singular_dims(y_dims);
axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
pten::general::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n,
&post, &is_run_common_broadcast);
pten::funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post,
&is_run_common_broadcast);
} else {
auto x_dims_trimed = trim_trailing_singular_dims(x_dims);
axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
pten::general::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n,
&post, &is_run_common_broadcast);
pten::funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post,
&is_run_common_broadcast);
}
// special case for common backward implementation.
if (is_run_common_broadcast) {
......@@ -1861,8 +1798,8 @@ void FusedElemwiseAndActComputeWithBroadcast(
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post, is_run_common_broadcast;
pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post,
&is_run_common_broadcast);
pten::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post,
&is_run_common_broadcast);
if (post == 1) {
int h = pre;
int w = n;
......@@ -2409,8 +2346,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
axis = (y_dim.size() == 0) ? x_dim.size() : axis;
int pre, n, post, is_run_common_broadcast;
pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post,
&is_run_common_broadcast);
pten::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post,
&is_run_common_broadcast);
const T *x_data = nullptr;
const T *y_data = nullptr;
if (x->IsInitialized()) x_data = x->data<T>();
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/core/kernel_registry.h"
// TODO(chenweihang) After the kernel is split into a single file,
// the kernel declare statement is automatically generated according to the
// file name of the kernel, and this header file will be removed
......@@ -14,7 +14,7 @@ limitations under the License. */
// See Note [ Why still include the fluid headers? ]
#include "paddle/pten/infermeta/binary.h"
#include "paddle/pten/kernels/hybird/general/elementwise_base.h"
#include "paddle/pten/kernels/funcs/elementwise_base.h"
namespace pten {
......@@ -162,13 +162,13 @@ DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta,
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
general::GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
funcs::GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
return_meta.dims = paddle::framework::make_ddim(out_dims_array);
}
return_meta.lod = x_meta.lod;
......
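For readers unfamiliar with the axis-based broadcasting that ElementwiseInferMeta relies on, here is a small worked example of what GetBroadcastDimsArrays produces; the concrete shapes are illustrative only.

// Assume x_dims = [2, 3, 4, 5], y_dims = [3, 4], axis = 1.
// y is aligned starting at dimension `axis`, so the padded arrays become
//   x_dims_array = [2, 3, 4, 5]
//   y_dims_array = [1, 3, 4, 1]
// Each pair of dims must be equal or contain a 1; the broadcast result is
//   out_dims_array = [2, 3, 4, 5]
// and return_meta.dims becomes make_ddim({2, 3, 4, 5}).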
......@@ -15,13 +15,175 @@ limitations under the License. */
#pragma once
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/hybird/general/elementwise_base.h"
#include "paddle/pten/kernels/funcs/elementwise_base.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/pten/kernels/hybird/eigen/common.h"
namespace pten {
inline void UpdateElementwiseIndexArray(const int *out_dims_array,
// Add
template <typename DevCtx, typename T, class Enable = void>
struct SameDimsAddFunctor {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z);
};
template <typename DevCtx, typename T>
struct SameDimsAddFunctor<
DevCtx,
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
blas.VADD(x.numel(), x.data<T>(), y.data<T>(), z->mutable_data<T>());
}
};
template <typename DevCtx, typename T>
struct SameDimsAddFunctor<
DevCtx,
T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
z->mutable_data<T>();
auto eigen_x = pten::EigenVector<T>::Flatten(x);
auto eigen_y = pten::EigenVector<T>::Flatten(y);
auto eigen_z = pten::EigenVector<T>::Flatten(*z);
auto& place = *dev_ctx.eigen_device();
eigen_z.device(place) = eigen_x + eigen_y;
}
};
// Subtract
template <typename DevCtx, typename T, class Enable = void>
struct SameDimsSubtractFunctor {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z);
};
template <typename DevCtx, typename T>
struct SameDimsSubtractFunctor<
DevCtx,
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
blas.VSUB(x.numel(), x.data<T>(), y.data<T>(), z->mutable_data<T>());
}
};
template <typename DevCtx, typename T>
struct SameDimsSubtractFunctor<
DevCtx,
T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
auto eigen_x = pten::EigenVector<T>::Flatten(x);
auto eigen_y = pten::EigenVector<T>::Flatten(y);
auto eigen_z = pten::EigenVector<T>::Flatten(*z);
auto& place = *dev_ctx.eigen_device();
eigen_z.device(place) = eigen_x - eigen_y;
}
};
// Divide
template <typename DevCtx, typename T, class Enable = void>
struct SameDimsDivideFunctor {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z);
};
template <typename DevCtx, typename T>
struct SameDimsDivideFunctor<
DevCtx,
T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
paddle::platform::errors::InvalidArgument(
"If use SameDimsDivideFunctor, template args(T) must be floating "
"point. ");
}
};
template <typename DevCtx, typename T>
struct SameDimsDivideFunctor<
DevCtx,
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
blas.VDIV(x.numel(), x.data<T>(), y.data<T>(), z->mutable_data<T>());
}
};
// Multiply
template <typename DevCtx, typename T, class Enable = void>
struct SameDimsMultiplyFunctor {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z);
};
template <typename DevCtx, typename T>
struct SameDimsMultiplyFunctor<
DevCtx,
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
blas.VMUL(x.numel(), x.data<T>(), y.data<T>(), z->mutable_data<T>());
}
};
template <typename DevCtx, typename T>
struct SameDimsMultiplyFunctor<
DevCtx,
T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
auto eigen_x = pten::EigenVector<T>::Flatten(x);
auto eigen_y = pten::EigenVector<T>::Flatten(y);
auto eigen_z = pten::EigenVector<T>::Flatten(*z);
auto& place = *dev_ctx.eigen_device();
eigen_z.device(place) = eigen_x * eigen_y;
}
};
inline void UpdateElementwiseIndexArray(const int* out_dims_array,
const int max_dim,
int *index_array) {
int* index_array) {
for (int i = max_dim - 1; i >= 0; --i) {
++index_array[i];
if (index_array[i] >= out_dims_array[i]) {
......@@ -32,9 +194,9 @@ inline void UpdateElementwiseIndexArray(const int *out_dims_array,
}
}
inline int GetElementwiseIndex(const int *x_dims_array,
inline int GetElementwiseIndex(const int* x_dims_array,
const int max_dim,
const int *index_array) {
const int* index_array) {
int index_ = 0;
for (int i = 0; i < max_dim; i++) {
if (x_dims_array[i] > 1) {
......@@ -45,26 +207,26 @@ inline int GetElementwiseIndex(const int *x_dims_array,
}
template <typename Functor, typename T, typename OutType = T>
void CommonForwardBroadcastCPU(const DenseTensor &x,
const DenseTensor &y,
DenseTensor *z,
int *x_dims_array,
int *y_dims_array,
int *out_dims_array,
void CommonForwardBroadcastCPU(const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z,
int* x_dims_array,
int* y_dims_array,
int* out_dims_array,
int max_dim,
const paddle::platform::CPUDeviceContext &ctx,
const paddle::platform::CPUDeviceContext& ctx,
Functor func,
const bool is_xsize_larger = true) {
std::vector<int> index_array(max_dim, 0);
const T *x_data = x.data<T>();
const T *y_data = y.data<T>();
const T* x_data = x.data<T>();
const T* y_data = y.data<T>();
PADDLE_ENFORCE_NOT_NULL(x_data,
paddle::platform::errors::InvalidArgument(
"The input X should not be empty."));
PADDLE_ENFORCE_NOT_NULL(y_data,
paddle::platform::errors::InvalidArgument(
"The input Y should not be empty."));
OutType *out_data = z->mutable_data<OutType>();
OutType* out_data = z->mutable_data<OutType>();
const int out_size = std::accumulate(
out_dims_array, out_dims_array + max_dim, 1, std::multiplies<int>());
......@@ -84,12 +246,12 @@ void CommonForwardBroadcastCPU(const DenseTensor &x,
template <typename Functor, typename T, typename OutType = T>
void CommonElementwiseBroadcastForward(
const paddle::platform::CPUDeviceContext &dev_ctx,
const DenseTensor &x,
const DenseTensor &y,
DenseTensor *z,
const DDim &x_dims,
const DDim &y_dims,
const paddle::platform::CPUDeviceContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z,
const DDim& x_dims,
const DDim& y_dims,
Functor func,
int axis,
const bool is_xsize_larger = true) {
......@@ -110,13 +272,13 @@ void CommonElementwiseBroadcastForward(
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
general::GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
funcs::GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
CommonForwardBroadcastCPU<Functor, T, OutType>(x,
y,
......@@ -140,12 +302,12 @@ void CommonElementwiseBroadcastForward(
// TODO(liuyiqun): optimize the CPU implementation to support all broadcast
// cases and avoid the need of XxxInverseFunctor.
template <typename Functor, typename T, typename OutType = T>
void ElementwiseCompute(const paddle::platform::CPUDeviceContext &dev_ctx,
const DenseTensor &x,
const DenseTensor &y,
void ElementwiseCompute(const paddle::platform::CPUDeviceContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
int axis,
Functor func,
DenseTensor *z) {
DenseTensor* z) {
z->mutable_data<OutType>();
auto x_dims = x.dims();
auto y_dims = y.dims();
......@@ -155,7 +317,7 @@ void ElementwiseCompute(const paddle::platform::CPUDeviceContext &dev_ctx,
is_xsize_larger = false;
max_dim = y_dims.size();
}
general::
funcs::
TransformFunctor<Functor, T, paddle::platform::CPUDeviceContext, OutType>
functor(x, y, z, dev_ctx, func, is_xsize_larger);
if (x_dims == y_dims) {
......@@ -179,25 +341,25 @@ void ElementwiseCompute(const paddle::platform::CPUDeviceContext &dev_ctx,
int pre, n, post, is_run_common_broadcast, axis_trim = 0;
if (is_xsize_larger) {
auto y_dims_trimed = general::trim_trailing_singular_dims(y_dims);
auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims);
axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis;
general::get_mid_dims(x_dims,
y_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
funcs::get_mid_dims(x_dims,
y_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
} else {
auto x_dims_trimed = general::trim_trailing_singular_dims(x_dims);
auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims);
axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis;
general::get_mid_dims(y_dims,
x_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
funcs::get_mid_dims(y_dims,
x_dims_trimed,
axis_trim,
&pre,
&n,
&post,
&is_run_common_broadcast);
}
// special case for common implementation.
// case 1: x=[2,3,1,5], y=[2,1,4,1]
......@@ -219,10 +381,10 @@ void ElementwiseCompute(const paddle::platform::CPUDeviceContext &dev_ctx,
template <typename Functor>
struct SameDimsElementwiseCompute {
void operator()(const paddle::platform::CPUDeviceContext &dev_ctx,
const DenseTensor &x,
const DenseTensor &y,
DenseTensor *z) {
void operator()(const paddle::platform::CPUDeviceContext& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
Functor()(dev_ctx, x, y, z);
}
};
......
......@@ -18,9 +18,11 @@
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/common/scalar.h"
#include "paddle/pten/core/kernel_registry.h"
#include "paddle/pten/kernels/hybird/cpu/elementwise.h"
#include "paddle/pten/kernels/cpu/elementwise_impl.h"
#include "paddle/pten/kernels/funcs/elementwise_functor.h"
#include "paddle/pten/kernels/hybird/eigen/reduce.h"
#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
#include "paddle/pten/kernels/hybird/general/reduce_impl.h"
// See Note [ Why still include the fluid headers? ]
......@@ -30,29 +32,28 @@
namespace pten {
#define DEFINE_CPU_ELEMENTWISE_OP(name) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
int axis, \
DenseTensor* out) { \
out->mutable_data<T>(); \
if (x.dims() == y.dims()) { \
SameDimsElementwiseCompute< \
general::SameDims##name##Functor<CPUContext, T>>()( \
dev_ctx, x, y, out); \
} else { \
auto x_dims = x.dims(); \
auto y_dims = y.dims(); \
if (x_dims.size() >= y_dims.size()) { \
ElementwiseCompute<general::name##Functor<T>, T>( \
dev_ctx, x, y, axis, general::name##Functor<T>(), out); \
} else { \
ElementwiseCompute<general::Inverse##name##Functor<T>, T>( \
dev_ctx, x, y, axis, general::Inverse##name##Functor<T>(), out); \
} \
} \
#define DEFINE_CPU_ELEMENTWISE_OP(name) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
int axis, \
DenseTensor* out) { \
out->mutable_data<T>(); \
if (x.dims() == y.dims()) { \
SameDimsElementwiseCompute<SameDims##name##Functor<CPUContext, T>>()( \
dev_ctx, x, y, out); \
} else { \
auto x_dims = x.dims(); \
auto y_dims = y.dims(); \
if (x_dims.size() >= y_dims.size()) { \
ElementwiseCompute<funcs::name##Functor<T>, T>( \
dev_ctx, x, y, axis, funcs::name##Functor<T>(), out); \
} else { \
ElementwiseCompute<funcs::Inverse##name##Functor<T>, T>( \
dev_ctx, x, y, axis, funcs::Inverse##name##Functor<T>(), out); \
} \
} \
}
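For reference, a hand expansion of the macro above for name = Add reads as follows. This is a sketch of what the preprocessor generates (restructured slightly into else-if form), assuming the SameDimsAddFunctor and funcs::AddFunctor / funcs::InverseAddFunctor declared elsewhere in this diff.

template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
               const DenseTensor& x,
               const DenseTensor& y,
               int axis,
               DenseTensor* out) {
  out->mutable_data<T>();
  if (x.dims() == y.dims()) {
    // Identical shapes: take the same-dims fast path (blas/Eigen).
    SameDimsElementwiseCompute<SameDimsAddFunctor<CPUContext, T>>()(
        dev_ctx, x, y, out);
  } else if (x.dims().size() >= y.dims().size()) {
    // x has the higher rank: broadcast y with the forward functor.
    ElementwiseCompute<funcs::AddFunctor<T>, T>(
        dev_ctx, x, y, axis, funcs::AddFunctor<T>(), out);
  } else {
    // y has the higher rank: swap the operands via the inverse functor
    // (see the TODO earlier in this diff about avoiding XxxInverseFunctor).
    ElementwiseCompute<funcs::InverseAddFunctor<T>, T>(
        dev_ctx, x, y, axis, funcs::InverseAddFunctor<T>(), out);
  }
}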
template <typename T, typename Context>
......@@ -76,17 +77,17 @@ void DivideKernel(const Context& dev_ctx,
// allocate memory for out
out->mutable_data<T>();
if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
SameDimsElementwiseCompute<general::SameDimsDivideFunctor<CPUContext, T>>()(
SameDimsElementwiseCompute<SameDimsDivideFunctor<CPUContext, T>>()(
dev_ctx, x, y, out);
} else {
auto x_dims = x.dims();
auto y_dims = y.dims();
if (x_dims.size() >= y_dims.size()) {
ElementwiseCompute<general::DivideFunctor<T>, T>(
dev_ctx, x, y, axis, general::DivideFunctor<T>(), out);
ElementwiseCompute<funcs::DivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::DivideFunctor<T>(), out);
} else {
ElementwiseCompute<general::InverseDivideFunctor<T>, T>(
dev_ctx, x, y, axis, general::InverseDivideFunctor<T>(), out);
ElementwiseCompute<funcs::InverseDivideFunctor<T>, T>(
dev_ctx, x, y, axis, funcs::InverseDivideFunctor<T>(), out);
}
}
}
......
......@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/pten/core/dense_tensor.h"
namespace pten {
namespace general {
namespace funcs {
using DDim = paddle::framework::DDim;
......@@ -378,6 +378,5 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims,
}
}
}
} // namespace general
} // namespace funcs
} // namespace pten
......@@ -17,50 +17,13 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/hybird/blas/elementwise.h"
#include "paddle/pten/kernels/hybird/eigen/elementwise.h"
namespace pten {
namespace general {
namespace funcs {
// Define the binary functors used in elementwise ops.
// Add
template <typename DevCtx, typename T, class Enable = void>
struct SameDimsAddFunctor {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z);
};
template <typename DevCtx, typename T>
struct SameDimsAddFunctor<
DevCtx,
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
blas::ElementwiseAdd<DevCtx, T>(dev_ctx, x, y, z);
}
};
template <typename DevCtx, typename T>
struct SameDimsAddFunctor<
DevCtx,
T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
eigen::ElementwiseAdd<DevCtx, T>(dev_ctx, x, y, z);
}
};
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; }
......@@ -71,40 +34,6 @@ struct InverseAddFunctor {
};
// Subtract
template <typename DevCtx, typename T, class Enable = void>
struct SameDimsSubtractFunctor {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z);
};
template <typename DevCtx, typename T>
struct SameDimsSubtractFunctor<
DevCtx,
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
blas::ElementwiseSub<DevCtx, T>(dev_ctx, x, y, z);
}
};
template <typename DevCtx, typename T>
struct SameDimsSubtractFunctor<
DevCtx,
T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
eigen::ElementwiseSub<DevCtx, T>(dev_ctx, x, y, z);
}
};
template <typename T>
struct SubtractFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; }
......@@ -114,43 +43,17 @@ struct InverseSubtractFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; }
};
// Divide
template <typename DevCtx, typename T, class Enable = void>
struct SameDimsDivideFunctor {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z);
};
template <typename DevCtx, typename T>
struct SameDimsDivideFunctor<
DevCtx,
T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
paddle::platform::errors::InvalidArgument(
"If use SameDimsDivideFunctor, template args(T) must be floating "
"point. ");
}
// Multiply
template <typename T>
struct MultiplyFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
};
template <typename DevCtx, typename T>
struct SameDimsDivideFunctor<
DevCtx,
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
blas::ElementwiseDiv<DevCtx, T>(dev_ctx, x, y, z);
}
template <typename T>
struct InverseMultiplyFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; }
};
// Divide
#define DIV_ERROR_INFO \
"InvalidArgumentError: Integer division by zero encountered in " \
"(floor) divide. Please check the input value."
......@@ -176,48 +79,5 @@ struct InverseDivideFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; }
};
// Multiply
template <typename DevCtx, typename T, class Enable = void>
struct SameDimsMultiplyFunctor {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z);
};
template <typename DevCtx, typename T>
struct SameDimsMultiplyFunctor<
DevCtx,
T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
blas::ElementwiseMul<DevCtx, T>(dev_ctx, x, y, z);
}
};
template <typename DevCtx, typename T>
struct SameDimsMultiplyFunctor<
DevCtx,
T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* z) {
eigen::ElementwiseMul<DevCtx, T>(dev_ctx, x, y, z);
}
};
template <typename T>
struct MultiplyFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
};
template <typename T>
struct InverseMultiplyFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; }
};
} // namespace general
} // namespace funcs
} // namespace pten
......@@ -15,9 +15,9 @@ limitations under the License. */
#include "paddle/pten/kernels/math_kernel.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/funcs/elementwise_functor.h"
#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h"
#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h"
#include "paddle/pten/kernels/hybird/general/elementwise_functor.h"
#include "paddle/pten/kernels/hybird/general/reduce_impl.h"
#ifdef __NVCC__
......@@ -39,21 +39,21 @@ namespace kps = paddle::operators::kernel_primitives;
namespace pten {
#define DEFINE_CUDA_ELEMENTWISE_OP(name) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
int axis, \
DenseTensor* out) { \
std::vector<const DenseTensor*> inputs; \
std::vector<DenseTensor*> outputs; \
inputs.emplace_back(&x); \
inputs.emplace_back(&y); \
outputs.emplace_back(out); \
out->mutable_data<T>(); \
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>( \
dev_ctx, inputs, &outputs, axis, general::name##Functor<T>()); \
#define DEFINE_CUDA_ELEMENTWISE_OP(name) \
template <typename T, typename Context> \
void name##Kernel(const Context& dev_ctx, \
const DenseTensor& x, \
const DenseTensor& y, \
int axis, \
DenseTensor* out) { \
std::vector<const DenseTensor*> inputs; \
std::vector<DenseTensor*> outputs; \
inputs.emplace_back(&x); \
inputs.emplace_back(&y); \
outputs.emplace_back(out); \
out->mutable_data<T>(); \
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>( \
dev_ctx, inputs, &outputs, axis, funcs::name##Functor<T>()); \
}
/**
......
add_subdirectory(eigen)
add_subdirectory(blas)
add_subdirectory(general)
cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor pten_context)
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/pten/core/dense_tensor.h"
namespace pten {
namespace blas {
template <typename DevCtx, typename T>
void ElementwiseAdd(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
blas.VADD(x.numel(), x.data<T>(), y.data<T>(), out->mutable_data<T>());
}
template <typename DevCtx, typename T>
void ElementwiseSub(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
blas.VSUB(x.numel(), x.data<T>(), y.data<T>(), out->mutable_data<T>());
}
template <typename DevCtx, typename T>
void ElementwiseDiv(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
blas.VDIV(x.numel(), x.data<T>(), y.data<T>(), out->mutable_data<T>());
}
template <typename DevCtx, typename T>
void ElementwiseMul(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
blas.VMUL(x.numel(), x.data<T>(), y.data<T>(), out->mutable_data<T>());
}
} // namespace blas
} // namespace pten
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/hybird/general/elementwise_base.h"
#include "paddle/pten/kernels/funcs/elementwise_base.h"
namespace pten {
namespace kps = paddle::operators::kernel_primitives;
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/hybird/eigen/common.h"
namespace pten {
namespace eigen {
template <typename DevCtx, typename T>
void ElementwiseAdd(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
out->mutable_data<T>();
auto eigen_x = pten::EigenVector<T>::Flatten(x);
auto eigen_y = pten::EigenVector<T>::Flatten(y);
auto eigen_z = pten::EigenVector<T>::Flatten(*out);
auto& place = *dev_ctx.eigen_device();
eigen_z.device(place) = eigen_x + eigen_y;
}
template <typename DevCtx, typename T>
void ElementwiseSub(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
auto eigen_x = pten::EigenVector<T>::Flatten(x);
auto eigen_y = pten::EigenVector<T>::Flatten(y);
auto eigen_z = pten::EigenVector<T>::Flatten(*out);
auto& place = *dev_ctx.eigen_device();
eigen_z.device(place) = eigen_x - eigen_y;
}
template <typename DevCtx, typename T>
void ElementwiseMul(const DevCtx& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
auto eigen_x = pten::EigenVector<T>::Flatten(x);
auto eigen_y = pten::EigenVector<T>::Flatten(y);
auto eigen_z = pten::EigenVector<T>::Flatten(*out);
auto& place = *dev_ctx.eigen_device();
eigen_z.device(place) = eigen_x * eigen_y;
}
} // namespace eigen
} // namespace pten
......@@ -342,7 +342,6 @@ def source_include(header_file_path):
#include "paddle/pten/api/include/kernel_signature.h"
#include "paddle/pten/api/lib/api_registry.h"
#include "paddle/pten/api/lib/kernel_declare.h"
#include "paddle/pten/api/lib/kernel_dispatch.h"
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/core/kernel_registry.h"
......