提交 43f11b5e 编写于 作者: D danleifeng 提交者: gongweibao

[cherry pick]Improve elementwise operators performance in same dimensions (#20134)

上级 2b84f18e
...@@ -20,6 +20,34 @@ limitations under the License. */ ...@@ -20,6 +20,34 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct SameDimsElemwiseAdd<
platform::CPUDeviceContext, T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
blas.VADD(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
}
};
template <typename T>
struct SameDimsElemwiseAdd<
platform::CPUDeviceContext, T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto eigen_x = framework::EigenVector<T>::Flatten(*x);
auto eigen_y = framework::EigenVector<T>::Flatten(*y);
auto eigen_z = framework::EigenVector<T>::Flatten(*z);
auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
eigen_z.device(place) = eigen_x + eigen_y;
}
};
class ElementwiseAddOpMaker : public ElementwiseOpMaker { class ElementwiseAddOpMaker : public ElementwiseOpMaker {
protected: protected:
std::string GetName() const override { return "Add"; } std::string GetName() const override { return "Add"; }
......
...@@ -11,13 +11,84 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,13 +11,84 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
namespace paddle {
namespace operators {
template <typename T>
struct SameDimsElemwiseAdd<platform::CUDADeviceContext, T> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
AddRangeFunctor<T> functor(x->data<T>(), y->data<T>(), z->data<T>());
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
x->numel());
for_range(functor);
}
};
template <>
struct SameDimsElemwiseAdd<platform::CUDADeviceContext, platform::float16> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
auto size = x->numel();
dim3 gird_size = dim3(
(size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
const half* x2 =
reinterpret_cast<const half*>(x->data<platform::float16>());
const half* y2 =
reinterpret_cast<const half*>(y->data<platform::float16>());
half* z2 = reinterpret_cast<half*>(z->data<platform::float16>());
SameDimsElemwiseAddCUDAKernel<<<
gird_size, block_size, 0,
ctx.template device_context<platform::CUDADeviceContext>().stream()>>>(
x2, y2, z2, size);
}
};
template <typename T>
static __global__ void SimpleElemwiseAddGradCUDAKernel(const T* dout,
int64_t size, T* dx,
T* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
dx[col] = dout[col];
dy[col] = dout[col];
col += blockDim.x * gridDim.x;
}
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
elementwise_add_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
auto size = x->numel();
dim3 gird_size =
dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
SimpleElemwiseAddGradCUDAKernel<
T><<<gird_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
dout->data<T>(), size, dx->mutable_data<T>(ctx.GetPlace()),
dy->mutable_data<T>(ctx.GetPlace()));
}
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>, elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>, ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
......
...@@ -11,22 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,22 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void default_elementwise_add(const framework::ExecutionContext &ctx, void default_elementwise_add(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *x,
...@@ -36,31 +29,12 @@ void default_elementwise_add(const framework::ExecutionContext &ctx, ...@@ -36,31 +29,12 @@ void default_elementwise_add(const framework::ExecutionContext &ctx,
AddFunctor<T>(), z); AddFunctor<T>(), z);
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T, class Enable = void>
typename std::enable_if< struct SameDimsElemwiseAdd {
std::is_floating_point<T>::value && void operator()(const framework::ExecutionContext &ctx,
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type const framework::Tensor *x, const framework::Tensor *y,
elementwise_add_same_dims(const framework::ExecutionContext &ctx, framework::Tensor *z);
const framework::Tensor *x, };
const framework::Tensor *y, framework::Tensor *z) {
auto blas = math::GetBlas<DeviceContext, T>(ctx);
blas.VADD(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
}
template <typename DeviceContext, typename T>
typename std::enable_if<
!std::is_floating_point<T>::value ||
!std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_same_dims(const framework::ExecutionContext &ctx,
const framework::Tensor *x,
const framework::Tensor *y, framework::Tensor *z) {
auto eigen_x = framework::EigenVector<T>::Flatten(*x);
auto eigen_y = framework::EigenVector<T>::Flatten(*y);
auto eigen_z = framework::EigenVector<T>::Flatten(*z);
auto &place = *ctx.template device_context<DeviceContext>().eigen_device();
eigen_z.device(place) = eigen_x + eigen_y;
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class ElementwiseAddKernel : public framework::OpKernel<T> { class ElementwiseAddKernel : public framework::OpKernel<T> {
...@@ -69,12 +43,11 @@ class ElementwiseAddKernel : public framework::OpKernel<T> { ...@@ -69,12 +43,11 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
auto *x = ctx.Input<framework::LoDTensor>("X"); auto *x = ctx.Input<framework::LoDTensor>("X");
auto *y = ctx.Input<framework::LoDTensor>("Y"); auto *y = ctx.Input<framework::LoDTensor>("Y");
auto *z = ctx.Output<framework::LoDTensor>("Out"); auto *z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
auto dims_equal = x->dims() == y->dims(); auto dims_equal = x->dims() == y->dims();
if (dims_equal) { if (dims_equal) {
elementwise_add_same_dims<DeviceContext, T>(ctx, x, y, z); SameDimsElemwiseAdd<DeviceContext, T> same_dims_add;
same_dims_add(ctx, x, y, z);
} else { } else {
default_elementwise_add<DeviceContext, T>(ctx, x, y, z); default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
} }
...@@ -112,7 +85,6 @@ elementwise_add_grad(const framework::ExecutionContext &ctx, ...@@ -112,7 +85,6 @@ elementwise_add_grad(const framework::ExecutionContext &ctx,
const framework::Tensor *dout, framework::Tensor *dx, const framework::Tensor *dout, framework::Tensor *dx,
framework::Tensor *dy) { framework::Tensor *dy) {
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
if (dx) { if (dx) {
blas.VCOPY(dout->numel(), dout->data<T>(), blas.VCOPY(dout->numel(), dout->data<T>(),
dx->mutable_data<T>(ctx.GetPlace())); dx->mutable_data<T>(ctx.GetPlace()));
...@@ -126,8 +98,8 @@ elementwise_add_grad(const framework::ExecutionContext &ctx, ...@@ -126,8 +98,8 @@ elementwise_add_grad(const framework::ExecutionContext &ctx,
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
typename std::enable_if< typename std::enable_if<
!std::is_floating_point<T>::value || !std::is_floating_point<T>::value &&
!std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_grad(const framework::ExecutionContext &ctx, elementwise_add_grad(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y, const framework::Tensor *x, const framework::Tensor *y,
const framework::Tensor *out, const framework::Tensor *out,
...@@ -136,6 +108,18 @@ elementwise_add_grad(const framework::ExecutionContext &ctx, ...@@ -136,6 +108,18 @@ elementwise_add_grad(const framework::ExecutionContext &ctx,
default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy); default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
} }
#ifdef PADDLE_WITH_CUDA
// cuda definition
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
elementwise_add_grad(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
const framework::Tensor *out,
const framework::Tensor *dout, framework::Tensor *dx,
framework::Tensor *dy);
#endif
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> { class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
public: public:
...@@ -151,8 +135,7 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> { ...@@ -151,8 +135,7 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
auto *out = dout; auto *out = dout;
auto *x = dout, *y = dout; auto *x = dout, *y = dout;
if (platform::is_cpu_place(ctx.GetPlace()) && dx != nullptr && if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
dy != nullptr && (dx->dims() == dy->dims())) {
elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy); elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
} else { } else {
default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
......
...@@ -20,6 +20,34 @@ limitations under the License. */ ...@@ -20,6 +20,34 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct SameDimsElemwiseDiv<
platform::CPUDeviceContext, T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
blas.VDIV(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
}
};
template <typename T>
struct SameDimsElemwiseDiv<
platform::CPUDeviceContext, T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto eigen_x = framework::EigenVector<T>::Flatten(*x);
auto eigen_y = framework::EigenVector<T>::Flatten(*y);
auto eigen_z = framework::EigenVector<T>::Flatten(*z);
auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
eigen_z.device(place) = eigen_x / eigen_y;
}
};
class ElementwiseDivOpMaker : public ElementwiseOpMaker { class ElementwiseDivOpMaker : public ElementwiseOpMaker {
protected: protected:
std::string GetName() const override { return "Div"; } std::string GetName() const override { return "Div"; }
......
...@@ -12,9 +12,87 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,87 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
namespace paddle {
namespace operators {
template <typename T>
struct SameDimsElemwiseDiv<platform::CUDADeviceContext, T> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
DivRangeFunctor<T> functor(x->data<T>(), y->data<T>(), z->data<T>());
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
x->numel());
for_range(functor);
}
};
template <>
struct SameDimsElemwiseDiv<platform::CUDADeviceContext, platform::float16> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
auto size = x->numel();
dim3 gird_size = dim3(
(size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
const half* x2 =
reinterpret_cast<const half*>(x->data<platform::float16>());
const half* y2 =
reinterpret_cast<const half*>(y->data<platform::float16>());
half* z2 = reinterpret_cast<half*>(z->data<platform::float16>());
SameDimsElemwiseDivCUDAKernel<<<
gird_size, block_size, 0,
ctx.template device_context<platform::CUDADeviceContext>().stream()>>>(
x2, y2, z2, size);
}
};
template <typename T>
static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y,
const T* out,
const T* dout,
int64_t size, T* dx,
T* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
T o = dout[col];
dx[col] = o / y[col];
dy[col] = -o * out[col] / y[col];
col += blockDim.x * gridDim.x;
}
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
elementwise_div_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
auto size = x->numel();
dim3 gird_size =
dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
SimpleElemwiseDivGradCUDAKernel<
T><<<gird_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
x->data<T>(), y->data<T>(), out->data<T>(), dout->data<T>(), size,
dx->mutable_data<T>(ctx.GetPlace()), dy->mutable_data<T>(ctx.GetPlace()));
}
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_div, elementwise_div,
......
...@@ -17,16 +17,29 @@ limitations under the License. */ ...@@ -17,16 +17,29 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T> template <typename DeviceContext, typename T>
struct DivFunctor { void default_elementwise_div(const framework::ExecutionContext& ctx,
inline HOSTDEVICE T operator()(T a, T b) const { return a / b; } const framework::Tensor* x,
const framework::Tensor* y, framework::Tensor* z) {
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
DivFunctor<T>(), z);
}
template <typename DeviceContext, typename T, class Enable = void>
struct SameDimsElemwiseDiv {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -36,11 +49,15 @@ class ElementwiseDivKernel : public framework::OpKernel<T> { ...@@ -36,11 +49,15 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
auto* x = ctx.Input<framework::LoDTensor>("X"); auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y"); auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out"); auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis, auto dims_equal = x->dims() == y->dims();
DivFunctor<T>(), z); if (dims_equal) {
SameDimsElemwiseDiv<DeviceContext, T> same_dims_div;
same_dims_div(ctx, x, y, z);
} else {
default_elementwise_div<DeviceContext, T>(ctx, x, y, z);
}
} }
}; };
...@@ -63,6 +80,31 @@ struct DivDoubleDY { ...@@ -63,6 +80,31 @@ struct DivDoubleDY {
} }
}; };
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_div_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
}
#ifdef PADDLE_WITH_CUDA
// cuda definition
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
elementwise_div_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy);
#endif
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> { class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
public: public:
...@@ -76,11 +118,15 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> { ...@@ -76,11 +118,15 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
auto* x = dout; // Fake x, not used auto* x = dout; // Fake x, not used
ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>( if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>()); elementwise_div_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
} else {
ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(),
DivGradDY<T>());
}
} }
}; };
......
...@@ -20,6 +20,34 @@ limitations under the License. */ ...@@ -20,6 +20,34 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct SameDimsElemwiseMul<
platform::CPUDeviceContext, T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
blas.VMUL(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
}
};
template <typename T>
struct SameDimsElemwiseMul<
platform::CPUDeviceContext, T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto eigen_x = framework::EigenVector<T>::Flatten(*x);
auto eigen_y = framework::EigenVector<T>::Flatten(*y);
auto eigen_z = framework::EigenVector<T>::Flatten(*z);
auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
eigen_z.device(place) = eigen_x * eigen_y;
}
};
class ElementwiseMulOpMaker : public ElementwiseOpMaker { class ElementwiseMulOpMaker : public ElementwiseOpMaker {
protected: protected:
std::string GetName() const override { return "Mul"; } std::string GetName() const override { return "Mul"; }
......
...@@ -13,15 +13,49 @@ See the License for the specific language governing permissions and ...@@ -13,15 +13,49 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#define TILE_SIZE 512
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct SameDimsElemwiseMul<platform::CUDADeviceContext, T> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
MulRangeFunctor<T> functor(x->data<T>(), y->data<T>(), z->data<T>());
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
x->numel());
for_range(functor);
}
};
template <>
struct SameDimsElemwiseMul<platform::CUDADeviceContext, platform::float16> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
auto size = x->numel();
dim3 gird_size = dim3(
(size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
const half* x2 =
reinterpret_cast<const half*>(x->data<platform::float16>());
const half* y2 =
reinterpret_cast<const half*>(y->data<platform::float16>());
half* z2 = reinterpret_cast<half*>(z->data<platform::float16>());
SameDimsElemwiseMulCUDAKernel<<<
gird_size, block_size, 0,
ctx.template device_context<platform::CUDADeviceContext>().stream()>>>(
x2, y2, z2, size);
}
};
template <typename T> template <typename T>
static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y, static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y,
const T* out, const T* out,
...@@ -38,40 +72,24 @@ static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y, ...@@ -38,40 +72,24 @@ static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y,
} }
} }
template <typename T> template <typename DeviceContext, typename T>
class ElementwiseMulGradKernel<plat::CUDADeviceContext, T> typename std::enable_if<
: public ElemwiseGradKernel<T> { std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
public: elementwise_mul_grad(const framework::ExecutionContext& ctx,
void Compute(const framework::ExecutionContext& ctx) const override { const framework::Tensor* x, const framework::Tensor* y,
ElemwiseGradKernel<T>::Compute(ctx); const framework::Tensor* out,
using Tensor = framework::Tensor; const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
auto* x = ctx.Input<Tensor>("X"); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
auto* y = ctx.Input<Tensor>("Y"); auto size = x->numel();
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out")); dim3 gird_size =
auto* out = dout; // out is not necessary dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); SimpleElemwiseMulGradCUDAKernel<
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); T><<<gird_size, block_size, 0,
int axis = ctx.Attr<int>("axis"); ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
x->data<T>(), y->data<T>(), out->data<T>(), dout->data<T>(), size,
if (x->dims() == y->dims() && dx && dy) { dx->mutable_data<T>(ctx.GetPlace()), dy->mutable_data<T>(ctx.GetPlace()));
dim3 block_size = dim3(TILE_SIZE, 1); }
auto size = x->numel();
dim3 gird_size = dim3((size + TILE_SIZE - 1) / TILE_SIZE, 1);
SimpleElemwiseMulGradCUDAKernel<T><<<
gird_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
x->data<T>(), y->data<T>(), out->data<T>(), dout->data<T>(), size,
dx->mutable_data<T>(ctx.GetPlace()),
dy->mutable_data<T>(ctx.GetPlace()));
return;
} else {
ElemwiseGradCompute<plat::CUDADeviceContext, T, MulGradDX<T>,
MulGradDY<T>>(ctx, *x, *y, *out, *dout, axis, dx, dy,
MulGradDX<T>(), MulGradDY<T>());
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
...@@ -14,17 +14,13 @@ limitations under the License. */ ...@@ -14,17 +14,13 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct MulFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
};
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void default_elementwise_mul(const framework::ExecutionContext& ctx, void default_elementwise_mul(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* x,
...@@ -33,32 +29,12 @@ void default_elementwise_mul(const framework::ExecutionContext& ctx, ...@@ -33,32 +29,12 @@ void default_elementwise_mul(const framework::ExecutionContext& ctx,
ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis, ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
MulFunctor<T>(), z); MulFunctor<T>(), z);
} }
template <typename DeviceContext, typename T, class Enable = void>
template <typename DeviceContext, typename T> struct SameDimsElemwiseMul {
typename std::enable_if< void operator()(const framework::ExecutionContext& ctx,
std::is_floating_point<T>::value && const framework::Tensor* x, const framework::Tensor* y,
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type framework::Tensor* z);
elementwise_mul_same_dims(const framework::ExecutionContext& ctx, };
const framework::Tensor* x,
const framework::Tensor* y, framework::Tensor* z) {
auto blas = math::GetBlas<DeviceContext, T>(ctx);
blas.VMUL(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
}
template <typename DeviceContext, typename T>
typename std::enable_if<
!std::is_floating_point<T>::value ||
!std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_mul_same_dims(const framework::ExecutionContext& ctx,
const framework::Tensor* x,
const framework::Tensor* y, framework::Tensor* z) {
auto eigen_x = framework::EigenVector<T>::Flatten(*x);
auto eigen_y = framework::EigenVector<T>::Flatten(*y);
auto eigen_z = framework::EigenVector<T>::Flatten(*z);
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
eigen_z.device(place) = eigen_x * eigen_y;
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class ElementwiseMulKernel : public framework::OpKernel<T> { class ElementwiseMulKernel : public framework::OpKernel<T> {
...@@ -92,7 +68,8 @@ class ElementwiseMulKernel : public framework::OpKernel<T> { ...@@ -92,7 +68,8 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
if (x.numel() == y->numel()) { if (x.numel() == y->numel()) {
elementwise_mul_same_dims<DeviceContext, T>(ctx, &x, y, z); SameDimsElemwiseMul<DeviceContext, T> same_dims_mul;
same_dims_mul(ctx, &x, y, z);
} else { } else {
default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z); default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
} }
...@@ -109,6 +86,31 @@ struct MulGradDY { ...@@ -109,6 +86,31 @@ struct MulGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
}; };
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_mul_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX<T>(), MulGradDY<T>());
}
#ifdef PADDLE_WITH_CUDA
// cuda definition
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
elementwise_mul_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy);
#endif
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> { class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> {
public: public:
...@@ -123,8 +125,13 @@ class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> { ...@@ -123,8 +125,13 @@ class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>( if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX<T>(), MulGradDY<T>()); elementwise_mul_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
} else {
ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX<T>(),
MulGradDY<T>());
}
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
#define PADDLE_CUDA_THREAD_SIZE 512
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_CUDA_FP16
#include <cuda_fp16.h>
#endif
#if CUDA_VERSION < 9000
#define __h2div h2div
#endif
namespace paddle {
namespace operators {
#define DEFINE_SIMPLE_BINARY_FUNCTOR(Func, expr) \
template <typename T> \
struct Func##Functor { \
inline HOSTDEVICE T operator()(const T& a, const T& b) const { \
return a expr b; \
} \
};
DEFINE_SIMPLE_BINARY_FUNCTOR(Add, +)
DEFINE_SIMPLE_BINARY_FUNCTOR(Sub, -)
DEFINE_SIMPLE_BINARY_FUNCTOR(Mul, *)
DEFINE_SIMPLE_BINARY_FUNCTOR(Div, /)
#undef DEFINE_SIMPLE_BINARY_FUNCTOR
#define DEFINE_SIMPLE_CUDA_BINARY_FUNCTOR(Func, expr) \
template <typename T> \
struct Func##RangeFunctor { \
Func##RangeFunctor(const T* x, const T* y, T* z) : x_(x), y_(y), z_(z) {} \
inline HOSTDEVICE void operator()(size_t id) const { \
z_[id] = x_[id] expr y_[id]; \
} \
const T* x_; \
const T* y_; \
T* z_; \
};
DEFINE_SIMPLE_CUDA_BINARY_FUNCTOR(Add, +)
DEFINE_SIMPLE_CUDA_BINARY_FUNCTOR(Sub, -)
DEFINE_SIMPLE_CUDA_BINARY_FUNCTOR(Mul, *)
DEFINE_SIMPLE_CUDA_BINARY_FUNCTOR(Div, /)
#undef DEFINE_SIMPLE_CUDA_BINARY_FUNCTOR
#ifdef PADDLE_CUDA_FP16
inline DEVICE half2 half2_add(const half2& a, const half2& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hadd2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 + b1;
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
}
inline DEVICE half2 half2_sub(const half2& a, const half2& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hsub2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 - b1;
float r2 = a2 - b2;
return __floats2half2_rn(r1, r2);
#endif
}
inline DEVICE half2 half2_mul(const half2& a, const half2& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hmul2(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 * b1;
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
}
inline DEVICE half2 half2_div(const half2& a, const half2& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __h2div(a, b);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
float b2 = __high2float(b);
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
#endif
}
#define DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Func, expr, FP16Function) \
template <typename T> \
__global__ void SameDimsElemwise##Func##CUDAKernel(const T* x, const T* y, \
T* z, int64_t size) { \
int col = blockIdx.x * blockDim.x + threadIdx.x; \
while (col < size) { \
z[col] = x[col] expr y[col]; \
col += blockDim.x * gridDim.x; \
} \
} \
template <> \
inline __global__ void SameDimsElemwise##Func##CUDAKernel<half>( \
const half* x, const half* y, half* z, int64_t size) { \
int start = threadIdx.x + blockDim.x * blockIdx.x; \
int stride = blockDim.x * gridDim.x; \
int n2 = size / 2; \
const half2* x2 = reinterpret_cast<const half2*>(x); \
const half2* y2 = reinterpret_cast<const half2*>(y); \
half2* z2 = reinterpret_cast<half2*>(z); \
for (int i = start; i < n2; i += stride) { \
z2[i] = FP16Function(x2[i], y2[i]); \
} \
if (start == 0 && (size % 2)) { \
z[size - 1] = __float2half(__half2float(x[size - 1]) \
expr __half2float(y[size - 1])); \
} \
}
DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Add, +, half2_add)
DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Sub, -, half2_sub)
DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Mul, *, half2_mul)
DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Div, /, half2_div)
#undef DEFINE_SIMPLE_CUDA_BINARY_KERNEL
#endif // PADDLE_CUDA_FP16
} // namespace operators
} // namespace paddle
...@@ -20,6 +20,33 @@ limitations under the License. */ ...@@ -20,6 +20,33 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
struct SameDimsElemwiseSub<
platform::CPUDeviceContext, T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
blas.VSUB(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
}
};
template <typename T>
struct SameDimsElemwiseSub<
platform::CPUDeviceContext, T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto eigen_x = framework::EigenVector<T>::Flatten(*x);
auto eigen_y = framework::EigenVector<T>::Flatten(*y);
auto eigen_z = framework::EigenVector<T>::Flatten(*z);
auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
eigen_z.device(place) = eigen_x - eigen_y;
}
};
class ElementwiseSubOpMaker : public ElementwiseOpMaker { class ElementwiseSubOpMaker : public ElementwiseOpMaker {
protected: protected:
std::string GetName() const override { return "Sub"; } std::string GetName() const override { return "Sub"; }
......
...@@ -11,10 +11,85 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,10 +11,85 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
namespace paddle {
namespace operators {
template <typename T>
struct SameDimsElemwiseSub<platform::CUDADeviceContext, T> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
SubRangeFunctor<T> functor(x->data<T>(), y->data<T>(), z->data<T>());
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx,
x->numel());
for_range(functor);
}
};
template <>
struct SameDimsElemwiseSub<platform::CUDADeviceContext, platform::float16> {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z) {
auto size = x->numel();
dim3 gird_size = dim3(
(size / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
const half* x2 =
reinterpret_cast<const half*>(x->data<platform::float16>());
const half* y2 =
reinterpret_cast<const half*>(y->data<platform::float16>());
half* z2 = reinterpret_cast<half*>(z->data<platform::float16>());
SameDimsElemwiseSubCUDAKernel<<<
gird_size, block_size, 0,
ctx.template device_context<platform::CUDADeviceContext>().stream()>>>(
x2, y2, z2, size);
}
};
template <typename T>
static __global__ void SimpleElemwiseSubGradCUDAKernel(const T* dout,
int64_t size, T* dx,
T* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
dx[col] = dout[col];
dy[col] = -dout[col];
col += blockDim.x * gridDim.x;
}
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
elementwise_sub_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
auto size = x->numel();
dim3 gird_size =
dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
SimpleElemwiseSubGradCUDAKernel<
T><<<gird_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
dout->data<T>(), size, dx->mutable_data<T>(ctx.GetPlace()),
dy->mutable_data<T>(ctx.GetPlace()));
}
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_sub, elementwise_sub,
......
...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); ...@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
...@@ -14,14 +14,27 @@ limitations under the License. */ ...@@ -14,14 +14,27 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T> template <typename DeviceContext, typename T>
struct SubFunctor { void default_elementwise_sub(const framework::ExecutionContext& ctx,
inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } const framework::Tensor* x,
const framework::Tensor* y, framework::Tensor* z) {
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
SubFunctor<T>(), z);
}
template <typename DeviceContext, typename T, class Enable = void>
struct SameDimsElemwiseSub {
void operator()(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
framework::Tensor* z);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -31,11 +44,15 @@ class ElementwiseSubKernel : public framework::OpKernel<T> { ...@@ -31,11 +44,15 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
auto* x = ctx.Input<framework::LoDTensor>("X"); auto* x = ctx.Input<framework::LoDTensor>("X");
auto* y = ctx.Input<framework::LoDTensor>("Y"); auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out"); auto* z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis, auto dims_equal = x->dims() == y->dims();
SubFunctor<T>(), z); if (dims_equal) {
SameDimsElemwiseSub<DeviceContext, T> same_dims_sub;
same_dims_sub(ctx, x, y, z);
} else {
default_elementwise_sub<DeviceContext, T>(ctx, x, y, z);
}
} }
}; };
...@@ -49,6 +66,31 @@ struct SubGradDY { ...@@ -49,6 +66,31 @@ struct SubGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; }
}; };
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_sub_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
int axis = ctx.Attr<int>("axis");
ElemwiseExplicitGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
}
#ifdef PADDLE_WITH_CUDA
// cuda definition
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
elementwise_sub_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy);
#endif
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class ElementwiseSubGradKernel : public ElemwiseGradKernel<T> { class ElementwiseSubGradKernel : public ElemwiseGradKernel<T> {
public: public:
...@@ -63,9 +105,13 @@ class ElementwiseSubGradKernel : public ElemwiseGradKernel<T> { ...@@ -63,9 +105,13 @@ class ElementwiseSubGradKernel : public ElemwiseGradKernel<T> {
// skip out, x, y // skip out, x, y
auto* out = dout; auto* out = dout;
auto *x = dout, *y = dout; auto *x = dout, *y = dout;
if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
ElemwiseExplicitGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>( elementwise_sub_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>()); } else {
ElemwiseExplicitGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(),
SubGradDY<T>());
}
} }
}; };
......
...@@ -159,9 +159,15 @@ class Blas { ...@@ -159,9 +159,15 @@ class Blas {
template <typename T> template <typename T>
void VADD(int n, const T* x, const T* y, T* z) const; void VADD(int n, const T* x, const T* y, T* z) const;
template <typename T>
void VSUB(int n, const T* x, const T* y, T* z) const;
template <typename T> template <typename T>
void VMUL(int n, const T* x, const T* y, T* z) const; void VMUL(int n, const T* x, const T* y, T* z) const;
template <typename T>
void VDIV(int n, const T* x, const T* y, T* z) const;
template <typename T> template <typename T>
void VCOPY(int n, const T* x, T* y) const; void VCOPY(int n, const T* x, T* y) const;
...@@ -275,11 +281,21 @@ class BlasT : private Blas<DeviceContext> { ...@@ -275,11 +281,21 @@ class BlasT : private Blas<DeviceContext> {
Base()->template VADD<T>(args...); Base()->template VADD<T>(args...);
} }
template <typename... ARGS>
void VSUB(ARGS... args) const {
Base()->template VSUB<T>(args...);
}
template <typename... ARGS> template <typename... ARGS>
void VMUL(ARGS... args) const { void VMUL(ARGS... args) const {
Base()->template VMUL<T>(args...); Base()->template VMUL<T>(args...);
} }
template <typename... ARGS>
void VDIV(ARGS... args) const {
Base()->template VDIV<T>(args...);
}
template <typename... ARGS> template <typename... ARGS>
void VCOPY(ARGS... args) const { void VCOPY(ARGS... args) const {
Base()->template VCOPY<T>(args...); Base()->template VCOPY<T>(args...);
......
...@@ -99,11 +99,21 @@ struct CBlas<float> { ...@@ -99,11 +99,21 @@ struct CBlas<float> {
platform::dynload::vsAdd(args...); platform::dynload::vsAdd(args...);
} }
template <typename... ARGS>
static void VSUB(ARGS... args) {
platform::dynload::vsSub(args...);
}
template <typename... ARGS> template <typename... ARGS>
static void VMUL(ARGS... args) { static void VMUL(ARGS... args) {
platform::dynload::vsMul(args...); platform::dynload::vsMul(args...);
} }
template <typename... ARGS>
static void VDIV(ARGS... args) {
platform::dynload::vsDiv(args...);
}
template <typename... ARGS> template <typename... ARGS>
static void VEXP(ARGS... args) { static void VEXP(ARGS... args) {
platform::dynload::vsExp(args...); platform::dynload::vsExp(args...);
...@@ -210,11 +220,21 @@ struct CBlas<double> { ...@@ -210,11 +220,21 @@ struct CBlas<double> {
platform::dynload::vdAdd(args...); platform::dynload::vdAdd(args...);
} }
template <typename... ARGS>
static void VSUB(ARGS... args) {
platform::dynload::vdSub(args...);
}
template <typename... ARGS> template <typename... ARGS>
static void VMUL(ARGS... args) { static void VMUL(ARGS... args) {
platform::dynload::vdMul(args...); platform::dynload::vdMul(args...);
} }
template <typename... ARGS>
static void VDIV(ARGS... args) {
platform::dynload::vdDiv(args...);
}
template <typename... ARGS> template <typename... ARGS>
static void VEXP(ARGS... args) { static void VEXP(ARGS... args) {
platform::dynload::vdExp(args...); platform::dynload::vdExp(args...);
...@@ -443,6 +463,20 @@ void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y, ...@@ -443,6 +463,20 @@ void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
#endif #endif
} }
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VSUB(int n, const T *x, const T *y,
T *z) const {
#ifdef PADDLE_WITH_MKLML
CBlas<T>::VSUB(n, x, y, z);
#else
// try to find if openblas support vsub
for (int i = 0; i < n; ++i) {
z[i] = x[i] - y[i];
}
#endif
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y, void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y,
...@@ -457,6 +491,20 @@ void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y, ...@@ -457,6 +491,20 @@ void Blas<platform::CPUDeviceContext>::VMUL(int n, const T *x, const T *y,
#endif #endif
} }
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VDIV(int n, const T *x, const T *y,
T *z) const {
#ifdef PADDLE_WITH_MKLML
CBlas<T>::VDIV(n, x, y, z);
#else
// try to find if openblas support vdiv
for (int i = 0; i < n; ++i) {
z[i] = x[i] / y[i];
}
#endif
}
template <> template <>
template <typename T> template <typename T>
void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const { void Blas<platform::CPUDeviceContext>::VEXP(int n, const T *x, T *y) const {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -76,8 +73,12 @@ extern void* mklml_dso_handle; ...@@ -76,8 +73,12 @@ extern void* mklml_dso_handle;
__macro(cblas_dscal); \ __macro(cblas_dscal); \
__macro(vsAdd); \ __macro(vsAdd); \
__macro(vdAdd); \ __macro(vdAdd); \
__macro(vsSub); \
__macro(vdSub); \
__macro(vsMul); \ __macro(vsMul); \
__macro(vdMul); \ __macro(vdMul); \
__macro(vsDiv); \
__macro(vdDiv); \
__macro(vsExp); \ __macro(vsExp); \
__macro(vdExp); \ __macro(vdExp); \
__macro(vsSqr); \ __macro(vsSqr); \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册