From 0e7baabe59759ada46079b2822f57b7805f53edc Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Tue, 19 Nov 2019 12:53:44 +0800
Subject: [PATCH] extend elementwise broadcast function (#20957)

---
 .../elementwise/elementwise_add_op.cc         |    4 +-
 .../elementwise/elementwise_add_op.h          |   14 +-
 .../elementwise/elementwise_div_op.cc         |    1 +
 .../elementwise/elementwise_div_op.h          |   22 +-
 .../elementwise/elementwise_mul_op.h          |   10 +-
 .../operators/elementwise/elementwise_op.h    |  140 +-
 .../elementwise/elementwise_op_function.cu.h  |    6 +
 .../elementwise/elementwise_op_function.h     | 1180 ++++++++++++-----
 .../elementwise/elementwise_sub_op.cc         |    7 +-
 .../elementwise/elementwise_sub_op.h          |   14 +-
 .../mkldnn/elementwise_add_mkldnn_op.cc       |    7 +-
 .../mkldnn/elementwise_mul_mkldnn_op.cc       |   10 +-
 paddle/fluid/operators/layer_norm_op.h        |   16 +-
 .../fluid/operators/op_debug_string_test.cc   |    5 +
 paddle/fluid/train/demo/run.sh                |    4 +-
 .../unittests/test_elementwise_add_op.py      |   30 +
 .../unittests/test_elementwise_div_op.py      |   33 +
 .../unittests/test_elementwise_mul_op.py      |   35 +
 .../unittests/test_elementwise_sub_op.py      |   35 +
 ..._executor_return_tensor_not_overwriting.py |    2 +-
 .../unittests/test_fl_listen_and_serv_op.py   |    2 +-
 21 files changed, 1074 insertions(+), 503 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
index 536e6cba40..4cdda93479 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
@@ -99,8 +99,8 @@ REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_add, Add);

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(
-    elementwise_add_grad, ops::ElementwiseOpExplicitGrad,
-    ops::ElementwiseGradOpInplace, ops::ElementwiseGradNoBufVarsInference,
+    elementwise_add_grad, ops::ElementwiseOpGrad, ops::ElementwiseGradOpInplace,
+    ops::ElementwiseGradNoBufVarsInference,
     ops::ElementwiseAddDoubleGradMaker<paddle::framework::OpDesc>,
     ops::ElementwiseAddDoubleGradMaker<paddle::imperative::OpBase>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index 315a229031..abb8673ee4 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -25,8 +25,13 @@ void default_elementwise_add(const framework::ExecutionContext &ctx,
                              const framework::Tensor *x,
                              const framework::Tensor *y, framework::Tensor *z) {
   int axis = ctx.Attr<int>("axis");
-  ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                        AddFunctor<T>(), z);
+  if (x->numel() >= y->numel()) {
+    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                          AddFunctor<T>(), z);
+  } else {
+    ElementwiseComputeEx<InverseAddFunctor<T>, DeviceContext, T>(
+        ctx, x, y, axis, InverseAddFunctor<T>(), z);
+  }
 }

 template
@@ -128,12 +133,13 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel {
     using Tensor = framework::Tensor;

+    auto *x = ctx.Input<Tensor>("X");
+    auto *y = ctx.Input<Tensor>("Y");
     auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    // skip out, x, y
+    // skip out
     auto *out = dout;
-    auto *x = dout, *y = dout;

     if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
       elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc
index 4459555102..0343276b75 100644
---
a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -76,6 +76,7 @@ class ElementwiseDivGradOpMaker : public framework::SingleGradOpMaker { std::unique_ptr Apply() const override { std::unique_ptr op(new T()); op->SetType("elementwise_div_grad"); + op->SetInput("X", this->Input("X")); op->SetInput("Y", this->Input("Y")); op->SetInput("Out", this->Output("Out")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 82cc6df4a6..8bb1dc4708 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -31,8 +31,13 @@ void default_elementwise_div(const framework::ExecutionContext& ctx, const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - DivFunctor(), z); + if (x->numel() >= y->numel()) { + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + DivFunctor(), z); + } else { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, InverseDivFunctor(), z); + } } template @@ -112,13 +117,13 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel { ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; + auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - auto* x = dout; // Fake x, not used if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { elementwise_div_grad(ctx, x, y, out, dout, dx, dy); @@ -191,7 +196,7 @@ class ElementwiseDivDoubleGradKernel : public framework::OpKernel { // ddX_safe == null ? 0 : ddX // ddY_safe == null ? 
0 : ddY Tensor ddX_safe, ddY_safe; - GetDoubleGradSafeTensor(ctx, Out, ddX, &ddX_safe); + GetDoubleGradSafeTensor(ctx, dX, ddX, &ddX_safe); GetDoubleGradSafeTensor(ctx, Y, ddY, &ddY_safe); // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y @@ -209,8 +214,7 @@ class ElementwiseDivDoubleGradKernel : public framework::OpKernel { if (dY) { // dX_div_Y = dX / Y; Tensor dX_div_Y = tmp; - ElementwiseComputeEx, DeviceContext, T>( - ctx, dX, Y, axis, DivFunctor(), &dX_div_Y); + default_elementwise_div(ctx, dX, Y, &dX_div_Y); // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the // first output tensor is nullptr, the branch to calculate first @@ -227,10 +231,8 @@ class ElementwiseDivDoubleGradKernel : public framework::OpKernel { if (ddOut) { // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y default_elementwise_mul(ctx, Out, &ddY_safe, &tmp); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &ddX_safe, &tmp, 0, SubFunctor(), &tmp); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &tmp, Y, axis, DivFunctor(), ddOut); + default_elementwise_sub(ctx, &ddX_safe, &tmp, &tmp); + default_elementwise_div(ctx, &tmp, Y, ddOut); } if (dOut) { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 49f0e305b6..cc3226135e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -26,9 +26,15 @@ void default_elementwise_mul(const framework::ExecutionContext& ctx, const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - MulFunctor(), z); + if (x->numel() >= y->numel()) { + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MulFunctor(), z); + } else { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, InverseMulFunctor(), z); + } } + template struct SameDimsElemwiseMul { void operator()(const framework::ExecutionContext& ctx, diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 720d5d2564..120ee9336d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -14,12 +14,15 @@ limitations under the License. 
*/ #pragma once +#include // for max #include #include #include +#include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -35,12 +38,12 @@ class ElementwiseOp : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), - "Input(Y) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of elementwise op should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of elementwise op should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true, + "Input(Y) of elementwise op should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of elementwise op should not be null."); PADDLE_ENFORCE( ctx->GetInputsVarType("Y").front() == @@ -49,18 +52,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front()); if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::LOD_TENSOR) { - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE( - x_dim.size(), y_dim.size(), - "ShapeError: the dimension of input X must greater than or equal to " - "the one of input Y. But received: the shape of input X = [%s], the " - "dimension of input X = %d, the shape of input Y = [%s], the " - "dimension of input Y = %d", - x_dim, x_dim.size(), y_dim, y_dim.size()); - } else if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::SELECTED_ROWS) { + framework::proto::VarType::SELECTED_ROWS) { PADDLE_ENFORCE_EQ( ctx->GetInputDim("Y").size(), 1u, "ShapeError: For elementwise_op, if X is Sparse(VarType.SELECTED_ROWS" @@ -71,13 +63,31 @@ class ElementwiseOp : public framework::OperatorWithKernel { "ShapeError: For elementwise_op, if X is Sparse(VarType.SELECTED_ROWS" "), Y must be scalar. But reveived the first dimension of Y = %s", ctx->GetInputDim("Y")[0]); - } else { + } else if (ctx->GetInputsVarType("X").front() != + framework::proto::VarType::LOD_TENSOR) { PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", ctx->GetInputsVarType("X").front()); } - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); + if (ctx->GetInputDim("X") == ctx->GetInputDim("Y")) { + ctx->ShareDim("X", /*->*/ "Out"); + ctx->ShareLoD("X", /*->*/ "Out"); + } else { + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + int max_dim = std::max(x_dims.size(), y_dims.size()); + int axis = ctx->Attrs().Get("axis"); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), + max_dim, axis); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_array)); + // to do + ctx->ShareLoD("X", /*->*/ "Out"); + } } framework::OpKernelType GetExpectedKernelType( @@ -207,26 +217,14 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { auto out_grad_name = framework::GradVarName("Out"); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(out_grad_name), - "Input(Out@GRAD) should not be null"); - - auto x_dims = ctx->GetInputDim(out_grad_name); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_GE( - x_dims.size(), y_dims.size(), - "ShapeError: the dimension of Out@GRAD must greater than or equal to " - "the one of input Y. But received: the shape of Out@GRAD = [%s], the " - "dimension of Out@GRAD = %d, the shape of input Y = [%s], the " - "dimension of of input Y = %d", - x_dims, x_dims.size(), y_dims, y_dims.size()); - + PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true, "Input(Y) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput(out_grad_name), true, + "Input(Out@GRAD) should not be null."); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); if (ctx->HasOutput(x_grad_name)) { - ctx->ShareDim(out_grad_name, /*->*/ x_grad_name); - ctx->ShareLoD(out_grad_name, /*->*/ x_grad_name); + ctx->ShareDim("X", /*->*/ x_grad_name); + ctx->ShareLoD("X", /*->*/ x_grad_name); } if (ctx->HasOutput(y_grad_name)) { ctx->ShareDim("Y", /*->*/ y_grad_name); @@ -326,32 +324,6 @@ class ElementwiseOpDoubleGradWithoutDXDY } }; -// For Add, Sub op, the X, Out is not needed. 
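The reworked InferShape above drops the old requirement that rank(X) >= rank(Y): either input may now be broadcast, and the output shape is obtained by aligning the lower-rank shape at `axis` inside the higher-rank one, padding with 1s, and taking the per-dimension maximum, which is what GetBroadcastDimsArrays computes. A minimal NumPy-style sketch of that shape rule; the helper name is illustrative and not part of the patch:

import numpy as np


def broadcast_out_shape(x_shape, y_shape, axis=-1):
    # Sketch of the rule used by the new InferShape / GetBroadcastDimsArrays
    # path: align the lower-rank shape at `axis`, pad with 1s, then take the
    # element-wise maximum of the two padded shapes.
    max_dim = max(len(x_shape), len(y_shape))
    if axis == -1:
        axis = abs(len(x_shape) - len(y_shape))
    if len(x_shape) >= len(y_shape):
        xs = list(x_shape)
        ys = [1] * axis + list(y_shape) + [1] * (max_dim - axis - len(y_shape))
    else:
        ys = list(y_shape)
        xs = [1] * axis + list(x_shape) + [1] * (max_dim - axis - len(x_shape))
    out = []
    for dx, dy in zip(xs, ys):
        assert dx == dy or dx == 1 or dy == 1, "broadcast dimension mismatch"
        out.append(max(dx, dy))
    return tuple(out)


# Y may now have a higher rank than X:
print(broadcast_out_shape((4, 5), (2, 3, 4, 5)))              # (2, 3, 4, 5)
print(broadcast_out_shape((2, 3, 1, 5), (3, 4, 1), axis=1))   # (2, 3, 4, 5)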
-class ElementwiseOpExplicitGrad : public ElementwiseOpGrad { - public: - using operators::ElementwiseOpGrad::ElementwiseOpGrad; - using operators::ElementwiseOpGrad::GetExpectedKernelType; - using Tensor = framework::Tensor; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->ShareDim(framework::GradVarName("Out"), /*->*/ x_grad_name); - ctx->ShareLoD(framework::GradVarName("Out"), /*->*/ x_grad_name); - } - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(y_grad_name)) { - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); - - ctx->ShareDim("Y", /*->*/ y_grad_name); - ctx->ShareLoD("Y", /*->*/ y_grad_name); - } - } -}; - template class ElemwiseGradKernel : public framework::OpKernel { public: @@ -372,13 +344,13 @@ DECLARE_INPLACE_OP_INFERER(ElementwiseGradOpInplace, framework::GradVarName("X")}); DECLARE_INPLACE_OP_INFERER(ElementwiseDoubleGradOpInplace, {"DDX", "DDOut"}); -DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "Y"); +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "X", + "Y"); DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseDoubleGradNoBufVarsInference, "Y", "DOut"); } // namespace operators } // namespace paddle - #define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name) \ template \ class kernel_type##GradMaker \ @@ -390,6 +362,7 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseDoubleGradNoBufVarsInference, std::unique_ptr Apply() const override { \ auto *op = new T(); \ op->SetType(#kernel_type "_grad"); \ + op->SetInput("X", this->Input("X")); \ op->SetInput("Y", this->Input("Y")); \ op->SetInput(::paddle::framework::GradVarName("Out"), \ this->OutputGrad("Out")); \ @@ -402,41 +375,6 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseDoubleGradNoBufVarsInference, } \ } -#define REGISTER_ELEMWISE_OP(op_type, op_name, equation) \ - class __ElemwiseOp##op_type##Maker__ \ - : public ::paddle::operators::ElementwiseOpMaker { \ - protected: \ - virtual std::string GetName() const { return op_name; } \ - virtual std::string GetEquation() const { return equation; } \ - }; \ - REGISTER_OPERATOR( \ - op_type, ::paddle::operators::ElementwiseOp, \ - __ElemwiseOp##op_type##Maker__, \ - ::paddle::operators::ElementwiseOpInferVarType, \ - ::paddle::framework::DefaultGradOpMaker<::paddle::framework::OpDesc, \ - true>, \ - ::paddle::framework::DefaultGradOpMaker<::paddle::imperative::OpBase, \ - true>); \ - REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad) - -#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation) \ - class __ElemwiseOp##op_type##Maker__ \ - : public ::paddle::operators::ElementwiseOpMaker { \ - protected: \ - virtual std::string GetName() const { return op_name; } \ - virtual std::string GetEquation() const { return equation; } \ - }; \ - REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ - __ElemwiseOp##op_type##Maker__, \ - ::paddle::operators::ElementwiseOpInferVarType, \ - op_type##GradMaker<::paddle::framework::OpDesc>, \ - op_type##GradMaker<::paddle::imperative::OpBase>, \ - ::paddle::operators::ElementwiseOpInplace); \ - REGISTER_OPERATOR(op_type##_grad, \ - ::paddle::operators::ElementwiseOpExplicitGrad, \ - ::paddle::operators::ElementwiseGradOpInplace, \ - 
::paddle::operators::ElementwiseGradNoBufVarsInference) - #define REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(op_type, op_name) \ REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ ::paddle::operators::Elementwise##op_name##OpMaker, \ diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h index d4b618860e..ad39441f37 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h @@ -44,6 +44,12 @@ namespace operators { inline HOSTDEVICE T operator()(const T& a, const T& b) const { \ return a expr b; \ } \ + }; \ + template \ + struct Inverse##Func##Functor { \ + inline HOSTDEVICE T operator()(const T& a, const T& b) const { \ + return b expr a; \ + } \ }; DEFINE_SIMPLE_BINARY_FUNCTOR(Add, +) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 049a6fb0d1..644bda34a0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -16,11 +16,15 @@ limitations under the License. */ #include #include +#include // for multiplies #include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" +#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/transform.h" #ifdef __NVCC__ @@ -33,6 +37,12 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#define GetDivMod(dividend, divisor, div, mod) \ + do { \ + const auto dividend_copy = dividend; \ + *div = dividend_copy / divisor; \ + *mod = dividend_copy % divisor; \ + } while (0) namespace paddle { namespace operators { @@ -48,72 +58,453 @@ namespace operators { * pre=2*3, n=4*5, post=1 * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) * - * New parameter: *mid_flag* is added to solve m*n*k & m*1*k - * broadcast cases. - * 3. shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1, 4, 5) - * mid_flag should not be NULL. - * x.shape(2, 3, 20) * y.shape(2, 1, 20).broadcast(2, 3, 20) + * New parameter: *is_run_common_broadcast* is a flag to record whether to run + * common broadcast code. */ inline void get_mid_dims(const framework::DDim &x_dims, const framework::DDim &y_dims, const int axis, - int *pre, int *n, int *post, int *mid_flag = NULL) { + int *pre, int *n, int *post, + int *is_run_common_broadcast) { *pre = 1; *n = 1; *post = 1; - if (mid_flag != NULL) { - *mid_flag = 0; - int mid = 0; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - if (x_dims[i + axis] != y_dims[i]) { - // only support single y_dims[i] = 1 now. - PADDLE_ENFORCE_EQ(*mid_flag, 0, - "Broadcast support y_dims with single 1."); - PADDLE_ENFORCE_EQ(y_dims[i], 1, - "ShapeError: broadcast dimension mismatch. Operands " - "could not be broadcast together with the shape of " - "X = [%s] and the shape of Y = [%s]. 
Received [%d] " - "in X is not equal to [%d] in Y", - x_dims, y_dims, x_dims[i + axis], y_dims[i]); - // m*n*k m*1*k - for (int j = 0; j < i; ++j) { - (*pre) *= y_dims[j]; - } - *n = std::max(x_dims[i + axis], y_dims[i]); - *mid_flag = 1; - mid = i; - break; - } - (*n) *= y_dims[i]; + *is_run_common_broadcast = 0; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + if (x_dims[i + axis] != y_dims[i]) { + PADDLE_ENFORCE(y_dims[i] == 1 || x_dims[i + axis] == 1, + "ShapeError: broadcast dimension mismatch. Operands " + "could not be broadcast together with the shape of " + "X = [%s] and the shape of Y = [%s]. Received [%d] " + "in X is not equal to [%d] in Y", + x_dims, y_dims, x_dims[i + axis], y_dims[i]); + *is_run_common_broadcast = 1; + return; } - if (*mid_flag) { - for (int i = mid + 1; i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; - } + (*n) *= y_dims[i]; + } + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } +} +inline int GetElementwiseIndex(const int *x_dims_array, const int max_dim, + const int *index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +inline void UpdateElementwiseIndexArray(const int *out_dims_array, + const int max_dim, int *index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; } else { - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; - } + break; + } + } +} + +inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, + const framework::DDim &y_dims, + int *x_dims_array, int *y_dims_array, + int *out_dims_array, const int max_dim, + const int axis) { + PADDLE_ENFORCE_GE(axis, 0, "Axis should be in range [0, %d)", axis); + PADDLE_ENFORCE_LT(axis, max_dim, "Axis should be in range [0, %d)", axis); + if (x_dims.size() > y_dims.size()) { + std::fill(y_dims_array, y_dims_array + axis, 1); + if (axis + y_dims.size() < max_dim) { + std::fill(y_dims_array + axis + y_dims.size(), y_dims_array + max_dim, 1); } - } else { // for fused_elementwise_activation_op. keep the old version. - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; + std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array); + std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array + axis); + } else { + std::fill(x_dims_array, x_dims_array + axis, 1); + if (axis + x_dims.size() < max_dim) { + std::fill(x_dims_array + axis + x_dims.size(), x_dims_array + max_dim, 1); } + std::copy(x_dims.Get(), x_dims.Get() + x_dims.size(), x_dims_array + axis); + std::copy(y_dims.Get(), y_dims.Get() + y_dims.size(), y_dims_array); + } - for (int i = 0; i < y_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i], - "Broadcast dimension mismatch."); - (*n) *= y_dims[i]; + for (int i = 0; i < max_dim; i++) { + PADDLE_ENFORCE(x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || + y_dims_array[i] <= 1, + "ShapeError: broadcast dimension mismatch. Operands could " + "not be broadcast together with the shape of X = [%s] and " + "the shape of Y = [%s]. 
Received [%d] in X is not equal to " + "[%d] in Y", + x_dims, y_dims, x_dims_array[i], y_dims_array[i]); + if (x_dims_array[i] == -1 || y_dims_array[i] == -1) { + out_dims_array[i] = -1; + } else { + out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); } + } +} - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; +template +void CommonForwardBroadcastCPU(const framework::Tensor *x, + const framework::Tensor *y, framework::Tensor *z, + int *x_dims_array, int *y_dims_array, + int *out_dims_array, int max_dim, + const platform::CPUDeviceContext &ctx, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T *x_data = x->data(); + const T *y_data = y->data(); + OutType *out_data = z->mutable_data(ctx.GetPlace()); + + const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, + 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); } } +#ifdef __NVCC__ +template +__global__ void CommonForwardBroadcastCUDAKernel( + const int *x_strides_array, const int *y_strides_array, + const int *out_dims_array, const T *x, const T *y, T *out, int out_size, + int max_dim, Functor func, const bool is_xsize_larger) { + for (int out_index = blockIdx.x * blockDim.x + threadIdx.x; + out_index < out_size; out_index += blockDim.x * gridDim.x) { + int x_index = 0; + int y_index = 0; + int out_index_quotient = out_index; + int remainder = 0; +#pragma unroll + for (int i = max_dim - 1; i >= 0; --i) { + GetDivMod(out_index_quotient, out_dims_array[i], &out_index_quotient, + &remainder); + x_index += remainder * x_strides_array[i]; + y_index += remainder * y_strides_array[i]; + } + if (is_xsize_larger) { + out[out_index] = func(x[x_index], y[y_index]); + } else { + out[out_index] = func(y[y_index], x[x_index]); + } + } +} + +template +void CommonForwardBroadcastCUDA( + const framework::Tensor *x, const framework::Tensor *y, + framework::Tensor *z, int *x_dims_array, int *y_dims_array, + int *out_dims_array, int max_dim, const platform::CUDADeviceContext &ctx, + Functor func, const bool is_xsize_larger = true) { + const auto gplace = boost::get(ctx.GetPlace()); + auto cplace = platform::CPUPlace(); + const T *x_data = x->data(); + const T *y_data = y->data(); + T *out_data = z->mutable_data(ctx.GetPlace()); + + std::vector x_strides_array(max_dim); + std::vector y_strides_array(max_dim); + int x_stride = 1; + int y_stride = 1; + for (int i = max_dim - 1; i >= 0; i--) { + x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride; + y_strides_array[i] = y_dims_array[i] == 1 ? 
0 : y_stride; + x_stride *= x_dims_array[i]; + y_stride *= y_dims_array[i]; + } + + int bytes = max_dim * sizeof(int); + auto x_strides_array_tmp = memory::Alloc(ctx, bytes); + int *x_strides_array_gpu = + reinterpret_cast(x_strides_array_tmp->ptr()); + memory::Copy(gplace, x_strides_array_gpu, cplace, x_strides_array.data(), + bytes, ctx.stream()); + + auto y_strides_array_tmp = memory::Alloc(ctx, bytes); + int *y_strides_array_gpu = + reinterpret_cast(y_strides_array_tmp->ptr()); + memory::Copy(gplace, y_strides_array_gpu, cplace, y_strides_array.data(), + bytes, ctx.stream()); + + auto out_dims_array_tmp = memory::Alloc(ctx, bytes); + int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); + memory::Copy(gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, + ctx.stream()); + + const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, + 1, std::multiplies()); + dim3 gird_size = dim3( + (out_size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); + dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); + + CommonForwardBroadcastCUDAKernel< + Functor, T><<>>( + x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, x_data, + y_data, out_data, out_size, max_dim, func, is_xsize_larger); +} + +#endif // __NVCC__ + +template +void CommonGradBroadcastCPU( + const framework::Tensor &x, const framework::Tensor &y, + const framework::Tensor &out, const framework::Tensor &dout, + framework::Tensor *dx, framework::Tensor *dy, int *x_dims_array, + int *y_dims_array, int *out_dims_array, int max_dim, + const platform::CPUDeviceContext &ctx, DX_OP dx_op, DY_OP dy_op) { + std::vector index_array(max_dim, 0); + const T *x_data = x.data(); + const T *y_data = y.data(); + const T *out_data = out.data(); + const T *dout_data = dout.data(); + T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); + T *dy_data = dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace()); + if (dx_data != nullptr) { + memset(dx_data, 0, dx->numel() * sizeof(T)); + } + if (dy_data != nullptr) { + memset(dy_data, 0, dy->numel() * sizeof(T)); + } + const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, + 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (dx_data != nullptr) { + dx_data[x_index] += dx_op(x_data[x_index], y_data[y_index], + out_data[out_index], dout_data[out_index]); + } + if (dy_data != nullptr) { + dy_data[y_index] += dy_op(x_data[x_index], y_data[y_index], + out_data[out_index], dout_data[out_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +inline void ComputeBroadcastKernelSize(int *x_dims_array, int *out_dims_array, + int *x_blocks, int *x_threads, + int max_dim) { + *x_blocks = 1; + *x_threads = 1; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] == out_dims_array[i]) { + *x_blocks *= x_dims_array[i]; + } else { + *x_threads *= out_dims_array[i]; + } + } +} + +inline void ComputeBroadcastTranspositionArray(const int *x_one_indexs, + int *x_trans_indexs, + const int max_dim, + const int x_one_size) { + int diff = max_dim - x_one_size; + std::copy_n(x_one_indexs, x_one_size, x_trans_indexs + diff); + int p = 0; + int q = diff; + for (int i = 0; i < max_dim; ++i) { + if (q < max_dim && i == x_trans_indexs[q]) { + ++q; + } else { + x_trans_indexs[p++] = i; + } + } +} + +#ifdef __NVCC__ +template +__global__ void CommonGradBroadcastCUDAKernel( + const int *x_strides_array, const int *y_strides_array, + const int *out_dims_array, const int *y_strides_order, + const int *y_dims_order, const T *x, const T *y, const T *out, + const T *dout, T *dx, int out_size, int max_dim, int thread_num, + DX_OP dx_op) { + T val(0); + int i = blockIdx.x; + int tid = threadIdx.x; + for (int j = tid; j < thread_num; j += blockDim.x) { + const int X_index = i * thread_num + j; + int out_index = X_index; + int C_index = 0; + int B_index = i * thread_num + j; + int remainder = 0; +#pragma unroll + for (int d = max_dim - 1; d >= 0; --d) { + GetDivMod(B_index, y_dims_order[d], &B_index, &remainder); + C_index += remainder * y_strides_order[d]; + } + int x_index = 0; + int y_index = 0; + int C_index_val = C_index; +#pragma unroll + for (int d = max_dim - 1; d >= 0; --d) { + GetDivMod(C_index_val, out_dims_array[d], &C_index_val, &remainder); + x_index += remainder * x_strides_array[d]; + y_index += remainder * y_strides_array[d]; + } + out_index = C_index; + val += dx_op(x[x_index], y[y_index], out[out_index], dout[out_index]); + } + val = paddle::platform::reduceSum(val, tid, thread_num); + if (threadIdx.x == 0) { + dx[i] = val; + } +} + +template +void CommonGradBroadcastCUDA( + const framework::Tensor &x, const framework::Tensor &y, + const framework::Tensor &out, const framework::Tensor &dout, + framework::Tensor *dx, framework::Tensor *dy, int *x_dims_array, + int *y_dims_array, int *out_dims_array, int max_dim, + const platform::CUDADeviceContext &ctx, DX_OP dx_op, DY_OP dy_op) { + const auto gplace = boost::get(ctx.GetPlace()); + auto cplace = platform::CPUPlace(); + const T *x_data = x.data(); + const T *y_data = y.data(); + const T *out_data = out.data(); + const T *dout_data = dout.data(); + T *dx_data = dx == nullptr ? 
nullptr : dx->mutable_data(ctx.GetPlace()); + T *dy_data = dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()); + + std::vector x_one_indexs; + std::vector y_one_indexs; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] != y_dims_array[i]) { + if (x_dims_array[i] == 1) { + x_one_indexs.push_back(i); + } + if (y_dims_array[i] == 1) { + y_one_indexs.push_back(i); + } + } + } + + std::vector x_trans_indexs(max_dim); + std::vector y_trans_indexs(max_dim); + ComputeBroadcastTranspositionArray(x_one_indexs.data(), x_trans_indexs.data(), + max_dim, x_one_indexs.size()); + ComputeBroadcastTranspositionArray(y_one_indexs.data(), y_trans_indexs.data(), + max_dim, y_one_indexs.size()); + + // compute array stride for cuda kernel; + // e.g. x.dims=[2,3,4], x_stride=[12,4,1] + std::vector x_strides_array(max_dim); + std::vector y_strides_array(max_dim); + std::vector out_strides_array(max_dim); + int x_stride = 1; + int y_stride = 1; + int z_stride = 1; + for (int i = max_dim - 1; i >= 0; i--) { + x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride; + y_strides_array[i] = y_dims_array[i] == 1 ? 0 : y_stride; + out_strides_array[i] = z_stride; + x_stride *= x_dims_array[i]; + y_stride *= y_dims_array[i]; + z_stride *= out_dims_array[i]; + } + + std::vector x_strides_order(max_dim); + std::vector y_strides_order(max_dim); + std::vector x_dims_order(max_dim); + std::vector y_dims_order(max_dim); + for (int i = 0; i < max_dim; ++i) { + x_strides_order[i] = out_strides_array[x_trans_indexs[i]]; + y_strides_order[i] = out_strides_array[y_trans_indexs[i]]; + x_dims_order[i] = out_dims_array[x_trans_indexs[i]]; + y_dims_order[i] = out_dims_array[y_trans_indexs[i]]; + } + + int x_blocks = 0; + int x_threads = 0; + ComputeBroadcastKernelSize(x_dims_array, out_dims_array, &x_blocks, + &x_threads, max_dim); + int y_blocks = 0; + int y_threads = 0; + ComputeBroadcastKernelSize(y_dims_array, out_dims_array, &y_blocks, + &y_threads, max_dim); + + int bytes = max_dim * sizeof(int); + auto x_strides_array_tmp = memory::Alloc(ctx, bytes); + int *x_strides_array_gpu = + reinterpret_cast(x_strides_array_tmp->ptr()); + memory::Copy(gplace, x_strides_array_gpu, cplace, x_strides_array.data(), + bytes, ctx.stream()); + + auto y_strides_array_tmp = memory::Alloc(ctx, bytes); + int *y_strides_array_gpu = + reinterpret_cast(y_strides_array_tmp->ptr()); + memory::Copy(gplace, y_strides_array_gpu, cplace, y_strides_array.data(), + bytes, ctx.stream()); + + auto out_dims_array_tmp = memory::Alloc(ctx, bytes); + int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); + memory::Copy(gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, + ctx.stream()); + + const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, + 1, std::multiplies()); + int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); + int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); + if (dx) { + auto x_strides_order_tmp = memory::Alloc(ctx, bytes); + int *x_strides_order_gpu = + reinterpret_cast(x_strides_order_tmp->ptr()); + memory::Copy(gplace, x_strides_order_gpu, cplace, x_strides_order.data(), + bytes, ctx.stream()); + + auto x_dims_order_tmp = memory::Alloc(ctx, bytes); + int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); + memory::Copy(gplace, x_dims_order_gpu, cplace, x_dims_order.data(), bytes, + ctx.stream()); + CommonGradBroadcastCUDAKernel< + T, DX_OP><<>>( + x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, + x_strides_order_gpu, 
x_dims_order_gpu, x_data, y_data, out_data, + dout_data, dx_data, out_size, max_dim, x_threads, dx_op); + } + if (dy) { + auto y_strides_order_tmp = memory::Alloc(ctx, bytes); + int *y_strides_order_gpu = + reinterpret_cast(y_strides_order_tmp->ptr()); + memory::Copy(gplace, y_strides_order_gpu, cplace, y_strides_order.data(), + bytes, ctx.stream()); + + auto y_dims_order_tmp = memory::Alloc(ctx, bytes); + int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); + memory::Copy(gplace, y_dims_order_gpu, cplace, y_dims_order.data(), bytes, + ctx.stream()); + CommonGradBroadcastCUDAKernel< + T, DY_OP><<>>( + x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, + y_strides_order_gpu, y_dims_order_gpu, x_data, y_data, out_data, + dout_data, dy_data, out_size, max_dim, y_threads, dy_op); + } +} + +#endif // __NVCC__ + inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { // Remove trailing dimensions of size 1 for y @@ -121,7 +512,7 @@ inline framework::DDim trim_trailing_singular_dims( for (; actual_dims_size != 0; --actual_dims_size) { if (dims[actual_dims_size - 1] != 1) break; } - + if (actual_dims_size == dims.size()) return dims; std::vector trim_dims; trim_dims.resize(actual_dims_size); for (int i = 0; i < actual_dims_size; ++i) { @@ -287,13 +678,19 @@ template data()), y_(y->data()), z_(z->mutable_data(ctx.GetPlace())), nx_(x->numel()), ctx_(ctx), - func_(func) {} + func_(func), + is_xsize_larger_(is_xsize_larger) { + if (is_xsize_larger_ == false) { + nx_ = y->numel(); + } + } inline void Run() const { platform::Transform trans; @@ -302,22 +699,23 @@ class TransformFunctor { inline void RunRowWise(int n, int pre) const { platform::Transform trans; - trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), - z_, func_); + if (is_xsize_larger_) { + trans(ctx_, x_, x_ + nx_, + RowwiseTransformIterator(y_, n), z_, func_); + } else { + trans(ctx_, y_, y_ + nx_, + RowwiseTransformIterator(x_, n), z_, func_); + } } inline void RunMidWise(int n, int pre, int post) const { platform::Transform trans; - trans(ctx_, x_, x_ + nx_, - MidWiseTransformIterator(y_, n, post), z_, func_); - } - - inline void RunMidRowWise(int n, int pre, int post) const { - platform::Transform trans; - for (int i = 0; i < pre; i++) { - trans(ctx_, x_ + i * n * post, x_ + (i + 1) * n * post, - RowwiseTransformIterator(y_ + i * post, post), - z_ + i * n * post, func_); + if (is_xsize_larger_) { + trans(ctx_, x_, x_ + nx_, + MidWiseTransformIterator(y_, n, post), z_, func_); + } else { + trans(ctx_, y_, y_ + nx_, + MidWiseTransformIterator(x_, n, post), z_, func_); } } @@ -328,6 +726,7 @@ class TransformFunctor { int64_t nx_; const DeviceContext &ctx_; Functor func_; + bool is_xsize_larger_; }; template @@ -354,20 +753,42 @@ struct ElemwiseGradNoBroadcast { template static void ElemwiseGradBroadcast1CPU(const T *x, const T *y, const T *out, - const T *dout, int h, int w, DX_OP dx_op, + const T *dout, int h, int w, + bool is_xsize_larger, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int x_offset = i * w + j; - if (dx != nullptr) { - dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (is_xsize_larger) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int x_offset = i * w + j; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i 
== 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; + } + } else { // x.dims < y.dims, broadcast for x. + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int y_offset = i * w + j; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } } } } @@ -378,28 +799,48 @@ static void ElemwiseGradBroadcast1CPU(const T *x, const T *y, const T *out, template static __global__ void ElemwiseGradBroadcast1CUDAKernel( const T *x, const T *y, const T *out, const T *dout, int h, int w, - DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { + bool is_xsize_larger, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { int j = blockIdx.x; int i = threadIdx.x; int tid = threadIdx.x; T val(0); + if (is_xsize_larger) { + do { + int x_offset = i * w + j; + if (dx) { + dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy) { + val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + i += ELEMWISE_MAX_BLOCK_DIM; + } while (i < h); - do { - int x_offset = i * w + j; - if (dx) { - dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } if (dy) { - val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dy[j] = val; + } } - i += ELEMWISE_MAX_BLOCK_DIM; - } while (i < h); + } else { // x.dims < y.dims, broadcast for x. + do { + int y_offset = i * w + j; + if (dy) { + dy[y_offset] = dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx) { + val += dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + i += ELEMWISE_MAX_BLOCK_DIM; + } while (i < h); - if (dy) { - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); - if (threadIdx.x == 0) { - dy[j] = val; + if (dx) { + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dx[j] = val; + } } } } @@ -412,7 +853,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel( template static __global__ void FastElemwiseGradBroadcast1CUDAKernel( const T *x, const T *y, const T *out, const T *dout, int h, int w, - DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { + bool is_xsize_larger, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { __shared__ T sdata[BLOCK_Y][BLOCK_X + 1]; T val(0); @@ -422,33 +863,66 @@ static __global__ void FastElemwiseGradBroadcast1CUDAKernel( (w & (~((uint64_t)(BLOCK_X - 1)))) + ((w & (BLOCK_X - 1)) ? BLOCK_X : 0); size_t full_height = (h & (~((uint64_t)(BLOCK_Y - 1)))) + ((h & (BLOCK_Y - 1)) ? 
BLOCK_Y : 0); - - for (int m = idx; m < full_width; m += width_stride) { - sdata[threadIdx.y][threadIdx.x] = 0; - for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { - int x_offset = n * w + m; - if (dx && m < w && n < h) { - dx[x_offset] = dx_op(x[x_offset], y[m], out[x_offset], dout[x_offset]); + if (is_xsize_larger) { + for (int m = idx; m < full_width; m += width_stride) { + sdata[threadIdx.y][threadIdx.x] = 0; + for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { + int x_offset = n * w + m; + if (dx && m < w && n < h) { + dx[x_offset] = + dx_op(x[x_offset], y[m], out[x_offset], dout[x_offset]); + } + if (dy) { + if (m < w && n < h) { + T val = dy_op(x[x_offset], y[m], out[x_offset], dout[x_offset]); + sdata[threadIdx.y][threadIdx.x] += val; + } + __syncthreads(); + } } if (dy) { - if (m < w && n < h) { - T val = dy_op(x[x_offset], y[m], out[x_offset], dout[x_offset]); - sdata[threadIdx.y][threadIdx.x] += val; + T my_val = sdata[threadIdx.x][threadIdx.y]; + for (int i = warpSize >> 1; i > 0; i >>= 1) + my_val += platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); + __syncthreads(); + if ((threadIdx.x == 0)) { + sdata[0][threadIdx.y] = my_val; } __syncthreads(); + if (threadIdx.y == 0 && m < w) { + dy[m] = sdata[0][threadIdx.x]; + } } } - if (dy) { - T my_val = sdata[threadIdx.x][threadIdx.y]; - for (int i = warpSize >> 1; i > 0; i >>= 1) - my_val += platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); - __syncthreads(); - if ((threadIdx.x == 0)) { - sdata[0][threadIdx.y] = my_val; + } else { // x.dims < y.dims, broadcast for x. + for (int m = idx; m < full_width; m += width_stride) { + sdata[threadIdx.y][threadIdx.x] = 0; + for (int n = threadIdx.y; n < full_height; n += BLOCK_Y) { + int y_offset = n * w + m; + if (dy && m < w && n < h) { + dy[y_offset] = + dy_op(x[m], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx) { + if (m < w && n < h) { + T val = dy_op(x[m], y[y_offset], out[y_offset], dout[y_offset]); + sdata[threadIdx.y][threadIdx.x] += val; + } + __syncthreads(); + } } - __syncthreads(); - if (threadIdx.y == 0 && m < w) { - dy[m] = sdata[0][threadIdx.x]; + if (dx) { + T my_val = sdata[threadIdx.x][threadIdx.y]; + for (int i = warpSize >> 1; i > 0; i >>= 1) + my_val += platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); + __syncthreads(); + if ((threadIdx.x == 0)) { + sdata[0][threadIdx.y] = my_val; + } + __syncthreads(); + if (threadIdx.y == 0 && m < w) { + dx[m] = sdata[0][threadIdx.x]; + } } } } @@ -457,21 +931,21 @@ static __global__ void FastElemwiseGradBroadcast1CUDAKernel( template static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x, const T *y, const T *out, const T *dout, - int h, int w, DX_OP dx_op, DY_OP dy_op, - T *dx, T *dy) { + int h, int w, bool is_xsize_larger, + DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { // For small case use 1D block constexpr int half_walf = 16; if (w < half_walf || h < half_walf) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); int gird_size = w; ElemwiseGradBroadcast1CUDAKernel<<>>( - x, y, out, dout, h, w, dx_op, dy_op, dx, dy); + x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy); } else { // suppose perfoemance improves with h increased. 
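The rewritten Broadcast1 gradient kernels above now reduce the gradient of whichever operand was broadcast: the per-column reduceSum feeds dy when X is the larger input and dx otherwise (the new is_xsize_larger branch). A small NumPy sketch of the same reduction for elementwise_mul, illustrative only:

import numpy as np

# Broadcast1 case: the larger operand has shape (h, w), the smaller one (w,)
# and is replicated across the h rows.
h, w = 4, 3
big = np.random.rand(h, w)
small = np.random.rand(w)
dout = np.random.rand(h, w)

# For z = big * small: the full-size operand keeps a full-size gradient,
# while the broadcast operand sums its contributions over the rows, which is
# what the kernel's per-column reduction computes.
d_big = dout * small                  # shape (h, w)
d_small = (dout * big).sum(axis=0)    # shape (w,)
assert d_big.shape == (h, w) and d_small.shape == (w,)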
dim3 block_size = dim3(BLOCK_X, BLOCK_Y); int grid_size = (w + BLOCK_X - 1) / BLOCK_X; FastElemwiseGradBroadcast1CUDAKernel<<>>( - x, y, out, dout, h, w, dx_op, dy_op, dx, dy); + x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy); } } @@ -480,21 +954,44 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x, template static void ElemwiseGradBroadcast2CPU(const T *x, const T *y, const T *out, const T *dout, int pre, int n, int post, - DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int x_offset = i * n * post + j * post + k; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + bool is_xsize_larger, DX_OP dx_op, + DY_OP dy_op, T *dx, T *dy) { + if (is_xsize_larger) { + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int x_offset = i * n * post + j * post + k; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0 && k == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0 && k == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; + } + } + } else { // x.dims < y.dims, broadcast for x. + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int y_offset = i * n * post + j * post + k; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0 && k == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } } } } @@ -506,37 +1003,66 @@ static void ElemwiseGradBroadcast2CPU(const T *x, const T *y, const T *out, template static __global__ void ElemwiseGradBroadcast2CUDAKernel( const T *x, const T *y, const T *out, const T *dout, int pre, int n, - int post, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { + int post, bool is_xsize_larger, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { int tid = threadIdx.x; int j = blockIdx.x; T val(0); int ttid = tid; - while (true) { - int i = ttid / post; - int k = ttid % post; - if (i >= pre) break; + if (is_xsize_larger) { + while (true) { + int i = ttid / post; + int k = ttid % post; + if (i >= pre) break; - int x_offset = i * n * post + j * post + k; + int x_offset = i * n * post + j * post + k; - if (dx != nullptr) { - dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (dx != nullptr) { + dx[x_offset] = dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + + if (dy != nullptr) { + val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + + ttid += ELEMWISE_MAX_BLOCK_DIM; } - if (dy != nullptr) { - val += dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (dy) { + int h = pre * post; + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dy[j] = val; + } } + } else { // x.dims < y.dims, broadcast for x. + while (true) { + int i = ttid / post; + int k = ttid % post; + if (i >= pre) break; - ttid += ELEMWISE_MAX_BLOCK_DIM; - } + int y_offset = i * n * post + j * post + k; - if (dy) { - int h = pre * post; - h = h > ELEMWISE_MAX_BLOCK_DIM ? 
ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); - if (threadIdx.x == 0) { - dy[j] = val; + if (dy != nullptr) { + dy[y_offset] = dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + + if (dx != nullptr) { + val += dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + + ttid += ELEMWISE_MAX_BLOCK_DIM; + } + + if (dx) { + int h = pre * post; + h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; + val = paddle::platform::reduceSum(val, tid, h); + if (threadIdx.x == 0) { + dx[j] = val; + } } } } @@ -544,98 +1070,57 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel( template static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T *x, const T *y, const T *out, const T *dout, - int pre, int n, int post, DX_OP dx_op, + int pre, int n, int post, + bool is_xsize_larger, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); int gird_size = n; ElemwiseGradBroadcast2CUDAKernel<<>>( - x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy); + x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy); } #endif -template -static void ElemwiseGradBroadcastMid2CPU(const T *x, const T *y, const T *out, - const T *dout, int pre, int n, - int post, DX_OP dx_op, DY_OP dy_op, - T *dx, T *dy) { - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int x_offset = i * n * post + j * post + k; - int y_offset = i * post + k; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = - dy_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]); - if (j == 0) { - dy[y_offset] = tmp; - } else { - dy[y_offset] += tmp; - } - } - } - } +template +void CommonElementwiseBroadcastBackward( + const framework::ExecutionContext &ctx, const framework::DDim &x_dims, + const framework::DDim &y_dims, const framework::Tensor &x, + const framework::Tensor &y, const framework::Tensor &out, + const framework::Tensor &dout, int axis, framework::Tensor *dx, + framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. 
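When the shapes do not reduce to the (pre, n, post) pattern, the backward path above dispatches to CommonGradBroadcastCPU/CUDA, which visit every output element and accumulate its gradient contribution into the broadcast-mapped index of dx and dy. A NumPy sketch of that accumulation for elementwise_mul, using the first case from the patch's own forward comment (x = [2, 3, 1, 5], y = [2, 1, 4, 1]); illustrative only:

import numpy as np

x = np.random.rand(2, 3, 1, 5)
y = np.random.rand(2, 1, 4, 1)
dout = np.random.rand(2, 3, 4, 5)

dx = np.zeros_like(x)
dy = np.zeros_like(y)
for idx in np.ndindex(dout.shape):
    # Size-1 dimensions contribute stride 0, mirroring GetElementwiseIndex.
    xi = tuple(0 if s == 1 else i for i, s in zip(idx, x.shape))
    yi = tuple(0 if s == 1 else i for i, s in zip(idx, y.shape))
    dx[xi] += dout[idx] * y[yi]   # dx_op for mul: dout * y
    dy[yi] += dout[idx] * x[xi]   # dy_op for mul: dout * x

# Same result as reducing the full-size gradients over the broadcast axes.
assert np.allclose(dx, (dout * y).sum(axis=2, keepdims=True))
assert np.allclose(dy, (dout * x).sum(axis=(1, 3), keepdims=True))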
+ if (dx && dout.Holder() == dx->Holder()) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); } -} + if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef __NVCC__ -template -static __global__ void ElemwiseGradBroadcastMid2CUDAKernel( - const T *x, const T *y, const T *out, const T *dout, int pre, int n, - int post, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { - int j = threadIdx.x; - int tid = blockIdx.x; - - T val(0); - int ttid = tid; - - while (true) { - int i = ttid / post; - int k = ttid % post; - if (i >= pre) break; - - int x_offset = i * n * post + j * post + k; - int y_offset = i * post + k; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]); - } - - if (dy != nullptr) { - val += dy_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]); - } - - ttid += ELEMWISE_MAX_BLOCK_DIM; - } - - if (dy) { - int h = n; - h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, j, h); - if (threadIdx.x == 0) { - dy[tid] = val; - } + CommonGradBroadcastCUDA( + x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(), + out_dims_array.data(), max_dim, + ctx.template device_context(), dx_op, + dy_op); +#endif + } else { + CommonGradBroadcastCPU( + x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(), + out_dims_array.data(), max_dim, + ctx.template device_context(), dx_op, + dy_op); } } -template -static void ElemwiseGradBroadcastMid2CUDA(cudaStream_t stream, const T *x, - const T *y, const T *out, - const T *dout, int pre, int n, - int post, DX_OP dx_op, DY_OP dy_op, - T *dx, T *dy) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, n); - int gird_size = pre * post; - ElemwiseGradBroadcastMid2CUDAKernel<<>>( - x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy); -} - -#endif - template void ElemwiseGradComputeNoBroadcast( const framework::ExecutionContext &ctx, const framework::DDim &x_dim, @@ -659,47 +1144,54 @@ void ElemwiseGradComputeNoBroadcast( template void ElemwiseGradComputeWithBroadcast( - const framework::ExecutionContext &ctx, const framework::DDim &x_dim, - const framework::DDim &y_dim_untrimed, const framework::Tensor &x, + const framework::ExecutionContext &ctx, const framework::DDim &x_dims, + const framework::DDim &y_dims, const framework::Tensor &x, const framework::Tensor &y, const framework::Tensor &out, const framework::Tensor &dout, int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); - auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); - axis = (y_dim.size() == 0) ? x_dim.size() : axis; + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } - int pre, n, post, mid_flag = 0; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, &mid_flag); - if (mid_flag) { - PADDLE_ENFORCE_EQ(mid_flag, 1, "mid_flag should be no more than 1."); - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef __NVCC__ - ElemwiseGradBroadcastMid2CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, post, dx_op, - dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - ElemwiseGradBroadcastMid2CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, post, - dx_op, dy_op, - dx == nullptr ? 
nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } - } else if (post == 1) { + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE(axis, 0, "Axis should be in range [0, %d)", axis); + PADDLE_ENFORCE_LT(axis, max_dim, "Axis should be in range [0, %d)", axis); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef __NVCC__ ElemwiseGradBroadcast1CUDA( ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, dx_op, dy_op, + y.data(), out.data(), dout.data(), pre, n, is_xsize_larger, + dx_op, dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); #endif } else { ElemwiseGradBroadcast1CPU( x.data(), y.data(), out.data(), dout.data(), pre, n, - dx_op, dy_op, + is_xsize_larger, dx_op, dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); } @@ -708,20 +1200,56 @@ void ElemwiseGradComputeWithBroadcast( #ifdef __NVCC__ ElemwiseGradBroadcast2CUDA( ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, post, dx_op, - dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + y.data(), out.data(), dout.data(), pre, n, post, + is_xsize_larger, dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); #endif } else { ElemwiseGradBroadcast2CPU( x.data(), y.data(), out.data(), dout.data(), pre, n, post, - dx_op, dy_op, + is_xsize_larger, dx_op, dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); } } } +template +void CommonElementwiseBroadcastForward( + const framework::ExecutionContext &ctx, const framework::Tensor *x, + const framework::Tensor *y, framework::Tensor *z, + const framework::DDim &x_dims, const framework::DDim &y_dims, Functor func, + int axis, const bool is_xsize_larger = true) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE(axis, 0, "Axis should be in range [0, %d)", axis); + PADDLE_ENFORCE_LT(axis, max_dim, "Axis should be in range [0, %d)", axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef __NVCC__ + CommonForwardBroadcastCUDA( + x, y, z, x_dims_array.data(), y_dims_array.data(), + out_dims_array.data(), max_dim, + ctx.template device_context(), func, + is_xsize_larger); +#endif + } else { + CommonForwardBroadcastCPU( + x, y, z, x_dims_array.data(), y_dims_array.data(), + out_dims_array.data(), max_dim, + ctx.template device_context(), func, + is_xsize_larger); + } +} + template void ElemwiseGradCompute(const framework::ExecutionContext &ctx, const framework::Tensor &x, const framework::Tensor &y, @@ -734,7 +1262,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, if (x.dims() == y.dims()) { ElemwiseGradComputeNoBroadcast( ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } else { // Y is a scalar + } else { ElemwiseGradComputeWithBroadcast( ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } @@ -752,103 +1280,61 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx, const framework::Tensor &dout, int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - if (dy == nullptr) { - const framework::DDim &dx_dims = dout.dims(); - auto dy_dims = dx_dims; + const framework::DDim &x_dim = x.dims(); + const framework::DDim &y_dim = y.dims(); + if (x.dims() == y.dims()) { ElemwiseGradComputeNoBroadcast( - ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } else { - if (dout.dims() == dy->dims()) { - const framework::DDim &dx_dims = dout.dims(); - const framework::DDim &dy_dims = dy->dims(); - ElemwiseGradComputeNoBroadcast( - ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } else { // Y is a scalar - auto dx_dims = dout.dims(); - const framework::DDim &dy_dims = dy->dims(); - ElemwiseGradComputeWithBroadcast( - ctx, dx_dims, dy_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } - } -} - -// Deprecated -template -void ElementwiseGradCompute(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, - const framework::Tensor *out, - const framework::Tensor *dout, int axis, - framework::Tensor *dx, framework::Tensor *dy) { - auto &place = *ctx.template device_context().eigen_device(); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - - if (dx) { - dx->mutable_data(ctx.GetPlace()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - } - - if (x_dims == y_dims) { - functor f; - f(place, x, y, out, dx, dy, dout); - return; - } - - axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); - trim_trailing_singular_dims(y_dims); - axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; - - int pre, n, post; - get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); - - if (post == 1) { - broadcastfunctor f; - f(place, x, y, out, dx, dy, dout, pre, n); - return; + ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op); } else { - broadcast2functor f; - f(place, x, y, out, dx, dy, dout, pre, n, post); - return; + ElemwiseGradComputeWithBroadcast( + ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op); } } template - void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { - TransformFunctor functor( - x, y, z, ctx.template device_context(), func); auto x_dims = x->dims(); - auto y_dims_untrimed = y->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), y_dims_untrimed.size(), - "ShapeError: the dimension of input X must greater than or equal to " - "the one of input Y. But received: the shape of input X = [%s], the " - "dimension of input X = %d, the shape of input Y = [%s], the dimension " - "of of input Y = %d", - x_dims, x_dims.size(), y_dims_untrimed, y_dims_untrimed.size()); - if (x_dims == y_dims_untrimed) { + auto y_dims = y->dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + TransformFunctor functor( + x, y, z, ctx.template device_context(), func, + is_xsize_larger); + if (x_dims == y_dims) { functor.Run(); return; } - axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); - PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), - "Axis should be in range [0, x_dims)"); - auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); - axis = (y_dims.size() == 0) ? x_dims.size() : axis; - int pre, n, post, mid_flag = 0; - get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &mid_flag); - if (mid_flag) { - functor.RunMidRowWise(n, pre, post); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE(axis, 0, "Axis should be in range [0, %d)", axis); + PADDLE_ENFORCE_LT(axis, max_dim, "Axis should be in range [0, %d)", axis); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); + } + // special case for common implementation. + // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); return; } if (post == 1) { @@ -1114,9 +1600,8 @@ void FusedElemwiseAndActComputeWithBroadcast( auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; - int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); - + int pre, n, post, is_run_common_broadcast; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -1628,8 +2113,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); axis = (y_dim.size() == 0) ? x_dim.size() : axis; - int pre, n, post; - get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post); + int pre, n, post, is_run_common_broadcast; + get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -1763,16 +2248,7 @@ void FusedElemwiseAndActComputeEx(const framework::ExecutionContext &ctx, } else { // Whether the shape of Y is a continuous subsequence of X, // For more information please refer to the op's introduction. - bool bcast_y = x.dims().size() >= y.dims().size(); - if (x.dims().size() == y.dims().size()) { - for (int i = 0; i < x.dims().size(); ++i) { - if (x.dims()[i] < y.dims()[i]) { - bcast_y = false; - break; - } - } - } - + bool bcast_y = x.numel() >= y.numel(); // z = f1(x, f2(y)) // z = f1(f2(x, y)) if (bcast_y) { // Y should be broadcast. diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 480f41030f..9647c2915a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -93,13 +93,14 @@ class ElementwiseSubDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle -namespace ops = paddle::operators; REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_sub, Sub); +namespace ops = paddle::operators; + REGISTER_OPERATOR( - elementwise_sub_grad, ops::ElementwiseOpExplicitGrad, - ops::ElementwiseGradOpInplace, ops::ElementwiseGradNoBufVarsInference, + elementwise_sub_grad, ops::ElementwiseOpGrad, ops::ElementwiseGradOpInplace, + ops::ElementwiseGradNoBufVarsInference, ops::ElementwiseSubDoubleGradMaker, ops::ElementwiseSubDoubleGradMaker); REGISTER_OPERATOR(elementwise_sub_grad_grad, diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 1a64a45359..7a20b1f986 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -26,8 +26,13 @@ void default_elementwise_sub(const framework::ExecutionContext& ctx, const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - SubFunctor(), z); + if (x->numel() >= y->numel()) { + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + SubFunctor(), z); + } else { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, InverseSubFunctor(), z); + } } template @@ -98,13 +103,14 @@ class ElementwiseSubGradKernel : public ElemwiseGradKernel { ElemwiseGradKernel::Compute(ctx); using Tensor = framework::Tensor; + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - // skip out, x, y + // skip out auto* out = dout; - auto *x = dout, *y = dout; if (dx != nullptr && dy != 
nullptr && (dx->dims() == dy->dims())) { elementwise_sub_grad(ctx, x, y, out, dout, dx, dy); } else { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 742f10cc4b..b0037925c1 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -108,8 +108,9 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? x_dims.size() : axis; - int pre, n, post; - get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); + int pre, n, post, is_run_common_broadcast; + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { functor.RunRowWise(n, pre); @@ -212,6 +213,8 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { } } else { // Execute default kernel when broadcast is needed + x = ctx.Input("X"); + y = ctx.Input("Y"); ElemwiseExplicitGradCompute, IdentityGrad>( ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad(), diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index cc72384427..6a3ecd493a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -91,8 +91,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { const bool is_y_format_correct = y->format() == MKLDNNMemoryFormat::nc; if (is_x_format_correct && is_y_format_correct && are_dims_divisable && is_avx512_enabled) { - int pre, n, post; - get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post); + int pre, n, post, is_run_common_broadcast; + get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { PADDLE_THROW("Not implemented when post is 1"); @@ -168,8 +169,9 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { auto y_dims = trim_trailing_singular_dims(y_dims_untrimmed); axis = (y_dims.size() == 0) ? x_dims.size() : axis; - int pre, n, post; - get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); + int pre, n, post, is_run_common_broadcast; + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { functor.RunRowWise(n, pre); diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index db794ed421..ff2935b0b4 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ @@ -139,21 +140,6 @@ struct DivAndSqrtFunctor { T epsilon_; }; -template -struct MulFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } -}; - -template -struct AddFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } -}; - -template -struct SubFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } -}; - template struct MulInvVarFunctor { inline HOSTDEVICE T operator()(T a, T b) const { diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index 777b0e5dfd..67d71fe82a 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc @@ -32,6 +32,7 @@ TEST(op_debug_str, test_unknown_dtype) { framework::Scope scope; desc.SetType("elementwise_add_grad"); + desc.SetInput("X", {"X"}); desc.SetInput("Y", {"Y"}); desc.SetInput(framework::GradVarName("Out"), {framework::GradVarName("Out")}); desc.SetOutput(framework::GradVarName("X"), {framework::GradVarName("X")}); @@ -41,6 +42,10 @@ TEST(op_debug_str, test_unknown_dtype) { desc.SetAttr("x_data_format", ""); desc.SetAttr("y_data_format", ""); + auto x_tensor = scope.Var("X")->GetMutable(); + x_tensor->Resize(dim); + x_tensor->mutable_data(place); + auto y_tensor = scope.Var("Y")->GetMutable(); y_tensor->Resize(dim); y_tensor->mutable_data(place); diff --git a/paddle/fluid/train/demo/run.sh b/paddle/fluid/train/demo/run.sh index 42b331066f..f7efb3b3b7 100755 --- a/paddle/fluid/train/demo/run.sh +++ b/paddle/fluid/train/demo/run.sh @@ -7,8 +7,8 @@ TURN_ON_MKL=$2 # use MKL or Openblas # download models function download() { - wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR/main_program - wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR/startup_program + wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR-1-7/main_program + wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR-1-7/startup_program } download diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index e54827feee..1afe5db86a 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -308,6 +308,36 @@ class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp): self.axis = -1 +class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1, 4).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 1, 5).astype(self.dtype) + self.y = np.random.rand(2, 1, 4, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(4, 5).astype(self.dtype) + self.y = np.random.rand(2, 3, 4, 5).astype(self.dtype) + 
self.out = self.x + self.y + + def init_axis(self): + self.axis = 2 + + class TestElementwiseAddOpError(OpTest): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 4046e31b41..34db81ac69 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -151,6 +151,39 @@ class TestElementwiseDivOp_broadcast_5(ElementwiseDivOp): self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} +class TestElementwiseDivOp_commonuse_1(ElementwiseDivOp): + def setUp(self): + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [1, 1, 4]).astype("float32"), + } + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} + + +class TestElementwiseDivOp_commonuse_2(ElementwiseDivOp): + def setUp(self): + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 1, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 1, 4, 1]).astype("float32"), + } + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} + + +class TestElementwiseDivOp_xsize_lessthan_ysize(ElementwiseDivOp): + def setUp(self): + self.op_type = "elementwise_div" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [4, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"), + } + + self.attrs = {'axis': 2} + + self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} + + class TestElementwiseDivOp_INT(OpTest): def setUp(self): self.op_type = "elementwise_div" diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 7ee24d465d..33dd2407b0 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -162,6 +162,41 @@ class TestElementwiseMulOpFp16(ElementwiseMulOp): self.dtype = np.float16 +class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype(np.float64), + 'Y': np.random.rand(1, 1, 4).astype(np.float64) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 1, 5).astype(np.float64), + 'Y': np.random.rand(2, 1, 4, 1).astype(np.float64) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(4, 5).astype(np.float64), + 'Y': np.random.rand(2, 3, 4, 5).astype(np.float64) + } + + self.attrs = {'axis': 2} + + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 4, 5) * self.inputs['Y'] + } + + class TestElementwiseMulOpError(OpTest): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index e9a389bbaf..2c5ccfd92b 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -127,5 +127,40 @@ class TestElementwiseSubOp_broadcast_4(TestElementwiseOp): self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} +class TestElementwiseSubOp_commonuse_1(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype(np.float32), + 'Y': np.random.rand(1, 1, 4).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_commonuse_2(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 1, 5).astype(np.float32), + 'Y': np.random.rand(2, 1, 4, 1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(4, 5).astype(np.float32), + 'Y': np.random.rand(2, 3, 4, 5).astype(np.float32) + } + + self.attrs = {'axis': 2} + + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 4, 5) - self.inputs['Y'] + } + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py b/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py index 64a8f20dae..3504081e01 100644 --- a/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py +++ b/python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py @@ -47,7 +47,7 @@ class TestExecutorReturnTensorNotOverwritingWithOptest(OpTest): 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) } self.outputs = {'Out': self.out} - self.op_type = "elementwise_mul" + self.op_type = "mul" self.dtype = np.float32 outs, fetch_list = self._calc_output(place, parallel=parallel) return outs diff --git a/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py index fa393074a4..b9a58b6df5 100644 --- a/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py @@ -57,7 +57,7 @@ def run_trainer(use_cuda, sync_mode, ip, port, trainers, trainer_id): exe.run(trainer_startup_program) for i in range(5): exe.run(recv_program) - exe.run(main_program, + exe.run(fluid.default_main_program(), feed={ "x": numpy.array([1, 2]).astype('float32').reshape(2, 1), "y": numpy.array([2, 3]).astype('float32').reshape(2, 1) -- GitLab
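
The common broadcast added above aligns the shorter operand inside the longer one at `axis` (default: right-aligned), pads the remaining positions with 1, and takes the per-dimension maximum as the output shape; `GetBroadcastDimsArrays` then checks that every mismatched pair contains a 1. A minimal pure-Python sketch of that alignment (the helper name and exact error handling are illustrative, not the Paddle API):

    def broadcast_dims(x_shape, y_shape, axis=-1):
        # Illustrative re-implementation of the shape alignment performed by
        # GetBroadcastDimsArrays: place the shorter shape at `axis` inside the
        # longer one, pad with 1s, and take max() per dimension.
        longer, shorter = list(x_shape), list(y_shape)
        if len(x_shape) < len(y_shape):
            longer, shorter = list(y_shape), list(x_shape)
        max_dim = len(longer)
        if axis == -1:
            axis = max_dim - len(shorter)
        padded = [1] * axis + shorter + [1] * (max_dim - axis - len(shorter))
        out = []
        for a, b in zip(longer, padded):
            assert a == b or a == 1 or b == 1, "incompatible broadcast shapes"
            out.append(max(a, b))
        return tuple(out)

    # The shapes exercised by the new unit tests:
    print(broadcast_dims((2, 3, 1, 5), (2, 1, 4, 1)))    # (2, 3, 4, 5)
    print(broadcast_dims((2, 3, 4), (1, 1, 4)))          # (2, 3, 4)
    print(broadcast_dims((4, 5), (2, 3, 4, 5), axis=2))  # (2, 3, 4, 5)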
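
Shapes where the (trimmed) smaller operand is a contiguous block of the larger one keep using the fast row/column kernels; only the remaining cases set `is_run_common_broadcast` and fall through to the common kernels. A rough reconstruction of what the extended `get_mid_dims` computes, assuming it follows the pattern visible in its callers (the real definition lives elsewhere in elementwise_op_function.h):

    def get_mid_dims(x_dims, y_dims, axis):
        # pre  : product of x_dims before `axis`
        # n    : product of the block x and y share starting at `axis`
        # post : product of x_dims after that shared block
        # The fourth value mirrors the new is_run_common_broadcast out-param.
        pre = n = post = 1
        for i in range(axis):
            pre *= x_dims[i]
        for i, y_d in enumerate(y_dims):
            if x_dims[i + axis] != y_d:
                # A mismatch (beyond trailing 1s, trimmed by the caller) means
                # the simple pre/n/post kernels no longer apply.
                return pre, n, post, 1
            n *= y_d
        for i in range(axis + len(y_dims), len(x_dims)):
            post *= x_dims[i]
        return pre, n, post, 0

    print(get_mid_dims((2, 3, 4, 5), (3, 4), 1))     # (2, 12, 5, 0): fast path
    print(get_mid_dims((2, 3, 1, 5), (2, 1, 4), 0))  # flag == 1: common kernel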
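
default_elementwise_add/sub/div now compare `x->numel()` and `y->numel()` and, when Y is the larger tensor, run the same kernel with Y driving the broadcast plus an inverse functor (`InverseSubFunctor`, `InverseDivFunctor`, ...) so that non-commutative ops still compute x op y. A NumPy-level sketch of the idea (the lambdas stand in for the C++ functors; the exact argument order inside the kernel is abstracted away):

    import numpy as np

    sub         = lambda a, b: a - b   # stand-in for SubFunctor
    inverse_sub = lambda a, b: b - a   # stand-in for InverseSubFunctor

    def default_elementwise_sub(x, y):
        # Mirrors the dispatch in elementwise_sub_op.h: the operand with more
        # elements is treated as the "large" side of the broadcast, and the
        # inverse functor compensates for the operands arriving swapped.
        if x.size >= y.size:
            return sub(x, y)
        return inverse_sub(y, x)

    x = np.random.rand(4, 5)
    y = np.random.rand(2, 3, 4, 5)
    np.testing.assert_allclose(default_elementwise_sub(x, y), x - y)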
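
On the backward side, `CommonElementwiseBroadcastBackward` (reached when `is_run_common_broadcast` is set) has to shrink `dout` back to each input's shape by summing over every broadcast axis. For addition, where the local derivative is 1, the NumPy equivalent is simply a reduction of `dout`; other ops first multiply `dout` by the local derivative. This is a sanity-check sketch, not the actual CPU/CUDA kernel:

    import numpy as np

    def broadcast_add_grad(x, y, dout):
        def reduce_to(grad, shape):
            # Sum over leading axes added by broadcasting ...
            while grad.ndim > len(shape):
                grad = grad.sum(axis=0)
            # ... and over axes where the input had extent 1.
            for axis, size in enumerate(shape):
                if size == 1:
                    grad = grad.sum(axis=axis, keepdims=True)
            return grad
        return reduce_to(dout, x.shape), reduce_to(dout, y.shape)

    x = np.random.rand(2, 3, 1, 5)
    y = np.random.rand(2, 1, 4, 1)
    dout = np.ones(np.broadcast(x, y).shape)   # (2, 3, 4, 5)
    dx, dy = broadcast_add_grad(x, y, dout)
    assert dx.shape == x.shape and dy.shape == y.shape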
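
The new `*_xsize_lessthan_ysize` tests fix the semantics of `axis` when X is the smaller operand: with X of shape (4, 5), Y of shape (2, 3, 4, 5) and axis=2, X occupies dimensions 2 and 3 of Y. For these shapes that coincides with NumPy's default right-alignment, which is why the mul/sub tests can write the expectation with an explicit reshape:

    import numpy as np

    x = np.random.rand(4, 5)
    y = np.random.rand(2, 3, 4, 5)
    # axis=2 places x at dims [2, 4) of y; here that matches NumPy's alignment.
    np.testing.assert_allclose(x.reshape(1, 1, 4, 5) * y, x * y)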