diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index aa41173c81a22ad6ac01fc68860f6486c5df63e8..3342d8bbd8fac72614b8bdd41dbf53ca99501675 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -35,7 +35,7 @@ ELSE ()
 ENDIF()
 
 SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210804")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210818")
 SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
index 2e902bd277b1e4d016d0c3190579c409c8d361f3..769e61aba6131d4e2c526944a1b5083b439e387e 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
@@ -23,93 +23,45 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseAddXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    // XPUElementwise<T>(ctx, xpu::add<T>);
-    // ToDo(QingshuChen): update this optimization to elementwise_xpu.h
-    auto x_var = ctx.InputVar("X");
-    PADDLE_ENFORCE_NE(x_var, nullptr, platform::errors::InvalidArgument(
-                                          "Cannot get input Variable X"));
-    PADDLE_ENFORCE_EQ(
-        x_var->IsType<framework::LoDTensor>(), true,
-        platform::errors::InvalidArgument(
-            "XPU only support LoDTensor, Input(X) is not LoDTensor"));
-
-    auto x = x_var->Get<framework::LoDTensor>();
-    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
-    z->mutable_data<T>(ctx.GetPlace());
-    auto x_dims = x.dims();
-    auto y_dims = y->dims();
-    int max_dim = std::max(x_dims.size(), y_dims.size());
-    int axis = ctx.Attr<int>("axis");
-    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
+    XPUElementwise<T, XPUType>(ctx, xpu::broadcast_add<XPUType>);
+  }
+};
 
-    PADDLE_ENFORCE_GE(
-        axis, 0,
-        platform::errors::InvalidArgument(
-            "Axis should be great than or equal to 0, but received axis is %d.",
-            axis));
-    PADDLE_ENFORCE_LT(
-        axis, max_dim,
-        platform::errors::InvalidArgument(
-            "Axis should be less than %d, but received axis is %d.", max_dim,
-            axis));
-    std::vector<int> x_dims_vec(max_dim, 1);
-    std::vector<int> y_dims_vec(max_dim, 1);
-    if (x_dims.size() == max_dim) {
-      for (int i = 0; i < max_dim; i++) {
-        x_dims_vec[i] = x_dims[i];
-      }
-    } else {
-      for (int i = 0; i < x_dims.size(); i++) {
-        x_dims_vec[i + axis] = x_dims[i];
-      }
+static std::vector<int> get_rdims(const std::vector<int>& xdims,
+                                  const std::vector<int>& ydims) {
+  std::vector<int> rdims;
+  for (size_t i = 0; i < xdims.size(); i++) {
+    if (xdims[i] != ydims[i]) {
+      rdims.push_back(i);
     }
-    if (y_dims.size() == max_dim) {
-      for (int i = 0; i < max_dim; i++) {
-        y_dims_vec[i] = y_dims[i];
-      }
-    } else {
-      for (int i = 0; i < y_dims.size(); i++) {
-        y_dims_vec[i + axis] = y_dims[i];
-      }
-    }
-    const T* x_data = x.data<T>();
-    const T* y_data = y->data<T>();
-    T* z_data = z->data<T>();
-
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::XPUDeviceContext>();
-    int ret = xpu::SUCCESS;
-    ret = xpu::broadcast_add<T>(dev_ctx.x_context(), x_data, y_data, z_data,
-                                x_dims_vec, y_dims_vec);
-    PADDLE_ENFORCE_EQ(
-        ret, xpu::SUCCESS,
-        platform::errors::External(
-            "XPU kernel Elementwise occur error in XPUElementwise error code ",
-            ret, XPUAPIErrorMsg[ret]));
   }
-};
+  return rdims;
+}
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElemwiseGradKernel<T>::Compute(ctx);
-    // XPUElementwiseGrad<T>(ctx, xpu::add_grad<T>, false);
     auto* x = ctx.Input<framework::Tensor>("X");
     auto* y = ctx.Input<framework::Tensor>("Y");
     auto* dz = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
     const framework::DDim& x_dims = x->dims();
     const framework::DDim& y_dims = y->dims();
-    int max_dim = std::max(x_dims.size(), y_dims.size());
+    const framework::DDim& dz_dims = dz->dims();
+    int axis = ctx.Attr<int>("axis");
     axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
+    int max_dim = std::max(x_dims.size(), y_dims.size());
     PADDLE_ENFORCE_GE(
         axis, 0,
         platform::errors::InvalidArgument(
@@ -120,66 +72,74 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
         platform::errors::InvalidArgument(
             "Axis should be less than %d, but received axis is %d.", max_dim,
             axis));
+
     std::vector<int> x_dims_vec(max_dim, 1);
     std::vector<int> y_dims_vec(max_dim, 1);
-    int x_len = 1;
-    int y_len = 1;
+    std::vector<int> z_dims_vec(max_dim, 1);
     if (x_dims.size() == max_dim) {
       for (int i = 0; i < max_dim; i++) {
         x_dims_vec[i] = x_dims[i];
-        x_len *= x_dims_vec[i];
       }
     } else {
       for (int i = 0; i < x_dims.size(); i++) {
         x_dims_vec[i + axis] = x_dims[i];
-        x_len *= x_dims_vec[i];
       }
     }
+
     if (y_dims.size() == max_dim) {
       for (int i = 0; i < max_dim; i++) {
         y_dims_vec[i] = y_dims[i];
-        y_len *= y_dims_vec[i];
       }
     } else {
       for (int i = 0; i < y_dims.size(); i++) {
         y_dims_vec[i + axis] = y_dims[i];
-        y_len *= y_dims_vec[i];
       }
     }
-    const T* dz_data = dz->data<T>();
-    framework::Tensor dx_local_tensor;
-    framework::Tensor dy_local_tensor;
-    bool need_wait = false;
-    T* dx_data = nullptr;
-    T* dy_data = nullptr;
-    if (dx) {
-      dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    } else {
-      dx_data =
-          dx_local_tensor.mutable_data<T>(ctx.GetPlace(), x_len * sizeof(T));
-      need_wait = true;
-    }
-    if (dy) {
-      dy_data = dy->mutable_data<T>(ctx.GetPlace());
-    } else {
-      dy_data =
-          dy_local_tensor.mutable_data<T>(ctx.GetPlace(), y_len * sizeof(T));
-      need_wait = true;
+    for (int i = 0; i < max_dim; i++) {
+      z_dims_vec[i] = dz_dims[i];
     }
-
+    std::vector<int> rdims_for_x;
+    std::vector<int> rdims_for_y;
+    rdims_for_x = get_rdims(x_dims_vec, z_dims_vec);
+    rdims_for_y = get_rdims(y_dims_vec, z_dims_vec);
+    const T* dz_data = dz->data<T>();
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::XPUDeviceContext>();
-    int ret = xpu::broadcast_add_grad<T>(dev_ctx.x_context(), dz_data, dz_data,
-                                         dz_data, dz_data, dy_data, dx_data,
-                                         x_dims_vec, y_dims_vec);
-    PADDLE_ENFORCE_EQ(
-        ret, xpu::SUCCESS,
-        platform::errors::External(
-            "XPU kernel Elementwise occur error in XPUElementwise error code ",
-            ret, XPUAPIErrorMsg[ret]));
-    if (need_wait && dev_ctx.x_context()->xpu_stream) {
-      dev_ctx.Wait();
+    if (dx != nullptr) {
+      if (rdims_for_x.size() == 0) {
+        framework::TensorCopy(
+            *dz, ctx.GetPlace(),
+            ctx.template device_context<platform::DeviceContext>(), dx);
+      } else {
+        T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+        int ret = xpu::reduce_sum<XPUType>(
+            dev_ctx.x_context(), reinterpret_cast<const XPUType*>(dz_data),
+            reinterpret_cast<XPUType*>(dx_data), z_dims_vec, rdims_for_x);
+        PADDLE_ENFORCE_EQ(
+            ret, xpu::SUCCESS,
+            platform::errors::External("XPU kernel reduce_sum occur error in "
                                        "XPUElementwise error code ",
+                                       ret, XPUAPIErrorMsg[ret]));
+      }
+    }
+
+    if (dy != nullptr) {
+      if (rdims_for_y.size() == 0) {
+        framework::TensorCopy(
+            *dz, ctx.GetPlace(),
+            ctx.template device_context<platform::DeviceContext>(), dy);
+      } else {
+        T* dy_data = dy->mutable_data<T>(ctx.GetPlace());
+        int ret = xpu::reduce_sum<XPUType>(
+            dev_ctx.x_context(), reinterpret_cast<const XPUType*>(dz_data),
+            reinterpret_cast<XPUType*>(dy_data), z_dims_vec, rdims_for_y);
+        PADDLE_ENFORCE_EQ(
+            ret, xpu::SUCCESS,
+            platform::errors::External("XPU kernel reduce_sum occur error in "
                                        "XPUElementwise error code ",
+                                       ret, XPUAPIErrorMsg[ret]));
+      }
     }
   }
 };
@@ -189,10 +149,9 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
 
 namespace ops = paddle::operators;
 
+REGISTER_OP_XPU_KERNEL(elementwise_add, ops::ElementwiseAddXPUKernel<float>,
+                       ops::ElementwiseAddXPUKernel<paddle::platform::float16>);
 REGISTER_OP_XPU_KERNEL(
-    elementwise_add,
-    ops::ElementwiseAddXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(elementwise_add_grad,
-                       ops::ElementwiseAddGradXPUKernel<
-                           paddle::platform::XPUDeviceContext, float>);
+    elementwise_add_grad, ops::ElementwiseAddGradXPUKernel<float>,
+    ops::ElementwiseAddGradXPUKernel<paddle::platform::float16>);
 #endif
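Note on the rewritten grad kernel above: any axis where an input's aligned dim differs from dz's dim must have been broadcast in the forward pass, so that input's gradient is dz summed over exactly those axes, and a same-shape input degenerates to a plain TensorCopy of dz. A standalone sketch of the get_rdims rule (hypothetical harness, not Paddle code):

#include <cassert>
#include <vector>

// An input dim that differs from the output dim was broadcast, so the
// gradient must be reduced (summed) over that axis.
static std::vector<int> get_rdims(const std::vector<int>& xdims,
                                  const std::vector<int>& ydims) {
  std::vector<int> rdims;
  for (size_t i = 0; i < xdims.size(); i++) {
    if (xdims[i] != ydims[i]) {
      rdims.push_back(i);
    }
  }
  return rdims;
}

int main() {
  // y of shape [3] added to x of shape [2, 3, 4] with axis = 1 is first
  // aligned to y_dims_vec = [1, 3, 1]; dy is then dz reduced over axes 0, 2.
  assert(get_rdims({1, 3, 1}, {2, 3, 4}) == (std::vector<int>{0, 2}));
  // Equal shapes yield no reduce axes, which is the TensorCopy fast path.
  assert(get_rdims({2, 3, 4}, {2, 3, 4}).empty());
  return 0;
}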
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc
index da676a7244fb3851a6b2b31a684ae07329e7f6f1..782f53f1bb12a0eeb8222e927b5591f8117e85b4 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op_xpu.cc
@@ -19,30 +19,33 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseDivXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    XPUElementwise<T>(ctx, xpu::div<T>);
+    XPUElementwise<T, XPUType>(ctx, xpu::broadcast_div<XPUType>);
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseDivGradXPUKernel : public ElemwiseGradKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElemwiseGradKernel<T>::Compute(ctx);
-    XPUElementwiseGrad<T>(ctx, xpu::div_grad<T>, true);
+    XPUElementwiseGrad<T, XPUType>(ctx, xpu::broadcast_div_grad<XPUType>, true);
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(elementwise_div, ops::ElementwiseDivXPUKernel<float>,
+                       ops::ElementwiseDivXPUKernel<paddle::platform::float16>);
 REGISTER_OP_XPU_KERNEL(
-    elementwise_div,
-    ops::ElementwiseDivXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(elementwise_div_grad,
-                       ops::ElementwiseDivGradXPUKernel<
-                           paddle::platform::XPUDeviceContext, float>);
+    elementwise_div_grad, ops::ElementwiseDivGradXPUKernel<float>,
+    ops::ElementwiseDivGradXPUKernel<paddle::platform::float16>);
 #endif
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_xpu.cc
index 32ae3a6f2c0c2041265099923f6b090a444584f1..fed68b615c7f7d4cbb2ccd2d0de57b7591dc19d8 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_xpu.cc
@@ -21,17 +21,22 @@ namespace operators {
 
 template <typename T>
 class ElementwiseFloordivXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    XPUElementwise<T>(ctx, xpu::floordiv<T>);
+    XPUElementwise<T, XPUType>(ctx, xpu::broadcast_floordiv<XPUType>);
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(elementwise_floordiv,
-                       ops::ElementwiseFloordivXPUKernel<
-                           paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    elementwise_floordiv,
+    ops::ElementwiseFloordivXPUKernel<float>,
+    ops::ElementwiseFloordivXPUKernel<paddle::platform::float16>);
 #endif
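All of these kernels now take a single type parameter and recover the device-side element type through XPUTypeTrait, which is what lets one kernel body serve both the float and paddle::platform::float16 registrations while casting pointers once at the API boundary. A simplified sketch of the trait idea (the type names here are illustrative stand-ins, not Paddle's definitions):

#include <cstdint>

struct fw_float16 { uint16_t x; };    // stand-in for platform::float16
struct xdnn_float16 { uint16_t x; };  // stand-in for the XDNN half type

// Most types map to themselves; float16 is rebound to the device half type.
template <typename T>
struct XPUTypeTraitSketch { using Type = T; };

template <>
struct XPUTypeTraitSketch<fw_float16> { using Type = xdnn_float16; };

// Identical size and layout is what keeps the kernels' reinterpret_cast
// between T* and XPUType* well-behaved in practice.
static_assert(sizeof(XPUTypeTraitSketch<fw_float16>::Type) ==
                  sizeof(fw_float16),
              "device half type must match the framework half type");

int main() { return 0; }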
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc
index c87db69c57d78cdf6f5ecd289909267991098b70..9fb3c9d493dc04a3dda6bd4a25c0d759e0b44c20 100644
--- a/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op_xpu.cc
@@ -20,20 +20,24 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseMaxXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    XPUElementwise<T>(ctx, xpu::max<T>);
+    XPUElementwise<T, XPUType>(ctx, xpu::broadcast_max<XPUType>);
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseMaxGradXPUKernel : public ElemwiseGradKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElemwiseGradKernel<T>::Compute(ctx);
-    XPUElementwiseGrad<T>(ctx, xpu::max_grad<T>, true);
+    XPUElementwiseGrad<T, XPUType>(ctx, xpu::broadcast_max_grad<XPUType>, true);
   }
 };
 
@@ -41,10 +45,9 @@ class ElementwiseMaxGradXPUKernel : public ElemwiseGradKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(elementwise_max, ops::ElementwiseMaxXPUKernel<float>,
+                       ops::ElementwiseMaxXPUKernel<paddle::platform::float16>);
 REGISTER_OP_XPU_KERNEL(
-    elementwise_max,
-    ops::ElementwiseMaxXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(elementwise_max_grad,
-                       ops::ElementwiseMaxGradXPUKernel<
-                           paddle::platform::XPUDeviceContext, float>);
+    elementwise_max_grad, ops::ElementwiseMaxGradXPUKernel<float>,
+    ops::ElementwiseMaxGradXPUKernel<paddle::platform::float16>);
 #endif
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc
index f1401369ec69aff873f1f822bb25b1d212596eec..5d4ff69b06dffcf8e1391cb0bddfe4f01124c421 100644
--- a/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op_xpu.cc
@@ -20,20 +20,24 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseMinXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    XPUElementwise<T>(ctx, xpu::min<T>);
+    XPUElementwise<T, XPUType>(ctx, xpu::broadcast_min<XPUType>);
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseMinGradXPUKernel : public ElemwiseGradKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElemwiseGradKernel<T>::Compute(ctx);
-    XPUElementwiseGrad<T>(ctx, xpu::min_grad<T>, true);
+    XPUElementwiseGrad<T, XPUType>(ctx, xpu::broadcast_min_grad<XPUType>, true);
   }
 };
 
@@ -41,10 +45,9 @@ class ElementwiseMinGradXPUKernel : public ElemwiseGradKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(elementwise_min, ops::ElementwiseMinXPUKernel<float>,
+                       ops::ElementwiseMinXPUKernel<paddle::platform::float16>);
 REGISTER_OP_XPU_KERNEL(
-    elementwise_min,
-    ops::ElementwiseMinXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(elementwise_min_grad,
-                       ops::ElementwiseMinGradXPUKernel<
-                           paddle::platform::XPUDeviceContext, float>);
+    elementwise_min_grad, ops::ElementwiseMinGradXPUKernel<float>,
+    ops::ElementwiseMinGradXPUKernel<paddle::platform::float16>);
 #endif
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc
index 23bb04f60a842bd8ad4005770f42d357c4f48930..1b583104cc57080fd256b6cfb200f72f30f62cf6 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_xpu.cc
@@ -18,20 +18,25 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_xpu.h"
 namespace paddle {
 namespace operators {
-template <typename DeviceContext, typename T>
+
+template <typename T>
 class ElementwiseMulXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    XPUElementwise<T>(ctx, xpu::mul<T>);
+    XPUElementwise<T, XPUType>(ctx, xpu::broadcast_mul<XPUType>);
   }
 };
-// DEFINE_XPU_GRAD_KERNEL(Mul, mul, true);
-template <typename DeviceContext, typename T>
+
+template <typename T>
 class ElementwiseMulGradXPUKernel : public ElemwiseGradKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElemwiseGradKernel<T>::Compute(ctx);
-    XPUElementwiseGrad<T>(ctx, xpu::mul_grad<T>, true);
+    XPUElementwiseGrad<T, XPUType>(ctx, xpu::broadcast_mul_grad<XPUType>, true);
   }
 };
 
@@ -39,11 +44,10 @@ class ElementwiseMulGradXPUKernel : public ElemwiseGradKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(elementwise_mul, ops::ElementwiseMulXPUKernel<float>,
+                       ops::ElementwiseMulXPUKernel<paddle::platform::float16>);
 REGISTER_OP_XPU_KERNEL(
-    elementwise_mul,
-    ops::ElementwiseMulXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(elementwise_mul_grad,
-                       ops::ElementwiseMulGradXPUKernel<
-                           paddle::platform::XPUDeviceContext, float>);
+    elementwise_mul_grad, ops::ElementwiseMulGradXPUKernel<float>,
+    ops::ElementwiseMulGradXPUKernel<paddle::platform::float16>);
 #endif
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc
index 31b6ef9abce6edbd512c83b0af85b0ed9232c318..14b20baae1b0398a40ee74a3e16c2c992a4b557e 100644
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc
@@ -23,9 +23,11 @@ namespace operators {
 
 template <typename T>
 class ElementwisePowXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    XPUElementwise<T>(ctx, xpu::pow<T>);
+    XPUElementwise<T, XPUType>(ctx, xpu::broadcast_pow<XPUType>);
   }
 };
 
@@ -35,6 +37,8 @@ class ElementwisePowXPUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_XPU_KERNEL(
     elementwise_pow,
-    ops::ElementwisePowXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    ops::ElementwisePowXPUKernel<float>,
+    ops::ElementwisePowXPUKernel<paddle::platform::float16>);
 #endif
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc
index bef3a4904f4edbd0dd600bc797aa9ccc695b6525..d12c6fc30cebaafd27c099ab708e0662477cb017 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc
@@ -21,20 +21,25 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseSubXPUKernel : public framework::OpKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    XPUElementwise<T>(ctx, xpu::sub<T>);
+    XPUElementwise<T, XPUType>(ctx, xpu::broadcast_sub<XPUType>);
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseSubGradXPUKernel : public ElemwiseGradKernel<T> {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElemwiseGradKernel<T>::Compute(ctx);
-    XPUElementwiseGrad<T>(ctx, xpu::sub_grad<T>, false);
+    XPUElementwiseGrad<T, XPUType>(ctx, xpu::broadcast_sub_grad<XPUType>,
+                                   false);
   }
 };
 
@@ -42,11 +47,10 @@ class ElementwiseSubGradXPUKernel : public ElemwiseGradKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(elementwise_sub, ops::ElementwiseSubXPUKernel<float>,
+                       ops::ElementwiseSubXPUKernel<paddle::platform::float16>);
 REGISTER_OP_XPU_KERNEL(
-    elementwise_sub,
-    ops::ElementwiseSubXPUKernel<paddle::platform::XPUDeviceContext, float>);
-REGISTER_OP_XPU_KERNEL(elementwise_sub_grad,
-                       ops::ElementwiseSubGradXPUKernel<
-                           paddle::platform::XPUDeviceContext, float>);
+    elementwise_sub_grad, ops::ElementwiseSubGradXPUKernel<float>,
+    ops::ElementwiseSubGradXPUKernel<paddle::platform::float16>);
 #endif
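The `false` passed to XPUElementwiseGrad for elementwise_sub_grad (where div/mul/max/min pass `true`) is the use_x_y_data flag consumed by the helper in elementwise_xpu.h below: subtraction's gradient is dx = dz and dy = -dz, so the kernel never reads the forward operands and dz can stand in for them. An illustrative CPU reference of the distinction (not the XDNN kernels):

#include <cassert>
#include <vector>

// sub_grad ignores x and y entirely ...
void sub_grad(const float*, const float*, const float* dz, float* dx,
              float* dy, int len) {
  for (int i = 0; i < len; i++) {
    dx[i] = dz[i];
    dy[i] = -dz[i];
  }
}

// ... while mul_grad must read the real forward inputs.
void mul_grad(const float* x, const float* y, const float* dz, float* dx,
              float* dy, int len) {
  for (int i = 0; i < len; i++) {
    dx[i] = dz[i] * y[i];
    dy[i] = dz[i] * x[i];
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f}, y = {3.f, 4.f}, dz = {1.f, 1.f};
  std::vector<float> dx(2), dy(2);
  // Feeding dz where x and y would go changes nothing for sub_grad.
  sub_grad(dz.data(), dz.data(), dz.data(), dx.data(), dy.data(), 2);
  assert(dx[0] == 1.f && dy[0] == -1.f);
  mul_grad(x.data(), y.data(), dz.data(), dx.data(), dy.data(), 2);
  assert(dx[1] == 4.f && dy[1] == 2.f);
  return 0;
}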
diff --git a/paddle/fluid/operators/elementwise/elementwise_xpu.h b/paddle/fluid/operators/elementwise/elementwise_xpu.h
index 89d8487fdbb4bbeb6189899ffc86ec230f111773..db5c94b9d1a6e2ded5410b04cdb6259cb259b58f 100644
--- a/paddle/fluid/operators/elementwise/elementwise_xpu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_xpu.h
@@ -25,64 +25,12 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-static std::pair<std::vector<int>, std::vector<int>> XPUDimsToBroadcastVector(
-    const framework::DDim& x, const framework::DDim& y) {
-  std::vector<int> x_v;
-  std::vector<int> y_v;
-  int y_size = y.size();
-  for (int i = 0; i < y_size; ++i) {
-    if (x[i] == y[i]) {
-      x_v.push_back(y[i]);
-      y_v.push_back(y[i]);
-      continue;
-    }
-    x_v.push_back(1);
-    x_v.push_back(x[i]);
-    y_v.push_back(y[i] / x[i]);
-    y_v.push_back(x[i]);
-  }
-  return std::make_pair(x_v, y_v);
-}
-
-static std::pair<std::vector<int>, std::vector<int>> XPUReducesAxisVector(
-    const framework::DDim& x, const framework::DDim& y) {
-  std::vector<int> x_vector;
-  std::vector<int> axis_v;
-  PADDLE_ENFORCE_GT(
-      x.size(), 0, platform::errors::OutOfRange("x size is less 1, x shape is ",
-                                                x.to_str()));
-  PADDLE_ENFORCE_GT(
-      y.size(), 0, platform::errors::OutOfRange("y size is less 1, y shape is ",
-                                                y.to_str()));
-
-  int y_nums = framework::product(y);
-  x_vector = framework::vectorize<int>(x);
-  if (y_nums == 1) {
-    for (int i = 0; i < x.size(); ++i) {
-      axis_v.push_back(i);
-    }
-    return std::make_pair(x_vector, axis_v);
-  }
-  int yidx = 0;
-  for (size_t i = 0; i < x_vector.size(); ++i) {
-    if (yidx >= y.size() || y[yidx] == 1) {
-      axis_v.push_back(i);
-      yidx++;
-      continue;
-    }
-    if (x_vector[i] != y[yidx]) {
-      axis_v.push_back(i);
-      continue;
-    }
-    yidx++;
-  }
-  return std::make_pair(x_vector, axis_v);
-}
-
-template <typename T>
+template <typename T, typename XPUType>
 void XPUElementwise(
     const framework::ExecutionContext& ctx,
-    std::function<int(xpu::Context*, const T*, const T*, T*, int)> func) {
+    std::function<int(xpu::Context*, const XPUType*, const XPUType*, XPUType*,
+                      const std::vector<int>&, const std::vector<int>&)>
+        func) {
   auto x_var = ctx.InputVar("X");
   PADDLE_ENFORCE_NE(x_var, nullptr, platform::errors::InvalidArgument(
                                         "Cannot get input Variable X"));
@@ -110,86 +58,59 @@ void XPUElementwise(
       platform::errors::InvalidArgument(
           "Axis should be less than %d, but received axis is %d.", max_dim,
           axis));
-
-  std::vector<int> x_dims_array(max_dim);
-  std::vector<int> y_dims_array(max_dim);
-  std::vector<int> out_dims_array(max_dim);
-  GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
-                         y_dims_array.data(), out_dims_array.data(), max_dim,
-                         axis);
-  framework::DDim out_dim = framework::make_ddim(out_dims_array);
-
+  std::vector<int> x_dims_vec(max_dim, 1);
+  std::vector<int> y_dims_vec(max_dim, 1);
+  if (x_dims.size() == max_dim) {
+    for (int i = 0; i < max_dim; i++) {
+      x_dims_vec[i] = x_dims[i];
+    }
+  } else {
+    for (int i = 0; i < x_dims.size(); i++) {
+      x_dims_vec[i + axis] = x_dims[i];
+    }
+  }
+  if (y_dims.size() == max_dim) {
+    for (int i = 0; i < max_dim; i++) {
+      y_dims_vec[i] = y_dims[i];
+    }
+  } else {
+    for (int i = 0; i < y_dims.size(); i++) {
+      y_dims_vec[i + axis] = y_dims[i];
+    }
+  }
   const T* x_data = x.data<T>();
   const T* y_data = y->data<T>();
   T* z_data = z->data<T>();
-  bool need_wait = false;
-  framework::Tensor x_broadcast_tensor;
-  framework::Tensor y_broadcast_tensor;
+
   auto& dev_ctx =
       ctx.template device_context<paddle::platform::XPUDeviceContext>();
-  int ret = xpu::SUCCESS;
-  // begin broadcast now
-  if (x.numel() != z->numel()) {
-    // broadcast x
-    std::pair<std::vector<int>, std::vector<int>> bcast_v =
-        XPUDimsToBroadcastVector(framework::make_ddim(x_dims_array), out_dim);
-    ret = xpu::broadcast<T>(dev_ctx.x_context(), x_data,
-                            x_broadcast_tensor.mutable_data<T>(
-                                ctx.GetPlace(), z->numel() * sizeof(T)),
-                            bcast_v.first, bcast_v.second);
-    PADDLE_ENFORCE_EQ(
-        ret, xpu::SUCCESS,
-        platform::errors::External(
-            "XPU kernel broadcast occur error in XPUElementwise error code %d",
-            ret));
-    need_wait = true;
-    x_data = x_broadcast_tensor.data<T>();
-  }
+  int ret = xpu::SUCCESS;
-  if (y->numel() != z->numel()) {
-    // broadcast y
-    std::vector<int> bcast_x_v;
-    std::vector<int> bcast_y_v;
-    std::pair<std::vector<int>, std::vector<int>> bcast_v =
-        XPUDimsToBroadcastVector(framework::make_ddim(y_dims_array), out_dim);
-    ret = xpu::broadcast<T>(dev_ctx.x_context(), y_data,
-                            y_broadcast_tensor.mutable_data<T>(
-                                ctx.GetPlace(), z->numel() * sizeof(T)),
-                            bcast_v.first, bcast_v.second);
-    PADDLE_ENFORCE_EQ(
-        ret, xpu::SUCCESS,
-        platform::errors::External(
-            "XPU kernel broadcast occur error in XPUElementwise error code %d",
-            ret));
-    need_wait = true;
-    y_data = y_broadcast_tensor.data<T>();
-  }
-  int len = z->numel();
-  ret = func(dev_ctx.x_context(), x_data, y_data, z_data, len);
+  ret = func(dev_ctx.x_context(), reinterpret_cast<const XPUType*>(x_data),
+             reinterpret_cast<const XPUType*>(y_data),
+             reinterpret_cast<XPUType*>(z_data), x_dims_vec, y_dims_vec);
   PADDLE_ENFORCE_EQ(
       ret, xpu::SUCCESS,
       platform::errors::External(
           "XPU kernel Elementwise occur error in XPUElementwise error code ",
-          ret));
-
-  if (need_wait && dev_ctx.x_context()->xpu_stream) {
-    dev_ctx.Wait();
-  }
+          ret, XPUAPIErrorMsg[ret]));
 }
 
-template <typename T>
-void XPUElementwiseGrad(const framework::ExecutionContext& ctx,
-                        std::function<int(xpu::Context*, const T*, const T*,
-                                          const T*, const T*, T*, T*, int len)>
-                            func,
-                        bool use_x_y_data) {
+template <typename T, typename XPUType>
+void XPUElementwiseGrad(
+    const framework::ExecutionContext& ctx,
+    std::function<int(xpu::Context*, const XPUType*, const XPUType*,
+                      const XPUType*, const XPUType*, XPUType*, XPUType*,
+                      const std::vector<int>&, const std::vector<int>&)>
+        func,
+    bool use_x_y_data) {
   auto* x = ctx.Input<framework::Tensor>("X");
   auto* y = ctx.Input<framework::Tensor>("Y");
   auto* dz = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-  auto* z = dz;
   auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
   auto* dy = ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
+  auto* z = dz;
   int axis = ctx.Attr<int>("axis");
   const framework::DDim& x_dims = x->dims();
   const framework::DDim& y_dims = y->dims();
@@ -204,120 +125,55 @@ void XPUElementwiseGrad(const framework::ExecutionContext& ctx,
       platform::errors::InvalidArgument(
           "Axis should be less than %d, but received axis is %d.", max_dim,
           axis));
+  std::vector<int> x_dims_vec(max_dim, 1);
+  std::vector<int> y_dims_vec(max_dim, 1);
+  if (x_dims.size() == max_dim) {
+    for (int i = 0; i < max_dim; i++) {
+      x_dims_vec[i] = x_dims[i];
+    }
+  } else {
+    for (int i = 0; i < x_dims.size(); i++) {
+      x_dims_vec[i + axis] = x_dims[i];
+    }
+  }
+  if (y_dims.size() == max_dim) {
+    for (int i = 0; i < max_dim; i++) {
+      y_dims_vec[i] = y_dims[i];
+    }
+  } else {
+    for (int i = 0; i < y_dims.size(); i++) {
+      y_dims_vec[i + axis] = y_dims[i];
+    }
+  }
 
-  std::vector<int> x_dims_array(max_dim);
-  std::vector<int> y_dims_array(max_dim);
-  std::vector<int> out_dims_array(max_dim);
-  GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
-                         y_dims_array.data(), out_dims_array.data(), max_dim,
-                         axis);
-  framework::DDim out_dim = framework::make_ddim(out_dims_array);
-
-  int len = framework::product(out_dim);
-
-  framework::Tensor x_broadcast_tensor;
-  framework::Tensor y_broadcast_tensor;
-
-  framework::Tensor dx_local_tensor;
-  framework::Tensor dy_local_tensor;
-
-  bool need_wait = false;
   const T* x_data = use_x_y_data ? x->data<T>() : z->data<T>();
   const T* y_data = use_x_y_data ? y->data<T>() : z->data<T>();
   const T* z_data = z->data<T>();
-  const T* dz_data = (const T*)dz->data<T>();
-
-  bool dx_need_reduce = (dx != nullptr) && (dx->numel() != len);
-  bool dy_need_reduce = (dy != nullptr) && (dy->numel() != len);
-  T* dx_data =
-      ((dx == nullptr) || dx_need_reduce)
-          ? (dx_local_tensor.mutable_data<T>(ctx.GetPlace(), len * sizeof(T)))
-          : (dx->mutable_data<T>(ctx.GetPlace()));
-
-  T* dy_data =
-      ((dy == nullptr) || dy_need_reduce)
-          ? (dy_local_tensor.mutable_data<T>(ctx.GetPlace(), len * sizeof(T)))
-          : (dy->mutable_data<T>(ctx.GetPlace()));
-
-  int ret = xpu::SUCCESS;
+  const T* dz_data = dz->data<T>();
+  T* dx_data = nullptr;
+  T* dy_data = nullptr;
   auto& dev_ctx =
       ctx.template device_context<paddle::platform::XPUDeviceContext>();
-  if (use_x_y_data && x->numel() != len) {
-    std::vector<int> bcast_x_v;
-    std::vector<int> bcast_y_v;
-    std::pair<std::vector<int>, std::vector<int>> bcast_v =
-        XPUDimsToBroadcastVector(framework::make_ddim(x_dims_array), out_dim);
-    ret = xpu::broadcast<T>(
-        dev_ctx.x_context(), x_data,
-        x_broadcast_tensor.mutable_data<T>(ctx.GetPlace(), len * sizeof(T)),
-        bcast_v.first, bcast_v.second);
-    PADDLE_ENFORCE_EQ(ret, xpu::SUCCESS,
-                      platform::errors::External(
-                          "XPU kernel broadcast error occur! %d", ret));
-    need_wait = true;
-    x_data = x_broadcast_tensor.data<T>();
-  }
-
-  if (use_x_y_data && y->numel() != len) {
-    // broadcast y
-    std::vector<int> bcast_x_v;
-    std::vector<int> bcast_y_v;
-    std::pair<std::vector<int>, std::vector<int>> bcast_v =
-        XPUDimsToBroadcastVector(framework::make_ddim(y_dims_array), out_dim);
-    ret = xpu::broadcast<T>(
-        dev_ctx.x_context(), y_data,
-        y_broadcast_tensor.mutable_data<T>(ctx.GetPlace(), len * sizeof(T)),
-        bcast_v.first, bcast_v.second);
-    PADDLE_ENFORCE_EQ(ret, xpu::SUCCESS,
-                      platform::errors::External(
-                          "XPU kernel broadcast error occur! %d", ret));
-    need_wait = true;
-    y_data = y_broadcast_tensor.data<T>();
-  }
-
-  ret = func(dev_ctx.x_context(), x_data, y_data, z_data, dz_data, dx_data,
-             dy_data, len);
-  PADDLE_ENFORCE_EQ(ret, xpu::SUCCESS, platform::errors::External(
-                                           "XPU kernel binary occur error in "
                                            "XPUElementwiseGrad, error code %d",
-                                           ret));
-
-  if (dx_need_reduce) {
-    const framework::DDim& dx_dims = dx->dims();
-    std::pair<std::vector<int>, std::vector<int>> reduce_v =
-        XPUReducesAxisVector(out_dim, dx_dims);
-    ret = xpu::reduce_sum<T>(dev_ctx.x_context(), dx_data,
-                             dx->mutable_data<T>(ctx.GetPlace()),
-                             reduce_v.first, reduce_v.second);
-    PADDLE_ENFORCE_EQ(
-        ret, xpu::SUCCESS,
-        platform::errors::External("XPU kernel reduce_sum occur error in "
                                    "XPUElementwiseGrad, error code %d",
-                                   ret));
-    need_wait = true;
+  if (dx) {
+    dx_data = dx->mutable_data<T>(ctx.GetPlace());
   }
-
-  if (dy_need_reduce) {
-    const framework::DDim& dy_dims = dy->dims();
-    std::pair<std::vector<int>, std::vector<int>> reduce_v =
-        XPUReducesAxisVector(out_dim, dy_dims);
-    ret = xpu::reduce_sum<T>(dev_ctx.x_context(), dy_data,
-                             dy->mutable_data<T>(ctx.GetPlace()),
-                             reduce_v.first, reduce_v.second);
-    PADDLE_ENFORCE_EQ(
-        ret, xpu::SUCCESS,
-        platform::errors::External("XPU kernel reduce_sum occur error in "
                                    "XPUElementwiseGrad, error code %d",
-                                   ret));
-    need_wait = true;
+  if (dy) {
+    dy_data = dy->mutable_data<T>(ctx.GetPlace());
   }
 
-  if (need_wait && dev_ctx.x_context()->xpu_stream) {
-    dev_ctx.Wait();
-  }
+  int ret = func(dev_ctx.x_context(), reinterpret_cast<const XPUType*>(x_data),
+                 reinterpret_cast<const XPUType*>(y_data),
+                 reinterpret_cast<const XPUType*>(z_data),
+                 reinterpret_cast<const XPUType*>(dz_data),
+                 reinterpret_cast<XPUType*>(dy_data),
+                 reinterpret_cast<XPUType*>(dx_data), x_dims_vec, y_dims_vec);
+  PADDLE_ENFORCE_EQ(
+      ret, xpu::SUCCESS,
+      platform::errors::External(
+          "XPU kernel Elementwise occur error in XPUElementwise error code ",
+          ret, XPUAPIErrorMsg[ret]));
 }
 
 }  // namespace operators
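With the XPUDimsToBroadcastVector/XPUReducesAxisVector helpers gone, both entry points above reduce broadcasting to one alignment step: pad the shorter shape with 1s so its dims line up with the output starting at `axis`, then pass both padded shapes straight to the xpu::broadcast_* call. A standalone sketch of that alignment (hypothetical harness, not Paddle code):

#include <cassert>
#include <vector>

std::vector<int> align_dims(const std::vector<int>& dims, int max_dim,
                            int axis) {
  std::vector<int> vec(max_dim, 1);  // every axis starts as 1
  if (static_cast<int>(dims.size()) == max_dim) {
    for (int i = 0; i < max_dim; i++) vec[i] = dims[i];
  } else {
    // shorter shape: its dims occupy [axis, axis + dims.size())
    for (size_t i = 0; i < dims.size(); i++) vec[i + axis] = dims[i];
  }
  return vec;
}

int main() {
  // x: [2, 3, 4, 5] with y: [3, 4] and axis = 1 -> y acts as [1, 3, 4, 1].
  assert(align_dims({3, 4}, 4, 1) == (std::vector<int>{1, 3, 4, 1}));
  // Equal ranks pass through unchanged.
  assert(align_dims({2, 3}, 2, 0) == (std::vector<int>{2, 3}));
  return 0;
}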
diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h
index fc80e5ee962f99f65cc7d722ff50dcbc1325790a..a8b2962d4acafd605480147711cf82f9e629ab69 100644
--- a/paddle/fluid/platform/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/xpu/xpu2_op_list.h
@@ -31,6 +31,48 @@ XPUOpMap& get_kl2_ops() {
   static XPUOpMap s_xpu2_kernels{
       {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                             pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_sub",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_sub_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_add",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_add_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_div",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_div_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_pow",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_floordiv",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_mul",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_mul_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_max",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_max_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_min",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
+      {"elementwise_min_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                     pOpKernelType(vartype::FP16, XPUPlace())})},
 
       // AddMore
   };
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py
index c10a58bce1757a347fc554d5664be38a5b05afa3..8fd6b1ff4050ec0dc8e23574939dc6bb62e1a935 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py
@@ -251,7 +251,10 @@ class TestRMSPropV2(XPUOpTest):
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
             avg_cost = fluid.layers.mean(cost)
 
-            rms_optimizer = paddle.optimizer.RMSProp(learning_rate=0.1)
+            print(avg_cost.shape)
+            linear = paddle.nn.Linear(13, 5)
+            rms_optimizer = paddle.optimizer.RMSProp(
+                learning_rate=0.1, parameters=linear.parameters())
             rms_optimizer.minimize(avg_cost)
 
             fetch_list = [avg_cost]