From 86a6be1ae1f83e8f15dfe6a330baca7abd5cbb83 Mon Sep 17 00:00:00 2001 From: taixiurong Date: Mon, 13 Sep 2021 14:30:55 +0800 Subject: [PATCH] add xpu_wait & new implementation replace memcpy in adam, adamw (#35437) --- cmake/external/xpu.cmake | 2 +- paddle/fluid/operators/layer_norm_op_xpu.cc | 51 ++-- paddle/fluid/operators/mean_op_xpu.cc | 59 +++-- .../fluid/operators/optimizers/adam_op_xpu.cc | 87 ++++--- .../operators/optimizers/adamw_op_xpu.cc | 218 ++++++++++++++++++ .../softmax_with_cross_entropy_op_xpu.cc | 80 +++++++ paddle/fluid/operators/sum_op_xpu.cc | 39 ++-- paddle/fluid/operators/transpose_op_xpu.cc | 32 ++- paddle/fluid/platform/xpu/xpu2_op_list.h | 29 +++ 9 files changed, 482 insertions(+), 115 deletions(-) create mode 100644 paddle/fluid/operators/optimizers/adamw_op_xpu.cc diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index cb946fb85c0..02abf08a99c 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -35,7 +35,7 @@ ELSE () ENDIF() SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") -SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210830") +SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210909") SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc index c55250f2708..d5128f7cb21 100644 --- a/paddle/fluid/operators/layer_norm_op_xpu.cc +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -24,6 +24,8 @@ using DDim = framework::DDim; template class LayerNormXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); @@ -39,15 +41,17 @@ class LayerNormXPUKernel : public framework::OpKernel { auto* mean = ctx.Output("Mean"); auto* variance = ctx.Output("Variance"); const auto* x_data = x->data(); - const auto* scale_data = (scale == nullptr ? nullptr : scale->data()); - const auto* bias_data = (bias == nullptr ? nullptr : bias->data()); + const auto* scale_data = + (scale == nullptr ? nullptr : scale->data()); + const auto* bias_data = (bias == nullptr ? nullptr : bias->data()); auto* y_data = y->mutable_data(ctx.GetPlace()); - auto* mean_data = mean->mutable_data(ctx.GetPlace()); - auto* variance_data = variance->mutable_data(ctx.GetPlace()); + auto* mean_data = mean->mutable_data(ctx.GetPlace()); + auto* variance_data = variance->mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); - int r = xpu::layer_norm(dev_ctx.x_context(), x_data, y_data, left, right, - epsilon, scale_data, bias_data, mean_data, - variance_data); + int r = xpu::layer_norm( + dev_ctx.x_context(), reinterpret_cast(x_data), + reinterpret_cast(y_data), left, right, epsilon, scale_data, + bias_data, mean_data, variance_data); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( "XPU layer_norm kernel return wrong value[%d %s]", r, @@ -57,6 +61,8 @@ class LayerNormXPUKernel : public framework::OpKernel { template class LayerNormGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); @@ -75,19 +81,24 @@ class LayerNormGradXPUKernel : public framework::OpKernel { auto* dbias = ctx.Output(framework::GradVarName("Bias")); const auto* x_data = x->data(); const auto* dy_data = dy->data(); - const auto* mean_data = mean->data(); - const auto* variance_data = variance->data(); - const auto* scale_data = (scale == nullptr ? nullptr : scale->data()); + const auto* mean_data = mean->data(); + const auto* variance_data = variance->data(); + const auto* scale_data = + (scale == nullptr ? nullptr : scale->data()); auto* dscale_data = - (dscale == nullptr ? nullptr : dscale->mutable_data(ctx.GetPlace())); - auto* dbias_data = - (dbias == nullptr ? nullptr : dbias->mutable_data(ctx.GetPlace())); + (dscale == nullptr ? nullptr + : dscale->mutable_data(ctx.GetPlace())); + auto* dbias_data = (dbias == nullptr ? nullptr : dbias->mutable_data( + ctx.GetPlace())); auto* dx_data = (dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace())); auto& dev_ctx = ctx.template device_context(); - int r = xpu::layer_norm_grad(dev_ctx.x_context(), x_data, dy_data, dx_data, - left, right, epsilon, scale_data, mean_data, - variance_data, dscale_data, dbias_data); + + int r = xpu::layer_norm_grad( + dev_ctx.x_context(), reinterpret_cast(x_data), + reinterpret_cast(dy_data), + reinterpret_cast(dx_data), left, right, epsilon, scale_data, + mean_data, variance_data, dscale_data, dbias_data); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External( @@ -103,9 +114,13 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( layer_norm, - ops::LayerNormXPUKernel); + ops::LayerNormXPUKernel, + ops::LayerNormXPUKernel); REGISTER_OP_XPU_KERNEL( layer_norm_grad, - ops::LayerNormGradXPUKernel); + ops::LayerNormGradXPUKernel, + ops::LayerNormGradXPUKernel); #endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/mean_op_xpu.cc b/paddle/fluid/operators/mean_op_xpu.cc index 71bcc4be15c..1521265e1b3 100644 --- a/paddle/fluid/operators/mean_op_xpu.cc +++ b/paddle/fluid/operators/mean_op_xpu.cc @@ -23,24 +23,33 @@ namespace operators { template class MeanXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto* input = context.Input("X"); auto* output = context.Output("Out"); output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - const float* x_data = input->data(); - float* y_data = output->data(); - int r = xpu::mean(dev_ctx.x_context(), x_data, y_data, input->numel()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU kernel error, Mean op execution not succeed, error code=%d", - r)); + const T* x_data = input->data(); + T* y_data = output->data(); + std::vector x_shape; + x_shape.push_back(1); + x_shape.push_back(input->numel()); + std::vector rdims = {1}; + int r = xpu::reduce_mean( + dev_ctx.x_context(), reinterpret_cast(x_data), + reinterpret_cast(y_data), x_shape, rdims); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU reduce_mean kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; template class MeanGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto OG = context.Input(framework::GradVarName("Out")); @@ -49,14 +58,24 @@ class MeanGradXPUKernel : public framework::OpKernel { auto IG = context.Output(framework::GradVarName("X")); IG->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - float* dx = IG->data(); - const float* dy = OG->data(); - int r = xpu::mean_grad(dev_ctx.x_context(), dx, dy, IG->numel()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU kernel error. Mean_grad execution not succeed, error code=%d", - r)); + + XPUType* dx = reinterpret_cast(IG->data()); + + const T* dy = OG->data(); + T dy0_value; + xpu_wait(dev_ctx.x_context()->xpu_stream); + memory::Copy(platform::CPUPlace(), &dy0_value, + BOOST_GET_CONST(platform::XPUPlace, OG->place()), dy, + sizeof(T)); + float dy0_fp32 = static_cast(dy0_value); + dy0_fp32 = dy0_fp32 / static_cast(IG->numel()); + + int r = xpu::constant(dev_ctx.x_context(), dx, IG->numel(), + static_cast(dy0_fp32)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; @@ -65,8 +84,12 @@ class MeanGradXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( - mean, ops::MeanXPUKernel); + mean, ops::MeanXPUKernel, + ops::MeanXPUKernel); REGISTER_OP_XPU_KERNEL( mean_grad, - ops::MeanGradXPUKernel); + ops::MeanGradXPUKernel, + ops::MeanGradXPUKernel); #endif diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 172088ebcbd..318a8f14930 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -113,27 +113,27 @@ class AdamOpXPUKernel : public framework::OpKernel { bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - T beta1 = static_cast(ctx.Attr("beta1")); + float beta1 = static_cast(ctx.Attr("beta1")); if (ctx.HasInput("Beta1Tensor")) { auto* beta1_tensor = ctx.Input("Beta1Tensor"); - beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); } - T beta2 = static_cast(ctx.Attr("beta2")); + float beta2 = static_cast(ctx.Attr("beta2")); if (ctx.HasInput("Beta2Tensor")) { auto* beta2_tensor = ctx.Input("Beta2Tensor"); - beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); } - T epsilon = static_cast(ctx.Attr("epsilon")); + float epsilon = static_cast(ctx.Attr("epsilon")); if (ctx.HasInput("EpsilonTensor")) { auto* epsilon_tensor = ctx.Input("EpsilonTensor"); - epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); } if (grad_var->IsType()) { auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", "Adam"); auto& dev_ctx = ctx.template device_context(); - const T* beta1_pow_ptr = beta1_pow.template data(); - const T* beta2_pow_ptr = beta2_pow.template data(); + const float* beta1_pow_ptr = beta1_pow.template data(); + const float* beta2_pow_ptr = beta2_pow.template data(); Tensor xpu_beta1_pow; Tensor xpu_beta2_pow; if (beta1_pow.place() == platform::CPUPlace() && @@ -141,50 +141,49 @@ class AdamOpXPUKernel : public framework::OpKernel { TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow); TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow); dev_ctx.Wait(); - beta1_pow_ptr = xpu_beta1_pow.template data(); - beta2_pow_ptr = xpu_beta2_pow.template data(); + beta1_pow_ptr = xpu_beta1_pow.template data(); + beta2_pow_ptr = xpu_beta2_pow.template data(); } - int r = xpu::adam( - dev_ctx.x_context(), grad.template data(), mom1.template data(), - mom2.template data(), param.template data(), beta1_pow_ptr, - beta2_pow_ptr, beta1, beta2, epsilon, lr.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2_out.template mutable_data(ctx.GetPlace()), - param_out.template mutable_data(ctx.GetPlace()), param.numel()); + + int r = xpu::adam(dev_ctx.x_context(), grad.template data(), + mom1.template data(), mom2.template data(), + param.template data(), beta1_pow_ptr, + beta2_pow_ptr, lr.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2_out.template mutable_data(ctx.GetPlace()), + param_out.template mutable_data(ctx.GetPlace()), + beta1, beta2, epsilon, param.numel()); if (!use_global_beta_pow) { // update in cpu and then copy to xpu if (beta1_pow.place() == platform::CPUPlace() && beta2_pow.place() == platform::CPUPlace()) { - const T* beta1_pow_p = beta1_pow.template data(); - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + const float* beta1_pow_p = beta1_pow.template data(); + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = beta1 * beta1_pow_p[0]; - const T* beta2_pow_p = beta2_pow.template data(); - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + const float* beta2_pow_p = beta2_pow.template data(); + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = beta2 * beta2_pow_p[0]; - + xpu_wait(dev_ctx.x_context()->xpu_stream); } else { - T cpu_beta1_pow_out_data; - T cpu_beta2_pow_out_data; - - memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data, - BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()), - beta1_pow_ptr, sizeof(T)); - - cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1; - memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data, - BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()), - beta2_pow_ptr, sizeof(T)); - - cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2; - - T* beta1_pow_out_p = beta1_pow_out->mutable_data(ctx.GetPlace()); - T* beta2_pow_out_p = beta2_pow_out->mutable_data(ctx.GetPlace()); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - beta1_pow_out_p, platform::CPUPlace(), - &cpu_beta1_pow_out_data, sizeof(T)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - beta2_pow_out_p, platform::CPUPlace(), - &cpu_beta2_pow_out_data, sizeof(T)); + float* beta1_pow_out_p = + beta1_pow_out->mutable_data(ctx.GetPlace()); + float* beta2_pow_out_p = + beta2_pow_out->mutable_data(ctx.GetPlace()); + int r = + xpu::scale(dev_ctx.x_context(), beta1_pow_ptr, beta1_pow_out_p, + beta1_pow.numel(), false, beta1, 0.0f); + PADDLE_ENFORCE_EQ( + r, xpu::SUCCESS, + platform::errors::External( + "XPU kernel scale occur error in adamw error code ", r, + XPUAPIErrorMsg[r])); + r = xpu::scale(dev_ctx.x_context(), beta2_pow_ptr, beta2_pow_out_p, + beta2_pow.numel(), false, beta2, 0.0f); + PADDLE_ENFORCE_EQ( + r, xpu::SUCCESS, + platform::errors::External( + "XPU kernel scale occur error in adamw error code ", r, + XPUAPIErrorMsg[r])); } PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, diff --git a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc new file mode 100644 index 00000000000..c20bd6a9fad --- /dev/null +++ b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc @@ -0,0 +1,218 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gflags/gflags.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +#ifdef PADDLE_WITH_XPU +template +class AdamwOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type,Expected Var(%s)'s " + "type is LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + using paddle::framework::LoDTensor; + + auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", + "Param", "Adam"); + // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + auto* grad_var = ctx.InputVar("Grad"); + auto& mom1 = GET_DATA_SAFELY(ctx.Input("Moment1"), "Input", + "Moment1", "Adam"); + auto& mom2 = GET_DATA_SAFELY(ctx.Input("Moment2"), "Input", + "Moment2", "Adam"); + auto& lr = GET_DATA_SAFELY(ctx.Input("LearningRate"), "Input", + "LearningRate", "Adam"); + auto& beta1_pow = GET_DATA_SAFELY(ctx.Input("Beta1Pow"), "Input", + "Beta1Pow", "Adam"); + auto& beta2_pow = GET_DATA_SAFELY(ctx.Input("Beta2Pow"), "Input", + "Beta2Pow", "Adam"); + + auto& param_out = GET_DATA_SAFELY(ctx.Output("ParamOut"), + "Output", "ParamOut", "Adam"); + auto& mom1_out = GET_DATA_SAFELY(ctx.Output("Moment1Out"), + "Output", "Moment1Out", "Adam"); + auto& mom2_out = GET_DATA_SAFELY(ctx.Output("Moment2Out"), + "Output", "Moment2Out", "Adam"); + + auto* beta1_pow_out = ctx.Output("Beta1PowOut"); + auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + TensorToVector(*skip_update_tensor, ctx.device_context(), + &skip_update_vec); + skip_update = skip_update_vec[0]; + } + auto& dev_ctx = ctx.template device_context(); + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update) { + VLOG(4) << "Adam skip update"; + framework::TensorCopy(param, ctx.GetPlace(), dev_ctx, ¶m_out); + framework::TensorCopy(mom1, ctx.GetPlace(), dev_ctx, &mom1_out); + framework::TensorCopy(mom2, ctx.GetPlace(), dev_ctx, &mom2_out); + framework::TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, beta1_pow_out); + framework::TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, beta2_pow_out); + return; + } + + bool with_decay = ctx.Attr("with_decay"); + + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "Tensor holds the wrong size, Expected beta1 pow " + "output size is 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "Tensor holds the wrong size, Expected beta2 pow " + "output size is 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + float beta1 = static_cast(ctx.Attr("beta1")); + if (ctx.HasInput("Beta1Tensor")) { + auto* beta1_tensor = ctx.Input("Beta1Tensor"); + beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } + float beta2 = static_cast(ctx.Attr("beta2")); + if (ctx.HasInput("Beta2Tensor")) { + auto* beta2_tensor = ctx.Input("Beta2Tensor"); + beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } + float epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } + if (grad_var->IsType()) { + auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", + "Grad", "Adam"); + + const float* beta1_pow_ptr = beta1_pow.template data(); + const float* beta2_pow_ptr = beta2_pow.template data(); + Tensor xpu_beta1_pow; + Tensor xpu_beta2_pow; + if (beta1_pow.place() == platform::CPUPlace() && + beta2_pow.place() == platform::CPUPlace()) { + TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow); + TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow); + dev_ctx.Wait(); + beta1_pow_ptr = xpu_beta1_pow.template data(); + beta2_pow_ptr = xpu_beta2_pow.template data(); + } + if (with_decay) { + float coeff = ctx.Attr("coeff"); + int r = + xpu::adamw(dev_ctx.x_context(), grad.template data(), + mom1.template data(), mom2.template data(), + param.template data(), beta1_pow_ptr, beta2_pow_ptr, + lr.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2_out.template mutable_data(ctx.GetPlace()), + param_out.template mutable_data(ctx.GetPlace()), + beta1, beta2, epsilon, coeff, param.numel()); + PADDLE_ENFORCE_EQ( + r, xpu::SUCCESS, + platform::errors::External( + "XPU kernel adamw occur error in adamw error code ", r, + XPUAPIErrorMsg[r])); + } else { + int r = + xpu::adam(dev_ctx.x_context(), grad.template data(), + mom1.template data(), mom2.template data(), + param.template data(), beta1_pow_ptr, beta2_pow_ptr, + lr.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2_out.template mutable_data(ctx.GetPlace()), + param_out.template mutable_data(ctx.GetPlace()), beta1, + beta2, epsilon, param.numel()); + PADDLE_ENFORCE_EQ( + r, xpu::SUCCESS, + platform::errors::External( + "XPU kernel adam occur error in adamw error code ", r, + XPUAPIErrorMsg[r])); + } + + if (!use_global_beta_pow) { + // update in cpu and then copy to xpu + if (beta1_pow.place() == platform::CPUPlace() && + beta2_pow.place() == platform::CPUPlace()) { + const float* beta1_pow_p = beta1_pow.template data(); + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow_p[0]; + const float* beta2_pow_p = beta2_pow.template data(); + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow_p[0]; + xpu_wait(dev_ctx.x_context()->xpu_stream); + } else { + float* beta1_pow_out_p = + beta1_pow_out->mutable_data(ctx.GetPlace()); + float* beta2_pow_out_p = + beta2_pow_out->mutable_data(ctx.GetPlace()); + int r = + xpu::scale(dev_ctx.x_context(), beta1_pow_ptr, beta1_pow_out_p, + beta1_pow.numel(), false, beta1, 0.0f); + PADDLE_ENFORCE_EQ( + r, xpu::SUCCESS, + platform::errors::External( + "XPU kernel scale occur error in adamw error code ", r, + XPUAPIErrorMsg[r])); + r = xpu::scale(dev_ctx.x_context(), beta2_pow_ptr, beta2_pow_out_p, + beta2_pow.numel(), false, beta2, 0.0f); + PADDLE_ENFORCE_EQ( + r, xpu::SUCCESS, + platform::errors::External( + "XPU kernel scale occur error in adamw error code ", r, + XPUAPIErrorMsg[r])); + } + } + } else { + PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument( + "Variable type not supported by adamw_op")); + } + } +}; +#endif + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +#ifdef PADDLE_WITH_XPU +REGISTER_OP_XPU_KERNEL(adamw, ops::AdamwOpXPUKernel); +#endif diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc index a79e31eb8d0..63f0548a0c6 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -54,9 +54,11 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { int len = logits->numel(); T* clip_logits_data = clip_logits.mutable_data(context.GetPlace(), len * sizeof(T)); + r = xpu::clip_v2(dev_ctx.x_context(), logits->data(), clip_logits_data, len, static_cast(-1e20), static_cast(1e20)); + PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External("XPU kernel error. clip " @@ -108,10 +110,88 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { } } }; + +template +class SoftmaxWithCrossEntropyGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* out_grad = + context.Input(framework::GradVarName("Loss")); + const Tensor* labels = context.Input("Label"); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + + logit_grad->mutable_data(context.GetPlace()); + + const Tensor* softmax = context.Input("Softmax"); + const bool use_softmax = context.Attr("use_softmax"); + + const bool soft_label = context.Attr("soft_label"); + auto ignore_index = context.Attr("ignore_index"); + + const int rank = logit_grad->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( + "axis should == rank - 1")); + const int n = SizeToAxis(axis, logit_grad->dims()); + const int d = SizeFromAxis(axis, logit_grad->dims()); + + auto& dev_ctx = + context.template device_context(); + int r = XPU_SUCCESS; + + if (soft_label) { + r = xpu::soft_softmax_with_cross_entropy_grad( + dev_ctx.x_context(), + reinterpret_cast(out_grad->data()), + reinterpret_cast(labels->data()), + reinterpret_cast(softmax->data()), + reinterpret_cast(logit_grad->data()), use_softmax, n, d); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API(soft_softmax_with_cross_entropy_grad) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } else { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int* labels_int_ptr_l3 = + RAII_GUARD.alloc_l3_or_gm(labels->numel()); + r = xpu::cast_v2(dev_ctx.x_context(), + labels->data(), + labels_int_ptr_l3, labels->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + + r = xpu::hard_softmax_with_cross_entropy_grad( + dev_ctx.x_context(), + reinterpret_cast(out_grad->data()), + labels_int_ptr_l3, + reinterpret_cast(softmax->data()), + reinterpret_cast(logit_grad->data()), ignore_index, + use_softmax, n, d); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API(hard_softmax_with_cross_entropy_grad) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyXPUKernel); +REGISTER_OP_XPU_KERNEL( + softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradXPUKernel, + ops::SoftmaxWithCrossEntropyGradXPUKernel); #endif diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc index d16bb5562ed..89a48a2144e 100644 --- a/paddle/fluid/operators/sum_op_xpu.cc +++ b/paddle/fluid/operators/sum_op_xpu.cc @@ -21,6 +21,8 @@ using framework::Tensor; template class SumXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &context) const override { auto in_vars = context.MultiInputVar("X"); @@ -35,8 +37,7 @@ class SumXPUKernel : public framework::OpKernel { out->mutable_data(context.GetPlace()); } auto &dev_ctx = context.template device_context(); - std::vector ptrs(N, nullptr); - int valid_count = 0; + std::vector ptrs; for (int i = 0; i < N; ++i) { PADDLE_ENFORCE_EQ( in_vars[i]->IsType(), true, @@ -45,30 +46,14 @@ class SumXPUKernel : public framework::OpKernel { if (in_t.numel() == 0) { continue; } - ptrs[valid_count] = reinterpret_cast(in_t.data()); - valid_count++; - } - int r = xpu::sum_batch(dev_ctx.x_context(), ptrs.data(), out->data(), - valid_count, out->numel()); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of SumOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of SumOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of SumOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); + ptrs.push_back(reinterpret_cast(in_t.data())); } + int r = xpu::sum(dev_ctx.x_context(), ptrs, + reinterpret_cast(out->data()), out->numel()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU sum kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; @@ -78,5 +63,7 @@ class SumXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( - sum, ops::SumXPUKernel); + sum, ops::SumXPUKernel, + ops::SumXPUKernel); #endif diff --git a/paddle/fluid/operators/transpose_op_xpu.cc b/paddle/fluid/operators/transpose_op_xpu.cc index 360c2125ed1..0e25a69f87c 100644 --- a/paddle/fluid/operators/transpose_op_xpu.cc +++ b/paddle/fluid/operators/transpose_op_xpu.cc @@ -26,6 +26,8 @@ using framework::Tensor; template class TransposeXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto x = context.Input("X"); @@ -46,8 +48,9 @@ class TransposeXPUKernel : public framework::OpKernel { x_shape_host[i] = x_dims[i]; } auto& dev_ctx = context.template device_context(); - int r = xpu::transpose(dev_ctx.x_context(), x_data, y_data, x_shape_host, - axis); + int r = xpu::transpose( + dev_ctx.x_context(), reinterpret_cast(x_data), + reinterpret_cast(y_data), x_shape_host, axis); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External("XPU kernel error! error code=%d", r)); @@ -56,6 +59,8 @@ class TransposeXPUKernel : public framework::OpKernel { template class TransposeGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto* out_grad = @@ -77,8 +82,11 @@ class TransposeGradXPUKernel : public framework::OpKernel { out_shape_host[i] = out_grad->dims()[i]; } auto& dev_ctx = context.template device_context(); - int r = xpu::transpose(dev_ctx.x_context(), out_grad->data(), - x_grad->data(), out_shape_host, reversed_axis); + int r = xpu::transpose( + dev_ctx.x_context(), + reinterpret_cast(out_grad->data()), + reinterpret_cast(x_grad->data()), out_shape_host, + reversed_axis); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External("XPU kernel error! error code=%d", r)); @@ -92,15 +100,23 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( transpose, - ops::TransposeXPUKernel); + ops::TransposeXPUKernel, + ops::TransposeXPUKernel); REGISTER_OP_XPU_KERNEL( transpose_grad, - ops::TransposeGradXPUKernel); + ops::TransposeGradXPUKernel, + ops::TransposeGradXPUKernel); REGISTER_OP_XPU_KERNEL( transpose2, - ops::TransposeXPUKernel); + ops::TransposeXPUKernel, + ops::TransposeXPUKernel); REGISTER_OP_XPU_KERNEL( transpose2_grad, - ops::TransposeGradXPUKernel); + ops::TransposeGradXPUKernel, + ops::TransposeGradXPUKernel); #endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h index 0989f215687..5b9e1a34bfc 100644 --- a/paddle/fluid/platform/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/xpu/xpu2_op_list.h @@ -79,6 +79,35 @@ XPUOpMap& get_kl2_ops() { {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"layer_norm_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax_with_cross_entropy", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax_with_cross_entropy_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"transpose2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + // AddMore }; -- GitLab