From a012d426b634bdccca119eaeff79191bffcb962c Mon Sep 17 00:00:00 2001
From: mengqingchun02 <103740521+mengqingchun02@users.noreply.github.com>
Date: Wed, 24 Aug 2022 16:13:01 +0800
Subject: [PATCH] Support fp16 of adam operator in xpu environment (#45292)

* support beam_search operator on xpu. test=kunlun

* support beam_search operator on xpu. test=kunlun

* support beam_search operator on xpu. test=kunlun

* support beam_search operator on xpu. test=kunlun

* support beam_search operator on xpu. test=kunlun

* support fp16 of adam operator in xpu environment. test=kunlun

* support fp16 of adam operator in xpu environment. test=kunlun

* support fp16 of adam operator in xpu environment. test=kunlun
---
 .../fluid/operators/optimizers/adam_op_xpu.cc | 555 +++++++++++++-----
 .../fluid/platform/device/xpu/xpu2_op_list.h  |   4 +-
 .../tests/unittests/xpu/test_adam_op_xpu.py   | 105 ++++
 3 files changed, 513 insertions(+), 151 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc
index 20fb7cf2653..c9e2f71c9e2 100644
--- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc
@@ -2,7 +2,6 @@
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
 
 http://www.apache.org/licenses/LICENSE-2.0
 
@@ -14,14 +13,174 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/optimizers/adam_op_functor.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+using float16 = paddle::platform::float16;
 
 #ifdef PADDLE_WITH_XPU
+template <typename T1, typename T2>
+static int ConvertDataByType(const T1* x,
+                             T2** y,
+                             int len,
+                             bool allocateFlag,
+                             const framework::ExecutionContext& ctx) {
+  if (nullptr == x || nullptr == y || len <= 0)
+    return xpu::Error_t::INVALID_PARAM;
+  int r = 0;
+  if (allocateFlag) {
+    r = xpu_malloc(reinterpret_cast<void**>(y), sizeof(T2) * len);
+
+    PADDLE_ENFORCE_EQ(
+        r,
+        xpu::Error_t::SUCCESS,
+        platform::errors::External(
+            "Alloc memory in xpu for result data failed with [%d]", r));
+  }
+
+  T1* cpu_data = reinterpret_cast<T1*>(malloc(sizeof(T1) * len));
+
+  paddle::memory::Copy(paddle::platform::CPUPlace(),
+                       cpu_data,
+                       ctx.GetPlace(),
+                       x,
+                       len * sizeof(T1));
+
+  T2* cpu_real_data = reinterpret_cast<T2*>(malloc(sizeof(T2) * len));
+  for (int i = 0; i < len; i++) cpu_real_data[i] = static_cast<T2>(cpu_data[i]);
+
+  paddle::memory::Copy(ctx.GetPlace(),
+                       *y,
+                       paddle::platform::CPUPlace(),
+                       cpu_real_data,
+                       len * sizeof(T2));
+
+  free(cpu_data);
+  free(cpu_real_data);
+
+  return xpu::Error_t::SUCCESS;
+}
+
+template <typename T>
+static void getDataPointer(const phi::DenseTensor& tensorData,
+                           T** result,
+                           const framework::ExecutionContext& ctx) {
+  if (tensorData.dtype() == paddle::experimental::DataType::FLOAT16) {
+    const float16* real_data = tensorData.template data<float16>();
+    int len = tensorData.numel();
+
+    int r = ConvertDataByType(real_data, result, len, true, ctx);
+    PADDLE_ENFORCE_EQ(
+        r,
+        xpu::Error_t::SUCCESS,
+        platform::errors::External(
+            "execute function ConvertDataByType failed with [%d]", r));
+  }
+}
+
+template <typename T>
+static void getOutDataPointer(phi::DenseTensor* tensorData,
+                              Tensor* out,
+                              T** result,
+                              const framework::ExecutionContext& ctx) {
+  if (tensorData->dtype() == paddle::experimental::DataType::FLOAT16) {
+    *result = out->template mutable_data<T>(ctx.GetPlace());
+  } else {
+    *result = tensorData->template mutable_data<T>(ctx.GetPlace());
+  }
+}
+
+template <typename T>
+static void copyOutData(const Tensor& srcTensor,
+                        phi::DenseTensor* dstTensor,
+                        const framework::ExecutionContext& ctx) {
+  if (dstTensor->dtype() == paddle::experimental::DataType::FLOAT16) {
+    const T* xpu_out_data = srcTensor.template data<T>();
+    float16* out_data =
+        dstTensor->template mutable_data<float16>(ctx.GetPlace());
+
+    int len = srcTensor.numel();
+
+    int r = ConvertDataByType(xpu_out_data, &out_data, len, false, ctx);
+    PADDLE_ENFORCE_EQ(
+        r,
+        xpu::Error_t::SUCCESS,
+        platform::errors::External(
+            "execute function ConvertDataByType failed with [%d]", r));
+  }
+}
+
+template <typename T>
+static void setBetaData(const phi::DenseTensor& beta_pow,
+                        phi::DenseTensor* beta_pow_out,
+                        const T& beta) {
+  if (beta_pow.dtype() == paddle::experimental::DataType::FLOAT16) {
+    const float16* beta_pow_p = beta_pow.template data<float16>();
+    beta_pow_out->mutable_data<float16>(platform::CPUPlace())[0] =
+        static_cast<float16>(beta) * beta_pow_p[0];
+  } else {
+    const T* beta_pow_p = beta_pow.template data<T>();
+    beta_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
+        beta * beta_pow_p[0];
+  }
+}
+
+template <typename T>
+static void scale(phi::DenseTensor* beta_pow_out,
+                  const phi::DenseTensor& beta_pow,
+                  T* beta_pow_ptr,
+                  const T& beta,
+                  const framework::ExecutionContext& ctx) {
+  float16* beta_pow_out_p2 =
+      beta_pow_out->mutable_data<float16>(ctx.GetPlace());
+
+  Tensor xpu_beta_pow_out;
+  const phi::DenseTensorMeta meta_beta_pow_out(
+      paddle::experimental::DataType::FLOAT32, beta_pow_out->dims());
+  xpu_beta_pow_out.set_meta(meta_beta_pow_out);
+
+  T* beta_pow_out_ptr =
+      xpu_beta_pow_out.template mutable_data<T>(ctx.GetPlace());
+
+  auto& dev_ctx =
+      ctx.template device_context<paddle::platform::XPUDeviceContext>();
+  int r = xpu::scale(dev_ctx.x_context(),
+                     beta_pow_ptr,
+                     beta_pow_out_ptr,
+                     beta_pow.numel(),
+                     false,
+                     beta,
+                     0.0f);
+  PADDLE_ENFORCE_EQ(r,
+                    xpu::SUCCESS,
+                    platform::errors::External(
+                        "XPU kernel scale occur error in adam error code ",
+                        r,
+                        XPUAPIErrorMsg[r]));
+
+  const float* xpu_beta_pow_out_data = xpu_beta_pow_out.template data<T>();
+  int len = xpu_beta_pow_out.numel();
+
+  r = ConvertDataByType(
+      xpu_beta_pow_out_data, &beta_pow_out_p2, len, false, ctx);
+  PADDLE_ENFORCE_EQ(
+      r,
+      xpu::Error_t::SUCCESS,
+      platform::errors::External(
+          "execute function ConvertDataByType failed with [%d]", r));
+}
+
+template <typename T>
+static void freeData(const phi::DenseTensor& tensorData, T* dataPtr) {
+  if (tensorData.dtype() == paddle::experimental::DataType::FLOAT16)
+    xpu_free(dataPtr);
+}
+
 template <typename DeviceContext, typename T>
 class AdamOpXPUKernel : public framework::OpKernel<T> {
  public:
@@ -39,25 +198,93 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
     auto& param = GET_DATA_SAFELY(
         ctx.Input<Tensor>("Param"), "Input", "Param", "Adam");
-    // auto& grad = Ref(ctx.Input<Tensor>("Grad"), "Must set Grad");
+
+    float* param_ptr = nullptr;
+    getDataPointer(param, &param_ptr, ctx);
+
     auto* grad_var = ctx.InputVar("Grad");
+    float* grad_c = nullptr;
+
     auto& mom1 = GET_DATA_SAFELY(
         ctx.Input<Tensor>("Moment1"), "Input", "Moment1", "Adam");
+    float* mom1_ptr = nullptr;
+    getDataPointer(mom1, &mom1_ptr, ctx);
+
     auto& mom2 = GET_DATA_SAFELY(
         ctx.Input<Tensor>("Moment2"), "Input", "Moment2", "Adam");
+    float* mom2_ptr = nullptr;
+    getDataPointer(mom2, &mom2_ptr, ctx);
+
     auto& lr = GET_DATA_SAFELY(
         ctx.Input<Tensor>("LearningRate"), "Input", "LearningRate", "Adam");
+    float* lr_ptr = nullptr;
+    getDataPointer(lr, &lr_ptr, ctx);
+
     auto& beta1_pow = GET_DATA_SAFELY(
         ctx.Input<Tensor>("Beta1Pow"), "Input", "Beta1Pow", "Adam");
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    float* beta1_pow_ptr = nullptr;
+    const float* beta1_const_pow_ptr = nullptr;
+    if (beta1_pow.place() == platform::CPUPlace()) {
+      Tensor xpu_beta1_pow;
+      paddle::framework::TensorCopy(
+          beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow);
+      if (xpu_beta1_pow.dtype() == paddle::experimental::DataType::FLOAT16)
+        getDataPointer(xpu_beta1_pow, &beta1_pow_ptr, ctx);
+      else
+        beta1_const_pow_ptr = xpu_beta1_pow.template data<float>();
+    } else {
+      if (beta1_pow.dtype() == paddle::experimental::DataType::FLOAT16)
+        getDataPointer(beta1_pow, &beta1_pow_ptr, ctx);
+      else
+        beta1_const_pow_ptr = beta1_pow.template data<float>();
+    }
+
     auto& beta2_pow = GET_DATA_SAFELY(
         ctx.Input<Tensor>("Beta2Pow"), "Input", "Beta2Pow", "Adam");
+    float* beta2_pow_ptr = nullptr;
+    const float* beta2_const_pow_ptr = nullptr;
+    if (beta2_pow.place() == platform::CPUPlace()) {
+      Tensor xpu_beta2_pow;
+      paddle::framework::TensorCopy(
+          beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow);
+      if (xpu_beta2_pow.dtype() == paddle::experimental::DataType::FLOAT16)
+        getDataPointer(xpu_beta2_pow, &beta2_pow_ptr, ctx);
+      else
+        beta2_const_pow_ptr = xpu_beta2_pow.template data<float>();
+    } else {
+      if (beta2_pow.dtype() == paddle::experimental::DataType::FLOAT16)
+        getDataPointer(beta2_pow, &beta2_pow_ptr, ctx);
+      else
+        beta2_const_pow_ptr = beta2_pow.template data<float>();
+    }
 
     auto& param_out = GET_DATA_SAFELY(
         ctx.Output<Tensor>("ParamOut"), "Output", "ParamOut", "Adam");
+    Tensor xpu_param_out;
+    float* param_out_ptr = nullptr;
+    const phi::DenseTensorMeta meta_param(
+        paddle::experimental::DataType::FLOAT32, param_out.dims());
+    xpu_param_out.set_meta(meta_param);
+    getOutDataPointer(&param_out, &xpu_param_out, &param_out_ptr, ctx);
+
     auto& mom1_out = GET_DATA_SAFELY(
         ctx.Output<Tensor>("Moment1Out"), "Output", "Moment1Out", "Adam");
+    Tensor xpu_mom1_out;
+    float* mom1_out_ptr = nullptr;
+    const phi::DenseTensorMeta meta_mom1(
+        paddle::experimental::DataType::FLOAT32, mom1_out.dims());
+    xpu_mom1_out.set_meta(meta_mom1);
+    getOutDataPointer(&mom1_out, &xpu_mom1_out, &mom1_out_ptr, ctx);
+
     auto& mom2_out = GET_DATA_SAFELY(
         ctx.Output<Tensor>("Moment2Out"), "Output", "Moment2Out", "Adam");
+    Tensor xpu_mom2_out;
+    float* mom2_out_ptr = nullptr;
+    const phi::DenseTensorMeta meta_mom2(
+        paddle::experimental::DataType::FLOAT32, mom2_out.dims());
+    xpu_mom2_out.set_meta(meta_mom2);
+    getOutDataPointer(&mom2_out, &xpu_mom2_out, &mom2_out_ptr, ctx);
 
     auto* beta1_pow_out = ctx.Output<Tensor>("Beta1PowOut");
     auto* beta2_pow_out = ctx.Output<Tensor>("Beta2PowOut");
@@ -136,101 +363,110 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
       auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
       beta2 = static_cast<float>(GetAttrFromTensor(beta2_tensor));
     }
-    float epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+    float epsilon = static_cast<float>(ctx.Attr<float>("epsilon"));
     if (ctx.HasInput("EpsilonTensor")) {
       auto* epsilon_tensor = ctx.Input<framework::Tensor>("EpsilonTensor");
       epsilon = static_cast<float>(GetAttrFromTensor(epsilon_tensor));
     }
+
     if (grad_var->IsType<framework::LoDTensor>()) {
       auto& grad = GET_DATA_SAFELY(
           ctx.Input<Tensor>("Grad"), "Input", "Grad", "Adam");
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      const float* beta1_pow_ptr = beta1_pow.template data<float>();
-      const float* beta2_pow_ptr = beta2_pow.template data<float>();
-      Tensor xpu_beta1_pow;
-      Tensor xpu_beta2_pow;
-      if (beta1_pow.place() == platform::CPUPlace() &&
-          beta2_pow.place() == platform::CPUPlace()) {
-        paddle::framework::TensorCopy(
-            beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow);
-        paddle::framework::TensorCopy(
-            beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow);
-        dev_ctx.Wait();
-        beta1_pow_ptr = xpu_beta1_pow.template data<float>();
-        beta2_pow_ptr = xpu_beta2_pow.template data<float>();
-      }
+      getDataPointer(grad, &grad_c, ctx);
 
-      int r = xpu::adam(dev_ctx.x_context(),
-                        grad.template data<float>(),
-                        mom1.template data<float>(),
-                        mom2.template data<float>(),
-                        param.template data<float>(),
-                        beta1_pow_ptr,
-                        beta2_pow_ptr,
-                        lr.template data<float>(),
-                        mom1_out.template mutable_data<float>(ctx.GetPlace()),
-                        mom2_out.template mutable_data<float>(ctx.GetPlace()),
-                        param_out.template mutable_data<float>(ctx.GetPlace()),
-                        beta1,
-                        beta2,
-                        epsilon,
-                        param.numel());
+      int r = xpu::adam(
+          dev_ctx.x_context(),
+          grad_c != nullptr ? grad_c : grad.template data<float>(),
+          mom1_ptr != nullptr ? mom1_ptr : mom1.template data<float>(),
+          mom2_ptr != nullptr ? mom2_ptr : mom2.template data<float>(),
+          param_ptr != nullptr ? param_ptr : param.template data<float>(),
+          beta1_pow_ptr != nullptr ? beta1_pow_ptr : beta1_const_pow_ptr,
+          beta2_pow_ptr != nullptr ? beta2_pow_ptr : beta2_const_pow_ptr,
+          lr_ptr != nullptr ? lr_ptr : lr.template data<float>(),
+          mom1_out_ptr,
+          mom2_out_ptr,
+          param_out_ptr,
+          beta1,
+          beta2,
+          epsilon,
+          param.numel());
 
       xpu_wait(dev_ctx.x_context()->xpu_stream);
       PADDLE_ENFORCE_EQ(
          r == xpu::Error_t::SUCCESS,
          true,
          platform::errors::External("XPU API return wrong value[%d],", r));
+
+      freeData(grad, grad_c);
+
+      copyOutData(xpu_mom1_out, &mom1_out, ctx);
+      copyOutData(xpu_mom2_out, &mom2_out, ctx);
+      copyOutData(xpu_param_out, &param_out, ctx);
+
       if (!use_global_beta_pow) {
         // update in cpu and then copy to xpu
         if (beta1_pow.place() == platform::CPUPlace() &&
             beta2_pow.place() == platform::CPUPlace()) {
-          const float* beta1_pow_p = beta1_pow.template data<float>();
-          beta1_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
-              beta1 * beta1_pow_p[0];
-          const float* beta2_pow_p = beta2_pow.template data<float>();
-          beta2_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
-              beta2 * beta2_pow_p[0];
+          setBetaData(beta1_pow, beta1_pow_out, beta1);
+
+          setBetaData(beta2_pow, beta2_pow_out, beta2);
         } else {
-          float* beta1_pow_out_p =
-              beta1_pow_out->mutable_data<float>(ctx.GetPlace());
-          float* beta2_pow_out_p =
-              beta2_pow_out->mutable_data<float>(ctx.GetPlace());
-          int r = xpu::scale(dev_ctx.x_context(),
-                             beta1_pow_ptr,
-                             beta1_pow_out_p,
-                             beta1_pow.numel(),
-                             false,
-                             beta1,
-                             0.0f);
-          PADDLE_ENFORCE_EQ(
-              r,
-              xpu::SUCCESS,
-              platform::errors::External(
-                  "XPU kernel scale occur error in adam error code ",
-                  r,
-                  XPUAPIErrorMsg[r]));
-          r = xpu::scale(dev_ctx.x_context(),
-                         beta2_pow_ptr,
-                         beta2_pow_out_p,
-                         beta2_pow.numel(),
-                         false,
-                         beta2,
-                         0.0f);
-          PADDLE_ENFORCE_EQ(
-              r,
-              xpu::SUCCESS,
-              platform::errors::External(
-                  "XPU kernel scale occur error in adam error code ",
-                  r,
-                  XPUAPIErrorMsg[r]));
-
-          xpu_wait(dev_ctx.x_context()->xpu_stream);
+          float* beta1_pow_out_p1 = nullptr;
+
+          if (beta1_pow_out->dtype() ==
+              paddle::experimental::DataType::FLOAT16) {
+            scale<float>(
+                beta1_pow_out, beta1_pow, beta1_pow_ptr, beta1, ctx);
+          } else {
+            const float* beta1_pow_data = beta1_pow.template data<float>();
+            beta1_pow_out_p1 =
+                beta1_pow_out->mutable_data<float>(ctx.GetPlace());
+            r = xpu::scale(dev_ctx.x_context(),
+                           beta1_pow_data,
+                           beta1_pow_out_p1,
+                           beta1_pow.numel(),
+                           false,
+                           beta1,
+                           0.0f);
+            xpu_wait(dev_ctx.x_context()->xpu_stream);
+            PADDLE_ENFORCE_EQ(
+                r,
+                xpu::SUCCESS,
+                platform::errors::External(
+                    "XPU kernel scale occur error in adam error code ",
+                    r,
+                    XPUAPIErrorMsg[r]));
+          }
+
+          float* beta2_pow_out_p1 = nullptr;
+          if (beta2_pow_out->dtype() ==
+              paddle::experimental::DataType::FLOAT16) {
+            scale<float>(
+                beta2_pow_out, beta2_pow, beta2_pow_ptr, beta2, ctx);
+          } else {
+            const float* beta2_pow_data = beta2_pow.template data<float>();
+            beta2_pow_out_p1 =
+                beta2_pow_out->mutable_data<float>(ctx.GetPlace());
+            r = xpu::scale(dev_ctx.x_context(),
+                           beta2_pow_data,
+                           beta2_pow_out_p1,
+                           beta2_pow.numel(),
+                           false,
+                           beta2,
+                           0.0f);
+            xpu_wait(dev_ctx.x_context()->xpu_stream);
+            PADDLE_ENFORCE_EQ(
+                r,
+                xpu::SUCCESS,
+                platform::errors::External(
+                    "XPU kernel scale occur error in adam error code ",
+                    r,
+                    XPUAPIErrorMsg[r]));
+          }
         }
       }
     } else if (grad_var->IsType<phi::SelectedRows>()) {
       auto* grad = ctx.Input<phi::SelectedRows>("Grad");
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
 
       if (grad->rows().size() == 0) {
         VLOG(3) << "grad row size is 0!!";
@@ -251,7 +487,7 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
       if (is_strict_sorted) {
         grad_merge_ptr = grad;
       } else {
-        scatter::MergeAdd<platform::XPUDeviceContext, T> merge_func;
+        scatter::MergeAdd<platform::XPUDeviceContext, float> merge_func;
         merge_func(ctx.template device_context<platform::XPUDeviceContext>(),
                    *grad,
                    &tmp_grad_merge,
@@ -260,23 +496,12 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
         xpu_wait(dev_ctx.x_context()->xpu_stream);
         grad_merge_ptr = &tmp_grad_merge;
       }
-      const T* beta1_pow_ptr = beta1_pow.template data<T>();
-      const T* beta2_pow_ptr = beta2_pow.template data<T>();
-      Tensor xpu_beta1_pow;
-      Tensor xpu_beta2_pow;
-      if (beta1_pow.place() == platform::CPUPlace() &&
-          beta2_pow.place() == platform::CPUPlace()) {
-        paddle::framework::TensorCopy(
-            beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow);
-        paddle::framework::TensorCopy(
-            beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow);
-        dev_ctx.Wait();
-        beta1_pow_ptr = xpu_beta1_pow.template data<T>();
-        beta2_pow_ptr = xpu_beta2_pow.template data<T>();
-      }
+
       auto& grad_merge = *grad_merge_ptr;
       auto& grad_tensor = grad_merge.value();
-      const T* grad_data = grad_tensor.template data<T>();
+
+      getDataPointer(grad_tensor, &grad_c, ctx);
+
       int row_count = grad_merge.rows().size();
       std::vector<int> rows(row_count);
       xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
@@ -296,84 +521,111 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
       auto ori_rows = param.numel() / row_numel;
 
       int lazy_mode = static_cast<int>(ctx.Attr<bool>("lazy_mode"));
-      int r =
-          xpu::sparse_adam(dev_ctx.x_context(),
-                           grad_data,
-                           mom1.template data<T>(),
-                           mom2.template data<T>(),
-                           param.template data<T>(),
-                           beta1_pow_ptr,
-                           beta2_pow_ptr,
-                           lr.template data<T>(),
-                           mom1_out.template mutable_data<T>(ctx.GetPlace()),
-                           mom2_out.template mutable_data<T>(ctx.GetPlace()),
-                           param_out.template mutable_data<T>(ctx.GetPlace()),
-                           beta1,
-                           beta2,
-                           epsilon,
-                           ori_rows,
-                           xpu_rows,
-                           row_numel,
-                           grad_merge.rows().size(),
-                           lazy_mode);
+      int r = xpu::sparse_adam(
+          dev_ctx.x_context(),
+          grad_c != nullptr ? grad_c : grad_tensor.template data<float>(),
+          mom1_ptr != nullptr ? mom1_ptr : mom1.template data<float>(),
+          mom2_ptr != nullptr ? mom2_ptr : mom2.template data<float>(),
+          param_ptr != nullptr ? param_ptr : param.template data<float>(),
+          beta1_pow_ptr != nullptr ? beta1_pow_ptr : beta1_const_pow_ptr,
+          beta2_pow_ptr != nullptr ? beta2_pow_ptr : beta2_const_pow_ptr,
+          lr_ptr != nullptr ? lr_ptr : lr.template data<float>(),
+          mom1_out_ptr,
+          mom2_out_ptr,
+          param_out_ptr,
+          beta1,
+          beta2,
+          epsilon,
+          ori_rows,
+          xpu_rows,
+          row_numel,
+          grad_merge.rows().size(),
+          lazy_mode);
 
       PADDLE_ENFORCE_EQ(
          r == xpu::Error_t::SUCCESS,
          true,
          platform::errors::External("XPU API return wrong value[%d],", r));
 
+      freeData(grad_tensor, grad_c);
+
+      copyOutData(xpu_mom1_out, &mom1_out, ctx);
+      copyOutData(xpu_mom2_out, &mom2_out, ctx);
+      copyOutData(xpu_param_out, &param_out, ctx);
+
       if (!use_global_beta_pow) {
         // update in cpu and then copy to xpu
         if (beta1_pow.place() == platform::CPUPlace() &&
             beta2_pow.place() == platform::CPUPlace()) {
-          const float* beta1_pow_p = beta1_pow.template data<float>();
-          beta1_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
-              beta1 * beta1_pow_p[0];
-          const float* beta2_pow_p = beta2_pow.template data<float>();
-          beta2_pow_out->mutable_data<float>(platform::CPUPlace())[0] =
-              beta2 * beta2_pow_p[0];
+          setBetaData(beta1_pow, beta1_pow_out, beta1);
+
+          setBetaData(beta2_pow, beta2_pow_out, beta2);
         } else {
-          float* beta1_pow_out_p =
-              beta1_pow_out->mutable_data<float>(ctx.GetPlace());
-          float* beta2_pow_out_p =
-              beta2_pow_out->mutable_data<float>(ctx.GetPlace());
-          int r = xpu::scale(dev_ctx.x_context(),
-                             beta1_pow_ptr,
-                             beta1_pow_out_p,
-                             beta1_pow.numel(),
-                             false,
-                             beta1,
-                             0.0f);
-          PADDLE_ENFORCE_EQ(
-              r,
-              xpu::SUCCESS,
-              platform::errors::External(
-                  "XPU kernel scale occur error in adam error code ",
-                  r,
-                  XPUAPIErrorMsg[r]));
-          r = xpu::scale(dev_ctx.x_context(),
-                         beta2_pow_ptr,
-                         beta2_pow_out_p,
-                         beta2_pow.numel(),
-                         false,
-                         beta2,
-                         0.0f);
-          PADDLE_ENFORCE_EQ(
-              r,
-              xpu::SUCCESS,
-              platform::errors::External(
-                  "XPU kernel scale occur error in adam error code ",
-                  r,
-                  XPUAPIErrorMsg[r]));
+          float* beta1_pow_out_p1 = nullptr;
+
+          if (beta1_pow_out->dtype() ==
+              paddle::experimental::DataType::FLOAT16) {
+            scale<float>(
+                beta1_pow_out, beta1_pow, beta1_pow_ptr, beta1, ctx);
+          } else {
+            const float* beta1_pow_data = beta1_pow.template data<float>();
+            beta1_pow_out_p1 =
+                beta1_pow_out->mutable_data<float>(ctx.GetPlace());
+            r = xpu::scale(dev_ctx.x_context(),
+                           beta1_pow_data,
+                           beta1_pow_out_p1,
+                           beta1_pow.numel(),
+                           false,
+                           beta1,
+                           0.0f);
+            xpu_wait(dev_ctx.x_context()->xpu_stream);
+            PADDLE_ENFORCE_EQ(
+                r,
+                xpu::SUCCESS,
+                platform::errors::External(
+                    "XPU kernel scale occur error in adam error code ",
+                    r,
+                    XPUAPIErrorMsg[r]));
+          }
+
+          float* beta2_pow_out_p1 = nullptr;
+          if (beta2_pow_out->dtype() ==
+              paddle::experimental::DataType::FLOAT16) {
+            scale<float>(
+                beta2_pow_out, beta2_pow, beta2_pow_ptr, beta2, ctx);
+          } else {
+            const float* beta2_pow_data = beta2_pow.template data<float>();
+            beta2_pow_out_p1 =
+                beta2_pow_out->mutable_data<float>(ctx.GetPlace());
+            r = xpu::scale(dev_ctx.x_context(),
+                           beta2_pow_data,
+                           beta2_pow_out_p1,
+                           beta2_pow.numel(),
+                           false,
+                           beta2,
+                           0.0f);
+            xpu_wait(dev_ctx.x_context()->xpu_stream);
+            PADDLE_ENFORCE_EQ(
+                r,
+                xpu::SUCCESS,
+                platform::errors::External(
+                    "XPU kernel scale occur error in adam error code ",
+                    r,
+                    XPUAPIErrorMsg[r]));
+          }
         }
       }
-      xpu_wait(dev_ctx.x_context()->xpu_stream);
     } else {
       PADDLE_ENFORCE_EQ(1,
                         2,
                         platform::errors::InvalidArgument(
                             "Variable type not supported by adam_op"));
     }
+
+    freeData(param, param_ptr);
+    freeData(mom1, mom1_ptr);
+    freeData(mom2, mom2_ptr);
+    freeData(lr, lr_ptr);
   }
 };
 #endif
@@ -384,5 +636,8 @@
 namespace ops = paddle::operators;
 #ifdef PADDLE_WITH_XPU
 REGISTER_OP_XPU_KERNEL(
-    adam, ops::AdamOpXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    adam,
+    ops::AdamOpXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::AdamOpXPUKernel<paddle::platform::XPUDeviceContext,
+                         paddle::platform::float16>);
 #endif
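Editorial note on the C++ above: the new helpers implement a widen-compute-narrow pattern. getDataPointer/ConvertDataByType stage FP16 tensors into freshly allocated FP32 buffers via a host round trip, xpu::adam then runs entirely in FP32, and copyOutData narrows the results back into the FP16 output tensors. As a reading aid only, here is a minimal numpy analogue of that data flow; it is not part of the patch and not Paddle API, and the name adam_step_fp16 is hypothetical:

    import numpy as np

    def adam_step_fp16(param, grad, m1, m2, lr, beta1, beta2, eps,
                       beta1_pow, beta2_pow):
        # Widen fp16 inputs to fp32, as getDataPointer does for XPU buffers.
        p = param.astype(np.float32)
        g = grad.astype(np.float32)
        m1f = m1.astype(np.float32)
        m2f = m2.astype(np.float32)

        # The update xpu::adam applies (cf. adam_step in the test file below).
        m1f = beta1 * m1f + (1.0 - beta1) * g
        m2f = beta2 * m2f + (1.0 - beta2) * g * g
        lr_t = lr * np.sqrt(1.0 - beta2_pow) / (1.0 - beta1_pow)
        p = p - lr_t * m1f / (np.sqrt(m2f) + eps)

        # Narrow results back to fp16, as copyOutData does for the *Out tensors.
        return (p.astype(np.float16),
                m1f.astype(np.float16),
                m2f.astype(np.float16))
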
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index 83f0c21315b..8e791b3739a 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -34,7 +34,9 @@ XPUOpMap& get_kl2_ops() {
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                    pOpKernelType(vartype::FP16, XPUPlace())})},
     {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
-    {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"adam",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                   pOpKernelType(vartype::FP16, XPUPlace())})},
     {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"argsort",
      XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py
index 3be4cac81ca..17b6531520d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py
@@ -159,6 +159,57 @@ class XPUTestAdamOp(XPUOpTestWrapper):
             self.learning_rate = 0.001
             self.epsilon = 1e-8
 
+    class TestAdamOp7(TestAdamOp):
+        '''Test Adam Op with float16 accuracy
+        '''
+
+        def setUp(self):
+            self.init_dtype()
+            self.set_xpu()
+            self.op_type = "adam"
+            self.place = paddle.XPUPlace(0)
+            self.set_data()
+            self.set_attrs()
+            self.set_shape()
+            self.set_inputs()
+            self.set_steps()
+            param_out, moment1_out, \
+                moment2_out = adam_step(self.inputs, self.attrs)
+
+            self.outputs = {
+                'Moment1Out': moment1_out,
+                'Moment2Out': moment2_out,
+                'ParamOut': param_out,
+                'Beta1PowOut':
+                np.array([self.beta1_pow]).astype("float16") * self.beta1,
+                'Beta2PowOut':
+                np.array([self.beta2_pow]).astype("float16") * self.beta2
+            }
+
+        def set_inputs(self):
+            param = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+            grad = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+            moment1 = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
+            # The second moment is positive
+            moment2 = np.random.random(self.shape).astype(self.dtype)
+
+            self.beta1_pow = self.beta1**10
+            self.beta2_pow = self.beta2**10
+
+            self.inputs = {
+                'Param': param,
+                'Grad': grad,
+                'Moment1': moment1,
+                'Moment2': moment2,
+                'LearningRate':
+                np.array([self.learning_rate]).astype("float16"),
+                'Beta1Pow': np.array([self.beta1_pow]).astype("float16"),
+                'Beta2Pow': np.array([self.beta2_pow]).astype("float16")
+            }
+
     class TestAdamOpMultipleSteps(TestAdamOp2):
         '''Test Adam Operator with supplied attributes
         '''
@@ -372,6 +423,60 @@ class TestSparseAdamOp(unittest.TestCase):
         self.check_with_place(paddle.XPUPlace(0), False)
 
 
+class TestSparseAdamOp1(TestSparseAdamOp):
+
+    def setup(self, scope, place, lazy_mode):
+        beta1 = 0.78
+        beta2 = 0.836
+        epsilon = 1e-4
+        beta1_pow = np.array([beta1**10]).astype("float16")
+        beta2_pow = np.array([beta2**10]).astype("float16")
+
+        height = 10
+        rows = [0, 4, 7]
+        self.rows = rows
+        row_numel = 12
+        self.row_numel = row_numel
+        self.dense_inputs = {
+            "Param": np.full((height, row_numel), 5.0).astype("float16"),
+            "Moment1": np.full((height, row_numel), 5.0).astype("float16"),
+            "Moment2": np.full((height, row_numel), 5.0).astype("float16"),
+            'Beta1Pow': beta1_pow,
+            'Beta2Pow': beta2_pow,
+            "LearningRate": np.full((1), 2.0).astype("float16")
+        }
+        self.init_output = np.full((height, row_numel), 0.0).astype("float16")
+        self.attrs = {
+            'epsilon': epsilon,
+            'beta1': beta1,
+            'beta2': beta2,
+            'min_row_size_to_use_multithread': 2
+        }
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), row_numel)).astype("float16")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(np_array, place)
+
+        self.sparse_inputs = ["Grad"]
+
+        param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
+                                                 height, rows, row_numel,
+                                                 np_array, lazy_mode)
+        self.outputs = {
+            "ParamOut": param_out,
+            "Moment1Out": mom1,
+            "Moment2Out": mom2,
+            'Beta1PowOut': beta1_pow * beta1,
+            'Beta2PowOut': beta2_pow * beta2
+        }
+
+
 support_types = get_xpu_op_support_types('adam')
 for stype in support_types:
     create_test_class(globals(), XPUTestAdamOp, stype)
--
GitLab
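To exercise the adam_step_fp16 sketch from the note above with the same dtypes the new tests use (the shape and hyperparameters here are illustrative, not taken from the patch):

    import numpy as np

    shape = (10, 12)
    rng = np.random.default_rng(0)
    param = rng.uniform(-1, 1, shape).astype(np.float16)
    grad = rng.uniform(-1, 1, shape).astype(np.float16)
    moment1 = rng.uniform(-1, 1, shape).astype(np.float16)
    moment2 = rng.random(shape).astype(np.float16)  # second moment stays positive

    beta1, beta2, epsilon, lr = 0.9, 0.999, 1e-8, 0.001
    beta1_pow, beta2_pow = beta1 ** 10, beta2 ** 10

    p_out, m1_out, m2_out = adam_step_fp16(param, grad, moment1, moment2, lr,
                                           beta1, beta2, epsilon,
                                           beta1_pow, beta2_pow)
    print(p_out.dtype, m1_out.dtype, m2_out.dtype)  # float16 float16 float16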