Unverified commit a012d426 authored by mengqingchun02, committed by GitHub

Support fp16 of adam operator in xpu environment (#45292)

* support beam_search operator on xpu. test=kunlun

* support beam_search operator on xpu. test=kunlun

* support beam_search operator on xpu. test=kunlun

* support beam_search operator on xpu. test=kunlun

* support beam_search operator on xpu. test=kunlun

* support fp16 of adam operator in xpu environment. test=kunlun

* support fp16 of adam operator in xpu environment. test=kunlun

* support fp16 of adam operator in xpu environment. test=kunlun
Parent 0a67c2e5
@@ -2,7 +2,6 @@
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
@@ -14,14 +13,174 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/optimizers/adam_op_functor.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using float16 = paddle::platform::float16;
#ifdef PADDLE_WITH_XPU
template <typename T1, typename T2>
static int ConvertDataByType(const T1* x,
T2** y,
int len,
bool allocateFlag,
const framework::ExecutionContext& ctx) {
if (nullptr == x || nullptr == y || len <= 0)
return xpu::Error_t::INVALID_PARAM;
int r = 0;
if (allocateFlag) {
r = xpu_malloc(reinterpret_cast<void**>(y), sizeof(T2) * len);
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"Alloc memory in xpu for result data failed with [%d]", r));
}
T1* cpu_data = reinterpret_cast<T1*>(malloc(sizeof(T1) * len));
paddle::memory::Copy(paddle::platform::CPUPlace(),
cpu_data,
ctx.GetPlace(),
x,
len * sizeof(T1));
T2* cpu_real_data = reinterpret_cast<T2*>(malloc(sizeof(T2) * len));
for (int i = 0; i < len; i++) cpu_real_data[i] = static_cast<T2>(cpu_data[i]);
paddle::memory::Copy(ctx.GetPlace(),
*y,
paddle::platform::CPUPlace(),
cpu_real_data,
len * sizeof(T2));
free(cpu_data);
free(cpu_real_data);
return xpu::Error_t::SUCCESS;
}
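For intuition, the widening this helper performs between its two memory::Copy stages can be modeled in a few lines of numpy (the xpu_malloc and host-staging bookkeeping is elided). The cast is lossless in the fp16-to-fp32 direction, since every fp16 value is exactly representable as an fp32:

```python
import numpy as np

# Per-element widening, as in ConvertDataByType<float16, float>:
# copy to host, static_cast each element, copy back to device.
x_fp16 = np.random.uniform(-1, 1, 8).astype(np.float16)
x_fp32 = x_fp16.astype(np.float32)

# fp16 -> fp32 is exact, so narrowing recovers the original values.
assert np.array_equal(x_fp32.astype(np.float16), x_fp16)
```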
template <typename T>
static void getDataPointer(const phi::DenseTensor& tensorData,
T** result,
const framework::ExecutionContext& ctx) {
if (tensorData.dtype() == paddle::experimental::DataType::FLOAT16) {
const float16* real_data =
tensorData.template data<paddle::platform::float16>();
int len = tensorData.numel();
int r = ConvertDataByType<float16, T>(real_data, result, len, true, ctx);
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"execute function ConvertDataByType failed with [%d]", r));
}
}
template <typename T>
static void getOutDataPointer(phi::DenseTensor* tensorData,
Tensor* out,
T** result,
const framework::ExecutionContext& ctx) {
if (tensorData->dtype() == paddle::experimental::DataType::FLOAT16) {
*result = out->template mutable_data<T>(ctx.GetPlace());
} else {
*result = tensorData->template mutable_data<T>(ctx.GetPlace());
}
}
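A sketch of the buffer choice this helper makes, in numpy terms (pick_output_buffer is a hypothetical stand-in, not part of the patch): fp16 outputs get a separate fp32 staging buffer to compute into, which copyOutData later narrows back, while fp32 outputs are written in place.

```python
import numpy as np

def pick_output_buffer(out):
    # fp16 output: compute into an fp32 staging array (narrowed later);
    # fp32 output: compute directly into the output tensor itself.
    if out.dtype == np.float16:
        return np.empty(out.shape, dtype=np.float32)
    return out
```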
template <typename T>
static void copyOutData(const Tensor& srcTensor,
phi::DenseTensor* dstTensor,
const framework::ExecutionContext& ctx) {
if (dstTensor->dtype() == paddle::experimental::DataType::FLOAT16) {
const T* xpu_out_data = srcTensor.template data<T>();
float16* out_data =
dstTensor->template mutable_data<float16>(ctx.GetPlace());
int len = srcTensor.numel();
int r =
ConvertDataByType<T, float16>(xpu_out_data, &out_data, len, false, ctx);
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"execute function ConvertDataByType failed with[%d]", r));
}
}
template <typename T>
static void setBetaData(const phi::DenseTensor& beta_pow,
phi::DenseTensor* beta_pow_out,
const T& beta) {
if (beta_pow.dtype() == paddle::experimental::DataType::FLOAT16) {
const float16* beta_pow_p = beta_pow.template data<float16>();
beta_pow_out->mutable_data<float16>(platform::CPUPlace())[0] =
static_cast<float16>(beta) * beta_pow_p[0];
} else {
const T* beta_pow_p = beta_pow.template data<T>();
beta_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta * beta_pow_p[0];
}
}
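On this CPU path the running powers of beta are one-element tensors, so the update is a single host-side multiply carried out in the tensor's own dtype. A numpy model of the fp16 branch:

```python
import numpy as np

beta1 = 0.9
beta1_pow = np.array([beta1**10], dtype=np.float16)
beta1_pow_out = np.float16(beta1) * beta1_pow  # fp16 multiply, stays fp16
```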
template <typename DeviceContext, typename T>
static void scale(phi::DenseTensor* beta_pow_out,
const phi::DenseTensor& beta_pow,
T* beta_pow_ptr,
const T& beta,
const framework::ExecutionContext& ctx) {
float16* beta_pow_out_p2 =
beta_pow_out->mutable_data<float16>(ctx.GetPlace());
Tensor xpu_beta_pow_out;
const phi::DenseTensorMeta meta_beta_pow_out(
paddle::experimental::DataType::FLOAT32, beta_pow_out->dims());
xpu_beta_pow_out.set_meta(meta_beta_pow_out);
T* beta_pow_out_ptr =
xpu_beta_pow_out.template mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<DeviceContext>();
int r = xpu::scale(dev_ctx.x_context(),
beta_pow_ptr,
beta_pow_out_ptr,
beta_pow.numel(),
false,
beta,
0.0f);
PADDLE_ENFORCE_EQ(r,
xpu::SUCCESS,
platform::errors::External(
"XPU kernel scale occur error in adam error code ",
r,
XPUAPIErrorMsg[r]));
const float* xpu_beta_pow_out_data = xpu_beta_pow_out.template data<T>();
int len = xpu_beta_pow_out.numel();
r = ConvertDataByType<T, float16>(
xpu_beta_pow_out_data, &beta_pow_out_p2, len, false, ctx);
PADDLE_ENFORCE_EQ(
r,
xpu::Error_t::SUCCESS,
platform::errors::External(
"execute function ConvertDataByType failed with [%d]", r));
}
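When the beta-pow tensors live on the XPU and are fp16, this helper keeps the arithmetic in fp32: the input was already widened into beta_pow_ptr, xpu::scale multiplies it by beta in an fp32 staging tensor, and ConvertDataByType narrows the product into the fp16 output. The numeric effect, modeled with numpy:

```python
import numpy as np

beta = 0.9
beta_pow_fp32 = np.float32(beta**10)           # widened fp32 staging value
beta_pow_out_fp16 = np.float16(
    beta_pow_fp32 * np.float32(beta))          # fp32 multiply, fp16 store
```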
template <typename T>
static void freeData(const phi::DenseTensor& tensorData, T* dataPtr) {
if (tensorData.dtype() == paddle::experimental::DataType::FLOAT16)
xpu_free(dataPtr);
}
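Taken together, these helpers implement a convert-compute-convert pattern: widen any fp16 input into a temporary fp32 buffer, run the existing fp32 kernels (xpu::adam / xpu::sparse_adam), narrow the fp32 results into the fp16 outputs, and free the temporaries. A self-contained numpy sketch of that data flow, using the same update rule the test file's adam_step helper checks against (adam_step_fp16 is an illustrative name; this models only the dtype handling, not the XPU memory management):

```python
import numpy as np

def adam_step_fp16(param, grad, mom1, mom2, lr, beta1, beta2,
                   beta1_pow, beta2_pow, epsilon):
    # Widen fp16 inputs to fp32 (getDataPointer / ConvertDataByType).
    p, g, m1, m2 = (t.astype(np.float32) for t in (param, grad, mom1, mom2))
    # Standard Adam update in fp32 (what xpu::adam computes).
    m1 = beta1 * m1 + (1 - beta1) * g
    m2 = beta2 * m2 + (1 - beta2) * g * g
    lr_t = np.float32(lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow))
    p = p - lr_t * m1 / (np.sqrt(m2) + np.float32(epsilon))
    # Narrow the fp32 results back to fp16 (copyOutData).
    return tuple(t.astype(np.float16) for t in (p, m1, m2))
```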
template <typename DeviceContext, typename T>
class AdamOpXPUKernel : public framework::OpKernel<T> {
 public:
@@ -39,25 +198,93 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
auto& param = GET_DATA_SAFELY(
    ctx.Input<LoDTensor>("Param"), "Input", "Param", "Adam");
// auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
float* param_ptr = nullptr;
getDataPointer<float>(param, &param_ptr, ctx);
auto* grad_var = ctx.InputVar("Grad");
float* grad_c = nullptr;
auto& mom1 = GET_DATA_SAFELY(
    ctx.Input<LoDTensor>("Moment1"), "Input", "Moment1", "Adam");
float* mom1_ptr = nullptr;
getDataPointer<float>(mom1, &mom1_ptr, ctx);
auto& mom2 = GET_DATA_SAFELY(
    ctx.Input<LoDTensor>("Moment2"), "Input", "Moment2", "Adam");
float* mom2_ptr = nullptr;
getDataPointer<float>(mom2, &mom2_ptr, ctx);
auto& lr = GET_DATA_SAFELY(
    ctx.Input<LoDTensor>("LearningRate"), "Input", "LearningRate", "Adam");
float* lr_ptr = nullptr;
getDataPointer<float>(lr, &lr_ptr, ctx);
auto& beta1_pow = GET_DATA_SAFELY(
    ctx.Input<LoDTensor>("Beta1Pow"), "Input", "Beta1Pow", "Adam");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
float* beta1_pow_ptr = nullptr;
const float* beta1_const_pow_ptr = nullptr;
if (beta1_pow.place() == platform::CPUPlace()) {
Tensor xpu_beta1_pow;
paddle::framework::TensorCopy(
beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow);
if (xpu_beta1_pow.dtype() == paddle::experimental::DataType::FLOAT16)
getDataPointer<float>(xpu_beta1_pow, &beta1_pow_ptr, ctx);
else
beta1_const_pow_ptr = xpu_beta1_pow.template data<float>();
} else {
if (beta1_pow.dtype() == paddle::experimental::DataType::FLOAT16)
getDataPointer<float>(beta1_pow, &beta1_pow_ptr, ctx);
else
beta1_const_pow_ptr = beta1_pow.template data<float>();
}
auto& beta2_pow = GET_DATA_SAFELY(
    ctx.Input<LoDTensor>("Beta2Pow"), "Input", "Beta2Pow", "Adam");
float* beta2_pow_ptr = nullptr;
const float* beta2_const_pow_ptr = nullptr;
if (beta2_pow.place() == platform::CPUPlace()) {
Tensor xpu_beta2_pow;
paddle::framework::TensorCopy(
beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow);
if (xpu_beta2_pow.dtype() == paddle::experimental::DataType::FLOAT16)
getDataPointer<float>(xpu_beta2_pow, &beta2_pow_ptr, ctx);
else
beta2_const_pow_ptr = xpu_beta2_pow.template data<float>();
} else {
if (beta2_pow.dtype() == paddle::experimental::DataType::FLOAT16)
getDataPointer<float>(beta2_pow, &beta2_pow_ptr, ctx);
else
beta2_const_pow_ptr = beta2_pow.template data<float>();
}
auto& param_out = GET_DATA_SAFELY(
    ctx.Output<LoDTensor>("ParamOut"), "Output", "ParamOut", "Adam");
Tensor xpu_param_out;
float* param_out_ptr = nullptr;
const phi::DenseTensorMeta meta_param(
paddle::experimental::DataType::FLOAT32, param_out.dims());
xpu_param_out.set_meta(meta_param);
getOutDataPointer(&param_out, &xpu_param_out, &param_out_ptr, ctx);
auto& mom1_out = GET_DATA_SAFELY(
    ctx.Output<LoDTensor>("Moment1Out"), "Output", "Moment1Out", "Adam");
Tensor xpu_mom1_out;
float* mom1_out_ptr = nullptr;
const phi::DenseTensorMeta meta_mom1(
paddle::experimental::DataType::FLOAT32, mom1_out.dims());
xpu_mom1_out.set_meta(meta_mom1);
getOutDataPointer(&mom1_out, &xpu_mom1_out, &mom1_out_ptr, ctx);
auto& mom2_out = GET_DATA_SAFELY(
    ctx.Output<LoDTensor>("Moment2Out"), "Output", "Moment2Out", "Adam");
Tensor xpu_mom2_out;
float* mom2_out_ptr = nullptr;
const phi::DenseTensorMeta meta_mom2(
paddle::experimental::DataType::FLOAT32, mom2_out.dims());
xpu_mom2_out.set_meta(meta_mom2);
getOutDataPointer(&mom2_out, &xpu_mom2_out, &mom2_out_ptr, ctx);
auto* beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");
@@ -136,101 +363,110 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
beta2 = static_cast<float>(GetAttrFromTensor(beta2_tensor));
}
float epsilon = static_cast<float>(ctx.Attr<float>("epsilon"));
if (ctx.HasInput("EpsilonTensor")) {
auto* epsilon_tensor = ctx.Input<framework::Tensor>("EpsilonTensor");
epsilon = static_cast<float>(GetAttrFromTensor(epsilon_tensor));
}
if (grad_var->IsType<framework::LoDTensor>()) {
auto& grad = GET_DATA_SAFELY(
    ctx.Input<LoDTensor>("Grad"), "Input", "Grad", "Adam");
getDataPointer<float>(grad, &grad_c, ctx);

int r = xpu::adam(
    dev_ctx.x_context(),
    grad_c != nullptr ? grad_c : grad.template data<float>(),
    mom1_ptr != nullptr ? mom1_ptr : mom1.template data<float>(),
    mom2_ptr != nullptr ? mom2_ptr : mom2.template data<float>(),
    param_ptr != nullptr ? param_ptr : param.template data<float>(),
    beta1_pow_ptr != nullptr ? beta1_pow_ptr : beta1_const_pow_ptr,
    beta2_pow_ptr != nullptr ? beta2_pow_ptr : beta2_const_pow_ptr,
    lr_ptr != nullptr ? lr_ptr : lr.template data<float>(),
    mom1_out_ptr,
    mom2_out_ptr,
    param_out_ptr,
    beta1,
    beta2,
    epsilon,
    param.numel());
xpu_wait(dev_ctx.x_context()->xpu_stream);
PADDLE_ENFORCE_EQ(
    r == xpu::Error_t::SUCCESS,
    true,
    platform::errors::External("XPU API return wrong value[%d],", r));
freeData<float>(grad, grad_c);
copyOutData<float>(xpu_mom1_out, &mom1_out, ctx);
copyOutData<float>(xpu_mom2_out, &mom2_out, ctx);
copyOutData<float>(xpu_param_out, &param_out, ctx);
if (!use_global_beta_pow) {
// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
    beta2_pow.place() == platform::CPUPlace()) {
  setBetaData(beta1_pow, beta1_pow_out, beta1);

  setBetaData(beta2_pow, beta2_pow_out, beta2);
} else {
  float* beta1_pow_out_p1 = nullptr;

  if (beta1_pow_out->dtype() ==
      paddle::experimental::DataType::FLOAT16) {
    scale<DeviceContext, float>(
        beta1_pow_out, beta1_pow, beta1_pow_ptr, beta1, ctx);
  } else {
    const float* beta1_pow_data = beta1_pow.template data<float>();
    beta1_pow_out_p1 =
        beta1_pow_out->mutable_data<float>(ctx.GetPlace());
    r = xpu::scale(dev_ctx.x_context(),
                   beta1_pow_data,
                   beta1_pow_out_p1,
                   beta1_pow.numel(),
                   false,
                   beta1,
                   0.0f);
    xpu_wait(dev_ctx.x_context()->xpu_stream);
    PADDLE_ENFORCE_EQ(
        r,
        xpu::SUCCESS,
        platform::errors::External(
            "XPU kernel scale occur error in adam error code ",
            r,
            XPUAPIErrorMsg[r]));
  }

  float* beta2_pow_out_p1 = nullptr;
  if (beta2_pow_out->dtype() ==
      paddle::experimental::DataType::FLOAT16) {
    scale<DeviceContext, float>(
        beta2_pow_out, beta2_pow, beta2_pow_ptr, beta2, ctx);
  } else {
    const float* beta2_pow_data = beta2_pow.template data<float>();
    beta2_pow_out_p1 =
        beta2_pow_out->mutable_data<float>(ctx.GetPlace());
    r = xpu::scale(dev_ctx.x_context(),
                   beta2_pow_data,
                   beta2_pow_out_p1,
                   beta2_pow.numel(),
                   false,
                   beta2,
                   0.0f);
    xpu_wait(dev_ctx.x_context()->xpu_stream);
    PADDLE_ENFORCE_EQ(
        r,
        xpu::SUCCESS,
        platform::errors::External(
            "XPU kernel scale occur error in adam error code ",
            r,
            XPUAPIErrorMsg[r]));
  }
}
}
} else if (grad_var->IsType<phi::SelectedRows>()) {
auto* grad = ctx.Input<phi::SelectedRows>("Grad");
if (grad->rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
@@ -251,7 +487,7 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
if (is_strict_sorted) {
grad_merge_ptr = grad;
} else {
scatter::MergeAdd<platform::XPUDeviceContext, float> merge_func;
merge_func(ctx.template device_context<platform::XPUDeviceContext>(),
           *grad,
           &tmp_grad_merge,
@@ -260,23 +496,12 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
xpu_wait(dev_ctx.x_context()->xpu_stream);
grad_merge_ptr = &tmp_grad_merge;
}
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
getDataPointer<float>(grad_tensor, &grad_c, ctx);
int row_count = grad_merge.rows().size();
std::vector<int> rows(row_count);
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
@@ -296,84 +521,111 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
auto ori_rows = param.numel() / row_numel;
int lazy_mode = static_cast<int>(ctx.Attr<bool>("lazy_mode"));
int r = xpu::sparse_adam(
    dev_ctx.x_context(),
    grad_c != nullptr ? grad_c : grad_tensor.template data<float>(),
    mom1_ptr != nullptr ? mom1_ptr : mom1.template data<float>(),
    mom2_ptr != nullptr ? mom2_ptr : mom2.template data<float>(),
    param_ptr != nullptr ? param_ptr : param.template data<float>(),
    beta1_pow_ptr != nullptr ? beta1_pow_ptr : beta1_const_pow_ptr,
    beta2_pow_ptr != nullptr ? beta2_pow_ptr : beta2_const_pow_ptr,
    lr_ptr != nullptr ? lr_ptr : lr.template data<float>(),
    mom1_out_ptr,
    mom2_out_ptr,
    param_out_ptr,
    beta1,
    beta2,
    epsilon,
    ori_rows,
    xpu_rows,
    row_numel,
    grad_merge.rows().size(),
    lazy_mode);
PADDLE_ENFORCE_EQ(
    r == xpu::Error_t::SUCCESS,
    true,
    platform::errors::External("XPU API return wrong value[%d],", r));
freeData<float>(grad_tensor, grad_c);
copyOutData<float>(xpu_mom1_out, &mom1_out, ctx);
copyOutData<float>(xpu_mom2_out, &mom2_out, ctx);
copyOutData<float>(xpu_param_out, &param_out, ctx);
if (!use_global_beta_pow) {
// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
    beta2_pow.place() == platform::CPUPlace()) {
  setBetaData(beta1_pow, beta1_pow_out, beta1);

  setBetaData(beta2_pow, beta2_pow_out, beta2);
} else {
  float* beta1_pow_out_p1 = nullptr;

  if (beta1_pow_out->dtype() ==
      paddle::experimental::DataType::FLOAT16) {
    scale<DeviceContext, float>(
        beta1_pow_out, beta1_pow, beta1_pow_ptr, beta1, ctx);
  } else {
    const float* beta1_pow_data = beta1_pow.template data<float>();
    beta1_pow_out_p1 =
        beta1_pow_out->mutable_data<float>(ctx.GetPlace());
    r = xpu::scale(dev_ctx.x_context(),
                   beta1_pow_data,
                   beta1_pow_out_p1,
                   beta1_pow.numel(),
                   false,
                   beta1,
                   0.0f);
    xpu_wait(dev_ctx.x_context()->xpu_stream);
    PADDLE_ENFORCE_EQ(
        r,
        xpu::SUCCESS,
        platform::errors::External(
            "XPU kernel scale occur error in adam error code ",
            r,
            XPUAPIErrorMsg[r]));
  }

  float* beta2_pow_out_p1 = nullptr;
  if (beta2_pow_out->dtype() ==
      paddle::experimental::DataType::FLOAT16) {
    scale<DeviceContext, float>(
        beta2_pow_out, beta2_pow, beta2_pow_ptr, beta2, ctx);
  } else {
    const float* beta2_pow_data = beta2_pow.template data<float>();
    beta2_pow_out_p1 =
        beta2_pow_out->mutable_data<float>(ctx.GetPlace());
    r = xpu::scale(dev_ctx.x_context(),
                   beta2_pow_data,
                   beta2_pow_out_p1,
                   beta2_pow.numel(),
                   false,
                   beta2,
                   0.0f);
    xpu_wait(dev_ctx.x_context()->xpu_stream);
    PADDLE_ENFORCE_EQ(
        r,
        xpu::SUCCESS,
        platform::errors::External(
            "XPU kernel scale occur error in adam error code ",
            r,
            XPUAPIErrorMsg[r]));
  }
}
}
xpu_wait(dev_ctx.x_context()->xpu_stream);
} else {
PADDLE_ENFORCE_EQ(1,
                  2,
                  platform::errors::InvalidArgument(
                      "Variable type not supported by adam_op"));
}
freeData<float>(param, param_ptr);
freeData<float>(mom1, mom1_ptr);
freeData<float>(mom2, mom2_ptr);
freeData<float>(lr, lr_ptr);
}
};
#endif
@@ -384,5 +636,8 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
#ifdef PADDLE_WITH_XPU
REGISTER_OP_XPU_KERNEL(
adam,
ops::AdamOpXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::AdamOpXPUKernel<paddle::platform::XPUDeviceContext,
paddle::platform::float16>);
#endif
@@ -34,7 +34,9 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
              pOpKernelType(vartype::FP16, XPUPlace())})},
{"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"adam",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"argsort", {"argsort",
XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
......
...@@ -159,6 +159,57 @@ class XPUTestAdamOp(XPUOpTestWrapper): ...@@ -159,6 +159,57 @@ class XPUTestAdamOp(XPUOpTestWrapper):
self.learning_rate = 0.001 self.learning_rate = 0.001
self.epsilon = 1e-8 self.epsilon = 1e-8
class TestAdamOp7(TestAdamOp):
'''Test Adam Op with float16 accuracy
'''
def setUp(self):
self.init_dtype()
self.set_xpu()
self.op_type = "adam"
self.place = paddle.XPUPlace(0)
self.set_data()
self.set_attrs()
self.set_shape()
self.set_inputs()
self.set_steps()
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, self.attrs)
self.outputs = {
'Moment1Out':
moment1_out,
'Moment2Out':
moment2_out,
'ParamOut':
param_out,
'Beta1PowOut':
np.array([self.beta1_pow]).astype("float16") * self.beta1,
'Beta2PowOut':
np.array([self.beta2_pow]).astype("float16") * self.beta2
}
def set_inputs(self):
param = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
grad = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
moment1 = np.random.uniform(-1, 1, self.shape).astype(self.dtype)
# The second moment is positive
moment2 = np.random.random(self.shape).astype(self.dtype)
self.beta1_pow = self.beta1**10
self.beta2_pow = self.beta2**10
self.inputs = {
'Param': param,
'Grad': grad,
'Moment1': moment1,
'Moment2': moment2,
'LearningRate':
np.array([self.learning_rate]).astype("float16"),
'Beta1Pow': np.array([self.beta1_pow]).astype("float16"),
'Beta2Pow': np.array([self.beta2_pow]).astype("float16")
}
class TestAdamOpMultipleSteps(TestAdamOp2):
'''Test Adam Operator with supplied attributes
'''
@@ -372,6 +423,60 @@ class TestSparseAdamOp(unittest.TestCase):
self.check_with_place(paddle.XPUPlace(0), False)
class TestSparseAdamOp1(TestSparseAdamOp):
def setup(self, scope, place, lazy_mode):
beta1 = 0.78
beta2 = 0.836
epsilon = 1e-4
beta1_pow = np.array([beta1**10]).astype("float16")
beta2_pow = np.array([beta2**10]).astype("float16")
height = 10
rows = [0, 4, 7]
self.rows = rows
row_numel = 12
self.row_numel = row_numel
self.dense_inputs = {
"Param": np.full((height, row_numel), 5.0).astype("float16"),
"Moment1": np.full((height, row_numel), 5.0).astype("float16"),
"Moment2": np.full((height, row_numel), 5.0).astype("float16"),
'Beta1Pow': beta1_pow,
'Beta2Pow': beta2_pow,
"LearningRate": np.full((1), 2.0).astype("float16")
}
self.init_output = np.full((height, row_numel), 0.0).astype("float16")
self.attrs = {
'epsilon': epsilon,
'beta1': beta1,
'beta2': beta2,
'min_row_size_to_use_multithread': 2
}
grad_selected_rows = scope.var('Grad').get_selected_rows()
grad_selected_rows.set_height(height)
grad_selected_rows.set_rows(rows)
np_array = np.ones((len(rows), row_numel)).astype("float16")
np_array[0, 0] = 2.0
np_array[2, 8] = 4.0
grad_tensor = grad_selected_rows.get_tensor()
grad_tensor.set(np_array, place)
self.sparse_inputs = ["Grad"]
param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
height, rows, row_numel,
np_array, lazy_mode)
self.outputs = {
"ParamOut": param_out,
"Moment1Out": mom1,
"Moment2Out": mom2,
'Beta1PowOut': beta1_pow * beta1,
'Beta2PowOut': beta2_pow * beta2
}
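For readers unfamiliar with SelectedRows: the sparse test above stores gradient values only for rows [0, 4, 7] of a 10 x 12 parameter, and xpu::sparse_adam scatters the update over exactly those rows (lazy_mode controls whether untouched rows are skipped). An equivalent dense view of the gradient the test constructs:

```python
import numpy as np

height, row_numel, rows = 10, 12, [0, 4, 7]
grad_rows = np.ones((len(rows), row_numel), dtype=np.float16)
grad_rows[0, 0], grad_rows[2, 8] = 2.0, 4.0   # same values as np_array above

dense_grad = np.zeros((height, row_numel), dtype=np.float16)
dense_grad[rows] = grad_rows  # rows 0, 4 and 7 carry gradient; rest are zero
```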
support_types = get_xpu_op_support_types('adam')
for stype in support_types:
create_test_class(globals(), XPUTestAdamOp, stype)
...