未验证 提交 0f266ac1 编写于 作者: T taixiurong 提交者: GitHub

cherry pick xpu to 2.1 (#34000)

* update xpu cmake for kunlun (#33328)

* xpu support amp (#33809)

* fix bug DLTP-31078 (#33877)

* update xpu cmake (#33906)

* [xpu] add dropout & amp ops in xpu place (#33891)
Co-authored-by: NTTerror <tangzhiyi11@users.noreply.github.com>
上级 ed7903cd
...@@ -27,19 +27,18 @@ ELSEIF(WITH_CENTOS) ...@@ -27,19 +27,18 @@ ELSEIF(WITH_CENTOS)
SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64") SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64")
SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64") SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64")
SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
ELSE () ELSE ()
SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
ENDIF() ENDIF()
IF(NOT XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20210527") SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701")
ENDIF()
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XCCL_URL "${XPU_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
...@@ -96,7 +95,11 @@ ELSE(WITH_XPU_BKCL) ...@@ -96,7 +95,11 @@ ELSE(WITH_XPU_BKCL)
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
ENDIF(WITH_XPU_BKCL) ENDIF(WITH_XPU_BKCL)
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) if(NOT XPU_SDK_ROOT)
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
else()
ADD_CUSTOM_TARGET(extern_xpu DEPENDS xpulib)
endif()
# Ensure that xpu/api.h can be included without dependency errors. # Ensure that xpu/api.h can be included without dependency errors.
file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "") file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
......
...@@ -33,7 +33,8 @@ AmpOperators::AmpOperators() ...@@ -33,7 +33,8 @@ AmpOperators::AmpOperators()
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) { for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
bool supported = false; bool supported = false;
for (auto& kernel_type : it->second) { for (auto& kernel_type : it->second) {
if (platform::is_gpu_place(kernel_type.first.place_) && if ((platform::is_gpu_place(kernel_type.first.place_) ||
platform::is_xpu_place(kernel_type.first.place_)) &&
kernel_type.first.data_type_ == fp16_dtype) { kernel_type.first.data_type_ == fp16_dtype) {
supported = true; supported = true;
} }
...@@ -91,7 +92,8 @@ inline std::string GetDtypeStr( ...@@ -91,7 +92,8 @@ inline std::string GetDtypeStr(
inline bool NeedCast(const std::shared_ptr<VarBase>& var) { inline bool NeedCast(const std::shared_ptr<VarBase>& var) {
if (platform::is_gpu_place(var->Place()) || if (platform::is_gpu_place(var->Place()) ||
platform::is_cuda_pinned_place(var->Place())) { platform::is_cuda_pinned_place(var->Place()) ||
platform::is_xpu_place(var->Place())) {
// CudaPinndePlace is added for varbase created by dataloader // CudaPinndePlace is added for varbase created by dataloader
if (var->DataType() == framework::proto::VarType::FP32 || if (var->DataType() == framework::proto::VarType::FP32 ||
var->DataType() == framework::proto::VarType::FP16) { var->DataType() == framework::proto::VarType::FP16) {
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
/**
 * XPU kernel of the `check_finite_and_unscale` op.
 *
 * For every tensor in input "X" the kernel checks on the device whether any
 * element is Inf or NaN and writes `x * (1 / Scale)` to the matching tensor of
 * output "Out". Once a non-finite value has been seen the multiplier is set to
 * 0, so the current tensor and every following one are scaled by 0, and the
 * per-tensor check is skipped for the remaining tensors. The overall flag is
 * written to output "FoundInfinite".
 */
template <typename T>
class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
  using MPDType = typename details::MPTypeTrait<T>::Type;
  using XPUTyp = typename XPUTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
    const auto xs = ctx.MultiInput<framework::Tensor>("X");
    const auto* scale = ctx.Input<framework::Tensor>("Scale");
    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
    auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");

    const MPDType* scale_data = scale->data<MPDType>();
    bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());

    // Bring the scale factor to the host; "Scale" may live on XPU or on CPU.
    bool cpu_found_inf_data = false;
    MPDType cpu_scale_data;
    if (platform::is_xpu_place(scale->place())) {
      xpu_memcpy(&cpu_scale_data, scale_data, sizeof(MPDType),
                 XPUMemcpyKind::XPU_DEVICE_TO_HOST);
    } else {
      cpu_scale_data = (*scale_data);
    }
    MPDType inverse_scale = 1.0 / cpu_scale_data;

    for (size_t i = 0; i < xs.size(); ++i) {
      const auto* x = xs[i];
      auto* out = outs[i];
      out->mutable_data<T>(dev_ctx.GetPlace());

      if (cpu_found_inf_data == false) {
        // Scratch buffers are only needed while the check still runs, so they
        // are allocated here instead of once per tensor unconditionally.
        framework::Tensor is_finite =
            ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
                                                                    dev_ctx);
        framework::Tensor is_nan =
            ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
                                                                    dev_ctx);

        // is_finite <- isfinite(x)
        int r = xpu::isfinite(dev_ctx.x_context(),
                              reinterpret_cast<const XPUTyp*>(x->data<T>()),
                              is_finite.data<bool>(), x->numel());
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(isfinite) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
        // is_finite <- !is_finite (the buffer now marks non-finite elements)
        r = xpu::logical_not(dev_ctx.x_context(), reinterpret_cast<const bool*>(
                                                      is_finite.data<bool>()),
                             is_finite.data<bool>(), x->numel());
        PADDLE_ENFORCE_EQ(
            r, XPU_SUCCESS,
            platform::errors::External("XPU API(logical_not) return wrong "
                                       "value[%d %s]",
                                       r, XPUAPIErrorMsg[r]));
        // is_nan <- isnan(x)
        r = xpu::isnan(dev_ctx.x_context(),
                       reinterpret_cast<const XPUTyp*>(x->data<T>()),
                       is_nan.data<bool>(), x->numel());
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(isnan) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
        // is_finite <- is_finite || is_nan
        r = xpu::logical_or(dev_ctx.x_context(), is_finite.data<bool>(),
                            is_nan.data<bool>(), is_finite.data<bool>(),
                            x->numel());
        PADDLE_ENFORCE_EQ(
            r, XPU_SUCCESS,
            platform::errors::External("XPU API(logical_or) return wrong "
                                       "value[%d %s]",
                                       r, XPUAPIErrorMsg[r]));
        // Reduce the per-element flags into the single FoundInfinite flag.
        r = xpu::any(dev_ctx.x_context(), is_finite.data<bool>(),
                     found_inf_data, x->numel());
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(any) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
        // Read the flag back: it decides the multiplier used below and
        // whether the remaining tensors still need to be checked.
        memory::Copy(platform::CPUPlace(), &cpu_found_inf_data,
                     BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                     found_inf_data, sizeof(bool));
      }

      if (cpu_found_inf_data) {
        inverse_scale = 0.0;
      }

      auto dev_env = XPUEnv::getenv("XPUSIM_DEVICE_MODEL");
      // NOTE(review): std::strcmp returns non-zero when the strings differ,
      // so this branch is taken for float16 when XPUSIM_DEVICE_MODEL is unset
      // or is NOT "KUNLUN1" -- confirm this is the intended device selection.
      if (std::is_same<T, paddle::platform::float16>::value &&
          (dev_env == nullptr || std::strcmp(dev_env, "KUNLUN1"))) {
        // float16 path: cast to MPDType, scale, then cast back.
        framework::Tensor float_x;
        framework::Tensor float_out;
        float_x.mutable_data<MPDType>(dev_ctx.GetPlace(),
                                      x->numel() * sizeof(MPDType));
        float_out.mutable_data<MPDType>(dev_ctx.GetPlace(),
                                        out->numel() * sizeof(MPDType));
        int r = xpu::cast_v2(dev_ctx.x_context(),
                             reinterpret_cast<const float16*>(x->data<T>()),
                             float_x.data<MPDType>(), x->numel());
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(cast_v2) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
        r = xpu::scale(dev_ctx.x_context(), float_x.data<MPDType>(),
                       float_out.data<MPDType>(), x->numel(), false,
                       inverse_scale, 0.0);
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(scale) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
        r = xpu::cast_v2(dev_ctx.x_context(), float_out.data<MPDType>(),
                         reinterpret_cast<float16*>(out->data<T>()),
                         out->numel());
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(cast_v2) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
        // float_x / float_out are local tensors released at the end of this
        // scope; wait first when a stream is set on the context.
        if (dev_ctx.x_context()->xpu_stream) {
          dev_ctx.Wait();
        }
      } else {
        // Scale directly in the tensor's own element type.
        int r = xpu::scale(dev_ctx.x_context(),
                           reinterpret_cast<const XPUTyp*>(x->data<T>()),
                           reinterpret_cast<XPUTyp*>(out->data<T>()),
                           x->numel(), false, inverse_scale, 0.0);
        PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
                                              "XPU API(scale) return wrong "
                                              "value[%d %s]",
                                              r, XPUAPIErrorMsg[r]));
      }
    }

    // Publish the final host-side flag to the FoundInfinite output on device.
    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
                 found_inf_data, platform::CPUPlace(), &cpu_found_inf_data,
                 sizeof(bool));
  }
};
} // namespace operators
} // namespace paddle
// Register the XPU kernels of the check_finite_and_unscale op, instantiated
// for float and float16 element types.
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleXPUKernel<float>,
ops::CheckFiniteAndUnscaleXPUKernel<plat::float16>);
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
using XPUTyp = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
const auto* found_inf = ctx.Input<Tensor>("FoundInfinite");
PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
platform::errors::InvalidArgument(
"FoundInfinite must has only one element."));
const bool* found_inf_data = found_inf->data<bool>();
bool cpu_found_inf_data = false;
if (platform::is_xpu_place(found_inf->place())) {
xpu_memcpy(&cpu_found_inf_data, found_inf_data, sizeof(bool),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
} else {
cpu_found_inf_data = (*found_inf_data);
}
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
if (cpu_found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --";
int r = 0;
r = xpu::constant(dev_ctx.x_context(),
reinterpret_cast<XPUTyp*>(out_data), num,
XPUTyp(0.0));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(constant) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
}
}
const bool stop_update = ctx.Attr<bool>("stop_update");
if (stop_update) {
return;
}
const auto* pre_loss_scaling = ctx.Input<Tensor>("PrevLossScaling");
const auto* good_in = ctx.Input<Tensor>("InGoodSteps");
const auto* bad_in = ctx.Input<Tensor>("InBadSteps");
auto* updated_loss_scaling = ctx.Output<Tensor>("LossScaling");
auto* good_out = ctx.Output<Tensor>("OutGoodSteps");
auto* bad_out = ctx.Output<Tensor>("OutBadSteps");
const MPDType* pre_loss_scaling_data = pre_loss_scaling->data<MPDType>();
const int* good_in_data = good_in->data<int>();
const int* bad_in_data = bad_in->data<int>();
MPDType* updated_loss_scaling_data =
updated_loss_scaling->mutable_data<MPDType>(dev_ctx.GetPlace());
int* good_out_data = good_out->mutable_data<int>(dev_ctx.GetPlace());
int* bad_out_data = bad_out->mutable_data<int>(dev_ctx.GetPlace());
const int incr_every_n_steps = ctx.Attr<int>("incr_every_n_steps");
const int decr_every_n_nan_or_inf =
ctx.Attr<int>("decr_every_n_nan_or_inf");
const float incr_ratio = ctx.Attr<float>("incr_ratio");
const float decr_ratio = ctx.Attr<float>("decr_ratio");
int cpu_bad_in_data;
int cpu_good_in_data;
MPDType cpu_pre_loss_scaling_data;
if (platform::is_xpu_place(bad_in->place())) {
xpu_memcpy(&cpu_bad_in_data, bad_in_data, sizeof(int),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
} else {
cpu_bad_in_data = (*bad_in_data);
}
if (platform::is_xpu_place(good_in->place())) {
xpu_memcpy(&cpu_good_in_data, good_in_data, sizeof(int),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
} else {
cpu_good_in_data = (*good_in_data);
}
if (platform::is_xpu_place(pre_loss_scaling->place())) {
xpu_memcpy(&cpu_pre_loss_scaling_data, pre_loss_scaling_data,
sizeof(MPDType), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
} else {
cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
}
int cpu_good_out_data = 0;
int cpu_bad_out_data = 0;
MPDType cpu_updated_loss_scaling_data;
if (cpu_found_inf_data) {
cpu_good_out_data = 0;
cpu_bad_out_data = cpu_bad_in_data + 1;
if (cpu_bad_out_data == decr_every_n_nan_or_inf) {
MPDType new_loss_scaling = cpu_pre_loss_scaling_data * decr_ratio;
cpu_updated_loss_scaling_data =
(new_loss_scaling < static_cast<MPDType>(1))
? (static_cast<MPDType>(1))
: (new_loss_scaling);
cpu_bad_out_data = 0;
}
} else {
cpu_bad_out_data = 0;
cpu_good_out_data = cpu_good_in_data + 1;
if (cpu_good_out_data == incr_every_n_steps) {
MPDType new_loss_scaling = cpu_pre_loss_scaling_data * incr_ratio;
cpu_updated_loss_scaling_data = (std::isfinite(new_loss_scaling))
? new_loss_scaling
: cpu_pre_loss_scaling_data;
cpu_good_out_data = 0;
}
}
// copy to host
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
bad_out_data, platform::CPUPlace(), &cpu_bad_out_data,
sizeof(int));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
good_out_data, platform::CPUPlace(), &cpu_good_out_data,
sizeof(int));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
updated_loss_scaling_data, platform::CPUPlace(),
&cpu_updated_loss_scaling_data, sizeof(MPDType));
}
};
} // namespace operators
} // namespace paddle
// Register the XPU kernels of the update_loss_scaling op, instantiated for
// float and float16 element types.
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(update_loss_scaling,
ops::UpdateLossScalingXPUKernel<float>,
ops::UpdateLossScalingXPUKernel<plat::float16>);
#endif
...@@ -23,21 +23,9 @@ limitations under the License. */ ...@@ -23,21 +23,9 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T>
class XPUFPTypeTrait {
public:
using Type = T;
};
template <>
class XPUFPTypeTrait<platform::float16> {
public:
using Type = float16;
};
template <typename DeviceContext, typename InT> template <typename DeviceContext, typename InT>
class CastXPUKernel : public framework::OpKernel<InT> { class CastXPUKernel : public framework::OpKernel<InT> {
using XPUInTDType = typename XPUFPTypeTrait<InT>::Type; using XPUInTDType = typename XPUTypeTrait<InT>::Type;
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
...@@ -49,7 +37,6 @@ class CastXPUKernel : public framework::OpKernel<InT> { ...@@ -49,7 +37,6 @@ class CastXPUKernel : public framework::OpKernel<InT> {
context.Attr<int>("out_dtype")); context.Attr<int>("out_dtype"));
auto* in_data = in->data<InT>(); auto* in_data = in->data<InT>();
// using XPUOutTDType = typename XPUFPTypeTrait<InT>::Type;
auto numel = in->numel(); auto numel = in->numel();
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
int r = -1; int r = -1;
......
...@@ -16,11 +16,11 @@ namespace paddle { ...@@ -16,11 +16,11 @@ namespace paddle {
namespace operators { namespace operators {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
static std::map<int, float*> mask_data_tables;
static const int max_data_size = 32 * 1024 * 1024;
static std::mutex s_mask_data_table_lock;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class DropoutXPUKernel : public framework::OpKernel<T> { class DropoutXPUKernel : public framework::OpKernel<T> {
using XPUTyp = typename XPUTypeTrait<T>::Type;
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X"); auto* x = context.Input<Tensor>("X");
...@@ -30,93 +30,70 @@ class DropoutXPUKernel : public framework::OpKernel<T> { ...@@ -30,93 +30,70 @@ class DropoutXPUKernel : public framework::OpKernel<T> {
float dropout_prob = context.Attr<float>("dropout_prob"); float dropout_prob = context.Attr<float>("dropout_prob");
auto dropout_implementation = auto dropout_implementation =
context.Attr<std::string>("dropout_implementation"); context.Attr<std::string>("dropout_implementation");
float* mask_data_table = nullptr; auto& dev_ctx = context.template device_context<DeviceContext>();
PADDLE_ENFORCE_EQ(!context.HasInput("Seed"), true, PADDLE_ENFORCE_EQ(!context.HasInput("Seed"), true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
("Input(Seed) not supported on XPU"))); ("Input(Seed) not supported on XPU")));
if (!context.Attr<bool>("is_test")) {
int dev_id =
BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()).GetDeviceId();
int prop = static_cast<int>(dropout_prob * 100);
int is_upscale = (dropout_implementation == "upscale_in_train"); int is_upscale = (dropout_implementation == "upscale_in_train");
/* mask_data_tables key contains 3 part:
* | 31-16 | 15-8 | 7-0 | if (!context.Attr<bool>("is_test")) {
* | dev_id | prob | is_upscale |
*/
int index = (dev_id << 16) + (prop << 8) + is_upscale;
std::lock_guard<std::mutex> lock(s_mask_data_table_lock);
if (mask_data_tables.find(index) == mask_data_tables.end()) {
float* mask_data_host = new float[max_data_size];
std::random_device rnd; std::random_device rnd;
std::minstd_rand engine; // int seed = (context.Attr<bool>("fix_seed")) ?
int seed = // int(context.Attr<int>("seed")) : (rnd());
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd(); int seed = 0;
engine.seed(seed); if (context.Attr<bool>("fix_seed") == true) {
std::uniform_real_distribution<float> dist(0, 1); seed = static_cast<int>(context.Attr<int>("seed"));
for (size_t i = 0; i < max_data_size; ++i) {
if (dist(engine) < dropout_prob) {
mask_data_host[i] = 0.0f;
} else { } else {
if (is_upscale) { seed = rnd();
mask_data_host[i] = 1.0f / static_cast<T>(1.0f - dropout_prob);
} else {
mask_data_host[i] = 1.0;
}
}
}
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void**>(&mask_data_table),
max_data_size * sizeof(float)),
XPU_SUCCESS,
platform::errors::ResourceExhausted(
"\n\nOut of memory error on XPU, Cannot"
"allocate %s memory on XPU. \n\nPlease "
"check whether there is any other process "
"using XPU.\n",
string::HumanReadableSize(max_data_size * sizeof(void*))));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()),
mask_data_table, platform::CPUPlace(), mask_data_host,
max_data_size * sizeof(float));
mask_data_tables[index] = mask_data_table;
free(mask_data_host);
} else {
mask_data_table = mask_data_tables[index];
}
} }
if (!context.Attr<bool>("is_test")) { // Train
auto* mask = context.Output<Tensor>("Mask"); auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace()); auto* mask_data = mask->mutable_data<T>(context.GetPlace());
size_t size = framework::product(mask->dims()); // Special case when dropout_prob is 1.0
auto& dev_ctx = context.template device_context<DeviceContext>(); if (dropout_prob == 1.0f) {
int r = xpu::dropout(dev_ctx.x_context(), mask_data_table, x_data, int r = xpu::constant(dev_ctx.x_context(),
mask_data, y_data, max_data_size, size); reinterpret_cast<XPUTyp*>(y_data), y->numel(),
PADDLE_ENFORCE_EQ( XPUTyp(0));
r, xpu::Error_t::SUCCESS, PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
platform::errors::External( "XPU API(constant) return wrong "
"XPU dropout return wrong value[%d], please check whether " "value[%d %s]",
"Baidu Kunlun Card is properly installed.", r, XPUAPIErrorMsg[r]));
r)); r = xpu::constant(dev_ctx.x_context(),
} else { // Infer reinterpret_cast<XPUTyp*>(mask_data), mask->numel(),
float scale = 0.0f; XPUTyp(0));
if (dropout_implementation == "upscale_in_train") { PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
scale = 1.0f; "XPU API(constant) return wrong "
} else { "value[%d %s]",
scale = static_cast<T>(1.0f - dropout_prob); r, XPUAPIErrorMsg[r]));
return;
} }
auto& dev_ctx = context.template device_context<DeviceContext>(); int r = xpu::dropout(dev_ctx.x_context(),
int r = xpu::scale(dev_ctx.x_context(), x->numel(), scale, 0.0f, 0, reinterpret_cast<const XPUTyp*>(x->data<T>()),
x_data, y_data); reinterpret_cast<XPUTyp*>(y->data<T>()),
PADDLE_ENFORCE_EQ( reinterpret_cast<XPUTyp*>(mask_data), seed,
r, xpu::Error_t::SUCCESS, mask->numel(), is_upscale, dropout_prob);
platform::errors::External( PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU dropout return wrong value[%d], please check whether " "XPU API(dropout) return wrong "
"Baidu Kunlun Card is properly installed.", "value[%d %s]",
r)); r, XPUAPIErrorMsg[r]));
} else {
float scale =
(is_upscale) ? (1.0) : (static_cast<float>(1.0f - dropout_prob));
int r = xpu::scale(
dev_ctx.x_context(), reinterpret_cast<const XPUTyp*>(x_data),
reinterpret_cast<XPUTyp*>(y_data), x->numel(), false, scale, 0.0f);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(scale) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
} }
} }
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class DropoutGradXPUKernel : public framework::OpKernel<T> { class DropoutGradXPUKernel : public framework::OpKernel<T> {
using XPUTyp = typename XPUTypeTrait<T>::Type;
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(!context.Attr<bool>("is_test"), true, PADDLE_ENFORCE_EQ(!context.Attr<bool>("is_test"), true,
...@@ -127,23 +104,47 @@ class DropoutGradXPUKernel : public framework::OpKernel<T> { ...@@ -127,23 +104,47 @@ class DropoutGradXPUKernel : public framework::OpKernel<T> {
auto* mask = context.Input<Tensor>("Mask"); auto* mask = context.Input<Tensor>("Mask");
grad_x->mutable_data<T>(context.GetPlace()); grad_x->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
int r = xpu::elementwise_mul(dev_ctx.x_context(), grad_y->data<T>(), auto& dropout_implementation =
mask->data<T>(), grad_x->data<T>(), context.Attr<std::string>("dropout_implementation");
grad_y->numel()); float dropout_prob = context.Attr<float>("dropout_prob");
PADDLE_ENFORCE_EQ( const T* mask_data = mask->data<T>();
r, xpu::Error_t::SUCCESS, framework::Tensor mask_new;
platform::errors::External( if (dropout_implementation == "upscale_in_train") {
"XPU dropout return wrong value[%d], please check whether " mask_new = context.AllocateTmpTensor<T, platform::XPUDeviceContext>(
"Baidu Kunlun Card is properly installed.", mask->dims(), dev_ctx);
r)); float scale =
(dropout_prob == 1.0f) ? (1.0f) : (1.0f / (1.0f - dropout_prob));
int r = xpu::scale(dev_ctx.x_context(),
reinterpret_cast<const XPUTyp*>(mask->data<T>()),
reinterpret_cast<XPUTyp*>(mask_new.data<T>()),
mask->numel(), false, scale, 0.0f);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(scale) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
mask_data = mask_new.data<T>();
}
int r = xpu::mul(
dev_ctx.x_context(), reinterpret_cast<const XPUTyp*>(grad_y->data<T>()),
reinterpret_cast<const XPUTyp*>(mask_data),
reinterpret_cast<XPUTyp*>(grad_x->data<T>()), grad_y->numel());
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(mul) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL( REGISTER_OP_XPU_KERNEL(
dropout, ops::DropoutXPUKernel<paddle::platform::XPUDeviceContext, float>); dropout, ops::DropoutXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::DropoutXPUKernel<paddle::platform::XPUDeviceContext, plat::float16>);
REGISTER_OP_XPU_KERNEL( REGISTER_OP_XPU_KERNEL(
dropout_grad, dropout_grad,
ops::DropoutGradXPUKernel<paddle::platform::XPUDeviceContext, float>); ops::DropoutGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::DropoutGradXPUKernel<paddle::platform::XPUDeviceContext,
plat::float16>);
#endif #endif
...@@ -122,33 +122,50 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> { ...@@ -122,33 +122,50 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
axis)); axis));
std::vector<int> x_dims_vec(max_dim, 1); std::vector<int> x_dims_vec(max_dim, 1);
std::vector<int> y_dims_vec(max_dim, 1); std::vector<int> y_dims_vec(max_dim, 1);
int x_len = 1;
int y_len = 1;
if (x_dims.size() == max_dim) { if (x_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) { for (int i = 0; i < max_dim; i++) {
x_dims_vec[i] = x_dims[i]; x_dims_vec[i] = x_dims[i];
x_len *= x_dims_vec[i];
} }
} else { } else {
for (int i = 0; i < x_dims.size(); i++) { for (int i = 0; i < x_dims.size(); i++) {
x_dims_vec[i + axis] = x_dims[i]; x_dims_vec[i + axis] = x_dims[i];
x_len *= x_dims_vec[i];
} }
} }
if (y_dims.size() == max_dim) { if (y_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) { for (int i = 0; i < max_dim; i++) {
y_dims_vec[i] = y_dims[i]; y_dims_vec[i] = y_dims[i];
y_len *= y_dims_vec[i];
} }
} else { } else {
for (int i = 0; i < y_dims.size(); i++) { for (int i = 0; i < y_dims.size(); i++) {
y_dims_vec[i + axis] = y_dims[i]; y_dims_vec[i + axis] = y_dims[i];
y_len *= y_dims_vec[i];
} }
} }
const T* dz_data = dz->data<T>(); const T* dz_data = dz->data<T>();
framework::Tensor dx_local_tensor;
framework::Tensor dy_local_tensor;
bool need_wait = false;
T* dx_data = nullptr; T* dx_data = nullptr;
T* dy_data = nullptr; T* dy_data = nullptr;
if (dx) { if (dx) {
dx_data = dx->mutable_data<T>(ctx.GetPlace()); dx_data = dx->mutable_data<T>(ctx.GetPlace());
} else {
dx_data =
dx_local_tensor.mutable_data<T>(ctx.GetPlace(), x_len * sizeof(T));
need_wait = true;
} }
if (dy) { if (dy) {
dy_data = dy->mutable_data<T>(ctx.GetPlace()); dy_data = dy->mutable_data<T>(ctx.GetPlace());
} else {
dy_data =
dy_local_tensor.mutable_data<T>(ctx.GetPlace(), y_len * sizeof(T));
need_wait = true;
} }
auto& dev_ctx = auto& dev_ctx =
...@@ -161,6 +178,9 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> { ...@@ -161,6 +178,9 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
platform::errors::External( platform::errors::External(
"XPU kernel Elementwise occur error in XPUElementwise error code ", "XPU kernel Elementwise occur error in XPUElementwise error code ",
ret, XPUAPIErrorMsg[ret])); ret, XPUAPIErrorMsg[ret]));
if (need_wait && dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
}
} }
}; };
......
...@@ -102,6 +102,7 @@ template <typename T, typename FCT> ...@@ -102,6 +102,7 @@ template <typename T, typename FCT>
static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out, static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
bool trans_x, bool trans_y, bool trans_x, bool trans_y,
const paddle::framework::ExecutionContext &ctx) { const paddle::framework::ExecutionContext &ctx) {
using XPUType = typename XPUTypeTrait<T>::Type;
const auto &x_dims = x->dims(); const auto &x_dims = x->dims();
const auto &y_dims = y->dims(); const auto &y_dims = y->dims();
auto &dev_ctx = auto &dev_ctx =
...@@ -162,17 +163,19 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out, ...@@ -162,17 +163,19 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
int ldout = n; int ldout = n;
if (batch_size <= 1) { if (batch_size <= 1) {
int r = 0; int r = 0;
r = xpu::fc_fusion<T, T, T, FCT>( r = xpu::fc_fusion<XPUType, XPUType, XPUType, FCT>(
dev_ctx.x_context(), x->data<T>(), y->data<T>(), data_c, m, n, k, dev_ctx.x_context(), reinterpret_cast<const XPUType *>(x->data<T>()),
mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, reinterpret_cast<const XPUType *>(y->data<T>()),
ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR); reinterpret_cast<XPUType *>(data_c), m, n, k, mat_dim_a.trans_,
mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, ldout, alpha, 0,
nullptr, xpu::Activation_t::LINEAR);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External( platform::errors::External(
"XPU fc_fusion kernel return wrong value[%d %s]", r, "XPU fc_fusion kernel return wrong value[%d %s]", r,
XPUAPIErrorMsg[r])); XPUAPIErrorMsg[r]));
} else { } else {
// batch matmul // batch matmul
int r = xpu::fc_batched<T, T, T, FCT>( int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
dev_ctx.x_context(), // Context* ctx, dev_ctx.x_context(), // Context* ctx,
batch_size, // int batch_size, batch_size, // int batch_size,
mat_dim_a.trans_, // bool x_trans, mat_dim_a.trans_, // bool x_trans,
...@@ -181,12 +184,12 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out, ...@@ -181,12 +184,12 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
n, // int n, n, // int n,
k, // int k, k, // int k,
alpha, // float alpha, alpha, // float alpha,
reinterpret_cast<const T *>(x->data<T>()), // const TX* x, reinterpret_cast<const XPUType *>(x->data<T>()), // const TX* x,
mat_dim_a.stride_, // int stride_a, mat_dim_a.stride_, // int stride_a,
reinterpret_cast<const T *>(y->data<T>()), // const TW* w, reinterpret_cast<const XPUType *>(y->data<T>()), // const TW* w,
mat_dim_b.stride_, // int stride_b, mat_dim_b.stride_, // int stride_b,
0.0, // float beta, 0.0, // float beta,
reinterpret_cast<T *>(data_c), // TY* y, reinterpret_cast<XPUType *>(data_c), // TY* y,
m * n, // int stride_c, m * n, // int stride_c,
nullptr, // const float* x_maxptr, nullptr, // const float* x_maxptr,
nullptr); // const float* w_maxptr nullptr); // const float* w_maxptr
...@@ -210,12 +213,16 @@ class MatMulXPUKernel : public framework::OpKernel<T> { ...@@ -210,12 +213,16 @@ class MatMulXPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
bool trans_x = context.Attr<bool>("transpose_X"); bool trans_x = context.Attr<bool>("transpose_X");
bool trans_y = context.Attr<bool>("transpose_Y"); bool trans_y = context.Attr<bool>("transpose_Y");
if (std::is_same<paddle::platform::float16, T>::value) {
MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context);
} else {
if (std::getenv("XPU_PADDLE_MAT_MUL_FCINT32") != nullptr) { if (std::getenv("XPU_PADDLE_MAT_MUL_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, context); MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, context);
} else { } else {
MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context); MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context);
} }
} }
}
}; };
// Reshape a rank-3 tensor from P x M x N to M x (P * N). // Reshape a rank-3 tensor from P x M x N to M x (P * N).
...@@ -224,6 +231,7 @@ class MatMulXPUKernel : public framework::OpKernel<T> { ...@@ -224,6 +231,7 @@ class MatMulXPUKernel : public framework::OpKernel<T> {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
static framework::Tensor XPUFoldHeadAndLastDims( static framework::Tensor XPUFoldHeadAndLastDims(
const DeviceContext &context, const framework::Tensor &input) { const DeviceContext &context, const framework::Tensor &input) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto in_dims = input.dims(); auto in_dims = input.dims();
if (in_dims.size() != 3) { if (in_dims.size() != 3) {
return input; return input;
...@@ -236,8 +244,9 @@ static framework::Tensor XPUFoldHeadAndLastDims( ...@@ -236,8 +244,9 @@ static framework::Tensor XPUFoldHeadAndLastDims(
static_cast<int>(in_dims[1]), static_cast<int>(in_dims[1]),
static_cast<int>(in_dims[2])}; static_cast<int>(in_dims[2])};
std::vector<int> axis_host = {1, 0, 2}; std::vector<int> axis_host = {1, 0, 2};
int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(), int r = xpu::transpose(
in_shape_host, axis_host); context.x_context(), reinterpret_cast<const XPUType *>(input.data<T>()),
reinterpret_cast<XPUType *>(output.data<T>()), in_shape_host, axis_host);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External( platform::errors::External(
"XPU transpose kernel return wrong value[%d %s]", r, "XPU transpose kernel return wrong value[%d %s]", r,
...@@ -280,12 +289,16 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> { ...@@ -280,12 +289,16 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> {
const framework::Tensor &b, bool trans_b, const framework::Tensor &b, bool trans_b,
framework::Tensor *out) const { framework::Tensor *out) const {
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
if (std::is_same<paddle::platform::float16, T>::value) {
MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context);
} else {
if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_FCINT32") != nullptr) { if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, context); MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, context);
} else { } else {
MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context); MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context);
} }
} }
}
void CalcInputGrad(const framework::ExecutionContext &context, void CalcInputGrad(const framework::ExecutionContext &context,
const framework::Tensor &a, bool trans_a, const framework::Tensor &a, bool trans_a,
...@@ -370,10 +383,14 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> { ...@@ -370,10 +383,14 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL( REGISTER_OP_XPU_KERNEL(
matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>); matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, plat::float16>);
REGISTER_OP_XPU_KERNEL( REGISTER_OP_XPU_KERNEL(
matmul_grad, matmul_grad,
ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>); ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext,
plat::float16>);
#endif #endif
...@@ -25,6 +25,7 @@ template <typename T, typename FCT> ...@@ -25,6 +25,7 @@ template <typename T, typename FCT>
static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
bool trans_x, bool trans_y, bool trans_x, bool trans_y,
const paddle::framework::ExecutionContext& ctx) { const paddle::framework::ExecutionContext& ctx) {
using XPUType = typename XPUTypeTrait<T>::Type;
const auto& x_dims = x->dims(); const auto& x_dims = x->dims();
const auto& y_dims = y->dims(); const auto& y_dims = y->dims();
auto& dev_ctx = auto& dev_ctx =
...@@ -75,8 +76,10 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, ...@@ -75,8 +76,10 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
int batch_size = mat_dim_a.batch_size_; int batch_size = mat_dim_a.batch_size_;
if (batch_size <= 1) { if (batch_size <= 1) {
int r = 0; int r = 0;
r = xpu::fc<T, T, T, FCT>(dev_ctx.x_context(), x->data<T>(), y->data<T>(), r = xpu::fc<XPUType, XPUType, XPUType, FCT>(
data_c, m, n, k, mat_dim_a.trans_, dev_ctx.x_context(), reinterpret_cast<const XPUType*>(x->data<T>()),
reinterpret_cast<const XPUType*>(y->data<T>()),
reinterpret_cast<XPUType*>(data_c), m, n, k, mat_dim_a.trans_,
mat_dim_b.trans_, nullptr, nullptr, nullptr); mat_dim_b.trans_, nullptr, nullptr, nullptr);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS, r, XPU_SUCCESS,
...@@ -87,7 +90,7 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, ...@@ -87,7 +90,7 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
r, XPUAPIErrorMsg[r], m, n, k, mat_dim_a.trans_, mat_dim_b.trans_)); r, XPUAPIErrorMsg[r], m, n, k, mat_dim_a.trans_, mat_dim_b.trans_));
} else { } else {
// batch matmul // batch matmul
int r = xpu::fc_batched<T, T, T, FCT>( int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
dev_ctx.x_context(), // Context* ctx, dev_ctx.x_context(), // Context* ctx,
batch_size, // int batch_size, batch_size, // int batch_size,
mat_dim_a.trans_, // bool x_trans, mat_dim_a.trans_, // bool x_trans,
...@@ -96,12 +99,12 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, ...@@ -96,12 +99,12 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
n, // int n, n, // int n,
k, // int k, k, // int k,
1.0, // float alpha, 1.0, // float alpha,
reinterpret_cast<const T*>(x->data<T>()), // const TX* x, reinterpret_cast<const XPUType*>(x->data<T>()), // const TX* x,
mat_dim_a.stride_, // int stride_a, mat_dim_a.stride_, // int stride_a,
reinterpret_cast<const T*>(y->data<T>()), // const TW* w, reinterpret_cast<const XPUType*>(y->data<T>()), // const TW* w,
mat_dim_b.stride_, // int stride_b, mat_dim_b.stride_, // int stride_b,
0.0, // float beta, 0.0, // float beta,
reinterpret_cast<T*>(data_c), // TY* y, reinterpret_cast<XPUType*>(data_c), // TY* y,
m * n, // int stride_c, m * n, // int stride_c,
nullptr, // const float* x_maxptr, nullptr, // const float* x_maxptr,
nullptr); // const float* w_maxptr nullptr); // const float* w_maxptr
...@@ -123,17 +126,22 @@ class MatMulV2XPUKernel : public framework::OpKernel<T> { ...@@ -123,17 +126,22 @@ class MatMulV2XPUKernel : public framework::OpKernel<T> {
bool trans_x = ctx.Attr<bool>("trans_x"); bool trans_x = ctx.Attr<bool>("trans_x");
bool trans_y = ctx.Attr<bool>("trans_y"); bool trans_y = ctx.Attr<bool>("trans_y");
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
if (std::is_same<paddle::platform::float16, T>::value) {
MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
} else {
if (std::getenv("XPU_PADDLE_MAT_MUL_V2_FCINT32") != nullptr) { if (std::getenv("XPU_PADDLE_MAT_MUL_V2_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, ctx); MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, ctx);
} else { } else {
MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx); MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
} }
} }
}
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
static framework::Tensor XPUFoldHeadAndLastDims( static framework::Tensor XPUFoldHeadAndLastDims(
const DeviceContext& context, const framework::Tensor& input) { const DeviceContext& context, const framework::Tensor& input) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto in_dims = input.dims(); auto in_dims = input.dims();
if (in_dims.size() != 3) { if (in_dims.size() != 3) {
return input; return input;
...@@ -147,8 +155,9 @@ static framework::Tensor XPUFoldHeadAndLastDims( ...@@ -147,8 +155,9 @@ static framework::Tensor XPUFoldHeadAndLastDims(
static_cast<int>(in_dims[2])}; static_cast<int>(in_dims[2])};
std::vector<int> axis_host = {1, 0, 2}; std::vector<int> axis_host = {1, 0, 2};
int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(), int r = xpu::transpose(
in_shape_host, axis_host); context.x_context(), reinterpret_cast<const XPUType*>(input.data<T>()),
reinterpret_cast<XPUType*>(output.data<T>()), in_shape_host, axis_host);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External( platform::errors::External(
"XPU transpose kernel return wrong value[%d %s]", r, "XPU transpose kernel return wrong value[%d %s]", r,
...@@ -166,12 +175,16 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> { ...@@ -166,12 +175,16 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> {
const framework::Tensor& b, bool trans_b, const framework::Tensor& b, bool trans_b,
framework::Tensor* out) const { framework::Tensor* out) const {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
if (std::is_same<paddle::platform::float16, T>::value) {
MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
} else {
if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32") != nullptr) { if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, ctx); MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, ctx);
} else { } else {
MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx); MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
} }
} }
}
void CalcInputGrad(const framework::ExecutionContext& context, void CalcInputGrad(const framework::ExecutionContext& context,
const framework::Tensor& a, bool trans_a, const framework::Tensor& a, bool trans_a,
...@@ -261,8 +274,10 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> { ...@@ -261,8 +274,10 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(matmul_v2, ops::MatMulV2XPUKernel<float>); REGISTER_OP_XPU_KERNEL(matmul_v2, ops::MatMulV2XPUKernel<float>,
REGISTER_OP_XPU_KERNEL(matmul_v2_grad, ops::MatMulV2XPUGradKernel<float>); ops::MatMulV2XPUKernel<plat::float16>);
REGISTER_OP_XPU_KERNEL(matmul_v2_grad, ops::MatMulV2XPUGradKernel<float>,
ops::MatMulV2XPUGradKernel<plat::float16>);
#endif #endif
...@@ -47,8 +47,8 @@ class SoftmaxXPUKernel : public framework::OpKernel<T> { ...@@ -47,8 +47,8 @@ class SoftmaxXPUKernel : public framework::OpKernel<T> {
int len = x->numel(); int len = x->numel();
T* clip_x_data = T* clip_x_data =
clip_x.mutable_data<T>(context.GetPlace(), len * sizeof(T)); clip_x.mutable_data<T>(context.GetPlace(), len * sizeof(T));
r = xpu::clip(dev_ctx.x_context(), x->data<float>(), clip_x_data, len, r = xpu::clip_v2(dev_ctx.x_context(), x->data<float>(), clip_x_data, len,
-1e30, 1e30); static_cast<float>(-1e20), static_cast<float>(1e20));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(clip) return wrong " platform::errors::External("XPU API(clip) return wrong "
"value[%d %s]", "value[%d %s]",
......
...@@ -54,8 +54,9 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel<T> { ...@@ -54,8 +54,9 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel<T> {
int len = logits->numel(); int len = logits->numel();
T* clip_logits_data = T* clip_logits_data =
clip_logits.mutable_data<T>(context.GetPlace(), len * sizeof(T)); clip_logits.mutable_data<T>(context.GetPlace(), len * sizeof(T));
r = xpu::clip(dev_ctx.x_context(), logits->data<float>(), clip_logits_data, r = xpu::clip_v2(dev_ctx.x_context(), logits->data<float>(),
len, -1e30, 1e30); clip_logits_data, len, static_cast<float>(-1e20),
static_cast<float>(1e20));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS, r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU kernel error. clip " platform::errors::External("XPU kernel error. clip "
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <unordered_map> #include <unordered_map>
#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/float16.h"
#include "xpu/api.h" #include "xpu/api.h"
#include "xpu/refactor/fusion.h" #include "xpu/refactor/fusion.h"
#include "xpu/refactor/math.h" #include "xpu/refactor/math.h"
...@@ -58,4 +59,16 @@ static std::map<int, std::string> XPUAPIErrorMsg = { ...@@ -58,4 +59,16 @@ static std::map<int, std::string> XPUAPIErrorMsg = {
{xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"}, {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
{xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}}; {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
// Maps a Paddle element type T to the element type used when calling XPU
// APIs. The primary template is the identity mapping: Type is T itself.
template <typename T>
class XPUTypeTrait {
public:
using Type = T;
};
// Specialization for paddle::platform::float16: Type is the unqualified
// `float16` name visible at this point in the header.
// NOTE(review): presumably this is the half-precision type declared by the
// XPU SDK headers included by this file (e.g. xpu/api.h) -- confirm.
template <>
class XPUTypeTrait<paddle::platform::float16> {
public:
using Type = float16;
};
#endif #endif
...@@ -224,7 +224,9 @@ OpSupportedInfos(const std::string &place, ...@@ -224,7 +224,9 @@ OpSupportedInfos(const std::string &place,
[](unsigned char c) { return std::toupper(c); }); [](unsigned char c) { return std::toupper(c); });
using fn_type = std::add_pointer<bool(const platform::Place &)>::type; using fn_type = std::add_pointer<bool(const platform::Place &)>::type;
std::unordered_map<std::string, fn_type> is_target_place{ std::unordered_map<std::string, fn_type> is_target_place{
{"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place}, {"GPU", &platform::is_gpu_place},
{"CPU", &platform::is_cpu_place},
{"XPU", &platform::is_xpu_place},
}; };
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
is_target_place.count(query_place), 0, is_target_place.count(query_place), 0,
......
...@@ -149,8 +149,14 @@ gray_list = { ...@@ -149,8 +149,14 @@ gray_list = {
# The set of ops that don't support fp16 calculation # The set of ops that don't support fp16 calculation
# lookup_table fp16 is slower than fp32, though fp16 is supported. # lookup_table fp16 is slower than fp32, though fp16 is supported.
_, _, _sys_unsupported_fp16_list = core.op_supported_infos( _sys_unsupported_fp16_list = []
if core.is_compiled_with_xpu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'XPU', core.VarDesc.VarType.FP16)
else:
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'GPU', core.VarDesc.VarType.FP16) 'GPU', core.VarDesc.VarType.FP16)
unsupported_fp16_list = {'lookup_table', unsupported_fp16_list = {'lookup_table',
'lookup_table_v2'} | _sys_unsupported_fp16_list 'lookup_table_v2'} | _sys_unsupported_fp16_list
......
...@@ -128,9 +128,10 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None): ...@@ -128,9 +128,10 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
raise ValueError( raise ValueError(
"current_tracer is None, maybe it is not in imperative mode.") "current_tracer is None, maybe it is not in imperative mode.")
if enable and not tracer._expected_place.is_gpu_place(): if enable and not (tracer._expected_place.is_gpu_place() or
tracer._expected_place.is_xpu_place()):
warnings.warn( warnings.warn(
'amp_guard can only be enabled on CUDAPlace, current place is %s, so it makes no effect.' 'amp_guard can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.'
% tracer._expected_place) % tracer._expected_place)
enable = False enable = False
......
...@@ -90,9 +90,10 @@ class AmpScaler(object): ...@@ -90,9 +90,10 @@ class AmpScaler(object):
raise ValueError( raise ValueError(
"current_tracer is None, maybe it is not in imperative mode.") "current_tracer is None, maybe it is not in imperative mode.")
if enable and not tracer._expected_place.is_gpu_place(): if enable and not (tracer._expected_place.is_gpu_place() or
tracer._expected_place.is_xpu_place()):
warnings.warn( warnings.warn(
'AmpScaler can only be enabled on CUDAPlace, current place is %s, so it makes no effect.' 'AmpScaler can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.'
% tracer._expected_place) % tracer._expected_place)
enable = False enable = False
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("..")
import paddle
import unittest
import numpy as np
from op_test_xpu import XPUOpTest
from op_test import OpTest, skip_check_grad_ci
import paddle.fluid as fluid
paddle.enable_static()
class TestCheckFiniteAndUnscaleOp(XPUOpTest):
    """Checks the `check_finite_and_unscale` op on XPU with finite input.

    The expected output is the input divided by the scale, with the
    `FoundInfinite` flag left at 0.
    """

    def setUp(self):
        self.op_type = "check_finite_and_unscale"
        self.init_dtype()

        # Draw the input first and the scale second so the random stream is
        # consumed in a fixed order.
        data = np.random.random((1024, 1024)).astype(self.dtype)
        scale_factor = np.random.random(1).astype(self.dtype)
        expected = data / scale_factor

        self.inputs = {'X': [('x0', data)], 'Scale': scale_factor}
        self.outputs = {
            'FoundInfinite': np.array([0]),
            'Out': [('out0', expected)],
        }

    def init_dtype(self):
        # Element type used for both the input tensor and the scale.
        self.dtype = np.float32

    def test_check_output(self):
        # Nothing to verify when the build has no XPU support.
        if not paddle.is_compiled_with_xpu():
            return
        self.check_output_with_place(paddle.XPUPlace(0))
# class TestCheckFiniteAndUnscaleOpWithNan(XPUOpTest):
# def setUp(self):
# self.op_type = "check_finite_and_unscale"
# self.init_dtype()
# x = np.random.random((1024, 1024)).astype(self.dtype)
# x[128][128] = np.nan
# print("x shape = ", x.shape)
# print(x)
# scale = np.random.random((1)).astype(self.dtype)
# self.inputs = {'X': [('x0', x)], 'Scale': scale}
# self.outputs = {
# 'FoundInfinite': np.array([1]),
# 'Out': [('out0', x)],
# }
# def init_dtype(self):
# self.dtype = np.float32
# def test_check_output(self):
# # When input contains nan, do not check the output,
# # since the output may be nondeterministic and will be discarded.
# if paddle.is_compiled_with_xpu():
# place = paddle.XPUPlace(0)
# self.check_output_with_place(place, no_check_set=['Out'])
# class TestCheckFiniteAndUnscaleOpWithInf(XPUOpTest):
# def setUp(self):
# self.op_type = "check_finite_and_unscale"
# self.init_dtype()
# x = np.random.random((1024, 1024)).astype(self.dtype)
# x[128][128] = np.inf
# scale = np.random.random((1)).astype(self.dtype)
# self.inputs = {'X': [('x0', x)], 'Scale': scale}
# self.outputs = {
# 'FoundInfinite': np.array([1]),
# 'Out': [('out0', x)],
# }
# def init_dtype(self):
# self.dtype = np.float32
# def test_check_output(self):
# # When input contains inf, do not check the output,
# # since the output may be nondeterministic and will be discarded.
# if paddle.is_compiled_with_xpu():
# place = paddle.XPUPlace(0)
# self.check_output_with_place(place, no_check_set=['Out'])
if __name__ == '__main__':
unittest.main()
...@@ -22,9 +22,11 @@ from op_test import OpTest, skip_check_grad_ci ...@@ -22,9 +22,11 @@ from op_test import OpTest, skip_check_grad_ci
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import Program, program_guard from paddle.fluid import Program, program_guard
from op_test_xpu import XPUOpTest
paddle.enable_static()
class TestDropoutOp(OpTest): class TestDropoutOp(XPUOpTest):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64)).astype("float32")} self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
...@@ -47,7 +49,7 @@ class TestDropoutOp(OpTest): ...@@ -47,7 +49,7 @@ class TestDropoutOp(OpTest):
self.check_grad_with_place(place, ['X'], 'Out') self.check_grad_with_place(place, ['X'], 'Out')
class TestDropoutOpInput1d(OpTest): class TestDropoutOpInput1d(XPUOpTest):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
self.inputs = {'X': np.random.random((2000, )).astype("float32")} self.inputs = {'X': np.random.random((2000, )).astype("float32")}
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import sys
sys.path.append("..")
import numpy as np
from op_test import OpTest
from op_test_xpu import XPUOpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn
paddle.enable_static()
class TestUpdateLossScalingOp(XPUOpTest):
    """Checks `update_loss_scaling` on XPU when no inf/nan was found.

    With 999 good steps already recorded and `incr_every_n_steps` set to
    1000, the expected outputs are a loss scaling multiplied by `incr_ratio`
    and both step counters reset to zero.
    """

    def setUp(self):
        self.op_type = "update_loss_scaling"
        self.init()
        # np.bool_ is used instead of the `np.bool` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24.
        found_inf = np.array([False], dtype=np.bool_)
        x = np.random.random((1024, 1024)).astype(self.dtype)

        self.inputs = {
            'X': [('x0', x)],
            'FoundInfinite': found_inf,
            'PrevLossScaling': self.prev_loss_scaling,
            'InGoodSteps': self.num_good_steps,
            'InBadSteps': self.num_bad_steps
        }

        self.outputs = {
            'Out': [('out0', x)],
            'LossScaling': self.prev_loss_scaling * self.incr_ratio,
            'OutGoodSteps': self.zero_steps,
            'OutBadSteps': self.zero_steps
        }

    def init(self):
        """Sets the op attributes and the shared input/expected-output arrays."""
        self.incr_ratio = 2.0
        self.decr_ratio = 0.8
        self.dtype = np.float32
        self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
        self.num_good_steps = np.array([999], dtype=np.int32)
        self.num_bad_steps = np.array([1], dtype=np.int32)
        self.zero_steps = np.array([0], dtype=np.int32)
        self.attrs = {
            'incr_every_n_steps': 1000,
            'decr_every_n_nan_or_inf': 2,
            'incr_ratio': self.incr_ratio,
            'decr_ratio': self.decr_ratio,
        }

    def test_check_output(self):
        # Only runs when the build has XPU support; 'Out' is excluded from
        # the comparison.
        if paddle.is_compiled_with_xpu():
            place = paddle.XPUPlace(0)
            self.check_output_with_place(place, no_check_set=['Out'])
class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
    """Checks `update_loss_scaling` on XPU when an inf was found.

    With one bad step already recorded and `decr_every_n_nan_or_inf` set to
    2, the expected outputs are a zeroed `Out`, a loss scaling multiplied by
    `decr_ratio`, and both step counters reset to zero.
    """

    def setUp(self):
        self.op_type = "update_loss_scaling"
        self.init()
        # np.bool_ is used instead of the `np.bool` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24.
        found_inf = np.array([True], dtype=np.bool_)
        x = np.random.random((1024, 1024)).astype(self.dtype)
        # Plant a single inf at a random position of the input.
        i = np.random.randint(0, 1024, 1)
        j = np.random.randint(0, 1024, 1)
        x[i[0]][j[0]] = np.inf

        self.inputs = {
            'X': [('x0', x)],
            'FoundInfinite': found_inf,
            'PrevLossScaling': self.prev_loss_scaling,
            'InGoodSteps': self.num_good_steps,
            'InBadSteps': self.num_bad_steps
        }

        self.outputs = {
            'Out': [('out0', np.zeros_like(x))],
            'LossScaling': self.prev_loss_scaling * self.decr_ratio,
            'OutGoodSteps': self.zero_steps,
            'OutBadSteps': self.zero_steps
        }

    def test_check_output(self):
        # Unlike the parent class, every output (including 'Out') is checked.
        if paddle.is_compiled_with_xpu():
            place = paddle.XPUPlace(0)
            self.check_output_with_place(place)
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
                 "core is not compiled with XPU")
class TestUpdateLossScalingLayer(unittest.TestCase):
    """Runs `amp_nn.update_loss_scaling` through a static program on XPU.

    Two scenarios are covered: all-finite inputs (loss scaling is increased)
    and an input containing inf (outputs are zeroed and loss scaling is
    decreased).
    """

    def _run_update_loss_scaling(self, a_v, b_v, found_inf_v, incr_ratio,
                                 decr_ratio, scope):
        """Builds the program, feeds the given values and returns the results.

        Returns a tuple `(result_v, prev_loss_scaling_v, num_good_steps_v,
        num_bad_steps_v)` where `result_v` is the fetched list and the other
        entries are the host-side values that were fed.
        """
        a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
        b = fluid.data(name="b", shape=[512, 128], dtype='float32')
        x = [a, b]
        found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
        prev_loss_scaling = fluid.data(
            name="prev_loss_scaling", shape=[1], dtype='float32')
        num_good_steps = fluid.data(
            name="num_good_steps", shape=[1], dtype='int32')
        num_bad_steps = fluid.data(
            name="num_bad_steps", shape=[1], dtype='int32')

        prev_loss_scaling_v = np.array([2048]).astype('float32')
        num_good_steps_v = np.array([999], dtype=np.int32)
        num_bad_steps_v = np.array([1], dtype=np.int32)

        incr_every_n_steps = 1000
        decr_every_n_nan_or_inf = 2

        result = amp_nn.update_loss_scaling(
            x,
            found_inf,
            prev_loss_scaling,
            num_good_steps,
            num_bad_steps,
            incr_every_n_steps,
            decr_every_n_nan_or_inf,
            incr_ratio,
            decr_ratio,
            name="update_loss_scaling")

        place = fluid.XPUPlace(0)
        exe = fluid.Executor(place)
        with fluid.scope_guard(scope):
            exe.run(fluid.default_startup_program())
            result_v = exe.run(feed={
                'a': a_v,
                'b': b_v,
                'found_inf': found_inf_v,
                'prev_loss_scaling': prev_loss_scaling_v,
                'num_good_steps': num_good_steps_v,
                'num_bad_steps': num_bad_steps_v
            },
                               fetch_list=[
                                   result, x, found_inf, prev_loss_scaling,
                                   num_good_steps, num_bad_steps
                               ])
        return result_v, prev_loss_scaling_v, num_good_steps_v, num_bad_steps_v

    def loss_scaling_check(self, scope=None):
        """All-finite inputs: outputs equal inputs, loss scaling is increased.

        Args:
            scope: scope to run in; a fresh `fluid.Scope()` is created when
                omitted, so calls do not share one default scope object.
        """
        if scope is None:
            scope = fluid.Scope()

        a_v = np.random.random([1024, 1024]).astype('float32')
        b_v = np.random.random([512, 128]).astype('float32')
        found_inf_v = np.array([False]).astype('bool')
        incr_ratio = 2
        decr_ratio = 0.8

        (result_v, prev_loss_scaling_v, num_good_steps_v,
         num_bad_steps_v) = self._run_update_loss_scaling(
             a_v, b_v, found_inf_v, incr_ratio, decr_ratio, scope)

        # self.assert* is used so the checks survive `python -O`.
        self.assertTrue(np.array_equal(result_v[0], a_v))
        self.assertTrue(np.array_equal(result_v[1], b_v))
        self.assertTrue(np.array_equal(result_v[0], result_v[2]))
        self.assertTrue(np.array_equal(result_v[1], result_v[3]))
        self.assertTrue(np.array_equal(result_v[4], found_inf_v))
        self.assertTrue(
            np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio))
        self.assertTrue(
            np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)))
        self.assertTrue(
            np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)))

    def loss_scaling_check_inf(self, use_cuda=True, scope=None):
        """Input with an inf: outputs are zeroed, loss scaling is decreased.

        Args:
            use_cuda: unused; kept so existing callers keep working.
            scope: scope to run in; a fresh `fluid.Scope()` is created when
                omitted, so calls do not share one default scope object.
        """
        if scope is None:
            scope = fluid.Scope()

        a_v = np.random.random([1024, 1024]).astype('float32')
        b_v = np.random.random([512, 128]).astype('float32')
        # Plant a single inf at a random position of the first input.
        i = np.random.randint(0, 1024, 1)
        j = np.random.randint(0, 1024, 1)
        a_v[i[0]][j[0]] = np.inf
        found_inf_v = np.array([True]).astype('bool')
        incr_ratio = 2
        decr_ratio = 0.8

        (result_v, prev_loss_scaling_v, num_good_steps_v,
         num_bad_steps_v) = self._run_update_loss_scaling(
             a_v, b_v, found_inf_v, incr_ratio, decr_ratio, scope)

        self.assertTrue(np.array_equal(result_v[0], np.zeros_like(a_v)))
        self.assertTrue(np.array_equal(result_v[1], np.zeros_like(b_v)))
        self.assertTrue(np.array_equal(result_v[2], np.zeros_like(a_v)))
        self.assertTrue(np.array_equal(result_v[3], np.zeros_like(b_v)))
        self.assertTrue(np.array_equal(result_v[4], found_inf_v))
        self.assertTrue(
            np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio))
        self.assertTrue(
            np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)))
        self.assertTrue(
            np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)))

    def test_loss_scaling(self):
        # Build into private programs so the default programs stay untouched.
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                self.loss_scaling_check()

    def test_loss_scaling_inf(self):
        # Build into private programs so the default programs stay untouched.
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                self.loss_scaling_check_inf()
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册