Unverified commit 0f266ac1, authored by taixiurong, committed by GitHub

cherry pick xpu to 2.1 (#34000)

* update xpu cmake for kunlun (#33328)

* xpu support amp (#33809)

* fix bug DLTP-31078 (#33877)

* update xpu cmake (#33906)

* [xpu] add dropout & amp ops in xpu place (#33891)
Co-authored-by: TTerror <tangzhiyi11@users.noreply.github.com>
Parent commit: ed7903cd
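The changes below wire XPU (Baidu Kunlun) support into Paddle's AMP stack: the `check_finite_and_unscale`, `update_loss_scaling`, `dropout`, and `matmul`/`matmul_v2` XPU kernels gain float16 variants, and `amp_guard`/`AmpScaler` accept XPUPlace. As a hedged, illustrative sketch (not part of this diff), the dygraph AMP path might be exercised as follows on a Paddle 2.1 build compiled with XPU; `paddle.set_device('xpu')`, `paddle.amp.auto_cast`, and `paddle.amp.GradScaler` are assumed to be the public entry points that route to the `amp_guard`/`AmpScaler` changes shown further down.

```python
# Hedged usage sketch (not part of this diff): dygraph AMP on an XPU place.
# Assumes a Paddle 2.1 build compiled with XPU and a visible Kunlun card;
# paddle.set_device / paddle.amp.auto_cast / paddle.amp.GradScaler are assumed
# to forward to the amp_guard / AmpScaler / XPU kernels changed below.
import paddle

paddle.set_device('xpu')
model = paddle.nn.Linear(64, 64)
opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=2048)  # backed by AmpScaler / update_loss_scaling

data = paddle.rand([8, 64])
with paddle.amp.auto_cast():          # backed by amp_guard, now allowed on XPUPlace
    loss = model(data).mean()
scaled = scaler.scale(loss)           # multiply the loss by the current loss scale
scaled.backward()
scaler.minimize(opt, scaled)          # unscale grads, check_finite_and_unscale, update_loss_scaling
opt.clear_grad()
```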
@@ -27,19 +27,18 @@ ELSEIF(WITH_CENTOS)
SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64")
SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64")
SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
ELSE ()
SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
ENDIF()
IF(NOT XPU_BASE_URL)
SET(XPU_BASE_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20210527")
ENDIF()
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701")
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XCCL_URL "${XPU_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
@@ -96,7 +95,11 @@ ELSE(WITH_XPU_BKCL)
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
ENDIF(WITH_XPU_BKCL)
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
if(NOT XPU_SDK_ROOT)
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
else()
ADD_CUSTOM_TARGET(extern_xpu DEPENDS xpulib)
endif()
# Ensure that xpu/api.h can be included without dependency errors.
file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "")
......
@@ -33,7 +33,8 @@ AmpOperators::AmpOperators()
for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
bool supported = false;
for (auto& kernel_type : it->second) {
if (platform::is_gpu_place(kernel_type.first.place_) &&
if ((platform::is_gpu_place(kernel_type.first.place_) ||
platform::is_xpu_place(kernel_type.first.place_)) &&
kernel_type.first.data_type_ == fp16_dtype) {
supported = true;
}
@@ -91,7 +92,8 @@ inline std::string GetDtypeStr(
inline bool NeedCast(const std::shared_ptr<VarBase>& var) {
if (platform::is_gpu_place(var->Place()) ||
platform::is_cuda_pinned_place(var->Place())) {
platform::is_cuda_pinned_place(var->Place()) ||
platform::is_xpu_place(var->Place())) {
// CudaPinndePlace is added for varbase created by dataloader
if (var->DataType() == framework::proto::VarType::FP32 ||
var->DataType() == framework::proto::VarType::FP16) {
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
using XPUTyp = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const MPDType* scale_data = scale->data<MPDType>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
// cpy to cpu
bool cpu_found_inf_data = false;
MPDType cpu_scale_data;
if (platform::is_xpu_place(scale->place())) {
xpu_memcpy(&cpu_scale_data, scale_data, sizeof(MPDType),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
} else {
cpu_scale_data = (*scale_data);
}
MPDType inverse_scale = 1.0 / cpu_scale_data;
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
dev_ctx);
framework::Tensor is_nan =
ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
dev_ctx);
framework::Tensor is_finite_and_nan =
ctx.AllocateTmpTensor<bool, platform::XPUDeviceContext>(x->dims(),
dev_ctx);
if (cpu_found_inf_data == false) {
int r = xpu::isfinite(dev_ctx.x_context(),
reinterpret_cast<const XPUTyp*>(x->data<T>()),
is_finite.data<bool>(), x->numel());
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(isfinite) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::logical_not(dev_ctx.x_context(), reinterpret_cast<const bool*>(
is_finite.data<bool>()),
is_finite.data<bool>(), x->numel());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU API(logical_not) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::isnan(dev_ctx.x_context(),
reinterpret_cast<const XPUTyp*>(x->data<T>()),
is_nan.data<bool>(), x->numel());
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(isnan) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::logical_or(dev_ctx.x_context(), is_finite.data<bool>(),
is_nan.data<bool>(), is_finite.data<bool>(),
x->numel());
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External("XPU API(logical_or) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::any(dev_ctx.x_context(), is_finite.data<bool>(),
found_inf_data, x->numel());
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(any) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
memory::Copy(platform::CPUPlace(), &cpu_found_inf_data,
BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
found_inf_data, sizeof(bool));
}
if (cpu_found_inf_data) {
inverse_scale = 0.0;
}
auto dev_env = XPUEnv::getenv("XPUSIM_DEVICE_MODEL");
if (std::is_same<T, paddle::platform::float16>::value &&
(dev_env == nullptr || std::strcmp(dev_env, "KUNLUN1"))) {
framework::Tensor float_x;
framework::Tensor float_out;
float_x.mutable_data<MPDType>(dev_ctx.GetPlace(),
x->numel() * sizeof(MPDType));
float_out.mutable_data<MPDType>(dev_ctx.GetPlace(),
out->numel() * sizeof(MPDType));
int r = xpu::cast_v2(dev_ctx.x_context(),
reinterpret_cast<const float16*>(x->data<T>()),
float_x.data<MPDType>(), x->numel());
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(cast_v2) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::scale(dev_ctx.x_context(), float_x.data<MPDType>(),
float_out.data<MPDType>(), x->numel(), false,
inverse_scale, 0.0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(scale) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::cast_v2(dev_ctx.x_context(), float_out.data<MPDType>(),
reinterpret_cast<float16*>(out->data<T>()),
out->numel());
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(cast_v2) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
}
} else {
int r = xpu::scale(dev_ctx.x_context(),
reinterpret_cast<const XPUTyp*>(x->data<T>()),
reinterpret_cast<XPUTyp*>(out->data<T>()),
x->numel(), false, inverse_scale, 0.0);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(scale) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
}
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
found_inf_data, platform::CPUPlace(), &cpu_found_inf_data,
sizeof(bool));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleXPUKernel<float>,
ops::CheckFiniteAndUnscaleXPUKernel<plat::float16>);
#endif
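For reference, a hedged NumPy sketch (not part of this diff) of the semantics the XPU `check_finite_and_unscale` kernel above implements: every input is multiplied by `1/Scale`, and `FoundInfinite` reports whether any input holds an Inf/NaN, in which case the kernel zeroes the remaining outputs (the commented-out NaN/Inf tests at the bottom of this page therefore skip checking `Out`). The helper name below is hypothetical.

```python
# Hedged NumPy reference (not part of this diff) for the op semantics above.
# Simplified: the kernel checks the inputs sequentially and zeroes outputs only
# once a non-finite tensor has been seen; here the check is done up front.
import numpy as np

def check_finite_and_unscale_ref(xs, scale):
    found_inf = any(not np.all(np.isfinite(x)) for x in xs)   # FoundInfinite
    inverse_scale = 0.0 if found_inf else 1.0 / float(scale)
    outs = [x * inverse_scale for x in xs]                    # Out = X / Scale (or 0)
    return outs, np.array([found_inf])
```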
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
using XPUTyp = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
const auto* found_inf = ctx.Input<Tensor>("FoundInfinite");
PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
platform::errors::InvalidArgument(
"FoundInfinite must has only one element."));
const bool* found_inf_data = found_inf->data<bool>();
bool cpu_found_inf_data = false;
if (platform::is_xpu_place(found_inf->place())) {
xpu_memcpy(&cpu_found_inf_data, found_inf_data, sizeof(bool),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
} else {
cpu_found_inf_data = (*found_inf_data);
}
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
if (cpu_found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --";
int r = 0;
r = xpu::constant(dev_ctx.x_context(),
reinterpret_cast<XPUTyp*>(out_data), num,
XPUTyp(0.0));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(constant) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
}
}
const bool stop_update = ctx.Attr<bool>("stop_update");
if (stop_update) {
return;
}
const auto* pre_loss_scaling = ctx.Input<Tensor>("PrevLossScaling");
const auto* good_in = ctx.Input<Tensor>("InGoodSteps");
const auto* bad_in = ctx.Input<Tensor>("InBadSteps");
auto* updated_loss_scaling = ctx.Output<Tensor>("LossScaling");
auto* good_out = ctx.Output<Tensor>("OutGoodSteps");
auto* bad_out = ctx.Output<Tensor>("OutBadSteps");
const MPDType* pre_loss_scaling_data = pre_loss_scaling->data<MPDType>();
const int* good_in_data = good_in->data<int>();
const int* bad_in_data = bad_in->data<int>();
MPDType* updated_loss_scaling_data =
updated_loss_scaling->mutable_data<MPDType>(dev_ctx.GetPlace());
int* good_out_data = good_out->mutable_data<int>(dev_ctx.GetPlace());
int* bad_out_data = bad_out->mutable_data<int>(dev_ctx.GetPlace());
const int incr_every_n_steps = ctx.Attr<int>("incr_every_n_steps");
const int decr_every_n_nan_or_inf =
ctx.Attr<int>("decr_every_n_nan_or_inf");
const float incr_ratio = ctx.Attr<float>("incr_ratio");
const float decr_ratio = ctx.Attr<float>("decr_ratio");
int cpu_bad_in_data;
int cpu_good_in_data;
MPDType cpu_pre_loss_scaling_data;
if (platform::is_xpu_place(bad_in->place())) {
xpu_memcpy(&cpu_bad_in_data, bad_in_data, sizeof(int),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
} else {
cpu_bad_in_data = (*bad_in_data);
}
if (platform::is_xpu_place(good_in->place())) {
xpu_memcpy(&cpu_good_in_data, good_in_data, sizeof(int),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
} else {
cpu_good_in_data = (*good_in_data);
}
if (platform::is_xpu_place(pre_loss_scaling->place())) {
xpu_memcpy(&cpu_pre_loss_scaling_data, pre_loss_scaling_data,
sizeof(MPDType), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
} else {
cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
}
int cpu_good_out_data = 0;
int cpu_bad_out_data = 0;
MPDType cpu_updated_loss_scaling_data;
if (cpu_found_inf_data) {
cpu_good_out_data = 0;
cpu_bad_out_data = cpu_bad_in_data + 1;
if (cpu_bad_out_data == decr_every_n_nan_or_inf) {
MPDType new_loss_scaling = cpu_pre_loss_scaling_data * decr_ratio;
cpu_updated_loss_scaling_data =
(new_loss_scaling < static_cast<MPDType>(1))
? (static_cast<MPDType>(1))
: (new_loss_scaling);
cpu_bad_out_data = 0;
}
} else {
cpu_bad_out_data = 0;
cpu_good_out_data = cpu_good_in_data + 1;
if (cpu_good_out_data == incr_every_n_steps) {
MPDType new_loss_scaling = cpu_pre_loss_scaling_data * incr_ratio;
cpu_updated_loss_scaling_data = (std::isfinite(new_loss_scaling))
? new_loss_scaling
: cpu_pre_loss_scaling_data;
cpu_good_out_data = 0;
}
}
// copy to host
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
bad_out_data, platform::CPUPlace(), &cpu_bad_out_data,
sizeof(int));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
good_out_data, platform::CPUPlace(), &cpu_good_out_data,
sizeof(int));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
updated_loss_scaling_data, platform::CPUPlace(),
&cpu_updated_loss_scaling_data, sizeof(MPDType));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(update_loss_scaling,
ops::UpdateLossScalingXPUKernel<float>,
ops::UpdateLossScalingXPUKernel<plat::float16>);
#endif
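The loss-scale update itself is computed on the host (the `cpu_*` variables above) and only the results are copied back to the XPU. A hedged Python transcription (not part of this diff) of that intended update rule, with a hypothetical helper name:

```python
# Hedged Python transcription (not part of this diff) of the host-side update
# rule in the cpu_* branch above; the helper name is hypothetical.
import math

def update_loss_scaling_ref(found_inf, prev_scale, good_in, bad_in,
                            incr_every_n_steps, decr_every_n_nan_or_inf,
                            incr_ratio, decr_ratio):
    new_scale = prev_scale
    if found_inf:
        good_out, bad_out = 0, bad_in + 1
        if bad_out == decr_every_n_nan_or_inf:
            new_scale = max(prev_scale * decr_ratio, 1.0)  # never shrink below 1
            bad_out = 0
    else:
        bad_out, good_out = 0, good_in + 1
        if good_out == incr_every_n_steps:
            grown = prev_scale * incr_ratio
            new_scale = grown if math.isfinite(grown) else prev_scale
            good_out = 0
    return new_scale, good_out, bad_out
```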
@@ -23,21 +23,9 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
class XPUFPTypeTrait {
public:
using Type = T;
};
template <>
class XPUFPTypeTrait<platform::float16> {
public:
using Type = float16;
};
template <typename DeviceContext, typename InT>
class CastXPUKernel : public framework::OpKernel<InT> {
using XPUInTDType = typename XPUFPTypeTrait<InT>::Type;
using XPUInTDType = typename XPUTypeTrait<InT>::Type;
public:
void Compute(const framework::ExecutionContext& context) const override {
@@ -49,7 +37,6 @@ class CastXPUKernel : public framework::OpKernel<InT> {
context.Attr<int>("out_dtype"));
auto* in_data = in->data<InT>();
// using XPUOutTDType = typename XPUFPTypeTrait<InT>::Type;
auto numel = in->numel();
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = -1;
......
@@ -16,11 +16,11 @@ namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_XPU
static std::map<int, float*> mask_data_tables;
static const int max_data_size = 32 * 1024 * 1024;
static std::mutex s_mask_data_table_lock;
template <typename DeviceContext, typename T>
class DropoutXPUKernel : public framework::OpKernel<T> {
using XPUTyp = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
@@ -30,93 +30,70 @@ class DropoutXPUKernel : public framework::OpKernel<T> {
float dropout_prob = context.Attr<float>("dropout_prob");
auto dropout_implementation =
context.Attr<std::string>("dropout_implementation");
float* mask_data_table = nullptr;
auto& dev_ctx = context.template device_context<DeviceContext>();
PADDLE_ENFORCE_EQ(!context.HasInput("Seed"), true,
platform::errors::InvalidArgument(
("Input(Seed) not supported on XPU")));
int is_upscale = (dropout_implementation == "upscale_in_train");
if (!context.Attr<bool>("is_test")) {
int dev_id =
BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()).GetDeviceId();
int prop = static_cast<int>(dropout_prob * 100);
int is_upscale = (dropout_implementation == "upscale_in_train");
/* mask_data_tables key contains 3 part:
* | 31-16 | 15-8 | 7-0 |
* | dev_id | prob | is_upscale |
*/
int index = (dev_id << 16) + (prop << 8) + is_upscale;
std::lock_guard<std::mutex> lock(s_mask_data_table_lock);
if (mask_data_tables.find(index) == mask_data_tables.end()) {
float* mask_data_host = new float[max_data_size];
std::random_device rnd;
std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed);
std::uniform_real_distribution<float> dist(0, 1);
for (size_t i = 0; i < max_data_size; ++i) {
if (dist(engine) < dropout_prob) {
mask_data_host[i] = 0.0f;
} else {
if (is_upscale) {
mask_data_host[i] = 1.0f / static_cast<T>(1.0f - dropout_prob);
} else {
mask_data_host[i] = 1.0;
}
}
}
PADDLE_ENFORCE_EQ(
xpu_malloc(reinterpret_cast<void**>(&mask_data_table),
max_data_size * sizeof(float)),
XPU_SUCCESS,
platform::errors::ResourceExhausted(
"\n\nOut of memory error on XPU, Cannot"
"allocate %s memory on XPU. \n\nPlease "
"check whether there is any other process "
"using XPU.\n",
string::HumanReadableSize(max_data_size * sizeof(void*))));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()),
mask_data_table, platform::CPUPlace(), mask_data_host,
max_data_size * sizeof(float));
mask_data_tables[index] = mask_data_table;
free(mask_data_host);
std::random_device rnd;
// int seed = (context.Attr<bool>("fix_seed")) ?
// int(context.Attr<int>("seed")) : (rnd());
int seed = 0;
if (context.Attr<bool>("fix_seed") == true) {
seed = static_cast<int>(context.Attr<int>("seed"));
} else {
mask_data_table = mask_data_tables[index];
seed = rnd();
}
}
if (!context.Attr<bool>("is_test")) { // Train
auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace());
size_t size = framework::product(mask->dims());
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = xpu::dropout(dev_ctx.x_context(), mask_data_table, x_data,
mask_data, y_data, max_data_size, size);
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::External(
"XPU dropout return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
} else { // Infer
float scale = 0.0f;
if (dropout_implementation == "upscale_in_train") {
scale = 1.0f;
} else {
scale = static_cast<T>(1.0f - dropout_prob);
// Special case when dropout_prob is 1.0
if (dropout_prob == 1.0f) {
int r = xpu::constant(dev_ctx.x_context(),
reinterpret_cast<XPUTyp*>(y_data), y->numel(),
XPUTyp(0));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(constant) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
r = xpu::constant(dev_ctx.x_context(),
reinterpret_cast<XPUTyp*>(mask_data), mask->numel(),
XPUTyp(0));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(constant) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
return;
}
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = xpu::scale(dev_ctx.x_context(), x->numel(), scale, 0.0f, 0,
x_data, y_data);
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::External(
"XPU dropout return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
int r = xpu::dropout(dev_ctx.x_context(),
reinterpret_cast<const XPUTyp*>(x->data<T>()),
reinterpret_cast<XPUTyp*>(y->data<T>()),
reinterpret_cast<XPUTyp*>(mask_data), seed,
mask->numel(), is_upscale, dropout_prob);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(dropout) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
} else {
float scale =
(is_upscale) ? (1.0) : (static_cast<float>(1.0f - dropout_prob));
int r = xpu::scale(
dev_ctx.x_context(), reinterpret_cast<const XPUTyp*>(x_data),
reinterpret_cast<XPUTyp*>(y_data), x->numel(), false, scale, 0.0f);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(scale) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
}
}
};
template <typename DeviceContext, typename T>
class DropoutGradXPUKernel : public framework::OpKernel<T> {
using XPUTyp = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE_EQ(!context.Attr<bool>("is_test"), true,
@@ -127,23 +104,47 @@ class DropoutGradXPUKernel : public framework::OpKernel<T> {
auto* mask = context.Input<Tensor>("Mask");
grad_x->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
int r = xpu::elementwise_mul(dev_ctx.x_context(), grad_y->data<T>(),
mask->data<T>(), grad_x->data<T>(),
grad_y->numel());
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::External(
"XPU dropout return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
r));
auto& dropout_implementation =
context.Attr<std::string>("dropout_implementation");
float dropout_prob = context.Attr<float>("dropout_prob");
const T* mask_data = mask->data<T>();
framework::Tensor mask_new;
if (dropout_implementation == "upscale_in_train") {
mask_new = context.AllocateTmpTensor<T, platform::XPUDeviceContext>(
mask->dims(), dev_ctx);
float scale =
(dropout_prob == 1.0f) ? (1.0f) : (1.0f / (1.0f - dropout_prob));
int r = xpu::scale(dev_ctx.x_context(),
reinterpret_cast<const XPUTyp*>(mask->data<T>()),
reinterpret_cast<XPUTyp*>(mask_new.data<T>()),
mask->numel(), false, scale, 0.0f);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External(
"XPU API(scale) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
mask_data = mask_new.data<T>();
}
int r = xpu::mul(
dev_ctx.x_context(), reinterpret_cast<const XPUTyp*>(grad_y->data<T>()),
reinterpret_cast<const XPUTyp*>(mask_data),
reinterpret_cast<XPUTyp*>(grad_x->data<T>()), grad_y->numel());
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(mul) return wrong "
"value[%d %s]",
r, XPUAPIErrorMsg[r]));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(
dropout, ops::DropoutXPUKernel<paddle::platform::XPUDeviceContext, float>);
dropout, ops::DropoutXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::DropoutXPUKernel<paddle::platform::XPUDeviceContext, plat::float16>);
REGISTER_OP_XPU_KERNEL(
dropout_grad,
ops::DropoutGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
ops::DropoutGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::DropoutGradXPUKernel<paddle::platform::XPUDeviceContext,
plat::float16>);
#endif
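A hedged NumPy reference (not part of this diff) for the dropout behaviour the new XPU kernels implement on the training path: a 0/1 mask is drawn from the seed and, under `upscale_in_train`, kept activations are scaled by `1/(1-p)` (the grad kernel applies the same rescaling to the stored mask); `dropout_prob == 1.0` short-circuits to all-zero output and mask. Helper names are hypothetical.

```python
# Hedged NumPy reference (not part of this diff) for the dropout forward/backward
# semantics the XPU kernels above implement; helper names are hypothetical.
import numpy as np

def dropout_forward_ref(x, p, seed=0, upscale_in_train=True):
    if p == 1.0:                                   # special case handled by xpu::constant above
        return np.zeros_like(x), np.zeros_like(x)
    rng = np.random.RandomState(seed)
    mask = (rng.uniform(size=x.shape) >= p).astype(x.dtype)   # 0/1 keep mask
    y = x * mask / (1.0 - p) if upscale_in_train else x * mask
    return y, mask

def dropout_backward_ref(dy, mask, p, upscale_in_train=True):
    scale = 1.0 if p == 1.0 else 1.0 / (1.0 - p)   # mirrors the grad kernel's mask rescale
    return dy * mask * (scale if upscale_in_train else 1.0)
```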
@@ -122,33 +122,50 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
axis));
std::vector<int> x_dims_vec(max_dim, 1);
std::vector<int> y_dims_vec(max_dim, 1);
int x_len = 1;
int y_len = 1;
if (x_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) {
x_dims_vec[i] = x_dims[i];
x_len *= x_dims_vec[i];
}
} else {
for (int i = 0; i < x_dims.size(); i++) {
x_dims_vec[i + axis] = x_dims[i];
x_len *= x_dims_vec[i];
}
}
if (y_dims.size() == max_dim) {
for (int i = 0; i < max_dim; i++) {
y_dims_vec[i] = y_dims[i];
y_len *= y_dims_vec[i];
}
} else {
for (int i = 0; i < y_dims.size(); i++) {
y_dims_vec[i + axis] = y_dims[i];
y_len *= y_dims_vec[i];
}
}
const T* dz_data = dz->data<T>();
framework::Tensor dx_local_tensor;
framework::Tensor dy_local_tensor;
bool need_wait = false;
T* dx_data = nullptr;
T* dy_data = nullptr;
if (dx) {
dx_data = dx->mutable_data<T>(ctx.GetPlace());
} else {
dx_data =
dx_local_tensor.mutable_data<T>(ctx.GetPlace(), x_len * sizeof(T));
need_wait = true;
}
if (dy) {
dy_data = dy->mutable_data<T>(ctx.GetPlace());
} else {
dy_data =
dy_local_tensor.mutable_data<T>(ctx.GetPlace(), y_len * sizeof(T));
need_wait = true;
}
auto& dev_ctx =
@@ -161,6 +178,9 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
platform::errors::External(
"XPU kernel Elementwise occur error in XPUElementwise error code ",
ret, XPUAPIErrorMsg[ret]));
if (need_wait && dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
}
}
};
......
@@ -102,6 +102,7 @@ template <typename T, typename FCT>
static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
bool trans_x, bool trans_y,
const paddle::framework::ExecutionContext &ctx) {
using XPUType = typename XPUTypeTrait<T>::Type;
const auto &x_dims = x->dims();
const auto &y_dims = y->dims();
auto &dev_ctx =
@@ -162,34 +163,36 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
int ldout = n;
if (batch_size <= 1) {
int r = 0;
r = xpu::fc_fusion<T, T, T, FCT>(
dev_ctx.x_context(), x->data<T>(), y->data<T>(), data_c, m, n, k,
mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy,
ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR);
r = xpu::fc_fusion<XPUType, XPUType, XPUType, FCT>(
dev_ctx.x_context(), reinterpret_cast<const XPUType *>(x->data<T>()),
reinterpret_cast<const XPUType *>(y->data<T>()),
reinterpret_cast<XPUType *>(data_c), m, n, k, mat_dim_a.trans_,
mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, ldout, alpha, 0,
nullptr, xpu::Activation_t::LINEAR);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU fc_fusion kernel return wrong value[%d %s]", r,
XPUAPIErrorMsg[r]));
} else {
// batch matmul
int r = xpu::fc_batched<T, T, T, FCT>(
dev_ctx.x_context(), // Context* ctx,
batch_size, // int batch_size,
mat_dim_a.trans_, // bool x_trans,
mat_dim_b.trans_, // bool w_trans,
m, // int m,
n, // int n,
k, // int k,
alpha, // float alpha,
reinterpret_cast<const T *>(x->data<T>()), // const TX* x,
mat_dim_a.stride_, // int stride_a,
reinterpret_cast<const T *>(y->data<T>()), // const TW* w,
mat_dim_b.stride_, // int stride_b,
0.0, // float beta,
reinterpret_cast<T *>(data_c), // TY* y,
m * n, // int stride_c,
nullptr, // const float* x_maxptr,
nullptr); // const float* w_maxptr
int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
dev_ctx.x_context(), // Context* ctx,
batch_size, // int batch_size,
mat_dim_a.trans_, // bool x_trans,
mat_dim_b.trans_, // bool w_trans,
m, // int m,
n, // int n,
k, // int k,
alpha, // float alpha,
reinterpret_cast<const XPUType *>(x->data<T>()), // const TX* x,
mat_dim_a.stride_, // int stride_a,
reinterpret_cast<const XPUType *>(y->data<T>()), // const TW* w,
mat_dim_b.stride_, // int stride_b,
0.0, // float beta,
reinterpret_cast<XPUType *>(data_c), // TY* y,
m * n, // int stride_c,
nullptr, // const float* x_maxptr,
nullptr); // const float* w_maxptr
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
@@ -210,10 +213,14 @@ class MatMulXPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(context.GetPlace());
bool trans_x = context.Attr<bool>("transpose_X");
bool trans_y = context.Attr<bool>("transpose_Y");
if (std::getenv("XPU_PADDLE_MAT_MUL_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, context);
} else {
if (std::is_same<paddle::platform::float16, T>::value) {
MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context);
} else {
if (std::getenv("XPU_PADDLE_MAT_MUL_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, context);
} else {
MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context);
}
}
}
};
@@ -224,6 +231,7 @@ class MatMulXPUKernel : public framework::OpKernel<T> {
template <typename DeviceContext, typename T>
static framework::Tensor XPUFoldHeadAndLastDims(
const DeviceContext &context, const framework::Tensor &input) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto in_dims = input.dims();
if (in_dims.size() != 3) {
return input;
@@ -236,8 +244,9 @@ static framework::Tensor XPUFoldHeadAndLastDims(
static_cast<int>(in_dims[1]),
static_cast<int>(in_dims[2])};
std::vector<int> axis_host = {1, 0, 2};
int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
in_shape_host, axis_host);
int r = xpu::transpose(
context.x_context(), reinterpret_cast<const XPUType *>(input.data<T>()),
reinterpret_cast<XPUType *>(output.data<T>()), in_shape_host, axis_host);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU transpose kernel return wrong value[%d %s]", r,
@@ -280,10 +289,14 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> {
const framework::Tensor &b, bool trans_b,
framework::Tensor *out) const {
out->mutable_data<T>(context.GetPlace());
if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, context);
} else {
if (std::is_same<paddle::platform::float16, T>::value) {
MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context);
} else {
if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, context);
} else {
MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context);
}
}
}
@@ -370,10 +383,14 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(
matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>);
matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, plat::float16>);
REGISTER_OP_XPU_KERNEL(
matmul_grad,
ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext,
plat::float16>);
#endif
@@ -25,6 +25,7 @@ template <typename T, typename FCT>
static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
bool trans_x, bool trans_y,
const paddle::framework::ExecutionContext& ctx) {
using XPUType = typename XPUTypeTrait<T>::Type;
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
auto& dev_ctx =
@@ -75,9 +76,11 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
int batch_size = mat_dim_a.batch_size_;
if (batch_size <= 1) {
int r = 0;
r = xpu::fc<T, T, T, FCT>(dev_ctx.x_context(), x->data<T>(), y->data<T>(),
data_c, m, n, k, mat_dim_a.trans_,
mat_dim_b.trans_, nullptr, nullptr, nullptr);
r = xpu::fc<XPUType, XPUType, XPUType, FCT>(
dev_ctx.x_context(), reinterpret_cast<const XPUType*>(x->data<T>()),
reinterpret_cast<const XPUType*>(y->data<T>()),
reinterpret_cast<XPUType*>(data_c), m, n, k, mat_dim_a.trans_,
mat_dim_b.trans_, nullptr, nullptr, nullptr);
PADDLE_ENFORCE_EQ(
r, XPU_SUCCESS,
platform::errors::External(
@@ -87,24 +90,24 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
r, XPUAPIErrorMsg[r], m, n, k, mat_dim_a.trans_, mat_dim_b.trans_));
} else {
// batch matmul
int r = xpu::fc_batched<T, T, T, FCT>(
dev_ctx.x_context(), // Context* ctx,
batch_size, // int batch_size,
mat_dim_a.trans_, // bool x_trans,
mat_dim_b.trans_, // bool w_trans,
m, // int m,
n, // int n,
k, // int k,
1.0, // float alpha,
reinterpret_cast<const T*>(x->data<T>()), // const TX* x,
mat_dim_a.stride_, // int stride_a,
reinterpret_cast<const T*>(y->data<T>()), // const TW* w,
mat_dim_b.stride_, // int stride_b,
0.0, // float beta,
reinterpret_cast<T*>(data_c), // TY* y,
m * n, // int stride_c,
nullptr, // const float* x_maxptr,
nullptr); // const float* w_maxptr
int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
dev_ctx.x_context(), // Context* ctx,
batch_size, // int batch_size,
mat_dim_a.trans_, // bool x_trans,
mat_dim_b.trans_, // bool w_trans,
m, // int m,
n, // int n,
k, // int k,
1.0, // float alpha,
reinterpret_cast<const XPUType*>(x->data<T>()), // const TX* x,
mat_dim_a.stride_, // int stride_a,
reinterpret_cast<const XPUType*>(y->data<T>()), // const TW* w,
mat_dim_b.stride_, // int stride_b,
0.0, // float beta,
reinterpret_cast<XPUType*>(data_c), // TY* y,
m * n, // int stride_c,
nullptr, // const float* x_maxptr,
nullptr); // const float* w_maxptr
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
@@ -123,10 +126,14 @@ class MatMulV2XPUKernel : public framework::OpKernel<T> {
bool trans_x = ctx.Attr<bool>("trans_x");
bool trans_y = ctx.Attr<bool>("trans_y");
out->mutable_data<T>(ctx.GetPlace());
if (std::getenv("XPU_PADDLE_MAT_MUL_V2_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, ctx);
} else {
if (std::is_same<paddle::platform::float16, T>::value) {
MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
} else {
if (std::getenv("XPU_PADDLE_MAT_MUL_V2_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, ctx);
} else {
MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
}
}
}
};
@@ -134,6 +141,7 @@ class MatMulV2XPUKernel : public framework::OpKernel<T> {
template <typename DeviceContext, typename T>
static framework::Tensor XPUFoldHeadAndLastDims(
const DeviceContext& context, const framework::Tensor& input) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto in_dims = input.dims();
if (in_dims.size() != 3) {
return input;
@@ -147,8 +155,9 @@ static framework::Tensor XPUFoldHeadAndLastDims(
static_cast<int>(in_dims[2])};
std::vector<int> axis_host = {1, 0, 2};
int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
in_shape_host, axis_host);
int r = xpu::transpose(
context.x_context(), reinterpret_cast<const XPUType*>(input.data<T>()),
reinterpret_cast<XPUType*>(output.data<T>()), in_shape_host, axis_host);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External(
"XPU transpose kernel return wrong value[%d %s]", r,
@@ -166,10 +175,14 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> {
const framework::Tensor& b, bool trans_b,
framework::Tensor* out) const {
out->mutable_data<T>(ctx.GetPlace());
if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, ctx);
} else {
if (std::is_same<paddle::platform::float16, T>::value) {
MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
} else {
if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, ctx);
} else {
MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
}
}
}
@@ -261,8 +274,10 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(matmul_v2, ops::MatMulV2XPUKernel<float>);
REGISTER_OP_XPU_KERNEL(matmul_v2_grad, ops::MatMulV2XPUGradKernel<float>);
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(matmul_v2, ops::MatMulV2XPUKernel<float>,
ops::MatMulV2XPUKernel<plat::float16>);
REGISTER_OP_XPU_KERNEL(matmul_v2_grad, ops::MatMulV2XPUGradKernel<float>,
ops::MatMulV2XPUGradKernel<plat::float16>);
#endif
@@ -47,8 +47,8 @@ class SoftmaxXPUKernel : public framework::OpKernel<T> {
int len = x->numel();
T* clip_x_data =
clip_x.mutable_data<T>(context.GetPlace(), len * sizeof(T));
r = xpu::clip(dev_ctx.x_context(), x->data<float>(), clip_x_data, len,
-1e30, 1e30);
r = xpu::clip_v2(dev_ctx.x_context(), x->data<float>(), clip_x_data, len,
static_cast<float>(-1e20), static_cast<float>(1e20));
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU API(clip) return wrong "
"value[%d %s]",
......
@@ -54,8 +54,9 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel<T> {
int len = logits->numel();
T* clip_logits_data =
clip_logits.mutable_data<T>(context.GetPlace(), len * sizeof(T));
r = xpu::clip(dev_ctx.x_context(), logits->data<float>(), clip_logits_data,
len, -1e30, 1e30);
r = xpu::clip_v2(dev_ctx.x_context(), logits->data<float>(),
clip_logits_data, len, static_cast<float>(-1e20),
static_cast<float>(1e20));
PADDLE_ENFORCE_EQ(
r, xpu::Error_t::SUCCESS,
platform::errors::External("XPU kernel error. clip "
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
#include <unordered_map>
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/float16.h"
#include "xpu/api.h"
#include "xpu/refactor/fusion.h"
#include "xpu/refactor/math.h"
@@ -58,4 +59,16 @@ static std::map<int, std::string> XPUAPIErrorMsg = {
{xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
{xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};
template <typename T>
class XPUTypeTrait {
public:
using Type = T;
};
template <>
class XPUTypeTrait<paddle::platform::float16> {
public:
using Type = float16;
};
#endif
@@ -224,7 +224,9 @@ OpSupportedInfos(const std::string &place,
[](unsigned char c) { return std::toupper(c); });
using fn_type = std::add_pointer<bool(const platform::Place &)>::type;
std::unordered_map<std::string, fn_type> is_target_place{
{"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
{"GPU", &platform::is_gpu_place},
{"CPU", &platform::is_cpu_place},
{"XPU", &platform::is_xpu_place},
};
PADDLE_ENFORCE_NE(
is_target_place.count(query_place), 0,
......
@@ -149,8 +149,14 @@ gray_list = {
# The set of ops that don't support fp16 calculation
# lookup_table fp16 is slower than fp32, though fp16 is supported.
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'GPU', core.VarDesc.VarType.FP16)
_sys_unsupported_fp16_list = []
if core.is_compiled_with_xpu():
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'XPU', core.VarDesc.VarType.FP16)
else:
_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
'GPU', core.VarDesc.VarType.FP16)
unsupported_fp16_list = {'lookup_table',
'lookup_table_v2'} | _sys_unsupported_fp16_list
......
@@ -128,9 +128,10 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
raise ValueError(
"current_tracer is None, maybe it is not in imperative mode.")
if enable and not tracer._expected_place.is_gpu_place():
if enable and not (tracer._expected_place.is_gpu_place() or
tracer._expected_place.is_xpu_place()):
warnings.warn(
'amp_guard can only be enabled on CUDAPlace, current place is %s, so it makes no effect.'
'amp_guard can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.'
% tracer._expected_place)
enable = False
......
@@ -90,9 +90,10 @@ class AmpScaler(object):
raise ValueError(
"current_tracer is None, maybe it is not in imperative mode.")
if enable and not tracer._expected_place.is_gpu_place():
if enable and not (tracer._expected_place.is_gpu_place() or
tracer._expected_place.is_xpu_place()):
warnings.warn(
'AmpScaler can only be enabled on CUDAPlace, current place is %s, so it makes no effect.'
'AmpScaler can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.'
% tracer._expected_place)
enable = False
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("..")
import paddle
import unittest
import numpy as np
from op_test_xpu import XPUOpTest
from op_test import OpTest, skip_check_grad_ci
import paddle.fluid as fluid
paddle.enable_static()
class TestCheckFiniteAndUnscaleOp(XPUOpTest):
def setUp(self):
self.op_type = "check_finite_and_unscale"
self.init_dtype()
x = np.random.random((1024, 1024)).astype(self.dtype)
scale = np.random.random((1)).astype(self.dtype)
# self.attrs = {'stop_gradient': True}
self.inputs = {'X': [('x0', x)], 'Scale': scale}
self.outputs = {
'FoundInfinite': np.array([0]),
'Out': [('out0', x / scale)],
}
def init_dtype(self):
self.dtype = np.float32
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
# class TestCheckFiniteAndUnscaleOpWithNan(XPUOpTest):
# def setUp(self):
# self.op_type = "check_finite_and_unscale"
# self.init_dtype()
# x = np.random.random((1024, 1024)).astype(self.dtype)
# x[128][128] = np.nan
# print("x shape = ", x.shape)
# print(x)
# scale = np.random.random((1)).astype(self.dtype)
# self.inputs = {'X': [('x0', x)], 'Scale': scale}
# self.outputs = {
# 'FoundInfinite': np.array([1]),
# 'Out': [('out0', x)],
# }
# def init_dtype(self):
# self.dtype = np.float32
# def test_check_output(self):
# # When input contains nan, do not check the output,
# # since the output may be nondeterministic and will be discarded.
# if paddle.is_compiled_with_xpu():
# place = paddle.XPUPlace(0)
# self.check_output_with_place(place, no_check_set=['Out'])
# class TestCheckFiniteAndUnscaleOpWithInf(XPUOpTest):
# def setUp(self):
# self.op_type = "check_finite_and_unscale"
# self.init_dtype()
# x = np.random.random((1024, 1024)).astype(self.dtype)
# x[128][128] = np.inf
# scale = np.random.random((1)).astype(self.dtype)
# self.inputs = {'X': [('x0', x)], 'Scale': scale}
# self.outputs = {
# 'FoundInfinite': np.array([1]),
# 'Out': [('out0', x)],
# }
# def init_dtype(self):
# self.dtype = np.float32
# def test_check_output(self):
# # When input contains inf, do not check the output,
# # since the output may be nondeterministic and will be discarded.
# if paddle.is_compiled_with_xpu():
# place = paddle.XPUPlace(0)
# self.check_output_with_place(place, no_check_set=['Out'])
if __name__ == '__main__':
unittest.main()
@@ -22,9 +22,11 @@ from op_test import OpTest, skip_check_grad_ci
import paddle
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
from op_test_xpu import XPUOpTest
paddle.enable_static()
class TestDropoutOp(OpTest):
class TestDropoutOp(XPUOpTest):
def setUp(self):
self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
@@ -47,7 +49,7 @@ class TestDropoutOp(OpTest):
self.check_grad_with_place(place, ['X'], 'Out')
class TestDropoutOpInput1d(OpTest):
class TestDropoutOpInput1d(XPUOpTest):
def setUp(self):
self.op_type = "dropout"
self.inputs = {'X': np.random.random((2000, )).astype("float32")}
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import sys
sys.path.append("..")
import numpy as np
from op_test import OpTest
from op_test_xpu import XPUOpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn
paddle.enable_static()
class TestUpdateLossScalingOp(XPUOpTest):
def setUp(self):
self.op_type = "update_loss_scaling"
self.init()
found_inf = np.array([False], dtype=np.bool)
x = np.random.random((1024, 1024)).astype(self.dtype)
self.inputs = {
'X': [('x0', x)],
'FoundInfinite': found_inf,
'PrevLossScaling': self.prev_loss_scaling,
'InGoodSteps': self.num_good_steps,
'InBadSteps': self.num_bad_steps
}
self.outputs = {
'Out': [('out0', x)],
'LossScaling': self.prev_loss_scaling * self.incr_ratio,
'OutGoodSteps': self.zero_steps,
'OutBadSteps': self.zero_steps
}
def init(self):
self.incr_ratio = 2.0
self.decr_ratio = 0.8
self.dtype = np.float32
self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
self.num_good_steps = np.array([999], dtype=np.int32)
self.num_bad_steps = np.array([1], dtype=np.int32)
self.zero_steps = np.array([0], dtype=np.int32)
self.attrs = {
'incr_every_n_steps': 1000,
'decr_every_n_nan_or_inf': 2,
'incr_ratio': self.incr_ratio,
'decr_ratio': self.decr_ratio,
}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place, no_check_set=['Out'])
class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
def setUp(self):
self.op_type = "update_loss_scaling"
self.init()
found_inf = np.array([True], dtype=np.bool)
x = np.random.random((1024, 1024)).astype(self.dtype)
i = np.random.randint(0, 1024, 1)
j = np.random.randint(0, 1024, 1)
x[i[0]][j[0]] = np.inf
self.inputs = {
'X': [('x0', x)],
'FoundInfinite': found_inf,
'PrevLossScaling': self.prev_loss_scaling,
'InGoodSteps': self.num_good_steps,
'InBadSteps': self.num_bad_steps
}
self.outputs = {
'Out': [('out0', np.zeros_like(x))],
'LossScaling': self.prev_loss_scaling * self.decr_ratio,
'OutGoodSteps': self.zero_steps,
'OutBadSteps': self.zero_steps
}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
#self.check_output()
class TestUpdateLossScalingLayer(unittest.TestCase):
def loss_scaling_check(self, scope=fluid.Scope()):
a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
b = fluid.data(name="b", shape=[512, 128], dtype='float32')
x = [a, b]
found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
prev_loss_scaling = fluid.data(
name="prev_loss_scaling", shape=[1], dtype='float32')
num_good_steps = fluid.data(
name="num_good_steps", shape=[1], dtype='int32')
num_bad_steps = fluid.data(
name="num_bad_steps", shape=[1], dtype='int32')
a_v = np.random.random([1024, 1024]).astype('float32')
b_v = np.random.random([512, 128]).astype('float32')
found_inf_v = np.array([False]).astype('bool')
prev_loss_scaling_v = np.array([2048]).astype('float32')
num_good_steps_v = np.array([999], dtype=np.int32)
num_bad_steps_v = np.array([1], dtype=np.int32)
incr_every_n_steps = 1000
decr_every_n_nan_or_inf = 2
incr_ratio = 2
decr_ratio = 0.8
result = amp_nn.update_loss_scaling(
x,
found_inf,
prev_loss_scaling,
num_good_steps,
num_bad_steps,
incr_every_n_steps,
decr_every_n_nan_or_inf,
incr_ratio,
decr_ratio,
name="update_loss_scaling")
place = fluid.XPUPlace(0)
exe = fluid.Executor(place)
with fluid.scope_guard(scope):
exe.run(fluid.default_startup_program())
result_v = exe.run(feed={
'a': a_v,
'b': b_v,
'found_inf': found_inf_v,
'prev_loss_scaling': prev_loss_scaling_v,
'num_good_steps': num_good_steps_v,
'num_bad_steps': num_bad_steps_v
},
fetch_list=[
result, x, found_inf, prev_loss_scaling,
num_good_steps, num_bad_steps
])
assert np.array_equal(result_v[0], a_v)
assert np.array_equal(result_v[1], b_v)
assert np.array_equal(result_v[0], result_v[2])
assert np.array_equal(result_v[1], result_v[3])
assert np.array_equal(result_v[4], found_inf_v)
assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio)
assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()):
a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
b = fluid.data(name="b", shape=[512, 128], dtype='float32')
x = [a, b]
found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
prev_loss_scaling = fluid.data(
name="prev_loss_scaling", shape=[1], dtype='float32')
num_good_steps = fluid.data(
name="num_good_steps", shape=[1], dtype='int32')
num_bad_steps = fluid.data(
name="num_bad_steps", shape=[1], dtype='int32')
a_v = np.random.random([1024, 1024]).astype('float32')
b_v = np.random.random([512, 128]).astype('float32')
i = np.random.randint(0, 1024, 1)
j = np.random.randint(0, 1024, 1)
a_v[i[0]][j[0]] = np.inf
found_inf_v = np.array([True]).astype('bool')
prev_loss_scaling_v = np.array([2048]).astype('float32')
num_good_steps_v = np.array([999], dtype=np.int32)
num_bad_steps_v = np.array([1], dtype=np.int32)
incr_every_n_steps = 1000
decr_every_n_nan_or_inf = 2
incr_ratio = 2
decr_ratio = 0.8
result = amp_nn.update_loss_scaling(
x,
found_inf,
prev_loss_scaling,
num_good_steps,
num_bad_steps,
incr_every_n_steps,
decr_every_n_nan_or_inf,
incr_ratio,
decr_ratio,
name="update_loss_scaling")
place = fluid.XPUPlace(0)
exe = fluid.Executor(place)
with fluid.scope_guard(scope):
exe.run(fluid.default_startup_program())
result_v = exe.run(feed={
'a': a_v,
'b': b_v,
'found_inf': found_inf_v,
'prev_loss_scaling': prev_loss_scaling_v,
'num_good_steps': num_good_steps_v,
'num_bad_steps': num_bad_steps_v
},
fetch_list=[
result, x, found_inf, prev_loss_scaling,
num_good_steps, num_bad_steps
])
assert np.array_equal(result_v[0], np.zeros_like(a_v))
assert np.array_equal(result_v[1], np.zeros_like(b_v))
assert np.array_equal(result_v[2], np.zeros_like(a_v))
assert np.array_equal(result_v[3], np.zeros_like(b_v))
assert np.array_equal(result_v[4], found_inf_v)
assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio)
assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
def test_loss_scaling(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.loss_scaling_check()
def test_loss_scaling_inf(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.loss_scaling_check_inf()
if __name__ == '__main__':
unittest.main()