Unverified commit 4d4fb660, authored by taixiurong, committed via GitHub

xpu support amp (#33809)

Parent: 0d3de8d0
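This commit bumps the Kunlun SDK (XRE/XDNN/XCCL) packages and wires XPU places into the imperative AMP machinery: the auto-cast place checks, the fp16 op lists, and the XPU matmul/cast/softmax kernels shown in the hunks below. As a rough orientation, a minimal sketch of the intended user-facing flow — assuming a WITH_XPU build and the public paddle.amp wrappers (paddle.set_device, paddle.amp.auto_cast) that sit on top of the amp_guard changes further down; none of this script is part of the diff itself:

import paddle

# Hypothetical end-to-end sketch: run a forward pass under auto-casting
# on a Kunlun device. Requires a Paddle build with WITH_XPU=ON.
paddle.set_device('xpu')

model = paddle.nn.Linear(16, 16)
x = paddle.rand([4, 16])

with paddle.amp.auto_cast():   # after this commit, auto-casting also takes effect on XPUPlace
    out = model(x)
    loss = out.mean()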
@@ -27,19 +27,17 @@ ELSEIF(WITH_CENTOS)
   SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64")
   SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64")
   SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
 ELSE ()
   SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
   SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
   SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
 ENDIF()

-IF(NOT XPU_BASE_URL)
-  SET(XPU_BASE_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20210527")
-ENDIF()
-SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
-SET(XPU_XCCL_URL "${XPU_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_BASE_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
+SET(XPU_XRE_URL "${XPU_BASE_URL}/20210625/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_XDNN_URL "${XPU_BASE_URL}/20210625/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_XCCL_URL "${XPU_BASE_URL}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE)
 SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
......
@@ -33,7 +33,8 @@ AmpOperators::AmpOperators()
   for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) {
     bool supported = false;
     for (auto& kernel_type : it->second) {
-      if (platform::is_gpu_place(kernel_type.first.place_) &&
+      if ((platform::is_gpu_place(kernel_type.first.place_) ||
+           platform::is_xpu_place(kernel_type.first.place_)) &&
           kernel_type.first.data_type_ == fp16_dtype) {
         supported = true;
       }
@@ -91,7 +92,8 @@ inline std::string GetDtypeStr(
 inline bool NeedCast(const std::shared_ptr<VarBase>& var) {
   if (platform::is_gpu_place(var->Place()) ||
-      platform::is_cuda_pinned_place(var->Place())) {
+      platform::is_cuda_pinned_place(var->Place()) ||
+      platform::is_xpu_place(var->Place())) {
     // CudaPinndePlace is added for varbase created by dataloader
     if (var->DataType() == framework::proto::VarType::FP32 ||
         var->DataType() == framework::proto::VarType::FP16) {
......
@@ -23,21 +23,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
-class XPUFPTypeTrait {
- public:
-  using Type = T;
-};
-
-template <>
-class XPUFPTypeTrait<platform::float16> {
- public:
-  using Type = float16;
-};
-
 template <typename DeviceContext, typename InT>
 class CastXPUKernel : public framework::OpKernel<InT> {
-  using XPUInTDType = typename XPUFPTypeTrait<InT>::Type;
+  using XPUInTDType = typename XPUTypeTrait<InT>::Type;

  public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -49,7 +37,6 @@ class CastXPUKernel : public framework::OpKernel<InT> {
         context.Attr<int>("out_dtype"));
     auto* in_data = in->data<InT>();
-    // using XPUOutTDType = typename XPUFPTypeTrait<InT>::Type;
     auto numel = in->numel();
     auto& dev_ctx = context.template device_context<DeviceContext>();
     int r = -1;
......
@@ -102,6 +102,7 @@ template <typename T, typename FCT>
 static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
                               bool trans_x, bool trans_y,
                               const paddle::framework::ExecutionContext &ctx) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   const auto &x_dims = x->dims();
   const auto &y_dims = y->dims();
   auto &dev_ctx =
@@ -162,17 +163,19 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
   int ldout = n;
   if (batch_size <= 1) {
     int r = 0;
-    r = xpu::fc_fusion<T, T, T, FCT>(
-        dev_ctx.x_context(), x->data<T>(), y->data<T>(), data_c, m, n, k,
-        mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy,
-        ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR);
+    r = xpu::fc_fusion<XPUType, XPUType, XPUType, FCT>(
+        dev_ctx.x_context(), reinterpret_cast<const XPUType *>(x->data<T>()),
+        reinterpret_cast<const XPUType *>(y->data<T>()),
+        reinterpret_cast<XPUType *>(data_c), m, n, k, mat_dim_a.trans_,
+        mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, ldout, alpha, 0,
+        nullptr, xpu::Activation_t::LINEAR);
     PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                       platform::errors::External(
                           "XPU fc_fusion kernel return wrong value[%d %s]", r,
                           XPUAPIErrorMsg[r]));
   } else {
     // batch matmul
-    int r = xpu::fc_batched<T, T, T, FCT>(
+    int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
         dev_ctx.x_context(),                          // Context* ctx,
         batch_size,                                   // int batch_size,
         mat_dim_a.trans_,                             // bool x_trans,
@@ -181,12 +184,12 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
         n,                                            // int n,
         k,                                            // int k,
         alpha,                                        // float alpha,
-        reinterpret_cast<const T *>(x->data<T>()),    // const TX* x,
+        reinterpret_cast<const XPUType *>(x->data<T>()),  // const TX* x,
         mat_dim_a.stride_,                            // int stride_a,
-        reinterpret_cast<const T *>(y->data<T>()),    // const TW* w,
+        reinterpret_cast<const XPUType *>(y->data<T>()),  // const TW* w,
         mat_dim_b.stride_,                            // int stride_b,
         0.0,                                          // float beta,
-        reinterpret_cast<T *>(data_c),                // TY* y,
+        reinterpret_cast<XPUType *>(data_c),          // TY* y,
         m * n,                                        // int stride_c,
         nullptr,                                      // const float* x_maxptr,
         nullptr);                                     // const float* w_maxptr
@@ -210,12 +213,16 @@ class MatMulXPUKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(context.GetPlace());
     bool trans_x = context.Attr<bool>("transpose_X");
     bool trans_y = context.Attr<bool>("transpose_Y");
+    if (std::is_same<paddle::platform::float16, T>::value) {
+      MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context);
+    } else {
       if (std::getenv("XPU_PADDLE_MAT_MUL_FCINT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, context);
       } else {
         MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context);
       }
     }
+  }
 };

 // Reshape a rank-3 tensor from P x M x N to M x (P * N).
@@ -224,6 +231,7 @@ class MatMulXPUKernel : public framework::OpKernel<T> {
 template <typename DeviceContext, typename T>
 static framework::Tensor XPUFoldHeadAndLastDims(
     const DeviceContext &context, const framework::Tensor &input) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   auto in_dims = input.dims();
   if (in_dims.size() != 3) {
     return input;
@@ -236,8 +244,9 @@ static framework::Tensor XPUFoldHeadAndLastDims(
                                  static_cast<int>(in_dims[1]),
                                  static_cast<int>(in_dims[2])};
   std::vector<int> axis_host = {1, 0, 2};
-  int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
-                         in_shape_host, axis_host);
+  int r = xpu::transpose(
+      context.x_context(), reinterpret_cast<const XPUType *>(input.data<T>()),
+      reinterpret_cast<XPUType *>(output.data<T>()), in_shape_host, axis_host);
   PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                     platform::errors::External(
                         "XPU transpose kernel return wrong value[%d %s]", r,
@@ -280,12 +289,16 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> {
                  const framework::Tensor &b, bool trans_b,
                  framework::Tensor *out) const {
     out->mutable_data<T>(context.GetPlace());
+    if (std::is_same<paddle::platform::float16, T>::value) {
+      MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context);
+    } else {
       if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_FCINT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, context);
       } else {
         MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context);
       }
     }
+  }

   void CalcInputGrad(const framework::ExecutionContext &context,
                      const framework::Tensor &a, bool trans_a,
@@ -370,10 +383,14 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
+namespace plat = paddle::platform;

 REGISTER_OP_XPU_KERNEL(
-    matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    matmul, ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::MatMulXPUKernel<paddle::platform::XPUDeviceContext, plat::float16>);
 REGISTER_OP_XPU_KERNEL(
     matmul_grad,
-    ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+    ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::MatMulGradXPUKernel<paddle::platform::XPUDeviceContext,
+                             plat::float16>);
 #endif
@@ -25,6 +25,7 @@ template <typename T, typename FCT>
 static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
                               bool trans_x, bool trans_y,
                               const paddle::framework::ExecutionContext& ctx) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   const auto& x_dims = x->dims();
   const auto& y_dims = y->dims();
   auto& dev_ctx =
@@ -75,8 +76,10 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
   int batch_size = mat_dim_a.batch_size_;
   if (batch_size <= 1) {
     int r = 0;
-    r = xpu::fc<T, T, T, FCT>(dev_ctx.x_context(), x->data<T>(), y->data<T>(),
-                              data_c, m, n, k, mat_dim_a.trans_,
+    r = xpu::fc<XPUType, XPUType, XPUType, FCT>(
+        dev_ctx.x_context(), reinterpret_cast<const XPUType*>(x->data<T>()),
+        reinterpret_cast<const XPUType*>(y->data<T>()),
+        reinterpret_cast<XPUType*>(data_c), m, n, k, mat_dim_a.trans_,
         mat_dim_b.trans_, nullptr, nullptr, nullptr);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
@@ -87,7 +90,7 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
             r, XPUAPIErrorMsg[r], m, n, k, mat_dim_a.trans_, mat_dim_b.trans_));
   } else {
     // batch matmul
-    int r = xpu::fc_batched<T, T, T, FCT>(
+    int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
         dev_ctx.x_context(),                          // Context* ctx,
         batch_size,                                   // int batch_size,
         mat_dim_a.trans_,                             // bool x_trans,
@@ -96,12 +99,12 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
         n,                                            // int n,
         k,                                            // int k,
         1.0,                                          // float alpha,
-        reinterpret_cast<const T*>(x->data<T>()),     // const TX* x,
+        reinterpret_cast<const XPUType*>(x->data<T>()),  // const TX* x,
         mat_dim_a.stride_,                            // int stride_a,
-        reinterpret_cast<const T*>(y->data<T>()),     // const TW* w,
+        reinterpret_cast<const XPUType*>(y->data<T>()),  // const TW* w,
         mat_dim_b.stride_,                            // int stride_b,
         0.0,                                          // float beta,
-        reinterpret_cast<T*>(data_c),                 // TY* y,
+        reinterpret_cast<XPUType*>(data_c),           // TY* y,
         m * n,                                        // int stride_c,
         nullptr,                                      // const float* x_maxptr,
         nullptr);                                     // const float* w_maxptr
@@ -123,17 +126,22 @@ class MatMulV2XPUKernel : public framework::OpKernel<T> {
     bool trans_x = ctx.Attr<bool>("trans_x");
     bool trans_y = ctx.Attr<bool>("trans_y");
     out->mutable_data<T>(ctx.GetPlace());
+    if (std::is_same<paddle::platform::float16, T>::value) {
+      MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
+    } else {
       if (std::getenv("XPU_PADDLE_MAT_MUL_V2_FCINT32") != nullptr) {
        MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, ctx);
       } else {
         MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
       }
     }
+  }
 };

 template <typename DeviceContext, typename T>
 static framework::Tensor XPUFoldHeadAndLastDims(
     const DeviceContext& context, const framework::Tensor& input) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   auto in_dims = input.dims();
   if (in_dims.size() != 3) {
     return input;
@@ -147,8 +155,9 @@ static framework::Tensor XPUFoldHeadAndLastDims(
                                  static_cast<int>(in_dims[2])};
   std::vector<int> axis_host = {1, 0, 2};
-  int r = xpu::transpose(context.x_context(), input.data<T>(), output.data<T>(),
-                         in_shape_host, axis_host);
+  int r = xpu::transpose(
+      context.x_context(), reinterpret_cast<const XPUType*>(input.data<T>()),
+      reinterpret_cast<XPUType*>(output.data<T>()), in_shape_host, axis_host);
   PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                     platform::errors::External(
                         "XPU transpose kernel return wrong value[%d %s]", r,
@@ -166,12 +175,16 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> {
                  const framework::Tensor& b, bool trans_b,
                  framework::Tensor* out) const {
     out->mutable_data<T>(ctx.GetPlace());
+    if (std::is_same<paddle::platform::float16, T>::value) {
+      MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
+    } else {
       if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, ctx);
       } else {
         MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
       }
     }
+  }

   void CalcInputGrad(const framework::ExecutionContext& context,
                      const framework::Tensor& a, bool trans_a,
@@ -261,8 +274,10 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
+namespace plat = paddle::platform;

-REGISTER_OP_XPU_KERNEL(matmul_v2, ops::MatMulV2XPUKernel<float>);
-REGISTER_OP_XPU_KERNEL(matmul_v2_grad, ops::MatMulV2XPUGradKernel<float>);
+REGISTER_OP_XPU_KERNEL(matmul_v2, ops::MatMulV2XPUKernel<float>,
+                       ops::MatMulV2XPUKernel<plat::float16>);
+REGISTER_OP_XPU_KERNEL(matmul_v2_grad, ops::MatMulV2XPUGradKernel<float>,
+                       ops::MatMulV2XPUGradKernel<plat::float16>);
 #endif
@@ -47,8 +47,8 @@ class SoftmaxXPUKernel : public framework::OpKernel<T> {
     int len = x->numel();
     T* clip_x_data =
         clip_x.mutable_data<T>(context.GetPlace(), len * sizeof(T));
-    r = xpu::clip(dev_ctx.x_context(), x->data<float>(), clip_x_data, len,
-                  -1e30, 1e30);
+    r = xpu::clip_v2(dev_ctx.x_context(), x->data<float>(), clip_x_data, len,
+                     static_cast<float>(-1e20), static_cast<float>(1e20));
     PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                       platform::errors::External("XPU API(clip) return wrong "
                                                  "value[%d %s]",
......
@@ -54,8 +54,9 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel<T> {
     int len = logits->numel();
     T* clip_logits_data =
         clip_logits.mutable_data<T>(context.GetPlace(), len * sizeof(T));
-    r = xpu::clip(dev_ctx.x_context(), logits->data<float>(), clip_logits_data,
-                  len, -1e30, 1e30);
+    r = xpu::clip_v2(dev_ctx.x_context(), logits->data<float>(),
+                     clip_logits_data, len, static_cast<float>(-1e20),
+                     static_cast<float>(1e20));
     PADDLE_ENFORCE_EQ(
         r, xpu::Error_t::SUCCESS,
         platform::errors::External("XPU kernel error. clip "
......
-// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <unordered_map>

 #include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/float16.h"
 #include "xpu/api.h"
 #include "xpu/refactor/fusion.h"
 #include "xpu/refactor/math.h"
@@ -58,4 +59,16 @@ static std::map<int, std::string> XPUAPIErrorMsg = {
     {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"},
     {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}};

+template <typename T>
+class XPUTypeTrait {
+ public:
+  using Type = T;
+};
+
+template <>
+class XPUTypeTrait<paddle::platform::float16> {
+ public:
+  using Type = float16;
+};
+
 #endif
@@ -225,7 +225,9 @@ OpSupportedInfos(const std::string &place,
                  [](unsigned char c) { return std::toupper(c); });
   using fn_type = std::add_pointer<bool(const platform::Place &)>::type;
   std::unordered_map<std::string, fn_type> is_target_place{
-      {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place},
+      {"GPU", &platform::is_gpu_place},
+      {"CPU", &platform::is_cpu_place},
+      {"XPU", &platform::is_xpu_place},
   };
   PADDLE_ENFORCE_NE(
       is_target_place.count(query_place), 0,
......
@@ -14,6 +14,7 @@

 import copy
 from ... import core
+import paddle.fluid as fluid

 __all__ = ["CustomOpLists", "AutoMixedPrecisionLists"]
@@ -152,8 +153,14 @@ gray_list = {

 # The set of ops that don't support fp16 calculation
 # lookup_table fp16 is slower than fp32, though fp16 is supported.
-_, _, _sys_unsupported_fp16_list = core.op_supported_infos(
-    'GPU', core.VarDesc.VarType.FP16)
+_sys_unsupported_fp16_list = []
+if fluid.is_compiled_with_xpu():
+    _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
+        'XPU', core.VarDesc.VarType.FP16)
+else:
+    _, _, _sys_unsupported_fp16_list = core.op_supported_infos(
+        'GPU', core.VarDesc.VarType.FP16)
 unsupported_fp16_list = {'lookup_table',
                          'lookup_table_v2'} | _sys_unsupported_fp16_list
......
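With the branch above, the system-level unsupported-fp16 list comes from the XPU op registry whenever Paddle is compiled for Kunlun, so AMP keeps those ops in FP32. A hedged sketch of how one might inspect that list, reusing the core.op_supported_infos call from the diff (the surrounding script itself is illustrative, not part of the change):

from paddle.fluid import core

# Ask the framework which ops lack an FP16 kernel on XPU; the AMP pass
# will leave these ops in FP32 on Kunlun devices.
_, _, xpu_unsupported_fp16 = core.op_supported_infos('XPU',
                                                     core.VarDesc.VarType.FP16)
print(sorted(xpu_unsupported_fp16))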
@@ -130,9 +130,10 @@ def amp_guard(enable=True, custom_white_list=None, custom_black_list=None):
         raise ValueError(
             "current_tracer is None, maybe it is not in imperative mode.")

-    if enable and not tracer._expected_place.is_gpu_place():
+    if enable and not (tracer._expected_place.is_gpu_place() or
+                       tracer._expected_place.is_xpu_place()):
         warnings.warn(
-            'amp_guard can only be enabled on CUDAPlace, current place is %s, so it makes no effect.'
+            'amp_guard can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.'
             % tracer._expected_place)
         enable = False
......
@@ -90,9 +90,10 @@ class AmpScaler(object):
             raise ValueError(
                 "current_tracer is None, maybe it is not in imperative mode.")

-        if enable and not tracer._expected_place.is_gpu_place():
+        if enable and not (tracer._expected_place.is_gpu_place() or
+                           tracer._expected_place.is_xpu_place()):
             warnings.warn(
-                'AmpScaler can only be enabled on CUDAPlace, current place is %s, so it makes no effect.'
+                'AmpScaler can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.'
                 % tracer._expected_place)
             enable = False
......
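Taken together, the two relaxed place checks let the fluid dygraph AMP entry points run on XPUPlace instead of silently disabling themselves. A minimal training-step sketch using the objects touched above (amp_guard and AmpScaler); the model, optimizer, and shapes are placeholders, and it assumes a WITH_XPU build exposing fluid.XPUPlace:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.amp import amp_guard, AmpScaler

with fluid.dygraph.guard(fluid.XPUPlace(0)):
    model = fluid.dygraph.Linear(16, 16)
    sgd = fluid.optimizer.SGD(learning_rate=0.01,
                              parameter_list=model.parameters())
    scaler = AmpScaler(init_loss_scaling=1024)

    data = fluid.dygraph.to_variable(np.random.rand(4, 16).astype('float32'))
    with amp_guard():                     # no longer downgraded to a no-op on XPUPlace
        out = model(data)
        loss = fluid.layers.reduce_mean(out)
    scaled = scaler.scale(loss)           # scale the loss to protect fp16 gradients
    scaled.backward()
    scaler.minimize(sgd, scaled)          # unscale gradients and apply the optimizer step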