Unverified · Commit e512aa9a, authored by QingshuChen, committed by GitHub

support different precision in kunlun (#36836)

* support different precision in kunlun

* minor

* minor

* minor
Parent 5c4c55f9
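The diff below replaces the per-operator switches (XPU_PADDLE_MAT_MUL_FCINT32, XPU_PADDLE_MAT_MUL_GRAD_FCINT32, XPU_PADDLE_MAT_MUL_V2_FCINT32, XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32) with two shared environment variables, XPU_PADDLE_FC_INT32 and XPU_PADDLE_FC_LOCAL_INT16, that select the FC compute precision for FP32 matmul kernels on Kunlun. A minimal stand-alone sketch of that selection order (the SelectFCComputeType helper is hypothetical and only illustrates the dispatch seen in the kernels below):

#include <cstdio>
#include <cstdlib>

// Hypothetical helper mirroring the dispatch added in the matmul kernels:
// FP16 inputs always use int16 accumulation; FP32 inputs default to int16
// but can be switched via environment variables.
const char* SelectFCComputeType(bool is_fp16) {
  if (is_fp16) return "int16_t";
  if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) return "int32_t";
  if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) return "float";
  return "int16_t";
}

int main() {
  std::printf("fp32 matmul uses FCT = %s\n", SelectFCComputeType(false));
  std::printf("fp16 matmul uses FCT = %s\n", SelectFCComputeType(true));
  return 0;
}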
@@ -35,7 +35,8 @@ ELSE ()
 ENDIF()
 SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211020")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211029")
+#SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211020")
 SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
......
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/xpu_api_wrapper.h"
 namespace paddle {
 namespace operators {
@@ -151,28 +152,26 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
           x_dims.to_str().c_str(), y_dims.to_str().c_str()));
   float alpha = static_cast<T>(ctx.Attr<float>("alpha"));
   T *data_c = out->data<T>();
   int m = mat_dim_a.height_;
   int n = mat_dim_b.width_;
   int k = mat_dim_a.width_;
   int batch_size = mat_dim_a.batch_size_;
   int ldx = mat_dim_a.trans_ ? m : k;
   int ldy = mat_dim_b.trans_ ? k : n;
   int ldout = n;
   if (batch_size <= 1) {
     int r = 0;
-    r = xpu::fc_fusion<XPUType, XPUType, XPUType, FCT>(
+    r = xpu_fc_wrapper<XPUType, FCT>(
         dev_ctx.x_context(), reinterpret_cast<const XPUType *>(x->data<T>()),
         reinterpret_cast<const XPUType *>(y->data<T>()),
         reinterpret_cast<XPUType *>(data_c), m, n, k, mat_dim_a.trans_,
         mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, ldout, alpha, 0,
         nullptr, xpu::Activation_t::LINEAR);
-    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
-                      platform::errors::External(
-                          "XPU fc_fusion kernel return wrong value[%d %s]", r,
-                          XPUAPIErrorMsg[r]));
+    PADDLE_ENFORCE_EQ(
+        r, XPU_SUCCESS,
+        platform::errors::External("XPU fc kernel return wrong value[%d %s]", r,
+                                   XPUAPIErrorMsg[r]));
   } else {
     // batch matmul
     int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
@@ -216,8 +215,10 @@ class MatMulXPUKernel : public framework::OpKernel<T> {
     if (std::is_same<paddle::platform::float16, T>::value) {
       MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context);
     } else {
-      if (std::getenv("XPU_PADDLE_MAT_MUL_FCINT32") != nullptr) {
+      if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, context);
+      } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
+        MatMulXPUFunction<T, float>(x, y, out, trans_x, trans_y, context);
       } else {
         MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, context);
       }
@@ -292,8 +293,10 @@ class MatMulGradXPUKernel : public framework::OpKernel<T> {
     if (std::is_same<paddle::platform::float16, T>::value) {
       MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context);
     } else {
-      if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_FCINT32") != nullptr) {
+      if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, context);
+      } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
+        MatMulXPUFunction<T, float>(&a, &b, out, trans_a, trans_b, context);
       } else {
         MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, context);
       }
......
@@ -18,6 +18,8 @@
 #include <string>
 #include <vector>
+#include "paddle/fluid/operators/xpu_api_wrapper.h"
 namespace paddle {
 namespace operators {
@@ -74,17 +76,21 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
   int n = mat_dim_b.width_;
   int k = mat_dim_a.width_;
   int batch_size = mat_dim_a.batch_size_;
+  int ldx = mat_dim_a.trans_ ? m : k;
+  int ldy = mat_dim_b.trans_ ? k : n;
+  int ldout = n;
   if (batch_size <= 1) {
     int r = 0;
-    r = xpu::fc<XPUType, XPUType, XPUType, FCT>(
+    r = xpu_fc_wrapper<XPUType, FCT>(
         dev_ctx.x_context(), reinterpret_cast<const XPUType*>(x->data<T>()),
         reinterpret_cast<const XPUType*>(y->data<T>()),
         reinterpret_cast<XPUType*>(data_c), m, n, k, mat_dim_a.trans_,
-        mat_dim_b.trans_, nullptr, nullptr, nullptr);
+        mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, ldout, 1.0, 0,
+        nullptr, xpu::Activation_t::LINEAR);
     PADDLE_ENFORCE_EQ(
         r, XPU_SUCCESS,
         platform::errors::External(
-            "XPU fc_fusion kernel return wrong value[%d %s] , m = %d, n = "
+            "XPU fc kernel return wrong value[%d %s] , m = %d, n = "
             "%d, "
             "k = %d, a_tr = %d, b_tr = %d",
             r, XPUAPIErrorMsg[r], m, n, k, mat_dim_a.trans_, mat_dim_b.trans_));
@@ -129,8 +135,10 @@ class MatMulV2XPUKernel : public framework::OpKernel<T> {
     if (std::is_same<paddle::platform::float16, T>::value) {
       MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
     } else {
-      if (std::getenv("XPU_PADDLE_MAT_MUL_V2_FCINT32") != nullptr) {
+      if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, ctx);
+      } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
+        MatMulXPUFunction<T, float>(x, y, out, trans_x, trans_y, ctx);
       } else {
         MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
       }
@@ -178,8 +186,10 @@ class MatMulV2XPUGradKernel : public framework::OpKernel<T> {
     if (std::is_same<paddle::platform::float16, T>::value) {
       MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
     } else {
-      if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32") != nullptr) {
+      if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
         MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, ctx);
+      } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
+        MatMulXPUFunction<T, float>(&a, &b, out, trans_a, trans_b, ctx);
       } else {
         MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
       }
......
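The ldx/ldy/ldout values introduced above are the row-major leading dimensions forwarded to xpu_fc_wrapper; a transposed operand is stored with its two dimensions swapped, so its leading dimension flips from its width to its height. A small stand-alone illustration (LeadingDim is a hypothetical helper used only in this sketch):

#include <cstdio>

// For a row-major operand, the leading dimension is the stored row width.
// If x (m x k) is stored transposed (as k x m), its leading dimension is m,
// otherwise k; similarly for y (k x n) and the n-wide output.
int LeadingDim(bool transposed, int rows, int cols) {
  return transposed ? rows : cols;
}

int main() {
  const int m = 4, n = 8, k = 16;
  std::printf("ldx, x not transposed: %d\n", LeadingDim(false, m, k));  // 16 (= k)
  std::printf("ldx, x transposed:     %d\n", LeadingDim(true, m, k));   // 4  (= m)
  std::printf("ldy, y not transposed: %d\n", LeadingDim(false, k, n));  // 8  (= n)
  std::printf("ldy, y transposed:     %d\n", LeadingDim(true, k, n));   // 16 (= k)
  std::printf("ldout:                 %d\n", n);                        // 8
  return 0;
}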
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#ifdef PADDLE_WITH_XPU
#include <vector>

namespace paddle {
namespace operators {

template <typename XPUType, typename FCT>
int xpu_fc_wrapper(xpu::Context* ctx, const XPUType* x, const XPUType* w,
                   XPUType* y, int m, int n, int k, bool x_trans, bool w_trans,
                   const float* x_maxptr, const float* w_maxptr,
                   float* y_maxptr, int ldx, int ldw, int ldy, float alpha,
                   float beta, const float* bias,
                   const xpu::Activation_t& act) {
  int r = 0;
  if (x_trans && std::getenv("XPU_PADDLE_FC_TRANS_A") != nullptr &&
      std::is_same<float, XPUType>::value) {
    XPUType* l3_addr = nullptr;
    xpu::ctx_guard RAII_GUARD(ctx);
    l3_addr = RAII_GUARD.alloc_l3_or_gm<XPUType>(m * k);
    if (l3_addr == nullptr) return XPUERR_NOMEM;

    std::vector<int> shape = {k, m};
    std::vector<int> axis = {1, 0};
    r = xpu::transpose<XPUType>(ctx, x, l3_addr, shape, axis);
    if (r != XPU_SUCCESS) return r;

    r = xpu::fc_fusion<XPUType, XPUType, XPUType, FCT>(
        ctx, l3_addr, w, y, m, n, k, false, w_trans, x_maxptr, w_maxptr,
        y_maxptr, k, ldw, ldy, alpha, beta, bias, act);
    if (r != XPU_SUCCESS) return r;
  } else {
    r = xpu::fc_fusion<XPUType, XPUType, XPUType, FCT>(
        ctx, x, w, y, m, n, k, x_trans, w_trans, x_maxptr, w_maxptr, y_maxptr,
        ldx, ldw, ldy, alpha, beta, bias, act);
  }
  return r;
}

}  // namespace operators
}  // namespace paddle
#endif
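When XPU_PADDLE_FC_TRANS_A is set and the data type is float, the wrapper above avoids handing a transposed FP32 activation to fc_fusion: it first materializes the transpose of x into L3 (or global) memory and then calls fc_fusion with x_trans = false and a leading dimension of k. A minimal CPU-side sketch of that pre-transpose, using a plain std::vector in place of the XPU allocator (hypothetical helper, not part of the patch):

#include <cstdio>
#include <vector>

// x is stored as k x m (i.e. transposed); rewrite it as m x k row-major so a
// subsequent FC call can run with x_trans = false and ldx = k.
std::vector<float> TransposeKxMToMxK(const std::vector<float>& x, int m, int k) {
  std::vector<float> out(static_cast<size_t>(m) * k);
  for (int i = 0; i < k; ++i) {
    for (int j = 0; j < m; ++j) {
      out[static_cast<size_t>(j) * k + i] = x[static_cast<size_t>(i) * m + j];
    }
  }
  return out;
}

int main() {
  const int m = 2, k = 3;
  std::vector<float> x_kxm = {1, 2, 3, 4, 5, 6};        // 3 x 2, row-major
  std::vector<float> x_mxk = TransposeKxMToMxK(x_kxm, m, k);
  for (float v : x_mxk) std::printf("%.0f ", v);        // prints: 1 3 5 2 4 6
  std::printf("\n");
  return 0;
}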
@@ -222,9 +222,13 @@ XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
   context_ = xpu::create_context();
   const int MAX_XPU_NUM = 16;
-  const int l3_size = 13.5 * 1024 * 1024;
   static void* l3ptrs[MAX_XPU_NUM] = {nullptr};
+
+  int l3_size = 13.5 * 1024 * 1024;
+  if (std::getenv("XPU_PADDLE_L3_SIZE") != nullptr) {
+    l3_size = atoi(std::getenv("XPU_PADDLE_L3_SIZE"));
+  }
+
   auto selected_xpus = GetXPUSelectedDevices();
   for (unsigned int i = 0; i < selected_xpus.size(); i++) {
     if (place.device == selected_xpus[i]) {
......
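The hunk above makes the default L3 reservation (13.5 * 1024 * 1024 bytes) overridable at runtime via XPU_PADDLE_L3_SIZE; the override is read with atoi, so the value appears to be interpreted as a raw byte count. A stand-alone sketch of that parsing, under the byte-count assumption:

#include <cstdio>
#include <cstdlib>

// Parse XPU_PADDLE_L3_SIZE as a byte count, falling back to the default of
// 13.5 * 1024 * 1024 bytes used in the constructor above.
int GetL3Size() {
  int l3_size = 13.5 * 1024 * 1024;  // 14155776 bytes
  if (const char* env = std::getenv("XPU_PADDLE_L3_SIZE")) {
    l3_size = std::atoi(env);
  }
  return l3_size;
}

int main() {
  std::printf("L3 size: %d bytes\n", GetL3Size());
  return 0;
}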
@@ -90,6 +90,12 @@ XPUOpMap& get_kl2_ops() {
                     pOpKernelType(vartype::FP16, XPUPlace())})},
     {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"reduce_sum_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"softmax_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"softmax_with_cross_entropy",
      XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"softmax_with_cross_entropy_grad",
@@ -171,6 +177,39 @@ XPUOpMap& get_kl2_ops() {
                    pOpKernelType(vartype::INT32, XPUPlace()),
                    pOpKernelType(vartype::INT8, XPUPlace()),
                    pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"matmul_v2_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"assign_value",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"dropout_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"elementwise_div",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"elementwise_div_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                            pOpKernelType(vartype::INT64, XPUPlace())})},
+    {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"reshape2_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                            pOpKernelType(vartype::INT64, XPUPlace())})},
+    {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                 pOpKernelType(vartype::INT64, XPUPlace())})},
+    {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"layer_norm_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"lookup_table_v2",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"lookup_table_v2_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"flatten_contiguous_range",
      XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
......