未验证 提交 60bec700 编写于 作者: z8hanghuan's avatar z8hanghuan 提交者: GitHub

support bmm&bmm_grad for KL2, *test=kunlun (#41935)

上级 4f461ab9
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_XPU
#include <string>
#include <vector>
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/fluid/operators/xpu_api_wrapper.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
namespace paddle {
namespace operators {
template <typename T, typename FCT>
static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
bool trans_x, bool trans_y,
const paddle::framework::ExecutionContext& ctx) {
using XPUType = typename XPUTypeTrait<T>::Type;
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
auto& dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>();
auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
RowMatrixFromVector(x_dims), 0, trans_x);
auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(
ColumnMatrixFromVector(y_dims), 0, trans_y);
T* data_c = out->data<T>();
int m = mat_dim_a.height_;
int n = mat_dim_b.width_;
int k = mat_dim_a.width_;
int batch_size = mat_dim_a.batch_size_;
// batch matmul
int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
dev_ctx.x_context(), // Context* ctx,
batch_size, // int batch_size,
mat_dim_a.trans_, // bool x_trans,
mat_dim_b.trans_, // bool w_trans,
m, // int m,
n, // int n,
k, // int k,
1.0, // float alpha,
reinterpret_cast<const XPUType*>(x->data<T>()), // const TX* x,
mat_dim_a.stride_, // int stride_a,
reinterpret_cast<const XPUType*>(y->data<T>()), // const TW* w,
mat_dim_b.stride_, // int stride_b,
0.0, // float beta,
reinterpret_cast<XPUType*>(data_c), // TY* y,
m * n, // int stride_c,
nullptr, // const float* x_maxptr,
nullptr); // const float* w_maxptr
PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_batched");
}
template <typename T>
class BmmXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
if (x->numel() == 0 || y->numel() == 0) {
return;
}
bool trans_x = false;
bool trans_y = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
PADDLE_ENFORCE_EQ(x_dims.size(), 3,
platform::errors::InvalidArgument(
"Input(X) of BmmOp must be 3-dimensional in BmmOp, "
"but received X's shape: [%s].",
x_dims));
PADDLE_ENFORCE_EQ(y_dims.size(), 3,
platform::errors::InvalidArgument(
"Input(Y) of BmmOp must be 3-dimensional in BmmOp, "
"but received Y's shape: [%s].",
y_dims));
PADDLE_ENFORCE_EQ(
x_dims[0], y_dims[0],
platform::errors::InvalidArgument(
"Input(X) and Input(Y) must have the same batch size in BmmOp, "
"but received X's batch size: [%s],"
"Y's batch size [%s]",
x_dims[0], y_dims[0]));
PADDLE_ENFORCE_EQ(
x_dims[2], y_dims[1],
platform::errors::InvalidArgument(
"Input(X)'s width must be equal with Input(Y)'s height in BmmOp,"
"but receive X's width: [%s],"
"Y's height: [%s].",
x_dims[2], y_dims[1]));
if (std::is_same<paddle::platform::float16, T>::value) {
MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
} else {
if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, ctx);
} else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
MatMulXPUFunction<T, float>(x, y, out, trans_x, trans_y, ctx);
} else {
MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, ctx);
}
}
}
};
template <typename T>
class BmmXPUGradKernel : public framework::OpKernel<T> {
public:
void MatMul(const framework::ExecutionContext& ctx,
const framework::Tensor& a, bool trans_a,
const framework::Tensor& b, bool trans_b,
framework::Tensor* out) const {
out->mutable_data<T>(ctx.GetPlace());
if (std::is_same<paddle::platform::float16, T>::value) {
MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
} else {
if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) {
MatMulXPUFunction<T, int32_t>(&a, &b, out, trans_a, trans_b, ctx);
} else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) {
MatMulXPUFunction<T, float>(&a, &b, out, trans_a, trans_b, ctx);
} else {
MatMulXPUFunction<T, int16_t>(&a, &b, out, trans_a, trans_b, ctx);
}
}
}
void CalcInputGrad(const framework::ExecutionContext& context,
const framework::Tensor& a, bool trans_a,
const framework::Tensor& b, bool trans_b,
framework::Tensor* out) const {
if (out == nullptr) return;
MatMul(context, a, trans_a, b, trans_b, out);
}
void Compute(const framework::ExecutionContext& context) const override {
auto x = *context.Input<framework::Tensor>("X");
auto y = *context.Input<framework::Tensor>("Y");
auto dout =
*context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto* dy = context.Output<framework::Tensor>(framework::GradVarName("Y"));
ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, false, false);
framework::DDim dx_dims;
if (dx) {
dx_dims = dx->dims();
if (dx_dims != x.dims()) {
dx->Resize(x.dims());
}
}
framework::DDim dy_dims;
if (dy) {
dy_dims = dy->dims();
if (dy_dims != y.dims()) {
dy->Resize(y.dims());
}
}
CalcInputGrad(context, dout, false, y, true, dx);
CalcInputGrad(context, x, true, dout, false, dy);
// CalcInputGrad(context, dout, false, false, y, true, false, dx);
// CalcInputGrad(context, x, true, true, dout, false, true, dy);
if (dx) {
if (dx_dims != x.dims()) {
dx->Resize(dx_dims);
}
}
if (dy) {
if (dy_dims != y.dims()) {
dy->Resize(dy_dims);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(bmm, ops::BmmXPUKernel<float>,
ops::BmmXPUKernel<plat::float16>);
REGISTER_OP_XPU_KERNEL(bmm_grad, ops::BmmXPUGradKernel<float>,
ops::BmmXPUGradKernel<plat::float16>);
#endif
......@@ -43,6 +43,8 @@ XPUOpMap& get_kl2_ops() {
{"batch_norm_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"bmm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"bmm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"bce_loss_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"bce_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at #
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
sys.path.append("..")
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
import paddle.tensor as tensor
import unittest
import numpy as np
from op_test import OpTest
from op_test_xpu import XPUOpTest
from paddle.fluid.framework import Program, program_guard
from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
paddle.enable_static()
class XPUTestBmmOp(XPUOpTestWrapper):
"""
func desc:: https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/bmm_cn.html#bmm
"""
def __init__(self):
self.op_name = 'bmm'
self.use_dynamic_create_class = False
class TestBmmOp(XPUOpTest):
def setUp(self):
self.init_dtype()
self.set_xpu()
self.op_type = "bmm"
self.place = paddle.XPUPlace(0)
self.set_shape()
X = np.random.random(self.Xshape).astype(self.dtype)
Y = np.random.random(self.Yshape).astype(self.dtype)
self.inputs = {'X': X, 'Y': Y}
Out = np.matmul(X, Y)
self.outputs = {'Out': Out}
def init_dtype(self):
self.dtype = self.in_type
def set_shape(self):
self.Xshape = (10, 3, 4)
self.Yshape = (10, 4, 5)
def set_xpu(self):
self.__class__.use_xpu = True
self.__class__.no_need_check_grad = False
self.__class__.op_type = self.in_type
def test_check_output(self):
self.check_output_with_place(self.place)
def test_check_grad_normal(self):
self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
class TestBmmOp1(TestBmmOp):
def set_shape(self):
self.Xshape = (3, 3, 3)
self.Yshape = (3, 3, 3)
class TestBmmOp2(TestBmmOp):
def set_shape(self):
self.Xshape = (128, 3, 16)
self.Yshape = (128, 16, 3)
class TestBmmOp3(TestBmmOp):
def set_shape(self):
self.Xshape = (2048, 16, 27)
self.Yshape = (2048, 27, 16)
class TestBmmOp4(TestBmmOp):
def set_shape(self):
self.Xshape = (2, 27, 27)
self.Yshape = (2, 27, 27)
class TestBmmOp5(TestBmmOp):
def set_shape(self):
self.Xshape = (2, 1, 1)
self.Yshape = (2, 1, 1)
support_types = get_xpu_op_support_types('bmm')
for stype in support_types:
create_test_class(globals(), XPUTestBmmOp, stype)
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册