Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
d752a7f2
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
d752a7f2
编写于
7月 07, 2022
作者:
T
taixiurong
提交者:
GitHub
7月 07, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
xpu-paddlepaddle-31 优化matmul test=kunlun (#43975)
上级
33540e10
变更
7
显示空白变更内容
内联
并排
Showing
7 changed files
with
735 additions
and
767 deletions
+735
-767
paddle/fluid/operators/matmul_op_xpu.cc
paddle/fluid/operators/matmul_op_xpu.cc
+68
-343
paddle/fluid/operators/matmul_v2_op_xpu.cc
paddle/fluid/operators/matmul_v2_op_xpu.cc
+59
-264
paddle/fluid/operators/mul_op_xpu.cc
paddle/fluid/operators/mul_op_xpu.cc
+57
-131
paddle/fluid/operators/xpu_api_wrapper.h
paddle/fluid/operators/xpu_api_wrapper.h
+535
-23
paddle/fluid/platform/device/xpu/xpu2_op_list.h
paddle/fluid/platform/device/xpu/xpu2_op_list.h
+11
-4
python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
...n/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
+3
-1
python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
...on/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
+2
-1
未找到文件。
paddle/fluid/operators/matmul_op_xpu.cc
浏览文件 @
d752a7f2
...
...
@@ -20,275 +20,39 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/xpu_api_wrapper.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace
paddle
{
namespace
operators
{
using
framework
::
Tensor
;
// Promote a vector shape to a row-matrix shape: [N] becomes [1, N].
// Shapes that are already rank >= 2 pass through unchanged.
static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
  const bool is_already_matrix = x_dim.size() > 1;
  if (is_already_matrix) {
    return x_dim;
  }
  return phi::make_ddim({1, x_dim[0]});
}
// Collapse the two leading dimensions of a rank-3 tensor into one,
// i.e. [B, M, N] -> [B * M, N]; tensors of any other rank are returned
// unchanged. Only the shape metadata changes — the returned tensor shares
// the input's storage.
static framework::Tensor FoldInitDims(const framework::Tensor &input) {
  framework::Tensor folded = input;
  const auto &shape = input.dims();
  if (shape.size() == 3) {
    folded.Resize({shape[0] * shape[1], shape[2]});
  }
  return folded;
}
/**
 * Get a column-matrix shape from a vector shape: [N] becomes [N, 1].
 * If the rank of y_dim > 1, the original y_dim is returned unchanged.
 */
static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
  return y_dim.size() > 1 ? y_dim : phi::make_ddim({y_dim[0], 1});
}
// Resize *x to the matrix shape described by `descriptor`.
// The descriptor's height/width are swapped when its transpose flag is set,
// and a leading batch dimension is added when batch_size_ is non-zero.
static void ReshapeTensorIntoMatrixSequence(
    framework::Tensor *x, const phi::funcs::MatDescriptor &descriptor) {
  int64_t rows = descriptor.height_;
  int64_t cols = descriptor.width_;
  if (descriptor.trans_) {
    std::swap(rows, cols);
  }
  if (descriptor.batch_size_) {
    x->Resize({descriptor.batch_size_, rows, cols});
  } else {
    x->Resize({rows, cols});
  }
}
/**
 * Reshape the x, y, out tensors into 3-D or 2-D matrix form ahead of
 * Out = matmul(x, y).
 *
 * X and Y are first normalized to matrix (or batched-matrix) descriptors,
 * and the output shape is derived from them:
 *
 *   X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2]  =>  out = [BatchSize, H1, W2]
 *
 * With no batch dimension on either input the result is [H1, W2]; if either
 * input carries a batch size, the output carries the larger of the two.
 */
static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
                                           framework::Tensor *y,
                                           framework::Tensor *out,
                                           bool trans_x,
                                           bool trans_y) {
  auto row_shape = RowMatrixFromVector(x->dims());
  auto col_shape = ColumnMatrixFromVector(y->dims());
  auto x_desc = phi::funcs::CreateMatrixDescriptor(row_shape, 0, trans_x);
  auto y_desc = phi::funcs::CreateMatrixDescriptor(col_shape, 0, trans_y);

  if (x_desc.batch_size_ == 0 && y_desc.batch_size_ == 0) {
    // Pure 2-D matmul: no batch dimension on the output.
    out->Resize({x_desc.height_, y_desc.width_});
  } else {
    out->Resize({std::max(x_desc.batch_size_, y_desc.batch_size_),
                 x_desc.height_,
                 y_desc.width_});
  }

  ReshapeTensorIntoMatrixSequence(x, x_desc);
  ReshapeTensorIntoMatrixSequence(y, y_desc);
}
// Computes out = alpha * matmul(x, y) on the XPU device.
//
// T   — the tensor element type (float / platform::float16 via XPUTypeTrait).
// FCT — the internal accumulation/compute type handed to the XPU FC kernels
//       (int16_t / int32_t / float), selected by the caller.
//
// The inputs are normalized to matrix descriptors; a rank-3 x against a
// rank<=2 y is flattened so a single GEMM can be used where possible.
// For batch_size <= 1 a single xpu_fc_wrapper call is issued, otherwise
// xpu::fc_batched. Both paths hard-fail via PADDLE_ENFORCE on shape
// mismatch or a non-zero XPU return code.
template <typename T, typename FCT>
static void MatMulXPUFunction(const Tensor* x,
                              const Tensor* y,
                              Tensor* out,
                              bool trans_x,
                              bool trans_y,
                              const paddle::framework::ExecutionContext& ctx) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  const auto& x_dims = x->dims();
  const auto& y_dims = y->dims();
  auto& dev_ctx =
      ctx.template device_context<paddle::platform::XPUDeviceContext>();

  auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
      RowMatrixFromVector(x_dims), 0, trans_x);
  auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(
      ColumnMatrixFromVector(y_dims), 0, trans_y);

  if (x_dims.size() == 3 && y_dims.size() <= 2) {
    // if transpose_X is true, the transpose cost much time
    if (!trans_x) {
      // Fold the batch into the row dimension so a single GEMM suffices.
      mat_dim_a.height_ *= mat_dim_a.batch_size_;
      mat_dim_a.batch_size_ = 0;
    } else {
      mat_dim_b.batch_size_ = mat_dim_a.batch_size_;
      mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_;
    }
  }

  // Harmonize a 0-vs-1 batch-size mismatch when the inner dimensions agree.
  if (mat_dim_a.width_ == mat_dim_b.height_) {
    if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) {
      mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0;
    }
    if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) {
      mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0;
    }
  }

  PADDLE_ENFORCE_EQ(mat_dim_a.width_,
                    mat_dim_b.height_,
                    platform::errors::InvalidArgument(
                        "Shape mistake in matmul_op, the "
                        "first tensor width must be same as "
                        "second tensor height, but received "
                        "width:%d, height:%d x_dims = %s , y_dims = %s",
                        mat_dim_a.width_,
                        mat_dim_b.height_,
                        x_dims.to_str().c_str(),
                        y_dims.to_str().c_str()));
  PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_,
                    mat_dim_b.batch_size_,
                    platform::errors::InvalidArgument(
                        "Shape mistake in matmul_op, the two input"
                        "tensor batch_size must be same, but received first "
                        "tensor batch_size:%d, second "
                        "tensor batch_size:%d, x_dims = %s , y_dims = %s",
                        mat_dim_a.batch_size_,
                        mat_dim_b.batch_size_,
                        x_dims.to_str().c_str(),
                        y_dims.to_str().c_str()));

  // Fix: the attribute was previously round-tripped through T
  // (static_cast<T>(ctx.Attr<float>("alpha"))), which truncated alpha to
  // float16 precision whenever T == platform::float16. The scaling factor
  // is consumed as float by the XPU kernels, so keep full float precision.
  float alpha = ctx.Attr<float>("alpha");
  T* data_c = out->data<T>();
  int m = mat_dim_a.height_;
  int n = mat_dim_b.width_;
  int k = mat_dim_a.width_;
  int batch_size = mat_dim_a.batch_size_;
  // Leading dimensions of the (row-major) operands and result.
  int ldx = mat_dim_a.trans_ ? m : k;
  int ldy = mat_dim_b.trans_ ? k : n;
  int ldout = n;

  if (batch_size <= 1) {
    int r = xpu_fc_wrapper<XPUType, FCT>(
        dev_ctx.x_context(),
        reinterpret_cast<const XPUType*>(x->data<T>()),
        reinterpret_cast<const XPUType*>(y->data<T>()),
        reinterpret_cast<XPUType*>(data_c),
        m,
        n,
        k,
        mat_dim_a.trans_,
        mat_dim_b.trans_,
        nullptr,
        nullptr,
        nullptr,
        ldx,
        ldy,
        ldout,
        alpha,
        0,
        nullptr,
        xpu::Activation_t::LINEAR);
    PADDLE_ENFORCE_EQ(
        r,
        XPU_SUCCESS,
        platform::errors::External(
            "XPU fc kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r]));
  } else {
    // batch matmul
    int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
        dev_ctx.x_context(),                              // Context* ctx,
        batch_size,                                       // int batch_size,
        mat_dim_a.trans_,                                 // bool x_trans,
        mat_dim_b.trans_,                                 // bool w_trans,
        m,                                                // int m,
        n,                                                // int n,
        k,                                                // int k,
        alpha,                                            // float alpha,
        reinterpret_cast<const XPUType*>(x->data<T>()),   // const TX* x,
        mat_dim_a.stride_,                                // int stride_a,
        reinterpret_cast<const XPUType*>(y->data<T>()),   // const TW* w,
        mat_dim_b.stride_,                                // int stride_b,
        0.0,                                              // float beta,
        reinterpret_cast<XPUType*>(data_c),               // TY* y,
        m * n,                                            // int stride_c,
        nullptr,                                          // const float* x_maxptr,
        nullptr);                                         // const float* w_maxptr
    PADDLE_ENFORCE_EQ(
        r,
        XPU_SUCCESS,
        platform::errors::External(
            "XPU fc_batched kernel return wrong value[%d %s] "
            "x_dims = %s , y_dims = %s",
            r,
            XPUAPIErrorMsg[r],
            x_dims.to_str().c_str(),
            y_dims.to_str().c_str()));
  }
}
template
<
typename
DeviceContext
,
typename
T
>
class
MatMulXPUKernel
:
public
framework
::
OpKernel
<
T
>
{
using
XPUType
=
typename
XPUTypeTrait
<
T
>::
Type
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
x
=
context
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
*
y
=
context
.
Input
<
framework
::
Tensor
>
(
"Y"
);
auto
*
out
=
context
.
Output
<
framework
::
Tensor
>
(
"Out"
);
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
x
=
context
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
*
y
=
context
.
Input
<
framework
::
Tensor
>
(
"Y"
);
auto
*
out
=
context
.
Output
<
framework
::
Tensor
>
(
"Out"
);
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
bool
trans_x
=
context
.
Attr
<
bool
>
(
"transpose_X"
);
bool
trans_y
=
context
.
Attr
<
bool
>
(
"transpose_Y"
);
if
(
std
::
is_same
<
paddle
::
platform
::
float16
,
T
>::
value
)
{
MatMulXPUFunction
<
T
,
int16_t
>
(
x
,
y
,
out
,
trans_x
,
trans_y
,
context
);
}
else
{
if
(
std
::
getenv
(
"XPU_PADDLE_FC_INT32"
)
!=
nullptr
)
{
MatMulXPUFunction
<
T
,
int32_t
>
(
x
,
y
,
out
,
trans_x
,
trans_y
,
context
);
}
else
if
(
std
::
getenv
(
"XPU_PADDLE_FC_LOCAL_INT16"
)
!=
nullptr
)
{
MatMulXPUFunction
<
T
,
float
>
(
x
,
y
,
out
,
trans_x
,
trans_y
,
context
);
}
else
{
MatMulXPUFunction
<
T
,
int16_t
>
(
x
,
y
,
out
,
trans_x
,
trans_y
,
context
);
}
}
}
};
// Reshape a rank-3 tensor from P x M x N to M x (P * N).
// (Warning: This requires transposing data and writes into new memory.)
// Identity op if the tensor is not of rank 3.
template
<
typename
DeviceContext
,
typename
T
>
static
framework
::
Tensor
XPUFoldHeadAndLastDims
(
const
DeviceContext
&
context
,
const
framework
::
Tensor
&
input
)
{
using
XPUType
=
typename
XPUTypeTrait
<
T
>::
Type
;
auto
in_dims
=
input
.
dims
();
if
(
in_dims
.
size
()
!=
3
)
{
return
input
;
}
float
alpha
=
static_cast
<
T
>
(
context
.
Attr
<
float
>
(
"alpha"
));
const
XPUType
*
x_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
x
->
data
<
T
>
());
const
XPUType
*
y_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
y
->
data
<
T
>
());
XPUType
*
out_ptr
=
reinterpret_cast
<
XPUType
*>
(
out
->
data
<
T
>
());
auto
x_dims
=
x
->
dims
();
auto
y_dims
=
y
->
dims
();
framework
::
Tensor
output
;
output
.
Resize
({
in_dims
[
1
],
in_dims
[
0
],
in_dims
[
2
]});
output
.
mutable_data
<
T
>
(
context
.
GetPlace
());
std
::
vector
<
int
>
in_shape_host
=
{
static_cast
<
int
>
(
in_dims
[
0
]),
static_cast
<
int
>
(
in_dims
[
1
]),
static_cast
<
int
>
(
in_dims
[
2
])};
std
::
vector
<
int
>
axis_host
=
{
1
,
0
,
2
};
int
r
=
xpu
::
transpose
(
context
.
x_context
(),
reinterpret_cast
<
const
XPUType
*>
(
input
.
data
<
T
>
()),
reinterpret_cast
<
XPUType
*>
(
output
.
data
<
T
>
()),
in_shape_host
,
axis_host
);
PADDLE_ENFORCE_EQ
(
r
,
XPU_SUCCESS
,
platform
::
errors
::
External
(
"XPU transpose kernel return wrong value[%d %s]"
,
r
,
XPUAPIErrorMsg
[
r
]));
output
.
Resize
({
in_dims
[
1
],
in_dims
[
0
]
*
in_dims
[
2
]});
XpuFcInfo
fc_info
;
GetFCInfo
(
x_dims
,
y_dims
,
trans_x
,
trans_y
,
&
fc_info
);
auto
&
dev_ctx
=
context
.
template
device_context
<
paddle
::
platform
::
XPUDeviceContext
>();
xpu
::
Context
*
xpu_ctx
=
dev_ctx
.
x_context
();
return
output
;
}
MatMulXPUFunction
<
XPUType
>
(
xpu_ctx
,
x_ptr
,
y_ptr
,
out_ptr
,
fc_info
,
alpha
);
}
};
// Using dimensional constraints on matrix multiplication, it is
// straight-forward to check the following table for when X and Y
...
...
@@ -317,107 +81,68 @@ static framework::Tensor XPUFoldHeadAndLastDims(
// to X: (P * M) x K, dOut: (P * M) x N.
template
<
typename
DeviceContext
,
typename
T
>
class
MatMulGradXPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
MatMul
(
const
framework
::
ExecutionContext
&
context
,
const
framework
::
Tensor
&
a
,
bool
trans_a
,
const
framework
::
Tensor
&
b
,
bool
trans_b
,
framework
::
Tensor
*
out
)
const
{
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
if
(
std
::
is_same
<
paddle
::
platform
::
float16
,
T
>::
value
)
{
MatMulXPUFunction
<
T
,
int16_t
>
(
&
a
,
&
b
,
out
,
trans_a
,
trans_b
,
context
);
}
else
{
if
(
std
::
getenv
(
"XPU_PADDLE_FC_INT32"
)
!=
nullptr
)
{
MatMulXPUFunction
<
T
,
int32_t
>
(
&
a
,
&
b
,
out
,
trans_a
,
trans_b
,
context
);
}
else
if
(
std
::
getenv
(
"XPU_PADDLE_FC_LOCAL_INT16"
)
!=
nullptr
)
{
MatMulXPUFunction
<
T
,
float
>
(
&
a
,
&
b
,
out
,
trans_a
,
trans_b
,
context
);
}
else
{
MatMulXPUFunction
<
T
,
int16_t
>
(
&
a
,
&
b
,
out
,
trans_a
,
trans_b
,
context
);
}
}
}
void
CalcInputGrad
(
const
framework
::
ExecutionContext
&
context
,
const
framework
::
Tensor
&
a
,
bool
trans_a
,
bool
is_fold_init_dims_a
,
const
framework
::
Tensor
&
b
,
bool
trans_b
,
bool
is_fold_init_dims_b
,
framework
::
Tensor
*
out
)
const
{
if
(
out
==
nullptr
)
return
;
bool
need_combine
=
(
a
.
dims
().
size
()
==
3
||
b
.
dims
().
size
()
==
3
)
&&
out
->
dims
().
size
()
==
2
;
if
(
!
need_combine
)
{
MatMul
(
context
,
a
,
trans_a
,
b
,
trans_b
,
out
);
}
else
{
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
MatMul
(
context
,
is_fold_init_dims_a
?
FoldInitDims
(
a
)
:
XPUFoldHeadAndLastDims
<
DeviceContext
,
T
>
(
dev_ctx
,
a
),
trans_a
,
is_fold_init_dims_b
?
FoldInitDims
(
b
)
:
XPUFoldHeadAndLastDims
<
DeviceContext
,
T
>
(
dev_ctx
,
b
),
trans_b
,
out
);
}
}
using
XPUType
=
typename
XPUTypeTrait
<
T
>::
Type
;
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
x
=
*
context
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
y
=
*
context
.
Input
<
framework
::
Tensor
>
(
"Y"
);
auto
dout
=
*
context
.
Input
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
context
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
context
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
auto
*
dx
=
context
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
context
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
bool
transpose_x
=
context
.
Attr
<
bool
>
(
"transpose_X"
);
bool
transpose_y
=
context
.
Attr
<
bool
>
(
"transpose_Y"
);
ReshapeXYOutIntoMatrixSequence
(
&
x
,
&
y
,
&
dout
,
transpose_x
,
transpose_y
);
framework
::
DDim
dx_dims
;
float
alpha
=
static_cast
<
T
>
(
context
.
Attr
<
float
>
(
"alpha"
));
if
(
dx
)
{
dx_dims
=
dx
->
dims
();
if
(
dx_dims
!=
x
.
dims
())
{
dx
->
Resize
(
x
.
dims
());
dx
->
mutable_data
<
T
>
(
context
.
GetPlace
());
}
}
framework
::
DDim
dy_dims
;
if
(
dy
)
{
dy_dims
=
dy
->
dims
();
if
(
dy_dims
!=
y
.
dims
())
{
dy
->
Resize
(
y
.
dims
());
}
}
if
(
transpose_x
&&
transpose_y
)
{
CalcInputGrad
(
context
,
y
,
true
,
true
,
dout
,
true
,
false
,
dx
);
CalcInputGrad
(
context
,
dout
,
true
,
true
,
x
,
true
,
false
,
dy
);
}
else
if
(
transpose_x
)
{
CalcInputGrad
(
context
,
y
,
false
,
false
,
dout
,
true
,
false
,
dx
);
CalcInputGrad
(
context
,
x
,
false
,
false
,
dout
,
false
,
true
,
dy
);
}
else
if
(
transpose_y
)
{
CalcInputGrad
(
context
,
dout
,
false
,
false
,
y
,
false
,
true
,
dx
);
CalcInputGrad
(
context
,
dout
,
true
,
true
,
x
,
false
,
true
,
dy
);
}
else
{
CalcInputGrad
(
context
,
dout
,
false
,
false
,
y
,
true
,
false
,
dx
);
CalcInputGrad
(
context
,
x
,
true
,
true
,
dout
,
false
,
true
,
dy
);
}
dy
->
mutable_data
<
T
>
(
context
.
GetPlace
());
}
auto
&
dev_ctx
=
context
.
template
device_context
<
paddle
::
platform
::
XPUDeviceContext
>();
const
XPUType
*
dout_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
dout
.
data
<
T
>
());
const
XPUType
*
x_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
x
.
data
<
T
>
());
const
XPUType
*
y_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
y
.
data
<
T
>
());
xpu
::
Context
*
xpu_ctx
=
dev_ctx
.
x_context
();
XpuFcInfo
info_forward
;
GetFCInfo
(
x
.
dims
(),
y
.
dims
(),
transpose_x
,
transpose_y
,
&
info_forward
);
xpu
::
ctx_guard
RAII_GUARD
(
xpu_ctx
);
// begin calculate
const
XPUType
*
a_1
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
const
XPUType
*
b_1
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
const
XPUType
*
a_2
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
const
XPUType
*
b_2
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
XPUType
*
c_1
=
(
dx
==
NULL
)
?
reinterpret_cast
<
XPUType
*>
(
NULL
)
:
reinterpret_cast
<
XPUType
*>
(
dx
->
data
<
T
>
());
XPUType
*
c_2
=
(
dy
==
NULL
)
?
reinterpret_cast
<
XPUType
*>
(
NULL
)
:
reinterpret_cast
<
XPUType
*>
(
dy
->
data
<
T
>
());
XpuFcInfo
info_dx
;
XpuFcInfo
info_dy
;
std
::
tuple
<
XpuFcInfo
,
XpuFcInfo
,
const
XPUType
*
,
const
XPUType
*
,
const
XPUType
*
,
const
XPUType
*>
fc_info
=
MatmulGradFcInfo
(
xpu_ctx
,
&
RAII_GUARD
,
info_forward
,
transpose_x
,
transpose_y
,
x_ptr
,
y_ptr
,
dout_ptr
);
std
::
tie
(
info_dx
,
info_dy
,
a_1
,
b_1
,
a_2
,
b_2
)
=
fc_info
;
if
(
dx
)
{
if
(
dx_dims
!=
x
.
dims
())
{
dx
->
Resize
(
dx_dims
);
MatMulXPUFunction
<
XPUType
>
(
xpu_ctx
,
a_1
,
b_1
,
c_1
,
info_dx
,
alpha
);
}
}
if
(
dy
)
{
if
(
dy_dims
!=
y
.
dims
())
{
dy
->
Resize
(
dy_dims
);
}
MatMulXPUFunction
<
XPUType
>
(
xpu_ctx
,
a_2
,
b_2
,
c_2
,
info_dy
,
alpha
);
}
}
};
...
...
paddle/fluid/operators/matmul_v2_op_xpu.cc
浏览文件 @
d752a7f2
...
...
@@ -16,146 +16,17 @@
#include <string>
#include <vector>
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/fluid/operators/xpu_api_wrapper.h"
namespace
paddle
{
namespace
operators
{
// matmul_v2 variant: computes out = matmul(x, y) on the XPU device
// (no alpha attribute — scaling is fixed at 1.0).
//
// T   — the tensor element type; FCT — the internal compute type handed to
// the XPU FC kernels (int16_t / int32_t / float), chosen by the caller.
//
// Inputs are normalized to matrix descriptors, rank>=3 x against rank<=2 y
// is flattened into a single GEMM where possible, and either xpu_fc_wrapper
// (batch_size <= 1) or xpu::fc_batched is dispatched. Shape mismatches and
// non-zero XPU return codes abort via PADDLE_ENFORCE.
template <typename T, typename FCT>
static void MatMulXPUFunction(const Tensor* x,
                              const Tensor* y,
                              Tensor* out,
                              bool trans_x,
                              bool trans_y,
                              const paddle::framework::ExecutionContext& ctx) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  const auto& x_dims = x->dims();
  const auto& y_dims = y->dims();
  auto& dev_ctx =
      ctx.template device_context<paddle::platform::XPUDeviceContext>();

  auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
      RowMatrixFromVector(x_dims), 0, trans_x);
  auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(
      ColumnMatrixFromVector(y_dims), 0, trans_y);

  if (x_dims.size() >= 3 && y_dims.size() <= 2) {
    // if transpose_X is true, the transpose cost much time
    if (!trans_x) {
      // Fold the batch into the row dimension so one GEMM covers all batches.
      mat_dim_a.height_ *= mat_dim_a.batch_size_;
      mat_dim_a.batch_size_ = 0;
    } else {
      mat_dim_b.batch_size_ = mat_dim_a.batch_size_;
      mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_;
    }
  }

  // Harmonize a 0-vs-1 batch-size mismatch when the inner dimensions agree.
  if (mat_dim_a.width_ == mat_dim_b.height_) {
    if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) {
      mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0;
    }
    if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) {
      mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0;
    }
  }

  PADDLE_ENFORCE_EQ(mat_dim_a.width_,
                    mat_dim_b.height_,
                    platform::errors::InvalidArgument(
                        "Shape mistake in matmul_v2_op xdims = %s ydims = %s "
                        "x_trans = %d y_trans = %d",
                        x_dims.to_str(),
                        y_dims.to_str(),
                        mat_dim_a.trans_,
                        mat_dim_b.trans_));
  PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_,
                    mat_dim_b.batch_size_,
                    platform::errors::InvalidArgument(
                        "Shape mistake in matmul_v2_op xdims = %s ydims = %s "
                        "x_trans = %d y_trans = %d",
                        x_dims.to_str(),
                        y_dims.to_str(),
                        mat_dim_a.trans_,
                        mat_dim_b.trans_));

  T* data_c = out->data<T>();
  int m = mat_dim_a.height_;
  int n = mat_dim_b.width_;
  int k = mat_dim_a.width_;
  int batch_size = mat_dim_a.batch_size_;
  // Leading dimensions of the (row-major) operands and result.
  int ldx = mat_dim_a.trans_ ? m : k;
  int ldy = mat_dim_b.trans_ ? k : n;
  int ldout = n;

  if (batch_size <= 1) {
    int r = xpu_fc_wrapper<XPUType, FCT>(
        dev_ctx.x_context(),
        reinterpret_cast<const XPUType*>(x->data<T>()),
        reinterpret_cast<const XPUType*>(y->data<T>()),
        reinterpret_cast<XPUType*>(data_c),
        m,
        n,
        k,
        mat_dim_a.trans_,
        mat_dim_b.trans_,
        nullptr,
        nullptr,
        nullptr,
        ldx,
        ldy,
        ldout,
        1.0,
        0,
        nullptr,
        xpu::Activation_t::LINEAR);
    PADDLE_ENFORCE_EQ(
        r,
        XPU_SUCCESS,
        platform::errors::External(
            "XPU fc kernel return wrong value[%d %s] , m = %d, n = "
            "%d, "
            "k = %d, a_tr = %d, b_tr = %d",
            r,
            XPUAPIErrorMsg[r],
            m,
            n,
            k,
            mat_dim_a.trans_,
            mat_dim_b.trans_));
  } else {
    // batch matmul
    int r = xpu::fc_batched<XPUType, XPUType, XPUType, FCT>(
        dev_ctx.x_context(),                              // Context* ctx,
        batch_size,                                       // int batch_size,
        mat_dim_a.trans_,                                 // bool x_trans,
        mat_dim_b.trans_,                                 // bool w_trans,
        m,                                                // int m,
        n,                                                // int n,
        k,                                                // int k,
        1.0,                                              // float alpha,
        reinterpret_cast<const XPUType*>(x->data<T>()),   // const TX* x,
        mat_dim_a.stride_,                                // int stride_a,
        reinterpret_cast<const XPUType*>(y->data<T>()),   // const TW* w,
        mat_dim_b.stride_,                                // int stride_b,
        0.0,                                              // float beta,
        reinterpret_cast<XPUType*>(data_c),               // TY* y,
        m * n,                                            // int stride_c,
        nullptr,                                          // const float* x_maxptr,
        nullptr);                                         // const float* w_maxptr
    PADDLE_ENFORCE_EQ(
        r,
        XPU_SUCCESS,
        platform::errors::External(
            "XPU fc_batched kernel return wrong value[%d %s]",
            r,
            XPUAPIErrorMsg[r]));
  }
}
template
<
typename
T
>
class
MatMulV2XPUKernel
:
public
framework
::
OpKernel
<
T
>
{
using
XPUType
=
typename
XPUTypeTrait
<
T
>::
Type
;
public:
void
Compute
(
const
paddle
::
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
...
...
@@ -164,160 +35,84 @@ class MatMulV2XPUKernel : public framework::OpKernel<T> {
bool
trans_x
=
ctx
.
Attr
<
bool
>
(
"trans_x"
);
bool
trans_y
=
ctx
.
Attr
<
bool
>
(
"trans_y"
);
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
std
::
is_same
<
paddle
::
platform
::
float16
,
T
>::
value
)
{
MatMulXPUFunction
<
T
,
int16_t
>
(
x
,
y
,
out
,
trans_x
,
trans_y
,
ctx
);
}
else
{
if
(
std
::
getenv
(
"XPU_PADDLE_FC_INT32"
)
!=
nullptr
)
{
MatMulXPUFunction
<
T
,
int32_t
>
(
x
,
y
,
out
,
trans_x
,
trans_y
,
ctx
);
}
else
if
(
std
::
getenv
(
"XPU_PADDLE_FC_LOCAL_INT16"
)
!=
nullptr
)
{
MatMulXPUFunction
<
T
,
float
>
(
x
,
y
,
out
,
trans_x
,
trans_y
,
ctx
);
}
else
{
MatMulXPUFunction
<
T
,
int16_t
>
(
x
,
y
,
out
,
trans_x
,
trans_y
,
ctx
);
}
}
const
XPUType
*
x_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
x
->
data
<
T
>
());
const
XPUType
*
y_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
y
->
data
<
T
>
());
XPUType
*
out_ptr
=
reinterpret_cast
<
XPUType
*>
(
out
->
data
<
T
>
());
auto
x_dims
=
x
->
dims
();
auto
y_dims
=
y
->
dims
();
XpuFcInfo
fc_info
;
GetFCInfo
(
x_dims
,
y_dims
,
trans_x
,
trans_y
,
&
fc_info
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
XPUDeviceContext
>();
xpu
::
Context
*
xpu_ctx
=
dev_ctx
.
x_context
();
MatMulXPUFunction
<
XPUType
>
(
xpu_ctx
,
x_ptr
,
y_ptr
,
out_ptr
,
fc_info
,
1.0
f
);
}
};
// Reshape a rank-3 tensor from P x M x N to M x (P * N).
// (Warning: This requires transposing data and writes into new memory.)
// Identity op if the tensor is not of rank 3.
template <typename DeviceContext, typename T>
static framework::Tensor XPUFoldHeadAndLastDims(
    const DeviceContext& context, const framework::Tensor& input) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  auto in_dims = input.dims();
  if (in_dims.size() != 3) {
    return input;
  }

  // Allocate the destination as [M, P, N] and transpose the data into it.
  framework::Tensor output;
  output.Resize({in_dims[1], in_dims[0], in_dims[2]});
  output.mutable_data<T>(context.GetPlace());

  std::vector<int> shape = {static_cast<int>(in_dims[0]),
                            static_cast<int>(in_dims[1]),
                            static_cast<int>(in_dims[2])};
  std::vector<int> perm = {1, 0, 2};  // swap the leading two axes
  int ret = xpu::transpose(context.x_context(),
                           reinterpret_cast<const XPUType*>(input.data<T>()),
                           reinterpret_cast<XPUType*>(output.data<T>()),
                           shape,
                           perm);
  PADDLE_ENFORCE_EQ(ret,
                    XPU_SUCCESS,
                    platform::errors::External(
                        "XPU transpose kernel return wrong value[%d %s]",
                        ret,
                        XPUAPIErrorMsg[ret]));

  // Collapse the transposed [M, P, N] buffer to the 2-D view [M, P * N].
  output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
  return output;
}
template
<
typename
T
>
class
MatMulV2XPUGradKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
MatMul
(
const
framework
::
ExecutionContext
&
ctx
,
const
framework
::
Tensor
&
a
,
bool
trans_a
,
const
framework
::
Tensor
&
b
,
bool
trans_b
,
framework
::
Tensor
*
out
)
const
{
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
if
(
std
::
is_same
<
paddle
::
platform
::
float16
,
T
>::
value
)
{
MatMulXPUFunction
<
T
,
int16_t
>
(
&
a
,
&
b
,
out
,
trans_a
,
trans_b
,
ctx
);
}
else
{
if
(
std
::
getenv
(
"XPU_PADDLE_FC_INT32"
)
!=
nullptr
)
{
MatMulXPUFunction
<
T
,
int32_t
>
(
&
a
,
&
b
,
out
,
trans_a
,
trans_b
,
ctx
);
}
else
if
(
std
::
getenv
(
"XPU_PADDLE_FC_LOCAL_INT16"
)
!=
nullptr
)
{
MatMulXPUFunction
<
T
,
float
>
(
&
a
,
&
b
,
out
,
trans_a
,
trans_b
,
ctx
);
}
else
{
MatMulXPUFunction
<
T
,
int16_t
>
(
&
a
,
&
b
,
out
,
trans_a
,
trans_b
,
ctx
);
}
}
}
void
CalcInputGrad
(
const
framework
::
ExecutionContext
&
context
,
const
framework
::
Tensor
&
a
,
bool
trans_a
,
bool
is_fold_init_dims_a
,
const
framework
::
Tensor
&
b
,
bool
trans_b
,
bool
is_fold_init_dims_b
,
framework
::
Tensor
*
out
)
const
{
if
(
out
==
nullptr
)
return
;
bool
need_combine
=
(
a
.
dims
().
size
()
==
3
||
b
.
dims
().
size
()
==
3
)
&&
out
->
dims
().
size
()
==
2
;
if
(
!
need_combine
)
{
MatMul
(
context
,
a
,
trans_a
,
b
,
trans_b
,
out
);
}
else
{
auto
&
dev_ctx
=
context
.
template
device_context
<
paddle
::
platform
::
XPUDeviceContext
>();
MatMul
(
context
,
is_fold_init_dims_a
?
FoldInitDims
(
a
)
:
XPUFoldHeadAndLastDims
<
paddle
::
platform
::
XPUDeviceContext
,
T
>
(
dev_ctx
,
a
),
trans_a
,
is_fold_init_dims_b
?
FoldInitDims
(
b
)
:
XPUFoldHeadAndLastDims
<
paddle
::
platform
::
XPUDeviceContext
,
T
>
(
dev_ctx
,
b
),
trans_b
,
out
);
}
}
using
XPUType
=
typename
XPUTypeTrait
<
T
>::
Type
;
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
bool
transpose_x
=
context
.
Attr
<
bool
>
(
"trans_x"
);
bool
transpose_y
=
context
.
Attr
<
bool
>
(
"trans_y"
);
auto
x
=
*
context
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
y
=
*
context
.
Input
<
framework
::
Tensor
>
(
"Y"
);
auto
dout
=
*
context
.
Input
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
context
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
context
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
ReshapeXYOutIntoMatrixSequence
(
&
x
,
&
y
,
&
dout
,
transpose_x
,
transpose_y
);
framework
::
DDim
dx_dims
;
if
(
dx
)
{
dx_dims
=
dx
->
dims
();
if
(
dx_dims
!=
x
.
dims
())
{
dx
->
Resize
(
x
.
dims
());
}
dx
->
mutable_data
<
T
>
(
context
.
GetPlace
());
}
framework
::
DDim
dy_dims
;
if
(
dy
)
{
dy_dims
=
dy
->
dims
();
if
(
dy_dims
!=
y
.
dims
())
{
dy
->
Resize
(
y
.
dims
());
}
}
if
(
transpose_x
&&
transpose_y
)
{
CalcInputGrad
(
context
,
y
,
true
,
true
,
dout
,
true
,
false
,
dx
);
CalcInputGrad
(
context
,
dout
,
true
,
true
,
x
,
true
,
false
,
dy
);
}
else
if
(
transpose_x
)
{
CalcInputGrad
(
context
,
y
,
false
,
false
,
dout
,
true
,
false
,
dx
);
CalcInputGrad
(
context
,
x
,
false
,
false
,
dout
,
false
,
true
,
dy
);
}
else
if
(
transpose_y
)
{
CalcInputGrad
(
context
,
dout
,
false
,
false
,
y
,
false
,
true
,
dx
);
CalcInputGrad
(
context
,
dout
,
true
,
true
,
x
,
false
,
true
,
dy
);
}
else
{
CalcInputGrad
(
context
,
dout
,
false
,
false
,
y
,
true
,
false
,
dx
);
CalcInputGrad
(
context
,
x
,
true
,
true
,
dout
,
false
,
true
,
dy
);
dy
->
mutable_data
<
T
>
(
context
.
GetPlace
());
}
auto
&
dev_ctx
=
context
.
template
device_context
<
paddle
::
platform
::
XPUDeviceContext
>();
const
XPUType
*
dout_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
dout
.
data
<
T
>
());
const
XPUType
*
x_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
x
.
data
<
T
>
());
const
XPUType
*
y_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
y
.
data
<
T
>
());
xpu
::
Context
*
xpu_ctx
=
dev_ctx
.
x_context
();
XpuFcInfo
info_forward
;
GetFCInfo
(
x
.
dims
(),
y
.
dims
(),
transpose_x
,
transpose_y
,
&
info_forward
);
xpu
::
ctx_guard
RAII_GUARD
(
xpu_ctx
);
// begin calculate
const
XPUType
*
a_1
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
const
XPUType
*
b_1
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
const
XPUType
*
a_2
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
const
XPUType
*
b_2
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
XPUType
*
c_1
=
(
dx
==
NULL
)
?
reinterpret_cast
<
XPUType
*>
(
NULL
)
:
reinterpret_cast
<
XPUType
*>
(
dx
->
data
<
T
>
());
XPUType
*
c_2
=
(
dy
==
NULL
)
?
reinterpret_cast
<
XPUType
*>
(
NULL
)
:
reinterpret_cast
<
XPUType
*>
(
dy
->
data
<
T
>
());
XpuFcInfo
info_dx
;
XpuFcInfo
info_dy
;
std
::
tuple
<
XpuFcInfo
,
XpuFcInfo
,
const
XPUType
*
,
const
XPUType
*
,
const
XPUType
*
,
const
XPUType
*>
fc_info
=
MatmulGradFcInfo
(
xpu_ctx
,
&
RAII_GUARD
,
info_forward
,
transpose_x
,
transpose_y
,
x_ptr
,
y_ptr
,
dout_ptr
);
std
::
tie
(
info_dx
,
info_dy
,
a_1
,
b_1
,
a_2
,
b_2
)
=
fc_info
;
if
(
dx
)
{
if
(
dx_dims
!=
x
.
dims
())
{
dx
->
Resize
(
dx_dims
);
MatMulXPUFunction
<
XPUType
>
(
xpu_ctx
,
a_1
,
b_1
,
c_1
,
info_dx
,
1.0
f
);
}
}
if
(
dy
)
{
if
(
dy_dims
!=
y
.
dims
())
{
dy
->
Resize
(
dy_dims
);
}
MatMulXPUFunction
<
XPUType
>
(
xpu_ctx
,
a_2
,
b_2
,
c_2
,
info_dy
,
1.0
f
);
}
}
};
...
...
paddle/fluid/operators/mul_op_xpu.cc
浏览文件 @
d752a7f2
...
...
@@ -49,50 +49,23 @@ class MulXPUKernel : public framework::OpKernel<T> {
*
y
,
context
.
template
Attr
<
int
>(
"y_num_col_dims"
))
:
*
y
;
z
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
z_dim
=
z
->
dims
();
if
(
z_dim
.
size
()
!=
2
)
{
z
->
Resize
({
x_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]});
}
const
XPUType
*
x_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
x_matrix
.
data
<
T
>
());
const
XPUType
*
y_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
y_matrix
.
data
<
T
>
());
XPUType
*
out_ptr
=
reinterpret_cast
<
XPUType
*>
(
z
->
data
<
T
>
());
bool
trans_a
=
false
;
bool
trans_b
=
false
;
int
m
=
x_matrix
.
dims
()[
0
];
int
k
=
x_matrix
.
dims
()[
1
];
int
k1
=
y_matrix
.
dims
()[
0
];
int
n
=
y_matrix
.
dims
()[
1
];
PADDLE_ENFORCE_EQ
(
k
,
k1
,
platform
::
errors
::
InvalidArgument
(
"Shape mistake in mul_op"
));
T
alpha
=
static_cast
<
T
>
(
1.0
);
T
beta
=
static_cast
<
T
>
(
0.0
);
const
T
*
data_a
=
x_matrix
.
data
<
T
>
();
const
T
*
data_b
=
y_matrix
.
data
<
T
>
();
T
*
data_c
=
z
->
data
<
T
>
();
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
int
ret
=
xpu_fc_wrapper
<
XPUType
,
int16_t
>
(
dev_ctx
.
x_context
(),
reinterpret_cast
<
const
XPUType
*>
(
data_a
),
reinterpret_cast
<
const
XPUType
*>
(
data_b
),
reinterpret_cast
<
XPUType
*>
(
data_c
),
m
,
n
,
k
,
trans_a
,
trans_b
,
nullptr
,
nullptr
,
nullptr
,
k
,
n
,
n
,
alpha
,
beta
,
nullptr
,
xpu
::
Activation_t
::
LINEAR
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
ret
,
"xpu_fc_wrapper"
);
if
(
z_dim
.
size
()
!=
2
)
{
z
->
Resize
(
z_dim
);
}
auto
x_dims
=
x_matrix
.
dims
();
auto
y_dims
=
y_matrix
.
dims
();
XpuFcInfo
fc_info
;
GetFCInfo
(
x_dims
,
y_dims
,
trans_a
,
trans_b
,
&
fc_info
);
auto
&
dev_ctx
=
context
.
template
device_context
<
paddle
::
platform
::
XPUDeviceContext
>();
xpu
::
Context
*
xpu_ctx
=
dev_ctx
.
x_context
();
MatMulXPUFunction
<
XPUType
>
(
xpu_ctx
,
x_ptr
,
y_ptr
,
out_ptr
,
fc_info
,
1.0
f
);
}
};
...
...
@@ -125,98 +98,51 @@ class MulGradXPUKernel : public framework::OpKernel<T> {
dy
->
set_lod
(
y
->
lod
());
}
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
XpuFcInfo
info_forward
;
GetFCInfo
(
x_matrix
.
dims
(),
y_matrix
.
dims
(),
false
,
false
,
&
info_forward
);
const
XPUType
*
dout_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
dout
->
data
<
T
>
());
const
XPUType
*
x_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
x
->
data
<
T
>
());
const
XPUType
*
y_ptr
=
reinterpret_cast
<
const
XPUType
*>
(
y
->
data
<
T
>
());
xpu
::
Context
*
xpu_ctx
=
dev_ctx
.
x_context
();
xpu
::
ctx_guard
RAII_GUARD
(
xpu_ctx
);
// begin calculate
const
XPUType
*
a_1
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
const
XPUType
*
b_1
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
const
XPUType
*
a_2
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
const
XPUType
*
b_2
=
reinterpret_cast
<
const
XPUType
*>
(
NULL
);
XPUType
*
c_1
=
(
dx
==
NULL
)
?
reinterpret_cast
<
XPUType
*>
(
NULL
)
:
reinterpret_cast
<
XPUType
*>
(
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
XPUType
*
c_2
=
(
dy
==
NULL
)
?
reinterpret_cast
<
XPUType
*>
(
NULL
)
:
reinterpret_cast
<
XPUType
*>
(
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
XpuFcInfo
info_dx
;
XpuFcInfo
info_dy
;
std
::
tuple
<
XpuFcInfo
,
XpuFcInfo
,
const
XPUType
*
,
const
XPUType
*
,
const
XPUType
*
,
const
XPUType
*>
fc_info
=
MatmulGradFcInfo
(
xpu_ctx
,
&
RAII_GUARD
,
info_forward
,
false
,
false
,
x_ptr
,
y_ptr
,
dout_ptr
);
std
::
tie
(
info_dx
,
info_dy
,
a_1
,
b_1
,
a_2
,
b_2
)
=
fc_info
;
if
(
dx
)
{
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
Tensor
dx_matrix
=
dx
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
dx
,
x_num_col_dims
)
:
*
dx
;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
// blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
bool
trans_a
=
false
;
bool
trans_b
=
true
;
int
m
=
dout_mat
.
dims
()[
0
];
int
k
=
dout_mat
.
dims
()[
1
];
int
n
=
y_matrix
.
dims
()[
0
];
int
k1
=
y_matrix
.
dims
()[
1
];
PADDLE_ENFORCE_EQ
(
k
,
k1
,
platform
::
errors
::
InvalidArgument
(
"Shape mistake in mul_op"
));
int
lda
=
(
!
trans_a
)
?
k
:
m
;
int
ldb
=
(
!
trans_b
)
?
n
:
k
;
int
ldc
=
n
;
T
alpha
=
static_cast
<
T
>
(
1.0
);
T
beta
=
static_cast
<
T
>
(
0.0
);
const
T
*
data_a
=
dout
->
data
<
T
>
();
const
T
*
data_b
=
y_matrix
.
data
<
T
>
();
T
*
data_c
=
dx_matrix
.
data
<
T
>
();
int
ret
=
xpu_fc_wrapper
<
XPUType
,
int16_t
>
(
dev_ctx
.
x_context
(),
reinterpret_cast
<
const
XPUType
*>
(
data_a
),
reinterpret_cast
<
const
XPUType
*>
(
data_b
),
reinterpret_cast
<
XPUType
*>
(
data_c
),
m
,
n
,
k
,
trans_a
,
trans_b
,
nullptr
,
nullptr
,
nullptr
,
lda
,
ldb
,
ldc
,
alpha
,
beta
,
nullptr
,
xpu
::
Activation_t
::
LINEAR
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
ret
,
"xpu_fc_wrapper"
);
MatMulXPUFunction
<
XPUType
>
(
xpu_ctx
,
a_1
,
b_1
,
c_1
,
info_dx
,
1.0
f
);
}
if
(
dy
)
{
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
Tensor
dy_matrix
=
dy
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
dy
,
y_num_col_dims
)
:
*
dy
;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
// blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
bool
trans_a
=
true
;
bool
trans_b
=
false
;
int
k
=
x_matrix
.
dims
()[
0
];
int
m
=
x_matrix
.
dims
()[
1
];
int
k1
=
dout_mat
.
dims
()[
0
];
int
n
=
dout_mat
.
dims
()[
1
];
PADDLE_ENFORCE_EQ
(
k
,
k1
,
platform
::
errors
::
InvalidArgument
(
"Shape mistake in mul_op"
));
int
lda
=
(
!
trans_a
)
?
k
:
m
;
int
ldb
=
(
!
trans_b
)
?
n
:
k
;
int
ldc
=
n
;
T
alpha
=
static_cast
<
T
>
(
1.0
);
T
beta
=
static_cast
<
T
>
(
0.0
);
const
T
*
data_a
=
x_matrix
.
data
<
T
>
();
const
T
*
data_b
=
dout
->
data
<
T
>
();
T
*
data_c
=
dy_matrix
.
data
<
T
>
();
int
ret
=
xpu_fc_wrapper
<
XPUType
,
int16_t
>
(
dev_ctx
.
x_context
(),
reinterpret_cast
<
const
XPUType
*>
(
data_a
),
reinterpret_cast
<
const
XPUType
*>
(
data_b
),
reinterpret_cast
<
XPUType
*>
(
data_c
),
m
,
n
,
k
,
trans_a
,
trans_b
,
nullptr
,
nullptr
,
nullptr
,
lda
,
ldb
,
ldc
,
alpha
,
beta
,
nullptr
,
xpu
::
Activation_t
::
LINEAR
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
ret
,
"xpu_fc_wrapper"
);
MatMulXPUFunction
<
XPUType
>
(
xpu_ctx
,
a_2
,
b_2
,
c_2
,
info_dy
,
1.0
f
);
}
}
};
...
...
paddle/fluid/operators/xpu_api_wrapper.h
浏览文件 @
d752a7f2
...
...
@@ -12,12 +12,176 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <vector>
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace
paddle
{
namespace
operators
{
using
float16
=
typename
XPUTypeTrait
<
paddle
::
platform
::
float16
>::
Type
;
enum
XPUFCCalcType
{
FC_INT16
=
0
,
FC_INT32
,
FC_FLOAT
,
};
template
<
typename
T
>
XPUFCCalcType
FCCalcType
()
{
if
(
std
::
is_same
<
paddle
::
platform
::
float16
,
T
>::
value
||
std
::
is_same
<
float16
,
T
>::
value
)
{
return
XPUFCCalcType
::
FC_INT16
;
}
else
if
(
std
::
getenv
(
"XPU_PADDLE_FC_INT32"
)
!=
nullptr
)
{
return
XPUFCCalcType
::
FC_INT32
;
}
else
if
(
std
::
getenv
(
"XPU_PADDLE_FC_LOCAL_INT16"
)
!=
nullptr
)
{
return
XPUFCCalcType
::
FC_FLOAT
;
}
return
XPUFCCalcType
::
FC_INT16
;
}
struct
XpuFcInfo
{
int
bs
;
int
m
;
int
n
;
int
k
;
bool
trans_x
;
bool
trans_y
;
int
stride_x
;
int
stride_y
;
int
stride_out
;
float
*
max_x
;
float
*
max_y
;
float
*
max_out
;
XpuFcInfo
()
:
bs
(
0
),
m
(
0
),
n
(
0
),
k
(
0
),
trans_x
(
false
),
trans_y
(
false
),
stride_x
(
0
),
stride_y
(
0
),
stride_out
(
0
),
max_x
(
nullptr
),
max_y
(
nullptr
),
max_out
(
nullptr
)
{}
void
InitFcInfo
(
int
bs
,
int
m
,
int
n
,
int
k
,
bool
trans_x
,
bool
trans_y
,
float
*
max_x
,
float
*
max_y
,
float
*
max_out
)
{
this
->
bs
=
bs
;
this
->
m
=
m
;
this
->
n
=
n
;
this
->
k
=
k
;
this
->
trans_x
=
trans_x
;
this
->
trans_y
=
trans_y
;
this
->
max_x
=
max_x
;
this
->
max_y
=
max_y
;
this
->
max_out
=
max_out
;
if
(
this
->
bs
<=
1
)
{
this
->
stride_x
=
trans_x
?
m
:
k
;
this
->
stride_y
=
trans_y
?
k
:
n
;
this
->
stride_out
=
n
;
}
else
{
this
->
stride_x
=
m
*
k
;
this
->
stride_y
=
k
*
n
;
this
->
stride_out
=
m
*
n
;
}
}
};
static
std
::
ostream
&
operator
<<
(
std
::
ostream
&
os
,
const
XpuFcInfo
&
fc_inf
)
{
os
<<
"fc_inf[ bs, m, n, k, trans_x, trans_y, stride_x, stride_y, "
"stride_out] = "
<<
"["
<<
fc_inf
.
bs
<<
", "
<<
fc_inf
.
m
<<
", "
<<
fc_inf
.
n
<<
", "
<<
fc_inf
.
k
<<
", "
<<
fc_inf
.
trans_x
<<
", "
<<
fc_inf
.
trans_y
<<
", "
<<
fc_inf
.
stride_x
<<
", "
<<
fc_inf
.
stride_y
<<
", "
<<
fc_inf
.
stride_out
;
return
os
;
}
static
void
GetFCInfo
(
const
phi
::
DDim
&
x_dims
,
const
phi
::
DDim
&
y_dims
,
bool
trans_x
,
bool
trans_y
,
XpuFcInfo
*
info
)
{
framework
::
DDim
new_x_dims
=
(
x_dims
.
size
()
>
1
)
?
x_dims
:
phi
::
make_ddim
({
1
,
x_dims
[
0
]});
framework
::
DDim
new_y_dims
=
(
y_dims
.
size
()
>
1
)
?
y_dims
:
phi
::
make_ddim
({
y_dims
[
0
],
1
});
auto
mat_dim_a
=
phi
::
funcs
::
CreateMatrixDescriptor
(
new_x_dims
,
0
,
trans_x
);
auto
mat_dim_b
=
phi
::
funcs
::
CreateMatrixDescriptor
(
new_y_dims
,
0
,
trans_y
);
if
(
x_dims
.
size
()
>=
3
&&
y_dims
.
size
()
<=
2
)
{
if
(
!
trans_x
)
{
mat_dim_a
.
height_
*=
mat_dim_a
.
batch_size_
;
mat_dim_a
.
batch_size_
=
0
;
}
else
{
mat_dim_b
.
batch_size_
=
mat_dim_a
.
batch_size_
;
mat_dim_b
.
height_
=
mat_dim_b
.
height_
/
mat_dim_b
.
batch_size_
;
}
}
if
(
y_dims
.
size
()
>=
3
&&
x_dims
.
size
()
<=
2
)
{
PADDLE_ENFORCE_EQ
(
mat_dim_b
.
trans_
,
false
,
platform
::
errors
::
InvalidArgument
(
"xpu not support this Shape in matmul_op xdims = %s ydims = %s "
"x_trans = %d y_trans = %d"
,
x_dims
.
to_str
(),
y_dims
.
to_str
(),
mat_dim_a
.
trans_
,
mat_dim_b
.
trans_
));
mat_dim_b
.
height_
*=
mat_dim_b
.
batch_size_
;
mat_dim_b
.
batch_size_
=
0
;
}
if
(
mat_dim_a
.
width_
==
mat_dim_b
.
height_
)
{
if
(
mat_dim_a
.
batch_size_
==
0
&&
mat_dim_b
.
batch_size_
==
1
)
{
mat_dim_a
.
batch_size_
=
mat_dim_b
.
batch_size_
=
0
;
}
if
(
mat_dim_a
.
batch_size_
==
1
&&
mat_dim_b
.
batch_size_
==
0
)
{
mat_dim_a
.
batch_size_
=
mat_dim_b
.
batch_size_
=
0
;
}
}
PADDLE_ENFORCE_EQ
(
mat_dim_a
.
width_
,
mat_dim_b
.
height_
,
platform
::
errors
::
InvalidArgument
(
"Shape mistake in matmul_op xdims = %s ydims = %s "
"x_trans = %d y_trans = %d"
,
x_dims
.
to_str
(),
y_dims
.
to_str
(),
mat_dim_a
.
trans_
,
mat_dim_b
.
trans_
));
info
->
m
=
mat_dim_a
.
height_
;
info
->
n
=
mat_dim_b
.
width_
;
info
->
k
=
mat_dim_a
.
width_
;
info
->
bs
=
mat_dim_a
.
batch_size_
;
info
->
trans_x
=
trans_x
;
info
->
trans_y
=
trans_y
;
if
(
info
->
bs
<=
1
)
{
info
->
stride_x
=
trans_x
?
info
->
m
:
info
->
k
;
info
->
stride_y
=
trans_y
?
info
->
k
:
info
->
n
;
info
->
stride_out
=
info
->
n
;
}
else
{
info
->
stride_x
=
info
->
m
*
info
->
k
;
info
->
stride_y
=
info
->
k
*
info
->
n
;
info
->
stride_out
=
info
->
m
*
info
->
n
;
}
}
template
<
typename
XPUType
,
typename
FCT
>
int
xpu_fc_wrapper
(
xpu
::
Context
*
ctx
,
static
void
xpu_fc_wrapper
(
xpu
::
Context
*
ctx
,
const
XPUType
*
x
,
const
XPUType
*
w
,
XPUType
*
y
,
...
...
@@ -42,12 +206,12 @@ int xpu_fc_wrapper(xpu::Context* ctx,
XPUType
*
l3_addr
=
nullptr
;
xpu
::
ctx_guard
RAII_GUARD
(
ctx
);
l3_addr
=
RAII_GUARD
.
alloc_l3_or_gm
<
XPUType
>
(
m
*
k
);
if
(
l3_addr
==
nullptr
)
return
XPUERR_NOMEM
;
PADDLE_ENFORCE_XDNN_NOT_NULL
(
l3_addr
)
;
std
::
vector
<
int
>
shape
=
{
k
,
m
};
std
::
vector
<
int
>
axis
=
{
1
,
0
};
r
=
xpu
::
transpose
<
XPUType
>
(
ctx
,
x
,
l3_addr
,
shape
,
axis
);
if
(
r
!=
XPU_SUCCESS
)
return
r
;
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"transpose"
)
;
r
=
xpu
::
fc_fusion
<
XPUType
,
XPUType
,
XPUType
,
FCT
>
(
ctx
,
l3_addr
,
...
...
@@ -68,7 +232,7 @@ int xpu_fc_wrapper(xpu::Context* ctx,
beta
,
bias
,
act
);
if
(
r
!=
XPU_SUCCESS
)
return
r
;
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"fc_fusion"
)
;
}
else
{
r
=
xpu
::
fc_fusion
<
XPUType
,
XPUType
,
XPUType
,
FCT
>
(
ctx
,
x
,
...
...
@@ -89,8 +253,356 @@ int xpu_fc_wrapper(xpu::Context* ctx,
beta
,
bias
,
act
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"fc_fusion"
);
}
return
r
;
}
template
<
>
void
xpu_fc_wrapper
<
float16
,
int32_t
>
(
xpu
::
Context
*
ctx
,
const
float16
*
x
,
const
float16
*
w
,
float16
*
y
,
int
m
,
int
n
,
int
k
,
bool
x_trans
,
bool
w_trans
,
const
float
*
x_maxptr
,
const
float
*
w_maxptr
,
float
*
y_maxptr
,
int
ldx
,
int
ldw
,
int
ldy
,
float
alpha
,
float
beta
,
const
float
*
bias
,
const
xpu
::
Activation_t
&
act
)
{
int
r
=
xpu
::
Error_t
::
INVALID_PARAM
;
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"xpu_fc_wrapper"
);
}
template
<
typename
XPUType
,
typename
FCT
>
static
void
xpu_fc_batch_wrapper
(
xpu
::
Context
*
xpu_ctx
,
int
bs
,
bool
trans_x
,
bool
trans_w
,
int
m
,
int
n
,
int
k
,
float
alpha
,
const
XPUType
*
x
,
int
stride_x
,
const
XPUType
*
w
,
int
stride_w
,
float
beta
,
XPUType
*
y
,
int
stride_y
,
const
float
*
x_maxptr
,
const
float
*
w_maxptr
)
{
int
r
=
xpu
::
fc_batched
<
XPUType
,
XPUType
,
XPUType
,
FCT
>
(
xpu_ctx
,
// Context* ctx,
bs
,
// int batch_size,
trans_x
,
// bool x_trans,
trans_w
,
// bool w_trans,
m
,
// int m,
n
,
// int n,
k
,
// int k,
alpha
,
// float alpha,
reinterpret_cast
<
const
XPUType
*>
(
x
),
// const TX* x,
stride_x
,
// int stride_a,
reinterpret_cast
<
const
XPUType
*>
(
w
),
// const TW* w,
stride_w
,
// int stride_b,
0.0
,
// float beta,
reinterpret_cast
<
XPUType
*>
(
y
),
// TY* y,
stride_y
,
// int stride_c,
x_maxptr
,
// const float* x_maxptr,
w_maxptr
);
// const float* w_maxptr
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"fc_batched"
);
}
template
<
>
void
xpu_fc_batch_wrapper
<
float16
,
int32_t
>
(
xpu
::
Context
*
xpu_ctx
,
int
bs
,
bool
trans_x
,
bool
trans_w
,
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float16
*
x
,
int
stride_x
,
const
float16
*
w
,
int
stride_w
,
float
beta
,
float16
*
y
,
int
stride_y
,
const
float
*
x_maxptr
,
const
float
*
w_maxptr
)
{
int
r
=
xpu
::
Error_t
::
INVALID_PARAM
;
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"xpu_fc_batch_wrapper"
);
}
template
<
>
void
xpu_fc_batch_wrapper
<
float16
,
float
>
(
xpu
::
Context
*
xpu_ctx
,
int
bs
,
bool
trans_x
,
bool
trans_w
,
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float16
*
x
,
int
stride_x
,
const
float16
*
w
,
int
stride_w
,
float
beta
,
float16
*
y
,
int
stride_y
,
const
float
*
x_maxptr
,
const
float
*
w_maxptr
)
{
int
r
=
xpu
::
Error_t
::
INVALID_PARAM
;
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"xpu_fc_batch_wrapper"
);
}
template
<
typename
T
>
static
void
MatMulXPUFunction
(
xpu
::
Context
*
xpu_ctx
,
const
T
*
x
,
const
T
*
y
,
T
*
out
,
const
XpuFcInfo
&
fcinfo
,
float
alpha
)
{
using
XPUType
=
typename
XPUTypeTrait
<
T
>::
Type
;
using
float16
=
typename
XPUTypeTrait
<
paddle
::
platform
::
float16
>::
Type
;
int
fccal_type
=
FCCalcType
<
XPUType
>
();
decltype
(
&
paddle
::
operators
::
xpu_fc_wrapper
<
XPUType
,
int16_t
>
)
fc_api_list
[
3
]
=
{
&
paddle
::
operators
::
xpu_fc_wrapper
<
XPUType
,
int16_t
>
,
&
paddle
::
operators
::
xpu_fc_wrapper
<
XPUType
,
int32_t
>
,
&
paddle
::
operators
::
xpu_fc_wrapper
<
XPUType
,
float
>
,
};
decltype
(
&
paddle
::
operators
::
xpu_fc_batch_wrapper
<
XPUType
,
int16_t
>
)
fc_batch_api_list
[
3
]
=
{
&
paddle
::
operators
::
xpu_fc_batch_wrapper
<
XPUType
,
int16_t
>
,
&
paddle
::
operators
::
xpu_fc_batch_wrapper
<
XPUType
,
int32_t
>
,
&
paddle
::
operators
::
xpu_fc_batch_wrapper
<
XPUType
,
float
>
,
};
auto
fc_api
=
fc_api_list
[
fccal_type
];
auto
fc_batch_api
=
fc_batch_api_list
[
fccal_type
];
int
m
=
fcinfo
.
m
;
int
n
=
fcinfo
.
n
;
int
k
=
fcinfo
.
k
;
int
batch_size
=
fcinfo
.
bs
;
int
ldx
=
fcinfo
.
stride_x
;
int
ldy
=
fcinfo
.
stride_y
;
int
ldout
=
fcinfo
.
stride_out
;
bool
trans_x
=
fcinfo
.
trans_x
;
bool
trans_y
=
fcinfo
.
trans_y
;
float
*
max_x
=
fcinfo
.
max_x
;
float
*
max_y
=
fcinfo
.
max_y
;
float
*
max_out
=
fcinfo
.
max_out
;
if
(
batch_size
<=
1
)
{
fc_api
(
xpu_ctx
,
reinterpret_cast
<
const
XPUType
*>
(
x
),
reinterpret_cast
<
const
XPUType
*>
(
y
),
reinterpret_cast
<
XPUType
*>
(
out
),
m
,
n
,
k
,
trans_x
,
trans_y
,
max_x
,
max_y
,
max_out
,
ldx
,
ldy
,
ldout
,
alpha
,
0
,
nullptr
,
xpu
::
Activation_t
::
LINEAR
);
}
else
{
// batch matmul
fc_batch_api
(
xpu_ctx
,
// Context* ctx,
batch_size
,
// int batch_size,
trans_x
,
// bool x_trans,
trans_y
,
// bool w_trans,
m
,
// int m,
n
,
// int n,
k
,
// int k,
alpha
,
// float alpha,
reinterpret_cast
<
const
XPUType
*>
(
x
),
// const TX* x,
ldx
,
// int stride_a,
reinterpret_cast
<
const
XPUType
*>
(
y
),
// const TW* w,
ldy
,
// int stride_b,
0.0
,
// float beta,
reinterpret_cast
<
XPUType
*>
(
out
),
// TY* y,
ldout
,
// int stride_c,
max_x
,
// const float* x_maxptr,
max_y
);
// const float* w_maxptr
}
}
template
<
typename
T
>
static
std
::
tuple
<
XpuFcInfo
,
XpuFcInfo
,
const
T
*
,
const
T
*
,
const
T
*
,
const
T
*>
MatmulGradFcInfo
(
xpu
::
Context
*
xpu_ctx
,
xpu
::
ctx_guard
*
RAII_GUARD
,
const
XpuFcInfo
&
dout_shape
,
bool
trans_x
,
bool
trans_y
,
const
T
*
x
,
const
T
*
y
,
const
T
*
dout
)
{
XpuFcInfo
dx_shape
,
dy_shape
;
const
T
*
dx_a
=
NULL
;
const
T
*
dx_b
=
NULL
;
const
T
*
dy_a
=
NULL
;
const
T
*
dy_b
=
NULL
;
bool
copy_to_l3
=
false
;
float
*
max_dout
=
NULL
;
int
maxptr_size
=
xpu_ctx
->
max_ptr_size
();
uint64_t
l3_size
=
uint64_t
(
xpu_ctx
->
_l3_mgr
.
get_size
());
int
bs
=
(
dout_shape
.
bs
<=
1
)
?
(
1
)
:
(
dout_shape
.
bs
);
int
dx_size
=
bs
*
dout_shape
.
m
*
dout_shape
.
k
;
int
dy_size
=
bs
*
dout_shape
.
k
*
dout_shape
.
n
;
int
dout_size
=
bs
*
dout_shape
.
m
*
dout_shape
.
n
;
if
(
trans_x
&&
trans_y
)
{
copy_to_l3
=
l3_size
>=
(
dout_size
*
2
+
dy_size
)
*
sizeof
(
T
);
}
else
if
(
trans_x
)
{
copy_to_l3
=
l3_size
>=
dout_size
*
sizeof
(
T
);
}
else
if
(
trans_y
)
{
copy_to_l3
=
l3_size
>=
dout_size
*
2
*
sizeof
(
T
);
}
else
{
copy_to_l3
=
l3_size
>=
(
dout_size
+
dx_size
)
*
sizeof
(
T
);
}
const
T
*
dout_new
=
dout
;
int
r
=
0
;
if
(
copy_to_l3
)
{
T
*
dout_l3
=
RAII_GUARD
->
alloc_l3
<
T
>
(
dout_size
);
PADDLE_ENFORCE_XDNN_NOT_NULL
(
dout_l3
);
if
((
dout_shape
.
bs
>
1
)
||
((
dout_shape
.
bs
<=
1
)
&&
(
FCCalcType
<
T
>
()
==
XPUFCCalcType
::
FC_FLOAT
)))
{
r
=
xpu
::
copy
(
xpu_ctx
,
dout
,
dout_l3
,
dout_size
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"copy"
);
dout_new
=
dout_l3
;
}
else
{
max_dout
=
RAII_GUARD
->
alloc_l3_or_gm
<
float
>
(
maxptr_size
);
PADDLE_ENFORCE_XDNN_NOT_NULL
(
max_dout
);
r
=
xpu
::
findmax_copy_fusion
(
xpu_ctx
,
dout
,
max_dout
,
dout_l3
,
dout_size
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"findmax_copy_fusion"
);
dout_new
=
dout_l3
;
}
}
else
if
(((
dout_shape
.
bs
<=
1
)
&&
(
FCCalcType
<
T
>
()
!=
XPUFCCalcType
::
FC_FLOAT
)))
{
max_dout
=
RAII_GUARD
->
alloc_l3_or_gm
<
float
>
(
maxptr_size
);
PADDLE_ENFORCE_XDNN_NOT_NULL
(
max_dout
);
r
=
xpu
::
findmax_copy_fusion
(
xpu_ctx
,
dout
,
max_dout
,
reinterpret_cast
<
T
*>
(
NULL
),
dout_size
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"findmax_copy_fusion"
);
}
if
(
trans_x
&&
trans_y
)
{
// dx = T(y) * T(dout)
dx_shape
.
InitFcInfo
(
dout_shape
.
bs
,
dout_shape
.
k
,
dout_shape
.
m
,
dout_shape
.
n
,
true
,
true
,
nullptr
,
max_dout
,
nullptr
);
dx_a
=
y
,
dx_b
=
dout_new
;
// dy = T(dout) * T(x)
dy_shape
.
InitFcInfo
(
dout_shape
.
bs
,
dout_shape
.
n
,
dout_shape
.
k
,
dout_shape
.
m
,
true
,
true
,
max_dout
,
nullptr
,
nullptr
);
dy_a
=
dout_new
,
dy_b
=
x
;
}
else
if
(
trans_x
)
{
// dx = y * T(dout)
dx_shape
.
InitFcInfo
(
dout_shape
.
bs
,
dout_shape
.
k
,
dout_shape
.
m
,
dout_shape
.
n
,
false
,
true
,
nullptr
,
max_dout
,
nullptr
);
dx_a
=
y
,
dx_b
=
dout_new
;
// dy = x * dout
dy_shape
.
InitFcInfo
(
dout_shape
.
bs
,
dout_shape
.
k
,
dout_shape
.
n
,
dout_shape
.
m
,
false
,
false
,
nullptr
,
max_dout
,
nullptr
);
dy_a
=
x
,
dy_b
=
dout_new
;
}
else
if
(
trans_y
)
{
// dx = dout * y
dx_shape
.
InitFcInfo
(
dout_shape
.
bs
,
dout_shape
.
m
,
dout_shape
.
k
,
dout_shape
.
n
,
false
,
false
,
max_dout
,
nullptr
,
nullptr
);
dx_a
=
dout_new
,
dx_b
=
y
;
// dy = T(dout) * x
dy_shape
.
InitFcInfo
(
dout_shape
.
bs
,
dout_shape
.
n
,
dout_shape
.
k
,
dout_shape
.
m
,
true
,
false
,
max_dout
,
nullptr
,
nullptr
);
dy_a
=
dout_new
,
dy_b
=
x
;
}
else
{
// dx = dout * T(y)
dx_shape
.
InitFcInfo
(
dout_shape
.
bs
,
dout_shape
.
m
,
dout_shape
.
k
,
dout_shape
.
n
,
false
,
true
,
max_dout
,
nullptr
,
nullptr
);
dx_a
=
dout_new
,
dx_b
=
y
;
// dy = T(x) * dout
dy_shape
.
InitFcInfo
(
dout_shape
.
bs
,
dout_shape
.
k
,
dout_shape
.
n
,
dout_shape
.
m
,
true
,
false
,
nullptr
,
max_dout
,
nullptr
);
dy_a
=
x
,
dy_b
=
dout_new
;
}
std
::
tuple
<
XpuFcInfo
,
XpuFcInfo
,
const
T
*
,
const
T
*
,
const
T
*
,
const
T
*>
result
=
std
::
make_tuple
(
dx_shape
,
dy_shape
,
dx_a
,
dx_b
,
dy_a
,
dy_b
);
return
result
;
}
}
// namespace operators
...
...
paddle/fluid/platform/device/xpu/xpu2_op_list.h
浏览文件 @
d752a7f2
...
...
@@ -281,11 +281,18 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType
(
vartype
::
INT64
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
())})},
{
"matmul_grad"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
())})},
{
"matmul_grad"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
())})},
{
"matmul_v2_grad"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
())})},
{
"matmul_v2"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
())})},
{
"matmul"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
())})},
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
())})},
{
"matmul_v2"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
())})},
{
"matmul"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
())})},
{
"mean_grad"
,
XPUKernelSet
({
pOpKernelType
(
vartype
::
FP32
,
XPUPlace
()),
pOpKernelType
(
vartype
::
FP16
,
XPUPlace
())})},
...
...
python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
浏览文件 @
d752a7f2
...
...
@@ -84,7 +84,9 @@ type_dict_str_to_numpy = {
xpu_test_op_white_list
=
[]
xpu_test_device_type_white_list
=
[
'xpu1_float64'
]
xpu_test_op_type_white_list
=
[
'dropout_float16'
,
'dropout_grad_float16'
]
xpu_test_op_type_white_list
=
[
'dropout_float16'
,
'dropout_grad_float16'
,
'matmul_v2_float16'
]
xpu_test_device_op_white_list
=
[]
xpu_test_device_op_type_white_list
=
[]
...
...
python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
浏览文件 @
d752a7f2
...
...
@@ -303,7 +303,8 @@ class TestMatmulBaseGenerator(XPUOpTest):
X
=
np
.
random
.
random
(
shape_X
).
astype
(
self
.
dtype
)
Y
=
np
.
random
.
random
(
shape_Y
).
astype
(
self
.
dtype
)
Out
=
reference_matmul
(
X
,
Y
,
transpose_X
,
transpose_Y
)
Out
=
reference_matmul
(
X
,
Y
,
transpose_X
,
transpose_Y
).
astype
(
self
.
dtype
)
self
.
inputs
=
{
'X'
:
X
,
'Y'
:
Y
}
self
.
attrs
=
{
'transpose_X'
:
transpose_X
,
'transpose_Y'
:
transpose_Y
}
self
.
outputs
=
{
'Out'
:
Out
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录