Commit c396ee65 (unverified)
Authored: Dec 24, 2021 by 努力努力在努力丶
Committed by: GitHub, Dec 24, 2021
Parent: 572b3e90

[MLU]add mlu op interface (#38241)

* [MLU]add mlu op interface
* [MLU]fix alpha of activation op

Showing 3 changed files with 3280 additions and 52 deletions (+3280 -52)
paddle/fluid/operators/activation_op_mlu.cc   +12    -22
paddle/fluid/operators/mlu/mlu_baseop.cc      +2345  -20
paddle/fluid/operators/mlu/mlu_baseop.h       +923   -10
paddle/fluid/operators/activation_op_mlu.cc  (view file @ c396ee65)

@@ -27,40 +27,37 @@ namespace operators {

 using Tensor = framework::Tensor;

-template <typename DeviceContext, cnnlActivationMode_t act_mode, typename T>
+template <cnnlActivationMode_t act_mode, typename T>
 class ActivationMLUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;

     output->mutable_data<T>(ctx.GetPlace());

-    MLUCnnlActivationDesc act_desc(act_mode, alpha_);
+    MLUCnnlActivationDesc act_desc(act_mode, alpha);
     MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
                                  ToCnnlDataType(input->type()));
     MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
                                   ToCnnlDataType(output->type()));

-    MLUCnnl::Active(dev_ctx, act_desc.get(), input_desc.get(),
+    MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(),
                     reinterpret_cast<const void*>(input->data<T>()),
                     output_desc.get(),
                     reinterpret_cast<void*>(output->data<T>()));
   }
-
- private:
-  float alpha_ = 1.0;
 };

-template <typename DeviceContext, cnnlActivationMode_t act_mode, typename T>
+template <cnnlActivationMode_t act_mode, typename T>
 class ActivationGradMLUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;

     dx->mutable_data<T>(ctx.GetPlace());

@@ -70,16 +67,13 @@ class ActivationGradMLUKernel : public framework::OpKernel<T> {
                               ToCnnlDataType(out->type()));
     MLUCnnlTensorDesc dx_desc(*dx, CNNL_LAYOUT_ARRAY,
                               ToCnnlDataType(dx->type()));
-    MLUCnnlActivationDesc act_desc(act_mode, alpha_);
+    MLUCnnlActivationDesc act_desc(act_mode, alpha);

     MLUCnnl::ActiveGrad(
-        dev_ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
+        ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
         dout_desc.get(), reinterpret_cast<const void*>(dout->data<T>()),
         out_desc.get(), reinterpret_cast<const void*>(out->data<T>()),
         dx_desc.get(), reinterpret_cast<void*>(dx->data<T>()));
   }
-
- private:
-  float alpha_ = 1.0;
 };

 }  // namespace operators

@@ -88,13 +82,9 @@ class ActivationGradMLUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;

 REGISTER_OP_MLU_KERNEL(
-    relu, ops::ActivationMLUKernel<paddle::platform::MLUDeviceContext,
-                                   CNNL_ACTIVATION_RELU, float>,
-    ops::ActivationMLUKernel<paddle::platform::MLUDeviceContext,
-                             CNNL_ACTIVATION_RELU, paddle::platform::float16>);
+    relu, ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, float>,
+    ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, paddle::platform::float16>);

 REGISTER_OP_MLU_KERNEL(
-    relu_grad, ops::ActivationGradMLUKernel<paddle::platform::MLUDeviceContext,
-                                            CNNL_ACTIVATION_RELU, float>,
-    ops::ActivationGradMLUKernel<paddle::platform::MLUDeviceContext,
-                                 CNNL_ACTIVATION_RELU,
+    relu_grad, ops::ActivationGradMLUKernel<CNNL_ACTIVATION_RELU, float>,
+    ops::ActivationGradMLUKernel<CNNL_ACTIVATION_RELU,
                                  paddle::platform::float16>);
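The change in this file does two things: the kernels drop the DeviceContext template parameter, and the fixed alpha_ member is replaced by a value read from the op's "alpha" attribute when the op carries one, defaulting to 1.0f otherwise (the "fix alpha of activation op" part of the commit message). Below is a minimal standalone sketch of that "optional attribute with default" pattern. FakeAttrs is a hypothetical stand-in for the attribute map; the real kernel uses ExecutionContext::HasAttr / Attr<float> as shown in the diff above.

#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for an op's attribute map (not Paddle's real API).
struct FakeAttrs {
  std::map<std::string, float> float_attrs;
  bool HasAttr(const std::string& name) const { return float_attrs.count(name) > 0; }
  float Attr(const std::string& name) const { return float_attrs.at(name); }
};

// Mirrors the pattern in the new kernel: use the attribute if present, else 1.0f.
float ResolveAlpha(const FakeAttrs& attrs) {
  return attrs.HasAttr("alpha") ? attrs.Attr("alpha") : 1.0f;
}

int main() {
  FakeAttrs leaky_relu{{{"alpha", 0.02f}}};
  FakeAttrs relu;  // no alpha attribute
  std::cout << ResolveAlpha(leaky_relu) << " " << ResolveAlpha(relu) << "\n";  // 0.02 1
}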
paddle/fluid/operators/mlu/mlu_baseop.cc  (view file @ c396ee65)

@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
-#include <paddle/fluid/framework/data_type.h>
-#include <paddle/fluid/framework/operator.h>
-#include <map>
-#include <string>
-#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/operator.h"

 namespace paddle {
 namespace operators {
@@ -57,7 +54,7 @@ class MLUCnnlTensorDescPool {

 static MLUCnnlTensorDescPool g_cnnl_tensor_desc_pool;

 MLUCnnlTensorDesc &MLUCnnlTensorDesc::operator=(MLUCnnlTensorDesc &&rhs) {
   if (raw_tensor_desc) {
     g_cnnl_tensor_desc_pool.Recycle(raw_tensor_desc);
   }

@@ -138,7 +135,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const int tensor_dim,
       cnnlSetTensorDescriptorPosition(raw_tensor_desc, position));
 }

 MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor,
                                      const cnnlTensorLayout_t layout,
                                      const cnnlDataType_t tensor_dtype) {
   auto dims = framework::vectorize<int>(tensor.dims());

@@ -156,7 +153,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor,
   }
 }

 MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor,
                                      cnnlTensorLayout_t layout,
                                      const cnnlDataType_t tensor_dtype,
                                      int position)

@@ -165,7 +162,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor,
       cnnlSetTensorDescriptorPosition(raw_tensor_desc, position));
 }

 MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor,
                                      cnnlTensorLayout_t layout,
                                      const cnnlDataType_t tensor_dtype,
                                      int position, float scale)
@@ -197,31 +194,2359 @@ MLUCnnlActivationDesc::~MLUCnnlActivationDesc() {
  }
}

-/* static */ void MLUCnnl::Active(const platform::MLUDeviceContext& ctx,

MLUCnnlPoolingDesc::MLUCnnlPoolingDesc(
    const cnnlPoolingMode_t mode, const cnnlNanPropagation_t maxpooling_nan_opt,
    int window_rows, int window_cols, int64_t pad_up, int64_t pad_down,
    int64_t pad_left, int64_t pad_right, int row_stride, int col_stride) {
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreatePoolingDescriptor(&pooling_desc_));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetPooling2dDescriptor(
      pooling_desc_, mode, maxpooling_nan_opt, window_rows, window_cols, pad_up,
      pad_down, pad_left, pad_right, row_stride, col_stride));
}

MLUCnnlPoolingDesc::MLUCnnlPoolingDesc(
    const cnnlPoolingMode_t mode, const cnnlNanPropagation_t maxpooling_nan_opt,
    const int tensor_rank, const std::vector<int>& window,
    const std::vector<int>& padding, const std::vector<int>& stride) {
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreatePoolingDescriptor(&pooling_desc_));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetPoolingNdDescriptor(
      pooling_desc_, mode, maxpooling_nan_opt, tensor_rank, window.data(),
      padding.data(), stride.data()));
}

const cnnlPoolingDescriptor_t MLUCnnlPoolingDesc::get() const { return pooling_desc_; }

MLUCnnlPoolingDesc::~MLUCnnlPoolingDesc() {
  if (pooling_desc_) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyPoolingDescriptor(pooling_desc_));
  }
}

MLUCnnlRandomGeneratorDesc::MLUCnnlRandomGeneratorDesc(const bool is_mlu200, const int seed) {
  if (is_mlu200) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_FAST));
  } else {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_MTGP32));
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandSetPseudoRandomGeneratorSeed(mlu_generator, seed));
  }
}

const cnnlRandGenerator_t MLUCnnlRandomGeneratorDesc::get() const { return mlu_generator; }

MLUCnnlRandomGeneratorDesc::~MLUCnnlRandomGeneratorDesc() {
  if (mlu_generator) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandDestroyGenerator(mlu_generator));
  }
}

MLUCnnlNMSDesc::MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode, const float iou_threshold,
                               const int max_output_size, const float confidence_threshold,
                               const int input_layout) {
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateNmsDescriptor(&nms_desc_));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetNmsDescriptor_v2(nms_desc_, mode, iou_threshold,
                                                     max_output_size, confidence_threshold,
                                                     input_layout));
}

const cnnlNmsDescriptor_t MLUCnnlNMSDesc::get() const { return nms_desc_; }

MLUCnnlNMSDesc::~MLUCnnlNMSDesc() {
  if (nms_desc_) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyNmsDescriptor(nms_desc_));
  }
}

MLUCnnlReduceDesc::MLUCnnlReduceDesc(const std::vector<int>& axis_vec,
                                     const cnnlReduceOp_t reduce_op,
                                     const cnnlDataType_t data_type,
                                     const cnnlNanPropagation_t nan_propagation,
                                     const cnnlReduceIndices_t reduce_indices,
                                     const cnnlIndicesType_t indices_type) {
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateReduceDescriptor(&reduction_desc_));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetReduceDescriptor(
      reduction_desc_, const_cast<int*>(axis_vec.data()), axis_vec.size(), reduce_op,
      data_type, nan_propagation, reduce_indices, indices_type));
}

const cnnlReduceDescriptor_t MLUCnnlReduceDesc::get() const { return reduction_desc_; }

MLUCnnlReduceDesc::~MLUCnnlReduceDesc() {
  if (reduction_desc_) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyReduceDescriptor(reduction_desc_));
  }
}

MLUCnnlOpTensorDesc::MLUCnnlOpTensorDesc(cnnlOpTensorDesc_t op_tensor_op,
                                         cnnlDataType_t op_tensor_comp_type,
                                         cnnlNanPropagation_t op_tensor_nan_opt) {
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateOpTensorDescriptor(&op_tensor_desc_));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetOpTensorDescriptor(
      op_tensor_desc_, op_tensor_op, op_tensor_comp_type, op_tensor_nan_opt));
}

const cnnlOpTensorDescriptor_t MLUCnnlOpTensorDesc::get() const { return op_tensor_desc_; }

MLUCnnlOpTensorDesc::~MLUCnnlOpTensorDesc() {
  if (op_tensor_desc_) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyOpTensorDescriptor(op_tensor_desc_));
  }
}

MLUCnnlConvolutionDesc::MLUCnnlConvolutionDesc(
    const int dims, const int pad[], const int stride[], const int dilation[],
    const int group_count, const cnnlDataType_t tensor_dtype) {
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateConvolutionDescriptor(&conv_desc_));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetConvolutionDescriptor(
      conv_desc_, dims, pad, stride, dilation, group_count, tensor_dtype));
}

MLUCnnlConvolutionDesc::MLUCnnlConvolutionDesc(
    const int dims, const int64_t pad[], const int64_t stride[], const int64_t dilation[],
    const int group_count, const cnnlDataType_t tensor_dtype) {
  const int spatial_dims = dims - 2;
  const int pad_dims = spatial_dims * 2;
  std::vector<int> pad_int32(pad_dims);
  std::vector<int> stride_int32(spatial_dims);
  std::vector<int> dilation_int32(spatial_dims);
  std::vector<int64_t>::const_iterator int64_pad_cbegin(pad);
  std::vector<int64_t>::const_iterator int64_pad_cend(pad + pad_dims);
  std::vector<int64_t>::const_iterator int64_stride_cbegin(stride);
  std::vector<int64_t>::const_iterator int64_stride_cend(stride + spatial_dims);
  std::vector<int64_t>::const_iterator int64_dilation_cbegin(dilation);
  std::vector<int64_t>::const_iterator int64_dilation_cend(dilation + spatial_dims);
  std::transform(int64_pad_cbegin, int64_pad_cend, pad_int32.begin(),
                 &CheckedNarrowing<int64_t, int>);
  std::transform(int64_stride_cbegin, int64_stride_cend, stride_int32.begin(),
                 &CheckedNarrowing<int64_t, int>);
  std::transform(int64_dilation_cbegin, int64_dilation_cend, dilation_int32.begin(),
                 &CheckedNarrowing<int64_t, int>);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateConvolutionDescriptor(&conv_desc_));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetConvolutionDescriptor(
      conv_desc_, dims, pad_int32.data(), stride_int32.data(), dilation_int32.data(),
      group_count, tensor_dtype));
}

const cnnlConvolutionDescriptor_t MLUCnnlConvolutionDesc::get() const { return conv_desc_; }

MLUCnnlConvolutionDesc::~MLUCnnlConvolutionDesc() {
  if (conv_desc_) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyConvolutionDescriptor(conv_desc_));
  }
}

MLUCnnlBatchSpaceDesc::MLUCnnlBatchSpaceDesc(uint32_t block_shape[], uint32_t paddings[],
                                             const uint32_t block_shape_size,
                                             const uint32_t paddings_size) {
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateSpaceBatchNdDescriptor(&op_desc_));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetSpaceBatchNdDescriptor(
      op_desc_, block_shape, block_shape_size, paddings, paddings_size));
}

void MLUCnnlBatchSpaceDesc::getSpace2batchNdextraInputSize(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSpace2batchNdExtraInputSize(
      handle, input_desc, op_desc_, &extra_input_size_));
}

void MLUCnnlBatchSpaceDesc::getBatch2spaceNdextraInputSize(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBatch2spaceNdExtraInputSize(
      handle, input_desc, op_desc_, &extra_input_size_));
}

void MLUCnnlBatchSpaceDesc::initSpace2batchNdExtraInput(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
    void* extra_host_input) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlInitSpace2batchNdExtraInput(
      handle, input_desc, op_desc_, extra_host_input));
}

void MLUCnnlBatchSpaceDesc::initBatch2spaceNdExtraInput(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
    void* extra_host_input) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlInitBatch2spaceNdExtraInput(
      handle, input_desc, op_desc_, extra_host_input));
}

const cnnlSpaceBatchNdDescriptor_t MLUCnnlBatchSpaceDesc::get() const { return op_desc_; }

size_t MLUCnnlBatchSpaceDesc::getExtraInputSize() const { return extra_input_size_; }

MLUCnnlBatchSpaceDesc::~MLUCnnlBatchSpaceDesc() {
  if (op_desc_) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroySpaceBatchNdDescriptor(op_desc_));
  }
}

MLUCnnlTrigonDesc::MLUCnnlTrigonDesc(const cnnlTrigonFunctionMode_t trigon_function_mode) {
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateTrigonDescriptor(&trigon_desc_));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetTrigonDescriptor(trigon_desc_, trigon_function_mode));
}

const cnnlTrigonDescriptor_t MLUCnnlTrigonDesc::get() const { return trigon_desc_; }

MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
  if (trigon_desc_) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyTrigonDescriptor(trigon_desc_));
  }
}
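All of the descriptor classes added above follow the same create-in-constructor / destroy-in-destructor shape around CNNL's C-style descriptor API, with each library call checked through PADDLE_ENFORCE_MLU_SUCCESS. Below is a standalone sketch of that RAII idea; the fake* functions, FakeDesc_t type, and ENFORCE_OK macro are invented stand-ins for illustration only, not the real cnnl* API or Paddle's enforce macro.

#include <cstdlib>
#include <iostream>

// Invented stand-ins for a C-style descriptor API (create/set/destroy return a status).
enum FakeStatus { FAKE_OK = 0, FAKE_ERR = 1 };
struct FakeDescImpl { int mode = 0; };
using FakeDesc_t = FakeDescImpl*;

FakeStatus fakeCreateDesc(FakeDesc_t* d) { *d = new FakeDescImpl(); return FAKE_OK; }
FakeStatus fakeSetDesc(FakeDesc_t d, int mode) { d->mode = mode; return FAKE_OK; }
FakeStatus fakeDestroyDesc(FakeDesc_t d) { delete d; return FAKE_OK; }

#define ENFORCE_OK(expr)                                      \
  do {                                                        \
    if ((expr) != FAKE_OK) {                                  \
      std::cerr << "call failed: " #expr "\n";                \
      std::abort();                                           \
    }                                                         \
  } while (0)

// RAII wrapper: acquire the descriptor in the constructor, release it in the
// destructor, expose the raw handle via get() for library calls that consume it.
class FakeDescWrapper {
 public:
  explicit FakeDescWrapper(int mode) {
    ENFORCE_OK(fakeCreateDesc(&desc_));
    ENFORCE_OK(fakeSetDesc(desc_, mode));
  }
  ~FakeDescWrapper() {
    if (desc_) ENFORCE_OK(fakeDestroyDesc(desc_));
  }
  FakeDesc_t get() const { return desc_; }

 private:
  FakeDesc_t desc_ = nullptr;
};

int main() {
  FakeDescWrapper desc(/*mode=*/2);
  std::cout << "mode = " << desc.get()->mode << "\n";  // descriptor freed automatically
}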
-/* static */ void MLUCnnl::Active(const platform::MLUDeviceContext& ctx,
/* static */ void MLUCnnl::Active(const ExecutionContext& ctx,
                                  cnnlActivationDescriptor_t active_desc,
                                  const cnnlTensorDescriptor_t input_desc, const void* input,
                                  const cnnlTensorDescriptor_t output_desc, void* output) {
-  cnnlHandle_t handle = ctx.cnnl_handle();
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlActivationForward(handle, active_desc, NULL, input_desc,
                                                   input, NULL, output_desc, output));
}

-/* static */ void MLUCnnl::ActiveGrad(const platform::MLUDeviceContext& ctx,
/* static */ void MLUCnnl::ActiveGrad(const ExecutionContext& ctx,
                                      cnnlActivationDescriptor_t active_desc,
                                      const void* alpha, const void* beta,
                                      const cnnlTensorDescriptor_t y_desc, const void* y,
                                      const cnnlTensorDescriptor_t diff_y_desc,
                                      const void* diff_y,
                                      const cnnlTensorDescriptor_t x_desc, const void* x,
                                      const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) {
-  cnnlHandle_t handle = ctx.cnnl_handle();
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlActivationBackward(handle, active_desc, alpha, y_desc, y,
                                                    diff_y_desc, diff_y, x_desc, x, beta,
                                                    diff_x_desc, diff_x));
}
/* static */ void MLUCnnl::Concat(const ExecutionContext& ctx, const int pack_num,
                                  const int axis, const cnnlTensorDescriptor_t inputs_desc[],
                                  const void* const inputs[],
                                  const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc, inputs,
                                        workspace_ptr, workspace_size, output_desc, output));
}
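Concat is the first of many wrappers in this file that follow a query-then-run workspace pattern: ask the library how many scratch bytes the operation needs, allocate a temporary buffer through the execution context, then hand the pointer and size to the compute call. The standalone sketch below shows the same flow with invented stand-in functions (fakeGetWorkspaceSize / fakeCompute); the real code queries cnnlGet*WorkspaceSize and allocates via ctx.AllocateTmpTensor.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Invented stand-ins: a size query and a compute call that uses caller-provided scratch.
size_t fakeGetWorkspaceSize(size_t n) { return n * sizeof(float); }
void fakeCompute(const float* in, float* out, size_t n, void* workspace, size_t ws_bytes) {
  (void)ws_bytes;
  float* scratch = static_cast<float*>(workspace);
  for (size_t i = 0; i < n; ++i) scratch[i] = in[i] * 2.0f;  // stage into scratch
  for (size_t i = 0; i < n; ++i) out[i] = scratch[i] + 1.0f;
}

int main() {
  const size_t n = 4;
  std::vector<float> in{1, 2, 3, 4}, out(n);

  // 1) query the workspace size, 2) allocate a temporary byte buffer, 3) run the op.
  size_t workspace_size = fakeGetWorkspaceSize(n);
  std::vector<int8_t> workspace(workspace_size);
  fakeCompute(in.data(), out.data(), n, workspace.data(), workspace_size);

  for (float v : out) std::cout << v << " ";  // 3 5 7 9
  std::cout << "\n";
}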
/* static */ void MLUCnnl::Div(const ExecutionContext& ctx, cnnlComputationPreference_t prefer,
                               const cnnlTensorDescriptor_t in0_desc, const void* in0,
                               const cnnlTensorDescriptor_t in1_desc, const void* in1,
                               const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlGetDivWorkspaceSize(handle, in0_desc, in1_desc, output_desc, &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlDiv_v2(handle, prefer, in0_desc, in0, in1_desc, in1,
                                        workspace_ptr, workspace_size, output_desc, output));
}

/* static */ void MLUCnnl::Fill(const ExecutionContext& ctx, float value,
                                const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlFill(handle, value, output_desc, output));
}

/* static */ void MLUCnnl::QuantifyOffline(const ExecutionContext& ctx, cnnlQuantizeMode_t mode,
                                           const cnnlTensorDescriptor_t input_desc,
                                           const void* input,
                                           const cnnlTensorDescriptor_t output_desc,
                                           void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlQuantizeV1(handle, mode, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::LRN(const ExecutionContext& ctx, const int local_size,
                               const double alpha, const double beta, const double k,
                               const cnnlTensorDescriptor_t input_quant_desc,
                               const void* input_quant,
                               const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetLrnWorkspaceSize(handle, input_quant_desc, output_desc,
                                                     local_size, &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  const cnnlLrnMode_t mode = CNNL_LRN_CROSS_CHANNEL;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlLrn(handle, mode, local_size, alpha, beta, k, workspace_ptr,
                                     workspace_size, input_quant_desc,
                                     const_cast<void*>(input_quant), output_desc, output));
}

/* static */ void MLUCnnl::QuantifyOnline(const ExecutionContext& ctx, const int bitwidth,
                                          const cnnlTensorDescriptor_t input_desc,
                                          const void* input, const bool compute_scale,
                                          void* position, void* scale,
                                          const cnnlTensorDescriptor_t output_desc,
                                          void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlGetQuantizeParamWorkspaceSize(handle, input_desc, &workspace_size));

  // use ctx allocate interface for profiling purpose
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  const cnnlQuantizeMode_t mode =
      compute_scale ? CNNL_QUANTIZE_POSITION_SCALE : CNNL_QUANTIZE_POSITION;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeParam(handle, mode, input_desc, input, bitwidth,
                                               workspace_ptr, workspace_size, position, scale,
                                               nullptr));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeV2(handle, mode, input_desc, input, position, scale,
                                            nullptr, output_desc, output));
}
/* static */ void MLUCnnl::Range(const ExecutionContext& ctx, const void* start,
                                 const void* end, const void* step,
                                 const cnnlDataType_t output_dtype, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlArange(handle, start, end, step, output_dtype, output));
}

/* static */ void MLUCnnl::Round(const ExecutionContext& ctx,
                                 const cnnlTensorDescriptor_t input_desc, const void* input,
                                 const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlRound(handle, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::SparseSoftmaxXentWithLogits(
    const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, const cnnlTensorDescriptor_t x_desc,
    const void* input, const cnnlTensorDescriptor_t label_desc, const void* label,
    const cnnlTensorDescriptor_t y_desc, void* output,
    const cnnlTensorDescriptor_t diff_y_desc, void* back_out) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSparseSoftmaxCrossEntropyWithLogits(
      handle, mode, x_desc, input, label_desc, label, y_desc, output, diff_y_desc, back_out));
}

/* static */ void MLUCnnl::Cumsum(const ExecutionContext& ctx, const int axis,
                                  const bool exclusive, const bool reverse,
                                  const cnnlTensorDescriptor_t input_desc, const void* input,
                                  const cnnlTensorDescriptor_t ouput_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  // NAN propagation mode: Only support CNNL_NOT_PROPAGATE_NAN now.
  cnnlNanPropagation_t mode = CNNL_NOT_PROPAGATE_NAN;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCumsum(handle, input_desc, input, axis, exclusive, reverse,
                                        mode, ouput_desc, output));
}

/* static */ void MLUCnnl::BroadcastTo(const ExecutionContext& ctx,
                                       const cnnlTensorDescriptor_t input_desc,
                                       const void* input,
                                       const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlExpand(handle, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::AssignAdd(const ExecutionContext& ctx, const void* alpha,
                                     const void* beta,
                                     const cnnlTensorDescriptor_t update_desc,
                                     const void* update,
                                     const cnnlTensorDescriptor_t param_desc, void* param) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignAdd(handle, alpha, update_desc, update, nullptr, 0,
                                           beta, param_desc, param));
}

/* static */ void MLUCnnl::AssignSub(const ExecutionContext& ctx, const void* alpha,
                                     const void* beta,
                                     const cnnlTensorDescriptor_t update_desc,
                                     const void* update,
                                     const cnnlTensorDescriptor_t param_desc, void* param) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignSub(handle, alpha, update_desc, update, nullptr, 0,
                                           beta, param_desc, param));
}

/* static */ void MLUCnnl::Assign(const ExecutionContext& ctx,
                                  const cnnlTensorDescriptor_t update_desc, const void* update,
                                  const cnnlTensorDescriptor_t param_desc, void* param) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCopy(handle, update_desc, update, param_desc, param));
}
/* static */ void MLUCnnl::SGD(const ExecutionContext& ctx,
                               const cnnlTensorDescriptor_t grad_desc, const void* grad,
                               const void* lr, const cnnlTensorDescriptor_t var_desc,
                               void* var) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGradientDescent(handle, grad_desc, grad, lr, var_desc, var));
}

/* static */ void MLUCnnl::ApplyAdaGrad(const ExecutionContext& ctx,
                                        const cnnlTensorDescriptor_t grad_desc,
                                        const void* grad,
                                        const cnnlTensorDescriptor_t accum_desc, void* accum,
                                        const cnnlTensorDescriptor_t var_desc, void* var,
                                        const void* lr, const bool update_slots) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdaGrad(handle, grad_desc, grad, accum_desc, accum,
                                              var_desc, var, lr, update_slots));
}

/* static */ void MLUCnnl::ApplyRMSProp(const ExecutionContext& ctx,
                                        const cnnlTensorDescriptor_t grad_desc,
                                        const void* grad, const void* lr, const void* rho,
                                        const void* momentum, const void* epsilon,
                                        const cnnlTensorDescriptor_t var_desc, void* var,
                                        const cnnlTensorDescriptor_t ms_desc, void* ms,
                                        const cnnlTensorDescriptor_t mom_desc, void* mom) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlRMSProp(handle, lr, rho, epsilon, momentum, grad_desc, grad,
                                         var_desc, var, ms_desc, ms, mom_desc, mom));
}

/* static */ void MLUCnnl::ApplyCenterRMSProp(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, const void* grad,
    const void* lr, const void* rho, const void* momentum, const void* epsilon,
    const cnnlTensorDescriptor_t var_desc, void* var,
    const cnnlTensorDescriptor_t mg_desc, void* mg,
    const cnnlTensorDescriptor_t ms_desc, void* ms,
    const cnnlTensorDescriptor_t mom_desc, void* mom) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyCenterRMSProp(handle, var_desc, var, mg_desc, mg,
                                                    ms_desc, ms, mom_desc, mom, grad_desc,
                                                    grad, lr, rho, momentum, epsilon));
}

/* static */ void MLUCnnl::ApplyAdam(const ExecutionContext& ctx,
                                     const cnnlTensorDescriptor_t grad_desc, const void* grad,
                                     const void* lr, const void* beta1, const void* beta2,
                                     const void* beta1_power, const void* beta2_power,
                                     const void* epsilon, const bool use_nesterov,
                                     const cnnlTensorDescriptor_t var_desc, void* var,
                                     const cnnlTensorDescriptor_t m_desc, void* m,
                                     const cnnlTensorDescriptor_t v_desc, void* v) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdam(handle, grad_desc, var, grad_desc, m, grad_desc, v,
                                           grad_desc, grad, lr, beta1, beta2, beta1_power,
                                           beta2_power, epsilon, use_nesterov));
}

/* static */ void MLUCnnl::ApplyAdaMax(const ExecutionContext& ctx,
                                       const cnnlTensorDescriptor_t grad_desc,
                                       const cnnlTensorDescriptor_t var_desc, void* var,
                                       const cnnlTensorDescriptor_t m_desc, void* m,
                                       const cnnlTensorDescriptor_t v_desc, void* v,
                                       const void* diff, const void* lr, const void* beta1,
                                       const void* beta2, const void* beta1_power,
                                       const void* epsilon) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdaMax(handle, var_desc, var, m_desc, m, v_desc, v,
                                             grad_desc, diff, lr, beta1, beta2, beta1_power,
                                             epsilon));
}

/* static */ void MLUCnnl::ApplyMomentum(const ExecutionContext& ctx,
                                         const cnnlTensorDescriptor_t grad_desc,
                                         const void* grad, const bool use_nesterov,
                                         const void* lr, const void* momentum, void* var,
                                         void* accum) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMomentum(handle, grad_desc, var, grad_desc, accum, grad_desc,
                                          grad, lr, momentum, use_nesterov));
}

/* static */ void MLUCnnl::ApplyKerasMomentum(const ExecutionContext& ctx,
                                              const cnnlTensorDescriptor_t grad_desc,
                                              const void* grad, const bool use_nesterov,
                                              const void* lr, const void* momentum, void* var,
                                              void* accum) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlKerasMomentum(handle, grad_desc, var, grad_desc, accum,
                                               grad_desc, grad, lr, momentum, use_nesterov));
}

/* static */ void MLUCnnl::ApplyAdadelta(const ExecutionContext& ctx,
                                         const cnnlTensorDescriptor_t grad_desc,
                                         const void* diff, const void* lr, const void* rho,
                                         const void* epsilon, void* var, void* accum,
                                         void* accum_update) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdadelta(handle, grad_desc, var, grad_desc, accum,
                                               grad_desc, accum_update, grad_desc, diff, lr,
                                               rho, epsilon));
}
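The optimizer wrappers above (SGD, AdaGrad, RMSProp, Adam, momentum variants, Adadelta) each hand the whole parameter update to a single CNNL kernel. As a reference for the simplest of them, here is a host-side sketch of the plain SGD rule that MLUCnnl::SGD is expected to perform on the device, var = var - lr * grad; this is only the textbook formula written on the CPU, not the CNNL implementation itself.

#include <iostream>
#include <vector>

// Textbook SGD step: var <- var - lr * grad. The MLU wrapper delegates the same
// update to cnnlGradientDescent on the device; this loop is just a CPU reference.
void SgdStep(std::vector<float>* var, const std::vector<float>& grad, float lr) {
  for (size_t i = 0; i < var->size(); ++i) (*var)[i] -= lr * grad[i];
}

int main() {
  std::vector<float> var{1.0f, 2.0f};
  std::vector<float> grad{0.5f, -1.0f};
  SgdStep(&var, grad, /*lr=*/0.1f);
  std::cout << var[0] << " " << var[1] << "\n";  // 0.95 2.1
}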
/* static */ void MLUCnnl::Scale(const ExecutionContext& ctx, const int axis,
                                 const cnnlTensorDescriptor_t input_desc, const void* input,
                                 const cnnlTensorDescriptor_t alpha_desc, const void* alpha,
                                 const cnnlTensorDescriptor_t beta_desc, const void* beta,
                                 const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlScale(handle, axis, input_desc, input, alpha_desc, alpha,
                                       beta_desc, beta, output_desc, output));
}

/* static */ void MLUCnnl::AddN(const ExecutionContext& ctx, uint32_t input_num,
                                const cnnlTensorDescriptor_t inputs_desc[],
                                const void* inputs[],
                                const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlAddN(handle, inputs_desc, inputs, input_num, output_desc,
                                      output));
}

/* static */ void MLUCnnl::Log(const ExecutionContext& ctx, cnnlComputationPreference_t prefer,
                               const cnnlTensorDescriptor_t input_desc, const void* input,
                               const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  cnnlLogBase_t log_base = CNNL_LOG_E;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlLog_v2(handle, prefer, log_base, input_desc, input,
                                        output_desc, output));
}

/* static */ void MLUCnnl::Matmul(const ExecutionContext& ctx, const bool transpose_a,
                                  const bool transpose_b,
                                  const cnnlTensorDescriptor_t in0_desc, const void* in0,
                                  const cnnlTensorDescriptor_t in1_desc, const void* in1,
                                  const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  float alpha = 1.0f;
  float beta = 0.0f;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMul(handle, transpose_a, transpose_b,
                                        reinterpret_cast<void*>(&alpha), in0_desc, in0,
                                        in1_desc, in1, reinterpret_cast<void*>(&beta),
                                        output_desc, output));
}

/* static */ void MLUCnnl::BatchMatmul(const ExecutionContext& ctx, const bool transpose_a,
                                       const bool transpose_b,
                                       const cnnlTensorDescriptor_t in0_desc, const void* in0,
                                       const cnnlTensorDescriptor_t in1_desc, const void* in1,
                                       const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBatchMatMulBCastWorkspaceSize(
      handle, in0_desc, in1_desc, output_desc, &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCast(handle, transpose_a, transpose_b, in0_desc,
                                                  in0, in1_desc, in1, workspace_ptr,
                                                  workspace_size, output_desc, output));
}

/* static */ void MLUCnnl::OpTensor(const ExecutionContext& ctx,
                                    const cnnlOpTensorDescriptor_t op_tensor_desc,
                                    const cnnlTensorDescriptor_t a_desc, const void* a,
                                    const cnnlTensorDescriptor_t b_desc, const void* b,
                                    const cnnlTensorDescriptor_t output_desc, void* output,
                                    const cnnlDataType_t dtype) {
  static const int alpha1_int = 1, alpha2_int = 1, beta_int = 0;
  static const float alpha1_float = 1.f, alpha2_float = 1.f, beta_float = 0.f;

  const void* alpha1_ptr = static_cast<const void*>(&alpha1_float);
  const void* alpha2_ptr = static_cast<const void*>(&alpha2_float);
  const void* beta_ptr = static_cast<const void*>(&beta_float);

  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;

  bool is_dt_float = (dtype == CNNL_DTYPE_FLOAT || dtype == CNNL_DTYPE_HALF);
  // if datatype is not float, we set alpha and beta to be int
  if (!is_dt_float) {
    alpha1_ptr = static_cast<const void*>(&alpha1_int);
    alpha2_ptr = static_cast<const void*>(&alpha2_int);
    beta_ptr = static_cast<const void*>(&beta_int);
  }

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetOpTensorWorkspaceSize_v2(
      handle, op_tensor_desc, alpha1_ptr, a_desc, a, alpha2_ptr, b_desc, b, beta_ptr,
      output_desc, output, &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlOpTensor(handle, op_tensor_desc, alpha1_ptr, a_desc, a,
                                          alpha2_ptr, b_desc, b, workspace_ptr, workspace_size,
                                          beta_ptr, output_desc, output));
}

/* static */ void MLUCnnl::BiasAddGrad(const ExecutionContext& ctx, const int axis,
                                       const cnnlTensorDescriptor_t out_backprop_desc,
                                       const void* out_backprop,
                                       const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBiasAddBackward(handle, out_backprop_desc, out_backprop, axis,
                                                 output_desc, output));
}
/* static */ void MLUCnnl::RandomUniform(const ExecutionContext& ctx, const int num,
                                         const cnnlDataType_t data_type,
                                         const cnnlRandGenerator_t mlu_generator,
                                         void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandGenerateUniform(handle, mlu_generator, data_type, nullptr,
                                                     num, 0, 1, output));
}

/* static */ void MLUCnnl::TopK(const ExecutionContext& ctx, const int k, const int dim,
                                const bool largest, const bool sorted,
                                const cnnlTensorDescriptor_t input_desc, const void* input,
                                const cnnlTensorDescriptor_t values_output_desc,
                                void* values_out,
                                const cnnlTensorDescriptor_t indices_output_desc,
                                void* indices_out) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor(handle, input_desc, input, k, dim, largest, sorted,
                                            values_output_desc, values_out,
                                            indices_output_desc, indices_out));
}

/* static */ void MLUCnnl::StridedSlice(const ExecutionContext& ctx, const int begin[],
                                        const int end[], const int strides[],
                                        const cnnlTensorDescriptor_t input_desc,
                                        const void* input,
                                        const cnnlTensorDescriptor_t output_desc,
                                        void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlStridedSlice(handle, input_desc, input, begin, end, strides,
                                              output_desc, output));
}

/* static */ void MLUCnnl::Split(const ExecutionContext& ctx, int split_num, int axis,
                                 const cnnlTensorDescriptor_t input_desc, const void* input_ptr,
                                 const cnnlTensorDescriptor_t output_descs[],
                                 void* output_ptrs[]) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc, input_ptr,
                                       workspace_ptr, workspace_size, output_descs,
                                       output_ptrs));
}

/* static */ void MLUCnnl::GatherFunctor(const ExecutionContext& ctx, const int axis,
                                         const int batch_dims,
                                         const cnnlTensorDescriptor_t params_desc,
                                         const void* params,
                                         const cnnlTensorDescriptor_t indices_desc,
                                         const void* indices,
                                         const cnnlTensorDescriptor_t output_desc,
                                         void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchGatherV2(handle, axis, batch_dims, params_desc, params,
                                               indices_desc, indices, output_desc, output));
}

/* static */ void MLUCnnl::ScatterFunctor(const ExecutionContext& ctx,
                                          const cnnlTensorDescriptor_t params_desc,
                                          const void* params,
                                          const cnnlTensorDescriptor_t updates_desc,
                                          const void* updates,
                                          const cnnlTensorDescriptor_t indices_desc,
                                          const void* indices,
                                          const cnnlScatterRefMode_t mode) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlScatterRef(handle, params_desc, params, indices_desc, indices,
                                            updates_desc, updates, 0, mode));
}

/* static */ void MLUCnnl::StridedSliceGrad(const ExecutionContext& ctx, const int begin[],
                                            const int end[], const int strides[],
                                            const cnnlTensorDescriptor_t input_desc,
                                            const void* input,
                                            const cnnlTensorDescriptor_t output_desc,
                                            void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlStridedSliceBackward(handle, begin, end, strides, input_desc,
                                                      input, output_desc, output));
}

/* static */ void MLUCnnl::Logic(const ExecutionContext& ctx, const MLULogicMethod log_method,
                                 const cnnlTensorDescriptor_t input1_desc, const void* input1,
                                 const cnnlTensorDescriptor_t input2_desc, const void* input2,
                                 const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetLogicOpWorkspaceSize(handle, input1_desc, input2_desc,
                                                         output_desc, &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlLogicOp(handle, cnnlLogicOp_t(log_method), input1_desc,
                                         input1, input2_desc, input2, workspace_ptr,
                                         workspace_size, output_desc, output));
}

/* static */ void MLUCnnl::Select(const ExecutionContext& ctx,
                                  const cnnlTensorDescriptor_t then_desc, const void* p_then,
                                  const cnnlTensorDescriptor_t else_desc, const void* p_else,
                                  const cnnlTensorDescriptor_t output_desc, void* output,
                                  const bool* condition, const int condition_size) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSelect(handle, then_desc, p_then, else_desc, p_else,
                                        output_desc, output, condition, condition_size));
}

/*static */ void MLUCnnl::GatherNd(const ExecutionContext& ctx,
                                   const cnnlTensorDescriptor_t params_desc, const void* params,
                                   const cnnlTensorDescriptor_t indices_desc,
                                   const void* indices,
                                   const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGatherNd(handle, params_desc, params, indices_desc, indices,
                                          output_desc, output));
}

/* static */ void MLUCnnl::BatchToSpace(const ExecutionContext& ctx,
                                        const cnnlTensorDescriptor_t input_desc,
                                        const void* input,
                                        const cnnlTensorDescriptor_t output_desc, void* output,
                                        const cnnlSpaceBatchParam_t param) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBatch2spaceWorkspaceSize(handle, input_desc, output_desc,
                                                             &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatch2space(handle, input_desc, input, output_desc, output,
                                             param, workspace_ptr, workspace_size));
}

/* static */ void MLUCnnl::BatchToSpaceNd(const ExecutionContext& ctx,
                                          const cnnlTensorDescriptor_t input_desc,
                                          const void* input,
                                          cnnlSpaceBatchNdDescriptor_t param,
                                          void* extra_device_input, size_t extra_input_size,
                                          const cnnlTensorDescriptor_t output_desc,
                                          void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatch2spaceNd_v2(handle, input_desc, input, output_desc,
                                                  output, param, extra_device_input,
                                                  extra_input_size));
}

/* static */ void MLUCnnl::SoftmaxForward(const ExecutionContext& ctx,
                                          cnnlSoftmaxAlgorithm_t algorithm,
                                          cnnlSoftmaxMode_t mode, const void* alpha,
                                          const cnnlTensorDescriptor_t input_desc,
                                          const void* input, const void* beta,
                                          const cnnlTensorDescriptor_t output_desc,
                                          void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSoftmaxForward(handle, algorithm, mode, alpha, input_desc,
                                                input, beta, output_desc, output));
}

/* static */ void MLUCnnl::Softplus(const ExecutionContext& ctx,
                                    const cnnlTensorDescriptor_t features_desc,
                                    const void* features,
                                    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  const int beta = 1;
  const int threshold = 20;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSoftplusForward(handle, features_desc, features, output_desc,
                                                 output, beta, threshold));
}
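Softplus above hard-codes beta = 1 and threshold = 20 when calling cnnlSoftplusForward. Assuming CNNL follows the common softplus convention used elsewhere (an assumption, not something stated in the diff), those parameters correspond to softplus(x) = (1/beta) * log(1 + exp(beta*x)), with the function switching to the identity once beta*x exceeds the threshold to avoid overflow. A small CPU reference under that assumption:

#include <cmath>
#include <iostream>

// Reference softplus with the beta/threshold convention assumed above:
// softplus(x) = (1/beta) * log(1 + exp(beta*x)), linearized when beta*x > threshold.
double Softplus(double x, double beta = 1.0, double threshold = 20.0) {
  double bx = beta * x;
  return bx > threshold ? x : std::log1p(std::exp(bx)) / beta;
}

int main() {
  std::cout << Softplus(0.0) << " " << Softplus(25.0) << "\n";  // 0.693147 25
}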
/* static */ void MLUCnnl::SoftplusGrad(const ExecutionContext& ctx,
                                        const cnnlTensorDescriptor_t gradients_desc,
                                        const void* gradients,
                                        const cnnlTensorDescriptor_t features_desc,
                                        const void* features,
                                        const cnnlTensorDescriptor_t output_desc,
                                        void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  int beta = 1;
  int threshold = 20;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSoftplusBackward(handle, features_desc, features,
                                                  gradients_desc, gradients, output_desc,
                                                  output, beta, threshold));
}

/* static */ void MLUCnnl::PoolingForward(
    const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode,
    const std::vector<int64_t>& output_shape, const cnnlPoolingDescriptor_t pooling_desc,
    const void* alpha, const cnnlTensorDescriptor_t input_desc, const void* input,
    const void* beta, const void* extra_input_ptr,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPoolingWorkspaceSize(handle, pool_mode, output_shape[2],
                                                         output_shape[1], &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlPoolingForward_v2(handle, pooling_desc, alpha, input_desc,
                                                   input, beta, extra_input_ptr, output_desc,
                                                   output, workspace_ptr, workspace_size));
}

/* static */ void MLUCnnl::Pool3D(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode,
                                  const std::vector<int64_t>& output_shape,
                                  const cnnlPoolingDescriptor_t pooling_desc, const void* alpha,
                                  const cnnlTensorDescriptor_t input_desc, const void* input,
                                  const void* beta, const cnnlTensorDescriptor_t output_desc,
                                  void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPoolingWorkspaceSize(handle, pool_mode, output_shape[2],
                                                         output_shape[1], &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlPoolingForward(handle, pooling_desc, alpha, input_desc, input,
                                                beta, output_desc, output, workspace_ptr,
                                                workspace_size));
}

/* static */ void MLUCnnl::RsqrtGrad(const ExecutionContext& ctx,
                                     const cnnlTensorDescriptor_t data_desc, const void* y,
                                     const void* diff_y, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlRsqrtBackward(handle, data_desc, y, diff_y, output));
}

/* static */ void MLUCnnl::SqrtGrad(const ExecutionContext& ctx,
                                    const cnnlTensorDescriptor_t data_desc, const void* y,
                                    const void* diff_y, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSqrtBackward(handle, data_desc, y, diff_y, output));
}

/* static */ void MLUCnnl::UnsortedSegmentSum(const ExecutionContext& ctx,
                                              const cnnlTensorDescriptor_t data_desc,
                                              const void* data,
                                              const cnnlTensorDescriptor_t ids_desc,
                                              const int* segment_ids,
                                              const cnnlTensorDescriptor_t output_desc,
                                              void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetUnsortedSegmentSumWorkspaceSize(handle, data_desc,
                                                                    output_desc,
                                                                    &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlUnsortedSegmentSum(handle, data_desc, data, ids_desc,
                                                    segment_ids, workspace_ptr, workspace_size,
                                                    output_desc, output));
}

/* static */ void MLUCnnl::Pad(const ExecutionContext& ctx,
                               const cnnlTensorDescriptor_t input_desc, const void* input,
                               const void* paddings, const void* padding_value,
                               const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlPad(handle, input_desc, input, paddings, padding_value,
                                     output_desc, output));
}

/* static */ void MLUCnnl::OneHot(const ExecutionContext& ctx,
                                  const cnnlTensorDescriptor_t desc_indices,
                                  const void* indices, const int depth, const void* on_value,
                                  const void* off_value, const int axis,
                                  cnnlDataType_t output_data_type, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlOneHot(handle, desc_indices, indices, depth, on_value,
                                        off_value, axis, output_data_type, output));
}

/* static */ void MLUCnnl::ConvolutionForward(
    const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc, const void* alpha,
    const void* beta, const cnnlTensorDescriptor_t bias_desc, const void* bias_ptr,
    const cnnlTensorDescriptor_t input_desc, const void* input,
    const cnnlTensorDescriptor_t filtet_desc, const void* filter,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  // cnnl: select best algorithm for convolution compution.
  cnnlConvolutionForwardAlgo_t algo;
  cnnlConvolutionFwdPreference_t preference = CNNL_CONVOLUTION_FWD_FASTEST;
  cnnlGetConvolutionForwardAlgorithm(handle, conv_desc, input_desc, filtet_desc, output_desc,
                                     preference, &algo);

  // get workspace size
  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardWorkspaceSize(
      handle, input_desc, filtet_desc, output_desc, bias_desc, conv_desc, algo,
      &workspace_size));

  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlConvolutionForward(handle, conv_desc, algo, alpha, input_desc,
                                                    input, filtet_desc, filter, bias_desc,
                                                    bias_ptr, workspace_ptr, workspace_size,
                                                    beta, output_desc, output));
}
/* static */
void
MLUCnnl
::
Tile
(
const
ExecutionContext
&
ctx
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
)
{
cnnlHandle_t
handle
=
GetHandleFromCTX
(
ctx
);
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlTile
(
handle
,
input_desc
,
input
,
output_desc
,
output
));
}
/* static */
void
MLUCnnl
::
SoftmaxCrossEntropyWithLogits
(
const
ExecutionContext
&
ctx
,
cnnlSoftmaxMode_t
mode
,
cnnlComputationPreference_t
prefer
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
logits_in
,
const
cnnlTensorDescriptor_t
label_desc
,
const
void
*
labels_in
,
const
cnnlTensorDescriptor_t
loss_out_desc
,
void
*
loss_out
,
const
cnnlTensorDescriptor_t
back_out_desc
,
void
*
back_out
)
{
cnnlHandle_t
handle
=
GetHandleFromCTX
(
ctx
);
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlSoftmaxCrossEntropyWithLogits_v2
(
handle
,
mode
,
prefer
,
input_desc
,
logits_in
,
label_desc
,
labels_in
,
loss_out_desc
,
loss_out
,
back_out_desc
,
back_out
));
}
/* static */
void
MLUCnnl
::
Reduce
(
const
ExecutionContext
&
ctx
,
const
bool
need_workspace
,
const
cnnlReduceDescriptor_t
reduction_desc
,
const
void
*
alpha
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input
,
const
size_t
indices_size
,
void
*
indices
,
const
void
*
beta
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
)
{
cnnlHandle_t
handle
=
GetHandleFromCTX
(
ctx
);
size_t
workspace_size
=
0
;
void
*
workspace_ptr
=
nullptr
;
Tensor
workspace
;
if
(
need_workspace
)
{
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlGetReduceOpWorkspaceSize
(
handle
,
input_desc
,
output_desc
,
reduction_desc
,
&
workspace_size
));
auto
&
dev_ctx
=
GetDevCtxFromCTX
(
ctx
);
workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
MLUDeviceContext
>
(
{
static_cast
<
int64_t
>
(
workspace_size
)},
dev_ctx
);
workspace_ptr
=
workspace
.
mutable_data
(
ctx
.
GetPlace
());
}
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlReduce
(
handle
,
reduction_desc
,
workspace_ptr
,
workspace_size
,
alpha
,
input_desc
,
input
,
indices_size
,
indices
,
beta
,
output_desc
,
output
));
}
/* static */
void
MLUCnnl
::
FloorDiv
(
const
ExecutionContext
&
ctx
,
cnnlComputationPreference_t
prefer
,
const
cnnlTensorDescriptor_t
input1_desc
,
const
void
*
input1
,
const
cnnlTensorDescriptor_t
input2_desc
,
const
void
*
input2
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
)
{
cnnlHandle_t
handle
=
GetHandleFromCTX
(
ctx
);
size_t
workspace_size
;
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlGetFloorDivWorkspaceSize
(
handle
,
input1_desc
,
input2_desc
,
output_desc
,
&
workspace_size
));
auto
&
dev_ctx
=
GetDevCtxFromCTX
(
ctx
);
Tensor
workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
MLUDeviceContext
>
(
{
static_cast
<
int64_t
>
(
workspace_size
)},
dev_ctx
);
void
*
workspace_ptr
=
workspace
.
mutable_data
(
ctx
.
GetPlace
());
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlFloorDiv_v2
(
handle
,
prefer
,
input1_desc
,
input1
,
input2_desc
,
input2
,
output_desc
,
output
,
workspace_ptr
,
workspace_size
));
}
/* static */
void
MLUCnnl
::
FloorMod
(
const
ExecutionContext
&
ctx
,
const
cnnlTensorDescriptor_t
input1_desc
,
const
void
*
input1
,
const
cnnlTensorDescriptor_t
input2_desc
,
const
void
*
input2
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
)
{
cnnlHandle_t
handle
=
GetHandleFromCTX
(
ctx
);
size_t
workspace_size
;
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlGetFloorModWorkspaceSize
(
handle
,
input1_desc
,
input2_desc
,
output_desc
,
&
workspace_size
));
auto
&
dev_ctx
=
GetDevCtxFromCTX
(
ctx
);
Tensor
workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
MLUDeviceContext
>
(
{
static_cast
<
int64_t
>
(
workspace_size
)},
dev_ctx
);
void
*
workspace_ptr
=
workspace
.
mutable_data
(
ctx
.
GetPlace
());
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlFloorMod
(
handle
,
input1_desc
,
input1
,
input2_desc
,
input2
,
output_desc
,
output
,
workspace_ptr
,
workspace_size
));
}
/* static */
void
MLUCnnl
::
Maximum
(
const
ExecutionContext
&
ctx
,
const
cnnlTensorDescriptor_t
input1_desc
,
const
void
*
input1
,
const
cnnlTensorDescriptor_t
input2_desc
,
const
void
*
input2
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
)
{
cnnlHandle_t
handle
=
GetHandleFromCTX
(
ctx
);
size_t
workspace_size
;
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlGetMaximumWorkspaceSize
(
handle
,
output_desc
,
&
workspace_size
));
auto
&
dev_ctx
=
GetDevCtxFromCTX
(
ctx
);
Tensor
workspace
=
ctx
.
AllocateTmpTensor
<
int8_t
,
MLUDeviceContext
>
(
{
static_cast
<
int64_t
>
(
workspace_size
)},
dev_ctx
);
void
*
workspace_ptr
=
workspace
.
mutable_data
(
ctx
.
GetPlace
());
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlMaximum
(
handle
,
input1_desc
,
input1
,
input2_desc
,
input2
,
output_desc
,
output
,
workspace_ptr
,
workspace_size
));
}
/* static */ void MLUCnnl::Minimum(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc,
    const void* input1, const cnnlTensorDescriptor_t input2_desc,
    const void* input2, const cnnlTensorDescriptor_t output_desc,
    void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlGetMinimumWorkspaceSize(handle, output_desc, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMinimum(handle, input1_desc, input1,
                                         input2_desc, input2, output_desc,
                                         output, workspace_ptr,
                                         workspace_size));
}

/* static */ void MLUCnnl::PowR(
    const ExecutionContext& ctx, cnnlComputationPreference_t prefer,
    const cnnlTensorDescriptor_t input1_desc, const void* input1,
    const cnnlTensorDescriptor_t input2_desc, const void* input2,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPowRWorkspaceSize(
      handle, input1_desc, input2_desc, output_desc, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlPowR_v2(handle, prefer, input1_desc, input1,
                                         input2_desc, input2, workspace_ptr,
                                         workspace_size, output_desc, output));
}

/* static */ void MLUCnnl::DivNoNan(
    const ExecutionContext& ctx, cnnlComputationPreference_t prefer,
    const cnnlTensorDescriptor_t input1_desc, const void* input1,
    const cnnlTensorDescriptor_t input2_desc, const void* input2,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetDivNoNanWorkspaceSize(
      handle, input1_desc, input2_desc, output_desc, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlDivNoNan_v2(
      handle, prefer, input1_desc, input1, input2_desc, input2, workspace_ptr,
      workspace_size, output_desc, output));
}

/* static */ void MLUCnnl::SquaredDifference(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc,
    const void* input1, const cnnlTensorDescriptor_t input2_desc,
    const void* input2, const cnnlTensorDescriptor_t output_desc,
    void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSquaredDifferenceWorkspaceSize(
      handle, input1_desc, input2_desc, output_desc, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSquaredDifference(
      handle, input1_desc, input1, input2_desc, input2, output_desc, output,
      workspace_ptr, workspace_size));
}
/* static */ void MLUCnnl::L2Loss(const ExecutionContext& ctx,
                                  const cnnlTensorDescriptor_t input_desc,
                                  const void* input, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlL2Loss(handle, input_desc, input, output));
}

/* static */ void MLUCnnl::Abs(const ExecutionContext& ctx,
                               const cnnlTensorDescriptor_t input_desc,
                               const void* input,
                               const cnnlTensorDescriptor_t output_desc,
                               void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlAbs(handle, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Neg(const ExecutionContext& ctx,
                               const cnnlTensorDescriptor_t input_desc,
                               const void* input,
                               const cnnlTensorDescriptor_t output_desc,
                               void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlNegTensor(handle, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Floor(const ExecutionContext& ctx,
                                 const cnnlTensorDescriptor_t input_desc,
                                 const void* input,
                                 const cnnlTensorDescriptor_t output_desc,
                                 void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlFloor(handle, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Ceil(const ExecutionContext& ctx,
                                const cnnlTensorDescriptor_t input_desc,
                                const void* input,
                                const cnnlTensorDescriptor_t output_desc,
                                void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlCeil(handle, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::IsNan(const ExecutionContext& ctx,
                                 const cnnlTensorDescriptor_t input_desc,
                                 const void* input,
                                 const cnnlTensorDescriptor_t output_desc,
                                 void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlIsNan(handle, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Square(const ExecutionContext& ctx,
                                  const cnnlTensorDescriptor_t input_desc,
                                  const void* input,
                                  const cnnlTensorDescriptor_t output_desc,
                                  void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSquare(handle, input_desc, input, output_desc, output));
}
/* static */ void MLUCnnl::Sqrt(const ExecutionContext& ctx,
                                cnnlComputationPreference_t prefer,
                                const cnnlTensorDescriptor_t input_desc,
                                const void* input,
                                const cnnlTensorDescriptor_t output_desc,
                                void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSqrt_v2(handle, prefer, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Rsqrt(const ExecutionContext& ctx,
                                 cnnlComputationPreference_t prefer,
                                 const cnnlTensorDescriptor_t input_desc,
                                 const void* input,
                                 const cnnlTensorDescriptor_t output_desc,
                                 void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlRsqrt_v2(handle, prefer, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Cos(const ExecutionContext& ctx,
                               cnnlComputationPreference_t prefer,
                               const cnnlTensorDescriptor_t input_desc,
                               const void* input,
                               const cnnlTensorDescriptor_t output_desc,
                               void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlCos_v2(handle, prefer, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Sin(const ExecutionContext& ctx,
                               cnnlComputationPreference_t prefer,
                               const cnnlTensorDescriptor_t input_desc,
                               const void* input,
                               const cnnlTensorDescriptor_t output_desc,
                               void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSin_v2(handle, prefer, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::TrigonForward(
    const ExecutionContext& ctx, const cnnlTrigonDescriptor_t trigon_desc,
    const cnnlTensorDescriptor_t input_desc, const void* input,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlTrigonForward(
      handle, trigon_desc, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Exp(const ExecutionContext& ctx,
                               cnnlComputationPreference_t prefer,
                               const cnnlTensorDescriptor_t input_desc,
                               const void* input,
                               const cnnlTensorDescriptor_t output_desc,
                               void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlExp_v2(handle, prefer, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Sign(const ExecutionContext& ctx,
                                const cnnlTensorDescriptor_t input_desc,
                                const void* input,
                                const cnnlTensorDescriptor_t output_desc,
                                void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSign(handle, input_desc, input, output_desc, output));
}
/* static */ void MLUCnnl::IsFinite(const ExecutionContext& ctx,
                                    const cnnlTensorDescriptor_t input_desc,
                                    const void* input,
                                    const cnnlTensorDescriptor_t output_desc,
                                    void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlIsFinite(handle, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::IsNanInf(const ExecutionContext& ctx,
                                    const cnnlTensorDescriptor_t input_desc,
                                    const void* input, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  // TODO(CTR-3849): output type should be void*, but now bool*.
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlNanInf(handle, input_desc, input,
                                        reinterpret_cast<bool*>(output)));
}

/* static */ void MLUCnnl::Erf(const ExecutionContext& ctx,
                               cnnlComputationPreference_t prefer,
                               const cnnlTensorDescriptor_t input_desc,
                               const void* input,
                               const cnnlTensorDescriptor_t output_desc,
                               void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlErf_v2(handle, prefer, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Log1p(const ExecutionContext& ctx,
                                 cnnlComputationPreference_t prefer,
                                 const cnnlTensorDescriptor_t input_desc,
                                 const void* input,
                                 const cnnlTensorDescriptor_t output_desc,
                                 void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlLog1p(handle, prefer, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::LogicalNot(const ExecutionContext& ctx,
                                      const cnnlTensorDescriptor_t input_desc,
                                      const void* input,
                                      const cnnlTensorDescriptor_t output_desc,
                                      void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlLogicOp(handle, CNNL_LOGIC_OP_NOT, input_desc,
                                         input, input_desc, input, nullptr, 0,
                                         output_desc, output));
}
/* static */ void MLUCnnl::DynamicStitch(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t* indices_desc,
    const int** indices, const cnnlTensorDescriptor_t* data_desc,
    const void** data, const int size, int* indices_dims,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlGetDynamicStitchWorkspaceSize(handle, size, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlDynamicStitch(
      handle, indices_desc, indices, data_desc, data, size, indices_dims,
      workspace_ptr, workspace_size, output_desc, output));
}
/* static */ void MLUCnnl::CropAndResize(
    const ExecutionContext& ctx, const std::string method_name,
    const float extrapolation_value, const cnnlTensorDescriptor_t image_desc,
    const void* image, const cnnlTensorDescriptor_t boxes_desc,
    const void* boxes, const cnnlTensorDescriptor_t box_index_desc,
    const void* box_index, const cnnlTensorDescriptor_t output_desc,
    void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  cnnlCropAndResizeMode_t mode = CNNL_CROP_AND_RESIZE_BILINEAR;
  if (method_name == "nearest") {
    mode = CNNL_CROP_AND_RESIZE_NEAREST;
  }
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCropAndResize(
      handle, image_desc, image, boxes_desc, boxes, box_index_desc, box_index,
      mode, extrapolation_value, output_desc, output));
}

/* static */ void MLUCnnl::CropAndResizeBackwardImage(
    const ExecutionContext& ctx, const std::string method_name,
    const cnnlTensorDescriptor_t grads_desc, const void* grads,
    const cnnlTensorDescriptor_t boxes_desc, const void* boxes,
    const cnnlTensorDescriptor_t box_idx_desc, const void* box_idx,
    const cnnlTensorDescriptor_t grads_image_desc, void* grads_image) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  cnnlCropAndResizeMode_t mode = CNNL_CROP_AND_RESIZE_BILINEAR;
  if (method_name == "nearest") {
    mode = CNNL_CROP_AND_RESIZE_NEAREST;
  }
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCropAndResizeBackwardImage(
      handle, grads_desc, grads, boxes_desc, boxes, box_idx_desc, box_idx,
      mode, grads_image_desc, grads_image));
}

/* static */ void MLUCnnl::CropAndResizeBackwardBoxes(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
    const void* input, const cnnlTensorDescriptor_t image_desc,
    const void* image, const cnnlTensorDescriptor_t boxes_desc,
    const void* boxes, const cnnlTensorDescriptor_t box_idx_desc,
    const void* box_idx, const cnnlTensorDescriptor_t output_desc,
    void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  cnnlCropAndResizeMode_t mode = CNNL_CROP_AND_RESIZE_BILINEAR;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCropAndResizeBackwardBoxes(
      handle, input_desc, input, image_desc, image, boxes_desc, boxes,
      box_idx_desc, box_idx, output_desc, output, mode));
}
/* static */ void MLUCnnl::Interp(const ExecutionContext& ctx,
                                  const cnnlInterpMode_t mode,
                                  const bool align_corners,
                                  const bool half_pixel_centers,
                                  const cnnlTensorDescriptor_t input_desc,
                                  const void* input,
                                  const cnnlTensorDescriptor_t output_desc,
                                  void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlInterp_v2(handle, align_corners, half_pixel_centers, mode, NULL,
                    true, input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::InterpBackward(
    const ExecutionContext& ctx, const cnnlInterpBackwardMode_t mode,
    const bool align_corners, const bool half_pixel_centers,
    const cnnlTensorDescriptor_t input_desc, const void* input,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlInterpBackward(handle, align_corners, half_pixel_centers, mode,
                         input_desc, input, output_desc, output));
}

/* static */ void MLUCnnl::Cast(const ExecutionContext& ctx,
                                cnnlCastDataType_t cast_type,
                                const cnnlTensorDescriptor_t input_desc,
                                const void* input,
                                const cnnlTensorDescriptor_t output_desc,
                                void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCastDataType(handle, input_desc, input,
                                              cast_type, output_desc, output));
}
/* static */ void MLUCnnl::PoolingBackward(
    const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc,
    const void* alpha, const cnnlTensorDescriptor_t y_desc, const void* y,
    const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y,
    const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta,
    const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlPoolingBackward(
      handle, const_cast<cnnlPoolingDescriptor_t>(pooling_desc), alpha, y_desc,
      y, diff_y_desc, diff_y, x_desc, x, beta, diff_x_desc, diff_x));
}

/* static */ void MLUCnnl::NonMaxSuppression(
    const ExecutionContext& ctx, const cnnlNmsDescriptor_t nms_desc,
    const cnnlTensorDescriptor_t boxes_desc, const void* boxes,
    const cnnlTensorDescriptor_t confidence_desc, const void* confidence,
    const cnnlTensorDescriptor_t output_desc, void* output,
    void* output_size) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlGetNmsWorkspaceSize_v2(handle, confidence_desc, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlNms_v2(
      handle, nms_desc, boxes_desc, boxes, confidence_desc, confidence,
      workspace_ptr, workspace_size, output_desc, output, output_size));
}
/* static */ void MLUCnnl::PoolingIndex(
    const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc,
    const cnnlTensorDescriptor_t x_desc, const void* x,
    const cnnlTensorDescriptor_t y_desc, void* y) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlPoolingIndex(
      handle, const_cast<cnnlPoolingDescriptor_t>(pooling_desc), x_desc, x,
      y_desc, y));
}

/* static */ void MLUCnnl::SpaceToBatch(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
    const void* input, const cnnlTensorDescriptor_t output_desc, void* output,
    const int64_t block_shape[]) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSpace2batchWorkspaceSize(
      handle, input_desc, output_desc, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
  cnnlSpaceBatchParam_t param = {static_cast<uint32_t>(block_shape[0]),
                                 static_cast<uint32_t>(block_shape[1])};
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSpace2batch(handle, input_desc, input,
                                             output_desc, output, param,
                                             workspace_ptr, workspace_size));
}

/* static */ void MLUCnnl::SpaceToBatchNd(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
    const void* input, cnnlSpaceBatchNdDescriptor_t param,
    void* extra_device_input, size_t extra_host_input,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSpace2batchNd_v2(
      handle, input_desc, input, output_desc, output, param,
      extra_device_input, extra_host_input));
}
/* static */ void MLUCnnl::FusedBatchNorm(
    const ExecutionContext& ctx, const bool is_training,
    const cnnlTensorDescriptor_t x_desc, const void* x,
    const cnnlTensorDescriptor_t scale_desc, const void* scale,
    const void* offset, const void* running_mean_input,
    const void* running_variance_input, float epsilon, float momentum,
    const cnnlTensorDescriptor_t output_desc, void* output,
    void* running_mean_output, void* running_var_output,
    void* saved_batch_mean_output, void* saved_batch_var_output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  if (is_training) {
    /*
     * In Paddle, running_mean_output = momentum * running_mean_input +
     * (1 - momentum) * batch_mean. In CNNL, however,
     * running_mean_output = (1 - momentum) * running_mean_input +
     * momentum * batch_mean. So we pass (1.0 - momentum) as the momentum
     * parameter.
     */
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchNormForwardTraining(
        handle, NULL, NULL, x_desc, x, scale_desc, scale, offset,
        running_mean_output, running_var_output, epsilon, 1.0 - momentum,
        output_desc, output, saved_batch_mean_output,
        saved_batch_var_output));
  } else {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchNormForwardInference(
        handle, NULL, NULL, x_desc, x, scale_desc, scale, offset,
        running_mean_input, running_variance_input, epsilon, output_desc,
        output));
  }
}
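The momentum remapping in the training branch can be checked with a quick numeric example (values chosen only for illustration): with momentum = 0.9, a running mean of 1.0 and a batch mean of 0.5, Paddle expects 0.9 * 1.0 + 0.1 * 0.5 = 0.95; passing 1.0 - momentum = 0.1 to CNNL yields (1 - 0.1) * 1.0 + 0.1 * 0.5 = 0.95, the same value.

// Sanity check of the momentum remapping (illustrative values only).
const float momentum = 0.9f, running_mean = 1.0f, batch_mean = 0.5f;
const float paddle_out =
    momentum * running_mean + (1.0f - momentum) * batch_mean;          // 0.95
const float cnnl_momentum = 1.0f - momentum;  // what the wrapper passes
const float cnnl_out =
    (1.0f - cnnl_momentum) * running_mean + cnnl_momentum * batch_mean;  // 0.95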
/* static */ void MLUCnnl::FusedBatchNormGrad(
    const ExecutionContext& ctx, const bool is_training,
    const cnnlTensorDescriptor_t y_backprop_desc, const void* y_backprop,
    const cnnlTensorDescriptor_t x_desc, const void* x,
    const cnnlTensorDescriptor_t scale_desc, const void* scale,
    const void* saved_mean, const void* saved_var, float epsilon,
    const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop,
    void* scale_backprop, void* offset_backprop) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  if (is_training) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchNormBackward(
        handle, NULL, NULL, NULL, NULL, x_desc, x, y_backprop_desc, y_backprop,
        scale_desc, scale, saved_mean, saved_var, epsilon, x_backprop_desc,
        x_backprop, scale_backprop, offset_backprop));
  } else {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlFrozenBatchNormBackward(
        handle, x_desc, x, y_backprop_desc, y_backprop, scale_desc, scale,
        saved_mean, saved_var, epsilon, x_backprop_desc, x_backprop,
        scale_backprop, offset_backprop));
  }
}
/* static */ void MLUCnnl::QuantizeParam(
    const ExecutionContext& ctx, const cnnlQuantizeMode_t mode,
    const int bitwidth, const cnnlTensorDescriptor_t input_desc,
    const void* input, void* position, void* scale, void* offset) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlGetQuantizeParamWorkspaceSize(handle, input_desc, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeParam(
      handle, mode, input_desc, input, bitwidth, workspace_ptr, workspace_size,
      position, scale, offset));
}

/* static */ void MLUCnnl::Conv2D(
    const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
    const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip,
    const void* input_position, const void* input_scale,
    const void* input_offset, const void* filter_position,
    const void* filter_scale, const void* filter_offset,
    const cnnlTensorDescriptor_t input_desc, const void* input,
    const cnnlTensorDescriptor_t filter_desc, const void* filter,
    const cnnlTensorDescriptor_t bias_desc, const void* bias,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTensorDescriptorOnchipDataType(input_desc, dt_onchip));
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTensorDescriptorOnchipDataType(filter_desc, dt_onchip));
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTensorDescriptorOnchipDataType(output_desc, tensor_dtype));

  cnnlConvolutionForwardAlgo_t algo;
  const cnnlConvolutionFwdPreference_t preference =
      CNNL_CONVOLUTION_FWD_FASTEST;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardAlgorithm(
      handle, conv_desc, input_desc, filter_desc, output_desc, preference,
      &algo));

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardWorkspaceSize(
      handle, input_desc, filter_desc, output_desc, bias_desc, conv_desc, algo,
      &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeConvolutionForward(
      handle, conv_desc, algo, nullptr /*alpha*/, input_desc, input,
      input_position, input_scale, input_offset, filter_desc, filter,
      filter_position, filter_scale, filter_offset, bias_desc, bias,
      workspace_ptr, workspace_size, nullptr /*beta*/, output_desc, output));
}
/* static */ void MLUCnnl::FusedConvBNQuantify(
    const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc,
    const void* epsilon_ptr, const int fused_ops_number,
    const cnnlDataType_t tensor_dtype, const int input_position,
    const float input_scale, const int filter_position,
    const float filter_scale, const cnnlTensorDescriptor_t scale_desc,
    const void* scale_ptr, const cnnlTensorDescriptor_t offset_desc,
    const void* offset_ptr, const cnnlTensorDescriptor_t mean_desc,
    const void* mean_ptr, const cnnlTensorDescriptor_t variance_desc,
    const void* variance_ptr, const cnnlTensorDescriptor_t input_desc,
    const void* input, const cnnlTensorDescriptor_t filter_desc,
    const void* filter, const cnnlTensorDescriptor_t output_desc,
    void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTensorDescriptorOnchipDataType(input_desc, CNNL_DTYPE_INT16));
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTensorDescriptorOnchipDataType(filter_desc, CNNL_DTYPE_INT16));
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTensorDescriptorOnchipDataType(output_desc, tensor_dtype));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetTensorDescriptorPositionAndScale(
      input_desc, input_position, input_scale));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetTensorDescriptorPositionAndScale(
      filter_desc, filter_position, filter_scale));

  cnnlFusedOpsPlan_t fusion_plan = nullptr;
  cnnlActivationDescriptor_t active_desc = nullptr;
  cnnlFusedOpsConstParamPack_t cparam_pack = nullptr;
  cnnlFusedOpsVariantParamPack_t vparam_pack = nullptr;
  cnnlConvolutionForwardAlgo_t algo;
  cnnlFusedOps_t fusion_type = CNNL_CONV_SCALE_BN_ACTIVATION;
  cnnlConvolutionCastMode_t cast_mode = CNNL_OFFLINE_SYMMETRIC_QUANTIZE;
  cnnlConvolutionFwdPreference_t preference = CNNL_CONVOLUTION_FWD_FASTEST;

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardAlgorithm(
      handle, conv_desc, input_desc, filter_desc, output_desc, preference,
      &algo));
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlCreateFusedOpsPlan(&fusion_plan, fusion_type));
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlCreateFusedOpsConstParamPack(&cparam_pack, fusion_type));
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlCreateFusedOpsVariantParamPack(&vparam_pack, fusion_type));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
      cparam_pack, CNNL_XDESC, input_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute(
      vparam_pack, CNNL_PTR_X, input));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
      cparam_pack, CNNL_WDESC, filter_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute(
      vparam_pack, CNNL_PTR_W, filter));

  if (fused_ops_number > 1) {
    cnnlCreateActivationDescriptor(&active_desc);
    cnnlNanPropagation_t nan_opt = CNNL_NOT_PROPAGATE_NAN;
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor(
        active_desc, CNNL_ACTIVATION_RELU, nan_opt, 0.0));
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
        cparam_pack, CNNL_ACTIVATION_DESC, active_desc));
  }

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
      cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, scale_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute(
      vparam_pack, CNNL_PTR_BN_WEIGHT, scale_ptr));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
      cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, offset_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute(
      vparam_pack, CNNL_PTR_BN_BIAS, offset_ptr));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
      cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, mean_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute(
      vparam_pack, CNNL_PTR_BN_MEAN, mean_ptr));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
      cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, variance_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute(
      vparam_pack, CNNL_PTR_BN_VAR, variance_ptr));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
      cparam_pack, CNNL_CONV_DESC, conv_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
      cparam_pack, CNNL_SCALAR_CONV_FWD_ALGO, &algo));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
      cparam_pack, CNNL_SCALAR_CONV_FWD_CAST_MODE, &cast_mode));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute(
      vparam_pack, CNNL_SCALAR_BN_EPSILON, epsilon_ptr));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute(
      cparam_pack, CNNL_YDESC, output_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute(
      vparam_pack, CNNL_PTR_Y, output));

  // get workspace size
  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlMakeFusedOpsPlan(handle, fusion_plan, cparam_pack, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  if (workspace_size > 0) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute(
        vparam_pack, CNNL_PTR_WORKSPACE, workspace_ptr));
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute(
        vparam_pack, CNNL_SCALAR_WORKSPACE_SIZE, &workspace_size));
  }
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlFusedOpsExecute(handle, fusion_plan, vparam_pack));

  if (active_desc) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyActivationDescriptor(active_desc));
  }
  if (cparam_pack) {
    PADDLE_ENFORCE_MLU_SUCCESS(
        cnnlDestroyFusedOpsConstParamPack(cparam_pack));
  }
  if (vparam_pack) {
    PADDLE_ENFORCE_MLU_SUCCESS(
        cnnlDestroyFusedOpsVariantParamPack(vparam_pack));
  }
  if (fusion_plan) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyFusedOpsPlan(fusion_plan));
  }
}
/* static */ void MLUCnnl::ConvBackpropInput(
    const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
    const cnnlTensorDescriptor_t filter_desc, const void* filter,
    const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
    const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  cnnlConvolutionBwdDataAlgo_t algo;
  const cnnlConvolutionBwdDataPreference_t preference =
      CNNL_CONVOLUTION_BWD_DATA_FASTEST;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataAlgorithm(
      handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc,
      preference, &algo));

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataWorkspaceSize(
      handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc,
      algo, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlConvolutionBackwardData(
      handle, nullptr /*alpha*/, filter_desc, filter, out_backprop_desc,
      out_backprop, conv_desc, algo, workspace_ptr, workspace_size,
      nullptr /*beta*/, in_backprop_desc, in_backprop));
}

/* static */ void MLUCnnl::QuantizeConvBackpropInput(
    const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
    const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip,
    const void* filter_position, const void* filter_scale,
    const void* filter_offset, const void* out_backprop_position,
    const void* out_backprop_scale, const void* out_backprop_offset,
    const cnnlTensorDescriptor_t filter_desc, const void* filter,
    const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
    const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTensorDescriptorOnchipDataType(filter_desc, dt_onchip));
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTensorDescriptorOnchipDataType(out_backprop_desc, dt_onchip));

  cnnlConvolutionBwdDataAlgo_t algo;
  const cnnlConvolutionBwdDataPreference_t preference =
      CNNL_CONVOLUTION_BWD_DATA_FASTEST;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataAlgorithm(
      handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc,
      preference, &algo));

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataWorkspaceSize(
      handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc,
      algo, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeConvolutionBackwardData(
      handle, nullptr /*alpha*/, filter_desc, filter, filter_position,
      filter_scale, filter_offset, out_backprop_desc, out_backprop,
      out_backprop_position, out_backprop_scale, out_backprop_offset,
      conv_desc, algo, workspace_ptr, workspace_size, nullptr /*beta*/,
      in_backprop_desc, in_backprop));
}
/* static */ void MLUCnnl::ConvBackpropFilter(
    const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
    const cnnlTensorDescriptor_t input_desc, const void* input,
    const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
    const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  cnnlConvolutionBwdFilterAlgo_t algo;
  const cnnlConvolutionBwdFilterPreference_t preference =
      CNNL_CONVOLUTION_BWD_FILTER_FASTEST;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterAlgorithm(
      handle, conv_desc, input_desc, out_backprop_desc, filter_backprop_desc,
      preference, &algo));

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterWorkspaceSize(
      handle, input_desc, out_backprop_desc, filter_backprop_desc, conv_desc,
      algo, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlConvolutionBackwardFilter(
      handle, nullptr /*alpha*/, input_desc, input, out_backprop_desc,
      out_backprop, conv_desc, algo, workspace_ptr, workspace_size,
      nullptr /*beta*/, filter_backprop_desc, filter_backprop));
}

/* static */ void MLUCnnl::QuantizeConvBackpropFilter(
    const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
    const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip,
    const void* input_position, const void* input_scale,
    const void* input_offset, const void* out_backprop_position,
    const void* out_backprop_scale, const void* out_backprop_offset,
    const cnnlTensorDescriptor_t input_desc, const void* input,
    const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
    const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTensorDescriptorOnchipDataType(input_desc, dt_onchip));
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTensorDescriptorOnchipDataType(out_backprop_desc, dt_onchip));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetTensorDescriptorOnchipDataType(
      filter_backprop_desc, tensor_dtype));

  cnnlConvolutionBwdFilterAlgo_t algo;
  const cnnlConvolutionBwdFilterPreference_t preference =
      CNNL_CONVOLUTION_BWD_FILTER_FASTEST;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterAlgorithm(
      handle, conv_desc, input_desc, out_backprop_desc, filter_backprop_desc,
      preference, &algo));

  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterWorkspaceSize(
      handle, input_desc, out_backprop_desc, filter_backprop_desc, conv_desc,
      algo, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeConvolutionBackwardFilter(
      handle, nullptr /*alpha*/, input_desc, input, input_position,
      input_scale, input_offset, out_backprop_desc, out_backprop,
      out_backprop_position, out_backprop_scale, out_backprop_offset,
      conv_desc, algo, workspace_ptr, workspace_size, nullptr /*beta*/,
      filter_backprop_desc, filter_backprop));
}
/* static */ void MLUCnnl::QuantizeMatMul(
    const ExecutionContext& ctx, const bool transpose_a, const bool transpose_b,
    const cnnlTensorDescriptor_t a_desc, const void* a, const void* a_position,
    const void* a_scale, const void* a_offset,
    const cnnlTensorDescriptor_t b_desc, const void* b, const void* b_position,
    const void* b_scale, const void* b_offset, const cnnlDataType_t quant_type,
    const cnnlDataType_t data_type, const cnnlTensorDescriptor_t output_desc,
    void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  // Set onchip data type
  cnnlSetTensorDescriptorOnchipDataType(a_desc, quant_type);
  cnnlSetTensorDescriptorOnchipDataType(b_desc, quant_type);
  cnnlSetTensorDescriptorOnchipDataType(output_desc, data_type);

  // Create and set matmul descriptor
  cnnlMatMulDescriptor_t matmul_desc;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulDescCreate(&matmul_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetMatMulDescAttr(
      matmul_desc, CNNL_MATMUL_DESC_COMPUTE_TYPE, &data_type, sizeof(int)));
  int transpose_a_int = static_cast<int>(transpose_a);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetMatMulDescAttr(
      matmul_desc, CNNL_MATMUL_DESC_TRANSA, &(transpose_a_int), sizeof(int)));
  int transpose_b_int = static_cast<int>(transpose_b);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetMatMulDescAttr(
      matmul_desc, CNNL_MATMUL_DESC_TRANSB, &(transpose_b_int), sizeof(int)));

  // Create and get matmul algorithm
  cnnlMatMulAlgo_t algo;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulAlgoCreate(&algo));
  const cnnlMatMulPreference_t preference = CNNL_MATMUL_FASTEST;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeMatMulAlgorithm(
      handle, matmul_desc, a_desc, b_desc, output_desc, preference, &algo));

  // Get workspace
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeMatMulWorkspaceSize(
      handle, matmul_desc, a_desc, b_desc, output_desc, algo,
      &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  // Compute
  float alpha = 1.0;
  float beta = 0.0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeMatMul(
      handle, matmul_desc, reinterpret_cast<void*>(&alpha), a_desc, a,
      a_position, a_scale, a_offset, b_desc, b, b_position, b_scale, b_offset,
      reinterpret_cast<void*>(&beta), output_desc, output, algo, workspace_ptr,
      workspace_size));

  // Destroy matmul descriptor and algorithm
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulDescDestroy(matmul_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulAlgoDestroy(algo));
}
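QuantizeMatMul and the two batch variants that follow create the CNNL matmul descriptor and algorithm handle on every call and destroy them just before returning, so an early exit via PADDLE_ENFORCE_MLU_SUCCESS would leak them. A small RAII guard along the lines of the sketch below would tie the descriptor's lifetime to scope; this is only a possible refactoring, not something this patch introduces, and only the cnnlMatMulDescCreate/Destroy calls already used above are assumed.

// Hypothetical RAII guard for a cnnlMatMulDescriptor_t; not part of this change.
class MatMulDescGuardSketch {
 public:
  MatMulDescGuardSketch() {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulDescCreate(&desc_));
  }
  ~MatMulDescGuardSketch() {
    if (desc_) cnnlMatMulDescDestroy(desc_);  // best-effort cleanup
  }
  cnnlMatMulDescriptor_t get() const { return desc_; }

 private:
  cnnlMatMulDescriptor_t desc_ = nullptr;
};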
/* static */ void MLUCnnl::QuantizeBatchMatMul(
    const ExecutionContext& ctx, const bool adj_x, const bool adj_y,
    const cnnlTensorDescriptor_t in0_desc, const void* in0,
    const void* in0_position, const void* in0_scale, const void* in0_offset,
    const cnnlTensorDescriptor_t in1_desc, const void* in1,
    const void* in1_position, const void* in1_scale, const void* in1_offset,
    const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  // Set onchip data type
  cnnlSetTensorDescriptorOnchipDataType(in0_desc, quant_type);
  cnnlSetTensorDescriptorOnchipDataType(in1_desc, quant_type);
  cnnlSetTensorDescriptorOnchipDataType(output_desc, data_type);

  // Create and set batch matmul descriptor
  cnnlBatchMatMulDescriptor_t bmm_desc;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulDescCreate(&bmm_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulDescAttr(
      bmm_desc, CNNL_BMM_DESC_COMPUTE_TYPE, &data_type, sizeof(int)));
  int transpose_a_int = static_cast<int>(adj_x);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulDescAttr(
      bmm_desc, CNNL_BMM_DESC_TRANSA, &(transpose_a_int), sizeof(int)));
  int transpose_b_int = static_cast<int>(adj_y);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulDescAttr(
      bmm_desc, CNNL_BMM_DESC_TRANSB, &(transpose_b_int), sizeof(int)));

  // Create and get batch matmul algorithm
  cnnlBatchMatMulAlgo_t algo;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulAlgoCreate(&algo));
  const cnnlBatchMatMulPreference_t preference = CNNL_BMM_FASTEST;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulAlgorithm(
      handle, bmm_desc, in0_desc, in1_desc, output_desc, preference, &algo));

  // Get workspace
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulWorkspaceSize(
      handle, bmm_desc, in0_desc, in1_desc, output_desc, algo,
      &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  // Compute
  float alpha = 1.0;
  float beta = 0.0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeBatchMatMul(
      handle, bmm_desc, reinterpret_cast<void*>(&alpha), in0_desc, in0,
      in0_position, in0_scale, in0_offset, in1_desc, in1, in1_position,
      in1_scale, in1_offset, reinterpret_cast<void*>(&beta), output_desc,
      output, algo, workspace_ptr, workspace_size));

  // Destroy matmul descriptor and algorithm
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulDescDestroy(bmm_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulAlgoDestroy(algo));
}
/* static */ void MLUCnnl::QuantizeBatchMatMulBCast(
    const ExecutionContext& ctx, const bool adj_x, const bool adj_y,
    const cnnlTensorDescriptor_t in0_desc, const void* in0,
    const void* in0_position, const void* in0_scale, const void* in0_offset,
    const cnnlTensorDescriptor_t in1_desc, const void* in1,
    const void* in1_position, const void* in1_scale, const void* in1_offset,
    const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);

  // Set onchip data type
  cnnlSetTensorDescriptorOnchipDataType(in0_desc, quant_type);
  cnnlSetTensorDescriptorOnchipDataType(in1_desc, quant_type);
  cnnlSetTensorDescriptorOnchipDataType(output_desc, data_type);

  // Create and set batch matmul descriptor
  cnnlBatchMatMulBCastDescriptor_t bmm_bcast_desc;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastDescCreate(&bmm_bcast_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulBCastDescAttr(
      bmm_bcast_desc, CNNL_BMM_BCAST_DESC_COMPUTE_TYPE, &data_type,
      sizeof(int)));
  int transpose_a_int = static_cast<int>(adj_x);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulBCastDescAttr(
      bmm_bcast_desc, CNNL_BMM_BCAST_DESC_TRANSA, &(transpose_a_int),
      sizeof(int)));
  int transpose_b_int = static_cast<int>(adj_y);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulBCastDescAttr(
      bmm_bcast_desc, CNNL_BMM_BCAST_DESC_TRANSB, &(transpose_b_int),
      sizeof(int)));

  // Create and get batch matmul algorithm
  cnnlBatchMatMulBCastAlgo_t algo;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastAlgoCreate(&algo));
  const cnnlBatchMatMulBCastPreference_t preference = CNNL_BMM_BCAST_FASTEST;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulBCastAlgorithm(
      handle, bmm_bcast_desc, in0_desc, in1_desc, output_desc, preference,
      &algo));

  // Get workspace
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulBCastWorkspaceSize(
      handle, bmm_bcast_desc, in0_desc, in1_desc, output_desc, algo,
      &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  // Compute
  float alpha = 1.0;
  float beta = 0.0;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeBatchMatMulBCast(
      handle, bmm_bcast_desc, reinterpret_cast<void*>(&alpha), in0_desc, in0,
      in0_position, in0_scale, in0_offset, in1_desc, in1, in1_position,
      in1_scale, in1_offset, reinterpret_cast<void*>(&beta), output_desc,
      output, algo, workspace_ptr, workspace_size));

  // Destroy matmul descriptor and algorithm
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastDescDestroy(bmm_bcast_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastAlgoDestroy(algo));
}
/* static */ void MLUCnnl::Transpose(
    const ExecutionContext& ctx, const std::vector<int> perm,
    const int input_dim, const cnnlTensorDescriptor_t input_desc,
    const void* input, const cnnlTensorDescriptor_t output_desc,
    void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  cnnlTransposeDescriptor_t perm_desc;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateTransposeDescriptor(&perm_desc));
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlSetTransposeDescriptor(perm_desc, input_dim, perm.data()));

  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetTransposeWorkspaceSize(
      handle, input_desc, perm_desc, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlTranspose_v2(handle, perm_desc, input_desc,
                                              input, output_desc, output,
                                              workspace_ptr, workspace_size));
  if (perm_desc) {
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyTransposeDescriptor(perm_desc));
  }
}
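MLUCnnl::Transpose owns the transpose descriptor internally, so a caller only supplies the permutation vector and the tensor rank. A hedged sketch of an NHWC-to-NCHW permutation through this wrapper follows; the tensor names and dtype are illustrative and not taken from this patch.

// Illustrative call: permute a 4-D float NHWC tensor `in` into `out` (NCHW).
// Both tensors are assumed to be allocated on the MLU place already.
const std::vector<int> perm = {0, 3, 1, 2};  // NHWC -> NCHW
MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType(in.type()));
MLUCnnlTensorDesc out_desc(out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out.type()));
MLUCnnl::Transpose(ctx, perm, 4 /*input_dim*/, in_desc.get(), in.data<float>(),
                   out_desc.get(), out.mutable_data<float>(ctx.GetPlace()));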
/* static */ void MLUCnnl::MatrixBandPart(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc,
    const void* input, const int num_lower, const int num_upper,
    void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatrixBandPart(handle, data_desc, input,
                                                num_lower, num_upper, output));
}

/* static */ void MLUCnnl::NumTrue(const ExecutionContext& ctx,
                                   const cnnlTensorDescriptor_t x_desc,
                                   const void* x, Tensor index,
                                   uint32_t* num_true) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlGetNumTrueWorkspaceSize(handle, x_desc, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  index = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* index_ptr = index.mutable_data(ctx.GetPlace());
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlNumTrue(
      handle, x_desc, x, static_cast<uint32_t*>(index_ptr), num_true));
}

/* static */ void MLUCnnl::Where(const ExecutionContext& ctx,
                                 const cnnlTensorDescriptor_t x_desc,
                                 const void* x, const uint32_t* strides,
                                 const uint32_t* index,
                                 const cnnlTensorDescriptor_t y_desc, int* y,
                                 const bool as_tuple) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlWhere(handle, x_desc, x, strides, index, y_desc, y, as_tuple));
}
/* static */ void MLUCnnl::InTopK(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t predictions_desc,
    const void* predictions, const cnnlTensorDescriptor_t targets_desc,
    const void* targets, const cnnlTensorDescriptor_t k_desc, const void* k,
    const int k_int, const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlInTopK(handle, predictions_desc, predictions,
                                        targets_desc, targets, k_desc, k,
                                        k_int, output_desc, output));
}

/* static */ void MLUCnnl::ScatterNd(
    const ExecutionContext& ctx, const cnnlTensorDescriptor_t indices_desc,
    const void* indices, const cnnlTensorDescriptor_t updates_desc,
    const void* updates, const cnnlTensorDescriptor_t output_desc,
    void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlScatterNd(handle, indices_desc, indices,
                                           updates_desc, updates, output_desc,
                                           output));
}
/* static */ void MLUCnnl::BitWise(
    const ExecutionContext& ctx, const cnnlBitComputeOp_t optype,
    const cnnlTensorDescriptor_t input1_desc, const void* input1,
    const cnnlTensorDescriptor_t input2_desc, const void* input2,
    const cnnlTensorDescriptor_t output_desc, void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBitComputeWorkspaceSize(
      handle, input1_desc, input2_desc, output_desc, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBitCompute_v2(
      handle, optype, input1_desc, input1, input2_desc, input2, output_desc,
      output, workspace_ptr, workspace_size));
}
/* static */ void MLUCnnl::QR(const ExecutionContext& ctx,
                              const cnnlTensorDescriptor_t a_desc,
                              const void* a,
                              const cnnlTensorDescriptor_t q_desc, void* q,
                              const cnnlTensorDescriptor_t r_desc, void* r,
                              const bool some) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  size_t workspace_size;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlGetQRWorkspaceSize(handle, a_desc, some, &workspace_size));
  auto& dev_ctx = GetDevCtxFromCTX(ctx);
  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
      {static_cast<int64_t>(workspace_size)}, dev_ctx);
  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQR(handle, a_desc, a, q_desc, q, r_desc, r,
                                    workspace_ptr, workspace_size, some));
}

/* static */ void MLUCnnl::Reciprocal(const ExecutionContext& ctx,
                                      const cnnlTensorDescriptor_t input_desc,
                                      const void* input,
                                      const cnnlTensorDescriptor_t output_desc,
                                      void* output) {
  cnnlHandle_t handle = GetHandleFromCTX(ctx);
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlReciprocal(handle, input_desc, input, output_desc, output));
}

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/mlu/mlu_baseop.h
@@ -30,7 +30,20 @@ namespace operators {
using Tensor = framework::Tensor;
using DataLayout = framework::DataLayout;
using ExecutionContext = framework::ExecutionContext;
using DeviceContextPool = platform::DeviceContextPool;
using MLUDeviceContext = platform::MLUDeviceContext;

enum MLULogicMethod {
  CNNL_LOGIC_OP_EQ = 0,
  CNNL_LOGIC_OP_NE = 1,
  CNNL_LOGIC_OP_GT = 2,
  CNNL_LOGIC_OP_GE = 3,
  CNNL_LOGIC_OP_LT = 4,
  CNNL_LOGIC_OP_LE = 5,
  CNNL_LOGIC_OP_AND = 6,
  CNNL_LOGIC_OP_OR = 7,
};

template <typename T>
inline cnnlDataType_t ToCnnlDataType(const T& t) {
@@ -76,6 +89,14 @@ NarrowT CheckedNarrowing(const WideT& wide) {
  return narrow;
}

static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) {
  return ctx.template device_context<MLUDeviceContext>().cnnl_handle();
}

static const MLUDeviceContext& GetDevCtxFromCTX(const ExecutionContext& ctx) {
  return ctx.template device_context<MLUDeviceContext>();
}

cnnlDeviceType_t GetCnnlDev(int dev_ordinal);

using CnnlTensorDesc = cnnlTensorDescriptor_t;
@@ -146,22 +167,914 @@ class MLUCnnlActivationDesc {
  cnnlActivationDescriptor_t active_desc_ = nullptr;
};

class MLUCnnlPoolingDesc {
 public:
  MLUCnnlPoolingDesc(const MLUCnnlPoolingDesc& desc) = delete;
  MLUCnnlPoolingDesc& operator=(const MLUCnnlPoolingDesc& desc) = delete;

  MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode,
                     const cnnlNanPropagation_t maxpooling_nan_opt,
                     int window_rows, int window_cols, int64_t pad_up,
                     int64_t pad_down, int64_t pad_left, int64_t pad_right,
                     int row_stride, int col_stride);

  MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode,
                     const cnnlNanPropagation_t maxpooling_nan_opt,
                     const int tensor_rank, const std::vector<int>& window,
                     const std::vector<int>& padding,
                     const std::vector<int>& stride);

  const cnnlPoolingDescriptor_t get() const;

  ~MLUCnnlPoolingDesc();

 private:
  cnnlPoolingDescriptor_t pooling_desc_ = nullptr;
};

class MLUCnnlRandomGeneratorDesc {
 public:
  MLUCnnlRandomGeneratorDesc(const bool is_mlu200, const int seed);
  const cnnlRandGenerator_t get() const;
  ~MLUCnnlRandomGeneratorDesc();

 private:
  cnnlRandGenerator_t mlu_generator = nullptr;
};

class MLUCnnlReduceDesc {
 public:
  MLUCnnlReduceDesc(const MLUCnnlReduceDesc& desc) = delete;
  MLUCnnlReduceDesc& operator=(const MLUCnnlReduceDesc& desc) = delete;

  MLUCnnlReduceDesc(const std::vector<int>& axis_vec,
                    const cnnlReduceOp_t reduce_op,
                    const cnnlDataType_t data_type,
                    const cnnlNanPropagation_t nan_propagation,
                    const cnnlReduceIndices_t reduce_indices,
                    const cnnlIndicesType_t indices_type);

  const cnnlReduceDescriptor_t get() const;

  ~MLUCnnlReduceDesc();

 private:
  cnnlReduceDescriptor_t reduction_desc_ = nullptr;
};

class MLUCnnlOpTensorDesc {
 public:
  MLUCnnlOpTensorDesc(const MLUCnnlOpTensorDesc& desc) = delete;
  void operator=(const MLUCnnlOpTensorDesc&) = delete;

  MLUCnnlOpTensorDesc(cnnlOpTensorDesc_t op_tensor_op,
                      cnnlDataType_t op_tensor_comp_type,
                      cnnlNanPropagation_t op_tensor_nan_opt);

  const cnnlOpTensorDescriptor_t get() const;

  ~MLUCnnlOpTensorDesc();

 private:
  cnnlOpTensorDescriptor_t op_tensor_desc_ = nullptr;
};

class MLUCnnlNMSDesc {
 public:
  MLUCnnlNMSDesc(const MLUCnnlNMSDesc& desc) = delete;
  MLUCnnlNMSDesc& operator=(const MLUCnnlNMSDesc& desc) = delete;

  MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode, const float iou_threshold,
                 const int max_output_size, const float confidence_threshold,
                 const int input_layout);

  const cnnlNmsDescriptor_t get() const;

  ~MLUCnnlNMSDesc();

 private:
  cnnlNmsDescriptor_t nms_desc_ = nullptr;
};

class MLUCnnlConvolutionDesc {
 public:
  MLUCnnlConvolutionDesc(const int dims, const int pad[], const int stride[],
                         const int dilation[], const int group_count,
                         const cnnlDataType_t tensor_dtype);

  MLUCnnlConvolutionDesc(const int dims, const int64_t pad[],
                         const int64_t stride[], const int64_t dilation[],
                         const int group_count,
                         const cnnlDataType_t tensor_dtype);

  MLUCnnlConvolutionDesc(const MLUCnnlConvolutionDesc& desc) = delete;
  MLUCnnlConvolutionDesc& operator=(const MLUCnnlConvolutionDesc& desc) =
      delete;

  const cnnlConvolutionDescriptor_t get() const;

  ~MLUCnnlConvolutionDesc();

 private:
  cnnlConvolutionDescriptor_t conv_desc_ = nullptr;
};

class MLUCnnlBatchSpaceDesc {
 public:
  MLUCnnlBatchSpaceDesc(uint32_t block_shape[], uint32_t paddings[],
                        const uint32_t block_shape_size,
                        const uint32_t paddings_size);

  void getBatch2spaceNdextraInputSize(const ExecutionContext& ctx,
                                      const cnnlTensorDescriptor_t input_desc);

  void getSpace2batchNdextraInputSize(const ExecutionContext& ctx,
                                      const cnnlTensorDescriptor_t input_desc);

  void initSpace2batchNdExtraInput(const ExecutionContext& ctx,
                                   const cnnlTensorDescriptor_t input_desc,
                                   void* extra_host_input);

  void initBatch2spaceNdExtraInput(const ExecutionContext& ctx,
                                   const cnnlTensorDescriptor_t input_desc,
                                   void* extra_host_input);

  const cnnlSpaceBatchNdDescriptor_t get() const;

  size_t getExtraInputSize() const;

  ~MLUCnnlBatchSpaceDesc();

 private:
  cnnlSpaceBatchNdDescriptor_t op_desc_ = nullptr;
  size_t extra_input_size_;
};

class MLUCnnlTrigonDesc {
 public:
  explicit MLUCnnlTrigonDesc(
      const cnnlTrigonFunctionMode_t trigon_function_mode);

  const cnnlTrigonDescriptor_t get() const;

  ~MLUCnnlTrigonDesc();

 private:
  cnnlTrigonDescriptor_t trigon_desc_ = nullptr;
};

class MLUCnnl {
 public:
  static void Active(const platform::MLUDeviceContext& ctx,
  static void Active(const ExecutionContext& ctx,
                     cnnlActivationDescriptor_t active_desc,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input,
                     const cnnlTensorDescriptor_t output_desc, void* output);

  static void ActiveGrad(const platform::MLUDeviceContext& ctx,
                         cnnlActivationDescriptor_t active_desc,
                         const void* alpha, const void* beta,
                         const cnnlTensorDescriptor_t y_desc, const void* y,
                         const cnnlTensorDescriptor_t diff_y_desc,
                         const void* diff_y,
                         const cnnlTensorDescriptor_t x_desc, const void* x,
                         const cnnlTensorDescriptor_t diff_x_desc,
                         void* diff_x);
  static void ActiveGrad(const ExecutionContext& ctx,
                         cnnlActivationDescriptor_t active_desc,
                         const void* alpha, const void* beta,
                         const cnnlTensorDescriptor_t y_desc, const void* y,
                         const cnnlTensorDescriptor_t diff_y_desc,
                         const void* diff_y,
                         const cnnlTensorDescriptor_t x_desc, const void* x,
                         const cnnlTensorDescriptor_t diff_x_desc,
                         void* diff_x);
static
void
Concat
(
const
ExecutionContext
&
ctx
,
const
int
pack_num
,
const
int
axis
,
const
cnnlTensorDescriptor_t
inputs_desc
[],
const
void
*
const
inputs
[],
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
static
void
Cast
(
const
ExecutionContext
&
ctx
,
cnnlCastDataType_t
cast_type
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
static
void
Div
(
const
ExecutionContext
&
ctx
,
cnnlComputationPreference_t
prefer
,
const
cnnlTensorDescriptor_t
in0_desc
,
const
void
*
in0
,
const
cnnlTensorDescriptor_t
in1_desc
,
const
void
*
in1
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
static
void
Fill
(
const
ExecutionContext
&
ctx
,
float
value
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
static
void
LRN
(
const
ExecutionContext
&
ctx
,
const
int
local_size
,
const
double
alpha
,
const
double
beta
,
const
double
k
,
const
cnnlTensorDescriptor_t
input_quant_desc
,
const
void
*
input_quant
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
static
void
QuantifyOffline
(
const
ExecutionContext
&
context
,
cnnlQuantizeMode_t
mode
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input
,
const
cnnlTensorDescriptor_t
ouput_desc
,
void
*
output
);
static
void
QuantifyOnline
(
const
ExecutionContext
&
context
,
const
int
bitwidth
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input
,
const
bool
compute_scale
,
void
*
position
,
void
*
scale
,
const
cnnlTensorDescriptor_t
ouput_desc
,
void
*
output
);
  static void SGD(const ExecutionContext& context,
                  const cnnlTensorDescriptor_t grad_desc, const void* grad,
                  const void* lr, const cnnlTensorDescriptor_t var_desc,
                  void* var);

  static void ApplyAdaGrad(const ExecutionContext& ctx,
                           const cnnlTensorDescriptor_t grad_desc,
                           const void* grad,
                           const cnnlTensorDescriptor_t accum_desc,
                           void* accum,
                           const cnnlTensorDescriptor_t var_desc, void* var,
                           const void* lr, const bool update_slots);

  static void ApplyRMSProp(const ExecutionContext& context,
                           const cnnlTensorDescriptor_t grad_desc,
                           const void* grad, const void* lr, const void* rho,
                           const void* momentum, const void* epsilon,
                           const cnnlTensorDescriptor_t var_desc, void* var,
                           const cnnlTensorDescriptor_t ms_desc, void* ms,
                           const cnnlTensorDescriptor_t mom_desc, void* mom);

  static void ApplyCenterRMSProp(
      const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc,
      const void* grad, const void* lr, const void* rho, const void* momentum,
      const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var,
      const cnnlTensorDescriptor_t mg_desc, void* mg,
      const cnnlTensorDescriptor_t ms_desc, void* ms,
      const cnnlTensorDescriptor_t mom_desc, void* mom);

  static void ApplyAdam(const ExecutionContext& ctx,
                        const cnnlTensorDescriptor_t grad_desc,
                        const void* grad, const void* lr, const void* beta1,
                        const void* beta2, const void* beta1_power,
                        const void* beta2_power, const void* epsilon,
                        const bool use_nesterov,
                        const cnnlTensorDescriptor_t var_desc, void* var,
                        const cnnlTensorDescriptor_t m_desc, void* m,
                        const cnnlTensorDescriptor_t v_desc, void* v);

  static void ApplyAdaMax(const ExecutionContext& ctx,
                          const cnnlTensorDescriptor_t grad_desc,
                          const cnnlTensorDescriptor_t var_desc, void* var,
                          const cnnlTensorDescriptor_t m_desc, void* m,
                          const cnnlTensorDescriptor_t v_desc, void* v,
                          const void* diff, const void* lr, const void* beta1,
                          const void* beta2, const void* beta1_power,
                          const void* epsilon);

  static void ApplyMomentum(const ExecutionContext& ctx,
                            const cnnlTensorDescriptor_t grad_desc,
                            const void* grad, const bool use_nesterov,
                            const void* lr, const void* momentum, void* var,
                            void* accum);

  static void ApplyKerasMomentum(const ExecutionContext& ctx,
                                 const cnnlTensorDescriptor_t grad_desc,
                                 const void* grad, const bool use_nesterov,
                                 const void* lr, const void* momentum,
                                 void* var, void* accum);

  static void ApplyAdadelta(const ExecutionContext& ctx,
                            const cnnlTensorDescriptor_t grad_desc,
                            const void* diff, const void* lr, const void* rho,
                            const void* epsilon, void* var, void* accum,
                            void* accum_update);
  static void SparseSoftmaxXentWithLogits(
      const ExecutionContext& ctx, cnnlSoftmaxMode_t mode,
      const cnnlTensorDescriptor_t x_desc, const void* input,
      const cnnlTensorDescriptor_t label_desc, const void* label,
      const cnnlTensorDescriptor_t y_desc, void* output,
      const cnnlTensorDescriptor_t diff_y_desc, void* back_out);

  static void RandomUniform(const ExecutionContext& ctx, const int num,
                            const cnnlDataType_t data_type,
                            const cnnlRandGenerator_t mlu_generator,
                            void* output);

  static void Cumsum(const ExecutionContext& ctx, const int axis,
                     const bool exclusive, const bool reverse,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input,
                     const cnnlTensorDescriptor_t ouput_desc, void* output);

  static void BroadcastTo(const ExecutionContext& ctx,
                          const cnnlTensorDescriptor_t input_desc,
                          const void* input,
                          const cnnlTensorDescriptor_t output_desc,
                          void* output);

  static void GatherFunctor(const ExecutionContext& ctx, const int axis,
                            const int batch_dims,
                            const cnnlTensorDescriptor_t params_desc,
                            const void* params,
                            const cnnlTensorDescriptor_t indices_desc,
                            const void* indices,
                            const cnnlTensorDescriptor_t output_desc,
                            void* output);

  static void ScatterFunctor(const ExecutionContext& ctx,
                             const cnnlTensorDescriptor_t params_desc,
                             const void* params,
                             const cnnlTensorDescriptor_t updates_desc,
                             const void* updates,
                             const cnnlTensorDescriptor_t indices_desc,
                             const void* indices,
                             const cnnlScatterRefMode_t mode);

  static void Range(const ExecutionContext& ctx, const void* start,
                    const void* end, const void* step,
                    const cnnlDataType_t output_dtype, void* output);

  static void Round(const ExecutionContext& ctx,
                    const cnnlTensorDescriptor_t input_desc,
                    const void* input,
                    const cnnlTensorDescriptor_t output_desc, void* output);

  static void TopK(const ExecutionContext& ctx, const int k, const int dim,
                   const bool largest, const bool sorted,
                   const cnnlTensorDescriptor_t input_desc, const void* input,
                   const cnnlTensorDescriptor_t values_output_desc,
                   void* values_out,
                   const cnnlTensorDescriptor_t indices_output_desc,
                   void* indices_out);

  static void StridedSlice(const ExecutionContext& ctx, const int begin[],
                           const int end[], const int strides[],
                           const cnnlTensorDescriptor_t input_desc,
                           const void* input,
                           const cnnlTensorDescriptor_t output_desc,
                           void* output);

  static void Split(const ExecutionContext& ctx, int split_num, int axis,
                    const cnnlTensorDescriptor_t input_desc,
                    const void* input_ptr,
                    const cnnlTensorDescriptor_t output_descs[],
                    void* output_ptrs[]);
  static void Scale(const ExecutionContext& ctx, const int axis,
                    const cnnlTensorDescriptor_t input_desc,
                    const void* input,
                    const cnnlTensorDescriptor_t alpha_desc,
                    const void* alpha,
                    const cnnlTensorDescriptor_t beta_desc, const void* beta,
                    const cnnlTensorDescriptor_t output_desc, void* output);

  static void AddN(const ExecutionContext& ctx, uint32_t input_num,
                   const cnnlTensorDescriptor_t inputs_desc[],
                   const void* inputs[],
                   const cnnlTensorDescriptor_t output_desc, void* output);

  static void Log(const ExecutionContext& ctx,
                  cnnlComputationPreference_t prefer,
                  const cnnlTensorDescriptor_t input_desc, const void* input,
                  const cnnlTensorDescriptor_t output_desc, void* output);

  static void StridedSliceGrad(const ExecutionContext& ctx,
                               const int begin[], const int end[],
                               const int strides[],
                               const cnnlTensorDescriptor_t input_desc,
                               const void* input,
                               const cnnlTensorDescriptor_t output_desc,
                               void* output);

  static void Logic(const ExecutionContext& ctx,
                    const MLULogicMethod log_method,
                    const cnnlTensorDescriptor_t input1_desc,
                    const void* input1,
                    const cnnlTensorDescriptor_t input2_desc,
                    const void* input2,
                    const cnnlTensorDescriptor_t ouput_desc, void* output);

  static void Select(const ExecutionContext& ctx,
                     const cnnlTensorDescriptor_t then_desc,
                     const void* p_then,
                     const cnnlTensorDescriptor_t else_desc,
                     const void* p_else,
                     const cnnlTensorDescriptor_t output_desc, void* output,
                     const bool* condition, const int condition_size);

  static void AssignAdd(const ExecutionContext& ctx, const void* alpha,
                        const void* beta,
                        const cnnlTensorDescriptor_t update_desc,
                        const void* update,
                        const cnnlTensorDescriptor_t param_desc, void* param);

  static void AssignSub(const ExecutionContext& ctx, const void* alpha,
                        const void* beta,
                        const cnnlTensorDescriptor_t update_desc,
                        const void* update,
                        const cnnlTensorDescriptor_t param_desc, void* param);

  static void Assign(const ExecutionContext& ctx,
                     const cnnlTensorDescriptor_t update_desc,
                     const void* update,
                     const cnnlTensorDescriptor_t param_desc, void* param);

  static void GatherNd(const ExecutionContext& ctx,
                       const cnnlTensorDescriptor_t params_desc,
                       const void* params,
                       const cnnlTensorDescriptor_t indices_desc,
                       const void* indices,
                       const cnnlTensorDescriptor_t output_desc,
                       void* output);

  static void BatchToSpace(const ExecutionContext& ctx,
                           const cnnlTensorDescriptor_t input_desc,
                           const void* input,
                           const cnnlTensorDescriptor_t output_desc,
                           void* output, const cnnlSpaceBatchParam_t param);

  static void BatchToSpaceNd(const ExecutionContext& ctx,
                             const cnnlTensorDescriptor_t input_desc,
                             const void* input,
                             cnnlSpaceBatchNdDescriptor_t param,
                             void* extra_device_input,
                             size_t extra_input_size,
                             const cnnlTensorDescriptor_t output_desc,
                             void* output);
  static void PoolingForward(const ExecutionContext& ctx,
                             cnnlPoolingMode_t pool_mode,
                             const std::vector<int64_t>& output_shape,
                             cnnlPoolingDescriptor_t pooling_desc,
                             const void* alpha,
                             const cnnlTensorDescriptor_t input_desc,
                             const void* input, const void* beta,
                             const void* extra_input_ptr,
                             const cnnlTensorDescriptor_t output_desc,
                             void* output);

  static void Pool3D(const ExecutionContext& ctx,
                     cnnlPoolingMode_t pool_mode,
                     const std::vector<int64_t>& output_shape,
                     cnnlPoolingDescriptor_t pooling_desc, const void* alpha,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input, const void* beta,
                     const cnnlTensorDescriptor_t output_desc, void* output);

  static void Pad(const ExecutionContext& ctx,
                  const cnnlTensorDescriptor_t input_desc, const void* input,
                  const void* paddings, const void* padding_value,
                  const cnnlTensorDescriptor_t output_desc, void* output);

  static void Matmul(const ExecutionContext& ctx, const bool transpose_a,
                     const bool transpose_b,
                     const cnnlTensorDescriptor_t in0_desc, const void* in0,
                     const cnnlTensorDescriptor_t in1_desc, const void* in1,
                     const cnnlTensorDescriptor_t output_desc, void* output);

  static void BatchMatmul(const ExecutionContext& ctx,
                          const bool transpose_a, const bool transpose_b,
                          const cnnlTensorDescriptor_t in0_desc,
                          const void* in0,
                          const cnnlTensorDescriptor_t in1_desc,
                          const void* in1,
                          const cnnlTensorDescriptor_t output_desc,
                          void* output);

  static void OpTensor(const ExecutionContext& ctx,
                       const cnnlOpTensorDescriptor_t op_tensor_desc,
                       const cnnlTensorDescriptor_t a_desc, const void* a,
                       const cnnlTensorDescriptor_t b_desc, const void* b,
                       const cnnlTensorDescriptor_t output_desc, void* output,
                       const cnnlDataType_t dtype);
  static void BiasAddGrad(const ExecutionContext& ctx, const int axis,
                          const cnnlTensorDescriptor_t out_backprop_desc,
                          const void* out_backprop,
                          const cnnlTensorDescriptor_t output_desc,
                          void* output);

  static void OneHot(const ExecutionContext& ctx,
                     const cnnlTensorDescriptor_t desc_indices,
                     const void* indices, const int depth,
                     const void* on_value, const void* off_value,
                     const int axis, cnnlDataType_t output_data_type,
                     void* output);

  static void NonMaxSuppression(const ExecutionContext& ctx,
                                const cnnlNmsDescriptor_t nms_desc,
                                const cnnlTensorDescriptor_t boxes_desc,
                                const void* boxes,
                                const cnnlTensorDescriptor_t confidence_desc,
                                const void* confidence,
                                const cnnlTensorDescriptor_t output_desc,
                                void* output, void* output_size);

  static void SoftmaxCrossEntropyWithLogits(
      const ExecutionContext& ctx, cnnlSoftmaxMode_t mode,
      cnnlComputationPreference_t prefer,
      const cnnlTensorDescriptor_t input_desc, const void* logits_in,
      const cnnlTensorDescriptor_t label_desc, const void* labels_in,
      const cnnlTensorDescriptor_t loss_out_desc, void* loss_out,
      const cnnlTensorDescriptor_t back_out_desc, void* back_out);

  static void SoftmaxForward(const ExecutionContext& ctx,
                             cnnlSoftmaxAlgorithm_t algorithm,
                             cnnlSoftmaxMode_t mode, const void* alpha,
                             const cnnlTensorDescriptor_t input_desc,
                             const void* input, const void* beta,
                             const cnnlTensorDescriptor_t output_desc,
                             void* output);

  static void Softplus(const ExecutionContext& ctx,
                       const cnnlTensorDescriptor_t features_desc,
                       const void* features,
                       const cnnlTensorDescriptor_t output_desc,
                       void* output);

  static void SoftplusGrad(const ExecutionContext& ctx,
                           const cnnlTensorDescriptor_t gradients_desc,
                           const void* gradients,
                           const cnnlTensorDescriptor_t features_desc,
                           const void* features,
                           const cnnlTensorDescriptor_t output_desc,
                           void* output);

  static void RsqrtGrad(const ExecutionContext& ctx,
                        const cnnlTensorDescriptor_t data_desc,
                        const void* y, const void* diff_y, void* output);

  static void SqrtGrad(const ExecutionContext& ctx,
                       const cnnlTensorDescriptor_t data_desc, const void* y,
                       const void* diff_y, void* output);

  static void ConvolutionForward(
      const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc_,
      const void* alpha, const void* beta,
      const cnnlTensorDescriptor_t bias_desc, const void* bias_ptr,
      const cnnlTensorDescriptor_t input_desc, const void* input,
      const cnnlTensorDescriptor_t filtet_desc, const void* filter,
      const cnnlTensorDescriptor_t output_desc, void* output);

  static void FusedConvBNQuantify(
      const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc,
      const void* epsilon_ptr, const int fused_ops_number,
      const cnnlDataType_t tensor_dtype, const int input_position,
      const float input_scale, const int filter_position,
      const float filter_scale, const cnnlTensorDescriptor_t scale_desc,
      const void* scale_ptr, const cnnlTensorDescriptor_t offset_desc,
      const void* offset_ptr, const cnnlTensorDescriptor_t mean_desc,
      const void* mean_ptr, const cnnlTensorDescriptor_t variance_desc,
      const void* variance_ptr, const cnnlTensorDescriptor_t input_desc,
      const void* input, const cnnlTensorDescriptor_t filtet_desc,
      const void* filter, const cnnlTensorDescriptor_t output_desc,
      void* output);
  static void Tile(const ExecutionContext& ctx,
                   const cnnlTensorDescriptor_t input_desc, const void* input,
                   const cnnlTensorDescriptor_t output_desc, void* output);

  static void UnsortedSegmentSum(const ExecutionContext& ctx,
                                 const cnnlTensorDescriptor_t data_desc,
                                 const void* data,
                                 const cnnlTensorDescriptor_t ids_desc,
                                 const int* segment_ids,
                                 const cnnlTensorDescriptor_t output_desc,
                                 void* output);

  static void Reduce(const ExecutionContext& ctx, const bool need_workspace,
                     const cnnlReduceDescriptor_t reduction_desc,
                     const void* alpha,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input, const size_t indices_size,
                     void* indices, const void* beta,
                     const cnnlTensorDescriptor_t output_desc, void* output);

  static void FloorDiv(const ExecutionContext& ctx,
                       cnnlComputationPreference_t prefer,
                       const cnnlTensorDescriptor_t input1_desc,
                       const void* input1,
                       const cnnlTensorDescriptor_t input2_desc,
                       const void* input2,
                       const cnnlTensorDescriptor_t output_desc,
                       void* output);

  static void FloorMod(const ExecutionContext& ctx,
                       const cnnlTensorDescriptor_t input1_desc,
                       const void* input1,
                       const cnnlTensorDescriptor_t input2_desc,
                       const void* input2,
                       const cnnlTensorDescriptor_t output_desc,
                       void* output);

  static void Maximum(const ExecutionContext& ctx,
                      const cnnlTensorDescriptor_t input1_desc,
                      const void* input1,
                      const cnnlTensorDescriptor_t input2_desc,
                      const void* input2,
                      const cnnlTensorDescriptor_t output_desc,
                      void* output);

  static void Minimum(const ExecutionContext& ctx,
                      const cnnlTensorDescriptor_t input1_desc,
                      const void* input1,
                      const cnnlTensorDescriptor_t input2_desc,
                      const void* input2,
                      const cnnlTensorDescriptor_t output_desc,
                      void* output);

  static void PowR(const ExecutionContext& ctx,
                   cnnlComputationPreference_t prefer,
                   const cnnlTensorDescriptor_t input1_desc,
                   const void* input1,
                   const cnnlTensorDescriptor_t input2_desc,
                   const void* input2,
                   const cnnlTensorDescriptor_t output_desc, void* output);

  static void DivNoNan(const ExecutionContext& ctx,
                       cnnlComputationPreference_t prefer,
                       const cnnlTensorDescriptor_t input1_desc,
                       const void* input1,
                       const cnnlTensorDescriptor_t input2_desc,
                       const void* input2,
                       const cnnlTensorDescriptor_t output_desc,
                       void* output);

  static void SquaredDifference(const ExecutionContext& ctx,
                                const cnnlTensorDescriptor_t input1_desc,
                                const void* input1,
                                const cnnlTensorDescriptor_t input2_desc,
                                const void* input2,
                                const cnnlTensorDescriptor_t output_desc,
                                void* output);

  static void L2Loss(const ExecutionContext& ctx,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input, void* output);
  static void Abs(const ExecutionContext& ctx,
                  const cnnlTensorDescriptor_t input_desc, const void* input,
                  const cnnlTensorDescriptor_t output_desc, void* output);

  static void Neg(const ExecutionContext& ctx,
                  const cnnlTensorDescriptor_t input_desc, const void* input,
                  const cnnlTensorDescriptor_t output_desc, void* output);

  static void Floor(const ExecutionContext& ctx,
                    const cnnlTensorDescriptor_t input_desc,
                    const void* input,
                    const cnnlTensorDescriptor_t output_desc, void* output);

  static void Ceil(const ExecutionContext& ctx,
                   const cnnlTensorDescriptor_t input_desc, const void* input,
                   const cnnlTensorDescriptor_t output_desc, void* output);

  static void IsNan(const ExecutionContext& ctx,
                    const cnnlTensorDescriptor_t input_desc,
                    const void* input,
                    const cnnlTensorDescriptor_t output_desc, void* output);

  static void Square(const ExecutionContext& ctx,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input,
                     const cnnlTensorDescriptor_t output_desc, void* output);

  static void Sqrt(const ExecutionContext& ctx,
                   cnnlComputationPreference_t prefer,
                   const cnnlTensorDescriptor_t input_desc, const void* input,
                   const cnnlTensorDescriptor_t output_desc, void* output);

  static void Rsqrt(const ExecutionContext& ctx,
                    cnnlComputationPreference_t prefer,
                    const cnnlTensorDescriptor_t input_desc,
                    const void* input,
                    const cnnlTensorDescriptor_t output_desc, void* output);

  static void Cos(const ExecutionContext& ctx,
                  cnnlComputationPreference_t prefer,
                  const cnnlTensorDescriptor_t input_desc, const void* input,
                  const cnnlTensorDescriptor_t output_desc, void* output);

  static void Sin(const ExecutionContext& ctx,
                  cnnlComputationPreference_t prefer,
                  const cnnlTensorDescriptor_t input_desc, const void* input,
                  const cnnlTensorDescriptor_t output_desc, void* output);

  static void TrigonForward(const ExecutionContext& ctx,
                            const cnnlTrigonDescriptor_t trigon_desc,
                            const cnnlTensorDescriptor_t input_desc,
                            const void* input,
                            const cnnlTensorDescriptor_t output_desc,
                            void* output);

  static void Exp(const ExecutionContext& ctx,
                  cnnlComputationPreference_t prefer,
                  const cnnlTensorDescriptor_t input_desc, const void* input,
                  const cnnlTensorDescriptor_t output_desc, void* output);

  static void Sign(const ExecutionContext& ctx,
                   const cnnlTensorDescriptor_t input_desc, const void* input,
                   const cnnlTensorDescriptor_t output_desc, void* output);

  static void IsFinite(const ExecutionContext& ctx,
                       const cnnlTensorDescriptor_t input_desc,
                       const void* input,
                       const cnnlTensorDescriptor_t output_desc,
                       void* output);

  static void IsNanInf(const ExecutionContext& ctx,
                       const cnnlTensorDescriptor_t input_desc,
                       const void* input, void* output);

  static void Erf(const ExecutionContext& ctx,
                  cnnlComputationPreference_t prefer,
                  const cnnlTensorDescriptor_t input_desc, const void* input,
                  const cnnlTensorDescriptor_t output_desc, void* output);

  static void Log1p(const ExecutionContext& ctx,
                    cnnlComputationPreference_t prefer,
                    const cnnlTensorDescriptor_t input_desc,
                    const void* input,
                    const cnnlTensorDescriptor_t output_desc, void* output);

  static void LogicalNot(const ExecutionContext& ctx,
                         const cnnlTensorDescriptor_t input_desc,
                         const void* input,
                         const cnnlTensorDescriptor_t output_desc,
                         void* output);
  static void DynamicStitch(const ExecutionContext& ctx,
                            const cnnlTensorDescriptor_t* indices_desc,
                            const int** indices,
                            const cnnlTensorDescriptor_t* data_desc,
                            const void** data, const int size,
                            int* indices_dims,
                            const cnnlTensorDescriptor_t output_desc,
                            void* output);

  static void CropAndResize(const ExecutionContext& ctx,
                            const std::string method_name,
                            const float extrapolation_value,
                            const cnnlTensorDescriptor_t image_desc,
                            const void* image,
                            const cnnlTensorDescriptor_t boxes_desc,
                            const void* boxes,
                            const cnnlTensorDescriptor_t box_index_desc,
                            const void* box_index,
                            const cnnlTensorDescriptor_t output_desc,
                            void* output);

  static void CropAndResizeBackwardImage(
      const ExecutionContext& ctx, const std::string method_name,
      const cnnlTensorDescriptor_t image_desc, const void* image,
      const cnnlTensorDescriptor_t boxes_desc, const void* boxes,
      const cnnlTensorDescriptor_t box_idx_desc, const void* box_idx,
      const cnnlTensorDescriptor_t grads_image_desc, void* grads_image);

  static void CropAndResizeBackwardBoxes(
      const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
      const void* input, const cnnlTensorDescriptor_t image_desc,
      const void* image, const cnnlTensorDescriptor_t boxes_desc,
      const void* boxes, const cnnlTensorDescriptor_t box_idx_desc,
      const void* box_idx, const cnnlTensorDescriptor_t output_desc,
      void* output);

  static void PoolingBackward(const ExecutionContext& ctx,
                              const cnnlPoolingDescriptor_t pooling_desc,
                              const void* alpha,
                              const cnnlTensorDescriptor_t y_desc,
                              const void* y,
                              const cnnlTensorDescriptor_t diff_y_desc,
                              const void* diff_y,
                              const cnnlTensorDescriptor_t x_desc,
                              const void* x, const void* beta,
                              const cnnlTensorDescriptor_t diff_x_desc,
                              void* diff_x);

  static void PoolingIndex(const ExecutionContext& ctx,
                           const cnnlPoolingDescriptor_t pooling_desc,
                           const cnnlTensorDescriptor_t x_desc,
                           const void* x,
                           const cnnlTensorDescriptor_t y_desc, void* y);
  static void SpaceToBatch(const ExecutionContext& ctx,
                           const cnnlTensorDescriptor_t input_desc,
                           const void* input,
                           const cnnlTensorDescriptor_t output_desc,
                           void* output, const int64_t block_shape[]);

  static void SpaceToBatchNd(const ExecutionContext& ctx,
                             const cnnlTensorDescriptor_t input_desc,
                             const void* input,
                             cnnlSpaceBatchNdDescriptor_t param,
                             void* extra_device_input,
                             size_t extra_input_size,
                             const cnnlTensorDescriptor_t output_desc,
                             void* output);

  static void Interp(const ExecutionContext& ctx,
                     const cnnlInterpMode_t mode, const bool align_corners,
                     const bool half_pixel_centers,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input,
                     const cnnlTensorDescriptor_t output_desc, void* output);

  static void InterpBackward(const ExecutionContext& ctx,
                             const cnnlInterpBackwardMode_t mode,
                             const bool align_corners,
                             const bool half_pixel_centers,
                             const cnnlTensorDescriptor_t input_desc,
                             const void* input,
                             const cnnlTensorDescriptor_t output_desc,
                             void* output);

  static void QuantizeParam(const ExecutionContext& ctx,
                            const cnnlQuantizeMode_t mode,
                            const int bitwidth,
                            const cnnlTensorDescriptor_t input_desc,
                            const void* input, void* position, void* scale,
                            void* offset);
  static void QuantizeMatMul(
      const ExecutionContext& ctx, const bool transpose_a,
      const bool transpose_b, const cnnlTensorDescriptor_t a_desc,
      const void* a, const void* a_position, const void* a_scale,
      const void* a_offset, const cnnlTensorDescriptor_t b_desc,
      const void* b, const void* b_position, const void* b_scale,
      const void* b_offset, const cnnlDataType_t quant_type,
      const cnnlDataType_t data_type,
      const cnnlTensorDescriptor_t output_desc, void* output);

  static void QuantizeBatchMatMul(
      const ExecutionContext& ctx, const bool adj_x, const bool adj_y,
      const cnnlTensorDescriptor_t a_desc, const void* a,
      const void* a_position, const void* a_scale, const void* a_offset,
      const cnnlTensorDescriptor_t b_desc, const void* b,
      const void* b_position, const void* b_scale, const void* b_offset,
      const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
      const cnnlTensorDescriptor_t output_desc, void* output);

  static void QuantizeBatchMatMulBCast(
      const ExecutionContext& ctx, const bool adj_x, const bool adj_y,
      const cnnlTensorDescriptor_t a_desc, const void* a,
      const void* a_position, const void* a_scale, const void* a_offset,
      const cnnlTensorDescriptor_t b_desc, const void* b,
      const void* b_position, const void* b_scale, const void* b_offset,
      const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
      const cnnlTensorDescriptor_t output_desc, void* output);

  static void FusedBatchNorm(
      const ExecutionContext& ctx, const bool is_training,
      const cnnlTensorDescriptor_t x_desc, const void* x,
      const cnnlTensorDescriptor_t scale_desc, const void* scale,
      const void* offset, const void* estimated_mean,
      const void* estimated_variance, float epsilon, float momentum,
      const cnnlTensorDescriptor_t output_desc, void* output,
      void* batch_mean, void* batch_var, void* saved_mean, void* saved_var);

  static void FusedBatchNormGrad(
      const ExecutionContext& ctx, const bool is_training,
      const cnnlTensorDescriptor_t y_backprop_desc, const void* y_backprop,
      const cnnlTensorDescriptor_t x_desc, const void* x,
      const cnnlTensorDescriptor_t scale_desc, const void* scale,
      const void* saved_mean, const void* saved_var, float epsilon,
      const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop,
      void* scale_backprop, void* offset_backprop);
  static void Transpose(const ExecutionContext& ctx,
                        const std::vector<int> perm, const int input_dim,
                        const cnnlTensorDescriptor_t input_desc,
                        const void* input,
                        const cnnlTensorDescriptor_t output_desc,
                        void* output);

  static void MatrixBandPart(const ExecutionContext& ctx,
                             const cnnlTensorDescriptor_t data_desc,
                             const void* input, const int num_lower,
                             const int num_upper, void* output);

  static void NumTrue(const ExecutionContext& ctx,
                      const cnnlTensorDescriptor_t x_desc, const void* x,
                      Tensor index, uint32_t* num_true);

  static void Where(const ExecutionContext& ctx,
                    const cnnlTensorDescriptor_t x_desc, const void* x,
                    const uint32_t* strides, const uint32_t* index,
                    const cnnlTensorDescriptor_t y_desc, int* y,
                    const bool as_tuple);

  static void Conv2D(const ExecutionContext& ctx,
                     const cnnlConvolutionDescriptor_t conv_desc,
                     const cnnlDataType_t tensor_dtype,
                     const cnnlDataType_t dt_onchip,
                     const void* input_position, const void* input_scale,
                     const void* input_offset, const void* filter_position,
                     const void* filter_scale, const void* filter_offset,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input,
                     const cnnlTensorDescriptor_t filter_desc,
                     const void* filter,
                     const cnnlTensorDescriptor_t bias_desc,
                     const void* bias,
                     const cnnlTensorDescriptor_t output_desc, void* output);
  static void ConvBackpropInput(
      const ExecutionContext& ctx,
      const cnnlConvolutionDescriptor_t conv_desc,
      const cnnlTensorDescriptor_t input_desc, const void* filter,
      const cnnlTensorDescriptor_t out_backprop_desc,
      const void* out_backprop,
      const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop);

  static void QuantizeConvBackpropInput(
      const ExecutionContext& ctx,
      const cnnlConvolutionDescriptor_t conv_desc,
      const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip,
      const void* filter_position, const void* filter_scale,
      const void* filter_offset, const void* out_backprop_position,
      const void* out_backprop_scale, const void* out_backprop_offset,
      const cnnlTensorDescriptor_t input_desc, const void* filter,
      const cnnlTensorDescriptor_t out_backprop_desc,
      const void* out_backprop,
      const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop);

  static void ConvBackpropFilter(
      const ExecutionContext& ctx,
      const cnnlConvolutionDescriptor_t conv_desc,
      const cnnlTensorDescriptor_t input_desc, const void* input,
      const cnnlTensorDescriptor_t out_backprop_desc,
      const void* out_backprop,
      const cnnlTensorDescriptor_t filter_backprop_desc,
      void* filter_backprop);

  static void QuantizeConvBackpropFilter(
      const ExecutionContext& ctx,
      const cnnlConvolutionDescriptor_t conv_desc,
      const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip,
      const void* input_position, const void* input_scale,
      const void* input_offset, const void* out_backprop_position,
      const void* out_backprop_scale, const void* out_backprop_offset,
      const cnnlTensorDescriptor_t input_desc, const void* input,
      const cnnlTensorDescriptor_t out_backprop_desc,
      const void* out_backprop,
      const cnnlTensorDescriptor_t filter_backprop_desc,
      void* filter_backprop);

  static void InTopK(const ExecutionContext& ctx,
                     const cnnlTensorDescriptor_t predictions_desc,
                     const void* predictions,
                     const cnnlTensorDescriptor_t targets_desc,
                     const void* targets,
                     const cnnlTensorDescriptor_t k_desc, const void* k,
                     const int k_int,
                     const cnnlTensorDescriptor_t output_desc, void* output);

  static void ScatterNd(const ExecutionContext& ctx,
                        const cnnlTensorDescriptor_t indices_desc,
                        const void* indices,
                        const cnnlTensorDescriptor_t updates_desc,
                        const void* updates,
                        const cnnlTensorDescriptor_t output_desc,
                        void* output);

  static void BitWise(const ExecutionContext& ctx,
                      const cnnlBitComputeOp_t optype,
                      const cnnlTensorDescriptor_t input1_desc,
                      const void* input1,
                      const cnnlTensorDescriptor_t input2_desc,
                      const void* input2,
                      const cnnlTensorDescriptor_t output_desc,
                      void* output);

  static void QR(const ExecutionContext& ctx,
                 const cnnlTensorDescriptor_t a_desc, const void* a,
                 const cnnlTensorDescriptor_t q_desc, void* q,
                 const cnnlTensorDescriptor_t r_desc, void* r,
                 const bool some);

  static void Reciprocal(const ExecutionContext& ctx,
                         const cnnlTensorDescriptor_t input_desc,
                         const void* input,
                         const cnnlTensorDescriptor_t output_desc,
                         void* output);
};
}  // namespace operators
...
...
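All of the entry points above share one calling convention: the op kernel passes the framework ExecutionContext, pre-built cnnlTensorDescriptor_t handles, and raw device pointers, and the static wrapper presumably resolves the CNNL handle and MLU queue from that context internally. The sketch below illustrates the call shape for one of the simpler wrappers; it is a minimal, hypothetical caller and not code from this change. The descriptor and pointer arguments are assumed to have been prepared by the surrounding operator code, and CNNL_COMPUTATION_HIGH_PRECISION is assumed to be the CNNL computation-preference value (CNNL_COMPUTATION_FAST being the usual alternative).

// Hypothetical caller sketch (illustration only, not part of this commit).
// Assumes in_desc/out_desc describe tensors of matching shape and dtype and
// that in_ptr/out_ptr point to MLU device memory.
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

void SqrtOnMLUExample(const ExecutionContext& ctx,
                      cnnlTensorDescriptor_t in_desc, const void* in_ptr,
                      cnnlTensorDescriptor_t out_desc, void* out_ptr) {
  // Forward elementwise sqrt through the static wrapper; the preference
  // flag selects the high-precision CNNL code path.
  MLUCnnl::Sqrt(ctx, CNNL_COMPUTATION_HIGH_PRECISION, in_desc, in_ptr,
                out_desc, out_ptr);
}

}  // namespace operators
}  // namespace paddle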