PaddlePaddle / Paddle, commit bb48b596 (unverified)
Authored Apr 03, 2023 by Young-Flash; committed via GitHub on Apr 03, 2023

delete paddle/fluid/operators/*_mlu.* files (#52435)

Parent: 0e3f7ab1
Showing 70 changed files with 0 additions and 12,274 deletions
paddle/fluid/operators/abs_op_mlu.cc  +0 -87
paddle/fluid/operators/activation_op_mlu.cc  +0 -612
paddle/fluid/operators/arg_max_op_mlu.cc  +0 -133
paddle/fluid/operators/argsort_op_mlu.cc  +0 -124
paddle/fluid/operators/assign_op_mlu.cc  +0 -48
paddle/fluid/operators/assign_value_op_mlu.cc  +0 -23
paddle/fluid/operators/batch_norm_op_mlu.cc  +0 -336
paddle/fluid/operators/bce_loss_op_mlu.cc  +0 -88
paddle/fluid/operators/cast_op_mlu.cc  +0 -70
paddle/fluid/operators/clip_op_mlu.cc  +0 -132
paddle/fluid/operators/concat_op_mlu.cc  +0 -170
paddle/fluid/operators/conv_op_mlu.cc  +0 -590
paddle/fluid/operators/conv_transpose_op_mlu.cc  +0 -311
paddle/fluid/operators/cumsum_op_mlu.cc  +0 -74
paddle/fluid/operators/deformable_conv_op_mlu.cc  +0 -317
paddle/fluid/operators/dropout_op_mlu.cc  +0 -207
paddle/fluid/operators/expand_as_v2_op_mlu.cc  +0 -109
paddle/fluid/operators/expand_v2_op_mlu.cc  +0 -120
paddle/fluid/operators/fill_any_like_op_mlu.cc  +0 -76
paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc  +0 -99
paddle/fluid/operators/fill_constant_op_mlu.cc  +0 -94
paddle/fluid/operators/flatten_op_mlu.cc  +0 -256
paddle/fluid/operators/gather_nd_op_mlu.cc  +0 -135
paddle/fluid/operators/gather_op_mlu.cc  +0 -127
paddle/fluid/operators/gaussian_random_op_mlu.cc  +0 -55
paddle/fluid/operators/grid_sampler_op_mlu.cc  +0 -109
paddle/fluid/operators/huber_loss_op_mlu.cc  +0 -185
paddle/fluid/operators/interpolate_v2_op_mlu.cc  +0 -546
paddle/fluid/operators/label_smooth_op_mlu.cc  +0 -87
paddle/fluid/operators/layer_norm_op_mlu.cc  +0 -280
paddle/fluid/operators/lookup_table_v2_op_mlu.cc  +0 -129
paddle/fluid/operators/masked_select_op_mlu.cc  +0 -204
paddle/fluid/operators/matmul_op_mlu.cc  +0 -389
paddle/fluid/operators/matmul_v2_op_mlu.cc  +0 -406
paddle/fluid/operators/mean_op_mlu.cc  +0 -146
paddle/fluid/operators/meshgrid_op_mlu.cc  +0 -80
paddle/fluid/operators/one_hot_v2_op_mlu.cc  +0 -103
paddle/fluid/operators/pool_op_mlu.cc  +0 -384
paddle/fluid/operators/randperm_op_mlu.cc  +0 -46
paddle/fluid/operators/range_op_mlu.cc  +0 -79
paddle/fluid/operators/reshape_op_mlu.cc  +0 -158
paddle/fluid/operators/rnn_op_mlu.cc  +0 -745
paddle/fluid/operators/roi_align_op_mlu.cc  +0 -296
paddle/fluid/operators/scale_op_mlu.cc  +0 -137
paddle/fluid/operators/scatter_op_mlu.cc  +0 -84
paddle/fluid/operators/set_value_op_mlu.cc  +0 -214
paddle/fluid/operators/shape_op_mlu.cc  +0 -69
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc  +0 -121
paddle/fluid/operators/size_op_mlu.cc  +0 -45
paddle/fluid/operators/slice_op_mlu.cc  +0 -217
paddle/fluid/operators/softmax_op_mlu.cc  +0 -126
paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc  +0 -182
paddle/fluid/operators/split_op_mlu.cc  +0 -93
paddle/fluid/operators/squared_l2_norm_op_mlu.cc  +0 -145
paddle/fluid/operators/squeeze_op_mlu.cc  +0 -65
paddle/fluid/operators/stack_op_mlu.cc  +0 -75
paddle/fluid/operators/strided_slice_op_mlu.cc  +0 -417
paddle/fluid/operators/sum_op_mlu.cc  +0 -80
paddle/fluid/operators/sync_batch_norm_op_mlu.cc  +0 -510
paddle/fluid/operators/tile_op_mlu.cc  +0 -125
paddle/fluid/operators/top_k_op_mlu.cc  +0 -91
paddle/fluid/operators/top_k_v2_op_mlu.cc  +0 -99
paddle/fluid/operators/transpose_op_mlu.cc  +0 -73
paddle/fluid/operators/tril_triu_op_mlu.cc  +0 -53
paddle/fluid/operators/truncated_gaussian_random_op_mlu.cc  +0 -62
paddle/fluid/operators/uniform_random_op_mlu.cc  +0 -114
paddle/fluid/operators/unsqueeze_op_mlu.cc  +0 -61
paddle/fluid/operators/unstack_op_mlu.cc  +0 -107
paddle/fluid/operators/where_index_op_mlu.cc  +0 -86
paddle/fluid/operators/where_op_mlu.cc  +0 -58
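Every file in this diff has the same shape: a C++ translation unit that defines one or more framework::OpKernel<T> subclasses which forward to the CNNL library through the MLUCnnl wrappers, and then registers those kernels per data type with the REGISTER_OP_MLU_KERNEL macro. For readers unfamiliar with that macro, the sketch below illustrates the general static-registration pattern such macros are built on. It is a minimal, self-contained illustration only; the registry, macro, and function names (KernelRegistry, REGISTER_DEMO_KERNEL, AbsFloatKernel) are hypothetical and do not reproduce Paddle's actual registry implementation.

// Hypothetical sketch of an op-kernel registry, illustrating the pattern
// behind macros like REGISTER_OP_MLU_KERNEL. Names and structure are
// illustrative only, not Paddle's real implementation.
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Registry mapping an op name to the kernel functions registered for it.
std::map<std::string, std::vector<std::function<void()>>>& KernelRegistry() {
  static std::map<std::string, std::vector<std::function<void()>>> registry;
  return registry;
}

// Helper whose constructor runs during static initialization, which is how
// registration macros typically add kernels before main() starts.
struct KernelRegistrar {
  KernelRegistrar(const std::string& op, std::function<void()> kernel) {
    KernelRegistry()[op].push_back(std::move(kernel));
  }
};

#define REGISTER_DEMO_KERNEL(op, fn) \
  static KernelRegistrar registrar_##op(#op, fn)

void AbsFloatKernel() { std::cout << "running abs<float> kernel\n"; }

// Expands to a static KernelRegistrar that records the kernel under "abs".
REGISTER_DEMO_KERNEL(abs, AbsFloatKernel);

int main() {
  // Look up and run every kernel registered under the op name "abs".
  for (auto& k : KernelRegistry()["abs"]) k();
  return 0;
}

Compiling and running the sketch prints "running abs<float> kernel", showing how a registration performed at static-initialization time becomes visible when the framework later looks up kernels by op name.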
paddle/fluid/operators/abs_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the Licnse. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class AbsMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc output_desc(*output);

    MLUCnnl::Abs(ctx, input_desc.get(), GetBasePtr(input),
                 output_desc.get(), GetBasePtr(output));
  }
};

template <typename T>
class AbsGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));

    dx->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc input_desc(*x);
    MLUCnnlOpTensorDesc mul_op_desc(
        CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);

    phi::DenseTensor sign_x;
    sign_x.mutable_data<T>(x->dims(), ctx.GetPlace());

    MLUCnnl::Sign(ctx, input_desc.get(), GetBasePtr(x),
                  input_desc.get(), GetBasePtr(&sign_x));
    MLUCnnl::OpTensor(ctx, mul_op_desc.get(),
                      input_desc.get(), GetBasePtr(&sign_x),
                      input_desc.get(), GetBasePtr(dout),
                      input_desc.get(), GetBasePtr(dx),
                      ToCnnlDataType<T>());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(abs,
                       ops::AbsMLUKernel<float>,
                       ops::AbsMLUKernel<plat::float16>);

REGISTER_OP_MLU_KERNEL(abs_grad,
                       ops::AbsGradMLUKernel<float>,
                       ops::AbsGradMLUKernel<plat::float16>);
paddle/fluid/operators/activation_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the Licnse. */

#include <memory>
#include <string>

#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <cnnlActivationMode_t act_mode, typename T>
class ActivationMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
    output->mutable_data<T>(ctx.GetPlace());

    MLUCnnlActivationDesc act_desc(act_mode, alpha);
    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc output_desc(*output);

    MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(), GetBasePtr(input),
                    output_desc.get(), GetBasePtr(output));
  }
};

// For gelu, leaky_relu
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV1 : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
    dx->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnlTensorDesc dx_desc(*dx);
    MLUCnnlActivationDesc act_desc(act_mode, alpha);
    MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
                        dout_desc.get(), GetBasePtr(dout),
                        x_desc.get(), GetBasePtr(x),
                        dx_desc.get(), GetBasePtr(dx));
  }
};

// For tanh, sigmoid
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV2 : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Input<phi::DenseTensor>("Out");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
    dx->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnlTensorDesc dx_desc(*dx);
    MLUCnnlActivationDesc act_desc(act_mode, alpha);
    MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr,
                        out_desc.get(), GetBasePtr(out),
                        dout_desc.get(), GetBasePtr(dout),
                        nullptr, nullptr,
                        dx_desc.get(), GetBasePtr(dx));
  }
};

// For relu, relu6
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV3 : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Input<phi::DenseTensor>("Out");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
    dx->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnlTensorDesc dx_desc(*dx);
    MLUCnnlActivationDesc act_desc(act_mode, alpha);
    MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
                        dout_desc.get(), GetBasePtr(dout),
                        out_desc.get(), GetBasePtr(out),
                        dx_desc.get(), GetBasePtr(dx));
  }
};

// For sqrt
template <typename T>
class SqrtMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);

    MLUCnnlTensorDesc input_desc(*x);
    MLUCnnlTensorDesc output_desc(*out);

    cnnlComputationPreference_t prefer = CNNL_COMPUTATION_FAST;
    MLUCnnl::Sqrt(ctx, prefer, input_desc.get(), GetBasePtr(x),
                  output_desc.get(), GetBasePtr(out));
  }
};

template <typename T>
class SqrtGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Input<phi::DenseTensor>("Out");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto place = ctx.GetPlace();
    dx->mutable_data<T>(place);

    MLUCnnlTensorDesc data_desc(*out);
    MLUCnnl::SqrtGrad(ctx, data_desc.get(), GetBasePtr(out), GetBasePtr(dout),
                      GetBasePtr(dx));
  }
};

// CNNL_LOG_E = 0,
// CNNL_LOG_2 = 1,
// CNNL_LOG_10 = 2,
template <cnnlLogBase_t Log_base, typename T>
class LogMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc output_desc(*output);
    cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION;

    MLUCnnl::Log(ctx, prefer, Log_base, input_desc.get(), GetBasePtr(input),
                 output_desc.get(), GetBasePtr(output));
  }
};

template <typename T>
class ExpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc output_desc(*output);
    cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION;

    MLUCnnl::Exp(ctx, prefer, input_desc.get(), GetBasePtr(input),
                 output_desc.get(), GetBasePtr(output));
  }
};

template <typename T>
class ExpGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Input<phi::DenseTensor>("Out");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    dx->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnlTensorDesc dx_desc(*dx);
    MLUCnnlTensorDesc out_desc(*out);

    MLUCnnlOpTensorDesc op_tensor_desc(
        CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);

    MLUCnnl::OpTensor(ctx, op_tensor_desc.get(),
                      dout_desc.get(), GetBasePtr(dout),
                      out_desc.get(), GetBasePtr(out),
                      dx_desc.get(), GetBasePtr(dx),
                      ToCnnlDataType<T>());
  }
};

template <typename T>
class HardSwishMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());
    float threshold = ctx.Attr<float>("threshold");
    float scale = ctx.Attr<float>("scale");
    float offset = ctx.Attr<float>("offset");
    PADDLE_ENFORCE_EQ(threshold, 6.0f,
                      platform::errors::External(
                          "Not support threshold [%f] in MLU", threshold));
    PADDLE_ENFORCE_EQ(scale, 6.0f,
                      platform::errors::External(
                          "Not support scale [%f] in MLU", scale));
    PADDLE_ENFORCE_EQ(offset, 3.0f,
                      platform::errors::External(
                          "Not support offset [%f] in MLU", offset));

    MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSWISH,
                                   1.0f /*ceof useless*/);
    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc output_desc(*output);

    MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(), GetBasePtr(input),
                    output_desc.get(), GetBasePtr(output));
  }
};

template <typename T>
class HardSwishGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    float threshold = ctx.Attr<float>("threshold");
    float scale = ctx.Attr<float>("scale");
    float offset = ctx.Attr<float>("offset");
    PADDLE_ENFORCE_EQ(threshold, 6.0f,
                      platform::errors::External(
                          "Not support threshold [%f] in MLU", threshold));
    PADDLE_ENFORCE_EQ(scale, 6.0f,
                      platform::errors::External(
                          "Not support scale [%f] in MLU", scale));
    PADDLE_ENFORCE_EQ(offset, 3.0f,
                      platform::errors::External(
                          "Not support offset [%f] in MLU", offset));

    auto* out = ctx.Input<phi::DenseTensor>("X");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    dx->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnlTensorDesc dx_desc(*dx);
    MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSWISH,
                                   1.0f /*ceof useless*/);
    MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
                        dout_desc.get(), GetBasePtr(dout),
                        out_desc.get(), GetBasePtr(out),
                        dx_desc.get(), GetBasePtr(dx));
  }
};

template <typename T>
class HardSigmoidMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    float slope = ctx.Attr<float>("slope");
    float offset = ctx.Attr<float>("offset");
    output->mutable_data<T>(ctx.GetPlace());

    MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSIGMOID,
                                   1.0f /*ceof useless*/,
                                   1.0f /*sliced_dim useless*/,
                                   slope, offset);
    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc output_desc(*output);

    MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(), GetBasePtr(input),
                    output_desc.get(), GetBasePtr(output));
  }
};

template <typename T>
class HardSigmoidGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    float slope = ctx.Attr<float>("slope");
    float offset = ctx.Attr<float>("offset");
    dx->mutable_data<T>(ctx.GetPlace());

    MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSIGMOID,
                                   1.0f /*ceof useless*/,
                                   1.0f /*sliced_dim useless*/,
                                   slope, offset);
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnlTensorDesc dx_desc(*dx);
    MLUCnnl::ActiveGrad(ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
                        dout_desc.get(), GetBasePtr(dout),
                        x_desc.get(), GetBasePtr(x),
                        dx_desc.get(), GetBasePtr(dx));
  }
};

template <typename T>
class FloorMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc output_desc(*output);

    MLUCnnl::Floor(ctx, input_desc.get(), GetBasePtr(input),
                   output_desc.get(), GetBasePtr(output));
  }
};

template <typename DeviceContext, typename T>
class ReciprocalMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::Reciprocal(ctx, x_desc.get(), GetBasePtr(x),
                        out_desc.get(), GetBasePtr(out));
  }
};

template <typename DeviceContext, typename T>
class ReciprocalGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Input<phi::DenseTensor>("Out");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto place = ctx.GetPlace();
    dx->mutable_data<T>(place);

    phi::DenseTensor square_out;
    square_out.Resize(out->dims());
    square_out.mutable_data<T>(place);

    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnlTensorDesc dx_desc(*dx);
    MLUCnnlTensorDesc square_out_desc(square_out);
    MLUCnnl::Square(ctx, out_desc.get(), GetBasePtr(out),
                    square_out_desc.get(), GetBasePtr(&square_out));

    cnnlOpTensorDesc_t op_tensor_op = CNNL_OP_TENSOR_MUL;
    cnnlDataType_t op_tensor_comp_type = CNNL_DTYPE_FLOAT;
    cnnlNanPropagation_t op_tensor_nan_opt = CNNL_NOT_PROPAGATE_NAN;
    MLUCnnlOpTensorDesc op_tensor_desc(
        op_tensor_op, op_tensor_comp_type, op_tensor_nan_opt);
    float alpha1_float = -1;
    float alpha2_float = 1;
    float beta_float = 0;
    MLUCnnl::OpTensor(ctx, op_tensor_desc.get(),
                      dout_desc.get(), GetBasePtr(dout),
                      square_out_desc.get(), GetBasePtr(&square_out),
                      dx_desc.get(), GetBasePtr(dx),
                      op_tensor_comp_type,
                      alpha1_float, alpha2_float, beta_float);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

// reciprocal
REGISTER_OP_MLU_KERNEL(
    reciprocal,
    ops::ReciprocalMLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::ReciprocalMLUKernel<paddle::platform::MLUDeviceContext,
                             paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    reciprocal_grad,
    ops::ReciprocalGradMLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::ReciprocalGradMLUKernel<paddle::platform::MLUDeviceContext,
                                 paddle::platform::float16>);

// relu
REGISTER_OP_MLU_KERNEL(
    relu,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, float>,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    relu_grad,
    ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU, float>,
    ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU,
                                   paddle::platform::float16>);

// relu6
REGISTER_OP_MLU_KERNEL(
    relu6,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU6, float>,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU6,
                             paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    relu6_grad,
    ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU6, float>,
    ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU6,
                                   paddle::platform::float16>);

// sigmoid
REGISTER_OP_MLU_KERNEL(
    sigmoid,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_SIGMOID, float>,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_SIGMOID,
                             paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    sigmoid_grad,
    ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_SIGMOID, float>,
    ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_SIGMOID,
                                   paddle::platform::float16>);

// tanh
REGISTER_OP_MLU_KERNEL(
    tanh,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_TANH, float>,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_TANH, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    tanh_grad,
    ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_TANH, float>,
    ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_TANH,
                                   paddle::platform::float16>);

// gelu
REGISTER_OP_MLU_KERNEL(
    gelu,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_GELU, float>,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_GELU, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    gelu_grad,
    ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_GELU, float>,
    ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_GELU,
                                   paddle::platform::float16>);

// leaky_relu
REGISTER_OP_MLU_KERNEL(
    leaky_relu,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_LEAKYRELU, float>,
    ops::ActivationMLUKernel<CNNL_ACTIVATION_LEAKYRELU,
                             paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    leaky_relu_grad,
    ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_LEAKYRELU, float>,
    ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_LEAKYRELU,
                                   paddle::platform::float16>);

// sqrt
REGISTER_OP_MLU_KERNEL(sqrt,
                       ops::SqrtMLUKernel<float>,
                       ops::SqrtMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(sqrt_grad,
                       ops::SqrtGradMLUKernel<float>,
                       ops::SqrtGradMLUKernel<paddle::platform::float16>);

// log log2 log10
REGISTER_OP_MLU_KERNEL(
    log,
    ops::LogMLUKernel<CNNL_LOG_E, float>,
    ops::LogMLUKernel<CNNL_LOG_E, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    log2,
    ops::LogMLUKernel<CNNL_LOG_2, float>,
    ops::LogMLUKernel<CNNL_LOG_2, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    log10,
    ops::LogMLUKernel<CNNL_LOG_10, float>,
    ops::LogMLUKernel<CNNL_LOG_10, paddle::platform::float16>);

REGISTER_OP_MLU_KERNEL(exp,
                       ops::ExpMLUKernel<float>,
                       ops::ExpMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(exp_grad,
                       ops::ExpGradMLUKernel<float>,
                       ops::ExpGradMLUKernel<paddle::platform::float16>);

REGISTER_OP_MLU_KERNEL(hard_swish,
                       ops::HardSwishMLUKernel<float>,
                       ops::HardSwishMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(hard_swish_grad,
                       ops::HardSwishGradMLUKernel<float>,
                       ops::HardSwishGradMLUKernel<paddle::platform::float16>);

REGISTER_OP_MLU_KERNEL(hard_sigmoid,
                       ops::HardSigmoidMLUKernel<float>,
                       ops::HardSigmoidMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    hard_sigmoid_grad,
    ops::HardSigmoidGradMLUKernel<float>,
    ops::HardSigmoidGradMLUKernel<paddle::platform::float16>);

REGISTER_OP_MLU_KERNEL(floor,
                       ops::FloorMLUKernel<float>,
                       ops::FloorMLUKernel<paddle::platform::float16>);
paddle/fluid/operators/arg_max_op_mlu.cc (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class ArgMaxMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto axis = static_cast<int>(ctx.Attr<int64_t>("axis"));
    auto dtype = ctx.Attr<int>("dtype");
    const bool& flatten = ctx.Attr<bool>("flatten");

    if (x->numel() == 0) return;
    PADDLE_ENFORCE_EQ(
        (dtype == 2 || dtype == 3), true,
        platform::errors::InvalidArgument(
            "The attribute of dtype in argmax op must be [%s] or [%s], but "
            "received [%s]",
            paddle::framework::DataTypeToString(
                framework::proto::VarType::INT64),
            paddle::framework::DataTypeToString(
                framework::proto::VarType::INT32),
            paddle::framework::DataTypeToString(
                static_cast<framework::proto::VarType::Type>(dtype))));

    if (axis < 0) {
      framework::DDim x_dims;
      x_dims = x->dims();
      axis += x_dims.size();
    }

    phi::DenseTensor flatten_x(x->type());
    flatten_x.ShareDataWith(*x);
    if (flatten) {
      flatten_x.Resize(phi::make_ddim({x->numel()}));
      // if flatten, the axis just as 0
      axis = 0;
    }
    std::vector<int> reduce_dims;
    reduce_dims.push_back(axis);

    auto out_dims = out->dims();
    int out_count = out_dims[0];
    for (int i = 1; i < out_dims.size(); i++) {
      out_count = out_count * out_dims[i];
    }
    size_t indices_size_inbytes = out_count * sizeof(int32_t);
    auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
    phi::DenseTensor value_out =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>(out->dims(), dev_ctx);
    MLUCnnlTensorDesc value_out_desc(value_out);
    MLUCnnlTensorDesc input_desc(
        flatten_x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(flatten_x.dtype()));
    MLUCnnlReduceDesc reduction_desc(reduce_dims,
                                     CNNL_REDUCE_MAX,
                                     ToCnnlDataType<T>(),
                                     CNNL_NOT_PROPAGATE_NAN,
                                     CNNL_REDUCE_ONLY_INDICES,
                                     CNNL_32BIT_INDICES);

    if (dtype == 2) {
      out->template mutable_data<int32_t>(ctx.GetPlace());
      MLUCnnl::Reduce(ctx,
                      true /*need_workspace*/,
                      reduction_desc.get(),
                      nullptr,
                      input_desc.get(),
                      GetBasePtr(&flatten_x),
                      indices_size_inbytes /*indices_size*/,
                      GetBasePtr(out),
                      nullptr,
                      value_out_desc.get(),
                      GetBasePtr(&value_out));
    } else {
      out->template mutable_data<int64_t>(ctx.GetPlace());
      phi::DenseTensor out_int32 =
          ctx.AllocateTmpTensor<int32_t, MLUDeviceContext>(out->dims(),
                                                           dev_ctx);
      MLUCnnl::Reduce(ctx,
                      true /*need_workspace*/,
                      reduction_desc.get(),
                      nullptr,
                      input_desc.get(),
                      GetBasePtr(&flatten_x),
                      indices_size_inbytes /*indices_size*/,
                      GetBasePtr(&out_int32),
                      nullptr,
                      value_out_desc.get(),
                      GetBasePtr(&value_out));

      // cast indices type to int64
      MLUCnnlTensorDesc out_int32_desc(out_int32);
      MLUCnnlTensorDesc cast_output_desc(*out);
      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
      MLUCnnl::Cast(ctx, cast_type,
                    out_int32_desc.get(), GetBasePtr(&out_int32),
                    cast_output_desc.get(), GetBasePtr(out));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(arg_max,
                       ops::ArgMaxMLUKernel<int>,
                       ops::ArgMaxMLUKernel<float>,
                       ops::ArgMaxMLUKernel<paddle::platform::float16>);
paddle/fluid/operators/argsort_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class ArgsortMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    auto* indices = ctx.Output<phi::DenseTensor>("Indices");
    const auto& place = ctx.GetPlace();

    const auto& sorted = true;
    const bool descending = ctx.Attr<bool>("descending");

    // axis < 0, cacluate the real axis
    int axis = static_cast<int>(ctx.Attr<int>("axis"));
    if (axis < 0) {
      const auto& in_dims = input->dims();
      axis += in_dims.size();
    }

    auto in_dims = input->dims();
    size_t k = in_dims[axis];

    output->mutable_data<T>(place);
    indices->mutable_data<int64_t>(place);

    // cnnl only support int32/int16 type of indices
    phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
    indices_int32.Resize(indices->dims());
    indices_int32.mutable_data<int32_t>(place);

    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc values_output_desc(*output);
    MLUCnnlTensorDesc indices_int32_desc(indices_int32);
    MLUCnnl::TopK(ctx, k, axis, descending, sorted,
                  input_desc.get(), GetBasePtr(input),
                  values_output_desc.get(), GetBasePtr(output),
                  indices_int32_desc.get(), GetBasePtr(&indices_int32));

    // cast indices type to int64
    MLUCnnlTensorDesc cast_output_desc(*indices);
    cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
    MLUCnnl::Cast(ctx, cast_type,
                  indices_int32_desc.get(), GetBasePtr(&indices_int32),
                  cast_output_desc.get(), GetBasePtr(indices));
  }
};

template <typename T>
class ArgsortGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* indices = ctx.Input<phi::DenseTensor>("Indices");
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    int axis = ctx.Attr<int>("axis");
    dx->mutable_data<T>(ctx.GetPlace());

    auto in_dims = indices->dims();
    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
    if (dout->numel() == 0) return;

    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnlTensorDesc indices_desc(*indices);
    MLUCnnlTensorDesc dx_desc(*dx);
    MLUCnnl::ScatterFunctor(ctx, dx_desc.get(), GetBasePtr(dx),
                            dout_desc.get(), GetBasePtr(dout),
                            indices_desc.get(), GetBasePtr(indices), axis);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(argsort,
                       ops::ArgsortMLUKernel<paddle::platform::float16>,
                       ops::ArgsortMLUKernel<float>,
                       ops::ArgsortMLUKernel<int8_t>,
                       ops::ArgsortMLUKernel<uint8_t>,
                       ops::ArgsortMLUKernel<int16_t>,
                       ops::ArgsortMLUKernel<int>);

REGISTER_OP_MLU_KERNEL(argsort_grad,
                       ops::ArgsortGradMLUKernel<paddle::platform::float16>,
                       ops::ArgsortGradMLUKernel<float>,
                       ops::ArgsortGradMLUKernel<int8_t>,
                       ops::ArgsortGradMLUKernel<uint8_t>,
                       ops::ArgsortGradMLUKernel<int16_t>,
                       ops::ArgsortGradMLUKernel<int>);
paddle/fluid/operators/assign_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <string>

#include "paddle/fluid/operators/assign_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {

template <typename T>
class AssignMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::Assign(ctx, x_desc.get(), GetBasePtr(x),
                    out_desc.get(), GetBasePtr(out));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(assign,
                       ops::AssignMLUKernel<int>,
                       ops::AssignMLUKernel<float>,
                       ops::AssignMLUKernel<plat::float16>,
                       ops::AssignMLUKernel<bool>)
paddle/fluid/operators/assign_value_op_mlu.cc (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/operators/assign_value_op.h"

namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(assign_value,
                       ops::AssignValueKernel<bool>,
                       ops::AssignValueKernel<int>,
                       ops::AssignValueKernel<int64_t>,
                       ops::AssignValueKernel<float>);
paddle/fluid/operators/batch_norm_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class MLUBatchNormOpKernel : public framework::OpKernel<T> {
  using MPDType = typename details::MPTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto& place = ctx.GetPlace();
    const float epsilon = ctx.Attr<float>("epsilon");
    float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
    bool test_mode = is_test && (!trainable_stats);
    bool global_stats = test_mode || use_global_stats;

    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
    DataLayout data_layout = phi::StringToDataLayout(data_layout_str);

    const auto* x = ctx.Input<phi::DenseTensor>("X");
    const auto& x_dims = x->dims();
    PADDLE_ENFORCE_GE(
        x_dims.size(), 2,
        platform::errors::InvalidArgument(
            "The size of input X's dimensions should be larger than 1."
            "But received: the size of input X's dimensions is [%d]",
            x_dims.size()));
    PADDLE_ENFORCE_LE(
        x_dims.size(), 5,
        platform::errors::InvalidArgument(
            "The size of input X's dimensions should be less than 6."
            "But received: the size of input X's dimensions is [%d]",
            x_dims.size()));
    const int N = x_dims[0];
    const int C = (data_layout == DataLayout::kNCHW
                       ? x_dims[1]
                       : x_dims[x_dims.size() - 1]);
    const int sample_size = x->numel() / N / C;

    const auto* running_mean = ctx.Input<phi::DenseTensor>("Mean");
    const auto* running_var = ctx.Input<phi::DenseTensor>("Variance");
    const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
    const auto* bias = ctx.Input<phi::DenseTensor>("Bias");

    auto* y = ctx.Output<phi::DenseTensor>("Y");
    auto* mean_out = ctx.Output<phi::DenseTensor>("MeanOut");
    auto* variance_out = ctx.Output<phi::DenseTensor>("VarianceOut");
    auto* saved_mean = ctx.Output<phi::DenseTensor>("SavedMean");
    auto* saved_variance = ctx.Output<phi::DenseTensor>("SavedVariance");

    // alloc memory
    y->mutable_data<T>(place);
    mean_out->mutable_data<MPDType>(place);
    variance_out->mutable_data<MPDType>(place);
    saved_mean->mutable_data<MPDType>(place);
    saved_variance->mutable_data<MPDType>(place);

    phi::DenseTensor transformed_x;
    phi::DenseTensor transformed_y;
    const int transformed_dim_size = 4;
    const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C};
    MLUCnnlTensorDesc transformed_desc(transformed_dim_size,
                                       transformed_shape,
                                       ToCnnlDataType<T>(),
                                       CNNL_LAYOUT_NHWC);
    MLUCnnlTensorDesc others_input_desc(*scale);
    // input dimension is 2 and the format is NCHW. The input can be regarded
    // as NHWC format. Don't need to transpose.
    bool need_transpose =
        (data_layout == DataLayout::kNCHW && x_dims.size() != 2);
    if (need_transpose) {
      auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
      transformed_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
          framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
      transformed_y = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
          framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);

      const int x_reshaped[] = {N, C, sample_size, 1};
      MLUCnnlTensorDesc x_reshaped_desc(
          transformed_dim_size, x_reshaped, ToCnnlDataType<T>());
      const std::vector<int> perm = {0, 2, 3, 1};
      MLUCnnl::Transpose(ctx, perm, transformed_dim_size,
                         x_reshaped_desc.get(), GetBasePtr(x),
                         transformed_desc.get(), GetBasePtr(&transformed_x));
    } else {
      transformed_x = *x;
      transformed_y = *y;
    }

    if (ctx.HasInput("MomentumTensor")) {
      const auto* mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
      phi::DenseTensor mom_cpu;
      framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
      momentum = mom_cpu.data<float>()[0];
    }

    MLUCnnl::FusedBatchNorm(ctx, !global_stats,
                            transformed_desc.get(),
                            GetBasePtr(&transformed_x),
                            others_input_desc.get(),
                            GetBasePtr(scale), GetBasePtr(bias),
                            GetBasePtr(running_mean), GetBasePtr(running_var),
                            epsilon, momentum,
                            transformed_desc.get(),
                            GetBasePtr(&transformed_y),
                            GetBasePtr(mean_out), GetBasePtr(variance_out),
                            GetBasePtr(saved_mean),
                            GetBasePtr(saved_variance));

    if (need_transpose) {
      const int y_reshaped[] = {N, C, sample_size, 1};
      MLUCnnlTensorDesc y_reshaped_desc(
          transformed_dim_size, y_reshaped, ToCnnlDataType<T>());
      const std::vector<int> perm = {0, 3, 1, 2};
      MLUCnnl::Transpose(ctx, perm, transformed_y.dims().size(),
                         transformed_desc.get(), GetBasePtr(&transformed_y),
                         y_reshaped_desc.get(), GetBasePtr(y));
    }
  }
};

template <typename T>
class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
  using MPDType = typename details::MPTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* x = ctx.Input<phi::DenseTensor>("X");
    const auto* d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
    const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
    const auto* bias = ctx.Input<phi::DenseTensor>("Bias");
    const auto* saved_mean = ctx.Input<phi::DenseTensor>("SavedMean");
    // SavedVariance have been reverted in forward operator
    const auto* saved_inv_variance =
        ctx.Input<phi::DenseTensor>("SavedVariance");
    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
    bool use_global_stats = ctx.Attr<bool>("use_global_stats");
    const bool is_test = ctx.Attr<bool>("is_test");
    const float epsilon = ctx.Attr<float>("epsilon");
    DataLayout data_layout = phi::StringToDataLayout(data_layout_str);

    auto* d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* d_scale =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
    auto* d_bias =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));

    auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
    auto d_x_tmp =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>(x->dims(), dev_ctx);
    auto scale_grad_tmp = ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(
        scale->dims(), dev_ctx);
    auto bias_grad_tmp = ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(
        bias->dims(), dev_ctx);

    if (d_x == nullptr) {
      d_x = &d_x_tmp;
    }
    if (d_scale == nullptr) {
      d_scale = &scale_grad_tmp;
    }
    if (d_bias == nullptr) {
      d_bias = &bias_grad_tmp;
    }

    const auto& place = ctx.GetPlace();
    d_x->mutable_data<T>(place);
    d_scale->mutable_data<MPDType>(place);
    d_bias->mutable_data<MPDType>(place);

    use_global_stats = is_test || use_global_stats;

    const auto& x_dims = x->dims();
    PADDLE_ENFORCE_GE(
        x_dims.size(), 2,
        platform::errors::InvalidArgument(
            "The size of input X's dimensions should be larger than 1."
            "But received: the size of input X's dimensions is [%d]",
            x_dims.size()));
    PADDLE_ENFORCE_LE(
        x_dims.size(), 5,
        platform::errors::InvalidArgument(
            "The size of input X's dimensions should be less than 6."
            "But received: the size of input X's dimensions is [%d]",
            x_dims.size()));
    const int N = x_dims[0];
    const int C = (data_layout == DataLayout::kNCHW
                       ? x_dims[1]
                       : x_dims[x_dims.size() - 1]);
    const int sample_size = x->numel() / N / C;

    phi::DenseTensor transformed_d_y;
    phi::DenseTensor transformed_x;
    phi::DenseTensor transformed_d_x;
    const int transformed_dim_size = 4;
    const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C};

    MLUCnnlTensorDesc transformed_desc(transformed_dim_size,
                                       transformed_shape,
                                       ToCnnlDataType<T>(),
                                       CNNL_LAYOUT_NHWC);
    MLUCnnlTensorDesc others_input_desc(*scale);

    bool need_transpose =
        (data_layout == DataLayout::kNCHW && x_dims.size() != 2);
    if (need_transpose) {
      transformed_d_y = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
          framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
      transformed_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
          framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
      transformed_d_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
          framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);

      const int org_reshaped[] = {N, C, sample_size, 1};
      MLUCnnlTensorDesc org_reshaped_desc(
          transformed_dim_size, org_reshaped, ToCnnlDataType<T>());
      const std::vector<int> perm = {0, 2, 3, 1};
      MLUCnnl::Transpose(ctx, perm, transformed_dim_size,
                         org_reshaped_desc.get(), GetBasePtr(d_y),
                         transformed_desc.get(),
                         GetBasePtr(&transformed_d_y));
      MLUCnnl::Transpose(ctx, perm, transformed_dim_size,
                         org_reshaped_desc.get(), GetBasePtr(x),
                         transformed_desc.get(), GetBasePtr(&transformed_x));
    } else {
      transformed_d_y = *d_y;
      transformed_x = *x;
      transformed_d_x = *d_x;
    }

    if (use_global_stats) {
      const auto* running_mean = ctx.Input<phi::DenseTensor>("Mean");
      const auto* running_variance = ctx.Input<phi::DenseTensor>("Variance");
      MLUCnnl::FusedBatchNormGrad(ctx, false /*is_training*/,
                                  transformed_desc.get(),
                                  GetBasePtr(&transformed_d_y),
                                  transformed_desc.get(),
                                  GetBasePtr(&transformed_x),
                                  others_input_desc.get(),
                                  GetBasePtr(scale),
                                  GetBasePtr(running_mean),
                                  GetBasePtr(running_variance),
                                  epsilon,
                                  transformed_desc.get(),
                                  GetBasePtr(&transformed_d_x),
                                  GetBasePtr(d_scale), GetBasePtr(d_bias));
    } else {
      MLUCnnl::FusedBatchNormGrad(ctx, true /*is_training*/,
                                  transformed_desc.get(),
                                  GetBasePtr(&transformed_d_y),
                                  transformed_desc.get(),
                                  GetBasePtr(&transformed_x),
                                  others_input_desc.get(),
                                  GetBasePtr(scale),
                                  GetBasePtr(saved_mean),
                                  GetBasePtr(saved_inv_variance),
                                  epsilon,
                                  transformed_desc.get(),
                                  GetBasePtr(&transformed_d_x),
                                  GetBasePtr(d_scale), GetBasePtr(d_bias));
    }

    if (need_transpose) {
      const int d_x_reshaped[] = {N, C, sample_size, 1};
      MLUCnnlTensorDesc d_x_reshaped_desc(
          transformed_dim_size, d_x_reshaped, ToCnnlDataType<T>());
      const std::vector<int> perm = {0, 3, 1, 2};
      MLUCnnl::Transpose(ctx, perm, transformed_dim_size,
                         transformed_desc.get(),
                         GetBasePtr(&transformed_d_x),
                         d_x_reshaped_desc.get(), GetBasePtr(d_x));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(batch_norm,
                       ops::MLUBatchNormOpKernel<float>,
                       ops::MLUBatchNormOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(batch_norm_grad,
                       ops::MLUBatchNormGradOpKernel<float>,
                       ops::MLUBatchNormGradOpKernel<plat::float16>);
paddle/fluid/operators/bce_loss_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class BCELossMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* labels = ctx.Input<phi::DenseTensor>("Label");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc label_desc(*labels);
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::BceLoss(ctx, CNNL_BCE_LOSS_NONE,
                     x_desc.get(), GetBasePtr(x),
                     label_desc.get(), GetBasePtr(labels),
                     nullptr, nullptr,
                     out_desc.get(), GetBasePtr(out));
  }
};

template <typename T>
class BCELossGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* labels = ctx.Input<phi::DenseTensor>("Label");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    dx->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc label_desc(*labels);
    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnl::BceLossBackward(ctx, CNNL_BCE_LOSS_NONE,
                             dout_desc.get(), GetBasePtr(dout),
                             x_desc.get(), GetBasePtr(x),
                             label_desc.get(), GetBasePtr(labels),
                             nullptr, nullptr,
                             x_desc.get(), GetBasePtr(dx));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(bce_loss,
                       ops::BCELossMLUKernel<float>,
                       ops::BCELossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(bce_loss_grad,
                       ops::BCELossGradMLUKernel<float>,
                       ops::BCELossGradMLUKernel<plat::float16>);
paddle/fluid/operators/cast_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"

namespace paddle {
namespace operators {

template <typename T>
class CastMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    auto src_type = static_cast<VT::Type>(ctx.Attr<int>("in_dtype"));
    auto dst_type = static_cast<VT::Type>(ctx.Attr<int>("out_dtype"));
    auto place = ctx.GetPlace();

    if (src_type == dst_type) {
      auto& dev_ctx =
          ctx.template device_context<platform::MLUDeviceContext>();
      output->mutable_data<T>(place);
      framework::TensorCopy(*input, place, dev_ctx, output);
      return;
    }

    PADDLE_ENFORCE_EQ(MLUSupportsCast(src_type, dst_type), true,
                      platform::errors::InvalidArgument(
                          "MLU not support cast [%d] to [%d]",
                          framework::DataTypeToString(src_type),
                          framework::DataTypeToString(dst_type)));

    output->mutable_data(place, framework::TransToPhiDataType(dst_type));

    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc output_desc(*output);
    cnnlCastDataType_t cast_type = GetCastDataType(src_type, dst_type);

    MLUCnnl::Cast(ctx, cast_type,
                  input_desc.get(), GetBasePtr(input),
                  output_desc.get(), GetBasePtr(output));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(cast,
                       ops::CastMLUKernel<float>,
                       ops::CastMLUKernel<int>,
                       ops::CastMLUKernel<int16_t>,
                       ops::CastMLUKernel<uint8_t>,
                       ops::CastMLUKernel<bool>,
                       ops::CastMLUKernel<int64_t>,
                       ops::CastMLUKernel<paddle::platform::float16>);
paddle/fluid/operators/clip_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
ClipMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
x
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
*
out
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Out"
);
auto
min
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"min"
));
auto
max
=
static_cast
<
T
>
(
ctx
.
Attr
<
float
>
(
"max"
));
if
(
ctx
.
HasInput
(
"Min"
))
{
phi
::
DenseTensor
min_cpu
;
auto
*
min_tensor
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Min"
);
auto
*
min_data
=
min_tensor
->
data
<
T
>
();
if
(
platform
::
is_mlu_place
(
min_tensor
->
place
()))
{
paddle
::
framework
::
TensorCopySync
(
*
min_tensor
,
platform
::
CPUPlace
(),
&
min_cpu
);
min_data
=
min_cpu
.
data
<
T
>
();
}
min
=
min_data
[
0
];
}
if
(
ctx
.
HasInput
(
"Max"
))
{
phi
::
DenseTensor
max_cpu
;
auto
*
max_tensor
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Max"
);
auto
*
max_data
=
max_tensor
->
data
<
T
>
();
if
(
platform
::
is_mlu_place
(
max_tensor
->
place
()))
{
paddle
::
framework
::
TensorCopySync
(
*
max_tensor
,
platform
::
CPUPlace
(),
&
max_cpu
);
max_data
=
max_cpu
.
data
<
T
>
();
}
max
=
max_data
[
0
];
}
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
MLUCnnlTensorDesc
x_desc
(
*
x
);
MLUCnnlTensorDesc
out_desc
(
*
out
);
MLUCnnl
::
Clip
(
ctx
,
x_desc
.
get
(),
GetBasePtr
(
x
),
static_cast
<
const
void
*>
(
&
min
),
static_cast
<
const
void
*>
(
&
max
),
GetBasePtr
(
out
));
}
};
template
<
typename
T
>
class
ClipGradMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
x
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
*
dout
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"X"
));
dx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
min_tensor
=
ctx
.
HasInput
(
"Min"
)
?
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Min"
)
:
nullptr
;
auto
*
max_tensor
=
ctx
.
HasInput
(
"Max"
)
?
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Max"
)
:
nullptr
;
auto
min_val
=
ctx
.
Attr
<
float
>
(
"min"
);
if
(
min_tensor
)
{
phi
::
DenseTensor
min_data
;
framework
::
TensorCopy
(
*
min_tensor
,
platform
::
CPUPlace
(),
ctx
.
template
device_context
<
platform
::
DeviceContext
>(),
&
min_data
);
ctx
.
template
device_context
<
paddle
::
platform
::
MLUDeviceContext
>().
Wait
();
min_val
=
static_cast
<
float
>
(
min_data
.
data
<
T
>
()[
0
]);
}
auto
max_val
=
ctx
.
Attr
<
float
>
(
"max"
);
if
(
max_tensor
)
{
phi
::
DenseTensor
max_data
;
framework
::
TensorCopy
(
*
max_tensor
,
platform
::
CPUPlace
(),
ctx
.
template
device_context
<
platform
::
DeviceContext
>(),
&
max_data
);
ctx
.
template
device_context
<
paddle
::
platform
::
MLUDeviceContext
>().
Wait
();
max_val
=
static_cast
<
float
>
(
max_data
.
data
<
T
>
()[
0
]);
}
MLUCnnlTensorDesc
x_desc
(
*
x
);
MLUCnnlTensorDesc
dx_desc
(
*
dx
);
MLUCnnlTensorDesc
dout_desc
(
*
dout
);
MLUCnnl
::
HardtanhBackward
(
ctx
,
x_desc
.
get
(),
GetBasePtr
(
x
),
dout_desc
.
get
(),
GetBasePtr
(
dout
),
max_val
,
min_val
,
dx_desc
.
get
(),
GetBasePtr
(
dx
));
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_MLU_KERNEL
(
clip
,
ops
::
ClipMLUKernel
<
float
>
,
ops
::
ClipMLUKernel
<
plat
::
float16
>
);
REGISTER_OP_MLU_KERNEL
(
clip_grad
,
ops
::
ClipGradMLUKernel
<
float
>
,
ops
::
ClipGradMLUKernel
<
plat
::
float16
>
);
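Note that the deleted clip_grad kernel above routes clip's backward pass through CNNL's hardtanh-backward primitive; both follow the same rule, passing the upstream gradient through only where the input lies strictly inside [min, max]. A minimal host-side sketch of that rule, independent of the CNNL API (the helper name ClipBackwardReference is invented for illustration):

#include <cstddef>
#include <vector>

// Illustrative reference (not the CNNL call): clip/hardtanh backward rule.
// dx[i] = dout[i] where min_val < x[i] < max_val, otherwise 0.
std::vector<float> ClipBackwardReference(const std::vector<float>& x,
                                         const std::vector<float>& dout,
                                         float min_val, float max_val) {
  std::vector<float> dx(x.size(), 0.0f);
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (x[i] > min_val && x[i] < max_val) dx[i] = dout[i];
  }
  return dx;
}

int main() {
  auto dx = ClipBackwardReference({-2.0f, 0.5f, 3.0f}, {1.0f, 1.0f, 1.0f}, -1.0f, 1.0f);
  // dx == {0, 1, 0}: only the in-range element passes the gradient through.
  return (dx[0] == 0.0f && dx[1] == 1.0f && dx[2] == 0.0f) ? 0 : 1;
}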
paddle/fluid/operators/concat_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/core/tensor_utils.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
ConcatMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
ins
=
ctx
.
MultiInput
<
phi
::
DenseTensor
>
(
"X"
);
phi
::
DenseTensor
*
out
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Out"
);
PADDLE_ENFORCE_NOT_NULL
(
ins
[
0
],
platform
::
errors
::
NotFound
(
"The first input tensor is not initalized."
));
auto
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
auto
ins_size
=
ins
.
size
();
bool
need_resize_out_dims
=
false
;
if
(
ctx
.
HasInput
(
"AxisTensor"
))
{
auto
*
axis_tensor
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"AxisTensor"
);
axis
=
phi
::
GetVectorFromTensor
<
int
>
(
axis_tensor
)[
0
];
need_resize_out_dims
=
true
;
}
axis
=
ComputeAxis
(
static_cast
<
int64_t
>
(
axis
),
static_cast
<
int64_t
>
(
ins
[
0
]
->
dims
().
size
()));
if
(
need_resize_out_dims
)
{
const
size_t
n
=
ins
.
size
();
std
::
vector
<
framework
::
DDim
>
ins_dims
(
n
);
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
{
ins_dims
[
i
]
=
ins
[
i
]
->
dims
();
}
framework
::
DDim
out_dims
=
phi
::
funcs
::
ComputeAndCheckShape
(
true
,
ins_dims
,
axis
);
out
->
Resize
(
out_dims
);
}
const
int
axis_t
=
axis
;
const
int
ins_size_t
=
ins_size
;
auto
place
=
ctx
.
GetPlace
();
out
->
mutable_data
<
T
>
(
place
);
// mlu should do sth
// init ins tensors
std
::
vector
<
const
void
*>
inputs
;
std
::
vector
<
MLUCnnlTensorDesc
>
input_descs
;
std
::
vector
<
cnnlTensorDescriptor_t
>
desc_vector
;
for
(
size_t
i
=
0
;
i
<
ins_size
;
i
++
)
{
input_descs
.
emplace_back
(
MLUCnnlTensorDesc
(
*
ins
[
i
],
CNNL_LAYOUT_ARRAY
,
ToCnnlDataType
(
ins
[
i
]
->
dtype
())));
desc_vector
.
push_back
(
input_descs
.
back
().
get
());
inputs
.
push_back
(
GetBasePtr
(
ins
[
i
]));
}
// init out tensors
MLUCnnlTensorDesc
output_desc
(
*
out
,
CNNL_LAYOUT_ARRAY
,
ToCnnlDataType
(
out
->
dtype
()));
// MLU should do sth
MLUCnnl
::
Concat
(
ctx
,
ins_size_t
,
axis_t
,
desc_vector
.
data
(),
inputs
.
data
(),
output_desc
.
get
(),
GetBasePtr
(
out
));
}
};
template
<
typename
T
>
class
ConcatGradMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
out_grad
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
ins
=
ctx
.
MultiInput
<
phi
::
DenseTensor
>
(
"X"
);
auto
out_var_names
=
ctx
.
OutputNames
(
framework
::
GradVarName
(
"X"
));
auto
outs
=
ctx
.
MultiOutput
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"X"
));
auto
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
int
split_num
=
ins
.
size
();
PADDLE_ENFORCE_NOT_NULL
(
ins
[
0
],
platform
::
errors
::
NotFound
(
"The first input tensor is not initalized."
));
if
(
ctx
.
HasInput
(
"AxisTensor"
))
{
auto
*
axis_tensor
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"AxisTensor"
);
axis
=
phi
::
GetVectorFromTensor
<
int
>
(
axis_tensor
)[
0
];
}
axis
=
ComputeAxis
(
static_cast
<
int64_t
>
(
axis
),
static_cast
<
int64_t
>
(
ins
[
0
]
->
dims
().
size
()));
PADDLE_ENFORCE_GE
(
axis
,
0
,
platform
::
errors
::
InvalidArgument
(
"concat_grad: axis should be larger than or "
"equal to 0, but received axis is %d."
,
axis
));
PADDLE_ENFORCE_LT
(
axis
,
out_grad
->
dims
().
size
(),
platform
::
errors
::
InvalidArgument
(
"concat_grad: axis should be less than ins[0]->dims()!"
"But received axis is %d, while ins[0]->dims()"
"size is %d."
,
axis
,
out_grad
->
dims
().
size
()));
// get output tensor that the name is not kEmptyVarName
std
::
vector
<
void
*>
outputs_vec
;
std
::
vector
<
phi
::
DenseTensor
>
tmp_outputs_vec
;
std
::
vector
<
MLUCnnlTensorDesc
>
output_descs
;
std
::
vector
<
cnnlTensorDescriptor_t
>
descs_vec
;
for
(
size_t
j
=
0
;
j
<
outs
.
size
();
++
j
)
{
if
(
out_var_names
[
j
]
!=
framework
::
kEmptyVarName
&&
outs
[
j
]
->
numel
()
!=
0UL
)
{
outs
[
j
]
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
output_descs
.
emplace_back
(
MLUCnnlTensorDesc
(
*
outs
[
j
]));
outputs_vec
.
push_back
(
GetBasePtr
(
outs
[
j
]));
}
else
{
phi
::
DenseTensor
tmp_tensor
;
tmp_tensor
.
mutable_data
<
T
>
(
ins
[
j
]
->
dims
(),
ctx
.
GetPlace
());
tmp_outputs_vec
.
push_back
(
tmp_tensor
);
output_descs
.
emplace_back
(
MLUCnnlTensorDesc
(
*
ins
[
j
]));
outputs_vec
.
push_back
(
GetBasePtr
(
&
(
tmp_outputs_vec
.
back
())));
}
descs_vec
.
push_back
(
output_descs
.
back
().
get
());
}
MLUCnnlTensorDesc
out_grad_desc
(
*
out_grad
);
MLUCnnl
::
Split
(
ctx
,
static_cast
<
int
>
(
split_num
),
static_cast
<
int
>
(
axis
),
out_grad_desc
.
get
(),
GetBasePtr
(
out_grad
),
descs_vec
.
data
(),
outputs_vec
.
data
());
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_MLU_KERNEL
(
concat
,
ops
::
ConcatMLUKernel
<
float
>
,
ops
::
ConcatMLUKernel
<
paddle
::
platform
::
float16
>
,
ops
::
ConcatMLUKernel
<
int64_t
>
,
ops
::
ConcatMLUKernel
<
bool
>
,
ops
::
ConcatMLUKernel
<
int
>
,
ops
::
ConcatMLUKernel
<
uint8_t
>
);
REGISTER_OP_MLU_KERNEL
(
concat_grad
,
ops
::
ConcatGradMLUKernel
<
float
>
,
ops
::
ConcatGradMLUKernel
<
paddle
::
platform
::
float16
>
,
ops
::
ConcatGradMLUKernel
<
int64_t
>
,
ops
::
ConcatGradMLUKernel
<
bool
>
,
ops
::
ConcatGradMLUKernel
<
int
>
,
ops
::
ConcatGradMLUKernel
<
uint8_t
>
);
paddle/fluid/operators/conv_op_mlu.cc (deleted, 100644 → 0)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace
paddle
{
namespace
operators
{
using
DataLayout
=
phi
::
DataLayout
;
template
<
typename
T
>
class
MLUConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
phi
::
DenseTensor
*
input
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Input"
);
auto
*
filter
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Filter"
);
auto
*
output
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Output"
);
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
const
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
const
std
::
string
padding_algorithm
=
ctx
.
Attr
<
std
::
string
>
(
"padding_algorithm"
);
const
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
const
bool
channel_last
=
data_format
==
"NHWC"
;
// update padding and dilation
auto
in_dims
=
input
->
dims
();
auto
filter_dims
=
filter
->
dims
();
auto
in_dims_size
=
in_dims
.
size
();
framework
::
DDim
in_data_dims
;
framework
::
DDim
filter_data_dims
;
if
(
channel_last
)
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
1
,
in_dims
.
size
()
-
1
);
}
else
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
2
,
in_dims
.
size
());
}
filter_data_dims
=
phi
::
slice_ddim
(
filter_dims
,
2
,
in_dims
.
size
());
std
::
vector
<
int
>
ksize
=
phi
::
vectorize
<
int
>
(
filter_data_dims
);
UpdatePaddingAndDilation
(
&
paddings
,
&
dilations
,
padding_algorithm
,
in_data_dims
,
strides
,
ksize
);
phi
::
DenseTensor
input_tensor
(
input
->
type
());
phi
::
DenseTensor
output_tensor
(
output
->
type
());
const
std
::
vector
<
int
>
perm_to_nhwc
=
{
0
,
2
,
3
,
1
};
if
(
channel_last
)
{
input_tensor
.
ShareDataWith
(
*
input
);
output_tensor
.
ShareDataWith
(
*
output
);
}
else
{
// transpose input from NCHW to NHWC
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
input
,
&
input_tensor
,
true
/*need_reshape_or_alloc*/
);
auto
output_dims
=
output
->
dims
();
output_tensor
.
mutable_data
<
T
>
(
{
output_dims
[
0
],
output_dims
[
2
],
output_dims
[
3
],
output_dims
[
1
]},
ctx
.
GetPlace
());
}
input_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
output_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
// transpose filter from MCHW to MHWC
phi
::
DenseTensor
trans_filter
(
filter
->
type
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
filter
,
&
trans_filter
,
true
/*need_reshape_or_alloc*/
);
cnnlTensorLayout_t
data_layout
=
CNNL_LAYOUT_NHWC
;
MLUCnnlTensorDesc
input_desc
(
input_tensor
,
data_layout
,
ToCnnlDataType
(
input_tensor
.
dtype
()));
MLUCnnlTensorDesc
filter_desc
(
trans_filter
,
data_layout
,
ToCnnlDataType
(
trans_filter
.
type
()));
MLUCnnlTensorDesc
output_desc
(
output_tensor
,
data_layout
,
ToCnnlDataType
(
output_tensor
.
dtype
()));
MLUCnnlConvolutionDesc
conv_desc
(
in_dims_size
,
paddings
.
data
(),
strides
.
data
(),
dilations
.
data
(),
groups
,
ToCnnlDataType
<
T
>
());
MLUCnnl
::
ConvolutionForward
(
ctx
,
conv_desc
.
get
(),
nullptr
/*alpha*/
,
nullptr
/*beta*/
,
nullptr
/*bias_desc*/
,
nullptr
/*bias_ptr*/
,
input_desc
.
get
(),
GetBasePtr
(
&
input_tensor
),
filter_desc
.
get
(),
GetBasePtr
(
&
trans_filter
),
output_desc
.
get
(),
GetBasePtr
(
&
output_tensor
));
if
(
!
channel_last
)
{
// transpose output from NHWC to NCHW
const
std
::
vector
<
int
>
perm_to_nchw
=
{
0
,
3
,
1
,
2
};
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
output_tensor
,
output
,
false
/*need_reshape_or_alloc*/
);
}
}
};
template
<
typename
T
>
class
MLUConvGradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Input"
);
auto
filter
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Filter"
);
auto
output_grad
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Output"
));
auto
input_grad
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Input"
));
auto
filter_grad
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Filter"
));
const
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
const
std
::
string
padding_algorithm
=
ctx
.
Attr
<
std
::
string
>
(
"padding_algorithm"
);
const
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
const
bool
channel_last
=
data_format
==
"NHWC"
;
// update padding and dilation
auto
in_dims
=
input
->
dims
();
auto
filter_dims
=
filter
->
dims
();
auto
in_dims_size
=
in_dims
.
size
();
framework
::
DDim
in_data_dims
;
framework
::
DDim
filter_data_dims
;
if
(
channel_last
)
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
1
,
in_dims
.
size
()
-
1
);
}
else
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
2
,
in_dims
.
size
());
}
filter_data_dims
=
phi
::
slice_ddim
(
filter_dims
,
2
,
in_dims
.
size
());
std
::
vector
<
int
>
ksize
=
phi
::
vectorize
<
int
>
(
filter_data_dims
);
UpdatePaddingAndDilation
(
&
paddings
,
&
dilations
,
padding_algorithm
,
in_data_dims
,
strides
,
ksize
);
phi
::
DenseTensor
input_tensor
(
input
->
type
());
phi
::
DenseTensor
output_grad_tensor
(
output_grad
->
type
());
const
std
::
vector
<
int
>
perm_to_nhwc
=
{
0
,
2
,
3
,
1
};
const
std
::
vector
<
int
>
perm_to_nchw
=
{
0
,
3
,
1
,
2
};
if
(
channel_last
)
{
input_tensor
.
ShareDataWith
(
*
input
);
output_grad_tensor
.
ShareDataWith
(
*
output_grad
);
}
else
{
// transpose input and output_grad from NCHW to NHWC
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
input
,
&
input_tensor
,
true
/*need_reshape_or_alloc*/
);
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
output_grad
,
&
output_grad_tensor
,
true
/*need_reshape_or_alloc*/
);
}
input_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
output_grad_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
if
(
filter_grad
)
{
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
filter_grad_dims
=
filter_grad
->
dims
();
phi
::
DenseTensor
temp_filter_grad
(
filter_grad
->
type
());
temp_filter_grad
.
mutable_data
<
T
>
({
filter_grad_dims
[
0
],
filter_grad_dims
[
2
],
filter_grad_dims
[
3
],
filter_grad_dims
[
1
]},
ctx
.
GetPlace
());
cnnlDataType_t
tensor_dtype
=
ToCnnlDataType
<
T
>
();
cnnlTensorLayout_t
data_layout
=
CNNL_LAYOUT_NHWC
;
MLUCnnlTensorDesc
input_desc
(
input_tensor
,
data_layout
,
tensor_dtype
);
MLUCnnlTensorDesc
out_grad_desc
(
output_grad_tensor
,
data_layout
,
tensor_dtype
);
MLUCnnlTensorDesc
temp_filter_grad_desc
(
temp_filter_grad
,
data_layout
,
tensor_dtype
);
MLUCnnlConvolutionDesc
conv_desc
(
in_dims_size
,
paddings
.
data
(),
strides
.
data
(),
dilations
.
data
(),
groups
,
tensor_dtype
);
MLUCnnl
::
ConvBackpropFilter
(
ctx
,
conv_desc
.
get
(),
input_desc
.
get
(),
GetBasePtr
(
&
input_tensor
),
out_grad_desc
.
get
(),
GetBasePtr
(
&
output_grad_tensor
),
temp_filter_grad_desc
.
get
(),
GetBasePtr
(
&
temp_filter_grad
));
// transpose filter_grad from MHWC to MCHW
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
temp_filter_grad
,
filter_grad
,
false
/*need_reshape_or_alloc*/
);
}
if
(
input_grad
)
{
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
phi
::
DenseTensor
input_grad_tensor
(
input_grad
->
type
());
if
(
channel_last
)
{
input_grad_tensor
.
ShareDataWith
(
*
input_grad
);
}
else
{
auto
input_grad_dims
=
input_grad
->
dims
();
input_grad_tensor
.
mutable_data
<
T
>
({
input_grad_dims
[
0
],
input_grad_dims
[
2
],
input_grad_dims
[
3
],
input_grad_dims
[
1
]},
ctx
.
GetPlace
());
}
input_grad_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
// transpose filter from MCHW to MHWC
phi
::
DenseTensor
trans_filter
(
filter
->
type
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
filter
,
&
trans_filter
,
true
/*need_reshape_or_alloc*/
);
cnnlDataType_t
tensor_dtype
=
ToCnnlDataType
<
T
>
();
cnnlTensorLayout_t
data_layout
=
CNNL_LAYOUT_NHWC
;
MLUCnnlTensorDesc
filter_desc
(
trans_filter
,
data_layout
,
tensor_dtype
);
MLUCnnlTensorDesc
out_grad_desc
(
output_grad_tensor
,
data_layout
,
tensor_dtype
);
MLUCnnlTensorDesc
in_grad_desc
(
input_grad_tensor
,
data_layout
,
tensor_dtype
);
MLUCnnlConvolutionDesc
conv_desc
(
in_dims_size
,
paddings
.
data
(),
strides
.
data
(),
dilations
.
data
(),
groups
,
tensor_dtype
);
MLUCnnl
::
ConvBackpropInput
(
ctx
,
conv_desc
.
get
(),
filter_desc
.
get
(),
GetBasePtr
(
&
trans_filter
),
out_grad_desc
.
get
(),
GetBasePtr
(
&
output_grad_tensor
),
in_grad_desc
.
get
(),
GetBasePtr
(
&
input_grad_tensor
));
if
(
!
channel_last
)
{
// transpose input_grad from NHWC to NCHW
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
input_grad_tensor
,
input_grad
,
false
/*need_reshape_or_alloc*/
);
}
}
}
};
template
<
typename
T
>
class
MLUDepthwiseConvOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
phi
::
DenseTensor
*
input
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Input"
);
auto
*
filter
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Filter"
);
auto
*
output
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Output"
);
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
const
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
const
std
::
string
padding_algorithm
=
ctx
.
Attr
<
std
::
string
>
(
"padding_algorithm"
);
const
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
const
bool
channel_last
=
data_format
==
"NHWC"
;
int
groups
;
// update padding and dilation
auto
in_dims
=
input
->
dims
();
auto
filter_dims
=
filter
->
dims
();
auto
in_dims_size
=
in_dims
.
size
();
framework
::
DDim
in_data_dims
;
framework
::
DDim
filter_data_dims
;
if
(
channel_last
)
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
1
,
in_dims
.
size
()
-
1
);
}
else
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
2
,
in_dims
.
size
());
}
filter_data_dims
=
phi
::
slice_ddim
(
filter_dims
,
2
,
in_dims
.
size
());
std
::
vector
<
int
>
ksize
=
phi
::
vectorize
<
int
>
(
filter_data_dims
);
UpdatePaddingAndDilation
(
&
paddings
,
&
dilations
,
padding_algorithm
,
in_data_dims
,
strides
,
ksize
);
phi
::
DenseTensor
input_tensor
(
input
->
type
());
phi
::
DenseTensor
output_tensor
(
output
->
type
());
const
std
::
vector
<
int
>
perm_to_nhwc
=
{
0
,
2
,
3
,
1
};
if
(
channel_last
)
{
groups
=
in_dims
[
3
];
input_tensor
.
ShareDataWith
(
*
input
);
output_tensor
.
ShareDataWith
(
*
output
);
}
else
{
// transpose input from NCHW to NHWC
groups
=
in_dims
[
1
];
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
input
,
&
input_tensor
,
true
/*need_reshape_or_alloc*/
);
auto
output_dims
=
output
->
dims
();
output_tensor
.
mutable_data
<
T
>
(
{
output_dims
[
0
],
output_dims
[
2
],
output_dims
[
3
],
output_dims
[
1
]},
ctx
.
GetPlace
());
}
input_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
output_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
// transpose filter from MCHW to MHWC
phi
::
DenseTensor
trans_filter
(
filter
->
type
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
filter
,
&
trans_filter
,
true
/*need_reshape_or_alloc*/
);
cnnlTensorLayout_t
data_layout
=
CNNL_LAYOUT_NHWC
;
MLUCnnlTensorDesc
input_desc
(
input_tensor
,
data_layout
,
ToCnnlDataType
(
input_tensor
.
dtype
()));
MLUCnnlTensorDesc
filter_desc
(
trans_filter
,
data_layout
,
ToCnnlDataType
(
trans_filter
.
type
()));
MLUCnnlTensorDesc
output_desc
(
output_tensor
,
data_layout
,
ToCnnlDataType
(
output_tensor
.
dtype
()));
MLUCnnlConvolutionDesc
conv_desc
(
in_dims_size
,
paddings
.
data
(),
strides
.
data
(),
dilations
.
data
(),
groups
,
ToCnnlDataType
<
T
>
());
MLUCnnl
::
ConvolutionForward
(
ctx
,
conv_desc
.
get
(),
nullptr
/*alpha*/
,
nullptr
/*beta*/
,
nullptr
/*bias_desc*/
,
nullptr
/*bias_ptr*/
,
input_desc
.
get
(),
GetBasePtr
(
&
input_tensor
),
filter_desc
.
get
(),
GetBasePtr
(
&
trans_filter
),
output_desc
.
get
(),
GetBasePtr
(
&
output_tensor
));
if
(
!
channel_last
)
{
// transpose output from NHWC to NCHW
const
std
::
vector
<
int
>
perm_to_nchw
=
{
0
,
3
,
1
,
2
};
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
output_tensor
,
output
,
false
/*need_reshape_or_alloc*/
);
}
}
};
template
<
typename
T
>
class
MLUDepthwiseConvGradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
input
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Input"
);
auto
filter
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Filter"
);
auto
output_grad
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Output"
));
auto
input_grad
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Input"
));
auto
filter_grad
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Filter"
));
const
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
const
std
::
string
padding_algorithm
=
ctx
.
Attr
<
std
::
string
>
(
"padding_algorithm"
);
const
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
const
bool
channel_last
=
data_format
==
"NHWC"
;
// update padding and dilation
auto
in_dims
=
input
->
dims
();
auto
filter_dims
=
filter
->
dims
();
auto
in_dims_size
=
in_dims
.
size
();
framework
::
DDim
in_data_dims
;
framework
::
DDim
filter_data_dims
;
int
groups
;
if
(
channel_last
)
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
1
,
in_dims
.
size
()
-
1
);
}
else
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
2
,
in_dims
.
size
());
}
filter_data_dims
=
phi
::
slice_ddim
(
filter_dims
,
2
,
in_dims
.
size
());
std
::
vector
<
int
>
ksize
=
phi
::
vectorize
<
int
>
(
filter_data_dims
);
UpdatePaddingAndDilation
(
&
paddings
,
&
dilations
,
padding_algorithm
,
in_data_dims
,
strides
,
ksize
);
phi
::
DenseTensor
input_tensor
(
input
->
type
());
phi
::
DenseTensor
output_grad_tensor
(
output_grad
->
type
());
const
std
::
vector
<
int
>
perm_to_nhwc
=
{
0
,
2
,
3
,
1
};
const
std
::
vector
<
int
>
perm_to_nchw
=
{
0
,
3
,
1
,
2
};
const
std
::
vector
<
int
>
perm_hwcm_to_mchw
=
{
3
,
2
,
0
,
1
};
const
std
::
vector
<
int
>
perm_mchw_to_hwcm
=
{
2
,
3
,
1
,
0
};
if
(
channel_last
)
{
input_tensor
.
ShareDataWith
(
*
input
);
output_grad_tensor
.
ShareDataWith
(
*
output_grad
);
groups
=
in_dims
[
3
];
}
else
{
groups
=
in_dims
[
1
];
// transpose input and output_grad from NCHW to NHWC
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
input
,
&
input_tensor
,
true
/*need_reshape_or_alloc*/
);
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
output_grad
,
&
output_grad_tensor
,
true
/*need_reshape_or_alloc*/
);
}
input_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
output_grad_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
if
(
filter_grad
)
{
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
filter_grad_dims
=
filter_grad
->
dims
();
phi
::
DenseTensor
temp_filter_grad
(
filter_grad
->
type
());
// Details about setting diff_w hwcn for better performance, see the CNNL
// documentation.
temp_filter_grad
.
mutable_data
<
T
>
({
filter_grad_dims
[
perm_mchw_to_hwcm
[
0
]],
filter_grad_dims
[
perm_mchw_to_hwcm
[
1
]],
filter_grad_dims
[
perm_mchw_to_hwcm
[
2
]],
filter_grad_dims
[
perm_mchw_to_hwcm
[
3
]]},
ctx
.
GetPlace
());
cnnlDataType_t
tensor_dtype
=
ToCnnlDataType
<
T
>
();
cnnlTensorLayout_t
data_layout
=
CNNL_LAYOUT_NHWC
;
MLUCnnlTensorDesc
input_desc
(
input_tensor
,
data_layout
,
tensor_dtype
);
MLUCnnlTensorDesc
out_grad_desc
(
output_grad_tensor
,
data_layout
,
tensor_dtype
);
MLUCnnlTensorDesc
temp_filter_grad_desc
(
temp_filter_grad
,
CNNL_LAYOUT_HWCN
,
tensor_dtype
);
MLUCnnlConvolutionDesc
conv_desc
(
in_dims_size
,
paddings
.
data
(),
strides
.
data
(),
dilations
.
data
(),
groups
,
tensor_dtype
);
MLUCnnl
::
ConvBackpropFilter
(
ctx
,
conv_desc
.
get
(),
input_desc
.
get
(),
GetBasePtr
(
&
input_tensor
),
out_grad_desc
.
get
(),
GetBasePtr
(
&
output_grad_tensor
),
temp_filter_grad_desc
.
get
(),
GetBasePtr
(
&
temp_filter_grad
));
// transpose filter_grad from HWCM to MCHW
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_hwcm_to_mchw
,
&
temp_filter_grad
,
filter_grad
,
false
/*need_reshape_or_alloc*/
);
}
if
(
input_grad
)
{
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
phi
::
DenseTensor
input_grad_tensor
(
input_grad
->
type
());
if
(
channel_last
)
{
input_grad_tensor
.
ShareDataWith
(
*
input_grad
);
}
else
{
auto
input_grad_dims
=
input_grad
->
dims
();
input_grad_tensor
.
mutable_data
<
T
>
({
input_grad_dims
[
0
],
input_grad_dims
[
2
],
input_grad_dims
[
3
],
input_grad_dims
[
1
]},
ctx
.
GetPlace
());
}
input_grad_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
// transpose filter from MCHW to MHWC
phi
::
DenseTensor
trans_filter
(
filter
->
type
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
filter
,
&
trans_filter
,
true
/*need_reshape_or_alloc*/
);
cnnlDataType_t
tensor_dtype
=
ToCnnlDataType
<
T
>
();
cnnlTensorLayout_t
data_layout
=
CNNL_LAYOUT_NHWC
;
MLUCnnlTensorDesc
filter_desc
(
trans_filter
,
data_layout
,
tensor_dtype
);
MLUCnnlTensorDesc
out_grad_desc
(
output_grad_tensor
,
data_layout
,
tensor_dtype
);
MLUCnnlTensorDesc
in_grad_desc
(
input_grad_tensor
,
data_layout
,
tensor_dtype
);
MLUCnnlConvolutionDesc
conv_desc
(
in_dims_size
,
paddings
.
data
(),
strides
.
data
(),
dilations
.
data
(),
groups
,
tensor_dtype
);
MLUCnnl
::
ConvBackpropInput
(
ctx
,
conv_desc
.
get
(),
filter_desc
.
get
(),
GetBasePtr
(
&
trans_filter
),
out_grad_desc
.
get
(),
GetBasePtr
(
&
output_grad_tensor
),
in_grad_desc
.
get
(),
GetBasePtr
(
&
input_grad_tensor
));
if
(
!
channel_last
)
{
// transpose input_grad from NHWC to NCHW
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
input_grad_tensor
,
input_grad
,
false
/*need_reshape_or_alloc*/
);
}
}
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_MLU_KERNEL
(
conv2d
,
ops
::
MLUConvOpKernel
<
float
>
,
ops
::
MLUConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_MLU_KERNEL
(
conv2d_grad
,
ops
::
MLUConvGradOpKernel
<
float
>
,
ops
::
MLUConvGradOpKernel
<
plat
::
float16
>
);
REGISTER_OP_MLU_KERNEL
(
depthwise_conv2d
,
ops
::
MLUDepthwiseConvOpKernel
<
float
>
,
ops
::
MLUDepthwiseConvOpKernel
<
plat
::
float16
>
);
REGISTER_OP_MLU_KERNEL
(
depthwise_conv2d_grad
,
ops
::
MLUDepthwiseConvGradOpKernel
<
float
>
,
ops
::
MLUDepthwiseConvGradOpKernel
<
plat
::
float16
>
);
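All four conv kernels above share one layout convention: CNNL expects NHWC tensors, so NCHW inputs are transposed in with the permutation {0, 2, 3, 1} and results are transposed back with {0, 3, 1, 2}. A small standalone sketch of how such a permutation rearranges a shape vector (plain C++; PermuteDims is a hypothetical helper, not part of the deleted code):

#include <cstdio>
#include <vector>

// Hypothetical helper: apply a dimension permutation to a shape vector.
std::vector<int> PermuteDims(const std::vector<int>& dims,
                             const std::vector<int>& perm) {
  std::vector<int> out(dims.size());
  for (size_t i = 0; i < perm.size(); ++i) out[i] = dims[perm[i]];
  return out;
}

int main() {
  std::vector<int> nchw = {8, 3, 224, 224};
  std::vector<int> nhwc = PermuteDims(nchw, {0, 2, 3, 1});  // {8, 224, 224, 3}
  std::vector<int> back = PermuteDims(nhwc, {0, 3, 1, 2});  // {8, 3, 224, 224}
  std::printf("%d %d %d %d\n", nhwc[0], nhwc[1], nhwc[2], nhwc[3]);
  std::printf("%d %d %d %d\n", back[0], back[1], back[2], back[3]);
  return 0;
}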
paddle/fluid/operators/conv_transpose_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
namespace
paddle
{
namespace
operators
{
using
DataLayout
=
phi
::
DataLayout
;
template
<
typename
T
>
class
Conv2DTransposeMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
phi
::
DenseTensor
*
input
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Input"
);
const
phi
::
DenseTensor
*
filter
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Filter"
);
phi
::
DenseTensor
*
output
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Output"
);
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
std
::
vector
<
int
>
output_padding
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"output_padding"
);
const
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
const
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
const
std
::
string
padding_algorithm
=
ctx
.
Attr
<
std
::
string
>
(
"padding_algorithm"
);
// check dimension
const
bool
channel_last
=
data_format
==
"NHWC"
;
auto
in_dims
=
input
->
dims
();
auto
filter_dims
=
filter
->
dims
();
auto
in_dims_size
=
in_dims
.
size
();
framework
::
DDim
in_data_dims
;
framework
::
DDim
filter_data_dims
;
if
(
channel_last
)
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
1
,
in_dims
.
size
()
-
1
);
}
else
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
2
,
in_dims
.
size
());
}
filter_data_dims
=
phi
::
slice_ddim
(
filter_dims
,
2
,
in_dims
.
size
());
std
::
vector
<
int
>
ksize
=
phi
::
vectorize
<
int
>
(
filter_data_dims
);
phi
::
UpdatePaddingAndDilation
(
&
paddings
,
&
dilations
,
padding_algorithm
,
in_data_dims
,
strides
,
ksize
);
phi
::
DenseTensor
input_tensor
(
input
->
type
());
phi
::
DenseTensor
output_tensor
(
output
->
type
());
input_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
output_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
const
std
::
vector
<
int
>
perm_to_nhwc
=
{
0
,
2
,
3
,
1
};
if
(
channel_last
)
{
input_tensor
.
ShareDataWith
(
*
input
);
output_tensor
.
ShareDataWith
(
*
output
);
}
else
{
// transpose input from NCHW to NHWC
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
input
,
&
input_tensor
,
true
/*need_reshape_or_alloc*/
);
auto
output_dims
=
output
->
dims
();
output_tensor
.
mutable_data
<
T
>
(
{
output_dims
[
0
],
output_dims
[
2
],
output_dims
[
3
],
output_dims
[
1
]},
ctx
.
GetPlace
());
}
// transpose filter from MCHW to MHWC
phi
::
DenseTensor
trans_filter
(
filter
->
type
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
filter
,
&
trans_filter
,
true
/*need_reshape_or_alloc*/
);
// construct MLU attr
cnnlTensorLayout_t
data_layout
=
CNNL_LAYOUT_NHWC
;
MLUCnnlTensorDesc
input_desc
(
input_tensor
,
data_layout
,
ToCnnlDataType
(
input_tensor
.
dtype
()));
MLUCnnlTensorDesc
filter_desc
(
trans_filter
,
data_layout
,
ToCnnlDataType
(
trans_filter
.
type
()));
MLUCnnlTensorDesc
output_desc
(
output_tensor
,
data_layout
,
ToCnnlDataType
(
output_tensor
.
dtype
()));
MLUCnnlConvolutionDesc
conv_desc
(
in_dims_size
,
paddings
.
data
(),
strides
.
data
(),
dilations
.
data
(),
groups
,
ToCnnlDataType
<
T
>
());
MLUCnnl
::
ConvBackpropInput
(
ctx
,
conv_desc
.
get
(),
filter_desc
.
get
(),
GetBasePtr
(
&
trans_filter
),
input_desc
.
get
(),
GetBasePtr
(
&
input_tensor
),
output_desc
.
get
(),
GetBasePtr
(
&
output_tensor
));
if
(
!
channel_last
)
{
// transpose output from NHWC to NCHW
const
std
::
vector
<
int
>
perm_to_nchw
=
{
0
,
3
,
1
,
2
};
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
output_tensor
,
output
,
false
/*need_reshape_or_alloc*/
);
}
}
};
template
<
typename
T
>
class
Conv2DTransposeGradMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
phi
::
DenseTensor
*
input
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Input"
);
const
phi
::
DenseTensor
*
filter
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Filter"
);
const
phi
::
DenseTensor
*
output_grad
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Output"
));
phi
::
DenseTensor
*
input_grad
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Input"
));
phi
::
DenseTensor
*
filter_grad
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Filter"
));
if
((
!
input_grad
)
&&
(
!
filter_grad
))
return
;
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
const
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
std
::
string
padding_algorithm
=
ctx
.
Attr
<
std
::
string
>
(
"padding_algorithm"
);
const
std
::
string
data_format
=
ctx
.
Attr
<
std
::
string
>
(
"data_format"
);
const
phi
::
DataLayout
data_layout
=
phi
::
StringToDataLayout
(
data_format
);
auto
in_dims
=
input
->
dims
();
auto
filter_dims
=
filter
->
dims
();
auto
in_dims_size
=
in_dims
.
size
();
const
bool
channel_last
=
(
data_layout
==
phi
::
DataLayout
::
kNHWC
);
framework
::
DDim
in_data_dims
;
if
(
channel_last
)
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
1
,
in_dims
.
size
()
-
1
);
}
else
{
in_data_dims
=
phi
::
slice_ddim
(
in_dims
,
2
,
in_dims
.
size
());
}
framework
::
DDim
filter_data_dims
=
phi
::
slice_ddim
(
filter_dims
,
2
,
filter_dims
.
size
());
std
::
vector
<
int
>
ksize
=
phi
::
vectorize
<
int
>
(
filter_data_dims
);
phi
::
UpdatePaddingAndDilation
(
&
paddings
,
&
dilations
,
padding_algorithm
,
in_data_dims
,
strides
,
ksize
);
phi
::
DenseTensor
input_tensor
(
input
->
type
());
phi
::
DenseTensor
output_grad_tensor
(
output_grad
->
type
());
output_grad_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
const
std
::
vector
<
int
>
perm_to_nhwc
=
{
0
,
2
,
3
,
1
};
if
(
channel_last
)
{
input_tensor
.
ShareDataWith
(
*
input
);
output_grad_tensor
.
ShareDataWith
(
*
output_grad
);
}
else
{
// transpose input from NCHW to NHWC
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
input
,
&
input_tensor
,
true
/*need_reshape_or_alloc*/
);
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
output_grad
,
&
output_grad_tensor
,
true
/*need_reshape_or_alloc*/
);
}
// transpose filter from MCHW to MHWC
phi
::
DenseTensor
trans_filter
(
filter
->
type
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
filter
,
&
trans_filter
,
true
/*need_reshape_or_alloc*/
);
// MLU descs
cnnlTensorLayout_t
data_layout_mlu
=
CNNL_LAYOUT_NHWC
;
MLUCnnlTensorDesc
input_desc
(
input_tensor
,
data_layout_mlu
,
ToCnnlDataType
(
input_tensor
.
dtype
()));
MLUCnnlTensorDesc
trans_filter_desc
(
trans_filter
,
data_layout_mlu
,
ToCnnlDataType
(
trans_filter
.
type
()));
MLUCnnlTensorDesc
output_grad_desc
(
output_grad_tensor
,
data_layout_mlu
,
ToCnnlDataType
(
output_grad_tensor
.
dtype
()));
MLUCnnlConvolutionDesc
conv_desc
(
in_dims_size
,
paddings
.
data
(),
strides
.
data
(),
dilations
.
data
(),
groups
,
ToCnnlDataType
<
T
>
());
if
(
filter_grad
)
{
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
phi
::
DenseTensor
filter_grad_tensor
(
filter_grad
->
type
());
// filter_grad always MCHW
// filter_grad_tensor always MHWC
auto
filter_grad_dims
=
filter_grad
->
dims
();
filter_grad_tensor
.
mutable_data
<
T
>
({
filter_grad_dims
[
0
],
filter_grad_dims
[
2
],
filter_grad_dims
[
3
],
filter_grad_dims
[
1
]},
ctx
.
GetPlace
());
//}
filter_grad_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
MLUCnnlTensorDesc
filter_grad_desc
(
filter_grad_tensor
,
data_layout_mlu
,
ToCnnlDataType
(
filter_grad_tensor
.
dtype
()));
MLUCnnl
::
ConvBackpropFilter
(
ctx
,
conv_desc
.
get
(),
output_grad_desc
.
get
(),
GetBasePtr
(
output_grad
),
input_desc
.
get
(),
GetBasePtr
(
&
input_tensor
),
filter_grad_desc
.
get
(),
GetBasePtr
(
&
filter_grad_tensor
));
// transpose output from MHWC to MCHW
const
std
::
vector
<
int
>
perm_to_mchw
=
{
0
,
3
,
1
,
2
};
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_mchw
,
&
filter_grad_tensor
,
filter_grad
,
false
/*need_reshape_or_alloc*/
);
}
if
(
input_grad
)
{
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
phi
::
DenseTensor
input_grad_tensor
(
input_grad
->
type
());
input_tensor
.
set_layout
(
DataLayout
::
kNHWC
);
if
(
channel_last
)
{
input_grad_tensor
.
ShareDataWith
(
*
input_grad
);
}
else
{
auto
input_grad_dims
=
input_grad
->
dims
();
input_grad_tensor
.
mutable_data
<
T
>
({
input_grad_dims
[
0
],
input_grad_dims
[
2
],
input_grad_dims
[
3
],
input_grad_dims
[
1
]},
ctx
.
GetPlace
());
}
MLUCnnlTensorDesc
input_grad_desc
(
input_grad_tensor
,
data_layout_mlu
,
ToCnnlDataType
(
input_grad_tensor
.
dtype
()));
MLUCnnl
::
ConvolutionForward
(
ctx
,
conv_desc
.
get
(),
nullptr
/*alpha*/
,
nullptr
/*beta*/
,
nullptr
/*bias_desc*/
,
nullptr
/*bias_ptr*/
,
output_grad_desc
.
get
(),
GetBasePtr
(
&
output_grad_tensor
),
trans_filter_desc
.
get
(),
GetBasePtr
(
&
trans_filter
),
input_grad_desc
.
get
(),
GetBasePtr
(
&
input_grad_tensor
));
if
(
!
channel_last
)
{
// transpose output from NHWC to NCHW
const
std
::
vector
<
int
>
perm_to_nchw
=
{
0
,
3
,
1
,
2
};
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
input_grad_tensor
,
input_grad
,
false
/*need_reshape_or_alloc*/
);
}
}
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_MLU_KERNEL
(
conv2d_transpose
,
ops
::
Conv2DTransposeMLUKernel
<
float
>
,
ops
::
Conv2DTransposeMLUKernel
<
plat
::
float16
>
);
REGISTER_OP_MLU_KERNEL
(
conv2d_transpose_grad
,
ops
::
Conv2DTransposeGradMLUKernel
<
float
>
,
ops
::
Conv2DTransposeGradMLUKernel
<
plat
::
float16
>
);
paddle/fluid/operators/cumsum_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
CumSumMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
x
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
*
out
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Out"
);
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
bool
exclusive
=
ctx
.
Attr
<
bool
>
(
"exclusive"
);
bool
reverse
=
ctx
.
Attr
<
bool
>
(
"reverse"
);
bool
flatten
=
ctx
.
Attr
<
bool
>
(
"flatten"
);
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
phi
::
DenseTensor
*
input_ptr
=
const_cast
<
phi
::
DenseTensor
*>
(
x
);
phi
::
DenseTensor
flat_x
(
x
->
type
());
if
(
flatten
)
{
PADDLE_ENFORCE_EQ
(
axis
,
-
1
,
platform
::
errors
::
InvalidArgument
(
"when flatten is true, attr axis must be default %d, but got %d"
,
-
1
,
axis
));
flat_x
.
ShareDataWith
(
*
x
);
flat_x
.
Resize
(
phi
::
make_ddim
({
x
->
numel
()}));
input_ptr
=
&
flat_x
;
}
const
int
true_axis
=
(
axis
<
0
)
?
input_ptr
->
dims
().
size
()
+
axis
:
axis
;
MLUCnnlTensorDesc
input_desc
(
*
input_ptr
);
MLUCnnlTensorDesc
out_desc
(
*
out
);
MLUCnnl
::
Cumsum
(
ctx
,
true_axis
,
exclusive
,
reverse
,
input_desc
.
get
(),
GetBasePtr
(
input_ptr
),
out_desc
.
get
(),
GetBasePtr
(
out
));
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_MLU_KERNEL
(
cumsum
,
ops
::
CumSumMLUKernel
<
int
>
,
ops
::
CumSumMLUKernel
<
float
>
,
ops
::
CumSumMLUKernel
<
plat
::
float16
>
);
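The cumsum kernel above normalizes a negative axis against the rank of the (possibly flattened) input before calling CNNL: true_axis = rank + axis when axis < 0. A tiny sketch of that normalization step (plain C++; NormalizeAxis is an illustrative name, not part of the deleted code):

#include <cassert>

// Illustrative helper: map a possibly negative axis into [0, rank).
int NormalizeAxis(int axis, int rank) {
  int true_axis = (axis < 0) ? rank + axis : axis;
  assert(true_axis >= 0 && true_axis < rank);
  return true_axis;
}

int main() {
  assert(NormalizeAxis(-1, 4) == 3);  // last axis of a 4-D tensor
  assert(NormalizeAxis(2, 4) == 2);   // non-negative axes pass through
  return 0;
}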
paddle/fluid/operators/deformable_conv_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
DeformableConvMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
input
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Input"
);
auto
*
offset
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Offset"
);
auto
*
mask
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Mask"
);
auto
*
filter
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Filter"
);
auto
*
output
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Output"
);
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
const
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
const
int
deformable_groups
=
ctx
.
Attr
<
int
>
(
"deformable_groups"
);
const
int
im2col_step
=
ctx
.
Attr
<
int
>
(
"im2col_step"
);
const
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
const
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
const
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
// TODO(fwg): Remove this check when cnnl fix the bug that groups > 1.
PADDLE_ENFORCE_EQ
(
groups
==
1
,
true
,
platform
::
errors
::
InvalidArgument
(
"MLU deformable_conv kernel only support groups == 1, but get %d."
,
groups
));
// transform paddings from {h, w} to {top, bottom, left, right}.
const
std
::
vector
<
int
>
trans_paddings
{
paddings
[
0
],
paddings
[
0
],
paddings
[
1
],
paddings
[
1
]};
MLUCnnlDCNDesc
dcn_desc
(
input
->
dims
().
size
(),
trans_paddings
.
data
(),
strides
.
data
(),
dilations
.
data
(),
deformable_groups
,
groups
,
im2col_step
);
const
std
::
vector
<
int
>
perm_to_nhwc
=
{
0
,
2
,
3
,
1
};
phi
::
DenseTensor
trans_input
(
input
->
dtype
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
input
,
&
trans_input
,
true
/*need_reshape_or_alloc*/
);
phi
::
DenseTensor
trans_offset
(
offset
->
dtype
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
offset
,
&
trans_offset
,
true
/*need_reshape_or_alloc*/
);
phi
::
DenseTensor
trans_mask
(
mask
->
dtype
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
mask
,
&
trans_mask
,
true
/*need_reshape_or_alloc*/
);
phi
::
DenseTensor
trans_filter
(
filter
->
dtype
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
filter
,
&
trans_filter
,
true
/*need_reshape_or_alloc*/
);
phi
::
DenseTensor
tmp_output
(
output
->
dtype
());
auto
output_dims
=
output
->
dims
();
tmp_output
.
mutable_data
<
T
>
(
{
output_dims
[
0
],
output_dims
[
2
],
output_dims
[
3
],
output_dims
[
1
]},
ctx
.
GetPlace
());
cnnlTensorLayout_t
data_layout
=
CNNL_LAYOUT_NHWC
;
MLUCnnlTensorDesc
input_desc
(
trans_input
,
data_layout
,
ToCnnlDataType
(
trans_input
.
dtype
()));
MLUCnnlTensorDesc
offset_desc
(
trans_offset
,
data_layout
,
ToCnnlDataType
(
trans_offset
.
dtype
()));
MLUCnnlTensorDesc
mask_desc
(
trans_mask
,
data_layout
,
ToCnnlDataType
(
trans_mask
.
dtype
()));
MLUCnnlTensorDesc
filter_desc
(
trans_filter
,
data_layout
,
ToCnnlDataType
(
trans_filter
.
dtype
()));
MLUCnnlTensorDesc
output_desc
(
tmp_output
,
data_layout
,
ToCnnlDataType
(
tmp_output
.
dtype
()));
MLUCnnl
::
DCNForward
(
ctx
,
dcn_desc
.
get
(),
input_desc
.
get
(),
GetBasePtr
(
&
trans_input
),
offset_desc
.
get
(),
GetBasePtr
(
&
trans_offset
),
mask_desc
.
get
(),
GetBasePtr
(
&
trans_mask
),
filter_desc
.
get
(),
GetBasePtr
(
&
trans_filter
),
nullptr
,
nullptr
,
output_desc
.
get
(),
GetBasePtr
(
&
tmp_output
));
const
std
::
vector
<
int
>
perm_to_nchw
=
{
0
,
3
,
1
,
2
};
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
tmp_output
,
output
,
false
/*need_reshape_or_alloc*/
);
}
};
template
<
typename
T
>
class
DeformableConvGradMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
phi
::
DenseTensor
*
output_grad
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Output"
));
auto
*
input_grad
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Input"
));
auto
*
filter_grad
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Filter"
));
auto
*
offset_grad
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Offset"
));
auto
*
mask_grad
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Mask"
));
const
phi
::
DenseTensor
*
input
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Input"
);
auto
*
offset
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Offset"
);
auto
*
mask
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Mask"
);
auto
*
filter
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Filter"
);
int
groups
=
ctx
.
Attr
<
int
>
(
"groups"
);
int
deformable_groups
=
ctx
.
Attr
<
int
>
(
"deformable_groups"
);
int
im2col_step
=
ctx
.
Attr
<
int
>
(
"im2col_step"
);
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
std
::
vector
<
int
>
dilations
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"dilations"
);
// TODO(fwg): Remove this check when cnnl fix the bug that groups > 1.
PADDLE_ENFORCE_EQ
(
groups
==
1
,
true
,
platform
::
errors
::
InvalidArgument
(
"MLU deformable_conv_grad kernel only support groups "
"== 1, but get %d."
,
groups
));
// transform paddings from {h, w} to {top, bottom, left, right}.
const
std
::
vector
<
int
>
trans_paddings
{
paddings
[
0
],
paddings
[
0
],
paddings
[
1
],
paddings
[
1
]};
MLUCnnlDCNDesc
dcn_desc
(
input
->
dims
().
size
(),
trans_paddings
.
data
(),
strides
.
data
(),
dilations
.
data
(),
deformable_groups
,
groups
,
im2col_step
);
phi
::
DenseTensor
tmp_input_grad
;
auto
input_dims
=
input
->
dims
();
tmp_input_grad
.
mutable_data
<
T
>
(
{
input_dims
[
0
],
input_dims
[
2
],
input_dims
[
3
],
input_dims
[
1
]},
ctx
.
GetPlace
());
phi
::
DenseTensor
tmp_filter_grad
;
auto
filter_dims
=
filter
->
dims
();
tmp_filter_grad
.
mutable_data
<
T
>
(
{
filter_dims
[
0
],
filter_dims
[
2
],
filter_dims
[
3
],
filter_dims
[
1
]},
ctx
.
GetPlace
());
phi
::
DenseTensor
tmp_offset_grad
;
auto
offset_dims
=
offset
->
dims
();
tmp_offset_grad
.
mutable_data
<
T
>
(
{
offset_dims
[
0
],
offset_dims
[
2
],
offset_dims
[
3
],
offset_dims
[
1
]},
ctx
.
GetPlace
());
phi
::
DenseTensor
tmp_mask_grad
;
auto
mask_dims
=
mask
->
dims
();
tmp_mask_grad
.
mutable_data
<
T
>
(
{
mask_dims
[
0
],
mask_dims
[
2
],
mask_dims
[
3
],
mask_dims
[
1
]},
ctx
.
GetPlace
());
const
std
::
vector
<
int
>
perm_to_nhwc
=
{
0
,
2
,
3
,
1
};
phi
::
DenseTensor
trans_output_grad
(
output_grad
->
dtype
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
output_grad
,
&
trans_output_grad
,
true
/*need_reshape_or_alloc*/
);
phi
::
DenseTensor
trans_input
(
input
->
dtype
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
input
,
&
trans_input
,
true
/*need_reshape_or_alloc*/
);
phi
::
DenseTensor
trans_offset
(
offset
->
dtype
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
offset
,
&
trans_offset
,
true
/*need_reshape_or_alloc*/
);
phi
::
DenseTensor
trans_mask
(
mask
->
dtype
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
mask
,
&
trans_mask
,
true
/*need_reshape_or_alloc*/
);
phi
::
DenseTensor
trans_filter
(
filter
->
dtype
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nhwc
,
filter
,
&
trans_filter
,
true
/*need_reshape_or_alloc*/
);
cnnlTensorLayout_t
data_layout
=
CNNL_LAYOUT_NHWC
;
MLUCnnlTensorDesc
output_grad_desc
(
trans_output_grad
,
data_layout
,
ToCnnlDataType
(
trans_output_grad
.
dtype
()));
MLUCnnlTensorDesc
input_desc
(
trans_input
,
data_layout
,
ToCnnlDataType
(
trans_input
.
dtype
()));
MLUCnnlTensorDesc
offset_desc
(
trans_offset
,
data_layout
,
ToCnnlDataType
(
trans_offset
.
dtype
()));
MLUCnnlTensorDesc
mask_desc
(
trans_mask
,
data_layout
,
ToCnnlDataType
(
trans_mask
.
dtype
()));
MLUCnnlTensorDesc
filter_desc
(
trans_filter
,
data_layout
,
ToCnnlDataType
(
trans_filter
.
dtype
()));
MLUCnnl
::
DCNBackwardData
(
ctx
,
dcn_desc
.
get
(),
input_desc
.
get
(),
GetBasePtr
(
&
trans_input
),
offset_desc
.
get
(),
GetBasePtr
(
&
trans_offset
),
mask_desc
.
get
(),
GetBasePtr
(
&
trans_mask
),
filter_desc
.
get
(),
GetBasePtr
(
&
trans_filter
),
output_grad_desc
.
get
(),
GetBasePtr
(
&
trans_output_grad
),
input_desc
.
get
(),
GetBasePtr
(
&
tmp_input_grad
),
offset_desc
.
get
(),
GetBasePtr
(
&
tmp_offset_grad
),
mask_desc
.
get
(),
GetBasePtr
(
&
tmp_mask_grad
));
MLUCnnl
::
DCNBackwardWeight
(
ctx
,
dcn_desc
.
get
(),
input_desc
.
get
(),
GetBasePtr
(
&
trans_input
),
offset_desc
.
get
(),
GetBasePtr
(
&
trans_offset
),
mask_desc
.
get
(),
GetBasePtr
(
&
trans_mask
),
output_grad_desc
.
get
(),
GetBasePtr
(
&
trans_output_grad
),
filter_desc
.
get
(),
GetBasePtr
(
&
tmp_filter_grad
),
nullptr
,
nullptr
);
const
std
::
vector
<
int
>
perm_to_nchw
=
{
0
,
3
,
1
,
2
};
if
(
input_grad
)
{
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
tmp_input_grad
,
input_grad
,
false
/*need_reshape_or_alloc*/
);
}
if
(
filter_grad
)
{
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
tmp_filter_grad
,
filter_grad
,
false
/*need_reshape_or_alloc*/
);
}
if
(
offset_grad
)
{
offset_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
tmp_offset_grad
,
offset_grad
,
false
/*need_reshape_or_alloc*/
);
}
if
(
mask_grad
)
{
mask_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
TransposeFromMLUTensor
<
T
>
(
ctx
,
perm_to_nchw
,
&
tmp_mask_grad
,
mask_grad
,
false
/*need_reshape_or_alloc*/
);
}
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_MLU_KERNEL
(
deformable_conv
,
ops
::
DeformableConvMLUKernel
<
float
>
);
REGISTER_OP_MLU_KERNEL
(
deformable_conv_grad
,
ops
::
DeformableConvGradMLUKernel
<
float
>
);
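Both deformable-conv kernels above widen Paddle's two-value paddings attribute {h, w} into the four-value {top, bottom, left, right} form expected by the CNNL DCN descriptor, duplicating each value. A minimal sketch of that expansion (plain C++; ExpandSymmetricPaddings is an invented name for illustration):

#include <vector>

// Illustrative helper: expand symmetric {pad_h, pad_w} into
// {top, bottom, left, right}, as done before building the DCN descriptor.
std::vector<int> ExpandSymmetricPaddings(const std::vector<int>& paddings) {
  return {paddings[0], paddings[0], paddings[1], paddings[1]};
}

int main() {
  std::vector<int> trans = ExpandSymmetricPaddings({1, 2});
  // trans == {1, 1, 2, 2}
  return (trans[0] == 1 && trans[1] == 1 && trans[2] == 2 && trans[3] == 2) ? 0 : 1;
}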
paddle/fluid/operators/dropout_op_mlu.cc (deleted, 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class DropoutMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dropout_prob = ctx.Attr<float>("dropout_prob");
    auto is_test = ctx.Attr<bool>("is_test");
    auto* seed_tensor =
        ctx.HasInput("Seed") ? ctx.Input<phi::DenseTensor>("Seed") : nullptr;
    auto dropout_implementation = ctx.Attr<std::string>("dropout_implementation");

    const bool is_upscale = (dropout_implementation == "upscale_in_train");

    out->mutable_data<T>(ctx.GetPlace());
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc out_desc(*out);

    if (is_test && is_upscale) {
      // dropout op for inference: out = input.
      framework::TensorCopy(
          *x, ctx.GetPlace(),
          ctx.template device_context<platform::MLUDeviceContext>(), out);
      return;
    } else if (!is_test) {
      // dropout op for training: out = input * mask / ( 1.0 - dropout_prob ) or
      // out = input * mask.
      int seed_data = 0;
      if (seed_tensor) {
        if (platform::is_mlu_place(seed_tensor->place())) {
          memory::Copy(platform::CPUPlace(), &seed_data, seed_tensor->place(),
                       seed_tensor->data<int>(), sizeof(int));
        } else {
          seed_data = *(seed_tensor->data<int>());
        }
      } else {
        seed_data = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : 0;
      }

      auto* mask = ctx.Output<phi::DenseTensor>("Mask");
      mask->mutable_data<uint8_t>(ctx.GetPlace());
      MLUCnnlTensorDesc mask_desc(*mask);
      // Special case when dropout_prob is 1.0
      if (dropout_prob == 1.0f) {
        auto value_t = static_cast<T>(0.0f);
        MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, out_desc.get(),
                      GetBasePtr(out));
        MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, mask_desc.get(),
                      GetBasePtr(mask));
        return;
      }

      // create mlu random generator
      const int device_id = ctx.GetPlace().GetDeviceId();
      auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data);

      // compute out = input * mask / ( 1.0 - dropout_prob )
      MLUCnnl::FusedDropout(ctx,
                            mlu_gen_random->get(),
                            x_desc.get(),
                            GetBasePtr(x),
                            dropout_prob,
                            GetBasePtr(&(mlu_gen_random->get_state())),
                            mask_desc.get(),
                            GetBasePtr(mask),
                            out_desc.get(),
                            GetBasePtr(out));

      if (is_upscale) {
        return;
      }
    }

    // In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob).
    phi::DenseTensor scale_tensor(x->dtype());
    phi::DenseTensor bias_tensor(x->dtype());
    scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
    bias_tensor.mutable_data<T>({1}, ctx.GetPlace());
    MLUCnnlTensorDesc scale_desc(scale_tensor);
    MLUCnnlTensorDesc bias_desc(bias_tensor);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(1.0f - dropout_prob), &scale_tensor);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0f), &bias_tensor);

    MLUCnnl::Scale(ctx,
                   0,
                   is_test ? x_desc.get() : out_desc.get(),
                   is_test ? GetBasePtr(x) : GetBasePtr(out),
                   scale_desc.get(),
                   GetBasePtr(&scale_tensor),
                   bias_desc.get(),
                   GetBasePtr(&bias_tensor),
                   out_desc.get(),
                   GetBasePtr(out));
  }
};

template <typename T>
class DropoutGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE_EQ(!ctx.Attr<bool>("is_test"), true,
                      platform::errors::InvalidArgument(
                          "GradOp is only callable when is_test is false"));
    auto* grad_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* grad_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* mask = ctx.Input<phi::DenseTensor>("Mask");
    auto dropout_prob = ctx.Attr<float>("dropout_prob");
    auto dropout_impl = ctx.Attr<std::string>("dropout_implementation");

    grad_x->mutable_data<T>(ctx.GetPlace());
    MLUCnnlTensorDesc grad_x_desc(*grad_x);

    if (dropout_prob == 1.) {
      auto value_t = static_cast<T>(0.0f);
      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, grad_x_desc.get(),
                    GetBasePtr(grad_x));
      return;
    }

    // cast mask from uint8 to float32/float16
    phi::DenseTensor cast_mask(grad_x->dtype());
    cast_mask.Resize(mask->dims());
    cast_mask.mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc mask_desc(*mask);
    MLUCnnlTensorDesc cast_mask_desc(cast_mask);
    cnnlCastDataType_t cast_type =
        GetCastDataType(framework::TransToProtoVarType(mask->dtype()),
                        framework::TransToProtoVarType(cast_mask.dtype()));
    MLUCnnl::Cast(ctx, cast_type, mask_desc.get(), GetBasePtr(mask),
                  cast_mask_desc.get(), GetBasePtr(&cast_mask));

    const bool is_upscale = (dropout_impl == "upscale_in_train");
    const float scale = is_upscale ? (1.0f / (1.0f - dropout_prob)) : (1.0f);

    auto data_type = ToCnnlDataType<T>();
    MLUCnnlTensorDesc grad_out_desc(*grad_out);
    MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, data_type,
                                       CNNL_NOT_PROPAGATE_NAN);
    MLUCnnl::OpTensor(ctx,
                      op_tensor_desc.get(),
                      cast_mask_desc.get(),
                      GetBasePtr(&cast_mask),
                      grad_out_desc.get(),
                      GetBasePtr(grad_out),
                      grad_x_desc.get(),
                      GetBasePtr(grad_x),
                      data_type,
                      scale);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(dropout,
                       ops::DropoutMLUKernel<float>,
                       ops::DropoutMLUKernel<plat::float16>);

REGISTER_OP_MLU_KERNEL(dropout_grad,
                       ops::DropoutGradMLUKernel<float>,
                       ops::DropoutGradMLUKernel<plat::float16>);
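A minimal host-side sketch (independent of the CNNL calls above) of the two dropout conventions the kernel distinguishes: upscale_in_train rescales at training time and is the identity at inference, while downgrade_in_infer keeps the raw masked output and multiplies by (1 - p) at inference. All names are illustrative only.

#include <cstdio>

float train_out(float x, bool keep, float p, bool upscale_in_train) {
  float kept = keep ? x : 0.0f;                         // masked value
  return upscale_in_train ? kept / (1.0f - p) : kept;   // FusedDropout path
}

float infer_out(float x, float p, bool upscale_in_train) {
  // upscale_in_train: inference is the identity (the TensorCopy branch);
  // downgrade_in_infer: inference multiplies by (1 - p) (the Scale branch).
  return upscale_in_train ? x : x * (1.0f - p);
}

int main() {
  std::printf("%f %f\n", train_out(2.0f, true, 0.5f, true), infer_out(2.0f, 0.5f, false));
  return 0;
}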
paddle/fluid/operators/expand_as_v2_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/expand_as_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class ExpandAsV2MLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
    auto target_shape = context.Attr<std::vector<int>>("target_shape");
    auto target_rank = target_shape.size();
    PADDLE_ENFORCE_GE(target_rank, rank,
                      platform::errors::InvalidArgument(
                          "The rank (%d) of the input 'target_tensor' for "
                          "expand_as_v2 op must be greater than or equal to "
                          "the rank (%d) of the input 'x'.",
                          target_rank, rank));
    PADDLE_ENFORCE_GE(rank, 1,
                      platform::errors::InvalidArgument(
                          "The rank (%d) of the input 'x' for "
                          "expand_as_v2 op must be positive.",
                          rank));
    PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED,
                      platform::errors::InvalidArgument(
                          "The rank (%d) of the input 'target_tensor' for "
                          "expand_as_v2 op must be less than or equal to %d.",
                          target_rank, MAX_RANK_SUPPORTED));
    ExpandAs(context);
  }

 protected:
  void ExpandAs(const framework::ExecutionContext& context) const {
    auto* in0 = context.Input<phi::DenseTensor>("X");
    auto in_dims = in0->dims();
    auto target_shape = context.Attr<std::vector<int>>("target_shape");
    auto vec_in_dims = phi::vectorize<int>(in_dims);
    auto diff = target_shape.size() - vec_in_dims.size();
    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);

    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
      PADDLE_ENFORCE_NE(target_shape[i], 0,
                        platform::errors::InvalidArgument(
                            "The value of target shape cannot be zero."));
      if (vec_in_dims[i] != 1) {
        PADDLE_ENFORCE_EQ(
            vec_in_dims[i], target_shape[i],
            platform::errors::InvalidArgument(
                "The value (%d) of the non-singleton dimension does not match"
                " the corresponding value (%d) in "
                "target tensor for expand_as_v2 op.",
                vec_in_dims[i], target_shape[i]));
      }
    }
    auto* out0 = context.Output<phi::DenseTensor>("Out");
    framework::DDim out_dims = phi::make_ddim(target_shape);
    out0->Resize(out_dims);
    out0->mutable_data<T>(context.GetPlace());

    MLUCnnlTensorDesc x_desc(*in0);
    MLUCnnlTensorDesc out_desc(*out0);
    MLUCnnl::BroadcastTo(context, x_desc.get(), GetBasePtr(in0),
                         out_desc.get(), GetBasePtr(out0));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(expand_as_v2,
                       ops::ExpandAsV2MLUKernel<float>,
                       ops::ExpandAsV2MLUKernel<int>,
                       ops::ExpandAsV2MLUKernel<int64_t>,
                       ops::ExpandAsV2MLUKernel<int8_t>,
                       ops::ExpandAsV2MLUKernel<uint8_t>,
                       ops::ExpandAsV2MLUKernel<bool>,
                       ops::ExpandAsV2MLUKernel<paddle::platform::float16>);
paddle/fluid/operators/expand_v2_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class ExpandV2MLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* X = ctx.Input<phi::DenseTensor>("X");
    auto* Out = ctx.Output<phi::DenseTensor>("Out");
    auto in_dims = X->dims();
    auto expand_shape = get_expand_shape(ctx);
    auto vec_in_dims = phi::vectorize<int>(in_dims);
    auto diff = expand_shape.size() - vec_in_dims.size();
    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
    std::vector<int> final_expand_shape(vec_in_dims.size());
    for (size_t i = 0; i < vec_in_dims.size(); ++i) {
      PADDLE_ENFORCE_NE(expand_shape[i], 0,
                        platform::errors::InvalidArgument(
                            "The expanded size cannot be zero."));
      if (i < diff) {
        // expand_shape = [3,4,-1,-1], X = [10,2] -->
        // final_expand_shape = [3,4,10,2]
        PADDLE_ENFORCE_GT(expand_shape[i], 0,
                          platform::errors::InvalidArgument(
                              "The expanded size (%d) for non-existing dimensions must be "
                              "positive for expand_v2 op.",
                              expand_shape[i]));
        final_expand_shape[i] = expand_shape[i];
      } else if (expand_shape[i] > 0) {
        // expand_shape = [3,4,10,4], X =
        // [10,1] --> final_expand_shape =
        // [3,4,10,4]
        if (vec_in_dims[i] != 1) {
          PADDLE_ENFORCE_EQ(
              vec_in_dims[i], expand_shape[i],
              platform::errors::InvalidArgument(
                  "The value (%d) of the non-singleton dimension does not match"
                  " the corresponding value (%d) in shape for expand_v2 op.",
                  vec_in_dims[i], expand_shape[i]));
          final_expand_shape[i] = expand_shape[i];
        } else {
          final_expand_shape[i] = expand_shape[i];
        }
      } else {
        // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape
        // = [3,4,10,2]
        PADDLE_ENFORCE_EQ(expand_shape[i], -1,
                          platform::errors::InvalidArgument(
                              "When the value in shape is negative for expand_v2 op, "
                              "only -1 is supported, but the value received is %d.",
                              expand_shape[i]));
        final_expand_shape[i] = vec_in_dims[i];
      }
    }

    auto rank = X->dims().size();
    PADDLE_ENFORCE_GE(rank, 1,
                      platform::errors::InvalidArgument(
                          "The rank of the input 'X' for expand_v2_mlu op must be positive, "
                          "but the value received is %d.",
                          rank));
    auto shape_size = final_expand_shape.size();
    PADDLE_ENFORCE_GE(shape_size, rank,
                      platform::errors::InvalidArgument(
                          "The number (%d) of elements of 'shape' for expand_v2_mlu op must be "
                          "greater than or equal to the rank (%d) of the input 'X'.",
                          shape_size, rank));

    framework::DDim out_dims = phi::make_ddim(final_expand_shape);
    Out->Resize(out_dims);
    auto place = ctx.GetPlace();
    Out->mutable_data<T>(place);

    MLUCnnlTensorDesc x_desc(*X);
    MLUCnnlTensorDesc out_desc(*Out);
    MLUCnnl::BroadcastTo(ctx, x_desc.get(), GetBasePtr(X),
                         out_desc.get(), GetBasePtr(Out));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(expand_v2,
                       ops::ExpandV2MLUKernel<float>,
                       ops::ExpandV2MLUKernel<paddle::platform::float16>,
                       ops::ExpandV2MLUKernel<bool>,
                       ops::ExpandV2MLUKernel<int>,
                       ops::ExpandV2MLUKernel<int64_t>);
#endif
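A minimal sketch of how final_expand_shape is resolved above, assuming the same rules: pad X's dims with leading 1s to the requested rank, take positive entries from the requested shape, and let -1 mean "keep the input extent". It omits the error checks and is illustrative only.

#include <cstdio>
#include <vector>

std::vector<int> resolve(std::vector<int> in, const std::vector<int>& shape) {
  in.insert(in.begin(), shape.size() - in.size(), 1);  // pad with leading 1s
  std::vector<int> out(in.size());
  for (size_t i = 0; i < in.size(); ++i)
    out[i] = (shape[i] > 0) ? shape[i] : in[i];        // -1 falls back to input dim
  return out;
}

int main() {
  auto r = resolve({10, 2}, {3, 4, -1, -1});  // the example from the code comments
  for (int v : r) std::printf("%d ", v);      // prints: 3 4 10 2
  std::printf("\n");
  return 0;
}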
paddle/fluid/operators/fill_any_like_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class FillAnyLikeMLUKernel : public framework::OpKernel<T> {
 public:
  using CommonType = typename std::common_type<
      float,
      typename std::conditional<std::is_same<T, platform::float16>::value,
                                float, T>::type>::type;

  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
    float value = ctx.Attr<float>("value");

    auto common_type_value = static_cast<CommonType>(value);
    PADDLE_ENFORCE_EQ(
        (common_type_value >= static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
            (common_type_value <= static_cast<CommonType>(std::numeric_limits<T>::max())),
        true,
        platform::errors::InvalidArgument(
            "The filled value is out of range for target type, "
            "current kernel type is %s, the range should between %f "
            "and %f, but now value is %f.",
            typeid(T).name(),
            static_cast<CommonType>(std::numeric_limits<T>::lowest()),
            static_cast<CommonType>(std::numeric_limits<T>::max()),
            value));
    PADDLE_ENFORCE_EQ(
        std::isnan(value), false,
        platform::errors::InvalidArgument("The filled value is NaN."));

    auto value_t = static_cast<T>(value);
    MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, out_desc.get(), GetBasePtr(out));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(fill_any_like,
                       ops::FillAnyLikeMLUKernel<int>,
                       ops::FillAnyLikeMLUKernel<int64_t>,
                       ops::FillAnyLikeMLUKernel<float>,
                       ops::FillAnyLikeMLUKernel<plat::float16>);
paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {

template <typename T>
class FillConstantBatchSizeLikeOpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto data_type =
        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
    auto float_value = ctx.Attr<float>("value");
    auto str_value = ctx.Attr<std::string>("str_value");
    auto force_cpu = ctx.Attr<bool>("force_cpu");

    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto* in = ctx.Input<phi::DenseTensor>("Input");
    if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
      // set the correct batch size for the phi::DenseTensor.
      auto odims = out->dims();
      int output_dim_idx = ctx.Attr<int>("output_dim_idx");
      odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
      out->mutable_data<T>(odims, ctx.GetPlace());
    }

    T value;
    if (str_value.empty()) {
      value = static_cast<T>(float_value);
    } else {
      // handle NaN/Inf first, which cannot be read from stream.
      if (str_value == "inf") {
        value = static_cast<T>(std::numeric_limits<double>::infinity());
      } else if (str_value == "-inf") {
        value = static_cast<T>(-std::numeric_limits<double>::infinity());
      } else if (str_value == "nan") {
        value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
      } else {
        std::stringstream convert_stream(str_value);
        if (std::is_same<int64_t, T>::value) {
          int64_t tmp_value;
          convert_stream >> tmp_value;
          value = static_cast<T>(tmp_value);
        } else {
          double tmp_value;
          convert_stream >> tmp_value;
          value = static_cast<T>(tmp_value);
        }
      }
    }

    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
    if (cpu_place) {
      auto& dev_ctx = *pool.Get(platform::CPUPlace());
      phi::funcs::SetConstant<phi::CPUContext, T> functor;
      out->mutable_data(platform::CPUPlace(), framework::TransToPhiDataType(data_type));
      functor(reinterpret_cast<const phi::CPUContext&>(dev_ctx), out, static_cast<T>(value));
    } else {
      out->mutable_data(ctx.GetPlace(), framework::TransToPhiDataType(data_type));
      const T* value_data = &value;
      cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
      MLUCnnlTensorDesc output_desc(*out);
      MLUCnnl::Fill(ctx, pointer_mode, value_data, output_desc.get(), GetBasePtr(out));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(fill_constant_batch_size_like,
                       ops::FillConstantBatchSizeLikeOpMLUKernel<int>,
                       ops::FillConstantBatchSizeLikeOpMLUKernel<float>,
                       ops::FillConstantBatchSizeLikeOpMLUKernel<plat::float16>);
paddle/fluid/operators/fill_constant_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {

template <typename T>
class FillConstantMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto str_value = ctx.Attr<std::string>("str_value");
    auto float_value = ctx.Attr<float>("value");
    auto* out_var = ctx.Output<phi::DenseTensor>("Out");

    T value;
    if (str_value.empty()) {
      value = static_cast<T>(float_value);
    } else {
      // handle NaN/Inf first, which cannot be read from stream.
      if (str_value == "inf") {
        value = static_cast<T>(std::numeric_limits<double>::infinity());
      } else if (str_value == "-inf") {
        value = static_cast<T>(-std::numeric_limits<double>::infinity());
      } else if (str_value == "nan") {
        value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
      } else {
        std::stringstream convert_stream(str_value);
        if (std::is_same<int64_t, T>::value) {
          int64_t tmp_value;
          convert_stream >> tmp_value;
          value = static_cast<T>(tmp_value);
        } else {
          double tmp_value;
          convert_stream >> tmp_value;
          value = static_cast<T>(tmp_value);
        }
      }
    }
    const T* value_data = &value;
    cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
    if (ctx.HasInput("ValueTensor")) {
      auto* value_tensor = ctx.Input<phi::DenseTensor>("ValueTensor");
      PADDLE_ENFORCE_EQ(
          value_tensor->numel(), 1,
          platform::errors::InvalidArgument(
              "When use phi::DenseTensor as value to set phi::DenseTensor "
              "value in fill_cosntant, "
              "value input(ValueTensor) size must be 1, but get %d",
              value_tensor->numel()));
      value_data = value_tensor->data<T>();
      auto tmp_place = value_tensor->place();
      if (platform::is_mlu_place(tmp_place)) {
        pointer_mode = CNNL_POINTER_MODE_DEVICE;
      }
    }

    auto shape = GetShape(ctx);
    out_var->mutable_data<T>(shape, ctx.GetPlace());
    MLUCnnlTensorDesc output_desc(*out_var);
    MLUCnnl::Fill(ctx, pointer_mode, value_data, output_desc.get(), GetBasePtr(out_var));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(fill_constant,
                       paddle::operators::FillConstantMLUKernel<float>,
                       paddle::operators::FillConstantMLUKernel<bool>,
                       paddle::operators::FillConstantMLUKernel<int>,
                       paddle::operators::FillConstantMLUKernel<uint8_t>,
                       paddle::operators::FillConstantMLUKernel<int16_t>,
                       paddle::operators::FillConstantMLUKernel<int64_t>,
                       paddle::operators::FillConstantMLUKernel<paddle::platform::float16>);
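A minimal sketch of the str_value handling shared by fill_constant and fill_constant_batch_size_like above: "inf", "-inf" and "nan" are special-cased because they cannot be read from a stringstream, everything else goes through stream extraction. The helper name is illustrative only.

#include <cstdio>
#include <limits>
#include <sstream>
#include <string>

double parse_value(const std::string& s, double fallback) {
  if (s.empty()) return fallback;  // fall back to the float "value" attribute
  if (s == "inf") return std::numeric_limits<double>::infinity();
  if (s == "-inf") return -std::numeric_limits<double>::infinity();
  if (s == "nan") return std::numeric_limits<double>::quiet_NaN();
  std::stringstream ss(s);
  double v = 0.0;
  ss >> v;                         // ordinary numeric strings
  return v;
}

int main() {
  std::printf("%f %f\n", parse_value("", 1.5), parse_value("-inf", 0.0));
  return 0;
}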
paddle/fluid/operators/flatten_op_mlu.cc
deleted, 100644 → 0
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/flatten_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class FlattenMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<phi::DenseTensor>("X");
    auto* out = context.Output<phi::DenseTensor>("Out");
    auto& axes = context.Attr<int>("axis");
    auto x_dims = in->dims();
    auto out_dims = phi::make_ddim(GetOutputShape(axes, x_dims));
    out->mutable_data(context.GetPlace(), in->type());
    framework::TensorCopy(
        *in, context.GetPlace(),
        context.template device_context<platform::DeviceContext>(), out);
    out->Resize(out_dims);
  }

  static std::vector<int32_t> GetOutputShape(const int axis,
                                             const framework::DDim& in_dims) {
    int64_t outer = 1, inner = 1;
    for (int i = 0; i < in_dims.size(); ++i) {
      if (i < axis) {
        outer *= in_dims[i];
      } else {
        inner *= in_dims[i];
      }
    }
    std::vector<int32_t> out_shape(2);
    out_shape[0] = outer;
    out_shape[1] = inner;
    return out_shape;
  }
};

template <typename DeviceContext, typename T>
class FlattenGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto in_dims = ctx.Input<phi::DenseTensor>("X")->dims();
    d_x->mutable_data(ctx.GetPlace(), d_out->type());
    framework::TensorCopy(
        *d_out, ctx.GetPlace(),
        ctx.template device_context<platform::MLUDeviceContext>(), d_x);
    d_x->Resize(in_dims);
  }
};

template <typename DeviceContext, typename T>
class Flatten2MLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto& axes = context.Attr<int>("axis");
    auto* in = context.Input<phi::DenseTensor>("X");
    auto x_dims = in->dims();
    auto* out = context.Output<phi::DenseTensor>("Out");
    auto out_dims = phi::make_ddim(
        FlattenMLUKernel<DeviceContext, T>::GetOutputShape(axes, x_dims));
    out->mutable_data(context.GetPlace(), in->type());
    framework::TensorCopy(
        *in, context.GetPlace(),
        context.template device_context<platform::DeviceContext>(), out);
    out->Resize(out_dims);
  }
};

template <typename DeviceContext, typename T>
class Flatten2GradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto xshape_dims = ctx.Input<phi::DenseTensor>("XShape")->dims();
    auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
    d_x->mutable_data(ctx.GetPlace(), d_out->type());
    framework::TensorCopy(
        *d_out, ctx.GetPlace(),
        ctx.template device_context<platform::DeviceContext>(), d_x);
    d_x->Resize(x_dims);
  }
};

template <typename DeviceContext, typename T>
class FlattenContiguousRangeMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<phi::DenseTensor>("X");
    auto* out = context.Output<phi::DenseTensor>("Out");
    out->mutable_data(context.GetPlace(), in->type());
    auto& start_axis = context.Attr<int>("start_axis");
    auto& stop_axis = context.Attr<int>("stop_axis");

    // make out dims
    auto in_dims = in->dims();
    auto out_dims = phi::make_ddim(GetOutputShape(start_axis, stop_axis, in_dims));
    framework::TensorCopy(
        *in, context.GetPlace(),
        context.template device_context<platform::DeviceContext>(), out);
    out->Resize(out_dims);
  }

  static std::vector<int32_t> GetOutputShape(const int start_axis,
                                             const int stop_axis,
                                             const framework::DDim& in_dims) {
    int64_t outer = 1;
    std::vector<int32_t> out_shape;
    int in_dims_size = in_dims.size();
    out_shape.reserve(in_dims_size - stop_axis + start_axis);
    int real_start_axis = start_axis, real_stop_axis = stop_axis;
    if (start_axis < 0) {
      real_start_axis = start_axis + in_dims_size;
    }
    if (stop_axis < 0) {
      real_stop_axis = stop_axis + in_dims_size;
    }
    for (int i = 0; i < real_start_axis; ++i) {
      out_shape.push_back(in_dims[i]);
    }
    for (int i = real_start_axis; i <= real_stop_axis; i++) {
      if (in_dims[i] == -1 || outer == -1) {
        outer = -1;
      } else {
        outer *= in_dims[i];
      }
    }
    out_shape.push_back(outer);
    for (int i = real_stop_axis + 1; i < in_dims_size; i++) {
      out_shape.push_back(in_dims[i]);
    }
    return out_shape;
  }
};

template <typename DeviceContext, typename T>
class FlattenContiguousRangeGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto xshape_dims = ctx.Input<phi::DenseTensor>("XShape")->dims();
    auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
    d_x->mutable_data(ctx.GetPlace(), d_out->type());
    framework::TensorCopy(
        *d_out, ctx.GetPlace(),
        ctx.template device_context<paddle::platform::MLUDeviceContext>(), d_x);
    d_x->Resize(x_dims);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
    flatten,
    ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, double>,
    ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
    ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int>,
    ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
    ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    flatten_grad,
    ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, double>,
    ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
    ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int>,
    ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
    ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    flatten2,
    ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, double>,
    ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
    ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int>,
    ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
    ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    flatten2_grad,
    ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, double>,
    ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
    ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int>,
    ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
    ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    flatten_contiguous_range,
    ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext, double>,
    ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
    ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext, int>,
    ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
    ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    flatten_contiguous_range_grad,
    ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext, double>,
    ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
    ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext, int>,
    ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
    ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
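A minimal sketch of the GetOutputShape logic used by flatten_contiguous_range above: dimensions in [start_axis, stop_axis] (negative axes counted from the end) are multiplied into a single extent, with -1 propagating, and the remaining dimensions are kept. Illustrative only.

#include <cstdio>
#include <vector>

std::vector<int> flatten_shape(const std::vector<int>& dims, int start, int stop) {
  int n = static_cast<int>(dims.size());
  if (start < 0) start += n;  // normalize negative axes
  if (stop < 0) stop += n;
  std::vector<int> out(dims.begin(), dims.begin() + start);
  long long outer = 1;
  for (int i = start; i <= stop; ++i)
    outer = (dims[i] == -1 || outer == -1) ? -1 : outer * dims[i];
  out.push_back(static_cast<int>(outer));
  out.insert(out.end(), dims.begin() + stop + 1, dims.end());
  return out;
}

int main() {
  auto s = flatten_shape({2, 3, 4, 5}, 1, 2);  // -> {2, 12, 5}
  for (int v : s) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}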
paddle/fluid/operators/gather_nd_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {

template <typename T>
class GatherNdMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* index = ctx.Input<phi::DenseTensor>("Index");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto place = ctx.GetPlace();
    out->template mutable_data<T>(place);
    if (x->numel() == 0) return;
    if (index->numel() == 0) {
      auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
      framework::TensorCopy(*x, place, dev_ctx, out);
      return;
    }

    const auto& index_type = framework::TransToProtoVarType(index->dtype());
    bool index_type_match = index_type == framework::proto::VarType::INT32 ||
                            index_type == framework::proto::VarType::INT64;
    PADDLE_ENFORCE_EQ(
        index_type_match, true,
        platform::errors::InvalidArgument(
            "Index holds the wrong type, it holds [%s],"
            "but desires to be [%s] or [%s]",
            paddle::framework::DataTypeToString(index_type),
            paddle::framework::DataTypeToString(framework::proto::VarType::INT32),
            paddle::framework::DataTypeToString(framework::proto::VarType::INT64)));

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc index_desc(*index);
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::GatherNd(ctx, x_desc.get(), GetBasePtr(x), index_desc.get(),
                      GetBasePtr(index), out_desc.get(), GetBasePtr(out));
  }
};

template <typename T>
class GatherNdGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* index = ctx.Input<phi::DenseTensor>("Index");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* x = ctx.Input<phi::DenseTensor>("X");
    if (dx->numel() == 0) return;
    if (index->numel() == 0) {
      auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
      framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
      return;
    }

    phi::DenseTensor tmp_tensor(index->type());
    phi::DenseTensor tmp_tensor2(dout->type());
    const auto index_dims = index->dims();
    if (index_dims.size() == 1) {
      tmp_tensor.ShareDataWith(*index);
      std::vector<int64_t> new_dim = {1, index_dims[0]};
      tmp_tensor.Resize(phi::make_ddim(new_dim));
      index = &tmp_tensor;

      tmp_tensor2.ShareDataWith(*dout);
      std::vector<int64_t> new_dim2{1};
      for (int i = index->numel(); i < x->dims().size(); i++) {
        new_dim2.push_back(x->dims()[i]);
      }
      tmp_tensor2.Resize(phi::make_ddim(new_dim2));
      dout = &tmp_tensor2;
    }

    dx->mutable_data<T>(ctx.GetPlace());
    MLUCnnlTensorDesc dx_desc(*dx);
    auto value = static_cast<T>(0);
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), GetBasePtr(dx));

    MLUCnnlTensorDesc index_desc(*index);
    MLUCnnlTensorDesc dout_desc(*dout);

    const cnnlScatterNdMode_t mode = CNNL_SCATTERND_ADD;
    MLUCnnl::ScatterNd(ctx,
                       mode,
                       index_desc.get(),
                       GetBasePtr(index),
                       dout_desc.get(),
                       GetBasePtr(dout),
                       dx_desc.get(),
                       GetBasePtr(dx),
                       dx_desc.get(),
                       GetBasePtr(dx));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gather_nd,
                       ops::GatherNdMLUKernel<float>,
                       ops::GatherNdMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(gather_nd_grad,
                       ops::GatherNdGradMLUKernel<paddle::platform::float16>,
                       ops::GatherNdGradMLUKernel<float>);
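A minimal host-side sketch of why the backward kernel above zero-fills dx and then scatters with CNNL_SCATTERND_ADD: positions gathered more than once in the forward pass must accumulate all of their incoming gradients. Illustrative only, shown for the 1-D index case.

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> dx(4, 0.0f);               // the Fill(..., 0) step
  std::vector<int> index = {1, 3, 1};           // rows gathered in forward
  std::vector<float> dout = {0.5f, 2.0f, 0.25f};
  for (size_t i = 0; i < index.size(); ++i)
    dx[index[i]] += dout[i];                    // scatter-add, like ScatterNd
  for (float v : dx) std::printf("%g ", v);     // prints: 0 0.75 0 2
  std::printf("\n");
  return 0;
}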
paddle/fluid/operators/gather_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class GatherOpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* index = ctx.Input<phi::DenseTensor>("Index");
    auto axis = ctx.Attr<int>("axis");

    const auto index_dims = index->dims();
    if (index_dims.size() == 2) {
      PADDLE_ENFORCE_EQ(
          index_dims[1], 1,
          platform::errors::InvalidArgument(
              "The last dim of index should be 1 when it is 2D, but we get %d",
              index_dims[1]));
    } else {
      PADDLE_ENFORCE_EQ(
          index_dims.size(), 1,
          platform::errors::InvalidArgument(
              "The index should be 1D, when it is not 2D, but we get %d",
              index_dims.size()));
    }

    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc x_desc(*x);
    int index_shape_1d[1] = {static_cast<int>(index_dims[0])};
    MLUCnnlTensorDesc index_desc(1, index_shape_1d, ToCnnlDataType(index->dtype()));
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::GatherFunctor(ctx,
                           axis,
                           0 /*batch_dims*/,
                           x_desc.get(),
                           GetBasePtr(x),
                           index_desc.get(),
                           GetBasePtr(index),
                           out_desc.get(),
                           GetBasePtr(out));
  }
};

template <typename T>
class GatherGradOpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* index = ctx.Input<phi::DenseTensor>("Index");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));

    const auto index_dims = index->dims();
    if (index_dims.size() == 2) {
      PADDLE_ENFORCE_EQ(
          index_dims[1], 1,
          platform::errors::InvalidArgument(
              "The last dim of index should be 1 when it is 2D, but we get %d",
              index_dims[1]));
    } else {
      PADDLE_ENFORCE_EQ(
          index_dims.size(), 1,
          platform::errors::InvalidArgument(
              "The index should be 1D, when it is not 2D, but we get %d",
              index_dims.size()));
    }

    dx->mutable_data<T>(ctx.GetPlace());
    MLUCnnlTensorDesc dx_desc(*dx);
    auto value = static_cast<T>(0);
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), GetBasePtr(dx));

    int index_shape_1d[1] = {static_cast<int>(index_dims[0])};
    MLUCnnlTensorDesc index_desc(1, index_shape_1d, ToCnnlDataType(index->dtype()));
    MLUCnnlTensorDesc dout_desc(*dout);
    const cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE;
    MLUCnnl::ScatterRefFunctor(ctx,
                               dx_desc.get(),
                               GetBasePtr(dx),
                               dout_desc.get(),
                               GetBasePtr(dout),
                               index_desc.get(),
                               GetBasePtr(index),
                               mode);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gather,
                       ops::GatherOpMLUKernel<float>,
                       ops::GatherOpMLUKernel<paddle::platform::float16>,
                       ops::GatherOpMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(gather_grad,
                       ops::GatherGradOpMLUKernel<float>,
                       ops::GatherGradOpMLUKernel<paddle::platform::float16>,
                       ops::GatherGradOpMLUKernel<int>);
paddle/fluid/operators/gaussian_random_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <random>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/generator.h"
namespace paddle {
namespace operators {

template <typename T>
class MLUGaussianRandomKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    float mean = context.Attr<float>("mean");
    float std = context.Attr<float>("std");
    auto* tensor = context.Output<phi::DenseTensor>("Out");
    tensor->mutable_data<T>(context.GetPlace());

    phi::DenseTensor cpu_tensor(tensor->type());
    cpu_tensor.Resize(tensor->dims());
    T* cpu_data = cpu_tensor.mutable_data<T>(platform::CPUPlace());
    std::normal_distribution<T> dist(mean, std);

    int64_t size = tensor->numel();
    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
    auto engine = phi::GetCPURandomEngine(seed);
    for (int64_t i = 0; i < size; ++i) {
      cpu_data[i] = dist(*engine);
    }
    auto& dev_ctx =
        context.template device_context<paddle::platform::MLUDeviceContext>();
    framework::TensorCopy(cpu_tensor, context.GetPlace(), dev_ctx, tensor);
    dev_ctx.Wait();
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gaussian_random, ops::MLUGaussianRandomKernel<float>);
paddle/fluid/operators/grid_sampler_op_mlu.cc
deleted, 100644 → 0
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class GridSamplerMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_mlu_place(ctx.GetPlace()), true,
        platform::errors::Unavailable("This kernel only runs on MLU."));

    // input and output data
    const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("X");
    const phi::DenseTensor* grid = ctx.Input<phi::DenseTensor>("Grid");
    phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");

    int n = input->dims()[0];
    int c = input->dims()[1];
    int out_h = grid->dims()[1];
    int out_w = grid->dims()[2];

    output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());

    // attrs
    // paddle.nn.functional.grid_sample(x, grid, mode='bilinear',
    // padding_mode='zeros', align_corners=True, name=None)
    const std::string mode = ctx.Attr<std::string>("mode");
    const std::string padding_mode = ctx.Attr<std::string>("padding_mode");
    bool align_corners = ctx.Attr<bool>("align_corners");
    const std::string data_format = phi::DataLayoutToString(input->layout());

    PADDLE_ENFORCE_EQ(
        mode == "bilinear", true,
        platform::errors::Unavailable(
            "Only support bilinear mode in mlu grid_sample kernel."));
    PADDLE_ENFORCE_EQ(
        padding_mode == "zeros", true,
        platform::errors::Unavailable(
            "Only support zeros padding_mode in mlu grid_sample kernel."));

    phi::DenseTensor trans_input(input->dtype());
    // transpose input from NCHW to NHWC
    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
    TransposeFromMLUTensor<T>(ctx, perm_to_nhwc, input, &trans_input,
                              true /*need_reshape_or_alloc*/);

    phi::DenseTensor tmp_output(output->dtype());
    tmp_output.mutable_data<T>({n, out_h, out_w, c}, ctx.GetPlace());

    MLUCnnlGridSampleDesc grid_sample_desc(mode, padding_mode, align_corners);
    MLUCnnlTensorDesc input_desc(trans_input, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
    MLUCnnlTensorDesc grid_desc(*grid, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
    MLUCnnlTensorDesc tmp_output_desc(tmp_output, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
    MLUCnnl::GridSample(ctx,
                        grid_sample_desc.get(),
                        input_desc.get(),
                        GetBasePtr(&trans_input),
                        grid_desc.get(),
                        GetBasePtr(grid),
                        tmp_output_desc.get(),
                        GetBasePtr(&tmp_output));

    // transpose output from NHWC to NCHW
    const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
    TransposeFromMLUTensor<T>(ctx, perm_to_nchw, &tmp_output, output,
                              false /*need_reshape_or_alloc*/);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(grid_sampler,
                       ops::GridSamplerMLUKernel<float>,
                       ops::GridSamplerMLUKernel<plat::float16>);
paddle/fluid/operators/huber_loss_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class HuberLossMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = GetDevCtxFromCTX(ctx);
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* residual = ctx.Output<phi::DenseTensor>("Residual");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto delta = ctx.Attr<float>("delta");
    auto place = ctx.GetPlace();

    // compute y-x
    cnnlDataType_t data_type = ToCnnlDataType<T>();
    residual->mutable_data<T>(x->dims(), place);
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlOpTensorDesc sub_op_desc(CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
    MLUCnnl::OpTensor(ctx,
                      sub_op_desc.get(),
                      x_desc.get(),
                      GetBasePtr(y),
                      x_desc.get(),
                      GetBasePtr(x),
                      x_desc.get(),
                      GetBasePtr(residual),
                      data_type);

    // compute smoothl1loss
    out->mutable_data<T>(x->dims(), place);
    cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
        CNNL_SMOOTHL1LOSS_REDUCTION_NONE;  // defines whether to do reduction here
    MLUCnnl::SmoothL1LossForward(ctx,
                                 x_desc.get(),
                                 GetBasePtr(x),
                                 x_desc.get(), /* target has same shape as x */
                                 GetBasePtr(y),
                                 static_cast<float>(delta),
                                 smoothl1_algo,
                                 x_desc.get(), /* out has same shape as x */
                                 GetBasePtr(out));

    // compute multiply by delta
    phi::DenseTensor scale_tensor, bias_tensor;
    scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
    const int axis = std::max(out->dims().size() - 1, 0);

    MLUCnnlTensorDesc scale_desc(scale_tensor);
    MLUCnnlTensorDesc bias_desc(bias_tensor);
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::Scale(ctx,
                   axis,
                   out_desc.get(),
                   GetBasePtr(out),
                   scale_desc.get(),
                   GetBasePtr(&scale_tensor),
                   bias_desc.get(),
                   GetBasePtr(&bias_tensor),
                   out_desc.get(),
                   GetBasePtr(out));
  }
};

template <typename T>
class HuberLossGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = GetDevCtxFromCTX(ctx);
    auto* residual = ctx.Input<phi::DenseTensor>("Residual");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    auto delta = ctx.Attr<float>("delta");
    auto place = ctx.GetPlace();

    phi::DenseTensor t_grad_rd;
    t_grad_rd = ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
    MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd);
    if (dx || dy) {
      phi::DenseTensor t_zero;
      t_zero = ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
      FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &t_zero);

      MLUCnnlTensorDesc residual_desc(*residual);
      MLUCnnlTensorDesc dout_desc(*dout);

      cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
          CNNL_SMOOTHL1LOSS_REDUCTION_NONE;  // defines whether to do reduction here
      MLUCnnl::SmoothL1LossBackward(ctx,
                                    residual_desc.get(),
                                    GetBasePtr(residual),
                                    residual_desc.get(),
                                    GetBasePtr(&t_zero),
                                    dout_desc.get(),
                                    GetBasePtr(dout),
                                    static_cast<float>(delta),
                                    smoothl1_algo,
                                    t_grad_rd_desc.get(),
                                    GetBasePtr(&t_grad_rd));
    }
    // compute multiply by delta
    phi::DenseTensor scale_tensor, bias_tensor;
    scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
    const int axis = std::max(t_grad_rd.dims().size() - 1, 0);
    MLUCnnlTensorDesc scale_desc(scale_tensor);
    MLUCnnlTensorDesc bias_desc(bias_tensor);

    if (dx) {
      dx->mutable_data<T>(place);
      FillMLUTensorWithHostValue(ctx, static_cast<T>(-delta), &scale_tensor);
      MLUCnnlTensorDesc out_desc(*dx);
      MLUCnnl::Scale(ctx,
                     axis,
                     t_grad_rd_desc.get(),
                     GetBasePtr(&t_grad_rd),
                     scale_desc.get(),
                     GetBasePtr(&scale_tensor),
                     bias_desc.get(),
                     GetBasePtr(&bias_tensor),
                     out_desc.get(),
                     GetBasePtr(dx));
    }
    if (dy) {
      dy->mutable_data<T>(place);
      FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
      MLUCnnlTensorDesc out_desc(*dy);
      MLUCnnl::Scale(ctx,
                     axis,
                     t_grad_rd_desc.get(),
                     GetBasePtr(&t_grad_rd),
                     scale_desc.get(),
                     GetBasePtr(&scale_tensor),
                     bias_desc.get(),
                     GetBasePtr(&bias_tensor),
                     out_desc.get(),
                     GetBasePtr(dy));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(huber_loss,
                       ops::HuberLossMLUKernel<float>,
                       ops::HuberLossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(huber_loss_grad,
                       ops::HuberLossGradMLUKernel<float>,
                       ops::HuberLossGradMLUKernel<plat::float16>);
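A minimal host-side sketch of the identity the forward kernel above appears to rely on when it computes a smooth-L1 loss and then scales the result by delta, assuming the CNNL smooth-L1 uses `delta` as its beta threshold: huber(r, delta) = delta * smooth_l1(r, beta = delta). Illustrative only, not a CNNL call.

#include <cmath>
#include <cstdio>

float smooth_l1(float r, float beta) {
  float a = std::fabs(r);
  return a < beta ? 0.5f * r * r / beta : a - 0.5f * beta;
}

float huber(float r, float delta) {
  float a = std::fabs(r);
  return a <= delta ? 0.5f * r * r : delta * (a - 0.5f * delta);
}

int main() {
  float delta = 1.5f;
  for (float r : {0.3f, 1.5f, 4.0f})
    std::printf("%g vs %g\n", delta * smooth_l1(r, delta), huber(r, delta));  // pairs match
  return 0;
}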
paddle/fluid/operators/interpolate_v2_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/interpolate_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {

using DataLayout = phi::DataLayout;

inline std::vector<int> get_new_shape_mlu(
    const std::vector<const phi::DenseTensor*>& list_new_shape_tensor) {
  // get tensor from
  std::vector<int> vec_new_shape;
  for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
    auto tensor = list_new_shape_tensor[i];
    PADDLE_ENFORCE_EQ(tensor->dims(), phi::make_ddim({1}),
        platform::errors::InvalidArgument("shape of dim tensor should be [1]"));
    phi::DenseTensor temp;
    paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
    vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
  }
  return vec_new_shape;
}

template <typename T>
class InterpolateV2MLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    auto input_dims = input->dims();
    PADDLE_ENFORCE_GE(input_dims.size(), 4,
        platform::errors::External("MLU Interpolate kernel supports input "
                                   "range greater or equal than 4."));
    PADDLE_ENFORCE_LE(input_dims.size(), 5,
        platform::errors::External("MLU Interpolate kernel supports input "
                                   "range less or equal than 5. "));
    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
    int n, c, in_d, in_h, in_w;
    ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w);
    auto interp_method = ctx.Attr<std::string>("interp_method");
    bool align_corners = ctx.Attr<bool>("align_corners");
    int align_mode = ctx.Attr<int>("align_mode");
    int align_center = align_corners ? 0 : (align_mode == 1 ? 0 : 1);
    int out_d = ctx.Attr<int>("out_d");
    int out_h = ctx.Attr<int>("out_h");
    int out_w = ctx.Attr<int>("out_w");
    float scale_d = -1;
    float scale_h = -1;
    float scale_w = -1;
    auto list_new_size_tensor = ctx.MultiInput<phi::DenseTensor>("SizeTensor");
    if (list_new_size_tensor.size() > 0) {
      // have size tensor
      auto new_size = get_new_shape_mlu(list_new_size_tensor);
      if (new_size.size() <= 2) {
        // default NCHW
        out_h = new_size[0];
        out_w = new_size[1];
      } else {
        // rank of input is 5, HCDHW
        out_d = new_size[0];
        out_h = new_size[1];
        out_w = new_size[2];
      }
    } else {
      auto scale_tensor = ctx.Input<phi::DenseTensor>("Scale");
      auto scale = ctx.Attr<std::vector<float>>("scale");
      if (scale_tensor != nullptr) {
        std::vector<float> scale_data;
        scale_data = phi::GetVectorFromTensor<float>(scale_tensor);
        if (scale_data.size() > 1 && scale_data.size() <= 2) {
          scale_h = scale_data[0];
          scale_w = scale_data[1];
        } else if (scale_data.size() > 2) {
          scale_d = scale_data[0];
          scale_h = scale_data[1];
          scale_w = scale_data[2];
        } else {
          scale_d = scale_data[0];
          scale_h = scale_data[0];
          scale_w = scale_data[0];
        }
        PADDLE_ENFORCE_EQ(scale_w > 0 && scale_h > 0, true,
            platform::errors::InvalidArgument("scale of Op(interpolate) "
                                              "should be greater than 0."));
      } else {
        if (scale.size() > 1 && scale.size() <= 2) {
          scale_h = scale[0];
          scale_w = scale[1];
          PADDLE_ENFORCE_EQ(scale_w > 0 && scale_h > 0, true,
              platform::errors::InvalidArgument("scale of Op(interpolate) "
                                                "should be greater than 0."));
        } else if (scale.size() > 2) {
          scale_d = scale[0];
          scale_h = scale[1];
          scale_w = scale[2];
          PADDLE_ENFORCE_EQ(scale_d > 0 && scale_w > 0 && scale_h > 0, true,
              platform::errors::InvalidArgument("scale of Op(interpolate) "
                                                "should be greater than 0."));
        }
      }
      if (scale_h > 0. && scale_w > 0.) {
        out_h = static_cast<int>(in_h * scale_h);
        out_w = static_cast<int>(in_w * scale_w);
      }
      if (scale_d > 0.) {
        out_d = static_cast<int>(in_d * scale_d);
      }
      auto out_size = ctx.Input<phi::DenseTensor>("OutSize");
      if (out_size != nullptr) {
        std::vector<int32_t> out_size_data;
        out_size_data = phi::GetVectorFromTensor<int>(out_size);
        if (out_size_data.size() <= 2) {
          out_h = out_size_data[0];
          out_w = out_size_data[1];
        } else {
          out_d = out_size_data[0];
          out_h = out_size_data[1];
          out_w = out_size_data[2];
        }
      }
    }
    PADDLE_ENFORCE_GT(out_h, 0,
        platform::errors::InvalidArgument("out_h in Attr(out_shape) of "
                                          "Op(interpolate) "
                                          "should be greater than 0."));
    PADDLE_ENFORCE_GT(out_w, 0,
        platform::errors::InvalidArgument("out_w in Attr(out_shape) of "
                                          "Op(interpolate) "
                                          "should be greater than 0."));
    // do transpose according to cnnl's constraints
    // cnnlInterp_v2 only accepts NHWC when mode is CNNL_INTERP_BILINEAR and
    // CNNL_INTERP_NEAREST,
    framework::DDim dim_in, dim_in_trans, dim_out, dim_out_trans;
    phi::DenseTensor transformed_input, transformed_output;
    bool need_transpose = input_dims.size() != 2;
    if (input_dims.size() == 4) {
      // need to do transpose if layout is kNCHW
      need_transpose &= data_layout == DataLayout::kNCHW;
      if (need_transpose) {
        // if need_transpose, do the following
        // 1. transpose input NCHW -> NHWC
        // 2. interpolation in(NHWC) -> out(NHWC)
        // 3. transpose output NHWC -> HCHW
        // dim_in = {n, c, in_h, in_w};
        dim_in_trans = {n, in_h, in_w, c};
        dim_out = {n, c, out_h, out_w};
        dim_out_trans = {n, out_h, out_w, c};
        output->mutable_data<T>(dim_out, ctx.GetPlace());
        if (in_h == out_h && in_w == out_w) {
          framework::TensorCopy(*input, ctx.GetPlace(), output);
          return;
        }
        // do transpose on input tensor, then do interpolation
        MLUCnnlTensorDesc input_desc(
            *input, CNNL_LAYOUT_NCHW, ToCnnlDataType(input->dtype()));
        transformed_input =
            ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_in_trans, dev_ctx);
        transformed_output =
            ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_out_trans, dev_ctx);
        MLUCnnlTensorDesc input_reshaped_desc(
            transformed_input, CNNL_LAYOUT_NHWC,
            ToCnnlDataType(transformed_input.dtype()));
        const std::vector<int> perm = {0, 2, 3, 1};
        MLUCnnl::Transpose(ctx, perm, input_dims.size(),
                           input_desc.get(), GetBasePtr(input),
                           input_reshaped_desc.get(),
                           GetBasePtr(&transformed_input));
      } else {
        // if no need_transpose, do the following
        // 1. interpolation in(NHWC) -> out(NHWC)
        // dim_in = {n, in_h, in_w, c};
        dim_out = {n, out_h, out_w, c};
        output->mutable_data<T>(dim_out, ctx.GetPlace());
        if (in_h == out_h && in_w == out_w) {
          framework::TensorCopy(*input, ctx.GetPlace(), output);
          return;
        }
        transformed_input = *input;
        transformed_output = *output;
      }
      MLUCnnlTensorDesc input_desc(transformed_input, CNNL_LAYOUT_NHWC,
                                   ToCnnlDataType(transformed_input.dtype()));
      MLUCnnlTensorDesc output_desc(transformed_output, CNNL_LAYOUT_NHWC,
                                    ToCnnlDataType(transformed_output.dtype()));
      MLUCnnl::Interp(ctx, GetMLUCnnlInterpMode(interp_method),
                      align_corners, align_center,
                      input_desc.get(), GetBasePtr(&transformed_input),
                      output_desc.get(), GetBasePtr(&transformed_output));
      if (need_transpose) {
        // if need_transpose, reshape output back to NCHW
        const std::vector<int> perm = {0, 3, 1, 2};
        MLUCnnlTensorDesc output_reshape_desc(
            *output, CNNL_LAYOUT_NCHW, ToCnnlDataType(output->dtype()));
        MLUCnnl::Transpose(ctx, perm, dim_out_trans.size(),
                           output_desc.get(), GetBasePtr(&transformed_output),
                           output_reshape_desc.get(), GetBasePtr(output));
      }
    } else {
      PADDLE_ENFORCE_EQ(interp_method, "trilinear",
          platform::errors::External("MLU Interpolate kernel only supports 5D "
                                     "data in trilinear mode."));
      // need to do transpose if layout is kNCDHW
      need_transpose &= data_layout == DataLayout::kNCHW;
      if (need_transpose) {
        // if need_transpose, do the following
        // 1. transpose input NCDHW -> NDHWC
        // 2. interpolation in(NDHWC) -> out(NDHWC)
        // 3. transpose output NDHWC -> HCDHW
        // dim_in = {n, c, in_d, in_h, in_w};
        dim_in_trans = {n, in_d, in_h, in_w, c};
        dim_out = {n, c, out_d, out_h, out_w};
        dim_out_trans = {n, out_d, out_h, out_w, c};
        output->mutable_data<T>(dim_out, ctx.GetPlace());
        if (in_h == out_h && in_w == out_w && in_d == out_d) {
          framework::TensorCopy(*input, ctx.GetPlace(), output);
          return;
        }
        // do transpose on input tensor (HCDHW -> NDHWC), then do interpolation
        MLUCnnlTensorDesc input_desc(
            *input, CNNL_LAYOUT_NCDHW, ToCnnlDataType(input->dtype()));
        transformed_input =
            ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_in_trans, dev_ctx);
        transformed_output =
            ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_out_trans, dev_ctx);
        MLUCnnlTensorDesc input_reshaped_desc(
            transformed_input, CNNL_LAYOUT_NDHWC,
            ToCnnlDataType(transformed_input.dtype()));
        const std::vector<int> perm = {0, 2, 3, 4, 1};
        MLUCnnl::Transpose(ctx, perm, input_dims.size(),
                           input_desc.get(), GetBasePtr(input),
                           input_reshaped_desc.get(),
                           GetBasePtr(&transformed_input));
      } else {
        // if no need_transpose, do the following
        // 1. interpolation in(NDHWC) -> out(NDHWC)
        // dim_in = {n, in_d, in_h, in_w, c};
        dim_out = {n, out_d, out_h, out_w, c};
        output->mutable_data<T>(dim_out, ctx.GetPlace());
        if (in_h == out_h && in_w == out_w && in_d == out_d) {
          framework::TensorCopy(*input, ctx.GetPlace(), output);
          return;
        }
        transformed_input = *input;
        transformed_output = *output;
      }
      MLUCnnlTensorDesc input_desc(transformed_input, CNNL_LAYOUT_NDHWC,
                                   ToCnnlDataType(transformed_input.dtype()));
      MLUCnnlTensorDesc output_desc(transformed_output, CNNL_LAYOUT_NDHWC,
                                    ToCnnlDataType(transformed_output.dtype()));
      // use trilinear mode in HCDHW layout
      MLUCnnl::Interp(ctx, GetMLUCnnlInterpMode(interp_method),
                      align_corners, align_center,
                      input_desc.get(), GetBasePtr(&transformed_input),
                      output_desc.get(), GetBasePtr(&transformed_output));
      if (need_transpose) {
        // if need_transpose, reshape output back (NDHWC -> NCDHW)
        const std::vector<int> perm = {0, 4, 1, 2, 3};
        MLUCnnlTensorDesc output_reshape_desc(
            *output, CNNL_LAYOUT_NCDHW, ToCnnlDataType(output->dtype()));
        MLUCnnl::Transpose(ctx, perm, dim_out_trans.size(),
                           output_desc.get(), GetBasePtr(&transformed_output),
                           output_reshape_desc.get(), GetBasePtr(output));
      }
    }
  }
};

template <typename T>
class InterpolateV2GradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
    auto* input_grad =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* output_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto output_grad_dims = output_grad->dims();
    PADDLE_ENFORCE_EQ(output_grad_dims.size(), 4,
        platform::errors::External("XPU Interpolategrad kernel only support 2d"));
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto input_dims = input->dims();
    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
    int n, c, in_d, in_h, in_w;
    ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
    auto interp_method = ctx.Attr<std::string>("interp_method");
    bool align_corners = ctx.Attr<bool>("align_corners");
    int align_mode = ctx.Attr<int>("align_mode");
    int align_center = align_corners ? 0 : (align_mode == 0 ? 0 : 1);
    align_center = 0;
    int out_h = ctx.Attr<int>("out_h");
    int out_w = ctx.Attr<int>("out_w");
    float scale_h = -1;
    float scale_w = -1;
    auto list_new_size_tensor = ctx.MultiInput<phi::DenseTensor>("SizeTensor");
    if (list_new_size_tensor.size() > 0) {
      // have size tensor
      auto new_size = get_new_shape_mlu(list_new_size_tensor);
      out_h = new_size[0];
      out_w = new_size[1];
    } else {
      auto scale_tensor = ctx.Input<phi::DenseTensor>("Scale");
      auto scale = ctx.Attr<std::vector<float>>("scale");
      if (scale_tensor != nullptr) {
        std::vector<float> scale_data;
        scale_data = phi::GetVectorFromTensor<float>(scale_tensor);
        if (scale_data.size() > 1) {
          scale_h = scale_data[0];
          scale_w = scale_data[1];
        } else {
          scale_h = scale_data[0];
          scale_w = scale_data[0];
        }
        PADDLE_ENFORCE_EQ(scale_w > 0 && scale_h > 0, true,
            platform::errors::InvalidArgument("scale of Op(interpolate) "
                                              "should be greater than 0."));
      } else {
        if (scale.size() > 1) {
          scale_h = scale[0];
          scale_w = scale[1];
          PADDLE_ENFORCE_EQ(scale_w > 0 && scale_h > 0, true,
              platform::errors::InvalidArgument("scale of Op(interpolate) "
                                                "should be greater than 0."));
        }
      }
      if (scale_h > 0. && scale_w > 0.) {
        out_h = static_cast<int>(in_h * scale_h);
        out_w = static_cast<int>(in_w * scale_w);
      }
      auto out_size = ctx.Input<phi::DenseTensor>("OutSize");
      if (out_size != nullptr) {
        std::vector<int32_t> out_size_data;
        out_size_data = phi::GetVectorFromTensor<int>(out_size);
        out_h = out_size_data[0];
        out_w = out_size_data[1];
      }
    }
    framework::DDim dim_grad;
    framework::DDim dim_out_grad, dim_out_trans_grad, dim_in_grad,
        dim_in_trans_grad;
    phi::DenseTensor transformed_output_grad, transformed_input_grad;
    bool need_transpose =
        input_dims.size() != 2 && data_layout == DataLayout::kNCHW;
    if (need_transpose) {
      // if need_transpose, do the following
      // 1. transpose output_grad NCHW -> NHWC
      // 2. InterpBackward output_grad(NHWC) -> input_grad(NHWC)
      // 3. transpose input_grad NHWC -> HCHW
      // dim_out_grad = {n, c, out_h, out_w};
      dim_out_trans_grad = {n, out_h, out_w, c};
      dim_in_grad = {n, c, in_h, in_w};
      dim_in_trans_grad = {n, in_h, in_w, c};
      input_grad->mutable_data<T>(dim_in_grad, ctx.GetPlace());
      if (in_h == out_h && in_w == out_w) {
        framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
        return;
      }
      // do transpose on input tensor, then do interpolation
      MLUCnnlTensorDesc input_desc(
          *output_grad, CNNL_LAYOUT_NCHW, ToCnnlDataType(output_grad->dtype()));
      transformed_output_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
          dim_out_trans_grad, dev_ctx);
      transformed_input_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
          dim_in_trans_grad, dev_ctx);
      MLUCnnlTensorDesc input_reshaped_desc(
          transformed_output_grad, CNNL_LAYOUT_NHWC,
          ToCnnlDataType(transformed_output_grad.dtype()));
      const std::vector<int> perm = {0, 2, 3, 1};
      MLUCnnl::Transpose(ctx, perm, input_dims.size(),
                         input_desc.get(), GetBasePtr(output_grad),
                         input_reshaped_desc.get(),
                         GetBasePtr(&transformed_output_grad));
    } else {
      // if no need_transpose, do the following
      // 1. InterpBackward output_grad(NHWC) -> input_grad(NHWC)
      dim_in_grad = {n, in_h, in_w, c};
      input_grad->mutable_data<T>(dim_in_grad, ctx.GetPlace());
      if (in_h == out_h && in_w == out_w) {
        framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
        return;
      }
      transformed_output_grad = *output_grad;
      transformed_input_grad = *input_grad;
    }
    MLUCnnlTensorDesc input_desc(
        transformed_output_grad, CNNL_LAYOUT_NHWC,
        ToCnnlDataType(transformed_output_grad.dtype()));
    MLUCnnlTensorDesc output_desc(
        transformed_input_grad, CNNL_LAYOUT_NHWC,
        ToCnnlDataType(transformed_input_grad.dtype()));
    MLUCnnl::InterpBackward(ctx, GetMLUCnnlInterpBackwardMode(interp_method),
                            align_corners, align_center,
                            input_desc.get(),
                            GetBasePtr(&transformed_output_grad),
                            output_desc.get(),
                            GetBasePtr(&transformed_input_grad));
    if (need_transpose) {
      const std::vector<int> perm = {0, 3, 1, 2};
      MLUCnnlTensorDesc output_reshape_desc(
          *input_grad, CNNL_LAYOUT_NCHW, ToCnnlDataType(input_grad->dtype()));
      MLUCnnl::Transpose(ctx, perm, dim_in_trans_grad.size(),
                         output_desc.get(),
                         GetBasePtr(&transformed_input_grad),
                         output_reshape_desc.get(), GetBasePtr(input_grad));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(bilinear_interp_v2,
                       ops::InterpolateV2MLUKernel<float>,
                       ops::InterpolateV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(nearest_interp_v2,
                       ops::InterpolateV2MLUKernel<float>,
                       ops::InterpolateV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(nearest_interp_v2_grad,
                       ops::InterpolateV2GradMLUKernel<float>,
                       ops::InterpolateV2GradMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(bilinear_interp_v2_grad,
                       ops::InterpolateV2GradMLUKernel<float>,
                       ops::InterpolateV2GradMLUKernel<plat::float16>);
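Note (not part of the deleted file): the kernel above resolves the output height/width from three sources in a fixed order, which can be easy to miss in the branches. A minimal host-side sketch of that precedence for the 4-D case follows; the names SizeSpec and ResolveOutHW are hypothetical and exist only for illustration.

// Illustrative sketch only: mirrors the precedence used by InterpolateV2MLUKernel.
#include <vector>

struct SizeSpec {
  std::vector<int> size_tensor;   // values gathered from "SizeTensor"
  std::vector<float> scale;       // "Scale" input or "scale" attribute
  std::vector<int> out_size;      // values from "OutSize"
};

inline void ResolveOutHW(const SizeSpec& s, int in_h, int in_w,
                         int* out_h, int* out_w) {
  if (!s.size_tensor.empty()) {   // 1) SizeTensor wins outright
    *out_h = s.size_tensor[0];
    *out_w = s.size_tensor[1];
    return;
  }
  if (s.scale.size() >= 2) {      // 2) otherwise scale the input spatial shape
    *out_h = static_cast<int>(in_h * s.scale[0]);
    *out_w = static_cast<int>(in_w * s.scale[1]);
  }
  if (!s.out_size.empty()) {      // 3) OutSize overrides the scaled values
    *out_h = s.out_size[0];
    *out_w = s.out_size[1];
  }
}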
paddle/fluid/operators/label_smooth_op_mlu.cc
deleted, 100644 → 0
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class LabelSmoothMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_t = ctx.Input<phi::DenseTensor>("X");
    auto* dist_t = ctx.Input<phi::DenseTensor>("PriorDist");
    auto* out_t = ctx.Output<phi::DenseTensor>("Out");
    auto epsilon = ctx.Attr<float>("epsilon");
    auto epsilon_gt = 1.0f - epsilon;
    if (in_t->numel() == 0) return;
    out_t->mutable_data<T>(ctx.GetPlace());
    auto label_dim = in_t->dims()[in_t->dims().size() - 1];
    MLUCnnlTensorDesc x_desc(*in_t);
    MLUCnnlTensorDesc out_desc(*out_t);
    auto data_type = ToCnnlDataType<T>();
    MLUCnnlOpTensorDesc op_tensor_desc(
        CNNL_OP_TENSOR_ADD, data_type, CNNL_NOT_PROPAGATE_NAN);
    if (ctx.HasInput("PriorDist")) {
      MLUCnnlTensorDesc dist_desc(*dist_t);
      MLUCnnl::OpTensor(ctx, op_tensor_desc.get(),
                        x_desc.get(), GetBasePtr(in_t),
                        dist_desc.get(), GetBasePtr(dist_t),
                        out_desc.get(), GetBasePtr(out_t),
                        data_type, epsilon_gt, epsilon);
    } else {
      auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
      phi::DenseTensor dist_tensor =
          ctx.AllocateTmpTensor<T, MLUDeviceContext>({1, label_dim}, dev_ctx);
      MLUCnnlTensorDesc dist_desc(dist_tensor);
      auto value = static_cast<T>(1.0f / label_dim);
      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value,
                    dist_desc.get(), GetBasePtr(&dist_tensor));
      MLUCnnl::OpTensor(ctx, op_tensor_desc.get(),
                        x_desc.get(), GetBasePtr(in_t),
                        dist_desc.get(), GetBasePtr(&dist_tensor),
                        out_desc.get(), GetBasePtr(out_t),
                        data_type, epsilon_gt, epsilon);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(label_smooth,
                       ops::LabelSmoothMLUKernel<float>,
                       ops::LabelSmoothMLUKernel<plat::float16>);
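Note (not part of the deleted file): both OpTensor(ADD) calls above compute a weighted sum with weights (1 - epsilon) on the input and epsilon on the prior distribution, where the prior defaults to a uniform 1 / label_dim fill when "PriorDist" is absent. A minimal scalar sketch of that arithmetic, with a hypothetical helper name:

// Illustrative sketch only: the per-element math behind the OpTensor(ADD) calls.
inline float LabelSmoothScalar(float x, float prior, float epsilon) {
  // out = (1 - epsilon) * x + epsilon * prior
  return (1.0f - epsilon) * x + epsilon * prior;
}
// When "PriorDist" is not given, the kernel fills prior with 1.0f / label_dim.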
paddle/fluid/operators/layer_norm_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

using DDim = framework::DDim;

template <typename T>
class LayerNormMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
    const auto epsilon = ctx.Attr<float>("epsilon");
    const auto* x = ctx.Input<phi::DenseTensor>("X");
    const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
    const auto* bias = ctx.Input<phi::DenseTensor>("Bias");
    auto* y = ctx.Output<phi::DenseTensor>("Y");
    auto* mean = ctx.Output<phi::DenseTensor>("Mean");
    auto* variance = ctx.Output<phi::DenseTensor>("Variance");
    auto place = ctx.GetPlace();
    y->mutable_data<T>(place);
    mean->mutable_data<T>(place);
    variance->mutable_data<T>(place);
    const auto& x_dims = x->dims();
    std::vector<int> scale_bias_axes;
    std::vector<int> mean_var_axes;
    for (auto i = 0; i < x_dims.size(); ++i) {
      if (i >= begin_norm_axis) {
        scale_bias_axes.push_back(x_dims[i]);
      } else {
        mean_var_axes.push_back(x_dims[i]);
      }
    }
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc y_desc(*y);
    MLUCnnlTensorDesc mean_var_desc(
        mean_var_axes.size(), mean_var_axes.data(), ToCnnlDataType<T>());
    // cnnl only support both of scale and bias is NULL or not.
    if (!scale && !bias) {
      MLUCnnl::LayerNormForward(ctx, begin_norm_axis,
                                x_desc.get(), GetBasePtr(x),
                                nullptr /*scale_bias_desc*/,
                                nullptr /*scale*/, nullptr /*bias*/,
                                epsilon, y_desc.get(), GetBasePtr(y),
                                mean_var_desc.get(), GetBasePtr(mean),
                                GetBasePtr(variance));
    } else {
      phi::DenseTensor tmp_scale(x->dtype());
      if (!scale) {
        tmp_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
        FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &tmp_scale);
      } else {
        tmp_scale = *scale;
      }
      phi::DenseTensor tmp_bias(x->dtype());
      if (!bias) {
        tmp_bias.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
        FillMLUTensorWithHostValue(ctx, static_cast<T>(0), &tmp_bias);
      } else {
        tmp_bias = *bias;
      }
      // scale and bias should have same type with x/y
      MLUCnnlTensorDesc float32_desc(
          scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_FLOAT);
      MLUCnnlTensorDesc float16_desc(
          scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_HALF);
      cnnlCastDataType_t cast_type = GetCastDataType(VT::FP32, VT::FP16);
      phi::DenseTensor final_scale(x->dtype());
      if (final_scale.dtype() == DataType::FLOAT16 &&
          tmp_scale.dtype() == DataType::FLOAT32) {
        final_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
        // cast scale to fp16
        MLUCnnl::Cast(ctx, cast_type,
                      float32_desc.get(), GetBasePtr(&tmp_scale),
                      float16_desc.get(), GetBasePtr(&final_scale));
      } else {
        final_scale = tmp_scale;
      }
      phi::DenseTensor final_bias(x->dtype());
      if (final_bias.dtype() == DataType::FLOAT16 &&
          tmp_bias.dtype() == DataType::FLOAT32) {
        final_bias.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
        // cast bias to fp16
        MLUCnnl::Cast(ctx, cast_type,
                      float32_desc.get(), GetBasePtr(&tmp_bias),
                      float16_desc.get(), GetBasePtr(&final_bias));
      } else {
        final_bias = tmp_bias;
      }
      MLUCnnlTensorDesc scale_bias_desc(
          scale_bias_axes.size(), scale_bias_axes.data(), ToCnnlDataType<T>());
      MLUCnnl::LayerNormForward(ctx, begin_norm_axis,
                                x_desc.get(), GetBasePtr(x),
                                scale_bias_desc.get(),
                                GetBasePtr(&final_scale),
                                GetBasePtr(&final_bias), epsilon,
                                y_desc.get(), GetBasePtr(y),
                                mean_var_desc.get(), GetBasePtr(mean),
                                GetBasePtr(variance));
    }
  }
};

template <typename T>
class LayerNormGradMLUKernel : public framework::OpKernel<T> {
  using MPDType = typename details::MPTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
    const auto* x = ctx.Input<phi::DenseTensor>("X");
    const auto* mean = ctx.Input<phi::DenseTensor>("Mean");
    const auto* variance = ctx.Input<phi::DenseTensor>("Variance");
    const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
    const auto* dy = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dscale =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
    auto* dbias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
    auto place = ctx.GetPlace();
    dx->mutable_data<T>(place);
    const auto& x_dims = x->dims();
    std::vector<int> scale_bias_axes;
    std::vector<int> mean_var_axes;
    for (auto i = 0; i < x_dims.size(); ++i) {
      if (i >= begin_norm_axis) {
        scale_bias_axes.push_back(x_dims[i]);
      } else {
        mean_var_axes.push_back(x_dims[i]);
      }
    }
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc dy_desc(*dy);
    MLUCnnlTensorDesc mean_var_desc(
        mean_var_axes.size(), mean_var_axes.data(), ToCnnlDataType<T>());
    MLUCnnlTensorDesc dx_desc(*dx);
    phi::DenseTensor tmp_scale(x->dtype());
    if (!scale) {
      tmp_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
      FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &tmp_scale);
    } else {
      tmp_scale = *scale;
    }
    MLUCnnlTensorDesc float32_desc(
        scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_FLOAT);
    MLUCnnlTensorDesc float16_desc(
        scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_HALF);
    cnnlCastDataType_t cast_fp32_to_fp16 = GetCastDataType(VT::FP32, VT::FP16);
    cnnlCastDataType_t cast_fp16_to_fp32 = GetCastDataType(VT::FP16, VT::FP32);
    phi::DenseTensor final_scale(x->dtype());
    if (final_scale.dtype() == DataType::FLOAT16 &&
        tmp_scale.dtype() == DataType::FLOAT32) {
      final_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
      // cast scale to fp16
      MLUCnnl::Cast(ctx, cast_fp32_to_fp16,
                    float32_desc.get(), GetBasePtr(&tmp_scale),
                    float16_desc.get(), GetBasePtr(&final_scale));
    } else {
      final_scale = tmp_scale;
    }
    phi::DenseTensor tmp_dscale(x->dtype());
    if (dscale && (tmp_dscale.dtype() == dscale->dtype())) {
      dscale->mutable_data<T>(place);
      tmp_dscale = *dscale;
    } else {
      tmp_dscale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
    }
    phi::DenseTensor tmp_dbias(x->dtype());
    if (dbias && (tmp_dbias.dtype() == dbias->dtype())) {
      dbias->mutable_data<T>(place);
      tmp_dbias = *dbias;
    } else {
      tmp_dbias.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
    }
    MLUCnnlTensorDesc scale_desc(
        scale_bias_axes.size(), scale_bias_axes.data(), ToCnnlDataType<T>());
    MLUCnnl::LayerNormBackward(ctx, begin_norm_axis,
                               x_desc.get(), GetBasePtr(x),
                               dy_desc.get(), GetBasePtr(dy),
                               scale_desc.get(), GetBasePtr(&final_scale),
                               mean_var_desc.get(), GetBasePtr(mean),
                               GetBasePtr(variance),
                               dx_desc.get(), GetBasePtr(dx),
                               GetBasePtr(&tmp_dscale),
                               GetBasePtr(&tmp_dbias));
    if (dscale && (tmp_dscale.dtype() == DataType::FLOAT16 &&
                   dscale->dtype() == DataType::FLOAT32)) {
      dscale->mutable_data<MPDType>(place);
      MLUCnnl::Cast(ctx, cast_fp16_to_fp32,
                    float16_desc.get(), GetBasePtr(&tmp_dscale),
                    float32_desc.get(), GetBasePtr(dscale));
    }
    if (dbias && (tmp_dbias.dtype() == DataType::FLOAT16 &&
                  dbias->dtype() == DataType::FLOAT32)) {
      dbias->mutable_data<MPDType>(place);
      MLUCnnl::Cast(ctx, cast_fp16_to_fp32,
                    float16_desc.get(), GetBasePtr(&tmp_dbias),
                    float32_desc.get(), GetBasePtr(dbias));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(layer_norm,
                       ops::LayerNormMLUKernel<float>,
                       ops::LayerNormMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(layer_norm_grad,
                       ops::LayerNormGradMLUKernel<float>,
                       ops::LayerNormGradMLUKernel<plat::float16>);
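Note (not part of the deleted file): in both kernels above, begin_norm_axis splits the input shape into leading, batch-like axes (one mean/variance entry each) and trailing axes that are normalized and carry the per-element scale and bias. A hedged standalone sketch of that split, using a hypothetical helper name:

// Illustrative sketch only: how begin_norm_axis partitions x_dims in the kernels above.
#include <vector>

inline void SplitAxes(const std::vector<int>& x_dims, int begin_norm_axis,
                      std::vector<int>* scale_bias_axes,
                      std::vector<int>* mean_var_axes) {
  for (size_t i = 0; i < x_dims.size(); ++i) {
    if (static_cast<int>(i) >= begin_norm_axis) {
      scale_bias_axes->push_back(x_dims[i]);  // normalized (trailing) axes
    } else {
      mean_var_axes->push_back(x_dims[i]);    // batch-like (leading) axes
    }
  }
}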
paddle/fluid/operators/lookup_table_v2_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class LookupTableV2MLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* ids_t = ctx.Input<phi::DenseTensor>("Ids");      // int tensor
    auto* output_t = ctx.Output<phi::DenseTensor>("Out");  // float tensor
    auto* table_t = ctx.Input<phi::DenseTensor>("W");
    int padding_idx = static_cast<int>(ctx.Attr<int64_t>("padding_idx"));
    auto* table_var = ctx.InputVar("W");
    PADDLE_ENFORCE_EQ(
        table_var->IsType<phi::DenseTensor>(), true,
        platform::errors::InvalidArgument("mlu only accept phi::DenseTensor"));
    output_t->mutable_data<T>(ctx.GetPlace());
    MLUCnnlTensorDesc ids_desc(*ids_t);
    MLUCnnlTensorDesc table_desc(*table_t);
    MLUCnnlTensorDesc output_desc(*output_t);
    MLUCnnl::EmbeddingForward(ctx, padding_idx,
                              table_desc.get(), GetBasePtr(table_t),
                              ids_desc.get(),
                              static_cast<const int*>(GetBasePtr(ids_t)),
                              output_desc.get(), GetBasePtr(output_t));
  }
};

template <typename T>
class LookupTableV2GradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* table_var = ctx.InputVar("W");
    PADDLE_ENFORCE_EQ(table_var->IsType<phi::DenseTensor>(), true,
        platform::errors::PermissionDenied(
            "Unsupported Variable Type , idx in "
            "LookupTableV2GradMLUKernel should be phi::DenseTensor."));
    bool is_sparse = ctx.Attr<bool>("is_sparse");
    PADDLE_ENFORCE_EQ(is_sparse, false,
        platform::errors::InvalidArgument(
            "LookupTableV2GradMLUKernel dose NOT support is_sparse = True."));
    auto* ids_t = ctx.Input<phi::DenseTensor>("Ids");
    auto* output_grad_t =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* table_grad_t =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("W"));
    table_grad_t->mutable_data<T>(ctx.GetPlace());
    int padding_idx = static_cast<int>(ctx.Attr<int64_t>("padding_idx"));
    int64_t ids_numel = ids_t->numel();
    PADDLE_ENFORCE_EQ(ids_numel <= std::numeric_limits<int32_t>::max(), true,
        platform::errors::OutOfRange(
            "Number of ids greater than int32_t::max , please check "
            "number of ids in LookupTableV2GradMLUKernel."));
    phi::DenseTensor ids_int32(ids_t->dtype());
    if (ids_t->dtype() != DataType::INT32) {
      ids_int32.mutable_data<int>(ids_t->dims(), ctx.GetPlace());
      MLUCnnlTensorDesc ids_desc(*ids_t);
      MLUCnnlTensorDesc ids_int32_desc(ids_int32);
      auto cast_type = GetCastDataType(ids_t->dtype(), DataType::INT32);
      MLUCnnl::Cast(ctx, cast_type, ids_desc.get(), GetBasePtr(ids_t),
                    ids_int32_desc.get(), GetBasePtr(&ids_int32));
    } else {
      ids_int32 = *ids_t;
    }
    MLUCnnlTensorDesc ids_int32_desc(ids_int32);
    MLUCnnlTensorDesc output_grad_desc(*output_grad_t);
    MLUCnnlTensorDesc table_grad_desc(*table_grad_t);
    MLUCnnl::EmbeddingBackward(ctx, padding_idx, false,
                               ids_int32_desc.get(), GetBasePtr(&ids_int32),
                               output_grad_desc.get(),
                               GetBasePtr(output_grad_t),
                               table_grad_desc.get(),
                               GetBasePtr(table_grad_t));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(lookup_table_v2,
                       ops::LookupTableV2MLUKernel<float>,
                       ops::LookupTableV2MLUKernel<int>,
                       ops::LookupTableV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(lookup_table_v2_grad,
                       ops::LookupTableV2GradMLUKernel<float>,
                       ops::LookupTableV2GradMLUKernel<plat::float16>);
paddle/fluid/operators/masked_select_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class MaskedSelectedMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto input = ctx.Input<phi::DenseTensor>("X");
    auto mask = ctx.Input<phi::DenseTensor>("Mask");
    auto out = ctx.Output<phi::DenseTensor>("Y");
    auto input_dim = input->dims();
    auto mask_dim = mask->dims();
    PADDLE_ENFORCE_EQ(input_dim, mask_dim,
        platform::errors::InvalidArgument(
            "The dim size of input and mask in OP(masked_selected) "
            "must be equal, but got input dim:(%ld), mask dim: "
            "(%ld). Please check input "
            "value.",
            input_dim, mask_dim));
    phi::DenseTensor number(framework::TransToPhiDataType(VT::INT32));
    void* number_ptr = number.mutable_data<int32_t>({1}, ctx.GetPlace());
    out->Resize(mask->dims());
    out->mutable_data<T>(ctx.GetPlace());
    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc mask_desc(*mask);
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::Mask(ctx, CNNL_MASKED_SELECT,
                  input_desc.get(), GetBasePtr(input),
                  mask_desc.get(), GetBasePtr(mask),
                  nullptr, nullptr,
                  out_desc.get(), GetBasePtr(out),
                  static_cast<uint32_t*>(number_ptr));
  }
};

template <typename T>
class MaskedSelectedGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto mask = ctx.Input<phi::DenseTensor>("Mask");
    auto y_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
    auto x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::MLUDeviceContext>();
    phi::DenseTensor mask_int32, out_size;
    std::vector<int32_t> out_size_vec;
    mask_int32.mutable_data<int32_t>(mask->dims(), ctx.GetPlace());
    out_size.mutable_data<int32_t>({1}, ctx.GetPlace());
    MLUCnnlTensorDesc mask_desc(*mask);
    MLUCnnlTensorDesc mask_int32_desc(mask_int32);
    MLUCnnlTensorDesc out_size_desc(out_size);
    auto cast_type = GetCastDataType(mask->dtype(), DataType::INT32);
    MLUCnnl::Cast(ctx, cast_type, mask_desc.get(), GetBasePtr(mask),
                  mask_int32_desc.get(), GetBasePtr(&mask_int32));
    auto mask_int32_dim = phi::vectorize(mask_int32.dims());
    std::vector<int32_t> reduce_dims;
    for (size_t i = 0; i < mask_int32_dim.size(); i++) {
      reduce_dims.push_back(static_cast<int>(i));
    }
    std::string reduce_name = "reduce_sum";
    cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name);
    MLUCnnlReduceDesc reduce_desc(reduce_dims, reduce_op,
                                  ToCnnlDataType<int32_t>(),
                                  CNNL_NOT_PROPAGATE_NAN,
                                  CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
    MLUCnnl::Reduce(ctx, true, reduce_desc.get(), nullptr,
                    mask_int32_desc.get(), GetBasePtr(&mask_int32),
                    0, nullptr, nullptr,
                    out_size_desc.get(), GetBasePtr(&out_size));
    paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec);
    dev_ctx.Wait();
    phi::DenseTensor mask_int32_tmp;
    mask_int32_tmp.ShareDataWith(mask_int32);
    mask_int32_tmp.Resize({mask_int32.numel()});
    phi::DenseTensor topk_v2_out(framework::TransToPhiDataType(VT::INT32)),
        indices_int32(framework::TransToPhiDataType(VT::INT32));
    topk_v2_out.mutable_data<int32_t>({mask_int32.numel()}, ctx.GetPlace());
    indices_int32.mutable_data<int32_t>({mask_int32.numel()}, ctx.GetPlace());
    MLUCnnlTensorDesc topk_v2_out_desc(topk_v2_out);
    MLUCnnlTensorDesc indices_int32_desc(indices_int32);
    MLUCnnlTensorDesc mask_int32_tmp_desc(mask_int32_tmp);
    const int dim = 0;
    MLUCnnl::TopK(ctx, mask_int32.numel(), dim, true, false,
                  mask_int32_tmp_desc.get(), GetBasePtr(&mask_int32_tmp),
                  topk_v2_out_desc.get(), GetBasePtr(&topk_v2_out),
                  indices_int32_desc.get(), GetBasePtr(&indices_int32));
    auto stream = ctx.template device_context<MLUDeviceContext>().stream();
    phi::DenseTensor indices_int32_out;
    indices_int32_out.mutable_data<int32_t>({out_size_vec[0]}, ctx.GetPlace());
    memory::Copy(ctx.GetPlace(), GetBasePtr(&indices_int32_out),
                 ctx.GetPlace(), GetBasePtr(&indices_int32),
                 out_size_vec[0] * sizeof(int32_t), stream);
    phi::DenseTensor y_grad_tmp_out;
    y_grad_tmp_out.mutable_data<T>({out_size_vec[0]}, ctx.GetPlace());
    MLUCnnlTensorDesc y_grad_tmp_out_desc(y_grad_tmp_out);
    memory::Copy(ctx.GetPlace(), GetBasePtr(&y_grad_tmp_out),
                 ctx.GetPlace(), GetBasePtr(y_grad),
                 out_size_vec[0] * sizeof(T), stream);
    phi::DenseTensor indices_int32_tmp;
    indices_int32_tmp.ShareDataWith(indices_int32_out);
    indices_int32_tmp.Resize({out_size_vec[0], 1});
    MLUCnnlTensorDesc indices_int32_tmp_desc(indices_int32_tmp);
    const cnnlScatterNdMode_t mode = CNNL_SCATTERND_UPDATE;
    x_grad->Resize({x_grad->numel()});
    x_grad->mutable_data<T>(ctx.GetPlace());
    MLUCnnlTensorDesc x_grad_desc(*x_grad);
    MLUCnnl::ScatterNd(ctx, mode,
                       indices_int32_tmp_desc.get(),
                       GetBasePtr(&indices_int32_tmp),
                       y_grad_tmp_out_desc.get(), GetBasePtr(&y_grad_tmp_out),
                       nullptr, nullptr,
                       x_grad_desc.get(), GetBasePtr(x_grad));
    x_grad->Resize(mask->dims());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(masked_select,
                       ops::MaskedSelectedMLUKernel<float>,
                       ops::MaskedSelectedMLUKernel<int>,
                       ops::MaskedSelectedMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(masked_select_grad,
                       ops::MaskedSelectedGradMLUKernel<float>,
                       ops::MaskedSelectedGradMLUKernel<int>,
                       ops::MaskedSelectedGradMLUKernel<plat::float16>);
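Note (not part of the deleted file): the gradient kernel above counts the true mask entries with a reduce_sum, gathers the mask positions with TopK, and scatters the compacted output gradient back to those positions with ScatterNd. A hedged CPU-style reference of that behaviour, with a hypothetical function name:

// Illustrative sketch only: CPU view of what MaskedSelectedGradMLUKernel computes.
#include <vector>

inline std::vector<float> MaskedSelectGradRef(const std::vector<bool>& mask,
                                              const std::vector<float>& y_grad) {
  std::vector<float> x_grad(mask.size(), 0.0f);
  size_t j = 0;  // index into the compacted y_grad
  for (size_t i = 0; i < mask.size(); ++i) {
    if (mask[i]) x_grad[i] = y_grad[j++];  // scatter gradient back to masked slots
  }
  return x_grad;
}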
paddle/fluid/operators/matmul_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
static void Mul(const framework::ExecutionContext& ctx,
                const phi::DenseTensor& X, const phi::DenseTensor& Y,
                phi::DenseTensor* Out, const float alpha) {
  Out->mutable_data<T>(ctx.GetPlace());
  MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
                                  CNNL_NOT_PROPAGATE_NAN);
  MLUCnnl::OpTensor(ctx, mul_op_desc.get(),
                    x_desc.get(), GetBasePtr(&X),
                    y_desc.get(), GetBasePtr(&Y),
                    out_desc.get(), GetBasePtr(Out),
                    ToCnnlDataType<T>(), alpha);
}

template <typename T>
static void MatMul2D(const framework::ExecutionContext& ctx,
                     const phi::DenseTensor& X, const phi::DenseTensor& Y,
                     phi::DenseTensor* Out, const bool trans_x,
                     const bool trans_y, const float alpha) {
  Out->mutable_data<T>(ctx.GetPlace());
  PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits<float>::epsilon(),
      platform::errors::InvalidArgument(
          "MLU(matmul): alpha should be equal to 1.0! "
          "Other values are not supported yet."
          "But received alpha is %d.",
          alpha));
  MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnl::Matmul(ctx, trans_x, trans_y,
                  x_desc.get(), GetBasePtr(&X),
                  y_desc.get(), GetBasePtr(&Y),
                  out_desc.get(), GetBasePtr(Out));
}

template <typename T>
static void MatMulND(const framework::ExecutionContext& ctx,
                     const phi::DenseTensor& X, const phi::DenseTensor& Y,
                     phi::DenseTensor* Out, const bool trans_x,
                     const bool trans_y, const float alpha) {
  if (!Out->initialized()) {
    Out->mutable_data<T>(ctx.GetPlace());
  }
  PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits<float>::epsilon(),
      platform::errors::InvalidArgument(
          "MLU(matmul): alpha should be equal to 1.0! "
          "Other values are not supported yet."
          "But received alpha is %d.",
          alpha));
  MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnl::BatchMatmul(ctx, trans_x, trans_y,
                       x_desc.get(), GetBasePtr(&X),
                       y_desc.get(), GetBasePtr(&Y),
                       out_desc.get(), GetBasePtr(Out));
}

template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
                       const std::vector<int64_t>& dims,
                       const std::vector<int64_t>& bcast_dims,
                       const phi::DenseTensor& in, phi::DenseTensor* out) {
  std::vector<int64_t> axes;
  int64_t size = bcast_dims.size();
  int64_t diff = bcast_dims.size() - dims.size();
  for (int64_t i = 0; i < size; ++i) {
    if (i < diff) {
      axes.push_back(i);
      continue;
    }
    if (bcast_dims[i] > dims[i - diff]) {
      axes.push_back(i);
    }
  }
  out->mutable_data<T>(ctx.GetPlace());
  MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  std::vector<int> reduce_dims(axes.begin(), axes.end());
  MLUCnnlReduceDesc reduce_desc(reduce_dims, CNNL_REDUCE_ADD,
                                ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN,
                                CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
  MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduce_desc.get(), nullptr,
                  in_desc.get(), GetBasePtr(&in), 0 /*indices_size*/,
                  nullptr, nullptr, out_desc.get(), GetBasePtr(out));
}

template <typename T>
class MatMulMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* X = ctx.Input<phi::DenseTensor>("X");
    auto* Y = ctx.Input<phi::DenseTensor>("Y");
    auto* Out = ctx.Output<phi::DenseTensor>("Out");
    bool transpose_x = ctx.Attr<bool>("transpose_X");
    bool transpose_y = ctx.Attr<bool>("transpose_Y");
    float alpha = static_cast<T>(ctx.Attr<float>("alpha"));
    std::vector<int64_t> x_dims = phi::vectorize(X->dims());
    std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
    std::vector<int64_t> out_dims = phi::vectorize(Out->dims());
    int x_ndim = x_dims.size();
    int y_ndim = y_dims.size();
    // Case 1: [K] x [K] = [1]
    // Equal: [1, K] x [K, 1] = [1, 1] => [1]
    const bool all_one_dim = (x_ndim == 1 && y_ndim == 1);
    if (all_one_dim) {
      Out->Resize({1, 1});
    }
    // Resize dim 1 to 2
    phi::DenseTensor x_temp, y_temp;
    x_temp.ShareDataWith(*X);
    y_temp.ShareDataWith(*Y);
    if (x_ndim == 1) {
      x_dims.insert(x_dims.begin(), 1);
      x_temp.Resize(phi::make_ddim(x_dims));
      x_ndim = 2;
      // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
      if (out_dims.size() < y_dims.size()) {
        std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
        temp_out_dims.insert(temp_out_dims.end() - 1, 1);
        Out->Resize(phi::make_ddim(temp_out_dims));
      }
    }
    if (y_ndim == 1) {
      y_dims.push_back(1);
      y_temp.Resize(phi::make_ddim(y_dims));
      y_ndim = 2;
      // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
      if (out_dims.size() < x_dims.size()) {
        std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
        temp_out_dims.push_back(1);
        Out->Resize(phi::make_ddim(temp_out_dims));
      }
    }
    const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
    if (transpose_y) {
      PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K,
          platform::errors::InvalidArgument("Input(Y) has error dim."
                                            "Y'dims[%d] must be equal to %d"
                                            "But received Y'dims[%d] is %d",
                                            y_ndim - 1, K, y_ndim - 1,
                                            y_dims[y_ndim - 1]));
    } else {
      PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K,
          platform::errors::InvalidArgument("Input(Y) has error dim."
                                            "Y'dims[%d] must be equal to %d"
                                            "But received Y'dims[%d] is %d",
                                            y_ndim - 2, K, y_ndim - 2,
                                            y_dims[y_ndim - 2]));
    }
    if (x_ndim == 2 && y_ndim == 2) {
      // Case 2: [M, K] x [K, N] = [M, N]
      MatMul2D<T>(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha);
    } else {
      // Case 3: [B, M, K] x [K, N] = [B, M, N]
      // Case 4: [B, M, K] x [B, K, N] = [B, M, N]
      MatMulND<T>(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha);
    }
    if (phi::vectorize(Out->dims()) != out_dims) {
      Out->Resize(phi::make_ddim(out_dims));
    }
  }
};

template <typename T>
class MatMulGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* X = ctx.Input<phi::DenseTensor>("X");
    auto* Y = ctx.Input<phi::DenseTensor>("Y");
    auto* dOut = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dY = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    bool transpose_x = ctx.Attr<bool>("transpose_X");
    bool transpose_y = ctx.Attr<bool>("transpose_Y");
    float alpha = static_cast<T>(ctx.Attr<float>("alpha"));
    std::vector<int64_t> x_dims = phi::vectorize(X->dims());
    std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
    std::vector<int64_t> out_dims = phi::vectorize(dOut->dims());
    int x_ndim = x_dims.size();
    int y_ndim = y_dims.size();
    int out_ndim = out_dims.size();
    // Case 1: [K] x [K] = [1]
    if (x_ndim == 1 && y_ndim == 1) {
      if (dX) {
        Mul<T>(ctx, *dOut, *Y, dX, alpha);
      }
      if (dY) {
        Mul<T>(ctx, *dOut, *X, dY, alpha);
      }
      return;
    }
    // Resize dim 1 to 2
    phi::DenseTensor x_temp, y_temp, dout_temp;
    x_temp.ShareDataWith(*X);
    y_temp.ShareDataWith(*Y);
    dout_temp.ShareDataWith(*dOut);
    if (x_ndim == 1) {
      x_dims.insert(x_dims.begin(), 1);
      out_dims.insert(out_dims.end() - 1, 1);
      x_temp.Resize(phi::make_ddim(x_dims));
      dout_temp.Resize(phi::make_ddim(out_dims));
      x_ndim = 2;
      out_ndim += 1;
    }
    if (y_ndim == 1) {
      y_dims.push_back(1);
      out_dims.push_back(1);
      y_temp.Resize(phi::make_ddim(y_dims));
      dout_temp.Resize(phi::make_ddim(out_dims));
      y_ndim = 2;
      out_ndim += 1;
    }
    // Case 2: [M, K] x [K, N] = [M, N]
    if (out_ndim == 2) {
      if (dX) {
        dX->Resize(phi::make_ddim(x_dims));
        if (transpose_x) {
          MatMul2D<T>(ctx, y_temp, dout_temp, dX, transpose_y, true, alpha);
        } else {
          MatMul2D<T>(ctx, dout_temp, y_temp, dX, false, !transpose_y, alpha);
        }
        dX->Resize(X->dims());
      }
      if (dY) {
        dY->Resize(phi::make_ddim(y_dims));
        if (transpose_y) {
          MatMul2D<T>(ctx, dout_temp, x_temp, dY, true, transpose_x, alpha);
        } else {
          MatMul2D<T>(ctx, x_temp, dout_temp, dY, !transpose_x, false, alpha);
        }
        dY->Resize(Y->dims());
      }
      return;
    }
    // Case 3: [B, M, K] x [K, N] = [B, M, N]
    // Case 4: [B, M, K] x [B, K, N] = [B, M, N]
    std::vector<int64_t> x_bcast_dims(out_ndim, 1);
    std::vector<int64_t> y_bcast_dims(out_ndim, 1);
    std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin());
    std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin());
    std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2);
    std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2);
    if (dX) {
      phi::DenseTensor dx_temp(X->type());
      if (x_dims != x_bcast_dims) {
        dx_temp.Resize(phi::make_ddim(x_bcast_dims));
      } else {
        dX->mutable_data<T>(ctx.GetPlace());
        dx_temp.ShareDataWith(*dX);
      }
      if (transpose_x) {
        MatMulND<T>(ctx, y_temp, dout_temp, &dx_temp, transpose_y, true, alpha);
      } else {
        MatMulND<T>(ctx, dout_temp, y_temp, &dx_temp, false, !transpose_y,
                    alpha);
      }
      if (x_dims != x_bcast_dims) {
        ReduceDims<T>(ctx, x_dims, x_bcast_dims, dx_temp, dX);
      }
    }
    if (dY) {
      phi::DenseTensor dy_temp(Y->type());
      if (y_dims != y_bcast_dims) {
        dy_temp.Resize(phi::make_ddim(y_bcast_dims));
      } else {
        dY->mutable_data<T>(ctx.GetPlace());
        dy_temp.ShareDataWith(*dY);
      }
      if (transpose_y) {
        MatMulND<T>(ctx, dout_temp, x_temp, &dy_temp, true, transpose_x, alpha);
      } else {
        MatMulND<T>(ctx, x_temp, dout_temp, &dy_temp, !transpose_x, false,
                    alpha);
      }
      if (y_dims != y_bcast_dims) {
        ReduceDims<T>(ctx, y_dims, y_bcast_dims, dy_temp, dY);
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(matmul,
                       ops::MatMulMLUKernel<float>,
                       ops::MatMulMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(matmul_grad,
                       ops::MatMulGradMLUKernel<float>,
                       ops::MatMulGradMLUKernel<plat::float16>);
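Note (not part of the deleted file): the ReduceDims helper above decides which axes of a broadcast gradient must be summed so it matches the original operand shape: every leading axis the operand lacks, plus every axis where the operand's extent is smaller than the broadcast extent. A standalone sketch of that axis-selection rule, under a hypothetical name:

// Illustrative sketch only: the axis-selection rule used by ReduceDims above.
#include <cstdint>
#include <vector>

inline std::vector<int64_t> BroadcastReduceAxes(
    const std::vector<int64_t>& dims, const std::vector<int64_t>& bcast_dims) {
  std::vector<int64_t> axes;
  int64_t diff = static_cast<int64_t>(bcast_dims.size() - dims.size());
  for (int64_t i = 0; i < static_cast<int64_t>(bcast_dims.size()); ++i) {
    if (i < diff || bcast_dims[i] > dims[i - diff]) {
      axes.push_back(i);  // axis was broadcast, so the gradient is summed over it
    }
  }
  return axes;
}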
paddle/fluid/operators/matmul_v2_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
static void Mul(const framework::ExecutionContext& ctx,
                const phi::DenseTensor& X, const phi::DenseTensor& Y,
                phi::DenseTensor* Out) {
  Out->mutable_data<T>(ctx.GetPlace());
  MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
                                  CNNL_NOT_PROPAGATE_NAN);
  MLUCnnl::OpTensor(ctx, mul_op_desc.get(),
                    x_desc.get(), GetBasePtr(&X),
                    y_desc.get(), GetBasePtr(&Y),
                    out_desc.get(), GetBasePtr(Out), ToCnnlDataType<T>());
}

template <typename T>
static void MatMul2D(const framework::ExecutionContext& ctx,
                     const phi::DenseTensor& X, const phi::DenseTensor& Y,
                     phi::DenseTensor* Out, const bool trans_x,
                     const bool trans_y) {
  Out->mutable_data<T>(ctx.GetPlace());
  MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnl::Matmul(ctx, trans_x, trans_y,
                  x_desc.get(), GetBasePtr(&X),
                  y_desc.get(), GetBasePtr(&Y),
                  out_desc.get(), GetBasePtr(Out));
}

template <typename T>
static void MatMul2DwithReduceBatch(const framework::ExecutionContext& ctx,
                                    const phi::DenseTensor& X,
                                    const phi::DenseTensor& Y,
                                    phi::DenseTensor* Out, const bool trans_x,
                                    const bool trans_y) {
  if (!Out->initialized()) {
    Out->mutable_data<T>(ctx.GetPlace());
  }
  // reshape to 2D matmul
  std::vector<int64_t> x_dims = phi::vectorize(X.dims());
  std::vector<int64_t> y_dims = phi::vectorize(Y.dims());
  std::vector<int> realx_dims(
      {static_cast<int>(x_dims[0] * x_dims[1]), static_cast<int>(x_dims[2])});
  std::vector<int> realy_dims(
      {static_cast<int>(y_dims[0] * y_dims[1]), static_cast<int>(y_dims[2])});
  MLUCnnlTensorDesc x_desc(2, realx_dims.data(), ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(2, realy_dims.data(), ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnl::Matmul(ctx, trans_x, trans_y,
                  x_desc.get(), GetBasePtr(&X),
                  y_desc.get(), GetBasePtr(&Y),
                  out_desc.get(), GetBasePtr(Out));
}

template <typename T>
static void MatMulND(const framework::ExecutionContext& ctx,
                     const phi::DenseTensor& X, const phi::DenseTensor& Y,
                     phi::DenseTensor* Out, const bool trans_x,
                     const bool trans_y) {
  if (!Out->initialized()) {
    Out->mutable_data<T>(ctx.GetPlace());
  }
  MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnl::BatchMatmul(ctx, trans_x, trans_y,
                       x_desc.get(), GetBasePtr(&X),
                       y_desc.get(), GetBasePtr(&Y),
                       out_desc.get(), GetBasePtr(Out));
}

template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
                       const std::vector<int64_t>& dims,
                       const std::vector<int64_t>& bcast_dims,
                       const phi::DenseTensor& in, phi::DenseTensor* out) {
  std::vector<int64_t> axes;
  int64_t size = bcast_dims.size();
  int64_t diff = bcast_dims.size() - dims.size();
  for (int64_t i = 0; i < size; ++i) {
    if (i < diff) {
      axes.push_back(i);
      continue;
    }
    if (bcast_dims[i] > dims[i - diff]) {
      axes.push_back(i);
    }
  }
  out->mutable_data<T>(ctx.GetPlace());
  MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  std::vector<int> reduce_dims(axes.begin(), axes.end());
  MLUCnnlReduceDesc reduce_desc(reduce_dims, CNNL_REDUCE_ADD,
                                ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN,
                                CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
  MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduce_desc.get(), nullptr,
                  in_desc.get(), GetBasePtr(&in), 0 /*indices_size*/,
                  nullptr, nullptr, out_desc.get(), GetBasePtr(out));
}

template <typename T>
class MatMulV2MLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* X = ctx.Input<phi::DenseTensor>("X");
    auto* Y = ctx.Input<phi::DenseTensor>("Y");
    auto* Out = ctx.Output<phi::DenseTensor>("Out");
    const bool trans_x = ctx.Attr<bool>("trans_x");
    const bool trans_y = ctx.Attr<bool>("trans_y");
    std::vector<int64_t> x_dims = phi::vectorize(X->dims());
    std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
    std::vector<int64_t> out_dims = phi::vectorize(Out->dims());
    int x_ndim = x_dims.size();
    int y_ndim = y_dims.size();
    // Case 1: [K] x [K] = [1]
    // Equal: [1, K] x [K, 1] = [1, 1] => [1]
    const bool all_one_dim = (x_ndim == 1 && y_ndim == 1);
    if (all_one_dim) {
      Out->Resize({1, 1});
    }
    // Resize dim 1 to 2
    phi::DenseTensor x_temp, y_temp;
    x_temp.ShareDataWith(*X);
    y_temp.ShareDataWith(*Y);
    if (x_ndim == 1) {
      x_dims.insert(x_dims.begin(), 1);
      x_temp.Resize(phi::make_ddim(x_dims));
      x_ndim = 2;
      // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
      if (out_dims.size() < y_dims.size()) {
        std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
        temp_out_dims.insert(temp_out_dims.end() - 1, 1);
        Out->Resize(phi::make_ddim(temp_out_dims));
      }
    }
    if (y_ndim == 1) {
      y_dims.push_back(1);
      y_temp.Resize(phi::make_ddim(y_dims));
      y_ndim = 2;
      // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
      if (out_dims.size() < x_dims.size()) {
        std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
        temp_out_dims.push_back(1);
        Out->Resize(phi::make_ddim(temp_out_dims));
      }
    }
    const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
    if (trans_y) {
      PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K,
          platform::errors::InvalidArgument("Input(Y) has error dim."
                                            "Y'dims[%d] must be equal to %d"
                                            "But received Y'dims[%d] is %d",
                                            y_ndim - 1, K, y_ndim - 1,
                                            y_dims[y_ndim - 1]));
    } else {
      PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K,
          platform::errors::InvalidArgument("Input(Y) has error dim."
                                            "Y'dims[%d] must be equal to %d"
                                            "But received Y'dims[%d] is %d",
                                            y_ndim - 2, K, y_ndim - 2,
                                            y_dims[y_ndim - 2]));
    }
    if (x_ndim == 2 && y_ndim == 2) {
      // Case 2: [M, K] x [K, N] = [M, N]
      MatMul2D<T>(ctx, x_temp, y_temp, Out, trans_x, trans_y);
    } else {
      // Case 3: [B, M, K] x [K, N] = [B, M, N]
      // Case 4: [B, M, K] x [B, K, N] = [B, M, N]
      MatMulND<T>(ctx, x_temp, y_temp, Out, trans_x, trans_y);
    }
    if (phi::vectorize(Out->dims()) != out_dims) {
      Out->Resize(phi::make_ddim(out_dims));
    }
  }
};

template <typename T>
class MatMulGradV2MLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* X = ctx.Input<phi::DenseTensor>("X");
    auto* Y = ctx.Input<phi::DenseTensor>("Y");
    auto* dOut = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dY = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    const bool trans_x = ctx.Attr<bool>("trans_x");
    const bool trans_y = ctx.Attr<bool>("trans_y");
    std::vector<int64_t> x_dims = phi::vectorize(X->dims());
    std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
    std::vector<int64_t> out_dims = phi::vectorize(dOut->dims());
    int x_ndim = x_dims.size();
    int y_ndim = y_dims.size();
    int out_ndim = out_dims.size();
    // Case 1: [K] x [K] = [1]
    if (x_ndim == 1 && y_ndim == 1) {
      if (dX) {
        Mul<T>(ctx, *dOut, *Y, dX);
      }
      if (dY) {
        Mul<T>(ctx, *dOut, *X, dY);
      }
      return;
    }
    // Resize dim 1 to 2
    phi::DenseTensor x_temp, y_temp, dout_temp;
    x_temp.ShareDataWith(*X);
    y_temp.ShareDataWith(*Y);
    dout_temp.ShareDataWith(*dOut);
    if (x_ndim == 1) {
      x_dims.insert(x_dims.begin(), 1);
      out_dims.insert(out_dims.end() - 1, 1);
      x_temp.Resize(phi::make_ddim(x_dims));
      dout_temp.Resize(phi::make_ddim(out_dims));
      x_ndim = 2;
      out_ndim += 1;
    }
    if (y_ndim == 1) {
      y_dims.push_back(1);
      out_dims.push_back(1);
      y_temp.Resize(phi::make_ddim(y_dims));
      dout_temp.Resize(phi::make_ddim(out_dims));
      y_ndim = 2;
      out_ndim += 1;
    }
    // Case 2: [M, K] x [K, N] = [M, N]
    if (out_ndim == 2) {
      if (dX) {
        dX->Resize(phi::make_ddim(x_dims));
        if (trans_x) {
          MatMul2D<T>(ctx, y_temp, dout_temp, dX, trans_y, true);
        } else {
          MatMul2D<T>(ctx, dout_temp, y_temp, dX, false, !trans_y);
        }
        dX->Resize(X->dims());
      }
      if (dY) {
        dY->Resize(phi::make_ddim(y_dims));
        if (trans_y) {
          MatMul2D<T>(ctx, dout_temp, x_temp, dY, true, trans_x);
        } else {
          MatMul2D<T>(ctx, x_temp, dout_temp, dY, !trans_x, false);
        }
        dY->Resize(Y->dims());
      }
      return;
    }
    // Case 3: [B, M, K] x [K, N] = [B, M, N]
    // Case 4: [B, M, K] x [B, K, N] = [B, M, N]
    std::vector<int64_t> x_bcast_dims(out_ndim, 1);
    std::vector<int64_t> y_bcast_dims(out_ndim, 1);
    std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin());
    std::copy(
out_dims
.
begin
(),
out_dims
.
end
()
-
2
,
y_bcast_dims
.
begin
());
std
::
copy
(
x_dims
.
end
()
-
2
,
x_dims
.
end
(),
x_bcast_dims
.
end
()
-
2
);
std
::
copy
(
y_dims
.
end
()
-
2
,
y_dims
.
end
(),
y_bcast_dims
.
end
()
-
2
);
if
(
dX
)
{
phi
::
DenseTensor
dx_temp
(
X
->
type
());
if
(
x_dims
!=
x_bcast_dims
)
{
dx_temp
.
Resize
(
phi
::
make_ddim
(
x_bcast_dims
));
}
else
{
dX
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
dx_temp
.
ShareDataWith
(
*
dX
);
}
if
(
trans_x
)
{
MatMulND
<
T
>
(
ctx
,
y_temp
,
dout_temp
,
&
dx_temp
,
trans_y
,
true
);
}
else
{
MatMulND
<
T
>
(
ctx
,
dout_temp
,
y_temp
,
&
dx_temp
,
false
,
!
trans_y
);
}
if
(
x_dims
!=
x_bcast_dims
)
{
ReduceDims
<
T
>
(
ctx
,
x_dims
,
x_bcast_dims
,
dx_temp
,
dX
);
}
}
if
(
dY
)
{
// Case 3: [B, M, K] x [K, N] = [B, M, N] better performance
// otherwise, tensor dy_temp in else branch might encounter
// numel overflow due to cnnlTensorDescriptor limitation
if
(
x_dims
.
size
()
==
3
&&
phi
::
vectorize
(
Y
->
dims
()).
size
()
==
2
)
{
if
(
trans_y
)
{
MatMul2DwithReduceBatch
<
T
>
(
ctx
,
dout_temp
,
x_temp
,
dY
,
true
,
trans_x
);
}
else
{
MatMul2DwithReduceBatch
<
T
>
(
ctx
,
x_temp
,
dout_temp
,
dY
,
!
trans_x
,
false
);
}
}
else
{
phi
::
DenseTensor
dy_temp
(
Y
->
type
());
if
(
y_dims
!=
y_bcast_dims
)
{
dy_temp
.
Resize
(
phi
::
make_ddim
(
y_bcast_dims
));
}
else
{
dY
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
dy_temp
.
ShareDataWith
(
*
dY
);
}
if
(
trans_y
)
{
MatMulND
<
T
>
(
ctx
,
dout_temp
,
x_temp
,
&
dy_temp
,
true
,
trans_x
);
}
else
{
MatMulND
<
T
>
(
ctx
,
x_temp
,
dout_temp
,
&
dy_temp
,
!
trans_x
,
false
);
}
if
(
y_dims
!=
y_bcast_dims
)
{
ReduceDims
<
T
>
(
ctx
,
y_dims
,
y_bcast_dims
,
dy_temp
,
dY
);
}
}
}
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_MLU_KERNEL
(
matmul_v2
,
ops
::
MatMulV2MLUKernel
<
float
>
,
ops
::
MatMulV2MLUKernel
<
plat
::
float16
>
);
REGISTER_OP_MLU_KERNEL
(
matmul_v2_grad
,
ops
::
MatMulGradV2MLUKernel
<
float
>
,
ops
::
MatMulGradV2MLUKernel
<
plat
::
float16
>
);
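A minimal standalone sketch (plain C++, no CNNL calls) of the axis selection that ReduceDims above performs before summing a broadcast gradient back to the original operand shape; the example shapes are hypothetical.

#include <cstdint>
#include <iostream>
#include <vector>

// Return the axes along which a tensor of shape `dims` was broadcast to
// `bcast_dims`; these are the axes the gradient must be summed over.
std::vector<int64_t> BroadcastReduceAxes(const std::vector<int64_t>& dims,
                                         const std::vector<int64_t>& bcast_dims) {
  std::vector<int64_t> axes;
  const int64_t diff = bcast_dims.size() - dims.size();
  for (int64_t i = 0; i < static_cast<int64_t>(bcast_dims.size()); ++i) {
    if (i < diff || bcast_dims[i] > dims[i - diff]) {
      axes.push_back(i);  // leading padded axis or expanded axis
    }
  }
  return axes;
}

int main() {
  // dY of shape [4, 5] broadcast to [8, 4, 5]: reduce over axis 0.
  for (int64_t a : BroadcastReduceAxes({4, 5}, {8, 4, 5})) std::cout << a << " ";
  std::cout << "\n";  // prints: 0
}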
paddle/fluid/operators/mean_op_mlu.cc
deleted 100644 → 0
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {

template <typename T>
class MeanMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<phi::DenseTensor>("X");
    auto* output = context.Output<phi::DenseTensor>("Out");

    const T* in_data = input->data<T>();
    T* out_data = output->mutable_data<T>(context.GetPlace());
    auto numel = input->numel();
    auto rank = input->dims().size();
    auto place = context.GetPlace();
    auto stream = context.template device_context<MLUDeviceContext>().stream();
    if (rank == 0) {
      // scalar
      memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream);
      return;
    }

    std::vector<int> reduce_dims;
    reduce_dims.reserve(rank);
    for (decltype(rank) i = 0; i < rank; ++i) {
      reduce_dims.push_back(i);
    }

    MLUCnnlTensorDesc input_desc(
        *input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input->dtype()));
    MLUCnnlTensorDesc output_desc(
        *output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));

    MLUCnnlReduceDesc reduction_desc(reduce_dims,
                                     CNNL_REDUCE_AVG,
                                     ToCnnlDataType<T>(),
                                     CNNL_NOT_PROPAGATE_NAN,
                                     CNNL_REDUCE_NO_INDICES,
                                     CNNL_32BIT_INDICES);

    MLUCnnl::Reduce(context,
                    true /*need_workspace*/,
                    reduction_desc.get(),
                    nullptr,
                    input_desc.get(),
                    reinterpret_cast<const void*>(in_data),
                    0 /*indices_size*/,
                    nullptr,
                    nullptr,
                    output_desc.get(),
                    reinterpret_cast<void*>(out_data));
  }
};

template <typename T>
class MeanMLUGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto output_grad =
        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    PADDLE_ENFORCE_EQ(output_grad->numel(),
                      1,
                      platform::errors::InvalidArgument(
                          "Mean Gradient Input phi::DenseTensor len should be 1. But "
                          "received Out@Grad's elements num is %d.",
                          output_grad->numel()));
    auto input_grad =
        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
    input_grad->mutable_data<T>(context.GetPlace());

    auto in_data = output_grad->data<T>();
    auto numel = input_grad->numel();
    auto rank = input_grad->dims().size();
    auto out_data = input_grad->data<T>();
    auto place = context.GetPlace();
    auto stream = context.template device_context<MLUDeviceContext>().stream();
    if (rank == 0) {
      // scalar
      memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream);
      return;
    }

    // means
    phi::DenseTensor mean_var(output_grad->dtype());
    mean_var.mutable_data<T>(input_grad->dims(), context.GetPlace());
    MLUCnnlTensorDesc mean_var_desc(
        mean_var, CNNL_LAYOUT_ARRAY, ToCnnlDataType(mean_var.dtype()));
    auto value = static_cast<T>(1.0 / static_cast<float>(input_grad->numel()));
    MLUCnnl::Fill(context,
                  CNNL_POINTER_MODE_HOST,
                  &value,
                  mean_var_desc.get(),
                  GetBasePtr(&mean_var));

    // means mul output_grad
    MLUCnnlTensorDesc in_desc(
        *output_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output_grad->dtype()));
    MLUCnnlTensorDesc out_desc(
        *input_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input_grad->dtype()));

    MLUCnnlOpTensorDesc op_tensor_desc(
        CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);

    MLUCnnl::OpTensor(context,
                      op_tensor_desc.get(),
                      in_desc.get(),
                      reinterpret_cast<const void*>(in_data),
                      mean_var_desc.get(),
                      GetBasePtr(&mean_var),
                      out_desc.get(),
                      reinterpret_cast<void*>(out_data),
                      ToCnnlDataType<T>());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(mean,
                       ops::MeanMLUKernel<float>,
                       ops::MeanMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(mean_grad,
                       ops::MeanMLUGradKernel<float>,
                       ops::MeanMLUGradKernel<plat::float16>);
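A small host-side sketch of what MeanMLUGradKernel computes on the device: the gradient tensor is filled with 1/numel and multiplied by the incoming scalar gradient. Shapes and values are illustrative only.

#include <iostream>
#include <vector>

// dX[i] = dOut * (1 / numel) for every element of the input.
std::vector<float> MeanGrad(float dout, size_t numel) {
  return std::vector<float>(numel, dout * (1.0f / static_cast<float>(numel)));
}

int main() {
  for (float g : MeanGrad(/*dout=*/2.0f, /*numel=*/4)) std::cout << g << " ";
  std::cout << "\n";  // prints: 0.5 0.5 0.5 0.5
}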
paddle/fluid/operators/meshgrid_op_mlu.cc
deleted 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class MeshgridMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto ins = ctx.MultiInput<phi::DenseTensor>("X");
    auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
    PADDLE_ENFORCE_EQ(
        (ins.size() > 1) && (ins.size() < 7),
        true,
        platform::errors::InvalidArgument(
            "Excepted phi::DenseTensor numbers between 2 and 6, "
            "but only received d% .",
            ins.size()));

    int64_t size = ins.size();
    std::vector<int64_t> shape(size);

    for (int64_t i = 0; i < size; i++) {
      switch (ins[i]->dims().size()) {
        case 0:
          shape[i] = 1;
          break;
        case 1:
          shape[i] = ins[i]->dims()[0];
          break;
        default:
          PADDLE_THROW(platform::errors::InvalidArgument(
              "Expected scalar or 1D tensor in the tensor list but got tensor "
              "%d: ",
              i));
      }
    }

    MLUCnnlTensorDesc out_desc(size, shape.data(), ToCnnlDataType<T>());
    framework::DDim out_dims = phi::make_ddim(shape);
    for (int64_t i = 0; i < size; i++) {
      std::vector<int64_t> view_shape(size, 1);
      view_shape[i] = shape[i];

      outs[i]->Resize(out_dims);
      outs[i]->mutable_data<T>(ctx.GetPlace());

      MLUCnnlTensorDesc in_desc(size, view_shape.data(), ToCnnlDataType<T>());
      MLUCnnl::BroadcastTo(ctx,
                           in_desc.get(),
                           GetBasePtr(ins[i]),
                           out_desc.get(),
                           GetBasePtr(outs[i]));
    }
  }
};

}  // namespace operators
}  // namespace paddle

REGISTER_OP_MLU_KERNEL(
    meshgrid,
    paddle::operators::MeshgridMLUKernel<int>,
    paddle::operators::MeshgridMLUKernel<float>,
    paddle::operators::MeshgridMLUKernel<int64_t>,
    paddle::operators::MeshgridMLUKernel<paddle::platform::float16>);
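A plain-C++ sketch of the shape bookkeeping in MeshgridMLUKernel: the i-th 1-D input of length shape[i] is viewed as [1, ..., shape[i], ..., 1] and then broadcast to the common output shape. The lengths used here are illustrative.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> shape = {2, 3};  // lengths of the two 1-D inputs
  for (size_t i = 0; i < shape.size(); ++i) {
    std::vector<int64_t> view_shape(shape.size(), 1);  // all ones ...
    view_shape[i] = shape[i];                          // ... except axis i
    std::cout << "input " << i << " viewed as [";
    for (int64_t d : view_shape) std::cout << d << " ";
    std::cout << "] then broadcast to [2 3]\n";
  }
}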
paddle/fluid/operators/one_hot_v2_op_mlu.cc
deleted 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {

template <typename T>
class OneHotV2MLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::MLUDeviceContext>();
    auto* in = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    int depth = ctx.Attr<int>("depth");
    if (ctx.HasInput("depth_tensor")) {
      std::vector<int32_t> depth_data;
      depth_data = phi::GetVectorFromTensor<int>(
          ctx.Input<phi::DenseTensor>("depth_tensor"));
      depth = depth_data[0];
      auto out_dims = out->dims();
      out_dims[out_dims.size() - 1] = depth;
      out->Resize(out_dims);
    }
    out->mutable_data<float>(ctx.GetPlace());

    float on_value = 1.0f, off_value = 0.0f;
    const int in_off_dim[1] = {1};
    phi::DenseTensor on_value_tensor =
        ctx.AllocateTmpTensor<float, MLUDeviceContext>(
            framework::DDim(in_off_dim, 1), dev_ctx);
    phi::DenseTensor off_value_tensor =
        ctx.AllocateTmpTensor<float, MLUDeviceContext>(
            framework::DDim(in_off_dim, 1), dev_ctx);
    FillMLUTensorWithHostValue(ctx, on_value, &on_value_tensor);
    FillMLUTensorWithHostValue(ctx, off_value, &off_value_tensor);

    if (framework::TransToProtoVarType(in->dtype()) ==
        framework::proto::VarType::INT32) {
      MLUCnnlTensorDesc desc_indices(*in);
      MLUCnnl::OneHot(ctx,
                      desc_indices.get(),
                      GetBasePtr(in),
                      depth,
                      GetBasePtr(&on_value_tensor),
                      GetBasePtr(&off_value_tensor),
                      -1,
                      ToCnnlDataType(out->dtype()),
                      GetBasePtr(out));
    } else {
      phi::DenseTensor transformed_in;
      transformed_in.mutable_data<int32_t>(in->dims(), dev_ctx.GetPlace());
      // use cnnlCast to cast int64_t to int32_t then do one_hot
      MLUCnnlTensorDesc in_desc(*in);
      MLUCnnlTensorDesc transformed_in_desc(transformed_in);
      cnnlCastDataType_t cast_type = GetCastDataType(
          framework::TransToProtoVarType(in->dtype()),
          framework::TransToProtoVarType(transformed_in.dtype()));
      MLUCnnl::Cast(ctx,
                    cast_type,
                    in_desc.get(),
                    GetBasePtr(in),
                    transformed_in_desc.get(),
                    GetBasePtr(&transformed_in));
      MLUCnnl::OneHot(ctx,
                      transformed_in_desc.get(),
                      GetBasePtr(&transformed_in),
                      depth,
                      GetBasePtr(&on_value_tensor),
                      GetBasePtr(&off_value_tensor),
                      -1,
                      ToCnnlDataType(out->dtype()),
                      GetBasePtr(out));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(one_hot_v2,
                       ops::OneHotV2MLUKernel<int32_t>,
                       ops::OneHotV2MLUKernel<int64_t>);
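A CPU reference sketch of the one-hot expansion that MLUCnnl::OneHot performs above, using the same on/off values (1.0f / 0.0f). The index values are illustrative.

#include <iostream>
#include <vector>

// Row i of the output is all `off_value` except position indices[i].
std::vector<float> OneHot(const std::vector<int>& indices, int depth,
                          float on_value = 1.0f, float off_value = 0.0f) {
  std::vector<float> out(indices.size() * depth, off_value);
  for (size_t i = 0; i < indices.size(); ++i) {
    if (indices[i] >= 0 && indices[i] < depth) {
      out[i * depth + indices[i]] = on_value;
    }
  }
  return out;
}

int main() {
  auto out = OneHot({1, 0, 3}, /*depth=*/4);
  for (size_t i = 0; i < out.size(); ++i) {
    std::cout << out[i] << ((i + 1) % 4 ? ' ' : '\n');
  }
}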
paddle/fluid/operators/pool_op_mlu.cc
deleted 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/funcs/pooling.h"
namespace paddle {
namespace operators {

namespace {

cnnlPoolingMode_t ToCnnlPoolingMode(const std::string& pooling_type,
                                    bool exclusive,
                                    bool adaptive) {
  cnnlPoolingMode_t pooling_mode;
  if (pooling_type == "max") {
    pooling_mode = CNNL_POOLING_MAX;
  } else if (pooling_type == "avg") {
    if (exclusive && !adaptive) {
      pooling_mode = CNNL_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
    } else {
      pooling_mode = CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
    }
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument("Unknown pooling_type: %s",
                                                   pooling_type));
  }
  return pooling_mode;
}
}  // namespace

template <typename T>
class MLUPoolOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
    const phi::DenseTensor* in_x = ctx.Input<phi::DenseTensor>("X");
    phi::DenseTensor* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::string data_format = ctx.Attr<std::string>("data_format");

    bool global_pooling = ctx.Attr<bool>("global_pooling");
    bool ceil_mode = ctx.Attr<bool>("ceil_mode");
    bool exclusive = ctx.Attr<bool>("exclusive");
    bool adaptive = ctx.Attr<bool>("adaptive");
    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");

    PADDLE_ENFORCE_EQ(in_x->dims().size(),
                      4,
                      platform::errors::InvalidArgument(
                          "Only support 4-dims for mlu pool2d kernel."));

    const bool channel_last = data_format == "NHWC";
    // default
    cnnlTensorLayout_t cnnl_layout = CNNL_LAYOUT_NCHW;
    auto out_dims = out->dims();
    int64_t out_h = out_dims[2];
    int64_t out_w = out_dims[3];
    auto in_x_dims = in_x->dims();
    framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());

    if (channel_last) {
      cnnl_layout = CNNL_LAYOUT_NHWC;
      out_h = out_dims[1];
      out_w = out_dims[2];
      data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
    }

    phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive,
                              padding_algorithm, data_dims, strides, ksize);
    if (global_pooling) {
      phi::funcs::UpdateKernelSize(&ksize, data_dims);
    }

    MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType<T>());
    MLUCnnlTensorDesc out_desc(*out, cnnl_layout, ToCnnlDataType<T>());

    cnnlPoolingMode_t pool_mode =
        ToCnnlPoolingMode(pooling_type, exclusive, adaptive);

    // transpose NCHW to NHWC since cnnl pool2d has worse performance in that
    // layout.
    phi::DenseTensor trans_in_x;
    phi::DenseTensor trans_out;
    if (channel_last) {
      trans_in_x = *in_x;
      trans_out = *out;
    } else {
      std::vector<int> perm{0, 2, 3, 1};
      TransposeFromMLUTensor<T>(ctx, perm, in_x, &trans_in_x,
                                true /*need_reshape_or_alloc*/);
      trans_out = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
          {out_dims[0], out_dims[2], out_dims[3], out_dims[1]}, dev_ctx);
    }
    MLUCnnlTensorDesc trans_in_x_desc(trans_in_x, CNNL_LAYOUT_NHWC,
                                      ToCnnlDataType<T>());
    MLUCnnlTensorDesc trans_out_desc(trans_out, CNNL_LAYOUT_NHWC,
                                     ToCnnlDataType<T>());

    if (!adaptive) {
      MLUCnnlPoolingDesc pool_desc(pool_mode, CNNL_NOT_PROPAGATE_NAN,
                                   ksize[0], ksize[1],
                                   paddings[0], paddings[1],
                                   paddings[2], paddings[3],
                                   strides[0], strides[1],
                                   1 /*row_dilation*/, 1 /*col_dilation*/,
                                   ceil_mode);

      size_t extra_input_size = 0;
      cnnlHandle_t handle =
          ctx.template device_context<MLUDeviceContext>().cnnl_handle();
      cnnlGetPoolingExtraInputSize(handle, pool_mode, out_w, out_h,
                                   &extra_input_size);

      if (extra_input_size > 0) {
        phi::DenseTensor extra_host_tensor;
        extra_host_tensor.mutable_data<int8_t>(
            {static_cast<int64_t>(extra_input_size)}, platform::CPUPlace());
        cnnlInitPoolingExtraInput(handle, pool_desc.get(),
                                  trans_in_x_desc.get(), trans_out_desc.get(),
                                  GetBasePtr(&extra_host_tensor));
        phi::DenseTensor extra_device_tensor =
            ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
                {static_cast<int64_t>(extra_input_size)}, dev_ctx);
        framework::TensorCopy(extra_host_tensor, ctx.GetPlace(),
                              &extra_device_tensor);
        // Increase extra_host_tensor holder_ reference count until copy
        // complete.
        auto increase_ref_count = [extra_host_tensor]() {
          VLOG(4) << "Finished copying extra_host_tensor["
                  << GetBasePtr(&extra_host_tensor)
                  << "] in mlu pooling kernel.";
        };
        dev_ctx.AddStreamCallback(increase_ref_count);
        MLUCnnl::PoolingForward(
            ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/,
            trans_in_x_desc.get(), GetBasePtr(&trans_in_x), nullptr /*beta*/,
            GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/,
            trans_out_desc.get(), GetBasePtr(&trans_out));
      } else {
        MLUCnnl::PoolingForward(
            ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/,
            trans_in_x_desc.get(), GetBasePtr(&trans_in_x), nullptr /*beta*/,
            nullptr /*params_shape_ptr*/, trans_out_desc.get(),
            GetBasePtr(&trans_out));
      }
    } else {
      MLUCnnl::AdaptivePoolingForward(
          ctx, pool_mode, trans_in_x_desc.get(), GetBasePtr(&trans_in_x),
          trans_out_desc.get(), GetBasePtr(&trans_out), nullptr, nullptr);
    }
    if (!channel_last) {
      std::vector<int> perm{0, 3, 1, 2};
      TransposeFromMLUTensor<T>(ctx, perm, &trans_out, out,
                                false /*need_reshape_or_alloc*/);
    }
  }
};

template <typename T, typename IDX_T>
class MLUPoolGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
    const phi::DenseTensor* in_x = ctx.Input<phi::DenseTensor>("X");
    const phi::DenseTensor* out = ctx.Input<phi::DenseTensor>("Out");
    const phi::DenseTensor* out_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    phi::DenseTensor* in_x_grad =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    in_x_grad->mutable_data<T>(ctx.GetPlace());

    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    bool ceil_mode = ctx.Attr<bool>("ceil_mode");
    bool exclusive = ctx.Attr<bool>("exclusive");
    bool adaptive = ctx.Attr<bool>("adaptive");
    std::string data_format = ctx.Attr<std::string>("data_format");
    bool global_pooling = ctx.Attr<bool>("global_pooling");
    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");

    const bool channel_last = data_format == "NHWC";

    auto in_x_dims = in_x->dims();
    framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
    if (channel_last) {
      data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
    }

    phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive,
                              padding_algorithm, data_dims, strides, ksize);
    if (global_pooling) {
      phi::funcs::UpdateKernelSize(&ksize, data_dims);
    }

    // inputs need with NHWC layout
    phi::DenseTensor trans_in_x;
    phi::DenseTensor trans_out;
    phi::DenseTensor trans_out_grad;
    phi::DenseTensor trans_in_x_grad;
    if (channel_last) {
      trans_in_x = *in_x;
      trans_out = *out;
      trans_out_grad = *out_grad;
      trans_in_x_grad = *in_x_grad;
    } else {
      std::vector<int> perm{0, 2, 3, 1};
      TransposeFromMLUTensor<T>(ctx, perm, in_x, &trans_in_x,
                                true /*need_reshape_or_alloc*/);
      TransposeFromMLUTensor<T>(ctx, perm, out, &trans_out,
                                true /*need_reshape_or_alloc*/);
      TransposeFromMLUTensor<T>(ctx, perm, out_grad, &trans_out_grad,
                                true /*need_reshape_or_alloc*/);
      auto in_x_grad_dims = in_x_grad->dims();
      trans_in_x_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
          {in_x_grad_dims[0], in_x_grad_dims[2], in_x_grad_dims[3],
           in_x_grad_dims[1]},
          dev_ctx);
    }
    MLUCnnlTensorDesc trans_in_x_desc(trans_in_x, CNNL_LAYOUT_NHWC,
                                      ToCnnlDataType<T>());
    MLUCnnlTensorDesc trans_out_desc(trans_out, CNNL_LAYOUT_NHWC,
                                     ToCnnlDataType<T>());
    MLUCnnlTensorDesc trans_out_grad_desc(trans_out_grad, CNNL_LAYOUT_NHWC,
                                          ToCnnlDataType<T>());
    MLUCnnlTensorDesc trans_in_x_grad_desc(trans_in_x_grad, CNNL_LAYOUT_NHWC,
                                           ToCnnlDataType<T>());

    cnnlPoolingMode_t pool_mode =
        ToCnnlPoolingMode(pooling_type, exclusive, adaptive);
    MLUCnnlPoolingDesc pool_desc(pool_mode, CNNL_NOT_PROPAGATE_NAN,
                                 ksize[0], ksize[1],
                                 paddings[0], paddings[1],
                                 paddings[2], paddings[3],
                                 strides[0], strides[1],
                                 1 /*row_dilation*/, 1 /*col_dilation*/,
                                 ceil_mode);

    if (pooling_type == "max") {
      phi::DenseTensor index_tensor =
          ctx.AllocateTmpTensor<IDX_T, MLUDeviceContext>(trans_out_grad.dims(),
                                                         dev_ctx);
      MLUCnnlTensorDesc index_tensor_desc(index_tensor, CNNL_LAYOUT_NHWC,
                                          ToCnnlDataType<IDX_T>());
      MLUCnnl::PoolingIndex(ctx, pool_desc.get(), trans_in_x_desc.get(),
                            GetBasePtr(&trans_in_x), index_tensor_desc.get(),
                            GetBasePtr(&index_tensor));
      if (adaptive) {
        MLUCnnl::AdaptivePoolingBackward(
            ctx, pool_mode, trans_out_grad_desc.get(),
            GetBasePtr(&trans_out_grad), index_tensor_desc.get(),
            GetBasePtr(&index_tensor), trans_in_x_grad_desc.get(),
            GetBasePtr(&trans_in_x_grad));
      } else {
        MLUCnnl::PoolingBackward(
            ctx, pool_desc.get(), nullptr /*alpha*/, index_tensor_desc.get(),
            GetBasePtr(&index_tensor), trans_out_grad_desc.get(),
            GetBasePtr(&trans_out_grad), trans_in_x_desc.get(),
            GetBasePtr(&trans_in_x), nullptr /*beta*/,
            trans_in_x_grad_desc.get(), GetBasePtr(&trans_in_x_grad));
      }
    } else {
      if (adaptive) {
        MLUCnnl::AdaptivePoolingBackward(
            ctx, pool_mode, trans_out_grad_desc.get(),
            GetBasePtr(&trans_out_grad), nullptr /*index_tensor_desc.get()*/,
            nullptr /*GetBasePtr(&index_tensor)*/, trans_in_x_grad_desc.get(),
            GetBasePtr(&trans_in_x_grad));
      } else {
        MLUCnnl::PoolingBackward(
            ctx, pool_desc.get(), nullptr /*alpha*/, nullptr, nullptr,
            trans_out_grad_desc.get(), GetBasePtr(&trans_out_grad), nullptr,
            nullptr, nullptr /*beta*/, trans_in_x_grad_desc.get(),
            GetBasePtr(&trans_in_x_grad));
      }
    }
    if (!channel_last) {
      std::vector<int> perm{0, 3, 1, 2};
      TransposeFromMLUTensor<T>(ctx, perm, &trans_in_x_grad, in_x_grad,
                                false /*need_reshape_or_alloc*/);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(pool2d,
                       ops::MLUPoolOpKernel<float>,
                       ops::MLUPoolOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(pool2d_grad,
                       ops::MLUPoolGradOpKernel<float, int>,
                       ops::MLUPoolGradOpKernel<plat::float16, int16_t>);
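A standalone sketch of the layout permutations used by the pooling kernels above: {0, 2, 3, 1} turns NCHW dims into NHWC before pooling, and {0, 3, 1, 2} turns the result back. The tensor dims are illustrative.

#include <cstdint>
#include <iostream>
#include <vector>

// out[i] = dims[perm[i]], the same index mapping a transpose applies to shapes.
std::vector<int64_t> Permute(const std::vector<int64_t>& dims,
                             const std::vector<int>& perm) {
  std::vector<int64_t> out(dims.size());
  for (size_t i = 0; i < perm.size(); ++i) out[i] = dims[perm[i]];
  return out;
}

int main() {
  std::vector<int64_t> nchw = {8, 3, 32, 32};
  auto nhwc = Permute(nchw, {0, 2, 3, 1});  // {8, 32, 32, 3}
  auto back = Permute(nhwc, {0, 3, 1, 2});  // {8, 3, 32, 32}
  for (int64_t d : nhwc) std::cout << d << " ";
  std::cout << "| ";
  for (int64_t d : back) std::cout << d << " ";
  std::cout << "\n";
}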
paddle/fluid/operators/randperm_op_mlu.cc
deleted 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/randperm_op.h"
namespace paddle {
namespace operators {

template <typename T>
class RandpermMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    int n = ctx.Attr<int>("n");
    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
    framework::Variable* out_var = ctx.OutputVar("Out");
    phi::DenseTensor* out_tensor =
        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);

    phi::DenseTensor tmp_tensor;
    tmp_tensor.Resize(phi::make_ddim({n}));
    T* tmp_data = tmp_tensor.mutable_data<T>(platform::CPUPlace());
    random_permate<T>(tmp_data, n, seed);
    framework::TensorCopySync(tmp_tensor, ctx.GetPlace(), out_tensor);
  }
};

}  // namespace operators
}  // namespace paddle

template <typename T>
using kernel = paddle::operators::RandpermMLUKernel<T>;

REGISTER_OP_MLU_KERNEL(
    randperm, kernel<int64_t>, kernel<int>, kernel<float>, kernel<double>);
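The kernel above builds the permutation on the CPU (random_permate from randperm_op.h) and then copies it to the device. As a hedged illustration only, roughly equivalent host-side logic can be written with the standard library; this is not the same function the kernel uses.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

std::vector<int64_t> RandPermCPU(int n, unsigned int seed) {
  std::vector<int64_t> perm(n);
  std::iota(perm.begin(), perm.end(), 0);       // 0, 1, ..., n-1
  std::mt19937 rng(seed);
  std::shuffle(perm.begin(), perm.end(), rng);  // in-place permutation
  return perm;
}

int main() {
  for (int64_t v : RandPermCPU(6, /*seed=*/42)) std::cout << v << " ";
  std::cout << "\n";
}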
paddle/fluid/operators/range_op_mlu.cc
deleted 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/range_op.h"
namespace paddle {
namespace operators {

template <typename T>
class RangeMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* start_t = context.Input<phi::DenseTensor>("Start");
    auto* end_t = context.Input<phi::DenseTensor>("End");
    auto* step_t = context.Input<phi::DenseTensor>("Step");
    auto* out = context.Output<phi::DenseTensor>("Out");

    phi::DenseTensor n;
    framework::TensorCopy(
        *start_t,
        platform::CPUPlace(),
        context.template device_context<platform::MLUDeviceContext>(),
        &n);
    context.template device_context<paddle::platform::MLUDeviceContext>()
        .Wait();
    T start = n.data<T>()[0];
    framework::TensorCopy(
        *end_t,
        platform::CPUPlace(),
        context.template device_context<platform::MLUDeviceContext>(),
        &n);
    context.template device_context<paddle::platform::MLUDeviceContext>()
        .Wait();
    T end = n.data<T>()[0];
    framework::TensorCopy(
        *step_t,
        platform::CPUPlace(),
        context.template device_context<platform::MLUDeviceContext>(),
        &n);
    context.template device_context<paddle::platform::MLUDeviceContext>()
        .Wait();
    T step = n.data<T>()[0];

    int64_t size = 0;
    GetSize(start, end, step, &size);

    out->Resize(phi::make_ddim({size}));
    out->mutable_data<T>(context.GetPlace());

    std::vector<T> odata;
    T value = start;
    for (int64_t i = 0; i < size; ++i) {
      odata.push_back(value);
      value += step;
    }

    framework::TensorFromVector(odata, context.device_context(), out);
  }
};

}  // namespace operators
}  // namespace paddle

REGISTER_OP_MLU_KERNEL(range,
                       paddle::operators::RangeMLUKernel<int>,
                       paddle::operators::RangeMLUKernel<int64_t>,
                       paddle::operators::RangeMLUKernel<float>,
                       paddle::operators::RangeMLUKernel<double>)
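A host-side sketch of the range computation above: the element count is roughly ceil((end - start) / step), which is the quantity GetSize from range_op.h is assumed to produce (the real helper also validates step and handles integral types), and the values are filled by repeated addition as in the loop above. Values here are illustrative.

#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<float> RangeCPU(float start, float end, float step) {
  const int64_t size = static_cast<int64_t>(std::ceil((end - start) / step));
  std::vector<float> out;
  float value = start;
  for (int64_t i = 0; i < size; ++i) {
    out.push_back(value);  // start, start + step, start + 2*step, ...
    value += step;
  }
  return out;
}

int main() {
  for (float v : RangeCPU(0.0f, 5.0f, 1.5f)) std::cout << v << " ";
  std::cout << "\n";  // prints: 0 1.5 3 4.5
}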
paddle/fluid/operators/reshape_op_mlu.cc
deleted 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class Reshape2MLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");

    std::vector<int32_t> target_shape_vector;
    auto shape_tensor_vector = ctx.MultiInput<phi::DenseTensor>("ShapeTensor");
    if (shape_tensor_vector.size() > 0) {
      for (auto* shape_tensor : shape_tensor_vector) {
        PADDLE_ENFORCE_EQ(
            shape_tensor->dims().size(),
            1,
            platform::errors::InvalidArgument(
                "If the element type of 'shape' in Reshape Op is Tensor, "
                "the element's shape must be [1]. But received the element's "
                "shape is [%d]",
                shape_tensor->dims().size()));
        target_shape_vector.push_back(
            phi::GetVectorFromTensor<int>(shape_tensor)[0]);
      }
    } else {
      auto* shape_tensor = ctx.HasInput("Shape")
                               ? ctx.Input<phi::DenseTensor>("Shape")
                               : nullptr;
      if (shape_tensor) {
        target_shape_vector = phi::GetVectorFromTensor<int>(shape_tensor);
      } else {
        target_shape_vector = ctx.Attr<std::vector<int>>("shape");
        PADDLE_ENFORCE_GT(
            target_shape_vector.size(),
            0,
            platform::errors::InvalidArgument(
                "The length of shape attribute should be larger than 0 when "
                "input ShapeTensor and Shape are empty!"));
      }
    }

    int num_negative =
        std::count(target_shape_vector.begin(), target_shape_vector.end(), -1);
    PADDLE_ENFORCE_LE(
        num_negative,
        1,
        platform::errors::InvalidArgument(
            "The max number of -1 in shape attribute or shape tensor is 1 "
            "but received %d.",
            num_negative));

    auto it_zero =
        std::find(target_shape_vector.begin(), target_shape_vector.end(), 0);
    if (it_zero != target_shape_vector.end()) {
      int x_rank = x->dims().size();
      for (size_t i = 0; i < target_shape_vector.size(); i++) {
        if (target_shape_vector[i] == 0) {
          PADDLE_ENFORCE_LT(
              i,
              x_rank,
              platform::errors::InvalidArgument(
                  "The index of 0 in shape attribute or shape tensor",
                  "should be less than input dim size, ",
                  "but the index is %d and input dim size is %d",
                  i,
                  x_rank));
          target_shape_vector[i] = x->dims().at(i);
        }
      }
    }

    auto it =
        std::find(target_shape_vector.begin(), target_shape_vector.end(), -1);
    if (it != target_shape_vector.end()) {
      auto ddim_out_vec = phi::vectorize(x->dims());
      int ddim_out_product = std::accumulate(
          ddim_out_vec.begin(), ddim_out_vec.end(), 1, std::multiplies<int>());
      int reshape_out_product = std::accumulate(target_shape_vector.begin(),
                                                target_shape_vector.end(),
                                                -1,
                                                std::multiplies<int>());
      int index = std::distance(target_shape_vector.begin(), it);
      target_shape_vector[index] = ddim_out_product / reshape_out_product;
    }

    auto out_dims = phi::make_ddim(target_shape_vector);
    out->mutable_data<T>(out_dims, ctx.GetPlace());

    // output should copy to mlu
    framework::TensorCopy(
        *x,
        ctx.GetPlace(),
        ctx.template device_context<platform::DeviceContext>(),
        out);
    out->Resize(out_dims);
  }
};

template <typename DeviceContext, typename T>
class Reshape2GradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto in_dims = d_x->dims();

    d_x->mutable_data(ctx.GetPlace(), d_out->type());
    framework::TensorCopy(
        *d_out,
        ctx.GetPlace(),
        ctx.template device_context<platform::DeviceContext>(),
        d_x);
    d_x->Resize(in_dims);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
    reshape2,
    ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, int>,
    ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, int64_t>,
    ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, bool>,
    ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, double>,
    ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
    ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext,
                           paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    reshape2_grad,
    ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, int>,
    ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>,
    ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, bool>,
    ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, double>,
    ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
    ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext,
                               paddle::platform::float16>);
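A standalone sketch of the shape inference performed above: a 0 entry copies the corresponding input dimension, and a single -1 entry is inferred from the remaining product. The shapes used here are illustrative.

#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

std::vector<int> InferShape(const std::vector<int64_t>& in_dims,
                            std::vector<int> target) {
  const int64_t in_numel = std::accumulate(
      in_dims.begin(), in_dims.end(), int64_t{1}, std::multiplies<int64_t>());
  for (size_t i = 0; i < target.size(); ++i) {
    if (target[i] == 0) target[i] = static_cast<int>(in_dims[i]);  // copy dim
  }
  int64_t known = 1;
  int neg_index = -1;
  for (size_t i = 0; i < target.size(); ++i) {
    if (target[i] == -1) {
      neg_index = static_cast<int>(i);
    } else {
      known *= target[i];
    }
  }
  if (neg_index >= 0) target[neg_index] = static_cast<int>(in_numel / known);
  return target;
}

int main() {
  auto shape = InferShape({4, 3, 2}, {0, -1});  // -> {4, 6}
  for (int d : shape) std::cout << d << " ";
  std::cout << "\n";
}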
paddle/fluid/operators/rnn_op_mlu.cc
deleted 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {

using DDim = framework::DDim;
using TensorList = std::vector<phi::DenseTensor>;

template <typename TensorType, typename T>
void reset_parameter_vector(
    const std::vector<TensorType>& raw_params_vec,
    const int& num_layers,
    const bool& is_bidirec,
    std::vector<std::vector<std::pair<T*, size_t>>>* params_vec) {
  // the parameter raw seuquence is [FWhi, FWhh, BWhi, BWhh] * num_layers
  // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to
  // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers
  const int& direction_num = is_bidirec ? 2 : 1;
  const int& layer_weight_size = 4 * direction_num;
  const int& all_weight_size = num_layers * layer_weight_size;
  const int& bias_start_idx = all_weight_size / 2;
  for (int i = 0; i < num_layers; i++) {
    params_vec->at(i).resize(layer_weight_size);
    for (int j = 0; j < layer_weight_size; j++) {
      int k = j % 4;
      const int& section = j / 4;
      int tensor_idx = i * 2 * direction_num + section * 2 + k % 2;
      if (k >= 2) {
        tensor_idx += bias_start_idx;
      }
      using remove_cv_t = typename std::remove_cv<T>::type;
      params_vec->at(i)[j] = std::make_pair(
          const_cast<T*>(
              raw_params_vec[tensor_idx]->template data<remove_cv_t>()),
          raw_params_vec[tensor_idx]->numel() * sizeof(T));
    }
  }
}

template <typename DeviceContext, typename T>
class RNNMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Input
    auto& dev_ctx = GetDevCtxFromCTX(ctx);
    auto* input = ctx.Input<phi::DenseTensor>("Input");
    auto pre_state = ctx.MultiInput<phi::DenseTensor>("PreState");
    auto weight_list = ctx.MultiInput<phi::DenseTensor>("WeightList");
    bool has_seq_length = ctx.HasInput("SequenceLength");
    // Output
    auto state = ctx.MultiOutput<phi::DenseTensor>("State");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    auto* reserve_data = ctx.Output<phi::DenseTensor>("Reserve");
    // Attributes
    const int& num_layers = ctx.Attr<int>("num_layers");
    const bool& is_bidirec = ctx.Attr<bool>("is_bidirec");
    const int& hidden_size = ctx.Attr<int>("hidden_size");
    const std::string& mode = ctx.Attr<std::string>("mode");

    const phi::DenseTensor* sequence_length = nullptr;
    if (has_seq_length) {
      sequence_length = ctx.Input<phi::DenseTensor>("SequenceLength");
    }

    auto init_h = pre_state[0];  // -> hx
    auto init_c = pre_state[1];  // -> cx
    auto last_h = state[0];
    auto last_c = state[1];

    // check shape
    const int in_out_dim_num = input->dims().size();
    const int& seq_len = input->dims()[0];  // time_step
    const int& batch_size = input->dims()[1];
    const int& input_dim = input->dims()[2];
    const int& direction_num = is_bidirec ? 2 : 1;
    int in_dim_arr[in_out_dim_num] = {seq_len, batch_size, input_dim};
    int out_dim_arr[in_out_dim_num] = {
        seq_len, batch_size, direction_num * hidden_size};
    int proj_size = hidden_size;

    std::vector<int> seq_len_vec(batch_size, seq_len);
    if (has_seq_length) {
      // set seq_len if no padding, otherwise seq_len for
      // each element.
      seq_len_vec = phi::GetVectorFromTensor(sequence_length);
    }
    cnnlDirectionMode_t direction =
        is_bidirec ? CNNL_RNN_BIDIRECTIONAL : CNNL_RNN_UNIDIRECTIONAL;

    PADDLE_ENFORCE_EQ(
        mode,
        "LSTM",
        platform::errors::InvalidArgument(
            "MLU only support LSTM mode now, current mode is %s", mode));
    PADDLE_ENFORCE_EQ(
        num_layers,
        1,
        platform::errors::InvalidArgument(
            "MLU only support 1 num_layers, current num_layers is %s",
            num_layers));
    PADDLE_ENFORCE_EQ(
        init_h->dims()[0],
        num_layers * direction_num,
        platform::errors::InvalidArgument("The num_layers of in RNN layer must"
                                          " be the same as first dim of init "
                                          "hidden, but received num_layers:%d,"
                                          " dim:%d",
                                          num_layers,
                                          init_h->dims()[0]));
    PADDLE_ENFORCE_EQ(
        init_c->dims()[0],
        num_layers * direction_num,
        platform::errors::InvalidArgument(
            "The num_layers of in RNN layer must"
            " be the same as first dim of cell state hidden, but received"
            " num_layers:%d, dim:%d",
            num_layers,
            init_c->dims()[0]));

    // weightlist
    std::vector<std::vector<std::pair<T*, size_t>>> parameter_lists;
    parameter_lists.resize(num_layers);
    reset_parameter_vector(
        weight_list, num_layers, is_bidirec, &parameter_lists);

    // init the output and allocate the memory
    output->mutable_data<T>(ctx.GetPlace());  // -> y in cnnl
    last_h->mutable_data<T>(ctx.GetPlace());  // -> hy in cnnl
    last_c->mutable_data<T>(ctx.GetPlace());  // -> cy in cnnl

    MLUSeqDataDesc input_seq_data_desc(CNNL_SEQDATA_TNC,
                                       ToCnnlDataType(input->dtype()),
                                       in_out_dim_num,
                                       in_dim_arr,
                                       static_cast<int>(seq_len_vec.size()),
                                       seq_len_vec.data(),
                                       nullptr);
    MLUSeqDataDesc out_seq_data_desc(CNNL_SEQDATA_TNC,
                                     ToCnnlDataType(input->dtype()),
                                     in_out_dim_num,
                                     out_dim_arr,
                                     static_cast<int>(seq_len_vec.size()),
                                     seq_len_vec.data(),
                                     nullptr);
    MLUCnnlTensorDesc hx_desc(*init_h);
    MLUCnnlTensorDesc cx_desc(*init_c);

    MLURNNDesc rnn_desc(CNNL_LSTM,
                        CNNL_RNN_DOUBLE_BIAS,
                        direction,
                        CNNL_RNN_LINEAR_INPUT,
                        ToCnnlDataType(input->dtype()),
                        ToCnnlDataType(input->dtype()),
                        input_dim,
                        hidden_size,
                        /*projection*/ proj_size,
                        num_layers,
                        nullptr,
                        CNNL_RNN_PADDED_IO_DISABLED);
    rnn_desc.SetRNNMaskMode(CNNL_LSTM_MASK_ENABLED);

    // copy weight params
    size_t weightspace_size;
    phi::DenseTensor weightspace;
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNWeightSpaceSize(
        GetHandleFromCTX(ctx), rnn_desc.get(), &weightspace_size));

    weightspace = ctx.AllocateTmpTensor<T, DeviceContext>(
        {static_cast<int64_t>(weightspace_size)}, dev_ctx);
    void* weightspace_ptr = weightspace.mutable_data(ctx.GetPlace());
    auto w_x = parameter_lists[0][0];
    auto w_h = parameter_lists[0][1];
    auto b_x = parameter_lists[0][2];
    auto b_h = parameter_lists[0][3];
    auto actual_total_w_size =
        w_x.second + w_h.second + b_x.second + b_h.second;

    void* w_x_ptr = weightspace_ptr;
    void* w_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second;
    void* b_x_ptr =
        static_cast<char*>(weightspace_ptr) + w_x.second + w_h.second;
    void* b_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second +
                    w_h.second + b_x.second;

    memory::Copy(weightspace.place(), w_x_ptr, weightspace.place(), w_x.first,
                 w_x.second, nullptr);
    memory::Copy(weightspace.place(), w_h_ptr, weightspace.place(), w_h.first,
                 w_h.second, nullptr);
    memory::Copy(weightspace.place(), b_x_ptr, weightspace.place(), b_x.first,
                 b_x.second, nullptr);
    memory::Copy(weightspace.place(), b_h_ptr, weightspace.place(), b_h.first,
                 b_h.second, nullptr);

    if (is_bidirec) {
      auto bw_x = parameter_lists[0][4];
      auto bw_h = parameter_lists[0][5];
      auto bb_x = parameter_lists[0][6];
      auto bb_h = parameter_lists[0][7];
      void* bw_x_ptr =
          static_cast<char*>(weightspace_ptr) + actual_total_w_size;
      void* bw_h_ptr = static_cast<char*>(weightspace_ptr) +
                       actual_total_w_size + bw_x.second;
      void* bb_x_ptr = static_cast<char*>(weightspace_ptr) +
                       actual_total_w_size + bw_x.second + bw_h.second;
      void* bb_h_ptr = static_cast<char*>(weightspace_ptr) +
                       actual_total_w_size + bw_x.second + bw_h.second +
                       bb_x.second;
      actual_total_w_size +=
          bw_x.second + bw_h.second + bb_x.second + bb_h.second;

      memory::Copy(weightspace.place(), bw_x_ptr, weightspace.place(),
                   bw_x.first, bw_x.second, nullptr);
      memory::Copy(weightspace.place(), bw_h_ptr, weightspace.place(),
                   bw_h.first, bw_h.second, nullptr);
      memory::Copy(weightspace.place(), bb_x_ptr, weightspace.place(),
                   bb_x.first, bb_x.second, nullptr);
      memory::Copy(weightspace.place(), bb_h_ptr, weightspace.place(),
                   bb_h.first, bb_h.second, nullptr);
    }

    PADDLE_ENFORCE_EQ(weightspace_size,
                      actual_total_w_size,
                      platform::errors::InvalidArgument(
                          "The weightsize doesn't match"
                          " weightspace_size:%d, actual_total_w_size:%d",
                          weightspace_size,
                          actual_total_w_size));

    // get reservespace_ptr
    int gate_num = 4;
    int hidden_data_idx = (num_layers - 1);
    hidden_data_idx += (gate_num + 1) * num_layers;
    const int& block_size = direction_num * seq_len * batch_size * hidden_size;
    reserve_data->Resize({hidden_data_idx, block_size});

    reserve_data->mutable_data<T>(ctx.GetPlace());

    MLUCnnl::RNNForward(ctx,
                        rnn_desc.get(),
                        seq_len_vec.data(),
                        weightspace_ptr,
                        weightspace_size,
                        input_seq_data_desc.get(),
                        GetBasePtr(input),
                        out_seq_data_desc.get(),
                        GetBasePtr(output),
                        hx_desc.get(),
                        GetBasePtr(init_h),
                        GetBasePtr(last_h),
                        cx_desc.get(),
                        GetBasePtr(init_c),
                        GetBasePtr(last_c),
                        GetBasePtr(reserve_data));

    if (has_seq_length) {
      // if has_seq_length, do mask out the output of cnnlRNNForwardTraining
      auto masked_mode = CNNL_MASKED_FILL;
      float off_value = 0.0f;

      phi::DenseTensor on_value_tensor(input->dtype());
      phi::DenseTensor masked_tensor(framework::TransToPhiDataType(VT::INT8));
      phi::DenseTensor h_masked_tensor(
          framework::TransToPhiDataType(VT::INT8));
      on_value_tensor.Resize({1});
      masked_tensor.Resize({seq_len, batch_size, direction_num * hidden_size});
      h_masked_tensor.Resize(
          {seq_len, batch_size, direction_num * hidden_size});

      on_value_tensor.mutable_data<T>(ctx.GetPlace());
      masked_tensor.mutable_data<int8_t>(ctx.GetPlace());
      int8_t* h_masked_ptr =
          h_masked_tensor.mutable_data<int8_t>(platform::CPUPlace());

      for (int t = 0; t < seq_len; ++t) {
        for (int n = 0; n < batch_size; ++n) {
          for (int c = 0; c < direction_num * hidden_size; ++c) {
            auto tmp_seq_len = seq_len_vec[n];
            auto offset = t * batch_size * direction_num * hidden_size +
                          n * direction_num * hidden_size + c;
            *(h_masked_ptr + offset) = t >= tmp_seq_len ? 1 : 0;
          }
        }
      }

      framework::TensorCopy(
          h_masked_tensor, ctx.GetPlace(), dev_ctx, &masked_tensor);
      dev_ctx.Wait();

      FillMLUTensorWithHostValue(ctx, off_value, &on_value_tensor);
      MLUCnnlTensorDesc on_value_desc(on_value_tensor);
      MLUCnnlTensorDesc output_desc(*output);
      MLUCnnlTensorDesc masked_desc(masked_tensor);

      MLUCnnl::Mask(ctx,
                    masked_mode,
                    output_desc.get(),
                    GetBasePtr(output),
                    masked_desc.get(),
                    GetBasePtr(&masked_tensor),
                    on_value_desc.get(),
                    GetBasePtr(&on_value_tensor),
                    output_desc.get(),
                    GetBasePtr(output),
                    nullptr);
    }
  }
};

template <typename DeviceContext, typename T>
class RNNMLUGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto stream = ctx.template device_context<MLUDeviceContext>().stream();
    // get the tensor pointer for the input
    auto* input = ctx.Input<phi::DenseTensor>("Input");
    auto pre_state = ctx.MultiInput<phi::DenseTensor>("PreState");
    auto weight_list = ctx.MultiInput<phi::DenseTensor>("WeightList");
    auto* output = ctx.Input<phi::DenseTensor>("Out");
    auto* reserve_data = ctx.Input<phi::DenseTensor>("Reserve");
    const int& num_layers = ctx.Attr<int>("num_layers");
    const bool& is_bidirec = ctx.Attr<bool>("is_bidirec");
    const int& hidden_size = ctx.Attr<int>("hidden_size");
    const std::string& mode = ctx.Attr<std::string>("mode");

    bool has_seq_length = ctx.HasInput("SequenceLength");
    const phi::DenseTensor* sequence_length = nullptr;
    if (has_seq_length) {
      sequence_length = ctx.Input<phi::DenseTensor>("SequenceLength");
    }

    PADDLE_ENFORCE_EQ(
        mode,
        "LSTM",
        platform::errors::InvalidArgument(
            "XPU only support LSTM mode now, current mode is %s", mode));

    auto init_h = pre_state[0];  // -> hx
    auto init_c = pre_state[1];  // -> cx

    auto output_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto state_grad =
        ctx.MultiInput<phi::DenseTensor>(framework::GradVarName("State"));
    auto last_h_grad = state_grad[0];  // -> dhy
    auto last_c_grad = state_grad[1];  // -> dcy

    // get the tensor pointer for the output
    auto* input_grad =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
    auto weight_grad_list =
        ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("WeightList"));
    auto pre_state_grad =
        ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("PreState"));
    phi::DenseTensor* init_h_grad = nullptr;
    phi::DenseTensor* init_c_grad = nullptr;
    if (pre_state_grad.size() > 0) {    // has gradient
      init_h_grad = pre_state_grad[0];  // -> dhx
      init_c_grad = pre_state_grad[1];  // -> dcx
    }

    // check shape
    const int in_out_dim_num = input->dims().size();
    const int& seq_len = input->dims()[0];
    const int& batch_size = input->dims()[1];
    const int& input_dim = input->dims()[2];
    const int& direction_num = is_bidirec ? 2 : 1;
    int in_dim_arr[in_out_dim_num] = {seq_len, batch_size, input_dim};
    int out_dim_arr[in_out_dim_num] = {
        seq_len, batch_size, direction_num * hidden_size};
    int proj_size = hidden_size;
    PADDLE_ENFORCE_EQ(
        num_layers,
        1,
        platform::errors::InvalidArgument(
            "MLU only support 1 num_layers, current num_layers is %s",
            num_layers));
    PADDLE_ENFORCE_EQ(
        init_h->dims()[0],
        num_layers * direction_num,
        platform::errors::InvalidArgument("The num_layers of in RNN layer must"
                                          " be the same as first dim of init"
                                          "hidden, but received num_layers:%d,"
                                          " dim:%d",
                                          num_layers,
                                          init_h->dims()[0]));
    PADDLE_ENFORCE_EQ(
        init_c->dims()[0],
        num_layers * direction_num,
        platform::errors::InvalidArgument(
            "The num_layers of in RNN layer must"
            " be the same as first dim of cell state hidden, but received"
            " num_layers:%d, dim:%d",
            num_layers,
            init_c->dims()[0]));

    std::vector<std::vector<std::pair<T*, size_t>>> parameter_lists;
    parameter_lists.resize(num_layers);
    reset_parameter_vector(
        weight_list, num_layers, is_bidirec, &parameter_lists);

    for (unsigned int i = 0; i < weight_grad_list.size(); ++i) {
      weight_grad_list[i]->mutable_data<T>(ctx.GetPlace());
    }
    std::vector<std::vector<std::pair<T*, size_t>>> parameter_lists_grad;
    parameter_lists_grad.resize(num_layers);
    reset_parameter_vector(
        weight_grad_list, num_layers, is_bidirec, &parameter_lists_grad);

    // allocate the memory and initization the input_grad
    input_grad->mutable_data<T>(input->dims(), ctx.GetPlace());
    FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0), input_grad);

    phi::DenseTensor a, b;
    phi::DenseTensor* dynamic_grad_pre_h = &a;
    phi::DenseTensor* dynamic_grad_pre_c = &b;
    if (init_h_grad) {
      init_h_grad->mutable_data<T>(last_h_grad->dims(), ctx.GetPlace());
      FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0), init_h_grad);
    } else {
      dynamic_grad_pre_h->Resize(last_h_grad->dims());
      dynamic_grad_pre_h->mutable_data<T>(ctx.GetPlace());
      FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0), dynamic_grad_pre_h);
      init_h_grad = dynamic_grad_pre_h;
    }
    if (init_c_grad) {
      init_c_grad->mutable_data<T>(last_c_grad->dims(), ctx.GetPlace());
    } else {
      dynamic_grad_pre_c->Resize(last_h_grad->dims());
      dynamic_grad_pre_c->mutable_data<T>(ctx.GetPlace());
      init_c_grad = dynamic_grad_pre_c;
    }

    std::vector<int> seq_len_vec(batch_size, seq_len);
    if (has_seq_length) {
      seq_len_vec = phi::GetVectorFromTensor(sequence_length);
    }
    cnnlDirectionMode_t direction =
        is_bidirec ? CNNL_RNN_BIDIRECTIONAL : CNNL_RNN_UNIDIRECTIONAL;

    MLUSeqDataDesc input_seq_data_desc(CNNL_SEQDATA_TNC,
                                       ToCnnlDataType(input->dtype()),
                                       in_out_dim_num,
                                       in_dim_arr,
                                       static_cast<int>(seq_len_vec.size()),
                                       seq_len_vec.data(),
                                       nullptr);
    MLUSeqDataDesc out_seq_data_desc(CNNL_SEQDATA_TNC,
                                     ToCnnlDataType(input->dtype()),
                                     in_out_dim_num,
                                     out_dim_arr,
                                     static_cast<int>(seq_len_vec.size()),
                                     seq_len_vec.data(),
                                     nullptr);
    MLUCnnlTensorDesc hx_desc(*init_h);
    MLUCnnlTensorDesc cx_desc(*init_c);
    MLURNNDesc rnn_desc(CNNL_LSTM,
                        CNNL_RNN_DOUBLE_BIAS,
                        direction,
                        CNNL_RNN_LINEAR_INPUT,
                        ToCnnlDataType(input->dtype()),
                        ToCnnlDataType(input->dtype()),
                        input_dim,
                        hidden_size,
                        /*projection*/ proj_size,
                        num_layers,
                        nullptr,
                        CNNL_RNN_PADDED_IO_DISABLED);
    rnn_desc.SetRNNMaskMode(CNNL_LSTM_MASK_ENABLED);

    // copy weight
    size_t weightspace_size;
    phi::DenseTensor weightspace, dweightspace;
    PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNWeightSpaceSize(
        GetHandleFromCTX(ctx), rnn_desc.get(), &weightspace_size));

    weightspace = ctx.AllocateTmpTensor<T, DeviceContext>(
        {static_cast<int64_t>(weightspace_size)}, dev_ctx);
    dweightspace = ctx.AllocateTmpTensor<T, DeviceContext>(
        {static_cast<int64_t>(weightspace_size)}, dev_ctx);
    void* weightspace_ptr = weightspace.mutable_data(ctx.GetPlace());
    auto w_x = parameter_lists[0][0];
    auto w_h = parameter_lists[0][1];
    auto b_x = parameter_lists[0][2];
    auto b_h = parameter_lists[0][3];
    auto actual_total_w_size =
        w_x.second + w_h.second + b_x.second + b_h.second;

    void* w_x_ptr = weightspace_ptr;
    void* w_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second;
    void* b_x_ptr =
        static_cast<char*>(weightspace_ptr) + w_x.second + w_h.second;
    void* b_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second +
                    w_h.second + b_x.second;

    memory::Copy(weightspace.place(), w_x_ptr, weightspace.place(), w_x.first,
                 w_x.second, stream);
    memory::Copy(weightspace.place(), w_h_ptr, weightspace.place(), w_h.first,
                 w_h.second, stream);
    memory::Copy(weightspace.place(), b_x_ptr, weightspace.place(), b_x.first,
                 b_x.second, stream);
    memory::Copy(weightspace.place(), b_h_ptr, weightspace.place(), b_h.first,
                 b_h.second, stream);

    if (is_bidirec) {
      auto bw_x = parameter_lists[0][4];
      auto bw_h = parameter_lists[0][5];
      auto bb_x = parameter_lists[0][6];
      auto bb_h = parameter_lists[0][7];
      void* bw_x_ptr =
          static_cast<char*>(weightspace_ptr) + actual_total_w_size;
      void* bw_h_ptr = static_cast<char*>(weightspace_ptr) +
                       actual_total_w_size + bw_x.second;
      void* bb_x_ptr = static_cast<char*>(weightspace_ptr) +
                       actual_total_w_size + bw_x.second + bw_h.second;
      void* bb_h_ptr = static_cast<char*>(weightspace_ptr) +
                       actual_total_w_size + bw_x.second + bw_h.second +
                       bb_x.second;
      actual_total_w_size +=
          bw_x.second + bw_h.second + bb_x.second + bb_h.second;

      memory::Copy(weightspace.place(), bw_x_ptr, weightspace.place(),
                   bw_x.first, bw_x.second, stream);
      memory::Copy(weightspace.place(), bw_h_ptr, weightspace.place(),
                   bw_h.first, bw_h.second, stream);
      memory::Copy(weightspace.place(), bb_x_ptr, weightspace.place(),
                   bb_x.first, bb_x.second, stream);
      memory::Copy(weightspace.place(), bb_h_ptr, weightspace.place(),
                   bb_h.first, bb_h.second, stream);
    }
    dev_ctx.Wait();

    PADDLE_ENFORCE_EQ(weightspace_size,
                      actual_total_w_size,
                      platform::errors::InvalidArgument(
                          "The weightsize doesn't match"
                          " weightspace_size:%d, actual_total_w_size:%d",
                          weightspace_size,
                          actual_total_w_size));

    MLUCnnl::RNNBackward(ctx,
                         rnn_desc.get(),
                         CNNL_WGRAD_MODE_SET,
                         seq_len_vec.data(),
                         GetBasePtr(&weightspace
),
GetBasePtr
(
&
dweightspace
),
weightspace
.
numel
()
*
sizeof
(
T
),
input_seq_data_desc
.
get
(),
GetBasePtr
(
input
),
GetBasePtr
(
input_grad
),
out_seq_data_desc
.
get
(),
GetBasePtr
(
output
),
GetBasePtr
(
output_grad
),
hx_desc
.
get
(),
GetBasePtr
(
init_h
),
GetBasePtr
(
last_h_grad
),
GetBasePtr
(
init_h_grad
),
cx_desc
.
get
(),
GetBasePtr
(
init_c
),
GetBasePtr
(
last_c_grad
),
GetBasePtr
(
init_c_grad
),
const_cast
<
void
*>
(
GetBasePtr
(
reserve_data
)),
reserve_data
->
numel
()
*
sizeof
(
T
));
void
*
dweightspace_ptr
=
dweightspace
.
mutable_data
(
ctx
.
GetPlace
());
auto
dw_x
=
parameter_lists_grad
[
0
][
0
];
auto
dw_h
=
parameter_lists_grad
[
0
][
1
];
auto
db_x
=
parameter_lists_grad
[
0
][
2
];
auto
db_h
=
parameter_lists_grad
[
0
][
3
];
auto
dactual_total_w_size
=
dw_x
.
second
+
dw_h
.
second
+
db_x
.
second
+
db_h
.
second
;
void
*
dw_x_ptr
=
dweightspace_ptr
;
void
*
dw_h_ptr
=
static_cast
<
char
*>
(
dweightspace_ptr
)
+
dw_x
.
second
;
void
*
db_x_ptr
=
static_cast
<
char
*>
(
dweightspace_ptr
)
+
dw_x
.
second
+
dw_h
.
second
;
void
*
db_h_ptr
=
static_cast
<
char
*>
(
dweightspace_ptr
)
+
dw_x
.
second
+
dw_h
.
second
+
db_x
.
second
;
memory
::
Copy
(
weightspace
.
place
(),
dw_x
.
first
,
weightspace
.
place
(),
dw_x_ptr
,
dw_x
.
second
,
stream
);
memory
::
Copy
(
weightspace
.
place
(),
dw_h
.
first
,
weightspace
.
place
(),
dw_h_ptr
,
dw_h
.
second
,
stream
);
memory
::
Copy
(
weightspace
.
place
(),
db_x
.
first
,
weightspace
.
place
(),
db_x_ptr
,
db_x
.
second
,
stream
);
memory
::
Copy
(
weightspace
.
place
(),
db_h
.
first
,
weightspace
.
place
(),
db_h_ptr
,
db_h
.
second
,
stream
);
if
(
is_bidirec
)
{
auto
dbw_x
=
parameter_lists_grad
[
0
][
4
];
auto
dbw_h
=
parameter_lists_grad
[
0
][
5
];
auto
dbb_x
=
parameter_lists_grad
[
0
][
6
];
auto
dbb_h
=
parameter_lists_grad
[
0
][
7
];
void
*
dbw_x_ptr
=
static_cast
<
char
*>
(
dweightspace_ptr
)
+
dactual_total_w_size
;
void
*
dbw_h_ptr
=
static_cast
<
char
*>
(
dweightspace_ptr
)
+
dactual_total_w_size
+
dbw_x
.
second
;
void
*
dbb_x_ptr
=
static_cast
<
char
*>
(
dweightspace_ptr
)
+
dactual_total_w_size
+
dbw_x
.
second
+
dbw_h
.
second
;
void
*
dbb_h_ptr
=
static_cast
<
char
*>
(
dweightspace_ptr
)
+
dactual_total_w_size
+
dbw_x
.
second
+
dbw_h
.
second
+
dbb_x
.
second
;
dactual_total_w_size
+=
dbw_x
.
second
+
dbw_h
.
second
+
dbb_x
.
second
+
dbb_h
.
second
;
memory
::
Copy
(
weightspace
.
place
(),
dbw_x
.
first
,
weightspace
.
place
(),
dbw_x_ptr
,
dbw_x
.
second
,
stream
);
memory
::
Copy
(
weightspace
.
place
(),
dbw_h
.
first
,
weightspace
.
place
(),
dbw_h_ptr
,
dbw_h
.
second
,
stream
);
memory
::
Copy
(
weightspace
.
place
(),
dbb_x
.
first
,
weightspace
.
place
(),
dbb_x_ptr
,
dbb_x
.
second
,
stream
);
memory
::
Copy
(
weightspace
.
place
(),
dbb_h
.
first
,
weightspace
.
place
(),
dbb_h_ptr
,
dbb_h
.
second
,
stream
);
}
dev_ctx
.
Wait
();
PADDLE_ENFORCE_EQ
(
weightspace_size
,
dactual_total_w_size
,
platform
::
errors
::
InvalidArgument
(
"The weightsize doesn't match"
" weightspace_size:%d, dactual_total_w_size:%d"
,
weightspace_size
,
dactual_total_w_size
));
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_MLU_KERNEL
(
rnn
,
ops
::
RNNMLUKernel
<
paddle
::
platform
::
MLUDeviceContext
,
float
>
);
REGISTER_OP_MLU_KERNEL
(
rnn_grad
,
ops
::
RNNMLUGradKernel
<
paddle
::
platform
::
MLUDeviceContext
,
float
>
);
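Note: the grad kernel above packs w_x, w_h, b_x and b_h (plus the reverse-direction copies when is_bidirec is true) into one contiguous cnnl weight space by advancing a byte offset by each segment's size, and later unpacks the gradients with the same offsets. A minimal standalone sketch of that offset arithmetic, using hypothetical segment sizes instead of the real parameter_lists, is:

#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Hypothetical byte sizes; the kernel reads them from parameter_lists[0][k].second.
  std::vector<std::pair<const char*, size_t>> segments = {
      {"w_x", 1024}, {"w_h", 2048}, {"b_x", 64}, {"b_h", 64}};
  size_t offset = 0;  // running byte offset into the packed weight space
  for (const auto& seg : segments) {
    std::printf("%s starts at byte %zu, length %zu\n", seg.first, offset, seg.second);
    offset += seg.second;  // the next segment follows immediately
  }
  std::printf("total packed size = %zu bytes\n", offset);  // must equal weightspace_size
  return 0;
}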
paddle/fluid/operators/roi_align_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class ROIAlignOpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<phi::DenseTensor>("X");
    auto* rois = ctx.Input<phi::DenseTensor>("ROIs");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
    out->set_layout(phi::DataLayout::kNHWC);

    auto pooled_height = ctx.Attr<int>("pooled_height");
    auto pooled_width = ctx.Attr<int>("pooled_width");
    auto spatial_scale = ctx.Attr<float>("spatial_scale");
    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
    auto aligned = ctx.Attr<bool>("aligned");

    const auto& in_dims = in->dims();
    int batch_size = in_dims[0];
    int rois_num = rois->dims()[0];

    if (rois_num == 0) return;

    auto cplace = platform::CPUPlace();
    std::vector<int> roi_batch_id_list(rois_num);
    int rois_batch_size = 0;
    if (ctx.HasInput("RoisNum")) {
      auto* rois_num_t = ctx.Input<phi::DenseTensor>("RoisNum");
      rois_batch_size = rois_num_t->numel();
      PADDLE_ENFORCE_EQ(
          rois_batch_size, batch_size,
          platform::errors::InvalidArgument(
              "The batch size of rois and the batch size of images "
              " must be the same. But received the batch size of rois is %d, "
              "and the batch size of images is %d",
              rois_batch_size, batch_size));
      std::vector<int> rois_num_list(rois_batch_size);
      memory::Copy(cplace, rois_num_list.data(), ctx.GetPlace(),
                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size,
                   nullptr /*stream*/);
      int last_idx = 0;
      for (int i = 0; i < rois_batch_size; i++) {
        int end_idx = last_idx + rois_num_list[i];
        for (int j = last_idx; j < end_idx; j++) {
          roi_batch_id_list[j] = i;
        }
        last_idx = end_idx;
      }
    } else {
      auto lod = rois->lod();
      PADDLE_ENFORCE_EQ(lod.empty(), false,
                        platform::errors::InvalidArgument(
                            "Input(ROIs) phi::DenseTensor of ROIAlignOp "
                            "does not contain LoD information."));
      auto rois_lod = lod.back();
      rois_batch_size = rois_lod.size() - 1;
      PADDLE_ENFORCE_EQ(
          rois_batch_size, batch_size,
          platform::errors::InvalidArgument(
              "The rois_batch_size and imgs "
              "batch_size must be the same. But received "
              "rois_batch_size = %d, "
              "batch_size = %d",
              rois_batch_size, batch_size));
      int rois_num_with_lod = rois_lod[rois_batch_size];
      PADDLE_ENFORCE_EQ(
          rois_num, rois_num_with_lod,
          platform::errors::InvalidArgument(
              "The actual number of rois and the number of rois "
              "provided from Input(RoIsLoD) in RoIAlign must be the same."
              " But received actual number of rois is %d, and the number "
              "of rois from RoIsLoD is %d",
              rois_num, rois_num_with_lod));
      for (int i = 0; i < rois_batch_size; i++) {
        int start_idx = rois_lod[i];
        int end_idx = rois_lod[i + 1];
        for (int j = start_idx; j < end_idx; j++) {
          roi_batch_id_list[j] = i;
        }
      }
    }

    // only support float32 for now
    phi::DenseTensor rois_cpu(framework::TransToPhiDataType(VT::FP32));
    rois_cpu.Resize({rois_num, 4});
    rois_cpu.mutable_data<T>(ctx.GetPlace());
    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
    framework::TensorCopy(*rois, cplace, dev_ctx, &rois_cpu);
    dev_ctx.Wait();
    T* rois_cpu_ptr = rois_cpu.mutable_data<T>(platform::CPUPlace());

    // boxes; [batch_idx, x1, y1, x2, y2]
    phi::DenseTensor boxes_cpu(framework::TransToPhiDataType(VT::FP32));
    phi::DenseTensor boxes_mlu(framework::TransToPhiDataType(VT::FP32));
    boxes_cpu.Resize({rois_num, 5});
    boxes_mlu.Resize({rois_num, 5});
    T* boxes_cpu_ptr = boxes_cpu.mutable_data<T>(platform::CPUPlace());
    boxes_mlu.mutable_data<T>(ctx.GetPlace());
    for (int i = 0; i < rois_num; ++i) {
      boxes_cpu_ptr[i * 5 + 0] = static_cast<T>(roi_batch_id_list[i]);
      boxes_cpu_ptr[i * 5 + 1] = rois_cpu_ptr[i * 4 + 0];
      boxes_cpu_ptr[i * 5 + 2] = rois_cpu_ptr[i * 4 + 1];
      boxes_cpu_ptr[i * 5 + 3] = rois_cpu_ptr[i * 4 + 2];
      boxes_cpu_ptr[i * 5 + 4] = rois_cpu_ptr[i * 4 + 3];
    }

    // copy boxes_cpu to boxes_mlu
    framework::TensorCopy(boxes_cpu, ctx.GetPlace(), dev_ctx, &boxes_mlu);
    dev_ctx.Wait();

    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
    const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
    phi::DenseTensor input_nhwc(in->type());
    phi::DenseTensor output_nhwc(out->type());
    TransposeFromMLUTensor<T>(ctx, perm_to_nhwc, in, &input_nhwc,
                              true /*need_reshape_or_alloc*/);
    auto output_dims = out->dims();
    output_nhwc.mutable_data<T>(
        {output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
        ctx.GetPlace());

    MLUCnnlTensorDesc input_desc(input_nhwc, CNNL_LAYOUT_NHWC,
                                 ToCnnlDataType(input_nhwc.dtype()));
    MLUCnnlTensorDesc boxes_desc(boxes_mlu);
    MLUCnnlTensorDesc out_desc(output_nhwc, CNNL_LAYOUT_NHWC,
                               ToCnnlDataType(output_nhwc.dtype()));
    MLUCnnl::RoiAlign(ctx, pooled_height, pooled_width, sampling_ratio,
                      spatial_scale, aligned,
                      input_desc.get(), GetBasePtr(&input_nhwc),
                      boxes_desc.get(), GetBasePtr(&boxes_mlu),
                      out_desc.get(), GetBasePtr(&output_nhwc));
    TransposeFromMLUTensor<T>(ctx, perm_to_nchw, &output_nhwc, out,
                              false /*need_reshape_or_alloc*/);
  }
};

template <typename T>
class ROIAlignGradOpMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* rois = ctx.Input<phi::DenseTensor>("ROIs");
    auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* in_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));

    auto spatial_scale = ctx.Attr<T>("spatial_scale");
    auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
    auto aligned = ctx.Attr<bool>("aligned");
    int rois_num = rois->dims()[0];

    if (!in_grad) {
      return;
    }
    in_grad->mutable_data<T>(ctx.GetPlace());

    std::vector<int> roi_batch_id_list(rois_num);
    auto cplace = platform::CPUPlace();
    int rois_batch_size = 0;
    if (ctx.HasInput("RoisNum")) {
      auto* rois_num_t = ctx.Input<phi::DenseTensor>("RoisNum");
      rois_batch_size = rois_num_t->numel();
      std::vector<int> rois_num_list(rois_batch_size);
      memory::Copy(cplace, rois_num_list.data(), ctx.GetPlace(),
                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size,
                   nullptr /*stream*/);
      int last_idx = 0;
      for (int i = 0; i < rois_batch_size; i++) {
        int end_idx = last_idx + rois_num_list[i];
        for (int j = last_idx; j < end_idx; j++) {
          roi_batch_id_list[j] = i;
        }
        last_idx = end_idx;
      }
    } else {
      auto rois_lod = rois->lod().back();
      rois_batch_size = rois_lod.size() - 1;
      for (int i = 0; i < rois_batch_size; i++) {
        int start_idx = rois_lod[i];
        int end_idx = rois_lod[i + 1];
        for (int j = start_idx; j < end_idx; j++) {
          roi_batch_id_list[j] = i;
        }
      }
    }

    phi::DenseTensor rois_cpu(framework::TransToPhiDataType(VT::FP32));
    rois_cpu.Resize({rois_num, 4});
    rois_cpu.mutable_data<T>(ctx.GetPlace());
    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
    framework::TensorCopy(*rois, cplace, dev_ctx, &rois_cpu);
    dev_ctx.Wait();
    T* rois_cpu_ptr = rois_cpu.mutable_data<T>(platform::CPUPlace());

    // boxes; [batch_idx, x1, y1, x2, y2]
    phi::DenseTensor boxes_cpu(framework::TransToPhiDataType(VT::FP32));
    phi::DenseTensor boxes_mlu(framework::TransToPhiDataType(VT::FP32));
    boxes_cpu.Resize({rois_num, 5});
    boxes_mlu.Resize({rois_num, 5});
    T* boxes_cpu_ptr = boxes_cpu.mutable_data<T>(platform::CPUPlace());
    boxes_mlu.mutable_data<T>(ctx.GetPlace());
    for (int i = 0; i < rois_num; ++i) {
      boxes_cpu_ptr[i * 5 + 0] = static_cast<T>(roi_batch_id_list[i]);
      boxes_cpu_ptr[i * 5 + 1] = rois_cpu_ptr[i * 4 + 0];
      boxes_cpu_ptr[i * 5 + 2] = rois_cpu_ptr[i * 4 + 1];
      boxes_cpu_ptr[i * 5 + 3] = rois_cpu_ptr[i * 4 + 2];
      boxes_cpu_ptr[i * 5 + 4] = rois_cpu_ptr[i * 4 + 3];
    }

    // copy boxes_cpu to boxes_mlu
    framework::TensorCopy(boxes_cpu, ctx.GetPlace(), dev_ctx, &boxes_mlu);
    dev_ctx.Wait();

    const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
    const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
    phi::DenseTensor grads_nhwc(out_grad->type());
    phi::DenseTensor grads_image_nhwc(in_grad->type());
    TransposeFromMLUTensor<T>(ctx, perm_to_nhwc, out_grad, &grads_nhwc,
                              true /*need_reshape_or_alloc*/);
    auto grads_image_dims = in_grad->dims();
    grads_image_nhwc.mutable_data<T>(
        {grads_image_dims[0], grads_image_dims[2], grads_image_dims[3],
         grads_image_dims[1]},
        ctx.GetPlace());

    MLUCnnlTensorDesc grads_desc(grads_nhwc, CNNL_LAYOUT_NHWC,
                                 ToCnnlDataType(grads_nhwc.dtype()));
    MLUCnnlTensorDesc boxes_desc(boxes_mlu);
    MLUCnnlTensorDesc grads_image_desc(grads_image_nhwc, CNNL_LAYOUT_NHWC,
                                       ToCnnlDataType(grads_image_nhwc.dtype()));
    MLUCnnl::RoiAlignBackward(ctx, sampling_ratio, spatial_scale, aligned,
                              grads_desc.get(), GetBasePtr(&grads_nhwc),
                              boxes_desc.get(), GetBasePtr(&boxes_mlu),
                              grads_image_desc.get(),
                              GetBasePtr(&grads_image_nhwc));
    TransposeFromMLUTensor<T>(ctx, perm_to_nchw, &grads_image_nhwc, in_grad,
                              false /*need_reshape_or_alloc*/);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(roi_align, ops::ROIAlignOpMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(roi_align_grad, ops::ROIAlignGradOpMLUKernel<float>);
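Note: both kernels above translate Paddle's 4-column ROIs plus a per-ROI batch id into the 5-column boxes layout [batch_idx, x1, y1, x2, y2] that the cnnl RoiAlign call consumes. A minimal host-side sketch of that conversion, with made-up coordinate values:

#include <cstdio>
#include <vector>

int main() {
  // Two ROIs in [x1, y1, x2, y2] form and their image (batch) indices; values are illustrative.
  std::vector<float> rois = {10.f, 12.f, 40.f, 44.f, 5.f, 6.f, 20.f, 22.f};
  std::vector<int> roi_batch_id_list = {0, 1};
  const int rois_num = 2;

  // Target layout: [batch_idx, x1, y1, x2, y2] per row.
  std::vector<float> boxes(rois_num * 5);
  for (int i = 0; i < rois_num; ++i) {
    boxes[i * 5 + 0] = static_cast<float>(roi_batch_id_list[i]);
    for (int k = 0; k < 4; ++k) boxes[i * 5 + 1 + k] = rois[i * 4 + k];
  }
  for (int i = 0; i < rois_num; ++i)
    std::printf("box %d: [%g, %g, %g, %g, %g]\n", i, boxes[i * 5], boxes[i * 5 + 1],
                boxes[i * 5 + 2], boxes[i * 5 + 3], boxes[i * 5 + 4]);
  return 0;
}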
paddle/fluid/operators/scale_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class ScaleMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto& dev_ctx = GetDevCtxFromCTX(ctx);
    auto* in_var = ctx.InputVar("X");
    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);

    // cnnl requires input, scale and bias to have the same type, all on the device side.
    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
    phi::DenseTensor scale_tensor;
    if (ctx.HasInput("ScaleTensor")) {
      phi::DenseTensor float_scale_tensor =
          *ctx.Input<phi::DenseTensor>("ScaleTensor");
      if (framework::TransToProtoVarType(float_scale_tensor.dtype()) !=
          framework::TransToProtoVarType(in->dtype())) {
        scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
        MLUCnnlTensorDesc float_scale_desc(float_scale_tensor);
        MLUCnnlTensorDesc final_scale_desc(scale_tensor);
        cnnlCastDataType_t cast_type = GetCastDataType(
            framework::TransToProtoVarType(float_scale_tensor.dtype()),
            framework::TransToProtoVarType(scale_tensor.dtype()));
        MLUCnnl::Cast(ctx, cast_type, float_scale_desc.get(),
                      GetBasePtr(&float_scale_tensor), final_scale_desc.get(),
                      GetBasePtr(&scale_tensor));
      } else {
        scale_tensor = float_scale_tensor;
      }
    } else {
      scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
      MLUCnnlTensorDesc scale_desc(scale_tensor);
      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &scale, scale_desc.get(),
                    GetBasePtr(&scale_tensor));
    }

    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
    phi::DenseTensor bias_tensor =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    MLUCnnlTensorDesc bias_desc(bias_tensor);
    MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &bias, bias_desc.get(),
                  GetBasePtr(&bias_tensor));

    auto* out_var = ctx.OutputVar("Out");
    if (in_var->IsType<phi::SelectedRows>() && in_var != out_var) {
      auto& in_slr = in_var->Get<phi::SelectedRows>();
      auto* out_slr = out_var->GetMutable<phi::SelectedRows>();
      out_slr->set_rows(in_slr.rows());
      out_slr->set_height(in_slr.height());
    }
    auto* out =
        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
    out->mutable_data<T>(in->place());

    MLUCnnlTensorDesc input_desc(*in);
    MLUCnnlTensorDesc scale_desc(scale_tensor);
    MLUCnnlTensorDesc output_desc(*out);

    const int axis = std::max(in->dims().size() - 1, 0);
    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
    if (bias_after_scale) {
      MLUCnnl::Scale(ctx, axis, input_desc.get(), GetBasePtr(in),
                     scale_desc.get(), GetBasePtr(&scale_tensor),
                     bias_desc.get(), GetBasePtr(&bias_tensor),
                     output_desc.get(), GetBasePtr(out));
    } else {
      phi::DenseTensor new_bias_tensor =
          ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
      MLUCnnlTensorDesc new_bias_desc(new_bias_tensor);
      MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL,
                                      ToCnnlDataType(in->dtype()),
                                      CNNL_NOT_PROPAGATE_NAN);
      MLUCnnl::OpTensor(ctx, mul_op_desc.get(),
                        scale_desc.get(), GetBasePtr(&scale_tensor),
                        bias_desc.get(), GetBasePtr(&bias_tensor),
                        new_bias_desc.get(), GetBasePtr(&new_bias_tensor),
                        ToCnnlDataType(in->dtype()));
      MLUCnnl::Scale(ctx, axis, input_desc.get(), GetBasePtr(in),
                     scale_desc.get(), GetBasePtr(&scale_tensor),
                     new_bias_desc.get(), GetBasePtr(&new_bias_tensor),
                     output_desc.get(), GetBasePtr(out));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(scale, ops::ScaleMLUKernel<float>,
                       ops::ScaleMLUKernel<paddle::platform::float16>);
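Note: the two branches above correspond to the two scale formulas. With bias_after_scale=true the result is scale * x + bias; otherwise the kernel folds the bias in first by pre-multiplying it (scale * (x + bias) = scale * x + scale * bias), which is what the OpTensor(MUL) step computes. A minimal scalar sketch of that equivalence:

#include <cstdio>

// Scalar illustration of the two modes handled by ScaleMLUKernel above.
float scale_op(float x, float scale, float bias, bool bias_after_scale) {
  if (bias_after_scale) {
    return scale * x + bias;       // direct Scale(..., bias_tensor, ...)
  }
  float new_bias = scale * bias;   // the OpTensor(MUL) on scale and bias
  return scale * x + new_bias;     // equals scale * (x + bias)
}

int main() {
  // prints 7 and 9 for x=2, scale=3, bias=1
  std::printf("%g %g\n", scale_op(2.f, 3.f, 1.f, true), scale_op(2.f, 3.f, 1.f, false));
  return 0;
}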
paddle/fluid/operators/scatter_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class ScatterMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* indices = ctx.Input<phi::DenseTensor>("Ids");
    auto* updates = ctx.Input<phi::DenseTensor>("Updates");
    bool overwrite = ctx.Attr<bool>("overwrite");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc indices_desc(*indices);
    MLUCnnlTensorDesc updates_desc(*updates);
    MLUCnnlTensorDesc out_desc(*out);

    cnnlScatterRefMode_t mode;
    if (overwrite) {
      mode = CNNL_SCATTERREF_UPDATE;
      MLUCnnl::ScatterRefFunctor(ctx, x_desc.get(), GetBasePtr(x),
                                 updates_desc.get(), GetBasePtr(updates),
                                 indices_desc.get(), GetBasePtr(indices), mode);
    } else {
      phi::DenseTensor tensor_zeros(updates->type());
      tensor_zeros.mutable_data<T>(updates->dims(), ctx.GetPlace());
      MLUCnnlTensorDesc tensor_zeros_desc(tensor_zeros);
      float value = 0.0;
      auto value_t = static_cast<T>(value);
      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t,
                    tensor_zeros_desc.get(), GetBasePtr(&tensor_zeros));
      mode = CNNL_SCATTERREF_UPDATE;
      MLUCnnl::ScatterRefFunctor(ctx, x_desc.get(), GetBasePtr(x),
                                 tensor_zeros_desc.get(), GetBasePtr(&tensor_zeros),
                                 indices_desc.get(), GetBasePtr(indices), mode);
      mode = CNNL_SCATTERREF_ADD;
      MLUCnnl::ScatterRefFunctor(ctx, x_desc.get(), GetBasePtr(x),
                                 updates_desc.get(), GetBasePtr(updates),
                                 indices_desc.get(), GetBasePtr(indices), mode);
    }
    paddle::framework::TensorCopy(*x, place, out);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(scatter, ops::ScatterMLUKernel<float>,
                       ops::ScatterMLUKernel<paddle::platform::float16>);
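Note: for overwrite=false the kernel above first scatters zeros into the addressed rows and then scatter-adds the updates, so the updates accumulate on top of a cleared row instead of keeping the original values. A CPU sketch of the same two-pass idea on a 1-D array:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 1.f, 1.f, 1.f};
  std::vector<int> ids = {2, 2};            // duplicate index on purpose
  std::vector<float> updates = {5.f, 7.f};

  // Pass 1: overwrite the addressed rows with zero (CNNL_SCATTERREF_UPDATE with zeros).
  for (int id : ids) x[id] = 0.f;
  // Pass 2: accumulate the updates (CNNL_SCATTERREF_ADD).
  for (size_t i = 0; i < ids.size(); ++i) x[ids[i]] += updates[i];

  for (float v : x) std::printf("%g ", v);  // prints: 1 1 12 1
  std::printf("\n");
  return 0;
}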
paddle/fluid/operators/set_value_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <numeric>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/set_value_op.h"
namespace paddle {
namespace operators {

using MLUDeviceContext = platform::MLUDeviceContext;

template <typename T>
class SetValueMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
    auto* in = ctx.Input<phi::DenseTensor>("Input");
    auto* value_tensor = ctx.Input<phi::DenseTensor>("ValueTensor");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    auto starts_tensor_list = ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
    auto ends_tensor_list = ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
    auto steps_tensor_list = ctx.MultiInput<phi::DenseTensor>("StepsTensorList");

    auto axes = ctx.Attr<std::vector<int64_t>>("axes");
    auto starts = ctx.Attr<std::vector<int64_t>>("starts");
    auto ends = ctx.Attr<std::vector<int64_t>>("ends");
    auto steps = ctx.Attr<std::vector<int64_t>>("steps");
    auto shape = ctx.Attr<std::vector<int64_t>>("shape");
    auto decrease_axes = ctx.Attr<std::vector<int64_t>>("decrease_axes");
    auto none_axes = ctx.Attr<std::vector<int64_t>>("none_axes");

    if (!starts_tensor_list.empty()) {
      starts = GetDataFromTensorList<int64_t>(starts_tensor_list);
    }
    if (!ends_tensor_list.empty()) {
      ends = GetDataFromTensorList<int64_t>(ends_tensor_list);
    }
    if (!steps_tensor_list.empty()) {
      steps = GetDataFromTensorList<int64_t>(steps_tensor_list);
    }

    auto in_dims = in->dims();
    phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps);
    auto slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, &steps);
    auto decrease_slice_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axes);

    auto slice_dims_for_assign = decrease_slice_dims;
    if (!none_axes.empty()) {
      std::vector<int64_t> slice_dims_with_none;
      size_t none_axes_cur = 0, decrease_axes_cur = 0;
      for (int i = 0; i < slice_dims.size(); ++i) {
        while (none_axes_cur < none_axes.size() && none_axes[none_axes_cur] <= i) {
          slice_dims_with_none.push_back(1);
          none_axes_cur++;
        }
        if (decrease_axes_cur < decrease_axes.size() &&
            decrease_axes[decrease_axes_cur] == i) {
          decrease_axes_cur++;
        } else {
          slice_dims_with_none.push_back(slice_dims[i]);
        }
      }
      while (none_axes_cur < none_axes.size()) {
        slice_dims_with_none.push_back(1);
        none_axes_cur++;
      }

      slice_dims_for_assign = phi::make_ddim(slice_dims_with_none);
    }
    int in_size = in_dims.size();
    int starts_indices[in_size] = {0};
    int ends_indices[in_size] = {0};
    int strides_indices[in_size] = {0};

    for (int i = 0; i < in_dims.size(); ++i) {
      starts_indices[i] = 0;
      ends_indices[i] = static_cast<int>(slice_dims[i]);
      strides_indices[i] = 1;
    }
    for (size_t i = 0; i < axes.size(); i++) {
      int axis_index = axes[i];
      starts_indices[axis_index] = static_cast<int>(starts[i]);
      ends_indices[axis_index] = static_cast<int>(ends[i]);
      strides_indices[axis_index] = static_cast<int>(steps[i]);
    }

    phi::DenseTensor value_t(in->type());
    if (value_tensor != nullptr) {
      value_t.ShareDataWith(*value_tensor);
    } else {
      auto value_dims = phi::make_ddim(shape);
      CheckIsDimsMatch(slice_dims_for_assign, value_dims);
      value_t.mutable_data<T>(value_dims, ctx.GetPlace());
      auto value_name = GetValueName(framework::TransToProtoVarType(in->dtype()));
      CopyVectorToTensor<T>(value_name.c_str(), &value_t, ctx);
      value_t.Resize(value_dims);
    }

    phi::DenseTensor value_temp(in->type());
    if (slice_dims_for_assign == value_t.dims()) {
      value_temp.ShareDataWith(value_t);
    } else {
      value_temp.Resize(slice_dims_for_assign);
      value_temp.mutable_data<T>(ctx.GetPlace());
      MLUCnnlTensorDesc value_t_desc(value_t);
      MLUCnnlTensorDesc value_temp_desc(value_temp);
      MLUCnnl::BroadcastTo(ctx, value_t_desc.get(), GetBasePtr(&value_t),
                           value_temp_desc.get(), GetBasePtr(&value_temp));
    }

    int64_t input_numel = phi::product(in_dims);
    int64_t value_numel = phi::product(value_temp.dims());
    phi::DenseTensor in_temp, out_temp, val_temp, index_out;
    int64_t stride_step = phi::product(in_dims);
    std::vector<int64_t> index_indices(stride_step);
    std::iota(index_indices.begin(), index_indices.end(), 0);
    phi::DenseTensor index_temp;
    in_temp.ShareDataWith(*in);
    val_temp.ShareDataWith(value_temp);
    paddle::framework::TensorFromVector(index_indices, ctx.device_context(), &index_temp);
    index_temp.Resize(in_dims);
    auto index_dims = in_dims;
    for (int i = 0; i < in_dims.size(); ++i) {
      if (starts_indices[i] < 0 || ends_indices[i] < 0) {
        starts_indices[i] -= in_dims[i];
        ends_indices[i] -= in_dims[i];
      }
      if (strides_indices[i] > 0)
        index_dims[i] =
            static_cast<int>((ends_indices[i] - starts_indices[i] - 1) /
                             strides_indices[i]) + 1;
      else
        index_dims[i] =
            static_cast<int>((ends_indices[i] - starts_indices[i] + 1) /
                             strides_indices[i]) + 1;
    }
    auto new_in_dims = phi::make_ddim({input_numel});
    auto new_val_dims = phi::make_ddim({value_numel});
    in_temp.Resize(new_in_dims);
    val_temp.Resize(new_val_dims);
    index_out.Resize(index_dims);
    index_out.mutable_data<int64_t>(ctx.GetPlace());
    cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE;
    MLUCnnlTensorDesc x_desc(in_temp);
    MLUCnnlTensorDesc indices_desc(index_temp);
    MLUCnnlTensorDesc indices_out_desc(index_out);
    MLUCnnlTensorDesc updates_desc(val_temp);
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::StridedSlice(ctx, starts_indices, ends_indices, strides_indices,
                          indices_desc.get(), GetBasePtr(&index_temp),
                          indices_out_desc.get(), GetBasePtr(&index_out));
    PADDLE_ENFORCE_EQ(
        static_cast<int64_t>(phi::product(index_out.dims())),
        phi::product(slice_dims_for_assign),
        platform::errors::InvalidArgument(
            "OP(set_value) error index indices and value update not match "));
    phi::DenseTensor index_final;
    index_final.ShareDataWith(index_out);
    int64_t indices_numel = phi::product(index_dims);
    auto new_index_dims = phi::make_ddim({indices_numel});
    index_final.Resize(new_index_dims);
    MLUCnnlTensorDesc indices_final_desc(index_final);
    MLUCnnl::ScatterRefFunctor(ctx, x_desc.get(), GetBasePtr(&in_temp),
                               updates_desc.get(), GetBasePtr(&val_temp),
                               indices_final_desc.get(), GetBasePtr(&index_final),
                               mode);
    in_temp.Resize(in_dims);
    paddle::framework::TensorCopy(in_temp, ctx.GetPlace(), out);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(set_value, ops::SetValueMLUKernel<int>,
                       ops::SetValueMLUKernel<float>);
paddle/fluid/operators/shape_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {

using SelectedRows = phi::SelectedRows;

template <typename T>
class ShapeMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_var = ctx.InputVar("Input");
    framework::DDim in_dims;
    if (in_var->IsType<phi::SelectedRows>()) {
      in_dims = in_var->Get<phi::SelectedRows>().value().dims();
    } else {
      in_dims = in_var->Get<phi::DenseTensor>().dims();
    }
    auto* out_t = ctx.Output<phi::DenseTensor>("Out");
    out_t->Resize({in_dims.size()});
    out_t->mutable_data<int32_t>(ctx.GetPlace());

    // shape op cpu
    phi::DenseTensor shape_on_cpu(
        framework::TransToPhiDataType(framework::proto::VarType::INT32));
    shape_on_cpu.Resize({in_dims.size()});
    auto cpu_data = shape_on_cpu.mutable_data<int32_t>(platform::CPUPlace());
    for (int i = 0; i < in_dims.size(); ++i) {
      cpu_data[i] = in_dims[i];
    }

    // cpu to mlu
    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
    framework::TensorCopy(shape_on_cpu, ctx.GetPlace(), dev_ctx, out_t);
    dev_ctx.Wait();
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(shape, ops::ShapeMLUKernel<bool>,
                       ops::ShapeMLUKernel<uint8_t>,
                       ops::ShapeMLUKernel<int8_t>,
                       ops::ShapeMLUKernel<int>,
                       ops::ShapeMLUKernel<int64_t>,
                       ops::ShapeMLUKernel<paddle::platform::float16>,
                       ops::ShapeMLUKernel<float>,
                       ops::ShapeMLUKernel<double>);
#endif
paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

const int kIgnoreIndex = -100;

void CheckAttrs(const framework::ExecutionContext& ctx) {
  // cnnl does not support normalize and ignore_index
  bool normalize = ctx.Attr<bool>("normalize");
  int ignore_index = ctx.Attr<int>("ignore_index");
  PADDLE_ENFORCE_EQ(normalize, false,
                    platform::errors::InvalidArgument(
                        "attr normalize must be false, but got true"));
  PADDLE_ENFORCE_EQ(ignore_index, kIgnoreIndex,
                    platform::errors::InvalidArgument(
                        "attr ignore_index must be default %d, but got %d",
                        kIgnoreIndex, ignore_index));
}

template <typename T>
class SigmoidCrossEntropyWithLogitsMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    CheckAttrs(ctx);
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* label = ctx.Input<phi::DenseTensor>("Label");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc label_desc(*label);
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::BceWithLogits(ctx, CNNL_BCE_WITH_LOGITS_NONE,
                           x_desc.get(), GetBasePtr(x),
                           label_desc.get(), GetBasePtr(label),
                           nullptr, nullptr, nullptr, nullptr,
                           out_desc.get(), GetBasePtr(out));
  }
};

template <typename T>
class SigmoidCrossEntropyWithLogitsMLUGradKernel
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    CheckAttrs(ctx);
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* label = ctx.Input<phi::DenseTensor>("Label");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto place = ctx.GetPlace();
    dx->mutable_data<T>(place);

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc label_desc(*label);
    MLUCnnlTensorDesc dout_desc(*dout);
    MLUCnnl::BceWithLogitsBackward(ctx, CNNL_BCE_WITH_LOGITS_NONE,
                                   dout_desc.get(), GetBasePtr(dout),
                                   x_desc.get(), GetBasePtr(x),
                                   label_desc.get(), GetBasePtr(label),
                                   nullptr, nullptr, nullptr, nullptr,
                                   x_desc.get(), GetBasePtr(dx));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(
    sigmoid_cross_entropy_with_logits,
    ops::SigmoidCrossEntropyWithLogitsMLUKernel<float>,
    ops::SigmoidCrossEntropyWithLogitsMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(
    sigmoid_cross_entropy_with_logits_grad,
    ops::SigmoidCrossEntropyWithLogitsMLUGradKernel<float>,
    ops::SigmoidCrossEntropyWithLogitsMLUGradKernel<plat::float16>);
paddle/fluid/operators/size_op_mlu.cc
deleted, 100644 → 0
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class SizeMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("Input");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<int64_t>(ctx.GetPlace());
    int64_t size = x->numel();
    FillMLUTensorWithHostValue<int64_t>(ctx, size, out);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(size, ops::SizeMLUKernel<int>,
                       ops::SizeMLUKernel<int64_t>,
                       ops::SizeMLUKernel<paddle::platform::float16>,
                       ops::SizeMLUKernel<float>,
                       ops::SizeMLUKernel<double>,
                       ops::SizeMLUKernel<bool>);
paddle/fluid/operators/slice_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/slice_utils.h"
namespace paddle {
namespace operators {

template <typename T>
class SliceMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("Input");
    auto* out = ctx.Output<phi::DenseTensor>("Out");

    auto axes = ctx.Attr<std::vector<int>>("axes");
    auto starts = ctx.Attr<std::vector<int>>("starts");
    auto ends = ctx.Attr<std::vector<int>>("ends");

    auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
    auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");

    // Get the accurate attribute value of starts and ends
    auto starts_tensor_list = ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
    if (ctx.HasInput("StartsTensor")) {
      starts = phi::GetVectorFromTensor<int>(
          ctx.Input<phi::DenseTensor>("StartsTensor"));
    } else if (starts_tensor_list.size() > 0) {
      starts = GetDataFromTensorList<int>(starts_tensor_list);
    }

    auto ends_tensor_list = ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
    if (ctx.HasInput("EndsTensor")) {
      ends = phi::GetVectorFromTensor<int>(
          ctx.Input<phi::DenseTensor>("EndsTensor"));
    } else if (ends_tensor_list.size() > 0) {
      ends = GetDataFromTensorList<int>(ends_tensor_list);
    }

    PADDLE_ENFORCE_EQ(
        starts.size(), axes.size(),
        platform::errors::InvalidArgument(
            "The size of starts must be equal to the size of axes."));
    PADDLE_ENFORCE_EQ(
        ends.size(), axes.size(),
        platform::errors::InvalidArgument(
            "The size of ends must be equal to the size of axes."));

    const auto& in_dims = input->dims();
    auto slice_dims = out->dims();
    bool reset_slice_dims = false;
    if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") ||
        starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) {
      // Infer output dims
      for (size_t i = 0; i < axes.size(); ++i) {
        // when start == -1 && end == start+1
        if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) {
          auto ret =
              std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]);
          if (ret != decrease_axis.end()) {
            ends[i] = in_dims[axes[i]];
          }
        }
      }

      phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
      slice_dims = phi::funcs::GetSliceDims<int>(in_dims, axes, starts, ends,
                                                 nullptr, nullptr);
      reset_slice_dims = true;
      auto out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis);

      out->Resize(out_dims);
    }
    if (slice_dims.size() != in_dims.size() && !reset_slice_dims) {
      phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
      slice_dims = phi::funcs::GetSliceDims<int>(in_dims, axes, starts, ends,
                                                 nullptr, nullptr);
    }

    int in_dim_size = input->dims().size();
    if (static_cast<int>(axes.size()) != in_dim_size) {
      std::vector<int> tmp_starts(in_dim_size, 0);
      const auto& in_dims_vec = phi::vectorize(input->dims());
      std::vector<int> tmp_ends(in_dims_vec.begin(), in_dims_vec.end());
      for (size_t i = 0; i < axes.size(); ++i) {
        tmp_starts[axes[i]] = starts[i];
        tmp_ends[axes[i]] = ends[i];
      }
      starts.swap(tmp_starts);
      ends.swap(tmp_ends);
    }
    std::vector<int> strides(in_dim_size, 1);

    out->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc out_desc(slice_dims.size(),
                               phi::vectorize(slice_dims).data(),
                               ToCnnlDataType<T>());
    MLUCnnl::StridedSlice(ctx, starts.data(), ends.data(), strides.data(),
                          input_desc.get(), GetBasePtr(input),
                          out_desc.get(), GetBasePtr(out));
  }
};

template <typename T>
class SliceGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("Input");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dinput = ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));

    auto axes = ctx.Attr<std::vector<int>>("axes");
    auto starts = ctx.Attr<std::vector<int>>("starts");
    auto ends = ctx.Attr<std::vector<int>>("ends");

    // Get the accurate attribute value of starts and ends
    auto starts_tensor_list = ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
    if (ctx.HasInput("StartsTensor")) {
      starts = phi::GetVectorFromTensor<int>(
          ctx.Input<phi::DenseTensor>("StartsTensor"));
    } else if (starts_tensor_list.size() > 0) {
      starts = GetDataFromTensorList<int>(starts_tensor_list);
    }

    auto ends_tensor_list = ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
    if (ctx.HasInput("EndsTensor")) {
      ends = phi::GetVectorFromTensor<int>(
          ctx.Input<phi::DenseTensor>("EndsTensor"));
    } else if (ends_tensor_list.size() > 0) {
      ends = GetDataFromTensorList<int>(ends_tensor_list);
    }

    const auto& in_dims = input->dims();
    auto slice_dims = dout->dims();
    if (slice_dims.size() != in_dims.size()) {
      phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
      slice_dims = phi::funcs::GetSliceDims<int>(in_dims, axes, starts, ends,
                                                 nullptr, nullptr);
    }

    int in_dim_size = input->dims().size();
    if (static_cast<int>(axes.size()) != in_dim_size) {
      std::vector<int> tmp_starts(in_dim_size, 0);
      const auto& in_dims_vec = phi::vectorize(input->dims());
      std::vector<int> tmp_ends(in_dims_vec.begin(), in_dims_vec.end());
      for (size_t i = 0; i < axes.size(); ++i) {
        tmp_starts[axes[i]] = starts[i];
        tmp_ends[axes[i]] = ends[i];
      }
      starts.swap(tmp_starts);
      ends.swap(tmp_ends);
    }
    std::vector<int> strides(in_dim_size, 1);

    dinput->mutable_data<T>(ctx.GetPlace());
    MLUCnnlTensorDesc dout_desc(slice_dims.size(),
                                phi::vectorize(slice_dims).data(),
                                ToCnnlDataType<T>());
    MLUCnnlTensorDesc dinput_desc(*dinput);
    MLUCnnl::StridedSliceGrad(ctx, starts.data(), ends.data(), strides.data(),
                              dout_desc.get(), GetBasePtr(dout),
                              dinput_desc.get(), GetBasePtr(dinput));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(slice, ops::SliceMLUKernel<float>,
                       ops::SliceMLUKernel<int>,
                       ops::SliceMLUKernel<bool>,
                       ops::SliceMLUKernel<int64_t>,
                       ops::SliceMLUKernel<double>,
                       ops::SliceMLUKernel<paddle::platform::float16>);

REGISTER_OP_MLU_KERNEL(slice_grad, ops::SliceGradMLUKernel<float>,
                       ops::SliceGradMLUKernel<int>,
                       ops::SliceGradMLUKernel<bool>,
                       ops::SliceGradMLUKernel<int64_t>,
                       ops::SliceGradMLUKernel<paddle::platform::float16>);
paddle/fluid/operators/softmax_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
namespace paddle {
namespace operators {

template <cnnlSoftmaxAlgorithm_t softmax_algo, typename T>
class SoftmaxMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    const int rank = in->dims().size();
    const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);

    // cnnl softmax only supports 3 dims, regard any shape as [d1, d2, d3]
    const int cnnl_softmax_dims = 3;
    const int d1 = phi::funcs::SizeToAxis(axis, in->dims());
    const int d2 = in->dims()[axis];
    const int d3 = phi::funcs::SizeOutAxis(axis, in->dims());

    // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better performance, use it as much
    // as possible.
    cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
    std::vector<int> regard_in_shape{d1, 1, d2};
    if (d3 != 1) {
      mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
      regard_in_shape = {d1, d2, d3};
    }

    static const cnnlSoftmaxAlgorithm_t algo = softmax_algo;
    MLUCnnlTensorDesc in_desc(cnnl_softmax_dims, regard_in_shape.data(),
                              ToCnnlDataType<T>());
    MLUCnnl::SoftmaxForward(ctx, algo, mode, NULL,
                            in_desc.get(), GetBasePtr(in),
                            NULL, in_desc.get(), GetBasePtr(out));
  }
};

template <cnnlSoftmaxAlgorithm_t softmax_algo, typename T>
class SoftmaxGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Input<phi::DenseTensor>("Out");
    auto* dOut = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    dX->mutable_data<T>(ctx.GetPlace());

    const int rank = out->dims().size();
    const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);

    // cnnl softmax only supports 3 dims, regard any shape as [d1, d2, d3]
    const int cnnl_softmax_dims = 3;
    const int d1 = phi::funcs::SizeToAxis(axis, out->dims());
    const int d2 = out->dims()[axis];
    const int d3 = phi::funcs::SizeOutAxis(axis, out->dims());

    // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better performance, use it as much
    // as possible.
    cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
    std::vector<int> regard_out_shape{d1, 1, d2};
    if (d3 != 1) {
      mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
      regard_out_shape = {d1, d2, d3};
    }

    static const cnnlSoftmaxAlgorithm_t algo = softmax_algo;
    MLUCnnlTensorDesc out_desc(cnnl_softmax_dims, regard_out_shape.data(),
                               ToCnnlDataType<T>());
    MLUCnnl::SoftmaxBackward(ctx, algo, mode,
                             out_desc.get(), GetBasePtr(out),
                             out_desc.get(), GetBasePtr(dOut),
                             out_desc.get(), GetBasePtr(dX));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
    softmax, ops::SoftmaxMLUKernel<CNNL_SOFTMAX_ACCURATE, float>,
    ops::SoftmaxMLUKernel<CNNL_SOFTMAX_ACCURATE, plat::float16>);
REGISTER_OP_MLU_KERNEL(
    softmax_grad, ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_ACCURATE, float>,
    ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_ACCURATE, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    log_softmax, ops::SoftmaxMLUKernel<CNNL_SOFTMAX_LOG, float>,
    ops::SoftmaxMLUKernel<CNNL_SOFTMAX_LOG, plat::float16>);
REGISTER_OP_MLU_KERNEL(
    log_softmax_grad, ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_LOG, float>,
    ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_LOG, paddle::platform::float16>);
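Note: SizeToAxis/SizeOutAxis collapse an arbitrary input shape into the three dims [d1, d2, d3] that cnnl softmax accepts: d1 is the product of the dims before the softmax axis, d2 the axis itself, and d3 the product of the dims after it. A small sketch of that factorisation (the product helper below is a stand-in for the phi functions):

#include <cstdio>
#include <vector>

// Stand-in for phi::funcs::SizeToAxis / SizeOutAxis: product of dims in [from, to).
int product(const std::vector<int>& dims, int from, int to) {
  int p = 1;
  for (int i = from; i < to; ++i) p *= dims[i];
  return p;
}

int main() {
  std::vector<int> dims = {8, 4, 16, 10};  // example 4-D shape
  int axis = 1;                            // softmax over dim 1
  int d1 = product(dims, 0, axis);                              // 8
  int d2 = dims[axis];                                          // 4
  int d3 = product(dims, axis + 1, static_cast<int>(dims.size()));  // 160
  // d3 != 1, so the kernel above would pick CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION
  // and reshape to {d1, d2, d3}; with axis == 3 (d3 == 1) it would use {d1, 1, d2}.
  std::printf("d1=%d d2=%d d3=%d\n", d1, d2, d3);
  return 0;
}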
paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
namespace paddle {
namespace operators {

template <typename T>
class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* logits = ctx.Input<phi::DenseTensor>("Logits");
    auto* labels = ctx.Input<phi::DenseTensor>("Label");
    auto* softmax = ctx.Output<phi::DenseTensor>("Softmax");
    auto* loss = ctx.Output<phi::DenseTensor>("Loss");
    auto* backprop = ctx.Output<phi::DenseTensor>("Backprop");
    auto soft_label = ctx.Attr<bool>("soft_label");
    PADDLE_ENFORCE_EQ(ctx.Attr<bool>("use_softmax"), true,
                      platform::errors::InvalidArgument(
                          "use_softmax=False is not supported in "
                          "the mlu kernel of softmax_with_cross_entropy."));

    const int rank = logits->dims().size();
    const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);
    loss->mutable_data<T>(ctx.GetPlace());
    backprop->mutable_data<T>(ctx.GetPlace());
    softmax->mutable_data<T>(ctx.GetPlace());

    // cnnl softmax only supports 3 dims, regard all shapes as [d1, d2, d3]
    const int cnnl_softmax_dims = 3;
    const int d1 = phi::funcs::SizeToAxis(axis, logits->dims());
    const int d2_logits = logits->dims()[axis];
    const int d2_labels = labels->dims()[axis];
    const int d3 = phi::funcs::SizeOutAxis(axis, logits->dims());

    // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better performance; use it as much
    // as possible.
    cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
    std::vector<int> regard_logits_shape{d1, 1, d2_logits};
    std::vector<int> regard_labels_shape{d1, 1, d2_labels};
    std::vector<int> regard_loss_shape{d1, 1, 1};
    if (d3 != 1) {
      mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
      regard_logits_shape = {d1, d2_logits, d3};
      regard_labels_shape = {d1, d2_labels, d3};
      regard_loss_shape = {d1, 1, d3};
    }

    MLUCnnlTensorDesc logits_desc(cnnl_softmax_dims, regard_logits_shape.data(),
                                  ToCnnlDataType<T>());
    MLUCnnlTensorDesc labels_desc(cnnl_softmax_dims, regard_labels_shape.data(),
                                  ToCnnlDataType<T>());
    MLUCnnlTensorDesc loss_desc(cnnl_softmax_dims, regard_loss_shape.data(),
                                ToCnnlDataType<T>());

    const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE;
    MLUCnnl::SoftmaxForward(ctx, algo, mode, NULL, logits_desc.get(),
                            GetBasePtr(logits), NULL, logits_desc.get(),
                            GetBasePtr(softmax));

    if (soft_label) {
      const cnnlComputationPreference_t prefer =
          CNNL_COMPUTATION_HIGH_PRECISION;
      MLUCnnl::SoftmaxCrossEntropyWithLogits(
          ctx, mode, prefer, logits_desc.get(), GetBasePtr(logits),
          labels_desc.get(), GetBasePtr(labels), loss_desc.get(),
          GetBasePtr(loss), logits_desc.get(), GetBasePtr(backprop));
    } else {
      PADDLE_ENFORCE_EQ(d3, 1,
                        platform::errors::InvalidArgument(
                            "If soft_label=False, axis must be -1 or"
                            " can be regard as last dimention in mlu kernel."));
      phi::DenseTensor labels_int32(framework::TransToPhiDataType(VT::INT32));
      labels_int32.Resize(labels->dims());
      labels_int32.mutable_data<int32_t>(ctx.GetPlace());

      MLUCnnlTensorDesc labels_int64_desc(*labels);
      MLUCnnlTensorDesc labels_int32_desc(labels_int32);
      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
      MLUCnnl::Cast(ctx, cast_type, labels_int64_desc.get(),
                    GetBasePtr(labels), labels_int32_desc.get(),
                    GetBasePtr(&labels_int32));

      const int regard_sparse_shape[cnnl_softmax_dims - 1] = {d1, 1};
      MLUCnnlTensorDesc sparse_labels_desc(cnnl_softmax_dims - 1,
                                           regard_sparse_shape,
                                           ToCnnlDataType<int32_t>());
      MLUCnnlTensorDesc sparse_loss_desc(
          cnnl_softmax_dims - 1, regard_sparse_shape, ToCnnlDataType<T>());
      MLUCnnl::SparseSoftmaxXentWithLogits(
          ctx, mode, logits_desc.get(), GetBasePtr(logits),
          sparse_labels_desc.get(), GetBasePtr(&labels_int32),
          sparse_loss_desc.get(), GetBasePtr(loss), logits_desc.get(),
          GetBasePtr(backprop));
    }
  }
};

template <typename T>
class SoftmaxWithCrossEntropyGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* backprop = ctx.Input<phi::DenseTensor>("Backprop");
    auto* loss_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Loss"));
    auto* logits_grad =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("Logits"));
    PADDLE_ENFORCE_NOT_NULL(
        backprop,
        platform::errors::PreconditionNotMet(
            "backprop should not be null in MLU kernel of "
            "softmax_with_cross_entropy_grad."));
    logits_grad->mutable_data<T>(ctx.GetPlace());

    MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
                                    CNNL_NOT_PROPAGATE_NAN);
    MLUCnnlTensorDesc backprop_desc(*backprop);
    MLUCnnlTensorDesc loss_grad_desc(*loss_grad);
    MLUCnnlTensorDesc logits_grad_desc(*logits_grad);
    MLUCnnl::OpTensor(ctx, mul_op_desc.get(), backprop_desc.get(),
                      GetBasePtr(backprop), loss_grad_desc.get(),
                      GetBasePtr(loss_grad), logits_grad_desc.get(),
                      GetBasePtr(logits_grad), ToCnnlDataType<T>());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(
    softmax_with_cross_entropy,
    ops::SoftmaxWithCrossEntropyMLUKernel<float>,
    ops::SoftmaxWithCrossEntropyMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
    softmax_with_cross_entropy_grad,
    ops::SoftmaxWithCrossEntropyGradMLUKernel<float>,
    ops::SoftmaxWithCrossEntropyGradMLUKernel<paddle::platform::float16>);
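Note: the kernel above folds an arbitrary-rank logits tensor into the three-dimensional [d1, d2, d3] view that the cnnl softmax primitives expect, with d2 on the softmax axis. A minimal standalone sketch of that shape folding follows; the helper name is hypothetical and equivalent only in spirit to phi::funcs::SizeToAxis / SizeOutAxis, it is not part of the deleted file.

#include <cstdint>
#include <vector>

// Hypothetical helper: collapse an N-D shape into [d1, d2, d3], where d2 is
// the size of the softmax axis, d1 the product of the leading dims and d3
// the product of the trailing dims.
static void RegardAs3D(const std::vector<int64_t>& dims, int axis,
                       int64_t* d1, int64_t* d2, int64_t* d3) {
  *d1 = 1;
  for (int i = 0; i < axis; ++i) *d1 *= dims[i];
  *d2 = dims[axis];
  *d3 = 1;
  for (size_t i = axis + 1; i < dims.size(); ++i) *d3 *= dims[i];
}
// e.g. dims = {8, 16, 10, 10}, axis = 1  ->  d1 = 8, d2 = 16, d3 = 100,
// which is why d3 != 1 forces the MEDIUM_DIMENSION mode above.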
paddle/fluid/operators/split_op_mlu.cc
deleted (file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/split_op.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {

template <typename T>
class SplitMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // init parameter
    auto* in = ctx.Input<phi::DenseTensor>("X");
    auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
    int num = ctx.Attr<int>("num");
    std::vector<int> sections = ctx.Attr<std::vector<int>>("sections");
    int axis = ctx.Attr<int>("axis");
    auto in_dims = in->dims();
    auto out_size = outs.size();
    auto num_tensor = num == 0 ? out_size : num;

    bool need_resize_outs_dims = false;
    if (ctx.HasInput("AxisTensor")) {
      auto* axis_tensor = ctx.Input<phi::DenseTensor>("AxisTensor");
      axis = phi::GetVectorFromTensor(axis_tensor)[0];
      need_resize_outs_dims = true;
    }
    auto sections_tensor_list =
        ctx.MultiInput<phi::DenseTensor>("SectionsTensorList");
    if (sections_tensor_list.size() > 0) {
      sections = GetDataFromTensorList(sections_tensor_list);
      need_resize_outs_dims = true;
    }
    if (need_resize_outs_dims) {
      std::vector<framework::DDim> outs_dims =
          UpdateOutsDims(true, true, in_dims, num, sections, axis, out_size);
      for (size_t j = 0; j < outs.size(); ++j) {
        outs[j]->Resize(outs_dims[j]);
      }
    }

    // init out tensors
    std::vector<void*> vct_tensor;
    std::vector<MLUCnnlTensorDesc> output_descs;
    std::vector<cnnlTensorDescriptor_t> desc_vector;
    for (size_t i = 0; i < outs.size(); i++) {
      outs[i]->mutable_data<T>(ctx.GetPlace());
      output_descs.emplace_back(MLUCnnlTensorDesc(
          *outs[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(outs[i]->dtype())));
      desc_vector.push_back(output_descs.back().get());
      vct_tensor.push_back(GetBasePtr(outs[i]));
    }
    // init in tensors
    MLUCnnlTensorDesc input_desc(*in, CNNL_LAYOUT_ARRAY,
                                 ToCnnlDataType(in->dtype()));

    // MLU should do sth
    MLUCnnl::Split(ctx, num_tensor, axis, input_desc.get(), GetBasePtr(in),
                   desc_vector.data(), vct_tensor.data());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(split,
                       ops::SplitMLUKernel<float>,
                       ops::SplitMLUKernel<int64_t>,
                       ops::SplitMLUKernel<int>,
                       ops::SplitMLUKernel<bool>,
                       ops::SplitMLUKernel<plat::float16>);
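Note: the output shapes along 'axis' come from UpdateOutsDims in split_op.h. As a hypothetical illustration only of how a section list with one -1 entry can be resolved (not the deleted file's code, and not necessarily the exact behavior of UpdateOutsDims):

#include <vector>

// Hypothetical sketch: with num > 0 split evenly; otherwise let a single -1
// entry in 'sections' absorb whatever remains of the input dimension.
static std::vector<int> ResolveSections(int dim_size, int num,
                                        std::vector<int> sections) {
  if (num > 0) return std::vector<int>(num, dim_size / num);
  int known = 0, unknown = -1;
  for (size_t i = 0; i < sections.size(); ++i) {
    if (sections[i] == -1) unknown = static_cast<int>(i);
    else known += sections[i];
  }
  if (unknown >= 0) sections[unknown] = dim_size - known;
  return sections;
}
// e.g. dim_size = 10, num = 0, sections = {2, -1, 3}  ->  {2, 5, 3}.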
paddle/fluid/operators/squared_l2_norm_op_mlu.cc
deleted (file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class SquaredL2NormMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto& dev_ctx = context.template device_context<MLUDeviceContext>();
    auto* x = context.Input<phi::DenseTensor>("X");
    auto* out = context.Output<phi::DenseTensor>("Out");

    auto place = context.GetPlace();
    out->mutable_data<T>(place);

    MLUCnnlTensorDesc input_desc(*x);
    MLUCnnlTensorDesc out_desc(*out);

    // L2Loss
    MLUCnnl::L2Loss(context, input_desc.get(), GetBasePtr(x), GetBasePtr(out));

    // do mul
    phi::DenseTensor scale_tensor =
        context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    phi::DenseTensor bias_tensor =
        context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    MLUCnnlTensorDesc scale_desc(scale_tensor);
    MLUCnnlTensorDesc bias_desc(bias_tensor);
    FillMLUTensorWithHostValue(context, static_cast<T>(2.0f), &scale_tensor);
    FillMLUTensorWithHostValue(context, static_cast<T>(0.0f), &bias_tensor);

    MLUCnnl::Scale(context, 0, out_desc.get(), GetBasePtr(out),
                   scale_desc.get(), GetBasePtr(&scale_tensor),
                   bias_desc.get(), GetBasePtr(&bias_tensor),
                   out_desc.get(), GetBasePtr(out));
  }
};

template <typename T>
class SquaredL2NormGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto& dev_ctx = context.template device_context<MLUDeviceContext>();
    auto* x = context.Input<phi::DenseTensor>("X");
    auto* x_grad =
        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* out_grad =
        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));

    PADDLE_ENFORCE_EQ(
        out_grad->numel(), 1,
        platform::errors::InvalidArgument(
            "Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar."));

    auto place = context.GetPlace();

    // broadcast out_grad
    phi::DenseTensor broadcasted_out_grad;
    broadcasted_out_grad.mutable_data<T>(x_grad->dims(), place);
    MLUCnnlTensorDesc broadcasted_out_grad_desc(broadcasted_out_grad);
    MLUCnnlTensorDesc out_grad_desc(*out_grad);
    MLUCnnl::BroadcastTo(context, out_grad_desc.get(), GetBasePtr(out_grad),
                         broadcasted_out_grad_desc.get(),
                         GetBasePtr(&broadcasted_out_grad));

    // mul x
    phi::DenseTensor tmp_x_grad;
    tmp_x_grad.mutable_data<T>(x_grad->dims(), place);
    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc tmp_x_grad_desc(tmp_x_grad);
    MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL,
                                    ToCnnlDataType(x->dtype()),
                                    CNNL_NOT_PROPAGATE_NAN);
    MLUCnnl::OpTensor(context, mul_op_desc.get(), x_desc.get(), GetBasePtr(x),
                      broadcasted_out_grad_desc.get(),
                      GetBasePtr(&broadcasted_out_grad), tmp_x_grad_desc.get(),
                      GetBasePtr(&tmp_x_grad), ToCnnlDataType(x->dtype()));

    // mul
    phi::DenseTensor scale_tensor =
        context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    phi::DenseTensor bias_tensor =
        context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    MLUCnnlTensorDesc scale_desc(scale_tensor);
    MLUCnnlTensorDesc bias_desc(bias_tensor);
    FillMLUTensorWithHostValue(context, static_cast<T>(2.0f), &scale_tensor);
    FillMLUTensorWithHostValue(context, static_cast<T>(0.0f), &bias_tensor);

    x_grad->mutable_data<T>(place);
    MLUCnnlTensorDesc x_grad_desc(*x_grad);
    MLUCnnl::Scale(context, 0, tmp_x_grad_desc.get(), GetBasePtr(&tmp_x_grad),
                   scale_desc.get(), GetBasePtr(&scale_tensor),
                   bias_desc.get(), GetBasePtr(&bias_tensor),
                   x_grad_desc.get(), GetBasePtr(x_grad));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(squared_l2_norm,
                       ops::SquaredL2NormMLUKernel<float>,
                       ops::SquaredL2NormMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(squared_l2_norm_grad,
                       ops::SquaredL2NormGradMLUKernel<float>,
                       ops::SquaredL2NormGradMLUKernel<plat::float16>);
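Note: the forward kernel above computes the squared L2 norm by calling the cnnl L2Loss primitive and then scaling the scalar result by 2 with zero bias. Assuming L2Loss follows the usual sum(x*x)/2 convention (an assumption, not stated in the deleted file), a hypothetical host-side reference of the same arithmetic would be:

#include <vector>

// Hypothetical reference: if L2Loss(x) = sum(x*x) / 2, then
// Scale(alpha=2, beta=0) turns it into squared_l2_norm(x) = sum(x*x).
float SquaredL2NormRef(const std::vector<float>& x) {
  float l2loss = 0.f;
  for (float v : x) l2loss += 0.5f * v * v;  // assumed L2Loss convention
  return 2.f * l2loss + 0.f;                 // the Scale step above
}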
paddle/fluid/operators/squeeze_op_mlu.cc
deleted (file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include <memory>
#include <string>
#include "paddle/fluid/operators/squeeze_op.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(
    squeeze,
    ops::SqueezeKernel<plat::MLUDeviceContext, float>,
    ops::SqueezeKernel<plat::MLUDeviceContext, double>,
    ops::SqueezeKernel<plat::MLUDeviceContext, plat::float16>,
    ops::SqueezeKernel<plat::MLUDeviceContext, bool>,
    ops::SqueezeKernel<plat::MLUDeviceContext, int>,
    ops::SqueezeKernel<plat::MLUDeviceContext, uint8_t>,
    ops::SqueezeKernel<plat::MLUDeviceContext, int8_t>,
    ops::SqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    squeeze_grad,
    ops::SqueezeGradKernel<plat::MLUDeviceContext, float>,
    ops::SqueezeGradKernel<plat::MLUDeviceContext, double>,
    ops::SqueezeGradKernel<plat::MLUDeviceContext, plat::float16>,
    ops::SqueezeGradKernel<plat::MLUDeviceContext, bool>,
    ops::SqueezeGradKernel<plat::MLUDeviceContext, int>,
    ops::SqueezeGradKernel<plat::MLUDeviceContext, uint8_t>,
    ops::SqueezeGradKernel<plat::MLUDeviceContext, int8_t>,
    ops::SqueezeGradKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    squeeze2,
    ops::SqueezeKernel<plat::MLUDeviceContext, float>,
    ops::SqueezeKernel<plat::MLUDeviceContext, double>,
    ops::SqueezeKernel<plat::MLUDeviceContext, plat::float16>,
    ops::SqueezeKernel<plat::MLUDeviceContext, bool>,
    ops::SqueezeKernel<plat::MLUDeviceContext, int>,
    ops::SqueezeKernel<plat::MLUDeviceContext, uint8_t>,
    ops::SqueezeKernel<plat::MLUDeviceContext, int8_t>,
    ops::SqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    squeeze2_grad,
    ops::Squeeze2GradKernel<plat::MLUDeviceContext, float>,
    ops::Squeeze2GradKernel<plat::MLUDeviceContext, double>,
    ops::Squeeze2GradKernel<plat::MLUDeviceContext, plat::float16>,
    ops::Squeeze2GradKernel<plat::MLUDeviceContext, bool>,
    ops::Squeeze2GradKernel<plat::MLUDeviceContext, int>,
    ops::Squeeze2GradKernel<plat::MLUDeviceContext, uint8_t>,
    ops::Squeeze2GradKernel<plat::MLUDeviceContext, int8_t>,
    ops::Squeeze2GradKernel<plat::MLUDeviceContext, int64_t>);
#endif
paddle/fluid/operators/stack_op_mlu.cc
deleted (file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

template <typename T>
class StackMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto x = ctx.MultiInput<phi::DenseTensor>("X");
    auto* y = ctx.Output<phi::DenseTensor>("Y");
    int axis = ctx.Attr<int>("axis");
    if (axis < 0) axis += (x[0]->dims().size() + 1);
    int num = static_cast<int>(x.size());

    PADDLE_ENFORCE_GT(num, 0,
                      platform::errors::InvalidArgument(
                          "number of input phi::DenseTensor <= 0"));

    std::vector<MLUCnnlTensorDesc> x_descs;
    std::vector<cnnlTensorDescriptor_t> x_raw_descs;
    std::vector<const void*> x_ptrs;
    for (int i = 0; i < num; i++) {
      if (x[i]->dims().size() != 0) {
        std::vector<int64_t> in_dims = phi::vectorize(x[i]->dims());
        in_dims.insert(in_dims.begin() + axis, 1);
        x_descs.emplace_back(MLUCnnlTensorDesc(in_dims.size(), in_dims.data(),
                                               ToCnnlDataType<T>()));
      } else {
        int input_dims = 1;
        x_descs.emplace_back(
            MLUCnnlTensorDesc(1, &input_dims, ToCnnlDataType<T>()));
      }
      x_raw_descs.push_back(x_descs.back().get());
      x_ptrs.push_back(GetBasePtr(x[i]));
    }
    y->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc y_desc(*y);
    MLUCnnl::Concat(ctx, num, axis, x_raw_descs.data(), x_ptrs.data(),
                    y_desc.get(), GetBasePtr(y));
  }
};

}  // namespace operators
}  // namespace paddle

REGISTER_OP_MLU_KERNEL(
    stack,
    paddle::operators::StackMLUKernel<int64_t>,
    paddle::operators::StackMLUKernel<int>,
    paddle::operators::StackMLUKernel<float>,
    paddle::operators::StackMLUKernel<paddle::platform::float16>);
paddle/fluid/operators/strided_slice_op_mlu.cc
deleted (file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/strided_slice.h"
namespace paddle {
namespace operators {

using Variable = framework::Variable;
using LoDTensorArray = framework::LoDTensorArray;
using DDim = framework::DDim;

static void ProcessStridedSliceParams(
    const std::vector<int>& axes,
    const DDim& input_dims,
    const std::vector<int64_t>& starts,
    const std::vector<int64_t>& ends,
    const std::vector<int64_t>& strides,
    const std::vector<int>& infer_flags,
    const std::vector<int>& decrease_axis,
    std::vector<int>* starts_indices_vector,
    std::vector<int>* ends_indices_vector,
    std::vector<int>* strides_indices_vector) {
  for (size_t axis = 0; axis < axes.size(); axis++) {
    int64_t start = starts[axis];
    int64_t end = ends[axis];
    int64_t stride = strides[axis];

    int axis_index = axes[axis];
    int64_t dim_size = input_dims[axis_index];

    bool decrease_axis_affect = false;
    if (start == -1 && end == 0 && infer_flags[axis] == -1) {
      auto ret =
          std::find(decrease_axis.begin(), decrease_axis.end(), axis_index);
      if (ret != decrease_axis.end()) {
        decrease_axis_affect = true;
      }
    }

    if (stride < 0) {
      if (start < 0) {
        start = std::max(start, -dim_size);
      } else {
        start = std::min(start, dim_size - 1) - dim_size;
      }
      if (end < 0) {
        end = std::max(end, -dim_size - 1);
      } else {
        end = end - dim_size;
      }
    } else {
      if (start < 0) {
        start = std::max(start, -dim_size) + dim_size;
      } else {
        start = std::min(start, dim_size - 1);
      }
      if (end < 0) {
        end = end + dim_size;
      } else {
        end = std::min(end, dim_size);
      }
    }

    if (decrease_axis_affect) {
      if (stride < 0) {
        end = start - 1;
      } else {
        end = start + 1;
      }
    }

    (*starts_indices_vector)[axis_index] = static_cast<int>(start);
    (*ends_indices_vector)[axis_index] = static_cast<int>(end);
    (*strides_indices_vector)[axis_index] = static_cast<int>(stride);
  }
}

template <typename T>
class StridedSliceMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Variable* input_var = ctx.InputVar("Input");
    bool is_tensor_array = input_var->IsType<LoDTensorArray>();
    PADDLE_ENFORCE_EQ(is_tensor_array, false,
                      platform::errors::InvalidArgument(
                          "phi::DenseTensor array as input is not supported."));
    int rank = ctx.Input<phi::DenseTensor>("Input")->dims().size();
    switch (rank) {
      case 1: StridedSliceCompute<1>(ctx); break;
      case 2: StridedSliceCompute<2>(ctx); break;
      case 3: StridedSliceCompute<3>(ctx); break;
      case 4: StridedSliceCompute<4>(ctx); break;
      case 5: StridedSliceCompute<5>(ctx); break;
      case 6: StridedSliceCompute<6>(ctx); break;
      case 7: StridedSliceCompute<7>(ctx); break;
      case 8: StridedSliceCompute<8>(ctx); break;
      default:
        PADDLE_THROW(platform::errors::InvalidArgument(
            "The rank of input is supported up to 8."));
        break;
    }
  }

 private:
  template <size_t D>
  void StridedSliceCompute(const framework::ExecutionContext& ctx) const {
    auto place = ctx.GetPlace();

    auto in = ctx.Input<phi::DenseTensor>("Input");
    auto out = ctx.Output<phi::DenseTensor>("Out");
    auto in_dims = in->dims();

    // list<int>
    auto starts_int = ctx.Attr<std::vector<int>>("starts");
    auto ends_int = ctx.Attr<std::vector<int>>("ends");
    auto strides_int = ctx.Attr<std::vector<int>>("strides");

    std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
    std::vector<int64_t> ends(ends_int.begin(), ends_int.end());
    std::vector<int64_t> strides(strides_int.begin(), strides_int.end());

    auto axes = ctx.Attr<std::vector<int>>("axes");
    auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
    auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");

    // vector<phi::DenseTensor<int32>>
    auto list_new_starts_tensor =
        ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
    auto list_new_ends_tensor =
        ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
    auto list_new_strides_tensor =
        ctx.MultiInput<phi::DenseTensor>("StridesTensorList");

    // phi::DenseTensor<int32>
    if (list_new_starts_tensor.size() > 0) {
      starts = GetDataFromTensorList<int64_t>(list_new_starts_tensor);
    } else if (ctx.HasInput("StartsTensor")) {
      auto* starts_tensor = ctx.Input<phi::DenseTensor>("StartsTensor");
      starts = phi::GetVectorFromTensor<int64_t>(starts_tensor);
    }

    if (list_new_ends_tensor.size() > 0) {
      ends = GetDataFromTensorList<int64_t>(list_new_ends_tensor);
    } else if (ctx.HasInput("EndsTensor")) {
      auto* ends_tensor = ctx.Input<phi::DenseTensor>("EndsTensor");
      ends = phi::GetVectorFromTensor<int64_t>(ends_tensor);
    }

    if (list_new_strides_tensor.size() > 0) {
      strides = GetDataFromTensorList<int64_t>(list_new_strides_tensor);
    } else if (ctx.HasInput("StridesTensor")) {
      auto* strides_tensor = ctx.Input<phi::DenseTensor>("StridesTensor");
      strides = phi::GetVectorFromTensor<int64_t>(strides_tensor);
    }

    // out dims calculation
    std::vector<int64_t> out_dims_vector(in_dims.size(), -1);
    phi::funcs::StridedSliceOutDims(starts, ends, strides, axes, infer_flags,
                                    in_dims, decrease_axis,
                                    out_dims_vector.data(), axes.size(), false);
    framework::DDim out_dims(phi::make_ddim(out_dims_vector));

    // construct the starts_indices, ends_indices and strides_indices tensor
    // for calling StridedSlice op
    std::vector<int> starts_indices_vector(D, 0);
    std::vector<int> ends_indices_vector(out_dims_vector.begin(),
                                         out_dims_vector.end());
    std::vector<int> strides_indices_vector(D, 1);

    ProcessStridedSliceParams(axes, in_dims, starts, ends, strides,
                              infer_flags, decrease_axis,
                              &starts_indices_vector, &ends_indices_vector,
                              &strides_indices_vector);

    auto out_dims_origin = out_dims;
    if (decrease_axis.size() > 0) {
      std::vector<int64_t> new_out_shape;
      for (size_t i = 0; i < decrease_axis.size(); ++i) {
        PADDLE_ENFORCE_EQ(
            out_dims[decrease_axis[i]], 1,
            platform::errors::InvalidArgument(
                "the size of decrease dimension should be 1, but received %d.",
                out_dims[decrease_axis[i]]));
        out_dims_origin[decrease_axis[i]] = 0;
      }

      for (int i = 0; i < out_dims_origin.size(); ++i) {
        if (out_dims_origin[i] != 0) {
          new_out_shape.push_back(out_dims_origin[i]);
        }
      }
      if (new_out_shape.size() == 0) {
        new_out_shape.push_back(1);
      }
      out_dims_origin = phi::make_ddim(new_out_shape);
    }

    out->Resize(out_dims_origin);
    out->mutable_data<T>(place);

    MLUCnnlTensorDesc in_desc(*in);
    MLUCnnlTensorDesc out_desc(out_dims_vector.size(), out_dims_vector.data(),
                               ToCnnlDataType<T>());
    MLUCnnl::StridedSlice(ctx, starts_indices_vector.data(),
                          ends_indices_vector.data(),
                          strides_indices_vector.data(), in_desc.get(),
                          GetBasePtr(in), out_desc.get(), GetBasePtr(out));
  }
};

template <typename T>
class StridedSliceGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Variable* input_var = ctx.InputVar("Input");
    bool is_tensor_array = input_var->IsType<LoDTensorArray>();
    PADDLE_ENFORCE_EQ(is_tensor_array, false,
                      platform::errors::InvalidArgument(
                          "phi::DenseTensor array as input is not supported."));
    int rank = ctx.Input<phi::DenseTensor>("Input")->dims().size();
    switch (rank) {
      case 1: StridedSliceGradCompute<1>(ctx); break;
      case 2: StridedSliceGradCompute<2>(ctx); break;
      case 3: StridedSliceGradCompute<3>(ctx); break;
      case 4: StridedSliceGradCompute<4>(ctx); break;
      case 5: StridedSliceGradCompute<5>(ctx); break;
      case 6: StridedSliceGradCompute<6>(ctx); break;
      case 7: StridedSliceGradCompute<7>(ctx); break;
      case 8: StridedSliceGradCompute<8>(ctx); break;
      default:
        PADDLE_THROW(platform::errors::InvalidArgument(
            "The rank of input is supported up to 8."));
        break;
    }
  }

 private:
  template <size_t D>
  void StridedSliceGradCompute(const framework::ExecutionContext& ctx) const {
    auto place = ctx.GetPlace();
    auto* input = ctx.Input<phi::DenseTensor>("Input");
    auto input_dims = input->dims();
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
    dx->mutable_data<T>(input_dims, place);

    auto starts_int = ctx.Attr<std::vector<int>>("starts");
    auto ends_int = ctx.Attr<std::vector<int>>("ends");
    auto strides_int = ctx.Attr<std::vector<int>>("strides");

    std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
    std::vector<int64_t> ends(ends_int.begin(), ends_int.end());
    std::vector<int64_t> strides(strides_int.begin(), strides_int.end());

    auto axes = ctx.Attr<std::vector<int>>("axes");
    auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
    auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");

    auto list_new_ends_tensor =
        ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
    auto list_new_starts_tensor =
        ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
    auto list_new_strides_tensor =
        ctx.MultiInput<phi::DenseTensor>("StridesTensorList");

    if (list_new_starts_tensor.size() > 0) {
      starts = GetDataFromTensorList<int64_t>(list_new_starts_tensor);
    } else if (ctx.HasInput("StartsTensor")) {
      auto* starts_tensor = ctx.Input<phi::DenseTensor>("StartsTensor");
      starts = phi::GetVectorFromTensor<int64_t>(starts_tensor);
    }

    if (list_new_ends_tensor.size() > 0) {
      ends = GetDataFromTensorList<int64_t>(list_new_ends_tensor);
    } else if (ctx.HasInput("EndsTensor")) {
      auto* ends_tensor = ctx.Input<phi::DenseTensor>("EndsTensor");
      ends = phi::GetVectorFromTensor<int64_t>(ends_tensor);
    }

    if (list_new_strides_tensor.size() > 0) {
      strides = GetDataFromTensorList<int64_t>(list_new_strides_tensor);
    } else if (ctx.HasInput("StridesTensor")) {
      auto* strides_tensor = ctx.Input<phi::DenseTensor>("StridesTensor");
      strides = phi::GetVectorFromTensor<int64_t>(strides_tensor);
    }

    std::vector<int64_t> out_dims_vector(input_dims.size(), -1);
    phi::funcs::StridedSliceOutDims(starts, ends, strides, axes, infer_flags,
                                    input_dims, decrease_axis,
                                    out_dims_vector.data(), axes.size(), false);

    std::vector<int> starts_indices_vector(D, 0);
    std::vector<int> ends_indices_vector(out_dims_vector.begin(),
                                         out_dims_vector.end());
    std::vector<int> strides_indices_vector(D, 1);

    ProcessStridedSliceParams(axes, input_dims, starts, ends, strides,
                              infer_flags, decrease_axis,
                              &starts_indices_vector, &ends_indices_vector,
                              &strides_indices_vector);

    MLUCnnlTensorDesc dout_desc(out_dims_vector.size(), out_dims_vector.data(),
                                ToCnnlDataType<T>());
    MLUCnnlTensorDesc dx_desc(*input);
    MLUCnnl::StridedSliceGrad(ctx, starts_indices_vector.data(),
                              ends_indices_vector.data(),
                              strides_indices_vector.data(), dout_desc.get(),
                              GetBasePtr(dout), dx_desc.get(), GetBasePtr(dx));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(strided_slice,
                       ops::StridedSliceMLUKernel<plat::float16>,
                       ops::StridedSliceMLUKernel<bool>,
                       ops::StridedSliceMLUKernel<int>,
                       ops::StridedSliceMLUKernel<int64_t>,
                       ops::StridedSliceMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(strided_slice_grad,
                       ops::StridedSliceGradMLUKernel<plat::float16>,
                       ops::StridedSliceGradMLUKernel<float>,
                       ops::StridedSliceGradMLUKernel<bool>,
                       ops::StridedSliceGradMLUKernel<int>,
                       ops::StridedSliceGradMLUKernel<int64_t>);
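Note: ProcessStridedSliceParams above normalizes possibly-negative start/end indices per axis before handing them to cnnl. A minimal standalone sketch of the positive-stride case follows (hypothetical helper, not part of the deleted file):

#include <algorithm>
#include <cstdint>

// Hypothetical sketch: clamp a possibly-negative start/end into the valid
// range for a dimension of size dim_size, assuming stride > 0 (the
// negative-stride branch above mirrors this with offsets from the end).
static void NormalizeRange(int64_t dim_size, int64_t* start, int64_t* end) {
  *start = (*start < 0) ? std::max(*start, -dim_size) + dim_size
                        : std::min(*start, dim_size - 1);
  *end = (*end < 0) ? *end + dim_size : std::min(*end, dim_size);
}
// e.g. dim_size = 10, start = -3, end = 100  ->  start = 7, end = 10.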
paddle/fluid/operators/sum_op_mlu.cc
deleted (file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

using SelectedRows = phi::SelectedRows;

template <typename DeviceContext, typename T>
class SumMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto out_var = ctx.OutputVar("Out");
    if (out_var->IsType<phi::DenseTensor>()) {
      // init
      auto* out = out_var->GetMutable<phi::DenseTensor>();
      auto ins = ctx.MultiInput<phi::DenseTensor>("X");
      out->mutable_data<T>(ctx.GetPlace());
      auto place = ctx.GetPlace();
      int ins_size = static_cast<int>(ins.size());
      if (ins_size == 1) {
        framework::TensorCopy(*ins[0], place, out);
        return;
      }

      // MLU should do sth
      std::vector<const void*> inputs;
      std::vector<MLUCnnlTensorDesc> input_descs;
      std::vector<cnnlTensorDescriptor_t> desc_vector;
      for (int i = 0; i < ins_size; i++) {
        input_descs.emplace_back(MLUCnnlTensorDesc(
            *ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->dtype())));
        desc_vector.push_back(input_descs.back().get());
        inputs.push_back(GetBasePtr(ins[i]));
      }
      // init out tensors
      MLUCnnlTensorDesc output_desc(*out, CNNL_LAYOUT_ARRAY,
                                    ToCnnlDataType(out->dtype()));
      uint32_t ins_size_t = static_cast<uint32_t>(ins_size);
      MLUCnnl::AddN(ctx, ins_size_t, desc_vector.data(), inputs.data(),
                    output_desc.get(), GetBasePtr(out));
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Expected type of Output(out) must be phi::DenseTensor or But got "
          "unsupport type: %s.",
          framework::ToTypeName(out_var->Type())));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(
    sum,
    ops::SumMLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::SumMLUKernel<paddle::platform::MLUDeviceContext,
                      paddle::platform::float16>);
paddle/fluid/operators/sync_batch_norm_op_mlu.cc
deleted (file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/collective_helper.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {

#define NO_USE_CNCL 0
#define GET_LAYOUT_OFFSET 2

static std::vector<cnnlTensorLayout_t> supported_input_layout = {
    CNNL_LAYOUT_NC, CNNL_LAYOUT_NLC, CNNL_LAYOUT_NHWC, CNNL_LAYOUT_NDHWC};

template <typename T>
class SyncBatchNormMLUKernel : public framework::OpKernel<T> {
  using MPDType = typename details::MPTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    float epsilon = ctx.Attr<float>("epsilon");
    float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
    const std::string layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout layout = phi::StringToDataLayout(layout_str);

    PADDLE_ENFORCE_EQ(use_global_stats, false,
                      platform::errors::InvalidArgument(
                          "sync_batch_norm doesn't support "
                          "to set use_global_stats True. Please use batch_norm "
                          "in this case."));

    const auto* x = ctx.Input<phi::DenseTensor>("X");
    const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
    const auto* bias = ctx.Input<phi::DenseTensor>("Bias");
    const auto* mean = ctx.Input<phi::DenseTensor>("Mean");
    const auto* variance = ctx.Input<phi::DenseTensor>("Variance");
    auto* mean_out = ctx.Output<phi::DenseTensor>("MeanOut");
    auto* variance_out = ctx.Output<phi::DenseTensor>("VarianceOut");
    auto* saved_mean = ctx.Output<phi::DenseTensor>("SavedMean");
    auto* saved_variance = ctx.Output<phi::DenseTensor>("SavedVariance");
    auto* y = ctx.Output<phi::DenseTensor>("Y");

    const auto& x_dims = x->dims();
    PADDLE_ENFORCE_GE(x_dims.size(), 2,
                      platform::errors::InvalidArgument(
                          "The Input dim size should be larger than 1."));
    PADDLE_ENFORCE_LE(x_dims.size(), 5,
                      platform::errors::InvalidArgument(
                          "The Input dim size should be less than 6."));
    int N, C, H, W, D;
    phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);

    y->mutable_data<T>(ctx.GetPlace());
    mean_out->mutable_data<MPDType>(ctx.GetPlace());
    variance_out->mutable_data<MPDType>(ctx.GetPlace());
    saved_mean->mutable_data<MPDType>(ctx.GetPlace());
    saved_variance->mutable_data<MPDType>(ctx.GetPlace());

    phi::DenseTensor trans_x;
    phi::DenseTensor trans_y;
    std::vector<int> forward_perm;
    std::vector<int> backward_perm;
    std::vector<int> trans_shape;
    const bool need_transpose =
        ((layout == DataLayout::kNCHW && x_dims.size() != 2) ||
         x_dims.size() == 5);
    if (need_transpose) {
      SetMLUTransposePerm(x_dims, layout, &forward_perm, &backward_perm,
                          &trans_shape);
      trans_x.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
      trans_y.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
      MLUCnnlTensorDesc desc_x(*x);
      MLUCnnlTensorDesc desc_trans_x(trans_shape.size(), trans_shape.data(),
                                     ToCnnlDataType(x->dtype()));
      MLUCnnl::Transpose(ctx, forward_perm, x_dims.size(), desc_x.get(),
                         GetBasePtr(x), desc_trans_x.get(),
                         GetBasePtr(&trans_x));
    } else {
      trans_x = *x;
      trans_y = *y;
    }

    MLUCnnlTensorDesc desc_trans(
        trans_x, supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET],
        ToCnnlDataType<T>());

    bool test_mode = is_test && (!trainable_stats);
    if (test_mode) {  // inference
      MLUCnnlTensorDesc desc_weight_bias_mean_var(*bias);
      MLUCnnl::FusedBatchNorm(
          ctx, false /*is_training*/, desc_trans.get(), GetBasePtr(&trans_x),
          desc_weight_bias_mean_var.get(), GetBasePtr(scale), GetBasePtr(bias),
          GetBasePtr(mean), GetBasePtr(variance), epsilon, momentum,
          desc_trans.get(), GetBasePtr(&trans_y), nullptr, nullptr, nullptr,
          nullptr);
    } else {  // training
      if (ctx.HasInput("MomentumTensor")) {
        const auto* mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
        phi::DenseTensor mom_cpu;
        paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(),
                                          &mom_cpu);
        momentum = mom_cpu.data<float>()[0];
      }

      phi::DenseTensor local_mean, local_var;
      local_mean.mutable_data<MPDType>(mean->dims(), ctx.GetPlace());
      local_var.mutable_data<MPDType>(variance->dims(), ctx.GetPlace());
      MLUCnnlTensorDesc desc_mean_var(*mean_out);
      // calc local_mean and local_var
      MLUCnnl::SyncBatchNormStats(ctx, desc_trans.get(), GetBasePtr(&trans_x),
                                  epsilon, desc_mean_var.get(),
                                  GetBasePtr(&local_mean), desc_mean_var.get(),
                                  GetBasePtr(&local_var));

      phi::DenseTensor input_count;
      input_count.mutable_data<MPDType>(phi::make_ddim({1}), ctx.GetPlace());
      FillMLUTensorWithHostValue<MPDType>(
          ctx, static_cast<MPDType>(x->numel() / C), &input_count);

      phi::DenseTensor count_all;
      phi::DenseTensor mean_all(mean->dtype());
      phi::DenseTensor invstd_all(variance->dtype());

#ifdef PADDLE_WITH_CNCL
      auto& dev_ctx =
          ctx.template device_context<paddle::platform::MLUDeviceContext>();
      auto* comm = dev_ctx.cncl_comm();
      if (comm) {
        auto cncl_comm = paddle::platform::CNCLCommContext::Instance().Get(
            0, ctx.GetPlace());
        auto* comm = cncl_comm->comm();
        auto comm_stream = cncl_comm->stream();
        int count;
        PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCommCount(&count, comm));
        count_all.mutable_data<MPDType>(phi::make_ddim({count}),
                                        ctx.GetPlace());
        mean_all.mutable_data<MPDType>(phi::make_ddim({count, mean->numel()}),
                                       ctx.GetPlace());
        invstd_all.mutable_data<MPDType>(
            phi::make_ddim({count, variance->numel()}), ctx.GetPlace());
        // before comm_stream exec, need sync compute_stream.
        dev_ctx.Wait();

        cnclDataType_t dtype = platform::ToCNCLDataType(
            framework::TransToProtoVarType(count_all.dtype()));
        PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&input_count),
                                                 GetBasePtr(&count_all), 1,
                                                 dtype, comm, comm_stream));

        auto cncl_dtype = platform::ToCNCLDataType(
            framework::TransToProtoVarType(mean_all.dtype()));
        PADDLE_ENFORCE_MLU_SUCCESS(
            cnclAllGather(GetBasePtr(&local_mean), GetBasePtr(&mean_all),
                          local_mean.numel(), cncl_dtype, comm, comm_stream));
        PADDLE_ENFORCE_MLU_SUCCESS(
            cnclAllGather(GetBasePtr(&local_var), GetBasePtr(&invstd_all),
                          local_var.numel(), cncl_dtype, comm, comm_stream));
        // after comm_stream exec, need sync queue for using compute_stream
        // correctly.
        PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
      if (NO_USE_CNCL) {
#endif
      } else {
        count_all = input_count;
        mean_all.ShareDataWith(local_mean);
        invstd_all.ShareDataWith(local_var);
        mean_all.Resize(phi::make_ddim({1, local_mean.numel()}));
        invstd_all.Resize(phi::make_ddim({1, local_var.numel()}));
      }

      MLUCnnlTensorDesc desc_all_mean_invstd(invstd_all, CNNL_LAYOUT_NC,
                                             ToCnnlDataType<MPDType>());
      MLUCnnlTensorDesc desc_moving_mean_var(*mean_out);
      MLUCnnlTensorDesc desc_saved_mean_var(*saved_mean);
      MLUCnnlTensorDesc desc_count_all(count_all);

      MLUCnnl::SyncBatchNormGatherStatsWithCounts(
          ctx, momentum, epsilon, desc_all_mean_invstd.get(),
          GetBasePtr(&mean_all), desc_all_mean_invstd.get(),
          GetBasePtr(&invstd_all), desc_moving_mean_var.get(),
          GetBasePtr(mean_out), desc_moving_mean_var.get(),
          GetBasePtr(variance_out), desc_count_all.get(),
          GetBasePtr(&count_all), desc_saved_mean_var.get(),
          GetBasePtr(saved_mean), desc_saved_mean_var.get(),
          GetBasePtr(saved_variance));

      MLUCnnlTensorDesc desc_other_param(*saved_mean);
      MLUCnnl::SyncBatchNormElemt(
          ctx, desc_trans.get(), GetBasePtr(&trans_x), desc_other_param.get(),
          GetBasePtr(saved_mean), desc_other_param.get(),
          GetBasePtr(saved_variance), desc_other_param.get(),
          GetBasePtr(scale), desc_other_param.get(), GetBasePtr(bias),
          desc_trans.get(), GetBasePtr(&trans_y));
    }
    if (need_transpose) {
      MLUCnnlTensorDesc desc_y(*y);
      MLUCnnlTensorDesc desc_trans_y(trans_y);
      MLUCnnl::Transpose(ctx, backward_perm, trans_y.dims().size(),
                         desc_trans_y.get(), GetBasePtr(&trans_y),
                         desc_y.get(), GetBasePtr(y));
    }
  }
};

template <typename T>
class SyncBatchNormMLUGradKernel : public framework::OpKernel<T> {
  using MPDType = typename details::MPTypeTrait<T>::Type;

 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const std::string layout_str = ctx.Attr<std::string>("data_layout");
    const DataLayout layout = phi::StringToDataLayout(layout_str);
    const auto* d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
    const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
    const auto* bias = ctx.Input<phi::DenseTensor>("Bias");

    // init output
    auto* d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* d_scale =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
    auto* d_bias =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));

    const auto* saved_mean = ctx.Input<phi::DenseTensor>("SavedMean");
    const auto* saved_inv_var = ctx.Input<phi::DenseTensor>("SavedVariance");

    const phi::DenseTensor* x;
    if (ctx.HasInput("Y")) {
      PADDLE_ENFORCE_EQ(true, false,
                        platform::errors::InvalidArgument(
                            "sync_batch_norm_grad doesn't support input Y"));
    } else {
      x = ctx.Input<phi::DenseTensor>("X");
    }

    const auto& x_dims = x->dims();
    PADDLE_ENFORCE_GE(x_dims.size(), 2,
                      platform::errors::InvalidArgument(
                          "The Input X dim size should be larger than 1."));
    PADDLE_ENFORCE_LE(x_dims.size(), 5,
                      platform::errors::InvalidArgument(
                          "The Input X dim size should be less than 6."));

    int N, C, H, W, D;
    phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
    PADDLE_ENFORCE_EQ(scale->dims()[0], C,
                      platform::errors::InvalidArgument(
                          "Expected first dim for input parameter(scale) of "
                          "OP(sync_batch_norm) be (%d), but given (%d).",
                          C, scale->dims()[0]));

    d_x->mutable_data<T>(ctx.GetPlace());
    if (d_scale && d_bias) {
      d_scale->mutable_data<MPDType>(ctx.GetPlace());
      d_bias->mutable_data<MPDType>(ctx.GetPlace());
    }
    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL,
                      platform::errors::InvalidArgument(
                          "Expected rank for input parameter(scale) of "
                          "OP(sync_batch_norm) be (1), but given (%d).",
                          scale->dims().size()));

    phi::DenseTensor trans_x;
    phi::DenseTensor trans_dy;
    phi::DenseTensor trans_dx;
    std::vector<int> forward_perm;
    std::vector<int> backward_perm;
    std::vector<int> trans_shape;
    const bool need_transpose =
        ((layout == DataLayout::kNCHW && x_dims.size() != 2) ||
         x_dims.size() == 5);
    if (need_transpose) {
      SetMLUTransposePerm(x_dims, layout, &forward_perm, &backward_perm,
                          &trans_shape);
      trans_x.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
      trans_dy.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
      trans_dx.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
      MLUCnnlTensorDesc desc_x(*x);
      MLUCnnlTensorDesc desc_trans_x(trans_shape.size(), trans_shape.data(),
                                     ToCnnlDataType(x->dtype()));
      MLUCnnl::Transpose(ctx, forward_perm, x_dims.size(), desc_x.get(),
                         GetBasePtr(x), desc_trans_x.get(),
                         GetBasePtr(&trans_x));
      MLUCnnl::Transpose(ctx, forward_perm, x_dims.size(), desc_x.get(),
                         GetBasePtr(d_y), desc_trans_x.get(),
                         GetBasePtr(&trans_dy));
    } else {
      trans_x = *x;
      trans_dy = *d_y;
      trans_dx = *d_x;
    }
    MLUCnnlTensorDesc desc_trans(
        trans_x, supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET],
        ToCnnlDataType<T>());

    phi::DenseTensor sum_dy, sum_dy_xmu;
    sum_dy.mutable_data<MPDType>(bias->dims(), ctx.GetPlace());
    sum_dy_xmu.mutable_data<MPDType>(bias->dims(), ctx.GetPlace());
    MLUCnnlTensorDesc desc_other_param(*bias);

    MLUCnnl::SyncBatchnormBackwardReduce(
        ctx, desc_trans.get(), GetBasePtr(&trans_dy), desc_trans.get(),
        GetBasePtr(&trans_x), desc_other_param.get(), GetBasePtr(saved_mean),
        desc_other_param.get(), GetBasePtr(saved_inv_var),
        d_scale ? desc_other_param.get() : nullptr,
        d_scale ? GetBasePtr(d_scale) : nullptr,
        d_bias ? desc_other_param.get() : nullptr,
        d_bias ? GetBasePtr(d_bias) : nullptr, desc_other_param.get(),
        GetBasePtr(&sum_dy), desc_other_param.get(), GetBasePtr(&sum_dy_xmu),
        true /*compute sum_dy, sum_dy_xmu*/,
        d_scale ? true : false /*compute d_scale*/,
        d_bias ? true : false /*compute d_bias*/);

    phi::DenseTensor numel_count;
    numel_count.mutable_data<int32_t>(phi::make_ddim({1}), ctx.GetPlace());
    FillMLUTensorWithHostValue<int32_t>(
        ctx, static_cast<int32_t>(x->numel() / C), &numel_count);

#ifdef PADDLE_WITH_CNCL
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::MLUDeviceContext>();
    auto* comm = dev_ctx.cncl_comm();
    if (comm) {
      auto cncl_comm =
          paddle::platform::CNCLCommContext::Instance().Get(0, ctx.GetPlace());
      auto* comm = cncl_comm->comm();
      auto comm_stream = cncl_comm->stream();
      // before comm_stream exec, need sync compute_stream.
      dev_ctx.Wait();
      cnclDataType_t dtype = platform::ToCNCLDataType(
          framework::TransToProtoVarType(numel_count.dtype()));
      PADDLE_ENFORCE_MLU_SUCCESS(
          cnclAllReduce(GetBasePtr(&numel_count), GetBasePtr(&numel_count), 1,
                        dtype, cnclSum, comm, comm_stream));
      auto cncl_dtype = platform::ToCNCLDataType(
          framework::TransToProtoVarType(sum_dy.dtype()));
      PADDLE_ENFORCE_MLU_SUCCESS(
          cnclAllReduce(GetBasePtr(&sum_dy), GetBasePtr(&sum_dy),
                        sum_dy.numel(), cncl_dtype, cnclSum, comm,
                        comm_stream));
      PADDLE_ENFORCE_MLU_SUCCESS(
          cnclAllReduce(GetBasePtr(&sum_dy_xmu), GetBasePtr(&sum_dy_xmu),
                        sum_dy_xmu.numel(), cncl_dtype, cnclSum, comm,
                        comm_stream));
      // after comm_stream exec, need sync queue for using compute_stream
      // correctly.
      PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
    }
#endif

    if (d_x) {
      MLUCnnlTensorDesc desc_count(numel_count);
      MLUCnnl::SyncBatchNormBackwardElemt(
          ctx, desc_trans.get(), GetBasePtr(&trans_dy), desc_trans.get(),
          GetBasePtr(&trans_x), desc_other_param.get(),
          GetBasePtr(saved_mean), desc_other_param.get(),
          GetBasePtr(saved_inv_var), desc_other_param.get(),
          GetBasePtr(scale), desc_other_param.get(), GetBasePtr(&sum_dy),
          desc_other_param.get(), GetBasePtr(&sum_dy_xmu), desc_count.get(),
          GetBasePtr(&numel_count), desc_trans.get(), GetBasePtr(&trans_dx));
      if (need_transpose) {
        MLUCnnlTensorDesc desc_dx(*d_x);
        MLUCnnlTensorDesc desc_trans_dx(trans_dx);
        MLUCnnl::Transpose(ctx, backward_perm, trans_dx.dims().size(),
                           desc_trans_dx.get(), GetBasePtr(&trans_dx),
                           desc_dx.get(), GetBasePtr(d_x));
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(sync_batch_norm,
                       ops::SyncBatchNormMLUKernel<float>,
                       ops::SyncBatchNormMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(sync_batch_norm_grad,
                       ops::SyncBatchNormMLUGradKernel<float>,
                       ops::SyncBatchNormMLUGradKernel<plat::float16>);
paddle/fluid/operators/tile_op_mlu.cc
deleted (file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/tile_op_functor.h"
namespace paddle {
namespace operators {

template <typename T>
class TileMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
    PADDLE_ENFORCE_GE(
        rank, 1,
        platform::errors::InvalidArgument(
            "The rank of the input 'x' for tile op must be a positive "
            "integer, but the value received is %d.",
            rank));
    PADDLE_ENFORCE_LE(
        rank, MAX_RANK_SUPPORTED,
        platform::errors::InvalidArgument(
            "The rank of the input 'x' for tile op "
            "must be less than or equal to %d, but the value received is %d.",
            MAX_RANK_SUPPORTED, rank));
    auto repeat_times = get_repeat_times(context);
    int repeat_times_size = repeat_times.size();
    PADDLE_ENFORCE_GE(
        repeat_times_size, 1,
        platform::errors::InvalidArgument(
            "The number of elements of the input 'repeat_times' for tile "
            "op must be positive, but the value received is %d.",
            repeat_times_size));
    PADDLE_ENFORCE_LE(
        repeat_times_size, MAX_RANK_SUPPORTED,
        platform::errors::InvalidArgument(
            "The number of elements of the input 'repeat_times' for tile op "
            "must be less than or equal to %d, but the value received is %d.",
            MAX_RANK_SUPPORTED, repeat_times_size));

    auto* in0 = context.Input<phi::DenseTensor>("X");
    auto in_dims = in0->dims();
    for (size_t i = 0; i < repeat_times.size(); ++i) {
      PADDLE_ENFORCE_GT(
          repeat_times[i], 0,
          platform::errors::InvalidArgument(
              "All elements of the input 'repeat_times' for tile op must "
              "be positive integers, but the value received is %d.",
              repeat_times[i]));
    }
    auto vec_in_dims = phi::vectorize<int>(in_dims);
    if (repeat_times.size() < vec_in_dims.size()) {
      int diff = vec_in_dims.size() - repeat_times.size();
      repeat_times.insert(repeat_times.begin(), diff, 1);
    } else {
      int diff = repeat_times.size() - vec_in_dims.size();
      vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
    }
    PADDLE_ENFORCE_EQ(
        repeat_times.size(), vec_in_dims.size(),
        platform::errors::InvalidArgument(
            "The rank (%d) of the input 'x' and the rank (%d) of the input "
            "'repeat_times' for tile op must match after promotion.",
            vec_in_dims.size(), repeat_times.size()));

    auto* out0 = context.Output<phi::DenseTensor>("Out");
    bool repeat_one_times = true;
    for (size_t i = 0; i < repeat_times.size(); ++i) {
      if (repeat_times[i] != 1) {
        repeat_one_times = false;
      }
    }
    if (repeat_one_times) {
      paddle::framework::TensorCopy(*in0, context.GetPlace(), out0);
    } else {
      framework::DDim new_in_dims = phi::make_ddim(vec_in_dims);
      framework::DDim out_dims(new_in_dims);
      for (size_t i = 0; i < repeat_times.size(); ++i) {
        out_dims[i] *= repeat_times[i];
      }
      out0->Resize(out_dims);
      out0->mutable_data<T>(context.GetPlace());
      MLUCnnlTensorDesc x_desc(*in0);
      MLUCnnlTensorDesc out_desc(*out0);
      MLUCnnl::BroadcastTo(context, x_desc.get(), GetBasePtr(in0),
                           out_desc.get(), GetBasePtr(out0));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(tile,
                       ops::TileMLUKernel<bool>,
                       ops::TileMLUKernel<int>,
                       ops::TileMLUKernel<int64_t>,
                       ops::TileMLUKernel<float>);
#endif
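Note: the tile kernel above left-pads the shorter of 'repeat_times' and the input shape with 1s and then multiplies the dims element-wise to get the output shape, which it materializes with MLUCnnl::BroadcastTo. A hypothetical sketch of just that shape arithmetic (the helper name is an assumption, not part of the deleted file):

#include <vector>

// Hypothetical sketch: align ranks by left-padding with 1s, then output
// dim i is in_dims[i] * repeat_times[i].
static std::vector<int> TileOutShape(std::vector<int> in_dims,
                                     std::vector<int> repeat_times) {
  while (repeat_times.size() < in_dims.size())
    repeat_times.insert(repeat_times.begin(), 1);
  while (in_dims.size() < repeat_times.size())
    in_dims.insert(in_dims.begin(), 1);
  std::vector<int> out(in_dims.size());
  for (size_t i = 0; i < in_dims.size(); ++i)
    out[i] = in_dims[i] * repeat_times[i];
  return out;
}
// e.g. in_dims = {2, 3}, repeat_times = {4}  ->  out = {2, 12}.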
paddle/fluid/operators/top_k_op_mlu.cc
deleted (file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/top_k_op.h"
namespace paddle {
namespace operators {

template <typename T>
class TopkMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    auto* indices = ctx.Output<phi::DenseTensor>("Indices");
    const auto& place = ctx.GetPlace();

    size_t k = static_cast<int>(ctx.Attr<int>("k"));
    auto* k_t = ctx.Input<phi::DenseTensor>("K");
    if (k_t) {
      auto k_t_ptr = static_cast<const void*>(k_t->data<int>());
      auto size = k_t->numel() * sizeof(int);
      memory::Copy(platform::CPUPlace(), reinterpret_cast<void*>(&k),
                   k_t->place(), k_t_ptr, size, nullptr);
      framework::DDim output_dims = output->dims();
      output_dims[output_dims.size() - 1] = k;
      output->Resize(output_dims);
      indices->Resize(output_dims);
    }

    output->mutable_data<T>(place);
    indices->mutable_data<int64_t>(place);

    const bool largest = true;
    const bool sorted = true;
    const int axis = -1;
    // cnnl only support int32/int16 type of indices
    phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
    indices_int32.Resize(indices->dims());
    indices_int32.mutable_data<int32_t>(place);

    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc values_output_desc(*output);
    MLUCnnlTensorDesc indices_int32_desc(indices_int32);
    MLUCnnl::TopK(ctx, k, axis, largest, sorted, input_desc.get(),
                  GetBasePtr(input), values_output_desc.get(),
                  GetBasePtr(output), indices_int32_desc.get(),
                  GetBasePtr(&indices_int32));

    // cast indices type to int64
    MLUCnnlTensorDesc cast_output_desc(*indices);
    cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
    MLUCnnl::Cast(ctx, cast_type, indices_int32_desc.get(),
                  GetBasePtr(&indices_int32), cast_output_desc.get(),
                  GetBasePtr(indices));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(top_k,
                       ops::TopkMLUKernel<float>,
                       ops::TopkMLUKernel<paddle::platform::float16>);
paddle/fluid/operators/top_k_v2_op_mlu.cc
deleted (file mode 100644 → 0)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class TopkV2MLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    auto* indices = ctx.Output<phi::DenseTensor>("Indices");
    const auto& place = ctx.GetPlace();

    const auto& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
    const auto& largest = static_cast<bool>(ctx.Attr<bool>("largest"));

    // axis < 0: calculate the real axis
    int axis = static_cast<int>(ctx.Attr<int>("axis"));
    if (axis < 0) {
      const auto& in_dims = input->dims();
      axis += in_dims.size();
    }

    size_t k = static_cast<int>(ctx.Attr<int>("k"));
    auto* k_t = ctx.Input<phi::DenseTensor>("K");
    if (k_t) {
      auto k_t_ptr = static_cast<const void*>(k_t->data<int>());
      auto size = k_t->numel() * sizeof(int);
      memory::Copy(platform::CPUPlace(),
                   reinterpret_cast<void*>(&k),
                   k_t->place(),
                   k_t_ptr,
                   size,
                   nullptr);
      framework::DDim output_dims = output->dims();
      // according to axis, set the K value in that dim
      output_dims[axis] = k;
      output->Resize(output_dims);
      indices->Resize(output_dims);
    }

    output->mutable_data<T>(place);
    indices->mutable_data<int64_t>(place);

    // cnnl only supports int32/int16 indices
    phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
    indices_int32.Resize(indices->dims());
    indices_int32.mutable_data<int32_t>(place);

    MLUCnnlTensorDesc input_desc(*input);
    MLUCnnlTensorDesc values_output_desc(*output);
    MLUCnnlTensorDesc indices_int32_desc(indices_int32);
    MLUCnnl::TopK(ctx,
                  k,
                  axis,
                  largest,
                  sorted,
                  input_desc.get(),
                  GetBasePtr(input),
                  values_output_desc.get(),
                  GetBasePtr(output),
                  indices_int32_desc.get(),
                  GetBasePtr(&indices_int32));

    // cast indices type to int64
    MLUCnnlTensorDesc cast_output_desc(*indices);
    cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
    MLUCnnl::Cast(ctx,
                  cast_type,
                  indices_int32_desc.get(),
                  GetBasePtr(&indices_int32),
                  cast_output_desc.get(),
                  GetBasePtr(indices));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(top_k_v2,
                       ops::TopkV2MLUKernel<float>,
                       ops::TopkV2MLUKernel<paddle::platform::float16>);
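Unlike the fixed-last-axis kernel before it, this variant normalizes a negative axis against the input rank and shrinks exactly that axis of the output to k. A small standalone sketch of that shape bookkeeping (illustrative, not the Paddle API):

// Standalone sketch (illustrative): normalize a negative axis and set the
// k-th dimension of the top-k output shape.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> TopKOutputDims(std::vector<int64_t> dims, int axis,
                                    int64_t k) {
  const int rank = static_cast<int>(dims.size());
  if (axis < 0) axis += rank;  // e.g. axis = -1 on a rank-3 tensor -> 2
  assert(axis >= 0 && axis < rank);
  dims[axis] = k;              // only the chosen axis shrinks to k
  return dims;
}

int main() {
  auto d = TopKOutputDims({4, 5, 6}, -1, 2);  // -> {4, 5, 2}
  for (auto v : d) std::cout << v << " ";
  std::cout << "\n";
}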
paddle/fluid/operators/transpose_op_mlu.cc
deleted 100644 → 0
View file @ 0e3f7ab1
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class TransposeMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
    out->mutable_data<T>(ctx.device_context().GetPlace());

    TransposeFromMLUTensor<T>(
        ctx, axis, x, out, false /*need_reshape_or_alloc*/);
  }
};

template <typename T>
class TransposeGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
    std::vector<int> reversed_axis(axis);
    for (size_t i = 0; i < axis.size(); i++) {
      reversed_axis[axis[i]] = i;
    }
    x_grad->mutable_data<T>(ctx.GetPlace());

    TransposeFromMLUTensor<T>(
        ctx, reversed_axis, out_grad, x_grad, false /*need_reshape_or_alloc*/);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(transpose2,
                       ops::TransposeMLUKernel<float>,
                       ops::TransposeMLUKernel<paddle::platform::float16>,
                       ops::TransposeMLUKernel<int>,
                       ops::TransposeMLUKernel<int16_t>,
                       ops::TransposeMLUKernel<uint8_t>,
                       ops::TransposeMLUKernel<int8_t>,
                       ops::TransposeMLUKernel<bool>);

REGISTER_OP_MLU_KERNEL(transpose2_grad,
                       ops::TransposeGradMLUKernel<float>,
                       ops::TransposeGradMLUKernel<paddle::platform::float16>,
                       ops::TransposeGradMLUKernel<int>,
                       ops::TransposeGradMLUKernel<int16_t>,
                       ops::TransposeGradMLUKernel<uint8_t>,
                       ops::TransposeGradMLUKernel<int8_t>,
                       ops::TransposeGradMLUKernel<bool>);
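The grad kernel reuses the forward transpose by feeding it the inverse permutation: reversed_axis[axis[i]] = i. A standalone sketch of that inversion (plain C++, illustrative):

// Standalone sketch (illustrative): invert a permutation so that transposing
// dOut by the inverse of `axis` recovers dX.
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<int> InversePermutation(const std::vector<int>& axis) {
  std::vector<int> reversed(axis.size());
  for (size_t i = 0; i < axis.size(); ++i) {
    reversed[axis[i]] = static_cast<int>(i);  // where dim i ended up
  }
  return reversed;
}

int main() {
  auto inv = InversePermutation({0, 2, 1, 3});  // this perm is its own inverse
  for (auto v : inv) std::cout << v << " ";     // prints: 0 2 1 3
  std::cout << "\n";
}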
paddle/fluid/operators/tril_triu_op_mlu.cc
deleted 100644 → 0
View file @ 0e3f7ab1
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class TrilTriuMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    int diagonal = ctx.Attr<int>("diagonal");
    bool lower = ctx.Attr<bool>("lower");

    bool upper;
    if (lower) {
      upper = 0;
    } else {
      upper = 1;
    }

    out->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnlTensorDesc out_desc(*out);
    MLUCnnl::TrilTriu(ctx,
                      diagonal,
                      upper,
                      x_desc.get(),
                      GetBasePtr(x),
                      out_desc.get(),
                      GetBasePtr(out));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(tril_triu,
                       ops::TrilTriuMLUKernel<float>,
                       ops::TrilTriuMLUKernel<int32_t>,
                       ops::TrilTriuMLUKernel<plat::float16>);
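The kernel simply forwards diagonal and an upper/lower flag to CNNL; the underlying masking rule is that element (i, j) survives when j - i <= diagonal for tril and j - i >= diagonal for triu. A standalone sketch of that rule (plain C++, illustrative, not the CNNL call):

// Standalone sketch (illustrative): keep the lower or upper triangle of a
// row-major matrix relative to the given diagonal, zeroing everything else.
#include <iostream>
#include <vector>

std::vector<float> TrilTriuRef(const std::vector<float>& in, int rows,
                               int cols, int diagonal, bool upper) {
  std::vector<float> out(in.size(), 0.f);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const bool keep = upper ? (j - i >= diagonal) : (j - i <= diagonal);
      if (keep) out[i * cols + j] = in[i * cols + j];
    }
  }
  return out;
}

int main() {
  std::vector<float> m = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  auto lower = TrilTriuRef(m, 3, 3, /*diagonal=*/0, /*upper=*/false);
  for (auto v : lower) std::cout << v << " ";  // 1 0 0 4 5 0 7 8 9
  std::cout << "\n";
}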
paddle/fluid/operators/truncated_gaussian_random_op_mlu.cc
deleted 100644 → 0
View file @ 0e3f7ab1
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <limits>
#include <random>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
#include "paddle/phi/core/generator.h"

namespace paddle {
namespace operators {

template <typename T>
class TruncatedGaussianRandomMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    float mean = context.Attr<float>("mean");
    float std = context.Attr<float>("std");
    auto* tensor = context.Output<phi::DenseTensor>("Out");
    tensor->mutable_data<T>(context.GetPlace());

    phi::DenseTensor cpu_tensor(tensor->dtype());
    cpu_tensor.Resize(tensor->dims());
    T* data_cpu = cpu_tensor.mutable_data<T>(platform::CPUPlace());

    std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
                                           1.0);
    TruncatedNormal<T> truncated_normal(mean, std);
    int64_t size = tensor->numel();

    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
    auto engine = phi::GetCPURandomEngine(seed);
    for (int64_t i = 0; i < size; ++i) {
      data_cpu[i] = truncated_normal(dist(*engine));
    }

    auto& dev_ctx =
        context.template device_context<platform::MLUDeviceContext>();
    framework::TensorCopy(cpu_tensor, context.GetPlace(), dev_ctx, tensor);
    dev_ctx.Wait();
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(truncated_gaussian_random,
                       ops::TruncatedGaussianRandomMLUKernel<float>);
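The kernel above fills the buffer on the CPU by pushing uniform draws through Paddle's TruncatedNormal functor (an inverse-CDF transform) and then copies the result to the MLU. As a rough illustration of the target distribution only, here is a simple rejection sampler clipped to two standard deviations; it is not the functor the kernel uses, and all names are illustrative:

// Standalone sketch (illustrative only): draw from N(mean, std) truncated to
// [mean - 2*std, mean + 2*std] by rejection sampling.
#include <iostream>
#include <random>
#include <vector>

std::vector<float> TruncatedNormalSamples(float mean, float stddev, int n,
                                          unsigned int seed) {
  std::mt19937 engine(seed);
  std::normal_distribution<float> dist(mean, stddev);
  std::vector<float> out;
  out.reserve(n);
  while (static_cast<int>(out.size()) < n) {
    float v = dist(engine);
    // keep only draws within two standard deviations of the mean
    if (v >= mean - 2.f * stddev && v <= mean + 2.f * stddev) out.push_back(v);
  }
  return out;
}

int main() {
  auto samples = TruncatedNormalSamples(0.f, 1.f, 5, /*seed=*/42);
  for (auto v : samples) std::cout << v << " ";
  std::cout << "\n";
}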
paddle/fluid/operators/uniform_random_op_mlu.cc
deleted 100644 → 0
View file @ 0e3f7ab1
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/uniform_random_op.h"
#include "paddle/phi/core/generator.h"

namespace paddle {
namespace operators {

template <typename T>
class MLUUniformRandomKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    phi::DenseTensor* tensor = nullptr;
    auto out_var = ctx.OutputVar("Out");
    std::vector<int64_t> new_shape;
    auto list_new_shape_tensor =
        ctx.MultiInput<phi::DenseTensor>("ShapeTensorList");
    if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) {
      if (ctx.HasInput("ShapeTensor")) {
        auto* shape_tensor = ctx.Input<phi::DenseTensor>("ShapeTensor");
        new_shape = GetNewDataFromShapeTensor(shape_tensor);
      } else if (list_new_shape_tensor.size() > 0) {
        new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor);
      }
    }

    if (out_var->IsType<phi::SelectedRows>()) {
      auto* selected_rows = out_var->GetMutable<phi::SelectedRows>();
      tensor = selected_rows->mutable_value();
      auto shape = ctx.Attr<std::vector<int64_t>>("shape");
      if (!new_shape.empty()) shape = new_shape;
      tensor->Resize(phi::make_ddim(shape));
      selected_rows->mutable_rows()->reserve(shape[0]);
    } else if (out_var->IsType<phi::DenseTensor>()) {
      tensor = out_var->GetMutable<phi::DenseTensor>();
      if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape));
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Expected type of Output(out) in uniform_random_op must be "
          "phi::DenseTensor, "
          "SelectedRows. But got "
          "unsupport type: %s.",
          framework::ToTypeName(out_var->Type())));
    }
    tensor->mutable_data<T>(ctx.GetPlace());

    int64_t size = tensor->numel();

    phi::DenseTensor cpu_tensor(tensor->dtype());
    cpu_tensor.Resize(tensor->dims());
    T* data_cpu = cpu_tensor.mutable_data<T>(platform::CPUPlace());

    std::uniform_real_distribution<T> dist(
        static_cast<T>(ctx.Attr<float>("min")),
        static_cast<T>(ctx.Attr<float>("max")));
    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
    auto engine = phi::GetCPURandomEngine(seed);

    for (int64_t i = 0; i < size; ++i) {
      data_cpu[i] = dist(*engine);
    }

    unsigned int diag_num = static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
    unsigned int diag_step =
        static_cast<unsigned int>(ctx.Attr<int>("diag_step"));
    auto diag_val = static_cast<T>(ctx.Attr<float>("diag_val"));
    if (diag_num > 0) {
      PADDLE_ENFORCE_GT(
          size,
          (diag_num - 1) * (diag_step + 1),
          platform::errors::InvalidArgument(
              "ShapeInvalid: the diagonal's elements is equal (num-1) "
              "* (step-1) with num %d, step %d,"
              "It should be smaller than %d, but received %d",
              diag_num,
              diag_step,
              (diag_num - 1) * (diag_step + 1),
              size));
      for (int64_t i = 0; i < diag_num; ++i) {
        int64_t pos = i * diag_step + i;
        data_cpu[pos] = diag_val;
      }
    }

    // copy to MLU
    framework::TensorCopy(
        cpu_tensor,
        ctx.GetPlace(),
        ctx.template device_context<platform::DeviceContext>(),
        tensor);
    ctx.template device_context<paddle::platform::MLUDeviceContext>().Wait();
  }
};

}  // namespace operators
}  // namespace paddle

REGISTER_OP_MLU_KERNEL(uniform_random,
                       paddle::operators::MLUUniformRandomKernel<float>);
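Besides the plain uniform fill, the kernel honours diag_num/diag_step/diag_val by overwriting positions i * diag_step + i of the flattened buffer, i.e. every (diag_step + 1)-th element, before the copy to the device. A standalone sketch of just that overwrite (illustrative):

// Standalone sketch (illustrative): overwrite "diagonal" positions of a
// flattened buffer after a uniform fill.
#include <iostream>
#include <vector>

void FillDiagonal(std::vector<float>* data, int diag_num, int diag_step,
                  float diag_val) {
  for (int i = 0; i < diag_num; ++i) {
    const int pos = i * diag_step + i;  // stride of diag_step + 1 elements
    (*data)[pos] = diag_val;
  }
}

int main() {
  std::vector<float> buf(8, 0.5f);
  FillDiagonal(&buf, /*diag_num=*/3, /*diag_step=*/2, /*diag_val=*/1.f);
  for (auto v : buf) std::cout << v << " ";  // 1 0.5 0.5 1 0.5 0.5 1 0.5
  std::cout << "\n";
}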
paddle/fluid/operators/unsqueeze_op_mlu.cc
deleted 100644 → 0
View file @ 0e3f7ab1
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_MLU
#include <memory>
#include <string>

#include "paddle/fluid/operators/unsqueeze_op.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(
    unsqueeze,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, float>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, double>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, plat::float16>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, bool>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, int>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, int8_t>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    unsqueeze2,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, float>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, double>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, plat::float16>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, bool>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, int>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, int8_t>,
    ops::UnsqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    unsqueeze_grad,
    ops::UnsqueezeGradKernel<plat::MLUDeviceContext, float>,
    ops::UnsqueezeGradKernel<plat::MLUDeviceContext, double>,
    ops::UnsqueezeGradKernel<plat::MLUDeviceContext, plat::float16>,
    ops::UnsqueezeGradKernel<plat::MLUDeviceContext, bool>,
    ops::UnsqueezeGradKernel<plat::MLUDeviceContext, int>,
    ops::UnsqueezeGradKernel<plat::MLUDeviceContext, int8_t>,
    ops::UnsqueezeGradKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
    unsqueeze2_grad,
    ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, float>,
    ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, double>,
    ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, plat::float16>,
    ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, bool>,
    ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, int>,
    ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, int8_t>,
    ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, int64_t>);
#endif
paddle/fluid/operators/unstack_op_mlu.cc
deleted 100644 → 0
View file @ 0e3f7ab1
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class UnStackMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto out = ctx.MultiOutput<phi::DenseTensor>("Y");
    int axis = ctx.Attr<int>("axis");
    if (axis < 0) axis += x->dims().size();
    int num = x->dims()[axis];

    std::vector<MLUCnnlTensorDesc> out_descs;
    std::vector<cnnlTensorDescriptor_t> out_raw_descs;
    std::vector<void*> out_ptrs;
    std::vector<int64_t> new_dims = phi::vectorize(x->dims());
    new_dims[axis] = 1;
    for (int i = 0; i < num; i++) {
      out[i]->mutable_data<T>(ctx.GetPlace());
      out_descs.emplace_back(MLUCnnlTensorDesc(
          new_dims.size(), new_dims.data(), ToCnnlDataType<T>()));
      out_raw_descs.push_back(out_descs.back().get());
      out_ptrs.push_back(GetBasePtr(out[i]));
    }

    MLUCnnlTensorDesc x_desc(*x);
    MLUCnnl::Split(ctx,
                   num,
                   axis,
                   x_desc.get(),
                   GetBasePtr(x),
                   out_raw_descs.data(),
                   out_ptrs.data());
  }
};

template <typename T>
class UnStackGradMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto x = ctx.MultiInput<phi::DenseTensor>(framework::GradVarName("Y"));
    auto* y = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    int axis = ctx.Attr<int>("axis");
    if (axis < 0) axis += (x[0]->dims().size() + 1);
    int num = static_cast<int>(x.size());

    std::vector<MLUCnnlTensorDesc> x_descs;
    std::vector<cnnlTensorDescriptor_t> x_raw_descs;
    std::vector<const void*> x_ptrs;
    for (int i = 0; i < num; i++) {
      if (x[i]->dims().size() != 0) {
        std::vector<int64_t> in_dims = phi::vectorize(x[i]->dims());
        in_dims.insert(in_dims.begin() + axis, 1);
        x_descs.emplace_back(MLUCnnlTensorDesc(
            in_dims.size(), in_dims.data(), ToCnnlDataType<T>()));
      } else {
        int input_dims = 1;
        x_descs.emplace_back(
            MLUCnnlTensorDesc(1, &input_dims, ToCnnlDataType<T>()));
      }
      x_raw_descs.push_back(x_descs.back().get());
      x_ptrs.push_back(GetBasePtr(x[i]));
    }
    y->mutable_data<T>(ctx.GetPlace());

    MLUCnnlTensorDesc y_desc(*y);
    MLUCnnl::Concat(ctx,
                    num,
                    axis,
                    x_raw_descs.data(),
                    x_ptrs.data(),
                    y_desc.get(),
                    GetBasePtr(y));
  }
};

}  // namespace operators
}  // namespace paddle

namespace plat = paddle::platform;
namespace ops = paddle::operators;

REGISTER_OP_MLU_KERNEL(unstack,
                       ops::UnStackMLUKernel<float>,
                       ops::UnStackMLUKernel<plat::float16>);

REGISTER_OP_MLU_KERNEL(unstack_grad,
                       ops::UnStackGradMLUKernel<float>,
                       ops::UnStackGradMLUKernel<plat::float16>);
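In the grad kernel each incoming slice gets a length-1 dimension inserted at axis so the pieces concatenate back into the stacked gradient; zero-rank slices fall back to a shape of {1}. A standalone sketch of the dimension insertion (illustrative):

// Standalone sketch (illustrative): insert a unit dimension at `axis` so a
// slice can be concatenated back along that axis.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> InsertUnitDim(std::vector<int64_t> dims, int axis) {
  dims.insert(dims.begin() + axis, 1);  // e.g. {4, 5} with axis=1 -> {4, 1, 5}
  return dims;
}

int main() {
  auto d = InsertUnitDim({4, 5}, 1);
  for (auto v : d) std::cout << v << " ";  // 4 1 5
  std::cout << "\n";
}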
paddle/fluid/operators/where_index_op_mlu.cc
deleted 100644 → 0
View file @ 0e3f7ab1
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <vector>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename T>
class MLUWhereIndexKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* condition = context.Input<phi::DenseTensor>("Condition");
    auto* out = context.Output<phi::DenseTensor>("Out");
    auto dims = condition->dims();
    const int rank = dims.size();

    phi::DenseTensor num_true;
    num_true.mutable_data<int>({1}, context.GetPlace());
    MLUCnnlTensorDesc con_desc(*condition);
    MLUCnnlTensorDesc num_true_desc(num_true);
    MLUCnnl::NumTrue(context,
                     con_desc.get(),
                     GetBasePtr(condition),
                     num_true_desc.get(),
                     GetBasePtr(&num_true));

    phi::DenseTensor local_true_num;
    paddle::framework::TensorCopySync(
        num_true, platform::CPUPlace(), &local_true_num);
    auto true_num = *local_true_num.data<int>();

    out->Resize(phi::make_ddim({true_num, rank}));
    out->mutable_data<int64_t>(context.GetPlace());

    if (true_num == 0) {
      return;
    }

    auto& dev_ctx = context.template device_context<MLUDeviceContext>();
    phi::DenseTensor out_int32 =
        context.AllocateTmpTensor<int32_t, MLUDeviceContext>(out->dims(),
                                                             dev_ctx);
    MLUCnnlTensorDesc out_int32_desc(out_int32);
    MLUCnnlTensorDesc out_desc(*out);
    bool as_tuple = false;
    MLUCnnl::Where(context,
                   con_desc.get(),
                   GetBasePtr(condition),
                   num_true_desc.get(),
                   GetBasePtr(&num_true),
                   as_tuple,
                   out_int32_desc.get(),
                   GetBasePtr(&out_int32));
    cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
    MLUCnnl::Cast(context,
                  cast_type,
                  out_int32_desc.get(),
                  GetBasePtr(&out_int32),
                  out_desc.get(),
                  GetBasePtr(out));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(where_index,
                       ops::MLUWhereIndexKernel<int>,
                       ops::MLUWhereIndexKernel<bool>,
                       ops::MLUWhereIndexKernel<float>);
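The kernel first asks CNNL for the number of true elements, resizes the output to (num_true x rank), and only then extracts the coordinates, casting them from int32 to int64. A standalone CPU sketch of the same nonzero semantics for a 2-D mask (illustrative, not the CNNL API):

// Standalone sketch (illustrative): turn a boolean mask into a flattened
// (num_true x 2) matrix of int64 coordinates.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> NonzeroCoords2D(const std::vector<bool>& mask, int rows,
                                     int cols) {
  std::vector<int64_t> coords;  // row-major pairs (i, j)
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      if (mask[i * cols + j]) {
        coords.push_back(i);
        coords.push_back(j);
      }
    }
  }
  return coords;
}

int main() {
  std::vector<bool> mask = {true, false, false, true};
  auto coords = NonzeroCoords2D(mask, 2, 2);
  for (size_t i = 0; i < coords.size(); i += 2)
    std::cout << "(" << coords[i] << "," << coords[i + 1] << ") ";  // (0,0) (1,1)
  std::cout << "\n";
}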
paddle/fluid/operators/where_op_mlu.cc
deleted 100644 → 0
View file @ 0e3f7ab1
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class WhereMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* condition = context.Input<phi::DenseTensor>("Condition");
    auto* X = context.Input<phi::DenseTensor>("X");
    auto* Y = context.Input<phi::DenseTensor>("Y");
    auto* out = context.Output<phi::DenseTensor>("Out");

    auto place = context.GetPlace();
    out->mutable_data<T>(place);

    MLUCnnlTensorDesc x_desc(*X);
    MLUCnnlTensorDesc y_desc(*Y);
    MLUCnnlTensorDesc condition_desc(*condition);
    MLUCnnlTensorDesc out_desc(*out);

    MLUCnnl::Select(context,
                    condition_desc.get(),
                    GetBasePtr(condition),
                    x_desc.get(),
                    GetBasePtr(X),
                    y_desc.get(),
                    GetBasePtr(Y),
                    out_desc.get(),
                    GetBasePtr(out));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
    where,
    ops::WhereMLUKernel<paddle::platform::MLUDeviceContext, float>,
    ops::WhereMLUKernel<paddle::platform::MLUDeviceContext, int>);
#endif
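The select the kernel delegates to CNNL is a plain element-wise choice: out takes X where the condition holds and Y otherwise (broadcasting aside). A standalone sketch (illustrative):

// Standalone sketch (illustrative): element-wise select between two
// same-shaped buffers driven by a boolean condition.
#include <iostream>
#include <vector>

std::vector<float> Select(const std::vector<bool>& cond,
                          const std::vector<float>& x,
                          const std::vector<float>& y) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = cond[i] ? x[i] : y[i];
  }
  return out;
}

int main() {
  auto out = Select({true, false, true}, {1, 2, 3}, {9, 8, 7});
  for (auto v : out) std::cout << v << " ";  // 1 8 3
  std::cout << "\n";
}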