PaddlePaddle / Paddle
Commit 599a201f (unverified)
Authored Apr 10, 2023 by jjyaoao; committed via GitHub on Apr 10, 2023
delete paddle/fluid/operators/elementwise/*_npu.* (#52675)
Parent: 0f3bbe10
Showing 11 changed files with 0 additions and 1858 deletions (+0 −1858).
paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc        +0 −161
paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc        +0 −179
paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc   +0 −49
paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc        +0 −251
paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc        +0 −224
paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc        +0 −71
paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc        +0 −160
paddle/fluid/operators/elementwise/elementwise_npu.h                +0 −147
paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc       +0 −185
paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc        +0 −242
paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc        +0 −189
paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc (deleted, file mode 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {

template <typename T>
class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    int axis = ctx.Attr<int>("axis");

    bool direct_compute = false;
    auto x_dims = x->dims();
    auto y_dims = y->dims();
    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
    if (x_dims.size() == y_dims.size()) {
      direct_compute = true;
    } else if (x_dims.size() > y_dims.size()) {
      direct_compute = x_dims.size() == (y_dims.size() + axis);
    } else {
      direct_compute = y_dims.size() == (x_dims.size() + axis);
    }

    if (direct_compute) {
      const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
      runner.Run(dev_ctx.stream());
    } else {
      phi::DenseTensor transformed_x, transformed_y;
      NpuElementWiseOpBroadcast<T>(
          dev_ctx, x, y, axis, &transformed_x, &transformed_y);
      const auto& runner =
          NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {});
      runner.Run(dev_ctx.stream());
    }
  }
};

template <typename T>
class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
    axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
    auto stream = dev_ctx.stream();

    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
      if (dx->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec;
        std::vector<int> reduce_axes;
        auto src_dims = dx->dims();
        auto dout_dims = dout->dims();
        int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis || ax >= src_axis + src_dims.size()) ||
              (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) {
            reduce_axes.push_back(ax);
          } else {
            dst_dims_vec.push_back(dout_dims[ax]);
          }
        }
        if (!reduce_axes.empty()) {
          phi::DenseTensor tmp;
          tmp.ShareDataWith(*dx);
          tmp.Resize(phi::make_ddim(dst_dims_vec));
          const auto& runner =
              NpuOpRunner("ReduceSumD",
                          {*dout},
                          {tmp},
                          {{"axes", reduce_axes}, {"keep_dims", false}});
          runner.Run(stream);
        }
      } else {
        framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
      }
    }
    if (dy) {
      dy->mutable_data<T>(ctx.GetPlace());
      if (dy->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec;
        std::vector<int> reduce_axes;
        auto src_dims = dy->dims();
        auto dout_dims = dout->dims();
        int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis || ax >= src_axis + src_dims.size()) ||
              (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) {
            reduce_axes.push_back(ax);
          } else {
            dst_dims_vec.push_back(dout_dims[ax]);
          }
        }
        if (!reduce_axes.empty()) {
          phi::DenseTensor tmp;
          tmp.ShareDataWith(*dy);
          tmp.Resize(phi::make_ddim(dst_dims_vec));
          const auto& runner =
              NpuOpRunner("ReduceSumD",
                          {*dout},
                          {tmp},
                          {{"axes", reduce_axes}, {"keep_dims", false}});
          runner.Run(stream);
        }
      } else {
        framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy);
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    elementwise_add,
    ops::ElementwiseAddNPUKernel<float>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ElementwiseAddNPUKernel<int64_t>,
#endif
    ops::ElementwiseAddNPUKernel<plat::float16>);

REGISTER_OP_NPU_KERNEL(
    elementwise_add_grad,
    ops::ElementwiseAddGradNPUKernel<float>,
    ops::ElementwiseAddGradNPUKernel<plat::float16>);
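Aside from the diff itself: the `axis` handling above is the part most worth internalizing when reading the other deleted kernels, since they all repeat it. Below is a minimal standalone sketch of the same direct-compute decision, using plain `std::vector<int>` shapes in place of `phi::DDim`; the helper name `DirectCompute` is ours for illustration, not a Paddle API.

#include <cstdio>
#include <cstdlib>
#include <vector>

// Mirrors the deleted kernel's axis handling: axis == -1 means "align
// trailing dimensions", i.e. axis becomes |rank(x) - rank(y)|.
bool DirectCompute(const std::vector<int>& x_dims,
                   const std::vector<int>& y_dims, int axis) {
  int rx = static_cast<int>(x_dims.size());
  int ry = static_cast<int>(y_dims.size());
  axis = (axis == -1 ? std::abs(rx - ry) : axis);
  if (rx == ry) return true;            // same rank: run the CANN Add directly
  if (rx > ry) return rx == ry + axis;  // y aligns with x's trailing dims
  return ry == rx + axis;               // x aligns with y's trailing dims
}

int main() {
  // (2,3,4) + (3,4) with axis=-1: axis becomes 1, trailing dims align.
  std::printf("%d\n", DirectCompute({2, 3, 4}, {3, 4}, -1));  // prints 1
  // (2,3,4) + (3,) with axis=0: ranks do not line up, needs broadcast.
  std::printf("%d\n", DirectCompute({2, 3, 4}, {3}, 0));      // prints 0
}

When the check fails, the kernel falls back to `NpuElementWiseOpBroadcast` (defined in the deleted elementwise_npu.h below), which materializes both operands at the common shape before invoking the CANN op.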
paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc (deleted, file mode 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ElementwiseDivNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");

    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    const auto& runner = NpuOpRunner("Div", {*x, *y}, {*out}, {});
    runner.Run(stream);
  }
};

template <typename DeviceContext, typename T>
class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out = ctx.Input<phi::DenseTensor>("Out");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));

    auto place = ctx.GetPlace();

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    if (dx) {
      dx->mutable_data<T>(place);

      phi::DenseTensor tensor_one(y->type());
      tensor_one.mutable_data<float>({1}, place);
      FillNpuTensorWithConstant<float>(&tensor_one, static_cast<float>(1.0));

      // Use `Div` CANN OP to achieve `1/y` instead of `Power` CANN OP.
      // Because `Power` will cause precision overflow, that is, `float_status`
      // will be set to 1.
      phi::DenseTensor y_div(y->type());
      y_div.mutable_data<T>(y->dims(), place);
      const auto& runner_one_div_y =
          NpuOpRunner("Div", {tensor_one, *y}, {y_div}, {});
      runner_one_div_y.Run(stream);

      phi::DenseTensor tensor_zeros(x->type());
      tensor_zeros.mutable_data<T>(x->dims(), place);
      const auto& runner_tensor_zeros =
          NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {});
      runner_tensor_zeros.Run(stream);

      phi::DenseTensor x_zero(phi::DataType::BOOL);
      x_zero.mutable_data<bool>(x->dims(), place);
      const auto& runner_x_zero =
          NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {});
      runner_x_zero.Run(stream);

      phi::DenseTensor x_nozero(phi::DataType::BOOL);
      x_nozero.mutable_data<bool>(x->dims(), place);
      const auto& runner_x_nonzero =
          NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {});
      runner_x_nonzero.Run(stream);

      phi::DenseTensor x_nozero_f(x->type());
      x_nozero_f.mutable_data<T>(x->dims(), place);
      const auto& runner_x_nonzero_f =
          NpuOpRunner("Cast",
                      {x_nozero},
                      {x_nozero_f},
                      {{"dst_type", static_cast<int32_t>(0)}});
      runner_x_nonzero_f.Run(stream);

      phi::DenseTensor x_grad_w(x->type());
      x_grad_w.mutable_data<T>(x->dims(), place);
      const auto& runner_x_grad_w =
          NpuOpRunner("Mul", {x_nozero_f, y_div}, {x_grad_w}, {});
      runner_x_grad_w.Run(stream);

      const auto& runner_x_grad =
          NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {});
      runner_x_grad.Run(stream);
    }

    if (dy) {
      dy->mutable_data<T>(place);

      phi::DenseTensor neg_out(out->type());
      neg_out.mutable_data<T>(out->dims(), place);
      const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {});
      runner_neg_out.Run(stream);

      phi::DenseTensor tmp_mul(out->type());
      tmp_mul.mutable_data<T>(out->dims(), place);
      const auto& runner_mul =
          NpuOpRunner("Mul", {neg_out, *dout}, {tmp_mul}, {});
      runner_mul.Run(stream);

      if (dy->dims() != dout->dims()) {
        phi::DenseTensor reduced_tmp_mul(y->type());
        reduced_tmp_mul.mutable_data<T>(y->dims(), place);

        std::vector<int64_t> axes;
        int64_t diff = dout->dims().size() - dy->dims().size();
        for (int64_t i = 0; i < dout->dims().size(); ++i) {
          if (i < diff) {
            axes.push_back(i);
            continue;
          }
          if (dout->dims()[i] > dy->dims()[i - diff]) {
            axes.push_back(i);
          }
        }
        const auto& runner_reduce =
            NpuOpRunner("ReduceSumD",
                        {tmp_mul},
                        {reduced_tmp_mul},
                        {{"axes", axes}, {"keep_dims", false}});
        runner_reduce.Run(stream);

        const auto& runner_y_grad =
            NpuOpRunner("Div", {reduced_tmp_mul, *y}, {*dy}, {});
        runner_y_grad.Run(stream);
      } else {
        const auto& runner_y_grad =
            NpuOpRunner("Div", {tmp_mul, *y}, {*dy}, {});
        runner_y_grad.Run(stream);
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    elementwise_div,
    ops::ElementwiseDivNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::ElementwiseDivNPUKernel<paddle::platform::NPUDeviceContext,
                                 paddle::platform::float16>);

REGISTER_OP_NPU_KERNEL(
    elementwise_div_grad,
    ops::ElementwiseDivGradNPUKernel<paddle::platform::NPUDeviceContext,
                                     float>,
    ops::ElementwiseDivGradNPUKernel<paddle::platform::NPUDeviceContext,
                                     paddle::platform::float16>);
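As a reading aid (not part of the deleted file): the chain of CANN ops in the grad kernel assembles the usual quotient-rule gradients, with a zero mask on the x-gradient where x == 0:

\[
\frac{\partial L}{\partial x} \;=\; \mathbf{1}[x \neq 0]\cdot\frac{1}{y}\cdot\frac{\partial L}{\partial \mathrm{out}},
\qquad
\frac{\partial L}{\partial y} \;=\; -\,\frac{\mathrm{out}}{y}\cdot\frac{\partial L}{\partial \mathrm{out}}
\;=\; -\,\frac{x}{y^{2}}\cdot\frac{\partial L}{\partial \mathrm{out}},
\]

with `ReduceSumD` folding any broadcast axes of the dy term back down to Y's shape before the final `Div`.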
paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc (deleted, file mode 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
namespace paddle {
namespace operators {

template <typename T>
class ElementwiseFloorDivNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");

    out->mutable_data<T>(ctx.GetPlace());

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    const auto& runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {});
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(elementwise_floordiv,
                       ops::ElementwiseFloorDivNPUKernel<int>,
                       ops::ElementwiseFloorDivNPUKernel<int64_t>);
paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc (deleted, file mode 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ElementwiseMaxNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    int axis = ctx.Attr<int>("axis");
    bool direct_compute = false;
    auto x_dims = x->dims();
    auto y_dims = y->dims();
    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
    if (x_dims.size() >= y_dims.size()) {
      direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
    } else {
      direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
    }

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    if (direct_compute) {
      const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {});
      runner.Run(stream);
    } else {
      phi::DenseTensor transformed_x, transformed_y;
      NpuElementWiseOpBroadcast<T>(
          dev_ctx, x, y, axis, &transformed_x, &transformed_y);
      const auto& runner =
          NpuOpRunner("Maximum", {transformed_x, transformed_y}, {*out}, {});
      runner.Run(stream);
    }
  }
};

template <typename DeviceContext, typename T>
class ElementwiseMaxGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
    // The ascend elementwise_max_grad op only supports broadcast
    // when axis is -1, and requires all the inputs must have the
    // same shape when axis is not -1. For convenience, we should
    // broadcast the original input x and y to transformed_x and
    // transformed_x firstly, then use tmp tensor to get the op
    // output, last reduce the tmp tensor shape to match the
    // paddle output.
    auto x_dims = x->dims();
    auto y_dims = y->dims();
    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
    phi::DenseTensor transformed_x, transformed_y;
    NpuElementWiseOpBroadcast<T>(
        dev_ctx, x, y, axis, &transformed_x, &transformed_y);

    auto dout_dims = dout->dims();
    auto stream = dev_ctx.stream();
    framework::NPUAttributeMap attr_input = {{"grad_x", true},
                                             {"grad_y", true}};
    // Reshape info vector.
    std::vector<int> reduce_axes;

    if (dx && dy) {
      dx->mutable_data<T>(ctx.GetPlace());
      dy->mutable_data<T>(ctx.GetPlace());
      phi::DenseTensor tmp_dx;
      tmp_dx.mutable_data<T>(dout_dims, ctx.GetPlace());
      phi::DenseTensor tmp_dy;
      tmp_dy.mutable_data<T>(dout_dims, ctx.GetPlace());
      const auto& runner = NpuOpRunner("MaximumGrad",
                                       {*dout, transformed_x, transformed_y},
                                       {tmp_dx, tmp_dy},
                                       attr_input);
      runner.Run(stream);

      if (x_dims != dout_dims) {
        reduce_axes.clear();
        int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis || ax >= src_axis + x_dims.size()) ||
              (dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) {
            reduce_axes.push_back(ax);
          }
        }
        if (!reduce_axes.empty()) {
          const auto& runner =
              NpuOpRunner("ReduceSumD",
                          {tmp_dx},
                          {*dx},
                          {{"axes", reduce_axes}, {"keep_dims", false}});
          runner.Run(stream);
        }
      } else {
        framework::TensorCopy(tmp_dx, ctx.GetPlace(), dev_ctx, dx);
      }
      if (y_dims != dout_dims) {
        reduce_axes.clear();
        int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis || ax >= src_axis + y_dims.size()) ||
              (dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) {
            reduce_axes.push_back(ax);
          }
        }
        if (!reduce_axes.empty()) {
          const auto& runner =
              NpuOpRunner("ReduceSumD",
                          {tmp_dy},
                          {*dy},
                          {{"axes", reduce_axes}, {"keep_dims", false}});
          runner.Run(stream);
        }
      } else {
        framework::TensorCopy(tmp_dy, ctx.GetPlace(), dev_ctx, dy);
      }
    } else if (dx) {
      phi::DenseTensor zero_tensor(dout->type());
      zero_tensor.mutable_data<T>(dout_dims, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));

      dx->mutable_data<T>(ctx.GetPlace());
      phi::DenseTensor tmp_dx;
      tmp_dx.mutable_data<T>(dout_dims, ctx.GetPlace());
      const auto& runner = NpuOpRunner("MaximumGrad",
                                       {*dout, transformed_x, transformed_y},
                                       {tmp_dx, zero_tensor},
                                       attr_input);
      runner.Run(stream);

      if (x_dims != dout_dims) {
        reduce_axes.clear();
        int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis || ax >= src_axis + x_dims.size()) ||
              (dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) {
            reduce_axes.push_back(ax);
          }
        }
        if (!reduce_axes.empty()) {
          const auto& runner =
              NpuOpRunner("ReduceSumD",
                          {tmp_dx},
                          {*dx},
                          {{"axes", reduce_axes}, {"keep_dims", false}});
          runner.Run(stream);
        }
      } else {
        framework::TensorCopy(tmp_dx, ctx.GetPlace(), dev_ctx, dx);
      }
    } else if (dy) {
      phi::DenseTensor zero_tensor(dout->type());
      zero_tensor.mutable_data<T>(dout_dims, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));

      dy->mutable_data<T>(ctx.GetPlace());
      phi::DenseTensor tmp_dy;
      tmp_dy.mutable_data<T>(dout_dims, ctx.GetPlace());
      const auto& runner = NpuOpRunner("MaximumGrad",
                                       {*dout, transformed_x, transformed_y},
                                       {zero_tensor, tmp_dy},
                                       attr_input);
      runner.Run(stream);

      if (y_dims != dout_dims) {
        reduce_axes.clear();
        int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis || ax >= src_axis + y_dims.size()) ||
              (dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) {
            reduce_axes.push_back(ax);
          }
        }
        if (!reduce_axes.empty()) {
          const auto& runner =
              NpuOpRunner("ReduceSumD",
                          {tmp_dy},
                          {*dy},
                          {{"axes", reduce_axes}, {"keep_dims", false}});
          runner.Run(stream);
        }
      } else {
        framework::TensorCopy(tmp_dy, ctx.GetPlace(), dev_ctx, dy);
      }
    } else {
      PADDLE_THROW(platform::errors::Unavailable(
          "Do not support all outputs to be empty."));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    elementwise_max,
    ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, float>,
    ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, double>,
    ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, int>,
    ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, int64_t>);

REGISTER_OP_NPU_KERNEL(
    elementwise_max_grad,
    ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, float>,
    ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, double>,
    ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, int>);
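The comment in `ElementwiseMaxGradNPUKernel` explains the broadcast-then-reduce workaround; the axis-selection rule it relies on is easiest to see in isolation. Here is a host-side sketch of that rule alone (`ReduceAxes` is a hypothetical stand-in using `std::vector<int>` shapes, not a Paddle API):

#include <vector>

// Axes of the broadcast gradient (shaped like dout) that must be
// sum-reduced to recover a gradient shaped like src: every axis outside
// src's aligned window, plus every axis src broadcast from size 1.
std::vector<int> ReduceAxes(const std::vector<int>& src_dims,
                            const std::vector<int>& dout_dims, int axis) {
  int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0);
  std::vector<int> reduce_axes;
  for (int ax = 0; ax < static_cast<int>(dout_dims.size()); ++ax) {
    if ((ax < src_axis ||
         ax >= src_axis + static_cast<int>(src_dims.size())) ||
        (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) {
      reduce_axes.push_back(ax);
    }
  }
  return reduce_axes;
}
// e.g. src = (3, 1), dout = (2, 3, 5), axis = 1  ->  axes {0, 2}:
// axis 0 lies before the window, axis 2 was broadcast from size 1.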
paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc (deleted, file mode 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ElementwiseMinNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);

    int axis = ctx.Attr<int>("axis");
    bool direct_compute = false;
    auto x_dims = x->dims();
    auto y_dims = y->dims();
    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
    if (x_dims.size() >= y_dims.size()) {
      direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
    } else {
      direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
    }

    phi::DenseTensor transformed_x, transformed_y;
    if (direct_compute) {
      transformed_x.ShareDataWith(*x);
      transformed_y.ShareDataWith(*y);
    } else {
      NpuElementWiseOpBroadcast<T>(
          dev_ctx, x, y, axis, &transformed_x, &transformed_y);
    }
    const auto& runner =
        NpuOpRunner("Minimum", {transformed_x, transformed_y}, {*out}, {});
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

template <typename DeviceContext, typename T>
class ElementwiseMinGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
    axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
    auto stream = dev_ctx.stream();

    if (dx && dy) {
      // dx
      dx->mutable_data<T>(ctx.GetPlace());
      phi::DenseTensor tmp_x;
      tmp_x.ShareDataWith(*dx);
      if (dx->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec_x;
        std::vector<int> reduce_axes_x;
        auto src_dims_x = dx->dims();
        auto dout_dims = dout->dims();
        int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) ||
              (dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) {
            reduce_axes_x.push_back(ax);
          } else {
            dst_dims_vec_x.push_back(dout_dims[ax]);
          }
        }
        if (!reduce_axes_x.empty()) {
          tmp_x.Resize(phi::make_ddim(dst_dims_vec_x));
        }
      }
      // dy
      dy->mutable_data<T>(ctx.GetPlace());
      phi::DenseTensor tmp_y;
      tmp_y.ShareDataWith(*dy);
      if (dy->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec_y;
        std::vector<int> reduce_axes_y;
        auto src_dims_y = dy->dims();
        auto dout_dims = dout->dims();
        int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) ||
              (dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) {
            reduce_axes_y.push_back(ax);
          } else {
            dst_dims_vec_y.push_back(dout_dims[ax]);
          }
        }
        if (!reduce_axes_y.empty()) {
          tmp_y.Resize(phi::make_ddim(dst_dims_vec_y));
        }
      }
      const auto& runner = NpuOpRunner("MinimumGrad",
                                       {*dout, *x, *y},
                                       {tmp_x, tmp_y},
                                       {{"grad_x", true}, {"grad_y", true}});
      runner.Run(stream);
    } else if (dx) {
      phi::DenseTensor zero_tensor(dout->type());
      zero_tensor.mutable_data<T>(y->dims(), ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));

      // dx
      dx->mutable_data<T>(ctx.GetPlace());
      phi::DenseTensor tmp_x;
      tmp_x.ShareDataWith(*dx);
      if (dx->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec_x;
        std::vector<int> reduce_axes_x;
        auto src_dims_x = dx->dims();
        auto dout_dims = dout->dims();
        int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) ||
              (dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) {
            reduce_axes_x.push_back(ax);
          } else {
            dst_dims_vec_x.push_back(dout_dims[ax]);
          }
        }
        if (!reduce_axes_x.empty()) {
          tmp_x.Resize(phi::make_ddim(dst_dims_vec_x));
        }
      }
      const auto& runner = NpuOpRunner("MinimumGrad",
                                       {*dout, *x, *y},
                                       {tmp_x, zero_tensor},
                                       {{"grad_x", true}, {"grad_y", true}});
      runner.Run(stream);
    } else if (dy) {
      phi::DenseTensor zero_tensor(dout->type());
      zero_tensor.mutable_data<T>(x->dims(), ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));

      // dy
      dy->mutable_data<T>(ctx.GetPlace());
      phi::DenseTensor tmp_y;
      tmp_y.ShareDataWith(*dy);
      if (dy->dims() != dout->dims()) {
        std::vector<int> dst_dims_vec_y;
        std::vector<int> reduce_axes_y;
        auto src_dims_y = dy->dims();
        auto dout_dims = dout->dims();
        int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) ||
              (dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) {
            reduce_axes_y.push_back(ax);
          } else {
            dst_dims_vec_y.push_back(dout_dims[ax]);
          }
        }
        if (!reduce_axes_y.empty()) {
          tmp_y.Resize(phi::make_ddim(dst_dims_vec_y));
        }
      }
      const auto& runner = NpuOpRunner("MinimumGrad",
                                       {*dout, *x, *y},
                                       {zero_tensor, tmp_y},
                                       {{"grad_x", true}, {"grad_y", true}});
      runner.Run(stream);
    } else {
      std::cout << "error" << std::endl;
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    elementwise_min,
    ops::ElementwiseMinNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::ElementwiseMinNPUKernel<paddle::platform::NPUDeviceContext,
                                 paddle::platform::float16>);

REGISTER_OP_NPU_KERNEL(
    elementwise_min_grad,
    ops::ElementwiseMinGradNPUKernel<paddle::platform::NPUDeviceContext,
                                     float>,
    ops::ElementwiseMinGradNPUKernel<paddle::platform::NPUDeviceContext,
                                     paddle::platform::float16>);
paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc (deleted, file mode 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ElementwiseModNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");

    int axis = ctx.Attr<int>("axis");
    auto x_dims = x->dims();
    auto y_dims = y->dims();
    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);

    bool direct_compute = false;
    if (x_dims.size() >= y_dims.size()) {
      direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
    } else {
      direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
    }

    phi::DenseTensor transformed_x, transformed_y;
    if (direct_compute) {
      transformed_x.ShareDataWith(*x);
      transformed_y.ShareDataWith(*y);
    } else {
      NpuElementWiseOpBroadcast<T>(
          dev_ctx, x, y, axis, &transformed_x, &transformed_y);
    }
    out->mutable_data<T>(ctx.GetPlace());
    const auto& runner =
        NpuOpRunner("FloorMod", {transformed_x, transformed_y}, {*out}, {});
    auto stream = dev_ctx.stream();
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    elementwise_mod,
    ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, double>,
    ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
    ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext,
                                 paddle::platform::float16>);
paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc (deleted, file mode 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {

using NPUDeviceContext = platform::NPUDeviceContext;

template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
                       const aclrtStream& stream,
                       const int axis,
                       const framework::DDim& ddims,
                       const framework::DDim& brd_ddims,
                       const phi::DenseTensor& in,
                       phi::DenseTensor* out) {
  std::vector<int64_t> axes;
  int64_t brd_size = brd_ddims.size();
  int64_t org_size = ddims.size();
  // int64_t diff = brd_dims.size() - dims.size();
  for (int64_t i = 0; i < brd_size; ++i) {
    if (i < axis || i >= org_size + axis) {
      axes.push_back(i);
      continue;
    }
    if (brd_ddims[i] > ddims[i - axis]) {
      axes.push_back(i);
    }
  }
  // LOG(INFO) << "axes = " << phi::make_ddim(axes).to_str();
  out->mutable_data<T>(ctx.GetPlace());
  const auto& runner = NpuOpRunner(
      "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}});
  runner.Run(stream);
}

template <typename T>
class ElementwiseMulNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    int axis = ctx.Attr<int>("axis");
    bool direct_compute = false;
    auto x_dims = x->dims();
    auto y_dims = y->dims();
    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
    if (x_dims.size() >= y_dims.size()) {
      direct_compute = x_dims.size() == (y_dims.size() + axis);
    } else {
      direct_compute = y_dims.size() == (x_dims.size() + axis);
    }

    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
    if (direct_compute) {
      const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {});
      runner.Run(stream);
    } else {
      phi::DenseTensor trans_x, trans_y;
      NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &trans_x, &trans_y);
      const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {});
      runner.Run(stream);
    }
  }
};

template <typename T>
class ElementwiseMulGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");
    axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
    auto stream = ctx.template device_context<NPUDeviceContext>().stream();

    phi::DenseTensor trans_x, trans_y;
    NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &trans_x, &trans_y);

    if (dx) {
      if (dx->dims() == dout->dims()) {
        dx->mutable_data<T>(ctx.GetPlace());
        const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {});
        runner_dx.Run(stream);
      } else {
        phi::DenseTensor dx_temp(x->type());
        dx_temp.Resize(trans_x.dims());
        dx_temp.mutable_data<T>(ctx.GetPlace());
        const auto& runner_dx =
            NpuOpRunner("Mul", {*dout, trans_y}, {dx_temp}, {});
        runner_dx.Run(stream);
        ReduceDims<T>(
            ctx, stream, axis, dx->dims(), trans_x.dims(), dx_temp, dx);
      }
    }
    if (dy) {
      if (dy->dims() == dout->dims()) {
        dy->mutable_data<T>(ctx.GetPlace());
        const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {});
        runner_dy.Run(stream);
      } else {
        phi::DenseTensor dy_temp(y->type());
        dy_temp.Resize(trans_y.dims());
        dy_temp.mutable_data<T>(ctx.GetPlace());
        const auto& runner_dy =
            NpuOpRunner("Mul", {trans_x, *dout}, {dy_temp}, {});
        runner_dy.Run(stream);
        ReduceDims<T>(
            ctx, stream, axis, dy->dims(), trans_y.dims(), dy_temp, dy);
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    elementwise_mul,
    ops::ElementwiseMulNPUKernel<float>,
    ops::ElementwiseMulNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ElementwiseMulNPUKernel<int64_t>,
#endif
    ops::ElementwiseMulNPUKernel<int>);

REGISTER_OP_NPU_KERNEL(
    elementwise_mul_grad,
    ops::ElementwiseMulGradNPUKernel<float>,
    ops::ElementwiseMulGradNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ElementwiseMulGradNPUKernel<int64_t>,
#endif
    ops::ElementwiseMulGradNPUKernel<int>);
paddle/fluid/operators/elementwise/elementwise_npu.h (deleted, file mode 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {

template <typename T>
void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx,
                  const phi::DenseTensor* src,
                  int axis,
                  const framework::DDim& dst_dims,
                  phi::DenseTensor* transformed_src) {
  auto stream = dev_ctx.stream();

  // 1. expand the axis with dim 1
  auto src_dims = src->dims();
  phi::DenseTensor tmp_src;
  tmp_src.ShareDataWith(*src);
  tmp_src.Resize(src_dims);
  for (int i = 0; i < src_dims.size(); ++i) {
    if (src_dims[i] == 1 && dst_dims[i + axis] > 1) {
      phi::DenseTensor tmp_tensor;
      auto tmp_tensor_dims = tmp_src.dims();
      tmp_tensor_dims[i] = dst_dims[i + axis];
      tmp_tensor.mutable_data<T>(tmp_tensor_dims, dev_ctx.GetPlace());
      const auto& runner =
          NpuOpRunner("TileWithAxis",
                      {tmp_src},
                      {tmp_tensor},
                      {{"axis", static_cast<int64_t>(i)},
                       {"tiles", static_cast<int64_t>(dst_dims[i + axis])}});
      runner.Run(stream);
      tmp_src.ShareDataWith(tmp_tensor);
      tmp_src.Resize(tmp_tensor_dims);
    }
  }

  // 2.expand the ahead axis
  auto prev = phi::product(phi::slice_ddim(dst_dims, 0, axis));
  if (prev > 1) {
    phi::DenseTensor tmp_tensor;
    auto tmp_tensor_dims =
        phi::slice_ddim(dst_dims, 0, axis + src_dims.size());
    tmp_tensor.mutable_data<T>(tmp_tensor_dims, dev_ctx.GetPlace());
    const auto& runner =
        NpuOpRunner("ExpandD",
                    {tmp_src},
                    {tmp_tensor},
                    {{"shape", phi::vectorize<int64_t>(tmp_tensor_dims)}});
    runner.Run(stream);
    tmp_src.ShareDataWith(tmp_tensor);
    tmp_src.Resize(tmp_tensor_dims);
  } else {
    tmp_src.Resize(phi::slice_ddim(dst_dims, 0, axis + src_dims.size()));
  }

  // 3.expand the tail axis
  auto post = phi::product(
      phi::slice_ddim(dst_dims, axis + src_dims.size(), dst_dims.size()));
  if (post > 1) {
    auto src_dims_vec = phi::vectorize<int>(tmp_src.dims());
    src_dims_vec.push_back(1);
    tmp_src.Resize(phi::make_ddim(src_dims_vec));

    phi::DenseTensor tmp_tensor;
    tmp_tensor.mutable_data<T>(dst_dims, dev_ctx.GetPlace());
    const auto& runner =
        NpuOpRunner("TileWithAxis",
                    {tmp_src},
                    {tmp_tensor},
                    {{"axis", static_cast<int64_t>(axis + src_dims.size())},
                     {"tiles", static_cast<int64_t>(post)}});
    runner.Run(stream);
    tmp_src.ShareDataWith(tmp_tensor);
  }
  tmp_src.Resize(dst_dims);
  framework::TensorCopy(tmp_src, dev_ctx.GetPlace(), transformed_src);
}

template <typename T>
void NpuElementWiseOpBroadcast(const platform::NPUDeviceContext& dev_ctx,
                               const phi::DenseTensor* x,
                               const phi::DenseTensor* y,
                               int axis,
                               phi::DenseTensor* transformed_x,
                               phi::DenseTensor* transformed_y) {
  auto x_dims = x->dims();
  auto y_dims = y->dims();
  bool is_xsize_larger = true;
  int max_dim = x_dims.size();
  std::vector<int> dst_dims_vec = phi::vectorize<int>(x_dims);

  if (x_dims.size() < y_dims.size()) {
    is_xsize_larger = false;
    max_dim = y_dims.size();
    dst_dims_vec = phi::vectorize<int>(y_dims);
  }

  axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
  int x_axis = is_xsize_larger ? 0 : axis;
  int y_axis = is_xsize_larger ? axis : 0;

  PADDLE_ENFORCE_GE(
      axis,
      0,
      platform::errors::InvalidArgument(
          "Axis should be great than or equal to 0, but received axis is %d.",
          axis));
  PADDLE_ENFORCE_LE(
      axis,
      max_dim,
      platform::errors::InvalidArgument(
          "Axis should be less than or equal to %d, but received axis is %d.",
          max_dim,
          axis));

  for (int i = 0; i < x_dims.size(); ++i) {
    dst_dims_vec[i + x_axis] =
        std::max(dst_dims_vec[i + x_axis], static_cast<int>(x_dims[i]));
  }
  for (int i = 0; i < y_dims.size(); ++i) {
    dst_dims_vec[i + y_axis] =
        std::max(dst_dims_vec[i + y_axis], static_cast<int>(y_dims[i]));
  }
  auto dst_dims = phi::make_ddim(dst_dims_vec);
  NpuBroadcast<T>(dev_ctx, x, x_axis, dst_dims, transformed_x);
  NpuBroadcast<T>(dev_ctx, y, y_axis, dst_dims, transformed_y);
}

}  // namespace operators
}  // namespace paddle
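`NpuBroadcast` drives three CANN ops (`TileWithAxis`, `ExpandD`, `TileWithAxis` again) to materialize a broadcast on the device, but the destination shape itself is computed on the host in `NpuElementWiseOpBroadcast`. A minimal sketch of just that shape computation, assuming plain `int` vectors instead of `phi::DDim` (the name `BroadcastDims` is ours for illustration):

#include <algorithm>
#include <cstdlib>
#include <vector>

// Computes the broadcast output shape the way the deleted helper does:
// align the smaller tensor at `axis` inside the larger one and take the
// element-wise max of the aligned dimensions.
std::vector<int> BroadcastDims(const std::vector<int>& x,
                               const std::vector<int>& y, int axis) {
  bool x_larger = x.size() >= y.size();
  std::vector<int> dst = x_larger ? x : y;
  axis = (axis == -1 ? std::abs(static_cast<int>(x.size()) -
                                static_cast<int>(y.size()))
                     : axis);
  int x_axis = x_larger ? 0 : axis;
  int y_axis = x_larger ? axis : 0;
  for (size_t i = 0; i < x.size(); ++i)
    dst[i + x_axis] = std::max(dst[i + x_axis], x[i]);
  for (size_t i = 0; i < y.size(); ++i)
    dst[i + y_axis] = std::max(dst[i + y_axis], y[i]);
  return dst;
}
// e.g. x = (2, 1, 4), y = (3, 1), axis = 1  ->  dst = (2, 3, 4).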
paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc (deleted, file mode 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP_ITSELF(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);

template <typename T>
void Compare(f::Scope* scope,
             const p::DeviceContext& ctx,
             std::string op_type) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();
  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<phi::DenseTensor>();

  std::vector<T> init_x;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init_x.push_back(static_cast<T>(1.0));
  }
  std::vector<T> init_y;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init_y.push_back(static_cast<T>(2.0));
  }

  paddle::framework::TensorFromVector(init_x, ctx, tensor_x);
  tensor_x->Resize({10, 10});
  paddle::framework::TensorFromVector(init_y, ctx, tensor_y);
  tensor_y->Resize({10, 10});

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();

  // run
  f::AttributeMap attrs;
  auto op = f::OpRegistry::CreateOp(
      op_type, {{"X", {"X"}}, {"Y", {"Y"}}}, {{"Out", {"Out"}}}, attrs);
  op->Run(*scope, place);

  std::vector<T> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  float expected = 0.0;
  if (op_type == "elementwise_add") {
    expected = 3.0;
  } else if (op_type == "elementwise_sub") {
    expected = -1.0;
  }
  EXPECT_EQ(out_vec.size(), init_x.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], static_cast<T>(expected));
  }
}

template <typename T>
void CompareGrad(f::Scope* scope,
                 const p::DeviceContext& ctx,
                 std::string op_type) {
  // init
  auto dout = scope->Var("DOut");
  auto tensor_dout = dout->GetMutable<phi::DenseTensor>();
  tensor_dout->Resize({2, 3, 5});

  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();
  tensor_x->Resize({2, 3, 5});

  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<phi::DenseTensor>();
  tensor_y->Resize({1, 5});

  auto dx = scope->Var("DX");
  auto tensor_dx = dx->GetMutable<phi::DenseTensor>();
  auto dy = scope->Var("DY");
  auto tensor_dy = dy->GetMutable<phi::DenseTensor>();

  std::vector<T> init_dout;
  for (int64_t i = 0; i < tensor_dout->numel(); ++i) {
    init_dout.push_back(static_cast<T>(1.0));
  }
  paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout);
  tensor_dout->Resize({2, 3, 5});

  // run
  f::AttributeMap attrs;
  auto op = f::OpRegistry::CreateOp(
      op_type,
      {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
      {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}},
      attrs);
  auto place = ctx.GetPlace();
  op->Run(*scope, place);

  std::vector<T> dx_vec;
  paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec);
  std::vector<T> dy_vec;
  paddle::framework::TensorToVector(*tensor_dy, ctx, &dy_vec);
  ctx.Wait();

  float expected_x = 0, expected_y = 0;
  if (op_type == "elementwise_add_grad") {
    expected_x = 1.0;
    expected_y = 6.0;
  } else if (op_type == "elementwise_sub_grad") {
    expected_x = 1.0;
    expected_y = -6.0;
  }
  for (uint32_t i = 0; i < dx_vec.size(); i++) {
    EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
  }
  for (uint32_t i = 0; i < dy_vec.size(); i++) {
    EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
  }
}

TEST(elementwise_add, NPU_fp32) {
  f::Scope scope;
  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
  Compare<float>(&scope, *ctx, "elementwise_add");
}

TEST(elementwise_sub, NPU_fp32) {
  f::Scope scope;
  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
  Compare<float>(&scope, *ctx, "elementwise_sub");
}

TEST(elementwise_sub, NPU_fp16) {
  f::Scope scope;
  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
  Compare<p::float16>(&scope, *ctx, "elementwise_sub");
}

TEST(elementwise_sub_grad, NPU) {
  f::Scope scope;
  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
  CompareGrad<float>(&scope, *ctx, "elementwise_sub_grad");
}

TEST(elementwise_add_grad, NPU) {
  f::Scope scope;
  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
  CompareGrad<float>(&scope, *ctx, "elementwise_add_grad");
}
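A note on the expected gradient values in `CompareGrad` (our arithmetic, not part of the deleted file): `DOut` is all ones with shape (2, 3, 5) while `Y` has shape (1, 5), so `Y`'s gradient sums the ones over the two broadcast axes:

\[
dY_{j} \;=\; \sum_{i=1}^{2}\sum_{k=1}^{3} 1 \;=\; 6,
\]

hence 6.0 per element for `elementwise_add_grad` and −6.0 for `elementwise_sub_grad`, while `DX` simply copies `DOut` (all 1.0).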
paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc (deleted, file mode 100644 → 0)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
DeviceContext
,
typename
T
>
class
ElementwisePowNPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
NPUDeviceContext
>();
auto
*
x
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Y"
);
auto
*
out
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Out"
);
auto
place
=
ctx
.
GetPlace
();
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
out
->
mutable_data
<
T
>
(
place
);
bool
direct_compute
=
false
;
auto
x_dims
=
x
->
dims
();
auto
y_dims
=
y
->
dims
();
axis
=
(
axis
<
0
?
std
::
abs
(
x_dims
.
size
()
-
y_dims
.
size
())
+
axis
+
1
:
axis
);
if
(
x_dims
.
size
()
>=
y_dims
.
size
())
{
direct_compute
=
y_dims
==
phi
::
slice_ddim
(
x_dims
,
axis
,
x_dims
.
size
());
}
else
{
direct_compute
=
x_dims
==
phi
::
slice_ddim
(
y_dims
,
axis
,
y_dims
.
size
());
}
auto
stream
=
dev_ctx
.
stream
();
if
(
direct_compute
)
{
const
auto
&
runner
=
NpuOpRunner
(
"Pow"
,
{
*
x
,
*
y
},
{
*
out
},
{});
runner
.
Run
(
stream
);
}
else
{
phi
::
DenseTensor
transformed_x
,
transformed_y
;
NpuElementWiseOpBroadcast
<
T
>
(
dev_ctx
,
x
,
y
,
axis
,
&
transformed_x
,
&
transformed_y
);
const
auto
&
runner
=
NpuOpRunner
(
"Pow"
,
{
transformed_x
,
transformed_y
},
{
*
out
},
{});
runner
.
Run
(
stream
);
}
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
ElementwisePowGradNPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
NPUDeviceContext
>();
auto
*
x
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
*
y
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Y"
);
auto
*
dout
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
framework
::
GradVarName
(
"Y"
));
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
auto
place
=
ctx
.
GetPlace
();
auto
x_dims
=
x
->
dims
();
auto
y_dims
=
y
->
dims
();
axis
=
(
axis
<
0
?
std
::
abs
(
x_dims
.
size
()
-
y_dims
.
size
())
+
axis
+
1
:
axis
);
phi
::
DenseTensor
transformed_x
,
transformed_y
;
NpuElementWiseOpBroadcast
<
T
>
(
dev_ctx
,
x
,
y
,
axis
,
&
transformed_x
,
&
transformed_y
);
auto
dout_dims
=
dout
->
dims
();
auto
stream
=
dev_ctx
.
stream
();
// Reshape info vector.
std
::
vector
<
int
>
reduce_axes
;
if
(
dx
)
{
phi
::
DenseTensor
zero_tensor
(
dout
->
type
());
zero_tensor
.
mutable_data
<
T
>
(
dout_dims
,
place
);
FillNpuTensorWithConstant
<
T
>
(
&
zero_tensor
,
static_cast
<
T
>
(
0
));
dx
->
mutable_data
<
T
>
(
place
);
phi
::
DenseTensor
tmp_dx
;
tmp_dx
.
mutable_data
<
T
>
(
dout_dims
,
place
);
// dx = dout * y * pow(x, y - 1);
phi
::
DenseTensor
PowGrad_dx_temp1
(
dout
->
type
());
PowGrad_dx_temp1
.
mutable_data
<
T
>
(
dout
->
dims
(),
place
);
const
auto
&
runner_PowGrad_dx_temp1
=
NpuOpRunner
(
"Mul"
,
{
*
dout
,
transformed_y
},
{
PowGrad_dx_temp1
},
{});
runner_PowGrad_dx_temp1
.
Run
(
stream
);
phi
::
DenseTensor
one_dx
(
transformed_y
.
type
());
one_dx
.
mutable_data
<
T
>
(
transformed_y
.
dims
(),
place
);
const
auto
&
runner_one_dx
=
NpuOpRunner
(
"OnesLike"
,
{
transformed_y
},
{
one_dx
},
{});
runner_one_dx
.
Run
(
stream
);
phi
::
DenseTensor
sub_dx
(
transformed_y
.
type
());
sub_dx
.
mutable_data
<
T
>
(
transformed_y
.
dims
(),
place
);
const
auto
&
runner_sub_dx
=
NpuOpRunner
(
"Sub"
,
{
transformed_y
,
one_dx
},
{
sub_dx
},
{});
runner_sub_dx
.
Run
(
stream
);
phi
::
DenseTensor
PowGrad_dx_temp2
(
transformed_x
.
type
());
PowGrad_dx_temp2
.
mutable_data
<
T
>
(
transformed_x
.
dims
(),
place
);
const
auto
&
runner_PowGrad_dx_temp2
=
NpuOpRunner
(
"Pow"
,
{
transformed_x
,
sub_dx
},
{
PowGrad_dx_temp2
},
{});
runner_PowGrad_dx_temp2
.
Run
(
stream
);
const
auto
&
runner_dx
=
NpuOpRunner
(
"Mul"
,
{
PowGrad_dx_temp1
,
PowGrad_dx_temp2
},
{
tmp_dx
},
{});
runner_dx
.
Run
(
stream
);
if
(
x_dims
!=
dout_dims
)
{
reduce_axes
.
clear
();
int
src_axis
=
(
x_dims
.
size
()
<
dout_dims
.
size
()
?
axis
:
0
);
for
(
int
ax
=
0
;
ax
<
dout_dims
.
size
();
++
ax
)
{
if
((
ax
<
src_axis
||
ax
>=
src_axis
+
x_dims
.
size
())
||
(
dout_dims
[
ax
]
>
1
&&
x_dims
[
ax
-
src_axis
]
==
1
))
{
reduce_axes
.
push_back
(
ax
);
}
}
if
(
!
reduce_axes
.
empty
())
{
const
auto
&
runner
=
NpuOpRunner
(
"ReduceSumD"
,
{
tmp_dx
},
{
*
dx
},
{{
"axes"
,
reduce_axes
},
{
"keep_dims"
,
false
}});
runner
.
Run
(
stream
);
}
}
else
{
framework
::
TensorCopy
(
tmp_dx
,
place
,
dev_ctx
,
dx
);
}
}
    if (dy) {
      // NOTE: zero_tensor is unused here as well.
      phi::DenseTensor zero_tensor(dout->type());
      zero_tensor.mutable_data<T>(dout_dims, place);
      FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));

      dy->mutable_data<T>(place);
      phi::DenseTensor tmp_dy;
      tmp_dy.mutable_data<T>(dout_dims, place);

      // dy = dout * log(x) * pow(x, y)
      phi::DenseTensor PowGrad_dy_temp1(transformed_x.type());
      PowGrad_dy_temp1.mutable_data<T>(transformed_x.dims(), place);
      const auto& runner_PowGrad_dy_temp1 = NpuOpRunner(
          "Pow", {transformed_x, transformed_y}, {PowGrad_dy_temp1}, {});
      runner_PowGrad_dy_temp1.Run(stream);

      phi::DenseTensor one_dy(transformed_x.type());
      one_dy.mutable_data<T>(transformed_x.dims(), place);
      const auto& runner_one_dy =
          NpuOpRunner("OnesLike", {transformed_x}, {one_dy}, {});
      runner_one_dy.Run(stream);

      phi::DenseTensor sub_dy(transformed_x.type());
      sub_dy.mutable_data<T>(transformed_x.dims(), place);
      const auto& runner_sub_dy =
          NpuOpRunner("Sub", {transformed_x, one_dy}, {sub_dy}, {});
      runner_sub_dy.Run(stream);

      // log(x) is computed as log1p(x - 1), since Log1p(u) = log(1 + u).
      phi::DenseTensor log_dy(transformed_x.type());
      log_dy.mutable_data<T>(transformed_x.dims(), place);
      const auto& runner_log_dy = NpuOpRunner("Log1p", {sub_dy}, {log_dy}, {});
      runner_log_dy.Run(stream);

      phi::DenseTensor PowGrad_dy_temp2(transformed_x.type());
      PowGrad_dy_temp2.mutable_data<T>(transformed_x.dims(), place);
      const auto& runner_PowGrad_dy_temp2 = NpuOpRunner(
          "Mul", {log_dy, PowGrad_dy_temp1}, {PowGrad_dy_temp2}, {});
      runner_PowGrad_dy_temp2.Run(stream);

      const auto& runner_dy =
          NpuOpRunner("Mul", {*dout, PowGrad_dy_temp2}, {tmp_dy}, {});
      runner_dy.Run(stream);

      if (y_dims != dout_dims) {
        reduce_axes.clear();
        int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0);
        for (int ax = 0; ax < dout_dims.size(); ++ax) {
          if ((ax < src_axis || ax >= src_axis + y_dims.size()) ||
              (dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) {
            reduce_axes.push_back(ax);
          }
        }
        if (!reduce_axes.empty()) {
          const auto& runner = NpuOpRunner("ReduceSumD",
                                           {tmp_dy},
                                           {*dy},
                                           {{"axes", reduce_axes},
                                            {"keep_dims", false}});
          runner.Run(stream);
        }
      } else {
        framework::TensorCopy(tmp_dy, place, dev_ctx, dy);
      }
    }
    if (!dx && !dy) {
      PADDLE_THROW(platform::errors::Unavailable(
          "Not support all outputs to be empty."));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    elementwise_pow,
    ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, float>,
    ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, double>,
    ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, int>);

REGISTER_OP_NPU_KERNEL(
    elementwise_pow_grad,
    ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, float>,
    ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, double>,
    ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, int>);
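Because the grad kernel broadcasts x and y up to dout's shape before computing, each gradient must afterwards be summed back over every axis that was broadcast. The axis-selection loop above can be exercised in isolation; the following standalone sketch (not from the deleted file; names are illustrative) reproduces it on plain vectors:

// reduce_axes_check.cc -- illustrative only, not from the Paddle source tree.
// An axis of dout is summed out of the gradient when it lies outside x's
// aligned range, or when it was broadcast (dout_dims[ax] > 1 while
// x_dims[ax - src_axis] == 1).
#include <iostream>
#include <vector>

std::vector<int> ReduceAxes(const std::vector<int>& x_dims,
                            const std::vector<int>& dout_dims,
                            int axis) {
  std::vector<int> reduce_axes;
  int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0);
  for (int ax = 0; ax < static_cast<int>(dout_dims.size()); ++ax) {
    if ((ax < src_axis || ax >= src_axis + static_cast<int>(x_dims.size())) ||
        (dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) {
      reduce_axes.push_back(ax);
    }
  }
  return reduce_axes;
}

int main() {
  // x: [3, 1] aligned at axis 1 against dout: [2, 3, 5] -> sum over axes 0, 2.
  for (int ax : ReduceAxes({3, 1}, {2, 3, 5}, 1)) std::cout << ax << ' ';
  std::cout << '\n';  // prints: 0 2
  return 0;
}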
paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc
deleted
100644 → 0
View file @ 0f3bbe10
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {

template <typename T>
class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};
template <typename T>
class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    // NOTE(zhiqiu): Ascend Sub seems to follow the broadcast semantics with
    // default axis = -1, so sub_grad should reduce if needed.
    // For example, with the following shapes in elementwise_sub:
    //   x, dx: [2, 3, 5]
    //   y, dy: [1, 5]
    //   out, dout: [2, 3, 5]
    // out = x - y  =>  dx = dout, dy = -dout,
    // and the shape of dy can be computed by a two-stage reduce:
    //   1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
    //   2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
      // For dx
      // stage 1: reduce the leading axes that dout has but dx does not.
      auto reduce_ndim = dout->dims().size() - dx->dims().size();
      std::vector<int> axes;
      for (auto i = 0; i < reduce_ndim; ++i) {
        axes.push_back(i);
      }
      phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
      phi::DenseTensor reduced_dout(dx->type());
      if (axes.size() != 0) {
        std::vector<int64_t> reduced_dout_dims;
        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
          reduced_dout_dims.push_back(dout->dims()[i]);
        }
        reduced_dout.Resize(phi::make_ddim(reduced_dout_dims));
        reduced_dout.mutable_data<T>(ctx.GetPlace());
        const auto& runner = NpuOpRunner("ReduceSumD",
                                         {*dout},
                                         {reduced_dout},
                                         {{"axes", axes},
                                          {"keep_dims", false}});
        runner.Run(stream);
        tmp_dout = &reduced_dout;
      }
      // stage 2: reduce the axes where dx has size 1.
      axes.clear();
      for (auto i = 0; i < dx->dims().size(); ++i) {
        if (dx->dims()[i] == 1) {
          axes.push_back(i);
        }
      }
      if (axes.size() != 0) {
        const auto& runner = NpuOpRunner("ReduceSumD",
                                         {*tmp_dout},
                                         {*dx},
                                         {{"axes", axes},
                                          {"keep_dims", true}});
        runner.Run(stream);
      } else {
        framework::TensorCopy(
            *tmp_dout,
            ctx.GetPlace(),
            ctx.template device_context<platform::DeviceContext>(),
            dx);
      }
    }
    if (dy) {
      dy->mutable_data<T>(ctx.GetPlace());
      // For dy
      // stage 1: reduce the leading axes that dout has but dy does not.
      auto reduce_ndim = dout->dims().size() - dy->dims().size();
      std::vector<int> axes;
      for (auto i = 0; i < reduce_ndim; ++i) {
        axes.push_back(i);
      }
      phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
      phi::DenseTensor reduced_dy(dy->type());
      phi::DenseTensor reduced_dout(dy->type());
      if (axes.size() != 0) {
        std::vector<int64_t> reduced_dout_dims;
        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
          reduced_dout_dims.push_back(dout->dims()[i]);
        }
        reduced_dout.Resize(phi::make_ddim(reduced_dout_dims));
        reduced_dout.mutable_data<T>(ctx.GetPlace());
        const auto& runner = NpuOpRunner("ReduceSumD",
                                         {*dout},
                                         {reduced_dout},
                                         {{"axes", axes},
                                          {"keep_dims", false}});
        runner.Run(stream);
        tmp_dout = &reduced_dout;
      }
      // stage 2: reduce the axes where dy has size 1.
      axes.clear();
      phi::DenseTensor* tmp_dy = tmp_dout;
      for (auto i = 0; i < dy->dims().size(); ++i) {
        if (dy->dims()[i] == 1) {
          axes.push_back(i);
        }
      }
      if (axes.size() != 0) {
        reduced_dy.Resize(dy->dims());
        reduced_dy.mutable_data<T>(ctx.GetPlace());
        const auto& runner = NpuOpRunner("ReduceSumD",
                                         {*tmp_dout},
                                         {reduced_dy},
                                         {{"axes", axes},
                                          {"keep_dims", true}});
        runner.Run(stream);
        tmp_dy = &reduced_dy;
      }
      // stage 3: negate, since dy = -dout.
      const auto& runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
      runner.Run(stream);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(elementwise_sub,
                       ops::ElementwiseSubNPUKernel<int>,
#ifdef PADDLE_WITH_ASCEND_INT64
                       ops::ElementwiseSubNPUKernel<int64_t>,
#endif
                       ops::ElementwiseSubNPUKernel<float>,
                       ops::ElementwiseSubNPUKernel<plat::float16>);

REGISTER_OP_NPU_KERNEL(elementwise_sub_grad,
                       ops::ElementwiseSubGradNPUKernel<int>,
#ifdef PADDLE_WITH_ASCEND_INT64
                       ops::ElementwiseSubGradNPUKernel<int64_t>,
#endif
                       ops::ElementwiseSubGradNPUKernel<float>,
                       ops::ElementwiseSubGradNPUKernel<plat::float16>);
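The two-stage reduce described in the NOTE inside ElementwiseSubGradNPUKernel can be checked on the CPU with plain loops standing in for ReduceSumD. A standalone sketch follows (not from the deleted file; the shapes follow the NOTE's example, dy: [1, 5] against dout: [2, 3, 5]):

// sub_grad_reduce_check.cc -- illustrative only, not from the Paddle tree.
#include <cassert>
#include <vector>

int main() {
  const int D0 = 2, D1 = 3, D2 = 5;
  std::vector<double> dout(D0 * D1 * D2, 1.0);  // dout: [2, 3, 5], all ones
  // stage 1: [2, 3, 5] -> [3, 5], sum over axis 0 (the rank difference).
  std::vector<double> stage1(D1 * D2, 0.0);
  for (int i = 0; i < D0; ++i)
    for (int j = 0; j < D1 * D2; ++j) stage1[j] += dout[i * D1 * D2 + j];
  // stage 2: [3, 5] -> [1, 5], sum over axis 0 again (the size-1 dim of dy).
  std::vector<double> dy(D2, 0.0);
  for (int j = 0; j < D1; ++j)
    for (int k = 0; k < D2; ++k) dy[k] += stage1[j * D2 + k];
  // stage 3: negate, since out = x - y implies dy = -dout.
  for (double& v : dy) v = -v;
  assert(dy[0] == -6.0);  // 2 * 3 elements summed into each of the 5 slots
  return 0;
}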