PaddlePaddle / Paddle

Commit a6aa701e (unverified)
Author: jjyaoao — Apr 10, 2023
Committer: GitHub — Apr 10, 2023
delete paddle/fluid/operators/math,metrics,optimizers,reduce_ops/*_npu.* (#52674)
Parent: b451aff8
Showing 15 changed files with 0 additions and 2586 deletions (+0 −2586)
paddle/fluid/operators/math/beam_search_npu.cc                    +0 −588
paddle/fluid/operators/metrics/accuracy_op_npu.cc                 +0 −162
paddle/fluid/operators/optimizers/adam_op_npu.cc                  +0 −345
paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc       +0 −194
paddle/fluid/operators/optimizers/momentum_op_npu.cc              +0 −105
paddle/fluid/operators/optimizers/rmsprop_op_npu.cc               +0 −106
paddle/fluid/operators/optimizers/sgd_op_npu.cc                   +0 −66
paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc            +0 −53
paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc       +0 −80
paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc            +0 −216
paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc           +0 −129
paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc            +0 −123
paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc           +0 −102
paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc            +0 −171
paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc       +0 −146
paddle/fluid/operators/math/beam_search_npu.cc
deleted, 100644 → 0 (diff collapsed)
paddle/fluid/operators/metrics/accuracy_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class AccuracyNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* inference = ctx.Input<phi::DenseTensor>("Out");
    auto* label = ctx.Input<phi::DenseTensor>("Label");
    auto* indices = ctx.Input<phi::DenseTensor>("Indices");

    auto* accuracy = ctx.Output<phi::DenseTensor>("Accuracy");
    auto* correct = ctx.Output<phi::DenseTensor>("Correct");
    auto* total = ctx.Output<phi::DenseTensor>("Total");
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    int num_samples = inference->dims()[0];
    if (num_samples == 0) {
      return;
    }

    // cast `indices` or `label` if their type is not consistent
    Tensor cast_indices(phi::DataType::INT32);
    Tensor cast_label(phi::DataType::INT32);
    if (indices->dtype() != label->dtype()) {
      auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32);
      if (framework::TransToProtoVarType(indices->dtype()) !=
          framework::proto::VarType::INT32) {
        cast_indices.Resize(indices->dims());
        cast_indices.mutable_data<int>(ctx.GetPlace());
        const auto& runner_cast_indices =
            NpuOpRunner("Cast",
                        {*indices},
                        {cast_indices},
                        {{"dst_type", static_cast<int>(dst_dtype)}});
        runner_cast_indices.Run(stream);
      } else {
        cast_indices.ShareDataWith(*indices);
      }
      if (framework::TransToProtoVarType(label->dtype()) !=
          framework::proto::VarType::INT32) {
        cast_label.Resize(label->dims());
        cast_label.mutable_data<int>(ctx.GetPlace());
        const auto& runner_cast_label =
            NpuOpRunner("Cast",
                        {*label},
                        {cast_label},
                        {{"dst_type", static_cast<int>(dst_dtype)}});
        runner_cast_label.Run(stream);
      } else {
        cast_label.ShareDataWith(*label);
      }
    } else {
      cast_indices.ShareDataWith(*indices);
      cast_label.ShareDataWith(*label);
    }

    // equal
    Tensor tmp_equal(phi::DataType::BOOL);
    tmp_equal.Resize(inference->dims());
    tmp_equal.mutable_data<bool>(ctx.GetPlace());
    const auto& runner_equal =
        NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {});
    runner_equal.Run(stream);

    // cast equal
    Tensor tmp_equal_cast(phi::DataType::FLOAT32);
    tmp_equal_cast.Resize(inference->dims());
    tmp_equal_cast.mutable_data<float>(ctx.GetPlace());
    const auto& runner_cast_equal = NpuOpRunner(
        "Cast",
        {tmp_equal},
        {tmp_equal_cast},
        {{"dst_type",
          static_cast<int>(ConvertToNpuDtype(
              framework::TransToProtoVarType(tmp_equal_cast.dtype())))}});
    runner_cast_equal.Run(stream);

    // [correct]
    // reduce_max
    Tensor tmp_correct_max(phi::DataType::FLOAT32);
    tmp_correct_max.Resize(phi::make_ddim({num_samples}));
    tmp_correct_max.mutable_data<float>(ctx.GetPlace());
    const auto& runner_reduce_max =
        NpuOpRunner("ReduceMaxD",
                    {tmp_equal_cast},
                    {tmp_correct_max},
                    {{"axes", std::vector<int>{1}}, {"keep_dims", false}});
    runner_reduce_max.Run(stream);

    // reduce_sum
    Tensor tmp_correct(phi::DataType::FLOAT32);
    tmp_correct.Resize(correct->dims());
    tmp_correct.mutable_data<float>(ctx.GetPlace());
    const auto& runner_reduce_sum =
        NpuOpRunner("ReduceSumD",
                    {tmp_correct_max},
                    {tmp_correct},
                    {{"axes", std::vector<int>{0}}, {"keep_dims", false}});
    runner_reduce_sum.Run(stream);

    // cast to int
    correct->mutable_data<int>(ctx.GetPlace());
    const auto& runner_cast_correct = NpuOpRunner(
        "Cast",
        {tmp_correct},
        {*correct},
        {{"dst_type",
          static_cast<int>(ConvertToNpuDtype(
              framework::TransToProtoVarType(correct->dtype())))}});
    runner_cast_correct.Run(stream);

    // [total]
    total->mutable_data<int>(ctx.GetPlace());
    FillNpuTensorWithConstant<int>(total, static_cast<int>(num_samples));

    // use `total` of type `float32` for calculating accuracy
    Tensor tmp_total(phi::DataType::FLOAT32);
    tmp_total.Resize(total->dims());
    tmp_total.mutable_data<float>(ctx.GetPlace());
    FillNpuTensorWithConstant<float>(&tmp_total,
                                     static_cast<float>(num_samples));

    // [accuracy]
    accuracy->mutable_data<float>(ctx.GetPlace());
    const auto& runner_accuracy =
        NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {});
    runner_accuracy.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    accuracy,
    ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext,
                           paddle::platform::float16>,
    ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext, int64_t>);
paddle/fluid/operators/optimizers/adam_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class AdamNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const auto* param_var = ctx.InputVar("Param");
    PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
                      true,
                      platform::errors::InvalidArgument(
                          "The Var(%s)'s type should be phi::DenseTensor, "
                          "but the received is %s",
                          ctx.InputNames("Param").front(),
                          framework::ToTypeName(param_var->Type())));
    auto* param = ctx.Input<phi::DenseTensor>("Param");
    auto* grad_var = ctx.InputVar("Grad");
    PADDLE_ENFORCE_EQ(grad_var->IsType<phi::DenseTensor>(),
                      true,
                      platform::errors::InvalidArgument(
                          "The Grad(%s)'s type should be phi::DenseTensor, "
                          "but the received is %s",
                          ctx.InputNames("Grad").front(),
                          framework::ToTypeName(param_var->Type())));
    auto* grad = ctx.Input<phi::DenseTensor>("Grad");
    auto* mom1 = ctx.Input<phi::DenseTensor>("Moment1");
    auto* mom2 = ctx.Input<phi::DenseTensor>("Moment2");
    auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");

    auto* beta1_pow = ctx.Input<phi::DenseTensor>("Beta1Pow");
    auto* beta2_pow = ctx.Input<phi::DenseTensor>("Beta2Pow");

    auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
    auto* mom1_out = ctx.Output<phi::DenseTensor>("Moment1Out");
    auto* mom2_out = ctx.Output<phi::DenseTensor>("Moment2Out");
    auto* beta1_pow_out = ctx.Output<phi::DenseTensor>("Beta1PowOut");
    auto* beta2_pow_out = ctx.Output<phi::DenseTensor>("Beta2PowOut");

    bool skip_update = false;
    if (ctx.HasInput("SkipUpdate")) {
      auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
                        1,
                        platform::errors::InvalidArgument(
                            "Input(SkipUpdate) size must be 1, but get %d",
                            skip_update_tensor->numel()));
      std::vector<bool> skip_update_vec;
      paddle::framework::TensorToVector(
          *skip_update_tensor, ctx.device_context(), &skip_update_vec);
      skip_update = skip_update_vec[0];
    }
    // skip_update=true, just copy input to output, and TensorCopy will call
    // mutable_data
    if (skip_update) {
      VLOG(4) << "Adam skip update";
      framework::TensorCopy(
          *param,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          param_out);
      framework::TensorCopy(
          *mom1,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          mom1_out);
      framework::TensorCopy(
          *mom2,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          mom2_out);
      framework::TensorCopy(
          *beta1_pow,
          beta1_pow->place(),
          ctx.template device_context<platform::DeviceContext>(),
          beta1_pow_out);
      framework::TensorCopy(
          *beta2_pow,
          beta2_pow->place(),
          ctx.template device_context<platform::DeviceContext>(),
          beta2_pow_out);
      return;
    }

    bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
    VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

    param_out->mutable_data<T>(ctx.GetPlace());
    mom1_out->mutable_data<T>(ctx.GetPlace());
    mom2_out->mutable_data<T>(ctx.GetPlace());

    // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform
    // place.
    phi::DenseTensor beta1_pow_tmp;
    phi::DenseTensor beta2_pow_tmp;
    if (beta1_pow->place() == platform::CPUPlace()) {
      T beta1 = *beta1_pow->data<T>();
      beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&beta1_pow_tmp, beta1);
      beta1_pow = &beta1_pow_tmp;
    }
    if (beta2_pow->place() == platform::CPUPlace()) {
      T beta2 = *beta2_pow->data<T>();
      beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&beta2_pow_tmp, beta2);
      beta2_pow = &beta2_pow_tmp;
    }

    const phi::DenseTensor* beta1_tensor = nullptr;
    const phi::DenseTensor* beta2_tensor = nullptr;
    const phi::DenseTensor* epsilon_tensor = nullptr;

    phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32);
    phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32);
    phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);

    if (ctx.HasInput("Beta1Tensor")) {
      beta1_tensor = ctx.Input<phi::DenseTensor>("Beta1Tensor");
      PADDLE_ENFORCE_EQ(beta1_tensor->numel(),
                        1,
                        platform::errors::InvalidArgument(
                            "Input(Beta1Tensor) size must be 1, but get %d",
                            beta1_tensor->numel()));
    } else {
      T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
      beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&beta1_tmp, beta1);
      beta1_tensor = &beta1_tmp;
    }

    if (ctx.HasInput("Beta2Tensor")) {
      beta2_tensor = ctx.Input<phi::DenseTensor>("Beta2Tensor");
      PADDLE_ENFORCE_EQ(beta2_tensor->numel(),
                        1,
                        platform::errors::InvalidArgument(
                            "Input(Beta2Tensor) size must be 1, but get %d",
                            beta2_tensor->numel()));
    } else {
      T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
      beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&beta2_tmp, beta2);
      beta2_tensor = &beta2_tmp;
    }

    if (ctx.HasInput("EpsilonTensor")) {
      epsilon_tensor = ctx.Input<phi::DenseTensor>("EpsilonTensor");
      PADDLE_ENFORCE_EQ(epsilon_tensor->numel(),
                        1,
                        platform::errors::InvalidArgument(
                            "Input(EpsilonTensor) size must be 1, but get %d",
                            epsilon_tensor->numel()));
    } else {
      T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
      epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&epsilon_tmp, epsilon);
      epsilon_tensor = &epsilon_tmp;
    }

    VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
            << "beta2_pow.numel() : " << beta2_pow->numel();
    VLOG(3) << "param.numel(): " << param->numel();

    PADDLE_ENFORCE_EQ(beta1_pow_out->numel(),
                      1,
                      platform::errors::InvalidArgument(
                          "beta1 pow output size should be 1, but received "
                          "value is:%d.",
                          beta1_pow_out->numel()));

    PADDLE_ENFORCE_EQ(beta2_pow_out->numel(),
                      1,
                      platform::errors::InvalidArgument(
                          "beta2 pow output size should be 1, but received "
                          "value is:%d.",
                          beta2_pow_out->numel()));

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    const auto& runner = NpuOpRunner("ApplyAdamD",
                                     {
                                         *param,
                                         *mom1,
                                         *mom2,
                                         *beta1_pow,
                                         *beta2_pow,
                                         *lr,
                                         *beta1_tensor,
                                         *beta2_tensor,
                                         *epsilon_tensor,
                                         *grad,
                                     },
                                     {
                                         *param_out,
                                         *mom1_out,
                                         *mom2_out,
                                     },
                                     {});
    runner.Run(stream);

    // NOTE(zhiqiu): ApplyAdamD updates params inplace, so
    // if param and param_out is not same, we need to do copy.
    if (param_out->data<T>() != param->data<T>()) {
      framework::TensorCopy(
          *param,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          param_out);
    }
    if (mom1_out->data<T>() != mom1->data<T>()) {
      framework::TensorCopy(
          *mom1,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          mom1_out);
    }
    if (mom2_out->data<T>() != mom2->data<T>()) {
      framework::TensorCopy(
          *mom2,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          mom2_out);
    }
    if (!use_global_beta_pow) {
      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
      const auto& runner_m1 =
          NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
      runner_m1.Run(stream);
      const auto& runner_m2 =
          NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
      runner_m2.Run(stream);
    }
  }
};

template <typename T>
class AdamWNPUKernel : public AdamNPUKernel<platform::NPUDeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    VLOG(3) << "NPU AdamW Kernel";
    bool skip_update = false;
    if (ctx.HasInput("SkipUpdate")) {
      VLOG(3) << "Has SkipUpdate";
      auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
                        1,
                        platform::errors::InvalidArgument(
                            "Input(SkipUpdate) size must be 1, but get %d",
                            skip_update_tensor->numel()));
      std::vector<bool> skip_update_vec;
      paddle::framework::TensorToVector(
          *skip_update_tensor, ctx.device_context(), &skip_update_vec);
      skip_update = skip_update_vec[0];
    }
    VLOG(3) << "Skip update" << skip_update;
    bool with_decay = ctx.Attr<bool>("with_decay");
    if (!skip_update && with_decay) {
      float coeff = ctx.Attr<float>("coeff");
      auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");

      auto place = ctx.GetPlace();

      auto stream =
          ctx.template device_context<paddle::platform::NPUDeviceContext>()
              .stream();

      phi::DenseTensor one(phi::DataType::FLOAT32);
      phi::DenseTensor decay(phi::DataType::FLOAT32);
      phi::DenseTensor tmp(phi::DataType::FLOAT32);

      tmp.mutable_data<float>({1}, place);
      one.mutable_data<float>({1}, place);
      decay.mutable_data<float>({1}, place);

      FillNpuTensorWithConstant<float>(&one, 1.0f);
      framework::NPUAttributeMap attr_input = {{"value", coeff}};

      const auto& runner1 = NpuOpRunner("Muls", {*lr}, {tmp}, attr_input);
      runner1.Run(stream);

      const auto& runner2 = NpuOpRunner("Sub", {one, tmp}, {decay}, {});
      runner2.Run(stream);

      if (ctx.HasInput("MasterParam")) {
        PADDLE_THROW(platform::errors::Unimplemented(
            "Master Parma is not supported on npu"));
      } else {
        auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
        param_out->mutable_data<T>(ctx.GetPlace());

        const auto* param_var = ctx.InputVar("Param");
        PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
                          true,
                          platform::errors::InvalidArgument(
                              "The Var(%s)'s type should be phi::DenseTensor, "
                              "but the received is %s",
                              ctx.InputNames("Param").front(),
                              framework::ToTypeName(param_var->Type())));
        auto* param = ctx.Input<phi::DenseTensor>("Param");

        const auto& runner =
            NpuOpRunner("Mul",
                        {*param, decay},
                        {*const_cast<phi::DenseTensor*>(param)},
                        {});
        runner.Run(stream);
      }
    }
    AdamNPUKernel<platform::NPUDeviceContext, T>::Compute(ctx);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
    adam,
    ops::AdamNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::AdamNPUKernel<paddle::platform::NPUDeviceContext,
                       paddle::platform::float16>);

REGISTER_OP_NPU_KERNEL(adamw,
                       ops::AdamWNPUKernel<float>,
                       ops::AdamWNPUKernel<paddle::platform::float16>);
paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc
deleted, 100644 → 0
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {

template <typename T>
class NPUMergedMomentumOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto params = ctx.MultiInput<phi::DenseTensor>("Param");
    auto params_out = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
    size_t n = params.size();
    PADDLE_ENFORCE_EQ(n,
                      params_out.size(),
                      platform::errors::InvalidArgument(
                          "The size of Output(ParamOut) must be equal to "
                          "Input(Param), but got the size of Output(ParamOut) "
                          "is %d, the size of Input(Param) is %d.",
                          params_out.size(),
                          n));
    for (size_t i = 0; i < n; ++i) {
      PADDLE_ENFORCE_EQ(params[i],
                        params_out[i],
                        platform::errors::InvalidArgument(
                            "The size of Input(Param) and Output(ParamOut) "
                            "must be the same Tensors."));
    }

    auto grads = ctx.MultiInput<phi::DenseTensor>("Grad");
    PADDLE_ENFORCE_EQ(
        n,
        grads.size(),
        platform::errors::InvalidArgument(
            "The size of Input(Grad) must be equal to Input(Param), but got "
            "the size of Input(Grad) is %d, the size of Input(Param) is %d.",
            grads.size(),
            n));

    auto velocitys = ctx.MultiInput<phi::DenseTensor>("Velocity");
    PADDLE_ENFORCE_EQ(n,
                      velocitys.size(),
                      platform::errors::InvalidArgument(
                          "The size of Input(Velocity) must be equal to "
                          "Input(Param), but got the size of Input(Velocity) "
                          "is %d, the size of Input(Param) is %d.",
                          velocitys.size(),
                          n));

    auto velocitys_out = ctx.MultiOutput<phi::DenseTensor>("VelocityOut");
    PADDLE_ENFORCE_EQ(
        n,
        velocitys_out.size(),
        platform::errors::InvalidArgument(
            "The size of Output(VelocityOut) must be "
            "equal to Input(Param), but got the size of Output(VelocityOut) is "
            "%d, the size of Input(Param) is %d.",
            velocitys_out.size(),
            n));
    for (size_t i = 0; i < n; ++i) {
      PADDLE_ENFORCE_EQ(velocitys[i],
                        velocitys_out[i],
                        platform::errors::InvalidArgument(
                            "Input(Velocity) and Output(VelocityOut) must be "
                            "the same Tensors."));
    }

    T mu = static_cast<T>(ctx.Attr<float>("mu"));
    auto lrs = ctx.MultiInput<phi::DenseTensor>("LearningRate");
    if (lrs.size() != 1) {
      PADDLE_ENFORCE_EQ(
          n,
          lrs.size(),
          platform::errors::InvalidArgument(
              "If the size of Input(LearningRate) is not 1, the size of "
              "Input(LearningRate) must be "
              "equal to Input(Param), but got the size of Input(LearningRate) "
              "is %d, the size of Input(Param) is %d.",
              lrs.size(),
              n));
    }
    auto use_nesterov = ctx.Attr<bool>("use_nesterov");
    auto regularization_methods =
        ctx.Attr<std::vector<std::string>>("regularization_method");
    auto regularization_coeffs =
        ctx.Attr<std::vector<float>>("regularization_coeff");
    if (regularization_methods.size() != 0) {
      PADDLE_ENFORCE_EQ(
          n,
          regularization_methods.size(),
          platform::errors::InvalidArgument(
              "The size of Attr(regularization_method) must be equal "
              "to Input(Param), but got the size of "
              "Attr(regularization_method) is %d, the size of Input(Param) is "
              "%d.",
              regularization_methods.size(),
              n));
      PADDLE_ENFORCE_EQ(
          n,
          regularization_coeffs.size(),
          platform::errors::InvalidArgument(
              "The size of Attr(regularization_coeff) must be equal "
              "to Input(Param), but got the size of Attr(regularization_coeff) "
              "is %d, the size of Input(Param) is %d.",
              regularization_coeffs.size(),
              n));
    }

    VLOG(5) << "use_nesterov: " << use_nesterov
            << ", regularization_methods.size(): "
            << regularization_methods.size()
            << ", regularization_coeffs.size(): "
            << regularization_coeffs.size();

    auto& dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();

    Tensor mu_tensor;
    mu_tensor.mutable_data<T>(phi::make_ddim({1}), ctx.GetPlace());
    FillNpuTensorWithConstant<T>(&mu_tensor, mu);

    for (size_t idx = 0; idx < n; ++idx) {
      phi::RegularizationType regularization_flag =
          regularization_methods.size() > 0 &&
                  regularization_methods[idx] == "l2_decay"
              ? phi::RegularizationType::kL2DECAY
              : phi::RegularizationType::kNONE;
      float regularization_coeff = 0.0;
      if (regularization_coeffs.size() != 0) {
        regularization_coeff = regularization_coeffs[idx];
      }

      auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0];
      auto param = params[idx];
      auto param_out = params_out[idx];
      auto velocity = velocitys[idx];
      auto velocity_out = velocitys_out[idx];

      auto grad = grads[idx];
      Tensor regularized_grad;
      if (regularization_flag == phi::RegularizationType::kL2DECAY) {
        regularized_grad.mutable_data<T>(grad->dims(), ctx.GetPlace());
        const auto& runner1 = NpuOpRunner("Muls",
                                          {*param},
                                          {regularized_grad},
                                          {{"value", regularization_coeff}});
        runner1.Run(dev_ctx.stream());
        const auto& runner2 = NpuOpRunner(
            "Add", {regularized_grad, *grad}, {regularized_grad}, {});
        runner2.Run(dev_ctx.stream());
      } else {
        regularized_grad.ShareDataWith(*grad);
      }
      framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
      framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
      // NOTE: ApplyMomentum will change the input
      const auto& runner = NpuOpRunner("ApplyMomentum",
                                       {*param_out,
                                        *velocity_out,
                                        *learning_rate,
                                        regularized_grad,
                                        mu_tensor},
                                       {*param_out},
                                       {{"use_nesterov", use_nesterov}});
      runner.Run(dev_ctx.stream());
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(merged_momentum,
                       ops::NPUMergedMomentumOpKernel<float>,
                       ops::NPUMergedMomentumOpKernel<plat::float16>);
paddle/fluid/operators/optimizers/momentum_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {

template <typename T>
class NPUMomentumOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();

    std::string regularization_method =
        ctx.Attr<std::string>("regularization_method");
    auto regularization_coeff = ctx.Attr<float>("regularization_coeff");
    phi::RegularizationType regularization_flag{
        phi::RegularizationType::kNONE};  // disable regularization
    if (regularization_method == "l2_decay") {
      regularization_flag = phi::RegularizationType::kL2DECAY;
    }

    T mu = static_cast<T>(ctx.Attr<float>("mu"));
    bool use_nesterov = ctx.Attr<bool>("use_nesterov");

    auto learning_rate = ctx.Input<phi::DenseTensor>("LearningRate");
    auto param = ctx.Input<phi::DenseTensor>("Param");
    auto velocity = ctx.Input<phi::DenseTensor>("Velocity");

    auto param_out = ctx.Output<phi::DenseTensor>("ParamOut");
    auto velocity_out = ctx.Output<phi::DenseTensor>("VelocityOut");

    param_out->mutable_data<T>(ctx.GetPlace());
    velocity_out->mutable_data<T>(ctx.GetPlace());

    auto* grad_var = ctx.InputVar("Grad");
    if (grad_var->IsType<phi::DenseTensor>()) {
      auto grad = ctx.Input<phi::DenseTensor>("Grad");
      Tensor mu_tensor;
      mu_tensor.mutable_data<T>(phi::make_ddim({1}), ctx.GetPlace());
      FillNpuTensorWithConstant<T>(&mu_tensor, mu);

      Tensor regularized_grad;
      if (regularization_flag == phi::RegularizationType::kL2DECAY) {
        regularized_grad.mutable_data<T>(grad->dims(), ctx.GetPlace());
        const auto& runner1 = NpuOpRunner("Muls",
                                          {*param},
                                          {regularized_grad},
                                          {{"value", regularization_coeff}});
        runner1.Run(dev_ctx.stream());
        const auto& runner2 = NpuOpRunner(
            "Add", {regularized_grad, *grad}, {regularized_grad}, {});
        runner2.Run(dev_ctx.stream());
      } else {
        regularized_grad.ShareDataWith(*grad);
      }
      framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
      framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
      // NOTE: ApplyMomentum will change the input
      const auto& runner = NpuOpRunner("ApplyMomentum",
                                       {*param_out,
                                        *velocity_out,
                                        *learning_rate,
                                        regularized_grad,
                                        mu_tensor},
                                       {*param_out},
                                       {{"use_nesterov", use_nesterov}});
      runner.Run(dev_ctx.stream());
    } else if (grad_var->IsType<phi::SelectedRows>()) {
      PADDLE_ENFORCE_EQ(
          false,
          true,
          platform::errors::PermissionDenied("Unsupport SparseMomentum"));
    } else {
      PADDLE_ENFORCE_EQ(false,
                        true,
                        platform::errors::PermissionDenied(
                            "Unsupported Variable Type of Grad "
                            "in MomentumOp. Excepted LodTensor "
                            "or SelectedRows, But received [%s]",
                            paddle::framework::ToTypeName(grad_var->Type())));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(momentum,
                       ops::NPUMomentumOpKernel<float>,
                       ops::NPUMomentumOpKernel<plat::float16>);
paddle/fluid/operators/optimizers/rmsprop_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class RMSPROPNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* grad_var = ctx.InputVar("Grad");
    auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
    auto* moment_out = ctx.Output<phi::DenseTensor>("MomentOut");
    auto* mean_square_out = ctx.Output<phi::DenseTensor>("MeanSquareOut");

    param_out->mutable_data<T>(ctx.GetPlace());
    moment_out->mutable_data<T>(ctx.GetPlace());
    mean_square_out->mutable_data<T>(ctx.GetPlace());

    auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
    auto rho = static_cast<T>(ctx.Attr<float>("decay"));
    auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
    auto* p_tensor = ctx.Input<phi::DenseTensor>("Param");
    auto* ms_tensor = ctx.Input<phi::DenseTensor>("MeanSquare");
    auto* lr_tensor = ctx.Input<phi::DenseTensor>("LearningRate");
    auto* mom_tensor = ctx.Input<phi::DenseTensor>("Moment");
    bool centered = ctx.Attr<bool>("centered");

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    if (grad_var->IsType<phi::DenseTensor>()) {
      auto* grad_tensor = ctx.Input<phi::DenseTensor>("Grad");
      if (centered) {
        framework::NPUAttributeMap attr_input = {{"use_locking", false}};
        const phi::DenseTensor* rho_tensor = nullptr;
        const phi::DenseTensor* momentum_tensor = nullptr;
        const phi::DenseTensor* epsilon_tensor = nullptr;
        phi::DenseTensor rho_tmp(phi::DataType::FLOAT32);
        rho_tmp.mutable_data<T>({1}, ctx.GetPlace());
        FillNpuTensorWithConstant<T>(&rho_tmp, rho);
        rho_tensor = &rho_tmp;
        phi::DenseTensor momentum_tmp(phi::DataType::FLOAT32);
        momentum_tmp.mutable_data<T>({1}, ctx.GetPlace());
        FillNpuTensorWithConstant<T>(&momentum_tmp, momentum);
        momentum_tensor = &momentum_tmp;
        phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);
        epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
        FillNpuTensorWithConstant<T>(&epsilon_tmp, epsilon);
        epsilon_tensor = &epsilon_tmp;

        auto* mg_tensor = ctx.Input<phi::DenseTensor>("MeanGrad");
        auto* mean_grad_out = ctx.Output<phi::DenseTensor>("MeanGradOut");
        mean_grad_out->mutable_data<T>(ctx.GetPlace());
        const auto& runner_applycenterrmsprop = NpuOpRunner(
            std::string("ApplyCenteredRMSPropD"),
            {*p_tensor,
             *mg_tensor,
             *ms_tensor,
             *mom_tensor,
             *lr_tensor,
             *rho_tensor,
             *momentum_tensor,
             *epsilon_tensor,
             *grad_tensor},
            {*param_out, *mean_grad_out, *mean_square_out, *moment_out},
            {attr_input});
        runner_applycenterrmsprop.Run(stream);
      } else {
        framework::NPUAttributeMap attr_input = {
            {"rho", rho}, {"momentum", momentum}, {"epsilon", epsilon}};
        const auto& runner_applyrmsprop = NpuOpRunner(
            std::string("ApplyRMSPropD"),
            {*p_tensor, *ms_tensor, *mom_tensor, *lr_tensor, *grad_tensor},
            {*param_out, *mean_square_out, *moment_out},
            {attr_input});
        runner_applyrmsprop.Run(stream);
      }
    } else {
      PADDLE_ENFORCE_EQ(false,
                        true,
                        platform::errors::PermissionDenied(
                            "Unsupported Variable Type of Grad "
                            "in RmspropOp. Excepted LodTensor, "
                            "But received [%s]",
                            paddle::framework::ToTypeName(grad_var->Type())));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
    rmsprop,
    ops::RMSPROPNPUKernel<paddle::platform::NPUDeviceContext, float>)
paddle/fluid/operators/optimizers/sgd_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/optimizers/sgd_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class SGDNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* learning_rate = ctx.Input<phi::DenseTensor>("LearningRate");
    auto* param_var = ctx.Input<phi::DenseTensor>("Param");
    auto* grad_var = ctx.Input<phi::DenseTensor>("Grad");
    auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");

    param_out->mutable_data<T>(ctx.GetPlace());

    const auto& runner = NpuOpRunner("ApplyGradientDescent",
                                     {*param_var, *learning_rate, *grad_var},
                                     {*param_out},
                                     {});

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);

    // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so
    // if param and param_out is not same, we need to do copy.
    if (param_out->data<T>() != param_var->data<T>()) {
      framework::TensorCopy(
          *param_var,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          param_out);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
    sgd,
    ops::SGDNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::SGDNPUKernel<paddle::platform::NPUDeviceContext, double>,
    ops::SGDNPUKernel<paddle::platform::NPUDeviceContext,
                      paddle::platform::float16>);
paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace operators {

template <typename T>
class ReduceAnyNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const phi::DenseTensor* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    auto dims = ctx.Attr<std::vector<int>>("dim");

    out->mutable_data<T>(ctx.GetPlace());

    // set attr
    NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}};

    const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr);
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(reduce_any, ops::ReduceAnyNPUKernel<bool>);
paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP_ITSELF(reduce_any);
USE_OP_DEVICE_KERNEL(reduce_any, NPU);

template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();

  std::vector<bool> init_x = {true, false, false, false};
  f::TensorFromVector<bool>(init_x, ctx, tensor_x);
  tensor_x->Resize(phi::make_ddim({2}));

  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();

  // run
  std::vector<int> axes;
  f::AttributeMap attrs = {{"axes", axes}, {"keep_dims", true}};
  auto op = f::OpRegistry::CreateOp(
      "reduce_any", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);

  op->Run(*scope, place);
  ctx.Wait();

  std::vector<bool> out_vec;
  f::TensorToVector<bool>(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  std::vector<bool> expected_vec = {true};
  EXPECT_EQ(out_vec.size(), expected_vec.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], expected_vec[i]);
  }
}

TEST(reduce_any, NPU) {
  f::Scope scope;
  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
  Compare<bool>(&scope, *ctx);
}
paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceMaxNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<paddle::platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    const auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    if (framework::TransToProtoVarType(x->dtype()) ==
        framework::proto::VarType::INT64) {
      auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
                        const std::vector<phi::DenseTensor>& outputs,
                        const NPUAttributeMap& attrs,
                        const platform::NPUDeviceContext& dev_ctx) {
        const auto& runner =
            NpuOpRunner("ReduceMaxD", {inputs[0]}, {outputs[0]}, attrs);
        runner.Run(dev_ctx.stream());
      };

      NpuOpRunner::TypeAdapter({*x},
                               {cast_out},
                               attr_input,
                               dev_ctx,
                               op_func,
                               {framework::proto::VarType::INT32},
                               {framework::proto::VarType::INT32});
    } else {
      const auto& runner =
          NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input);
      runner.Run(dev_ctx.stream());
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(dev_ctx.stream());
    }
  }
};

template <typename DeviceContext, typename T>
class ReduceMaxGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<phi::DenseTensor>("X");
    auto* out = context.Input<phi::DenseTensor>("Out");
    auto* out_grad =
        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto reduce_dims = context.Attr<std::vector<int>>("dim");
    bool reduce_all = context.Attr<bool>("reduce_all");
    int in_dtype = context.Attr<int>("in_dtype");

    PADDLE_ENFORCE_EQ(
        in_dtype == -1,
        true,
        platform::errors::InvalidArgument(
            "NPU only support in_dtype == -1 in reduce_max_grad op."));

    auto* x_grad =
        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
    x_grad->mutable_data<T>(context.GetPlace());

    auto& dev_ctx =
        context.template device_context<paddle::platform::NPUDeviceContext>();
    auto place = context.GetPlace();
    auto stream = dev_ctx.stream();

    // broadcast
    auto x_dims_vec = phi::vectorize(x->dims());
    if (reduce_all) {
      reduce_dims.clear();
      for (size_t d = 0; d < x_dims_vec.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }

    phi::DenseTensor tmp_out, tmp_out_grad;
    auto tmp_out_dims_vec = x_dims_vec;
    for (auto d : reduce_dims) {
      if (d < 0) {
        d += x_dims_vec.size();
      }
      tmp_out_dims_vec[d] = 1;
    }

    tmp_out.ShareDataWith(*out);
    tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
    tmp_out_grad.ShareDataWith(*out_grad);
    tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));

    phi::DenseTensor transformed_out(x->type());
    transformed_out.Resize(phi::make_ddim(x_dims_vec));
    transformed_out.mutable_data<T>(place);
    NpuOpRunner r_brd_out;
    r_brd_out.SetType("BroadcastTo")
        .AddInput(tmp_out)
        .AddInput(std::move(x_dims_vec))
        .AddOutput(transformed_out)
        .Run(stream);
    phi::DenseTensor transformed_out_grad(x->type());
    transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
    transformed_out_grad.mutable_data<T>(place);
    NpuOpRunner r_brd_out_grad;
    r_brd_out_grad.SetType("BroadcastTo")
        .AddInput(tmp_out_grad)
        .AddInput(std::move(x_dims_vec))
        .AddOutput(transformed_out_grad)
        .Run(stream);

    // compare
    phi::DenseTensor equal_cond;
    equal_cond.mutable_data<bool>(x_grad->dims(), place);
    const auto& r_equal =
        NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {});
    r_equal.Run(stream);

    // select
    phi::DenseTensor t_zero;
    t_zero.mutable_data<T>(x_grad->dims(), place);
    FillNpuTensorWithConstant(&t_zero, static_cast<T>(0));
    t_zero.Resize(x_grad->dims());

    const auto& r_sel = NpuOpRunner(
        "SelectV2", {equal_cond, transformed_out_grad, t_zero}, {*x_grad}, {});
    r_sel.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
    reduce_max,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int64_t>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int>);
REGISTER_OP_NPU_KERNEL(
    reduce_max_grad,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, int64_t>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, int>);
paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
namespace paddle {
namespace operators {

template <typename T>
class NPUReduceMeanOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());

    bool reduce_all = ctx.Attr<bool>("reduce_all");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");

    auto input_dims = input->dims();
    if (reduce_all) {
      dims.clear();
      for (int i = 0; i < input_dims.size(); i++) {
        dims.push_back(static_cast<int>(i));
      }
    }

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    NpuOpRunner runner;
    runner.SetType("ReduceMean")
        .AddInput(*input)
        .AddInput(std::move(dims))
        .AddOutput(*output)
        .AddAttrs({{"keep_dims", keep_dim}})
        .Run(stream);
  }
};

template <typename T>
class NPUReduceMeanGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* input_grad =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    input_grad->mutable_data<T>(ctx.GetPlace());

    bool reduce_all = ctx.Attr<bool>("reduce_all");
    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
    auto input_dims = input->dims();

    int reduce_numel = 1;
    if (reduce_all) {
      reduce_dims.clear();
      for (int d = 0; d < input_dims.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }
    for (auto& d : reduce_dims) {
      if (d < 0) {
        d = d + input_dims.size();
      }
      reduce_numel *= input_dims[d];
    }

    phi::DenseTensor tensor_value(input_grad->dtype());
    tensor_value.mutable_data<T>({1}, ctx.GetPlace());
    FillNpuTensorWithConstant<T>(
        &tensor_value, static_cast<T>(1.0f / static_cast<T>(reduce_numel)));

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    NpuOpRunner runner;
    runner.SetType("Fill")
        .AddInput(phi::vectorize(input_dims))
        .AddInput(tensor_value)
        .AddOutput(*input_grad)
        .Run(stream);

    phi::DenseTensor transformed_input_grad, transformed_out_grad;
    phi::DenseTensor tmp_output_grad;
    auto tmp_output_dims = input_dims;
    for (auto d : reduce_dims) {
      tmp_output_dims[d] = 1;
    }
    tmp_output_grad.ShareDataWith(*output_grad);
    tmp_output_grad.Resize(tmp_output_dims);
    auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    NpuElementWiseOpBroadcast<T>(dev_ctx,
                                 input_grad,
                                 &tmp_output_grad,
                                 0,
                                 &transformed_input_grad,
                                 &transformed_out_grad);
    const auto& runner2 =
        NpuOpRunner("Mul",
                    {transformed_input_grad, transformed_out_grad},
                    {*input_grad},
                    {});
    runner2.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(reduce_mean, ops::NPUReduceMeanOpKernel<float>);
REGISTER_OP_NPU_KERNEL(reduce_mean_grad,
                       ops::NPUReduceMeanGradOpKernel<float>);
paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
deleted, 100644 → 0
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceMinNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<paddle::platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    const auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    if (x->dtype() == phi::DataType::INT64) {
      auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
                        const std::vector<phi::DenseTensor>& outputs,
                        const NPUAttributeMap& attrs,
                        const platform::NPUDeviceContext& dev_ctx) {
        const auto& runner =
            NpuOpRunner("ReduceMinD", {inputs[0]}, {outputs[0]}, attrs);
        runner.Run(dev_ctx.stream());
      };

      NpuOpRunner::TypeAdapter({*x},
                               {cast_out},
                               attr_input,
                               dev_ctx,
                               op_func,
                               {framework::proto::VarType::INT32},
                               {framework::proto::VarType::INT32});
    } else {
      const auto& runner =
          NpuOpRunner("ReduceMinD", {*x}, {cast_out}, attr_input);
      runner.Run(dev_ctx.stream());
    }

    if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(dev_ctx.stream());
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    reduce_min,
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, plat::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, int>);
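All of the reduce kernels in this change treat the reduce_all attribute the same way: it overrides the user-supplied "dim" list with every axis of the input before the "axes" attribute is handed to the ReduceMinD/ReduceProdD/ReduceSumD runner. A standalone sketch of that axis expansion, with an illustrative helper name that is not part of Paddle:

// Standalone sketch, not Paddle code: expand "reduce_all" into the full
// axis list [0, rank) that the NPU reduce runners receive as "axes".
#include <iostream>
#include <vector>

std::vector<int> build_reduce_axes(int rank,
                                   const std::vector<int>& dims,
                                   bool reduce_all) {
  if (!reduce_all) return dims;               // use the supplied axes as-is
  std::vector<int> all(rank);
  for (int i = 0; i < rank; ++i) all[i] = i;  // every axis is reduced
  return all;
}

int main() {
  auto axes = build_reduce_axes(/*rank=*/3, /*dims=*/{1}, /*reduce_all=*/true);
  for (int a : axes) std::cout << a << " ";   // prints 0 1 2
  std::cout << "\n";
  return 0;
}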
paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
deleted
100644 → 0
view file @ b451aff8
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceProdNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());

    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<paddle::platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    const auto& runner =
        NpuOpRunner("ReduceProdD", {*x}, {cast_out}, attr_input);
    runner.Run(stream);

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    reduce_prod,
    ops::ReduceProdNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceProdNPUKernel<plat::NPUDeviceContext, plat::float16>);
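Like reduce_min above, reduce_prod accumulates in the input dtype into the temporary cast_out and issues a single Cast at the end whenever out_dtype requests a different type. A standalone sketch of that "accumulate in T, cast once at the end" pattern (illustrative names, not Paddle API):

// Standalone sketch, not Paddle code: product computed in float, result
// delivered as int64_t via one final cast, mirroring cast_out -> Cast -> out.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t prod_as_int64(const std::vector<float>& x) {
  float acc = 1.0f;                  // accumulate in the input dtype
  for (float v : x) acc *= v;
  return static_cast<int64_t>(acc);  // single cast to the requested out_dtype
}

int main() {
  std::cout << prod_as_int64({2.0f, 3.0f, 4.0f}) << "\n";  // prints 24
  return 0;
}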
paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
deleted
100644 → 0
view file @ b451aff8
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/operators/unsqueeze_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceSumNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    bool keep_dims = ctx.Attr<bool>("keep_dim");
    auto dims = ctx.Attr<std::vector<int>>("dim");

    out->mutable_data<T>(ctx.GetPlace());

    // special case
    if (x->dims().size() == 1 && keep_dims == false) {
      keep_dims = true;
    }

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    phi::DenseTensor cast_x;
    phi::DenseTensor cast_out;
    // NOTE: ReduceSumD only supports fp32 and fp16
    if (framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP32 &&
        framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP16) {
      cast_x.Resize(x->dims());
      cast_x.mutable_data<float>(ctx.GetPlace());
      auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
      const auto& runner_cast = NpuOpRunner(
          "Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);

      cast_out.Resize(out->dims());
      cast_out.mutable_data<float>(ctx.GetPlace());
    } else {
      cast_x.ShareDataWith(*x);
      cast_out.ShareDataWith(*out);
    }

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      const auto& runner =
          NpuOpRunner("ReduceSumD",
                      {cast_x},
                      {cast_out},
                      {{"axes", dim_vec}, {"keep_dims", keep_dims}});
      runner.Run(stream);
    } else {
      const auto& runner =
          NpuOpRunner("ReduceSumD",
                      {cast_x},
                      {cast_out},
                      {{"axes", dims}, {"keep_dims", keep_dims}});
      runner.Run(stream);
    }

    if (framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP32 &&
        framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP16) {
      auto dst_dtype =
          ConvertToNpuDtype(framework::TransToProtoVarType(out->dtype()));
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);
    }
  }
};

template <typename DeviceContext, typename T>
class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    bool keep_dims = ctx.Attr<bool>("keep_dim");
    auto dims = ctx.Attr<std::vector<int>>("dim");

    x_grad->mutable_data<T>(ctx.GetPlace());

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    if (keep_dims || reduce_all) {
      const auto& runner = NpuOpRunner("BroadcastToD",
                                       {*out_grad},
                                       {*x_grad},
                                       {{"shape", phi::vectorize(x->dims())}});
      runner.Run(stream);
    } else {
      framework::DDim out_dims;
      out_dims = UnsqueezeKernel<DeviceContext, T>::GetOutputShape(
          dims, out_grad->dims());

      phi::DenseTensor out_grad_tmp(out_grad->type());
      out_grad_tmp.Resize(out_dims);
      out_grad_tmp.mutable_data<T>(ctx.GetPlace());
      framework::TensorCopy(
          *out_grad,
          ctx.GetPlace(),
          ctx.template device_context<platform::DeviceContext>(),
          &out_grad_tmp);
      out_grad_tmp.Resize(out_dims);

      const auto& runner = NpuOpRunner("BroadcastToD",
                                       {out_grad_tmp},
                                       {*x_grad},
                                       {{"shape", phi::vectorize(x->dims())}});
      runner.Run(stream);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    reduce_sum,
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext,
                            paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
    reduce_sum_grad,
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext,
                                paddle::platform::float16>);
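In ReduceSumGradNPUKernel, the keep_dim == false branch first restores the reduced axes as size-1 dimensions (via UnsqueezeKernel::GetOutputShape) so that BroadcastToD can map the output gradient back onto the input shape. A standalone sketch of that unsqueeze step, assuming non-negative, sorted-ascending axes (illustrative name, not Paddle API):

// Standalone sketch, not Paddle code: re-insert reduced axes as size-1 dims.
#include <algorithm>
#include <iostream>
#include <vector>

std::vector<int> unsqueeze_reduced(std::vector<int> out_shape,
                                   std::vector<int> axes) {
  std::sort(axes.begin(), axes.end());
  for (int a : axes) {
    out_shape.insert(out_shape.begin() + a, 1);  // restore a size-1 axis
  }
  return out_shape;
}

int main() {
  // x: [2, 3, 4], reduced over axis 1 with keep_dim=false -> out: [2, 4].
  for (int d : unsqueeze_reduced({2, 4}, {1})) std::cout << d << " ";
  std::cout << "\n";  // prints 2 1 4, broadcastable back to [2, 3, 4]
  return 0;
}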
paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc
deleted
100644 → 0
view file @ b451aff8
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class SequenceMaskNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Output<phi::DenseTensor>("Y");
    int maxlen = ctx.Attr<int>("maxlen");

    if (ctx.HasInput("MaxLenTensor")) {
      auto max_len_tensor = ctx.Input<phi::DenseTensor>("MaxLenTensor");
      PADDLE_ENFORCE_NOT_NULL(max_len_tensor,
                              platform::errors::InvalidArgument(
                                  "Input(MaxLenTensor) should not be NULL. "
                                  "But received Input(MaxLenTensor) is NULL"));
      phi::DenseTensor temp;
      paddle::framework::TensorCopySync(
          *max_len_tensor, platform::CPUPlace(), &temp);
      maxlen = *temp.data<int32_t>();
      PADDLE_ENFORCE_GT(
          maxlen,
          0,
          platform::errors::InvalidArgument(
              "Input(MaxLenTensor) value should be greater than 0. But "
              "received Input(MaxLenTensor) value = %d.",
              maxlen));
    }

    if (maxlen < 0) {
      auto x_numel = x->numel();
      if (x_numel == 0) {
        maxlen = 0;
      } else {
        std::vector<T> x_vec;
        framework::TensorToVector(*x, dev_ctx, &x_vec);
        auto x_data = x_vec.data();
        maxlen = static_cast<int>(*std::max_element(x_data, x_data + x_numel));
      }
    }

    auto y_dim = phi::vectorize<int>(x->dims());
    y_dim.push_back(maxlen);

    phi::DenseTensor cast_x;
    cast_x.mutable_data<int32_t>(x->dims(), ctx.GetPlace());
    const auto& cast1_runner = NpuOpRunner(
        "Cast",
        {*x},
        {cast_x},
        {{"dst_type",
          ConvertToNpuDtype(framework::TransToProtoVarType(cast_x.dtype()))}});
    cast1_runner.Run(dev_ctx.stream());

    phi::DenseTensor tmp;
    tmp.mutable_data<int32_t>(phi::make_ddim({maxlen}), ctx.GetPlace());
    NpuOpRunner range_runner;
    range_runner.SetType("Range");
    range_runner.AddInput(std::vector<int32_t>({0}));
    range_runner.AddInput(std::vector<int32_t>({maxlen}));
    range_runner.AddInput(std::vector<int32_t>({1}));
    range_runner.AddOutput(tmp);
    range_runner.Run(dev_ctx.stream());

    phi::DenseTensor expand_tmp;
    expand_tmp.mutable_data<int32_t>(phi::make_ddim(y_dim), ctx.GetPlace());
    const auto& expand_runner =
        NpuOpRunner("ExpandD", {tmp}, {expand_tmp}, {{"shape", y_dim}});
    expand_runner.Run(dev_ctx.stream());

    auto x_dims = phi::vectorize<int>(x->dims());
    x_dims.push_back(1);
    cast_x.Resize(phi::make_ddim({x_dims}));
    phi::DenseTensor x_tmp;
    x_tmp.mutable_data<int32_t>(phi::make_ddim(y_dim), ctx.GetPlace());
    const auto& tile_runner =
        NpuOpRunner("TileWithAxis",
                    {cast_x},
                    {x_tmp},
                    {{"axis", x->dims().size()}, {"tiles", maxlen}});
    tile_runner.Run(dev_ctx.stream());

    phi::DenseTensor y_tmp;
    y_tmp.mutable_data<uint8_t>(phi::make_ddim(y_dim), ctx.GetPlace());
    const auto& less_runner =
        NpuOpRunner("Less", {expand_tmp, x_tmp}, {y_tmp}, {});
    less_runner.Run(dev_ctx.stream());

    y->Resize(phi::make_ddim(y_dim));
    auto out_dtype = static_cast<framework::proto::VarType::Type>(
        ctx.Attr<int>("out_dtype"));
    if (out_dtype == framework::proto::VarType::INT32) {
      y->mutable_data<int32_t>(ctx.GetPlace());
    } else if (out_dtype == framework::proto::VarType::INT64) {
      y->mutable_data<int64_t>(ctx.GetPlace());
    } else if (out_dtype == framework::proto::VarType::FP32) {
      y->mutable_data<float>(ctx.GetPlace());
    } else if (out_dtype == framework::proto::VarType::FP64) {
      y->mutable_data<double>(ctx.GetPlace());
    } else if (out_dtype == framework::proto::VarType::BOOL) {
      y->mutable_data<bool>(ctx.GetPlace());
    } else if (out_dtype == framework::proto::VarType::UINT8) {
      y->mutable_data<uint8_t>(ctx.GetPlace());
    } else {
      PADDLE_ENFORCE(false,
                     platform::errors::InvalidArgument(
                         "out_dtype only supports int32, int64, fp32, fp64, "
                         "bool, uint8, but received out_dtype is %d",
                         out_dtype));
    }

    const auto& cast2_runner = NpuOpRunner(
        "Cast", {y_tmp}, {*y}, {{"dst_type", ConvertToNpuDtype(out_dtype)}});
    cast2_runner.Run(dev_ctx.stream());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    sequence_mask,
    ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, int32_t>,
    ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, int64_t>,
    ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, float>,
    ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, double>);
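The NPU op sequence above (Range, ExpandD, TileWithAxis, Less, Cast) assembles the mask mask[i][j] = (j < len[i]). A standalone CPU sketch of the same result (illustrative name, not Paddle API):

// Standalone CPU sketch, not Paddle code: sequence mask from lengths.
// Row i has len[i] ones followed by zeros, padded out to maxlen columns.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<std::vector<int32_t>> sequence_mask(
    const std::vector<int32_t>& len, int maxlen) {
  std::vector<std::vector<int32_t>> mask(len.size(),
                                         std::vector<int32_t>(maxlen, 0));
  for (std::size_t i = 0; i < len.size(); ++i) {
    for (int j = 0; j < maxlen; ++j) {
      mask[i][j] = (j < len[i]) ? 1 : 0;  // "Less" between iota(j) and len[i]
    }
  }
  return mask;
}

int main() {
  for (const auto& row : sequence_mask({1, 3}, 4)) {
    for (int v : row) std::cout << v << " ";
    std::cout << "\n";  // prints: 1 0 0 0 / 1 1 1 0
  }
  return 0;
}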