Commit 85512d60 (unverified)
Repository: Crayon鑫 / Paddle, forked from PaddlePaddle / Paddle
Authored May 12, 2021 by liym27; committed via GitHub on May 12, 2021.
[NPU] Support async copy for TensorFromVector with event (#32563)
Parent: f1d63029

Showing 2 changed files with 84 additions and 32 deletions (+84, -32):
paddle/fluid/framework/tensor_util.h    +53  -4
paddle/fluid/operators/npu_op_runner.h  +31  -28
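In short, the commit replaces the blocking host-to-NPU copy in TensorFromVector (which passed nullptr as the stream) with a three-step pattern: (1) copy the vector into an NPU pinned (page-locked) staging tensor, (2) issue an asynchronous copy from pinned memory to the device on the context's stream, and (3) record an event on the pinned allocation so the NPUPinnedAllocator defers freeing it until the stream has drained past the copy. The sketch below is a minimal, self-contained illustration of why step (3) is needed; it is not Paddle or ACL code: a std::future stands in for the recorded event and an async task stands in for the device stream.

// Minimal standard-C++ sketch of the staged async-copy pattern (assumption:
// names StagedCopy/CopyFromVectorAsync are illustrative, not from the commit).
#include <cstring>
#include <future>
#include <memory>
#include <vector>

struct StagedCopy {
  std::unique_ptr<char[]> staging;  // stands in for the NPU pinned tensor
  std::future<void> done;           // stands in for the recorded event
};

template <typename T>
StagedCopy CopyFromVectorAsync(const std::vector<T>& src, T* dst) {
  size_t bytes = src.size() * sizeof(T);
  // 1. vector -> staging buffer (synchronous host-side copy)
  auto staging = std::make_unique<char[]>(bytes);
  std::memcpy(staging.get(), src.data(), bytes);
  // 2. staging -> destination, asynchronously ("on the stream")
  char* raw = staging.get();
  auto done = std::async(std::launch::async,
                         [raw, dst, bytes] { std::memcpy(dst, raw, bytes); });
  // 3. return the buffer together with its completion handle, so it cannot
  //    be freed before the async copy finishes (RecordEvent's job in Paddle)
  return {std::move(staging), std::move(done)};
}

int main() {
  std::vector<float> src(1024, 1.5f);
  std::vector<float> dst(1024);
  StagedCopy pending = CopyFromVectorAsync(src, dst.data());
  pending.done.wait();  // in Paddle, the allocator polls the event instead
  return dst[0] == 1.5f ? 0 : 1;
}

Without step (3), the staging buffer would be destroyed when it goes out of scope while the device may still be reading from it; the event ties the buffer's lifetime to completion of the copy.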
paddle/fluid/framework/tensor_util.h

@@ -19,6 +19,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/dlpack_tensor.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
+#endif
 #include "paddle/fluid/platform/device_context.h"

 namespace paddle {
@@ -166,8 +170,30 @@ void TensorFromVector(const std::vector<T>& src,
-  // Since vector is on cpu, I think this function should be a "sync" operation,
-  // so pass nullptr as stream to memory::Copy().
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
-                 src_place, src_ptr, size, nullptr);
+    //  1. vector -> npu pinned tensor
+    Tensor npu_pinned_tensor(dst->type());
+    platform::NPUPinnedPlace npu_pinned_place;
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data<T>(dst->dims(), npu_pinned_place);
+    memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
+
+    //  2. async copy npu pinned tensor -> npu tensor
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+                 npu_pinned_place, npu_pinned_ptr, size,
+                 reinterpret_cast<const platform::NPUDeviceContext&>(ctx)
+                     .stream());
+
+    //  3. record event
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation* allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(
+        allocation,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
   }
 #endif
 }
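For context, a hedged usage sketch of the overload just changed (assumptions: a Paddle build with PADDLE_WITH_ASCEND_CL, an NPU device present, and header paths as of this era of the codebase; the helper name CopyVectorToNpu is illustrative). The caller's vector is staged through pinned memory and copied on dev_ctx->stream(), and control returns before the device copy completes.

// Hedged usage sketch, not from the commit.
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/npu_info.h"  // assumed home of GetCurrentNPUDeviceId

void CopyVectorToNpu() {
  auto device_id = paddle::platform::GetCurrentNPUDeviceId();
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  auto* dev_ctx = static_cast<paddle::platform::NPUDeviceContext*>(
      pool.Get(paddle::platform::NPUPlace(device_id)));

  std::vector<float> src = {1.0f, 2.0f, 3.0f};
  paddle::framework::Tensor dst;
  // Queues vector -> pinned -> NPU copies on dev_ctx->stream(); the pinned
  // staging tensor stays alive until the recorded event is reached.
  paddle::framework::TensorFromVector<float>(src, *dev_ctx, &dst);
}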
@@ -206,8 +232,31 @@ inline void TensorFromVector(const std::vector<bool>& src,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
-                 src_place, src_ptr, size, nullptr);
+    //  1. vector -> npu pinned tensor
+    platform::NPUPinnedPlace npu_pinned_place;
+    Tensor npu_pinned_tensor;
+    npu_pinned_tensor.Resize(dst->dims());
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data(npu_pinned_place, dst->type());
+    memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
+
+    //  2. async copy npu pinned tensor -> npu tensor
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+                 npu_pinned_place, npu_pinned_ptr, size,
+                 reinterpret_cast<const platform::NPUDeviceContext&>(ctx)
+                     .stream());
+
+    //  3. record event
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation* allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(
+        allocation,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
   }
 #endif
   delete[] array;
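A side note on why this bool overload stages through a heap array at all (the trailing delete[] array; above): std::vector<bool> is bit-packed and exposes no contiguous bool* to copy from, so the source vector is expanded into a plain array before any memory::Copy. A standard-C++ illustration, not Paddle code:

#include <vector>

int main() {
  std::vector<bool> src = {true, false, true};
  // vector<bool> has no usable .data(); expand into contiguous storage first.
  bool* array = new bool[src.size()];
  for (size_t i = 0; i < src.size(); ++i) array[i] = src[i];
  // ... hand `array` to the copy routine, then release it, matching the
  // overload's trailing `delete[] array;`.
  delete[] array;
  return 0;
}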
paddle/fluid/operators/npu_op_runner.h

@@ -21,6 +21,7 @@ limitations under the License. */
 #include <vector>

 #include "acl/acl.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/npu_op_runner.h"

 namespace paddle {
@@ -30,6 +31,7 @@ using Tensor = framework::Tensor;
 using DataLayout = framework::DataLayout;
 using NPUAttribute = framework::NPUAttribute;
 using NPUAttributeMap = framework::NPUAttributeMap;
+using DeviceContextPool = platform::DeviceContextPool;

 class NpuOpRunner {
  public:
@@ -90,41 +92,42 @@ aclrtStream GetCurrentNPUStream(int device_id = -1);
 template <typename T>
 void FillNpuTensorWithConstant(Tensor *tensor, T val) {
-  // NOTE(zhiqiu): we found that power sometimes returns 0 when val is small
-  // like 1e-8.
-  constexpr float MIN_PRECISION_FOR_POWER = 1e-3;
   PADDLE_ENFORCE_EQ(
       tensor->IsInitialized(), true,
       platform::errors::InvalidArgument("The tensor should be initialized."));
   PADDLE_ENFORCE_EQ(
       platform::is_npu_place(tensor->place()), true,
       platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
-  // do async for better performance
-  if ((typeid(float) == typeid(T) ||
-       typeid(platform::float16) == typeid(T)) &&
-      static_cast<float>(val) > MIN_PRECISION_FOR_POWER) {
-    Tensor tmp(tensor->type());
-    tmp.Resize(tensor->dims());
-    tmp.mutable_data<T>(tensor->place());
-    auto stream = GetCurrentNPUStream(
-        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
-    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
-                             stream);
-    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
-                              {{"power", static_cast<float>(1)},
-                               {"scale", static_cast<float>(0)},
-                               {"shift", static_cast<float>(val)}});
-    runner.Run(stream);
-  } else {
-    T *array = new T[tensor->numel()];
-    for (unsigned int i = 0; i < tensor->numel(); ++i) {
-      array[i] = static_cast<T>(val);
-    }
-    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
-    // do sync copy
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
-                 tensor->data<void>(), platform::CPUPlace(), array,
-                 tensor->numel() * sizeof(T), nullptr);
-    delete[] array;
+  int numel = tensor->numel();
+  if (numel == 1) {
+    Tensor npu_pinned_tensor(tensor->type());
+    platform::NPUPinnedPlace npu_pinned_place;
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data<T>({1}, npu_pinned_place);
+    *npu_pinned_ptr = val;
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
+                 tensor->data<void>(), npu_pinned_place, npu_pinned_ptr,
+                 sizeof(T), GetCurrentNPUStream());
+
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator *>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation *allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream());
+  } else {
+    std::vector<T> vec(numel, static_cast<T>(val));
+    auto device_id = platform::GetCurrentNPUDeviceId();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
+        pool.Get(platform::NPUPlace(device_id)));
+
+    paddle::framework::TensorFromVector<T>(vec, *dev_ctx, tensor);
   }
 }
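The rewrite drops the "Power"-op fast path entirely: the removed NOTE records that Power could return 0 for small values like 1e-8, which forced the MIN_PRECISION_FOR_POWER guard. Instead, a scalar fill goes through pinned memory plus a recorded event, and larger fills delegate to the new async TensorFromVector above. A hedged call-site sketch (assumptions: Paddle NPU build, device 0; the helper name FillExample is illustrative):

// Hedged usage sketch, not from the commit.
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/npu_op_runner.h"
#include "paddle/fluid/platform/device_context.h"

void FillExample() {
  // The enforce checks require an initialized tensor already on NPUPlace.
  paddle::framework::Tensor t;
  t.Resize({1});
  t.mutable_data<float>(paddle::platform::NPUPlace(0));
  // numel == 1, so this takes the pinned-memory + RecordEvent path.
  paddle::operators::FillNpuTensorWithConstant<float>(&t, 3.14f);
}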