PaddlePaddle/Paddle, commit 6b3bb796 (unverified)

Authored May 12, 2021 by liym27; committed by GitHub on May 12, 2021.
[NPU] Support npu pinned allocator and manage Tensor on NPUPinnedPlace (#32840)
Parent: 890f626b

Showing 19 changed files with 528 additions and 5 deletions (+528, -5):
- paddle/fluid/framework/dlpack_tensor.cc (+5, -0)
- paddle/fluid/framework/tensor_util.cc (+17, -0)
- paddle/fluid/imperative/gradient_accumulator.cc (+6, -0)
- paddle/fluid/memory/allocation/CMakeLists.txt (+7, -1)
- paddle/fluid/memory/allocation/allocator_facade.cc (+15, -0)
- paddle/fluid/memory/allocation/allocator_facade.h (+7, -0)
- paddle/fluid/memory/allocation/naive_best_fit_allocator.cc (+68, -0)
- paddle/fluid/memory/allocation/npu_pinned_allocator.cc (+82, -0)
- paddle/fluid/memory/allocation/npu_pinned_allocator.h (+51, -0)
- paddle/fluid/memory/detail/system_allocator.cc (+54, -0)
- paddle/fluid/memory/detail/system_allocator.h (+10, -0)
- paddle/fluid/memory/memcpy.cc (+81, -1)
- paddle/fluid/operators/math/math_function.cc (+8, -0)
- paddle/fluid/platform/cpu_info.cc (+17, -0)
- paddle/fluid/platform/cpu_info.h (+9, -0)
- paddle/fluid/platform/device_context.cc (+26, -0)
- paddle/fluid/platform/device_context.h (+21, -0)
- paddle/fluid/platform/place.cc (+5, -0)
- paddle/fluid/platform/place.h (+39, -3)
paddle/fluid/framework/dlpack_tensor.cc

```cpp
@@ -87,6 +87,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
        platform::errors::Unimplemented("platform::NPUPlace is not supported"));
  }

  inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const {
    PADDLE_THROW(platform::errors::Unimplemented(
        "platform::NPUPinnedPlace is not supported"));
  }

  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    ::DLContext ctx;
```
paddle/fluid/framework/tensor_util.cc

```cpp
@@ -503,6 +503,11 @@ class AnyVisitor : public boost::static_visitor<bool> {
    // return GetResultHelper(out, npu);
  }

  bool GetResult(const framework::Tensor& out,
                 const platform::NPUPinnedPlace& cpu) const {
    return *out.data<bool>();
  }

  bool GetResult(const framework::Tensor& out,
                 const platform::CPUPlace& cpu) const {
    return *out.data<bool>();

@@ -731,6 +736,18 @@ struct BothFalseVisitor : public boost::static_visitor<> {
      out_ptr[i] = lhs && rhs;
    }
  }

  void VisitorImpl(
      const platform::NPUPinnedPlace& cpu /* equals to cpu*/) const {
    int num = in_.numel();
    const bool* in_ptr = in_.data<bool>();
    bool* out_ptr = out_->data<bool>();
    for (int i = 0; i < num; ++i) {
      bool lhs = !in_ptr[i];
      bool rhs = !out_ptr[i];
      out_ptr[i] = lhs && rhs;
    }
  }
};

void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
```
paddle/fluid/imperative/gradient_accumulator.cc

```cpp
@@ -132,6 +132,12 @@ class TensorAddFunctor : public boost::static_visitor<> {
  }
#endif

  void operator()(const platform::NPUPinnedPlace& place) {
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Gradient accumulation on place (%s) "
        "is not supported in imperative mode",
        place));
  }

  // there is NO blas in CUDAPinnedPlace
  void operator()(const platform::CUDAPinnedPlace& place) {
    PADDLE_THROW(platform::errors::PermissionDenied(
```
paddle/fluid/memory/allocation/CMakeLists.txt

```cmake
@@ -29,6 +29,7 @@ endif()
if (WITH_ASCEND_CL)
    cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
    cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info)
endif()

cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

@@ -73,10 +74,15 @@ endif()
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator)

if (WITH_ASCEND_CL)
    list(APPEND AllocatorFacadeDeps npu_pinned_allocator)
endif()

cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
if (WITH_TESTING)
```
paddle/fluid/memory/allocation/allocator_facade.cc

```cpp
@@ -20,6 +20,9 @@
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"

@@ -72,6 +75,7 @@ class AllocatorFacadePrivate {
      for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
        InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
      }
      InitNaiveBestFitNPUPinnedAllocator();
#endif
      break;
    }

@@ -195,6 +199,12 @@ class AllocatorFacadePrivate {
  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitNaiveBestFitNPUPinnedAllocator() {
    allocators_[platform::NPUPinnedPlace()] =
        std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
  }
#endif

  class ZeroSizeAllocator : public Allocator {

@@ -294,6 +304,11 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
      ->Release(place);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place) {
  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
```
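For orientation, here is a hedged sketch (not part of the commit) of how a caller might reach the new pinned allocator through the facade. It assumes an Ascend build, the facade's usual `Instance()` singleton accessor, and the base `Allocator::Allocate()` entry point, none of which appear in this diff:

```cpp
// Illustrative sketch only: fetching and using the NPU pinned allocator
// via the facade. Instance() and Allocate() are assumed from the
// surrounding Paddle codebase, not shown in this commit.
using paddle::memory::allocation::AllocatorFacade;

auto& allocator = AllocatorFacade::Instance().GetAllocator(
    paddle::platform::NPUPinnedPlace());
auto allocation = allocator->Allocate(4096);  // 4 KB of pinned host memory
void* data = allocation->ptr();               // host-visible pointer
```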
paddle/fluid/memory/allocation/allocator_facade.h

```cpp
@@ -15,11 +15,17 @@
#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {
#ifdef PADDLE_WITH_ASCEND_CL
using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator;
#endif

// Allocator Facade is the interface exposed to other modules.
// All the configuration or dirty code under development should

@@ -46,6 +52,7 @@ class AllocatorFacade {
  // Release unused memory pool.
  uint64_t Release(const platform::Place& place);

  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);

  // TODO(yy): Allocate a Copy-On-Write allocation?

 private:
```
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc

```cpp
@@ -287,6 +287,21 @@ class NPUBuddyAllocatorList {
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
  return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}

BuddyAllocator *GetNPUPinnedBuddyAllocator() {
  static std::once_flag init_flag;
  static BuddyAllocator *ba = nullptr;

  std::call_once(init_flag, []() {
    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                new detail::NPUPinnedAllocator),
                            platform::NPUPinnedMinChunkSize(),
                            platform::NPUPinnedMaxChunkSize());
  });

  return ba;
}
#endif

template <>

@@ -351,6 +366,59 @@ uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
#endif
}

template <>
size_t Used<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
  return GetNPUPinnedBuddyAllocator()->Used();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

template <>
void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
                                      size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
  auto *buddy_allocator = GetNPUPinnedBuddyAllocator();
  void *ptr = buddy_allocator->Alloc(size);

  if (ptr == nullptr) {
    LOG(WARNING) << "aclrtMallocHost Cannot allocate " << size
                 << " bytes in NPUPinnedPlace";
  }
  if (FLAGS_init_allocated_mem) {
    memset(ptr, 0xEF, size);
  }
  return ptr;
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
                                    void *p, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
  GetNPUPinnedBuddyAllocator()->Free(p);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

template <>
uint64_t Release<platform::NPUPinnedPlace>(
    const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
  return GetNPUPinnedBuddyAllocator()->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

// For CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUBuddyAllocatorList {
```
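Taken together, the four specializations give NPUPinnedPlace the same legacy entry points the other places already have, backed by a single process-wide buddy allocator. A minimal sketch of how they fit together (illustrative only; assumes an Ascend build and a caller inside the same namespace as the specializations):

```cpp
// Illustrative use of the specializations added above.
platform::NPUPinnedPlace place;
void *p = Alloc<platform::NPUPinnedPlace>(place, 1 << 20);  // 1 MB chunk
size_t in_use = Used<platform::NPUPinnedPlace>(place);      // bytes in use
Free<platform::NPUPinnedPlace>(place, p, 1 << 20);
Release<platform::NPUPinnedPlace>(place);  // return idle chunks to the pool
```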
paddle/fluid/memory/allocation/npu_pinned_allocator.cc (new file, mode 100644)

```cpp
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

void NPUPinnedAllocator::ProcessEventsAndFree() {
  for (auto it = npu_events_.begin(); it != npu_events_.end();) {
    aclrtEvent event = it->second;
    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
    PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));

    if (status == ACL_EVENT_STATUS_COMPLETE) {
      Allocation *allocation = it->first;
      void *ptr = allocation->ptr();
      free(ptr);
      npu_events_.erase(it++);
      delete allocation;
      PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
    } else {
      ++it;
    }
  }
}

Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
  ProcessEventsAndFree();
  void *ptr;
  int error = posix_memalign(&ptr, kAlignment, size);
  PADDLE_ENFORCE_EQ(
      error, 0,
      platform::errors::ResourceExhausted(
          "Fail to alloc memory of %ld size, error code is %d.", size, error));
  return new Allocation(ptr, size, platform::NPUPinnedPlace());
}

void NPUPinnedAllocator::FreeImpl(Allocation *allocation) {
  void *ptr = allocation->ptr();
  auto iter = npu_events_.find(allocation);
  aclrtEvent event = iter->second;
  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));

  if (status == ACL_EVENT_STATUS_COMPLETE) {
    free(ptr);
    npu_events_.erase(allocation);
    delete allocation;
    PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
  }
  return;
}

uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
  return static_cast<uint64_t>(0);
}

void NPUPinnedAllocator::RecordEvent(Allocation *allocation,
                                     aclrtStream stream) {
  aclrtEvent event = nullptr;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event));
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream));
  npu_events_.insert({allocation, event});
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
#endif
```
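The point of this allocator is to defer host-side frees until any in-flight NPU work on a pinned buffer has finished: `RecordEvent` marks a buffer as in flight, `FreeImpl` only releases it once its event reports completion, and `AllocateImpl` lazily sweeps completed events. A hedged usage sketch follows; the variable names are illustrative, not from the commit:

```cpp
// Illustrative sketch of the intended event lifecycle. Assumes an
// NPUPinnedAllocator `allocator`, a live pinned Allocation* `alloc`,
// and an aclrtStream `stream` on which an async copy touching `alloc`
// was just enqueued.
allocator.RecordEvent(alloc, stream);  // buffer is now "in flight"

// Later: every buffer whose event has completed is freed and its event
// destroyed; pending ones are left alone. AllocateImpl invokes this
// sweep automatically before each new allocation.
allocator.ProcessEventsAndFree();
```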
paddle/fluid/memory/allocation/npu_pinned_allocator.h (new file, mode 100644)

```cpp
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_ASCEND_CL
#include <mutex>  // NOLINT
#include <string>
#include <unordered_map>

#include "acl/acl.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

class NPUPinnedAllocator : public Allocator {
 public:
  bool IsAllocThreadSafe() const override { return true; }
  void ProcessEventsAndFree();
  void RecordEvent(Allocation *allocation, aclrtStream stream);
  constexpr static size_t kAlignment = 4096UL;

 protected:
  Allocation *AllocateImpl(size_t size) override;
  void FreeImpl(Allocation *allocation) override;
  uint64_t ReleaseImpl(const platform::Place &place) override;

 private:
  std::unordered_map<Allocation *, aclrtEvent> npu_events_;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
#endif
```
paddle/fluid/memory/detail/system_allocator.cc

```cpp
@@ -310,6 +310,60 @@ void NPUAllocator::Free(void* p, size_t size, size_t index) {
}

bool NPUAllocator::UseGpu() const { return true; }

void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
  if (size <= 0) return nullptr;

  size_t usable =
      paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;

  if (size > usable) {
    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
                 << " MB pinned memory."
                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
    return nullptr;
  }

  void* p;
  // PINNED memory is visible to all NPU contexts.
  auto result = aclrtMallocHost(&p, size);

  if (result == ACL_ERROR_NONE) {
    *index = 1;  // PINNED memory
    npu_pinnd_alloc_size_ += size;
    return p;
  } else {
    LOG(WARNING) << "aclrtMallocHost failed.";
    return nullptr;
  }

  return nullptr;
}

void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
  aclError err;
  PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument(
                                  "The index should be 1, but got %d", index));

  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated npu pinned memory (%d)",
                        size, npu_pinnd_alloc_size_));
  npu_pinnd_alloc_size_ -= size;
  err = aclrtFreeHost(p);
  if (err != ACL_ERROR_NONE) {
    PADDLE_ENFORCE_EQ(
        err, 0,
        platform::errors::Fatal(
            "aclrtFreeHost failed in NPUPinnedAllocator, error code is %d",
            err));
  }
}

bool NPUPinnedAllocator::UseGpu() const { return false; }
#endif

}  // namespace detail
```
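For context, a small sketch (not from the commit) of the contract the buddy allocator relies on: `Alloc` reports index 1 for pinned memory, and `Free` expects the same index and an in-budget size back:

```cpp
// Illustrative sketch of the detail::NPUPinnedAllocator contract,
// assuming an Ascend build.
paddle::memory::detail::NPUPinnedAllocator sys_alloc;
size_t index = 0;
void* p = sys_alloc.Alloc(&index, 1 << 20);  // aclrtMallocHost; sets index=1
if (p != nullptr) {
  // ... use the pinned host buffer ...
  sys_alloc.Free(p, 1 << 20, index);  // aclrtFreeHost; index must be 1
}
```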
paddle/fluid/memory/detail/system_allocator.h

```cpp
@@ -80,6 +80,16 @@ class NPUAllocator : public SystemAllocator {
  size_t npu_alloc_size_ = 0;
  int npu_id_;
};

class NPUPinnedAllocator : public SystemAllocator {
 public:
  virtual void* Alloc(size_t* index, size_t size);
  virtual void Free(void* p, size_t size, size_t index);
  virtual bool UseGpu() const;

 private:
  size_t npu_pinnd_alloc_size_ = 0;
};
#endif

}  // namespace detail
```
paddle/fluid/memory/memcpy.cc

```cpp
@@ -245,7 +245,7 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();

    // renamed in this commit from "GpuMemcpySync:NPU->CPU"
    platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU");
    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
  }
}

@@ -294,6 +294,86 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
  }
}

template <>
void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
    platform::CPUPlace dst_place, void* dst,
    platform::NPUPinnedPlace src_place, const void* src, size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}

template <>
void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
    platform::NPUPinnedPlace dst_place, void* dst,
    platform::CPUPlace src_place, const void* src, size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}

template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
    platform::NPUPinnedPlace dst_place, void* dst,
    platform::NPUPinnedPlace src_place, const void* src, size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}

template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
    platform::NPUPinnedPlace dst_place, void* dst,
    platform::NPUPlace src_place, const void* src, size_t num,
    aclrtStream stream) {
  if (UNLIKELY(num == 0)) return;

  platform::SetNPUDeviceId(src_place.device);

  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place << " by thream(" << stream << ")";

  if (stream) {
    platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned");
    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
  } else {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();

    platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned");
    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
  }
}

template <>
void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
    platform::NPUPlace dst_place, void* dst,
    platform::NPUPinnedPlace src_place, const void* src, size_t num,
    aclrtStream stream) {
  if (UNLIKELY(num == 0)) return;

  platform::SetNPUDeviceId(dst_place.device);

  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place << " by thream(" << stream << ")";

  if (stream) {
    platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU");
    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
  } else {
    // On NPU, async operation after sync operation is ok, while sync operation
    // after async is not ok, since the async operation may not done.
    // So, its needed to do wait before sync operation.
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();

    platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU");
    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
  }
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
```
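A hedged sketch of how these new Copy specializations are selected; the buffer variables are placeholders, and template-argument deduction through the generic `memory::Copy` declaration in memcpy.h is assumed:

```cpp
// Illustrative only. `dev_dst` is a device buffer on NPU 0, `pinned_src`
// comes from the NPU pinned allocator, `host_dst` is pageable host memory.
void* dev_dst = nullptr;          // placeholder
void* host_dst = nullptr;         // placeholder
const void* pinned_src = nullptr; // placeholder
aclrtStream stream = nullptr;     // null selects the synchronous path

paddle::platform::NPUPlace npu(0);
paddle::platform::NPUPinnedPlace pinned;

// Pinned host -> device: async on `stream` when non-null, otherwise a
// synchronous copy preceded by a Wait() on the device context.
paddle::memory::Copy(npu, dev_dst, pinned, pinned_src, /*num=*/1024, stream);

// Pinned host -> plain host: a straight std::memcpy.
paddle::memory::Copy(paddle::platform::CPUPlace(), host_dst, pinned,
                     pinned_src, /*num=*/1024);
```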
paddle/fluid/operators/math/math_function.cc

```cpp
@@ -158,6 +158,14 @@ void set_constant_with_place<platform::NPUPlace>(
  PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}

template <>
void set_constant_with_place<platform::NPUPinnedPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
    float value) {
  PADDLE_THROW(
      platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}

template <>
void set_constant_with_place<platform::CPUPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
```
paddle/fluid/platform/cpu_info.cc

```cpp
@@ -104,6 +104,23 @@ size_t CUDAPinnedMaxChunkSize() {
  return CUDAPinnedMaxAllocSize() / 256;
}

size_t NPUPinnedMaxAllocSize() {
  // For distributed systems, it requires configuring and limiting
  // the fraction of memory to use.
  return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
}

size_t NPUPinnedMinChunkSize() {
  // Allow to allocate the minimum chunk size is 64 KB.
  return 1 << 16;
}

size_t NPUPinnedMaxChunkSize() {
  // Allow to allocate the maximum chunk size is roughly 1/256 of NPU_PINNED
  // memory.
  return NPUPinnedMaxAllocSize() / 256;
}

#ifdef PADDLE_WITH_XBYAK
static Xbyak::util::Cpu cpu;
bool MayIUse(const cpu_isa_t cpu_isa) {
```
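For a concrete sense of scale: on a host with 64 GB of physical memory and the reused FLAGS_fraction_of_cuda_pinned_memory_to_use flag at, say, 0.5, NPUPinnedMaxAllocSize() comes to 32 GB, NPUPinnedMinChunkSize() stays at 64 KB (1 << 16), and NPUPinnedMaxChunkSize() is roughly 128 MB (32 GB / 256).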
paddle/fluid/platform/cpu_info.h

```cpp
@@ -73,6 +73,15 @@ size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();

//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();

//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();

//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();

typedef enum {
  isa_any,
  sse42,
```
paddle/fluid/platform/device_context.cc

```cpp
@@ -153,6 +153,16 @@ DeviceContextPool::DeviceContextPool(
      PADDLE_THROW(
          platform::errors::Unimplemented("NPUPlace is not supported. Please "
                                          "re-compile with WITH_ASCEND_CL option."));
#endif
    } else if (platform::is_npu_pinned_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
      EmplaceDeviceContext<NPUPinnedDeviceContext, NPUPinnedPlace>(
          &device_contexts_, p);
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPinnedPlace is not supported. Please re-compile with "
          "WITH_ASCEND_CL "
          "option."));
#endif
    }
  }

@@ -264,6 +274,22 @@ aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
Place NPUDeviceContext::GetPlace() const { return place_; }

aclrtContext NPUDeviceContext::context() const { return context_; }

NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
    : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

Place NPUPinnedDeviceContext::GetPlace() const { return place_; }

#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
```
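Once registered in the pool, the pinned context is looked up by place like any other context; a hedged sketch (the cast mirrors how the pool is used elsewhere in this diff):

```cpp
// Illustrative sketch: fetching the pinned device context from the pool
// (Ascend build assumed).
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* ctx = static_cast<paddle::platform::NPUPinnedDeviceContext*>(
    pool.Get(paddle::platform::NPUPinnedPlace()));
Eigen::DefaultDevice* dev = ctx->eigen_device();  // plain host Eigen device
```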
paddle/fluid/platform/device_context.h

```cpp
@@ -233,6 +233,27 @@ template <>
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
  using TYPE = NPUDeviceContext;
};

// Currently, NPUPinnedDeviceContext is only used to data copying.
class NPUPinnedDeviceContext : public DeviceContext {
 public:
  NPUPinnedDeviceContext();
  explicit NPUPinnedDeviceContext(NPUPinnedPlace place);

  Place GetPlace() const override;

  Eigen::DefaultDevice* eigen_device() const;

 private:
  NPUPinnedPlace place_;
  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};

template <>
struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
  using TYPE = NPUPinnedDeviceContext;
};
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
```
paddle/fluid/platform/place.cc

```cpp
@@ -34,6 +34,7 @@ class PlacePrinter : public boost::static_visitor<> {
  }
  void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
  void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
  void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
  void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }

 private:

@@ -62,6 +63,10 @@ bool is_cuda_pinned_place(const Place &p) {
  return boost::apply_visitor(IsCUDAPinnedPlace(), p);
}

bool is_npu_pinned_place(const Place &p) {
  return boost::apply_visitor(IsNPUPinnedPlace(), p);
}

bool places_are_same_class(const Place &p1, const Place &p2) {
  return p1.which() == p2.which();
}
```
paddle/fluid/platform/place.h

```cpp
@@ -85,10 +85,19 @@ struct NPUPlace {
  int device;
};

struct NPUPinnedPlace {
  NPUPinnedPlace() {}

  inline bool operator==(const NPUPinnedPlace &) const { return true; }
  inline bool operator!=(const NPUPinnedPlace &) const { return false; }
  inline bool operator<(const NPUPinnedPlace &) const { return false; }
};

struct IsCUDAPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return true; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};

@@ -97,6 +106,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return true; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};

@@ -105,6 +115,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};

@@ -113,6 +124,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return true; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};

@@ -121,15 +133,25 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return true; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};

struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return true; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};

// NPUPinnedPlace is appended to the Place variant and to PlaceBase.
class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
                                    CUDAPinnedPlace, NPUPinnedPlace> {
 private:
  using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
                                   CUDAPinnedPlace, NPUPinnedPlace>;

 public:
  Place() = default;

@@ -139,6 +161,8 @@ class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
  Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {}  // NOLINT
  Place(const CUDAPinnedPlace &cuda_pinned_place)  // NOLINT
      : PlaceBase(cuda_pinned_place) {}
  Place(const NPUPinnedPlace &npu_pinned_place)  // NOLINT
      : PlaceBase(npu_pinned_place) {}

  bool operator<(const Place &place) const {
    return PlaceBase::operator<(static_cast<const PlaceBase &>(place));

@@ -155,6 +179,7 @@ bool is_xpu_place(const Place &);
bool is_npu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool is_npu_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
bool is_same_place(const Place &, const Place &);

@@ -190,6 +215,17 @@ struct PlaceVisitorWrapper
#endif
  }

  typename Visitor::result_type operator()(
      const NPUPinnedPlace &npu_pinned) const {
#ifdef PADDLE_WITH_ASCEND_CL
    return visitor_(npu_pinned);
#else
    PADDLE_THROW(platform::errors::Unavailable(
        "Paddle is not compiled with NPU. Cannot visit npu_pinned"));
    return typename Visitor::result_type();
#endif
  }

  typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    return visitor_(cuda);
```
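With NPUPinnedPlace wired into the variant, the generic place helpers treat it like any other place. A minimal sketch (assumes the usual stream operator for Place that dispatches to PlacePrinter):

```cpp
#include <iostream>
#include "paddle/fluid/platform/place.h"

int main() {
  // Sketch only: NPUPinnedPlace as a first-class member of Place.
  paddle::platform::Place place = paddle::platform::NPUPinnedPlace();
  if (paddle::platform::is_npu_pinned_place(place)) {
    std::cout << place << std::endl;  // prints "NPUPinnedPlace"
  }
  return 0;
}
```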