BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 6b3bb796 (unverified), authored May 12, 2021 by liym27; committed by GitHub on May 12, 2021.
[NPU] Support npu pinned allocator and manage Tensor on NPUPinnedPlace (#32840)
Parent: 890f626b
Showing 19 changed files with 528 additions and 5 deletions (+528 -5):
paddle/fluid/framework/dlpack_tensor.cc (+5 -0)
paddle/fluid/framework/tensor_util.cc (+17 -0)
paddle/fluid/imperative/gradient_accumulator.cc (+6 -0)
paddle/fluid/memory/allocation/CMakeLists.txt (+7 -1)
paddle/fluid/memory/allocation/allocator_facade.cc (+15 -0)
paddle/fluid/memory/allocation/allocator_facade.h (+7 -0)
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc (+68 -0)
paddle/fluid/memory/allocation/npu_pinned_allocator.cc (+82 -0)
paddle/fluid/memory/allocation/npu_pinned_allocator.h (+51 -0)
paddle/fluid/memory/detail/system_allocator.cc (+54 -0)
paddle/fluid/memory/detail/system_allocator.h (+10 -0)
paddle/fluid/memory/memcpy.cc (+81 -1)
paddle/fluid/operators/math/math_function.cc (+8 -0)
paddle/fluid/platform/cpu_info.cc (+17 -0)
paddle/fluid/platform/cpu_info.h (+9 -0)
paddle/fluid/platform/device_context.cc (+26 -0)
paddle/fluid/platform/device_context.h (+21 -0)
paddle/fluid/platform/place.cc (+5 -0)
paddle/fluid/platform/place.h (+39 -3)
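Taken together, the commit introduces a new NPUPinnedPlace backed by page-locked host memory for Ascend NPUs, wires it into the allocator stack (a system allocator, a buddy allocator, and the allocator facade), adds memory::Copy specializations between NPUPinnedPlace and CPUPlace/NPUPlace, and threads the new place through device contexts and place visitors. A rough sketch of what this enables follows; the call site and shapes are hypothetical, not part of this diff:

```cpp
// Hypothetical caller of the APIs added in this commit: stage data in an
// NPU-pinned host Tensor, then copy it to the device. The helper, shapes,
// and stream handling are assumptions for illustration.
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"

void StageAndCopy(const paddle::platform::NPUPlace &npu_place,
                  aclrtStream stream) {
  paddle::framework::Tensor pinned, on_npu;
  pinned.Resize({1024});
  // Host memory served by the new NPU pinned allocator.
  float *src = pinned.mutable_data<float>(paddle::platform::NPUPinnedPlace());
  float *dst = on_npu.mutable_data<float>(pinned.dims(), npu_place);
  // Resolves to the new Copy<NPUPlace, NPUPinnedPlace> specialization:
  // asynchronous when a stream is given, synchronous otherwise.
  paddle::memory::Copy(npu_place, dst, paddle::platform::NPUPinnedPlace(), src,
                       pinned.numel() * sizeof(float), stream);
}
```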
paddle/fluid/framework/dlpack_tensor.cc

@@ -87,6 +87,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
```cpp
        platform::errors::Unimplemented("platform::NPUPlace is not supported"));
  }

  inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const {
    PADDLE_THROW(platform::errors::Unimplemented(
        "platform::NPUPinnedPlace is not supported"));
  }

  inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    ::DLContext ctx;
```
paddle/fluid/framework/tensor_util.cc

@@ -503,6 +503,11 @@ class AnyVisitor : public boost::static_visitor<bool> {
```cpp
    // return GetResultHelper(out, npu);
  }

  bool GetResult(const framework::Tensor& out,
                 const platform::NPUPinnedPlace& cpu) const {
    return *out.data<bool>();
  }

  bool GetResult(const framework::Tensor& out,
                 const platform::CPUPlace& cpu) const {
    return *out.data<bool>();
```

@@ -731,6 +736,18 @@ struct BothFalseVisitor : public boost::static_visitor<> {
```cpp
      out_ptr[i] = lhs && rhs;
    }
  }

  void VisitorImpl(const platform::NPUPinnedPlace& cpu /* equals to cpu */) const {
    int num = in_.numel();
    const bool* in_ptr = in_.data<bool>();
    bool* out_ptr = out_->data<bool>();
    for (int i = 0; i < num; ++i) {
      bool lhs = !in_ptr[i];
      bool rhs = !out_ptr[i];
      out_ptr[i] = lhs && rhs;
    }
  }
};

void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
```
paddle/fluid/imperative/gradient_accumulator.cc

@@ -132,6 +132,12 @@ class TensorAddFunctor : public boost::static_visitor<> {
```cpp
  }
#endif

  void operator()(const platform::NPUPinnedPlace& place) {
    PADDLE_THROW(platform::errors::PermissionDenied(
        "Gradient accumulation on place (%s) "
        "is not supported in imperative mode",
        place));
  }

  // there is NO blas in CUDAPinnedPlace
  void operator()(const platform::CUDAPinnedPlace& place) {
    PADDLE_THROW(platform::errors::PermissionDenied(
```
paddle/fluid/memory/allocation/CMakeLists.txt

@@ -29,6 +29,7 @@ endif()
```cmake
if (WITH_ASCEND_CL)
    cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info)
    cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info)
endif()

cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
```

@@ -73,10 +74,15 @@ endif()
```cmake
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator)

if (WITH_ASCEND_CL)
    list(APPEND AllocatorFacadeDeps npu_pinned_allocator)
endif()

cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
if (WITH_TESTING)
```
paddle/fluid/memory/allocation/allocator_facade.cc

@@ -20,6 +20,9 @@
```cpp
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
```

@@ -72,6 +75,7 @@ class AllocatorFacadePrivate {
```cpp
        for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id));
        }
        InitNaiveBestFitNPUPinnedAllocator();
#endif
        break;
      }
```

@@ -195,6 +199,12 @@ class AllocatorFacadePrivate {
```cpp
  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }

  void InitNaiveBestFitNPUPinnedAllocator() {
    allocators_[platform::NPUPinnedPlace()] =
        std::make_shared<paddle::memory::allocation::NPUPinnedAllocator>();
  }
#endif

  class ZeroSizeAllocator : public Allocator {
```

@@ -294,6 +304,11 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
```cpp
      ->Release(place);
}

const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
    const platform::Place& place) {
  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
```
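The new GetAllocator accessor is what lets callers reach the concrete pinned allocator, for example to record a completion event on an allocation that an in-flight async copy still reads. A minimal sketch, assuming the usual AllocatorFacade::Instance() singleton; the helper and the downcast are illustrative, not from this diff:

```cpp
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"

namespace alloc = paddle::memory::allocation;

// Hypothetical helper: fetch the NPU pinned allocator through the facade and
// defer the buffer's release until `stream` has consumed it.
void DeferUntilStreamDone(alloc::Allocation *allocation, aclrtStream stream) {
  const auto &allocator = alloc::AllocatorFacade::Instance().GetAllocator(
      paddle::platform::NPUPinnedPlace());
  // GetAllocator returns the Allocator base class; RecordEvent lives on the
  // concrete NPUPinnedAllocator, hence the cast.
  static_cast<alloc::NPUPinnedAllocator *>(allocator.get())
      ->RecordEvent(allocation, stream);
}
```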
paddle/fluid/memory/allocation/allocator_facade.h

@@ -15,11 +15,17 @@
```cpp
#pragma once
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {
#ifdef PADDLE_WITH_ASCEND_CL
using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator;
#endif

// Allocator Facade is the interface exposed to other modules.
// All the configuration or dirty code under development should
```

@@ -46,6 +52,7 @@ class AllocatorFacade {
```cpp
  // Release unused memory pool.
  uint64_t Release(const platform::Place& place);

  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);

  // TODO(yy): Allocate a Copy-On-Write allocation?
 private:
```
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc

@@ -287,6 +287,21 @@ class NPUBuddyAllocatorList {
```cpp
BuddyAllocator *GetNPUBuddyAllocator(int npu_id) {
  return NPUBuddyAllocatorList::Instance()->Get(npu_id);
}

BuddyAllocator *GetNPUPinnedBuddyAllocator() {
  static std::once_flag init_flag;
  static BuddyAllocator *ba = nullptr;

  std::call_once(init_flag, []() {
    ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                new detail::NPUPinnedAllocator),
                            platform::NPUPinnedMinChunkSize(),
                            platform::NPUPinnedMaxChunkSize());
  });

  return ba;
}
#endif
```

@@ -351,6 +366,59 @@ uint64_t Release<platform::NPUPlace>(const platform::NPUPlace &place) {
```cpp
#endif
}

template <>
size_t Used<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
  return GetNPUPinnedBuddyAllocator()->Used();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

template <>
void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
                                      size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
  auto *buddy_allocator = GetNPUPinnedBuddyAllocator();
  void *ptr = buddy_allocator->Alloc(size);

  if (ptr == nullptr) {
    LOG(WARNING) << "aclrtMallocHost Cannot allocate " << size
                 << " bytes in NPUPinnedPlace";
  }
  if (FLAGS_init_allocated_mem) {
    memset(ptr, 0xEF, size);
  }
  return ptr;
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
                                    void *p, size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
  GetNPUPinnedBuddyAllocator()->Free(p);
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

template <>
uint64_t Release<platform::NPUPinnedPlace>(
    const platform::NPUPinnedPlace &place) {
#ifdef PADDLE_WITH_ASCEND_CL
  return GetNPUPinnedBuddyAllocator()->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'NPUPinnedPlace' is not supported in CPU only device."));
#endif
}

// For CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUBuddyAllocatorList {
```
paddle/fluid/memory/allocation/npu_pinned_allocator.cc (new file, mode 100644)

```cpp
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

void NPUPinnedAllocator::ProcessEventsAndFree() {
  for (auto it = npu_events_.begin(); it != npu_events_.end();) {
    aclrtEvent event = it->second;
    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
    PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));

    if (status == ACL_EVENT_STATUS_COMPLETE) {
      Allocation *allocation = it->first;
      void *ptr = allocation->ptr();
      free(ptr);
      npu_events_.erase(it++);
      delete allocation;
      PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
    } else {
      ++it;
    }
  }
}

Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) {
  ProcessEventsAndFree();
  void *ptr;
  int error = posix_memalign(&ptr, kAlignment, size);
  PADDLE_ENFORCE_EQ(
      error, 0,
      platform::errors::ResourceExhausted(
          "Fail to alloc memory of %ld size, error code is %d.", size, error));
  return new Allocation(ptr, size, platform::NPUPinnedPlace());
}

void NPUPinnedAllocator::FreeImpl(Allocation *allocation) {
  void *ptr = allocation->ptr();
  auto iter = npu_events_.find(allocation);
  aclrtEvent event = iter->second;
  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status));

  if (status == ACL_EVENT_STATUS_COMPLETE) {
    free(ptr);
    npu_events_.erase(allocation);
    delete allocation;
    PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
  }
  return;
}

uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) {
  return static_cast<uint64_t>(0);
}

void NPUPinnedAllocator::RecordEvent(Allocation *allocation,
                                     aclrtStream stream) {
  aclrtEvent event = nullptr;
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event));
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream));
  npu_events_.insert({allocation, event});
}

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
#endif
```
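The key design point here: a pinned buffer handed to an in-flight async NPU copy must not be freed while the device may still read it, so FreeImpl only releases a buffer whose recorded aclrtEvent has completed, and AllocateImpl opportunistically sweeps the pending list so reclamation piggybacks on allocation calls instead of needing a background thread. ReleaseImpl returning 0 is consistent with that: nothing is cached beyond the pending-event list, so there is no idle pool to shrink. Below is a self-contained sketch of the same bookkeeping with the ACL event swapped for a generic completion predicate so it compiles without the Ascend toolkit; everything in it is illustrative, not Paddle API:

```cpp
#include <cstdlib>
#include <functional>
#include <unordered_map>

// Sketch of the deferred-free pattern used by NPUPinnedAllocator above.
class DeferredFreePool {
 public:
  using DonePredicate = std::function<bool()>;

  // Tie a buffer to the completion signal of the async work reading it
  // (the analogue of RecordEvent).
  void Record(void *ptr, DonePredicate done) {
    pending_.emplace(ptr, std::move(done));
  }

  // Analogue of ProcessEventsAndFree(): release only buffers whose device
  // work has finished; leave the rest pending for a later sweep.
  void Sweep() {
    for (auto it = pending_.begin(); it != pending_.end();) {
      if (it->second()) {
        std::free(it->first);
        it = pending_.erase(it);
      } else {
        ++it;
      }
    }
  }

 private:
  std::unordered_map<void *, DonePredicate> pending_;
};
```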
paddle/fluid/memory/allocation/npu_pinned_allocator.h (new file, mode 100644)

```cpp
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_ASCEND_CL
#include <mutex>  // NOLINT
#include <string>
#include <unordered_map>

#include "acl/acl.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {
namespace allocation {

class NPUPinnedAllocator : public Allocator {
 public:
  bool IsAllocThreadSafe() const override { return true; }
  void ProcessEventsAndFree();
  void RecordEvent(Allocation *allocation, aclrtStream stream);
  constexpr static size_t kAlignment = 4096UL;

 protected:
  Allocation *AllocateImpl(size_t size) override;
  void FreeImpl(Allocation *allocation) override;
  uint64_t ReleaseImpl(const platform::Place &place) override;

 private:
  std::unordered_map<Allocation *, aclrtEvent> npu_events_;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
#endif
```
paddle/fluid/memory/detail/system_allocator.cc

@@ -310,6 +310,60 @@ void NPUAllocator::Free(void* p, size_t size, size_t index) {
```cpp
}

bool NPUAllocator::UseGpu() const { return true; }

void *NPUPinnedAllocator::Alloc(size_t *index, size_t size) {
  if (size <= 0) return nullptr;

  size_t usable =
      paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
  if (size > usable) {
    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
                 << " MB pinned memory."
                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
    return nullptr;
  }

  void *p;
  // PINNED memory is visible to all NPU contexts.
  auto result = aclrtMallocHost(&p, size);

  if (result == ACL_ERROR_NONE) {
    *index = 1;  // PINNED memory
    npu_pinnd_alloc_size_ += size;
    return p;
  } else {
    LOG(WARNING) << "aclrtMallocHost failed.";
    return nullptr;
  }

  return nullptr;
}

void NPUPinnedAllocator::Free(void *p, size_t size, size_t index) {
  aclError err;
  PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument(
                                  "The index should be 1, but got %d", index));

  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
                    platform::errors::InvalidArgument(
                        "The size of memory (%d) to free exceeds the size of "
                        "allocated npu pinned memory (%d)",
                        size, npu_pinnd_alloc_size_));
  npu_pinnd_alloc_size_ -= size;
  err = aclrtFreeHost(p);

  if (err != ACL_ERROR_NONE) {
    PADDLE_ENFORCE_EQ(
        err, 0,
        platform::errors::Fatal(
            "aclrtFreeHost failed in NPUPinnedAllocator, error code is %d",
            err));
  }
}

bool NPUPinnedAllocator::UseGpu() const { return false; }
#endif

}  // namespace detail
```
paddle/fluid/memory/detail/system_allocator.h

@@ -80,6 +80,16 @@ class NPUAllocator : public SystemAllocator {
```cpp
  size_t npu_alloc_size_ = 0;
  int npu_id_;
};

class NPUPinnedAllocator : public SystemAllocator {
 public:
  virtual void *Alloc(size_t *index, size_t size);
  virtual void Free(void *p, size_t size, size_t index);
  virtual bool UseGpu() const;

 private:
  size_t npu_pinnd_alloc_size_ = 0;
};
#endif

}  // namespace detail
```
paddle/fluid/memory/memcpy.cc

@@ -245,7 +245,7 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
```diff
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     static_cast<platform::NPUDeviceContext *>(pool.Get(src_place))->Wait();
-    platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
+    platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
   }
 }
```

@@ -294,6 +294,86 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
```cpp
    }
  }
}

template <>
void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
    platform::CPUPlace dst_place, void *dst,
    platform::NPUPinnedPlace src_place, const void *src, size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}

template <>
void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
    platform::NPUPinnedPlace dst_place, void *dst,
    platform::CPUPlace src_place, const void *src, size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}

template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
    platform::NPUPinnedPlace dst_place, void *dst,
    platform::NPUPinnedPlace src_place, const void *src, size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}

template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
    platform::NPUPinnedPlace dst_place, void *dst,
    platform::NPUPlace src_place, const void *src, size_t num,
    aclrtStream stream) {
  if (UNLIKELY(num == 0)) return;

  platform::SetNPUDeviceId(src_place.device);
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place << " by stream(" << stream << ")";
  if (stream) {
    platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned");
    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
  } else {
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    static_cast<platform::NPUDeviceContext *>(pool.Get(src_place))->Wait();

    platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned");
    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
  }
}

template <>
void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
    platform::NPUPlace dst_place, void *dst,
    platform::NPUPinnedPlace src_place, const void *src, size_t num,
    aclrtStream stream) {
  if (UNLIKELY(num == 0)) return;

  platform::SetNPUDeviceId(dst_place.device);
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place << " by stream(" << stream << ")";
  if (stream) {
    platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU");
    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
  } else {
    // On the NPU, an async operation after a sync operation is OK, but a
    // sync operation after an async one is not, since the async operation
    // may not be done yet. So a wait is needed before the sync operation.
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    static_cast<platform::NPUDeviceContext *>(pool.Get(dst_place))->Wait();

    platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU");
    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
  }
}
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
```
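Two details worth noting in the copies above: transfers that touch only host memory (pinned to/from CPU, pinned to pinned) are plain std::memcpy, while pinned to/from NPU goes through ACL with the direction flag chosen by which side is the device; and passing a null stream selects the synchronous branch, which waits on the device context first. A hypothetical call site contrasting the two modes (buffers and sizes are assumptions):

```cpp
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"

void CopyBothWays(void *dst_npu, const void *src_pinned, size_t bytes,
                  const paddle::platform::NPUPlace &npu, aclrtStream stream) {
  paddle::platform::NPUPinnedPlace pinned;
  // Async path: the caller must keep src_pinned alive until `stream`
  // finishes, e.g. via NPUPinnedAllocator::RecordEvent.
  paddle::memory::Copy(npu, dst_npu, pinned, src_pinned, bytes, stream);
  // Sync path: a null stream takes the Wait() + NPUMemcpySync branch.
  paddle::memory::Copy(npu, dst_npu, pinned, src_pinned, bytes, nullptr);
}
```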
paddle/fluid/operators/math/math_function.cc

@@ -158,6 +158,14 @@ void set_constant_with_place<platform::NPUPlace>(
```cpp
  PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}

template <>
void set_constant_with_place<platform::NPUPinnedPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
    float value) {
  PADDLE_THROW(
      platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}

template <>
void set_constant_with_place<platform::CPUPlace>(
    const platform::DeviceContext& context, framework::Tensor* tensor,
```
paddle/fluid/platform/cpu_info.cc

@@ -104,6 +104,23 @@ size_t CUDAPinnedMaxChunkSize() {
```cpp
  return CUDAPinnedMaxAllocSize() / 256;
}

size_t NPUPinnedMaxAllocSize() {
  // For distributed systems, it requires configuring and limiting
  // the fraction of memory to use.
  return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
}

size_t NPUPinnedMinChunkSize() {
  // The minimum chunk size allowed is 64 KB.
  return 1 << 16;
}

size_t NPUPinnedMaxChunkSize() {
  // The maximum chunk size allowed is roughly 1/256 of NPU_PINNED memory.
  return NPUPinnedMaxAllocSize() / 256;
}

#ifdef PADDLE_WITH_XBYAK
static Xbyak::util::Cpu cpu;
bool MayIUse(const cpu_isa_t cpu_isa) {
```
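The sizing mirrors the CUDA pinned pool: the pinned budget is a fraction of host RAM (note it reuses FLAGS_fraction_of_cuda_pinned_memory_to_use rather than an NPU-specific flag), and the buddy allocator's chunks span 64 KB up to 1/256 of that budget. A worked example of the arithmetic under assumed inputs (64 GiB host, fraction 0.5, both hypothetical):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed values standing in for FLAGS_fraction_of_cuda_pinned_memory_to_use
  // and CpuTotalPhysicalMemory().
  const double fraction = 0.5;
  const size_t total_ram = 64ULL << 30;  // 64 GiB
  const size_t max_alloc =
      static_cast<size_t>(fraction * total_ram);  // NPUPinnedMaxAllocSize(): 32 GiB
  const size_t min_chunk = 1ULL << 16;            // NPUPinnedMinChunkSize(): 64 KiB
  const size_t max_chunk = max_alloc / 256;       // NPUPinnedMaxChunkSize(): 128 MiB
  std::printf("pool=%zu MiB, chunks %zu KiB..%zu MiB\n", max_alloc >> 20,
              min_chunk >> 10, max_chunk >> 20);
  return 0;
}
```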
paddle/fluid/platform/cpu_info.h

@@ -73,6 +73,15 @@ size_t CUDAPinnedMinChunkSize();
```cpp
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();

//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();

//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();

//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();

typedef enum {
  isa_any,
  sse42,
```
paddle/fluid/platform/device_context.cc

@@ -153,6 +153,16 @@ DeviceContextPool::DeviceContextPool(
```cpp
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPlace is not supported. Please "
          "re-compile with WITH_ASCEND_CL option."));
#endif
    } else if (platform::is_npu_pinned_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
      EmplaceDeviceContext<NPUPinnedDeviceContext, NPUPinnedPlace>(
          &device_contexts_, p);
#else
      PADDLE_THROW(platform::errors::Unimplemented(
          "NPUPinnedPlace is not supported. Please re-compile with "
          "WITH_ASCEND_CL option."));
#endif
    }
  }
```

@@ -264,6 +274,22 @@ aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
```cpp
Place NPUDeviceContext::GetPlace() const { return place_; }

aclrtContext NPUDeviceContext::context() const { return context_; }

NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
    : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

Place NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
```
paddle/fluid/platform/device_context.h

@@ -233,6 +233,27 @@ template <>
```cpp
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
  using TYPE = NPUDeviceContext;
};

// Currently, NPUPinnedDeviceContext is only used for data copying.
class NPUPinnedDeviceContext : public DeviceContext {
 public:
  NPUPinnedDeviceContext();
  explicit NPUPinnedDeviceContext(NPUPinnedPlace place);

  Place GetPlace() const override;

  Eigen::DefaultDevice* eigen_device() const;

 private:
  NPUPinnedPlace place_;
  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};

template <>
struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
  using TYPE = NPUPinnedDeviceContext;
};
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
```
paddle/fluid/platform/place.cc

@@ -34,6 +34,7 @@ class PlacePrinter : public boost::static_visitor<> {
```cpp
  }
  void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
  void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
  void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
  void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }

 private:
```

@@ -62,6 +63,10 @@ bool is_cuda_pinned_place(const Place &p) {
```cpp
  return boost::apply_visitor(IsCUDAPinnedPlace(), p);
}

bool is_npu_pinned_place(const Place &p) {
  return boost::apply_visitor(IsNPUPinnedPlace(), p);
}

bool places_are_same_class(const Place &p1, const Place &p2) {
  return p1.which() == p2.which();
}
```
paddle/fluid/platform/place.h

@@ -85,10 +85,19 @@ struct NPUPlace {
```cpp
  int device;
};

struct NPUPinnedPlace {
  NPUPinnedPlace() {}

  inline bool operator==(const NPUPinnedPlace &) const { return true; }
  inline bool operator!=(const NPUPinnedPlace &) const { return false; }
  inline bool operator<(const NPUPinnedPlace &) const { return false; }
};

struct IsCUDAPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return true; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
```

@@ -97,6 +106,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
```cpp
  bool operator()(const CPUPlace &) const { return true; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
```

@@ -105,6 +115,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
```cpp
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
```

@@ -113,6 +124,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
```cpp
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return true; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};
```

@@ -121,15 +133,25 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
```cpp
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return true; }
  bool operator()(const NPUPinnedPlace &) const { return false; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};

struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
  bool operator()(const CPUPlace &) const { return false; }
  bool operator()(const XPUPlace &) const { return false; }
  bool operator()(const NPUPlace &) const { return false; }
  bool operator()(const NPUPinnedPlace &) const { return true; }
  bool operator()(const CUDAPlace &) const { return false; }
  bool operator()(const CUDAPinnedPlace &) const { return false; }
};

class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
                                    CUDAPinnedPlace, NPUPinnedPlace> {
 private:
  using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
                                   CUDAPinnedPlace, NPUPinnedPlace>;

 public:
  Place() = default;
```

@@ -139,6 +161,8 @@ class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
```cpp
  Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {}  // NOLINT
  Place(const CUDAPinnedPlace &cuda_pinned_place)  // NOLINT
      : PlaceBase(cuda_pinned_place) {}
  Place(const NPUPinnedPlace &npu_pinned_place)  // NOLINT
      : PlaceBase(npu_pinned_place) {}

  bool operator<(const Place &place) const {
    return PlaceBase::operator<(static_cast<const PlaceBase &>(place));
```

@@ -155,6 +179,7 @@ bool is_xpu_place(const Place &);
```cpp
bool is_npu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool is_npu_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
bool is_same_place(const Place &, const Place &);
```

@@ -190,6 +215,17 @@ struct PlaceVisitorWrapper
```cpp
#endif
  }

  typename Visitor::result_type operator()(
      const NPUPinnedPlace &npu_pinned) const {
#ifdef PADDLE_WITH_ASCEND_CL
    return visitor_(npu_pinned);
#else
    PADDLE_THROW(platform::errors::Unavailable(
        "Paddle is not compiled with NPU. Cannot visit npu_pinned"));
    return typename Visitor::result_type();
#endif
  }

  typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    return visitor_(cuda);
```
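Because Place is a boost::variant, growing it to include NPUPinnedPlace is what forces the extra overload added to every Is*Place visitor above: a static_visitor must handle all alternatives of the variant. A minimal sketch of a custom visitor over the extended variant (the visitor itself is hypothetical, not part of this commit):

```cpp
#include <boost/variant.hpp>
#include "paddle/fluid/platform/place.h"

namespace plat = paddle::platform;

// Hypothetical visitor: true for any place whose memory the CPU can touch
// directly. The catch-all template covers the device-resident alternatives.
struct IsHostAccessible : public boost::static_visitor<bool> {
  bool operator()(const plat::CPUPlace &) const { return true; }
  bool operator()(const plat::CUDAPinnedPlace &) const { return true; }
  bool operator()(const plat::NPUPinnedPlace &) const { return true; }
  template <typename DevicePlace>
  bool operator()(const DevicePlace &) const { return false; }
};

bool host_accessible(const plat::Place &p) {
  return boost::apply_visitor(IsHostAccessible(), p);
}
```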