Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
21f11d35
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
21f11d35
编写于
5月 27, 2022
作者:
R
Ruibiao Chen
提交者:
GitHub
5月 27, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Support memory stats for CPU (#42945)
* Support memory stats for CPU * Add UTs * Fix typos * Fix typos
上级
b2b78cd4
变更
13
隐藏空白更改
内联
并排
Showing
13 changed file
with
385 addition
and
176 deletion
+385
-176
paddle/fluid/memory/CMakeLists.txt
paddle/fluid/memory/CMakeLists.txt
+1
-0
paddle/fluid/memory/allocation/allocator_facade.cc
paddle/fluid/memory/allocation/allocator_facade.cc
+1
-4
paddle/fluid/memory/allocation/stat_allocator.h
paddle/fluid/memory/allocation/stat_allocator.h
+16
-4
paddle/fluid/memory/detail/system_allocator.cc
paddle/fluid/memory/detail/system_allocator.cc
+6
-0
paddle/fluid/memory/memory_stats_test.cc
paddle/fluid/memory/memory_stats_test.cc
+64
-0
paddle/fluid/memory/stats.cc
paddle/fluid/memory/stats.cc
+58
-34
paddle/fluid/memory/stats.h
paddle/fluid/memory/stats.h
+101
-70
paddle/fluid/memory/stats_test.cc
paddle/fluid/memory/stats_test.cc
+112
-46
paddle/fluid/operators/conv_cudnn_helper.h
paddle/fluid/operators/conv_cudnn_helper.h
+4
-2
paddle/fluid/platform/device/gpu/gpu_info.cc
paddle/fluid/platform/device/gpu/gpu_info.cc
+11
-8
paddle/fluid/platform/profiler_helper.h
paddle/fluid/platform/profiler_helper.h
+4
-2
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+3
-2
python/paddle/device/cuda/__init__.py
python/paddle/device/cuda/__init__.py
+4
-4
未找到文件。
paddle/fluid/memory/CMakeLists.txt
浏览文件 @
21f11d35
...
...
@@ -13,6 +13,7 @@ cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
cc_library
(
stats SRCS stats.cc DEPS enforce
)
cc_library
(
memory DEPS malloc memcpy stats
)
cc_test
(
memory_stats_test SRCS memory_stats_test.cc DEPS memory
)
cc_test
(
stats_test SRCS stats_test.cc DEPS stats
)
if
(
WITH_GPU
)
...
...
paddle/fluid/memory/allocation/allocator_facade.cc
浏览文件 @
21f11d35
...
...
@@ -931,10 +931,7 @@ class AllocatorFacadePrivate {
void
WrapStatAllocator
()
{
for
(
auto
&
pair
:
allocators_
)
{
// Now memory stats is only supported for GPU
if
(
platform
::
is_gpu_place
(
pair
.
first
))
{
pair
.
second
=
std
::
make_shared
<
StatAllocator
>
(
pair
.
second
);
}
pair
.
second
=
std
::
make_shared
<
StatAllocator
>
(
pair
.
second
);
}
}
...
...
paddle/fluid/memory/allocation/stat_allocator.h
浏览文件 @
21f11d35
...
...
@@ -30,16 +30,28 @@ class StatAllocator : public Allocator {
protected:
void
FreeImpl
(
phi
::
Allocation
*
allocation
)
override
{
MEMORY_STAT_UPDATE
(
Allocated
,
allocation
->
place
().
GetDeviceId
(),
-
allocation
->
size
());
if
(
platform
::
is_cpu_place
(
allocation
->
place
()))
{
HOST_MEMORY_STAT_UPDATE
(
Allocated
,
allocation
->
place
().
GetDeviceId
(),
-
allocation
->
size
());
}
else
{
DEVICE_MEMORY_STAT_UPDATE
(
Allocated
,
allocation
->
place
().
GetDeviceId
(),
-
allocation
->
size
());
}
underlying_allocator_
->
Free
(
allocation
);
}
phi
::
Allocation
*
AllocateImpl
(
size_t
size
)
override
{
phi
::
Allocator
::
AllocationPtr
allocation
=
underlying_allocator_
->
Allocate
(
size
);
MEMORY_STAT_UPDATE
(
Allocated
,
allocation
->
place
().
GetDeviceId
(),
allocation
->
size
());
if
(
platform
::
is_cpu_place
(
allocation
->
place
()))
{
HOST_MEMORY_STAT_UPDATE
(
Allocated
,
allocation
->
place
().
GetDeviceId
(),
allocation
->
size
());
}
else
{
DEVICE_MEMORY_STAT_UPDATE
(
Allocated
,
allocation
->
place
().
GetDeviceId
(),
allocation
->
size
());
}
return
allocation
.
release
();
}
...
...
paddle/fluid/memory/detail/system_allocator.cc
浏览文件 @
21f11d35
...
...
@@ -15,6 +15,8 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/memory/stats.h"
#ifdef _WIN32
#include <malloc.h>
#ifndef NOMINMAX
...
...
@@ -92,6 +94,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
}
}
HOST_MEMORY_STAT_UPDATE
(
Reserved
,
0
,
size
);
return
p
;
}
...
...
@@ -108,6 +112,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
#else
free
(
p
);
#endif
HOST_MEMORY_STAT_UPDATE
(
Reserved
,
0
,
-
size
);
}
bool
CPUAllocator
::
UseGpu
()
const
{
return
false
;
}
...
...
paddle/fluid/memory/memory_stats_test.cc
0 → 100644
浏览文件 @
21f11d35
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/memory.h"
#include <algorithm>
#include <vector>
#include "gtest/gtest.h"
namespace
paddle
{
namespace
memory
{
// Checks that host "Allocated" statistics follow allocations made on
// CPUPlace: inside the loop the current value is expected to equal the size of
// the allocation made in that iteration, and afterwards the peak value is
// expected to equal the largest allocation size observed.
// NOTE(review): this relies on `allocation` being released at the end of each
// iteration (AllocationPtr is presumably an owning smart pointer) and on no
// other host allocation being tracked concurrently -- confirm.
TEST(stat_allocator_test, host_memory_stat_test) {
  const std::vector<int64_t> alloc_sizes{
      5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527,
      6235, 0,    7810, 940,  1239, 1945, 789,  2891, 7553, 8046, 2685,
      1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395,
      4,    1185, 2186, 357,  9774, 6743, 6136, 7073, 7674, 5640, 3935,
      528,  6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694,
      221,  309,  3617, 3793, 3334, 7281, 1302};

  int64_t largest_seen = 0;
  for (const int64_t requested : alloc_sizes) {
    AllocationPtr allocation = Alloc(platform::CPUPlace(), requested);
    // Compare against the size reported by the allocation itself rather than
    // the requested size.
    const int64_t reported = static_cast<int64_t>(allocation->size());
    largest_seen = std::max(largest_seen, reported);
    EXPECT_EQ(HostMemoryStatCurrentValue("Allocated", 0), reported);
  }

  EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), largest_seen);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Device counterpart of the host test: allocates on a default-constructed
// CUDAPlace and checks the device "Allocated" statistics for device id 0.
// Inside the loop the current value is expected to equal the size of the
// allocation made in that iteration; afterwards the peak value is expected to
// equal the largest allocation size observed.
// NOTE(review): assumes CUDAPlace() refers to device 0 and that `allocation`
// is released at the end of each iteration -- confirm.
TEST(stat_allocator_test, device_memory_stat_test) {
  const std::vector<int64_t> alloc_sizes{
      5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527,
      6235, 0,    7810, 940,  1239, 1945, 789,  2891, 7553, 8046, 2685,
      1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395,
      4,    1185, 2186, 357,  9774, 6743, 6136, 7073, 7674, 5640, 3935,
      528,  6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694,
      221,  309,  3617, 3793, 3334, 7281, 1302};

  int64_t largest_seen = 0;
  for (const int64_t requested : alloc_sizes) {
    AllocationPtr allocation = Alloc(platform::CUDAPlace(), requested);
    // Compare against the size reported by the allocation itself rather than
    // the requested size.
    const int64_t reported = static_cast<int64_t>(allocation->size());
    largest_seen = std::max(largest_seen, reported);
    EXPECT_EQ(DeviceMemoryStatCurrentValue("Allocated", 0), reported);
  }

  EXPECT_EQ(DeviceMemoryStatPeakValue("Allocated", 0), largest_seen);
}
#endif
}
// namespace memory
}
// namespace paddle
paddle/fluid/memory/stats.cc
浏览文件 @
21f11d35
...
...
@@ -38,7 +38,7 @@ class StatRegistry {
}
std
::
string
GetStatKey
(
const
std
::
string
&
stat_type
,
int
dev_id
)
{
return
"STAT_Device"
+
std
::
to_string
(
dev_id
)
+
"_"
+
stat_type
;
return
stat_type
+
std
::
to_string
(
dev_id
)
;
}
int64_t
GetCurrentValue
(
const
std
::
string
&
stat_type
,
int
dev_id
)
{
...
...
@@ -49,6 +49,10 @@ class StatRegistry {
return
GetStat
(
stat_type
,
dev_id
)
->
GetPeakValue
();
}
void
Update
(
const
std
::
string
&
stat_type
,
int
dev_id
,
int64_t
increment
)
{
GetStat
(
stat_type
,
dev_id
)
->
Update
(
increment
);
}
void
Register
(
const
std
::
string
&
stat_type
,
int
dev_id
,
StatBase
*
stat
)
{
std
::
lock_guard
<
SpinLock
>
lock_guard
(
stat_map_lock_
);
stat_map_
[
GetStatKey
(
stat_type
,
dev_id
)]
=
stat
;
...
...
@@ -59,10 +63,6 @@ class StatRegistry {
stat_map_
.
erase
(
GetStatKey
(
stat_type
,
dev_id
));
}
void
Update
(
const
std
::
string
&
stat_type
,
int
dev_id
,
int64_t
increment
)
{
stat_map_
[
GetStatKey
(
stat_type
,
dev_id
)]
->
Update
(
increment
);
}
private:
StatRegistry
()
=
default
;
...
...
@@ -72,43 +72,67 @@ class StatRegistry {
SpinLock
stat_map_lock_
;
};
int64_t
StatGetCurrentValue
(
const
std
::
string
&
stat_type
,
int
dev_id
)
{
return
StatRegistry
::
GetInstance
()
->
GetCurrentValue
(
stat_type
,
dev_id
);
int64_t
DeviceMemoryStatCurrentValue
(
const
std
::
string
&
stat_type
,
int
dev_id
)
{
return
StatRegistry
::
GetInstance
()
->
GetCurrentValue
(
"Device"
+
stat_type
,
dev_id
);
}
int64_t
StatGetPeakValue
(
const
std
::
string
&
stat_type
,
int
dev_id
)
{
return
StatRegistry
::
GetInstance
()
->
GetPeakValue
(
stat_type
,
dev_id
);
int64_t
DeviceMemoryStatPeakValue
(
const
std
::
string
&
stat_type
,
int
dev_id
)
{
return
StatRegistry
::
GetInstance
()
->
GetPeakValue
(
"Device"
+
stat_type
,
dev_id
);
}
void
StatUpdate
(
const
std
::
string
&
stat_type
,
int
dev_id
,
int64_t
increment
)
{
StatRegistry
::
GetInstance
()
->
Update
(
stat_type
,
dev_id
,
increment
);
void
DeviceMemoryStatUpdate
(
const
std
::
string
&
stat_type
,
int
dev_id
,
int64_t
increment
)
{
StatRegistry
::
GetInstance
()
->
Update
(
"Device"
+
stat_type
,
dev_id
,
increment
);
}
#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \
StatRegistry::GetInstance()->Register( \
#item, id, Stat<ThreadLocalStatDevice##id##item>::GetInstance());
#define MEMORY_STAT_REGISTER(item) \
MEMORY_STAT_REGISTER_WITH_ID(item, 0); \
MEMORY_STAT_REGISTER_WITH_ID(item, 1); \
MEMORY_STAT_REGISTER_WITH_ID(item, 2); \
MEMORY_STAT_REGISTER_WITH_ID(item, 3); \
MEMORY_STAT_REGISTER_WITH_ID(item, 4); \
MEMORY_STAT_REGISTER_WITH_ID(item, 5); \
MEMORY_STAT_REGISTER_WITH_ID(item, 6); \
MEMORY_STAT_REGISTER_WITH_ID(item, 7); \
MEMORY_STAT_REGISTER_WITH_ID(item, 8); \
MEMORY_STAT_REGISTER_WITH_ID(item, 9); \
MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
MEMORY_STAT_REGISTER_WITH_ID(item, 15)
// Returns the current value of the host memory stat named by `stat_type` for
// `dev_id`. The registry is queried with `stat_type` prefixed by "Host".
int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
  const std::string host_stat_type = "Host" + stat_type;
  return StatRegistry::GetInstance()->GetCurrentValue(host_stat_type, dev_id);
}
// Returns the peak value of the host memory stat named by `stat_type` for
// `dev_id`. The registry is queried with `stat_type` prefixed by "Host".
int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id) {
  const std::string host_stat_type = "Host" + stat_type;
  return StatRegistry::GetInstance()->GetPeakValue(host_stat_type, dev_id);
}
// Applies `increment` to the host memory stat named by `stat_type` for
// `dev_id`. The registry is addressed with `stat_type` prefixed by "Host".
void HostMemoryStatUpdate(const std::string& stat_type, int dev_id,
                          int64_t increment) {
  const std::string host_stat_type = "Host" + stat_type;
  StatRegistry::GetInstance()->Update(host_stat_type, dev_id, increment);
}
// Registers the Stat singleton for DeviceMemoryStat<item><id> in the
// StatRegistry under the stat type "Device<item>" and device id `id`.
#define DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, id) \
StatRegistry::GetInstance()->Register( \
"Device" #item, id, Stat<DeviceMemoryStat##item##id>::GetInstance());
// Registers the device stat `item` for each device id from 0 through 15.
#define DEVICE_MEMORY_STAT_REGISTER(item) \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 0); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 1); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 2); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 3); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 4); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 5); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 6); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 7); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 8); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 9); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 10); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 11); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 12); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 13); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 14); \
DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 15)
// Registers the Stat singleton for HostMemoryStat<item>0 under the stat type
// "Host<item>"; only id 0 is registered for host stats.
#define HOST_MEMORY_STAT_REGISTER(item) \
StatRegistry::GetInstance()->Register( \
"Host" #item, 0, Stat<HostMemoryStat##item##0>::GetInstance());
int
RegisterAllStats
()
{
MEMORY_STAT_REGISTER
(
Allocated
);
MEMORY_STAT_REGISTER
(
Reserved
);
DEVICE_MEMORY_STAT_REGISTER
(
Allocated
);
DEVICE_MEMORY_STAT_REGISTER
(
Reserved
);
HOST_MEMORY_STAT_REGISTER
(
Allocated
);
HOST_MEMORY_STAT_REGISTER
(
Reserved
);
return
0
;
}
...
...
paddle/fluid/memory/stats.h
浏览文件 @
21f11d35
...
...
@@ -91,82 +91,113 @@ class Stat : public StatBase {
std
::
atomic
<
int64_t
>
peak_value_
{
0
};
};
// StatGetCurrentValue, StatGetPeakValue and StatUpdate operate on STAT values
// selected by a string; however, they have worse performance than the macro
// functions MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE, and
// MEMORY_STAT_UPDATE. Try to use the macro functions where ultra-low
// performance overhead is required.
int64_t
StatGetCurrentValue
(
const
std
::
string
&
stat_type
,
int
dev_id
);
int64_t
StatGetPeakValue
(
const
std
::
string
&
stat_type
,
int
dev_id
);
void
StatUpdate
(
const
std
::
string
&
stat_type
,
int
dev_id
,
int64_t
increment
);
#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \
case id: \
stat = paddle::memory::Stat< \
paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \
// xxxMemoryStatCurrentValue, xxxMemoryStatPeakValue and xxxMemoryStatUpdate
// operate on STAT values selected by a string; however, they have worse
// performance than the macro functions xxx_MEMORY_STAT_CURRENT_VALUE,
// xxx_MEMORY_STAT_PEAK_VALUE, and xxx_MEMORY_STAT_UPDATE. Try to use the macro
// functions where ultra-low performance overhead is required.
int64_t
DeviceMemoryStatCurrentValue
(
const
std
::
string
&
stat_type
,
int
dev_id
);
int64_t
DeviceMemoryStatPeakValue
(
const
std
::
string
&
stat_type
,
int
dev_id
);
void
DeviceMemoryStatUpdate
(
const
std
::
string
&
stat_type
,
int
dev_id
,
int64_t
increment
);
int64_t
HostMemoryStatCurrentValue
(
const
std
::
string
&
stat_type
,
int
dev_id
);
int64_t
HostMemoryStatPeakValue
(
const
std
::
string
&
stat_type
,
int
dev_id
);
void
HostMemoryStatUpdate
(
const
std
::
string
&
stat_type
,
int
dev_id
,
int64_t
increment
);
#define DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \
case id: \
stat = paddle::memory::Stat< \
paddle::memory::DeviceMemoryStat##item##id>::GetInstance(); \
break
#define
MEMORY_STAT_FUNC(item, id, func, ...)
\
[&] { \
paddle::memory::StatBase* stat = nullptr; \
switch (id) { \
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14);
\
MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15);
\
default: \
PADDLE_THROW(paddle::platform::errors::OutOfRange( \
"Only support device id between [0, 15]
in
memory stats," \
"not support device id: %d", \
id)); \
break; \
} \
return stat->func(__VA_ARGS__); \
// Invokes `func` (with any trailing arguments) on the Stat singleton that
// tracks device memory stat `item` for device `id`.
//   item - stat name token, pasted into the DeviceMemoryStat<item><id> type
//          by DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE.
//   id   - device id; values 0 through 15 are handled, anything else raises
//          an OutOfRange error through PADDLE_THROW.
//   func - name of the StatBase member function to call.
// The expansion is an immediately-invoked lambda that captures by reference,
// so the macro can be used as an expression yielding the result of `func`.
// The two message literals are concatenated by the compiler; the first one
// ends with a space so the resulting text reads "..., not support ...".
#define DEVICE_MEMORY_STAT_FUNC(item, id, func, ...)                        \
  [&] {                                                                     \
    paddle::memory::StatBase* stat = nullptr;                               \
    switch (id) {                                                           \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0);                        \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1);                        \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2);                        \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3);                        \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4);                        \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5);                        \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6);                        \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7);                        \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8);                        \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9);                        \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10);                       \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11);                       \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12);                       \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13);                       \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14);                       \
      DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15);                       \
      default:                                                              \
        PADDLE_THROW(paddle::platform::errors::OutOfRange(                  \
            "Only support device id between [0, 15] for device memory "     \
            "stats, not support device id: %d",                             \
            id));                                                           \
        break;                                                              \
    }                                                                       \
    return stat->func(__VA_ARGS__);                                         \
  }()
#define MEMORY_STAT_CURRENT_VALUE(item, id) \
MEMORY_STAT_FUNC(item, id, GetCurrentValue)
#define MEMORY_STAT_PEAK_VALUE(item, id) \
MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define MEMORY_STAT_UPDATE(item, id, increment) \
MEMORY_STAT_FUNC(item, id, Update, increment)
#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \
struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {};
#define MEMORY_STAT_DECLARE(item) \
MEMORY_STAT_DECLARE_WITH_ID(item, 0); \
MEMORY_STAT_DECLARE_WITH_ID(item, 1); \
MEMORY_STAT_DECLARE_WITH_ID(item, 2); \
MEMORY_STAT_DECLARE_WITH_ID(item, 3); \
MEMORY_STAT_DECLARE_WITH_ID(item, 4); \
MEMORY_STAT_DECLARE_WITH_ID(item, 5); \
MEMORY_STAT_DECLARE_WITH_ID(item, 6); \
MEMORY_STAT_DECLARE_WITH_ID(item, 7); \
MEMORY_STAT_DECLARE_WITH_ID(item, 8); \
MEMORY_STAT_DECLARE_WITH_ID(item, 9); \
MEMORY_STAT_DECLARE_WITH_ID(item, 10); \
MEMORY_STAT_DECLARE_WITH_ID(item, 11); \
MEMORY_STAT_DECLARE_WITH_ID(item, 12); \
MEMORY_STAT_DECLARE_WITH_ID(item, 13); \
MEMORY_STAT_DECLARE_WITH_ID(item, 14); \
MEMORY_STAT_DECLARE_WITH_ID(item, 15)
// Expression macros over DEVICE_MEMORY_STAT_FUNC for device stat `item` on
// device `id`: call GetCurrentValue, call GetPeakValue, or call Update with
// `increment`, respectively.
#define DEVICE_MEMORY_STAT_CURRENT_VALUE(item, id) \
DEVICE_MEMORY_STAT_FUNC(item, id, GetCurrentValue)
#define DEVICE_MEMORY_STAT_PEAK_VALUE(item, id) \
DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \
DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment)
// Invokes `func` (with any trailing arguments) on the Stat singleton for
// HostMemoryStat<item>0. `id` is checked with PADDLE_ENFORCE_EQ and must be
// 0; any other value produces an OutOfRange error. The expansion is an
// immediately-invoked lambda capturing by reference, so the macro can be used
// as an expression yielding the result of `func`.
#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) \
[&] { \
PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange( \
"Only support device id 0 for host memory " \
"stats, not support device id: %d", \
id)); \
return paddle::memory::Stat< \
paddle::memory::HostMemoryStat##item##0>::GetInstance() \
->func(__VA_ARGS__); \
}()
// Expression macros over HOST_MEMORY_STAT_FUNC for host stat `item`: call
// GetCurrentValue, call GetPeakValue, or call Update with `increment`,
// respectively. `id` is forwarded to HOST_MEMORY_STAT_FUNC unchanged.
#define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \
HOST_MEMORY_STAT_FUNC(item, id, GetCurrentValue)
#define HOST_MEMORY_STAT_PEAK_VALUE(item, id) \
HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue)
#define HOST_MEMORY_STAT_UPDATE(item, id, increment) \
HOST_MEMORY_STAT_FUNC(item, id, Update, increment)
// Declares the empty tag struct DeviceMemoryStat<item><id>, derived from
// ThreadLocalStatBase. The expansion has no trailing semicolon; the use site
// supplies it.
#define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \
struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {}
// Declares the tag structs for device stat `item` for each device id from 0
// through 15.
#define DEVICE_MEMORY_STAT_DECLARE(item) \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 0); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 1); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 2); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 3); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 4); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 5); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 6); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 7); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 8); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 9); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 10); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 11); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 12); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 13); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 14); \
DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 15)
// Only support id 0 for host memory stat
// Declares the empty tag struct HostMemoryStat<item>0, derived from
// ThreadLocalStatBase. Unlike the device macro above, this expansion already
// ends with a semicolon.
#define HOST_MEMORY_STAT_DECLARE(item) \
struct HostMemoryStat##item##0 : public ThreadLocalStatBase{};
// To add a new STAT type, declare here and register in stats.cc
MEMORY_STAT_DECLARE
(
Allocated
);
MEMORY_STAT_DECLARE
(
Reserved
);
DEVICE_MEMORY_STAT_DECLARE
(
Allocated
);
DEVICE_MEMORY_STAT_DECLARE
(
Reserved
);
HOST_MEMORY_STAT_DECLARE
(
Allocated
);
HOST_MEMORY_STAT_DECLARE
(
Reserved
);
}
// namespace memory
}
// namespace paddle
paddle/fluid/memory/stats_test.cc
浏览文件 @
21f11d35
...
...
@@ -23,50 +23,77 @@
namespace
paddle
{
namespace
memory
{
TEST
(
stats_test
,
MultiThreadReadWriteTest
)
{
std
::
string
stat_type
=
"Allocated"
;
size_t
thread_num
=
3
;
size_t
data_num
=
10
;
std
::
condition_variable
cv
;
std
::
mutex
mutex
;
std
::
vector
<
std
::
thread
>
threads
;
size_t
ready_thread_num
=
0
;
for
(
size_t
i
=
0
;
i
<
thread_num
;
++
i
)
{
threads
.
emplace_back
(
[
&
stat_type
,
data_num
,
&
cv
,
&
mutex
,
&
ready_thread_num
]()
{
for
(
size_t
data
=
0
;
data
<
data_num
;
++
data
)
{
StatUpdate
(
stat_type
,
0
,
data
);
}
/* lock guard*/
{
std
::
lock_guard
<
std
::
mutex
>
lock_guard
{
mutex
};
++
ready_thread_num
;
cv
.
notify_one
();
}
// Sleep here to not exit before the main thread checking stat
// results, because the thread-local stat data will be destroyed when
// the thread exit
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
seconds
(
1
));
});
class
StatsTest
:
public
::
testing
::
Test
{
protected:
void
SetStatType
(
const
std
::
string
&
stat_type
)
{
stat_type_
=
stat_type
;
}
void
SetFunc
(
std
::
function
<
void
(
const
std
::
string
,
int
,
int64_t
)
>
update_func
,
std
::
function
<
int64_t
(
const
std
::
string
,
int
)
>
current_value_func
,
std
::
function
<
int64_t
(
const
std
::
string
,
int
)
>
peak_value_func
)
{
update_func_
=
update_func
;
current_value_func_
=
current_value_func
;
peak_value_func_
=
peak_value_func
;
}
void
RunTests
()
{
MultiThreadReadWriteTest
();
PeakValueTest
();
}
std
::
unique_lock
<
std
::
mutex
>
unique_lock
(
mutex
);
cv
.
wait
(
unique_lock
,
[
&
ready_thread_num
,
thread_num
]()
{
return
ready_thread_num
==
thread_num
;
});
private:
void
MultiThreadReadWriteTest
()
{
size_t
thread_num
=
3
;
size_t
data_num
=
10
;
std
::
condition_variable
cv
;
std
::
mutex
mutex
;
std
::
vector
<
std
::
thread
>
threads
;
size_t
ready_thread_num
=
0
;
for
(
size_t
i
=
0
;
i
<
thread_num
;
++
i
)
{
threads
.
emplace_back
([
&
]()
{
for
(
size_t
data
=
0
;
data
<
data_num
;
++
data
)
{
update_func_
(
stat_type_
,
0
,
data
);
}
/* lock guard*/
{
std
::
lock_guard
<
std
::
mutex
>
lock_guard
{
mutex
};
++
ready_thread_num
;
cv
.
notify_one
();
}
// Sleep here so the thread does not exit before the main thread has checked
// the stat results, because the thread-local stat data is destroyed when
// the thread exits
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
seconds
(
1
));
});
}
EXPECT_EQ
(
StatGetCurrentValue
(
stat_type
,
0
),
int64_t
((
thread_num
*
data_num
*
(
data_num
-
1
))
>>
1
));
std
::
unique_lock
<
std
::
mutex
>
unique_lock
(
mutex
);
cv
.
wait
(
unique_lock
,
[
&
ready_thread_num
,
thread_num
]()
{
return
ready_thread_num
==
thread_num
;
});
for
(
size_t
i
=
0
;
i
<
thread_num
;
++
i
)
{
threads
[
i
].
join
();
EXPECT_EQ
(
current_value_func_
(
stat_type_
,
0
),
int64_t
((
thread_num
*
data_num
*
(
data_num
-
1
))
>>
1
));
for
(
size_t
i
=
0
;
i
<
thread_num
;
++
i
)
{
threads
[
i
].
join
();
}
}
void
PeakValueTest
()
{
int64_t
peak_value
=
((
int64_t
)
1
)
<<
63
;
int64_t
sum
=
0
;
for
(
int64_t
data
:
datas_
)
{
update_func_
(
stat_type_
,
0
,
data
);
sum
+=
data
;
peak_value
=
std
::
max
(
peak_value
,
sum
);
}
EXPECT_EQ
(
peak_value_func_
(
stat_type_
,
0
),
peak_value
);
}
}
TEST
(
stats_test
,
PeakValueTest
)
{
std
::
string
stat_type
=
"Allocated"
;
std
::
vector
<
int64_t
>
datas
=
{
std
::
string
stat_type_
;
std
::
vector
<
int64_t
>
datas_
{
543149808935355
,
634698327471328
,
706215795436611
,
577939367795333
,
419479490054362
,
21975227714595
,
812939817942250
,
984428837942082
,
537304104446806
,
685008544452453
,
563352858161268
,
690143831596330
,
...
...
@@ -93,14 +120,53 @@ TEST(stats_test, PeakValueTest) {
746465732805300
,
-
74049761897414
,
-
65640372433924
,
852009039806484
,
305079802044257
,
-
48409757869238
,
266031781660228
,
327287322379820
};
int64_t
peak_value
=
((
int64_t
)
1
)
<<
63
;
int64_t
sum
=
0
;
for
(
int64_t
data
:
datas
)
{
StatUpdate
(
stat_type
,
0
,
data
);
sum
+=
data
;
peak_value
=
std
::
max
(
peak_value
,
sum
);
}
EXPECT_EQ
(
StatGetPeakValue
(
stat_type
,
0
),
peak_value
);
std
::
function
<
void
(
const
std
::
string
,
int
,
int64_t
)
>
update_func_
;
std
::
function
<
int64_t
(
const
std
::
string
,
int
)
>
current_value_func_
;
std
::
function
<
int64_t
(
const
std
::
string
,
int
)
>
peak_value_func_
;
};
// Runs the shared fixture tests against the string-keyed device stat functions
// for the "Allocated" stat.
TEST_F(StatsTest, DeviceAllocatedTest) {
  SetStatType("Allocated");
  SetFunc(&DeviceMemoryStatUpdate,
          &DeviceMemoryStatCurrentValue,
          &DeviceMemoryStatPeakValue);
  RunTests();
}
// Runs the shared fixture tests against the DEVICE_MEMORY_STAT_* macros for
// the Reserved stat. Each lambda accepts the stat-type string to match the
// fixture's callback signature but ignores it, because the macros select the
// stat at compile time.
TEST_F(StatsTest, DeviceReservedMacroTest) {
  SetStatType("Reserved");

  auto update_via_macro = [](const std::string stat_type, int id,
                             int64_t increment) {
    return DEVICE_MEMORY_STAT_UPDATE(Reserved, id, increment);
  };
  auto current_via_macro = [](const std::string stat_type, int id) {
    return DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, id);
  };
  auto peak_via_macro = [](const std::string stat_type, int id) {
    return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id);
  };

  SetFunc(update_via_macro, current_via_macro, peak_via_macro);
  RunTests();
}
// Runs the shared fixture tests against the HOST_MEMORY_STAT_* macros for the
// Allocated stat. Each lambda accepts the stat-type string to match the
// fixture's callback signature but ignores it, because the macros select the
// stat at compile time.
TEST_F(StatsTest, HostAllocatedMacroTest) {
  SetStatType("Allocated");

  auto update_via_macro = [](const std::string stat_type, int id,
                             int64_t increment) {
    return HOST_MEMORY_STAT_UPDATE(Allocated, id, increment);
  };
  auto current_via_macro = [](const std::string stat_type, int id) {
    return HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, id);
  };
  auto peak_via_macro = [](const std::string stat_type, int id) {
    return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id);
  };

  SetFunc(update_via_macro, current_via_macro, peak_via_macro);
  RunTests();
}
// Runs the shared fixture tests against the string-keyed host stat functions
// for the "Reserved" stat.
TEST_F(StatsTest, HostReservedTest) {
  SetStatType("Reserved");
  SetFunc(&HostMemoryStatUpdate,
          &HostMemoryStatCurrentValue,
          &HostMemoryStatPeakValue);
  RunTests();
}
}
// namespace memory
...
...
paddle/fluid/operators/conv_cudnn_helper.h
浏览文件 @
21f11d35
...
...
@@ -72,8 +72,10 @@ static inline bool UseFixedWorkspace() {
static
size_t
CalcWorkspaceLimitInBytes
(
bool
use_fixed_workspace
)
{
if
(
!
use_fixed_workspace
)
{
int
device_id
=
platform
::
GetCurrentDeviceId
();
int64_t
allocated
=
memory
::
StatGetCurrentValue
(
"Allocated"
,
device_id
);
int64_t
reserved
=
memory
::
StatGetCurrentValue
(
"Reserved"
,
device_id
);
int64_t
allocated
=
memory
::
DeviceMemoryStatCurrentValue
(
"Allocated"
,
device_id
);
int64_t
reserved
=
memory
::
DeviceMemoryStatCurrentValue
(
"Reserved"
,
device_id
);
int64_t
availble
=
platform
::
GpuAvailableMemToAlloc
();
VLOG
(
3
)
<<
"[memory] allocated="
<<
ToMegaBytes
(
allocated
)
<<
" MB, reserved="
<<
ToMegaBytes
(
reserved
)
...
...
paddle/fluid/platform/device/gpu/gpu_info.cc
浏览文件 @
21f11d35
...
...
@@ -149,8 +149,8 @@ class RecordedGpuMallocHelper {
if
(
FLAGS_enable_gpu_memory_usage_log
)
{
// A fake UPDATE to trigger the construction of memory stat instances,
// make sure that they are destructed after RecordedGpuMallocHelper.
MEMORY_STAT_UPDATE
(
Reserved
,
dev_id
,
0
);
MEMORY_STAT_UPDATE
(
Allocated
,
dev_id
,
0
);
DEVICE_
MEMORY_STAT_UPDATE
(
Reserved
,
dev_id
,
0
);
DEVICE_
MEMORY_STAT_UPDATE
(
Allocated
,
dev_id
,
0
);
}
}
...
...
@@ -161,15 +161,18 @@ class RecordedGpuMallocHelper {
if
(
FLAGS_enable_gpu_memory_usage_log
)
{
if
(
FLAGS_enable_gpu_memory_usage_log_mb
)
{
std
::
cout
<<
"[Memory Usage (MB)] gpu "
<<
dev_id_
<<
" : Reserved = "
<<
MEMORY_STAT_PEAK_VALUE
(
Reserved
,
dev_id_
)
/
1048576.0
<<
DEVICE_MEMORY_STAT_PEAK_VALUE
(
Reserved
,
dev_id_
)
/
1048576.0
<<
", Allocated = "
<<
MEMORY_STAT_PEAK_VALUE
(
Allocated
,
dev_id_
)
/
1048576.0
<<
DEVICE_MEMORY_STAT_PEAK_VALUE
(
Allocated
,
dev_id_
)
/
1048576.0
<<
std
::
endl
;
}
else
{
std
::
cout
<<
"[Memory Usage (Byte)] gpu "
<<
dev_id_
<<
" : Reserved = "
<<
MEMORY_STAT_PEAK_VALUE
(
Reserved
,
dev_id_
)
<<
DEVICE_
MEMORY_STAT_PEAK_VALUE
(
Reserved
,
dev_id_
)
<<
", Allocated = "
<<
MEMORY_STAT_PEAK_VALUE
(
Allocated
,
dev_id_
)
<<
std
::
endl
;
<<
DEVICE_MEMORY_STAT_PEAK_VALUE
(
Allocated
,
dev_id_
)
<<
std
::
endl
;
}
}
}
...
...
@@ -230,7 +233,7 @@ class RecordedGpuMallocHelper {
if
(
result
==
gpuSuccess
)
{
cur_size_
.
fetch_add
(
size
);
STAT_INT_ADD
(
"STAT_gpu"
+
std
::
to_string
(
dev_id_
)
+
"_mem_size"
,
size
);
MEMORY_STAT_UPDATE
(
Reserved
,
dev_id_
,
size
);
DEVICE_
MEMORY_STAT_UPDATE
(
Reserved
,
dev_id_
,
size
);
#ifdef PADDLE_WITH_TESTING
gpu_ptrs
.
insert
(
*
ptr
);
...
...
@@ -269,7 +272,7 @@ class RecordedGpuMallocHelper {
PADDLE_ENFORCE_GPU_SUCCESS
(
err
);
cur_size_
.
fetch_sub
(
size
);
STAT_INT_SUB
(
"STAT_gpu"
+
std
::
to_string
(
dev_id_
)
+
"_mem_size"
,
size
);
MEMORY_STAT_UPDATE
(
Reserved
,
dev_id_
,
-
size
);
DEVICE_
MEMORY_STAT_UPDATE
(
Reserved
,
dev_id_
,
-
size
);
}
else
{
platform
::
GpuGetLastError
();
// clear the error flag when
// cudaErrorCudartUnloading /
...
...
paddle/fluid/platform/profiler_helper.h
浏览文件 @
21f11d35
...
...
@@ -168,8 +168,10 @@ void PrintMemProfiler(
if
(
num_gpus
>
0
)
{
std
::
cout
<<
"GPU Memory Usage (MB):
\n
"
;
for
(
int
dev_id
=
0
;
dev_id
<
num_gpus
;
++
dev_id
)
{
int64_t
allocated
=
memory
::
StatGetCurrentValue
(
"Allocated"
,
dev_id
);
int64_t
reserved
=
memory
::
StatGetCurrentValue
(
"Reserved"
,
dev_id
);
int64_t
allocated
=
memory
::
DeviceMemoryStatCurrentValue
(
"Allocated"
,
dev_id
);
int64_t
reserved
=
memory
::
DeviceMemoryStatCurrentValue
(
"Reserved"
,
dev_id
);
size_t
available
=
0
,
total
=
0
,
actual_available
=
0
,
actual_total
=
0
;
RecordedGpuMemGetInfo
(
&
available
,
&
total
,
&
actual_available
,
&
actual_total
,
dev_id
);
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
21f11d35
...
...
@@ -3005,8 +3005,9 @@ All parameter, weight, gradient are variables in Paddle.
}
return
stats_map
;
});
m
.
def
(
"memory_stat_get_current"
,
memory
::
StatGetCurrentValue
);
m
.
def
(
"memory_stat_get_peak"
,
memory
::
StatGetPeakValue
);
m
.
def
(
"device_memory_stat_current_value"
,
memory
::
DeviceMemoryStatCurrentValue
);
m
.
def
(
"device_memory_stat_peak_value"
,
memory
::
DeviceMemoryStatPeakValue
);
m
.
def
(
"run_cmd"
,
[](
const
std
::
string
&
cmd
,
int
time_out
=
-
1
,
int
sleep_inter
=
-
1
)
->
const
std
::
string
{
...
...
python/paddle/device/cuda/__init__.py
浏览文件 @
21f11d35
...
...
@@ -224,7 +224,7 @@ def max_memory_allocated(device=None):
f
"The API
{
name
}
is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id
=
extract_cuda_device_id
(
device
,
op_name
=
name
)
return
core
.
memory_stat_get_peak
(
"Allocated"
,
device_id
)
return
core
.
device_memory_stat_peak_value
(
"Allocated"
,
device_id
)
def
max_memory_reserved
(
device
=
None
):
...
...
@@ -255,7 +255,7 @@ def max_memory_reserved(device=None):
f
"The API
{
name
}
is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id
=
extract_cuda_device_id
(
device
,
op_name
=
name
)
return
core
.
memory_stat_get_peak
(
"Reserved"
,
device_id
)
return
core
.
device_memory_stat_peak_value
(
"Reserved"
,
device_id
)
def
memory_allocated
(
device
=
None
):
...
...
@@ -290,7 +290,7 @@ def memory_allocated(device=None):
f
"The API
{
name
}
is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id
=
extract_cuda_device_id
(
device
,
op_name
=
name
)
return
core
.
memory_stat_get_current
(
"Allocated"
,
device_id
)
return
core
.
device_memory_stat_current_value
(
"Allocated"
,
device_id
)
def
memory_reserved
(
device
=
None
):
...
...
@@ -321,7 +321,7 @@ def memory_reserved(device=None):
f
"The API
{
name
}
is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API."
)
device_id
=
extract_cuda_device_id
(
device
,
op_name
=
name
)
return
core
.
memory_stat_get_current
(
"Reserved"
,
device_id
)
return
core
.
device_memory_stat_current_value
(
"Reserved"
,
device_id
)
def
_set_current_stream
(
stream
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录