Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
4f01de63
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
4f01de63
编写于
5年前
作者:
D
dzhwinter
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'origin/develop' into feature/ir_inplace_pass
上级
5cab99a6
46a6cac9
变更
8
显示空白变更内容
内联
并排
Showing
8 changed file
with
191 addition
and
20 deletion
+191
-20
paddle/fluid/framework/scope.cc
paddle/fluid/framework/scope.cc
+1
-5
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+2
-1
paddle/fluid/memory/allocation/legacy_allocator.cc
paddle/fluid/memory/allocation/legacy_allocator.cc
+64
-12
paddle/fluid/memory/allocation/legacy_allocator.h
paddle/fluid/memory/allocation/legacy_allocator.h
+47
-0
paddle/fluid/operators/batch_norm_op.cc
paddle/fluid/operators/batch_norm_op.cc
+4
-2
paddle/fluid/platform/place.cc
paddle/fluid/platform/place.cc
+6
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+8
-0
python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py
.../paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py
+59
-0
未找到文件。
paddle/fluid/framework/scope.cc
浏览文件 @
4f01de63
...
...
@@ -22,11 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool
(
benchmark
,
false
,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode."
);
DECLARE_bool
(
benchmark
);
DEFINE_bool
(
eager_delete_scope
,
true
,
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
4f01de63
...
...
@@ -58,7 +58,8 @@ namespace {
bool
IsPersistable
(
const
framework
::
VarDesc
*
var
)
{
if
(
var
->
Persistable
()
&&
var
->
GetType
()
!=
framework
::
proto
::
VarType
::
FEED_MINIBATCH
&&
var
->
GetType
()
!=
framework
::
proto
::
VarType
::
FETCH_LIST
)
{
var
->
GetType
()
!=
framework
::
proto
::
VarType
::
FETCH_LIST
&&
var
->
GetType
()
!=
framework
::
proto
::
VarType
::
RAW
)
{
return
true
;
}
return
false
;
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/memory/allocation/legacy_allocator.cc
浏览文件 @
4f01de63
...
...
@@ -35,6 +35,7 @@ DEFINE_bool(init_allocated_mem, false,
"To find this error in time, we use init_allocated_mem to indicate "
"that initializing the allocated memory with a small value "
"during unit testing."
);
DECLARE_bool
(
benchmark
);
DECLARE_double
(
fraction_of_gpu_memory_to_use
);
namespace
paddle
{
...
...
@@ -59,11 +60,6 @@ size_t memory_usage(const platform::Place &p);
using
BuddyAllocator
=
detail
::
BuddyAllocator
;
std
::
unordered_map
<
/*device id*/
int
,
std
::
pair
<
/*current memory usage*/
uint64_t
,
/*peak memory usage*/
uint64_t
>>
gpu_mem_info
;
BuddyAllocator
*
GetCPUBuddyAllocator
()
{
// We tried thread_local for the inference::RNN1 model, but it did not work
// well in the multi-thread test.
...
...
@@ -144,6 +140,8 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
devices
=
platform
::
GetSelectedDevices
();
int
gpu_num
=
devices
.
size
();
allocation
::
GPUMemMonitor
.
Initialize
(
devices
.
size
());
a_arr
=
new
BuddyAllocator
*
[
gpu_num
];
for
(
size_t
i
=
0
;
i
<
devices
.
size
();
++
i
)
{
int
dev_id
=
devices
[
i
];
...
...
@@ -204,12 +202,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
<<
string
::
HumanReadableSize
(
Used
<
platform
::
CUDAPlace
>
(
place
));
platform
::
SetDeviceId
(
cur_dev
);
}
else
{
gpu_mem_info
[
place
.
device
].
first
+=
size
;
if
(
gpu_mem_info
[
place
.
device
].
first
>
gpu_mem_info
[
place
.
device
].
second
)
{
gpu_mem_info
[
place
.
device
].
second
=
gpu_mem_info
[
place
.
device
].
first
;
VLOG
(
3
)
<<
"device: "
<<
place
.
device
<<
" peak memory usage : "
<<
(
gpu_mem_info
[
place
.
device
].
second
>>
20
)
<<
" MiB"
;
}
if
(
FLAGS_benchmark
)
allocation
::
GPUMemMonitor
.
Add
(
place
.
device
,
size
);
if
(
FLAGS_init_allocated_mem
)
{
cudaMemset
(
ptr
,
0xEF
,
size
);
}
...
...
@@ -225,7 +218,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
size_t
size
)
{
#ifdef PADDLE_WITH_CUDA
GetGPUBuddyAllocator
(
place
.
device
)
->
Free
(
p
);
gpu_mem_info
[
place
.
device
].
first
-=
size
;
if
(
FLAGS_benchmark
)
allocation
::
GPUMemMonitor
.
Minus
(
place
.
device
,
size
)
;
#else
PADDLE_THROW
(
"'CUDAPlace' is not supported in CPU only device."
);
#endif
...
...
@@ -335,6 +328,8 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
namespace
allocation
{
LegacyMemMonitor
GPUMemMonitor
;
Allocation
*
LegacyAllocator
::
AllocateImpl
(
size_t
size
,
Allocator
::
Attr
attr
)
{
void
*
ptr
=
boost
::
apply_visitor
(
legacy
::
AllocVisitor
(
size
),
place_
);
return
new
Allocation
(
ptr
,
size
,
place_
);
...
...
@@ -346,6 +341,63 @@ void LegacyAllocator::Free(Allocation *allocation) {
allocation
->
place
());
delete
allocation
;
}
// Adds `size` bytes to the running usage counter and refreshes the peak.
// Returns true when this call pushed the usage above every earlier value,
// false otherwise. The counters are updated while holding mutex_.
bool MemInfo::Add(const size_t &size) {
  std::lock_guard<std::mutex> guard(mutex_);
  usage_ += size;
  if (usage_ <= peak_usage_) {
    return false;
  }
  // New high-water mark: remember it and report it to the caller.
  peak_usage_ = usage_;
  return true;
}
// Subtracts `size` bytes from the running usage counter while holding
// mutex_. The recorded peak is not modified.
void MemInfo::Minus(const size_t &size) {
  std::lock_guard<std::mutex> guard(mutex_);
  usage_ = usage_ - size;
}
uint64_t
MemInfo
::
GetPeakUsage
()
{
return
peak_usage_
;
}
// Deletes every MemInfo node stored in gpu_mem_info_.
LegacyMemMonitor::~LegacyMemMonitor() {
  for (auto it = gpu_mem_info_.begin(); it != gpu_mem_info_.end(); ++it) {
    delete it->second;
  }
}
void
LegacyMemMonitor
::
Initialize
(
const
int
&
device_num
)
{
for
(
auto
i
=
0
;
i
<
device_num
;
++
i
)
{
gpu_mem_info_
[
i
]
=
new
MemInfo
();
}
}
void
LegacyMemMonitor
::
Add
(
const
int
&
device
,
const
size_t
&
size
)
{
if
(
gpu_mem_info_
[
device
]
->
Add
(
size
))
{
VLOG
(
3
)
<<
"#LegacyMemMonitor# device: "
<<
device
<<
" peak memory usage : "
<<
(
gpu_mem_info_
[
device
]
->
GetPeakUsage
()
>>
20
)
<<
" MiB"
;
}
}
// Records that `size` bytes were released on `device`.
// A device without a MemInfo node is skipped: looking it up with operator[]
// would insert a null pointer into the map and then dereference it.
void LegacyMemMonitor::Minus(const int &device, const size_t &size) {
  const auto it = gpu_mem_info_.find(device);
  if (it == gpu_mem_info_.end() || it->second == nullptr) {
    return;
  }
  it->second->Minus(size);
}
// Returns the peak usage, in bytes, recorded for `device`, or 0 when the
// device has no MemInfo node. Despite the name, this reports the peak value
// (GetPeakUsage), not the current usage.
// A single find() replaces the original find() + operator[] double lookup and
// also guards against a null node.
uint64_t LegacyMemMonitor::GetMemUsage(const int &device) {
  const auto it = gpu_mem_info_.find(device);
  if (it == gpu_mem_info_.end() || it->second == nullptr) {
    return 0;
  }
  return it->second->GetPeakUsage();
}
// Writes one line per monitored device to std::cout, in ascending device-id
// order, showing that device's peak usage in MiB (bytes >> 20).
void LegacyMemMonitor::PrintMemUsage() {
  // unordered_map iteration order is unspecified, so gather and sort the ids.
  std::vector<int> ids;
  for (auto it = gpu_mem_info_.begin(); it != gpu_mem_info_.end(); ++it) {
    ids.emplace_back(it->first);
  }
  std::sort(ids.begin(), ids.end());

  for (size_t k = 0; k < ids.size(); ++k) {
    const int id = ids[k];
    const uint64_t peak_mib = gpu_mem_info_[id]->GetPeakUsage() >> 20;
    std::cout << "Device : " << id << " Peak Memory Usage : " << peak_mib
              << " MiB" << std::endl;
  }
}
}
// namespace allocation
}
// namespace memory
}
// namespace paddle
This diff is collapsed.
Click to expand it.
paddle/fluid/memory/allocation/legacy_allocator.h
浏览文件 @
4f01de63
...
...
@@ -13,12 +13,59 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <mutex> // NOLINT
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
memory
{
namespace
allocation
{
// Byte counters for a single device: the running usage and the highest value
// that usage has reached. Both counters start at zero.
class MemInfo {
 public:
  MemInfo() = default;

  // Copying is disabled.
  MemInfo(const MemInfo &) = delete;
  MemInfo &operator=(const MemInfo &) = delete;

  // Increases the usage by the given byte count; the returned flag tells
  // whether this operation created a new peak point.
  bool Add(const size_t &);
  // Decreases the usage by the given byte count.
  void Minus(const size_t &);
  // Returns the peak usage recorded so far.
  uint64_t GetPeakUsage();

 private:
  /* current memory usage */
  uint64_t usage_{0};
  /* highest usage seen so far */
  uint64_t peak_usage_{0};
  std::mutex mutex_;
};
// Keeps one MemInfo node per device id and forwards usage updates to it.
class LegacyMemMonitor {
 public:
  // Stores the GPU memory usage node of each device, keyed by device id.
  using MemUsage = std::unordered_map</*device id*/ int,
                                      /*mem usage info node*/ MemInfo *>;

  LegacyMemMonitor() = default;

  // The destructor deletes every node in gpu_mem_info_, so a copied monitor
  // would delete the same nodes a second time. Copying is therefore disabled.
  LegacyMemMonitor(const LegacyMemMonitor &) = delete;
  LegacyMemMonitor &operator=(const LegacyMemMonitor &) = delete;

  // Returns a copy of the map. The MemInfo pointers in the copy are the same
  // ones held by this monitor.
  MemUsage GetMemUsageInfo() { return gpu_mem_info_; }

  ~LegacyMemMonitor();

  // Prepares nodes for the given number of devices.
  void Initialize(const int &);
  // Records an allocation: (device id, size in bytes).
  void Add(const int &, const size_t &);
  // Records a release: (device id, size in bytes).
  void Minus(const int &, const size_t &);
  // Returns the usage figure recorded for the given device id.
  uint64_t GetMemUsage(const int &);
  // Prints the recorded usage of every device.
  void PrintMemUsage();

 protected:
  MemUsage gpu_mem_info_;
};
extern
LegacyMemMonitor
GPUMemMonitor
;
class
LegacyAllocatorPrivate
;
class
LegacyAllocator
:
public
Allocator
{
public:
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/operators/batch_norm_op.cc
浏览文件 @
4f01de63
...
...
@@ -589,8 +589,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
op
->
SetInput
(
"SavedVariance"
,
Output
(
"SavedVariance"
));
// used when setting use_global_stats True during training
if
(
boost
::
get
<
bool
>
(
GetAttr
(
"use_global_stats"
)))
{
op
->
SetInput
(
"Mean"
,
Output
(
"MeanOut"
));
op
->
SetInput
(
"Variance"
,
Output
(
"VarianceOut"
));
}
op
->
SetAttrMap
(
Attrs
());
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/platform/place.cc
浏览文件 @
4f01de63
...
...
@@ -14,6 +14,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
// Defines the `benchmark` flag (default: false).
// Adjacent string literals are concatenated by the compiler, so every piece
// that is followed by more text must end with a space; without it the help
// text read "logs.Default" and "willforce".
DEFINE_bool(benchmark, false,
            "Doing memory benchmark. It will make deleting scope synchronized, "
            "and add some memory usage logs. "
            "Default cuda is asynchronous device, set to True will "
            "force op run in synchronous mode.");
namespace
paddle
{
namespace
platform
{
...
...
This diff is collapsed.
Click to expand it.
paddle/fluid/pybind/pybind.cc
浏览文件 @
4f01de63
...
...
@@ -37,6 +37,7 @@ limitations under the License. */
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
...
...
@@ -127,6 +128,13 @@ PYBIND11_MODULE(core, m) {
m
.
add_object
(
"_cleanup"
,
py
::
capsule
([]()
{
ScopePool
::
Instance
().
Clear
();
}));
m
.
def
(
"get_mem_usage"
,
[](
int
device
)
{
return
memory
::
allocation
::
GPUMemMonitor
.
GetMemUsage
(
device
);
});
m
.
def
(
"print_mem_usage"
,
[]()
{
return
memory
::
allocation
::
GPUMemMonitor
.
PrintMemUsage
();
});
py
::
class_
<
imperative
::
VarBase
>
(
m
,
"VarBase"
,
R"DOC()DOC"
)
// .def(py::init<>())
.
def
(
py
::
init
<
bool
>
(),
py
::
arg
(
"stop_gradient"
)
=
false
)
...
...
This diff is collapsed.
Click to expand it.
python/paddle/fluid/tests/unittests/test_peak_gpumem_monitor.py
0 → 100644
浏览文件 @
4f01de63
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
os
os
.
environ
[
'FLAGS_benchmark'
]
=
'True'
import
numpy
import
paddle.fluid.core
as
core
from
paddle.fluid.executor
import
Executor
from
paddle.fluid.layers
import
mul
,
data
class TestPeakMemoryMonitoring(unittest.TestCase):
    """Smoke test for the GPU memory monitor functions exposed on ``core``."""

    def test_mul(self):
        """Build a ``mul`` op, run it on CUDAPlace(0), then print the usage.

        The test checks that ``core.get_mem_usage(0)`` is 0 before the run and
        that ``core.print_mem_usage()`` does not raise afterwards.
        """
        a = data(name='a', shape=[784], dtype='float32')
        b = data(
            name='b',
            shape=[784, 100],
            dtype='float32',
            append_batch_size=False)
        out = mul(x=a, y=b)

        # NOTE(review): the source this was taken from had lost its
        # indentation; everything below is assumed to live inside the CUDA
        # guard because it depends on `place` -- confirm against the repo.
        if core.is_compiled_with_cuda():
            place = core.CUDAPlace(0)

            a_np = numpy.random.random((100, 784)).astype('float32')
            b_np = numpy.random.random((784, 100)).astype('float32')
            self.assertEqual(0, core.get_mem_usage(0))
            exe = Executor(place)
            # The fetched result is not inspected; only the run itself matters.
            exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
            #disable this assert since ctest will ignore the os.environ setting
            #self.assertGreater(core.get_mem_usage(0), 0)

            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt/SystemExit still propagate, and report the
            # actual error instead of a bare flag.
            try:
                core.print_mem_usage()
            except Exception as e:
                self.fail('Exception raised: %r' % (e, ))
if
__name__
==
'__main__'
:
unittest
.
main
()
This diff is collapsed.
Click to expand it.
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录