PaddlePaddle / Paddle
Commit 97b143fb
Authored on Feb 12, 2019 by Qiao Longfei

    Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-cpu-broadcast

    test=develop

Parents: ffd0d1d2, d424e5b4
Showing 46 changed files with 1,110 additions and 1,163 deletions.
cmake/flags.cmake (+2, -1)
paddle/fluid/framework/CMakeLists.txt (+11, -10)
paddle/fluid/framework/details/CMakeLists.txt (+2, -6)
paddle/fluid/framework/details/build_strategy.cc (+0, -2)
paddle/fluid/framework/details/build_strategy.h (+0, -3)
paddle/fluid/framework/details/inplace_op_pass.cc (+6, -7)
paddle/fluid/framework/details/inplace_op_pass.h (+8, -7)
paddle/fluid/framework/details/memory_early_delete_pass.cc (+0, -117)
paddle/fluid/framework/details/memory_early_delete_pass.h (+0, -32)
paddle/fluid/framework/details/memory_optimize_helper.cc (+306, -30)
paddle/fluid/framework/details/memory_optimize_helper.h (+83, -36)
paddle/fluid/framework/details/memory_optimize_helper_test.cc (+408, -9)
paddle/fluid/framework/details/memory_optimize_pass.cc (+8, -289)
paddle/fluid/framework/details/memory_optimize_pass.h (+3, -47)
paddle/fluid/framework/details/memory_optimize_pass_test.cc (+0, -417)
paddle/fluid/framework/details/sequential_execution_pass.cc (+1, -0)
paddle/fluid/framework/details/sequential_execution_pass.h (+0, -2)
paddle/fluid/framework/inplace_op_inference.h (+1, -1)
paddle/fluid/framework/parallel_executor.cc (+2, -9)
paddle/fluid/framework/scope.cc (+1, -5)
paddle/fluid/inference/api/CMakeLists.txt (+2, -2)
paddle/fluid/memory/allocation/legacy_allocator.cc (+4, -3)
paddle/fluid/memory/allocation/pinned_allocator.cc (+1, -1)
paddle/fluid/memory/allocation/pinned_allocator.h (+1, -1)
paddle/fluid/memory/detail/system_allocator.cc (+2, -2)
paddle/fluid/operators/elementwise/elementwise_op.h (+19, -1)
paddle/fluid/operators/ngraph/ngraph_engine_op.h (+1, -1)
paddle/fluid/operators/reader/buffered_reader.cc (+55, -1)
paddle/fluid/operators/reader/buffered_reader.h (+8, -0)
paddle/fluid/platform/place.cc (+6, -0)
paddle/fluid/pybind/pybind.cc (+7, -4)
python/paddle/__init__.py (+1, -0)
python/paddle/distributed/__init__.py (+13, -0)
python/paddle/distributed/launch.py (+22, -16)
python/paddle/fluid/__init__.py (+0, -1)
python/paddle/fluid/layer_helper.py (+2, -1)
python/paddle/fluid/parallel_executor.py (+2, -1)
python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py (+27, -4)
python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py (+25, -1)
python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py (+5, -62)
python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py (+2, -6)
python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py (+25, -14)
python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py (+25, -1)
python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py (+8, -10)
python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py (+4, -0)
python/setup.py.in (+1, -0)
cmake/flags.cmake
@@ -21,12 +21,13 @@ function(CheckCompilerCXX11Flag)
       if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
         message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
       endif()
     endif()
   endif()
 endfunction()

 CheckCompilerCXX11Flag()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")

 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
paddle/fluid/framework/CMakeLists.txt
@@ -158,18 +158,19 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
 cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)

-if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
-    lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper)
-  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+if(WITH_NGRAPH)
+  set(NGRAPH_EXE_DEPS ngraph_engine)
 else()
-  if(WITH_NGRAPH)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ngraph_engine)
-  else()
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-  endif()
+  set(NGRAPH_EXE_DEPS)
+endif()
+
+if(WITH_DISTRIBUTE)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
+    lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+else()
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
 endif()

 cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
paddle/fluid/framework/details/CMakeLists.txt
@@ -54,8 +54,6 @@ cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph grap
 cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
 cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
-cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
-        all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
 cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
@@ -67,13 +65,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
            scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)

-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass
-        memory_early_delete_pass
-        inplace_op_pass)
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
 if (WITH_GPU)
   list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
 endif()

-cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph)
-cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass)
+cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
paddle/fluid/framework/details/build_strategy.cc
@@ -209,8 +209,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
           new std::vector<OpDesc *>(main_program.Block(0).AllOps());
       graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
                                               all_op_descs);  // take ownership
-      graph->Set<GraphNodePool>(kGraphNodePool,
-                                new GraphNodePool);  // take ownership
       pass->Erase(kAllOpDescs);
       pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
paddle/fluid/framework/details/build_strategy.h
@@ -77,9 +77,6 @@ struct BuildStrategy {
   bool fuse_relu_depthwise_conv_{false};

   bool memory_optimize_{false};
-
-  bool memory_early_delete_{false};

   // TODO(dzhwinter):
   // make enable_inplace, memory_optimize_
   // memory_early_delete_ true by default
paddle/fluid/framework/details/inplace_op_pass.cc
@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
   }
 }

-const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
-                                                   const std::string& cache_var,
-                                                   const size_t& idx,
-                                                   ir::Graph* graph) const {
+const NodeSwapQueue InplacePass::TryInplaceModifyVar(const std::string& var,
+                                                     const std::string& cache_var,
+                                                     const size_t& idx,
+                                                     ir::Graph* graph) const {
   PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
                  var_nodes_[var].at(0)->Var() != nullptr);
   std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
   var_desc->SetName(cache_var);

-  SSANodePair swap_nodes;
+  NodeSwapQueue swap_nodes;

   for (size_t i = idx; i < view_.AllOps().size(); ++i) {
     auto* op = view_.AllOps()[i];
@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
   return swap_nodes;
 }

-void InplacePass::CommitModify(const SSANodePair& swap_nodes,
+void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
                                ir::Graph* graph) const {
   for (auto& pair : swap_nodes) {
     auto *node = pair.first, *cache_node = pair.second;
@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes,
   }
 }

-void InplacePass::WithdrawModify(const SSANodePair& nodes,
+void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
                                  ir::Graph* graph) const {
   for (auto& pair : nodes) {
     auto *node = pair.first, *cache_node = pair.second;
paddle/fluid/framework/details/inplace_op_pass.h
@@ -56,7 +56,8 @@ class GraphView {
   std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
 };

-typedef std::vector<std::pair<ir::Node*, ir::Node*>> SSANodePair;
+// swap pairs in sequence
+typedef std::vector<std::pair<ir::Node*, ir::Node*>> NodeSwapQueue;

 class InplacePass : public ir::Pass {
  public:
   InplacePass();
@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass {
   void InitSSAGraphNodes() const;

  private:
-  const SSANodePair TryInplaceModifyVar(const std::string& var,
-                                        const std::string& cache_var,
-                                        const size_t& idx,
-                                        ir::Graph* graph) const;
+  const NodeSwapQueue TryInplaceModifyVar(const std::string& var,
+                                          const std::string& cache_var,
+                                          const size_t& idx,
+                                          ir::Graph* graph) const;

-  void CommitModify(const SSANodePair&, ir::Graph* graph) const;
+  void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const;

-  void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const;
+  void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const;

   void InplaceModifyDesc(const std::string& in_var, const std::string& out_var,
                          const size_t& idx) const;
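The inplace pass above records its tentative swaps in a NodeSwapQueue, a vector of (original node, cache node) pairs, and then either commits or withdraws them as a unit. The following is a minimal, self-contained sketch of that try/commit/withdraw pattern; the Node struct and function bodies are toy stand-ins for illustration only, not Paddle's ir::Node or the pass's real API.

// Toy illustration of the try/commit/withdraw pattern behind NodeSwapQueue.
// "Node" here is a hypothetical stand-in, not paddle::framework::ir::Node.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Node {
  std::string name;
};

using NodeSwapQueue = std::vector<std::pair<Node*, Node*>>;  // (original, cache)

// Tentatively pair each variable with a reused cache buffer; nothing is
// destroyed yet, so the caller can still back out.
NodeSwapQueue TryInplaceModify(std::vector<Node>& vars, Node* cache) {
  NodeSwapQueue swaps;
  for (auto& v : vars) swaps.emplace_back(&v, cache);
  return swaps;
}

// Commit: make every original take over its cache (here: just rename it).
void CommitModify(const NodeSwapQueue& swaps) {
  for (auto& p : swaps) p.first->name = p.second->name;
}

// Withdraw: drop the recorded pairs without touching the originals.
void WithdrawModify(NodeSwapQueue* swaps) { swaps->clear(); }

int main() {
  std::vector<Node> vars{{"x"}, {"y"}};
  Node cache{"reuse_buf"};
  auto swaps = TryInplaceModify(vars, &cache);
  bool ok = true;  // pretend validation of the tentative swaps succeeded
  if (ok) CommitModify(swaps); else WithdrawModify(&swaps);
  for (auto& v : vars) std::cout << v.name << "\n";  // prints "reuse_buf" twice
}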
paddle/fluid/framework/details/memory_early_delete_pass.cc (deleted file, 100644 → 0)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/memory_early_delete_pass.h"
#include <queue>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"

namespace paddle {
namespace framework {
namespace details {

static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
  std::queue<VarHandleBase*> queue;
  queue.push(var_in);
  do {
    auto* var = queue.front();
    queue.pop();
    for (auto* op : var->PendingOps()) {
      auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
      if (compute_op != nullptr && compute_op->GetPlace() == var_in->place()) {
        return compute_op;
      }
      for (auto* out_var : op->Outputs()) {
        queue.push(out_var);
      }
    }
  } while (!queue.empty());
  return nullptr;
}

std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  auto& graph_pool = Get<GraphNodePool>(kGraphNodePool);
  auto& gcs = Get<GarbageCollectorMap>(kGarbageCollector);

  std::unordered_map<std::string, std::unordered_set<OpDesc*>> unlived_vars;
  unlived_vars.reserve(graph_pool.size());
  for (auto& pair : graph_pool) {
    unlived_vars.insert(std::make_pair(pair.first, pair.second));
  }

  auto compare_and_insert_early_delete_op = [&](
      OpHandleBase* op, const std::vector<VarHandleBase*>& vars) {
    if (unlived_vars.empty()) return;
    // unlived vars can be deleted after the last used op has finished.
    auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
    const auto& places = Get<std::vector<platform::Place>>(kAllPlaces);
    for (auto& var : vars) {
      auto* var_handle = dynamic_cast<VarHandle*>(var);
      auto var_name = var->Node()->Name();
      auto& var_place = var_handle->place();
      if (unlived_vars.count(var_name) == 0) continue;
      if (!unlived_vars[var_name].empty()) {
        if (compute_op != nullptr &&
            unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) {
          unlived_vars[var_name].erase(compute_op->Node()->Op());
        }
        continue;
      }

      if (var_handle == nullptr || !var_handle->Node()->IsVar() ||
          var_handle->Node()->IsCtrlVar())
        continue;

      // shameless copyed from reference count pass.
      if (compute_op == nullptr) {
        // use next computation op scope
        compute_op = FindNextComputationOpHandle(var_handle);
      }
      auto* early_delete_node =
          graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation);
      GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get();
      auto* early_delete_handle = new EarlyDeleteOpHandle(
          early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc);
      if (compute_op->Outputs().empty()) {
        auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
        compute_op->AddOutput(dep_var);
        graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
      }
      early_delete_handle->AddInput(compute_op->Outputs().front());
      VLOG(5) << "Add early delete op " << var_name << " to Operator"
              << compute_op->Name();
    }
  };

  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
  for (auto& op : all_ops) {
    compare_and_insert_early_delete_op(op, op->Inputs());
    compare_and_insert_early_delete_op(op, op->Outputs());
  }
  return graph;
}

}  // namespace details
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(memory_early_delete_pass,
              paddle::framework::details::MemoryEarlyDeletePass)
    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
    .RequireGraphAttr(paddle::framework::details::kGarbageCollector);
paddle/fluid/framework/details/memory_early_delete_pass.h (deleted file, 100644 → 0)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include "paddle/fluid/framework/details/early_delete_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace details {

class MemoryEarlyDeletePass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle
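The deleted pass above wired an "early delete" op after the last operator that consumes each otherwise-unlived variable, so its memory could be garbage-collected before the whole iteration finishes. Below is a minimal standalone sketch of that "free right after last use" scheduling idea; the Op struct and the run loop are hypothetical simplifications, not Paddle's OpHandle/VarHandle machinery.

// Toy sketch: free each buffer right after its last consumer runs.
// Types and names here are illustrative only, not Paddle's handles.
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct Op {
  std::string name;
  std::vector<std::string> inputs;
};

int main() {
  std::vector<Op> ops = {{"op0", {"a"}}, {"op1", {"a", "b"}}, {"op2", {"b"}}};

  // Last index at which each variable is read.
  std::unordered_map<std::string, size_t> last_use;
  for (size_t i = 0; i < ops.size(); ++i)
    for (auto& in : ops[i].inputs) last_use[in] = i;

  // Run ops in order; "delete" a variable as soon as its last user finishes.
  for (size_t i = 0; i < ops.size(); ++i) {
    std::cout << "run " << ops[i].name << "\n";
    for (auto& kv : last_use)
      if (kv.second == i) std::cout << "  early delete " << kv.first << "\n";
  }
}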
paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -13,17 +13,108 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include <deque>
 #include <functional>
 #include <iostream>
 #include <numeric>
 #include <sstream>
 #include <string>
+#include "paddle/fluid/framework/var_desc.h"

 namespace paddle {
 namespace framework {
 namespace details {
+using paddle::framework::VarDesc;

-size_t NodeSizeInBytes(const VarDesc& node) {
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
+  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
+                 "Graph has no attribute of kAllOpDescs.");
+  // 1. get op desc order
+  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
+
+  // 2. topology sort order
+  auto nodes = graph.Nodes();
+  std::deque<ir::Node*> ops;
+  FilterVariables(nodes, [&](ir::Node* op) {
+    if (op->IsOp() && op->Op() != nullptr) {
+      ops.emplace_back(op);
+    }
+  });
+  std::unordered_map<ir::Node*, size_t> op_deps;
+  std::list<ir::Node*> ready_ops;
+  std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
+
+  for (auto* op : ops) {
+    std::unordered_set<ir::Node*> preceding_op;
+    for (auto* in : op->inputs) {
+      if (in->inputs.empty()) continue;
+      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
+      preceding_op.emplace(in->inputs[0]);
+      pending_ops[in->inputs[0]].emplace(op);
+    }
+    op_deps[op] = preceding_op.size();
+    if (preceding_op.empty()) {
+      ready_ops.emplace_back(op);
+    }
+  }
+
+  // 3. generated op list based desc order and the topology order
+  std::vector<ir::Node*> ret;
+  std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
+
+  auto update_by_found_node = [&](ir::Node* found_node) {
+    for (auto* pending_op : pending_ops[found_node]) {
+      if (--op_deps[pending_op] == 0) {
+        ready_ops.emplace_back(pending_op);
+      }
+    }
+    ready_ops.remove(found_node);
+    ret.emplace_back(found_node);
+  };
+
+  while (!ready_ops.empty()) {
+    bool all_of_ready_op_unmatched = true;
+    for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
+      auto op_desc = *it;
+      ir::Node* found_node = nullptr;
+      for (auto* op : ready_ops) {
+        if (IsSameDesc(op->Op(), op_desc)) {
+          found_node = op;
+          break;
+        }
+      }
+      // 3.1 op desc deleted by other pass
+      if (found_node == nullptr) {
+        ++it;
+        continue;
+      } else {
+        all_of_ready_op_unmatched = false;
+        it = op_descs_list.erase(it);
+      }
+      update_by_found_node(found_node);
+    }
+    // 3.2 op descs are added by other pass
+    // preceding op non empty means some new op descs are
+    // created, but not contained in return node list.
+    // these new op desc may depend on each other.
+    std::list<ir::Node*> prev_ready_ops(ready_ops);
+    if (all_of_ready_op_unmatched) {
+      for (auto op : prev_ready_ops) {
+        update_by_found_node(op);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE(std::all_of(
+      op_deps.begin(), op_deps.end(),
+      [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
+
+  return ret;
+}
+
+size_t NodeSize(const VarDesc& node) {
   auto shape = node.GetShape();
   int size =
       std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
@@ -31,9 +122,9 @@ size_t NodeSizeInBytes(const VarDesc& node) {
   return type_size * std::abs(size);
 }

-size_t NodeSizeInBytes(ir::Node* n) {
+size_t NodeSize(ir::Node* n) {
   auto* desc = FindVarDescInBlock(n);
-  return NodeSizeInBytes(*desc);
+  return NodeSize(*desc);
 }

 std::string DebugStringImpl(VarDesc* var) {
@@ -59,7 +150,6 @@ std::string DebugStringImpl(VarDesc* var) {
 std::string DebugString(ir::Node* var) {
   return DebugStringImpl(FindVarDescInBlock(var));
 }
-// return DebugString(var->Var()); }

 // NOTE(dzh): based ir node, if a large node has been reused
 // by a small size node, then next time it appear in pool, it will
@@ -80,18 +170,17 @@ struct NodeComparator {
     auto rhs_shape = rhs_desc->GetShape();
     if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
         (lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
-      return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs);
+      return NodeSize(lhs) <= NodeSize(rhs);
     } else {
       return false;
     }
   }
 };

-void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
+void OrderedSet::Insert(ir::Node* var) {
   PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
-  PADDLE_ENFORCE(op->IsOp());
   if (mark_table_.count(var->Name()) != 0) {
-    mark_table_[var->Name()]->second.insert(op);
+    mark_table_[var->Name()]->emplace_back(var);
     return;
   }
@@ -99,14 +188,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
   auto var_shape = var_desc->GetShape();
   int batch_size = static_cast<int>(var_shape[0]);

-  NodeComparator compare_node;
+  NodeComparator functor;
   Iter it = nodes_.begin();
   while (it != nodes_.end()) {
-    auto* cache_desc = FindVarDescInBlock(it->first);
+    auto& prev = it->front();
+    auto* cache_desc = FindVarDescInBlock(prev);
     int cache_batch_size = cache_desc->GetShape()[0];
     if ((cache_batch_size == -1 && batch_size == -1) ||
         (cache_batch_size != -1 && batch_size != -1)) {
-      if (compare_node(it->first, var)) {
+      if (functor(prev, var)) {
         ++it;
       } else {
         break;
@@ -118,62 +208,80 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
     }
   }

-  it = nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op}));
+  it = nodes_.insert(it, {var});
   mark_table_[var->Name()] = it;
 }

-int OrderedNodeList::GetIndex(ir::Node* var) {
+int OrderedSet::GetNodeIndexInPool(ir::Node* var) {
   return std::distance(nodes_.begin(), mark_table_[var->Name()]);
 }

-ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const {
+ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
   ir::Node* found_node = nullptr;
-  NodeComparator compare_node;
+  NodeComparator functor;
   for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
-    if (compare_node(var, it->first)) {
-      found_node = it->first;
+    auto& candidate = it->front();
+    if (functor(var, candidate)) {
+      found_node = candidate;
       break;
     }
   }
   return found_node;
 }

-void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); }
+bool OrderedSet::Has(ir::Node* var) const {
+  if (mark_table_.count(var->Name())) {
+    auto& node_in_samename = mark_table_.at(var->Name());
+    auto iter =
+        std::find_if(node_in_samename->begin(), node_in_samename->end(),
+                     [&](ir::Node* n) { return n->Name() == var->Name(); });
+    return iter != node_in_samename->end();
+  }
+  return false;
+}

-void OrderedNodeList::Erase(const std::string& var) {
-  PADDLE_ENFORCE(mark_table_.count(var));
-  nodes_.erase(mark_table_[var]);
-  mark_table_.erase(var);
+void OrderedSet::Erase(ir::Node* var) {
+  PADDLE_ENFORCE(mark_table_.count(var->Name()));
+  nodes_.erase(mark_table_[var->Name()]);
+  mark_table_.erase(var->Name());
 }

-std::string OrderedNodeList::ToString() const {
+std::string OrderedSet::ToString() const {
   std::stringstream ss;
   for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
-    ss << DebugString(it->first) << " ";
+    for (auto& node : *it) {
+      ss << DebugString(node) << " ";
+    }
   }
   return ss.str();
 }

 bool NodeCanReused(ir::Node* node) {
+  // valid the node is a var node
   if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
-  // auto* desc = node->Var();
-  bool flag = NodeCanReused(*node->Var());
+
+  bool flag = true;
+  // op output force generated in cpu, can not be reused.
   for (auto* op : node->inputs) {
     if (op->Op()->HasAttr("force_cpu")) {
-      // op output force generated in cpu, can not be reused.
       flag &= framework::AttrReader(op->Op()->GetAttrMap())
                   .Get<bool>("force_cpu") == 0;
     }
   }
+  // var desc validation.
+  flag &= NodeCanReused(*node->Var());
   return flag;
 }

 bool NodeCanReused(const VarDesc& node) {
   auto type = node.GetType();
-  if (node.Persistable() || type != proto::VarType::LOD_TENSOR ||
-      node.GetShape().empty()) {
+  if (!(type == proto::VarType::LOD_TENSOR ||
+        type == proto::VarType::SELECTED_ROWS ||
+        type == proto::VarType::LOD_TENSOR_ARRAY)) {
+    return false;
+  }
+  if (node.Persistable() || node.GetShape().empty()) {
     return false;
   }
   // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
@@ -193,6 +301,174 @@ bool OpHasSubBlock(OpDesc* desc) {
   return false;
 }

+ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
+  ops_ = SortOpLikeDescOrder(graph);
+  ConnectNodes();
+}
+
+void ControlFlowGraph::BuildCFGGraph() {
+  // FIXME(dzh): same effect with ConnectNodes, but use the control
+  // link to build dependency graph, it goes wrong in transformer.
+  for (ir::Node* op : ops_) {
+    for (auto& input_var : op->inputs) {
+      if (!input_var->inputs.empty()) {
+        PADDLE_ENFORCE(
+            input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
+            "Preceding Op Node of Var Node must be unique");
+        auto* pred_op = input_var->inputs[0];
+        if (pred_op->Op() != nullptr) {
+          predecessors_[op].insert(pred_op);
+          successors_[pred_op].insert(op);
+        }
+      }
+      if (input_var->IsVar() && !input_var->IsCtrlVar()) {
+        uses_[op].insert(input_var->Name());
+      }
+    }
+    for (auto& output_var : op->outputs) {
+      // output var may be used by many op
+      for (auto* succ_op : output_var->outputs) {
+        if (succ_op->Op() != nullptr) {
+          successors_[op].insert(succ_op);
+          predecessors_[succ_op].insert(op);
+        }
+      }
+      if (output_var->IsVar() && !output_var->IsCtrlVar()) {
+        defs_[op].insert(output_var->Name());
+      }
+    }
+  }
+}
+
+void ControlFlowGraph::ConnectNodes() {
+  for (size_t i = 0; i < ops_.size(); ++i) {
+    auto& op = ops_[i];
+    try {
+      auto& next_op = ops_.at(i + 1);
+      successors_[op].insert(next_op);
+      predecessors_[next_op].insert(op);
+    } catch (...) {
+      // do nothing
+    }
+
+    FilterVariables(op->inputs,
+                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
+
+    FilterVariables(op->outputs,
+                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
+  }
+}
+
+void ControlFlowGraph::LiveVariableAnalysis() {
+  // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
+  // compute the liveness of for each variable though reversed_ops algorithm.
+  // It iterates the operators from end to begin, compute the live in/live out
+  // variable set for each op, then the diff between in/out will be used for
+  // the variable reuse. For detail refer to
+  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
+  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
+  while (!work_list.empty()) {
+    ir::Node* op = work_list.front();
+    work_list.pop_front();
+    // get the live_in calculated before. Empty if first.
+    auto prev_live_in = std::move(live_in_[op]);
+    for (auto& s : successors_[op]) {
+      for (auto& var : live_in_[s]) {
+        live_out_[op].insert(var);
+      }
+    }
+    for (auto& var : uses_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : live_out_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : defs_[op]) {
+      live_in_[op].erase(var);
+    }
+
+    // If the live_in is not changed, then the liveness analysis of
+    // predecessors is completed.
+    //
+    // Otherwise, recalculate the predecessors liveness
+    if (live_in_[op] != prev_live_in) {
+      for (auto& pre : predecessors_[op]) {
+        work_list.push_back(pre);
+      }
+    }
+  }
+}
+
+void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
+                                           const std::string& new_node,
+                                           int begin_idx) {
+  // update graph from begin idx to the end
+  for (size_t i = begin_idx; i != ops_.size(); ++i) {
+    auto* op = ops_[i];
+    if (uses_[op].find(old_node) != uses_[op].end()) {
+      uses_[op].erase(old_node);
+      uses_[op].insert(new_node);
+    }
+    if (defs_[op].find(old_node) != defs_[op].end()) {
+      defs_[op].erase(old_node);
+      defs_[op].insert(new_node);
+    }
+    if (live_in_[op].find(old_node) != live_in_[op].end()) {
+      live_in_[op].erase(old_node);
+      live_in_[op].insert(new_node);
+    }
+    if (live_out_[op].find(old_node) != live_out_[op].end()) {
+      live_out_[op].erase(old_node);
+      live_out_[op].insert(new_node);
+    }
+  }
+}
+
+const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+  auto it = live_in_.find(op);
+  PADDLE_ENFORCE(
+      it != live_in_.end(),
+      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+  auto it = live_out_.find(op);
+  PADDLE_ENFORCE(
+      it != live_out_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+  auto it = uses_.find(op);
+  PADDLE_ENFORCE(
+      it != uses_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
+
+std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
+
+ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
+                                          ir::Node* op) const {
+  // in ssa-graph, different version nodes have same name,
+  // this function get the latest version var before target op
+  // It may return nullptr, such as data node.
+  ir::Node* found_node = nullptr;
+  for (auto* node : ops_) {
+    if (node == op) break;
+    for (auto& output : node->outputs) {
+      if (output->Name() == name) {
+        found_node = output;
+      }
+    }
+  }
+  return found_node;
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
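ControlFlowGraph::LiveVariableAnalysis above is the textbook backward liveness pass referenced in the code comment: live_out(op) is the union of live_in over op's successors, and live_in(op) is use(op) plus everything in live_out(op) that op does not define, iterated to a fixed point. The following is a compact standalone sketch on a straight-line op list (simplified structs, not Paddle's ir::Node; one backward sweep is enough because there are no branches). It reproduces the sets asserted in the CFGGraph test further below.

// Minimal backward liveness sketch on a straight-line program.
// Simplified stand-in for ControlFlowGraph::LiveVariableAnalysis.
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Op {
  std::set<std::string> use, def;
};

int main() {
  // 1. b = a; 2. c = a; 3. d = b + c; 4. e = d  (same example as the test)
  std::vector<Op> ops = {{{"a"}, {"b"}},
                         {{"a"}, {"c"}},
                         {{"b", "c"}, {"d"}},
                         {{"d"}, {"e"}}};
  std::vector<std::set<std::string>> live_in(ops.size()), live_out(ops.size());

  // Walk the ops from last to first; with straight-line code one sweep
  // already reaches the fixed point.
  for (int i = static_cast<int>(ops.size()) - 1; i >= 0; --i) {
    if (i + 1 < static_cast<int>(ops.size())) live_out[i] = live_in[i + 1];
    live_in[i] = ops[i].use;                           // use(op)
    for (auto& v : live_out[i]) live_in[i].insert(v);  // plus live_out(op)
    for (auto& v : ops[i].def) live_in[i].erase(v);    // minus def(op)
  }

  for (size_t i = 0; i < ops.size(); ++i) {
    std::cout << "op" << i << " live_in:";
    for (auto& v : live_in[i]) std::cout << ' ' << v;
    std::cout << '\n';
  }  // op0 {a}, op1 {a,b}, op2 {b,c}, op3 {d}, matching the CFGGraph asserts
}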
paddle/fluid/framework/details/memory_optimize_helper.h
@@ -17,6 +17,8 @@
 #include <iostream>
 #include <iterator>
 #include <list>
+#include <map>
+#include <set>
 #include <string>
 #include <utility>
 #include <vector>
@@ -27,41 +29,41 @@ namespace paddle {
 namespace framework {
 namespace details {

-constexpr char kFetchedVars[] = "fetched_vars";
-constexpr char kAllOpDescs[] = "all_op_descs";
-constexpr char kGraphNodePool[] = "graph_node_pool";
-
-// NOTE(dzh): Variable and the operators use the var.
-// for early delete pass.
-// Because analysis var pass build base on ir::Node, which maybe released
-// or modified between passes, so we use OpDesc* to mark ops.
-using GraphNodePool = std::vector<
-    std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);

-// NOTE(dzh): by default, it sort node in ascend order(by node bytes size).
-// in fluid, -1 means the batch_size is determined in runtime.
-// the node batch_size equal -1 always ranking in the front than the node not.
+// NOTE(dzh): A ordered set for node reuse in memory optimize.
+// the orderedset sort node in ascend order(by node bytes size).
+// in fluid, -1 means the batch_size, which is determined in runtime.
+// So the reuse happens between nodes who's batch_size both are -1
+// simultaneously or not.
+//
+// sort rule:
+// rule 0 : smaller node ranking in front.
+// rule 1 : batch_size equal -1 ranking in the front than the node not.
+//
 // For example,
 // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
-// O(1) insert, delete
-class OrderedNodeList {
- public:
-  using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
-  using Iter = typename std::list<NodePair>::iterator;
-  using ConstIter = typename std::list<NodePair>::const_iterator;
-
-  void Insert(ir::Node* var, ir::Node* op);
+class OrderedSet {
+ public:
+  // nodes with same name exists in pool.
+  using NodeVector = std::vector<ir::Node*>;
+  using Iter = typename std::list<NodeVector>::iterator;
+  using ConstIter = typename std::list<NodeVector>::const_iterator;
+
+  void Insert(ir::Node* var);
   void Erase(ir::Node* var);
-  void Erase(const std::string& var);
-  bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
-  bool Has(const std::string& var) { return mark_table_.count(var); }
-  ir::Node* NodeMatch(ir::Node* var) const;
+  bool Has(ir::Node* var) const;
+  void Clear() {
+    mark_table_.clear();
+    nodes_.clear();
+  }
+  // find the bestfit shape node block with var.
+  ir::Node* FindBestFitNode(ir::Node* var) const;
   // map store non-const iterator, can not promise const
-  int GetIndex(ir::Node* var);
+  int GetNodeIndexInPool(ir::Node* var);
   // pool all node to string
   std::string ToString() const;
@@ -69,18 +71,54 @@ class OrderedNodeList {
   Iter end() { return nodes_.end(); }
   ConstIter begin() const { return nodes_.begin(); }
   ConstIter end() const { return nodes_.end(); }
   size_t size() const { return nodes_.size(); }
-  void Clear() {
-    mark_table_.clear();
-    nodes_.clear();
-  }

  private:
   // for searching.
   std::unordered_map<std::string, Iter> mark_table_;
-  // node swap pairs. var -> ops dep var
-  std::list<NodePair> nodes_;
+  // node pool
+  std::list<NodeVector> nodes_;
 };

+class ControlFlowGraph {
+ public:
+  ControlFlowGraph() = default;
+  // IR Graph
+  explicit ControlFlowGraph(const ir::Graph& graph);
+
+  void LiveVariableAnalysis();
+
+  void RenameVarInCFGGraph(const std::string& old_node,
+                           const std::string& new_node, int begin_idx);
+
+  const std::set<std::string> LiveIn(ir::Node* op) const;
+  const std::set<std::string> LiveOut(ir::Node* op) const;
+  const std::set<std::string> Use(ir::Node* op) const;
+  const std::vector<ir::Node*> Ops() const;
+  std::vector<ir::Node*>& Ops();
+
+  // for ssa-graph nodes
+  ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const;
+
+ private:
+  void BuildCFGGraph();
+  void ConnectNodes();
+
+  using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
+  using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
+  // successors ops use the output variables.
+  NodeListMap successors_;
+  // predecessors ops generated input variables.
+  NodeListMap predecessors_;
+  // variables lived before run current op.
+  VarSetMap live_in_;
+  // variables lived after run current op.
+  VarSetMap live_out_;
+  VarSetMap uses_;  // op inputs
+  VarSetMap defs_;  // op outputs
+  std::vector<ir::Node*> ops_;  // op sequence by topology sort
+};
+
 // valid a tensor can be reuse or not
@@ -93,15 +131,24 @@ bool NodeCanReused(const VarDesc& node);
 bool OpHasSubBlock(OpDesc* desc);

 // node memory size in bytes
-size_t NodeSizeInBytes(ir::Node* n);
+size_t NodeSize(ir::Node* n);
 // node memory size in bytes
-size_t NodeSizeInBytes(const VarDesc&);
+size_t NodeSize(const VarDesc&);

 std::string DebugString(ir::Node* var);

+// NOTE(dzhwinter)
+// after node reuse, the replaced node shape is
+// different with its VarDesc. So need to find the
+// correct VarDesc in Block.
 VarDesc* FindVarDescInBlock(ir::Node* n);

+static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
+  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
+         op1->Outputs() == op2->Outputs();
+}
+
 template <typename Container, typename Callback>
 class FilterVariableImpl {
  public:
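OrderedSet keeps reuse candidates sorted by byte size, and a node whose first dimension is -1 (batch size decided at runtime) is only matched against other -1 candidates. The sketch below illustrates that matching rule with plain structs; it is a simplified model written for this document, not Paddle's OrderedSet or FindBestFitNode.

// Toy sketch of the OrderedSet matching rule: candidates are kept in
// ascending size order, and a -1 ("runtime batch") first dimension only
// matches other -1 candidates. Plain structs, not Paddle's ir::Node.
#include <cstdlib>
#include <iostream>
#include <list>
#include <vector>

struct Var {
  std::vector<int> shape;
  long Size() const {
    long s = 1;
    for (int d : shape) s *= d;
    return std::labs(s);  // |-1 * rest| as a rough byte-count stand-in
  }
  bool DynamicBatch() const { return !shape.empty() && shape[0] == -1; }
};

// rule 0: smaller candidates sit in front of the pool;
// rule 1: only candidates with the same batch-dimension kind are compared.
const Var* FindBestFit(const std::list<Var>& pool, const Var& want) {
  for (const auto& cand : pool) {                  // pool is size-ordered
    if (cand.DynamicBatch() != want.DynamicBatch()) continue;
    if (want.Size() <= cand.Size()) return &cand;  // first big-enough match
  }
  return nullptr;
}

int main() {
  std::list<Var> pool = {{{-1, 2, 5}}, {{5, 2}}, {{1, 1024}}};
  Var want{{-1, 2, 5}};
  const Var* hit = FindBestFit(pool, want);
  if (hit) std::cout << "matched candidate of size " << hit->Size() << "\n";
  else std::cout << "no fit\n";
}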
paddle/fluid/framework/details/memory_optimize_helper_test.cc
浏览文件 @
97b143fb
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <algorithm>
#include <iostream>
#include <iostream>
#include <iterator>
#include <memory>
#include <memory>
#include <sstream>
#include <sstream>
#include <string>
#include <string>
...
@@ -22,13 +23,19 @@
...
@@ -22,13 +23,19 @@
#include <vector>
#include <vector>
#include "glog/logging.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
TEST
(
Ordered
NodeLis
t
,
Normal
)
{
TEST
(
Ordered
Se
t
,
Normal
)
{
Ordered
NodeLis
t
pool
;
Ordered
Se
t
pool
;
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
nodes
;
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
nodes
;
// clang-format off
// clang-format off
...
@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) {
...
@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) {
nodes
.
emplace_back
(
std
::
move
(
node
));
nodes
.
emplace_back
(
std
::
move
(
node
));
}
}
// Insert
for
(
auto
&
node
:
nodes
)
{
for
(
auto
&
node
:
nodes
)
{
pool
.
Insert
(
node
.
get
(),
op
.
get
());
pool
.
Insert
(
node
.
get
());
}
// Has/size
ASSERT_EQ
(
pool
.
size
(),
shapes
.
size
());
for
(
auto
&
node
:
nodes
)
{
ASSERT_TRUE
(
pool
.
Has
(
node
.
get
()));
}
}
// assert its order and interface.
// assert its order and interface.
...
@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) {
...
@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) {
std
::
cout
<<
pool
.
ToString
()
<<
std
::
endl
;
std
::
cout
<<
pool
.
ToString
()
<<
std
::
endl
;
ASSERT_EQ
(
pool
.
size
(),
static_cast
<
size_t
>
(
COUNT
-
1
));
ASSERT_EQ
(
pool
.
size
(),
static_cast
<
size_t
>
(
COUNT
-
1
));
ASSERT_EQ
(
pool
.
Get
Index
(
nodes
.
back
().
get
()),
0
);
ASSERT_EQ
(
pool
.
Get
NodeIndexInPool
(
nodes
.
back
().
get
()),
0
);
{
{
auto
v1
=
block_desc
->
Var
(
"11"
);
auto
v1
=
block_desc
->
Var
(
"11"
);
v1
->
SetShape
({
-
1
,
256
,
56
,
56
});
v1
->
SetShape
({
-
1
,
256
,
56
,
56
});
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v1
);
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v1
);
node1
->
inputs
.
emplace_back
(
op
.
get
());
node1
->
inputs
.
emplace_back
(
op
.
get
());
auto
*
cache
=
pool
.
NodeMatch
(
node1
.
get
());
auto
*
cache
=
pool
.
FindBestFitNode
(
node1
.
get
());
ASSERT_EQ
(
cache
,
nullptr
);
ASSERT_EQ
(
cache
,
nullptr
);
}
}
{
{
...
@@ -81,16 +95,401 @@ TEST(OrderedNodeList, Normal) {
...
@@ -81,16 +95,401 @@ TEST(OrderedNodeList, Normal) {
v2
->
SetShape
({
-
1
,
2
,
5
});
v2
->
SetShape
({
-
1
,
2
,
5
});
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v2
);
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v2
);
node1
->
inputs
.
emplace_back
(
op
.
get
());
node1
->
inputs
.
emplace_back
(
op
.
get
());
auto
*
cache
=
pool
.
NodeMatch
(
node1
.
get
());
auto
*
cache
=
pool
.
FindBestFitNode
(
node1
.
get
());
ASSERT_EQ
(
pool
.
Get
Index
(
cache
),
2
);
// match 6:[-1,2,5]
ASSERT_EQ
(
pool
.
Get
NodeIndexInPool
(
cache
),
2
);
// match 6:[-1,2,5]
}
}
{
{
auto
v3
=
block_desc
->
Var
(
"13"
);
auto
v3
=
block_desc
->
Var
(
"13"
);
v3
->
SetShape
({
2
,
5
});
v3
->
SetShape
({
2
,
5
});
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v3
);
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v3
);
node1
->
inputs
.
emplace_back
(
op
.
get
());
node1
->
inputs
.
emplace_back
(
op
.
get
());
auto
*
cache
=
pool
.
NodeMatch
(
node1
.
get
());
auto
*
cache
=
pool
.
FindBestFitNode
(
node1
.
get
());
ASSERT_EQ
(
pool
.
GetIndex
(
cache
),
5
);
// match 4:[5,2]
ASSERT_EQ
(
pool
.
GetNodeIndexInPool
(
cache
),
5
);
// match 4:[5,2]
}
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
REGISTER_OPERATOR
(
sum
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
SumOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
REGISTER_OPERATOR
(
assign
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
AssignOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
REGISTER_OPERATOR
(
dummy
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
SumOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace
paddle
{
namespace
framework
{
namespace
details
{
inline
static
ProgramDesc
FillProgramDesc
()
{
ProgramDesc
prog
;
prog
.
MutableBlock
(
0
)
->
Var
(
"a"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"b"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"c"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"d"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"e"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"a"
});
op
->
SetOutput
(
"Out"
,
{
"b"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"a"
});
op
->
SetOutput
(
"Out"
,
{
"c"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"d"
});
op
->
SetOutput
(
"Out"
,
{
"e"
});
}
return
prog
;
}
TEST
(
CFGGraph
,
IRGraph
)
{
// prepare ir graph
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
ControlFlowGraph
cfg
(
graph
);
cfg
.
LiveVariableAnalysis
();
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
0
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
,
"b"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
0
])));
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
,
"b"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
1
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"b"
,
"c"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
1
])));
// test sum op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"b"
,
"c"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
2
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"d"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
2
])));
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"d"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
3
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
3
])));
}
// 1. normal test
TEST
(
SortOpLikeDescOrder
,
NormalTest
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
SortOpLikeDescOrder
(
graph
);
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
for
(
size_t
i
=
0
;
i
<
nodes
.
size
();
++
i
)
{
auto
node
=
nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 2. remove some op_desc
TEST(SortOpLikeDescOrder, RemoveOpDesc) {
  auto prog = FillProgramDesc();
  ir::Graph graph(prog);
  const std::vector<OpDesc*>* all_op_descs =
      new std::vector<OpDesc*>(prog.Block(0).AllOps());
  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
  auto nodes = graph.Nodes();
  auto op_descs = prog.Block(0).AllOps();
  ir::Node* found_node = nullptr;
  for (auto node : nodes) {
    if (node->IsOp() && node->outputs.back()->Name() == "e") {
      found_node = node;
      break;
    }
  }
  PADDLE_ENFORCE(found_node != nullptr);
  for (auto it = op_descs.begin(); it != op_descs.end();) {
    if (IsSameDesc(*it, found_node->Op())) {
      it = op_descs.erase(it);
    } else {
      ++it;
    }
  }

  auto find_node_in_graph = [&](std::string s) {
    ir::Node* ret = nullptr;
    for (auto n : graph.Nodes()) {
      if (n->Name() == s) {
        ret = n;
        break;
      }
    }
    PADDLE_ENFORCE(ret != nullptr);
    return ret;
  };

  ir::Node* e = find_node_in_graph("e");
  ir::Node* d = find_node_in_graph("d");
  std::remove(d->outputs.begin(), d->outputs.end(), found_node);
  graph.RemoveNode(found_node);
  graph.RemoveNode(e);

  // other node keeps the same order
  auto remain_nodes = SortOpLikeDescOrder(graph);
  for (size_t i = 0; i < remain_nodes.size(); ++i) {
    auto node = remain_nodes[i];
    auto op_desc = op_descs[i];
    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
  }
}
// 3. add some op_desc
TEST(SortOpLikeDescOrder, AddOpDesc) {
  auto prog = FillProgramDesc();
  const std::vector<OpDesc*>* all_op_descs =
      new std::vector<OpDesc*>(prog.Block(0).AllOps());
  ir::Graph graph(prog);

  auto find_node_in_graph = [&](std::string s) {
    ir::Node* ret = nullptr;
    for (auto n : graph.Nodes()) {
      if (n->Name() == s) {
        ret = n;
        break;
      }
    }
    PADDLE_ENFORCE(ret != nullptr);
    return ret;
  };

  // cached desc different with real one
  // mimic the intermidiete pass modify the programdesc.
  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership

  auto op_descs = prog.Block(0).AllOps();

  auto op = prog.MutableBlock(0)->AppendOp();
  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
  op->SetType("sum");
  op->SetInput("X", {"b", "c"});
  op->SetOutput("Out", {"d1"});
  ir::Node* node = graph.CreateOpNode(op);
  ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
  ir::Node* b = find_node_in_graph("b");
  ir::Node* c = find_node_in_graph("c");
  node->outputs.emplace_back(d1);
  node->inputs.emplace_back(b);
  node->inputs.emplace_back(c);
  d1->inputs.emplace_back(node);
  b->outputs.emplace_back(node);
  c->outputs.emplace_back(node);
  op_descs.insert(op_descs.begin() + 4, op);

  auto nodes = SortOpLikeDescOrder(graph);

  for (size_t i = 0; i < nodes.size(); ++i) {
    auto node = nodes[i];
    auto op_desc = op_descs[i];
    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
  }
}
// 4. add and delete some op_desc
TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
  auto prog = FillProgramDesc();
  ir::Graph graph(prog);
  const std::vector<OpDesc*>* all_op_descs =
      new std::vector<OpDesc*>(prog.Block(0).AllOps());
  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership

  auto find_node_in_graph = [&](std::string s) {
    ir::Node* ret = nullptr;
    for (auto n : graph.Nodes()) {
      if (n->Name() == s) {
        ret = n;
        break;
      }
    }
    PADDLE_ENFORCE(ret != nullptr);
    return ret;
  };

  // remove sum node
  auto op_descs = prog.Block(0).AllOps();
  ir::Node* found_node = nullptr;
  auto nodes = graph.Nodes();
  for (auto node : nodes) {
    if (node->Name() == "sum") {
      found_node = node;
      break;
    }
  }
  PADDLE_ENFORCE(found_node != nullptr);
  for (auto it = op_descs.begin(); it != op_descs.end();) {
    if (IsSameDesc(*it, found_node->Op())) {
      it = op_descs.erase(it);
    } else {
      ++it;
    }
  }
  {
    ir::Node* d = find_node_in_graph("d");
    ir::Node* c = find_node_in_graph("c");
    ir::Node* e = find_node_in_graph("e");
    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
    std::remove(c->outputs.begin(), c->outputs.end(), found_node);
    ir::Node* pending_op = found_node->outputs[0]->outputs[0];
    graph.RemoveNode(e);
    graph.RemoveNode(pending_op);
    graph.RemoveNode(found_node);
  }

  // add node
  auto op = prog.MutableBlock(0)->AppendOp();
  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
  op->SetType("sum");
  op->SetInput("X", {"b", "c"});
  op->SetOutput("Out", {"d1"});
  {
    ir::Node* node = graph.CreateOpNode(op);
    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
    ir::Node* b = find_node_in_graph("b");
    ir::Node* c = find_node_in_graph("c");
    node->outputs.emplace_back(d1);
    node->inputs.emplace_back(b);
    node->inputs.emplace_back(c);
    b->outputs.emplace_back(node);
    c->outputs.emplace_back(node);
  }
  op_descs.insert(op_descs.begin() + 2, op);

  // check the order
  auto mynodes = SortOpLikeDescOrder(graph);
  for (size_t i = 0; i < mynodes.size(); ++i) {
    auto node = mynodes[i];
    auto op_desc = op_descs[i];
    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
  }
}
// 5. add and replace some op_desc inplace.
TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
  auto prog = FillProgramDesc();
  ir::Graph graph(prog);
  const std::vector<OpDesc*>* all_op_descs =
      new std::vector<OpDesc*>(prog.Block(0).AllOps());
  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership

  auto find_node_in_graph = [&](std::string s) {
    ir::Node* ret = nullptr;
    for (auto n : graph.Nodes()) {
      if (n->Name() == s) {
        ret = n;
        break;
      }
    }
    PADDLE_ENFORCE(ret != nullptr);
    return ret;
  };

  auto op_descs = prog.Block(0).AllOps();
  // add node
  auto op = prog.MutableBlock(0)->AppendOp();
  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
  op->SetType("sum");
  op->SetInput("X", {"b", "c"});
  op->SetOutput("Out", {"d1"});
  {
    ir::Node* node = graph.CreateOpNode(op);
    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
    ir::Node* b = find_node_in_graph("b");
    ir::Node* c = find_node_in_graph("c");
    node->outputs.emplace_back(d1);
    node->inputs.emplace_back(b);
    node->inputs.emplace_back(c);
    d1->inputs.emplace_back(node);
    b->outputs.emplace_back(node);
    c->outputs.emplace_back(node);
  }
  op_descs.emplace_back(op);

  // replace op_desc inplace
  auto nodes = graph.Nodes();
  ir::Node* found_node = nullptr;
  for (auto node : nodes) {
    if (node->IsOp() && node->Op() && node->Name() == "assign") {
      if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
        found_node = node;
        break;
      }
    }
  }
  {
    ir::Node* d = find_node_in_graph("d");
    ir::Node* e = find_node_in_graph("e");
    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
    std::remove(e->inputs.begin(), e->inputs.end(), found_node);
    graph.RemoveNode(found_node);
  }
  op_descs.erase(op_descs.begin() + 3);

  auto replace_op = prog.MutableBlock(0)->AppendOp();
  replace_op->SetType("sum");
  replace_op->SetInput("X", {"d", "d1"});
  replace_op->SetOutput("Out", {"e"});
  {
    ir::Node* sum2 = graph.CreateOpNode(replace_op);
    ir::Node* e = find_node_in_graph("e");
    ir::Node* d = find_node_in_graph("d");
    ir::Node* d1 = find_node_in_graph("d1");
    sum2->inputs.emplace_back(d);
    sum2->inputs.emplace_back(d1);
    sum2->outputs.emplace_back(e);
    e->inputs.emplace_back(sum2);
    d->outputs.emplace_back(sum2);
    d1->outputs.emplace_back(sum2);
  }
  op_descs.emplace_back(replace_op);

  // compare op order
  auto graph_nodes = SortOpLikeDescOrder(graph);
  for (size_t i = 0; i < graph_nodes.size(); ++i) {
    auto node = graph_nodes[i];
    auto op_desc = op_descs[i];
    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
  }
}

}
}
...
paddle/fluid/framework/details/memory_optimize_pass.cc
...
@@ -43,11 +43,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
-  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
-         op1->Outputs() == op2->Outputs();
-}
-
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto nodes = graph->Nodes();
...
@@ -77,7 +72,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
       if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
           skip_set_.count(var->Name()))
         continue;
-      ir::Node* cache = pool_.NodeMatch(var);
+      ir::Node* cache = pool_.FindBestFitNode(var);
 
       if (var->Name() == FLAGS_memory_optimize_debug) {
         VLOG(3) << "start match var " << DebugString(var) << " of op "
...
@@ -95,11 +90,12 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
                 << "replace it again. Skip this candidate.";
         continue;
 
-      int node_idx_in_pool = pool_.GetIndex(cache);
+      int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
       VLOG(3) << string::Sprintf(
           "!!! %s, %s => %s, cache idx %d, pool size %d",
           std::to_string(reuse_id++), DebugString(var), DebugString(cache),
           node_idx_in_pool, static_cast<int>(pool_.size()));
 
       // update CFG Graph on the fly.
       // reused var maybe re-fill into the pool
       cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
...
@@ -112,6 +108,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
         pool_.Erase(cache);
       }
 
       // fill the pool
       std::unordered_set<std::string> unlived_vars;
       for (auto var : cfg_->LiveIn(op)) {
...
@@ -120,36 +117,15 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
         }
       }
       for (auto var : unlived_vars) {
-        ir::Node* var_node = cfg_->GetNodeFromVarName(var, op);
+        ir::Node* var_node = cfg_->GetNodeByName(var, op);
         if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
-          pool_.Insert(var_node, op);
+          pool_.Insert(var_node);
         }
       }
     }
   }
   graph->ResolveHazard(var_nodes_);
 
-  // For early delete pass. use GraphNodePool load the unlived vars.
-  // 1. find all deps op for each unlived var in memory pool.
-  for (auto& op : graph->Nodes()) {
-    for (auto& var : op->inputs) {
-      if (pool_.Has(var)) {
-        pool_.Insert(var, op);
-      }
-    }
-  }
-  // 2. convert ir node based memory pool to graph node
-  // because Node* maybe released bettwen passes.
-  auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool);
-  for (auto it = pool_.begin(); it != pool_.end(); ++it) {
-    std::unordered_set<OpDesc*> descs;
-    for (auto& op : it->second) {
-      PADDLE_ENFORCE(op->IsOp());
-      descs.insert(op->Op());
-    }
-    graph_pool.push_back(std::make_pair(it->first->Name(), descs));
-  }
-
   return graph;
 }
...
@@ -198,12 +174,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
       PADDLE_ENFORCE(sub_op != nullptr);
       for (auto* var : sub_op->outputs) {
         if (NodeCanReused(var)) {
-          ir::Node* cache = pool_.NodeMatch(var);
+          ir::Node* cache = pool_.FindBestFitNode(var);
           if (cache != nullptr) {
             if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
               continue;
             }
-            int node_idx_in_pool = pool_.GetIndex(cache);
+            int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
             VLOG(3) << string::Sprintf(
                 "!!! %s, %s => %s, cache idx %d, pool size %d",
                 std::to_string(sub_reuse_id++), DebugString(var),
...
@@ -342,267 +318,10 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
     var_nodes_.at(var).clear();
   }
 }
 
-std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
-  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
-                 "Graph has no attribute of kAllOpDescs.");
-  // 1. get op desc order
-  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
-
-  // 2. topology sort order
-  auto nodes = graph.Nodes();
-  std::deque<ir::Node*> ops;
-  FilterVariables(nodes, [&](ir::Node* op) {
-    if (op->IsOp() && op->Op() != nullptr) {
-      ops.emplace_back(op);
-    }
-  });
-  std::unordered_map<ir::Node*, size_t> op_deps;
-  std::list<ir::Node*> ready_ops;
-  std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
-
-  for (auto* op : ops) {
-    std::unordered_set<ir::Node*> preceding_op;
-    for (auto* in : op->inputs) {
-      if (in->inputs.empty()) continue;
-      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
-      preceding_op.emplace(in->inputs[0]);
-      pending_ops[in->inputs[0]].emplace(op);
-    }
-    op_deps[op] = preceding_op.size();
-    if (preceding_op.empty()) {
-      ready_ops.emplace_back(op);
-    }
-  }
-
-  // 3. generated op list based desc order and the topology order
-  std::vector<ir::Node*> ret;
-  std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
-
-  auto update_by_found_node = [&](ir::Node* found_node) {
-    for (auto* pending_op : pending_ops[found_node]) {
-      if (--op_deps[pending_op] == 0) {
-        ready_ops.emplace_back(pending_op);
-      }
-    }
-    ready_ops.remove(found_node);
-    ret.emplace_back(found_node);
-  };
-
-  while (!ready_ops.empty()) {
-    bool all_of_ready_op_unmatched = true;
-    for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
-      auto op_desc = *it;
-      ir::Node* found_node = nullptr;
-      for (auto* op : ready_ops) {
-        if (IsSameDesc(op->Op(), op_desc)) {
-          found_node = op;
-          break;
-        }
-      }
-
-      // 3.1 op desc deleted by other pass
-      if (found_node == nullptr) {
-        ++it;
-        continue;
-      } else {
-        all_of_ready_op_unmatched = false;
-        it = op_descs_list.erase(it);
-      }
-      update_by_found_node(found_node);
-    }
-
-    // 3.2 op descs are added by other pass
-    // preceding op non empty means some new op descs are
-    // created, but not contained in return node list.
-    // these new op desc may depend on each other.
-    std::list<ir::Node*> prev_ready_ops(ready_ops);
-    if (all_of_ready_op_unmatched) {
-      for (auto op : prev_ready_ops) {
-        update_by_found_node(op);
-      }
-    }
-  }
-
-  PADDLE_ENFORCE(std::all_of(
-      op_deps.begin(), op_deps.end(),
-      [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
-
-  return ret;
-}
-ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
-  ops_ = SortOpLikeDescOrder(graph);
-  ConnectNodes();
-}
-
-void ControlFlowGraph::BuildCFGGraph() {
-  // FIXME(dzh): same effect with ConnectNodes, but use the control
-  // link to build dependency graph, it goes wrong in transformer.
-  for (ir::Node* op : ops_) {
-    for (auto& input_var : op->inputs) {
-      if (!input_var->inputs.empty()) {
-        PADDLE_ENFORCE(
-            input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
-            "Preceding Op Node of Var Node must be unique");
-        auto* pred_op = input_var->inputs[0];
-        if (pred_op->Op() != nullptr) {
-          predecessors_[op].insert(pred_op);
-          successors_[pred_op].insert(op);
-        }
-      }
-      if (input_var->IsVar() && !input_var->IsCtrlVar()) {
-        uses_[op].insert(input_var->Name());
-      }
-    }
-    for (auto& output_var : op->outputs) {
-      // output var may be used by many op
-      for (auto* succ_op : output_var->outputs) {
-        if (succ_op->Op() != nullptr) {
-          successors_[op].insert(succ_op);
-          predecessors_[succ_op].insert(op);
-        }
-      }
-      if (output_var->IsVar() && !output_var->IsCtrlVar()) {
-        defs_[op].insert(output_var->Name());
-      }
-    }
-  }
-}
-
-void ControlFlowGraph::ConnectNodes() {
-  for (size_t i = 0; i < ops_.size(); ++i) {
-    auto& op = ops_[i];
-    try {
-      auto& next_op = ops_.at(i + 1);
-      successors_[op].insert(next_op);
-      predecessors_[next_op].insert(op);
-    } catch (...) {
-      // do nothing
-    }
-
-    FilterVariables(op->inputs,
-                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
-
-    FilterVariables(op->outputs,
-                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
-  }
-}
-
-void ControlFlowGraph::LiveVariableAnalysis() {
-  // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
-  // compute the liveness of for each variable though reversed_ops algorithm.
-  // It iterates the operators from end to begin, compute the live in/live out
-  // variable set for each op, then the diff between in/out will be used for
-  // the variable reuse. For detail refer to
-  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
-  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
-  while (!work_list.empty()) {
-    ir::Node* op = work_list.front();
-    work_list.pop_front();
-    // get the live_in calculated before. Empty if first.
-    auto prev_live_in = std::move(live_in_[op]);
-    for (auto& s : successors_[op]) {
-      for (auto& var : live_in_[s]) {
-        live_out_[op].insert(var);
-      }
-    }
-    for (auto& var : uses_[op]) {
-      live_in_[op].insert(var);
-    }
-    for (auto& var : live_out_[op]) {
-      live_in_[op].insert(var);
-    }
-    for (auto& var : defs_[op]) {
-      live_in_[op].erase(var);
-    }
-
-    // If the live_in is not changed, then the liveness analysis of
-    // predecessors is completed.
-    //
-    // Otherwise, recalculate the predecessors liveness
-    if (live_in_[op] != prev_live_in) {
-      for (auto& pre : predecessors_[op]) {
-        work_list.push_back(pre);
-      }
-    }
-  }
-}
-
-void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
-                                           const std::string& new_node,
-                                           int begin_idx) {
-  // update graph from begin idx to the end
-  for (size_t i = begin_idx; i != ops_.size(); ++i) {
-    auto* op = ops_[i];
-    if (uses_[op].find(old_node) != uses_[op].end()) {
-      uses_[op].erase(old_node);
-      uses_[op].insert(new_node);
-    }
-    if (defs_[op].find(old_node) != defs_[op].end()) {
-      defs_[op].erase(old_node);
-      defs_[op].insert(new_node);
-    }
-    if (live_in_[op].find(old_node) != live_in_[op].end()) {
-      live_in_[op].erase(old_node);
-      live_in_[op].insert(new_node);
-    }
-    if (live_out_[op].find(old_node) != live_out_[op].end()) {
-      live_out_[op].erase(old_node);
-      live_out_[op].insert(new_node);
-    }
-  }
-}
-
-const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
-  auto it = live_in_.find(op);
-  PADDLE_ENFORCE(
-      it != live_in_.end(),
-      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
-  return it->second;
-}
-
-const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
-  auto it = live_out_.find(op);
-  PADDLE_ENFORCE(
-      it != live_out_.end(),
-      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
-  return it->second;
-}
-
-const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
-  auto it = uses_.find(op);
-  PADDLE_ENFORCE(
-      it != uses_.end(),
-      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
-  return it->second;
-}
-
-const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
-
-std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
-
-ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
-                                               ir::Node* op) const {
-  // in ssa-graph, different version nodes have same name,
-  // this function get the latest version var before target op
-  // It may return nullptr, such as data node.
-  ir::Node* found_node = nullptr;
-  for (auto* node : ops_) {
-    if (node == op) break;
-    for (auto& output : node->outputs) {
-      if (output->Name() == name) {
-        found_node = output;
-      }
-    }
-  }
-  return found_node;
-}
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
 
 REGISTER_PASS(memory_optimize_pass,
               paddle::framework::details::MemoryOptimizePass)
-    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
     .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
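SortOpLikeDescOrder, whose old implementation is removed from this file above (the commit moves it into memory_optimize_helper.cc), returns the graph's op nodes in a topological order that follows the recorded OpDesc order wherever the dependencies allow it. Below is a minimal standalone sketch of that idea, using plain strings instead of ir::Node and a preference list in place of kAllOpDescs; all names are hypothetical and it is an illustration only, not Paddle code:

#include <algorithm>
#include <cassert>
#include <list>
#include <string>
#include <unordered_map>
#include <vector>

// Kahn's algorithm, but among the currently-ready nodes always pick the one
// that appears earliest in a caller-supplied preference order.
std::vector<std::string> SortLikeDescOrder(
    const std::unordered_map<std::string, std::vector<std::string>>& succs,
    const std::vector<std::string>& preferred) {
  std::unordered_map<std::string, int> indeg;
  for (const auto& kv : succs) {
    indeg.emplace(kv.first, 0);
    for (const auto& s : kv.second) ++indeg[s];
  }
  std::list<std::string> ready;
  for (const auto& kv : indeg)
    if (kv.second == 0) ready.push_back(kv.first);
  std::vector<std::string> out;
  while (!ready.empty()) {
    // pick the ready node that comes first in `preferred`
    auto pick = ready.begin();
    size_t best = preferred.size();
    for (auto it = ready.begin(); it != ready.end(); ++it) {
      size_t pos =
          std::find(preferred.begin(), preferred.end(), *it) - preferred.begin();
      if (pos < best) { best = pos; pick = it; }
    }
    std::string n = *pick;
    ready.erase(pick);
    out.push_back(n);
    auto it = succs.find(n);
    if (it != succs.end())
      for (const auto& s : it->second)
        if (--indeg[s] == 0) ready.push_back(s);
  }
  return out;
}

int main() {
  // assign->b and assign->c are independent; the desc order lists b first.
  std::unordered_map<std::string, std::vector<std::string>> g = {
      {"assign_b", {"sum_d"}}, {"assign_c", {"sum_d"}},
      {"sum_d", {"assign_e"}}, {"assign_e", {}}};
  auto order = SortLikeDescOrder(g, {"assign_b", "assign_c", "sum_d", "assign_e"});
  assert((order ==
          std::vector<std::string>{"assign_b", "assign_c", "sum_d", "assign_e"}));
  return 0;
}

The real pass additionally tolerates op descs that other passes have deleted or appended, which is exactly what the SortOpLikeDescOrder tests in memory_optimize_helper_test.cc exercise.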
paddle/fluid/framework/details/memory_optimize_pass.h
...
@@ -32,20 +32,15 @@
 namespace paddle {
 namespace framework {
 namespace details {
-constexpr char kAllOpDescs[] = "all_op_descs";
-
-std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
-
-class ControlFlowGraph;
 class MemoryOptimizePass : public ir::Pass {
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(
       std::unique_ptr<ir::Graph> graph) const override;
-
- private:
   // fill the variable map(var_nodes) by version.
   void InitSSAGraphNodes() const;
+
+ private:
   // update program descs
   void RenameVarInGraphDesc(const std::string& var,
                             const std::string& cache_var, size_t idx) const;
...
@@ -62,7 +57,7 @@ class MemoryOptimizePass : public ir::Pass {
  private:
   // Reuse Node Pool, Owned.
-  mutable OrderedNodeList pool_;
+  mutable OrderedSet pool_;
   // controlflow Graph
   mutable std::unique_ptr<ControlFlowGraph> cfg_;
   // skip set
...
@@ -71,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass {
   mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
 };
 
-class ControlFlowGraph {
- public:
-  ControlFlowGraph() = default;
-  // For IR Graph in parallelexecutor
-  explicit ControlFlowGraph(const ir::Graph& graph);
-
-  void LiveVariableAnalysis();
-
-  void RenameVarInCFGGraph(const std::string& old_node,
-                           const std::string& new_node, int begin_idx);
-
-  const std::set<std::string> LiveIn(ir::Node* op) const;
-  const std::set<std::string> LiveOut(ir::Node* op) const;
-  const std::set<std::string> Use(ir::Node* op) const;
-  const std::vector<ir::Node*> Ops() const;
-  std::vector<ir::Node*>& Ops();
-
-  // for ssa-graph nodes
-  ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
-
- private:
-  void BuildCFGGraph();
-  void ConnectNodes();
-
-  using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
-  using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
-  // successors ops use the output variables.
-  NodeListMap successors_;
-  // predecessors ops generated input variables.
-  NodeListMap predecessors_;
-  // variables lived before run current op.
-  VarSetMap live_in_;
-  // variables lived after run current op.
-  VarSetMap live_out_;
-  VarSetMap uses_;  // op inputs
-  VarSetMap defs_;  // op outputs
-
-  std::vector<ir::Node*> ops_;  // op sequence by topology sort
-};
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
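The header above renames the reuse pool from OrderedNodeList to OrderedSet, and the pass now asks it for FindBestFitNode/GetNodeIndexInPool. The method name suggests a best-fit lookup; the toy pool below illustrates that strategy purely as an assumption, using a size-keyed multimap instead of Paddle's ir::Node bookkeeping and type/shape checks:

#include <cassert>
#include <cstddef>
#include <map>
#include <string>

// Toy free-list keyed by byte size: reuse the smallest cached buffer that is
// at least as large as the request (best fit), otherwise report a miss.
class ReusePool {
 public:
  void Insert(const std::string& name, size_t bytes) {
    pool_.emplace(bytes, name);
  }
  // Returns the reused name, or an empty string when nothing fits.
  std::string FindBestFit(size_t bytes) {
    auto it = pool_.lower_bound(bytes);
    if (it == pool_.end()) return "";
    std::string name = it->second;
    pool_.erase(it);  // a reused buffer leaves the pool, like pool_.Erase(cache)
    return name;
  }

 private:
  std::multimap<size_t, std::string> pool_;
};

int main() {
  ReusePool pool;
  pool.Insert("b", 400);
  pool.Insert("c", 1024);
  assert(pool.FindBestFit(500) == "c");   // 400 bytes is too small, 1024 fits
  assert(pool.FindBestFit(500).empty());  // nothing large enough remains
  return 0;
}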
paddle/fluid/framework/details/memory_optimize_pass_test.cc
deleted 100644 → 0
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
REGISTER_OPERATOR(sum, paddle::framework::DummyOp, paddle::framework::SumOpMaker,
                  paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
                  paddle::framework::AssignOpMaker,
                  paddle::framework::DummyVarTypeInference);
REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, paddle::framework::SumOpMaker,
                  paddle::framework::DummyVarTypeInference);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace paddle {
namespace framework {
namespace details {

static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
         op1->Outputs() == op2->Outputs();
}
...
paddle/fluid/framework/details/sequential_execution_pass.cc
...
@@ -17,6 +17,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 
 namespace paddle {
...
paddle/fluid/framework/details/sequential_execution_pass.h
...
@@ -21,8 +21,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-constexpr char kAllOpDescs[] = "all_op_descs";
-
 class SequentialExecutionPass : public ir::Pass {
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(
...
paddle/fluid/framework/inplace_op_inference.h
...
@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference {
   bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
     return in.Name() != out.Name() && details::NodeCanReused(in) &&
            details::NodeCanReused(out) &&
-           details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in);
+           details::NodeSize(out) <= details::NodeSize(in);
   }
 };
...
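TryInplaceInputOutput above permits sharing only when the variable names differ, both nodes are reusable, and the output needs no more bytes than the input (NodeSizeInBytes is renamed to NodeSize in this commit). A self-contained sketch of that size guard follows; SizeInBytes is a hypothetical stand-in for Paddle's helper, not the real implementation:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <string>
#include <vector>

struct VarInfo {
  std::string name;
  std::vector<int64_t> dims;
  size_t elem_bytes;
};

// Hypothetical stand-in for details::NodeSize(): product of dims times element width.
static size_t SizeInBytes(const VarInfo& v) {
  int64_t numel = std::accumulate(v.dims.begin(), v.dims.end(), int64_t{1},
                                  std::multiplies<int64_t>());
  return static_cast<size_t>(numel) * v.elem_bytes;
}

static bool CanShareBuffer(const VarInfo& in, const VarInfo& out) {
  // same rule as TryInplaceInputOutput: distinct names and the output fits into the input
  return in.name != out.name && SizeInBytes(out) <= SizeInBytes(in);
}

int main() {
  VarInfo x{"x", {32, 128}, 4}, y{"y", {32, 128}, 4}, z{"z", {64, 128}, 4};
  assert(CanShareBuffer(x, y));   // equal size: reuse is allowed
  assert(!CanShareBuffer(x, z));  // output larger than input: no reuse
  return 0;
}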
paddle/fluid/framework/parallel_executor.cc
...
@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
     eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
     graph = eager_deletion_pass->Apply(std::move(graph));
     VLOG(10) << "EagerDeletionPass Applied";
-
-    if (build_strategy_.memory_early_delete_) {
-      auto early_delete_pass =
-          ir::PassRegistry::Instance().Get("memory_early_delete_pass");
-      early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
-      graph = early_delete_pass->Apply(std::move(graph));
-    }
-    VLOG(10) << "MemoryEarlyDeletePass Applied.";
   }
 
   return graph;
...
@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
   graphs.push_back(std::move(graph));
 #endif
 
   auto max_memory_size = GetEagerDeletionThreshold();
+  VLOG(10) << "Eager Deletion Threshold "
+           << static_cast<float>(max_memory_size) / (1 << 30);
   if (max_memory_size >= 0) {
     for (size_t i = 0; i < graphs.size(); ++i) {
       graphs[i] = member_->PrepareGCAndRefCnts(
...
@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
 }  // namespace framework
 }  // namespace paddle
 
-USE_PASS(memory_early_delete_pass);
 USE_PASS(reference_count_pass);
 USE_PASS(eager_deletion_pass);
paddle/fluid/framework/scope.cc
...
@@ -22,11 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
 
-DEFINE_bool(benchmark, false,
-            "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs."
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
+DECLARE_bool(benchmark);
 
 DEFINE_bool(eager_delete_scope, true,
...
paddle/fluid/inference/api/CMakeLists.txt
...
@@ -52,8 +52,8 @@ cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
     # compile the libinference_anakin_api.a and anakin.so.
-    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy)
-    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy)
+    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy device_context)
+    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy device_context)
     function(anakin_target target_name)
       target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
     endfunction()
...
paddle/fluid/memory/allocation/legacy_allocator.cc
...
@@ -36,6 +36,7 @@ DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_bool(benchmark);
 
 namespace paddle {
 namespace memory {
...
@@ -198,7 +199,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
             << string::HumanReadableSize(Used<platform::CUDAPlace>(place));
     platform::SetDeviceId(cur_dev);
   } else {
-    if (VLOG_IS_ON(3)) {
+    if (FLAGS_benchmark) {
       allocation::GPUMemMonitor.Add(place.device, size);
     }
     if (FLAGS_init_allocated_mem) {
...
@@ -216,7 +217,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
                                size_t size) {
 #ifdef PADDLE_WITH_CUDA
   GetGPUBuddyAllocator(place.device)->Free(p);
-  if (VLOG_IS_ON(3)) {
+  if (FLAGS_benchmark) {
     allocation::GPUMemMonitor.Minus(place.device, size);
   }
 #else
...
@@ -257,7 +258,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
   void *ptr = buddy_allocator->Alloc(size);
 
   if (ptr == nullptr) {
-    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
+    LOG(WARNING) << "cudaHostAlloc Cannot allocate " << size
                  << " bytes in CUDAPinnedPlace";
   }
   if (FLAGS_init_allocated_mem) {
...
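The change above reports GPU memory usage to GPUMemMonitor only when FLAGS_benchmark is set, instead of whenever VLOG level 3 happens to be on. A minimal sketch of that kind of guarded Add/Minus bookkeeping (an illustration only, not Paddle's monitor class):

#include <cassert>
#include <cstddef>
#include <map>
#include <mutex>

class MemUsageMonitor {
 public:
  void Add(int device, size_t bytes) {
    std::lock_guard<std::mutex> lock(mu_);
    used_[device] += bytes;
  }
  void Minus(int device, size_t bytes) {
    std::lock_guard<std::mutex> lock(mu_);
    used_[device] -= bytes;
  }
  size_t Used(int device) {
    std::lock_guard<std::mutex> lock(mu_);
    return used_[device];
  }

 private:
  std::mutex mu_;
  std::map<int, size_t> used_;
};

// Pay the bookkeeping cost only when benchmarking is requested, mirroring the
// FLAGS_benchmark guard in the diff above.
static bool g_benchmark = true;
static MemUsageMonitor g_monitor;

int main() {
  if (g_benchmark) g_monitor.Add(0, 1 << 20);
  if (g_benchmark) g_monitor.Minus(0, 1 << 19);
  assert(g_monitor.Used(0) == (1u << 19));
  return 0;
}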
paddle/fluid/memory/allocation/pinned_allocator.cc
...
@@ -32,7 +32,7 @@ Allocation *CPUPinnedAllocator::AllocateImpl(size_t size,
   // "CPUPinnedAllocator should be used for Cross-Device Communication");
   void *ptr;
-  PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
+  PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
   return new CPUPinnedAllocation(ptr, size);
 }
 }  // namespace allocation
...
paddle/fluid/memory/allocation/pinned_allocator.h
...
@@ -19,7 +19,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-// Allocator uses `cudaMallocHost`
+// Allocator uses `cudaHostAlloc`
 class CPUPinnedAllocation : public Allocation {
  public:
   CPUPinnedAllocation(void *ptr, size_t size)
...
paddle/fluid/memory/detail/system_allocator.cc
...
@@ -173,14 +173,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
   void* p;
   // PINNED memory is visible to all CUDA contexts.
-  cudaError_t result = cudaMallocHost(&p, size);
+  cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable);
 
   if (result == cudaSuccess) {
     *index = 1;  // PINNED memory
     cuda_pinnd_alloc_size_ += size;
     return p;
   } else {
-    LOG(WARNING) << "cudaMallocHost failed.";
+    LOG(WARNING) << "cudaHostAlloc failed.";
     return nullptr;
   }
...
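The pinned allocators above switch from cudaMallocHost to cudaHostAlloc with the cudaHostAllocPortable flag, so the pinned buffer is usable from every CUDA context. A minimal usage sketch of that call pattern (assumes the CUDA runtime is available; error handling kept deliberately simple):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  void* p = nullptr;
  // Portable pinned memory: visible to all CUDA contexts, like the allocators above.
  cudaError_t err = cudaHostAlloc(&p, 1 << 20, cudaHostAllocPortable);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaHostAlloc failed: %s\n", cudaGetErrorString(err));
    return 1;
  }
  // ... fill the buffer, hand it to cudaMemcpyAsync, etc. ...
  cudaFreeHost(p);
  return 0;
}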
paddle/fluid/operators/elementwise/elementwise_op.h
...
@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
   }
 };
 
+class ElementwiseGradOpInplace : public framework::InplaceInToOut {
+ public:
+  using framework::InplaceInToOut::InplaceInToOut;
+
+ protected:
+  std::unordered_map<std::string, std::string> Apply(
+      const framework::OpDesc& op_desc,
+      framework::BlockDesc* block) const override {
+    std::unordered_map<std::string, std::string> ret;
+    if (block->HasVar(framework::GradVarName("X")) &&
+        block->HasVar(framework::GradVarName("Out"))) {
+      ret[framework::GradVarName("Out")] = framework::GradVarName("X");
+    }
+    return ret;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
...
@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
                     op_type##GradMaker,                                \
                     ::paddle::operators::ElementwiseOpInplace);        \
   REGISTER_OPERATOR(op_type##_grad,                                    \
-                    ::paddle::operators::ElementwiseOpExplicitGrad)
+                    ::paddle::operators::ElementwiseOpExplicitGrad,    \
+                    ::paddle::operators::ElementwiseGradOpInplace)
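ElementwiseGradOpInplace::Apply above proposes sharing Out@GRAD with X@GRAD only when both gradient variables exist in the block, and the registration macro now attaches that inference to every elementwise grad op. A standalone sketch of the guarded mapping; GradName is a stand-in for framework::GradVarName, which appends the @GRAD suffix:

#include <cassert>
#include <set>
#include <string>
#include <unordered_map>

// Stand-in for GradVarName(): Paddle appends the "@GRAD" suffix.
static std::string GradName(const std::string& name) { return name + "@GRAD"; }

// Mirrors the Apply() above: only propose dOut -> dX sharing when both
// gradient variables are actually present in the block.
static std::unordered_map<std::string, std::string> InplacePairs(
    const std::set<std::string>& vars_in_block) {
  std::unordered_map<std::string, std::string> ret;
  if (vars_in_block.count(GradName("X")) && vars_in_block.count(GradName("Out"))) {
    ret[GradName("Out")] = GradName("X");
  }
  return ret;
}

int main() {
  assert(InplacePairs({"X@GRAD", "Out@GRAD"}).size() == 1);
  assert(InplacePairs({"Out@GRAD"}).empty());  // dX missing: no reuse proposed
  return 0;
}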
paddle/fluid/operators/ngraph/ngraph_engine_op.h
...
@@ -35,7 +35,7 @@ class NgraphEngineOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     framework::OpKernelType kt = framework::OpKernelType(
-        framework::proto::VarType::FP32, ctx.GetPlace());
+        framework::proto::VarType::FP32, platform::CPUPlace());
     return kt;
   }
 };
...
paddle/fluid/operators/reader/buffered_reader.cc
...
@@ -14,6 +14,7 @@
 #include "paddle/fluid/operators/reader/buffered_reader.h"
 #include <vector>
+#include "paddle/fluid/framework/data_type.h"
 
 namespace paddle {
 namespace operators {
...
@@ -24,6 +25,13 @@ BufferedReader::~BufferedReader() {
     position_.front().wait();
     position_.pop();
   }
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+    PADDLE_ENFORCE(cudaStreamDestroy(stream));
+    for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event));
+  }
+#endif
 }
 
 BufferedReader::BufferedReader(
...
@@ -33,6 +41,19 @@ BufferedReader::BufferedReader(
       thread_pool_(1),
       place_(place),
       buffer_size_(buffer_size) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+    compute_stream =
+        ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
+                                             .Get(place_)))
+            ->stream();
+    events.resize(buffer_size);
+    for (auto &event : events)
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  }
+#endif
   cpu_buffer_.resize(buffer_size);
   gpu_buffer_.resize(buffer_size);
   ReadTillBufferFullAsync();
...
@@ -46,6 +67,12 @@ void BufferedReader::ReadTillBufferFullAsync() {
 }
 
 void BufferedReader::ReadAsync(size_t i) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+    PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream));
+  }
+#endif
   position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
     TensorVec &cpu = cpu_buffer_[i];
     reader_->ReadNext(&cpu);
...
@@ -54,14 +81,41 @@ void BufferedReader::ReadAsync(size_t i) {
       return -1UL;
     }
 
 #ifdef PADDLE_WITH_CUDA
+    // NOTE(liangdun): using async copy instead of TensorCopySync
+    // TensorCopySync would block other stream
     if (platform::is_gpu_place(place_)) {
+      platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
+      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
       TensorVec &gpu = gpu_buffer_[i];
       gpu.resize(cpu.size());
      for (size_t i = 0; i < cpu.size(); ++i) {
-        framework::TensorCopySync(cpu[i], place_, &gpu[i]);
+        gpu[i].Resize(cpu[i].dims());
+        gpu[i].set_layout(cpu[i].layout());
+        auto cpu_place = cpu[i].place();
+        auto cpu_ptr = cpu[i].data<void>();
+        auto gpu_ptr = gpu[i].mutable_data(place_, cpu[i].type());
+        auto size =
+            cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
+        if (platform::is_cuda_pinned_place(cpu_place))
+          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
+                       boost::get<platform::CUDAPinnedPlace>(cpu_place),
+                       cpu_ptr, size, stream);
+        else if ((platform::is_gpu_place(cpu_place)))
+          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
+                       boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
+                       size, stream);
+        else
+          // if cpu place is not pinned, async copy is slower than sync copy,
+          // so we use sync copy instead.
+          memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
+                       boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
+                       0);
         gpu[i].set_lod(cpu[i].lod());
       }
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
     }
 #endif
     return i;
   }));
 }
...
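The ReadAsync changes above replace TensorCopySync with stream-ordered copies: an event is recorded on the compute stream, the reader's own stream waits on it, asynchronous copies are issued from pinned host memory, and only the reader stream is synchronized at the end. The sketch below shows the same pattern with raw CUDA runtime calls instead of Paddle's memory::Copy wrapper (assumes a CUDA device is present):

#include <cstdio>
#include <cuda_runtime.h>

#define CUDA_CHECK(e)                                          \
  do {                                                         \
    cudaError_t err__ = (e);                                   \
    if (err__ != cudaSuccess) {                                \
      std::fprintf(stderr, "%s\n", cudaGetErrorString(err__)); \
      return 1;                                                \
    }                                                          \
  } while (0)

int main() {
  const size_t bytes = 1 << 20;
  void *host = nullptr, *dev = nullptr;
  CUDA_CHECK(cudaHostAlloc(&host, bytes, cudaHostAllocPortable));  // pinned source
  CUDA_CHECK(cudaMalloc(&dev, bytes));

  cudaStream_t compute, copy;
  cudaEvent_t ready;
  CUDA_CHECK(cudaStreamCreateWithFlags(&compute, cudaStreamNonBlocking));
  CUDA_CHECK(cudaStreamCreateWithFlags(&copy, cudaStreamNonBlocking));
  CUDA_CHECK(cudaEventCreateWithFlags(&ready, cudaEventDisableTiming));

  // ... kernels producing earlier work would run on `compute` here ...
  CUDA_CHECK(cudaEventRecord(ready, compute));      // like ReadAsync()
  CUDA_CHECK(cudaStreamWaitEvent(copy, ready, 0));  // copy stream waits on compute
  CUDA_CHECK(cudaMemcpyAsync(dev, host, bytes, cudaMemcpyHostToDevice, copy));
  CUDA_CHECK(cudaStreamSynchronize(copy));          // block only the copy stream

  cudaEventDestroy(ready);
  cudaStreamDestroy(compute);
  cudaStreamDestroy(copy);
  cudaFree(dev);
  cudaFreeHost(host);
  return 0;
}

The async path only pays off when the host buffer is pinned; that is why the diff above falls back to a synchronous copy for plain CPU memory.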
paddle/fluid/operators/reader/buffered_reader.h
...
@@ -19,6 +19,9 @@
 #include <vector>
 #include "ThreadPool.h"
 #include "paddle/fluid/framework/reader.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/gpu_info.h"
+#endif
 
 namespace paddle {
 namespace operators {
...
@@ -59,6 +62,11 @@ class BufferedReader : public framework::DecoratedReader {
   std::vector<TensorVec> cpu_buffer_;
   std::vector<TensorVec> gpu_buffer_;
   size_t prev_pos_{-1UL};
+#ifdef PADDLE_WITH_CUDA
+  cudaStream_t stream;
+  cudaStream_t compute_stream;
+  std::vector<cudaEvent_t> events;
+#endif
 };
 
 }  // namespace reader
...
paddle/fluid/platform/place.cc
...
@@ -14,6 +14,12 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"

+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");
+
namespace paddle {
namespace platform {
...
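The benchmark flag is an ordinary gflag, so it can be flipped from Python before paddle.fluid is first imported. A hedged sketch, assuming "benchmark" is among the flags that __bootstrap__ reads from the environment via --tryfromenv:

# Assumption: "benchmark" is listed in read_env_flags, so __bootstrap__ will
# pick FLAGS_benchmark up from the environment on first import.
import os
os.environ["FLAGS_benchmark"] = "true"   # must be set before the import below

import paddle.fluid as fluid             # CUDA ops now run in synchronous mode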
paddle/fluid/pybind/pybind.cc
...
@@ -295,6 +295,7 @@ PYBIND11_MODULE(core, m) {
      .def("_get_float_element", TensorGetElement<float>)
      .def("_set_double_element", TensorSetElement<double>)
      .def("_get_double_element", TensorGetElement<double>)
+      .def("_place", [](Tensor &self) { return self.place(); })
      .def("_dtype", [](Tensor &self) { return self.type(); });

  py::class_<LoDTensor, Tensor>(m, "LoDTensor", R"DOC(
...
@@ -673,6 +674,12 @@ All parameter, weight, gradient are variables in Paddle.
  py::class_<platform::Place>(m, "Place")
      .def(py::init<>())
+      .def("is_gpu_place",
+           [](platform::Place &self) { return platform::is_gpu_place(self); })
+      .def("gpu_device_id",
+           [](platform::Place &self) {
+             return boost::get<platform::CUDAPlace>(self).device;
+           })
      .def("set_place",
           [](platform::Place &self, const platform::CPUPlace &cpu_place) {
             self = cpu_place;
...
@@ -1092,10 +1099,6 @@ All parameter, weight, gradient are variables in Paddle.
          "is_distribution",
          [](const BuildStrategy &self) { return self.is_distribution_; },
          [](BuildStrategy &self, bool b) { self.is_distribution_ = b; })
-      .def_property(
-          "memory_early_delete",
-          [](const BuildStrategy &self) { return self.memory_early_delete_; },
-          [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; })
      .def_property(
          "enable_inplace",
          [](const BuildStrategy &self) { return self.enable_inplace_; },
...
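A hedged usage sketch for the bindings added above (_place and _dtype on Tensor, is_gpu_place and gpu_device_id on core.Place). The surrounding fluid 1.3-era calls are assumptions for illustration, not part of this diff:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core

t = fluid.LoDTensor()
t.set(np.ones((2, 3), dtype="float32"), fluid.CPUPlace())
print(t._dtype())          # dtype of the underlying tensor
print(t._place())          # where the tensor's memory lives

p = core.Place()
p.set_place(fluid.CPUPlace())
print(p.is_gpu_place())    # False; gpu_device_id() only applies to CUDA places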
python/paddle/__init__.py
...
@@ -25,4 +25,5 @@ import paddle.reader
import paddle.dataset
import paddle.batch
import paddle.compat
+import paddle.distributed
batch = batch.batch
python/paddle/distributed/__init__.py (new file, 0 → 100644)
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
tools/run_mp.py → python/paddle/distributed/launch.py
...
@@ -37,7 +37,7 @@ default_envs = {
GPUS = 8

-def start_procs(gpus, cmd, log_dir):
+def start_procs(gpus, entrypoint, entrypoint_args, log_dir):
    procs = []
    log_fns = []
    os.system("mkdir -p %s" % log_dir)
...
@@ -73,12 +73,11 @@ def start_procs(gpus, cmd, log_dir):
            "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
        })
-        print("starting process ", i, cmd, curr_env)
+        print("starting process ", i, entrypoint, entrypoint_args, curr_env)
        fn = open("%s/workerlog.%d" % (log_dir, i), "w")
        log_fns.append(fn)
-        procs.append(
-            subprocess.Popen(
-                cmd.strip().split(" "), stdout=fn, stderr=fn, env=curr_env))
+        cmd = [sys.executable, "-u", entrypoint] + entrypoint_args
+        procs.append(subprocess.Popen(cmd, stdout=fn, stderr=fn, env=curr_env))

    for i in range(gpus):
        try:
...
@@ -89,7 +88,8 @@ def start_procs(gpus, cmd, log_dir):
        pass

-def main():
+def parse_args():
    parser = argparse.ArgumentParser(
        description='''start paddle training using multi-process mode.
NOTE: your train program ***must*** run as distributed nccl2 mode,
...
@@ -108,21 +108,27 @@ POD_IP (current node ip address, not needed for local training)
        type=int,
        default=8,
        help='start number of processes for every gpu')
-    parser.add_argument(
-        '--cmd',
-        type=str,
-        default="",
-        help='command to run for each process, e.g. python train.py --lr 0.1')
    parser.add_argument(
        '--log_dir',
        type=str,
        default="mylog",
        help='directory to put logs per process.')
-    args = parser.parse_args()
-
-    if args.cmd == "":
-        parser.print_help()
-        exit(0)
-    start_procs(args.gpus, args.cmd, args.log_dir)
+    parser.add_argument(
+        'entrypoint_script',
+        type=str,
+        help="The entrypoint script to be launched in parallel,"
+        "followed by all the arguments for each process,"
+        "e.g. train.py --lr 0.1")
+    parser.add_argument('entrypoint_args', nargs=argparse.REMAINDER)
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    # launch multiple training process
+    start_procs(args.gpus, args.entrypoint_script, args.entrypoint_args,
+                args.log_dir)


if __name__ == "__main__":
...
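With the positional entrypoint_script plus an argparse.REMAINDER list, the renamed launcher takes the training script and its flags directly after the launcher's own options. A hedged sketch of one invocation; train.py and --lr are placeholders for your own entrypoint and its arguments, not files from this commit:

# Equivalent to the shell command:
#   python -m paddle.distributed.launch --gpus 8 --log_dir mylog train.py --lr 0.1
import subprocess
import sys

subprocess.check_call([
    sys.executable, "-m", "paddle.distributed.launch",
    "--gpus", "8", "--log_dir", "mylog",
    "train.py", "--lr", "0.1",
])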
python/paddle/fluid/__init__.py
...
@@ -161,7 +161,6 @@ def __bootstrap__():
        'times_excess_than_required_tmp_allocation',
        'enable_inplace_whitelist'
    ]
    core.init_gflags([sys.argv[0]] +
                     ["--tryfromenv=" + ",".join(read_env_flags)])
    core.init_glog(sys.argv[0])
...
python/paddle/fluid/layer_helper.py
...
@@ -302,7 +302,8 @@ class LayerHelper(object):
        if default_initializer is None and attr.initializer is None:
            if isinstance(dtype, core.VarDesc.VarType):
                if dtype != core.VarDesc.VarType.FP32 and \
-                    dtype != core.VarDesc.VarType.FP64:
+                    dtype != core.VarDesc.VarType.FP64 and \
+                        dtype != core.VarDesc.VarType.FP16:
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
...
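With FP16 added to the float whitelist above, a float16 parameter no longer needs an explicit default_initializer. A hedged sketch, assuming fluid.layers.create_parameter routes through LayerHelper.create_parameter as usual:

import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    # Before this change, dtype='float16' here raised the TypeError shown above.
    w = fluid.layers.create_parameter(shape=[8, 8], dtype='float16')
    print(w.dtype)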
python/paddle/fluid/parallel_executor.py
...
@@ -148,7 +148,8 @@ class ParallelExecutor(object):
            else framework.default_main_program()
        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
        # if turn on python memory optimize, turn off the inplace_pass.
-        build_strategy.enable_inplace = False if main._is_mem_optimized else True
+        if build_strategy.enable_inplace is None:
+            build_strategy.enable_inplace = False if main._is_mem_optimized else True
        scope = scope if scope is not None else executor.global_scope()

        if share_vars_from and not isinstance(share_vars_from,
...
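Because the default is now filled in only while enable_inplace is still None, an explicit user setting survives construction. A hedged sketch of opting in explicitly; the tiny network is only there to make the snippet self-contained and is not from this commit:

import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = True    # explicit choice; no longer overwritten

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
fluid.Executor(fluid.CPUPlace()).run(fluid.default_startup_program())

pe = fluid.ParallelExecutor(use_cuda=False,
                            loss_name=loss.name,
                            build_strategy=build_strategy)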
python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py
...
@@ -16,14 +16,37 @@ from __future__ import print_function
import unittest
import numpy as np
+import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest
-from paddle.fluid.tests.unittests.test_accuracy_op import TestAccuracyOp


-class TestNGRAPHAccuracyOp(TestAccuracyOp):
+class TestNGRAPHAccuracyOp(OpTest):
    def setUp(self):
-        super(TestNGRAPHAccuracyOp, self).setUp()
+        self.op_type = "accuracy"
+        self.dtype = np.float32
+        self.init_dtype()
+        n = 128
+        infer = np.random.random((n, 1)).astype(self.dtype)
+        indices = np.random.randint(0, 2, (n, 1))
+        label = np.random.randint(0, 2, (n, 1))
+        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
+        num_correct = 0
+        for rowid in range(n):
+            for ele in indices[rowid]:
+                if ele == label[rowid]:
+                    num_correct += 1
+                    break
+        self.outputs = {
+            'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype),
+            'Correct': np.array([num_correct]).astype("int64"),
+            'Total': np.array([n]).astype("int64")
+        }
+        self._cpu_only = True
+
+    def init_dtype(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()


if __name__ == '__main__':
...
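The rewritten test builds the accuracy inputs and expected outputs directly instead of inheriting from TestAccuracyOp, and the self._cpu_only = True marker, repeated in the other ngraph tests below, keeps the check on CPU where the nGraph kernels are registered. A hedged way to run one of these files with the nGraph engine switched on; the flag name is an assumption based on the nGraph integration, and the command is assumed to be run from python/paddle/fluid/tests/unittests/ngraph/:

# Assumption: the nGraph engine is toggled by the FLAGS_use_ngraph gflag.
import os
import subprocess
import sys

env = dict(os.environ, FLAGS_use_ngraph="true")
subprocess.check_call([sys.executable, "test_accuracy_ngraph_op.py"], env=env)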
python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py
...
@@ -15,35 +15,59 @@
from __future__ import print_function

import unittest
-from paddle.fluid.tests.unittests.test_conv2d_op import *
+from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1


class TestNGRAPH(TestConv2dOp):
+    def setUp(self):
+        super(TestNGRAPH, self).setUp()
+        self._cpu_only = True
+
    def init_kernel_type(self):
        super(TestNGRAPH, self).init_kernel_type()


class TestNGRAPHWithPad(TestWithPad):
+    def setUp(self):
+        super(TestNGRAPHWithPad, self).setUp()
+        self._cpu_only = True
+
    def init_kernel_type(self):
        super(TestNGRAPHWithPad, self).init_kernel_type()


class TestNGRAPHWithStride(TestWithStride):
+    def setUp(self):
+        super(TestNGRAPHWithStride, self).setUp()
+        self._cpu_only = True
+
    def init_kernel_type(self):
        super(TestNGRAPHWithStride, self).init_kernel_type()


class TestNGRAPHWithGroup(TestWithGroup):
+    def setUp(self):
+        super(TestNGRAPHWithGroup, self).setUp()
+        self._cpu_only = True
+
    def init_kernel_type(self):
        super(TestNGRAPHWithGroup, self).init_kernel_type()


class TestNGRAPHWith1x1(TestWith1x1):
+    def setUp(self):
+        super(TestNGRAPHWith1x1, self).setUp()
+        self._cpu_only = True
+
    def init_kernel_type(self):
        super(TestNGRAPHWith1x1, self).init_kernel_type()


class TestNGRAPHWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
+    def setUp(self):
+        super(TestNGRAPHWithInput1x1Filter1x1, self).setUp()
+        self._cpu_only = True
+
    def init_kernel_type(self):
        super(TestNGRAPHWithInput1x1Filter1x1, self).init_kernel_type()
...
python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py
...
@@ -14,73 +14,16 @@
from __future__ import print_function

import unittest
-from paddle.fluid.tests.unittests.test_elementwise_add_op import *
+from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp


class TestNGRAPHElementwiseAddOp(TestElementwiseAddOp):
+    def setUp(self):
+        super(TestNGRAPHElementwiseAddOp, self).setUp()
+        self._cpu_only = True
+
    def init_input_output(self):
        super(TestNGRAPHElementwiseAddOp, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_scalar(TestElementwiseAddOp_scalar):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_scalar, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_scalar2, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_Vector(TestElementwiseAddOp_Vector):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_Vector, self).init_input_output()
-
-
-class TesNGRAPHtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0):
-    def init_input_output(self):
-        super(TesNGRAPHtElementwiseAddOp_broadcast_0, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_broadcast_1, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_broadcast_2, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_broadcast_3, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_broadcast_4, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp_rowwise_add_0):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_rowwise_add_0, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp_rowwise_add_1):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_rowwise_add_1, self).init_input_output()
-
-
-class TestNGRAPHElementwiseAddOp_channelwise_add(TestElementwiseAddOp_channelwise_add):
-    def init_input_output(self):
-        super(TestNGRAPHElementwiseAddOp_channelwise_add, self).init_input_output()


if __name__ == '__main__':
...
python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py
...
@@ -14,17 +14,13 @@
from __future__ import print_function

import unittest
-from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp
+from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp


class TestNGRAPHMeanOp(TestMeanOp):
    def setUp(self):
        super(TestNGRAPHMeanOp, self).setUp()
+        self._cpu_only = True
-
-
-class TestNGRAPHFP16MeanOp(TestFP16MeanOp):
-    def setUp(self):
-        super(TestNGRAPHFP16MeanOp, self).setUp()


if __name__ == "__main__":
...
python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
...
@@ -15,27 +15,38 @@
from __future__ import print_function

import unittest
-from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest


-class TestNGRAPHMulOp(TestMulOp):
+class TestNGRAPHMulOp(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        self.dtype = np.float32
+        self.init_dtype_type()
+        self.inputs = {
+            'X': np.random.random((2, 4)).astype(self.dtype),
+            'Y': np.random.random((4, 4)).astype(self.dtype)
+        }
+        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+        self._cpu_only = True
+
    def init_dtype_type(self):
        pass

-
-class TestNGRAPHMulOp2(TestMulOp2):
-    def init_dtype_type(self):
-        pass
-
-
-class TestNGRAPHFP16MulOp1(TestFP16MulOp1):
-    def init_dtype_type(self):
-        pass
-
-
-class TestNGRAPHFP16MulOp2(TestFP16MulOp2):
-    def init_dtype_type(self):
-        pass
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))


if __name__ == "__main__":
...
python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py
...
@@ -14,35 +14,59 @@
from __future__ import print_function

-from paddle.fluid.tests.unittests.test_pool2d_op import *
+from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5


class TestNGRAPHPool2D_Op(TestPool2D_Op):
+    def setUp(self):
+        super(TestNGRAPHPool2D_Op, self).setUp()
+        self._cpu_only = True
+
    def init_test_case(self):
        super(TestNGRAPHPool2D_Op, self).init_test_case()


class TestNGRAPHCase1(TestCase1):
+    def setUp(self):
+        super(TestNGRAPHCase1, self).setUp()
+        self._cpu_only = True
+
    def init_test_case(self):
        super(TestNGRAPHCase1, self).init_test_case()


class TestNGRAPHCase2(TestCase2):
+    def setUp(self):
+        super(TestNGRAPHCase2, self).setUp()
+        self._cpu_only = True
+
    def init_test_case(self):
        super(TestNGRAPHCase2, self).init_test_case()


class TestNGRAPHCase3(TestCase3):
+    def setUp(self):
+        super(TestNGRAPHCase3, self).setUp()
+        self._cpu_only = True
+
    def init_pool_type(self):
        super(TestNGRAPHCase3, self).init_pool_type()


class TestNGRAPHCase4(TestCase4):
+    def setUp(self):
+        super(TestNGRAPHCase4, self).setUp()
+        self._cpu_only = True
+
    def init_pool_type(self):
        super(TestNGRAPHCase4, self).init_pool_type()


class TestNGRAPHCase5(TestCase5):
+    def setUp(self):
+        super(TestNGRAPHCase5, self).setUp()
+        self._cpu_only = True
+
    def init_pool_type(self):
        super(TestNGRAPHCase5, self).init_pool_type()
...
python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py
...
@@ -13,25 +13,23 @@
# limitations under the License.
from __future__ import print_function
import unittest
-from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows
+from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows


class TestNGRAPHScaleOp(TestScaleOp):
+    def setUp(self):
+        super(TestNGRAPHScaleOp, self).setUp()
+        self._cpu_only = True
+
    def init_dtype_type(self):
        pass


class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows):
+    def setUp(self):
+        super(TestNGRAPHScaleOpSelectedRows, self).setUp()
+        self._cpu_only = True
+
    def init_dtype_type(self):
        pass
-
-
-class TestNGRAPHScaleFp16Op(TestScaleFp16Op):
-    def init_dtype_type(self):
-        pass
-
-
-class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows):
-    def init_dtype_type(self):
-        pass
...
python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py
...
@@ -20,21 +20,25 @@ from paddle.fluid.tests.unittests.test_top_k_op import TestTopkOp, TestTopkOp3d,
class TestNGRAPHTopkOp(TestTopkOp):
    def setUp(self):
        super(TestNGRAPHTopkOp, self).setUp()
+        self._cpu_only = True


class TestNGRAPHTopkOp2(TestTopkOp2):
    def setUp(self):
        super(TestNGRAPHTopkOp2, self).setUp()
+        self._cpu_only = True


class TestNGRAPHTopkOp3(TestTopkOp3):
    def setUp(self):
        super(TestNGRAPHTopkOp3, self).setUp()
+        self._cpu_only = True


class TestNGRAPHTopkOp4(TestTopkOp4):
    def setUp(self):
        super(TestNGRAPHTopkOp4, self).setUp()
+        self._cpu_only = True


if __name__ == "__main__":
...
python/setup.py.in
...
@@ -100,6 +100,7 @@ packages=['paddle',
          'paddle.utils',
          'paddle.dataset',
          'paddle.reader',
+         'paddle.distributed',
          'paddle.fluid',
          'paddle.fluid.imperative',
          'paddle.fluid.proto',
...