Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
f79a3a83
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 2 年 前同步成功
通知
708
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f79a3a83
编写于
3月 22, 2019
作者:
S
sneaxiy
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
merge release/1.3
上级
d2d3f2b5
变更
64
展开全部
隐藏空白更改
内联
并排
Showing
64 changed file
with
2149 addition
and
1712 deletion
+2149
-1712
cmake/external/protobuf.cmake
cmake/external/protobuf.cmake
+1
-1
cmake/flags.cmake
cmake/flags.cmake
+2
-1
paddle/fluid/API.spec
paddle/fluid/API.spec
+1
-1
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+8
-7
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+2
-2
paddle/fluid/framework/details/inplace_op_pass.cc
paddle/fluid/framework/details/inplace_op_pass.cc
+18
-17
paddle/fluid/framework/details/inplace_op_pass.h
paddle/fluid/framework/details/inplace_op_pass.h
+8
-7
paddle/fluid/framework/details/memory_optimize_helper.cc
paddle/fluid/framework/details/memory_optimize_helper.cc
+364
-28
paddle/fluid/framework/details/memory_optimize_helper.h
paddle/fluid/framework/details/memory_optimize_helper.h
+84
-35
paddle/fluid/framework/details/memory_optimize_helper_test.cc
...le/fluid/framework/details/memory_optimize_helper_test.cc
+454
-9
paddle/fluid/framework/details/memory_optimize_pass.cc
paddle/fluid/framework/details/memory_optimize_pass.cc
+78
-347
paddle/fluid/framework/details/memory_optimize_pass.h
paddle/fluid/framework/details/memory_optimize_pass.h
+7
-50
paddle/fluid/framework/details/memory_optimize_pass_test.cc
paddle/fluid/framework/details/memory_optimize_pass_test.cc
+0
-417
paddle/fluid/framework/details/sequential_execution_pass.cc
paddle/fluid/framework/details/sequential_execution_pass.cc
+1
-0
paddle/fluid/framework/details/sequential_execution_pass.h
paddle/fluid/framework/details/sequential_execution_pass.h
+0
-2
paddle/fluid/framework/inplace_op_inference.h
paddle/fluid/framework/inplace_op_inference.h
+1
-1
paddle/fluid/framework/inplace_op_inference_test.cc
paddle/fluid/framework/inplace_op_inference_test.cc
+17
-16
paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+7
-3
paddle/fluid/framework/ir/infer_clean_graph_pass.cc
paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+1
-0
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+2
-9
paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
...e/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+0
-71
paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
...le/fluid/inference/analysis/ir_passes/subgraph_detector.h
+1
-26
paddle/fluid/inference/api/paddle_api.h
paddle/fluid/inference/api/paddle_api.h
+47
-15
paddle/fluid/inference/api/paddle_pass_builder.cc
paddle/fluid/inference/api/paddle_pass_builder.cc
+46
-0
paddle/fluid/inference/api/paddle_pass_builder.h
paddle/fluid/inference/api/paddle_pass_builder.h
+2
-45
paddle/fluid/operators/controlflow/compare_op.cc
paddle/fluid/operators/controlflow/compare_op.cc
+5
-5
paddle/fluid/operators/detection/density_prior_box_op.h
paddle/fluid/operators/detection/density_prior_box_op.h
+6
-7
paddle/fluid/operators/detection/prior_box_op.h
paddle/fluid/operators/detection/prior_box_op.h
+30
-39
paddle/fluid/operators/detection/yolov3_loss_op.cc
paddle/fluid/operators/detection/yolov3_loss_op.cc
+23
-17
paddle/fluid/operators/elementwise/elementwise_op.h
paddle/fluid/operators/elementwise/elementwise_op.h
+19
-1
paddle/fluid/operators/lstm_op.h
paddle/fluid/operators/lstm_op.h
+4
-0
paddle/fluid/operators/lstmp_op.h
paddle/fluid/operators/lstmp_op.h
+5
-0
paddle/fluid/operators/pool_op.cc
paddle/fluid/operators/pool_op.cc
+87
-57
paddle/fluid/operators/random_crop_op.h
paddle/fluid/operators/random_crop_op.h
+1
-1
paddle/fluid/operators/slice_op.cu
paddle/fluid/operators/slice_op.cu
+122
-2
paddle/fluid/operators/softmax_with_cross_entropy_op.cu
paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+2
-1
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+6
-4
paddle/scripts/fast_install.sh
paddle/scripts/fast_install.sh
+436
-233
python/paddle/__init__.py
python/paddle/__init__.py
+1
-0
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+0
-1
python/paddle/fluid/compiler.py
python/paddle/fluid/compiler.py
+4
-1
python/paddle/fluid/contrib/int8_inference/README.md
python/paddle/fluid/contrib/int8_inference/README.md
+2
-2
python/paddle/fluid/contrib/tests/CMakeLists.txt
python/paddle/fluid/contrib/tests/CMakeLists.txt
+5
-1
python/paddle/fluid/contrib/tests/test_calibration.py
python/paddle/fluid/contrib/tests/test_calibration.py
+0
-4
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+34
-34
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+2
-1
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+11
-5
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+8
-12
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+10
-9
python/paddle/fluid/layers/io.py
python/paddle/fluid/layers/io.py
+4
-1
python/paddle/fluid/layers/layer_function_generator.py
python/paddle/fluid/layers/layer_function_generator.py
+6
-2
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+83
-18
python/paddle/fluid/layers/ops.py
python/paddle/fluid/layers/ops.py
+2
-2
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+3
-3
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+4
-1
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+9
-0
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
...ddle/fluid/tests/unittests/parallel_executor_test_base.py
+1
-1
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
...e/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+4
-0
python/paddle/fluid/tests/unittests/test_slice_op.py
python/paddle/fluid/tests/unittests/test_slice_op.py
+50
-0
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+5
-1
python/requirements.txt
python/requirements.txt
+1
-1
python/setup.py.in
python/setup.py.in
+1
-0
tools/manylinux1/build_scripts/build.sh
tools/manylinux1/build_scripts/build.sh
+1
-6
tools/run_mp.py
tools/run_mp.py
+0
-129
未找到文件。
cmake/external/protobuf.cmake
浏览文件 @
f79a3a83
...
@@ -231,7 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
...
@@ -231,7 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
)
ENDFUNCTION
()
ENDFUNCTION
()
SET
(
PROTOBUF_VERSION 3.1
)
SET
(
PROTOBUF_VERSION 3.1
.0
)
IF
(
NOT PROTOBUF_FOUND
)
IF
(
NOT PROTOBUF_FOUND
)
build_protobuf
(
extern_protobuf FALSE
)
build_protobuf
(
extern_protobuf FALSE
)
...
...
cmake/flags.cmake
浏览文件 @
f79a3a83
...
@@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag)
...
@@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag)
if
(
${
CMAKE_CXX_COMPILER_VERSION
}
VERSION_LESS 3.3
)
if
(
${
CMAKE_CXX_COMPILER_VERSION
}
VERSION_LESS 3.3
)
message
(
FATAL_ERROR
"Unsupported Clang version. Clang >= 3.3 required."
)
message
(
FATAL_ERROR
"Unsupported Clang version. Clang >= 3.3 required."
)
endif
()
endif
()
endif
()
endif
()
endif
()
endif
()
endfunction
()
endfunction
()
...
@@ -147,6 +147,7 @@ set(GPU_COMMON_FLAGS
...
@@ -147,6 +147,7 @@ set(GPU_COMMON_FLAGS
-Wno-error=unused-function
# Warnings in Numpy Header.
-Wno-error=unused-function
# Warnings in Numpy Header.
-Wno-error=array-bounds
# Warnings in Eigen::array
-Wno-error=array-bounds
# Warnings in Eigen::array
)
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-m64"
)
endif
(
NOT WIN32
)
endif
(
NOT WIN32
)
if
(
APPLE
)
if
(
APPLE
)
...
...
paddle/fluid/API.spec
浏览文件 @
f79a3a83
...
@@ -261,7 +261,7 @@ paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=N
...
@@ -261,7 +261,7 @@ paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=N
paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=
'ignored'
, defaults=(None,))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=
None
, defaults=(None,))
paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
...
...
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
f79a3a83
...
@@ -50,12 +50,15 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
...
@@ -50,12 +50,15 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
cc_library
(
gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor
)
cc_library
(
gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor
)
cc_library
(
fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope
)
cc_library
(
fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope
)
cc_library
(
memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper
)
if
(
WITH_GPU
)
cc_library
(
memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info
)
else
()
cc_library
(
memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info
)
endif
()
cc_library
(
memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass
)
cc_library
(
memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass
)
cc_library
(
inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info
)
cc_library
(
inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info
)
cc_library
(
modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper
)
cc_library
(
modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper
)
cc_library
(
memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass
)
cc_library
(
reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle
)
cc_library
(
reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle
)
cc_library
(
eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper
)
cc_library
(
eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper
)
cc_library
(
eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass
)
cc_library
(
eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass
)
...
@@ -67,13 +70,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
...
@@ -67,13 +70,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
cc_library
(
multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
cc_library
(
multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle
)
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle
)
set
(
SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass
memory_early_delete_pass
inplace_op_pass
)
set
(
SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass
)
if
(
WITH_GPU
)
if
(
WITH_GPU
)
list
(
APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass
)
list
(
APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass
)
endif
()
endif
()
cc_test
(
memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph
)
cc_test
(
memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry
)
cc_test
(
memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass
)
cc_library
(
ssa_graph_executor SRCS ssa_graph_executor.cc DEPS
${
SSA_GRAPH_EXECUTOR_DEPS
}
)
cc_library
(
ssa_graph_executor SRCS ssa_graph_executor.cc DEPS
${
SSA_GRAPH_EXECUTOR_DEPS
}
)
cc_library
(
threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
cc_library
(
threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
...
...
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
f79a3a83
...
@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
...
@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
());
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
());
graph
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
graph
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
all_op_descs
);
// take ownership
all_op_descs
);
// take ownership
graph
->
Set
<
GraphNodePool
>
(
kGraphNodePool
,
new
GraphNodePool
);
// take ownership
pass
->
Erase
(
kAllOpDescs
);
pass
->
Erase
(
kAllOpDescs
);
pass
->
SetNotOwned
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
all_op_descs
);
pass
->
SetNotOwned
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
all_op_descs
);
...
@@ -242,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
...
@@ -242,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
continue
;
continue
;
}
}
}
}
VLOG
(
3
)
<<
"Start Apply Pass "
<<
pass
->
Type
();
graph
=
pass
->
Apply
(
std
::
move
(
graph
));
graph
=
pass
->
Apply
(
std
::
move
(
graph
));
VLOG
(
3
)
<<
"Finish Apply Pass "
<<
pass
->
Type
();
}
}
return
graph
;
return
graph
;
}
}
...
...
paddle/fluid/framework/details/inplace_op_pass.cc
浏览文件 @
f79a3a83
...
@@ -49,7 +49,7 @@ DEFINE_bool(
...
@@ -49,7 +49,7 @@ DEFINE_bool(
"If this option turns on, only these op in whitelist can be inplaced."
"If this option turns on, only these op in whitelist can be inplaced."
"If it turns off, all of the running op can be candidate of inplaced op."
"If it turns off, all of the running op can be candidate of inplaced op."
"Such as scale, elementwise_add"
"Such as scale, elementwise_add"
"By default, it's turned o
n
"
);
"By default, it's turned o
ff
"
);
DECLARE_string
(
memory_optimize_debug
);
DECLARE_string
(
memory_optimize_debug
);
...
@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
...
@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
}
}
}
}
const
SSANodePair
InplacePass
::
TryInplaceModifyVar
(
const
std
::
string
&
var
,
const
NodeSwapQueue
InplacePass
::
TryInplaceModifyVar
(
const
std
::
string
&
cache_var
,
const
std
::
string
&
var
,
const
std
::
string
&
cache_var
,
const
size_t
&
idx
,
const
size_t
&
idx
,
ir
::
Graph
*
graph
)
const
{
ir
::
Graph
*
graph
)
const
{
PADDLE_ENFORCE
(
var_nodes_
[
var
].
size
()
>=
1
&&
PADDLE_ENFORCE
(
var_nodes_
[
var
].
size
()
>=
1
&&
var_nodes_
[
var
].
at
(
0
)
->
Var
()
!=
nullptr
);
var_nodes_
[
var
].
at
(
0
)
->
Var
()
!=
nullptr
);
std
::
unique_ptr
<
VarDesc
>
var_desc
(
new
VarDesc
(
*
var_nodes_
[
var
].
at
(
0
)
->
Var
()));
std
::
unique_ptr
<
VarDesc
>
var_desc
(
new
VarDesc
(
*
var_nodes_
[
var
].
at
(
0
)
->
Var
()));
var_desc
->
SetName
(
cache_var
);
var_desc
->
SetName
(
cache_var
);
SSANodePair
swap_nodes
;
NodeSwapQueue
swap_nodes
;
for
(
size_t
i
=
idx
;
i
<
view_
.
AllOps
().
size
();
++
i
)
{
for
(
size_t
i
=
idx
;
i
<
view_
.
AllOps
().
size
();
++
i
)
{
auto
*
op
=
view_
.
AllOps
()[
i
];
auto
*
op
=
view_
.
AllOps
()[
i
];
...
@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
...
@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
return
swap_nodes
;
return
swap_nodes
;
}
}
void
InplacePass
::
CommitModify
(
const
SSANodePair
&
swap_nodes
,
void
InplacePass
::
CommitModify
(
const
NodeSwapQueue
&
swap_nodes
,
ir
::
Graph
*
graph
)
const
{
ir
::
Graph
*
graph
)
const
{
for
(
auto
&
pair
:
swap_nodes
)
{
for
(
auto
&
pair
:
swap_nodes
)
{
auto
*
node
=
pair
.
first
,
*
cache_node
=
pair
.
second
;
auto
*
node
=
pair
.
first
,
*
cache_node
=
pair
.
second
;
...
@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes,
...
@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes,
}
}
}
}
void
InplacePass
::
WithdrawModify
(
const
SSANodePair
&
nodes
,
void
InplacePass
::
WithdrawModify
(
const
NodeSwapQueue
&
nodes
,
ir
::
Graph
*
graph
)
const
{
ir
::
Graph
*
graph
)
const
{
for
(
auto
&
pair
:
nodes
)
{
for
(
auto
&
pair
:
nodes
)
{
auto
*
node
=
pair
.
first
,
*
cache_node
=
pair
.
second
;
auto
*
node
=
pair
.
first
,
*
cache_node
=
pair
.
second
;
...
@@ -403,18 +402,20 @@ void GraphView::Build(ir::Graph* g) {
...
@@ -403,18 +402,20 @@ void GraphView::Build(ir::Graph* g) {
// 2. track the nodes which used by parameter server.
// 2. track the nodes which used by parameter server.
// these node can not be inplaced, otherwise trainer
// these node can not be inplaced, otherwise trainer
// pserver can not find each other name.
// pserver can not find each other name.
for
(
auto
&
node
:
g
->
Nodes
())
{
auto
update_skip_set
=
[
&
](
ir
::
Node
*
node
)
{
if
(
!
node
->
IsOp
())
continue
;
for
(
auto
&
in
:
node
->
inputs
)
{
if
(
node
->
Name
()
==
"send"
)
{
if
(
in
->
IsVar
()
&&
in
->
Var
()
!=
nullptr
)
dup_nodes_
.
emplace
(
in
->
Name
());
for
(
auto
&
in
:
node
->
inputs
)
{
dup_nodes_
.
emplace
(
in
->
Name
());
}
}
}
if
(
node
->
Name
()
==
"recv"
)
{
for
(
auto
&
out
:
node
->
outputs
)
{
for
(
auto
&
out
:
node
->
outputs
)
{
if
(
out
->
IsVar
()
&&
out
->
Var
()
!=
nullptr
)
dup_nodes_
.
emplace
(
out
->
Name
());
dup_nodes_
.
emplace
(
out
->
Name
());
}
}
}
};
for
(
auto
&
node
:
g
->
Nodes
())
{
if
(
!
node
->
IsOp
())
continue
;
if
(
node
->
Name
()
==
"send"
)
update_skip_set
(
node
);
if
(
node
->
Name
()
==
"recv"
)
update_skip_set
(
node
);
if
(
node
->
Name
()
==
"prefetch"
)
update_skip_set
(
node
);
}
}
}
}
...
...
paddle/fluid/framework/details/inplace_op_pass.h
浏览文件 @
f79a3a83
...
@@ -56,7 +56,8 @@ class GraphView {
...
@@ -56,7 +56,8 @@ class GraphView {
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
adj_list_
;
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
adj_list_
;
};
};
typedef
std
::
vector
<
std
::
pair
<
ir
::
Node
*
,
ir
::
Node
*>>
SSANodePair
;
// swap pairs in sequence
typedef
std
::
vector
<
std
::
pair
<
ir
::
Node
*
,
ir
::
Node
*>>
NodeSwapQueue
;
class
InplacePass
:
public
ir
::
Pass
{
class
InplacePass
:
public
ir
::
Pass
{
public:
public:
InplacePass
();
InplacePass
();
...
@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass {
...
@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass {
void
InitSSAGraphNodes
()
const
;
void
InitSSAGraphNodes
()
const
;
private:
private:
const
SSANodePair
TryInplaceModifyVar
(
const
std
::
string
&
var
,
const
NodeSwapQueue
TryInplaceModifyVar
(
const
std
::
string
&
var
,
const
std
::
string
&
cache_var
,
const
std
::
string
&
cache_var
,
const
size_t
&
idx
,
const
size_t
&
idx
,
ir
::
Graph
*
graph
)
const
;
ir
::
Graph
*
graph
)
const
;
void
CommitModify
(
const
SSANodePair
&
,
ir
::
Graph
*
graph
)
const
;
void
CommitModify
(
const
NodeSwapQueue
&
,
ir
::
Graph
*
graph
)
const
;
void
WithdrawModify
(
const
SSANodePair
&
nodes
,
ir
::
Graph
*
graph
)
const
;
void
WithdrawModify
(
const
NodeSwapQueue
&
nodes
,
ir
::
Graph
*
graph
)
const
;
void
InplaceModifyDesc
(
const
std
::
string
&
in_var
,
const
std
::
string
&
out_var
,
void
InplaceModifyDesc
(
const
std
::
string
&
in_var
,
const
std
::
string
&
out_var
,
const
size_t
&
idx
)
const
;
const
size_t
&
idx
)
const
;
...
...
paddle/fluid/framework/details/memory_optimize_helper.cc
浏览文件 @
f79a3a83
...
@@ -13,17 +13,114 @@
...
@@ -13,17 +13,114 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <deque>
#include <functional>
#include <functional>
#include <i
ostream
>
#include <i
terator
>
#include <numeric>
#include <numeric>
#include <sstream>
#include <sstream>
#include <string>
#include <string>
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h"
#endif // PADDLE_WITH_CUDA
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
using
paddle
::
framework
::
VarDesc
;
std
::
vector
<
ir
::
Node
*>
SortOpLikeDescOrder
(
const
ir
::
Graph
&
graph
)
{
PADDLE_ENFORCE
(
graph
.
Has
(
kAllOpDescs
),
"Graph has no attribute of kAllOpDescs."
);
// 1. get op desc order
auto
&
op_descs
=
graph
.
Get
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
);
// 2. topology sort order
auto
nodes
=
graph
.
Nodes
();
std
::
deque
<
ir
::
Node
*>
ops
;
FilterVariables
(
nodes
,
[
&
](
ir
::
Node
*
op
)
{
if
(
op
->
IsOp
()
&&
op
->
Op
()
!=
nullptr
)
{
ops
.
emplace_back
(
op
);
}
});
std
::
unordered_map
<
ir
::
Node
*
,
size_t
>
op_deps
;
std
::
list
<
ir
::
Node
*>
ready_ops
;
std
::
unordered_map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
pending_ops
;
for
(
auto
*
op
:
ops
)
{
std
::
unordered_set
<
ir
::
Node
*>
preceding_op
;
for
(
auto
*
in
:
op
->
inputs
)
{
if
(
in
->
inputs
.
empty
())
continue
;
PADDLE_ENFORCE
(
in
->
inputs
.
size
()
==
1
&&
in
->
inputs
[
0
]
->
IsOp
());
preceding_op
.
emplace
(
in
->
inputs
[
0
]);
pending_ops
[
in
->
inputs
[
0
]].
emplace
(
op
);
}
op_deps
[
op
]
=
preceding_op
.
size
();
if
(
preceding_op
.
empty
())
{
ready_ops
.
emplace_back
(
op
);
}
}
// 3. generated op list based desc order and the topology order
std
::
vector
<
ir
::
Node
*>
ret
;
std
::
list
<
OpDesc
*>
op_descs_list
(
op_descs
.
begin
(),
op_descs
.
end
());
auto
update_by_found_node
=
[
&
](
ir
::
Node
*
found_node
)
{
for
(
auto
*
pending_op
:
pending_ops
[
found_node
])
{
if
(
--
op_deps
[
pending_op
]
==
0
)
{
ready_ops
.
emplace_back
(
pending_op
);
}
}
ready_ops
.
remove
(
found_node
);
ret
.
emplace_back
(
found_node
);
};
while
(
!
ready_ops
.
empty
())
{
bool
all_of_ready_op_unmatched
=
true
;
for
(
auto
it
=
op_descs_list
.
begin
();
it
!=
op_descs_list
.
end
();)
{
auto
op_desc
=
*
it
;
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
*
op
:
ready_ops
)
{
if
(
IsSameDesc
(
op
->
Op
(),
op_desc
))
{
found_node
=
op
;
break
;
}
}
// 3.1 op desc deleted by other pass
if
(
found_node
==
nullptr
)
{
++
it
;
continue
;
}
else
{
all_of_ready_op_unmatched
=
false
;
it
=
op_descs_list
.
erase
(
it
);
}
update_by_found_node
(
found_node
);
}
// 3.2 op descs are added by other pass
// preceding op non empty means some new op descs are
// created, but not contained in return node list.
// these new op desc may depend on each other.
std
::
list
<
ir
::
Node
*>
prev_ready_ops
(
ready_ops
);
if
(
all_of_ready_op_unmatched
)
{
for
(
auto
op
:
prev_ready_ops
)
{
update_by_found_node
(
op
);
}
}
}
PADDLE_ENFORCE
(
std
::
all_of
(
op_deps
.
begin
(),
op_deps
.
end
(),
[
&
](
const
std
::
pair
<
ir
::
Node
*
,
size_t
>&
p
)
{
return
p
.
second
==
0
;
}));
return
ret
;
}
size_t
NodeSize
InBytes
(
const
VarDesc
&
node
)
{
size_t
NodeSize
(
const
VarDesc
&
node
)
{
auto
shape
=
node
.
GetShape
();
auto
shape
=
node
.
GetShape
();
int
size
=
int
size
=
std
::
accumulate
(
shape
.
begin
(),
shape
.
end
(),
1
,
std
::
multiplies
<
int
>
());
std
::
accumulate
(
shape
.
begin
(),
shape
.
end
(),
1
,
std
::
multiplies
<
int
>
());
...
@@ -31,9 +128,9 @@ size_t NodeSizeInBytes(const VarDesc& node) {
...
@@ -31,9 +128,9 @@ size_t NodeSizeInBytes(const VarDesc& node) {
return
type_size
*
std
::
abs
(
size
);
return
type_size
*
std
::
abs
(
size
);
}
}
size_t
NodeSize
InBytes
(
ir
::
Node
*
n
)
{
size_t
NodeSize
(
ir
::
Node
*
n
)
{
auto
*
desc
=
FindVarDescInBlock
(
n
);
auto
*
desc
=
FindVarDescInBlock
(
n
);
return
NodeSize
InBytes
(
*
desc
);
return
NodeSize
(
*
desc
);
}
}
std
::
string
DebugStringImpl
(
VarDesc
*
var
)
{
std
::
string
DebugStringImpl
(
VarDesc
*
var
)
{
...
@@ -59,7 +156,6 @@ std::string DebugStringImpl(VarDesc* var) {
...
@@ -59,7 +156,6 @@ std::string DebugStringImpl(VarDesc* var) {
std
::
string
DebugString
(
ir
::
Node
*
var
)
{
std
::
string
DebugString
(
ir
::
Node
*
var
)
{
return
DebugStringImpl
(
FindVarDescInBlock
(
var
));
return
DebugStringImpl
(
FindVarDescInBlock
(
var
));
}
}
// return DebugString(var->Var()); }
// NOTE(dzh): based ir node, if a large node has been reused
// NOTE(dzh): based ir node, if a large node has been reused
// by a small size node, then next time it appear in pool, it will
// by a small size node, then next time it appear in pool, it will
...
@@ -76,22 +172,26 @@ struct NodeComparator {
...
@@ -76,22 +172,26 @@ struct NodeComparator {
bool
operator
()(
ir
::
Node
*
lhs
,
ir
::
Node
*
rhs
)
const
{
bool
operator
()(
ir
::
Node
*
lhs
,
ir
::
Node
*
rhs
)
const
{
auto
*
lhs_desc
=
FindVarDescInBlock
(
lhs
);
auto
*
lhs_desc
=
FindVarDescInBlock
(
lhs
);
auto
*
rhs_desc
=
FindVarDescInBlock
(
rhs
);
auto
*
rhs_desc
=
FindVarDescInBlock
(
rhs
);
// match data type
if
(
lhs_desc
->
GetDataType
()
!=
rhs_desc
->
GetDataType
())
{
return
false
;
}
// match shape
auto
lhs_shape
=
lhs_desc
->
GetShape
();
auto
lhs_shape
=
lhs_desc
->
GetShape
();
auto
rhs_shape
=
rhs_desc
->
GetShape
();
auto
rhs_shape
=
rhs_desc
->
GetShape
();
if
((
lhs_shape
[
0
]
==
-
1
&&
rhs_shape
[
0
]
==
-
1
)
||
if
((
lhs_shape
[
0
]
==
-
1
&&
rhs_shape
[
0
]
==
-
1
)
||
(
lhs_shape
[
0
]
!=
-
1
&&
rhs_shape
[
0
]
!=
-
1
))
{
(
lhs_shape
[
0
]
!=
-
1
&&
rhs_shape
[
0
]
!=
-
1
))
{
return
NodeSize
InBytes
(
lhs
)
<=
NodeSizeInBytes
(
rhs
);
return
NodeSize
(
lhs
)
<=
NodeSize
(
rhs
);
}
else
{
}
else
{
return
false
;
return
false
;
}
}
}
}
};
};
void
Ordered
NodeList
::
Insert
(
ir
::
Node
*
var
,
ir
::
Node
*
op
)
{
void
Ordered
Set
::
Insert
(
ir
::
Node
*
var
)
{
PADDLE_ENFORCE
(
var
->
IsVar
()
&&
!
var
->
IsCtrlVar
());
PADDLE_ENFORCE
(
var
->
IsVar
()
&&
!
var
->
IsCtrlVar
());
PADDLE_ENFORCE
(
op
->
IsOp
());
if
(
mark_table_
.
count
(
var
->
Name
())
!=
0
)
{
if
(
mark_table_
.
count
(
var
->
Name
())
!=
0
)
{
mark_table_
[
var
->
Name
()]
->
second
.
insert
(
op
);
mark_table_
[
var
->
Name
()]
->
emplace_back
(
var
);
return
;
return
;
}
}
...
@@ -99,14 +199,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
...
@@ -99,14 +199,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
auto
var_shape
=
var_desc
->
GetShape
();
auto
var_shape
=
var_desc
->
GetShape
();
int
batch_size
=
static_cast
<
int
>
(
var_shape
[
0
]);
int
batch_size
=
static_cast
<
int
>
(
var_shape
[
0
]);
NodeComparator
compare_node
;
NodeComparator
functor
;
Iter
it
=
nodes_
.
begin
();
Iter
it
=
nodes_
.
begin
();
while
(
it
!=
nodes_
.
end
())
{
while
(
it
!=
nodes_
.
end
())
{
auto
*
cache_desc
=
FindVarDescInBlock
(
it
->
first
);
auto
&
prev
=
it
->
front
();
auto
*
cache_desc
=
FindVarDescInBlock
(
prev
);
int
cache_batch_size
=
cache_desc
->
GetShape
()[
0
];
int
cache_batch_size
=
cache_desc
->
GetShape
()[
0
];
if
((
cache_batch_size
==
-
1
&&
batch_size
==
-
1
)
||
if
((
cache_batch_size
==
-
1
&&
batch_size
==
-
1
)
||
(
cache_batch_size
!=
-
1
&&
batch_size
!=
-
1
))
{
(
cache_batch_size
!=
-
1
&&
batch_size
!=
-
1
))
{
if
(
compare_node
(
it
->
first
,
var
))
{
if
(
functor
(
prev
,
var
))
{
++
it
;
++
it
;
}
else
{
}
else
{
break
;
break
;
...
@@ -118,62 +219,127 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
...
@@ -118,62 +219,127 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
}
}
}
}
it
=
it
=
nodes_
.
insert
(
it
,
{
var
});
nodes_
.
insert
(
it
,
std
::
make_pair
(
var
,
std
::
unordered_set
<
ir
::
Node
*>
{
op
}));
mark_table_
[
var
->
Name
()]
=
it
;
mark_table_
[
var
->
Name
()]
=
it
;
}
}
int
Ordered
NodeList
::
GetIndex
(
ir
::
Node
*
var
)
{
int
Ordered
Set
::
GetNodeIndexInPool
(
ir
::
Node
*
var
)
{
return
std
::
distance
(
nodes_
.
begin
(),
mark_table_
[
var
->
Name
()]);
return
std
::
distance
(
nodes_
.
begin
(),
mark_table_
[
var
->
Name
()]);
}
}
ir
::
Node
*
Ordered
NodeList
::
NodeMatch
(
ir
::
Node
*
var
)
const
{
ir
::
Node
*
Ordered
Set
::
FindBestFitNode
(
ir
::
Node
*
var
)
const
{
ir
::
Node
*
found_node
=
nullptr
;
ir
::
Node
*
found_node
=
nullptr
;
NodeComparator
compare_node
;
NodeComparator
functor
;
for
(
auto
it
=
nodes_
.
begin
();
it
!=
nodes_
.
end
();
++
it
)
{
for
(
auto
it
=
nodes_
.
begin
();
it
!=
nodes_
.
end
();
++
it
)
{
if
(
compare_node
(
var
,
it
->
first
))
{
auto
&
candidate
=
it
->
front
();
found_node
=
it
->
first
;
if
(
functor
(
var
,
candidate
))
{
found_node
=
candidate
;
break
;
break
;
}
}
}
}
return
found_node
;
return
found_node
;
}
}
void
OrderedNodeList
::
Erase
(
ir
::
Node
*
var
)
{
Erase
(
var
->
Name
());
}
ir
::
Node
*
OrderedSet
::
FindNextBestFitNode
(
ir
::
Node
*
var
,
ir
::
Node
*
prev
)
const
{
ir
::
Node
*
found_node
=
nullptr
;
NodeComparator
functor
;
auto
it
=
std
::
find_if
(
nodes_
.
begin
(),
nodes_
.
end
(),
[
&
](
const
NodeVector
&
v
)
{
if
(
v
.
front
()
==
prev
)
return
true
;
else
return
false
;
});
PADDLE_ENFORCE
(
it
!=
nodes_
.
end
(),
"Not found previous in node list!"
);
for
(
it
=
std
::
next
(
it
);
it
!=
nodes_
.
end
();
++
it
)
{
auto
&
candidate
=
it
->
front
();
if
(
functor
(
var
,
candidate
))
{
found_node
=
candidate
;
break
;
}
}
return
found_node
;
}
bool
OrderedSet
::
Has
(
ir
::
Node
*
var
)
const
{
if
(
mark_table_
.
count
(
var
->
Name
()))
{
auto
&
node_in_samename
=
mark_table_
.
at
(
var
->
Name
());
auto
iter
=
std
::
find_if
(
node_in_samename
->
begin
(),
node_in_samename
->
end
(),
[
&
](
ir
::
Node
*
n
)
{
return
n
->
Name
()
==
var
->
Name
();
});
return
iter
!=
node_in_samename
->
end
();
}
return
false
;
}
void
Ordered
NodeLis
t
::
Erase
(
const
std
::
string
&
var
)
{
void
Ordered
Se
t
::
Erase
(
const
std
::
string
&
var
)
{
PADDLE_ENFORCE
(
mark_table_
.
count
(
var
));
PADDLE_ENFORCE
(
mark_table_
.
count
(
var
));
nodes_
.
erase
(
mark_table_
[
var
]);
nodes_
.
erase
(
mark_table_
[
var
]);
mark_table_
.
erase
(
var
);
mark_table_
.
erase
(
var
);
}
}
std
::
string
OrderedNodeList
::
ToString
()
const
{
void
OrderedSet
::
Erase
(
ir
::
Node
*
var
)
{
PADDLE_ENFORCE
(
var
!=
nullptr
);
Erase
(
var
->
Name
());
}
std
::
string
OrderedSet
::
ToString
()
const
{
std
::
stringstream
ss
;
std
::
stringstream
ss
;
for
(
auto
it
=
nodes_
.
begin
();
it
!=
nodes_
.
end
();
++
it
)
{
for
(
auto
it
=
nodes_
.
begin
();
it
!=
nodes_
.
end
();
++
it
)
{
ss
<<
DebugString
(
it
->
first
)
<<
" "
;
for
(
auto
&
node
:
*
it
)
{
ss
<<
DebugString
(
node
)
<<
" "
;
}
}
}
return
ss
.
str
();
return
ss
.
str
();
}
}
bool
NodeCanReused
(
ir
::
Node
*
node
)
{
bool
NodeCanReused
(
ir
::
Node
*
node
)
{
// valid the node is a var node
if
(
node
==
nullptr
||
!
node
->
IsVar
()
||
node
->
IsCtrlVar
())
return
false
;
if
(
node
==
nullptr
||
!
node
->
IsVar
()
||
node
->
IsCtrlVar
())
return
false
;
// auto* desc = node->Var();
bool
flag
=
NodeCanReused
(
*
node
->
Var
());
bool
flag
=
true
;
// op output force generated in cpu, can not be reused.
for
(
auto
*
op
:
node
->
inputs
)
{
for
(
auto
*
op
:
node
->
inputs
)
{
if
(
op
->
Op
()
->
HasAttr
(
"force_cpu"
))
{
if
(
op
->
Op
()
->
HasAttr
(
"force_cpu"
))
{
// op output force generated in cpu, can not be reused.
flag
&=
framework
::
AttrReader
(
op
->
Op
()
->
GetAttrMap
())
flag
&=
framework
::
AttrReader
(
op
->
Op
()
->
GetAttrMap
())
.
Get
<
bool
>
(
"force_cpu"
)
==
0
;
.
Get
<
bool
>
(
"force_cpu"
)
==
0
;
}
}
}
}
// var desc validation.
flag
&=
NodeCanReused
(
*
node
->
Var
());
return
flag
;
return
flag
;
}
}
int
MinChunkSize
()
{
int
size
{
0
};
#ifdef PADDLE_WITH_CUDA
size
=
platform
::
GpuMinChunkSize
();
#else
size
=
platform
::
CpuMinChunkSize
();
#endif // PADDLE_WITH_CUDA
return
size
;
}
bool
NodeCanReused
(
const
VarDesc
&
node
)
{
bool
NodeCanReused
(
const
VarDesc
&
node
)
{
auto
type
=
node
.
GetType
();
auto
type
=
node
.
GetType
();
if
(
node
.
Persistable
()
||
type
!=
proto
::
VarType
::
LOD_TENSOR
||
// only these types holds bulk of gpu memory
node
.
GetShape
().
empty
())
{
if
(
!
(
type
==
proto
::
VarType
::
LOD_TENSOR
||
type
==
proto
::
VarType
::
SELECTED_ROWS
||
type
==
proto
::
VarType
::
LOD_TENSOR_ARRAY
))
{
return
false
;
}
// persistable variable is parameter
if
(
node
.
Persistable
())
{
return
false
;
}
// shape < min_chunk_size is meaningless.
// further more, fetched loss always has size = 1
// which should not be reused.
auto
shape
=
node
.
GetShape
();
int
size
=
std
::
abs
(
std
::
accumulate
(
shape
.
begin
(),
shape
.
end
(),
1
,
std
::
multiplies
<
int
>
()));
if
(
shape
.
empty
()
||
size
<
MinChunkSize
())
{
return
false
;
return
false
;
}
}
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
...
@@ -193,6 +359,176 @@ bool OpHasSubBlock(OpDesc* desc) {
...
@@ -193,6 +359,176 @@ bool OpHasSubBlock(OpDesc* desc) {
return
false
;
return
false
;
}
}
ControlFlowGraph
::
ControlFlowGraph
(
const
ir
::
Graph
&
graph
)
{
ops_
=
SortOpLikeDescOrder
(
graph
);
ConnectNodes
();
}
void
ControlFlowGraph
::
BuildCFGGraph
()
{
// FIXME(dzh): same effect with ConnectNodes, but use the control
// link to build dependency graph, it goes wrong in transformer.
for
(
ir
::
Node
*
op
:
ops_
)
{
for
(
auto
&
input_var
:
op
->
inputs
)
{
if
(
!
input_var
->
inputs
.
empty
())
{
PADDLE_ENFORCE
(
input_var
->
inputs
.
size
()
==
1
&&
input_var
->
inputs
[
0
]
->
IsOp
(),
"Preceding Op Node of Var Node must be unique"
);
auto
*
pred_op
=
input_var
->
inputs
[
0
];
if
(
pred_op
->
Op
()
!=
nullptr
)
{
predecessors_
[
op
].
insert
(
pred_op
);
successors_
[
pred_op
].
insert
(
op
);
}
}
if
(
input_var
->
IsVar
()
&&
!
input_var
->
IsCtrlVar
())
{
uses_
[
op
].
insert
(
input_var
->
Name
());
}
}
for
(
auto
&
output_var
:
op
->
outputs
)
{
// output var may be used by many op
for
(
auto
*
succ_op
:
output_var
->
outputs
)
{
if
(
succ_op
->
Op
()
!=
nullptr
)
{
successors_
[
op
].
insert
(
succ_op
);
predecessors_
[
succ_op
].
insert
(
op
);
}
}
if
(
output_var
->
IsVar
()
&&
!
output_var
->
IsCtrlVar
())
{
defs_
[
op
].
insert
(
output_var
->
Name
());
}
}
}
}
void
ControlFlowGraph
::
ConnectNodes
()
{
for
(
size_t
i
=
0
;
i
<
ops_
.
size
();
++
i
)
{
auto
&
op
=
ops_
[
i
];
try
{
auto
&
next_op
=
ops_
.
at
(
i
+
1
);
successors_
[
op
].
insert
(
next_op
);
predecessors_
[
next_op
].
insert
(
op
);
}
catch
(...)
{
// do nothing
}
FilterVariables
(
op
->
inputs
,
[
&
](
ir
::
Node
*
var
)
{
uses_
[
op
].
emplace
(
var
->
Name
());
});
FilterVariables
(
op
->
outputs
,
[
&
](
ir
::
Node
*
var
)
{
defs_
[
op
].
emplace
(
var
->
Name
());
});
}
}
void
ControlFlowGraph
::
LiveVariableAnalysis
()
{
// NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
// compute the liveness of for each variable though reversed_ops algorithm.
// It iterates the operators from end to begin, compute the live in/live out
// variable set for each op, then the diff between in/out will be used for
// the variable reuse. For detail refer to
// http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
std
::
list
<
ir
::
Node
*>
work_list
(
ops_
.
rbegin
(),
ops_
.
rend
());
while
(
!
work_list
.
empty
())
{
ir
::
Node
*
op
=
work_list
.
front
();
work_list
.
pop_front
();
// get the live_in calculated before. Empty if first.
auto
prev_live_in
=
std
::
move
(
live_in_
[
op
]);
for
(
auto
&
s
:
successors_
[
op
])
{
for
(
auto
&
var
:
live_in_
[
s
])
{
live_out_
[
op
].
insert
(
var
);
}
}
for
(
auto
&
var
:
uses_
[
op
])
{
live_in_
[
op
].
insert
(
var
);
}
for
(
auto
&
var
:
live_out_
[
op
])
{
live_in_
[
op
].
insert
(
var
);
}
for
(
auto
&
var
:
defs_
[
op
])
{
live_in_
[
op
].
erase
(
var
);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if
(
live_in_
[
op
]
!=
prev_live_in
)
{
for
(
auto
&
pre
:
predecessors_
[
op
])
{
work_list
.
push_back
(
pre
);
}
}
}
}
void
ControlFlowGraph
::
RenameVarInCFGGraph
(
const
std
::
string
&
old_node
,
const
std
::
string
&
new_node
,
int
begin_idx
)
{
// update graph from begin idx to the end
for
(
size_t
i
=
begin_idx
;
i
!=
ops_
.
size
();
++
i
)
{
auto
*
op
=
ops_
[
i
];
if
(
uses_
[
op
].
find
(
old_node
)
!=
uses_
[
op
].
end
())
{
uses_
[
op
].
erase
(
old_node
);
uses_
[
op
].
insert
(
new_node
);
}
if
(
defs_
[
op
].
find
(
old_node
)
!=
defs_
[
op
].
end
())
{
defs_
[
op
].
erase
(
old_node
);
defs_
[
op
].
insert
(
new_node
);
}
if
(
live_in_
[
op
].
find
(
old_node
)
!=
live_in_
[
op
].
end
())
{
live_in_
[
op
].
erase
(
old_node
);
live_in_
[
op
].
insert
(
new_node
);
}
if
(
live_out_
[
op
].
find
(
old_node
)
!=
live_out_
[
op
].
end
())
{
live_out_
[
op
].
erase
(
old_node
);
live_out_
[
op
].
insert
(
new_node
);
}
}
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
LiveIn
(
ir
::
Node
*
op
)
const
{
auto
it
=
live_in_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
live_in_
.
end
(),
string
::
Sprintf
(
"Expect %s in live_in, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
LiveOut
(
ir
::
Node
*
op
)
const
{
auto
it
=
live_out_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
live_out_
.
end
(),
string
::
Sprintf
(
"Expect %s in live_out, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
Use
(
ir
::
Node
*
op
)
const
{
auto
it
=
uses_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
uses_
.
end
(),
string
::
Sprintf
(
"Expect %s in live_out, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
}
const
std
::
vector
<
ir
::
Node
*>
ControlFlowGraph
::
Ops
()
const
{
return
ops_
;
}
std
::
vector
<
ir
::
Node
*>&
ControlFlowGraph
::
Ops
()
{
return
ops_
;
}
ir
::
Node
*
ControlFlowGraph
::
GetNodeByName
(
const
std
::
string
&
name
,
ir
::
Node
*
op
)
const
{
// in ssa-graph, different version nodes have same name,
// this function get the latest version var before target op
// It may return nullptr, such as data node.
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
*
node
:
ops_
)
{
if
(
node
==
op
)
break
;
for
(
auto
&
output
:
node
->
outputs
)
{
PADDLE_ENFORCE
((
output
!=
nullptr
&&
output
->
IsVar
()),
"Output is empty!"
);
if
(
output
->
Var
()
&&
output
->
Name
()
==
name
)
{
found_node
=
output
;
}
}
}
return
found_node
;
}
}
// namespace details
}
// namespace details
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/details/memory_optimize_helper.h
浏览文件 @
f79a3a83
...
@@ -17,6 +17,8 @@
...
@@ -17,6 +17,8 @@
#include <iostream>
#include <iostream>
#include <iterator>
#include <iterator>
#include <list>
#include <list>
#include <map>
#include <set>
#include <string>
#include <string>
#include <utility>
#include <utility>
#include <vector>
#include <vector>
...
@@ -27,41 +29,43 @@ namespace paddle {
...
@@ -27,41 +29,43 @@ namespace paddle {
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
constexpr
char
kFetchedVars
[]
=
"fetched_vars"
;
constexpr
char
kAllOpDescs
[]
=
"all_op_descs"
;
constexpr
char
kGraphNodePool
[]
=
"graph_node_pool"
;
// NOTE(dzh): Variable and the operators use the var.
std
::
vector
<
ir
::
Node
*>
SortOpLikeDescOrder
(
const
ir
::
Graph
&
graph
);
// for early delete pass.
// Because analysis var pass build base on ir::Node, which maybe released
// or modified between passes, so we use OpDesc* to mark ops.
using
GraphNodePool
=
std
::
vector
<
std
::
pair
<
std
::
string
/*var node*/
,
std
::
unordered_set
<
OpDesc
*>
/* ops */
>>
;
// NOTE(dzh): by default, it sort node in ascend order(by node bytes size).
// NOTE(dzh): A ordered set for node reuse in memory optimize.
// in fluid, -1 means the batch_size is determined in runtime.
// the orderedset sort node in ascend order(by node bytes size).
// the node batch_size equal -1 always ranking in the front than the node not.
// in fluid, -1 means the batch_size, which is determined in runtime.
// So the reuse happens between nodes who's batch_size both are -1
// simultaneously or not.
//
// sort rule:
// rule 0 : smaller node ranking in front.
// rule 1 : batch_size equal -1 ranking in the front than the node not.
//
// For example,
// For example,
// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
// O(1) insert, delete
class
OrderedNodeList
{
public:
using
NodePair
=
std
::
pair
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
;
using
Iter
=
typename
std
::
list
<
NodePair
>::
iterator
;
using
ConstIter
=
typename
std
::
list
<
NodePair
>::
const_iterator
;
void
Insert
(
ir
::
Node
*
var
,
ir
::
Node
*
op
);
class
OrderedSet
{
public:
// nodes with same name exists in pool.
using
NodeVector
=
std
::
vector
<
ir
::
Node
*>
;
using
Iter
=
typename
std
::
list
<
NodeVector
>::
iterator
;
using
ConstIter
=
typename
std
::
list
<
NodeVector
>::
const_iterator
;
void
Insert
(
ir
::
Node
*
var
);
void
Erase
(
ir
::
Node
*
var
);
void
Erase
(
ir
::
Node
*
var
);
void
Erase
(
const
std
::
string
&
var
);
void
Erase
(
const
std
::
string
&
var
);
bool
Has
(
ir
::
Node
*
var
)
const
;
bool
Has
(
ir
::
Node
*
var
)
{
return
mark_table_
.
count
(
var
->
Name
());
}
void
Clear
()
{
mark_table_
.
clear
();
bool
Has
(
const
std
::
string
&
var
)
{
return
mark_table_
.
count
(
var
);
}
nodes_
.
clear
();
}
ir
::
Node
*
NodeMatch
(
ir
::
Node
*
var
)
const
;
// find the bestfit shape node block with var.
ir
::
Node
*
FindBestFitNode
(
ir
::
Node
*
var
)
const
;
ir
::
Node
*
FindNextBestFitNode
(
ir
::
Node
*
var
,
ir
::
Node
*
prev
)
const
;
// map store non-const iterator, can not promise const
// map store non-const iterator, can not promise const
int
Get
Index
(
ir
::
Node
*
var
);
int
Get
NodeIndexInPool
(
ir
::
Node
*
var
);
// pool all node to string
// pool all node to string
std
::
string
ToString
()
const
;
std
::
string
ToString
()
const
;
...
@@ -69,18 +73,54 @@ class OrderedNodeList {
...
@@ -69,18 +73,54 @@ class OrderedNodeList {
Iter
end
()
{
return
nodes_
.
end
();
}
Iter
end
()
{
return
nodes_
.
end
();
}
ConstIter
begin
()
const
{
return
nodes_
.
begin
();
}
ConstIter
begin
()
const
{
return
nodes_
.
begin
();
}
ConstIter
end
()
const
{
return
nodes_
.
end
();
}
ConstIter
end
()
const
{
return
nodes_
.
end
();
}
size_t
size
()
const
{
return
nodes_
.
size
();
}
void
Clear
()
{
size_t
size
()
const
{
return
nodes_
.
size
();
}
mark_table_
.
clear
();
nodes_
.
clear
();
}
private:
private:
// for searching.
// for searching.
std
::
unordered_map
<
std
::
string
,
Iter
>
mark_table_
;
std
::
unordered_map
<
std
::
string
,
Iter
>
mark_table_
;
// node swap pairs. var -> ops dep var
// node pool
std
::
list
<
NodePair
>
nodes_
;
std
::
list
<
NodeVector
>
nodes_
;
};
class
ControlFlowGraph
{
public:
ControlFlowGraph
()
=
default
;
// IR Graph
explicit
ControlFlowGraph
(
const
ir
::
Graph
&
graph
);
void
LiveVariableAnalysis
();
void
RenameVarInCFGGraph
(
const
std
::
string
&
old_node
,
const
std
::
string
&
new_node
,
int
begin_idx
);
const
std
::
set
<
std
::
string
>
LiveIn
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>
LiveOut
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>
Use
(
ir
::
Node
*
op
)
const
;
const
std
::
vector
<
ir
::
Node
*>
Ops
()
const
;
std
::
vector
<
ir
::
Node
*>&
Ops
();
// for ssa-graph nodes
ir
::
Node
*
GetNodeByName
(
const
std
::
string
&
name
,
ir
::
Node
*
op
)
const
;
private:
void
BuildCFGGraph
();
void
ConnectNodes
();
using
NodeListMap
=
std
::
unordered_map
<
ir
::
Node
*
,
std
::
set
<
ir
::
Node
*>>
;
using
VarSetMap
=
std
::
map
<
ir
::
Node
*
,
std
::
set
<
std
::
string
>>
;
// successors ops use the output variables.
NodeListMap
successors_
;
// predecessors ops generated input variables.
NodeListMap
predecessors_
;
// variables lived before run current op.
VarSetMap
live_in_
;
// variables lived after run current op.
VarSetMap
live_out_
;
VarSetMap
uses_
;
// op inputs
VarSetMap
defs_
;
// op outputs
std
::
vector
<
ir
::
Node
*>
ops_
;
// op sequence by topology sort
};
};
// valid a tensor can be reuse or not
// valid a tensor can be reuse or not
...
@@ -93,15 +133,24 @@ bool NodeCanReused(const VarDesc& node);
...
@@ -93,15 +133,24 @@ bool NodeCanReused(const VarDesc& node);
bool
OpHasSubBlock
(
OpDesc
*
desc
);
bool
OpHasSubBlock
(
OpDesc
*
desc
);
// node memory size in bytes
// node memory size in bytes
size_t
NodeSize
InBytes
(
ir
::
Node
*
n
);
size_t
NodeSize
(
ir
::
Node
*
n
);
// node memory size in bytes
// node memory size in bytes
size_t
NodeSize
InBytes
(
const
VarDesc
&
);
size_t
NodeSize
(
const
VarDesc
&
);
std
::
string
DebugString
(
ir
::
Node
*
var
);
std
::
string
DebugString
(
ir
::
Node
*
var
);
// NOTE(dzhwinter)
// after node reuse, the replaced node shape is
// different with its VarDesc. So need to find the
// correct VarDesc in Block.
VarDesc
*
FindVarDescInBlock
(
ir
::
Node
*
n
);
VarDesc
*
FindVarDescInBlock
(
ir
::
Node
*
n
);
static
inline
bool
IsSameDesc
(
OpDesc
*
op1
,
OpDesc
*
op2
)
{
return
op1
->
Type
()
==
op2
->
Type
()
&&
op1
->
Inputs
()
==
op2
->
Inputs
()
&&
op1
->
Outputs
()
==
op2
->
Outputs
();
}
template
<
typename
Container
,
typename
Callback
>
template
<
typename
Container
,
typename
Callback
>
class
FilterVariableImpl
{
class
FilterVariableImpl
{
public:
public:
...
...
paddle/fluid/framework/details/memory_optimize_helper_test.cc
浏览文件 @
f79a3a83
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include <algorithm>
#include <algorithm>
#include <iostream>
#include <iostream>
#include <iterator>
#include <memory>
#include <memory>
#include <sstream>
#include <sstream>
#include <string>
#include <string>
...
@@ -22,13 +23,19 @@
...
@@ -22,13 +23,19 @@
#include <vector>
#include <vector>
#include "glog/logging.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
TEST
(
Ordered
NodeLis
t
,
Normal
)
{
TEST
(
Ordered
Se
t
,
Normal
)
{
Ordered
NodeLis
t
pool
;
Ordered
Se
t
pool
;
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
nodes
;
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
nodes
;
// clang-format off
// clang-format off
...
@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) {
...
@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) {
nodes
.
emplace_back
(
std
::
move
(
node
));
nodes
.
emplace_back
(
std
::
move
(
node
));
}
}
// Insert
for
(
auto
&
node
:
nodes
)
{
for
(
auto
&
node
:
nodes
)
{
pool
.
Insert
(
node
.
get
(),
op
.
get
());
pool
.
Insert
(
node
.
get
());
}
// Has/size
ASSERT_EQ
(
pool
.
size
(),
shapes
.
size
());
for
(
auto
&
node
:
nodes
)
{
ASSERT_TRUE
(
pool
.
Has
(
node
.
get
()));
}
}
// assert its order and interface.
// assert its order and interface.
...
@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) {
...
@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) {
std
::
cout
<<
pool
.
ToString
()
<<
std
::
endl
;
std
::
cout
<<
pool
.
ToString
()
<<
std
::
endl
;
ASSERT_EQ
(
pool
.
size
(),
static_cast
<
size_t
>
(
COUNT
-
1
));
ASSERT_EQ
(
pool
.
size
(),
static_cast
<
size_t
>
(
COUNT
-
1
));
ASSERT_EQ
(
pool
.
Get
Index
(
nodes
.
back
().
get
()),
0
);
ASSERT_EQ
(
pool
.
Get
NodeIndexInPool
(
nodes
.
back
().
get
()),
0
);
{
{
auto
v1
=
block_desc
->
Var
(
"11"
);
auto
v1
=
block_desc
->
Var
(
"11"
);
v1
->
SetShape
({
-
1
,
256
,
56
,
56
});
v1
->
SetShape
({
-
1
,
256
,
56
,
56
});
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v1
);
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v1
);
node1
->
inputs
.
emplace_back
(
op
.
get
());
node1
->
inputs
.
emplace_back
(
op
.
get
());
auto
*
cache
=
pool
.
NodeMatch
(
node1
.
get
());
auto
*
cache
=
pool
.
FindBestFitNode
(
node1
.
get
());
ASSERT_EQ
(
cache
,
nullptr
);
ASSERT_EQ
(
cache
,
nullptr
);
}
}
{
{
...
@@ -81,16 +95,447 @@ TEST(OrderedNodeList, Normal) {
...
@@ -81,16 +95,447 @@ TEST(OrderedNodeList, Normal) {
v2
->
SetShape
({
-
1
,
2
,
5
});
v2
->
SetShape
({
-
1
,
2
,
5
});
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v2
);
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v2
);
node1
->
inputs
.
emplace_back
(
op
.
get
());
node1
->
inputs
.
emplace_back
(
op
.
get
());
auto
*
cache
=
pool
.
NodeMatch
(
node1
.
get
());
auto
*
cache
=
pool
.
FindBestFitNode
(
node1
.
get
());
ASSERT_EQ
(
pool
.
Get
Index
(
cache
),
2
);
// match 6:[-1,2,5]
ASSERT_EQ
(
pool
.
Get
NodeIndexInPool
(
cache
),
2
);
// match 6:[-1,2,5]
}
}
{
{
auto
v3
=
block_desc
->
Var
(
"13"
);
auto
v3
=
block_desc
->
Var
(
"13"
);
v3
->
SetShape
({
2
,
5
});
v3
->
SetShape
({
2
,
5
});
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v3
);
std
::
unique_ptr
<
ir
::
Node
>
node1
=
ir
::
CreateNodeForTest
(
v3
);
node1
->
inputs
.
emplace_back
(
op
.
get
());
node1
->
inputs
.
emplace_back
(
op
.
get
());
auto
*
cache
=
pool
.
NodeMatch
(
node1
.
get
());
auto
*
cache
=
pool
.
FindBestFitNode
(
node1
.
get
());
ASSERT_EQ
(
pool
.
GetIndex
(
cache
),
5
);
// match 4:[5,2]
ASSERT_EQ
(
pool
.
GetNodeIndexInPool
(
cache
),
5
);
// match 4:[5,2]
}
}
TEST
(
OrderedSet
,
FindBestFitNode
)
{
OrderedSet
pool
;
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
nodes
;
ProgramDesc
prog
;
BlockDesc
*
block_desc
=
prog
.
MutableBlock
(
0
);
auto
*
op_desc
=
block_desc
->
AppendOp
();
op_desc
->
SetType
(
"dummy"
);
std
::
unique_ptr
<
ir
::
Node
>
op
=
ir
::
CreateNodeForTest
(
op_desc
);
{
auto
desc
=
block_desc
->
Var
(
"a"
);
desc
->
SetShape
({
128
,
128
});
std
::
unique_ptr
<
ir
::
Node
>
node
=
ir
::
CreateNodeForTest
(
desc
);
node
->
inputs
.
emplace_back
(
op
.
get
());
nodes
.
emplace_back
(
std
::
move
(
node
));
}
{
auto
desc
=
block_desc
->
Var
(
"b"
);
desc
->
SetShape
({
128
,
129
});
std
::
unique_ptr
<
ir
::
Node
>
node
=
ir
::
CreateNodeForTest
(
desc
);
node
->
inputs
.
emplace_back
(
op
.
get
());
nodes
.
emplace_back
(
std
::
move
(
node
));
}
{
auto
desc
=
block_desc
->
Var
(
"c"
);
desc
->
SetShape
({
128
,
128
});
std
::
unique_ptr
<
ir
::
Node
>
node
=
ir
::
CreateNodeForTest
(
desc
);
node
->
inputs
.
emplace_back
(
op
.
get
());
nodes
.
emplace_back
(
std
::
move
(
node
));
}
for
(
auto
&
node
:
nodes
)
{
pool
.
Insert
(
node
.
get
());
}
// FindNextBestFitNode
auto
*
n
=
nodes
[
0
].
get
();
auto
*
cache
=
pool
.
FindBestFitNode
(
n
);
PADDLE_ENFORCE
(
cache
->
Name
()
==
"a"
);
cache
=
pool
.
FindNextBestFitNode
(
n
,
cache
);
PADDLE_ENFORCE
(
cache
->
Name
()
==
"c"
);
cache
=
pool
.
FindNextBestFitNode
(
n
,
cache
);
PADDLE_ENFORCE
(
cache
->
Name
()
==
"b"
);
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
REGISTER_OPERATOR
(
sum
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
SumOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
REGISTER_OPERATOR
(
assign
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
AssignOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
REGISTER_OPERATOR
(
dummy
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
SumOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace
paddle
{
namespace
framework
{
namespace
details
{
inline
static
ProgramDesc
FillProgramDesc
()
{
ProgramDesc
prog
;
prog
.
MutableBlock
(
0
)
->
Var
(
"a"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"b"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"c"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"d"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"e"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"a"
});
op
->
SetOutput
(
"Out"
,
{
"b"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"a"
});
op
->
SetOutput
(
"Out"
,
{
"c"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"d"
});
op
->
SetOutput
(
"Out"
,
{
"e"
});
}
return
prog
;
}
TEST
(
CFGGraph
,
IRGraph
)
{
// prepare ir graph
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
ControlFlowGraph
cfg
(
graph
);
cfg
.
LiveVariableAnalysis
();
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
0
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
,
"b"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
0
])));
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
,
"b"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
1
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"b"
,
"c"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
1
])));
// test sum op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"b"
,
"c"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
2
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"d"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
2
])));
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"d"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
3
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
3
])));
}
// 1. normal test
TEST
(
SortOpLikeDescOrder
,
NormalTest
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
SortOpLikeDescOrder
(
graph
);
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
for
(
size_t
i
=
0
;
i
<
nodes
.
size
();
++
i
)
{
auto
node
=
nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 2. remove some op_desc
TEST
(
SortOpLikeDescOrder
,
RemoveOpDesc
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
graph
.
Nodes
();
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
node
:
nodes
)
{
if
(
node
->
IsOp
()
&&
node
->
outputs
.
back
()
->
Name
()
==
"e"
)
{
found_node
=
node
;
break
;
}
}
PADDLE_ENFORCE
(
found_node
!=
nullptr
);
for
(
auto
it
=
op_descs
.
begin
();
it
!=
op_descs
.
end
();)
{
if
(
IsSameDesc
(
*
it
,
found_node
->
Op
()))
{
it
=
op_descs
.
erase
(
it
);
}
else
{
++
it
;
}
}
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
std
::
remove
(
d
->
outputs
.
begin
(),
d
->
outputs
.
end
(),
found_node
);
graph
.
RemoveNode
(
found_node
);
graph
.
RemoveNode
(
e
);
// other node keeps the same order
auto
remain_nodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
remain_nodes
.
size
();
++
i
)
{
auto
node
=
remain_nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 3. add some op_desc
TEST
(
SortOpLikeDescOrder
,
AddOpDesc
)
{
auto
prog
=
FillProgramDesc
();
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
ir
::
Graph
graph
(
prog
);
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
// cached desc different with real one
// mimic the intermidiete pass modify the programdesc.
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d1"
});
ir
::
Node
*
node
=
graph
.
CreateOpNode
(
op
);
ir
::
Node
*
d1
=
graph
.
CreateVarNode
(
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
));
ir
::
Node
*
b
=
find_node_in_graph
(
"b"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
node
->
outputs
.
emplace_back
(
d1
);
node
->
inputs
.
emplace_back
(
b
);
node
->
inputs
.
emplace_back
(
c
);
d1
->
inputs
.
emplace_back
(
node
);
b
->
outputs
.
emplace_back
(
node
);
c
->
outputs
.
emplace_back
(
node
);
op_descs
.
insert
(
op_descs
.
begin
()
+
4
,
op
);
auto
nodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
nodes
.
size
();
++
i
)
{
auto
node
=
nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 4. add and delete some op_desc
TEST
(
SortOpLikeDescOrder
,
AddAndDeleteOpDesc
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
// remove sum node
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
ir
::
Node
*
found_node
=
nullptr
;
auto
nodes
=
graph
.
Nodes
();
for
(
auto
node
:
nodes
)
{
if
(
node
->
Name
()
==
"sum"
)
{
found_node
=
node
;
break
;
}
}
PADDLE_ENFORCE
(
found_node
!=
nullptr
);
for
(
auto
it
=
op_descs
.
begin
();
it
!=
op_descs
.
end
();)
{
if
(
IsSameDesc
(
*
it
,
found_node
->
Op
()))
{
it
=
op_descs
.
erase
(
it
);
}
else
{
++
it
;
}
}
{
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
std
::
remove
(
d
->
outputs
.
begin
(),
d
->
outputs
.
end
(),
found_node
);
std
::
remove
(
c
->
outputs
.
begin
(),
c
->
outputs
.
end
(),
found_node
);
ir
::
Node
*
pending_op
=
found_node
->
outputs
[
0
]
->
outputs
[
0
];
graph
.
RemoveNode
(
e
);
graph
.
RemoveNode
(
pending_op
);
graph
.
RemoveNode
(
found_node
);
}
// add node
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d1"
});
{
ir
::
Node
*
node
=
graph
.
CreateOpNode
(
op
);
ir
::
Node
*
d1
=
graph
.
CreateVarNode
(
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
));
ir
::
Node
*
b
=
find_node_in_graph
(
"b"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
node
->
outputs
.
emplace_back
(
d1
);
node
->
inputs
.
emplace_back
(
b
);
node
->
inputs
.
emplace_back
(
c
);
b
->
outputs
.
emplace_back
(
node
);
c
->
outputs
.
emplace_back
(
node
);
}
op_descs
.
insert
(
op_descs
.
begin
()
+
2
,
op
);
// check the order
auto
mynodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
mynodes
.
size
();
++
i
)
{
auto
node
=
mynodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 5. add and replace some op_desc inplace.
TEST
(
SortOpLikeDescOrder
,
AddAndReplaceOpDescInplace
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
// add node
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d1"
});
{
ir
::
Node
*
node
=
graph
.
CreateOpNode
(
op
);
ir
::
Node
*
d1
=
graph
.
CreateVarNode
(
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
));
ir
::
Node
*
b
=
find_node_in_graph
(
"b"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
node
->
outputs
.
emplace_back
(
d1
);
node
->
inputs
.
emplace_back
(
b
);
node
->
inputs
.
emplace_back
(
c
);
d1
->
inputs
.
emplace_back
(
node
);
b
->
outputs
.
emplace_back
(
node
);
c
->
outputs
.
emplace_back
(
node
);
}
op_descs
.
emplace_back
(
op
);
// replace op_desc inplace
auto
nodes
=
graph
.
Nodes
();
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
node
:
nodes
)
{
if
(
node
->
IsOp
()
&&
node
->
Op
()
&&
node
->
Name
()
==
"assign"
)
{
if
(
node
->
outputs
.
size
()
==
1
&&
node
->
outputs
[
0
]
->
Name
()
==
"e"
)
{
found_node
=
node
;
break
;
}
}
}
{
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
std
::
remove
(
d
->
outputs
.
begin
(),
d
->
outputs
.
end
(),
found_node
);
std
::
remove
(
e
->
inputs
.
begin
(),
e
->
inputs
.
end
(),
found_node
);
graph
.
RemoveNode
(
found_node
);
}
op_descs
.
erase
(
op_descs
.
begin
()
+
3
);
auto
replace_op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
replace_op
->
SetType
(
"sum"
);
replace_op
->
SetInput
(
"X"
,
{
"d"
,
"d1"
});
replace_op
->
SetOutput
(
"Out"
,
{
"e"
});
{
ir
::
Node
*
sum2
=
graph
.
CreateOpNode
(
replace_op
);
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
ir
::
Node
*
d1
=
find_node_in_graph
(
"d1"
);
sum2
->
inputs
.
emplace_back
(
d
);
sum2
->
inputs
.
emplace_back
(
d1
);
sum2
->
outputs
.
emplace_back
(
e
);
e
->
inputs
.
emplace_back
(
sum2
);
d
->
outputs
.
emplace_back
(
sum2
);
d1
->
outputs
.
emplace_back
(
sum2
);
}
op_descs
.
emplace_back
(
replace_op
);
// compare op order
auto
graph_nodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
graph_nodes
.
size
();
++
i
)
{
auto
node
=
graph_nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
}
}
...
...
paddle/fluid/framework/details/memory_optimize_pass.cc
浏览文件 @
f79a3a83
...
@@ -43,16 +43,10 @@ namespace paddle {
...
@@ -43,16 +43,10 @@ namespace paddle {
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
static
inline
bool
IsSameDesc
(
OpDesc
*
op1
,
OpDesc
*
op2
)
{
return
op1
->
Type
()
==
op2
->
Type
()
&&
op1
->
Inputs
()
==
op2
->
Inputs
()
&&
op1
->
Outputs
()
==
op2
->
Outputs
();
}
std
::
unique_ptr
<
ir
::
Graph
>
MemoryOptimizePass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
MemoryOptimizePass
::
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
auto
nodes
=
graph
->
Nodes
();
auto
nodes
=
graph
->
Nodes
();
auto
subblock_vars
=
GetSubBlockVars
(
nodes
);
CollectSkipVarsSet
(
nodes
);
skip_set_
.
insert
(
subblock_vars
.
begin
(),
subblock_vars
.
end
());
cfg_
.
reset
(
new
details
::
ControlFlowGraph
(
*
graph
));
cfg_
.
reset
(
new
details
::
ControlFlowGraph
(
*
graph
));
cfg_
->
LiveVariableAnalysis
();
cfg_
->
LiveVariableAnalysis
();
...
@@ -75,82 +69,67 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
...
@@ -75,82 +69,67 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
}
}
for
(
auto
&
var
:
op
->
outputs
)
{
for
(
auto
&
var
:
op
->
outputs
)
{
if
(
!
NodeCanReused
(
var
)
||
cfg_
->
Use
(
op
).
count
(
var
->
Name
())
==
0
||
if
(
var
->
IsVar
()
&&
!
var
->
IsCtrlVar
()
&&
skip_set_
.
count
(
var
->
Name
()))
{
skip_set_
.
count
(
var
->
Name
()))
VLOG
(
3
)
<<
"Skip set contains variable of "
<<
var
->
Name
()
<<
"disable reuse on it. skipped"
;
continue
;
continue
;
ir
::
Node
*
cache
=
pool_
.
NodeMatch
(
var
);
if
(
var
->
Name
()
==
FLAGS_memory_optimize_debug
)
{
VLOG
(
3
)
<<
"start match var "
<<
DebugString
(
var
)
<<
" of op "
<<
op
->
Name
();
VLOG
(
3
)
<<
pool_
.
ToString
();
VLOG
(
3
)
<<
"matched in pool : "
<<
((
cache
==
nullptr
)
?
"False"
:
"True"
);
}
}
if
(
NodeCanReused
(
var
)
&&
cfg_
->
Use
(
op
).
count
(
var
->
Name
())
==
0
)
{
ir
::
Node
*
cache
=
pool_
.
FindBestFitNode
(
var
);
while
(
cache
!=
nullptr
&&
var
->
Name
()
==
cache
->
Name
())
{
VLOG
(
3
)
<<
"The same cache variable is cascade reused. "
<<
cache
->
Name
()
<<
" is re-filled to the pool after "
<<
"the reused op is finished. Current op can not "
<<
"replace it again. Skip this candidate."
;
cache
=
pool_
.
FindNextBestFitNode
(
var
,
cache
);
}
if
(
var
->
Name
()
==
FLAGS_memory_optimize_debug
)
{
VLOG
(
3
)
<<
"start match var "
<<
DebugString
(
var
)
<<
" of op "
<<
op
->
Name
();
VLOG
(
3
)
<<
pool_
.
ToString
();
VLOG
(
3
)
<<
"matched in pool : "
<<
((
cache
==
nullptr
)
?
"False"
:
"True"
);
}
if
(
cache
==
nullptr
)
continue
;
if
(
cache
!=
nullptr
)
{
if
(
var
->
Name
()
==
cache
->
Name
())
{
int
node_idx_in_pool
=
pool_
.
GetNodeIndexInPool
(
cache
);
VLOG
(
3
)
<<
"The same cache variable is cascade reused."
<<
var
->
Name
()
VLOG
(
3
)
<<
string
::
Sprintf
(
<<
" is re-filled to the pool after"
"!!! %s, %s => %s, cache idx %d, pool size %d"
,
<<
"the reused op is finished. Current op can not "
std
::
to_string
(
reuse_id
++
),
DebugString
(
var
),
DebugString
(
cache
),
<<
"replace it again. Skip this candidate."
;
node_idx_in_pool
,
static_cast
<
int
>
(
pool_
.
size
()));
continue
;
// NOTE(dzhwinter): update the ProgramDesc/IR Graph
// and the CFG Graph on the fly.
int
node_idx_in_pool
=
pool_
.
GetIndex
(
cache
);
//
VLOG
(
3
)
<<
string
::
Sprintf
(
// IR Graph define the dependence relationship between nodes.
"!!! %s, %s => %s, cache idx %d, pool size %d"
,
//
std
::
to_string
(
reuse_id
++
),
DebugString
(
var
),
DebugString
(
cache
),
// ProgramDesc defines the input/output vars. Its used in
node_idx_in_pool
,
static_cast
<
int
>
(
pool_
.
size
()));
// CreateOp, CreateVar when running happens.
// update CFG Graph on the fly.
//
// reused var maybe re-fill into the pool
// CFG Graph store the liveness information, when reuse happens
cfg_
->
RenameVarInCFGGraph
(
var
->
Name
(),
cache
->
Name
(),
idx
);
// we also need to update the variable liveness.
// NOTE(dzhwinter): we need to both update the ProgramDesc
const
std
::
string
var_name
=
var
->
Name
();
// and IR Graph. because op_desc/var_desc is used in CreateOp,
const
std
::
string
cache_name
=
cache
->
Name
();
// CreateVar when running happens. But IR Graph
// define the dependence relationship between nodes.
cfg_
->
RenameVarInCFGGraph
(
var_name
,
cache_name
,
idx
);
RenameVarInGraphDesc
(
var
->
Name
(),
cache
->
Name
(),
idx
);
RenameVarInGraphDesc
(
var_name
,
cache_name
,
idx
);
RenameVarInGraphNode
(
var
->
Name
(),
cache
->
Name
(),
idx
,
graph
.
get
());
RenameVarInGraphNode
(
var_name
,
cache_name
,
idx
,
graph
.
get
());
pool_
.
Erase
(
cache_name
);
pool_
.
Erase
(
cache
);
}
// fill the pool
std
::
unordered_set
<
std
::
string
>
unlived_vars
;
for
(
auto
var
:
cfg_
->
LiveIn
(
op
))
{
if
(
cfg_
->
LiveOut
(
op
).
count
(
var
)
==
0
)
{
unlived_vars
.
emplace
(
var
);
}
}
}
}
for
(
auto
var
:
unlived_vars
)
{
}
ir
::
Node
*
var_node
=
cfg_
->
GetNodeFromVarName
(
var
,
op
);
// fill the pool
for
(
auto
var
:
cfg_
->
LiveIn
(
op
))
{
if
(
cfg_
->
LiveOut
(
op
).
count
(
var
)
==
0
)
{
ir
::
Node
*
var_node
=
cfg_
->
GetNodeByName
(
var
,
op
);
if
(
var_node
==
nullptr
||
var_node
->
IsCtrlVar
())
continue
;
if
(
NodeCanReused
(
var_node
)
&&
!
pool_
.
Has
(
var_node
))
{
if
(
NodeCanReused
(
var_node
)
&&
!
pool_
.
Has
(
var_node
))
{
pool_
.
Insert
(
var_node
,
op
);
pool_
.
Insert
(
var_node
);
}
}
}
}
}
}
}
}
graph
->
ResolveHazard
(
var_nodes_
);
graph
->
ResolveHazard
(
var_nodes_
);
// For early delete pass. use GraphNodePool load the unlived vars.
// 1. find all deps op for each unlived var in memory pool.
for
(
auto
&
op
:
graph
->
Nodes
())
{
for
(
auto
&
var
:
op
->
inputs
)
{
if
(
pool_
.
Has
(
var
))
{
pool_
.
Insert
(
var
,
op
);
}
}
}
// 2. convert ir node based memory pool to graph node
// because Node* maybe released bettwen passes.
auto
&
graph_pool
=
graph
->
Get
<
GraphNodePool
>
(
kGraphNodePool
);
for
(
auto
it
=
pool_
.
begin
();
it
!=
pool_
.
end
();
++
it
)
{
std
::
unordered_set
<
OpDesc
*>
descs
;
for
(
auto
&
op
:
it
->
second
)
{
PADDLE_ENFORCE
(
op
->
IsOp
());
descs
.
insert
(
op
->
Op
());
}
graph_pool
.
push_back
(
std
::
make_pair
(
it
->
first
->
Name
(),
descs
));
}
return
graph
;
return
graph
;
}
}
...
@@ -199,12 +178,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
...
@@ -199,12 +178,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
PADDLE_ENFORCE
(
sub_op
!=
nullptr
);
PADDLE_ENFORCE
(
sub_op
!=
nullptr
);
for
(
auto
*
var
:
sub_op
->
outputs
)
{
for
(
auto
*
var
:
sub_op
->
outputs
)
{
if
(
NodeCanReused
(
var
))
{
if
(
NodeCanReused
(
var
))
{
ir
::
Node
*
cache
=
pool_
.
NodeMatch
(
var
);
ir
::
Node
*
cache
=
pool_
.
FindBestFitNode
(
var
);
if
(
cache
!=
nullptr
)
{
if
(
cache
!=
nullptr
)
{
if
(
var
->
Var
()
->
GetDataType
()
!=
cache
->
Var
()
->
GetDataType
())
{
if
(
var
->
Var
()
->
GetDataType
()
!=
cache
->
Var
()
->
GetDataType
())
{
continue
;
continue
;
}
}
int
node_idx_in_pool
=
pool_
.
Get
Index
(
cache
);
int
node_idx_in_pool
=
pool_
.
Get
NodeIndexInPool
(
cache
);
VLOG
(
3
)
<<
string
::
Sprintf
(
VLOG
(
3
)
<<
string
::
Sprintf
(
"!!! %s, %s => %s, cache idx %d, pool size %d"
,
"!!! %s, %s => %s, cache idx %d, pool size %d"
,
std
::
to_string
(
sub_reuse_id
++
),
DebugString
(
var
),
std
::
to_string
(
sub_reuse_id
++
),
DebugString
(
var
),
...
@@ -224,20 +203,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
...
@@ -224,20 +203,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
}
}
}
}
std
::
unordered_set
<
std
::
string
>
MemoryOptimizePass
::
GetSubBlockVars
(
void
MemoryOptimizePass
::
CollectSkipVarsSet
(
const
std
::
unordered_set
<
ir
::
Node
*>&
nodes
)
const
{
const
std
::
unordered_set
<
ir
::
Node
*>&
nodes
)
const
{
std
::
unordered_set
<
std
::
string
>
vars
;
auto
update_skip_set
=
[
&
](
OpDesc
*
op_desc
)
{
auto
inputs
=
op_desc
->
InputArgumentNames
();
auto
outputs
=
op_desc
->
OutputArgumentNames
();
skip_set_
.
insert
(
inputs
.
begin
(),
inputs
.
end
());
skip_set_
.
insert
(
outputs
.
begin
(),
outputs
.
end
());
};
for
(
auto
&
op
:
nodes
)
{
for
(
auto
&
op
:
nodes
)
{
if
(
!
op
->
IsOp
()
||
op
->
Op
()
==
nullptr
)
continue
;
if
(
!
op
->
IsOp
()
||
op
->
Op
()
==
nullptr
)
continue
;
auto
*
op_desc
=
op
->
Op
();
auto
*
op_desc
=
op
->
Op
();
if
(
OpHasSubBlock
(
op_desc
))
{
// NOTE(dzhwinter):
auto
inputs
=
op_desc
->
InputArgumentNames
();
// current block can not reuse next level block vars.
auto
outputs
=
op_desc
->
OutputArgumentNames
();
if
(
OpHasSubBlock
(
op_desc
))
update_skip_set
(
op_desc
);
vars
.
insert
(
inputs
.
begin
(),
inputs
.
end
());
// NOTE(dzhwinter):
vars
.
insert
(
outputs
.
begin
(),
outputs
.
end
());
// distributed ops input/output name need to
}
// keep same bettwen trainer/pserver
if
(
op_desc
->
Type
()
==
"send"
)
update_skip_set
(
op_desc
);
if
(
op_desc
->
Type
()
==
"recv"
)
update_skip_set
(
op_desc
);
if
(
op_desc
->
Type
()
==
"prefetch"
)
update_skip_set
(
op_desc
);
}
}
return
vars
;
}
}
void
MemoryOptimizePass
::
RenameVarInGraphDesc
(
const
std
::
string
&
var
,
void
MemoryOptimizePass
::
RenameVarInGraphDesc
(
const
std
::
string
&
var
,
...
@@ -291,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
...
@@ -291,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
// redirect the input to the latest version of cache_var
// redirect the input to the latest version of cache_var
for
(
auto
*
node
:
op
->
inputs
)
{
for
(
auto
*
node
:
op
->
inputs
)
{
if
(
node
->
Name
()
==
var
)
{
if
(
node
->
Name
()
==
var
)
{
ir
::
Node
*
cache_node
=
graph
->
CreateVarNode
(
var_desc
.
get
());
ir
::
Node
*
cache_node
=
var_nodes_
[
cache_var
].
back
();
var_nodes_
[
cache_var
].
emplace_back
(
cache_node
);
// swap node to cache_node
// swap node to cache_node
cache_node
->
outputs
.
insert
(
cache_node
->
outputs
.
end
(),
cache_node
->
outputs
.
insert
(
cache_node
->
outputs
.
end
(),
...
@@ -301,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
...
@@ -301,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
auto
*
prev_op
=
node
->
inputs
[
0
];
auto
*
prev_op
=
node
->
inputs
[
0
];
std
::
replace
(
prev_op
->
outputs
.
begin
(),
prev_op
->
outputs
.
end
(),
node
,
std
::
replace
(
prev_op
->
outputs
.
begin
(),
prev_op
->
outputs
.
end
(),
node
,
cache_node
);
cache_node
);
cache_node
->
inputs
.
emplace_back
(
prev_op
);
for
(
auto
*
next_op
:
node
->
outputs
)
{
for
(
auto
*
next_op
:
node
->
outputs
)
{
std
::
replace
(
next_op
->
inputs
.
begin
(),
next_op
->
inputs
.
end
(),
node
,
std
::
replace
(
next_op
->
inputs
.
begin
(),
next_op
->
inputs
.
end
(),
node
,
cache_node
);
cache_node
);
}
}
// erase unused node
auto
&
nodes
=
var_nodes_
.
at
(
var
);
nodes
.
erase
(
std
::
remove
(
nodes
.
begin
(),
nodes
.
end
(),
node
),
nodes
.
end
());
graph
->
RemoveNode
(
node
);
}
}
}
}
...
@@ -325,271 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
...
@@ -325,271 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
std
::
replace
(
next_op
->
inputs
.
begin
(),
next_op
->
inputs
.
end
(),
node
,
std
::
replace
(
next_op
->
inputs
.
begin
(),
next_op
->
inputs
.
end
(),
node
,
cache_node
);
cache_node
);
}
}
}
}
}
// release node of unused var in graph
for
(
auto
*
node
:
var_nodes_
[
var
])
{
graph
->
RemoveNode
(
node
);
}
var_nodes_
.
at
(
var
).
clear
();
}
std
::
vector
<
ir
::
Node
*>
SortOpLikeDescOrder
(
const
ir
::
Graph
&
graph
)
{
PADDLE_ENFORCE
(
graph
.
Has
(
kAllOpDescs
),
"Graph has no attribute of kAllOpDescs."
);
// 1. get op desc order
auto
&
op_descs
=
graph
.
Get
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
);
// 2. topology sort order
auto
nodes
=
graph
.
Nodes
();
std
::
deque
<
ir
::
Node
*>
ops
;
FilterVariables
(
nodes
,
[
&
](
ir
::
Node
*
op
)
{
if
(
op
->
IsOp
()
&&
op
->
Op
()
!=
nullptr
)
{
ops
.
emplace_back
(
op
);
}
});
std
::
unordered_map
<
ir
::
Node
*
,
size_t
>
op_deps
;
std
::
list
<
ir
::
Node
*>
ready_ops
;
std
::
unordered_map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
pending_ops
;
for
(
auto
*
op
:
ops
)
{
std
::
unordered_set
<
ir
::
Node
*>
preceding_op
;
for
(
auto
*
in
:
op
->
inputs
)
{
if
(
in
->
inputs
.
empty
())
continue
;
PADDLE_ENFORCE
(
in
->
inputs
.
size
()
==
1
&&
in
->
inputs
[
0
]
->
IsOp
());
preceding_op
.
emplace
(
in
->
inputs
[
0
]);
pending_ops
[
in
->
inputs
[
0
]].
emplace
(
op
);
}
op_deps
[
op
]
=
preceding_op
.
size
();
if
(
preceding_op
.
empty
())
{
ready_ops
.
emplace_back
(
op
);
}
}
// 3. generated op list based desc order and the topology order
std
::
vector
<
ir
::
Node
*>
ret
;
std
::
list
<
OpDesc
*>
op_descs_list
(
op_descs
.
begin
(),
op_descs
.
end
());
auto
update_by_found_node
=
[
&
](
ir
::
Node
*
found_node
)
{
for
(
auto
*
pending_op
:
pending_ops
[
found_node
])
{
if
(
--
op_deps
[
pending_op
]
==
0
)
{
ready_ops
.
emplace_back
(
pending_op
);
}
}
ready_ops
.
remove
(
found_node
);
ret
.
emplace_back
(
found_node
);
};
while
(
!
ready_ops
.
empty
())
{
bool
all_of_ready_op_unmatched
=
true
;
for
(
auto
it
=
op_descs_list
.
begin
();
it
!=
op_descs_list
.
end
();)
{
auto
op_desc
=
*
it
;
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
*
op
:
ready_ops
)
{
if
(
IsSameDesc
(
op
->
Op
(),
op_desc
))
{
found_node
=
op
;
break
;
}
}
// 3.1 op desc deleted by other pass
if
(
found_node
==
nullptr
)
{
++
it
;
continue
;
}
else
{
all_of_ready_op_unmatched
=
false
;
it
=
op_descs_list
.
erase
(
it
);
}
update_by_found_node
(
found_node
);
}
// 3.2 op descs are added by other pass
// preceding op non empty means some new op descs are
// created, but not contained in return node list.
// these new op desc may depend on each other.
std
::
list
<
ir
::
Node
*>
prev_ready_ops
(
ready_ops
);
if
(
all_of_ready_op_unmatched
)
{
for
(
auto
op
:
prev_ready_ops
)
{
update_by_found_node
(
op
);
}
}
}
PADDLE_ENFORCE
(
std
::
all_of
(
op_deps
.
begin
(),
op_deps
.
end
(),
[
&
](
const
std
::
pair
<
ir
::
Node
*
,
size_t
>&
p
)
{
return
p
.
second
==
0
;
}));
return
ret
;
}
ControlFlowGraph
::
ControlFlowGraph
(
const
ir
::
Graph
&
graph
)
{
ops_
=
SortOpLikeDescOrder
(
graph
);
ConnectNodes
();
}
void
ControlFlowGraph
::
BuildCFGGraph
()
{
// FIXME(dzh): same effect with ConnectNodes, but use the control
// link to build dependency graph, it goes wrong in transformer.
for
(
ir
::
Node
*
op
:
ops_
)
{
for
(
auto
&
input_var
:
op
->
inputs
)
{
if
(
!
input_var
->
inputs
.
empty
())
{
PADDLE_ENFORCE
(
input_var
->
inputs
.
size
()
==
1
&&
input_var
->
inputs
[
0
]
->
IsOp
(),
"Preceding Op Node of Var Node must be unique"
);
auto
*
pred_op
=
input_var
->
inputs
[
0
];
if
(
pred_op
->
Op
()
!=
nullptr
)
{
predecessors_
[
op
].
insert
(
pred_op
);
successors_
[
pred_op
].
insert
(
op
);
}
}
if
(
input_var
->
IsVar
()
&&
!
input_var
->
IsCtrlVar
())
{
uses_
[
op
].
insert
(
input_var
->
Name
());
}
}
for
(
auto
&
output_var
:
op
->
outputs
)
{
// output var may be used by many op
for
(
auto
*
succ_op
:
output_var
->
outputs
)
{
if
(
succ_op
->
Op
()
!=
nullptr
)
{
successors_
[
op
].
insert
(
succ_op
);
predecessors_
[
succ_op
].
insert
(
op
);
}
}
if
(
output_var
->
IsVar
()
&&
!
output_var
->
IsCtrlVar
())
{
defs_
[
op
].
insert
(
output_var
->
Name
());
}
}
}
}
void
ControlFlowGraph
::
ConnectNodes
()
{
for
(
size_t
i
=
0
;
i
<
ops_
.
size
();
++
i
)
{
auto
&
op
=
ops_
[
i
];
try
{
auto
&
next_op
=
ops_
.
at
(
i
+
1
);
successors_
[
op
].
insert
(
next_op
);
predecessors_
[
next_op
].
insert
(
op
);
}
catch
(...)
{
// do nothing
}
FilterVariables
(
op
->
inputs
,
[
&
](
ir
::
Node
*
var
)
{
uses_
[
op
].
emplace
(
var
->
Name
());
});
FilterVariables
(
op
->
outputs
,
[
&
](
ir
::
Node
*
var
)
{
defs_
[
op
].
emplace
(
var
->
Name
());
});
}
}
void
ControlFlowGraph
::
LiveVariableAnalysis
()
{
// NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
// compute the liveness of for each variable though reversed_ops algorithm.
// It iterates the operators from end to begin, compute the live in/live out
// variable set for each op, then the diff between in/out will be used for
// the variable reuse. For detail refer to
// http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
std
::
list
<
ir
::
Node
*>
work_list
(
ops_
.
rbegin
(),
ops_
.
rend
());
while
(
!
work_list
.
empty
())
{
ir
::
Node
*
op
=
work_list
.
front
();
work_list
.
pop_front
();
// get the live_in calculated before. Empty if first.
auto
prev_live_in
=
std
::
move
(
live_in_
[
op
]);
for
(
auto
&
s
:
successors_
[
op
])
{
for
(
auto
&
var
:
live_in_
[
s
])
{
live_out_
[
op
].
insert
(
var
);
}
}
for
(
auto
&
var
:
uses_
[
op
])
{
live_in_
[
op
].
insert
(
var
);
}
for
(
auto
&
var
:
live_out_
[
op
])
{
live_in_
[
op
].
insert
(
var
);
}
for
(
auto
&
var
:
defs_
[
op
])
{
live_in_
[
op
].
erase
(
var
);
}
// If the live_in is not changed, then the liveness analysis of
// predecessors is completed.
//
// Otherwise, recalculate the predecessors liveness
if
(
live_in_
[
op
]
!=
prev_live_in
)
{
for
(
auto
&
pre
:
predecessors_
[
op
])
{
work_list
.
push_back
(
pre
);
}
}
}
}
void
ControlFlowGraph
::
RenameVarInCFGGraph
(
const
std
::
string
&
old_node
,
const
std
::
string
&
new_node
,
int
begin_idx
)
{
// update graph from begin idx to the end
for
(
size_t
i
=
begin_idx
;
i
!=
ops_
.
size
();
++
i
)
{
auto
*
op
=
ops_
[
i
];
if
(
uses_
[
op
].
find
(
old_node
)
!=
uses_
[
op
].
end
())
{
uses_
[
op
].
erase
(
old_node
);
uses_
[
op
].
insert
(
new_node
);
}
if
(
defs_
[
op
].
find
(
old_node
)
!=
defs_
[
op
].
end
())
{
defs_
[
op
].
erase
(
old_node
);
defs_
[
op
].
insert
(
new_node
);
}
if
(
live_in_
[
op
].
find
(
old_node
)
!=
live_in_
[
op
].
end
())
{
live_in_
[
op
].
erase
(
old_node
);
live_in_
[
op
].
insert
(
new_node
);
}
if
(
live_out_
[
op
].
find
(
old_node
)
!=
live_out_
[
op
].
end
())
{
live_out_
[
op
].
erase
(
old_node
);
live_out_
[
op
].
insert
(
new_node
);
}
}
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
LiveIn
(
ir
::
Node
*
op
)
const
{
auto
it
=
live_in_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
live_in_
.
end
(),
string
::
Sprintf
(
"Expect %s in live_in, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
LiveOut
(
ir
::
Node
*
op
)
const
{
auto
it
=
live_out_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
live_out_
.
end
(),
string
::
Sprintf
(
"Expect %s in live_out, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
}
const
std
::
set
<
std
::
string
>
ControlFlowGraph
::
Use
(
ir
::
Node
*
op
)
const
{
auto
it
=
uses_
.
find
(
op
);
PADDLE_ENFORCE
(
it
!=
uses_
.
end
(),
string
::
Sprintf
(
"Expect %s in live_out, but Not Found."
,
op
->
Name
()));
return
it
->
second
;
}
const
std
::
vector
<
ir
::
Node
*>
ControlFlowGraph
::
Ops
()
const
{
return
ops_
;
}
std
::
vector
<
ir
::
Node
*>&
ControlFlowGraph
::
Ops
()
{
return
ops_
;
}
ir
::
Node
*
ControlFlowGraph
::
GetNodeFromVarName
(
const
std
::
string
&
name
,
// erase unused node
ir
::
Node
*
op
)
const
{
auto
&
nodes
=
var_nodes_
.
at
(
var
);
// in ssa-graph, different version nodes have same name,
nodes
.
erase
(
std
::
remove
(
nodes
.
begin
(),
nodes
.
end
(),
node
),
nodes
.
end
());
// this function get the latest version var before target op
graph
->
RemoveNode
(
node
);
// It may return nullptr, such as data node.
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
*
node
:
ops_
)
{
if
(
node
==
op
)
break
;
for
(
auto
&
output
:
node
->
outputs
)
{
if
(
output
->
Name
()
==
name
)
{
found_node
=
output
;
}
}
}
}
}
}
return
found_node
;
}
}
}
// namespace details
}
// namespace details
...
@@ -598,5 +330,4 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
...
@@ -598,5 +330,4 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
REGISTER_PASS
(
memory_optimize_pass
,
REGISTER_PASS
(
memory_optimize_pass
,
paddle
::
framework
::
details
::
MemoryOptimizePass
)
paddle
::
framework
::
details
::
MemoryOptimizePass
)
.
RequireGraphAttr
(
paddle
::
framework
::
details
::
kGraphNodePool
)
.
RequireGraphAttr
(
paddle
::
framework
::
details
::
kAllOpDescs
);
.
RequireGraphAttr
(
paddle
::
framework
::
details
::
kAllOpDescs
);
paddle/fluid/framework/details/memory_optimize_pass.h
浏览文件 @
f79a3a83
...
@@ -32,20 +32,15 @@
...
@@ -32,20 +32,15 @@
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
constexpr
char
kAllOpDescs
[]
=
"all_op_descs"
;
std
::
vector
<
ir
::
Node
*>
SortOpLikeDescOrder
(
const
ir
::
Graph
&
graph
);
class
ControlFlowGraph
;
class
MemoryOptimizePass
:
public
ir
::
Pass
{
class
MemoryOptimizePass
:
public
ir
::
Pass
{
protected:
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
private:
// fill the variable map(var_nodes) by version.
// fill the variable map(var_nodes) by version.
void
InitSSAGraphNodes
()
const
;
void
InitSSAGraphNodes
()
const
;
private:
// update program descs
// update program descs
void
RenameVarInGraphDesc
(
const
std
::
string
&
var
,
void
RenameVarInGraphDesc
(
const
std
::
string
&
var
,
const
std
::
string
&
cache_var
,
size_t
idx
)
const
;
const
std
::
string
&
cache_var
,
size_t
idx
)
const
;
...
@@ -55,13 +50,14 @@ class MemoryOptimizePass : public ir::Pass {
...
@@ -55,13 +50,14 @@ class MemoryOptimizePass : public ir::Pass {
ir
::
Graph
*
graph
)
const
;
ir
::
Graph
*
graph
)
const
;
void
SubGraphOptimize
(
OpDesc
*
op_desc
)
const
;
void
SubGraphOptimize
(
OpDesc
*
op_desc
)
const
;
// scan subblock and collect the output/input variables.
// 1. scan op with subblock and collect the output/input vars.
std
::
unordered_set
<
std
::
string
>
GetSubBlockVars
(
// while, while_grad, conditional_block
const
std
::
unordered_set
<
ir
::
Node
*>&
)
const
;
// 2. scan distributed ops and collect the output/input vars
void
CollectSkipVarsSet
(
const
std
::
unordered_set
<
ir
::
Node
*>&
)
const
;
private:
private:
// Reuse Node Pool, Owned.
// Reuse Node Pool, Owned.
mutable
Ordered
NodeLis
t
pool_
;
mutable
Ordered
Se
t
pool_
;
// controlflow Graph
// controlflow Graph
mutable
std
::
unique_ptr
<
ControlFlowGraph
>
cfg_
;
mutable
std
::
unique_ptr
<
ControlFlowGraph
>
cfg_
;
// skip set
// skip set
...
@@ -70,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass {
...
@@ -70,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass {
mutable
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
var_nodes_
;
mutable
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>>
var_nodes_
;
};
};
class
ControlFlowGraph
{
public:
ControlFlowGraph
()
=
default
;
// For IR Graph in parallelexecutor
explicit
ControlFlowGraph
(
const
ir
::
Graph
&
graph
);
void
LiveVariableAnalysis
();
void
RenameVarInCFGGraph
(
const
std
::
string
&
old_node
,
const
std
::
string
&
new_node
,
int
begin_idx
);
const
std
::
set
<
std
::
string
>
LiveIn
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>
LiveOut
(
ir
::
Node
*
op
)
const
;
const
std
::
set
<
std
::
string
>
Use
(
ir
::
Node
*
op
)
const
;
const
std
::
vector
<
ir
::
Node
*>
Ops
()
const
;
std
::
vector
<
ir
::
Node
*>&
Ops
();
// for ssa-graph nodes
ir
::
Node
*
GetNodeFromVarName
(
const
std
::
string
&
name
,
ir
::
Node
*
op
)
const
;
private:
void
BuildCFGGraph
();
void
ConnectNodes
();
using
NodeListMap
=
std
::
unordered_map
<
ir
::
Node
*
,
std
::
set
<
ir
::
Node
*>>
;
using
VarSetMap
=
std
::
map
<
ir
::
Node
*
,
std
::
set
<
std
::
string
>>
;
// successors ops use the output variables.
NodeListMap
successors_
;
// predecessors ops generated input variables.
NodeListMap
predecessors_
;
// variables lived before run current op.
VarSetMap
live_in_
;
// variables lived after run current op.
VarSetMap
live_out_
;
VarSetMap
uses_
;
// op inputs
VarSetMap
defs_
;
// op outputs
std
::
vector
<
ir
::
Node
*>
ops_
;
// op sequence by topology sort
};
}
// namespace details
}
// namespace details
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/details/memory_optimize_pass_test.cc
已删除
100644 → 0
浏览文件 @
d2d3f2b5
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/memory_optimize_pass.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/graph_test_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
REGISTER_OPERATOR
(
sum
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
SumOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
REGISTER_OPERATOR
(
assign
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
AssignOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
REGISTER_OPERATOR
(
dummy
,
paddle
::
framework
::
DummyOp
,
paddle
::
framework
::
SumOpMaker
,
paddle
::
framework
::
DummyVarTypeInference
);
/*
https://en.wikipedia.org/wiki/Live_variable_analysis
Create a customed classical dependency graph, left row is the instruction
number.
1. a = 1
2. b = a
3. c = a
4. d = b + c
5. e = d
a--------+
| |
b c
| |
d--------+
|
e
Then analysis these variable's liveness range
*/
namespace
paddle
{
namespace
framework
{
namespace
details
{
static
inline
bool
IsSameDesc
(
OpDesc
*
op1
,
OpDesc
*
op2
)
{
return
op1
->
Type
()
==
op2
->
Type
()
&&
op1
->
Inputs
()
==
op2
->
Inputs
()
&&
op1
->
Outputs
()
==
op2
->
Outputs
();
}
inline
static
ProgramDesc
FillProgramDesc
()
{
ProgramDesc
prog
;
prog
.
MutableBlock
(
0
)
->
Var
(
"a"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"b"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"c"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"d"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"e"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"a"
});
op
->
SetOutput
(
"Out"
,
{
"b"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"a"
});
op
->
SetOutput
(
"Out"
,
{
"c"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d"
});
}
{
auto
*
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
op
->
SetType
(
"assign"
);
op
->
SetInput
(
"X"
,
{
"d"
});
op
->
SetOutput
(
"Out"
,
{
"e"
});
}
return
prog
;
}
TEST
(
CFGGraph
,
IRGraph
)
{
// prepare ir graph
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
ControlFlowGraph
cfg
(
graph
);
cfg
.
LiveVariableAnalysis
();
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
0
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
,
"b"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
0
])));
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"a"
,
"b"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
1
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"b"
,
"c"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
1
])));
// test sum op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"b"
,
"c"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
2
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"d"
}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
2
])));
// test assign op
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{
"d"
}
==
cfg
.
LiveIn
(
cfg
.
Ops
()[
3
])));
ASSERT_TRUE
((
std
::
set
<
std
::
string
>
{}
==
cfg
.
LiveOut
(
cfg
.
Ops
()[
3
])));
}
// 1. normal test
TEST
(
SortOpLikeDescOrder
,
NormalTest
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
SortOpLikeDescOrder
(
graph
);
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
for
(
size_t
i
=
0
;
i
<
nodes
.
size
();
++
i
)
{
auto
node
=
nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 2. remove some op_desc
TEST
(
SortOpLikeDescOrder
,
RemoveOpDesc
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
nodes
=
graph
.
Nodes
();
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
node
:
nodes
)
{
if
(
node
->
IsOp
()
&&
node
->
outputs
.
back
()
->
Name
()
==
"e"
)
{
found_node
=
node
;
break
;
}
}
PADDLE_ENFORCE
(
found_node
!=
nullptr
);
for
(
auto
it
=
op_descs
.
begin
();
it
!=
op_descs
.
end
();)
{
if
(
IsSameDesc
(
*
it
,
found_node
->
Op
()))
{
it
=
op_descs
.
erase
(
it
);
}
else
{
++
it
;
}
}
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
std
::
remove
(
d
->
outputs
.
begin
(),
d
->
outputs
.
end
(),
found_node
);
graph
.
RemoveNode
(
found_node
);
graph
.
RemoveNode
(
e
);
// other node keeps the same order
auto
remain_nodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
remain_nodes
.
size
();
++
i
)
{
auto
node
=
remain_nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 3. add some op_desc
TEST
(
SortOpLikeDescOrder
,
AddOpDesc
)
{
auto
prog
=
FillProgramDesc
();
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
ir
::
Graph
graph
(
prog
);
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
// cached desc different with real one
// mimic the intermidiete pass modify the programdesc.
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d1"
});
ir
::
Node
*
node
=
graph
.
CreateOpNode
(
op
);
ir
::
Node
*
d1
=
graph
.
CreateVarNode
(
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
));
ir
::
Node
*
b
=
find_node_in_graph
(
"b"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
node
->
outputs
.
emplace_back
(
d1
);
node
->
inputs
.
emplace_back
(
b
);
node
->
inputs
.
emplace_back
(
c
);
d1
->
inputs
.
emplace_back
(
node
);
b
->
outputs
.
emplace_back
(
node
);
c
->
outputs
.
emplace_back
(
node
);
op_descs
.
insert
(
op_descs
.
begin
()
+
4
,
op
);
auto
nodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
nodes
.
size
();
++
i
)
{
auto
node
=
nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 4. add and delete some op_desc
TEST
(
SortOpLikeDescOrder
,
AddAndDeleteOpDesc
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
// remove sum node
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
ir
::
Node
*
found_node
=
nullptr
;
auto
nodes
=
graph
.
Nodes
();
for
(
auto
node
:
nodes
)
{
if
(
node
->
Name
()
==
"sum"
)
{
found_node
=
node
;
break
;
}
}
PADDLE_ENFORCE
(
found_node
!=
nullptr
);
for
(
auto
it
=
op_descs
.
begin
();
it
!=
op_descs
.
end
();)
{
if
(
IsSameDesc
(
*
it
,
found_node
->
Op
()))
{
it
=
op_descs
.
erase
(
it
);
}
else
{
++
it
;
}
}
{
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
std
::
remove
(
d
->
outputs
.
begin
(),
d
->
outputs
.
end
(),
found_node
);
std
::
remove
(
c
->
outputs
.
begin
(),
c
->
outputs
.
end
(),
found_node
);
ir
::
Node
*
pending_op
=
found_node
->
outputs
[
0
]
->
outputs
[
0
];
graph
.
RemoveNode
(
e
);
graph
.
RemoveNode
(
pending_op
);
graph
.
RemoveNode
(
found_node
);
}
// add node
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d1"
});
{
ir
::
Node
*
node
=
graph
.
CreateOpNode
(
op
);
ir
::
Node
*
d1
=
graph
.
CreateVarNode
(
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
));
ir
::
Node
*
b
=
find_node_in_graph
(
"b"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
node
->
outputs
.
emplace_back
(
d1
);
node
->
inputs
.
emplace_back
(
b
);
node
->
inputs
.
emplace_back
(
c
);
b
->
outputs
.
emplace_back
(
node
);
c
->
outputs
.
emplace_back
(
node
);
}
op_descs
.
insert
(
op_descs
.
begin
()
+
2
,
op
);
// check the order
auto
mynodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
mynodes
.
size
();
++
i
)
{
auto
node
=
mynodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
// 5. add and replace some op_desc inplace.
TEST
(
SortOpLikeDescOrder
,
AddAndReplaceOpDescInplace
)
{
auto
prog
=
FillProgramDesc
();
ir
::
Graph
graph
(
prog
);
const
std
::
vector
<
OpDesc
*>*
all_op_descs
=
new
std
::
vector
<
OpDesc
*>
(
prog
.
Block
(
0
).
AllOps
());
graph
.
Set
(
details
::
kAllOpDescs
,
all_op_descs
);
// take ownership
auto
find_node_in_graph
=
[
&
](
std
::
string
s
)
{
ir
::
Node
*
ret
=
nullptr
;
for
(
auto
n
:
graph
.
Nodes
())
{
if
(
n
->
Name
()
==
s
)
{
ret
=
n
;
break
;
}
}
PADDLE_ENFORCE
(
ret
!=
nullptr
);
return
ret
;
};
auto
op_descs
=
prog
.
Block
(
0
).
AllOps
();
// add node
auto
op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"b"
,
"c"
});
op
->
SetOutput
(
"Out"
,
{
"d1"
});
{
ir
::
Node
*
node
=
graph
.
CreateOpNode
(
op
);
ir
::
Node
*
d1
=
graph
.
CreateVarNode
(
prog
.
MutableBlock
(
0
)
->
Var
(
"d1"
));
ir
::
Node
*
b
=
find_node_in_graph
(
"b"
);
ir
::
Node
*
c
=
find_node_in_graph
(
"c"
);
node
->
outputs
.
emplace_back
(
d1
);
node
->
inputs
.
emplace_back
(
b
);
node
->
inputs
.
emplace_back
(
c
);
d1
->
inputs
.
emplace_back
(
node
);
b
->
outputs
.
emplace_back
(
node
);
c
->
outputs
.
emplace_back
(
node
);
}
op_descs
.
emplace_back
(
op
);
// replace op_desc inplace
auto
nodes
=
graph
.
Nodes
();
ir
::
Node
*
found_node
=
nullptr
;
for
(
auto
node
:
nodes
)
{
if
(
node
->
IsOp
()
&&
node
->
Op
()
&&
node
->
Name
()
==
"assign"
)
{
if
(
node
->
outputs
.
size
()
==
1
&&
node
->
outputs
[
0
]
->
Name
()
==
"e"
)
{
found_node
=
node
;
break
;
}
}
}
{
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
std
::
remove
(
d
->
outputs
.
begin
(),
d
->
outputs
.
end
(),
found_node
);
std
::
remove
(
e
->
inputs
.
begin
(),
e
->
inputs
.
end
(),
found_node
);
graph
.
RemoveNode
(
found_node
);
}
op_descs
.
erase
(
op_descs
.
begin
()
+
3
);
auto
replace_op
=
prog
.
MutableBlock
(
0
)
->
AppendOp
();
replace_op
->
SetType
(
"sum"
);
replace_op
->
SetInput
(
"X"
,
{
"d"
,
"d1"
});
replace_op
->
SetOutput
(
"Out"
,
{
"e"
});
{
ir
::
Node
*
sum2
=
graph
.
CreateOpNode
(
replace_op
);
ir
::
Node
*
e
=
find_node_in_graph
(
"e"
);
ir
::
Node
*
d
=
find_node_in_graph
(
"d"
);
ir
::
Node
*
d1
=
find_node_in_graph
(
"d1"
);
sum2
->
inputs
.
emplace_back
(
d
);
sum2
->
inputs
.
emplace_back
(
d1
);
sum2
->
outputs
.
emplace_back
(
e
);
e
->
inputs
.
emplace_back
(
sum2
);
d
->
outputs
.
emplace_back
(
sum2
);
d1
->
outputs
.
emplace_back
(
sum2
);
}
op_descs
.
emplace_back
(
replace_op
);
// compare op order
auto
graph_nodes
=
SortOpLikeDescOrder
(
graph
);
for
(
size_t
i
=
0
;
i
<
graph_nodes
.
size
();
++
i
)
{
auto
node
=
graph_nodes
[
i
];
auto
op_desc
=
op_descs
[
i
];
ASSERT_TRUE
(
IsSameDesc
(
node
->
Op
(),
op_desc
));
}
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/sequential_execution_pass.cc
浏览文件 @
f79a3a83
...
@@ -17,6 +17,7 @@
...
@@ -17,6 +17,7 @@
#include <unordered_map>
#include <unordered_map>
#include <unordered_set>
#include <unordered_set>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_proto_maker.h"
namespace
paddle
{
namespace
paddle
{
...
...
paddle/fluid/framework/details/sequential_execution_pass.h
浏览文件 @
f79a3a83
...
@@ -21,8 +21,6 @@ namespace paddle {
...
@@ -21,8 +21,6 @@ namespace paddle {
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
constexpr
char
kAllOpDescs
[]
=
"all_op_descs"
;
class
SequentialExecutionPass
:
public
ir
::
Pass
{
class
SequentialExecutionPass
:
public
ir
::
Pass
{
protected:
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
...
...
paddle/fluid/framework/inplace_op_inference.h
浏览文件 @
f79a3a83
...
@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference {
...
@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference {
bool
TryInplaceInputOutput
(
const
VarDesc
&
in
,
const
VarDesc
&
out
)
const
{
bool
TryInplaceInputOutput
(
const
VarDesc
&
in
,
const
VarDesc
&
out
)
const
{
return
in
.
Name
()
!=
out
.
Name
()
&&
details
::
NodeCanReused
(
in
)
&&
return
in
.
Name
()
!=
out
.
Name
()
&&
details
::
NodeCanReused
(
in
)
&&
details
::
NodeCanReused
(
out
)
&&
details
::
NodeCanReused
(
out
)
&&
details
::
NodeSize
InBytes
(
out
)
<=
details
::
NodeSizeInBytes
(
in
);
details
::
NodeSize
(
out
)
<=
details
::
NodeSize
(
in
);
}
}
};
};
...
...
paddle/fluid/framework/inplace_op_inference_test.cc
浏览文件 @
f79a3a83
...
@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
...
@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
op
->
SetOutput
(
"Out"
,
{
"test2_out"
});
op
->
SetOutput
(
"Out"
,
{
"test2_out"
});
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_a"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_a"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_a"
)
->
SetShape
({
32
,
64
});
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_a"
)
->
SetShape
({
32
,
64
,
128
,
128
});
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_b"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_b"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_c"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_c"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_out"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_out"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_out"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_out"
)
->
SetShape
({
32
,
16
,
128
,
128
});
auto
&
infer_inplace
=
OpInfoMap
::
Instance
().
Get
(
op
->
Type
()).
infer_inplace_
;
auto
&
infer_inplace
=
OpInfoMap
::
Instance
().
Get
(
op
->
Type
()).
infer_inplace_
;
auto
in_to_outs
=
infer_inplace
(
*
op
,
op
->
Block
());
auto
in_to_outs
=
infer_inplace
(
*
op
,
op
->
Block
());
...
@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
...
@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
op
->
SetOutput
(
GradVarName
(
"X"
),
{
"test2_a"
,
"test2_b"
,
"test2_c"
});
op
->
SetOutput
(
GradVarName
(
"X"
),
{
"test2_a"
,
"test2_b"
,
"test2_c"
});
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_a"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_a"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_a"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_a"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_b"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_b"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_c"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_c"
)
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_out"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_out"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_out"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"test2_out"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
auto
&
infer_inplace
=
OpInfoMap
::
Instance
().
Get
(
op
->
Type
()).
infer_inplace_
;
auto
&
infer_inplace
=
OpInfoMap
::
Instance
().
Get
(
op
->
Type
()).
infer_inplace_
;
auto
in_to_outs
=
infer_inplace
(
*
op
,
op
->
Block
());
auto
in_to_outs
=
infer_inplace
(
*
op
,
op
->
Block
());
...
@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
...
@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
prog
.
MutableBlock
(
0
)
->
Var
(
"o0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"o0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"y0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"y0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"z0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"z0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"a0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"a0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"b0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"b0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"c0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"c0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"o0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"o0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"y0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"y0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"z0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"z0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
auto
&
infer_inplace
=
OpInfoMap
::
Instance
().
Get
(
op
->
Type
()).
infer_inplace_
;
auto
&
infer_inplace
=
OpInfoMap
::
Instance
().
Get
(
op
->
Type
()).
infer_inplace_
;
auto
in_to_outs
=
infer_inplace
(
*
op
,
op
->
Block
());
auto
in_to_outs
=
infer_inplace
(
*
op
,
op
->
Block
());
...
@@ -267,15 +267,16 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
...
@@ -267,15 +267,16 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
prog
.
MutableBlock
(
0
)
->
Var
(
"o0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"o0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"y0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"y0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"z0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"z0"
);
prog
.
MutableBlock
(
0
)
->
Var
(
"a0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"a0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"b0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"b0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"c0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"c0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"o0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"o0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"y0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"y0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
prog
.
MutableBlock
(
0
)
->
Var
(
"z0"
)
->
SetShape
({
32
,
16
});
prog
.
MutableBlock
(
0
)
->
Var
(
"z0"
)
->
SetShape
({
32
,
16
,
1024
,
1024
});
auto
&
infer_inplace
=
OpInfoMap
::
Instance
().
Get
(
op
->
Type
()).
infer_inplace_
;
auto
&
infer_inplace
=
OpInfoMap
::
Instance
().
Get
(
op
->
Type
()).
infer_inplace_
;
auto
in_to_outs
=
infer_inplace
(
*
op
,
op
->
Block
());
auto
in_to_outs
=
infer_inplace
(
*
op
,
op
->
Block
());
EXPECT_EQ
(
in_to_outs
.
size
(),
3ul
);
EXPECT_EQ
(
in_to_outs
.
size
(),
3ul
);
std
::
unordered_map
<
std
::
string
,
std
::
string
>
expects
=
{
std
::
unordered_map
<
std
::
string
,
std
::
string
>
expects
=
{
{
"o0"
,
"a0"
},
{
"y0"
,
"b0"
},
{
"z0"
,
"c0"
},
{
"o0"
,
"a0"
},
{
"y0"
,
"b0"
},
{
"z0"
,
"c0"
},
...
...
paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
浏览文件 @
f79a3a83
...
@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
...
@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
->
assert_is_op
(
"scale"
)
->
assert_is_op
(
"scale"
)
->
assert_op_attr
<
float
>
(
"scale"
,
1.
)
->
assert_op_attr
<
float
>
(
"scale"
,
1.
)
->
assert_op_attr
<
float
>
(
"bias"
,
0.
);
->
assert_op_attr
<
float
>
(
"bias"
,
0.
);
auto
scale_out
=
detector
.
mutable_pattern
()
auto
scale_out
=
->
NewNode
(
"scale_out"
)
detector
.
mutable_pattern
()
->
assert_is_op_output
(
"scale"
);
->
NewNode
(
"scale_out"
)
->
assert_is_op_output
(
"scale"
)
// scale's output var should has only one consumer, or it can't be
// removed.
->
assert_more
([](
Node
*
x
)
{
return
x
->
outputs
.
size
()
==
1UL
;
});
pre_op
->
LinksTo
({
scale_in
});
pre_op
->
LinksTo
({
scale_in
});
scale_op
->
LinksFrom
({
scale_in
}).
LinksTo
({
scale_out
});
scale_op
->
LinksFrom
({
scale_in
}).
LinksTo
({
scale_out
});
...
...
paddle/fluid/framework/ir/infer_clean_graph_pass.cc
浏览文件 @
f79a3a83
...
@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase {
...
@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase {
std
::
unordered_set
<
const
Node
*>
invalid_nodes
;
std
::
unordered_set
<
const
Node
*>
invalid_nodes
;
int
valid_op
=
0
;
int
valid_op
=
0
;
for
(
auto
*
node
:
graph
->
Nodes
())
{
for
(
auto
*
node
:
graph
->
Nodes
())
{
PADDLE_ENFORCE_NOT_NULL
(
node
);
if
(
is_valid_node
(
node
))
{
if
(
is_valid_node
(
node
))
{
invalid_nodes
.
insert
(
node
);
invalid_nodes
.
insert
(
node
);
}
else
if
(
node
->
IsOp
())
{
}
else
if
(
node
->
IsOp
())
{
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
f79a3a83
...
@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
...
@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
eager_deletion_pass
->
SetNotOwned
(
details
::
kAllPlaces
,
&
places_
);
eager_deletion_pass
->
SetNotOwned
(
details
::
kAllPlaces
,
&
places_
);
graph
=
eager_deletion_pass
->
Apply
(
std
::
move
(
graph
));
graph
=
eager_deletion_pass
->
Apply
(
std
::
move
(
graph
));
VLOG
(
10
)
<<
"EagerDeletionPass Applied"
;
VLOG
(
10
)
<<
"EagerDeletionPass Applied"
;
if
(
build_strategy_
.
memory_early_delete_
)
{
auto
early_delete_pass
=
ir
::
PassRegistry
::
Instance
().
Get
(
"memory_early_delete_pass"
);
early_delete_pass
->
SetNotOwned
(
details
::
kGarbageCollector
,
&
gcs_
);
graph
=
early_delete_pass
->
Apply
(
std
::
move
(
graph
));
}
VLOG
(
10
)
<<
"MemoryEarlyDeletePass Applied."
;
}
}
return
graph
;
return
graph
;
...
@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
...
@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
graphs
.
push_back
(
std
::
move
(
graph
));
graphs
.
push_back
(
std
::
move
(
graph
));
#endif
#endif
auto
max_memory_size
=
GetEagerDeletionThreshold
();
auto
max_memory_size
=
GetEagerDeletionThreshold
();
VLOG
(
10
)
<<
"Eager Deletion Threshold "
<<
static_cast
<
float
>
(
max_memory_size
)
/
(
1
<<
30
);
if
(
max_memory_size
>=
0
)
{
if
(
max_memory_size
>=
0
)
{
for
(
size_t
i
=
0
;
i
<
graphs
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
graphs
.
size
();
++
i
)
{
graphs
[
i
]
=
member_
->
PrepareGCAndRefCnts
(
graphs
[
i
]
=
member_
->
PrepareGCAndRefCnts
(
...
@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
...
@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
USE_PASS
(
memory_early_delete_pass
);
USE_PASS
(
reference_count_pass
);
USE_PASS
(
reference_count_pass
);
USE_PASS
(
eager_deletion_pass
);
USE_PASS
(
eager_deletion_pass
);
paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
浏览文件 @
f79a3a83
...
@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
...
@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
return
node
.
inputs
.
size
()
==
n
;
return
node
.
inputs
.
size
()
==
n
;
}
}
NodesTSIterator
::
NodesTSIterator
(
const
std
::
vector
<
Node
*>
&
source
)
{
PADDLE_ENFORCE
(
!
source
.
empty
(),
"Start points of topological sorting should not be empty!"
);
// CHECK all the inputs' in-degree is 0
for
(
auto
*
node
:
source
)
{
PADDLE_ENFORCE
(
CheckNodeIndegreeEquals
(
*
node
,
0
));
}
std
::
unordered_set
<
Node
*>
visited
;
std
::
unordered_set
<
Node
*>
to_visit
{
source
.
begin
(),
source
.
end
()};
std
::
vector
<
Node
*>
inlink_visited
;
while
(
!
to_visit
.
empty
())
{
std
::
vector
<
Node
*>
queue
(
to_visit
.
begin
(),
to_visit
.
end
());
for
(
auto
*
p
:
queue
)
{
if
(
Agent
(
p
).
deleted
())
{
visited
.
insert
(
p
);
to_visit
.
erase
(
p
);
}
inlink_visited
.
clear
();
std
::
copy_if
(
p
->
inputs
.
begin
(),
p
->
inputs
.
end
(),
std
::
back_inserter
(
inlink_visited
),
[
&
](
Node
*
x
)
->
bool
{
return
visited
.
count
(
x
)
!=
0
;
});
if
(
inlink_visited
.
size
()
==
p
->
inputs
.
size
())
{
sorted_
.
push_back
(
p
);
for
(
auto
*
_
:
p
->
outputs
)
{
if
(
!
visited
.
count
(
_
))
{
to_visit
.
insert
(
_
);
}
}
to_visit
.
erase
(
p
);
visited
.
insert
(
p
);
}
}
}
}
NodesTSIterator
::
NodesTSIterator
(
const
NodesTSIterator
&
other
)
:
sorted_
(
other
.
sorted_
),
cursor_
(
other
.
cursor_
)
{}
Node
&
NodesTSIterator
::
operator
*
()
{
PADDLE_ENFORCE_LT
(
cursor_
,
sorted_
.
size
());
return
*
sorted_
[
cursor_
];
}
NodesTSIterator
&
NodesTSIterator
::
operator
++
()
{
if
(
++
cursor_
>=
sorted_
.
size
())
{
sorted_
.
clear
();
cursor_
=
0
;
}
return
*
this
;
}
NodesTSIterator
&
NodesTSIterator
::
operator
=
(
const
NodesTSIterator
&
other
)
{
cursor_
=
other
.
cursor_
;
sorted_
=
other
.
sorted_
;
return
*
this
;
}
bool
NodesTSIterator
::
operator
==
(
const
NodesTSIterator
&
other
)
{
return
sorted_
==
other
.
sorted_
&&
cursor_
==
other
.
cursor_
;
}
Node
*
NodesTSIterator
::
operator
->
()
{
PADDLE_ENFORCE_LT
(
cursor_
,
sorted_
.
size
());
return
sorted_
[
cursor_
];
}
}
// namespace analysis
}
// namespace analysis
}
// namespace inference
}
// namespace inference
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
浏览文件 @
f79a3a83
...
@@ -30,6 +30,7 @@ namespace inference {
...
@@ -30,6 +30,7 @@ namespace inference {
namespace
analysis
{
namespace
analysis
{
using
framework
::
ir
::
Graph
;
using
framework
::
ir
::
Graph
;
using
framework
::
ir
::
NodesTSIterator
;
const
char
kIsFunctionNode
[]
=
"__is_function_node__"
;
const
char
kIsFunctionNode
[]
=
"__is_function_node__"
;
const
char
kFunctionNodeSubGraph
[]
=
"__function_node_sub_graph__"
;
const
char
kFunctionNodeSubGraph
[]
=
"__function_node_sub_graph__"
;
...
@@ -132,32 +133,6 @@ struct Agent {
...
@@ -132,32 +133,6 @@ struct Agent {
framework
::
ir
::
Node
*
x_
;
framework
::
ir
::
Node
*
x_
;
};
};
// Topological sorting iterator on nodes.
struct
NodesTSIterator
:
public
std
::
iterator
<
std
::
forward_iterator_tag
,
framework
::
ir
::
Node
*>
{
NodesTSIterator
()
=
default
;
explicit
NodesTSIterator
(
const
std
::
vector
<
framework
::
ir
::
Node
*>
&
source
);
NodesTSIterator
(
NodesTSIterator
&&
other
)
:
sorted_
(
std
::
move
(
other
.
sorted_
)),
cursor_
(
other
.
cursor_
)
{
other
.
cursor_
=
0
;
}
NodesTSIterator
(
const
NodesTSIterator
&
other
);
framework
::
ir
::
Node
&
operator
*
();
NodesTSIterator
&
operator
++
();
// TODO(Superjomn) current implementation just compare the first
// element, need to compare the graph and all the elements in the queue and
// set.
NodesTSIterator
&
operator
=
(
const
NodesTSIterator
&
other
);
bool
operator
==
(
const
NodesTSIterator
&
other
);
bool
operator
!=
(
const
NodesTSIterator
&
other
)
{
return
!
(
*
this
==
other
);
}
framework
::
ir
::
Node
*
operator
->
();
private:
std
::
vector
<
framework
::
ir
::
Node
*>
sorted_
;
size_t
cursor_
{
0
};
};
// The nodes those have no input will be treated as start points.
// The nodes those have no input will be treated as start points.
static
std
::
vector
<
framework
::
ir
::
Node
*>
ExtractStartPoints
(
const
Graph
&
g
)
{
static
std
::
vector
<
framework
::
ir
::
Node
*>
ExtractStartPoints
(
const
Graph
&
g
)
{
std
::
vector
<
framework
::
ir
::
Node
*>
result
;
std
::
vector
<
framework
::
ir
::
Node
*>
result
;
...
...
paddle/fluid/inference/api/paddle_api.h
浏览文件 @
f79a3a83
...
@@ -16,6 +16,12 @@
...
@@ -16,6 +16,12 @@
/*! \file paddle_api.h
/*! \file paddle_api.h
*/
*/
/*! \mainpage Paddle Inference APIs
* \section intro_sec Introduction
* The Paddle inference library aims to offer an high performance inference SDK
* for Paddle users.
*/
#include <cassert>
#include <cassert>
#include <memory>
#include <memory>
#include <string>
#include <string>
...
@@ -34,26 +40,49 @@ enum PaddleDType {
...
@@ -34,26 +40,49 @@ enum PaddleDType {
};
};
/**
/**
*
\brief Memory menager for PaddleTensor
.
*
\brief Memory manager for `PaddleTensor`
.
*
*
*The PaddleBuf holds a buffer for data input or output. The memory can be
*
The PaddleBuf holds a buffer for data input or output. The memory can be
*allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
*
allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
*should be reused for better performance.
*
should be reused for better performance.
*
*
*For user allocated memory, the following API can be used:
* For user allocated memory, the following API can be used:
*- PaddleBuf(void* data, size_t length) to set an external memory by
* - PaddleBuf(void* data, size_t length) to set an external memory by
*specifying
* specifying the memory address and length.
* the memory address and length.
* - Reset(void* data, size_t length) to reset the PaddleBuf with an external
*- Reset(void* data, size_t length) to reset the PaddleBuf with an external
*memory.
*memory.
*ATTENTION, for user allocated memory, deallocation should be done by users
*
ATTENTION, for user allocated memory, deallocation should be done by users
*externally after the program finished. The PaddleBuf won't do any allocation
*externally after the program finished. The PaddleBuf won't do any allocation
*or deallocation.
*or deallocation.
*
*
*To have the PaddleBuf allocate and manage the memory:
*
To have the PaddleBuf allocate and manage the memory:
*- PaddleBuf(size_t length) will allocate a memory of size `length`.
*
- PaddleBuf(size_t length) will allocate a memory of size `length`.
*- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
*
- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
* if the allocated memory is larger than `length`, nothing will done.
* if the allocated memory is larger than `length`, nothing will done.
*
* Usage:
*
* Let PaddleBuf manage the memory internally.
* \code{cpp}
* const int num_elements = 128;
* PaddleBuf buf(num_elements * sizeof(float));
* \endcode
*
* Or
* \code{cpp}
* PaddleBuf buf;
* buf.Resize(num_elements * sizeof(float));
* \endcode
* Works the exactly the same.
*
* One can also make the `PaddleBuf` use the external memory.
* \code{cpp}
* PaddleBuf buf;
* void* external_memory = new float[num_elements];
* buf.Reset(external_memory, num_elements*sizeof(float));
* ...
* delete[] external_memory; // manage the memory lifetime outside.
* \endcode
*/
*/
class
PaddleBuf
{
class
PaddleBuf
{
public:
public:
...
@@ -78,7 +107,7 @@ class PaddleBuf {
...
@@ -78,7 +107,7 @@ class PaddleBuf {
/** Tell whether the buffer is empty.
/** Tell whether the buffer is empty.
*/
*/
bool
empty
()
const
{
return
length_
==
0
;
}
bool
empty
()
const
{
return
length_
==
0
;
}
/** Get the memory address.
/** Get the
data's
memory address.
*/
*/
void
*
data
()
const
{
return
data_
;
}
void
*
data
()
const
{
return
data_
;
}
/** Get the memory length.
/** Get the memory length.
...
@@ -110,7 +139,8 @@ struct PaddleTensor {
...
@@ -110,7 +139,8 @@ struct PaddleTensor {
};
};
enum
class
PaddlePlace
{
kUNK
=
-
1
,
kCPU
,
kGPU
};
enum
class
PaddlePlace
{
kUNK
=
-
1
,
kCPU
,
kGPU
};
/** Tensor without copy, currently only supports AnalysisPredictor.
/** Tensor without copy, currently only supports `AnalysisPredictor`.
*/
*/
class
ZeroCopyTensor
{
class
ZeroCopyTensor
{
public:
public:
...
@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config {
...
@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config {
*
*
* Usage:
* Usage:
*
*
* \code{.cpp}
* NativeConfig config;
* NativeConfig config;
* ... // change the configs.
* ... // change the configs.
* auto native_predictor = CreatePaddlePredictor(config);
* auto native_predictor = CreatePaddlePredictor(config);
* \endcode
*
*
* FOR EXTENSION DEVELOPER:
* FOR EXTENSION DEVELOPER:
* Different predictors are designated by config type. Similar configs can be
* Different predictors are designated by config type. Similar configs can be
...
...
paddle/fluid/inference/api/paddle_pass_builder.cc
浏览文件 @
f79a3a83
...
@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() {
...
@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG
(
ERROR
)
<<
"GPU not support MKLDNN yet"
;
LOG
(
ERROR
)
<<
"GPU not support MKLDNN yet"
;
}
}
GpuPassStrategy
::
GpuPassStrategy
()
:
PassStrategy
({})
{
passes_
.
assign
({
"infer_clean_graph_pass"
,
//
"identity_scale_op_clean_pass"
,
//
"conv_affine_channel_fuse_pass"
,
//
"conv_eltwiseadd_affine_channel_fuse_pass"
,
//
"conv_bn_fuse_pass"
,
//
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass"
,
//
"conv_elementwise_add2_act_fuse_pass"
,
//
"conv_elementwise_add_fuse_pass"
,
//
#endif
});
for
(
int
i
=
6
;
i
>=
3
;
i
--
)
{
passes_
.
push_back
(
"transpose_flatten"
+
std
::
to_string
(
i
)
+
"_concat_fuse_pass"
);
}
use_gpu_
=
true
;
}
void
PaddlePassBuilder
::
AppendAnalysisPass
(
const
std
::
string
&
pass
)
{
void
PaddlePassBuilder
::
AppendAnalysisPass
(
const
std
::
string
&
pass
)
{
analysis_passes_
.
push_back
(
pass
);
analysis_passes_
.
push_back
(
pass
);
}
}
CpuPassStrategy
::
CpuPassStrategy
()
:
PassStrategy
({})
{
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_
.
assign
({
"infer_clean_graph_pass"
,
//
"attention_lstm_fuse_pass"
,
//
"seqpool_concat_fuse_pass"
,
//
"seqconv_eltadd_relu_fuse_pass"
,
//
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass"
,
//
"mul_lstm_fuse_pass"
,
//
"fc_gru_fuse_pass"
,
//
"mul_gru_fuse_pass"
,
//
"seq_concat_fc_fuse_pass"
,
//
"fc_fuse_pass"
,
//
"repeated_fc_relu_fuse_pass"
,
//
"squared_mat_sub_fuse_pass"
,
//
"conv_bn_fuse_pass"
,
//
"conv_eltwiseadd_bn_fuse_pass"
,
//
"is_test_pass"
,
//
"identity_scale_op_clean_pass"
,
//
});
use_gpu_
=
false
;
}
}
// namespace paddle
}
// namespace paddle
paddle/fluid/inference/api/paddle_pass_builder.h
浏览文件 @
f79a3a83
...
@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder {
...
@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder {
*/
*/
class
CpuPassStrategy
:
public
PassStrategy
{
class
CpuPassStrategy
:
public
PassStrategy
{
public:
public:
CpuPassStrategy
()
:
PassStrategy
({})
{
CpuPassStrategy
();
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_
.
assign
({
"infer_clean_graph_pass"
,
//
"attention_lstm_fuse_pass"
,
//
"seqpool_concat_fuse_pass"
,
//
"seqconv_eltadd_relu_fuse_pass"
,
//
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass"
,
//
"mul_lstm_fuse_pass"
,
//
"fc_gru_fuse_pass"
,
//
"mul_gru_fuse_pass"
,
//
"seq_concat_fc_fuse_pass"
,
//
"fc_fuse_pass"
,
//
"repeated_fc_relu_fuse_pass"
,
//
"squared_mat_sub_fuse_pass"
,
//
"conv_bn_fuse_pass"
,
//
"conv_eltwiseadd_bn_fuse_pass"
,
//
"is_test_pass"
,
//
"identity_scale_op_clean_pass"
,
//
});
use_gpu_
=
false
;
}
explicit
CpuPassStrategy
(
const
CpuPassStrategy
&
other
)
explicit
CpuPassStrategy
(
const
CpuPassStrategy
&
other
)
:
PassStrategy
(
other
.
AllPasses
())
{}
:
PassStrategy
(
other
.
AllPasses
())
{}
...
@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy {
...
@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy {
*/
*/
class
GpuPassStrategy
:
public
PassStrategy
{
class
GpuPassStrategy
:
public
PassStrategy
{
public:
public:
GpuPassStrategy
()
:
PassStrategy
({})
{
GpuPassStrategy
();
passes_
.
assign
({
"infer_clean_graph_pass"
,
//
"identity_scale_op_clean_pass"
,
//
"conv_affine_channel_fuse_pass"
,
//
"conv_eltwiseadd_affine_channel_fuse_pass"
,
//
"conv_bn_fuse_pass"
,
//
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass"
,
//
"conv_elementwise_add2_act_fuse_pass"
,
//
"conv_elementwise_add_fuse_pass"
,
//
#endif
});
for
(
int
i
=
6
;
i
>=
3
;
i
--
)
{
passes_
.
push_back
(
"transpose_flatten"
+
std
::
to_string
(
i
)
+
"_concat_fuse_pass"
);
}
use_gpu_
=
true
;
}
explicit
GpuPassStrategy
(
const
GpuPassStrategy
&
other
)
explicit
GpuPassStrategy
(
const
GpuPassStrategy
&
other
)
:
PassStrategy
(
other
.
AllPasses
())
{
:
PassStrategy
(
other
.
AllPasses
())
{
...
...
paddle/fluid/operators/controlflow/compare_op.cc
浏览文件 @
f79a3a83
...
@@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
comment
.
type
));
comment
.
type
));
AddInput
(
"Y"
,
string
::
Sprintf
(
"the right hand operand of %s operator"
,
AddInput
(
"Y"
,
string
::
Sprintf
(
"the right hand operand of %s operator"
,
comment
.
type
));
comment
.
type
));
AddAttr
<
int
>
(
"axis"
,
"The start dimension index for broadcasting Y onto X. [default -1]"
)
.
SetDefault
(
-
1
)
.
EqualGreaterThan
(
-
1
);
AddAttr
<
bool
>
(
"force_cpu"
,
AddAttr
<
bool
>
(
"force_cpu"
,
"Force fill output variable to cpu "
"Force fill output variable to cpu "
"memory. Otherwise, fill output variable to the running "
"memory. Otherwise, fill output variable to the running "
...
@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type. The each element of the Out tensor is
...
@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type. The each element of the Out tensor is
calculated by $%s$
calculated by $%s$
)DOC"
,
)DOC"
,
comment
.
equation
));
comment
.
equation
));
AddAttr
<
int
>
(
"axis"
,
"The start dimension index for broadcasting Y onto X. [default -1]"
)
.
SetDefault
(
-
1
)
.
EqualGreaterThan
(
-
1
);
}
}
};
};
...
...
paddle/fluid/operators/detection/density_prior_box_op.h
浏览文件 @
f79a3a83
...
@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#pragma omp parallel for
#endif
#endif
for
(
in
t
i
=
0
;
i
<
fixed_ratios
.
size
();
i
++
)
{
for
(
size_
t
i
=
0
;
i
<
fixed_ratios
.
size
();
i
++
)
{
sqrt_fixed_ratios
.
push_back
(
sqrt
(
fixed_ratios
[
i
]));
sqrt_fixed_ratios
.
push_back
(
sqrt
(
fixed_ratios
[
i
]));
}
}
...
@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
}
}
}
}
if
(
clip
)
{
if
(
clip
)
{
platform
::
Transform
<
platform
::
CPUDeviceContext
>
trans
;
T
*
dt
=
boxes
->
data
<
T
>
();
ClipFunctor
<
T
>
clip_func
;
std
::
transform
(
dt
,
dt
+
boxes
->
numel
(),
dt
,
[](
T
v
)
->
T
{
trans
(
ctx
.
template
device_context
<
platform
::
CPUDeviceContext
>(),
return
std
::
min
<
T
>
(
std
::
max
<
T
>
(
v
,
0.
),
1.
);
boxes
->
data
<
T
>
(),
boxes
->
data
<
T
>
()
+
boxes
->
numel
(),
});
boxes
->
data
<
T
>
(),
clip_func
);
}
}
framework
::
Tensor
var_t
;
framework
::
Tensor
var_t
;
var_t
.
mutable_data
<
T
>
(
var_t
.
mutable_data
<
T
>
(
...
@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2)
#endif
#endif
for
(
int
i
=
0
;
i
<
box_num
;
++
i
)
{
for
(
int
i
=
0
;
i
<
box_num
;
++
i
)
{
for
(
in
t
j
=
0
;
j
<
variances
.
size
();
++
j
)
{
for
(
size_
t
j
=
0
;
j
<
variances
.
size
();
++
j
)
{
e_vars
(
i
,
j
)
=
variances
[
j
];
e_vars
(
i
,
j
)
=
variances
[
j
];
}
}
}
}
...
...
paddle/fluid/operators/detection/prior_box_op.h
浏览文件 @
f79a3a83
...
@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
...
@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
}
}
}
}
template
<
typename
T
>
struct
ClipFunctor
{
HOSTDEVICE
inline
T
operator
()(
T
in
)
const
{
return
std
::
min
<
T
>
(
std
::
max
<
T
>
(
in
,
0.
),
1.
);
}
};
template
<
typename
T
>
template
<
typename
T
>
class
PriorBoxOpKernel
:
public
framework
::
OpKernel
<
T
>
{
class
PriorBoxOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
public:
...
@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
vars
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
vars
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
e_boxes
=
framework
::
EigenTensor
<
T
,
4
>::
From
(
*
boxes
);
T
*
b_t
=
boxes
->
data
<
T
>
(
);
for
(
int
h
=
0
;
h
<
feature_height
;
++
h
)
{
for
(
int
h
=
0
;
h
<
feature_height
;
++
h
)
{
for
(
int
w
=
0
;
w
<
feature_width
;
++
w
)
{
for
(
int
w
=
0
;
w
<
feature_width
;
++
w
)
{
T
center_x
=
(
w
+
offset
)
*
step_width
;
T
center_x
=
(
w
+
offset
)
*
step_width
;
T
center_y
=
(
h
+
offset
)
*
step_height
;
T
center_y
=
(
h
+
offset
)
*
step_height
;
T
box_width
,
box_height
;
T
box_width
,
box_height
;
int
idx
=
0
;
for
(
size_t
s
=
0
;
s
<
min_sizes
.
size
();
++
s
)
{
for
(
size_t
s
=
0
;
s
<
min_sizes
.
size
();
++
s
)
{
auto
min_size
=
min_sizes
[
s
];
auto
min_size
=
min_sizes
[
s
];
if
(
min_max_aspect_ratios_order
)
{
if
(
min_max_aspect_ratios_order
)
{
box_width
=
box_height
=
min_size
/
2.
;
box_width
=
box_height
=
min_size
/
2.
;
e_boxes
(
h
,
w
,
idx
,
0
)
=
(
center_x
-
box_width
)
/
img_width
;
b_t
[
0
]
=
(
center_x
-
box_width
)
/
img_width
;
e_boxes
(
h
,
w
,
idx
,
1
)
=
(
center_y
-
box_height
)
/
img_height
;
b_t
[
1
]
=
(
center_y
-
box_height
)
/
img_height
;
e_boxes
(
h
,
w
,
idx
,
2
)
=
(
center_x
+
box_width
)
/
img_width
;
b_t
[
2
]
=
(
center_x
+
box_width
)
/
img_width
;
e_boxes
(
h
,
w
,
idx
,
3
)
=
(
center_y
+
box_height
)
/
img_height
;
b_t
[
3
]
=
(
center_y
+
box_height
)
/
img_height
;
idx
++
;
b_t
+=
4
;
if
(
max_sizes
.
size
()
>
0
)
{
if
(
max_sizes
.
size
()
>
0
)
{
auto
max_size
=
max_sizes
[
s
];
auto
max_size
=
max_sizes
[
s
];
// square prior with size sqrt(minSize * maxSize)
// square prior with size sqrt(minSize * maxSize)
box_width
=
box_height
=
sqrt
(
min_size
*
max_size
)
/
2.
;
box_width
=
box_height
=
sqrt
(
min_size
*
max_size
)
/
2.
;
e_boxes
(
h
,
w
,
idx
,
0
)
=
(
center_x
-
box_width
)
/
img_width
;
b_t
[
0
]
=
(
center_x
-
box_width
)
/
img_width
;
e_boxes
(
h
,
w
,
idx
,
1
)
=
(
center_y
-
box_height
)
/
img_height
;
b_t
[
1
]
=
(
center_y
-
box_height
)
/
img_height
;
e_boxes
(
h
,
w
,
idx
,
2
)
=
(
center_x
+
box_width
)
/
img_width
;
b_t
[
2
]
=
(
center_x
+
box_width
)
/
img_width
;
e_boxes
(
h
,
w
,
idx
,
3
)
=
(
center_y
+
box_height
)
/
img_height
;
b_t
[
3
]
=
(
center_y
+
box_height
)
/
img_height
;
idx
++
;
b_t
+=
4
;
}
}
// priors with different aspect ratios
// priors with different aspect ratios
for
(
size_t
r
=
0
;
r
<
aspect_ratios
.
size
();
++
r
)
{
for
(
size_t
r
=
0
;
r
<
aspect_ratios
.
size
();
++
r
)
{
...
@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
}
}
box_width
=
min_size
*
sqrt
(
ar
)
/
2.
;
box_width
=
min_size
*
sqrt
(
ar
)
/
2.
;
box_height
=
min_size
/
sqrt
(
ar
)
/
2.
;
box_height
=
min_size
/
sqrt
(
ar
)
/
2.
;
e_boxes
(
h
,
w
,
idx
,
0
)
=
(
center_x
-
box_width
)
/
img_width
;
b_t
[
0
]
=
(
center_x
-
box_width
)
/
img_width
;
e_boxes
(
h
,
w
,
idx
,
1
)
=
(
center_y
-
box_height
)
/
img_height
;
b_t
[
1
]
=
(
center_y
-
box_height
)
/
img_height
;
e_boxes
(
h
,
w
,
idx
,
2
)
=
(
center_x
+
box_width
)
/
img_width
;
b_t
[
2
]
=
(
center_x
+
box_width
)
/
img_width
;
e_boxes
(
h
,
w
,
idx
,
3
)
=
(
center_y
+
box_height
)
/
img_height
;
b_t
[
3
]
=
(
center_y
+
box_height
)
/
img_height
;
idx
++
;
b_t
+=
4
;
}
}
}
else
{
}
else
{
// priors with different aspect ratios
// priors with different aspect ratios
...
@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
float
ar
=
aspect_ratios
[
r
];
float
ar
=
aspect_ratios
[
r
];
box_width
=
min_size
*
sqrt
(
ar
)
/
2.
;
box_width
=
min_size
*
sqrt
(
ar
)
/
2.
;
box_height
=
min_size
/
sqrt
(
ar
)
/
2.
;
box_height
=
min_size
/
sqrt
(
ar
)
/
2.
;
e_boxes
(
h
,
w
,
idx
,
0
)
=
(
center_x
-
box_width
)
/
img_width
;
b_t
[
0
]
=
(
center_x
-
box_width
)
/
img_width
;
e_boxes
(
h
,
w
,
idx
,
1
)
=
(
center_y
-
box_height
)
/
img_height
;
b_t
[
1
]
=
(
center_y
-
box_height
)
/
img_height
;
e_boxes
(
h
,
w
,
idx
,
2
)
=
(
center_x
+
box_width
)
/
img_width
;
b_t
[
2
]
=
(
center_x
+
box_width
)
/
img_width
;
e_boxes
(
h
,
w
,
idx
,
3
)
=
(
center_y
+
box_height
)
/
img_height
;
b_t
[
3
]
=
(
center_y
+
box_height
)
/
img_height
;
idx
++
;
b_t
+=
4
;
}
}
if
(
max_sizes
.
size
()
>
0
)
{
if
(
max_sizes
.
size
()
>
0
)
{
auto
max_size
=
max_sizes
[
s
];
auto
max_size
=
max_sizes
[
s
];
// square prior with size sqrt(minSize * maxSize)
// square prior with size sqrt(minSize * maxSize)
box_width
=
box_height
=
sqrt
(
min_size
*
max_size
)
/
2.
;
box_width
=
box_height
=
sqrt
(
min_size
*
max_size
)
/
2.
;
e_boxes
(
h
,
w
,
idx
,
0
)
=
(
center_x
-
box_width
)
/
img_width
;
b_t
[
0
]
=
(
center_x
-
box_width
)
/
img_width
;
e_boxes
(
h
,
w
,
idx
,
1
)
=
(
center_y
-
box_height
)
/
img_height
;
b_t
[
1
]
=
(
center_y
-
box_height
)
/
img_height
;
e_boxes
(
h
,
w
,
idx
,
2
)
=
(
center_x
+
box_width
)
/
img_width
;
b_t
[
2
]
=
(
center_x
+
box_width
)
/
img_width
;
e_boxes
(
h
,
w
,
idx
,
3
)
=
(
center_y
+
box_height
)
/
img_height
;
b_t
[
3
]
=
(
center_y
+
box_height
)
/
img_height
;
idx
++
;
b_t
+=
4
;
}
}
}
}
}
}
...
@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
...
@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
}
}
if
(
clip
)
{
if
(
clip
)
{
platform
::
Transform
<
platform
::
CPUDeviceContext
>
trans
;
T
*
dt
=
boxes
->
data
<
T
>
();
ClipFunctor
<
T
>
clip_func
;
std
::
transform
(
dt
,
dt
+
boxes
->
numel
(),
dt
,
[](
T
v
)
->
T
{
trans
(
ctx
.
template
device_context
<
platform
::
CPUDeviceContext
>(),
return
std
::
min
<
T
>
(
std
::
max
<
T
>
(
v
,
0.
),
1.
);
boxes
->
data
<
T
>
(),
boxes
->
data
<
T
>
()
+
boxes
->
numel
(),
});
boxes
->
data
<
T
>
(),
clip_func
);
}
}
framework
::
Tensor
var_t
;
framework
::
Tensor
var_t
;
...
...
paddle/fluid/operators/detection/yolov3_loss_op.cc
浏览文件 @
f79a3a83
...
@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
"The ignore threshold to ignore confidence loss."
)
"The ignore threshold to ignore confidence loss."
)
.
SetDefault
(
0.7
);
.
SetDefault
(
0.7
);
AddComment
(
R"DOC(
AddComment
(
R"DOC(
This operator generate
yolov3 loss by
given predict result and ground
This operator generate
s yolov3 loss based on
given predict result and ground
truth boxes.
truth boxes.
The output of previous network is in shape [N, C, H, W], while H and W
The output of previous network is in shape [N, C, H, W], while H and W
should be the same,
specify the grid size, each grid point predict given
should be the same,
H and W specify the grid size, each grid point predict
number boxes, this given number is specified by anchors, it should be
given number boxes, this given number, which following will be represented as S,
half anchors length, which following will be represented as S. In the
is specified by the number of anchors, In the second dimension(the channel
second dimention(the channel dimention), C should be S * (class_num + 5),
dimension), C should be equal to S * (class_num + 5), class_num is the object
c
lass_num is the box categoriy number of source dataset(such as coco),
c
ategory number of source dataset(such as 80 in coco dataset), so in the
s
o in the second dimention, stores 4 box location coordinates x, y, w, h
s
econd(channel) dimension, apart from 4 box location coordinates x, y, w, h,
a
nd
confidence score of the box and class one-hot key of each anchor box.
a
lso includes
confidence score of the box and class one-hot key of each anchor box.
While the 4 location coordinates if $$tx, ty, tw, th$$
, the box predictions
Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`
, the box predictions
correspnd to
:
should be as follows
:
$$
$$
b_x = \sigma(t_x) + c_x
b_x = \\sigma(t_x) + c_x
b_y = \sigma(t_y) + c_y
$$
$$
b_y = \\sigma(t_y) + c_y
$$
$$
b_w = p_w e^{t_w}
b_w = p_w e^{t_w}
$$
$$
b_h = p_h e^{t_h}
b_h = p_h e^{t_h}
$$
$$
While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
In the equation above, :math:`c_x, c_y` is the left top corner of current grid
is specified by anchors.
and :math:`p_w, p_h`
is specified by anchors.
As for confidence score, it is the logistic regression value of IoU between
As for confidence score, it is the logistic regression value of IoU between
anchor boxes and ground truth boxes, the score of the anchor box which has
anchor boxes and ground truth boxes, the score of the anchor box which has
the max IoU should be 1, and if the anchor box has IoU bigger th
e
n ignore
the max IoU should be 1, and if the anchor box has IoU bigger th
a
n ignore
thresh, the confidence score loss of this anchor box will be ignored.
thresh, the confidence score loss of this anchor box will be ignored.
Therefore, the yolov3 loss consist of three major parts, box location loss,
Therefore, the yolov3 loss consist of three major parts, box location loss,
...
@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
In order to trade off box coordinate losses between big boxes and small
In order to trade off box coordinate losses between big boxes and small
boxes, box coordinate losses will be mutiplied by scale weight, which is
boxes, box coordinate losses will be mutiplied by scale weight, which is
calculated as follow.
calculated as follow
s
.
$$
$$
weight_{box} = 2.0 - t_w * t_h
weight_{box} = 2.0 - t_w * t_h
$$
$$
Final loss will be represented as follow.
Final loss will be represented as follow
s
.
$$
$$
loss = (loss_{xy} + loss_{wh}) * weight_{box}
loss = (loss_{xy} + loss_{wh}) * weight_{box}
...
...
paddle/fluid/operators/elementwise/elementwise_op.h
浏览文件 @
f79a3a83
...
@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
...
@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
}
}
};
};
class
ElementwiseGradOpInplace
:
public
framework
::
InplaceInToOut
{
public:
using
framework
::
InplaceInToOut
::
InplaceInToOut
;
protected:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
Apply
(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
std
::
unordered_map
<
std
::
string
,
std
::
string
>
ret
;
if
(
block
->
HasVar
(
framework
::
GradVarName
(
"X"
))
&&
block
->
HasVar
(
framework
::
GradVarName
(
"Out"
)))
{
ret
[
framework
::
GradVarName
(
"Out"
)]
=
framework
::
GradVarName
(
"X"
);
}
return
ret
;
}
};
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
...
@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
...
@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
op_type##GradMaker, \
op_type##GradMaker, \
::paddle::operators::ElementwiseOpInplace); \
::paddle::operators::ElementwiseOpInplace); \
REGISTER_OPERATOR(op_type##_grad, \
REGISTER_OPERATOR(op_type##_grad, \
::paddle::operators::ElementwiseOpExplicitGrad)
::paddle::operators::ElementwiseOpExplicitGrad, \
::paddle::operators::ElementwiseGradOpInplace)
paddle/fluid/operators/lstm_op.h
浏览文件 @
f79a3a83
...
@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
...
@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
lstm_grad
.
prev_state_grad
=
c0_g
?
ordered_c0_g
.
data
<
T
>
()
:
nullptr
;
lstm_grad
.
prev_state_grad
=
c0_g
?
ordered_c0_g
.
data
<
T
>
()
:
nullptr
;
}
}
// lstm_value.output_value not used in bp, set to nullptr
// lstm_grad.state_active_grad not used in bp, set to nullptr
lstm_value
.
output_value
=
nullptr
;
lstm_grad
.
state_active_grad
=
nullptr
;
int
cur_batch_size
=
bend
-
bstart
;
int
cur_batch_size
=
bend
-
bstart
;
math
::
LstmUnitGradFunctor
<
DeviceContext
,
T
>::
compute
(
math
::
LstmUnitGradFunctor
<
DeviceContext
,
T
>::
compute
(
device_ctx
,
lstm_value
,
lstm_grad
,
frame_size
,
cur_batch_size
,
device_ctx
,
lstm_value
,
lstm_grad
,
frame_size
,
cur_batch_size
,
...
...
paddle/fluid/operators/lstmp_op.h
浏览文件 @
f79a3a83
...
@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
...
@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
}
}
int
cur_batch_size
=
bend
-
bstart
;
int
cur_batch_size
=
bend
-
bstart
;
// lstmp_value.output_value not used in bp, set to null
// lstmp_grad.state_active_grad not used in bp, set to null
lstmp_value
.
output_value
=
nullptr
;
lstmp_grad
.
state_active_grad
=
nullptr
;
math
::
LstmUnitGradFunctor
<
DeviceContext
,
T
>::
compute
(
math
::
LstmUnitGradFunctor
<
DeviceContext
,
T
>::
compute
(
device_ctx
,
lstmp_value
,
lstmp_grad
,
frame_size
,
cur_batch_size
,
device_ctx
,
lstmp_value
,
lstmp_grad
,
frame_size
,
cur_batch_size
,
gate_act
,
cell_act
,
cand_act
);
gate_act
,
cell_act
,
cand_act
);
...
...
paddle/fluid/operators/pool_op.cc
浏览文件 @
f79a3a83
...
@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
...
@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
"be ignored."
);
// TODO(Chengduo): Add checker.
"be ignored."
);
// TODO(Chengduo): Add checker.
// (Currently,
// (Currently,
// TypedAttrChecker don't support vector type.)
// TypedAttrChecker don't support vector type.)
AddAttr
<
bool
>
(
"global_pooling"
,
AddAttr
<
bool
>
(
"(bool, default false) Whether to use the global pooling. "
"global_pooling"
,
"If global_pooling = true, ksize and paddings will be ignored."
)
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, kernel size and paddings will be ignored."
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
std
::
vector
<
int
>>
(
"strides"
,
AddAttr
<
std
::
vector
<
int
>>
(
"strides"
,
"(vector<int>, default {1, 1}), strides(height, "
"(vector<int>, default {1, 1}), strides(height, "
...
@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
...
@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
"paddings"
,
"paddings"
,
"(vector<int>, default {0,0}), paddings(height, width) of pooling "
"(vector<int>, default {0,0}), paddings(height, width) of pooling "
"operator."
"operator."
"If global_pooling = true, paddings and ksize will be ignored."
)
"If global_pooling = true, paddings and k
ernel
size will be ignored."
)
.
SetDefault
({
0
,
0
});
.
SetDefault
({
0
,
0
});
AddAttr
<
bool
>
(
AddAttr
<
bool
>
(
"exclusive"
,
"exclusive"
,
...
@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
...
@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
AddAttr
<
bool
>
(
"ceil_mode"
,
"ceil_mode"
,
"(bool, default false) Wether to use the ceil function to calculate "
"(bool, default false) W
h
ether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used."
)
"the floor function will be used."
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
...
@@ -259,31 +260,40 @@ Example:
...
@@ -259,31 +260,40 @@ Example:
W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
$$
$$
For exclusive =
tru
e:
For exclusive =
fals
e:
$$
$$
hstart = i * strides[0] - paddings[0]
hstart = i * strides[0] - paddings[0]
$$
$$
hend = hstart + ksize[0]
hend = hstart + ksize[0]
$$
$$
wstart = j * strides[1] - paddings[1]
wstart = j * strides[1] - paddings[1]
$$
$$
wend = wstart + ksize[1]
wend = wstart + ksize[1]
$$
$$
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
$$
$$
For exclusive = false:
For exclusive = true:
$$
$$
hstart = max(0, i * strides[0] - paddings[0])
hstart = max(0, i * strides[0] - paddings[0])
$$
$$
hend = min(H, hstart + ksize[0])
hend = min(H, hstart + ksize[0])
$$
$$
wstart = max(0, j * strides[1] - paddings[1])
wstart = max(0, j * strides[1] - paddings[1])
$$
$$
wend = min(W, wstart + ksize[1])
wend = min(W, wstart + ksize[1])
$$
$$
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
$$
For adaptive = true:
$$
hstart = floor(i * H_{in} / H_{out})
hend = ceil((i + 1) * H_{in} / H_{out})
wstart = floor(j * W_{in} / W_{out})
wend = ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
)DOC"
);
)DOC"
);
}
}
...
@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
...
@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
AddAttr
<
bool
>
(
AddAttr
<
bool
>
(
"global_pooling"
,
"global_pooling"
,
"(bool, default false) Whether to use the global pooling. "
"(bool, default false) Whether to use the global pooling. "
"If global_pooling = true, k
size and paddings wille
be ignored."
)
"If global_pooling = true, k
ernel size and paddings will
be ignored."
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
std
::
vector
<
int
>>
(
AddAttr
<
std
::
vector
<
int
>>
(
"strides"
,
"strides"
,
...
@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
...
@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
.
SetDefault
(
false
);
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
AddAttr
<
bool
>
(
"ceil_mode"
,
"ceil_mode"
,
"(bool, default false) Wether to use the ceil function to calculate "
"(bool, default false) W
h
ether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used."
)
"the floor function will be used."
)
.
SetDefault
(
false
);
.
SetDefault
(
false
);
...
@@ -392,48 +402,68 @@ Example:
...
@@ -392,48 +402,68 @@ Example:
Output:
Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
For ceil_mode = false:
For ceil_mode = false:
$$
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
$$
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1
$$
$$
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$
For ceil_mode = true:
For ceil_mode = true:
$$
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
$$
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
$$
$$
H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
For exclusive = true:
$$
$$
$$
dstart = i * strides[0] - paddings[0]
W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
dend = dstart + ksize[0]
$$
hstart = j * strides[1] - paddings[1]
hend = hstart + ksize[1]
wstart = k * strides[2] - paddings[2]
wend = wstart + ksize[2]
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$
For exclusive = false:
For exclusive = false:
$$
$$
dstart = max(0, i * strides[0] - paddings[0])
dstart = i * strides[0] - paddings[0]
dend = min(D, dstart + ksize[0])
$$
hstart = max(0, j * strides[1] - paddings[1])
$$
hend = min(H, hstart + ksize[1])
dend = dstart + ksize[0]
wstart = max(0, k * strides[2] - paddings[2])
$$
wend = min(W, wstart + ksize[2])
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
hstart = j * strides[1] - paddings[1]
$$
$$
$$
For adaptive = true:
hend = hstart + ksize[1]
$$
$$
dstart = floor(i * D_{in} / D_{out})
$$
dend = ceil((i + 1) * D_{in} / D_{out})
wstart = k * strides[2] - paddings[2]
hstart = floor(j * H_{in} / H_{out})
$$
hend = ceil((j + 1) * H_{in} / H_{out})
$$
wstart = floor(k * W_{in} / W_{out})
wend = wstart + ksize[2]
wend = ceil((k + 1) * W_{in} / W_{out})
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
$$
For exclusive = true:
$$
dstart = max(0, i * strides[0] - paddings[0])
$$
$$
dend = min(D, dstart + ksize[0])
$$
$$
hend = min(H, hstart + ksize[1])
$$
$$
wstart = max(0, k * strides[2] - paddings[2])
$$
$$
wend = min(W, wstart + ksize[2])
$$
$$
Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
$$
)DOC"
);
)DOC"
);
}
}
...
...
paddle/fluid/operators/random_crop_op.h
浏览文件 @
f79a3a83
...
@@ -121,7 +121,7 @@ struct RandomCropFunctor {
...
@@ -121,7 +121,7 @@ struct RandomCropFunctor {
HOSTDEVICE
void
operator
()(
size_t
ins_idx
)
{
HOSTDEVICE
void
operator
()(
size_t
ins_idx
)
{
typename
Random
<
DeviceContext
>::
Engine
engine
(
seed_
);
typename
Random
<
DeviceContext
>::
Engine
engine
(
seed_
);
engine
.
discard
(
ins_idx
*
(
rank_
-
num_batchsize_dims_
));
engine
.
discard
(
ins_idx
*
(
rank_
-
num_batchsize_dims_
));
size_t
offsets
[
9
];
size_t
offsets
[
9
]
=
{}
;
for
(
int
i
=
num_batchsize_dims_
;
i
<
rank_
;
++
i
)
{
for
(
int
i
=
num_batchsize_dims_
;
i
<
rank_
;
++
i
)
{
typename
Random
<
DeviceContext
>::
template
UniformIntDist
<
size_t
>
dist
(
typename
Random
<
DeviceContext
>::
template
UniformIntDist
<
size_t
>
dist
(
0
,
x_dims_
[
i
]
-
out_dims_
[
i
]);
0
,
x_dims_
[
i
]
-
out_dims_
[
i
]);
...
...
paddle/fluid/operators/slice_op.cu
浏览文件 @
f79a3a83
...
@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <thrust/device_vector.h>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/slice_op.h"
#include "paddle/fluid/operators/slice_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
using
platform
::
PADDLE_CUDA_NUM_THREADS
;
template
<
size_t
D
>
__global__
void
Padding
(
const
paddle
::
platform
::
float16
*
d_out
,
const
int
*
out_dims
,
const
int
*
in_dims
,
const
int
*
offsets
,
int64_t
n
,
paddle
::
platform
::
float16
*
d_in
)
{
int64_t
out_idx
=
threadIdx
.
x
+
blockDim
.
x
*
blockIdx
.
x
;
if
(
out_idx
<
n
)
{
int64_t
out_idx_tmp
=
out_idx
;
int
coords
[
D
]
=
{
0
};
for
(
int
i
=
D
-
1
;
i
>=
0
;
--
i
)
{
coords
[
i
]
=
out_idx_tmp
%
out_dims
[
i
];
out_idx_tmp
/=
out_dims
[
i
];
coords
[
i
]
+=
offsets
[
i
];
}
int64_t
in_idx
=
0
;
for
(
int
i
=
0
;
i
<
D
;
++
i
)
{
in_idx
=
in_idx
*
in_dims
[
i
]
+
coords
[
i
];
}
d_in
[
in_idx
]
=
d_out
[
out_idx
];
}
}
template
<
>
class
SliceGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
float16
>
:
public
framework
::
OpKernel
<
paddle
::
platform
::
float16
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
d_out
=
ctx
.
Input
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
d_in
=
ctx
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Input"
));
d_in
->
mutable_data
<
paddle
::
platform
::
float16
>
(
ctx
.
GetPlace
());
auto
out_dims
=
d_out
->
dims
();
auto
in_dims
=
d_in
->
dims
();
int
rank
=
out_dims
.
size
();
std
::
vector
<
int
>
offsets
(
rank
,
0
);
auto
axes
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"axes"
);
auto
starts
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"starts"
);
for
(
size_t
i
=
0
;
i
<
starts
.
size
();
++
i
)
{
if
(
starts
[
i
]
<
0
)
{
starts
[
i
]
+=
in_dims
[
axes
[
i
]];
}
offsets
[
axes
[
i
]]
=
std
::
max
(
starts
[
i
],
0
);
}
math
::
SetConstant
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
float16
>
set_zero
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
paddle
::
platform
::
CUDADeviceContext
>();
set_zero
(
dev_ctx
,
d_in
,
static_cast
<
paddle
::
platform
::
float16
>
(
0
));
int64_t
numel
=
d_out
->
numel
();
dim3
blocks
((
numel
-
1
)
/
PADDLE_CUDA_NUM_THREADS
+
1
);
dim3
threads
(
PADDLE_CUDA_NUM_THREADS
);
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
auto
out_shape
=
framework
::
vectorize2int
(
out_dims
);
thrust
::
device_vector
<
int
>
out_dims_vec
(
out_shape
.
begin
(),
out_shape
.
end
());
auto
in_shape
=
framework
::
vectorize2int
(
in_dims
);
thrust
::
device_vector
<
int
>
in_dims_vec
(
in_shape
.
begin
(),
in_shape
.
end
());
thrust
::
device_vector
<
int
>
offsets_vec
(
offsets
.
begin
(),
offsets
.
end
());
const
int
*
out_dims_ptr
=
thrust
::
raw_pointer_cast
(
out_dims_vec
.
data
());
const
int
*
in_dims_ptr
=
thrust
::
raw_pointer_cast
(
in_dims_vec
.
data
());
const
int
*
offsets_ptr
=
thrust
::
raw_pointer_cast
(
offsets_vec
.
data
());
switch
(
rank
)
{
case
1
:
Padding
<
1
><<<
blocks
,
threads
,
0
,
stream
>>>
(
d_out
->
data
<
paddle
::
platform
::
float16
>
(),
out_dims_ptr
,
in_dims_ptr
,
offsets_ptr
,
numel
,
d_in
->
data
<
paddle
::
platform
::
float16
>
());
break
;
case
2
:
Padding
<
2
><<<
blocks
,
threads
,
0
,
stream
>>>
(
d_out
->
data
<
paddle
::
platform
::
float16
>
(),
out_dims_ptr
,
in_dims_ptr
,
offsets_ptr
,
numel
,
d_in
->
data
<
paddle
::
platform
::
float16
>
());
break
;
case
3
:
Padding
<
3
><<<
blocks
,
threads
,
0
,
stream
>>>
(
d_out
->
data
<
paddle
::
platform
::
float16
>
(),
out_dims_ptr
,
in_dims_ptr
,
offsets_ptr
,
numel
,
d_in
->
data
<
paddle
::
platform
::
float16
>
());
break
;
case
4
:
Padding
<
4
><<<
blocks
,
threads
,
0
,
stream
>>>
(
d_out
->
data
<
paddle
::
platform
::
float16
>
(),
out_dims_ptr
,
in_dims_ptr
,
offsets_ptr
,
numel
,
d_in
->
data
<
paddle
::
platform
::
float16
>
());
break
;
case
5
:
Padding
<
5
><<<
blocks
,
threads
,
0
,
stream
>>>
(
d_out
->
data
<
paddle
::
platform
::
float16
>
(),
out_dims_ptr
,
in_dims_ptr
,
offsets_ptr
,
numel
,
d_in
->
data
<
paddle
::
platform
::
float16
>
());
break
;
case
6
:
Padding
<
6
><<<
blocks
,
threads
,
0
,
stream
>>>
(
d_out
->
data
<
paddle
::
platform
::
float16
>
(),
out_dims_ptr
,
in_dims_ptr
,
offsets_ptr
,
numel
,
d_in
->
data
<
paddle
::
platform
::
float16
>
());
break
;
}
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
slice
,
ops
::
SliceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
slice
,
ops
::
SliceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SliceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SliceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SliceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SliceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SliceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
SliceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
SliceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
slice_grad
,
slice_grad
,
ops
::
SliceGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SliceGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
ops
::
SliceGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SliceGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
SliceGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SliceGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int
>
,
ops
::
SliceGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
);
ops
::
SliceGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
int64_t
>
,
ops
::
SliceGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/operators/softmax_with_cross_entropy_op.cu
浏览文件 @
f79a3a83
...
@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
...
@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Loss"
))
->
data
<
T
>
();
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Loss"
))
->
data
<
T
>
();
Tensor
*
logit_grad
=
Tensor
*
logit_grad
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Logits"
));
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Logits"
));
logit_grad
->
ShareDataWith
(
*
context
.
Input
<
Tensor
>
(
"Softmax"
));
framework
::
TensorCopy
(
*
context
.
Input
<
Tensor
>
(
"Softmax"
),
context
.
GetPlace
(),
context
.
device_context
(),
logit_grad
);
T
*
logit_grad_data
=
logit_grad
->
data
<
T
>
();
T
*
logit_grad_data
=
logit_grad
->
data
<
T
>
();
const
int
batch_size
=
logit_grad
->
dims
()[
0
];
const
int
batch_size
=
logit_grad
->
dims
()[
0
];
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
f79a3a83
...
@@ -398,9 +398,9 @@ PYBIND11_MODULE(core, m) {
...
@@ -398,9 +398,9 @@ PYBIND11_MODULE(core, m) {
py
::
arg
(
"recursive_sequence_lengths"
),
R"DOC(
py
::
arg
(
"recursive_sequence_lengths"
),
R"DOC(
Set LoD of the LoDTensor according to recursive sequence length.
Set LoD of the LoDTensor according to recursive sequence length.
For example, if recursive_sequence_lengths=[
2, 3
], meaning that
For example, if recursive_sequence_lengths=[
[2, 3]
], meaning that
there are two sequences with length 2 and 3 respectively, the
there are two sequences with length 2 and 3 respectively, the
corresponding lod would be [
0, 2, 2+3], i.e, [0, 2, 5
].
corresponding lod would be [
[0, 2, 2+3]], i.e, [[0, 2, 5]
].
Args:
Args:
recursive_sequence_lengths (List[List[int]]): sequence lengths.
recursive_sequence_lengths (List[List[int]]): sequence lengths.
...
@@ -803,9 +803,11 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -803,9 +803,11 @@ All parameter, weight, gradient are variables in Paddle.
.
def
(
py
::
init
<
const
platform
::
Place
&>
())
.
def
(
py
::
init
<
const
platform
::
Place
&>
())
.
def
(
"close"
,
&
Executor
::
Close
)
.
def
(
"close"
,
&
Executor
::
Close
)
.
def
(
"run"
,
[](
Executor
&
self
,
const
ProgramDesc
&
prog
,
Scope
*
scope
,
.
def
(
"run"
,
[](
Executor
&
self
,
const
ProgramDesc
&
prog
,
Scope
*
scope
,
int
block_id
,
bool
create_local_scope
,
bool
create_vars
)
{
int
block_id
,
bool
create_local_scope
,
bool
create_vars
,
const
std
::
vector
<
std
::
string
>
&
fetch_vars
)
{
pybind11
::
gil_scoped_release
release
;
pybind11
::
gil_scoped_release
release
;
self
.
Run
(
prog
,
scope
,
block_id
,
create_local_scope
,
create_vars
);
self
.
Run
(
prog
,
scope
,
block_id
,
create_local_scope
,
create_vars
,
fetch_vars
);
});
});
m
.
def
(
"init_gflags"
,
framework
::
InitGflags
);
m
.
def
(
"init_gflags"
,
framework
::
InitGflags
);
...
...
paddle/scripts/fast_install.sh
浏览文件 @
f79a3a83
此差异已折叠。
点击以展开。
python/paddle/__init__.py
浏览文件 @
f79a3a83
...
@@ -25,4 +25,5 @@ import paddle.reader
...
@@ -25,4 +25,5 @@ import paddle.reader
import
paddle.dataset
import
paddle.dataset
import
paddle.batch
import
paddle.batch
import
paddle.compat
import
paddle.compat
import
paddle.distributed
batch
=
batch
.
batch
batch
=
batch
.
batch
python/paddle/fluid/__init__.py
浏览文件 @
f79a3a83
...
@@ -161,7 +161,6 @@ def __bootstrap__():
...
@@ -161,7 +161,6 @@ def __bootstrap__():
'times_excess_than_required_tmp_allocation'
,
'times_excess_than_required_tmp_allocation'
,
'enable_inplace_whitelist'
'enable_inplace_whitelist'
]
]
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
core
.
init_gflags
([
sys
.
argv
[
0
]]
+
[
"--tryfromenv="
+
","
.
join
(
read_env_flags
)])
[
"--tryfromenv="
+
","
.
join
(
read_env_flags
)])
core
.
init_glog
(
sys
.
argv
[
0
])
core
.
init_glog
(
sys
.
argv
[
0
])
...
...
python/paddle/fluid/compiler.py
浏览文件 @
f79a3a83
...
@@ -177,7 +177,10 @@ class CompiledProgram(object):
...
@@ -177,7 +177,10 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
# if turn on python memory optimize, turn off the inplace_pass.
self
.
_build_strategy
.
enable_inplace
=
False
if
self
.
_program
.
_is_mem_optimized
else
True
if
self
.
_build_strategy
.
memory_optimize
is
True
:
self
.
_build_strategy
.
memory_optimize
=
False
if
self
.
_program
.
_is_mem_optimized
else
True
if
self
.
_build_strategy
.
enable_inplace
is
True
:
self
.
_build_strategy
.
enable_inplace
=
False
if
self
.
_program
.
_is_mem_optimized
else
True
if
self
.
_build_strategy
.
num_trainers
>
1
and
trainers_endpoints
:
if
self
.
_build_strategy
.
num_trainers
>
1
and
trainers_endpoints
:
assert
self
.
_build_strategy
.
num_trainers
==
len
(
assert
self
.
_build_strategy
.
num_trainers
==
len
(
...
...
python/paddle/fluid/contrib/int8_inference/README.md
浏览文件 @
f79a3a83
...
@@ -63,10 +63,10 @@ Notes:
...
@@ -63,10 +63,10 @@ Notes:
## 4. How to reproduce the results
## 4. How to reproduce the results
*
Small dataset
*
Small dataset
```
bash
```
bash
python python/paddle/fluid/contrib/tests/test_calibration.py
FLAGS_use_mkldnn
=
true
python python/paddle/fluid/contrib/tests/test_calibration.py
```
```
*
Full dataset
*
Full dataset
```
bash
```
bash
DATASET
=
full python python/paddle/fluid/contrib/tests/test_calibration.py
FLAGS_use_mkldnn
=
true
DATASET
=
full python python/paddle/fluid/contrib/tests/test_calibration.py
```
```
python/paddle/fluid/contrib/tests/CMakeLists.txt
浏览文件 @
f79a3a83
...
@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL)
...
@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL)
endif
()
endif
()
foreach
(
src
${
TEST_OPS
}
)
foreach
(
src
${
TEST_OPS
}
)
py_test
(
${
src
}
SRCS
${
src
}
.py
)
if
(
src MATCHES
"test_calibration"
)
py_test
(
${
src
}
SRCS
${
src
}
.py ENVS FLAGS_use_mkldnn=true
)
else
()
py_test
(
${
src
}
SRCS
${
src
}
.py
)
endif
()
endforeach
()
endforeach
()
python/paddle/fluid/contrib/tests/test_calibration.py
浏览文件 @
f79a3a83
...
@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
...
@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
def
run_program
(
self
,
model_path
,
generate_int8
=
False
,
algo
=
'direct'
):
def
run_program
(
self
,
model_path
,
generate_int8
=
False
,
algo
=
'direct'
):
image_shape
=
[
3
,
224
,
224
]
image_shape
=
[
3
,
224
,
224
]
os
.
environ
[
'FLAGS_use_mkldnn'
]
=
'True'
fluid
.
memory_optimize
(
fluid
.
default_main_program
())
fluid
.
memory_optimize
(
fluid
.
default_main_program
())
...
@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
...
@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
label
=
label
.
reshape
([
-
1
,
1
])
label
=
label
.
reshape
([
-
1
,
1
])
running_program
=
calibrator
.
sampling_program
.
clone
(
running_program
=
calibrator
.
sampling_program
.
clone
(
)
if
generate_int8
else
infer_program
.
clone
()
)
if
generate_int8
else
infer_program
.
clone
()
for
op
in
running_program
.
current_block
().
ops
:
if
op
.
has_attr
(
"use_mkldnn"
):
op
.
_set_attr
(
"use_mkldnn"
,
True
)
t1
=
time
.
time
()
t1
=
time
.
time
()
_
,
acc1
,
_
=
exe
.
run
(
_
,
acc1
,
_
=
exe
.
run
(
...
...
python/paddle/fluid/executor.py
浏览文件 @
f79a3a83
...
@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):
...
@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):
class
Executor
(
object
):
class
Executor
(
object
):
"""
"""
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
ParallelExecutor.
Python executor takes a program, adds feed operators and fetch operators to this program according
Python executor takes a program, add feed operators and fetch operators to this program according
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
the variables(or names) that user want
to get after program run
. Note: the executor will run all
the variables(or names) that user want
s to get after program runs
. Note: the executor will run all
operators in the program but not only the operators dependent by the fetch_list.
operators in the program but not only the operators dependent by the fetch_list.
It store the global variables into the global scope, and create a local scope for the temporary
It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward finished.
variables. The contents in local scope may be discarded after every minibatch forward/backward
But the global scope variables will be persistent through different runs.
finished. But the global scope variables will be persistent through different runs.
All of ops in program will be running in sequence.
Example:
Example:
.. code-block:: python
# First create the Executor.
.. code-block:: python
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
# First create the Executor.
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
# Run the startup program once and only once.
exe = fluid.Executor(place)
# Not need to optimize/compile the startup program.
exe.run(fluid.default_startup_program())
# Run the startup program once and only once.
# Not need to optimize/compile the startup program.
# Run the main program directly without compile.
exe.run(fluid.default_startup_program())
loss, = exe.run(fluid.default_main_program(),
feed=feed_dict,
# Run the main program directly without compile.
fetch_list=[loss.name])
loss, = exe.run(fluid.default_main_program(),
# Or, compiled the program and run. See `CompiledProgram` for more detail.
feed=feed_dict,
compiled_prog = compiler.CompiledProgram(
fetch_list=[loss.name])
fluid.default_main_program()).with_data_parallel(
# Or, compiled the program and run. See `CompiledProgram` for more detail.
loss_name=loss.name)
compiled_prog = compiler.CompiledProgram(
loss, = exe.run(compiled_prog,
fluid.default_main_program()).with_data_parallel(
feed=feed_dict,
loss_name=loss.name)
fetch_list=[loss.name])
loss, = exe.run(compiled_prog,
feed=feed_dict,
fetch_list=[loss.name])
Args:
Args:
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
They has the exactly same arguments, and expected the same results.
"""
"""
def
__init__
(
self
,
place
):
def
__init__
(
self
,
place
):
...
@@ -382,6 +379,12 @@ class Executor(object):
...
@@ -382,6 +379,12 @@ class Executor(object):
]
]
return
outs
return
outs
'''
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
'''
def
close
(
self
):
def
close
(
self
):
"""
"""
Close this executor.
Close this executor.
...
@@ -389,9 +392,6 @@ class Executor(object):
...
@@ -389,9 +392,6 @@ class Executor(object):
You can no longer use this executor after calling this method.
You can no longer use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to
For the distributed training, this method would free the resource on PServers related to
the current Trainer.
the current Trainer.
TODO(typhoonzero): Define "no longer use" meaning? Can user create
a new Executor for the same program and run?
TODO(panyx0718): Why ParallelExecutor doesn't have close?
Example:
Example:
>>> cpu = core.CPUPlace()
>>> cpu = core.CPUPlace()
...
...
python/paddle/fluid/framework.py
浏览文件 @
f79a3a83
...
@@ -555,7 +555,8 @@ class OpProtoHolder(object):
...
@@ -555,7 +555,8 @@ class OpProtoHolder(object):
return
{
return
{
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
(),
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
(),
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
(),
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
(),
core
.
op_proto_and_checker_maker
.
kOpNameScopeAttrName
()
core
.
op_proto_and_checker_maker
.
kOpNameScopeAttrName
(),
core
.
op_proto_and_checker_maker
.
kOpCreationCallstackAttrName
()
}
}
...
...
python/paddle/fluid/io.py
浏览文件 @
f79a3a83
...
@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
...
@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
dtype
=
slice_var
.
dtype
,
dtype
=
slice_var
.
dtype
,
persistable
=
True
)
persistable
=
True
)
dim1_flatten
=
reduce
(
lambda
x
,
y
:
x
*
y
,
slice
.
shape
[
1
:])
dim1_flatten
=
1
if
len
(
slice
.
shape
)
>=
2
:
dim1_flatten
=
reduce
(
lambda
x
,
y
:
x
*
y
,
slice
.
shape
[
1
:])
start
=
int
(
offset
/
dim1_flatten
)
start
=
int
(
offset
/
dim1_flatten
)
end
=
int
(
offset
/
dim1_flatten
+
slice
.
shape
[
0
])
end
=
int
(
offset
/
dim1_flatten
+
slice
.
shape
[
0
])
...
@@ -892,7 +895,7 @@ def save_inference_model(dirname,
...
@@ -892,7 +895,7 @@ def save_inference_model(dirname,
True is supported.
True is supported.
Returns:
Returns:
None
target_var_name_list(list): The fetch variables' name list
Raises:
Raises:
ValueError: If `feed_var_names` is not a list of basestring.
ValueError: If `feed_var_names` is not a list of basestring.
...
@@ -945,11 +948,13 @@ def save_inference_model(dirname,
...
@@ -945,11 +948,13 @@ def save_inference_model(dirname,
# TODO(Superjomn) add an IR pass to remove 1-scale op.
# TODO(Superjomn) add an IR pass to remove 1-scale op.
with
program_guard
(
main_program
):
with
program_guard
(
main_program
):
uniq_target_vars
=
[]
uniq_target_vars
=
[]
for
var
in
target_vars
:
for
i
,
var
in
enumerate
(
target_vars
)
:
if
isinstance
(
var
,
Variable
):
if
isinstance
(
var
,
Variable
):
var1
=
layers
.
scale
(
var
,
1.
)
var
=
layers
.
scale
(
uniq_target_vars
.
append
(
var1
)
var
,
1.
,
name
=
"save_infer_model/scale_{}"
.
format
(
i
))
uniq_target_vars
.
append
(
var
)
target_vars
=
uniq_target_vars
target_vars
=
uniq_target_vars
target_var_name_list
=
[
var
.
name
for
var
in
target_vars
]
# when a pserver and a trainer running on the same machine, mkdir may conflict
# when a pserver and a trainer running on the same machine, mkdir may conflict
try
:
try
:
...
@@ -1006,6 +1011,7 @@ def save_inference_model(dirname,
...
@@ -1006,6 +1011,7 @@ def save_inference_model(dirname,
params_filename
=
os
.
path
.
basename
(
params_filename
)
params_filename
=
os
.
path
.
basename
(
params_filename
)
save_persistables
(
executor
,
dirname
,
main_program
,
params_filename
)
save_persistables
(
executor
,
dirname
,
main_program
,
params_filename
)
return
target_var_name_list
def
load_inference_model
(
dirname
,
def
load_inference_model
(
dirname
,
...
...
python/paddle/fluid/layers/control_flow.py
浏览文件 @
f79a3a83
...
@@ -506,9 +506,9 @@ class While(object):
...
@@ -506,9 +506,9 @@ class While(object):
while loop control flow.
while loop control flow.
Args:
Args:
cond
(Variable): condition used to compare.
cond(Variable): condition used to compare.
is_test(bool): A flag indicating whether execution is in test phase.
is_test(bool): A flag indicating whether execution is in test phase.
name
(str): The name of this layer.
name(str): The name of this layer.
Examples:
Examples:
.. code-block:: python
.. code-block:: python
...
@@ -589,7 +589,8 @@ class While(object):
...
@@ -589,7 +589,8 @@ class While(object):
def
lod_rank_table
(
x
,
level
=
0
):
def
lod_rank_table
(
x
,
level
=
0
):
"""LoD Rank Table Operator. Given an input variable **x** and a level number
"""
LoD Rank Table Operator. Given an input variable **x** and a level number
of LoD, this layer creates a LodRankTable object. A LoDRankTable object
of LoD, this layer creates a LodRankTable object. A LoDRankTable object
contains a list of bi-element tuples. Each tuple consists of an index and
contains a list of bi-element tuples. Each tuple consists of an index and
a length, both of which are int type. Refering to specified level of LoD,
a length, both of which are int type. Refering to specified level of LoD,
...
@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
...
@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
return
cond
return
cond
def
equal
(
x
,
y
,
cond
=
None
,
**
ignored
):
def
equal
(
x
,
y
,
cond
=
None
):
"""
"""
**equal**
This layer returns the truth value of :math:`x == y` elementwise.
This layer returns the truth value of :math:`x == y` elementwise.
Args:
Args:
...
@@ -1458,7 +1457,6 @@ class DynamicRNN(object):
...
@@ -1458,7 +1457,6 @@ class DynamicRNN(object):
Returns:
Returns:
The current timestep in the input sequence.
The current timestep in the input sequence.
"""
"""
self
.
_assert_in_rnn_block_
(
"step_input"
)
self
.
_assert_in_rnn_block_
(
"step_input"
)
if
not
isinstance
(
x
,
Variable
):
if
not
isinstance
(
x
,
Variable
):
...
@@ -1535,8 +1533,7 @@ class DynamicRNN(object):
...
@@ -1535,8 +1533,7 @@ class DynamicRNN(object):
@
signature_safe_contextmanager
@
signature_safe_contextmanager
def
block
(
self
):
def
block
(
self
):
"""
"""
The block for user to define operators in RNN. See the class docstring
The block for user to define operators in RNN.
for more details.
"""
"""
if
self
.
status
!=
DynamicRNN
.
BEFORE_RNN
:
if
self
.
status
!=
DynamicRNN
.
BEFORE_RNN
:
raise
ValueError
(
"rnn.block() can only be invoke once"
)
raise
ValueError
(
"rnn.block() can only be invoke once"
)
...
@@ -1640,8 +1637,7 @@ class DynamicRNN(object):
...
@@ -1640,8 +1637,7 @@ class DynamicRNN(object):
dtype(str|numpy.dtype): The data type of the initialized memory.
dtype(str|numpy.dtype): The data type of the initialized memory.
Returns:
Returns:
the memory variable.
The memory variable.
"""
"""
self
.
_assert_in_rnn_block_
(
'memory'
)
self
.
_assert_in_rnn_block_
(
'memory'
)
self
.
_init_zero_idx_
()
self
.
_init_zero_idx_
()
...
@@ -1740,7 +1736,7 @@ class DynamicRNN(object):
...
@@ -1740,7 +1736,7 @@ class DynamicRNN(object):
def
output
(
self
,
*
outputs
):
def
output
(
self
,
*
outputs
):
"""
"""
m
ark the RNN output variables.
M
ark the RNN output variables.
Args:
Args:
outputs: The output variables.
outputs: The output variables.
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
f79a3a83
...
@@ -545,15 +545,16 @@ def yolov3_loss(x,
...
@@ -545,15 +545,16 @@ def yolov3_loss(x,
TypeError: Attr ignore_thresh of yolov3_loss must be a float number
TypeError: Attr ignore_thresh of yolov3_loss must be a float number
Examples:
Examples:
.. code-block:: python
.. code-block:: python
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchors = [0, 1, 2]
anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors,
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors,
ignore_thresh=0.5, downsample_ratio=32)
anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32)
"""
"""
helper
=
LayerHelper
(
'yolov3_loss'
,
**
locals
())
helper
=
LayerHelper
(
'yolov3_loss'
,
**
locals
())
...
...
python/paddle/fluid/layers/io.py
浏览文件 @
f79a3a83
...
@@ -56,7 +56,10 @@ def data(name,
...
@@ -56,7 +56,10 @@ def data(name,
Args:
Args:
name(str): The name/alias of the function
name(str): The name/alias of the function
shape(list): Tuple declaring the shape.
shape(list): Tuple declaring the shape. If :code:`append_batch_size` is
True and there is no -1 inside :code:`shape`, it should be
considered as the shape of the each sample. Otherwise, it
should be considered as the shape of the batched data.
append_batch_size(bool):
append_batch_size(bool):
1. If true, it prepends -1 to the shape.
1. If true, it prepends -1 to the shape.
For example if shape=[1], the resulting shape is [-1, 1].
For example if shape=[1], the resulting shape is [-1, 1].
...
...
python/paddle/fluid/layers/layer_function_generator.py
浏览文件 @
f79a3a83
...
@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype
...
@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype
from
..layer_helper
import
LayerHelper
from
..layer_helper
import
LayerHelper
__all__
=
[
__all__
=
[
'deprecated'
,
'generate_layer_fn'
,
'generate_
layer_fn_noattr
'
,
'autodoc'
,
'deprecated'
,
'generate_layer_fn'
,
'generate_
activation_fn
'
,
'autodoc'
,
'templatedoc'
'templatedoc'
]
]
...
@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None):
...
@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None):
buf
.
write
(
'
\n
'
)
buf
.
write
(
'
\n
'
)
skip_attrs
=
OpProtoHolder
.
generated_op_attr_names
()
skip_attrs
=
OpProtoHolder
.
generated_op_attr_names
()
# attr use_mkldnn and is_test also should not be visible to users.
skip_attrs
.
add
(
"use_mkldnn"
)
skip_attrs
.
add
(
"is_test"
)
for
each_attr
in
op_proto
.
attrs
:
for
each_attr
in
op_proto
.
attrs
:
if
each_attr
.
name
in
skip_attrs
:
if
each_attr
.
name
in
skip_attrs
:
...
@@ -226,7 +229,7 @@ def generate_layer_fn(op_type):
...
@@ -226,7 +229,7 @@ def generate_layer_fn(op_type):
return
func
return
func
def
generate_
layer_fn_noattr
(
op_type
):
def
generate_
activation_fn
(
op_type
):
"""Register the Python layer for an Operator without Attribute.
"""Register the Python layer for an Operator without Attribute.
Args:
Args:
...
@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type):
...
@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type):
func
.
__name__
=
op_type
func
.
__name__
=
op_type
func
.
__doc__
=
_generate_doc_string_
(
op_proto
)
func
.
__doc__
=
_generate_doc_string_
(
op_proto
)
return
func
return
func
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
f79a3a83
...
@@ -2441,7 +2441,7 @@ def pool2d(input,
...
@@ -2441,7 +2441,7 @@ def pool2d(input,
data = fluid.layers.data(
data = fluid.layers.data(
name='data', shape=[3, 32, 32], dtype='float32')
name='data', shape=[3, 32, 32], dtype='float32')
conv
2d = fluid.layers.pool2d(
pool
2d = fluid.layers.pool2d(
input=data,
input=data,
pool_size=2,
pool_size=2,
pool_type='max',
pool_type='max',
...
@@ -2490,6 +2490,7 @@ def pool2d(input,
...
@@ -2490,6 +2490,7 @@ def pool2d(input,
return
pool_out
return
pool_out
@
templatedoc
()
def
pool3d
(
input
,
def
pool3d
(
input
,
pool_size
=-
1
,
pool_size
=-
1
,
pool_type
=
"max"
,
pool_type
=
"max"
,
...
@@ -2501,13 +2502,19 @@ def pool3d(input,
...
@@ -2501,13 +2502,19 @@ def pool3d(input,
name
=
None
,
name
=
None
,
exclusive
=
True
):
exclusive
=
True
):
"""
"""
This function adds the operator for pooling in 3-dimensions, using the
${comment}
pooling configurations mentioned in input parameters.
Args:
Args:
input (Variable): ${input_comment}
input (Variable): The input tensor of pooling operator. The format of
pool_size (int): ${ksize_comment}
input tensor is NCDHW, where N is batch size, C is
pool_type (str): ${pooling_type_comment}
the number of channels, D is the depth of the feature,
H is the height of the feature, and W is the width
of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size
is a tuple or list, it must contain three integers,
(pool_size_Depth, pool_size_Height, pool_size_Width).
Otherwise, the pool kernel size will be the cube of an int.
pool_type (string): ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size.
pool_padding (int): padding size.
global_pooling (bool): ${global_pooling_comment}
global_pooling (bool): ${global_pooling_comment}
...
@@ -2520,6 +2527,19 @@ def pool3d(input,
...
@@ -2520,6 +2527,19 @@ def pool3d(input,
Returns:
Returns:
Variable: output of pool3d layer.
Variable: output of pool3d layer.
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[3, 32, 32, 32], dtype='float32')
pool3d = fluid.layers.pool3d(
input=data,
pool_size=2,
pool_type='max',
pool_stride=1,
global_pooling=False)
"""
"""
if
pool_type
not
in
[
"max"
,
"avg"
]:
if
pool_type
not
in
[
"max"
,
"avg"
]:
raise
ValueError
(
raise
ValueError
(
...
@@ -2569,7 +2589,27 @@ def adaptive_pool2d(input,
...
@@ -2569,7 +2589,27 @@ def adaptive_pool2d(input,
require_index
=
False
,
require_index
=
False
,
name
=
None
):
name
=
None
):
"""
"""
${comment}
**Adaptive Pool2d Operator**
The adaptive_pool2d operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
size, C is the number of channels, H is the height of the feature, and W is
the width of the feature. Parameters(pool_size) should contain two elements which
represent height and width, respectively. Also the H and W dimensions of output(Out)
is same as Parameter(pool_size).
For average adaptive pool2d:
.. math::
hstart &= floor(i * H_{in} / H_{out})
hend &= ceil((i + 1) * H_{in} / H_{out})
wstart &= floor(j * W_{in} / W_{out})
wend &= ceil((j + 1) * W_{in} / W_{out})
Output(i ,j) &=
\\
frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
Args:
Args:
input (Variable): The input tensor of pooling operator. The format of
input (Variable): The input tensor of pooling operator. The format of
...
@@ -2579,8 +2619,8 @@ def adaptive_pool2d(input,
...
@@ -2579,8 +2619,8 @@ def adaptive_pool2d(input,
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two integers, (pool_size_Height, pool_size_Width).
it must contain two integers, (pool_size_Height, pool_size_Width).
pool_type: ${pooling_type_comment}
pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point
along with outputs.
require_index (bool): If true, the index of max pooling point
will be returned along
i
t cannot be set in average pooling type.
with outputs. I
t cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
layer will be named automatically.
...
@@ -2661,18 +2701,42 @@ def adaptive_pool3d(input,
...
@@ -2661,18 +2701,42 @@ def adaptive_pool3d(input,
require_index
=
False
,
require_index
=
False
,
name
=
None
):
name
=
None
):
"""
"""
${comment}
**Adaptive Pool3d Operator**
The adaptive_pool3d operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
size, C is the number of channels, D is the depth of the feature, H is the height of
the feature, and W is the width of the feature. Parameters(pool_size) should contain
three elements which represent height and width, respectively. Also the D, H and W
dimensions of output(Out) is same as Parameter(pool_size).
For average adaptive pool3d:
.. math::
dstart &= floor(i * D_{in} / D_{out})
dend &= ceil((i + 1) * D_{in} / D_{out})
hstart &= floor(j * H_{in} / H_{out})
hend &= ceil((j + 1) * H_{in} / H_{out})
wstart &= floor(k * W_{in} / W_{out})
wend &= ceil((k + 1) * W_{in} / W_{out})
Output(i ,j, k) &=
\\
frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args:
Args:
input (Variable): The input tensor of pooling operator. The format of
input (Variable): The input tensor of pooling operator. The format of
input tensor is NCHW, where N is batch size, C is
input tensor is NC
D
HW, where N is batch size, C is
the number of channels,
H is the height of the
the number of channels,
D is the depth of the feature,
feature, and W is the width of the feature.
H is the height of the
feature, and W is the width of the feature.
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain t
wo
integers, (Depth, Height, Width).
it must contain t
hree
integers, (Depth, Height, Width).
pool_type: ${pooling_type_comment}
pool_type: ${pooling_type_comment}
require_index (bool): If true, the index of max pooling point
along with outputs.
require_index (bool): If true, the index of max pooling point
will be returned along
i
t cannot be set in average pooling type.
with outputs. I
t cannot be set in average pooling type.
name (str|None): A name for this layer(optional). If set None, the
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
layer will be named automatically.
...
@@ -2709,7 +2773,7 @@ def adaptive_pool3d(input,
...
@@ -2709,7 +2773,7 @@ def adaptive_pool3d(input,
name='data', shape=[3, 32, 32], dtype='float32')
name='data', shape=[3, 32, 32], dtype='float32')
pool_out, mask = fluid.layers.adaptive_pool3d(
pool_out, mask = fluid.layers.adaptive_pool3d(
input=data,
input=data,
pool_size=[3, 3],
pool_size=[3, 3
, 3
],
pool_type='avg')
pool_type='avg')
"""
"""
if
pool_type
not
in
[
"max"
,
"avg"
]:
if
pool_type
not
in
[
"max"
,
"avg"
]:
...
@@ -2930,6 +2994,7 @@ def batch_norm(input,
...
@@ -2930,6 +2994,7 @@ def batch_norm(input,
"momentum"
:
momentum
,
"momentum"
:
momentum
,
"epsilon"
:
epsilon
,
"epsilon"
:
epsilon
,
"is_test"
:
is_test
,
"is_test"
:
is_test
,
"data_layout"
:
data_layout
,
"use_mkldnn"
:
False
,
"use_mkldnn"
:
False
,
"fuse_with_relu"
:
fuse_with_relu
,
"fuse_with_relu"
:
fuse_with_relu
,
"use_global_stats"
:
use_global_stats
"use_global_stats"
:
use_global_stats
...
@@ -3235,7 +3300,7 @@ def group_norm(input,
...
@@ -3235,7 +3300,7 @@ def group_norm(input,
# create output
# create output
mean_out
=
helper
.
create_variable
(
dtype
=
dtype
,
stop_gradient
=
True
)
mean_out
=
helper
.
create_variable
(
dtype
=
dtype
,
stop_gradient
=
True
)
variance_out
=
helper
.
create_variable
(
dtype
=
dtype
,
stop_gradient
=
True
)
variance_out
=
helper
.
create_variable
(
dtype
=
dtype
,
stop_gradient
=
True
)
group_norm_out
=
helper
.
create_variable
(
dtype
)
group_norm_out
=
helper
.
create_variable
(
dtype
=
dtype
)
helper
.
append_op
(
helper
.
append_op
(
type
=
"group_norm"
,
type
=
"group_norm"
,
...
...
python/paddle/fluid/layers/ops.py
浏览文件 @
f79a3a83
...
@@ -14,7 +14,7 @@
...
@@ -14,7 +14,7 @@
from
__future__
import
print_function
from
__future__
import
print_function
import
os
import
os
from
.layer_function_generator
import
generate_layer_fn
,
generate_
layer_fn_noattr
from
.layer_function_generator
import
generate_layer_fn
,
generate_
activation_fn
from
..
import
core
from
..
import
core
from
..framework
import
convert_np_dtype_to_dtype_
from
..framework
import
convert_np_dtype_to_dtype_
...
@@ -53,7 +53,7 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
...
@@ -53,7 +53,7 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
__all__
+=
__activations_noattr__
__all__
+=
__activations_noattr__
for
_OP
in
set
(
__activations_noattr__
):
for
_OP
in
set
(
__activations_noattr__
):
globals
()[
_OP
]
=
generate_
layer_fn_noattr
(
_OP
)
globals
()[
_OP
]
=
generate_
activation_fn
(
_OP
)
__all__
+=
[
"uniform_random"
]
__all__
+=
[
"uniform_random"
]
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
f79a3a83
...
@@ -1368,9 +1368,9 @@ class FtrlOptimizer(Optimizer):
...
@@ -1368,9 +1368,9 @@ class FtrlOptimizer(Optimizer):
Args:
Args:
learning_rate (float|Variable): global learning rate.
learning_rate (float|Variable): global learning rate.
l1 (float):
l1 (float):
L1 regularization strength.
l2 (float):
l2 (float):
L2 regularization strength.
lr_power (float):
lr_power (float):
Learning Rate Power.
regularization: A Regularizer, such as
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
fluid.regularizer.L2DecayRegularizer.
name: A optional name prefix.
name: A optional name prefix.
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
f79a3a83
...
@@ -148,7 +148,10 @@ class ParallelExecutor(object):
...
@@ -148,7 +148,10 @@ class ParallelExecutor(object):
else
framework
.
default_main_program
()
else
framework
.
default_main_program
()
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
# if turn on python memory optimize, turn off the inplace_pass.
build_strategy
.
enable_inplace
=
False
if
main
.
_is_mem_optimized
else
True
if
build_strategy
.
memory_optimize
is
True
:
build_strategy
.
memory_optimize
=
False
if
main
.
_is_mem_optimized
else
True
if
build_strategy
.
enable_inplace
is
True
:
build_strategy
.
enable_inplace
=
False
if
main
.
_is_mem_optimized
else
True
scope
=
scope
if
scope
is
not
None
else
executor
.
global_scope
()
scope
=
scope
if
scope
is
not
None
else
executor
.
global_scope
()
if
share_vars_from
and
not
isinstance
(
share_vars_from
,
if
share_vars_from
and
not
isinstance
(
share_vars_from
,
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
f79a3a83
...
@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
...
@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
list
(
REMOVE_ITEM TEST_OPS test_nearest_interp_op
)
list
(
REMOVE_ITEM TEST_OPS test_nearest_interp_op
)
list
(
REMOVE_ITEM TEST_OPS test_imperative_resnet
)
list
(
REMOVE_ITEM TEST_OPS test_imperative_resnet
)
list
(
REMOVE_ITEM TEST_OPS test_imperative_optimizer
)
list
(
REMOVE_ITEM TEST_OPS test_imperative_optimizer
)
list
(
REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer
)
foreach
(
TEST_OP
${
TEST_OPS
}
)
foreach
(
TEST_OP
${
TEST_OPS
}
)
py_test_modules
(
${
TEST_OP
}
MODULES
${
TEST_OP
}
)
py_test_modules
(
${
TEST_OP
}
MODULES
${
TEST_OP
}
)
endforeach
(
TEST_OP
)
endforeach
(
TEST_OP
)
...
@@ -107,10 +108,18 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
...
@@ -107,10 +108,18 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules
(
test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL
)
py_test_modules
(
test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL
)
set_tests_properties
(
test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450
)
set_tests_properties
(
test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450
)
py_test_modules
(
test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL
)
py_test_modules
(
test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL
)
if
(
NOT WIN32
)
py_test_modules
(
test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL
)
endif
()
if
(
NOT APPLE
)
if
(
NOT APPLE
)
py_test_modules
(
test_image_classification_resnet MODULES test_image_classification_resnet SERIAL
)
py_test_modules
(
test_image_classification_resnet MODULES test_image_classification_resnet SERIAL
)
if
(
CMAKE_BUILD_TYPE STREQUAL
"Debug"
)
# change the timeout from 600 to 1200, because in debug mode, this test need more time.
set_tests_properties
(
test_image_classification_resnet PROPERTIES TIMEOUT 1200
)
endif
()
endif
()
endif
()
if
(
WITH_NGRAPH
)
if
(
WITH_NGRAPH
)
add_subdirectory
(
ngraph
)
add_subdirectory
(
ngraph
)
endif
()
endif
()
...
...
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
浏览文件 @
f79a3a83
...
@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
...
@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
if
use_reduce
else
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
if
use_reduce
else
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
build_strategy
.
fuse_elewise_add_act_ops
=
fuse_elewise_add_act_ops
build_strategy
.
fuse_elewise_add_act_ops
=
fuse_elewise_add_act_ops
build_strategy
.
fuse_relu_depthwise_conv
=
fuse_relu_depthwise_conv
build_strategy
.
fuse_relu_depthwise_conv
=
fuse_relu_depthwise_conv
build_strategy
.
memory_optimize
=
use_ir_memory_optimize
build_strategy
.
memory_optimize
=
False
if
memory_opt
else
use_ir_memory_optimize
# python memory optimization is conflict with inplace pass.
# python memory optimization is conflict with inplace pass.
# Use ir graph memory optimization after inplace pass is the correct way.
# Use ir graph memory optimization after inplace pass is the correct way.
build_strategy
.
enable_inplace
=
False
if
memory_opt
else
enable_inplace
build_strategy
.
enable_inplace
=
False
if
memory_opt
else
enable_inplace
...
...
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
浏览文件 @
f79a3a83
...
@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase):
regularization
=
fluid
.
regularizer
.
L2Decay
(
1e-6
))
regularization
=
fluid
.
regularizer
.
L2Decay
(
1e-6
))
return
optimizer
return
optimizer
# NOTE(dzh):
# need to make it compatible with elewise fuse act
not_fuse_op_first_loss
,
not_fuse_op_last_loss
=
self
.
check_network_convergence
(
not_fuse_op_first_loss
,
not_fuse_op_last_loss
=
self
.
check_network_convergence
(
model
,
model
,
feed_dict
=
{
"image"
:
img
,
feed_dict
=
{
"image"
:
img
,
...
@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda
=
use_cuda
,
use_cuda
=
use_cuda
,
fuse_elewise_add_act_ops
=
False
,
fuse_elewise_add_act_ops
=
False
,
memory_opt
=
False
,
memory_opt
=
False
,
use_ir_memory_optimize
=
False
,
optimizer
=
_optimizer
)
optimizer
=
_optimizer
)
fuse_op_first_loss
,
fuse_op_last_loss
=
self
.
check_network_convergence
(
fuse_op_first_loss
,
fuse_op_last_loss
=
self
.
check_network_convergence
(
model
,
model
,
...
@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda
=
use_cuda
,
use_cuda
=
use_cuda
,
fuse_elewise_add_act_ops
=
True
,
fuse_elewise_add_act_ops
=
True
,
memory_opt
=
False
,
memory_opt
=
False
,
use_ir_memory_optimize
=
False
,
optimizer
=
_optimizer
)
optimizer
=
_optimizer
)
for
loss
in
zip
(
not_fuse_op_first_loss
,
fuse_op_first_loss
):
for
loss
in
zip
(
not_fuse_op_first_loss
,
fuse_op_first_loss
):
...
...
python/paddle/fluid/tests/unittests/test_slice_op.py
浏览文件 @
f79a3a83
此差异已折叠。
点击以展开。
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
f79a3a83
...
@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object):
...
@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object):
skip_dim0
=
0
skip_dim0
=
0
slice_vars
=
self
.
param_var_mapping
[
orig_var_name
]
slice_vars
=
self
.
param_var_mapping
[
orig_var_name
]
orig_dim1_flatten
=
reduce
(
lambda
x
,
y
:
x
*
y
,
slice_vars
[
0
].
shape
[
1
:])
orig_dim1_flatten
=
1
if
len
(
slice_vars
[
0
].
shape
)
>=
2
:
orig_dim1_flatten
=
reduce
(
lambda
x
,
y
:
x
*
y
,
slice_vars
[
0
].
shape
[
1
:])
for
slice_var
in
slice_vars
[:
block_idx
]:
for
slice_var
in
slice_vars
[:
block_idx
]:
skip_dim0
+=
slice_var
.
shape
[
0
]
skip_dim0
+=
slice_var
.
shape
[
0
]
...
...
python/requirements.txt
浏览文件 @
f79a3a83
requests==2.9.2
requests==2.9.2
numpy>=1.12
numpy>=1.12
protobuf
==3.1
protobuf
>=3.1.0
recordio>=0.1.0
recordio>=0.1.0
matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
rarfile
rarfile
...
...
python/setup.py.in
浏览文件 @
f79a3a83
...
@@ -100,6 +100,7 @@ packages=['paddle',
...
@@ -100,6 +100,7 @@ packages=['paddle',
'paddle.utils',
'paddle.utils',
'paddle.dataset',
'paddle.dataset',
'paddle.reader',
'paddle.reader',
'paddle.distributed',
'paddle.fluid',
'paddle.fluid',
'paddle.fluid.imperative',
'paddle.fluid.imperative',
'paddle.fluid.proto',
'paddle.fluid.proto',
...
...
tools/manylinux1/build_scripts/build.sh
浏览文件 @
f79a3a83
此差异已折叠。
点击以展开。
tools/run_mp.py
已删除
100644 → 0
浏览文件 @
d2d3f2b5
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录