Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
8f18f4d1
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
695
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8f18f4d1
编写于
7月 27, 2018
作者:
M
minqiyang
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into fix_hang_up
上级
ce4ddd78
4f100396
变更
89
显示空白变更内容
内联
并排
Showing
89 changed file
with
1351 addition
and
703 deletion
+1351
-703
doc/fluid/design/ir/draft.md
doc/fluid/design/ir/draft.md
+24
-24
paddle/fluid/API.spec
paddle/fluid/API.spec
+1
-20
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+6
-1
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+2
-2
paddle/fluid/framework/details/multi_devices_graph_builder.cc
...le/fluid/framework/details/multi_devices_graph_builder.cc
+101
-50
paddle/fluid/framework/details/multi_devices_graph_builder.h
paddle/fluid/framework/details/multi_devices_graph_builder.h
+17
-14
paddle/fluid/framework/details/rpc_op_handle.cc
paddle/fluid/framework/details/rpc_op_handle.cc
+2
-1
paddle/fluid/framework/details/ssa_graph_builder.cc
paddle/fluid/framework/details/ssa_graph_builder.cc
+16
-8
paddle/fluid/framework/details/ssa_graph_builder.h
paddle/fluid/framework/details/ssa_graph_builder.h
+9
-12
paddle/fluid/framework/details/ssa_graph_checker.cc
paddle/fluid/framework/details/ssa_graph_checker.cc
+1
-1
paddle/fluid/framework/details/ssa_graph_checker.h
paddle/fluid/framework/details/ssa_graph_checker.h
+3
-2
paddle/fluid/framework/details/ssa_graph_printer.cc
paddle/fluid/framework/details/ssa_graph_printer.cc
+2
-2
paddle/fluid/framework/details/ssa_graph_printer.h
paddle/fluid/framework/details/ssa_graph_printer.h
+4
-3
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+2
-1
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+2
-2
paddle/fluid/framework/details/var_handle.cc
paddle/fluid/framework/details/var_handle.cc
+1
-1
paddle/fluid/framework/executor.cc
paddle/fluid/framework/executor.cc
+3
-9
paddle/fluid/framework/executor.h
paddle/fluid/framework/executor.h
+3
-9
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+3
-2
paddle/fluid/framework/ir/graph.cc
paddle/fluid/framework/ir/graph.cc
+68
-17
paddle/fluid/framework/ir/graph.h
paddle/fluid/framework/ir/graph.h
+58
-16
paddle/fluid/framework/ir/graph_helper.cc
paddle/fluid/framework/ir/graph_helper.cc
+118
-0
paddle/fluid/framework/ir/graph_helper.h
paddle/fluid/framework/ir/graph_helper.h
+40
-0
paddle/fluid/framework/ir/graph_helper_test.cc
paddle/fluid/framework/ir/graph_helper_test.cc
+125
-0
paddle/fluid/framework/ir/graph_test.cc
paddle/fluid/framework/ir/graph_test.cc
+17
-15
paddle/fluid/framework/ir/node.cc
paddle/fluid/framework/ir/node.cc
+5
-1
paddle/fluid/framework/ir/node.h
paddle/fluid/framework/ir/node.h
+3
-0
paddle/fluid/framework/mixed_vector.h
paddle/fluid/framework/mixed_vector.h
+6
-6
paddle/fluid/framework/mixed_vector_test.cc
paddle/fluid/framework/mixed_vector_test.cc
+72
-0
paddle/fluid/framework/mixed_vector_test.cu
paddle/fluid/framework/mixed_vector_test.cu
+2
-41
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+1
-1
paddle/fluid/inference/api/api_impl.cc
paddle/fluid/inference/api/api_impl.cc
+1
-0
paddle/fluid/inference/api/demo_ci/clean.sh
paddle/fluid/inference/api/demo_ci/clean.sh
+4
-0
paddle/fluid/inference/tensorrt/convert/fc_op.cc
paddle/fluid/inference/tensorrt/convert/fc_op.cc
+6
-8
paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
...le/fluid/inference/tensorrt/convert/test_activation_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+7
-6
paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
+1
-1
paddle/fluid/inference/tensorrt/convert/ut_helper.h
paddle/fluid/inference/tensorrt/convert/ut_helper.h
+5
-5
paddle/fluid/inference/tensorrt/engine.cc
paddle/fluid/inference/tensorrt/engine.cc
+39
-23
paddle/fluid/inference/tensorrt/engine.h
paddle/fluid/inference/tensorrt/engine.h
+4
-0
paddle/fluid/inference/tensorrt/test_engine.cc
paddle/fluid/inference/tensorrt/test_engine.cc
+37
-3
paddle/fluid/inference/tests/test_helper.h
paddle/fluid/inference/tests/test_helper.h
+6
-4
paddle/fluid/memory/detail/buddy_allocator.cc
paddle/fluid/memory/detail/buddy_allocator.cc
+11
-6
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+2
-2
paddle/fluid/operators/conv_cudnn_op.cu.cc
paddle/fluid/operators/conv_cudnn_op.cu.cc
+13
-13
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+9
-9
paddle/fluid/operators/distributed/CMakeLists.txt
paddle/fluid/operators/distributed/CMakeLists.txt
+1
-1
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+11
-28
paddle/fluid/operators/distributed/grpc_client.h
paddle/fluid/operators/distributed/grpc_client.h
+8
-9
paddle/fluid/operators/distributed/request_handler.h
paddle/fluid/operators/distributed/request_handler.h
+0
-2
paddle/fluid/operators/distributed/request_handler_impl.cc
paddle/fluid/operators/distributed/request_handler_impl.cc
+4
-7
paddle/fluid/operators/distributed/rpc_client.h
paddle/fluid/operators/distributed/rpc_client.h
+5
-9
paddle/fluid/operators/distributed/rpc_server.cc
paddle/fluid/operators/distributed/rpc_server.cc
+6
-12
paddle/fluid/operators/distributed/rpc_server.h
paddle/fluid/operators/distributed/rpc_server.h
+2
-3
paddle/fluid/operators/distributed/rpc_server_test.cc
paddle/fluid/operators/distributed/rpc_server_test.cc
+25
-4
paddle/fluid/operators/math/im2col.cc
paddle/fluid/operators/math/im2col.cc
+30
-5
paddle/fluid/operators/math/im2col_test.cc
paddle/fluid/operators/math/im2col_test.cc
+72
-0
paddle/fluid/operators/math/softmax.cu
paddle/fluid/operators/math/softmax.cu
+2
-2
paddle/fluid/operators/pool_cudnn_op.cu.cc
paddle/fluid/operators/pool_cudnn_op.cu.cc
+2
-2
paddle/fluid/operators/send_recv_util.h
paddle/fluid/operators/send_recv_util.h
+5
-1
paddle/fluid/operators/tensorrt_engine_op.cc
paddle/fluid/operators/tensorrt_engine_op.cc
+4
-3
paddle/fluid/operators/tensorrt_engine_op.h
paddle/fluid/operators/tensorrt_engine_op.h
+3
-1
paddle/fluid/operators/tensorrt_engine_op_test.cc
paddle/fluid/operators/tensorrt_engine_op_test.cc
+16
-16
paddle/fluid/platform/cudnn_helper.h
paddle/fluid/platform/cudnn_helper.h
+6
-7
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+1
-4
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+1
-0
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+28
-28
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+21
-4
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+2
-3
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+0
-98
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+3
-11
python/paddle/fluid/tests/demo/file_reader/.gitignore
python/paddle/fluid/tests/demo/file_reader/.gitignore
+0
-0
python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
.../fluid/tests/demo/file_reader/convert_data_to_recordio.py
+2
-2
python/paddle/fluid/tests/demo/file_reader/train.py
python/paddle/fluid/tests/demo/file_reader/train.py
+138
-0
python/paddle/fluid/tests/test_error_clip.py
python/paddle/fluid/tests/test_error_clip.py
+1
-1
python/paddle/fluid/tests/test_if_else_op.py
python/paddle/fluid/tests/test_if_else_op.py
+8
-5
python/paddle/fluid/tests/unittests/op_test.py
python/paddle/fluid/tests/unittests/op_test.py
+1
-1
python/paddle/fluid/tests/unittests/test_conditional_block.py
...on/paddle/fluid/tests/unittests/test_conditional_block.py
+3
-2
python/paddle/fluid/tests/unittests/test_const_value.py
python/paddle/fluid/tests/unittests/test_const_value.py
+1
-1
python/paddle/fluid/tests/unittests/test_dyn_rnn.py
python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+11
-7
python/paddle/fluid/tests/unittests/test_lod_rank_table.py
python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+2
-1
python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
...paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+12
-7
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
...dle/fluid/tests/unittests/test_parallel_executor_mnist.py
+28
-57
python/paddle/fluid/tests/unittests/test_parallel_op.py
python/paddle/fluid/tests/unittests/test_parallel_op.py
+1
-1
python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
...n/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+2
-1
python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
...on/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+7
-4
python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
...uid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+6
-6
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+6
-4
python/paddle/fluid/transpiler/memory_optimization_transpiler.py
...paddle/fluid/transpiler/memory_optimization_transpiler.py
+1
-1
未找到文件。
doc/fluid/design/ir/draft.md
浏览文件 @
8f18f4d1
## Motivation
There is a
`
``gap```
between the
```Program``
`
defined by
user and the
`
``Executable``
`
that can be scheduled
There is a
`
gap`
between the
`Program
`
defined by
user and the
`
Executable
`
that can be scheduled
efficiently on heterogeneous hardware, either locally
or distributedly.
Usually, the
`
``gap``
`
is bridged by
Usually, the
`
gap
`
is bridged by
*
A serious transformations with defined order.
*
These transformations usually involve
`
``
insert, delete, clustering, split, dependency analysis``
`.
`
insert, delete, clustering, split, dependency analysis
`
.
*
Has a simple way to verify and debug each transformation.
...
...
@@ -38,44 +38,44 @@ design below.
#### Node
`
``
Node
``
` represents an operation that performs some computation or
`
Node
`
represents an operation that performs some computation or
a variable that is input or output of operation.
`
``
Node
```s are connected to other ```
Node
``
`s via inputs and outputs.
`
Node`
s are connected to other
`Node
`
s via inputs and outputs.
Other properties (maybe device placement information) can be added
to `
``
Node
``
` in the future if it's a
common requirement of many other `
``
Pass
``
`es. Otherwise, it should live
in a `
``
Node
``` wrapper class that is private to some ```
Pass
``
` or be
a local member of a `
``
Pass
``
`.
to
`
Node
`
in the future if it's a
common requirement of many other
`
Pass
`
es. Otherwise, it should live
in a
`
Node`
wrapper class that is private to some
`Pass
`
or be
a local member of a
`
Pass
`
.
#### Graph
`
``
Graph
``` contains a list of ```
Node
``
`s, which are connected to
`
Graph`
contains a list of
`Node
`
s, which are connected to
each other via inputs and outputs.
TODO: Better definitions for the graph.
`
``
Graph
``` can also contain ```
Attribute
```s. ```
Attribute
``
`s
can be `
`any`
` thing. For example, it can be a list of "wraper"
nodes. The `
``
wrapper
``` nodes compose ```
Node
``
`s and provide
helper method for execution or transformation. `
``
Attribute
``
`
`
Graph`
can also contain
`Attribute`
s.
`Attribute
`
s
can be
`
any
`
thing. For example, it can be a list of "wraper"
nodes. The
`
wrapper`
nodes compose
`Node
`
s and provide
helper method for execution or transformation.
`
Attribute
`
can also contain other things that describe some properties of
the `
``
Graph
``` or ```
Graph
``` nodes. ```
Attribute
``
` can be passed
across `
``
Pass
``
`. However, it should be used with care.
the
`
Graph`
or
`Graph`
nodes.
`Attribute
`
can be passed
across
`
Pass
`
. However, it should be used with care.
#### Pass
`
``
Pass
``` represents a transformation of ```
Graph
``
`. Its input
is a `
``
Graph
``` and its output is also a ```
Graph
``
`. For example,
a `
``
Pass
``` can simply print out the ```
Graph
```. A ```
Pass
``
`
can also fuse some `
``
Graph
```'s ```
Node
``
`s.
`
Pass`
represents a transformation of
`Graph
`
. Its input
is a
`
Graph`
and its output is also a
`Graph
`
. For example,
a
`
Pass`
can simply print out the
`Graph`
. A
`Pass
`
can also fuse some
`
Graph`
's
`Node
`
s.
#### Optimize
`
``
Optimize
``` contains a series of ```
Pass
``
` with defined order.
`
``
Optimize
``` transforms a ```
Graph
``
` that only contains raw
modeling logic to a `
``
Graph
``
`
that can be run efficiently while
`
Optimize`
contains a series of
`Pass
`
with defined order.
`
Optimize`
transforms a
`Graph
`
that only contains raw
modeling logic to a
`
Graph
`
that can be run efficiently while
maintaining the original modeling logic.
...
...
paddle/fluid/API.spec
浏览文件 @
8f18f4d1
paddle.fluid.Variable.__init__ ArgSpec(args=['self', 'block', 'type', 'name', 'shape', 'dtype', 'lod_level', 'capacity', 'persistable', 'error_clip', 'stop_gradient', 'is_data'], varargs=None, keywords='kwargs', defaults=(VarType.LOD_TENSOR, None, None, None, None, None, None, None, False, False))
paddle.fluid.Variable.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Variable.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,))
...
...
@@ -33,8 +28,6 @@ paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=Non
paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None)
paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.set_desc ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.set_error_clip ArgSpec(args=['self', 'error_clip'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
...
...
@@ -42,8 +35,7 @@ paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', def
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.begin_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.end_pass ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
...
...
@@ -207,31 +199,23 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.lod_rank_table ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.max_sequence_len ArgSpec(args=['rank_table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.lod_tensor_to_array ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_to_lod_tensor ArgSpec(args=['x', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True))
paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shrink_memory ArgSpec(args=['x', 'i', 'table'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.IfElse.false_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.output ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
paddle.fluid.layers.IfElse.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.IfElse.true_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.DynamicRNN.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
...
...
@@ -240,9 +224,6 @@ paddle.fluid.layers.DynamicRNN.output ArgSpec(args=['self'], varargs='outputs',
paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ConditionalBlock.__init__ ArgSpec(args=['self', 'inputs', 'is_scalar_condition', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.ConditionalBlock.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ConditionalBlock.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1))
...
...
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
8f18f4d1
...
...
@@ -22,7 +22,12 @@ endif()
cc_test
(
eigen_test SRCS eigen_test.cc DEPS tensor
)
nv_test
(
mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context tensor
)
if
(
WITH_GPU
)
nv_test
(
mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor
)
else
()
cc_test
(
mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor
)
endif
()
cc_library
(
lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio
)
cc_test
(
lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory
)
nv_test
(
lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor
)
...
...
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
8f18f4d1
cc_library
(
var_handle SRCS var_handle.cc DEPS place framework_proto
)
cc_library
(
var_handle SRCS var_handle.cc DEPS place framework_proto
node
)
cc_library
(
op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor
)
cc_library
(
scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
)
cc_library
(
fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
)
cc_library
(
computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry
)
cc_library
(
rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry
)
cc_library
(
ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph
)
cc_library
(
ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph
graph_helper
)
cc_library
(
ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder
)
cc_library
(
ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder
)
...
...
paddle/fluid/framework/details/multi_devices_graph_builder.cc
浏览文件 @
8f18f4d1
...
...
@@ -25,6 +25,7 @@
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/scope.h"
...
...
@@ -67,7 +68,8 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
}
}
void
MultiDevSSAGraphBuilder
::
CreateOpHandleIOs
(
Graph
*
result
,
ir
::
Node
*
node
,
void
MultiDevSSAGraphBuilder
::
CreateOpHandleIOs
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
size_t
place_id
)
const
{
auto
p
=
places_
[
place_id
];
auto
*
op_handle
=
result
->
Get
<
GraphOps
>
(
"ops"
).
back
().
get
();
...
...
@@ -92,12 +94,11 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(Graph *result, ir::Node *node,
}
std
::
vector
<
std
::
string
>
MultiDevSSAGraphBuilder
::
FindDistTrainSendVars
(
const
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>
>
&
nodes
)
const
{
const
std
::
vector
<
ir
::
Node
*
>
&
nodes
)
const
{
std
::
vector
<
std
::
string
>
send_vars
;
// since parameters are all in block 0,
// it's enough to only scan send ops in block 0
for
(
auto
&
node
:
nodes
)
{
if
(
node
->
NodeType
()
!=
ir
::
Node
::
Type
::
kOperation
)
continue
;
OpDesc
*
op
=
node
->
Op
();
// TODO(Yancey1989): use a graceful method to find send op,
// instead of the the hard code string
...
...
@@ -112,10 +113,9 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
}
std
::
vector
<
std
::
string
>
MultiDevSSAGraphBuilder
::
FindDistTrainRecvVars
(
const
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>
>
&
nodes
)
const
{
const
std
::
vector
<
ir
::
Node
*
>
&
nodes
)
const
{
std
::
vector
<
std
::
string
>
recv_vars
;
for
(
auto
&
node
:
nodes
)
{
if
(
node
->
NodeType
()
!=
ir
::
Node
::
Type
::
kOperation
)
continue
;
OpDesc
*
op
=
node
->
Op
();
// TODO(Yancey1989): use a graceful method to find recv op,
// instead of the hard code string
...
...
@@ -170,6 +170,7 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
const
std
::
vector
<
std
::
string
>
&
var_names
)
const
{
int64_t
numel_sum
=
0
;
for
(
auto
var_name
:
var_names
)
{
if
(
all_vars_
.
find
(
var_name
)
==
all_vars_
.
end
())
continue
;
auto
var_desc
=
all_vars_
.
at
(
var_name
);
PADDLE_ENFORCE_NOT_NULL
(
var_desc
);
auto
dim
=
framework
::
make_ddim
(
var_desc
->
GetShape
());
...
...
@@ -186,19 +187,70 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
return
dev_id
;
}
std
::
unique_ptr
<
Graph
>
MultiDevSSAGraphBuilder
::
Apply
(
std
::
unique_ptr
<
Graph
>
graph
)
const
{
// Rebuild the graph structure.
auto
nodes
=
std
::
move
(
graph
->
nodes
);
graph
->
nodes
.
clear
();
// Topology sort the graph nodes from inputs to outputs.
// Since SSAGraphBuilder depends on forward/backward nodes to assign devices
// to parameter/gradients before optimizer ops, topo sort is insufficient. (
// some optimizer ops might not depend on any nodes), we manually move all
// optimizer nodes after last backward nodes.
// However, the assumption by SSAGraphBuilder should be relaxed in the future.
std
::
vector
<
ir
::
Node
*>
SortOpsAndDelayOptimizeOp
(
const
ir
::
Graph
&
graph
)
{
std
::
vector
<
ir
::
Node
*>
ret
=
ir
::
TopologySortOperations
(
graph
);
size_t
last_backward
=
0
;
for
(
size_t
i
=
0
;
i
<
ret
.
size
();
++
i
)
{
if
(
boost
::
get
<
int
>
(
ret
[
i
]
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
==
static_cast
<
int
>
(
OpRole
::
kBackward
))
{
last_backward
=
i
;
}
}
std
::
vector
<
ir
::
Node
*>
optimize_ops
;
std
::
vector
<
ir
::
Node
*>
sorted_ret
;
for
(
size_t
i
=
0
;
i
<
ret
.
size
();
++
i
)
{
if
(
i
<
last_backward
)
{
if
(
boost
::
get
<
int
>
(
ret
[
i
]
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
==
static_cast
<
int
>
(
OpRole
::
kOptimize
))
{
optimize_ops
.
push_back
(
ret
[
i
]);
}
else
{
sorted_ret
.
push_back
(
ret
[
i
]);
}
}
else
if
(
i
==
last_backward
)
{
sorted_ret
.
push_back
(
ret
[
i
]);
// Verify that no operations before optimize ops depends on optimize ops.
std
::
unordered_set
<
ir
::
Node
*>
optimize_set
(
optimize_ops
.
begin
(),
optimize_ops
.
end
());
for
(
ir
::
Node
*
n
:
sorted_ret
)
{
for
(
ir
::
Node
*
in
:
n
->
inputs
)
{
for
(
ir
::
Node
*
pre_n
:
in
->
inputs
)
{
PADDLE_ENFORCE
(
optimize_set
.
find
(
pre_n
)
==
optimize_set
.
end
(),
"optimize operations cannot be depended by forward "
"or backward node %s -> %s"
,
pre_n
->
Name
(),
n
->
Name
());
}
}
}
sorted_ret
.
insert
(
sorted_ret
.
end
(),
optimize_ops
.
begin
(),
optimize_ops
.
end
());
}
else
{
sorted_ret
.
push_back
(
ret
[
i
]);
}
}
return
sorted_ret
;
}
std
::
unique_ptr
<
ir
::
Graph
>
MultiDevSSAGraphBuilder
::
Apply
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
{
// Give the topology sort order and rebuild the graph structure.
std
::
vector
<
ir
::
Node
*>
sorted_ops
=
SortOpsAndDelayOptimizeOp
(
*
graph
);
auto
nodes
=
graph
->
ReleaseNodes
();
ir
::
Graph
&
result
=
*
graph
;
for
(
auto
&
node
:
nodes
)
{
if
(
node
->
NodeType
()
==
ir
::
Node
::
Type
::
kVariable
)
{
if
(
node
->
NodeType
()
==
ir
::
Node
::
Type
::
kVariable
&&
node
->
Var
()
)
{
all_vars_
.
emplace
(
node
->
Name
(),
node
->
Var
());
}
}
Graph
&
result
=
*
graph
;
std
::
unordered_set
<
std
::
string
>
og_has_been_broadcast
;
// We cannot invoke resize. It is a bug of GCC 4.8
...
...
@@ -207,9 +259,9 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
result
.
Set
(
"ops"
,
new
GraphOps
);
// find send/recv vars so that we can place the distributed training
// re
al
ted op in the place 0
auto
send_vars
=
FindDistTrainSendVars
(
node
s
);
auto
recv_vars
=
FindDistTrainRecvVars
(
node
s
);
// re
la
ted op in the place 0
auto
send_vars
=
FindDistTrainSendVars
(
sorted_op
s
);
auto
recv_vars
=
FindDistTrainRecvVars
(
sorted_op
s
);
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>
bcast_var_name_set
;
bcast_var_name_set
.
resize
(
places_
.
size
());
...
...
@@ -217,22 +269,18 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
size_t
cur_device_id
=
0
;
bool
is_forwarding
=
true
;
// NOTE: Currently, passes before SSAGraphBuilder cannot reorder
// forward, backward nodes. E.g. you can't append an forward node
// at the end of the node list.
// TODO(panyx0718): FIXME: Needs to sort by forward->backward order.
for
(
auto
&
node
:
nodes
)
{
if
(
node
->
NodeType
()
!=
ir
::
Node
::
Type
::
kOperation
)
continue
;
for
(
ir
::
Node
*
node
:
sorted_ops
)
{
if
(
boost
::
get
<
int
>
(
node
->
Op
()
->
GetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
()))
==
static_cast
<
int
>
(
OpRole
::
kRPC
))
{
CreateRPCOp
(
&
result
,
node
.
get
()
);
}
else
if
(
IsDistTrainOp
(
node
.
get
()
,
send_vars
,
recv_vars
))
{
CreateDistTrainOp
(
&
result
,
node
.
get
()
);
}
else
if
(
IsScaleLossOp
(
node
.
get
()
))
{
CreateRPCOp
(
&
result
,
node
);
}
else
if
(
IsDistTrainOp
(
node
,
send_vars
,
recv_vars
))
{
CreateDistTrainOp
(
&
result
,
node
);
}
else
if
(
IsScaleLossOp
(
node
))
{
// user can customize loss@grad if not use_default_grad_scale_
if
(
strategy_
.
gradient_scale_
!=
BuildStrategy
::
GradientScaleStrategy
::
kCustomized
)
{
// TODO(paddle-dev): Why is there no input for this op_handle?
CreateScaleLossGradOp
(
&
result
);
}
// This assumes the backward generating code will ensure IsScaleLossOp
...
...
@@ -241,24 +289,23 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
// the block.
is_forwarding
=
false
;
}
else
{
int
op_dev_id
=
GetOpDeviceID
(
node
.
get
()
);
int
op_dev_id
=
GetOpDeviceID
(
node
);
if
(
op_dev_id
!=
-
1
)
{
// This op only runs on one specific device.
CreateComputationalOp
(
&
result
,
node
.
get
()
,
op_dev_id
);
CreateComputationalOp
(
&
result
,
node
,
op_dev_id
);
for
(
ir
::
Node
*
n
:
node
->
outputs
)
{
var_name_on_devices_
.
emplace
(
n
->
Name
(),
op_dev_id
);
}
}
else
{
// This op runs on all devices, and its output may have parameter's
// gradients.
// TODO(paddle-dev): Why is so special about "read" op?
if
(
node
->
Op
()
->
Type
()
==
"read"
&&
strategy_
.
enable_data_balance_
)
{
node
->
Op
()
->
SetAttr
(
"throw_eof_exp"
,
false
);
CreateComputationalOps
(
&
result
,
node
.
get
(),
places_
.
size
());
// TODO(paddle-dev): builder shouldn't depend on the out logic of
// a specific op.
CreateComputationalOps
(
&
result
,
node
,
places_
.
size
());
const
auto
&
data_var_names
=
node
->
Op
()
->
Output
(
"Out"
);
InsertDataBalanceOp
(
&
result
,
data_var_names
);
}
else
{
CreateComputationalOps
(
&
result
,
node
.
get
()
,
places_
.
size
());
CreateComputationalOps
(
&
result
,
node
,
places_
.
size
());
}
if
(
!
is_forwarding
&&
places_
.
size
()
>
1
)
{
...
...
@@ -322,7 +369,6 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
}
}
}
/*
Dependency graph has been constructed. However, there are still data
hazards need to be handled.
...
...
@@ -333,6 +379,7 @@ std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
* Only variables should be the leaves of graph.
*/
AddOutputToLeafOps
(
&
result
);
PADDLE_ENFORCE
(
!
ir
::
HasCircle
(
result
));
return
graph
;
}
...
...
@@ -357,7 +404,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
#endif
}
void
MultiDevSSAGraphBuilder
::
CreateBroadcastOp
(
Graph
*
result
,
void
MultiDevSSAGraphBuilder
::
CreateBroadcastOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
p_name
,
size_t
src_dev_id
)
const
{
#ifdef PADDLE_WITH_CUDA
...
...
@@ -387,7 +434,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result,
}
}
void
MultiDevSSAGraphBuilder
::
CreateComputationalOp
(
Graph
*
result
,
void
MultiDevSSAGraphBuilder
::
CreateComputationalOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
int
dev_id
)
const
{
result
->
Get
<
GraphOps
>
(
"ops"
).
emplace_back
(
...
...
@@ -396,7 +443,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(Graph *result,
CreateOpHandleIOs
(
result
,
node
,
dev_id
);
}
void
MultiDevSSAGraphBuilder
::
InsertAllReduceOp
(
Graph
*
result
,
void
MultiDevSSAGraphBuilder
::
InsertAllReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
)
const
{
#ifdef PADDLE_WITH_CUDA
result
->
Get
<
GraphOps
>
(
"ops"
).
emplace_back
(
new
AllReduceOpHandle
(
...
...
@@ -426,7 +473,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result,
}
void
MultiDevSSAGraphBuilder
::
InsertDataBalanceOp
(
Graph
*
result
,
const
std
::
vector
<
std
::
string
>
&
datas
)
const
{
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
string
>
&
datas
)
const
{
#ifdef PADDLE_WITH_CUDA
result
->
Get
<
GraphOps
>
(
"ops"
).
emplace_back
(
new
DataBalanceOpHandle
(
result
->
CreateEmptyNode
(
"data_balance"
,
ir
::
Node
::
Type
::
kOperation
),
...
...
@@ -479,8 +526,8 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
PADDLE_ENFORCE_EQ
(
param_grad
.
size
(),
2U
);
int
dev_id
=
GetVarDeviceID
(
param_grad
[
1
]);
PADDLE_ENFORCE_NE
(
dev_id
,
-
1
,
"dev_id should not be -1.[%s, %s]"
,
node
->
Op
()
->
Type
(),
param_grad
[
0
]);
PADDLE_ENFORCE_NE
(
dev_id
,
-
1
,
"dev_id should not be -1.[%s, %s
, %s
]"
,
node
->
Op
()
->
Type
(),
param_grad
[
0
]
,
param_grad
[
1
]
);
return
dev_id
;
}
...
...
@@ -489,7 +536,7 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
return
got
==
var_name_on_devices_
.
end
()
?
-
1
:
got
->
second
;
}
void
MultiDevSSAGraphBuilder
::
CreateScaleLossGradOp
(
Graph
*
result
)
const
{
void
MultiDevSSAGraphBuilder
::
CreateScaleLossGradOp
(
ir
::
Graph
*
result
)
const
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
...
...
@@ -519,7 +566,7 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const {
}
}
void
MultiDevSSAGraphBuilder
::
CreateComputationalOps
(
Graph
*
result
,
void
MultiDevSSAGraphBuilder
::
CreateComputationalOps
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
size_t
num_places
)
const
{
for
(
size_t
scope_idx
=
0
;
scope_idx
<
num_places
;
++
scope_idx
)
{
...
...
@@ -531,7 +578,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(Graph *result,
}
}
VarHandle
*
MultiDevSSAGraphBuilder
::
CreateReduceOp
(
Graph
*
result
,
VarHandle
*
MultiDevSSAGraphBuilder
::
CreateReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
,
int
dst_dev_id
)
const
{
#ifdef PADDLE_WITH_CUDA
...
...
@@ -564,12 +611,11 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result,
// Find the first occurence of `prev_op_name` and make current `op` depend
// on it.
void
MultiDevSSAGraphBuilder
::
ConnectOp
(
Graph
*
result
,
OpHandleBase
*
op
,
void
MultiDevSSAGraphBuilder
::
ConnectOp
(
ir
::
Graph
*
result
,
OpHandleBase
*
op
,
const
std
::
string
&
prev_op_name
)
const
{
for
(
auto
&
prev_op
:
result
->
Get
<
GraphOps
>
(
"ops"
))
{
if
(
prev_op
->
Name
()
==
prev_op_name
)
{
auto
*
dep_var
=
new
DummyVarHandle
(
result
->
CreateEmptyNode
(
"dummy"
,
ir
::
Node
::
Type
::
kVariable
));
auto
*
dep_var
=
new
DummyVarHandle
(
result
->
CreateControlDepVar
());
prev_op
->
AddOutput
(
dep_var
);
result
->
Get
<
GraphDepVars
>
(
"dep_vars"
).
emplace
(
dep_var
);
op
->
AddInput
(
dep_var
);
...
...
@@ -577,7 +623,7 @@ void MultiDevSSAGraphBuilder::ConnectOp(Graph *result, OpHandleBase *op,
}
}
void
MultiDevSSAGraphBuilder
::
CreateDistTrainOp
(
Graph
*
result
,
void
MultiDevSSAGraphBuilder
::
CreateDistTrainOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
{
int
op_dev_id
=
-
1
;
std
::
vector
<
std
::
string
>
input_var_names
;
...
...
@@ -591,6 +637,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result,
if
(
node
->
Op
()
->
Type
()
==
"split_byref"
||
node
->
Op
()
->
Type
()
==
"split_selected_rows"
)
{
// TODO(paddle-dev): getting the first var is not safe.
op_dev_id
=
GetVarDeviceID
(
input_var_names
[
0
]);
if
(
strategy_
.
reduce_
==
BuildStrategy
::
ReduceStrategy
::
kAllReduce
)
{
op_dev_id
=
GetAppropriateDeviceID
(
input_var_names
);
...
...
@@ -624,10 +671,14 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result,
}
// Create RPC related op handles that connects its in ops and out ops.
void
MultiDevSSAGraphBuilder
::
CreateRPCOp
(
Graph
*
result
,
ir
::
Node
*
node
)
const
{
void
MultiDevSSAGraphBuilder
::
CreateRPCOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
{
int
op_dev_id
=
-
1
;
if
(
node
->
Op
()
->
Type
()
==
"send"
)
{
// TODO(paddle-dev): getting the first var is not safe.
op_dev_id
=
GetVarDeviceID
(
node
->
inputs
[
0
]
->
Name
());
PADDLE_ENFORCE
(
!
ir
::
IsControlDepVar
(
*
node
->
inputs
[
0
]),
"This hack no longer holds, please fix."
);
// the variable name which contains .block means it was splited by
// split_byref op
// so that we can balance the variable blocks to all the pserver
...
...
paddle/fluid/framework/details/multi_devices_graph_builder.h
浏览文件 @
8f18f4d1
...
...
@@ -46,11 +46,13 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
BuildStrategy
&
strategy
);
#endif
std
::
unique_ptr
<
Graph
>
Apply
(
std
::
unique_ptr
<
Graph
>
graph
)
const
override
;
std
::
unique_ptr
<
ir
::
Graph
>
Apply
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
int
GetVarDeviceID
(
const
std
::
string
&
varname
)
const
override
;
private:
void
CreateOpHandleIOs
(
Graph
*
result
,
ir
::
Node
*
node
,
size_t
device_id
)
const
;
void
CreateOpHandleIOs
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
size_t
device_id
)
const
;
private:
std
::
string
loss_var_name_
;
...
...
@@ -64,8 +66,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
bool
IsScaleLossOp
(
ir
::
Node
*
node
)
const
;
void
CreateRPCOp
(
Graph
*
result
,
ir
::
Node
*
node
)
const
;
void
CreateDistTrainOp
(
Graph
*
result
,
ir
::
Node
*
node
)
const
;
void
CreateRPCOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
;
void
CreateDistTrainOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
)
const
;
/**
* Is this operator as the end-point operator before/after send operator.
...
...
@@ -74,21 +76,22 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const
std
::
vector
<
std
::
string
>
&
recv_vars
)
const
;
std
::
vector
<
std
::
string
>
FindDistTrainSendVars
(
const
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>
>
&
nodes
)
const
;
const
std
::
vector
<
ir
::
Node
*
>
&
nodes
)
const
;
std
::
vector
<
std
::
string
>
FindDistTrainRecvVars
(
const
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>
>
&
nodes
)
const
;
const
std
::
vector
<
ir
::
Node
*
>
&
nodes
)
const
;
void
ConnectOp
(
Graph
*
result
,
OpHandleBase
*
op
,
void
ConnectOp
(
ir
::
Graph
*
result
,
OpHandleBase
*
op
,
const
std
::
string
&
prev_op_name
)
const
;
void
CreateComputationalOps
(
Graph
*
result
,
ir
::
Node
*
node
,
void
CreateComputationalOps
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
size_t
num_places
)
const
;
void
CreateScaleLossGradOp
(
Graph
*
result
)
const
;
VarHandle
*
CreateReduceOp
(
Graph
*
result
,
const
std
::
string
&
og
,
void
CreateScaleLossGradOp
(
ir
::
Graph
*
result
)
const
;
VarHandle
*
CreateReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
,
int
dst_dev_id
)
const
;
void
CreateComputationalOp
(
Graph
*
result
,
ir
::
Node
*
node
,
int
dev_id
)
const
;
void
CreateComputationalOp
(
ir
::
Graph
*
result
,
ir
::
Node
*
node
,
int
dev_id
)
const
;
bool
IsParameterGradientOnce
(
const
std
::
string
&
og
,
...
...
@@ -96,12 +99,12 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
int
GetOpDeviceID
(
ir
::
Node
*
node
)
const
;
void
InsertAllReduceOp
(
Graph
*
result
,
const
std
::
string
&
og
)
const
;
void
InsertAllReduceOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
og
)
const
;
void
InsertDataBalanceOp
(
Graph
*
result
,
void
InsertDataBalanceOp
(
ir
::
Graph
*
result
,
const
std
::
vector
<
std
::
string
>
&
datas
)
const
;
void
CreateBroadcastOp
(
Graph
*
result
,
const
std
::
string
&
p_name
,
void
CreateBroadcastOp
(
ir
::
Graph
*
result
,
const
std
::
string
&
p_name
,
size_t
src_dev_id
)
const
;
bool
IsSparseGradient
(
const
std
::
string
&
og
)
const
;
...
...
paddle/fluid/framework/details/rpc_op_handle.cc
浏览文件 @
8f18f4d1
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/rpc_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace
paddle
{
namespace
framework
{
...
...
@@ -33,7 +34,7 @@ void RPCOpHandle::RunImpl() {
for
(
auto
*
in
:
inputs_
)
{
auto
&
p
=
static_cast
<
VarHandle
*>
(
in
)
->
place_
;
// FIXME(Yancey1989): need a better solution instead of use DebugString()
if
(
i
n
->
DebugString
()
==
"dummy"
)
{
// HACK
if
(
i
r
::
IsControlDepVar
(
*
in
->
Node
())
)
{
// HACK
continue
;
}
if
(
in
->
GeneratedOp
())
{
...
...
paddle/fluid/framework/details/ssa_graph_builder.cc
浏览文件 @
8f18f4d1
...
...
@@ -17,7 +17,7 @@
namespace
paddle
{
namespace
framework
{
namespace
details
{
void
SSAGraphBuilder
::
PolishGraphToSupportDataHazards
(
Graph
*
graph
)
{
void
SSAGraphBuilder
::
PolishGraphToSupportDataHazards
(
ir
::
Graph
*
graph
)
{
for
(
auto
&
var_map
:
graph
->
Get
<
GraphVars
>
(
"vars"
))
{
for
(
auto
&
name_pair
:
var_map
)
{
if
(
name_pair
.
second
.
size
()
<=
1
)
{
...
...
@@ -36,9 +36,18 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) {
// Read Write is the same op.
continue
;
}
bool
has_dep
=
false
;
for
(
auto
*
r_out
:
read_op
->
Outputs
())
{
for
(
auto
*
w_in
:
write_op
->
Inputs
())
{
if
(
r_out
->
Node
()
==
w_in
->
Node
())
{
has_dep
=
true
;
break
;
}
}
}
if
(
has_dep
)
continue
;
auto
*
dep_var
=
new
DummyVarHandle
(
graph
->
CreateEmptyNode
(
"dummy"
,
ir
::
Node
::
Type
::
kVariable
));
auto
*
dep_var
=
new
DummyVarHandle
(
graph
->
CreateControlDepVar
());
read_op
->
AddOutput
(
dep_var
);
write_op
->
AddInput
(
dep_var
);
graph
->
Get
<
GraphDepVars
>
(
"dep_vars"
).
emplace
(
dep_var
);
...
...
@@ -49,7 +58,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) {
}
VarHandle
*
SSAGraphBuilder
::
CreateOrGetLatestVarHandle
(
Graph
*
graph
,
ir
::
Node
*
node
,
const
platform
::
Place
&
place
,
ir
::
Graph
*
graph
,
ir
::
Node
*
node
,
const
platform
::
Place
&
place
,
size_t
place_offset
)
{
auto
&
var_holders
=
graph
->
Get
<
GraphVars
>
(
"vars"
)[
place_offset
];
auto
&
var_holder
=
var_holders
[
node
->
Name
()];
...
...
@@ -70,7 +79,7 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
return
var
;
}
void
SSAGraphBuilder
::
CreateOpOutput
(
Graph
*
graph
,
OpHandleBase
*
op_handle
,
void
SSAGraphBuilder
::
CreateOpOutput
(
ir
::
Graph
*
graph
,
OpHandleBase
*
op_handle
,
ir
::
Node
*
new_node
,
const
platform
::
Place
&
place
,
size_t
place_offset
)
{
...
...
@@ -82,13 +91,12 @@ void SSAGraphBuilder::CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
op_handle
->
AddOutput
(
var
);
}
void
SSAGraphBuilder
::
AddOutputToLeafOps
(
Graph
*
graph
)
{
void
SSAGraphBuilder
::
AddOutputToLeafOps
(
ir
::
Graph
*
graph
)
{
for
(
auto
&
op
:
graph
->
Get
<
GraphOps
>
(
"ops"
))
{
if
(
!
op
->
Outputs
().
empty
())
{
continue
;
}
auto
*
dummy_leaf
=
new
DummyVarHandle
(
graph
->
CreateEmptyNode
(
"dummy"
,
ir
::
Node
::
Type
::
kVariable
));
auto
*
dummy_leaf
=
new
DummyVarHandle
(
graph
->
CreateControlDepVar
());
graph
->
Get
<
GraphDepVars
>
(
"dep_vars"
).
emplace
(
dummy_leaf
);
op
->
AddOutput
(
dummy_leaf
);
}
...
...
paddle/fluid/framework/details/ssa_graph_builder.h
浏览文件 @
8f18f4d1
...
...
@@ -57,26 +57,23 @@ class SSAGraphBuilder : public ir::Pass {
DISABLE_COPY_AND_ASSIGN
(
SSAGraphBuilder
);
protected:
/**
* We only handle write after read(WAR), since it should not have a write
* after write in program. If there are write after write operators, we need
* prune them.
*
* https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
/*
Dependency graph has been constructed. However, there are still data
hazards need to be handled.
*/
static
void
PolishGraphToSupportDataHazards
(
Graph
*
graph
);
static
void
PolishGraphToSupportDataHazards
(
ir
::
Graph
*
graph
);
static
VarHandle
*
CreateOrGetLatestVarHandle
(
Graph
*
graph
,
ir
::
Node
*
node
,
static
VarHandle
*
CreateOrGetLatestVarHandle
(
ir
::
Graph
*
graph
,
ir
::
Node
*
node
,
const
platform
::
Place
&
place
,
size_t
place_offset
);
// Add an output variable (each_var_name, place, place_offset) to op_handle,
// which belongs to graph
static
void
CreateOpOutput
(
Graph
*
graph
,
OpHandleBase
*
op_handle
,
static
void
CreateOpOutput
(
ir
::
Graph
*
graph
,
OpHandleBase
*
op_handle
,
ir
::
Node
*
new_node
,
const
platform
::
Place
&
place
,
size_t
place_offset
);
static
void
AddOutputToLeafOps
(
Graph
*
graph
);
static
void
AddOutputToLeafOps
(
ir
::
Graph
*
graph
);
};
}
// namespace details
}
// namespace framework
...
...
paddle/fluid/framework/details/ssa_graph_checker.cc
浏览文件 @
8f18f4d1
...
...
@@ -20,7 +20,7 @@ namespace paddle {
namespace
framework
{
namespace
details
{
bool
SSAGraghBuilderWithChecker
::
IsValidGraph
(
const
Graph
*
graph
)
const
{
bool
SSAGraghBuilderWithChecker
::
IsValidGraph
(
const
ir
::
Graph
*
graph
)
const
{
std
::
unordered_map
<
OpHandleBase
*
,
size_t
>
pending_ops
;
std
::
unordered_set
<
VarHandleBase
*>
pending_vars
;
std
::
unordered_set
<
VarHandleBase
*>
ready_vars
;
...
...
paddle/fluid/framework/details/ssa_graph_checker.h
浏览文件 @
8f18f4d1
...
...
@@ -28,7 +28,8 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
std
::
unique_ptr
<
SSAGraphBuilder
>&&
builder
)
:
builder_
(
std
::
move
(
builder
))
{}
std
::
unique_ptr
<
Graph
>
Apply
(
std
::
unique_ptr
<
Graph
>
graph
)
const
override
{
std
::
unique_ptr
<
ir
::
Graph
>
Apply
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
{
auto
new_graph
=
builder_
->
Apply
(
std
::
move
(
graph
));
PADDLE_ENFORCE
(
IsValidGraph
(
new_graph
.
get
()));
return
new_graph
;
...
...
@@ -38,7 +39,7 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
return
builder_
->
GetVarDeviceID
(
var_name
);
}
bool
IsValidGraph
(
const
Graph
*
graph
)
const
;
bool
IsValidGraph
(
const
ir
::
Graph
*
graph
)
const
;
private:
std
::
unique_ptr
<
SSAGraphBuilder
>
builder_
;
...
...
paddle/fluid/framework/details/ssa_graph_printer.cc
浏览文件 @
8f18f4d1
...
...
@@ -21,7 +21,7 @@ namespace framework {
namespace
details
{
template
<
typename
Callback
>
static
inline
void
IterAllVar
(
const
Graph
&
graph
,
Callback
callback
)
{
static
inline
void
IterAllVar
(
const
ir
::
Graph
&
graph
,
Callback
callback
)
{
for
(
auto
&
each
:
graph
.
Get
<
GraphVars
>
(
"vars"
))
{
for
(
auto
&
pair1
:
each
)
{
for
(
auto
&
pair2
:
pair1
.
second
)
{
...
...
@@ -35,7 +35,7 @@ static inline void IterAllVar(const Graph &graph, Callback callback) {
}
}
void
GraphvizSSAGraphPrinter
::
Print
(
const
Graph
&
graph
,
void
GraphvizSSAGraphPrinter
::
Print
(
const
ir
::
Graph
&
graph
,
std
::
ostream
&
sout
)
const
{
size_t
var_id
=
0
;
std
::
unordered_map
<
const
VarHandleBase
*
,
size_t
>
vars
;
...
...
paddle/fluid/framework/details/ssa_graph_printer.h
浏览文件 @
8f18f4d1
...
...
@@ -25,12 +25,12 @@ namespace details {
class
SSAGraphPrinter
{
public:
virtual
~
SSAGraphPrinter
()
{}
virtual
void
Print
(
const
Graph
&
graph
,
std
::
ostream
&
sout
)
const
=
0
;
virtual
void
Print
(
const
ir
::
Graph
&
graph
,
std
::
ostream
&
sout
)
const
=
0
;
};
class
GraphvizSSAGraphPrinter
:
public
SSAGraphPrinter
{
public:
void
Print
(
const
Graph
&
graph
,
std
::
ostream
&
sout
)
const
override
;
void
Print
(
const
ir
::
Graph
&
graph
,
std
::
ostream
&
sout
)
const
override
;
};
class
SSAGraghBuilderWithPrinter
:
public
SSAGraphBuilder
{
...
...
@@ -50,7 +50,8 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
stream_ptr_
(
std
::
move
(
sout
)),
stream_ref_
(
*
stream_ptr_
)
{}
std
::
unique_ptr
<
Graph
>
Apply
(
std
::
unique_ptr
<
Graph
>
graph
)
const
override
{
std
::
unique_ptr
<
ir
::
Graph
>
Apply
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
{
auto
new_graph
=
builder_
->
Apply
(
std
::
move
(
graph
));
printer_
->
Print
(
*
new_graph
,
stream_ref_
);
return
new_graph
;
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
8f18f4d1
...
...
@@ -21,7 +21,8 @@ namespace framework {
namespace
details
{
ThreadedSSAGraphExecutor
::
ThreadedSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
Graph
>
&&
graph
)
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
)
:
graph_
(
std
::
move
(
graph
)),
pool_
(
strategy
.
num_threads_
>=
2
?
new
::
ThreadPool
(
strategy
.
num_threads_
)
:
nullptr
),
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
浏览文件 @
8f18f4d1
...
...
@@ -40,7 +40,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor
(
const
ExecutionStrategy
&
strategy
,
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
std
::
unique_ptr
<
Graph
>
&&
graph
);
std
::
unique_ptr
<
ir
::
Graph
>
&&
graph
);
// Run a SSAGraph by a thread pool
// Use topological sort algorithm
...
...
@@ -53,7 +53,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details
::
OpHandleBase
*
op
);
private:
std
::
unique_ptr
<
Graph
>
graph_
;
std
::
unique_ptr
<
ir
::
Graph
>
graph_
;
std
::
unique_ptr
<::
ThreadPool
>
pool_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
...
...
paddle/fluid/framework/details/var_handle.cc
浏览文件 @
8f18f4d1
...
...
@@ -26,7 +26,7 @@ std::string VarHandle::DebugString() const {
return
ss
.
str
();
}
std
::
string
DummyVarHandle
::
DebugString
()
const
{
return
"dummy"
;
}
std
::
string
DummyVarHandle
::
DebugString
()
const
{
return
node_
->
Name
()
;
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/executor.cc
浏览文件 @
8f18f4d1
...
...
@@ -45,19 +45,13 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
Executor
::
Executor
(
const
platform
::
Place
&
place
)
:
place_
(
place
)
{}
void
Executor
::
Close
()
{
#ifdef PADDLE_WITH_DISTRIBUTE
void
Executor
::
BeginPass
()
{
::
paddle
::
operators
::
distributed
::
RPCClient
::
GetInstance
<
::
paddle
::
operators
::
distributed
::
GRPCClient
>
()
->
SendBeginPass
();
}
void
Executor
::
EndPass
()
{
::
paddle
::
operators
::
distributed
::
RPCClient
::
GetInstance
<
::
paddle
::
operators
::
distributed
::
GRPCClient
>
()
->
SendEndPass
();
}
->
SendComplete
();
#endif
}
void
InitializeVariable
(
Variable
*
var
,
proto
::
VarType
::
Type
var_type
)
{
if
(
var_type
==
proto
::
VarType
::
LOD_TENSOR
)
{
...
...
paddle/fluid/framework/executor.h
浏览文件 @
8f18f4d1
...
...
@@ -44,17 +44,11 @@ class Executor {
explicit
Executor
(
const
platform
::
Place
&
place
);
#ifdef PADDLE_WITH_DISTRIBUTE
/*
* Sending signal to pserver to mark current pass started.
* Close this Executor.
* Calling this method will send complete messages to all pserver instances.
*/
void
BeginPass
();
/*
* Sending signal to pserver to mark current pass finished.
*/
void
EndPass
();
#endif
void
Close
();
/* @Brief
* Runtime evaluation of the given ProgramDesc under certain Scope
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
8f18f4d1
cc_library
(
node SRCS node.cc DEPS proto_desc
)
cc_library
(
graph SRCS graph.cc DEPS node
)
cc_library
(
graph_helper SRCS graph_helper.cc DEPS graph
)
cc_library
(
pass SRCS pass.cc DEPS graph node
)
cc_test
(
graph_
test SRCS graph_test.cc DEPS graph proto_desc
op_registry
)
cc_test
(
graph_test SRCS graph_test.cc DEPS graph op_registry
)
cc_test
(
graph_
helper_test SRCS graph_helper_test.cc DEPS graph_helper
op_registry
)
paddle/fluid/framework/ir/graph.cc
浏览文件 @
8f18f4d1
...
...
@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// NOTE(paddle-dev): This graph contains circle.
Graph
::
Graph
(
const
ProgramDesc
&
program
)
:
program_
(
program
)
{
VLOG
(
3
)
<<
"block in program:"
<<
program_
.
Size
();
std
::
unordered_map
<
std
::
string
,
VarDesc
*>
all_vars
;
...
...
@@ -27,40 +31,87 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
all_vars
.
emplace
(
var
->
Name
(),
var
);
}
std
::
map
<
std
::
string
,
ir
::
Node
*
>
var_nodes
;
std
::
map
<
std
::
string
,
std
::
vector
<
ir
::
Node
*>
>
var_nodes
;
for
(
auto
*
op
:
program
.
Block
(
0
).
AllOps
())
{
ir
::
Node
*
node
=
CreateOpNode
(
op
);
// For input args, reuse the same var name if it was created before.
// Otherwise, create a new one.
for
(
auto
&
each_var_name
:
op
->
InputArgumentNames
())
{
ir
::
Node
*
var
=
nullptr
;
if
(
var_nodes
.
find
(
each_var_name
)
!=
var_nodes
.
end
())
{
var
=
var_nodes
.
at
(
each_var_name
);
var
=
var_nodes
.
at
(
each_var_name
)
.
back
()
;
}
else
if
(
all_vars
.
count
(
each_var_name
)
!=
0
)
{
var
=
CreateVarNode
(
all_vars
.
at
(
each_var_name
));
var_nodes
[
each_var_name
]
=
var
;
var_nodes
[
each_var_name
]
.
push_back
(
var
)
;
}
else
{
//
TODO(paddle-dev): Seems some assumption doesn't hold?
VLOG
(
3
)
<<
op
->
Type
()
<<
" input var not in all_var list: "
<<
each_var_name
;
//
Operation input var can be optional (dispensable). Which means
// the operation doesn't really need the var at runtime. In this
// case, the no-existed var is ready at the beginning.
var
=
CreateEmptyNode
(
each_var_name
,
ir
::
Node
::
Type
::
kVariable
);
var_nodes
[
each_var_name
]
=
var
;
var_nodes
[
each_var_name
]
.
push_back
(
var
)
;
}
node
->
inputs
.
push_back
(
var
);
var
->
outputs
.
push_back
(
node
);
}
// For output args, always create a new var.
for
(
auto
&
each_var_name
:
op
->
OutputArgumentNames
())
{
ir
::
Node
*
var
=
nullptr
;
if
(
var_nodes
.
find
(
each_var_name
)
!=
var_nodes
.
end
())
{
var
=
var_nodes
.
at
(
each_var_name
);
}
else
{
var
=
CreateVarNode
(
all_vars
.
at
(
each_var_name
));
var_nodes
[
each_var_name
]
=
var
;
}
ir
::
Node
*
var
=
CreateVarNode
(
all_vars
.
at
(
each_var_name
));
var_nodes
[
each_var_name
].
push_back
(
var
);
node
->
outputs
.
push_back
(
var
);
var
->
inputs
.
push_back
(
node
);
}
}
/**
* We only handle write after read(WAR), since it should not have a write
* after write in program. If there are write after write operators, we need
* prune them.
*
* https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
*/
for
(
auto
&
var
:
var_nodes
)
{
auto
&
versions
=
var
.
second
;
if
(
versions
.
size
()
<=
1
)
continue
;
auto
it_new
=
versions
.
rbegin
();
auto
it_old
=
versions
.
rbegin
();
++
it_old
;
for
(;
it_old
!=
versions
.
rend
();
it_new
=
it_old
,
++
it_old
)
{
ir
::
Node
*
write_op
=
(
*
it_new
)
->
inputs
.
empty
()
?
nullptr
:
(
*
it_new
)
->
inputs
[
0
];
const
auto
&
read_ops
=
(
*
it_old
)
->
outputs
;
for
(
auto
*
read_op
:
read_ops
)
{
// Manually add a dependency var from read_op to write_op;
if
(
read_op
==
write_op
)
{
// Read Write is the same op.
continue
;
}
// 2 ops might have been connected via other vars.
bool
has_dep
=
false
;
for
(
ir
::
Node
*
r_out
:
read_op
->
outputs
)
{
for
(
ir
::
Node
*
w_in
:
write_op
->
inputs
)
{
if
(
r_out
==
w_in
)
{
has_dep
=
true
;
break
;
}
}
}
if
(
has_dep
)
continue
;
ir
::
Node
*
dep_var
=
CreateControlDepVar
();
read_op
->
outputs
.
push_back
(
dep_var
);
dep_var
->
inputs
.
push_back
(
read_op
);
write_op
->
inputs
.
push_back
(
dep_var
);
dep_var
->
outputs
.
push_back
(
write_op
);
}
}
}
}
bool
IsControlDepVar
(
const
ir
::
Node
&
var
)
{
return
var
.
Name
().
find
(
ir
::
Node
::
kControlDepVarName
)
!=
std
::
string
::
npos
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph.h
浏览文件 @
8f18f4d1
...
...
@@ -26,13 +26,14 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
namespace
ir
{
class
Graph
{
public:
explicit
Graph
(
const
ProgramDesc
&
program
);
explicit
Graph
(
const
ProgramDesc
&
program
);
virtual
~
Graph
()
{
for
(
auto
&
attr
:
attrs_
)
{
for
(
auto
&
attr
:
attrs_
)
{
attr_dels_
[
attr
.
first
]();
}
attrs_
.
clear
();
...
...
@@ -40,12 +41,12 @@ class Graph {
}
template
<
typename
AttrType
>
AttrType
&
Get
(
const
std
::
string
&
attr_name
)
const
{
return
*
boost
::
any_cast
<
AttrType
*>
(
attrs_
.
at
(
attr_name
));
AttrType
&
Get
(
const
std
::
string
&
attr_name
)
const
{
return
*
boost
::
any_cast
<
AttrType
*>
(
attrs_
.
at
(
attr_name
));
}
template
<
typename
AttrType
>
void
Set
(
const
std
::
string
&
attr_name
,
AttrType
*
attr
)
{
void
Set
(
const
std
::
string
&
attr_name
,
AttrType
*
attr
)
{
PADDLE_ENFORCE
(
attrs_
.
count
(
attr_name
)
==
0
);
attrs_
[
attr_name
]
=
attr
;
attr_dels_
[
attr_name
]
=
[
attr
,
attr_name
]()
{
...
...
@@ -54,29 +55,70 @@ class Graph {
};
}
ir
::
Node
*
CreateVarNode
(
VarDesc
*
var_desc
)
{
nodes
.
emplace_back
(
new
ir
::
Node
(
var_desc
));
return
nodes
.
back
().
get
();
const
std
::
unordered_set
<
ir
::
Node
*>
&
Nodes
()
const
{
return
node_set_
;
}
// Create a normal variable with non-null VarDesc.
ir
::
Node
*
CreateVarNode
(
VarDesc
*
var_desc
)
{
return
AddNode
(
new
ir
::
Node
(
var_desc
));
}
// Create a normal runnable operator with OpDesc.
ir
::
Node
*
CreateOpNode
(
OpDesc
*
op_desc
)
{
return
AddNode
(
new
ir
::
Node
(
op_desc
));
}
ir
::
Node
*
CreateOpNode
(
OpDesc
*
op_desc
)
{
nodes
.
emplace_back
(
new
ir
::
Node
(
op_desc
));
return
nodes
.
back
().
get
();
// Create a control dependency var that connects 2 operations. The
// var doesn't hold any data. Other than that, it's no different from
// other var, considering dependency analysis.
ir
::
Node
*
CreateControlDepVar
()
{
// TODO(panyx0718): control var name should be really unique.
const
std
::
string
name
=
string
::
Sprintf
(
"%s@%llu"
,
ir
::
Node
::
kControlDepVarName
,
node_set_
.
size
());
return
AddNode
(
new
ir
::
Node
(
name
,
ir
::
Node
::
Type
::
kVariable
));
}
ir
::
Node
*
CreateEmptyNode
(
const
std
::
string
&
name
,
ir
::
Node
::
Type
type
)
{
nodes
.
emplace_back
(
new
ir
::
Node
(
name
,
type
));
return
nodes
.
back
().
get
();
// A more free style way of creating a graph node. Mostly use for test
// or "copy" from another node. Avoid using it if possible.
ir
::
Node
*
CreateEmptyNode
(
const
std
::
string
&
name
,
ir
::
Node
::
Type
type
)
{
return
AddNode
(
new
ir
::
Node
(
name
,
type
));
}
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
nodes
;
// Clear all node information of the graph and return the ownership of the
// nodes.
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
ReleaseNodes
()
{
std
::
vector
<
std
::
unique_ptr
<
ir
::
Node
>>
ret
;
for
(
auto
&
n
:
nodes_
)
{
ret
.
emplace_back
(
n
.
second
.
release
());
}
nodes_
.
clear
();
node_set_
.
clear
();
return
ret
;
}
private:
// This method takes ownership of `node`.
ir
::
Node
*
AddNode
(
ir
::
Node
*
node
)
{
PADDLE_ENFORCE
(
node_set_
.
find
(
node
)
==
node_set_
.
end
());
nodes_
[
node
].
reset
(
node
);
node_set_
.
insert
(
node
);
return
node
;
}
void
RemoveNode
(
ir
::
Node
*
node
)
{
PADDLE_ENFORCE
(
node_set_
.
find
(
node
)
!=
node_set_
.
end
());
node_set_
.
erase
(
node
);
nodes_
.
erase
(
node
);
}
// NOTE: program_ shouldn't be exposed to user.
const
ProgramDesc
&
program_
;
const
ProgramDesc
&
program_
;
std
::
map
<
std
::
string
,
boost
::
any
>
attrs_
;
std
::
map
<
std
::
string
,
std
::
function
<
void
(
void
)
>>
attr_dels_
;
std
::
map
<
ir
::
Node
*
,
std
::
unique_ptr
<
ir
::
Node
>>
nodes_
;
std
::
unordered_set
<
ir
::
Node
*>
node_set_
;
};
bool
IsControlDepVar
(
const
ir
::
Node
&
var
);
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_helper.cc
0 → 100644
浏览文件 @
8f18f4d1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
namespace
{
void
SortHelper
(
const
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
&
adj_list
,
ir
::
Node
*
node
,
std
::
unordered_set
<
ir
::
Node
*>
*
visited
,
std
::
vector
<
ir
::
Node
*>
*
ret
)
{
visited
->
insert
(
node
);
for
(
auto
adj
:
adj_list
.
at
(
node
))
{
if
(
visited
->
find
(
adj
)
==
visited
->
end
())
{
SortHelper
(
adj_list
,
adj
,
visited
,
ret
);
}
}
VLOG
(
3
)
<<
"topology sort insert: "
<<
node
->
Name
()
<<
reinterpret_cast
<
void
*>
(
node
)
<<
" input "
<<
node
->
inputs
.
size
();
ret
->
push_back
(
node
);
}
bool
HasCircleHelper
(
ir
::
Node
*
node
,
const
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
&
adj_list
,
std
::
unordered_set
<
ir
::
Node
*>
*
visited
,
std
::
unordered_set
<
ir
::
Node
*>
*
in_trace
)
{
if
(
visited
->
find
(
node
)
==
visited
->
end
())
{
visited
->
insert
(
node
);
in_trace
->
insert
(
node
);
for
(
ir
::
Node
*
in
:
adj_list
.
at
(
node
))
{
if
(
visited
->
find
(
in
)
==
visited
->
end
()
&&
HasCircleHelper
(
in
,
adj_list
,
visited
,
in_trace
))
{
return
true
;
}
else
if
(
in_trace
->
find
(
in
)
!=
in_trace
->
end
())
{
return
true
;
}
}
}
in_trace
->
erase
(
node
);
return
false
;
}
bool
HasCircleInternal
(
const
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
&
adj_list
)
{
std
::
unordered_set
<
ir
::
Node
*>
visited
;
std
::
unordered_set
<
ir
::
Node
*>
in_trace
;
for
(
auto
&
adj
:
adj_list
)
{
if
(
HasCircleHelper
(
adj
.
first
,
adj_list
,
&
visited
,
&
in_trace
))
{
return
true
;
}
}
return
false
;
}
}
// namespace
bool
HasCircle
(
const
Graph
&
graph
)
{
return
HasCircleInternal
(
BuildOperationAdjList
(
graph
));
}
std
::
vector
<
ir
::
Node
*>
TopologySortOperations
(
const
Graph
&
graph
)
{
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
adj_list
=
BuildOperationAdjList
(
graph
);
PADDLE_ENFORCE
(
!
HasCircleInternal
(
adj_list
));
std
::
unordered_set
<
ir
::
Node
*>
visited
;
std
::
vector
<
ir
::
Node
*>
ret
;
for
(
auto
adj
:
adj_list
)
{
if
(
visited
.
find
(
adj
.
first
)
==
visited
.
end
())
{
SortHelper
(
adj_list
,
adj
.
first
,
&
visited
,
&
ret
);
}
}
return
ret
;
}
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
BuildOperationAdjList
(
const
Graph
&
graph
)
{
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
adj_list
;
for
(
auto
&
n
:
graph
.
Nodes
())
{
if
(
n
->
NodeType
()
!=
ir
::
Node
::
Type
::
kOperation
)
continue
;
if
(
adj_list
.
find
(
n
)
==
adj_list
.
end
())
{
adj_list
[
n
]
=
std
::
unordered_set
<
ir
::
Node
*>
();
}
for
(
auto
&
var
:
n
->
inputs
)
{
for
(
auto
&
adj_n
:
var
->
inputs
)
{
PADDLE_ENFORCE
(
adj_n
->
NodeType
()
==
ir
::
Node
::
Type
::
kOperation
);
adj_list
[
n
].
insert
(
adj_n
);
VLOG
(
3
)
<<
"adj "
<<
adj_n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
adj_n
)
<<
" -> "
<<
n
->
Name
()
<<
reinterpret_cast
<
void
*>
(
n
)
<<
" via "
<<
var
->
Name
()
<<
reinterpret_cast
<
void
*>
(
var
);
}
}
}
return
adj_list
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_helper.h
0 → 100644
浏览文件 @
8f18f4d1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Test if the graph contains circle.
bool
HasCircle
(
const
Graph
&
graph
);
// Topology Sort the operations in the graph from inputs to outputs.
// `graph` cannot contain circle.
std
::
vector
<
ir
::
Node
*>
TopologySortOperations
(
const
Graph
&
graph
);
// Build an adjacency list of operations for the `graph`.
std
::
map
<
ir
::
Node
*
,
std
::
unordered_set
<
ir
::
Node
*>>
BuildOperationAdjList
(
const
Graph
&
graph
);
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_helper_test.cc
0 → 100644
浏览文件 @
8f18f4d1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/program_desc.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
void
BuildCircleGraph
(
Graph
*
g
)
{
ir
::
Node
*
o1
=
g
->
CreateEmptyNode
(
"op1"
,
Node
::
Type
::
kOperation
);
ir
::
Node
*
v1
=
g
->
CreateEmptyNode
(
"var1"
,
Node
::
Type
::
kVariable
);
o1
->
outputs
.
push_back
(
v1
);
o1
->
inputs
.
push_back
(
v1
);
v1
->
inputs
.
push_back
(
o1
);
v1
->
outputs
.
push_back
(
o1
);
}
void
BuildCircleGraph2
(
Graph
*
g
)
{
ir
::
Node
*
o1
=
g
->
CreateEmptyNode
(
"op1"
,
Node
::
Type
::
kOperation
);
ir
::
Node
*
o2
=
g
->
CreateEmptyNode
(
"op2"
,
Node
::
Type
::
kOperation
);
ir
::
Node
*
v1
=
g
->
CreateEmptyNode
(
"var1"
,
Node
::
Type
::
kVariable
);
ir
::
Node
*
v2
=
g
->
CreateEmptyNode
(
"var2"
,
Node
::
Type
::
kVariable
);
o1
->
outputs
.
push_back
(
v1
);
o2
->
inputs
.
push_back
(
v1
);
v1
->
inputs
.
push_back
(
o1
);
v1
->
outputs
.
push_back
(
o2
);
o2
->
outputs
.
push_back
(
v2
);
o1
->
inputs
.
push_back
(
v2
);
v2
->
inputs
.
push_back
(
o2
);
v2
->
outputs
.
push_back
(
o1
);
}
void
BuildNoCircleGraph
(
Graph
*
g
)
{
ir
::
Node
*
o1
=
g
->
CreateEmptyNode
(
"op1"
,
Node
::
Type
::
kOperation
);
ir
::
Node
*
o2
=
g
->
CreateEmptyNode
(
"op2"
,
Node
::
Type
::
kOperation
);
ir
::
Node
*
o3
=
g
->
CreateEmptyNode
(
"op3"
,
Node
::
Type
::
kOperation
);
ir
::
Node
*
o4
=
g
->
CreateEmptyNode
(
"op4"
,
Node
::
Type
::
kOperation
);
ir
::
Node
*
o5
=
g
->
CreateEmptyNode
(
"op5"
,
Node
::
Type
::
kOperation
);
ir
::
Node
*
v1
=
g
->
CreateEmptyNode
(
"var1"
,
Node
::
Type
::
kVariable
);
ir
::
Node
*
v2
=
g
->
CreateEmptyNode
(
"var2"
,
Node
::
Type
::
kVariable
);
ir
::
Node
*
v3
=
g
->
CreateEmptyNode
(
"var3"
,
Node
::
Type
::
kVariable
);
ir
::
Node
*
v4
=
g
->
CreateEmptyNode
(
"var4"
,
Node
::
Type
::
kVariable
);
// o1->v1->o2
o1
->
outputs
.
push_back
(
v1
);
o2
->
inputs
.
push_back
(
v1
);
v1
->
inputs
.
push_back
(
o1
);
v1
->
outputs
.
push_back
(
o2
);
// o2->v2->o3
// o2->v2->o4
o2
->
outputs
.
push_back
(
v2
);
o3
->
inputs
.
push_back
(
v2
);
o4
->
inputs
.
push_back
(
v2
);
v2
->
inputs
.
push_back
(
o2
);
v2
->
outputs
.
push_back
(
o3
);
v2
->
outputs
.
push_back
(
o4
);
// o2->v3->o5
o2
->
outputs
.
push_back
(
v3
);
o5
->
inputs
.
push_back
(
v3
);
v3
->
inputs
.
push_back
(
o2
);
v3
->
outputs
.
push_back
(
o5
);
// o3-v4->o5
o3
->
outputs
.
push_back
(
v4
);
o5
->
inputs
.
push_back
(
v4
);
v4
->
inputs
.
push_back
(
o3
);
v4
->
outputs
.
push_back
(
o5
);
}
TEST
(
GraphHelperTest
,
Basic
)
{
ProgramDesc
prog
;
Graph
g
(
prog
);
BuildCircleGraph
(
&
g
);
ASSERT_TRUE
(
HasCircle
(
g
));
Graph
g2
(
prog
);
BuildCircleGraph2
(
&
g2
);
ASSERT_TRUE
(
HasCircle
(
g2
));
auto
adj_list
=
BuildOperationAdjList
(
g2
);
for
(
auto
&
adj
:
adj_list
)
{
auto
&
adj_set
=
adj
.
second
;
if
(
adj
.
first
->
Name
()
==
"op1"
)
{
ASSERT_EQ
((
*
adj_set
.
begin
())
->
Name
(),
"op2"
);
}
else
if
(
adj
.
first
->
Name
()
==
"op2"
)
{
ASSERT_EQ
((
*
adj_set
.
begin
())
->
Name
(),
"op1"
);
}
else
{
ASSERT_TRUE
(
false
);
}
}
Graph
g3
(
prog
);
BuildNoCircleGraph
(
&
g3
);
ASSERT_FALSE
(
HasCircle
(
g3
));
auto
sorted
=
TopologySortOperations
(
g3
);
std
::
map
<
std
::
string
,
size_t
>
node_map
;
for
(
size_t
i
=
0
;
i
<
sorted
.
size
();
++
i
)
{
node_map
[
sorted
[
i
]
->
Name
()]
=
i
;
}
ASSERT_EQ
(
node_map
.
at
(
"op1"
),
0
);
ASSERT_EQ
(
node_map
.
at
(
"op2"
),
1
);
ASSERT_TRUE
(
node_map
.
at
(
"op3"
)
<
node_map
.
at
(
"op5"
));
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_test.cc
浏览文件 @
8f18f4d1
...
...
@@ -76,6 +76,7 @@ TEST(GraphTest, Basic) {
op
->
SetType
(
"sum"
);
op
->
SetInput
(
"X"
,
{
"test_a"
,
"test_b"
,
"test_c"
});
op
->
SetOutput
(
"Out"
,
{
"test_out"
});
op
->
SetAttr
(
"op_role"
,
1
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test_a"
)
->
SetType
(
proto
::
VarType
::
SELECTED_ROWS
);
prog
.
MutableBlock
(
0
)
->
Var
(
"test_b"
)
->
SetType
(
proto
::
VarType
::
SELECTED_ROWS
);
...
...
@@ -92,21 +93,22 @@ TEST(GraphTest, Basic) {
ASSERT_EQ
(
proto
::
VarType
::
LOD_TENSOR
,
prog
.
MutableBlock
(
0
)
->
Var
(
"test_out"
)
->
GetType
());
std
::
unique_ptr
<
Graph
>
g
(
new
Graph
(
prog
));
ASSERT_EQ
(
g
->
nodes
[
0
]
->
Name
(),
"sum"
);
ASSERT_EQ
(
g
->
nodes
[
0
]
->
inputs
[
0
]
->
Name
(),
"test_a"
);
ASSERT_EQ
(
g
->
nodes
[
0
]
->
inputs
[
1
]
->
Name
(),
"test_b"
);
ASSERT_EQ
(
g
->
nodes
[
0
]
->
inputs
[
2
]
->
Name
(),
"test_c"
);
ASSERT_EQ
(
g
->
nodes
[
0
]
->
outputs
[
0
]
->
Name
(),
"test_out"
);
ASSERT_EQ
(
g
->
nodes
[
1
]
->
Name
(),
"test_a"
);
ASSERT_EQ
(
g
->
nodes
[
1
]
->
outputs
[
0
]
->
Name
(),
"sum"
);
ASSERT_EQ
(
g
->
nodes
[
2
]
->
Name
(),
"test_b"
);
ASSERT_EQ
(
g
->
nodes
[
2
]
->
outputs
[
0
]
->
Name
(),
"sum"
);
ASSERT_EQ
(
g
->
nodes
[
3
]
->
Name
(),
"test_c"
);
ASSERT_EQ
(
g
->
nodes
[
3
]
->
outputs
[
0
]
->
Name
(),
"sum"
);
ASSERT_EQ
(
g
->
nodes
[
4
]
->
Name
(),
"test_out"
);
ASSERT_EQ
(
g
->
nodes
[
4
]
->
inputs
[
0
]
->
Name
(),
"sum"
);
ASSERT_EQ
(
g
->
nodes
.
size
(),
5
);
std
::
unique_ptr
<
ir
::
Graph
>
g
(
new
ir
::
Graph
(
prog
));
std
::
vector
<
ir
::
Node
*>
nodes
(
g
->
Nodes
().
begin
(),
g
->
Nodes
().
end
());
for
(
ir
::
Node
*
n
:
nodes
)
{
if
(
n
->
Name
()
==
"sum"
)
{
ASSERT_EQ
(
n
->
inputs
.
size
(),
3
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
1
);
}
else
if
(
n
->
Name
()
==
"test_a"
||
n
->
Name
()
==
"test_b"
||
n
->
Name
()
==
"test_c"
)
{
ASSERT_EQ
(
n
->
inputs
.
size
(),
0
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
1
);
}
else
if
(
n
->
Name
()
==
"test_out"
)
{
ASSERT_EQ
(
n
->
inputs
.
size
(),
1
);
ASSERT_EQ
(
n
->
outputs
.
size
(),
0
);
}
}
ASSERT_EQ
(
nodes
.
size
(),
5
);
}
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/node.cc
浏览文件 @
8f18f4d1
...
...
@@ -15,5 +15,9 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/node.h"
namespace
paddle
{
namespace
framework
{}
// namespace framework
namespace
framework
{
namespace
ir
{
const
char
Node
::
kControlDepVarName
[]
=
"__control_var"
;
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/node.h
浏览文件 @
8f18f4d1
...
...
@@ -27,6 +27,8 @@ namespace ir {
class
Node
{
public:
enum
class
Type
{
kOperation
,
kVariable
};
static
const
char
kControlDepVarName
[];
explicit
Node
(
const
std
::
string
&
name
,
Type
type
)
:
name_
(
name
),
var_desc_
(
nullptr
),
op_desc_
(
nullptr
),
type_
(
type
)
{}
...
...
@@ -50,6 +52,7 @@ class Node {
PADDLE_ENFORCE
(
type_
==
Type
::
kVariable
);
return
var_desc_
;
}
OpDesc
*
Op
()
{
PADDLE_ENFORCE
(
type_
==
Type
::
kOperation
);
return
op_desc_
;
...
...
paddle/fluid/framework/mixed_vector.h
浏览文件 @
8f18f4d1
...
...
@@ -16,6 +16,7 @@
#include <algorithm>
#include <initializer_list>
#include <memory>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
...
...
@@ -386,13 +387,14 @@ template <typename T>
class
CPUVector
:
public
std
::
vector
<
T
,
std
::
allocator
<
T
>>
{
public:
CPUVector
()
:
std
::
vector
<
T
>
()
{}
CPUVector
(
size_t
count
,
const
T
&
value
=
T
())
CPUVector
(
size_t
count
,
const
T
&
value
=
T
())
// NOLINT
:
std
::
vector
<
T
>
(
count
,
value
)
{}
CPUVector
(
std
::
initializer_list
<
T
>
init
)
:
std
::
vector
<
T
>
(
init
)
{}
CPUVector
(
const
std
::
vector
<
T
>
&
other
)
:
std
::
vector
<
T
>
(
other
)
{}
explicit
CPUVector
(
const
CPUVector
<
T
>
&
other
)
:
std
::
vector
<
T
>
(
other
)
{}
CPUVector
(
const
std
::
vector
<
T
>
&
other
)
:
std
::
vector
<
T
>
(
other
)
{}
// NOLINT
CPUVector
(
const
CPUVector
<
T
>
&
other
)
:
std
::
vector
<
T
>
(
other
)
{}
CPUVector
(
CPUVector
<
T
>
&&
other
)
:
std
::
vector
<
T
>
(
std
::
move
(
other
))
{}
CPUVector
(
std
::
vector
<
T
>
&&
other
)
:
std
::
vector
<
T
>
(
std
::
move
(
other
))
{}
CPUVector
(
std
::
vector
<
T
>
&&
other
)
// NOLINT
:
std
::
vector
<
T
>
(
std
::
move
(
other
))
{}
CPUVector
&
operator
=
(
const
CPUVector
&
other
)
{
this
->
assign
(
other
.
begin
(),
other
.
end
());
return
*
this
;
...
...
@@ -410,8 +412,6 @@ class CPUVector : public std::vector<T, std::allocator<T>> {
return
os
;
}
void
resize
(
size_t
size
)
{
this
->
resize
(
size
);
}
T
&
operator
[](
size_t
id
)
{
return
this
->
at
(
id
);
}
const
T
&
operator
[](
size_t
id
)
const
{
return
this
->
at
(
id
);
}
...
...
paddle/fluid/framework/mixed_vector_test.cc
0 → 100644
浏览文件 @
8f18f4d1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/mixed_vector.h"
template
<
typename
T
>
using
vec
=
paddle
::
framework
::
Vector
<
T
>
;
TEST
(
mixed_vector
,
CPU_VECTOR
)
{
vec
<
int
>
tmp
;
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
tmp
.
push_back
(
i
);
}
ASSERT_EQ
(
tmp
.
size
(),
10UL
);
vec
<
int
>
tmp2
;
tmp2
=
tmp
;
ASSERT_EQ
(
tmp2
.
size
(),
10UL
);
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
ASSERT_EQ
(
tmp2
[
i
],
i
);
ASSERT_EQ
(
tmp2
[
i
],
tmp
[
i
]);
}
int
cnt
=
0
;
for
(
auto
&
t
:
tmp2
)
{
ASSERT_EQ
(
t
,
cnt
);
++
cnt
;
}
}
TEST
(
mixed_vector
,
InitWithCount
)
{
paddle
::
framework
::
Vector
<
int
>
vec
(
10
,
10
);
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
ASSERT_EQ
(
vec
[
i
],
10
);
}
}
TEST
(
mixed_vector
,
ForEach
)
{
vec
<
int
>
tmp
;
for
(
auto
&
v
:
tmp
)
{
VLOG
(
3
)
<<
v
;
}
}
TEST
(
mixed_vector
,
Reserve
)
{
paddle
::
framework
::
Vector
<
int
>
vec
;
vec
.
reserve
(
1
);
vec
.
push_back
(
0
);
vec
.
push_back
(
0
);
vec
.
push_back
(
0
);
}
TEST
(
mixed_vector
,
Resize
)
{
paddle
::
framework
::
Vector
<
int
>
vec
;
vec
.
resize
(
1
);
vec
.
push_back
(
0
);
vec
.
push_back
(
0
);
vec
.
push_back
(
0
);
}
paddle/fluid/framework/mixed_vector_test.cu
浏览文件 @
8f18f4d1
...
...
@@ -11,7 +11,9 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cuda_runtime.h>
#include <memory>
#include "glog/logging.h"
#include "gtest/gtest.h"
...
...
@@ -21,26 +23,6 @@
template
<
typename
T
>
using
vec
=
paddle
::
framework
::
Vector
<
T
>
;
TEST
(
mixed_vector
,
CPU_VECTOR
)
{
vec
<
int
>
tmp
;
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
tmp
.
push_back
(
i
);
}
ASSERT_EQ
(
tmp
.
size
(),
10UL
);
vec
<
int
>
tmp2
;
tmp2
=
tmp
;
ASSERT_EQ
(
tmp2
.
size
(),
10UL
);
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
ASSERT_EQ
(
tmp2
[
i
],
i
);
ASSERT_EQ
(
tmp2
[
i
],
tmp
[
i
]);
}
int
cnt
=
0
;
for
(
auto
&
t
:
tmp2
)
{
ASSERT_EQ
(
t
,
cnt
);
++
cnt
;
}
}
static
__global__
void
multiply_10
(
int
*
ptr
)
{
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
ptr
[
i
]
*=
10
;
...
...
@@ -91,24 +73,3 @@ TEST(mixed_vector, MultiGPU) {
ASSERT_EQ
(
tmp
[
i
],
i
*
100
);
}
}
TEST
(
mixed_vector
,
InitWithCount
)
{
paddle
::
framework
::
Vector
<
int
>
vec
(
10
,
10
);
for
(
int
i
=
0
;
i
<
10
;
++
i
)
{
ASSERT_EQ
(
vec
[
i
],
10
);
}
}
TEST
(
mixed_vector
,
ForEach
)
{
vec
<
int
>
tmp
;
for
(
auto
&
v
:
tmp
)
{
}
}
TEST
(
mixed_vector
,
Reserve
)
{
paddle
::
framework
::
Vector
<
int
>
vec
;
vec
.
reserve
(
1
);
vec
.
push_back
(
0
);
vec
.
push_back
(
0
);
vec
.
push_back
(
0
);
}
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
8f18f4d1
...
...
@@ -132,7 +132,7 @@ ParallelExecutor::ParallelExecutor(
#endif
}
builder_
=
builder_factory
.
Create
();
std
::
unique_ptr
<
Graph
>
graph
(
new
Graph
(
main_program
));
std
::
unique_ptr
<
ir
::
Graph
>
graph
(
new
ir
::
Graph
(
main_program
));
graph
=
builder_
->
Apply
(
std
::
move
(
graph
));
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
places
,
std
::
move
(
graph
)));
...
...
paddle/fluid/inference/api/api_impl.cc
浏览文件 @
8f18f4d1
...
...
@@ -137,6 +137,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
executor_
->
RunPreparedContext
(
ctx_
.
get
(),
sub_scope_
!=
nullptr
?
sub_scope_
:
scope_
.
get
(),
&
feed_targets
,
&
fetch_targets
,
false
,
/* don't create local scope each time*/
false
/* don't create variable eatch time */
);
VLOG
(
4
)
<<
"Finish prepared context"
;
if
(
!
GetFetch
(
fetchs
,
output_data
))
{
...
...
paddle/fluid/inference/api/demo_ci/clean.sh
0 → 100755
浏览文件 @
8f18f4d1
set
-x
cd
`
dirname
$0
`
rm
-rf
build/ data/
set
+x
paddle/fluid/inference/tensorrt/convert/fc_op.cc
浏览文件 @
8f18f4d1
...
...
@@ -32,11 +32,11 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
for
(
int
h
=
0
;
h
<
shape
.
h
();
++
h
)
{
for
(
int
w
=
0
;
w
<
shape
.
w
();
++
w
)
{
odata
[
h
*
ostrides
.
h
()
+
w
*
ostrides
.
w
()]
=
idata
[
h
*
ostrides
.
h
()
+
w
*
o
strides
.
w
()];
idata
[
h
*
istrides
.
h
()
+
w
*
i
strides
.
w
()];
}
}
}
// indata c * k
// Reorder the data layout from CK to KC.
void
ReorderCKtoKC
(
TensorRTEngine
::
Weight
&
iweights
,
TensorRTEngine
::
Weight
*
oweights
)
{
...
...
@@ -79,9 +79,8 @@ class FcOpConverter : public OpConverter {
framework
::
LoDTensor
tmp
;
tmp
.
Resize
(
Y_t
->
dims
());
memcpy
(
tmp
.
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
Y_t
->
data
<
float
>
(),
Y_t
->
dims
()[
0
]
*
Y_t
->
dims
()[
1
]);
memcpy
(
tmp
.
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
weight_data
,
Y_t
->
dims
()[
0
]
*
Y_t
->
dims
()[
1
]
*
sizeof
(
float
));
TensorRTEngine
::
Weight
weight
{
nvinfer1
::
DataType
::
kFLOAT
,
static_cast
<
void
*>
(
weight_data
),
Y_t
->
memory_size
()
/
sizeof
(
float
)};
...
...
@@ -93,7 +92,7 @@ class FcOpConverter : public OpConverter {
// The data layout of TRT FC layer's weight is different from fluid's FC,
// need to reorder the elements.
ReorderCKtoKC
(
tmp_weight
,
&
weight
);
ReorderCKtoKC
(
weight
,
&
tmp_
weight
);
// Currently, the framework can only handle one fluid op -> one TRT layer,
// but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
...
...
@@ -103,7 +102,7 @@ class FcOpConverter : public OpConverter {
auto
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
FullyConnected
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
X
),
n_output
,
weight
.
get
(),
bias
.
get
());
n_output
,
tmp_
weight
.
get
(),
bias
.
get
());
auto
output_name
=
op_desc
.
Output
(
"Out"
).
front
();
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
...
...
@@ -118,4 +117,3 @@ class FcOpConverter : public OpConverter {
}
// namespace paddle
REGISTER_TRT_OP_CONVERTER
(
fc
,
FcOpConverter
);
USE_OP
(
mul
);
paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
浏览文件 @
8f18f4d1
...
...
@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) {
validator
.
SetOp
(
*
desc
.
Proto
());
LOG
(
INFO
)
<<
"execute"
;
validator
.
Execute
(
1
0
);
validator
.
Execute
(
1
);
}
}
// namespace tensorrt
...
...
paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
浏览文件 @
8f18f4d1
...
...
@@ -23,11 +23,11 @@ namespace tensorrt {
TEST
(
fc_op
,
test
)
{
std
::
unordered_set
<
std
::
string
>
parameters
({
"mul-Y"
});
framework
::
Scope
scope
;
TRTConvertValidation
validator
(
2
0
,
parameters
,
scope
,
1000
);
validator
.
Decl
InputVar
(
"mul-X"
,
nvinfer1
::
Dims4
(
8
,
3
,
1
,
1
));
validator
.
DeclParamVar
(
"mul-Y"
,
nvinfer1
::
Dims2
(
3
,
2
));
validator
.
DeclOutputVar
(
"mul-Out"
,
nvinfer1
::
Dims2
(
8
,
2
));
TRTConvertValidation
validator
(
1
0
,
parameters
,
scope
,
1000
);
validator
.
DeclInputVar
(
"mul-X"
,
nvinfer1
::
Dims4
(
1
,
10
,
1
,
1
));
validator
.
Decl
ParamVar
(
"mul-Y"
,
nvinfer1
::
Dims2
(
10
,
2
));
// validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8
, 2));
validator
.
DeclOutputVar
(
"mul-Out"
,
nvinfer1
::
Dims2
(
1
,
2
));
// Prepare Op description
framework
::
OpDesc
desc
;
...
...
@@ -38,9 +38,10 @@ TEST(fc_op, test) {
validator
.
SetOp
(
*
desc
.
Proto
());
validator
.
Execute
(
1
0
);
validator
.
Execute
(
1
);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
mul
);
paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
浏览文件 @
8f18f4d1
...
...
@@ -39,7 +39,7 @@ TEST(MulOpConverter, main) {
validator
.
SetOp
(
*
desc
.
Proto
());
LOG
(
INFO
)
<<
"execute"
;
validator
.
Execute
(
1
0
);
validator
.
Execute
(
1
);
}
}
// namespace tensorrt
...
...
paddle/fluid/inference/tensorrt/convert/ut_helper.h
浏览文件 @
8f18f4d1
...
...
@@ -39,7 +39,7 @@ namespace tensorrt {
float
random
(
float
low
,
float
high
)
{
static
std
::
random_device
rd
;
static
std
::
mt19937
mt
(
rd
());
std
::
uniform_real_distribution
<
double
>
dist
(
1.0
,
10.0
);
std
::
uniform_real_distribution
<
double
>
dist
(
low
,
high
);
return
dist
(
mt
);
}
...
...
@@ -49,6 +49,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
size_t
num_elements
=
analysis
::
AccuDims
(
dims
,
dims
.
size
());
PADDLE_ENFORCE_GT
(
num_elements
,
0
);
auto
*
data
=
tensor
->
mutable_data
<
float
>
(
place
);
for
(
size_t
i
=
0
;
i
<
num_elements
;
i
++
)
{
*
(
data
+
i
)
=
random
(
0.
,
1.
);
}
...
...
@@ -68,7 +69,7 @@ class TRTConvertValidation {
int
workspace_size
=
1
<<
10
)
:
parameters_
(
parameters
),
scope_
(
scope
)
{
// create engine.
engine_
.
reset
(
new
TensorRTEngine
(
10
,
1
<<
10
,
&
stream_
));
engine_
.
reset
(
new
TensorRTEngine
(
batch_size
,
workspace_size
,
&
stream_
));
engine_
->
InitNetwork
();
PADDLE_ENFORCE_EQ
(
cudaStreamCreate
(
&
stream_
),
0
);
...
...
@@ -138,12 +139,11 @@ class TRTConvertValidation {
cudaStreamSynchronize
(
*
engine_
->
stream
());
ASSERT_FALSE
(
op_desc_
->
OutputArgumentNames
().
empty
());
const
size_t
output_space_size
=
200
;
const
size_t
output_space_size
=
200
0
;
for
(
const
auto
&
output
:
op_desc_
->
OutputArgumentNames
())
{
std
::
vector
<
float
>
fluid_out
;
std
::
vector
<
float
>
trt_out
(
output_space_size
);
engine_
->
GetOutputInCPU
(
output
,
&
trt_out
[
0
],
output_space_size
*
sizeof
(
float
));
engine_
->
GetOutputInCPU
(
output
,
&
trt_out
[
0
],
output_space_size
);
cudaStreamSynchronize
(
*
engine_
->
stream
());
auto
*
var
=
scope_
.
FindVar
(
output
);
...
...
paddle/fluid/inference/tensorrt/engine.cc
浏览文件 @
8f18f4d1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use
this file except in compliance with the License.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use
this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
...
...
@@ -26,6 +26,8 @@ namespace paddle {
namespace
inference
{
namespace
tensorrt
{
int
TensorRTEngine
::
runtime_batch_
=
1
;
void
TensorRTEngine
::
Build
(
const
DescType
&
paddle_model
)
{
PADDLE_ENFORCE
(
false
,
"not implemented"
);
}
...
...
@@ -42,6 +44,7 @@ void TensorRTEngine::Execute(int batch_size) {
PADDLE_ENFORCE_NOT_NULL
(
stream_
);
infer_context_
->
enqueue
(
batch_size
,
buffers
.
data
(),
*
stream_
,
nullptr
);
cudaStreamSynchronize
(
*
stream_
);
SetRuntimeBatch
(
batch_size
);
}
TensorRTEngine
::~
TensorRTEngine
()
{
...
...
@@ -80,17 +83,17 @@ void TensorRTEngine::FreezeNetwork() {
auto
dims
=
infer_engine_
->
getBindingDimensions
(
slot_offset
);
item
.
second
=
kDataTypeSize
[
static_cast
<
int
>
(
infer_engine_
->
getBindingDataType
(
slot_offset
))]
*
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
)
*
max_batch_
;
PADDLE_ENFORCE_GT
(
item
.
second
,
0
);
}
auto
&
buf
=
buffer
(
item
.
first
);
buf
.
max_size
=
item
.
second
*
max_batch_
;
CHECK
(
buf
.
buffer
==
nullptr
);
// buffer should be allocated only once.
PADDLE_ENFORCE_EQ
(
0
,
cudaMalloc
(
&
buf
.
buffer
,
buf
.
max_size
));
PADDLE_ENFORCE_LE
(
buf
.
max_size
,
1
<<
30
);
// 10G
// buf.size will changed in the runtime.
PADDLE_ENFORCE_EQ
(
0
,
cudaMalloc
(
&
buf
.
buffer
,
item
.
second
*
max_batch_
));
buf
.
size
=
0
;
PADDLE_ENFORCE_LE
(
buf
.
max_size
,
1
<<
30
);
// 10G
buf
.
device
=
DeviceType
::
GPU
;
}
}
...
...
@@ -105,7 +108,7 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
auto
*
input
=
infer_network_
->
addInput
(
name
.
c_str
(),
dtype
,
dims
);
PADDLE_ENFORCE
(
input
,
"infer network add input %s failed"
,
name
);
buffer_sizes_
[
name
]
=
kDataTypeSize
[
static_cast
<
int
>
(
dtype
)]
*
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
)
*
max_batch_
;
PADDLE_ENFORCE
(
input
->
isNetworkInput
());
TensorRTEngine
::
SetITensor
(
name
,
input
);
return
input
;
...
...
@@ -149,35 +152,42 @@ void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
void
TensorRTEngine
::
GetOutputInGPU
(
const
std
::
string
&
name
,
void
*
dst
,
size_t
max_size
)
{
// determine data size
auto
*
output
=
TensorRTEngine
::
GetITensor
(
name
);
nvinfer1
::
Dims
dims
=
output
->
getDimensions
();
auto
dim_size
=
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
size_t
dst_size
=
dim_size
*
runtime_batch_
*
kDataTypeSize
[
static_cast
<
int
>
(
output
->
getType
())];
auto
it
=
buffer_sizes_
.
find
(
name
);
PADDLE_ENFORCE
(
it
!=
buffer_sizes_
.
end
());
PADDLE_ENFORCE_GT
(
it
->
second
,
0
);
PADDLE_ENFORCE_GE
(
max_size
,
it
->
second
);
PADDLE_ENFORCE_LE
(
dst_size
,
it
->
second
);
PADDLE_ENFORCE_GE
(
max_size
,
dst_size
);
auto
&
buf
=
buffer
(
name
);
PADDLE_ENFORCE_NOT_NULL
(
buf
.
buffer
,
"buffer should be allocated before"
);
PADDLE_ENFORCE_EQ
(
cudaMemcpyAsync
(
dst
,
buf
.
buffer
,
it
->
second
,
PADDLE_ENFORCE_EQ
(
cudaMemcpyAsync
(
dst
,
buf
.
buffer
,
dst_size
,
cudaMemcpyDeviceToDevice
,
*
stream_
),
0
);
}
void
TensorRTEngine
::
GetOutputInCPU
(
const
std
::
string
&
name
,
void
*
dst
,
size_t
max_size
)
{
VLOG
(
4
)
<<
"get output in cpu"
;
auto
&
buf
=
buffer
(
name
);
// Update needed buffer size.
auto
slot_offset
=
infer_engine_
->
getBindingIndex
(
name
.
c_str
());
auto
dims
=
infer_engine_
->
getBindingDimensions
(
slot_offset
);
buf
.
size
=
kDataTypeSize
[
static_cast
<
int
>
(
infer_engine_
->
getBindingDataType
(
slot_offset
))]
*
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
PADDLE_ENFORCE_LE
(
buf
.
size
,
buf
.
max_size
);
// determine data size
auto
*
output
=
TensorRTEngine
::
GetITensor
(
name
);
nvinfer1
::
Dims
dims
=
output
->
getDimensions
();
auto
dim_size
=
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
size_t
dst_size
=
dim_size
*
runtime_batch_
*
kDataTypeSize
[
static_cast
<
int
>
(
output
->
getType
())];
auto
it
=
buffer_sizes_
.
find
(
name
);
PADDLE_ENFORCE
(
it
!=
buffer_sizes_
.
end
());
PADDLE_ENFORCE_GT
(
it
->
second
,
0
);
PADDLE_ENFORCE_LE
(
dst_size
,
it
->
second
);
PADDLE_ENFORCE_GE
(
max_size
,
dst_size
);
auto
&
buf
=
buffer
(
name
);
PADDLE_ENFORCE_NOT_NULL
(
buf
.
buffer
,
"buffer should be allocated before"
);
// DEBUG
memset
(
dst
,
0
,
buf
.
size
);
PADDLE_ENFORCE_EQ
(
0
,
cudaMemcpy
(
dst
,
buf
.
buffer
,
buf
.
size
,
cudaMemcpyDeviceToHost
));
PADDLE_ENFORCE_EQ
(
0
,
cudaMemcpyAsync
(
dst
,
buf
.
buffer
,
dst_size
,
cudaMemcpyDeviceToHost
,
*
stream_
));
}
Buffer
&
TensorRTEngine
::
buffer
(
const
std
::
string
&
name
)
{
...
...
@@ -225,6 +235,12 @@ nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
return
itensor_map_
[
name
];
}
void
TensorRTEngine
::
SetRuntimeBatch
(
size_t
batch_size
)
{
runtime_batch_
=
batch_size
;
}
int
TensorRTEngine
::
GetRuntimeBatch
()
{
return
runtime_batch_
;
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tensorrt/engine.h
浏览文件 @
8f18f4d1
...
...
@@ -117,10 +117,14 @@ class TensorRTEngine : public EngineBase {
nvinfer1
::
ICudaEngine
*
engine
()
{
return
infer_engine_
.
get
();
}
nvinfer1
::
INetworkDefinition
*
network
()
{
return
infer_network_
.
get
();
}
void
SetRuntimeBatch
(
size_t
batch_size
);
int
GetRuntimeBatch
();
private:
// the max batch size
int
max_batch_
;
// the runtime batch size
static
int
runtime_batch_
;
// the max memory size the engine uses
int
max_workspace_
;
...
...
paddle/fluid/inference/tensorrt/test_engine.cc
浏览文件 @
8f18f4d1
...
...
@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test {
protected:
void
SetUp
()
override
{
ASSERT_EQ
(
0
,
cudaStreamCreate
(
&
stream_
));
engine_
=
new
TensorRTEngine
(
1
,
1
<<
10
,
&
stream_
);
engine_
=
new
TensorRTEngine
(
1
0
,
1
<<
10
,
&
stream_
);
engine_
->
InitNetwork
();
}
...
...
@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
LOG
(
INFO
)
<<
"to get output"
;
float
y_cpu
;
engine_
->
GetOutputInCPU
(
"y"
,
&
y_cpu
,
sizeof
(
float
));
engine_
->
GetOutputInCPU
(
"y"
,
&
y_cpu
,
1
*
sizeof
(
float
));
LOG
(
INFO
)
<<
"to checkout output"
;
ASSERT_EQ
(
y_cpu
,
x_v
*
2
+
3
);
...
...
@@ -103,15 +103,49 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
LOG
(
INFO
)
<<
"to get output"
;
float
y_cpu
[
2
]
=
{
-
1.
,
-
1.
};
auto
dims
=
engine_
->
GetITensor
(
"y"
)
->
getDimensions
();
ASSERT_EQ
(
dims
.
nbDims
,
3
);
ASSERT_EQ
(
dims
.
d
[
0
],
2
);
ASSERT_EQ
(
dims
.
d
[
1
],
1
);
engine_
->
GetOutputInCPU
(
"y"
,
&
y_cpu
[
0
],
sizeof
(
float
)
*
2
);
engine_
->
GetOutputInCPU
(
"y"
,
&
y_cpu
[
0
],
2
*
sizeof
(
float
)
);
ASSERT_EQ
(
y_cpu
[
0
],
4.5
);
ASSERT_EQ
(
y_cpu
[
1
],
14.5
);
}
TEST_F
(
TensorRTEngineTest
,
test_conv2d_temp
)
{
// Weight in CPU memory.
float
raw_weight
[
9
]
=
{
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
};
float
raw_bias
[
1
]
=
{
0
};
TensorRTEngine
::
Weight
weight
(
nvinfer1
::
DataType
::
kFLOAT
,
raw_weight
,
9
);
TensorRTEngine
::
Weight
bias
(
nvinfer1
::
DataType
::
kFLOAT
,
raw_bias
,
1
);
auto
*
x
=
engine_
->
DeclareInput
(
"x"
,
nvinfer1
::
DataType
::
kFLOAT
,
nvinfer1
::
Dims3
{
1
,
3
,
3
});
auto
*
conv_layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Convolution
,
*
x
,
1
,
nvinfer1
::
DimsHW
{
3
,
3
},
weight
.
get
(),
bias
.
get
());
PADDLE_ENFORCE
(
conv_layer
!=
nullptr
);
conv_layer
->
setStride
(
nvinfer1
::
DimsHW
{
1
,
1
});
conv_layer
->
setPadding
(
nvinfer1
::
DimsHW
{
1
,
1
});
engine_
->
DeclareOutput
(
conv_layer
,
0
,
"y"
);
engine_
->
FreezeNetwork
();
ASSERT_EQ
(
engine_
->
engine
()
->
getNbBindings
(),
2
);
float
x_v
[
18
]
=
{
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
};
engine_
->
SetInputFromCPU
(
"x"
,
reinterpret_cast
<
void
*>
(
&
x_v
),
18
*
sizeof
(
float
));
engine_
->
Execute
(
2
);
LOG
(
INFO
)
<<
"to get output"
;
float
*
y_cpu
=
new
float
[
18
];
engine_
->
GetOutputInCPU
(
"y"
,
&
y_cpu
[
0
],
18
*
sizeof
(
float
));
ASSERT_EQ
(
y_cpu
[
0
],
4.0
);
ASSERT_EQ
(
y_cpu
[
1
],
6.0
);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
paddle/fluid/inference/tests/test_helper.h
浏览文件 @
8f18f4d1
...
...
@@ -210,13 +210,14 @@ void TestInference(const std::string& dirname,
// Ignore the profiling results of the first run
std
::
unique_ptr
<
paddle
::
framework
::
ExecutorPrepareContext
>
ctx
;
bool
CreateLocalScope
=
CreateVars
;
if
(
PrepareContext
)
{
ctx
=
executor
.
Prepare
(
*
inference_program
,
0
);
executor
.
RunPreparedContext
(
ctx
.
get
(),
scope
,
&
feed_targets
,
&
fetch_targets
,
tru
e
,
CreateVars
);
&
fetch_targets
,
CreateLocalScop
e
,
CreateVars
);
}
else
{
executor
.
Run
(
*
inference_program
,
scope
,
&
feed_targets
,
&
fetch_targets
,
tru
e
,
CreateVars
);
CreateLocalScop
e
,
CreateVars
);
}
// Enable the profiler
...
...
@@ -232,10 +233,11 @@ void TestInference(const std::string& dirname,
// Note: if you change the inference_program, you need to call
// executor.Prepare() again to get a new ExecutorPrepareContext.
executor
.
RunPreparedContext
(
ctx
.
get
(),
scope
,
&
feed_targets
,
&
fetch_targets
,
CreateVars
);
&
fetch_targets
,
CreateLocalScope
,
CreateVars
);
}
else
{
executor
.
Run
(
*
inference_program
,
scope
,
&
feed_targets
,
&
fetch_targets
,
CreateVars
);
Create
LocalScope
,
Create
Vars
);
}
}
...
...
paddle/fluid/memory/detail/buddy_allocator.cc
浏览文件 @
8f18f4d1
...
...
@@ -15,6 +15,10 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "glog/logging.h"
DEFINE_bool
(
free_idle_memory
,
false
,
"If it is true, Paddle will try to free idle memory trunks during "
"running time."
);
namespace
paddle
{
namespace
memory
{
namespace
detail
{
...
...
@@ -152,13 +156,14 @@ void BuddyAllocator::Free(void* p) {
pool_
.
insert
(
IndexSizeAddress
(
block
->
index
(
cache_
),
block
->
total_size
(
cache_
),
block
));
if
(
FLAGS_free_idle_memory
)
{
// Clean up if existing too much free memory
// Prefer freeing fallback allocation first
CleanIdleFallBackAlloc
();
// Free normal allocation
CleanIdleNormalAlloc
();
}
}
size_t
BuddyAllocator
::
Used
()
{
return
total_used_
;
}
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
8f18f4d1
...
...
@@ -192,9 +192,9 @@ if(WITH_DISTRIBUTE)
set
(
DISTRIBUTE_DEPS
""
)
if
(
WITH_GRPC
)
set
(
DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf
)
set
(
DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf
node
)
else
()
set
(
DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib
)
set
(
DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib
node
)
if
(
WITH_BRPC_RDMA
)
find_library
(
IBVERBS_LIBRARY NAMES ibverbs
)
ADD_LIBRARY
(
ibverbs SHARED IMPORTED GLOBAL
)
...
...
paddle/fluid/operators/conv_cudnn_op.cu.cc
浏览文件 @
8f18f4d1
...
...
@@ -77,7 +77,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionGroupCount
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionGroupCount
(
cudnn_conv_desc
,
groups
));
groups
=
1
;
#endif
...
...
@@ -129,7 +129,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
...
...
@@ -140,18 +140,18 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
if
(
dev_ctx
.
GetComputeCapability
()
>=
70
&&
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
platform
::
float16
)))
{
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_TENSOR_OP_MATH
));
// Currently tensor core is only enabled using this algo
algo
=
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
;
}
else
{
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cudnn_conv_desc
,
CUDNN_DEFAULT_MATH
));
}
#endif
// get workspace size able to allocate
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
algo
,
&
workspace_size_in_bytes
));
// It is possible for float16 on Volta GPU to allocate more memory than
...
...
@@ -165,7 +165,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv forward ---------------------
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
...
...
@@ -218,7 +218,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// cudnn 7 can support groups, no need to do it mannually
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionGroupCount
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnSetConvolutionGroupCount
(
cudnn_conv_desc
,
groups
));
groups
=
1
;
#endif
...
...
@@ -273,7 +273,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
auto
handle
=
dev_ctx
.
cudnn_handle
();
if
(
input_grad
)
{
if
(
FLAGS_cudnn_deterministic
)
{
PADDLE
_ENFORCE
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
handle
,
cudnn_filter_desc
,
// dyDesc: Handle to the previously initialized input
...
...
@@ -289,7 +289,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
data_algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
PADDLE
_ENFORCE
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
handle
,
cudnn_filter_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_input_desc
,
data_algo
,
&
tmp_size
));
...
...
@@ -298,7 +298,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
if
(
filter_grad
)
{
if
(
FLAGS_cudnn_deterministic
)
{
PADDLE
_ENFORCE
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
handle
,
cudnn_input_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_filter_desc
,
...
...
@@ -308,7 +308,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
filter_algo
=
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
}
PADDLE
_ENFORCE
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
handle
,
cudnn_input_desc
,
cudnn_output_grad_desc
,
cudnn_conv_desc
,
cudnn_filter_desc
,
filter_algo
,
&
tmp_size
));
...
...
@@ -326,7 +326,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset input_grad.
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
i
*
group_offset_filter
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
data_algo
,
...
...
@@ -339,7 +339,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
T
*
filter_grad_data
=
filter_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Because beta is zero, it is unnecessary to reset filter_grad.
for
(
int
i
=
0
;
i
<
groups
;
i
++
)
{
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
handle
,
&
alpha
,
cudnn_input_desc
,
input_data
+
i
*
group_offset_in
,
cudnn_output_grad_desc
,
output_grad_data
+
i
*
group_offset_out
,
cudnn_conv_desc
,
filter_algo
,
cudnn_workspace
,
...
...
paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
浏览文件 @
8f18f4d1
...
...
@@ -87,7 +87,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
auto
&
dev_ctx
=
ctx
.
template
device_context
<
platform
::
CUDADeviceContext
>();
auto
handle
=
dev_ctx
.
cudnn_handle
();
// Get the algorithm
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
handle
,
cudnn_filter_desc
,
cudnn_input_desc
,
cudnn_conv_desc
,
// dxDesc: Handle to the previously initialized output tensor
// descriptor.
...
...
@@ -95,7 +95,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
workspace_size_limit
,
&
algo
));
// get workspace size able to allocate
PADDLE
_ENFORCE
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
handle
,
cudnn_filter_desc
,
cudnn_input_desc
,
cudnn_conv_desc
,
cudnn_output_desc
,
algo
,
&
workspace_size_in_bytes
));
...
...
@@ -110,7 +110,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
int
filter_offset
=
filter
->
numel
()
/
groups
;
T
alpha
=
1.0
f
,
beta
=
0.0
f
;
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
handle
,
&
alpha
,
cudnn_filter_desc
,
filter_data
+
filter_offset
*
g
,
cudnn_input_desc
,
input_data
+
input_offset
*
g
,
cudnn_conv_desc
,
algo
,
cudnn_workspace
,
workspace_size_in_bytes
,
&
beta
,
...
...
@@ -178,11 +178,11 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
auto
handle
=
dev_ctx
.
cudnn_handle
();
if
(
input_grad
)
{
// choose backward algorithm for data
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
handle
,
cudnn_output_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_input_desc
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
data_algo
));
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
handle
,
cudnn_output_desc
,
cudnn_filter_desc
,
cudnn_conv_desc
,
cudnn_input_desc
,
data_algo
,
&
fwd_ws_size
));
workspace_size_in_bytes
=
std
::
max
(
workspace_size_in_bytes
,
fwd_ws_size
);
...
...
@@ -190,7 +190,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
if
(
filter_grad
)
{
// choose backward algorithm for filter
PADDLE
_ENFORCE
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
handle
,
cudnn_output_desc
,
cudnn_input_desc
,
cudnn_conv_desc
,
cudnn_filter_desc
,
...
...
@@ -198,7 +198,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
workspace_size_limit
,
&
filter_algo
));
// get workspace for backwards filter algorithm
PADDLE
_ENFORCE
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
handle
,
cudnn_output_desc
,
cudnn_input_desc
,
cudnn_conv_desc
,
cudnn_filter_desc
,
filter_algo
,
&
bwd_filter_ws_size
));
...
...
@@ -222,7 +222,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Because beta is zero, it is unnecessary to reset input_grad.
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionForward
(
handle
,
&
alpha
,
cudnn_output_desc
,
output_grad_data
+
output_grad_offset
*
g
,
cudnn_filter_desc
,
filter_data
+
filter_offset
*
g
,
cudnn_conv_desc
,
data_algo
,
...
...
@@ -237,7 +237,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
// Because beta is zero, it is unnecessary to reset filter_grad.
// Gradient with respect to the filter
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnConvolutionBackwardFilter
(
handle
,
&
alpha
,
cudnn_output_desc
,
output_grad_data
+
output_grad_offset
*
g
,
cudnn_input_desc
,
input_data
+
input_offset
*
g
,
cudnn_conv_desc
,
filter_algo
,
...
...
paddle/fluid/operators/distributed/CMakeLists.txt
浏览文件 @
8f18f4d1
...
...
@@ -18,7 +18,7 @@ if(WITH_GRPC)
set_source_files_properties
(
grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS
${
DISTRIBUTE_COMPILE_FLAGS
}
)
cc_test
(
grpc_serde_test SRCS grpc_serde_test.cc
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL
)
cc_test
(
g
rpc_server_test SRCS rpc_server_test.cc
cc_test
(
rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL
)
return
()
endif
()
...
...
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
8f18f4d1
...
...
@@ -36,20 +36,16 @@ void GRPCClient::InitEventLoop() {
client_thread_
.
reset
(
new
std
::
thread
(
std
::
bind
(
&
GRPCClient
::
Proceed
,
this
)));
}
void
GRPCClient
::
SendBeginPass
()
{
void
GRPCClient
::
SendComplete
()
{
std
::
unique_lock
<
std
::
mutex
>
lk
(
completed_mutex_
);
if
(
!
completed_
)
{
for
(
auto
&
it
:
channels_
)
{
VLOG
(
3
)
<<
"send begin pass to:
"
<<
it
.
first
;
this
->
AsyncSendBeginPass
(
it
.
first
);
VLOG
(
3
)
<<
"send complete message to
"
<<
it
.
first
;
this
->
AsyncSendComplete
(
it
.
first
);
}
this
->
Wait
();
}
void
GRPCClient
::
SendEndPass
()
{
for
(
auto
&
it
:
channels_
)
{
VLOG
(
3
)
<<
"send end pass to "
<<
it
.
first
;
this
->
AsyncSendEndPass
(
it
.
first
);
PADDLE_ENFORCE
(
this
->
Wait
(),
"internal grpc error"
);
completed_
=
true
;
}
this
->
Wait
();
}
GRPCClient
::~
GRPCClient
()
{
...
...
@@ -239,32 +235,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
req_count_
++
;
}
void
GRPCClient
::
AsyncSend
BeginPass
(
const
std
::
string
&
ep
,
int64_t
time_out
)
{
void
GRPCClient
::
AsyncSend
Complete
(
const
std
::
string
&
ep
,
int64_t
time_out
)
{
const
auto
ch
=
GetChannel
(
ep
);
BatchBarrierProcessor
*
s
=
new
BatchBarrierProcessor
(
ch
);
s
->
Prepare
(
time_out
);
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
BEGIN_PASS
_MESSAGE
);
req
.
set_varname
(
COMPLETE
_MESSAGE
);
auto
rpc
=
s
->
stub_
->
AsyncSendVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
req_count_
++
;
}
void
GRPCClient
::
AsyncSendEndPass
(
const
std
::
string
&
ep
,
int64_t
time_out
)
{
const
auto
ch
=
GetChannel
(
ep
);
FetchBarrierProcessor
*
s
=
new
FetchBarrierProcessor
(
ch
);
s
->
Prepare
(
time_out
);
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
END_PASS_MESSAGE
);
auto
rpc
=
s
->
stub_
->
AsyncGetVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
req_count_
++
;
}
void
GRPCClient
::
AsyncCheckpointNotify
(
const
std
::
string
&
ep
,
const
std
::
string
&
dir
,
int64_t
time_out
)
{
...
...
paddle/fluid/operators/distributed/grpc_client.h
浏览文件 @
8f18f4d1
...
...
@@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
class
GRPCClient
:
public
RPCClient
{
public:
GRPCClient
()
:
ok_
(
true
)
{}
GRPCClient
()
:
ok_
(
true
)
,
completed_
(
false
)
{}
virtual
~
GRPCClient
();
bool
AsyncSendVar
(
const
std
::
string
&
ep
,
const
platform
::
DeviceContext
&
ctx
,
...
...
@@ -201,17 +201,12 @@ class GRPCClient : public RPCClient {
void
AsyncCheckpointNotify
(
const
std
::
string
&
ep
,
const
std
::
string
&
dir
,
int64_t
time_out
=
FLAGS_rpc_deadline
)
override
;
void
AsyncSendBeginPass
(
const
std
::
string
&
ep
,
int64_t
time_out
=
FLAGS_rpc_deadline
)
override
;
void
AsyncSendEndPass
(
const
std
::
string
&
ep
,
void
AsyncSendComplete
(
const
std
::
string
&
ep
,
int64_t
time_out
=
FLAGS_rpc_deadline
)
override
;
bool
Wait
()
override
;
void
SendBeginPass
()
override
;
void
SendEndPass
()
override
;
void
SendComplete
()
override
;
protected:
void
InitImpl
()
override
;
...
...
@@ -238,6 +233,10 @@ class GRPCClient : public RPCClient {
// mutex for GetChannel thread safety
std
::
mutex
chan_mutex_
;
DISABLE_COPY_AND_ASSIGN
(
GRPCClient
);
// mutex for sending complete message only once
std
::
mutex
completed_mutex_
;
bool
completed_
;
};
}
// namespace distributed
...
...
paddle/fluid/operators/distributed/request_handler.h
浏览文件 @
8f18f4d1
...
...
@@ -43,8 +43,6 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
#define COMPLETE_MESSAGE "COMPLETE@RECV"
#define BEGIN_PASS_MESSAGE "BEGIN_PASS@RECV"
#define END_PASS_MESSAGE "END_PASS@RECV"
#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"
...
...
paddle/fluid/operators/distributed/request_handler_impl.cc
浏览文件 @
8f18f4d1
...
...
@@ -55,10 +55,9 @@ bool RequestSendHandler::Handle(const std::string& varname,
if
(
varname
==
BATCH_BARRIER_MESSAGE
)
{
VLOG
(
3
)
<<
"sync: recv BATCH_BARRIER_MESSAGE"
;
rpc_server_
->
IncreaseBatchBarrier
(
kRequestSend
);
}
else
if
(
varname
==
BEGIN_PASS_MESSAGE
)
{
VLOG
(
3
)
<<
"sync: recv begin pass message"
;
rpc_server_
->
WaitCond
(
kRequestSend
);
rpc_server_
->
BeginPass
();
}
else
if
(
varname
==
COMPLETE_MESSAGE
)
{
VLOG
(
3
)
<<
"sync: recv complete message"
;
rpc_server_
->
Complete
();
}
else
{
VLOG
(
3
)
<<
"sync: received var_name: "
<<
varname
;
rpc_server_
->
WaitCond
(
kRequestSend
);
...
...
@@ -94,14 +93,12 @@ bool RequestGetHandler::Handle(const std::string& varname,
if
(
varname
==
FETCH_BARRIER_MESSAGE
)
{
VLOG
(
3
)
<<
"sync: recv fetch barrier message"
;
rpc_server_
->
IncreaseBatchBarrier
(
kRequestGet
);
}
else
if
(
varname
==
END_PASS_MESSAGE
)
{
rpc_server_
->
EndPass
();
}
else
{
rpc_server_
->
WaitCond
(
kRequestGet
);
*
outvar
=
scope_
->
FindVar
(
varname
);
}
}
else
{
if
(
varname
!=
FETCH_BARRIER_MESSAGE
&&
varname
!=
END_PASS
_MESSAGE
)
{
if
(
varname
!=
FETCH_BARRIER_MESSAGE
&&
varname
!=
COMPLETE
_MESSAGE
)
{
*
outvar
=
scope_
->
FindVar
(
varname
);
}
}
...
...
paddle/fluid/operators/distributed/rpc_client.h
浏览文件 @
8f18f4d1
...
...
@@ -60,17 +60,13 @@ class RPCClient {
const
std
::
string
&
dir
,
int64_t
time_out
=
FLAGS_rpc_deadline
)
=
0
;
virtual
void
AsyncSend
BeginPass
(
const
std
::
string
&
ep
,
virtual
void
AsyncSend
Complete
(
const
std
::
string
&
ep
,
int64_t
time_out
=
FLAGS_rpc_deadline
)
=
0
;
virtual
void
AsyncSendEndPass
(
const
std
::
string
&
ep
,
int64_t
time_out
=
FLAGS_rpc_deadline
)
=
0
;
// BeginePass/EndPass tells all the pserver that start/end a pass, so that
// the pserver can increase/reduce it's barrier count, and continue to train
// Complete tells all the pserver instances that finishe the training,
// the pserver can reduce it's barrier count, and continue to train
// with other trainers.
virtual
void
SendBeginPass
()
=
0
;
virtual
void
SendEndPass
()
=
0
;
virtual
void
SendComplete
()
=
0
;
virtual
bool
Wait
()
=
0
;
...
...
paddle/fluid/operators/distributed/rpc_server.cc
浏览文件 @
8f18f4d1
...
...
@@ -64,18 +64,7 @@ void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
}
}
void
RPCServer
::
BeginPass
()
{
VLOG
(
4
)
<<
"RPCServer begin increase pass barrier"
;
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
client_num_
++
;
VLOG
(
4
)
<<
"increase client_num to: "
<<
client_num_
;
}
barrier_cond_
.
notify_all
();
}
void
RPCServer
::
EndPass
()
{
VLOG
(
4
)
<<
"RPCServer begin increase pass barrier"
;
void
RPCServer
::
Complete
()
{
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
client_num_
--
;
...
...
@@ -87,6 +76,11 @@ void RPCServer::EndPass() {
barrier_cond_
.
notify_all
();
}
int
RPCServer
::
GetClientNum
()
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
return
client_num_
;
}
void
RPCServer
::
ResetBarrierCounter
()
{
VLOG
(
3
)
<<
"RPCServer ResetBarrierCounter "
;
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
...
...
paddle/fluid/operators/distributed/rpc_server.h
浏览文件 @
8f18f4d1
...
...
@@ -44,7 +44,7 @@ class RPCServer {
int
GetSelectedPort
()
const
{
return
selected_port_
;
}
int
GetClientNum
()
const
;
int
GetClientNum
();
void
SavePort
()
const
;
...
...
@@ -64,8 +64,7 @@ class RPCServer {
void
WaitCond
(
const
std
::
string
&
rpc_name
);
void
IncreaseBatchBarrier
(
const
std
::
string
rpc_name
);
void
BeginPass
();
void
EndPass
();
void
Complete
();
void
ResetBarrierCounter
();
...
...
paddle/fluid/operators/distributed/rpc_server_test.cc
浏览文件 @
8f18f4d1
...
...
@@ -91,7 +91,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
}
}
void
StartServer
()
{
void
StartServer
(
const
std
::
string
&
rpc_name
)
{
framework
::
ProgramDesc
program
;
framework
::
Scope
scope
;
platform
::
CPUPlace
place
;
...
...
@@ -107,14 +107,14 @@ void StartServer() {
std
::
shared_ptr
<
framework
::
ExecutorPrepareContext
>>
prefetch_var_name_to_prepared
;
prefetch_var_name_to_prepared
[
in_var_name
]
=
prepared
[
0
];
g_req_handler
->
SetProgram
(
&
program
);
g_req_handler
->
SetPrefetchPreparedCtx
(
&
prefetch_var_name_to_prepared
);
g_req_handler
->
SetDevCtx
(
&
ctx
);
g_req_handler
->
SetScope
(
&
scope
);
g_req_handler
->
SetExecutor
(
&
exe
);
g_rpc_service
->
RegisterRPC
(
distributed
::
kRequestPrefetch
,
g_req_handler
.
get
());
g_rpc_service
->
RegisterRPC
(
rpc_name
,
g_req_handler
.
get
());
g_req_handler
->
SetRPCServer
(
g_rpc_service
.
get
());
std
::
thread
server_thread
(
...
...
@@ -129,7 +129,7 @@ TEST(PREFETCH, CPU) {
distributed
::
RPCClient
*
client
=
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
();
std
::
thread
server_thread
(
StartServer
);
std
::
thread
server_thread
(
StartServer
,
distributed
::
kRequestPrefetch
);
g_rpc_service
->
WaitServerReady
();
int
port
=
g_rpc_service
->
GetSelectedPort
();
...
...
@@ -162,3 +162,24 @@ TEST(PREFETCH, CPU) {
g_rpc_service
.
reset
(
nullptr
);
g_req_handler
.
reset
(
nullptr
);
}
TEST
(
COMPLETE
,
CPU
)
{
g_req_handler
.
reset
(
new
distributed
::
RequestSendHandler
(
true
));
g_rpc_service
.
reset
(
new
RPCSERVER_T
(
"127.0.0.1:0"
,
2
));
distributed
::
RPCClient
*
client
=
distributed
::
RPCClient
::
GetInstance
<
RPCCLIENT_T
>
();
PADDLE_ENFORCE
(
client
!=
nullptr
);
std
::
thread
server_thread
(
StartServer
,
distributed
::
kRequestSend
);
g_rpc_service
->
WaitServerReady
();
int
port
=
g_rpc_service
->
GetSelectedPort
();
std
::
string
ep
=
paddle
::
string
::
Sprintf
(
"127.0.0.1:%d"
,
port
);
client
->
AsyncSendComplete
(
ep
);
client
->
Wait
();
EXPECT_EQ
(
g_rpc_service
->
GetClientNum
(),
1
);
g_rpc_service
->
ShutDown
();
server_thread
.
join
();
g_rpc_service
.
reset
(
nullptr
);
g_req_handler
.
reset
(
nullptr
);
}
paddle/fluid/operators/math/im2col.cc
浏览文件 @
8f18f4d1
...
...
@@ -40,22 +40,47 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
col
_height
=
col
->
dims
()[
3
];
int
col
_width
=
col
->
dims
()[
4
];
int
output
_height
=
col
->
dims
()[
3
];
int
output
_width
=
col
->
dims
()[
4
];
int
channels_col
=
im_channels
*
filter_height
*
filter_width
;
const
T
*
im_data
=
im
.
data
<
T
>
();
T
*
col_data
=
col
->
data
<
T
>
();
// TODO(TJ): change me to template
// further optimaze:
// 1. padding != 1
// 2. could also support stride_h != 1
if
(
stride
[
0
]
==
1
&&
stride
[
1
]
==
1
&&
dilation
[
0
]
==
1
&&
dilation
[
1
]
==
1
&&
padding
[
0
]
==
0
&&
padding
[
1
]
==
0
)
{
int
col_matrix_width
=
output_width
*
output_height
;
size_t
copy_size
=
sizeof
(
T
)
*
output_width
;
for
(
int
oh
=
0
;
oh
<
output_height
;
++
oh
)
{
const
T
*
im_data_start
=
im_data
+
oh
*
im_width
;
T
*
dst_data
=
col_data
+
oh
*
output_width
;
for
(
int
ic
=
0
;
ic
<
im_channels
;
++
ic
)
{
const
T
*
src_data
=
im_data_start
+
ic
*
im_height
*
im_width
;
for
(
int
kh
=
0
;
kh
<
filter_height
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
filter_width
;
++
kw
)
{
std
::
memcpy
(
dst_data
,
src_data
+
kw
,
copy_size
);
dst_data
=
dst_data
+
col_matrix_width
;
}
src_data
=
src_data
+
im_width
;
}
}
}
return
;
}
for
(
int
c
=
0
;
c
<
channels_col
;
++
c
)
{
int
w_offset
=
c
%
filter_width
;
int
h_offset
=
(
c
/
filter_width
)
%
filter_height
;
int
c_im
=
c
/
(
filter_width
*
filter_height
);
for
(
int
h
=
0
;
h
<
col
_height
;
++
h
)
{
for
(
int
h
=
0
;
h
<
output
_height
;
++
h
)
{
int
im_row_idx
=
h
*
stride
[
0
]
-
padding
[
0
]
+
h_offset
*
dilation
[
0
];
for
(
int
w
=
0
;
w
<
col
_width
;
++
w
)
{
for
(
int
w
=
0
;
w
<
output
_width
;
++
w
)
{
int
im_col_idx
=
w
*
stride
[
1
]
-
padding
[
1
]
+
w_offset
*
dilation
[
1
];
int
col_idx
=
(
c
*
col_height
+
h
)
*
col
_width
+
w
;
int
col_idx
=
(
c
*
output_height
+
h
)
*
output
_width
+
w
;
int
im_idx
=
(
im_row_idx
+
c_im
*
im_height
)
*
im_width
+
im_col_idx
;
col_data
[
col_idx
]
=
(
im_row_idx
<
0
||
im_row_idx
>=
im_height
||
...
...
paddle/fluid/operators/math/im2col_test.cc
浏览文件 @
8f18f4d1
...
...
@@ -160,8 +160,80 @@ void testIm2col() {
delete
context
;
}
void
testIm2colCPU
(
int
ic
,
int
ih
,
int
iw
,
int
fh
,
int
fw
,
int
ph
,
int
pw
)
{
paddle
::
framework
::
Tensor
input
;
paddle
::
framework
::
Tensor
output
;
paddle
::
framework
::
Tensor
ref_output
;
std
::
vector
<
int
>
padding
({
ph
,
pw
});
std
::
vector
<
int
>
stride
({
1
,
1
});
// stride_y, stride_x
std
::
vector
<
int
>
dilation
({
1
,
1
});
// dilation_y, dilation_x
int
output_height
=
(
ih
-
fh
+
padding
[
0
]
*
2
)
/
stride
[
0
]
+
1
;
int
output_width
=
(
iw
-
fw
+
padding
[
1
]
*
2
)
/
stride
[
1
]
+
1
;
float
*
input_ptr
=
input
.
mutable_data
<
float
>
({
ic
,
ih
,
iw
},
paddle
::
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
input
.
numel
();
++
i
)
{
input_ptr
[
i
]
=
static_cast
<
float
>
(
i
+
1
);
}
paddle
::
platform
::
CPUPlace
place
;
paddle
::
platform
::
CPUDeviceContext
context
(
place
);
output
.
mutable_data
<
float
>
({
ic
,
fh
,
fw
,
output_height
,
output_width
},
place
);
ref_output
.
mutable_data
<
float
>
({
ic
,
fh
,
fw
,
output_height
,
output_width
},
place
);
paddle
::
operators
::
math
::
Im2ColFunctor
<
paddle
::
operators
::
math
::
ColFormat
::
kCFO
,
paddle
::
platform
::
CPUDeviceContext
,
float
>
im2col
;
im2col
(
context
,
input
,
dilation
,
stride
,
padding
,
&
output
);
auto
ref_im2col
=
[
&
](
const
paddle
::
framework
::
Tensor
&
im
,
const
std
::
vector
<
int
>&
dilation
,
const
std
::
vector
<
int
>&
stride
,
const
std
::
vector
<
int
>&
padding
,
paddle
::
framework
::
Tensor
*
col
)
{
int
im_channels
=
im
.
dims
()[
0
];
int
im_height
=
im
.
dims
()[
1
];
int
im_width
=
im
.
dims
()[
2
];
int
filter_height
=
col
->
dims
()[
1
];
int
filter_width
=
col
->
dims
()[
2
];
int
output_height
=
col
->
dims
()[
3
];
int
output_width
=
col
->
dims
()[
4
];
int
channels_col
=
im_channels
*
filter_height
*
filter_width
;
const
float
*
im_data
=
im
.
data
<
float
>
();
float
*
col_data
=
col
->
data
<
float
>
();
for
(
int
c
=
0
;
c
<
channels_col
;
++
c
)
{
int
w_offset
=
c
%
filter_width
;
int
h_offset
=
(
c
/
filter_width
)
%
filter_height
;
int
c_im
=
c
/
(
filter_width
*
filter_height
);
for
(
int
h
=
0
;
h
<
output_height
;
++
h
)
{
int
im_row_idx
=
h
*
stride
[
0
]
-
padding
[
0
]
+
h_offset
*
dilation
[
0
];
for
(
int
w
=
0
;
w
<
output_width
;
++
w
)
{
int
im_col_idx
=
w
*
stride
[
1
]
-
padding
[
1
]
+
w_offset
*
dilation
[
1
];
int
col_idx
=
(
c
*
output_height
+
h
)
*
output_width
+
w
;
int
im_idx
=
(
im_row_idx
+
c_im
*
im_height
)
*
im_width
+
im_col_idx
;
col_data
[
col_idx
]
=
(
im_row_idx
<
0
||
im_row_idx
>=
im_height
||
im_col_idx
<
0
||
im_col_idx
>=
im_width
)
?
0.
f
:
im_data
[
im_idx
];
}
}
}
};
ref_im2col
(
input
,
dilation
,
stride
,
padding
,
&
ref_output
);
float
*
out_cfo_ptr
=
output
.
data
<
float
>
();
float
*
out_ref_ptr
=
ref_output
.
data
<
float
>
();
for
(
int
i
=
0
;
i
<
output
.
numel
();
++
i
)
{
EXPECT_EQ
(
out_cfo_ptr
[
i
],
out_ref_ptr
[
i
]);
}
}
TEST
(
math
,
im2col
)
{
testIm2col
<
paddle
::
platform
::
CPUDeviceContext
,
paddle
::
platform
::
CPUPlace
>
();
testIm2colCPU
(
/*ic*/
3
,
/*ih*/
5
,
/*iw*/
5
,
/*fh*/
3
,
/*fw*/
2
,
/*ph*/
0
,
/*pw*/
0
);
testIm2colCPU
(
/*ic*/
2
,
/*ih*/
5
,
/*iw*/
4
,
/*fh*/
3
,
/*fw*/
3
,
/*ph*/
1
,
/*pw*/
1
);
#ifdef PADDLE_WITH_CUDA
testIm2col
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
CUDAPlace
>
();
...
...
paddle/fluid/operators/math/softmax.cu
浏览文件 @
8f18f4d1
...
...
@@ -52,7 +52,7 @@ void SoftmaxCUDNNFunctor<T>::operator()(
xDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
cudnnTensorDescriptor_t
cudnn_y_desc
=
xDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnSoftmaxForward
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnSoftmaxForward
(
context
.
cudnn_handle
(),
CUDNN_SOFTMAX_ACCURATE
,
CUDNN_SOFTMAX_MODE_INSTANCE
,
CudnnDataType
<
T
>::
kOne
(),
cudnn_x_desc
,
X
->
data
<
T
>
(),
CudnnDataType
<
T
>::
kZero
(),
cudnn_y_desc
,
...
...
@@ -83,7 +83,7 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
dxDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
cudnnTensorDescriptor_t
cudnn_ygrad_desc
=
dyDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnSoftmaxBackward
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnSoftmaxBackward
(
context
.
cudnn_handle
(),
CUDNN_SOFTMAX_ACCURATE
,
CUDNN_SOFTMAX_MODE_INSTANCE
,
CudnnDataType
<
T
>::
kOne
(),
cudnn_y_desc
,
Y
->
data
<
T
>
(),
cudnn_ygrad_desc
,
YGrad
->
data
<
T
>
(),
...
...
paddle/fluid/operators/pool_cudnn_op.cu.cc
浏览文件 @
8f18f4d1
...
...
@@ -81,7 +81,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn pool algorithm ---------------------
auto
handle
=
ctx
.
cuda_device_context
().
cudnn_handle
();
ScalingParamType
<
T
>
alpha
=
1.0
f
,
beta
=
0.0
f
;
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnPoolingForward
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnPoolingForward
(
handle
,
cudnn_pool_desc
,
&
alpha
,
cudnn_input_desc
,
input_data
,
&
beta
,
cudnn_output_desc
,
output_data
));
}
...
...
@@ -154,7 +154,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
T
*
input_grad_data
=
input_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Because beta is zero, it is unnecessary to reset input_grad.
PADDLE
_ENFORCE
(
platform
::
dynload
::
cudnnPoolingBackward
(
CUDNN
_ENFORCE
(
platform
::
dynload
::
cudnnPoolingBackward
(
handle
,
cudnn_pool_desc
,
&
alpha
,
cudnn_output_desc
,
output_data
,
cudnn_output_desc
,
output_grad_data
,
cudnn_input_desc
,
input_data
,
&
beta
,
cudnn_input_desc
,
input_grad_data
));
...
...
paddle/fluid/operators/send_recv_util.h
浏览文件 @
8f18f4d1
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/node.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -22,7 +23,10 @@ inline bool NeedSend(const framework::Scope& scope,
const
std
::
string
&
varname
)
{
// dummy variable is only used in parallel executor to represent
// some dependency relationship, we don't need to send/recv it.
if
(
varname
==
"dummy"
)
return
false
;
// TODO(paddle-dev): Why would parallel executor logic leaked into here?
if
(
varname
.
find
(
framework
::
ir
::
Node
::
kControlDepVarName
)
!=
std
::
string
::
npos
)
return
false
;
auto
*
var
=
scope
.
FindVar
(
varname
);
PADDLE_ENFORCE_NOT_NULL
(
var
,
"Can not find variable '%s' in the send side."
,
varname
);
...
...
paddle/fluid/operators/tensorrt_engine_op.cc
浏览文件 @
8f18f4d1
...
...
@@ -55,13 +55,14 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
"TensorRT' tensor input requires at least 2 dimensions"
);
PADDLE_ENFORCE_LE
(
shape
.
size
(),
4UL
,
"TensorRT' tensor input requires at most 4 dimensions"
);
switch
(
shape
.
size
())
{
case
2
:
return
nvinfer1
::
Dims2
(
shape
[
0
]
,
shape
[
1
]);
return
nvinfer1
::
Dims2
(
1
,
shape
[
1
]);
case
3
:
return
nvinfer1
::
Dims3
(
shape
[
0
]
,
shape
[
1
],
shape
[
2
]);
return
nvinfer1
::
Dims3
(
1
,
shape
[
1
],
shape
[
2
]);
case
4
:
return
nvinfer1
::
Dims4
(
shape
[
0
]
,
shape
[
1
],
shape
[
2
],
shape
[
3
]);
return
nvinfer1
::
Dims4
(
1
,
shape
[
1
],
shape
[
2
],
shape
[
3
]);
default:
return
nvinfer1
::
Dims
();
}
...
...
paddle/fluid/operators/tensorrt_engine_op.h
浏览文件 @
8f18f4d1
...
...
@@ -93,13 +93,15 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
auto
*
fluid_v
=
context
.
scope
().
FindVar
(
y
);
PADDLE_ENFORCE_NOT_NULL
(
fluid_v
,
"no output variable called %s"
,
y
);
auto
*
fluid_t
=
fluid_v
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
size
=
inference
::
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
);
fluid_t
->
Resize
(
framework
::
make_ddim
(
ddim
));
// TODO(Superjomn) find some way to determine which device to output the
// tensor.
// if (platform::is_cpu_place(fluid_t->place())) {
// TODO(Superjomn) change this float to dtype size.
auto
size
=
inference
::
analysis
::
AccuDims
(
dims
.
d
,
dims
.
nbDims
)
*
FLAGS_tensorrt_engine_batch_size
;
engine
->
GetOutputInCPU
(
y
,
fluid_t
->
mutable_data
<
float
>
(
platform
::
CPUPlace
()),
size
*
sizeof
(
float
));
...
...
paddle/fluid/operators/tensorrt_engine_op_test.cc
浏览文件 @
8f18f4d1
...
...
@@ -64,36 +64,37 @@ TEST(TensorRTEngineOp, manual) {
LOG
(
INFO
)
<<
"create block desc"
;
framework
::
BlockDesc
block_desc
(
&
program
,
block_
);
LOG
(
INFO
)
<<
"create
mul
op"
;
auto
*
mul
=
block_desc
.
AppendOp
();
mul
->
SetType
(
"mul
"
);
mul
->
SetInput
(
"X"
,
std
::
vector
<
std
::
string
>
({
"x"
}));
// 2 x 4
mul
->
SetInput
(
"Y"
,
std
::
vector
<
std
::
string
>
({
"y"
}));
// 4 x 6
mul
->
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
"z"
}));
// 2 x 6
LOG
(
INFO
)
<<
"create
fc
op"
;
auto
*
fc0
=
block_desc
.
AppendOp
();
fc0
->
SetType
(
"fc
"
);
fc0
->
SetInput
(
"X"
,
std
::
vector
<
std
::
string
>
({
"x"
}));
// 4 x 1 x 1
fc0
->
SetInput
(
"Y"
,
std
::
vector
<
std
::
string
>
({
"y"
}));
// 4 x 6
fc0
->
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
"z"
}));
// 6 x 1 x 1
LOG
(
INFO
)
<<
"create fc op"
;
auto
*
fc
=
block_desc
.
AppendOp
();
fc
->
SetType
(
"mul
"
);
fc
->
SetInput
(
"X"
,
std
::
vector
<
std
::
string
>
({
"z"
}));
fc
->
SetInput
(
"Y"
,
std
::
vector
<
std
::
string
>
({
"y0"
}));
// 6 x 8
fc
->
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
"z0"
}));
// 2 x 8
auto
*
fc
1
=
block_desc
.
AppendOp
();
fc
1
->
SetType
(
"fc
"
);
fc
1
->
SetInput
(
"X"
,
std
::
vector
<
std
::
string
>
({
"z"
}));
fc
1
->
SetInput
(
"Y"
,
std
::
vector
<
std
::
string
>
({
"y0"
}));
// 6 x 8
fc
1
->
SetOutput
(
"Out"
,
std
::
vector
<
std
::
string
>
({
"z0"
}));
// 8 x 1 x 1
// Set inputs' variable shape in BlockDesc
AddTensorToBlockDesc
(
block_
,
"x"
,
std
::
vector
<
int64_t
>
({
2
,
4
}));
// the batch size is 2, so the dims of 'x' is {2, 4, 1, 1}
AddTensorToBlockDesc
(
block_
,
"x"
,
std
::
vector
<
int64_t
>
({
2
,
4
,
1
,
1
}));
AddTensorToBlockDesc
(
block_
,
"y"
,
std
::
vector
<
int64_t
>
({
4
,
6
}));
AddTensorToBlockDesc
(
block_
,
"y0"
,
std
::
vector
<
int64_t
>
({
6
,
8
}));
AddTensorToBlockDesc
(
block_
,
"z"
,
std
::
vector
<
int64_t
>
({
2
,
6
}));
// It is wired, need to copy manually.
*
block_
->
add_ops
()
=
*
mul
->
Proto
();
*
block_
->
add_ops
()
=
*
fc
->
Proto
();
*
block_
->
add_ops
()
=
*
fc0
->
Proto
();
*
block_
->
add_ops
()
=
*
fc
1
->
Proto
();
ASSERT_EQ
(
block_
->
ops_size
(),
2
);
LOG
(
INFO
)
<<
"create tensorrt desc"
;
framework
::
OpDesc
engine_op_desc
(
nullptr
);
engine_op_desc
.
SetType
(
"tensorrt_engine"
);
engine_op_desc
.
SetInput
(
"Xs"
,
std
::
vector
<
std
::
string
>
({
"x"
,
"y"
,
"y0"
}));
engine_op_desc
.
SetInput
(
"Xs"
,
std
::
vector
<
std
::
string
>
({
"x"
}));
engine_op_desc
.
SetOutput
(
"Ys"
,
std
::
vector
<
std
::
string
>
({
"z0"
}));
SetAttr
<
std
::
string
>
(
engine_op_desc
.
Proto
(),
"subgraph"
,
block_
->
SerializeAsString
());
...
...
@@ -207,5 +208,4 @@ TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
}
// namespace operators
}
// namespace paddle
USE_TRT_CONVERTER
(
mul
)
USE_TRT_CONVERTER
(
fc
)
paddle/fluid/platform/cudnn_helper.h
浏览文件 @
8f18f4d1
...
...
@@ -62,9 +62,8 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
#define CUDNN_ENFORCE(condition) \
do { \
cudnnStatus_t status = condition; \
if (status != CUDNN_STATUS_SUCCESS) { \
VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \
PADDLE_THROW("cuDNN call failed"); \
if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \
PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
} \
} while (false)
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
8f18f4d1
...
...
@@ -498,10 +498,7 @@ All parameter, weight, gradient are variables in Paddle.
py
::
class_
<
framework
::
Executor
>
(
m
,
"Executor"
)
.
def
(
py
::
init
<
const
platform
::
Place
&>
())
#ifdef PADDLE_WITH_DISTRIBUTE
.
def
(
"begin_pass"
,
&
Executor
::
BeginPass
)
.
def
(
"end_pass"
,
&
Executor
::
EndPass
)
#endif
.
def
(
"close"
,
&
Executor
::
Close
)
.
def
(
"run"
,
[](
Executor
&
self
,
const
ProgramDesc
&
prog
,
Scope
*
scope
,
int
block_id
,
bool
create_local_scope
,
bool
create_vars
)
{
pybind11
::
gil_scoped_release
release
;
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
8f18f4d1
...
...
@@ -547,6 +547,7 @@ function test_fluid_inference_lib() {
EOF
cd
${
PADDLE_ROOT
}
/paddle/fluid/inference/api/demo_ci
./run.sh
${
PADDLE_ROOT
}
${
WITH_MKL
:-
ON
}
${
WITH_GPU
:-
OFF
}
./clean.sh
fi
}
...
...
python/paddle/fluid/__init__.py
浏览文件 @
8f18f4d1
...
...
@@ -123,7 +123,7 @@ def __bootstrap__():
read_env_flags
=
[
'use_pinned_memory'
,
'check_nan_inf'
,
'benchmark'
,
'warpctc_dir'
,
'eager_delete_scope'
,
'use_mkldnn'
,
'initial_cpu_memory_in_mb'
,
'init_allocated_mem'
'init_allocated_mem'
,
'free_idle_memory'
]
if
core
.
is_compiled_with_dist
():
read_env_flags
.
append
(
'rpc_deadline'
)
...
...
python/paddle/fluid/executor.py
浏览文件 @
8f18f4d1
...
...
@@ -247,6 +247,7 @@ class Executor(object):
p
.
set_place
(
place
)
self
.
executor
=
core
.
Executor
(
p
)
self
.
program_caches
=
dict
()
self
.
_closed
=
False
def
as_lodtensor
(
self
,
data
):
"""
...
...
@@ -348,11 +349,23 @@ class Executor(object):
]
return
outs
def
begin_pass
(
self
):
self
.
executor
.
begin_pass
()
def
close
(
self
):
"""
Close this executor.
You can no long use this executor after calling this method.
For the distributed training, this method would free the resource on PServers related to
the current Trainer.
def
end_pass
(
self
):
self
.
executor
.
end_pass
()
Example:
>>> cpu = core.CPUPlace()
>>> exe = Executor(cpu)
>>> ...
>>> exe.close()
"""
if
not
self
.
_closed
:
self
.
executor
.
close
()
self
.
_closed
=
True
def
run
(
self
,
program
=
None
,
...
...
@@ -405,6 +418,10 @@ class Executor(object):
>>> feed={'X': x},
>>> fetch_list=[loss.name])
"""
if
self
.
_closed
:
raise
RuntimeError
(
"Attempted to use a closed Executor"
)
if
feed
is
None
:
feed
=
{}
if
not
isinstance
(
feed
,
dict
):
...
...
python/paddle/fluid/framework.py
浏览文件 @
8f18f4d1
...
...
@@ -32,7 +32,6 @@ except Exception, e:
import
unique_name
__all__
=
[
'Variable'
,
'Program'
,
'Operator'
,
'Parameter'
,
...
...
@@ -302,7 +301,7 @@ class Variable(object):
__repr__
=
__str__
def
set_desc
(
self
,
input
):
def
_
set_desc
(
self
,
input
):
"""
Set the variable description.
...
...
@@ -347,7 +346,7 @@ class Variable(object):
def
type
(
self
):
return
self
.
desc
.
type
()
def
set_error_clip
(
self
,
error_clip
):
def
_
set_error_clip
(
self
,
error_clip
):
"""
Set the error_clip.
...
...
python/paddle/fluid/io.py
浏览文件 @
8f18f4d1
...
...
@@ -790,101 +790,3 @@ def get_parameter_value_by_name(name, executor, program=None):
program
=
default_main_program
()
var
=
program
.
global_block
().
var
(
name
)
return
get_parameter_value
(
var
,
executor
)
def
get_test_program
(
filelist
,
program
=
None
,
startup_program
=
None
):
"""
Transpile current train program to a program to read test dataset
if the program is using reader ops like "open_files_op".
"""
def
_copy_reader_var_
(
block
,
var
,
new_name
=
None
):
if
new_name
==
None
:
new_name
=
var
.
name
new_var
=
block
.
create_var
(
name
=
str
(
new_name
),
type
=
core
.
VarDesc
.
VarType
.
READER
)
new_var
.
desc
.
set_shapes
(
var
.
desc
.
shapes
())
new_var
.
desc
.
set_dtypes
(
var
.
desc
.
dtypes
())
new_var
.
persistable
=
True
return
new_var
def
_get_test_reader_name
(
train_reader_name
):
return
train_reader_name
+
"_test"
def
_is_reader_op
(
op
):
block
=
op
.
block
if
"Out"
in
op
.
output_names
:
reader_out
=
block
.
vars
[
op
.
output
(
"Out"
)[
0
]]
if
reader_out
.
type
==
core
.
VarDesc
.
VarType
.
READER
:
return
True
return
False
if
program
==
None
:
program
=
default_main_program
()
if
startup_program
==
None
:
startup_program
=
default_startup_program
()
startup_block
=
startup_program
.
global_block
()
# 1. find out the orignal reader var name
startup_reader_op_list
=
[]
for
op
in
startup_block
.
ops
:
if
_is_reader_op
(
op
):
startup_reader_op_list
.
append
(
op
)
if
len
(
startup_reader_op_list
)
==
0
:
return
program
root_reader_op
=
startup_reader_op_list
[
0
]
train_test_reader_map
=
{}
# 2. add operators to startup to read open and read test data files
for
op
in
startup_reader_op_list
:
assert
(
len
(
op
.
output
(
"Out"
))
==
1
)
train_reader_name
=
op
.
output
(
"Out"
)[
0
]
train_reader
=
startup_block
.
vars
[
train_reader_name
]
test_reader
=
_copy_reader_var_
(
startup_block
,
train_reader
,
new_name
=
_get_test_reader_name
(
train_reader_name
))
train_test_reader_map
[
train_reader
.
name
]
=
test_reader
test_op_inputs
=
{}
for
name
in
op
.
input_names
:
train_arg_names
=
op
.
input
(
name
)
test_arg_vars
=
[]
for
arg_name
in
train_arg_names
:
arg_var
=
train_test_reader_map
[
arg_name
]
if
name
==
"UnderlyingReader"
else
startup_block
.
vars
[
arg_name
]
test_arg_vars
.
append
(
arg_var
)
test_op_inputs
[
name
]
=
test_arg_vars
test_op
=
startup_block
.
append_op
(
type
=
op
.
type
,
inputs
=
test_op_inputs
,
outputs
=
{
'Out'
:
[
test_reader
]},
attrs
=
op
.
attrs
)
# root reader op's filelist attr for read test files
if
op
.
type
==
root_reader_op
.
type
:
test_op
.
set_attr
(
"file_names"
,
filelist
)
if
op
.
type
==
"create_multi_pass_reader"
:
test_op
.
set_attr
(
"pass_num"
,
1
)
# 3. rename reader vars in inference program to different name
# to avoid read from train data.
main_block
=
program
.
global_block
()
for
var
in
main_block
.
vars
.
values
():
if
var
.
type
==
core
.
VarDesc
.
VarType
.
READER
:
main_block
.
_rename_var
(
str
(
var
.
name
),
str
(
_get_test_reader_name
(
var
.
name
)))
for
op
in
main_block
.
ops
:
if
op
.
type
==
root_reader_op
.
type
:
test_op
.
set_attr
(
"file_names"
,
filelist
)
if
op
.
type
==
"create_multi_pass_reader"
:
test_op
.
set_attr
(
"pass_num"
,
1
)
startup_program
.
_sync_with_cpp
()
program
.
_sync_with_cpp
()
return
program
python/paddle/fluid/layers/control_flow.py
浏览文件 @
8f18f4d1
...
...
@@ -23,25 +23,17 @@ from ops import logical_and, logical_not, logical_or
import
numpy
__all__
=
[
'split_lod_tensor'
,
'merge_lod_tensor'
,
'While'
,
'Switch'
,
'lod_rank_table'
,
'max_sequence_len'
,
'lod_tensor_to_array'
,
'array_to_lod_tensor'
,
'increment'
,
'array_write'
,
'create_array'
,
'less_than'
,
'equal'
,
'array_read'
,
'shrink_memory'
,
'array_length'
,
'IfElse'
,
'DynamicRNN'
,
'ConditionalBlock'
,
'StaticRNN'
,
'reorder_lod_tensor_by_rank'
,
'ParallelDo'
,
...
...
@@ -1457,7 +1449,7 @@ class IfElse(object):
if
self
.
status
==
IfElse
.
OUT_IF_ELSE_BLOCKS
:
raise
ValueError
(
"input must in true/false blocks"
)
if
id
(
x
)
not
in
self
.
input_table
:
parent_block
=
self
.
parent_block
()
parent_block
=
self
.
_
parent_block
()
out_true
=
parent_block
.
create_var
(
name
=
unique_name
.
generate
(
'ifelse_input'
+
self
.
helper
.
name
),
dtype
=
x
.
dtype
)
...
...
@@ -1483,7 +1475,7 @@ class IfElse(object):
else
:
return
out_false
def
parent_block
(
self
):
def
_
parent_block
(
self
):
current_block
=
self
.
helper
.
main_program
.
current_block
()
return
self
.
helper
.
main_program
.
block
(
current_block
.
parent_idx
)
...
...
@@ -1499,7 +1491,7 @@ class IfElse(object):
out_table
=
self
.
output_table
[
1
if
self
.
status
==
self
.
IN_IF_ELSE_TRUE_BLOCKS
else
0
]
parent_block
=
self
.
parent_block
()
parent_block
=
self
.
_
parent_block
()
for
each_out
in
outs
:
if
not
isinstance
(
each_out
,
Variable
):
raise
TypeError
(
"Each output should be a variable"
)
...
...
python/paddle/fluid/tests/demo/
text_classification
/.gitignore
→
python/paddle/fluid/tests/demo/
file_reader
/.gitignore
浏览文件 @
8f18f4d1
文件已移动
python/paddle/fluid/tests/demo/
text_classification
/convert_data_to_recordio.py
→
python/paddle/fluid/tests/demo/
file_reader
/convert_data_to_recordio.py
浏览文件 @
8f18f4d1
...
...
@@ -35,7 +35,7 @@ if len(sys.argv) == 1:
word_dict
=
paddle
.
dataset
.
imdb
.
word_dict
()
else
:
word_dict
=
load_vocab
(
sys
.
argv
[
1
])
word_dict
[
"<unk>"
]
=
len
(
word_dict
)
word_dict
[
"<unk>"
]
=
len
(
word_dict
)
print
"Dict dim = "
,
len
(
word_dict
)
# input text data
...
...
@@ -50,7 +50,7 @@ feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace())
BATCH_SIZE
=
128
train_reader
=
paddle
.
batch
(
paddle
.
reader
.
shuffle
(
paddle
.
dataset
.
imdb
.
train
(
word_dict
),
buf_size
=
10
000
),
paddle
.
dataset
.
imdb
.
train
(
word_dict
),
buf_size
=
25
000
),
batch_size
=
BATCH_SIZE
)
test_reader
=
paddle
.
batch
(
...
...
python/paddle/fluid/tests/demo/
text_classification
/train.py
→
python/paddle/fluid/tests/demo/
file_reader
/train.py
浏览文件 @
8f18f4d1
...
...
@@ -19,7 +19,7 @@ import sys
TRAIN_FILES
=
[
'train.recordio'
]
TEST_FILES
=
[
'test.recordio'
]
DICT_DIM
=
89528
DICT_DIM
=
5147
# embedding dim
emb_dim
=
128
...
...
@@ -27,58 +27,46 @@ emb_dim = 128
# hidden dim
hid_dim
=
128
# hidden dim2
hid_dim2
=
96
# class num
class_dim
=
2
# epoch num
epoch_num
=
10
def
network_cfg
(
is_train
,
pass_num
=
100
):
with
fluid
.
unique_name
.
guard
():
train_file_obj
=
fluid
.
layers
.
open_files
(
filenames
=
TRAIN_FILES
,
pass_num
=
pass_num
,
shapes
=
[[
-
1
,
1
],
[
-
1
,
1
]],
lod_levels
=
[
1
,
0
],
dtypes
=
[
'int64'
,
'int64'
])
test_file_obj
=
fluid
.
layers
.
open_files
(
filenames
=
TEST_FILES
,
pass_num
=
1
,
def
build_program
(
is_train
):
file_obj_handle
=
fluid
.
layers
.
io
.
open_files
(
filenames
=
TRAIN_FILES
if
is_train
else
TEST_FILES
,
shapes
=
[[
-
1
,
1
],
[
-
1
,
1
]],
lod_levels
=
[
1
,
0
],
dtypes
=
[
'int64'
,
'int64'
])
if
is_train
:
file_obj
=
fluid
.
layers
.
shuffle
(
train_file_obj
,
buffer_size
=
1000
)
else
:
file_obj
=
test_file_obj
file_obj
=
fluid
.
layers
.
io
.
double_buffer
(
file_obj_handle
)
file_obj
=
fluid
.
layers
.
double_buffer
(
file_obj
,
name
=
"train_double_buffer"
if
is_train
else
'test_double_buffer'
)
with
fluid
.
unique_name
.
guard
():
data
,
label
=
fluid
.
layers
.
read_file
(
file_obj
)
emb
=
fluid
.
layers
.
embedding
(
input
=
data
,
size
=
[
DICT_DIM
,
emb_dim
])
# sequence conv with window size = 3
win_size
=
3
conv_3
=
fluid
.
nets
.
sequence_conv_pool
(
input
=
emb
,
num_filters
=
hid_dim
,
filter_size
=
win_size
,
filter_size
=
3
,
act
=
"tanh"
,
pool_type
=
"
max
"
)
pool_type
=
"
sqrt
"
)
# fc layer after conv
fc_1
=
fluid
.
layers
.
fc
(
input
=
[
conv_3
],
size
=
hid_dim2
)
conv_4
=
fluid
.
nets
.
sequence_conv_pool
(
input
=
emb
,
num_filters
=
hid_dim
,
filter_size
=
4
,
act
=
"tanh"
,
pool_type
=
"sqrt"
)
# probability of each class
prediction
=
fluid
.
layers
.
fc
(
input
=
[
fc_1
],
prediction
=
fluid
.
layers
.
fc
(
input
=
[
conv_3
,
conv_4
],
size
=
class_dim
,
act
=
"softmax"
)
# cross entropy loss
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
prediction
,
label
=
label
)
...
...
@@ -88,58 +76,62 @@ def network_cfg(is_train, pass_num=100):
if
is_train
:
# SGD optimizer
sgd_optimizer
=
fluid
.
optimizer
.
Adagrad
(
learning_rate
=
0.01
)
sgd_optimizer
=
fluid
.
optimizer
.
Adagrad
(
learning_rate
=
0.0
0
1
)
sgd_optimizer
.
minimize
(
avg_cost
)
return
{
'loss'
:
avg_cost
,
'log'
:
[
avg_cost
,
acc
],
'file'
:
train_file_obj
if
is_train
else
test_file_obj
}
return
{
'loss'
:
avg_cost
,
'log'
:
[
avg_cost
,
acc
],
'file'
:
file_obj_handle
}
def
main
():
train
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
test
=
fluid
.
Program
()
with
fluid
.
program_guard
(
train
,
startup
):
train_args
=
network_cfg
(
is_train
=
True
)
test
=
fluid
.
Program
()
train_args
=
build_program
(
is_train
=
True
)
with
fluid
.
program_guard
(
test
,
fluid
.
Program
()
):
test_args
=
network_cfg
(
is_train
=
False
)
with
fluid
.
program_guard
(
test
,
startup
):
test_args
=
build_program
(
is_train
=
False
)
use_cuda
=
fluid
.
core
.
is_compiled_with_cuda
()
# startup
place
=
fluid
.
CUDAPlace
(
0
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
=
place
)
exe
.
run
(
startup
)
train_exe
=
fluid
.
ParallelExecutor
(
use_cuda
=
True
,
loss_name
=
train_args
[
'loss'
].
name
,
main_program
=
train
)
use_cuda
=
use_cuda
,
loss_name
=
train_args
[
'loss'
].
name
,
main_program
=
train
)
test_exe
=
fluid
.
ParallelExecutor
(
use_cuda
=
use_cuda
,
main_program
=
test
,
share_vars_from
=
train_exe
)
fetch_var_list
=
[
var
.
name
for
var
in
train_args
[
'log'
]]
for
i
in
xrange
(
sys
.
maxint
):
result
=
map
(
numpy
.
array
,
train_exe
.
run
(
fetch_list
=
fetch_var_list
if
i
%
1000
==
0
else
[]))
if
len
(
result
)
!=
0
:
print
'Train: '
,
result
if
i
%
1000
==
0
:
test_exe
=
fluid
.
ParallelExecutor
(
use_cuda
=
True
,
main_program
=
test
,
share_vars_from
=
train_exe
)
for
epoch_id
in
range
(
epoch_num
):
# train
try
:
batch_id
=
0
while
True
:
loss
,
acc
=
map
(
numpy
.
array
,
train_exe
.
run
(
fetch_list
=
fetch_var_list
))
print
'Train epoch'
,
epoch_id
,
'batch'
,
batch_id
,
'loss:'
,
loss
,
'acc:'
,
acc
batch_id
+=
1
except
fluid
.
core
.
EOFException
:
print
'End of epoch'
,
epoch_id
train_args
[
'file'
].
reset
()
# test
loss
=
[]
acc
=
[]
try
:
while
True
:
loss_np
,
acc_np
=
map
(
numpy
.
array
,
test_exe
.
run
(
fetch_list
=
fetch_var_list
))
loss_np
,
acc_np
=
map
(
numpy
.
array
,
test_exe
.
run
(
fetch_list
=
fetch_var_list
))
loss
.
append
(
loss_np
[
0
])
acc
.
append
(
acc_np
[
0
])
except
:
test_args
[
'file'
].
reset
()
print
'TEST: '
,
numpy
.
mean
(
loss
)
,
numpy
.
mean
(
acc
)
print
'Test loss:'
,
numpy
.
mean
(
loss
),
'acc:'
,
numpy
.
mean
(
acc
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/test_error_clip.py
浏览文件 @
8f18f4d1
...
...
@@ -36,7 +36,7 @@ with fluid.program_guard(main_program=prog):
avg_cost
=
fluid
.
layers
.
mean
(
cost
)
prog_clip
=
prog
.
clone
()
prog_clip
.
block
(
0
).
var
(
hidden1
.
name
).
set_error_clip
(
prog_clip
.
block
(
0
).
var
(
hidden1
.
name
).
_
set_error_clip
(
fluid
.
clip
.
ErrorClipByValue
(
max
=
CLIP_MAX
,
min
=
CLIP_MIN
))
...
...
python/paddle/fluid/tests/test_if_else_op.py
浏览文件 @
8f18f4d1
...
...
@@ -19,6 +19,10 @@ from paddle.fluid.executor import Executor
from
paddle.fluid.optimizer
import
MomentumOptimizer
import
paddle.fluid.core
as
core
import
paddle.fluid
as
fluid
from
paddle.fluid.layers.control_flow
import
split_lod_tensor
from
paddle.fluid.layers.control_flow
import
merge_lod_tensor
from
paddle.fluid.layers.control_flow
import
ConditionalBlock
import
unittest
import
numpy
as
np
...
...
@@ -34,11 +38,10 @@ class TestMNISTIfElseOp(unittest.TestCase):
limit
=
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
'int64'
,
value
=
5
)
cond
=
layers
.
less_than
(
x
=
label
,
y
=
limit
)
true_image
,
false_image
=
layers
.
split_lod_tensor
(
input
=
image
,
mask
=
cond
)
true_image
,
false_image
=
split_lod_tensor
(
input
=
image
,
mask
=
cond
)
true_out
=
layers
.
create_tensor
(
dtype
=
'float32'
)
true_cond
=
layers
.
ConditionalBlock
([
cond
])
true_cond
=
ConditionalBlock
([
cond
])
with
true_cond
.
block
():
hidden
=
layers
.
fc
(
input
=
true_image
,
size
=
100
,
act
=
'tanh'
)
...
...
@@ -46,14 +49,14 @@ class TestMNISTIfElseOp(unittest.TestCase):
layers
.
assign
(
input
=
prob
,
output
=
true_out
)
false_out
=
layers
.
create_tensor
(
dtype
=
'float32'
)
false_cond
=
layers
.
ConditionalBlock
([
cond
])
false_cond
=
ConditionalBlock
([
cond
])
with
false_cond
.
block
():
hidden
=
layers
.
fc
(
input
=
false_image
,
size
=
200
,
act
=
'tanh'
)
prob
=
layers
.
fc
(
input
=
hidden
,
size
=
10
,
act
=
'softmax'
)
layers
.
assign
(
input
=
prob
,
output
=
false_out
)
prob
=
layers
.
merge_lod_tensor
(
prob
=
merge_lod_tensor
(
in_true
=
true_out
,
in_false
=
false_out
,
mask
=
cond
,
x
=
image
)
loss
=
layers
.
cross_entropy
(
input
=
prob
,
label
=
label
)
avg_loss
=
layers
.
mean
(
loss
)
...
...
python/paddle/fluid/tests/unittests/op_test.py
浏览文件 @
8f18f4d1
...
...
@@ -251,7 +251,7 @@ class OpTest(unittest.TestCase):
for
out_name
,
out_dup
in
Operator
.
get_op_outputs
(
self
.
op_type
):
fetch_list
.
append
(
str
(
out_name
))
# fetch_list = map(block.var, fetch_list)
if
not
isinstance
(
fetch_list
[
0
],
Variable
):
if
not
isinstance
(
fetch_list
[
0
],
fluid
.
framework
.
Variable
):
fetch_list
=
map
(
block
.
var
,
fetch_list
)
outs
=
executor
.
run
(
program
,
feed
=
feed_map
,
...
...
python/paddle/fluid/tests/unittests/test_conditional_block.py
浏览文件 @
8f18f4d1
...
...
@@ -18,14 +18,15 @@ import paddle.fluid.core as core
from
paddle.fluid.framework
import
default_startup_program
,
default_main_program
from
paddle.fluid.executor
import
Executor
from
paddle.fluid.backward
import
append_backward
from
paddle.fluid.layers.control_flow
import
ConditionalBlock
import
numpy
class
ConditionalBlock
(
unittest
.
TestCase
):
class
ConditionalBlock
Test
(
unittest
.
TestCase
):
def
test_forward
(
self
):
data
=
layers
.
data
(
name
=
'X'
,
shape
=
[
1
],
dtype
=
'float32'
)
data
.
stop_gradient
=
False
cond
=
layers
.
ConditionalBlock
(
inputs
=
[
data
])
cond
=
ConditionalBlock
(
inputs
=
[
data
])
out
=
layers
.
create_tensor
(
dtype
=
'float32'
)
with
cond
.
block
():
hidden
=
layers
.
fc
(
input
=
data
,
size
=
10
)
...
...
python/paddle/fluid/tests/unittests/test_const_value.py
浏览文件 @
8f18f4d1
...
...
@@ -16,7 +16,7 @@ import unittest
import
paddle.fluid.framework
as
framework
class
Con
ditionalBlock
(
unittest
.
TestCase
):
class
Con
stantTest
(
unittest
.
TestCase
):
def
test_const_value
(
self
):
self
.
assertEqual
(
framework
.
GRAD_VAR_SUFFIX
,
"@GRAD"
)
self
.
assertEqual
(
framework
.
TEMP_VAR_NAME
,
"@TEMP@"
)
...
...
python/paddle/fluid/tests/unittests/test_dyn_rnn.py
浏览文件 @
8f18f4d1
...
...
@@ -17,6 +17,12 @@ import paddle
import
unittest
import
numpy
from
paddle.fluid.layers.control_flow
import
lod_rank_table
from
paddle.fluid.layers.control_flow
import
max_sequence_len
from
paddle.fluid.layers.control_flow
import
lod_tensor_to_array
from
paddle.fluid.layers.control_flow
import
array_to_lod_tensor
from
paddle.fluid.layers.control_flow
import
shrink_memory
class
TestDynRNN
(
unittest
.
TestCase
):
def
setUp
(
self
):
...
...
@@ -38,12 +44,11 @@ class TestDynRNN(unittest.TestCase):
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'float32'
)
rank_table
=
fluid
.
layers
.
lod_rank_table
(
x
=
sent_emb
)
rank_table
=
lod_rank_table
(
x
=
sent_emb
)
sent_emb_array
=
fluid
.
layers
.
lod_tensor_to_array
(
x
=
sent_emb
,
table
=
rank_table
)
sent_emb_array
=
lod_tensor_to_array
(
x
=
sent_emb
,
table
=
rank_table
)
seq_len
=
fluid
.
layers
.
max_sequence_len
(
rank_table
=
rank_table
)
seq_len
=
max_sequence_len
(
rank_table
=
rank_table
)
i
=
fluid
.
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
'int64'
,
value
=
0
)
i
.
stop_gradient
=
False
...
...
@@ -66,7 +71,7 @@ class TestDynRNN(unittest.TestCase):
mem
=
fluid
.
layers
.
array_read
(
array
=
mem_array
,
i
=
i
)
ipt
=
fluid
.
layers
.
array_read
(
array
=
sent_emb_array
,
i
=
i
)
mem
=
fluid
.
layers
.
shrink_memory
(
x
=
mem
,
i
=
i
,
table
=
rank_table
)
mem
=
shrink_memory
(
x
=
mem
,
i
=
i
,
table
=
rank_table
)
hidden
=
fluid
.
layers
.
fc
(
input
=
[
mem
,
ipt
],
size
=
100
,
act
=
'tanh'
)
...
...
@@ -75,8 +80,7 @@ class TestDynRNN(unittest.TestCase):
fluid
.
layers
.
array_write
(
x
=
hidden
,
i
=
i
,
array
=
mem_array
)
fluid
.
layers
.
less_than
(
x
=
i
,
y
=
seq_len
,
cond
=
cond
)
all_timesteps
=
fluid
.
layers
.
array_to_lod_tensor
(
x
=
out
,
table
=
rank_table
)
all_timesteps
=
array_to_lod_tensor
(
x
=
out
,
table
=
rank_table
)
last
=
fluid
.
layers
.
sequence_last_step
(
input
=
all_timesteps
)
logits
=
fluid
.
layers
.
fc
(
input
=
last
,
size
=
1
,
act
=
None
)
loss
=
fluid
.
layers
.
sigmoid_cross_entropy_with_logits
(
...
...
python/paddle/fluid/tests/unittests/test_lod_rank_table.py
浏览文件 @
8f18f4d1
...
...
@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from
paddle.fluid.layers
import
lod_rank_table
,
data
from
paddle.fluid.layers
import
data
from
paddle.fluid.layers.control_flow
import
lod_rank_table
from
paddle.fluid.executor
import
Executor
import
paddle.fluid.core
as
core
import
numpy
...
...
python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
浏览文件 @
8f18f4d1
...
...
@@ -20,6 +20,11 @@ from paddle.fluid.framework import Program, program_guard
from
paddle.fluid.executor
import
Executor
from
paddle.fluid.backward
import
append_backward
from
paddle.fluid.layers.control_flow
import
lod_rank_table
from
paddle.fluid.layers.control_flow
import
max_sequence_len
from
paddle.fluid.layers.control_flow
import
lod_tensor_to_array
from
paddle.fluid.layers.control_flow
import
array_to_lod_tensor
class
TestCPULoDTensorArrayOps
(
unittest
.
TestCase
):
def
place
(
self
):
...
...
@@ -137,13 +142,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
with
program_guard
(
program
):
x
=
layers
.
data
(
name
=
'x'
,
shape
=
[
10
])
x
.
persistable
=
True
table
=
l
ayers
.
l
od_rank_table
(
x
,
level
=
level
)
max_len
=
layers
.
max_sequence_len
(
table
)
table
=
lod_rank_table
(
x
,
level
=
level
)
max_len
=
max_sequence_len
(
table
)
max_len
.
persistable
=
True
array
=
l
ayers
.
l
od_tensor_to_array
(
x
,
table
)
array
=
lod_tensor_to_array
(
x
,
table
)
array
.
persistable
=
True
result
=
layers
.
array_to_lod_tensor
(
array
,
table
)
result
=
array_to_lod_tensor
(
array
,
table
)
result
.
persistable
=
True
exe
=
Executor
(
place
)
scope
=
core
.
Scope
()
...
...
@@ -181,9 +186,9 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
with
program_guard
(
program
):
x
=
layers
.
data
(
name
=
'x'
,
shape
=
[
1
],
dtype
=
'float32'
,
stop_gradient
=
False
)
table
=
l
ayers
.
l
od_rank_table
(
x
,
level
=
0
)
array
=
l
ayers
.
l
od_tensor_to_array
(
x
,
table
)
result
=
layers
.
array_to_lod_tensor
(
array
,
table
)
table
=
lod_rank_table
(
x
,
level
=
0
)
array
=
lod_tensor_to_array
(
x
,
table
)
result
=
array_to_lod_tensor
(
array
,
table
)
mean
=
layers
.
mean
(
result
)
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
浏览文件 @
8f18f4d1
...
...
@@ -107,44 +107,24 @@ class TestMNIST(TestParallelExecutorBase):
label
=
np
.
ones
(
shape
=
[
32
,
1
],
dtype
=
'int64'
)
return
img
,
label
# simple_fc
def
check_simple_fc_convergence
(
self
,
use_cuda
,
use_reduce
=
False
):
def
_compare_reduce_and_allreduce
(
self
,
model
,
use_cuda
,
random_data
=
True
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
return
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
)
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
,
allow_op_delay
=
True
)
img
,
label
=
self
.
_init_data
()
model
,
use_cuda
=
use_cuda
,
use_reduce
=
True
)
self
.
check_network_convergence
(
simple_fc_net
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_cuda
=
use_cuda
,
use_reduce
=
use_reduce
)
model
,
use_cuda
=
use_cuda
,
allow_op_delay
=
True
,
use_reduce
=
True
)
def
check_simple_fc_convergence_with_Reduce
(
self
,
use_cuda
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
return
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
,
use_reduce
=
True
)
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
,
allow_op_delay
=
True
,
use_reduce
=
True
)
img
,
label
=
self
.
_init_data
()
img
,
label
=
self
.
_init_data
(
random_data
)
all_reduce_first_loss
,
all_reduce_last_loss
=
self
.
check_network_convergence
(
simple_fc_net
,
model
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_cuda
=
use_cuda
,
use_reduce
=
False
)
reduce_first_loss
,
reduce_last_loss
=
self
.
check_network_convergence
(
simple_fc_net
,
model
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_cuda
=
use_cuda
,
...
...
@@ -153,7 +133,24 @@ class TestMNIST(TestParallelExecutorBase):
for
loss
in
zip
(
all_reduce_first_loss
,
reduce_first_loss
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
for
loss
in
zip
(
all_reduce_last_loss
,
reduce_last_loss
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-4
)
# simple_fc
def
check_simple_fc_convergence
(
self
,
use_cuda
,
use_reduce
=
False
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
return
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
)
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
,
allow_op_delay
=
True
)
img
,
label
=
self
.
_init_data
()
self
.
check_network_convergence
(
simple_fc_net
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_cuda
=
use_cuda
,
use_reduce
=
use_reduce
)
def
test_simple_fc
(
self
):
# use_cuda
...
...
@@ -162,8 +159,8 @@ class TestMNIST(TestParallelExecutorBase):
def
test_simple_fc_with_new_strategy
(
self
):
# use_cuda, use_reduce
self
.
check_simple_fc_convergence_with_Reduce
(
True
)
self
.
check_simple_fc_convergence_with_Reduce
(
False
)
self
.
_compare_reduce_and_allreduce
(
simple_fc_net
,
True
)
self
.
_compare_reduce_and_allreduce
(
simple_fc_net
,
False
)
def
check_simple_fc_parallel_accuracy
(
self
,
use_cuda
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
...
...
@@ -209,39 +206,13 @@ class TestMNIST(TestParallelExecutorBase):
"label"
:
label
},
use_cuda
=
use_cuda
)
def
check_batchnorm_fc_convergence_use_reduce
(
self
,
use_cuda
):
if
use_cuda
and
not
core
.
is_compiled_with_cuda
():
return
self
.
check_network_convergence
(
fc_with_batchnorm
,
use_cuda
=
use_cuda
,
use_reduce
=
True
)
img
,
label
=
self
.
_init_data
()
all_reduce_first_loss
,
all_reduce_last_loss
=
self
.
check_network_convergence
(
fc_with_batchnorm
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_cuda
=
use_cuda
,
use_reduce
=
False
)
reduce_first_loss
,
reduce_last_loss
=
self
.
check_network_convergence
(
fc_with_batchnorm
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_cuda
=
use_cuda
,
use_reduce
=
True
)
for
loss
in
zip
(
all_reduce_first_loss
,
reduce_first_loss
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
for
loss
in
zip
(
all_reduce_last_loss
,
reduce_last_loss
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-4
)
def
test_batchnorm_fc
(
self
):
self
.
check_batchnorm_fc_convergence
(
True
)
self
.
check_batchnorm_fc_convergence
(
False
)
def
test_batchnorm_fc_with_new_strategy
(
self
):
self
.
check_batchnorm_fc_convergence_use_reduce
(
True
)
self
.
check_batchnorm_fc_convergence_use_reduce
(
False
)
self
.
_compare_reduce_and_allreduce
(
fc_with_batchnorm
,
True
)
self
.
_compare_reduce_and_allreduce
(
fc_with_batchnorm
,
False
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_op.py
浏览文件 @
8f18f4d1
...
...
@@ -120,7 +120,7 @@ class BaseParallelForTest(unittest.TestCase):
pd
=
fluid
.
layers
.
ParallelDo
(
places
,
use_nccl
=
use_nccl
)
data
=
next
(
generator
)
if
isinstance
(
data
,
fluid
.
Variable
):
if
isinstance
(
data
,
fluid
.
framework
.
Variable
):
data
=
[
data
]
with
pd
.
do
():
...
...
python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
浏览文件 @
8f18f4d1
...
...
@@ -15,6 +15,7 @@
import
unittest
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
from
paddle.fluid.layers.control_flow
import
lod_rank_table
import
numpy
...
...
@@ -34,7 +35,7 @@ class TestReorderLoDTensor(unittest.TestCase):
dat
.
stop_gradient
=
False
rank_dat
=
fluid
.
layers
.
data
(
name
=
cls
.
data_desc
[
1
][
0
],
shape
=
cls
.
data_desc
[
1
][
1
])
table
=
fluid
.
layers
.
lod_rank_table
(
rank_dat
)
table
=
lod_rank_table
(
rank_dat
)
new_dat
=
fluid
.
layers
.
reorder_lod_tensor_by_rank
(
x
=
dat
,
rank_table
=
table
)
loss
=
fluid
.
layers
.
reduce_sum
(
new_dat
)
...
...
python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
浏览文件 @
8f18f4d1
...
...
@@ -21,6 +21,9 @@ from paddle.fluid.framework import default_main_program, switch_main_program
from
paddle.fluid.framework
import
Program
import
numpy
as
np
from
paddle.fluid.layers.control_flow
import
shrink_memory
from
paddle.fluid.layers.control_flow
import
lod_rank_table
class
TestShrinkRNNMemoryBase
(
unittest
.
TestCase
):
def
setUp
(
self
):
...
...
@@ -30,15 +33,15 @@ class TestShrinkRNNMemoryBase(unittest.TestCase):
x
.
stop_gradient
=
False
rank_table_tensor
=
layers
.
data
(
'rank_table_tensor'
,
shape
=
[
1
],
dtype
=
'float32'
,
lod_level
=
1
)
table
=
l
ayers
.
l
od_rank_table
(
x
=
rank_table_tensor
)
table
=
lod_rank_table
(
x
=
rank_table_tensor
)
i
=
layers
.
zeros
(
dtype
=
'int64'
,
shape
=
[
1
])
self
.
mem1
=
layers
.
shrink_memory
(
x
=
x
,
i
=
i
,
table
=
table
)
self
.
mem1
=
shrink_memory
(
x
=
x
,
i
=
i
,
table
=
table
)
i
=
layers
.
increment
(
x
=
i
)
i
.
stop_gradient
=
True
self
.
mem2
=
layers
.
shrink_memory
(
x
=
self
.
mem1
,
i
=
i
,
table
=
table
)
self
.
mem2
=
shrink_memory
(
x
=
self
.
mem1
,
i
=
i
,
table
=
table
)
i
=
layers
.
increment
(
x
=
i
)
i
.
stop_gradient
=
True
self
.
mem3
=
layers
.
shrink_memory
(
x
=
self
.
mem2
,
i
=
i
,
table
=
table
)
self
.
mem3
=
shrink_memory
(
x
=
self
.
mem2
,
i
=
i
,
table
=
table
)
mem3_mean
=
layers
.
mean
(
self
.
mem3
)
append_backward
(
loss
=
mem3_mean
)
self
.
x_grad
=
self
.
main_program
.
global_block
().
var
(
'x@GRAD'
)
...
...
python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
浏览文件 @
8f18f4d1
...
...
@@ -19,6 +19,8 @@ import paddle.fluid.layers as layers
from
paddle.fluid.framework
import
Program
,
program_guard
from
paddle.fluid.executor
import
Executor
from
paddle.fluid.backward
import
append_backward
from
paddle.fluid.layers.control_flow
import
split_lod_tensor
from
paddle.fluid.layers.control_flow
import
merge_lod_tensor
class
TestCPULoDTensorArrayOps
(
unittest
.
TestCase
):
...
...
@@ -96,12 +98,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
y
=
layers
.
data
(
name
=
'y'
,
shape
=
[
1
])
y
.
persistable
=
True
out_true
,
out_false
=
layers
.
split_lod_tensor
(
input
=
x
,
mask
=
y
,
level
=
level
)
out_true
,
out_false
=
split_lod_tensor
(
input
=
x
,
mask
=
y
,
level
=
level
)
out_true
.
persistable
=
True
out_false
.
persistable
=
True
out
=
layers
.
merge_lod_tensor
(
out
=
merge_lod_tensor
(
in_true
=
out_true
,
in_false
=
out_false
,
mask
=
y
,
x
=
x
,
level
=
level
)
out
.
persistable
=
True
...
...
@@ -142,9 +143,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
level
=
0
out_true
,
out_false
=
layers
.
split_lod_tensor
(
input
=
x
,
mask
=
y
,
level
=
level
)
out
=
layers
.
merge_lod_tensor
(
out_true
,
out_false
=
split_lod_tensor
(
input
=
x
,
mask
=
y
,
level
=
level
)
out
=
merge_lod_tensor
(
in_true
=
out_true
,
in_false
=
out_false
,
mask
=
y
,
x
=
x
,
level
=
level
)
mean
=
layers
.
mean
(
out
)
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
8f18f4d1
...
...
@@ -38,7 +38,7 @@ from ps_dispatcher import RoundRobin, HashName, PSDispatcher
from
..
import
core
,
framework
from
..framework
import
Program
,
default_main_program
,
\
default_startup_program
,
Block
,
\
Variable
,
Parameter
,
grad_var_name
Parameter
,
grad_var_name
from
details
import
*
LOOKUP_TABLE_TYPE
=
"lookup_table"
...
...
@@ -779,7 +779,9 @@ class DistributeTranspiler(object):
outputs
=
{
"Out"
:
prefetch_output_vars
},
attrs
=
{
"epmap"
:
pserver_endpoints
,
RPC_OP_ROLE_ATTR_NAME
:
RPC_OP_ROLE_ATTR_VALUE
# FIXME(qiao) temporarily disable this config because prefetch
# is not act as other rpc op, it's more like a forward op
# RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
# insert concat_op
...
...
@@ -887,7 +889,8 @@ class DistributeTranspiler(object):
# create table optimize block in pserver program
table_opt_op
=
[
op
for
op
in
self
.
optimize_ops
if
op
.
input
(
"Param"
)[
0
]
==
self
.
table_name
if
'Param'
in
op
.
input_names
and
op
.
input
(
"Param"
)[
0
]
==
self
.
table_name
][
0
]
table_opt_block
=
pserver_program
.
create_block
(
pre_block_idx
)
# only support sgd now
...
...
@@ -1044,7 +1047,6 @@ class DistributeTranspiler(object):
]
def
_clone_var
(
self
,
block
,
var
,
persistable
=
True
):
assert
isinstance
(
var
,
Variable
)
return
block
.
create_var
(
name
=
var
.
name
,
shape
=
var
.
shape
,
...
...
python/paddle/fluid/transpiler/memory_optimization_transpiler.py
浏览文件 @
8f18f4d1
...
...
@@ -14,7 +14,7 @@
from
collections
import
defaultdict
from
..
import
core
from
..framework
import
Program
,
default_main_program
,
Parameter
,
Variable
from
..framework
import
Program
,
default_main_program
,
Parameter
from
..backward
import
_rename_arg_
dtype_to_size
=
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录