PaddlePaddle/Paddle, commit a4951843

Authored on Feb 11, 2020 by sneaxiy
Parent commit: 6dadb5de

inference feed partial data, test=develop
Changes: 22 changed files with 904 additions and 106 deletions (+904 / -106)
Changed files:

paddle/fluid/framework/details/CMakeLists.txt  (+3, -2)
paddle/fluid/framework/details/build_strategy.cc  (+5, -8)
paddle/fluid/framework/details/computation_op_handle.h  (+2, -0)
paddle/fluid/framework/details/eager_deletion_op_handle.cc  (+3, -1)
paddle/fluid/framework/details/eager_deletion_op_handle.h  (+4, -1)
paddle/fluid/framework/details/multi_devices_helper.cc  (+189, -1)
paddle/fluid/framework/details/multi_devices_helper.h  (+6, -0)
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc  (+104, -21)
paddle/fluid/framework/details/parallel_ssa_graph_executor.h  (+24, -3)
paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc  (+1, -1)
paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc  (+1, -1)
paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt  (+1, -1)
paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_pass.cc  (+101, -0)
paddle/fluid/framework/lod_tensor.cc  (+22, -18)
paddle/fluid/framework/parallel_executor.cc  (+137, -17)
paddle/fluid/framework/parallel_executor.h  (+2, -0)
paddle/fluid/operators/reader/read_op.cc  (+4, -0)
paddle/fluid/pybind/reader_py.cc  (+88, -12)
python/paddle/fluid/executor.py  (+3, -14)
python/paddle/fluid/reader.py  (+13, -5)
python/paddle/fluid/tests/unittests/CMakeLists.txt  (+1, -0)
python/paddle/fluid/tests/unittests/test_parallel_executor_inference_feed_partial_data.py  (+190, -0)
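In short, this commit lets a data-parallel CompiledProgram in the inference phase accept a feed that covers fewer samples than there are places; only the devices that actually received data are run and fetched. A minimal usage sketch, adapted from the new unit test added by this commit (names, sizes, and the 4-core setup are illustrative, not part of the change itself):

    import numpy as np
    import paddle.fluid as fluid

    places = fluid.cpu_places(4)                         # 4 devices

    x = fluid.data(name='x', shape=[None, 10], dtype='float32')
    relu_x = fluid.layers.relu(x)

    exe = fluid.Executor(places[0])
    exe.run(fluid.default_startup_program())

    # Compile for data-parallel inference over all 4 places.
    prog = fluid.CompiledProgram(fluid.default_main_program()) \
        .with_data_parallel(places=places)

    # Feed only 2 samples although there are 4 places; after this commit the
    # batch no longer has to cover every device.
    feed = {'x': np.random.rand(2, 10).astype('float32')}
    out, = exe.run(prog, feed=feed, fetch_list=[relu_x])
    print(out.shape)   # (2, 10): results from the devices that were fed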
paddle/fluid/framework/details/CMakeLists.txt

@@ -9,7 +9,7 @@ cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_pr
 cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
-cc_library(multi_devices_helper INTERFACE SRCS multi_devices_helper.cc DEPS graph graph_helper)
+cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
@@ -65,6 +65,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
 set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
+    multi_devices_helper
     sequential_execution_pass
     modify_op_lock_and_record_event_pass
     all_reduce_deps_pass
@@ -72,7 +73,7 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
     eager_deletion_pass
     buffer_shared_inplace_op_pass
     buffer_shared_cross_op_memory_reuse_pass
-    set_reader_device_count_pass)
+    set_reader_device_info_pass)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope ...
paddle/fluid/framework/details/build_strategy.cc

@@ -66,7 +66,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     AppendPrintGraphPass("graph_viz_pass", "_fused_graph");
     AppendMultiDevPass();
-    AppendSetReaderDeviceCountPass();
+    AppendSetReaderDeviceIndexPass();
     AppendMultiGraphOptPasses();
     AppendPassToSetMkldnnAttr("mkldnn_placement_pass");
@@ -225,8 +225,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
                          &strategy_);
   }
-  void AppendSetReaderDeviceCountPass() {
-    AppendPass("set_reader_device_count_pass");
+  void AppendSetReaderDeviceIndexPass() {
+    AppendPass("set_reader_device_index_pass");
   }
   void AppendPrintGraphPass(const std::string &pass_name,
@@ -399,12 +399,9 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                   "GPU, skipped.";
         continue;
       }
-    } else if (pass->Type() == "set_reader_device_count_pass") {
+    } else if (pass->Type() == "set_reader_device_index_pass") {
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
-      pass->Erase(kLocalScopes);
-      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
-                                                    &local_scopes);
     }
     VLOG(1) << "Start Apply Pass " << pass->Type();
     graph = pass->Apply(graph);
@@ -441,7 +438,7 @@ USE_PASS(fuse_sgd_op_pass);
 USE_PASS(fuse_momentum_op_pass);
 USE_PASS(fuse_all_reduce_op_pass);
 USE_PASS(runtime_context_cache_pass);
-USE_PASS(set_reader_device_count_pass);
+USE_PASS(set_reader_device_index_pass);
 #ifdef PADDLE_WITH_MKLDNN
 USE_PASS(mkldnn_placement_pass);
 #endif
paddle/fluid/framework/details/computation_op_handle.h

@@ -34,6 +34,8 @@ class ComputationOpHandle : public OpHandleBase {
   OperatorBase *GetOp() { return op_.get(); }

+  const OperatorBase *GetOp() const { return op_.get(); }
+
   std::string Name() const override;

   const Scope *GetScope() const { return scope_; }
paddle/fluid/framework/details/eager_deletion_op_handle.cc

@@ -31,10 +31,12 @@ namespace framework {
 namespace details {

 EagerDeletionOpHandle::EagerDeletionOpHandle(
-    ir::Node *node, Scope *scope, const platform::Place &place,
+    ir::Node *node, Scope *scope, size_t scope_idx,
+    const platform::Place &place,
     const std::unordered_set<ir::MemOptVarInfo *> &vars, GarbageCollector *gc)
     : OpHandleBase(node),
       scope_(scope),
+      scope_idx_(scope_idx),
       place_(place),
       var_infos_(vars.begin(), vars.end()),
       gc_(gc) {
paddle/fluid/framework/details/eager_deletion_op_handle.h

@@ -34,7 +34,7 @@ namespace details {
 class EagerDeletionOpHandle : public OpHandleBase {
  public:
-  EagerDeletionOpHandle(ir::Node *node, Scope *scope,
+  EagerDeletionOpHandle(ir::Node *node, Scope *scope, size_t scope_idx,
                         const platform::Place &place,
                         const std::unordered_set<ir::MemOptVarInfo *> &vars,
                         GarbageCollector *gc);
@@ -50,6 +50,8 @@ class EagerDeletionOpHandle : public OpHandleBase {
    */
   Priority GetPriority() const override { return kHighest; }

+  size_t GetScopeIdx() const { return scope_idx_; }
+
  protected:
   void RunImpl() override;
@@ -63,6 +65,7 @@ class EagerDeletionOpHandle : public OpHandleBase {
   void CallOnce();

   Scope *scope_;
+  size_t scope_idx_;
   platform::Place place_;
   std::vector<ir::MemOptVarInfo *> var_infos_;  // not own
   GarbageCollector *gc_;                        // not own
paddle/fluid/framework/details/multi_devices_helper.cc

@@ -12,9 +12,197 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include <algorithm>
+#include <unordered_set>
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
+#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"

 namespace paddle {
 namespace framework {
-namespace details {}  // namespace details
+namespace details {
+
+static constexpr size_t kUndefinedDevIdx = -1UL;
+
+static std::unordered_set<std::string> kMultiDeviceOps{
+    "sync_batch_norm",
+    "sync_batch_norm_grad",
+    "allreduce",
+    "c_allreduce_sum",
+    "c_allreduce_prod",
+    "c_allreduce_min",
+    "c_allreduce_max",
+    "c_allgather",
+    "c_reducescatter",
+    "c_broadcast",
+    "c_comm_init",
+    "c_comm_init_all",
+    "c_gen_nccl_id",
+    "c_sync_comm_stream",
+    "send",
+    "recv",
+    "send_barrier",
+    "fetch_barrier",
+};
+
+static size_t GetScopeIdxFromOp(const details::OpHandleBase &op) {
+  if (auto *compute_op =
+          dynamic_cast<const details::ComputationOpHandle *>(&op)) {
+    return kMultiDeviceOps.count(compute_op->GetOp()->Type()) == 0
+               ? compute_op->GetScopeIdx()
+               : kUndefinedDevIdx;
+  } else if (auto *gc_op =
+                 dynamic_cast<const details::EagerDeletionOpHandle *>(&op)) {
+    return gc_op->GetScopeIdx();
+  } else if (auto *share_op =
+                 dynamic_cast<const details::ShareTensorBufferOpHandle *>(&op)) {
+    return share_op->GetScopeIdx();
+  } else {
+    return kUndefinedDevIdx;
+  }
+}
+
+static bool ContainMultiDeviceOp(const ProgramDesc &program,
+                                 size_t begin_block_idx) {
+  for (size_t block_idx = begin_block_idx; block_idx < program.Size();
+       ++block_idx) {
+    for (auto *op_desc : program.Block(block_idx).AllOps()) {
+      if (kMultiDeviceOps.count(op_desc->Type()) > 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+static size_t GetUniqueDeviceIdOfOp(const details::OpHandleBase &op) {
+  size_t dev_idx = GetScopeIdxFromOp(op);
+  if (dev_idx == kUndefinedDevIdx) {
+    return kUndefinedDevIdx;
+  }
+
+  const auto &ins = op.Inputs();
+  const auto &outs = op.Outputs();
+  auto in_outs = ins;
+  in_outs.insert(in_outs.end(), outs.begin(), outs.end());
+
+  for (auto *var : in_outs) {
+    auto *var_handle = dynamic_cast<details::VarHandle *>(var);
+    if (var_handle == nullptr) {
+      continue;
+    }
+    if (dev_idx != var_handle->scope_idx()) {
+      return kUndefinedDevIdx;
+    }
+  }
+  return dev_idx;
+}
+
+std::vector<std::unique_ptr<ir::Graph>> TrySeparateToMultipleSingleDeviceGraphs(
+    ir::Graph *graph) {
+  if (ContainMultiDeviceOp(graph->OriginProgram(), 1)) {
+    return {};
+  }
+
+  size_t place_num = 0;
+  auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+  if (op_handles.empty()) {
+    return {};
+  }
+
+  std::unordered_map<details::OpHandleBase *, size_t> op_to_dev_idx;
+  for (auto &op : op_handles) {
+    auto dev_idx = GetUniqueDeviceIdOfOp(*op);
+    if (dev_idx == kUndefinedDevIdx) {
+      VLOG(10) << "Op " << op->Name() << " is not determined";
+      return {};
+    }
+    place_num = std::max(place_num, dev_idx + 1);
+    op_to_dev_idx[op] = dev_idx;
+  }
+
+  for (auto &op : op_handles) {
+    auto dev_idx = op_to_dev_idx.at(op);
+    for (auto &in_var : op->Inputs()) {
+      if (in_var->GeneratedOp()) {
+        auto iter = op_to_dev_idx.find(in_var->GeneratedOp());
+        if (iter == op_to_dev_idx.end() || iter->second != dev_idx) {
+          return {};
+        }
+      }
+    }
+    for (auto &out_var : op->Outputs()) {
+      for (auto &pending_op : out_var->PendingOps()) {
+        auto iter = op_to_dev_idx.find(pending_op);
+        if (iter == op_to_dev_idx.end() || iter->second != dev_idx) {
+          return {};
+        }
+      }
+    }
+  }
+
+  PADDLE_ENFORCE_GE(
+      place_num, 1,
+      platform::errors::NotFound("No place found, this may be a bug"));
+
+  std::vector<std::unique_ptr<ir::Graph>> graphs(place_num);
+  for (auto &g : graphs) {
+    g.reset(new ir::Graph(ProgramDesc()));
+    g->Set(kGraphVars, new GraphVars(1UL));
+    g->Set(kGraphDepVars, new GraphDepVars());
+  }
+
+  for (auto &op : op_handles) {
+    auto dev_idx = op_to_dev_idx.at(op);
+    auto *ret_graph = graphs[dev_idx].get();
+    auto &ret_vars = ret_graph->Get<GraphVars>(kGraphVars)[0];
+    auto &ret_dummy_vars = ret_graph->Get<GraphDepVars>(kGraphDepVars);
+    auto &origin_vars = graph->Get<GraphVars>(kGraphVars)[dev_idx];
+
+    ret_graph->AddNode(graph->RemoveNode(op->Node()).release());
+
+    auto handler = [&](const std::vector<VarHandleBase *> &vars) {
+      for (auto *var : vars) {
+        if (graph->Nodes().count(var->Node()) > 0) {
+          ret_graph->AddNode(graph->RemoveNode(var->Node()).release());
+          auto *dummy_var = dynamic_cast<DummyVarHandle *>(var);
+          if (dummy_var == nullptr) {
+            ret_vars.emplace(var->Name(), origin_vars.at(var->Name()));
+          } else {
+            ret_dummy_vars.emplace(dummy_var);
+          }
+        }
+      }
+    };
+
+    handler(op->Inputs());
+    handler(op->Outputs());
+  }
+
+  graph->Erase(kGraphVars);
+  graph->Erase(kGraphDepVars);
+
+  return graphs;
+}
+
+bool HasDropLastReadOp(const ir::Graph &graph) {
+  auto ops = ir::FilterByNodeWrapper<OpHandleBase>(graph);
+  for (auto *op : ops) {
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
+    if (compute_op && compute_op->GetOp()->Type() == "read" &&
+        compute_op->GetOp()->Attr<bool>("drop_last")) {
+      VLOG(10) << "The graph has drop_last=True read op";
+      return true;
+    }
+  }
+  VLOG(10) << "The graph does not have drop_last=True read op";
+  return false;
+}
+
+}  // namespace details
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/details/multi_devices_helper.h

@@ -47,6 +47,7 @@ constexpr char kGraphVars[] = "vars";
 constexpr char kNRanks[] = "nranks";

 constexpr char kPlaces[] = "places";
+constexpr char kGlobalScope[] = "global_scope";
 constexpr char kLocalScopes[] = "local_scopes";
 constexpr char kNCCLCtxs[] = "nccl_ctxs";
 constexpr char kUseHierarchicalAllReduce[] = "use_hierarchical_allreduce";
@@ -100,6 +101,11 @@ inline std::vector<std::string> GetOpRoleVarsOrEmpty(const OpDesc &op) {
   return boost::get<std::vector<std::string>>(iter->second);
 }

+std::vector<std::unique_ptr<ir::Graph>> TrySeparateToMultipleSingleDeviceGraphs(
+    ir::Graph *graph);
+
+bool HasDropLastReadOp(const ir::Graph &graph);
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc

@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
 #include <algorithm>
+#include <memory>
 #include <utility>
 #include "paddle/fluid/framework/ir/graph_helper.h"
@@ -21,11 +22,11 @@ namespace paddle {
 namespace framework {
 namespace details {

-std::vector<std::unique_ptr<ir::Graph>>
-ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
+static std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
+    ir::Graph *graph, size_t place_num) {
   std::vector<std::unique_ptr<ir::Graph>> graphs;
-  graphs.reserve(places_.size());
-  for (size_t i = 0; i < places_.size(); ++i) {
+  graphs.reserve(place_num);
+  for (size_t i = 0; i < place_num; ++i) {
     ProgramDesc empty;
     graphs.emplace_back(std::unique_ptr<ir::Graph>(new ir::Graph(empty)));
     auto &g = graphs.back();
@@ -64,7 +65,7 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
     }
   }
-  for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) {
+  for (size_t dev_id = 0; dev_id < place_num; ++dev_id) {
     auto &dev_vars = graphs[dev_id]->Get<GraphVars>(kGraphVars)[0];
     auto &origin_vars = graph->Get<GraphVars>(kGraphVars)[dev_id];
     for (auto &name_pair : origin_vars) {
@@ -85,15 +86,34 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
     const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<Scope *> &local_exec_scopes,
     const std::vector<platform::Place> &places, ir::Graph *graph)
+    // TODO(Yancey1989): Copying graphs is not safely since it deleted the
+    // attrs.
+    : ParallelSSAGraphExecutor(strategy, local_scopes, local_exec_scopes,
+                               places,
+                               SeparateMultiDevicesGraph(graph, places.size())) {}
+
+ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
+    const std::vector<Scope *> &local_exec_scopes,
+    const std::vector<platform::Place> &places,
+    std::vector<std::unique_ptr<ir::Graph>> graphs)
     : strategy_(std::move(strategy)),
       local_scopes_(std::move(local_scopes)),
       pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
-      places_(std::move(places)),
-      // TODO(Yancey1989): Copying graphs is not safely since it deleted the
-      // attrs.
-      graphs_(SeparateMultiDevicesGraph(graph)) {
+      places_(places),
+      graphs_(std::move(graphs)),
+      feed_status_(places.size(), FeedStatus::kNone) {
   PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(places_.size(), graphs_.size(),
+                    platform::errors::InvalidArgument(
+                        "Graph number does not match place number"));
+  PADDLE_ENFORCE_GT(places_.size(), 0,
+                    platform::errors::InvalidArgument(
+                        "place number must be larger than 0"));
   auto seq_allreduce_pass =
       ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
   seq_allreduce_pass->Set<bool>(kUseHierarchicalAllReduce, new bool(false));
@@ -123,22 +143,43 @@ std::vector<ir::Graph *> ParallelSSAGraphExecutor::Graphs() {
   return result;
 }

+enum ExceptionStatus { kSuccess = 0, kEOF, kOther };
+
 FeedFetchList ParallelSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
+  size_t feed_num = std::count(feed_status_.begin(), feed_status_.end(),
+                               FeedStatus::kHasFeed);
+  bool has_feed = (feed_num > 0);
+
+  VLOG(10) << "Feed num " << feed_num;
+
+  size_t place_num = places_.size();
+
   std::vector<std::future<FeedFetchList>> run_futures;
+  std::vector<ExceptionStatus> exception_status(place_num,
+                                                ExceptionStatus::kSuccess);
+
   std::vector<FeedFetchList> fetch_data;
   FeedFetchList ret;
-  fetch_data.reserve(places_.size());
-  ret.reserve(fetch_tensors.size());
+  fetch_data.reserve(place_num);
+  ret.reserve(place_num);
   exception_holder_.Clear();

-  for (size_t i = 0; i < places_.size(); ++i) {
-    auto call = [this, i, &fetch_tensors]() -> FeedFetchList {
+  for (size_t i = 0; i < place_num; ++i) {
+    auto call = [&, i]() -> FeedFetchList {
       try {
-        return executors_[i]->Run(fetch_tensors);
+        if (!support_partial_feed_ || !has_feed ||
+            feed_status_[i] == FeedStatus::kHasFeed) {
+          return executors_[i]->Run(fetch_tensors);
+        } else {
+          return FeedFetchList();
+        }
+      } catch (platform::EOFException &) {
+        exception_status[i] = ExceptionStatus::kEOF;
+        exception_holder_.Catch(std::current_exception());
       } catch (...) {
+        exception_status[i] = ExceptionStatus::kOther;
         exception_holder_.Catch(std::current_exception());
       }
       return FeedFetchList();
@@ -153,21 +194,63 @@ FeedFetchList ParallelSSAGraphExecutor::Run(
   if (pool_) {
     for (auto &f : run_futures) {
       if (exception_holder_.IsCaught()) {
         f.wait();
       } else {
         fetch_data.emplace_back(f.get());
       }
     }
   }

+  bool has_exception = exception_holder_.IsCaught();
+
+  if (!support_partial_feed_ && has_exception) {
+    VLOG(10) << "Exception rethrow because partial feed is not supported";
+    exception_holder_.ReThrow();
+  }
+
+  std::vector<bool> is_valid(place_num, true);
+
+  if (support_partial_feed_) {
+    if (has_feed) {
+      for (size_t i = 0; i < place_num; ++i) {
+        if (feed_status_[i] == FeedStatus::kNone) {
+          is_valid[i] = false;
+        } else if (exception_status[i] != ExceptionStatus::kSuccess) {
+          PADDLE_ENFORCE_EQ(has_exception, true,
+                            platform::errors::InvalidArgument(
+                                "Thread pool raises exception but not caught"));
+          VLOG(10) << "Exception rethrow because non-EOF exception raises when "
+                      "feed is given";
+          exception_holder_.ReThrow();
+        }
+      }
+    } else {
+      for (size_t i = 0; i < place_num; ++i) {
+        if (exception_status[i] == ExceptionStatus::kOther) {
+          PADDLE_ENFORCE_EQ(has_exception, true,
+                            platform::errors::InvalidArgument(
+                                "Thread pool raises exception but not caught"));
+          VLOG(10) << "Exception rethrow because non-EOF exception raises when "
+                      "feed is not given";
+          exception_holder_.ReThrow();
+        } else if (exception_status[i] != ExceptionStatus::kSuccess) {
+          is_valid[i] = false;
+        }
+      }
+    }
+  }
+
-  if (exception_holder_.IsCaught()) {
+  if (std::count(is_valid.begin(), is_valid.end(), true) == 0) {
+    PADDLE_ENFORCE_EQ(has_exception, true,
+                      platform::errors::InvalidArgument(
+                          "Thread pool raises exception but not caught"));
+    VLOG(10) << "Raise exception because there is no success worker";
     exception_holder_.ReThrow();
   }

   for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) {
     std::vector<const LoDTensor *> lodtensor_ptrs;
-    lodtensor_ptrs.reserve(local_scopes_.size());
-    for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) {
+    lodtensor_ptrs.reserve(place_num);
+    for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
+      if (!is_valid[scope_idx]) {
+        continue;
+      }
       lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx));
     }
     ret.emplace_back();
paddle/fluid/framework/details/parallel_ssa_graph_executor.h

@@ -27,12 +27,25 @@ namespace framework {
 namespace details {

 class ParallelSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+  enum FeedStatus {
+    kNone = 0,    // No feed
+    kHasFeed = 1  // Has feed
+  };
+
  public:
   ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<Scope *> &local_exec_scopes,
                            const std::vector<platform::Place> &places,
                            ir::Graph *graph);
+
+  ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
+                           const std::vector<Scope *> &local_scopes,
+                           const std::vector<Scope *> &local_exec_scopes,
+                           const std::vector<platform::Place> &places,
+                           std::vector<std::unique_ptr<ir::Graph>> graphs);
+
   ~ParallelSSAGraphExecutor() final = default;

   const ir::Graph &Graph() const override { return *graphs_[0]; }
@@ -41,10 +54,15 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;

- private:
-  std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
-      ir::Graph *graph);
+  void SetHasFeed(size_t dev_idx, bool has_feed) {
+    feed_status_[dev_idx] = has_feed ? FeedStatus::kHasFeed : FeedStatus::kNone;
+  }
+
+  void EnablePartialFeedSupport() { support_partial_feed_ = true; }
+
+  bool SupportPartialFeed() const { return support_partial_feed_; }

+ private:
   ExecutionStrategy strategy_;
   std::vector<Scope *> local_scopes_;
   std::unique_ptr<::ThreadPool> pool_{nullptr};
@@ -54,6 +72,9 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<std::unique_ptr<details::FastThreadedSSAGraphExecutor>> executors_;
   ExceptionHolder exception_holder_;
+
+  bool support_partial_feed_{false};
+  std::vector<FeedStatus> feed_status_;
 };

 }  // namespace details
paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc

@@ -228,7 +228,7 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const {
     }

     auto *eager_deletion_op = new details::EagerDeletionOpHandle(
-        eager_deletion_node, op->GetScope(), op->GetPlace(),
+        eager_deletion_node, op->GetScope(), op->GetScopeIdx(), op->GetPlace(),
         std::move(var_info), gcs.at(places[op->GetScopeIdx()]).get());

     auto it = std::find_if(
paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc

@@ -98,7 +98,7 @@ class ReferenceCountPassTestHelper {
         ir::PassRegistry::Instance().Get("reference_count_pass");
     ref_cnt_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
     ref_cnt_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars_);
-    ref_cnt_pass->Apply(&graph_);
+    ref_cnt_pass->Apply(&const_cast<ir::Graph &>(executor_->Graph()));
   }

   bool IsLastLivedOps(const std::string &name,
paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt

@@ -11,7 +11,7 @@ endif()
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle ${ALL_REDUCE_OP_HANDLES} reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
 cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
-cc_library(set_reader_device_count_pass SRCS set_reader_device_count_pass.cc DEPS graph graph_helper pass multi_devices_graph_pass)
+cc_library(set_reader_device_info_pass SRCS set_reader_device_info_pass.cc DEPS graph graph_helper pass multi_devices_graph_pass)
 cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS all_reduce_op_handle graph graph_helper pass)
paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_count_pass.cc → paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_pass.cc (renamed)

@@ -22,77 +22,80 @@ namespace paddle {
 namespace framework {
 namespace ir {

-class SetReaderDeviceCountPass : public Pass {
- protected:
-  void ApplyImpl(Graph *graph) const override;
-
- private:
-  int GetDeviceCount() const;
-  std::unordered_set<std::string> ReaderOpSet() const;
-  const Scope *GlobalScope() const;
-};
-
-int SetReaderDeviceCountPass::GetDeviceCount() const {
-  return static_cast<int>(
-      Get<const std::vector<platform::Place>>(details::kPlaces).size());
-}
-
-std::unordered_set<std::string> SetReaderDeviceCountPass::ReaderOpSet() const {
-  return {"create_py_reader"};
-}
-
-const Scope *SetReaderDeviceCountPass::GlobalScope() const {
-  return Get<const std::vector<Scope *>>(details::kLocalScopes)[0];
-}
-
-void SetReaderDeviceCountPass::ApplyImpl(Graph *graph) const {
-  auto dev_cnt = GetDeviceCount();
-  auto reader_ops = ReaderOpSet();
-  auto scope = GlobalScope();
-  size_t found_op_num = 0;
-  for (auto &node : graph->Nodes()) {
-    if (node->IsOp() && node->Op() &&
-        reader_ops.count(node->Op()->Type()) != 0) {
-      auto &op_handle = dynamic_cast<details::ComputationOpHandle &>(
-          node->Wrapper<details::OpHandleBase>());
-      auto *op_desc = node->Op();
-      auto &op_base_attrs =
-          const_cast<framework::AttributeMap &>(op_handle.GetOp()->Attrs());
-      int dev_idx = static_cast<int>(op_handle.GetScopeIdx());
-
-      op_desc->SetAttr("device_index", dev_idx);
-      op_desc->SetAttr("device_count", dev_cnt);
-
-      op_base_attrs["device_index"] = dev_idx;
-      op_base_attrs["device_count"] = dev_cnt;
-
-      auto queue_name = op_handle.GetOp()->Input("blocking_queue");
-      auto var = scope->FindVar(queue_name);
-      PADDLE_ENFORCE_NOT_NULL(
-          var, platform::errors::NotFound(
-                   "Blocking queue of DataLoader not found"));
-      using QueueHolder =
-          operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder;
-      if (var->IsType<QueueHolder>()) {
-        var->GetMutable<QueueHolder>()->GetQueue()->SetDeviceCount(dev_cnt);
-      }
-
-      ++found_op_num;
-      VLOG(10) << "Found op " << op_desc->Type() << " on device " << dev_idx;
-    }
-  }
-  VLOG(10) << "Found op number " << found_op_num;
-}
+static int GetDeviceCountFromPassAttr(const Pass &pass) {
+  return static_cast<int>(
+      pass.Get<const std::vector<platform::Place>>(details::kPlaces).size());
+}
+
+static std::unordered_set<std::string> ReaderOpSet() {
+  return {"create_py_reader"};
+}
+
+class InitReaderDeviceCountPass : public Pass {
+ protected:
+  void ApplyImpl(Graph *graph) const override {
+    using QueueHolder =
+        operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder;
+    auto reader_ops = ReaderOpSet();
+    auto dev_cnt = GetDeviceCountFromPassAttr(*this);
+    const auto &scope = Get<const Scope>(details::kGlobalScope);
+    for (auto &node : graph->Nodes()) {
+      if (node->IsOp() && node->Op() &&
+          reader_ops.count(node->Op()->Type()) != 0) {
+        auto queue_name = node->Op()->Input("blocking_queue")[0];
+        auto var = scope.FindVar(queue_name);
+        if (var && var->IsType<QueueHolder>()) {
+          VLOG(10) << "Set device count of " << queue_name << " to be "
+                   << dev_cnt;
+          var->GetMutable<QueueHolder>()->GetQueue()->SetDeviceCount(dev_cnt);
+        }
+      }
+    }
+  }
+};
+
+class SetReaderDeviceIndexPass : public Pass {
+ protected:
+  void ApplyImpl(Graph *graph) const override {
+    auto dev_cnt = GetDeviceCountFromPassAttr(*this);
+    auto reader_ops = ReaderOpSet();
+    size_t found_op_num = 0;
+
+    for (auto &node : graph->Nodes()) {
+      if (node->IsOp() && node->Op() &&
+          reader_ops.count(node->Op()->Type()) != 0) {
+        auto &op_handle = dynamic_cast<details::ComputationOpHandle &>(
+            node->Wrapper<details::OpHandleBase>());
+        auto *op_desc = node->Op();
+        auto &op_base_attrs =
+            const_cast<framework::AttributeMap &>(op_handle.GetOp()->Attrs());
+        int dev_idx = static_cast<int>(op_handle.GetScopeIdx());
+
+        op_desc->SetAttr("device_index", dev_idx);
+        op_desc->SetAttr("device_count", dev_cnt);
+
+        op_base_attrs["device_index"] = dev_idx;
+        op_base_attrs["device_count"] = dev_cnt;
+
+        ++found_op_num;
+        VLOG(10) << "Found op " << op_desc->Type() << " on device " << dev_idx;
+      }
+    }
+
+    VLOG(10) << "Found op number " << found_op_num;
+  }
+};

 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle

-REGISTER_PASS(set_reader_device_count_pass,
-              paddle::framework::ir::SetReaderDeviceCountPass)
+REGISTER_PASS(init_reader_device_count_pass,
+              paddle::framework::ir::InitReaderDeviceCountPass)
+    .RequirePassAttr(paddle::framework::details::kGlobalScope)
     .RequirePassAttr(paddle::framework::details::kPlaces);
+REGISTER_PASS(set_reader_device_index_pass,
+              paddle::framework::ir::SetReaderDeviceIndexPass)
+    .RequirePassAttr(paddle::framework::details::kPlaces);
paddle/fluid/framework/lod_tensor.cc

@@ -307,18 +307,18 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
     const std::vector<platform::Place> places) const {
   PADDLE_ENFORCE_GT(places.size(), 0,
                     platform::errors::InvalidArgument(
                         "place number cannot be empty when splitting"));
   check_memory_size();
-  int batch_size =
-      lod().empty() ? dims()[0] : static_cast<int>(lod()[0].size()) - 1;
-  size_t result_size = std::min(static_cast<size_t>(batch_size), places.size());
-  size_t remainder = batch_size % places.size();
-
-  std::vector<LoDTensor> results;
-  results.reserve(result_size);
-
-  // if result_size(batch_size) is 0, just return #places.size() copys of empty
+  size_t batch_size = lod().empty() ? static_cast<size_t>(dims()[0])
+                                    : lod()[0].size() - 1;
+
+  // if batch_size is 0, just return #places.size() copys of empty
   // tensors.
-  if (result_size == 0) {
+  if (batch_size == 0) {
+    std::vector<LoDTensor> empty_results;
+    empty_results.reserve(places.size());
     for (size_t i = 0; i < places.size(); ++i) {
       LoDTensor dst;
       dst.Resize(dims());
@@ -326,18 +326,22 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
       if (!lod().empty()) {
         dst.set_lod(lod());
       }
-      results.emplace_back(dst);
+      empty_results.emplace_back(std::move(dst));
     }
-    return results;
+    return empty_results;
   }

-  int step_width = static_cast<int>(batch_size / result_size);
+  auto step_width = (batch_size + places.size() - 1) / places.size();
+  auto result_size = (batch_size + step_width - 1) / step_width;
+  std::vector<LoDTensor> results;
+  results.reserve(result_size);
+
   for (size_t i = 0; i < result_size; ++i) {
-    int begin = static_cast<int>(i * step_width);
-    int end = static_cast<int>((i + 1) * step_width);
-    if (i + 1 == places.size()) {  // last
-      end += remainder;
-    }
+    auto begin = i * step_width;
+    auto end = std::min<size_t>((i + 1) * step_width, batch_size);
+    PADDLE_ENFORCE_LT(begin, end,
+                      platform::errors::InvalidArgument(
+                          "begin must be less than end, this may be a bug"));

     LoDTensor dst;
     if (lod().empty()) {
@@ -362,7 +366,7 @@ std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
       }
       dst.set_lod(my_lod);
     }
-    results.emplace_back(dst);
+    results.emplace_back(std::move(dst));
   }

   return results;
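For reference, the new splitting rule rounds the per-device slice size up rather than giving the whole remainder to the last place, so a short batch simply produces fewer slices than there are places. A small plain-Python sketch of the same arithmetic (numbers only, not the LoDTensor API):

    def split_ranges(batch_size, place_num):
        # Mirrors the ceil-division logic in LoDTensor::SplitLoDTensor above.
        step_width = (batch_size + place_num - 1) // place_num
        result_size = (batch_size + step_width - 1) // step_width
        return [(i * step_width, min((i + 1) * step_width, batch_size))
                for i in range(result_size)]

    print(split_ranges(10, 4))  # [(0, 3), (3, 6), (6, 9), (9, 10)]
    print(split_ranges(2, 4))   # [(0, 1), (1, 2)] -> only 2 of 4 places get data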
paddle/fluid/framework/parallel_executor.cc

@@ -55,8 +55,9 @@ static bool gProfileStarted = false;
 class ParallelExecutorPrivate {
  public:
-  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
-      : places_(places) {
+  ParallelExecutorPrivate(const std::vector<platform::Place> &places,
+                          Scope *global_scope)
+      : places_(places), global_scope_(global_scope) {
     if (!FLAGS_pe_profile_fname.empty()) {
       std::call_once(gProfileOnce, [] {
 #ifdef WITH_GPERFTOOLS
@@ -82,6 +83,19 @@ class ParallelExecutorPrivate {
     }
   }

+  void InitReaderDeviceCount(ir::Graph *graph) const {
+    auto pass =
+        ir::PassRegistry::Instance().Get("init_reader_device_count_pass");
+    pass->SetNotOwned<const Scope>(details::kGlobalScope, global_scope_);
+    pass->SetNotOwned<const std::vector<platform::Place>>(details::kPlaces,
+                                                          &places_);
+    pass->Apply(graph);
+  }
+
+  void SetHasFeed(size_t dev_idx, bool has_feed = true);
+
+  bool AllowPartialFeed() const;
+
   ir::Graph *ApplyMemoryOptimizePass(ir::Graph *graph);

   inline bool HasGarbageCollectors() const { return !gcs_.empty(); }
@@ -257,8 +271,20 @@ class ParallelExecutorPrivate {
   ir::MemOptVarInfoMapList mem_opt_var_infos_;
   ir::GarbageCollectorMap gcs_;
+
+  details::ParallelSSAGraphExecutor *inference_executor_{nullptr};
 };

+void ParallelExecutorPrivate::SetHasFeed(size_t dev_idx, bool has_feed) {
+  if (inference_executor_) {
+    inference_executor_->SetHasFeed(dev_idx, has_feed);
+  }
+}
+
+bool ParallelExecutorPrivate::AllowPartialFeed() const {
+  return inference_executor_ && inference_executor_->SupportPartialFeed();
+}
+
 ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
   if (FLAGS_use_ngraph) {
     LOG_FIRST_N(WARNING, 1)
@@ -379,6 +405,21 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
   return graph;
 }

+class ResetHasFeedGuard {
+ public:
+  explicit ResetHasFeedGuard(ParallelExecutorPrivate *pe_member)
+      : pe_member_(pe_member) {}
+
+  ~ResetHasFeedGuard() {
+    for (size_t i = 0; i < pe_member_->places_.size(); ++i) {
+      pe_member_->SetHasFeed(i, false);
+    }
+  }
+
+ private:
+  ParallelExecutorPrivate *pe_member_;
+};
+
 size_t ParallelExecutor::DeviceCount() const { return member_->places_.size(); }

 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -407,8 +448,8 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
                                    const ExecutionStrategy &exec_strategy,
                                    const BuildStrategy &build_strategy,
                                    ir::Graph *graph)
-    : member_(new ParallelExecutorPrivate(places)) {
-  member_->global_scope_ = scope;
+    : member_(new ParallelExecutorPrivate(places, scope)) {
+  member_->InitReaderDeviceCount(graph);
   member_->use_cuda_ = exec_strategy.use_cuda_;
   member_->build_strategy_ = build_strategy;
   member_->use_all_reduce_ = member_->build_strategy_.reduce_ ==
@@ -606,18 +647,35 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
         "Paddle should be compiled with CUDA for ParallelGraph Execution.");
 #endif
   } else {
-    if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
-      VLOG(3) << "use ThreadedSSAGraphExecutor";
-      member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
+    bool has_drop_last_read_op = details::HasDropLastReadOp(*graph);
+    auto possible_inference_graphs =
+        details::TrySeparateToMultipleSingleDeviceGraphs(graph);
+    if (!possible_inference_graphs.empty()) {
+      VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase";
+      auto *pg_exe = new details::ParallelSSAGraphExecutor(
           exec_strategy, member_->local_scopes_, member_->local_exec_scopes_,
-          member_->places_, graph));
+          member_->places_, std::move(possible_inference_graphs));
+      if (!has_drop_last_read_op) {
+        VLOG(5) << "Enable partial feed support in inference phase";
+        pg_exe->EnablePartialFeedSupport();
+      }
+      final_graphs = pg_exe->Graphs();
+      member_->executor_.reset(pg_exe);
+      member_->inference_executor_ = pg_exe;
     } else {
-      VLOG(3) << "use FastThreadedSSAGraphExecutor";
-      member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
-          exec_strategy, member_->local_scopes_, member_->local_exec_scopes_,
-          member_->places_, graph));
+      if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
+        VLOG(3) << "use ThreadedSSAGraphExecutor";
+        member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
+            exec_strategy, member_->local_scopes_, member_->local_exec_scopes_,
+            member_->places_, graph));
+      } else {
+        VLOG(3) << "use FastThreadedSSAGraphExecutor";
+        member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
+            exec_strategy, member_->local_scopes_, member_->local_exec_scopes_,
+            member_->places_, graph));
+      }
+      final_graphs.emplace_back(graph);
     }
-    final_graphs.emplace_back(graph);
   }

   VLOG(3) << "use ScopeBufferedSSAGraphExecutor";
@@ -724,6 +782,8 @@ FeedFetchList ParallelExecutor::Run(
   platform::RecordBlock b(0);

+  ResetHasFeedGuard reset_has_feed_guard(member_);
+
   ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), fetch_tensors,
                                 member_->HasGarbageCollectors());
@@ -734,10 +794,22 @@ FeedFetchList ParallelExecutor::Run(
 void ParallelExecutor::FeedTensorsIntoLocalScopes(
     const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
-  PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size());
+  if (!member_->AllowPartialFeed()) {
+    PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size(),
+                      platform::errors::InvalidArgument(
+                          "The feed tensor number does not match the device number"));
+  } else {
+    PADDLE_ENFORCE_GE(member_->local_scopes_.size(), tensors.size(),
+                      platform::errors::InvalidArgument(
+                          "The feed tensor number exceeds the device number"));
+  }

   for (size_t i = 0; i < tensors.size(); ++i) {
     auto &map = tensors[i];
+    if (!map.empty()) {
+      member_->SetHasFeed(i);
+    }
+
     for (auto &pair : map) {
       bool is_persistable = member_->IsPersistable(pair.first);
       if (!is_persistable) {
@@ -757,6 +829,11 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes(
 void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
     const std::unordered_map<std::string, LoDTensor> &tensors) {
   size_t num_places = member_->places_.size();
+  bool allow_partial_feed = member_->AllowPartialFeed();
+
+  size_t persistable_feed_len = -1UL;
+  size_t non_persistable_feed_len = -1UL;
+
   for (auto &pair : tensors) {
     bool is_persistable = member_->IsPersistable(pair.first);
     VLOG(3) << "Split " << (is_persistable ? "persistable" : "no persistable")
@@ -764,7 +841,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
             << ", place: " << pair.second.place();
     auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
     bool is_cpu_place = platform::is_cpu_place(member_->places_.front());
-    if (!is_persistable && num_places != lod_tensors.size()) {
+    if (!is_persistable && num_places != lod_tensors.size() &&
+        !allow_partial_feed) {
       auto error_info = string::Sprintf(
           "The number(%d) of samples[%s] of current batch is less than the "
          "count(%d) of devices(%s), currently, it is not allowed. ",
@@ -790,7 +868,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
         framework::TensorCopy(pair.second, member_->places_.at(i), &tmp);
       }
     }
-    if (lod_tensors.size() != num_places) {
+    if (lod_tensors.size() != num_places && !allow_partial_feed) {
      auto error_info = string::Sprintf(
          "The number(%d) of samples[%s] of the current batch does not match "
          "the count(%d) of devices(%s). Because that %s is a persistable "
@@ -804,7 +882,31 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
       }
     }

-    for (size_t j = 0; j < num_places; ++j) {
+    if (allow_partial_feed) {
+      if (is_persistable) {
+        if (persistable_feed_len == -1UL) {
+          persistable_feed_len = lod_tensors.size();
+        } else {
+          PADDLE_ENFORCE_EQ(persistable_feed_len, lod_tensors.size(),
+                            platform::errors::InvalidArgument(
+                                "The feeded number of different persistable "
+                                "variables should be the same"));
+        }
+      } else {
+        if (non_persistable_feed_len == -1UL) {
+          non_persistable_feed_len = lod_tensors.size();
+        } else {
+          PADDLE_ENFORCE_EQ(non_persistable_feed_len, lod_tensors.size(),
+                            platform::errors::InvalidArgument(
+                                "The feeded number of different non-persistable "
+                                "variables should be the same"));
+        }
+      }
+    }
+
+    for (size_t j = 0; j < lod_tensors.size(); ++j) {
       auto *feed_scope = is_persistable ? member_->local_scopes_[j]
                                         : member_->local_exec_scopes_[j];
       auto *feed_var = feed_scope->Var(pair.first);
@@ -814,6 +916,19 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
       t->set_lod(lod_tensors[j].lod());
     }
   }
+
+  if (allow_partial_feed && persistable_feed_len != -1UL &&
+      non_persistable_feed_len != -1UL) {
+    VLOG(10) << "Persistable len " << persistable_feed_len;
+    VLOG(10) << "Non persistable len " << non_persistable_feed_len;
+    PADDLE_ENFORCE_GE(persistable_feed_len, non_persistable_feed_len,
+                      platform::errors::InvalidArgument(
+                          "The feeded number of persistable variables should "
+                          "not be less than non-persistable variables"));
+    for (size_t i = 0; i < non_persistable_feed_len; ++i) {
+      member_->SetHasFeed(i);
+    }
+  }
 }

 ParallelExecutor::~ParallelExecutor() {
@@ -864,6 +979,10 @@ bool ParallelExecutor::EnableParallelGraphExecution(
   return enable_parallel_graph;
 }

+const ir::Graph &ParallelExecutor::Graph() const {
+  return member_->executor_->Graph();
+}
+
 }  // namespace framework
 }  // namespace paddle
@@ -871,3 +990,4 @@ USE_PASS(reference_count_pass);
 USE_PASS(eager_deletion_pass);
 USE_PASS(buffer_shared_inplace_pass);
 USE_PASS(buffer_shared_cross_op_memory_reuse_pass);
+USE_PASS(init_reader_device_count_pass);
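FeedTensorsIntoLocalScopes above now also accepts a feed list that is shorter than the number of places when partial feed is allowed, marking only the devices that received a dict. A sketch of that list-style feed, following the new unit test (the program and sizes are illustrative):

    import numpy as np
    import paddle.fluid as fluid

    places = fluid.cpu_places(4)
    x = fluid.data(name='x', shape=[None, 10], dtype='float32')
    relu_x = fluid.layers.relu(x)

    exe = fluid.Executor(places[0])
    exe.run(fluid.default_startup_program())
    prog = fluid.CompiledProgram(fluid.default_main_program()) \
        .with_data_parallel(places=places)

    # One feed dict per device, but only for 2 of the 4 places.
    feed_list = [{'x': np.random.rand(1, 10).astype('float32')}
                 for _ in range(2)]
    out, = exe.run(prog, feed=feed_list, fetch_list=[relu_x])
    print(out.shape)   # (2, 10): outputs of the two devices that were fed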
paddle/fluid/framework/parallel_executor.h

@@ -79,6 +79,8 @@ class ParallelExecutor {

   FeedFetchList Run(const std::vector<std::string> &fetch_tensors);

+  const ir::Graph &Graph() const;
+
  private:
   // broadcast the parameters from the 0th device.
   // trainer_id the trainer index in nccl distributed training.
paddle/fluid/operators/reader/read_op.cc

@@ -156,6 +156,10 @@ class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
                   " and it is set by ParallelExecutor instance, not users.")
         .SetDefault(true);
     AddAttr<bool>("infer_out", "").SetDefault(true);
+    AddAttr<bool>("drop_last",
+                  "Whether to drop last batches whose number is less than CPU "
+                  "cores/GPU cards number")
+        .SetDefault(true);
     AddComment(R"DOC(
 Read Operator
paddle/fluid/pybind/reader_py.cc

@@ -20,6 +20,7 @@
 #include <utility>
 #include <vector>
 #include "Python.h"
+#include "boost/optional.hpp"
 #include "gflags/gflags.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/reader.h"
@@ -41,6 +42,58 @@ namespace pybind {
 namespace py = pybind11;
 namespace reader = operators::reader;

+// Check whether the tensor shape matches the VarDesc shape
+// Return the different shape if exists
+static boost::optional<std::vector<int64_t>> DiffTensorShapeWithVarDesc(
+    const framework::LoDTensor &tensor, const framework::VarDesc &var_desc,
+    size_t num_places) {
+  auto tensor_shape = tensor.dims();
+  auto desc_shape = var_desc.GetShape();
+
+  int64_t rank = tensor_shape.size();
+
+  if (UNLIKELY(rank == 0)) {
+    if (desc_shape.size() != 0) {  // Tensor rank = 0 but desc does not match
+      return framework::vectorize<int64_t>(tensor_shape);
+    } else {
+      return boost::none;
+    }
+  }
+
+  PADDLE_ENFORCE_GE(tensor_shape[0], 0,
+                    platform::errors::InvalidArgument(
+                        "Tensor shape must not be less than 0"));
+
+  if (!tensor.lod().empty()) {
+    tensor_shape[0] = -1;  // unknown shape
+  } else {
+    int64_t split_size = (tensor_shape[0] + num_places - 1) / num_places;
+    int64_t remainder = (split_size == 0 ? 0 : tensor_shape[0] % split_size);
+    tensor_shape[0] = split_size;
+    if (desc_shape[0] >= 0) {  // need check dim 0
+      if (tensor_shape[0] != desc_shape[0]) {
+        return framework::vectorize<int64_t>(tensor_shape);
+      }
+
+      if (remainder > 0) {
+        tensor_shape[0] = remainder;
+        return framework::vectorize<int64_t>(tensor_shape);
+      }
+    }
+  }
+
+  for (int64_t idx = 1; idx < rank; ++idx) {
+    PADDLE_ENFORCE_GE(tensor_shape[idx], 0,
+                      platform::errors::InvalidArgument(
+                          "Tensor shape must not be less than 0"));
+    if (desc_shape[idx] >= 0 && tensor_shape[idx] != desc_shape[idx]) {
+      return framework::vectorize<int64_t>(tensor_shape);
+    }
+  }
+
+  return boost::none;
+}
+
 static const std::shared_ptr<reader::LoDTensorBlockingQueue> &GetQueue(
     const std::shared_ptr<reader::LoDTensorBlockingQueue> &queue, size_t idx) {
   return queue;
@@ -66,10 +119,12 @@ class MultiDeviceFeedReader {
       const std::vector<std::vector<int>> &shapes,
       const std::vector<framework::proto::VarType::Type> &dtypes,
       const std::vector<bool> &need_check_feed,
-      const std::vector<platform::Place> &dst_places, bool use_double_buffer)
+      const std::vector<platform::Place> &dst_places, bool use_double_buffer,
+      bool drop_last)
       : queue_(queue),
         names_(names),
-        pool_(new ::ThreadPool(dst_places.size())) {
+        pool_(new ::ThreadPool(dst_places.size())),
+        drop_last_(drop_last) {
     std::vector<framework::DDim> dims;
     for (auto &shape : shapes) {
       dims.push_back(framework::make_ddim(shape));
@@ -113,12 +168,16 @@ class MultiDeviceFeedReader {
     ReadAsync();
   }

+  bool DropLast() const { return drop_last_; }
+
   ResultDictList ReadNext() {
     CheckNextStatus();
     ResultDictList result(ret_.size());
     for (size_t i = 0; i < ret_.size(); ++i) {
-      for (size_t j = 0; j < names_.size(); ++j) {
-        result[i].emplace(names_[j], std::move(ret_[i][j]));
+      if (!ret_[i].empty()) {
+        for (size_t j = 0; j < names_.size(); ++j) {
+          result[i].emplace(names_[j], std::move(ret_[i][j]));
+        }
       }
     }
     ReadAsync();
@@ -155,24 +214,29 @@ class MultiDeviceFeedReader {
   };

   Status WaitFutures(std::exception_ptr *excep) {
-    bool is_success = true;
     *excep = nullptr;
+    size_t success_num = 0;
     for (size_t i = 0; i < futures_.size(); ++i) {
       auto each_status = futures_[i].get();
       if (UNLIKELY(each_status != Status::kSuccess)) {
-        is_success = false;
         if (UNLIKELY(each_status == Status::kException)) {
           PADDLE_ENFORCE_NOT_NULL(exceptions_[i]);
           *excep = exceptions_[i];
           exceptions_[i] = nullptr;
         }
+      } else {
+        ++success_num;
       }
     }

     if (UNLIKELY(*excep)) {
       return Status::kException;
+    }
+
+    if (drop_last_) {
+      return success_num == futures_.size() ? Status::kSuccess : Status::kEOF;
     } else {
-      return is_success ? Status::kSuccess : Status::kEOF;
+      return success_num > 0 ? Status::kSuccess : Status::kEOF;
     }
   }
@@ -226,6 +290,7 @@ class MultiDeviceFeedReader {
   std::vector<std::exception_ptr> exceptions_;

   std::vector<std::vector<framework::LoDTensor>> ret_;
+  bool drop_last_;
 };

 template <typename QueueType>
@@ -270,6 +335,17 @@ void BindMultiDeviceReader(py::module *module, const char *reader_name) {
 void BindReader(py::module *module) {
   auto &m = *module;

+  m.def("diff_tensor_shape", [](const framework::LoDTensor &tensor,
+                                const framework::VarDesc &var_desc,
+                                size_t num_places) -> py::object {
+    auto diff = DiffTensorShapeWithVarDesc(tensor, var_desc, num_places);
+    if (diff) {
+      return py::cast(std::move(diff.get()));
+    } else {
+      return py::cast(nullptr);
+    }
+  });
+
   m.def("init_lod_tensor_blocking_queue",
         [](framework::Variable &var, size_t capacity, bool is_ordered) -> py::object {
@@ -337,10 +413,10 @@ void BindReader(py::module *module) {
            const std::vector<framework::proto::VarType::Type> &dtypes,
            const std::vector<bool> &need_check_feed,
            const std::vector<platform::Place> &dst_places,
-           bool use_double_buffer) {
+           bool use_double_buffer, bool drop_last) {
          return new MultiDeviceFeedReader<reader::LoDTensorBlockingQueue>(
              queue, names, shapes, dtypes, need_check_feed, dst_places,
-             use_double_buffer);
+             use_double_buffer, drop_last);
        },
        py::return_value_policy::take_ownership);
@@ -352,13 +428,13 @@ void BindReader(py::module *module) {
            const std::vector<std::vector<int>> &shapes,
            const std::vector<framework::proto::VarType::Type> &dtypes,
            const std::vector<bool> &need_check_feed,
-           const std::vector<platform::Place> &dst_places, bool use_double_buffer) {
+           const std::vector<platform::Place> &dst_places, bool use_double_buffer,
+           bool drop_last) {
+         queue->SetDeviceCount(dst_places.size());
          return new MultiDeviceFeedReader<reader::OrderedMultiDeviceLoDTensorBlockingQueue>(
              queue, names, shapes, dtypes, need_check_feed, dst_places,
-             use_double_buffer);
+             use_double_buffer, drop_last);
        },
        py::return_value_policy::take_ownership);
 }
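The dim-0 check in DiffTensorShapeWithVarDesc above compares the per-device slice size rather than the whole batch. A plain-Python mirror of that arithmetic, only as a sketch of the ceil-split rule (not the pybind API):

    def per_device_dim0(batch_size, num_places):
        # Mirrors the split_size / remainder computation in DiffTensorShapeWithVarDesc.
        split_size = (batch_size + num_places - 1) // num_places
        remainder = 0 if split_size == 0 else batch_size % split_size
        return split_size, remainder

    print(per_device_dim0(10, 4))  # (3, 1): most devices see 3 samples, one sees 1
    print(per_device_dim0(8, 4))   # (2, 0): an even split, nothing left over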
python/paddle/fluid/executor.py

@@ -216,18 +216,12 @@ def check_feed_shape_type(var, feed, num_places=1):
         the feed value
     """
     if var.desc.need_check_feed():
-        feed_shape = feed.shape()
-        if six.PY2:
-            feed_shape[0] = long(feed_shape[0] /
-                                 num_places) if len(feed.lod()) == 0 else -1
-        else:
-            feed_shape[0] = int(feed_shape[0] /
-                                num_places) if len(feed.lod()) == 0 else -1
-        if not dimension_is_compatible_with(feed_shape, var.shape):
+        diff_shape = core.diff_tensor_shape(feed, var.desc, num_places)
+        if diff_shape is not None:
             raise ValueError(
                 'The feeded Variable %r should have dimensions = %d, shape = '
                 '%r, but received feeded shape %r on each device' %
-                (var.name, len(var.shape), var.shape, feed_shape))
+                (var.name, len(var.shape), var.shape, diff_shape))
         if not dtype_is_compatible_with(feed._dtype(), var.dtype):
             var_dtype_format = convert_dtype(var.dtype) if isinstance(
                 var.dtype, core.VarDesc.VarType) else var.dtype
@@ -646,11 +640,6 @@ class Executor(object):
                 exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict)
             elif isinstance(feed, list) or isinstance(feed, tuple):
-                if len(feed) != len(program._places):
-                    raise ValueError(
-                        "Feed a list of tensor, the list should be the same size as places"
-                    )
-
                 res = list()
                 for i, each in enumerate(feed):
                     if not isinstance(each, dict):
python/paddle/fluid/reader.py

@@ -88,6 +88,7 @@ class DataLoader(object):
                        iterable=True,
                        return_list=False,
                        use_multiprocess=False,
+                       drop_last=True,
                        keep_order=False):
         """
         Create a DataLoader object for loading data from Python generator.
@@ -134,6 +135,9 @@ class DataLoader(object):
                 can be used in the dygraph mode. In the static graph mode,
                 whether this parameter is set or not has no effect.
                 The Default value is False.
+            drop_last (bool): whether to drop the last batches whose number is
+                less than the CPU core/GPU card number. The default value is
+                True.
             keep_order (bool): whether to assign the data to CPU cores or GPU
                 cards in order. Supposing that there are 2 batches and we use
                 2 GPU cards to run the network. If keep_order=True, GPU 0 would
@@ -289,7 +293,7 @@ class DataLoader(object):
                                           return_list, use_multiprocess)
         else:
             return GeneratorLoader(feed_list, capacity, use_double_buffer,
-                                   iterable, return_list, keep_order)
+                                   iterable, return_list, drop_last, keep_order)

     @staticmethod
     def from_dataset(dataset, places, drop_last=True):
@@ -422,7 +426,7 @@ class DygraphGeneratorLoader(DataLoaderBase):
             core.Variable(), self._capacity, False)
         self._reader = core.create_py_reader(
             self.queue, self._var_names, self._shapes, self._dtypes,
-            self._need_check_feed, self._places, self._use_double_buffer)
+            self._need_check_feed, self._places, self._use_double_buffer, True)

     def _start(self):
         if self._use_multiprocess:
@@ -628,6 +632,7 @@ class GeneratorLoader(DataLoaderBase):
                  use_double_buffer=True,
                  iterable=True,
                  return_list=False,
+                 drop_last=True,
                  keep_order=False):
         self._tensor_reader = None
         self._places = None
@@ -635,6 +640,8 @@ class GeneratorLoader(DataLoaderBase):
         self._queue = None
         self._feed_list = feed_list
         self._exited = False
+        self._drop_last = drop_last
+        self._keep_order = keep_order
         if not capacity:
             raise ValueError("Please give value to capacity.")
         self._iterable = iterable
@@ -643,7 +650,6 @@ class GeneratorLoader(DataLoaderBase):
             raise Exception("Feed list must be given under static mode.")
         self._use_double_buffer = use_double_buffer
         self._capacity = capacity
-        self._keep_order = keep_order
         if not self._iterable:
             self._init_non_iterable()
@@ -667,7 +673,8 @@ class GeneratorLoader(DataLoaderBase):
             core.Variable(), self._capacity, self._keep_order)
         self._reader = core.create_py_reader(
             self.queue, self._var_names, self._shapes, self._dtypes,
-            self._need_check_feed, self._places, self._use_double_buffer)
+            self._need_check_feed, self._places, self._use_double_buffer,
+            self._drop_last)

     def _init_non_iterable(self):
         lod_levels = []
@@ -744,7 +751,8 @@ class GeneratorLoader(DataLoaderBase):
         default_main_program().current_block().append_op(
             type='read',
             inputs={'Reader': [self._reader]},
-            outputs={'Out': self._feed_list})
+            outputs={'Out': self._feed_list},
+            attrs={'drop_last': self._drop_last})

     @property
     def queue(self):
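The drop_last flag documented above controls whether a final round that cannot fill every place is discarded by the read op or still delivered to the devices that have data. A hedged usage sketch, assuming an iterable loader over four CPU places (the generator, network, and sizes are illustrative):

    import numpy as np
    import paddle.fluid as fluid

    x = fluid.data(name='x', shape=[None, 1], dtype='float32')
    y = fluid.layers.fc(x, size=10)

    places = fluid.cpu_places(4)
    loader = fluid.io.DataLoader.from_generator(
        feed_list=[x], capacity=16, iterable=True, drop_last=False)

    def reader():
        for _ in range(5):  # 5 batches over 4 places: the last round is partial
            yield np.random.random([32, 1]).astype('float32'),

    loader.set_batch_generator(reader, places=places)

    exe = fluid.Executor(places[0])
    exe.run(fluid.default_startup_program())
    prog = fluid.CompiledProgram(fluid.default_main_program()) \
        .with_data_parallel(places=places)

    for feed_data in loader():
        x_out, = exe.run(prog, feed=feed_data, fetch_list=[x])
        # With drop_last=False the final short round is still executed on the
        # devices that received data; with drop_last=True it would be skipped.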
python/paddle/fluid/tests/unittests/CMakeLists.txt

@@ -355,4 +355,5 @@ set_tests_properties(test_parallel_executor_test_while_train test_parallel_execu
         test_parallel_executor_crf_auto_growth test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass
         test_data_norm_op test_imperative_using_non_zero_gpu test_fuse_bn_act_pass
         test_optimizer_in_control_flow test_dataloader_keep_order
+        test_parallel_executor_inference_feed_partial_data
         test_buffer_shared_memory_reuse_pass PROPERTIES LABELS "RUN_TYPE=DIST")
python/paddle/fluid/tests/unittests/test_parallel_executor_inference_feed_partial_data.py (new file, 0 → 100644)

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.fluid as fluid
import numpy as np
import unittest
import six


class TestInferencePartialFeed(unittest.TestCase):
    def setUp(self):
        self.iterations = 10
        self.size = 10

    def run_network(self, places, use_split):
        startup_prog = fluid.Program()
        main_prog = fluid.Program()

        with fluid.program_guard(main_prog, startup_prog):
            x = fluid.data(name='x', shape=[None, self.size], dtype='float32')
            y = fluid.data(name='y', shape=[None, self.size], dtype='float32')
            lr = fluid.data(name='lr', shape=[1], dtype='float32')
            lr.persistable = True

            relu_x = fluid.layers.relu(x)
            relu_y = fluid.layers.relu(y)
            relu_lr = fluid.layers.relu(lr)

        exe = fluid.Executor(places[0])
        exe.run(startup_prog)

        prog = fluid.CompiledProgram(main_prog).with_data_parallel(
            places=places)

        gen_random = lambda shape: np.random.uniform(
            low=-1.0, high=1.0, size=shape).astype('float32')
        assert_result = lambda feed, result: self.assertTrue(
            np.array_equal(np.maximum(0, feed), result))

        def feed_split_test():
            for place_num in six.moves.range(1, len(places) * 3):
                x_np = gen_random([place_num, self.size])
                y_np = gen_random([place_num, self.size])
                if place_num <= len(places):
                    lr_np = gen_random([place_num])
                else:
                    lr_np = gen_random([1])

                relu_x_np, relu_y_np, relu_lr_np = exe.run(
                    prog,
                    feed={x.name: x_np,
                          y.name: y_np,
                          lr.name: lr_np},
                    fetch_list=[relu_x, relu_y, relu_lr])

                assert_result(x_np, relu_x_np)
                assert_result(y_np, relu_y_np)
                if place_num <= len(places):
                    assert_result(lr_np, relu_lr_np)
                else:
                    expected_relu_lr_np = max(lr_np[0], 0)
                    self.assertTrue(np.all(expected_relu_lr_np == relu_lr_np))

        def feed_list_test():
            for place_num in six.moves.range(1, len(places) + 1):
                x_np_list = []
                y_np_list = []
                lr_np_list = []
                feed_list = []
                for _ in six.moves.range(place_num):
                    x_np = gen_random([1, self.size])
                    y_np = gen_random([1, self.size])
                    lr_np = gen_random([1])
                    x_np_list.append(x_np)
                    y_np_list.append(y_np)
                    lr_np_list.append(lr_np)

                    feed_list.append({
                        x.name: x_np,
                        y.name: y_np,
                        lr.name: lr_np
                    })

                relu_x_np, relu_y_np, relu_lr_np = exe.run(
                    prog, feed=feed_list, fetch_list=[relu_x, relu_y, relu_lr])

                x_np = np.concatenate(x_np_list)
                y_np = np.concatenate(y_np_list)
                lr_np = np.concatenate(lr_np_list)

                assert_result(x_np, relu_x_np)
                assert_result(y_np, relu_y_np)
                assert_result(lr_np, relu_lr_np)

        for _ in six.moves.range(self.iterations):
            if use_split:
                feed_split_test()
            else:
                feed_list_test()

    def test_main(self):
        places = [fluid.cpu_places(4)]
        if fluid.is_compiled_with_cuda():
            places.append(fluid.cuda_places())

        for p in places:
            self.run_network(p, use_split=True)
            self.run_network(p, use_split=False)


class TestInferencePartialFeedUsingDataLoader(unittest.TestCase):
    def setUp(self):
        self.epoch_num = 3
        self.batch_num = 101  # a prime number
        self.batch_size = 32

    def create_reader(self):
        def __impl__():
            for _ in six.moves.range(self.batch_num):
                yield np.random.random([self.batch_size, 1]).astype('float32'),

        return __impl__

    def run_network(self, iterable, use_cuda, drop_last):
        x = fluid.data(shape=[None, 1], name='x', dtype='float32')
        places = fluid.cuda_places() if use_cuda else fluid.cpu_places(4)
        loader = fluid.io.DataLoader.from_generator(
            feed_list=[x], capacity=16, iterable=iterable, drop_last=drop_last)
        y = fluid.layers.fc(x, size=10)
        loss = fluid.layers.reduce_mean(y)

        exe = fluid.Executor(places[0])
        exe.run(fluid.default_startup_program())

        prog = fluid.CompiledProgram(fluid.default_main_program(
        )).with_data_parallel(places=places, loss_name=loss.name)

        loader.set_batch_generator(
            self.create_reader(), places=places if iterable else None)

        for _ in six.moves.range(self.epoch_num):
            actual_batch_num = 0
            if loader.iterable:
                for feed_data in loader():
                    x_data, = exe.run(prog, feed=feed_data, fetch_list=[x])
                    self.assertEqual(x_data.shape[0] % self.batch_size, 0)
                    self.assertTrue(x_data.shape[0] != 0)
                    actual_batch_num += int(x_data.shape[0] / self.batch_size)
            else:
                loader.start()
                try:
                    while True:
                        x_data, = exe.run(prog, fetch_list=[x])
                        self.assertEqual(x_data.shape[0] % self.batch_size, 0)
                        self.assertTrue(x_data.shape[0] != 0)
                        actual_batch_num += int(x_data.shape[0] /
                                                self.batch_size)
                except fluid.core.EOFException:
                    loader.reset()

            if not drop_last or len(places) == 1:
                self.assertEqual(self.batch_num, actual_batch_num)
            else:
                self.assertGreater(self.batch_num, actual_batch_num)

    def test_main(self):
        use_cuda_list = [False, True] if fluid.is_compiled_with_cuda(
        ) else [False]
        iterable_list = [False, True]
        drop_last_list = [False, True]
        for iterable in iterable_list:
            for use_cuda in use_cuda_list:
                for drop_last in drop_last_list:
                    with fluid.program_guard(fluid.Program(), fluid.Program()):
                        with fluid.scope_guard(fluid.Scope()):
                            self.run_network(iterable, use_cuda, drop_last)


if __name__ == '__main__':
    unittest.main()