Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
16658f7b
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 2 年 前同步成功
通知
708
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
16658f7b
编写于
6月 11, 2018
作者:
Q
qiaolongfei
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into refine-prefetch
上级
83a577e8
1d198494
变更
27
隐藏空白更改
内联
并排
Showing
27 changed file
with
269 addition
and
145 deletion
+269
-145
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+4
-4
paddle/fluid/framework/details/all_reduce_op_handle.cc
paddle/fluid/framework/details/all_reduce_op_handle.cc
+26
-13
paddle/fluid/framework/details/all_reduce_op_handle.h
paddle/fluid/framework/details/all_reduce_op_handle.h
+14
-6
paddle/fluid/framework/details/execution_strategy.h
paddle/fluid/framework/details/execution_strategy.h
+1
-1
paddle/fluid/framework/details/multi_devices_graph_builder.cc
...le/fluid/framework/details/multi_devices_graph_builder.cc
+29
-22
paddle/fluid/framework/details/multi_devices_graph_builder.h
paddle/fluid/framework/details/multi_devices_graph_builder.h
+4
-1
paddle/fluid/framework/details/op_handle_base.cc
paddle/fluid/framework/details/op_handle_base.cc
+3
-3
paddle/fluid/framework/details/op_handle_base.h
paddle/fluid/framework/details/op_handle_base.h
+1
-1
paddle/fluid/framework/details/reduce_and_gather.h
paddle/fluid/framework/details/reduce_and_gather.h
+3
-1
paddle/fluid/framework/details/ssa_graph_builder_factory.h
paddle/fluid/framework/details/ssa_graph_builder_factory.h
+5
-1
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+1
-1
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+29
-19
paddle/fluid/operators/detail/grpc_server.cc
paddle/fluid/operators/detail/grpc_server.cc
+18
-13
paddle/fluid/operators/detail/grpc_server.h
paddle/fluid/operators/detail/grpc_server.h
+1
-0
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+4
-4
python/paddle/dataset/flowers.py
python/paddle/dataset/flowers.py
+2
-1
python/paddle/fluid/data_feeder.py
python/paddle/fluid/data_feeder.py
+4
-1
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+9
-8
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+2
-2
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
...ddle/fluid/tests/unittests/parallel_executor_test_base.py
+8
-3
python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
...addle/fluid/tests/unittests/test_parallel_executor_crf.py
+20
-7
python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
...luid/tests/unittests/test_parallel_executor_fetch_feed.py
+19
-8
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
...dle/fluid/tests/unittests/test_parallel_executor_mnist.py
+29
-13
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
...fluid/tests/unittests/test_parallel_executor_seresnext.py
+13
-4
python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
...ests/unittests/test_parallel_executor_test_while_train.py
+14
-6
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
...uid/tests/unittests/test_parallel_executor_transformer.py
+4
-1
python/paddle/v2/dataset/flowers.py
python/paddle/v2/dataset/flowers.py
+2
-1
未找到文件。
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
16658f7b
...
@@ -13,14 +13,14 @@ cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
...
@@ -13,14 +13,14 @@ cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
cc_library
(
variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows
)
cc_library
(
variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows
)
if
(
WITH_GPU
)
if
(
WITH_GPU
)
nv_library
(
nccl_all_reduce_op_handle SRCS nccl_
all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
nv_library
(
all_reduce_op_handle SRCS
all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor
)
dynload_cuda variable_visitor
)
set
(
multi_devices_graph_builder_deps nccl_all_reduce_op_handle
)
nv_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda
)
nv_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda
)
nv_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda
)
nv_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda
)
else
()
else
()
set
(
multi_devices_graph_builder_deps
)
cc_library
(
all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
variable_visitor
)
cc_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim
)
cc_library
(
reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim
)
cc_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor
)
cc_library
(
broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor
)
endif
()
endif
()
...
@@ -29,7 +29,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d
...
@@ -29,7 +29,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d
cc_library
(
fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope
)
cc_library
(
fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope
)
cc_library
(
multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
cc_library
(
multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle rpc_op_handle
${
multi_devices_graph_builder_deps
}
reduce_op_handle broadcast_op_handle
)
scale_loss_grad_op_handle rpc_op_handle
all_reduce_op_handle
reduce_op_handle broadcast_op_handle
)
cc_library
(
ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker
)
cc_library
(
ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker
)
...
...
paddle/fluid/framework/details/
nccl_
all_reduce_op_handle.cc
→
paddle/fluid/framework/details/all_reduce_op_handle.cc
浏览文件 @
16658f7b
...
@@ -13,25 +13,33 @@
...
@@ -13,25 +13,33 @@
// limitations under the License.
// limitations under the License.
#include <algorithm>
#include <algorithm>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
NCCLAllReduceOpHandle
::
NCCLAllReduceOpHandle
(
const
std
::
vector
<
Scope
*>
&
local_scopes
,
#ifdef PADDLE_WITH_CUDA
const
std
::
vector
<
platform
::
Place
>
&
places
,
AllReduceOpHandle
::
AllReduceOpHandle
(
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
platform
::
NCCLContextMap
&
ctxs
)
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
ctxs
)
:
local_scopes_
(
local_scopes
),
places_
(
places
),
nccl_ctxs_
(
ctxs
)
{
:
local_scopes_
(
local_scopes
),
places_
(
places
),
nccl_ctxs_
(
ctxs
)
{
for
(
auto
&
p
:
places_
)
{
if
(
nccl_ctxs_
)
{
this
->
dev_ctxes_
[
p
]
=
nccl_ctxs_
.
DevCtx
(
p
);
for
(
auto
&
p
:
places_
)
{
this
->
dev_ctxes_
[
p
]
=
nccl_ctxs_
->
DevCtx
(
p
);
}
}
}
}
}
#else
AllReduceOpHandle
::
AllReduceOpHandle
(
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
)
:
local_scopes_
(
local_scopes
),
places_
(
places
)
{}
#endif
void
NCCL
AllReduceOpHandle
::
RunImpl
()
{
void
AllReduceOpHandle
::
RunImpl
()
{
if
(
NoDummyInputSize
()
==
1
)
{
if
(
NoDummyInputSize
()
==
1
)
{
return
;
// No need to all reduce when GPU count = 1;
return
;
// No need to all reduce when GPU count = 1;
}
else
{
}
else
{
...
@@ -58,6 +66,8 @@ void NCCLAllReduceOpHandle::RunImpl() {
...
@@ -58,6 +66,8 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
}
if
(
platform
::
is_gpu_place
(
lod_tensors
[
0
]
->
place
()))
{
if
(
platform
::
is_gpu_place
(
lod_tensors
[
0
]
->
place
()))
{
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE
(
nccl_ctxs_
,
"nccl_ctxs should not be nullptr."
);
int
dtype
=
-
1
;
int
dtype
=
-
1
;
size_t
numel
=
0
;
size_t
numel
=
0
;
std
::
vector
<
std
::
function
<
void
()
>>
all_reduce_calls
;
std
::
vector
<
std
::
function
<
void
()
>>
all_reduce_calls
;
...
@@ -75,7 +85,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
...
@@ -75,7 +85,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
}
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
).
device
;
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
).
device
;
auto
&
nccl_ctx
=
nccl_ctxs_
.
at
(
dev_id
);
auto
&
nccl_ctx
=
nccl_ctxs_
->
at
(
dev_id
);
auto
stream
=
nccl_ctx
.
stream
();
auto
stream
=
nccl_ctx
.
stream
();
auto
comm
=
nccl_ctx
.
comm_
;
auto
comm
=
nccl_ctx
.
comm_
;
all_reduce_calls
.
emplace_back
([
=
]
{
all_reduce_calls
.
emplace_back
([
=
]
{
...
@@ -90,22 +100,25 @@ void NCCLAllReduceOpHandle::RunImpl() {
...
@@ -90,22 +100,25 @@ void NCCLAllReduceOpHandle::RunImpl() {
call
();
call
();
}
}
});
});
#else
PADDLE_THROW
(
"Not compiled with CUDA"
);
#endif
}
else
{
// Special handle CPU only Operator's gradient. Like CRF
}
else
{
// Special handle CPU only Operator's gradient. Like CRF
auto
&
trg
=
*
this
->
local_scopes_
[
0
]
auto
&
trg
=
*
this
->
local_scopes_
[
0
]
->
FindVar
(
kLocalExecScopeName
)
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
()
->
Get
<
Scope
*>
()
->
Var
(
)
->
FindVar
(
out_var_handles
[
0
]
->
name_
)
->
GetMutable
<
framework
::
LoDTensor
>
();
->
GetMutable
<
framework
::
LoDTensor
>
();
// Reduce All Tensor to trg in CPU
// Reduce All Tensor to trg in CPU
ReduceLoDTensor
func
(
lod_tensors
,
&
trg
);
ReduceLoDTensor
func
(
lod_tensors
,
&
trg
);
VisitDataType
(
ToDataType
(
lod_tensors
[
0
]
->
type
()),
func
);
VisitDataType
(
ToDataType
(
lod_tensors
[
0
]
->
type
()),
func
);
for
(
size_t
i
=
0
;
i
<
local_scopes_
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
local_scopes_
.
size
();
++
i
)
{
auto
&
scope
=
auto
&
scope
=
*
local_scopes_
[
i
]
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
();
*
local_scopes_
[
i
]
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
();
auto
&
p
=
places_
[
i
];
auto
&
p
=
places_
[
i
];
auto
*
var
=
scope
.
FindVar
(
in
_var_handles
[
i
]
->
name_
);
auto
*
var
=
scope
.
FindVar
(
out
_var_handles
[
i
]
->
name_
);
auto
*
dev_ctx
=
dev_ctxes_
[
p
];
auto
*
dev_ctx
=
dev_ctxes_
[
p
];
RunAndRecordEvent
(
p
,
[
&
trg
,
var
,
dev_ctx
,
p
]
{
RunAndRecordEvent
(
p
,
[
&
trg
,
var
,
dev_ctx
,
p
]
{
...
@@ -118,7 +131,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
...
@@ -118,7 +131,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
}
}
}
std
::
string
NCCLAllReduceOpHandle
::
Name
()
const
{
return
"nccl_
all_reduce"
;
}
std
::
string
AllReduceOpHandle
::
Name
()
const
{
return
"
all_reduce"
;
}
}
// namespace details
}
// namespace details
}
// namespace framework
}
// namespace framework
}
// namespace paddle
}
// namespace paddle
paddle/fluid/framework/details/
nccl_
all_reduce_op_handle.h
→
paddle/fluid/framework/details/all_reduce_op_handle.h
浏览文件 @
16658f7b
...
@@ -20,17 +20,23 @@
...
@@ -20,17 +20,23 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
struct
NCCLAllReduceOpHandle
:
public
OpHandleBase
{
struct
AllReduceOpHandle
:
public
OpHandleBase
{
NCCLAllReduceOpHandle
(
const
std
::
vector
<
Scope
*>
&
local_scopes
,
#ifdef PADDLE_WITH_CUDA
const
std
::
vector
<
platform
::
Place
>
&
places
,
AllReduceOpHandle
(
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
platform
::
NCCLContextMap
&
ctxs
);
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
*
ctxs
);
#else
AllReduceOpHandle
(
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
);
#endif
std
::
string
Name
()
const
override
;
std
::
string
Name
()
const
override
;
// Delay and buffer nccl_all_reduce together can significantly increase
// Delay and buffer nccl_all_reduce together can significantly increase
...
@@ -43,7 +49,9 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
...
@@ -43,7 +49,9 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
private:
private:
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
platform
::
Place
>
places_
;
const
platform
::
NCCLContextMap
&
nccl_ctxs_
;
#ifdef PADDLE_WITH_CUDA
const
platform
::
NCCLContextMap
*
nccl_ctxs_
;
#endif
};
};
}
// namespace details
}
// namespace details
...
...
paddle/fluid/framework/details/execution_strategy.h
浏览文件 @
16658f7b
...
@@ -20,7 +20,7 @@ namespace details {
...
@@ -20,7 +20,7 @@ namespace details {
struct
ExecutionStrategy
{
struct
ExecutionStrategy
{
size_t
num_threads_
{
0
};
size_t
num_threads_
{
0
};
bool
use_
event
_
{
true
};
bool
use_
cuda
_
{
true
};
bool
allow_op_delay_
{
false
};
bool
allow_op_delay_
{
false
};
size_t
num_iteration_per_drop_scope_
{
100
};
size_t
num_iteration_per_drop_scope_
{
100
};
};
};
...
...
paddle/fluid/framework/details/multi_devices_graph_builder.cc
浏览文件 @
16658f7b
...
@@ -17,6 +17,7 @@
...
@@ -17,6 +17,7 @@
#include <utility>
#include <utility>
#include <vector>
#include <vector>
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
...
@@ -26,10 +27,6 @@
...
@@ -26,10 +27,6 @@
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
namespace
details
{
namespace
details
{
...
@@ -243,7 +240,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
...
@@ -243,7 +240,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
CreateReduceOp
(
&
result
,
g_name
,
0
);
CreateReduceOp
(
&
result
,
g_name
,
0
);
CreateBroadcastOp
(
&
result
,
g_name
,
0
);
CreateBroadcastOp
(
&
result
,
g_name
,
0
);
}
else
{
}
else
{
Insert
NCCL
AllReduceOp
(
&
result
,
g_name
);
InsertAllReduceOp
(
&
result
,
g_name
);
}
}
break
;
break
;
}
}
...
@@ -286,6 +283,19 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(
...
@@ -286,6 +283,19 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(
return
false
;
return
false
;
}
}
void
MultiDevSSAGraphBuilder
::
SetCommunicationContext
(
OpHandleBase
*
op_handle
,
const
platform
::
Place
&
p
)
const
{
#ifdef PADDLE_WITH_CUDA
if
(
nccl_ctxs_
==
nullptr
)
{
op_handle
->
SetDeviceContext
(
p
,
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
));
}
#else
op_handle
->
SetDeviceContext
(
p
,
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
));
#endif
}
void
MultiDevSSAGraphBuilder
::
CreateBroadcastOp
(
SSAGraph
*
result
,
void
MultiDevSSAGraphBuilder
::
CreateBroadcastOp
(
SSAGraph
*
result
,
const
std
::
string
&
p_name
,
const
std
::
string
&
p_name
,
size_t
src_dev_id
)
const
{
size_t
src_dev_id
)
const
{
...
@@ -300,15 +310,12 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
...
@@ -300,15 +310,12 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
op_handle
->
AddInput
(
in
);
op_handle
->
AddInput
(
in
);
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
auto
&
vars
=
result
->
vars_
.
at
(
i
).
at
(
p_name
);
auto
&
p
=
places_
[
i
];
auto
&
p
=
places_
[
i
];
SetCommunicationContext
(
op_handle
,
p
);
auto
&
vars
=
result
->
vars_
.
at
(
i
).
at
(
p_name
);
auto
*
out_var
=
new
VarHandle
(
vars
.
size
(),
i
,
p_name
,
p
);
auto
*
out_var
=
new
VarHandle
(
vars
.
size
(),
i
,
p_name
,
p
);
vars
.
emplace_back
(
out_var
);
vars
.
emplace_back
(
out_var
);
op_handle
->
AddOutput
(
out_var
);
op_handle
->
AddOutput
(
out_var
);
#ifndef ADDLE_WITH_CUDA
op_handle
->
SetDeviceContext
(
p
,
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
));
#endif
}
}
}
}
...
@@ -320,15 +327,19 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
...
@@ -320,15 +327,19 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
CreateOpHandleIOs
(
result
,
op
,
dev_id
);
CreateOpHandleIOs
(
result
,
op
,
dev_id
);
}
}
void
MultiDevSSAGraphBuilder
::
Insert
NCCLAllReduceOp
(
void
MultiDevSSAGraphBuilder
::
Insert
AllReduceOp
(
SSAGraph
*
result
,
SSAGraph
*
result
,
const
std
::
string
&
og
)
const
{
const
std
::
string
&
og
)
const
{
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
result
->
ops_
.
emplace_back
(
result
->
ops_
.
emplace_back
(
new
NCCLAllReduceOpHandle
(
local_scopes_
,
places_
,
*
nccl_ctxs_
));
new
AllReduceOpHandle
(
local_scopes_
,
places_
,
nccl_ctxs_
));
#else
result
->
ops_
.
emplace_back
(
new
AllReduceOpHandle
(
local_scopes_
,
places_
));
#endif
auto
*
op_handle
=
result
->
ops_
.
back
().
get
();
auto
*
op_handle
=
result
->
ops_
.
back
().
get
();
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
auto
&
p
=
places_
[
i
];
auto
&
p
=
places_
[
i
];
SetCommunicationContext
(
op_handle
,
p
);
auto
&
vars
=
result
->
vars_
[
i
][
og
];
auto
&
vars
=
result
->
vars_
[
i
][
og
];
PADDLE_ENFORCE
(
!
vars
.
empty
());
PADDLE_ENFORCE
(
!
vars
.
empty
());
auto
&
prev_grad
=
vars
.
back
();
auto
&
prev_grad
=
vars
.
back
();
...
@@ -338,9 +349,6 @@ void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
...
@@ -338,9 +349,6 @@ void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp(
vars
.
emplace_back
(
var
);
vars
.
emplace_back
(
var
);
op_handle
->
AddOutput
(
var
);
op_handle
->
AddOutput
(
var
);
}
}
#else
PADDLE_ENFORCE
(
"Not implemented"
);
#endif
}
}
bool
MultiDevSSAGraphBuilder
::
IsParameterGradientOnce
(
bool
MultiDevSSAGraphBuilder
::
IsParameterGradientOnce
(
...
@@ -379,7 +387,9 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
...
@@ -379,7 +387,9 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
// Insert ScaleCost OpHandle
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
auto
*
communication_dev_ctx
=
nccl_ctxs_
->
DevCtx
(
places_
[
i
]);
auto
*
communication_dev_ctx
=
nccl_ctxs_
?
nccl_ctxs_
->
DevCtx
(
places_
[
i
])
:
platform
::
DeviceContextPool
::
Instance
().
Get
(
places_
[
i
]);
#else
#else
auto
*
communication_dev_ctx
=
auto
*
communication_dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
CPUPlace
());
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
CPUPlace
());
...
@@ -424,12 +434,9 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
...
@@ -424,12 +434,9 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
auto
*
op_handle
=
result
->
ops_
.
back
().
get
();
auto
*
op_handle
=
result
->
ops_
.
back
().
get
();
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
auto
&
vars
=
result
->
vars_
[
i
][
og
];
#ifndef PADDLE_WITH_CUDA
auto
&
p
=
places_
[
i
];
auto
&
p
=
places_
[
i
];
op_handle
->
SetDeviceContext
(
p
,
SetCommunicationContext
(
op_handle
,
p
);
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
));
auto
&
vars
=
result
->
vars_
[
i
][
og
];
#endif
PADDLE_ENFORCE
(
!
vars
.
empty
());
PADDLE_ENFORCE
(
!
vars
.
empty
());
auto
&
prev_grad
=
vars
.
back
();
auto
&
prev_grad
=
vars
.
back
();
op_handle
->
AddInput
(
prev_grad
.
get
());
op_handle
->
AddInput
(
prev_grad
.
get
());
...
...
paddle/fluid/framework/details/multi_devices_graph_builder.h
浏览文件 @
16658f7b
...
@@ -100,7 +100,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
...
@@ -100,7 +100,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>
&
var_name_on_devices
,
const
std
::
vector
<
std
::
unordered_set
<
std
::
string
>>
&
var_name_on_devices
,
const
OpDesc
&
op
)
const
;
const
OpDesc
&
op
)
const
;
void
Insert
NCCL
AllReduceOp
(
SSAGraph
*
result
,
const
std
::
string
&
og
)
const
;
void
InsertAllReduceOp
(
SSAGraph
*
result
,
const
std
::
string
&
og
)
const
;
void
CreateBroadcastOp
(
SSAGraph
*
result
,
const
std
::
string
&
p_name
,
void
CreateBroadcastOp
(
SSAGraph
*
result
,
const
std
::
string
&
p_name
,
size_t
src_dev_id
)
const
;
size_t
src_dev_id
)
const
;
...
@@ -111,6 +111,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
...
@@ -111,6 +111,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
private:
private:
BuildStrategy
strategy_
;
BuildStrategy
strategy_
;
void
SetCommunicationContext
(
OpHandleBase
*
op_handle
,
const
platform
::
Place
&
p
)
const
;
};
};
}
// namespace details
}
// namespace details
}
// namespace framework
}
// namespace framework
...
...
paddle/fluid/framework/details/op_handle_base.cc
浏览文件 @
16658f7b
...
@@ -39,9 +39,9 @@ OpHandleBase::~OpHandleBase() {
...
@@ -39,9 +39,9 @@ OpHandleBase::~OpHandleBase() {
#endif
#endif
}
}
void
OpHandleBase
::
Run
(
bool
use_
event
)
{
void
OpHandleBase
::
Run
(
bool
use_
cuda
)
{
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
if
(
events_
.
empty
()
&&
use_
event
)
{
if
(
events_
.
empty
()
&&
use_
cuda
)
{
for
(
auto
&
p
:
dev_ctxes_
)
{
for
(
auto
&
p
:
dev_ctxes_
)
{
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
.
first
).
device
;
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
.
first
).
device
;
PADDLE_ENFORCE
(
cudaSetDevice
(
dev_id
));
PADDLE_ENFORCE
(
cudaSetDevice
(
dev_id
));
...
@@ -50,7 +50,7 @@ void OpHandleBase::Run(bool use_event) {
...
@@ -50,7 +50,7 @@ void OpHandleBase::Run(bool use_event) {
}
}
}
}
#else
#else
PADDLE_ENFORCE
(
!
use_
event
);
PADDLE_ENFORCE
(
!
use_
cuda
);
#endif
#endif
RunImpl
();
RunImpl
();
...
...
paddle/fluid/framework/details/op_handle_base.h
浏览文件 @
16658f7b
...
@@ -36,7 +36,7 @@ class OpHandleBase {
...
@@ -36,7 +36,7 @@ class OpHandleBase {
virtual
std
::
string
Name
()
const
=
0
;
virtual
std
::
string
Name
()
const
=
0
;
void
Run
(
bool
use_
event
);
void
Run
(
bool
use_
cuda
);
virtual
void
RecordWaitEventOnCtx
(
platform
::
DeviceContext
*
waited_ctx
);
virtual
void
RecordWaitEventOnCtx
(
platform
::
DeviceContext
*
waited_ctx
);
...
...
paddle/fluid/framework/details/reduce_and_gather.h
浏览文件 @
16658f7b
...
@@ -37,7 +37,9 @@ struct ReduceLoDTensor {
...
@@ -37,7 +37,9 @@ struct ReduceLoDTensor {
PADDLE_ENFORCE_NE
(
t0
.
numel
(),
0
);
PADDLE_ENFORCE_NE
(
t0
.
numel
(),
0
);
dst_tensor_
.
Resize
(
t0
.
dims
());
dst_tensor_
.
Resize
(
t0
.
dims
());
T
*
dst
=
dst_tensor_
.
mutable_data
<
T
>
(
platform
::
CPUPlace
());
T
*
dst
=
dst_tensor_
.
mutable_data
<
T
>
(
platform
::
CPUPlace
());
std
::
copy
(
t0
.
data
<
T
>
(),
t0
.
data
<
T
>
()
+
t0
.
numel
(),
dst
);
if
(
dst
!=
t0
.
data
<
T
>
())
{
std
::
copy
(
t0
.
data
<
T
>
(),
t0
.
data
<
T
>
()
+
t0
.
numel
(),
dst
);
}
for
(
size_t
i
=
1
;
i
<
src_tensors_
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
src_tensors_
.
size
();
++
i
)
{
auto
&
t
=
*
src_tensors_
[
i
];
auto
&
t
=
*
src_tensors_
[
i
];
...
...
paddle/fluid/framework/details/ssa_graph_builder_factory.h
浏览文件 @
16658f7b
...
@@ -40,7 +40,11 @@ class SSAGraphBuilderFactory {
...
@@ -40,7 +40,11 @@ class SSAGraphBuilderFactory {
loss_var_name_
(
loss_var_name
),
loss_var_name_
(
loss_var_name
),
param_names_
(
param_names
),
param_names_
(
param_names
),
local_scopes_
(
local_scopes
),
local_scopes_
(
local_scopes
),
strategy_
(
strategy
)
{}
strategy_
(
strategy
)
{
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_
=
nullptr
;
#endif
}
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
void
SetNCCLContextMap
(
platform
::
NCCLContextMap
*
nccl_ctxs
)
{
void
SetNCCLContextMap
(
platform
::
NCCLContextMap
*
nccl_ctxs
)
{
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
16658f7b
...
@@ -193,7 +193,7 @@ void ThreadedSSAGraphExecutor::RunOp(
...
@@ -193,7 +193,7 @@ void ThreadedSSAGraphExecutor::RunOp(
if
(
VLOG_IS_ON
(
10
))
{
if
(
VLOG_IS_ON
(
10
))
{
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" : "
<<
op
->
DebugString
();
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" : "
<<
op
->
DebugString
();
}
}
op
->
Run
(
strategy_
.
use_
event
_
);
op
->
Run
(
strategy_
.
use_
cuda
_
);
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" Done "
;
VLOG
(
10
)
<<
op
<<
" "
<<
op
->
Name
()
<<
" Done "
;
running_ops_
--
;
running_ops_
--
;
ready_var_q
->
Extend
(
op
->
Outputs
());
ready_var_q
->
Extend
(
op
->
Outputs
());
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
16658f7b
...
@@ -43,7 +43,8 @@ class ParallelExecutorPrivate {
...
@@ -43,7 +43,8 @@ class ParallelExecutorPrivate {
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
#endif
#endif
bool
own_local_scope
;
bool
own_local_scope_
;
bool
use_cuda_
;
};
};
std
::
vector
<
Scope
*>
&
ParallelExecutor
::
GetLocalScopes
()
{
std
::
vector
<
Scope
*>
&
ParallelExecutor
::
GetLocalScopes
()
{
...
@@ -60,35 +61,40 @@ ParallelExecutor::ParallelExecutor(
...
@@ -60,35 +61,40 @@ ParallelExecutor::ParallelExecutor(
size_t
num_trainers
,
size_t
trainer_id
)
size_t
num_trainers
,
size_t
trainer_id
)
:
member_
(
new
ParallelExecutorPrivate
(
places
))
{
:
member_
(
new
ParallelExecutorPrivate
(
places
))
{
member_
->
global_scope_
=
scope
;
member_
->
global_scope_
=
scope
;
member_
->
use_cuda_
=
exec_strategy
.
use_cuda_
;
// Step 1. Bcast the params to devs.
// Step 1. Bcast the params to devs.
// Create local scopes
// Create local scopes
if
(
local_scopes
.
empty
())
{
if
(
local_scopes
.
empty
())
{
member_
->
own_local_scope
=
true
;
member_
->
own_local_scope
_
=
true
;
member_
->
local_scopes_
.
emplace_back
(
member_
->
global_scope_
);
member_
->
local_scopes_
.
emplace_back
(
member_
->
global_scope_
);
for
(
size_t
i
=
1
;
i
<
member_
->
places_
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
member_
->
places_
.
size
();
++
i
)
{
member_
->
local_scopes_
.
emplace_back
(
&
scope
->
NewScope
());
member_
->
local_scopes_
.
emplace_back
(
&
scope
->
NewScope
());
}
}
}
else
{
}
else
{
member_
->
own_local_scope
=
false
;
member_
->
own_local_scope
_
=
false
;
PADDLE_ENFORCE_EQ
(
member_
->
places_
.
size
(),
local_scopes
.
size
());
PADDLE_ENFORCE_EQ
(
member_
->
places_
.
size
(),
local_scopes
.
size
());
for
(
size_t
i
=
0
;
i
<
member_
->
places_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
member_
->
places_
.
size
();
++
i
)
{
member_
->
local_scopes_
.
emplace_back
(
&
local_scopes
[
i
]
->
NewScope
());
member_
->
local_scopes_
.
emplace_back
(
&
local_scopes
[
i
]
->
NewScope
());
}
}
}
}
if
(
member_
->
use_cuda_
)
{
// Bcast Parameters to all GPUs
// Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
auto
*
nccl_id_var
=
scope
->
FindVar
(
NCCL_ID_VARNAME
);
auto
*
nccl_id_var
=
scope
->
FindVar
(
NCCL_ID_VARNAME
);
ncclUniqueId
*
nccl_id
=
nullptr
;
ncclUniqueId
*
nccl_id
=
nullptr
;
if
(
nccl_id_var
!=
nullptr
)
{
if
(
nccl_id_var
!=
nullptr
)
{
nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
nccl_id
=
nccl_id_var
->
GetMutable
<
ncclUniqueId
>
();
}
}
member_
->
nccl_ctxs_
.
reset
(
new
platform
::
NCCLContextMap
(
member_
->
nccl_ctxs_
.
reset
(
new
platform
::
NCCLContextMap
(
member_
->
places_
,
nccl_id
,
num_trainers
,
trainer_id
));
member_
->
places_
,
nccl_id
,
num_trainers
,
trainer_id
));
#else
PADDLE_THROW
(
"Not compiled with CUDA"
);
#endif
#endif
if
(
platform
::
is_gpu_place
(
places
[
0
])
&&
member_
->
local_scopes_
.
size
()
!=
1
&&
}
local_scopes
.
empty
())
{
// Is CUDA
if
(
member_
->
local_scopes_
.
size
()
!=
1
&&
local_scopes
.
empty
())
{
BCastParamsToGPUs
(
bcast_vars
);
BCastParamsToGPUs
(
bcast_vars
);
}
}
// Startup Program has been run. All local scopes has correct parameters.
// Startup Program has been run. All local scopes has correct parameters.
...
@@ -108,9 +114,13 @@ ParallelExecutor::ParallelExecutor(
...
@@ -108,9 +114,13 @@ ParallelExecutor::ParallelExecutor(
details
::
SSAGraphBuilderFactory
builder_factory
(
details
::
SSAGraphBuilderFactory
builder_factory
(
member_
->
places_
,
loss_var_name
,
params
,
member_
->
local_scopes_
,
member_
->
places_
,
loss_var_name
,
params
,
member_
->
local_scopes_
,
build_strategy
);
build_strategy
);
if
(
member_
->
use_cuda_
)
{
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
builder_factory
.
SetNCCLContextMap
(
member_
->
nccl_ctxs_
.
get
());
builder_factory
.
SetNCCLContextMap
(
member_
->
nccl_ctxs_
.
get
());
#else
PADDLE_THROW
(
"Not compiled with CUDA"
);
#endif
#endif
}
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
member_
->
executor_
.
reset
(
new
details
::
ThreadedSSAGraphExecutor
(
exec_strategy
,
member_
->
local_scopes_
,
places
,
exec_strategy
,
member_
->
local_scopes_
,
places
,
...
@@ -123,7 +133,6 @@ ParallelExecutor::ParallelExecutor(
...
@@ -123,7 +133,6 @@ ParallelExecutor::ParallelExecutor(
void
ParallelExecutor
::
BCastParamsToGPUs
(
void
ParallelExecutor
::
BCastParamsToGPUs
(
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
{
const
std
::
unordered_set
<
std
::
string
>
&
vars
)
const
{
#ifdef PADDLE_WITH_CUDA
auto
*
main_scope
=
member_
->
local_scopes_
[
0
];
auto
*
main_scope
=
member_
->
local_scopes_
[
0
];
for
(
auto
&
var
:
vars
)
{
for
(
auto
&
var
:
vars
)
{
...
@@ -135,6 +144,7 @@ void ParallelExecutor::BCastParamsToGPUs(
...
@@ -135,6 +144,7 @@ void ParallelExecutor::BCastParamsToGPUs(
auto
&
main_tensor
=
main_var
->
Get
<
LoDTensor
>
();
auto
&
main_tensor
=
main_var
->
Get
<
LoDTensor
>
();
auto
&
dims
=
main_tensor
.
dims
();
auto
&
dims
=
main_tensor
.
dims
();
if
(
paddle
::
platform
::
is_gpu_place
(
main_tensor
.
place
()))
{
if
(
paddle
::
platform
::
is_gpu_place
(
main_tensor
.
place
()))
{
#ifdef PADDLE_WITH_CUDA
size_t
numel
=
main_tensor
.
numel
();
size_t
numel
=
main_tensor
.
numel
();
ncclDataType_t
data_type
=
platform
::
ToNCCLDataType
(
main_tensor
.
type
());
ncclDataType_t
data_type
=
platform
::
ToNCCLDataType
(
main_tensor
.
type
());
platform
::
NCCLGroupGuard
guard
;
platform
::
NCCLGroupGuard
guard
;
...
@@ -153,6 +163,10 @@ void ParallelExecutor::BCastParamsToGPUs(
...
@@ -153,6 +163,10 @@ void ParallelExecutor::BCastParamsToGPUs(
platform
::
dynload
::
ncclBcast
(
buffer
,
numel
,
data_type
,
0
,
platform
::
dynload
::
ncclBcast
(
buffer
,
numel
,
data_type
,
0
,
nccl_ctx
.
comm_
,
nccl_ctx
.
stream
());
nccl_ctx
.
comm_
,
nccl_ctx
.
stream
());
}
}
member_
->
nccl_ctxs_
->
WaitAll
();
#else
PADDLE_THROW
(
"Not compiled with CUDA"
);
#endif
}
else
{
}
else
{
platform
::
CPUPlace
cpu
;
platform
::
CPUPlace
cpu
;
for
(
size_t
i
=
1
;
i
<
member_
->
places_
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
member_
->
places_
.
size
();
++
i
)
{
...
@@ -163,11 +177,7 @@ void ParallelExecutor::BCastParamsToGPUs(
...
@@ -163,11 +177,7 @@ void ParallelExecutor::BCastParamsToGPUs(
paddle
::
framework
::
TensorCopy
(
main_tensor
,
cpu
,
t
);
paddle
::
framework
::
TensorCopy
(
main_tensor
,
cpu
,
t
);
}
}
}
}
member_
->
nccl_ctxs_
->
WaitAll
();
}
}
#else
PADDLE_THROW
(
"Not compiled with CUDA"
);
#endif
}
}
void
ParallelExecutor
::
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
void
ParallelExecutor
::
Run
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
...
@@ -213,7 +223,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
...
@@ -213,7 +223,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
}
ParallelExecutor
::~
ParallelExecutor
()
{
ParallelExecutor
::~
ParallelExecutor
()
{
if
(
member_
->
own_local_scope
)
{
if
(
member_
->
own_local_scope
_
)
{
for
(
size_t
i
=
1
;
i
<
member_
->
local_scopes_
.
size
();
++
i
)
{
for
(
size_t
i
=
1
;
i
<
member_
->
local_scopes_
.
size
();
++
i
)
{
member_
->
global_scope_
->
DeleteScope
(
member_
->
local_scopes_
[
i
]);
member_
->
global_scope_
->
DeleteScope
(
member_
->
local_scopes_
[
i
]);
}
}
...
...
paddle/fluid/operators/detail/grpc_server.cc
浏览文件 @
16658f7b
...
@@ -41,11 +41,22 @@ class RequestBase {
...
@@ -41,11 +41,22 @@ class RequestBase {
virtual
~
RequestBase
()
{}
virtual
~
RequestBase
()
{}
virtual
void
Process
()
=
0
;
virtual
void
Process
()
=
0
;
CallStatus
Status
()
{
return
status_
;
}
CallStatus
Status
()
const
{
void
SetStatus
(
CallStatus
status
)
{
status_
=
status
;
}
std
::
lock_guard
<
std
::
mutex
>
l
(
status_mu_
);
return
status_
;
}
template
<
typename
T
>
void
Finish
(
const
T
&
reply
,
ServerAsyncResponseWriter
<
T
>*
responder
)
{
std
::
lock_guard
<
std
::
mutex
>
l
(
status_mu_
);
status_
=
FINISH
;
responder
->
Finish
(
reply
,
::
grpc
::
Status
::
OK
,
reinterpret_cast
<
void
*>
(
static_cast
<
intptr_t
>
(
req_id_
)));
}
virtual
std
::
string
GetReqName
()
=
0
;
virtual
std
::
string
GetReqName
()
=
0
;
protected:
protected:
mutable
std
::
mutex
status_mu_
;
::
grpc
::
ServerContext
ctx_
;
::
grpc
::
ServerContext
ctx_
;
GrpcService
::
AsyncService
*
service_
;
GrpcService
::
AsyncService
*
service_
;
::
grpc
::
ServerCompletionQueue
*
cq_
;
::
grpc
::
ServerCompletionQueue
*
cq_
;
...
@@ -80,9 +91,7 @@ class RequestSend final : public RequestBase {
...
@@ -80,9 +91,7 @@ class RequestSend final : public RequestBase {
framework
::
Variable
*
outvar
=
nullptr
;
framework
::
Variable
*
outvar
=
nullptr
;
request_handler_
->
Handle
(
varname
,
scope
,
invar
,
&
outvar
);
request_handler_
->
Handle
(
varname
,
scope
,
invar
,
&
outvar
);
status_
=
FINISH
;
Finish
(
reply_
,
&
responder_
);
responder_
.
Finish
(
reply_
,
::
grpc
::
Status
::
OK
,
reinterpret_cast
<
void
*>
(
static_cast
<
intptr_t
>
(
req_id_
)));
}
}
protected:
protected:
...
@@ -122,9 +131,7 @@ class RequestGet final : public RequestBase {
...
@@ -122,9 +131,7 @@ class RequestGet final : public RequestBase {
SerializeToByteBuffer
(
varname
,
outvar
,
*
request_handler_
->
dev_ctx
(),
SerializeToByteBuffer
(
varname
,
outvar
,
*
request_handler_
->
dev_ctx
(),
&
reply_
);
&
reply_
);
}
}
status_
=
FINISH
;
Finish
(
reply_
,
&
responder_
);
responder_
.
Finish
(
reply_
,
::
grpc
::
Status
::
OK
,
reinterpret_cast
<
void
*>
(
static_cast
<
intptr_t
>
(
req_id_
)));
}
}
protected:
protected:
...
@@ -157,8 +164,8 @@ class RequestPrefetch final : public RequestBase {
...
@@ -157,8 +164,8 @@ class RequestPrefetch final : public RequestBase {
// prefetch process...
// prefetch process...
std
::
string
in_var_name
=
request_
->
Varname
();
std
::
string
in_var_name
=
request_
->
Varname
();
std
::
string
out_var_name
=
request_
->
OutVarname
();
std
::
string
out_var_name
=
request_
->
OutVarname
();
VLOG
(
3
)
<<
"in_var_name: "
<<
in_var_name
VLOG
(
3
)
<<
"
RequestPrefetch,
in_var_name: "
<<
in_var_name
<<
"
RequestPrefetch
: "
<<
out_var_name
;
<<
"
out_var_name
: "
<<
out_var_name
;
auto
scope
=
request_
->
GetMutableLocalScope
();
auto
scope
=
request_
->
GetMutableLocalScope
();
auto
invar
=
scope
->
FindVar
(
in_var_name
);
auto
invar
=
scope
->
FindVar
(
in_var_name
);
...
@@ -168,9 +175,7 @@ class RequestPrefetch final : public RequestBase {
...
@@ -168,9 +175,7 @@ class RequestPrefetch final : public RequestBase {
SerializeToByteBuffer
(
out_var_name
,
outvar
,
*
request_handler_
->
dev_ctx
(),
SerializeToByteBuffer
(
out_var_name
,
outvar
,
*
request_handler_
->
dev_ctx
(),
&
reply_
);
&
reply_
);
status_
=
FINISH
;
Finish
(
reply_
,
&
responder_
);
responder_
.
Finish
(
reply_
,
::
grpc
::
Status
::
OK
,
reinterpret_cast
<
void
*>
(
static_cast
<
intptr_t
>
(
req_id_
)));
}
}
protected:
protected:
...
...
paddle/fluid/operators/detail/grpc_server.h
浏览文件 @
16658f7b
...
@@ -53,6 +53,7 @@ class AsyncGRPCServer final : public RPCServer {
...
@@ -53,6 +53,7 @@ class AsyncGRPCServer final : public RPCServer {
void
StartServer
()
override
;
void
StartServer
()
override
;
private:
private:
// HandleRequest needs to be thread-safe.
void
HandleRequest
(
void
HandleRequest
(
::
grpc
::
ServerCompletionQueue
*
cq
,
const
std
::
string
&
rpc_name
,
::
grpc
::
ServerCompletionQueue
*
cq
,
const
std
::
string
&
rpc_name
,
std
::
function
<
void
(
const
std
::
string
&
,
int
)
>
TryToRegisterNewOne
);
std
::
function
<
void
(
const
std
::
string
&
,
int
)
>
TryToRegisterNewOne
);
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
16658f7b
...
@@ -509,10 +509,10 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -509,10 +509,10 @@ All parameter, weight, gradient are variables in Paddle.
self
.
num_threads_
=
num_threads
;
self
.
num_threads_
=
num_threads
;
})
})
.
def_property
(
.
def_property
(
"use_
event
"
,
"use_
cuda
"
,
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
use_
event
_
;
},
[](
const
ExecutionStrategy
&
self
)
{
return
self
.
use_
cuda
_
;
},
[](
ExecutionStrategy
&
self
,
bool
use_
event
)
{
[](
ExecutionStrategy
&
self
,
bool
use_
cuda
)
{
self
.
use_
event_
=
use_event
;
self
.
use_
cuda_
=
use_cuda
;
})
})
.
def_property
(
.
def_property
(
"allow_op_delay"
,
"allow_op_delay"
,
...
...
python/paddle/dataset/flowers.py
浏览文件 @
16658f7b
...
@@ -119,7 +119,8 @@ def reader_creator(data_file,
...
@@ -119,7 +119,8 @@ def reader_creator(data_file,
yield
sample
,
int
(
label
)
-
1
yield
sample
,
int
(
label
)
-
1
if
use_xmap
:
if
use_xmap
:
return
xmap_readers
(
mapper
,
reader
,
cpu_count
(),
buffered_size
)
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
cpu_count
()))
return
xmap_readers
(
mapper
,
reader
,
cpu_num
,
buffered_size
)
else
:
else
:
return
map_readers
(
mapper
,
reader
)
return
map_readers
(
mapper
,
reader
)
...
...
python/paddle/fluid/data_feeder.py
浏览文件 @
16658f7b
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
from
__future__
import
print_function
from
__future__
import
print_function
import
core
import
core
import
numpy
import
numpy
import
os
import
six.moves
as
six
import
six.moves
as
six
import
multiprocessing
import
multiprocessing
...
@@ -150,7 +151,9 @@ class DataFeeder(object):
...
@@ -150,7 +151,9 @@ class DataFeeder(object):
elif
isinstance
(
self
.
place
,
core
.
CUDAPlace
):
elif
isinstance
(
self
.
place
,
core
.
CUDAPlace
):
return
core
.
get_cuda_device_count
()
return
core
.
get_cuda_device_count
()
else
:
else
:
return
multiprocessing
.
cpu_count
()
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
return
cpu_num
def
decorate_reader
(
self
,
def
decorate_reader
(
self
,
reader
,
reader
,
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
16658f7b
...
@@ -18,6 +18,7 @@ import framework
...
@@ -18,6 +18,7 @@ import framework
import
executor
import
executor
import
warnings
import
warnings
import
sys
import
sys
import
os
__all__
=
[
'ParallelExecutor'
,
'ExecutionStrategy'
,
'BuildStrategy'
]
__all__
=
[
'ParallelExecutor'
,
'ExecutionStrategy'
,
'BuildStrategy'
]
...
@@ -101,7 +102,9 @@ class ParallelExecutor(object):
...
@@ -101,7 +102,9 @@ class ParallelExecutor(object):
p
.
set_place
(
self
.
_act_places
[
-
1
])
p
.
set_place
(
self
.
_act_places
[
-
1
])
self
.
_places
.
append
(
p
)
self
.
_places
.
append
(
p
)
else
:
else
:
for
i
in
xrange
(
multiprocessing
.
cpu_count
()):
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
for
i
in
xrange
(
cpu_num
):
p
=
core
.
Place
()
p
=
core
.
Place
()
self
.
_act_places
.
append
(
core
.
CPUPlace
())
self
.
_act_places
.
append
(
core
.
CPUPlace
())
p
.
set_place
(
self
.
_act_places
[
-
1
])
p
.
set_place
(
self
.
_act_places
[
-
1
])
...
@@ -110,19 +113,17 @@ class ParallelExecutor(object):
...
@@ -110,19 +113,17 @@ class ParallelExecutor(object):
if
exec_strategy
is
None
:
if
exec_strategy
is
None
:
exec_strategy
=
ExecutionStrategy
()
exec_strategy
=
ExecutionStrategy
()
if
use_cuda
:
exec_strategy
.
use_cuda
=
use_cuda
exec_strategy
.
use_event
=
True
else
:
exec_strategy
.
use_event
=
False
if
exec_strategy
.
num_threads
==
0
:
if
exec_strategy
.
num_threads
==
0
:
if
use_cuda
:
if
use_cuda
:
# Experiments on se-resnext shows that too many threads hurt
# Experiments on se-resnext shows that too many threads hurt
# performance. Worth tunning for other models in the future.
# performance. Worth tunning for other models in the future.
exec_strategy
.
num_threads
=
len
(
self
.
_places
)
*
2
exec_strategy
.
num_threads
=
len
(
self
.
_places
)
*
4
else
:
else
:
exec_strategy
.
num_threads
=
min
(
cpu_num
=
int
(
len
(
self
.
_places
)
*
2
,
multiprocessing
.
cpu_count
())
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
exec_strategy
.
num_threads
=
cpu_num
if
build_strategy
is
None
:
if
build_strategy
is
None
:
build_strategy
=
BuildStrategy
()
build_strategy
=
BuildStrategy
()
...
...
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
16658f7b
...
@@ -41,8 +41,8 @@ function(py_test_modules TARGET_NAME)
...
@@ -41,8 +41,8 @@ function(py_test_modules TARGET_NAME)
endfunction
()
endfunction
()
list
(
REMOVE_ITEM TEST_OPS test_warpctc_op
)
list
(
REMOVE_ITEM TEST_OPS test_warpctc_op
)
list
(
REMOVE_ITEM TEST_OPS test_dist_train
)
list
(
REMOVE_ITEM TEST_OPS test_dist_train
)
list
(
REMOVE_ITEM TEST_OPS test_parallel_executor_crf
)
#
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list
(
REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed
)
#
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
# TODO(wuyi): this test hungs on CI, will add it back later
# TODO(wuyi): this test hungs on CI, will add it back later
list
(
REMOVE_ITEM TEST_OPS test_listen_and_serv_op
)
list
(
REMOVE_ITEM TEST_OPS test_listen_and_serv_op
)
foreach
(
TEST_OP
${
TEST_OPS
}
)
foreach
(
TEST_OP
${
TEST_OPS
}
)
...
...
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
浏览文件 @
16658f7b
...
@@ -12,6 +12,8 @@
...
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
import
multiprocessing
import
os
import
unittest
import
unittest
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
time
import
time
...
@@ -23,6 +25,7 @@ __all__ = ['TestParallelExecutorBase']
...
@@ -23,6 +25,7 @@ __all__ = ['TestParallelExecutorBase']
class
TestParallelExecutorBase
(
unittest
.
TestCase
):
class
TestParallelExecutorBase
(
unittest
.
TestCase
):
def
check_network_convergence
(
self
,
def
check_network_convergence
(
self
,
method
,
method
,
use_cuda
=
True
,
memory_opt
=
True
,
memory_opt
=
True
,
iter
=
50
,
iter
=
50
,
batch_size
=
None
,
batch_size
=
None
,
...
@@ -53,7 +56,7 @@ class TestParallelExecutorBase(unittest.TestCase):
...
@@ -53,7 +56,7 @@ class TestParallelExecutorBase(unittest.TestCase):
adam
.
minimize
(
loss
)
adam
.
minimize
(
loss
)
if
memory_opt
:
if
memory_opt
:
fluid
.
memory_optimize
(
main
)
fluid
.
memory_optimize
(
main
)
place
=
fluid
.
CUDAPlace
(
0
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
startup_exe
=
fluid
.
Executor
(
place
)
startup_exe
=
fluid
.
Executor
(
place
)
startup_exe
.
run
(
startup
)
startup_exe
.
run
(
startup
)
exec_strategy
=
fluid
.
ExecutionStrategy
()
exec_strategy
=
fluid
.
ExecutionStrategy
()
...
@@ -64,7 +67,7 @@ class TestParallelExecutorBase(unittest.TestCase):
...
@@ -64,7 +67,7 @@ class TestParallelExecutorBase(unittest.TestCase):
if
use_parallel_executor
:
if
use_parallel_executor
:
exe
=
fluid
.
ParallelExecutor
(
exe
=
fluid
.
ParallelExecutor
(
True
,
use_cuda
,
loss_name
=
loss
.
name
,
loss_name
=
loss
.
name
,
exec_strategy
=
exec_strategy
,
exec_strategy
=
exec_strategy
,
build_strategy
=
build_strategy
)
build_strategy
=
build_strategy
)
...
@@ -72,7 +75,9 @@ class TestParallelExecutorBase(unittest.TestCase):
...
@@ -72,7 +75,9 @@ class TestParallelExecutorBase(unittest.TestCase):
exe
=
fluid
.
Executor
(
place
=
place
)
exe
=
fluid
.
Executor
(
place
=
place
)
if
batch_size
is
not
None
:
if
batch_size
is
not
None
:
batch_size
*=
fluid
.
core
.
get_cuda_device_count
()
batch_size
*=
fluid
.
core
.
get_cuda_device_count
(
)
if
use_cuda
else
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
multiprocessing
.
cpu_count
()))
begin
=
time
.
time
()
begin
=
time
.
time
()
first_loss
,
=
run_executor
(
first_loss
,
=
run_executor
(
exe
=
exe
,
feed
=
feed_dict
,
fetch_list
=
[
loss
.
name
])
exe
=
exe
,
feed
=
feed_dict
,
fetch_list
=
[
loss
.
name
])
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
浏览文件 @
16658f7b
...
@@ -17,6 +17,7 @@ import paddle.fluid as fluid
...
@@ -17,6 +17,7 @@ import paddle.fluid as fluid
import
unittest
import
unittest
import
paddle
import
paddle
import
numpy
as
np
import
numpy
as
np
import
os
word_dict
,
verb_dict
,
label_dict
=
conll05
.
get_dict
()
word_dict
,
verb_dict
,
label_dict
=
conll05
.
get_dict
()
word_dict_len
=
len
(
word_dict
)
word_dict_len
=
len
(
word_dict
)
...
@@ -101,7 +102,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
...
@@ -101,7 +102,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
class
TestCRFModel
(
unittest
.
TestCase
):
class
TestCRFModel
(
unittest
.
TestCase
):
def
check_network_convergence
(
self
,
is_sparse
,
build_strategy
=
None
):
def
check_network_convergence
(
self
,
is_sparse
,
build_strategy
=
None
,
use_cuda
=
True
):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
main
=
fluid
.
Program
()
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
,
startup
):
with
fluid
.
program_guard
(
main
,
startup
):
...
@@ -145,12 +150,12 @@ class TestCRFModel(unittest.TestCase):
...
@@ -145,12 +150,12 @@ class TestCRFModel(unittest.TestCase):
paddle
.
dataset
.
conll05
.
test
(),
buf_size
=
8192
),
paddle
.
dataset
.
conll05
.
test
(),
buf_size
=
8192
),
batch_size
=
16
)
batch_size
=
16
)
place
=
fluid
.
CUDAPlace
(
0
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup
)
exe
.
run
(
startup
)
pe
=
fluid
.
ParallelExecutor
(
pe
=
fluid
.
ParallelExecutor
(
use_cuda
=
True
,
use_cuda
=
use_cuda
,
loss_name
=
avg_cost
.
name
,
loss_name
=
avg_cost
.
name
,
build_strategy
=
build_strategy
)
build_strategy
=
build_strategy
)
...
@@ -172,25 +177,33 @@ class TestCRFModel(unittest.TestCase):
...
@@ -172,25 +177,33 @@ class TestCRFModel(unittest.TestCase):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
self
.
check_network_convergence
(
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
)
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
def
test_update_dense_parameter_all_reduce
(
self
):
def
test_update_dense_parameter_all_reduce
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
self
.
check_network_convergence
(
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
)
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
def
test_update_sparse_parameter_reduce
(
self
):
def
test_update_sparse_parameter_reduce
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
self
.
check_network_convergence
(
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
)
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
True
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
def
test_update_dense_parameter_reduce
(
self
):
def
test_update_dense_parameter_reduce
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
self
.
check_network_convergence
(
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
)
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
True
)
self
.
check_network_convergence
(
is_sparse
=
False
,
build_strategy
=
build_strategy
,
use_cuda
=
False
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
浏览文件 @
16658f7b
...
@@ -18,6 +18,7 @@ import paddle.fluid as fluid
...
@@ -18,6 +18,7 @@ import paddle.fluid as fluid
import
unittest
import
unittest
import
numpy
as
np
import
numpy
as
np
import
paddle
import
paddle
import
os
def
Lenet
(
data
,
class_dim
):
def
Lenet
(
data
,
class_dim
):
...
@@ -35,7 +36,7 @@ def Lenet(data, class_dim):
...
@@ -35,7 +36,7 @@ def Lenet(data, class_dim):
class
TestFetchOp
(
unittest
.
TestCase
):
class
TestFetchOp
(
unittest
.
TestCase
):
def
parallel_exe
(
self
,
train_inputs
,
seed
):
def
parallel_exe
(
self
,
train_inputs
,
seed
,
use_cuda
):
main
=
fluid
.
Program
()
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
.
random_seed
=
seed
startup
.
random_seed
=
seed
...
@@ -59,13 +60,13 @@ class TestFetchOp(unittest.TestCase):
...
@@ -59,13 +60,13 @@ class TestFetchOp(unittest.TestCase):
# conv2d_1.b_0@GRAD. Those variables should not be pruned.
# conv2d_1.b_0@GRAD. Those variables should not be pruned.
# fluid.memory_optimize(main)
# fluid.memory_optimize(main)
place
=
fluid
.
CUDAPlace
(
0
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup
)
exe
.
run
(
startup
)
feeder
=
fluid
.
DataFeeder
(
place
=
place
,
feed_list
=
[
data
,
label
])
feeder
=
fluid
.
DataFeeder
(
place
=
place
,
feed_list
=
[
data
,
label
])
pe
=
fluid
.
ParallelExecutor
(
pe
=
fluid
.
ParallelExecutor
(
use_cuda
=
True
,
loss_name
=
loss
.
name
,
main_program
=
main
)
use_cuda
=
use_cuda
,
loss_name
=
loss
.
name
,
main_program
=
main
)
fetch_list
=
[]
fetch_list
=
[]
all_vars
=
main
.
global_block
().
vars
all_vars
=
main
.
global_block
().
vars
...
@@ -88,14 +89,16 @@ class TestFetchOp(unittest.TestCase):
...
@@ -88,14 +89,16 @@ class TestFetchOp(unittest.TestCase):
for
i
in
range
(
iters
):
for
i
in
range
(
iters
):
train_inputs
.
append
(
tst_reader_iter
.
next
())
train_inputs
.
append
(
tst_reader_iter
.
next
())
self
.
parallel_exe
(
train_inputs
,
seed
=
1
)
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
self
.
parallel_exe
(
train_inputs
,
seed
=
1
,
use_cuda
=
True
)
self
.
parallel_exe
(
train_inputs
,
seed
=
1
,
use_cuda
=
False
)
class
TestFeedParallel
(
unittest
.
TestCase
):
class
TestFeedParallel
(
unittest
.
TestCase
):
def
test_main
(
self
):
def
parallel_exe
(
self
,
use_cuda
,
seed
):
main
=
fluid
.
Program
()
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
.
random_seed
=
1
startup
.
random_seed
=
seed
with
fluid
.
scope_guard
(
fluid
.
core
.
Scope
()):
with
fluid
.
scope_guard
(
fluid
.
core
.
Scope
()):
with
fluid
.
program_guard
(
main
,
startup
):
with
fluid
.
program_guard
(
main
,
startup
):
data
=
fluid
.
layers
.
data
(
data
=
fluid
.
layers
.
data
(
...
@@ -111,15 +114,18 @@ class TestFeedParallel(unittest.TestCase):
...
@@ -111,15 +114,18 @@ class TestFeedParallel(unittest.TestCase):
regularization
=
fluid
.
regularizer
.
L2Decay
(
1e-4
))
regularization
=
fluid
.
regularizer
.
L2Decay
(
1e-4
))
opt
.
minimize
(
loss
)
opt
.
minimize
(
loss
)
place
=
fluid
.
CUDAPlace
(
0
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
feeder
=
fluid
.
DataFeeder
(
place
=
place
,
feed_list
=
[
data
,
label
])
feeder
=
fluid
.
DataFeeder
(
place
=
place
,
feed_list
=
[
data
,
label
])
reader
=
feeder
.
decorate_reader
(
reader
=
feeder
.
decorate_reader
(
paddle
.
batch
(
paddle
.
batch
(
flowers
.
train
(),
batch_size
=
16
),
multi_devices
=
True
)
flowers
.
train
(),
batch_size
=
16
),
multi_devices
=
True
)
exe
=
fluid
.
Executor
(
place
)
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup
)
exe
.
run
(
startup
)
pe
=
fluid
.
ParallelExecutor
(
pe
=
fluid
.
ParallelExecutor
(
use_cuda
=
True
,
loss_name
=
loss
.
name
,
main_program
=
main
)
use_cuda
=
use_cuda
,
loss_name
=
loss
.
name
,
main_program
=
main
)
for
batch_id
,
data
in
enumerate
(
reader
()):
for
batch_id
,
data
in
enumerate
(
reader
()):
loss_np
=
np
.
array
(
pe
.
run
(
feed
=
data
,
fetch_list
=
[
loss
.
name
])[
0
])
loss_np
=
np
.
array
(
pe
.
run
(
feed
=
data
,
fetch_list
=
[
loss
.
name
])[
0
])
...
@@ -127,6 +133,11 @@ class TestFeedParallel(unittest.TestCase):
...
@@ -127,6 +133,11 @@ class TestFeedParallel(unittest.TestCase):
if
batch_id
==
2
:
if
batch_id
==
2
:
break
break
def
test_feed_op
(
self
):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
self
.
parallel_exe
(
use_cuda
=
True
,
seed
=
1
)
self
.
parallel_exe
(
use_cuda
=
False
,
seed
=
1
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
unittest
.
main
()
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
浏览文件 @
16658f7b
...
@@ -18,6 +18,7 @@ import numpy as np
...
@@ -18,6 +18,7 @@ import numpy as np
import
paddle
import
paddle
import
paddle.dataset.mnist
as
mnist
import
paddle.dataset.mnist
as
mnist
import
unittest
import
unittest
import
os
MNIST_RECORDIO_FILE
=
"./mnist_test_pe.recordio"
MNIST_RECORDIO_FILE
=
"./mnist_test_pe.recordio"
...
@@ -85,6 +86,7 @@ def fc_with_batchnorm(use_feed):
...
@@ -85,6 +86,7 @@ def fc_with_batchnorm(use_feed):
class
TestMNIST
(
TestParallelExecutorBase
):
class
TestMNIST
(
TestParallelExecutorBase
):
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
# Convert mnist to recordio file
# Convert mnist to recordio file
with
fluid
.
program_guard
(
fluid
.
Program
(),
fluid
.
Program
()):
with
fluid
.
program_guard
(
fluid
.
Program
(),
fluid
.
Program
()):
reader
=
paddle
.
batch
(
mnist
.
train
(),
batch_size
=
4
)
reader
=
paddle
.
batch
(
mnist
.
train
(),
batch_size
=
4
)
...
@@ -99,9 +101,12 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -99,9 +101,12 @@ class TestMNIST(TestParallelExecutorBase):
fluid
.
recordio_writer
.
convert_reader_to_recordio_file
(
fluid
.
recordio_writer
.
convert_reader_to_recordio_file
(
MNIST_RECORDIO_FILE
,
reader
,
feeder
)
MNIST_RECORDIO_FILE
,
reader
,
feeder
)
def
check_simple_fc_convergence
(
self
,
balance_parameter_opt_between_cards
):
def
check_simple_fc_convergence
(
self
,
self
.
check_network_convergence
(
simple_fc_net
)
balance_parameter_opt_between_cards
,
self
.
check_network_convergence
(
simple_fc_net
,
allow_op_delay
=
True
)
use_cuda
=
True
):
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
)
self
.
check_network_convergence
(
simple_fc_net
,
use_cuda
=
use_cuda
,
allow_op_delay
=
True
)
img
=
np
.
zeros
(
shape
=
[
32
,
784
],
dtype
=
'float32'
)
img
=
np
.
zeros
(
shape
=
[
32
,
784
],
dtype
=
'float32'
)
label
=
np
.
ones
(
shape
=
[
32
,
1
],
dtype
=
'int64'
)
label
=
np
.
ones
(
shape
=
[
32
,
1
],
dtype
=
'int64'
)
...
@@ -109,17 +114,21 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -109,17 +114,21 @@ class TestMNIST(TestParallelExecutorBase):
simple_fc_net
,
simple_fc_net
,
feed_dict
=
{
"image"
:
img
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
"label"
:
label
},
use_cuda
=
use_cuda
,
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
)
)
def
test_simple_fc
(
self
):
def
test_simple_fc
(
self
):
self
.
check_simple_fc_convergence
(
False
)
self
.
check_simple_fc_convergence
(
False
,
use_cuda
=
True
)
self
.
check_simple_fc_convergence
(
False
,
use_cuda
=
False
)
def
test_simple_fc_with_new_strategy
(
self
):
def
test_simple_fc_with_new_strategy
(
self
):
self
.
check_simple_fc_convergence
(
True
)
self
.
check_simple_fc_convergence
(
True
,
use_cuda
=
True
)
self
.
check_simple_fc_convergence
(
True
,
use_cuda
=
False
)
def
check_simple_fc_parallel_accuracy
(
self
,
def
check_simple_fc_parallel_accuracy
(
self
,
balance_parameter_opt_between_cards
):
balance_parameter_opt_between_cards
,
use_cuda
=
True
):
img
=
np
.
zeros
(
shape
=
[
32
,
784
],
dtype
=
'float32'
)
img
=
np
.
zeros
(
shape
=
[
32
,
784
],
dtype
=
'float32'
)
label
=
np
.
ones
(
shape
=
[
32
,
1
],
dtype
=
'int64'
)
label
=
np
.
ones
(
shape
=
[
32
,
1
],
dtype
=
'int64'
)
single_first_loss
,
single_last_loss
=
self
.
check_network_convergence
(
single_first_loss
,
single_last_loss
=
self
.
check_network_convergence
(
...
@@ -127,12 +136,14 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -127,12 +136,14 @@ class TestMNIST(TestParallelExecutorBase):
seed
=
1000
,
seed
=
1000
,
feed_dict
=
{
"image"
:
img
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
"label"
:
label
},
use_cuda
=
use_cuda
,
use_parallel_executor
=
False
)
use_parallel_executor
=
False
)
parallel_first_loss
,
parallel_last_loss
=
self
.
check_network_convergence
(
parallel_first_loss
,
parallel_last_loss
=
self
.
check_network_convergence
(
method
=
simple_fc_net
,
method
=
simple_fc_net
,
seed
=
1000
,
seed
=
1000
,
feed_dict
=
{
"image"
:
img
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
"label"
:
label
},
use_cuda
=
use_cuda
,
use_parallel_executor
=
True
,
use_parallel_executor
=
True
,
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
)
)
...
@@ -143,28 +154,33 @@ class TestMNIST(TestParallelExecutorBase):
...
@@ -143,28 +154,33 @@ class TestMNIST(TestParallelExecutorBase):
self
.
assertAlmostEquals
(
p_l
,
single_last_loss
[
0
],
delta
=
1e-6
)
self
.
assertAlmostEquals
(
p_l
,
single_last_loss
[
0
],
delta
=
1e-6
)
def
test_simple_fc_parallel_accuracy
(
self
):
def
test_simple_fc_parallel_accuracy
(
self
):
self
.
check_simple_fc_parallel_accuracy
(
False
)
self
.
check_simple_fc_parallel_accuracy
(
False
,
use_cuda
=
True
)
self
.
check_simple_fc_parallel_accuracy
(
False
,
use_cuda
=
False
)
def
test_simple_fc_parallel_accuracy_with_new_strategy
(
self
):
def
test_simple_fc_parallel_accuracy_with_new_strategy
(
self
):
self
.
check_simple_fc_parallel_accuracy
(
True
)
self
.
check_simple_fc_parallel_accuracy
(
True
,
use_cuda
=
True
)
self
.
check_simple_fc_parallel_accuracy
(
True
,
use_cuda
=
False
)
def
check_batchnorm_fc_convergence
(
self
,
def
check_batchnorm_fc_convergence
(
balance_parameter_opt_between_cards
):
self
,
balance_parameter_opt_between_cards
,
use_cuda
):
self
.
check_network_convergence
(
fc_with_batchnorm
)
self
.
check_network_convergence
(
fc_with_batchnorm
,
use_cuda
=
use_cuda
)
img
=
np
.
zeros
(
shape
=
[
32
,
784
],
dtype
=
'float32'
)
img
=
np
.
zeros
(
shape
=
[
32
,
784
],
dtype
=
'float32'
)
label
=
np
.
ones
(
shape
=
[
32
,
1
],
dtype
=
'int64'
)
label
=
np
.
ones
(
shape
=
[
32
,
1
],
dtype
=
'int64'
)
self
.
check_network_convergence
(
self
.
check_network_convergence
(
fc_with_batchnorm
,
fc_with_batchnorm
,
feed_dict
=
{
"image"
:
img
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
"label"
:
label
},
use_cuda
=
use_cuda
,
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
)
)
def
test_batchnorm_fc
(
self
):
def
test_batchnorm_fc
(
self
):
self
.
check_batchnorm_fc_convergence
(
False
)
self
.
check_batchnorm_fc_convergence
(
False
,
use_cuda
=
True
)
self
.
check_batchnorm_fc_convergence
(
False
,
use_cuda
=
False
)
def
test_batchnorm_fc_with_new_strategy
(
self
):
def
test_batchnorm_fc_with_new_strategy
(
self
):
self
.
check_batchnorm_fc_convergence
(
True
)
self
.
check_batchnorm_fc_convergence
(
True
,
use_cuda
=
True
)
self
.
check_batchnorm_fc_convergence
(
True
,
use_cuda
=
False
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
浏览文件 @
16658f7b
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
from
parallel_executor_test_base
import
TestParallelExecutorBase
from
parallel_executor_test_base
import
TestParallelExecutorBase
import
unittest
import
unittest
import
os
def
squeeze_excitation
(
input
,
num_channels
,
reduction_ratio
):
def
squeeze_excitation
(
input
,
num_channels
,
reduction_ratio
):
...
@@ -130,22 +131,30 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
...
@@ -130,22 +131,30 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
class
TestResnet
(
TestParallelExecutorBase
):
class
TestResnet
(
TestParallelExecutorBase
):
def
check_resnet_convergence
(
self
,
balance_parameter_opt_between_cards
):
def
check_resnet_convergence
(
self
,
balance_parameter_opt_between_cards
,
use_cuda
=
True
,
iter
=
20
):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
import
functools
import
functools
batch_size
=
2
batch_size
=
2
self
.
check_network_convergence
(
self
.
check_network_convergence
(
functools
.
partial
(
functools
.
partial
(
SE_ResNeXt50Small
,
batch_size
=
batch_size
),
SE_ResNeXt50Small
,
batch_size
=
batch_size
),
iter
=
20
,
iter
=
iter
,
batch_size
=
batch_size
,
batch_size
=
batch_size
,
use_cuda
=
use_cuda
,
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
balance_parameter_opt_between_cards
=
balance_parameter_opt_between_cards
)
)
def
test_resnet
(
self
):
def
test_resnet
(
self
):
self
.
check_resnet_convergence
(
False
)
self
.
check_resnet_convergence
(
False
,
use_cuda
=
True
)
self
.
check_resnet_convergence
(
False
,
use_cuda
=
False
,
iter
=
5
)
def
test_resnet_with_new_strategy
(
self
):
def
test_resnet_with_new_strategy
(
self
):
self
.
check_resnet_convergence
(
True
)
self
.
check_resnet_convergence
(
True
,
use_cuda
=
True
)
self
.
check_resnet_convergence
(
True
,
use_cuda
=
False
,
iter
=
5
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
浏览文件 @
16658f7b
...
@@ -15,6 +15,7 @@
...
@@ -15,6 +15,7 @@
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
numpy
as
np
import
numpy
as
np
import
unittest
import
unittest
import
os
def
simple_fc_net
():
def
simple_fc_net
():
...
@@ -35,7 +36,8 @@ def simple_fc_net():
...
@@ -35,7 +36,8 @@ def simple_fc_net():
class
ParallelExecutorTestingDuringTraining
(
unittest
.
TestCase
):
class
ParallelExecutorTestingDuringTraining
(
unittest
.
TestCase
):
def
check_network_convergence
(
self
,
build_strategy
=
None
):
def
check_network_convergence
(
self
,
use_cuda
,
build_strategy
=
None
):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
main
=
fluid
.
Program
()
main
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
startup
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main
,
startup
):
with
fluid
.
program_guard
(
main
,
startup
):
...
@@ -49,19 +51,19 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
...
@@ -49,19 +51,19 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
image
=
np
.
random
.
normal
(
size
=
(
batch_size
,
784
)).
astype
(
'float32'
)
image
=
np
.
random
.
normal
(
size
=
(
batch_size
,
784
)).
astype
(
'float32'
)
label
=
np
.
random
.
randint
(
0
,
10
,
(
batch_size
,
1
),
dtype
=
"int64"
)
label
=
np
.
random
.
randint
(
0
,
10
,
(
batch_size
,
1
),
dtype
=
"int64"
)
place
=
fluid
.
CUDAPlace
(
0
)
place
=
fluid
.
CUDAPlace
(
0
)
if
use_cuda
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup
)
exe
.
run
(
startup
)
feed_dict
=
{
'image'
:
image
,
'label'
:
label
}
feed_dict
=
{
'image'
:
image
,
'label'
:
label
}
train_exe
=
fluid
.
ParallelExecutor
(
train_exe
=
fluid
.
ParallelExecutor
(
use_cuda
=
True
,
use_cuda
=
use_cuda
,
loss_name
=
loss
.
name
,
loss_name
=
loss
.
name
,
main_program
=
main
,
main_program
=
main
,
build_strategy
=
build_strategy
)
build_strategy
=
build_strategy
)
test_exe
=
fluid
.
ParallelExecutor
(
test_exe
=
fluid
.
ParallelExecutor
(
use_cuda
=
True
,
use_cuda
=
use_cuda
,
main_program
=
test_program
,
main_program
=
test_program
,
share_vars_from
=
train_exe
,
share_vars_from
=
train_exe
,
build_strategy
=
build_strategy
)
build_strategy
=
build_strategy
)
...
@@ -81,12 +83,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
...
@@ -81,12 +83,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def
test_parallel_testing
(
self
):
def
test_parallel_testing
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
self
.
check_network_convergence
(
build_strategy
)
self
.
check_network_convergence
(
use_cuda
=
True
,
build_strategy
=
build_strategy
)
self
.
check_network_convergence
(
use_cuda
=
False
,
build_strategy
=
build_strategy
)
def
test_parallel_testing_with_new_strategy
(
self
):
def
test_parallel_testing_with_new_strategy
(
self
):
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
=
fluid
.
BuildStrategy
()
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
self
.
check_network_convergence
(
build_strategy
)
self
.
check_network_convergence
(
use_cuda
=
True
,
build_strategy
=
build_strategy
)
self
.
check_network_convergence
(
use_cuda
=
False
,
build_strategy
=
build_strategy
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
浏览文件 @
16658f7b
...
@@ -19,6 +19,7 @@ from parallel_executor_test_base import TestParallelExecutorBase
...
@@ -19,6 +19,7 @@ from parallel_executor_test_base import TestParallelExecutorBase
import
unittest
import
unittest
import
paddle
import
paddle
import
paddle.dataset.wmt16
as
wmt16
import
paddle.dataset.wmt16
as
wmt16
import
os
WMT16_RECORDIO_FILE
=
"./wmt16_test_pe.recordio"
WMT16_RECORDIO_FILE
=
"./wmt16_test_pe.recordio"
...
@@ -149,6 +150,7 @@ def transformer(use_feed):
...
@@ -149,6 +150,7 @@ def transformer(use_feed):
class
TestTransformer
(
TestParallelExecutorBase
):
class
TestTransformer
(
TestParallelExecutorBase
):
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
os
.
environ
[
'CPU_NUM'
]
=
str
(
4
)
reader
=
paddle
.
batch
(
reader
=
paddle
.
batch
(
wmt16
.
train
(
ModelHyperParams
.
src_vocab_size
,
wmt16
.
train
(
ModelHyperParams
.
src_vocab_size
,
ModelHyperParams
.
trg_vocab_size
),
ModelHyperParams
.
trg_vocab_size
),
...
@@ -167,7 +169,8 @@ class TestTransformer(TestParallelExecutorBase):
...
@@ -167,7 +169,8 @@ class TestTransformer(TestParallelExecutorBase):
@
unittest
.
skip
(
"transformer is buggy in multi gpu"
)
@
unittest
.
skip
(
"transformer is buggy in multi gpu"
)
def
test_main
(
self
):
def
test_main
(
self
):
self
.
check_network_convergence
(
transformer
)
self
.
check_network_convergence
(
transformer
,
use_cuda
=
True
)
self
.
check_network_convergence
(
transformer
,
use_cuda
=
False
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
python/paddle/v2/dataset/flowers.py
浏览文件 @
16658f7b
...
@@ -119,7 +119,8 @@ def reader_creator(data_file,
...
@@ -119,7 +119,8 @@ def reader_creator(data_file,
yield
sample
,
int
(
label
)
-
1
yield
sample
,
int
(
label
)
-
1
if
use_xmap
:
if
use_xmap
:
return
xmap_readers
(
mapper
,
reader
,
cpu_count
(),
buffered_size
)
cpu_num
=
int
(
os
.
environ
.
get
(
'CPU_NUM'
,
cpu_count
()))
return
xmap_readers
(
mapper
,
reader
,
cpu_num
,
buffered_size
)
else
:
else
:
return
map_readers
(
mapper
,
reader
)
return
map_readers
(
mapper
,
reader
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录