Commit 728ec1b4 (unverified)
Authored on Sep 30, 2019 by Chengmo; committed via GitHub on Sep 30, 2019

Add GEO-SGD distribute training algorithm (#20018)

* refector geo sgd & communicator

Parent commit: 5365cd2f

Showing 17 changed files with 1,518 additions and 104 deletions (+1518, -104)
paddle/fluid/operators/distributed/communicator.cc  (+566, -4)
paddle/fluid/operators/distributed/communicator.h  (+148, -10)
paddle/fluid/operators/distributed/parameter_recv.cc  (+64, -8)
paddle/fluid/operators/distributed/parameter_send.cc  (+122, -55)
paddle/fluid/operators/distributed/parameter_send.h  (+1, -1)
paddle/fluid/operators/distributed/request_handler_impl.cc  (+58, -5)
paddle/fluid/operators/distributed_ops/send_op.cc  (+6, -2)
paddle/fluid/pybind/communicator_py.cc  (+15, -1)
paddle/fluid/pybind/communicator_py.h  (+4, -0)
python/paddle/fluid/__init__.py  (+1, -0)
python/paddle/fluid/communicator.py  (+18, -2)
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py  (+20, -5)
python/paddle/fluid/tests/unittests/CMakeLists.txt  (+1, -0)
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py  (+42, -11)
python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py  (+100, -0)
python/paddle/fluid/transpiler/distribute_transpiler.py  (+4, -0)
python/paddle/fluid/transpiler/geo_sgd_transpiler.py  (+348, -0)
paddle/fluid/operators/distributed/communicator.cc

This diff is collapsed on the page and its contents are not shown.
paddle/fluid/operators/distributed/communicator.h

@@ -14,16 +14,17 @@ limitations under the License. */
 #pragma once

+#include <ThreadPool.h>
 #include <atomic>
 #include <deque>
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
-#include <ThreadPool.h>

 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/operators/distributed/rpc_common.h"

@@ -170,6 +171,11 @@ class Communicator {
   virtual void Send(const std::string& var_name,
                     const framework::Scope& scope) = 0;

+  virtual void Send(const std::vector<std::string>& sparse_var_names,
+                    const std::vector<std::string>& sparse_var_tables,
+                    const framework::Scope& scope) = 0;
+
   virtual void Recv() = 0;

   virtual void InitImpl(const RpcCtxMap& send_varname_to_ctx,

@@ -179,6 +185,13 @@ class Communicator {
   virtual void InitImpl(const paddle::framework::ProgramDesc& program,
                         Scope* recv_scope) = 0;

+  // for geo-sgd
+  virtual void InitImpl(
+      const paddle::framework::ProgramDesc& program, Scope* param_scope,
+      std::map<std::string, std::map<std::string, std::vector<std::string>>>&
+          vars_info,
+      const int& trainers, const int& geo_need_push_nums) = 0;
+
   static Communicator* GetInstance() { return communicator_.get(); }

   static std::shared_ptr<Communicator> GetInstantcePtr() {

@@ -194,6 +207,26 @@ class Communicator {
     return communicator_.get();
   }

+  template <typename T>
+  static Communicator* InitInstance(
+      const paddle::framework::ProgramDesc& program, Scope* recv_scope) {
+    std::call_once(init_flag_, &Communicator::InitWithProgram<T>, program,
+                   recv_scope);
+    return communicator_.get();
+  }
+
+  template <typename T>
+  static Communicator* InitInstance(
+      const paddle::framework::ProgramDesc& program, Scope* training_scope,
+      std::map<std::string, std::map<std::string, std::vector<std::string>>>&
+          vars_info,
+      const int& trainers, const int& geo_need_push_nums) {
+    std::call_once(init_flag_, &Communicator::InitWithTranspilerInfo<T>,
+                   program, training_scope, std::ref(vars_info),
+                   std::ref(trainers), std::ref(geo_need_push_nums));
+    return communicator_.get();
+  }
+
   // Init is called by InitInstance.
   template <typename T>
   static void InitWithRpcCtx(const RpcCtxMap& send_varname_to_ctx,

@@ -206,14 +239,6 @@ class Communicator {
     }
   }

-  template <typename T>
-  static Communicator* InitInstance(
-      const paddle::framework::ProgramDesc& program, Scope* recv_scope) {
-    std::call_once(init_flag_, &Communicator::InitWithProgram<T>, program,
-                   recv_scope);
-    return communicator_.get();
-  }
-
   template <typename T>
   static void InitWithProgram(const paddle::framework::ProgramDesc& program,
                               Scope* recv_scope) {

@@ -223,12 +248,28 @@ class Communicator {
     }
   }

+  template <typename T>
+  static void InitWithTranspilerInfo(
+      const paddle::framework::ProgramDesc& program, Scope* training_scope,
+      std::map<std::string, std::map<std::string, std::vector<std::string>>>&
+          vars_info,
+      const int& trainers, const int& geo_need_push_nums) {
+    if (communicator_.get() == nullptr) {
+      communicator_.reset(new T());
+      communicator_->InitImpl(program, training_scope, std::ref(vars_info),
+                              std::ref(trainers), std::ref(geo_need_push_nums));
+    }
+  }
+
  protected:
  bool running_ = false;
  static std::shared_ptr<Communicator> communicator_;
  static std::once_flag init_flag_;
 };

+using SparseIdsMap =
+    std::unordered_map<std::string, std::unordered_set<int64_t>>;
+
 class AsyncCommunicator : public Communicator {
  public:
  AsyncCommunicator() {}

@@ -251,6 +292,16 @@ class AsyncCommunicator : public Communicator {
   void SendThread();
   void RecvThread();

+  void Send(const std::vector<std::string>& sparse_var_names,
+            const std::vector<std::string>& sparse_var_tables,
+            const framework::Scope& scope) override;
+
+  void InitImpl(
+      const paddle::framework::ProgramDesc& program, Scope* param_scope,
+      std::map<std::string, std::map<std::string, std::vector<std::string>>>&
+          vars_info,
+      const int& trainers, const int& geo_need_push_nums) override;
+
  private:
  std::unordered_map<std::string,
                     std::shared_ptr<BlockingQueue<std::shared_ptr<Variable>>>>

@@ -266,6 +317,93 @@ class AsyncCommunicator : public Communicator {
   std::atomic_uint grad_num_{0};  // the num of gradient sent since last recv
 };

+class GeoSgdCommunicator : public Communicator {
+ public:
+  GeoSgdCommunicator() {}
+  ~GeoSgdCommunicator();
+
+  void InitImpl(
+      const paddle::framework::ProgramDesc& program, Scope* training_scope,
+      std::map<std::string, std::map<std::string, std::vector<std::string>>>&
+          vars_info,
+      const int& trainers, const int& geo_need_push_nums) override;
+
+  void Start() override;
+  void Stop() override;
+
+  void Send(const std::string& var_name,
+            const framework::Scope& scope) override;
+
+  void Send(const std::vector<std::string>& sparse_var_names,
+            const std::vector<std::string>& sparse_var_tables,
+            const framework::Scope& scope) override;
+
+  void Recv() override;
+
+  void InitImpl(const RpcCtxMap& send_varname_to_ctx,
+                const RpcCtxMap& recv_varname_to_ctx,
+                Scope* recv_scope) override;
+
+  void InitImpl(const paddle::framework::ProgramDesc& program,
+                Scope* recv_scope) override;
+
+ private:
+  void SendThread();
+  void RecvAll();
+
+  std::unordered_set<int64_t> SparseIdsMerge(
+      const std::vector<SparseIdsMap>& ids_send_vec,
+      const std::string& var_name);
+
+  void SendUpdateDenseVars(const std::string& var_name);
+  void SendUpdateSparseVars(const std::string& var_name,
+                            const std::unordered_set<int64_t>& ids_table);
+  void RecvUpdateVars(const std::string& var_name);
+
+  void GeoSgdDenseParamInit(framework::Scope* scope_x,
+                            framework::Scope* scope_y,
+                            const std::string var_name);
+
+  void GeoSgdSparseParamInit(framework::Scope* scope_x,
+                             framework::Scope* scope_y,
+                             const std::string var_name);
+
+  const std::string VarToDeltaVar(const std::string var_name) {
+    std::string delta_name = var_name;
+    const std::string send_name = delta_name.append(".delta");
+    return send_name;
+  }
+
+  const std::string DeltaVarToVar(const std::string var_name) {
+    std::string origin_name = var_name;
+    origin_name.erase(origin_name.find(".delta"), 6);
+    const std::string param_name = origin_name;
+    return param_name;
+  }
+
+ private:
+  int trainer_nums_ = 1;
+  int geo_need_push_nums_ = 100;
+  bool is_geo_sgd_ = false;
+  Scope* training_scope_;
+  std::shared_ptr<Scope> delta_scope_;    // parameter local delta: recv - old
+  std::shared_ptr<Scope> old_scope_;      // parameter local, storage the param after last recv
+  std::shared_ptr<Scope> pserver_scope_;  // parameter on pserver,gloabl scope
+  RpcCtxMap send_varname_to_ctx_;
+  RpcCtxMap recv_varname_to_ctx_;
+
+  std::atomic_uint have_push_{0};
+  std::unordered_map<std::string, bool>
+      var_list_;  // if var is sparse, using selected rows, bool=true
+
+  std::shared_ptr<BlockingQueue<std::shared_ptr<SparseIdsMap>>>
+      need_push_queue_;
+  std::vector<SparseIdsMap> ids_send_vec_;
+
+  std::unique_ptr<::ThreadPool> send_threadpool_{nullptr};
+  std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr};
+  std::unique_ptr<std::thread> send_thread_{nullptr};
+};
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
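The ".delta" suffix in VarToDeltaVar/DeltaVarToVar above is how the geo communicator sends parameter deltas under separate variable names and maps them back to the original parameters on receipt. A minimal Python sketch of that mapping, with hypothetical helper names chosen only for illustration (the real helpers are the C++ methods shown above):

# Sketch of the name mapping mirrored from VarToDeltaVar / DeltaVarToVar.
# Helper names here are illustrative, not part of the Paddle API.
DELTA_SUFFIX = ".delta"

def var_to_delta_var(var_name):
    # "embedding_0.w_0" -> "embedding_0.w_0.delta"
    return var_name + DELTA_SUFFIX

def delta_var_to_var(delta_name):
    # "embedding_0.w_0.delta" -> "embedding_0.w_0"
    assert delta_name.endswith(DELTA_SUFFIX)
    return delta_name[:-len(DELTA_SUFFIX)]

assert delta_var_to_var(var_to_delta_var("embedding_0.w_0")) == "embedding_0.w_0"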
paddle/fluid/operators/distributed/parameter_recv.cc

@@ -42,7 +42,7 @@ using DDim = framework::DDim;
 template <typename T>
 void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
                                   const framework::Scope &scope) {
-  VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name;
+  VLOG(2) << "ParameterRecv in " << rpc_ctx.var_name;
   std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();

   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();

@@ -54,15 +54,24 @@ void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
   auto *recv_var = scope.FindVar(rpc_ctx.var_name);

   // recv all vars to local scope
-  if (recv_var->IsType<framework::LoDTensor>()) {
+  if (recv_var->IsType<framework::LoDTensor>() ||
+      recv_var->IsType<framework::SelectedRows>()) {
     std::vector<distributed::VarHandlePtr> rets;
     for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
       auto &recv_var_name = rpc_ctx.splited_var_names[i];
       local_scope->Var(recv_var_name);
-      VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i];
-      rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx,
-                                             *local_scope.get(), recv_var_name));
+      VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i];
+      if (recv_var->IsType<framework::LoDTensor>()) {
+        // sparse param in recv_scope is LoDTensor
+        rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx,
+                                               *local_scope.get(),
+                                               recv_var_name, recv_var_name));
+      } else {
+        // sparse param in pserver_scope is SelectedRows
+        rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx,
+                                               *local_scope.get(),
+                                               recv_var_name, recv_var_name,
+                                               recv_var_name));
+      }
     }
     for (size_t i = 0; i < rets.size(); i++) {
       PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");

@@ -72,7 +81,7 @@ void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
   }

   // concat recved tensor into one var
-  {
+  if (recv_var->IsType<framework::LoDTensor>()) {
     size_t output_offset = 0;
     size_t row_offset = 0;
     framework::Tensor *recv_tensor =

@@ -126,9 +135,56 @@ void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
       LOG(FATAL) << "recv_numel: " << recv_numel << " acture numel: " << numel;
     }
     PADDLE_ENFORCE_EQ(recv_numel, numel);
+  } else if (recv_var->IsType<framework::SelectedRows>()) {
+    auto cpu_place = platform::CPUPlace();
+    auto *slr = recv_var->GetMutable<framework::SelectedRows>();
+    slr->mutable_rows()->clear();
+    slr->mutable_value()->mutable_data<float>({{}}, cpu_place);
+    int64_t width = 0;
+    int64_t height = 0;
+    std::vector<int64_t> new_rows{};
+
+    // trans sparse ids from local to global
+    std::vector<int64_t> abs_sections =
+        ToAbsoluteSection(rpc_ctx.height_sections);
+
+    for (int i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
+      auto &recv_var_name = rpc_ctx.splited_var_names[i];
+      auto *var = local_scope->FindVar(recv_var_name);
+      auto *var_slr = var->GetMutable<framework::SelectedRows>();
+      auto *var_slr_row = var_slr->mutable_rows();
+      width = var_slr->mutable_value()->dims()[1];
+      height += var_slr->height();
+      auto row_offset = abs_sections[i];
+      VLOG(4) << "Recv split_var " << recv_var_name << " Row size "
+              << var_slr_row->size();
+      for (size_t j = 0; j < var_slr_row->size(); j++) {
+        new_rows.push_back(row_offset + (*var_slr_row)[j]);
+      }
+    }
+    slr->set_rows(new_rows);
+    slr->set_height(height);
+    slr->mutable_value()->mutable_data<float>(
+        framework::make_ddim(
+            {static_cast<int64_t>(slr->mutable_rows()->size()), width}),
+        cpu_place);
+    auto *slr_data = slr->mutable_value()->data<float>();
+
+    size_t row_offset = 0;
+    for (auto &recv_var_name : rpc_ctx.splited_var_names) {
+      auto *var = local_scope->FindVar(recv_var_name);
+      auto *var_slr = var->GetMutable<framework::SelectedRows>();
+      auto *var_slr_row = var_slr->mutable_rows();
+      auto var_slr_row_size = var_slr_row->size();
+      auto *var_slr_data = var_slr->mutable_value()->data<float>();
+
+      memcpy(slr_data + row_offset * width, var_slr_data,
+             sizeof(float) * width * var_slr_row_size);
+      row_offset += var_slr_row_size;
+    }
   }

-  VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name;
+  VLOG(2) << "ParameterRecv out " << rpc_ctx.var_name;
 }

 template struct ParameterRecv<float>;
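The SelectedRows branch above merges the received split pieces by shifting each piece's local row ids by the absolute start offset of its height section before concatenating the values. A small plain-Python sketch of that index arithmetic (helper names are illustrative only):

# Sketch of the local -> global row-id shift done in the SelectedRows branch.
# height_sections describes how the full parameter is split across pservers.
def to_absolute_section(height_sections):
    # e.g. [100, 100, 50] -> [0, 100, 200]: start row of each split piece
    offsets, total = [], 0
    for h in height_sections:
        offsets.append(total)
        total += h
    return offsets

def merge_rows(pieces_local_rows, height_sections):
    # pieces_local_rows[i] holds the row ids local to split piece i
    abs_sections = to_absolute_section(height_sections)
    merged = []
    for i, local_rows in enumerate(pieces_local_rows):
        merged.extend(abs_sections[i] + r for r in local_rows)
    return merged

print(merge_rows([[0, 3], [1]], [100, 100]))  # [0, 3, 101]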
paddle/fluid/operators/distributed/parameter_send.cc

@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/operators/distributed/parameter_send.h"
 #include <memory>
 #include <set>
 #include <string>
+#include <utility>
 #include <vector>

+#include "paddle/fluid/operators/distributed/parameter_send.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"

@@ -28,6 +28,7 @@
 #include "paddle/fluid/operators/distributed/rpc_client.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
+#include "paddle/fluid/string/printf.h"

 namespace paddle {
 namespace operators {

@@ -38,9 +39,44 @@ using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
 using DDim = framework::DDim;

+typedef std::vector<std::pair<std::string, std::string>> EP_SPLIT_TABLE_PAIRS;
+
+inline EP_SPLIT_TABLE_PAIRS GetMultiFieldRpcContext(
+    const RpcContext &rpc_ctx, const framework::Scope &scope, int multi_parts) {
+  EP_SPLIT_TABLE_PAIRS table_pairs;
+
+  auto *send_var = scope.FindVar(rpc_ctx.var_name);
+  if (send_var->IsType<framework::SelectedRows>()) {
+    PADDLE_ENFORCE_GT(multi_parts, 0, "multi_parts must >=1");
+
+    if (multi_parts == 1) {
+      for (int i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
+        table_pairs.push_back(
+            std::make_pair(rpc_ctx.epmap[i], rpc_ctx.splited_var_names[i]));
+      }
+    } else {
+      for (int i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
+        for (int x = 0; x < multi_parts; x++) {
+          auto table =
+              string::Sprintf("%s@%d@PIECE", rpc_ctx.splited_var_names[i], x);
+          table_pairs.push_back(std::make_pair(rpc_ctx.epmap[i], table));
+        }
+      }
+    }
+  } else if (send_var->IsType<framework::LoDTensor>()) {
+    PADDLE_THROW("GetMultiFieldRpcContext can not support LoDTensor current!");
+  } else {
+    PADDLE_THROW("GetMultiFieldRpcContext unsupported var type!");
+  }
+
+  return table_pairs;
+}  // namespace distributed
+
 template <typename T>
 void ParameterSend<T>::operator()(const RpcContext &rpc_ctx,
-                                  const framework::Scope &scope, bool sync) {
+                                  const framework::Scope &scope, bool sync,
+                                  int multi_parts) {
   std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();

   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();

@@ -49,9 +85,12 @@ void ParameterSend<T>::operator()(const RpcContext &rpc_ctx,
   distributed::RPCClient *rpc_client =
       distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);

+  std::vector<distributed::VarHandlePtr> rets;
+
   auto *send_var = scope.FindVar(rpc_ctx.var_name);
-  size_t out_num = rpc_ctx.splited_var_names.size();
+
   if (send_var->IsType<framework::LoDTensor>()) {
+    size_t out_num = rpc_ctx.splited_var_names.size();
     if (out_num > 1) {
       auto &send_tensor = send_var->Get<framework::LoDTensor>();
       auto &send_tensor_dims = send_tensor.dims();

@@ -77,6 +116,24 @@ void ParameterSend<T>::operator()(const RpcContext &rpc_ctx,
         row_offset += outs_dims[i][0];
       }
     }
+    for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
+      auto &send_var_name = rpc_ctx.splited_var_names[i];
+      VLOG(4) << "send var name: " << send_var_name;
+      auto &endpoint = rpc_ctx.epmap[i];
+      VLOG(4) << "send var endpoint: " << endpoint;
+      VLOG(4) << "need send: " << NeedSend(*local_scope.get(), send_var_name);
+      if (NeedSend(*local_scope.get(), send_var_name)) {
+        VLOG(3) << "sending " << send_var_name << " to " << endpoint;
+        rets.push_back(rpc_client->AsyncSendVar(
+            endpoint, cpu_ctx, *local_scope.get(), send_var_name));
+        VLOG(4) << "send var " << send_var_name << " async handle done";
+      } else {
+        VLOG(3) << "don't send non-initialized variable: "
+                << rpc_ctx.splited_var_names[i];
+      }
+    }
   } else if (send_var->IsType<framework::SelectedRows>()) {
     auto &send_slr = send_var->Get<framework::SelectedRows>();
     auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections);

@@ -85,84 +142,94 @@ void ParameterSend<T>::operator()(const RpcContext &rpc_ctx,
     std::vector<std::vector<size_t>> outs_rows_idx;
     std::vector<std::vector<size_t>> outs_dense_idx;

-    outs_rows_idx.resize(out_num);
-    outs_dense_idx.resize(out_num);
+    auto table_pairs = GetMultiFieldRpcContext(rpc_ctx, scope, multi_parts);
+
+    outs_rows_idx.resize(table_pairs.size());
+    outs_dense_idx.resize(table_pairs.size());

     auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0];
     auto *src = send_slr.value().data<T>();

     // create output var in local scope
     std::vector<framework::SelectedRows *> outs;
-    for (auto &name : rpc_ctx.splited_var_names) {
-      auto *out = local_scope->Var(name)->GetMutable<framework::SelectedRows>();
+    for (auto &table : table_pairs) {
+      auto *out =
+          local_scope->Var(table.second)->GetMutable<framework::SelectedRows>();
       outs.push_back(out);
     }

     // split rows index into output sparse vars
     for (size_t i = 0; i < send_rows.size(); ++i) {
-      size_t out_idx = GetSectionIndex(send_rows[i], abs_sections);
+      auto ep_idx = GetSectionIndex(send_rows[i], abs_sections);
+      auto table_idx = send_rows[i] % multi_parts;
+      auto out_idx = ep_idx * multi_parts + table_idx;
       outs_rows_idx[out_idx].push_back(send_rows[i]);
       outs_dense_idx[out_idx].push_back(i);
     }
+
     auto place = platform::CPUPlace();

-    for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
-      auto rows_idx = outs_rows_idx[i];
-      outs[i]->set_height(rpc_ctx.height_sections[i]);
-      auto dims = send_slr.GetCompleteDims();
-      dims[0] = rows_idx.size();
-      outs[i]->mutable_rows()->clear();
-      outs[i]->mutable_value()->mutable_data<T>(dims, send_slr.place());
-      if (rows_idx.size() > 0) {
-        for (auto idx : rows_idx) {
-          outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
-        }
-        auto dst = outs[i]->mutable_value()->mutable_data<T>(place);
-        for (size_t j = 0; j < rows_idx.size(); j++) {
-          if (platform::is_cpu_place(place)) {
-            memory::Copy(
-                platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(),
-                src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel);
-          } else {
-            PADDLE_THROW("do not support GPU now");
-          }
-        }
-      }
-      PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(),
-                        "rows should has the same size with tensor dim 0");
-    }
-  } else {
-    PADDLE_THROW("unsupported var type to send!");
-  }
-
-  std::vector<distributed::VarHandlePtr> rets;
-  for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
-    auto &send_var_name = rpc_ctx.splited_var_names[i];
-    auto &endpoint = rpc_ctx.epmap[i];
-    if (NeedSend(*local_scope.get(), send_var_name)) {
-      VLOG(3) << "sending " << send_var_name << " to " << endpoint;
-      rets.push_back(rpc_client->AsyncSendVar(
-          endpoint, cpu_ctx, *local_scope.get(), send_var_name));
-    } else {
-      VLOG(3) << "don't send non-initialized variable: "
-              << rpc_ctx.splited_var_names[i];
-    }
-  }
+    for (int ctx = 0; ctx < rpc_ctx.splited_var_names.size(); ctx++) {
+      for (int part = 0; part < multi_parts; part++) {
+        auto out_idx = ctx * multi_parts + part;
+        auto rows_idx = outs_rows_idx[out_idx];
+
+        auto dims = send_slr.GetCompleteDims();
+        dims[0] = rows_idx.size();
+
+        outs[out_idx]->set_height(rpc_ctx.height_sections[ctx]);
+        outs[out_idx]->mutable_rows()->clear();
+        outs[out_idx]->mutable_value()->mutable_data<T>(dims, send_slr.place());
+
+        if (rows_idx.size() > 0) {
+          for (auto idx : rows_idx) {
+            outs[out_idx]->mutable_rows()->push_back(idx - abs_sections[ctx]);
+          }
+          auto dst = outs[out_idx]->mutable_value()->mutable_data<T>(place);
+          for (size_t j = 0; j < rows_idx.size(); j++) {
+            if (platform::is_cpu_place(place)) {
+              memory::Copy(platform::CPUPlace(), dst + j * row_numel,
+                           platform::CPUPlace(),
+                           src + outs_dense_idx[out_idx][j] * row_numel,
+                           sizeof(T) * row_numel);
+            } else {
+              PADDLE_THROW("do not support GPU now");
+              /*
+              #ifdef PADDLE_WITH_CUDA
+                  auto stream = ctx.cuda_device_context().stream();
+                  memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
+                               platform::CUDAPlace(),
+                               src + outs_dense_idx[i][j] * row_numel,
+                               sizeof(T) * row_numel, stream);
+              #else
+                  PADDLE_THROW("Paddle is not compiled with GPU");
+              #endif
+              */
+            }
+          }
+        }
+        PADDLE_ENFORCE_EQ(rows_idx.size(), outs[out_idx]->rows().size(),
+                          "rows should has the same size with tensor dim 0");
+      }
+    }
+
+    for (size_t i = 0; i < table_pairs.size(); i++) {
+      auto &send_var_name = table_pairs[i].second;
+      auto &endpoint = table_pairs[i].first;
+      auto need_send = NeedSend(*local_scope.get(), send_var_name);
+
+      VLOG(4) << "send var name: " << send_var_name
+              << "send var endpoint: " << endpoint
+              << "need send: " << need_send;
+
+      if (need_send) {
+        VLOG(4) << "sending " << send_var_name << " to " << endpoint;
+        rets.push_back(rpc_client->AsyncSendVar(
+            endpoint, cpu_ctx, *local_scope.get(), send_var_name));
+        VLOG(4) << "send var " << send_var_name << " async handle done";
+      } else {
+        VLOG(4) << "don't send non-initialized variable: "
+                << rpc_ctx.splited_var_names[i];
+      }
+    }
+  } else {
+    PADDLE_THROW("unsupported var type to send!");
+  }

+  VLOG(4) << "Prepare to send var " << rpc_ctx.var_name;
   if (sync) {
     for (auto &handle : rets) {
+      VLOG(4) << "Wait send var to pserver handle: " << handle;
       PADDLE_ENFORCE(handle->Wait(), "internal error in RPCClient");
     }
   }
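The SelectedRows send path above routes every row to an (endpoint, piece-table) bucket: the endpoint index comes from the height section that contains the row, the piece index is row % multi_parts, and the piece table is named "<split_var_name>@<part>@PIECE". A plain-Python sketch of that routing rule, with illustrative names and data (not Paddle API calls):

# Sketch of the row -> (endpoint, piece table) routing used in ParameterSend.
def get_section_index(row, abs_sections):
    # index of the last section whose start offset is <= row
    idx = 0
    for i, start in enumerate(abs_sections):
        if row >= start:
            idx = i
    return idx

def route_rows(send_rows, abs_sections, splited_var_names, epmap, multi_parts):
    buckets = {}
    for row in send_rows:
        ep_idx = get_section_index(row, abs_sections)
        table_idx = row % multi_parts
        out_idx = ep_idx * multi_parts + table_idx
        # piece tables are named "<split_var_name>@<part>@PIECE"
        table = "%s@%d@PIECE" % (splited_var_names[ep_idx], table_idx)
        buckets.setdefault((out_idx, epmap[ep_idx], table), []).append(row)
    return buckets

buckets = route_rows([5, 105, 106], [0, 100], ["w.block0", "w.block1"],
                     ["127.0.0.1:6000", "127.0.0.1:6001"], multi_parts=2)
for (out_idx, ep, table), rows in sorted(buckets.items()):
    print(out_idx, ep, table, rows)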
paddle/fluid/operators/distributed/parameter_send.h

@@ -27,7 +27,7 @@ namespace distributed {
 template <typename T>
 struct ParameterSend {
   void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope,
-                  bool sync);
+                  bool sync, int multi_parts);
 };

 };  // namespace distributed
paddle/fluid/operators/distributed/request_handler_impl.cc

@@ -26,6 +26,7 @@
 #include "paddle/fluid/operators/distributed/rpc_server.h"
 #include "paddle/fluid/string/piece.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/string/split.h"

 namespace paddle {
 namespace operators {

@@ -60,13 +61,26 @@ bool RequestSendHandler::Handle(const std::string& varname,
           "async mode should not recv BATCH_BARRIER_MESSAGE or "
           "COMPLETE_MESSAGE");
     }

+    std::string run_varname = varname;
+
+    string::Piece part_piece("@PIECE");
+    string::Piece var_name_piece = string::Piece(varname);
+
+    if (string::Contains(var_name_piece, part_piece)) {
+      auto varname_splits = paddle::string::Split(varname, '@');
+      PADDLE_ENFORCE_EQ(varname_splits.size(), 3);
+      run_varname = varname_splits[0];
+      scope->Rename(varname, run_varname);
+    }
+
-    if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(varname)) {
+    if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(run_varname)) {
       auto& grad_slr =
-          scope->FindVar(varname)->Get<framework::SelectedRows>();
-      AsyncSparseParamUpdateRecorder::GetInstance()->Update(varname,
+          scope->FindVar(run_varname)->Get<framework::SelectedRows>();
+      AsyncSparseParamUpdateRecorder::GetInstance()->Update(run_varname,
                                                             grad_slr.rows());
     }
-    executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
+    executor_->RunPreparedContext((*grad_to_prepared_ctx_)[run_varname].get(),
                                   scope);
     return true;
   } else {  // sync

@@ -116,9 +130,48 @@ bool RequestGetHandler::Handle(const std::string& varname,
         VLOG(3) << "copying " << varname << " to " << param_bak_name;
         framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
       }
-      *outvar = scope_->FindVar(varname);
+      VLOG(1) << "Table name empty? " << table_name.empty();
+      VLOG(1) << "AsyncSparseParamUpdateRecorder " << varname << " exist "
+              << AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(
+                     varname);
+      if (AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) &&
+          !table_name.empty()) {
+        std::vector<int64_t> updated_rows;
+        AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear(
+            varname, trainer_id, &updated_rows);
+        if (VLOG_IS_ON(3)) {
+          std::ostringstream sstream;
+          sstream << "[";
+          for (auto& row_id : updated_rows) {
+            sstream << row_id << ", ";
+          }
+          sstream << "]";
+          VLOG(3) << "updated_rows size: " << updated_rows.size() << " "
+                  << sstream.str();
+        }
+        auto& origin_tensor =
+            scope_->FindVar(varname)->Get<framework::LoDTensor>();
+        auto* origin_tensor_data = origin_tensor.data<float>();
+        auto& dims = origin_tensor.dims();
+        *outvar = scope->Var();
+        auto* out_slr = (*outvar)->GetMutable<framework::SelectedRows>();
+        out_slr->set_rows(updated_rows);
+        out_slr->set_height(dims[0]);
+        auto out_dims = framework::make_ddim(
+            {static_cast<int64_t>(updated_rows.size()), dims[1]});
+        auto* data = out_slr->mutable_value()->mutable_data<float>(
+            out_dims, origin_tensor.place());
+        auto width = dims[1];
+        for (auto i = 0; i < updated_rows.size(); ++i) {
+          PADDLE_ENFORCE_LT(updated_rows[i], dims[0]);
+          memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width,
+                 sizeof(float) * width);
+        }
+      } else {
+        *outvar = scope_->FindVar(varname);
+      }
     }
   }
   return true;
 }
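On the pserver side, RequestSendHandler has to map a received "@PIECE" table name back to the variable that the prepared execution context was built for. A minimal Python sketch of the same parsing rule, mirroring the string::Split use above (function name is illustrative):

# Sketch of mapping a piece-table name back to its run variable.
def run_varname_of(varname):
    if "@PIECE" in varname:
        parts = varname.split("@")
        assert len(parts) == 3, parts  # "<var>@<part>@PIECE"
        return parts[0]
    return varname

print(run_varname_of("embedding_0.w_0.block0@1@PIECE"))  # embedding_0.w_0.block0
print(run_varname_of("fc_0.w_0@GRAD"))                   # left unchanged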
paddle/fluid/operators/distributed_ops/send_op.cc

@@ -47,8 +47,12 @@ class SendOp : public framework::OperatorBase {
     auto height_sections = Attr<std::vector<int64_t>>("sections");

     if (send_varnames.size() > 0) {
-      PADDLE_ENFORCE_EQ(ins.size(), 1, "");
-      distributed::Communicator::GetInstance()->Send(ins[0], scope);
+      if (ins.size() > 1) {
+        distributed::Communicator::GetInstance()->Send(ins, send_varnames,
+                                                       scope);
+      } else {
+        distributed::Communicator::GetInstance()->Send(ins[0], scope);
+      }
     } else {
       platform::DeviceContextPool& pool =
           platform::DeviceContextPool::Instance();
paddle/fluid/pybind/communicator_py.cc

@@ -15,8 +15,10 @@ limitations under the License. */
 #include "paddle/fluid/pybind/communicator_py.h"

 #include <Python.h>
+#include <map>
 #include <memory>
+#include <string>
+#include <vector>

 #include "paddle/fluid/framework/program_desc.h"
 #include "pybind11/pybind11.h"

@@ -27,6 +29,7 @@ namespace py = pybind11;
 using paddle::framework::ProgramDesc;
 using paddle::operators::distributed::Communicator;
 using paddle::operators::distributed::AsyncCommunicator;
+using paddle::operators::distributed::GeoSgdCommunicator;
 using paddle::framework::Scope;

 namespace paddle {

@@ -37,9 +40,20 @@ void BindCommunicator(py::module* m) {
   py::class_<Communicator, std::shared_ptr<Communicator>>(*m,
                                                           "DistCommunicator")
       .def(py::init([](const ProgramDesc& program, Scope* param_scope) {
        VLOG(0) << "using communicator";
        Communicator::InitInstance<AsyncCommunicator>(program, param_scope);
        return Communicator::GetInstantcePtr();
      }))
+      .def(py::init([](
+          const ProgramDesc& program, Scope* training_scope,
+          std::map<std::string,
+                   std::map<std::string, std::vector<std::string>>>& vars_info,
+          int& trainers, int& geo_need_push_nums) {
+        VLOG(0) << "using geo sgd communicator";
+        Communicator::InitInstance<GeoSgdCommunicator>(
+            program, training_scope, vars_info, trainers, geo_need_push_nums);
+        return Communicator::GetInstantcePtr();
+      }))
       .def("stop", &Communicator::Stop)
       .def("start", &Communicator::Start)
       .def("is_running", &Communicator::IsRunning);
paddle/fluid/pybind/communicator_py.h

@@ -16,7 +16,11 @@ limitations under the License. */
 #include <Python.h>

+#include "pybind11/chrono.h"
+#include "pybind11/complex.h"
+#include "pybind11/functional.h"
 #include "pybind11/pybind11.h"
+#include "pybind11/stl.h"

 namespace paddle {
 namespace pybind {
python/paddle/fluid/__init__.py

@@ -195,6 +195,7 @@ def __bootstrap__():
         read_env_flags.append('communicator_min_send_grad_num_before_recv')
         read_env_flags.append('communicator_thread_pool_size')
         read_env_flags.append('communicator_max_merge_var_num')
+        read_env_flags.append('communicator_merge_sparse_bucket')
         read_env_flags.append('communicator_fake_rpc')
         read_env_flags.append('communicator_send_wait_times')
         read_env_flags.append('communicator_merge_sparse_grad')
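Flags listed in read_env_flags are picked up from the process environment at import time, so the new communicator_merge_sparse_bucket knob can be set as an environment variable before paddle.fluid is imported. A hedged sketch, assuming the usual FLAGS_ prefix and with an illustrative value (not a documented default):

# Sketch only: set the flag in the environment before the import that runs
# __bootstrap__; the value 20 is illustrative, not a recommended setting.
import os
os.environ["FLAGS_communicator_merge_sparse_bucket"] = "20"

import paddle.fluid as fluid  # noqa: E402  flag must be set before this import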
python/paddle/fluid/communicator.py

@@ -13,6 +13,10 @@
 # limitations under the License.

 from .executor import global_scope
+"""
+Communicator is used for async distribute training in distribute_transpiler mode.
+It's a wrapper of a cpp class Communicator and should be used inside fleet API.
+"""
 from . import core
 from .framework import Program

@@ -20,7 +24,11 @@ __all__ = ['Communicator']
 class Communicator(object):
-    def __init__(self, program):
+    def __init__(self,
+                 program,
+                 vars_info=None,
+                 trainers=None,
+                 geo_sgd_need_push_nums=None):
         """
         Communicator is used for async distribute training in distribute_transpiler mode.
         It's a wrapper of a cpp class Communicator and should be used inside fleet API.

@@ -47,7 +55,15 @@ class Communicator(object):
         for op in program.block(0).ops:
             if op.type == "recv":
                 op._set_attr('do_not_run', True)
-        self.communicator_ = core.DistCommunicator(program.desc, global_scope())
+        # Todo: Add check
+        if vars_info and trainers and geo_sgd_need_push_nums:
+            # for geo sgd
+            self.communicator_ = core.DistCommunicator(
+                program.desc,
+                global_scope(), vars_info, trainers, geo_sgd_need_push_nums)
+        else:
+            self.communicator_ = core.DistCommunicator(program.desc,
+                                                       global_scope())

     def start(self):
         """
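With the constructor change above, the same Python wrapper can create either the async communicator or the geo-sgd communicator. A hedged usage sketch (it needs a Paddle build with distributed support; vars_info is normally produced by the geo-sgd transpiler, and the literal below only illustrates its nested dict-of-lists shape, not real values):

# Usage sketch for the extended Communicator constructor shown above.
import paddle.fluid as fluid
from paddle.fluid.communicator import Communicator

program = fluid.default_main_program()
use_geo_sgd = True

if use_geo_sgd:
    # shape of vars_info is illustrative only; it comes from the transpiler
    vars_info = {"embedding_0.w_0": {"epmap": ["127.0.0.1:6000"],
                                     "var_names": ["embedding_0.w_0"],
                                     "sections": ["100"]}}
    comm = Communicator(program, vars_info, trainers=2,
                        geo_sgd_need_push_nums=100)
else:
    comm = Communicator(program)

comm.start()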
python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py

@@ -13,7 +13,9 @@
 # limitations under the License.

 import os
 import warnings
+"""
+Convert the fluid program to distributed data-parallelism programs.
+"""
 import paddle.fluid.io as io
 from paddle.fluid.communicator import Communicator
 from paddle.fluid.framework import default_main_program

@@ -24,6 +26,7 @@ from paddle.fluid.executor import Executor
 from paddle.fluid.parallel_executor import ParallelExecutor
 from paddle.fluid.optimizer import Optimizer
 from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspiler as OriginTranspiler
+from paddle.fluid.transpiler.geo_sgd_transpiler import GeoSgdTranspiler
 from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig
 from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer

@@ -64,6 +67,12 @@ class DistributedTranspiler(Fleet):
             wait_server_ready(fleet.server_endpoints(to_string=False))

         if not self._transpile_config.sync_mode:
-            self._communicator = Communicator(self.main_program)
+            if self._transpile_config.geo_sgd_mode:
+                self._communicator = Communicator(
+                    self.main_program, self.vars_info,
+                    fleet.worker_num(),
+                    self._transpile_config.geo_sgd_need_push_nums)
+            else:
+                self._communicator = Communicator(self.main_program)

             if not self._communicator.is_running():

@@ -124,7 +133,6 @@ class DistributedTranspiler(Fleet):
         ):
             self._communicator.stop()
-
             self._executor.close()

         if isinstance(self._role_maker, MPISymetricRoleMaker):
             self._role_maker._finalize()

@@ -239,6 +247,9 @@ class DistributedTranspiler(Fleet):
         self._origin_program = default_main_program().clone(for_test=False)

         self._transpile_config = config
-        self._transpiler = OriginTranspiler(config)
+        if config.geo_sgd_mode:
+            self._transpiler = GeoSgdTranspiler(config)
+        else:
+            self._transpiler = OriginTranspiler(config)

         if self.is_worker():

@@ -254,6 +265,9 @@ class DistributedTranspiler(Fleet):
             self.main_program = self._transpiler.get_trainer_program(
                 wait_port=config.wait_port)
             self.startup_program = default_startup_program()
+            if self._transpile_config.geo_sgd_mode:
+                self.vars_info = self._transpiler._get_vars_info()
+                self.startup_program = self._transpiler.trainer_startup_program
         else:
             self._transpiler.transpile(
                 trainer_id=fleet.worker_index(),

@@ -262,7 +276,8 @@ class DistributedTranspiler(Fleet):
                 sync_mode=config.sync_mode,
                 current_endpoint=self.server_endpoints()[self.server_index()])
             self.main_program, self.startup_program = \
-                self._transpiler.get_pserver_programs(self.server_endpoints()[self.server_index()])
+                self._transpiler.get_pserver_programs(
+                    self.server_endpoints()[self.server_index()])


 fleet = DistributedTranspiler()
python/paddle/fluid/tests/unittests/CMakeLists.txt

@@ -24,6 +24,7 @@ if(NOT WITH_DISTRIBUTE)
     LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op)
     LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op)
    LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_ctr)
+    LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_geo)
 endif(NOT WITH_DISTRIBUTE)
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py

@@ -13,7 +13,9 @@
 # limitations under the License.

 from __future__ import print_function
+"""
+high level unit test for distribute fleet.
+"""

 import argparse
 import os
 import pickle

@@ -29,6 +31,7 @@ from contextlib import closing
 import six
 import unittest
 import numpy as np
+import tempfile

 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker

@@ -40,6 +43,12 @@ LEARNING_RATE = 0.01
 class FleetDistRunnerBase(object):
+    """
+    run_pserver,run_trainer : after init role, using transpiler split program
+    net : implment by child class, the network of model
+    do training : exe run program
+    """
+
     def run_pserver(self, args):
         if args.role.upper() != "PSERVER":
             raise ValueError("args role must be PSERVER")

@@ -54,6 +63,8 @@ class FleetDistRunnerBase(object):
         strategy = DistributeTranspilerConfig()
         strategy.sync_mode = args.sync_mode
+        strategy.geo_sgd_mode = args.geo_sgd_mode
+        strategy.geo_sgd_need_push_nums = args.geo_sgd_need_push_nums

         avg_cost = self.net()

@@ -78,14 +89,14 @@ class FleetDistRunnerBase(object):
         strategy = DistributeTranspilerConfig()
         strategy.sync_mode = args.sync_mode
+        strategy.geo_sgd_mode = args.geo_sgd_mode
+        strategy.geo_sgd_need_push_nums = args.geo_sgd_need_push_nums

         avg_cost = self.net()

         optimizer = fluid.optimizer.SGD(LEARNING_RATE)
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
         optimizer.minimize(avg_cost)

-        self.do_training(fleet)
         out = self.do_training(fleet)

     def net(self, batch_size=4, lr=0.01):

@@ -98,6 +109,11 @@ class FleetDistRunnerBase(object):
 class TestFleetBase(unittest.TestCase):
+    """
+    start_pserver,start_trainer : add start cmd to test
+    run_cluster : using multi process to test distribute program
+    """
+
     def _setup_config(self):
         raise NotImplementedError("tests should have _setup_config implemented")

@@ -109,6 +125,8 @@ class TestFleetBase(unittest.TestCase):
         self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
             self._find_free_port(), self._find_free_port())
         self._python_interp = sys.executable
+        self._geo_sgd = False
+        self._geo_sgd_need_push_nums = 5
         self._setup_config()

     def _find_free_port(self):

@@ -127,8 +145,8 @@ class TestFleetBase(unittest.TestCase):
     def _start_pserver(self, cmd, required_envs):
         ps0_cmd, ps1_cmd = cmd.format(0), cmd.format(1)

-        ps0_pipe = open("/tmp/ps0_err.log", "wb+")
-        ps1_pipe = open("/tmp/ps1_err.log", "wb+")
+        ps0_pipe = open(tempfile.gettempdir() + "/ps0_err.log", "wb+")
+        ps1_pipe = open(tempfile.gettempdir() + "/ps1_err.log", "wb+")

         ps0_proc = subprocess.Popen(
             ps0_cmd.strip().split(" "),

@@ -140,14 +158,13 @@ class TestFleetBase(unittest.TestCase):
             stdout=subprocess.PIPE,
             stderr=ps1_pipe,
             env=required_envs)
-
         return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe

     def _start_trainer(self, cmd, required_envs):
         tr0_cmd, tr1_cmd = cmd.format(0), cmd.format(1)

-        tr0_pipe = open("/tmp/tr0_err.log", "wb+")
-        tr1_pipe = open("/tmp/tr1_err.log", "wb+")
+        tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+")
+        tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+")

         tr0_proc = subprocess.Popen(
             tr0_cmd.strip().split(" "),

@@ -164,18 +181,29 @@ class TestFleetBase(unittest.TestCase):
     def _run_cluster(self, model, envs):
         env = {'CPU_NUM': '1'}
+        python_path = self._python_interp
+
+        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
+            envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
+            python_path += " -m coverage run --branch -p"
         env.update(envs)

         tr_cmd = "{0} {1} --role trainer --endpoints {2} --current_id {{}} --trainers {3}".format(
-            self._python_interp, model, self._ps_endpoints, self._trainers)
+            python_path, model, self._ps_endpoints, self._trainers)

         ps_cmd = "{0} {1} --role pserver --endpoints {2} --current_id {{}} --trainers {3}".format(
-            self._python_interp, model, self._ps_endpoints, self._trainers)
+            python_path, model, self._ps_endpoints, self._trainers)

         if self._sync_mode:
             tr_cmd += " --sync_mode"
             ps_cmd += " --sync_mode"

+        if self._geo_sgd:
+            tr_cmd += " --geo_sgd_mode {0} --geo_sgd_need_push_nums {1}".format(
+                self._geo_sgd, self._geo_sgd_need_push_nums)
+            ps_cmd += " --geo_sgd_mode {0} --geo_sgd_need_push_nums {1}".format(
+                self._geo_sgd, self._geo_sgd_need_push_nums)
+
         # Run dist train to compare with local results
         ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
         tr0, tr1, tr0_pipe, tr1_pipe = self._start_trainer(tr_cmd, env)

@@ -259,7 +287,10 @@ def runtime_main(test_class):
     parser.add_argument('--current_id', type=int, required=False, default=0)
     parser.add_argument('--trainers', type=int, required=False, default=1)
     parser.add_argument('--sync_mode', action='store_true')
+    parser.add_argument(
+        '--geo_sgd_mode', type=bool, required=False, default=False)
+    parser.add_argument(
+        '--geo_sgd_need_push_nums', type=int, required=False, default=2)
     args = parser.parse_args()

     model = test_class()
python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py  (new file)

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os
import unittest

import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig

from test_dist_fleet_base import TestFleetBase
from dist_simnet_bow import train_network


def skip_ci(func):
    on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0')))

    def __func__(*args, **kwargs):
        if on_ci:
            return
        return func(*args, **kwargs)

    return __func__


class TestDistGeoCtr_2x2(TestFleetBase):
    def _setup_config(self):
        self._sync_mode = False
        self._geo_sgd = True
        self._geo_sgd_need_push_nums = 5

    def check_with_place(self,
                         model_file,
                         delta=1e-3,
                         check_error_log=False,
                         need_envs={}):
        required_envs = {
            "PATH": os.getenv("PATH", ""),
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
            "http_proxy": ""
        }

        required_envs.update(need_envs)

        if check_error_log:
            required_envs["GLOG_v"] = "3"
            required_envs["GLOG_logtostderr"] = "1"

        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)

    def test_dist_train(self):
        self.check_with_place(
            "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)


class TestGeoSgdTranspiler(unittest.TestCase):
    def test_pserver(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])

        fleet.init(role)

        batch_size = 128
        is_sparse = True
        is_distribute = False

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.geo_sgd_mode = True
        strategy.geo_sgd_need_push_nums = 5

        avg_cost, _, _ = train_network(batch_size, is_distribute, is_sparse)

        optimizer = fluid.optimizer.SGD(0.1)
        optimizer = fleet.distributed_optimizer(optimizer, strategy)
        optimizer.minimize(avg_cost)

        pserver_startup_program = fleet.startup_program
        pserver_mian_program = fleet.main_program


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/transpiler/distribute_transpiler.py
...
@@ -180,6 +180,10 @@ class DistributeTranspilerConfig(object):
     _runtime_split_send_recv = False
     _sync_mode = True

+    # Geo-sgd algorithm
+    geo_sgd_mode = False
+    geo_sgd_need_push_nums = 100
+
     nccl_comm_num = 1
     #The picture here illustrates the principle:
     #https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
...
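To show how these two new config fields are meant to be used, here is a minimal sketch mirroring the fleet test above. The push-num value is an arbitrary example, and the exact unit it counts (roughly, how much local training is accumulated before a delta is pushed) is an assumption about the geo-sgd communicator rather than something stated in this diff:

from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig

strategy = DistributeTranspilerConfig()
strategy.sync_mode = False             # geo-sgd runs asynchronously
strategy.geo_sgd_mode = True           # select the geo-sgd transpiler/communicator path
strategy.geo_sgd_need_push_nums = 400  # example value (see the assumption in the text above)
# The strategy is then passed to fleet.distributed_optimizer(optimizer, strategy).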
python/paddle/fluid/transpiler/geo_sgd_transpiler.py
0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
"""
Steps to transpile trainer:
1. split variables into multiple blocks, aligned by product(dim[1:]) (width).
2. create delta variables in the global scope, which are used for sending.
3. add a send op to send sparse ids to the communicator.

Steps to transpile pserver:
1. create a new program for the parameter server.
2. create the param variables that are assigned to the current server instance.
3. create a sub-block in the server side program.
4. append sum ops that should run on the current server instance.
5. add the listen_and_serv op.
"""

import sys
import collections
import six
import numpy as np

from .ps_dispatcher import RoundRobin, PSDispatcher
from .. import core, framework
from ..framework import Program, default_main_program, \
    default_startup_program, Block, Parameter
from .details import wait_server_ready, VarsDistributed
from .details import delete_ops
from ..distribute_lookup_table import find_distributed_lookup_table
from .distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig, slice_variable, same_or_split_var

RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC


class GeoSgdTranspiler(DistributeTranspiler):
    def __init__(self, config=None):
        if config is not None:
            self.config = config
        else:
            self.config = DistributeTranspilerConfig()

        if self.config.split_method is None:
            self.config.split_method = RoundRobin

        assert (self.config.min_block_size >= 8192)
        assert (self.config.split_method.__bases__[0] == PSDispatcher)

    def transpile(self,
                  trainer_id,
                  program=None,
                  pservers="127.0.0.1:6174",
                  trainers=1,
                  sync_mode=False,
                  startup_program=None,
                  current_endpoint="127.0.0.1:6174"):
        if program is None:
            program = default_main_program()
        if startup_program is None:
            startup_program = default_startup_program()
        self.origin_program = program
        self.startup_program = startup_program
        self.origin_startup_program = self.startup_program.clone()

        self.trainer_num = trainers
        # geo-sgd only supports async mode
        self.sync_mode = False
        self.trainer_id = trainer_id
        pserver_endpoints = pservers.split(",")
        self.pserver_endpoints = pserver_endpoints
        self.vars_overview = VarsDistributed()
        self.optimize_ops, self.params_grads = self._get_optimize_pass()
        ps_dispatcher = self.config.split_method(self.pserver_endpoints)
        self.param_name_to_grad_name = dict()
        self.grad_name_to_param_name = dict()
        for param_var, grad_var in self.params_grads:
            self.param_name_to_grad_name[param_var.name] = grad_var.name
            self.grad_name_to_param_name[grad_var.name] = param_var.name

        # distribute lookup table
        self.table_name = find_distributed_lookup_table(self.origin_program)
        self.has_distributed_lookup_table = self.table_name != None
        self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None

        # add distributed attrs to program
        self.origin_program._is_distributed = True
        self.origin_program._endpoints = self.pserver_endpoints
        self.origin_program._ps_endpoint = current_endpoint
        self.origin_program._is_chief = self.trainer_id == 0

        # program info sent to the geo-sgd communicator
        self.vars_info = collections.OrderedDict()
        self.split_to_origin_mapping = collections.OrderedDict()
        self.delta_vars_list = []
        self.sparse_var_list = []
        self.sparse_var_splited_list = []

        # step 1. split and create vars, then put splited vars in dicts for later use.
        self._init_splited_vars()

        # step 3. create send recv var (param after optimize)
        send_vars = []
        ps_dispatcher.reset()
        param_var_mapping_items = list(six.iteritems(self.param_var_mapping))
        # send_vars are the split parameters sent to the pserver by the communicator,
        # not the origin parameters
        for _, splited_vars in param_var_mapping_items:
            for _, var in enumerate(splited_vars):
                send_vars.append(var)

        recv_vars = send_vars
        ps_dispatcher.reset()
        eplist = ps_dispatcher.dispatch(recv_vars)
        for i, ep in enumerate(eplist):
            self.param_opt_ep_mapping[ep]["params"].append(recv_vars[i])
            distributed_var = self.vars_overview.get_distributed_var_by_slice(
                recv_vars[i].name)
            distributed_var.endpoint = ep
            origin_name = self.split_to_origin_mapping[recv_vars[i].name]
            self.vars_info[origin_name]["epmap"].append(ep)
        self.origin_program._parameters_on_pservers = self.vars_overview

        # send sparse ids to the communicator
        self.sparse_var = []
        self.sparse_tables = []
        for op in self.origin_program.global_block().ops:
            if op.type == "lookup_table":
                op._set_attr('remote_prefetch', False)
                for input_var_name, sparse_var_name in zip(
                        op.input("Ids"), op.input("W")):
                    if sparse_var_name in self.sparse_var_list:
                        input_var = program.global_block().var(input_var_name)
                        self.sparse_var.append(input_var)
                        self.sparse_tables.append(sparse_var_name)

        # batch training loop end flag
        dummy_output = program.global_block().create_var(
            name=framework.generate_control_dev_var_name())
        program.global_block().append_op(
            type="send",
            inputs={"X": self.sparse_var},
            outputs={"Out": dummy_output},
            attrs={"send_varnames": self.sparse_tables})

        # add param_init flag in trainer startup program
        self.trainer_startup_program = self._get_trainer_startup_program(
            recv_vars=recv_vars, eplist=eplist)
        for delta_var in self.delta_vars_list:
            self.trainer_startup_program.global_block().create_var(
                name=delta_var.name,
                persistable=delta_var.persistable,
                dtype=delta_var.dtype,
                type=delta_var.type,
                shape=delta_var.shape)
        dummy_output = self.trainer_startup_program.global_block().create_var(
            name=framework.generate_control_dev_var_name())
        param_init = self.trainer_startup_program.global_block().create_var(
            name="param_init")
        self.trainer_startup_program.global_block().append_op(
            type="send",
            inputs={"X": [param_init]},
            outputs={"Out": dummy_output},
            attrs={"send_varnames": [param_init.name]})

    def _get_vars_info(self):
        return self.vars_info

    def get_trainer_program(self, wait_port=True):
        # if wait_port:
        #     wait_server_ready(self.pserver_endpoints)
        return self.origin_program

    def get_pserver_programs(self, endpoint):
        pserver_prog = self.get_pserver_program(endpoint)
        self.param_grad_ep_mapping = self.param_opt_ep_mapping
        pserver_startup = self.get_startup_program(
            endpoint, pserver_program=pserver_prog)
        return pserver_prog, pserver_startup

    def get_pserver_program(self, endpoint):
        # step1
        pserver_program = Program()
        pserver_program.random_seed = self.origin_program.random_seed
        pserver_program._copy_dist_param_info_from(self.origin_program)

        # step2: Create vars to receive vars at parameter servers.
        recv_inputs = []
        for v in self.param_opt_ep_mapping[endpoint]["params"]:
            self._clone_var(pserver_program.global_block(), v)

        optimize_block = []
        param_to_block_id = []
        sparse_grad_to_param = []

        # append op to the current block
        pre_block_idx = pserver_program.num_blocks - 1
        for var in self.param_opt_ep_mapping[endpoint]["params"]:
            per_opt_block = pserver_program._create_block(pre_block_idx)
            optimize_block.append(per_opt_block)
            var_name = var.name
            pserver_block = per_opt_block.program.global_block()
            param = pserver_block.vars[var_name]
            delta_var_name = "%s.delta" % (param.name)
            if var.name in self.sparse_var_splited_list:
                delta_type = core.VarDesc.VarType.SELECTED_ROWS
                sparse_grad_to_param.append(":".join(
                    [delta_var_name, param.name]))
            else:
                delta_type = param.type
            delta_var = pserver_block.create_var(
                name=delta_var_name,
                persistable=False,
                type=delta_type,
                dtype=param.dtype,
                shape=param.shape)

            per_opt_block.append_op(
                type="sum",
                inputs={"X": [param, delta_var]},
                outputs={"Out": param})

            param_to_block_id.append(delta_var_name + ":" + str(
                per_opt_block.idx))

        attrs = {
            "optimize_blocks": optimize_block,
            "endpoint": endpoint,
            "Fanin": self.trainer_num,
            "sync_mode": self.sync_mode,
            "grad_to_block_id": param_to_block_id,
            "sparse_grad_to_param": sparse_grad_to_param
        }

        # step5 append the listen_and_serv op
        pserver_program.global_block().append_op(
            type="listen_and_serv",
            inputs={'X': recv_inputs},
            outputs={},
            attrs=attrs)

        pserver_program._sync_with_cpp()
        # save the pserver program so the pserver-side startup program can be generated from it.
        self.pserver_program = pserver_program
        return pserver_program

    def _init_splited_vars(self):
        param_list = []
        grad_list = []
        param_grad_set = set()
        # step 1. create param_list
        for p, g in self.params_grads:
            if type(p) == Parameter and p.trainable == False:
                continue
            if p.name not in param_grad_set:
                param_list.append(p)
                param_grad_set.add(p.name)
            if g.name not in param_grad_set:
                grad_list.append(g)
                param_grad_set.add(g.name)
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                self.sparse_var_list.append(p.name)

        # step 2. Slice vars into numbers of pieces with block_size
        # when we slice var up into blocks, we will slice the var according to
        # pserver services' count. A pserver may have two or more listening ports.
        param_blocks = slice_variable(param_list,
                                      len(self.pserver_endpoints),
                                      self.config.min_block_size)

        # step 3. Create splited params from split blocks
        # origin_param_name -> [splited_param_vars]
        # Todo: update _create_vars_from_blocklist
        self.param_var_mapping = self._create_vars_from_blocklist(
            self.origin_program, param_blocks)

        # step 4. Create mapping of endpoint -> split var to create pserver side program
        self.param_opt_ep_mapping = collections.OrderedDict()
        [
            self.param_opt_ep_mapping.update({
                ep: {
                    "params": [],
                }
            }) for ep in self.pserver_endpoints
        ]

        # step 5. Create delta vars of Geo-Sgd & record vars information
        for origin_name, splited_vars in self.param_var_mapping.items():
            origin_var = self.origin_program.global_block().var(origin_name)
            self.vars_info[origin_name] = collections.OrderedDict()
            self.vars_info[origin_name]["var_names"] = []
            vars_section = self._get_splited_var_sections(splited_vars)
            self.vars_info[origin_name]["sections"] = [
                str(i) for i in vars_section
            ]
            self.vars_info[origin_name]["epmap"] = []
            self.vars_info[origin_name]["is_sparse"] = []
            # todo: add var shape (may be unnecessary, because the recv scope has it)
            if origin_name in self.sparse_var_list:
                delta_type = core.VarDesc.VarType.SELECTED_ROWS
                self.vars_info[origin_name]["is_sparse"].append("True")
            else:
                delta_type = origin_var.type
                self.vars_info[origin_name]["is_sparse"].append("False")

            delta_var = self.origin_program.global_block().create_var(
                name=".".join([origin_name, "delta"]),
                persistable=False,
                dtype=origin_var.dtype,
                type=delta_type,
                shape=origin_var.shape)

            self.delta_vars_list.append(delta_var)

            for splited_var in splited_vars:
                is_slice, block_id, offset = self._get_slice_var_info(
                    splited_var)
                self.vars_overview.add_distributed_var(
                    origin_var=origin_var,
                    slice_var=splited_var,
                    block_id=block_id,
                    offset=offset,
                    is_slice=is_slice,
                    vtype="Param")
                self.split_to_origin_mapping[splited_var.name] = origin_name
                if origin_name in self.sparse_var_list:
                    self.sparse_var_splited_list.append(splited_var.name)
                self.vars_info[origin_name]["var_names"].append(
                    splited_var.name)
                if len(splited_vars) != 1:
                    self.origin_program.global_block().create_var(
                        name=".".join([splited_var.name, "delta"]),
                        persistable=False,
                        dtype=splited_var.dtype,
                        type=delta_type,
                        shape=splited_var.shape)
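GeoSgdTranspiler is normally driven through fleet, but as a hedged sketch of the interface defined above (a toy dense network, placeholder endpoints and ids; not code taken from this PR), direct usage would look roughly like this:

import paddle.fluid as fluid
from paddle.fluid.transpiler.geo_sgd_transpiler import GeoSgdTranspiler
from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig

# build a trivial network so the transpiler has parameters to split
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_pred = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.mean(fluid.layers.square_error_cost(input=y_pred, label=y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(cost)

config = DistributeTranspilerConfig()
config.geo_sgd_mode = True
config.geo_sgd_need_push_nums = 100

t = GeoSgdTranspiler(config)
t.transpile(
    trainer_id=0,
    pservers="127.0.0.1:6174,127.0.0.1:6175",
    trainers=2,
    current_endpoint="127.0.0.1:6174")

trainer_prog = t.get_trainer_program()   # trainer-side program (origin program with send ops added)
vars_info = t._get_vars_info()           # split/endpoint info handed to the geo-sgd communicator
pserver_prog, pserver_startup = t.get_pserver_programs("127.0.0.1:6174")  # server-side programs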