机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 0c321fe3: Merge branch develop
Authored by _青葱 on Apr 12, 2018
Parents: ac78cc04, 4c55a602
Showing 23 changed files with 504 additions and 150 deletions (+504 -150)
Changed files:
paddle/fluid/framework/block_desc.h                              +1   -1
paddle/fluid/framework/details/computation_op_handle.cc          +3   -1
paddle/fluid/framework/details/fetch_op_handle.cc                +7   -1
paddle/fluid/framework/details/op_handle_base.h                  +2   -0
paddle/fluid/framework/details/ssa_graph_executor.h              +3   -1
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc    +0   -30
paddle/fluid/framework/details/threaded_ssa_graph_executor.h     +0   -3
paddle/fluid/framework/operator.cc                               +9   -4
paddle/fluid/framework/parallel_executor.cc                      +39  -8
paddle/fluid/operators/concat_op.cc                              +5   -1
paddle/fluid/operators/detail/grpc_server.cc                     +1   -0
paddle/fluid/operators/listen_and_serv_op.cc                     +29  -16
paddle/fluid/operators/listen_and_serv_op.h                      +2   -0
paddle/fluid/operators/lookup_table_op.cc                        +3   -0
paddle/fluid/operators/prefetch_op.cc                            +4   -4
paddle/fluid/operators/send_recv_op_test.cc                      +14  -12
paddle/fluid/operators/send_vars_op.cc                           +2   -2
paddle/fluid/operators/sgd_op.cc                                 +2   -2
paddle/fluid/operators/split_ids_op.cc                           +8   -6
paddle/fluid/operators/split_ids_op.h                            +49  -21
paddle/fluid/operators/sum_op.cc                                 +6   -1
python/paddle/fluid/distribute_transpiler.py                     +309 -34
python/paddle/fluid/layers/nn.py                                 +6   -2
paddle/fluid/framework/block_desc.h
@@ -92,7 +92,7 @@ class BlockDesc {
   /*
    * Remove Op and its input/output variables.
-   * Note that for either input or ouput variable, if it is also an input or
+   * Note that for either input or output variable, if it is also an input or
    * output variable of other ops, we should remain it.
    */
   void RemoveOp(size_t s, size_t e);
paddle/fluid/framework/details/computation_op_handle.cc
@@ -14,6 +14,8 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"
+
+#include <string>

 namespace paddle {
 namespace framework {
 namespace details {
@@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() {
     }
   }

-  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
+  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
 }

 std::string ComputationOpHandle::Name() const { return op_->Type(); }
paddle/fluid/framework/details/fetch_op_handle.cc
@@ -14,6 +14,9 @@
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+
+#include <string>
+#include <vector>

 namespace paddle {
 namespace framework {
 namespace details {
@@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() {
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
-    auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
+    auto &t = scope->FindVar(kLocalExecScopeName)
+                  ->Get<Scope *>()
+                  ->FindVar(var_name)
+                  ->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(var->place_)) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
paddle/fluid/framework/details/op_handle_base.h
@@ -24,6 +24,8 @@ namespace paddle {
 namespace framework {
 namespace details {

+constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
+
 class OpHandleBase {
  private:
   DISABLE_COPY_AND_ASSIGN(OpHandleBase);
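The kLocalExecScopeName constant gives every op handle a shared key under which a parent scope publishes its per-iteration local execution scope, replacing the hard-coded "@TMP_SCOPE@" string used before. Below is a minimal, self-contained sketch of that lookup pattern; the Scope class here is a toy stand-in, not Paddle's real API:

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";  // spelling as in the commit

// Toy scope: each variable slot simply holds a pointer to another Scope.
class Scope {
 public:
  Scope *NewScope() {
    kids_.emplace_back(new Scope());
    return kids_.back().get();
  }
  // Create (or reuse) the slot for a named variable.
  Scope **Var(const std::string &name) { return &vars_[name]; }
  Scope *FindVar(const std::string &name) const {
    auto it = vars_.find(name);
    return it == vars_.end() ? nullptr : it->second;
  }

 private:
  std::map<std::string, Scope *> vars_;
  std::vector<std::unique_ptr<Scope>> kids_;
};

int main() {
  Scope parent;
  // Executor side: create this iteration's local scope and publish it under the key.
  Scope *local = parent.NewScope();
  *parent.Var(kLocalExecScopeName) = local;

  // Op-handle side: resolve the local scope by the shared key, as RunImpl() now does.
  Scope *resolved = parent.FindVar(kLocalExecScopeName);
  std::cout << "resolved == local: " << std::boolalpha << (resolved == local) << "\n";
  return 0;
}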
paddle/fluid/framework/details/ssa_graph_executor.h
@@ -15,13 +15,15 @@
 #pragma once

+#include <memory>
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"

 namespace paddle {
 namespace framework {
 namespace details {

 class SSAGraphExecutor {
   DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     ready_ops.clear();
   };

-  // Create local scopes.
-  for (auto &scope : local_scopes_) {
-    auto &local_scope = scope->NewScope();
-    *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
-  }
-
   // Step 3. Execution
   while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
     // 1. Run All Ready ops
@@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   PADDLE_ENFORCE(ready_ops.empty());
   PADDLE_ENFORCE(delayed_ops.empty());
   PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
-  ++computation_count_;
-
-  auto sync_computation = [&] {
-    computation_count_ = 0;
-    // Wait All computational streams
-    for (auto p : this->places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-    }
-    for (auto &scope : local_scopes_) {
-      scope->DropKids();
-    }
-  };

   // Wait FetchOps.
   if (!fetch_ops.empty()) {
     fetch_ops.clear();
-    sync_computation();
   }

-  if (computation_count_ == max_async_computation) {
-    sync_computation();
-  }
-
-  // NOTE: the temp scope can be dropped lazily if needed.
-  // Drop tmp scopes;
-  for (auto &scope : local_scopes_) {
-    auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
-    kid = nullptr;
-  }
-
   return fetch_data;
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::unique_ptr<platform::EnforceNotMet> exception_;
   std::atomic<int> running_ops_;
   bool allow_op_delay_;
-
-  size_t computation_count_{0};
-  size_t max_async_computation{100};
 };

 }  // namespace details
paddle/fluid/framework/operator.cc
@@ -46,7 +46,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
   }
 }

-static DDim GetDims(const Scope& scope, const std::string& name) {
+static DDim GetDims(const Scope& scope, const std::string& name,
+                    bool get_actual_dim = false) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
     return DDim({-1});
@@ -55,7 +56,11 @@ static DDim GetDims(const Scope& scope, const std::string& name) {
   if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>().dims();
   } else if (var->IsType<SelectedRows>()) {
-    return var->Get<SelectedRows>().GetCompleteDims();
+    if (get_actual_dim) {
+      return var->Get<SelectedRows>().value().dims();
+    } else {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    }
   } else {
     return DDim({-1});
   }
@@ -129,7 +134,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < input.second.size(); ++i) {
       ss << input.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, input.second[i]) << "]";
+        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
       if (i != input.second.size() - 1) {
@@ -149,7 +154,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < output.second.size(); ++i) {
       ss << output.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, output.second[i]) << "]";
+        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, output.second[i]) << ")";
       }
       if (i != output.second.size() - 1) {
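The new get_actual_dim flag changes what GetDims() reports for a SelectedRows variable in operator debug strings: the dims of the value tensor actually stored versus the logical complete dims. A small stand-alone sketch of the distinction, using a toy struct rather than Paddle's SelectedRows:

#include <cstdint>
#include <iostream>
#include <vector>

struct SelectedRowsLike {
  int64_t height;              // logical number of rows
  std::vector<int64_t> rows;   // indices of the rows actually present
  int64_t row_width;           // width of each stored row

  // "actual" dims: only the rows that are materialized (value().dims()).
  std::vector<int64_t> value_dims() const {
    return {static_cast<int64_t>(rows.size()), row_width};
  }
  // "complete" dims: the full logical shape (GetCompleteDims()).
  std::vector<int64_t> complete_dims() const { return {height, row_width}; }
};

int main() {
  SelectedRowsLike grad{/*height=*/10000, /*rows=*/{3, 42, 77}, /*row_width=*/128};
  auto a = grad.value_dims();
  auto c = grad.complete_dims();
  std::cout << "actual:   " << a[0] << " x " << a[1] << "\n";   // 3 x 128
  std::cout << "complete: " << c[0] << " x " << c[1] << "\n";   // 10000 x 128
  return 0;
}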
paddle/fluid/framework/parallel_executor.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"

 #include <string>
+#include <tuple>
 #include <vector>

 #ifdef PADDLE_WITH_CUDA
@@ -41,6 +42,8 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
+
+  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
 };

 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor(
                                           allow_op_delay));

   // Step 3. Create vars in each scope;
-  for (auto *scope : member_->local_scopes_) {
-    for (auto *var : main_program.Block(0).AllVars()) {
-      if (scope->FindVar(var->Name()) != nullptr) {
-        continue;
-      }
-      InitializeVariable(scope->Var(var->Name()), var->GetType());
-    }
+  for (auto *var : main_program.Block(0).AllVars()) {
+    member_->var_types_.emplace_back(var->Name(), var->GetType(),
+                                     var->Persistable());
   }
@@ -163,9 +161,42 @@ void ParallelExecutor::Run(
     const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
   platform::RecordBlock b(0);
   SplitTensorToPlaces(feed_tensors);
+
+  // Create local scopes.
+  for (auto &scope : member_->local_scopes_) {
+    Scope &local_scope = scope->NewScope();
+    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+        &local_scope;
+
+    for (auto &name_type_pair : member_->var_types_) {
+      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
+        continue;
+      }
+
+      if (std::get<2>(name_type_pair)) {  // Persistable
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      } else {
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      }
+    }
+  }
+
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
+
+  // Wait All computational streams
+  for (auto p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+  for (auto &scope : member_->local_scopes_) {
+    auto &local_scope =
+        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+    scope->DeleteScope(local_scope);
+    local_scope = nullptr;
+  }
 }

 void ParallelExecutor::SplitTensorToPlaces(
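The var_types_ member caches one (name, type, persistable) tuple per program variable in the constructor, so each Run() can initialize missing variables in the freshly created local scopes without re-walking the ProgramDesc. A simplified sketch of that bookkeeping; the enum and the variable names below are placeholders, not Paddle's proto types:

#include <iostream>
#include <string>
#include <tuple>
#include <vector>

enum class VarType { LOD_TENSOR, SELECTED_ROWS };

int main() {
  // Filled once in the constructor, from main_program.Block(0).AllVars().
  std::vector<std::tuple<std::string, VarType, bool>> var_types;
  var_types.emplace_back("fc_0.w_0", VarType::LOD_TENSOR, true);    // persistable
  var_types.emplace_back("fc_0.tmp_0", VarType::LOD_TENSOR, false); // temporary

  // Every Run() walks the cached tuples instead of the ProgramDesc: skip
  // variables that already exist, initialize the rest; the bool element
  // marks persistable variables.
  for (const auto &v : var_types) {
    const std::string &name = std::get<0>(v);
    bool persistable = std::get<2>(v);
    std::cout << name << (persistable ? " (persistable)" : " (temporary)") << "\n";
  }
  return 0;
}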
paddle/fluid/operators/concat_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/concat_op.h"
 #include <string>
 #include <vector>
@@ -34,7 +35,10 @@ class ConcatOp : public framework::OperatorWithKernel {
     size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
     const size_t n = ins.size();

-    PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
+    if (n == 1) {
+      VLOG(3) << "Warning: concat op have only one input, may waste memory";
+    }

     auto out_dims = ins[0];
     size_t in_zero_dims_size = out_dims.size();
paddle/fluid/operators/detail/grpc_server.cc
@@ -161,6 +161,7 @@ class RequestPrefetch final : public RequestBase {
     ::grpc::ByteBuffer reply;

     std::string var_name = request_->OutVarname();
+    VLOG(3) << "prefetch var " << var_name;
     auto var_desc = program_->Block(0).FindVar(var_name);
     framework::Scope* local_scope = &scope_->NewScope();
     auto* var = local_scope->FindVar(var_name);
paddle/fluid/operators/listen_and_serv_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <ostream>
-#include <thread>
+#include <thread>  // NOLINT
+#include <vector>

 #include "paddle/fluid/operators/listen_and_serv_op.h"
@@ -88,8 +89,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   auto ins = Inputs("X");
   auto fan_in = Attr<int>("Fanin");

-  auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *program = block->Program();
+  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
+  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
+  auto *program = optimize_block->Program();
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
@@ -97,18 +99,25 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   framework::Executor executor(dev_place);
   std::vector<int> block_list;
   for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
-    block_list.push_back(blkid);
+    if (blkid != prefetch_block->ID()) {
+      block_list.push_back(blkid);
+    }
   }
-  auto prepared = executor.Prepare(*program, block_list);
+  auto optimize_prepared = executor.Prepare(*program, block_list);
   // Insert placeholder for block0 which holds current op itself.
-  prepared.insert(
-      prepared.begin(),
+  optimize_prepared.insert(
+      optimize_prepared.begin(),
       std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));

   rpc_service_->SetScope(&recv_scope);
   rpc_service_->SetDevCtx(&dev_ctx);
   // TODO(qiao) set proper fields for table lookup and update
   rpc_service_->SetExecutor(&executor);
-  rpc_service_->SetPrefetchBlkdId(0);
+  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
+  rpc_service_->SetPrefetchBlkdId(prefetch_block->ID());
+  rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get());
+  prefetch_prepared.release();
   rpc_service_->SetProgram(program);
   // start the server listening after all member initialized.
   server_thread_.reset(new std::thread(RunServer, rpc_service_));
@@ -166,16 +175,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     parallel_blkids.push_back(1);
     double ts = detail::GetTimestamp();
     for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
-      if (program->Block(blkid).Parent() != last_parent_blkid) {
-        ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
-                              &recv_scope);
-        parallel_blkids.clear();
-        last_parent_blkid = program->Block(blkid).Parent();
+      if (blkid != prefetch_block->ID()) {
+        if (program->Block(blkid).Parent() != last_parent_blkid) {
+          ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
+                                program, &recv_scope);
+          parallel_blkids.clear();
+          last_parent_blkid = program->Block(blkid).Parent();
+        }
+        parallel_blkids.push_back(blkid);
       }
-      parallel_blkids.push_back(blkid);
     }
-    ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
-                          &recv_scope);
+    ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
+                          program, &recv_scope);
     VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";

     // Reset the received sparse variables, the sum operator would not
@@ -211,6 +222,8 @@ from send_op and send back variables to recv_op.
         .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
     AddAttr<framework::BlockDesc *>(kOptimizeBlock,
                                     "BlockID to run on server side.");
+    AddAttr<framework::BlockDesc *>(kPrefetchBlock,
+                                    "prefetch block to run on server side.");
     AddAttr<int>("Fanin", "How many clients send to this server.")
         .SetDefault(1);
   }
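With the prefetch block attribute in place, the server prepares every block except block 0 (the op itself) and the prefetch block as optimize blocks, and gives the prefetch block its own prepared context for serving prefetch RPCs. A small sketch of just the block-id selection; the block count and ids below are made up for illustration:

#include <iostream>
#include <vector>

int main() {
  const int num_blocks = 5;         // server program size (made up)
  const int prefetch_block_id = 3;  // Attr(kPrefetchBlock)->ID() in the real op

  // All blocks except block 0 and the prefetch block are optimize blocks.
  std::vector<int> optimize_block_ids;
  for (int blkid = 1; blkid < num_blocks; ++blkid) {
    if (blkid != prefetch_block_id) {
      optimize_block_ids.push_back(blkid);  // executed via optimize_prepared
    }
  }

  std::cout << "optimize blocks:";
  for (int id : optimize_block_ids) std::cout << ' ' << id;
  std::cout << "\nprefetch block: " << prefetch_block_id << "\n";
  return 0;
}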
paddle/fluid/operators/listen_and_serv_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <stdint.h>
 #include <ostream>
 #include <string>

 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -27,6 +28,7 @@ namespace paddle {
 namespace operators {

 constexpr char kOptimizeBlock[] = "OptimizeBlock";
+constexpr char kPrefetchBlock[] = "PrefetchBlock";

 void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);
paddle/fluid/operators/lookup_table_op.cc
@@ -78,6 +78,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(boolean, default false) "
                   "Sparse update.")
         .SetDefault(false);
+    AddAttr<bool>("is_distributed",
+                  "(boolean, default false) distributed lookup table.")
+        .SetDefault(false);
     AddAttr<int64_t>("padding_idx",
                      "(int64, default -1) "
                      "If the value is -1, it makes no effect to lookup. "
paddle/fluid/operators/prefetch_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <future>
+#include <future>  // NOLINT
 #include <ostream>

 #include "paddle/fluid/framework/data_type.h"
@@ -50,8 +50,8 @@ class PrefetchOp : public framework::OperatorBase {
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get "
-                << outs[i] << "back";
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
+                << outs[i] << " back";
         rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
                                           outs[i]);
       } else {
@@ -71,7 +71,7 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
              "(RPCClient) The RPC client object which will be"
              "initialized at most once.");
     AddOutput("Out",
-              "(SelectedRows) result "
+              "(LoDTensor) result "
               "to be fetched from parameter server")
         .AsDuplicable();
     AddAttr<std::vector<std::string>>(
paddle/fluid/operators/send_recv_op_test.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include <unistd.h>
 #include <string>
-#include <thread>
+#include <thread>  // NOLINT

 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -37,11 +37,11 @@ namespace m = paddle::operators::math;
 std::unique_ptr<f::OperatorBase> listen_and_serv_op;
 int selected_port;

-void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) {
   p::CPUDeviceContext ctx(place);
   for (int i = 0; i < 2; ++i) {
     auto var_name = paddle::string::Sprintf("x%d", i);
-    auto var = scope.Var(var_name);
+    auto var = scope->Var(var_name);
     auto tensor = var->GetMutable<f::LoDTensor>();
     tensor->Resize({10, 10});
     float *expect = tensor->mutable_data<float>(place);
@@ -50,20 +50,20 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
     }
   }

-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out_tensor = out_var->GetMutable<f::LoDTensor>();
   out_tensor->Resize({10, 10});
   out_tensor->mutable_data<float>(place);  // allocate
 }

-void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) {
   p::CPUDeviceContext ctx(place);
   int64_t height = 10;
   int64_t row_numel = 10;
   m::SetConstant<p::CPUDeviceContext, float> set_one;
   // init x0
   std::vector<int64_t> rows0{0, 4, 7};
-  auto x0_var = scope.Var("x0");
+  auto x0_var = scope->Var("x0");
   auto x0 = x0_var->GetMutable<f::SelectedRows>();
   x0->set_rows(rows0);
   x0->set_height(height);
@@ -74,7 +74,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
   // init x1
   std::vector<int64_t> rows1{2, 9};
-  auto x1_var = scope.Var("x1");
+  auto x1_var = scope->Var("x1");
   auto x1 = x1_var->GetMutable<f::SelectedRows>();
   x1->set_rows(rows1);
   x1->set_height(height);
@@ -83,7 +83,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
       f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
   set_one(ctx, x1_value, 1.0);

-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out = out_var->GetMutable<f::SelectedRows>();
   auto out_value = out->mutable_value();
   out->set_height(height);
@@ -117,15 +117,16 @@ void StartServerNet(bool is_sparse) {
   f::Scope scope;
   p::CPUPlace place;
   if (is_sparse) {
-    InitSelectedRowsInScope(scope, place);
+    InitSelectedRowsInScope(place, &scope);
   } else {
-    InitTensorsInScope(scope, place);
+    InitTensorsInScope(place, &scope);
   }

   // sub program run in listen_and_serv_op, for simple test we use sum
   f::ProgramDesc program;
   const auto &root_block = program.Block(0);
   auto *optimize_block = program.AppendBlock(root_block);
+  auto *prefetch_block = program.AppendBlock(root_block);
   // X for server side tensors, RX for received tensers, must be of same shape.
   AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
@@ -135,6 +136,7 @@ void StartServerNet(bool is_sparse) {
   attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
   attrs.insert({"GradList", std::vector<std::string>({"x1"})});
   attrs.insert({"OptimizeBlock", optimize_block});
+  attrs.insert({"PrefetchBlock", prefetch_block});
   listen_and_serv_op =
       f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
   LOG(INFO) << "selected port before run " << selected_port;
@@ -148,7 +150,7 @@ TEST(SendRecvOp, CPUDense) {
   // local net
   f::Scope scope;
   p::CPUPlace place;
-  InitTensorsInScope(scope, place);
+  InitTensorsInScope(place, &scope);
   // create rpc client var
   scope.Var("RPC_CLIENT_VAR");
@@ -191,7 +193,7 @@ TEST(SendRecvOp, CPUSparse) {
   f::Scope scope;
   p::CPUPlace place;
   p::CPUDeviceContext ctx(place);
-  InitSelectedRowsInScope(scope, place);
+  InitSelectedRowsInScope(place, &scope);
   scope.Var("RPC_CLIENT_VAR");
   f::AttributeMap attrs;
   selected_port = static_cast<paddle::operators::ListenAndServOp *>(
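The test helpers also switch from mutable reference parameters to the const-reference-in, pointer-out convention that cpplint (Google C++ style) expects. A toy sketch of the signature change, with stand-in types instead of f::Scope and p::CPUPlace:

#include <iostream>
#include <map>
#include <string>

using Scope = std::map<std::string, float>;  // stand-in for f::Scope
struct CPUPlace {};                          // stand-in for p::CPUPlace

// Before: void InitTensorsInScope(Scope &scope, CPUPlace &place);
// After: inputs by const reference, outputs by pointer.
void InitTensorsInScope(const CPUPlace &place, Scope *scope) {
  (void)place;
  (*scope)["x0"] = 1.0f;  // the real test fills LoDTensors here
}

int main() {
  Scope scope;
  CPUPlace place;
  InitTensorsInScope(place, &scope);  // mutation is visible at the call site
  std::cout << "x0 = " << scope["x0"] << "\n";
  return 0;
}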
paddle/fluid/operators/send_vars_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <future>
+#include <future>  // NOLINT
 #include <ostream>

 #include "paddle/fluid/framework/data_type.h"
@@ -36,7 +36,7 @@ class SendVarsOp : public framework::OperatorBase {
     auto ins = Inputs("X");

     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    int sync_send = Attr<int>("sync_sent");
+    int sync_send = Attr<int>("sync_send");

     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &ctx = *pool.Get(place);
paddle/fluid/operators/sgd_op.cc
@@ -35,8 +35,8 @@ class SGDOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                       "Learning rate should have 1 element");
     auto param_dim = ctx->GetInputDim("Param");
-    // TODO(qijun): check dimensions of Param and Grad at complie
-    // and run time.
+    // TODO(qijun): check dimensions of Param and Grad at compile
+    // and runtime.
     ctx->SetOutputDim("ParamOut", param_dim);
   }
paddle/fluid/operators/split_ids_op.cc
@@ -48,11 +48,11 @@ class SplitIdsOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");

     auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
-
     auto ids_dims = ctx->GetInputDim("Ids");
-    PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-    PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    }
   }
 };
@@ -60,8 +60,9 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc &op_desc,
                   framework::BlockDesc *block) const override {
+    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
     for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
+      block->Var(out_var)->SetType(input_var->GetType());
     }
   }
 };
@@ -73,4 +74,5 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
                   ops::SplitIdsOpInferVarType);
 REGISTER_OP_CPU_KERNEL(
-    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>);
+    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
+    ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
paddle/fluid/operators/split_ids_op.h
@@ -24,35 +24,63 @@ namespace operators {
 template <typename DeviceContext, typename T>
 class SplitIdsOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto place = ctx.GetPlace();
     if (!platform::is_cpu_place(place)) {
       PADDLE_THROW("SplitIds do not support GPU kernel");
     }

-    auto &ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
-    const T *ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
-    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
-    const size_t shard_num = outs.size();
-
-    std::vector<std::vector<T>> out_ids;
-    out_ids.resize(outs.size());
-
-    // split id by their shard_num.
-    for (int i = 0; i < ids_dims[0]; ++i) {
-      T id = ids[i];
-      size_t shard_id = static_cast<size_t>(id) % shard_num;
-      out_ids[shard_id].push_back(id);
-    }
-
-    // create tensor for each shard and send to parameter server
-    for (size_t i = 0; i < out_ids.size(); ++i) {
-      auto *shard_t = outs[i];
-      std::vector<T> ids = out_ids[i];
-      auto *shard_data = shard_t->mutable_data<T>(
-          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
-      for (size_t i = 0; i < ids.size(); ++i) {
-        shard_data[i] = ids[i];
+    const auto *ids_var = ctx.InputVar("Ids");
+    if (ids_var->IsType<framework::LoDTensor>()) {
+      const auto &ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
+      const T *ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
+      auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
+      const size_t shard_num = outs.size();
+
+      std::vector<std::vector<T>> out_ids;
+      out_ids.resize(outs.size());
+
+      // split id by their shard_num.
+      for (int i = 0; i < ids_dims[0]; ++i) {
+        T id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        out_ids[shard_id].push_back(id);
+      }
+
+      // create tensor for each shard and send to parameter server
+      for (size_t i = 0; i < out_ids.size(); ++i) {
+        auto *shard_t = outs[i];
+        std::vector<T> ids = out_ids[i];
+        auto *shard_data = shard_t->mutable_data<T>(
+            framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
+        for (size_t i = 0; i < ids.size(); ++i) {
+          shard_data[i] = ids[i];
+        }
+      }
+    } else if (ids_var->IsType<framework::SelectedRows>()) {
+      const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
+      auto &ids_dims = ids_selected_rows->value().dims();
+      PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), "");
+      const T *ids = ids_selected_rows->value().data<T>();
+      const auto &ids_rows = ids_selected_rows->rows();
+      auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
+      const size_t shard_num = outs.size();
+      // get rows for outputs
+      for (auto &id : ids_rows) {
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        outs[shard_id]->mutable_rows()->push_back(id);
+      }
+
+      int64_t row_width = ids_dims[1];
+      for (auto &out : outs) {
+        out->set_height(ids_selected_rows->height());
+        framework::DDim ddim = framework::make_ddim(
+            {static_cast<int64_t>(out->rows().size()), row_width});
+        T *output = out->mutable_value()->mutable_data<T>(ddim, place);
+        for (size_t i = 0; i < ddim[0]; ++i) {
+          memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
+                 row_width * sizeof(T));
+        }
       }
     }
   }
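Both branches of the rewritten kernel route an id to output shard id % shard_num, which is what lets a distributed lookup table send each row to the parameter server that owns it. A self-contained sketch of that routing rule; the shard count and ids below are made-up values:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const size_t shard_num = 3;  // number of "Out" outputs / parameter servers
  std::vector<int64_t> ids = {0, 4, 7, 2, 9, 12};

  // Each id goes to the shard whose index is id % shard_num.
  std::vector<std::vector<int64_t>> out_ids(shard_num);
  for (int64_t id : ids) {
    size_t shard_id = static_cast<size_t>(id) % shard_num;
    out_ids[shard_id].push_back(id);
  }

  for (size_t i = 0; i < shard_num; ++i) {
    std::cout << "shard " << i << ":";
    for (int64_t id : out_ids[i]) std::cout << ' ' << id;
    std::cout << "\n";
  }
  return 0;
}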
paddle/fluid/operators/sum_op.cc
@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/sum_op.h"
 #include <algorithm>
 #include <string>
 #include <vector>

 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
@@ -37,7 +39,10 @@ class SumOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputsDim("X");
     size_t N = x_dims.size();
-    PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
+    if (N == 1) {
+      VLOG(3) << "Warning: sum have only one input, may waste memory";
+    }

     framework::DDim in_dim({0});
     for (auto &x_dim : x_dims) {
python/paddle/fluid/distribute_transpiler.py  +309 -34
This diff is collapsed in the web view.
python/paddle/fluid/layers/nn.py
@@ -218,6 +218,7 @@ def fc(input,
 def embedding(input,
               size,
               is_sparse=False,
+              is_distributed=False,
               padding_idx=None,
               param_attr=None,
               dtype='float32'):
@@ -268,8 +269,11 @@ def embedding(input,
         inputs={'Ids': input,
                 'W': w},
         outputs={'Out': tmp},
-        attrs={'is_sparse': is_sparse,
-               'padding_idx': padding_idx})
+        attrs={
+            'is_sparse': is_sparse,
+            'is_distributed': is_distributed,
+            'padding_idx': padding_idx
+        })
     return tmp