Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
f28ae6e4
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f28ae6e4
编写于
3月 21, 2018
作者:
Y
Yu Yang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Reorganize Code
上级
5c333e41
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
126 addition
and
64 deletion
+126
-64
paddle/fluid/framework/CMakeLists.txt
paddle/fluid/framework/CMakeLists.txt
+7
-1
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+2
-0
paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+74
-0
paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
+41
-0
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+2
-63
未找到文件。
paddle/fluid/framework/CMakeLists.txt
浏览文件 @
f28ae6e4
...
@@ -87,9 +87,15 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
...
@@ -87,9 +87,15 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope
cc_library
(
executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table feed_fetch_method
)
framework_proto backward glog lod_rank_table feed_fetch_method
)
if
(
WITH_GPU
)
set
(
parallel_executor_cuda_deps nccl_all_reduce_op_handle
)
else
()
set
(
parallel_executor_cuda_deps
)
endif
()
cc_library
(
parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope
cc_library
(
parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle
framework_proto backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle
fetch_op_handle
)
fetch_op_handle
${
parallel_executor_cuda_deps
}
)
cc_library
(
prune SRCS prune.cc DEPS framework_proto
)
cc_library
(
prune SRCS prune.cc DEPS framework_proto
)
cc_test
(
prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context
)
cc_test
(
prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context
)
...
...
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
f28ae6e4
...
@@ -2,3 +2,5 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
...
@@ -2,3 +2,5 @@ cc_library(var_handle SRCS var_handle.cc DEPS place)
cc_library
(
op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
)
cc_library
(
op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context
)
cc_library
(
scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
)
cc_library
(
scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
)
cc_library
(
fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
)
cc_library
(
fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
)
nv_library
(
nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda
)
paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
0 → 100644
浏览文件 @
f28ae6e4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
NCCLAllReduceOpHandle
::
NCCLAllReduceOpHandle
(
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
&
ctxs
)
:
local_scopes_
(
local_scopes
),
places_
(
places
),
nccl_ctxs_
(
ctxs
)
{
for
(
auto
&
p
:
places_
)
{
this
->
dev_ctx_
[
p
]
=
nccl_ctxs_
.
DevCtx
(
p
);
}
}
void
NCCLAllReduceOpHandle
::
RunImpl
()
{
if
(
inputs_
.
size
()
==
1
)
{
return
;
// No need to all reduce when GPU count = 1;
}
else
{
// Wait input done
for
(
auto
*
in
:
inputs_
)
{
auto
&
p
=
static_cast
<
VarHandle
*>
(
in
)
->
place_
;
in
->
generated_op_
->
Wait
(
dev_ctx_
[
p
]);
}
auto
&
var_name
=
static_cast
<
VarHandle
*>
(
this
->
inputs_
[
0
])
->
name_
;
int
dtype
=
-
1
;
size_t
numel
=
0
;
platform
::
NCCLGroupGuard
guard
;
for
(
size_t
i
=
0
;
i
<
local_scopes_
.
size
();
++
i
)
{
auto
&
p
=
places_
[
i
];
auto
*
s
=
local_scopes_
[
i
];
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
).
device
;
auto
&
lod_tensor
=
s
->
FindVar
(
var_name
)
->
Get
<
LoDTensor
>
();
void
*
buffer
=
const_cast
<
void
*>
(
lod_tensor
.
data
<
void
>
());
uintptr_t
buf
=
reinterpret_cast
<
uintptr_t
>
(
buffer
);
if
(
buf
%
sizeof
(
float
)
!=
0
)
{
VLOG
(
3
)
<<
"Buffer is not aligned "
<<
buf
;
}
if
(
dtype
==
-
1
)
{
dtype
=
platform
::
ToNCCLDataType
(
lod_tensor
.
type
());
}
if
(
numel
==
0
)
{
numel
=
static_cast
<
size_t
>
(
lod_tensor
.
numel
());
}
auto
&
nccl_ctx
=
nccl_ctxs_
.
at
(
dev_id
);
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
buffer
,
buffer
,
numel
,
static_cast
<
ncclDataType_t
>
(
dtype
),
ncclSum
,
nccl_ctx
.
comm_
,
nccl_ctx
.
stream
()));
}
}
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/nccl_all_reduce_op_handle.h
0 → 100644
浏览文件 @
f28ae6e4
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/nccl_helper.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
struct
NCCLAllReduceOpHandle
:
public
OpHandleBase
{
const
std
::
vector
<
Scope
*>
&
local_scopes_
;
const
std
::
vector
<
platform
::
Place
>
&
places_
;
const
platform
::
NCCLContextMap
&
nccl_ctxs_
;
NCCLAllReduceOpHandle
(
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
&
ctxs
);
protected:
void
RunImpl
()
override
;
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
f28ae6e4
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include "lod_tensor_array.h"
#include "lod_tensor_array.h"
#include "op_registry.h"
#include "op_registry.h"
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/details/var_handle.h"
...
@@ -28,6 +29,7 @@ namespace framework {
...
@@ -28,6 +29,7 @@ namespace framework {
using
details
::
DummyVarHandle
;
using
details
::
DummyVarHandle
;
using
details
::
FetchOpHandle
;
using
details
::
FetchOpHandle
;
using
details
::
NCCLAllReduceOpHandle
;
using
details
::
OpHandleBase
;
using
details
::
OpHandleBase
;
using
details
::
ScaleLossGradOpHandle
;
using
details
::
ScaleLossGradOpHandle
;
using
details
::
VarHandle
;
using
details
::
VarHandle
;
...
@@ -123,69 +125,6 @@ class ParallelExecutorPrivate {
...
@@ -123,69 +125,6 @@ class ParallelExecutorPrivate {
var
.
place_
=
place
;
var
.
place_
=
place
;
op_handle
->
AddOutput
(
&
var
);
op_handle
->
AddOutput
(
&
var
);
}
}
};
// namespace framework
struct
NCCLAllReduceOpHandle
:
public
OpHandleBase
{
const
std
::
vector
<
Scope
*>
&
local_scopes_
;
const
std
::
vector
<
platform
::
Place
>
&
places_
;
const
platform
::
NCCLContextMap
&
nccl_ctxs_
;
explicit
NCCLAllReduceOpHandle
(
const
std
::
vector
<
Scope
*>
&
local_scopes
,
const
std
::
vector
<
platform
::
Place
>
&
places
,
const
platform
::
NCCLContextMap
&
ctxs
)
:
local_scopes_
(
local_scopes
),
places_
(
places
),
nccl_ctxs_
(
ctxs
)
{
for
(
auto
&
p
:
places_
)
{
this
->
dev_ctx_
[
p
]
=
nccl_ctxs_
.
DevCtx
(
p
);
}
}
void
Wait
(
platform
::
DeviceContext
*
waited_dev
)
override
{
OpHandleBase
::
Wait
(
waited_dev
);
}
protected:
void
RunImpl
()
override
{
if
(
inputs_
.
size
()
==
1
)
{
return
;
// No need to all reduce when GPU count = 1;
}
else
{
// Wait input done
for
(
auto
*
in
:
inputs_
)
{
auto
&
p
=
static_cast
<
VarHandle
*>
(
in
)
->
place_
;
in
->
generated_op_
->
Wait
(
dev_ctx_
[
p
]);
}
auto
&
var_name
=
static_cast
<
VarHandle
*>
(
this
->
inputs_
[
0
])
->
name_
;
int
dtype
=
-
1
;
size_t
numel
=
0
;
platform
::
NCCLGroupGuard
guard
;
for
(
size_t
i
=
0
;
i
<
local_scopes_
.
size
();
++
i
)
{
auto
&
p
=
places_
[
i
];
auto
*
s
=
local_scopes_
[
i
];
int
dev_id
=
boost
::
get
<
platform
::
CUDAPlace
>
(
p
).
device
;
auto
&
lod_tensor
=
s
->
FindVar
(
var_name
)
->
Get
<
framework
::
LoDTensor
>
();
void
*
buffer
=
const_cast
<
void
*>
(
lod_tensor
.
data
<
void
>
());
uintptr_t
buf
=
reinterpret_cast
<
uintptr_t
>
(
buffer
);
if
(
buf
%
sizeof
(
float
)
!=
0
)
{
VLOG
(
3
)
<<
"Buffer is not aligned "
<<
buf
;
}
if
(
dtype
==
-
1
)
{
dtype
=
platform
::
ToNCCLDataType
(
lod_tensor
.
type
());
}
if
(
numel
==
0
)
{
numel
=
static_cast
<
size_t
>
(
lod_tensor
.
numel
());
}
auto
&
nccl_ctx
=
nccl_ctxs_
.
at
(
dev_id
);
PADDLE_ENFORCE
(
platform
::
dynload
::
ncclAllReduce
(
buffer
,
buffer
,
numel
,
static_cast
<
ncclDataType_t
>
(
dtype
),
ncclSum
,
nccl_ctx
.
comm_
,
nccl_ctx
.
stream
()));
}
}
}
};
};
struct
ComputationOpHandle
:
public
OpHandleBase
{
struct
ComputationOpHandle
:
public
OpHandleBase
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录