PaddlePaddle / Paddle, commit 1bdb8578 (unverified)

heter for collective (#37613)

Authored by kuizhiqing on Dec 06, 2021; committed via GitHub on Dec 06, 2021.
Parent commit: 21b307ca

Showing 30 changed files with 859 additions and 79 deletions (+859, -79).
Changed files:

  paddle/fluid/framework/distributed_strategy.proto  +1 -0
  paddle/fluid/imperative/CMakeLists.txt  +3 -0
  paddle/fluid/imperative/bkcl_context.cc  +17 -0
  paddle/fluid/imperative/bkcl_context.h  +2 -0
  paddle/fluid/imperative/gloo_context.cc  +6 -1
  paddle/fluid/imperative/gloo_context.h  +2 -0
  paddle/fluid/imperative/hccl_context.cc  +23 -0
  paddle/fluid/imperative/hccl_context.h  +2 -0
  paddle/fluid/imperative/heter_ccl_context.cc  +203 -0
  paddle/fluid/imperative/heter_ccl_context.h  +78 -0
  paddle/fluid/imperative/nccl_context.cc  +22 -0
  paddle/fluid/imperative/nccl_context.h  +2 -0
  paddle/fluid/imperative/parallel_context.h  +2 -0
  paddle/fluid/imperative/reducer.cc  +72 -4
  paddle/fluid/imperative/reducer.h  +3 -2
  paddle/fluid/imperative/tests/CMakeLists.txt  +2 -0
  paddle/fluid/imperative/tests/heter_ccl_context_test.cc  +89 -0
  paddle/fluid/imperative/tests/nccl_context_test.cc  +50 -0
  paddle/fluid/pybind/CMakeLists.txt  +8 -3
  paddle/fluid/pybind/imperative.cc  +10 -0
  python/paddle/distributed/fleet/base/distributed_strategy.py  +31 -0
  python/paddle/distributed/fleet/base/fleet_base.py  +31 -14
  python/paddle/distributed/fleet/base/meta_optimizer_factory.py  +2 -1
  python/paddle/distributed/fleet/launch.py  +32 -24
  python/paddle/distributed/fleet/launch_utils.py  +70 -12
  python/paddle/distributed/fleet/meta_optimizers/__init__.py  +1 -0
  python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py  +1 -0
  python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/heter_parallel_optimizer.py  +66 -0
  python/paddle/distributed/parallel.py  +26 -16
  python/paddle/fluid/dygraph/parallel_helper.py  +2 -2
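Taken together, the Python-side changes in this commit are meant to be driven roughly as in the sketch below. It is assembled from the heter_ccl_mode docstring and the fleet_base.py hunks further down in this diff; the model, optimizer, and hyperparameters are placeholders, and the script is assumed to be started by the distributed launcher rather than run standalone.

    # minimal sketch, assuming a paddle build that contains this commit
    import paddle
    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.heter_ccl_mode = True      # new switch added by this commit

    fleet.init(is_collective=True, strategy=strategy)

    model = paddle.nn.Linear(10, 10)    # placeholder model
    opt = paddle.optimizer.SGD(learning_rate=0.01,
                               parameters=model.parameters())

    # with heter_ccl_mode on, distributed_optimizer() wraps the inner optimizer
    # in HeterParallelOptimizer and distributed_model() falls back to
    # paddle.DataParallel (see fleet_base.py below)
    opt = fleet.distributed_optimizer(opt)
    model = fleet.distributed_model(model)
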
paddle/fluid/framework/distributed_strategy.proto

@@ -305,6 +305,7 @@ message DistributedStrategy {
   optional bool semi_auto = 35 [ default = false ];
   optional bool adam_d2sum = 36 [ default = true ];
   optional bool auto_search = 37 [ default = false ];
+  optional bool heter_ccl_mode = 38 [ default = false ];
 
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;

paddle/fluid/imperative/CMakeLists.txt

@@ -30,6 +30,9 @@ if(NOT WIN32)
     cc_library(hccl_context SRCS hccl_context.cc DEPS collective_helper device_context tensor var_type_traits)
     cc_library(reducer SRCS reducer.cc DEPS layer)
   endif()
+  if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL)
+    cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits)
+  endif()
   cc_library(data_loader SRCS data_loader.cc DEPS enforce)
 endif(NOT WIN32)
 if(WITH_GLOO)

paddle/fluid/imperative/bkcl_context.cc

@@ -150,6 +150,23 @@ void BKCLParallelContext::AllReduceByStream(const framework::Variable &src,
   }
 }
 
+void BKCLParallelContext::Broadcast(framework::Variable *src, int ring_id) {
+  VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id;
+  framework::Tensor *src_tensor = src->GetMutable<framework::LoDTensor>();
+  const auto &place = src_tensor->place();
+  platform::BKCLComm *comm =
+      platform::BKCLCommContext::Instance().Get(ring_id, place);
+  XPUStream stream = comm->stream();
+
+  void *src_ptr = src_tensor->data<void>();
+  auto data_type = platform::ToBKCLDataType(src_tensor->type());
+
+  PADDLE_ENFORCE_EQ(bkcl_broadcast(comm->comm(), src_ptr, src_ptr,
+                                   src_tensor->numel(), data_type, 0, stream),
+                    BKCL_SUCCESS,
+                    platform::errors::Unavailable("bkcl_broadcast failed"));
+}
+
 paddle::platform::DeviceContext *BKCLParallelContext::GetDeviceContext(
     int ring_id) {
   return static_cast<platform::DeviceContext *>(

paddle/fluid/imperative/bkcl_context.h

@@ -42,6 +42,8 @@ class BKCLParallelContext : public ParallelContext {
                          framework::Variable* dst, int ring_id,
                          bool use_calc_stream) override;
 
+  void Broadcast(framework::Variable* src, int ring_id) override;
+
   paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
 
   void WaitCompute(int ring_id) override;

paddle/fluid/imperative/gloo_context.cc

@@ -37,7 +37,7 @@ void GLOOParallelContext::Init() {
   gloo_wrapper->SetSize(strategy_.nranks_);
   gloo_wrapper->SetRank(strategy_.local_rank_);
   gloo_wrapper->SetPrefix("");
-  gloo_wrapper->SetIface("lo");
+  gloo_wrapper->SetIface("");
   auto addr = paddle::string::Split(strategy_.trainer_endpoints_[0], ':');
   VLOG(4) << "Server is" << strategy_.trainer_endpoints_[0];
   std::string host = addr[0];

@@ -176,6 +176,11 @@ void GLOOParallelContext::AllReduce(const framework::SelectedRows &src,
   }
 }
 
+void GLOOParallelContext::Broadcast(framework::Variable *src, int ring_id) {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "Unimplemented inter-broadcast for CPU now."));
+}
+
 paddle::platform::DeviceContext *GLOOParallelContext::GetDeviceContext(
     int ring_id) {
   // return the CPUDeviceContext

paddle/fluid/imperative/gloo_context.h

@@ -47,6 +47,8 @@ class GLOOParallelContext : public ParallelContext {
                          framework::Variable* dst, int ring_id,
                          bool use_calc_stream) override;
 
+  void Broadcast(framework::Variable* src, int ring_id) override;
+
   paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
 
   void WaitCompute(int ring_id) override;

paddle/fluid/imperative/hccl_context.cc

@@ -158,6 +158,29 @@ void HCCLParallelContext::AllReduceByStream(const framework::Variable &src,
   }
 }
 
+void HCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) {
+  VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id;
+  if (src->IsType<framework::LoDTensor>()) {
+    framework::Tensor *src_tensor = src->GetMutable<framework::LoDTensor>();
+    const auto &place = src_tensor->place();
+    platform::HCCLComm *comm =
+        platform::HCCLCommContext::Instance().Get(ring_id, place);
+    aclrtStream stream = comm->stream();
+
+    void *src_ptr =
+        reinterpret_cast<void *>(const_cast<void *>(src_tensor->data<void>()));
+    auto hccl_dtype = platform::ToHCCLDataType(src_tensor->type());
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
+        src_ptr, src_tensor->numel(), hccl_dtype, 0, comm->comm(),
+        reinterpret_cast<void *>(stream)));
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Unsupported variable type %s for imperative allreduce, only "
+        "LoDTensor is supported.",
+        platform::demangle(framework::ToTypeName(src->Type()))));
+  }
+}
+
 paddle::platform::DeviceContext *HCCLParallelContext::GetDeviceContext(
     int ring_id) {
   return static_cast<platform::DeviceContext *>(

paddle/fluid/imperative/hccl_context.h

@@ -50,6 +50,8 @@ class HCCLParallelContext : public ParallelContext {
                          framework::Variable* dst, int ring_id,
                          bool use_calc_stream) override;
 
+  void Broadcast(framework::Variable* src, int ring_id) override;
+
   paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
 
   void WaitCompute(int ring_id) override;

paddle/fluid/imperative/heter_ccl_context.cc  (new file, mode 100644)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/imperative/heter_ccl_context.h"

// NCCL first
#ifdef PADDLE_WITH_NCCL
#include "paddle/fluid/imperative/all_reduce.h"
#endif

#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/string/string_helper.h"

namespace paddle {
namespace framework {
class Variable;
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace imperative {

HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy,
                                           const int &device_id)
#ifdef PADDLE_WITH_NCCL
    : ParallelContext(strategy, platform::CUDAPlace(device_id))
#elif PADDLE_WITH_XPU_BKCL
    : ParallelContext(strategy, platform::XPUPlace(device_id))
#elif PADDLE_WITH_ASCEND_CL
    : ParallelContext(strategy, platform::NPUPlace(device_id))
#else
    : ParallelContext(strategy, platform::CPUPlace())
#endif
{
  // construct node_strategy_ from global strategy by selecting the
  // endpoints with same ip address.
  std::string node_ip = strategy_.current_endpoint_.substr(
      0, strategy_.current_endpoint_.find(':'));
  int node_nranks = 0;
  int inter_rank = -1;

  std::vector<std::string> all_eps = strategy_.trainer_endpoints_;
  std::vector<std::string> inter_endpoints;
  std::set<std::string> nodes_ips;
  for (auto ep : all_eps) {
    std::string ip = ep.substr(0, ep.find(':'));
    // record ip of different nodes
    if (nodes_ips.find(ip) == nodes_ips.end()) {
      if (ep == strategy_.current_endpoint_) {
        inter_rank = nodes_ips.size();
      }
      inter_endpoints.push_back(ep);
      nodes_ips.emplace(ip);
    }

    if (ip == node_ip) {
      if (ep == strategy_.current_endpoint_) {
        node_strategy_.local_rank_ = node_nranks;
      }
      node_nranks++;
      node_strategy_.trainer_endpoints_.push_back(ep);
    }
  }

  VLOG(0) << "init node size " << node_nranks << " rank "
          << node_strategy_.local_rank_;

  PADDLE_ENFORCE_NE(node_nranks, 0,
                    platform::errors::InvalidArgument(
                        "The number of local nranks should not be zero."));
  node_strategy_.nranks_ = node_nranks;
  node_strategy_.current_endpoint_ = strategy_.current_endpoint_;

  if (inter_rank >= 0 && inter_endpoints.size() > 1) {
    inter_strategy_.nranks_ = inter_endpoints.size();
    inter_strategy_.local_rank_ = inter_rank;
    inter_strategy_.current_endpoint_ = strategy_.current_endpoint_;
    inter_strategy_.trainer_endpoints_ = inter_endpoints;
    inter_parallel_ctx_ = std::make_shared<GLOOParallelContext>(
        inter_strategy_, platform::CPUPlace());
  }

  VLOG(0) << "init inter size " << inter_endpoints.size() << " rank "
          << inter_rank;

#ifdef PADDLE_WITH_NCCL
  node_place_ = platform::CUDAPlace(device_id);
  node_parallel_ctx_ =
      std::make_shared<NCCLParallelContext>(node_strategy_, node_place_);
#endif
#ifdef PADDLE_WITH_XPU_BKCL
  node_place_ = platform::XPUPlace(device_id);
  node_parallel_ctx_ =
      std::make_shared<BKCLParallelContext>(node_strategy_, node_place_);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
  node_place_ = platform::NPUPlace(device_id);
  node_parallel_ctx_ =
      std::make_shared<HCCLParallelContext>(node_strategy_, node_place_);
#endif
}

void HeterParallelContext::Init() {
  PADDLE_ENFORCE_NE(
      node_parallel_ctx_, nullptr,
      platform::errors::Unavailable(
          "The heter parallel context has not been initialized."));

  if (inter_parallel_ctx_ != nullptr) {
    inter_parallel_ctx_->Init();
  }

  node_parallel_ctx_->Init();

  VLOG(3) << "/// DEBUG /// heter parallel env init done..." << std::endl;
}

void HeterParallelContext::InitWithRingID(int ring_id) {
  PADDLE_THROW(platform::errors::Unimplemented(
      "Unimplemented InitWithRingID from heter ctx."));
}

void HeterParallelContext::AllReduceByStream(const framework::Variable &src,
                                             framework::Variable *dst,
                                             int ring_id,
                                             bool use_calc_stream) {
  // step 1: call reduce within node
  VLOG(3) << "/// DEBUG /// step 1: reduce in node... ";
  node_parallel_ctx_->AllReduceByStream(src, dst, ring_id, false);
  node_parallel_ctx_->WaitComm(ring_id);

  // step 2: call allreduce between nodes with gloo
  if (inter_parallel_ctx_ != nullptr) {
    // copy src to cpu
    // dst is now the src
    auto src_tensor = dst->Get<framework::LoDTensor>();
    framework::Variable src_cpu;
    auto src_cpu_tensor = src_cpu.GetMutable<framework::LoDTensor>();
    framework::TensorCopySync(src_tensor, platform::CPUPlace(), src_cpu_tensor);

    // allreduce src/cpu to dst/cpu
    framework::Variable dst_cpu;
    inter_parallel_ctx_->AllReduceByStream(src_cpu, &dst_cpu, ring_id, false);
    inter_parallel_ctx_->WaitComm(ring_id);

    // copy dst/cpu to dst
    auto dst_cpu_tensor = dst_cpu.Get<framework::LoDTensor>();
    auto dst_tensor = dst->GetMutable<framework::LoDTensor>();
    framework::TensorCopySync(dst_cpu_tensor, dst_tensor->place(), dst_tensor);

    inter_parallel_ctx_->WaitComm(ring_id);
  }

  // step 3: call broadcast within node
  VLOG(3) << "/// DEBUG /// step 3: broadcast within node... ";
  node_parallel_ctx_->WaitComm(ring_id);
  node_parallel_ctx_->Broadcast(dst, ring_id);
  node_parallel_ctx_->WaitComm(ring_id);
}

void HeterParallelContext::Broadcast(framework::Variable *src, int ring_id) {
  PADDLE_THROW(platform::errors::Unimplemented("Unimplemented function."));
}

paddle::platform::DeviceContext *HeterParallelContext::GetDeviceContext(
    int ring_id) {
  // directly call the implementation of target parallel ctx.
  return node_parallel_ctx_->GetDeviceContext(ring_id);
}

void HeterParallelContext::WaitCompute(int ring_id) {
  // directly call the implementation of target parallel ctx.
  node_parallel_ctx_->WaitCompute(ring_id);
}

void HeterParallelContext::WaitComm(int ring_id) {
  // directly call the implementation of target parallel ctx.
  node_parallel_ctx_->WaitComm(ring_id);
}

void HeterParallelContext::SynchronizeCompute() {
  // directly call the implementation of target parallel ctx.
  node_parallel_ctx_->SynchronizeCompute();
}

}  // namespace imperative
}  // namespace paddle

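The AllReduceByStream implementation above is a three-step hierarchical allreduce: reduce within each node over the fast intra-node channel (NCCL/BKCL/HCCL), allreduce the per-node results across nodes through Gloo on CPU, then broadcast the result back inside each node. The toy Python below (no Paddle involved, values invented) only illustrates why the three steps reproduce a global allreduce-sum.

    # grads[node][local_rank]: per-rank gradients before synchronization
    grads = {"node0": [1.0, 2.0], "node1": [3.0, 4.0]}

    node_sums = {n: sum(r) for n, r in grads.items()}              # step 1: intra-node reduce
    global_sum = sum(node_sums.values())                           # step 2: inter-node allreduce
    result = {n: [global_sum] * len(r) for n, r in grads.items()}  # step 3: intra-node broadcast

    assert all(v == 10.0 for ranks in result.values() for v in ranks)
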
paddle/fluid/imperative/heter_ccl_context.h  (new file, mode 100644)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>
#include <string>
#include <vector>

#ifdef PADDLE_WITH_NCCL
#include "paddle/fluid/imperative/nccl_context.h"
#endif

#ifdef PADDLE_WITH_XPU_BKCL
#include "paddle/fluid/imperative/bkcl_context.h"
#endif

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/imperative/hccl_context.h"
#endif

#include "paddle/fluid/imperative/gloo_context.h"
#include "paddle/fluid/imperative/parallel_context.h"

namespace paddle {
namespace framework {
class Variable;
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace imperative {

class HeterParallelContext : public ParallelContext {
 public:
  explicit HeterParallelContext(const ParallelStrategy& strategy,
                                const int& device_id);

  ~HeterParallelContext() override = default;

  void Init() override;

  void InitWithRingID(int ring_id) override;

  void AllReduceByStream(const framework::Variable& src,
                         framework::Variable* dst, int ring_id,
                         bool use_calc_stream) override;

  void Broadcast(framework::Variable* src, int ring_id) override;

  paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;

  void WaitCompute(int ring_id) override;

  void WaitComm(int ring_id) override;

  void SynchronizeCompute() override;

 private:
  ParallelStrategy inter_strategy_;
  ParallelStrategy node_strategy_;
  platform::Place node_place_;
  std::shared_ptr<imperative::ParallelContext> node_parallel_ctx_{nullptr};
  std::shared_ptr<imperative::ParallelContext> inter_parallel_ctx_{nullptr};
};

}  // namespace imperative
}  // namespace paddle

paddle/fluid/imperative/nccl_context.cc

@@ -20,7 +20,15 @@
 #include "paddle/fluid/platform/gen_comm_id_helper.h"
 #endif
 
+#ifdef PADDLE_WITH_NCCL
+#include <nccl.h>
+#include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/nccl_helper.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {

@@ -127,6 +135,20 @@ void NCCLParallelContext::AllReduceByStream(const framework::Variable &src,
   AllReduce(src, dst, strategy_, ring_id, use_calc_stream);
 }
 
+void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) {
+  VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id;
+  framework::Tensor *src_tensor = src->GetMutable<framework::LoDTensor>();
+  const auto &place = src_tensor->place();
+  platform::NCCLComm *comm =
+      platform::NCCLCommContext::Instance().Get(ring_id, place);
+  gpuStream_t stream = comm->stream();
+
+  void *src_ptr = src_tensor->data<void>();
+  auto nccl_dtype = platform::ToNCCLDataType(src_tensor->type());
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
+      src_ptr, src_tensor->numel(), nccl_dtype, 0, comm->comm(), stream));
+}
+
 paddle::platform::DeviceContext *NCCLParallelContext::GetDeviceContext(
     int ring_id) {
   return static_cast<platform::DeviceContext *>(

paddle/fluid/imperative/nccl_context.h

@@ -60,6 +60,8 @@ class NCCLParallelContext : public ParallelContext {
                          framework::Variable* dst, int ring_id,
                          bool use_calc_stream) override;
 
+  void Broadcast(framework::Variable* src, int ring_id) override;
+
   paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
 
   void WaitCompute(int ring_id) override;

paddle/fluid/imperative/parallel_context.h

@@ -56,6 +56,8 @@ class ParallelContext {
                                  framework::Variable* dst, int ring_id,
                                  bool use_calc_stream) = 0;
 
+  virtual void Broadcast(framework::Variable* src, int ring_id) = 0;
+
   virtual paddle::platform::DeviceContext* GetDeviceContext(int ring_id) = 0;
 
   // comm_stream[ring_id] wait compute_stream.

paddle/fluid/imperative/reducer.cc

@@ -27,8 +27,9 @@
 namespace paddle {
 namespace imperative {
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||     \
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
+    defined(PADDLE_WITH_ASCEND_CL)
 // div the nranks
 void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
   framework::Tensor *tensor =

@@ -41,6 +42,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     DivNRanks(tensor, nranks, context);
 #endif
+  } else if (platform::is_npu_place(tensor->place())) {
+    // TODO(kuizhiqing)
+    VLOG(4) << "divnrank for npu not support yet";
   } else if (platform::is_cpu_place(tensor->place())) {
     VLOG(4) << "before div 2" << *tensor;
     VLOG(4) << "NDiv for cpu devices : rank = " << nranks;

@@ -207,6 +211,70 @@
 }
 #endif
 
+// NOTE(liubo48): Only implement operators::math::SplitFunctor for npu now.
+// If later the operators::StridedMemcpyWithAxis0 is supported,
+// then this specific SplitTensorsForAllReduce can be removed.
+#ifdef PADDLE_WITH_ASCEND_CL
+template <>
+void SplitTensorsForAllReduce<platform::NPUDeviceContext, float>(
+    const platform::NPUDeviceContext &context,
+    framework::Variable *p_dense_contents,
+    std::vector<framework::Tensor> *p_dense_tensors) {
+  auto *in = p_dense_contents->GetMutable<framework::LoDTensor>();
+  std::vector<framework::Tensor *> outs;
+  std::vector<const framework::Tensor *> shape_refer;
+
+  outs.reserve(p_dense_tensors->size());
+  shape_refer.reserve(p_dense_tensors->size());
+
+  for (auto &tensor : *p_dense_tensors) {
+    outs.emplace_back(&tensor);
+    shape_refer.emplace_back(&tensor);
+  }
+  operators::math::SplitFunctor<platform::NPUDeviceContext, float>
+      split_functor_;
+  split_functor_(context, *in, shape_refer, 0, &outs);
+}
+
+template <>
+void ConcatTensorsWithType<platform::NPUDeviceContext>(
+    const platform::NPUDeviceContext &context,
+    const std::vector<framework::Tensor> &dense_tensors_,
+    framework::Variable *p_dense_contents,
+    framework::proto::VarType::Type type) {
+  switch (type) {
+    case framework::proto::VarType::FP32:
+      ConcatTensorsForAllReduce<platform::NPUDeviceContext, float>(
+          context, dense_tensors_, p_dense_contents);
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type (%s) is not supported when it concats tensors for "
+          "allreduce.",
+          framework::DataTypeToString(type)));
+  }
+}
+
+template <>
+void SplitTensorsWithType<platform::NPUDeviceContext>(
+    const platform::NPUDeviceContext &context,
+    framework::Variable *p_dense_contents,
+    std::vector<framework::Tensor> *p_dense_tensors,
+    framework::proto::VarType::Type type) {
+  switch (type) {
+    case framework::proto::VarType::FP32:
+      SplitTensorsForAllReduce<platform::NPUDeviceContext, float>(
+          context, p_dense_contents, p_dense_tensors);
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type (%s) is not supported when it splits tensors for "
+          "allreduce.",
+          framework::DataTypeToString(type)));
+  }
+}
+#endif
+
 void Group::ConcatTensors(const platform::DeviceContext &context) {
   auto place = context.GetPlace();
   if (platform::is_gpu_place(place)) {

@@ -831,7 +899,7 @@ void Reducer::MarkGroupReady(size_t group_index) {
     }
   });
 #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \
-    defined(PADDLE_WITH_GLOO)
+    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL)
   FusedAllReduceSchedule(run_order, group, next_group_);
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet(

@@ -1014,7 +1082,7 @@ void Reducer::FinalizeBackward() {
   if (find_unused_vars_each_step_) {
 // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_GLOO)
+    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL)
     ProcessUnusedDenseVars();
 #endif
     // Initialize local used vars

paddle/fluid/imperative/reducer.h

@@ -48,8 +48,9 @@ class VariableWrapper;
 namespace paddle {
 namespace imperative {
 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||     \
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
+    defined(PADDLE_WITH_ASCEND_CL)
 
 template <typename T>
 struct DivNRanksFunctor {

paddle/fluid/imperative/tests/CMakeLists.txt

@@ -3,6 +3,8 @@ if(WIN32)
 else()
   if(WITH_NCCL OR WITH_RCCL)
     cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
+    cc_test(heter_ccl_context_test SRCS heter_ccl_context_test.cc DEPS heter_ccl_context nccl_context imperative_gloo_context gloo_context gloo_wrapper gloo fs shell)
+    #set_tests_properties(heter_ccl_context_test PROPERTIES LABELS "RUN_TYPE=DIST")
   endif()
   if(WITH_XPU_BKCL)
     cc_test(bkcl_context_test SRCS bkcl_context_test.cc DEPS bkcl_context)

paddle/fluid/imperative/tests/heter_ccl_context_test.cc  (new file, mode 100644)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <chrono>
#include <thread>  // NOLINT

#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/imperative/heter_ccl_context.h"

#include "gtest/gtest.h"

namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;

imperative::ParallelStrategy GetStrategy(int local_rank) {
  std::vector<std::string> eps = {"127.0.0.1:37580", "127.0.0.1:37581"};
  imperative::ParallelStrategy strategy;
  strategy.trainer_endpoints_ = eps;
  strategy.current_endpoint_ = eps[local_rank];
  strategy.nranks_ = eps.size();
  strategy.local_rank_ = local_rank;
  return strategy;
}

#ifdef PADDLE_WITH_NCCL
void AllReduceByStream(int local_rank, int device_id) {
  int data_size = 32;
  const auto &place = platform::CUDAPlace(device_id);
  platform::CUDADeviceContext ctx(place);

  // heter_parallel_ctx
  imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id);

  // init
  hpc.Init();

  // input and output data
  framework::Variable *src_dev_var(new framework::Variable());
  auto *src_dev_tensor = src_dev_var->GetMutable<framework::LoDTensor>();
  src_dev_tensor->mutable_data<float>(framework::make_ddim({data_size}), place);

  std::vector<float> src_vec;
  for (int i = 0; i < data_size; i++) {
    src_vec.push_back(1.0 + local_rank);
  }
  framework::TensorFromVector(src_vec, ctx, src_dev_tensor);
  ctx.Wait();

  framework::Variable *dst_dev_var(new framework::Variable());
  auto *dst_dev_tensor = dst_dev_var->GetMutable<framework::LoDTensor>();
  dst_dev_tensor->mutable_data<float>(framework::make_ddim({data_size}), place);

  // call allreduce
  hpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false);
  std::this_thread::sleep_for(std::chrono::milliseconds(1000));

  // check result
  std::vector<float> dst_vec;
  framework::TensorToVector(*dst_dev_tensor, ctx, &dst_vec);
  ctx.Wait();

  EXPECT_EQ(dst_vec.size(), src_vec.size());
  for (int i = 0; i < data_size; i++) {
    EXPECT_EQ(dst_vec[i], 3.0);
  }
}

TEST(AllReduceByStream, Run) {
  if (platform::GetCUDADeviceCount() >= 2) {
    std::thread t0(AllReduceByStream, 0, 0);
    std::thread t1(AllReduceByStream, 1, 1);
    t0.join();
    t1.join();
  }
}
#endif

paddle/fluid/imperative/tests/nccl_context_test.cc

@@ -14,6 +14,8 @@
 #include <thread>  // NOLINT
 
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/nccl_context.h"
 #include "paddle/fluid/platform/gen_comm_id_helper.h"

@@ -21,6 +23,7 @@
 namespace imperative = paddle::imperative;
 namespace platform = paddle::platform;
+namespace framework = paddle::framework;
 
 int nrings = 2;
 imperative::ParallelStrategy GetStrategy(int local_rank) {

@@ -68,4 +71,51 @@ TEST(BcastNCCLId, Run) {
                      NCCL_UNIQUE_ID_BYTES));
   }
 }
+
+void Broadcast(int local_rank, int device_id) {
+  int data_size = 4;
+  float test_data = 7;
+  const auto &place = platform::CUDAPlace(device_id);
+  platform::CUDADeviceContext ctx(place);
+
+  imperative::NCCLParallelContext npc(GetStrategy(local_rank), place);
+
+  // init
+  npc.Init();
+
+  framework::Variable *src_dev_var(new framework::Variable());
+  auto *src_dev_tensor = src_dev_var->GetMutable<framework::LoDTensor>();
+  src_dev_tensor->mutable_data<float>(framework::make_ddim({data_size}), place);
+
+  // fill data for rank 0 only
+  std::vector<float> src_vec;
+  if (local_rank == 0) {
+    for (int i = 0; i < data_size; i++) {
+      src_vec.push_back(test_data);
+    }
+    framework::TensorFromVector(src_vec, ctx, src_dev_tensor);
+  }
+  ctx.Wait();
+
+  npc.Broadcast(src_dev_var, 0);
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+  // check result
+  std::vector<float> dst_vec;
+  framework::TensorToVector(*src_dev_tensor, ctx, &dst_vec);
+  ctx.Wait();
+
+  for (int i = 0; i < data_size; i++) {
+    EXPECT_EQ(dst_vec[i], test_data);
+  }
+}
+
+TEST(Broadcast, Run) {
+  if (platform::GetCUDADeviceCount() >= 2) {
+    std::thread t0(Broadcast, 0, 0);
+    std::thread t1(Broadcast, 1, 1);
+    t0.join();
+    t1.join();
+  }
+}
 #endif

paddle/fluid/pybind/CMakeLists.txt

@@ -25,6 +25,13 @@ endif()
 if(WITH_XPU_BKCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
   set(PYBIND_DEPS ${PYBIND_DEPS} bkcl_context)
+  set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context)
 endif()
 
+if(WITH_ASCEND_CL)
+  set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
+  set(PYBIND_DEPS ${PYBIND_DEPS} hccl_context)
+  set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context)
+endif()
+
 if(NOT WIN32)

@@ -32,9 +39,7 @@ if(NOT WIN32)
   set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
   if(WITH_NCCL OR WITH_RCCL)
     set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
+    set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context)
   endif()
-  if(WITH_ASCEND_CL)
-    set(PYBIND_DEPS ${PYBIND_DEPS} hccl_context)
-  endif()
 endif(NOT WIN32)

paddle/fluid/pybind/imperative.cc

@@ -37,6 +37,7 @@ limitations under the License. */
 #include "paddle/fluid/imperative/data_loader.h"
 #include "paddle/fluid/imperative/gloo_context.h"
 #include "paddle/fluid/imperative/hccl_context.h"
+#include "paddle/fluid/imperative/heter_ccl_context.h"
 #include "paddle/fluid/imperative/hooks.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/nccl_context.h"

@@ -2332,6 +2333,15 @@ void BindImperative(py::module *m_ptr) {
            py::arg("ring_id"));
 #endif
 
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
+  py::class_<imperative::HeterParallelContext, imperative::ParallelContext,
+             std::shared_ptr<imperative::HeterParallelContext>>(
+      m, "HeterParallelContext")
+      .def(py::init<const imperative::ParallelStrategy &, const int &>())
+      .def("init", [](imperative::HeterParallelContext &self) { self.Init(); });
+#endif
+
   m.def("pylayer_apply",
         [](const platform::CPUPlace &place, const py::object &cls,
           const py::args args, const py::kwargs kwargs) {

python/paddle/distributed/fleet/base/distributed_strategy.py

@@ -1758,6 +1758,37 @@ class DistributedStrategy(object):
         else:
             print("WARNING: auto-search should have value of bool type")
 
+    @property
+    def heter_ccl_mode(self):
+        """
+        Indicating whether we are using heter_ccl_mode for model training.
+        This feature is currently an experimental feature. Currently,
+        heter_ccl_mode can be used only for dataparallel with dygraph mode.
+        Default Value: False
+
+        Examples:
+
+          .. code-block:: python
+
+            import paddle
+            import paddle.distributed.fleet as fleet
+
+            strategy = fleet.DistributedStrategy()
+            strategy.heter_ccl_mode = True
+
+            # for initialize parallel env, only need to call
+            paddle.distributed.init_parallel_env()
+            # then the heterogenous context will be created.
+        """
+        return self.strategy.heter_ccl_mode
+
+    @heter_ccl_mode.setter
+    def heter_ccl_mode(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.heter_ccl_mode = flag
+        else:
+            print("WARNING: heter_ccl_mode should have value of bool type")
+
     @property
     def cudnn_exhaustive_search(self):
         """

python/paddle/distributed/fleet/base/fleet_base.py

@@ -33,7 +33,7 @@ from . import topology as tp
 from .topology import ParallelMode
 from ..meta_parallel import TensorParallel, model_parallel_random_seed
 from ..meta_parallel import PipelineParallel, ShardingParallel
-from ..meta_optimizers import HybridParallelOptimizer
+from ..meta_optimizers import HybridParallelOptimizer, HeterParallelOptimizer
 from paddle import _C_ops
 from paddle.fluid import core
 from paddle.fluid.dygraph import to_variable

@@ -277,13 +277,15 @@ class Fleet(object):
                     self._user_defined_strategy.nccl_comm_num)
                 paddle.distributed.init_parallel_env()
 
-                # init hybrid parallel environment in dygraph
-                if tp._HYBRID_PARALLEL_GROUP is None:
-                    self._init_hybrid_parallel_env()
-                else:
-                    warnings.warn(
-                        "The dygraph hybrid parallel environment has been initialized."
-                    )
+                # hybrid parallel not support for npu/xpu
+                if self._user_defined_strategy.heter_ccl_mode == False:
+                    # init hybrid parallel environment in dygraph
+                    if tp._HYBRID_PARALLEL_GROUP is None:
+                        self._init_hybrid_parallel_env()
+                    else:
+                        warnings.warn(
+                            "The dygraph hybrid parallel environment has been initialized."
+                        )
         elif self._is_collective:
             use_sharding = self._user_defined_strategy.sharding

@@ -872,8 +874,12 @@ class Fleet(object):
         if paddle.fluid.framework.in_dygraph_mode():
             if self.worker_num() > 1:
-                return HybridParallelOptimizer(optimizer, self._hcg,
-                                               self._user_defined_strategy)
+                if self._user_defined_strategy.heter_ccl_mode == False:
+                    return HybridParallelOptimizer(optimizer, self._hcg,
+                                                   self._user_defined_strategy)
+                else:
+                    return HeterParallelOptimizer(optimizer,
+                                                  self._user_defined_strategy)
             else:
                 return optimizer
         return self

@@ -938,6 +944,17 @@ class Fleet(object):
         if self.worker_num() <= 1:
             return model
 
+        if self._user_defined_strategy.heter_ccl_mode == True:
+            distributed_model = paddle.DataParallel(
+                model,
+                comm_buffer_size=self._user_defined_strategy.
+                fuse_grad_size_in_MB,
+                last_comm_buffer_size=self._user_defined_strategy.
+                last_comm_group_size_MB,
+                find_unused_parameters=self._user_defined_strategy.
+                find_unused_parameters)
+            return distributed_model
+
         if self._hcg.get_parallel_mode() == ParallelMode.SHARDING_PARALLEL:
             distributed_model = ShardingParallel(
                 model, self._hcg, strategy=self._user_defined_strategy)

@@ -1569,13 +1586,13 @@ class Fleet(object):
         ]
         param_grads_fp16 = [
             param._grad_ivar() for param in optimizer._parameter_list
             if (param._grad_ivar() is not None) and (param._grad_ivar(
             ).dtype == core.VarDesc.VarType.FP16)
         ]
         param_grads_fp32 = [
             param._grad_ivar() for param in optimizer._parameter_list
             if (param._grad_ivar() is not None) and (param._grad_ivar(
             ).dtype == core.VarDesc.VarType.FP32)
         ]
         temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool))
         temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool))

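Condensed, the new control flow in Fleet can be read as the small helper below; this is a simplified restatement of the hunks above, not code from the commit.

    def pick_optimizer(strategy, optimizer, hcg, worker_num):
        # mirrors Fleet.distributed_optimizer() in dygraph mode
        if worker_num <= 1:
            return optimizer
        if strategy.heter_ccl_mode:
            return HeterParallelOptimizer(optimizer, strategy)
        return HybridParallelOptimizer(optimizer, hcg, strategy)
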
python/paddle/distributed/fleet/base/meta_optimizer_factory.py

@@ -19,9 +19,10 @@ __all__ = []
 meta_optimizer_names = list(
     filter(lambda name: name.endswith("Optimizer"), dir()))
 
 # Because HybridParallelOptimizer is dygraph optimizer, it
 # should be removed
 meta_optimizer_names.remove("HybridParallelOptimizer")
+meta_optimizer_names.remove("HeterParallelOptimizer")
 
 
 class MetaOptimizerFactory(object):

python/paddle/distributed/fleet/launch.py

@@ -108,9 +108,9 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
     base_group.add_argument(
         "--backend",
         type=str,
-        default="auto",
-        help="Specifize the backend, can be gloo|nccl|bkcl|auto. Default value is auto which perfers nccl or bkcl."
-    )
+        default=os.environ.get('PADDLE_DISTRI_BACKEND', 'auto'),
+        help="Specifize the backend, can be gloo|nccl|bkcl|auto|hccl|heter. "
+        "Default value is auto which perfers nccl or bkcl.")
 
     base_group.add_argument(
         "--nproc_per_node",
         type=int,

@@ -146,6 +146,16 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
     )
     base_group.add_argument("--selected_xpus", dest="xpus")
 
+    if fluid.core.is_compiled_with_npu():
+        base_group.add_argument(
+            "--npus",
+            type=str,
+            default=None,
+            help="It's for xpu training. For example: "
+            "--npus=\"0,1,2,3\" will launch four training processes each bound to one npu."
+        )
+        base_group.add_argument("--selected_npus", dest="npus")
+
     base_group.add_argument(
         "training_script",
         type=str,

@@ -301,25 +311,23 @@ def get_cluster_info(args):
     # lazy launch for auto-parallel
     if args.enable_auto_mapping == True:
         cluster, pod = get_mapped_cluster_from_args(args, device_mode)
-    else:
-        # for ascend
-        if device_mode == DeviceMode.ASCEND_NPU:
-            cluster, pod = ascend_utils.get_cloud_cluster(
-                rank_table_file=os.getenv("RANK_TABLE_FILE", None),
-                device_mode=device_mode,
-                start_port=start_port)
-        elif cloud_utils.use_paddlecloud() and trainers_num != 1:
-            cluster, pod = cloud_utils.get_cloud_cluster(
-                args.ips, device_mode, devices_per_proc, start_port)
-            logger.debug("get cluster from cloud:{}".format(cluster))
-        else:
-            # trainers_num = 1 or not use paddlecloud ips="a,b"
-            cluster, pod = get_cluster_from_args(args, device_mode,
-                                                 devices_per_proc)
-            logger.debug("get cluster from args:{}".format(cluster))
+    elif cloud_utils.use_paddlecloud() and trainers_num != 1:
+        cluster, pod = cloud_utils.get_cloud_cluster(
+            args.ips, device_mode, devices_per_proc, start_port)
+        logger.debug("get cluster from cloud:{}".format(cluster))
+    elif device_mode == DeviceMode.ASCEND_NPU:
+        # for ascend
+        cluster, pod = ascend_utils.get_cloud_cluster(
+            rank_table_file=os.getenv("RANK_TABLE_FILE", None),
+            device_mode=device_mode,
+            start_port=start_port)
+    else:
+        # trainers_num = 1 or not use paddlecloud ips="a,b"
+        cluster, pod = get_cluster_from_args(args, device_mode,
+                                             devices_per_proc)
+        logger.debug("get cluster from args:{}".format(cluster))
     return cluster, pod
 
 
 def get_global_envs(args, tmp_dir):
     global_envs = copy.copy(os.environ.copy())
     # add gloo env

@@ -456,15 +464,15 @@ def which_distributed_mode(args):
             ) and not fluid.core.is_compiled_with_xpu():
         if args.servers:
             logger.warning(
-                "Not found distinct arguments and not compiled with cuda or xpu. \
-But found args.servers not empty, default use ps mode")
+                "Not found distinct arguments and not compiled with cuda or xpu or npu. "
+                "But found args.servers not empty, default use ps mode")
             return DistributeMode.PS
         else:
             return DistributeMode.COLLECTIVE
     else:
         logger.warning(
-            "Not found distinct arguments and compiled with cuda or xpu. Default use collective mode"
-        )
+            "Not found distinct arguments and compiled with cuda or xpu or npu. "
+            "Default use collective mode")
         return DistributeMode.COLLECTIVE

@@ -651,7 +659,7 @@ def launch():
         check_backend(args.backend)
         distribute_mode = DistributeMode.COLLECTIVE
 
-    assert args.backend in ['gloo', 'nccl', 'bkcl', 'unknown']
+    #assert args.backend in ['gloo', 'nccl', 'bkcl', 'heter', 'unknown']
 
     if args.backend == 'gloo':
         logger.warning("launch start with CPUONLY mode")

python/paddle/distributed/fleet/launch_utils.py

@@ -690,9 +690,51 @@ def get_xpus(xpus):
     return res_xpus
 
 
+def get_npus(npus):
+    if npus is None:
+        npus_num = fluid.core.get_npu_device_count()
+        res_npus = [str(x) for x in range(0, npus_num)]
+    else:
+        npu_visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
+        if npu_visible_devices is None or npu_visible_devices == "":
+            res_npus = [x.strip() for x in npus.split(',')]
+        else:
+            # change npus into relative values
+            # e.g. ASCEND_VISIBLE_DEVICES=4,5,6,7; args.npus=4,5,6,7;
+            # therefore npus=0,1,2,3
+            npu_visible_devices_list = npu_visible_devices.split(',')
+            for x in npus.split(','):
+                assert x in npu_visible_devices_list, "Can't find " \
+                    "your npus %s in ASCEND_VISIBLE_DEVICES[%s]." \
+                    % (x, npu_visible_devices)
+            res_npus = [
+                npu_visible_devices_list.index(x.strip())
+                for x in npus.split(',')
+            ]
+            logger.info("Change selected_npus into reletive values. --ips:{} "
+                        "will change into relative_ips:{} according to your "
+                        "ASCEND_VISIBLE_DEVICES:{}".format(
+                            npus, res_npus, npu_visible_devices_list))
+
+    return res_npus
+
+
 def get_device_mode(backend):
-    if fluid.core.is_compiled_with_npu() and \
-            fluid.core.get_npu_device_count() > 0:
+    if backend == 'heter':
+        if fluid.core.is_compiled_with_cuda() and \
+                fluid.core.get_cuda_device_count() > 0:
+            print("launch train in heter mode with GPU device.")
+            return DeviceMode.GPU
+
+        if fluid.core.is_compiled_with_xpu() and \
+                fluid.core.get_xpu_device_count() > 0:
+            print("launch train in heter mode with XPU device.")
+            return DeviceMode.XPU
+
+        if fluid.core.is_compiled_with_npu() and \
+                fluid.core.get_npu_device_count() > 0:
+            print("launch train in heter mode with NPU device.")
+            return DeviceMode.ASCEND_NPU
+
+    if backend == 'hccl' and fluid.core.get_npu_device_count() > 0:
         print("launch train in ascend npu mode!")
         return DeviceMode.ASCEND_NPU

@@ -731,7 +773,17 @@ def get_device_proc_info(args):
         else:
             devices_per_proc = gpus
     elif device_mode == DeviceMode.ASCEND_NPU:
-        devices_per_proc = None
+        npus = get_npus(args.npus)
+        if args.nproc_per_node is not None:
+            assert (len(npus) % int(args.nproc_per_node)) == 0, \
+                "npus' number:{} mod args.nproc_per_node:{} must == 0".format(len(npus), args.nproc_per_node)
+
+            n = int(len(npus) / int(args.nproc_per_node))
+            devices_per_proc = [
+                npus[i:i + n] for i in six.moves.range(0, len(npus), n)
+            ]
+        else:
+            devices_per_proc = npus
     elif device_mode == DeviceMode.XPU:
         xpus = get_xpus(args.xpus)
         if args.nproc_per_node is not None:

@@ -902,11 +954,8 @@ def get_mapped_cluster_from_args(args, device_mode):
         node_rank = node_ips.index(ip)
         if os.environ.get('FLAGS_START_PORT') is not None:
             start_port = int(os.environ.get('FLAGS_START_PORT'))
-            free_ports = [
-                x for x in range(start_port, start_port + len(
-                    node_ranks_mapping[node_rank]))
-            ]
+            end_port = start_port + len(node_ranks_mapping[node_rank])
+            free_ports = [x for x in range(start_port, end_port)]
         else:
             free_ports = find_free_ports(len(node_ranks_mapping[node_rank]))
         trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])

@@ -1527,11 +1576,11 @@ class ParameterServerLauncher(object):
 def check_backend(backend):
-    if backend not in ['nccl', 'gloo', 'bkcl', 'auto']:
-        raise ValueError(
-            "paddle.distributed initialize error, "
-            "backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', but got %s"
-            % backend)
+    if backend not in ['nccl', 'gloo', 'bkcl', 'auto', 'hccl', 'heter']:
+        raise ValueError("paddle.distributed initialize error, "
+                         "backend argument can only be one of "
+                         "'nccl', 'gloo', 'bkcl', 'auto', 'hccl', 'heter' "
+                         "but got %s" % backend)
 
     if backend == 'nccl' and not fluid.core.is_compiled_with_cuda():
         raise ValueError(

@@ -1545,6 +1594,12 @@ def check_backend(backend):
             "your paddle is not compiled with xpu but you assign 'bkcl' as backend."
         )
 
+    if backend == 'hccl' and not fluid.core.is_compiled_with_npu():
+        raise ValueError(
+            "paddle.distributed initialize error, "
+            "your paddle is not compiled with npu but you assign 'hccl' as backend."
+        )
+
 
 def block_windows_and_macos(backend):
     if backend != 'gloo': return

@@ -1565,4 +1620,7 @@ def get_backend_by_compile_flag():
     if fluid.core.is_compiled_with_xpu():
         return 'bkcl'
 
+    if fluid.core.is_compiled_with_npu():
+        return 'hccl'
+
     return 'gloo'

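The relative-id remapping that get_npus() performs when ASCEND_VISIBLE_DEVICES is set can be reproduced in isolation as in the snippet below; the device lists are made-up values for illustration only.

    visible = "4,5,6,7".split(',')                   # ASCEND_VISIBLE_DEVICES
    selected = "6,7".split(',')                      # --npus
    relative = [visible.index(x) for x in selected]
    print(relative)                                  # -> [2, 3]
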
python/paddle/distributed/fleet/meta_optimizers/__init__.py

@@ -28,6 +28,7 @@ from .lamb_optimizer import LambOptimizer
 from .fp16_allreduce_optimizer import FP16AllReduceOptimizer
 from .sharding_optimizer import ShardingOptimizer
 from .dygraph_optimizer import HybridParallelOptimizer
+from .dygraph_optimizer import HeterParallelOptimizer
 from .dygraph_optimizer import HybridParallelGradScaler
 from .tensor_parallel_optimizer import TensorParallelOptimizer
 from .raw_program_optimizer import RawProgramOptimizer

python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py

@@ -13,5 +13,6 @@
 from .hybrid_parallel_optimizer import HybridParallelOptimizer
 from .hybrid_parallel_gradscaler import HybridParallelGradScaler
 from .dygraph_sharding_optimizer import DygraphShardingOptimizer
+from .heter_parallel_optimizer import HeterParallelOptimizer
 
 __all__ = []

python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/heter_parallel_optimizer.py  (new file, mode 100755)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid import framework

__all__ = []


def _obtain_optimizer_parameters_list(optimizer):
    if getattr(optimizer, '_param_groups', None) and isinstance(
            optimizer._param_groups[0], dict):
        parameters_list = []
        for group in optimizer._param_groups:
            for param in group['params']:
                parameters_list.append(param)
    else:
        parameters_list = [param for param in optimizer._parameter_list]

    return parameters_list


class HeterParallelOptimizer:
    # adapter wrapper for optimizer
    def __init__(self, optimizer, strategy):
        self._inner_opt = optimizer
        self._strategy = strategy

        # NOTE(liubo48): In pure DataParallel mode,
        # the gradient synchronization is achieved through reducer.

    @imperative_base.no_grad
    @framework.dygraph_only
    def step(self):
        parameters_list = _obtain_optimizer_parameters_list(self._inner_opt)
        self._inner_opt.step()

    @imperative_base.no_grad
    def minimize(self,
                 loss,
                 startup_program=None,
                 parameters=None,
                 no_grad_set=None):
        # minimize does not support parameters in the form of param_group,
        # so no need use _obtain_optimizer_parameters_list
        parameter_list = parameters if parameters \
            else self._inner_opt._parameter_list

        return self._inner_opt.minimize(loss, startup_program, parameter_list,
                                        no_grad_set)

    def __getattr__(self, item):
        return getattr(self._inner_opt, item)

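HeterParallelOptimizer is a thin delegating wrapper, so it can be exercised directly as sketched below; in practice fleet.distributed_optimizer() constructs it when heter_ccl_mode is on. This is an assumed usage example, not taken from the commit, and it requires a dygraph-mode paddle build that includes this file.

    import paddle
    from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer import (
        HeterParallelOptimizer)

    linear = paddle.nn.Linear(4, 4)
    sgd = paddle.optimizer.SGD(learning_rate=0.1,
                               parameters=linear.parameters())
    heter_opt = HeterParallelOptimizer(sgd, strategy=None)  # strategy is only stored

    loss = paddle.mean(linear(paddle.rand([2, 4])))
    loss.backward()
    heter_opt.step()        # delegates to sgd.step()
    heter_opt.clear_grad()  # any other attribute falls through via __getattr__
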
python/paddle/distributed/parallel.py

@@ -58,7 +58,7 @@ def _start_kv_server(port, http_server_d, size):
 def _is_cpuonly(backend):
     check_backend(backend)
-    if backend in ['auto', 'nccl', 'bkcl', 'hccl'] and (
+    if backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter'] and (
             core.is_compiled_with_cuda() or core.is_compiled_with_xpu() or
             core.is_compiled_with_npu()):

@@ -68,6 +68,14 @@ def _is_cpuonly(backend):
         return True
 
 
+def _check_var_exists(var_name):
+    var = os.environ.get(var_name, None)
+    if var is None:
+        raise ValueError("paddle.distributed initialize error, "
+                         "environment variable %s is needed, but not set." %
+                         var_name)
+
+
 def init_parallel_env():
     """
     Initialize parallel training environment in dynamic graph mode.

@@ -148,27 +156,22 @@ def init_parallel_env():
         raise NotImplementedError(
             "If you want to use CPU-only version, please use 'gloo' as backend")
 
     # 2. check env
-    def _check_var_exists(var_name):
-        var = os.environ.get(var_name, None)
-        if var is None:
-            raise ValueError("paddle.distributed initialize error, "
-                             "environment variable %s is needed, but not set." %
-                             var_name)
-
     if not is_cpu_only and core.is_compiled_with_cuda():
         _check_var_exists("FLAGS_selected_gpus")
     elif not is_cpu_only and core.is_compiled_with_xpu():
         _check_var_exists('FLAGS_selected_xpus')
     elif not is_cpu_only and core.is_compiled_with_npu():
         _check_var_exists('FLAGS_selected_npus')
 
     _check_var_exists("PADDLE_TRAINER_ID")
     _check_var_exists("PADDLE_CURRENT_ENDPOINT")
     _check_var_exists("PADDLE_TRAINERS_NUM")
     _check_var_exists("PADDLE_TRAINER_ENDPOINTS")
 
+    node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints])
     # 3: init gloo context (step 1: httpsever start)
     init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
-    if is_cpu_only or init_gloo:
+    if is_cpu_only or init_gloo or backend == "heter":
         ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
         manager = Manager()
         # glboal dict to store status

@@ -177,6 +180,8 @@ def init_parallel_env():
         if parallel_env.rank == 0:
             # The scope for worker used by http server is '_worker'
             size = {'_worker': parallel_env.world_size}
+            if backend == "heter":
+                size = {'_worker': len(node_num)}
             http_server = Process(
                 target=_start_kv_server,
                 args=(int(ep_rank_0[1]), http_server_d, size))

@@ -210,10 +215,13 @@ def init_parallel_env():
         place = core.NPUPlace(parallel_env.device_id)
     _set_expected_place(place)
 
-    # init nccl or bkcl context
+    # init nccl or hccl or bkcl or heter context
     if is_cpu_only:
         parallel_helper._set_parallel_ctx(
             core.GLOOParallelContext(strategy, place))
+    elif (backend == "heter"):
+        parallel_helper._set_parallel_ctx(
+            core.HeterParallelContext(strategy, parallel_env.device_id))
     elif core.is_compiled_with_cuda():
         parallel_helper._set_parallel_ctx(
             core.NCCLParallelContext(strategy, place))

@@ -224,17 +232,19 @@ def init_parallel_env():
         parallel_helper._set_parallel_ctx(
             core.HCCLParallelContext(strategy, place))
 
-    other_endpoints = strategy.trainer_endpoints[:]
-    other_endpoints.remove(strategy.current_endpoint)
-    if not is_cpu_only and strategy.local_rank == 0:
-        wait_server_ready(other_endpoints)
+    if backend != "heter":
+        other_endpoints = strategy.trainer_endpoints[:]
+        other_endpoints.remove(strategy.current_endpoint)
+        if not is_cpu_only and strategy.local_rank == 0:
+            wait_server_ready(other_endpoints)
 
     parallel_helper._init_parallel_ctx()
+
     # 5: init gloo context (step 2: gloo init)
     # dividing init_gloo into two part beacause nccl and gloo
     # are separately looking for free ports which sometimes
     # leads to port-conflict.
-    if is_cpu_only and parallel_env.rank == 0:
+    if (is_cpu_only or backend == "heter") and parallel_env.rank == 0:
         # compare to init_gloo, we don't need to
         # init gloo, because we do this in _init_parallel_ctx;
         http_server_d["running"] = False

python/paddle/fluid/dygraph/parallel_helper.py

@@ -28,11 +28,11 @@ def _is_parallel_ctx_initialized():
     return __parallel_ctx__clz__ is not None
 
 
-def _set_parallel_ctx(nccl_parallel_context):
+def _set_parallel_ctx(ccl_parallel_context):
     global __parallel_ctx__clz__
     assert __parallel_ctx__clz__ is None, \
         "ParallelContext can only be initialized once."
-    __parallel_ctx__clz__ = nccl_parallel_context
+    __parallel_ctx__clz__ = ccl_parallel_context
 
 
 def _init_parallel_ctx():