Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
25bd5ed8
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
25bd5ed8
编写于
4月 14, 2023
作者:
K
Kim Yann
提交者:
GitHub
4月 14, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
rem cncl (#52434)
上级
dfcba7f4
变更
21
隐藏空白更改
内联
并排
Showing
21 changed file
with
14 addition
and
887 deletion
+14
-887
CMakeLists.txt
CMakeLists.txt
+0
-1
paddle/fluid/framework/var_type_traits.h
paddle/fluid/framework/var_type_traits.h
+0
-7
paddle/fluid/imperative/CMakeLists.txt
paddle/fluid/imperative/CMakeLists.txt
+2
-12
paddle/fluid/imperative/cncl_context.cc
paddle/fluid/imperative/cncl_context.cc
+0
-257
paddle/fluid/imperative/cncl_context.h
paddle/fluid/imperative/cncl_context.h
+0
-77
paddle/fluid/imperative/reducer.cc
paddle/fluid/imperative/reducer.cc
+3
-85
paddle/fluid/imperative/reducer.h
paddle/fluid/imperative/reducer.h
+2
-3
paddle/fluid/operators/collective/CMakeLists.txt
paddle/fluid/operators/collective/CMakeLists.txt
+0
-6
paddle/fluid/operators/collective/c_allreduce_op.h
paddle/fluid/operators/collective/c_allreduce_op.h
+1
-82
paddle/fluid/operators/collective/c_comm_init_op.cc
paddle/fluid/operators/collective/c_comm_init_op.cc
+2
-8
paddle/fluid/operators/collective/c_gen_cncl_id_op.cc
paddle/fluid/operators/collective/c_gen_cncl_id_op.cc
+0
-120
paddle/fluid/operators/collective/c_reduce_op.h
paddle/fluid/operators/collective/c_reduce_op.h
+0
-71
paddle/fluid/operators/collective/c_sync_calc_stream_op.h
paddle/fluid/operators/collective/c_sync_calc_stream_op.h
+0
-10
paddle/fluid/operators/collective/c_sync_comm_stream_op.h
paddle/fluid/operators/collective/c_sync_comm_stream_op.h
+0
-14
paddle/fluid/platform/CMakeLists.txt
paddle/fluid/platform/CMakeLists.txt
+0
-4
paddle/fluid/platform/collective_helper.h
paddle/fluid/platform/collective_helper.h
+0
-106
paddle/fluid/platform/gen_comm_id_helper.cc
paddle/fluid/platform/gen_comm_id_helper.cc
+1
-9
paddle/fluid/platform/gen_comm_id_helper.h
paddle/fluid/platform/gen_comm_id_helper.h
+1
-1
paddle/fluid/pybind/CMakeLists.txt
paddle/fluid/pybind/CMakeLists.txt
+0
-9
paddle/fluid/pybind/imperative.cc
paddle/fluid/pybind/imperative.cc
+2
-4
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+0
-1
未找到文件。
CMakeLists.txt
浏览文件 @
25bd5ed8
...
...
@@ -272,7 +272,6 @@ option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
option
(
WITH_NCCL
"Compile PaddlePaddle with NCCL support"
ON
)
option
(
WITH_RCCL
"Compile PaddlePaddle with RCCL support"
ON
)
option
(
WITH_XPU_BKCL
"Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL"
OFF
)
option
(
WITH_CNCL
"Compile PaddlePaddle with CNCL support"
OFF
)
option
(
WITH_CRYPTO
"Compile PaddlePaddle with crypto support"
ON
)
option
(
WITH_ARM
"Compile PaddlePaddle with arm support"
OFF
)
option
(
WITH_SW
"Compile PaddlePaddle with sw support"
OFF
)
...
...
paddle/fluid/framework/var_type_traits.h
浏览文件 @
25bd5ed8
...
...
@@ -43,10 +43,6 @@
#include "xpu/bkcl.h"
#endif
#if defined(PADDLE_WITH_CNCL)
#include <cncl.h>
#endif
namespace
phi
{
class
DenseTensor
;
class
SelectedRows
;
...
...
@@ -199,9 +195,6 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#if defined(PADDLE_WITH_XPU_BKCL)
BKCLUniqueId
,
platform
::
BKCLCommunicator
,
#endif
#if defined(PADDLE_WITH_CNCL)
cnclCliqueId
,
#endif
std
::
vector
<
std
::
unique_ptr
<
operators
::
CUDAGraphWithInOuts
>>
,
int
,
...
...
paddle/fluid/imperative/CMakeLists.txt
浏览文件 @
25bd5ed8
...
...
@@ -123,16 +123,7 @@ if(NOT WIN32)
SRCS reducer.cc
DEPS layer
)
endif
()
if
(
WITH_CNCL
)
cc_library
(
cncl_context
SRCS cncl_context.cc
DEPS collective_helper device_context tensor var_type_traits
)
cc_library
(
reducer
SRCS reducer.cc
DEPS layer
)
endif
()
if
(
WITH_NCCL
OR WITH_RCCL
OR WITH_XPU_BKCL
)
...
...
@@ -155,8 +146,7 @@ if(WITH_GLOO)
OR
(
NOT
(
WITH_NCCL
OR WITH_RCCL
OR WITH_XPU_BKCL
OR WITH_CNCL
)
OR WITH_XPU_BKCL
)
))
cc_library
(
reducer
...
...
paddle/fluid/imperative/cncl_context.cc
已删除
100644 → 0
浏览文件 @
dfcba7f4
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/imperative/cncl_context.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
framework
{
class
Variable
;
}
// namespace framework
}
// namespace paddle
namespace
paddle
{
namespace
imperative
{
static
void
AllReduce
(
const
phi
::
DenseTensor
&
src
,
phi
::
DenseTensor
*
dst
,
const
mluStream
stream
,
const
platform
::
CNCLComm
*
comm
)
{
const
auto
&
place
=
src
.
place
();
PADDLE_ENFORCE_EQ
(
platform
::
is_mlu_place
(
place
),
true
,
platform
::
errors
::
Unimplemented
(
"Imperative mode does not support multi-CPU training yet."
));
const
void
*
src_ptr
=
src
.
data
();
dst
->
Resize
(
src
.
dims
());
auto
*
dst_ptr
=
dst
->
mutable_data
(
src
.
place
(),
src
.
dtype
());
auto
cncl_dtype
=
platform
::
ToCNCLDataType
(
framework
::
TransToProtoVarType
(
src
.
dtype
()));
PADDLE_ENFORCE_MLU_SUCCESS
(
cnclAllReduce
(
src_ptr
,
dst_ptr
,
src
.
numel
(),
cncl_dtype
,
cnclSum
,
comm
->
comm
(),
stream
));
}
void
CNCLParallelContext
::
BcastCNCLId
(
std
::
vector
<
cnclCliqueId
>
&
cncl_ids
,
// NOLINT
int
root
,
int
server_fd
)
{
if
(
strategy_
.
local_rank_
==
root
)
{
std
::
vector
<
std
::
string
>
other_trainers
;
for
(
auto
&
ep
:
strategy_
.
trainer_endpoints_
)
{
if
(
ep
!=
strategy_
.
current_endpoint_
)
{
other_trainers
.
push_back
(
ep
);
}
}
platform
::
SendBroadCastCommID
(
other_trainers
,
&
cncl_ids
);
}
else
{
platform
::
RecvBroadCastCommID
(
server_fd
,
strategy_
.
current_endpoint_
,
&
cncl_ids
);
}
}
void
CNCLParallelContext
::
Init
()
{
int
server_fd
=
-
1
;
std
::
vector
<
cnclCliqueId
>
cncl_ids
;
cncl_ids
.
resize
(
strategy_
.
nrings_
);
if
(
strategy_
.
local_rank_
==
0
)
{
// generate the unique cnclid on the root worker
for
(
size_t
i
=
0
;
i
<
cncl_ids
.
size
();
++
i
)
{
PADDLE_ENFORCE_MLU_SUCCESS
(
cnclGetCliqueId
(
&
cncl_ids
[
i
]));
}
}
else
{
server_fd
=
platform
::
SocketServer
::
GetInstance
(
strategy_
.
current_endpoint_
)
.
socket
();
}
BcastCNCLId
(
cncl_ids
,
0
,
server_fd
);
int
mlu_id
=
place_
.
device
;
for
(
int
ring_id
=
0
;
ring_id
<
strategy_
.
nrings_
;
++
ring_id
)
{
VLOG
(
0
)
<<
"init cncl context nranks: "
<<
strategy_
.
nranks_
<<
" local rank: "
<<
strategy_
.
local_rank_
<<
" mlu id: "
<<
mlu_id
<<
" ring id: "
<<
ring_id
;
// it will assign cncl_comm in MLUDeviceContext within ring_id
platform
::
CNCLCommContext
::
Instance
().
CreateComm
(
&
cncl_ids
[
ring_id
],
strategy_
.
nranks_
,
strategy_
.
local_rank_
,
mlu_id
,
ring_id
);
compute_events_
.
emplace_back
(
platform
::
MluEventResourcePool
::
Instance
().
New
(
place_
.
device
));
comm_events_
.
emplace_back
(
platform
::
MluEventResourcePool
::
Instance
().
New
(
place_
.
device
));
}
}
void
CNCLParallelContext
::
InitWithRingID
(
int
ring_id
)
{
int
server_fd
=
-
1
;
std
::
vector
<
cnclCliqueId
>
cncl_ids
;
cncl_ids
.
resize
(
1
);
if
(
strategy_
.
local_rank_
==
0
)
{
// generate the unique cnclid on the root worker
PADDLE_ENFORCE_MLU_SUCCESS
(
cnclGetCliqueId
(
&
cncl_ids
[
0
]));
}
else
{
server_fd
=
platform
::
SocketServer
::
GetInstance
(
strategy_
.
current_endpoint_
)
.
socket
();
}
BcastCNCLId
(
cncl_ids
,
0
,
server_fd
);
int
mlu_id
=
place_
.
device
;
VLOG
(
0
)
<<
"init cncl context nranks: "
<<
strategy_
.
nranks_
<<
" local rank: "
<<
strategy_
.
local_rank_
<<
" mlu id: "
<<
mlu_id
<<
" ring id: "
<<
ring_id
;
// it will assign cncl_comm in MLUDeviceContext within ring_id
platform
::
CNCLCommContext
::
Instance
().
CreateComm
(
&
cncl_ids
[
0
],
strategy_
.
nranks_
,
strategy_
.
local_rank_
,
mlu_id
,
ring_id
);
compute_events_
.
emplace_back
(
platform
::
MluEventResourcePool
::
Instance
().
New
(
place_
.
device
));
comm_events_
.
emplace_back
(
platform
::
MluEventResourcePool
::
Instance
().
New
(
place_
.
device
));
}
void
CNCLParallelContext
::
AllReduceByStream
(
const
framework
::
Variable
&
src
,
framework
::
Variable
*
dst
,
int
ring_id
,
bool
use_calc_stream
)
{
PADDLE_ENFORCE_EQ
(
platform
::
is_mlu_place
(
place_
),
true
,
platform
::
errors
::
Unimplemented
(
"Dynamic graph mode does not support multi-CPU training yet."
));
auto
*
dev_ctx
=
static_cast
<
platform
::
MLUDeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
));
platform
::
CNCLComm
*
comm
=
platform
::
CNCLCommContext
::
Instance
().
Get
(
ring_id
,
place_
);
mluStream
stream
=
(
use_calc_stream
?
dev_ctx
->
stream
()
:
comm
->
stream
());
if
(
src
.
IsType
<
phi
::
DenseTensor
>
())
{
if
(
!
dst
->
IsType
<
phi
::
DenseTensor
>
())
{
dst
->
Clear
();
}
AllReduce
(
src
.
Get
<
phi
::
DenseTensor
>
(),
dst
->
GetMutable
<
phi
::
DenseTensor
>
(),
stream
,
comm
);
}
else
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Unsupported variable type %s for imperative allreduce, only "
"LoDTensor is supported."
,
platform
::
demangle
(
framework
::
ToTypeName
(
src
.
Type
()))));
}
}
void
CNCLParallelContext
::
Broadcast
(
framework
::
Variable
*
src
,
int
ring_id
)
{
VLOG
(
3
)
<<
"/// DEBUG /// start inter broadcast with ring_id: "
<<
ring_id
;
phi
::
DenseTensor
*
src_tensor
=
src
->
GetMutable
<
phi
::
DenseTensor
>
();
const
auto
&
place
=
src_tensor
->
place
();
platform
::
CNCLComm
*
comm
=
platform
::
CNCLCommContext
::
Instance
().
Get
(
ring_id
,
place
);
mluStream
stream
=
comm
->
stream
();
void
*
src_ptr
=
src_tensor
->
data
();
auto
cncl_dtype
=
platform
::
ToCNCLDataType
(
framework
::
TransToProtoVarType
(
src_tensor
->
dtype
()));
PADDLE_ENFORCE_MLU_SUCCESS
(
cnclBcast
(
src_ptr
,
src_tensor
->
numel
(),
cncl_dtype
,
0
,
comm
->
comm
(),
stream
));
}
paddle
::
platform
::
DeviceContext
*
CNCLParallelContext
::
GetDeviceContext
(
int
ring_id
)
{
return
static_cast
<
platform
::
DeviceContext
*>
(
platform
::
CNCLCommContext
::
Instance
()
.
Get
(
ring_id
,
place_
)
->
dev_context
());
}
void
CNCLParallelContext
::
WaitCompute
(
int
ring_id
)
{
PADDLE_ENFORCE_GE
(
ring_id
,
0
,
platform
::
errors
::
OutOfRange
(
"ring id must >= 0, but got %d"
,
ring_id
));
PADDLE_ENFORCE_LT
(
ring_id
,
compute_events_
.
size
(),
platform
::
errors
::
OutOfRange
(
"ring id must < compute events size,"
"but got ring id = %d, compute events size = %d"
,
ring_id
,
compute_events_
.
size
()));
auto
compute_stream
=
static_cast
<
platform
::
MLUDeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
))
->
stream
();
auto
comm_stream
=
platform
::
CNCLCommContext
::
Instance
().
Get
(
ring_id
,
place_
)
->
stream
();
auto
event
=
compute_events_
[
ring_id
].
get
();
// compute_stream-->event-->comm_stream
PADDLE_ENFORCE_MLU_SUCCESS
(
cnrtPlaceNotifier
(
event
,
compute_stream
));
PADDLE_ENFORCE_MLU_SUCCESS
(
cnrtQueueWaitNotifier
(
event
,
comm_stream
,
0
));
}
void
CNCLParallelContext
::
WaitComm
(
int
ring_id
)
{
PADDLE_ENFORCE_GE
(
ring_id
,
0
,
platform
::
errors
::
OutOfRange
(
"ring id must >= 0, but got %d"
,
ring_id
));
PADDLE_ENFORCE_LT
(
ring_id
,
comm_events_
.
size
(),
platform
::
errors
::
OutOfRange
(
"ring id must < comm events size,"
"but got ring id = %d, comm events size = %d"
,
ring_id
,
comm_events_
.
size
()));
auto
compute_stream
=
static_cast
<
platform
::
MLUDeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
))
->
stream
();
auto
comm_stream
=
platform
::
CNCLCommContext
::
Instance
().
Get
(
ring_id
,
place_
)
->
stream
();
auto
event
=
comm_events_
[
ring_id
].
get
();
// comm_stream-->event-->compute_stream
PADDLE_ENFORCE_MLU_SUCCESS
(
cnrtPlaceNotifier
(
event
,
comm_stream
));
PADDLE_ENFORCE_MLU_SUCCESS
(
cnrtQueueWaitNotifier
(
event
,
compute_stream
,
0
));
}
void
CNCLParallelContext
::
SynchronizeCompute
()
{
auto
*
compute_dev_ctx
=
static_cast
<
platform
::
MLUDeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
));
compute_dev_ctx
->
Wait
();
}
}
// namespace imperative
}
// namespace paddle
#endif
paddle/fluid/imperative/cncl_context.h
已删除
100644 → 0
浏览文件 @
dfcba7f4
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CNCL)
#include <cncl.h>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/imperative/parallel_context.h"
#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"
namespace
paddle
{
namespace
framework
{
class
Variable
;
}
// namespace framework
}
// namespace paddle
namespace
paddle
{
namespace
imperative
{
class
CNCLParallelContext
:
public
ParallelContext
{
public:
explicit
CNCLParallelContext
(
const
ParallelStrategy
&
strategy
,
const
platform
::
Place
&
place
)
:
ParallelContext
(
strategy
,
place
)
{}
~
CNCLParallelContext
()
override
=
default
;
void
BcastCNCLId
(
std
::
vector
<
cnclCliqueId
>&
cncl_ids
,
int
root
,
// NOLINT
int
server_fd
);
void
Init
()
override
;
void
InitWithRingID
(
int
ring_id
)
override
;
void
AllReduceByStream
(
const
framework
::
Variable
&
src
,
framework
::
Variable
*
dst
,
int
ring_id
,
bool
use_calc_stream
)
override
;
void
Broadcast
(
framework
::
Variable
*
src
,
int
ring_id
)
override
;
paddle
::
platform
::
DeviceContext
*
GetDeviceContext
(
int
ring_id
)
override
;
void
WaitCompute
(
int
ring_id
)
override
;
void
WaitComm
(
int
ring_id
)
override
;
void
SynchronizeCompute
()
override
;
private:
// used for comm wait compute, compute_stream-->event-->comm_stream[ring_id]
std
::
vector
<
std
::
shared_ptr
<
platform
::
MluEventObject
>>
compute_events_
;
// used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream
std
::
vector
<
std
::
shared_ptr
<
platform
::
MluEventObject
>>
comm_events_
;
};
}
// namespace imperative
}
// namespace paddle
#endif
paddle/fluid/imperative/reducer.cc
浏览文件 @
25bd5ed8
...
...
@@ -29,9 +29,8 @@
namespace
paddle
{
namespace
imperative
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_CNCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO)
// div the nranks
void
Group
::
DivNRanks
(
const
platform
::
DeviceContext
&
context
,
int64_t
nranks
)
{
phi
::
DenseTensor
*
tensor
=
...
...
@@ -229,56 +228,6 @@ void SplitTensorsWithType<platform::XPUDeviceContext>(
}
#endif
#ifdef PADDLE_WITH_CNCL
// context is used to select the stream for concat
template
<
>
void
ConcatTensorsWithType
<
platform
::
MLUDeviceContext
>
(
const
platform
::
MLUDeviceContext
&
context
,
const
std
::
vector
<
phi
::
DenseTensor
>
&
dense_tensors_
,
framework
::
Variable
*
p_dense_contents
,
framework
::
proto
::
VarType
::
Type
type
)
{
switch
(
type
)
{
case
framework
::
proto
::
VarType
::
FP16
:
ConcatTensorsForAllReduce
<
platform
::
MLUDeviceContext
,
platform
::
float16
>
(
context
,
dense_tensors_
,
p_dense_contents
);
break
;
case
framework
::
proto
::
VarType
::
FP32
:
ConcatTensorsForAllReduce
<
platform
::
MLUDeviceContext
,
float
>
(
context
,
dense_tensors_
,
p_dense_contents
);
break
;
default:
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Data type (%s) is not supported when it concats tensors for "
"allreduce."
,
framework
::
DataTypeToString
(
type
)));
}
}
// context is used to select the stream for split
template
<
>
void
SplitTensorsWithType
<
platform
::
MLUDeviceContext
>
(
const
platform
::
MLUDeviceContext
&
context
,
framework
::
Variable
*
p_dense_contents
,
std
::
vector
<
phi
::
DenseTensor
>
*
p_dense_tensors
,
framework
::
proto
::
VarType
::
Type
type
)
{
switch
(
type
)
{
case
framework
::
proto
::
VarType
::
FP16
:
SplitTensorsForAllReduce
<
platform
::
MLUDeviceContext
,
platform
::
float16
>
(
context
,
p_dense_contents
,
p_dense_tensors
);
break
;
case
framework
::
proto
::
VarType
::
FP32
:
SplitTensorsForAllReduce
<
platform
::
MLUDeviceContext
,
float
>
(
context
,
p_dense_contents
,
p_dense_tensors
);
break
;
default:
PADDLE_THROW
(
platform
::
errors
::
Unimplemented
(
"Data type (%s) is not supported when it splits tensors for "
"allreduce."
,
framework
::
DataTypeToString
(
type
)));
}
}
#endif
void
Group
::
ConcatTensors
(
const
platform
::
DeviceContext
&
context
)
{
auto
place
=
context
.
GetPlace
();
if
(
platform
::
is_gpu_place
(
place
))
{
...
...
@@ -308,19 +257,6 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't concat npu grads since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."
));
}
else
if
(
platform
::
is_mlu_place
(
place
))
{
#ifdef PADDLE_WITH_CNCL
ConcatTensorsWithType
(
static_cast
<
const
platform
::
MLUDeviceContext
&>
(
context
),
dense_tensors_
,
&
dense_contents_
,
dtype_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't concat mlu grads since it's not compiled with CNCL,"
"Please recompile or reinstall Paddle with CNCL support."
));
#endif
}
else
if
(
platform
::
is_cpu_place
(
place
))
{
ConcatTensorsWithType
(
static_cast
<
const
phi
::
CPUContext
&>
(
context
),
dense_tensors_
,
...
...
@@ -361,19 +297,6 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't split npu grad since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."
));
}
else
if
(
platform
::
is_mlu_place
(
place
))
{
#ifdef PADDLE_WITH_CNCL
SplitTensorsWithType
(
static_cast
<
const
platform
::
MLUDeviceContext
&>
(
context
),
&
dense_contents_
,
&
dense_tensors_
,
dtype_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't split mlu grad since it's not compiled with CNCL,"
"Please recompile or reinstall Paddle with CNCL support."
));
#endif
}
else
if
(
platform
::
is_cpu_place
(
place
))
{
SplitTensorsWithType
(
static_cast
<
const
phi
::
CPUContext
&>
(
context
),
&
dense_contents_
,
...
...
@@ -850,11 +773,6 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
PADDLE_ENFORCE_XPU_SUCCESS
(
xpu_wait
(
dev_ctx
->
stream
()));
}
}
#elif defined(PADDLE_WITH_CNCL)
if
(
platform
::
is_mlu_place
(
group_tensor
.
place
()))
{
// TODO(liuyuhui) support MLU set constant
VLOG
(
3
)
<<
"MLU doesn't support set_constant"
;
}
#else
auto
*
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
);
if
(
HasGrad
(
var_index
))
{
...
...
@@ -1116,7 +1034,7 @@ void Reducer::FinalizeBackward() {
if
(
find_unused_vars_each_step_
)
{
// TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_GLOO)
|| defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_GLOO)
ProcessUnusedDenseVars
();
#endif
// Initialize local used vars
...
...
paddle/fluid/imperative/reducer.h
浏览文件 @
25bd5ed8
...
...
@@ -44,9 +44,8 @@ class VariableWrapper;
namespace
paddle
{
namespace
imperative
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_CNCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO)
template
<
typename
T
>
struct
DivNRanksFunctor
{
...
...
paddle/fluid/operators/collective/CMakeLists.txt
浏览文件 @
25bd5ed8
...
...
@@ -27,7 +27,6 @@ register_operators(
gen_bkcl_id_op
c_gen_nccl_id_op
gen_nccl_id_op
c_gen_cncl_id_op
DEPS
${
COLLECTIVE_DEPS
}
)
...
...
@@ -44,11 +43,6 @@ if(WITH_XPU_BKCL)
op_library
(
gen_bkcl_id_op DEPS
${
COLLECTIVE_DEPS
}
)
endif
()
if
(
WITH_CNCL
)
set
(
COLLECTIVE_DEPS
${
COLLECTIVE_DEPS
}
collective_helper
)
op_library
(
c_gen_cncl_id_op DEPS
${
COLLECTIVE_DEPS
}
)
endif
()
set
(
OPERATOR_DEPS
${
OPERATOR_DEPS
}
${
COLLECTIVE_DEPS
}
PARENT_SCOPE
)
...
...
paddle/fluid/operators/collective/c_allreduce_op.h
浏览文件 @
25bd5ed8
...
...
@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/phi/api/include/tensor.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
|| defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
...
...
@@ -39,14 +39,9 @@ limitations under the License. */
#if defined(PADDLE_WITH_GLOO)
#include <gloo/allreduce.h>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
...
...
@@ -360,82 +355,6 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
template <typename T, typename DeviceContext> \
class op_name##CUDAKernel : public CAllReduceOpCUDAKernel<red_type, T> {};
template
<
ReduceType
red_type
,
typename
T
>
class
CAllReduceOpMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
#if defined(PADDLE_WITH_CNCL)
auto
in
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
out
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Out"
);
if
(
ctx
.
HasInput
(
"Cond"
))
{
auto
cond
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"Cond"
);
auto
place
=
cond
->
place
();
PADDLE_ENFORCE_EQ
(
platform
::
is_cpu_place
(
place
),
true
,
platform
::
errors
::
PreconditionNotMet
(
"The input `cond` tensor should be on cpu place"
));
PADDLE_ENFORCE_EQ
(
cond
->
numel
(),
1
,
platform
::
errors
::
PreconditionNotMet
(
"The input `cond` should be shape [1]"
));
if
(
!
cond
->
data
<
bool
>
()[
0
])
{
VLOG
(
4
)
<<
"Skip all reduce Op since cond is 0"
;
return
;
}
}
auto
place
=
ctx
.
GetPlace
();
cnclDataType_t
dtype
=
platform
::
ToCNCLDataType
(
framework
::
TransToProtoVarType
(
in
->
dtype
()));
int64_t
numel
=
in
->
numel
();
const
void
*
sendbuff
=
in
->
data
<
T
>
();
out
->
Resize
(
in
->
dims
());
void
*
recvbuff
=
out
->
mutable_data
<
T
>
(
place
);
int
rid
=
ctx
.
Attr
<
int
>
(
"ring_id"
);
auto
comm
=
platform
::
CNCLCommContext
::
Instance
().
Get
(
rid
,
place
);
mluStream
stream
=
nullptr
;
if
(
ctx
.
Attr
<
bool
>
(
"use_calc_stream"
))
{
auto
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
);
stream
=
static_cast
<
platform
::
MLUDeviceContext
*>
(
dev_ctx
)
->
stream
();
}
else
{
stream
=
comm
->
stream
();
}
cnclReduceOp_t
cncl_red_type
=
cnclSum
;
switch
(
red_type
)
{
case
kRedSum
:
cncl_red_type
=
cnclSum
;
break
;
case
kRedMax
:
cncl_red_type
=
cnclMax
;
break
;
case
kRedMin
:
cncl_red_type
=
cnclMin
;
break
;
case
kRedProd
:
cncl_red_type
=
cnclProd
;
break
;
default:
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Invalid reduce type: %d"
,
red_type
));
}
PADDLE_ENFORCE_MLU_SUCCESS
(
cnclAllReduce
(
sendbuff
,
recvbuff
,
numel
,
dtype
,
cncl_red_type
,
comm
->
comm
(),
stream
));
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"PaddlePaddle should compile with MLU."
));
#endif
}
};
class
CAllReduceOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
...
...
paddle/fluid/operators/collective/c_comm_init_op.cc
浏览文件 @
25bd5ed8
...
...
@@ -20,15 +20,12 @@ limitations under the License. */
#if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h"
#endif
#if defined(PADDLE_WITH_CNCL)
#include <cncl.h>
#endif
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
|| defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
...
...
@@ -58,9 +55,6 @@ class CCommInitOp : public framework::OperatorBase {
#elif defined(PADDLE_WITH_XPU_BKCL)
using
UniqueId
=
BKCLUniqueId
;
using
CommContext
=
platform
::
BKCLCommContext
;
#elif defined(PADDLE_WITH_CNCL)
using
UniqueId
=
cnclCliqueId
;
using
CommContext
=
platform
::
CNCLCommContext
;
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"PaddlePaddle should be compiled with GPU or XPU or MLU."
));
...
...
@@ -74,7 +68,7 @@ class CCommInitOp : public framework::OperatorBase {
"CCommInitOp can run on gpu or xpu or mlu place only."
));
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
|| defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_XPU_BKCL)
auto
var
=
scope
.
FindVar
(
Input
(
"X"
));
PADDLE_ENFORCE_NOT_NULL
(
var
,
platform
::
errors
::
InvalidArgument
(
"Input con not be empty."
));
...
...
paddle/fluid/operators/collective/c_gen_cncl_id_op.cc
已删除
100644 → 0
浏览文件 @
dfcba7f4
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cncl.h>
#include <string>
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
namespace
paddle
{
namespace
operators
{
static
void
GenCNCLID
(
std
::
vector
<
cnclCliqueId
>*
cncl_ids
)
{
for
(
size_t
i
=
0
;
i
<
cncl_ids
->
size
();
++
i
)
{
PADDLE_ENFORCE_MLU_SUCCESS
(
cnclGetCliqueId
(
&
(
*
cncl_ids
)[
i
]));
}
}
static
void
CopyCNCLIDToVar
(
const
std
::
vector
<
cnclCliqueId
>&
cncl_ids
,
std
::
function
<
std
::
string
(
size_t
)
>
func
,
const
framework
::
Scope
&
scope
)
{
for
(
size_t
i
=
0
;
i
<
cncl_ids
.
size
();
++
i
)
{
std
::
string
var_name
=
func
(
i
);
auto
var
=
scope
.
FindVar
(
var_name
);
PADDLE_ENFORCE_NOT_NULL
(
var
,
platform
::
errors
::
NotFound
(
"Variable with name %s is not found"
,
var_name
.
c_str
()));
auto
cncl_id
=
var
->
GetMutable
<
cnclCliqueId
>
();
memcpy
(
cncl_id
,
&
cncl_ids
[
i
],
sizeof
(
cnclCliqueId
));
}
}
class
CGenCNCLIdOp
:
public
framework
::
OperatorBase
{
public:
CGenCNCLIdOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
)
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
int
rank
=
Attr
<
int
>
(
"rank"
);
int
ring_id
=
Attr
<
int
>
(
"ring_id"
);
std
::
function
<
std
::
string
(
size_t
)
>
func
=
[
&
](
size_t
i
)
->
std
::
string
{
return
Output
(
"Out"
);
};
std
::
string
endpoint
=
Attr
<
std
::
string
>
(
"endpoint"
);
int
server_fd
=
platform
::
SocketServer
::
GetInstance
(
endpoint
).
socket
();
std
::
vector
<
cnclCliqueId
>
cncl_ids
;
cncl_ids
.
resize
(
1
);
if
(
rank
==
0
)
{
GenCNCLID
(
&
cncl_ids
);
std
::
vector
<
std
::
string
>
endpoint_list
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"other_endpoints"
);
platform
::
SendBroadCastCommID
(
endpoint_list
,
&
cncl_ids
,
ring_id
);
}
else
{
platform
::
RecvBroadCastCommID
(
server_fd
,
endpoint
,
&
cncl_ids
,
ring_id
);
}
CopyCNCLIDToVar
(
cncl_ids
,
func
,
scope
);
}
};
class
CGenCNCLIdOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddOutput
(
"Out"
,
"Raw variable contains a CNCL CliqueId instaces."
);
AddComment
(
R"DOC(
CGenCNCLId operator
For trainer 0: generate a new CliqueId and send it to all the other trainers.
For trainer 1~n: start a gRPC server to get the CliqueId, once got, stop the server.
)DOC"
);
AddAttr
<
std
::
string
>
(
"endpoint"
,
"(string), e.g. 127.0.0.1:6175 "
"current listen endpoint"
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"other_endpoints"
,
"['trainer1_ip:port', 'trainer2_ip:port', ...] "
"list of other trainer endpoints"
)
.
SetDefault
({});
AddAttr
<
int
>
(
"rank"
,
"(int default 0) "
"The rank of the trainer in distributed training."
)
.
SetDefault
(
0
);
AddAttr
<
int
>
(
"ring_id"
,
"(int default 0) user specified ring id"
)
.
SetDefault
(
0
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
c_gen_cncl_id
,
ops
::
CGenCNCLIdOp
,
ops
::
CGenCNCLIdOpMaker
);
paddle/fluid/operators/collective/c_reduce_op.h
浏览文件 @
25bd5ed8
...
...
@@ -44,10 +44,6 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
...
...
@@ -286,73 +282,6 @@ class CReduceOpCUDAKernel : public framework::OpKernel<T> {
template <typename T, typename DeviceContext> \
class op_name##CUDAKernel : public CReduceOpCUDAKernel<red_type, T> {};
template
<
ReduceType
red_type
,
typename
T
>
class
CReduceOpMLUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
#if defined(PADDLE_WITH_CNCL)
auto
in
=
ctx
.
Input
<
phi
::
DenseTensor
>
(
"X"
);
auto
out
=
ctx
.
Output
<
phi
::
DenseTensor
>
(
"Out"
);
auto
place
=
ctx
.
GetPlace
();
cnclDataType_t
dtype
=
platform
::
ToCNCLDataType
(
framework
::
TransToProtoVarType
(
in
->
dtype
()));
int64_t
numel
=
in
->
numel
();
const
void
*
sendbuff
=
in
->
data
();
out
->
Resize
(
in
->
dims
());
void
*
recvbuff
=
out
->
mutable_data
<
T
>
(
place
);
int
rid
=
ctx
.
Attr
<
int
>
(
"ring_id"
);
int
root
=
ctx
.
Attr
<
int
>
(
"root_id"
);
auto
comm
=
paddle
::
platform
::
CNCLCommContext
::
Instance
().
Get
(
rid
,
place
);
mluStream
stream
=
nullptr
;
if
(
ctx
.
Attr
<
bool
>
(
"use_calc_stream"
))
{
auto
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
);
stream
=
static_cast
<
platform
::
MLUDeviceContext
*>
(
dev_ctx
)
->
stream
();
}
else
{
stream
=
comm
->
stream
();
}
cnclReduceOp_t
cncl_red_type
=
cnclSum
;
switch
(
red_type
)
{
case
kRedSum
:
cncl_red_type
=
cnclSum
;
break
;
case
kRedMax
:
cncl_red_type
=
cnclMax
;
break
;
case
kRedMin
:
cncl_red_type
=
cnclMin
;
break
;
case
kRedProd
:
cncl_red_type
=
cnclProd
;
break
;
default:
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Invalid reduce type: %d"
,
red_type
));
}
PADDLE_ENFORCE_MLU_SUCCESS
(
cnclReduce
(
sendbuff
,
recvbuff
,
numel
,
dtype
,
cncl_red_type
,
root
,
comm
->
comm
(),
stream
));
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"PaddlePaddle should compile with MLU."
));
#endif
}
};
class
CReduceOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
...
...
paddle/fluid/operators/collective/c_sync_calc_stream_op.h
浏览文件 @
25bd5ed8
...
...
@@ -47,16 +47,6 @@ class CSyncCalcStreamKernel : public framework::OpKernel<T> {
platform
::
GpuStreamSync
(
dev_ctx
->
stream
());
#elif defined(PADDLE_WITH_CNCL)
auto
place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE_EQ
(
platform
::
is_mlu_place
(
place
),
true
,
platform
::
errors
::
PreconditionNotMet
(
"Sync stream op can run on mlu place only for now."
));
auto
dev_ctx
=
static_cast
<
platform
::
MLUDeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
platform
::
MLUStreamSync
(
dev_ctx
->
stream
());
#elif defined(PADDLE_WITH_XPU_BKCL)
auto
place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE_EQ
(
platform
::
is_xpu_place
(
place
),
...
...
paddle/fluid/operators/collective/c_sync_comm_stream_op.h
浏览文件 @
25bd5ed8
...
...
@@ -22,10 +22,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
...
...
@@ -45,16 +41,6 @@ class CSyncCommStreamKernel : public framework::OpKernel<T> {
platform
::
GpuStreamSync
(
stream
);
#elif defined(PADDLE_WITH_CNCL)
auto
place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE_EQ
(
platform
::
is_mlu_place
(
place
),
true
,
platform
::
errors
::
PreconditionNotMet
(
"Sync stream op can run on mlu place only for now."
));
int
ring_id
=
ctx
.
Attr
<
int
>
(
"ring_id"
);
auto
stream
=
platform
::
CNCLCommContext
::
Instance
().
Get
(
ring_id
,
place
)
->
stream
();
platform
::
MLUStreamSync
(
stream
);
#elif defined(PADDLE_WITH_XPU_BKCL)
auto
place
=
ctx
.
GetPlace
();
PADDLE_ENFORCE_EQ
(
platform
::
is_xpu_place
(
place
),
...
...
paddle/fluid/platform/CMakeLists.txt
浏览文件 @
25bd5ed8
...
...
@@ -139,10 +139,6 @@ cc_library(
SRCS collective_helper.cc gen_comm_id_helper.cc
DEPS framework_proto device_context enforce
)
if
(
WITH_CNCL
)
target_link_libraries
(
collective_helper mlu_collective_helper
)
endif
()
if
(
WITH_GPU OR WITH_ROCM
)
target_link_libraries
(
device_context gpu_resource_pool
)
endif
()
...
...
paddle/fluid/platform/collective_helper.h
浏览文件 @
25bd5ed8
...
...
@@ -23,9 +23,6 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
namespace
paddle
{
namespace
platform
{
...
...
@@ -246,108 +243,5 @@ class BKCLCommContext {
DISABLE_COPY_AND_ASSIGN
(
BKCLCommContext
);
};
#endif
#if defined(PADDLE_WITH_CNCL)
// In order to apply hierarchical communication with CNCL, we need
// a communication ring contains CNCL communicators associated to a global
// cnclUniqueId. E.g. for a hierarchical case,
//
// 11 - 12 21 - 22
// | | | |
// 13 - 14 - 23 - 24
// | |
// 31 - 32 - 41 - 42
// | | | |
// 33 - 34 43 - 44
//
// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
// (31,32,33,34), (41,42,43,44) as bottoms respectively.
//
// We could also use a single communication ring for the flatten case
//
// The CNCLComm instance is created and reversed in the CNCLCommContext
// singleton with a global user specified group id.
class
MLUDeviceContext
;
class
CNCLComm
{
public:
virtual
int
ring_id
()
const
=
0
;
virtual
int
nranks
()
const
=
0
;
virtual
int
rank
()
const
=
0
;
virtual
int
device_id
()
const
=
0
;
virtual
cnclComm_t
comm
()
const
=
0
;
virtual
mluStream
stream
()
const
=
0
;
virtual
MLUDeviceContext
*
dev_context
()
const
=
0
;
virtual
~
CNCLComm
()
=
default
;
};
// A singleton CNCL communicator context reserves communication ring ids
class
CNCLCommContext
{
public:
static
CNCLCommContext
&
Instance
()
{
static
CNCLCommContext
comm_ctx
;
return
comm_ctx
;
}
CNCLComm
*
CreateComm
(
cnclCliqueId
*
cncl_id
,
int
nranks
,
int
rank
,
int
dev_id
,
int
ring_id
=
0
);
void
CreateAllCNCLComms
(
const
std
::
vector
<
int
>&
dev_ids
,
int
ring_id
=
0
);
// a latter comm with the same dev_id and the same ring_id
// will override the former
CNCLComm
*
AssignCNCLComm
(
cnclComm_t
comm
,
int
nranks
,
int
rank
,
int
dev_id
,
int
ring_id
=
0
);
// retrieve a communicator by the ring id in multiprocessing mode
CNCLComm
*
Get
(
int
ring_id
)
const
{
PADDLE_ENFORCE_GT
(
comm_map_
.
count
(
ring_id
),
0
,
platform
::
errors
::
InvalidArgument
(
"Communicator in ring id %d has not been initialized."
,
ring_id
));
PADDLE_ENFORCE_EQ
(
comm_map_
.
at
(
ring_id
).
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"One device id should be specified to retrieve from "
"multiple communicators."
));
return
comm_map_
.
at
(
ring_id
).
begin
()
->
second
.
get
();
}
// retrieve a communicator by the ring id and the device id
CNCLComm
*
Get
(
int
ring_id
,
int
dev_id
)
const
{
PADDLE_ENFORCE_GT
(
comm_map_
.
count
(
ring_id
),
0
,
platform
::
errors
::
InvalidArgument
(
"Communicator of ring id %d has not been initialized."
,
ring_id
));
PADDLE_ENFORCE_GT
(
comm_map_
.
at
(
ring_id
).
count
(
dev_id
),
0
,
platform
::
errors
::
InvalidArgument
(
"Communicator at device id %d has not been initialized in ring %d."
,
dev_id
,
ring_id
));
return
comm_map_
.
at
(
ring_id
).
at
(
dev_id
).
get
();
}
// retrieve a communicator by the ring id and place
CNCLComm
*
Get
(
int
ring_id
,
Place
place
)
const
{
return
Get
(
ring_id
,
place
.
device
);
}
private:
std
::
once_flag
once_flag_
;
std
::
mutex
comm_map_mutex_
;
// ring id to dev-CNCLComm
std
::
map
<
int
,
std
::
map
<
int
,
std
::
unique_ptr
<
CNCLComm
>>>
comm_map_
;
void
ReleaseCNCLComms
();
CNCLCommContext
()
=
default
;
DISABLE_COPY_AND_ASSIGN
(
CNCLCommContext
);
};
#endif
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/gen_comm_id_helper.cc
浏览文件 @
25bd5ed8
...
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
|| defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include <arpa/inet.h>
...
...
@@ -34,10 +34,6 @@ limitations under the License. */
#include "xpu/bkcl.h"
#endif
#if defined(PADDLE_WITH_CNCL)
#include <cncl.h>
#endif
DECLARE_int32
(
get_host_by_name_time
);
namespace
paddle
{
...
...
@@ -448,10 +444,6 @@ INSTANT_TEMPLATE(ncclUniqueId)
#ifdef PADDLE_WITH_XPU_BKCL
INSTANT_TEMPLATE
(
BKCLUniqueId
)
#endif
#ifdef PADDLE_WITH_CNCL
INSTANT_TEMPLATE
(
cnclCliqueId
)
#endif
}
// namespace platform
}
// namespace paddle
...
...
paddle/fluid/platform/gen_comm_id_helper.h
浏览文件 @
25bd5ed8
...
...
@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
|| defined(PADDLE_WITH_CNCL)
defined(PADDLE_WITH_XPU_BKCL)
#include <functional>
#include <memory>
#include <mutex>
...
...
paddle/fluid/pybind/CMakeLists.txt
浏览文件 @
25bd5ed8
...
...
@@ -91,11 +91,6 @@ if(WITH_XPU_BKCL)
set
(
PYBIND_DEPS
${
PYBIND_DEPS
}
heter_ccl_context
)
endif
()
if
(
WITH_CNCL
)
set
(
PYBIND_DEPS
${
PYBIND_DEPS
}
reducer
)
set
(
PYBIND_DEPS
${
PYBIND_DEPS
}
cncl_context
)
endif
()
if
(
NOT WIN32
)
set
(
PYBIND_DEPS
${
PYBIND_DEPS
}
data_loader
)
if
(
WITH_NCCL OR WITH_RCCL
)
...
...
@@ -259,10 +254,6 @@ if(WITH_PYTHON)
list
(
APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context
)
endif
()
if
(
WITH_CNCL
)
list
(
APPEND OP_FUNCTION_GENERETOR_DEPS cncl_context
)
endif
()
if
(
NOT
((
NOT WITH_PYTHON
)
AND ON_INFER
))
list
(
APPEND OP_FUNCTION_GENERETOR_DEPS
${
PYTHON_LIBRARIES
}
)
endif
()
...
...
paddle/fluid/pybind/imperative.cc
浏览文件 @
25bd5ed8
...
...
@@ -41,7 +41,6 @@ limitations under the License. */
#include "paddle/fluid/imperative/amp_auto_cast.h"
#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/bkcl_context.h"
#include "paddle/fluid/imperative/cncl_context.h"
#include "paddle/fluid/imperative/data_loader.h"
#include "paddle/fluid/imperative/gloo_context.h"
#include "paddle/fluid/imperative/hccl_context.h"
...
...
@@ -2490,9 +2489,8 @@ void BindImperative(py::module *m_ptr) {
},
py
::
call_guard
<
py
::
gil_scoped_release
>
());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_CNCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO)
py
::
class_
<
imperative
::
ParallelContext
,
std
::
shared_ptr
<
imperative
::
ParallelContext
>>
(
m
,
"ParallelContext"
);
...
...
python/paddle/fluid/framework.py
浏览文件 @
25bd5ed8
...
...
@@ -2759,7 +2759,6 @@ class Operator:
'c_wait_comm'
,
'c_wait_compute'
,
'copy_cross_scope'
,
'c_gen_cncl_id'
,
}
def
__init__
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录