Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
92faeedf
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
92faeedf
编写于
3月 31, 2022
作者:
L
lilong12
提交者:
GitHub
3月 31, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Pg heter cloud (#40911)
上级
ec510bfd
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
605 addition
and
133 deletion
+605
-133
paddle/fluid/distributed/collective/CMakeLists.txt
paddle/fluid/distributed/collective/CMakeLists.txt
+9
-2
paddle/fluid/distributed/collective/Common.cc
paddle/fluid/distributed/collective/Common.cc
+54
-0
paddle/fluid/distributed/collective/Common.h
paddle/fluid/distributed/collective/Common.h
+33
-0
paddle/fluid/distributed/collective/HCCLTools.cc
paddle/fluid/distributed/collective/HCCLTools.cc
+46
-0
paddle/fluid/distributed/collective/HCCLTools.h
paddle/fluid/distributed/collective/HCCLTools.h
+4
-0
paddle/fluid/distributed/collective/NCCLTools.cc
paddle/fluid/distributed/collective/NCCLTools.cc
+46
-0
paddle/fluid/distributed/collective/NCCLTools.h
paddle/fluid/distributed/collective/NCCLTools.h
+5
-0
paddle/fluid/distributed/collective/ProcessGroup.cc
paddle/fluid/distributed/collective/ProcessGroup.cc
+7
-1
paddle/fluid/distributed/collective/ProcessGroup.h
paddle/fluid/distributed/collective/ProcessGroup.h
+41
-9
paddle/fluid/distributed/collective/ProcessGroupGloo.cc
paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+4
-2
paddle/fluid/distributed/collective/ProcessGroupGloo.h
paddle/fluid/distributed/collective/ProcessGroupGloo.h
+1
-1
paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
+4
-51
paddle/fluid/distributed/collective/ProcessGroupHCCL.h
paddle/fluid/distributed/collective/ProcessGroupHCCL.h
+2
-1
paddle/fluid/distributed/collective/ProcessGroupHeter.cc
paddle/fluid/distributed/collective/ProcessGroupHeter.cc
+209
-0
paddle/fluid/distributed/collective/ProcessGroupHeter.h
paddle/fluid/distributed/collective/ProcessGroupHeter.h
+114
-0
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+3
-57
paddle/fluid/distributed/collective/ProcessGroupNCCL.h
paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+2
-1
paddle/fluid/operators/collective/c_broadcast_op.cu.cc
paddle/fluid/operators/collective/c_broadcast_op.cu.cc
+9
-0
paddle/fluid/pybind/distributed_py.cc
paddle/fluid/pybind/distributed_py.cc
+12
-8
未找到文件。
paddle/fluid/distributed/collective/CMakeLists.txt
浏览文件 @
92faeedf
...
@@ -6,8 +6,15 @@ if (WITH_DISTRIBUTE)
...
@@ -6,8 +6,15 @@ if (WITH_DISTRIBUTE)
endif
()
endif
()
if
(
WITH_NCCL
)
if
(
WITH_NCCL
)
cc_library
(
processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api
)
cc_library
(
processgroup_nccl SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api
)
if
(
WITH_DISTRIBUTE
)
cc_library
(
processgroup_heter SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api
)
endif
()
endif
()
endif
()
if
(
WITH_ASCEND_CL
)
if
(
WITH_ASCEND_CL
)
cc_library
(
processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api
)
cc_library
(
processgroup_hccl SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api
)
if
(
WITH_DISTRIBUTE
)
cc_library
(
processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api
)
endif
()
endif
()
endif
()
paddle/fluid/distributed/collective/Common.cc
0 → 100644
浏览文件 @
92faeedf
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/Common.h"
namespace
paddle
{
namespace
distributed
{
std
::
vector
<
Place
>
GetPlaceList
(
const
std
::
vector
<
Tensor
>&
tensors
)
{
std
::
vector
<
Place
>
places
;
places
.
reserve
(
tensors
.
size
());
for
(
auto
&
tensor
:
tensors
)
{
places
.
push_back
(
tensor
.
inner_place
());
}
return
places
;
}
// Produces a comma-separated key from `places` by streaming each place
// through operator<< and joining the pieces with ",". An empty input yields
// an empty string.
std::string GetKeyFromPlaces(const std::vector<Place>& places) {
  std::string key;
  for (const auto& p : places) {
    std::stringstream rendered;
    rendered << p;
    // A separator is only needed once something has already been written.
    if (!key.empty()) {
      key += ",";
    }
    key += rendered.str();
  }
  return key;
}
static
bool
CheckTensorsInPlace
(
const
std
::
vector
<
Tensor
>&
tensors
,
const
PlaceType
type
)
{
return
std
::
all_of
(
tensors
.
cbegin
(),
tensors
.
cend
(),
[
&
](
const
Tensor
&
t
)
{
return
t
.
place
()
==
type
;
});
}
// Returns true when every tensor in `tensors` reports PlaceType::kGPU as its
// place (and therefore also for an empty list).
bool CheckTensorsInCudaPlace(const std::vector<Tensor>& tensors) {
  const bool all_on_gpu = CheckTensorsInPlace(tensors, PlaceType::kGPU);
  return all_on_gpu;
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/Common.h
0 → 100644
浏览文件 @
92faeedf
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
namespace
paddle
{
namespace
distributed
{
using
Tensor
=
paddle
::
experimental
::
Tensor
;
using
Place
=
paddle
::
platform
::
Place
;
// Returns the place of every tensor in `tensors`, one entry per tensor.
std::vector<Place> GetPlaceList(const std::vector<Tensor>& tensors);
// Returns a single string key built from the given list of places.
std::string GetKeyFromPlaces(const std::vector<Place>& places);
// Returns true when all of `tensors` are placed on a CUDA (GPU) device.
bool CheckTensorsInCudaPlace(const std::vector<Tensor>& tensors);
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/HCCLTools.cc
0 → 100644
浏览文件 @
92faeedf
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
distributed
{
// Converts a ReduceOp into the corresponding HcclReduceOp.
// Only MIN, MAX, SUM and PRODUCT have a mapping; any other value raises
// platform::errors::InvalidArgument through PADDLE_ENFORCE_EQ.
HcclReduceOp ToHCCLRedType(ReduceOp reduction) {
  // Lookup table, constructed the first time the function runs.
  static const std::map<ReduceOp, HcclReduceOp> red_type{
      {ReduceOp::SUM, HCCL_REDUCE_SUM},
      {ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
      {ReduceOp::MIN, HCCL_REDUCE_MIN},
      {ReduceOp::MAX, HCCL_REDUCE_MAX},
  };
  const auto it = red_type.find(reduction);
  PADDLE_ENFORCE_EQ(it != red_type.end(), true,
                    platform::errors::InvalidArgument(
                        "Invalid hccl reduction. "
                        "Must be Min | Max | Prod | Sum"));
  return it->second;
}
// Renders the raw bytes of `hcclID` as a lowercase hexadecimal string.
//
// Every byte is written as exactly two hex digits, so the result always has
// 2 * sizeof(HcclRootInfo) characters and distinct ids always produce
// distinct strings. The earlier unpadded form dropped leading zeros, so for
// example the byte pairs {0x01, 0x23} and {0x12, 0x03} both became "123".
std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) {
  static constexpr char kHexDigits[] = "0123456789abcdef";
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&hcclID);
  std::string hex;
  hex.reserve(2 * sizeof(hcclID));
  for (size_t i = 0; i < sizeof(hcclID); ++i) {
    hex.push_back(kHexDigits[bytes[i] >> 4]);    // high nibble
    hex.push_back(kHexDigits[bytes[i] & 0x0F]);  // low nibble
  }
  return hex;
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/HCCLTools.h
浏览文件 @
92faeedf
...
@@ -18,6 +18,7 @@
...
@@ -18,6 +18,7 @@
#include <string>
#include <string>
#include "boost/variant.hpp"
#include "boost/variant.hpp"
#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/collective_helper.h"
...
@@ -170,5 +171,8 @@ class HCCLCommManager {
...
@@ -170,5 +171,8 @@ class HCCLCommManager {
mutable
std
::
mutex
mutex_
;
mutable
std
::
mutex
mutex_
;
};
};
HcclReduceOp
ToHCCLRedType
(
ReduceOp
reduction
);
std
::
string
SerializeHCCLUniqueId
(
const
HcclRootInfo
&
hcclID
);
}
// namespace distributed
}
// namespace distributed
}
// namespace paddle
}
// namespace paddle
paddle/fluid/distributed/collective/NCCLTools.cc
0 → 100644
浏览文件 @
92faeedf
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
distributed
{
// Converts a ReduceOp into the corresponding ncclRedOp_t.
// Only MIN, MAX, SUM and PRODUCT have a mapping; any other value raises
// platform::errors::InvalidArgument through PADDLE_ENFORCE_EQ.
ncclRedOp_t ToNCCLRedType(ReduceOp reduction) {
  // Lookup table, constructed the first time the function runs.
  static const std::map<ReduceOp, ncclRedOp_t> red_type{
      {ReduceOp::SUM, ncclSum},
      {ReduceOp::PRODUCT, ncclProd},
      {ReduceOp::MIN, ncclMin},
      {ReduceOp::MAX, ncclMax},
  };
  const auto it = red_type.find(reduction);
  PADDLE_ENFORCE_EQ(it != red_type.end(), true,
                    platform::errors::InvalidArgument(
                        "Invalid nccl reduction. Must be ncclMin | ncclMax | "
                        "ncclProd | ncclSum"));
  return it->second;
}
// Renders the first NCCL_UNIQUE_ID_BYTES bytes of `ncclID` as a lowercase
// hexadecimal string.
//
// Every byte is written as exactly two hex digits, so the result always has
// 2 * NCCL_UNIQUE_ID_BYTES characters and distinct ids always produce
// distinct strings. The earlier unpadded form dropped leading zeros, so for
// example the byte pairs {0x01, 0x23} and {0x12, 0x03} both became "123".
std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) {
  static constexpr char kHexDigits[] = "0123456789abcdef";
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&ncclID);
  const size_t num_bytes = static_cast<size_t>(NCCL_UNIQUE_ID_BYTES);
  std::string hex;
  hex.reserve(2 * num_bytes);
  for (size_t i = 0; i < num_bytes; ++i) {
    hex.push_back(kHexDigits[bytes[i] >> 4]);    // high nibble
    hex.push_back(kHexDigits[bytes[i] & 0x0F]);  // low nibble
  }
  return hex;
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/NCCLTools.h
浏览文件 @
92faeedf
...
@@ -26,6 +26,8 @@
...
@@ -26,6 +26,8 @@
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/distributed/collective/Types.h"
namespace
paddle
{
namespace
paddle
{
namespace
distributed
{
namespace
distributed
{
...
@@ -194,5 +196,8 @@ class NCCLCommManager {
...
@@ -194,5 +196,8 @@ class NCCLCommManager {
mutable
std
::
mutex
mutex_
;
mutable
std
::
mutex
mutex_
;
};
};
ncclRedOp_t
ToNCCLRedType
(
ReduceOp
reduction
);
std
::
string
SerializeNCCLUniqueId
(
const
ncclUniqueId
&
ncclID
);
}
// namespace distributed
}
// namespace distributed
}
// namespace paddle
}
// namespace paddle
paddle/fluid/distributed/collective/ProcessGroup.cc
浏览文件 @
92faeedf
...
@@ -34,7 +34,13 @@ bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) {
...
@@ -34,7 +34,13 @@ bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) {
void
ProcessGroup
::
Task
::
Synchronize
()
{}
void
ProcessGroup
::
Task
::
Synchronize
()
{}
ProcessGroup
::
ProcessGroup
(
int
rank
,
int
size
)
:
rank_
(
rank
),
size_
(
size
)
{}
// Stores the rank and size of the group and, unless `gid` is IGNORE_ID,
// registers this object in the ProcessGroupMapFromGid singleton under `gid`.
// NOTE(review): the registry keeps the raw `this` pointer and no removal is
// visible in this file when a ProcessGroup is destroyed -- confirm lifetime.
ProcessGroup::ProcessGroup(int rank, int size, int gid)
    : rank_(rank), size_(size) {
  if (gid == IGNORE_ID) {
    // Groups created with IGNORE_ID are not published in the registry.
    return;
  }
  ProcessGroupMapFromGid::getInstance()->insert(gid, this);
}
}
// namespace distributed
}
// namespace distributed
}
// namespace paddle
}
// namespace paddle
paddle/fluid/distributed/collective/ProcessGroup.h
浏览文件 @
92faeedf
...
@@ -31,6 +31,7 @@ constexpr auto kWaitTimeout = std::chrono::milliseconds(0);
...
@@ -31,6 +31,7 @@ constexpr auto kWaitTimeout = std::chrono::milliseconds(0);
namespace
paddle
{
namespace
paddle
{
namespace
distributed
{
namespace
distributed
{
constexpr
int
IGNORE_ID
=
-
1
;
using
Tensor
=
paddle
::
experimental
::
Tensor
;
using
Tensor
=
paddle
::
experimental
::
Tensor
;
enum
class
CommType
:
std
::
uint8_t
{
enum
class
CommType
:
std
::
uint8_t
{
...
@@ -49,14 +50,6 @@ enum class CommType : std::uint8_t {
...
@@ -49,14 +50,6 @@ enum class CommType : std::uint8_t {
UNKNOWN
=
100
,
UNKNOWN
=
100
,
};
};
struct
ProcessGroupStrategy
{
int
nranks_
{
1
};
int
local_rank_
{
0
};
std
::
vector
<
std
::
string
>
trainer_endpoints_
{};
std
::
string
current_endpoint_
{
""
};
int
nrings_
{
1
};
};
class
ProcessGroup
{
class
ProcessGroup
{
public:
public:
class
Task
{
class
Task
{
...
@@ -76,7 +69,7 @@ class ProcessGroup {
...
@@ -76,7 +69,7 @@ class ProcessGroup {
bool
is_completed_
=
false
;
bool
is_completed_
=
false
;
};
};
explicit
ProcessGroup
(
int
rank
,
int
size
);
explicit
ProcessGroup
(
int
rank
,
int
size
,
int
gid
);
virtual
~
ProcessGroup
()
{}
virtual
~
ProcessGroup
()
{}
int
GetRank
()
const
{
return
rank_
;
}
int
GetRank
()
const
{
return
rank_
;
}
...
@@ -99,6 +92,12 @@ class ProcessGroup {
...
@@ -99,6 +92,12 @@ class ProcessGroup {
"ProcessGroup%s does not support broadcast"
,
GetBackendName
()));
"ProcessGroup%s does not support broadcast"
,
GetBackendName
()));
}
}
// Broadcast overload working directly on phi::DenseTensor pointers.
// This base implementation always throws InvalidArgument, naming the
// backend returned by GetBackendName().
virtual void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) {
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroup%s does not support broadcast for static",
      GetBackendName()));
}
virtual
std
::
shared_ptr
<
ProcessGroup
::
Task
>
Barrier
(
virtual
std
::
shared_ptr
<
ProcessGroup
::
Task
>
Barrier
(
const
BarrierOptions
&
=
BarrierOptions
())
{
const
BarrierOptions
&
=
BarrierOptions
())
{
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
...
@@ -151,5 +150,38 @@ class ProcessGroup {
...
@@ -151,5 +150,38 @@ class ProcessGroup {
const
int
size_
;
const
int
size_
;
};
};
class
ProcessGroupMapFromGid
{
public:
bool
has
(
int
gid
)
{
auto
it
=
map_
.
find
(
gid
);
return
it
!=
map_
.
end
();
}
void
insert
(
int
gid
,
ProcessGroup
*
pg
)
{
PADDLE_ENFORCE_EQ
(
has
(
gid
),
false
,
platform
::
errors
::
PreconditionNotMet
(
"The process group with id %d doesnot exist."
,
gid
));
map_
[
gid
]
=
pg
;
}
ProcessGroup
*
get
(
int
gid
)
{
PADDLE_ENFORCE_EQ
(
has
(
gid
),
false
,
platform
::
errors
::
PreconditionNotMet
(
"The process group with id %d doesnot exist."
,
gid
));
return
map_
.
find
(
gid
)
->
second
;
}
static
std
::
shared_ptr
<
ProcessGroupMapFromGid
>
getInstance
()
{
static
auto
s_instance
=
std
::
make_shared
<
ProcessGroupMapFromGid
>
();
return
s_instance
;
}
ProcessGroupMapFromGid
()
=
default
;
~
ProcessGroupMapFromGid
()
=
default
;
private:
std
::
unordered_map
<
int
,
ProcessGroup
*>
map_
;
};
}
// namespace distributed
}
// namespace distributed
}
// namespace paddle
}
// namespace paddle
paddle/fluid/distributed/collective/ProcessGroupGloo.cc
浏览文件 @
92faeedf
...
@@ -173,8 +173,10 @@ ProcessGroupGloo::GlooTask::GlooTask(int rank,
...
@@ -173,8 +173,10 @@ ProcessGroupGloo::GlooTask::GlooTask(int rank,
ProcessGroupGloo
::
ProcessGroupGloo
(
ProcessGroupGloo
::
ProcessGroupGloo
(
const
std
::
shared_ptr
<
paddle
::
distributed
::
Store
>&
store
,
int
rank
,
const
std
::
shared_ptr
<
paddle
::
distributed
::
Store
>&
store
,
int
rank
,
int
world_size
,
const
std
::
shared_ptr
<
GlooOptions
>
options
)
int
world_size
,
int
gid
,
const
std
::
shared_ptr
<
GlooOptions
>
options
)
:
ProcessGroup
(
rank
,
world_size
),
_tag
(
0
),
_store
(
new
GlooStore
(
store
))
{
:
ProcessGroup
(
rank
,
world_size
,
gid
),
_tag
(
0
),
_store
(
new
GlooStore
(
store
))
{
_context
=
std
::
make_shared
<
gloo
::
rendezvous
::
Context
>
(
rank
,
world_size
);
_context
=
std
::
make_shared
<
gloo
::
rendezvous
::
Context
>
(
rank
,
world_size
);
auto
prefix_store
=
auto
prefix_store
=
::
gloo
::
rendezvous
::
PrefixStore
(
std
::
to_string
(
0
),
*
_store
);
::
gloo
::
rendezvous
::
PrefixStore
(
std
::
to_string
(
0
),
*
_store
);
...
...
paddle/fluid/distributed/collective/ProcessGroupGloo.h
浏览文件 @
92faeedf
...
@@ -101,7 +101,7 @@ class ProcessGroupGloo : public ProcessGroup {
...
@@ -101,7 +101,7 @@ class ProcessGroupGloo : public ProcessGroup {
explicit
ProcessGroupGloo
(
explicit
ProcessGroupGloo
(
const
std
::
shared_ptr
<
paddle
::
distributed
::
Store
>&
store
,
int
rank
,
const
std
::
shared_ptr
<
paddle
::
distributed
::
Store
>&
store
,
int
rank
,
int
world_size
,
std
::
shared_ptr
<
GlooOptions
>
options
);
int
world_size
,
int
gid
,
std
::
shared_ptr
<
GlooOptions
>
options
);
~
ProcessGroupGloo
()
=
default
;
~
ProcessGroupGloo
()
=
default
;
...
...
paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
浏览文件 @
92faeedf
...
@@ -13,6 +13,8 @@
...
@@ -13,6 +13,8 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#include "paddle/fluid/platform/device/npu/hccl_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/device_context.h"
...
@@ -28,55 +30,6 @@ constexpr int64_t kWaitBlockTImeout = 10;
...
@@ -28,55 +30,6 @@ constexpr int64_t kWaitBlockTImeout = 10;
namespace
paddle
{
namespace
paddle
{
namespace
distributed
{
namespace
distributed
{
static
HcclReduceOp
ToHCCLRedType
(
ReduceOp
reduction
)
{
static
const
std
::
map
<
ReduceOp
,
HcclReduceOp
>
red_type
=
{
{
ReduceOp
::
MIN
,
HCCL_REDUCE_MIN
},
{
ReduceOp
::
MAX
,
HCCL_REDUCE_MAX
},
{
ReduceOp
::
SUM
,
HCCL_REDUCE_SUM
},
{
ReduceOp
::
PRODUCT
,
HCCL_REDUCE_PROD
},
};
auto
it
=
red_type
.
find
(
reduction
);
PADDLE_ENFORCE_EQ
(
it
!=
red_type
.
end
(),
true
,
platform
::
errors
::
InvalidArgument
(
"Invalid hccl reduction. "
"Must be Min | Max | Prod | Sum"
));
return
it
->
second
;
}
std
::
string
SerializeHCCLUniqueId
(
const
HcclRootInfo
&
hcclID
)
{
const
uint8_t
*
bytes
=
reinterpret_cast
<
const
uint8_t
*>
(
&
hcclID
);
std
::
ostringstream
oss
;
for
(
size_t
i
=
0
;
i
<
sizeof
(
hcclID
);
++
i
)
{
oss
<<
std
::
hex
<<
static_cast
<
int
>
(
bytes
[
i
]);
}
return
oss
.
str
();
}
// Get the list of devices from list of tensors
std
::
vector
<
Place
>
GetPlaceList
(
const
std
::
vector
<
Tensor
>&
tensors
)
{
std
::
vector
<
Place
>
places
;
places
.
reserve
(
tensors
.
size
());
for
(
auto
&
tensor
:
tensors
)
{
places
.
push_back
(
tensor
.
inner_place
());
}
return
places
;
}
// Get the deviceList String from the list of devices
std
::
string
GetKeyFromPlaces
(
const
std
::
vector
<
Place
>&
places
)
{
std
::
string
placeList
;
for
(
auto
&
place
:
places
)
{
std
::
stringstream
tmp
;
tmp
<<
place
;
if
(
placeList
.
empty
())
{
placeList
+=
tmp
.
str
();
}
else
{
placeList
+=
","
+
tmp
.
str
();
}
}
return
placeList
;
}
// bool CheckTensorsInNPUPlace(const std::vector<Tensor>& tensors) {
// bool CheckTensorsInNPUPlace(const std::vector<Tensor>& tensors) {
// return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
// return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
// return t.place() == platform::DeviceType::NPU;
// return t.place() == platform::DeviceType::NPU;
...
@@ -150,8 +103,8 @@ bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
...
@@ -150,8 +103,8 @@ bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
void
ProcessGroupHCCL
::
HCCLTask
::
Synchronize
()
{
Wait
(
kWaitTimeout
);
}
void
ProcessGroupHCCL
::
HCCLTask
::
Synchronize
()
{
Wait
(
kWaitTimeout
);
}
ProcessGroupHCCL
::
ProcessGroupHCCL
(
const
std
::
shared_ptr
<
Store
>&
store
,
ProcessGroupHCCL
::
ProcessGroupHCCL
(
const
std
::
shared_ptr
<
Store
>&
store
,
int
rank
,
int
size
)
int
rank
,
int
size
,
int
gid
)
:
ProcessGroup
(
rank
,
size
),
store_
(
store
)
{}
:
ProcessGroup
(
rank
,
size
,
gid
),
store_
(
store
)
{}
void
ProcessGroupHCCL
::
BroadcastUniqueHCCLID
(
void
ProcessGroupHCCL
::
BroadcastUniqueHCCLID
(
std
::
vector
<
HcclRootInfo
>&
hccl_ids
)
{
// NOLINT
std
::
vector
<
HcclRootInfo
>&
hccl_ids
)
{
// NOLINT
...
...
paddle/fluid/distributed/collective/ProcessGroupHCCL.h
浏览文件 @
92faeedf
...
@@ -70,7 +70,8 @@ class ProcessGroupHCCL : public ProcessGroup {
...
@@ -70,7 +70,8 @@ class ProcessGroupHCCL : public ProcessGroup {
private:
private:
};
};
ProcessGroupHCCL
(
const
std
::
shared_ptr
<
Store
>&
store
,
int
rank
,
int
size
);
ProcessGroupHCCL
(
const
std
::
shared_ptr
<
Store
>&
store
,
int
rank
,
int
size
,
int
gid
);
const
std
::
string
GetBackendName
()
const
override
{
const
std
::
string
GetBackendName
()
const
override
{
return
std
::
string
(
HCCL_BACKEND_NAME
);
return
std
::
string
(
HCCL_BACKEND_NAME
);
...
...
paddle/fluid/distributed/collective/ProcessGroupHeter.cc
0 → 100644
浏览文件 @
92faeedf
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/place.h"
constexpr
int64_t
kWaitBlockTImeout
=
10
;
namespace
paddle
{
namespace
distributed
{
using
Place
=
paddle
::
platform
::
Place
;
std
::
shared_ptr
<
ProcessGroupHeter
::
HeterTask
>
ProcessGroupHeter
::
CreateTask
(
int
rank
,
CommType
comm_type
,
const
std
::
vector
<
Tensor
>&
inputs
)
{
return
std
::
make_shared
<
ProcessGroupHeter
::
HeterTask
>
(
rank
,
comm_type
,
inputs
);
}
// Constructs a HeterTask by forwarding everything to the ProcessGroup::Task
// base (note the base takes the inputs before the communication type).
// The parameter is called `comm_type`, matching CreateTask, instead of
// reusing the type name `CommType` as an identifier.
ProcessGroupHeter::HeterTask::HeterTask(int rank, CommType comm_type,
                                        const std::vector<Tensor>& inputs)
    : Task(rank, inputs, comm_type) {}
// Nothing to release beyond what the base class and members handle.
ProcessGroupHeter::HeterTask::~HeterTask() = default;
// Always reports the task as completed.
bool ProcessGroupHeter::HeterTask::IsCompleted() {
  return true;
}
// Returns true immediately without blocking.
// TODO(sheniang03): Add timeout for wait, now timeout unused
bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) {
  (void)timeout;  // intentionally ignored for now, see TODO above
  return true;
}
// Builds a heterogeneous process group made of two layers:
//   * inner_pg_: a ProcessGroupNCCL (PADDLE_WITH_NCCL) or ProcessGroupHCCL
//     (PADDLE_WITH_ASCEND_CL) created with `local_rank` / `local_size`;
//   * inter_pg_: a ProcessGroupGloo created with `gloo_rank` / `gloo_size`,
//     only when `with_switch` is false and `local_rank` is 0.
// Both nested groups are created with IGNORE_ID, so only this object is
// registered under `gid` (through the ProcessGroup base constructor).
//
// `switch_endpoint` is currently unused; it is only referenced by the
// commented-out client setup below.
//
// Throws InvalidArgument when built without NCCL and without ASCEND_CL.
ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr<Store>& store,
                                     int rank, int size, int gid,
                                     int local_rank, int local_size,
                                     int gloo_rank, int gloo_size,
                                     bool with_switch,
                                     std::string switch_endpoint)
    : ProcessGroup(rank, size, gid),
      store_(store),
      local_rank_(local_rank),
      local_size_(local_size),
      gloo_rank_(gloo_rank),
      gloo_size_(gloo_size),
      with_switch_(with_switch) {
#if defined(PADDLE_WITH_NCCL)
  inner_pg_ = std::make_shared<ProcessGroupNCCL>(store, local_rank, local_size,
                                                 IGNORE_ID);
#elif defined(PADDLE_WITH_ASCEND_CL)
  inner_pg_ = std::make_shared<ProcessGroupHCCL>(store, local_rank, local_size,
                                                 IGNORE_ID);
#else
  // The closing parenthesis of PADDLE_THROW was missing here, which broke
  // compilation of this branch.
  PADDLE_THROW(platform::errors::InvalidArgument(
      "ProcessGroupHeter only supports NCCL and HCCL now."));
#endif
  if (with_switch_) {
    // TODO(sandyhouse) starts a client to connect the cloud switch module
    // std::shared_ptr<HeterClient> client_ =
    // HeterClient::GetInstance({switch_endpoint}, {}, 0);
  } else if (local_rank_ == 0) {
    // Only local rank 0 joins the Gloo group.
    auto opts = ProcessGroupGloo::GlooOptions::create();
    opts->device = ProcessGroupGloo::createDefaultDevice();
    inter_pg_ = std::make_shared<ProcessGroupGloo>(store, gloo_rank_,
                                                   gloo_size_, IGNORE_ID, opts);
  }
}
std
::
shared_ptr
<
ProcessGroup
::
Task
>
ProcessGroupHeter
::
AllReduce
(
std
::
vector
<
Tensor
>&
tensors
,
const
AllreduceOptions
&
opts
)
{
#if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE_EQ
(
CheckTensorsInCudaPlace
(
tensors
),
true
,
platform
::
errors
::
InvalidArgument
(
"All inputs should be in CudaPlace."
));
#endif
// Step1: do allreduce in inner cluster
auto
task
=
inner_pg_
->
AllReduce
(
tensors
,
opts
);
task
->
Wait
();
// Step2: copy tensors to CPU
if
(
local_rank_
==
0
)
{
std
::
vector
<
Tensor
>
cpu_tensors
(
tensors
.
size
());
for
(
size_t
i
=
0
;
i
<
tensors
.
size
();
i
++
)
{
auto
dense_gpu_tensor
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
tensors
[
i
].
impl
());
auto
dense_cpu_tensor
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
cpu_tensors
[
i
].
impl
());
dense_cpu_tensor
->
Resize
(
tensors
[
i
].
dims
());
framework
::
TensorCopySync
(
*
dense_gpu_tensor
,
platform
::
CPUPlace
(),
dense_cpu_tensor
.
get
());
}
// Step3: do inter cluster allreduce
if
(
with_switch_
)
{
// TODO(sandyhouse) send to and recv from switch, and do add
}
else
{
auto
gloo_task
=
inter_pg_
->
AllReduce
(
cpu_tensors
,
opts
);
gloo_task
->
Wait
();
}
// Step4: copy cpu tensors to gpu
// TODO(sandyhouse)
// copy cpu tensors to gpu
for
(
size_t
i
=
0
;
i
<
tensors
.
size
();
i
++
)
{
auto
dense_gpu_tensor
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
tensors
[
i
].
impl
());
auto
dense_cpu_tensor
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
cpu_tensors
[
i
].
impl
());
// framework::TensorCopySync(*dense_cpu_tensor, tensors[i].place(),
// dense_gpu_tensor.get());
framework
::
TensorCopySync
(
*
dense_cpu_tensor
,
dense_cpu_tensor
->
place
(),
dense_gpu_tensor
.
get
());
}
}
// Step5: broadcast among inner cluster
auto
b_opts
=
BroadcastOptions
();
b_opts
.
source_root
=
0
;
auto
broadcast_task
=
inner_pg_
->
Broadcast
(
tensors
,
b_opts
);
broadcast_task
->
Wait
();
return
CreateTask
(
rank_
,
CommType
::
ALLREDUCE
,
tensors
);
}
std
::
shared_ptr
<
ProcessGroup
::
Task
>
ProcessGroupHeter
::
Broadcast
(
std
::
vector
<
Tensor
>&
tensors
,
const
BroadcastOptions
&
opts
)
{
#if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE_EQ
(
CheckTensorsInCudaPlace
(
tensors
),
true
,
platform
::
errors
::
InvalidArgument
(
"All inputs should be in CudaPlace."
));
#endif
// Step1: do broadcast in inner cluster
auto
b_opts
=
BroadcastOptions
();
b_opts
.
source_root
=
0
;
inner_pg_
->
Broadcast
(
tensors
,
b_opts
);
if
(
local_rank_
==
0
)
{
std
::
vector
<
Tensor
>
cpu_tensors
(
tensors
.
size
());
for
(
size_t
i
=
0
;
i
<
tensors
.
size
();
i
++
)
{
auto
dense_gpu_tensor
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
tensors
[
i
].
impl
());
auto
dense_cpu_tensor
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
cpu_tensors
[
i
].
impl
());
dense_cpu_tensor
->
Resize
(
tensors
[
i
].
dims
());
framework
::
TensorCopySync
(
*
dense_gpu_tensor
,
platform
::
CPUPlace
(),
dense_cpu_tensor
.
get
());
}
if
(
with_switch_
)
{
// TODO(sandyhouse) send to and recv
}
else
{
auto
gloo_task
=
inter_pg_
->
Broadcast
(
cpu_tensors
,
opts
);
gloo_task
->
Wait
();
}
for
(
size_t
i
=
0
;
i
<
tensors
.
size
();
i
++
)
{
auto
dense_gpu_tensor
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
tensors
[
i
].
impl
());
auto
dense_cpu_tensor
=
std
::
dynamic_pointer_cast
<
phi
::
DenseTensor
>
(
cpu_tensors
[
i
].
impl
());
// framework::TensorCopySync(*dense_cpu_tensor, tensors[i].place(),
// dense_gpu_tensor.get());
framework
::
TensorCopySync
(
*
dense_cpu_tensor
,
dense_cpu_tensor
->
place
(),
dense_gpu_tensor
.
get
());
}
}
auto
broadcast_task
=
inner_pg_
->
Broadcast
(
tensors
,
b_opts
);
broadcast_task
->
Wait
();
return
CreateTask
(
rank_
,
CommType
::
BROADCAST
,
tensors
);
}
// DenseTensor flavour of Broadcast:
//   1. broadcast `in` into `out` through inner_pg_;
//   2. on local rank 0, copy `in` to a CPU staging tensor, broadcast it
//      through inter_pg_ (the switch path is still a TODO) and copy the
//      result into `out`;
//   3. broadcast `out` through inner_pg_.
//
// Fixes over the previous version:
//   * the CPU staging tensor is backed by a real phi::DenseTensor; before,
//     it was a default-constructed Tensor whose impl() cast result was
//     dereferenced while null;
//   * the result is copied to the place of `in` rather than to the CPU
//     tensor's own place. This assumes `in` and `out` live on the same
//     device -- TODO confirm.
void ProcessGroupHeter::Broadcast(const phi::DenseTensor* in,
                                  phi::DenseTensor* out) {
  // Step1: do broadcast in inner cluster
  inner_pg_->Broadcast(in, out);

  if (local_rank_ == 0) {
    auto dense_cpu_tensor = std::make_shared<phi::DenseTensor>();
    dense_cpu_tensor->Resize(in->dims());
    framework::TensorCopySync(*in, platform::CPUPlace(),
                              dense_cpu_tensor.get());
    // Shares ownership of the staging buffer with dense_cpu_tensor.
    Tensor cpu_tensor(dense_cpu_tensor);

    if (with_switch_) {
      // TODO(sandyhouse) send to and recv
    } else {
      std::vector<Tensor> cpu_tensors = {cpu_tensor};
      // NOTE(review): the returned task is not waited on; the wait was
      // already disabled in the original code (kept below for reference).
      // Confirm the Gloo broadcast has finished when this call returns.
      // auto gloo_task = inter_pg_->Broadcast(cpu_tensors);
      // gloo_task->Wait();
      inter_pg_->Broadcast(cpu_tensors);
    }

    // Copy of the device place, taken before `out` is written.
    const auto device_place = in->place();
    framework::TensorCopySync(*dense_cpu_tensor, device_place, out);
  }

  inner_pg_->Broadcast(out, out);
}
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/ProcessGroupHeter.h
0 → 100644
浏览文件 @
92faeedf
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
// #include "paddle/fluid/distributed/ps/service/heter_client.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_GLOO
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
#endif
#include "paddle/fluid/distributed/collective/Common.h"
// Backend identifier string for the heterogeneous process group;
// ProcessGroupHeter::GetBackendName() returns a std::string copy of it.
constexpr
const
char
*
HETER_BACKEND_NAME
=
"HETER_BACKEND"
;
namespace
paddle
{
namespace
distributed
{
// Shorthand so code in paddle::distributed can write Place instead of
// paddle::platform::Place.
using
Place
=
paddle
::
platform
::
Place
;
// Process group that reports itself under the "HETER_BACKEND" backend name.
//
// It owns two nested process groups: `inner_pg_` (any ProcessGroup
// implementation) and `inter_pg_` (a Gloo group).
// NOTE(review): presumably `inner_pg_` serves the ranks inside one cluster
// and `inter_pg_` bridges clusters -- confirm against ProcessGroupHeter.cc.
class ProcessGroupHeter : public ProcessGroup {
 public:
  // Task handle returned by the collective operations of ProcessGroupHeter.
  class HeterTask : public ProcessGroup::Task,
                    public std::enable_shared_from_this<HeterTask> {
   public:
    // The communication-type parameter used to be spelled `CommType CommType`,
    // which hid the type name for the remainder of the parameter list; it is
    // now `comm_type`. Parameter names are not part of the signature, so the
    // out-of-line definition and all callers are unaffected.
    HeterTask(int rank, CommType comm_type, const std::vector<Tensor>& inputs);

    bool IsCompleted();

    // Intentionally empty in this header.
    void SynchronizeStreams() {}

    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);

    // Intentionally empty in this header.
    void Synchronize() {}

    virtual ~HeterTask();
  };

  // Defined out of line.
  // NOTE(review): the meaning of each argument is only suggested by its name
  // (global rank/size, group id, local rank/size, gloo rank/size, switch flag
  // and endpoints) -- confirm against the constructor definition.
  ProcessGroupHeter(const std::shared_ptr<Store>& store,
                    int rank,
                    int size,
                    int gid,
                    int local_rank,
                    int local_size,
                    int gloo_rank,
                    int gloo_size,
                    bool with_switch,
                    std::string switch_endpoints);

  // Returns a copy of HETER_BACKEND_NAME.
  const std::string GetBackendName() const override {
    return std::string(HETER_BACKEND_NAME);
  }

  std::shared_ptr<ProcessGroup::Task> AllReduce(
      std::vector<Tensor>& tensors,
      const AllreduceOptions& = AllreduceOptions()) override;

  std::shared_ptr<ProcessGroup::Task> Broadcast(
      std::vector<Tensor>& tensors,
      const BroadcastOptions& = BroadcastOptions()) override;

  // Dense-tensor overload of Broadcast taking raw input/output pointers.
  void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) override;

 protected:
  // Factory for HeterTask objects; virtual so subclasses can substitute their
  // own task type.
  virtual std::shared_ptr<ProcessGroupHeter::HeterTask> CreateTask(
      int rank, CommType opType, const std::vector<Tensor>& inputs);

 private:
  std::shared_ptr<Store> store_;
  std::shared_ptr<ProcessGroup> inner_pg_;
  std::shared_ptr<ProcessGroupGloo> inter_pg_;

  // Scalars carry default initialisers so they never hold indeterminate
  // values, whatever the out-of-line constructor chooses to assign.
  int local_rank_ = 0;
  int local_size_ = 0;
  int gloo_rank_ = 0;
  int gloo_size_ = 0;
  bool with_switch_ = false;
};
}
// namespace distributed
}
// namespace paddle
paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
浏览文件 @
92faeedf
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
// limitations under the License.
// limitations under the License.
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/fluid/distributed/collective/Common.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/api/include/api.h"
...
@@ -26,61 +27,6 @@ constexpr int64_t kWaitBlockTImeout = 10;
...
@@ -26,61 +27,6 @@ constexpr int64_t kWaitBlockTImeout = 10;
namespace
paddle
{
namespace
paddle
{
namespace
distributed
{
namespace
distributed
{
// Translates a ReduceOp into the corresponding ncclRedOp_t.
//
// MIN, MAX, SUM and PRODUCT are supported; any other value fails the
// PADDLE_ENFORCE_EQ check with an InvalidArgument error.
static ncclRedOp_t ToNCCLRedType(ReduceOp reduction) {
  using RedTypeTable = std::map<ReduceOp, ncclRedOp_t>;

  // Built once on first use and shared by all later calls.
  static const RedTypeTable red_type{
      {ReduceOp::SUM, ncclSum},
      {ReduceOp::PRODUCT, ncclProd},
      {ReduceOp::MIN, ncclMin},
      {ReduceOp::MAX, ncclMax},
  };

  // The names `it` and `red_type` are kept as-is: the enforce macro may embed
  // the checked expression text in its error message.
  const RedTypeTable::const_iterator it = red_type.find(reduction);
  PADDLE_ENFORCE_EQ(
      it != red_type.end(),
      true,
      platform::errors::InvalidArgument(
          "Invalid nccl reduction. Must be ncclMin | ncclMax | "
          "ncclProd | ncclSum"));
  return it->second;
}
// Renders an NCCL unique id as lowercase hexadecimal text.
//
// Each of the NCCL_UNIQUE_ID_BYTES bytes is written as exactly two hex
// digits, so the result always has 2 * NCCL_UNIQUE_ID_BYTES characters and
// two different ids can never serialize to the same string. Streaming every
// byte through std::hex without padding dropped leading zeros, which made the
// encoding ambiguous: bytes {0x01, 0x23} and {0x12, 0x03} both became "123".
std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) {
  static const char kHexDigits[] = "0123456789abcdef";

  // View the opaque id as raw bytes.
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&ncclID);

  std::string hex;
  hex.reserve(2 * NCCL_UNIQUE_ID_BYTES);
  for (int i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) {
    hex.push_back(kHexDigits[bytes[i] >> 4]);    // high nibble
    hex.push_back(kHexDigits[bytes[i] & 0x0F]);  // low nibble
  }
  return hex;
}
// Get the list of devices from list of tensors
std
::
vector
<
Place
>
GetPlaceList
(
const
std
::
vector
<
Tensor
>&
tensors
)
{
std
::
vector
<
Place
>
places
;
places
.
reserve
(
tensors
.
size
());
for
(
auto
&
tensor
:
tensors
)
{
places
.
push_back
(
tensor
.
inner_place
());
}
return
places
;
}
// Builds a comma-separated key from the streamed text of each place.
// Returns an empty string when `places` is empty.
std::string GetKeyFromPlaces(const std::vector<Place>& places) {
  std::string key;
  for (const auto& entry : places) {
    std::stringstream rendered;
    rendered << entry;
    // A separator is needed only once something has already been written.
    if (!key.empty()) {
      key += ",";
    }
    key += rendered.str();
  }
  return key;
}
bool
CheckTensorsInCudaPlace
(
const
std
::
vector
<
Tensor
>&
tensors
)
{
return
std
::
all_of
(
tensors
.
cbegin
(),
tensors
.
cend
(),
[
&
](
const
Tensor
&
t
)
{
return
t
.
place
()
==
PlaceType
::
kGPU
;
});
}
void
SyncDefaultStream
(
void
SyncDefaultStream
(
const
std
::
vector
<
Place
>&
places
,
const
std
::
vector
<
Place
>&
places
,
std
::
vector
<
EventManager
>&
ncclEvents
,
// NOLINT
std
::
vector
<
EventManager
>&
ncclEvents
,
// NOLINT
...
@@ -157,8 +103,8 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
...
@@ -157,8 +103,8 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
// Delegates to Wait() with kWaitTimeout; the bool returned by Wait() is
// discarded.
void
ProcessGroupNCCL
::
NCCLTask
::
Synchronize
()
{
Wait
(
kWaitTimeout
);
}
// Delegates to Wait() with kWaitTimeout; the bool returned by Wait() is
// discarded.
void
ProcessGroupNCCL
::
NCCLTask
::
Synchronize
()
{
Wait
(
kWaitTimeout
);
}
ProcessGroupNCCL
::
ProcessGroupNCCL
(
const
std
::
shared_ptr
<
Store
>&
store
,
ProcessGroupNCCL
::
ProcessGroupNCCL
(
const
std
::
shared_ptr
<
Store
>&
store
,
int
rank
,
int
size
)
int
rank
,
int
size
,
int
gid
)
:
ProcessGroup
(
rank
,
size
),
store_
(
store
)
{}
:
ProcessGroup
(
rank
,
size
,
gid
),
store_
(
store
)
{}
void
ProcessGroupNCCL
::
BroadcastUniqueNCCLID
(
void
ProcessGroupNCCL
::
BroadcastUniqueNCCLID
(
std
::
vector
<
ncclUniqueId
>&
nccl_ids
)
{
// NOLINT
std
::
vector
<
ncclUniqueId
>&
nccl_ids
)
{
// NOLINT
...
...
paddle/fluid/distributed/collective/ProcessGroupNCCL.h
浏览文件 @
92faeedf
...
@@ -76,7 +76,8 @@ class ProcessGroupNCCL : public ProcessGroup {
...
@@ -76,7 +76,8 @@ class ProcessGroupNCCL : public ProcessGroup {
private:
private:
};
};
ProcessGroupNCCL
(
const
std
::
shared_ptr
<
Store
>&
store
,
int
rank
,
int
size
);
ProcessGroupNCCL
(
const
std
::
shared_ptr
<
Store
>&
store
,
int
rank
,
int
size
,
int
gid
);
const
std
::
string
GetBackendName
()
const
override
{
const
std
::
string
GetBackendName
()
const
override
{
return
std
::
string
(
NCCL_BACKEND_NAME
);
return
std
::
string
(
NCCL_BACKEND_NAME
);
...
...
paddle/fluid/operators/collective/c_broadcast_op.cu.cc
浏览文件 @
92faeedf
...
@@ -18,6 +18,8 @@ limitations under the License. */
...
@@ -18,6 +18,8 @@ limitations under the License. */
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
#endif
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/phi/api/include/tensor.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -36,6 +38,13 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
...
@@ -36,6 +38,13 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
int
rid
=
ctx
.
Attr
<
int
>
(
"ring_id"
);
int
rid
=
ctx
.
Attr
<
int
>
(
"ring_id"
);
auto
place
=
ctx
.
GetPlace
();
auto
place
=
ctx
.
GetPlace
();
auto
comm
=
platform
::
NCCLCommContext
::
Instance
().
Get
(
rid
,
place
);
auto
comm
=
platform
::
NCCLCommContext
::
Instance
().
Get
(
rid
,
place
);
auto
map
=
distributed
::
ProcessGroupMapFromGid
::
getInstance
();
if
(
map
->
has
(
rid
))
{
// Use ProcessGroup
distributed
::
ProcessGroup
*
pg
=
map
->
get
(
rid
);
pg
->
Broadcast
(
x
,
out
);
return
;
}
gpuStream_t
stream
=
nullptr
;
gpuStream_t
stream
=
nullptr
;
if
(
ctx
.
Attr
<
bool
>
(
"use_calc_stream"
))
{
if
(
ctx
.
Attr
<
bool
>
(
"use_calc_stream"
))
{
...
...
paddle/fluid/pybind/distributed_py.cc
浏览文件 @
92faeedf
...
@@ -213,16 +213,20 @@ void BindDistributed(py::module *m) {
...
@@ -213,16 +213,20 @@ void BindDistributed(py::module *m) {
py
::
class_
<
distributed
::
ProcessGroupNCCL
,
py
::
class_
<
distributed
::
ProcessGroupNCCL
,
std
::
shared_ptr
<
distributed
::
ProcessGroupNCCL
>>
(
std
::
shared_ptr
<
distributed
::
ProcessGroupNCCL
>>
(
*
m
,
"ProcessGroupNCCL"
,
ProcessGroup
)
*
m
,
"ProcessGroupNCCL"
,
ProcessGroup
)
.
def
(
py
::
init
<
const
std
::
shared_ptr
<
distributed
::
Store
>
&
,
int
,
int
>
(),
.
def
(
py
::
init
<
const
std
::
shared_ptr
<
distributed
::
Store
>
&
,
int
,
int
,
py
::
call_guard
<
py
::
gil_scoped_release
>
());
int
>
(),
py
::
arg
(
"store"
),
py
::
arg
(
"rank"
),
py
::
arg
(
"world_size"
),
py
::
arg
(
"group_id"
)
=
0
,
py
::
call_guard
<
py
::
gil_scoped_release
>
());
#endif
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#if defined(PADDLE_WITH_ASCEND_CL)
py
::
class_
<
distributed
::
ProcessGroupHCCL
,
py
::
class_
<
distributed
::
ProcessGroupHCCL
,
std
::
shared_ptr
<
distributed
::
ProcessGroupHCCL
>>
(
std
::
shared_ptr
<
distributed
::
ProcessGroupHCCL
>>
(
*
m
,
"ProcessGroupHCCL"
,
ProcessGroup
)
*
m
,
"ProcessGroupHCCL"
,
ProcessGroup
)
.
def
(
py
::
init
<
const
std
::
shared_ptr
<
distributed
::
Store
>
&
,
int
,
int
>
(),
.
def
(
py
::
init
<
const
std
::
shared_ptr
<
distributed
::
Store
>
&
,
int
,
int
,
py
::
call_guard
<
py
::
gil_scoped_release
>
());
int
>
(),
py
::
arg
(
"store"
),
py
::
arg
(
"rank"
),
py
::
arg
(
"world_size"
),
py
::
arg
(
"group_id"
)
=
0
,
py
::
call_guard
<
py
::
gil_scoped_release
>
());
#endif
#endif
py
::
class_
<
distributed
::
ProcessGroup
::
Task
,
py
::
class_
<
distributed
::
ProcessGroup
::
Task
,
...
@@ -238,10 +242,10 @@ void BindDistributed(py::module *m) {
...
@@ -238,10 +242,10 @@ void BindDistributed(py::module *m) {
py
::
class_
<
ProcessGroupGloo
,
std
::
shared_ptr
<
ProcessGroupGloo
>>
(
py
::
class_
<
ProcessGroupGloo
,
std
::
shared_ptr
<
ProcessGroupGloo
>>
(
*
m
,
"ProcessGroupGloo"
,
ProcessGroup
)
*
m
,
"ProcessGroupGloo"
,
ProcessGroup
)
.
def
(
py
::
init
<
const
std
::
shared_ptr
<
paddle
::
distributed
::
Store
>
&
,
int
,
.
def
(
py
::
init
<
const
std
::
shared_ptr
<
paddle
::
distributed
::
Store
>
&
,
int
,
int
,
std
::
shared_ptr
<
GlooOptions
>
&>
(),
int
,
int
,
std
::
shared_ptr
<
GlooOptions
>
&>
(),
py
::
call_guard
<
py
::
gil_scoped_release
>
())
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
py
::
init
([](
const
std
::
shared_ptr
<
paddle
::
distributed
::
Store
>
&
store
,
.
def
(
py
::
init
([](
const
std
::
shared_ptr
<
paddle
::
distributed
::
Store
>
&
store
,
int
rank
,
int
world_size
)
{
int
rank
,
int
world_size
,
int
gid
)
{
auto
opts
=
GlooOptions
::
create
();
auto
opts
=
GlooOptions
::
create
();
char
*
ifname
=
getenv
(
GLOO_SOCKET_IFNAME_ENV
.
c_str
());
char
*
ifname
=
getenv
(
GLOO_SOCKET_IFNAME_ENV
.
c_str
());
if
(
ifname
&&
strlen
(
ifname
)
>
1
)
{
if
(
ifname
&&
strlen
(
ifname
)
>
1
)
{
...
@@ -251,10 +255,10 @@ void BindDistributed(py::module *m) {
...
@@ -251,10 +255,10 @@ void BindDistributed(py::module *m) {
opts
->
device
=
ProcessGroupGloo
::
createDefaultDevice
();
opts
->
device
=
ProcessGroupGloo
::
createDefaultDevice
();
}
}
return
std
::
make_shared
<
ProcessGroupGloo
>
(
store
,
rank
,
world_size
,
return
std
::
make_shared
<
ProcessGroupGloo
>
(
store
,
rank
,
world_size
,
opts
);
gid
,
opts
);
}),
}),
py
::
arg
(
"store"
),
py
::
arg
(
"rank"
),
py
::
arg
(
"world_size"
),
py
::
arg
(
"store"
),
py
::
arg
(
"rank"
),
py
::
arg
(
"world_size"
),
py
::
call_guard
<
py
::
gil_scoped_release
>
())
py
::
arg
(
"group_id"
)
=
0
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def_static
(
"create_default_device"
,
.
def_static
(
"create_default_device"
,
&
ProcessGroupGloo
::
createDefaultDevice
);
&
ProcessGroupGloo
::
createDefaultDevice
);
#endif
#endif
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录