Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
1edf4374
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
1edf4374
编写于
7月 13, 2021
作者:
L
LiuWei
提交者:
GitHub
7月 13, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
change hccl_helper as commid helper (#34118)
上级
348d043e
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
53 addition
and
11 deletion
+53
-11
paddle/fluid/operators/collective/c_gen_hccl_id_op.cc
paddle/fluid/operators/collective/c_gen_hccl_id_op.cc
+34
-6
paddle/fluid/platform/gen_comm_id_helper.cc
paddle/fluid/platform/gen_comm_id_helper.cc
+18
-4
paddle/fluid/platform/gen_comm_id_helper.h
paddle/fluid/platform/gen_comm_id_helper.h
+1
-1
未找到文件。
paddle/fluid/operators/collective/c_gen_hccl_id_op.cc
浏览文件 @
1edf4374
...
@@ -23,15 +23,35 @@ limitations under the License. */
...
@@ -23,15 +23,35 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/dynload/hccl.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
#ifdef PADDLE_WITH_ASCEND_CL
#ifdef PADDLE_WITH_ASCEND_CL
static
void
GenHCCLID
(
std
::
vector
<
HcclRootInfo
>*
hccl_ids
)
{
for
(
size_t
i
=
0
;
i
<
hccl_ids
->
size
();
++
i
)
{
PADDLE_ENFORCE_NPU_SUCCESS
(
platform
::
dynload
::
HcclGetRootInfo
(
&
(
*
hccl_ids
)[
i
]));
}
}
// Writes each entry of `hccl_ids` into a variable found in `scope`.
//
// hccl_ids: source ids; entry k is copied to the k-th destination variable.
// func:     maps an index k to the name of the destination variable.
// scope:    scope searched (via FindVar) for each destination variable.
//
// A missing variable trips PADDLE_ENFORCE_NOT_NULL with a NotFound error that
// carries the variable name. The copy itself is a raw byte copy of
// sizeof(HcclRootInfo) bytes into the variable's mutable HcclRootInfo storage.
static void CopyHCCLIDToVar(const std::vector<HcclRootInfo>& hccl_ids,
                            std::function<std::string(size_t)> func,
                            const framework::Scope& scope) {
  const size_t id_count = hccl_ids.size();
  for (size_t idx = 0; idx < id_count; ++idx) {
    std::string var_name = func(idx);
    auto var = scope.FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(
        var, platform::errors::NotFound("Variable with name %s is not found",
                                        var_name.c_str()));
    const HcclRootInfo& src = hccl_ids[idx];
    auto dst = var->GetMutable<HcclRootInfo>();
    memcpy(dst, &src, sizeof(HcclRootInfo));
  }
}
class
CGenHCCLIdOp
:
public
framework
::
OperatorBase
{
class
CGenHCCLIdOp
:
public
framework
::
OperatorBase
{
public:
public:
CGenHCCLIdOp
(
const
std
::
string
&
type
,
CGenHCCLIdOp
(
const
std
::
string
&
type
,
...
@@ -49,14 +69,22 @@ class CGenHCCLIdOp : public framework::OperatorBase {
...
@@ -49,14 +69,22 @@ class CGenHCCLIdOp : public framework::OperatorBase {
return
Output
(
"Out"
);
return
Output
(
"Out"
);
};
};
std
::
string
endpoint
=
Attr
<
std
::
string
>
(
"endpoint"
);
int
server_fd
=
platform
::
SocketServer
::
GetInstance
(
endpoint
).
socket
();
std
::
vector
<
HcclRootInfo
>
hccl_ids
;
hccl_ids
.
resize
(
1
);
if
(
rank
==
0
)
{
if
(
rank
==
0
)
{
GenHCCLID
(
&
hccl_ids
);
std
::
vector
<
std
::
string
>
endpoint_list
=
std
::
vector
<
std
::
string
>
endpoint_list
=
Attr
<
std
::
vector
<
std
::
string
>>
(
"other_endpoints"
);
Attr
<
std
::
vector
<
std
::
string
>>
(
"other_endpoints"
);
SendBroadCastHCCLID
(
endpoint_list
,
1
,
func
,
local_scope
);
platform
::
SendBroadCastCommID
(
endpoint_list
,
&
hccl_ids
);
}
else
{
}
else
{
std
::
string
endpoint
=
Attr
<
std
::
string
>
(
"endpoint"
);
platform
::
RecvBroadCastCommID
(
server_fd
,
endpoint
,
&
hccl_ids
);
RecvBroadCastHCCLID
(
endpoint
,
1
,
func
,
local_scope
);
}
}
CopyHCCLIDToVar
(
hccl_ids
,
func
,
scope
);
scope
.
DeleteScope
(
&
local_scope
);
scope
.
DeleteScope
(
&
local_scope
);
}
}
};
};
...
...
paddle/fluid/platform/gen_comm_id_helper.cc
浏览文件 @
1edf4374
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
defined(PADDLE_WITH_XPU_BKCL)
|| defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include <arpa/inet.h>
#include <arpa/inet.h>
...
@@ -33,6 +33,10 @@ limitations under the License. */
...
@@ -33,6 +33,10 @@ limitations under the License. */
#include "xpu/bkcl.h"
#include "xpu/bkcl.h"
#endif
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace
paddle
{
namespace
paddle
{
namespace
platform
{
namespace
platform
{
...
@@ -262,10 +266,17 @@ static int ConnectAddr(const std::string& ep, const char* head) {
...
@@ -262,10 +266,17 @@ static int ConnectAddr(const std::string& ep, const char* head) {
return
sock
;
return
sock
;
}
}
// TODO(WANGXI): maybe need to unify this hard code
// MAX_COMMUNIQUEID_LEN is the size, in bytes, of the local char buffers used
// by RecvCommID/SendCommID below, and the upper bound that RecvCommID's
// static_assert places on sizeof(CommUniqueId).
// NOTE(review): 4108 is presumably sized to hold HcclRootInfo on Ascend
// builds - confirm against the HCCL headers.
#ifdef PADDLE_WITH_ASCEND_CL
#define MAX_COMMUNIQUEID_LEN 4108
#else
#define MAX_COMMUNIQUEID_LEN 1024
#endif
template
<
typename
CommUniqueId
>
template
<
typename
CommUniqueId
>
static
void
RecvCommID
(
int
conn
,
CommUniqueId
*
nccl_id
)
{
static
void
RecvCommID
(
int
conn
,
CommUniqueId
*
nccl_id
)
{
char
buffer
[
1024
]
=
{
0
};
char
buffer
[
MAX_COMMUNIQUEID_LEN
]
=
{
0
};
static_assert
(
sizeof
(
CommUniqueId
)
<=
1024
,
static_assert
(
sizeof
(
CommUniqueId
)
<=
MAX_COMMUNIQUEID_LEN
,
"nccl id bytes must <= buffer size"
);
"nccl id bytes must <= buffer size"
);
CHECK_SYS_CALL
(
SocketRecv
(
conn
,
buffer
,
sizeof
(
CommUniqueId
)),
CHECK_SYS_CALL
(
SocketRecv
(
conn
,
buffer
,
sizeof
(
CommUniqueId
)),
...
@@ -275,7 +286,7 @@ static void RecvCommID(int conn, CommUniqueId* nccl_id) {
...
@@ -275,7 +286,7 @@ static void RecvCommID(int conn, CommUniqueId* nccl_id) {
template
<
typename
CommUniqueId
>
template
<
typename
CommUniqueId
>
static
void
SendCommID
(
int
conn
,
CommUniqueId
*
nccl_id
)
{
static
void
SendCommID
(
int
conn
,
CommUniqueId
*
nccl_id
)
{
char
buffer
[
1024
]
=
{
0
};
char
buffer
[
MAX_COMMUNIQUEID_LEN
]
=
{
0
};
memcpy
(
buffer
,
nccl_id
,
sizeof
(
CommUniqueId
));
memcpy
(
buffer
,
nccl_id
,
sizeof
(
CommUniqueId
));
CHECK_SYS_CALL
(
SocketSend
(
conn
,
buffer
,
sizeof
(
CommUniqueId
)),
CHECK_SYS_CALL
(
SocketSend
(
conn
,
buffer
,
sizeof
(
CommUniqueId
)),
...
@@ -361,6 +372,9 @@ INSTANT_TEMPLATE(ncclUniqueId)
...
@@ -361,6 +372,9 @@ INSTANT_TEMPLATE(ncclUniqueId)
#ifdef PADDLE_WITH_XPU_BKCL
#ifdef PADDLE_WITH_XPU_BKCL
INSTANT_TEMPLATE
(
BKCLUniqueId
)
INSTANT_TEMPLATE
(
BKCLUniqueId
)
#endif
#endif
#ifdef PADDLE_WITH_ASCEND_CL
INSTANT_TEMPLATE
(
HcclRootInfo
)
#endif
}
// namespace platform
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
...
...
paddle/fluid/platform/gen_comm_id_helper.h
浏览文件 @
1edf4374
...
@@ -15,7 +15,7 @@ limitations under the License. */
...
@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#pragma once
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
defined(PADDLE_WITH_XPU_BKCL)
|| defined(PADDLE_WITH_ASCEND_CL)
#include <functional>
#include <functional>
#include <memory>
#include <memory>
#include <mutex>
#include <mutex>
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录