Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
03e99e26
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
03e99e26
编写于
2月 28, 2021
作者:
W
WangXi
提交者:
sandyhouse
3月 22, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update
上级
7ab01c28
变更
3
显示空白变更内容
内联
并排
Showing
3 changed file
with
61 addition
and
10 deletion
+61
-10
paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
+3
-3
paddle/fluid/operators/collective/gen_nccl_id_op_helper.cc
paddle/fluid/operators/collective/gen_nccl_id_op_helper.cc
+33
-2
paddle/fluid/operators/collective/gen_nccl_id_op_helper.h
paddle/fluid/operators/collective/gen_nccl_id_op_helper.h
+25
-5
未找到文件。
paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
浏览文件 @
03e99e26
...
@@ -55,7 +55,8 @@ class CGenNCCLIdOp : public framework::OperatorBase {
...
@@ -55,7 +55,8 @@ class CGenNCCLIdOp : public framework::OperatorBase {
SendBroadCastNCCLID
(
endpoint_list
,
1
,
func
,
local_scope
);
SendBroadCastNCCLID
(
endpoint_list
,
1
,
func
,
local_scope
);
}
else
{
}
else
{
std
::
string
endpoint
=
Attr
<
std
::
string
>
(
"endpoint"
);
std
::
string
endpoint
=
Attr
<
std
::
string
>
(
"endpoint"
);
RecvBroadCastNCCLID
(
endpoint
,
1
,
func
,
local_scope
);
int
server_fd
=
platform
::
SocketServer
::
GetInstance
(
endpoint
).
socket
();
platform
::
RecvBroadCastCommID
(
server_fd
,
endpoint
,
&
nccl_ids
);
}
}
scope
.
DeleteScope
(
&
local_scope
);
scope
.
DeleteScope
(
&
local_scope
);
}
}
...
@@ -71,8 +72,7 @@ class CGenNCCLIdOp : public framework::OperatorBase {
...
@@ -71,8 +72,7 @@ class CGenNCCLIdOp : public framework::OperatorBase {
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{}
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
RunImpl
(
const
framework
::
Scope
&
scope
,
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
const
platform
::
Place
&
dev_place
)
const
override
{}
}
};
};
#endif
#endif
...
...
paddle/fluid/operators/collective/gen_nccl_id_op_helper.cc
浏览文件 @
03e99e26
...
@@ -31,7 +31,9 @@ limitations under the License. */
...
@@ -31,7 +31,9 @@ limitations under the License. */
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/string/split.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
platform
{
std
::
once_flag
SocketServer
::
init_flag_
;
constexpr
char
COMM_HEAD
[]
=
"_pd_gen_comm_id_"
;
constexpr
char
COMM_HEAD
[]
=
"_pd_gen_comm_id_"
;
...
@@ -340,5 +342,34 @@ void RecvBroadCastNCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
...
@@ -340,5 +342,34 @@ void RecvBroadCastNCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
CloseSocket
(
client
);
CloseSocket
(
client
);
}
}
}
// namespace operators
/// Returns the shared SocketServer, creating its listening socket the first
/// time the function is called.
///
/// @param end_point  Endpoint passed to CreateListenSocket on the first call;
///                   every later call must pass the same value.
/// @return Reference to the function-local static instance.
///
/// Enforces (via PADDLE_ENFORCE_*) that the stored descriptor is not -1 and
/// that `end_point` equals the endpoint recorded by the first call.
SocketServer& SocketServer::GetInstance(const std::string& end_point) {
  static SocketServer server;

  // Runs at most once per process (guarded by init_flag_): open the listen
  // socket first, then remember which endpoint it belongs to.
  const auto open_listen_socket = [&end_point]() {
    server.server_fd_ = CreateListenSocket(end_point);
    server.end_point_ = end_point;
  };
  std::call_once(init_flag_, open_listen_socket);

  PADDLE_ENFORCE_NE(server.server_fd_, -1,
                    platform::errors::Unavailable(
                        "listen socket failed with end_point=%s", end_point));
  PADDLE_ENFORCE_EQ(server.end_point_, end_point,
                    platform::errors::InvalidArgument(
                        "old end_point=%s must equal with new end_point=%s",
                        server.end_point_, end_point));
  return server;
}
/// Explicit template instantiation.
///
/// INSTANT_TEMPLATE(Type) expands to explicit instantiations of
/// SendBroadCastCommID<Type> and RecvBroadCastCommID<Type>. It is invoked for
/// ncclUniqueId when PADDLE_WITH_NCCL or PADDLE_WITH_RCCL is defined, and for
/// BKCLUniqueId when PADDLE_WITH_XPU_BKCL is defined.
///
/// NOTE(review): the RecvBroadCastCommID instantiated here takes
/// (std::string endpoint, std::vector<Type>*), while the header declaration
/// visible in this change also has a leading `int server_fd` parameter --
/// confirm that an overload without server_fd exists, or update the macro.
#define INSTANT_TEMPLATE(Type) \
template void SendBroadCastCommID<Type>(std::vector<std::string> servers, \
std::vector<Type> * nccl_ids); \
template void RecvBroadCastCommID<Type>(std::string endpoint, \
std::vector<Type> * nccl_ids);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
INSTANT_TEMPLATE
(
ncclUniqueId
)
#endif
#ifdef PADDLE_WITH_XPU_BKCL
INSTANT_TEMPLATE
(
BKCLUniqueId
)
#endif
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
paddle/fluid/operators/collective/gen_nccl_id_op_helper.h
浏览文件 @
03e99e26
...
@@ -15,6 +15,8 @@ limitations under the License. */
...
@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#pragma once
#include <functional>
#include <functional>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <string>
#include <vector>
#include <vector>
...
@@ -25,7 +27,7 @@ class Scope;
...
@@ -25,7 +27,7 @@ class Scope;
}
// namespace paddle
}
// namespace paddle
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
platform
{
int
CreateListenSocket
(
const
std
::
string
&
ep
);
int
CreateListenSocket
(
const
std
::
string
&
ep
);
...
@@ -41,8 +43,26 @@ void RecvBroadCastNCCLID(std::string endpoint, int nccl_comm_num,
...
@@ -41,8 +43,26 @@ void RecvBroadCastNCCLID(std::string endpoint, int nccl_comm_num,
const
framework
::
Scope
&
scope
);
const
framework
::
Scope
&
scope
);
// recv nccl id from socket
// recv nccl id from socket
void
RecvBroadCastNCCLID
(
int
server_fd
,
std
::
string
endpoint
,
int
nccl_comm_num
,
template
<
typename
CommUniqueId
>
std
::
function
<
std
::
string
(
size_t
)
>
func
,
void
RecvBroadCastCommID
(
int
server_fd
,
std
::
string
endpoint
,
const
framework
::
Scope
&
scope
);
std
::
vector
<
CommUniqueId
>*
nccl_ids
);
}
// namespace operators
/// Owner of a listening socket descriptor and the endpoint it was opened for.
///
/// The shared instance is obtained through GetInstance(). The destructor
/// closes the descriptor, so the object must not be copied: a copy would
/// close the same descriptor a second time.
class SocketServer {
 public:
  SocketServer() = default;

  // Copying would duplicate ownership of server_fd_ and lead to a double
  // close, so both copy operations are disabled.
  SocketServer(const SocketServer&) = delete;
  SocketServer& operator=(const SocketServer&) = delete;

  /// Closes the listening socket if one was ever stored.
  ~SocketServer() {
    // server_fd_ keeps its -1 initializer until a socket is assigned; there
    // is nothing to close in that case.
    if (server_fd_ != -1) {
      CloseSocket(server_fd_);
    }
  }

  /// @return The stored socket descriptor, or -1 if none has been set.
  int socket() const { return server_fd_; }

  /// @param end_point  Endpoint the shared server is expected to listen on.
  /// @return Reference to the shared SocketServer instance.
  static SocketServer& GetInstance(const std::string& end_point);

 private:
  int server_fd_{-1};      // -1 means "no socket"
  std::string end_point_;  // endpoint associated with server_fd_
  static std::once_flag init_flag_;
};
}
// namespace platform
}
// namespace paddle
}
// namespace paddle
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录