Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
a5b32637
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a5b32637
编写于
9月 25, 2020
作者:
L
Leo Chen
提交者:
GitHub
9月 25, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refine error msg in paddle/fluid/imperative (#27521)
* refine err msg * follow comments
上级
09f19532
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
58 addition
and
26 deletion
+58
-26
paddle/fluid/imperative/gradient_accumulator.cc
paddle/fluid/imperative/gradient_accumulator.cc
+9
-3
paddle/fluid/imperative/jit/program_desc_tracer.cc
paddle/fluid/imperative/jit/program_desc_tracer.cc
+9
-4
paddle/fluid/imperative/nccl_context.cc
paddle/fluid/imperative/nccl_context.cc
+40
-19
未找到文件。
paddle/fluid/imperative/gradient_accumulator.cc
浏览文件 @
a5b32637
...
...
@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include <algorithm>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
...
...
@@ -136,9 +138,13 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
return
;
}
PADDLE_ENFORCE_EQ
(
dst_tensor
->
numel
()
==
numel
,
true
,
"dst_numel %d vs. src_numel %d"
,
dst_tensor
->
numel
(),
numel
);
PADDLE_ENFORCE_EQ
(
dst_tensor
->
numel
(),
numel
,
platform
::
errors
::
PreconditionNotMet
(
"The number of elements of source tensor and destination tensor "
"should be equal, but got the number of elements of source tensor is "
"%zu and the number of elements of destination tensor is %zu."
,
numel
,
dst_tensor
->
numel
()));
auto
data_type
=
src_tensor
.
type
();
auto
place
=
src_tensor
.
place
();
...
...
paddle/fluid/imperative/jit/program_desc_tracer.cc
浏览文件 @
a5b32637
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/imperative/jit/program_desc_tracer.h"
#include <unordered_map>
#include <unordered_set>
...
...
@@ -203,7 +204,8 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc(
void
ProgramDescTracer
::
InsertVarIfNotExist
(
const
std
::
shared_ptr
<
VarBase
>
&
new_var
,
bool
is_input
)
{
PADDLE_ENFORCE_NOT_NULL
(
new_var
);
PADDLE_ENFORCE_NOT_NULL
(
new_var
,
platform
::
errors
::
InvalidArgument
(
"The variable to insert is NULL."
));
if
(
vars_
.
count
(
new_var
)
!=
0
)
return
;
auto
new_var_desc
=
new
framework
::
VarDesc
(
""
);
...
...
@@ -220,7 +222,9 @@ void ProgramDescTracer::InsertVarIfNotExist(
}
const
auto
&
inner_var
=
new_var
->
Var
();
PADDLE_ENFORCE_EQ
(
inner_var
.
IsInitialized
(),
true
);
PADDLE_ENFORCE_EQ
(
inner_var
.
IsInitialized
(),
true
,
platform
::
errors
::
InvalidArgument
(
"The variable to insert is not initialized."
));
if
(
inner_var
.
IsType
<
framework
::
LoDTensor
>
())
{
const
auto
&
tensor
=
inner_var
.
Get
<
framework
::
LoDTensor
>
();
new_var_desc
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
...
...
@@ -232,8 +236,9 @@ void ProgramDescTracer::InsertVarIfNotExist(
new_var_desc
->
SetDataType
(
framework
::
proto
::
VarType
::
FP32
);
}
}
else
{
PADDLE_THROW
(
"Not support variable type %s"
,
framework
::
ToTypeName
(
inner_var
.
Type
()));
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Not support variable type %s."
,
framework
::
ToTypeName
(
inner_var
.
Type
())));
}
}
...
...
paddle/fluid/imperative/nccl_context.cc
浏览文件 @
a5b32637
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/imperative/nccl_context.h"
#include "paddle/fluid/platform/collective_helper.h"
namespace
paddle
{
...
...
@@ -21,8 +22,10 @@ namespace imperative {
void
NCCLParallelContext
::
RecvNCCLID
(
const
std
::
string
&
ep
,
ncclUniqueId
*
nccl_id
)
{
auto
addr
=
paddle
::
string
::
Split
(
ep
,
':'
);
PADDLE_ENFORCE_EQ
(
addr
.
size
(),
2UL
,
"The endpoint should contain host and port: %s"
,
ep
);
PADDLE_ENFORCE_EQ
(
addr
.
size
(),
2UL
,
platform
::
errors
::
InvalidArgument
(
"The endpoint should contain host and port, but got %s."
,
ep
));
std
::
string
host
=
addr
[
0
];
int
port
=
std
::
stoi
(
addr
[
1
]);
...
...
@@ -32,27 +35,41 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
char
buffer
[
1024
]
=
{
0
};
int
opt
=
0
;
// creating socket fd
if
((
server_fd
=
socket
(
AF_INET
,
SOCK_STREAM
,
0
))
==
0
)
PADDLE_THROW
(
"create server fd failed"
);
if
(
setsockopt
(
server_fd
,
SOL_SOCKET
,
SO_REUSEADDR
,
&
opt
,
sizeof
(
opt
)))
PADDLE_THROW
(
"set socket opt failed"
);
if
((
server_fd
=
socket
(
AF_INET
,
SOCK_STREAM
,
0
))
==
0
)
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Create server file descriptor failed."
));
}
if
(
setsockopt
(
server_fd
,
SOL_SOCKET
,
SO_REUSEADDR
,
&
opt
,
sizeof
(
opt
)))
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Set socket options failed."
));
}
address
.
sin_family
=
AF_INET
;
address
.
sin_addr
.
s_addr
=
INADDR_ANY
;
address
.
sin_port
=
htons
(
port
);
if
(
bind
(
server_fd
,
(
struct
sockaddr
*
)
&
address
,
sizeof
(
address
))
<
0
)
PADDLE_THROW
(
"binding failed on ep: %s"
,
ep
);
if
(
bind
(
server_fd
,
(
struct
sockaddr
*
)
&
address
,
sizeof
(
address
))
<
0
)
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Bind on endpoint %s failed."
,
ep
));
}
VLOG
(
3
)
<<
"listening on: "
<<
ep
;
if
(
listen
(
server_fd
,
3
)
<
0
)
PADDLE_THROW
(
"listen on server fd failed"
);
if
(
listen
(
server_fd
,
3
)
<
0
)
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Listen on server file descriptor failed."
));
}
if
((
new_socket
=
accept
(
server_fd
,
reinterpret_cast
<
struct
sockaddr
*>
(
&
address
),
reinterpret_cast
<
socklen_t
*>
(
&
addrlen
)))
<
0
)
PADDLE_THROW
(
"accept the new socket fd failed"
);
reinterpret_cast
<
socklen_t
*>
(
&
addrlen
)))
<
0
)
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Accept the new socket file descriptor failed."
));
}
if
(
read
(
new_socket
,
buffer
,
1024
)
<
0
)
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Read from socket failed."
));
}
if
(
read
(
new_socket
,
buffer
,
1024
)
<
0
)
PADDLE_THROW
(
"reading the ncclUniqueId from socket failed"
);
VLOG
(
3
)
<<
"recevived the ncclUniqueId"
;
memcpy
(
nccl_id
,
buffer
,
NCCL_UNIQUE_ID_BYTES
);
...
...
@@ -63,8 +80,10 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
void
NCCLParallelContext
::
SendNCCLID
(
const
std
::
string
&
ep
,
ncclUniqueId
*
nccl_id
)
{
auto
addr
=
paddle
::
string
::
Split
(
ep
,
':'
);
PADDLE_ENFORCE_EQ
(
addr
.
size
(),
2UL
,
"The endpoint should contain host and port: %s"
,
ep
);
PADDLE_ENFORCE_EQ
(
addr
.
size
(),
2UL
,
platform
::
errors
::
InvalidArgument
(
"The endpoint should contain host and port, but got %s."
,
ep
));
std
::
string
host
=
addr
[
0
];
int
port
=
std
::
stoi
(
addr
[
1
]);
// struct sockaddr_in address;
...
...
@@ -73,15 +92,17 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
char
buffer
[
1024
]
=
{
0
};
memcpy
(
buffer
,
nccl_id
,
NCCL_UNIQUE_ID_BYTES
);
if
((
sock
=
socket
(
AF_INET
,
SOCK_STREAM
,
0
))
<
0
)
PADDLE_THROW
(
"create socket failed"
);
if
((
sock
=
socket
(
AF_INET
,
SOCK_STREAM
,
0
))
<
0
)
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Create socket failed."
));
}
memset
(
&
serv_addr
,
'0'
,
sizeof
(
serv_addr
));
serv_addr
.
sin_family
=
AF_INET
;
serv_addr
.
sin_port
=
htons
(
port
);
if
(
inet_pton
(
AF_INET
,
host
.
c_str
(),
&
serv_addr
.
sin_addr
)
<=
0
)
PADDLE_THROW
(
"invalied address: %s"
,
ep
);
if
(
inet_pton
(
AF_INET
,
host
.
c_str
(),
&
serv_addr
.
sin_addr
)
<=
0
)
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Open address %s failed."
,
ep
));
}
int
try_times
=
0
;
while
(
true
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录