Commit b4c3a6aa (unverified)
Authored Apr 04, 2019 by Yan Xu; committed by GitHub on Apr 04, 2019

[Imperative] implement imperative NCCLParallelContext (#16477)

add NCCLParallelContext for parallel dygraph

Parent: bb143052
Showing 12 changed files with 383 additions and 5 deletions (+383 −5):
paddle/fluid/imperative/CMakeLists.txt        +3   −0
paddle/fluid/imperative/nccl_context.cc       +134 −0
paddle/fluid/imperative/nccl_context.h        +81  −0
paddle/fluid/imperative/nccl_context_test.cc  +52  −0
paddle/fluid/imperative/tracer.cc             +2   −1
paddle/fluid/pybind/CMakeLists.txt            +1   −1
paddle/fluid/pybind/imperative.cc             +42  −1
paddle/fluid/pybind/imperative.h              +2   −1
paddle/fluid/pybind/pybind.cc                 +1   −1
python/paddle/distributed/launch.py           +1   −0
python/paddle/fluid/dygraph/__init__.py       +4   −0
python/paddle/fluid/dygraph/parallel.py       +60  −0
paddle/fluid/imperative/CMakeLists.txt
@@ -3,4 +3,7 @@ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybi
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
 cc_library(imperative_profiler SRCS profiler.cc)
+cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
 endif()
paddle/fluid/imperative/nccl_context.cc
new file mode 100644

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/imperative/nccl_context.h"

namespace paddle {
namespace imperative {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void NCCLParallelContext::RecvNCCLID(const std::string &ep,
                                     ncclUniqueId *nccl_id) {
  auto addr = paddle::string::Split(ep, ':');
  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
                    "The endpoint should contain host and port: %s", ep);
  std::string host = addr[0];
  int port = std::stoi(addr[1]);

  int server_fd, new_socket;
  struct sockaddr_in address;
  int addrlen = sizeof(address);
  char buffer[1024] = {0};
  int opt = 0;
  // creating socket fd (socket() returns -1 on failure)
  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0)
    PADDLE_THROW("create server fd failed");
  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt,
                 sizeof(opt)))
    PADDLE_THROW("set socket opt failed");

  address.sin_family = AF_INET;
  address.sin_addr.s_addr = INADDR_ANY;
  address.sin_port = htons(port);

  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0)
    PADDLE_THROW("binding failed on ep: %s", ep);
  VLOG(3) << "listening on: " << ep;
  if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed");

  if ((new_socket =
           accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
                  reinterpret_cast<socklen_t *>(&addrlen))) < 0)
    PADDLE_THROW("accept the new socket fd failed");

  if (read(new_socket, buffer, 1024) < 0)
    PADDLE_THROW("reading the ncclUniqueId from socket failed");
  VLOG(3) << "received the ncclUniqueId";
  memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);

  VLOG(3) << "closing the socket server: " << ep;
  close(server_fd);
}

void NCCLParallelContext::SendNCCLID(const std::string &ep,
                                     ncclUniqueId *nccl_id) {
  auto addr = paddle::string::Split(ep, ':');
  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
                    "The endpoint should contain host and port: %s", ep);
  std::string host = addr[0];
  int port = std::stoi(addr[1]);
  int sock = 0;
  struct sockaddr_in serv_addr;
  char buffer[1024] = {0};

  memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
  if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
    PADDLE_THROW("create socket failed");

  memset(&serv_addr, 0, sizeof(serv_addr));
  serv_addr.sin_family = AF_INET;
  serv_addr.sin_port = htons(port);

  if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
    PADDLE_THROW("invalid address: %s", ep);

  while (true) {
    if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
      VLOG(0) << "worker: " << ep
              << " is not ready, will retry after 3 seconds...";
      std::this_thread::sleep_for(std::chrono::seconds(3));
      continue;
    }
    VLOG(3) << "sending the ncclUniqueId to " << ep;
    send(sock, buffer, NCCL_UNIQUE_ID_BYTES, 0);
    break;
  }
}

void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) {
  if (strategy_.local_rank_ == root) {
    // the root rank pushes the id to every other trainer endpoint
    for (auto ep : strategy_.trainer_endpoints_) {
      if (ep != strategy_.current_endpoint_) SendNCCLID(ep, nccl_id);
    }
  } else {
    // non-root ranks wait for the id on their own endpoint
    RecvNCCLID(strategy_.current_endpoint_, nccl_id);
  }
}

void NCCLParallelContext::Init() {
  ncclUniqueId nccl_id;
  ncclComm_t comm;
  if (strategy_.local_rank_ == 0) {
    // generate the unique ncclid on the root worker
    platform::dynload::ncclGetUniqueId(&nccl_id);
    BcastNCCLId(&nccl_id, 0);
  } else {
    BcastNCCLId(&nccl_id, 0);
  }
  int gpu_id = boost::get<platform::CUDAPlace>(place_).device;
  VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
          << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id;

  PADDLE_ENFORCE(cudaSetDevice(gpu_id));
  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
      &comm, strategy_.nranks_, nccl_id, strategy_.local_rank_));

  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(pool.Get(place_));
  dev_ctx->set_nccl_comm(comm);
}
#endif

}  // namespace imperative
}  // namespace paddle
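The two socket helpers above form a bare-bones TCP rendezvous: in BcastNCCLId, the root rank connects out to every other trainer endpoint and pushes the raw ncclUniqueId bytes, while each non-root rank listens on its own current_endpoint_ and blocks in RecvNCCLID until the id arrives. For illustration only (not part of this commit), a minimal Python sketch of the same handshake, assuming the host:port endpoint convention used above:

import socket
import time

NCCL_UNIQUE_ID_BYTES = 128  # NCCL's fixed unique-id size

def recv_nccl_id(ep):
    # Mirror of RecvNCCLID: listen on our own endpoint, accept one
    # connection, and read the raw id bytes.
    host, port = ep.split(":")
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    srv.bind((host, int(port)))
    srv.listen(1)
    conn, _ = srv.accept()
    nccl_id = conn.recv(NCCL_UNIQUE_ID_BYTES)
    conn.close()
    srv.close()
    return nccl_id

def send_nccl_id(ep, nccl_id):
    # Mirror of SendNCCLID: keep retrying until the peer's listener is
    # up, then push the id (same 3-second retry as the C++ code).
    host, port = ep.split(":")
    while True:
        try:
            sock = socket.create_connection((host, int(port)))
            break
        except OSError:
            time.sleep(3)
    sock.sendall(nccl_id)
    sock.close()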
paddle/fluid/imperative/nccl_context.h
new file mode 100644

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

// network header files
#ifndef _WIN32
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
#endif

#include <string>
#include <vector>

#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"

namespace paddle {
namespace imperative {

struct ParallelStrategy {
  int nranks_{1};
  int local_rank_{0};
  std::vector<std::string> trainer_endpoints_{};
  std::string current_endpoint_{""};
};

class ParallelContext {
 public:
  explicit ParallelContext(const ParallelStrategy &strategy,
                           const platform::Place &place)
      : strategy_(strategy), place_(place) {}

  virtual ~ParallelContext() {}

  virtual void Init() = 0;

 protected:
  ParallelStrategy strategy_;
  platform::Place place_;
};

#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
class NCCLParallelContext : ParallelContext {
 public:
  explicit NCCLParallelContext(const ParallelStrategy &strategy,
                               const platform::Place &place)
      : ParallelContext(strategy, place) {}

  ~NCCLParallelContext() {}

  void BcastNCCLId(ncclUniqueId *nccl_id, int root);

  void Init() override;

 protected:
  void RecvNCCLID(const std::string &endpoint, ncclUniqueId *nccl_id);

  void SendNCCLID(const std::string &endpoint, ncclUniqueId *nccl_id);
};
#endif

}  // namespace imperative
}  // namespace paddle
paddle/fluid/imperative/nccl_context_test.cc
new file mode 100644

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/imperative/nccl_context.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"

namespace imperative = paddle::imperative;
namespace platform = paddle::platform;

imperative::ParallelStrategy GetStrategy(int local_rank) {
  std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"};
  imperative::ParallelStrategy strategy;
  strategy.trainer_endpoints_ = eps;
  strategy.current_endpoint_ = eps[local_rank];
  strategy.nranks_ = 2;
  strategy.local_rank_ = local_rank;
  return strategy;
}

#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
  auto strategy = GetStrategy(local_rank);
  platform::CUDAPlace gpu(local_rank);
  imperative::NCCLParallelContext ctx(strategy, gpu);
  ctx.BcastNCCLId(nccl_id, 0);
}

TEST(BcastNCCLId, Run) {
  ncclUniqueId nccl_id;
  platform::dynload::ncclGetUniqueId(&nccl_id);
  std::thread t(BcastNCCLId, 0, &nccl_id);

  ncclUniqueId recv_nccl_id;
  BcastNCCLId(1, &recv_nccl_id);

  t.join();
  EXPECT_EQ(0, std::memcmp(nccl_id.internal, recv_nccl_id.internal,
                           NCCL_UNIQUE_ID_BYTES));
}
#endif
paddle/fluid/imperative/tracer.cc
@@ -177,7 +177,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       current_vars_map[out->Name()] = out;
     }
-    VLOG(3) << "input var name: " << out->Name()
+    VLOG(3) << "output var name: " << out->Name()
             << " inited: " << out->var_->IsInitialized()
             << " stop_grad: " << out->IsStopGradient();
   }
...
@@ -215,6 +215,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
   framework::Scope scope;
   op->place_ = GetExpectedPlace(expected_place, inputs);
   PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
   prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
   prepared_op.func(
...
paddle/fluid/pybind/CMakeLists.txt
 set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper prune
     feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
-    tracer analysis_predictor imperative_profiler)
+    tracer analysis_predictor imperative_profiler nccl_context)
 if(WITH_PYTHON)
   list(APPEND PYBIND_DEPS py_func_op)
...
paddle/fluid/pybind/imperative.cc
@@ -29,7 +29,7 @@ namespace paddle {
 namespace pybind {

 // Bind Methods
-void BindTracer(pybind11::module *m) {
+void BindImperative(pybind11::module *m) {
   pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
       .def("__init__",
            [](imperative::Tracer &self, framework::BlockDesc *root_block) {
...
@@ -59,6 +59,47 @@ void BindTracer(pybind11::module* m) {
       })
       .def("py_trace", &imperative::Tracer::PyTrace,
            pybind11::return_value_policy::take_ownership);
+  // define parallel context
+  pybind11::class_<imperative::ParallelStrategy> parallel_strategy(
+      *m, "ParallelStrategy", "");
+  parallel_strategy.def(pybind11::init())
+      .def_property(
+          "nranks",
+          [](const imperative::ParallelStrategy &self) { return self.nranks_; },
+          [](imperative::ParallelStrategy &self, int nranks) {
+            self.nranks_ = nranks;
+          })
+      .def_property(
+          "local_rank",
+          [](const imperative::ParallelStrategy &self) {
+            return self.local_rank_;
+          },
+          [](imperative::ParallelStrategy &self, int local_rank) {
+            self.local_rank_ = local_rank;
+          })
+      .def_property(
+          "trainer_endpoints",
+          [](const imperative::ParallelStrategy &self) {
+            return self.trainer_endpoints_;
+          },
+          [](imperative::ParallelStrategy &self, std::vector<std::string> eps) {
+            self.trainer_endpoints_ = eps;
+          })
+      .def_property(
+          "current_endpoint",
+          [](const imperative::ParallelStrategy &self) {
+            return self.current_endpoint_;
+          },
+          [](imperative::ParallelStrategy &self, const std::string &ep) {
+            self.current_endpoint_ = ep;
+          });
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  pybind11::class_<imperative::NCCLParallelContext> nccl_ctx(
+      *m, "NCCLParallelContext");
+  nccl_ctx.def(pybind11::init<const imperative::ParallelStrategy &,
+                              const platform::CUDAPlace &>())
+      .def("init", [](imperative::NCCLParallelContext &self) { self.Init(); });
+#endif
 }

 }  // namespace pybind
...
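With these bindings, the new types surface in Python as core.ParallelStrategy and core.NCCLParallelContext. A sketch of direct use (normally wrapped by prepare_context in python/paddle/fluid/dygraph/parallel.py below; endpoint values borrowed from the unit test):

from paddle.fluid import core

strategy = core.ParallelStrategy()
strategy.nranks = 2
strategy.local_rank = 0
strategy.trainer_endpoints = ["127.0.0.1:9866", "127.0.0.1:9867"]
strategy.current_endpoint = "127.0.0.1:9866"

# Blocks until the ranks have exchanged the ncclUniqueId, then installs
# the NCCL communicator into this rank's CUDA device context.
ctx = core.NCCLParallelContext(strategy, core.CUDAPlace(0))
ctx.init()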
paddle/fluid/pybind/imperative.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/nccl_context.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
...
@@ -46,7 +47,7 @@ class PyVarBase : public imperative::VarBase {
   using imperative::VarBase::VarBase;  // Inherit constructors
 };

-void BindTracer(pybind11::module* m);
+void BindImperative(pybind11::module* m);

 }  // namespace pybind
 }  // namespace paddle
paddle/fluid/pybind/pybind.cc
@@ -288,7 +288,7 @@ PYBIND11_MODULE(core, m) {
       })
       .def_static("num_funcs", &imperative::PyLayer::NumFuncs);

-  BindTracer(&m);
+  BindImperative(&m);

   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
...
python/paddle/distributed/launch.py
@@ -32,6 +32,7 @@ default_envs = {
     "NCCL_SOCKET_IFNAME": "eth0",
     "NCCL_IB_GID_INDEX": "3",
     "NCCL_IB_RETRY_CNT": "0",
+    "PYTHONPATH": os.getenv("PYTHONPATH", ""),
 }

 GPUS = 8
...
python/paddle/fluid/dygraph/__init__.py
@@ -29,6 +29,9 @@ from .tracer import *
 from . import profiler
 from .profiler import *
+from . import parallel
+from .parallel import *
 from . import checkpoint
 from .checkpoint import *
...
@@ -41,5 +44,6 @@ __all__ += base.__all__
 __all__ += nn.__all__
 __all__ += tracer.__all__
 __all__ += profiler.__all__
+__all__ += parallel.__all__
 __all__ += checkpoint.__all__
 __all__ += learning_rate_scheduler.__all__
python/paddle/fluid/dygraph/parallel.py
new file mode 100644

# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from .. import core

__all__ = ["prepare_context"]

ParallelStrategy = core.ParallelStrategy

__parallel_ctx__clz__ = None


def prepare_context(parallel_strategy, place):
    global __parallel_ctx__clz__
    assert __parallel_ctx__clz__ is None, "ParallelContext can only be initialized once."

    if isinstance(place, core.CUDAPlace):
        __parallel_ctx__clz__ = core.NCCLParallelContext(parallel_strategy,
                                                         place)
    else:
        # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
        raise AssertionError("Only support CUDAPlace for now.")
    __parallel_ctx__clz__.init()


class Env(object):
    def __init__(self):
        self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                            "").split(",")
        self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")

    @property
    def nranks(self):
        return self._nranks

    @property
    def local_rank(self):
        return self._local_rank

    @property
    def dev_id(self):
        return self._dev_id

    @property
    def current_endpoint(self):
        return self._current_endpoint
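Putting the pieces together, a trainer process started by the launcher in python/paddle/distributed/launch.py could build a strategy from Env and hand it to prepare_context. A sketch, assuming the PADDLE_* and FLAGS_selected_gpus environment variables are set by the launcher (note that Env exposes no public property for its _trainer_endpoints field in this commit):

from paddle.fluid import core
from paddle.fluid.dygraph.parallel import Env, prepare_context

env = Env()
strategy = core.ParallelStrategy()
strategy.nranks = env.nranks
strategy.local_rank = env.local_rank
strategy.trainer_endpoints = env._trainer_endpoints  # private; no property yet
strategy.current_endpoint = env.current_endpoint

# Initializes the NCCL communicator for this rank's GPU.
prepare_context(strategy, core.CUDAPlace(env.dev_id))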