Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
42fe3e88
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
42fe3e88
编写于
7月 25, 2017
作者:
H
Helin Wang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
gracefully shutdown pserver, fix gometalinter errors
上级
cb5c7526
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
90 addition
and
55 deletion
+90
-55
go/cmd/master/master.go
go/cmd/master/master.go
+3
-3
go/cmd/pserver/pserver.go
go/cmd/pserver/pserver.go
+26
-5
go/master/etcd_client.go
go/master/etcd_client.go
+2
-2
go/pserver/client/c/cclient.go
go/pserver/client/c/cclient.go
+3
-3
go/pserver/etcd_client.go
go/pserver/etcd_client.go
+56
-42
未找到文件。
go/cmd/master/master.go
浏览文件 @
42fe3e88
...
...
@@ -72,9 +72,9 @@ func main() {
shutdown
:=
func
()
{
log
.
Infoln
(
"shutting down gracefully"
)
e
:=
store
.
Shutdown
()
if
e
!=
nil
{
log
.
Errorln
(
e
)
e
rr
:=
store
.
Shutdown
()
if
e
rr
!=
nil
{
log
.
Errorln
(
e
rr
)
}
}
...
...
go/cmd/pserver/pserver.go
浏览文件 @
42fe3e88
...
...
@@ -18,6 +18,8 @@ import (
"net"
"net/http"
"net/rpc"
"os"
"os/signal"
"strconv"
"time"
...
...
@@ -33,7 +35,8 @@ func main() {
index
:=
flag
.
Int
(
"index"
,
-
1
,
"index of this pserver, should be larger or equal than 0"
)
etcdEndpoint
:=
flag
.
String
(
"etcd-endpoint"
,
"http://127.0.0.1:2379"
,
"comma separated endpoint string for pserver to connect to etcd"
)
etcdTimeout
:=
flag
.
Duration
(
"etcd-timeout"
,
5
*
time
.
Second
,
"timeout for etcd calls"
)
dialTimeout
:=
flag
.
Duration
(
"dial-timeout"
,
5
*
time
.
Second
,
"dial timeout"
)
etcdTTL
:=
flag
.
Int
(
"etcd-ttl"
,
5
,
"etcd time to live in seconds"
)
numPservers
:=
flag
.
Int
(
"num-pservers"
,
1
,
"total pserver count in a training job"
)
checkpointPath
:=
flag
.
String
(
"checkpoint-path"
,
"/checkpoints/"
,
"save checkpoint path"
)
checkpointInterval
:=
flag
.
Duration
(
"checkpoint-interval"
,
600
*
time
.
Second
,
"save checkpoint per interval seconds"
)
...
...
@@ -53,7 +56,7 @@ func main() {
if
*
index
>=
0
{
idx
=
*
index
}
else
{
e
=
pserver
.
NewEtcdClient
(
*
etcdEndpoint
,
*
numPservers
,
*
etcdTimeout
)
e
=
pserver
.
NewEtcdClient
(
*
etcdEndpoint
,
*
numPservers
,
*
dialTimeout
,
*
etcdTTL
)
idx
,
err
=
e
.
Register
(
*
port
)
candy
.
Must
(
err
)
...
...
@@ -67,6 +70,20 @@ func main() {
}
}
shutdown
:=
func
()
{
log
.
Infoln
(
"shutting down gracefully"
)
err
:=
e
.
Shutdown
()
if
err
!=
nil
{
log
.
Errorln
(
err
)
}
}
// Guaranteed to run even panic happens.
defer
shutdown
()
c
:=
make
(
chan
os
.
Signal
,
1
)
signal
.
Notify
(
c
,
os
.
Interrupt
)
s
,
err
:=
pserver
.
NewService
(
idx
,
*
checkpointInterval
,
*
checkpointPath
,
e
,
cp
)
candy
.
Must
(
err
)
...
...
@@ -77,7 +94,11 @@ func main() {
l
,
err
:=
net
.
Listen
(
"tcp"
,
":"
+
strconv
.
Itoa
(
*
port
))
candy
.
Must
(
err
)
log
.
Infof
(
"start pserver at port %d"
,
*
port
)
err
=
http
.
Serve
(
l
,
nil
)
candy
.
Must
(
err
)
go
func
()
{
log
.
Infof
(
"start pserver at port %d"
,
*
port
)
err
=
http
.
Serve
(
l
,
nil
)
candy
.
Must
(
err
)
}()
<-
c
}
go/master/etcd_client.go
浏览文件 @
42fe3e88
...
...
@@ -68,12 +68,12 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
// one master running, but split-brain problem may cause
// multiple master servers running), and the cluster management
// software will kill one of them.
log
.
Debug
f
(
"Trying to acquire lock at %s."
,
lockPath
)
log
.
Info
f
(
"Trying to acquire lock at %s."
,
lockPath
)
err
=
lock
.
Lock
(
context
.
TODO
())
if
err
!=
nil
{
return
nil
,
err
}
log
.
Debug
f
(
"Successfully acquired lock at %s."
,
lockPath
)
log
.
Info
f
(
"Successfully acquired lock at %s."
,
lockPath
)
put
:=
clientv3
.
OpPut
(
addrPath
,
addr
)
resp
,
err
:=
cli
.
Txn
(
context
.
Background
())
.
If
(
lock
.
IsOwner
())
.
Then
(
put
)
.
Commit
()
...
...
go/pserver/client/c/cclient.go
浏览文件 @
42fe3e88
...
...
@@ -55,10 +55,10 @@ var curHandle C.paddle_pserver_client
func
add
(
c
*
client
.
Client
)
C
.
paddle_pserver_client
{
mu
.
Lock
()
defer
mu
.
Unlock
()
cli
ent
:=
curHandle
cli
:=
curHandle
curHandle
++
handleMap
[
cli
ent
]
=
c
return
cli
ent
handleMap
[
cli
]
=
c
return
cli
}
func
get
(
client
C
.
paddle_pserver_client
)
*
client
.
Client
{
...
...
go/pserver/etcd_client.go
浏览文件 @
42fe3e88
...
...
@@ -34,16 +34,19 @@ const (
PsPath
=
"/ps/"
// PsCheckpoint is the etcd path for store checkpoints information
PsCheckpoint
=
"/checkpoints/"
retryTimeout
=
5
*
time
.
Second
)
// EtcdClient is the etcd client that the pserver uses for fault
// tolerance, service registry and coordination.
type
EtcdClient
struct
{
numPservers
int
etcdEndpoints
string
etcdClient
*
clientv3
.
Client
// etcdTimeout is also used as retry intervals.
etcdTimeout
time
.
Duration
numPservers
int
endpoints
string
client
*
clientv3
.
Client
sess
*
concurrency
.
Session
dialTimeout
time
.
Duration
ttlSec
int
// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
externalIP
string
// desired number of pservers in the job.
...
...
@@ -52,11 +55,12 @@ type EtcdClient struct {
}
// NewEtcdClient creates an EtcdClient
func
NewEtcdClient
(
endpoints
string
,
numPservers
int
,
timeout
time
.
Duration
)
*
EtcdClient
{
func
NewEtcdClient
(
endpoints
string
,
numPservers
int
,
dialtimeout
time
.
Duration
,
ttlSec
int
)
*
EtcdClient
{
return
&
EtcdClient
{
etcdTimeout
:
timeout
,
numPservers
:
numPservers
,
etcdEndpoints
:
endpoints
,
dialTimeout
:
dialtimeout
,
ttlSec
:
ttlSec
,
numPservers
:
numPservers
,
endpoints
:
endpoints
,
}
}
...
...
@@ -64,7 +68,6 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et
//
// Register returns the index of the current pserver.
func
(
e
*
EtcdClient
)
Register
(
port
int
)
(
int
,
error
)
{
var
err
error
e
.
externalIP
,
err
=
networkhelper
.
GetExternalIP
()
if
err
!=
nil
{
...
...
@@ -72,19 +75,26 @@ func (e *EtcdClient) Register(port int) (int, error) {
}
// initialize connection to etcd.
ep
:=
strings
.
Split
(
e
.
e
tcdE
ndpoints
,
","
)
ep
:=
strings
.
Split
(
e
.
endpoints
,
","
)
for
{
cli
,
err
:=
clientv3
.
New
(
clientv3
.
Config
{
Endpoints
:
ep
,
DialTimeout
:
e
.
etcd
Timeout
,
DialTimeout
:
e
.
dial
Timeout
,
})
if
err
!=
nil
{
log
.
Errorf
(
"connect to etcd error: %v"
,
err
)
time
.
Sleep
(
e
.
etcdTimeout
)
time
.
Sleep
(
retryTimeout
)
continue
}
e
.
client
=
cli
sess
,
err
:=
concurrency
.
NewSession
(
cli
,
concurrency
.
WithTTL
(
e
.
ttlSec
))
if
err
!=
nil
{
log
.
Errorf
(
"create etcd session error: %v"
,
err
)
time
.
Sleep
(
retryTimeout
)
continue
}
e
.
etcdClient
=
cli
log
.
Debugf
(
"inited client to %s"
,
e
.
e
tcdE
ndpoints
)
e
.
sess
=
sess
log
.
Debugf
(
"inited client to %s"
,
e
.
endpoints
)
break
}
// init /ps_desired using transaction, for multiple pservers may want to write
...
...
@@ -95,7 +105,7 @@ func (e *EtcdClient) Register(port int) (int, error) {
cancel
()
if
err
!=
nil
{
log
.
Warn
(
err
)
time
.
Sleep
(
e
.
etcd
Timeout
)
time
.
Sleep
(
retry
Timeout
)
continue
}
break
...
...
@@ -106,18 +116,18 @@ func (e *EtcdClient) Register(port int) (int, error) {
// wait and set s.desired init value
for
{
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
time
.
Second
)
resp
,
err
:=
e
.
etcdC
lient
.
Get
(
ctx
,
PsDesired
)
resp
,
err
:=
e
.
c
lient
.
Get
(
ctx
,
PsDesired
)
cancel
()
if
err
!=
nil
{
log
.
Errorf
(
"getting %s error: %v"
,
PsDesired
,
err
)
time
.
Sleep
(
e
.
etcd
Timeout
)
time
.
Sleep
(
retry
Timeout
)
continue
}
if
len
(
resp
.
Kvs
)
!=
0
{
e
.
desired
,
err
=
strconv
.
Atoi
(
string
(
resp
.
Kvs
[
0
]
.
Value
))
if
err
!=
nil
{
log
.
Errorf
(
"value of %s invalid %v
\n
"
,
PsDesired
,
err
)
time
.
Sleep
(
e
.
etcd
Timeout
)
time
.
Sleep
(
retry
Timeout
)
// NOTE: wait util ps_desired value change
continue
}
...
...
@@ -134,7 +144,7 @@ func (e *EtcdClient) Register(port int) (int, error) {
cancel
()
if
err
!=
nil
{
log
.
Warn
(
err
)
time
.
Sleep
(
e
.
etcd
Timeout
)
time
.
Sleep
(
retry
Timeout
)
continue
}
break
...
...
@@ -144,10 +154,10 @@ func (e *EtcdClient) Register(port int) (int, error) {
}
func
(
e
*
EtcdClient
)
initDesiredPservers
(
ctx
context
.
Context
,
numPservers
int
)
(
*
clientv3
.
TxnResponse
,
error
)
{
return
concurrency
.
NewSTM
(
e
.
etcdC
lient
,
func
(
c
concurrency
.
STM
)
error
{
return
concurrency
.
NewSTM
(
e
.
c
lient
,
func
(
c
concurrency
.
STM
)
error
{
dsStr
:=
c
.
Get
(
PsDesired
)
if
dsStr
==
""
{
c
.
Put
(
PsDesired
,
strconv
.
Itoa
(
numPservers
))
c
.
Put
(
PsDesired
,
strconv
.
Itoa
(
numPservers
)
,
clientv3
.
WithLease
(
e
.
sess
.
Lease
())
)
}
return
nil
},
concurrency
.
WithAbortContext
(
ctx
),
concurrency
.
WithIsolation
(
concurrency
.
RepeatableReads
))
...
...
@@ -156,7 +166,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (
// registerPserverEtcd registers pserver node on etcd using transaction.
func
(
e
*
EtcdClient
)
registerPserverEtcd
(
ctx
context
.
Context
,
port
int
)
(
int
,
error
)
{
var
idx
int
_
,
err
:=
concurrency
.
NewSTM
(
e
.
etcdC
lient
,
func
(
c
concurrency
.
STM
)
error
{
_
,
err
:=
concurrency
.
NewSTM
(
e
.
c
lient
,
func
(
c
concurrency
.
STM
)
error
{
registered
:=
false
for
i
:=
0
;
i
<
e
.
desired
;
i
++
{
psKey
:=
PsPath
+
strconv
.
Itoa
(
i
)
...
...
@@ -165,26 +175,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er
log
.
Debugf
(
"got value (%s) for key: %s"
,
ps
,
psKey
)
if
ps
==
""
{
resp
,
err
:=
e
.
etcdClient
.
Grant
(
context
.
TODO
(),
5
)
if
err
!=
nil
{
log
.
Fatal
(
err
)
}
// find the first id and write info
pserverAddr
:=
e
.
externalIP
+
":"
+
strconv
.
Itoa
(
port
)
c
.
Put
(
psKey
,
pserverAddr
,
clientv3
.
WithLease
(
resp
.
ID
))
c
.
Put
(
psKey
,
pserverAddr
,
clientv3
.
WithLease
(
e
.
sess
.
Lease
()
))
log
.
Debugf
(
"set pserver node %s with value %s"
,
psKey
,
pserverAddr
)
ch
,
kaerr
:=
e
.
etcdClient
.
KeepAlive
(
context
.
TODO
(),
resp
.
ID
)
if
kaerr
!=
nil
{
log
.
Errorf
(
"keepalive etcd node error: %v"
,
kaerr
)
return
kaerr
}
// Eat the keep alive message so etcd
// will not expire the lease.
go
func
(
ch
<-
chan
*
clientv3
.
LeaseKeepAliveResponse
)
{
ka
:=
<-
ch
log
.
Debugf
(
"keepalive: %d
\n
"
,
ka
.
TTL
)
}(
ch
)
log
.
Debug
(
"register finished"
)
idx
=
i
registered
=
true
...
...
@@ -207,7 +201,7 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er
// GetKey gets the value by the specified key
func
(
e
*
EtcdClient
)
GetKey
(
key
string
,
timeout
time
.
Duration
)
([]
byte
,
error
)
{
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
timeout
)
resp
,
err
:=
e
.
etcdC
lient
.
Get
(
ctx
,
key
)
resp
,
err
:=
e
.
c
lient
.
Get
(
ctx
,
key
)
cancel
()
if
err
!=
nil
{
return
[]
byte
{},
err
...
...
@@ -223,7 +217,27 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
// PutKey put into etcd with value by key specified
func
(
e
*
EtcdClient
)
PutKey
(
key
string
,
value
[]
byte
,
timeout
time
.
Duration
)
error
{
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
timeout
)
_
,
err
:=
e
.
etcdClient
.
Put
(
ctx
,
key
,
string
(
value
))
_
,
err
:=
e
.
client
.
Put
(
ctx
,
key
,
string
(
value
),
clientv3
.
WithLease
(
e
.
sess
.
Lease
()
))
cancel
()
return
err
}
// Shutdown shuts down the etcd client gracefully.
func
(
e
*
EtcdClient
)
Shutdown
()
error
{
var
err
error
if
e
.
sess
!=
nil
{
err
=
e
.
sess
.
Close
()
}
if
e
.
client
!=
nil
{
newErr
:=
e
.
client
.
Close
()
if
newErr
!=
nil
{
if
err
!=
nil
{
log
.
Errorln
(
newErr
)
}
else
{
err
=
newErr
}
}
}
return
err
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录