Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
6a0b3657
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6a0b3657
编写于
7月 28, 2017
作者:
H
helinwang
提交者:
GitHub
7月 28, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #3062 from helinwang/grace
gracefully shutdown master, pserver, fix gometalinter errors
上级
35b6415f
6fab04f4
变更
7
显示空白变更内容
内联
并排
Showing
7 changed file
with
134 addition
and
60 deletion
+134
-60
go/cmd/master/master.go
go/cmd/master/master.go
+24
-4
go/cmd/pserver/pserver.go
go/cmd/pserver/pserver.go
+26
-5
go/master/etcd_client.go
go/master/etcd_client.go
+19
-6
go/master/inmem_store.go
go/master/inmem_store.go
+5
-0
go/master/service.go
go/master/service.go
+1
-0
go/pserver/client/c/cclient.go
go/pserver/client/c/cclient.go
+3
-3
go/pserver/etcd_client.go
go/pserver/etcd_client.go
+56
-42
未找到文件。
go/cmd/master/master.go
浏览文件 @
6a0b3657
...
...
@@ -19,6 +19,8 @@ import (
"net"
"net/http"
"net/rpc"
"os"
"os/signal"
"strconv"
"strings"
"time"
...
...
@@ -68,6 +70,20 @@ func main() {
store
=
&
master
.
InMemStore
{}
}
shutdown
:=
func
()
{
log
.
Infoln
(
"shutting down gracefully"
)
err
:=
store
.
Shutdown
()
if
err
!=
nil
{
log
.
Errorln
(
err
)
}
}
// Guaranteed to run even panic happens.
defer
shutdown
()
c
:=
make
(
chan
os
.
Signal
,
1
)
signal
.
Notify
(
c
,
os
.
Interrupt
)
s
,
err
:=
master
.
NewService
(
store
,
*
chunkPerTask
,
*
taskTimeoutDur
,
*
taskTimeoutMax
)
if
err
!=
nil
{
log
.
Fatal
(
err
)
...
...
@@ -84,8 +100,12 @@ func main() {
log
.
Fatal
(
err
)
}
go
func
()
{
err
=
http
.
Serve
(
l
,
nil
)
if
err
!=
nil
{
log
.
Fatal
(
err
)
}
}()
<-
c
}
go/cmd/pserver/pserver.go
浏览文件 @
6a0b3657
...
...
@@ -18,6 +18,8 @@ import (
"net"
"net/http"
"net/rpc"
"os"
"os/signal"
"strconv"
"time"
...
...
@@ -33,7 +35,8 @@ func main() {
index
:=
flag
.
Int
(
"index"
,
-
1
,
"index of this pserver, should be larger or equal than 0"
)
etcdEndpoint
:=
flag
.
String
(
"etcd-endpoint"
,
"http://127.0.0.1:2379"
,
"comma separated endpoint string for pserver to connect to etcd"
)
etcdTimeout
:=
flag
.
Duration
(
"etcd-timeout"
,
5
*
time
.
Second
,
"timeout for etcd calls"
)
dialTimeout
:=
flag
.
Duration
(
"dial-timeout"
,
5
*
time
.
Second
,
"dial timeout"
)
etcdTTL
:=
flag
.
Int
(
"etcd-ttl"
,
5
,
"etcd time to live in seconds"
)
numPservers
:=
flag
.
Int
(
"num-pservers"
,
1
,
"total pserver count in a training job"
)
checkpointPath
:=
flag
.
String
(
"checkpoint-path"
,
"/checkpoints/"
,
"save checkpoint path"
)
checkpointInterval
:=
flag
.
Duration
(
"checkpoint-interval"
,
600
*
time
.
Second
,
"save checkpoint per interval seconds"
)
...
...
@@ -53,7 +56,7 @@ func main() {
if
*
index
>=
0
{
idx
=
*
index
}
else
{
e
=
pserver
.
NewEtcdClient
(
*
etcdEndpoint
,
*
numPservers
,
*
etcdTimeout
)
e
=
pserver
.
NewEtcdClient
(
*
etcdEndpoint
,
*
numPservers
,
*
dialTimeout
,
*
etcdTTL
)
idx
,
err
=
e
.
Register
(
*
port
)
candy
.
Must
(
err
)
...
...
@@ -67,6 +70,20 @@ func main() {
}
}
shutdown
:=
func
()
{
log
.
Infoln
(
"shutting down gracefully"
)
sErr
:=
e
.
Shutdown
()
if
sErr
!=
nil
{
log
.
Errorln
(
sErr
)
}
}
// Guaranteed to run even panic happens.
defer
shutdown
()
c
:=
make
(
chan
os
.
Signal
,
1
)
signal
.
Notify
(
c
,
os
.
Interrupt
)
s
,
err
:=
pserver
.
NewService
(
idx
,
*
checkpointInterval
,
*
checkpointPath
,
e
,
cp
)
candy
.
Must
(
err
)
...
...
@@ -77,7 +94,11 @@ func main() {
l
,
err
:=
net
.
Listen
(
"tcp"
,
":"
+
strconv
.
Itoa
(
*
port
))
candy
.
Must
(
err
)
go
func
()
{
log
.
Infof
(
"start pserver at port %d"
,
*
port
)
err
=
http
.
Serve
(
l
,
nil
)
candy
.
Must
(
err
)
}()
<-
c
}
go/master/etcd_client.go
浏览文件 @
6a0b3657
...
...
@@ -39,15 +39,12 @@ type EtcdClient struct {
statePath
string
client
*
clientv3
.
Client
lock
*
concurrency
.
Mutex
sess
*
concurrency
.
Session
}
// NewEtcdClient creates a new EtcdClient.
func
NewEtcdClient
(
endpoints
[]
string
,
addr
string
,
lockPath
,
addrPath
,
statePath
string
,
ttlSec
int
)
(
*
EtcdClient
,
error
)
{
log
.
Debugf
(
"Connecting to etcd at %v"
,
endpoints
)
// TODO(helin): gracefully shutdown etcd store. Because etcd
// store holds a etcd lock, even though the lock will expire
// when the lease timeout, we need to implement graceful
// shutdown to release the lock.
cli
,
err
:=
clientv3
.
New
(
clientv3
.
Config
{
Endpoints
:
endpoints
,
DialTimeout
:
dialTimeout
,
...
...
@@ -67,12 +64,12 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
// one master running, but split-brain problem may cause
// multiple master servers running), and the cluster management
// software will kill one of them.
log
.
Debug
f
(
"Trying to acquire lock at %s."
,
lockPath
)
log
.
Info
f
(
"Trying to acquire lock at %s."
,
lockPath
)
err
=
lock
.
Lock
(
context
.
TODO
())
if
err
!=
nil
{
return
nil
,
err
}
log
.
Debug
f
(
"Successfully acquired lock at %s."
,
lockPath
)
log
.
Info
f
(
"Successfully acquired lock at %s."
,
lockPath
)
put
:=
clientv3
.
OpPut
(
addrPath
,
addr
)
resp
,
err
:=
cli
.
Txn
(
context
.
Background
())
.
If
(
lock
.
IsOwner
())
.
Then
(
put
)
.
Commit
()
...
...
@@ -89,6 +86,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
statePath
:
statePath
,
client
:
cli
,
lock
:
lock
,
sess
:
sess
,
}
return
e
,
nil
...
...
@@ -157,6 +155,21 @@ func (e *EtcdClient) Load() ([]byte, error) {
return
state
,
nil
}
// Shutdown shuts down the etcd client gracefully.
func
(
e
*
EtcdClient
)
Shutdown
()
error
{
err
:=
e
.
sess
.
Close
()
newErr
:=
e
.
client
.
Close
()
if
newErr
!=
nil
{
if
err
==
nil
{
err
=
newErr
}
else
{
log
.
Errorln
(
newErr
)
}
}
return
err
}
// GetKey gets the value by the specify key.
func
GetKey
(
c
*
clientv3
.
Client
,
key
string
,
timeout
time
.
Duration
)
(
string
,
error
)
{
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
timeout
)
...
...
go/master/inmem_store.go
浏览文件 @
6a0b3657
...
...
@@ -40,3 +40,8 @@ func (m *InMemStore) Load() ([]byte, error) {
return
m
.
buf
,
nil
}
// Shutdown shuts down the in mem store.
func
(
m
*
InMemStore
)
Shutdown
()
error
{
return
nil
}
go/master/service.go
浏览文件 @
6a0b3657
...
...
@@ -50,6 +50,7 @@ var ErrPassAfter = errors.New("pass number larger than master")
type
Store
interface
{
Save
([]
byte
)
error
Load
()
([]
byte
,
error
)
Shutdown
()
error
}
// Chunk is a chunk of data consisted of several data instances.
...
...
go/pserver/client/c/cclient.go
浏览文件 @
6a0b3657
...
...
@@ -55,10 +55,10 @@ var curHandle C.paddle_pserver_client
func
add
(
c
*
client
.
Client
)
C
.
paddle_pserver_client
{
mu
.
Lock
()
defer
mu
.
Unlock
()
cli
ent
:=
curHandle
cli
:=
curHandle
curHandle
++
handleMap
[
cli
ent
]
=
c
return
cli
ent
handleMap
[
cli
]
=
c
return
cli
}
func
get
(
client
C
.
paddle_pserver_client
)
*
client
.
Client
{
...
...
go/pserver/etcd_client.go
浏览文件 @
6a0b3657
...
...
@@ -34,16 +34,19 @@ const (
PsPath
=
"/ps/"
// PsCheckpoint is the etcd path for store checkpoints information
PsCheckpoint
=
"/checkpoints/"
retryTimeout
=
5
*
time
.
Second
)
// EtcdClient is the etcd client that the pserver uses for fault
// tolerance, service registry and coordination.
type
EtcdClient
struct
{
numPservers
int
etcdEndpoints
string
etcdClient
*
clientv3
.
Client
// etcdTimeout is also used as retry intervals.
etcdTimeout
time
.
Duration
endpoints
string
client
*
clientv3
.
Client
sess
*
concurrency
.
Session
dialTimeout
time
.
Duration
ttlSec
int
// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
externalIP
string
// desired number of pservers in the job.
...
...
@@ -52,11 +55,12 @@ type EtcdClient struct {
}
// NewEtcdClient creates an EtcdClient
func
NewEtcdClient
(
endpoints
string
,
numPservers
int
,
timeout
time
.
Duration
)
*
EtcdClient
{
func
NewEtcdClient
(
endpoints
string
,
numPservers
int
,
dialtimeout
time
.
Duration
,
ttlSec
int
)
*
EtcdClient
{
return
&
EtcdClient
{
etcdTimeout
:
timeout
,
dialTimeout
:
dialtimeout
,
ttlSec
:
ttlSec
,
numPservers
:
numPservers
,
e
tcdEndpoints
:
endpoints
,
e
ndpoints
:
endpoints
,
}
}
...
...
@@ -64,7 +68,6 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et
//
// Register returns the index of the current pserver.
func
(
e
*
EtcdClient
)
Register
(
port
int
)
(
int
,
error
)
{
var
err
error
e
.
externalIP
,
err
=
networkhelper
.
GetExternalIP
()
if
err
!=
nil
{
...
...
@@ -72,19 +75,26 @@ func (e *EtcdClient) Register(port int) (int, error) {
}
// initialize connection to etcd.
ep
:=
strings
.
Split
(
e
.
e
tcdE
ndpoints
,
","
)
ep
:=
strings
.
Split
(
e
.
endpoints
,
","
)
for
{
cli
,
err
:=
clientv3
.
New
(
clientv3
.
Config
{
Endpoints
:
ep
,
DialTimeout
:
e
.
etcd
Timeout
,
DialTimeout
:
e
.
dial
Timeout
,
})
if
err
!=
nil
{
log
.
Errorf
(
"connect to etcd error: %v"
,
err
)
time
.
Sleep
(
e
.
etcdTimeout
)
time
.
Sleep
(
retryTimeout
)
continue
}
e
.
client
=
cli
sess
,
err
:=
concurrency
.
NewSession
(
cli
,
concurrency
.
WithTTL
(
e
.
ttlSec
))
if
err
!=
nil
{
log
.
Errorf
(
"create etcd session error: %v"
,
err
)
time
.
Sleep
(
retryTimeout
)
continue
}
e
.
etcdClient
=
cli
log
.
Debugf
(
"inited client to %s"
,
e
.
e
tcdE
ndpoints
)
e
.
sess
=
sess
log
.
Debugf
(
"inited client to %s"
,
e
.
endpoints
)
break
}
// init /ps_desired using transaction, for multiple pservers may want to write
...
...
@@ -95,7 +105,7 @@ func (e *EtcdClient) Register(port int) (int, error) {
cancel
()
if
err
!=
nil
{
log
.
Warn
(
err
)
time
.
Sleep
(
e
.
etcd
Timeout
)
time
.
Sleep
(
retry
Timeout
)
continue
}
break
...
...
@@ -106,18 +116,18 @@ func (e *EtcdClient) Register(port int) (int, error) {
// wait and set s.desired init value
for
{
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
time
.
Second
)
resp
,
err
:=
e
.
etcdC
lient
.
Get
(
ctx
,
PsDesired
)
resp
,
err
:=
e
.
c
lient
.
Get
(
ctx
,
PsDesired
)
cancel
()
if
err
!=
nil
{
log
.
Errorf
(
"getting %s error: %v"
,
PsDesired
,
err
)
time
.
Sleep
(
e
.
etcd
Timeout
)
time
.
Sleep
(
retry
Timeout
)
continue
}
if
len
(
resp
.
Kvs
)
!=
0
{
e
.
desired
,
err
=
strconv
.
Atoi
(
string
(
resp
.
Kvs
[
0
]
.
Value
))
if
err
!=
nil
{
log
.
Errorf
(
"value of %s invalid %v
\n
"
,
PsDesired
,
err
)
time
.
Sleep
(
e
.
etcd
Timeout
)
time
.
Sleep
(
retry
Timeout
)
// NOTE: wait util ps_desired value change
continue
}
...
...
@@ -134,7 +144,7 @@ func (e *EtcdClient) Register(port int) (int, error) {
cancel
()
if
err
!=
nil
{
log
.
Warn
(
err
)
time
.
Sleep
(
e
.
etcd
Timeout
)
time
.
Sleep
(
retry
Timeout
)
continue
}
break
...
...
@@ -144,10 +154,10 @@ func (e *EtcdClient) Register(port int) (int, error) {
}
func
(
e
*
EtcdClient
)
initDesiredPservers
(
ctx
context
.
Context
,
numPservers
int
)
(
*
clientv3
.
TxnResponse
,
error
)
{
return
concurrency
.
NewSTM
(
e
.
etcdC
lient
,
func
(
c
concurrency
.
STM
)
error
{
return
concurrency
.
NewSTM
(
e
.
c
lient
,
func
(
c
concurrency
.
STM
)
error
{
dsStr
:=
c
.
Get
(
PsDesired
)
if
dsStr
==
""
{
c
.
Put
(
PsDesired
,
strconv
.
Itoa
(
numPservers
))
c
.
Put
(
PsDesired
,
strconv
.
Itoa
(
numPservers
)
,
clientv3
.
WithLease
(
e
.
sess
.
Lease
())
)
}
return
nil
},
concurrency
.
WithAbortContext
(
ctx
),
concurrency
.
WithIsolation
(
concurrency
.
RepeatableReads
))
...
...
@@ -156,7 +166,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (
// registerPserverEtcd registers pserver node on etcd using transaction.
func
(
e
*
EtcdClient
)
registerPserverEtcd
(
ctx
context
.
Context
,
port
int
)
(
int
,
error
)
{
var
idx
int
_
,
err
:=
concurrency
.
NewSTM
(
e
.
etcdC
lient
,
func
(
c
concurrency
.
STM
)
error
{
_
,
err
:=
concurrency
.
NewSTM
(
e
.
c
lient
,
func
(
c
concurrency
.
STM
)
error
{
registered
:=
false
for
i
:=
0
;
i
<
e
.
desired
;
i
++
{
psKey
:=
PsPath
+
strconv
.
Itoa
(
i
)
...
...
@@ -165,26 +175,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er
log
.
Debugf
(
"got value (%s) for key: %s"
,
ps
,
psKey
)
if
ps
==
""
{
resp
,
err
:=
e
.
etcdClient
.
Grant
(
context
.
TODO
(),
5
)
if
err
!=
nil
{
log
.
Fatal
(
err
)
}
// find the first id and write info
pserverAddr
:=
e
.
externalIP
+
":"
+
strconv
.
Itoa
(
port
)
c
.
Put
(
psKey
,
pserverAddr
,
clientv3
.
WithLease
(
resp
.
ID
))
c
.
Put
(
psKey
,
pserverAddr
,
clientv3
.
WithLease
(
e
.
sess
.
Lease
()
))
log
.
Debugf
(
"set pserver node %s with value %s"
,
psKey
,
pserverAddr
)
ch
,
kaerr
:=
e
.
etcdClient
.
KeepAlive
(
context
.
TODO
(),
resp
.
ID
)
if
kaerr
!=
nil
{
log
.
Errorf
(
"keepalive etcd node error: %v"
,
kaerr
)
return
kaerr
}
// Eat the keep alive message so etcd
// will not expire the lease.
go
func
(
ch
<-
chan
*
clientv3
.
LeaseKeepAliveResponse
)
{
ka
:=
<-
ch
log
.
Debugf
(
"keepalive: %d
\n
"
,
ka
.
TTL
)
}(
ch
)
log
.
Debug
(
"register finished"
)
idx
=
i
registered
=
true
...
...
@@ -207,7 +201,7 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er
// GetKey gets the value by the specified key
func
(
e
*
EtcdClient
)
GetKey
(
key
string
,
timeout
time
.
Duration
)
([]
byte
,
error
)
{
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
timeout
)
resp
,
err
:=
e
.
etcdC
lient
.
Get
(
ctx
,
key
)
resp
,
err
:=
e
.
c
lient
.
Get
(
ctx
,
key
)
cancel
()
if
err
!=
nil
{
return
[]
byte
{},
err
...
...
@@ -223,7 +217,27 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
// PutKey put into etcd with value by key specified
func
(
e
*
EtcdClient
)
PutKey
(
key
string
,
value
[]
byte
,
timeout
time
.
Duration
)
error
{
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
timeout
)
_
,
err
:=
e
.
etcdClient
.
Put
(
ctx
,
key
,
string
(
value
))
_
,
err
:=
e
.
client
.
Put
(
ctx
,
key
,
string
(
value
),
clientv3
.
WithLease
(
e
.
sess
.
Lease
()
))
cancel
()
return
err
}
// Shutdown shuts down the etcd client gracefully.
func
(
e
*
EtcdClient
)
Shutdown
()
error
{
var
err
error
if
e
.
sess
!=
nil
{
err
=
e
.
sess
.
Close
()
}
if
e
.
client
!=
nil
{
newErr
:=
e
.
client
.
Close
()
if
newErr
!=
nil
{
if
err
!=
nil
{
log
.
Errorln
(
newErr
)
}
else
{
err
=
newErr
}
}
}
return
err
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录