Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
7dad0266
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7dad0266
编写于
6月 24, 2017
作者:
H
Helin Wang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Master server registers itself to etcd.
上级
42313a3c
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
56 addition
and
48 deletion
+56
-48
go/cmd/master/master.go
go/cmd/master/master.go
+10
-4
go/master/etcd_client.go
go/master/etcd_client.go
+46
-44
未找到文件。
go/cmd/master/master.go
浏览文件 @
7dad0266
package
main
import
(
"fmt"
"net"
"net/http"
"net/rpc"
...
...
@@ -12,13 +13,13 @@ import (
log
"github.com/sirupsen/logrus"
"github.com/PaddlePaddle/Paddle/go/master"
"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
)
func
main
()
{
port
:=
flag
.
Int
(
"port"
,
8080
,
"port of the master server."
)
ttlSec
:=
flag
.
Int
(
"ttl"
,
60
,
"etcd lease TTL in seconds."
)
endpoints
:=
flag
.
String
(
"endpoints"
,
""
,
"comma separated etcd endpoints. If empty, fault tolerance will not be enabled."
)
endpoints
:=
flag
.
String
(
"endpoints"
,
"
http://127.0.0.1:2379
"
,
"comma separated etcd endpoints. If empty, fault tolerance will not be enabled."
)
taskTimeoutDur
:=
flag
.
Duration
(
"task_timout_dur"
,
20
*
time
.
Minute
,
"task timout duration."
)
taskTimeoutMax
:=
flag
.
Int
(
"task_timeout_max"
,
3
,
"max timtout count for each task before it being declared failed task."
)
chunkPerTask
:=
flag
.
Int
(
"chunk_per_task"
,
10
,
"chunk per task."
)
...
...
@@ -31,8 +32,13 @@ func main() {
var
store
master
.
Store
if
*
endpoints
!=
""
{
eps
:=
strings
.
Split
(
*
endpoints
,
","
)
var
err
error
store
,
err
=
master
.
NewEtcd
(
eps
,
master
.
DefaultLockPath
,
master
.
DefaultStatePath
,
*
ttlSec
)
ip
,
err
:=
networkhelper
.
GetExternalIP
()
if
err
!=
nil
{
log
.
Fatal
(
err
)
}
addr
:=
fmt
.
Sprintf
(
"%s:%d"
,
ip
,
*
port
)
store
,
err
=
master
.
NewEtcdClient
(
eps
,
addr
,
master
.
DefaultLockPath
,
master
.
DefaultAddrPath
,
master
.
DefaultStatePath
,
*
ttlSec
)
if
err
!=
nil
{
log
.
Fatal
(
err
)
}
...
...
go/master/etcd_
store
.go
→
go/master/etcd_
client
.go
浏览文件 @
7dad0266
...
...
@@ -2,7 +2,7 @@ package master
import
(
"context"
"
sync
"
"
time
"
"github.com/coreos/etcd/clientv3"
"github.com/coreos/etcd/clientv3/concurrency"
...
...
@@ -14,22 +14,22 @@ const (
DefaultLockPath
=
"/master/lock"
// DefaultStatePath is the default etcd key for master state.
DefaultStatePath
=
"/master/state"
// DefaultAddrPath is the default etcd key for master address.
DefaultAddrPath
=
"/master/addr"
)
// Etcd
is the etcd abstraction
that master uses for fault tolerance
// Etcd
Client is the etcd client
that master uses for fault tolerance
// and service registry.
type
Etcd
struct
{
type
Etcd
Client
struct
{
lockPath
string
statePath
string
ttlSec
int
client
*
clientv3
.
Client
mu
sync
.
Mutex
lock
*
concurrency
.
Mutex
lock
*
concurrency
.
Mutex
}
// NewEtcd creates a new Etcd.
func
NewEtcd
(
endpoints
[]
string
,
lockPath
,
statePath
string
,
ttlSec
int
)
(
*
Etcd
,
error
)
{
// NewEtcdClient creates a new EtcdClient.
func
NewEtcdClient
(
endpoints
[]
string
,
addr
string
,
lockPath
,
addrPath
,
statePath
string
,
ttlSec
int
)
(
*
EtcdClient
,
error
)
{
log
.
Debugf
(
"Connecting to etcd at %v"
,
endpoints
)
// TODO(helin): gracefully shutdown etcd store. Because etcd
// store holds an etcd lock, even though the lock will expire
// when the lease times out, we need to implement graceful
...
...
@@ -53,27 +53,35 @@ func NewEtcd(endpoints []string, lockPath, statePath string, ttlSec int) (*Etcd,
// one master running, but split-brain problem may cause
// multiple master servers running), and the cluster management
// software will kill one of them.
log
.
Info
f
(
"Trying to acquire lock at %s."
,
lockPath
)
log
.
Debug
f
(
"Trying to acquire lock at %s."
,
lockPath
)
err
=
lock
.
Lock
(
context
.
TODO
())
if
err
!=
nil
{
return
nil
,
err
}
log
.
Infof
(
"Successfully acquired lock at %s."
,
lockPath
)
e
:=
&
Etcd
{}
e
.
client
=
cli
e
.
lock
=
lock
e
.
lockPath
=
lockPath
e
.
statePath
=
statePath
e
.
ttlSec
=
ttlSec
log
.
Debugf
(
"Successfully acquired lock at %s."
,
lockPath
)
put
:=
clientv3
.
OpPut
(
addrPath
,
string
(
addr
))
resp
,
err
:=
cli
.
Txn
(
context
.
Background
())
.
If
(
lock
.
IsOwner
())
.
Then
(
put
)
.
Commit
()
if
err
!=
nil
{
return
nil
,
err
}
if
!
resp
.
Succeeded
{
log
.
Fatal
(
"No longer owns the master lock. Exiting."
)
}
e
:=
&
EtcdClient
{
lockPath
:
lockPath
,
statePath
:
statePath
,
client
:
cli
,
lock
:
lock
,
}
return
e
,
nil
}
// Save saves the state into the etcd.
func
(
e
*
Etcd
)
Save
(
state
[]
byte
)
error
{
e
.
mu
.
Lock
()
defer
e
.
mu
.
Unlock
()
func
(
e
*
EtcdClient
)
Save
(
state
[]
byte
)
error
{
ctx
:=
context
.
TODO
()
put
:=
clientv3
.
OpPut
(
e
.
statePath
,
string
(
state
))
resp
,
err
:=
e
.
client
.
Txn
(
ctx
)
.
If
(
e
.
lock
.
IsOwner
())
.
Then
(
put
)
.
Commit
()
...
...
@@ -82,17 +90,21 @@ func (e *Etcd) Save(state []byte) error {
}
if
!
resp
.
Succeeded
{
log
.
Errorln
(
"No longer owns the lock, trying to lock and save again."
)
sess
,
err
:=
concurrency
.
NewSession
(
e
.
client
,
concurrency
.
WithTTL
(
e
.
ttlSec
))
if
err
!=
nil
{
return
err
}
e
.
lock
=
concurrency
.
NewMutex
(
sess
,
e
.
lockPath
)
log
.
Infof
(
"Try to acquire lock at %s."
,
e
.
lockPath
)
err
=
e
.
lock
.
Lock
(
context
.
TODO
())
log
.
Errorln
(
"No longer owns the lock, trying to lock again"
)
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
err
:=
e
.
lock
.
Lock
(
ctx
)
cancel
()
if
err
!=
nil
{
return
err
// We lost the master lock and cannot acquire
// it back, which means some other master has
// already started. We don't want the cluster
// management system to kill the master server
// that is holding the lock and running
// correctly. So the most feasible solution is
// to kill the current master server. The current
// state is not saved, but the trainer's RPC
// call will fail, so the trainer will retry.
log
.
Fatalf
(
"Could not acquire the lock at %s: %v. Exiting."
,
e
.
lockPath
,
err
)
}
log
.
Infof
(
"Successfully acquired lock at %s."
,
e
.
lockPath
)
return
e
.
Save
(
state
)
...
...
@@ -102,8 +114,7 @@ func (e *Etcd) Save(state []byte) error {
}
// Load loads the state from etcd.
func
(
e
*
Etcd
)
Load
()
([]
byte
,
error
)
{
e
.
mu
.
Lock
()
func
(
e
*
EtcdClient
)
Load
()
([]
byte
,
error
)
{
ctx
:=
context
.
TODO
()
get
:=
clientv3
.
OpGet
(
e
.
statePath
)
...
...
@@ -114,14 +125,7 @@ func (e *Etcd) Load() ([]byte, error) {
if
!
resp
.
Succeeded
{
log
.
Errorln
(
"No longer owns the lock, trying to lock and load again."
)
sess
,
err
:=
concurrency
.
NewSession
(
e
.
client
)
if
err
!=
nil
{
return
nil
,
err
}
e
.
lock
=
concurrency
.
NewMutex
(
sess
,
e
.
lockPath
)
err
=
e
.
lock
.
Lock
(
context
.
TODO
())
e
.
mu
.
Unlock
()
err
=
e
.
lock
.
Lock
(
context
.
Background
())
if
err
!=
nil
{
return
nil
,
err
}
...
...
@@ -132,11 +136,9 @@ func (e *Etcd) Load() ([]byte, error) {
kvs
:=
resp
.
Responses
[
0
]
.
GetResponseRange
()
.
Kvs
if
len
(
kvs
)
==
0
{
// No state exists
e
.
mu
.
Unlock
()
return
nil
,
nil
}
state
:=
kvs
[
0
]
.
Value
e
.
mu
.
Unlock
()
return
state
,
nil
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录