Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
0bebaa05
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
0bebaa05
编写于
6月 13, 2017
作者:
H
Helin Wang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix according to comments
上级
f6148eb2
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
101 addition
and
59 deletion
+101
-59
go/master/client.go
go/master/client.go
+2
-0
go/master/client_test.go
go/master/client_test.go
+15
-0
go/master/service.go
go/master/service.go
+66
-44
go/pserver/client.go
go/pserver/client.go
+18
-15
未找到文件。
go/master/client.go
浏览文件 @
0bebaa05
...
...
@@ -28,6 +28,8 @@ func NewClient(addr Addresser) *Client {
func
(
c
*
Client
)
monitorMaster
(
addr
Addresser
)
{
lastMaster
:=
""
monitor
:=
func
()
{
// get the latest address of the master server,
// connect to the new address once address changed.
curMaster
:=
addr
.
Address
()
if
curMaster
!=
lastMaster
{
if
curMaster
==
""
{
...
...
go/master/client_test.go
浏览文件 @
0bebaa05
...
...
@@ -11,6 +11,8 @@ import (
"testing"
"time"
log
"github.com/sirupsen/logrus"
"github.com/PaddlePaddle/Paddle/go/master"
"github.com/PaddlePaddle/recordio"
)
...
...
@@ -23,6 +25,8 @@ const (
var
port
int
func
init
()
{
log
.
SetLevel
(
log
.
ErrorLevel
)
l
,
err
:=
net
.
Listen
(
"tcp"
,
":0"
)
if
err
!=
nil
{
panic
(
err
)
...
...
@@ -91,6 +95,17 @@ func TestClientFull(t *testing.T) {
t
.
Fatal
(
i
,
"should get error."
)
}
err
=
c
.
TaskFinished
(
tasks
[
0
]
.
ID
)
if
err
!=
nil
{
t
.
Fatal
(
err
)
}
tasks
=
tasks
[
1
:
]
task
,
err
:=
c
.
GetTask
()
if
err
!=
nil
{
t
.
Fatal
(
err
)
}
tasks
=
append
(
tasks
,
task
)
for
_
,
task
:=
range
tasks
{
err
=
c
.
TaskFinished
(
task
.
ID
)
if
err
!=
nil
{
...
...
go/master/service.go
浏览文件 @
0bebaa05
...
...
@@ -2,12 +2,13 @@ package master
import
(
"errors"
"log"
"os"
"path/filepath"
"sync"
"time"
log
"github.com/sirupsen/logrus"
"github.com/PaddlePaddle/recordio"
)
...
...
@@ -112,7 +113,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
}
if
len
(
paths
)
==
0
{
return
nil
,
errors
.
New
(
"no valid datset specified"
)
return
nil
,
errors
.
New
(
"no valid dat
a
set specified"
)
}
for
_
,
path
:=
range
paths
{
...
...
@@ -170,6 +171,7 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error {
err
=
s
.
snapshot
()
if
err
!=
nil
{
log
.
Errorln
(
err
)
return
err
}
...
...
@@ -178,6 +180,43 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error {
return
nil
}
func
(
s
*
Service
)
checkTimeoutFunc
(
taskID
int
,
epoch
int
)
func
()
{
return
func
()
{
s
.
mu
.
Lock
()
defer
s
.
mu
.
Unlock
()
t
,
ok
:=
s
.
taskQueues
.
Pending
[
taskID
]
if
!
ok
{
return
}
if
t
.
Epoch
!=
epoch
{
// new epoch, task launched after the
// schedule of this timeout check.
return
}
defer
func
()
{
err
:=
s
.
snapshot
()
if
err
!=
nil
{
log
.
Errorln
(
err
)
}
}()
delete
(
s
.
taskQueues
.
Pending
,
t
.
Task
.
ID
)
t
.
NumTimeout
++
if
t
.
NumTimeout
>
s
.
timeoutMax
{
log
.
Warningf
(
"Task %v failed %d times, discard.
\n
"
,
t
.
Task
,
t
.
NumTimeout
)
s
.
taskQueues
.
Failed
=
append
(
s
.
taskQueues
.
Failed
,
t
.
Task
)
return
}
log
.
Warningf
(
"Task %v failed %d times, retry.
\n
"
,
t
.
Task
,
t
.
NumTimeout
)
s
.
taskQueues
.
Todo
=
append
(
s
.
taskQueues
.
Todo
,
t
)
}
}
// GetTask gets a new task from the service.
func
(
s
*
Service
)
GetTask
(
dummy
int
,
task
*
Task
)
error
{
select
{
...
...
@@ -190,19 +229,25 @@ func (s *Service) GetTask(dummy int, task *Task) error {
if
len
(
s
.
taskQueues
.
Todo
)
==
0
{
if
len
(
s
.
taskQueues
.
Done
)
==
0
{
if
len
(
s
.
taskQueues
.
Pending
)
==
0
{
return
errors
.
New
(
"all task failed"
)
err
:=
errors
.
New
(
"all task failed"
)
log
.
Warningln
(
err
)
return
err
}
// TODO(helin): client need to retry in this
// error case. Gotcha: RPC client can't
// compare returned error with predefined
// errors like io.EOF
. B
ecause interface don't
// errors like io.EOF
, b
ecause interface don't
// have same dynamic value when in different
// process.
return
errors
.
New
(
"no more available task"
)
// process. So we need to figure out a way for
// client to check this error correctly.
err
:=
errors
.
New
(
"no more available task"
)
log
.
Warningln
(
err
)
return
err
}
s
.
taskQueues
.
Todo
=
s
.
taskQueues
.
Done
s
.
taskQueues
.
Todo
=
nil
s
.
taskQueues
.
Done
=
nil
log
.
Infoln
(
"No more todo task, but trainer is requesting task to do. Move all done task to todo."
)
}
t
:=
s
.
taskQueues
.
Todo
[
0
]
...
...
@@ -215,41 +260,9 @@ func (s *Service) GetTask(dummy int, task *Task) error {
}
*
task
=
t
.
Task
log
.
Infof
(
"Task #%d dispatched
\n
"
,
task
.
ID
)
time
.
AfterFunc
(
s
.
timeoutDur
,
func
(
taskID
int
,
epoch
int
)
func
()
{
return
func
()
{
s
.
mu
.
Lock
()
defer
s
.
mu
.
Unlock
()
t
,
ok
:=
s
.
taskQueues
.
Pending
[
taskID
]
if
!
ok
{
return
}
if
t
.
Epoch
!=
epoch
{
// new epoch, task launched after the
// schedule of this timeout check.
return
}
defer
func
()
{
err
:=
s
.
snapshot
()
if
err
!=
nil
{
log
.
Println
(
err
)
}
}()
delete
(
s
.
taskQueues
.
Pending
,
t
.
Task
.
ID
)
t
.
NumTimeout
++
if
t
.
NumTimeout
>
s
.
timeoutMax
{
s
.
taskQueues
.
Failed
=
append
(
s
.
taskQueues
.
Failed
,
t
.
Task
)
return
}
s
.
taskQueues
.
Todo
=
append
(
s
.
taskQueues
.
Todo
,
t
)
}
}(
t
.
Task
.
ID
,
t
.
Epoch
))
time
.
AfterFunc
(
s
.
timeoutDur
,
s
.
checkTimeoutFunc
(
t
.
Task
.
ID
,
t
.
Epoch
))
return
nil
}
...
...
@@ -262,9 +275,13 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
s
.
mu
.
Lock
()
defer
s
.
mu
.
Unlock
()
log
.
Infof
(
"Task %d finished
\n
"
,
taskID
)
t
,
ok
:=
s
.
taskQueues
.
Pending
[
taskID
]
if
!
ok
{
return
errors
.
New
(
"pending task not found"
)
err
:=
errors
.
New
(
"pending task not found"
)
log
.
Warningln
(
err
)
return
err
}
// task finished, reset timeout
...
...
@@ -272,10 +289,15 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
s
.
taskQueues
.
Done
=
append
(
s
.
taskQueues
.
Done
,
t
)
delete
(
s
.
taskQueues
.
Pending
,
taskID
)
if
len
(
s
.
taskQueues
.
Pending
)
==
0
{
if
len
(
s
.
taskQueues
.
Pending
)
==
0
&&
len
(
s
.
taskQueues
.
Todo
)
==
0
{
log
.
Infoln
(
"No more todo and pending task, start a new pass."
)
s
.
taskQueues
.
Todo
=
append
(
s
.
taskQueues
.
Todo
,
s
.
taskQueues
.
Done
...
)
s
.
taskQueues
.
Done
=
nil
}
return
s
.
snapshot
()
err
:=
s
.
snapshot
()
if
err
!=
nil
{
log
.
Errorln
(
err
)
}
return
err
}
go/pserver/client.go
浏览文件 @
0bebaa05
...
...
@@ -57,26 +57,29 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
}
for
i
:=
range
lastServers
{
if
lastServers
[
i
]
.
Addr
!=
curServers
[
i
]
.
Addr
{
if
curServers
[
i
]
.
Addr
==
""
{
err
:=
c
.
pservers
[
i
]
.
Close
()
if
err
!=
nil
{
log
.
Println
(
err
)
}
continue
}
if
lastServers
[
i
]
.
Addr
==
curServers
[
i
]
.
Addr
{
continue
}
err
:=
c
.
pservers
[
i
]
.
Connect
(
curServers
[
i
]
.
Addr
)
if
curServers
[
i
]
.
Addr
==
""
{
err
:=
c
.
pservers
[
i
]
.
Close
()
if
err
!=
nil
{
log
.
Println
(
err
)
// connect to addr failed, set
// to last known addr in order
// to retry next time.
curServers
[
i
]
.
Addr
=
lastServers
[
i
]
.
Addr
}
continue
}
err
:=
c
.
pservers
[
i
]
.
Connect
(
curServers
[
i
]
.
Addr
)
if
err
!=
nil
{
log
.
Println
(
err
)
// connect to addr failed, set
// to last known addr in order
// to retry next time.
curServers
[
i
]
.
Addr
=
lastServers
[
i
]
.
Addr
}
}
lastServers
=
curServers
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录