Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
e25c155f
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e25c155f
编写于
7月 04, 2017
作者:
G
gongweibao
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add taskfail interface
上级
80f8e242
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
68 addition
and
36 deletion
+68
-36
go/master/client.go
go/master/client.go
+5
-0
go/master/service.go
go/master/service.go
+63
-36
未找到文件。
go/master/client.go
浏览文件 @
e25c155f
...
...
@@ -112,6 +112,11 @@ func (c *Client) taskFinished(taskID int) error {
return
c
.
conn
.
Call
(
"Service.TaskFinished"
,
taskID
,
nil
)
}
// TaskFailed tell the master server as task is failed.
func
(
c
*
Client
)
taskFailed
(
taskID
int
,
epoch
int
)
error
{
return
c
.
conn
.
Call
(
"Service.TaskFinished"
,
taskID
,
epoch
)
}
// NextRecord returns next record in the dataset.
//
// NextRecord will block until the next record is available. It is
...
...
go/master/service.go
浏览文件 @
e25c155f
...
...
@@ -34,29 +34,30 @@ type Chunk struct {
// Task is the basic unit of data instances assigned to trainers.
type Task struct {
	// ID identifies the task; the pending queue is keyed by it.
	ID int
	// Epoch is incremented each time the task is dispatched by GetTask and
	// is compared against the epoch carried by timeout checks and failure
	// reports to discard stale ones.
	Epoch int
	// Chunks are the data chunks that make up this task.
	Chunks []Chunk
}
type
taskEntry
struct
{
Epoch
int
NumTimeout
int
Task
Task
FailedNum
int
}
type
taskQueues
struct
{
Todo
[]
taskEntry
Pending
map
[
int
]
taskEntry
// map from task ID to task entry
Done
[]
taskEntry
Failed
[]
Task
Failed
[]
taskEntry
}
// Service is the master server service.
type
Service
struct
{
chunksPerTask
int
timeoutDur
time
.
Duration
timeoutMax
int
ready
chan
struct
{}
store
Store
chunksPerTask
int
timeoutDur
time
.
Duration
failortimeoutMax
int
ready
chan
struct
{}
store
Store
mu
sync
.
Mutex
initDone
bool
...
...
@@ -91,11 +92,11 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
}
// NewService creates a new service.
func
NewService
(
store
Store
,
chunksPerTask
int
,
timeoutDur
time
.
Duration
,
timeoutMax
int
)
(
*
Service
,
error
)
{
func
NewService
(
store
Store
,
chunksPerTask
int
,
timeoutDur
time
.
Duration
,
failor
timeoutMax
int
)
(
*
Service
,
error
)
{
s
:=
&
Service
{}
s
.
chunksPerTask
=
chunksPerTask
s
.
timeoutDur
=
timeoutDur
s
.
timeoutMax
=
timeoutMax
s
.
failortimeoutMax
=
failor
timeoutMax
s
.
taskQueues
=
taskQueues
{}
s
.
taskQueues
.
Pending
=
make
(
map
[
int
]
taskEntry
)
s
.
ready
=
make
(
chan
struct
{})
...
...
@@ -257,6 +258,34 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error {
return
nil
}
func
(
s
*
Service
)
checkTaskStatus
(
t
taskEntry
,
epoch
int
)
{
if
t
.
Task
.
Epoch
!=
epoch
{
// new epoch, task launched after the
// schedule of this timeout check or failed status report.
return
}
defer
func
()
{
err
:=
s
.
snapshot
()
if
err
!=
nil
{
log
.
Errorln
(
err
)
}
}()
delete
(
s
.
taskQueues
.
Pending
,
t
.
Task
.
ID
)
t
.
NumTimeout
++
if
t
.
NumTimeout
+
t
.
FailedNum
>
s
.
failortimeoutMax
{
log
.
Warningf
(
"Task %v timed out %d times and failed %d times, discard."
,
t
.
Task
,
t
.
NumTimeout
,
t
.
FailedNum
)
s
.
taskQueues
.
Failed
=
append
(
s
.
taskQueues
.
Failed
,
t
)
return
}
log
.
Warningf
(
"Task %v timed out %d times and failed %d times, discard."
,
t
.
Task
,
t
.
NumTimeout
,
t
.
FailedNum
)
s
.
taskQueues
.
Todo
=
append
(
s
.
taskQueues
.
Todo
,
t
)
return
}
func
(
s
*
Service
)
checkTimeoutFunc
(
taskID
int
,
epoch
int
)
func
()
{
return
func
()
{
s
.
mu
.
Lock
()
...
...
@@ -267,30 +296,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
return
}
if
t
.
Epoch
!=
epoch
{
// new epoch, task launched after the
// schedule of this timeout check.
return
}
defer
func
()
{
err
:=
s
.
snapshot
()
if
err
!=
nil
{
log
.
Errorln
(
err
)
}
}()
delete
(
s
.
taskQueues
.
Pending
,
t
.
Task
.
ID
)
t
.
NumTimeout
++
if
t
.
NumTimeout
>
s
.
timeoutMax
{
log
.
Warningf
(
"Task %v timed out %d times, discard."
,
t
.
Task
,
t
.
NumTimeout
)
s
.
taskQueues
.
Failed
=
append
(
s
.
taskQueues
.
Failed
,
t
.
Task
)
return
}
log
.
Warningf
(
"Task %v timed out %d times, retry."
,
t
.
Task
,
t
.
NumTimeout
)
s
.
taskQueues
.
Todo
=
append
(
s
.
taskQueues
.
Todo
,
t
)
s
.
checkTaskStatus
(
t
,
epoch
)
}
}
...
...
@@ -339,7 +345,7 @@ func (s *Service) GetTask(dummy int, task *Task) error {
}
t
:=
s
.
taskQueues
.
Todo
[
0
]
t
.
Epoch
++
t
.
Task
.
Epoch
++
s
.
taskQueues
.
Todo
=
s
.
taskQueues
.
Todo
[
1
:
]
s
.
taskQueues
.
Pending
[
t
.
Task
.
ID
]
=
t
err
:=
s
.
snapshot
()
...
...
@@ -348,9 +354,9 @@ func (s *Service) GetTask(dummy int, task *Task) error {
}
*
task
=
t
.
Task
log
.
WithFields
(
s
.
logFields
())
.
Infof
(
"Task #%
d dispatched."
,
task
.
ID
)
log
.
WithFields
(
s
.
logFields
())
.
Infof
(
"Task #%
v dispatched."
,
t
)
time
.
AfterFunc
(
s
.
timeoutDur
,
s
.
checkTimeoutFunc
(
t
.
Task
.
ID
,
t
.
Epoch
))
time
.
AfterFunc
(
s
.
timeoutDur
,
s
.
checkTimeoutFunc
(
t
.
Task
.
ID
,
t
.
Task
.
Epoch
))
return
nil
}
...
...
@@ -372,6 +378,7 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
// task finished, reset timeout
t
.
NumTimeout
=
0
t
.
FailedNum
=
0
s
.
taskQueues
.
Done
=
append
(
s
.
taskQueues
.
Done
,
t
)
delete
(
s
.
taskQueues
.
Pending
,
taskID
)
...
...
@@ -389,3 +396,23 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
}
return
err
}
// TaskFailed tell the service that a task is failed.
func
(
s
*
Service
)
TaskFailed
(
taskID
int
,
epoch
int
)
error
{
select
{
case
<-
s
.
ready
:
}
s
.
mu
.
Lock
()
defer
s
.
mu
.
Unlock
()
t
,
ok
:=
s
.
taskQueues
.
Pending
[
taskID
]
if
!
ok
{
err
:=
errors
.
New
(
"pending task not found"
)
log
.
WithFields
(
s
.
logFields
())
.
Warningln
(
"TaskFailed:Pending task #%d not found."
,
taskID
)
return
err
}
s
.
checkTaskStatus
(
t
,
epoch
)
return
nil
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录