Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
ef67d08c
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
ef67d08c
编写于
7月 11, 2017
作者:
G
gongweibao
提交者:
GitHub
7月 11, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #2719 from gongweibao/taskfail
add TaskFail interface
上级
62da4a1c
dd8685ff
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
89 addition
and
48 deletion
+89
-48
go/master/client.go
go/master/client.go
+6
-1
go/master/client_internal_test.go
go/master/client_internal_test.go
+8
-2
go/master/service.go
go/master/service.go
+70
-41
go/master/service_internal_test.go
go/master/service_internal_test.go
+1
-1
go/pserver/client/client_test.go
go/pserver/client/client_test.go
+4
-3
未找到文件。
go/master/client.go
浏览文件 @
ef67d08c
...
...
@@ -68,7 +68,7 @@ func (c *Client) getRecords() {
// We treat a task as finished whenever the last data
// instance of the task is read. This is not exactly
// correct, but a reasonable approximation.
c
.
taskFinished
(
t
.
ID
)
c
.
taskFinished
(
t
.
Meta
.
ID
)
}
}
...
...
@@ -118,6 +118,11 @@ func (c *Client) taskFinished(taskID int) error {
return
c
.
conn
.
Call
(
"Service.TaskFinished"
,
taskID
,
nil
)
}
// TaskFailed tell the master server as task is failed.
func
(
c
*
Client
)
taskFailed
(
meta
TaskMeta
)
error
{
return
c
.
conn
.
Call
(
"Service.TaskFailed"
,
meta
,
nil
)
}
// NextRecord returns next record in the dataset.
//
// NextRecord will block until the next record is available. It is
...
...
go/master/client_internal_test.go
浏览文件 @
ef67d08c
...
...
@@ -95,10 +95,16 @@ func TestGetFinishTask(t *testing.T) {
t
.
Fatalf
(
"Should get error, pass: %d
\n
"
,
i
)
}
err
=
c
.
taskFinished
(
tasks
[
0
]
.
ID
)
err
=
c
.
taskFinished
(
tasks
[
0
]
.
Meta
.
ID
)
if
err
!=
nil
{
t
.
Fatalf
(
"Error: %v, pass: %d
\n
"
,
err
,
i
)
}
err
=
c
.
taskFailed
(
tasks
[
0
]
.
Meta
)
if
err
!=
nil
{
t
.
Fatalf
(
"Error: %v, pass: %d
\n
"
,
err
,
i
)
}
tasks
=
tasks
[
1
:
]
task
,
err
:=
c
.
getTask
()
if
err
!=
nil
{
...
...
@@ -107,7 +113,7 @@ func TestGetFinishTask(t *testing.T) {
tasks
=
append
(
tasks
,
task
)
for
_
,
task
:=
range
tasks
{
err
=
c
.
taskFinished
(
task
.
ID
)
err
=
c
.
taskFinished
(
task
.
Meta
.
ID
)
if
err
!=
nil
{
t
.
Fatalf
(
"Error: %v, pass: %d
\n
"
,
err
,
i
)
}
...
...
go/master/service.go
浏览文件 @
ef67d08c
...
...
@@ -31,30 +31,36 @@ type Chunk struct {
Index
recordio
.
Index
// chunk index
}
// TaskMeta is a struct which stores task's meta info.
type
TaskMeta
struct
{
ID
int
Epoch
int
}
// Task is the basic unit of data instances assigned to trainers.
type
Task
struct
{
ID
int
Meta
TaskMeta
Chunks
[]
Chunk
}
type
taskEntry
struct
{
Epoch
int
NumTimeout
int
Task
Task
Task
Task
// A task fails if it's timeout or trainer reports it exits unnormally.
NumFailure
int
}
type
taskQueues
struct
{
Todo
[]
taskEntry
Pending
map
[
int
]
taskEntry
// map from task ID to task entry
Done
[]
taskEntry
Failed
[]
Task
Failed
[]
taskEntry
}
// Service is the master server service.
type
Service
struct
{
chunksPerTask
int
timeoutDur
time
.
Duration
timeout
Max
int
failure
Max
int
ready
chan
struct
{}
store
Store
...
...
@@ -73,7 +79,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
var
cur
taskEntry
for
i
,
c
:=
range
chunks
{
if
i
%
chunksPerTask
==
0
&&
len
(
cur
.
Task
.
Chunks
)
>
0
{
cur
.
Task
.
ID
=
id
cur
.
Task
.
Meta
.
ID
=
id
id
++
result
=
append
(
result
,
cur
)
cur
.
Task
.
Chunks
=
nil
...
...
@@ -83,7 +89,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
}
if
len
(
cur
.
Task
.
Chunks
)
>
0
{
cur
.
Task
.
ID
=
id
cur
.
Task
.
Meta
.
ID
=
id
result
=
append
(
result
,
cur
)
}
...
...
@@ -91,11 +97,11 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
}
// NewService creates a new service.
func
NewService
(
store
Store
,
chunksPerTask
int
,
timeoutDur
time
.
Duration
,
timeout
Max
int
)
(
*
Service
,
error
)
{
func
NewService
(
store
Store
,
chunksPerTask
int
,
timeoutDur
time
.
Duration
,
failure
Max
int
)
(
*
Service
,
error
)
{
s
:=
&
Service
{}
s
.
chunksPerTask
=
chunksPerTask
s
.
timeoutDur
=
timeoutDur
s
.
timeoutMax
=
timeout
Max
s
.
failureMax
=
failure
Max
s
.
taskQueues
=
taskQueues
{}
s
.
taskQueues
.
Pending
=
make
(
map
[
int
]
taskEntry
)
s
.
ready
=
make
(
chan
struct
{})
...
...
@@ -257,6 +263,34 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error {
return
nil
}
func
(
s
*
Service
)
processFailedTask
(
t
taskEntry
,
epoch
int
)
{
if
t
.
Task
.
Meta
.
Epoch
!=
epoch
{
// new epoch, task launched after the
// schedule of this timeout check or failed status report.
return
}
defer
func
()
{
err
:=
s
.
snapshot
()
if
err
!=
nil
{
log
.
Errorln
(
err
)
}
}()
delete
(
s
.
taskQueues
.
Pending
,
t
.
Task
.
Meta
.
ID
)
t
.
NumFailure
++
if
t
.
NumFailure
>
s
.
failureMax
{
log
.
Warningf
(
"Task %v failed %d times, discard."
,
t
.
Task
,
t
.
NumFailure
)
s
.
taskQueues
.
Failed
=
append
(
s
.
taskQueues
.
Failed
,
t
)
return
}
log
.
Warningf
(
"Task %v failed %d times, discard."
,
t
.
Task
,
t
.
NumFailure
)
s
.
taskQueues
.
Todo
=
append
(
s
.
taskQueues
.
Todo
,
t
)
return
}
func
(
s
*
Service
)
checkTimeoutFunc
(
taskID
int
,
epoch
int
)
func
()
{
return
func
()
{
s
.
mu
.
Lock
()
...
...
@@ -267,30 +301,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
return
}
if
t
.
Epoch
!=
epoch
{
// new epoch, task launched after the
// schedule of this timeout check.
return
}
defer
func
()
{
err
:=
s
.
snapshot
()
if
err
!=
nil
{
log
.
Errorln
(
err
)
}
}()
delete
(
s
.
taskQueues
.
Pending
,
t
.
Task
.
ID
)
t
.
NumTimeout
++
if
t
.
NumTimeout
>
s
.
timeoutMax
{
log
.
Warningf
(
"Task %v timed out %d times, discard."
,
t
.
Task
,
t
.
NumTimeout
)
s
.
taskQueues
.
Failed
=
append
(
s
.
taskQueues
.
Failed
,
t
.
Task
)
return
}
log
.
Warningf
(
"Task %v timed out %d times, retry."
,
t
.
Task
,
t
.
NumTimeout
)
s
.
taskQueues
.
Todo
=
append
(
s
.
taskQueues
.
Todo
,
t
)
s
.
processFailedTask
(
t
,
epoch
)
}
}
...
...
@@ -339,18 +350,18 @@ func (s *Service) GetTask(dummy int, task *Task) error {
}
t
:=
s
.
taskQueues
.
Todo
[
0
]
t
.
Epoch
++
t
.
Task
.
Meta
.
Epoch
++
s
.
taskQueues
.
Todo
=
s
.
taskQueues
.
Todo
[
1
:
]
s
.
taskQueues
.
Pending
[
t
.
Task
.
ID
]
=
t
s
.
taskQueues
.
Pending
[
t
.
Task
.
Meta
.
ID
]
=
t
err
:=
s
.
snapshot
()
if
err
!=
nil
{
return
err
}
*
task
=
t
.
Task
log
.
WithFields
(
s
.
logFields
())
.
Infof
(
"Task #%
d dispatched."
,
task
.
ID
)
log
.
WithFields
(
s
.
logFields
())
.
Infof
(
"Task #%
v dispatched."
,
t
.
Task
.
Meta
)
time
.
AfterFunc
(
s
.
timeoutDur
,
s
.
checkTimeoutFunc
(
t
.
Task
.
ID
,
t
.
Epoch
))
time
.
AfterFunc
(
s
.
timeoutDur
,
s
.
checkTimeoutFunc
(
t
.
Task
.
Meta
.
ID
,
t
.
Task
.
Meta
.
Epoch
))
return
nil
}
...
...
@@ -365,13 +376,12 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
t
,
ok
:=
s
.
taskQueues
.
Pending
[
taskID
]
if
!
ok
{
err
:=
errors
.
New
(
"pending task not found"
)
log
.
WithFields
(
s
.
logFields
())
.
Warningln
(
"Pending task #%d not found."
,
taskID
)
return
err
return
nil
}
// task finished, reset timeout
t
.
Num
Timeout
=
0
t
.
Num
Failure
=
0
s
.
taskQueues
.
Done
=
append
(
s
.
taskQueues
.
Done
,
t
)
delete
(
s
.
taskQueues
.
Pending
,
taskID
)
...
...
@@ -389,3 +399,22 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
}
return
err
}
// TaskFailed tells the service that a task is failed.
func
(
s
*
Service
)
TaskFailed
(
meta
TaskMeta
,
dummy
*
int
)
error
{
select
{
case
<-
s
.
ready
:
}
s
.
mu
.
Lock
()
defer
s
.
mu
.
Unlock
()
t
,
ok
:=
s
.
taskQueues
.
Pending
[
meta
.
ID
]
if
!
ok
{
log
.
WithFields
(
s
.
logFields
())
.
Warningln
(
"TaskFailed:Pending task #%v not found."
,
t
.
Task
.
Meta
)
return
nil
}
s
.
processFailedTask
(
t
,
meta
.
Epoch
)
return
nil
}
go/master/service_internal_test.go
浏览文件 @
ef67d08c
...
...
@@ -30,7 +30,7 @@ func TestPartionIndex(t *testing.T) {
cs
:=
make
([]
Chunk
,
100
)
ts
:=
partition
(
cs
,
20
)
for
i
:=
range
ts
{
if
ts
[
i
]
.
Task
.
ID
!=
i
{
if
ts
[
i
]
.
Task
.
Meta
.
ID
!=
i
{
t
.
Error
(
ts
[
i
],
i
)
}
}
...
...
go/pserver/client/client_test.go
浏览文件 @
ef67d08c
...
...
@@ -42,7 +42,8 @@ func initClient() [numPserver]int {
ports
[
i
]
=
p
go
func
(
l
net
.
Listener
)
{
s
,
err
:=
pserver
.
NewService
(
0
)
var
cp
pserver
.
Checkpoint
s
,
err
:=
pserver
.
NewService
(
0
,
1
,
""
,
nil
,
cp
)
if
err
!=
nil
{
panic
(
err
)
}
...
...
@@ -174,7 +175,7 @@ func TestNativeClient(t *testing.T) {
// TODO: tmperary disable etcdClient test for dependency of etcd)
func
EtcdClient
(
t
*
testing
.
T
)
{
initEtcdClient
()
etcd
_c
lient
:=
client
.
NewEtcd
(
etcdEndpoints
)
c2
:=
client
.
NewClient
(
etcd
_client
,
etcd_c
lient
.
Desired
(),
selector
(
true
))
etcd
C
lient
:=
client
.
NewEtcd
(
etcdEndpoints
)
c2
:=
client
.
NewClient
(
etcd
Client
,
etcdC
lient
.
Desired
(),
selector
(
true
))
ClientTest
(
t
,
c2
)
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录