package master import ( "errors" "os" "path/filepath" "sync" "time" log "github.com/sirupsen/logrus" "github.com/PaddlePaddle/recordio" ) // Service is the master server service. type Service struct { chunksPerTask int timeoutDur time.Duration timeoutMax int ready chan struct{} mu sync.Mutex initDone bool taskQueues taskQueues } // Recover recovers service state from etcd. func Recover() (*Service, error) { // TODO(helin): recover from snapshot state from etcd. return nil, nil } func partition(chunks []Chunk, chunksPerTask int) []taskEntry { id := 0 if chunksPerTask <= 0 { chunksPerTask = 1 } var result []taskEntry var cur taskEntry for i, c := range chunks { if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 { cur.Task.ID = id id++ result = append(result, cur) cur.Task.Chunks = nil } cur.Task.Chunks = append(cur.Task.Chunks, c) } if len(cur.Task.Chunks) > 0 { cur.Task.ID = id result = append(result, cur) } return result } // NewService creates a new service. func NewService(chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service { s := &Service{} s.chunksPerTask = chunksPerTask s.timeoutDur = timeoutDur s.timeoutMax = timeoutMax s.taskQueues = taskQueues{} s.taskQueues.Pending = make(map[int]taskEntry) s.ready = make(chan struct{}) return s } // Chunk is a chunk of data consisted of several data instances. type Chunk struct { Path string Index recordio.Index // chunk index } // Task is the basic unit of data instances assigned to trainers. type Task struct { ID int Chunks []Chunk } type taskEntry struct { Epoch int NumTimeout int Task Task } type taskQueues struct { Todo []taskEntry Pending map[int]taskEntry // map from task ID to task entry Done []taskEntry Failed []Task } // *must* be called with s.mu being held. func (s *Service) snapshot() error { // TODO(helin): snapshot state on etcd. return nil } func readChunks(globPaths []string) ([]Chunk, error) { var chunks []Chunk var paths []string for _, s := range globPaths { match, err := filepath.Glob(s) if err != nil { return nil, err } paths = append(paths, match...) } if len(paths) == 0 { return nil, errors.New("no valid dataset specified") } for _, path := range paths { f, err := os.Open(path) if err != nil { return nil, err } index, err := recordio.LoadIndex(f) if err != nil { return nil, err } err = f.Close() if err != nil { return nil, err } count := index.NumChunks() for i := 0; i < count; i++ { chunk := Chunk{ Path: path, Index: *index.ChunkIndex(i), } chunks = append(chunks, chunk) } } return chunks, nil } // SetDataset sets dataset to dispatch for the master server. // // SetDataset can be call multiple times. But only the first call will // be honored. func (s *Service) SetDataset(globPaths []string, dummy *int) error { if len(globPaths) == 0 { return errors.New("no dataset specified") } s.mu.Lock() defer s.mu.Unlock() if s.initDone { // Already initialized. All trainer will call // SetDataset, but we only handle the first one. Treat // other calls as successful but do nothing. return nil } chunks, err := readChunks(globPaths) if err != nil { return err } s.taskQueues.Todo = partition(chunks, s.chunksPerTask) err = s.snapshot() if err != nil { log.Errorln(err) return err } close(s.ready) s.initDone = true return nil } func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { return func() { s.mu.Lock() defer s.mu.Unlock() t, ok := s.taskQueues.Pending[taskID] if !ok { return } if t.Epoch != epoch { // new epoch, task launched after the // schedule of this timeout check. return } defer func() { err := s.snapshot() if err != nil { log.Errorln(err) } }() delete(s.taskQueues.Pending, t.Task.ID) t.NumTimeout++ if t.NumTimeout > s.timeoutMax { log.Warningf("Task %v failed %d times, discard.\n", t.Task, t.NumTimeout) s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task) return } log.Warningf("Task %v failed %d times, retry.\n", t.Task, t.NumTimeout) s.taskQueues.Todo = append(s.taskQueues.Todo, t) } } // GetTask gets a new task from the service. func (s *Service) GetTask(dummy int, task *Task) error { select { case <-s.ready: } s.mu.Lock() defer s.mu.Unlock() if len(s.taskQueues.Todo) == 0 { if len(s.taskQueues.Done) == 0 { if len(s.taskQueues.Pending) == 0 { err := errors.New("all task failed") log.Warningln(err) return err } // TODO(helin): client need to retry in this // error case. Gotcha: RPC client can't // compare returned error with predefined // errors like io.EOF, because the error // instance deserialized from RPC is a // different instance than the error defined // in package. So we need to figure out a way // for client to check this error correctly. err := errors.New("no more available task") log.Warningln(err) return err } s.taskQueues.Todo = s.taskQueues.Done s.taskQueues.Done = nil log.Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.") } t := s.taskQueues.Todo[0] t.Epoch++ s.taskQueues.Todo = s.taskQueues.Todo[1:] s.taskQueues.Pending[t.Task.ID] = t err := s.snapshot() if err != nil { return err } *task = t.Task log.Infof("Task #%d dispatched\n", task.ID) time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch)) return nil } // TaskFinished tell the service that a task is finished. func (s *Service) TaskFinished(taskID int, dummy *int) error { select { case <-s.ready: } s.mu.Lock() defer s.mu.Unlock() log.Infof("Task %d finished\n", taskID) t, ok := s.taskQueues.Pending[taskID] if !ok { err := errors.New("pending task not found") log.Warningln(err) return err } // task finished, reset timeout t.NumTimeout = 0 s.taskQueues.Done = append(s.taskQueues.Done, t) delete(s.taskQueues.Pending, taskID) if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 { log.Infoln("No more todo and pending task, start a new pass.") s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...) s.taskQueues.Done = nil } err := s.snapshot() if err != nil { log.Errorln(err) } return err }