client.go 3.6 KB
Newer Older
D
dongzhihong 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

15 16 17
package master

import (
H
Helin Wang 已提交
18
	"os"
19
	"time"
20 21

	"github.com/PaddlePaddle/Paddle/go/connection"
H
Helin Wang 已提交
22
	"github.com/PaddlePaddle/recordio"
H
Helin Wang 已提交
23
	log "github.com/sirupsen/logrus"
24 25 26 27 28
)

// Client is the client of the master server.
type Client struct {
	conn *connection.Conn
G
gongweibao 已提交
29 30 31 32 33 34
	ch   chan record
}

// record is a single item passed from getRecords to NextRecord through
// Client.ch: either the raw bytes of a record or the error hit while
// scanning for one.
type record struct {
	r   []byte
	err error
}

// NewClient creates a new Client.
38 39 40
//
// bufSize is the record buffer size. NextRecord will read from this
// buffer.
41
func NewClient(addrCh <-chan string, bufSize int) *Client {
42 43
	c := &Client{}
	c.conn = connection.New()
G
gongweibao 已提交
44
	c.ch = make(chan record, bufSize)
45
	go c.monitorMaster(addrCh)
H
Helin Wang 已提交
46
	go c.getRecords()
47 48 49
	return c
}

H
Helin Wang 已提交
50 51 52 53
func (c *Client) getRecords() {
	for {
		t, err := c.getTask()
		if err != nil {
H
Helin Wang 已提交
54
			// getTask call.
55 56
			log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
			time.Sleep(3 * time.Second)
H
Helin Wang 已提交
57 58 59 60 61 62
			continue
		}

		for _, chunk := range t.Chunks {
			f, err := os.Open(chunk.Path)
			if err != nil {
H
Helin Wang 已提交
63
				log.Errorln(err)
H
Helin Wang 已提交
64 65 66 67 68
				continue
			}

			s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
			for s.Scan() {
G
gongweibao 已提交
69
				c.ch <- record{s.Record(), nil}
H
Helin Wang 已提交
70 71
			}

72
			if s.Err() != nil {
G
gongweibao 已提交
73
				c.ch <- record{nil, s.Err()}
H
Helin Wang 已提交
74
				log.Errorln(err, chunk.Path)
75 76
			}

H
Helin Wang 已提交
77 78
			err = f.Close()
			if err != nil {
H
Helin Wang 已提交
79
				log.Errorln(err)
H
Helin Wang 已提交
80 81
			}
		}
82 83 84 85

		// We treat a task as finished whenever the last data
		// instance of the task is read. This is not exactly
		// correct, but a reasonable approximation.
H
Helin Wang 已提交
86 87 88 89
		err = c.taskFinished(t.Meta.ID)
		if err != nil {
			log.Errorln(err)
		}
H
Helin Wang 已提交
90 91 92
	}
}

93
func (c *Client) monitorMaster(addrCh <-chan string) {
94
	lastMaster := ""
95
	for curMaster := range addrCh {
H
Helin Wang 已提交
96
		// connect to the new address once address changed.
97 98 99 100
		if curMaster != lastMaster {
			if curMaster == "" {
				err := c.conn.Close()
				if err != nil {
H
Helin Wang 已提交
101
					log.Errorln(err)
102 103 104 105
				}
			} else {
				err := c.conn.Connect(curMaster)
				if err != nil {
H
Helin Wang 已提交
106
					log.Errorln(err)
107 108 109 110 111 112 113 114 115 116 117 118

					// connect to addr failed, set
					// to last known addr in order
					// to retry next time.
					curMaster = lastMaster
				}
			}
		}
		lastMaster = curMaster
	}
}

119 120 121 122 123 124 125 126
// SetDataset sets the dataset for the master server to dispatch.
//
// globPaths is forwarded unchanged as the argument of the
// "Service.SetDataset" call.
//
// SetDataset can be called multiple times from different nodes, but
// only the first call will be honored.
func (c *Client) SetDataset(globPaths []string) error {
	err := c.conn.Call("Service.SetDataset", globPaths, nil)
	return err
}

H
Helin Wang 已提交
127 128
// getTask gets a new task from the master server.
func (c *Client) getTask() (Task, error) {
129
	var t Task
130
	err := c.conn.Call("Service.GetTask", 0, &t)
131 132 133 134
	return t, err
}

// TaskFinished tells the master server a task is finished.
H
Helin Wang 已提交
135
func (c *Client) taskFinished(taskID int) error {
136
	return c.conn.Call("Service.TaskFinished", taskID, nil)
137
}
H
Helin Wang 已提交
138

G
gongweibao 已提交
139
// TaskFailed tell the master server as task is failed.
G
gongweibao 已提交
140
func (c *Client) taskFailed(meta TaskMeta) error {
G
gongweibao 已提交
141
	return c.conn.Call("Service.TaskFailed", meta, nil)
G
gongweibao 已提交
142 143
}

H
Helin Wang 已提交
144 145
// NextRecord returns next record in the dataset.
//
H
Helin Wang 已提交
146
// NextRecord will block until the next record is available. It is
H
Helin Wang 已提交
147
// thread-safe.
G
gongweibao 已提交
148 149 150
func (c *Client) NextRecord() ([]byte, error) {
	r := <-c.ch
	return r.r, r.err
H
Helin Wang 已提交
151
}