client.go 6.2 KB
Newer Older
D
dongzhihong 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

15 16 17
package master

import (
H
Helin Wang 已提交
18
	"os"
19
	"time"
20 21

	"github.com/PaddlePaddle/Paddle/go/connection"
H
Helin Wang 已提交
22
	"github.com/PaddlePaddle/recordio"
23
	"github.com/coreos/etcd/clientv3"
24
	log "github.com/inconshreveable/log15"
25 26 27 28
)

// Client is the client of the master server.
type Client struct {
29 30 31
	conn    *connection.Conn
	ch      chan record
	bufSize int
G
gongweibao 已提交
32 33 34 35 36
}

type record struct {
	r   []byte
	err error
37 38
}

39
// WithBuffer sets the client to buffer the training record.
40 41 42
//
// bufSize is the record buffer size. NextRecord will read from this
// buffer.
43 44 45 46 47
func WithBuffer(bufSize int) func(*Client) error {
	return func(c *Client) error {
		if bufSize <= 0 {
			return nil
		}
48
		c.bufSize = bufSize
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
		return nil
	}
}

// WithAddr sets the client to use fixed master address.
//
// The address is handed to a monitorMaster goroutine, which performs the
// actual connect asynchronously. The returned option never fails.
func WithAddr(addr string) func(c *Client) error {
	return func(c *Client) error {
		ch := make(chan string, 1)
		ch <- addr
		// The address is fixed, so nothing else will ever be sent.
		// Closing the channel lets monitorMaster's range loop end after
		// it has handled addr instead of leaving the goroutine blocked
		// forever on an unreachable channel.
		close(ch)
		go c.monitorMaster(ch)
		return nil
	}
}

// WithEtcd sets the client to use etcd for master discovery.
func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
	return func(c *Client) error {
66 67 68 69 70 71 72 73 74
		var cli *clientv3.Client
		f := func() error {
			var err error
			cli, err = clientv3.New(clientv3.Config{
				Endpoints:   endpoints,
				DialTimeout: timeout,
			})
			return err
		}
75 76 77
		for {
			err := f()
			if err != nil {
78
				log.Warn("create etcd client error", log.Ctx{"error": err})
79 80 81 82
			} else {
				break
			}
			time.Sleep(time.Second)
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
		}

		ch := make(chan string, 1)
		a, err := GetKey(cli, DefaultAddrPath, timeout)
		if err != nil {
			return err
		}

		if a != "" {
			// Master is registered, send to the master address
			// channel.
			ch <- a
		}

		go watchKey(cli, DefaultAddrPath, ch)
		go c.monitorMaster(ch)
		return nil
	}
}

// NewClient creates a new Client.
func NewClient(opts ...func(*Client) error) (*Client, error) {
105 106
	c := &Client{}
	c.conn = connection.New()
107 108 109 110 111 112 113

	for _, opt := range opts {
		err := opt(c)
		if err != nil {
			return nil, err
		}
	}
114
	c.ch = make(chan record, c.bufSize)
115
	return c, nil
116 117
}

118 119 120 121 122 123
// StartGetRecords must be called at the beginning of each pass.
//
// It launches getRecords for passID in a new goroutine and returns
// immediately; the records that goroutine produces are read with
// NextRecord.
func (c *Client) StartGetRecords(passID int) {
	go c.getRecords(passID)
}

func (c *Client) getRecords(passID int) {
G
gongweibao 已提交
124
	i := 0
H
Helin Wang 已提交
125
	for {
126
		t, err := c.getTask(passID)
H
Helin Wang 已提交
127
		if err != nil {
128 129 130 131 132 133
			if err.Error() == ErrPassBefore.Error() ||
				err.Error() == ErrNoMoreAvailable.Error() ||
				err.Error() == ErrAllTaskFailed.Error() {
				c.ch <- record{nil, err}
				break
			}
G
gongweibao 已提交
134 135 136 137 138

			if i%60 == 0 {
				log.Debug("getTask of passID error.",
					log.Ctx{"error": err, "passID": passID})
				i = 0
139
			}
G
gongweibao 已提交
140 141 142 143 144 145 146 147

			// if err.Error() == ErrPassAfter.Error()
			//   wait util last pass finishes
			// if other error such as network error
			//   wait to reconnect or task time out
			time.Sleep(time.Second * 3)
			i += 3
			continue
H
Helin Wang 已提交
148 149 150
		}

		for _, chunk := range t.Chunks {
151 152
			f, e := os.Open(chunk.Path)
			if e != nil {
153
				log.Error("error open chunk", log.Ctx{"error": e})
H
Helin Wang 已提交
154 155 156 157 158
				continue
			}

			s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
			for s.Scan() {
G
gongweibao 已提交
159
				c.ch <- record{s.Record(), nil}
H
Helin Wang 已提交
160 161
			}

162
			if s.Err() != nil {
G
gongweibao 已提交
163
				c.ch <- record{nil, s.Err()}
164 165 166 167
				log.Error(
					"error scan chunk",
					log.Ctx{"error": err, "path": chunk.Path},
				)
168 169
			}

H
Helin Wang 已提交
170 171
			err = f.Close()
			if err != nil {
172
				log.Error("error close record file", log.Ctx{"error": err})
H
Helin Wang 已提交
173 174
			}
		}
175 176 177 178

		// We treat a task as finished whenever the last data
		// instance of the task is read. This is not exactly
		// correct, but a reasonable approximation.
H
Helin Wang 已提交
179 180
		err = c.taskFinished(t.Meta.ID)
		if err != nil {
181
			log.Error("task finish callback error.", log.Ctx{"error": err})
H
Helin Wang 已提交
182
		}
H
Helin Wang 已提交
183 184 185
	}
}

186
func (c *Client) monitorMaster(addrCh <-chan string) {
187
	lastMaster := ""
188
	for curMaster := range addrCh {
H
Helin Wang 已提交
189
		// connect to the new address once address changed.
190 191 192 193
		if curMaster != lastMaster {
			if curMaster == "" {
				err := c.conn.Close()
				if err != nil {
194
					log.Error("close old master addr error", log.Ctx{"error": err})
195 196 197 198
				}
			} else {
				err := c.conn.Connect(curMaster)
				if err != nil {
199
					log.Error("connect to new master addr error", log.Ctx{"error": err})
200 201 202 203 204 205 206 207 208 209 210 211

					// connect to addr failed, set
					// to last known addr in order
					// to retry next time.
					curMaster = lastMaster
				}
			}
		}
		lastMaster = curMaster
	}
}

212 213 214 215
// SetDataset sets dataset to dispatch for the master server.
//
// SetDataset can be call multiple times at one pass. But only the first call
// will be honored.
216
//
217
// After all tasks are done, another call of SetDataset will start another pass.
218
func (c *Client) SetDataset(globPaths []string) error {
219 220
	err := c.conn.Call("Service.SetDataset", globPaths, nil)
	return err
221 222
}

H
Helin Wang 已提交
223
// getTask gets a new task from the master server.
224
func (c *Client) getTask(passID int) (Task, error) {
225
	var t Task
226
	err := c.conn.Call("Service.GetTask", passID, &t)
227 228 229 230
	return t, err
}

// TaskFinished tells the master server a task is finished.
H
Helin Wang 已提交
231
func (c *Client) taskFinished(taskID int) error {
232
	return c.conn.Call("Service.TaskFinished", taskID, nil)
233
}
H
Helin Wang 已提交
234

G
gongweibao 已提交
235
// TaskFailed tell the master server as task is failed.
G
gongweibao 已提交
236
func (c *Client) taskFailed(meta TaskMeta) error {
G
gongweibao 已提交
237
	return c.conn.Call("Service.TaskFailed", meta, nil)
G
gongweibao 已提交
238 239
}

H
Helin Wang 已提交
240 241
// NextRecord returns next record in the dataset.
//
H
Helin Wang 已提交
242
// NextRecord will block until the next record is available. It is
H
Helin Wang 已提交
243
// thread-safe.
G
gongweibao 已提交
244 245 246
func (c *Client) NextRecord() ([]byte, error) {
	r := <-c.ch
	return r.r, r.err
H
Helin Wang 已提交
247
}
248 249 250 251 252 253 254 255

// RequestSaveModel asks the master server whether the caller should
// save the model.
//
// trainerID and blockDur are sent to the Service.RequestSaveModel RPC in
// a SaveModelRequest. The boolean filled in by the master is returned
// together with the error of the call.
func (c *Client) RequestSaveModel(trainerID string, blockDur time.Duration) (bool, error) {
	req := SaveModelRequest{
		TrainerID: trainerID,
		BlockDur:  blockDur,
	}

	var approved bool
	if err := c.conn.Call("Service.RequestSaveModel", req, &approved); err != nil {
		return approved, err
	}
	return approved, nil
}