// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

Q
Qiao Longfei 已提交
15 16 17 18
package client

import (
	"context"
19 20
	"errors"
	"fmt"
Q
Qiao Longfei 已提交
21 22 23 24 25 26
	"strconv"
	"strings"
	"time"

	"github.com/PaddlePaddle/Paddle/go/pserver"
	"github.com/coreos/etcd/clientv3"
27
	"github.com/coreos/etcd/clientv3/concurrency"
28
	log "github.com/inconshreveable/log15"
Q
Qiao Longfei 已提交
29 30 31
)

const (
	// defaultEtcdTimeout is the dial timeout used when connecting to
	// etcd, the delay between connection retries, and the value
	// NewEtcd stores as the per-request timeout of an Etcd.
	defaultEtcdTimeout = 5 * time.Second

	// initLockPath is the etcd key prefix of the mutex trainers
	// acquire before checking the initialization state.
	initLockPath = "/init_ps/lock"
	// initDonePath is the etcd key that records that parameter
	// initialization has finished.
	initDonePath = "/init_ps/done"
	// initDoneVal is the value written under initDonePath once
	// initialization has finished.
	initDoneVal = "1"
)

39
// Etcd is used by pserver client that is a part of trainer process.
Q
Qiao Longfei 已提交
40
// TODO:
41 42
// 1. add watcher to watch the change state of pservers.
type Etcd struct {
Q
Qiao Longfei 已提交
43 44 45
	client    *clientv3.Client
	timeout   time.Duration
	endpoints []string
46
	lock      *concurrency.Mutex
Q
Qiao Longfei 已提交
47 48 49
}

// Desired read ps desired number from etcd.
50
func (e *Etcd) Desired() int {
Q
Qiao Longfei 已提交
51 52
	var psDesired int
	for {
53 54
		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
		resp, err := e.client.Get(ctx, pserver.PsDesired)
Q
Qiao Longfei 已提交
55 56
		cancel()
		if err != nil {
57 58 59 60
			log.Error(
				"Get ps dresire number failed! reconnecting...",
				log.Ctx{"error": err},
			)
61
			time.Sleep(e.timeout)
Q
Qiao Longfei 已提交
62 63 64 65 66
			continue
		}

		kvs := resp.Kvs
		if len(kvs) == 0 {
67
			log.Info("Waiting for ps desired registered ...")
68
			time.Sleep(e.timeout)
Q
Qiao Longfei 已提交
69 70 71 72 73
			continue
		}

		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
		if err != nil {
74
			log.Error("atoi failed", log.Ctx{"error": err})
75
			time.Sleep(e.timeout)
Q
Qiao Longfei 已提交
76 77 78
			continue
		}

79
		log.Debug("Got psDesired", log.Ctx{"psDesired": psDesired})
Q
Qiao Longfei 已提交
80 81 82 83 84 85
		break
	}
	return psDesired
}

// List return the pserver list read from etcd.
86 87
func (e *Etcd) List() []Server {
	psDesired := e.Desired()
Q
Qiao Longfei 已提交
88 89 90 91

	servers := make([]Server, psDesired)
	for {
		for i := 0; i < psDesired; i++ {
92
			ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
Q
Qiao Longfei 已提交
93
			psKey := pserver.PsPath + strconv.Itoa(i)
94
			log.Debug("looking for pserver", log.Ctx{"ps key": psKey})
95
			resp, err := e.client.Get(ctx, psKey)
96
			cancel()
Q
Qiao Longfei 已提交
97
			if err != nil {
98 99 100 101
				log.Info(
					"Get psKey error",
					log.Ctx{"ps key": psKey, "error": err},
				)
102
				time.Sleep(e.timeout)
Q
Qiao Longfei 已提交
103 104 105 106
				continue
			}
			kvs := resp.Kvs
			if len(kvs) == 0 {
107
				log.Info("Waiting for ps addr registered ...")
108
				time.Sleep(e.timeout)
Q
Qiao Longfei 已提交
109 110 111 112 113 114
				continue
			}

			psAddr := string(resp.Kvs[0].Value)
			// TODO(Longfei) check the ps address
			if psAddr == "" {
115 116 117 118
				log.Info(
					"Value under psKey is empty",
					log.Ctx{"psKey": psKey},
				)
119
				time.Sleep(e.timeout)
Q
Qiao Longfei 已提交
120 121
				continue
			}
122 123 124 125
			log.Debug(
				"got psAddr given psKey",
				log.Ctx{"psAddr": psAddr, "psKey": psKey},
			)
Q
Qiao Longfei 已提交
126 127 128 129 130 131 132 133 134
			servers[i].Index = i
			servers[i].Addr = psAddr
		}
		break
	}
	return servers
}

// NewEtcd create a etcd client to return the state of pserver on etcd.
135
func NewEtcd(endpoints string) *Etcd {
Q
Qiao Longfei 已提交
136 137 138 139 140 141
	ep := strings.Split(endpoints, ",")
	var cli *clientv3.Client
	var err error
	for {
		cli, err = clientv3.New(clientv3.Config{
			Endpoints:   ep,
H
Helin Wang 已提交
142
			DialTimeout: defaultEtcdTimeout,
Q
Qiao Longfei 已提交
143 144
		})
		if err != nil {
145
			log.Error("Init etcd connection failed", log.Ctx{"error": err})
H
Helin Wang 已提交
146
			time.Sleep(defaultEtcdTimeout)
Q
Qiao Longfei 已提交
147 148 149 150
			continue
		}
		break
	}
151
	log.Info("Connected to etcd endpoint", log.Ctx{"endpoint": endpoints})
152
	client := &Etcd{
Q
Qiao Longfei 已提交
153
		client:    cli,
H
Helin Wang 已提交
154
		timeout:   defaultEtcdTimeout,
Q
Qiao Longfei 已提交
155 156 157 158
		endpoints: ep,
	}
	return client
}
159 160 161 162 163 164 165 166 167 168

// Select indicates if the current trainer is selected to initialize
// the pserver parameters.
func (e *Etcd) Select() (bool, error) {
	sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5))
	if err != nil {
		return false, err
	}

	lock := concurrency.NewMutex(sess, initLockPath)
169
	log.Info("Trying to acquire lock", log.Ctx{"lock path": initLockPath})
170 171 172 173 174 175 176
	// Do not use timeout context here, since we don't know how
	// long does it take for other trainers to initialize the
	// parameters.
	err = lock.Lock(context.Background())
	if err != nil {
		return false, err
	}
177
	log.Info("Successfully acquired lock", log.Ctx{"lock path": initLockPath})
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195

	get := clientv3.OpGet(initDonePath)
	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
	tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit()
	cancel()
	if err != nil {
		return false, err
	}

	if !tresp.Succeeded {
		return false, errors.New("no longer the owner of the lock")
	}

	resp := tresp.Responses[0].GetResponseRange()

	if len(resp.Kvs) == 0 {
		// Key value not set, select current trainer.
		e.lock = lock
196
		log.Info("Trainer selected.")
197 198 199 200
		return true, nil
	}

	if string(resp.Kvs[0].Value) == initDoneVal {
201
		log.Info("Initialization is already done.")
202 203 204 205
		ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
		err = lock.Unlock(ctx)
		cancel()
		if err != nil {
206
			log.Error("error unlocking", log.Ctx{"error": err})
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
		}
		return false, nil
	}

	return false, fmt.Errorf("key %s have unexpected value: %v", initDonePath, resp.Kvs[0].Value)
}

// Done indicates the parameter initialization process is done.
func (e *Etcd) Done() error {
	if e.lock == nil {
		return errors.New("lock is nil, Done called unexpectedly")
	}

	put := clientv3.OpPut(initDonePath, initDoneVal)
	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
	tresp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
	cancel()
	if err != nil {
		return err
	}

	if !tresp.Succeeded {
		return errors.New("no longer the owner of the lock")
	}

	ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
	err = e.lock.Unlock(ctx)
	cancel()
	if err != nil {
236
		log.Error("error unlocking", log.Ctx{"error": err})
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
	} else {
		e.lock = nil
	}

	return nil
}

// Close closes the etcd client.
func (e *Etcd) Close() error {
	var err error
	if e.lock != nil {
		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
		err = e.lock.Unlock(ctx)
		cancel()
		if err == nil {
			e.lock = nil
		}
	}

	cErr := e.client.Close()
	if cErr != nil {
		if err != nil {
259
			log.Error("error closing etcd client", log.Ctx{"error": cErr})
260 261 262 263 264 265 266
			return err
		}
		return cErr
	}

	return err
}