etcd_client.go 5.6 KB
Newer Older
1
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
D
dongzhihong 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

15 16 17 18
package master

import (
	"context"
19
	"time"
20 21 22

	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/clientv3/concurrency"
23
	log "github.com/inconshreveable/log15"
24 25 26 27 28 29 30
)

// Default etcd keys used by the master.
const (
	// DefaultLockPath is the default etcd path of the master lock.
	DefaultLockPath = "/master/lock"

	// DefaultStatePath is the default etcd key under which the master
	// state is stored.
	DefaultStatePath = "/master/state"

	// DefaultAddrPath is the default etcd key under which the master
	// address is stored.
	DefaultAddrPath = "/master/addr"
)

35 36
// EtcdClient is the etcd client that the master uses for fault
// tolerance and service registry.
37
type EtcdClient struct {
38 39 40
	lockPath  string
	statePath string
	client    *clientv3.Client
41
	lock      *concurrency.Mutex
H
Helin Wang 已提交
42
	sess      *concurrency.Session
43 44
}

45 46
// NewEtcdClient creates a new EtcdClient.
func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
47
	log.Debug("Connecting to etcd", log.Ctx{"endpoint": endpoints})
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   endpoints,
		DialTimeout: dialTimeout,
	})
	if err != nil {
		return nil, err
	}

	sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec))
	if err != nil {
		return nil, err
	}

	lock := concurrency.NewMutex(sess, lockPath)
	// It's fine for the lock to get stuck, in this case we have
	// multiple master servers running (only configured to have
Q
Qiao Longfei 已提交
64
	// one master running, but split-brain problem may cause
65 66
	// multiple master servers running), and the cluster management
	// software will kill one of them.
67
	log.Info("Trying to acquire lock.", log.Ctx{"path": lockPath})
68 69 70 71
	err = lock.Lock(context.TODO())
	if err != nil {
		return nil, err
	}
72
	log.Info("Successfully acquired lock at %s.", log.Ctx{"path": lockPath})
73

74
	put := clientv3.OpPut(addrPath, addr)
75 76 77 78 79 80
	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
	if err != nil {
		return nil, err
	}

	if !resp.Succeeded {
81 82
		log.Crit("No longer owns the master lock. Exiting.")
		panic("No longer owns the master lock. Exiting.")
83 84 85 86 87 88 89
	}

	e := &EtcdClient{
		lockPath:  lockPath,
		statePath: statePath,
		client:    cli,
		lock:      lock,
H
Helin Wang 已提交
90
		sess:      sess,
91 92
	}

93 94 95 96
	return e, nil
}

// Save saves the state into the etcd.
97
func (e *EtcdClient) Save(state []byte) error {
98 99 100 101 102 103 104 105
	ctx := context.TODO()
	put := clientv3.OpPut(e.statePath, string(state))
	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
	if err != nil {
		return err
	}

	if !resp.Succeeded {
106
		log.Error("No longer owns the lock, trying to lock again")
107 108 109
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		err := e.lock.Lock(ctx)
		cancel()
110
		if err != nil {
111 112 113
			// We lost the master lock and can not acquire
			// it back, it means some other master is
			// already started. We don't want cluster
Q
Qiao Longfei 已提交
114
			// management system to kill the master server
115 116 117 118 119
			// who is holding the lock and running
			// correctly. So the most feasible solution is
			// to kill current master server. The current
			// state is not saved, but the trainer's RPC
			// call will fail, so the trainer will retry.
120 121
			log.Crit("Could not acquire the lock at %s: %v. Exiting.", log.Ctx{"path": e.lockPath, "error": err})
			panic("Could not acquire the lock at %s: %v. Exiting.")
122
		}
123
		log.Info("Successfully acquired lock at %s.", e.lockPath)
124 125 126 127 128 129 130
		return e.Save(state)
	}

	return nil
}

// Load loads the state from etcd.
131
func (e *EtcdClient) Load() ([]byte, error) {
132 133 134 135 136 137 138 139 140
	ctx := context.TODO()
	get := clientv3.OpGet(e.statePath)

	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit()
	if err != nil {
		return nil, err
	}

	if !resp.Succeeded {
141
		log.Error("No longer owns the lock, trying to lock and load again.")
142
		err = e.lock.Lock(context.Background())
H
Helin Wang 已提交
143 144 145 146
		if err != nil {
			return nil, err
		}

147 148 149 150 151 152 153 154 155 156 157 158
		return e.Load()
	}

	kvs := resp.Responses[0].GetResponseRange().Kvs
	if len(kvs) == 0 {
		// No state exists
		return nil, nil
	}

	state := kvs[0].Value
	return state, nil
}
159

H
Helin Wang 已提交
160 161 162 163 164 165 166 167
// Shutdown shuts down the etcd client gracefully.
func (e *EtcdClient) Shutdown() error {
	err := e.sess.Close()
	newErr := e.client.Close()
	if newErr != nil {
		if err == nil {
			err = newErr
		} else {
168
			log.Error("shutdown error", log.Ctx{"error": newErr})
H
Helin Wang 已提交
169 170 171 172 173 174
		}
	}

	return err
}

175
// GetKey gets the value by the specify key.
176 177
func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
178 179 180 181 182 183 184 185 186 187 188 189 190
	resp, err := c.Get(ctx, key)
	cancel()
	if err != nil {
		return "", err
	}
	kvs := resp.Kvs
	if len(kvs) == 0 {
		return "", nil
	}
	v := kvs[0].Value
	return string(v), nil
}

191 192
// watchKey watches the specify key and send to valChan if there is some event.
func watchKey(c *clientv3.Client, key string, valChan chan<- string) {
193 194 195 196
	rch := c.Watch(context.Background(), key)
	for wresp := range rch {
		for _, ev := range wresp.Events {
			// if received event is DELETE, the value will be an empty string
197
			log.Info("received event.", log.Ctx{"type": ev.Type, "key": ev.Kv.Key, "value": ev.Kv.Value})
198 199 200 201
			valChan <- string(ev.Kv.Value)
		}
	}
}