etcd_client.go 3.9 KB
Newer Older
1 2 3 4
package master

import (
	"context"
5
	"time"
6 7 8 9 10 11 12 13 14 15 16

	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/clientv3/concurrency"
	log "github.com/sirupsen/logrus"
)

// Default etcd keys used by the master.
const (
	// DefaultLockPath is the default etcd master lock path.
	DefaultLockPath = "/master/lock"
	// DefaultStatePath is the default etcd key for master state.
	DefaultStatePath = "/master/state"
	// DefaultAddrPath is the default etcd key for master address.
	DefaultAddrPath = "/master/addr"
)

21 22
// EtcdClient is the etcd client that the master uses for fault
// tolerance and service registry.
23
type EtcdClient struct {
24 25 26
	lockPath  string
	statePath string
	client    *clientv3.Client
27
	lock      *concurrency.Mutex
28 29
}

30 31 32
// NewEtcdClient creates a new EtcdClient.
func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
	log.Debugf("Connecting to etcd at %v", endpoints)
33 34 35 36
	// TODO(helin): gracefully shutdown etcd store. Becuase etcd
	// store holds a etcd lock, even though the lock will expire
	// when the lease timeout, we need to implement graceful
	// shutdown to release the lock.
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   endpoints,
		DialTimeout: dialTimeout,
	})
	if err != nil {
		return nil, err
	}

	sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec))
	if err != nil {
		return nil, err
	}

	lock := concurrency.NewMutex(sess, lockPath)
	// It's fine for the lock to get stuck, in this case we have
	// multiple master servers running (only configured to have
	// one master running, but split-brain problem may cuase
	// multiple master servers running), and the cluster management
	// software will kill one of them.
56
	log.Debugf("Trying to acquire lock at %s.", lockPath)
57 58 59 60
	err = lock.Lock(context.TODO())
	if err != nil {
		return nil, err
	}
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
	log.Debugf("Successfully acquired lock at %s.", lockPath)

	put := clientv3.OpPut(addrPath, string(addr))
	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
	if err != nil {
		return nil, err
	}

	if !resp.Succeeded {
		log.Fatal("No longer owns the master lock. Exiting.")
	}

	e := &EtcdClient{
		lockPath:  lockPath,
		statePath: statePath,
		client:    cli,
		lock:      lock,
	}

80 81 82 83
	return e, nil
}

// Save saves the state into the etcd.
84
func (e *EtcdClient) Save(state []byte) error {
85 86 87 88 89 90 91 92
	ctx := context.TODO()
	put := clientv3.OpPut(e.statePath, string(state))
	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
	if err != nil {
		return err
	}

	if !resp.Succeeded {
93 94 95 96
		log.Errorln("No longer owns the lock, trying to lock again")
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		err := e.lock.Lock(ctx)
		cancel()
97
		if err != nil {
98 99 100 101 102 103 104 105 106 107
			// We lost the master lock and can not acquire
			// it back, it means some other master is
			// already started. We don't want cluster
			// managment system to kill the master server
			// who is holding the lock and running
			// correctly. So the most feasible solution is
			// to kill current master server. The current
			// state is not saved, but the trainer's RPC
			// call will fail, so the trainer will retry.
			log.Fatalf("Could not acquire the lock at %s: %v. Exiting.", e.lockPath, err)
108 109 110 111 112 113 114 115 116
		}
		log.Infof("Successfully acquired lock at %s.", e.lockPath)
		return e.Save(state)
	}

	return nil
}

// Load loads the state from etcd.
117
func (e *EtcdClient) Load() ([]byte, error) {
118 119 120 121 122 123 124 125 126 127
	ctx := context.TODO()
	get := clientv3.OpGet(e.statePath)

	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit()
	if err != nil {
		return nil, err
	}

	if !resp.Succeeded {
		log.Errorln("No longer owns the lock, trying to lock and load again.")
128
		err = e.lock.Lock(context.Background())
H
Helin Wang 已提交
129 130 131 132
		if err != nil {
			return nil, err
		}

133 134 135 136 137 138 139 140 141 142 143 144
		return e.Load()
	}

	kvs := resp.Responses[0].GetResponseRange().Kvs
	if len(kvs) == 0 {
		// No state exists
		return nil, nil
	}

	state := kvs[0].Value
	return state, nil
}