diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go index c1de57fc18576c0760123a3a52e44ef4b61ef27b..739c4c01e02b10f46c36b997f8c4700150da2a26 100644 --- a/go/cmd/master/master.go +++ b/go/cmd/master/master.go @@ -72,9 +72,9 @@ func main() { shutdown := func() { log.Infoln("shutting down gracefully") - e := store.Shutdown() - if e != nil { - log.Errorln(e) + err := store.Shutdown() + if err != nil { + log.Errorln(err) } } diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index aa81d0432b1d4f411644e0a5b703d7ea74d144b7..2b63fd7f3eba9b18a779a349f277d9084646168b 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -18,6 +18,8 @@ import ( "net" "net/http" "net/rpc" + "os" + "os/signal" "strconv" "time" @@ -33,7 +35,8 @@ func main() { index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0") etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") - etcdTimeout := flag.Duration("etcd-timeout", 5*time.Second, "timeout for etcd calls") + dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout") + etcdTTL := flag.Int("etcd-ttl", 5, "etcd time to live in seconds") numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job") checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path") checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds") @@ -53,7 +56,7 @@ func main() { if *index >= 0 { idx = *index } else { - e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout) + e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *dialTimeout, *etcdTTL) idx, err = e.Register(*port) candy.Must(err) @@ -67,6 +70,20 @@ func main() { } } + shutdown := func() { + log.Infoln("shutting down gracefully") + err := e.Shutdown() + if err != nil { + log.Errorln(err) + } + } + + // Guaranteed to run even panic happens. + defer shutdown() + + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp) candy.Must(err) @@ -77,7 +94,11 @@ func main() { l, err := net.Listen("tcp", ":"+strconv.Itoa(*port)) candy.Must(err) - log.Infof("start pserver at port %d", *port) - err = http.Serve(l, nil) - candy.Must(err) + go func() { + log.Infof("start pserver at port %d", *port) + err = http.Serve(l, nil) + candy.Must(err) + }() + + <-c } diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go index e954037ce5aa448bbb0325088244ef908eacdd0c..15833eaeeebf46a57e967cd506721f0e7f189425 100644 --- a/go/master/etcd_client.go +++ b/go/master/etcd_client.go @@ -68,12 +68,12 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat // one master running, but split-brain problem may cause // multiple master servers running), and the cluster management // software will kill one of them. - log.Debugf("Trying to acquire lock at %s.", lockPath) + log.Infof("Trying to acquire lock at %s.", lockPath) err = lock.Lock(context.TODO()) if err != nil { return nil, err } - log.Debugf("Successfully acquired lock at %s.", lockPath) + log.Infof("Successfully acquired lock at %s.", lockPath) put := clientv3.OpPut(addrPath, addr) resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit() diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go index 0f7e20cdd8d20e37b586c22377a89fca4c3cf7ce..14ad0774550f6e5a5d8610d6007904cd2820432c 100644 --- a/go/pserver/client/c/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -55,10 +55,10 @@ var curHandle C.paddle_pserver_client func add(c *client.Client) C.paddle_pserver_client { mu.Lock() defer mu.Unlock() - client := curHandle + cli := curHandle curHandle++ - handleMap[client] = c - return client + handleMap[cli] = c + return cli } func get(client C.paddle_pserver_client) *client.Client { diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 98ff8ce827c7cfcd9122cb043f2a6226057cc95a..4fb26307667295ab825d07be6c3d1d4b33f6eb8b 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -34,16 +34,19 @@ const ( PsPath = "/ps/" // PsCheckpoint is the etcd path for store checkpoints information PsCheckpoint = "/checkpoints/" + + retryTimeout = 5 * time.Second ) // EtcdClient is the etcd client that the pserver uses for fault // tolerance, service registry and coordination. type EtcdClient struct { - numPservers int - etcdEndpoints string - etcdClient *clientv3.Client - // etcdTimeout is also used as retry intervals. - etcdTimeout time.Duration + numPservers int + endpoints string + client *clientv3.Client + sess *concurrency.Session + dialTimeout time.Duration + ttlSec int // FIXME: ensure GetExternalIP gets the correct ip for trainers to connect. externalIP string // desired number of pservers in the job. @@ -52,11 +55,12 @@ type EtcdClient struct { } // NewEtcdClient creates an EtcdClient -func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient { +func NewEtcdClient(endpoints string, numPservers int, dialtimeout time.Duration, ttlSec int) *EtcdClient { return &EtcdClient{ - etcdTimeout: timeout, - numPservers: numPservers, - etcdEndpoints: endpoints, + dialTimeout: dialtimeout, + ttlSec: ttlSec, + numPservers: numPservers, + endpoints: endpoints, } } @@ -64,7 +68,6 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et // // Register returns the index of the current pserver. func (e *EtcdClient) Register(port int) (int, error) { - var err error e.externalIP, err = networkhelper.GetExternalIP() if err != nil { @@ -72,19 +75,26 @@ func (e *EtcdClient) Register(port int) (int, error) { } // initialize connection to etcd. - ep := strings.Split(e.etcdEndpoints, ",") + ep := strings.Split(e.endpoints, ",") for { cli, err := clientv3.New(clientv3.Config{ Endpoints: ep, - DialTimeout: e.etcdTimeout, + DialTimeout: e.dialTimeout, }) if err != nil { log.Errorf("connect to etcd error: %v", err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) + continue + } + e.client = cli + sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec)) + if err != nil { + log.Errorf("create etcd session error: %v", err) + time.Sleep(retryTimeout) continue } - e.etcdClient = cli - log.Debugf("inited client to %s", e.etcdEndpoints) + e.sess = sess + log.Debugf("inited client to %s", e.endpoints) break } // init /ps_desired using transaction, for multiple pservers may want to write @@ -95,7 +105,7 @@ func (e *EtcdClient) Register(port int) (int, error) { cancel() if err != nil { log.Warn(err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } break @@ -106,18 +116,18 @@ func (e *EtcdClient) Register(port int) (int, error) { // wait and set s.desired init value for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) - resp, err := e.etcdClient.Get(ctx, PsDesired) + resp, err := e.client.Get(ctx, PsDesired) cancel() if err != nil { log.Errorf("getting %s error: %v", PsDesired, err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } if len(resp.Kvs) != 0 { e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { log.Errorf("value of %s invalid %v\n", PsDesired, err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) // NOTE: wait util ps_desired value change continue } @@ -134,7 +144,7 @@ func (e *EtcdClient) Register(port int) (int, error) { cancel() if err != nil { log.Warn(err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } break @@ -144,10 +154,10 @@ func (e *EtcdClient) Register(port int) (int, error) { } func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) { - return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + return concurrency.NewSTM(e.client, func(c concurrency.STM) error { dsStr := c.Get(PsDesired) if dsStr == "" { - c.Put(PsDesired, strconv.Itoa(numPservers)) + c.Put(PsDesired, strconv.Itoa(numPservers), clientv3.WithLease(e.sess.Lease())) } return nil }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) @@ -156,7 +166,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) ( // registerPserverEtcd registers pserver node on etcd using transaction. func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) { var idx int - _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + _, err := concurrency.NewSTM(e.client, func(c concurrency.STM) error { registered := false for i := 0; i < e.desired; i++ { psKey := PsPath + strconv.Itoa(i) @@ -165,26 +175,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er log.Debugf("got value (%s) for key: %s", ps, psKey) if ps == "" { - resp, err := e.etcdClient.Grant(context.TODO(), 5) - if err != nil { - log.Fatal(err) - } // find the first id and write info pserverAddr := e.externalIP + ":" + strconv.Itoa(port) - c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID)) + c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease())) log.Debugf("set pserver node %s with value %s", psKey, pserverAddr) - ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID) - if kaerr != nil { - log.Errorf("keepalive etcd node error: %v", kaerr) - return kaerr - } - - // Eat the keep alive message so etcd - // will not expire the lease. - go func(ch <-chan *clientv3.LeaseKeepAliveResponse) { - ka := <-ch - log.Debugf("keepalive: %d\n", ka.TTL) - }(ch) log.Debug("register finished") idx = i registered = true @@ -207,7 +201,7 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er // GetKey gets the value by the specified key func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) - resp, err := e.etcdClient.Get(ctx, key) + resp, err := e.client.Get(ctx, key) cancel() if err != nil { return []byte{}, err @@ -223,7 +217,27 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { // PutKey put into etcd with value by key specified func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error { ctx, cancel := context.WithTimeout(context.Background(), timeout) - _, err := e.etcdClient.Put(ctx, key, string(value)) + _, err := e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease())) cancel() return err } + +// Shutdown shuts down the etcd client gracefully. +func (e *EtcdClient) Shutdown() error { + var err error + if e.sess != nil { + err = e.sess.Close() + } + + if e.client != nil { + newErr := e.client.Close() + if newErr != nil { + if err != nil { + log.Errorln(newErr) + } else { + err = newErr + } + } + } + return err +}