From 60238a1bfb41432d8c07b351fb91aa34aa56eb58 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Tue, 24 Oct 2017 16:26:21 -0700 Subject: [PATCH] Go master, pserver, trainer: switch to log15, away from logrus --- go/cmd/master/master.go | 39 ++++++++++++++--------- go/cmd/pserver/pserver.go | 24 +++++++++------ go/glide.lock | 16 +++++++--- go/glide.yaml | 4 +++ go/master/c/client.go | 12 ++++++-- go/master/client.go | 21 +++++++------ go/master/client_internal_test.go | 6 ---- go/master/etcd_client.go | 24 ++++++++------- go/master/service.go | 51 ++++++++++++++++++------------- go/pserver/client/c/cclient.go | 49 ++++++++++++++++++++++------- go/pserver/client/client.go | 6 ++-- go/pserver/client/client_test.go | 4 +-- go/pserver/client/etcd_client.go | 50 ++++++++++++++++++------------ go/pserver/etcd_client.go | 30 ++++++++++-------- go/pserver/optimizer.go | 6 ++-- go/pserver/service.go | 14 ++++----- 16 files changed, 218 insertions(+), 138 deletions(-) diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go index 739c4c01e02..f57db1c0a01 100644 --- a/go/cmd/master/master.go +++ b/go/cmd/master/master.go @@ -25,9 +25,8 @@ import ( "strings" "time" + log "github.com/inconshreveable/log15" "github.com/namsral/flag" - log "github.com/sirupsen/logrus" - "github.com/topicai/candy" "github.com/PaddlePaddle/Paddle/go/master" "github.com/PaddlePaddle/Paddle/go/utils/networkhelper" @@ -41,16 +40,20 @@ func main() { taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.") chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.") logLevel := flag.String("log-level", "info", - "log level, possible values: debug, info, warning, error, fatal, panic") + "log level, possible values: debug, info, warn, error, crit") flag.Parse() - level, e := log.ParseLevel(*logLevel) - candy.Must(e) + lvl, err := log.LvlFromString(*logLevel) + if err != nil { + panic(err) + } - log.SetLevel(level) + log.Root().SetHandler( + log.LvlFilterHandler(lvl, log.CallerStackHandler("%+v", log.StderrHandler)), + ) if *endpoints == "" { - log.Warningln("-endpoints not set, fault tolerance not be enabled.") + log.Warn("-endpoints not set, fault tolerance not be enabled.") } var store master.Store @@ -58,23 +61,25 @@ func main() { eps := strings.Split(*endpoints, ",") ip, err := networkhelper.GetExternalIP() if err != nil { - log.Fatal(err) + log.Crit("get external ip error", log.Ctx{"error": err}) + panic(err) } addr := fmt.Sprintf("%s:%d", ip, *port) store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec) if err != nil { - log.Fatal(err) + log.Crit("error creating etcd client.", log.Ctx{"error": err}) + panic(err) } } else { store = &master.InMemStore{} } shutdown := func() { - log.Infoln("shutting down gracefully") + log.Info("shutting down gracefully") err := store.Shutdown() if err != nil { - log.Errorln(err) + log.Error("shutdown error", log.Ctx{"error": err}) } } @@ -86,24 +91,28 @@ func main() { s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax) if err != nil { - log.Fatal(err) + log.Crit("error creating new service.", log.Ctx{"error": err}) + panic(err) } err = rpc.Register(s) if err != nil { - log.Fatal(err) + log.Crit("error registering to etcd.", log.Ctx{"error": err}) + panic(err) } rpc.HandleHTTP() l, err := net.Listen("tcp", ":"+strconv.Itoa(*port)) if err != nil { - log.Fatal(err) + log.Crit("error listing to port", log.Ctx{"error": err, "port": *port}) + panic(err) } go func() { err = http.Serve(l, nil) if err != nil { - log.Fatal(err) + log.Crit("error serving HTTP", log.Ctx{"error": err}) + panic(err) } }() diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index bec5775d540..90f9cf3fcf2 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -27,11 +27,11 @@ import ( "github.com/topicai/candy" "github.com/PaddlePaddle/Paddle/go/pserver" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) func main() { - port := flag.Int("port", 0, "port of the pserver") + port := flag.Int("port", 8001, "port of the pserver") index := flag.Int("index", -1, "index of the pserver, set to -1 if use etcd for auto pserver index registry") etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") @@ -41,13 +41,17 @@ func main() { checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path") checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds") logLevel := flag.String("log-level", "info", - "log level, possible values: debug, info, warning, error, fatal, panic") + "log level, possible values: debug, info, warn, error, crit") flag.Parse() - level, err := log.ParseLevel(*logLevel) - candy.Must(err) + lvl, err := log.LvlFromString(*logLevel) + if err != nil { + panic(err) + } - log.SetLevel(level) + log.Root().SetHandler( + log.LvlFilterHandler(lvl, log.CallerStackHandler("%+v", log.StderrHandler)), + ) var idx int @@ -63,7 +67,7 @@ func main() { cp, err = pserver.LoadCheckpoint(e, idx) if err != nil { if err == pserver.ErrCheckpointNotFound { - log.Infof("Could not find the pserver checkpoint.") + log.Info("Could not find the pserver checkpoint.") } else { panic(err) } @@ -71,10 +75,10 @@ func main() { } shutdown := func() { - log.Infoln("shutting down gracefully") + log.Info("shutting down gracefully") sErr := e.Shutdown() if sErr != nil { - log.Errorln(sErr) + log.Error("error shutting down", log.Ctx{"error": sErr}) } } @@ -95,7 +99,7 @@ func main() { candy.Must(err) go func() { - log.Infof("start pserver at port %d", *port) + log.Info("starting pserver", log.Ctx{"port": *port}) err = http.Serve(l, nil) candy.Must(err) }() diff --git a/go/glide.lock b/go/glide.lock index aabc03657ff..ce654d36364 100644 --- a/go/glide.lock +++ b/go/glide.lock @@ -1,5 +1,5 @@ -hash: 328e7b9b7306b45e7b9879139a9f86698115981f6283032e1312093a6a6ddb04 -updated: 2017-10-16T08:00:23.484693528Z +hash: 51d9e2e46d7fd9173ff11ecada40f7b7728756be18d5e2f032535f66465e6e15 +updated: 2017-10-24T15:04:09.987751592-07:00 imports: - name: github.com/alecthomas/gometalinter version: bae2f1293d092fd8167939d5108d1b025eaef9de @@ -99,6 +99,8 @@ imports: version: d2709f9f1f31ebcda9651b03077758c1f3a0018c - name: github.com/ghodss/yaml version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7 +- name: github.com/go-stack/stack + version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf - name: github.com/gogo/protobuf version: 909568be09de550ed094403c2bf8a261b5bb730a subpackages: @@ -120,8 +122,14 @@ imports: - runtime - runtime/internal - utilities +- name: github.com/inconshreveable/log15 + version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3 - name: github.com/jonboulle/clockwork version: 2eee05ed794112d45db504eb05aa693efd2b8b09 +- name: github.com/mattn/go-colorable + version: 5411d3eea5978e6cdc258b30de592b60df6aba96 +- name: github.com/mattn/go-isatty + version: 57fdcb988a5c543893cc61bce354a6e24ab70022 - name: github.com/matttproud/golang_protobuf_extensions version: c12348ce28de40eed0136aa2b644d0ee0650e56c subpackages: @@ -179,11 +187,12 @@ imports: - lex/httplex - trace - name: golang.org/x/sys - version: 0f826bdd13b500be0f1d4004938ad978fcc6031e + version: e48874b42435b4347fc52bdee0424a52abc974d7 repo: https://github.com/golang/sys.git vcs: git subpackages: - unix + - windows - name: golang.org/x/text version: 836efe42bb4aa16aaa17b9c155d8813d336ed720 repo: https://github.com/golang/text.git @@ -222,4 +231,3 @@ testImports: version: 05e8a0eda380579888eb53c394909df027f06991 subpackages: - assert - diff --git a/go/glide.yaml b/go/glide.yaml index 4b22ab2caaa..ba253f8bebe 100644 --- a/go/glide.yaml +++ b/go/glide.yaml @@ -26,3 +26,7 @@ import: version: v1.1.0 - package: github.com/alecthomas/gometalinter version: v1.2.1 +- package: github.com/inconshreveable/log15 + version: v2.13 +- package: github.com/go-stack/stack + version: v1.6.0 diff --git a/go/master/c/client.go b/go/master/c/client.go index b5759c30b1d..9a59337108d 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -35,13 +35,19 @@ import ( "unsafe" "github.com/PaddlePaddle/Paddle/go/master" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) var mu sync.Mutex var handleMap = make(map[C.paddle_master_client]*master.Client) var curHandle C.paddle_master_client +func init() { + log.Root().SetHandler( + log.LvlFilterHandler(log.LvlWarn, log.CallerStackHandler("%+v", log.StderrHandler)), + ) +} + func add(c *master.Client) C.paddle_master_client { mu.Lock() defer mu.Unlock() @@ -117,7 +123,7 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int } err := c.SetDataset(paths) if err != nil { - log.Errorln(err) + log.Error("error set dataset", log.Ctx{"error": err}) return C.PADDLE_MASTER_ERROR } @@ -167,7 +173,7 @@ func paddle_request_save_model(client C.paddle_master_client, trainerID string, c := get(client) need, err := c.RequestSaveModel(trainerID, time.Duration(blockMS)*time.Millisecond) if err != nil { - log.Errorln(err) + log.Error("error request save model", log.Ctx{"error": err}) return C.PADDLE_MASTER_ERROR } diff --git a/go/master/client.go b/go/master/client.go index f04cf50ce3c..5d657548c90 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -21,7 +21,7 @@ import ( "github.com/PaddlePaddle/Paddle/go/connection" "github.com/PaddlePaddle/recordio" "github.com/coreos/etcd/clientv3" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) // Client is the client of the master server. @@ -75,7 +75,7 @@ func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error { for { err := f() if err != nil { - log.Warningln(err) + log.Warn("create etcd client error", log.Ctx{"error": err}) } else { break } @@ -135,13 +135,13 @@ func (c *Client) getRecords(passID int) { time.Sleep(time.Second * 3) continue } - log.Errorf("getTask error: %s", err) + log.Error("getTask error.", log.Ctx{"error": err}) } for _, chunk := range t.Chunks { f, e := os.Open(chunk.Path) if e != nil { - log.Errorln(e) + log.Error("error open chunk", log.Ctx{"error": e}) continue } @@ -152,12 +152,15 @@ func (c *Client) getRecords(passID int) { if s.Err() != nil { c.ch <- record{nil, s.Err()} - log.Errorln(err, chunk.Path) + log.Error( + "error scan chunk", + log.Ctx{"error": err, "path": chunk.Path}, + ) } err = f.Close() if err != nil { - log.Errorln(err) + log.Error("error close record file", log.Ctx{"error": err}) } } @@ -166,7 +169,7 @@ func (c *Client) getRecords(passID int) { // correct, but a reasonable approximation. err = c.taskFinished(t.Meta.ID) if err != nil { - log.Errorln(err) + log.Error("task finish callback error.", log.Ctx{"error": err}) } } } @@ -179,12 +182,12 @@ func (c *Client) monitorMaster(addrCh <-chan string) { if curMaster == "" { err := c.conn.Close() if err != nil { - log.Errorln(err) + log.Error("close old master addr error", log.Ctx{"error": err}) } } else { err := c.conn.Connect(curMaster) if err != nil { - log.Errorln(err) + log.Error("connect to new master addr error", log.Ctx{"error": err}) // connect to addr failed, set // to last known addr in order diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index d5f3d794646..2f13fd0dcda 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -25,8 +25,6 @@ import ( "testing" "time" - log "github.com/sirupsen/logrus" - "github.com/PaddlePaddle/Paddle/go/connection" "github.com/PaddlePaddle/recordio" ) @@ -36,10 +34,6 @@ const ( chunkPerTask = 10 ) -func init() { - log.SetLevel(log.ErrorLevel) -} - func TestGetFinishTask(t *testing.T) { const path = "/tmp/master_client_test_0" diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go index 94848d887e8..2a41d36949c 100644 --- a/go/master/etcd_client.go +++ b/go/master/etcd_client.go @@ -20,7 +20,7 @@ import ( "github.com/coreos/etcd/clientv3" "github.com/coreos/etcd/clientv3/concurrency" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) const ( @@ -44,7 +44,7 @@ type EtcdClient struct { // NewEtcdClient creates a new EtcdClient. func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) { - log.Debugf("Connecting to etcd at %v", endpoints) + log.Debug("Connecting to etcd", log.Ctx{"endpoint": endpoints}) cli, err := clientv3.New(clientv3.Config{ Endpoints: endpoints, DialTimeout: dialTimeout, @@ -64,12 +64,12 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat // one master running, but split-brain problem may cause // multiple master servers running), and the cluster management // software will kill one of them. - log.Infof("Trying to acquire lock at %s.", lockPath) + log.Info("Trying to acquire lock.", log.Ctx{"path": lockPath}) err = lock.Lock(context.TODO()) if err != nil { return nil, err } - log.Infof("Successfully acquired lock at %s.", lockPath) + log.Info("Successfully acquired lock at %s.", log.Ctx{"path": lockPath}) put := clientv3.OpPut(addrPath, addr) resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit() @@ -78,7 +78,8 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat } if !resp.Succeeded { - log.Fatal("No longer owns the master lock. Exiting.") + log.Crit("No longer owns the master lock. Exiting.") + panic("No longer owns the master lock. Exiting.") } e := &EtcdClient{ @@ -102,7 +103,7 @@ func (e *EtcdClient) Save(state []byte) error { } if !resp.Succeeded { - log.Errorln("No longer owns the lock, trying to lock again") + log.Error("No longer owns the lock, trying to lock again") ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) err := e.lock.Lock(ctx) cancel() @@ -116,9 +117,10 @@ func (e *EtcdClient) Save(state []byte) error { // to kill current master server. The current // state is not saved, but the trainer's RPC // call will fail, so the trainer will retry. - log.Fatalf("Could not acquire the lock at %s: %v. Exiting.", e.lockPath, err) + log.Crit("Could not acquire the lock at %s: %v. Exiting.", log.Ctx{"path": e.lockPath, "error": err}) + panic("Could not acquire the lock at %s: %v. Exiting.") } - log.Infof("Successfully acquired lock at %s.", e.lockPath) + log.Info("Successfully acquired lock at %s.", e.lockPath) return e.Save(state) } @@ -136,7 +138,7 @@ func (e *EtcdClient) Load() ([]byte, error) { } if !resp.Succeeded { - log.Errorln("No longer owns the lock, trying to lock and load again.") + log.Error("No longer owns the lock, trying to lock and load again.") err = e.lock.Lock(context.Background()) if err != nil { return nil, err @@ -163,7 +165,7 @@ func (e *EtcdClient) Shutdown() error { if err == nil { err = newErr } else { - log.Errorln(newErr) + log.Error("shutdown error", log.Ctx{"error": newErr}) } } @@ -192,7 +194,7 @@ func watchKey(c *clientv3.Client, key string, valChan chan<- string) { for wresp := range rch { for _, ev := range wresp.Events { // if received event is DELETE, the value will be an empty string - log.Infof("received event %s, %q : %q\n", ev.Type, ev.Kv.Key, ev.Kv.Value) + log.Info("received event.", log.Ctx{"type": ev.Type, "key": ev.Kv.Key, "value": ev.Kv.Value}) valChan <- string(ev.Kv.Value) } } diff --git a/go/master/service.go b/go/master/service.go index df7c6860e6a..f3501028800 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -25,7 +25,7 @@ import ( "sync" "time" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" "github.com/PaddlePaddle/recordio" ) @@ -170,11 +170,11 @@ func (s *Service) recover() (bool, error) { } if state == nil { - log.Infoln("No state exists, not recovered.") + log.Info("No state exists, not recovered.") return false, nil } - log.Infof("Loaded snapshot of size: %d bytes.", len(state)) + log.Info("Loaded snapshot.", log.Ctx{"size": len(state)}) gr, err := gzip.NewReader(bytes.NewReader(state)) if err != nil { return false, err @@ -191,11 +191,11 @@ func (s *Service) recover() (bool, error) { if err != nil { // Only close failed, recover actually succeed, so // just log error. - log.Errorln(err) + log.Error("error close recover file.", log.Ctx{"error": err}) } s.state = tqs - log.WithFields(s.logFields()).Infof("Master recovered from snapshot, scheduling pending task timeout check.") + log.Info("Master recovered from snapshot, scheduling pending task timeout check.", s.logCtx()) for _, t := range s.state.Pending { time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch)) } @@ -224,7 +224,7 @@ func (s *Service) snapshot() error { } state := buf.Bytes() - log.Infof("Saving snapshot of size: %d bytes.", len(state)) + log.Info("Saving snapshot.", log.Ctx{"size bytes": len(state)}) return s.store.Save(state) } @@ -260,7 +260,7 @@ func readChunks(globPaths []string) ([]Chunk, error) { } count := index.NumChunks() - log.Infof("readChunks: file %s has %d chunks", path, count) + log.Info("reading chunks.", log.Ctx{"path": path, "num chunks": count}) for i := 0; i < count; i++ { chunk := Chunk{ Path: path, @@ -300,7 +300,7 @@ func (s *Service) SetDataset(globPaths []string, _ *int) error { err = s.snapshot() if err != nil { - log.Errorln(err) + log.Error("snapshot error", log.Ctx{"error": err}) return err } close(s.ready) @@ -320,7 +320,7 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) { defer func() { err := s.snapshot() if err != nil { - log.Errorln(err) + log.Error("snapshot error", log.Ctx{"error": err}) } }() @@ -328,12 +328,12 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) { t.NumFailure++ if t.NumFailure > s.failureMax { - log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure) + log.Warn("Task failed to many times, discard.", log.Ctx{"task": t.Task, "num failed": t.NumFailure}) s.state.Failed = append(s.state.Failed, t) return } - log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure) + log.Warn("Task failed, re-dispatch.", log.Ctx{"task": t.Task, "num failed": t.NumFailure}) s.state.Todo = append(s.state.Todo, t) return } @@ -353,8 +353,8 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { } // must be called with lock held. -func (s *Service) logFields() log.Fields { - return log.Fields{ +func (s *Service) logCtx() log.Ctx { + return log.Ctx{ "todoLen": len(s.state.Todo), "pendingLen": len(s.state.Pending), "doneLen": len(s.state.Done), @@ -383,10 +383,10 @@ func (s *Service) GetTask(passID int, task *Task) error { if len(s.state.Todo) == 0 { if len(s.state.Done) == 0 && len(s.state.Pending) == 0 { - log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass") + log.Warn("All tasks failed, may start next pass", s.logCtx()) return ErrAllTaskFailed } - log.WithFields(s.logFields()).Warningln("No more available task.") + log.Warn("No more available task.", s.logCtx()) return ErrNoMoreAvailable } @@ -400,8 +400,9 @@ func (s *Service) GetTask(passID int, task *Task) error { } *task = t.Task - log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Task.Meta) - + ctx := s.logCtx() + ctx["task meta"] = t.Task.Meta + log.Info("Task dispatched.", ctx) time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch)) return nil } @@ -417,7 +418,9 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { t, ok := s.state.Pending[taskID] if !ok { - log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID) + ctx := s.logCtx() + ctx["task id"] = taskID + log.Warn("Pending task not found.", ctx) return nil } @@ -426,7 +429,9 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { s.state.Done = append(s.state.Done, t) delete(s.state.Pending, taskID) - log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID) + ctx := s.logCtx() + ctx["task id"] = taskID + log.Info("Task finished.", ctx) if len(s.state.Todo) == 0 && len(s.state.Pending) == 0 { // increase master side pass count if all tasks finished s.state.CurPass++ @@ -434,12 +439,14 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error { s.state.Done = []taskEntry{} // TODO(typhoonzero): deal with failed tasks s.state.Failed = []taskEntry{} - log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.state.CurPass) + ctx := s.logCtx() + ctx["new pass"] = s.state.CurPass + log.Warn("all task finished, add new pass data.", ctx) } err := s.snapshot() if err != nil { - log.Errorln(err) + log.Error("snapshot error", log.Ctx{"error": err}) } return err } @@ -455,7 +462,7 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error { t, ok := s.state.Pending[meta.ID] if !ok { - log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta) + log.Warn("TaskFailed:Pending task not found.", log.Ctx{"task": t.Task.Meta}) return nil } diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go index a49cd01522b..2eeec1b6b3c 100644 --- a/go/pserver/client/c/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -45,9 +45,15 @@ import ( "github.com/PaddlePaddle/Paddle/go/pserver" "github.com/PaddlePaddle/Paddle/go/pserver/client" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) +func init() { + log.Root().SetHandler( + log.LvlFilterHandler(log.LvlWarn, log.CallerStackHandler("%+v", log.StderrHandler)), + ) +} + var mu sync.Mutex var handleMap = make(map[C.paddle_pserver_client]*client.Client) var curHandle C.paddle_pserver_client @@ -164,10 +170,13 @@ func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, if err != nil { if err.Error() == pserver.AlreadyInitialized { - log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name) + log.Warn( + "parameter already initialized, treat paddle_init_param as successful.", + log.Ctx{"parameter": name}, + ) return C.PSERVER_OK } - log.Errorln(err) + log.Error("error init param", log.Ctx{"error": err}) return C.PSERVER_ERROR } @@ -180,11 +189,11 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int { err := c.FinishInitParams() if err != nil { if err.Error() == pserver.AlreadyInitialized { - log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.") + log.Warn("parameters already initialized, treat paddle_finish_init_params as successful.") return C.PSERVER_OK } - log.Errorln(err) + log.Error("error finish init params", log.Ctx{"error": err}) return C.PSERVER_ERROR } @@ -205,7 +214,7 @@ func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient c := get(client) err := c.SendGrads(gs) if err != nil { - log.Errorln(err) + log.Error("error send grads", log.Ctx{"error": err}) return C.PSERVER_ERROR } @@ -222,7 +231,7 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, c := get(client) ps, err := c.GetParams(ns) if err != nil { - log.Errorln(err) + log.Error("error get params", log.Ctx{"error": err}) return C.PSERVER_ERROR } @@ -231,7 +240,13 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, for i, p := range ps { pn[i] = p.Name } - log.Errorf("pserver returned wrong number of parameters. Requested: %s, returned: %s.", strings.Join(pn, ", "), strings.Join(ns, ", ")) + log.Error( + "pserver returned wrong number of parameters.", + log.Ctx{ + "Requested": strings.Join(pn, ", "), + "Returned": strings.Join(ns, ", "), + }, + ) return C.PSERVER_ERROR } @@ -241,7 +256,13 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, for i, p := range ps { pn[i] = p.Name } - log.Errorf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.", strings.Join(pn, ", "), strings.Join(ns, ", ")) + log.Error( + "pserver returned wrong parameters, or not in requested order.", + log.Ctx{ + "Requested": strings.Join(pn, ", "), + "Returned": strings.Join(ns, ", "), + }, + ) return C.PSERVER_ERROR } } @@ -251,13 +272,19 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst)))) if unsafe.Pointer(param) == nil { - log.Errorln("must pre-allocate parameter.") + log.Error("must pre-allocate parameter.") return C.PSERVER_ERROR } if unsafe.Pointer(param.content) != nil { if int(param.content_len) != len(p.Content) { - log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content)) + log.Error( + "the pre-allocated content len does not match parameter content len.", + log.Ctx{ + "Pre-allocated len": param.content_len, + "Returned len": len(p.Content), + }, + ) return C.PSERVER_ERROR } } diff --git a/go/pserver/client/client.go b/go/pserver/client/client.go index e5187ce3df7..18fce34b376 100644 --- a/go/pserver/client/client.go +++ b/go/pserver/client/client.go @@ -22,7 +22,7 @@ import ( "github.com/PaddlePaddle/Paddle/go/connection" "github.com/PaddlePaddle/Paddle/go/pserver" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) // TODO(helin): add RPC call retry logic @@ -84,7 +84,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) { if curServers[i].Addr == "" { err := c.pservers[i].Close() if err != nil { - log.Errorln(err) + log.Error("error closing connection to pserver", log.Ctx{"error": err}) } continue @@ -92,7 +92,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) { err := c.pservers[i].Connect(curServers[i].Addr) if err != nil { - log.Errorln(err) + log.Error("error connecting to pserver", log.Ctx{"error": err}) // connect to addr failed, set // to last known addr in order diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go index c3d88e926d7..ec832305ee8 100644 --- a/go/pserver/client/client_test.go +++ b/go/pserver/client/client_test.go @@ -30,7 +30,7 @@ import ( "github.com/PaddlePaddle/Paddle/go/pserver" "github.com/PaddlePaddle/Paddle/go/pserver/client" "github.com/coreos/etcd/clientv3" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) const ( @@ -90,7 +90,7 @@ func initEtcdClient() { DialTimeout: time.Second * time.Duration(1), }) if err != nil { - log.Errorf("err %v", err) + log.Error("error init etcd client", log.Ctx{"error": err}) } ctx, cancel := context.WithTimeout(context.Background(), timeout) _, err = client.Delete(ctx, pserver.PsDesired) diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go index f9071caaa8f..16d0c3b9430 100644 --- a/go/pserver/client/etcd_client.go +++ b/go/pserver/client/etcd_client.go @@ -25,7 +25,7 @@ import ( "github.com/PaddlePaddle/Paddle/go/pserver" "github.com/coreos/etcd/clientv3" "github.com/coreos/etcd/clientv3/concurrency" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) const ( @@ -54,26 +54,29 @@ func (e *Etcd) Desired() int { resp, err := e.client.Get(ctx, pserver.PsDesired) cancel() if err != nil { - log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err) + log.Error( + "Get ps dresire number failed! reconnecting...", + log.Ctx{"error": err}, + ) time.Sleep(e.timeout) continue } kvs := resp.Kvs if len(kvs) == 0 { - log.Infoln("Waiting for ps desired registered ...") + log.Info("Waiting for ps desired registered ...") time.Sleep(e.timeout) continue } psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { - log.Errorf("psDesired %d invalid %v", psDesired, err) + log.Error("atoi failed", log.Ctx{"error": err}) time.Sleep(e.timeout) continue } - log.Debugf("Get psDesired number: %d", psDesired) + log.Debug("Got psDesired", log.Ctx{"psDesired": psDesired}) break } return psDesired @@ -88,17 +91,20 @@ func (e *Etcd) List() []Server { for i := 0; i < psDesired; i++ { ctx, cancel := context.WithTimeout(context.Background(), e.timeout) psKey := pserver.PsPath + strconv.Itoa(i) - log.Debugf("checking %s", psKey) + log.Debug("looking for pserver", log.Ctx{"ps key": psKey}) resp, err := e.client.Get(ctx, psKey) cancel() if err != nil { - log.Infof("Get psKey= %s error, %v", psKey, err) + log.Info( + "Get psKey error", + log.Ctx{"ps key": psKey, "error": err}, + ) time.Sleep(e.timeout) continue } kvs := resp.Kvs if len(kvs) == 0 { - log.Infof("Waiting for ps addr registered ...") + log.Info("Waiting for ps addr registered ...") time.Sleep(e.timeout) continue } @@ -106,11 +112,17 @@ func (e *Etcd) List() []Server { psAddr := string(resp.Kvs[0].Value) // TODO(Longfei) check the ps address if psAddr == "" { - log.Infof("Get psKey = %s, psAddr is empty", psKey) + log.Info( + "Value under psKey is empty", + log.Ctx{"psKey": psKey}, + ) time.Sleep(e.timeout) continue } - log.Debugf("got value (%s) for key: %s", psAddr, psKey) + log.Debug( + "got psAddr given psKey", + log.Ctx{"psAddr": psAddr, "psKey": psKey}, + ) servers[i].Index = i servers[i].Addr = psAddr } @@ -130,13 +142,13 @@ func NewEtcd(endpoints string) *Etcd { DialTimeout: defaultEtcdTimeout, }) if err != nil { - log.Errorf("Init etcd connection failed: %v", err) + log.Error("Init etcd connection failed", log.Ctx{"error": err}) time.Sleep(defaultEtcdTimeout) continue } break } - log.Infof("Connected to etcd: %s\n", endpoints) + log.Info("Connected to etcd endpoint", log.Ctx{"endpoint": endpoints}) client := &Etcd{ client: cli, timeout: defaultEtcdTimeout, @@ -154,7 +166,7 @@ func (e *Etcd) Select() (bool, error) { } lock := concurrency.NewMutex(sess, initLockPath) - log.Infof("Trying to acquire lock at %s.", initLockPath) + log.Info("Trying to acquire lock", log.Ctx{"lock path": initLockPath}) // Do not use timeout context here, since we don't know how // long does it take for other trainers to initialize the // parameters. @@ -162,7 +174,7 @@ func (e *Etcd) Select() (bool, error) { if err != nil { return false, err } - log.Infof("Successfully acquired lock at %s.", initLockPath) + log.Info("Successfully acquired lock", log.Ctx{"lock path": initLockPath}) get := clientv3.OpGet(initDonePath) ctx, cancel := context.WithTimeout(context.Background(), e.timeout) @@ -181,17 +193,17 @@ func (e *Etcd) Select() (bool, error) { if len(resp.Kvs) == 0 { // Key value not set, select current trainer. e.lock = lock - log.Infoln("Trainer selected.") + log.Info("Trainer selected.") return true, nil } if string(resp.Kvs[0].Value) == initDoneVal { - log.Infoln("Initialization is already done.") + log.Info("Initialization is already done.") ctx, cancel = context.WithTimeout(context.Background(), e.timeout) err = lock.Unlock(ctx) cancel() if err != nil { - log.Errorln(err) + log.Error("error unlocking", log.Ctx{"error": err}) } return false, nil } @@ -221,7 +233,7 @@ func (e *Etcd) Done() error { err = e.lock.Unlock(ctx) cancel() if err != nil { - log.Errorln(err) + log.Error("error unlocking", log.Ctx{"error": err}) } else { e.lock = nil } @@ -244,7 +256,7 @@ func (e *Etcd) Close() error { cErr := e.client.Close() if cErr != nil { if err != nil { - log.Errorln(cErr) + log.Error("error closing etcd client", log.Ctx{"error": cErr}) return err } return cErr diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 41f0640fc09..08ddb247f26 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -24,7 +24,7 @@ import ( "github.com/PaddlePaddle/Paddle/go/utils/networkhelper" "github.com/coreos/etcd/clientv3" "github.com/coreos/etcd/clientv3/concurrency" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) const ( @@ -82,19 +82,19 @@ func (e *EtcdClient) Register(port int) (int, error) { DialTimeout: e.dialTimeout, }) if err != nil { - log.Errorf("connect to etcd error: %v", err) + log.Error("connect to etcd error", log.Ctx{"error": err}) time.Sleep(retryTimeout) continue } e.client = cli sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec)) if err != nil { - log.Errorf("create etcd session error: %v", err) + log.Error("create etcd session error", log.Ctx{"error": err}) time.Sleep(retryTimeout) continue } e.sess = sess - log.Debugf("inited client to %s", e.endpoints) + log.Debug("connected to etcd", log.Ctx{"endpoint": e.endpoints}) break } // init /ps_desired using transaction, for multiple pservers may want to write @@ -104,7 +104,7 @@ func (e *EtcdClient) Register(port int) (int, error) { _, err := e.initDesiredPservers(ctx, e.numPservers) cancel() if err != nil { - log.Warn(err) + log.Warn("pserver init error", log.Ctx{"error": err, "num pservers": e.numPservers}) time.Sleep(retryTimeout) continue } @@ -119,14 +119,17 @@ func (e *EtcdClient) Register(port int) (int, error) { resp, err := e.client.Get(ctx, PsDesired) cancel() if err != nil { - log.Errorf("getting %s error: %v", PsDesired, err) + log.Error("get etcd key error", log.Ctx{"key": PsDesired, "error": err}) time.Sleep(retryTimeout) continue } if len(resp.Kvs) != 0 { e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { - log.Errorf("value of %s invalid %v\n", PsDesired, err) + log.Error( + "psDesired atoi error", + log.Ctx{"error": err, "value": string(resp.Kvs[0].Value)}, + ) time.Sleep(retryTimeout) // NOTE: wait util ps_desired value change continue @@ -143,7 +146,7 @@ func (e *EtcdClient) Register(port int) (int, error) { pserverIdx, err = e.registerPserverEtcd(ctx, port) cancel() if err != nil { - log.Warn(err) + log.Warn("register pserver on etcd error", log.Ctx{"error": err}) time.Sleep(retryTimeout) continue } @@ -170,16 +173,17 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er registered := false for i := 0; i < e.desired; i++ { psKey := PsPath + strconv.Itoa(i) - log.Debugf("checking %s", psKey) ps := c.Get(psKey) - log.Debugf("got value (%s) for key: %s", ps, psKey) + log.Debug( + "register pserver got value", + log.Ctx{"value": ps, "key": psKey}, + ) if ps == "" { // find the first id and write info pserverAddr := e.externalIP + ":" + strconv.Itoa(port) c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease())) - log.Debugf("set pserver node %s with value %s", psKey, pserverAddr) - log.Debug("register finished") + log.Debug("register finished", log.Ctx{"key": psKey, "value": pserverAddr}) idx = i registered = true break @@ -239,7 +243,7 @@ func (e *EtcdClient) Shutdown() error { newErr := e.client.Close() if newErr != nil { if err != nil { - log.Errorln(newErr) + log.Error("shutdown error", log.Ctx{"error": newErr}) } else { err = newErr } diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index 51ffba5c74d..e04c86de0a9 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -25,7 +25,7 @@ import ( "fmt" "unsafe" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) type optimizer struct { @@ -56,12 +56,12 @@ func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer c := paramWithConfigs.Config s := State paramBufferSize := C.size_t(len(p.Content)) - log.WithFields(log.Fields{ + log.Info("New Optimizer Created with config", log.Ctx{ "ElementType": p.ElementType, "ParamSize": paramBufferSize, "ConfigSize": len(c), "StateSize": len(s), - }).Info("New Optimizer Created with config:") + }) var cbuffer unsafe.Pointer cbuffer = C.malloc(paramBufferSize) diff --git a/go/pserver/service.go b/go/pserver/service.go index 29e953acdd8..b6acdc705b9 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -32,7 +32,7 @@ import ( uuid "github.com/satori/go.uuid" - log "github.com/sirupsen/logrus" + log "github.com/inconshreveable/log15" ) // ElementType is the type of elements of a Parameter. @@ -209,7 +209,7 @@ func (s *Service) FinishInitParams(_ int, _ *int) error { for range t { err := s.checkpoint() if err != nil { - log.Errorln(err) + log.Error("finish init params error", log.Ctx{"error": err}) } } }() @@ -262,7 +262,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error { func traceTime(start time.Time, name string) { elapsed := time.Since(start) - log.Infof("%s took %v", name, elapsed) + log.Info("time elapsed", log.Ctx{"name": name, "elapsed": elapsed}) } // checkpoint saves checkpoint to disk. @@ -270,7 +270,7 @@ func traceTime(start time.Time, name string) { // checkpoint should be only called after the parameters are // initialized. func (s *Service) checkpoint() (err error) { - log.Infoln("Begin save checkpoint.") + log.Info("Begin save checkpoint.") defer traceTime(time.Now(), "save checkpoint") s.mu.Lock() @@ -315,7 +315,7 @@ func (s *Service) checkpoint() (err error) { closeErr := f.Close() if closeErr != nil { if err != nil { - log.Errorln(closeErr) + log.Error("error close checkpoint file", log.Ctx{"error": closeErr}) } else { // Set closeErr as return value. err = closeErr @@ -336,7 +336,7 @@ func (s *Service) checkpoint() (err error) { oldMeta, err := loadMeta(s.client, s.idx) if err == ErrCheckpointNotFound { - log.Infoln("Do not have existing checkpoint.") + log.Info("Do not have existing checkpoint.") err = nil } @@ -368,7 +368,7 @@ func (s *Service) checkpoint() (err error) { if rmErr != nil { // log error, but still treat checkpoint as // successful. - log.Errorln(rmErr) + log.Error("remove old meta file error", log.Ctx{"error": rmErr}) } } -- GitLab