diff --git a/src/modules/collector/stra/cron.go b/src/modules/collector/stra/cron.go index 0f5d0a9ff1fdddf577517360ef8814c3db4d010b..a46bddd006a580b28575773362cd4c6d1778a865 100644 --- a/src/modules/collector/stra/cron.go +++ b/src/modules/collector/stra/cron.go @@ -74,7 +74,7 @@ func getCollects() (CollectResp, error) { url := fmt.Sprintf("http://%s%s%s", addr, StraConfig.Api, identity.Identity) err = httplib.Get(url).SetTimeout(time.Duration(StraConfig.Timeout) * time.Millisecond).ToJSON(&res) if err != nil { - err = fmt.Errorf("get collects from remote failed, error:%v", err) + err = fmt.Errorf("get collects from remote:%s failed, error:%v", url, err) } return res, err diff --git a/src/modules/collector/sys/funcs/push.go b/src/modules/collector/sys/funcs/push.go index fde8301987cb0d87f3f61f717bcf5a09dd1f555e..bf9e732a10dcff9076f5378eec7fc32089542075 100644 --- a/src/modules/collector/sys/funcs/push.go +++ b/src/modules/collector/sys/funcs/push.go @@ -48,7 +48,9 @@ func Push(items []*dataobj.MetricValue) { logger.Error(err) continue } else { - logger.Info("push succ, reply: ", reply) + if reply.Msg != "ok" { + logger.Error("some item push err", reply) + } return } } diff --git a/src/modules/index/cache/endpoints.go b/src/modules/index/cache/endpoints.go index 29d588a48f8146c6135e821f78278d495dddcb95..aae9e2cbc00d43523bd817bc9bf63fa7e9b8c3ad 100644 --- a/src/modules/index/cache/endpoints.go +++ b/src/modules/index/cache/endpoints.go @@ -6,6 +6,7 @@ import ( "time" "github.com/didi/nightingale/src/toolkits/address" + "github.com/didi/nightingale/src/toolkits/stats" "github.com/toolkits/pkg/concurrent/semaphore" "github.com/toolkits/pkg/logger" @@ -49,11 +50,12 @@ func reportEndpoint(endpoints []interface{}) { err := httplib.Post(url).JSONBodyQuiet(m).SetTimeout(3*time.Second).Header("x-srv-token", "monapi-builtin-token").ToJSON(&body) if err != nil { logger.Warningf("curl %s fail: %v. retry", url, err) + stats.Counter.Set("report.endpoint.err", 1) continue } - - if body.Err != "" { + if body.Err != "" { //数据库连接出错会出现此情况 logger.Warningf("curl %s fail: %s. retry", url, body.Err) + stats.Counter.Set("report.endpoint.err", 1) continue } diff --git a/src/modules/index/cache/indexdb.go b/src/modules/index/cache/indexdb.go index 6eacd61bfd7d910ba750a28abd1e9ffe723d98da..51a28a225da27fb82acfd3de2d37c1d35aac5224 100644 --- a/src/modules/index/cache/indexdb.go +++ b/src/modules/index/cache/indexdb.go @@ -19,6 +19,7 @@ import ( "github.com/didi/nightingale/src/toolkits/compress" "github.com/didi/nightingale/src/toolkits/identity" "github.com/didi/nightingale/src/toolkits/report" + "github.com/didi/nightingale/src/toolkits/stats" ) type CacheSection struct { @@ -72,8 +73,8 @@ func StartPersist(interval int) { err := Persist("normal") if err != nil { logger.Error("Persist err:", err) + stats.Counter.Set("persist.err", 1) } - //logger.Infof("clean %+v, took %.2f ms\n", cleanRet, float64(time.Since(start).Nanoseconds())*1e-6) } } diff --git a/src/modules/judge/backend/redi/funcs.go b/src/modules/judge/backend/redi/funcs.go index 933f7d682e5dc2d1339133f8120be3ad3ca20eb3..d671bc8384c5331be6d153ccddb46b1a5d3ebd33 100644 --- a/src/modules/judge/backend/redi/funcs.go +++ b/src/modules/judge/backend/redi/funcs.go @@ -42,6 +42,5 @@ func Push(event *dataobj.Event) error { return nil } - stats.Counter.Set("redis.failed", 1) return fmt.Errorf("redis publish failed finally:%v", err) } diff --git a/src/modules/judge/backend/redi/redis.go b/src/modules/judge/backend/redi/redis.go index c1359e79a8806d95cdc0ba7659022ffd66e64397..8b167630d79706e4db16d711013d9d9b5c62ae1c 100644 --- a/src/modules/judge/backend/redi/redis.go +++ b/src/modules/judge/backend/redi/redis.go @@ -4,6 +4,7 @@ import ( "log" "time" + "github.com/didi/nightingale/src/toolkits/stats" "github.com/garyburd/redigo/redis" "github.com/toolkits/pkg/logger" ) @@ -44,6 +45,7 @@ func Init(cfg RedisSection) { c, err := redis.Dial("tcp", addr, redis.DialConnectTimeout(connTimeout), redis.DialReadTimeout(readTimeout), redis.DialWriteTimeout(writeTimeout)) if err != nil { logger.Errorf("conn redis err:%v", err) + stats.Counter.Set("redis.conn.failed", 1) return nil, err } @@ -51,6 +53,8 @@ func Init(cfg RedisSection) { if _, err := c.Do("AUTH", pass); err != nil { c.Close() logger.Errorf("ERR: redis auth fail:%v", err) + stats.Counter.Set("redis.conn.failed", 1) + return nil, err } } diff --git a/src/modules/judge/judge/judge.go b/src/modules/judge/judge/judge.go index 50e566a3f3abff24276974101be2a6d69613951e..e50c935c578b6d18864e751f4e7a28ceb382b690 100644 --- a/src/modules/judge/judge/judge.go +++ b/src/modules/judge/judge/judge.go @@ -65,7 +65,7 @@ func Judge(stra *model.Stra, exps []model.Exp, historyData []*dataobj.RRDData, f stats.Counter.Set("running", 1) if len(exps) < 1 { - stats.Counter.Set("stra.err", 1) + stats.Counter.Set("stra.illegal", 1) logger.Warningf("stra:%v exp is null", stra) return } @@ -421,6 +421,7 @@ func sendEvent(event *dataobj.Event) { err := redi.Push(event) if err != nil { + stats.Counter.Set("redis.push.failed", 1) logger.Errorf("push event:%v err:%v", event, err) } } diff --git a/src/modules/judge/stra/stra.go b/src/modules/judge/stra/stra.go index 44abeebb314b4305945a369d90d1af8386b2c177..ee930105ca9fce0903866d1a4b5ef4e68403f8e3 100644 --- a/src/modules/judge/stra/stra.go +++ b/src/modules/judge/stra/stra.go @@ -56,11 +56,13 @@ func getStrategy(opts StrategySection) { if err != nil { logger.Warningf("get strategy from remote failed, error:%v", err) + stats.Counter.Set("stra.get.err", 1) continue } if resp.Err != "" { logger.Warningf("get strategy from remote failed, error:%v", resp.Err) + stats.Counter.Set("stra.get.err", 1) continue } diff --git a/src/modules/monapi/cron/checker_judge.go b/src/modules/monapi/cron/checker_judge.go index 21483a8f71e9ed417794d6bff166505fa0dc8a06..96db8b31615980db04ee8b271a8f3dc661ad5315 100644 --- a/src/modules/monapi/cron/checker_judge.go +++ b/src/modules/monapi/cron/checker_judge.go @@ -11,6 +11,7 @@ import ( "github.com/didi/nightingale/src/model" "github.com/didi/nightingale/src/modules/monapi/config" "github.com/didi/nightingale/src/modules/monapi/scache" + "github.com/didi/nightingale/src/toolkits/stats" ) func CheckJudgeLoop() { @@ -19,6 +20,7 @@ func CheckJudgeLoop() { time.Sleep(duration) err := CheckJudge() if err != nil { + stats.Counter.Set("get.judge.err", 1) logger.Error("check judge fail: ", err) } } diff --git a/src/modules/monapi/redisc/redis.go b/src/modules/monapi/redisc/redis.go index c310c59db67de10a75b1b25f2572be04db726283..576e34552b0480178870fb5996b5f411c43a2c85 100644 --- a/src/modules/monapi/redisc/redis.go +++ b/src/modules/monapi/redisc/redis.go @@ -7,6 +7,7 @@ import ( "github.com/toolkits/pkg/logger" "github.com/didi/nightingale/src/modules/monapi/config" + "github.com/didi/nightingale/src/toolkits/stats" ) var RedisConnPool *redis.Pool @@ -29,6 +30,8 @@ func InitRedis() { Dial: func() (redis.Conn, error) { c, err := redis.Dial("tcp", addr, redis.DialConnectTimeout(connTimeout), redis.DialReadTimeout(readTimeout), redis.DialWriteTimeout(writeTimeout)) if err != nil { + logger.Errorf("conn redis err:%v", err) + stats.Counter.Set("redis.conn.failed", 1) return nil, err } @@ -36,6 +39,7 @@ func InitRedis() { if _, err := c.Do("AUTH", pass); err != nil { c.Close() logger.Error("redis auth fail, pass: ", pass) + stats.Counter.Set("redis.conn.failed", 1) return nil, err } } diff --git a/src/modules/transfer/backend/query.go b/src/modules/transfer/backend/query.go index b028b3171f0046e39bd3bd06a94bc1f246df0eb1..34bb9a3f7935af6c74eb8ce26f4b234162e2969d 100644 --- a/src/modules/transfer/backend/query.go +++ b/src/modules/transfer/backend/query.go @@ -12,6 +12,7 @@ import ( "github.com/didi/nightingale/src/dataobj" "github.com/didi/nightingale/src/modules/transfer/calc" "github.com/didi/nightingale/src/toolkits/address" + "github.com/didi/nightingale/src/toolkits/stats" "github.com/toolkits/pkg/logger" "github.com/toolkits/pkg/net/httplib" @@ -161,10 +162,12 @@ func fetchDataSync(start, end int64, consolFun, endpoint, counter string, step i defer func() { <-worker }() + stats.Counter.Set("query.tsdb", 1) data, err := fetchData(start, end, consolFun, endpoint, counter, step) if err != nil { logger.Warning(err) + stats.Counter.Set("query.data.err", 1) } dataChan <- data return diff --git a/src/modules/transfer/backend/sender.go b/src/modules/transfer/backend/sender.go index 4dc1f4e53c50ee7ba266394fccf4dd663a669adf..f28dde1894ddef8a1f92428ff58d1f6cba68166f 100644 --- a/src/modules/transfer/backend/sender.go +++ b/src/modules/transfer/backend/sender.go @@ -107,6 +107,7 @@ func Send2TsdbTask(Q *list.SafeListLimited, node string, addr string, concurrent // 将数据 打入 某个Tsdb的发送缓存队列, 具体是哪一个Tsdb 由一致性哈希 决定 func Push2TsdbSendQueue(items []*dataobj.MetricValue) { + errCnt := 0 for _, item := range items { tsdbItem := convert2TsdbItem(item) stats.Counter.Set("tsdb.queue.push", 1) @@ -118,19 +119,18 @@ func Push2TsdbSendQueue(items []*dataobj.MetricValue) { } cnode := Config.ClusterList[node] - errCnt := 0 for _, addr := range cnode.Addrs { Q := TsdbQueues[node+addr] if !Q.PushFront(tsdbItem) { errCnt += 1 } } + } - // statistics - if errCnt > 0 { - stats.Counter.Set("tsdb.queue.err", errCnt) - logger.Error("Push2TsdbSendQueue err num: ", errCnt) - } + // statistics + if errCnt > 0 { + stats.Counter.Set("tsdb.queue.err", errCnt) + logger.Error("Push2TsdbSendQueue err num: ", errCnt) } } @@ -172,7 +172,7 @@ func Send2JudgeTask(Q *list.SafeListLimited, addr string, concurrent int) { if !sendOk { stats.Counter.Set("points.out.judge.err", 1) - logger.Errorf("send judge %s fail: %v", addr, err) + logger.Errorf("send %v to judge %s fail: %v", judgeItems, addr, err) } }(addr, judgeItems, count) @@ -180,6 +180,7 @@ func Send2JudgeTask(Q *list.SafeListLimited, addr string, concurrent int) { } func Push2JudgeSendQueue(items []*dataobj.MetricValue) { + errCnt := 0 for _, item := range items { key := str.PK(item.Metric, item.Endpoint) stras := cache.StraMap.GetByKey(key) @@ -203,11 +204,13 @@ func Push2JudgeSendQueue(items []*dataobj.MetricValue) { q, exists := JudgeQueues.Get(stra.JudgeInstance) if exists { - q.PushFront(judgeItem) + if !q.PushFront(judgeItem) { + errCnt += 1 + } } } - } + stats.Counter.Set("judge.queue.err", errCnt) } // 打到Tsdb的数据,要根据rrdtool的特定 来限制 step、counterType、timestamp diff --git a/src/modules/transfer/rpc/push.go b/src/modules/transfer/rpc/push.go index 70b10c0017406713e87f4afbe2dc5b9ea0a3f3dc..dddd38dfd2598ef56297719a7102781fdfbfe0d5 100644 --- a/src/modules/transfer/rpc/push.go +++ b/src/modules/transfer/rpc/push.go @@ -27,9 +27,10 @@ func (t *Transfer) Push(args []*dataobj.MetricValue, reply *dataobj.TransferResp err := v.CheckValidity() if err != nil { stats.Counter.Set("points.in.err", 1) - logger.Warningf("item is illegal item:%s err:%v", v, err) + msg := fmt.Sprintf("item is illegal item:%s err:%v", v, err) + logger.Warningf(msg) reply.Invalid += 1 - reply.Msg += fmt.Sprintf("%v\n", err) + reply.Msg += msg continue } diff --git a/src/modules/tsdb/index/index.go b/src/modules/tsdb/index/index.go index 661fb1430fa4617e061383db22bef7aed6f36ed9..b68cdf1bcd0551c944edcf21300bfde63514bb69 100644 --- a/src/modules/tsdb/index/index.go +++ b/src/modules/tsdb/index/index.go @@ -47,7 +47,7 @@ func GetIndexLoop() { func GetIndex() { instances, err := report.GetAlive("index", Config.HbsMod) if err != nil { - stats.Counter.Set("index.get.err", 1) + stats.Counter.Set("get.index.err", 1) logger.Warningf("get index list err:%v", err) return } diff --git a/src/modules/tsdb/rpc/push.go b/src/modules/tsdb/rpc/push.go index 280517782cdc813cb87d32ba779760f6451b3399..9bdc3a4ca7b7768547da4e15d7eec61838530564 100644 --- a/src/modules/tsdb/rpc/push.go +++ b/src/modules/tsdb/rpc/push.go @@ -58,7 +58,6 @@ func handleItems(items []*dataobj.TsdbItem) { //todo hash冲突问题需要解决 if err := cache.Caches.Push(item.Key, item.Timestamp, item.Value); err != nil { stats.Counter.Set("points.in.err", 1) - logger.Warningf("push obj error, obj: %v, error: %v\n", items[i], err) fail++ } diff --git a/src/modules/tsdb/rrdtool/sync_disk.go b/src/modules/tsdb/rrdtool/sync_disk.go index d76fe31c995c6baf43448544194ce9ee64c923d1..cd2885e95b27d5e8a41cea28774d0240041a0231 100644 --- a/src/modules/tsdb/rrdtool/sync_disk.go +++ b/src/modules/tsdb/rrdtool/sync_disk.go @@ -231,6 +231,7 @@ func FlushRRD(flushChunks map[interface{}][]*cache.Chunk) { err := FlushFile(seriesID, items) if err != nil { + stats.Counter.Set("flush.rrd.err", 1) logger.Errorf("flush %v data to rrd err:%v", seriesID, err) continue }