diff --git a/cmd/dashboard/main.go b/cmd/dashboard/main.go index f71e767..ac983fc 100644 --- a/cmd/dashboard/main.go +++ b/cmd/dashboard/main.go @@ -50,6 +50,5 @@ func initDB() { func main() { go controller.ServeWeb(dao.Conf.HTTPPort) go rpc.ServeRPC(5555) - go alertmanager.Start() - select {} + alertmanager.Start() } diff --git a/service/alertmanager/alertmanager.go b/service/alertmanager/alertmanager.go index 27d68c8..600446a 100644 --- a/service/alertmanager/alertmanager.go +++ b/service/alertmanager/alertmanager.go @@ -4,6 +4,7 @@ import ( "crypto/md5" "encoding/hex" "fmt" + "log" "sync" "time" @@ -44,7 +45,19 @@ func Start() { alertsLock.Unlock() time.Sleep(time.Second * 10) - go checkStatus() + var lastPrint time.Time + var checkCount uint64 + for { + startedAt := time.Now() + checkStatus() + checkCount++ + if lastPrint.Before(startedAt.Add(-1 * time.Hour)) { + log.Println("报警规则检测每小时", checkCount, "次", startedAt, time.Now()) + checkCount = 0 + lastPrint = startedAt + } + time.Sleep(time.Until(startedAt.Add(time.Second * dao.SnapshotDelay))) + } } func OnRefreshOrAddAlert(alert model.AlertRule) { @@ -101,28 +114,22 @@ func OnDeleteNotification(id uint64) { } func checkStatus() { - startedAt := time.Now() - defer func() { - time.Sleep(time.Until(startedAt.Add(time.Second * dao.SnapshotDelay))) - checkStatus() - }() - alertsLock.RLock() defer alertsLock.RUnlock() dao.ServerLock.RLock() defer dao.ServerLock.RUnlock() - for j := 0; j < len(alerts); j++ { + for _, alert := range alerts { // 跳过未启用 - if alerts[j].Enable == nil || !*alerts[j].Enable { + if alert.Enable == nil || !*alert.Enable { continue } for _, server := range dao.ServerList { // 监测点 - alertsStore[alerts[j].ID][server.ID] = append(alertsStore[alerts[j]. - ID][server.ID], alerts[j].Snapshot(server)) + alertsStore[alert.ID][server.ID] = append(alertsStore[alert. + ID][server.ID], alert.Snapshot(server)) // 发送通知 - max, desc := alerts[j].Check(alertsStore[alerts[j].ID][server.ID]) + max, desc := alert.Check(alertsStore[alert.ID][server.ID]) if desc != "" { nID := getNotificationHash(server, desc) var flag bool @@ -136,7 +143,8 @@ func checkStatus() { nHistory.Duration = time.Hour * 24 } nHistory.Until = time.Now().Add(nHistory.Duration) - dao.Cache.Set(nID, nHistory, nHistory.Duration) + // 缓存有效期加 10 分钟 + dao.Cache.Set(nID, nHistory, nHistory.Duration+time.Minute*10) } } else { // 新提醒直接通知 @@ -147,15 +155,15 @@ func checkStatus() { }, firstNotificationDelay) } if flag { - message := fmt.Sprintf("逮到咯,快去看看!服务器:%s(%s),报警规则:%s,%s", server.Name, server.Host.IP, alerts[j].Name, desc) + message := fmt.Sprintf("逮到咯,快去看看!服务器:%s(%s),报警规则:%s,%s", server.Name, server.Host.IP, alert.Name, desc) go sendNotification(message) } } // 清理旧数据 if max > 0 { - for k := 0; k < len(alertsStore[alerts[j].ID][server.ID]); k++ { - if max < len(alertsStore[alerts[j].ID][server.ID][k]) { - alertsStore[alerts[j].ID][server.ID][k] = alertsStore[alerts[j].ID][server.ID][k][len(alertsStore[alerts[j].ID][server.ID][k])-max:] + for k := 0; k < len(alertsStore[alert.ID][server.ID]); k++ { + if max < len(alertsStore[alert.ID][server.ID][k]) { + alertsStore[alert.ID][server.ID][k] = alertsStore[alert.ID][server.ID][k][len(alertsStore[alert.ID][server.ID][k])-max:] } } }