From 5a0382e8a3e5ac3c4ea17b20a29411b50443bb69 Mon Sep 17 00:00:00 2001 From: AJ ONeal Date: Thu, 4 Jul 2019 19:54:51 -0600 Subject: [PATCH] v1.2.2: improved change detection --- watchdog.go | 166 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 95 insertions(+), 71 deletions(-) diff --git a/watchdog.go b/watchdog.go index 05193f6..ffc98d0 100644 --- a/watchdog.go +++ b/watchdog.go @@ -33,6 +33,12 @@ func (s Status) String() string { } } +const ( + MessageDown = "went down" + MessageUp = "came back up" + MessageHiccup = "hiccupped" +) + type Dog struct { Watchdog string Name string @@ -47,15 +53,15 @@ type Dog struct { status Status changed bool error error - failures int - passes int - lastFailed time.Time - lastPassed time.Time - lastNotified time.Time + //failures int + //passes int + //lastFailed time.Time + //lastPassed time.Time + //lastNotified time.Time } func New(d *Dog) *Dog { - d.lastPassed = time.Now().Add(-5 * time.Minute) + //d.lastPassed = time.Now().Add(-5 * time.Minute) d.status = StatusUp d.changed = false return d @@ -75,90 +81,82 @@ func (d *Dog) Watch() { func (d *Dog) watch() { d.Logger <- fmt.Sprintf("Check: '%s'", d.Name) - err := d.check() + // This may be up or down + err := d.hardcheck() if nil == err { + d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) + // if it's down, coming up, notify if d.changed { - d.notify("came back up") + d.notify(MessageUp) } return } - changed := d.changed - time.Sleep(time.Duration(5) * time.Second) - - err2 := d.check() - if nil != err2 { - d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2) - } else { - d.notify("hiccuped") - d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err) - return + // If being down is a change, check to see if it's just a hiccup + if d.changed { + time.Sleep(time.Duration(5) * time.Second) + err2 := d.softcheck() + if nil != err2 { + // it's really down + d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2) + } else { + // it's not really down, so reset the change info + d.changed = false + d.status = StatusUp + // and notify of the hiccup + d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err) + d.notify(MessageHiccup) + return + } } + // TODO what if the server is flip-flopping rapidly? + // how to rate limit? + // "{{ .Server }} is on cooldown for 30 minutes" + + // * We've had success since the last notification + // * It's been at least 5 minutes since the last notification + //fiveMinutesAgo := time.Now().Add(-5 * time.Minute) + //if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) { + //} + t := 10 for { + // try to recover, then backoff exponentially d.recover() time.Sleep(time.Duration(t) * time.Second) - // backoff t *= 2 - err := d.check() - if nil != err { - d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err) + if t > 120 { + t = 120 } - // We should notify if - // * The status has changed - // - // TODO what if the server is flip-flopping rapidly? - // how to rate limit? - // "{{ .Server }} is on cooldown for 30 minutes" - if changed || d.changed { - changed = false - if StatusUp == d.status { - d.notify("came back up") - break - } else { - d.notify("went down") + err := d.softcheck() + if nil != err { + // this is down, and we know it's down + d.status = StatusDown + d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err) + if d.changed { + d.changed = false + d.notify(MessageDown) } - - // * We've had success since the last notification - // * It's been at least 5 minutes since the last notification - //fiveMinutesAgo := time.Now().Add(-5 * time.Minute) - //if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) { - //} - //if !failure || d.failures >= 5 { - // go back to the main 5-minute loop - // break - //} + } else { + // it came back up + d.status = StatusUp + d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) + if d.changed { + // and the downtime was short - just a recovery + d.notify(MessageHiccup) + } else { + // and the downtime was some time + d.notify(MessageUp) + } + d.changed = false + break } } } -func (d *Dog) check() error { - previousStatus := d.status - - var err error - defer func() { - // Are we up, or down? - if nil != err { - d.status = StatusDown - d.failures += 1 - d.lastFailed = time.Now() - } else { - d.status = StatusUp - d.lastPassed = time.Now() - d.passes += 1 - d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) - } - - // Has that changed? - if previousStatus != d.status { - d.changed = true - } else { - d.changed = false - } - }() - +func (d *Dog) softcheck() error { client := NewHTTPClient() response, err := client.Get(d.CheckURL) if nil != err { @@ -192,6 +190,32 @@ func (d *Dog) check() error { return nil } +func (d *Dog) hardcheck() error { + previousStatus := d.status + + err := d.softcheck() + + // Are we up, or down? + if nil != err { + d.status = StatusDown + //d.failures += 1 + //d.lastFailed = time.Now() + } else { + d.status = StatusUp + //d.lastPassed = time.Now() + //d.passes += 1 + } + + // Has that changed? + if previousStatus != d.status { + d.changed = true + } else { + d.changed = false + } + + return err +} + func (d *Dog) recover() { if "" == d.Recover { return @@ -221,7 +245,7 @@ func (d *Dog) recover() { func (d *Dog) notify(msg string) { d.Logger <- fmt.Sprintf("Notifying the authorities of %s's status change", d.Name) - d.lastNotified = time.Now() + //d.lastNotified = time.Now() for i := range d.Webhooks { name := d.Webhooks[i]