|
|
@ -33,6 +33,12 @@ func (s Status) String() string { |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// Human-readable status-change phrases handed to (*Dog).notify.
const (
	// MessageDown is sent when a watched service is confirmed down.
	MessageDown = "went down"

	// MessageUp is sent when a watched service passes a check again after
	// having been down.
	MessageUp = "came back up"

	// MessageHiccup is sent when a failed check is followed shortly by a
	// passing one, i.e. the outage was only momentary.
	MessageHiccup = "hiccupped"
)
|
|
|
|
|
|
|
type Dog struct { |
|
|
|
Watchdog string |
|
|
|
Name string |
|
|
@ -47,15 +53,15 @@ type Dog struct { |
|
|
|
status Status |
|
|
|
changed bool |
|
|
|
error error |
|
|
|
failures int |
|
|
|
passes int |
|
|
|
lastFailed time.Time |
|
|
|
lastPassed time.Time |
|
|
|
lastNotified time.Time |
|
|
|
//failures int
|
|
|
|
//passes int
|
|
|
|
//lastFailed time.Time
|
|
|
|
//lastPassed time.Time
|
|
|
|
//lastNotified time.Time
|
|
|
|
} |
|
|
|
|
|
|
|
func New(d *Dog) *Dog { |
|
|
|
d.lastPassed = time.Now().Add(-5 * time.Minute) |
|
|
|
//d.lastPassed = time.Now().Add(-5 * time.Minute)
|
|
|
|
d.status = StatusUp |
|
|
|
d.changed = false |
|
|
|
return d |
|
|
@ -75,90 +81,82 @@ func (d *Dog) Watch() { |
|
|
|
func (d *Dog) watch() { |
|
|
|
d.Logger <- fmt.Sprintf("Check: '%s'", d.Name) |
|
|
|
|
|
|
|
err := d.check() |
|
|
|
// This may be up or down
|
|
|
|
err := d.hardcheck() |
|
|
|
if nil == err { |
|
|
|
d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) |
|
|
|
// if it's down, coming up, notify
|
|
|
|
if d.changed { |
|
|
|
d.notify("came back up") |
|
|
|
d.notify(MessageUp) |
|
|
|
} |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
changed := d.changed |
|
|
|
time.Sleep(time.Duration(5) * time.Second) |
|
|
|
|
|
|
|
err2 := d.check() |
|
|
|
if nil != err2 { |
|
|
|
d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2) |
|
|
|
} else { |
|
|
|
d.notify("hiccuped") |
|
|
|
d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err) |
|
|
|
return |
|
|
|
// If being down is a change, check to see if it's just a hiccup
|
|
|
|
if d.changed { |
|
|
|
time.Sleep(time.Duration(5) * time.Second) |
|
|
|
err2 := d.softcheck() |
|
|
|
if nil != err2 { |
|
|
|
// it's really down
|
|
|
|
d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2) |
|
|
|
} else { |
|
|
|
// it's not really down, so reset the change info
|
|
|
|
d.changed = false |
|
|
|
d.status = StatusUp |
|
|
|
// and notify of the hiccup
|
|
|
|
d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err) |
|
|
|
d.notify(MessageHiccup) |
|
|
|
return |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// TODO what if the server is flip-flopping rapidly?
|
|
|
|
// how to rate limit?
|
|
|
|
// "{{ .Server }} is on cooldown for 30 minutes"
|
|
|
|
|
|
|
|
// * We've had success since the last notification
|
|
|
|
// * It's been at least 5 minutes since the last notification
|
|
|
|
//fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
|
|
|
|
//if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) {
|
|
|
|
//}
|
|
|
|
|
|
|
|
t := 10 |
|
|
|
for { |
|
|
|
// try to recover, then backoff exponentially
|
|
|
|
d.recover() |
|
|
|
time.Sleep(time.Duration(t) * time.Second) |
|
|
|
// backoff
|
|
|
|
t *= 2 |
|
|
|
err := d.check() |
|
|
|
if nil != err { |
|
|
|
d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err) |
|
|
|
if t > 120 { |
|
|
|
t = 120 |
|
|
|
} |
|
|
|
|
|
|
|
// We should notify if
|
|
|
|
// * The status has changed
|
|
|
|
//
|
|
|
|
// TODO what if the server is flip-flopping rapidly?
|
|
|
|
// how to rate limit?
|
|
|
|
// "{{ .Server }} is on cooldown for 30 minutes"
|
|
|
|
if changed || d.changed { |
|
|
|
changed = false |
|
|
|
if StatusUp == d.status { |
|
|
|
d.notify("came back up") |
|
|
|
break |
|
|
|
} else { |
|
|
|
d.notify("went down") |
|
|
|
} |
|
|
|
|
|
|
|
// * We've had success since the last notification
|
|
|
|
// * It's been at least 5 minutes since the last notification
|
|
|
|
//fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
|
|
|
|
//if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) {
|
|
|
|
//}
|
|
|
|
//if !failure || d.failures >= 5 {
|
|
|
|
// go back to the main 5-minute loop
|
|
|
|
// break
|
|
|
|
//}
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func (d *Dog) check() error { |
|
|
|
previousStatus := d.status |
|
|
|
|
|
|
|
var err error |
|
|
|
defer func() { |
|
|
|
// Are we up, or down?
|
|
|
|
err := d.softcheck() |
|
|
|
if nil != err { |
|
|
|
// this is down, and we know it's down
|
|
|
|
d.status = StatusDown |
|
|
|
d.failures += 1 |
|
|
|
d.lastFailed = time.Now() |
|
|
|
d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err) |
|
|
|
if d.changed { |
|
|
|
d.changed = false |
|
|
|
d.notify(MessageDown) |
|
|
|
} |
|
|
|
} else { |
|
|
|
// it came back up
|
|
|
|
d.status = StatusUp |
|
|
|
d.lastPassed = time.Now() |
|
|
|
d.passes += 1 |
|
|
|
d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) |
|
|
|
} |
|
|
|
|
|
|
|
// Has that changed?
|
|
|
|
if previousStatus != d.status { |
|
|
|
d.changed = true |
|
|
|
} else { |
|
|
|
if d.changed { |
|
|
|
// and the downtime was short - just a recovery
|
|
|
|
d.notify(MessageHiccup) |
|
|
|
} else { |
|
|
|
// and the downtime was some time
|
|
|
|
d.notify(MessageUp) |
|
|
|
} |
|
|
|
d.changed = false |
|
|
|
break |
|
|
|
} |
|
|
|
}() |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func (d *Dog) softcheck() error { |
|
|
|
client := NewHTTPClient() |
|
|
|
response, err := client.Get(d.CheckURL) |
|
|
|
if nil != err { |
|
|
@ -192,6 +190,32 @@ func (d *Dog) check() error { |
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func (d *Dog) hardcheck() error { |
|
|
|
previousStatus := d.status |
|
|
|
|
|
|
|
err := d.softcheck() |
|
|
|
|
|
|
|
// Are we up, or down?
|
|
|
|
if nil != err { |
|
|
|
d.status = StatusDown |
|
|
|
//d.failures += 1
|
|
|
|
//d.lastFailed = time.Now()
|
|
|
|
} else { |
|
|
|
d.status = StatusUp |
|
|
|
//d.lastPassed = time.Now()
|
|
|
|
//d.passes += 1
|
|
|
|
} |
|
|
|
|
|
|
|
// Has that changed?
|
|
|
|
if previousStatus != d.status { |
|
|
|
d.changed = true |
|
|
|
} else { |
|
|
|
d.changed = false |
|
|
|
} |
|
|
|
|
|
|
|
return err |
|
|
|
} |
|
|
|
|
|
|
|
func (d *Dog) recover() { |
|
|
|
if "" == d.Recover { |
|
|
|
return |
|
|
@ -221,7 +245,7 @@ func (d *Dog) recover() { |
|
|
|
|
|
|
|
func (d *Dog) notify(msg string) { |
|
|
|
d.Logger <- fmt.Sprintf("Notifying the authorities of %s's status change", d.Name) |
|
|
|
d.lastNotified = time.Now() |
|
|
|
//d.lastNotified = time.Now()
|
|
|
|
|
|
|
|
for i := range d.Webhooks { |
|
|
|
name := d.Webhooks[i] |
|
|
|