v1.2.2: improved change detection
This commit is contained in:
parent
e3de4a2ef6
commit
5a0382e8a3
150
watchdog.go
150
watchdog.go
|
@ -33,6 +33,12 @@ func (s Status) String() string {
|
|||
}
|
||||
}
|
||||
|
||||
// Human-readable status-change descriptions handed to (*Dog).notify.
const (
	// MessageDown is sent once a service that failed its check is still
	// failing on a follow-up check.
	MessageDown = "went down"
	// MessageUp is sent when a service that was down passes a check again.
	MessageUp = "came back up"
	// MessageHiccup is sent when a service failed a check but passed the
	// follow-up check shortly afterwards.
	MessageHiccup = "hiccupped"
)
|
||||
|
||||
type Dog struct {
|
||||
Watchdog string
|
||||
Name string
|
||||
|
@ -47,15 +53,15 @@ type Dog struct {
|
|||
status Status
|
||||
changed bool
|
||||
error error
|
||||
failures int
|
||||
passes int
|
||||
lastFailed time.Time
|
||||
lastPassed time.Time
|
||||
lastNotified time.Time
|
||||
//failures int
|
||||
//passes int
|
||||
//lastFailed time.Time
|
||||
//lastPassed time.Time
|
||||
//lastNotified time.Time
|
||||
}
|
||||
|
||||
func New(d *Dog) *Dog {
|
||||
d.lastPassed = time.Now().Add(-5 * time.Minute)
|
||||
//d.lastPassed = time.Now().Add(-5 * time.Minute)
|
||||
d.status = StatusUp
|
||||
d.changed = false
|
||||
return d
|
||||
|
@ -75,90 +81,82 @@ func (d *Dog) Watch() {
|
|||
func (d *Dog) watch() {
|
||||
d.Logger <- fmt.Sprintf("Check: '%s'", d.Name)
|
||||
|
||||
err := d.check()
|
||||
// This may be up or down
|
||||
err := d.hardcheck()
|
||||
if nil == err {
|
||||
d.Logger <- fmt.Sprintf("Up: '%s'", d.Name)
|
||||
// if it's down, coming up, notify
|
||||
if d.changed {
|
||||
d.notify("came back up")
|
||||
d.notify(MessageUp)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
changed := d.changed
|
||||
// If being down is a change, check to see if it's just a hiccup
|
||||
if d.changed {
|
||||
time.Sleep(time.Duration(5) * time.Second)
|
||||
|
||||
err2 := d.check()
|
||||
err2 := d.softcheck()
|
||||
if nil != err2 {
|
||||
// it's really down
|
||||
d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2)
|
||||
} else {
|
||||
d.notify("hiccuped")
|
||||
// it's not really down, so reset the change info
|
||||
d.changed = false
|
||||
d.status = StatusUp
|
||||
// and notify of the hiccup
|
||||
d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err)
|
||||
d.notify(MessageHiccup)
|
||||
return
|
||||
}
|
||||
|
||||
t := 10
|
||||
for {
|
||||
d.recover()
|
||||
time.Sleep(time.Duration(t) * time.Second)
|
||||
// backoff
|
||||
t *= 2
|
||||
err := d.check()
|
||||
if nil != err {
|
||||
d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err)
|
||||
}
|
||||
|
||||
// We should notify if
|
||||
// * The status has changed
|
||||
//
|
||||
// TODO what if the server is flip-flopping rapidly?
|
||||
// how to rate limit?
|
||||
// "{{ .Server }} is on cooldown for 30 minutes"
|
||||
if changed || d.changed {
|
||||
changed = false
|
||||
if StatusUp == d.status {
|
||||
d.notify("came back up")
|
||||
break
|
||||
} else {
|
||||
d.notify("went down")
|
||||
}
|
||||
|
||||
// * We've had success since the last notification
|
||||
// * It's been at least 5 minutes since the last notification
|
||||
//fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
|
||||
//if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) {
|
||||
//}
|
||||
//if !failure || d.failures >= 5 {
|
||||
// go back to the main 5-minute loop
|
||||
// break
|
||||
//}
|
||||
|
||||
t := 10
|
||||
for {
|
||||
// try to recover, then backoff exponentially
|
||||
d.recover()
|
||||
time.Sleep(time.Duration(t) * time.Second)
|
||||
t *= 2
|
||||
if t > 120 {
|
||||
t = 120
|
||||
}
|
||||
|
||||
err := d.softcheck()
|
||||
if nil != err {
|
||||
// this is down, and we know it's down
|
||||
d.status = StatusDown
|
||||
d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err)
|
||||
if d.changed {
|
||||
d.changed = false
|
||||
d.notify(MessageDown)
|
||||
}
|
||||
} else {
|
||||
// it came back up
|
||||
d.status = StatusUp
|
||||
d.Logger <- fmt.Sprintf("Up: '%s'", d.Name)
|
||||
if d.changed {
|
||||
// and the downtime was short - just a recovery
|
||||
d.notify(MessageHiccup)
|
||||
} else {
|
||||
// and the downtime was some time
|
||||
d.notify(MessageUp)
|
||||
}
|
||||
d.changed = false
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Dog) check() error {
|
||||
previousStatus := d.status
|
||||
|
||||
var err error
|
||||
defer func() {
|
||||
// Are we up, or down?
|
||||
if nil != err {
|
||||
d.status = StatusDown
|
||||
d.failures += 1
|
||||
d.lastFailed = time.Now()
|
||||
} else {
|
||||
d.status = StatusUp
|
||||
d.lastPassed = time.Now()
|
||||
d.passes += 1
|
||||
d.Logger <- fmt.Sprintf("Up: '%s'", d.Name)
|
||||
}
|
||||
|
||||
// Has that changed?
|
||||
if previousStatus != d.status {
|
||||
d.changed = true
|
||||
} else {
|
||||
d.changed = false
|
||||
}
|
||||
}()
|
||||
|
||||
func (d *Dog) softcheck() error {
|
||||
client := NewHTTPClient()
|
||||
response, err := client.Get(d.CheckURL)
|
||||
if nil != err {
|
||||
|
@ -192,6 +190,32 @@ func (d *Dog) check() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (d *Dog) hardcheck() error {
|
||||
previousStatus := d.status
|
||||
|
||||
err := d.softcheck()
|
||||
|
||||
// Are we up, or down?
|
||||
if nil != err {
|
||||
d.status = StatusDown
|
||||
//d.failures += 1
|
||||
//d.lastFailed = time.Now()
|
||||
} else {
|
||||
d.status = StatusUp
|
||||
//d.lastPassed = time.Now()
|
||||
//d.passes += 1
|
||||
}
|
||||
|
||||
// Has that changed?
|
||||
if previousStatus != d.status {
|
||||
d.changed = true
|
||||
} else {
|
||||
d.changed = false
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (d *Dog) recover() {
|
||||
if "" == d.Recover {
|
||||
return
|
||||
|
@ -221,7 +245,7 @@ func (d *Dog) recover() {
|
|||
|
||||
func (d *Dog) notify(msg string) {
|
||||
d.Logger <- fmt.Sprintf("Notifying the authorities of %s's status change", d.Name)
|
||||
d.lastNotified = time.Now()
|
||||
//d.lastNotified = time.Now()
|
||||
|
||||
for i := range d.Webhooks {
|
||||
name := d.Webhooks[i]
|
||||
|
|
Loading…
Reference in New Issue