Compare commits

..

No commits in common. "5a0382e8a3e5ac3c4ea17b20a29411b50443bb69" and "a64475213370e5bf2b5a0fc003af5d9aaae4ae47" have entirely different histories.

View File

@ -33,12 +33,6 @@ func (s Status) String() string {
} }
} }
// Status-change phrases handed to (*Dog).notify.
const (
	// MessageUp is sent when a check passes after the status had changed.
	MessageUp = "came back up"
	// MessageHiccup is sent when a failure clears again on a follow-up check.
	MessageHiccup = "hiccupped"
	// MessageDown is sent when a follow-up check still fails.
	MessageDown = "went down"
)
type Dog struct { type Dog struct {
Watchdog string Watchdog string
Name string Name string
@ -53,15 +47,15 @@ type Dog struct {
status Status status Status
changed bool changed bool
error error error error
//failures int failures int
//passes int passes int
//lastFailed time.Time lastFailed time.Time
//lastPassed time.Time lastPassed time.Time
//lastNotified time.Time lastNotified time.Time
} }
func New(d *Dog) *Dog { func New(d *Dog) *Dog {
//d.lastPassed = time.Now().Add(-5 * time.Minute) d.lastPassed = time.Now().Add(-5 * time.Minute)
d.status = StatusUp d.status = StatusUp
d.changed = false d.changed = false
return d return d
@ -76,87 +70,88 @@ func (d *Dog) Watch() {
} }
} }
// Now that I've added the ability to notify when a server is back up
// this definitely needs some refactoring. It's bad now.
func (d *Dog) watch() { func (d *Dog) watch() {
d.Logger <- fmt.Sprintf("Check: '%s'", d.Name) d.Logger <- fmt.Sprintf("Check: '%s'", d.Name)
// This may be up or down err := d.check()
err := d.hardcheck()
if nil == err { if nil == err {
d.Logger <- fmt.Sprintf("Up: '%s'", d.Name)
// if it's down, coming up, notify
if d.changed { if d.changed {
d.notify(MessageUp) d.notify("came back up")
} }
return return
} }
// If being down is a change, check to see if it's just a hiccup time.Sleep(time.Duration(5) * time.Second)
if d.changed {
time.Sleep(time.Duration(5) * time.Second) err2 := d.check()
err2 := d.softcheck() if nil != err2 {
if nil != err2 { d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2)
// it's really down } else {
d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2) d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err)
} else { return
// it's not really down, so reset the change info
d.changed = false
d.status = StatusUp
// and notify of the hiccup
d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err)
d.notify(MessageHiccup)
return
}
} }
// TODO what if the server is flip-flopping rapidly?
// how to rate limit?
// "{{ .Server }} is on cooldown for 30 minutes"
// * We've had success since the last notification
// * It's been at least 5 minutes since the last notification
//fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
//if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) {
//}
t := 10 t := 10
for { for {
// try to recover, then backoff exponentially
d.recover() d.recover()
time.Sleep(time.Duration(t) * time.Second) time.Sleep(time.Duration(t) * time.Second)
// backoff
t *= 2 t *= 2
if t > 120 { err := d.check()
t = 120 if nil != err {
d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err)
} }
err := d.softcheck() // We should notify if
if nil != err { // * The status has changed
// this is down, and we know it's down //
d.status = StatusDown // TODO what if the server is flip-flopping rapidly?
d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err) // how to rate limit?
if d.changed { // "{{ .Server }} is on cooldown for 30 minutes"
d.changed = false if d.changed {
d.notify(MessageDown) d.notify("went down")
if StatusUp == d.status {
break
} }
} else {
// it came back up // * We've had success since the last notification
d.status = StatusUp // * It's been at least 5 minutes since the last notification
d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) //fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
if d.changed { //if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) {
// and the downtime was short - just a recovery //}
d.notify(MessageHiccup) //if !failure || d.failures >= 5 {
} else { // go back to the main 5-minute loop
// and the downtime was some time // break
d.notify(MessageUp) //}
}
d.changed = false
break
} }
} }
} }
func (d *Dog) softcheck() error { func (d *Dog) check() error {
previousStatus := d.status
var err error
defer func() {
// Are we up, or down?
if nil != err {
d.status = StatusDown
d.failures += 1
d.lastFailed = time.Now()
} else {
d.status = StatusUp
d.lastPassed = time.Now()
d.passes += 1
d.Logger <- fmt.Sprintf("Up: '%s'", d.Name)
}
// Has that changed?
if previousStatus != d.status {
d.changed = true
} else {
d.changed = false
}
}()
client := NewHTTPClient() client := NewHTTPClient()
response, err := client.Get(d.CheckURL) response, err := client.Get(d.CheckURL)
if nil != err { if nil != err {
@ -179,7 +174,7 @@ func (d *Dog) softcheck() error {
} }
if "" != d.Badwords { if "" != d.Badwords {
if bytes.Contains(b, []byte(d.Badwords)) { if !bytes.Contains(b, []byte(d.Badwords)) {
err = fmt.Errorf("Down: '%s' Found for '%s'", d.Badwords, d.Name) err = fmt.Errorf("Down: '%s' Found for '%s'", d.Badwords, d.Name)
d.Logger <- fmt.Sprintf("%s", err) d.Logger <- fmt.Sprintf("%s", err)
d.error = err d.error = err
@ -190,32 +185,6 @@ func (d *Dog) softcheck() error {
return nil return nil
} }
// hardcheck runs softcheck once and records the outcome on the Dog.
//
// status is set to StatusUp when softcheck returns nil and to StatusDown
// otherwise; changed is set to whether status now differs from the value it
// held before the check. The error from softcheck is returned as-is.
//
// Pass/failure counters and timestamps are not maintained here (that
// bookkeeping is disabled in this revision).
func (d *Dog) hardcheck() error {
	before := d.status

	checkErr := d.softcheck()
	if checkErr == nil {
		d.status = StatusUp
	} else {
		d.status = StatusDown
	}

	// A transition in either direction counts as a change.
	d.changed = d.status != before

	return checkErr
}
func (d *Dog) recover() { func (d *Dog) recover() {
if "" == d.Recover { if "" == d.Recover {
return return
@ -245,7 +214,7 @@ func (d *Dog) recover() {
func (d *Dog) notify(msg string) { func (d *Dog) notify(msg string) {
d.Logger <- fmt.Sprintf("Notifying the authorities of %s's status change", d.Name) d.Logger <- fmt.Sprintf("Notifying the authorities of %s's status change", d.Name)
//d.lastNotified = time.Now() d.lastNotified = time.Now()
for i := range d.Webhooks { for i := range d.Webhooks {
name := d.Webhooks[i] name := d.Webhooks[i]