package watchdog import ( "bytes" "context" "encoding/json" "fmt" "io/ioutil" "net" "net/http" "net/url" "os/exec" "strings" "time" ) type Status int const ( StatusDown Status = iota StatusUp ) func (s Status) String() string { // ... just wishing Go had enums like Rust... switch s { case StatusUp: return "up" case StatusDown: return "down" default: return "[[internal error]]" } } const ( MessageDown = "went down" MessageUp = "came back up" MessageHiccup = "hiccupped" ) type Dog struct { Watchdog string Name string CheckURL string Keywords string Badwords string Localizations map[string]string Recover string Webhooks []string AllWebhooks map[string]Webhook Logger chan string status Status changed bool error error //failures int //passes int //lastFailed time.Time //lastPassed time.Time //lastNotified time.Time } func New(d *Dog) *Dog { //d.lastPassed = time.Now().Add(-5 * time.Minute) d.status = StatusUp d.changed = false return d } func (d *Dog) Watch() { d.watch() for { // TODO set cancellable callback ? time.Sleep(5 * time.Minute) d.watch() } } // Now that I've added the ability to notify when a server is back up // this definitely needs some refactoring. It's bad now. func (d *Dog) watch() { d.Logger <- fmt.Sprintf("Check: '%s'", d.Name) // This may be up or down err := d.hardcheck() if nil == err { d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) // if it's down, coming up, notify if d.changed { d.notify(MessageUp) } return } // If being down is a change, check to see if it's just a hiccup if d.changed { time.Sleep(time.Duration(5) * time.Second) err2 := d.softcheck() if nil != err2 { // it's really down d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2) } else { // it's not really down, so reset the change info d.changed = false d.status = StatusUp // and notify of the hiccup d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err) d.notify(MessageHiccup) return } } // TODO what if the server is flip-flopping rapidly? // how to rate limit? // "{{ .Server }} is on cooldown for 30 minutes" // * We've had success since the last notification // * It's been at least 5 minutes since the last notification //fiveMinutesAgo := time.Now().Add(-5 * time.Minute) //if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) { //} t := 10 for { // try to recover, then backoff exponentially d.recover() time.Sleep(time.Duration(t) * time.Second) t *= 2 if t > 120 { t = 120 } err := d.softcheck() if nil != err { // this is down, and we know it's down d.status = StatusDown d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err) if d.changed { d.changed = false d.notify(MessageDown) } } else { // it came back up d.status = StatusUp d.Logger <- fmt.Sprintf("Up: '%s'", d.Name) if d.changed { // and the downtime was short - just a recovery d.notify(MessageHiccup) } else { // and the downtime was some time d.notify(MessageUp) } d.changed = false break } } } func (d *Dog) softcheck() error { client := NewHTTPClient() response, err := client.Get(d.CheckURL) if nil != err { d.error = fmt.Errorf("Connection Failure: " + err.Error()) return err } b, err := ioutil.ReadAll(response.Body) if nil != err { d.error = fmt.Errorf("Network Failure: " + err.Error()) return err } // Note: empty matches empty as true, so this works for checking redirects if !bytes.Contains(b, []byte(d.Keywords)) { err = fmt.Errorf("Down: '%s' Not Found for '%s'", d.Keywords, d.Name) d.Logger <- fmt.Sprintf("%s", err) d.error = err return err } if "" != d.Badwords { if bytes.Contains(b, []byte(d.Badwords)) { err = fmt.Errorf("Down: '%s' Found for '%s'", d.Badwords, d.Name) d.Logger <- fmt.Sprintf("%s", err) d.error = err return err } } return nil } func (d *Dog) hardcheck() error { previousStatus := d.status err := d.softcheck() // Are we up, or down? if nil != err { d.status = StatusDown //d.failures += 1 //d.lastFailed = time.Now() } else { d.status = StatusUp //d.lastPassed = time.Now() //d.passes += 1 } // Has that changed? if previousStatus != d.status { d.changed = true } else { d.changed = false } return err } func (d *Dog) recover() { if "" == d.Recover { return } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) cmd := exec.CommandContext(ctx, "bash") pipe, err := cmd.StdinPipe() pipe.Write([]byte(d.Recover)) if nil != err { d.Logger <- fmt.Sprintf("[Recover] Could not write to bash '%s': %s", d.Recover, err) } err = cmd.Start() if nil != err { d.Logger <- fmt.Sprintf("[Recover] Could not start '%s': %s", d.Recover, err) } err = pipe.Close() if nil != err { d.Logger <- fmt.Sprintf("[Recover] Could not close '%s': %s", d.Recover, err) } err = cmd.Wait() cancel() if nil != err { d.Logger <- fmt.Sprintf("[Recover] '%s' failed for '%s': %s", d.Recover, d.Name, err) } } func (d *Dog) notify(msg string) { d.Logger <- fmt.Sprintf("Notifying the authorities of %s's status change", d.Name) //d.lastNotified = time.Now() for i := range d.Webhooks { name := d.Webhooks[i] if "" == name { continue } h, ok := d.AllWebhooks[name] if !ok { // TODO check in main when config is read d.Webhooks[i] = "" d.Logger <- fmt.Sprintf("[Warning] Could not find webhook '%s' for '%s'", name, h.Name) continue } d.notifyOne(h, msg) } } func (d *Dog) notifyOne(h Webhook, msg string) { // TODO do this in main on config init if "" == h.Method { h.Method = "POST" } var body *strings.Reader var err error // TODO real templates if 0 != len(h.Form) { form := url.Values{} for k := range h.Form { v := h.Form[k] // because `{{` gets urlencoded //k = strings.Replace(k, "{{ .Name }}", d.Name, -1) v = strings.Replace(v, "{{ .Watchdog }}", d.Watchdog, -1) v = strings.Replace(v, "{{ .Name }}", d.Name, -1) v = strings.Replace(v, "{{ .Status }}", d.localize(d.status.String()), -1) v = strings.Replace(v, "{{ .Message }}", d.localize(msg), -1) d.Logger <- fmt.Sprintf("[HEADER] %s: %s", k, v) form.Set(k, v) } body = strings.NewReader(form.Encode()) } else if 0 != len(h.JSON) { bodyBuf, err := json.Marshal(h.JSON) if nil != err { d.Logger <- fmt.Sprintf("[Notify] JSON Marshal Error for '%s': %s", h.Name, err) return } // `{{` should be left alone v := string(bodyBuf) v = strings.Replace(v, "{{ .Watchdog }}", d.Watchdog, -1) v = strings.Replace(v, "{{ .Name }}", d.Name, -1) v = strings.Replace(v, "{{ .Status }}", d.localize(d.status.String()), -1) v = strings.Replace(v, "{{ .Message }}", d.localize(msg), -1) body = strings.NewReader(v) } client := NewHTTPClient() req, err := http.NewRequest(h.Method, h.URL, body) if nil != err { d.Logger <- fmt.Sprintf("[Notify] HTTP Client Network Error for '%s': %s", h.Name, err) return } if 0 != len(h.Form) { req.Header.Set("Content-Type", "application/x-www-form-urlencoded") } else if 0 != len(h.JSON) { req.Header.Set("Content-Type", "application/json") } if 0 != len(h.Auth) { user := h.Auth["user"] if "" == user { user = h.Auth["username"] } pass := h.Auth["pass"] if "" == user { pass = h.Auth["password"] } req.SetBasicAuth(user, pass) } req.Header.Set("User-Agent", "Watchdog/1.0") for k := range h.Headers { req.Header.Set(k, h.Headers[k]) } resp, err := client.Do(req) if nil != err { d.Logger <- fmt.Sprintf("[Notify] HTTP Client Error for '%s': %s", h.Name, err) return } if !(resp.StatusCode >= 200 && resp.StatusCode < 300) { d.Logger <- fmt.Sprintf("[Notify] Response Error for '%s': %s", h.Name, resp.Status) return } // TODO json vs xml vs txt var data map[string]interface{} req.Header.Add("Accept", "application/json") decoder := json.NewDecoder(resp.Body) err = decoder.Decode(&data) if err != nil { d.Logger <- fmt.Sprintf("[Notify] Response Body Error for '%s': %s", h.Name, resp.Status) return } // TODO some sort of way to determine if data is successful (keywords) d.Logger <- fmt.Sprintf("[Notify] Success? %#v", data) } func (d *Dog) localize(msg string) string { for k := range d.Localizations { if k == msg { return d.Localizations[k] } } return msg } type Config struct { Watchdog string `json:"watchdog"` Watches []ConfigWatch `json:"watches"` Webhooks []Webhook `json:"webhooks"` Localizations map[string]string `json:"localizations"` } type ConfigWatch struct { Name string `json:"name"` URL string `json:"url"` Keywords string `json:"keywords"` Badwords string `json:"badwords"` Webhooks []string `json:"webhooks"` RecoverScript string `json:"recover_script"` } type Webhook struct { Name string `json:"name"` Method string `json:"method"` URL string `json:"url"` Auth map[string]string `json:"auth"` Headers map[string]string `json:"headers"` Form map[string]string `json:"form"` JSON map[string]string `json:"json"` Config map[string]string `json:"config"` Configs []map[string]string `json:"configs"` } // The default http client uses unsafe defaults func NewHTTPClient() *http.Client { transport := &http.Transport{ Dial: (&net.Dialer{ Timeout: 10 * time.Second, }).Dial, TLSHandshakeTimeout: 5 * time.Second, } client := &http.Client{ Timeout: time.Second * 5, Transport: transport, } return client }