411 lines
		
	
	
		
			9.6 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			411 lines
		
	
	
		
			9.6 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package watchdog
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| 	"context"
 | |
| 	"encoding/json"
 | |
| 	"fmt"
 | |
| 	"io/ioutil"
 | |
| 	"net"
 | |
| 	"net/http"
 | |
| 	"net/url"
 | |
| 	"os/exec"
 | |
| 	"strings"
 | |
| 	"time"
 | |
| )
 | |
| 
 | |
// Status is the availability state recorded for a watched service.
type Status int

// Possible Status values. StatusDown is the zero value.
const (
	// StatusDown marks a service whose most recent check failed.
	StatusDown Status = iota
	// StatusUp marks a service whose most recent check passed.
	StatusUp
)

// String returns "up" or "down" for the known states and
// "[[internal error]]" for any other value.
func (s Status) String() string {
	if s == StatusUp {
		return "up"
	}
	if s == StatusDown {
		return "down"
	}
	return "[[internal error]]"
}
 | |
| 
 | |
// Notification messages passed to notify. Each one is run through
// localize before it replaces "{{ .Message }}" in a webhook template.
const (
	// MessageDown is sent once a failure survives a recovery attempt.
	MessageDown = "went down"
	// MessageUp is sent when a service that was reported down passes again.
	MessageUp = "came back up"
	// MessageHiccup is sent when a failure clears before being reported down.
	MessageHiccup = "hiccupped"
)
 | |
| 
 | |
| type Dog struct {
 | |
| 	Watchdog      string
 | |
| 	Name          string
 | |
| 	CheckURL      string
 | |
| 	Keywords      string
 | |
| 	Badwords      string
 | |
| 	Localizations map[string]string
 | |
| 	Recover       string
 | |
| 	Webhooks      []string
 | |
| 	AllWebhooks   map[string]Webhook
 | |
| 	Logger        chan string
 | |
| 	status        Status
 | |
| 	changed       bool
 | |
| 	error         error
 | |
| 	//failures      int
 | |
| 	//passes        int
 | |
| 	//lastFailed    time.Time
 | |
| 	//lastPassed    time.Time
 | |
| 	//lastNotified time.Time
 | |
| }
 | |
| 
 | |
| func New(d *Dog) *Dog {
 | |
| 	//d.lastPassed = time.Now().Add(-5 * time.Minute)
 | |
| 	d.status = StatusUp
 | |
| 	d.changed = false
 | |
| 	return d
 | |
| }
 | |
| 
 | |
| func (d *Dog) Watch() {
 | |
| 	d.watch()
 | |
| 	for {
 | |
| 		// TODO set cancellable callback ?
 | |
| 		time.Sleep(5 * time.Minute)
 | |
| 		d.watch()
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Now that I've added the ability to notify when a server is back up
 | |
| // this definitely needs some refactoring. It's bad now.
 | |
| func (d *Dog) watch() {
 | |
| 	d.Logger <- fmt.Sprintf("Check: '%s'", d.Name)
 | |
| 
 | |
| 	// This may be up or down
 | |
| 	err := d.hardcheck()
 | |
| 	if nil == err {
 | |
| 		d.Logger <- fmt.Sprintf("Up: '%s'", d.Name)
 | |
| 		// if it's down, coming up, notify
 | |
| 		if d.changed {
 | |
| 			d.notify(MessageUp)
 | |
| 		}
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// If being down is a change, check to see if it's just a hiccup
 | |
| 	if d.changed {
 | |
| 		time.Sleep(time.Duration(5) * time.Second)
 | |
| 		err2 := d.softcheck()
 | |
| 		if nil != err2 {
 | |
| 			// it's really down
 | |
| 			d.Logger <- fmt.Sprintf("Down: '%s': %s", d.Name, err2)
 | |
| 		} else {
 | |
| 			// it's not really down, so reset the change info
 | |
| 			d.changed = false
 | |
| 			d.status = StatusUp
 | |
| 			// and notify of the hiccup
 | |
| 			d.Logger <- fmt.Sprintf("Hiccup: '%s': %s", d.Name, err)
 | |
| 			d.notify(MessageHiccup)
 | |
| 			return
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// TODO what if the server is flip-flopping rapidly?
 | |
| 	// how to rate limit?
 | |
| 	// "{{ .Server }} is on cooldown for 30 minutes"
 | |
| 
 | |
| 	// * We've had success since the last notification
 | |
| 	// * It's been at least 5 minutes since the last notification
 | |
| 	//fiveMinutesAgo := time.Now().Add(-5 * time.Minute)
 | |
| 	//if d.lastPassed.After(d.lastNotified) && d.lastNotified.Before(fiveMinutesAgo) {
 | |
| 	//}
 | |
| 
 | |
| 	t := 10
 | |
| 	for {
 | |
| 		// try to recover, then backoff exponentially
 | |
| 		d.recover()
 | |
| 		time.Sleep(time.Duration(t) * time.Second)
 | |
| 		t *= 2
 | |
| 		if t > 120 {
 | |
| 			t = 120
 | |
| 		}
 | |
| 
 | |
| 		err := d.softcheck()
 | |
| 		if nil != err {
 | |
| 			// this is down, and we know it's down
 | |
| 			d.status = StatusDown
 | |
| 			d.Logger <- fmt.Sprintf("Unrecoverable: '%s': %s", d.Name, err)
 | |
| 			if d.changed {
 | |
| 				d.changed = false
 | |
| 				d.notify(MessageDown)
 | |
| 			}
 | |
| 		} else {
 | |
| 			// it came back up
 | |
| 			d.status = StatusUp
 | |
| 			d.Logger <- fmt.Sprintf("Up: '%s'", d.Name)
 | |
| 			if d.changed {
 | |
| 				// and the downtime was short - just a recovery
 | |
| 				d.notify(MessageHiccup)
 | |
| 			} else {
 | |
| 				// and the downtime was some time
 | |
| 				d.notify(MessageUp)
 | |
| 			}
 | |
| 			d.changed = false
 | |
| 			break
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (d *Dog) softcheck() error {
 | |
| 	client := NewHTTPClient()
 | |
| 	response, err := client.Get(d.CheckURL)
 | |
| 	if nil != err {
 | |
| 		d.error = fmt.Errorf("Connection Failure: " + err.Error())
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	b, err := ioutil.ReadAll(response.Body)
 | |
| 	if nil != err {
 | |
| 		d.error = fmt.Errorf("Network Failure: " + err.Error())
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	// Note: empty matches empty as true, so this works for checking redirects
 | |
| 	if !bytes.Contains(b, []byte(d.Keywords)) {
 | |
| 		err = fmt.Errorf("Down: '%s' Not Found for '%s'", d.Keywords, d.Name)
 | |
| 		d.Logger <- fmt.Sprintf("%s", err)
 | |
| 		d.error = err
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	if "" != d.Badwords {
 | |
| 		if bytes.Contains(b, []byte(d.Badwords)) {
 | |
| 			err = fmt.Errorf("Down: '%s' Found for '%s'", d.Badwords, d.Name)
 | |
| 			d.Logger <- fmt.Sprintf("%s", err)
 | |
| 			d.error = err
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (d *Dog) hardcheck() error {
 | |
| 	previousStatus := d.status
 | |
| 
 | |
| 	err := d.softcheck()
 | |
| 
 | |
| 	// Are we up, or down?
 | |
| 	if nil != err {
 | |
| 		d.status = StatusDown
 | |
| 		//d.failures += 1
 | |
| 		//d.lastFailed = time.Now()
 | |
| 	} else {
 | |
| 		d.status = StatusUp
 | |
| 		//d.lastPassed = time.Now()
 | |
| 		//d.passes += 1
 | |
| 	}
 | |
| 
 | |
| 	// Has that changed?
 | |
| 	if previousStatus != d.status {
 | |
| 		d.changed = true
 | |
| 	} else {
 | |
| 		d.changed = false
 | |
| 	}
 | |
| 
 | |
| 	return err
 | |
| }
 | |
| 
 | |
| func (d *Dog) recover() {
 | |
| 	if "" == d.Recover {
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 | |
| 	cmd := exec.CommandContext(ctx, "bash")
 | |
| 	pipe, err := cmd.StdinPipe()
 | |
| 	pipe.Write([]byte(d.Recover))
 | |
| 	if nil != err {
 | |
| 		d.Logger <- fmt.Sprintf("[Recover] Could not write to bash '%s': %s", d.Recover, err)
 | |
| 	}
 | |
| 	err = cmd.Start()
 | |
| 	if nil != err {
 | |
| 		d.Logger <- fmt.Sprintf("[Recover] Could not start '%s': %s", d.Recover, err)
 | |
| 	}
 | |
| 	err = pipe.Close()
 | |
| 	if nil != err {
 | |
| 		d.Logger <- fmt.Sprintf("[Recover] Could not close '%s': %s", d.Recover, err)
 | |
| 	}
 | |
| 	err = cmd.Wait()
 | |
| 	cancel()
 | |
| 	if nil != err {
 | |
| 		d.Logger <- fmt.Sprintf("[Recover] '%s' failed for '%s': %s", d.Recover, d.Name, err)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (d *Dog) notify(msg string) {
 | |
| 	d.Logger <- fmt.Sprintf("Notifying the authorities of %s's status change", d.Name)
 | |
| 	//d.lastNotified = time.Now()
 | |
| 
 | |
| 	for i := range d.Webhooks {
 | |
| 		name := d.Webhooks[i]
 | |
| 		if "" == name {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		h, ok := d.AllWebhooks[name]
 | |
| 		if !ok {
 | |
| 			// TODO check in main when config is read
 | |
| 			d.Webhooks[i] = ""
 | |
| 			d.Logger <- fmt.Sprintf("[Warning] Could not find webhook '%s' for '%s'", name, h.Name)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		d.notifyOne(h, msg)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (d *Dog) notifyOne(h Webhook, msg string) {
 | |
| 	// TODO do this in main on config init
 | |
| 	if "" == h.Method {
 | |
| 		h.Method = "POST"
 | |
| 	}
 | |
| 
 | |
| 	var body *strings.Reader
 | |
| 	var err error
 | |
| 	// TODO real templates
 | |
| 	if 0 != len(h.Form) {
 | |
| 		form := url.Values{}
 | |
| 		for k := range h.Form {
 | |
| 			v := h.Form[k]
 | |
| 			// because `{{` gets urlencoded
 | |
| 			//k = strings.Replace(k, "{{ .Name }}", d.Name, -1)
 | |
| 			v = strings.Replace(v, "{{ .Watchdog }}", d.Watchdog, -1)
 | |
| 			v = strings.Replace(v, "{{ .Name }}", d.Name, -1)
 | |
| 			v = strings.Replace(v, "{{ .Status }}", d.localize(d.status.String()), -1)
 | |
| 			v = strings.Replace(v, "{{ .Message }}", d.localize(msg), -1)
 | |
| 			d.Logger <- fmt.Sprintf("[HEADER] %s: %s", k, v)
 | |
| 			form.Set(k, v)
 | |
| 		}
 | |
| 		body = strings.NewReader(form.Encode())
 | |
| 	} else if 0 != len(h.JSON) {
 | |
| 		bodyBuf, err := json.Marshal(h.JSON)
 | |
| 		if nil != err {
 | |
| 			d.Logger <- fmt.Sprintf("[Notify] JSON Marshal Error for '%s': %s", h.Name, err)
 | |
| 			return
 | |
| 		}
 | |
| 		// `{{` should be left alone
 | |
| 		v := string(bodyBuf)
 | |
| 		v = strings.Replace(v, "{{ .Watchdog }}", d.Watchdog, -1)
 | |
| 		v = strings.Replace(v, "{{ .Name }}", d.Name, -1)
 | |
| 		v = strings.Replace(v, "{{ .Status }}", d.localize(d.status.String()), -1)
 | |
| 		v = strings.Replace(v, "{{ .Message }}", d.localize(msg), -1)
 | |
| 		body = strings.NewReader(v)
 | |
| 	}
 | |
| 
 | |
| 	client := NewHTTPClient()
 | |
| 	req, err := http.NewRequest(h.Method, h.URL, body)
 | |
| 	if nil != err {
 | |
| 		d.Logger <- fmt.Sprintf("[Notify] HTTP Client Network Error for '%s': %s", h.Name, err)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	if 0 != len(h.Form) {
 | |
| 		req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
 | |
| 	} else if 0 != len(h.JSON) {
 | |
| 		req.Header.Set("Content-Type", "application/json")
 | |
| 	}
 | |
| 
 | |
| 	if 0 != len(h.Auth) {
 | |
| 		user := h.Auth["user"]
 | |
| 		if "" == user {
 | |
| 			user = h.Auth["username"]
 | |
| 		}
 | |
| 		pass := h.Auth["pass"]
 | |
| 		if "" == user {
 | |
| 			pass = h.Auth["password"]
 | |
| 		}
 | |
| 		req.SetBasicAuth(user, pass)
 | |
| 	}
 | |
| 
 | |
| 	req.Header.Set("User-Agent", "Watchdog/1.0")
 | |
| 	for k := range h.Headers {
 | |
| 		req.Header.Set(k, h.Headers[k])
 | |
| 	}
 | |
| 
 | |
| 	resp, err := client.Do(req)
 | |
| 	if nil != err {
 | |
| 		d.Logger <- fmt.Sprintf("[Notify] HTTP Client Error for '%s': %s", h.Name, err)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	if !(resp.StatusCode >= 200 && resp.StatusCode < 300) {
 | |
| 		d.Logger <- fmt.Sprintf("[Notify] Response Error for '%s': %s", h.Name, resp.Status)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// TODO json vs xml vs txt
 | |
| 	var data map[string]interface{}
 | |
| 	req.Header.Add("Accept", "application/json")
 | |
| 	decoder := json.NewDecoder(resp.Body)
 | |
| 	err = decoder.Decode(&data)
 | |
| 	if err != nil {
 | |
| 		d.Logger <- fmt.Sprintf("[Notify] Response Body Error for '%s': %s", h.Name, resp.Status)
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	// TODO some sort of way to determine if data is successful (keywords)
 | |
| 	d.Logger <- fmt.Sprintf("[Notify] Success? %#v", data)
 | |
| }
 | |
| func (d *Dog) localize(msg string) string {
 | |
| 	for k := range d.Localizations {
 | |
| 		if k == msg {
 | |
| 			return d.Localizations[k]
 | |
| 		}
 | |
| 	}
 | |
| 	return msg
 | |
| }
 | |
| 
 | |
| type Config struct {
 | |
| 	Watchdog      string            `json:"watchdog"`
 | |
| 	Watches       []ConfigWatch     `json:"watches"`
 | |
| 	Webhooks      []Webhook         `json:"webhooks"`
 | |
| 	Localizations map[string]string `json:"localizations"`
 | |
| }
 | |
| 
 | |
// ConfigWatch is the JSON shape of one entry under "watches".
// NOTE(review): presumably copied field by field into a Dog by the config
// loader, which is not visible here.
type ConfigWatch struct {
	// Name is read from "name".
	Name string `json:"name"`

	// URL is read from "url".
	URL string `json:"url"`

	// Keywords is read from "keywords".
	Keywords string `json:"keywords"`

	// Badwords is read from "badwords".
	Badwords string `json:"badwords"`

	// Webhooks is the list of webhook names read from "webhooks".
	Webhooks []string `json:"webhooks"`

	// RecoverScript is read from "recover_script".
	RecoverScript string `json:"recover_script"`
}
 | |
| 
 | |
// Webhook describes one HTTP notification target.
type Webhook struct {
	// Name identifies the webhook; Dog.Webhooks refers to it by this name.
	Name string `json:"name"`

	// Method is the HTTP method. notifyOne uses "POST" when it is empty.
	Method string `json:"method"`

	// URL is the address the notification request is sent to.
	URL string `json:"url"`

	// Auth supplies basic-auth credentials under the keys "user" or
	// "username" and "pass" or "password". Empty means no authentication.
	Auth map[string]string `json:"auth"`

	// Headers are extra request headers, applied after the built-in ones.
	Headers map[string]string `json:"headers"`

	// Form, when non-empty, is sent as a URL-encoded body after placeholder
	// substitution in its values. It takes precedence over JSON.
	Form map[string]string `json:"form"`

	// JSON, when non-empty and Form is empty, is sent as a JSON object body
	// after placeholder substitution.
	JSON map[string]string `json:"json"`

	// Config is read from "config".
	// NOTE(review): not referenced by the code in this file section.
	Config map[string]string `json:"config"`

	// Configs is read from "configs".
	// NOTE(review): not referenced by the code in this file section.
	Configs []map[string]string `json:"configs"`
}
 | |
| 
 | |
// NewHTTPClient returns a fresh *http.Client with explicit timeouts, since
// the default http client uses unsafe defaults (it has no timeouts at all).
//
// Limits: 10s to establish a connection, 5s for the TLS handshake and 5s
// for a whole request including reading the body. Idle keep-alive
// connections are dropped after 30s so the per-call transports created here
// do not hold sockets open indefinitely.
func NewHTTPClient() *http.Client {
	dialer := &net.Dialer{
		Timeout: 10 * time.Second,
	}
	transport := &http.Transport{
		// DialContext replaces the deprecated Dial hook and lets the
		// client's overall Timeout cancel a dial that is in progress.
		DialContext:         dialer.DialContext,
		TLSHandshakeTimeout: 5 * time.Second,
		IdleConnTimeout:     30 * time.Second,
	}
	return &http.Client{
		Timeout:   5 * time.Second,
		Transport: transport,
	}
}
 |