summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: Irbe Krumina <irbe@tailscale.com> 2024-11-26 12:30:55 +0000
committer: Irbe Krumina <irbe@tailscale.com> 2024-11-26 14:19:34 +0000
commit: 7c13973bc9bda0529228327b6771b7f3b952fe99 (patch)
tree: 8e10c8c8961caad12b3af6d5d6a9fa7110021593
parent: 4d33f30f91eb7debdf90c8770990801f3857e30c (diff)
downloadtailscale-irbekrm/containerboot_healthz.tar.xz
tailscale-irbekrm/containerboot_healthz.zip
cmd/containerboot: fix healthcheck (branch: irbekrm/containerboot_healthz)
The current container healthcheck is not able to catch cases where a previously healthy node gets disconnected from control. This is because we rely on updates received over the notify channel to update the health status, and no notifications are sent on 'Online' status changes. This change makes the healthcheck endpoint call the LocalAPI /status endpoint instead.

Updates tailscale/tailscale#13620

Signed-off-by: Irbe Krumina <irbe@tailscale.com>
-rw-r--r--cmd/containerboot/healthz.go48
-rw-r--r--cmd/containerboot/main.go14
2 files changed, 40 insertions, 22 deletions
diff --git a/cmd/containerboot/healthz.go b/cmd/containerboot/healthz.go
index 12e7ee9f8..a62f065d9 100644
--- a/cmd/containerboot/healthz.go
+++ b/cmd/containerboot/healthz.go
@@ -6,34 +6,45 @@
package main
import (
+ "context"
+ "fmt"
"log"
"net"
"net/http"
- "sync"
+ "time"
+
+ "tailscale.com/client/tailscale"
+ "tailscale.com/ipn/ipnstate"
)
-// healthz is a simple health check server, if enabled it returns 200 OK if
-// this tailscale node currently has at least one tailnet IP address else
-// returns 503.
+// healthz is a simple health check server, if enabled it returns 200 OK if this tailscale device can be considered
+// healthy (running, connected to control plane, has tailnet IPs) else returns 503.
type healthz struct {
- sync.Mutex
- hasAddrs bool
+ lc *tailscale.LocalClient
}
func (h *healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) {
- h.Lock()
- defer h.Unlock()
- if h.hasAddrs {
+ // Most health checks will have their own timeout, but a local client call should not take more than 5s.
+ ctx, cancel := context.WithTimeout(r.Context(), time.Second*5)
+ defer cancel()
+ st, err := h.lc.StatusWithoutPeers(ctx)
+ if err != nil {
+ http.Error(w, fmt.Sprintf("unable to check status of the tailscale device: %v", err), http.StatusServiceUnavailable)
+ return
+ }
+ online := isOnline(st)
+ addrs := getAddrs(st)
+ if st.BackendState == "Running" && online && len(addrs) != 0 {
w.Write([]byte("ok"))
} else {
- http.Error(w, "node currently has no tailscale IPs", http.StatusInternalServerError)
+ log.Printf("healthz: tailscale device is not ready, state: %q, online: %t, addrs: %v", st.BackendState, online, addrs)
+ http.Error(w, "tailscale device is not ready", http.StatusServiceUnavailable)
}
}
// runHealthz runs a simple HTTP health endpoint on /healthz, listening on the
-// provided address. A containerized tailscale instance is considered healthy if
-// it has at least one tailnet IP address.
-func runHealthz(addr string, h *healthz) {
+// provided address.
+func (h *healthz) run(addr string) {
lis, err := net.Listen("tcp", addr)
if err != nil {
log.Fatalf("error listening on the provided health endpoint address %q: %v", addr, err)
@@ -49,3 +60,14 @@ func runHealthz(addr string, h *healthz) {
}
}()
}
+
+func isOnline(st *ipnstate.Status) bool {
+ return st != nil && st.Self != nil && st.Self.Online
+}
+
+func getAddrs(st *ipnstate.Status) (addrs []string) {
+ if st == nil || st.Self == nil {
+ return
+ }
+ return st.Self.Addrs
+}
diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go
index 313e8deb0..abd2bdd24 100644
--- a/cmd/containerboot/main.go
+++ b/cmd/containerboot/main.go
@@ -328,10 +328,12 @@ authLoop:
certDomain = new(atomic.Pointer[string])
certDomainChanged = make(chan bool, 1)
-
- h = &healthz{} // http server for the healthz endpoint
- healthzRunner = sync.OnceFunc(func() { runHealthz(cfg.HealthCheckAddrPort, h) })
)
+
+ if cfg.HealthCheckAddrPort != "" {
+ h := &healthz{lc: client}
+ h.run(cfg.HealthCheckAddrPort)
+ }
if cfg.ServeConfigPath != "" {
go watchServeConfigChanges(ctx, cfg.ServeConfigPath, certDomainChanged, certDomain, client)
}
@@ -556,12 +558,6 @@ runLoop:
}
}
- if cfg.HealthCheckAddrPort != "" {
- h.Lock()
- h.hasAddrs = len(addrs) != 0
- h.Unlock()
- healthzRunner()
- }
if egressSvcsNotify != nil {
egressSvcsNotify <- n
}