diff options
Diffstat (limited to 'cmd/containerboot')
| -rw-r--r-- | cmd/containerboot/certs.go | 146 | ||||
| -rw-r--r-- | cmd/containerboot/main.go | 2 | ||||
| -rw-r--r-- | cmd/containerboot/serve.go | 24 | ||||
| -rw-r--r-- | cmd/containerboot/settings.go | 12 | ||||
| -rw-r--r-- | cmd/containerboot/tailscaled.go | 3 |
5 files changed, 182 insertions, 5 deletions
diff --git a/cmd/containerboot/certs.go b/cmd/containerboot/certs.go new file mode 100644 index 000000000..7d0ddce90 --- /dev/null +++ b/cmd/containerboot/certs.go @@ -0,0 +1,146 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +//go:build linux + +package main + +import ( + "context" + "log" + "net" + "sync" + "time" + + "tailscale.com/ipn" + "tailscale.com/util/goroutines" +) + +// TODO: +// - add logic to stop all the goroutines (on SIGTERM) +// - add unit tests +// certManager is responsible for issuing certificates for known domains and for +// maintaining a loop that re-attempts issuance daily. +// Currently cert manager logic is only run on ingress ProxyGroup replicas that are responsible for managing certs for +// HA Ingress HTTPS endpoints ('write' replicas). +type certManager struct { + parentCtx context.Context + lc localClient + tracker goroutines.Tracker // tracks running goroutines + mu sync.Mutex // guards the following + // certLoops contains a map of DNS names, for which we currently need to + // manage certs to cancel functions that allow stopping a goroutine when + // we no longer need to manage certs for the DNS name. + certLoops map[string]context.CancelFunc +} + +// ensureCertLoops ensures that, for all currently managed Service HTTPS +// endpoints, there is a cert loop responsible for issuing and ensuring the +// renewal of the TLS certs. +func (cm *certManager) ensureCertLoops(ctx context.Context, sc *ipn.ServeConfig) error { + currentDomains := make(map[string]bool) + const httpsPort = "443" + for _, service := range sc.Services { + for hostPort := range service.Web { + domain, port, err := net.SplitHostPort(string(hostPort)) + if err != nil { + log.Printf("[unexpected] unable to parse HostPort %s", hostPort) + continue + } + if port != httpsPort { // HA Ingress' HTTP endpoint + continue + } + currentDomains[domain] = true + } + } + cm.mu.Lock() + defer cm.mu.Unlock() + for domain := range currentDomains { + if _, exists := cm.certLoops[domain]; !exists { + ctx, cancel := context.WithCancel(cm.parentCtx) + cm.certLoops[domain] = cancel + cm.tracker.Go(func() { cm.runCertLoop(ctx, domain) }) + } + } + + // Stop goroutines for domain names that are no longer in the config. + for domain, cancel := range cm.certLoops { + if !currentDomains[domain] { + cancel() + delete(cm.certLoops, domain) + } + } + return nil +} + +// runCertLoop: +// - calls localAPI certificate endpoint to ensure that certs are issued for the +// given domain name +// - calls localAPI certificate endpoint daily to ensure that certs are renewed +// - if certificate issuance failed retries after an exponential backoff period +// starting at 1 minute and capped at 24 hours. Reset the backoff once issuance succeeds. +// Note that renewal check also happens when the node receives an HTTPS request and it is possible that certs get +// renewed at that point. Renewal here is needed to prevent the shared certs from expiry in edge cases where the 'write' +// replica does not get any HTTPS requests. +// https://letsencrypt.org/docs/integration-guide/#retrying-failures +func (cm *certManager) runCertLoop(ctx context.Context, domain string) { + const ( + normalInterval = 24 * time.Hour // regular renewal check + initialRetry = 1 * time.Minute // initial backoff after a failure + maxRetryInterval = 24 * time.Hour // max backoff period + ) + timer := time.NewTimer(0) // fire off timer immediately + defer timer.Stop() + retryCount := 0 + for { + select { + case <-ctx.Done(): + return + case <-timer.C: + // We call the certificate endpoint, but don't do anything + // with the returned certs here. + // The call to the certificate endpoint will ensure that + // certs are issued/renewed as needed and stored in the + // relevant state store. For example, for HA Ingress + // 'write' replica, the cert and key will be stored in a + // Kubernetes Secret named after the domain for which we + // are issuing. + // Note that renewals triggered by the call to the + // certificates endpoint here and by renewal check + // triggered during a call to node's HTTPS endpoint + // share the same state/renewal lock mechanism, so we + // should not run into redundant issuances during + // concurrent renewal checks. + // TODO(irbekrm): maybe it is worth adding a new + // issuance endpoint that explicitly only triggers + // issuance and stores certs in the relevant store, but + // does not return certs to the caller? + _, _, err := cm.lc.CertPair(ctx, domain) + if err != nil { + log.Printf("error refreshing certificate for %s: %v", domain, err) + } + var nextInterval time.Duration + // TODO(irbekrm): distinguish between LE rate limit + // errors and other error types like transient network + // errors. + if err == nil { + retryCount = 0 + nextInterval = normalInterval + } else { + retryCount++ + // Calculate backoff: initialRetry * 2^(retryCount-1) + // For retryCount=1: 1min * 2^0 = 1min + // For retryCount=2: 1min * 2^1 = 2min + // For retryCount=3: 1min * 2^2 = 4min + backoff := initialRetry * time.Duration(1<<(retryCount-1)) + if backoff > maxRetryInterval { + backoff = maxRetryInterval + } + nextInterval = backoff + log.Printf("Error refreshing certificate for %s (retry %d): %v. Will retry in %v\n", + domain, retryCount, err, nextInterval) + } + timer.Reset(nextInterval) + } + } +} diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go index cf4bd8620..5f8052bb9 100644 --- a/cmd/containerboot/main.go +++ b/cmd/containerboot/main.go @@ -646,7 +646,7 @@ runLoop: if cfg.ServeConfigPath != "" { triggerWatchServeConfigChanges.Do(func() { - go watchServeConfigChanges(ctx, cfg.ServeConfigPath, certDomainChanged, certDomain, client, kc) + go watchServeConfigChanges(ctx, certDomainChanged, certDomain, client, kc, cfg) }) } diff --git a/cmd/containerboot/serve.go b/cmd/containerboot/serve.go index 4ea5a9c46..b0dec5f61 100644 --- a/cmd/containerboot/serve.go +++ b/cmd/containerboot/serve.go @@ -28,10 +28,11 @@ import ( // applies it to lc. It exits when ctx is canceled. cdChanged is a channel that // is written to when the certDomain changes, causing the serve config to be // re-read and applied. -func watchServeConfigChanges(ctx context.Context, path string, cdChanged <-chan bool, certDomainAtomic *atomic.Pointer[string], lc *local.Client, kc *kubeClient) { +func watchServeConfigChanges(ctx context.Context, cdChanged <-chan bool, certDomainAtomic *atomic.Pointer[string], lc *local.Client, kc *kubeClient, cfg *settings) { if certDomainAtomic == nil { panic("certDomainAtomic must not be nil") } + var tickChan <-chan time.Time var eventChan <-chan fsnotify.Event if w, err := fsnotify.NewWatcher(); err != nil { @@ -43,7 +44,7 @@ func watchServeConfigChanges(ctx context.Context, path string, cdChanged <-chan tickChan = ticker.C } else { defer w.Close() - if err := w.Add(filepath.Dir(path)); err != nil { + if err := w.Add(filepath.Dir(cfg.ServeConfigPath)); err != nil { log.Fatalf("serve proxy: failed to add fsnotify watch: %v", err) } eventChan = w.Events @@ -51,6 +52,14 @@ func watchServeConfigChanges(ctx context.Context, path string, cdChanged <-chan var certDomain string var prevServeConfig *ipn.ServeConfig + var cm certManager + if cfg.CertShareMode == "rw" { + cm = certManager{ + parentCtx: ctx, + certLoops: make(map[string]context.CancelFunc), + lc: lc, + } + } for { select { case <-ctx.Done(): @@ -63,12 +72,12 @@ func watchServeConfigChanges(ctx context.Context, path string, cdChanged <-chan // k8s handles these mounts. So just re-read the file and apply it // if it's changed. } - sc, err := readServeConfig(path, certDomain) + sc, err := readServeConfig(cfg.ServeConfigPath, certDomain) if err != nil { log.Fatalf("serve proxy: failed to read serve config: %v", err) } if sc == nil { - log.Printf("serve proxy: no serve config at %q, skipping", path) + log.Printf("serve proxy: no serve config at %q, skipping", cfg.ServeConfigPath) continue } if prevServeConfig != nil && reflect.DeepEqual(sc, prevServeConfig) { @@ -83,6 +92,12 @@ func watchServeConfigChanges(ctx context.Context, path string, cdChanged <-chan } } prevServeConfig = sc + if cfg.CertShareMode != "rw" { + continue + } + if err := cm.ensureCertLoops(ctx, sc); err != nil { + log.Fatalf("serve proxy: error ensuring cert loops: %v", err) + } } } @@ -96,6 +111,7 @@ func certDomainFromNetmap(nm *netmap.NetworkMap) string { // localClient is a subset of [local.Client] that can be mocked for testing. type localClient interface { SetServeConfig(context.Context, *ipn.ServeConfig) error + CertPair(context.Context, string) ([]byte, []byte, error) } func updateServeConfig(ctx context.Context, sc *ipn.ServeConfig, certDomain string, lc localClient) error { diff --git a/cmd/containerboot/settings.go b/cmd/containerboot/settings.go index 0da18e52c..142221b56 100644 --- a/cmd/containerboot/settings.go +++ b/cmd/containerboot/settings.go @@ -74,6 +74,7 @@ type settings struct { HealthCheckEnabled bool DebugAddrPort string EgressProxiesCfgPath string + CertShareMode string // Possible values 'ro' (readonly), 'rw' (read-write) } func configFromEnv() (*settings, error) { @@ -128,6 +129,17 @@ func configFromEnv() (*settings, error) { cfg.PodIPv6 = parsed.String() } } + // If cert share is enabled, set the replica as read or write. Only 0th + // replica should be able to write. + isInCertShareMode := defaultBool("TS_EXPERIMENTAL_CERT_SHARE", false) + if isInCertShareMode { + cfg.CertShareMode = "ro" + podName := os.Getenv("POD_NAME") + if strings.HasSuffix(podName, "-0") { + cfg.CertShareMode = "rw" + } + } + if err := cfg.validate(); err != nil { return nil, fmt.Errorf("invalid configuration: %v", err) } diff --git a/cmd/containerboot/tailscaled.go b/cmd/containerboot/tailscaled.go index 01ee96d3a..654b34757 100644 --- a/cmd/containerboot/tailscaled.go +++ b/cmd/containerboot/tailscaled.go @@ -33,6 +33,9 @@ func startTailscaled(ctx context.Context, cfg *settings) (*local.Client, *os.Pro cmd.SysProcAttr = &syscall.SysProcAttr{ Setpgid: true, } + if cfg.CertShareMode != "" { + cmd.Env = append(os.Environ(), "TS_CERT_SHARE_MODE="+cfg.CertShareMode) + } log.Printf("Starting tailscaled") if err := cmd.Start(); err != nil { return nil, nil, fmt.Errorf("starting tailscaled failed: %v", err) |
