summary refs log tree commit diff homepage
path: root/cmd/containerboot
diff options
context:
space:
mode:
Diffstat (limited to 'cmd/containerboot')
-rw-r--r-- cmd/containerboot/certs.go 146
-rw-r--r-- cmd/containerboot/main.go 2
-rw-r--r-- cmd/containerboot/serve.go 24
-rw-r--r-- cmd/containerboot/settings.go 12
-rw-r--r-- cmd/containerboot/tailscaled.go 3
5 files changed, 182 insertions, 5 deletions
diff --git a/cmd/containerboot/certs.go b/cmd/containerboot/certs.go
new file mode 100644
index 000000000..7d0ddce90
--- /dev/null
+++ b/cmd/containerboot/certs.go
@@ -0,0 +1,146 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build linux
+
+package main
+
+import (
+ "context"
+ "log"
+ "net"
+ "sync"
+ "time"
+
+ "tailscale.com/ipn"
+ "tailscale.com/util/goroutines"
+)
+
+// TODO:
+// - add logic to stop all the goroutines (on SIGTERM)
+// - add unit tests
+
+// certManager issues TLS certificates for the DNS names it is told about and
+// keeps one background loop per name that periodically re-attempts
+// issuance/renewal.
+//
+// Currently cert manager logic is only run on ingress ProxyGroup replicas that
+// are responsible for managing certs for HA Ingress HTTPS endpoints ('write'
+// replicas).
+type certManager struct {
+	// parentCtx is the context that every cert loop's context is derived
+	// from, so cancelling it stops all cert loops.
+	parentCtx context.Context
+	// lc is the client whose CertPair method is called to trigger cert
+	// issuance/renewal for a DNS name.
+	lc      localClient
+	tracker goroutines.Tracker // tracks running goroutines
+
+	mu sync.Mutex // guards the following
+	// certLoops maps each DNS name for which we currently manage certs to
+	// the cancel function that stops the cert loop goroutine for that name
+	// once we no longer need to manage its certs.
+	certLoops map[string]context.CancelFunc
+}
+
+// ensureCertLoops reconciles the running cert loops with the HTTPS endpoints
+// of the Services in sc: every DNS name served on port 443 gets a cert loop
+// (started here if it is not running yet), and loops for DNS names that are
+// no longer in the config are cancelled.
+//
+// The ctx argument is currently not used; cert loops derive their context
+// from cm.parentCtx. The returned error is currently always nil.
+func (cm *certManager) ensureCertLoops(ctx context.Context, sc *ipn.ServeConfig) error {
+	const httpsPort = "443"
+
+	// Collect the DNS names of all HTTPS endpoints in the serve config.
+	wanted := make(map[string]bool)
+	for _, svc := range sc.Services {
+		for hp := range svc.Web {
+			host, port, err := net.SplitHostPort(string(hp))
+			switch {
+			case err != nil:
+				log.Printf("[unexpected] unable to parse HostPort %s", hp)
+			case port == httpsPort:
+				wanted[host] = true
+			}
+			// Any other port is HA Ingress' HTTP endpoint, which
+			// needs no cert.
+		}
+	}
+
+	cm.mu.Lock()
+	defer cm.mu.Unlock()
+
+	// Start a cert loop for every wanted DNS name that does not have one.
+	for name := range wanted {
+		if _, running := cm.certLoops[name]; running {
+			continue
+		}
+		loopCtx, stop := context.WithCancel(cm.parentCtx)
+		cm.certLoops[name] = stop
+		cm.tracker.Go(func() { cm.runCertLoop(loopCtx, name) })
+	}
+
+	// Stop goroutines for domain names that are no longer in the config.
+	for name, stop := range cm.certLoops {
+		if wanted[name] {
+			continue
+		}
+		stop()
+		delete(cm.certLoops, name)
+	}
+	return nil
+}
+
+// runCertLoop ensures that a TLS cert for domain is issued and kept renewed.
+// It blocks until ctx is cancelled.
+//   - It calls the localAPI certificate endpoint immediately, and then again
+//     every 24 hours, so that certs are issued and renewed as needed.
+//   - If a call fails, it retries after an exponential backoff period that
+//     starts at 1 minute and is capped at 24 hours. The backoff is reset once
+//     a call succeeds.
+//
+// Note that a renewal check also happens when the node receives an HTTPS
+// request and it is possible that certs get renewed at that point. Renewal
+// here is needed to prevent the shared certs from expiring in edge cases
+// where the 'write' replica does not get any HTTPS requests.
+//
+// https://letsencrypt.org/docs/integration-guide/#retrying-failures
+func (cm *certManager) runCertLoop(ctx context.Context, domain string) {
+	const (
+		normalInterval   = 24 * time.Hour  // regular renewal check
+		initialRetry     = 1 * time.Minute // initial backoff after a failure
+		maxRetryInterval = 24 * time.Hour  // max backoff period
+	)
+	timer := time.NewTimer(0) // fire off timer immediately
+	defer timer.Stop()
+	retryCount := 0
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-timer.C:
+		}
+
+		// We call the certificate endpoint, but don't do anything with
+		// the returned certs here.
+		// The call to the certificate endpoint will ensure that certs
+		// are issued/renewed as needed and stored in the relevant state
+		// store. For example, for HA Ingress 'write' replica, the cert
+		// and key will be stored in a Kubernetes Secret named after the
+		// domain for which we are issuing.
+		// Note that renewals triggered by the call to the certificates
+		// endpoint here and by renewal check triggered during a call to
+		// node's HTTPS endpoint share the same state/renewal lock
+		// mechanism, so we should not run into redundant issuances
+		// during concurrent renewal checks.
+		// TODO(irbekrm): maybe it is worth adding a new issuance
+		// endpoint that explicitly only triggers issuance and stores
+		// certs in the relevant store, but does not return certs to the
+		// caller?
+		_, _, err := cm.lc.CertPair(ctx, domain)
+		if err == nil {
+			retryCount = 0
+			timer.Reset(normalInterval)
+			continue
+		}
+
+		// TODO(irbekrm): distinguish between LE rate limit errors and
+		// other error types like transient network errors.
+		retryCount++
+		// Backoff is initialRetry * 2^(retryCount-1), capped at
+		// maxRetryInterval:
+		// For retryCount=1: 1min * 2^0 = 1min
+		// For retryCount=2: 1min * 2^1 = 2min
+		// For retryCount=3: 1min * 2^2 = 4min
+		// It is computed by doubling until the cap is reached rather
+		// than with 1<<(retryCount-1): after a long run of failures the
+		// shifted product overflows to a negative or zero duration,
+		// which would slip past the cap and make the timer fire
+		// immediately in a tight loop.
+		backoff := initialRetry
+		for i := 1; i < retryCount && backoff < maxRetryInterval; i++ {
+			backoff *= 2
+		}
+		if backoff > maxRetryInterval {
+			backoff = maxRetryInterval
+		}
+		log.Printf("Error refreshing certificate for %s (retry %d): %v. Will retry in %v\n",
+			domain, retryCount, err, backoff)
+		timer.Reset(backoff)
+	}
+}
diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go
index cf4bd8620..5f8052bb9 100644
--- a/cmd/containerboot/main.go
+++ b/cmd/containerboot/main.go
@@ -646,7 +646,7 @@ runLoop:
if cfg.ServeConfigPath != "" {
triggerWatchServeConfigChanges.Do(func() {
- go watchServeConfigChanges(ctx, cfg.ServeConfigPath, certDomainChanged, certDomain, client, kc)
+ go watchServeConfigChanges(ctx, certDomainChanged, certDomain, client, kc, cfg)
})
}
diff --git a/cmd/containerboot/serve.go b/cmd/containerboot/serve.go
index 4ea5a9c46..b0dec5f61 100644
--- a/cmd/containerboot/serve.go
+++ b/cmd/containerboot/serve.go
@@ -28,10 +28,11 @@ import (
// applies it to lc. It exits when ctx is canceled. cdChanged is a channel that
// is written to when the certDomain changes, causing the serve config to be
// re-read and applied.
-func watchServeConfigChanges(ctx context.Context, path string, cdChanged <-chan bool, certDomainAtomic *atomic.Pointer[string], lc *local.Client, kc *kubeClient) {
+func watchServeConfigChanges(ctx context.Context, cdChanged <-chan bool, certDomainAtomic *atomic.Pointer[string], lc *local.Client, kc *kubeClient, cfg *settings) {
if certDomainAtomic == nil {
panic("certDomainAtomic must not be nil")
}
+
var tickChan <-chan time.Time
var eventChan <-chan fsnotify.Event
if w, err := fsnotify.NewWatcher(); err != nil {
@@ -43,7 +44,7 @@ func watchServeConfigChanges(ctx context.Context, path string, cdChanged <-chan
tickChan = ticker.C
} else {
defer w.Close()
- if err := w.Add(filepath.Dir(path)); err != nil {
+ if err := w.Add(filepath.Dir(cfg.ServeConfigPath)); err != nil {
log.Fatalf("serve proxy: failed to add fsnotify watch: %v", err)
}
eventChan = w.Events
@@ -51,6 +52,14 @@ func watchServeConfigChanges(ctx context.Context, path string, cdChanged <-chan
var certDomain string
var prevServeConfig *ipn.ServeConfig
+ var cm certManager
+ if cfg.CertShareMode == "rw" {
+ cm = certManager{
+ parentCtx: ctx,
+ certLoops: make(map[string]context.CancelFunc),
+ lc: lc,
+ }
+ }
for {
select {
case <-ctx.Done():
@@ -63,12 +72,12 @@ func watchServeConfigChanges(ctx context.Context, path string, cdChanged <-chan
// k8s handles these mounts. So just re-read the file and apply it
// if it's changed.
}
- sc, err := readServeConfig(path, certDomain)
+ sc, err := readServeConfig(cfg.ServeConfigPath, certDomain)
if err != nil {
log.Fatalf("serve proxy: failed to read serve config: %v", err)
}
if sc == nil {
- log.Printf("serve proxy: no serve config at %q, skipping", path)
+ log.Printf("serve proxy: no serve config at %q, skipping", cfg.ServeConfigPath)
continue
}
if prevServeConfig != nil && reflect.DeepEqual(sc, prevServeConfig) {
@@ -83,6 +92,12 @@ func watchServeConfigChanges(ctx context.Context, path string, cdChanged <-chan
}
}
prevServeConfig = sc
+ if cfg.CertShareMode != "rw" {
+ continue
+ }
+ if err := cm.ensureCertLoops(ctx, sc); err != nil {
+ log.Fatalf("serve proxy: error ensuring cert loops: %v", err)
+ }
}
}
@@ -96,6 +111,7 @@ func certDomainFromNetmap(nm *netmap.NetworkMap) string {
// localClient is a subset of [local.Client] that can be mocked for testing.
type localClient interface {
SetServeConfig(context.Context, *ipn.ServeConfig) error
+ CertPair(context.Context, string) ([]byte, []byte, error)
}
func updateServeConfig(ctx context.Context, sc *ipn.ServeConfig, certDomain string, lc localClient) error {
diff --git a/cmd/containerboot/settings.go b/cmd/containerboot/settings.go
index 0da18e52c..142221b56 100644
--- a/cmd/containerboot/settings.go
+++ b/cmd/containerboot/settings.go
@@ -74,6 +74,7 @@ type settings struct {
HealthCheckEnabled bool
DebugAddrPort string
EgressProxiesCfgPath string
+ CertShareMode string // Possible values 'ro' (readonly), 'rw' (read-write)
}
func configFromEnv() (*settings, error) {
@@ -128,6 +129,17 @@ func configFromEnv() (*settings, error) {
cfg.PodIPv6 = parsed.String()
}
}
+ // If cert share is enabled, set the replica as read or write. Only 0th
+ // replica should be able to write.
+ isInCertShareMode := defaultBool("TS_EXPERIMENTAL_CERT_SHARE", false)
+ if isInCertShareMode {
+ cfg.CertShareMode = "ro"
+ podName := os.Getenv("POD_NAME")
+ if strings.HasSuffix(podName, "-0") {
+ cfg.CertShareMode = "rw"
+ }
+ }
+
if err := cfg.validate(); err != nil {
return nil, fmt.Errorf("invalid configuration: %v", err)
}
diff --git a/cmd/containerboot/tailscaled.go b/cmd/containerboot/tailscaled.go
index 01ee96d3a..654b34757 100644
--- a/cmd/containerboot/tailscaled.go
+++ b/cmd/containerboot/tailscaled.go
@@ -33,6 +33,9 @@ func startTailscaled(ctx context.Context, cfg *settings) (*local.Client, *os.Pro
cmd.SysProcAttr = &syscall.SysProcAttr{
Setpgid: true,
}
+ if cfg.CertShareMode != "" {
+ cmd.Env = append(os.Environ(), "TS_CERT_SHARE_MODE="+cfg.CertShareMode)
+ }
log.Printf("Starting tailscaled")
if err := cmd.Start(); err != nil {
return nil, nil, fmt.Errorf("starting tailscaled failed: %v", err)