summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorTom Proctor <tomhjp@users.noreply.github.com>2025-07-03 13:48:48 +0100
committerchaosinthecrd <tom@tmlabs.co.uk>2026-02-09 14:24:29 +0000
commit393954cf7f0fd71536269e2e73ed832540ec94cd (patch)
treee9caa947021005c01dae67fa743a2cb221093536
parenta3215f1f9d3afd4a35973e4df12dc5fca87a3056 (diff)
downloadtailscale-chaosinthecrd/authkey-reissue.tar.xz
tailscale-chaosinthecrd/authkey-reissue.zip
cmd/{containerboot,k8s-operator}: reissue auth keys for broken proxieschaosinthecrd/authkey-reissue
Adds logic for containerboot to signal that it can't auth, so the operator can reissue a new auth key. This only applies when running with a config file and with a kube state store. If the operator sees reissue_authkey in a state Secret, it will create a new auth key iff the config has no auth key or its auth key matches the value of reissue_authkey from the state Secret. This is to ensure we don't reissue auth keys in a tight loop if the proxy is slow to start or failing for some other reason. The reissue logic also uses a burstable rate limiter to ensure there's no way a terminally misconfigured or buggy operator can automatically generate new auth keys in a tight loop. Updates #14080 Change-Id: I6982f8e741932a6891f2f48a2936f7f6a455317f Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com> (cherry picked from commit 969927c47c3d4de05e90f5b26a6d8d931c5ceed4)
-rw-r--r--cmd/containerboot/kube.go54
-rw-r--r--cmd/containerboot/kube_test.go76
-rw-r--r--cmd/containerboot/main.go48
-rw-r--r--cmd/containerboot/main_test.go144
-rw-r--r--cmd/k8s-operator/operator.go2
-rw-r--r--cmd/k8s-operator/proxygroup.go148
-rw-r--r--cmd/k8s-operator/proxygroup_test.go253
-rw-r--r--cmd/k8s-operator/sts.go14
-rw-r--r--cmd/k8s-operator/testutils_test.go4
-rw-r--r--kube/kubetypes/types.go16
10 files changed, 642 insertions, 117 deletions
diff --git a/cmd/containerboot/kube.go b/cmd/containerboot/kube.go
index 4943bddba..1743b8fa7 100644
--- a/cmd/containerboot/kube.go
+++ b/cmd/containerboot/kube.go
@@ -26,9 +26,10 @@ import (
"tailscale.com/tailcfg"
"tailscale.com/types/logger"
"tailscale.com/util/backoff"
- "tailscale.com/util/set"
)
+const fieldManager = "tailscale-container"
+
// kubeClient is a wrapper around Tailscale's internal kube client that knows how to talk to the kube API server. We use
// this rather than any of the upstream Kubernetes client libaries to avoid extra imports.
type kubeClient struct {
@@ -63,7 +64,7 @@ func (kc *kubeClient) storeDeviceID(ctx context.Context, deviceID tailcfg.Stable
kubetypes.KeyDeviceID: []byte(deviceID),
},
}
- return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, "tailscale-container")
+ return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager)
}
// storeDeviceEndpoints writes device's tailnet IPs and MagicDNS name to fields 'device_ips', 'device_fqdn' of client's
@@ -84,7 +85,7 @@ func (kc *kubeClient) storeDeviceEndpoints(ctx context.Context, fqdn string, add
kubetypes.KeyDeviceIPs: deviceIPs,
},
}
- return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, "tailscale-container")
+ return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager)
}
// storeHTTPSEndpoint writes an HTTPS endpoint exposed by this device via 'tailscale serve' to the client's state
@@ -96,7 +97,7 @@ func (kc *kubeClient) storeHTTPSEndpoint(ctx context.Context, ep string) error {
kubetypes.KeyHTTPSEndpoint: []byte(ep),
},
}
- return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, "tailscale-container")
+ return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager)
}
// deleteAuthKey deletes the 'authkey' field of the given kube
@@ -122,7 +123,7 @@ func (kc *kubeClient) deleteAuthKey(ctx context.Context) error {
// resetContainerbootState resets state from previous runs of containerboot to
// ensure the operator doesn't use stale state when a Pod is first recreated.
-func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string) error {
+func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string, tailscaledConfigAuthkey string) error {
existingSecret, err := kc.GetSecret(ctx, kc.stateSecret)
switch {
case kubeclient.IsNotFoundErr(err):
@@ -131,32 +132,45 @@ func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string
case err != nil:
return fmt.Errorf("failed to read state Secret %q to reset state: %w", kc.stateSecret, err)
}
+
s := &kubeapi.Secret{
Data: map[string][]byte{
kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion),
+
+ // TODO(tomhjp): Perhaps shouldn't clear device ID and use a different signal, as this could leak tailnet devices.
+ kubetypes.KeyDeviceID: nil,
+ kubetypes.KeyDeviceFQDN: nil,
+ kubetypes.KeyDeviceIPs: nil,
+ kubetypes.KeyHTTPSEndpoint: nil,
+ egressservices.KeyEgressServices: nil,
+ ingressservices.IngressConfigKey: nil,
},
}
if podUID != "" {
s.Data[kubetypes.KeyPodUID] = []byte(podUID)
}
- toClear := set.SetOf([]string{
- kubetypes.KeyDeviceID,
- kubetypes.KeyDeviceFQDN,
- kubetypes.KeyDeviceIPs,
- kubetypes.KeyHTTPSEndpoint,
- egressservices.KeyEgressServices,
- ingressservices.IngressConfigKey,
- })
- for key := range existingSecret.Data {
- if toClear.Contains(key) {
- // It's fine to leave the key in place as a debugging breadcrumb,
- // it should get a new value soon.
- s.Data[key] = nil
- }
+ // Only clear reissue_authkey if the operator has actioned it.
+ existingSecret, err = kc.GetSecret(ctx, kc.stateSecret)
+ if err != nil {
+ return fmt.Errorf("failed to read state Secret %q to reset state: %w", kc.stateSecret, err)
+ }
+
+ brokenAuthkey, ok := existingSecret.Data[kubetypes.KeyReissueAuthkey]
+ if ok && tailscaledConfigAuthkey != "" && string(brokenAuthkey) != tailscaledConfigAuthkey {
+ s.Data[kubetypes.KeyReissueAuthkey] = nil
}
- return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, "tailscale-container")
+ return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager)
+}
+
+func (kc *kubeClient) setReissueAuthKey(ctx context.Context, authKey string) error {
+ s := &kubeapi.Secret{
+ Data: map[string][]byte{
+ kubetypes.KeyReissueAuthkey: []byte(authKey), // Empty string means no auth key.
+ },
+ }
+ return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager)
}
// waitForConsistentState waits for tailscaled to finish writing state if it
diff --git a/cmd/containerboot/kube_test.go b/cmd/containerboot/kube_test.go
index bc80e9cdf..bd01f9a10 100644
--- a/cmd/containerboot/kube_test.go
+++ b/cmd/containerboot/kube_test.go
@@ -248,25 +248,42 @@ func TestResetContainerbootState(t *testing.T) {
capver := fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion)
for name, tc := range map[string]struct {
podUID string
+ authkey string
initial map[string][]byte
expected map[string][]byte
}{
"empty_initial": {
podUID: "1234",
+ authkey: "new-authkey",
initial: map[string][]byte{},
expected: map[string][]byte{
kubetypes.KeyCapVer: capver,
kubetypes.KeyPodUID: []byte("1234"),
+ // Cleared keys.
+ kubetypes.KeyDeviceID: nil,
+ kubetypes.KeyDeviceFQDN: nil,
+ kubetypes.KeyDeviceIPs: nil,
+ kubetypes.KeyHTTPSEndpoint: nil,
+ egressservices.KeyEgressServices: nil,
+ ingressservices.IngressConfigKey: nil,
},
},
"empty_initial_no_pod_uid": {
initial: map[string][]byte{},
expected: map[string][]byte{
kubetypes.KeyCapVer: capver,
+ // Cleared keys.
+ kubetypes.KeyDeviceID: nil,
+ kubetypes.KeyDeviceFQDN: nil,
+ kubetypes.KeyDeviceIPs: nil,
+ kubetypes.KeyHTTPSEndpoint: nil,
+ egressservices.KeyEgressServices: nil,
+ ingressservices.IngressConfigKey: nil,
},
},
"only_relevant_keys_updated": {
- podUID: "1234",
+ podUID: "1234",
+ authkey: "new-authkey",
initial: map[string][]byte{
kubetypes.KeyCapVer: []byte("1"),
kubetypes.KeyPodUID: []byte("5678"),
@@ -295,6 +312,57 @@ func TestResetContainerbootState(t *testing.T) {
// Tailscaled keys not included in patch.
},
},
+ "new_authkey_issued": {
+ initial: map[string][]byte{
+ kubetypes.KeyReissueAuthkey: []byte("old-authkey"),
+ },
+ authkey: "new-authkey",
+ expected: map[string][]byte{
+ kubetypes.KeyCapVer: capver,
+ kubetypes.KeyReissueAuthkey: nil,
+ // Cleared keys.
+ kubetypes.KeyDeviceID: nil,
+ kubetypes.KeyDeviceFQDN: nil,
+ kubetypes.KeyDeviceIPs: nil,
+ kubetypes.KeyHTTPSEndpoint: nil,
+ egressservices.KeyEgressServices: nil,
+ ingressservices.IngressConfigKey: nil,
+ },
+ },
+ "authkey_not_yet_updated": {
+ initial: map[string][]byte{
+ kubetypes.KeyReissueAuthkey: []byte("old-authkey"),
+ },
+ authkey: "old-authkey",
+ expected: map[string][]byte{
+ kubetypes.KeyCapVer: capver,
+ // reissue_authkey not cleared.
+ // Cleared keys.
+ kubetypes.KeyDeviceID: nil,
+ kubetypes.KeyDeviceFQDN: nil,
+ kubetypes.KeyDeviceIPs: nil,
+ kubetypes.KeyHTTPSEndpoint: nil,
+ egressservices.KeyEgressServices: nil,
+ ingressservices.IngressConfigKey: nil,
+ },
+ },
+ "authkey_deleted_from_config": {
+ initial: map[string][]byte{
+ kubetypes.KeyReissueAuthkey: []byte("old-authkey"),
+ },
+ authkey: "",
+ expected: map[string][]byte{
+ kubetypes.KeyCapVer: capver,
+ // reissue_authkey not cleared.
+ // Cleared keys.
+ kubetypes.KeyDeviceID: nil,
+ kubetypes.KeyDeviceFQDN: nil,
+ kubetypes.KeyDeviceIPs: nil,
+ kubetypes.KeyHTTPSEndpoint: nil,
+ egressservices.KeyEgressServices: nil,
+ ingressservices.IngressConfigKey: nil,
+ },
+ },
} {
t.Run(name, func(t *testing.T) {
var actual map[string][]byte
@@ -309,11 +377,11 @@ func TestResetContainerbootState(t *testing.T) {
return nil
},
}}
- if err := kc.resetContainerbootState(context.Background(), tc.podUID); err != nil {
+ if err := kc.resetContainerbootState(context.Background(), tc.podUID, tc.authkey); err != nil {
t.Fatalf("resetContainerbootState() error = %v", err)
}
- if diff := cmp.Diff(tc.expected, actual); diff != "" {
- t.Errorf("resetContainerbootState() mismatch (-want +got):\n%s", diff)
+ if diff := cmp.Diff(actual, tc.expected); diff != "" {
+ t.Errorf("Merge patch mismatch (-got +want):\n%s", diff)
}
})
}
diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go
index 9d8d3f023..7fcd6c7c1 100644
--- a/cmd/containerboot/main.go
+++ b/cmd/containerboot/main.go
@@ -133,7 +133,9 @@ import (
"golang.org/x/sys/unix"
"tailscale.com/client/tailscale"
+ "tailscale.com/health"
"tailscale.com/ipn"
+ "tailscale.com/ipn/conffile"
kubeutils "tailscale.com/k8s-operator"
healthz "tailscale.com/kube/health"
"tailscale.com/kube/kubetypes"
@@ -198,6 +200,11 @@ func run() error {
bootCtx, cancel := context.WithTimeout(ctx, 60*time.Second)
defer cancel()
+ var tailscaledConfigAuthkey string
+ if isOneStepConfig(cfg) {
+ tailscaledConfigAuthkey = authkeyFromTailscaledConfig(cfg.TailscaledConfigFilePath)
+ }
+
var kc *kubeClient
if cfg.InKubernetes {
kc, err = newKubeClient(cfg.Root, cfg.KubeSecret)
@@ -211,7 +218,7 @@ func run() error {
// hasKubeStateStore because although we know we're in kube, that
// doesn't guarantee the state store is properly configured.
if hasKubeStateStore(cfg) {
- if err := kc.resetContainerbootState(bootCtx, cfg.PodUID); err != nil {
+ if err := kc.resetContainerbootState(bootCtx, cfg.PodUID, tailscaledConfigAuthkey); err != nil {
return fmt.Errorf("error clearing previous state from Secret: %w", err)
}
}
@@ -330,7 +337,7 @@ func run() error {
if err := tailscaleUp(bootCtx, cfg); err != nil {
return fmt.Errorf("failed to auth tailscale: %w", err)
}
- w, err = client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState)
+ w, err = client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState|ipn.NotifyInitialHealthState)
if err != nil {
return fmt.Errorf("rewatching tailscaled for updates after auth: %w", err)
}
@@ -356,7 +363,15 @@ authLoop:
if isOneStepConfig(cfg) {
// This could happen if this is the first time tailscaled was run for this
// device and the auth key was not passed via the configfile.
- return fmt.Errorf("invalid state: tailscaled daemon started with a config file, but tailscale is not logged in: ensure you pass a valid auth key in the config file.")
+ if hasKubeStateStore(cfg) {
+ setErr := kc.setReissueAuthKey(bootCtx, tailscaledConfigAuthkey)
+ if setErr != nil {
+ return fmt.Errorf("failed to set reissue_authkey in Kubernetes Secret after NeedsLogin state change: %w", setErr)
+ }
+
+ return errors.New("invalid state: tailscaled daemon started with a config file, but tailscale is not logged in; auth key reissue from operator requested")
+ }
+ return errors.New("invalid state: tailscaled daemon started with a config file, but tailscale is not logged in: ensure you pass a valid auth key in the config file")
}
if err := authTailscale(); err != nil {
return fmt.Errorf("failed to auth tailscale: %w", err)
@@ -375,6 +390,21 @@ authLoop:
log.Printf("tailscaled in state %q, waiting", *n.State)
}
}
+
+ if n.Health != nil {
+ // This can happen if the config has an auth key but its invalid,
+ // for example if it was single-use and already got used, but the
+ // device state was lost.
+ if _, ok := n.Health.Warnings[health.LoginStateWarnable.Code]; ok {
+ if isOneStepConfig(cfg) && hasKubeStateStore(cfg) {
+ err := kc.setReissueAuthKey(bootCtx, tailscaledConfigAuthkey)
+ if err != nil {
+ return fmt.Errorf("failed to set reissue_authkey in Kubernetes Secret after login state warning: %w", err)
+ }
+ return errors.New("tailscaled failed to log in with the auth key from its config file; auth key reissue from operator requested")
+ }
+ }
+ }
}
w.Close()
@@ -400,9 +430,9 @@ authLoop:
// We were told to only auth once, so any secret-bound
// authkey is no longer needed. We don't strictly need to
// wipe it, but it's good hygiene.
- log.Printf("Deleting authkey from kube secret")
+ log.Printf("Deleting authkey from Kubernetes Secret")
if err := kc.deleteAuthKey(ctx); err != nil {
- return fmt.Errorf("deleting authkey from kube secret: %w", err)
+ return fmt.Errorf("deleting authkey from Kubernetes Secret: %w", err)
}
}
@@ -957,3 +987,11 @@ func serviceIPsFromNetMap(nm *netmap.NetworkMap, fqdn dnsname.FQDN) []netip.Pref
return prefixes
}
+
+func authkeyFromTailscaledConfig(path string) string {
+ if cfg, err := conffile.Load(path); err == nil && cfg.Parsed.AuthKey != nil {
+ return *cfg.Parsed.AuthKey
+ }
+
+ return ""
+}
diff --git a/cmd/containerboot/main_test.go b/cmd/containerboot/main_test.go
index 6eeb59c9b..7a463fd76 100644
--- a/cmd/containerboot/main_test.go
+++ b/cmd/containerboot/main_test.go
@@ -31,6 +31,7 @@ import (
"github.com/google/go-cmp/cmp"
"golang.org/x/sys/unix"
+ "tailscale.com/health"
"tailscale.com/ipn"
"tailscale.com/kube/egressservices"
"tailscale.com/kube/kubeclient"
@@ -41,6 +42,8 @@ import (
"tailscale.com/types/ptr"
)
+const configFileAuthKey = "some-auth-key"
+
func TestContainerBoot(t *testing.T) {
boot := filepath.Join(t.TempDir(), "containerboot")
if err := exec.Command("go", "build", "-ldflags", "-X main.testSleepDuration=1ms", "-o", boot, "tailscale.com/cmd/containerboot").Run(); err != nil {
@@ -781,6 +784,101 @@ func TestContainerBoot(t *testing.T) {
},
}
},
+ "sets_reissue_authkey_if_needs_login": func(env *testEnv) testCase {
+ return testCase{
+ Env: map[string]string{
+ "TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR": filepath.Join(env.d, "etc/tailscaled/"),
+ "KUBERNETES_SERVICE_HOST": env.kube.Host,
+ "KUBERNETES_SERVICE_PORT_HTTPS": env.kube.Port,
+ },
+ Phases: []phase{
+ {
+ WantCmds: []string{
+ "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking --config=/etc/tailscaled/cap-95.hujson",
+ },
+ WantKubeSecret: map[string]string{
+ kubetypes.KeyCapVer: capver,
+ },
+ }, {
+ Notify: &ipn.Notify{
+ State: ptr.To(ipn.NeedsLogin),
+ },
+ WantKubeSecret: map[string]string{
+ kubetypes.KeyCapVer: capver,
+ kubetypes.KeyReissueAuthkey: configFileAuthKey,
+ },
+ WantExitCode: ptr.To(1),
+ WantLog: "invalid state: tailscaled daemon started with a config file, but tailscale is not logged in; auth key reissue from operator requested",
+ },
+ },
+ }
+ },
+ "sets_reissue_authkey_if_auth_fails": func(env *testEnv) testCase {
+ return testCase{
+ Env: map[string]string{
+ "TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR": filepath.Join(env.d, "etc/tailscaled/"),
+ "KUBERNETES_SERVICE_HOST": env.kube.Host,
+ "KUBERNETES_SERVICE_PORT_HTTPS": env.kube.Port,
+ },
+ Phases: []phase{
+ {
+ WantCmds: []string{
+ "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking --config=/etc/tailscaled/cap-95.hujson",
+ },
+ WantKubeSecret: map[string]string{
+ kubetypes.KeyCapVer: capver,
+ },
+ }, {
+ Notify: &ipn.Notify{
+ Health: &health.State{
+ Warnings: map[health.WarnableCode]health.UnhealthyState{
+ health.LoginStateWarnable.Code: {},
+ },
+ },
+ },
+ WantKubeSecret: map[string]string{
+ kubetypes.KeyCapVer: capver,
+ kubetypes.KeyReissueAuthkey: configFileAuthKey,
+ },
+ WantExitCode: ptr.To(1),
+ WantLog: "tailscaled failed to log in with the auth key from its config file; auth key reissue from operator requested",
+ },
+ },
+ }
+ },
+ "clears_reissue_authkey_on_change": func(env *testEnv) testCase {
+ return testCase{
+ Env: map[string]string{
+ "TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR": filepath.Join(env.d, "etc/tailscaled/"),
+ "KUBERNETES_SERVICE_HOST": env.kube.Host,
+ "KUBERNETES_SERVICE_PORT_HTTPS": env.kube.Port,
+ },
+ KubeSecret: map[string]string{
+ kubetypes.KeyReissueAuthkey: "some-older-authkey",
+ "foo": "bar", // Check not everything is cleared.
+ },
+ Phases: []phase{
+ {
+ WantCmds: []string{
+ "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking --config=/etc/tailscaled/cap-95.hujson",
+ },
+ WantKubeSecret: map[string]string{
+ kubetypes.KeyCapVer: capver,
+ "foo": "bar",
+ },
+ }, {
+ Notify: runningNotify,
+ WantKubeSecret: map[string]string{
+ kubetypes.KeyCapVer: capver,
+ "foo": "bar",
+ kubetypes.KeyDeviceFQDN: "test-node.test.ts.net",
+ kubetypes.KeyDeviceID: "myID",
+ kubetypes.KeyDeviceIPs: `["100.64.0.1"]`,
+ },
+ },
+ },
+ }
+ },
"metrics_enabled": func(env *testEnv) testCase {
return testCase{
Env: map[string]string{
@@ -1129,21 +1227,6 @@ func TestContainerBoot(t *testing.T) {
}
}
- if p.WantExitCode != nil {
- state, err := cmd.Process.Wait()
- if err != nil {
- t.Fatal(err)
- }
- if state.ExitCode() != *p.WantExitCode {
- t.Fatalf("phase %d: want exit code %d, got %d", i, *p.WantExitCode, state.ExitCode())
- }
-
- // Early test return, we don't expect the successful startup log message.
- return
- }
-
- wantCmds = append(wantCmds, p.WantCmds...)
- waitArgs(t, 2*time.Second, env.d, env.argFile, strings.Join(wantCmds, "\n"))
err := tstest.WaitFor(2*time.Second, func() error {
if p.WantKubeSecret != nil {
got := env.kube.Secret()
@@ -1161,6 +1244,23 @@ func TestContainerBoot(t *testing.T) {
if err != nil {
t.Fatalf("phase %d: %v", i, err)
}
+
+ if p.WantExitCode != nil {
+ state, err := cmd.Process.Wait()
+ if err != nil {
+ t.Fatal(err)
+ }
+ if state.ExitCode() != *p.WantExitCode {
+ t.Fatalf("phase %d: want exit code %d, got %d", i, *p.WantExitCode, state.ExitCode())
+ }
+
+ // Early test return, we don't expect the successful startup log message.
+ return
+ }
+
+ wantCmds = append(wantCmds, p.WantCmds...)
+ waitArgs(t, 2*time.Second, env.d, env.argFile, strings.Join(wantCmds, "\n"))
+
err = tstest.WaitFor(2*time.Second, func() error {
for path, want := range p.WantFiles {
gotBs, err := os.ReadFile(filepath.Join(env.d, path))
@@ -1559,7 +1659,11 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
panic(fmt.Sprintf("json decode failed: %v. Body:\n\n%s", err, string(bs)))
}
for key, val := range req.Data {
- k.secret[key] = string(val)
+ if val == nil {
+ delete(k.secret, key)
+ } else {
+ k.secret[key] = string(val)
+ }
}
default:
panic(fmt.Sprintf("unknown content type %q", r.Header.Get("Content-Type")))
@@ -1569,12 +1673,6 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) {
}
}
-func mustBase64(t *testing.T, v any) string {
- b := mustJSON(t, v)
- s := base64.StdEncoding.WithPadding('=').EncodeToString(b)
- return s
-}
-
func mustJSON(t *testing.T, v any) []byte {
b, err := json.Marshal(v)
if err != nil {
@@ -1633,7 +1731,7 @@ func newTestEnv(t *testing.T) testEnv {
kube.Start(t)
t.Cleanup(kube.Close)
- tailscaledConf := &ipn.ConfigVAlpha{AuthKey: ptr.To("foo"), Version: "alpha0"}
+ tailscaledConf := &ipn.ConfigVAlpha{AuthKey: ptr.To(configFileAuthKey), Version: "alpha0"}
serveConf := ipn.ServeConfig{TCP: map[uint16]*ipn.TCPPortHandler{80: {HTTP: true}}}
egressCfg := egressSvcConfig("foo", "foo.tailnetxyz.ts.net")
diff --git a/cmd/k8s-operator/operator.go b/cmd/k8s-operator/operator.go
index 4f48c1812..cc7d9d91f 100644
--- a/cmd/k8s-operator/operator.go
+++ b/cmd/k8s-operator/operator.go
@@ -20,6 +20,7 @@ import (
"github.com/go-logr/zapr"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
+ "golang.org/x/time/rate"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
discoveryv1 "k8s.io/api/discovery/v1"
@@ -717,6 +718,7 @@ func runReconcilers(opts reconcilerOpts) {
tsFirewallMode: opts.proxyFirewallMode,
defaultProxyClass: opts.defaultProxyClass,
loginServer: opts.tsServer.ControlURL,
+ authKeyRateLimits: make(map[string]*rate.Limiter),
})
if err != nil {
startlog.Fatalf("could not create ProxyGroup reconciler: %v", err)
diff --git a/cmd/k8s-operator/proxygroup.go b/cmd/k8s-operator/proxygroup.go
index 13c3d7b71..126082643 100644
--- a/cmd/k8s-operator/proxygroup.go
+++ b/cmd/k8s-operator/proxygroup.go
@@ -16,10 +16,12 @@ import (
"sort"
"strings"
"sync"
+ "time"
dockerref "github.com/distribution/reference"
"go.uber.org/zap"
xslices "golang.org/x/exp/slices"
+ "golang.org/x/time/rate"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
@@ -95,10 +97,11 @@ type ProxyGroupReconciler struct {
defaultProxyClass string
loginServer string
- mu sync.Mutex // protects following
- egressProxyGroups set.Slice[types.UID] // for egress proxygroups gauge
- ingressProxyGroups set.Slice[types.UID] // for ingress proxygroups gauge
- apiServerProxyGroups set.Slice[types.UID] // for kube-apiserver proxygroups gauge
+ mu sync.Mutex // protects following
+ egressProxyGroups set.Slice[types.UID] // for egress proxygroups gauge
+ ingressProxyGroups set.Slice[types.UID] // for ingress proxygroups gauge
+ apiServerProxyGroups set.Slice[types.UID] // for kube-apiserver proxygroups gauge
+ authKeyRateLimits map[string]*rate.Limiter // per-ProxyGroup rate limiters for auth key re-issuance.
}
func (r *ProxyGroupReconciler) logger(name string) *zap.SugaredLogger {
@@ -300,7 +303,7 @@ func (r *ProxyGroupReconciler) validate(ctx context.Context, pg *tsapi.ProxyGrou
func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, tailscaleClient tsClient, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (map[string][]netip.AddrPort, *notReadyReason, error) {
logger := r.logger(pg.Name)
r.mu.Lock()
- r.ensureAddedToGaugeForProxyGroup(pg)
+ r.ensureStateAddedForProxyGroup(pg)
r.mu.Unlock()
svcToNodePorts := make(map[string]uint16)
@@ -637,13 +640,13 @@ func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, tai
}
for _, m := range metadata {
- if m.ordinal+1 <= int(pgReplicas(pg)) {
+ if m.ordinal+1 <= pgReplicas(pg) {
continue
}
// Dangling resource, delete the config + state Secrets, as well as
// deleting the device from the tailnet.
- if err := r.deleteTailnetDevice(ctx, tailscaleClient, m.tsID, logger); err != nil {
+ if err := r.ensureDeviceDeleted(ctx, tailscaleClient, m.tsID, logger); err != nil {
return err
}
if err := r.Delete(ctx, m.stateSecret); err != nil && !apierrors.IsNotFound(err) {
@@ -695,7 +698,7 @@ func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, tailscaleClient
}
for _, m := range metadata {
- if err := r.deleteTailnetDevice(ctx, tailscaleClient, m.tsID, logger); err != nil {
+ if err := r.ensureDeviceDeleted(ctx, tailscaleClient, m.tsID, logger); err != nil {
return false, err
}
}
@@ -711,12 +714,12 @@ func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, tailscaleClient
logger.Infof("cleaned up ProxyGroup resources")
r.mu.Lock()
- r.ensureRemovedFromGaugeForProxyGroup(pg)
+ r.ensureStateRemovedForProxyGroup(pg)
r.mu.Unlock()
return true, nil
}
-func (r *ProxyGroupReconciler) deleteTailnetDevice(ctx context.Context, tailscaleClient tsClient, id tailcfg.StableNodeID, logger *zap.SugaredLogger) error {
+func (r *ProxyGroupReconciler) ensureDeviceDeleted(ctx context.Context, tailscaleClient tsClient, id tailcfg.StableNodeID, logger *zap.SugaredLogger) error {
logger.Debugf("deleting device %s from control", string(id))
if err := tailscaleClient.DeleteDevice(ctx, string(id)); err != nil {
errResp := &tailscale.ErrResponse{}
@@ -742,6 +745,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(
logger := r.logger(pg.Name)
endpoints = make(map[string][]netip.AddrPort, pgReplicas(pg)) // keyed by Service name.
for i := range pgReplicas(pg) {
+ logger = logger.With("Pod", fmt.Sprintf("%s-%d", pg.Name, i))
cfgSecret := &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: pgConfigSecretName(pg.Name, i),
@@ -785,7 +789,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(
return nil, err
}
- if shouldRetainAuthKey(stateSecret) && existingCfgSecret != nil {
+ if deviceAuthed(stateSecret) && existingCfgSecret != nil {
authKey, err = authKeyFromSecret(existingCfgSecret)
if err != nil {
return nil, fmt.Errorf("error retrieving auth key from existing config Secret: %w", err)
@@ -931,6 +935,105 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(
return endpoints, nil
}
+// getAuthKey looks at the proxy's state and config Secrets, and may return:
+// * a newly created auth key,
+// * an existing auth key from the config Secret,
+// * or nil if the device is authed.
+//
+// It will create a new auth key if the config Secret is not yet created,
+// or if the proxy has set reissue_authkey in its state Secret.
+func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, tailscaleClient tsClient, pg *tsapi.ProxyGroup, existingCfgSecret *corev1.Secret, ordinal int32, logger *zap.SugaredLogger) (*string, error) {
+ // Get state Secret to check if it's already authed or has requested
+ // a fresh auth key.
+ stateSecret := &corev1.Secret{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: pgStateSecretName(pg.Name, ordinal),
+ Namespace: r.tsNamespace,
+ },
+ }
+ if err := r.Get(ctx, client.ObjectKeyFromObject(stateSecret), stateSecret); err != nil && !apierrors.IsNotFound(err) {
+ return nil, err
+ }
+
+ var createAuthKey bool
+ var cfgAuthKey *string
+ if existingCfgSecret == nil {
+ createAuthKey = true
+ } else {
+ var err error
+ cfgAuthKey, err = authKeyFromSecret(existingCfgSecret)
+ if err != nil {
+ return nil, fmt.Errorf("error retrieving auth key from existing config Secret: %w", err)
+ }
+ }
+
+ if shouldReissueAuthKey(stateSecret, cfgAuthKey) {
+ logger.Infof("Proxy is failing to auth; will attempt to clean up the old device if any found and issue a new auth key")
+
+ // If the device is already authed, we want to delete it from the tailnet.
+ if tsID, ok := stateSecret.Data[kubetypes.KeyDeviceID]; ok && len(tsID) > 0 {
+ id := tailcfg.StableNodeID(tsID)
+ if err := r.ensureDeviceDeleted(ctx, tailscaleClient, id, logger); err != nil {
+ return nil, err
+ }
+ }
+
+ if lim := r.authKeyRateLimits[pg.Name]; lim.Allow() {
+ createAuthKey = true
+ } else {
+ logger.Debugf("auth key re-issuance rate limit exceeded, limit: %.2f, burst: %d, tokens: %.2f", lim.Limit(), lim.Burst(), lim.Tokens())
+ return nil, fmt.Errorf("auth key re-issuance rate limit exceeded for ProxyGroup %q, will retry with backoff", pg.Name)
+ }
+ }
+
+ var authKey *string
+ if createAuthKey {
+ logger.Debugf("Creating auth key for ProxyGroup proxy")
+
+ tags := pg.Spec.Tags.Stringify()
+ if len(tags) == 0 {
+ tags = r.defaultTags
+ }
+ key, err := newAuthKey(ctx, r.tsClient, tags)
+ if err != nil {
+ return nil, err
+ }
+ authKey = &key
+ } else if !deviceAuthed(stateSecret) {
+ // Retain auth key from existing config.
+ authKey = cfgAuthKey
+ }
+
+ return authKey, nil
+}
+
+// shouldReissueAuthKey extracts the value of reissue_authkey from the proxy's
+// state Secret, and returns true if a new auth key is needed. The proxy will
+// set the value of reissue_authkey to the auth key with which the it failed to
+// auth, or empty if it didn't have an auth key in its config file.
+func shouldReissueAuthKey(s *corev1.Secret, authKeyInConfig *string) bool {
+ // If the key exists but the value is empty, that means a previous reissue
+ // request got cleared.
+ brokenAuthkey, reissueRequested := s.Data[kubetypes.KeyReissueAuthkey]
+ if !reissueRequested {
+ return false
+ }
+
+ // Reissue requested and no auth key in config, definitely reissue.
+ if authKeyInConfig == nil || *authKeyInConfig == "" {
+ return true
+ }
+
+ // The auth key we were going to use is already reported broken, reissue.
+ if *authKeyInConfig == string(brokenAuthkey) {
+ return true
+ }
+
+ // Make sure we don't reissue again if we happened to reconcile again before
+ // the proxy got a chance to auth with a reissued auth key.
+ return false
+}
+
type FindStaticEndpointErr struct {
msg string
}
@@ -1024,9 +1127,9 @@ func getStaticEndpointAddress(a *corev1.NodeAddress, port uint16) *netip.AddrPor
return ptr.To(netip.AddrPortFrom(addr, port))
}
-// ensureAddedToGaugeForProxyGroup ensures the gauge metric for the ProxyGroup resource is updated when the ProxyGroup
-// is created. r.mu must be held.
-func (r *ProxyGroupReconciler) ensureAddedToGaugeForProxyGroup(pg *tsapi.ProxyGroup) {
+// ensureStateAddedForProxyGroup ensures the gauge metric for the ProxyGroup resource is updated when the ProxyGroup
+// is created, and initialises per-ProxyGroup rate limits on re-issuing auth keys. r.mu must be held.
+func (r *ProxyGroupReconciler) ensureStateAddedForProxyGroup(pg *tsapi.ProxyGroup) {
switch pg.Spec.Type {
case tsapi.ProxyGroupTypeEgress:
r.egressProxyGroups.Add(pg.UID)
@@ -1038,11 +1141,17 @@ func (r *ProxyGroupReconciler) ensureAddedToGaugeForProxyGroup(pg *tsapi.ProxyGr
gaugeEgressProxyGroupResources.Set(int64(r.egressProxyGroups.Len()))
gaugeIngressProxyGroupResources.Set(int64(r.ingressProxyGroups.Len()))
gaugeAPIServerProxyGroupResources.Set(int64(r.apiServerProxyGroups.Len()))
+
+ if _, ok := r.authKeyRateLimits[pg.Name]; !ok {
+ // Allow every replica to have its auth key re-issued quickly the first
+ // time, but with an overall limit of 1 every 30s after a burst.
+ r.authKeyRateLimits[pg.Name] = rate.NewLimiter(rate.Every(30*time.Second), int(pgReplicas(pg)))
+ }
}
-// ensureRemovedFromGaugeForProxyGroup ensures the gauge metric for the ProxyGroup resource type is updated when the
-// ProxyGroup is deleted. r.mu must be held.
-func (r *ProxyGroupReconciler) ensureRemovedFromGaugeForProxyGroup(pg *tsapi.ProxyGroup) {
+// ensureStateRemovedForProxyGroup ensures the gauge metric for the ProxyGroup resource type is updated when the
+// ProxyGroup is deleted, and deletes the per-ProxyGroup rate limiter to free memory. r.mu must be held.
+func (r *ProxyGroupReconciler) ensureStateRemovedForProxyGroup(pg *tsapi.ProxyGroup) {
switch pg.Spec.Type {
case tsapi.ProxyGroupTypeEgress:
r.egressProxyGroups.Remove(pg.UID)
@@ -1054,6 +1163,7 @@ func (r *ProxyGroupReconciler) ensureRemovedFromGaugeForProxyGroup(pg *tsapi.Pro
gaugeEgressProxyGroupResources.Set(int64(r.egressProxyGroups.Len()))
gaugeIngressProxyGroupResources.Set(int64(r.ingressProxyGroups.Len()))
gaugeAPIServerProxyGroupResources.Set(int64(r.apiServerProxyGroups.Len()))
+ delete(r.authKeyRateLimits, pg.Name)
}
func pgTailscaledConfig(pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass, idx int32, authKey *string, staticEndpoints []netip.AddrPort, oldAdvertiseServices []string, loginServer string) (tailscaledConfigs, error) {
@@ -1114,7 +1224,7 @@ func (r *ProxyGroupReconciler) getNodeMetadata(ctx context.Context, pg *tsapi.Pr
return nil, fmt.Errorf("failed to list state Secrets: %w", err)
}
for _, secret := range secrets.Items {
- var ordinal int
+ var ordinal int32
if _, err := fmt.Sscanf(secret.Name, pg.Name+"-%d", &ordinal); err != nil {
return nil, fmt.Errorf("unexpected secret %s was labelled as owned by the ProxyGroup %s: %w", secret.Name, pg.Name, err)
}
@@ -1198,7 +1308,7 @@ func (r *ProxyGroupReconciler) getRunningProxies(ctx context.Context, pg *tsapi.
}
type nodeMetadata struct {
- ordinal int
+ ordinal int32
stateSecret *corev1.Secret
podUID string // or empty if the Pod no longer exists.
tsID tailcfg.StableNodeID
diff --git a/cmd/k8s-operator/proxygroup_test.go b/cmd/k8s-operator/proxygroup_test.go
index c58bd2bb7..0d0bdc3b3 100644
--- a/cmd/k8s-operator/proxygroup_test.go
+++ b/cmd/k8s-operator/proxygroup_test.go
@@ -6,15 +6,19 @@
package main
import (
+ "context"
"encoding/json"
"fmt"
"net/netip"
+ "reflect"
"slices"
+ "strings"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"go.uber.org/zap"
+ "golang.org/x/time/rate"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
@@ -28,7 +32,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client/fake"
"tailscale.com/client/tailscale"
"tailscale.com/ipn"
- kube "tailscale.com/k8s-operator"
tsoperator "tailscale.com/k8s-operator"
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
"tailscale.com/kube/k8s-proxy/conf"
@@ -638,10 +641,11 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) {
tsFirewallMode: "auto",
defaultProxyClass: "default-pc",
- Client: fc,
- tsClient: tsClient,
- recorder: fr,
- clock: cl,
+ Client: fc,
+ tsClient: tsClient,
+ recorder: fr,
+ clock: cl,
+ authKeyRateLimits: make(map[string]*rate.Limiter),
}
for i, r := range tt.reconciles {
@@ -781,11 +785,12 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) {
tsFirewallMode: "auto",
defaultProxyClass: "default-pc",
- Client: fc,
- tsClient: tsClient,
- recorder: fr,
- log: zl.Sugar().With("TestName", tt.name).With("Reconcile", "cleanup"),
- clock: cl,
+ Client: fc,
+ tsClient: tsClient,
+ recorder: fr,
+ log: zl.Sugar().With("TestName", tt.name).With("Reconcile", "cleanup"),
+ clock: cl,
+ authKeyRateLimits: make(map[string]*rate.Limiter),
}
if err := fc.Delete(t.Context(), pg); err != nil {
@@ -842,12 +847,14 @@ func TestProxyGroup(t *testing.T) {
tsFirewallMode: "auto",
defaultProxyClass: "default-pc",
- Client: fc,
- tsClient: tsClient,
- recorder: fr,
- log: zl.Sugar(),
- clock: cl,
+ Client: fc,
+ tsClient: tsClient,
+ recorder: fr,
+ log: zl.Sugar(),
+ clock: cl,
+ authKeyRateLimits: make(map[string]*rate.Limiter),
}
+
crd := &apiextensionsv1.CustomResourceDefinition{ObjectMeta: metav1.ObjectMeta{Name: serviceMonitorCRD}}
opts := configOpts{
proxyType: "proxygroup",
@@ -864,7 +871,7 @@ func TestProxyGroup(t *testing.T) {
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "the ProxyGroup's ProxyClass \"default-pc\" is not yet in a ready state, waiting...", 1, cl, zl.Sugar())
expectEqual(t, fc, pg)
expectProxyGroupResources(t, fc, pg, false, pc)
- if kube.ProxyGroupAvailable(pg) {
+ if tsoperator.ProxyGroupAvailable(pg) {
t.Fatal("expected ProxyGroup to not be available")
}
})
@@ -892,7 +899,7 @@ func TestProxyGroup(t *testing.T) {
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupAvailable, metav1.ConditionFalse, reasonProxyGroupCreating, "0/2 ProxyGroup pods running", 0, cl, zl.Sugar())
expectEqual(t, fc, pg)
expectProxyGroupResources(t, fc, pg, true, pc)
- if kube.ProxyGroupAvailable(pg) {
+ if tsoperator.ProxyGroupAvailable(pg) {
t.Fatal("expected ProxyGroup to not be available")
}
if expected := 1; reconciler.egressProxyGroups.Len() != expected {
@@ -936,7 +943,7 @@ func TestProxyGroup(t *testing.T) {
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupAvailable, metav1.ConditionTrue, reasonProxyGroupAvailable, "2/2 ProxyGroup pods running", 0, cl, zl.Sugar())
expectEqual(t, fc, pg)
expectProxyGroupResources(t, fc, pg, true, pc)
- if !kube.ProxyGroupAvailable(pg) {
+ if !tsoperator.ProxyGroupAvailable(pg) {
t.Fatal("expected ProxyGroup to be available")
}
})
@@ -1046,12 +1053,13 @@ func TestProxyGroupTypes(t *testing.T) {
zl, _ := zap.NewDevelopment()
reconciler := &ProxyGroupReconciler{
- tsNamespace: tsNamespace,
- tsProxyImage: testProxyImage,
- Client: fc,
- log: zl.Sugar(),
- tsClient: &fakeTSClient{},
- clock: tstest.NewClock(tstest.ClockOpts{}),
+ tsNamespace: tsNamespace,
+ tsProxyImage: testProxyImage,
+ Client: fc,
+ log: zl.Sugar(),
+ tsClient: &fakeTSClient{},
+ clock: tstest.NewClock(tstest.ClockOpts{}),
+ authKeyRateLimits: make(map[string]*rate.Limiter),
}
t.Run("egress_type", func(t *testing.T) {
@@ -1424,12 +1432,13 @@ func TestIngressAdvertiseServicesConfigPreserved(t *testing.T) {
WithStatusSubresource(&tsapi.ProxyGroup{}).
Build()
reconciler := &ProxyGroupReconciler{
- tsNamespace: tsNamespace,
- tsProxyImage: testProxyImage,
- Client: fc,
- log: zap.Must(zap.NewDevelopment()).Sugar(),
- tsClient: &fakeTSClient{},
- clock: tstest.NewClock(tstest.ClockOpts{}),
+ tsNamespace: tsNamespace,
+ tsProxyImage: testProxyImage,
+ Client: fc,
+ log: zap.Must(zap.NewDevelopment()).Sugar(),
+ tsClient: &fakeTSClient{},
+ clock: tstest.NewClock(tstest.ClockOpts{}),
+ authKeyRateLimits: make(map[string]*rate.Limiter),
}
existingServices := []string{"svc1", "svc2"}
@@ -1654,6 +1663,189 @@ func TestValidateProxyGroup(t *testing.T) {
}
}
+func TestProxyGroupGetAuthKey(t *testing.T) {
+ pg := &tsapi.ProxyGroup{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "test",
+ Finalizers: []string{"tailscale.com/finalizer"},
+ },
+ Spec: tsapi.ProxyGroupSpec{
+ Type: tsapi.ProxyGroupTypeEgress,
+ Replicas: ptr.To[int32](1),
+ },
+ }
+ tsClient := &fakeTSClient{}
+
+ // Variables to reference in test cases.
+ existingAuthKey := ptr.To("existing-auth-key")
+ newAuthKey := ptr.To("new-authkey")
+ configWith := func(authKey *string) map[string][]byte {
+ value := []byte("{}")
+ if authKey != nil {
+ value = fmt.Appendf(nil, `{"AuthKey": "%s"}`, *authKey)
+ }
+ return map[string][]byte{
+ tsoperator.TailscaledConfigFileName(pgMinCapabilityVersion): value,
+ }
+ }
+
+ initTest := func() (*ProxyGroupReconciler, client.WithWatch) {
+ fc := fake.NewClientBuilder().
+ WithScheme(tsapi.GlobalScheme).
+ WithObjects(pg).
+ WithStatusSubresource(pg).
+ Build()
+ zl, _ := zap.NewDevelopment()
+ fr := record.NewFakeRecorder(1)
+ cl := tstest.NewClock(tstest.ClockOpts{})
+ reconciler := &ProxyGroupReconciler{
+ tsNamespace: tsNamespace,
+ tsProxyImage: testProxyImage,
+ defaultTags: []string{"tag:test-tag"},
+ tsFirewallMode: "auto",
+
+ Client: fc,
+ tsClient: tsClient,
+ recorder: fr,
+ log: zl.Sugar(),
+ clock: cl,
+ authKeyRateLimits: make(map[string]*rate.Limiter),
+ }
+ reconciler.ensureStateAddedForProxyGroup(pg)
+
+ return reconciler, fc
+ }
+
+ // Config Secret: exists or not, has key or not.
+ // State Secret: has device ID or not, requested reissue or not.
+ for name, tc := range map[string]struct {
+ configData map[string][]byte
+ stateData map[string][]byte
+ expectedAuthKey *string
+ expectReissue bool
+ }{
+ "no_secrets_needs_new": {
+ expectedAuthKey: newAuthKey, // New ProxyGroup or manually cleared Pod.
+ },
+ "no_config_secret_state_authed_ok": {
+ stateData: map[string][]byte{
+ kubetypes.KeyDeviceID: []byte("nodeid-0"),
+ },
+ expectedAuthKey: newAuthKey, // Always create an auth key if we're creating the config Secret.
+ },
+ "config_secret_without_key_state_authed_with_reissue_needs_new": {
+ configData: configWith(nil),
+ stateData: map[string][]byte{
+ kubetypes.KeyDeviceID: []byte("nodeid-0"),
+ kubetypes.KeyReissueAuthkey: []byte(""),
+ },
+ expectedAuthKey: newAuthKey,
+ expectReissue: true, // Device is authed but reissue was requested.
+ },
+ "config_secret_with_key_state_with_reissue_stale_ok": {
+ configData: configWith(existingAuthKey),
+ stateData: map[string][]byte{
+ kubetypes.KeyReissueAuthkey: []byte("some-older-authkey"),
+ },
+ expectedAuthKey: existingAuthKey, // Config's auth key is different from the one marked for reissue.
+ },
+ "config_secret_with_key_state_with_reissue_existing_key_needs_new": {
+ configData: configWith(existingAuthKey),
+ stateData: map[string][]byte{
+ kubetypes.KeyDeviceID: []byte("nodeid-0"),
+ kubetypes.KeyReissueAuthkey: []byte(*existingAuthKey),
+ },
+ expectedAuthKey: newAuthKey,
+ expectReissue: true, // Current config's auth key is marked for reissue.
+ },
+ "config_secret_without_key_no_state_ok": {
+ configData: configWith(nil),
+ expectedAuthKey: nil, // Proxy will set reissue_authkey and then next reconcile will reissue.
+ },
+ "config_secret_without_key_state_authed_ok": {
+ configData: configWith(nil),
+ stateData: map[string][]byte{
+ kubetypes.KeyDeviceID: []byte("nodeid-0"),
+ },
+ expectedAuthKey: nil, // Device is already authed.
+ },
+ "config_secret_with_key_state_authed_ok": {
+ configData: configWith(existingAuthKey),
+ stateData: map[string][]byte{
+ kubetypes.KeyDeviceID: []byte("nodeid-0"),
+ },
+ expectedAuthKey: nil, // Auth key getting removed because device is authed.
+ },
+ "config_secret_with_key_no_state_keeps_existing": {
+ configData: configWith(existingAuthKey),
+ expectedAuthKey: existingAuthKey, // No state, waiting for containerboot to try the auth key.
+ },
+ } {
+ t.Run(name, func(t *testing.T) {
+ tsClient.deleted = tsClient.deleted[:0] // Reset deleted devices for each test case.
+ reconciler, fc := initTest()
+ var cfgSecret *corev1.Secret
+ if tc.configData != nil {
+ cfgSecret = &corev1.Secret{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: pgConfigSecretName(pg.Name, 0),
+ Namespace: tsNamespace,
+ },
+ Data: tc.configData,
+ }
+ }
+ if tc.stateData != nil {
+ mustCreate(t, fc, &corev1.Secret{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: pgStateSecretName(pg.Name, 0),
+ Namespace: tsNamespace,
+ },
+ Data: tc.stateData,
+ })
+ }
+
+ authKey, err := reconciler.getAuthKey(t.Context(), tsClient, pg, cfgSecret, 0, reconciler.log.With("TestName", t.Name()))
+ if err != nil {
+ t.Fatalf("unexpected error getting auth key: %v", err)
+ }
+ if !reflect.DeepEqual(authKey, tc.expectedAuthKey) {
+ deref := func(s *string) string {
+ if s == nil {
+ return "<nil>"
+ }
+ return *s
+ }
+ t.Errorf("expected auth key %v, got %v", deref(tc.expectedAuthKey), deref(authKey))
+ }
+
+ // Use the device deletion as a proxy for the fact the new auth key
+ // was due to a reissue.
+ switch {
+ case tc.expectReissue && len(tsClient.deleted) != 1:
+ t.Errorf("expected 1 deleted device, got %v", tsClient.deleted)
+ case !tc.expectReissue && len(tsClient.deleted) != 0:
+ t.Errorf("expected no deleted devices, got %v", tsClient.deleted)
+ }
+
+ if tc.expectReissue {
+ // Trigger the rate limit in a tight loop. Up to 100 iterations
+ // to allow for CI that is extremely slow, but should happen on
+ // first try for any reasonable machine.
+ for range 100 {
+ _, err := reconciler.getAuthKey(context.Background(), tsClient, pg, cfgSecret, 0, reconciler.log.With("TestName", t.Name()))
+ if err != nil {
+ if !strings.Contains(err.Error(), "rate limit exceeded") {
+ t.Fatalf("unexpected error getting auth key: %v", err)
+ }
+ return // Expected rate limit error.
+ }
+ }
+ t.Fatal("expected rate limit error, but got none")
+ }
+ })
+ }
+}
+
func proxyClassesForLEStagingTest() (*tsapi.ProxyClass, *tsapi.ProxyClass, *tsapi.ProxyClass) {
pcLEStaging := &tsapi.ProxyClass{
ObjectMeta: metav1.ObjectMeta{
@@ -1904,6 +2096,7 @@ func TestProxyGroupLetsEncryptStaging(t *testing.T) {
tsClient: &fakeTSClient{},
log: zl.Sugar(),
clock: cl,
+ authKeyRateLimits: make(map[string]*rate.Limiter),
}
expectReconciled(t, reconciler, "", pg.Name)
diff --git a/cmd/k8s-operator/sts.go b/cmd/k8s-operator/sts.go
index e81fe2d66..3d524b111 100644
--- a/cmd/k8s-operator/sts.go
+++ b/cmd/k8s-operator/sts.go
@@ -1094,7 +1094,7 @@ func tailscaledConfig(stsC *tailscaleSTSConfig, newAuthkey string, oldSecret *co
if newAuthkey != "" {
conf.AuthKey = &newAuthkey
- } else if shouldRetainAuthKey(oldSecret) {
+ } else if !deviceAuthed(oldSecret) {
key, err := authKeyFromSecret(oldSecret)
if err != nil {
return nil, fmt.Errorf("error retrieving auth key from Secret: %w", err)
@@ -1143,6 +1143,8 @@ func latestConfigFromSecret(s *corev1.Secret) (*ipn.ConfigVAlpha, error) {
return conf, nil
}
+// authKeyFromSecret returns the auth key from the latest config version if
+// found, or else nil.
func authKeyFromSecret(s *corev1.Secret) (key *string, err error) {
conf, err := latestConfigFromSecret(s)
if err != nil {
@@ -1159,13 +1161,13 @@ func authKeyFromSecret(s *corev1.Secret) (key *string, err error) {
return key, nil
}
-// shouldRetainAuthKey returns true if the state stored in a proxy's state Secret suggests that auth key should be
-// retained (because the proxy has not yet successfully authenticated).
-func shouldRetainAuthKey(s *corev1.Secret) bool {
+// deviceAuthed returns true if the state stored in a proxy's state Secret
+// suggests that the proxy has successfully authenticated.
+func deviceAuthed(s *corev1.Secret) bool {
if s == nil {
- return false // nothing to retain here
+ return false // No state Secret means no device state.
}
- return len(s.Data["device_id"]) == 0 // proxy has not authed yet
+ return len(s.Data["device_id"]) > 0
}
func shouldAcceptRoutes(pc *tsapi.ProxyClass) bool {
diff --git a/cmd/k8s-operator/testutils_test.go b/cmd/k8s-operator/testutils_test.go
index 0e4a3eee4..3b2d46599 100644
--- a/cmd/k8s-operator/testutils_test.go
+++ b/cmd/k8s-operator/testutils_test.go
@@ -527,7 +527,7 @@ func expectedSecret(t *testing.T, cl client.Client, opts configOpts) *corev1.Sec
AcceptDNS: "false",
Hostname: &opts.hostname,
Locked: "false",
- AuthKey: ptr.To("secret-authkey"),
+ AuthKey: ptr.To("new-authkey"),
AcceptRoutes: "false",
AppConnector: &ipn.AppConnectorPrefs{Advertise: false},
NoStatefulFiltering: "true",
@@ -860,7 +860,7 @@ func (c *fakeTSClient) CreateKey(ctx context.Context, caps tailscale.KeyCapabili
Created: time.Now(),
Capabilities: caps,
}
- return "secret-authkey", k, nil
+ return "new-authkey", k, nil
}
func (c *fakeTSClient) Device(ctx context.Context, deviceID string, fields *tailscale.DeviceFieldsOpts) (*tailscale.Device, error) {
diff --git a/kube/kubetypes/types.go b/kube/kubetypes/types.go
index 187f54f34..9f1b29064 100644
--- a/kube/kubetypes/types.go
+++ b/kube/kubetypes/types.go
@@ -38,17 +38,17 @@ const (
// Keys that containerboot writes to state file that can be used to determine its state.
// fields set in Tailscale state Secret. These are mostly used by the Tailscale Kubernetes operator to determine
// the state of this tailscale device.
- KeyDeviceID string = "device_id" // node stable ID of the device
- KeyDeviceFQDN string = "device_fqdn" // device's tailnet hostname
- KeyDeviceIPs string = "device_ips" // device's tailnet IPs
- KeyPodUID string = "pod_uid" // Pod UID
- // KeyCapVer contains Tailscale capability version of this proxy instance.
- KeyCapVer string = "tailscale_capver"
+ KeyDeviceID = "device_id" // node stable ID of the device
+ KeyDeviceFQDN = "device_fqdn" // device's tailnet hostname
+ KeyDeviceIPs = "device_ips" // device's tailnet IPs
+ KeyPodUID = "pod_uid" // Pod UID
+ KeyCapVer = "tailscale_capver" // tailcfg.CurrentCapabilityVersion of this proxy instance.
+ KeyReissueAuthkey = "reissue_authkey" // Proxies will set this to the authkey that failed, or "no-authkey", if they can't log in.
// KeyHTTPSEndpoint is a name of a field that can be set to the value of any HTTPS endpoint currently exposed by
// this device to the tailnet. This is used by the Kubernetes operator Ingress proxy to communicate to the operator
// that cluster workloads behind the Ingress can now be accessed via the given DNS name over HTTPS.
- KeyHTTPSEndpoint string = "https_endpoint"
- ValueNoHTTPS string = "no-https"
+ KeyHTTPSEndpoint = "https_endpoint"
+ ValueNoHTTPS = "no-https"
// Pod's IPv4 address header key as returned by containerboot health check endpoint.
PodIPv4Header string = "Pod-IPv4"