summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorSimon Law <sfllaw@tailscale.com>2025-09-29 19:16:54 -0700
committerSimon Law <sfllaw@tailscale.com>2025-10-06 15:42:26 -0700
commitc386bc9e9cca9563e7e63f159c506fc7dc098d96 (patch)
tree7bc57e3877896ef6e30a9a4620a345b28afdb7f0
parent5555bbcd222f6f0e757d458f4251d87450d67b41 (diff)
downloadtailscale-sfllaw/tailscale-ping-for-client-side-reachability.tar.xz
tailscale-sfllaw/tailscale-ping-for-client-side-reachability.zip
ipn/ipnlocal: determine client-side-reachability by pingingsfllaw/tailscale-ping-for-client-side-reachability
PR #17367 introduced the `client-side-reachability` node attribute that switches the node to completely ignore the online signal from control. This was to work around the times when the control plane determines that a node is not online, while the node is still able to connect to its peers. However, this breaks high-availability failover, since nodes that are actually offline are never treated that way. This PR makes nodeBackend.PeerIsReachable actually test reachability by disco-pinging the peer node. In the future, the client itself should collect reachability data from active Wireguard flows. Updates #17366 Updates tailscale/corp#30379 Updates tailscale/corp#32686 Updates tailscale/corp#33033 Signed-off-by: Simon Law <sfllaw@tailscale.com>
-rw-r--r--ipn/ipnlocal/local.go6
-rw-r--r--ipn/ipnlocal/local_test.go4
-rw-r--r--ipn/ipnlocal/node_backend.go74
-rw-r--r--ipn/ipnlocal/node_backend_test.go185
-rw-r--r--ipn/ipnlocal/ping.go6
-rw-r--r--tailcfg/tailcfg.go3
6 files changed, 245 insertions, 33 deletions
diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go
index 9490537df..88e680f00 100644
--- a/ipn/ipnlocal/local.go
+++ b/ipn/ipnlocal/local.go
@@ -499,7 +499,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
needsCaptiveDetection: make(chan bool),
}
- nb := newNodeBackend(ctx, b.logf, b.sys.Bus.Get())
+ nb := newNodeBackend(ctx, b.logf, b.sys.Bus.Get(), b)
b.currentNodeAtomic.Store(nb)
nb.ready()
@@ -629,7 +629,7 @@ func (b *LocalBackend) currentNode() *nodeBackend {
if v := b.currentNodeAtomic.Load(); v != nil || !testenv.InTest() {
return v
}
- v := newNodeBackend(cmp.Or(b.ctx, context.Background()), b.logf, b.sys.Bus.Get())
+ v := newNodeBackend(cmp.Or(b.ctx, context.Background()), b.logf, b.sys.Bus.Get(), b)
if b.currentNodeAtomic.CompareAndSwap(nil, v) {
v.ready()
}
@@ -6750,7 +6750,7 @@ func (b *LocalBackend) resetForProfileChangeLockedOnEntry(unlock unlockOnce) err
// down, so no need to do any work.
return nil
}
- newNode := newNodeBackend(b.ctx, b.logf, b.sys.Bus.Get())
+ newNode := newNodeBackend(b.ctx, b.logf, b.sys.Bus.Get(), b)
if oldNode := b.currentNodeAtomic.Swap(newNode); oldNode != nil {
oldNode.shutdown(errNodeContextChanged)
}
diff --git a/ipn/ipnlocal/local_test.go b/ipn/ipnlocal/local_test.go
index a662793db..2650ff451 100644
--- a/ipn/ipnlocal/local_test.go
+++ b/ipn/ipnlocal/local_test.go
@@ -4920,7 +4920,7 @@ func TestSuggestExitNode(t *testing.T) {
allowList = set.SetOf(tt.allowPolicy)
}
- nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New())
+ nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New(), nil)
defer nb.shutdown(errShutdown)
nb.SetNetMap(tt.netMap)
@@ -5373,7 +5373,7 @@ func TestSuggestExitNodeTrafficSteering(t *testing.T) {
tt.netMap.AllCaps = set.SetOf(slices.Collect(caps))
}
- nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New())
+ nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New(), nil)
defer nb.shutdown(errShutdown)
nb.SetNetMap(tt.netMap)
diff --git a/ipn/ipnlocal/node_backend.go b/ipn/ipnlocal/node_backend.go
index 3408d4cbb..a677d9324 100644
--- a/ipn/ipnlocal/node_backend.go
+++ b/ipn/ipnlocal/node_backend.go
@@ -6,8 +6,10 @@ package ipnlocal
import (
"cmp"
"context"
+ "fmt"
"net/netip"
"slices"
+ "strings"
"sync"
"sync/atomic"
@@ -70,6 +72,8 @@ type nodeBackend struct {
ctx context.Context // canceled by [nodeBackend.shutdown]
ctxCancel context.CancelCauseFunc // cancels ctx
+ pinger Pinger
+
// filterAtomic is a stateful packet filter. Immutable once created, but can be
// replaced with a new one.
filterAtomic atomic.Pointer[filter.Filter]
@@ -106,12 +110,13 @@ type nodeBackend struct {
nodeByAddr map[netip.Addr]tailcfg.NodeID
}
-func newNodeBackend(ctx context.Context, logf logger.Logf, bus *eventbus.Bus) *nodeBackend {
+func newNodeBackend(ctx context.Context, logf logger.Logf, bus *eventbus.Bus, pinger Pinger) *nodeBackend {
ctx, ctxCancel := context.WithCancelCause(ctx)
nb := &nodeBackend{
logf: logf,
ctx: ctx,
ctxCancel: ctxCancel,
+ pinger: pinger,
eventClient: bus.Client("ipnlocal.nodeBackend"),
readyCh: make(chan struct{}),
}
@@ -381,19 +386,68 @@ func (nb *nodeBackend) PeerIsReachable(ctx context.Context, p tailcfg.NodeView)
// This node can always reach itself.
return true
}
- return nb.peerIsReachable(ctx, p)
+ res, err := nb.peerIsReachable(ctx, p)
+ if err != nil {
+ nb.logf("peer reachability: %s", err)
+ }
+ return res
}
-func (nb *nodeBackend) peerIsReachable(ctx context.Context, p tailcfg.NodeView) bool {
- // TODO(sfllaw): The following does not actually test for client-side
- // reachability. This would require a mechanism that tracks whether the
- // current node can actually reach this peer, either because they are
- // already communicating or because they can ping each other.
- //
- // Instead, it makes the client ignore p.Online completely.
+// peerIsReachable will only return a [context.DeadlineExceeded] error if ctx
+// was cancelled by its deadline passing, not if an active probe times out.
+func (nb *nodeBackend) peerIsReachable(ctx context.Context, p tailcfg.NodeView) (bool, error) {
+ // When the [Pinger] is missing, fall back on the control plane.
+ if nb.pinger == nil {
+ online := p.Online().Get()
+ nb.logf("peer reachable: missing pinger")
+ return online, nil
+ }
+
+ var addr netip.Addr
+ for _, a := range p.Addresses().All() {
+ if !a.IsSingleIP() {
+ continue
+ }
+ addr = a.Addr()
+ break
+ }
+ if !addr.IsValid() {
+ return false, fmt.Errorf("peer %s (%v) has no IP addresses: %s", p.Name(), p.ID(), p.Addresses())
+ }
+
+ // Wireguard-only nodes cannot be disco-pinged, so we trust the control
+ // plane.
//
+ // TODO(sfllaw): We could try to initiate a Wireguard session and see if
+ // a response comes back. ICMP ping is also an option, but there might
+ // be false negatives if ICMP is blocked.
+ if p.IsWireGuardOnly() {
+ return p.Online().Get(), nil
+ }
+
+ // Disco ping the peer node to determine if it is actually reachable.
// See tailscale/corp#32686.
- return true
+ //
+ // TODO(sfllaw): If there is already an active Wireguard session to the
+ // peer, then we can avoid active probes and return early.
+ res, err := nb.pinger.Ping(ctx, addr, tailcfg.PingDisco, 0)
+ if err != nil {
+ // Encountered a non-ping error, ping failures would be reported
+ // in res.Err. This is likely to happen when ctx is cancelled.
+ return false, fmt.Errorf("aborted ping for peer %s (%v) at %s: %w", p.Name(), p.ID(), addr, err)
+ }
+ if res.Err != "" {
+ if res.IsLocalIP {
+ // Nodes can always reach themselves.
+ return true, nil
+ }
+ if strings.Contains(res.Err, context.DeadlineExceeded.Error()) {
+ // Ping has timed out: this is not an error.
+ return false, nil
+ }
+ return false, fmt.Errorf("failed to ping peer %s (%v) at %s: %s", p.Name(), p.ID(), addr, err)
+ }
+ return true, nil
}
func nodeIP(n tailcfg.NodeView, pred func(netip.Addr) bool) netip.Addr {
diff --git a/ipn/ipnlocal/node_backend_test.go b/ipn/ipnlocal/node_backend_test.go
index f6698bd4b..bd41cd70a 100644
--- a/ipn/ipnlocal/node_backend_test.go
+++ b/ipn/ipnlocal/node_backend_test.go
@@ -6,9 +6,12 @@ package ipnlocal
import (
"context"
"errors"
+ "fmt"
+ "net/netip"
"testing"
"time"
+ "tailscale.com/ipn/ipnstate"
"tailscale.com/tailcfg"
"tailscale.com/tstest"
"tailscale.com/types/netmap"
@@ -17,7 +20,7 @@ import (
)
func TestNodeBackendReadiness(t *testing.T) {
- nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New())
+ nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New(), nil)
// The node backend is not ready until [nodeBackend.ready] is called,
// and [nodeBackend.Wait] should fail with [context.DeadlineExceeded].
@@ -48,7 +51,7 @@ func TestNodeBackendReadiness(t *testing.T) {
}
func TestNodeBackendShutdown(t *testing.T) {
- nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New())
+ nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New(), nil)
shutdownCause := errors.New("test shutdown")
@@ -86,7 +89,7 @@ func TestNodeBackendShutdown(t *testing.T) {
}
func TestNodeBackendReadyAfterShutdown(t *testing.T) {
- nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New())
+ nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New(), nil)
shutdownCause := errors.New("test shutdown")
nb.shutdown(shutdownCause)
@@ -98,7 +101,7 @@ func TestNodeBackendReadyAfterShutdown(t *testing.T) {
func TestNodeBackendParentContextCancellation(t *testing.T) {
ctx, cancelCtx := context.WithCancel(context.Background())
- nb := newNodeBackend(ctx, tstest.WhileTestRunningLogger(t), eventbus.New())
+ nb := newNodeBackend(ctx, tstest.WhileTestRunningLogger(t), eventbus.New(), nil)
cancelCtx()
@@ -115,7 +118,7 @@ func TestNodeBackendParentContextCancellation(t *testing.T) {
}
func TestNodeBackendConcurrentReadyAndShutdown(t *testing.T) {
- nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New())
+ nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New(), nil)
// Calling [nodeBackend.ready] and [nodeBackend.shutdown] concurrently
// should not cause issues, and [nodeBackend.Wait] should unblock,
@@ -127,6 +130,17 @@ func TestNodeBackendConcurrentReadyAndShutdown(t *testing.T) {
}
func TestNodeBackendReachability(t *testing.T) {
+ addrs := []netip.Prefix{netip.MustParsePrefix("100.64.0.1/32")}
+ defaults := func(n tailcfg.Node) tailcfg.Node {
+ if n.ID == 0 {
+ n.ID = 1234
+ }
+ if n.Name == "" {
+ n.Name = "exit-node.example.ts.net"
+ }
+ return n
+ }
+
for _, tc := range []struct {
name string
@@ -139,54 +153,191 @@ func TestNodeBackendReachability(t *testing.T) {
// peer node.
cap bool
+ // Peer defines the peer node.
peer tailcfg.Node
+
+ // Ping sets how the peer node responds to pings:
+ // pingTimedOut: peer is unreachable
+ // pingSuccess: peer responds to pings
+ // pingLocalhost: peer is the same as the self node
+ ping mockPinger
+
want bool
}{
{
+ name: "disabled/nil",
+ cap: false,
+ peer: defaults(tailcfg.Node{
+ Online: nil,
+ }),
+ want: false,
+ },
+ {
name: "disabled/offline",
cap: false,
- peer: tailcfg.Node{
+ peer: defaults(tailcfg.Node{
Online: ptr.To(false),
- },
+ }),
want: false,
},
{
name: "disabled/online",
cap: false,
- peer: tailcfg.Node{
+ peer: defaults(tailcfg.Node{
Online: ptr.To(true),
- },
+ }),
want: true,
},
{
+ name: "enabled/no_ip",
+ cap: true,
+ ping: pingTimedOut,
+ peer: defaults(tailcfg.Node{
+ Online: ptr.To(false),
+ Addresses: nil,
+ }),
+ want: false,
+ },
+ {
name: "enabled/offline",
cap: true,
- peer: tailcfg.Node{
- Online: ptr.To(false),
- },
+ peer: defaults(tailcfg.Node{
+ Online: ptr.To(false),
+ Addresses: addrs,
+ }),
+ ping: pingTimedOut,
+ want: false,
+ },
+ {
+ name: "enabled/offline_but_pingable",
+ cap: true,
+ peer: defaults(tailcfg.Node{
+ Online: ptr.To(false),
+ Addresses: addrs,
+ }),
+ ping: pingSuccess,
want: true,
},
{
name: "enabled/online",
cap: true,
- peer: tailcfg.Node{
- Online: ptr.To(true),
- },
+ peer: defaults(tailcfg.Node{
+ Online: ptr.To(true),
+ Addresses: addrs,
+ }),
+ ping: pingSuccess,
want: true,
},
+ {
+ name: "enabled/online_but_unpingable",
+ cap: true,
+ peer: defaults(tailcfg.Node{
+ Online: ptr.To(true),
+ Addresses: addrs,
+ }),
+ ping: pingTimedOut,
+ want: false,
+ },
+ {
+ name: "enabled/offline_localhost",
+ cap: true,
+ peer: defaults(tailcfg.Node{
+ Online: ptr.To(false),
+ Addresses: addrs,
+ }),
+ ping: pingLocalhost,
+ want: true,
+ },
+ {
+ name: "enabled/online_localhost",
+ cap: true,
+ peer: defaults(tailcfg.Node{
+ Online: ptr.To(true),
+ Addresses: addrs,
+ }),
+ ping: pingLocalhost,
+ want: true,
+ },
+ {
+ name: "enabled/offline_but_cancelled",
+ cap: true,
+ peer: defaults(tailcfg.Node{
+ Online: ptr.To(false),
+ Addresses: addrs,
+ }),
+ ping: pingCancelled,
+ want: false,
+ },
+ {
+ name: "enabled/online_but_cancelled",
+ cap: true,
+ peer: defaults(tailcfg.Node{
+ Online: ptr.To(true),
+ Addresses: addrs,
+ }),
+ ping: pingCancelled,
+ want: false,
+ },
} {
+
t.Run(tc.name, func(t *testing.T) {
- nb := newNodeBackend(t.Context(), tstest.WhileTestRunningLogger(t), eventbus.New())
+ ctx := t.Context()
+
+ nb := newNodeBackend(ctx, tstest.WhileTestRunningLogger(t), eventbus.New(), mockPinger(tc.ping))
nb.netMap = &netmap.NetworkMap{}
if tc.cap {
nb.netMap.AllCaps.Make()
nb.netMap.AllCaps.Add(tailcfg.NodeAttrClientSideReachability)
}
- got := nb.PeerIsReachable(t.Context(), tc.peer.View())
+ if tc.ping == pingCancelled {
+ c, cancel := context.WithCancelCause(ctx)
+ ctx = c
+ cancel(fmt.Errorf("subtest: %q", tc.name))
+ }
+
+ got := nb.PeerIsReachable(ctx, tc.peer.View())
if got != tc.want {
t.Errorf("got %v, want %v", got, tc.want)
}
})
}
}
+
+type mockPinger int
+
+const (
+ pingTimedOut mockPinger = iota
+ pingSuccess
+ pingLocalhost
+ pingCancelled
+)
+
+func (p mockPinger) Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, size int) (*ipnstate.PingResult, error) {
+ select {
+ case <-ctx.Done():
+ return nil, ctx.Err()
+ default:
+ }
+
+ res := &ipnstate.PingResult{
+ IP: ip.String(),
+ NodeIP: ip.String(),
+ }
+ switch p {
+ case pingTimedOut:
+ ctx, cancel := context.WithTimeout(ctx, 0)
+ defer cancel()
+ <-ctx.Done()
+ res.Err = ctx.Err().Error()
+ return res, nil
+ case pingLocalhost:
+ res.Err = fmt.Sprintf("%v is local Tailscale IP", ip)
+ res.IsLocalIP = true
+ case pingSuccess:
+ res.LatencySeconds = 1
+ default:
+ panic(fmt.Sprintf("unknown mockPinger %v", p))
+ }
+ return res, nil
+}
diff --git a/ipn/ipnlocal/ping.go b/ipn/ipnlocal/ping.go
index 4f6808a1e..d630379d4 100644
--- a/ipn/ipnlocal/ping.go
+++ b/ipn/ipnlocal/ping.go
@@ -17,6 +17,12 @@ import (
"tailscale.com/tailcfg"
)
+// Pinger is the [LocalBackend.Ping] method.
+type Pinger interface {
+ // Ping is a request to do a ping with the peer handling the given IP.
+ Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, size int) (*ipnstate.PingResult, error)
+}
+
func (b *LocalBackend) Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, size int) (*ipnstate.PingResult, error) {
if pingType == tailcfg.PingPeerAPI {
t0 := b.clock.Now()
diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go
index 7484c7466..afe23e4e3 100644
--- a/tailcfg/tailcfg.go
+++ b/tailcfg/tailcfg.go
@@ -175,7 +175,8 @@ type CapabilityVersion int
// - 127: 2025-09-19: can handle C2N /debug/netmap.
// - 128: 2025-10-02: can handle C2N /debug/health.
// - 129: 2025-10-04: Fixed sleep/wake deadlock in magicsock when using peer relay (PR #17449)
-const CurrentCapabilityVersion CapabilityVersion = 129
+// - 130: 2025-10-06: Client will determine whether it can reach an exit node when [NodeAttrClientSideReachability] is set.
+const CurrentCapabilityVersion CapabilityVersion = 130
// ID is an integer ID for a user, node, or login allocated by the
// control plane.