summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--wgengine/magicsock/debughttp.go4
-rw-r--r--wgengine/magicsock/endpoint.go4
-rw-r--r--wgengine/magicsock/magicsock.go466
-rw-r--r--wgengine/magicsock/magicsock_test.go11
-rw-r--r--wgengine/magicsock/relaymanager.go66
5 files changed, 359 insertions, 192 deletions
diff --git a/wgengine/magicsock/debughttp.go b/wgengine/magicsock/debughttp.go
index 68019d0a7..a9f4734f9 100644
--- a/wgengine/magicsock/debughttp.go
+++ b/wgengine/magicsock/debughttp.go
@@ -108,8 +108,8 @@ func (c *Conn) ServeHTTPDebug(w http.ResponseWriter, r *http.Request) {
}
sort.Slice(ent, func(i, j int) bool { return ent[i].pub.Less(ent[j].pub) })
- peers := map[key.NodePublic]tailcfg.NodeView{}
- for _, p := range c.peers.All() {
+ peers := make(map[key.NodePublic]tailcfg.NodeView, len(c.peersByID))
+ for _, p := range c.peersByID {
peers[p.Key()] = p
}
diff --git a/wgengine/magicsock/endpoint.go b/wgengine/magicsock/endpoint.go
index 31dcab67b..510d0d315 100644
--- a/wgengine/magicsock/endpoint.go
+++ b/wgengine/magicsock/endpoint.go
@@ -897,7 +897,7 @@ func (de *endpoint) wantUDPRelayPathDiscoveryLocked(now mono.Time) bool {
if runtime.GOOS == "js" {
return false
}
- if !de.c.hasPeerRelayServers.Load() {
+ if !de.c.relayManager.hasPeerRelayServers.Load() {
// Changes in this value between its access and a call to
// [endpoint.discoverUDPRelayPathsLocked] are fine, we will eventually
// do the "right" thing during future path discovery. The worst case is
@@ -2093,7 +2093,7 @@ func (de *endpoint) setDERPHome(regionID uint16) {
de.mu.Lock()
defer de.mu.Unlock()
de.derpAddr = netip.AddrPortFrom(tailcfg.DerpMagicIPAddr, uint16(regionID))
- if de.c.hasPeerRelayServers.Load() {
+ if de.c.relayManager.hasPeerRelayServers.Load() {
de.c.relayManager.handleDERPHomeChange(de.publicKey, regionID)
}
}
diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go
index f13e31554..17c32a875 100644
--- a/wgengine/magicsock/magicsock.go
+++ b/wgengine/magicsock/magicsock.go
@@ -269,12 +269,6 @@ type Conn struct {
// captureHook, if non-nil, is the pcap logging callback when capturing.
captureHook syncs.AtomicValue[packet.CaptureCallback]
- // hasPeerRelayServers is whether [relayManager] is configured with at least
- // one peer relay server via [relayManager.handleRelayServersSet]. It exists
- // to suppress calls into [relayManager] leading to wasted work involving
- // channel operations and goroutine creation.
- hasPeerRelayServers atomic.Bool
-
// discoAtomic is the current disco private and public keypair for this conn.
discoAtomic discoAtomic
@@ -361,18 +355,18 @@ type Conn struct {
// magicsock could do with any complexity reduction it can get.
netInfoLast *tailcfg.NetInfo
- derpMap *tailcfg.DERPMap // nil (or zero regions/nodes) means DERP is disabled
- self tailcfg.NodeView // from last SetNetworkMap
- peers views.Slice[tailcfg.NodeView] // from last SetNetworkMap, sorted by Node.ID; Note: [netmap.NodeMutation]'s rx'd in UpdateNetmapDelta are never applied
- filt *filter.Filter // from last SetFilter
- relayClientEnabled bool // whether we can allocate UDP relay endpoints on UDP relay servers or receive CallMeMaybeVia messages from peers
- lastFlags debugFlags // at time of last SetNetworkMap
- privateKey key.NodePrivate // WireGuard private key for this node
- everHadKey bool // whether we ever had a non-zero private key
- myDerp int // nearest DERP region ID; 0 means none/unknown
- homeless bool // if true, don't try to find & stay conneted to a DERP home (myDerp will stay 0)
- derpStarted chan struct{} // closed on first connection to DERP; for tests & cleaner Close
- activeDerp map[int]activeDerp // DERP regionID -> connection to a node in that region
+ derpMap *tailcfg.DERPMap // nil (or zero regions/nodes) means DERP is disabled
+ self tailcfg.NodeView // from last SetNetworkMap
+ peersByID map[tailcfg.NodeID]tailcfg.NodeView // current peer set, keyed by NodeID. Maintained by SetNetworkMap/UpsertPeer/RemovePeer. Note: per-field NodeMutation patches received in UpdateNetmapDelta are never applied to these snapshots.
+ filt *filter.Filter // from last SetFilter
+ relayClientEnabled bool // whether we can allocate UDP relay endpoints on UDP relay servers or receive CallMeMaybeVia messages from peers
+ lastFlags debugFlags // at time of last SetNetworkMap
+ privateKey key.NodePrivate // WireGuard private key for this node
+ everHadKey bool // whether we ever had a non-zero private key
+ myDerp int // nearest DERP region ID; 0 means none/unknown
+ homeless bool // if true, don't try to find & stay conneted to a DERP home (myDerp will stay 0)
+ derpStarted chan struct{} // closed on first connection to DERP; for tests & cleaner Close
+ activeDerp map[int]activeDerp // DERP regionID -> connection to a node in that region
prevDerp map[int]*syncs.WaitGroupChan
// derpRoute contains optional alternate routes to use as an
@@ -2430,24 +2424,12 @@ func (c *Conn) handleDiscoMessage(msg []byte, src epAddr, shouldBeRelayHandshake
if c.filt == nil {
return
}
- // Binary search of peers is O(log n) while c.mu is held.
- // TODO: We might be able to use ep.nodeAddr instead of all addresses,
- // or we might be able to release c.mu before doing this work. Keep it
- // simple and slow for now. c.peers.AsSlice is a copy. We may need to
- // write our own binary search for a [views.Slice].
- peerI, ok := slices.BinarySearchFunc(c.peers.AsSlice(), ep.nodeID, func(peer tailcfg.NodeView, target tailcfg.NodeID) int {
- if peer.ID() < target {
- return -1
- } else if peer.ID() > target {
- return 1
- }
- return 0
- })
+ peer, ok := c.peersByID[ep.nodeID]
if !ok {
// unexpected
return
}
- if !nodeHasCap(c.filt, c.peers.At(peerI), c.self, tailcfg.PeerCapabilityRelay) {
+ if !nodeHasCap(c.filt, peer, c.self, tailcfg.PeerCapabilityRelay) {
return
}
// [Conn.mu] must not be held while publishing, or [Conn.onUDPRelayAllocResp]
@@ -2784,18 +2766,6 @@ func (c *Conn) UpdatePeers(newPeers set.Set[key.NodePublic]) {
}
}
-func nodesEqual(x, y views.Slice[tailcfg.NodeView]) bool {
- if x.Len() != y.Len() {
- return false
- }
- for i := range x.Len() {
- if !x.At(i).Equal(y.At(i)) {
- return false
- }
- }
- return true
-}
-
// debugRingBufferSize returns a maximum size for our set of endpoint ring
// buffers by assuming that a single large update is ~500 bytes, and that we
// want to not use more than 1MiB of memory on phones / 4MiB on other devices.
@@ -2883,7 +2853,7 @@ func (c *Conn) SetFilter(f *filter.Filter) {
c.mu.Lock()
c.filt = f
self := c.self
- peers := c.peers
+ peers := c.peerSnapshotLocked()
relayClientEnabled := c.relayClientEnabled
c.mu.Unlock() // release c.mu before potentially calling c.updateRelayServersSet which is O(m * n)
@@ -2897,11 +2867,26 @@ func (c *Conn) SetFilter(f *filter.Filter) {
c.updateRelayServersSet(f, self, peers)
}
+// peerSnapshotLocked returns a freshly-allocated slice of the current peers.
+// It's used by callers that need to pass peer state to an O(m * n) callee
+// (like [Conn.updateRelayServersSet]) after releasing c.mu. c.mu must be held.
+func (c *Conn) peerSnapshotLocked() []tailcfg.NodeView {
+ if len(c.peersByID) == 0 {
+ return nil
+ }
+ out := make([]tailcfg.NodeView, 0, len(c.peersByID))
+ for _, p := range c.peersByID {
+ out = append(out, p)
+ }
+ return out
+}
+
// updateRelayServersSet iterates all peers and self, evaluating filt for each
// one in order to determine which are relay server candidates. filt, self, and
// peers are passed as args (vs c.mu-guarded fields) to enable callers to
// release c.mu before calling as this is O(m * n) (we iterate all cap rules 'm'
-// in filt for every peer 'n').
+// in filt for every peer 'n'). peers must be a snapshot owned by the caller;
+// this function does not retain it after return.
//
// Calls to updateRelayServersSet must never run concurrent to
// [endpoint.setDERPHome], otherwise [candidatePeerRelay] DERP home changes may
@@ -2913,9 +2898,9 @@ func (c *Conn) SetFilter(f *filter.Filter) {
// them.
// 2. Moving this work upstream into [nodeBackend] or similar, and publishing
// the computed result over the eventbus instead.
-func (c *Conn) updateRelayServersSet(filt *filter.Filter, self tailcfg.NodeView, peers views.Slice[tailcfg.NodeView]) {
+func (c *Conn) updateRelayServersSet(filt *filter.Filter, self tailcfg.NodeView, peers []tailcfg.NodeView) {
relayServers := make(set.Set[candidatePeerRelay])
- nodes := append(peers.AsSlice(), self)
+ nodes := append(peers, self)
for _, maybeCandidate := range nodes {
if maybeCandidate.ID() != self.ID() && !capVerIsRelayCapable(maybeCandidate.Cap()) {
// If maybeCandidate's [tailcfg.CapabilityVersion] is not relay-capable,
@@ -2933,12 +2918,9 @@ func (c *Conn) updateRelayServersSet(filt *filter.Filter, self tailcfg.NodeView,
derpHomeRegionID: uint16(maybeCandidate.HomeDERP()),
})
}
+ // [relayManager]'s run loop updates [relayManager.hasPeerRelayServers]
+ // to reflect the new server count.
c.relayManager.handleRelayServersSet(relayServers)
- if len(relayServers) > 0 {
- c.hasPeerRelayServers.Store(true)
- } else {
- c.hasPeerRelayServers.Store(false)
- }
}
// nodeHasCap returns true if src has cap on dst, otherwise it returns false.
@@ -2990,6 +2972,12 @@ func (c *candidatePeerRelay) isValid() bool {
// magicsock has the current state before subsequent operations proceed.
//
// self may be invalid if there's no network map.
+//
+// SetNetworkMap takes the full peer list and walks all of it. For incremental
+// updates where only a single peer changes, prefer the O(1) [Conn.UpsertPeer]
+// and [Conn.RemovePeer] methods. SetNetworkMap remains the right call for the
+// initial netmap and for changes to self or to global state (filter, DERP,
+// etc.) that aren't covered by the per-peer methods.
func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) {
peersChanged := c.updateNodes(self, peers)
@@ -3002,7 +2990,7 @@ func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) {
c.relayClientEnabled = relayClientEnabled
filt := c.filt
selfView := c.self
- peersView := c.peers
+ peersSnap := c.peerSnapshotLocked()
isClosed := c.closed
c.mu.Unlock() // release c.mu before potentially calling c.updateRelayServersSet which is O(m * n)
@@ -3012,16 +3000,16 @@ func (c *Conn) SetNetworkMap(self tailcfg.NodeView, peers []tailcfg.NodeView) {
if peersChanged || relayClientChanged {
if !relayClientEnabled {
+ // [relayManager]'s run loop updates [relayManager.hasPeerRelayServers].
c.relayManager.handleRelayServersSet(nil)
- c.hasPeerRelayServers.Store(false)
} else {
- c.updateRelayServersSet(filt, selfView, peersView)
+ c.updateRelayServersSet(filt, selfView, peersSnap)
}
}
}
// updateNodes updates [Conn] to reflect the given self node and peers.
-// It reports whether the peers were changed from before.
+// It reports whether the peer set (membership or any field) changed.
func (c *Conn) updateNodes(self tailcfg.NodeView, peers []tailcfg.NodeView) (peersChanged bool) {
c.mu.Lock()
defer c.mu.Unlock()
@@ -3030,13 +3018,9 @@ func (c *Conn) updateNodes(self tailcfg.NodeView, peers []tailcfg.NodeView) (pee
return false
}
- priorPeers := c.peers
metricNumPeers.Set(int64(len(peers)))
- // Update c.self & c.peers regardless, before the following early return.
c.self = self
- curPeers := views.SliceOf(peers)
- c.peers = curPeers
// [debugFlags] are mutable in [Conn.SetSilentDisco] &
// [Conn.SetProbeUDPLifetime]. These setters are passed [controlknobs.Knobs]
@@ -3049,154 +3033,272 @@ func (c *Conn) updateNodes(self tailcfg.NodeView, peers []tailcfg.NodeView) (pee
// TODO: mutate [debugFlags] here instead of in various [Conn] setters.
flags := c.debugFlagsLocked()
- peersChanged = !nodesEqual(priorPeers, curPeers)
- if !peersChanged && c.lastFlags == flags {
- // The rest of this function is all adjusting state for peers that have
- // changed. But if the set of peers is equal and the debug flags (for
- // silent disco and probe UDP lifetime) haven't changed, there is no
- // need to do anything else.
- return
+ // Fast path: if the peer set and every peer's NodeView are unchanged,
+ // and flags are unchanged, skip all further work.
+ if c.lastFlags == flags && len(peers) == len(c.peersByID) {
+ allSame := true
+ for _, n := range peers {
+ if prev, ok := c.peersByID[n.ID()]; !ok || !prev.Equal(n) {
+ allSame = false
+ break
+ }
+ }
+ if allSame {
+ return false
+ }
}
c.lastFlags = flags
-
c.logf("[v1] magicsock: got updated network map; %d peers", len(peers))
entriesPerBuffer := debugRingBufferSize(len(peers))
- // Try a pass of just upserting nodes and creating missing
- // endpoints. If the set of nodes is the same, this is an
- // efficient alloc-free update. If the set of nodes is different,
- // we'll fall through to the next pass, which allocates but can
- // handle full set updates.
+ // Build the new peer map while upserting each peer.
+ newPeers := make(map[tailcfg.NodeID]tailcfg.NodeView, len(peers))
for _, n := range peers {
- if n.ID() == 0 {
- devPanicf("node with zero ID")
- continue
+ newPeers[n.ID()] = n
+ c.upsertPeerLocked(n, flags, entriesPerBuffer)
+ }
+ if len(newPeers) != len(peers) {
+ // Duplicate NodeIDs in the input shouldn't happen, but log if so.
+ c.logf("[unexpected] magicsock.updateNodes: %d peers input but %d unique IDs", len(peers), len(newPeers))
+ }
+ c.peersByID = newPeers
+
+ // If the upsert pass left stale endpoints in peerMap (peers removed
+ // relative to before), clean them up.
+ if c.peerMap.nodeCount() != len(newPeers) {
+ keep := set.Set[key.NodePublic]{}
+ for _, n := range newPeers {
+ keep.Add(n.Key())
}
- if n.Key().IsZero() {
- devPanicf("node with zero key")
- continue
+ c.peerMap.forEachEndpoint(func(ep *endpoint) {
+ if !keep.Contains(ep.publicKey) {
+ c.peerMap.deleteEndpoint(ep)
+ }
+ })
+ }
+
+ // discokeys might have changed above. Discard unused info.
+ for dk := range c.discoInfo {
+ if !c.peerMap.knownPeerDiscoKey(dk) {
+ delete(c.discoInfo, dk)
}
- ep, ok := c.peerMap.endpointForNodeID(n.ID())
- if ok && ep.publicKey != n.Key() {
- // The node rotated public keys. Delete the old endpoint and create
- // it anew.
+ }
+
+ return true
+}
+
+// upsertPeerLocked upserts a single peer's endpoint in c.peerMap. It is the
+// per-peer body shared by [Conn.SetNetworkMap]'s upsert pass and by the
+// efficient per-peer [Conn.UpsertPeer] path.
+//
+// c.mu must be held.
+func (c *Conn) upsertPeerLocked(n tailcfg.NodeView, flags debugFlags, entriesPerBuffer int) {
+ if n.ID() == 0 {
+ devPanicf("node with zero ID")
+ return
+ }
+ if n.Key().IsZero() {
+ devPanicf("node with zero key")
+ return
+ }
+ ep, ok := c.peerMap.endpointForNodeID(n.ID())
+ if ok && ep.publicKey != n.Key() {
+ // The node rotated public keys. Delete the old endpoint and create
+ // it anew.
+ c.peerMap.deleteEndpoint(ep)
+ ok = false
+ }
+ if ok {
+ // At this point we're modifying an existing endpoint (ep) whose
+ // public key and nodeID match n. Its other fields (such as disco
+ // key or endpoints) might've changed.
+
+ if n.DiscoKey().IsZero() && !n.IsWireGuardOnly() {
+ // Discokey transitioned from non-zero to zero? This should not
+ // happen in the wild, however it could mean:
+ // 1. A node was downgraded from post 0.100 to pre 0.100.
+ // 2. A Tailscale node key was extracted and used on a
+ // non-Tailscale node (should not enter here due to the
+ // IsWireGuardOnly check)
+ // 3. The server is misbehaving.
c.peerMap.deleteEndpoint(ep)
- ok = false
+ return
}
- if ok {
- // At this point we're modifying an existing endpoint (ep) whose
- // public key and nodeID match n. Its other fields (such as disco
- // key or endpoints) might've changed.
-
- if n.DiscoKey().IsZero() && !n.IsWireGuardOnly() {
- // Discokey transitioned from non-zero to zero? This should not
- // happen in the wild, however it could mean:
- // 1. A node was downgraded from post 0.100 to pre 0.100.
- // 2. A Tailscale node key was extracted and used on a
- // non-Tailscale node (should not enter here due to the
- // IsWireGuardOnly check)
- // 3. The server is misbehaving.
- c.peerMap.deleteEndpoint(ep)
- continue
- }
- var oldDiscoKey key.DiscoPublic
- if epDisco := ep.disco.Load(); epDisco != nil {
- oldDiscoKey = epDisco.key
- }
- ep.updateFromNode(n, flags.heartbeatDisabled, flags.probeUDPLifetimeOn)
- c.peerMap.upsertEndpoint(ep, oldDiscoKey) // maybe update discokey mappings in peerMap
- continue
+ var oldDiscoKey key.DiscoPublic
+ if epDisco := ep.disco.Load(); epDisco != nil {
+ oldDiscoKey = epDisco.key
}
+ ep.updateFromNode(n, flags.heartbeatDisabled, flags.probeUDPLifetimeOn)
+ c.peerMap.upsertEndpoint(ep, oldDiscoKey) // maybe update discokey mappings in peerMap
+ return
+ }
- if ep, ok := c.peerMap.endpointForNodeKey(n.Key()); ok {
- // At this point n.Key() should be for a key we've never seen before. If
- // ok was true above, it was an update to an existing matching key and
- // we don't get this far. If ok was false above, that means it's a key
- // that differs from the one the NodeID had. But double check.
- if ep.nodeID != n.ID() {
- // Server error. This is known to be a particular issue for Mullvad
- // nodes (http://go/corp/27300), so log a distinct error for the
- // Mullvad and non-Mullvad cases. The error will be logged either way,
- // so an approximate heuristic is fine.
- //
- // When #27300 is fixed, we can delete this branch and log the same
- // panic for any public key moving.
- if strings.HasSuffix(n.Name(), ".mullvad.ts.net.") {
- devPanicf("public key moved between Mullvad nodeIDs (old=%v new=%v, key=%s); see http://go/corp/27300", ep.nodeID, n.ID(), n.Key().String())
- } else {
- devPanicf("public key moved between nodeIDs (old=%v new=%v, key=%s)", ep.nodeID, n.ID(), n.Key().String())
- }
+ if ep, ok := c.peerMap.endpointForNodeKey(n.Key()); ok {
+ // At this point n.Key() should be for a key we've never seen before. If
+ // ok was true above, it was an update to an existing matching key and
+ // we don't get this far. If ok was false above, that means it's a key
+ // that differs from the one the NodeID had. But double check.
+ if ep.nodeID != n.ID() {
+ // Server error. This is known to be a particular issue for Mullvad
+ // nodes (http://go/corp/27300), so log a distinct error for the
+ // Mullvad and non-Mullvad cases. The error will be logged either way,
+ // so an approximate heuristic is fine.
+ //
+ // When #27300 is fixed, we can delete this branch and log the same
+ // panic for any public key moving.
+ if strings.HasSuffix(n.Name(), ".mullvad.ts.net.") {
+ devPanicf("public key moved between Mullvad nodeIDs (old=%v new=%v, key=%s); see http://go/corp/27300", ep.nodeID, n.ID(), n.Key().String())
} else {
- // Internal data structures out of sync.
- devPanicf("public key found in peerMap but not by nodeID")
+ devPanicf("public key moved between nodeIDs (old=%v new=%v, key=%s)", ep.nodeID, n.ID(), n.Key().String())
}
- continue
- }
- if n.DiscoKey().IsZero() && !n.IsWireGuardOnly() {
- // Ancient pre-0.100 node, which does not have a disco key.
- // No longer supported.
- continue
+ } else {
+ // Internal data structures out of sync.
+ devPanicf("public key found in peerMap but not by nodeID")
}
+ return
+ }
+ if n.DiscoKey().IsZero() && !n.IsWireGuardOnly() {
+ // Ancient pre-0.100 node, which does not have a disco key.
+ // No longer supported.
+ return
+ }
- ep = &endpoint{
- c: c,
- nodeID: n.ID(),
- publicKey: n.Key(),
- publicKeyHex: n.Key().UntypedHexString(),
- sentPing: map[stun.TxID]sentPing{},
- endpointState: map[netip.AddrPort]*endpointState{},
- heartbeatDisabled: flags.heartbeatDisabled,
- isWireguardOnly: n.IsWireGuardOnly(),
- }
- switch runtime.GOOS {
- case "ios", "android":
- // Omit, to save memory. Prior to 2024-03-20 we used to limit it to
- // ~1MB on mobile but we never used the data so the memory was just
- // wasted.
- default:
- ep.debugUpdates = ringlog.New[EndpointChange](entriesPerBuffer)
- }
- if n.Addresses().Len() > 0 {
- ep.nodeAddr = n.Addresses().At(0).Addr()
- }
- ep.initFakeUDPAddr()
- ep.updateDiscoKey(n.DiscoKey())
+ ep = &endpoint{
+ c: c,
+ nodeID: n.ID(),
+ publicKey: n.Key(),
+ publicKeyHex: n.Key().UntypedHexString(),
+ sentPing: map[stun.TxID]sentPing{},
+ endpointState: map[netip.AddrPort]*endpointState{},
+ heartbeatDisabled: flags.heartbeatDisabled,
+ isWireguardOnly: n.IsWireGuardOnly(),
+ }
+ switch runtime.GOOS {
+ case "ios", "android":
+ // Omit, to save memory. Prior to 2024-03-20 we used to limit it to
+ // ~1MB on mobile but we never used the data so the memory was just
+ // wasted.
+ default:
+ ep.debugUpdates = ringlog.New[EndpointChange](entriesPerBuffer)
+ }
+ if n.Addresses().Len() > 0 {
+ ep.nodeAddr = n.Addresses().At(0).Addr()
+ }
+ ep.initFakeUDPAddr()
+ ep.updateDiscoKey(n.DiscoKey())
- if debugPeerMap() {
- c.logEndpointCreated(n)
- }
+ if debugPeerMap() {
+ c.logEndpointCreated(n)
+ }
- ep.updateFromNode(n, flags.heartbeatDisabled, flags.probeUDPLifetimeOn)
- c.peerMap.upsertEndpoint(ep, key.DiscoPublic{})
+ ep.updateFromNode(n, flags.heartbeatDisabled, flags.probeUDPLifetimeOn)
+ c.peerMap.upsertEndpoint(ep, key.DiscoPublic{})
+}
+
+// UpsertPeer adds or updates a single peer in c. It is the efficient
+// O(1)-per-peer alternative to [Conn.SetNetworkMap] when a single peer was
+// added or its fields changed. The caller is responsible for serializing
+// UpsertPeer/RemovePeer/SetNetworkMap calls relative to one another.
+//
+// UpsertPeer updates the relay-server set incrementally (O(1)) when the
+// upserted peer's relay candidacy changed, rather than rebuilding the
+// whole set with [Conn.updateRelayServersSet].
+func (c *Conn) UpsertPeer(n tailcfg.NodeView) {
+ c.mu.Lock()
+ if c.closed {
+ c.mu.Unlock()
+ return
}
+ if n.ID() == 0 {
+ c.mu.Unlock()
+ devPanicf("UpsertPeer: node with zero ID")
+ return
+ }
+ flags := c.debugFlagsLocked()
+ c.peersByID[n.ID()] = n
+ c.upsertPeerLocked(n, flags, debugRingBufferSize(len(c.peersByID)))
- // If the set of nodes changed since the last SetNetworkMap, the
- // upsert loop just above made c.peerMap contain the union of the
- // old and new peers - which will be larger than the set from the
- // current netmap. If that happens, go through the allocful
- // deletion path to clean up moribund nodes.
- if c.peerMap.nodeCount() != len(peers) {
- keep := set.Set[key.NodePublic]{}
- for _, n := range peers {
- keep.Add(n.Key())
- }
- c.peerMap.forEachEndpoint(func(ep *endpoint) {
- if !keep.Contains(ep.publicKey) {
- c.peerMap.deleteEndpoint(ep)
- }
- })
+ var relayUpsert candidatePeerRelay
+ relayQualifies := false
+ if c.relayClientEnabled {
+ relayQualifies, relayUpsert = c.relayCandidateLocked(n)
}
+ relayClientEnabled := c.relayClientEnabled
+ c.mu.Unlock()
- // discokeys might have changed in the above. Discard unused info.
- for dk := range c.discoInfo {
- if !c.peerMap.knownPeerDiscoKey(dk) {
- delete(c.discoInfo, dk)
+ if relayClientEnabled {
+ if relayQualifies {
+ c.relayManager.handleRelayServerUpsert(relayUpsert)
+ } else {
+ // The peer may have previously qualified; remove covers that
+ // case and is a no-op otherwise.
+ c.relayManager.handleRelayServerRemove(n.Key())
}
}
+}
- return peersChanged
+// RemovePeer removes a single peer from c. It is the efficient
+// O(1)-per-peer alternative to [Conn.SetNetworkMap] when a single peer was
+// removed. The caller is responsible for serializing UpsertPeer/RemovePeer/
+// SetNetworkMap calls relative to one another.
+func (c *Conn) RemovePeer(nid tailcfg.NodeID) {
+ c.mu.Lock()
+ if c.closed {
+ c.mu.Unlock()
+ return
+ }
+ prev, ok := c.peersByID[nid]
+ if !ok {
+ c.mu.Unlock()
+ return
+ }
+ delete(c.peersByID, nid)
+ if ep, ok := c.peerMap.endpointForNodeID(nid); ok {
+ c.peerMap.deleteEndpoint(ep)
+ }
+
+ // If the peer we just removed held the only reference to its disco
+ // key, drop the now-orphaned c.discoInfo entry. No need to scan the
+ // whole map — only this peer's disco key can have become unreferenced
+ // by this single removal.
+ if dk := prev.DiscoKey(); !dk.IsZero() && !c.peerMap.knownPeerDiscoKey(dk) {
+ delete(c.discoInfo, dk)
+ }
+
+ relayClientEnabled := c.relayClientEnabled
+ c.mu.Unlock()
+
+ if relayClientEnabled {
+ // Tell the relay manager to drop the peer. The run loop no-ops
+ // this if the peer wasn't a relay server.
+ c.relayManager.handleRelayServerRemove(prev.Key())
+ }
+}
+
+// relayCandidateLocked reports whether peer p is eligible to be a relay
+// server candidate for self, and if so returns the [candidatePeerRelay]
+// that would be added to the relay-server set. c.mu must be held.
+//
+// It mirrors the per-peer predicate in [Conn.updateRelayServersSet].
+func (c *Conn) relayCandidateLocked(p tailcfg.NodeView) (ok bool, cp candidatePeerRelay) {
+ if !p.Valid() {
+ return false, candidatePeerRelay{}
+ }
+ // The cap-version gate in updateRelayServersSet only applies to peers
+ // (not self). This helper is only called for peers, so always check.
+ if !capVerIsRelayCapable(p.Cap()) {
+ return false, candidatePeerRelay{}
+ }
+ if !nodeHasCap(c.filt, p, c.self, tailcfg.PeerCapabilityRelayTarget) {
+ return false, candidatePeerRelay{}
+ }
+ return true, candidatePeerRelay{
+ nodeKey: p.Key(),
+ discoKey: p.DiscoKey(),
+ derpHomeRegionID: uint16(p.HomeDERP()),
+ }
}
func devPanicf(format string, a ...any) {
diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go
index 16d392e42..c592751e9 100644
--- a/wgengine/magicsock/magicsock_test.go
+++ b/wgengine/magicsock/magicsock_test.go
@@ -62,7 +62,6 @@ import (
"tailscale.com/types/netlogtype"
"tailscale.com/types/netmap"
"tailscale.com/types/nettype"
- "tailscale.com/types/views"
"tailscale.com/util/cibuild"
"tailscale.com/util/clientmetric"
"tailscale.com/util/eventbus"
@@ -3847,7 +3846,7 @@ func TestConn_SetNetworkMap_updateRelayServersSet(t *testing.T) {
c.filt = tt.filt
if len(tt.wantRelayServers) == 0 {
// So we can verify it gets flipped back.
- c.hasPeerRelayServers.Store(true)
+ c.relayManager.hasPeerRelayServers.Store(true)
}
c.SetNetworkMap(tt.self, tt.peers)
@@ -3855,8 +3854,8 @@ func TestConn_SetNetworkMap_updateRelayServersSet(t *testing.T) {
if !got.Equal(tt.wantRelayServers) {
t.Fatalf("got: %v != want: %v", got, tt.wantRelayServers)
}
- if len(tt.wantRelayServers) > 0 != c.hasPeerRelayServers.Load() {
- t.Fatalf("c.hasPeerRelayServers: %v != len(tt.wantRelayServers) > 0: %v", c.hasPeerRelayServers.Load(), len(tt.wantRelayServers) > 0)
+ if got, want := c.relayManager.hasPeerRelayServers.Load(), len(tt.wantRelayServers) > 0; got != want {
+ t.Fatalf("c.relayManager.hasPeerRelayServers: %v != len(tt.wantRelayServers) > 0: %v", got, want)
}
if c.relayClientEnabled != tt.wantRelayClientEnabled {
t.Fatalf("c.relayClientEnabled: %v != wantRelayClientEnabled: %v", c.relayClientEnabled, tt.wantRelayClientEnabled)
@@ -4422,7 +4421,7 @@ func TestReceiveTSMPDiscoKeyAdvertisement(t *testing.T) {
netip.MustParsePrefix("100.64.0.1/32"),
},
}).View()
- conn.peers = views.SliceOf([]tailcfg.NodeView{nodeView})
+ conn.peersByID = map[tailcfg.NodeID]tailcfg.NodeView{nodeView.ID(): nodeView}
conn.mu.Unlock()
conn.peerMap.upsertEndpoint(ep, key.DiscoPublic{})
@@ -4468,7 +4467,7 @@ func TestSendingTSMPDiscoTimer(t *testing.T) {
netip.MustParsePrefix("100.64.0.1/32"),
},
}).View()
- conn.peers = views.SliceOf([]tailcfg.NodeView{nodeView})
+ conn.peersByID = map[tailcfg.NodeID]tailcfg.NodeView{nodeView.ID(): nodeView}
conn.mu.Unlock()
conn.peerMap.upsertEndpoint(ep, key.DiscoPublic{})
diff --git a/wgengine/magicsock/relaymanager.go b/wgengine/magicsock/relaymanager.go
index e4cd5eb9f..8ea15bce3 100644
--- a/wgengine/magicsock/relaymanager.go
+++ b/wgengine/magicsock/relaymanager.go
@@ -9,6 +9,7 @@ import (
"fmt"
"net/netip"
"sync"
+ "sync/atomic"
"time"
"tailscale.com/disco"
@@ -34,6 +35,14 @@ import (
type relayManager struct {
initOnce sync.Once
+ // hasPeerRelayServers is whether relayManager is configured with at
+ // least one peer relay server via [relayManager.handleRelayServersSet]
+ // (or per-peer variants). Exposed as an atomic so [endpoint] hot paths
+ // can short-circuit when there are no relay servers without taking any
+ // lock or entering the run loop. Written only from runLoop() via
+ // [relayManager.publishHasServersRunLoop].
+ hasPeerRelayServers atomic.Bool
+
// ===================================================================
// The following fields are owned by a single goroutine, runLoop().
serversByNodeKey map[key.NodePublic]candidatePeerRelay
@@ -56,6 +65,8 @@ type relayManager struct {
newServerEndpointCh chan newRelayServerEndpointEvent
rxDiscoMsgCh chan relayDiscoMsgEvent
serversCh chan set.Set[candidatePeerRelay]
+ serverUpsertCh chan candidatePeerRelay
+ serverRemoveCh chan key.NodePublic
getServersCh chan chan set.Set[candidatePeerRelay]
derpHomeChangeCh chan derpHomeChangeEvent
@@ -228,6 +239,16 @@ func (r *relayManager) runLoop() {
if !r.hasActiveWorkRunLoop() {
return
}
+ case upsert := <-r.serverUpsertCh:
+ r.handleServerUpsertRunLoop(upsert)
+ if !r.hasActiveWorkRunLoop() {
+ return
+ }
+ case nk := <-r.serverRemoveCh:
+ r.handleServerRemoveRunLoop(nk)
+ if !r.hasActiveWorkRunLoop() {
+ return
+ }
case getServersCh := <-r.getServersCh:
r.handleGetServersRunLoop(getServersCh)
if !r.hasActiveWorkRunLoop() {
@@ -265,6 +286,34 @@ func (r *relayManager) handleServersUpdateRunLoop(update set.Set[candidatePeerRe
for _, v := range update.Slice() {
r.serversByNodeKey[v.nodeKey] = v
}
+ r.publishHasServersRunLoop()
+}
+
+// handleServerUpsertRunLoop inserts or updates cp in serversByNodeKey. It is
+// the per-peer analog of [relayManager.handleServersUpdateRunLoop] used by
+// [Conn.UpsertPeer].
+func (r *relayManager) handleServerUpsertRunLoop(cp candidatePeerRelay) {
+ r.serversByNodeKey[cp.nodeKey] = cp
+ r.publishHasServersRunLoop()
+}
+
+// handleServerRemoveRunLoop deletes nk from serversByNodeKey. It is a no-op
+// if nk isn't currently a known server. It is the per-peer analog of
+// [relayManager.handleServersUpdateRunLoop] used by [Conn.RemovePeer] and by
+// [Conn.UpsertPeer] when a peer is upserted with fields that make it no
+// longer a relay candidate.
+func (r *relayManager) handleServerRemoveRunLoop(nk key.NodePublic) {
+ if _, ok := r.serversByNodeKey[nk]; !ok {
+ return
+ }
+ delete(r.serversByNodeKey, nk)
+ r.publishHasServersRunLoop()
+}
+
+// publishHasServersRunLoop updates [relayManager.hasPeerRelayServers] to
+// reflect whether any relay servers are currently known.
+func (r *relayManager) publishHasServersRunLoop() {
+ r.hasPeerRelayServers.Store(len(r.serversByNodeKey) > 0)
}
type relayDiscoMsgEvent struct {
@@ -330,6 +379,8 @@ func (r *relayManager) init() {
r.newServerEndpointCh = make(chan newRelayServerEndpointEvent)
r.rxDiscoMsgCh = make(chan relayDiscoMsgEvent)
r.serversCh = make(chan set.Set[candidatePeerRelay])
+ r.serverUpsertCh = make(chan candidatePeerRelay)
+ r.serverRemoveCh = make(chan key.NodePublic)
r.getServersCh = make(chan chan set.Set[candidatePeerRelay])
r.derpHomeChangeCh = make(chan derpHomeChangeEvent)
r.runLoopStoppedCh = make(chan struct{}, 1)
@@ -436,6 +487,21 @@ func (r *relayManager) handleRelayServersSet(servers set.Set[candidatePeerRelay]
relayManagerInputEvent(r, nil, &r.serversCh, servers)
}
+// handleRelayServerUpsert is the O(1) per-peer variant of
+// [relayManager.handleRelayServersSet]: it inserts or updates a single
+// relay server entry.
+func (r *relayManager) handleRelayServerUpsert(cp candidatePeerRelay) {
+ relayManagerInputEvent(r, nil, &r.serverUpsertCh, cp)
+}
+
+// handleRelayServerRemove is the O(1) per-peer variant of
+// [relayManager.handleRelayServersSet]: it removes a single relay server
+// entry by node key. It is a no-op if nk is not currently a known relay
+// server.
+func (r *relayManager) handleRelayServerRemove(nk key.NodePublic) {
+ relayManagerInputEvent(r, nil, &r.serverRemoveCh, nk)
+}
+
// relayManagerInputEvent initializes [relayManager] if necessary, starts
// relayManager.runLoop() if it is not running, and writes 'event' on 'eventCh'.
//