summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorClaus Lensbøl <claus@tailscale.com>2025-12-02 16:12:58 -0500
committerClaus Lensbøl <claus@tailscale.com>2025-12-04 10:38:16 -0500
commit05d7ea4f1ca30047883eaa31bf77928bb24f94e3 (patch)
tree7433a7f4b4e84b6c68d1ac8f3795d5d5fee7d546
parent5bfa8e97f6b419e6a1d2923c47a8fab7258b04db (diff)
downloadtailscale-cmol/delay-disco-key-exchange.tar.xz
tailscale-cmol/delay-disco-key-exchange.zip
wgengine: delay disco key exchange when control is unavailable (branch: cmol/delay-disco-key-exchange)
Instead of adding a delay to the exchange of disco keys to avoid a race with the WireGuard handshake, look for the handshake itself and ensure that we do not start the exchange while the handshake is ongoing. We do not have a direct way of knowing that a handshake is ongoing, only how many are currently in flight or have failed. That makes the polling less straightforward. Ideally, we would have WireGuard tell us when a handshake is ongoing so we can avoid polling for it. Updates tailscale/corp#34037 Signed-off-by: Claus Lensbøl <claus@tailscale.com>
-rw-r--r--wgengine/userspace.go53
1 files changed, 43 insertions, 10 deletions
diff --git a/wgengine/userspace.go b/wgengine/userspace.go
index c0e79633a..1b57c24fc 100644
--- a/wgengine/userspace.go
+++ b/wgengine/userspace.go
@@ -569,16 +569,49 @@ func NewUserspaceEngine(logf logger.Logf, conf Config) (_ Engine, reterr error)
var tsmpRequestGroup singleflight.Group[netip.Addr, struct{}]
eventbus.SubscribeFunc(ec, func(req magicsock.TSMPDiscoKeyRequest) {
go tsmpRequestGroup.Do(req.DstIP, func() (struct{}, error) {
- // DiscoKeyRequests are triggered by an incoming WireGuard handshake
- // initiation arriving before a disco ping, which is a likely
- // indicator that disco pings failed due to a lack of key
- // synchronization. If the requests are sent immediately, before the
- // handshake state is accepted in the WireGuard client state
- // machine, this starts a new session, and the two peer state
- // machines conflict, causing loss and additional delays. Delaying
- // the send avoids this, so coalesce duplicate sends, and delay them
- // by a short time to avoid the state machine conflict.
- time.Sleep(time.Millisecond)
+ nodePeer, ok := e.PeerForIP(req.DstIP)
+ if !ok {
+ return struct{}{}, fmt.Errorf("did not find peer by IP %q", req.DstIP)
+ }
+ peer, ok := e.PeerByKey(nodePeer.Node.Key())
+ if !ok {
+ return struct{}{}, fmt.Errorf("did not find peer by key %q", nodePeer.Node.Key())
+ }
+ peer.IsValid()
+
+ // Poll for handshake completion with a timeout.
+ const pollInterval = 10 * time.Millisecond
+ const maxWaitStart = 100 * time.Microsecond
+ ctxStart, cancelStart := context.WithTimeout(context.Background(), maxWaitStart)
+ defer cancelStart()
+
+ sawHandshake := true
+ // Wait for the handshake to be in-progress.
+ e.logf("Looking for magicsock handshake")
+ for peer.HandshakeAttempts() == 0 {
+ if ctxStart.Err() != nil {
+ // Timeout waiting for handshake to start, send TSMP package.
+ sawHandshake = false
+ break
+ }
+ time.Sleep(pollInterval)
+ }
+ e.logf("Found magicsock handshake: %t", sawHandshake)
+
+ const maxWaitComplete = 2 * time.Second
+ ctx, cancel := context.WithTimeout(context.Background(), maxWaitComplete)
+ defer cancel()
+ // Wait for the in-progress handshake to complete.
+ e.logf("Waiting for magicsock handshake to complete")
+ for sawHandshake && peer.HandshakeAttempts() > 0 {
+ if ctx.Err() != nil {
+ // Timeout waiting for completion. The handshake is stuck. Abort.
+ e.logf("Timed out waiting for magicsock handshake to complete")
+ return struct{}{}, errors.New("timeout waiting for handshake to complete")
+ }
+ time.Sleep(pollInterval)
+ }
+
if err := e.sendTSMPDiscoKeyRequest(req.DstIP); err != nil {
e.logf("wgengine: failed to send TSMP disco key request: %v", err)
}