summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorMario Minardi <mario@tailscale.com>2024-11-28 13:47:55 -0700
committerMario Minardi <mario@tailscale.com>2024-11-28 13:47:55 -0700
commiteb769a2a68f712a8264e3d0ad66dbaadf1ca8d3e (patch)
treeefce690f0b2e82176764cc85707ea4a8fc1c8267
parent61dd2662eca775c8a3f6700a0194db5816de5049 (diff)
downloadtailscale-mpminardi/derp-experiments.tar.xz
tailscale-mpminardi/derp-experiments.zip
Experiment with ideal connect flowmpminardi/derp-experiments
-rw-r--r--derp/derphttp/derphttp_client.go281
1 files changed, 245 insertions, 36 deletions
diff --git a/derp/derphttp/derphttp_client.go b/derp/derphttp/derphttp_client.go
index c95d072b1..198bafb5f 100644
--- a/derp/derphttp/derphttp_client.go
+++ b/derp/derphttp/derphttp_client.go
@@ -32,6 +32,7 @@ import (
"tailscale.com/derp"
"tailscale.com/envknob"
"tailscale.com/health"
+ "tailscale.com/logtail/backoff"
"tailscale.com/net/dnscache"
"tailscale.com/net/netmon"
"tailscale.com/net/netns"
@@ -87,19 +88,20 @@ type Client struct {
// Client.conn holds mu.
addrFamSelAtomic syncs.AtomicValue[AddressFamilySelector]
- mu sync.Mutex
- atomicState syncs.AtomicValue[ConnectedState] // hold mu to write
- started bool // true upon first connect, never transitions to false
- preferred bool
- canAckPings bool
- closed bool
- netConn io.Closer
- client *derp.Client
- connGen int // incremented once per new connection; valid values are >0
- serverPubKey key.NodePublic
- tlsState *tls.ConnectionState
- pingOut map[derp.PingMessage]chan<- bool // chan to send to on pong
- clock tstime.Clock
+ mu sync.Mutex
+ atomicState syncs.AtomicValue[ConnectedState] // hold mu to write
+ started bool // true upon first connect, never transitions to false
+ preferred bool
+ canAckPings bool
+ closed bool
+ netConn io.Closer
+ client *derp.Client
+ connGen int // incremented once per new connection; valid values are >0
+ serverPubKey key.NodePublic
+ tlsState *tls.ConnectionState
+ pingOut map[derp.PingMessage]chan<- bool // chan to send to on pong
+ clock tstime.Clock
+ attemptingIdealConnection bool
}
// ConnectedState describes the state of a derphttp Client.
@@ -325,6 +327,7 @@ func useWebsockets() bool {
return false
}
+// TODO: split this into 2 methods so we can do like a try-connect ype thing?
func (c *Client) connect(ctx context.Context, caller string) (client *derp.Client, connGen int, err error) {
c.mu.Lock()
defer c.mu.Unlock()
@@ -460,32 +463,15 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien
var serverProtoVersion int
var tlsState *tls.ConnectionState
if c.useHTTPS() {
- tlsConn := c.tlsClient(tcpConn, node)
- httpConn = tlsConn
-
- // Force a handshake now (instead of waiting for it to
- // be done implicitly on read/write) so we can check
- // the ConnectionState.
- if err := tlsConn.Handshake(); err != nil {
+ handshakeData, err := c.tlsHandshake(tcpConn, node)
+ if err != nil {
return nil, 0, err
}
- // We expect to be using TLS 1.3 to our own servers, and only
- // starting at TLS 1.3 are the server's returned certificates
- // encrypted, so only look for and use our "meta cert" if we're
- // using TLS 1.3. If we're not using TLS 1.3, it might be a user
- // running cmd/derper themselves with a different configuration,
- // in which case we can avoid this fast-start optimization.
- // (If a corporate proxy is MITM'ing TLS 1.3 connections with
- // corp-mandated TLS root certs than all bets are off anyway.)
- // Note that we're not specifically concerned about TLS downgrade
- // attacks. TLS handles that fine:
- // https://blog.gypsyengineer.com/en/security/how-does-tls-1-3-protect-against-downgrade-attacks.html
- cs := tlsConn.ConnectionState()
- tlsState = &cs
- if cs.Version >= tls.VersionTLS13 {
- serverPub, serverProtoVersion = parseMetaCert(cs.PeerCertificates)
- }
+ httpConn = handshakeData.conn
+ serverPub = handshakeData.serverPub
+ serverProtoVersion = handshakeData.serverProtoVersion
+ tlsState = &handshakeData.connectionState
} else {
httpConn = tcpConn
}
@@ -510,6 +496,34 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien
// on Client that's like "here's the connect result and connGen for the
// next connect that comes in"). Tracking bug for all this is:
// https://github.com/tailscale/tailscale/issues/12724
+ if !c.attemptingIdealConnection {
+ c.attemptingIdealConnection = true
+
+ go func() {
+ bo := backoff.NewBackoff("dial-node", c.logf, 24*time.Hour)
+
+ for {
+ bo.BackOff(ctx, err)
+
+ // Exit if we're connected to the preferred
+ c.mu.Lock()
+ if c.preferred {
+ c.mu.Unlock()
+ break
+ }
+ c.mu.Unlock()
+
+ err := c.tryConnectIdeal(ctx, node, "test")
+ if err == nil {
+ break
+ }
+ }
+
+ c.mu.Lock()
+ c.attemptingIdealConnection = false
+ c.mu.Unlock()
+ }()
+ }
}
if !serverPub.IsZero() && serverProtoVersion != 0 {
@@ -527,6 +541,7 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien
// No need to flush the HTTP request. the derp.Client's initial
// client auth frame will flush it.
} else {
+ // TODO: this is the upgrade
if err := req.Write(brw); err != nil {
return nil, 0, err
}
@@ -581,6 +596,199 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien
return c.client, c.connGen, nil
}
+func (c *Client) tryConnectIdeal(ctx context.Context, node *tailcfg.DERPNode, caller string) (err error) {
+ // timeout is the fallback maximum time (if ctx doesn't limit
+ // it further) to do all of: DNS + TCP + TLS + HTTP Upgrade +
+ // DERP upgrade.
+ const timeout = 10 * time.Second
+ ctx, cancel := context.WithTimeout(ctx, timeout)
+ go func() {
+ select {
+ case <-ctx.Done():
+ // Either timeout fired (handled below), or
+ // we're returning via the defer cancel()
+ // below.
+ case <-c.ctx.Done():
+ // Propagate a Client.Close call into
+ // cancelling this context.
+ cancel()
+ }
+ }()
+ defer cancel()
+
+ var tcpConn net.Conn
+
+ defer func() {
+ if err != nil {
+ if ctx.Err() != nil {
+ err = fmt.Errorf("%v: %v", ctx.Err(), err)
+ }
+ err = fmt.Errorf("%s connect to %v: %v", caller, node.HostName, err)
+ if tcpConn != nil {
+ go tcpConn.Close()
+ }
+ }
+ }()
+
+ tcpConn, err = c.dialNode(ctx, node)
+ if err != nil {
+ return err
+ }
+
+ //// More handshake? ////
+ var httpConn net.Conn // a TCP conn or a TLS conn; what we speak HTTP to
+ var serverPub key.NodePublic // or zero if unknown (if not using TLS or TLS middlebox eats it)
+ var serverProtoVersion int
+ var tlsState *tls.ConnectionState
+ if c.useHTTPS() {
+ handshakeData, err := c.tlsHandshake(tcpConn, node)
+ if err != nil {
+ return err
+ }
+
+ httpConn = handshakeData.conn
+ serverPub = handshakeData.serverPub
+ serverProtoVersion = handshakeData.serverProtoVersion
+ tlsState = &handshakeData.connectionState
+ } else {
+ httpConn = tcpConn
+ }
+ //// More handshake? ////
+
+ // TODO: drop old here and get mutex
+ c.closeForReconnect(c.client)
+ c.mu.Lock()
+ defer c.mu.Unlock()
+
+ //// Upgrade ? ////
+ brw := bufio.NewReadWriter(bufio.NewReader(httpConn), bufio.NewWriter(httpConn))
+ var derpClient *derp.Client
+
+ req, err := http.NewRequest("GET", c.urlString(node), nil)
+ if err != nil {
+ return err
+ }
+ req.Header.Set("Upgrade", "DERP")
+ req.Header.Set("Connection", "Upgrade")
+
+ if !serverPub.IsZero() && serverProtoVersion != 0 {
+ // parseMetaCert found the server's public key (no TLS
+ // middlebox was in the way), so skip the HTTP upgrade
+ // exchange. See https://github.com/tailscale/tailscale/issues/693
+ // for an overview. We still send the HTTP request
+ // just to get routed into the server's HTTP Handler so it
+ // can Hijack the request, but we signal with a special header
+ // that we don't want to deal with its HTTP response.
+ req.Header.Set(fastStartHeader, "1") // suppresses the server's HTTP response
+ if err := req.Write(brw); err != nil {
+ return err
+ }
+ // No need to flush the HTTP request. the derp.Client's initial
+ // client auth frame will flush it.
+ } else {
+ // TODO: this is the upgrade
+ if err := req.Write(brw); err != nil {
+ return err
+ }
+ if err := brw.Flush(); err != nil {
+ return err
+ }
+
+ resp, err := http.ReadResponse(brw.Reader, req)
+ if err != nil {
+ return err
+ }
+ if resp.StatusCode != http.StatusSwitchingProtocols {
+ b, _ := io.ReadAll(resp.Body)
+ resp.Body.Close()
+ return fmt.Errorf("GET failed: %v: %s", err, b)
+ }
+ }
+ //// Upgrade ? ////
+
+ derpClient, err = derp.NewClient(c.privateKey, httpConn, brw, c.logf,
+ derp.MeshKey(c.MeshKey),
+ derp.ServerPublicKey(serverPub),
+ derp.CanAckPings(c.canAckPings),
+ derp.IsProber(c.IsProber),
+ )
+ if err != nil {
+ return err
+ }
+ if err := derpClient.NotePreferred(true); err != nil {
+ go httpConn.Close()
+ return err
+ }
+
+ if c.WatchConnectionChanges {
+ if err := derpClient.WatchConnectionChanges(); err != nil {
+ go httpConn.Close()
+ return err
+ }
+ }
+
+ c.preferred = true
+
+ c.serverPubKey = derpClient.ServerPublicKey()
+ c.client = derpClient
+ c.netConn = tcpConn
+ c.tlsState = tlsState
+ c.connGen++
+
+ localAddr, _ := c.client.LocalAddr()
+ c.atomicState.Store(ConnectedState{
+ Connected: true,
+ LocalAddr: localAddr,
+ })
+ return nil
+}
+
+type tlsHandshakeData struct {
+ conn *tls.Conn
+ connectionState tls.ConnectionState
+ serverPub key.NodePublic // zero if unknown (if not using TLS or TLS middlebox eats it)
+ serverProtoVersion int
+}
+
+// tlsHandshake forces a TLS handshake and returns tlsHandshakeData
+// containing the TLS connection, connection state, and server metadata
+// on success.
+func (c *Client) tlsHandshake(tcpConn net.Conn, node *tailcfg.DERPNode) (*tlsHandshakeData, error) {
+ tlsConn := c.tlsClient(tcpConn, node)
+
+ // Force a handshake now (instead of waiting for it to
+ // be done implicitly on read/write) so we can check
+ // the ConnectionState.
+ if err := tlsConn.Handshake(); err != nil {
+ return nil, err
+ }
+
+ // We expect to be using TLS 1.3 to our own servers, and only
+ // starting at TLS 1.3 are the server's returned certificates
+ // encrypted, so only look for and use our "meta cert" if we're
+ // using TLS 1.3. If we're not using TLS 1.3, it might be a user
+ // running cmd/derper themselves with a different configuration,
+ // in which case we can avoid this fast-start optimization.
+ // (If a corporate proxy is MITM'ing TLS 1.3 connections with
+ // corp-mandated TLS root certs than all bets are off anyway.)
+ // Note that we're not specifically concerned about TLS downgrade
+ // attacks. TLS handles that fine:
+ // https://blog.gypsyengineer.com/en/security/how-does-tls-1-3-protect-against-downgrade-attacks.html
+ cs := tlsConn.ConnectionState()
+ var serverPub key.NodePublic
+ var serverProtoVersion int
+ if cs.Version >= tls.VersionTLS13 {
+ serverPub, serverProtoVersion = parseMetaCert(cs.PeerCertificates)
+ }
+
+ return &tlsHandshakeData{
+ conn: tlsConn,
+ connectionState: cs,
+ serverPub: serverPub,
+ serverProtoVersion: serverProtoVersion,
+ }, nil
+}
+
// SetURLDialer sets the dialer to use for dialing URLs.
// This dialer is only use for clients created with NewClient, not NewRegionClient.
// If unset or nil, the default dialer is used.
@@ -1116,6 +1324,7 @@ func (c *Client) Close() error {
// time and both calling closeForReconnect and the caller goroutines
// forever calling closeForReconnect in lockstep endlessly;
// https://github.com/tailscale/tailscale/pull/264)
+// TODO: this to close existing?
func (c *Client) closeForReconnect(brokenClient *derp.Client) {
c.mu.Lock()
defer c.mu.Unlock()