diff options
| author | Brad Fitzpatrick <bradfitz@tailscale.com> | 2025-09-20 16:48:18 -0700 |
|---|---|---|
| committer | Brad Fitzpatrick <bradfitz@tailscale.com> | 2025-09-21 01:05:54 +0000 |
| commit | 17643b05eb5a1e24b84c6937061f781e31dce0b9 (patch) | |
| tree | 906a6fe511f9a15e5eea5810337f04b71bdf6e90 /control/controlhttp/client.go | |
| parent | 1b6bc37f2859007dc4ed949b14f1f8531990b3cf (diff) | |
| download | tailscale-jamesbrad/controlhttp-race-dial.tar.xz tailscale-jamesbrad/controlhttp-race-dial.zip | |
control/controlhttp: simplify, fix race dialing, remove priority conceptjamesbrad/controlhttp-race-dial
Fixes tailscale/corp#32534
Co-authored-by: James Tucker <james@tailscale.com>
Change-Id: I4eb57f046d8b40403220e40eb67a31c41adb3a38
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
Diffstat (limited to 'control/controlhttp/client.go')
| -rw-r--r-- | control/controlhttp/client.go | 182 |
1 files changed, 47 insertions, 135 deletions
diff --git a/control/controlhttp/client.go b/control/controlhttp/client.go index 87061c310..5b1fed422 100644 --- a/control/controlhttp/client.go +++ b/control/controlhttp/client.go @@ -27,14 +27,13 @@ import ( "errors" "fmt" "io" - "math" + "log" "net" "net/http" "net/http/httptrace" "net/netip" "net/url" "runtime" - "sort" "sync/atomic" "time" @@ -53,7 +52,6 @@ import ( "tailscale.com/syncs" "tailscale.com/tailcfg" "tailscale.com/tstime" - "tailscale.com/util/multierr" ) var stdDialer net.Dialer @@ -110,18 +108,8 @@ func (a *Dialer) dial(ctx context.Context) (*ClientConn, error) { } candidates := a.DialPlan.Candidates - // Otherwise, we try dialing per the plan. Store the highest priority - // in the list, so that if we get a connection to one of those - // candidates we can return quickly. - var highestPriority int = math.MinInt - for _, c := range candidates { - if c.Priority > highestPriority { - highestPriority = c.Priority - } - } - - // This context allows us to cancel in-flight connections if we get a - // highest-priority connection before we're all done. + // Create a context to be canceled as we return, so once we get a good connection, + // we can drop all the other ones. ctx, cancel := context.WithCancel(ctx) defer cancel() @@ -129,142 +117,61 @@ func (a *Dialer) dial(ctx context.Context) (*ClientConn, error) { type dialResult struct { conn *ClientConn err error - cand tailcfg.ControlIPCandidate } - resultsCh := make(chan dialResult, len(candidates)) - - var pending atomic.Int32 - pending.Store(int32(len(candidates))) - for _, c := range candidates { - go func(ctx context.Context, c tailcfg.ControlIPCandidate) { - var ( - conn *ClientConn - err error - ) - - // Always send results back to our channel. - defer func() { - resultsCh <- dialResult{conn, err, c} - if pending.Add(-1) == 0 { - close(resultsCh) - } - }() + resultsCh := make(chan dialResult) // unbuffered, never closed - // If non-zero, wait the configured start timeout - // before we do anything. - if c.DialStartDelaySec > 0 { - a.logf("[v2] controlhttp: waiting %.2f seconds before dialing %q @ %v", c.DialStartDelaySec, a.Hostname, c.IP) - tmr, tmrChannel := a.clock().NewTimer(time.Duration(c.DialStartDelaySec * float64(time.Second))) - defer tmr.Stop() - select { - case <-ctx.Done(): - err = ctx.Err() - return - case <-tmrChannel: - } - } + dialCand := func(cand tailcfg.ControlIPCandidate) (*ClientConn, error) { + a.logf("[v2] controlhttp: waited %.2f seconds, dialing %q @ %v", cand.DialStartDelaySec, a.Hostname, cand.IP) - // Now, create a sub-context with the given timeout and - // try dialing the provided host. - ctx, cancel := context.WithTimeout(ctx, time.Duration(c.DialTimeoutSec*float64(time.Second))) - defer cancel() + ctx, cancel := context.WithTimeout(ctx, time.Duration(cand.DialTimeoutSec*float64(time.Second))) + defer cancel() - if c.IP.IsValid() { - a.logf("[v2] controlhttp: trying to dial %q @ %v", a.Hostname, c.IP) - } else if c.ACEHost != "" { - a.logf("[v2] controlhttp: trying to dial %q via ACE %q", a.Hostname, c.ACEHost) - } - // This will dial, and the defer above sends it back to our parent. - conn, err = a.dialHostOpt(ctx, c.IP, c.ACEHost) - }(ctx, c) + if cand.IP.IsValid() { + a.logf("[v2] controlhttp: trying to dial %q @ %v", a.Hostname, cand.IP) + } else if cand.ACEHost != "" { + a.logf("[v2] controlhttp: trying to dial %q via ACE %q", a.Hostname, cand.ACEHost) + } + // This will dial, and the defer above sends it back to our parent. + return a.dialHostOpt(ctx, cand.IP, cand.ACEHost) } - var results []dialResult - for res := range resultsCh { - // If we get a response that has the highest priority, we don't - // need to wait for any of the other connections to finish; we - // can just return this connection. - // - // TODO(andrew): we could make this better by keeping track of - // the highest remaining priority dynamically, instead of just - // checking for the highest total - if res.cand.Priority == highestPriority && res.conn != nil { - a.logf("[v1] controlhttp: high-priority success dialing %q @ %v from dial plan", a.Hostname, cmp.Or(res.cand.ACEHost, res.cand.IP.String())) - - // Drain the channel and any existing connections in - // the background. + for _, cand := range candidates { + timer := time.AfterFunc(time.Duration(cand.DialStartDelaySec*float64(time.Second)), func() { go func() { - for _, res := range results { - if res.conn != nil { - res.conn.Close() + conn, err := dialCand(cand) + select { + case resultsCh <- dialResult{conn, err}: + if err == nil { + a.logf("[v1] controlhttp: succeeded dialing %q @ %v from dial plan", a.Hostname, cmp.Or(cand.ACEHost, cand.IP.String())) } - } - for res := range resultsCh { - if res.conn != nil { - res.conn.Close() + case <-ctx.Done(): + if conn != nil { + conn.Close() } } - if a.drainFinished != nil { - close(a.drainFinished) - } }() - return res.conn, nil - } - - // This isn't a highest-priority result, so just store it until - // we're done. - results = append(results, res) + }) + defer timer.Stop() } - // After we finish this function, close any remaining open connections. - defer func() { - for _, result := range results { - // Note: below, we nil out the returned connection (if - // any) in the slice so we don't close it. - if result.conn != nil { - result.conn.Close() + var errs []error + for { + select { + case res := <-resultsCh: + if res.err == nil { + return res.conn, nil } + errs = append(errs, res.err) + if len(errs) == len(candidates) { + // If we get here, then we didn't get anywhere with our dial plan; fall back to just using DNS. + a.logf("controlhttp: failed dialing using DialPlan, falling back to DNS; errs=%s", errors.Join(errs...)) + return a.dialHost(ctx) + } + case <-ctx.Done(): + a.logf("controlhttp: context aborted dialing") + return nil, ctx.Err() } - - // We don't drain asynchronously after this point, so notify our - // channel when we return. - if a.drainFinished != nil { - close(a.drainFinished) - } - }() - - // Sort by priority, then take the first non-error response. - sort.Slice(results, func(i, j int) bool { - // NOTE: intentionally inverted so that the highest priority - // item comes first - return results[i].cand.Priority > results[j].cand.Priority - }) - - var ( - conn *ClientConn - errs []error - ) - for i, result := range results { - if result.err != nil { - errs = append(errs, result.err) - continue - } - - a.logf("[v1] controlhttp: succeeded dialing %q @ %v from dial plan", a.Hostname, cmp.Or(result.cand.ACEHost, result.cand.IP.String())) - conn = result.conn - results[i].conn = nil // so we don't close it in the defer - return conn, nil } - if ctx.Err() != nil { - a.logf("controlhttp: context aborted dialing") - return nil, ctx.Err() - } - - merr := multierr.New(errs...) - - // If we get here, then we didn't get anywhere with our dial plan; fall back to just using DNS. - a.logf("controlhttp: failed dialing using DialPlan, falling back to DNS; errs=%s", merr.Error()) - return a.dialHost(ctx) } // The TS_FORCE_NOISE_443 envknob forces the controlclient noise dialer to @@ -422,6 +329,11 @@ func (a *Dialer) dialHostOpt(ctx context.Context, optAddr netip.Addr, optACEHost go try(u443) } // else we lost the race and it started already which is what we want case u443: + if u80 == nil { + log.Printf("XXXX no port 80 so returning error: %v", res.err) + // We never started a port 80 dial, so just return the port 443 error. + return nil, res.err + } err443 = res.err default: panic("invalid") |
