summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--ipn/ipnlocal/local.go64
-rw-r--r--ipn/ipnlocal/local_test.go9
-rw-r--r--ipn/ipnlocal/state_test.go229
3 files changed, 278 insertions, 24 deletions
diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go
index b0a8d9985..c560fdae1 100644
--- a/ipn/ipnlocal/local.go
+++ b/ipn/ipnlocal/local.go
@@ -313,9 +313,8 @@ type LocalBackend struct {
serveListeners map[netip.AddrPort]*localListener // listeners for local serve traffic
serveProxyHandlers sync.Map // string (HTTPHandler.Proxy) => *reverseProxy
- // statusLock must be held before calling statusChanged.Wait() or
+ // mu must be held before calling statusChanged.Wait() or
// statusChanged.Broadcast().
- statusLock sync.Mutex
statusChanged *sync.Cond
// dialPlan is any dial plan that we've received from the control
@@ -542,7 +541,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
b.setTCPPortsIntercepted(nil)
- b.statusChanged = sync.NewCond(&b.statusLock)
+ b.statusChanged = sync.NewCond(&b.mu)
b.e.SetStatusCallback(b.setWgengineStatus)
b.prevIfState = netMon.InterfaceState()
@@ -2265,14 +2264,15 @@ func (b *LocalBackend) setWgengineStatus(s *wgengine.Status, err error) {
b.send(ipn.Notify{Engine: &es})
}
+// broadcastStatusChanged must not be called with b.mu held.
func (b *LocalBackend) broadcastStatusChanged() {
// The sync.Cond docs say: "It is allowed but not required for the caller to hold c.L during the call."
- // In this particular case, we must acquire b.statusLock. Otherwise we might broadcast before
+ // In this particular case, we must acquire b.mu. Otherwise we might broadcast before
// the waiter (in requestEngineStatusAndWait) starts to wait, in which case
// the waiter can get stuck indefinitely. See PR 2865.
- b.statusLock.Lock()
+ b.mu.Lock()
b.statusChanged.Broadcast()
- b.statusLock.Unlock()
+ b.mu.Unlock()
}
// SetNotifyCallback sets the function to call when the backend has something to
@@ -3343,11 +3343,12 @@ func (b *LocalBackend) popBrowserAuthNow(url string, keyExpired bool, recipient
if !b.seamlessRenewalEnabled() || keyExpired {
b.blockEngineUpdates(true)
b.stopEngineAndWait()
+
+ if b.State() == ipn.Running {
+ b.enterState(ipn.Starting)
+ }
}
b.tellRecipientToBrowseToURL(url, toNotificationTarget(recipient))
- if b.State() == ipn.Running {
- b.enterState(ipn.Starting)
- }
}
// validPopBrowserURL reports whether urlStr is a valid value for a
@@ -5513,7 +5514,13 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlock
activeLogin := b.activeLogin
authURL := b.authURL
if newState == ipn.Running {
- b.resetAuthURLLocked()
+ // TODO(zofrex): Is this needed? As of 2025-10-03 it doesn't seem to be
+ // necessary when logging in or authenticating. When do we need to reset it
+ // here, rather than the other places it is reset? We should test if it is
+ // necessary and add unit tests to cover those cases, or remove it.
+ if oldState != ipn.Running {
+ b.resetAuthURLLocked()
+ }
// Start a captive portal detection loop if none has been
// started. Create a new context if none is present, since it
@@ -5750,29 +5757,38 @@ func (u unlockOnce) UnlockEarly() {
}
// stopEngineAndWait deconfigures the local network data plane, and
-// waits for it to deliver a status update before returning.
-//
-// TODO(danderson): this may be racy. We could unblock upon receiving
-// a status update that predates the "I've shut down" update.
+// waits for it to deliver a status update indicating it has stopped
+// before returning.
func (b *LocalBackend) stopEngineAndWait() {
b.logf("stopEngineAndWait...")
b.e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{})
- b.requestEngineStatusAndWait()
+ b.requestEngineStatusAndWaitForStopped()
b.logf("stopEngineAndWait: done.")
}
-// Requests the wgengine status, and does not return until the status
-// was delivered (to the usual callback).
-func (b *LocalBackend) requestEngineStatusAndWait() {
- b.logf("requestEngineStatusAndWait")
+// Requests the wgengine status, and does not return until a status was
+// delivered (to the usual callback) that indicates the engine is stopped.
+func (b *LocalBackend) requestEngineStatusAndWaitForStopped() {
+ b.logf("requestEngineStatusAndWaitForStopped")
- b.statusLock.Lock()
- defer b.statusLock.Unlock()
+ b.mu.Lock()
+ defer b.mu.Unlock()
b.goTracker.Go(b.e.RequestStatus)
- b.logf("requestEngineStatusAndWait: waiting...")
- b.statusChanged.Wait() // temporarily releases lock while waiting
- b.logf("requestEngineStatusAndWait: got status update.")
+ b.logf("requestEngineStatusAndWaitForStopped: waiting...")
+ for {
+ b.statusChanged.Wait() // temporarily releases lock while waiting
+
+ if !b.blocked {
+ b.logf("requestEngineStatusAndWaitForStopped: engine is no longer blocked, must have stopped and started again, not safe to wait.")
+ break
+ }
+ if b.engineStatus.NumLive == 0 && b.engineStatus.LiveDERPs == 0 {
+ b.logf("requestEngineStatusAndWaitForStopped: engine is stopped.")
+ break
+ }
+ b.logf("requestEngineStatusAndWaitForStopped: engine is still running. Waiting...")
+ }
}
// setControlClientLocked sets the control client to cc,
diff --git a/ipn/ipnlocal/local_test.go b/ipn/ipnlocal/local_test.go
index a662793db..bc8bd2a67 100644
--- a/ipn/ipnlocal/local_test.go
+++ b/ipn/ipnlocal/local_test.go
@@ -1505,6 +1505,15 @@ func wantExitNodeIDNotify(want tailcfg.StableNodeID) wantedNotification {
}
}
+func wantStateNotify(want ipn.State) wantedNotification {
+ return wantedNotification{
+ name: "State=" + want.String(),
+ cond: func(_ testing.TB, _ ipnauth.Actor, n *ipn.Notify) bool {
+ return n.State != nil && *n.State == want
+ },
+ }
+}
+
func TestInternalAndExternalInterfaces(t *testing.T) {
type interfacePrefix struct {
i netmon.Interface
diff --git a/ipn/ipnlocal/state_test.go b/ipn/ipnlocal/state_test.go
index a387af035..d773f7227 100644
--- a/ipn/ipnlocal/state_test.go
+++ b/ipn/ipnlocal/state_test.go
@@ -1561,6 +1561,235 @@ func TestEngineReconfigOnStateChange(t *testing.T) {
}
}
+// TestStateMachineURLRace tests that wgengine updates arriving in the middle of
+// processing an auth URL doesn't result in the auth URL being cleared.
+func TestStateMachineURLRace(t *testing.T) {
+ runTestStateMachineURLRace(t, false)
+}
+
+func TestStateMachineURLRaceSeamless(t *testing.T) {
+ runTestStateMachineURLRace(t, true)
+}
+
+func runTestStateMachineURLRace(t *testing.T, seamless bool) {
+ var cc *mockControl
+ b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
+ cc = newClient(t, opts)
+ return cc
+ })
+
+ nw := newNotificationWatcher(t, b, &ipnauth.TestActor{})
+
+ t.Logf("Start")
+ nw.watch(0, []wantedNotification{
+ wantStateNotify(ipn.NeedsLogin)})
+ b.Start(ipn.Options{
+ UpdatePrefs: &ipn.Prefs{
+ WantRunning: true,
+ ControlURL: "https://localhost:1/",
+ },
+ })
+ nw.check()
+
+ t.Logf("LoginFinished")
+ cc.persist.UserProfile.LoginName = "user1"
+ cc.persist.NodeID = "node1"
+
+ if seamless {
+ b.sys.ControlKnobs().SeamlessKeyRenewal.Store(true)
+ }
+
+ nw.watch(0, []wantedNotification{
+ wantStateNotify(ipn.Starting)})
+ cc.send(nil, "", true, &netmap.NetworkMap{
+ SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
+ })
+ nw.check()
+
+ t.Logf("Running")
+ nw.watch(0, []wantedNotification{
+ wantStateNotify(ipn.Running)})
+ b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
+ nw.check()
+
+ t.Logf("Re-auth (StartLoginInteractive)")
+ b.StartLoginInteractive(t.Context())
+
+ stop := make(chan struct{})
+ stopSpamming := sync.OnceFunc(func() {
+ stop <- struct{}{}
+ })
+ // if seamless renewal is enabled, the engine won't be disabled, and we won't
+ // ever call stopSpamming, so make sure it does get called
+ defer stopSpamming()
+
+ // Intercept updates between the engine and localBackend, so that we can see
+ // when the "stopped" update comes in and ensure we stop sending our "we're
+ // up" updates after that point.
+ b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
+ // This is not one of our fake status updates, this is generated from the
+ // engine in response to LocalBackend calling RequestStatus. Stop spamming
+ // our fake statuses.
+ //
+ // TODO(zofrex): This is fragile, it works right now but would break if the
+ // calling pattern of RequestStatus changes. We should ensure that we keep
+ // sending "we're up" statuses right until Reconfig is called with
+ // zero-valued configs, and after that point only send "stopped" statuses.
+ stopSpamming()
+
+ // Once stopSpamming returns we are guaranteed to not send any more updates,
+ // so we can now send the real update (indicating shutdown) and be certain
+ // it will be received after any fake updates we sent. This is possibly a
+ // stronger guarantee than we get from the real engine?
+ b.setWgengineStatus(s, err)
+ })
+
+ // time needs to be >= last time for the status to be accepted, send all our
+ // spam with the same stale time so that when a real update comes in it will
+ // definitely be accepted.
+ time := b.lastStatusTime
+
+ // Flood localBackend with a lot of wgengine status updates, so if there are
+ // any race conditions in the multiple locks/unlocks that happen as we process
+ // the received auth URL, we will hit them.
+ go func() {
+ t.Logf("sending lots of fake wgengine status updates")
+ for {
+ select {
+ case <-stop:
+ t.Logf("stopping fake wgengine status updates")
+ return
+ default:
+ b.setWgengineStatus(&wgengine.Status{AsOf: time, DERPs: 1}, nil)
+ }
+ }
+ }()
+
+ t.Logf("Re-auth (receive URL)")
+ url1 := "https://localhost:1/1"
+ cc.send(nil, url1, false, nil)
+
+ // Don't need to wait on anything else - once .send completes, authURL should
+ // be set, and once .send has completed, any opportunities for a WG engine
+ // status update to trample it have ended as well.
+ if b.authURL == "" {
+ t.Fatalf("expected authURL to be set")
+ }
+}
+
+func TestWGEngineDownThenUpRace(t *testing.T) {
+ var cc *mockControl
+ b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
+ cc = newClient(t, opts)
+ return cc
+ })
+
+ nw := newNotificationWatcher(t, b, &ipnauth.TestActor{})
+
+ t.Logf("Start")
+ nw.watch(0, []wantedNotification{
+ wantStateNotify(ipn.NeedsLogin)})
+ b.Start(ipn.Options{
+ UpdatePrefs: &ipn.Prefs{
+ WantRunning: true,
+ ControlURL: "https://localhost:1/",
+ },
+ })
+ nw.check()
+
+ t.Logf("LoginFinished")
+ cc.persist.UserProfile.LoginName = "user1"
+ cc.persist.NodeID = "node1"
+
+ nw.watch(0, []wantedNotification{
+ wantStateNotify(ipn.Starting)})
+ cc.send(nil, "", true, &netmap.NetworkMap{
+ SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
+ })
+ nw.check()
+
+ nw.watch(0, []wantedNotification{
+ wantStateNotify(ipn.Running)})
+ b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
+ nw.check()
+
+ t.Logf("Re-auth (StartLoginInteractive)")
+ b.StartLoginInteractive(t.Context())
+
+ var timeLock sync.RWMutex
+ timestamp := b.lastStatusTime
+
+ engineShutdown := make(chan struct{})
+ gotShutdown := sync.OnceFunc(func() {
+ t.Logf("engineShutdown")
+ engineShutdown <- struct{}{}
+ })
+
+ b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
+ timeLock.Lock()
+ if s.AsOf.After(timestamp) {
+ timestamp = s.AsOf
+ }
+ timeLock.Unlock()
+
+ if err != nil || (s.DERPs == 0 && len(s.Peers) == 0) {
+ gotShutdown()
+ } else {
+ b.setWgengineStatus(s, err)
+ }
+ })
+
+ t.Logf("Re-auth (receive URL)")
+ url1 := "https://localhost:1/1"
+
+ done := make(chan struct{})
+ var wg sync.WaitGroup
+
+ wg.Go(func() {
+ t.Log("cc.send starting")
+ cc.send(nil, url1, false, nil) // will block until engine stops
+ t.Log("cc.send returned")
+ })
+
+ <-engineShutdown // will get called once cc.send is blocked
+ gotShutdown = sync.OnceFunc(func() {
+ t.Logf("engineShutdown")
+ engineShutdown <- struct{}{}
+ })
+
+ wg.Go(func() {
+ t.Log("StartLoginInteractive starting")
+ b.StartLoginInteractive(t.Context()) // will also block until engine stops
+ t.Log("StartLoginInteractive returned")
+ })
+
+ <-engineShutdown // will get called once StartLoginInteractive is blocked
+
+ st := controlclient.Status{}
+ st.SetStateForTest(controlclient.StateAuthenticated)
+ b.SetControlClientStatus(cc, st)
+
+ timeLock.RLock()
+ b.setWgengineStatus(&wgengine.Status{AsOf: timestamp}, nil) // engine is down event finally arrives
+ b.setWgengineStatus(&wgengine.Status{AsOf: timestamp, DERPs: 1}, nil) // engine is back up
+ timeLock.RUnlock()
+
+ go func() {
+ wg.Wait()
+ done <- struct{}{}
+ }()
+
+ t.Log("waiting for .send and .StartLoginInteractive to return")
+
+ select {
+ case <-done:
+ case <-time.After(10 * time.Second):
+ t.Fatalf("timed out waiting")
+ }
+
+ t.Log("both returned")
+}
+
func buildNetmapWithPeers(self tailcfg.NodeView, peers ...tailcfg.NodeView) *netmap.NetworkMap {
const (
firstAutoUserID = tailcfg.UserID(10000)