diff options
| author | Fernando Serboncini <fserb@tailscale.com> | 2026-02-04 20:14:48 -0500 |
|---|---|---|
| committer | Fernando Serboncini <fserb@tailscale.com> | 2026-02-05 16:16:29 -0500 |
| commit | 357aa8b6b63484a838821aad150c199214e97340 (patch) | |
| tree | c35c82dad02824e30d89b467f6e675984327b618 | |
| parent | 036b6a12621306da8368b167deb9858d4a8d6ce9 (diff) | |
| download | tailscale-fserb/natlab-flaky.tar.xz tailscale-fserb/natlab-flaky.zip | |
DO NOT SUBMIT: tsnet/natlab flaky investigation (branch: fserb/natlab-flaky)
Signed-off-by: Fernando Serboncini <fserb@tailscale.com>
| -rw-r--r-- | tstest/integration/nat/nat_test.go | 50 | ||||
| -rw-r--r-- | tstest/natlab/vnet/conf.go | 40 | ||||
| -rw-r--r-- | tstest/natlab/vnet/vnet.go | 14 |
3 files changed, 82 insertions, 22 deletions
diff --git a/tstest/integration/nat/nat_test.go b/tstest/integration/nat/nat_test.go index 2aea7c296..98dd8f854 100644 --- a/tstest/integration/nat/nat_test.go +++ b/tstest/integration/nat/nat_test.go @@ -119,9 +119,13 @@ func v6cidr(n int) string { func easy(c *vnet.Config) *vnet.Node { n := c.NumNodes() + 1 - return c.AddNode(c.AddNetwork( + nw := c.AddNetwork( fmt.Sprintf("2.%d.%d.%d", n, n, n), // public IP - fmt.Sprintf("192.168.%d.1/24", n), vnet.EasyNAT)) + fmt.Sprintf("192.168.%d.1/24", n), vnet.EasyNAT) + if n == 2 { + nw.SetDERPLatency(500 * time.Millisecond) + } + return c.AddNode(nw) } func easyAnd6(c *vnet.Config) *vnet.Node { @@ -415,7 +419,7 @@ func (nt *natTest) runTest(addNode ...addNodeFunc) pingRoute { return "" } - pingRes, err := ping(ctx, clients[0], sts[1].Self.TailscaleIPs[0]) + pingRes, err := ping(ctx, t, clients[0], sts[1].Self.TailscaleIPs[0]) if err != nil { t.Fatalf("ping failure: %v", err) } @@ -450,14 +454,16 @@ const ( routeNil pingRoute = "nil" // *ipnstate.PingResult is nil ) -func ping(ctx context.Context, c *vnet.NodeAgentClient, target netip.Addr) (*ipnstate.PingResult, error) { +func ping_old(ctx context.Context, t testing.TB, c *vnet.NodeAgentClient, target netip.Addr) (*ipnstate.PingResult, error) { n := 0 var res *ipnstate.PingResult anyPong := false for n < 10 { n++ + t.Logf("ping_old attempt %d to %v ...", n, target) pr, err := c.PingWithOpts(ctx, target, tailcfg.PingDisco, tailscale.PingOpts{}) if err != nil { + t.Logf("ping_old attempt %d error: %v", n, err) if anyPong { return res, nil } @@ -467,8 +473,10 @@ func ping(ctx context.Context, c *vnet.NodeAgentClient, target netip.Addr) (*ipn return nil, errors.New(pr.Err) } if pr.DERPRegionID == 0 { + t.Logf("ping_old attempt %d: direct (endpoint %v, latency %v)", n, pr.Endpoint, pr.LatencySeconds) return pr, nil } + t.Logf("ping_old attempt %d: via DERP region %d (latency %v)", n, pr.DERPRegionID, pr.LatencySeconds) res = pr select { case <-ctx.Done(): @@ -481,6 +489,40 
@@ func ping(ctx context.Context, c *vnet.NodeAgentClient, target netip.Addr) (*ipn return res, nil } +func ping(ctx context.Context, t testing.TB, c *vnet.NodeAgentClient, target netip.Addr) (*ipnstate.PingResult, error) { + var lastRes *ipnstate.PingResult + for n := range 10 { + t.Logf("ping attempt %d to %v ...", n+1, target) + pingCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + pr, err := c.PingWithOpts(pingCtx, target, tailcfg.PingDisco, tailscale.PingOpts{}) + cancel() + if err != nil { + t.Logf("ping attempt %d error: %v", n+1, err) + if ctx.Err() != nil { + break + } + continue + } + if pr.Err != "" { + return nil, errors.New(pr.Err) + } + t.Logf("ping attempt %d: derp=%d endpoint=%v latency=%v", n+1, pr.DERPRegionID, pr.Endpoint, pr.LatencySeconds) + if pr.DERPRegionID == 0 { + return pr, nil + } + lastRes = pr + select { + case <-ctx.Done(): + return lastRes, nil + case <-time.After(time.Second): + } + } + if lastRes != nil { + return lastRes, nil + } + return nil, fmt.Errorf("no ping response (ctx: %v)", ctx.Err()) +} + func up(ctx context.Context, c *vnet.NodeAgentClient) error { req, err := http.NewRequestWithContext(ctx, "GET", "http://unused/up", nil) if err != nil { diff --git a/tstest/natlab/vnet/conf.go b/tstest/natlab/vnet/conf.go index 3f83e35c0..ca9127cc3 100644 --- a/tstest/natlab/vnet/conf.go +++ b/tstest/natlab/vnet/conf.go @@ -282,8 +282,9 @@ type Network struct { svcs set.Set[NetworkService] - latency time.Duration // latency applied to interface writes - lossRate float64 // chance of packet loss (0.0 to 1.0) + latency time.Duration // latency applied to interface writes + lossRate float64 // chance of packet loss (0.0 to 1.0) + derpLatency time.Duration // extra latency for DERP-related packets // ... 
err error // carried error @@ -304,6 +305,10 @@ func (n *Network) SetPacketLoss(rate float64) { n.lossRate = rate } +func (n *Network) SetDERPLatency(d time.Duration) { + n.derpLatency = d +} + // SetBlackholedIPv4 sets whether the network should blackhole all IPv4 traffic // out to the Internet. (DHCP etc continues to work on the LAN.) func (n *Network) SetBlackholedIPv4(v bool) { @@ -372,21 +377,22 @@ func (s *Server) initFromConfig(c *Config) error { conf.lanIP4 = netip.MustParsePrefix("192.168.0.0/24") } n := &network{ - num: conf.num, - s: s, - mac: conf.mac, - portmap: conf.svcs.Contains(NATPMP), // TODO: expand network.portmap - wanIP6: conf.wanIP6, - v4: conf.lanIP4.IsValid(), - v6: conf.wanIP6.IsValid(), - wanIP4: conf.wanIP4, - lanIP4: conf.lanIP4, - breakWAN4: conf.breakWAN4, - latency: conf.latency, - lossRate: conf.lossRate, - nodesByIP4: map[netip.Addr]*node{}, - nodesByMAC: map[MAC]*node{}, - logf: logger.WithPrefix(s.logf, fmt.Sprintf("[net-%v] ", conf.mac)), + num: conf.num, + s: s, + mac: conf.mac, + portmap: conf.svcs.Contains(NATPMP), // TODO: expand network.portmap + wanIP6: conf.wanIP6, + v4: conf.lanIP4.IsValid(), + v6: conf.wanIP6.IsValid(), + wanIP4: conf.wanIP4, + lanIP4: conf.lanIP4, + breakWAN4: conf.breakWAN4, + latency: conf.latency, + lossRate: conf.lossRate, + derpLatency: conf.derpLatency, + nodesByIP4: map[netip.Addr]*node{}, + nodesByMAC: map[MAC]*node{}, + logf: logger.WithPrefix(s.logf, fmt.Sprintf("[net-%v] ", conf.mac)), } netOfConf[conf] = n s.networks.Add(n) diff --git a/tstest/natlab/vnet/vnet.go b/tstest/natlab/vnet/vnet.go index 357fe213c..e01be9148 100644 --- a/tstest/natlab/vnet/vnet.go +++ b/tstest/natlab/vnet/vnet.go @@ -268,12 +268,22 @@ func (n *network) handleIPPacketFromGvisor(ipRaw []byte) { return } if nw, ok := n.writers.Load(node.mac); ok { - nw.write(resPkt) + if d := n.derpLatency; d > 0 && n.isDERPPacket(flow.src) { + pkt := make([]byte, len(resPkt)) + copy(pkt, resPkt) + time.AfterFunc(d, func() { 
nw.write(pkt) }) + } else { + nw.write(resPkt) + } } else { n.logf("gvisor write: no writeFunc for %v", node.mac) } } +func (n *network) isDERPPacket(ip netip.Addr) bool { + return fakeDERP1.Match(ip) || fakeDERP2.Match(ip) +} + func netaddrIPFromNetstackIP(s tcpip.Address) netip.Addr { switch s.Len() { case 4: @@ -435,6 +445,7 @@ func (n *network) serveLogCatcherConn(clientRemoteIP netip.Addr, c net.Conn) { for _, lg := range logs { tStr := lg.Logtail.Client_Time.Round(time.Millisecond).Format(time.RFC3339Nano) fmt.Fprintf(&node.logBuf, "[%v] %s\n", tStr, lg.Text) + n.s.logf("[%v] %s: %s", node, tStr, lg.Text) } } }) @@ -520,6 +531,7 @@ type network struct { breakWAN4 bool // break WAN IPv4 connectivity latency time.Duration // latency applied to interface writes lossRate float64 // probability of dropping a packet (0.0 to 1.0) + derpLatency time.Duration // extra latency for DERP-related packets nodesByIP4 map[netip.Addr]*node // by LAN IPv4 nodesByMAC map[MAC]*node logf func(format string, args ...any) |
