summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorVal <valerie@tailscale.com>2023-08-07 23:05:24 +0200
committerVal <valerie@tailscale.com>2023-08-07 23:05:24 +0200
commit0a6ddae0de54e39240bb4f621fdae34d5917ca6d (patch)
treeae90b76336e76f1858adfb1768688f7f5e9f80a4
parent7d18398d7f430b37890845bd7396e6fa4d801ba9 (diff)
downloadtailscale-valscale/ptb.tar.xz
tailscale-valscale/ptb.zip
tmp: introduce wire/user/safe mtuvalscale/ptb
-rw-r--r--net/tstun/mtu.go65
-rw-r--r--net/tstun/mtu_test.go14
-rw-r--r--net/tstun/tun.go2
-rw-r--r--wgengine/magicsock/endpoint.go83
-rw-r--r--wgengine/netstack/netstack.go4
-rw-r--r--wgengine/router/ifconfig_windows.go2
6 files changed, 115 insertions, 55 deletions
diff --git a/net/tstun/mtu.go b/net/tstun/mtu.go
index 2307d47f9..43bbc28ec 100644
--- a/net/tstun/mtu.go
+++ b/net/tstun/mtu.go
@@ -4,15 +4,66 @@ package tstun
import "tailscale.com/envknob"
+// There are several kinds of MTU.
+//
+// On-the-wire MTU: This is what the network device advertises as the
+// maximum packet size available above the physical link layer. This
+// includes IP headers and everything at a higher level. For Ethernet,
+// this is typically 1500 bytes but can be larger or smaller.
+//
+// Tailscale interface MTU: This is what we advertise to userspace as
+// the largest possible packet it can send through the tailscale
+// interface. This is 80 bytes lower than the largest MTU of any
+// interface we can send on; 80 bytes is the size of the headers
+// WireGuard adds (80 for IPv6, 60 for IPv4, but we don't know which
+// it will be so we always subtract 80). E.g. if the largest interface
+// MTU is 1500, we set the tailscale interface MTU to 1420.
+//
+// Peer MTU: The MTU that we have probed for the path to a specific
+// peer's various endpoints. If this is smaller than the advertised
+// tailscale interface MTU, and a packet is larger than the peer MTU,
+// then we generate ICMP Packet Too Big (IPv6) or Fragmentation Needed
+// (IPv4) packets inside tailscale and drop the packet.
+//
+// Historically, we set the tailscale interface MTU to 1280. This
+// means we treated the "on the wire" MTU as 1360. This is now the
+// "Safe" value we use when we do not know what the path MTU is.
+//
+// Internally, we store the peer MTU as the MTU advertised to the user.
+//
+// We have to call these by different names or it is way way too confusing.
+//
+// Wire MTU
+// User MTU
+// Peer MTU
+//
+// What should happen when we set TS_DEBUG_MTU? It should set the
+// interface to that, but we should not assume that the path MTU is
+// this. So distinguish between what we set the interface MTU to and
+// what we assume the path MTU is in the absence of probe information.
+
const (
- maxMTU uint32 = 65536
- defaultMTU uint32 = 1280
+ maxMTU uint32 = 65536
+ wireguardOverhead = 80
+ DefaultUserMTU uint32 = 1280
+ DefaultWireMTU uint32 = 1280 + wireguardOverhead
)
-// DefaultMTU returns either the constant default MTU of 1280, or the value set
-// in TS_DEBUG_MTU clamped to a maximum of 65536.
-func DefaultMTU() uint32 {
- // DefaultMTU is the Tailscale default MTU for now.
+func userMTUToWireMTU(userMTU uint32) uint32 {
+ return userMTU + wireguardOverhead
+}
+
+func wireMTUToUserMTU(wireMTU uint32) uint32 {
+ if wireMTU < wireguardOverhead {
+ return 0
+ }
+ return wireMTU - wireguardOverhead
+}
+
+// TunMTU returns either the constant default user MTU of 1280, or the
+// value set in TS_DEBUG_MTU clamped to a maximum of 65536.
+func TunMTU() uint32 {
+ // TunMTU is the Tailscale default MTU for now.
//
// wireguard-go defaults to 1420 bytes, which only works if the
// "outer" MTU is 1500 bytes. This breaks on DSL connections
@@ -21,7 +72,7 @@ func DefaultMTU() uint32 {
// 1280 is the smallest MTU allowed for IPv6, which is a sensible
// "probably works everywhere" setting until we develop proper PMTU
// discovery.
- tunMTU := defaultMTU
+ tunMTU := DefaultUserMTU
if mtu, ok := envknob.LookupUintSized("TS_DEBUG_MTU", 10, 32); ok {
mtu := uint32(mtu)
if mtu > maxMTU {
diff --git a/net/tstun/mtu_test.go b/net/tstun/mtu_test.go
index f3aea4697..1e01c5b12 100644
--- a/net/tstun/mtu_test.go
+++ b/net/tstun/mtu_test.go
@@ -7,22 +7,22 @@ import (
"testing"
)
-func TestDefaultMTU(t *testing.T) {
+func TestTunMTU(t *testing.T) {
orig := os.Getenv("TS_DEBUG_MTU")
defer os.Setenv("TS_DEBUG_MTU", orig)
os.Setenv("TS_DEBUG_MTU", "")
- if DefaultMTU() != 1280 {
- t.Errorf("DefaultMTU() = %d, want 1280", DefaultMTU())
+ if TunMTU() != 1280 {
+ t.Errorf("TunMTU() = %d, want 1280", TunMTU())
}
os.Setenv("TS_DEBUG_MTU", "9000")
- if DefaultMTU() != 9000 {
- t.Errorf("DefaultMTU() = %d, want 9000", DefaultMTU())
+ if TunMTU() != 9000 {
+ t.Errorf("TunMTU() = %d, want 9000", TunMTU())
}
os.Setenv("TS_DEBUG_MTU", "123456789")
- if DefaultMTU() != maxMTU {
- t.Errorf("DefaultMTU() = %d, want %d", DefaultMTU(), maxMTU)
+ if TunMTU() != maxMTU {
+ t.Errorf("TunMTU() = %d, want %d", TunMTU(), maxMTU)
}
}
diff --git a/net/tstun/tun.go b/net/tstun/tun.go
index b31ffa7ca..0373c7400 100644
--- a/net/tstun/tun.go
+++ b/net/tstun/tun.go
@@ -44,7 +44,7 @@ func New(logf logger.Logf, tunName string) (tun.Device, string, error) {
}
dev, err = createTAP(tapName, bridgeName)
} else {
- dev, err = tun.CreateTUN(tunName, int(DefaultMTU()))
+ dev, err = tun.CreateTUN(tunName, int(TunMTU()))
}
if err != nil {
return nil, "", err
diff --git a/wgengine/magicsock/endpoint.go b/wgengine/magicsock/endpoint.go
index b74082b65..cec2c4dd0 100644
--- a/wgengine/magicsock/endpoint.go
+++ b/wgengine/magicsock/endpoint.go
@@ -146,9 +146,10 @@ type pongReply struct {
pongSrc netip.AddrPort // what they reported they heard
}
-// mtusToProbe are likely MTUs we might see in the wild. They are used
-// by the peer MTU probing code. Set this to a single zero to disable
-// path MTU probing.
+// mtusToProbe are likely on-the-wire MTUs we might see in the
+// wild. They are used by the peer MTU probing code.
+//
+// Set this array to a single zero to disable path MTU probing.
var mtusToProbe = [...]int{
//576, // Smallest MTU for IPv4, probably useless?
//1124, // An observed max mtu in the wild, maybe 1100 instead?
@@ -612,8 +613,8 @@ func (de *endpoint) startDiscoPingLocked(ep netip.AddrPort, now mono.Time, purpo
de.recordAndSendDiscoPingLocked(ep, now, purpose, epDisco.key, size)
} else {
for _, mtu := range mtusToProbe {
- de.c.logf("probing mtu %v with disco message size %v", mtu, mtuToPingSize(ep, mtu))
- de.recordAndSendDiscoPingLocked(ep, now, purpose, epDisco.key, mtuToPingSize(ep, mtu))
+ de.c.logf("probing mtu %v with disco message size %v", mtu, wireMTUToPingSize(ep, mtu))
+ de.recordAndSendDiscoPingLocked(ep, now, purpose, epDisco.key, wireMTUToPingSize(ep, mtu))
}
}
}
@@ -896,34 +897,40 @@ func (de *endpoint) noteConnectivityChange() {
}
// mtuToPingSize takes a desired on-the-wire MTU and calculates the
-// disco ping message size that would produce a packet that is exactly MTU
-// bytes in length.
+// disco ping message size that would produce a packet that is exactly
+// MTU bytes in length including all the headers above the link layer
+// (IP and UDP).
//
-// If mtu is zero, return zero which means don't pad the ping packet at all.
-func mtuToPingSize(ep netip.AddrPort, mtu int) int {
+// Zero return value means don't pad the ping packet at all. An mtu
+// argument of zero or less than the necessary header length results
+// in a zero return value.
+func wireMTUToPingSize(ep netip.AddrPort, mtu int) int {
if mtu == 0 {
return 0
}
- size := mtu
headerLen := ipv4.HeaderLen
if ep.Addr().Is6() {
headerLen = ipv6.HeaderLen
}
headerLen += 8 // UDP header length
- size -= headerLen
- if size < 0 {
+ if mtu < headerLen {
return 0
}
- return size
+ return (mtu - headerLen)
}
-// pingSizeToMTU calculates the minimum path MTU that would permit a
-// disco ping message of sp.size to reach this endpoint. sp.size is
-// the length of the entire disco message.
-func pingSizeToMTU(sp sentPing) int {
+// pingSizeToWireMTU calculates the minimum wire MTU that would permit the
+// specified disco ping message to reach this endpoint. The size
+// recorded in sp.size does not include the IP/UDP headers at the
+// beginning of the disco message.
+//
+// If sp.size is zero, that means the ping was not padded at all and
+// the MTU was not tested, in which case return the largest safe
+// on-the-wire MTU.
+func pingSizeToWireMTU(sp sentPing) int {
mtu := sp.size
if mtu == 0 {
- return int(tstun.DefaultMTU())
+ return int(tstun.DefaultWireMTU)
}
headerLen := ipv4.HeaderLen
if sp.to.Addr().Is6() {
@@ -933,9 +940,9 @@ func pingSizeToMTU(sp sentPing) int {
return mtu + headerLen
}
-// pingSizeToExternalMTU calculates the path MTU as perceived by the
-// layer above Tailscale - that is, how much room for data there is
-// after accounting for WireGuard overhead.
+// pingSizeToUserMTU calculates the minimum MTU on the tailscale
+// interface that would permit this ping to reach this endpoint. It is
+// the size of the on-the-wire MTU minus the Wireguard overhead:
//
// - 20-byte IPv4 header or 40 byte IPv6 header
// - 8-byte UDP header
@@ -943,21 +950,23 @@ func pingSizeToMTU(sp sentPing) int {
// - 4-byte key index
// - 8-byte nonce
// - 16-byte authentication tag
+//
+// We have to assume IPv6 because we give the same number to everyone
+// when we set the external interface MTU.
const wgHeaderLen = 4 + 4 + 8 + 16
-func pingSizeToExternalMTU(sp sentPing) int {
- mtu := sp.size
- if mtu == 0 {
- mtu = int(tstun.DefaultMTU())
+func pingSizeToUserMTU(sp sentPing) int {
+ size := sp.size
+ if size == 0 {
+ return int(tstun.DefaultUserMTU)
}
// The size stored in the sentPing already has the IP/UDP
// headers removed. Now remove the Wireguard overhead.
- mtu -= wgHeaderLen
- if mtu < 0 {
- mtu = 0
+ if size < wgHeaderLen {
+ return 0
}
- return mtu
+ return size - wgHeaderLen
}
// Update MTU-related metrics. Should be called with Conn.mu held.
@@ -965,7 +974,7 @@ func updateMTUMetricsLocked(sp sentPing, logf logger.Logf) {
if sp.size == 0 {
return
}
- mtu := pingSizeToExternalMTU(sp)
+ mtu := pingSizeToUserMTU(sp)
if metricHighestPeerMTU.Value() < int64(mtu) {
metricHighestPeerMTU.Set(int64(mtu))
logf("\n\n\nhighest MTU %v\n\n\n", mtu)
@@ -977,29 +986,29 @@ func (c *Conn) PathMTU(dst netip.Addr) int {
// TODO(s): this is method is pretty expensive. Reduce lookups before
// removing the envknob guard.
if !debugPMTUD() {
- return int(tstun.DefaultMTU())
+ return int(tstun.TunMTU())
}
peer, ok := c.netMap.PeerByTailscaleIP(dst)
if !ok {
- return int(tstun.DefaultMTU())
+ return int(tstun.TunMTU())
}
c.mu.Lock()
defer c.mu.Unlock()
if c.closed {
- return int(tstun.DefaultMTU())
+ return int(tstun.TunMTU())
}
ep, ok := c.peerMap.endpointForNodeKey(peer.Key)
if !ok {
- return int(tstun.DefaultMTU())
+ return int(tstun.TunMTU())
}
now := mono.Now()
if !ep.bestAddr.AddrPort.IsValid() || now.After(ep.trustBestAddrUntil) {
// We have not done the disco pings yet. ep.send() will kick that off
// down the line.
- return int(tstun.DefaultMTU())
+ return int(tstun.TunMTU())
}
return ep.bestAddr.mtu
@@ -1044,7 +1053,7 @@ func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src netip
}
if sp.purpose != pingHeartbeat {
- de.c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got pong tx=%x latency=%v mtu=%v pong.src=%v%v", de.c.discoShort, de.discoShort(), de.publicKey.ShortString(), src, m.TxID[:6], latency.Round(time.Millisecond), pingSizeToMTU(sp), m.Src, logger.ArgWriter(func(bw *bufio.Writer) {
+ de.c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got pong tx=%x latency=%v mtu=%v pong.src=%v%v", de.c.discoShort, de.discoShort(), de.publicKey.ShortString(), src, m.TxID[:6], latency.Round(time.Millisecond), pingSizeToWireMTU(sp), m.Src, logger.ArgWriter(func(bw *bufio.Writer) {
if sp.to != src {
fmt.Fprintf(bw, " ping.to=%v", sp.to)
}
@@ -1060,7 +1069,7 @@ func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src netip
// Promote this pong response to our current best address if it's lower latency.
// TODO(bradfitz): decide how latency vs. preference order affects decision
if !isDerp {
- thisPong := addrQuality{sp.to, latency, pingSizeToMTU(sp)}
+ thisPong := addrQuality{sp.to, latency, pingSizeToWireMTU(sp)}
if betterAddr(thisPong, de.bestAddr) {
de.c.logf("\n\n\nSETTING BEST MTU %v\n\n\n", thisPong.mtu)
de.c.logf("magicsock: disco: node %v %v now using %v mtu %v", de.publicKey.ShortString(), de.discoShort(), sp.to, thisPong.mtu)
diff --git a/wgengine/netstack/netstack.go b/wgengine/netstack/netstack.go
index 0a27f97f2..f2c7e14e7 100644
--- a/wgengine/netstack/netstack.go
+++ b/wgengine/netstack/netstack.go
@@ -179,7 +179,7 @@ func Create(logf logger.Logf, tundev *tstun.Wrapper, e wgengine.Engine, mc *magi
if tcpipErr != nil {
return nil, fmt.Errorf("could not enable TCP SACK: %v", tcpipErr)
}
- linkEP := channel.New(512, tstun.DefaultMTU(), "")
+ linkEP := channel.New(512, tstun.TunMTU(), "")
if tcpipProblem := ipstack.CreateNIC(nicID, linkEP); tcpipProblem != nil {
return nil, fmt.Errorf("could not create netstack NIC: %v", tcpipProblem)
}
@@ -1044,7 +1044,7 @@ func (ns *Impl) acceptUDP(r *udp.ForwarderRequest) {
func (ns *Impl) handleMagicDNSUDP(srcAddr netip.AddrPort, c *gonet.UDPConn) {
// In practice, implementations are advised not to exceed 512 bytes
// due to fragmenting. Just to be sure, we bump all the way to the MTU.
- var maxUDPReqSize = tstun.DefaultMTU()
+ var maxUDPReqSize = tstun.TunMTU()
// Packets are being generated by the local host, so there should be
// very, very little latency. 150ms was chosen as something of an upper
// bound on resource usage, while hopefully still being long enough for
diff --git a/wgengine/router/ifconfig_windows.go b/wgengine/router/ifconfig_windows.go
index 1cd01eee1..7b3aca423 100644
--- a/wgengine/router/ifconfig_windows.go
+++ b/wgengine/router/ifconfig_windows.go
@@ -241,7 +241,7 @@ func interfaceFromLUID(luid winipcfg.LUID, flags winipcfg.GAAFlags) (*winipcfg.I
var networkCategoryWarning = health.NewWarnable(health.WithMapDebugFlag("warn-network-category-unhealthy"))
func configureInterface(cfg *Config, tun *tun.NativeTun) (retErr error) {
- var mtu = tstun.DefaultMTU()
+ var mtu = tstun.TunMTU()
luid := winipcfg.LUID(tun.LUID())
iface, err := interfaceFromLUID(luid,
// Issue 474: on early boot, when the network is still