summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorJames Tucker <james@tailscale.com>2024-09-25 16:33:59 -0700
committerJames Tucker <james@tailscale.com>2024-09-25 16:44:05 -0700
commit0b4d702a543079453e6e59de7d340299b5ca89ed (patch)
treedf0c5baaf85241480d0f07d5cd242c903ce6e41d
parent9eb59c72c13d062482686afc8ad46f891c0c3d74 (diff)
downloadtailscale-raggi/eperm-health.tar.xz
tailscale-raggi/eperm-health.zip
wgengine/magicsock: report health warnings when blocked by firewallsraggi/eperm-health
macOS and Linux both return EPERM when sendto(2) is blocked by the firewall. Sometimes these blocks and transient, in the case of a fault between EDR software and a kernel, and at other times this may be a persistent state. Report a health warning for the state, and rebind only up to once every 15m in order to avoid excess workload. Updates #11710 Updates #12891 Updates #13511 Signed-off-by: James Tucker <james@tailscale.com>
-rw-r--r--net/netcheck/netcheck.go2
-rw-r--r--net/neterror/neterror.go35
-rw-r--r--net/neterror/neterror_test.go (renamed from net/neterror/neterror_linux_test.go)4
-rw-r--r--net/portmapper/portmapper.go6
-rw-r--r--wgengine/magicsock/magicsock.go61
-rw-r--r--wgengine/magicsock/magicsock_test.go17
6 files changed, 52 insertions, 73 deletions
diff --git a/net/netcheck/netcheck.go b/net/netcheck/netcheck.go
index 003b5fbf8..e3c6f0346 100644
--- a/net/netcheck/netcheck.go
+++ b/net/netcheck/netcheck.go
@@ -1470,7 +1470,7 @@ func (rs *reportState) runProbe(ctx context.Context, dm *tailcfg.DERPMap, probe
}
n, err := rs.c.SendPacket(req, addr)
- if n == len(req) && err == nil || neterror.TreatAsLostUDP(err) {
+ if n == len(req) && err == nil || neterror.IsEPERM(err) {
rs.mu.Lock()
switch probe.proto {
case probeIPv4:
diff --git a/net/neterror/neterror.go b/net/neterror/neterror.go
index e2387440d..553f6f774 100644
--- a/net/neterror/neterror.go
+++ b/net/neterror/neterror.go
@@ -7,38 +7,19 @@ package neterror
import (
"errors"
"fmt"
- "runtime"
"syscall"
)
var errEPERM error = syscall.EPERM // box it into interface just once
-// TreatAsLostUDP reports whether err is an error from a UDP send
-// operation that should be treated as a UDP packet that just got
-// lost.
-//
-// Notably, on Linux this reports true for EPERM errors (from outbound
-// firewall blocks) which aren't really send errors; they're just
-// sends that are never going to make it because the local OS blocked
-// it.
-func TreatAsLostUDP(err error) bool {
- if err == nil {
- return false
- }
- switch runtime.GOOS {
- case "linux":
- // Linux, while not documented in the man page,
- // returns EPERM when there's an OUTPUT rule with -j
- // DROP or -j REJECT. We use this very specific
- // Linux+EPERM check rather than something super broad
- // like net.Error.Temporary which could be anything.
- //
- // For now we only do this on Linux, as such outgoing
- // firewall violations mapping to syscall errors
- // hasn't yet been observed on other OSes.
- return errors.Is(err, errEPERM)
- }
- return false
+// IsEPERM returns true if the error is or wraps EPERM.
+func IsEPERM(err error) bool {
+ // Linux and macOS, while not documented in the man page, returns EPERM when
+ // there's a rule rejecting matching sendto(2) destinations.
+ //
+ // We use this very specific Linux+EPERM check rather than something super
+ // broad like net.Error.Temporary which could be anything.
+ return errors.Is(err, errEPERM)
}
var packetWasTruncated func(error) bool // non-nil on Windows at least
diff --git a/net/neterror/neterror_linux_test.go b/net/neterror/neterror_test.go
index 5b9906074..ec2632166 100644
--- a/net/neterror/neterror_linux_test.go
+++ b/net/neterror/neterror_test.go
@@ -11,7 +11,7 @@ import (
"testing"
)
-func TestTreatAsLostUDP(t *testing.T) {
+func TestIsEPERM(t *testing.T) {
tests := []struct {
name string
err error
@@ -45,7 +45,7 @@ func TestTreatAsLostUDP(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- if got := TreatAsLostUDP(tt.err); got != tt.want {
+ if got := IsEPERM(tt.err); got != tt.want {
t.Errorf("got = %v; want %v", got, tt.want)
}
})
diff --git a/net/portmapper/portmapper.go b/net/portmapper/portmapper.go
index 7cdca1fb3..55acb1f31 100644
--- a/net/portmapper/portmapper.go
+++ b/net/portmapper/portmapper.go
@@ -613,7 +613,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
// Only do PCP mapping in the case when PMP did not appear to be available recently.
pkt := buildPCPRequestMappingPacket(myIP, localPort, prevPort, pcpMapLifetimeSec, wildcardIP)
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
- if neterror.TreatAsLostUDP(err) {
+ if neterror.IsEPERM(err) {
err = NoMappingError{ErrNoPortMappingServices}
}
return netip.AddrPort{}, err
@@ -622,7 +622,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
// Ask for our external address if needed.
if !m.external.Addr().IsValid() {
if _, err := uc.WriteToUDPAddrPort(pmpReqExternalAddrPacket, pxpAddr); err != nil {
- if neterror.TreatAsLostUDP(err) {
+ if neterror.IsEPERM(err) {
err = NoMappingError{ErrNoPortMappingServices}
}
return netip.AddrPort{}, err
@@ -631,7 +631,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
pkt := buildPMPRequestMappingPacket(localPort, prevPort, pmpMapLifetimeSec)
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
- if neterror.TreatAsLostUDP(err) {
+ if neterror.IsEPERM(err) {
err = NoMappingError{ErrNoPortMappingServices}
}
return netip.AddrPort{}, err
diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go
index aceab3be5..575fcb823 100644
--- a/wgengine/magicsock/magicsock.go
+++ b/wgengine/magicsock/magicsock.go
@@ -20,7 +20,6 @@ import (
"strings"
"sync"
"sync/atomic"
- "syscall"
"time"
"github.com/tailscale/wireguard-go/conn"
@@ -323,6 +322,18 @@ type Conn struct {
staticEndpoints views.Slice[netip.AddrPort]
}
+// sendToBlockedWithEPERMWarning is set when a sendto call returns an error containing EPERM.
+var sendToBlockedWithEPERMWarning = health.Register(&health.Warnable{
+ Code: "firewall-blocking-udp",
+ Severity: health.SeverityMedium,
+ Title: "Tailscale blocked by system firewall",
+ Text: func(args health.Args) string {
+ return "The operating system firewall is blocking UDP sends, preventing direct connections. Try reconfiguring your firewall or checking the configuration of EDR software."
+ },
+ // TODO(raggi): we could do with a state indicating that we'll have degraded connectivity, as in this example we'll likely fail to relayed conns.
+ ImpactsConnectivity: false,
+})
+
// SetDebugLoggingEnabled controls whether spammy debug logging is enabled.
//
// Note that this is currently independent from the log levels, even though
@@ -1122,7 +1133,7 @@ func (c *Conn) sendUDPBatch(addr netip.AddrPort, buffs [][]byte) (sent bool, err
c.logf("magicsock: %s", errGSO.Error())
err = errGSO.RetryErr
} else {
- _ = c.maybeRebindOnError(runtime.GOOS, err)
+ _ = c.maybeRebindOnError(err)
}
}
return err == nil, err
@@ -1137,7 +1148,7 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) {
sent, err = c.sendUDPStd(ipp, b)
if err != nil {
metricSendUDPError.Add(1)
- _ = c.maybeRebindOnError(runtime.GOOS, err)
+ _ = c.maybeRebindOnError(err)
} else {
if sent {
metricSendUDP.Add(1)
@@ -1146,29 +1157,22 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) {
return
}
-// maybeRebindOnError performs a rebind and restun if the error is defined and
-// any conditionals are met.
-func (c *Conn) maybeRebindOnError(os string, err error) bool {
- switch {
- case errors.Is(err, syscall.EPERM):
+// maybeRebindOnError performs a rebind and restun if the error may potentially
+// be fixed by performing a rebind and one has not been performed recently. It
+// returns true if a rebind was performed.
+func (c *Conn) maybeRebindOnError(err error) bool {
+ if neterror.IsEPERM(err) {
+ c.health.SetUnhealthy(sendToBlockedWithEPERMWarning, nil)
+
why := "operation-not-permitted-rebind"
- switch os {
- // We currently will only rebind and restun on a syscall.EPERM if it is experienced
- // on a client running darwin.
- // TODO(charlotte, raggi): expand os options if required.
- case "darwin":
- // TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
- // EPERMs.
- if c.lastEPERMRebind.Load().Before(time.Now().Add(-5 * time.Second)) {
- c.logf("magicsock: performing %q", why)
- c.lastEPERMRebind.Store(time.Now())
- c.Rebind()
- go c.ReSTUN(why)
- return true
- }
- default:
- c.logf("magicsock: not performing %q", why)
- return false
+ // TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
+ // EPERMs.
+ if c.lastEPERMRebind.Load().Before(time.Now().Add(-15 * time.Minute)) {
+ c.logf("magicsock: performing %q", why)
+ c.lastEPERMRebind.Store(time.Now())
+ c.Rebind()
+ go c.ReSTUN(why)
+ return true
}
}
return false
@@ -1201,12 +1205,14 @@ func (c *Conn) sendUDPStd(addr netip.AddrPort, b []byte) (sent bool, err error)
switch {
case addr.Addr().Is4():
_, err = c.pconn4.WriteToUDPAddrPort(b, addr)
- if err != nil && (c.noV4.Load() || neterror.TreatAsLostUDP(err)) {
+ if err != nil && (c.noV4.Load() || neterror.IsEPERM(err)) {
+ c.maybeRebindOnError(err)
return false, nil
}
case addr.Addr().Is6():
_, err = c.pconn6.WriteToUDPAddrPort(b, addr)
- if err != nil && (c.noV6.Load() || neterror.TreatAsLostUDP(err)) {
+ if err != nil && (c.noV6.Load() || neterror.IsEPERM(err)) {
+ c.maybeRebindOnError(err)
return false, nil
}
default:
@@ -2607,6 +2613,7 @@ func (c *Conn) rebind(curPortFate currentPortFate) error {
// It should be followed by a call to ReSTUN.
func (c *Conn) Rebind() {
metricRebindCalls.Add(1)
+ c.health.SetHealthy(sendToBlockedWithEPERMWarning)
if err := c.rebind(keepCurrentPort); err != nil {
c.logf("%v", err)
return
diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go
index 6b2d961b9..86c181892 100644
--- a/wgengine/magicsock/magicsock_test.go
+++ b/wgengine/magicsock/magicsock_test.go
@@ -2967,29 +2967,20 @@ func TestMaybeRebindOnError(t *testing.T) {
err := fmt.Errorf("outer err: %w", syscall.EPERM)
- t.Run("darwin-rebind", func(t *testing.T) {
+ t.Run("rebind", func(t *testing.T) {
conn := newTestConn(t)
defer conn.Close()
- rebound := conn.maybeRebindOnError("darwin", err)
+ rebound := conn.maybeRebindOnError(err)
if !rebound {
t.Errorf("darwin should rebind on syscall.EPERM")
}
})
- t.Run("linux-not-rebind", func(t *testing.T) {
- conn := newTestConn(t)
- defer conn.Close()
- rebound := conn.maybeRebindOnError("linux", err)
- if rebound {
- t.Errorf("linux should not rebind on syscall.EPERM")
- }
- })
-
t.Run("no-frequent-rebind", func(t *testing.T) {
conn := newTestConn(t)
defer conn.Close()
- conn.lastEPERMRebind.Store(time.Now().Add(-1 * time.Second))
- rebound := conn.maybeRebindOnError("darwin", err)
+ conn.lastEPERMRebind.Store(time.Now().Add(-60 * time.Second))
+ rebound := conn.maybeRebindOnError(err)
if rebound {
t.Errorf("darwin should not rebind on syscall.EPERM within 5 seconds of last")
}