summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorBrad Fitzpatrick <bradfitz@tailscale.com>2021-01-11 19:07:08 -0800
committerBrad Fitzpatrick <bradfitz@tailscale.com>2021-01-11 22:09:09 -0800
commit85e54af0d704f27eb6ccb178f9d90f320fc61fba (patch)
treebf289c6473900c3d32efde801fa27e9af5ce7889
parent5eeaea9ef99c5749bbe66c07741b45c7599cbcb5 (diff)
downloadtailscale-85e54af0d704f27eb6ccb178f9d90f320fc61fba.tar.xz
tailscale-85e54af0d704f27eb6ccb178f9d90f320fc61fba.zip
wgengine: on TCP connect fail/timeout, log some clues about why it failed
So users can see why things aren't working. A start. More diagnostics coming. Updates #1094
-rw-r--r--cmd/tailscale/depaware.txt2
-rw-r--r--cmd/tailscaled/depaware.txt2
-rw-r--r--wgengine/magicsock/magicsock.go36
-rw-r--r--wgengine/pendopen.go157
-rw-r--r--wgengine/userspace.go13
5 files changed, 206 insertions, 4 deletions
diff --git a/cmd/tailscale/depaware.txt b/cmd/tailscale/depaware.txt
index 62b8ad4aa..e8e189677 100644
--- a/cmd/tailscale/depaware.txt
+++ b/cmd/tailscale/depaware.txt
@@ -51,7 +51,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep
tailscale.com/logtail/backoff from tailscale.com/control/controlclient+
tailscale.com/metrics from tailscale.com/derp
tailscale.com/net/dnscache from tailscale.com/control/controlclient+
- tailscale.com/net/flowtrack from tailscale.com/wgengine/filter
+ tailscale.com/net/flowtrack from tailscale.com/wgengine/filter+
💣 tailscale.com/net/interfaces from tailscale.com/cmd/tailscale/cli+
tailscale.com/net/netcheck from tailscale.com/cmd/tailscale/cli+
tailscale.com/net/netns from tailscale.com/control/controlclient+
diff --git a/cmd/tailscaled/depaware.txt b/cmd/tailscaled/depaware.txt
index 9798ad6eb..811052f7d 100644
--- a/cmd/tailscaled/depaware.txt
+++ b/cmd/tailscaled/depaware.txt
@@ -86,7 +86,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
tailscale.com/logtail/filch from tailscale.com/logpolicy
tailscale.com/metrics from tailscale.com/derp
tailscale.com/net/dnscache from tailscale.com/control/controlclient+
- tailscale.com/net/flowtrack from tailscale.com/wgengine/filter
+ tailscale.com/net/flowtrack from tailscale.com/wgengine/filter+
💣 tailscale.com/net/interfaces from tailscale.com/ipn+
tailscale.com/net/netcheck from tailscale.com/wgengine/magicsock
tailscale.com/net/netns from tailscale.com/control/controlclient+
diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go
index c31914a85..d80307e5f 100644
--- a/wgengine/magicsock/magicsock.go
+++ b/wgengine/magicsock/magicsock.go
@@ -710,14 +710,46 @@ func peerForIP(nm *controlclient.NetworkMap, ip netaddr.IP) (n *tailcfg.Node, ok
}
}
}
+
+ // TODO(bradfitz): this is O(n peers). Add ART to netaddr?
+ var best netaddr.IPPrefix
for _, p := range nm.Peers {
for _, cidr := range p.AllowedIPs {
if cidr.Contains(ip) {
- return p, true
+ if best.IsZero() || cidr.Bits > best.Bits {
+ n = p
+ best = cidr
+ }
}
}
}
- return nil, false
+ return n, n != nil
+}
+
+// PeerForIP returns the node that ip should route to.
+func (c *Conn) PeerForIP(ip netaddr.IP) (n *tailcfg.Node, ok bool) {
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ if c.netMap == nil {
+ return
+ }
+ return peerForIP(c.netMap, ip)
+}
+
+// LastRecvActivityOfDisco returns the time we last got traffic from
+// this endpoint (updated every ~10 seconds).
+func (c *Conn) LastRecvActivityOfDisco(dk tailcfg.DiscoKey) time.Time {
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ de, ok := c.endpointOfDisco[dk]
+ if !ok {
+ return time.Time{}
+ }
+ unix := atomic.LoadInt64(&de.lastRecvUnixAtomic)
+ if unix == 0 {
+ return time.Time{}
+ }
+ return time.Unix(unix, 0)
}
// Ping handles a "tailscale ping" CLI query.
diff --git a/wgengine/pendopen.go b/wgengine/pendopen.go
new file mode 100644
index 000000000..dc1a21fec
--- /dev/null
+++ b/wgengine/pendopen.go
@@ -0,0 +1,157 @@
+// Copyright (c) 2021 Tailscale Inc & AUTHORS All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package wgengine
+
+import (
+ "os"
+ "strconv"
+ "time"
+
+ "tailscale.com/net/flowtrack"
+ "tailscale.com/net/packet"
+ "tailscale.com/wgengine/filter"
+ "tailscale.com/wgengine/tstun"
+)
+
+const tcpTimeoutBeforeDebug = 5 * time.Second
+
+// debugConnectFailures reports whether the local node should track
+// outgoing TCP connections and log which ones fail and why.
+func debugConnectFailures() bool {
+ s := os.Getenv("TS_DEBUG_CONNECT_FAILURES")
+ if s == "" {
+ return true
+ }
+ v, _ := strconv.ParseBool(s)
+ return v
+}
+
+type pendingOpenFlow struct {
+ timer *time.Timer // until giving up on the flow
+}
+
+func (e *userspaceEngine) trackOpenPreFilterIn(pp *packet.Parsed, t *tstun.TUN) (res filter.Response) {
+ res = filter.Accept // always
+
+ if pp.IPVersion == 0 ||
+ pp.IPProto != packet.TCP ||
+ pp.TCPFlags&(packet.TCPSyn|packet.TCPRst) == 0 {
+ return
+ }
+
+ flow := flowtrack.Tuple{Dst: pp.Src, Src: pp.Dst} // src/dst reversed
+
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ of, ok := e.pendOpen[flow]
+ if !ok {
+ // Not a tracked flow.
+ return
+ }
+ of.timer.Stop()
+ delete(e.pendOpen, flow)
+
+ if pp.TCPFlags&packet.TCPRst != 0 {
+ // TODO(bradfitz): have peer send a IP proto 99 "why"
+ // packet first with details and log that instead
+ // (e.g. ACL prohibited, shields up, etc).
+ e.logf("open-conn-track: flow %v got RST by peer", flow)
+ return
+ }
+
+ return
+}
+
+func (e *userspaceEngine) trackOpenPostFilterOut(pp *packet.Parsed, t *tstun.TUN) (res filter.Response) {
+ res = filter.Accept // always
+
+ if pp.IPVersion == 0 ||
+ pp.IPProto != packet.TCP ||
+ pp.TCPFlags&packet.TCPSyn == 0 {
+ return
+ }
+
+ flow := flowtrack.Tuple{Src: pp.Src, Dst: pp.Dst}
+ timer := time.AfterFunc(tcpTimeoutBeforeDebug, func() {
+ e.onOpenTimeout(flow)
+ })
+
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ if e.pendOpen == nil {
+ e.pendOpen = make(map[flowtrack.Tuple]*pendingOpenFlow)
+ }
+ if _, dup := e.pendOpen[flow]; dup {
+ // Duplicates are expected when the OS retransmits. Ignore.
+ return
+ }
+ e.pendOpen[flow] = &pendingOpenFlow{timer: timer}
+
+ return filter.Accept
+}
+
+func (e *userspaceEngine) onOpenTimeout(flow flowtrack.Tuple) {
+ e.mu.Lock()
+ if _, ok := e.pendOpen[flow]; !ok {
+ // Not a tracked flow, or already handled & deleted.
+ e.mu.Unlock()
+ return
+ }
+ delete(e.pendOpen, flow)
+ e.mu.Unlock()
+
+ // Diagnose why it might've timed out.
+ n, ok := e.magicConn.PeerForIP(flow.Dst.IP)
+ if !ok {
+ e.logf("open-conn-track: timeout opening %v; no associated peer node", flow)
+ return
+ }
+ if n.DiscoKey.IsZero() {
+ e.logf("open-conn-track: timeout opening %v; peer node %v running pre-0.100", flow, n.Key.ShortString())
+ return
+ }
+ if n.DERP == "" {
+ e.logf("open-conn-track: timeout opening %v; peer node %v not connected to any DERP relay", flow, n.Key.ShortString())
+ return
+ }
+ var lastSeen time.Time
+ if n.LastSeen != nil {
+ lastSeen = *n.LastSeen
+ }
+
+ var ps *PeerStatus
+ if st, err := e.getStatus(); err == nil {
+ for _, v := range st.Peers {
+ if v.NodeKey == n.Key {
+ v := v // copy
+ ps = &v
+ }
+ }
+ } else {
+ e.logf("open-conn-track: timeout opening %v to node %v; failed to get engine status: %v", flow, n.Key.ShortString(), err)
+ return
+ }
+ if ps == nil {
+ e.logf("open-conn-track: timeout opening %v; target node %v in netmap but unknown to wireguard", flow, n.Key.ShortString())
+ return
+ }
+
+ // TODO(bradfitz): figure out what PeerStatus.LastHandshake
+ // is. It appears to be the last time we sent a handshake,
+ // which isn't what I expected. I thought it was when a
+ // handshake completed, which is what I want.
+ _ = ps.LastHandshake
+
+ e.logf("open-conn-track: timeout opening %v to node %v; lastSeen=%v, lastRecv=%v",
+ flow, n.Key.ShortString(),
+ agoOrNever(lastSeen), agoOrNever(e.magicConn.LastRecvActivityOfDisco(n.DiscoKey)))
+}
+
+func agoOrNever(t time.Time) string {
+ if t.IsZero() {
+ return "never"
+ }
+ return time.Since(t).Round(time.Second).String()
+}
diff --git a/wgengine/userspace.go b/wgengine/userspace.go
index 6b7d23526..32ef52d47 100644
--- a/wgengine/userspace.go
+++ b/wgengine/userspace.go
@@ -30,6 +30,7 @@ import (
"tailscale.com/control/controlclient"
"tailscale.com/internal/deepprint"
"tailscale.com/ipn/ipnstate"
+ "tailscale.com/net/flowtrack"
"tailscale.com/net/interfaces"
"tailscale.com/net/packet"
"tailscale.com/net/tsaddr"
@@ -118,6 +119,7 @@ type userspaceEngine struct {
endpoints []string
pingers map[wgkey.Key]*pinger // legacy pingers for pre-discovery peers
linkState *interfaces.State
+ pendOpen map[flowtrack.Tuple]*pendingOpenFlow // see pendopen.go
// Lock ordering: magicsock.Conn.mu, wgLock, then mu.
}
@@ -265,6 +267,17 @@ func newUserspaceEngineAdvanced(conf EngineConfig) (_ Engine, reterr error) {
}
e.tundev.PreFilterOut = e.handleLocalPackets
+ if debugConnectFailures() {
+ if e.tundev.PreFilterIn != nil {
+ return nil, errors.New("unexpected PreFilterIn already set")
+ }
+ e.tundev.PreFilterIn = e.trackOpenPreFilterIn
+ if e.tundev.PostFilterOut != nil {
+ return nil, errors.New("unexpected PostFilterOut already set")
+ }
+ e.tundev.PostFilterOut = e.trackOpenPostFilterOut
+ }
+
// wireguard-go logs as it starts and stops routines.
// Silence those; there are a lot of them, and they're just noise.
allowLogf := func(s string) bool {