summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGesa Stupperich <gesa@tailscale.com>2025-11-11 13:12:00 +0000
committerGesa Stupperich <gesa@tailscale.com>2025-11-11 13:36:26 +0000
commit9f3da7ab26af7445dae0b3a74e4ba2ec46cbd52a (patch)
tree2451d95452c333900d448bb3ea3f756e57d41fbc
parent1eba5b0cbdf044b5a3a45fc5372f240865fb8ca3 (diff)
downloadtailscale-gesa/ssh-client-session-monitoring.tar.xz
tailscale-gesa/ssh-client-session-monitoring.zip
ssh/tailssh: explore client connection monitoringgesa/ssh-client-session-monitoring
Run a connection monitor that periodically pings the SSH client while a session is being recorded. If several consecutive pings fail, close the recording and then cancel the connection. This is one way to ensure that session recordings get flushed promptly when using S3 multi-part upload. The timeouts and the consecutive-failure threshold are hardcoded because this is just an experiment. Fixes tailscale.com/corp#33968 Signed-off-by: Gesa Stupperich <gesa@tailscale.com>
-rw-r--r--ssh/tailssh/tailssh.go54
1 files changed, 54 insertions, 0 deletions
diff --git a/ssh/tailssh/tailssh.go b/ssh/tailssh/tailssh.go
index 7d12ab45f..40f376da9 100644
--- a/ssh/tailssh/tailssh.go
+++ b/ssh/tailssh/tailssh.go
@@ -32,6 +32,7 @@ import (
gossh "golang.org/x/crypto/ssh"
"tailscale.com/envknob"
"tailscale.com/ipn/ipnlocal"
+ "tailscale.com/ipn/ipnstate"
"tailscale.com/net/tsaddr"
"tailscale.com/net/tsdial"
"tailscale.com/sessionrecording"
@@ -76,6 +77,7 @@ type ipnLocalBackend interface {
Dialer() *tsdial.Dialer
TailscaleVarRoot() string
NodeKey() key.NodePublic
+ Ping(ctx context.Context, ip netip.Addr, pingType tailcfg.PingType, size int) (*ipnstate.PingResult, error)
}
type server struct {
@@ -834,6 +836,7 @@ func (c *conn) detachSession(ss *sshSession) {
}
var errSessionDone = errors.New("session is done")
+var errClientUnreachable = errors.New("client is unreachable")
// handleSSHAgentForwarding starts a Unix socket listener and in the background
// forwards agent connections between the listener and the ssh.Session.
@@ -954,6 +957,57 @@ func (ss *sshSession) run() {
ss.logf("startNewRecording: <nil>")
if rec != nil {
defer rec.Close()
+
+ ping := func() bool {
+ clientIP := ss.conn.info.src.Addr()
+
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+ defer cancel()
+
+ _, err := ss.conn.srv.lb.Ping(ctx, clientIP, tailcfg.PingICMP, 0)
+ if err != nil {
+ ss.logf("pinging SSH client %s failed: %v", clientIP, err)
+ return false
+ }
+
+ ss.logf("pinging SSH client %s successful", clientIP)
+ return true
+ }
+
+ go func() {
+ ss.logf("starting connection monitor for session %s", ss.sharedID)
+ ticker := time.NewTicker(15 * time.Second)
+ defer ticker.Stop()
+
+ consecutiveFailures := 0
+ const maxFailures = 3
+
+ for {
+ select {
+ case <-ss.ctx.Done():
+ ss.logf("session terminated, closing recording: %v", context.Cause(ss.ctx))
+ rec.Close()
+ return
+
+ case <-ticker.C:
+ pong := ping()
+ if pong {
+ consecutiveFailures = 0
+ ss.logf("connection test passed for session %s", ss.sharedID)
+ } else {
+ consecutiveFailures++
+ ss.logf("connection test failed (%d/%d) for session %s", consecutiveFailures, maxFailures, ss.sharedID)
+
+ if consecutiveFailures >= maxFailures {
+ ss.logf("connection lost (connection test failed %d times), closing recording", maxFailures)
+ ss.cancelCtx(errClientUnreachable)
+ rec.Close()
+ return
+ }
+ }
+ }
+ }
+ }()
}
}
}