diff options
| author | James Tucker <james@tailscale.com> | 2026-04-22 12:40:18 -0700 |
|---|---|---|
| committer | James Tucker <james@tailscale.com> | 2026-04-22 12:41:43 -0700 |
| commit | 12c114b158a94137915e822226a4cc9b4a47079f (patch) | |
| tree | 277a5a06e5e710f351e4bf1437a07701e54a5455 | |
| parent | d7916d4369bd2b880281e38edac2f600d5ab2673 (diff) | |
| download | tailscale-raggi/startupprof.tar.xz tailscale-raggi/startupprof.zip | |
cmd/{startupprof,tailscaled}: add startup profiling toolsraggi/startupprof
I was running some end to end tests and decided I wanted to know why
tailscaled takes longer than I expect to startup even when talking to
solely local resources. This helped me get a few answers.
| -rw-r--r-- | cmd/startupprof/startupprof.go | 784 | ||||
| -rw-r--r-- | cmd/tailscaled/startupprof.go | 269 | ||||
| -rw-r--r-- | cmd/tailscaled/tailscaled.go | 22 |
3 files changed, 1075 insertions, 0 deletions
diff --git a/cmd/startupprof/startupprof.go b/cmd/startupprof/startupprof.go new file mode 100644 index 000000000..55083ba8d --- /dev/null +++ b/cmd/startupprof/startupprof.go @@ -0,0 +1,784 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +// Program startupprof is a harness for profiling tailscaled startup time. +// +// It spins up an in-process testcontrol server + DERP + STUN, and then +// forks one or more tailscaled processes against it. One process ("peer") +// joins first and is driven to the Running state; the peer exists only so +// that the process we actually care about — the "target" — has a real peer +// to disco with, to measure time-to-first-peer-communication. +// +// The target process is launched with TS_STARTUPPROF_* envknobs that cause +// tailscaled to write a runtime/trace, CPU profile, heap profile, and a +// machine-readable phase-timing file (see cmd/tailscaled/startupprof.go). +// +// The harness additionally enables GODEBUG=inittrace=1 so that Go's +// per-package init() cost is logged to stderr, and summarizes the top +// init-time offenders. +// +// Typical usage: +// +// go build -o /tmp/tailscaled ./cmd/tailscaled +// go run ./cmd/startupprof -tailscaled=/tmp/tailscaled -out=/tmp/tsprof +// go tool trace /tmp/tsprof/target-trace.out +// go tool pprof -http=: /tmp/tsprof/target-cpu.pprof +// cat /tmp/tsprof/target-phases.txt +// +// The report printed at the end shows wall-clock times for each phase and +// the init-time breakdown parsed from GODEBUG=inittrace. +package main + +import ( + "bufio" + "context" + "errors" + "flag" + "fmt" + "io" + "log" + "net" + "net/http" + "net/http/httptest" + "net/netip" + "os" + "os/exec" + "path/filepath" + "sort" + "strconv" + "strings" + "sync" + "syscall" + "testing" + "time" + + "tailscale.com/client/local" + "tailscale.com/ipn" + "tailscale.com/ipn/ipnstate" + "tailscale.com/tailcfg" + "tailscale.com/tstest/integration" + "tailscale.com/tstest/integration/testcontrol" + "tailscale.com/types/logger" +) + +var ( + flagTailscaled = flag.String("tailscaled", "", "path to tailscaled binary (required)") + flagOut = flag.String("out", "", "output directory for trace/pprof/phases files (required)") + flagSkipPeer = flag.Bool("skip-peer", false, "don't spin up a peer; measure only to BackendState=Running (no first-ping metric)") + flagKeepAlive = flag.Bool("keep-alive", false, "after measuring, leave the target process running so you can poke at it") + flagTimeout = flag.Duration("timeout", 60*time.Second, "overall timeout for a single run") + flagRuns = flag.Int("runs", 1, "number of back-to-back measurement runs") + flagVerbose = flag.Bool("v", false, "verbose: stream daemon stderr") + flagCachedNetmap = flag.Bool("cached-netmap", false, "test client-side netmap caching: testcontrol grants NodeAttrCacheNetworkMaps, target uses persistent state, first 'prime' run populates the cache, then -runs measurement runs reuse it.") +) + +func main() { + flag.Parse() + if *flagTailscaled == "" || *flagOut == "" { + fmt.Fprintln(os.Stderr, "usage: startupprof -tailscaled=PATH -out=DIR [flags]") + flag.PrintDefaults() + os.Exit(2) + } + if _, err := os.Stat(*flagTailscaled); err != nil { + log.Fatalf("tailscaled not found at %q: %v", *flagTailscaled, err) + } + if err := os.MkdirAll(*flagOut, 0o755); err != nil { + log.Fatalf("mkdir out: %v", err) + } + + tb := &fakeTB{} + + // Spin up the fake control plane + DERP + STUN in-process. + derpMap := integration.RunDERPAndSTUN(tb, logger.Discard, "127.0.0.1") + control := &testcontrol.Server{ + DERPMap: derpMap, + DNSConfig: &tailcfg.DNSConfig{ + Proxied: true, + }, + MagicDNSDomain: "tail-scale.ts.net", + } + if *flagCachedNetmap { + // Grant NodeAttrCacheNetworkMaps so the target writes + reads a + // disk netmap cache between runs. + // DefaultNodeCapabilities replaces the default cap map entirely. + caps := tailcfg.NodeCapMap{ + tailcfg.NodeAttrCacheNetworkMaps: []tailcfg.RawMessage{}, + tailcfg.CapabilityHTTPS: []tailcfg.RawMessage{}, + tailcfg.NodeAttrFunnel: []tailcfg.RawMessage{}, + tailcfg.CapabilityFileSharing: []tailcfg.RawMessage{}, + tailcfg.CapabilityFunnelPorts + "?ports=8080,443": []tailcfg.RawMessage{}, + } + control.DefaultNodeCapabilities = &caps + log.Printf("cached-netmap mode: testcontrol grants NodeAttrCacheNetworkMaps") + } + control.HTTPTestServer = httptest.NewUnstartedServer(control) + control.HTTPTestServer.Start() + defer control.HTTPTestServer.Close() + controlURL := control.HTTPTestServer.URL + log.Printf("testcontrol URL: %s", controlURL) + log.Printf("DERP: %+v", derpMap.Regions[1].Nodes[0]) + + // Optionally, bring up a peer tailscaled. This one is *not* profiled; it + // exists only so the target has a real peer to ping. + var peerIP netip.Addr + if !*flagSkipPeer { + p, err := launchDaemon(daemonOpts{ + binary: *flagTailscaled, + outDir: *flagOut, + controlURL: controlURL, + name: "peer", + authKey: "peer-key", + verbose: *flagVerbose, + // peer is NOT profiled + }) + if err != nil { + log.Fatalf("launch peer: %v", err) + } + defer p.shutdown() + + ctx, cancel := context.WithTimeout(context.Background(), *flagTimeout) + defer cancel() + if err := p.waitReady(ctx); err != nil { + log.Fatalf("peer waitReady: %v", err) + } + st, err := p.lc.Status(ctx) + if err != nil { + log.Fatalf("peer status: %v", err) + } + if len(st.TailscaleIPs) == 0 { + log.Fatalf("peer has no tailscale IP") + } + peerIP = st.TailscaleIPs[0] + log.Printf("peer up at %v", peerIP) + } + + // Target state dir. + // - In normal mode: fresh temp dir per run (ephemeral-style, though we + // use file state since cache needs varRoot; cache is simply absent). + // - In cached-netmap mode: single persistent state dir reused across + // runs. Prime run (run 0) populates the cache; runs 1..N measure + // reuse. + targetStateDir := "" + if *flagCachedNetmap { + targetStateDir = filepath.Join(*flagOut, "target-state") + if err := os.RemoveAll(targetStateDir); err != nil { + log.Fatalf("reset target state dir: %v", err) + } + if err := os.MkdirAll(targetStateDir, 0o700); err != nil { + log.Fatalf("create target state dir: %v", err) + } + log.Printf("cached-netmap mode: persistent target state dir: %s", targetStateDir) + log.Printf("---- prime run (not measured) ----") + if err := runOnce(controlURL, peerIP, "target-prime", targetStateDir, false); err != nil { + log.Fatalf("prime run: %v", err) + } + } + + for run := 1; run <= *flagRuns; run++ { + log.Printf("==== run %d of %d ====", run, *flagRuns) + name := "target" + if *flagRuns > 1 { + name = fmt.Sprintf("target-run%d", run) + } + if err := runOnce(controlURL, peerIP, name, targetStateDir, true); err != nil { + log.Fatalf("run %d: %v", run, err) + } + } +} + +// runOnce launches a target tailscaled and measures startup to Running + first +// ping. If reusedStateDir is non-empty, it's used (persistent across runs); +// otherwise a fresh temp dir is created. If measure is false, no report is +// generated (used for the prime run in cached-netmap mode). +func runOnce(controlURL string, peerIP netip.Addr, name, reusedStateDir string, measure bool) error { + opts := daemonOpts{ + binary: *flagTailscaled, + outDir: *flagOut, + controlURL: controlURL, + name: name, + authKey: "target-key", + verbose: *flagVerbose, + profile: measure, + stateDir: reusedStateDir, + } + d, err := launchDaemon(opts) + if err != nil { + return fmt.Errorf("launch target: %w", err) + } + if !*flagKeepAlive { + defer d.shutdown() + } + + t0 := d.startWall + + // tSockUp: moment safesocket is listening. + tSockUp, err := d.waitSocket(*flagTimeout) + if err != nil { + return fmt.Errorf("waiting for socket: %w", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), *flagTimeout) + defer cancel() + + // tRunning: moment BackendState == Running (netmap applied, at least + // one live DERP or peer endpoint). + // tFirstNetMap: moment any non-nil NetMap has been applied internally + // (from disk cache OR from control). With netmap caching enabled, this + // should fire much earlier than tRunning. + tRunning, tFirstNetMap, err := d.waitRunningAndNetMap(ctx) + if err != nil { + return fmt.Errorf("waiting for Running: %w", err) + } + + // tFirstPing: first successful disco ping to the peer. + var tFirstPing time.Time + if peerIP.IsValid() { + pr, err := pingUntilSuccess(ctx, d.lc, peerIP, 20*time.Second) + if err != nil { + return fmt.Errorf("ping %v: %w", peerIP, err) + } + tFirstPing = time.Now() + log.Printf("[%s] first ping to %v ok: via=%s latency=%v", + name, peerIP, pr.DERPRegionCode, pr.LatencySeconds) + } + + if !measure { + // Cleanly shutdown to persist state for the next run. + d.shutdown() + return nil + } + + // Give tailscaled a moment to flush the trace/pprof files (state_running + // triggers the stop, which writes out files and closes them). + waitForProfFiles(d.profilePaths, 2*time.Second) + + report := buildReport(d, t0, tSockUp, tFirstNetMap, tRunning, tFirstPing) + fmt.Println(report) + + summaryPath := filepath.Join(*flagOut, name+"-summary.txt") + if err := os.WriteFile(summaryPath, []byte(report), 0o644); err != nil { + return fmt.Errorf("write summary: %w", err) + } + log.Printf("wrote summary: %s", summaryPath) + log.Printf(" trace: %s", d.profilePaths.trace) + log.Printf(" cpuprof: %s", d.profilePaths.cpu) + log.Printf(" memprof: %s", d.profilePaths.mem) + log.Printf(" phases: %s", d.profilePaths.phases) + log.Printf(" stderr: %s", d.stderrPath) + + if *flagKeepAlive { + log.Printf("--keep-alive set; leaving target running. Ctrl-C to exit.") + select {} + } + return nil +} + +// -------- daemon management -------- + +type daemonOpts struct { + binary string + outDir string + controlURL string + name string + authKey string + verbose bool + profile bool // if true, enable TS_STARTUPPROF_* envknobs + stateDir string // if non-empty, use this as --statedir (persistent across invocations); otherwise a fresh tempdir +} + +type profilePaths struct { + trace string + cpu string + mem string + phases string +} + +type daemon struct { + opts daemonOpts + cmd *exec.Cmd + stateDir string + sockPath string + stderrPath string + startWall time.Time + lc *local.Client + profilePaths profilePaths + stderrMu sync.Mutex + stderrLines []string // for inittrace capture + stderrFile *os.File + shutdownOnce sync.Once +} + +func launchDaemon(opts daemonOpts) (*daemon, error) { + stateDir := opts.stateDir + persistent := stateDir != "" + if !persistent { + var err error + stateDir, err = os.MkdirTemp("", "tsprof-"+opts.name+"-") + if err != nil { + return nil, err + } + } + // Use a short socket path (unix sockets have length limits: 104 on + // macOS, 108 on Linux). Always put sockets under /tmp with a short + // name to stay under even the tightest limit. + sockPath := filepath.Join(os.TempDir(), "tsprof-"+opts.name+".sock") + + args := []string{ + "--tun=userspace-networking", + "--statedir=" + stateDir, + "--socket=" + sockPath, + "--socks5-server=localhost:0", + "--no-logs-no-support", // disable logtail upload + } + if !persistent { + // Ephemeral (random node key per invocation). Triggers + // controlclient.LoginEphemeral in cmd/tailscaled. + args = append(args, "--state=mem:") + } else { + // Persistent state; tailscaled defaults the state file to + // <statedir>/tailscaled.state when --state is unset. We want + // the cache directory (<statedir>/profile-data/...) to survive + // across invocations, so we don't use mem:. + } + cmd := exec.Command(opts.binary, args...) + + env := os.Environ() + env = append(env, + "TS_CONTROL_URL="+opts.controlURL, // respected for initial prefs default? we set prefs explicitly below anyway + "TS_PANIC_IF_HIT_MAIN_CONTROL=1", + "TS_DISABLE_PORTMAPPER=1", + "TS_ASSUME_NETWORK_UP_FOR_TEST=1", + "GODEBUG=inittrace=1", + ) + + var paths profilePaths + if opts.profile { + paths = profilePaths{ + trace: filepath.Join(opts.outDir, opts.name+"-trace.out"), + cpu: filepath.Join(opts.outDir, opts.name+"-cpu.pprof"), + mem: filepath.Join(opts.outDir, opts.name+"-mem.pprof"), + phases: filepath.Join(opts.outDir, opts.name+"-phases.txt"), + } + env = append(env, + "TS_STARTUPPROF_TRACE="+paths.trace, + "TS_STARTUPPROF_CPUPROF="+paths.cpu, + "TS_STARTUPPROF_MEMPROF="+paths.mem, + "TS_STARTUPPROF_PHASES="+paths.phases, + "TS_STARTUPPROF_MAX_SECS=60", + ) + } + cmd.Env = env + + // Capture stderr to file + a ring of recent lines for inittrace parsing. + stderrPath := filepath.Join(opts.outDir, opts.name+"-stderr.log") + stderrFile, err := os.Create(stderrPath) + if err != nil { + return nil, err + } + d := &daemon{ + opts: opts, + cmd: cmd, + stateDir: stateDir, + sockPath: sockPath, + stderrPath: stderrPath, + profilePaths: paths, + stderrFile: stderrFile, + } + // Use an os.Pipe so cmd.Start() doesn't spawn its own copy goroutine + // (which adds scheduling latency before we first get control back). + pr, pw, err := os.Pipe() + if err != nil { + stderrFile.Close() + return nil, err + } + cmd.Stderr = pw + cmd.Stdout = pw + go d.pumpStderr(pr) + + log.Printf("[%s] starting %s %s", opts.name, opts.binary, strings.Join(args, " ")) + t0 := time.Now() + d.startWall = t0 + if err := cmd.Start(); err != nil { + stderrFile.Close() + pw.Close() + return nil, err + } + // Close our copy of the write end: only the child holds it open now, + // so pumpStderr's reader will see EOF when the child exits. + pw.Close() + if opts.verbose { + log.Printf("[%s] cmd.Start() returned in %v", opts.name, time.Since(t0)) + } + + d.lc = &local.Client{ + Socket: sockPath, + UseSocketOnly: true, + } + return d, nil +} + +func (d *daemon) pumpStderr(r io.Reader) { + sc := bufio.NewScanner(r) + sc.Buffer(make([]byte, 64*1024), 1024*1024) + for sc.Scan() { + line := sc.Text() + d.stderrMu.Lock() + d.stderrLines = append(d.stderrLines, line) + d.stderrMu.Unlock() + fmt.Fprintln(d.stderrFile, line) + if d.opts.verbose { + fmt.Fprintf(os.Stderr, "[%s] %s\n", d.opts.name, line) + } + } +} + +func (d *daemon) waitSocket(timeout time.Duration) (time.Time, error) { + // NOTE: we intentionally use net.Dial directly here and NOT + // safesocket.ConnectContext, because the latter has a 250ms backoff + // whenever the daemon appears to still be starting up (see + // safesocket/safesocket.go:62). For a latency benchmark that would + // artificially add 250ms to our measurements. + deadline := time.Now().Add(timeout) + attempts := 0 + for { + attempts++ + c, err := net.Dial("unix", d.sockPath) + if err == nil { + c.Close() + if d.opts.verbose { + log.Printf("[%s] socket up after %d attempts in %v", d.opts.name, attempts, time.Since(d.startWall)) + } + return time.Now(), nil + } + if time.Now().After(deadline) { + return time.Time{}, fmt.Errorf("timeout waiting for socket %q: %v", d.sockPath, err) + } + time.Sleep(500 * time.Microsecond) + } +} + +// waitReady calls Start() with prefs + authkey and waits until Running. +func (d *daemon) waitReady(ctx context.Context) error { + if _, err := d.waitSocket(10 * time.Second); err != nil { + return err + } + _, _, err := d.waitRunningAndNetMap(ctx) + return err +} + +// waitRunningAndNetMap drives the state machine by calling lc.Start() +// (which is idempotent in effect) and watches the bus for both the first +// non-nil NetMap notification AND the transition to BackendState=Running. +// +// The first-netmap moment is valuable because with client-side netmap +// caching, it can occur *before* the first control round-trip completes +// (the cached netmap is applied synchronously inside lc.Start). It's the +// earliest point at which the node has peer information and can initiate +// disco connections. +// +// Returns (tRunning, tFirstNetMap, err). Either time may be zero if the +// corresponding event was not observed (e.g. we missed the netmap delivery +// because it arrived before our WatchIPNBus subscription did). +func (d *daemon) waitRunningAndNetMap(ctx context.Context) (tRunning, tFirstNetMap time.Time, err error) { + // Subscribe first so we catch the initial-state burst, which (with + // NotifyInitialNetMap) delivers the current netmap if one is already + // applied at the time of subscription. This covers the case where a + // cached netmap was loaded in lc.Start before we even started watching. + const mask = ipn.NotifyInitialState | ipn.NotifyInitialNetMap + watcher, werr := d.lc.WatchIPNBus(ctx, mask) + if werr != nil { + return time.Time{}, time.Time{}, fmt.Errorf("WatchIPNBus: %w", werr) + } + defer watcher.Close() + + // Now call Start. + prefs := ipn.NewPrefs() + prefs.ControlURL = d.opts.controlURL + prefs.WantRunning = true + prefs.Hostname = d.opts.name + _ = d.lc.Start(ctx, ipn.Options{ + UpdatePrefs: prefs, + AuthKey: d.opts.authKey, + }) + // StartLoginInteractive kicks the control client's auth routine, which + // will use the AuthKey set above. Without this, a fresh tailscaled + // stays at NeedsLogin forever. + _ = d.lc.StartLoginInteractive(ctx) + + for { + n, nerr := watcher.Next() + if nerr != nil { + return tRunning, tFirstNetMap, nerr + } + if n.ErrMessage != nil { + return tRunning, tFirstNetMap, errors.New(*n.ErrMessage) + } + if n.NetMap != nil && tFirstNetMap.IsZero() { + tFirstNetMap = time.Now() + } + if n.State != nil && *n.State == ipn.Running { + tRunning = time.Now() + return tRunning, tFirstNetMap, nil + } + } +} + +func (d *daemon) shutdown() { + d.shutdownOnce.Do(func() { + if d.cmd.Process != nil { + _ = d.cmd.Process.Signal(syscall.SIGTERM) + done := make(chan struct{}) + go func() { d.cmd.Wait(); close(done) }() + select { + case <-done: + case <-time.After(3 * time.Second): + _ = d.cmd.Process.Kill() + <-done + } + } + if d.stderrFile != nil { + d.stderrFile.Close() + } + }) +} + +func pingUntilSuccess(ctx context.Context, lc *local.Client, ip netip.Addr, maxWait time.Duration) (*ipnstate.PingResult, error) { + deadline := time.Now().Add(maxWait) + var lastErr error + for time.Now().Before(deadline) { + pingCtx, cancel := context.WithTimeout(ctx, 3*time.Second) + pr, err := lc.Ping(pingCtx, ip, tailcfg.PingDisco) + cancel() + if err == nil && pr != nil && pr.Err == "" { + return pr, nil + } + if err != nil { + lastErr = err + } else if pr != nil && pr.Err != "" { + lastErr = errors.New(pr.Err) + } + time.Sleep(100 * time.Millisecond) + } + if lastErr == nil { + lastErr = errors.New("ping deadline") + } + return nil, lastErr +} + +func waitForProfFiles(p profilePaths, max time.Duration) { + if p.phases == "" { + return + } + deadline := time.Now().Add(max) + for time.Now().Before(deadline) { + if fi, err := os.Stat(p.phases); err == nil && fi.Size() > 0 { + return + } + time.Sleep(50 * time.Millisecond) + } +} + +// -------- reporting -------- + +func buildReport(d *daemon, tFork, tSockUp, tFirstNetMap, tRunning, tFirstPing time.Time) string { + var b strings.Builder + fmt.Fprintf(&b, "===== startupprof: %s =====\n", d.opts.name) + fmt.Fprintf(&b, "tailscaled: %s\n", d.opts.binary) + fmt.Fprintf(&b, "\nWall-clock milestones (from fork):\n") + fmt.Fprintf(&b, " %-28s %8s\n", "milestone", "ms") + row := func(name string, t time.Time) { + if t.IsZero() { + return + } + fmt.Fprintf(&b, " %-28s %8.1f\n", name, float64(t.Sub(tFork).Microseconds())/1000.0) + } + row("fork->socket_listening", tSockUp) + row("fork->first_netmap_applied", tFirstNetMap) + row("fork->backend_running", tRunning) + row("fork->first_peer_ping", tFirstPing) + + // Phases from the traced process itself (measured from earliest possible + // time inside the process: startupprof.go's procStart). + if ph, err := readPhases(d.profilePaths.phases); err == nil && len(ph) > 0 { + fmt.Fprintf(&b, "\nIn-process phases (from procStart, earliest time.Now in process):\n") + fmt.Fprintf(&b, " %-28s %8s %8s\n", "phase", "ms@", "delta") + var prev time.Duration + for i, p := range ph { + var delta time.Duration + if i > 0 { + delta = p.offset - prev + } + fmt.Fprintf(&b, " %-28s %8.1f %8.1f\n", p.name, + float64(p.offset.Microseconds())/1000.0, + float64(delta.Microseconds())/1000.0) + prev = p.offset + } + } + + // inittrace from GODEBUG=inittrace=1. + d.stderrMu.Lock() + stderrSnap := append([]string(nil), d.stderrLines...) + d.stderrMu.Unlock() + if itr, total := extractInitTrace(stderrSnap); len(itr) > 0 { + fmt.Fprintf(&b, "\nTop package init costs (GODEBUG=inittrace=1):\n") + fmt.Fprintf(&b, " %-50s %9s %10s %10s\n", "package", "own_ms", "cumul_ms", "alloc_KB") + for _, it := range itr { + fmt.Fprintf(&b, " %-50s %9.3f %10.2f %10d\n", + it.pkg, it.clockMs, it.cpuMs, it.allocBytes/1024) + } + fmt.Fprintf(&b, " %-50s %9.3f %10.2f %10d\n", total.pkg, total.clockMs, total.cpuMs, total.allocBytes/1024) + fmt.Fprintf(&b, " (own_ms = this init func's wall time; cumul_ms = elapsed since program start)\n") + } + + return b.String() +} + +type phase struct { + offset time.Duration + name string +} + +func readPhases(path string) ([]phase, error) { + if path == "" { + return nil, errors.New("no path") + } + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + sc := bufio.NewScanner(f) + var out []phase + for sc.Scan() { + line := sc.Text() + tab := strings.IndexByte(line, '\t') + if tab < 0 { + continue + } + ns, err := strconv.ParseInt(line[:tab], 10, 64) + if err != nil { + continue + } + out = append(out, phase{offset: time.Duration(ns), name: line[tab+1:]}) + } + return out, nil +} + +type initRecord struct { + pkg string + clockMs float64 + cpuMs float64 + allocBytes int64 +} + +// extractInitTrace parses Go's GODEBUG=inittrace=1 lines out of captured +// stderr. Each line looks like: +// +// init internal/bytealg @0.008 ms, 0.004 ms clock, 0 bytes, 0 allocs +// init runtime @0.15 ms, 0.12 ms clock, 0 bytes, 0 allocs +// init errors @0.47 ms, 0.003 ms clock, 0 bytes, 0 allocs +// +// Returns (topBy Y clock, totalAcrossAll). Each returned record's clockMs is +// the init function's own wall time (Y), not the cumulative @X. +func extractInitTrace(lines []string) (top []initRecord, total initRecord) { + var out []initRecord + var maxAt float64 + for _, line := range lines { + const pfx = "init " + i := strings.Index(line, pfx) + if i < 0 { + continue + } + rest := line[i+len(pfx):] + // "pkg @X ms, Y ms clock, Z bytes, N allocs" + at := strings.Index(rest, " @") + if at < 0 { + continue + } + pkg := rest[:at] + tail := rest[at+2:] + // "X ms, Y ms clock, Z bytes, N allocs" + fields := strings.Split(tail, ",") + if len(fields) < 3 { + continue + } + atMs, ok := parseNumberWithSuffix(strings.TrimSpace(fields[0]), "ms") + if !ok { + continue + } + clockMs, ok := parseNumberWithSuffix(strings.TrimSpace(fields[1]), "ms clock") + if !ok { + continue + } + // fields[2] is "Z bytes" + alloc, _ := parseNumberWithSuffix(strings.TrimSpace(fields[2]), "bytes") + if atMs > maxAt { + maxAt = atMs + } + total.clockMs += clockMs + total.allocBytes += int64(alloc) + out = append(out, initRecord{ + pkg: pkg, + clockMs: clockMs, + cpuMs: atMs, // repurposed: show @X ms (cumulative offset) for context + allocBytes: int64(alloc), + }) + } + total.pkg = "TOTAL (all init funcs)" + total.cpuMs = maxAt + // Sort by clock time desc, then return top 30. + sort.Slice(out, func(i, j int) bool { return out[i].clockMs > out[j].clockMs }) + if len(out) > 30 { + out = out[:30] + } + return out, total +} + +func parseNumberWithSuffix(s, suffix string) (float64, bool) { + s = strings.TrimSuffix(s, suffix) + s = strings.TrimSpace(s) + v, err := strconv.ParseFloat(s, 64) + if err != nil { + return 0, false + } + return v, true +} + +// -------- fakeTB (needed by integration.RunDERPAndSTUN) -------- + +type fakeTB struct{ *testing.T } + +func (*fakeTB) Cleanup(_ func()) {} +func (*fakeTB) Error(args ...any) { + log.Print(args...) +} +func (*fakeTB) Errorf(format string, args ...any) { + log.Printf(format, args...) +} +func (*fakeTB) Fail() { log.Fatal("fail") } +func (*fakeTB) FailNow() { log.Fatal("failnow") } +func (*fakeTB) Failed() bool { return false } +func (*fakeTB) Fatal(args ...any) { + log.Fatal(args...) +} +func (*fakeTB) Fatalf(format string, args ...any) { + log.Fatalf(format, args...) +} +func (*fakeTB) Helper() {} +func (*fakeTB) Log(args ...any) { + log.Print(args...) +} +func (*fakeTB) Logf(format string, args ...any) { + log.Printf(format, args...) +} +func (*fakeTB) Name() string { return "startupprof" } +func (*fakeTB) Setenv(_, _ string) {} +func (*fakeTB) Skip(_ ...any) {} +func (*fakeTB) SkipNow() {} +func (*fakeTB) Skipf(_ string, _ ...any) {} +func (*fakeTB) Skipped() bool { return false } +func (*fakeTB) TempDir() string { d, _ := os.MkdirTemp("", "tb"); return d } +func (*fakeTB) Deadline() (time.Time, bool) { return time.Time{}, false } + +// unused but required for iface: +var _ = http.StatusOK diff --git a/cmd/tailscaled/startupprof.go b/cmd/tailscaled/startupprof.go new file mode 100644 index 000000000..627e2e927 --- /dev/null +++ b/cmd/tailscaled/startupprof.go @@ -0,0 +1,269 @@ +// Copyright (c) Tailscale Inc & contributors +// SPDX-License-Identifier: BSD-3-Clause + +//go:build go1.23 + +package main + +// This file implements opt-in, envknob-gated startup profiling for tailscaled. +// It is intended to be used by the cmd/startupprof harness (and anyone +// debugging tailscaled startup latency). +// +// It is safe to leave compiled in: when none of the TS_STARTUPPROF_* envknobs +// are set, everything here is a fast no-op. +// +// The controls are: +// +// TS_STARTUPPROF_TRACE=/path/to/trace.out +// Start runtime/trace.Start at process entry (as early as possible in +// main, before any other work) and write to the given path. The trace +// will be stopped automatically when the backend reaches ipn.Running, +// or after TS_STARTUPPROF_MAX_SECS (default 30s), whichever is first. +// +// TS_STARTUPPROF_CPUPROF=/path/to/cpu.pprof +// Same, but for runtime/pprof.StartCPUProfile. +// +// TS_STARTUPPROF_MEMPROF=/path/to/mem.pprof +// Write a heap profile when the trace is stopped. +// +// TS_STARTUPPROF_MAX_SECS=<int> +// Maximum time to leave profilers running. Default: 30. +// +// TS_STARTUPPROF_PHASES=/path/to/phases.txt +// Append wall-clock phase timings (measured from process entry) to +// this file. Machine-readable: "<nanoseconds>\t<phase name>\n". +// +// The phase marker API (markPhase) is also available without any envknob; +// when no phases file is configured, markPhase emits a runtime/trace.Log +// event (cheap; no-op if tracing disabled) and records nothing else. +// +// Design goal: zero dependencies on logtail, logger, or anything that is +// itself part of the startup cost being measured. + +import ( + "context" + "fmt" + "os" + "runtime" + "runtime/pprof" + "runtime/trace" + "sync" + "sync/atomic" + "time" + + "tailscale.com/ipn" + "tailscale.com/ipn/ipnlocal" +) + +// procStart is the earliest time we can record: the value of time.Now() at +// init() in this file. This captures cost incurred by other init funcs that +// run after ours, plus all of main. It is not perfect (Go init order is +// source-file lexical within a package) but is adequate for our purposes. +// +// NOTE: placing procStart in a var with an initializer at package scope +// guarantees it is assigned before any init() or main(). We also set it +// again in init() as a backstop. +var procStart = time.Now() + +// startupProf holds runtime state for the profiler. All fields are only +// accessed from the single startup goroutine + the state-change notifier. +var startupProf struct { + enabled atomic.Bool + stopOnce sync.Once + stoppedAt atomic.Int64 // unix nanos; 0 until stopped + traceFile *os.File + cpuFile *os.File + memPath string + phasesPath string + + // phases is a ring of recorded phases, appended to the phases file on stop. + phasesMu sync.Mutex + phases []phaseRecord +} + +type phaseRecord struct { + offset time.Duration + name string +} + +func init() { + // Backstop: ensure procStart reflects a very early moment even if the + // initializer above was bypassed for some reason. + if procStart.IsZero() { + procStart = time.Now() + } + // If TS_STARTUPPROF_DEBUG is set, log wall-clock timestamps (in ns + // since unix epoch) at each phase marker so we can correlate inside + // vs outside-the-process timings. + if os.Getenv("TS_STARTUPPROF_DEBUG") != "" { + startupProfDebug = true + } +} + +var startupProfDebug bool + +// startupProfileStart starts any configured profilers. Safe to call exactly +// once, as early in main as possible. It is not safe to call concurrently. +// +// It returns without error if no profilers are configured; callers should +// still call it unconditionally. +func startupProfileStart() { + tracePath := os.Getenv("TS_STARTUPPROF_TRACE") + cpuPath := os.Getenv("TS_STARTUPPROF_CPUPROF") + memPath := os.Getenv("TS_STARTUPPROF_MEMPROF") + phasesPath := os.Getenv("TS_STARTUPPROF_PHASES") + + if tracePath == "" && cpuPath == "" && memPath == "" && phasesPath == "" { + return + } + startupProf.enabled.Store(true) + startupProf.memPath = memPath + startupProf.phasesPath = phasesPath + + if tracePath != "" { + f, err := os.Create(tracePath) + if err != nil { + fmt.Fprintf(os.Stderr, "startupprof: create trace %q: %v\n", tracePath, err) + } else if err := trace.Start(f); err != nil { + fmt.Fprintf(os.Stderr, "startupprof: trace.Start: %v\n", err) + f.Close() + } else { + startupProf.traceFile = f + } + } + if cpuPath != "" { + f, err := os.Create(cpuPath) + if err != nil { + fmt.Fprintf(os.Stderr, "startupprof: create cpuprof %q: %v\n", cpuPath, err) + } else { + // Bump sample rate from the default 100Hz to 1000Hz since + // startup is typically <100ms and the default gives almost + // no samples. Must be called before StartCPUProfile. + runtime.SetCPUProfileRate(1000) + if err := pprof.StartCPUProfile(f); err != nil { + fmt.Fprintf(os.Stderr, "startupprof: StartCPUProfile: %v\n", err) + f.Close() + } else { + startupProf.cpuFile = f + } + } + } + + // Record process start as the first phase. + markPhase("startupprof_init") + + // Safety net: stop after TS_STARTUPPROF_MAX_SECS regardless of state. + maxSecs := 30 + if v := os.Getenv("TS_STARTUPPROF_MAX_SECS"); v != "" { + if n, err := parsePositiveInt(v); err == nil { + maxSecs = n + } + } + go func() { + time.Sleep(time.Duration(maxSecs) * time.Second) + startupProfileStop("max_time") + }() +} + +// markPhase records a named phase. It is extremely cheap when profiling is +// disabled (one atomic load). When profiling IS enabled, it emits a +// runtime/trace Log event and appends to the phases slice for later +// write-out. +func markPhase(name string) { + if startupProfDebug { + // unix-time ns for external correlation. + fmt.Fprintf(os.Stderr, "TSSPD %d %s\n", time.Now().UnixNano(), name) + } + if !startupProf.enabled.Load() { + return + } + // trace.Log requires a non-nil context. + trace.Log(context.Background(), "phase", name) + rec := phaseRecord{ + offset: time.Since(procStart), + name: name, + } + startupProf.phasesMu.Lock() + startupProf.phases = append(startupProf.phases, rec) + startupProf.phasesMu.Unlock() +} + +// startupProfileWatchLocalBackend spawns a goroutine that watches the given +// LocalBackend and stops all profilers when it reaches ipn.Running. +// +// Safe to call with a nil lb (no-op). Safe to call multiple times. +func startupProfileWatchLocalBackend(lb *ipnlocal.LocalBackend) { + if !startupProf.enabled.Load() || lb == nil { + return + } + go func() { + // WatchNotifications delivers an immediate NotifyInitialState. + lb.WatchNotifications(context.Background(), ipn.NotifyInitialState, nil, func(n *ipn.Notify) bool { + if n.State != nil && *n.State == ipn.Running { + markPhase("state_running") + startupProfileStop("state_running") + return false + } + return true + }) + }() +} + +// startupProfileStop stops all profilers, writes outputs, and is idempotent. +// reason is recorded as the final phase. +func startupProfileStop(reason string) { + startupProf.stopOnce.Do(func() { + markPhase("stop_" + reason) + startupProf.stoppedAt.Store(time.Now().UnixNano()) + + // Stop CPU profile BEFORE the trace, because trace.Stop itself + // does an expensive stack-dump pass that would otherwise be the + // dominant entry in the CPU profile. + if startupProf.cpuFile != nil { + pprof.StopCPUProfile() + startupProf.cpuFile.Close() + } + if startupProf.traceFile != nil { + trace.Stop() + startupProf.traceFile.Close() + } + if startupProf.memPath != "" { + if f, err := os.Create(startupProf.memPath); err == nil { + runtime.GC() + pprof.WriteHeapProfile(f) + f.Close() + } + } + if startupProf.phasesPath != "" { + writePhases(startupProf.phasesPath) + } + }) +} + +func writePhases(path string) { + f, err := os.Create(path) + if err != nil { + fmt.Fprintf(os.Stderr, "startupprof: create phases %q: %v\n", path, err) + return + } + defer f.Close() + startupProf.phasesMu.Lock() + defer startupProf.phasesMu.Unlock() + for _, p := range startupProf.phases { + fmt.Fprintf(f, "%d\t%s\n", p.offset.Nanoseconds(), p.name) + } +} + +func parsePositiveInt(s string) (int, error) { + n := 0 + if s == "" { + return 0, fmt.Errorf("empty") + } + for _, c := range s { + if c < '0' || c > '9' { + return 0, fmt.Errorf("bad digit %q", c) + } + n = n*10 + int(c-'0') + } + return n, nil +} diff --git a/cmd/tailscaled/tailscaled.go b/cmd/tailscaled/tailscaled.go index fe18731ae..0aa9dca45 100644 --- a/cmd/tailscaled/tailscaled.go +++ b/cmd/tailscaled/tailscaled.go @@ -179,6 +179,11 @@ var ( type proxyStartFunc = func(logf logger.Logf, dialer *tsdial.Dialer) func main() { + // Start the optional startup profiler as early as possible. This is a + // no-op unless TS_STARTUPPROF_* envknobs are set. See startupprof.go. + startupProfileStart() + markPhase("main_enter") + envknob.PanicIfAnyEnvCheckedInInit() if shouldRunCLI() { beCLI() @@ -399,11 +404,13 @@ var logPol *logpolicy.Policy // or nil if not used var debugMux *http.ServeMux func run() (err error) { + markPhase("run_enter") var logf logger.Logf = log.Printf // Install an event bus as early as possible, so that it's // available universally when setting up everything else. sys := tsd.NewSystem() + markPhase("tsd_new_system") sys.SocketPath = args.socketpath // Parse config, if specified, to fail early if it's invalid. @@ -425,6 +432,7 @@ func run() (err error) { } sys.Set(netMon) } + markPhase("netmon_new") var publicLogID logid.PublicID if buildfeatures.HasLogTail { @@ -445,6 +453,7 @@ func run() (err error) { pol.Shutdown(ctx) }() } + markPhase("logpolicy_new") if err := envknob.ApplyDiskConfigError(); err != nil { log.Printf("Error reading environment config: %v", err) @@ -473,6 +482,7 @@ func run() (err error) { // crashed, and would for example restore system DNS configuration. dns.CleanUp(logf, netMon, sys.Bus.Get(), sys.HealthTracker.Get(), args.tunname) router.CleanUp(logf, netMon, args.tunname) + markPhase("dns_router_cleanup") // If the cleanUp flag was passed, then exit. if args.cleanUp { return nil @@ -509,10 +519,12 @@ var sigPipe os.Signal // set by sigpipe.go // logID may be the zero value if logging is not in use. func startIPNServer(ctx context.Context, logf logger.Logf, logID logid.PublicID, sys *tsd.System) error { + markPhase("startIPNServer_enter") ln, err := safesocket.Listen(args.socketpath) if err != nil { return fmt.Errorf("safesocket.Listen: %v", err) } + markPhase("safesocket_listen") ctx, cancel := context.WithCancel(ctx) defer cancel() @@ -570,6 +582,7 @@ func startIPNServer(ctx context.Context, logf logger.Logf, logID logid.PublicID, lb, err := getLocalBackend(ctx, logf, logID, sys) if err == nil { logf("got LocalBackend in %v", time.Since(t0).Round(time.Millisecond)) + markPhase("got_local_backend") if lb.Prefs().Valid() { if err := lb.Start(ipn.Options{}); err != nil { logf("LocalBackend.Start: %v", err) @@ -578,8 +591,11 @@ func startIPNServer(ctx context.Context, logf logger.Logf, logID logid.PublicID, cancel() return } + markPhase("lb_start_returned") } srv.SetLocalBackend(lb) + markPhase("srv_set_local_backend") + startupProfileWatchLocalBackend(lb) close(wgEngineCreated) return } @@ -624,6 +640,7 @@ func getLocalBackend(ctx context.Context, logf logger.Logf, logID logid.PublicID if err != nil { return nil, fmt.Errorf("createEngine: %w", err) } + markPhase("createEngine_returned") if onlyNetstack && !buildfeatures.HasNetstack { return nil, errors.New("userspace-networking support is not compiled in to this binary") } @@ -641,6 +658,7 @@ func getLocalBackend(ctx context.Context, logf logger.Logf, logID logid.PublicID return nil, fmt.Errorf("newNetstack: %w", err) } } + markPhase("netstack_created") if startProxy != nil { go startProxy(logf, dialer) @@ -662,15 +680,18 @@ func getLocalBackend(ctx context.Context, logf logger.Logf, logID logid.PublicID ht.SetUnhealthy(ipn.StateStoreHealth, health.Args{health.ArgError: err.Error()}) } sys.Set(store) + markPhase("store_new") if w, ok := sys.Tun.GetOK(); ok { w.Start() } + markPhase("tun_start") lb, err := ipnlocal.NewLocalBackend(logf, logID, sys, opts.LoginFlags) if err != nil { return nil, fmt.Errorf("ipnlocal.NewLocalBackend: %w", err) } + markPhase("new_local_backend") lb.SetVarRoot(opts.VarRoot) if logPol != nil { lb.SetLogFlusher(logPol.Logtail.StartFlush) @@ -687,6 +708,7 @@ func getLocalBackend(ctx context.Context, logf logger.Logf, logID logid.PublicID log.Fatalf("failed to start netstack: %v", err) } } + markPhase("netstack_started") if buildfeatures.HasTPM && args.hardwareAttestation.v { lb.SetHardwareAttested() } |
