summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorJordan Whited <jordan@tailscale.com>2025-12-08 14:51:13 -0800
committerJordan Whited <jordan@tailscale.com>2025-12-08 15:04:53 -0800
commitc3f9d1c22ea5be021a1e0170bcdf218695c67c0c (patch)
tree050af4c6b2fc84a3d74d9af9474619b582d47811
parent6a44990b09b79d5e4fea8283f6baaa5ce6cba87a (diff)
downloadtailscale-jwhited/udprelay-xdp.tar.xz
tailscale-jwhited/udprelay-xdp.zip
net/udprelay: XDP PoCjwhited/udprelay-xdp
do not merge Updates tailscale/corp#34849 Signed-off-by: Jordan Whited <jordan@tailscale.com>
-rw-r--r--net/udprelay/server.go44
-rw-r--r--net/udprelay/xdp/bpf_bpfeb.go131
-rw-r--r--net/udprelay/xdp/bpf_bpfeb.obin0 -> 17144 bytes
-rw-r--r--net/udprelay/xdp/bpf_bpfel.go131
-rw-r--r--net/udprelay/xdp/bpf_bpfel.obin0 -> 17248 bytes
-rw-r--r--net/udprelay/xdp/xdp.c350
-rw-r--r--net/udprelay/xdp/xdp.go48
-rw-r--r--net/udprelay/xdp/xdp_linux.go103
-rw-r--r--net/udprelay/xdp/xdp_notlinux.go18
9 files changed, 819 insertions, 6 deletions
diff --git a/net/udprelay/server.go b/net/udprelay/server.go
index 26b27bb7f..48de1dfc1 100644
--- a/net/udprelay/server.go
+++ b/net/udprelay/server.go
@@ -25,6 +25,7 @@ import (
"golang.org/x/crypto/blake2s"
"golang.org/x/net/ipv6"
"tailscale.com/disco"
+ "tailscale.com/envknob"
"tailscale.com/net/batching"
"tailscale.com/net/netaddr"
"tailscale.com/net/netcheck"
@@ -34,6 +35,7 @@ import (
"tailscale.com/net/stun"
"tailscale.com/net/udprelay/endpoint"
"tailscale.com/net/udprelay/status"
+ "tailscale.com/net/udprelay/xdp"
"tailscale.com/tailcfg"
"tailscale.com/tstime"
"tailscale.com/types/key"
@@ -75,6 +77,7 @@ type Server struct {
wg sync.WaitGroup
closeCh chan struct{}
netChecker *netcheck.Client
+ fib xdp.FIB
mu sync.Mutex // guards the following fields
macSecrets [][blake2s.Size]byte // [0] is most recent, max 2 elements
@@ -140,7 +143,7 @@ func blakeMACFromBindMsg(blakeKey [blake2s.Size]byte, src netip.AddrPort, msg di
return out, nil
}
-func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex int, discoMsg disco.Message, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) {
+func (e *serverEndpoint) handleDiscoControlMsg(logf logger.Logf, fib xdp.FIB, from netip.AddrPort, senderIndex int, discoMsg disco.Message, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) {
if senderIndex != 0 && senderIndex != 1 {
return nil, netip.AddrPort{}
}
@@ -218,6 +221,12 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex
e.boundAddrPorts[senderIndex] = from
e.lastSeen[senderIndex] = time.Now() // record last seen as bound time
e.inProgressGeneration[senderIndex] = 0 // reset to zero, which indicates there is no in-progress handshake
+ if fib != nil && e.boundAddrPorts[0].IsValid() && e.boundAddrPorts[1].IsValid() {
+ err = fib.Upsert(e.vni, e.boundAddrPorts)
+ if err != nil {
+ logf("error upserting fib: %v", err)
+ }
+ }
return nil, netip.AddrPort{}
}
}
@@ -229,7 +238,7 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex
}
}
-func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []byte, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) {
+func (e *serverEndpoint) handleSealedDiscoControlMsg(logf logger.Logf, fib xdp.FIB, from netip.AddrPort, b []byte, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) {
senderRaw, isDiscoMsg := disco.Source(b)
if !isDiscoMsg {
// Not a Disco message
@@ -260,7 +269,7 @@ func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []by
return nil, netip.AddrPort{}
}
- return e.handleDiscoControlMsg(from, senderIndex, discoMsg, serverDisco, macSecrets)
+ return e.handleDiscoControlMsg(logf, fib, from, senderIndex, discoMsg, serverDisco, macSecrets)
}
func (e *serverEndpoint) handleDataPacket(from netip.AddrPort, b []byte, now time.Time) (write []byte, to netip.AddrPort) {
@@ -323,6 +332,17 @@ func NewServer(logf logger.Logf, port uint16, onlyStaticAddrPorts bool) (s *Serv
byVNI: make(map[uint32]*serverEndpoint),
}
s.discoPublic = s.disco.Public()
+ xdpDev := envknob.String("TS_PEER_RELAY_XDP_DEVICE")
+ if xdpDev != "" {
+ s.fib, err = xdp.NewFIB(&xdp.FIBConfig{
+ DstPort: port,
+ DeviceName: xdpDev,
+ })
+ }
+
+ if err != nil {
+ return nil, err
+ }
// TODO(creachadair): Find a way to plumb this in during initialization.
// As-written, messages published here will not be seen by other components
@@ -547,11 +567,11 @@ func trySetUDPSocketOptions(pconn nettype.PacketConn, logf logger.Logf) {
func (s *Server) bindSockets(desiredPort uint16) error {
// maxSocketsPerAF is a conservative starting point, but is somewhat
// arbitrary.
- maxSocketsPerAF := min(16, runtime.NumCPU())
+ maxSocketsPerAF := min(128, runtime.NumCPU())
listenConfig := &net.ListenConfig{
Control: listenControl,
}
- for _, network := range []string{"udp4", "udp6"} {
+ for _, network := range []string{"udp4"} { //, "udp6"} {
SocketsLoop:
for i := range maxSocketsPerAF {
if i > 0 {
@@ -626,6 +646,9 @@ func (s *Server) bindSocketTo(listenConfig *net.ListenConfig, network string, po
// Close closes the server.
func (s *Server) Close() error {
s.closeOnce.Do(func() {
+ if s.fib != nil {
+ s.fib.Close()
+ }
for _, uc4 := range s.uc4 {
uc4.Close()
}
@@ -662,6 +685,15 @@ func (s *Server) endpointGCLoop() {
if v.isExpired(now, s.bindLifetime, s.steadyStateLifetime) {
delete(s.byDisco, k)
delete(s.byVNI, v.vni)
+ // TODO: isExpired only considers userspace counters/liveliness
+ // TODO: this is a syscall per VNI to delete while holding s.mu,
+ // consider batch delete
+ if s.fib != nil {
+ err := s.fib.Delete(v.vni)
+ if err != nil {
+ s.logf("failed to delete fib entry: %v", err)
+ }
+ }
}
}
}
@@ -708,7 +740,7 @@ func (s *Server) handlePacket(from netip.AddrPort, b []byte) (write []byte, to n
}
msg := b[packet.GeneveFixedHeaderLength:]
s.maybeRotateMACSecretLocked(now)
- return e.handleSealedDiscoControlMsg(from, msg, s.discoPublic, s.macSecrets)
+ return e.handleSealedDiscoControlMsg(s.logf, s.fib, from, msg, s.discoPublic, s.macSecrets)
}
return e.handleDataPacket(from, b, now)
}
diff --git a/net/udprelay/xdp/bpf_bpfeb.go b/net/udprelay/xdp/bpf_bpfeb.go
new file mode 100644
index 000000000..dce7dd177
--- /dev/null
+++ b/net/udprelay/xdp/bpf_bpfeb.go
@@ -0,0 +1,131 @@
+// Code generated by bpf2go; DO NOT EDIT.
+//go:build mips || mips64 || ppc64 || s390x
+
+package xdp
+
+import (
+ "bytes"
+ _ "embed"
+ "fmt"
+ "io"
+
+ "github.com/cilium/ebpf"
+)
+
+type bpfConfig struct{ DstPort uint16 }
+
+type bpfEndpoint struct {
+ ParticipantAddrs [2][4]uint32
+ ParticipantPorts [2]uint16
+ ParticipantIsIpv6 [2]uint8
+ _ [2]byte
+}
+
+// loadBpf returns the embedded CollectionSpec for bpf.
+func loadBpf() (*ebpf.CollectionSpec, error) {
+ reader := bytes.NewReader(_BpfBytes)
+ spec, err := ebpf.LoadCollectionSpecFromReader(reader)
+ if err != nil {
+ return nil, fmt.Errorf("can't load bpf: %w", err)
+ }
+
+ return spec, err
+}
+
+// loadBpfObjects loads bpf and converts it into a struct.
+//
+// The following types are suitable as obj argument:
+//
+// *bpfObjects
+// *bpfPrograms
+// *bpfMaps
+//
+// See ebpf.CollectionSpec.LoadAndAssign documentation for details.
+func loadBpfObjects(obj interface{}, opts *ebpf.CollectionOptions) error {
+ spec, err := loadBpf()
+ if err != nil {
+ return err
+ }
+
+ return spec.LoadAndAssign(obj, opts)
+}
+
+// bpfSpecs contains maps and programs before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfSpecs struct {
+ bpfProgramSpecs
+ bpfMapSpecs
+}
+
+// bpfSpecs contains programs before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfProgramSpecs struct {
+ XdpProgFunc *ebpf.ProgramSpec `ebpf:"xdp_prog_func"`
+}
+
+// bpfMapSpecs contains maps before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfMapSpecs struct {
+ ConfigMap *ebpf.MapSpec `ebpf:"config_map"`
+ EndpointMap *ebpf.MapSpec `ebpf:"endpoint_map"`
+}
+
+// bpfObjects contains all objects after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfObjects struct {
+ bpfPrograms
+ bpfMaps
+}
+
+func (o *bpfObjects) Close() error {
+ return _BpfClose(
+ &o.bpfPrograms,
+ &o.bpfMaps,
+ )
+}
+
+// bpfMaps contains all maps after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfMaps struct {
+ ConfigMap *ebpf.Map `ebpf:"config_map"`
+ EndpointMap *ebpf.Map `ebpf:"endpoint_map"`
+}
+
+func (m *bpfMaps) Close() error {
+ return _BpfClose(
+ m.ConfigMap,
+ m.EndpointMap,
+ )
+}
+
+// bpfPrograms contains all programs after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfPrograms struct {
+ XdpProgFunc *ebpf.Program `ebpf:"xdp_prog_func"`
+}
+
+func (p *bpfPrograms) Close() error {
+ return _BpfClose(
+ p.XdpProgFunc,
+ )
+}
+
+func _BpfClose(closers ...io.Closer) error {
+ for _, closer := range closers {
+ if err := closer.Close(); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// Do not access this directly.
+//
+//go:embed bpf_bpfeb.o
+var _BpfBytes []byte
diff --git a/net/udprelay/xdp/bpf_bpfeb.o b/net/udprelay/xdp/bpf_bpfeb.o
new file mode 100644
index 000000000..9b035f983
--- /dev/null
+++ b/net/udprelay/xdp/bpf_bpfeb.o
Binary files differ
diff --git a/net/udprelay/xdp/bpf_bpfel.go b/net/udprelay/xdp/bpf_bpfel.go
new file mode 100644
index 000000000..b6599db04
--- /dev/null
+++ b/net/udprelay/xdp/bpf_bpfel.go
@@ -0,0 +1,131 @@
+// Code generated by bpf2go; DO NOT EDIT.
+//go:build 386 || amd64 || arm || arm64 || loong64 || mips64le || mipsle || ppc64le || riscv64
+
+package xdp
+
+import (
+ "bytes"
+ _ "embed"
+ "fmt"
+ "io"
+
+ "github.com/cilium/ebpf"
+)
+
+type bpfConfig struct{ DstPort uint16 }
+
+type bpfEndpoint struct {
+ ParticipantAddrs [2][4]uint32
+ ParticipantPorts [2]uint16
+ ParticipantIsIpv6 [2]uint8
+ _ [2]byte
+}
+
+// loadBpf returns the embedded CollectionSpec for bpf.
+func loadBpf() (*ebpf.CollectionSpec, error) {
+ reader := bytes.NewReader(_BpfBytes)
+ spec, err := ebpf.LoadCollectionSpecFromReader(reader)
+ if err != nil {
+ return nil, fmt.Errorf("can't load bpf: %w", err)
+ }
+
+ return spec, err
+}
+
+// loadBpfObjects loads bpf and converts it into a struct.
+//
+// The following types are suitable as obj argument:
+//
+// *bpfObjects
+// *bpfPrograms
+// *bpfMaps
+//
+// See ebpf.CollectionSpec.LoadAndAssign documentation for details.
+func loadBpfObjects(obj interface{}, opts *ebpf.CollectionOptions) error {
+ spec, err := loadBpf()
+ if err != nil {
+ return err
+ }
+
+ return spec.LoadAndAssign(obj, opts)
+}
+
+// bpfSpecs contains maps and programs before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfSpecs struct {
+ bpfProgramSpecs
+ bpfMapSpecs
+}
+
+// bpfSpecs contains programs before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfProgramSpecs struct {
+ XdpProgFunc *ebpf.ProgramSpec `ebpf:"xdp_prog_func"`
+}
+
+// bpfMapSpecs contains maps before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfMapSpecs struct {
+ ConfigMap *ebpf.MapSpec `ebpf:"config_map"`
+ EndpointMap *ebpf.MapSpec `ebpf:"endpoint_map"`
+}
+
+// bpfObjects contains all objects after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfObjects struct {
+ bpfPrograms
+ bpfMaps
+}
+
+func (o *bpfObjects) Close() error {
+ return _BpfClose(
+ &o.bpfPrograms,
+ &o.bpfMaps,
+ )
+}
+
+// bpfMaps contains all maps after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfMaps struct {
+ ConfigMap *ebpf.Map `ebpf:"config_map"`
+ EndpointMap *ebpf.Map `ebpf:"endpoint_map"`
+}
+
+func (m *bpfMaps) Close() error {
+ return _BpfClose(
+ m.ConfigMap,
+ m.EndpointMap,
+ )
+}
+
+// bpfPrograms contains all programs after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfPrograms struct {
+ XdpProgFunc *ebpf.Program `ebpf:"xdp_prog_func"`
+}
+
+func (p *bpfPrograms) Close() error {
+ return _BpfClose(
+ p.XdpProgFunc,
+ )
+}
+
+func _BpfClose(closers ...io.Closer) error {
+ for _, closer := range closers {
+ if err := closer.Close(); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// Do not access this directly.
+//
+//go:embed bpf_bpfel.o
+var _BpfBytes []byte
diff --git a/net/udprelay/xdp/bpf_bpfel.o b/net/udprelay/xdp/bpf_bpfel.o
new file mode 100644
index 000000000..c72c4cf84
--- /dev/null
+++ b/net/udprelay/xdp/bpf_bpfel.o
Binary files differ
diff --git a/net/udprelay/xdp/xdp.c b/net/udprelay/xdp/xdp.c
new file mode 100644
index 000000000..386712975
--- /dev/null
+++ b/net/udprelay/xdp/xdp.c
@@ -0,0 +1,350 @@
+//go:build ignore
+
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <bpf_endian.h>
+#include <bpf_helpers.h>
+
+// License string placed in the "license" ELF section of the BPF object.
+char _license[4] SEC("license") = "GPL";
+
+// config is the single runtime configuration record that userspace writes
+// into config_map.
+struct config {
+ // UDP destination port, in host byte order, that xdp_prog_func compares
+ // against each packet's UDP destination port.
+ __u16 dst_port;
+};
+struct config *unused_config __attribute__((unused)); // required by bpf2go -type
+
+// config_map is a one-element array map; xdp_prog_func looks up key 0 to read
+// the struct config.
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct config));
+ __uint(max_entries, 1);
+} config_map SEC(".maps");
+
+// endpoint describes the two participants of one relay session. It is the
+// value type of endpoint_map, keyed by Geneve VNI.
+struct endpoint {
+ // Participant addresses as four 32-bit words each. For IPv4 matching,
+ // xdp_prog_func compares word [3] directly against the packet's ip->saddr.
+ __be32 participant_addrs[2][4];
+ // Participant UDP ports in host byte order (compared against
+ // bpf_ntohs(udp->source)).
+ __u16 participant_ports[2];
+ // Zero when the participant address is IPv4, non-zero when it is IPv6.
+ __u8 participant_is_ipv6[2];
+};
+struct endpoint *unused_endpoint __attribute__((unused)); // required by bpf2go -type
+
+// MAX_GENEVE_VNI is the largest value representable in the 24-bit Geneve VNI
+// field. The expansion is fully parenthesized so the macro evaluates as a
+// single value wherever it is used (e.g. MAX_GENEVE_VNI * 2).
+#define MAX_GENEVE_VNI ((1 << 24) - 1)
+
+// endpoint_map holds one struct endpoint per relay session, keyed by VNI.
+// NOTE(review): this is a per-CPU hash sized for every possible VNI; confirm
+// the memory cost is acceptable, or whether a plain BPF_MAP_TYPE_HASH
+// suffices since the program only reads the values.
+struct {
+    __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+    __uint(key_size, sizeof(__u32)); // key is Geneve VNI
+    __uint(value_size, sizeof(struct endpoint));
+    __uint(max_entries, MAX_GENEVE_VNI);
+} endpoint_map SEC(".maps");
+
+// Upper bound, in bytes, on the UDP header + payload that csum_const_size
+// will walk for an IPv4 packet.
+// NOTE(review): presumably a 1500-byte MTU minus the 20-byte IPv4 header;
+// confirm.
+#define MAX_UDP_LEN_IPV4 1480
+
+// Upper bound, in bytes, on the UDP header + payload that csum_const_size
+// will walk for an IPv6 packet.
+// NOTE(review): presumably a 1500-byte MTU minus the 40-byte IPv6 header;
+// confirm.
+#define MAX_UDP_LEN_IPV6 1460
+
+// Masks applied to the IPv4 frag_off field (host byte order): IP_MF is the
+// "more fragments" flag and IP_OFFSET the fragment offset. xdp_prog_func
+// passes any packet with either set up the stack.
+#define IP_MF 0x2000
+#define IP_OFFSET 0x1fff
+
+/*
+Geneve Header:
+   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   |Ver|  Opt Len  |O|C|    Rsvd.  |          Protocol Type        |
+   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   |        Virtual Network Identifier (VNI)       |    Reserved   |
+   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+   |                                                               |
+   ~                    Variable-Length Options                    ~
+   |                                                               |
+   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+// geneve_header overlays the fixed 8-byte part of the Geneve header shown
+// above. Variable-length options are not represented; xdp_prog_func only
+// accepts packets whose option length is zero.
+struct geneve_header {
+ // Version (2 bits) followed by option length (6 bits).
+ __u8 first;
+ // O (control) bit, C (critical) bit, then 6 reserved bits.
+ __u8 second;
+ // Protocol type, network byte order.
+ __be16 protocol;
+ // 24-bit VNI followed by 8 reserved bits, network byte order.
+ __be32 vni;
+};
+
+// csum_fold reduces a 32-bit one's-complement accumulator to 16 bits by
+// adding the high half into the low half twice (the second pass absorbs the
+// carry produced by the first). The result is NOT complemented.
+static __always_inline __u16 csum_fold(__u32 csum) {
+    __u32 folded = (csum & 0xffff) + (csum >> 16); // at most 0x1fffe
+    folded = folded + (folded >> 16);              // carry folded back in
+    return (__u16)folded;
+}
+
+// csum_fold_flip reduces a 32-bit one's-complement accumulator to 16 bits
+// and returns its bitwise complement, i.e. the value that goes into (or is
+// compared against) an Internet checksum field.
+static __always_inline __u16 csum_fold_flip(__u32 csum) {
+    __u32 folded = (csum & 0xffff) + (csum >> 16); // at most 0x1fffe
+    folded = folded + (folded >> 16);              // carry folded back in
+    return (__u16)~folded;
+}
+
+// pseudo_sum_ipv6 returns the unfolded one's-complement sum of the IPv6
+// pseudo-header: source and destination addresses, next-header value and
+// the UDP length. udp_len is expected exactly as it appears on the wire
+// (the caller passes udp->len unconverted).
+static __always_inline __u32 pseudo_sum_ipv6(struct ipv6hdr* ip6, __u16 udp_len) {
+    // TODO(jwhited): __u64 for intermediate checksum values to reduce number of ops
+    __u32 acc = 0;
+    int word = 0;
+    while (word < 8) {
+        acc += ip6->saddr.in6_u.u6_addr16[word];
+        acc += ip6->daddr.in6_u.u6_addr16[word];
+        word++;
+    }
+    acc += bpf_htons(ip6->nexthdr);
+    acc += udp_len;
+    return acc;
+}
+
+// pseudo_sum_ipv4 returns the unfolded one's-complement sum of the IPv4
+// pseudo-header: source and destination addresses (each split into two
+// 16-bit halves), protocol and the UDP length. udp_len is expected exactly
+// as it appears on the wire (the caller passes udp->len unconverted).
+static __always_inline __u32 pseudo_sum_ipv4(struct iphdr* ip, __u16 udp_len) {
+    __u32 src = ip->saddr;
+    __u32 dst = ip->daddr;
+    __u32 acc = (src & 0xffff) + (src >> 16);
+    acc += (dst & 0xffff) + (dst >> 16);
+    acc += bpf_htons(ip->protocol);
+    acc += udp_len;
+    return acc;
+}
+
+// csum_const_size is an alternative to bpf_csum_diff. It's a verifier
+// workaround for when we are forced to use a constant max_size + bounds
+// checking. The alternative being passing a dynamic length to bpf_csum_diff
+// {from,to}_size arguments, which the verifier can't follow. For further info
+// see: https://github.com/iovisor/bcc/issues/2463#issuecomment-512503958
+//
+// It sums native 16-bit words starting at from, stopping at data_end or after
+// max_size bytes, adds a trailing odd byte if one remains, and returns the
+// folded, complemented result. seed is the unfolded starting sum (e.g. a
+// pseudo-header sum). A return value of 0 means the covered data, which
+// includes the transmitted checksum field, verifies.
+static __always_inline __u16 csum_const_size(__u32 seed, void* from, void* data_end, int max_size) {
+    __u16 *buf = from;
+    for (int i = 0; i < max_size; i += 2) {
+        if ((void *)(buf + 1) > data_end) {
+            break;
+        }
+        seed += *buf;
+        buf++;
+    }
+    if ((void *)buf + 1 <= data_end) {
+        // A trailing odd byte is summed as if followed by a zero pad byte,
+        // i.e. as the 16-bit word {byte, 0x00} in memory order. Build that
+        // word in network order and convert it, so it matches the native
+        // loads above on both little- and big-endian targets (this program
+        // is compiled for both).
+        seed += bpf_htons((__u16)(*(__u8 *)buf) << 8);
+    }
+    return csum_fold_flip(seed);
+}
+
+#ifndef AF_INET
+#define AF_INET 2
+#endif
+
+// xdp_prog_func is the XDP entry point for the peer relay fast path.
+//
+// It parses Ethernet / IPv4-or-IPv6 / UDP / Geneve, and for packets that
+//   - are addressed to the configured UDP port (config_map[0].dst_port),
+//   - pass length and checksum validation,
+//   - carry a Geneve header with no options and no control/critical bits, and
+//   - come from one of the two participants recorded for the VNI in
+//     endpoint_map,
+// it rewrites the packet in place so that it is addressed to the other
+// participant and returns XDP_TX. Only the IPv4-in / IPv4-out combination is
+// implemented; every other combination, and every packet that fails a check,
+// is returned unmodified with XDP_PASS so the regular network stack (and the
+// userspace relay) handles it.
+SEC("xdp")
+int xdp_prog_func(struct xdp_md *ctx) {
+    void *data_end = (void *)(long)ctx->data_end;
+    void *data = (void *)(long)ctx->data;
+
+    struct ethhdr *eth = data;
+    if ((void *)(eth + 1) > data_end) {
+        return XDP_PASS;
+    }
+
+    struct iphdr *ip;
+    struct ipv6hdr *ip6;
+    struct udphdr *udp;
+
+    int validate_udp_csum = 0;
+    int is_ipv6 = 0;
+    if (eth->h_proto == bpf_htons(ETH_P_IP)) {
+        ip = (void *)(eth + 1);
+        if ((void *)(ip + 1) > data_end) {
+            return XDP_PASS;
+        }
+
+        // Only plain, unfragmented IPv4/UDP without IP options is handled.
+        if (ip->ihl != 5 ||
+            ip->version != 4 ||
+            ip->protocol != IPPROTO_UDP ||
+            (ip->frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0) {
+            return XDP_PASS;
+        }
+
+        // validate ipv4 header checksum
+        __u32 cs_unfolded = bpf_csum_diff(0, 0, (void *)ip, sizeof(*ip), 0);
+        __u16 cs = csum_fold_flip(cs_unfolded);
+        if (cs != 0) {
+            return XDP_PASS;
+        }
+
+        if (bpf_ntohs(ip->tot_len) != data_end - (void *)ip) {
+            return XDP_PASS;
+        }
+
+        udp = (void *)(ip + 1);
+        if ((void *)(udp + 1) > data_end) {
+            return XDP_PASS;
+        }
+
+        if (udp->check != 0) {
+            // https://datatracker.ietf.org/doc/html/rfc768#page-3
+            // If the computed checksum is zero, it is transmitted as all
+            // ones (the equivalent in one's complement arithmetic). An all
+            // zero transmitted checksum value means that the transmitter
+            // generated no checksum (for debugging or for higher level
+            // protocols that don't care).
+            validate_udp_csum = 1;
+        }
+    } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
+        ip6 = (void *)(eth + 1);
+        if ((void *)(ip6 + 1) > data_end) {
+            return XDP_PASS;
+        }
+
+        if (ip6->version != 6 || ip6->nexthdr != IPPROTO_UDP) {
+            return XDP_PASS;
+        }
+
+        udp = (void *)(ip6 + 1);
+        if ((void *)(udp + 1) > data_end) {
+            return XDP_PASS;
+        }
+
+        if (bpf_ntohs(ip6->payload_len) != data_end - (void *)udp) {
+            return XDP_PASS;
+        }
+
+        // https://datatracker.ietf.org/doc/html/rfc8200#page-28
+        // Unlike IPv4, the default behavior when UDP packets are
+        // originated by an IPv6 node is that the UDP checksum is not
+        // optional. That is, whenever originating a UDP packet, an IPv6
+        // node must compute a UDP checksum over the packet and the
+        // pseudo-header, and, if that computation yields a result of
+        // zero, it must be changed to hex FFFF for placement in the UDP
+        // header. IPv6 receivers must discard UDP packets containing a
+        // zero checksum and should log the error.
+        validate_udp_csum = 1;
+        is_ipv6 = 1;
+    } else {
+        return XDP_PASS;
+    }
+
+    __u32 config_key = 0;
+    struct config *c = bpf_map_lookup_elem(&config_map, &config_key);
+    if (!c) {
+        return XDP_PASS;
+    }
+
+    if (bpf_ntohs(udp->len) != data_end - (void *)udp) {
+        return XDP_PASS;
+    }
+
+    if (bpf_ntohs(udp->dest) != c->dst_port) {
+        return XDP_PASS;
+    }
+
+    if (validate_udp_csum) {
+        __u16 cs;
+        __u32 pseudo_sum;
+        if (is_ipv6) {
+            pseudo_sum = pseudo_sum_ipv6(ip6, udp->len);
+            cs = csum_const_size(pseudo_sum, udp, data_end, MAX_UDP_LEN_IPV6);
+        } else {
+            pseudo_sum = pseudo_sum_ipv4(ip, udp->len);
+            cs = csum_const_size(pseudo_sum, udp, data_end, MAX_UDP_LEN_IPV4);
+        }
+        if (cs != 0) {
+            return XDP_PASS;
+        }
+    }
+
+    struct geneve_header *geneve = (void *)(udp + 1);
+    if ((void *)(geneve + 1) > data_end) {
+        return XDP_PASS;
+    }
+
+    if (geneve->first != 0) {
+        // first 2 bits are version, must be zero
+        // next 6 bits are opt len, must be zero
+        return XDP_PASS;
+    }
+
+    if (geneve->second != 0) {
+        // first bit is control, must be zero
+        // next bit is critical (options), must be zero
+        // next 6 bits are reserved, must be zero
+        return XDP_PASS;
+    }
+
+    // The VNI word is in network byte order: 24 bits of VNI followed by the
+    // reserved byte. Convert to host order first so that the low 8 bits are
+    // the reserved byte on every architecture; masking the raw __be32 with a
+    // host-order constant would test the wrong byte on little-endian.
+    __u32 vni_host = bpf_ntohl(geneve->vni);
+    if ((vni_host & 0x000000FF) != 0) {
+        // last byte is reserved, must be zero
+        return XDP_PASS;
+    }
+
+    __u32 vni_key = vni_host >> 8;
+    struct endpoint *e = bpf_map_lookup_elem(&endpoint_map, &vni_key);
+    if (!e) {
+        return XDP_PASS;
+    }
+
+    // Identify which participant sent the packet; the other one is the
+    // destination.
+    int out_participant_index = -1; // -1 = unmatched
+    if (is_ipv6) {
+        // TODO
+    } else {
+        for (int i = 0; i < 2; i++) {
+            if (e->participant_is_ipv6[i] == 0 &&
+                e->participant_addrs[i][3] == ip->saddr &&
+                e->participant_ports[i] == bpf_ntohs(udp->source))
+            {
+                if (i == 0) {
+                    out_participant_index = 1;
+                } else {
+                    out_participant_index = 0;
+                }
+                break;
+            }
+        }
+    }
+    if (out_participant_index == -1) {
+        return XDP_PASS;
+    }
+
+    if (e->participant_is_ipv6[out_participant_index] == is_ipv6) {
+        // matching in/out address family
+        if (is_ipv6) {
+            // TODO: in ipv6, out ipv6
+        } else {
+            // in ipv4, out ipv4
+            __be32 p_addr = e->participant_addrs[out_participant_index][3];
+            __be16 p_port = bpf_htons(e->participant_ports[out_participant_index]);
+
+            // Resolve the next hop BEFORE modifying the packet. If the
+            // lookup does not fully succeed (no route, no neighbour entry
+            // yet, ...), hand the untouched packet to the stack so the
+            // userspace relay can forward it instead of dropping it.
+            // NOTE(review): XDP_TX always transmits on the ingress device;
+            // the egress ifindex returned by the lookup is not checked.
+            struct bpf_fib_lookup fib_params = {};
+            fib_params.family = AF_INET;
+            fib_params.tos = ip->tos;
+            fib_params.l4_protocol = ip->protocol;
+            fib_params.sport = 0;
+            fib_params.dport = 0;
+            fib_params.tot_len = bpf_ntohs(ip->tot_len);
+            fib_params.ipv4_src = ip->daddr; // our address becomes the source
+            fib_params.ipv4_dst = p_addr;
+            fib_params.ifindex = ctx->ingress_ifindex;
+
+            int rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), BPF_FIB_LOOKUP_DIRECT);
+            if (rc != BPF_FIB_LKUP_RET_SUCCESS) {
+                return XDP_PASS;
+            }
+
+            // Incrementally update the checksums. The address pair changes
+            // from {saddr, daddr} to {daddr, p_addr}, so as an unordered sum
+            // the old saddr is replaced by p_addr. Likewise the port pair
+            // changes from {source, dest} to {dest, p_port}, so the old
+            // source port is replaced by p_port.
+            __u32 ip_csum = ~(__u32)ip->check;
+            ip_csum = bpf_csum_diff(&ip->saddr, 4, &p_addr, 4, ip_csum);
+
+            if (udp->check != 0) {
+                // Only touch the UDP checksum if the sender generated one;
+                // an all-zero field means "no checksum" (RFC 768) and must
+                // stay zero.
+                __u32 old_udp_port = (__u32)udp->source;
+                __u32 new_udp_port = (__u32)p_port;
+                __u32 udp_csum = ~(__u32)udp->check;
+                udp_csum = bpf_csum_diff(&ip->saddr, 4, &p_addr, 4, udp_csum);
+                udp_csum = bpf_csum_diff(&old_udp_port, 4, &new_udp_port, 4, udp_csum);
+                __u16 new_check = csum_fold_flip(udp_csum);
+                // A computed checksum of zero is transmitted as all ones.
+                udp->check = new_check != 0 ? new_check : 0xffff;
+            }
+
+            // Update IPv4 header
+            ip->check = csum_fold_flip(ip_csum);
+            ip->saddr = ip->daddr;
+            ip->daddr = p_addr;
+
+            // Rewrite ethernet header source and destination address.
+            __builtin_memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+            __builtin_memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+
+            // Update UDP header
+            udp->source = udp->dest;
+            udp->dest = p_port;
+
+            return XDP_TX;
+        }
+    } else if (e->participant_is_ipv6[out_participant_index] == 0) {
+        // TODO: in ipv4, out ipv6
+    } else {
+        // TODO: in ipv6, out ipv4
+    }
+
+    return XDP_PASS;
+}
\ No newline at end of file
diff --git a/net/udprelay/xdp/xdp.go b/net/udprelay/xdp/xdp.go
new file mode 100644
index 000000000..14c361879
--- /dev/null
+++ b/net/udprelay/xdp/xdp.go
@@ -0,0 +1,48 @@
+package xdp
+
+import "net/netip"
+
+// XDPAttachFlags represents how an XDP program will be attached to an
+// interface. It mirrors cilium/ebpf/link.XDPAttachFlags (the Linux NewFIB
+// converts it with a plain type conversion) without pulling that package in
+// for non-Linux builds.
+type XDPAttachFlags uint32
+
+const (
+ // XDPDriverFallbackGenericMode attempts XDPDriverMode, and falls back to
+ // XDPGenericMode if the driver does not support XDP. It is the zero value
+ // of XDPAttachFlags.
+ // NOTE(review): this constant is untyped, unlike the flags below; confirm
+ // whether that is intentional.
+ XDPDriverFallbackGenericMode = 0
+)
+
+// The flag values start at 1<<1: XDPGenericMode is 2, XDPDriverMode is 4 and
+// XDPOffloadMode is 8.
+const (
+ // XDPGenericMode (SKB) links XDP BPF program for drivers which do
+ // not yet support native XDP.
+ XDPGenericMode XDPAttachFlags = 1 << (iota + 1)
+ // XDPDriverMode links XDP BPF program into the driver’s receive path.
+ XDPDriverMode
+ // XDPOffloadMode offloads the entire XDP BPF program into hardware.
+ XDPOffloadMode
+)
+
+// FIBConfig holds the parameters NewFIB needs to build a FIB.
+type FIBConfig struct {
+ // DeviceName is the name of the network interface the XDP program is
+ // attached to; the Linux NewFIB resolves it with net.InterfaceByName.
+ DeviceName string
+ // TODO: DstPort is singular, but udp4 and udp6 can be independent ports if
+ // the user supplied a zero port value.
+ //
+ // DstPort is the UDP destination port the XDP program matches on; it is
+ // written to the program's config map.
+ DstPort uint16
+ // AttachFlags selects the XDP attach mode passed to the kernel.
+ AttachFlags XDPAttachFlags
+}
+
+// validate reports whether f is usable. It currently performs no checks and
+// always returns nil.
+func (f FIBConfig) validate() error { return nil }
+
+// FIBOption customizes NewFIB. Its only method is unexported, so options can
+// only be defined inside this package.
+type FIBOption interface {
+ apply(*fibOptions)
+}
+
+// fibOptions collects the settings produced by applying FIBOption values.
+type fibOptions struct {
+ // noAttach, when true, makes the Linux NewFIB load the eBPF objects and
+ // config but return before attaching the program to a device.
+ noAttach bool
+}
+
+// FIB is the forwarding table consulted by the XDP fast path. Each entry maps
+// a Geneve VNI to the two participants of a relay session.
+type FIB interface {
+ // Delete removes the entry for vni.
+ Delete(vni uint32) error
+ // Upsert creates or replaces the entry for vni with the given pair of
+ // participant addresses.
+ Upsert(vni uint32, participants [2]netip.AddrPort) error
+ // Close releases the FIB.
+ Close() error
+}
diff --git a/net/udprelay/xdp/xdp_linux.go b/net/udprelay/xdp/xdp_linux.go
new file mode 100644
index 000000000..9a61942e9
--- /dev/null
+++ b/net/udprelay/xdp/xdp_linux.go
@@ -0,0 +1,103 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build linux
+
+package xdp
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "net"
+ "net/netip"
+
+ "github.com/cilium/ebpf"
+ "github.com/cilium/ebpf/link"
+)
+
+//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -type config -type endpoint bpf xdp.c -- -I ../../../derp/xdp/headers
+
+// NewFIB loads the embedded XDP program and its maps into the kernel, stores
+// config.DstPort in the program's config map and, unless an option disables
+// it, attaches the program to the device named by config.DeviceName.
+//
+// It returns an error if config is nil or invalid, if the eBPF objects cannot
+// be loaded (verifier errors are expanded in full), if the config map cannot
+// be written, if the device cannot be found, or if attaching fails. On any
+// error the already-loaded eBPF objects are closed, so nothing is leaked.
+//
+// The caller must Close the returned FIB when done with it.
+func NewFIB(config *FIBConfig, opts ...FIBOption) (FIB, error) {
+	if config == nil {
+		return nil, errors.New("invalid config: nil")
+	}
+	o := &fibOptions{}
+	for _, opt := range opts {
+		opt.apply(o)
+	}
+	if err := config.validate(); err != nil {
+		return nil, fmt.Errorf("invalid config: %w", err)
+	}
+	objs := new(bpfObjects)
+	if err := loadBpfObjects(objs, nil); err != nil {
+		var ve *ebpf.VerifierError
+		if errors.As(err, &ve) {
+			// %+v prints the complete verifier log rather than a summary.
+			err = fmt.Errorf("verifier error: %+v", ve)
+		}
+		return nil, fmt.Errorf("error loading XDP program: %w", err)
+	}
+	// From here on the kernel objects exist; release them on every failure.
+	ok := false
+	defer func() {
+		if !ok {
+			objs.Close()
+		}
+	}()
+
+	f := &linuxFIB{
+		objs:    objs,
+		dstPort: config.DstPort,
+	}
+	var key uint32
+	xdpConfig := &bpfConfig{
+		DstPort: config.DstPort,
+	}
+	if err := objs.ConfigMap.Put(key, xdpConfig); err != nil {
+		return nil, fmt.Errorf("error loading config in eBPF map: %w", err)
+	}
+	if o.noAttach {
+		ok = true
+		return f, nil
+	}
+	iface, err := net.InterfaceByName(config.DeviceName)
+	if err != nil {
+		return nil, fmt.Errorf("error finding device: %w", err)
+	}
+	// Named xdpLink so the link package is not shadowed.
+	xdpLink, err := link.AttachXDP(link.XDPOptions{
+		Program:   objs.XdpProgFunc,
+		Interface: iface.Index,
+		Flags:     link.XDPAttachFlags(config.AttachFlags),
+	})
+	if err != nil {
+		return nil, fmt.Errorf("error attaching XDP program to dev: %w", err)
+	}
+	f.link = xdpLink
+	ok = true
+	return f, nil
+}
+
+// linuxFIB is the FIB implementation backed by eBPF maps and an XDP program.
+type linuxFIB struct {
+ // objs holds the loaded eBPF program and maps.
+ objs *bpfObjects
+ // dstPort is the UDP port copied from FIBConfig.DstPort by NewFIB.
+ dstPort uint16
+ // link is the XDP attachment; NewFIB leaves it nil when attaching was
+ // skipped.
+ link link.Link
+}
+
+// Delete removes the endpoint entry for vni from the eBPF endpoint map.
+//
+// Deleting a VNI that has no entry is not an error: callers remove expired
+// endpoints without knowing whether they were ever upserted (an entry only
+// exists once both participants have bound).
+func (l *linuxFIB) Delete(vni uint32) error {
+	err := l.objs.EndpointMap.Delete(&vni)
+	if errors.Is(err, ebpf.ErrKeyNotExist) {
+		return nil
+	}
+	return err
+}
+
+func (l *linuxFIB) Upsert(vni uint32, participants [2]netip.AddrPort) error {
+ endpoint := bpfEndpoint{}
+ for i, participant := range participants {
+ as16 := participant.Addr().As16()
+ for j := 0; j < 4; j++ {
+ endpoint.ParticipantAddrs[i][j] = binary.NativeEndian.Uint32(as16[j*4:])
+ }
+ endpoint.ParticipantPorts[i] = participant.Port()
+ if participant.Addr().Is6() {
+ endpoint.ParticipantIsIpv6[i] = 1
+ }
+ }
+ numCPU, err := ebpf.PossibleCPU()
+ if err != nil {
+ return err
+ }
+ vals := make([]bpfEndpoint, numCPU)
+ for i := range vals {
+ vals[i] = endpoint
+ }
+ return l.objs.EndpointMap.Put(&vni, vals)
+}
+
+func (l *linuxFIB) Close() error { return nil }
diff --git a/net/udprelay/xdp/xdp_notlinux.go b/net/udprelay/xdp/xdp_notlinux.go
new file mode 100644
index 000000000..ba1466e94
--- /dev/null
+++ b/net/udprelay/xdp/xdp_notlinux.go
@@ -0,0 +1,18 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build !linux
+
+package xdp
+
+import "net/netip"
+
+// noopFIB is the FIB used on platforms without XDP support. Every operation
+// succeeds and does nothing.
+type noopFIB struct{}
+
+// Compile-time check that noopFIB satisfies FIB.
+var _ FIB = noopFIB{}
+
+// Delete implements FIB; it is a no-op.
+func (noopFIB) Delete(vni uint32) error { return nil }
+
+// Upsert implements FIB; it is a no-op.
+func (noopFIB) Upsert(vni uint32, participants [2]netip.AddrPort) error { return nil }
+
+// Close implements FIB; it is a no-op.
+func (noopFIB) Close() error { return nil }
+
+// NewFIB returns a FIB that does nothing. The signature matches the Linux
+// implementation so callers compile unchanged on every platform; config and
+// opts are ignored.
+func NewFIB(config *FIBConfig, opts ...FIBOption) (FIB, error) {
+	return noopFIB{}, nil
+}