Update
Managed to connect a multi-node cluster, but workloads running on one node couldn’t communicate with workloads running on another node. This seems to be a problem with rootless/user-namespaced k3s.
Here’s a Caddy config to reverse-proxy traffic to whichever nspawn containers you want, with TLS passthrough (so your ingress controller + cert-manager in k3s handles TLS termination):
# Reverse proxy requests from host to containers
services.caddy = {
enable = true;
# Extend Caddy with compile-time plugin for layer 4 proxy.
# This is necessary for proxying HTTPS traffic to k3s ingress,
# without Caddy terminating TLS.
# NOTE: the hash pins the exact plugin source; update it together with the
# version string below (see the go.mod recipe further down).
package = pkgs.caddy.withPlugins {
plugins = [ "github.com/mholt/caddy-l4@v0.0.0-20251001194302-2e3e6cf60b25" ];
hash = "sha256-J1Ly6r3nTtknqGWmslcDj8Y/FMrCe+u5BYJzMBWlZyc=";
};
# Verbose logging; useful while debugging the layer4 routes.
logFormat = ''
level DEBUG
'';
# Use layer4 module to proxy https traffic to containers.
# The layer4 listener wrapper is placed BEFORE the "tls" wrapper so matched
# connections are forwarded raw (TLS passthrough) instead of being terminated
# by Caddy; unmatched traffic falls through to the normal tls wrapper.
# TODO: redirect http requests to https
globalConfig = ''
auto_https disable_certs
servers {
protocols h1 h2
listener_wrappers {
http_redirect
layer4 {
@secure tls sni *.example.com # Replace with your (sub)domain
route @secure {
proxy 10.250.2.2:443 # internal IP of relevant container (not container's gateway x.x.x.1, but container's client ip x.x.x.2)
}
}
tls
}
}
'';
};
To find the right version string for the caddy-l4 plugin, do this:
# Make the Go toolchain available (used only to resolve the module version)
nix profile add 'nixpkgs#go'
# Create a throwaway Go module; `go get` will pin caddy-l4 to a pseudo-version
mkdir caddy-l4 && cd caddy-l4
go mod init temp
go get github.com/mholt/caddy-l4
# The pinned version string (v0.0.0-<timestamp>-<commit>) appears in go.mod
grep 'caddy-l4' go.mod
Here’s a snippet of my configuration.nix that uses a common nspawn container definition:
{
  config,
  lib,
  pkgs,
  ...
}:
let
  # Reusable helper that builds a systemd-nspawn container definition
  # running a k3s agent or server (see overlays/virtualization/k3s-container.nix).
  mkK3sContainer = import ../../overlays/virtualization/k3s-container.nix { inherit lib pkgs; };
in
{
  containers = {
    # Container for k3s ingress node in staging cluster
    k3s-agent-1 = mkK3sContainer {
      name = "k3s-agent-1";
      ip = 1; # container gets subnet 10.250.1.0/24 (host .1, container .2)
      externalIP = "xxx.xxx.221.193";
      role = "agent";
      ephemeral = false;
      privateUsers = "pick";
      serverUrl = "https://xxx.xxx.xxx.xxx:6443";
      # sops-managed secrets, passed into the container via systemd credentials
      k3sTokenFile = config.sops.secrets.k3s-token-stg.path;
      vpnAuthFile = config.sops.secrets.tailscale-key.path;
      gracefulNodeShutdown = {
        enable = true;
        shutdownGracePeriod = "60s";
      };
      extraKubeletConfig = {
        # Reserve resources to keep system responsive
        systemReserved = {
          cpu = "100m";
          memory = "200Mi";
        };
        # Enable running kubelet in user namespace (workaround for systemd-nspawn)
        featureGates = {
          KubeletInUserNamespace = true;
        };
        cgroupDriver = "systemd";
      };
      extraFlagsK3s = [
        "--node-taint public-ingress:NoSchedule"
      ];
      inherit pkgs; # pass host pkgs into the helper
      package = pkgs.k3s; # wrapped k3s (includes tailscale in runtime deps)
    };
  };
}
overlays/virtualization/k3s-container.nix:
{ lib, ... }:
# Function to create a NixOS (systemd-nspawn) container running a k3s agent or server.
# Returns an attrset suitable as a value of `containers.<name>` on the host.
{ name # container name (informational; the attr name under `containers` is authoritative)
, ip # single number 0-254, denotes container subnet (10.250.<ip>.0/24)
, role # "server" or "agent"
, serverUrl ? null # required if role == "agent"
, clusterInit ? false # bootstrap server with etcd
, k3sTokenFile ? null # sops secret k3s tokenFile
, extraFlagsK3s ? [ ] # extra flags to pass k3s module
, gracefulNodeShutdown ? { } # config for stopping pods before the node can shut off
, extraKubeletConfig ? { } # extra config to pass the kubelet
, vpnAuthFile ? null # optional tailscale auth key
, ephemeral ? true # whether to keep state or start fresh each container boot
, privateUsers ? "pick" # privateUsers option of systemd-nspawn
, externalIP ? null # external, publicly routable IP of host
, pkgs # pass pkgs from host
, package ? pkgs.k3s # use overlaid k3s pkg if specified
}:
{
  autoStart = true;
  inherit privateUsers;
  privateNetwork = true;
  # Host side and container side of the veth pair; one /24 per container.
  hostAddress = "10.250.${toString ip}.1";
  localAddress = "10.250.${toString ip}.2";
  enableTun = true;
  # TODO: bind mount stateful dirs so ephemeral containers work
  inherit ephemeral;
  extraFlags = [
    # Read-only views of host /sys and /proc for k3s/kubelet introspection
    "--bind-ro=/sys:/run/sys"
    "--bind-ro=/proc:/run/proc"
    "--bind-ro=/sys/module:/sys/module"
    "--bind=/dev/fuse:/dev/fuse"
    # Syscalls required by k3s/containerd that nspawn filters by default
    "--system-call-filter=bpf"
    "--system-call-filter=@keyring"
    "--system-call-filter=@mount"
  ]
  # Pass secrets into the container as systemd credentials (never on the cmdline)
  ++ lib.optionals (k3sTokenFile != null) [
    "--load-credential=k3s-token:${k3sTokenFile}"
  ]
  ++ lib.optionals (vpnAuthFile != null) [
    "--load-credential=tailscale-key:${vpnAuthFile}"
  ];
  allowedDevices = [
    { node = "/dev/fuse"; modifier = "rwm"; }
    { node = "/dev/kmsg"; modifier = "rw"; }
  ];
  config = { config, pkgs, lib, ... }: {
    system.stateVersion = "25.11";
    # packages in the container
    environment.systemPackages = with pkgs; [
    ];
    # set required kernel sysctls
    boot = {
      kernel.sysctl = {
        "net.ipv4.ip_forward" = true;
        "net.netfilter.nf_conntrack_max" = 131072;
      };
    };
    # Enable tailscale vpn service.
    # Only point at the credential when it was actually loaded (vpnAuthFile set);
    # otherwise the path would not exist and the autoconnect unit would fail.
    services.tailscale = {
      enable = true;
      authKeyFile = lib.mkIf (vpnAuthFile != null) "/run/credentials/@system/tailscale-key";
    };
    # Enable container firewall and open relevant ports
    networking = {
      firewall = {
        enable = true;
        allowedTCPPorts = [
          6443 # k3s
          10250 # metrics
          25 # mail
          80 # web
          443 # websecure
        ];
        allowedUDPPorts = [ 8472 ]; # k3s flannel (VXLAN)
      };
      # Use systemd-resolved inside the container
      # Workaround for bug https://github.com/NixOS/nixpkgs/issues/162686
      useHostResolvConf = lib.mkForce false;
      # Set nameservers
      nameservers = [ "94.140.14.14#dns.adguard-dns.com" "94.140.14.15#dns.adguard-dns.com" ];
      # Set default gateway, because tailscale claims itself as default if unset
      defaultGateway.address = "10.250.${toString ip}.1";
      # Set external IP as alias of veth interface, to inform k3s to route
      # traffic from veth as if external. Guarded: with the default
      # externalIP = null, `toString null` would yield "" and produce an
      # invalid address entry.
      interfaces."eth0".ipv4.addresses = lib.optionals (externalIP != null) [
        {
          address = toString externalIP;
          prefixLength = 32;
        }
      ];
    };
    # Configure systemd-resolved
    services.resolved = {
      enable = true;
      dnssec = "allow-downgrade";
      domains = [ "~." ];
      fallbackDns = [ "9.9.9.9" "94.140.14.15" ];
      dnsovertls = "true";
    };
    # Enable delegation of cgroup controllers to user, for rootless k3s
    systemd.services."user@".serviceConfig.Delegate = "memory pids cpu cpuset";
    services.k3s = {
      enable = true;
      inherit role package; # package: override to include tailscale in runtime deps
      # Token is delivered via systemd credentials (see extraFlags above)
      tokenFile = lib.mkIf (k3sTokenFile != null) "/run/credentials/@system/k3s-token";
      extraFlags = extraFlagsK3s
      ++ lib.optionals (externalIP != null) [
        "--node-external-ip ${toString externalIP}"
      ];
      # Only needed for agents
      serverAddr = lib.mkIf (role == "agent") serverUrl;
      # Only needed for servers that bootstrap a cluster
      clusterInit = lib.mkIf (role == "server") clusterInit;
      inherit gracefulNodeShutdown extraKubeletConfig;
      # Custom containerd config template (Go text/template, rendered by k3s):
      # - forces runc onto the systemd cgroup driver (SystemdCgroup = true)
      # - pins cni bin_dir to work around a mismatch; revert when containerd >= 2.1
      # Keep this in sync with the template shipped by the k3s version in use.
      containerdConfigTemplate = ''
        {{- /* */ -}}
        # Make runc use systemd cgroup driver
        # Work around mismatch of cni bin dir, revert when containerd >= 2.1
        version = 3
        root = {{ printf "%q" .NodeConfig.Containerd.Root }}
        state = {{ printf "%q" .NodeConfig.Containerd.State }}
        [grpc]
        address = {{ deschemify .NodeConfig.Containerd.Address | printf "%q" }}
        [plugins.'io.containerd.internal.v1.opt']
        path = {{ printf "%q" .NodeConfig.Containerd.Opt }}
        [plugins.'io.containerd.grpc.v1.cri']
        stream_server_address = "127.0.0.1"
        stream_server_port = "10010"
        [plugins.'io.containerd.cri.v1.runtime']
        enable_selinux = {{ .NodeConfig.SELinux }}
        enable_unprivileged_ports = {{ .EnableUnprivileged }}
        enable_unprivileged_icmp = {{ .EnableUnprivileged }}
        device_ownership_from_security_context = {{ .NonrootDevices }}
        {{ if .DisableCgroup}}
        disable_cgroup = true
        {{ end }}
        {{ if .IsRunningInUserNS }}
        disable_apparmor = true
        restrict_oom_score_adj = true
        {{ end }}
        {{ with .NodeConfig.AgentConfig.Snapshotter }}
        [plugins.'io.containerd.cri.v1.images']
        snapshotter = "{{ . }}"
        disable_snapshot_annotations = {{ if eq . "stargz" }}false{{else}}true{{end}}
        use_local_image_pull = true
        {{ end }}
        {{ with .NodeConfig.AgentConfig.PauseImage }}
        [plugins.'io.containerd.cri.v1.images'.pinned_images]
        sandbox = "{{ . }}"
        {{ end }}
        # Work around mismatch of cni bin dir, revert when containerd >= 2.1
        {{- if or .NodeConfig.AgentConfig.CNIBinDir .NodeConfig.AgentConfig.CNIConfDir }}
        [plugins.'io.containerd.cri.v1.runtime'.cni]
        bin_dir = "/var/lib/rancher/k3s/data/cni"
        {{ with .NodeConfig.AgentConfig.CNIConfDir }}conf_dir = {{ printf "%q" . }}{{ end }}
        {{ end }}
        {{ if or .NodeConfig.Containerd.BlockIOConfig .NodeConfig.Containerd.RDTConfig }}
        [plugins.'io.containerd.service.v1.tasks-service']
        {{ with .NodeConfig.Containerd.BlockIOConfig }}blockio_config_file = {{ printf "%q" . }}{{ end }}
        {{ with .NodeConfig.Containerd.RDTConfig }}rdt_config_file = {{ printf "%q" . }}{{ end }}
        {{ end }}
        {{ with .NodeConfig.DefaultRuntime }}
        [plugins.'io.containerd.cri.v1.runtime'.containerd]
        default_runtime_name = "{{ . }}"
        {{ end }}
        [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc]
        runtime_type = "io.containerd.runc.v2"
        [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc.options]
        SystemdCgroup = true
        [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runhcs-wcow-process]
        runtime_type = "io.containerd.runhcs.v1"
        {{ range $k, $v := .ExtraRuntimes }}
        [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.'{{ $k }}']
        runtime_type = "{{$v.RuntimeType}}"
        {{ with $v.BinaryName}}
        [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.'{{ $k }}'.options]
        BinaryName = {{ printf "%q" . }}
        SystemdCgroup = true
        {{ end }}
        {{ end }}
        [plugins.'io.containerd.cri.v1.images'.registry]
        config_path = {{ printf "%q" .NodeConfig.Containerd.Registry }}
        {{ if .PrivateRegistryConfig }}
        {{ range $k, $v := .PrivateRegistryConfig.Configs }}
        {{ with $v.Auth }}
        [plugins.'io.containerd.cri.v1.images'.registry.configs.'{{ $k }}'.auth]
        {{ with .Username }}username = {{ printf "%q" . }}{{ end }}
        {{ with .Password }}password = {{ printf "%q" . }}{{ end }}
        {{ with .Auth }}auth = {{ printf "%q" . }}{{ end }}
        {{ with .IdentityToken }}identitytoken = {{ printf "%q" . }}{{ end }}
        {{ end }}
        {{ end }}
        {{ end }}
        {{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }}
        {{ with .NodeConfig.AgentConfig.ImageServiceSocket }}
        [plugins.'io.containerd.snapshotter.v1.stargz']
        cri_keychain_image_service_path = {{ printf "%q" . }}
        [plugins.'io.containerd.snapshotter.v1.stargz'.cri_keychain]
        enable_keychain = true
        {{ end }}
        [plugins.'io.containerd.snapshotter.v1.stargz'.registry]
        config_path = {{ printf "%q" .NodeConfig.Containerd.Registry }}
        {{ if .PrivateRegistryConfig }}
        {{ range $k, $v := .PrivateRegistryConfig.Configs }}
        {{ with $v.Auth }}
        [plugins.'io.containerd.snapshotter.v1.stargz'.registry.configs.'{{ $k }}'.auth]
        {{ with .Username }}username = {{ printf "%q" . }}{{ end }}
        {{ with .Password }}password = {{ printf "%q" . }}{{ end }}
        {{ with .Auth }}auth = {{ printf "%q" . }}{{ end }}
        {{ with .IdentityToken }}identitytoken = {{ printf "%q" . }}{{ end }}
        {{ end }}
        {{ end }}
        {{ end }}
        {{ end }}
      '';
    };
  };
}