Using nvidia-container-runtime with containerd on NixOS

rusty-jules · May 20, 2023, 8:33am

Ok, so I took the very un-Nix hammer approach and it worked! I have the current nixpkgs-unstable nvidia-container-toolkit derivation in my overlay and statically linked libcuda and libnvidia-ml into the Go binaries. This is with v1.12.1.

  ldflags = [ "-s" "-w" "-extldflags" "'-L${unpatched-nvidia-driver}/lib -lcuda -lnvidia-ml'" ];

Here unpatched-nvidia-driver is @eadwu’s builder swap, though I’m not sure that matters in this case: as the ldd output below shows, these libs don’t link to any other nvidia libs, and the ones they do link to should be present in whatever container is running.

ldd /nix/store/30x7mhkxv6ghf8893d6lhd5jiplxh897-nvidia-x11-525.89.02-5.15.96/lib/libcuda.so.525.89.02
        linux-vdso.so.1 (0x00007ffd5fc4e000)
        libm.so.6 => /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib/libm.so.6 (0x00007f3405733000)
        libc.so.6 => /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib/libc.so.6 (0x00007f340554d000)
        libdl.so.2 => /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib/libdl.so.2 (0x00007f3405548000)
        libpthread.so.0 => /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib/libpthread.so.0 (0x00007f3405543000)
        librt.so.1 => /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib/librt.so.1 (0x00007f340553e000)
        /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib64/ld-linux-x86-64.so.2 (0x00007f34074f5000)

ldd /nix/store/30x7mhkxv6ghf8893d6lhd5jiplxh897-nvidia-x11-525.89.02-5.15.96/lib/libnvidia-ml.so.1
        linux-vdso.so.1 (0x00007ffdde5fd000)
        libpthread.so.0 => /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib/libpthread.so.0 (0x00007faa7a5b8000)
        libm.so.6 => /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib/libm.so.6 (0x00007faa79520000)
        libdl.so.2 => /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib/libdl.so.2 (0x00007faa7a5b3000)
        libc.so.6 => /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib/libc.so.6 (0x00007faa7933a000)
        /nix/store/xnk2z26fqy86xahiz3q797dzqx96sidk-glibc-2.37-8/lib64/ld-linux-x86-64.so.2 (0x00007faa7a5bf000)

Furthermore, I had to twiddle with /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl quite a bit and landed on this:

# The opt plugin path is rendered from the template value .NodeConfig.Containerd.Opt.
[plugins.opt]
  path = "{{ .NodeConfig.Containerd.Opt }}"

# CRI plugin settings. The stream server is bound to 127.0.0.1, port 10010.
[plugins.cri]
  stream_server_address = "127.0.0.1"
  stream_server_port = "10010"

  # ---- added for gpu
  # enable_selinux is rendered from the template value .NodeConfig.SELinux;
  # the two unprivileged_* switches are hard-coded to true.
  enable_selinux = {{ .NodeConfig.SELinux }}
  enable_unprivileged_ports = true
  enable_unprivileged_icmp = true
  # end added for gpu

# The next three keys are emitted only when .IsRunningInUserNS is true.
{{- if .IsRunningInUserNS }}
  disable_cgroup = true
  disable_apparmor = true
  restrict_oom_score_adj = true
{{end}}

# sandbox_image is emitted only when .NodeConfig.AgentConfig.PauseImage is non-empty.
{{- if .NodeConfig.AgentConfig.PauseImage }}
  sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
{{end}}

# The CNI section is emitted unless .NodeConfig.NoFlannel is set; both directories
# come from .NodeConfig.AgentConfig.
{{- if not .NodeConfig.NoFlannel }}
[plugins.cri.cni]
  bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
  conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
{{end}}

# containerd section of the CRI plugin. runc stays the default runtime; the
# nvidia runtime defined further down is an additional, named runtime.
[plugins.cri.containerd]
	default_runtime_name = "runc"

	# ---- added for GPU support
        # https://github.com/k3s-io/k3s/issues/4391#issuecomment-1202986597
	snapshotter = "overlayfs"
	disable_snapshot_annotations = true

# Stock runc runtime entry.
[plugins.cri.containerd.runtimes.runc]
  runtime_type = "io.containerd.runc.v2"

# ---- added for GPU support
# Extra runtime named "nvidia". It uses the same runtime_type as the runc entry
# and differs through the BinaryName option set in its options table.
[plugins.cri.containerd.runtimes.nvidia]
	runtime_type = "io.containerd.runc.v2"
	runtime_root = ""
	runtime_engine = ""
	privileged_without_host_devices = false
[plugins.cri.containerd.runtimes.nvidia.options]
	# Placeholder token: per the accompanying post it is substituted at rebuild
	# with the full /nix/store path of nvidia-container-runtime.
	BinaryName = "@nvidia-container-runtime@"
	# NOTE(review): hard-coded rather than templated; presumably this must match
	# the cgroup driver used by the kubelet on the node - confirm.
	SystemdCgroup = true

# Private registry configuration. Everything down to the final end action is
# rendered only when .PrivateRegistryConfig is set.
{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors]{{end}}
# One mirrors sub-table per entry of .PrivateRegistryConfig.Mirrors, keyed by the
# map key. The endpoint array lists each element of Endpoints, quoted with %q
# and separated by ", ".
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors."{{$k}}"]
  endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{end}}

# One pass per entry of .PrivateRegistryConfig.Configs. An auth table and a tls
# table are emitted only when the entry has Auth / TLS, and each key inside them
# only when the corresponding field is non-empty.
# NOTE(review): the auth and tls values are interpolated between double quotes
# without escaping; a value containing a double quote or backslash would
# presumably render invalid TOML - confirm.
{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins.cri.registry.configs."{{$k}}".auth]
  {{ if $v.Auth.Username }}username = "{{ $v.Auth.Username }}"{{end}}
  {{ if $v.Auth.Password }}password = "{{ $v.Auth.Password }}"{{end}}
  {{ if $v.Auth.Auth }}auth = "{{ $v.Auth.Auth }}"{{end}}
  {{ if $v.Auth.IdentityToken }}identitytoken = "{{ $v.Auth.IdentityToken }}"{{end}}
{{end}}
{{ if $v.TLS }}
[plugins.cri.registry.configs."{{$k}}".tls]
  {{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
  {{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
  {{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
{{end}}
{{end}}
{{end}}

Wherein “@nvidia-container-runtime@” is substituted at rebuild with the full /nix/store path.

Now I’m at a stage where the nvidia-device-plugin is running stably and shows the GPUs on the node, but only libcuda.so.525.89.02 is being mounted into pod containers with runtimeClassName: nvidia, instead of libcuda.so.1, which is what nvidia-smi and friends are looking for. I’m looking to patch that with some kind of admission webhook policy, since one run of ldconfig in the container adds the libcuda.so.1 symlink and gets it all working, as opposed to reversing the static library linking and crossing my fingers that nvidia-container-runtime mounts the libraries appropriately.

Thanks for the help!