Using nvidia-container-runtime with containerd on NixOS

libnvidia-container locates the required libraries through the cache created by ldconfig. I don’t have the time to list everything I did step by step, or to sift through my configuration to find exactly what I changed, but here are the relevant snippets.

  nixpkgs.overlays = [ (self: super: {

    # NVIDIA container runtime package for k3s: built from the config.toml
    # template below, with "runc" passed as the container runtime path.
    nvidia-k3s = with self.pkgs; mkNvidiaContainerPkg {
      configTemplate = ./config.toml;
      containerRuntimePath = "runc";
      name = "nvidia-k3s";
    };

    # libnvidia-container from the flake-pinned source. Setting `patches` here
    # replaces the package's own patch list, so the nixpkgs inline-c-struct
    # patch is listed again explicitly alongside the two local patches.
    libnvidia-container = super.libnvidia-container.overrideAttrs (old: {
      inherit (flakes.libnvidia-container) version;
      src = flakes.libnvidia-container.path;

      patches = [
        ./libnvidia-container.patch
        ./libnvidia-container-ldcache.patch
        (flakes.nixpkgs.path + "/pkgs/applications/virtualization/libnvidia-container/inline-c-struct.patch")
      ];

      # Rewrite the ld cache location in the header from /etc to /tmp.
      postPatch = (old.postPatch or "") + ''
        sed -i "s@/etc/ld.so.cache@/tmp/ld.so.cache@" src/common.h
      '';
    });

    # nvidia-container-toolkit from the flake-pinned source, with the same
    # /etc -> /tmp ld cache rewrite applied to its Go ldcache implementation.
    nvidia-container-toolkit = super.nvidia-container-toolkit.overrideAttrs (old: {
      inherit (flakes.nvidia-container-toolkit) version;
      src = flakes.nvidia-container-toolkit.path;

      postPatch = (old.postPatch or "") + ''
        sed -i "s@/etc/ld.so.cache@/tmp/ld.so.cache@" internal/ldcache/ldcache.go
      '';
    });
  }) ];

libnvidia-container-ldcache.patch

diff --git a/src/nvc_ldcache.c b/src/nvc_ldcache.c
index db3b2f6..360fd23 100644
--- a/src/nvc_ldcache.c
+++ b/src/nvc_ldcache.c
@@ -367,7 +367,7 @@ nvc_ldcache_update(struct nvc_context *ctx, const struct nvc_container *cnt)
         if (validate_args(ctx, cnt != NULL) < 0)
                 return (-1);
 
-        argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
+        argv = (char * []){cnt->cfg.ldconfig, "-C", "/tmp/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
         if (*argv[0] == '@') {
                 /*
                  * We treat this path specially to be relative to the host filesystem.

config.toml

# Configuration template for the NVIDIA container toolkit/runtime.
# Tokens of the form @name@ are placeholders; presumably they are substituted
# when the package is built from this template — confirm against
# mkNvidiaContainerPkg.
disable-require = false
#swarm-resource = "DOCKER_RESOURCE_GPU"
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#accept-nvidia-visible-devices-as-volume-mounts = false

[nvidia-container-cli]
#root = "/run/nvidia/driver"
path = "@nvidia-container-cli@"
environment = []
debug = "/var/log/nvidia-container-toolkit.log"
# Same /tmp cache path that the patched sources and the k3s preStart script use
# in place of /etc/ld.so.cache.
ldcache = "/tmp/ld.so.cache"
load-kmods = true
#no-cgroups = false
#user = "root:video"
# The first "@" is literal: libnvidia-container treats an ldconfig path starting
# with "@" as relative to the host filesystem. "@glibcbin@" is a placeholder.
ldconfig = "@@glibcbin@/bin/ldconfig"

[nvidia-container-runtime]
# Verbose logging is enabled for both the log file and the log level.
debug = "/var/log/nvidia-container-runtime.log"
log-level = "debug"

# Specify the runtimes to consider. This list is processed in order and the PATH
# searched for matching executables unless the entry is an absolute path.
runtimes = [
    "@containerRuntimePath@",
]

mode = "auto"

    # Settings for the "csv" mode of the runtime.
    [nvidia-container-runtime.modes.csv]
    mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

  # Force the unit's ordering ("after") and "wants" dependencies to be empty;
  # mkForce overrides values set elsewhere (presumably by the k3s module —
  # confirm).
  systemd.services.k3s.after = lib.mkForce [];
  systemd.services.k3s.wants = lib.mkForce [];
  # Force KillMode to "control-group" for the k3s unit.
  systemd.services.k3s.serviceConfig.KillMode = lib.mkForce "control-group";
  # Packages added to the service's PATH.
  systemd.services.k3s.path = with pkgs; [
    # Presumably supplies the ldconfig binary invoked from preStart — confirm.
    glibc
    # NVIDIA Container Support
    nvidia-k3s
    # Expose NVIDIA binaries to PATH
    # (driver package rebuilt with the custom builder script, i.e. the same
    # override expression that preStart uses for the library directory)
    (config.hardware.nvidia.package.overrideAttrs (oldAttrs:
      {
        builder = ./nvidia-builder.sh;
      }))
  ];
  # Run the service with a private /tmp; the preStart script and the patched
  # ld cache path both live under /tmp.
  systemd.services.k3s.serviceConfig.PrivateTmp = true;
  # Before k3s starts: populate /tmp/nvidia-libs with symlinks to the NVIDIA
  # driver libraries and build an ld cache at /tmp/ld.so.cache from that
  # directory (the path the patched libnvidia-container/toolkit read).
  systemd.services.k3s.preStart = let
    # NVIDIA driver package rebuilt with the custom builder script
    # (./nvidia-builder.sh); bound once here instead of being written inline
    # in the middle of the shell loop header.
    nvidiaDriver = config.hardware.nvidia.package.overrideAttrs (oldAttrs: {
      builder = ./nvidia-builder.sh;
    });
  in ''
    # ldconfig wants to generate symlinks
    rm -rf /tmp/nvidia-libs
    mkdir -p /tmp/nvidia-libs
    for thing in ${nvidiaDriver}/lib/*;
    do
      # Link to the fully resolved target; expansions are quoted so that paths
      # are never word-split or glob-expanded.
      ln -s "$(readlink -f "$thing")" "/tmp/nvidia-libs/$(basename "$thing")"
    done

    echo "Initializing cache with directory"
    ldconfig -C /tmp/ld.so.cache /tmp/nvidia-libs

    echo "Printing ld cache contents"
    ldconfig -C /tmp/ld.so.cache --print-cache
  '';

nvidia-builder.sh is just a copy of the original builder script with the patchelf step disabled, since patchelf-ed libraries would fail to load on non-NixOS distributions.

4 Likes