thoth
June 5, 2024, 2:21pm
1
I am attempting to get nvidia working inside of k3s.
Following this wiki page.
I have added
{{ template "base" . }}
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/run/current-system/sw/bin/nvidia-container-runtime"
to /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
but after rebooting I get this from nerdctl:
sudo nerdctl run -it --gpus=all ubuntu nvidia-smi
FATA[0004] failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: error running hook #0: error running hook: exit status 3, stdout: , stderr: No help topic for 'oci-hook': unknown
but it is the --gpus=all flag that is the issue, as without it nerdctl runs just fine:
sudo nerdctl run -it ubuntu /bin/bash -c "echo good"
good
I have also tried without these two lines:
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/run/current-system/sw/bin/nvidia-container-runtime"
as the note on the wiki page suggests, but without any luck.
Does anyone else have this working?
thoth
June 7, 2024, 7:32pm
2
Even attempting to get Docker running with nvidia has issues on my setup:
sudo docker run --rm --runtime=nvidia --gpus all ubuntu /bin/bash -c "echo one"
docker: Error response from daemon: failed to create task for container: failed to create shim task: OCI runtime create failed: /nix/store/va74ykggqzmamwh2aj39fxlwzf8csw6s-nvidia-docker/bin/nvidia-container-runtime did not terminate successfully: exit status 125: unknown.
thoth
June 7, 2024, 7:35pm
3
Just to be certain: nvidia does indeed work outside of the container.
nvidia-smi
Fri Jun 7 14:34:04 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.78 Driver Version: 550.78 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 3050 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 42C P8 6W / 35W | 10MiB / 4096MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| 0 N/A N/A 2844 G ...c90hy96r4-xorg-server-21.1.13/bin/X 4MiB |
+-----------------------------------------------------------------------------------------+
and I can run video games and LLMs from ollama and watch nvtop spike, etc.
Hi,
Exactly the same issue here. Did you ever find a solution to this?
Edit:
Never mind, I solved it. I just changed BinaryName to "/run/current-system/sw/bin/nvidia-container-runtime.cdi".
thoth
July 8, 2024, 5:54pm
5
Actually, I did not solve this. Can you post your config?
Here is the full config, so I don't miss anything. A lot of stuff is obviously not needed for the GPU passthrough.
# configuration.nix:
#
{ config, lib, pkgs, ... }:
{
imports =
  [
    # Include the results of the hardware scan.
    ./hardware-configuration.nix
  ];
# Use the systemd-boot EFI boot loader.
boot.loader.systemd-boot.enable = true;
boot.loader.efi.canTouchEfiVariables = true;
nixpkgs.config.allowUnfree = true;
hardware.enableAllFirmware = true;
hardware.cpu.amd.updateMicrocode = true;
hardware.opengl.driSupport32Bit = true;
# Enable OpenGL
hardware.opengl = {
  enable = true;
};
# Load nvidia driver for Xorg and Wayland
services.xserver.videoDrivers = ["nvidia"]; #probably not needed
hardware.nvidia = {
  # Modesetting is required.
  modesetting.enable = true;
  # Nvidia power management. Experimental, and can cause sleep/suspend to fail.
  # Enable this if you have graphical corruption issues or application crashes after waking
  # up from sleep. This fixes it by saving the entire VRAM memory to /tmp/ instead
  # of just the bare essentials.
  powerManagement.enable = false;
  # Fine-grained power management. Turns off GPU when not in use.
  # Experimental and only works on modern Nvidia GPUs (Turing or newer).
  powerManagement.finegrained = false;
  # Use the NVidia open source kernel module (not to be confused with the
  # independent third-party "nouveau" open source driver).
  # Support is limited to the Turing and later architectures. Full list of
  # supported GPUs is at:
  # https://github.com/NVIDIA/open-gpu-kernel-modules#compatible-gpus
  # Only available from driver 515.43.04+
  # Currently alpha-quality/buggy, so false is currently the recommended setting.
  open = false;
  # Enable the Nvidia settings menu,
  # accessible via `nvidia-settings`.
  nvidiaSettings = false;
  # Optionally, you may need to select the appropriate driver version for your specific GPU.
  package = config.boot.kernelPackages.nvidiaPackages.stable;
};
boot.kernel.sysctl = {
  "fs.inotify.max_user_instances" = 8192;
  "fs.inotify.max_user_watches" = 524288;
  "kernel.dmesg_restrict" = 1;
  "net.ipv4.conf.all.forwarding" = 1;
  "net.ipv6.conf.all.forwarding" = 1;
  "net.ipv4.conf.all.accept_redirects" = 1;
  "net.ipv6.conf.all.accept_redirects" = 1;
  "kernel.sysrq" = 1;
};
boot.extraModprobeConfig = ''
  options vfio_iommu_type1 allow_unsafe_interrupts=1
  options kvm ignore_msrs=1
  options kvm report_ignored_msrs=0
'';
virtualisation.docker = {
  enable = true;
  enableNvidia = true;
};
hardware.nvidia-container-toolkit.enable = true;
virtualisation.vswitch.enable = true;
virtualisation.vswitch.resetOnStart = true;
networking.vswitches = {
  br0 = {
    interfaces = {
      enp1s0 = { };
      vlan50 = {
        type = "internal";
        vlan = 50;
      };
    };
  };
};
networking = {
  interfaces = {
    vlan50.ipv4.addresses = [{
      address = "XXX";
      prefixLength = 24;
    }];
    vlan50.ipv6.addresses = [{
      address = "XXX";
      prefixLength = 64;
    }];
  };
  defaultGateway = {
    address = "XXX";
    interface = "vlan50";
  };
  defaultGateway6 = {
    address = "XXX";
    interface = "vlan50";
  };
  nameservers = [ "XXX" ];
  hostName = "k8s-gpu";
  domain = "XXX";
  useDHCP = false;
};
# Set your time zone.
time.timeZone = "Europe/Stockholm";
# Enable the OpenSSH daemon.
services.openssh = {
  enable = true;
};
services.openssh.settings.PasswordAuthentication = false;
services.openssh.settings.PermitRootLogin = "without-password";
# Define a user account. Don't forget to set a password with ‘passwd’.
users.users.root.openssh.authorizedKeys.keys = [
  "XXX"
];
users.users.XXX = {
  isNormalUser = true;
  extraGroups = [ "wheel" ]; # Enable ‘sudo’ for the user.
  openssh.authorizedKeys.keys = [
    "XXX"
  ];
};
# List packages installed in system profile. To search, run:
# $ nix search wget
environment.systemPackages = with pkgs; [
  vim
  wget
  dig
  nfs-utils
  usbutils
  pciutils
  git
  k3s
  kubectl
  kubernetes-helm
  kubevirt
  calicoctl
  openiscsi
  docker
  runc
];
# Open ports in the firewall.
networking.firewall.allowedTCPPorts = [ 22 179 10250 ];
# networking.firewall.allowedUDPPorts = [ ... ];
# Or disable the firewall altogether.
networking.firewall.enable = true;
services.k3s = {
  enable = true;
  role = "agent";
  token = "XXX";
  serverAddr = "https://XXX:6443";
  extraFlags = toString [
    "--node-ip=XXX,XXX"
  ];
};
# cat /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
{{ template "base" . }}
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/run/current-system/sw/bin/nvidia-container-runtime.cdi"
# nvidia.yaml
apiVersion: node.k8s.io/v1
handler: nvidia
kind: RuntimeClass
metadata:
  labels:
    app.kubernetes.io/component: gpu-operator
  name: nvidia
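For testing the RuntimeClass end to end, something like this minimal pod should exercise the nvidia handler. This is only a sketch: the pod name, image tag, and the NVIDIA_VISIBLE_DEVICES approach are example choices rather than part of the setup above, and if the NVIDIA device plugin is deployed you would request nvidia.com/gpu under resources.limits instead.
# gpu-test.yaml (example only, not part of the config above)
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test
spec:
  runtimeClassName: nvidia  # must match the RuntimeClass name above
  restartPolicy: Never
  containers:
    - name: nvidia-smi
      # any CUDA base image that ships nvidia-smi should do
      image: nvidia/cuda:12.4.1-base-ubuntu22.04
      command: ["nvidia-smi"]
      env:
        # asks the nvidia runtime hook to expose all GPUs to the container
        - name: NVIDIA_VISIBLE_DEVICES
          value: "all"
Then kubectl apply -f gpu-test.yaml followed by kubectl logs gpu-test should print the same nvidia-smi table as on the host if the passthrough works.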