Nix Flake `devShells` CUDAExecutionProvider (NVIDIA) error with Python `onnxruntime`

Howdy y’all! I’m trying to, eventually, package the Deep-Live-Cam Python application for NixOS, and am running into skill issues with NVIDIA/CUDA.

To be totally clear: it works, very slowly, without the GPU; my main aim is to make it work with the GPU… and ideally faster too.

Configuration files

Relevant system configuration:

/etc/nixos/boot-stuff.nix

# NixOS hardware/boot module: QEMU guest profile, firmware, NVIDIA driver,
# VFIO passthrough modules, and the CUDA binary cache.
{
  config,
  lib,
  modulesPath,
  ...
}:

{
  ## From: NixOS install via Virt-Manager
  imports = [
    (modulesPath + "/profiles/qemu-guest.nix")
  ];

  ## Enable non-free drivers to make WiFi mostly work
  hardware.enableRedistributableFirmware = true;

  ## Maybe fix WiFi not showing 5ghz networks
  hardware.enableAllFirmware = true;

  ## Enable OpenGL
  # Installs the userspace graphics stack (including /run/opengl-driver)
  # that GPU programs resolve their driver libraries from at runtime.
  hardware.graphics.enable = true;

  ## Load nvidia driver for Xorg and Wayland
  # NOTE(review): only the driver selection is set here; no
  # `hardware.nvidia.*` options (e.g. `open`, `modesetting.enable`) are
  # configured in this module — presumably defaults apply; verify they
  # match the GPU generation.
  services.xserver.videoDrivers = [ "nvidia" ];

  boot.initrd.availableKernelModules = [
    "ahci"
    "rtsx_pci_sdmmc"
    "sd_mod"
    "sr_mod"
    "usb_storage"
    "virtio_pci"
    "virtio_scsi"
    "xhci_pci"
  ];

  ## From: `nixos-generate-config`
  hardware.cpu.intel.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;

  ## From: NixOS install via Virt-Manager
  ## https://wiki.nixos.org/wiki/PCI_passthrough
  # VFIO modules are loaded from the initrd so the device can be claimed
  # for passthrough before a regular driver binds it.
  boot.initrd.kernelModules = [
    "vfio"
    "vfio_pci"
    "vfio_iommu_type1"
  ];
  boot.kernelModules = [ "kvm-intel" ];

  # Enables DHCP on each ethernet and wireless interface. In case of scripted networking
  # (the default) this is the recommended approach. When using systemd-networkd it's
  # still possible to use this option, but it's recommended to use it in conjunction
  # with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
  networking.useDHCP = lib.mkDefault true;

  nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";

  # https://wiki.nixos.org/wiki/CUDA#Setting_up_CUDA_Binary_Cache
  # Binary cache for CUDA-enabled builds; without it, `config.cudaSupport`
  # forces large local rebuilds (magma, opencv, ...).
  nix.settings = {
    substituters = [
      "https://cache.nixos-cuda.org"
    ];
    trusted-public-keys = [
      "cache.nixos-cuda.org:74DUi4Ye579gUqzH4ziL9IyiJBlDpMRn9MBN8oNan9M="
    ];
  };
}
Partially functional:

~/git/hub/hacksider/Deep-Live-Cam/flake.nix

{
  description = "Nix Flake for installing and running project in reproducible fashion";

  inputs = {
    # nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
  };

  outputs =
    { ... }:
    let
      system = "x86_64-linux";

      # NOTE: `import <nixpkgs>` resolves the channel from NIX_PATH, which is
      # why every `nix develop` invocation below needs `--impure`.  Enabling
      # the commented-out `inputs.nixpkgs` pin would make the flake pure.
      nixpkgs = import <nixpkgs> {
        inherit system;
        config.allowUnfree = true;

        ## WARN: enabling following without cache causes magma to rebuild opencv and other things :-|
        config.cudaSupport = true;
        # Accept packages whose only non-free licenses are the NVIDIA EULAs.
        config.allowUnfreePredicate =
          p:
          builtins.all (
            license:
            license.free
            || builtins.elem license.shortName [
              "CUDA EULA"
              "cuDNN EULA"
              "cuTENSOR EULA"
              "NVidia OptiX EULA"
            ]
          ) (if builtins.isList p.meta.license then p.meta.license else [ p.meta.license ]);
      };

      # `system` is a platform "double" such as "x86_64-linux" or
      # "aarch64-darwin"; the bare string "darwin" can never equal it, so the
      # previous `system == "darwin"` / `system != "darwin"` comparisons were
      # statically dead / always-true.  Test the suffix instead.  For the
      # hard-coded "x86_64-linux" above this evaluates exactly as before.
      isDarwin = nixpkgs.lib.hasSuffix "darwin" system;

      python3Packages = nixpkgs.pkgs.python3Packages;
      python3Exec = nixpkgs.lib.getExe nixpkgs.pkgs.python3;

      # PyPI-only package (not in nixpkgs): NSFW-image detector used by
      # Deep-Live-Cam.
      opennsfw2 = python3Packages.buildPythonPackage rec {
        pname = "opennsfw2";
        version = "0.10.2";
        src = python3Packages.fetchPypi {
          inherit pname version;
          hash = "sha256-xs6gcy3A8Y52YWXAg0JXechMpqAfEWm/pdDUqgUxHk8=";
        };
        doCheck = false;
        pyproject = true;
        # Only the PEP 517 build backend belongs in `build-system`.
        build-system = with python3Packages; [
          setuptools
          wheel
        ];
        # Runtime requirements are `dependencies` (propagated), so anything
        # importing opennsfw2 gets them on its Python path too.
        dependencies = with python3Packages; [
          gdown
          matplotlib
          numpy
          opencv-python
          pillow
          scikit-image
          tensorflow
          tqdm
        ];
        # ffmpeg is a native tool, not a Python library, so it is propagated
        # as a plain build input rather than a Python dependency.
        propagatedBuildInputs = [ nixpkgs.pkgs.ffmpeg ];
      };

      # Dependencies shared by every devShell below.
      buildInputsDefault =
        with python3Packages;
        [
          numpy
          typing-extensions
          opencv4Full
          cv2-enumerate-cameras
          onnx
          onnxruntime
          insightface
          psutil
          tkinter
          customtkinter
          pillow
          opennsfw2
          protobuf
          torchvision
        ]
        ++ (nixpkgs.lib.optionals (!isDarwin) (
          with python3Packages;
          [
            tensorflow
          ]
        ))
        ++ (nixpkgs.lib.optionals isDarwin (
          with python3Packages;
          [
            torch
          ]
        ));
    in
    {
      devShells.${system} = {
        /**
          Run with specified execution provider
        */
        run-with = {
          /**
            ## Slow but works

            ```bash
            nix develop --impure .#run-with.cpu
            ```
          */
          cpu = nixpkgs.pkgs.mkShell {
            buildInputs =
              buildInputsDefault
              ++ (with python3Packages; [
                torch
              ]);

            shellHook = ''
              ${python3Exec} run.py;
              exit;
            '';
          };

          /**
            ## Nvidia or Cuda is broken here

            ```bash
            nix develop --impure .#run-with.cuda
            ```
          */
          # NOTE(review): onnxruntime built with cudaSupport resolves the
          # driver's libcuda via /run/opengl-driver/lib on NixOS; if cuDNN
          # convolutions still fail (CUDNN_STATUS_EXECUTION_FAILED), check
          # that the installed kernel driver is compatible with the cuDNN
          # version onnxruntime was built against — TODO confirm.
          cuda = nixpkgs.pkgs.mkShell {
            buildInputs = buildInputsDefault;
            shellHook = ''
              ${python3Exec} run.py --execution-provider cuda;
              exit;
            '';
          };

          /**
            ## Nvidia or Cuda is broken here too

            ```bash
            nix develop --impure .#run-with.cuda-fhs
            ```
          */
          cuda-fhs =
            let
              # One coherent interpreter environment so the FHS env sees a
              # single site-packages instead of many separate store paths.
              pythonWithTkinter = nixpkgs.pkgs.python3.withPackages (
                ps: with ps; [
                  numpy
                  typing-extensions
                  cv2-enumerate-cameras # cv2_enumerate_cameras==1.1.15
                  onnx
                  onnxruntime
                  insightface
                  psutil
                  tkinter # tk==0.1.0
                  customtkinter
                  pillow
                  opennsfw2
                  protobuf
                  torchvision
                  torchWithCuda
                  tensorflow
                ]
              );
            in
            (nixpkgs.pkgs.buildFHSEnv {
              name = "cuda-fhs";

              profile = ''
                export LD_LIBRARY_PATH="${nixpkgs.pkgs.linuxPackages.nvidia_x11}/lib";
                export CUDA_PATH="${nixpkgs.pkgs.cudatoolkit}";
                export EXTRA_LDFLAGS="-L/lib -L${nixpkgs.pkgs.linuxPackages.nvidia_x11}/lib";
                export EXTRA_CCFLAGS="-I/usr/include";
                # Derive site-packages from the interpreter instead of
                # hard-coding "python3.13", so a Python version bump cannot
                # silently break the import path.
                export PYTHONPATH="${pythonWithTkinter}/${nixpkgs.pkgs.python3.sitePackages}:$PYTHONPATH";
              '';

              runScript = ''
                ${pythonWithTkinter}/bin/python3 run.py --execution-provider cuda;
                exit;
              '';
            }).env;
        };
      };
    };
}
Stack-trace errors for:
nix develop --impure .#run-with.cuda
[ERROR:0@10.429] global obsensor_uvc_stream_channel.cpp:163 getStreamChannelGroup Camera index out of range
[ WARN:0@22.180] global cap_gstreamer.cpp:1173 isPipelinePlaying OpenCV | GStreamer warning: GStreamer: pipeline have not been created
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider': {'sdpa_kernel': '0', 'use_tf32': '1', 'fuse_conv_bias': '0', 'prefer_nhwc': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_enable': '0', 'use_ep_level_unified_stream': '0', 'device_id': '0', 'has_user_compute_stream': '0', 'gpu_external_empty_cache': '0', 'cudnn_conv_algo_search': 'EXHAUSTIVE', 'cudnn_conv1d_pad_to_nc1d': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_alloc': '0', 'gpu_external_free': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'enable_cuda_graph': '0', 'user_compute_stream': '0', 'cudnn_conv_use_max_workspace': '1'}}
model ignore: /home/s0ands0/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider': {'sdpa_kernel': '0', 'use_tf32': '1', 'fuse_conv_bias': '0', 'prefer_nhwc': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_enable': '0', 'use_ep_level_unified_stream': '0', 'device_id': '0', 'has_user_compute_stream': '0', 'gpu_external_empty_cache': '0', 'cudnn_conv_algo_search': 'EXHAUSTIVE', 'cudnn_conv1d_pad_to_nc1d': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_alloc': '0', 'gpu_external_free': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'enable_cuda_graph': '0', 'user_compute_stream': '0', 'cudnn_conv_use_max_workspace': '1'}}
model ignore: /home/s0ands0/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider': {'sdpa_kernel': '0', 'use_tf32': '1', 'fuse_conv_bias': '0', 'prefer_nhwc': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_enable': '0', 'use_ep_level_unified_stream': '0', 'device_id': '0', 'has_user_compute_stream': '0', 'gpu_external_empty_cache': '0', 'cudnn_conv_algo_search': 'EXHAUSTIVE', 'cudnn_conv1d_pad_to_nc1d': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_alloc': '0', 'gpu_external_free': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'enable_cuda_graph': '0', 'user_compute_stream': '0', 'cudnn_conv_use_max_workspace': '1'}}
find model: /home/s0ands0/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider': {'sdpa_kernel': '0', 'use_tf32': '1', 'fuse_conv_bias': '0', 'prefer_nhwc': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_enable': '0', 'use_ep_level_unified_stream': '0', 'device_id': '0', 'has_user_compute_stream': '0', 'gpu_external_empty_cache': '0', 'cudnn_conv_algo_search': 'EXHAUSTIVE', 'cudnn_conv1d_pad_to_nc1d': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_alloc': '0', 'gpu_external_free': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'enable_cuda_graph': '0', 'user_compute_stream': '0', 'cudnn_conv_use_max_workspace': '1'}}
model ignore: /home/s0ands0/.insightface/models/buffalo_l/genderage.onnx genderage
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider': {'sdpa_kernel': '0', 'use_tf32': '1', 'fuse_conv_bias': '0', 'prefer_nhwc': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_enable': '0', 'use_ep_level_unified_stream': '0', 'device_id': '0', 'has_user_compute_stream': '0', 'gpu_external_empty_cache': '0', 'cudnn_conv_algo_search': 'EXHAUSTIVE', 'cudnn_conv1d_pad_to_nc1d': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_alloc': '0', 'gpu_external_free': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'enable_cuda_graph': '0', 'user_compute_stream': '0', 'cudnn_conv_use_max_workspace': '1'}}
find model: /home/s0ands0/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None', 3, 112, 112] 127.5 127.5
set det-size: (320, 320)
2026-02-18 21:25:24.309620807 [E:onnxruntime:Default, cudnn_fe_call.cc:33 CudaErrString<cudnn_frontend::error_object>] execute(handle, plan->get_raw_desc(), variant_pack_descriptor.get_ptr()) failed with message: func(handle, stream, m, n, k, static_cast<const T_IN*>(d_B), ldb, static_cast<T_OUT*>(d_C), static_cast<const T_IN*>(d_A), parms, texB, texB_offset, alphaVal, betaVal, doBiasAct, static_cast<const T_OUT*>(zData), static_cast<const T_OUT*>(bias), doRelu, propNan, dilation_h, dilation_w, outputType, launch_params), and code: CUDNN_STATUS_EXECUTION_FAILED
2026-02-18 21:25:24.309686510 [E:onnxruntime:Default, cudnn_fe_call.cc:93 CudaCall] CUDNN_FE failure 11: CUDNN_BACKEND_API_FAILED ; GPU=0 ; hostname=nixos ; file=/build/source/onnxruntime/core/providers/cuda/nn/conv.cc ; line=483 ; expr=s_.cudnn_fe_graph->execute(cudnn_handle, s_.variant_pack, ws.get()); 
2026-02-18 21:25:24.309706058 [E:onnxruntime:, sequential_executor.cc:572 ExecuteKernel] Non-zero status code returned while running Conv node. Name:'Conv_0' Status Message: CUDNN_FE failure 11: CUDNN_BACKEND_API_FAILED ; GPU=0 ; hostname=nixos ; file=/build/source/onnxruntime/core/providers/cuda/nn/conv.cc ; line=483 ; expr=s_.cudnn_fe_graph->execute(cudnn_handle, s_.variant_pack, ws.get()); 
Exception in thread Thread-2 (_processing_thread_func):
Traceback (most recent call last):
  File "/nix/store/slhpx9glq7vl99bwi93bgrhn3syv98s1-python3-3.13.11/lib/python3.13/threading.py", line 1044, in _bootstrap_inner
    self.run()
    ~~~~~~~~^^
  File "/nix/store/slhpx9glq7vl99bwi93bgrhn3syv98s1-python3-3.13.11/lib/python3.13/threading.py", line 995, in run
    self._target(*self._args, **self._kwargs)
    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/s0ands0/git/hub/hacksider/Deep-Live-Cam/wt/nix-flake/modules/ui.py", line 1015, in _processing_thread_func
    source_image = get_one_face(cv2.imread(modules.globals.source_path))
  File "/home/s0ands0/git/hub/hacksider/Deep-Live-Cam/wt/nix-flake/modules/face_analyser.py", line 38, in get_one_face
    face = get_face_analyser().get(frame)
  File "/nix/store/z1vg2l4397y5jj0ljyac9qsqml8q15nv-python3.13-insightface-0.7.3/lib/python3.13/site-packages/insightface/app/face_analysis.py", line 59, in get
    bboxes, kpss = self.det_model.detect(img,
                   ~~~~~~~~~~~~~~~~~~~~~^^^^^
                                         max_num=max_num,
                                         ^^^^^^^^^^^^^^^^
                                         metric='default')
                                         ^^^^^^^^^^^^^^^^^
  File "/nix/store/z1vg2l4397y5jj0ljyac9qsqml8q15nv-python3.13-insightface-0.7.3/lib/python3.13/site-packages/insightface/model_zoo/retinaface.py", line 224, in detect
    scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh)
                                          ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/z1vg2l4397y5jj0ljyac9qsqml8q15nv-python3.13-insightface-0.7.3/lib/python3.13/site-packages/insightface/model_zoo/retinaface.py", line 152, in forward
    net_outs = self.session.run(self.output_names, {self.input_name : blob})
  File "/nix/store/phxz157dmv9rxqv8bcmwq2k6gxnibi35-python3.13-onnxruntime-1.23.2/lib/python3.13/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py", line 287, in run
    return self._sess.run(output_names, input_feed, run_options)
           ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
onnxruntime.capi.onnxruntime_pybind11_state.Fail: [ONNXRuntimeError] : 1 : FAIL : Non-zero status code returned while running Conv node. Name:'Conv_0' Status Message: CUDNN_FE failure 11: CUDNN_BACKEND_API_FAILED ; GPU=0 ; hostname=nixos ; file=/build/source/onnxruntime/core/providers/cuda/nn/conv.cc ; line=483 ; expr=s_.cudnn_fe_graph->execute(cudnn_handle, s_.variant_pack, ws.get()); 
Logs of the CPU run working fine via:
nix develop --impure .#run-with.cpu
[ERROR:0@10.370] global obsensor_uvc_stream_channel.cpp:163 getStreamChannelGroup Camera index out of range
[ WARN:0@24.603] global cap_gstreamer.cpp:1173 isPipelinePlaying OpenCV | GStreamer warning: GStreamer: pipeline have not been created
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
model ignore: /home/s0ands0/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
model ignore: /home/s0ands0/.insightface/models/buffalo_l/2d106det.onnx landmark_2d_106
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/s0ands0/.insightface/models/buffalo_l/det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
model ignore: /home/s0ands0/.insightface/models/buffalo_l/genderage.onnx genderage
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
find model: /home/s0ands0/.insightface/models/buffalo_l/w600k_r50.onnx recognition ['None', 3, 112, 112] 127.5 127.5
set det-size: (320, 320)
[DLC.FACE-SWAPPER] Loading face swapper model from: /home/s0ands0/git/hub/hacksider/Deep-Live-Cam/wt/nix-flake/models/inswapper_128.onnx
Applied providers: ['CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}}
inswapper-shape: [1, 3, 128, 128]
[DLC.FACE-SWAPPER] Face swapper model loaded successfully.

Kernel Module and DMesg logs and device metadata

The kernel module looks to be loaded:
lsmod | grep nvidia
nvidia_drm            143360  0
nvidia_modeset       1933312  1 nvidia_drm
nvidia_uvm           3858432  0
nvidia              111611904  3 nvidia_uvm,nvidia_drm,nvidia_modeset
drm_ttm_helper         20480  1 nvidia_drm
video                  81920  4 dell_wmi,dell_laptop,i915,nvidia_modeset
And `dmesg` doesn't show many concerning entries about the GPU:
sudo dmesg | grep -iE 'gpu|graphics|nvidia'
```
[ 0.000000] Command line: initrd=\EFI\nixos\m3hmc5rvpdjp37m051dhypsmz30clh4a-initrd-linux-6.18.8-initrd.efi init=/nix/store/7x2bsmz2mms74vrfcnqvdgh90k1zc08a-nixos-system-nixos-26.05.20260204.00c21e4/init i915.enable_guc=2 fsck.mode=force fsck.repair=preen intel_iommu=on iommu=pt loglevel=4 lsm=landlock,yama,bpf nvidia-drm.modeset=1 nvidia-drm.fbdev=1
[ 0.059658] Reserving Intel graphics memory at [mem 0x79000000-0x7cffffff]
[ 0.067512] Kernel command line: initrd=\EFI\nixos\m3hmc5rvpdjp37m051dhypsmz30clh4a-initrd-linux-6.18.8-initrd.efi init=/nix/store/7x2bsmz2mms74vrfcnqvdgh90k1zc08a-nixos-system-nixos-26.05.20260204.00c21e4/init i915.enable_guc=2 fsck.mode=force fsck.repair=preen intel_iommu=on iommu=pt loglevel=4 lsm=landlock,yama,bpf nvidia-drm.modeset=1 nvidia-drm.fbdev=1
[ 1.565082] stage-1-init: [Fri Feb 20 16:50:07 UTC 2026] loading module virtio_gpu...
[ 13.287757] RAPL PMU: hw unit of domain pp1-gpu 2^-14 Joules
[ 13.444790] nvidia: module license 'NVIDIA' taints kernel.
[ 13.444803] nvidia: module license taints kernel.
[ 14.075989] nvidia-nvlink: Nvlink Core is being initialized, major device number 511
[ 14.085347] nvidia 0000:02:00.0: enabling device (0006 -> 0007)
[ 14.312659] NVRM: loading NVIDIA UNIX x86_64 Kernel Module 580.126.09 Wed Jan 7 22:59:56 UTC 2026
[ 14.546130] nvidia_uvm: module uses symbols nvUvmInterfaceDisableAccessCntr from proprietary module nvidia, inheriting taint.
[ 14.803094] nvidia-modeset: Loading NVIDIA Kernel Mode Setting Driver for UNIX platforms 580.126.09 Wed Jan 7 22:32:52 UTC 2026
[ 14.821338] [drm] [nvidia-drm] [GPU ID 0x00000200] Loading driver
[ 15.051255] [drm] Initialized nvidia-drm 0.0.0 for 0000:02:00.0 on minor 0
[ 15.051273] nvidia 0000:02:00.0: [drm] No compatible format found
[ 15.051276] nvidia 0000:02:00.0: [drm] Cannot find any crtc or sizes
```
Output of:
nix-info -m
 - system: `"x86_64-linux"`
 - host os: `Linux 6.18.8, NixOS, 26.05 (Yarara), 26.05.20260204.00c21e4`
 - multi-user?: `yes`
 - sandbox: `yes`
 - version: `nix-env (Nix) 2.31.3`
 - nixpkgs: `/nix/store/ih9vmk2a3mrk6vhmibqzji6kjc6parzp-source`

Attributions