buildPythonPackage flash-attn error: auto-patchelf could not satisfy dependency libc10.so

I am trying to build a Python package, namely flash-attn with CUDA, but the build gets stuck on satisfying the dependencies that auto-patchelf requires. This is not my first time doing this, so of course I looked up the required packages and added them to the build inputs list, however that has zero effect on this build! For instance, it complains about a missing libcudart.so.12, which should have been fixed by adding the cuda_cudart dependency, but nope! It also resolves the libstdc++.so.6 dependency to gcc-13.2.0-lib, even though I explicitly added the gcc12Stdenv.defaultNativeBuildInputs list to the build inputs! How can I provide the dependencies then?! I am so confused! Any hint or idea would be appreciated. Thanks.

{ pkgs ? import <nixpkgs> {
    config.allowUnfree = true;
    config.cudaSupport = true;
  }
}:
let
  flash-attn = with pkgs; with pkgs.python3.pkgs; buildPythonPackage rec {
    pname = "flash_attn";
    version = "2.6.3";
    format = "pyproject";

    disabled = pythonOlder "3.9";

    src = fetchPypi {
      inherit pname version;
      hash = "sha256-W/rpUArY59KTfrzLSQbzvEZNG/Zu7dDkravVIIEce1I=";
    };

    inputsEnv = python3.withPackages (p: with p; [
      torchWithCuda
      psutil
      ninja
      pybind11
      einops
    ]);

    doCheck = false;

    nativeBuildInputs = [
      git
      which
      setuptools
      libtorch-bin
      autoPatchelfHook
      wheel
      inputsEnv
    ] ++ (with cudaPackages; [
      cuda_nvcc
      cudatoolkit
      cuda_cudart
      cudnn
    ])
    ++ gcc12Stdenv.defaultNativeBuildInputs;

    CUDA_HOME = "${cudaPackages.cudatoolkit}";
    NIX_SSL_CERT_FILE = "${cacert}/etc/ssl/certs/ca-bundle.crt";
    LD_LIBRARY_PATH = "/usr/lib64";
    EXTRA_LDFLAGS = "-L/lib -L${linuxPackages.nvidia_x11}/lib";
    EXTRA_CCFLAGS = "-I/usr/include";
  };

in
pkgs.mkShell {
  name = "flash-attn";

  buildInputs = with pkgs; [
    flash-attn
  ];
}

The output is:

$ nix-shell
trace: warning: cudaPackages.autoAddDriverRunpath is deprecated, use pkgs.autoAddDriverRunpath instead
trace: warning: cudaPackages.autoAddDriverRunpath is deprecated, use pkgs.autoAddDriverRunpath instead
trace: warning: cudaPackages.autoFixElfFiles is deprecated, use pkgs.autoFixElfFiles instead
trace: warning: cudaPackages.autoAddOpenGLRunpathHook is deprecated, use pkgs.autoAddDriverRunpath instead
these 3 derivations will be built:
  /nix/store/q96ahxc95q2dq08bwy35xxzk6jp31gm7-libtorch-cxx11-abi-shared-with-deps-2.3.0-cu121.zip.drv
  /nix/store/00z96avhmsjx3gq0yms76a1255p92c1x-libtorch-2.3.0.drv
  /nix/store/fq42lqzggc61h539mng97qhpqy3jn9a7-python3.11-flash_attn-2.6.3.drv
building '/nix/store/q96ahxc95q2dq08bwy35xxzk6jp31gm7-libtorch-cxx11-abi-shared-with-deps-2.3.0-cu121.zip.drv'...

trying https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.3.0%2Bcu121.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2414M  100 2414M    0     0  21.8M      0  0:01:50  0:01:50 --:--:-- 22.0M
unpacking source archive /tmp/nix-build-libtorch-cxx11-abi-shared-with-deps-2.3.0-cu121.zip.drv-0/libtorch-cxx11-abi-shared-with-deps-2.3.0%2Bcu121.zip

building '/nix/store/00z96avhmsjx3gq0yms76a1255p92c1x-libtorch-2.3.0.drv'...
Running phase: unpackPhase
unpacking source archive /nix/store/1xwsg7ibx0wz1jgsmranm7nagjcim3ir-libtorch-cxx11-abi-shared-with-deps-2.3.0-cu121.zip
source root is libtorch-cxx11-abi-shared-with-deps-2.3.0-cu121.zip
Running phase: patchPhase
Running phase: updateAutotoolsGnuConfigScriptsPhase
Running phase: installPhase
substituteStream(): WARNING: '--replace' is deprecated, use --replace-{fail,warn,quiet}. (file '/nix/store/v555g84n7wl6yxzwpmn3kfhip7lv8fys-libtorch-2.3.0-dev/share/cmake/Torch/TorchConfig.cmake')
Running phase: fixupPhase
shrinking RPATHs of ELF executables and libraries in /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_global_deps.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libc10d_cuda_test.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_ops_infer.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcublas-37d11411.so.12
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnvrtc-b51b459d.so.12
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libc10.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libjitbackend_test.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcaffe2_nvrtc.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_cuda_linalg.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_cnn_train.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_cuda.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_cnn_infer.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_ops_train.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libgomp-98b21ff3.so.1
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libc10_cuda.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_adv_infer.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libbackend_with_compiler.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudart-9335f6a2.so.12
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorchbind_test.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcublasLt-f97bfc2c.so.12
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_python.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libshm.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnvrtc-builtins-6c5639ce.so.12.1
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_adv_train.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnvToolsExt-847d78f2.so.1
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_cpu.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnnapi_backend.so
shrinking RPATHs of ELF executables and libraries in /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_global_deps.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libc10d_cuda_test.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_ops_infer.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcublas-37d11411.so.12
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnvrtc-b51b459d.so.12
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libc10.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libjitbackend_test.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcaffe2_nvrtc.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_cuda_linalg.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_cnn_train.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_cuda.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_cnn_infer.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_ops_train.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libgomp-98b21ff3.so.1
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libc10_cuda.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_adv_infer.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libbackend_with_compiler.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudart-9335f6a2.so.12
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorchbind_test.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcublasLt-f97bfc2c.so.12
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_python.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libshm.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnvrtc-builtins-6c5639ce.so.12.1
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_adv_train.so.8
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnvToolsExt-847d78f2.so.1
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_cpu.so
shrinking /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnnapi_backend.so
checking for references to /tmp/nix-build-libtorch-2.3.0.drv-0/ in /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0...
patching script interpreter paths in /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0
shrinking RPATHs of ELF executables and libraries in /nix/store/v555g84n7wl6yxzwpmn3kfhip7lv8fys-libtorch-2.3.0-dev
shrinking RPATHs of ELF executables and libraries in /nix/store/v555g84n7wl6yxzwpmn3kfhip7lv8fys-libtorch-2.3.0-dev
checking for references to /tmp/nix-build-libtorch-2.3.0.drv-0/ in /nix/store/v555g84n7wl6yxzwpmn3kfhip7lv8fys-libtorch-2.3.0-dev...
patching script interpreter paths in /nix/store/v555g84n7wl6yxzwpmn3kfhip7lv8fys-libtorch-2.3.0-dev
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_global_deps.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libc10d_cuda_test.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_ops_infer.so.8...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcublas-37d11411.so.12...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnvrtc-b51b459d.so.12...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libc10.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libjitbackend_test.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcaffe2_nvrtc.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_cuda_linalg.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn.so.8...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_cnn_train.so.8...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_cuda.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_cnn_infer.so.8...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_ops_train.so.8...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libgomp-98b21ff3.so.1...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libc10_cuda.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_adv_infer.so.8...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libbackend_with_compiler.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudart-9335f6a2.so.12...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorchbind_test.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcublasLt-f97bfc2c.so.12...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_python.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libshm.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnvrtc-builtins-6c5639ce.so.12.1...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libcudnn_adv_train.so.8...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnvToolsExt-847d78f2.so.1...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libtorch_cpu.so...
setting rpath for /nix/store/brczr6kpca18sv7w6c6wsrx0vw1dw19v-libtorch-2.3.0/lib/libnnapi_backend.so...
fixupPhase completed in 47 seconds
building '/nix/store/fq42lqzggc61h539mng97qhpqy3jn9a7-python3.11-flash_attn-2.6.3.drv'...
Sourcing python-remove-tests-dir-hook
Sourcing python-catch-conflicts-hook.sh
Sourcing python-remove-bin-bytecode-hook.sh
Sourcing pypa-build-hook
Using pypaBuildPhase
Sourcing python-runtime-deps-check-hook
Using pythonRuntimeDepsCheckHook
Sourcing pypa-install-hook
Using pypaInstallPhase
Sourcing python-imports-check-hook.sh
Using pythonImportsCheckPhase
Sourcing python-namespaces-hook
Sourcing python-catch-conflicts-hook.sh
Sourcing setup-cuda-hook
Running phase: unpackPhase
unpacking source archive /nix/store/7fhwbdhgx2f92h5pm9davbasb3lgwkls-flash_attn-2.6.3.tar.gz
source root is flash_attn-2.6.3
setting SOURCE_DATE_EPOCH to timestamp 1721951149 of file flash_attn-2.6.3/setup.cfg
Running phase: patchPhase
Running phase: updateAutotoolsGnuConfigScriptsPhase
Running phase: configurePhase
Executing setupCUDAToolkitCompilers
no configure script, doing nothing
Running phase: buildPhase
Executing pypaBuildPhase
Creating a wheel...
* Getting build dependencies for wheel...
fatal: not a git repository (or any of the parent directories): .git


torch.__version__  = 2.3.0


* Building wheel...
fatal: not a git repository (or any of the parent directories): .git


torch.__version__  = 2.3.0


running bdist_wheel
Guessing wheel URL:  https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.3cxx11abiTRUE-cp311-cp311-linux_x86_64.whl
Raw wheel path /tmp/nix-build-python3.11-flash_attn-2.6.3.drv-0/flash_attn-2.6.3/dist/.tmp-4ulegqp7/flash_attn-2.6.3-cp311-cp311-linux_x86_64.whl
Successfully built flash_attn-2.6.3-cp311-cp311-linux_x86_64.whl
Finished creating a wheel...
Finished executing pypaBuildPhase
Running phase: pythonRuntimeDepsCheckHook
Executing pythonRuntimeDepsCheck
Checking runtime dependencies for flash_attn-2.6.3-cp311-cp311-linux_x86_64.whl
Finished executing pythonRuntimeDepsCheck
Running phase: installPhase
Executing pypaInstallPhase
Successfully installed flash_attn-2.6.3-cp311-cp311-linux_x86_64.whl
Finished executing pypaInstallPhase
Running phase: pythonOutputDistPhase
Executing pythonOutputDistPhase
Finished executing pythonOutputDistPhase
Running phase: fixupPhase
shrinking RPATHs of ELF executables and libraries in /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3
shrinking /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
checking for references to /tmp/nix-build-python3.11-flash_attn-2.6.3.drv-0/ in /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3...
patching script interpreter paths in /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3
stripping (with command strip and flags -S -p) in  /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib
shrinking RPATHs of ELF executables and libraries in /nix/store/5dry4sx5vq82y2p9daz1gj9smd2f10kd-python3.11-flash_attn-2.6.3-dist
checking for references to /tmp/nix-build-python3.11-flash_attn-2.6.3.drv-0/ in /nix/store/5dry4sx5vq82y2p9daz1gj9smd2f10kd-python3.11-flash_attn-2.6.3-dist...
patching script interpreter paths in /nix/store/5dry4sx5vq82y2p9daz1gj9smd2f10kd-python3.11-flash_attn-2.6.3-dist
Executing pythonRemoveTestsDir
Finished executing pythonRemoveTestsDir
automatically fixing dependencies for ELF files
{'append_rpaths': [],
 'extra_args': [],
 'ignore_missing': [],
 'libs': [PosixPath('/nix/store/7hnr99nxrd2aw6lghybqdmkckq60j6l9-python3-3.11.9/lib')],
 'paths': [PosixPath('/nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3'),
           PosixPath('/nix/store/5dry4sx5vq82y2p9daz1gj9smd2f10kd-python3.11-flash_attn-2.6.3-dist')],
 'recursive': True,
 'runtime_dependencies': []}
searching for dependencies of /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
    libc10.so -> not found!
    libtorch_cpu.so -> not found!
    libtorch_python.so -> not found!
    libcudart.so.12 -> not found!
    libc10_cuda.so -> not found!
    libtorch_cuda.so -> not found!
    libstdc++.so.6 -> found: /nix/store/xvzz97yk73hw03v5dhhz3j47ggwf1yq1-gcc-13.2.0-lib/lib
    libgcc_s.so.1 -> found: /nix/store/0rxb3ixzk4zaqivc9s795m0a3679wbw2-gcc-13.2.0-libgcc/lib
setting RPATH to: /nix/store/xvzz97yk73hw03v5dhhz3j47ggwf1yq1-gcc-13.2.0-lib/lib:/nix/store/0rxb3ixzk4zaqivc9s795m0a3679wbw2-gcc-13.2.0-libgcc/lib
auto-patchelf: 6 dependencies could not be satisfied
error: auto-patchelf could not satisfy dependency libc10.so wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
error: auto-patchelf could not satisfy dependency libtorch_cpu.so wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
error: auto-patchelf could not satisfy dependency libtorch_python.so wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
error: auto-patchelf could not satisfy dependency libcudart.so.12 wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
error: auto-patchelf could not satisfy dependency libc10_cuda.so wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
error: auto-patchelf could not satisfy dependency libtorch_cuda.so wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
auto-patchelf failed to find all the required dependencies.
Add the missing dependencies to --libs or use `--ignore-missing="foo.so.1 bar.so etc.so"`.
error: builder for '/nix/store/fq42lqzggc61h539mng97qhpqy3jn9a7-python3.11-flash_attn-2.6.3.drv' failed with exit code 1;
       last 10 log lines:
       > setting RPATH to: /nix/store/xvzz97yk73hw03v5dhhz3j47ggwf1yq1-gcc-13.2.0-lib/lib:/nix/store/0rxb3ixzk4zaqivc9s795m0a3679wbw2-gcc-13.2.0-libgcc/lib
       > auto-patchelf: 6 dependencies could not be satisfied
       > error: auto-patchelf could not satisfy dependency libc10.so wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > error: auto-patchelf could not satisfy dependency libtorch_cpu.so wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > error: auto-patchelf could not satisfy dependency libtorch_python.so wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > error: auto-patchelf could not satisfy dependency libcudart.so.12 wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > error: auto-patchelf could not satisfy dependency libc10_cuda.so wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > error: auto-patchelf could not satisfy dependency libtorch_cuda.so wanted by /nix/store/482rdzjspg25fch0pnx9bi7wx0iaz3p9-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > auto-patchelf failed to find all the required dependencies.
       > Add the missing dependencies to --libs or use `--ignore-missing="foo.so.1 bar.so etc.so"`.
       For full logs, run 'nix log /nix/store/fq42lqzggc61h539mng97qhpqy3jn9a7-python3.11-flash_attn-2.6.3.drv'.

Can you try whether adding torchWithCuda.lib helps?

Thanks for your reply, but no! It changed nothing; I am getting identical output:

searching for dependencies of /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
    libc10.so -> not found!
    libtorch_cpu.so -> not found!
    libtorch_python.so -> not found!
    libcudart.so.12 -> not found!
    libc10_cuda.so -> not found!
    libtorch_cuda.so -> not found!
    libstdc++.so.6 -> found: /nix/store/xvzz97yk73hw03v5dhhz3j47ggwf1yq1-gcc-13.2.0-lib/lib
    libgcc_s.so.1 -> found: /nix/store/0rxb3ixzk4zaqivc9s795m0a3679wbw2-gcc-13.2.0-libgcc/lib
setting RPATH to: /nix/store/xvzz97yk73hw03v5dhhz3j47ggwf1yq1-gcc-13.2.0-lib/lib:/nix/store/0rxb3ixzk4zaqivc9s795m0a3679wbw2-gcc-13.2.0-libgcc/lib
auto-patchelf: 6 dependencies could not be satisfied
error: auto-patchelf could not satisfy dependency libc10.so wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
error: auto-patchelf could not satisfy dependency libtorch_cpu.so wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
error: auto-patchelf could not satisfy dependency libtorch_python.so wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
error: auto-patchelf could not satisfy dependency libcudart.so.12 wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
error: auto-patchelf could not satisfy dependency libc10_cuda.so wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
error: auto-patchelf could not satisfy dependency libtorch_cuda.so wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
auto-patchelf failed to find all the required dependencies.
Add the missing dependencies to --libs or use `--ignore-missing="foo.so.1 bar.so etc.so"`.
error: builder for '/nix/store/9k81yh3wdi7pj7nm8ijfs3fmc1w7a4f4-python3.11-flash_attn-2.6.3.drv' failed with exit code 1;
       last 10 log lines:
       > setting RPATH to: /nix/store/xvzz97yk73hw03v5dhhz3j47ggwf1yq1-gcc-13.2.0-lib/lib:/nix/store/0rxb3ixzk4zaqivc9s795m0a3679wbw2-gcc-13.2.0-libgcc/lib
       > auto-patchelf: 6 dependencies could not be satisfied
       > error: auto-patchelf could not satisfy dependency libc10.so wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > error: auto-patchelf could not satisfy dependency libtorch_cpu.so wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > error: auto-patchelf could not satisfy dependency libtorch_python.so wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > error: auto-patchelf could not satisfy dependency libcudart.so.12 wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > error: auto-patchelf could not satisfy dependency libc10_cuda.so wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > error: auto-patchelf could not satisfy dependency libtorch_cuda.so wanted by /nix/store/0iw00729p030lwkz9nj2z3cm78cdxdgm-python3.11-flash_attn-2.6.3/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so
       > auto-patchelf failed to find all the required dependencies.
       > Add the missing dependencies to --libs or use `--ignore-missing="foo.so.1 bar.so etc.so"`.
       For full logs, run 'nix log /nix/store/9k81yh3wdi7pj7nm8ijfs3fmc1w7a4f4-python3.11-flash_attn-2.6.3.drv'.

I don't know how PyPI packages work, so I guess that's why I don't understand what's going on! Probably the buildPythonPackage function builds multiple sub-packages independently but fails to pass the dependencies to all of those sub-builds? I am not sure, of course!

You seem to be building from source; what do you need autoPatchelf for?

  • cudaPackages.cudatoolkit is deprecated, and it's a no-op to mix it with cuda_nvcc, cuda_cudart, etc.
  • only cuda_nvcc belongs in nativeBuildInputs; the rest belong in buildInputs
  • you don't need to list cuda_cudart etc. separately; you can instead have buildInputs = [ (getOutput "cxxdev" torch) ] (see the sketch at the end of this post)

You don't want to link nvidia_x11 directly; it's not going to work on other machines. Instead, use addDriverRunpath or autoAddDriverRunpath.

…this doesn’t exist in the sandbox

…it likely discovers it through stdenv, which buildPythonPackage uses
Note that you do not want to link libstdc++ from gcc12 if the rest of the package set uses gcc13.
Even if you use gcc12 to build the library, you want to use the newest available libstdc++ at runtime (in this case, gcc13's). This logic is also hardwired into cudaPackages.backendStdenv.
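
A hedged sketch of leaning on that (mkShell.override with a different stdenv is a common pattern; treat this as an illustration, not a vetted recipe):

{ pkgs ? import <nixpkgs> {
    config.allowUnfree = true;
    config.cudaSupport = true;
  }
}:
# backendStdenv pins a gcc that nvcc accepts (gcc 12 for this CUDA release), while
# still arranging for the package set's newest libstdc++ to be used at runtime
(pkgs.mkShell.override { stdenv = pkgs.cudaPackages.backendStdenv; }) {
  nativeBuildInputs = [ pkgs.cudaPackages.cuda_nvcc ];
}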

These belong in build-system

This is a legacy attribute which wraps an autopatchelfed prebuilt libtorch downloaded from …somewhere. You either use that, or you use the source-built python3Packages.torch.
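
Pulling these suggestions together, a minimal sketch might look like the following (same pname, version, and hash as in the original post; runtime Python dependencies such as einops and psutil are left out for brevity, so treat it as a starting point rather than a finished expression):

{ pkgs ? import <nixpkgs> {
    config.allowUnfree = true;
    config.cudaSupport = true;
  }
}:
with pkgs; with pkgs.python3.pkgs; buildPythonPackage rec {
  pname = "flash_attn";
  version = "2.6.3";
  format = "pyproject";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-W/rpUArY59KTfrzLSQbzvEZNG/Zu7dDkravVIIEce1I=";
  };

  # Python build-time tooling goes in build-system
  build-system = [ setuptools ];

  # only the compiler (and the driver-runpath hook) belong in nativeBuildInputs
  nativeBuildInputs = [
    cudaPackages.cuda_nvcc
    autoAddDriverRunpath
  ];

  # torch's cxxdev output pulls in libc10, libtorch_* and the CUDA runtime transitively
  buildInputs = [
    (lib.getOutput "cxxdev" torch)
  ];

  doCheck = false;
}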

Thank you for your reply and for sharing your deep knowledge with me.
Following your lead, it worked! Here is the working config:

{ pkgs ? import <nixpkgs> {
    config.allowUnfree = true;
    config.cudaSupport = true;
  }
}:
let
  flash-attn = with pkgs; with pkgs.python3.pkgs; buildPythonPackage rec {
    pname = "flash_attn";
    version = "2.6.3";
    format = "pyproject";

    disabled = pythonOlder "3.9";

    src = fetchPypi {
      inherit pname version;
      hash = "sha256-W/rpUArY59KTfrzLSQbzvEZNG/Zu7dDkravVIIEce1I=";
    };

    inputsEnv = python3.withPackages (p: with p; [
      psutil
      ninja
      pybind11
      einops
    ]);

    doCheck = false;

    build-system = [
      setuptools
    ];

    nativeBuildInputs = [
      git
      cudaPackages.cuda_nvcc
    ];

    buildInputs = [
      torch
      (lib.getOutput "cxxdev" torch)
      inputsEnv
    ];

    CUDA_HOME = "${cudaPackages.cudatoolkit}";
    NIX_SSL_CERT_FILE = "${cacert}/etc/ssl/certs/ca-bundle.crt";
    LD_LIBRARY_PATH = "${lib.makeSearchPath "lib" [
      addDriverRunpath.driverLink
      cudaPackages.libcublas.lib
    ]}:/usr/lib64";
    EXTRA_CCFLAGS = "-I/usr/include";
  };

in
pkgs.mkShell {
  name = "flash-attn";

  buildInputs = with pkgs; [
    flash-attn
  ];
}

Regarding /usr/lib64: I needed it because I am building inside a Docker container and nvidia-smi does not work without it. Feel free to give me your opinion.

But did you mean to put these (LD_LIBRARY_PATH, NIX_SSL_CERT_FILE, EXTRA_CCFLAGS) in the mkShell rather than in the buildPythonPackage?

Here too, cudatoolkit (which is a symlinkJoin of nvcc, cudart, and a whole bunch of other things) could probably be omitted; the variable often doesn't have any practical effect, which is why we sometimes plug a smaller package in there: nixpkgs/pkgs/development/python-modules/vllm/default.nix at 17a50f14daaf1ef506f1ef61c7f5bcbbb922c629 · NixOS/nixpkgs · GitHub
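
Something along these lines, for example (a sketch only; cuda_nvcc's dev output is one candidate for that smaller package, and the config in the next post sets CUDA_HOME the same way inside buildPythonPackage):

{ pkgs ? import <nixpkgs> {
    config.allowUnfree = true;
    config.cudaSupport = true;
  }
}:
pkgs.mkShell {
  nativeBuildInputs = [ pkgs.cudaPackages.cuda_nvcc ];
  # a much smaller closure than the cudatoolkit symlinkJoin
  CUDA_HOME = "${pkgs.lib.getDev pkgs.cudaPackages.cuda_nvcc}";
}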

Yes, I didn't need LD_LIBRARY_PATH, NIX_SSL_CERT_FILE, or EXTRA_CCFLAGS in the buildPythonPackage function, so I moved them to the mkShell. Here is the minimal config from which I couldn't remove anything else:

  flash-attn = with pkgs; with pkgs.python3.pkgs; buildPythonPackage rec {
    pname = "flash_attn";
    version = "2.6.3";
    format = "pyproject";

    disabled = pythonOlder "3.9";

    src = fetchPypi {
      inherit pname version;
      hash = "sha256-W/rpUArY59KTfrzLSQbzvEZNG/Zu7dDkravVIIEce1I=";
    };

    inputsEnv = python3.withPackages (p: with p; [
      psutil
      ninja
      einops
    ]);

    doCheck = false;

    nativeBuildInputs = [
      git
    ];

    buildInputs = [
      torch
      inputsEnv
    ];

    CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
  };

  nvidia-cutlass = with pkgs; with pkgs.python3.pkgs; buildPythonPackage rec {
    pname = "nvidia_cutlass";
    version = "3.5.0.0";
    format = "wheel";

    disabled = pythonOlder "3.9";

    src = fetchPypi rec {
      inherit pname version format;
      hash = "sha256-TsOrSCDdSuPrwsXd8jNSsNyQ93lDUoTn3OgiAGrcUfY=";
      dist = python;
      python = "py3";
      abi = "none";
      platform = "any";
    };

    doCheck = false;
  };

I also added nvidia-cutlass for your review, since apparently flash-attn depends on it at runtime! More about this after I respond to the CUDA_HOME point.

Regarding CUDA_HOME, here is the error message when I removed it:

* Getting build dependencies for wheel...
fatal: not a git repository (or any of the parent directories): .git


torch.__version__  = 2.3.0


<string>:95: UserWarning: flash_attn was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.
Traceback (most recent call last):
  File "/nix/store/rqsxqj2fqmpq5v90bdlrp5h597km6nrl-python3.11-pyproject-hooks-1.0.0/lib/python3.11/site-packages/pyproject_hooks/_in_process/_in_process.py", line 353, in <module>
    main()
  File "/nix/store/rqsxqj2fqmpq5v90bdlrp5h597km6nrl-python3.11-pyproject-hooks-1.0.0/lib/python3.11/site-packages/pyproject_hooks/_in_process/_in_process.py", line 335, in main
    json_out['return_val'] = hook(**hook_input['kwargs'])
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/rqsxqj2fqmpq5v90bdlrp5h597km6nrl-python3.11-pyproject-hooks-1.0.0/lib/python3.11/site-packages/pyproject_hooks/_in_process/_in_process.py", line 118, in get_requires_for_build_wheel
    return hook(config_settings)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/g4h9138sa5wh6kkfwc7f49q169wcs8s9-python3.11-setuptools-69.5.1/lib/python3.11/site-packages/setuptools/build_meta.py", line 325, in get_requires_for_build_wheel
    return self._get_build_requires(config_settings, requirements=['wheel'])
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/g4h9138sa5wh6kkfwc7f49q169wcs8s9-python3.11-setuptools-69.5.1/lib/python3.11/site-packages/setuptools/build_meta.py", line 295, in _get_build_requires
    self.run_setup()
  File "/nix/store/g4h9138sa5wh6kkfwc7f49q169wcs8s9-python3.11-setuptools-69.5.1/lib/python3.11/site-packages/setuptools/build_meta.py", line 487, in run_setup
    super().run_setup(setup_script=setup_script)
  File "/nix/store/g4h9138sa5wh6kkfwc7f49q169wcs8s9-python3.11-setuptools-69.5.1/lib/python3.11/site-packages/setuptools/build_meta.py", line 311, in run_setup
    exec(code, locals())
  File "<string>", line 179, in <module>
  File "/nix/store/fygk7mad2m9z388h6aafdbh3s6yx9nq3-python3.11-torch-2.3.0/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1077, in CUDAExtension
    library_dirs += library_paths(cuda=True)
                    ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/fygk7mad2m9z388h6aafdbh3s6yx9nq3-python3.11-torch-2.3.0/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1204, in library_paths
    if (not os.path.exists(_join_cuda_home(lib_dir)) and
                           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/fygk7mad2m9z388h6aafdbh3s6yx9nq3-python3.11-torch-2.3.0/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2419, in _join_cuda_home
    raise OSError('CUDA_HOME environment variable is not set. '
OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.

ERROR Backend subprocess exited when trying to invoke get_requires_for_build_wheel
error: builder for '/nix/store/akw2ia3ghjc64ld82f8y8xcrk43hiyff-python3.11-flash_attn-2.6.3.drv' failed with exit code 1;
       last 10 log lines:
       >     library_dirs += library_paths(cuda=True)
       >                     ^^^^^^^^^^^^^^^^^^^^^^^^
       >   File "/nix/store/fygk7mad2m9z388h6aafdbh3s6yx9nq3-python3.11-torch-2.3.0/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1204, in library_paths
       >     if (not os.path.exists(_join_cuda_home(lib_dir)) and
       >                            ^^^^^^^^^^^^^^^^^^^^^^^^
       >   File "/nix/store/fygk7mad2m9z388h6aafdbh3s6yx9nq3-python3.11-torch-2.3.0/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2419, in _join_cuda_home
       >     raise OSError('CUDA_HOME environment variable is not set. '
       > OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
       >
       > ERROR Backend subprocess exited when trying to invoke get_requires_for_build_wheel
       For full logs, run 'nix log /nix/store/akw2ia3ghjc64ld82f8y8xcrk43hiyff-python3.11-flash_attn-2.6.3.drv'.

However, even when I add CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; or even the cudatoolkit, I get something like the output below, yet in the end it is a successful build!

....
* Getting build dependencies for wheel...
No CUDA runtime is found, using CUDA_HOME='/nix/store/k83487p62sikklc7f5jkxf5iznwc9834-cuda_nvcc-12.2.140'
fatal: not a git repository (or any of the parent directories): .git


torch.__version__  = 2.3.0


* Building wheel...
No CUDA runtime is found, using CUDA_HOME='/nix/store/k83487p62sikklc7f5jkxf5iznwc9834-cuda_nvcc-12.2.140'
fatal: not a git repository (or any of the parent directories): .git


torch.__version__  = 2.3.0
....

Isn't it interesting? :smiley:

Even though the mkShell build was successful, I get the error below when I try to run an example in candle. I could create another topic, but I think it's related to the flash-attn packaging, so let's continue here. Feel free to ask me to create another topic, though.

This is the error:

  In file included from kernels/flash_fwd_launch_template.h:12,
                   from kernels/flash_fwd_hdim96_bf16_causal_sm80.cu:5:
  kernels/flash_fwd_kernel.h:7:10: fatal error: cute/tensor.hpp: No such file or directory
      7 | #include <cute/tensor.hpp>
        |          ^~~~~~~~~~~~~~~~~
  compilation terminated.
  # --error 0x1 --
  thread '<unnamed>' panicked at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/bindgen_cuda-0.1.5/src/lib.rs:262:21:
  nvcc error while executing compiling: "nvcc" "--gpu-architecture=sm_89" "-c" "-o" "/home/dev/src/candle/target/debug/build/candle-flash-attn-f5ea6faa04b48e83/out/flash_fwd_hdim96_bf16_causal_sm80-800cc4806a424737.o" "--default-stream" "per-thread" "-std=c++17" "-O3" "-U__CUDA_NO_HALF_OPERATORS__" "-U__CUDA_NO_HALF_CONVERSIONS__" "-U__CUDA_NO_HALF2_OPERATORS__" "-U__CUDA_NO_BFLOAT16_CONVERSIONS__" "-Icutlass/include" "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math" "--verbose" "kernels/flash_fwd_hdim96_bf16_causal_sm80.cu"

  # stdout


  # stderr

  stack backtrace:
     0:     0x55c768741d77 - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::fmt::h4886110ae7835fd2
     1:     0x55c7687706b0 - core::fmt::write::h6fab83f0f5bc0c8f
     2:     0x55c76875b695 - std::io::Write::write_fmt::h64824cb874054607
     3:     0x55c768741b44 - std::sys_common::backtrace::print::hb8fea9d041077138
     4:     0x55c7687426b7 - std::panicking::default_hook::{{closure}}::hc60f0f690c67943c
     5:     0x55c768742417 - std::panicking::default_hook::h56281bcc5d6e2f17
     6:     0x55c768742c28 - std::panicking::rust_panic_with_hook::hd61e30fe5971d187
     7:     0x55c768742b0a - std::panicking::begin_panic_handler::{{closure}}::h95d2684e28f89fd2
     8:     0x55c768741fa6 - std::sys_common::backtrace::__rust_end_short_backtrace::h48780d898316fff8
     9:     0x55c768742854 - rust_begin_unwind
    10:     0x55c768699b25 - core::panicking::panic_fmt::hdb89b5c14a056cec
    11:     0x55c7686a063f - bindgen_cuda::Builder::build_lib::{{closure}}::h0f5ac5fbc2e2eb6f
    12:     0x55c7686a0c21 - core::ops::function::impls::<impl core::ops::function::FnMut<A> for &F>::call_mut::h3a4f96cc56f04c9f
    13:     0x55c7686a86a1 - core::iter::adapters::map::map_try_fold::{{closure}}::ha286ecd462b4e4a0
    14:     0x55c76869dc96 - core::iter::traits::iterator::Iterator::try_fold::h638d4b1020136138
    15:     0x55c7686a84c4 - <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::try_fold::h1a66d852783afec3
    16:     0x55c7686a8481 - <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::try_fold::h122a04e5a7f0f99e
    17:     0x55c7686a5b75 - <core::iter::adapters::take_while::TakeWhile<I,P> as core::iter::traits::iterator::Iterator>::try_fold::hef3986c12527d0f0
    18:     0x55c7686a5b09 - <core::iter::adapters::take_while::TakeWhile<I,P> as core::iter::traits::iterator::Iterator>::fold::hc500bf6bff56dc6a
    19:     0x55c7686a8450 - <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::fold::hbb20d59983a4c8cb
    20:     0x55c7686a8676 - core::iter::traits::iterator::Iterator::for_each::he3c272d4b6e82516
    21:     0x55c7686abc3b - <rayon::iter::noop::NoopConsumer as rayon::iter::plumbing::Folder<T>>::consume_iter::h0eece8a9ce8c56bc
    22:     0x55c7686a8d1b - <rayon::iter::while_some::WhileSomeFolder<C> as rayon::iter::plumbing::Folder<core::option::Option<T>>>::consume_iter::h663e1ae817bba324
    23:     0x55c7686a54ce - <rayon::iter::map::MapFolder<C,F> as rayon::iter::plumbing::Folder<T>>::consume_iter::hfa88b26aa17ea85f
    24:     0x55c7686a53f7 - <rayon::iter::map::MapFolder<C,F> as rayon::iter::plumbing::Folder<T>>::consume_iter::ha9c205249aa00dd5
    25:     0x55c7686a3229 - rayon::iter::plumbing::Producer::fold_with::h364aeffedd2e5e9f
    26:     0x55c7686abf9a - rayon::iter::plumbing::bridge_producer_consumer::helper::hc7de9a7deafd9bb9
    27:     0x55c7686ac424 - rayon::iter::plumbing::bridge_producer_consumer::helper::{{closure}}::h8ddf99830e5c5ef0
    28:     0x55c7686a6b4c - rayon_core::join::join_context::call_b::{{closure}}::h701ee677f1f796fb
    29:     0x55c7686a0faa - rayon_core::job::JobResult<T>::call::{{closure}}::h522155878259cef6
    30:     0x55c7686a0ca0 - <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::h8ae7fc8faa307066
    31:     0x55c7686a5790 - std::panicking::try::do_call::h39b74f3c01cf9d43
    32:     0x55c7686a5a0b - __rust_try
    33:     0x55c7686a5725 - std::panicking::try::ha89a5ca847a9a1f2
    34:     0x55c76869a506 - std::panic::catch_unwind::h77bacc8ef522bdb3
    35:     0x55c7686ab6be - rayon_core::unwind::halt_unwinding::ha70d205ac30ee997
    36:     0x55c7686a0f17 - rayon_core::job::JobResult<T>::call::hd9ee7548e2d7018e
    37:     0x55c7686a1b6c - <rayon_core::job::StackJob<L,F,R> as rayon_core::job::Job>::execute::hc138b52c9ae95e54
    38:     0x55c7686fdc6b - rayon_core::job::JobRef::execute::h6b0e6694b949eed2
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/job.rs:64:9
    39:     0x55c7686ee880 - rayon_core::registry::WorkerThread::execute::hd04359f18cb7396c
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:860:9
    40:     0x55c7686ee56c - rayon_core::registry::WorkerThread::wait_until_cold::hd50be0bb0c578b36
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:794:21
    41:     0x55c7686ee2bb - rayon_core::registry::WorkerThread::wait_until::h30c6ab5ffbc03d0d
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:769:13
    42:     0x55c7686f93ea - rayon_core::join::join_recover_from_panic::h5975ba98b2b67ed4
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/join/mod.rs:186:5
    43:     0x55c7686a667a - rayon_core::join::join_context::{{closure}}::h0cca18f41d720c3d
    44:     0x55c76869bf0f - rayon_core::registry::in_worker::hc91e6b5f45c5cbb8
    45:     0x55c7686a63bd - rayon_core::join::join_context::h3a418d389af7006b
    46:     0x55c7686ac1f1 - rayon::iter::plumbing::bridge_producer_consumer::helper::hc7de9a7deafd9bb9
    47:     0x55c7686ac424 - rayon::iter::plumbing::bridge_producer_consumer::helper::{{closure}}::h8ddf99830e5c5ef0
    48:     0x55c7686a6b4c - rayon_core::join::join_context::call_b::{{closure}}::h701ee677f1f796fb
    49:     0x55c7686a0faa - rayon_core::job::JobResult<T>::call::{{closure}}::h522155878259cef6
    50:     0x55c7686a0ca0 - <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::h8ae7fc8faa307066
    51:     0x55c7686a5790 - std::panicking::try::do_call::h39b74f3c01cf9d43
    52:     0x55c7686a5a0b - __rust_try
    53:     0x55c7686a5725 - std::panicking::try::ha89a5ca847a9a1f2
    54:     0x55c76869a506 - std::panic::catch_unwind::h77bacc8ef522bdb3
    55:     0x55c7686ab6be - rayon_core::unwind::halt_unwinding::ha70d205ac30ee997
    56:     0x55c7686a0f17 - rayon_core::job::JobResult<T>::call::hd9ee7548e2d7018e
    57:     0x55c7686a1b6c - <rayon_core::job::StackJob<L,F,R> as rayon_core::job::Job>::execute::hc138b52c9ae95e54
    58:     0x55c7686fdc6b - rayon_core::job::JobRef::execute::h6b0e6694b949eed2
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/job.rs:64:9
    59:     0x55c7686ee880 - rayon_core::registry::WorkerThread::execute::hd04359f18cb7396c
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:860:9
    60:     0x55c7686ee411 - rayon_core::registry::WorkerThread::wait_until_cold::hd50be0bb0c578b36
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:786:17
    61:     0x55c7686ee2bb - rayon_core::registry::WorkerThread::wait_until::h30c6ab5ffbc03d0d
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:769:13
    62:     0x55c7686f93ea - rayon_core::join::join_recover_from_panic::h5975ba98b2b67ed4
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/join/mod.rs:186:5
    63:     0x55c7686a667a - rayon_core::join::join_context::{{closure}}::h0cca18f41d720c3d
    64:     0x55c76869bf0f - rayon_core::registry::in_worker::hc91e6b5f45c5cbb8
    65:     0x55c7686a63bd - rayon_core::join::join_context::h3a418d389af7006b
    66:     0x55c7686ac1f1 - rayon::iter::plumbing::bridge_producer_consumer::helper::hc7de9a7deafd9bb9
    67:     0x55c7686ac424 - rayon::iter::plumbing::bridge_producer_consumer::helper::{{closure}}::h8ddf99830e5c5ef0
    68:     0x55c7686a6b4c - rayon_core::join::join_context::call_b::{{closure}}::h701ee677f1f796fb
    69:     0x55c7686a0faa - rayon_core::job::JobResult<T>::call::{{closure}}::h522155878259cef6
    70:     0x55c7686a0ca0 - <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::h8ae7fc8faa307066
    71:     0x55c7686a5790 - std::panicking::try::do_call::h39b74f3c01cf9d43
    72:     0x55c7686a5a0b - __rust_try
    73:     0x55c7686a5725 - std::panicking::try::ha89a5ca847a9a1f2
    74:     0x55c76869a506 - std::panic::catch_unwind::h77bacc8ef522bdb3
    75:     0x55c7686ab6be - rayon_core::unwind::halt_unwinding::ha70d205ac30ee997
    76:     0x55c7686a0f17 - rayon_core::job::JobResult<T>::call::hd9ee7548e2d7018e
    77:     0x55c7686a1b6c - <rayon_core::job::StackJob<L,F,R> as rayon_core::job::Job>::execute::hc138b52c9ae95e54
    78:     0x55c7686fdc6b - rayon_core::job::JobRef::execute::h6b0e6694b949eed2
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/job.rs:64:9
    79:     0x55c7686ee880 - rayon_core::registry::WorkerThread::execute::hd04359f18cb7396c
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:860:9
    80:     0x55c7686ee56c - rayon_core::registry::WorkerThread::wait_until_cold::hd50be0bb0c578b36
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:794:21
    81:     0x55c7686ee2bb - rayon_core::registry::WorkerThread::wait_until::h30c6ab5ffbc03d0d
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:769:13
    82:     0x55c7686f93ea - rayon_core::join::join_recover_from_panic::h5975ba98b2b67ed4
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/join/mod.rs:186:5
    83:     0x55c7686a667a - rayon_core::join::join_context::{{closure}}::h0cca18f41d720c3d
    84:     0x55c76869bf0f - rayon_core::registry::in_worker::hc91e6b5f45c5cbb8
    85:     0x55c7686a63bd - rayon_core::join::join_context::h3a418d389af7006b
    86:     0x55c7686ac1f1 - rayon::iter::plumbing::bridge_producer_consumer::helper::hc7de9a7deafd9bb9
    87:     0x55c7686ac424 - rayon::iter::plumbing::bridge_producer_consumer::helper::{{closure}}::h8ddf99830e5c5ef0
    88:     0x55c7686a6b4c - rayon_core::join::join_context::call_b::{{closure}}::h701ee677f1f796fb
    89:     0x55c7686a0faa - rayon_core::job::JobResult<T>::call::{{closure}}::h522155878259cef6
    90:     0x55c7686a0ca0 - <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::h8ae7fc8faa307066
    91:     0x55c7686a5790 - std::panicking::try::do_call::h39b74f3c01cf9d43
    92:     0x55c7686a5a0b - __rust_try
    93:     0x55c7686a5725 - std::panicking::try::ha89a5ca847a9a1f2
    94:     0x55c76869a506 - std::panic::catch_unwind::h77bacc8ef522bdb3
    95:     0x55c7686ab6be - rayon_core::unwind::halt_unwinding::ha70d205ac30ee997
    96:     0x55c7686a0f17 - rayon_core::job::JobResult<T>::call::hd9ee7548e2d7018e
    97:     0x55c7686a1b6c - <rayon_core::job::StackJob<L,F,R> as rayon_core::job::Job>::execute::hc138b52c9ae95e54
    98:     0x55c7686fdc6b - rayon_core::job::JobRef::execute::h6b0e6694b949eed2
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/job.rs:64:9
    99:     0x55c7686ee880 - rayon_core::registry::WorkerThread::execute::hd04359f18cb7396c
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:860:9
   100:     0x55c7686ee56c - rayon_core::registry::WorkerThread::wait_until_cold::hd50be0bb0c578b36
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:794:21
   101:     0x55c7686ee30b - rayon_core::registry::WorkerThread::wait_until::h7b3fa88534f40ac1
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:769:13
   102:     0x55c7686ee69b - rayon_core::registry::WorkerThread::wait_until_out_of_work::h7a71c10a4cdf7e8c
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:818:9
   103:     0x55c7686eecde - rayon_core::registry::main_loop::hd871bd0fd46a21eb
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:923:5
   104:     0x55c7686eb516 - rayon_core::registry::ThreadBuilder::run::h22c207bcedcf4b4e
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:53:18
   105:     0x55c7686eb99d - <rayon_core::registry::DefaultSpawn as rayon_core::registry::ThreadSpawn>::spawn::{{closure}}::ha3206a4c797bd1c1
                                 at /home/dev/src/candle/.user-home/.cargo/registry/src/index.crates.io-6f17d22bba15001f/rayon-core-1.12.1/src/registry.rs:98:20
   106:     0x55c768703c56 - std::sys_common::backtrace::__rust_begin_short_backtrace::hb84e696dcba8035d
                                 at /build/rustc-1.77.2-src/library/std/src/sys_common/backtrace.rs:155:18
   107:     0x55c7686fcc0d - std::thread::Builder::spawn_unchecked_::{{closure}}::{{closure}}::h5f3b01c40cf1837d
                                 at /build/rustc-1.77.2-src/library/std/src/thread/mod.rs:529:17
   108:     0x55c7686f23f1 - <core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::h307f5d749022e654
                                 at /build/rustc-1.77.2-src/library/core/src/panic/unwind_safe.rs:272:9
   109:     0x55c7686f2a71 - std::panicking::try::do_call::h288b20ae11d1801d
                                 at /build/rustc-1.77.2-src/library/std/src/panicking.rs:554:40
   110:     0x55c7686f584b - __rust_try
   111:     0x55c7686f2782 - std::panicking::try::h051196ae5a01a73a
                                 at /build/rustc-1.77.2-src/library/std/src/panicking.rs:518:19
   112:     0x55c7686fca31 - std::panic::catch_unwind::hf5456b99701a0a1d
                                 at /build/rustc-1.77.2-src/library/std/src/panic.rs:142:14
   113:     0x55c7686fca31 - std::thread::Builder::spawn_unchecked_::{{closure}}::hc158bcb884c99419
                                 at /build/rustc-1.77.2-src/library/std/src/thread/mod.rs:528:30
   114:     0x55c7686f695f - core::ops::function::FnOnce::call_once{{vtable.shim}}::h469d5a50aab06a90
                                 at /build/rustc-1.77.2-src/library/core/src/ops/function.rs:250:5
   115:     0x55c7687331d5 - std::sys::pal::unix::thread::Thread::new::thread_start::h1b09b14190a5ce78
   116:     0x7f24fe493272 - start_thread
   117:     0x7f24fe50ec24 - __GI___clone
   118:                0x0 - <unknown>
warning: build failed, waiting for other jobs to finish...

The cute/tensor.hpp header is in the ${nvidia-cutlass}/cutlass_library/source/include directory, and I already tried

EXTRA_CCFLAGS = "-I/usr/include -I${nvidia-cutlass}/cutlass_library/source/include";

in the mkShell, but it didn't work! I mentioned that it's related to flash-attn because someone in this issue mentioned:

setup.py should automatically pull CUTLASS.

but that is not the case for my configuration above. What did I miss?

Thank you for your time!

I could run the candle example using the shell.nix file below. The trick to providing cutlass to flash-attn was using CPATH and LIBRARY_PATH. Feel free to comment.

{ pkgs ? import <nixpkgs> {
    config.allowUnfree = true;
    config.cudaSupport = true;
  }
}:
let
  userHome = "${builtins.toString ./.user-home}";

  nvidia-cutlass = with pkgs; with pkgs.python3.pkgs; buildPythonPackage rec {
    pname = "nvidia_cutlass";
    version = "3.5.0.0";
    format = "wheel";

    disabled = pythonOlder "3.9";

    src = fetchPypi rec {
      inherit pname version format;
      hash = "sha256-TsOrSCDdSuPrwsXd8jNSsNyQ93lDUoTn3OgiAGrcUfY=";
      dist = python;
      python = "py3";
      abi = "none";
      platform = "any";
    };

    doCheck = false;
  };

  flash-attn = with pkgs; with pkgs.python3.pkgs; buildPythonPackage rec {
    pname = "flash_attn";
    version = "2.6.3";
    format = "pyproject";

    disabled = pythonOlder "3.9";

    src = fetchPypi {
      inherit pname version;
      hash = "sha256-W/rpUArY59KTfrzLSQbzvEZNG/Zu7dDkravVIIEce1I=";
    };

    inputsEnv = python3.withPackages (p: with p; [
      psutil
      ninja
      einops
    ]);

    doCheck = false;

    nativeBuildInputs = [
      git
    ];

    buildInputs = [
      torch
      inputsEnv
    ];

    CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
  };

  cudatoolkit = pkgs.cudaPackages.cudatoolkit;
  cudnn = pkgs.cudaPackages.cudnn;
  nccl = pkgs.cudaPackages.nccl;
in
pkgs.mkShell rec {
  name = "candle";

  nativeBuildInputs = with pkgs; [
    git
    rustc
    cargo
    pkg-config
    cmake
    openssl
    llvm_18
    cudaPackages.cuda_nvcc
  ]
  ++ gcc12Stdenv.defaultNativeBuildInputs;

  buildInputs = with pkgs; [
    cargo-flamegraph
    cargo-release
    (lib.getOutput "cxxdev" python3.pkgs.torch)
  ]
  ++ gcc12Stdenv.defaultBuildInputs;

  RUST_BACKTRACE = "full";

  LANG = "C.UTF-8";
  LC_ALL = "C.UTF-8";

  CARGO_HOME = "${userHome}/.cargo";
  RUSTUP_HOME = "${userHome}/.rustup";
  DOCKER_BUILDKIT = "1";

  FLASH_ATTENTION_PATH = "${flash-attn}";
  CUTLASS_PATH = "${nvidia-cutlass}";
  CUDA_ARCH = "sm_89";  # Adjust this according to your GPU architecture

  CUDA_TOOLKIT_ROOT_DIR = "${cudatoolkit}";

  shellHook = ''
    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${with pkgs; lib.makeSearchPath "lib" [
      addDriverRunpath.driverLink
      cudaPackages.libcublas.lib
    ]}:/usr/lib64:${cudatoolkit}/lib:${cudnn}/lib:${nccl}/lib";
    export LIBRARY_PATH=$LIBRARY_PATH:${cudatoolkit}/lib:${cudnn}/lib:${nccl}/lib

    export CUDA_PATH=${cudatoolkit}
    export CUDNN_PATH=${cudnn}
    export NCCL_PATH=${nccl}

    export CPATH=$CPATH:$CUTLASS_PATH/lib/python3.11/site-packages/cutlass_library/source/include
    export LIBRARY_PATH=$LIBRARY_PATH:$CUTLASS_PATH/lib/python3.11/site-packages/cutlass_library
  '';
}