I have AMD RX 5500 and wanted to use it for casual ML. So, here is the shell I made. To use that open shell and type nix-shell
then create a python virtual environment virtualenv .venv
and install the lib pip install tensorflow-rocm
. It should pass the following test
import tensorflow as tf
from keras import backend as K
print(tf.config.list_physical_devices('GPU'))
But it does not work when I try to actually run it for ML. I see the following errors in the logs
kernel: amdgpu: HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out
kernel: amdgpu: Pasid 0x8016 DQM create queue type 0 failed. ret -62
kernel: amdgpu: HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out
kernel: amdgpu: Failed to evict process queues
kernel: amdgpu: Failed to quiesce KFD
kernel: amdgpu: HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out
kernel: amdgpu: Didn't find vmid for pasid 0x8016
I think it is because my GPU is not supported (but why does CUDA work on all nvidia GPUs?) or I force GFX version by setting HSA_OVERRIDE_GFX_VERSION
env variable to an incorrect version.
Here is pip list
output for historical purposes.
Package Version
---------------------------- ----------
absl-py 1.4.0
astunparse 1.6.3
cachetools 5.3.1
certifi 2023.7.22
charset-normalizer 3.2.0
flatbuffers 23.5.26
gast 0.4.0
google-auth 2.23.0
google-auth-oauthlib 1.0.0
google-pasta 0.2.0
grpcio 1.58.0
h5py 3.9.0
idna 3.4
jax 0.4.14
keras 2.12.0
libclang 16.0.6
Markdown 3.4.4
MarkupSafe 2.1.3
ml-dtypes 0.2.0
numpy 1.23.5
oauthlib 3.2.2
opt-einsum 3.3.0
packaging 23.1
pip 23.2.1
protobuf 4.24.3
pyasn1 0.5.0
pyasn1-modules 0.3.0
requests 2.31.0
requests-oauthlib 1.3.1
rsa 4.9
scipy 1.11.2
setuptools 68.1.2
six 1.16.0
tensorboard 2.12.3
tensorboard-data-server 0.7.1
tensorflow-estimator 2.12.0
tensorflow-io-gcs-filesystem 0.34.0
tensorflow-rocm 2.12.0.560
termcolor 2.3.0
typing_extensions 4.7.1
urllib3 1.26.16
Werkzeug 2.3.7
wheel 0.41.2
wrapt 1.14.1
And the package file.
let
pkgs = import <nixpkgs> {};
inherit (pkgs) stdenv;
amdgpuVersions = {
gfx1030 = "10.3.0";
gfx900 = "9.0.0";
gfx906 = "9.0.6";
gfx908 = "9.0.8";
gfx90a = "9.0.a";
};
libs = pkgs.lib.makeLibraryPath (builtins.attrValues {
inherit (pkgs.llvmPackages_rocm) libunwind;
inherit
(pkgs)
rocm-runtime
rocm-opencl-runtime
rocm-comgr
rocm-smi
miopengemm
rocblas
ncurses
sqlite
libelf
libdrm
numactl
rocrand
hipfft
miopen
hip
rccl
;
inherit (stdenv.cc.cc) lib;
});
python = pkgs.python310;
in
stdenv.mkDerivation {
name = "dev-env";
env = {
LD_LIBRARY_PATH = libs;
CUDA_PATH = pkgs.cudaPackages.cudatoolkit;
CUDNN_PATH = pkgs.cudaPackages.cudnn;
OCL_ICD_VENDORS = "${pkgs.rocm-opencl-icd}/etc/OpenCL/vendors/";
HSA_OVERRIDE_GFX_VERSION = amdgpuVersions.gfx908;
};
buildInputs = builtins.attrValues {
python = python.withPackages (ps:
builtins.attrValues {
inherit
(ps)
virtualenv
;
});
};
shellHook = ''
. ./.venv/bin/activate
'';
}
Maybe all that information will inspire someone, or the complete solution already exists, and I do not know about it .
At last, does anyone know the error?
last 10 log lines:
> Sourcing python-namespaces-hook
> Sourcing python-catch-conflicts-hook.sh
> unpacking sources
> unpacking source archive /nix/store/jy636qwb5bzyag808miwzcm8v1gac7n4-source
> source root is source
> setting SOURCE_DATE_EPOCH to timestamp 315619200 of file source/tools/tf_env_collect.sh
> patching sources
> configuring
> configure flags: --prefix=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0 --bindir=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0/bin --sbindir=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0/sbin --includedir=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0/include --oldincludedir=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0/include --mandir=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0/share/man --infodir=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0/share/info --docdir=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0/share/doc/python3.10-tensorflow-rocm --libdir=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0/lib --libexecdir=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0/libexec --localedir=/nix/store/h76wdymkn3scp1swral7h85dkkqx9kn3-python3.10-tensorflow-rocm-2.13.0/share/locale
> /nix/store/vfdg65hiv4bwls48588msw8la7452w2q-stdenv-linux/setup: line 1299: ./configure: cannot execute: required file not found
It happens when I try to compile from source
let
pkgs = import <nixpkgs> {};
inherit (pkgs) stdenv;
python = pkgs.python310;
in
stdenv.mkDerivation {
name = "dev-env";
buildInputs = builtins.attrValues {
python = python.withPackages (ps:
builtins.attrValues {
inherit
(ps)
virtualenv
;
tensorflow-rocm = ps.buildPythonPackage rec {
pname = "tensorflow-rocm";
version = "2.13.0";
src = pkgs.fetchFromGitHub {
owner = "ROCmSoftwarePlatform";
repo = "tensorflow-upstream";
rev = "v${version}";
sha256 = "sha256-Rq5pAVmxlWBVnph20fkAwbfy+iuBNlfFy14poDPd5h0=";
};
doCheck = false;
nativeBuildInputs = [
python
pkgs.bazel
];
};
});
};
}