I just upgraded to NixOS 24.05, and it seems that setting the option services.ollama.acceleration = 'rocm';
causes the rebuild to fail with the following error:
➜ nixos-setup git:(main) ✗ sudo nixos-rebuild switch --flake '.#'
building the system configuration...
[1/0/18 built, 0.0 MiB DL] building rocblas-6.0.2 (buildPhase): Reading logic files: Launching 12 threads for 765 tasks...
[1/0/18 built, 0.0 MiB DL] building rocblas-6.0.2 (buildPhase): Reading logic files: Launching 12 threads for 765 tasks...
[1/0/18 built, 0.0 MiB DL] building rocblas-6.0.2 (buildPhase): Custom kernel filename /build/source/build/library/src/build_tmp/TENSILE/
[1/0/18 built, 0.0 MiB DL] building rocblas-6.0.2 (buildPhase): Custom kernel filename /build/source/build/library/src/build_tmp/TENSILE/
[1/0/18 built, 0.0 MiB DL] building rocblas-6.0.2 (buildPhase): OSError: [Errno 28] No space left on device
error: builder for '/nix/store/7phv832r2c0sixvnbcvmhc3913b9s4kj-rocblas-6.0.2.drv' failed with exit code 2;
last 10 log lines:
> error_job.get_result(self.timeout)
> File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/parallel.py", line 745, in get_result
> return self._return_or_raise()
> ^^^^^^^^^^^^^^^^^^^^^^^
> File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/parallel.py", line 763, in _return_or_raise
> raise self._result
> OSError: [Errno 28] No space left on device
> make[2]: *** [library/src/CMakeFiles/TENSILE_LIBRARY_TARGET.dir/build.make:1635: Tensile/library/TensileLibrary_Type_HS_HPA_Contraction_l_Ailk_Bljk_Cijk_Dijk_gfx906.dat] Error 1
> make[1]: *** [CMakeFiles/Makefile2:230: library/src/CMakeFiles/TENSILE_LIBRARY_TARGET.dir/all] Error 2
> make: *** [Makefile:156: all] Error 2
For full logs, run 'nix log /nix/store/7phv832r2c0sixvnbcvmhc3913b9s4kj-rocblas-6.0.2.drv'.
note: build failure may have been caused by lack of free disk space
error: 1 dependencies of derivation '/nix/store/f6ayxd7fa3lpc5ljghgn5v9j7zg0zyhy-ollama-0.1.38.drv' failed to build
error: 1 dependencies of derivation '/nix/store/mx7kx4ln55p0bsg3qpw171bnngfqcr1f-system-path.drv' failed to build
error: 1 dependencies of derivation '/nix/store/mzbc5bqgdlly1dcf1lyylqihwvwjgswx-unit-ollama.service.drv' failed to build
error: 1 dependencies of derivation '/nix/store/wa059scc96jr5i52lskv6gyp12wn2d0g-nixos-system-fwbook-24.05.20240612.cc54fb4.drv' failed to build
However, I confirmed that disk-space usage stayed under 30% and inode usage stayed under 10% for the entire duration of the build.
The following are the last few lines of output from `nix log /nix/store/7phv832r2c0sixvnbcvmhc3913b9s4kj-rocblas-6.0.2.drv`:
0x000000000226073e llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) + 62\n1 clang++ 0x000000000225e4dc llvm::sys::CleanupOnSignal(unsigned long) + 140\n2 clang++ 0x00000000021b7453 llvm::CrashRecoveryContext::HandleExit(int) + 83\n3 clang++ 0x000000000225516e llvm::sys::Process::Exit(int, bool) + 30\n4 clang++ 0x0000000000adcc9e\n5 clang++ 0x00000000021c1adf llvm::report_fatal_error(llvm::Twine const&, bool) + 111\n6 clang++ 0x0000000002233c58 llvm::raw_fd_ostream::~raw_fd_ostream() + 904\n7 clang++ 0x0000000000ae1f51\n8 clang++ 0x0000000000ae4202 cc1as_main(llvm::ArrayRef<char const*>, char const*, void*) + 3794\n9 clang++ 0x0000000000ad6232\n10 clang++ 0x0000000002a0bff9\n11 clang++ 0x00000000021b7353 llvm::CrashRecoveryContext::RunSafely(llvm::function_ref<void ()>) + 35\n12 clang++ 0x0000000002a0c3a1\n13 clang++ 0x00000000029ce984 clang::driver::Compilation::ExecuteCommand(clang::driver::Command const&, clang::driver::Command const*&, bool) const + 164\n14 clang++ 0x00000000029cf3a3\n15 clang++ 0x00000000029d4863 clang::driver::Compilation::ExecuteJobs(clang::driver::JobList const&, llvm::SmallVectorImpl<std::pair<int, clang::driver::Command const*>>&, bool) const + 10195\n16 clang++ 0x00000000029e44ac clang::driver::Driver::ExecuteCompilation(clang::driver::Compilation&, llvm::SmallVectorImpl<std::pair<int, clang::driver::Command const*>>&) + 316\n17 clang++ 0x0000000000ad85c6 clang_main(int, char**, llvm::ToolContext const&) + 8774\n18 clang++ 0x0000000000a1762f main + 47\n19 libc.so.6 0x00007ffff7a1c10e\n20 libc.so.6 0x00007ffff7a1c1c9 __libc_start_main + 137\n21 clang++ 0x0000000000ad1725 _start + 37\n'
Command:
/nix/store/4kh0a4njzcl36arivn752va8s00rfd47-rocm-llvm-clang-wrapper-6.0.2/bin/clang++ -x assembler -target amdgcn-amd-amdhsa -mcode-object-version=4 -mcpu=gfx90a -mwavefrontsize64 -c -o /build/source/build/library/src/build_tmp/TENSILE/assembly/Cijk_Alik_Bljk_HHS_BHR_MT256x256x32_MI32x32x8x1_NqmbQyFCc9KWIshiwZjffx2LzYdFZq7Mf9kyLR17Pqo=.o /build/source/build/library/src/build_tmp/TENSILE/assembly/Cijk_Alik_Bljk_HHS_BHR_MT256x256x32_MI32x32x8x1_NqmbQyFCc9KWIshiwZjffx2LzYdFZq7Mf9kyLR17Pqo=.s
returned non-zero exit status 1
joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
r = call_item()
^^^^^^^^^^^
File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__
return self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/parallel.py", line 598, in __call__
return [func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/parallel.py", line 598, in <listcomp>
return [func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/1wj1wgs0jfh66qg1h10w8nzqa40fki1i-python3.11-tensile-6.0.2/lib/python3.11/site-packages/Tensile/Parallel.py", line 53, in pcallWithGlobalParamsMultiArg
return f(*args)
^^^^^^^^
File "/nix/store/1wj1wgs0jfh66qg1h10w8nzqa40fki1i-python3.11-tensile-6.0.2/lib/python3.11/site-packages/Tensile/TensileCreateLibrary.py", line 67, in processKernelSource
(err, src) = kernelWriter.getSourceFileString(kernel)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/1wj1wgs0jfh66qg1h10w8nzqa40fki1i-python3.11-tensile-6.0.2/lib/python3.11/site-packages/Tensile/KernelWriter.py", line 5386, in getSourceFileString
self.getSingleCodeObjectFile(kernel)
File "/nix/store/1wj1wgs0jfh66qg1h10w8nzqa40fki1i-python3.11-tensile-6.0.2/lib/python3.11/site-packages/Tensile/KernelWriter.py", line 5317, in getSingleCodeObjectFile
objectFileName = self.getAssembledKernelObjectFile(kernel)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/1wj1wgs0jfh66qg1h10w8nzqa40fki1i-python3.11-tensile-6.0.2/lib/python3.11/site-packages/Tensile/KernelWriter.py", line 5297, in getAssembledKernelObjectFile
assemblyFileName = self.getKernelObjectAssemblyFile(kernel)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/1wj1wgs0jfh66qg1h10w8nzqa40fki1i-python3.11-tensile-6.0.2/lib/python3.11/site-packages/Tensile/KernelWriter.py", line 5292, in getKernelObjectAssemblyFile
assemblyFile.write(kernelSource)
OSError: [Errno 28] No space left on device
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/nix/store/1wj1wgs0jfh66qg1h10w8nzqa40fki1i-python3.11-tensile-6.0.2/lib/python3.11/site-packages/Tensile/bin/TensileCreateLibrary", line 43, in <module>
TensileCreateLibrary()
File "/nix/store/1wj1wgs0jfh66qg1h10w8nzqa40fki1i-python3.11-tensile-6.0.2/lib/python3.11/site-packages/Tensile/TensileCreateLibrary.py", line 1366, in TensileCreateLibrary
codeObjectFiles = writeSolutionsAndKernels(outputPath, CxxCompiler, None, solutions,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/1wj1wgs0jfh66qg1h10w8nzqa40fki1i-python3.11-tensile-6.0.2/lib/python3.11/site-packages/Tensile/TensileCreateLibrary.py", line 524, in writeSolutionsAndKernels
results = Common.ParallelMap(processKernelSource, kIter, "Generating kernels")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/1wj1wgs0jfh66qg1h10w8nzqa40fki1i-python3.11-tensile-6.0.2/lib/python3.11/site-packages/Tensile/Parallel.py", line 87, in ParallelMap
rv = Parallel(n_jobs=threadCount)(delayed(pcall)(function, a, params) for a, params in pargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/parallel.py", line 2007, in __call__
return output if self.return_generator else list(output)
^^^^^^^^^^^^
File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/parallel.py", line 1650, in _get_outputs
yield from self._retrieve()
File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/parallel.py", line 1754, in _retrieve
self._raise_error_fast()
File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/parallel.py", line 1789, in _raise_error_fast
error_job.get_result(self.timeout)
File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/parallel.py", line 745, in get_result
return self._return_or_raise()
^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/raw069rzbym4851fb09r1m4ps02gjgy3-python3.11-joblib-1.4.0/lib/python3.11/site-packages/joblib/parallel.py", line 763, in _return_or_raise
raise self._result
OSError: [Errno 28] No space left on device
make[2]: *** [library/src/CMakeFiles/TENSILE_LIBRARY_TARGET.dir/build.make:1635: Tensile/library/TensileLibrary_Type_HS_HPA_Contraction_l_Ailk_Bljk_Cijk_Dijk_gfx906.dat] Error 1
make[1]: *** [CMakeFiles/Makefile2:230: library/src/CMakeFiles/TENSILE_LIBRARY_TARGET.dir/all] Error 2
make: *** [Makefile:156: all] Error 2