Nix development environment for Python AI / Llama 3.1

Hello everybody! These past few days I’ve been struggling to get a Nix shell that can run
Llama 3.1 on my NVIDIA GPU. It always ends up with the wrong package versions, or straight up fails to compile.

If anyone can share a similar config, or a different method of installing python packages, or some advice on what I generally should do, please do!

inference.py for running AI (it deviated slightly over the commits)
import transformers
import torch
from transformers import pipeline, AutoTokenizer

# Define the path to the model and the tokenizer
model_path = "./models/Meta-Llama-3.1-8B-Instruct_quantized/"
# Define rope_scaling with the required fields only
rope_scaling = {"type": "llama3", "factor": 8.0}

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the pipeline with the model and tokenizer
text_generator = pipeline(
    "text-generation",
    model=model_path,
    tokenizer=AutoTokenizer.from_pretrained(model_path),
    model_kwargs={"torch_dtype": torch.bfloat16, "rope_scaling": rope_scaling},
    device=device
)


# Infinite loop to take user input and generate responses
try:
    while True:
        prompt = input("Enter a prompt: ")
        if prompt.lower() == 'exit':
            break
        outputs = text_generator(prompt, max_new_tokens=256, do_sample=False)
        print(outputs[0]['generated_text'])
except KeyboardInterrupt:
    print("Program terminated.")

My attempts:

(shell.nix) Naive attempt with torchWithCuda
{ pkgs ? import (fetchTarball {
    url = "https://github.com/NixOS/nixpkgs/archive/nixos-24.05.tar.gz";
    # Placeholder hash: run it once so Nix complains and prints the real hash
    sha256 = "0rqkpdipwq1ld352sg7h2a1zc1xg3rj5ay6dlr337cysj4xsgn7b";
  }) { config.allowUnfree = true; },

  unstable ? import (fetchTarball {
    url = "https://github.com/NixOS/nixpkgs/archive/nixos-unstable.tar.gz";
    sha256 = "0rqkpdipwq1ld352sg7h2a1zc1xg3rj5ay6dlr337cysj4xsgn7b";
  }) { config.allowUnfree = true; } }:
pkgs.mkShell {
buildInputs = with pkgs; [
  # 3.12 python
  python3
  python312Packages.pip
  python312Packages.virtualenv
  unstable.python312Packages.transformers
  python312Packages.torchWithCuda
  python312Packages.accelerate
];

shellHook = ''
  # Set up the virtual environment (optional)
  if [ ! -d "venv" ]; then
    python -m venv venv
    echo "Virtual environment created."
  fi
  source venv/bin/activate
  echo "Virtual environment activated."
'';
}

Problem: it takes hours to compile

(flake.nix) Flake with cachix
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
      # matches the cuda-maintainers key below
      "https://cuda-maintainers.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
  };

  outputs = { self, nixpkgs }: 
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
      config.cudaSupport = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python311
        python311Packages.torch-bin
        python311Packages.datasets
        python311Packages.transformers
        python311Packages.evaluate
        python311Packages.accelerate
        python311Packages.pip
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
        export CUDA_PATH=${pkgs.cudatoolkit}
        echo $CUDA_PATH
        # Set up the virtual environment (optional)
        if [ ! -d "venv" ]; then
          python -m venv venv
          echo "Virtual environment created."
        fi
        source venv/bin/activate
        echo "Virtual environment activated."
      '';
    };
  };
}

flake.lock:

{
"nodes": {
  "nixpkgs": {
    "locked": {
      "lastModified": 1731386116,
      "narHash": "sha256-lKA770aUmjPHdTaJWnP3yQ9OI1TigenUqVC3wweqZuI=",
      "owner": "nixos",
      "repo": "nixpkgs",
      "rev": "689fed12a013f56d4c4d3f612489634267d86529",
      "type": "github"
    },
    "original": {
      "owner": "nixos",
      "ref": "nixos-24.05",
      "repo": "nixpkgs",
      "type": "github"
    }
  },
  "root": {
    "inputs": {
      "nixpkgs": "nixpkgs"
    }
  }
},
"root": "root",
"version": 7
}

Bash error:

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 9, in <module>
  model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/mk8jls52jry9pkwhcwdxdwg16z9374x1-python3.11-transformers-4.41.0/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 523, in from_pretrained
  config, kwargs = AutoConfig.from_pretrained(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/mk8jls52jry9pkwhcwdxdwg16z9374x1-python3.11-transformers-4.41.0/lib/python3.11/site-packages/transformers/models/auto/configuration_auto.py", line 958, in from_pretrained
  return config_class.from_dict(config_dict, **unused_kwargs)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/mk8jls52jry9pkwhcwdxdwg16z9374x1-python3.11-transformers-4.41.0/lib/python3.11/site-packages/transformers/configuration_utils.py", line 768, in from_dict
  config = cls(**config_dict)
           ^^^^^^^^^^^^^^^^^^
File "/nix/store/mk8jls52jry9pkwhcwdxdwg16z9374x1-python3.11-transformers-4.41.0/lib/python3.11/site-packages/transformers/models/llama/configuration_llama.py", line 161, in __init__
  self._rope_scaling_validation()
File "/nix/store/mk8jls52jry9pkwhcwdxdwg16z9374x1-python3.11-transformers-4.41.0/lib/python3.11/site-packages/transformers/models/llama/configuration_llama.py", line 182, in _rope_scaling_validation
  raise ValueError(
ValueError: `rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}

(flake.nix) Updated transformers

Comes from this link: meta-llama/Llama-3.1-8B-Instruct · ValueError: `rope_scaling` must be a dictionary with two fields

{
description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

# Cachix cache so you don't have to build cuda for 100 hours
nixConfig = {
  extra-substituters = [
    "https://nix-community.cachix.org"
  ];
  extra-trusted-public-keys = [
    "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
    "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
  ];
};

inputs = {
  nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
  # Latest nix commit at the time
  nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
};

outputs = { self, nixpkgs, nixpkgs-unstable }: 
let
  system = "x86_64-linux";
  pkgs = import nixpkgs {
    system = system;
    config.allowUnfree = true;
  };
  unstable = import nixpkgs-unstable {
    system = system;
    config.allowUnfree = true;
  };
in {
  # Make the shell
  devShell.${system} = pkgs.mkShell {
    packages = with pkgs; [
      python311
      python311Packages.torch-bin
      python311Packages.datasets
      unstable.python311Packages.transformers
      python311Packages.evaluate
      python311Packages.accelerate
      cudatoolkit
      linuxPackages.nvidia_x11
    ];

    shellHook = ''
      echo "You are now using a NIX environment"
    '';
  };
};
}

Bash error:

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 1, in <module>
  import transformers
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/__init__.py", line 26, in <module>
  from . import dependency_versions_check
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/dependency_versions_check.py", line 57, in <module>
  require_version_core(deps[pkg])
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 117, in require_version_core
  return require_version(requirement, hint)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 111, in require_version
  _compare_versions(op, got_ver, want_ver, requirement, pkg, hint)
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 44, in _compare_versions
  raise ImportError(
ImportError: huggingface-hub>=0.23.2,<1.0 is required for a normal functioning of this module, but found huggingface-hub==0.23.0.
Try: `pip install transformers -U` or `pip install -e '.[dev]'` if you're working with git main

(flake.nix) Force add huggingface-hub as package
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable }: 
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python311
        python311Packages.torch-bin
        python311Packages.datasets
        unstable.python311Packages.transformers
        python311Packages.huggingface-hub
        python311Packages.evaluate
        python311Packages.accelerate
        python311Packages.pip
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

Bash error (the exact same one as last time; the package version didn’t even change):

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 1, in <module>
  import transformers
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/__init__.py", line 26, in <module>
  from . import dependency_versions_check
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/dependency_versions_check.py", line 57, in <module>
  require_version_core(deps[pkg])
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 117, in require_version_core
  return require_version(requirement, hint)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 111, in require_version
  _compare_versions(op, got_ver, want_ver, requirement, pkg, hint)
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 44, in _compare_versions
  raise ImportError(
ImportError: huggingface-hub>=0.23.2,<1.0 is required for a normal functioning of this module, but found huggingface-hub==0.23.0.
Try: `pip install transformers -U` or `pip install -e '.[dev]'` if you're working with git main
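
In hindsight, I think the version clash comes from mixing two nixpkgs evaluations: transformers is taken from unstable, but the huggingface-hub it sees on the path is the 0.23.0 from stable's python311Packages, and adding stable's huggingface-hub explicitly can't change that. A sketch of what I believe would keep the versions consistent, building the whole Python environment from the unstable set in one withPackages call (untested, and the package names are assumed to exist in that pin):

      packages = [
        # One consistent Python environment, taken entirely from unstable
        (unstable.python311.withPackages (ps: with ps; [
          torch-bin
          transformers
          huggingface-hub
          datasets
          evaluate
          accelerate
          pip
        ]))
        pkgs.cudatoolkit
        pkgs.linuxPackages.nvidia_x11
      ];
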
(shell.nix) Tried shell.nix again, pinned specific packages
{ pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/0bce9e80c6828de1c0af63bc96ca2059b0652a16.tar.gz") {}
}:
let
  pythonEnv = pkgs.python311.withPackages (pythonPackages: with pythonPackages; [
    (pythonPackages.buildPythonPackage rec {
      pname = "transformers";
      version = "4.35.0";

      src = pkgs.fetchPypi {
        inherit pname version;
        sha256 = "sha256-5LQXY/ZRKC/JeTSNOqFIJEOH3ckWX0sYRVeYx3CuI7k=";
      };

      doCheck = false; # Skip tests for faster builds
    })
    (pythonPackages.buildPythonPackage rec {
      pname = "torch";
      version = "2.5.1";

      src = pkgs.fetchurl {
        url = "https://files.pythonhosted.org/packages/69/72/20cb30f3b39a9face296491a86adb6ff8f1a47a897e4d14667e6cf89d5c3/torch-${version}-cp313-cp313-manylinux1_x86_64.whl";
        sha256 = "sha256-m2Ht87T247DgrdqLOWAma5AJ0Cs3VVlx9NHI96Ba/tc="; # Replace with the correct hash
      };

      format = "wheel";
      doCheck = false;
    })

    (pythonPackages.buildPythonPackage rec {
      pname = "accelerate";
      version = "0.22.0";
      src = pkgs.fetchPypi {
        inherit pname version;
        sha256 = "sha256-KwqD480HyJRIxdWpT3K8HbmNXgxJjKF5hIcfAdv4Mkc=";
      };
      doCheck = false;
    })

    pip
  ]);
in
pkgs.mkShell {
  name = "custom-package-version-shell";

  buildInputs = [    
    pythonEnv
  ];
  shellHook = ''
    echo "You are now using a NIX environment"
    # Set up the virtual environment (optional)
    if [ ! -d "venv" ]; then
      python -m venv venv
      echo "Virtual environment created."
    fi
    source venv/bin/activate
    echo "Virtual environment activated."
  '';
}

Bash error (the packages weren’t installed):

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 1, in <module>
  import transformers
ModuleNotFoundError: No module named 'transformers'
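
Looking back at this one, I suspect the packages did build, but the shellHook then activated a plain venv, which hides the Nix-provided site-packages (and the pinned torch is a cp313 wheel on a python311 interpreter, so it couldn't have been imported either). If the venv is really wanted, a sketch of a shellHook that keeps the Nix packages visible (untested):

  shellHook = ''
    # A venv created without --system-site-packages hides the Nix-provided
    # Python packages, which is (I believe) why transformers "disappeared"
    if [ ! -d "venv" ]; then
      python -m venv --system-site-packages venv
      echo "Virtual environment created."
    fi
    source venv/bin/activate
  '';
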
(flake.nix) Maybe pinning packages will work in a flake?
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable }: 
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        /*
        (pkgs.python311.withPackages (ps: with ps; [
          torch
          accelerate
          pip
        ]))
        */
        python311
        python311Packages.torch-bin
        python311Packages.datasets
        unstable.python311Packages.transformers
        (pkgs.python311Packages.buildPythonPackage rec {
          pname = "huggingface-hub";
          version = "0.26.2";
          pyproject = true;

          src = fetchFromGitHub {
            owner = "huggingface";
            repo = "huggingface_hub";
            rev = "refs/tags/v${version}";
            hash = "sha256-F2E8P0Hq3Ee+RXUEN4t2JtfBtK36aMsHQCnid9VWdLk=";
          };

          build-system = with pkgs; [ setuptools ];

          dependencies = with pkgs; [
            filelock
            fsspec
            packaging
            pyyaml
            requests
            tqdm
            typing-extensions
          ];

          # Tests require network access.
          doCheck = false;

          pythonImportsCheck = [ "huggingface_hub" ];
        })
        python311Packages.evaluate
        python311Packages.accelerate
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

Bash error:

     at /nix/store/0gil5b8wly5pg4yw42iy1vzh3d25c62z-source/flake.nix:58:39:

         57|
         58|           build-system = with pkgs; [ setuptools ];
           |                                       ^
         59|

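I think that eval error is simply because setuptools and the runtime dependencies live in pkgs.python311Packages, not in pkgs, so `with pkgs; [ setuptools ]` hits a missing attribute. A sketch of how I believe those two attributes should look (untested):

          # Take build tooling and dependencies from the Python package set
          build-system = with pkgs.python311Packages; [ setuptools ];

          dependencies = with pkgs.python311Packages; [
            filelock
            fsspec
            packaging
            pyyaml
            requests
            tqdm
            typing-extensions
          ];
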
Have you tried using llama.cpp? It is packaged in Nixpkgs and works well on both NVIDIA and AMD; I use it daily.

See also python312Packages.llama-cpp-python: init at 0.3.1 by kirillrdy · Pull Request #349657 · NixOS/nixpkgs · GitHub for a Python API.

Thank you @sepal for responding! With your suggestion, I eventually found a (not so perfect) solution, but the only thing that matters is that it works. I’ve laid out my other attempts in hopes that someone can eventually fix them.

I’ve also realized that in the last post I didn’t include any info about my GPU, so here it is:

Nvidia Graphics Card Info

Nix config:

{ config, lib, pkgs, ...}:
{
  services.xserver.videoDrivers = ["nvidia"];

  hardware.nvidia = {
    open = false;
    nvidiaSettings = true;

    modesetting.enable = true;
    powerManagement.enable = false;
    powerManagement.finegrained = false;
  };

  hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.beta;
}

Output from nvidia-smi

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.78                 Driver Version: 550.78         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 3080        Off |   00000000:01:00.0  On |                  N/A |
| 53%   34C    P8             36W /  320W |     925MiB /  10240MiB |     30%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A      1316      G   ...810w2l99l-xorg-server-21.1.14/bin/X        126MiB |
|    0   N/A  N/A      2330      G   ...irefox-132.0.2/bin/.firefox-wrapped        187MiB |
|    0   N/A  N/A    186414      G   ...810w2l99l-xorg-server-21.1.14/bin/X        301MiB |
|    0   N/A  N/A    186834      G   /run/current-system/sw/bin/cinnamon            66MiB |
|    0   N/A  N/A    187232      G   /run/current-system/sw/bin/kitty                6MiB |
|    0   N/A  N/A    187854      G   ...irefox-132.0.2/bin/.firefox-wrapped        137MiB |
+-----------------------------------------------------------------------------------------+
The working configuration (command-line llama-cpp)

The flake.nix I used:
{
description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

# Cachix cache so you don't have to build cuda for 100 hours
nixConfig = {
  extra-substituters = [
    "https://nix-community.cachix.org"
  ];
  extra-trusted-public-keys = [
    "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
    "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
  ];
};

inputs = {
  nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
  nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
};

outputs = { self, nixpkgs, nixpkgs-unstable }: 
let
  system = "x86_64-linux";
  pkgs = import nixpkgs {
    system = system;
    config.allowUnfree = true;
  };
  unstable = import nixpkgs-unstable {
    system = system;
    config.allowUnfree = true;
  };
in {
  # Make the shell
  devShell.${system} = pkgs.mkShell {
    packages = with pkgs; [
      python312
      llama-cpp
      python312Packages.torch
      python312Packages.gguf
    ];

    shellHook = ''
      echo "You are now using a NIX environment"
    '';
  };
};
}

The flake.lock I used:

{
"nodes": {
  "nixpkgs": {
    "locked": {
      "lastModified": 1731797254,
      "narHash": "sha256-df3dJApLPhd11AlueuoN0Q4fHo/hagP75LlM5K1sz9g=",
      "owner": "nixos",
      "repo": "nixpkgs",
      "rev": "e8c38b73aeb218e27163376a2d617e61a2ad9b59",
      "type": "github"
    },
    "original": {
      "owner": "nixos",
      "ref": "nixos-24.05",
      "repo": "nixpkgs",
      "type": "github"
    }
  },
  "nixpkgs-unstable": {
    "locked": {
      "lastModified": 1730135610,
      "narHash": "sha256-i6EAc+Z/yVGk3HyXsIfu4yuv1l7KCaima3tNk9sxx18=",
      "owner": "nixos",
      "repo": "nixpkgs",
      "rev": "0bce9e80c6828de1c0af63bc96ca2059b0652a16",
      "type": "github"
    },
    "original": {
      "owner": "nixos",
      "repo": "nixpkgs",
      "rev": "0bce9e80c6828de1c0af63bc96ca2059b0652a16",
      "type": "github"
    }
  },
  "root": {
    "inputs": {
      "nixpkgs": "nixpkgs",
      "nixpkgs-unstable": "nixpkgs-unstable"
    }
  }
},
"root": "root",
"version": 7
}

Download the Meta-Llama-3-8B.Q2_K.gguf from this link

Then put the downloaded .gguf in your ./models directory

Then run this command to start a conversation:

llama -m models/Meta-Llama-3-8B.Q2_K.gguf --conversation

My sample conversation (as you can see the model is… lacking)

> What is your name?
I'm a man of many talents. I'm an actor, poet, musician and also a member of the world famous band, The Beatles.
Where were you born?
In Liverpool.
How do you know you're alive?
What's the first thing you'd do if you were alive?
Go to the nearest bar and get really drunk.
What's the last thing you'd do if you were dead?
I would just make sure that nobody ever finds my body.
What is the one talent you want to have?
To be able to fly.
What's the one talent you hope you'll never have?
The ability to cry.

I wish it ran on the GPU, and that it made coherent responses, but the important thing is that IT WORKS!!
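
If anyone wants to push the GPU side from here: the nixpkgs llama-cpp package takes a cudaSupport flag, so swapping the package in the working flake and offloading layers with -ngl should, I think, be enough. A sketch I haven't tested on my machine:

    packages = with pkgs; [
      python312
      # Build llama-cpp with CUDA so layers can be offloaded to the GPU
      (llama-cpp.override { cudaSupport = true; })
      python312Packages.torch
      python312Packages.gguf
    ];

and then run it with the layers offloaded, e.g.:

llama -m models/Meta-Llama-3-8B.Q2_K.gguf -ngl 99 --conversation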

Other variations I tried (All of these failed):

python312Packages.llama-cpp-python not on the Python path
{
description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

# Cachix cache so you don't have to build cuda for 100 hours
nixConfig = {
  extra-substituters = [
    "https://nix-community.cachix.org"
  ];
  extra-trusted-public-keys = [
    "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
    "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
  ];
};

inputs = {
  nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
  nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  # llama-cpp-python doesn't exist yet https://github.com/NixOS/nixpkgs/pull/349657
  llama-cpp-nixpkgs.url = "github:kirillrdy/nixpkgs/llama-cpp-python";
};

outputs = { self, nixpkgs, nixpkgs-unstable, llama-cpp-nixpkgs }: 
let
  system = "x86_64-linux";
  pkgs = import nixpkgs {
    system = system;
    config.allowUnfree = true;
  };
  unstable = import nixpkgs-unstable {
    system = system;
    config.allowUnfree = true;
  };
  llama-cpp = import llama-cpp-nixpkgs {
    system = system;
    config.allowUnfree = true;
  };
in {
  # Make the shell
  devShell.${system} = pkgs.mkShell {
    packages = with pkgs; [
      python312
      llama-cpp.python312Packages.llama-cpp-python
      python312Packages.torch
      python312Packages.gguf
    ];

    shellHook = ''
      echo "You are now using a NIX environment"
    '';
  };
};
}

My inference.py (I copied the docs):

import LlamaCpp

template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate.from_template(template)

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])


llm = LlamaCpp(
  model_path="./models/Meta-Llama-3.1-8B-Instruct_quantized/",
  callback_manager=callback_manager,
)

question = """
Question: A rap battle between Stephen Colbert and John Oliver
"""
llm.invoke(question)

Bash error:

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 1, in <module>
  import LlamaCpp
ModuleNotFoundError: No module named 'LlamaCpp'
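
For the record, I think the snippet I copied is actually the LangChain wrapper example (LlamaCpp, PromptTemplate and CallbackManager are LangChain classes), not the API that llama-cpp-python itself exposes. My understanding of the direct llama-cpp-python usage is closer to this sketch (the .gguf path is just an example, and model_path has to point at a file, not a directory):

from llama_cpp import Llama

# Load a local GGUF model; n_ctx is the context window size
llm = Llama(model_path="./models/Meta-Llama-3-8B.Q2_K.gguf", n_ctx=2048)

# Plain completion call; the result is a dict in an OpenAI-like format
output = llm("Q: Why is the sky blue? A:", max_tokens=128)
print(output["choices"][0]["text"])
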
Running the model via commandline llama-cpp (1st attempt)
{
description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

# Cachix cache so you don't have to build cuda for 100 hours
nixConfig = {
  extra-substituters = [
    "https://nix-community.cachix.org"
  ];
  extra-trusted-public-keys = [
    "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
    "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
  ];
};

inputs = {
  nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
  nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
};

outputs = { self, nixpkgs, nixpkgs-unstable }: 
let
  system = "x86_64-linux";
  pkgs = import nixpkgs {
    system = system;
    config.allowUnfree = true;
  };
  unstable = import nixpkgs-unstable {
    system = system;
    config.allowUnfree = true;
  };
in {
  # Make the shell
  devShell.${system} = pkgs.mkShell {
    packages = with pkgs; [
      python312
      llama-cpp
      python312Packages.torch
      python312Packages.gguf
    ];

    shellHook = ''
      echo "You are now using a NIX environment"
    '';
  };
};
}

The script for running llama-cpp (from Hugging Face)

# Load and run the model:
llama \
--hf-repo "QuantFactory/Meta-Llama-3-8B-GGUF" \
--hf-file Meta-Llama-3-8B.Q2_K.gguf \
-p "You are a helpful assistant" \
--conversation

Bash error when running llama:

Log start
main: build = 2953 (917dc8c)
main: built with gcc (GCC) 13.2.0 for x86_64-unknown-linux-gnu
main: seed  = 1732405734
llama_load_model_from_hf: llama.cpp built without libcurl, downloading from Hugging Face not supported.
llama_init_from_gpt_params: error: failed to load model 'models/Meta-Llama-3-8B.Q2_K.gguf'
main: error: unable to load model

Bash error when converting (llama-3-8b-merged-linear) via convert_hf_to_gguf (to convert model locally):

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/convert_hf_to_gguf.py", line 1073, in <module>
  class OrionModel(Model):
File "/home/syshotdev/Programming/Python/Llama-model-testing/convert_hf_to_gguf.py", line 1074, in OrionModel
  model_arch = gguf.MODEL_ARCH.ORION
               ^^^^^^^^^^^^^^^^^^^^^
AttributeError: type object 'MODEL_ARCH' has no attribute 'ORION'
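
I believe that AttributeError just means the convert_hf_to_gguf.py script is newer than the gguf Python package from nixpkgs, so the script expects architecture constants the older library doesn't have. One way around it, I think, is to run the converter with the gguf-py that ships in the same llama.cpp checkout, so script and library match (paths below are hypothetical):

# Use the gguf-py bundled with the same llama.cpp source tree as the script
PYTHONPATH=~/src/llama.cpp/gguf-py \
  python ~/src/llama.cpp/convert_hf_to_gguf.py ./llama-3-8b-merged-linear --outfile llama-3-8b.gguf
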
Update Python version to 3.12
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable }: 
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python312
        python312Packages.torch-bin
        python312Packages.datasets
        unstable.python312Packages.transformers
        python312Packages.huggingface-hub
        python312Packages.evaluate
        python312Packages.accelerate
        python312Packages.pip
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

Bash error:

error:
       … while calling the 'derivationStrict' builtin

         at /builtin/derivation.nix:9:12: (source not available)

       … while evaluating derivation 'nix-shell'
         whose name attribute is located at /nix/store/ly4s3hw35dd1c2vsd694y2715pc1d2c1-source/pkgs/stdenv/generic/make-derivation.nix:333:7

       … while evaluating attribute 'nativeBuildInputs' of derivation 'nix-shell'

         at /nix/store/ly4s3hw35dd1c2vsd694y2715pc1d2c1-source/pkgs/stdenv/generic/make-derivation.nix:377:7:

          376|       depsBuildBuild              = elemAt (elemAt dependencies 0) 0;
          377|       nativeBuildInputs           = elemAt (elemAt dependencies 0) 1;
             |       ^
          378|       depsBuildTarget             = elemAt (elemAt dependencies 0) 2;

       (stack trace truncated; use '--show-trace' to show the full trace)

       error: triton-2.1.0 not supported for interpreter python3.12

Using ollama python
{
description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

# Cachix cache so you don't have to build cuda for 100 hours
nixConfig = {
  extra-substituters = [
    "https://nix-community.cachix.org"
  ];
  extra-trusted-public-keys = [
    "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
    "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
  ];
};

inputs = {
  nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
  nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
};

outputs = { self, nixpkgs, nixpkgs-unstable }: 
let
  system = "x86_64-linux";
  pkgs = import nixpkgs {
    system = system;
    config.allowUnfree = true;
  };
  unstable = import nixpkgs-unstable {
    system = system;
    config.allowUnfree = true;
  };
in {
  # Make the shell
  devShell.${system} = pkgs.mkShell {
    packages = with pkgs; [
      python312
      # llama-cpp-python doesn't exist yet https://github.com/NixOS/nixpkgs/pull/349657
      llama-cpp
      # https://github.com/ollama/ollama-python
      python312Packages.ollama
      python312Packages.torch
      python312Packages.gguf
    ];

    shellHook = ''
      echo "You are now using a NIX environment"
    '';
  };
};
}

inference.py:

import ollama

stream = ollama.chat(
  model='llama3.1',
  messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
  stream=True,
)

for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)

Bash error:

Traceback (most recent call last):
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_transports/default.py", line 69, in map_httpcore_exceptions
  yield
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_transports/default.py", line 233, in handle_request
  resp = self._pool.handle_request(req)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_sync/connection_pool.py", line 216, in handle_request
  raise exc from None
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_sync/connection_pool.py", line 196, in handle_request
  response = connection.handle_request(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_sync/connection.py", line 99, in handle_request
  raise exc
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_sync/connection.py", line 76, in handle_request
  stream = self._connect(request)
           ^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_sync/connection.py", line 122, in _connect
  stream = self._network_backend.connect_tcp(**kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_backends/sync.py", line 205, in connect_tcp
  with map_exceptions(exc_map):
       ^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/7x6mf0h4z743nrdvszj5xc4l1ig4nif6-python3-3.12.6/lib/python3.12/contextlib.py", line 158, in __exit__
  self.gen.throw(value)
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_exceptions.py", line 14, in map_exceptions
  raise to_exc(exc) from exc
httpcore.ConnectError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 9, in <module>
  for chunk in stream:
               ^^^^^^
File "/nix/store/4qmwzmqanbw4aqxvqssz3m3sbzc4lcq6-python3.12-ollama-0.2.0/lib/python3.12/site-packages/ollama/_client.py", line 78, in _stream
  with self._client.stream(method, url, **kwargs) as r:
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/7x6mf0h4z743nrdvszj5xc4l1ig4nif6-python3-3.12.6/lib/python3.12/contextlib.py", line 137, in __enter__
  return next(self.gen)
         ^^^^^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_client.py", line 870, in stream
  response = self.send(
             ^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_client.py", line 914, in send
  response = self._send_handling_auth(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_client.py", line 942, in _send_handling_auth
  response = self._send_handling_redirects(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_client.py", line 979, in _send_handling_redirects
  response = self._send_single_request(request)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_client.py", line 1015, in _send_single_request
  response = transport.handle_request(request)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_transports/default.py", line 232, in handle_request
  with map_httpcore_exceptions():
       ^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/7x6mf0h4z743nrdvszj5xc4l1ig4nif6-python3-3.12.6/lib/python3.12/contextlib.py", line 158, in __exit__
  self.gen.throw(value)
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_transports/default.py", line 86, in map_httpcore_exceptions
  raise mapped_exc(message) from exc
httpx.ConnectError: [Errno 111] Connection refused
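
In hindsight the connection refused makes sense: the ollama Python package is only a client, and it talks to a local ollama server (http://127.0.0.1:11434 by default) that the flake never starts. One option would be the NixOS service plus pulling the model once; a sketch with option names as I remember them, so please double-check:

  # configuration.nix: run the ollama server as a system service
  services.ollama = {
    enable = true;
    acceleration = "cuda";  # assumption: CUDA offload option on recent nixpkgs
  };

then, before running inference.py:

ollama pull llama3.1
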
Installing all the packages via pip
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
  };

  outputs = { self, nixpkgs }: 
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python311
        # INSTALL PACKAGES VIA PIP
        python311Packages.pip
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
        export CUDA_PATH=${pkgs.cudatoolkit}
        echo $CUDA_PATH
        # Set up the virtual environment (optional)
        if [ ! -d "venv" ]; then
          python -m venv venv
          echo "Virtual environment created."
        fi
        source venv/bin/activate
        echo "Virtual environment activated."
      '';
    };
  };
}

Bash error (after pip-installing torch inside the venv):

Traceback (most recent call last):
  File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 1, in <module>
    import torch
  File "/home/syshotdev/Programming/Python/Llama-model-testing/venv/lib/python3.11/site-packages/torch/__init__.py", line 367, in <module>
    from torch._C import *  # noqa: F403
    ^^^^^^^^^^^^^^^^^^^^^^
ImportError: libstdc++.so.6: cannot open shared object file: No such file or directory
(venv)
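
The libstdc++ error here is the usual NixOS problem with pip-installed wheels: they expect a system-wide libstdc++.so.6 that NixOS doesn't provide. A workaround I've seen, but haven't re-tested in this exact shell, is to export the library path in the shellHook:

      shellHook = ''
        echo "You are now using a NIX environment"
        # pip-installed wheels (like torch) look up libstdc++.so.6 at runtime;
        # point them at the one from the Nix compiler toolchain
        export LD_LIBRARY_PATH=${pkgs.stdenv.cc.cc.lib}/lib:$LD_LIBRARY_PATH
        if [ ! -d "venv" ]; then
          python -m venv venv
        fi
        source venv/bin/activate
      '';
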

Happy that it helped, and thanks for sharing! One step at a time it’ll get easier and hopefully perfect 🙂

hi, i am trying to use

llama-cpp

for running a local llm like llama (the way ollama does), because so far i can only use the cpu to do it, which is very slow. how can i make a flake or system module for my system flake so that every time i run ollama, it uses both my cpu and gpu? i am using nixos btw.

this is a little over my head, did you guys just do this: How do I use the GPU? · ggerganov/llama.cpp · Discussion #3530 · GitHub?
if so, how? by the way, i have my nvidia driver installed on nixos, but i can't run nvcc; nvidia-smi is fine, i can see the cuda version.

I don’t know about ollama, but the -ngl option of llama-cpp lets you specify the number of layers of the neural network to offload to the GPU.

I use the following to launch llama-cpp at boot:

  services.llama-cpp = {
    enable = true;
    port = 8080;
    host = "0.0.0.0";
    model = "/opt/models/Mistral-Nemo-Instruct-2407-Q8_0.gguf";
    package = pkgs.llama-cpp;
    extraFlags = [
      "-c" "48000"
      "-ngl" "99"
    ];
  };
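
Once the service is up, you can sanity-check it with a plain HTTP request (this assumes the stock llama.cpp server API on the port configured above):

curl http://localhost:8080/completion \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'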

i only have (llama-cpp.override { cudaSupport = true; }) installed in systemPackages atm, so how do i even run llama-cpp on nixos? and how do i know llama3 is using both gpu and cpu?

  1. It’s been a while since I messed with this stuff, but I’m pretty sure I only got llama-cpp working on the CPU.
  2. The code you wrote (llama-cpp.override { cudaSupport = true; }) should work in theory, but I haven’t tested it so I can’t confirm.
  3. If that llama-cpp.override did compile, you could test whether it’s using the GPU by running llama-cpp -ngl <number_of_layers_on_gpu> -m <model> and watching nvidia-smi while it generates.

I’ll check out running AI on my system again (I moved my AI testing to a dual-booted linux mint), but no guarantees I’ll solve everything

i will be testing it tonight; in the meantime, i am looking at the possibility of running llama-index too, since i do want to build an agent to automate some tasks with local RAG.

I can confirm that nixos-rebuild did compile everything, and running llama gives this output:

ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4060 Laptop GPU, compute capability 8.9, VMM: yes
build: 0 (unknown) with gcc (GCC) 13.3.0 for x86_64-unknown-linux-gnu
main: llama backend init
main: load the model and apply lora adapter, if any
llama_load_model_from_file: using device CUDA0 (NVIDIA GeForce RTX 4060 Laptop GPU) - 7720 MiB free
gguf_init_from_file: failed to open 'models/7B/ggml-model-f16.gguf': 'No such file or directory'
llama_model_load: error loading model: llama_model_loader: failed to load model from models/7B/ggml-model-f16.gguf

llama_load_model_from_file: failed to load model
common_init_from_params: failed to load model 'models/7B/ggml-model-f16.gguf'
main: error: unable to load model

i haven't downloaded any models yet, and i am still not sure how this works; this is a lot more involved than ollama, where you just pull the model and go.

out of all the llama-* commands i have tested, only one does not exist: llama-run. not sure why; it seems to be the one where i can pull models from ollama? not sure.

nevertheless, nvidia's proprietary driver still does not play nicely with linux. and buying a 7k m4 max laptop is a bit absurd, although llama4 is around the corner.

but time spent getting those tools to work right for your application can save you time on the real job.

(Wow! I did not expect llama to compile that easily AND recognize the GPU!)

Sorry, my bad, you have to pass the Hugging Face flags (--hf-repo and --hf-file, as in the earlier command) to use models from online:
llama -ngl <number_of_layers_on_gpu> --hf-repo <repo> --hf-file <file>.gguf

For RAG applications (never heard of it, but from what I found online it's "retrieving data from a lot of files or something") I would add the --ctx-size parameter. It sets the context length: the larger it is, the more memory it takes, but the more context (words) the model can handle. 32000 is around the max context length for a traditional llama model.
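
For example, something like this (model path and layer count are just placeholders):

llama -m <model>.gguf -ngl 99 --ctx-size 8192 --conversation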

so i guess it is a good thing that it compiles, right? this is my first time using llama.cpp; i just tried to download the model from huggingface, and am waiting for their approval.

as for llama-index, it seems it is just another llm python library, a prompt engineering tool. RAG stands for Retrieval-Augmented Generation; i am a little surprised you don't know about it. it basically reads documents for you: any information can be indexed and then queried through a simple prompt. and once built, there is nothing "online"; i do not like the idea of "online", everything i do will be local on my computer.

oh boy, still having no clue how to set up a local shell/flake for python on nixos. I know we are not supposed to be using pip directly, but my goal is to set up python COMPLETELY LOCAL, without polluting the global environment, using a shell/flake.

my goal is to use as much of my computer's power as possible to run a local llm, before maybe i move to an m4 max.

my idea, however, is a little crazy:

i want to use a nix submodule to pick which specific language i will be using for the development work, such that if it is AI related, i will be able to use that overlay only within the project, locally. I might have to look around; this is not so easy to set up, it seems.

I import the following Nix file in my configuration (non-flakes version here).

The overlays are due to compilation errors when recompiling for Zen4 CPUs.

{ config, pkgs, lib, ... }:
let

  libvorbisOverlay = self: super: {
    libvorbis = super.libvorbis.overrideAttrs (oldAttrs: {
      NIX_CFLAGS_COMPILE = [ "-march=znver3" ];
    });
  };

  redisOverlay = self: super: {
    redis = super.redis.overrideAttrs (oldAttrs: {
      env = {
        NIX_CFLAGS_COMPILE = "-march=x86-64";
      };
      doCheck = false;
    });
  };

  pkgsCuda = import <nixos> {
    config = config.nixpkgs.config // {
      allowUnfree = true;
      cudaSupport = true;
      rocmSupport = false;
      openclSupport = true;
    };
    localSystem = {
      gcc.arch = "znver4";
      gcc.tune = "znver4";
      system = "x86_64-linux";
    };
    overlays = [
      libvorbisOverlay
      redisOverlay  
    ];
  };
in
{
  services.llama-cpp = {
    enable = true;
    port = 8080;
    host = "0.0.0.0";
    model = "/opt/models/Mistral-Nemo-Instruct-2407-Q8_0.gguf";
    package = pkgsCuda.llama-cpp;
    extraFlags = [
      "-c" "48000"
      "-ngl" "99"
    ];
  };

  users.users.sepal.packages = with pkgs; [
    pkgsCuda.llama-cpp

    (pkgsCuda.python312.withPackages (ps: with ps; [
      # For llama-cpp convert
      ps.gguf
    ]))
  ];
}

May I ask what you are using llama.cpp for?

Mostly automating instant messaging to extract and forward information to other platforms (issue tracker, …).

i am planning to make a code agent that can automatically complete advent of code.

i know how crazy that sounds.