Nix development environment for Python AI / Llama 3.1

Hello everybody! These past few days I’ve been struggling to get a Nix shell that can run
Llama 3.1 on my NVIDIA GPU. Every attempt ends up with the wrong package versions, or straight up fails to compile.

If anyone can share a similar config, a different method of installing Python packages, or general advice on what I should do, please do!

inference.py for running the model (it drifted slightly across the commits)
import transformers
import torch
from transformers import pipeline, AutoTokenizer

# Define the path to the model and the tokenizer
model_path = "./models/Meta-Llama-3.1-8B-Instruct_quantized/"
# Define rope_scaling with the required fields only
rope_scaling = {"type": "llama3", "factor": 8.0}

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the pipeline with the model and tokenizer
text_generator = pipeline(
    "text-generation",
    model=model_path,
    tokenizer=AutoTokenizer.from_pretrained(model_path),
    model_kwargs={"torch_dtype": torch.bfloat16, "rope_scaling": rope_scaling},
    device=device
)


# Infinite loop to take user input and generate responses
try:
    while True:
        prompt = input("Enter a prompt: ")
        if prompt.lower() == 'exit':
            break
        # NB: earlier commits named this `pipe`; it has to match the variable above
        outputs = text_generator(prompt, max_new_tokens=256, do_sample=False)
        print(outputs[0]['generated_text'])
except KeyboardInterrupt:
    print("Program terminated.")

My attempts:

(shell.nix) Naive attempt with torchWithCuda
{ pkgs ? import (fetchTarball {
    url = "https://github.com/NixOS/nixpkgs/archive/nixos-24.05.tar.gz";
    # Placeholder hash: let Nix complain, then paste in the real hash it reports
    sha256 = "0rqkpdipwq1ld352sg7h2a1zc1xg3rj5ay6dlr337cysj4xsgn7b";
  }) { config.allowUnfree = true; },

  unstable ? import (fetchTarball {
    url = "https://github.com/NixOS/nixpkgs/archive/nixos-unstable.tar.gz";
    sha256 = "0rqkpdipwq1ld352sg7h2a1zc1xg3rj5ay6dlr337cysj4xsgn7b";
  }) { config.allowUnfree = true; } }:
pkgs.mkShell {
  buildInputs = with pkgs; [
    # Python 3.12, to match the python312Packages below
    python312
    python312Packages.pip
    python312Packages.virtualenv
    unstable.python312Packages.transformers
    python312Packages.torchWithCuda
    python312Packages.accelerate
  ];

  shellHook = ''
    # Set up the virtual environment (optional)
    if [ ! -d "venv" ]; then
      python -m venv venv
      echo "Virtual environment created."
    fi
    source venv/bin/activate
    echo "Virtual environment activated."
  '';
}

Problem: it takes hours to compile
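
(In hindsight the slowness is expected: CUDA-enabled packages are unfree, so cache.nixos.org doesn’t serve binaries for them and torchWithCuda gets compiled from source. That is exactly what the Cachix substituters in the next attempt are meant to avoid.)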

(flake.nix) Flake with cachix
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
  };

  outputs = { self, nixpkgs }: 
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
      config.cudaSupport = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python311
        python311Packages.torch-bin
        python311Packages.datasets
        python311Packages.transformers
        python311Packages.evaluate
        python311Packages.accelerate
        python311Packages.pip
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
        export CUDA_PATH=${pkgs.cudatoolkit}
        echo $CUDA_PATH
        # Set up the virtual environment (optional)
        if [ ! -d "venv" ]; then
          python -m venv venv
          echo "Virtual environment created."
        fi
        source venv/bin/activate
        echo "Virtual environment activated."
      '';
    };
  };
}

flake.lock:

{
  "nodes": {
    "nixpkgs": {
      "locked": {
        "lastModified": 1731386116,
        "narHash": "sha256-lKA770aUmjPHdTaJWnP3yQ9OI1TigenUqVC3wweqZuI=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "689fed12a013f56d4c4d3f612489634267d86529",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "ref": "nixos-24.05",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "nixpkgs": "nixpkgs"
      }
    }
  },
  "root": "root",
  "version": 7
}

Bash error:

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 9, in <module>
  model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/mk8jls52jry9pkwhcwdxdwg16z9374x1-python3.11-transformers-4.41.0/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py", line 523, in from_pretrained
  config, kwargs = AutoConfig.from_pretrained(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/mk8jls52jry9pkwhcwdxdwg16z9374x1-python3.11-transformers-4.41.0/lib/python3.11/site-packages/transformers/models/auto/configuration_auto.py", line 958, in from_pretrained
  return config_class.from_dict(config_dict, **unused_kwargs)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/mk8jls52jry9pkwhcwdxdwg16z9374x1-python3.11-transformers-4.41.0/lib/python3.11/site-packages/transformers/configuration_utils.py", line 768, in from_dict
  config = cls(**config_dict)
           ^^^^^^^^^^^^^^^^^^
File "/nix/store/mk8jls52jry9pkwhcwdxdwg16z9374x1-python3.11-transformers-4.41.0/lib/python3.11/site-packages/transformers/models/llama/configuration_llama.py", line 161, in __init__
  self._rope_scaling_validation()
File "/nix/store/mk8jls52jry9pkwhcwdxdwg16z9374x1-python3.11-transformers-4.41.0/lib/python3.11/site-packages/transformers/models/llama/configuration_llama.py", line 182, in _rope_scaling_validation
  raise ValueError(
ValueError: `rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}

(flake.nix) Updated transformers

The error is discussed here: meta-llama/Llama-3.1-8B-Instruct · ValueError: `rope_scaling` must be a dictionary with two fields. The transformers 4.41 packaged in 24.05 predates Llama 3.1’s llama3-style rope scaling (support arrived around transformers 4.43), so this attempt pulls transformers from unstable.

{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    # Latest nixpkgs commit at the time
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable }:
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python311
        python311Packages.torch-bin
        python311Packages.datasets
        unstable.python311Packages.transformers
        python311Packages.evaluate
        python311Packages.accelerate
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

Bash error:

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 1, in <module>
  import transformers
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/__init__.py", line 26, in <module>
  from . import dependency_versions_check
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/dependency_versions_check.py", line 57, in <module>
  require_version_core(deps[pkg])
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 117, in require_version_core
  return require_version(requirement, hint)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 111, in require_version
  _compare_versions(op, got_ver, want_ver, requirement, pkg, hint)
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 44, in _compare_versions
  raise ImportError(
ImportError: huggingface-hub>=0.23.2,<1.0 is required for a normal functioning of this module, but found huggingface-hub==0.23.0.
Try: `pip install transformers -U` or `pip install -e '.[dev]'` if you're working with git main

(flake.nix) Force add huggingface-hub as package
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable }: 
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python311
        python311Packages.torch-bin
        python311Packages.datasets
        unstable.python311Packages.transformers
        python311Packages.huggingface-hub
        python311Packages.evaluate
        python311Packages.accelerate
        python311Packages.pip
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

Bash error (exactly the same as last time; the package version didn’t even change):

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 1, in <module>
  import transformers
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/__init__.py", line 26, in <module>
  from . import dependency_versions_check
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/dependency_versions_check.py", line 57, in <module>
  require_version_core(deps[pkg])
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 117, in require_version_core
  return require_version(requirement, hint)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 111, in require_version
  _compare_versions(op, got_ver, want_ver, requirement, pkg, hint)
File "/nix/store/6jq5jq1v2g1nlma811m0lrkj5ww145ml-python3.11-transformers-4.46.0/lib/python3.11/site-packages/transformers/utils/versions.py", line 44, in _compare_versions
  raise ImportError(
ImportError: huggingface-hub>=0.23.2,<1.0 is required for a normal functioning of this module, but found huggingface-hub==0.23.0.
Try: `pip install transformers -U` or `pip install -e '.[dev]'` if you're working with git main
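
Looking back, I think I understand this failure: mixing python311Packages from two nixpkgs revisions puts two copies of huggingface-hub on the PYTHONPATH (the 24.05 packages propagate 0.23.0, while unstable’s transformers wants >=0.23.2), and the stale copy wins. Adding python311Packages.huggingface-hub from stable just adds that same 0.23.0 again, which is why nothing changed. What should avoid the clash is taking every Python package from a single revision, something like this untested sketch:

# Untested sketch: build one Python environment entirely from unstable,
# so transformers and huggingface-hub can't disagree about versions.
(unstable.python311.withPackages (ps: with ps; [
  torch-bin
  transformers
  datasets
  evaluate
  accelerate
]))
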
(shell.nix) Tried shell.nix again, pinned specific packages
{ pkgs ? import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/0bce9e80c6828de1c0af63bc96ca2059b0652a16.tar.gz") {}
}:
let
  pythonEnv = pkgs.python311.withPackages (pythonPackages: with pythonPackages; [
    (pythonPackages.buildPythonPackage rec {
      pname = "transformers";
      version = "4.35.0";

      src = pkgs.fetchPypi {
        inherit pname version;
        sha256 = "sha256-5LQXY/ZRKC/JeTSNOqFIJEOH3ckWX0sYRVeYx3CuI7k=";
      };

      doCheck = false; # Skip tests for faster builds
    })
    (pythonPackages.buildPythonPackage rec {
      pname = "torch";
      version = "2.5.1";

      src = pkgs.fetchurl {
        # NB: this is a CPython 3.13 wheel, but the environment is Python 3.11,
        # so this wheel could never have loaded here anyway
        url = "https://files.pythonhosted.org/packages/69/72/20cb30f3b39a9face296491a86adb6ff8f1a47a897e4d14667e6cf89d5c3/torch-${version}-cp313-cp313-manylinux1_x86_64.whl";
        sha256 = "sha256-m2Ht87T247DgrdqLOWAma5AJ0Cs3VVlx9NHI96Ba/tc="; # Replace with the correct hash
      };

      format = "wheel";
      doCheck = false;
    })

    (pythonPackages.buildPythonPackage rec {
      pname = "accelerate";
      version = "0.22.0";
      src = pkgs.fetchPypi {
        inherit pname version;
        sha256 = "sha256-KwqD480HyJRIxdWpT3K8HbmNXgxJjKF5hIcfAdv4Mkc=";
      };
      doCheck = false;
    })

    pip
  ]);
in
pkgs.mkShell {
  name = "custom-package-version-shell";

  buildInputs = [    
    pythonEnv
  ];
  shellHook = ''
    echo "You are now using a NIX environment"
    # Set up the virtual environment (optional)
    if [ ! -d "venv" ]; then
      python -m venv venv
      echo "Virtual environment created."
    fi
    source venv/bin/activate
    echo "Virtual environment activated."
  '';
}

Bash error (the packages weren’t installed):

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 1, in <module>
  import transformers
ModuleNotFoundError: No module named 'transformers'
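
My guess now is that the venv, not the pinning, broke this attempt: python -m venv creates an isolated environment, so after source venv/bin/activate the Nix-provided site-packages are hidden. If you want a venv on top of a Nix Python, it probably needs the --system-site-packages flag (untested):

shellHook = ''
  if [ ! -d "venv" ]; then
    # Inherit the Nix-provided site-packages instead of shadowing them
    python -m venv venv --system-site-packages
    echo "Virtual environment created."
  fi
  source venv/bin/activate
'';
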
(flake.nix) Maybe pinning packages will work in a flake?
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable }: 
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        /*
        (pkgs.python311.withPackages (ps: with ps; [
          torch
          accelerate
          pip
        ]))
        */
        python311
        python311Packages.torch-bin
        python311Packages.datasets
        unstable.python311Packages.transformers
        (pkgs.python311Packages.buildPythonPackage rec {
          pname = "huggingface-hub";
          version = "0.26.2";
          pyproject = true;

          src = fetchFromGitHub {
            owner = "huggingface";
            repo = "huggingface_hub";
            rev = "refs/tags/v${version}";
            hash = "sha256-F2E8P0Hq3Ee+RXUEN4t2JtfBtK36aMsHQCnid9VWdLk=";
          };

          build-system = with pkgs; [ setuptools ];

          dependencies = with pkgs; [
            filelock
            fsspec
            packaging
            pyyaml
            requests
            tqdm
            typing-extensions
          ];

          # Tests require network access.
          doCheck = false;

          pythonImportsCheck = [ "huggingface_hub" ];
        })
        python311Packages.evaluate
        python311Packages.accelerate
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

Bash error:

     at /nix/store/0gil5b8wly5pg4yw42iy1vzh3d25c62z-source/flake.nix:58:39:

         57|
         58|           build-system = with pkgs; [ setuptools ];
           |                                       ^
         59|
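
As far as I can tell this eval error is a scoping mistake on my part: build-system and dependencies draw from the top-level pkgs, but setuptools, filelock and the rest live in the Python package set. Presumably it should have been:

build-system = with pkgs.python311Packages; [ setuptools ];

dependencies = with pkgs.python311Packages; [
  filelock
  fsspec
  packaging
  pyyaml
  requests
  tqdm
  typing-extensions
];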

Have you tried using llama.cpp? It is packaged in Nixpkgs and works well on both NVIDIA and AMD; I use it daily.

See also python312Packages.llama-cpp-python: init at 0.3.1 by kirillrdy · Pull Request #349657 · NixOS/nixpkgs · GitHub for a Python API.

Thank you @sepal for responding! With your suggestion, I eventually found a (not so perfect) solution, but the only thing that matters is that it works. I’ve laid out my other attempts in hopes that someone can eventually fix them.

I’ve also realized that in the last post I didn’t include any info about my GPU, so here it is:

Nvidia Graphics Card Info

Nix config:

{ config, lib, pkgs, ...}:
{
  services.xserver.videoDrivers = ["nvidia"];

  hardware.nvidia = {
    open = false;
    nvidiaSettings = true;

    modesetting.enable = true;
    powerManagement.enable = false;
    powerManagement.finegrained = false;
  };

  hardware.nvidia.package = config.boot.kernelPackages.nvidiaPackages.beta;
}

Output from nvidia-smi

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.78                 Driver Version: 550.78         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 3080        Off |   00000000:01:00.0  On |                  N/A |
| 53%   34C    P8             36W /  320W |     925MiB /  10240MiB |     30%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A      1316      G   ...810w2l99l-xorg-server-21.1.14/bin/X        126MiB |
|    0   N/A  N/A      2330      G   ...irefox-132.0.2/bin/.firefox-wrapped        187MiB |
|    0   N/A  N/A    186414      G   ...810w2l99l-xorg-server-21.1.14/bin/X        301MiB |
|    0   N/A  N/A    186834      G   /run/current-system/sw/bin/cinnamon            66MiB |
|    0   N/A  N/A    187232      G   /run/current-system/sw/bin/kitty                6MiB |
|    0   N/A  N/A    187854      G   ...irefox-132.0.2/bin/.firefox-wrapped        137MiB |
+-----------------------------------------------------------------------------------------+

The working configuration (Commandline llama-cpp)

The flake.nix I used:
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable }:
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python312
        llama-cpp
        python312Packages.torch
        python312Packages.gguf
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

The flake.lock I used:

{
  "nodes": {
    "nixpkgs": {
      "locked": {
        "lastModified": 1731797254,
        "narHash": "sha256-df3dJApLPhd11AlueuoN0Q4fHo/hagP75LlM5K1sz9g=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "e8c38b73aeb218e27163376a2d617e61a2ad9b59",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "ref": "nixos-24.05",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "nixpkgs-unstable": {
      "locked": {
        "lastModified": 1730135610,
        "narHash": "sha256-i6EAc+Z/yVGk3HyXsIfu4yuv1l7KCaima3tNk9sxx18=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "0bce9e80c6828de1c0af63bc96ca2059b0652a16",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "0bce9e80c6828de1c0af63bc96ca2059b0652a16",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "nixpkgs": "nixpkgs",
        "nixpkgs-unstable": "nixpkgs-unstable"
      }
    }
  },
  "root": "root",
  "version": 7
}

Download Meta-Llama-3-8B.Q2_K.gguf from the QuantFactory/Meta-Llama-3-8B-GGUF repository on Hugging Face

Then put the downloaded .gguf in your ./models directory

Then run this command to start a conversation:

llama -m models/Meta-Llama-3-8B.Q2_K.gguf --conversation

My sample conversation (as you can see the model is… lacking)

> What is your name?
I'm a man of many talents. I'm an actor, poet, musician and also a member of the world famous band, The Beatles.
Where were you born?
In Liverpool.
How do you know you're alive?
What's the first thing you'd do if you were alive?
Go to the nearest bar and get really drunk.
What's the last thing you'd do if you were dead?
I would just make sure that nobody ever finds my body.
What is the one talent you want to have?
To be able to fly.
What's the one talent you hope you'll never have?
The ability to cry.

I wish it ran on the GPU, and that it made coherent responses, but the important thing is that IT WORKS!!
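
On the GPU front: if I read the nixpkgs package right, llama-cpp is CPU-only unless you ask for CUDA, so an override like this (untested) might be all that’s missing:

packages = with pkgs; [
  python312
  # Build llama.cpp against CUDA instead of CPU-only
  (llama-cpp.override { cudaSupport = true; })
  python312Packages.torch
  python312Packages.gguf
];

Then pass something like -ngl 32 to the llama command to offload most of the layers to the GPU.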

Other variations I tried (all of these failed):

python312Packages.llama-cpp-python not on the Python path
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
    # llama-cpp-python doesn't exist yet https://github.com/NixOS/nixpkgs/pull/349657
    llama-cpp-nixpkgs.url = "github:kirillrdy/nixpkgs/llama-cpp-python";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable, llama-cpp-nixpkgs }:
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
    llama-cpp = import llama-cpp-nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python312
        llama-cpp.python312Packages.llama-cpp-python
        python312Packages.torch
        python312Packages.gguf
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

My inference.py (I copied the docs):

import LlamaCpp

template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate.from_template(template)

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])


llm = LlamaCpp(
  model_path="./models/Meta-Llama-3.1-8B-Instruct_quantized/",
  callback_manager=callback_manager,
)

question = """
Question: A rap battle between Stephen Colbert and John Oliver
"""
llm.invoke(question)

Bash error:

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 1, in <module>
  import LlamaCpp
ModuleNotFoundError: No module named 'LlamaCpp'
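
(In hindsight, this snippet is LangChain’s llama-cpp example: LlamaCpp, PromptTemplate, CallbackManager and StreamingStdOutCallbackHandler all come from the langchain packages, e.g. from langchain_community.llms import LlamaCpp, and none of those were in the environment. The model_path also has to point at a .gguf file, not a directory.)
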
Running the model via commandline llama-cpp (1st attempt)
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable }:
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python312
        llama-cpp
        python312Packages.torch
        python312Packages.gguf
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

The script for running llama-cpp (from Hugging Face)

# Load and run the model:
llama \
--hf-repo "QuantFactory/Meta-Llama-3-8B-GGUF" \
--hf-file Meta-Llama-3-8B.Q2_K.gguf \
-p "You are a helpful assistant" \
--conversation

Bash error when running llama:

Log start
main: build = 2953 (917dc8c)
main: built with gcc (GCC) 13.2.0 for x86_64-unknown-linux-gnu
main: seed  = 1732405734
llama_load_model_from_hf: llama.cpp built without libcurl, downloading from Hugging Face not supported.
llama_init_from_gpt_params: error: failed to load model 'models/Meta-Llama-3-8B.Q2_K.gguf'
main: error: unable to load model

Bash error when converting (llama-3-8b-merged-linear) via convert_hf_to_gguf (to convert model locally):

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/convert_hf_to_gguf.py", line 1073, in <module>
  class OrionModel(Model):
File "/home/syshotdev/Programming/Python/Llama-model-testing/convert_hf_to_gguf.py", line 1074, in OrionModel
  model_arch = gguf.MODEL_ARCH.ORION
               ^^^^^^^^^^^^^^^^^^^^^
AttributeError: type object 'MODEL_ARCH' has no attribute 'ORION'
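
(My guess for this one: convert_hf_to_gguf.py was copied from a newer llama.cpp tree than the python312Packages.gguf in this nixpkgs revision, so the script references model architectures that the older gguf module doesn’t define.)
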
Update Python version to 3.12
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable }: 
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python312
        python312Packages.torch-bin
        python312Packages.datasets
        unstable.python312Packages.transformers
        python312Packages.huggingface-hub
        python312Packages.evaluate
        python312Packages.accelerate
        python312Packages.pip
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

Bash error:

error:
       … while calling the 'derivationStrict' builtin

         at /builtin/derivation.nix:9:12: (source not available)

       … while evaluating derivation 'nix-shell'
         whose name attribute is located at /nix/store/ly4s3hw35dd1c2vsd694y2715pc1d2c1-source/pkgs/stdenv/generic/make-derivation.nix:333:7

       … while evaluating attribute 'nativeBuildInputs' of derivation 'nix-shell'

         at /nix/store/ly4s3hw35dd1c2vsd694y2715pc1d2c1-source/pkgs/stdenv/generic/make-derivation.nix:377:7:

          376|       depsBuildBuild              = elemAt (elemAt dependencies 0) 0;
          377|       nativeBuildInputs           = elemAt (elemAt dependencies 0) 1;
             |       ^
          378|       depsBuildTarget             = elemAt (elemAt dependencies 0) 2;

       (stack trace truncated; use '--show-trace' to show the full trace)

       error: triton-2.1.0 not supported for interpreter python3.12
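
(So on this nixpkgs revision the Python 3.12 torch stack still depends on triton 2.1.0, which doesn’t support Python 3.12; that’s why the earlier attempts stuck with python311.)
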
Using ollama python
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
    nixpkgs-unstable.url = "github:nixos/nixpkgs/0bce9e80c6828de1c0af63bc96ca2059b0652a16";
  };

  outputs = { self, nixpkgs, nixpkgs-unstable }:
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
    unstable = import nixpkgs-unstable {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python312
        # llama-cpp-python doesn't exist yet https://github.com/NixOS/nixpkgs/pull/349657
        llama-cpp
        # https://github.com/ollama/ollama-python
        python312Packages.ollama
        python312Packages.torch
        python312Packages.gguf
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
      '';
    };
  };
}

inference.py:

import ollama

stream = ollama.chat(
  model='llama3.1',
  messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
  stream=True,
)

for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)

Bash error:

Traceback (most recent call last):
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_transports/default.py", line 69, in map_httpcore_exceptions
  yield
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_transports/default.py", line 233, in handle_request
  resp = self._pool.handle_request(req)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_sync/connection_pool.py", line 216, in handle_request
  raise exc from None
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_sync/connection_pool.py", line 196, in handle_request
  response = connection.handle_request(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_sync/connection.py", line 99, in handle_request
  raise exc
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_sync/connection.py", line 76, in handle_request
  stream = self._connect(request)
           ^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_sync/connection.py", line 122, in _connect
  stream = self._network_backend.connect_tcp(**kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_backends/sync.py", line 205, in connect_tcp
  with map_exceptions(exc_map):
       ^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/7x6mf0h4z743nrdvszj5xc4l1ig4nif6-python3-3.12.6/lib/python3.12/contextlib.py", line 158, in __exit__
  self.gen.throw(value)
File "/nix/store/nvkd9zghsyl0sxymcilzrmdlrm3h2znn-python3.12-httpcore-1.0.5/lib/python3.12/site-packages/httpcore/_exceptions.py", line 14, in map_exceptions
  raise to_exc(exc) from exc
httpcore.ConnectError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 9, in <module>
  for chunk in stream:
               ^^^^^^
File "/nix/store/4qmwzmqanbw4aqxvqssz3m3sbzc4lcq6-python3.12-ollama-0.2.0/lib/python3.12/site-packages/ollama/_client.py", line 78, in _stream
  with self._client.stream(method, url, **kwargs) as r:
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/7x6mf0h4z743nrdvszj5xc4l1ig4nif6-python3-3.12.6/lib/python3.12/contextlib.py", line 137, in __enter__
  return next(self.gen)
         ^^^^^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_client.py", line 870, in stream
  response = self.send(
             ^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_client.py", line 914, in send
  response = self._send_handling_auth(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_client.py", line 942, in _send_handling_auth
  response = self._send_handling_redirects(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_client.py", line 979, in _send_handling_redirects
  response = self._send_single_request(request)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_client.py", line 1015, in _send_single_request
  response = transport.handle_request(request)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_transports/default.py", line 232, in handle_request
  with map_httpcore_exceptions():
       ^^^^^^^^^^^^^^^^^^^^^^^^^
File "/nix/store/7x6mf0h4z743nrdvszj5xc4l1ig4nif6-python3-3.12.6/lib/python3.12/contextlib.py", line 158, in __exit__
  self.gen.throw(value)
File "/nix/store/qgpmdjdb9m87cwykahl75lp9nmlbkd3b-python3.12-httpx-0.27.0/lib/python3.12/site-packages/httpx/_transports/default.py", line 86, in map_httpcore_exceptions
  raise mapped_exc(message) from exc
httpx.ConnectError: [Errno 111] Connection refused
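
(The cause here is simple in hindsight: the ollama Python package is just an HTTP client and expects an ollama server on localhost:11434. Nothing was listening, hence the connection refused; an ollama serve process, or the NixOS services.ollama module, has to be running first.)
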
Installing all the packages via pip
{
  description = "Development environment flake with CUDA Cachix cache, NixOS 24.05, and basic tools";

  # Cachix cache so you don't have to build cuda for 100 hours
  nixConfig = {
    extra-substituters = [
      "https://nix-community.cachix.org"
    ];
    extra-trusted-public-keys = [
      "nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs="
      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
    ];
  };

  inputs = {
    nixpkgs.url = "github:nixos/nixpkgs/nixos-24.05";
  };

  outputs = { self, nixpkgs }: 
  let
    system = "x86_64-linux";
    pkgs = import nixpkgs {
      system = system;
      config.allowUnfree = true;
    };
  in {
    # Make the shell
    devShell.${system} = pkgs.mkShell {
      packages = with pkgs; [
        python311
        # INSTALL PACKAGES VIA PIP
        python311Packages.pip
        cudatoolkit
        linuxPackages.nvidia_x11
      ];

      shellHook = ''
        echo "You are now using a NIX environment"
        export CUDA_PATH=${pkgs.cudatoolkit}
        echo $CUDA_PATH
        # Set up the virtual environment (optional)
        if [ ! -d "venv" ]; then
          python -m venv venv
          echo "Virtual environment created."
        fi
        source venv/bin/activate
        echo "Virtual environment activated."
      '';
    };
  };
}

Bash error:

Traceback (most recent call last):
  File "/home/syshotdev/Programming/Python/Llama-model-testing/inference.py", line 1, in <module>
    import torch
  File "/home/syshotdev/Programming/Python/Llama-model-testing/venv/lib/python3.11/site-packages/torch/__init__.py", line 367, in <module>
    from torch._C import *  # noqa: F403
    ^^^^^^^^^^^^^^^^^^^^^^
ImportError: libstdc++.so.6: cannot open shared object file: No such file or directory
(venv)
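
For anyone hitting the same wall: pip-installed wheels link against a system libstdc++, which NixOS deliberately doesn’t put in a global path. The usual workarounds I’ve seen are enabling programs.nix-ld, or pointing LD_LIBRARY_PATH at the compiler’s runtime libraries from the shellHook, roughly like this untested sketch:

shellHook = ''
  # Let pip-installed wheels find libstdc++ and the NixOS GPU driver libraries
  export LD_LIBRARY_PATH=${pkgs.stdenv.cc.cc.lib}/lib:/run/opengl-driver/lib:$LD_LIBRARY_PATH
  source venv/bin/activate
'';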

Happy that it helped, and thanks for sharing! One step at a time, it’ll get easier and hopefully perfect :slight_smile: