Simulating a Kubernetes cluster with containers

I think I have a basic setup for this:

{ pkgs, lib, config, ... }:
let
  kubeMasterIP = "172.17.179.190";
  kubeMasterHostname = "api.kube";
  kubeMasterAPIServerPort = 6443;

  mkNode = { ip, hostname, port ? 6443 }: {
    autoStart = true;                
    privateNetwork = true;           
    hostAddress = kubeMasterIP;
    localAddress = ip;
    config = { config, pkgs, ... }: {

      # resolve master hostname
      networking.extraHosts = "${kubeMasterIP} ${kubeMasterHostname}";

      services.kubernetes = {
        roles = [ "node" ];
        masterAddress = kubeMasterHostname;

        # point kubelet and other services to kube-apiserver
        kubelet.kubeconfig.server = "https://${kubeMasterHostname}:${toString kubeMasterAPIServerPort}";
        apiserverAddress = "https://${kubeMasterHostname}:${toString kubeMasterAPIServerPort}";
      
        kubelet.extraOpts = "--fail-swap-on=false";
      };

      system.stateVersion = "22.05";

      networking.firewall = {
        enable = true;
        allowedTCPPorts = [
          6443   # kube-apiserver
          2379   # etcd client
          2380   # etcd peer
          10250  # kubelet API
          10259  # kube-scheduler
          10257  # kube-controller-manager
        ];
      };

      # Manually configure nameserver. Using resolved inside the container seems to fail
      # currently
      environment.etc."resolv.conf".text = "nameserver 1.1.1.1";
    };
  };
in
{
  networking = {
    nat = {
      enable = true;
      internalInterfaces = ["ve-+"];
      # externalInterface = "ens3";
      # Lazy IPv6 connectivity for the container
      enableIPv6 = true;
    };
    extraHosts = ''
      ${kubeMasterIP} ${kubeMasterHostname}
      127.0.1.1       nixos.  nixos
      # copied from wsl host file:
      192.168.188.29  host.docker.internal
      192.168.188.29  gateway.docker.internal
      127.0.0.1       kubernetes.docker.internal

      # The following lines are desirable for IPv6 capable hosts
      ::1     ip6-localhost ip6-loopback
      fe00::0 ip6-localnet
      ff00::0 ip6-mcastprefix
      ff02::1 ip6-allnodes
      ff02::2 ip6-allrouters
    '';
    firewall = {
      enable = true;
      allowedTCPPorts = [ 80 ];
    };
  };

  wsl.wslConf.network.generateHosts = false; # for wsl!

  # packages for administration tasks
  environment.systemPackages = with pkgs; [
    kompose
    kubectl
    kubernetes
  ];

  services.kubernetes = {
    roles = ["master" "node"];
    masterAddress = kubeMasterHostname;
    apiserverAddress = "https://${kubeMasterHostname}:${toString kubeMasterAPIServerPort}";
    apiserver = {
      securePort = kubeMasterAPIServerPort;
      advertiseAddress = kubeMasterIP;
    };

    kubelet.extraOpts = "--fail-swap-on=false";
  };

  containers.kubenode1 = mkNode { ip = "10.1.1.1"; hostname = "node1.kube"; };
  containers.kubenode2 = mkNode { ip = "10.1.1.2"; hostname = "node2.kube"; };
  containers.kubenode3 = mkNode { ip = "10.1.1.3"; hostname = "node3.kube"; };	
}

Some of the /etc/hosts modifications are a little weird because I am currently running this via WSL.

This still requires me to manually copy the CA cert to the master node (see etcd not init etcd.pem with services.kubernetes.roles master · Issue #59364 · NixOS/nixpkgs · GitHub) and to every container node as well. Additionally, I need to run echo TOKEN | nixos-kubernetes-node-join on every container too.

Is there a way I could do these steps declaratively too? (rough idea below)
Also, I'm not so sure about my port forwarding ^^ (see Ports and Protocols | Kubernetes)
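
Regarding the join step: one idea I haven't tried yet is to bind-mount the join token into each container and run the join command from a oneshot unit, roughly like the sketch below. This assumes easyCerts keeps the token at /var/lib/kubernetes/secrets/apitoken.secret on the host and that nixos-kubernetes-node-join is available on the unit's PATH, neither of which I have verified:

# sketch only, untested: extra attributes that could be merged into mkNode
{
  bindMounts."/run/kube-apitoken" = {
    hostPath = "/var/lib/kubernetes/secrets/apitoken.secret"; # assumed token location on the host
    isReadOnly = true;
  };
  config = { config, pkgs, ... }: {
    systemd.services.kubernetes-node-join = {
      description = "Join the cluster using the shared apitoken";
      wantedBy = [ "multi-user.target" ];
      after = [ "kubelet.service" ];
      serviceConfig = {
        Type = "oneshot";
        RemainAfterExit = true;
      };
      script = ''
        cat /run/kube-apitoken | nixos-kubernetes-node-join
      '';
    };
  };
}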

I would really appreciate it if someone could take a look at this :blush:

While trying to run actual pods on the nodes, I noticed that this sadly doesn't work as well as I had hoped ^^

My current config looks like this:

{ pkgs, lib, config, ... }:
let
  kubeMasterIP = "172.17.179.190";
  kubeMasterHostname = "api.kube";
  kubeMasterAPIServerPort = 6443;

  mkNode = { ip, port ? 6443 }: {
    autoStart = true;                
    privateNetwork = true;           
    hostAddress = kubeMasterIP;
    localAddress = ip;
    config = { config, pkgs, ... }: {

      # resolve master hostname
      networking.extraHosts = "${kubeMasterIP} ${kubeMasterHostname}";

      services.kubernetes = {
        roles = [ "node" ];
        masterAddress = kubeMasterHostname;

        # point kubelet and other services to kube-apiserver
        kubelet.kubeconfig.server = "https://${kubeMasterHostname}:${toString kubeMasterAPIServerPort}";
        apiserverAddress = "https://${kubeMasterHostname}:${toString kubeMasterAPIServerPort}";
      
        kubelet.extraOpts = "--fail-swap-on=false";
      };

      system.stateVersion = "22.05";

      networking.firewall = {
        enable = true;
        allowedTCPPorts = [
          6443   # kube-apiserver
          2379   # etcd client
          2380   # etcd peer
          10250  # kubelet API
          10259  # kube-scheduler
          10257  # kube-controller-manager
        ];
      };

      # Manually configure nameserver. Using resolved inside the container seems to fail
      # currently
      environment.etc."resolv.conf".text = "nameserver 1.1.1.1";
    };
  };
in
{
  networking = {
    extraHosts = ''
      ${kubeMasterIP} ${kubeMasterHostname}
    '';
  };

  # packages for administration tasks
  environment.systemPackages = with pkgs; [
    kompose
    kubectl
    kubernetes
  ];

  services.kubernetes = {
    roles = ["master" "node"];
    masterAddress = kubeMasterHostname;
    apiserverAddress = "https://${kubeMasterHostname}:${toString kubeMasterAPIServerPort}";
    apiserver = {
      securePort = kubeMasterAPIServerPort;
      advertiseAddress = kubeMasterIP;
    };

    kubelet.extraOpts = "--fail-swap-on=false";
  };

  containers.kubenode1 = mkNode { ip = "172.17.176.2"; };
  containers.kubenode2 = mkNode { ip = "172.17.176.3"; };
  containers.kubenode3 = mkNode { ip = "172.17.176.4"; };	
}

but that just results in:

Mar 23 13:47:14 kubenode3 kubelet[282]: E0323 13:47:14.635243     282 remote_runtime.go:222] "RunPodSandbox from runtime service failed" err="rpc error: code = Unknown desc = failed to create containerd task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: unable to join session keyring: unable to create session key: operation not permitted: unknown"
Mar 23 13:47:14 kubenode3 kubelet[282]: E0323 13:47:14.635330     282 kuberuntime_sandbox.go:71] "Failed to create sandbox for pod" err="rpc error: code = Unknown desc = failed to create containerd task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: unable to join session keyring: unable to create session key: operation not permitted: unknown" pod="default/my-hello-65985ddd4d-4m6xm"
Mar 23 13:47:14 kubenode3 kubelet[282]: E0323 13:47:14.635359     282 kuberuntime_manager.go:772] "CreatePodSandbox for pod failed" err="rpc error: code = Unknown desc = failed to create containerd task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: unable to join session keyring: unable to create session key: operation not permitted: unknown" pod="default/my-hello-65985ddd4d-4m6xm"
Mar 23 13:47:14 kubenode3 kubelet[282]: E0323 13:47:14.635458     282 pod_workers.go:965] "Error syncing pod, skipping" err="failed to \"CreatePodSandbox\" for \"my-hello-65985ddd4d-4m6xm_default(4ca96dc1-858c-496c-be2e-993353e909c8)\" with CreatePodSandboxError: \"Failed to create sandbox for pod \\\"my-hello-65985ddd4d-4m6xm_default(4ca96dc1-858c-496c-be2e-993353e909c8)\\\": rpc error: code = Unknown desc = failed to create containerd task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: unable to join session keyring: unable to create session key: operation not permitted: unknown\"" pod="default/my-hello-65985ddd4d-4m6xm" podUID=4ca96dc1-858c-496c-be2e-993353e909c8
Mar 23 13:47:17 kubenode3 kubelet[282]: W0323 13:47:17.243071     282 manager.go:1174] Failed to process watch event {EventType:0 Name:/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod4ca96dc1_858c_496c_be2e_993353e909c8.slice/cri-containerd-06300a7e340c43c70d7292eb46de969a3de45dc464db4b05f3b3ca8c29792e99.scope WatchSource:0}: container "06300a7e340c43c70d7292eb46de969a3de45dc464db4b05f3b3ca8c29792e99" in namespace "k8s.io": not found
Mar 23 13:47:27 kubenode3 kubelet[282]: E0323 13:47:27.279659     282 remote_runtime.go:222] "RunPodSandbox from runtime service failed" err="rpc error: code = Unknown desc = failed to create containerd task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: unable to join session keyring: unable to create session key: operation not permitted: unknown"

The only related issue I could find is [Podman] cluster cannot be created due to `create session key: operation not permitted` · Issue #1929 · kubernetes-sigs/kind · GitHub,
but that seems to have been fixed long ago?
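
Guessing from the error, the nested runc is simply not allowed to create session keys, and as far as I understand systemd-nspawn filters the add_key and keyctl syscalls by default. So maybe something along these lines is needed per container (untested at this point, the file name just matches my container above):

# sketch: allow the key-management syscalls inside the nspawn container
environment.etc."systemd/nspawn/kubenode1.nspawn".text = ''
  [Exec]
  SystemCallFilter=add_key keyctl
'';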

I tried to put the cluster on a different machine and just access it via a macvlan, but that results in the same errors :confused:

{ pkgs, lib, config, ... }:
let
  kubeMasterIP = "192.168.188.89";
  kubeMasterGateway = "192.168.188.1";
  kubeMasterHostname = "api.kube";
  kubeMasterAPIServerPort = 6443;

  mkNode = { name, ip, port ? 6443 }: {
    autoStart = true;
    macvlans = [ "eno1" ];
    timeoutStartSec = "10min";
    config = { config, pkgs, ... }: {

      # resolve host
      networking = {
        extraHosts = ''
        ${kubeMasterIP} ${kubeMasterHostname}
        '';
        hostName = name;
        defaultGateway = kubeMasterGateway;
        interfaces = {
          mv-eno1.ipv4.addresses = [ { address = ip; prefixLength = 24;}];
        };
      };

      services.kubernetes = {
        roles = [ "node" ];
        masterAddress = kubeMasterHostname;

        # point kubelet and other services to kube-apiserver
        kubelet.kubeconfig.server = "https://${kubeMasterHostname}:${toString kubeMasterAPIServerPort}";
        apiserverAddress = "https://${kubeMasterHostname}:${toString kubeMasterAPIServerPort}";
      
        kubelet = {
          extraOpts = "--fail-swap-on=false";
        };
      };

      networking.firewall = {
        enable = true;
        allowedTCPPorts = [ 
          config.services.kubernetes.kubelet.port
          config.services.kubernetes.kubelet.healthz.port
        ];
        allowedTCPPortRanges = [
          {from = 30000; to = 32767;}
        ];
      };

      services.avahi = {
        enable = true;
        publish = {
          enable = true;
          addresses = true;
          workstation = true;
        };
      };

      system.stateVersion = "22.05";

      # Manually configure nameserver. Using resolved inside the container seems to fail
      # currently
      environment.etc."resolv.conf".text = "nameserver 1.1.1.1";
    };
  };
in
{
  networking = {
    defaultGateway = kubeMasterGateway;
    # create macvlan for containers
    macvlans.mv-eno1-host = {
      interface = "eno1";
      mode = "bridge";
    };
    interfaces = {
      eno1.ipv4.addresses = lib.mkForce [];
      mv-eno1-host.ipv4.addresses = [{ address = kubeMasterIP; prefixLength = 24;}];
    };
      
    extraHosts = ''
      ${kubeMasterIP} ${kubeMasterHostname}
    '';
    firewall = {
      enable = true;
      allowedTCPPorts = [ 
        config.services.kubernetes.apiserver.securePort
        config.services.kubernetes.controllerManager.securePort
        config.services.kubernetes.scheduler.port
        config.services.cfssl.port
      ];
      allowedTCPPortRanges = [
        {from = 2379; to = 2380;}
      ];
    };
  };

  services.avahi = {
    enable = true;
    publish = {
      enable = true;
      addresses = true;
      workstation = true;
    };
  };

  # packages for administration tasks
  environment.systemPackages = with pkgs; [
    kompose
    kubectl
    kubernetes
  ];

  services.kubernetes = {
    roles = ["master"];
    masterAddress = kubeMasterHostname;
    apiserverAddress = "https://${kubeMasterHostname}:${toString kubeMasterAPIServerPort}";
    apiserver = {
      securePort = kubeMasterAPIServerPort;
      advertiseAddress = kubeMasterIP;
    };
    kubelet.extraOpts = "--fail-swap-on=false";
  };

  containers.kubenode1 = mkNode { name = "node1"; ip = "192.168.188.101"; };
  containers.kubenode2 = mkNode { name = "node2"; ip = "192.168.188.102"; };
  containers.kubenode3 = mkNode { name = "node3"; ip = "192.168.188.103"; };	
}

This entire time I had missed that flannel, at least on the host, has a lot of issues:

Mär 23 22:41:45 gestalt systemd[1]: Started Flannel Service.
Mär 23 22:41:45 gestalt flannel[8135]: I0323 22:41:45.780007    8135 main.go:204] CLI flags config: {etcdEndpoints:http://127.0.0.1:4001,http://127.0.0.1:2379 etcdPrefix:/coreos.com/network etcdKeyfile: etcdCertfile: etcdCAFile: etcdUsername: etcdPassword: version:false kubeSubnetMgr:true kubeApiUrl: kubeAnnotationPrefix:flannel.alpha.coreos.com kubeConfigFile:/nix/store/z38zfzsr42a7pbgyg490nmm85gp42v7l-flannel-kubeconfig iface:[] ifaceRegex:[] ipMasq:false ifaceCanReach: subnetFile:/run/flannel/subnet.env publicIP: publicIPv6: subnetLeaseRenewMargin:60 healthzIP:0.0.0.0 healthzPort:0 iptablesResyncSeconds:5 iptablesForwardRules:true netConfPath:/etc/kube-flannel/net-conf.json setNodeNetworkUnavailable:true}
Mär 23 22:41:45 gestalt flannel[8135]: I0323 22:41:45.780771    8135 kube.go:126] Waiting 10m0s for node controller to sync
Mär 23 22:41:45 gestalt flannel[8135]: I0323 22:41:45.780788    8135 kube.go:420] Starting kube subnet manager
Mär 23 22:41:46 gestalt flannel[8135]: I0323 22:41:46.781139    8135 kube.go:133] Node controller sync successful
Mär 23 22:41:46 gestalt flannel[8135]: I0323 22:41:46.781196    8135 main.go:224] Created subnet manager: Kubernetes Subnet Manager - gestalt
Mär 23 22:41:46 gestalt flannel[8135]: I0323 22:41:46.781214    8135 main.go:227] Installing signal handlers
Mär 23 22:41:46 gestalt flannel[8135]: I0323 22:41:46.781461    8135 main.go:467] Found network config - Backend type: vxlan
Mär 23 22:41:46 gestalt flannel[8135]: I0323 22:41:46.781508    8135 match.go:206] Determining IP address of default interface
Mär 23 22:41:46 gestalt flannel[8135]: I0323 22:41:46.782460    8135 match.go:259] Using interface with name mv-eno1-host and address 192.168.188.89
Mär 23 22:41:46 gestalt flannel[8135]: I0323 22:41:46.782523    8135 match.go:281] Defaulting external address to interface address (192.168.188.89)
Mär 23 22:41:46 gestalt flannel[8135]: I0323 22:41:46.782623    8135 vxlan.go:138] VXLAN config: VNI=1 Port=0 GBP=false Learning=false DirectRouting=false
Mär 23 22:41:46 gestalt flannel[8135]: E0323 22:41:46.783233    8135 main.go:327] Error registering network: failed to acquire lease: node "gestalt" not found
Mär 23 22:41:46 gestalt flannel[8135]: I0323 22:41:46.783372    8135 main.go:447] Stopping shutdownHandler...
Mär 23 22:41:46 gestalt flannel[8135]: W0323 22:41:46.783480    8135 reflector.go:347] k8s.io/client-go@v0.25.2/tools/cache/reflector.go:169: watch of *v1.Node ended with: an error on the server ("unable to decode an event from the watch stream: context canceled") has prevented the request from succeeding
Mär 23 22:41:46 gestalt systemd[1]: flannel.service: Main process exited, code=exited, status=1/FAILURE
Mär 23 22:41:46 gestalt systemd[1]: flannel.service: Failed with result 'exit-code'.

Might this be related to CIDR assignment? See k8s 1.19.0 with kube-flannel 0.12 Error registering network: failed to acquire lease: node "nodeName" pod cidr not assigned · Issue #1344 · flannel-io/flannel · GitHub

It seems like you have a naming issue… flannel tries to find the node named “gestalt”, but I don't see it among the nodes. Also, if the nodes share the same layer 2 network (Ethernet frames), as in your simulation, flannel can be configured with the host-gw backend instead, which is a bit simpler to set up… I would give it a try… (rough sketch below)
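
Something like this might do it (just a sketch, assuming the NixOS flannel module, which the kubernetes module builds on, writes its backend option into net-conf.json):

# sketch only: switch flannel from the default vxlan backend to host-gw
services.flannel.backend = {
  Type = "host-gw";
};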

For two years now I've been using k3s instead of the full-blown distro. It's a single executable and, in my opinion, a bit simpler to manage; it also comes with a local-path storage provider out of the box.


gestalt is the master node that’s running on the host. I might come back to this eventually.

But for now I did switch to k3s, and had the exact same issue. The problem here is that Docker won't work in nixos-containers. But I did manage to get it working, thanks to this thread: Podman/docker in nixos container (ideally in unprivileged one)? - #6 by ndreas

My whole solution looks like this now:

{ pkgs, lib, config, ... }:
let
  kubeMasterIP = "192.168.188.89";
  kubeMasterGateway = "192.168.188.1";
  kubeMasterHostname = "gestalt.local";
  kubeMasterAPIServerPort = 6443;

  nspawn-config-text = ''
    [Exec]
    SystemCallFilter=add_key keyctl bpf
  '';

  mkNode = { ip, port ? 6443 }: {
    # use macvlan
    autoStart = true;
    macvlans = [ "eno1" ];
    timeoutStartSec = "10min";

    # enable nested containers https://wiki.archlinux.org/title/systemd-nspawn#Run_docker_in_systemd-nspawn
    enableTun = true;
    additionalCapabilities = ["all"];

    allowedDevices = [
      { node = "/dev/fuse"; modifier = "rwm"; }
      { node = "/dev/mapper/control"; modifier = "rwm"; }
    ];

    bindMounts = {
      "${config.sops.secrets.k3s-server-token.path}" = {
        hostPath = config.sops.secrets.k3s-server-token.path;
        isReadOnly = true;
      };
      dev-fuse = { hostPath = "/dev/fuse"; mountPoint = "/dev/fuse"; };
      dev-mount = { hostPath = "/dev/mapper"; mountPoint = "/dev/mapper"; };
    };

    config = { config, pkgs, ... }: {
      # resolve host
      networking = {
        extraHosts = ''
          ${kubeMasterIP} ${kubeMasterHostname}
        '';
        defaultGateway = kubeMasterGateway;
        interfaces = {
          mv-eno1.ipv4.addresses = [ { address = ip; prefixLength = 24;}];
        };
      };  

      virtualisation.containerd.enable = true;
      virtualisation.containerd.settings = {
        version = 2;
        plugins."io.containerd.grpc.v1.cri" = {
          cni.conf_dir = "/var/lib/rancher/k3s/agent/etc/cni/net.d/";
          # FIXME: upstream
          cni.bin_dir = "${pkgs.runCommand "cni-bin-dir" {} ''
            mkdir -p $out
            ln -sf ${pkgs.cni-plugins}/bin/* ${pkgs.cni-plugin-flannel}/bin/* $out
          ''}";
        };
      };

      systemd.services.k3s = {
        wants = [ "containerd.service" ];
        after = [ "containerd.service" ];
      };

      services.k3s = {
        enable = true;
        role = "agent";
        tokenFile = /run/secrets/k3s-server-token; # host.config.sops.secrets.k3s-server-token.path; ?
        serverAddr = "https://${kubeMasterHostname}:${toString port}";
        extraFlags = "--node-ip ${toString ip} --container-runtime-endpoint unix:///run/containerd/containerd.sock";
      };

      # packages for administration tasks
      environment.systemPackages = with pkgs; [
        postgresql_15
      ];

      services.avahi = {
        enable = true;
        publish = {
          enable = true;
          addresses = true;
          workstation = true;
        };
      };

      system.stateVersion = "22.05";

      # Manually configure nameserver. Using resolved inside the container seems to fail
      # currently
      environment.etc."resolv.conf".text = "nameserver 1.1.1.1";
    };
  };
in
  {
    imports = [ <sops-nix/modules/sops> ];

    networking = {
      defaultGateway = kubeMasterGateway;
    # create macvlan for containers
    macvlans.mv-eno1-host = {
      interface = "eno1";
      mode = "bridge";
    };
    interfaces = {
      eno1.ipv4.addresses = lib.mkForce [];
      mv-eno1-host.ipv4.addresses = [{ address = kubeMasterIP; prefixLength = 24;}];
    };

    extraHosts = ''
      ${kubeMasterIP} ${kubeMasterHostname}
    '';
    firewall = {
      enable = true;
      allowedTCPPorts = [ 
        kubeMasterAPIServerPort
        6444 # cacerts
      ];
    };
  };

  services.avahi = {
    enable = true;
    publish = {
      enable = true;
      addresses = true;
      workstation = true;
    };
  };

  sops.secrets.k3s-server-token.sopsFile = ./secrets.yaml;
  sops.age.keyFile = /home/jonaa/.config/sops/age/keys.txt;

  virtualisation.containerd.enable = true;
  virtualisation.containerd.settings = {
    version = 2;
    plugins."io.containerd.grpc.v1.cri" = {
      cni.conf_dir = "/var/lib/rancher/k3s/agent/etc/cni/net.d/";
      # FIXME: upstream
      cni.bin_dir = "${pkgs.runCommand "cni-bin-dir" {} ''
        mkdir -p $out
        ln -sf ${pkgs.cni-plugins}/bin/* ${pkgs.cni-plugin-flannel}/bin/* $out
      ''}";
    };
  };

  systemd.services.k3s = {
    wants = [ "containerd.service" ];
    after = [ "containerd.service" ];
  };

  services.k3s = {
    enable = true;
    role = "server";
    tokenFile = config.sops.secrets.k3s-server-token.path;
    extraFlags = "--disable traefik --flannel-backend=host-gw --container-runtime-endpoint unix:///run/containerd/containerd.sock";
  };

  containers.kube1 = mkNode { ip = "192.168.188.101"; };
  containers.kube2 = mkNode { ip = "192.168.188.102"; };
  containers.kube3 = mkNode { ip = "192.168.188.103"; };	

  # enable cgroups v2 in the container
  systemd.services."container@kube1".environment.SYSTEMD_NSPAWN_UNIFIED_HIERARCHY = "1";
  systemd.services."container@kube2".environment.SYSTEMD_NSPAWN_UNIFIED_HIERARCHY = "1";
  systemd.services."container@kube3".environment.SYSTEMD_NSPAWN_UNIFIED_HIERARCHY = "1";

  # allow syscalls via an nspawn config file, because arguments with spaces work badly with containers.example.extraArgs
  environment.etc."systemd/nspawn/kube1.nspawn".text = nspawn-config-text;
  environment.etc."systemd/nspawn/kube2.nspawn".text = nspawn-config-text;
  environment.etc."systemd/nspawn/kube3.nspawn".text = nspawn-config-text;
}

I am not sure if the explicit virtualisation.containerd stuff is actually needed; I will test that later.
Sometimes the pods are not able to resolve gestalt.local for some reason, but otherwise this works flawlessly and would probably work with full k8s as well.
