Infinite recursion and systemd credentials in custom module

TNT2k · April 17, 2025, 12:36pm

Hey there,
I am trying to create a module, which allows me to supervise systemd services and connect them to https://healthchecks.io/.

To do this, I want to achieve the following:

When a supervised unit is started, I want to trigger the PATH/start endpoint.
When a supervised unit exits (with or without error), I want to trigger the PATH/fail endpoint.
During the runtime of a supervised unit, I want to check its status periodically. If everything is fine, PATH/success (or PATH with no suffix) should be called, else PATH/fail.

The PATH should be confidential, because I might want to make my configuration public, and trolls could use the path to trigger notifications.

The module should be “reusable” like modules normally are, so that I can just add a config to my service, and my “watchover” service and timer are created automatically.
However, they should only exist and run, if my service is actually defined and active.
If I deactivate a service, I do not want to get notifications, because I know, that it is not running.

This is my current module:

{ config, lib, pkgs, ... }: let
  inherit (lib) mkOption types;

  # Predicate for lib.filterAttrs: true when the watchover entry `name` refers
  # to a systemd service that is defined and not disabled.
  # NOTE: this reads config.systemd.services directly, so using it to decide
  # which attributes of systemd.services get defined is circular.
  isValidWatchover = name: cfgEntry:
    let
      svc = config.systemd.services.${name} or null;
    in
      if svc == null
      then false
      else svc.enable or true;

  # Script that pings the "/start" endpoint of the base URL stored in the
  # systemd credential named "url". A failing curl is ignored (best effort).
  # The credential is read once and every expansion is quoted, so neither the
  # credentials path nor the URL is word-split or glob-expanded by the shell.
  pingHealthchecksStart = pkgs.writeScript "ping-healthchecks-start.sh" ''
    #!${pkgs.runtimeShell}
    url="$(cat "$CREDENTIALS_DIRECTORY/url")"
    echo "STARTING $url/start"; ${pkgs.curl}/bin/curl "$url/start" || true
  '';

  # Script that pings the "/fail" endpoint of the base URL stored in the
  # systemd credential named "url". A failing curl is ignored (best effort).
  # The credential is read once and every expansion is quoted, so neither the
  # credentials path nor the URL is word-split or glob-expanded by the shell.
  pingHealthchecksFail = pkgs.writeScript "ping-healthchecks-fail.sh" ''
    #!${pkgs.runtimeShell}
    url="$(cat "$CREDENTIALS_DIRECTORY/url")"
    echo "FAILING $url/fail"; ${pkgs.curl}/bin/curl "$url/fail" || true
  '';

  # Script that pings the base URL (no suffix = success) stored in the systemd
  # credential named "url". Unlike the start/fail scripts, a failing curl is
  # NOT swallowed here: the script exits with curl's status.
  # The credential is read once and every expansion is quoted, so neither the
  # credentials path nor the URL is word-split or glob-expanded by the shell.
  pingHealthchecksOk = pkgs.writeScript "ping-healthchecks-ok.sh" ''
    #!${pkgs.runtimeShell}
    url="$(cat "$CREDENTIALS_DIRECTORY/url")"
    echo "OK $url"; ${pkgs.curl}/bin/curl "$url"
  '';
  
  # Build the `watchover-<name>` oneshot service that health-checks the systemd
  # unit `name` and reports the result through the ping scripts.
  #
  # Arguments:
  #   name     - attribute name of the supervised unit in systemd.services
  #   cfgEntry - the matching entry of config.systemd-watchover
  # Returns a { name, value } pair, as expected by lib.mapAttrs'.
  makeWatchoverService = name: cfgEntry:
    let
      # Definition of the supervised unit, or null when no such service exists.
      serviceCfg = config.systemd.services.${name} or null;

      # An explicit user wins; container checks run as root; otherwise reuse the
      # supervised unit's User, falling back to root.
      effectiveUser = if cfgEntry.user != null then cfgEntry.user
                      else if cfgEntry.runInContainer then "root"
                      else if serviceCfg != null && serviceCfg.serviceConfig ? User then serviceCfg.serviceConfig.User
                      else "root";

      # Same precedence as effectiveUser, applied to the group.
      effectiveGroup = if cfgEntry.group != null then cfgEntry.group
                        else if cfgEntry.runInContainer then "root"
                        else if serviceCfg != null && serviceCfg.serviceConfig ? Group then serviceCfg.serviceConfig.Group
                        else "root";

      # Container name: the unit name with "docker-" / "podman-" removed.
      rawName = lib.replaceStrings ["docker-" "podman-"] ["" ""] name;

      # CLI of the configured OCI backend ("docker" when the option is absent).
      # Only evaluated for container checks, so neither package is pulled in
      # otherwise.
      containerCli =
        if (config.virtualisation.oci-containers.backend or "docker") == "docker"
        then "${pkgs.docker}/bin/docker"
        else "${pkgs.podman}/bin/podman";

      # The actual check: the user script, run inside the container if requested.
      checkCmd =
        if cfgEntry.runInContainer
        then "${containerCli} exec ${rawName} ${cfgEntry.script}"
        else cfgEntry.script;

      healthcheckScript = pkgs.writeScript "check-${name}.sh" ''
        #!${pkgs.runtimeShell}
        set -e

        # An inactive unit is reported as failed right away.
        if ! systemctl is-active --quiet ${name}; then
          ${pingHealthchecksFail}
          exit 0
        fi

        # Seconds since the unit became active (the timestamp is in microseconds).
        uptime=$(systemctl show -p ActiveEnterTimestampMonotonic --value ${name})
        now=$(cat /proc/uptime | cut -d "." -f1)
        delta=$((now - uptime / 1000000))

        if [ $delta -gt $(( ${toString cfgEntry.start_period} )) ]; then
          ${checkCmd} >/dev/null 2>&1 && ${pingHealthchecksOk} || ${pingHealthchecksFail}
        else
          # Still inside the start period: report success only. The trailing
          # "|| true" keeps a failing check from failing this oneshot unit.
          ${checkCmd} >/dev/null 2>&1 && ${pingHealthchecksOk} || true
        fi
      '';
    in {
      name = "watchover-${name}";
      value = {
        description = "Watchover for ${name}";
        after = [ "${name}.service" ];
        partOf = [ "${name}.service" ];
        serviceConfig = {
          Type = "oneshot";
          # Exposes the URL file to the ping scripts as $CREDENTIALS_DIRECTORY/url.
          LoadCredential = [
            "url:${cfgEntry.url}"
          ];
          ExecStart = "${healthcheckScript}";
          User = effectiveUser;
          Group = effectiveGroup;
          TimeoutSec = cfgEntry.timeout;
        };
      };
    };
  
  # Settings merged into the supervised service itself: load the URL credential
  # and ping healthchecks after start and after stop. `name` is unused here.
  # NOTE(review): the thread reports that the credential is no longer available
  # when ExecStopPost runs - confirm before relying on the stop ping.
  makeServiceModification = name: cfgEntry: {
    serviceConfig.LoadCredential = [ "url:${cfgEntry.url}" ];
    serviceConfig.ExecStartPost = [ "${pingHealthchecksStart}" ];
    serviceConfig.ExecStopPost = [ "${pingHealthchecksFail}" ];
  };

  # Build the `watchover-<name>` timer that fires the matching watchover
  # service every `cfgEntry.interval` seconds, counted from boot and from the
  # last activation. Returns a { name, value } pair for lib.mapAttrs'.
  makeWatchoverTimer = name: cfgEntry:
    let
      unitName = "watchover-${name}";
      period = "${toString cfgEntry.interval}s";
    in {
      name = unitName;
      value = {
        wantedBy = [ "timers.target" ];
        timerConfig = {
          OnBootSec = period;
          OnUnitActiveSec = period;
          Unit = "${unitName}.service";
        };
      };
    };

in {
  # One entry per supervised systemd service; the attribute name is the name of
  # the service in systemd.services.
  options.systemd-watchover = mkOption {
    description = "Health-check supervision for systemd services, keyed by service name.";
    type = types.attrsOf (types.submodule ({ name, ... }: {
      options = {
        # Handed to systemd as `LoadCredential=url:<path>`; the ping scripts read
        # it from $CREDENTIALS_DIRECTORY/url. Prefer an absolute path given as a
        # string: a Nix path literal is copied into the Nix store when it is
        # interpolated, which would expose the secret.
        url = mkOption {
          type = types.path;
          description = "Path to a file containing the base URL to send health results to (loaded as a systemd credential).";
        };

        script = mkOption {
          type = types.str;
          description = "Inline script or command string to run.";
        };

        runInContainer = mkOption {
          type = types.bool;
          default = false;
          description = "Run the check with docker/podman exec inside the container named after the unit (without its docker-/podman- prefix).";
        };

        user = mkOption {
          type = types.nullOr types.str;
          default = null;
          description = "User the watchover service runs as. When null: root for container checks, otherwise the supervised unit's User, falling back to root.";
        };

        group = mkOption {
          type = types.nullOr types.str;
          default = null;
          description = "Group the watchover service runs as. When null: root for container checks, otherwise the supervised unit's Group, falling back to root.";
        };

        interval = mkOption {
          type = types.int;
          description = "Seconds between two runs of the check (timer OnBootSec / OnUnitActiveSec).";
        };

        timeout = mkOption {
          type = types.int;
          description = "TimeoutSec of the watchover service.";
        };

        # NOTE(review): not referenced anywhere in the visible module.
        retries = mkOption {
          type = types.int;
          description = "Number of retries (currently not used by the watchover service).";
        };

        start_period = mkOption {
          type = types.int;
          description = "Seconds after the unit became active during which a failing check is not reported as a failure.";
        };
      };
    }));
    default = {};
  };

  #config = mkIf (config.systemd-watchover != {}) {
  #  systemd.services = mkMerge [
  #    # Create watchover services
  #    (mapAttrs' makeWatchoverService 
  #      (filterAttrs isValidWatchover config.systemd-watchover))
  #      
  #    # Modify the original services
  #    (mapAttrs (name: value: value) 
  #      (mapAttrs makeServiceModification 
  #        (filterAttrs isValidWatchover config.systemd-watchover)))
  #  ];
  #};

  # Only contribute anything when at least one watchover entry is configured.
  config = lib.mkIf (config.systemd-watchover != {}) {
    systemd = {
      services = lib.mkMerge [
        # One `watchover-<name>` service per entry ...
        (lib.mapAttrs' makeWatchoverService config.systemd-watchover)

        # ... plus the additions to each supervised service itself.
        (lib.mapAttrs makeServiceModification config.systemd-watchover)
      ];

      # Timers are only created for entries whose service is defined and enabled.
      timers = lib.mapAttrs' makeWatchoverTimer
        (lib.filterAttrs isValidWatchover config.systemd-watchover);
    };
  };
}

However, this config currently has two issues:

The ExecStopPost command does not get the credential, because systemd does not intend for it to be available there.
I can not really use my isValidWatchover, because this creates an infinite recursion.

I do not have any idea, how I can solve either of these problems.
The second one might be straight up impossible, because I would need to “force” Nix to evaluate systemd.services to calculate the filtered services, while ignoring the added “watchover services”. In an imperative language this would not be a problem, but because of Nix’s lazy evaluation, this might not be possible.

If you have ideas on how to solve this, please let me know!
Help is greatly appreciated…

bme · April 17, 2025, 5:25pm

Why not use drop-ins with OnFailure / OnSuccess?

EDIT: + timers with PartOf?

TNT2k · April 17, 2025, 5:32pm

partOf is a relic I have to remove, that’s correct.

Regarding “OnFailure / OnSuccess”:
Nice idea, I missed that!

Do you also have a solution for the recursion problem?

bme · April 17, 2025, 5:35pm

My suggestion for fixing the recursion, which admittedly is kind of half baked, is just to use universal drop-ins, so you don’t have to calculate the set of active services; just let systemd do it for you. See the systemd.unit(5) manual page and search for “top-level drop-in”.

TNT2k · April 17, 2025, 5:46pm

I will have a look for the drop ins, but this does not work for “OnSuccess / OnFailure”:

  # Create service modifications
  # NOTE(review): OnSuccess/OnFailure are placed under serviceConfig here, i.e.
  # in the [Service] section; a later reply in this thread points out that they
  # belong in the [Unit] section, which is presumably why the watchover unit
  # was not triggered.
  makeServiceModification = name: cfgEntry: {
    serviceConfig = {
      OnSuccess = [ "watchover-${name}.service" ];
      OnFailure = [ "watchover-${name}.service" ];
      LoadCredential = [
        "url:${cfgEntry.url}"
      ];
      ExecStartPost = [ "${pingHealthchecksStart}" ];
      #ExecStopPost = [ "${pingHealthchecksFail}" ];
    };
  };

I stopped the unit manually, it stops, but my watchover service is not called.
I verified both with “systemctl status …”

bme · April 17, 2025, 5:50pm

Right, but what if you kill the process via signal or similar? OnSuccess / OnFailure aren’t triggered by systemctl actions, they are triggered by the job itself completing with success or failure.

waffle8946 · April 17, 2025, 5:54pm

That blogpost has an explanation of the dropin mechanism.

TNT2k · April 17, 2025, 5:54pm

Shouldn’t stopping a unit also count?
According to your link above for the drop ins:

 OnSuccess=
     A space-separated list of one or more units that are activated
     when this unit enters the "inactive" state.

     Added in version 249.

Which also happens, if I stop a unit…

Regarding the drop ins itself:
I don’t think, they will really help us for the recursion, because I don’t think, that I can start a timer using a drop in, can I?

bme · April 17, 2025, 5:58pm

Sure you can: you add a dependency on a templated timer for every service, using BindsTo or similar.

TNT2k · April 17, 2025, 5:59pm

I used ExecStopPost in my module, because this is always run, if a unit terminated, for whatever reason.

bme · April 17, 2025, 6:00pm

If ExecStop has the semantics you want, you can just add an extra ExecStop everywhere using a dropin

TNT2k · April 17, 2025, 6:00pm

How can I control the settings of the timer then?
The unit does not “know”, the interval or anything else.

TNT2k · April 17, 2025, 6:01pm

Sure, I could. But the problem with that idea is:
ExecStop / ExecStopPost does not grant access to my credentials…
I need a solution to run the curl command with my credentials, when a unit exits. The credentials are different for every unit.

waffle8946 · April 17, 2025, 6:02pm

OnSuccess/Failure didn’t work because it doesn’t go in serviceConfig ([Service] section), it goes in the [Unit] section.

https://search.nixos.org/options?channel=24.11&from=0&size=50&sort=relevance&type=packages&query=onfailure

TNT2k · April 17, 2025, 6:06pm

Yes, I am stupid, sorry.

Now we only have the recursion problem…
However, if there is no good solution, I might change the script to just exit, if the unit is not running.
Because I can rely on the unit to tell me, when it was started, and tell me, if it crashed.
This is not a “real solution”, how to fix the problem itself, but it works…

bme · April 17, 2025, 6:08pm

credentials are available as files under CREDENTIALS_DIRECTORY so I’d load them that way.

TNT2k · April 17, 2025, 6:09pm

No, during ExecStop / ExecStopPost, they are already gone. But the OnSuccess stuff works…
Do you know, if “OnFailure” also works, if the unit restarts? Like “something bad happens, I crash and systemd restarts me”?

bme · April 17, 2025, 6:11pm

depends on restart mode

   RestartMode=
       Takes a string value that specifies how a service should
       restart:

       •   If set to normal (the default), the service restarts by
           going through a failed/inactive state.

           Added in version 254.

       •   If set to direct, the service transitions to the
           activating state directly during auto-restart, skipping
           failed/inactive state.  ExecStopPost= is still invoked.
           OnSuccess= and OnFailure= are skipped.

           This option is useful in cases where a dependency can fail
           temporarily but we do not want these temporary failures to
           make the dependent units fail. Dependent units are not
           notified of these temporary failures.

           Added in version 254.

TNT2k · April 17, 2025, 6:12pm

Ah, thank you.
I searched for that and did not find that.

So I guess case closed?

Thank you both very much!

bme · April 17, 2025, 6:14pm

I’ll mention one other thing re timers: I’ve never had a use case for dynamically generated timers, but https://www.freedesktop.org/software/systemd/man/latest/systemd.generator.html might be interesting to you.