From 122e29198212ac3ddea30f914355355e02203321 Mon Sep 17 00:00:00 2001
From: Markus Kowalewski
Date: Mon, 11 Aug 2025 13:59:28 +0200
Subject: [PATCH] nixos/slurm: update test, make more reliable

* start services in the proper order (avoids random failures caused by
  randomly ordered startup)
* clean up the test script
* add an sbatch test
---
 nixos/tests/slurm.nix | 39 ++++++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix
index 1747e714117a..76041f87187a 100644
--- a/nixos/tests/slurm.nix
+++ b/nixos/tests/slurm.nix
@@ -49,6 +49,16 @@ let
     mkdir -p $out/bin
     ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
   '';
+
+  sbatchOutput = "/tmp/shared/sbatch.log";
+  sbatchScript = pkgs.writeText "sbatchScript" ''
+    #!${pkgs.runtimeShell}
+    #SBATCH --nodes 1
+    #SBATCH --ntasks 1
+    #SBATCH --output ${sbatchOutput}
+
+    echo "sbatch success"
+  '';
 in
 {
   name = "slurm";
@@ -127,43 +137,38 @@ in
   };

   testScript = ''
-    start_all()
-
-    # Make sure DBD is up after DB initialzation
     with subtest("can_start_slurmdbd"):
-        dbd.succeed("systemctl restart slurmdbd")
         dbd.wait_for_unit("slurmdbd.service")
         dbd.wait_for_open_port(6819)

-    # there needs to be an entry for the current
-    # cluster in the database before slurmctld is restarted
-    with subtest("add_account"):
-        control.succeed("sacctmgr -i add cluster default")
-        # check for cluster entry
-        control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")
-
-    with subtest("can_start_slurmctld"):
-        control.succeed("systemctl restart slurmctld")
+    with subtest("cluster_is_initialized"):
+        control.wait_for_unit("multi-user.target")
         control.wait_for_unit("slurmctld.service")
+        control.wait_until_succeeds("sacctmgr list cluster | awk '{ print $1 }' | grep default")
+
+    start_all()

     with subtest("can_start_slurmd"):
         for node in [node1, node2, node3]:
-            node.succeed("systemctl restart slurmd.service")
             node.wait_for_unit("slurmd")

     # Test that the cluster works and can distribute jobs;
+    submit.wait_for_unit("multi-user.target")
     with subtest("run_distributed_command"):
         # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
         # The output must contain the 3 different names
         submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")

-    with subtest("check_slurm_dbd"):
+    with subtest("check_slurm_dbd_job"):
         # find the srun job from above in the database
-        control.succeed("sleep 5")
-        control.succeed("sacct | grep hostname")
+        control.wait_until_succeeds("sacct | grep hostname")

     with subtest("run_PMIx_mpitest"):
         submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
+
+    with subtest("run_sbatch"):
+        submit.succeed("sbatch --wait ${sbatchScript}")
+        submit.succeed("grep 'sbatch success' ${sbatchOutput}")
   '';
 }
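
The reliability fix follows one pattern throughout: replace fixed delays
and manual service restarts with the test driver's polling helpers. A
minimal sketch of the pattern, in the same test-driver Python used by
testScript (machine names as in this test):

    # wait_for_unit blocks until systemd reports the unit active;
    # wait_until_succeeds retries the command until it exits 0 (or the
    # driver times out), so no fixed "sleep 5" is needed
    control.wait_for_unit("slurmctld.service")
    control.wait_until_succeeds("sacct | grep hostname")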
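
For reference, the new run_sbatch subtest mirrors what one would run by
hand on a submit host (a sketch, assuming a working cluster and the
shared /tmp/shared mount this test provides; job.sh is a hypothetical
stand-in for the generated sbatchScript):

    $ sbatch --wait job.sh                          # --wait blocks until the job terminates
    $ grep 'sbatch success' /tmp/shared/sbatch.log  # job stdout lands here via '#SBATCH --output'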
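
Testing: from a nixpkgs checkout, the updated test can be built and run
in one step via its test attribute (a sketch; the attribute path
nixosTests.slurm is assumed from current nixpkgs conventions and may
differ on other branches):

    # attribute path assumed; see nixos/tests/all-tests.nix for the mapping
    $ nix-build -A nixosTests.slurm

The build succeeds only if every subtest in testScript passes.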