nixos/slurm: update test, make more reliable

* start services in proper order
  (avoid random failures due to randomly ordered startup)
* clean up test script
* add sbatch test
This commit is contained in:
Markus Kowalewski
2025-08-11 13:59:28 +02:00
parent 511f6efea2
commit 122e291982

View File

@@ -49,6 +49,16 @@ let
mkdir -p $out/bin
${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
'';
sbatchOutput = "/tmp/shared/sbatch.log";
sbatchScript = pkgs.writeText "sbatchScript" ''
#!${pkgs.runtimeShell}
#SBATCH --nodes 1
#SBATCH --ntasks 1
#SBATCH --output ${sbatchOutput}
echo "sbatch success"
'';
in
{
name = "slurm";
@@ -127,43 +137,38 @@ in
};
testScript = ''
start_all()
# Make sure DBD is up after DB initialzation
with subtest("can_start_slurmdbd"):
dbd.succeed("systemctl restart slurmdbd")
dbd.wait_for_unit("slurmdbd.service")
dbd.wait_for_open_port(6819)
# there needs to be an entry for the current
# cluster in the database before slurmctld is restarted
with subtest("add_account"):
control.succeed("sacctmgr -i add cluster default")
# check for cluster entry
control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")
with subtest("can_start_slurmctld"):
control.succeed("systemctl restart slurmctld")
with subtest("cluster_is_initialized"):
control.wait_for_unit("multi-user.target")
control.wait_for_unit("slurmctld.service")
control.wait_until_succeeds("sacctmgr list cluster | awk '{ print $1 }' | grep default")
start_all()
with subtest("can_start_slurmd"):
for node in [node1, node2, node3]:
node.succeed("systemctl restart slurmd.service")
node.wait_for_unit("slurmd")
# Test that the cluster works and can distribute jobs;
submit.wait_for_unit("multi-user.target")
with subtest("run_distributed_command"):
# Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
# The output must contain the 3 different names
submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")
with subtest("check_slurm_dbd"):
with subtest("check_slurm_dbd_job"):
# find the srun job from above in the database
control.succeed("sleep 5")
control.succeed("sacct | grep hostname")
control.wait_until_succeeds("sacct | grep hostname")
with subtest("run_PMIx_mpitest"):
submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
with subtest("run_sbatch"):
submit.succeed("sbatch --wait ${sbatchScript}")
submit.succeed("grep 'sbatch success' ${sbatchOutput}")
'';
}