
#!/bin/bash
# How to set up SLURM [and test OpenMPI and OpenMP against it] with some extra tweaks on Rocky 9.5 v 18-03-2025.
#
# Generally a cluster is set up in three stages:
#
# 1. make sure hardware is set up and nodes can be provisioned with an OS [from a master/head node] using tools like: Warewulf, xCAT, Bright, Trinity-X, Qlustar or DIY with PXE booting anaconda kickstart. In this stage /home is usually mounted on shared storage [master/head node often] on all nodes and UIDs and GIDs of users are kept in sync on all nodes via YP/NIS or LDAP [set up on master/head node].
# 2. set up SLURM and an [integrated] MPI stack on all nodes of the cluster. Sometimes a basic setup is provided by the provisioning tool used.
# 3. set up a build environment on shared storage [/home often] so tools like EasyBuild and Spack can be used to build extra software. For this all nodes should be equipped with Lmod.
#
# This script is about step 2 and it will:
#
# - set up a proper accounting DB with slurmdbd so QOS and resource limits can be set next to having access to detailed accounting
# - configure SLURM to use multi factor scheduling for fairshare and backfill using cores and memory
# - set up a decent prolog and epilog suitable for OpenMP and dealing with local scratch for jobs
# - use a topology setup to make sure MPI jobs do not span racks with perhaps only limited bandwidth between them
# - set up SLURM to generate a job accounting file and include a simple python script to parse it
# - demonstrate how to make OpenMPI integrate with SLURM
# - give general advice in comments based on many years of experience
#
# Prerequisites for this script:
#
# - should be run as root
# - all nodes share the same [private] network, can resolve each other's names and have outbound access to the internet [NATed or directly]
# - all nodes have a Rocky 9 minimal install [at least]
#
# Specify the [private] network range for all nodes here. This script will open firewalld to the nodes only in this network.
export NETWORK="192.168.122.0/24"                    
# Set name of node to 'master' for a head/master node or set to 'computeXYZ' for a compute node SLURM setup. Always set up master first before setting up any compute nodes.
hostnamectl hostname master
# Some handy aliases only used in this script.
shopt -s expand_aliases                                    # make sure these aliases work below in script
alias master='[[ "$(hostname)" == "master" ]] &&'          # for running only on master
alias compute='[[ "$(hostname)" != "master" ]] &&'         # for running only on compute node
# Start off from a minimal Rocky 9 install and update all nodes.
dnf -y update
# Install needed packages. Needs to be done on all nodes.
dnf -y --enablerepo=crb install epel-release rocky-release-hpc
dnf -y --enablerepo=crb install munge munge-devel slurm23-*
# Create user and group slurm on node and set up right permissions for some dirs. This needs to be done on all nodes.
groupadd -r -g 500 slurm; useradd -r -u 500 -g 500 -s /sbin/nologin -d / slurm
mkdir -m 755 -p /var/log/slurm /var/run/slurm /var/spool/slurm/{accounting,slurmd,state}
chown -R slurm:slurm /var/run/slurm /var/log/slurm /etc/slurm /var/spool/slurm/{accounting,slurmd,state}
# Set up munge key and munge service. Needs to be done on all nodes.
echo "SuperSecretMungeKeyForAllNodesOfThisCluster" > /etc/munge/munge.key
chmod 400 /etc/munge/munge.key; chown munge:munge /etc/munge/munge.key       
systemctl enable munge
systemctl start munge
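# Optional sanity check: munge should now be able to create and decode a credential locally;
# from a compute node one could also pipe it to the master [e.g. 'munge -n | ssh master unmunge',
# assuming root ssh access] to prove the shared key really works across nodes.
munge -n | unmunge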
# Set up mariadb service. Only needs to be done on master.
master dnf -y --enablerepo=crb install mariadb-server
master systemctl enable mariadb
master systemctl start mariadb
master mysql_secure_installation << EOD
y
n
y
y
y
y
EOD
# Set up a database for SLURM. Only needs to be done on master.
master mysql << EOD
create user 'slurm' identified by 'slurmpassword';
grant all on slurm_acct_db.* TO 'slurm';
create database slurm_acct_db;
EOD
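# Optional read-only check that the slurm DB user and its grants look as expected.
master mysql -e "show grants for 'slurm';"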
# Set up the slurmdbd.conf file. For more info read https://slurm.schedmd.com/slurmdbd.conf.html. Only needs to be done on master.
master cat << EOD > /etc/slurm/slurmdbd.conf
PurgeEventAfter=24              # these values can be tweaked but as they are they keep accounting data in the DB for 2y
PurgeJobAfter=24
PurgeResvAfter=24
PurgeStepAfter=24
PurgeSuspendAfter=24
AuthType=auth/munge
DbdHost=master
SlurmUser=slurm
DebugLevel=4
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurm/slurmdbd.pid
PrivateData=usage,users,jobs               # here users can only retrieve their own accounting data
StorageType=accounting_storage/mysql
StorageHost=master
StoragePass=slurmpassword
StorageUser=slurm
StorageLoc=slurm_acct_db
EOD
master chmod 600 /etc/slurm/slurmdbd.conf        # make sure no user can read your secret DB password
master chown slurm:slurm /etc/slurm/slurmdbd.conf
# Now enable and start slurmdbd service. On master only.
master systemctl enable slurmdbd
master systemctl start slurmdbd
master sleep 5                          # wait a bit for it to settle
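# Optional check that slurmdbd is up and answering [the list is still empty at this point,
# the cluster only gets registered once slurmctld starts later on].
master sacctmgr -n list cluster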
# Create SLURM config. Read https://slurm.schedmd.com/slurm.conf.html for more info or use https://slurm.schedmd.com/configurator.html. Only on master.
master cat << EOD > /etc/slurm/slurm.conf
AuthType=auth/munge
CryptoType=crypto/munge
EnforcePartLimits=yes
Epilog=/etc/slurm/epilog.sh          # set up epilog script
MaxTasksPerNode=2                    # set this to the nr of physical cores your compute nodes have
MpiDefault=pmix_v3                   # we use pmix_v3 by default so openmpi hooks into SLURMs pmix_v3 plugin so srun works nicely
CpuFreqDef=Performance
PrivateData=usage,users              
ProctrackType=proctrack/cgroup       # we use cgroups to track and isolate jobs
Prolog=/etc/slurm/prolog.sh          # set up prolog script
RebootProgram=/usr/sbin/reboot       # make sure we can reboot nodes via scontrol
ReturnToService=1
SlurmctldHost=master                  
SlurmctldParameters=enable_configless    # we will use config-less method to get config to compute nodes from master
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurm/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm/slurmd
SlurmUser=slurm
StateSaveLocation=/var/spool/slurm/state
SwitchType=switch/none
TaskPlugin=task/affinity,task/cgroup
TaskPluginParam=Cores                
TaskProlog=/etc/slurm/prolog.sh
TopologyPlugin=topology/tree       
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0
DefMemPerCPU=1024               # set this to the RAM of the node [minus a few gig] divided by the number of physical cores on nodes
MaxMemPerNode=2048              # set this to the RAM of the node minus a few gig for OS etc
SchedulerType=sched/backfill    # use backfill
SelectType=select/cons_tres     # schedule cores and RAM [cons_res is deprecated/removed in recent SLURM releases]
SelectTypeParameters=CR_Core_Memory
PriorityType=priority/multifactor    # use multi factor for fairshare, size and QOS
PriorityDecayHalfLife=14-0           # decay time of fairshare data is 2w cause users seem to remember who did what last week on clusters but not last month :)
PriorityFavorSmall=yes             
PriorityWeightAge=10000              
PriorityWeightFairshare=800000       # mostly priority wrt fairshare and within user jobs wrt to size
PriorityWeightJobSize=1000           
PriorityWeightQOS=1000000               # so we can have 'ASAP' QOS for urgent 'this needs to run now' jobs from professors
AccountingStorageEnforce=qos,limits     # enforce limits
AccountingStorageHost=master
AccountingStorageType=accounting_storage/slurmdbd
AccountingStoreFlags=job_comment
ClusterName=master
DebugFlags=NO_CONF_HASH
JobCompLoc=/var/spool/slurm/accounting/jobs.txt      # we keep track of jobs finished via text files log rotated daily and these are kept forever
JobCompType=jobcomp/filetxt
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log
SlurmSchedLogFile=/var/log/slurm/slurmsched.log
SlurmSchedLogLevel=1
NodeName=master RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN      # allows for small single core jobs on master [adjust to your taste]
NodeName=compute[101-130] RealMemory=2048 Sockets=1 CoresPerSocket=2 ThreadsPerCore=1 State=UNKNOWN    # rest of nodes are a bit beefier and all similar [adjust to your case]
PartitionName=normal Nodes=master,compute[101-130] Default=YES MaxTime=7-0 State=UP ExclusiveUser=NO OverSubscribe=NO  # we stick to a single default partition with all nodes and sane defaults and let the scheduler do its work instead of making schedulers out of the users [set gres and features of nodes to differentiate between nodes rather than partitions]
EOD
master chmod 644 /etc/slurm/slurm.conf
master chown slurm:slurm /etc/slurm/slurm.conf
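# Hint for the NodeName lines above: 'slurmd -C' on a node prints the hardware it detects
# [CPUs, Sockets, CoresPerSocket, ThreadsPerCore, RealMemory] in slurm.conf syntax, which you can
# paste in and adjust. Left commented out here since slurmd is not fully set up yet at this point.
#slurmd -C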
# Create SLURM cgroup config. Read https://slurm.schedmd.com/cgroup.conf.html for more info. Only on master.
master cat << EOD > /etc/slurm/cgroup.conf
CgroupAutomount=yes
ConstrainCores=yes
ConstrainRAMSpace=yes
MaxRAMPercent=93
EOD
master chmod 644 /etc/slurm/cgroup.conf
master chown slurm:slurm /etc/slurm/cgroup.conf
# Create SLURM topology config. Read https://slurm.schedmd.com/topology.conf.html for more info. Only on master.
master cat << EOD > /etc/slurm/topology.conf
SwitchName=master Nodes=master                # make sure no MPI jobs span across nodes and master
SwitchName=rack1 Nodes=compute[101-130]        # when having multiple racks one can isolate MPI jobs within racks this way. each rack will have a switch but switches are not connected then.
#SwitchName=rack2 Nodes=compute[201-230]
#SwitchName=rack3 Nodes=compute[301-330]
#SwitchName=Connect Switches=rack[1-3]        # but if you do want MPI jobs spanning racks, then you can connect the switches. if desired also master again.
EOD
master chmod 644 /etc/slurm/topology.conf
master chown slurm:slurm /etc/slurm/topology.conf
# Create SLURM plugstack.conf. Read https://slurm.schedmd.com/spank.conf.html for more info. Only on master.
master cat << EOD > /etc/slurm/plugstack.conf
optional /usr/lib64/slurm/spank_pbs.so           # so old Torque/PBS job scripts of users still have some of the env vars they were used to
EOD
master chmod 644 /etc/slurm/plugstack.conf           
master chown slurm:slurm /etc/slurm/plugstack.conf
# Now the config files for SLURM have been created on master. The nodes will use them either by having a direct copy in /etc/slurm or by using the newer config-less method.
# Details regarding that can be found here https://slurm.schedmd.com/configless_slurm.html. The .conf files can be dealt with via the config-less method, but the prolog.sh
# and epilog.sh files cannot [nor can the node specific gres.conf placed locally on compute nodes]. Here we choose the config-less method and create the correct prolog.sh and epilog.sh
# on all nodes together with setting up the nodes to use the config-less method. Alternatively one can set up these config files on nfs shared storage for all nodes
# (including master) to access them.
# Create prolog.sh and epilog.sh on all nodes. Tweaks are performed to make OpenMP jobs play nice automatically and create local scratch per job.
cat << EOD > /etc/slurm/prolog.sh
#!/bin/sh
/usr/bin/mkdir -m 700 -p /scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID}                              # create job and user local scratch with correct permissions
/usr/bin/chown -R \${SLURM_JOB_USER}:\${SLURM_JOB_USER} /scratch/\${SLURM_JOB_USER}
/usr/bin/chmod 700 /scratch/\${SLURM_JOB_USER} /scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID}
echo export TMPDIR=/scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID}                                    # user job scripts can use TMPDIR env variable now to get scratch setup
if [ -n "\${SLURM_CPUS_PER_TASK}" ]                                                                # pre-set OMP_NUM_THREADS so users do not have to [or forget to] in job scripts
then
echo export OMP_NUM_THREADS=\${SLURM_CPUS_PER_TASK}
else
echo export OMP_NUM_THREADS=\$(/usr/bin/expr \${SLURM_JOB_CPUS_PER_NODE:-1} : '\([0-9]*\)')
fi
echo export OMP_DYNAMIC=false                                                                      # we do not like dynamic threads in HPC but you might
exit 0
EOD
cat << EOD > /etc/slurm/epilog.sh
#!/bin/sh
/bin/rm -rf /scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID}                                           # we automatically remove the prolog-created local scratch of the job
exit 0
EOD
chmod 755 /etc/slurm/{prolog,epilog}.sh           
chown slurm:slurm /etc/slurm/{prolog,epilog}.sh
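# To illustrate what the prolog gives jobs, drop a minimal [hypothetical] example job script on the
# master; it relies on the TMPDIR and OMP_NUM_THREADS exported by the prolog and can be submitted
# later with 'sbatch /root/example_openmp_job.sh' once the daemons are running. Adjust to your own codes.
master cat << EOD > /root/example_openmp_job.sh
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=2
#SBATCH --time=0:05:00
#SBATCH --mem=64M
cd "\${TMPDIR}"                                           # per-job local scratch created by the prolog
echo "running with \${OMP_NUM_THREADS} OpenMP threads in \$(pwd)"
#./my_openmp_program.x                                    # your own OpenMP binary would go here
EOD
master chmod 755 /root/example_openmp_job.sh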
# Make sure slurmd options are set to use config-less method of obtaining the SLURM configs. This is done on all nodes.
echo "SLURMD_OPTIONS=--conf-server master:6817" > /etc/sysconfig/slurmd
# Remove the config files that the Rocky SLURM packages put on the compute nodes, otherwise they overrule the config-less method.
compute rm -rf /etc/slurm/*.conf                
# Now configure logrotate to deal with logfiles and accounting file only on master.
master cat << EOD > /etc/logrotate.d/slurm
/var/log/slurm/*.log
{
    missingok
    notifempty
    monthly
    rotate 12
    compress
    postrotate
     /usr/bin/killall -HUP slurmctld > /dev/null 2>&1 || true
    endscript
}
/var/spool/slurm/accounting/jobs.txt
{
    missingok
    notifempty
    daily
    dateyesterday
    rotate 3650
    maxage 3650
    dateext
    noolddir
    postrotate
     /usr/bin/killall -HUP slurmctld > /dev/null 2>&1 || true
    endscript
}
EOD
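# Optional: dry-run the logrotate config to catch syntax errors without actually rotating anything.
master logrotate -d /etc/logrotate.d/slurm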
# Configure normal logrotate for compute nodes.
compute cat << EOD > /etc/logrotate.d/slurm
/var/log/slurm/*.log
{
    missingok
    notifempty
    monthly
    rotate 12
    compress
}
EOD
# Make sure all nodes can reach each other on the [private NATed] network. Run this on all nodes.
systemctl enable firewalld                                  # enable and start firewalld first
systemctl start firewalld                                   # and make sure communication is allowed between nodes
firewall-cmd --permanent --add-rich-rule="rule family=ipv4 source address=${NETWORK} accept"  
firewall-cmd --reload
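# Optional check that the rich rule for the cluster network made it into the running config.
firewall-cmd --list-rich-rules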
# We can now start slurmctld on master and slurmd on all nodes.
master systemctl enable slurmctld
master systemctl start slurmctld
master sleep 10                      # wait a bit for DB to get initialized properly
systemctl enable slurmd
systemctl start slurmd
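# Optional sanity checks now that the daemons are up: the controller should answer, this node should
# register in the default partition and the topology should reflect topology.conf.
sleep 5                            # give slurmd a moment to register with the controller
scontrol ping
sinfo
master scontrol show topology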
# Start setting up some basic tweaks in the SLURM database.
# Now define the ASAP QOS in the SLURM DB only on master. This allows an admin to do 'scontrol update jobid=<id> qos=ASAP' to make a job jump the queue without any limits.
master sacctmgr -i add qos ASAP priority=10000000
# set up default normal QOS to include user limits on master. Read more about this at https://slurm.schedmd.com/qos.html.
master sacctmgr -i modify qos normal set maxtresperuser=node=2,cpu=4  # A single user is not allowed to use more than 2 nodes or 4 cores in this example
# More user or account limits can now be set via sacctmgr on the desired associations. Read more about this at https://slurm.schedmd.com/resource_limits.html.
master sacctmgr -i modify account account=root set MaxSubmit=100         # make sure new users of account root cannot submit more than 100 jobs
master sacctmgr -i add user test account=root                            # checkout the DB with sacctmgr and use the show command https://slurm.schedmd.com/sacctmgr.html
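# Optional read-only look at what ended up in the accounting DB: the QOS limits and the association tree.
master sacctmgr show qos
master sacctmgr show associations tree format=cluster,account,user,maxsubmitjobs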
# Let us make sure the master node is available and not draining.
master scontrol update nodename=master state=resume
# Now make sure any compute nodes that were just installed are not draining.
compute scontrol update nodename=compute[101-130] state=resume
# When more nodes are available, one could set up a reservation for test and or development jobs. See https://slurm.schedmd.com/reservations.html.
master scontrol create ReservationName=test StartTime=09:00:00 Duration=08:00:00 NodeCnt=1 Accounts=root Flags=DAILY
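# Optional: verify the reservation exists; when testing, a job can target it explicitly
# [commented out here as it needs the cluster to be fully up and the user to be allowed in the reservation].
master scontrol show reservations
#srun --reservation=test --nodes 1 --ntasks 1 --time 0:05:00 --mem 32M hostname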
# SLURM has been set up and tweaked. We should take it for a spin.
# As a test let us fetch and compile OpenMPI against this SLURM on all nodes. In the future OpenMPI might come directly from a Rocky repo.
dnf -y --enablerepo=crb install wget gfortran @development                         # get the tools we need from repo
wget https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.7.tar.gz      # get OpenMPI
tar -zxvf openmpi-5.0.7.tar.gz                                                     # untar and configure against SLURM
cd openmpi-5.0.7
./configure --enable-mpi1-compatibility --with-slurm --disable-static --enable-mpi-fortran --with-hwloc=internal --with-libevent=internal --with-pmix=internal --with-pmix-binaries --with-munge
make
make install                                                                      
cd /root
# Tweak OpenMPI not to do any binding by default but let SLURM be in control.
echo 'hwloc_base_binding_policy=none' >> /usr/local/etc/openmpi-mca-params.conf
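# Optional check of the fresh OpenMPI build: list the parameters/components that mention slurm or pmix
# so we know the SLURM and PMIx support we configured above is really compiled in.
ompi_info --all | grep -i -E 'slurm|pmix' | head -20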
# Now for a test, get an MPI hello world in C and fortran and compile and run them with SLURM on single core on master.
wget https://github.com/mpitutorial/mpitutorial/raw/gh-pages/tutorials/mpi-hello-world/code/mpi_hello_world.c
wget https://git.ecdf.ed.ac.uk/sopa-computing/parallel-coding-intro/-/raw/master/mpi-fortran/hello_world.f90
mpicc -o mpi_hello_world_c.x mpi_hello_world.c
mpif90 -o mpi_hello_world_fortran.x hello_world.f90
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_c.x      # be aware this works even without shared nfs storage because these binaries were also compiled on master and exist there
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_fortran.x
compute srun --nodes 1 --ntasks-per-node 2 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_c.x    
compute srun --nodes 1 --ntasks-per-node 2 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_fortran.x
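# Once more than one compute node is registered [and within the 2-node/4-core per-user QOS limit set
# above], a quick multi-node MPI check could look like this; commented out here since it needs at
# least two compute nodes to be up.
#srun --nodes 2 --ntasks-per-node 2 --cpus-per-task 1 --time 0:05:00 --mem 32M ./mpi_hello_world_c.x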
# Now get some OpenMP hello world tests in C and Fortran and run them too.
wget https://people.sc.fsu.edu/~jburkardt/c_src/hello_openmp/hello_openmp.c
wget https://people.sc.fsu.edu/~jburkardt/f_src/hello_openmp/hello_openmp.f90
gcc -fopenmp -o omp_hello_world_c.x hello_openmp.c
gfortran -fopenmp -o omp_hello_world_fortran.x hello_openmp.f90
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_c.x      # be aware this works even without shared nfs storage because these binaries were also compiled on master and exist there
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_fortran.x
compute srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 2 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_c.x     
compute srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 2 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_fortran.x
# Now for some extra tweaks and tricks: here is a small Python script to parse the accounting files created by SLURM.
master cat << 'EOS' > /root/SlurmReport
#!/usr/bin/python3
# Summarize SLURM jobcomp/filetxt accounting files: job count and CPU hours per user plus totals.
import sys, re, datetime
# --------------------------------------------
def Usage():
    print( "%s logfile [logfiles]" % ( sys.argv[ 0 ] ) )
    sys.exit( 1 )
# --------------------------------------------
# first test some arguments...
if len( sys.argv ) <= 1:
    Usage()
# initialize some vars...
Stats = { 'Totals': { 'JobCnt': 0, 'CPUTime': 0.0 }, 'Users': {} }
# now loop over the log files specified to collect stats...
for FileName in sys.argv[ 1: ]:
    File = open( FileName, 'r' )
    for Line in File:
        try:
            User = re.search( r"UserId=([\w\d]+)", Line ).group( 1 )
            Start = re.search( r"StartTime=([T\d:-]+)", Line ).group( 1 )
            End = re.search( r"EndTime=([T\d:-]+)", Line ).group( 1 )
            Nodes = re.search( r"NodeCnt=(\d+)", Line ).group( 1 )      # parsed but not used below
            Cores = re.search( r"ProcCnt=(\d+)", Line ).group( 1 )
            TimeDiff = datetime.datetime.strptime( End, "%Y-%m-%dT%H:%M:%S" ) - datetime.datetime.strptime( Start, "%Y-%m-%dT%H:%M:%S" )
            UsedCPUTime = float( Cores ) * ( float( TimeDiff.seconds ) + float( TimeDiff.days ) * 3600.0 * 24.0 )
            if User not in Stats[ 'Users' ]:
                Stats[ 'Users' ][ User ] = { 'JobCnt': 0, 'CPUTime': 0.0 }
            Stats[ 'Users' ][ User ][ 'JobCnt' ] += 1
            Stats[ 'Users' ][ User ][ 'CPUTime' ] += UsedCPUTime
            Stats[ 'Totals' ][ 'JobCnt' ] += 1
            Stats[ 'Totals' ][ 'CPUTime' ] += UsedCPUTime
        except ( AttributeError, ValueError ):
            continue                                                    # skip lines without a complete job record
    File.close()
# now generate output from stats...
print( '%-48s %-10s %-10s' % ( "User", "#jobs", "CPU(h)" ) )
print()
for User, UserStats in Stats[ 'Users' ].items():
    Jobs = UserStats[ 'JobCnt' ]
    CPUh = UserStats[ 'CPUTime' ] / 3600.0
    print( '%-48s %-10d %-10.3f' % ( User, Jobs, CPUh ) )
print()
Jobs = Stats[ 'Totals' ][ 'JobCnt' ]
CPUh = Stats[ 'Totals' ][ 'CPUTime' ] / 3600.0
print( '%-48s %-10d %-10.3f' % ( "Totals", Jobs, CPUh ) )
EOS
master chmod +x /root/SlurmReport
master /root/SlurmReport /var/spool/slurm/accounting/jobs.txt*        # and show it to us
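# The same [and more detailed] accounting data also lives in slurmdbd and can be queried with sacct,
# for example all users' jobs since the start of this month; the SlurmReport script above only needs
# the plain text job completion files and keeps working even if the DB is ever purged.
master sacct --allusers --starttime=$(date +%Y-%m-01) --format=user,jobid,elapsed,ncpus,state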