#!/bin/bash

# How to set up SLURM [and test OpenMPI and OpenMP against it] with some extra tweaks on Rocky 9.5 v 18-03-2025.
#
# Generally a cluster is set up in three stages:
#
# 1. make sure the hardware is set up and nodes can be provisioned with an OS [from a master/head node] using tools like
#    Warewulf, xCAT, Bright, Trinity-X, Qlustar or DIY with PXE booting anaconda kickstart. In this stage /home is usually
#    mounted on shared storage [often the master/head node] on all nodes and UIDs and GIDs of users are kept in sync on
#    all nodes via YP/NIS or LDAP [set up on the master/head node].
# 2. set up SLURM and an [integrated] MPI stack on all nodes of the cluster. Sometimes a basic setup is provided by the
#    provisioning tool used.
# 3. set up a build environment on shared storage [often /home] so tools like EasyBuild and Spack can be used to build
#    extra software. For this all nodes should be equipped with Lmod.
#
# This script is about step 2 and it will:
#
# - set up a proper accounting DB with slurmdbd so QOS and resource limits can be set next to having access to detailed accounting
# - configure SLURM to use multifactor scheduling for fairshare and backfill using cores and memory
# - set up a decent prolog and epilog suitable for OpenMP and dealing with local scratch for jobs
# - use a topology setup to make sure MPI jobs do not span racks with perhaps only limited bandwidth between them
# - set up SLURM to generate a job accounting file and include a simple python script to parse it
# - demonstrate how to make OpenMPI integrate with SLURM
# - give general advice in comments based on many years of experience
#
# Prerequisites for this script:
#
# - should be run as root
# - all nodes share the same [private] network, can resolve each other's names and have outbound access to the internet [NATed or directly]
# - all nodes have a Rocky 9 minimal install [at least]

# Specify the [private] network range for all nodes here. This script will open firewalld to the nodes only in this network.
export NETWORK="192.168.122.0/24"

# Set the name of the node to 'master' for a head/master node or to 'computeXYZ' for a compute node SLURM setup.
# Always set up the master first before setting up any compute nodes.
hostnamectl hostname master

# Some handy aliases only used in this script.
shopt -s expand_aliases                              # make sure these aliases work below in the script
alias master='[[ "$(hostname)" == "master" ]] &&'    # for running only on the master
alias compute='[[ "$(hostname)" != "master" ]] &&'   # for running only on a compute node

# Start off from a minimal Rocky 9 install and update all nodes.
dnf -y update

# Install needed packages. Needs to be done on all nodes.
dnf -y --enablerepo=crb install epel-release rocky-release-hpc
dnf -y --enablerepo=crb install munge munge-devel slurm23-*

# Create user and group slurm on the node and set the right permissions on some dirs. This needs to be done on all nodes.
groupadd -r -g 500 slurm; luseradd -r -u 500 -g 500 -s /sbin/nologin -d / slurm
mkdir -m 755 -p /var/log/slurm /var/run/slurm /var/spool/slurm/{accounting,slurmd,state}
chown -R slurm:slurm /var/run/slurm /var/log/slurm /etc/slurm /var/spool/slurm/{accounting,slurmd,state}

# Set up the munge key and the munge service. Needs to be done on all nodes.
echo "SuperSecretMungeKeyForAllNodesOfThisCluster" > /etc/munge/munge.key
chmod 400 /etc/munge/munge.key; chown munge:munge /etc/munge/munge.key
systemctl enable munge
systemctl start munge
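
# Optional sanity check [not part of the original recipe]: verify that munge credentials can be created and decoded
# before continuing. The remote variant assumes root ssh access and that 'compute101' is one of your compute nodes;
# substitute a hostname from your own cluster.
munge -n | unmunge                    # should report STATUS: Success (0) locally
# munge -n | ssh compute101 unmunge   # should also succeed across nodes once they have the same key installed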
# Set up the mariadb service. Only needs to be done on master.
master dnf -y --enablerepo=crb install mariadb-server
master systemctl enable mariadb
master systemctl start mariadb
master mysql_secure_installation << EOD
y
n
y
y
y
y
EOD

# Set up a database for SLURM. Only needs to be done on master.
master mysql << EOD
create user 'slurm' identified by 'slurmpassword';
grant all on slurm_acct_db.* TO 'slurm';
create database slurm_acct_db;
EOD

# Set up the slurmdbd.conf file. For more info read https://slurm.schedmd.com/slurmdbd.conf.html. Only needs to be done on master.
master cat << EOD > /etc/slurm/slurmdbd.conf
PurgeEventAfter=24 # these values can be tweaked but as they are they keep accounting data in the DB for 2y
PurgeJobAfter=24
PurgeResvAfter=24
PurgeStepAfter=24
PurgeSuspendAfter=24
AuthType=auth/munge
DbdHost=master
SlurmUser=slurm
DebugLevel=4
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurm/slurmdbd.pid
PrivateData=usage,users,jobs # here users can only retrieve their own accounting data
StorageType=accounting_storage/mysql
StorageHost=master
StoragePass=slurmpassword
StorageUser=slurm
StorageLoc=slurm_acct_db
EOD
master chmod 600 /etc/slurm/slurmdbd.conf # make sure no user can read your secret DB password
master chown slurm:slurm /etc/slurm/slurmdbd.conf

# Now enable and start the slurmdbd service. On master only.
master systemctl enable slurmdbd
master systemctl start slurmdbd
master sleep 5 # wait a bit for it to settle

# Create the SLURM config. Read https://slurm.schedmd.com/slurm.conf.html for more info or use https://slurm.schedmd.com/configurator.html. Only on master.
master cat << EOD > /etc/slurm/slurm.conf
AuthType=auth/munge
CryptoType=crypto/munge
EnforcePartLimits=yes
Epilog=/etc/slurm/epilog.sh # set up epilog script
MaxTasksPerNode=2 # set this to the number of physical cores your compute nodes have
MpiDefault=pmix_v3 # we use pmix_v3 by default so openmpi hooks into SLURM's pmix_v3 plugin and srun works nicely
CpuFreqDef=Performance
PrivateData=usage,users
ProctrackType=proctrack/cgroup # we use cgroups to track and isolate jobs
Prolog=/etc/slurm/prolog.sh # set up prolog script
RebootProgram=/usr/sbin/reboot # make sure we can reboot nodes via scontrol
ReturnToService=1
SlurmctldHost=master
SlurmctldParameters=enable_configless # we will use the config-less method to get the config to compute nodes from master
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurm/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm/slurmd
SlurmUser=slurm
StateSaveLocation=/var/spool/slurm/state
SwitchType=switch/none
TaskPlugin=task/affinity,task/cgroup
TaskPluginParam=Cores
TaskProlog=/etc/slurm/prolog.sh
TopologyPlugin=topology/tree
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0
DefMemPerCPU=1024 # set this to the RAM of the node [minus a few gig] divided by the number of physical cores on the nodes
MaxMemPerNode=2048 # set this to the RAM of the node minus a few gig for the OS etc
SchedulerType=sched/backfill # use backfill
SelectType=select/cons_res # schedule cores and RAM
SelectTypeParameters=CR_Core_Memory
PriorityType=priority/multifactor # use multifactor for fairshare, size and QOS
PriorityDecayHalfLife=14-0 # decay time of fairshare data is 2w because users seem to remember who did what last week on clusters but not last month :)
PriorityFavorSmall=yes
PriorityWeightAge=10000
PriorityWeightFairshare=800000 # priority mostly wrt fairshare and within a user's jobs wrt size
PriorityWeightJobSize=1000
PriorityWeightQOS=1000000 # so we can have an 'ASAP' QOS for urgent 'this needs to run now' jobs from professors
AccountingStorageEnforce=qos,limits # enforce limits
AccountingStorageHost=master
AccountingStorageType=accounting_storage/slurmdbd
AccountingStoreFlags=job_comment
ClusterName=master
DebugFlags=NO_CONF_HASH
JobCompLoc=/var/spool/slurm/accounting/jobs.txt # we keep track of finished jobs via text files, log rotated daily and kept forever
JobCompType=jobcomp/filetxt
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log
SlurmSchedLogFile=/var/log/slurm/slurmsched.log
SlurmSchedLogLevel=1
NodeName=master RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN # allows for small single core jobs on master [adjust to your taste]
NodeName=compute[101-130] RealMemory=2048 Sockets=1 CoresPerSocket=2 ThreadsPerCore=1 State=UNKNOWN # the rest of the nodes are a bit beefier and all similar [adjust to your case]
PartitionName=normal Nodes=master,compute[101-130] Default=YES MaxTime=7-0 State=UP ExclusiveUser=NO OverSubscribe=NO # we stick to a single default partition with all nodes and sane defaults and let the scheduler do its work instead of making schedulers out of the users [set gres and features of nodes to differentiate between nodes rather than partitions]
EOD
master chmod 644 /etc/slurm/slurm.conf
master chown slurm:slurm /etc/slurm/slurm.conf
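
# Not part of the original recipe, but handy when filling in the NodeName= lines above: 'slurmd -C' prints the
# CPU/socket/core/memory layout a node would report to SLURM, so you do not have to guess RealMemory, Sockets,
# CoresPerSocket and ThreadsPerCore. Run it on a representative compute node [the slurm packages are already
# installed everywhere at this point]; the output is essentially a ready-to-paste NodeName= line plus an UpTime line.
slurmd -C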
# Create the SLURM cgroup config. Read https://slurm.schedmd.com/cgroup.conf.html for more info. Only on master.
master cat << EOD > /etc/slurm/cgroup.conf
CgroupAutomount=yes
ConstrainCores=yes
ConstrainRAMSpace=yes
MaxRAMPercent=93
EOD
master chmod 644 /etc/slurm/cgroup.conf
master chown slurm:slurm /etc/slurm/cgroup.conf

# Create the SLURM topology config. Read https://slurm.schedmd.com/topology.conf.html for more info. Only on master.
master cat << EOD > /etc/slurm/topology.conf
SwitchName=master Nodes=master # put master on its own switch so MPI jobs do not span the master and compute nodes
SwitchName=rack1 Nodes=compute[101-130] # with multiple racks one can isolate MPI jobs within a rack this way: each rack gets a switch but the switches are not connected
#SwitchName=rack2 Nodes=compute[201-230]
#SwitchName=rack3 Nodes=compute[301-330]
#SwitchName=Connect Switches=rack[1-3] # but if you do want MPI jobs spanning racks, connect the switches this way [and, if desired, master again]
EOD
master chmod 644 /etc/slurm/topology.conf
master chown slurm:slurm /etc/slurm/topology.conf

# Create the SLURM plugstack.conf. Read https://slurm.schedmd.com/spank.conf.html for more info. Only on master.
master cat << EOD > /etc/slurm/plugstack.conf
optional /usr/lib64/slurm/spank_pbs.so # so old Torque/PBS job scripts of users still get some of the old env vars they were used to
EOD
master chmod 644 /etc/slurm/plugstack.conf
master chown slurm:slurm /etc/slurm/plugstack.conf

# Now the config files for SLURM have been created on master. The nodes will use them either by having a direct copy in
# /etc/slurm or by using the newer config-less method. Details regarding that can be found at
# https://slurm.schedmd.com/configless_slurm.html. The .conf files can be distributed via the config-less method, but the
# prolog.sh and epilog.sh files cannot [nor can the node specific gres.conf locally placed on compute nodes]. Here we
# choose the config-less method and create the correct prolog.sh and epilog.sh on all nodes, together with setting up the
# nodes to use the config-less method. Alternatively one can put these config files on nfs shared storage for all nodes
# [including master] to access them.
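
# Not used in this minimal cluster, but mentioned above: nodes with GPUs [or other consumable resources] get a small,
# node-local gres.conf plus matching entries in slurm.conf, so jobs can request them with --gres. The device files and
# the 'a100' type below are purely illustrative.
# cat << EOD > /etc/slurm/gres.conf
# Name=gpu Type=a100 File=/dev/nvidia0
# Name=gpu Type=a100 File=/dev/nvidia1
# EOD
# In slurm.conf you would then add 'GresTypes=gpu' and put 'Gres=gpu:a100:2' on the NodeName= line of such a node,
# after which users can ask for a GPU with e.g. 'srun --gres=gpu:1 ...'.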
# Create prolog.sh and epilog.sh on all nodes. Tweaks are performed to make OpenMP jobs play nice automatically and to create local scratch per job.
cat << EOD > /etc/slurm/prolog.sh
#!/bin/sh
/usr/bin/mkdir -m 700 -p /scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID} # create per-user and per-job local scratch with correct permissions
/usr/bin/chown -R \${SLURM_JOB_USER}:\${SLURM_JOB_USER} /scratch/\${SLURM_JOB_USER}
/usr/bin/chmod 700 /scratch/\${SLURM_JOB_USER} /scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID}
echo export TMPDIR=/scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID} # user job scripts can now use the TMPDIR env variable to reach their scratch
if [ -n "\${SLURM_CPUS_PER_TASK}" ] # preset OMP_NUM_THREADS so users do not have to [or forget to] in job scripts
then
  echo export OMP_NUM_THREADS=\${SLURM_CPUS_PER_TASK}
else
  echo export OMP_NUM_THREADS=\$(/usr/bin/expr \${SLURM_JOB_CPUS_PER_NODE:-1} : '\([0-9]*\)')
fi
echo export OMP_DYNAMIC=false # we do not like dynamic threads in HPC but you might
exit 0
EOD
cat << EOD > /etc/slurm/epilog.sh
#!/bin/sh
/bin/rm -rf /scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID} # we automatically remove the prolog-created local scratch of jobs
exit 0
EOD
chmod 755 /etc/slurm/{prolog,epilog}.sh
chown slurm:slurm /etc/slurm/{prolog,epilog}.sh

# Make sure the slurmd options are set to use the config-less method of obtaining the SLURM configs. This is done on all nodes.
echo "SLURMD_OPTIONS=--conf-server master:6817" > /etc/sysconfig/slurmd
# Remove the config files the Rocky SLURM install placed on the compute nodes, otherwise they overrule the config-less method.
compute rm -rf /etc/slurm/*.conf

# Now configure logrotate to deal with the log files and the accounting file, only on master.
master cat << EOD > /etc/logrotate.d/slurm
/var/log/slurm/*.log {
    missingok
    notifempty
    monthly
    rotate 12
    compress
    postrotate
        /usr/bin/killall -HUP slurmctld > /dev/null 2>&1 || true
    endscript
}
/var/spool/slurm/accounting/jobs.txt {
    missingok
    notifempty
    daily
    dateyesterday
    rotate 3650
    maxage 3650
    dateext
    noolddir
    postrotate
        /usr/bin/killall -HUP slurmctld > /dev/null 2>&1 || true
    endscript
}
EOD

# Configure normal logrotate for the compute nodes.
compute cat << EOD > /etc/logrotate.d/slurm
/var/log/slurm/*.log {
    missingok
    notifempty
    monthly
    rotate 12
    compress
}
EOD

# Make sure all nodes can reach each other on the [private NATed] network. Run this on all nodes.
systemctl enable firewalld # enable and start firewalld first
systemctl start firewalld
# and make sure communication is allowed between the nodes
firewall-cmd --permanent --add-rich-rule="rule family=ipv4 source address=${NETWORK} accept"
firewall-cmd --reload

# We can now start slurmctld on master and slurmd on all nodes.
master systemctl enable slurmctld
master systemctl start slurmctld
master sleep 10 # wait a bit for the DB to get initialized properly
systemctl enable slurmd
systemctl start slurmd

# Start setting up some basic tweaks in the SLURM database.
# First define the ASAP QOS in the SLURM DB, only on master. This allows an admin to do 'scontrol update jobid=<jobid> qos=ASAP'
# to make a job jump the queue without any limits.
master sacctmgr -i add qos ASAP priority=10000000
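
# Quick check [not in the original flow] that the controller and the nodes are talking to each other before tweaking the DB further.
master scontrol ping                           # slurmctld on master should report UP
master sinfo -Nel                              # every installed node should show up in the normal partition
master sacctmgr show qos format=name,priority  # should list both the default 'normal' QOS and the new 'ASAP' one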
# Set up the default normal QOS to include user limits, on master. Read more about this at https://slurm.schedmd.com/qos.html.
master sacctmgr -i modify qos normal set maxtresperuser=node=2,cpu=4 # a single user is not allowed to use more than 2 nodes or 4 cores in this example
# More user or account limits can now be set via sacctmgr on the desired associations. Read more about this at https://slurm.schedmd.com/resource_limits.html.
master sacctmgr -i modify account account=root set MaxSubmit=100 # make sure new users of account root cannot submit more than 100 jobs
master sacctmgr -i add user test account=root
# Check out the DB with sacctmgr and its show command, see https://slurm.schedmd.com/sacctmgr.html.

# Let us make sure the master node is available and not draining.
master scontrol update nodename=master state=resume
# Now make sure any compute nodes installed by now are not draining either.
compute scontrol update nodename=compute[101-130] state=resume

# When more nodes are available, one could set up a reservation for test and/or development jobs. See https://slurm.schedmd.com/reservations.html.
master scontrol create ReservationName=test StartTime=09:00:00 Duration=08:00:00 NodeCnt=1 Account=root Flags=DAILY

# SLURM has been set up and tweaked. We should take it for a spin.
# As a test let us get and compile OpenMPI against SLURM on all nodes. In the future OpenMPI might come directly from a Rocky repo.
dnf -y --enablerepo=crb install wget gfortran @development # get the tools we need from the repo
wget https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.7.tar.gz # get OpenMPI
tar -zxvf openmpi-5.0.7.tar.gz # untar and configure against SLURM
cd openmpi-5.0.7
./configure --enable-mpi1-compatibility --with-slurm --disable-static --enable-mpi-fortran --with-hwloc=internal --with-libevent=internal --with-pmix=internal --with-pmix-binaries --with-munge
make
make install
cd /root

# Tweak OpenMPI not to do any binding by default but let SLURM be in control.
echo 'hwloc_base_binding_policy=none' >> /usr/local/etc/openmpi-mca-params.conf

# Now for a test, get an MPI hello world in C and fortran, compile them and run them with SLURM on a single core on master.
wget https://github.com/mpitutorial/mpitutorial/raw/gh-pages/tutorials/mpi-hello-world/code/mpi_hello_world.c
wget https://git.ecdf.ed.ac.uk/sopa-computing/parallel-coding-intro/-/raw/master/mpi-fortran/hello_world.f90
mpicc -o mpi_hello_world_c.x mpi_hello_world.c
mpif90 -o mpi_hello_world_fortran.x hello_world.f90
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_c.x # note this works even without shared nfs storage because these binaries were also compiled on master and exist there
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_fortran.x
compute srun --nodes 1 --ntasks-per-node 2 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_c.x
compute srun --nodes 1 --ntasks-per-node 2 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_fortran.x
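
# The srun tests above run interactively; day-to-day work is normally submitted with sbatch instead. A minimal batch
# script for the same MPI hello world could look like the sketch below [not part of the original test run; it assumes
# the binary is reachable on the allocated node, which is the case here since it was compiled in /root on every node].
master cat << 'EOD' > /root/mpi_hello.job
#!/bin/bash
#SBATCH --job-name=mpi_hello
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=1
#SBATCH --time=0:05:00
#SBATCH --mem=64M
echo "per-job local scratch from the prolog: ${TMPDIR}"
srun /root/mpi_hello_world_c.x   # srun uses the pmix_v3 MpiDefault from slurm.conf
EOD
master sbatch /root/mpi_hello.job # output ends up in slurm-<jobid>.out in the submission directory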
# Now get some OpenMP hello world tests in C and fortran and run them too.
wget https://people.sc.fsu.edu/~jburkardt/c_src/hello_openmp/hello_openmp.c
wget https://people.sc.fsu.edu/~jburkardt/f_src/hello_openmp/hello_openmp.f90
gcc -fopenmp -o omp_hello_world_c.x hello_openmp.c
gfortran -fopenmp -o omp_hello_world_fortran.x hello_openmp.f90
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_c.x # note this works even without shared nfs storage because these binaries were also compiled on master and exist there
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_fortran.x
compute srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 2 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_c.x
compute srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 2 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_fortran.x

# Now for some extra tweaks and tricks: here is the python script to parse the accounting files created by SLURM.
master cat << 'EOS' > /root/SlurmReport
#!/usr/bin/python3
import sys, re, datetime

# --------------------------------------------
def Usage():
    print( "%s logfile [logfiles]" % ( sys.argv[ 0 ] ) )
    sys.exit( 1 )
# --------------------------------------------

# first check the arguments...
if ( len( sys.argv ) <= 1 ):
    Usage()

# initialize some vars...
Stats = { 'Totals': { 'JobCnt': 0, 'CPUTime': 0.0 }, 'Users': {} }

# now loop over the log files specified to collect stats...
for FileNumber in range( 1, len( sys.argv ) ):
    FileName = sys.argv[ FileNumber ]
    File = open( FileName, 'r' )
    for Line in File:
        try:
            User  = re.search( r"UserId=([\w\d]+)", Line ).group( 1 )
            Start = re.search( r"StartTime=([T\d:-]+)", Line ).group( 1 )
            End   = re.search( r"EndTime=([T\d:-]+)", Line ).group( 1 )
            Nodes = re.search( r"NodeCnt=(\d+)", Line ).group( 1 )
            Cores = re.search( r"ProcCnt=(\d+)", Line ).group( 1 )
            TimeDiff = datetime.datetime.strptime( End, "%Y-%m-%dT%H:%M:%S" ) - datetime.datetime.strptime( Start, "%Y-%m-%dT%H:%M:%S" )
            UsedCPUTime = float( Cores ) * ( float( TimeDiff.seconds ) + float( TimeDiff.days ) * 3600.0 * 24.0 )
            if ( User not in Stats[ 'Users' ] ):
                Stats[ 'Users' ][ User ] = { 'JobCnt': 0, 'CPUTime': 0.0 }
            Stats[ 'Users' ][ User ][ 'JobCnt' ] += 1
            Stats[ 'Users' ][ User ][ 'CPUTime' ] += UsedCPUTime
            Stats[ 'Totals' ][ 'JobCnt' ] += 1
            Stats[ 'Totals' ][ 'CPUTime' ] += UsedCPUTime
        except Exception: # skip lines that cannot be parsed [e.g. jobs without a proper start or end time]
            continue
    File.close()

# now generate output from the stats...
print( '%-48s %-10s %-10s' % ( "User", "#jobs", "CPU(h)" ) )
print()
for User, UserStats in Stats[ 'Users' ].items():
    Jobs = UserStats[ 'JobCnt' ]
    CPUh = UserStats[ 'CPUTime' ] / 3600.0
    print( '%-48s %-10d %-10.3f' % ( User, Jobs, CPUh ) )
print()
Jobs = Stats[ 'Totals' ][ 'JobCnt' ]
CPUh = Stats[ 'Totals' ][ 'CPUTime' ] / 3600.0
print( '%-48s %-10d %-10.3f' % ( "Totals", Jobs, CPUh ) )
EOS
master chmod +x /root/SlurmReport
master /root/SlurmReport /var/spool/slurm/accounting/jobs.txt* # and show it to us
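
# If you want this overview regularly without logging in, a small cron entry on master can run the report periodically.
# The schedule and output location below are just an example; adjust to taste.
master cat << EOD > /etc/cron.d/slurmreport
0 7 1 * * root /root/SlurmReport /var/spool/slurm/accounting/jobs.txt* > /var/spool/slurm/accounting/monthly_report.txt 2>&1
EOD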