#!/bin/bash

# How to set up SLURM [and test OpenMPI and OpenMP against it] with some extra tweaks on Rocky 9.5 v 18-03-2025.
#
# Generally a cluster is set up in three stages:
#
# 1. make sure the hardware is set up and nodes can be provisioned with an OS [from a master/head node] using tools like
#    Warewulf, xCAT, Bright, Trinity-X, Qlustar or DIY with PXE booting anaconda kickstart. In this stage /home is usually
#    mounted on shared storage [often the master/head node] on all nodes and UIDs and GIDs of users are kept in sync on
#    all nodes via YP/NIS or LDAP [set up on the master/head node].
# 2. set up SLURM and an [integrated] MPI stack on all nodes of the cluster. Sometimes a basic setup is provided by the
#    provisioning tool used.
# 3. set up a build environment on shared storage [often /home] so tools like EasyBuild and Spack can be used to build
#    extra software. For this all nodes should be equipped with Lmod.
#
# This script is about step 2 and it will:
#
# - set up a proper accounting DB with slurmdbd so QOS and resource limits can be set next to having access to detailed accounting
# - configure SLURM to use multifactor scheduling for fairshare and backfill using cores and memory
# - set up a decent prolog and epilog suitable for OpenMP and dealing with local scratch for jobs
# - use a topology setup to make sure MPI jobs do not span racks with perhaps only limited bandwidth between them
# - set up SLURM to generate a job accounting file and include a simple python script to parse it
# - demonstrate how to make OpenMPI integrate with SLURM
# - give general advice in comments based on many years of experience
#
# Prerequisites for this script:
#
# - should be run as root
# - all nodes share the same [private] network, can resolve each other's names and have outbound access to the internet [NATed or directly]
# - all nodes have a Rocky 9 minimal install [at least]

# Specify the [private] network range for all nodes here. This script will open firewalld to the nodes only in this network.
export NETWORK="192.168.122.0/24"

# Set the name of the node to 'master' for a head/master node or to 'computeXYZ' for a compute node SLURM setup.
# Always set up the master first before setting up any compute nodes.
hostnamectl hostname master

# Some handy aliases only used in this script.
shopt -s expand_aliases                              # make sure these aliases work below in the script
alias master='[[ "$(hostname)" == "master" ]] &&'    # for running only on the master
alias compute='[[ "$(hostname)" != "master" ]] &&'   # for running only on a compute node

# Start off from a minimal Rocky 9 install and update all nodes.
dnf -y update

# Install needed packages. Needs to be done on all nodes.
dnf -y --enablerepo=crb install epel-release rocky-release-hpc
dnf -y --enablerepo=crb install munge munge-devel slurm23-*

# Create user and group slurm on the node and set the right permissions on some dirs. This needs to be done on all nodes.
groupadd -r -g 500 slurm; luseradd -r -u 500 -g 500 -s /sbin/nologin -d / slurm
mkdir -m 755 -p /var/log/slurm /var/run/slurm /var/spool/slurm/{accounting,slurmd,state}
chown -R slurm:slurm /var/run/slurm /var/log/slurm /etc/slurm /var/spool/slurm/{accounting,slurmd,state}

# Set up the munge key and the munge service. Needs to be done on all nodes.
echo "SuperSecretMungeKeyForAllNodesOfThisCluster" > /etc/munge/munge.key
chmod 400 /etc/munge/munge.key; chown munge:munge /etc/munge/munge.key
systemctl enable munge
systemctl start munge
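
# Optional sanity check [not part of the original recipe]: verify that munge credentials can be created and decoded
# before continuing. The remote variant assumes root ssh access and that 'compute101' is one of your compute nodes;
# substitute a hostname from your own cluster.
munge -n | unmunge                    # should report STATUS: Success (0) locally
# munge -n | ssh compute101 unmunge   # should also succeed across nodes once they have the same key installed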
# Set up the mariadb service. Only needs to be done on master.
master dnf -y --enablerepo=crb install mariadb-server
master systemctl enable mariadb
master systemctl start mariadb
master mysql_secure_installation << EOD
y
n
y
y
y
y
EOD

# Set up a database for SLURM. Only needs to be done on master.
master mysql << EOD
create user 'slurm' identified by 'slurmpassword';
grant all on slurm_acct_db.* TO 'slurm';
create database slurm_acct_db;
EOD

# Set up the slurmdbd.conf file. For more info read https://slurm.schedmd.com/slurmdbd.conf.html. Only needs to be done on master.
master cat << EOD > /etc/slurm/slurmdbd.conf
PurgeEventAfter=24 # these values can be tweaked but as they are they keep accounting data in the DB for 2y
PurgeJobAfter=24
PurgeResvAfter=24
PurgeStepAfter=24
PurgeSuspendAfter=24
AuthType=auth/munge
DbdHost=master
SlurmUser=slurm
DebugLevel=4
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurm/slurmdbd.pid
PrivateData=usage,users,jobs # here users can only retrieve their own accounting data
StorageType=accounting_storage/mysql
StorageHost=master
StoragePass=slurmpassword
StorageUser=slurm
StorageLoc=slurm_acct_db
EOD
master chmod 600 /etc/slurm/slurmdbd.conf # make sure no user can read your secret DB password
master chown slurm:slurm /etc/slurm/slurmdbd.conf

# Now enable and start the slurmdbd service. On master only.
master systemctl enable slurmdbd
master systemctl start slurmdbd
master sleep 5 # wait a bit for it to settle

# Create the SLURM config. Read https://slurm.schedmd.com/slurm.conf.html for more info or use https://slurm.schedmd.com/configurator.html. Only on master.
master cat << EOD > /etc/slurm/slurm.conf
AuthType=auth/munge
CryptoType=crypto/munge
EnforcePartLimits=yes
Epilog=/etc/slurm/epilog.sh # set up epilog script
MaxTasksPerNode=2 # set this to the number of physical cores your compute nodes have
MpiDefault=pmix_v3 # we use pmix_v3 by default so openmpi hooks into SLURM's pmix_v3 plugin and srun works nicely
CpuFreqDef=Performance
PrivateData=usage,users
ProctrackType=proctrack/cgroup # we use cgroups to track and isolate jobs
Prolog=/etc/slurm/prolog.sh # set up prolog script
RebootProgram=/usr/sbin/reboot # make sure we can reboot nodes via scontrol
ReturnToService=1
SlurmctldHost=master
SlurmctldParameters=enable_configless # we will use the config-less method to get the config to compute nodes from master
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurm/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm/slurmd
SlurmUser=slurm
StateSaveLocation=/var/spool/slurm/state
SwitchType=switch/none
TaskPlugin=task/affinity,task/cgroup
TaskPluginParam=Cores
TaskProlog=/etc/slurm/prolog.sh
TopologyPlugin=topology/tree
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0
DefMemPerCPU=1024 # set this to the RAM of the node [minus a few gig] divided by the number of physical cores on the nodes
MaxMemPerNode=2048 # set this to the RAM of the node minus a few gig for the OS etc
SchedulerType=sched/backfill # use backfill
SelectType=select/cons_res # schedule cores and RAM
SelectTypeParameters=CR_Core_Memory
PriorityType=priority/multifactor # use multifactor for fairshare, size and QOS
PriorityDecayHalfLife=14-0 # decay time of fairshare data is 2w because users seem to remember who did what last week on clusters but not last month :)
PriorityFavorSmall=yes
PriorityWeightAge=10000
PriorityWeightFairshare=800000 # priority mostly wrt fairshare and within a user's jobs wrt size
PriorityWeightJobSize=1000
PriorityWeightQOS=1000000 # so we can have an 'ASAP' QOS for urgent 'this needs to run now' jobs from professors
AccountingStorageEnforce=qos,limits # enforce limits
AccountingStorageHost=master
AccountingStorageType=accounting_storage/slurmdbd
AccountingStoreFlags=job_comment
ClusterName=master
DebugFlags=NO_CONF_HASH
JobCompLoc=/var/spool/slurm/accounting/jobs.txt # we keep track of finished jobs via text files, log rotated daily and kept forever
JobCompType=jobcomp/filetxt
JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurm/slurmd.log
SlurmSchedLogFile=/var/log/slurm/slurmsched.log
SlurmSchedLogLevel=1
NodeName=master RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN # allows for small single core jobs on master [adjust to your taste]
NodeName=compute[101-130] RealMemory=2048 Sockets=1 CoresPerSocket=2 ThreadsPerCore=1 State=UNKNOWN # the rest of the nodes are a bit beefier and all similar [adjust to your case]
PartitionName=normal Nodes=master,compute[101-130] Default=YES MaxTime=7-0 State=UP ExclusiveUser=NO OverSubscribe=NO # we stick to a single default partition with all nodes and sane defaults and let the scheduler do its work instead of making schedulers out of the users [set gres and features of nodes to differentiate between nodes rather than partitions]
EOD
master chmod 644 /etc/slurm/slurm.conf
master chown slurm:slurm /etc/slurm/slurm.conf
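
# Not part of the original recipe, but handy when filling in the NodeName= lines above: 'slurmd -C' prints the
# CPU/socket/core/memory layout a node would report to SLURM, so you do not have to guess RealMemory, Sockets,
# CoresPerSocket and ThreadsPerCore. Run it on a representative compute node [the slurm packages are already
# installed everywhere at this point]; the output is essentially a ready-to-paste NodeName= line plus an UpTime line.
slurmd -C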
# Create the SLURM cgroup config. Read https://slurm.schedmd.com/cgroup.conf.html for more info. Only on master.
master cat << EOD > /etc/slurm/cgroup.conf
CgroupAutomount=yes
ConstrainCores=yes
ConstrainRAMSpace=yes
MaxRAMPercent=93
EOD
master chmod 644 /etc/slurm/cgroup.conf
master chown slurm:slurm /etc/slurm/cgroup.conf

# Create the SLURM topology config. Read https://slurm.schedmd.com/topology.conf.html for more info. Only on master.
master cat << EOD > /etc/slurm/topology.conf
SwitchName=master Nodes=master # put master on its own switch so MPI jobs do not span the master and compute nodes
SwitchName=rack1 Nodes=compute[101-130] # with multiple racks one can isolate MPI jobs within a rack this way: each rack gets a switch but the switches are not connected
#SwitchName=rack2 Nodes=compute[201-230]
#SwitchName=rack3 Nodes=compute[301-330]
#SwitchName=Connect Switches=rack[1-3] # but if you do want MPI jobs spanning racks, connect the switches this way [and, if desired, master again]
EOD
master chmod 644 /etc/slurm/topology.conf
master chown slurm:slurm /etc/slurm/topology.conf

# Create the SLURM plugstack.conf. Read https://slurm.schedmd.com/spank.conf.html for more info. Only on master.
master cat << EOD > /etc/slurm/plugstack.conf
optional /usr/lib64/slurm/spank_pbs.so # so old Torque/PBS job scripts of users still get some of the old env vars they were used to
EOD
master chmod 644 /etc/slurm/plugstack.conf
master chown slurm:slurm /etc/slurm/plugstack.conf

# Now the config files for SLURM have been created on master. The nodes will use them either by having a direct copy in
# /etc/slurm or by using the newer config-less method. Details regarding that can be found at
# https://slurm.schedmd.com/configless_slurm.html. The .conf files can be distributed via the config-less method, but the
# prolog.sh and epilog.sh files cannot [nor can the node specific gres.conf locally placed on compute nodes]. Here we
# choose the config-less method and create the correct prolog.sh and epilog.sh on all nodes, together with setting up the
# nodes to use the config-less method. Alternatively one can put these config files on nfs shared storage for all nodes
# [including master] to access them.
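
# Not used in this minimal cluster, but mentioned above: nodes with GPUs [or other consumable resources] get a small,
# node-local gres.conf plus matching entries in slurm.conf, so jobs can request them with --gres. The device files and
# the 'a100' type below are purely illustrative.
# cat << EOD > /etc/slurm/gres.conf
# Name=gpu Type=a100 File=/dev/nvidia0
# Name=gpu Type=a100 File=/dev/nvidia1
# EOD
# In slurm.conf you would then add 'GresTypes=gpu' and put 'Gres=gpu:a100:2' on the NodeName= line of such a node,
# after which users can ask for a GPU with e.g. 'srun --gres=gpu:1 ...'.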
# Create prolog.sh and epilog.sh on all nodes. Tweaks are performed to make OpenMP jobs play nice automatically and to create local scratch per job.
cat << EOD > /etc/slurm/prolog.sh
#!/bin/sh
/usr/bin/mkdir -m 700 -p /scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID} # create per-user and per-job local scratch with correct permissions
/usr/bin/chown -R \${SLURM_JOB_USER}:\${SLURM_JOB_USER} /scratch/\${SLURM_JOB_USER}
/usr/bin/chmod 700 /scratch/\${SLURM_JOB_USER} /scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID}
echo export TMPDIR=/scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID} # user job scripts can now use the TMPDIR env variable to reach their scratch
if [ -n "\${SLURM_CPUS_PER_TASK}" ] # preset OMP_NUM_THREADS so users do not have to [or forget to] in job scripts
then
  echo export OMP_NUM_THREADS=\${SLURM_CPUS_PER_TASK}
else
  echo export OMP_NUM_THREADS=\$(/usr/bin/expr \${SLURM_JOB_CPUS_PER_NODE:-1} : '\([0-9]*\)')
fi
echo export OMP_DYNAMIC=false # we do not like dynamic threads in HPC but you might
exit 0
EOD
cat << EOD > /etc/slurm/epilog.sh
#!/bin/sh
/bin/rm -rf /scratch/\${SLURM_JOB_USER}/\${SLURM_JOB_ID} # we automatically remove the prolog-created local scratch of jobs
exit 0
EOD
chmod 755 /etc/slurm/{prolog,epilog}.sh
chown slurm:slurm /etc/slurm/{prolog,epilog}.sh

# Make sure the slurmd options are set to use the config-less method of obtaining the SLURM configs. This is done on all nodes.
echo "SLURMD_OPTIONS=--conf-server master:6817" > /etc/sysconfig/slurmd
# Remove the config files the Rocky SLURM install placed on the compute nodes, otherwise they overrule the config-less method.
compute rm -rf /etc/slurm/*.conf

# Now configure logrotate to deal with the log files and the accounting file, only on master.
master cat << EOD > /etc/logrotate.d/slurm
/var/log/slurm/*.log {
    missingok
    notifempty
    monthly
    rotate 12
    compress
    postrotate
        /usr/bin/killall -HUP slurmctld > /dev/null 2>&1 || true
    endscript
}
/var/spool/slurm/accounting/jobs.txt {
    missingok
    notifempty
    daily
    dateyesterday
    rotate 3650
    maxage 3650
    dateext
    noolddir
    postrotate
        /usr/bin/killall -HUP slurmctld > /dev/null 2>&1 || true
    endscript
}
EOD

# Configure normal logrotate for the compute nodes.
compute cat << EOD > /etc/logrotate.d/slurm
/var/log/slurm/*.log {
    missingok
    notifempty
    monthly
    rotate 12
    compress
}
EOD

# Make sure all nodes can reach each other on the [private NATed] network. Run this on all nodes.
systemctl enable firewalld # enable and start firewalld first
systemctl start firewalld
# and make sure communication is allowed between the nodes
firewall-cmd --permanent --add-rich-rule="rule family=ipv4 source address=${NETWORK} accept"
firewall-cmd --reload

# We can now start slurmctld on master and slurmd on all nodes.
master systemctl enable slurmctld
master systemctl start slurmctld
master sleep 10 # wait a bit for the DB to get initialized properly
systemctl enable slurmd
systemctl start slurmd

# Start setting up some basic tweaks in the SLURM database.
# First define the ASAP QOS in the SLURM DB, only on master. This allows an admin to do 'scontrol update jobid=<jobid> qos=ASAP'
# to make a job jump the queue without any limits.
master sacctmgr -i add qos ASAP priority=10000000
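
# Quick check [not in the original flow] that the controller and the nodes are talking to each other before tweaking the DB further.
master scontrol ping                           # slurmctld on master should report UP
master sinfo -Nel                              # every installed node should show up in the normal partition
master sacctmgr show qos format=name,priority  # should list both the default 'normal' QOS and the new 'ASAP' one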
# Set up the default normal QOS to include user limits, on master. Read more about this at https://slurm.schedmd.com/qos.html.
master sacctmgr -i modify qos normal set maxtresperuser=node=2,cpu=4 # a single user is not allowed to use more than 2 nodes or 4 cores in this example
# More user or account limits can now be set via sacctmgr on the desired associations. Read more about this at https://slurm.schedmd.com/resource_limits.html.
master sacctmgr -i modify account account=root set MaxSubmit=100 # make sure new users of account root cannot submit more than 100 jobs
master sacctmgr -i add user test account=root
# Check out the DB with sacctmgr and its show command, see https://slurm.schedmd.com/sacctmgr.html.

# Let us make sure the master node is available and not draining.
master scontrol update nodename=master state=resume
# Now make sure any compute nodes installed by now are not draining either.
compute scontrol update nodename=compute[101-130] state=resume

# When more nodes are available, one could set up a reservation for test and/or development jobs. See https://slurm.schedmd.com/reservations.html.
master scontrol create ReservationName=test StartTime=09:00:00 Duration=08:00:00 NodeCnt=1 Account=root Flags=DAILY

# SLURM has been set up and tweaked. We should take it for a spin.
# As a test let us get and compile OpenMPI against SLURM on all nodes. In the future OpenMPI might come directly from a Rocky repo.
dnf -y --enablerepo=crb install wget gfortran @development # get the tools we need from the repo
wget https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.7.tar.gz # get OpenMPI
tar -zxvf openmpi-5.0.7.tar.gz # untar and configure against SLURM
cd openmpi-5.0.7
./configure --enable-mpi1-compatibility --with-slurm --disable-static --enable-mpi-fortran --with-hwloc=internal --with-libevent=internal --with-pmix=internal --with-pmix-binaries --with-munge
make
make install
cd /root

# Tweak OpenMPI not to do any binding by default but let SLURM be in control.
echo 'hwloc_base_binding_policy=none' >> /usr/local/etc/openmpi-mca-params.conf

# Now for a test, get an MPI hello world in C and fortran, compile them and run them with SLURM on a single core on master.
wget https://github.com/mpitutorial/mpitutorial/raw/gh-pages/tutorials/mpi-hello-world/code/mpi_hello_world.c
wget https://git.ecdf.ed.ac.uk/sopa-computing/parallel-coding-intro/-/raw/master/mpi-fortran/hello_world.f90
mpicc -o mpi_hello_world_c.x mpi_hello_world.c
mpif90 -o mpi_hello_world_fortran.x hello_world.f90
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_c.x # note this works even without shared nfs storage because these binaries were also compiled on master and exist there
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_fortran.x
compute srun --nodes 1 --ntasks-per-node 2 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_c.x
compute srun --nodes 1 --ntasks-per-node 2 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./mpi_hello_world_fortran.x
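
# The srun tests above run interactively; day-to-day work is normally submitted with sbatch instead. A minimal batch
# script for the same MPI hello world could look like the sketch below [not part of the original test run; it assumes
# the binary is reachable on the allocated node, which is the case here since it was compiled in /root on every node].
master cat << 'EOD' > /root/mpi_hello.job
#!/bin/bash
#SBATCH --job-name=mpi_hello
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=1
#SBATCH --time=0:05:00
#SBATCH --mem=64M
echo "per-job local scratch from the prolog: ${TMPDIR}"
srun /root/mpi_hello_world_c.x   # srun uses the pmix_v3 MpiDefault from slurm.conf
EOD
master sbatch /root/mpi_hello.job # output ends up in slurm-<jobid>.out in the submission directory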
# Now get some OpenMP hello world tests in C and fortran and run them too.
wget https://people.sc.fsu.edu/~jburkardt/c_src/hello_openmp/hello_openmp.c
wget https://people.sc.fsu.edu/~jburkardt/f_src/hello_openmp/hello_openmp.f90
gcc -fopenmp -o omp_hello_world_c.x hello_openmp.c
gfortran -fopenmp -o omp_hello_world_fortran.x hello_openmp.f90
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_c.x # note this works even without shared nfs storage because these binaries were also compiled on master and exist there
master srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 1 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_fortran.x
compute srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 2 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_c.x
compute srun --nodes 1 --ntasks-per-node 1 --cpus-per-task 2 --threads-per-core 1 --time 0:05:00 --mem 32M ./omp_hello_world_fortran.x

# Now for some extra tweaks and tricks: here is the python script to parse the accounting files created by SLURM.
master cat << 'EOS' > /root/SlurmReport
#!/usr/bin/python3
import sys, re, datetime

# --------------------------------------------
def Usage():
    print( "%s logfile [logfiles]" % ( sys.argv[ 0 ] ) )
    sys.exit( 1 )
# --------------------------------------------

# first check the arguments...
if ( len( sys.argv ) <= 1 ):
    Usage()

# initialize some vars...
Stats = { 'Totals': { 'JobCnt': 0, 'CPUTime': 0.0 }, 'Users': {} }

# now loop over the log files specified to collect stats...
for FileNumber in range( 1, len( sys.argv ) ):
    FileName = sys.argv[ FileNumber ]
    File = open( FileName, 'r' )
    for Line in File:
        try:
            User  = re.search( r"UserId=([\w\d]+)", Line ).group( 1 )
            Start = re.search( r"StartTime=([T\d:-]+)", Line ).group( 1 )
            End   = re.search( r"EndTime=([T\d:-]+)", Line ).group( 1 )
            Nodes = re.search( r"NodeCnt=(\d+)", Line ).group( 1 )
            Cores = re.search( r"ProcCnt=(\d+)", Line ).group( 1 )
            TimeDiff = datetime.datetime.strptime( End, "%Y-%m-%dT%H:%M:%S" ) - datetime.datetime.strptime( Start, "%Y-%m-%dT%H:%M:%S" )
            UsedCPUTime = float( Cores ) * ( float( TimeDiff.seconds ) + float( TimeDiff.days ) * 3600.0 * 24.0 )
            if ( User not in Stats[ 'Users' ] ):
                Stats[ 'Users' ][ User ] = { 'JobCnt': 0, 'CPUTime': 0.0 }
            Stats[ 'Users' ][ User ][ 'JobCnt' ] += 1
            Stats[ 'Users' ][ User ][ 'CPUTime' ] += UsedCPUTime
            Stats[ 'Totals' ][ 'JobCnt' ] += 1
            Stats[ 'Totals' ][ 'CPUTime' ] += UsedCPUTime
        except Exception: # skip lines that cannot be parsed [e.g. jobs without a proper start or end time]
            continue
    File.close()

# now generate output from the stats...
print( '%-48s %-10s %-10s' % ( "User", "#jobs", "CPU(h)" ) )
print()
for User, UserStats in Stats[ 'Users' ].items():
    Jobs = UserStats[ 'JobCnt' ]
    CPUh = UserStats[ 'CPUTime' ] / 3600.0
    print( '%-48s %-10d %-10.3f' % ( User, Jobs, CPUh ) )
print()
Jobs = Stats[ 'Totals' ][ 'JobCnt' ]
CPUh = Stats[ 'Totals' ][ 'CPUTime' ] / 3600.0
print( '%-48s %-10d %-10.3f' % ( "Totals", Jobs, CPUh ) )
EOS
master chmod +x /root/SlurmReport
master /root/SlurmReport /var/spool/slurm/accounting/jobs.txt* # and show it to us
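
# If you want this overview regularly without logging in, a small cron entry on master can run the report periodically.
# The schedule and output location below are just an example; adjust to taste.
master cat << EOD > /etc/cron.d/slurmreport
0 7 1 * * root /root/SlurmReport /var/spool/slurm/accounting/jobs.txt* > /var/spool/slurm/accounting/monthly_report.txt 2>&1
EOD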