Commit 57889603 authored by Pape, David (FWCC) - 139658's avatar Pape, David (FWCC) - 139658
Browse files

React to different Slurm job states. Also add support for CI tags.

Jobs to be run on the cluster must use the CI tag; other jobs are run on
the login node.
parent 654e2c5c
......@@ -33,6 +33,10 @@ function pidChildOf {
ps --ppid "$1" | tail -n +2 | head -n 1 | awk '{ print $1 }'
}
# Print the state of a Slurm job (e.g. RUNNING, COMPLETED) as reported in the
# "JobState" field of `scontrol show job`.
# Arguments: $1 - Slurm job ID
# Outputs:   for every scontrol output line containing "JobState", the text
#            between the first and second "=" of its first whitespace-separated
#            field (or that whole field if it contains no "=")
function slurmJobState {
    scontrol show job="$1" |
        grep "JobState" |
        awk '{ count = split($1, keyValue, "="); print (count > 1 ? keyValue[2] : $1) }'
}
function slurmJobExitCode {
scontrol show job="$1" | grep "ExitCode" |
awk '{ print $5 }' | cut -d "=" -f 2 | cut -d ":" -f 1
......@@ -50,7 +54,7 @@ script=${*: -2:1}
runStage=${*: -1:1}
# Only run the build script on the cluster.
if [[ "$runStage" == "build_script" ]]; then
if [[ "$1" == "hemera" && "$runStage" == "build_script" ]]; then
# This will be the name of the batch script that is constructed from the script passed to this
# program. Later we move this to the old script name.
newScript="$script.new"
......@@ -101,8 +105,27 @@ if [[ "$runStage" == "build_script" ]]; then
pidTailErrParent="$!"
# wait for job to finish
until [[ $(squeue -j "$jobID" | wc -l) -le "1" ]]; do sleep 2; done
1>&2 echo -e "${BOLD_GREEN}Job finished$NC"
#until [[ $(squeue -j "$jobID" | wc -l) -le "1" ]]; do sleep 2; done
#1>&2 echo -e "${BOLD_GREEN}Job finished$NC"
while true; do
jobState=$(slurmJobState "$jobID")
1>&2 echo "Slurm job state: $jobState"
case "$jobState" in
"BOOT_FAIL"|"CANCELLED"|"DEADLINE"|"FAILED"|"NODE_FAIL"|"OUT_OF_MEMORY"|"PREEMPTED"|"REVOKED"|"SPECIAL_EXIT"|"TIMEOUT")
exitCode=255
1>&2 echo -e "${BOLD_RED}Slurm job stopped$NC"
break;;
"COMPLETED")
1>&2 echo -e "${BOLD_GREEN}Slurm job completed$NC"
break;;
"CONFIGURING"|"COMPLETING"|"PENDING"|"RUNNING"|"RESV_DEL_HOLD"|"REQUEUE_FED"|"REQUEUE_HOLD"|"REQUEUED"|"RESIZING"|"SIGNALING"|"STAGE_OUT"|"STOPPED"|"SUSPENDED")
;;
esac
sleep 5;
done
pidTailOut=$(pidChildOf "$pidTailOutParent")
pidTailErr=$(pidChildOf "$pidTailErrParent")
......@@ -113,8 +136,8 @@ if [[ "$runStage" == "build_script" ]]; then
# cleanup
rm "$outFile" "$errFile"
# get the job's exit code and pass it to GitLab CI on exit
exitCode=$(slurmJobExitCode "$jobID")
# set exit code if still not set and pass it to GitLab CI on exit
if [[ -z "$exitCode" ]]; then exitCode=$(slurmJobExitCode "$jobID"); fi
exit "$exitCode"
else
......
......@@ -7,7 +7,23 @@ check_interval = 0
[[runners]]
name = "Test runner for cluster integration"
url = "https://gitlab.hzdr.de/"
token = "eQT7ugr9aK-g6TTJGxdB"
token = "xExeZZJWhNcvYfLtWiFs"
executor = "custom"
builds_dir = "/home/pape58/runner-wd/builds"
cache_dir = "/home/pape58/runner-wd/cache"
shell = "bash"
[runners.custom]
config_exec = "/home/pape58/gitlab-runner-custom/config.sh"
run_exec = "/home/pape58/gitlab-runner-custom/run.sh"
run_args = ["hemera"]
cleanup_exec = "/home/pape58/gitlab-runner-custom/cleanup.sh"
graceful_kill_timeout = 200
force_kill_timeout = 200
[[runners]]
name = "Test runner for cluster integration -- this one runs all non-hemera jobs"
url = "https://gitlab.hzdr.de/"
token = "_D1dkrgzufLqkVSsBzbD"
executor = "custom"
builds_dir = "/home/pape58/runner-wd/builds"
cache_dir = "/home/pape58/runner-wd/cache"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment