run.sh 3.4 KB
Newer Older
Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
1
2
#!/usr/bin/env bash

3
set -eo pipefail
Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
4

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
SLURM_OPTIONS=(cpus-per-task nodes ntasks ntasks-per-node)

# Look up the value of a CI_SLURM_* variable that is exported inside a script.
#
# Arguments:
#   $1 - path of the script file to search
#   $2 - SLURM option name (e.g. "cpus-per-task"); it is upper-cased and its
#        dashes are turned into underscores to form CI_SLURM_<NAME>
# Outputs:
#   The second "="-separated field of every line containing
#   "export CI_SLURM_<NAME>=". When the lookup pipeline reports a failure
#   (with pipefail set this includes grep finding no match) an empty line is
#   printed instead.
function getSlurmVar {
    local upper varName
    upper=${2^^}
    varName="CI_SLURM_${upper//-/_}"

    # The file may carry literal "\n" sequences; sed expands them into real
    # line breaks so that grep and cut can work line by line.
    if ! sed -e 's/\\n/\n/g' "$1" | grep "export $varName=" | cut -d "=" -f 2; then
        # TODO: return sensible per-option defaults instead of an empty value?
        #       (candidates once were CPUS_PER_TASK=4, NODES=1, NTASKS=4)
        echo ""
    fi
}
26

Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Stream a file to stdout as it grows, creating the file first if needed.
#
# Arguments:
#   $1 - path of the file to follow
# Note: `tail -f` keeps following the file, so this only ends when tail is
# terminated from outside.
function tailFile {
    local followed=$1

    # make sure the file exists before tail opens it
    # (alternative idea: poll until the file shows up instead of creating it)
    touch "$followed"
    tail -f "$followed"
}

# Print the first column (the PID) of the first process that ps lists as a
# child of the given parent process.
#
# Arguments:
#   $1 - PID of the parent process
# Outputs:
#   The first field of the second line of the ps listing; nothing if ps
#   printed fewer than two lines.
function pidChildOf {
    # Row 1 of the listing is the column header, so the first child is row 2.
    ps --ppid "$1" | awk 'NR == 2 { print $1 }'
}

# Print the exit code that SLURM recorded for a job.
#
# Arguments:
#   $1 - SLURM job ID
# Outputs:
#   The part of the first "ExitCode=<code>:<signal>" token in the
#   `scontrol show job` output that precedes the colon.
# Returns:
#   0 when such a token was found; non-zero when it is missing, so callers
#   never receive an empty exit code without noticing.
function slurmJobExitCode {
    # The token is located by name rather than by field position, so a
    # different ordering of the scontrol fields does not break the lookup.
    # awk reads the whole input (no early exit) to avoid a SIGPIPE upstream.
    scontrol show job="$1" | awk '
        !found {
            for (i = 1; i <= NF; i++) {
                if ($i ~ /^ExitCode=/) {
                    code = $i
                    sub(/^ExitCode=/, "", code)
                    sub(/:.*$/, "", code)
                    print code
                    found = 1
                    break
                }
            }
        }
        END { exit !found }
    '
}

Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
42
#
43
44
45
# ./this_script [my_args...] script stage_name
#                            \____/
#                       This has to be run!
Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
46
47
#

Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
48
49
50
51
52
# Directory that receives the SLURM output and error files of the submitted job.
# This is needed since the runner's working directory is /tmp, which is not shared between nodes.
# Set CI_SLURM_WORK_DIR in the runner's environment to use a different directory;
# the historical location remains the default.
# TODO: Find out why the runner uses /tmp instead of its working directory, anyway.
WORK_DIR="${CI_SLURM_WORK_DIR:-/home/pape58/runner-wd}"

module load git 2> /dev/null
53
54
55

script=${*: -2:1}
runStage=${*: -1:1}
56

Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
57
# Only run the build script on the cluster.
Pape's avatar
Pape committed
58
if [[ "$runStage" == "build_script" ]]; then
59
60
61
62
63
64
65
    # This will be the name of the batch script that is constructed from the script passed to this
    # program. Later we move this to the old script name.
    newScript="$script.new"

    {
        # write shebang and SBATCH options to new script
        echo "#!/usr/bin/env bash";
Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
66
67
        echo "#SBATCH --output=$WORK_DIR/slurm-%j.out";
        echo "#SBATCH --error=$WORK_DIR/slurm-%j.err";
68
69
70
71
72
73
74
75
76
77

        # Emit one "#SBATCH --<option>=<value>" line for every entry of
        # SLURM_OPTIONS for which getSlurmVar finds a value in the original
        # script; options without a value are left out.
        for option in "${SLURM_OPTIONS[@]}"; do
            slurmVar=$(getSlurmVar "$script" "$option")
            [[ -z "$slurmVar" ]] || printf '#SBATCH --%s=%s\n' "$option" "$slurmVar"
            unset -v slurmVar
        done
78
79
80

        # write the shell settings to the new script
        # TODO: Read these settings from the original script instead of assuming they never change.
81
82
83
        echo "set -eo pipefail"
        echo "set +o noclobber"
        echo ""
84
85
86
87
88
89
90
91
92
93
94
95

        # add the rest of the old script
        tail -n +5 "$script";
    } >> "$newScript"

    # move the script
    mv "$newScript" "$script"

    # pass the script to sbatch, catching its job ID
    chmod +x "$script"
    jobID=$(sbatch "$script" | awk '{ print $4 }')

Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
96
97
    outFile="$WORK_DIR/slurm-$jobID.out"
    errFile="$WORK_DIR/slurm-$jobID.err"
98
99

    # spawn subshells for live output of std and err
Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
100
    (tailFile "$outFile") &
101
    pidTailOutParent="$!"
Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
102
    (tailFile "$errFile") &
103
104
    pidTailErrParent="$!"

105
    # wait for job to finish
106
    until [[ $(squeue -j "$jobID" | wc -l) -le "1" ]]; do sleep 2; done
107

Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
108
109
    pidTailOut=$(pidChildOf "$pidTailOutParent")
    pidTailErr=$(pidChildOf "$pidTailErrParent")
110
111
112

    # send SIGPIPE (signal 13) to the tail processes so that killing them produces no output
    kill -13 "$pidTailOut" "$pidTailErr"
113
114
115

    # cleanup
    rm "$outFile" "$errFile"
Pape's avatar
Pape committed
116

Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
117
118
    # get the job's exit code and pass it to GitLab CI on exit
    exitCode=$(slurmJobExitCode "$jobID")
119
    exit "$exitCode"
Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
120

Pape's avatar
Pape committed
121
else
122
    # run script without intervention
Pape's avatar
Pape committed
123
124
    "$script"
fi
Pape, David (FWCC) - 139658's avatar
Pape, David (FWCC) - 139658 committed
125
126

exit 0
127