SLURM Job Array Migration Guide

This guide describes how to bypass the 8 job limit by lightly restructuring your code to effectively use multiple CPUs within a single job. As a side benefit, your code will also become more resilient to failure by gaining the ability to resume where it left off.

If you have used SLURM on other clusters, you may be surprised by the 8 job limit; the limit was put in place to reduce the time between submitting your job and it starting to run, and it was added at the request of our users to share the cluster more fairly.

The goal of this guide is to explain several concepts underlying job parallelism, starting with SLURM Job Arrays, taking a detour through shell job parallelism and xargs, and finally describing more sophisticated parallelism and job control using GNU Parallel.

Method        Multiple CPUs   Multiple Nodes   Resumable           Max CPUs
Job Arrays    Yes             Yes              Manual (see note)   8
Bash Jobs     Yes             No               No                  24
xargs         Yes             No               No                  24
GNU Parallel  Yes             Yes              Yes                 192
MPI           Yes             Yes              Maybe               192

Note: Assumes each job step uses 1 CPU.

Let's get started!

Setup

Let's solve a realistic Bayesian inference task using the Stan language.

The command-line version of Stan (CmdStan) cannot be shared as a module and is instead meant to be compiled in your home directory, because Stan works by compiling each model program before running it. The setup should take about 10 minutes. Run these commands in your shell:

wget https://github.com/stan-dev/cmdstan/releases/download/v2.23.0/cmdstan-2.23.0.tar.gz
tar -xf cmdstan-2.23.0.tar.gz
cd cmdstan-2.23.0/
module purge
module load gcc/9.2.0
# Note that we unset the RTM_KEY of TBB because RTM instructions are only available on Skylake,
# but we want to be able to run Stan on older CPUs.
make -j RTM_KEY= build

Build and run the example model as suggested by the make output. Compiling a model only uses one CPU core at a time, so there is no need to use -j here.

# Add LDFLAGS so that we can run the model without loading gcc.
LDFLAGS="-Wl,-rpath,/apps2/gcc/9.2.0/lib64 -Wl,-rpath,${PWD}/stan/lib/stan_math/lib/tbb"
make LDFLAGS="${LDFLAGS}" examples/bernoulli/bernoulli
examples/bernoulli/bernoulli sample data file=examples/bernoulli/bernoulli.data.R
bin/stansummary output.csv

Now that we know that Stan works correctly with the built-in example model, let's add a model that takes longer to run so that we can test parallel computation. Generate the files suggested by https://www.weirdfishes.blog/blog/fitting-bayesian-models-with-stan-and-r/

mkdir examples/beverton_holt
cd examples/beverton_holt
wget https://github.com/DanOvando/weird-fishes/raw/master/content/blog/data/rlsadb_v4.25_ssb_recruits.csv
# Fix line endings in the data file.
sed -i 's#\r#\n#g' rlsadb_v4.25_ssb_recruits.csv
# Create the to_json.py script (shown below) used to subset the data.
nano to_json.py
# Subset data for testing.
python to_json.py rlsadb_v4.25_ssb_recruits.csv PSALMAKPSWUD > subset.json
# Copy and paste the model below.
nano bh_model.stan
# Compile the model from the top level makefile.
cd ../..
LDFLAGS="-Wl,-rpath,/apps2/gcc/9.2.0/lib64 -Wl,-rpath,${PWD}/stan/lib/stan_math/lib/tbb"
make LDFLAGS="${LDFLAGS}" examples/beverton_holt/bh_model
# Test the model.
examples/beverton_holt/bh_model sample num_samples=5000 data file=subset.json
bin/stansummary output.csv

Subset code to_json.py:

from __future__ import print_function
import csv
import json
import os
import sys

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: {0} CSVFILE STOCKID".format(sys.argv[0]))
        sys.exit(1)
    csvfile = sys.argv[1]
    stockid = sys.argv[2]
    ssb = []
    r = []
    stockids = set()
    with open(csvfile, 'r') as handle:
        rows = csv.reader(handle)
        next(rows)
        for row in rows:
            _stockid = row[0]
            stockids.update([_stockid])
            if _stockid == stockid:
                _ssb = row[-3]
                _r = row[-1]
                if _ssb != '' and  _r != '':
                    ssb += [float(_ssb)]
                    r += [float(_r)]
    max_r = max(r)
    n = len(r)
    print(json.dumps({'ssb': ssb, 'r': r, 'max_r': max_r, 'n': n}))
    file_stockids = os.path.join(os.path.dirname(csvfile), 'stockids.txt')
    if not os.path.exists(file_stockids):
        with open(file_stockids, 'w') as handle:
            handle.write('\n'.join(sorted(stockids)))
        print("Also wrote stock IDs to {0}".format(file_stockids),
              file=sys.stderr)

Model code bh_model.stan:

/*
Fit a Beverton-Holt stock recruitment relationship to fishery stock
spawning biomass using steepness parameterization from Dorn (2002) as
described in detail at
https://www.weirdfishes.blog/blog/fitting-bayesian-models-with-stan-and-r/
*/

data {
  int<lower = 0> n;         /* Number of observations. */
  vector[n] ssb;            /* Observed fish stock spawning biomass */
  vector[n] r;              /* Number of recruits. */
  real max_r;               /* Maximum observed recruitment. */
}

transformed data {
  vector[n] log_r;
  log_r = log(r);
}

parameters {
  real<lower = 0.2, upper = 1> h; /* Steepness. */
  real<lower = 0> alpha;          /* Maximum recruitment. */
  real<lower = 0> sigma;          /* Recruitment standard deviation. */
}

transformed parameters {
  vector[n] rhat;               /* Estimate of actual recruitment. */
  vector[n] log_rhat;
  /* Beverton Holt model. */
  rhat =
    (0.8 * alpha *  h * ssb) ./
    (0.2 * alpha * (1 - h) + (h - 0.2) * ssb);
  log_rhat = log(rhat);
}

model {
  log_r ~ normal(log_rhat -
                 0.5 * sigma^2, /* Account for retransformation bias. */
                 sigma);
  alpha ~ normal(2 * max_r, 0.2 * max_r);
  sigma ~ cauchy(0, 2.5);
}

generated quantities {
  vector[n] pp_rhat;
  for (i in 1:n) {
    /* Generate posterior predictives. */
    pp_rhat[i] = exp(normal_rng(log_rhat[i] - 0.5 * sigma^2, sigma));
  }
}

Job Array script

Consider this simple job array script, which we will save as submit-job-array.slurm:

#!/bin/bash
#SBATCH --partition debug
#SBATCH --ntasks 1
#SBATCH --array 1-5

# Run parameter index set by the SLURM array index.
bash model_fit.bash ${SLURM_ARRAY_TASK_ID}

As you can see, the submission script is short and most of the work is done in model_fit.bash:

# Read the index for the model input from the command line.
param_idx=$1
# Make sure the index is an integer.
if ! [[ "${param_idx}" =~ ^[0-9]+$ ]]
then
    echo >&2 "Error: Parameter index \"${param_idx}\" must be an integer!"
    exit 1
fi

# Generate the parameters JSON file for this index.
idfile=stockids.txt
stockid=$(sed -n ${param_idx}p ${idfile})
if [[ -z "${stockid}" ]]
then
    echo >&2 "Error: Parameter index \"${param_idx}\" does not exist in ${idfile}!"
    exit 1
fi
csvfile=rlsadb_v4.25_ssb_recruits.csv
datafile=${param_idx}-data.json
python to_json.py ${csvfile} ${stockid} > ${datafile}

# Fit the model to the parameters.
outputfile=${param_idx}-output.csv
samplefile=${param_idx}-output.txt
./bh_model \
    sample num_samples=5000 \
    data file=${datafile} \
    output file=${outputfile} | tee ${samplefile}
summaryfile=${param_idx}-summary.txt
../../bin/stansummary ${outputfile} | tee ${summaryfile}
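
The comparison table above lists job arrays as only manually resumable. One way to get that behavior, sketched below as an optional addition that is not part of the original script, is to skip any index whose summary file already exists, so that resubmitting the array only redoes the incomplete indices:

# Optional resumability guard (add near the top of model_fit.bash,
# after param_idx has been validated).
summaryfile=${param_idx}-summary.txt
if [[ -s "${summaryfile}" ]]
then
    echo "Index ${param_idx} already produced ${summaryfile}; nothing to do."
    exit 0
fi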

You can submit the job and see the results:

sbatch submit-job-array.slurm
# After the job completes ...
tail -n 4 *-summary.txt
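
If some array indices fail, for example because of a node failure or a time limit, there is no need to rerun the whole array: sbatch accepts an explicit list of indices, so a resubmission can cover only the missing ones. The indices below are only an example:

# Rerun only array indices 2 and 4 (hypothetical failed indices).
sbatch --array 2,4 submit-job-array.slurm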

Bash jobs
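
Instead of asking SLURM for one array task per index, a single job can start several fits at once as background shell jobs with & and then wait for all of them to finish. The script below is a minimal sketch under the same assumptions as the job array example (it runs from the examples/beverton_holt directory and reuses model_fit.bash); the file name submit-bash-jobs.slurm is just a suggestion:

#!/bin/bash
#SBATCH --partition debug
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 5

# Start the 5 fits as background jobs on a single node ...
for param_idx in 1 2 3 4 5
do
    bash model_fit.bash ${param_idx} &
done
# ... and wait until all of them have finished.
wait

As the comparison table notes, this approach is limited to the CPUs of a single node and has no built-in way to resume interrupted work.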

xargs
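
xargs can drive the same per-index script while capping how many fits run at once with -P, which is convenient when there are more indices than allocated CPUs. A minimal sketch, again assuming model_fit.bash and a single-node allocation with 5 CPUs:

# Run one fit per index, keeping at most 5 running at a time.
seq 1 5 | xargs -n 1 -P 5 bash model_fit.bash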

GNU Parallel
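
GNU Parallel offers the same one-line convenience as xargs, but its job log makes interrupted runs resumable, and with options such as --sshloginfile it can spread work across the nodes of a multi-node allocation. The sketch below is single-node; the job log name is arbitrary, and whether GNU Parallel is available as a module on the cluster is an assumption:

# module load parallel   # if GNU Parallel is provided as a module (assumption)
# Run the 5 fits, at most 5 at a time, recording progress in a job log.
parallel --jobs 5 --joblog fits.joblog bash model_fit.bash {} ::: 1 2 3 4 5
# If the run is interrupted, --resume skips indices already recorded as
# finished in the job log.
parallel --resume --jobs 5 --joblog fits.joblog bash model_fit.bash {} ::: 1 2 3 4 5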