Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
#!/bin/bash
# Slurm batch script: runs `dp train input.json` through `horovodrun` with
# two worker processes on a single node with two A100 GPUs.
#
# Resource request. The task count equals the number of requested GPUs and
# the `-np` value given to horovodrun below; combined with
# --gpus-per-task=1, a larger task count would ask for more GPUs than
# --gres provides.
#SBATCH -N 1
#SBATCH -t 2:00:00
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=32
#SBATCH --gpus-per-task=1
#SBATCH --gres=gpu:a100:2
#SBATCH -p gpu

# Alternative for small/short test jobs (remove one leading '#' to enable), see
# https://upb-pc2.atlassian.net/wiki/spaces/PC2DOK/pages/1902952/Running+Compute+Jobs#Using-GPUs-for-Development-and-Testing-Purposes
##SBATCH -p dgx
##SBATCH -q devel

# OpenMP: one thread per CPU that Slurm assigned to each task, pinned to cores.
export OMP_NUM_THREADS="$SLURM_CPUS_PER_TASK"
export OMP_PLACES=cores
export OMP_PROC_BIND=true

# TensorFlow thread pools: 2 inter-op threads, the remaining threads of the
# per-task budget go to intra-op parallelism.
export TF_INTER_OP_PARALLELISM_THREADS=2
export TF_INTRA_OP_PARALLELISM_THREADS=$(( OMP_NUM_THREADS - TF_INTER_OP_PARALLELISM_THREADS ))

# Start from the default module environment, then load toolchain, Python and CUDA.
module reset
module load toolchain/foss/2022a
module load lang/Python/3.10.4-GCCcore-11.3.0
module load system/CUDA/12.6.0

# Replace "[project name]" with the name of your own project directory.
source /pc2/groups/[project name]/venv_deepmd_horovod/bin/activate
# NOTE(review): srun runs its command once per Slurm task -- confirm that
# wrapping horovodrun in srun is intended on this cluster.
srun horovodrun -np 2 dp train input.json