PyTorch
对于 PyTorch 等机器学习软件，我们推荐用户将其安装在用户目录或组共享目录下：
# Make the cluster-provided Anaconda distribution available in this shell.
module load anaconda/2022.5
# Enter the base environment first so the `conda` command is on PATH.
source activate base
# Create a dedicated Python 3.10 environment named `torch-env`.
conda create --name torch-env python=3.10
# Switch into the new environment so the install below targets it.
source activate torch-env
# Install PyTorch built against CUDA 11.8; the `pytorch` channel is listed
# before `nvidia`, matching the channel priority of the original command.
conda install --channel pytorch --channel nvidia \
  pytorch torchvision torchaudio pytorch-cuda=11.8
在使用时可参考如下脚本示例:
/public/slurmscript_demo/pytorch.slurm
#!/bin/bash
# Example Slurm batch script: requests one GPU on a single node, logs the
# allocation details, then runs a PyTorch GPU check (testgpu.py) inside the
# `torch-env` conda environment.
# Replace the [budget] and [qos] placeholders below with real values before
# submitting.
#SBATCH --job-name=lp_job_test # Job name
#SBATCH --output=testSlurmJob.%j.out # Stdout (%j expands to jobId)
#SBATCH --error=testSlurmJob.%j.err # Stderr (%j expands to jobId)
#SBATCH --nodes=1 # Maximum number of nodes to be allocated
#SBATCH --ntasks-per-node=1 # Maximum number of tasks on each node
#SBATCH --account=[budget] # Account name
#SBATCH --partition=gpu # Partition name, use `gpu` for GPU
#SBATCH --qos=[qos] # QOS name
#SBATCH --gres=gpu:1 # Apply for 1 GPU card
#SBATCH --parsable

# Log the GPU state and which devices were handed to this job.
nvidia-smi
echo "CUDA_VISIBLE_DEVICES = $CUDA_VISIBLE_DEVICES"

# Record the execution host in ./hostfile (overwritten on every run).
hostname >./hostfile

# Job summary, written to the --output file.
echo "Date = $(date)"
echo "Hostname = $(hostname -s)"
echo "Working Directory = $(pwd)"
echo ""
echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES"
echo "Number of Tasks Allocated = $SLURM_NTASKS"
# SLURM_CPUS_PER_TASK is only exported when --cpus-per-task is requested,
# which this script does not do; fall back to 1 (sbatch's documented default
# of one processor per task) instead of printing an empty value.
echo "Number of Cores/Task Allocated = ${SLURM_CPUS_PER_TASK:-1}"
echo "SLURM_NPROCS = $SLURM_NPROCS"

# Abort the job immediately if the Python environment cannot be set up;
# running the workload with the wrong interpreter would only produce
# misleading errors.
set -e
module load anaconda/2022.5
source activate torch-env
# Back to non-fatal mode: from here on the job's exit status is simply that
# of the last command executed.
set +e

python -V
python testgpu.py
#python run_baselines.py