From 524ee031870ca1acca961417d5189326b53835d2 Mon Sep 17 00:00:00 2001
From: Zhengyi Chen
Date: Mon, 4 Mar 2024 20:32:26 +0000
Subject: [PATCH] TEST: use TensorBoard for training/validation logging

---
 .gitignore                                |  2 ++
 _DDPShA-train.sh                          | 36 +++++++++++++++++++++++
 _DDPShB-base-train.sh                     | 36 +++++++++++++++++++++++
 _ShanghaiA-train.sh => _ShA-base-train.sh | 14 ++++-----
 _ShA-train.sh                             | 34 +++++++++++++++++++++
 _ShB-base-train.sh                        | 34 +++++++++++++++++++++
 _ShB-train.sh                             | 35 ++++++++++++++++++++++
 preprocess_data.py                        |  4 ++-
 train.py                                  | 29 +++++++++++++++---
 util.py                                   |  0
 10 files changed, 212 insertions(+), 12 deletions(-)
 create mode 100644 _DDPShA-train.sh
 create mode 100644 _DDPShB-base-train.sh
 rename _ShanghaiA-train.sh => _ShA-base-train.sh (77%)
 create mode 100644 _ShA-train.sh
 create mode 100644 _ShB-base-train.sh
 create mode 100644 _ShB-train.sh
 create mode 100644 util.py

diff --git a/.gitignore b/.gitignore
index 0a910aa..c63ae15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,9 @@
 baseline-experiments/
+pretrained/
 synchronous/
 npydata/
 **/__pycache__/**
 slurm-*
 save/
+save_file/
 .vscode/
diff --git a/_DDPShA-train.sh b/_DDPShA-train.sh
new file mode 100644
index 0000000..cf865f5
--- /dev/null
+++ b/_DDPShA-train.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --partition=Teach-Standard
+#SBATCH --gres=gpu:4
+#SBATCH --mem=24000
+#SBATCH --time=3-00:00:00
+
+set -e
+
+export CUDA_HOME=/opt/cuda-9.0.176.1/
+export CUDNN_HOME=/opt/cuDNN-7.0/
+export STUDENT_ID=$(whoami)
+
+export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
+export CPATH=${CUDNN_HOME}/include:$CPATH
+export PATH=${CUDA_HOME}/bin:${PATH}
+export PYTHON_PATH=$PATH
+
+mkdir -p /disk/scratch/${STUDENT_ID}
+export TMPDIR=/disk/scratch/${STUDENT_ID}/
+export TMP=/disk/scratch/${STUDENT_ID}/
+
+source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
+
+python train.py \
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --model 'stn' \
+    --save_path ./save/DDP-ShanghaiA-stn-$(date -Iminutes) \
+    --batch_size 4 \
+    --use_ddp True \
+    --ddp_world_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+
diff --git a/_DDPShB-base-train.sh b/_DDPShB-base-train.sh
new file mode 100644
index 0000000..0899350
--- /dev/null
+++ b/_DDPShB-base-train.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --partition=Teach-Standard
+#SBATCH --gres=gpu:4
+#SBATCH --mem=24000
+#SBATCH --time=3-00:00:00
+
+set -e
+
+export CUDA_HOME=/opt/cuda-9.0.176.1/
+export CUDNN_HOME=/opt/cuDNN-7.0/
+export STUDENT_ID=$(whoami)
+
+export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
+export CPATH=${CUDNN_HOME}/include:$CPATH
+export PATH=${CUDA_HOME}/bin:${PATH}
+export PYTHON_PATH=$PATH
+
+mkdir -p /disk/scratch/${STUDENT_ID}
+export TMPDIR=/disk/scratch/${STUDENT_ID}/
+export TMP=/disk/scratch/${STUDENT_ID}/
+
+source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
+
+python train.py \
+    --use_ddp True \
+    --ddp_world_size 4 \
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --train_dataset 'ShanghaiB' \
+    --save_path ./save/DDP-ShanghaiB-base-$(date -Iminutes) \
+    --batch_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+
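Note: the two DDP scripts above rely on train.py spawning one process per GPU when `--use_ddp True` is passed. A minimal, hypothetical sketch of that launch path (the real logic lives in train.py's setup_process_group/worker, shown further down; the MASTER_ADDR/MASTER_PORT defaults here are assumptions, not taken from this patch):

    import os
    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp

    def demo_worker(rank: int, world_size: int) -> None:
        # Rendezvous endpoint: assumed defaults for a single-node job.
        os.environ.setdefault("MASTER_ADDR", "localhost")
        os.environ.setdefault("MASTER_PORT", "29500")
        dist.init_process_group("nccl", rank=rank, world_size=world_size)
        torch.cuda.set_device(rank)  # pin this process to GPU `rank`
        # ... build model, wrap in DistributedDataParallel, train ...
        dist.destroy_process_group()

    if __name__ == "__main__":
        world_size = 4  # mirrors --ddp_world_size 4 and --gres=gpu:4
        mp.spawn(demo_worker, args=(world_size,), nprocs=world_size)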
diff --git a/_ShanghaiA-train.sh b/_ShA-base-train.sh
similarity index 77%
rename from _ShanghaiA-train.sh
rename to _ShA-base-train.sh
index a1af788..0a4e8c5 100644
--- a/_ShanghaiA-train.sh
+++ b/_ShA-base-train.sh
@@ -2,7 +2,7 @@
 #SBATCH -N 1
 #SBATCH -n 1
 #SBATCH --partition=Teach-Standard
-#SBATCH --gres=gpu:6
+#SBATCH --gres=gpu:4
 #SBATCH --mem=24000
 #SBATCH --time=3-00:00:00
 
@@ -25,9 +25,9 @@ export TMP=/disk/scratch/${STUDENT_ID}/
 source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
 
 python train.py \
-    --debug True \
-    --model 'stn' \
-    --save_path ./save_file/ShanghaiA \
-    --batch_size 4 \
-    --gpus 0,1,2,3,4,5 \
-    --print_freq 100
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --save_path ./save/ShanghaiA-base-$(date -Iminutes) \
+    --batch_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+
diff --git a/_ShA-train.sh b/_ShA-train.sh
new file mode 100644
index 0000000..0a6da04
--- /dev/null
+++ b/_ShA-train.sh
@@ -0,0 +1,34 @@
+#!/bin/sh
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --partition=Teach-Standard
+#SBATCH --gres=gpu:4
+#SBATCH --mem=24000
+#SBATCH --time=3-00:00:00
+
+set -e
+
+export CUDA_HOME=/opt/cuda-9.0.176.1/
+export CUDNN_HOME=/opt/cuDNN-7.0/
+export STUDENT_ID=$(whoami)
+
+export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
+export CPATH=${CUDNN_HOME}/include:$CPATH
+export PATH=${CUDA_HOME}/bin:${PATH}
+export PYTHON_PATH=$PATH
+
+mkdir -p /disk/scratch/${STUDENT_ID}
+export TMPDIR=/disk/scratch/${STUDENT_ID}/
+export TMP=/disk/scratch/${STUDENT_ID}/
+
+source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
+
+python train.py \
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --model 'stn' \
+    --save_path ./save/ShanghaiA-stn-$(date -Iminutes) \
+    --batch_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+
diff --git a/_ShB-base-train.sh b/_ShB-base-train.sh
new file mode 100644
index 0000000..c4d0440
--- /dev/null
+++ b/_ShB-base-train.sh
@@ -0,0 +1,34 @@
+#!/bin/sh
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --partition=Teach-Standard
+#SBATCH --gres=gpu:4
+#SBATCH --mem=24000
+#SBATCH --time=3-00:00:00
+
+set -e
+
+export CUDA_HOME=/opt/cuda-9.0.176.1/
+export CUDNN_HOME=/opt/cuDNN-7.0/
+export STUDENT_ID=$(whoami)
+
+export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
+export CPATH=${CUDNN_HOME}/include:$CPATH
+export PATH=${CUDA_HOME}/bin:${PATH}
+export PYTHON_PATH=$PATH
+
+mkdir -p /disk/scratch/${STUDENT_ID}
+export TMPDIR=/disk/scratch/${STUDENT_ID}/
+export TMP=/disk/scratch/${STUDENT_ID}/
+
+source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
+
+python train.py \
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --train_dataset 'ShanghaiB' \
+    --save_path ./save/ShanghaiB-base-$(date -Iminutes) \
+    --batch_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+
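Note: all of these scripts stamp `--save_path` with `$(date -Iminutes)`, so every submission writes to a fresh directory under ./save/ and the TensorBoard events (train.py appends "/tensorboard-run" to save_path) never collide between runs. A small Python illustration of the resulting layout; the ShanghaiB-base prefix is just an example:

    from datetime import datetime, timezone

    # Same ISO-8601 to-the-minute stamp that `date -Iminutes` produces.
    stamp = datetime.now(timezone.utc).isoformat(timespec="minutes")
    save_path = f"./save/ShanghaiB-base-{stamp}"  # e.g. ./save/ShanghaiB-base-2024-03-04T20:32+00:00
    log_dir = save_path + "/tensorboard-run"      # where train.py's SummaryWriter writes
    print(log_dir)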
diff --git a/_ShB-train.sh b/_ShB-train.sh
new file mode 100644
index 0000000..46fd063
--- /dev/null
+++ b/_ShB-train.sh
@@ -0,0 +1,35 @@
+#!/bin/sh
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --partition=Teach-Standard
+#SBATCH --gres=gpu:4
+#SBATCH --mem=24000
+#SBATCH --time=3-00:00:00
+
+set -e
+
+export CUDA_HOME=/opt/cuda-9.0.176.1/
+export CUDNN_HOME=/opt/cuDNN-7.0/
+export STUDENT_ID=$(whoami)
+
+export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
+export CPATH=${CUDNN_HOME}/include:$CPATH
+export PATH=${CUDA_HOME}/bin:${PATH}
+export PYTHON_PATH=$PATH
+
+mkdir -p /disk/scratch/${STUDENT_ID}
+export TMPDIR=/disk/scratch/${STUDENT_ID}/
+export TMP=/disk/scratch/${STUDENT_ID}/
+
+source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
+
+python train.py \
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --model 'stn' \
+    --train_dataset 'ShanghaiB' \
+    --save_path ./save/ShanghaiB-stn-$(date -Iminutes) \
+    --batch_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+
diff --git a/preprocess_data.py b/preprocess_data.py
index 5bfe122..8bcc7e2 100644
--- a/preprocess_data.py
+++ b/preprocess_data.py
@@ -25,6 +25,7 @@ import cv2
 import scipy.io as io
 import scipy.sparse as sparse
 import h5py
+from tqdm.auto import tqdm
 
 CWD = os.getcwd()
 
@@ -51,7 +52,7 @@ def pre_dataset_sh():
     # np.random.seed(0)
     # random.seed(0)
 
-    for img_path in img_paths:
+    for img_path in tqdm(img_paths, desc="Preprocessing Data"):
         img_data = cv2.imread(img_path)
         mat = io.loadmat(
             img_path
@@ -169,4 +170,5 @@ def make_npydata():
 if __name__ == "__main__":
     # Download manually...
     pre_dataset_sh()  # XXX: preliminary
+    print("Storing dataset paths...")
     make_npydata()
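Note: the train.py hunks below log per-batch scalars to TensorBoard. add_scalar wants a global step that strictly increases across epochs, otherwise the curves fold back on themselves; the convention used below is `epoch * len(train_loader) + i` (the `train_loader` name is an assumption, since the loader argument sits outside the hunk context). A self-contained toy demo of the bookkeeping:

    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter("save/example-run/tensorboard-run")  # hypothetical log dir
    steps_per_epoch = 100  # stand-in for len(train_loader)
    for epoch in range(3):
        for i in range(steps_per_epoch):
            step = epoch * steps_per_epoch + i  # strictly increasing across epochs
            writer.add_scalar("Composite loss (train)", 1.0 / (step + 1), step)
    writer.flush()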
xformer (train)", loss, epoch * i) + loss += ( F.mse_loss( # stn: info retainment gt_count.view(batch_size, -1).sum(axis=1, keepdim=True), @@ -270,6 +276,7 @@ def train_one_epoch( value=loss.item() ) ) + writer.add_scalar("Composite loss (train)", loss, epoch * i) # free grad from mem optimizer.zero_grad(set_to_none=True) @@ -283,10 +290,13 @@ def train_one_epoch( if args.debug: break + # Flush writer + writer.flush() + scheduler.step() -def valid_one_epoch(test_loader, model, device, args): +def valid_one_epoch(test_loader, model, device, epoch, args): print("[valid_one_epoch] Validating...") batch_size = 1 model.eval() @@ -295,6 +305,7 @@ def valid_one_epoch(test_loader, model, device, args): mse = .0 visi = [] index = 0 + xformed = [] for i, (fname, img, kpoint, gt_count_whole) in enumerate(test_loader): kpoint = kpoint.type(torch.FloatTensor) @@ -324,6 +335,10 @@ def valid_one_epoch(test_loader, model, device, args): mse += diff ** 2 if i % 5 == 0: + if isinstance(model, STNet_VisionTransformerGAP): + with torch.no_grad(): + img_xformed = model.stnet(img).to("cpu") + xformed.append(img_xformed) print("[valid_one_epoch] {} | Gt {:.2f} Pred {:.4f} |".format( fname[0], torch.sum(gt_count_whole).item(), @@ -332,10 +347,16 @@ def valid_one_epoch(test_loader, model, device, args): mae = mae * 1.0 / (len(test_loader) * batch_size) mse = np.sqrt(mse / (len(test_loader)) * batch_size) + writer.add_scalar("MAE (valid)", mae, epoch) + writer.add_scalar("MSE (valid)", mse, epoch) + if len(xformed) != 0: + img_grid = torchvision.utils.make_grid(xformed) + writer.add_image("STN: transformed image", img_grid, epoch) nni.report_intermediate_result(mae) print("* MAE {mae:.3f} | MSE {mse:.3f} *".format( mae=mae, mse=mse )) + writer.flush() return mae @@ -353,7 +374,7 @@ if __name__ == "__main__": worker, args=(combined_params, ), # rank supplied at callee as 1st param # also above *has* to be 1-tuple else runtime expands Namespace. - nprocs=combined_params.world_size, + nprocs=combined_params.ddp_world_size, ) else: # No DDP, run in current thread diff --git a/util.py b/util.py new file mode 100644 index 0000000..e69de29