TEST: use tensorboard for stuff

Zhengyi Chen 2024-03-04 20:32:26 +00:00
parent 2d31162c58
commit 524ee03187
10 changed files with 212 additions and 12 deletions

.gitignore vendored (2 changes)

@@ -1,7 +1,9 @@
 baseline-experiments/
+pretrained/
 synchronous/
 npydata/
 **/__pycache__/**
 slurm-*
 save/
+save_file/
 .vscode/

_DDPShA-train.sh (new file, 36 lines)

@@ -0,0 +1,36 @@
#!/bin/sh
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --partition=Teach-Standard
#SBATCH --gres=gpu:4
#SBATCH --mem=24000
#SBATCH --time=3-00:00:00
set -e
export CUDA_HOME=/opt/cuda-9.0.176.1/
export CUDNN_HOME=/opt/cuDNN-7.0/
export STUDENT_ID=$(whoami)
export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
export CPATH=${CUDNN_HOME}/include:$CPATH
export PATH=${CUDA_HOME}/bin:${PATH}
export PYTHON_PATH=$PATH
mkdir -p /disk/scratch/${STUDENT_ID}
export TMPDIR=/disk/scratch/${STUDENT_ID}/
export TMP=/disk/scratch/${STUDENT_ID}/
source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
python train.py \
--pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
--model 'stn' \
--save_path ./save/DDP-ShanghaiA-stn-$(date -Iminutes) \
--batch_size 4 \
--use_ddp True \
--ddp_world_size 4 \
--gpus 0,1,2,3 \
--print_freq 100
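A note on --use_ddp True: if the flag is declared with type=bool in arguments.py, argparse coerces any non-empty string to True, so --use_ddp False would also enable DDP. A minimal sketch of the usual workaround, assuming that declaration style (the str2bool helper is illustrative, not part of this repo):

    import argparse

    def str2bool(v: str) -> bool:
        # argparse's type=bool would treat any non-empty string,
        # including "False", as True; parse the string explicitly instead.
        if v.lower() in ("yes", "true", "t", "1"):
            return True
        if v.lower() in ("no", "false", "f", "0"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean, got {v!r}")

    parser = argparse.ArgumentParser()
    parser.add_argument("--use_ddp", type=str2bool, default=False)
    parser.add_argument("--ddp_world_size", type=int, default=1)
    args = parser.parse_args(["--use_ddp", "True", "--ddp_world_size", "4"])
    assert args.use_ddp is True and args.ddp_world_size == 4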

_DDPShB-base-train.sh (new file, 36 lines)

@@ -0,0 +1,36 @@
#!/bin/sh
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --partition=Teach-Standard
#SBATCH --gres=gpu:4
#SBATCH --mem=24000
#SBATCH --time=3-00:00:00
set -e
export CUDA_HOME=/opt/cuda-9.0.176.1/
export CUDNN_HOME=/opt/cuDNN-7.0/
export STUDENT_ID=$(whoami)
export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
export CPATH=${CUDNN_HOME}/include:$CPATH
export PATH=${CUDA_HOME}/bin:${PATH}
export PYTHON_PATH=$PATH
mkdir -p /disk/scratch/${STUDENT_ID}
export TMPDIR=/disk/scratch/${STUDENT_ID}/
export TMP=/disk/scratch/${STUDENT_ID}/
source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
python train.py \
--use_ddp True \
--ddp_world_size 4 \
--pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
--train_dataset 'ShanghaiB' \
--save_path ./save/DDP-ShanghaiB-base-$(date -Iminutes) \
--batch_size 4 \
--gpus 0,1,2,3 \
--print_freq 100

(modified file, name not shown)

@@ -2,7 +2,7 @@
 #SBATCH -N 1
 #SBATCH -n 1
 #SBATCH --partition=Teach-Standard
-#SBATCH --gres=gpu:6
+#SBATCH --gres=gpu:4
 #SBATCH --mem=24000
 #SBATCH --time=3-00:00:00

@@ -25,9 +25,9 @@ export TMP=/disk/scratch/${STUDENT_ID}/
 source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
 python train.py \
---debug True \
---model 'stn' \
---save_path ./save_file/ShanghaiA \
+--pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+--save_path ./save/ShanghaiA-base-$(date -Iminutes) \
 --batch_size 4 \
---gpus 0,1,2,3,4,5 \
+--gpus 0,1,2,3 \
 --print_freq 100

_ShA-train.sh (new file, 34 lines)

@@ -0,0 +1,34 @@
#!/bin/sh
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --partition=Teach-Standard
#SBATCH --gres=gpu:4
#SBATCH --mem=24000
#SBATCH --time=3-00:00:00
set -e
export CUDA_HOME=/opt/cuda-9.0.176.1/
export CUDNN_HOME=/opt/cuDNN-7.0/
export STUDENT_ID=$(whoami)
export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
export CPATH=${CUDNN_HOME}/include:$CPATH
export PATH=${CUDA_HOME}/bin:${PATH}
export PYTHON_PATH=$PATH
mkdir -p /disk/scratch/${STUDENT_ID}
export TMPDIR=/disk/scratch/${STUDENT_ID}/
export TMP=/disk/scratch/${STUDENT_ID}/
source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
python train.py \
--pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
--model 'stn' \
--save_path ./save/ShanghaiA-stn-$(date -Iminutes) \
--batch_size 4 \
--gpus 0,1,2,3 \
--print_freq 100

_ShB-base-train.sh (new file, 34 lines)

@@ -0,0 +1,34 @@
#!/bin/sh
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --partition=Teach-Standard
#SBATCH --gres=gpu:4
#SBATCH --mem=24000
#SBATCH --time=3-00:00:00
set -e
export CUDA_HOME=/opt/cuda-9.0.176.1/
export CUDNN_HOME=/opt/cuDNN-7.0/
export STUDENT_ID=$(whoami)
export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
export CPATH=${CUDNN_HOME}/include:$CPATH
export PATH=${CUDA_HOME}/bin:${PATH}
export PYTHON_PATH=$PATH
mkdir -p /disk/scratch/${STUDENT_ID}
export TMPDIR=/disk/scratch/${STUDENT_ID}/
export TMP=/disk/scratch/${STUDENT_ID}/
source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
python train.py \
--pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
--train_dataset 'ShanghaiB' \
--save_path ./save/ShanghaiB-base-$(date -Iminutes) \
--batch_size 4 \
--gpus 0,1,2,3 \
--print_freq 100

_ShB-train.sh (new file, 35 lines)

@@ -0,0 +1,35 @@
#!/bin/sh
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --partition=Teach-Standard
#SBATCH --gres=gpu:4
#SBATCH --mem=24000
#SBATCH --time=3-00:00:00
set -e
export CUDA_HOME=/opt/cuda-9.0.176.1/
export CUDNN_HOME=/opt/cuDNN-7.0/
export STUDENT_ID=$(whoami)
export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
export CPATH=${CUDNN_HOME}/include:$CPATH
export PATH=${CUDA_HOME}/bin:${PATH}
export PYTHON_PATH=$PATH
mkdir -p /disk/scratch/${STUDENT_ID}
export TMPDIR=/disk/scratch/${STUDENT_ID}/
export TMP=/disk/scratch/${STUDENT_ID}/
source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
python train.py \
--pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
--model 'stn' \
--train_dataset 'ShanghaiB' \
--save_path ./save/ShanghaiB-stn-$(date -Iminutes) \
--batch_size 4 \
--gpus 0,1,2,3 \
--print_freq 100

(modified file, name not shown)

@@ -25,6 +25,7 @@ import cv2
 import scipy.io as io
 import scipy.sparse as sparse
 import h5py
+from tqdm.auto import tqdm

 CWD = os.getcwd()

@@ -51,7 +52,7 @@ def pre_dataset_sh():
     # np.random.seed(0)
     # random.seed(0)
-    for img_path in img_paths:
+    for _, img_path in tqdm(img_paths, desc="Preprocessing Data"):
         img_data = cv2.imread(img_path)
         mat = io.loadmat(
             img_path

@@ -169,4 +170,5 @@ def make_npydata():
 if __name__ == "__main__":
     # Download manually...
     pre_dataset_sh()  # XXX: preliminary
+    print("Storing dataset paths...")
     make_npydata()
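One thing to double-check in the loop change above: for _, img_path in tqdm(img_paths, ...) unpacks two values per element, which only succeeds if img_paths yields pairs (for example from enumerate). A minimal sketch of both forms, assuming img_paths is a flat list of path strings (the paths below are hypothetical):

    from tqdm.auto import tqdm

    img_paths = ["IMG_1.jpg", "IMG_2.jpg"]  # hypothetical stand-ins

    # Plain progress bar over the paths themselves:
    for img_path in tqdm(img_paths, desc="Preprocessing Data"):
        pass  # cv2.imread(img_path), ...

    # If an index is wanted, wrap with enumerate so the two-value
    # unpacking in the committed loop is valid; tqdm needs total
    # because enumerate has no len().
    for _, img_path in tqdm(enumerate(img_paths), total=len(img_paths),
                            desc="Preprocessing Data"):
        pass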

(modified file, name not shown)

@@ -9,6 +9,8 @@ import torch
 import torch.nn as nn
 import torch.multiprocessing as torch_mp
 from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+import torchvision
 import nni
 import logging
 import numpy as np

@@ -17,10 +19,11 @@ from model.transcrowd_gap import VisionTransformerGAP
 from arguments import args, ret_args
 import dataset
 from dataset import *
-from model.transcrowd_gap import base_patch16_384_gap, stn_patch16_384_gap
+from model.transcrowd_gap import *
 from checkpoint import save_checkpoint

 logger = logging.getLogger("train")
+writer = SummaryWriter(args.save_path + "/tensorboard-run")

 def setup_process_group(
     rank: int,
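Note that writer is built at module import time, so under DDP every spawned worker opens its own SummaryWriter on the same directory. A common alternative (a sketch only, not what this commit does) is to create the writer lazily on rank 0 and skip logging elsewhere:

    from torch.utils.tensorboard import SummaryWriter

    writer = None  # created per process once the rank is known

    def get_writer(rank: int, save_path: str):
        # Only rank 0 writes event files; other ranks get None.
        global writer
        if rank == 0 and writer is None:
            writer = SummaryWriter(save_path + "/tensorboard-run")
        return writer

    # hypothetical usage inside the training loop:
    # w = get_writer(rank, args.save_path)
    # if w is not None:
    #     w.add_scalar("Composite loss (train)", loss.item(), step)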
@@ -196,7 +199,7 @@ def worker(rank: int, args: Namespace):
         # Validate
         if epoch % 5 == 0 or args.debug:
-            prec1 = valid_one_epoch(test_loader, model, device, args)
+            prec1 = valid_one_epoch(test_loader, model, device, epoch, args)
             end_valid = time.time()
             is_best = prec1 < args.best_pred
             args.best_pred = min(prec1, args.best_pred)

@@ -255,11 +258,14 @@
             gt_count_whole = gt_count_whole.cuda()
             device_type = "cuda"

+        # Desperate measure to reduce mem footprint...
         with torch.autocast(device_type):
             # fpass
             out, gt_count = model(img, kpoint)
             # loss
             loss = criterion(out, gt_count)  # wrt. transformer
+            writer.add_scalar("L1-loss wrt. xformer (train)", loss, epoch * i)
             loss += (
                 F.mse_loss(  # stn: info retainment
                     gt_count.view(batch_size, -1).sum(axis=1, keepdim=True),

@@ -270,6 +276,7 @@
                     value=loss.item()
                 )
             )
+            writer.add_scalar("Composite loss (train)", loss, epoch * i)

         # free grad from mem
         optimizer.zero_grad(set_to_none=True)
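Both add_scalar calls above use epoch * i as the global step, which is not monotonic across epochs: every batch of epoch 0 maps to step 0, and late batches of one epoch land past early batches of the next. A strictly increasing step is usually epoch * len(loader) + i; a minimal self-contained check (the loader length is a stand-in, the diff does not show the loader's name):

    # One step per batch, strictly increasing across epochs, assuming
    # i is the batch index within the epoch.
    num_batches = 100  # stand-in for len(train_loader)
    steps = [epoch * num_batches + i
             for epoch in range(3)
             for i in range(num_batches)]
    # strictly increasing, no collisions across epochs:
    assert steps == sorted(set(steps))
    # in the loop this would be:
    # writer.add_scalar("L1-loss wrt. xformer (train)", loss.item(),
    #                   epoch * len(train_loader) + i)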
@@ -283,10 +290,13 @@
         if args.debug:
             break

+    # Flush writer
+    writer.flush()
     scheduler.step()

-def valid_one_epoch(test_loader, model, device, args):
+def valid_one_epoch(test_loader, model, device, epoch, args):
     print("[valid_one_epoch] Validating...")
     batch_size = 1
     model.eval()

@@ -295,6 +305,7 @@ def valid_one_epoch(test_loader, model, device, args):
     mse = .0
     visi = []
     index = 0
+    xformed = []

     for i, (fname, img, kpoint, gt_count_whole) in enumerate(test_loader):
         kpoint = kpoint.type(torch.FloatTensor)

@@ -324,6 +335,10 @@
         mse += diff ** 2

         if i % 5 == 0:
+            if isinstance(model, STNet_VisionTransformerGAP):
+                with torch.no_grad():
+                    img_xformed = model.stnet(img).to("cpu")
+                    xformed.append(img_xformed)
             print("[valid_one_epoch] {} | Gt {:.2f} Pred {:.4f} |".format(
                 fname[0],
                 torch.sum(gt_count_whole).item(),

@@ -332,10 +347,16 @@
     mae = mae * 1.0 / (len(test_loader) * batch_size)
     mse = np.sqrt(mse / (len(test_loader)) * batch_size)
+    writer.add_scalar("MAE (valid)", mae, epoch)
+    writer.add_scalar("MSE (valid)", mse, epoch)
+    if len(xformed) != 0:
+        img_grid = torchvision.utils.make_grid(xformed)
+        writer.add_image("STN: transformed image", img_grid, epoch)
     nni.report_intermediate_result(mae)
     print("* MAE {mae:.3f} | MSE {mse:.3f} *".format(
         mae=mae, mse=mse
     ))
+    writer.flush()
     return mae
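One caveat on the grid above: torchvision.utils.make_grid accepts a single 4D batch tensor or a list of equally sized 3D images, while each img_xformed appended during validation is still batched as [B, C, H, W]. Concatenating first keeps the shapes valid; a minimal runnable sketch, assuming the STN outputs share a spatial size (log dir and tensors are stand-ins):

    import torch
    import torchvision
    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter("/tmp/tb-demo")  # hypothetical log dir
    epoch = 0

    # Stand-ins for the [B, C, H, W] tensors collected during validation.
    xformed = [torch.rand(1, 3, 64, 64) for _ in range(4)]

    batch = torch.cat(xformed, dim=0)              # -> [4, 3, 64, 64]
    img_grid = torchvision.utils.make_grid(batch)  # single [3, H', W'] image
    writer.add_image("STN: transformed image", img_grid, epoch)
    writer.flush()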
@@ -353,7 +374,7 @@ if __name__ == "__main__":
             worker,
             args=(combined_params, ),  # rank supplied at callee as 1st param
             # also above *has* to be 1-tuple else runtime expands Namespace.
-            nprocs=combined_params.world_size,
+            nprocs=combined_params.ddp_world_size,
         )
     else:
         # No DDP, run in current thread
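For reference, torch.multiprocessing.spawn invokes worker(rank, *args) with the process index injected as the first positional argument, which is why the args tuple above must stay a 1-tuple and why nprocs now has to match --ddp_world_size. A minimal self-contained sketch of the launch shape:

    from argparse import Namespace
    import torch.multiprocessing as torch_mp

    def worker(rank: int, args: Namespace):
        # spawn injects rank; args is the Namespace from the 1-tuple.
        print(f"worker {rank} of {args.ddp_world_size}")

    if __name__ == "__main__":
        combined_params = Namespace(ddp_world_size=4)
        torch_mp.spawn(
            worker,
            args=(combined_params, ),  # must be a tuple, not a bare Namespace
            nprocs=combined_params.ddp_world_size,
        )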

util.py (new file, empty)