TEST: use tensorboard for stuff
parent 2d31162c58
commit 524ee03187

10 changed files with 212 additions and 12 deletions
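
Note: the heart of this commit is TensorBoard logging in train.py via torch.utils.tensorboard.SummaryWriter: per-batch training losses, per-epoch validation MAE/MSE, and grids of STN-transformed images. A minimal, hedged sketch of that pattern (the log directory and tag below are illustrative stand-ins, not the repo's exact values):

    # Minimal SummaryWriter sketch; path and tag are illustrative stand-ins.
    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter("save/example-run/tensorboard-run")
    for step in range(100):
        loss = 1.0 / (step + 1)          # placeholder scalar
        writer.add_scalar("loss/train", loss, step)
    writer.flush()                        # push pending events to disk
    writer.close()

Events can then be browsed with `tensorboard --logdir save/example-run`.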

.gitignore (vendored, 2 changed lines)
@@ -1,7 +1,9 @@
 baseline-experiments/
+pretrained/
 synchronous/
 npydata/
 **/__pycache__/**
 slurm-*
 save/
+save_file/
 .vscode/

_DDPShA-train.sh (new file, 36 lines)
@@ -0,0 +1,36 @@
+#!/bin/sh
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --partition=Teach-Standard
+#SBATCH --gres=gpu:4
+#SBATCH --mem=24000
+#SBATCH --time=3-00:00:00
+
+set -e
+
+export CUDA_HOME=/opt/cuda-9.0.176.1/
+export CUDNN_HOME=/opt/cuDNN-7.0/
+export STUDENT_ID=$(whoami)
+
+export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
+export CPATH=${CUDNN_HOME}/include:$CPATH
+export PATH=${CUDA_HOME}/bin:${PATH}
+export PYTHON_PATH=$PATH
+
+mkdir -p /disk/scratch/${STUDENT_ID}
+export TMPDIR=/disk/scratch/${STUDENT_ID}/
+export TMP=/disk/scratch/${STUDENT_ID}/
+
+source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
+
+python train.py \
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --model 'stn' \
+    --save_path ./save/DDP-ShanghaiA-stn-$(date -Iminutes) \
+    --batch_size 4 \
+    --use_ddp True \
+    --ddp_world_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+

_DDPShB-base-train.sh (new file, 36 lines)
@@ -0,0 +1,36 @@
+#!/bin/sh
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --partition=Teach-Standard
+#SBATCH --gres=gpu:4
+#SBATCH --mem=24000
+#SBATCH --time=3-00:00:00
+
+set -e
+
+export CUDA_HOME=/opt/cuda-9.0.176.1/
+export CUDNN_HOME=/opt/cuDNN-7.0/
+export STUDENT_ID=$(whoami)
+
+export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
+export CPATH=${CUDNN_HOME}/include:$CPATH
+export PATH=${CUDA_HOME}/bin:${PATH}
+export PYTHON_PATH=$PATH
+
+mkdir -p /disk/scratch/${STUDENT_ID}
+export TMPDIR=/disk/scratch/${STUDENT_ID}/
+export TMP=/disk/scratch/${STUDENT_ID}/
+
+source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
+
+python train.py \
+    --use_ddp True \
+    --ddp_world_size 4 \
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --train_dataset 'ShanghaiB' \
+    --save_path ./save/DDP-ShanghaiB-base-$(date -Iminutes) \
+    --batch_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+
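
The two DDP scripts above pass --use_ddp True and --ddp_world_size 4 alongside --gres=gpu:4; per the train.py hunk at the bottom of this diff, the world size feeds torch.multiprocessing.spawn. A hedged sketch of that launch shape (the worker body here is a stand-in; only the call shape is taken from the diff):

    # Hedged sketch of the spawn pattern train.py uses; the worker body
    # is illustrative, not the repo's.
    from argparse import Namespace
    import torch.multiprocessing as torch_mp

    def worker(rank: int, args: Namespace):
        # spawn supplies each process's rank as the first positional arg
        print(f"worker {rank} / {args.ddp_world_size}")

    if __name__ == "__main__":
        params = Namespace(ddp_world_size=4)
        # args must be a tuple; it is appended after the rank
        torch_mp.spawn(worker, args=(params, ), nprocs=params.ddp_world_size)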

(modified training script; filename not shown in the rendered diff)
@@ -2,7 +2,7 @@
 #SBATCH -N 1
 #SBATCH -n 1
 #SBATCH --partition=Teach-Standard
-#SBATCH --gres=gpu:6
+#SBATCH --gres=gpu:4
 #SBATCH --mem=24000
 #SBATCH --time=3-00:00:00
 
@@ -25,9 +25,9 @@ export TMP=/disk/scratch/${STUDENT_ID}/
 source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
 
 python train.py \
-    --debug True \
-    --model 'stn' \
-    --save_path ./save_file/ShanghaiA \
-    --batch_size 4 \
-    --gpus 0,1,2,3,4,5 \
-    --print_freq 100
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --save_path ./save/ShanghaiA-base-$(date -Iminutes) \
+    --batch_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100

_ShA-train.sh (new file, 34 lines)
@@ -0,0 +1,34 @@
+#!/bin/sh
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --partition=Teach-Standard
+#SBATCH --gres=gpu:4
+#SBATCH --mem=24000
+#SBATCH --time=3-00:00:00
+
+set -e
+
+export CUDA_HOME=/opt/cuda-9.0.176.1/
+export CUDNN_HOME=/opt/cuDNN-7.0/
+export STUDENT_ID=$(whoami)
+
+export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
+export CPATH=${CUDNN_HOME}/include:$CPATH
+export PATH=${CUDA_HOME}/bin:${PATH}
+export PYTHON_PATH=$PATH
+
+mkdir -p /disk/scratch/${STUDENT_ID}
+export TMPDIR=/disk/scratch/${STUDENT_ID}/
+export TMP=/disk/scratch/${STUDENT_ID}/
+
+source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
+
+python train.py \
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --model 'stn' \
+    --save_path ./save/ShanghaiA-stn-$(date -Iminutes) \
+    --batch_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+

_ShB-base-train.sh (new file, 34 lines)
@@ -0,0 +1,34 @@
+#!/bin/sh
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --partition=Teach-Standard
+#SBATCH --gres=gpu:4
+#SBATCH --mem=24000
+#SBATCH --time=3-00:00:00
+
+set -e
+
+export CUDA_HOME=/opt/cuda-9.0.176.1/
+export CUDNN_HOME=/opt/cuDNN-7.0/
+export STUDENT_ID=$(whoami)
+
+export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
+export CPATH=${CUDNN_HOME}/include:$CPATH
+export PATH=${CUDA_HOME}/bin:${PATH}
+export PYTHON_PATH=$PATH
+
+mkdir -p /disk/scratch/${STUDENT_ID}
+export TMPDIR=/disk/scratch/${STUDENT_ID}/
+export TMP=/disk/scratch/${STUDENT_ID}/
+
+source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
+
+python train.py \
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --train_dataset 'ShanghaiB' \
+    --save_path ./save/ShanghaiB-base-$(date -Iminutes) \
+    --batch_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+

_ShB-train.sh (new file, 35 lines)
@@ -0,0 +1,35 @@
+#!/bin/sh
+#SBATCH -N 1
+#SBATCH -n 1
+#SBATCH --partition=Teach-Standard
+#SBATCH --gres=gpu:4
+#SBATCH --mem=24000
+#SBATCH --time=3-00:00:00
+
+set -e
+
+export CUDA_HOME=/opt/cuda-9.0.176.1/
+export CUDNN_HOME=/opt/cuDNN-7.0/
+export STUDENT_ID=$(whoami)
+
+export LD_LIBRARY_PATH=${CUDNN_HOME}/lib64:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${CUDNN_HOME}/lib64:${LIBRARY_PATH}
+export CPATH=${CUDNN_HOME}/include:$CPATH
+export PATH=${CUDA_HOME}/bin:${PATH}
+export PYTHON_PATH=$PATH
+
+mkdir -p /disk/scratch/${STUDENT_ID}
+export TMPDIR=/disk/scratch/${STUDENT_ID}/
+export TMP=/disk/scratch/${STUDENT_ID}/
+
+source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
+
+python train.py \
+    --pth_tar './pretrained/deit_base_patch16_384-8de9b5d1.pth' \
+    --model 'stn' \
+    --train_dataset 'ShanghaiB' \
+    --save_path ./save/ShanghaiB-stn-$(date -Iminutes) \
+    --batch_size 4 \
+    --gpus 0,1,2,3 \
+    --print_freq 100
+

(modified data-preprocessing module; filename not shown in the rendered diff)
@@ -25,6 +25,7 @@ import cv2
 import scipy.io as io
 import scipy.sparse as sparse
 import h5py
+from tqdm.auto import tqdm
 
 CWD = os.getcwd()
 
@@ -51,7 +52,7 @@ def pre_dataset_sh():
 
     # np.random.seed(0)
     # random.seed(0)
-    for img_path in img_paths:
+    for _, img_path in tqdm(img_paths, desc="Preprocessing Data"):
         img_data = cv2.imread(img_path)
         mat = io.loadmat(
             img_path
@@ -169,4 +170,5 @@ def make_npydata():
 if __name__ == "__main__":
     # Download manually...
     pre_dataset_sh()  # XXX: preliminary
+    print("Storing dataset paths...")
     make_npydata()
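
The loop change above also swaps `for img_path in img_paths` for a tuple-unpacking `for _, img_path in tqdm(...)`, which implies img_paths now yields pairs (e.g. from enumerate). Assuming a plain list of paths instead, the minimal tqdm pattern looks like this:

    # Minimal tqdm sketch; the paths list is a stand-in for the repo's data.
    from tqdm.auto import tqdm

    img_paths = [f"images/IMG_{n}.jpg" for n in range(10)]
    for img_path in tqdm(img_paths, desc="Preprocessing Data"):
        pass  # per-image work (cv2.imread, io.loadmat, ...) goes here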

train.py (29 changed lines)
@@ -9,6 +9,8 @@ import torch
 import torch.nn as nn
 import torch.multiprocessing as torch_mp
 from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+import torchvision
 import nni
 import logging
 import numpy as np
@@ -17,10 +19,11 @@ from model.transcrowd_gap import VisionTransformerGAP
 from arguments import args, ret_args
 import dataset
 from dataset import *
-from model.transcrowd_gap import base_patch16_384_gap, stn_patch16_384_gap
+from model.transcrowd_gap import *
 from checkpoint import save_checkpoint
 
 logger = logging.getLogger("train")
+writer = SummaryWriter(args.save_path + "/tensorboard-run")
 
 def setup_process_group(
     rank: int,
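
Note that the writer is created at module import time, and the DDP path spawns one process per GPU (see the last hunk), so each worker would open its own event file under the same directory. A common guard, shown purely as a hedged sketch and not something this commit implements, is to construct the writer on rank 0 only:

    # Hedged sketch of a rank-0-only writer; a common DDP pattern,
    # not code from this commit.
    from torch.utils.tensorboard import SummaryWriter

    def make_writer(rank: int, log_dir: str):
        # only the first process records events; the rest skip logging
        return SummaryWriter(log_dir) if rank == 0 else None

    writer = make_writer(rank=0, log_dir="save/example/tensorboard-run")
    if writer is not None:
        writer.add_scalar("loss/train", 0.5, 0)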

@@ -196,7 +199,7 @@ def worker(rank: int, args: Namespace):
 
         # Validate
         if epoch % 5 == 0 or args.debug:
-            prec1 = valid_one_epoch(test_loader, model, device, args)
+            prec1 = valid_one_epoch(test_loader, model, device, epoch, args)
             end_valid = time.time()
             is_best = prec1 < args.best_pred
             args.best_pred = min(prec1, args.best_pred)

@@ -255,11 +258,14 @@
         gt_count_whole = gt_count_whole.cuda()
         device_type = "cuda"
 
+        # Desperate measure to reduce mem footprint...
         with torch.autocast(device_type):
             # fpass
             out, gt_count = model(img, kpoint)
             # loss
             loss = criterion(out, gt_count)  # wrt. transformer
+            writer.add_scalar("L1-loss wrt. xformer (train)", loss, epoch * i)
+
             loss += (
                 F.mse_loss(  # stn: info retainment
                     gt_count.view(batch_size, -1).sum(axis=1, keepdim=True),
@@ -270,6 +276,7 @@
                     value=loss.item()
                 )
             )
+            writer.add_scalar("Composite loss (train)", loss, epoch * i)
 
             # free grad from mem
             optimizer.zero_grad(set_to_none=True)
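
Both scalars above use `epoch * i` as the global step, which restarts at zero whenever `i` does and makes steps from different epochs collide in the plot; the usual monotonic step is `epoch * len(train_loader) + i`. A hedged sketch (`steps_per_epoch` stands in for the loader length):

    # Hedged sketch of a monotonic global step for per-batch scalars;
    # steps_per_epoch stands in for len(train_loader).
    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter("save/example/tensorboard-run")
    steps_per_epoch = 300
    for epoch in range(3):
        for i in range(steps_per_epoch):
            step = epoch * steps_per_epoch + i   # strictly increasing
            writer.add_scalar("loss/train", 1.0 / (step + 1), step)
    writer.flush()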

@@ -283,10 +290,13 @@
         if args.debug:
             break
 
+    # Flush writer
+    writer.flush()
+
     scheduler.step()
 
 
-def valid_one_epoch(test_loader, model, device, args):
+def valid_one_epoch(test_loader, model, device, epoch, args):
     print("[valid_one_epoch] Validating...")
     batch_size = 1
     model.eval()

@@ -295,6 +305,7 @@ def valid_one_epoch(test_loader, model, device, args):
     mse = .0
     visi = []
     index = 0
+    xformed = []
 
     for i, (fname, img, kpoint, gt_count_whole) in enumerate(test_loader):
         kpoint = kpoint.type(torch.FloatTensor)

@@ -324,6 +335,10 @@
             mse += diff ** 2
 
         if i % 5 == 0:
+            if isinstance(model, STNet_VisionTransformerGAP):
+                with torch.no_grad():
+                    img_xformed = model.stnet(img).to("cpu")
+                xformed.append(img_xformed)
             print("[valid_one_epoch] {} | Gt {:.2f} Pred {:.4f} |".format(
                 fname[0],
                 torch.sum(gt_count_whole).item(),

@@ -332,10 +347,16 @@
 
     mae = mae * 1.0 / (len(test_loader) * batch_size)
     mse = np.sqrt(mse / (len(test_loader)) * batch_size)
+    writer.add_scalar("MAE (valid)", mae, epoch)
+    writer.add_scalar("MSE (valid)", mse, epoch)
+    if len(xformed) != 0:
+        img_grid = torchvision.utils.make_grid(xformed)
+        writer.add_image("STN: transformed image", img_grid, epoch)
     nni.report_intermediate_result(mae)
     print("* MAE {mae:.3f} | MSE {mse:.3f} *".format(
         mae=mae, mse=mse
     ))
+    writer.flush()
     return mae
 
 
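
The validation hunk above logs scalar MAE/MSE once per epoch and, when any STN outputs were collected, tiles them with torchvision.utils.make_grid into a single image for add_image. A self-contained sketch of that pairing (random tensors stand in for the STN-transformed validation images):

    # Hedged sketch of the make_grid + add_image pairing; random CHW
    # tensors stand in for STN outputs.
    import torch
    import torchvision
    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter("save/example/tensorboard-run")
    xformed = [torch.rand(3, 64, 64) for _ in range(8)]  # fake images
    img_grid = torchvision.utils.make_grid(xformed)      # one tiled image
    writer.add_image("STN: transformed image", img_grid, 0)
    writer.flush()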

@@ -353,7 +374,7 @@ if __name__ == "__main__":
         worker,
         args=(combined_params, ),  # rank supplied at callee as 1st param
         # also above *has* to be 1-tuple else runtime expands Namespace.
-        nprocs=combined_params.world_size,
+        nprocs=combined_params.ddp_world_size,
     )
 else:
     # No DDP, run in current thread

util.py (new file, empty)