From 12aabb0d3f02fbacd7275f577dbb9a759bad9d02 Mon Sep 17 00:00:00 2001 From: rubberhead Date: Sun, 3 Mar 2024 03:16:54 +0000 Subject: [PATCH] More working than not Not sure if validation works, call it a day --- .gitignore | 3 +- _ShanghaiA-train.sh | 10 +++ arguments.py | 4 +- dataset.py | 109 +++++++++++++------------- model/glue.py | 2 +- model/stn.py | 2 + model/transcrowd_gap.py | 47 +++++------ make_dataset.py => preprocess_data.py | 15 +++- train.py | 23 +++--- transform_img.py | 6 -- 10 files changed, 116 insertions(+), 105 deletions(-) create mode 100644 _ShanghaiA-train.sh rename make_dataset.py => preprocess_data.py (91%) delete mode 100644 transform_img.py diff --git a/.gitignore b/.gitignore index 6f51f96..ac4256a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ baseline-experiments/ synchronous/ -npydata/ \ No newline at end of file +npydata/ +**/__pycache__/** \ No newline at end of file diff --git a/_ShanghaiA-train.sh b/_ShanghaiA-train.sh new file mode 100644 index 0000000..0e8a34f --- /dev/null +++ b/_ShanghaiA-train.sh @@ -0,0 +1,10 @@ +#!/bin/sh +#SBATCH -N 1 +#SBATCH -n 1 +#SBATCH --partition=Teach-Standard +#SBATCH --gres=gpu:6 +#SBATCH --mem=24000 +#SBATCH --time=3-00:00:00 + +python train.py \ + --model='stn' \ No newline at end of file diff --git a/arguments.py b/arguments.py index 93324b5..72cf252 100644 --- a/arguments.py +++ b/arguments.py @@ -12,13 +12,13 @@ parser.add_argument( # Data configuration ========================================================= parser.add_argument( - "--worker", type=int, default=4, help="Number of data loader processes" + "--workers", type=int, default=4, help="Number of data loader processes" ) parser.add_argument( "--train_dataset", type=str, default="ShanghaiA", help="Training dataset" ) parser.add_argument( - "--test_dataset", type=str, default="ShanghaiA", help="Evaluation dataset" + "--eval_dataset", type=str, default="ShanghaiA", help="Evaluation dataset" ) parser.add_argument( "--print_freq", type=int, default=1, diff --git a/dataset.py b/dataset.py index e966480..d11fee6 100644 --- a/dataset.py +++ b/dataset.py @@ -6,7 +6,7 @@ import torch import torch.nn.functional as F from torch.utils.data import Dataset from torchvision import datasets, transforms -from image import Image +from PIL import Image import numpy as np import numbers import h5py @@ -15,23 +15,23 @@ import cv2 def unpack_npy_data(args: Namespace): """Unpack npy data aas np lists at hard-coded paths wrt cwd.""" - if args.dataset == "ShanghaiA": + if args.train_dataset == "ShanghaiA": train_file = "./npydata/ShanghaiA_train.npy" test_file = "./npydata/ShanghaiA_test.npy" - elif args.dataset == "ShanghaiB": + elif args.train_dataset == "ShanghaiB": train_file = "./npydata/ShanghaiB_train.npy" test_file = "./npydata/ShanghaiB_test.npy" - elif args.dataset == "UCF_QNRF": + elif args.train_dataset == "UCF_QNRF": train_file = "./npydata/qnrf_train.npy" test_file = "./npydata/qnrf_test.npy" - elif args.dataset == "JHU": + elif args.train_dataset == "JHU": train_file = "./npydata/jhu_train.npy" test_file = "./npydata/jhu_test.npy" - elif args.dataset == "NWPU": + elif args.train_dataset == "NWPU": train_file = "./npydata/nwpu_train.npy" test_file = "./npydata/nwpu_test.npy" - - assert any([fdir is not None for fdir in [train_file, test_file]]) + + assert all([fdir is not None for fdir in [train_file, test_file]]) with open(train_file, "rb") as fd: train_list = np.load(fd).tolist() @@ -39,9 +39,9 @@ def unpack_npy_data(args: Namespace): test_list = np.load(fd).tolist() print("[dataset] Loaded \"{}\": train: {} | test: {}".format( - args.dataset, len(train_list), len(test_list) + args.train_dataset, len(train_list), len(test_list) )) - + return train_list, test_list @@ -55,28 +55,28 @@ def convert_data(train_list, args: Namespace, train: bool): while True: try: gt_file = h5py.File(gt_path) - gt_count = np.asarray(gt_file["gt_count"]) + kpoint = np.asarray(gt_file["kpoint"]) break except OSError: print("[dataset] Load error on \'{}\'", img_path) img = img.copy() - gt_count = gt_count.copy() + kpoint = kpoint.copy() + + return img, kpoint - return img, gt_count - print("[dataset] Pre-loading dataset...\n{}".format("-" * 50)) data_keys = [] for i in range(len(train_list)): img_path = train_list[i] fname = os.path.basename(img_path) - img, gt_count = _load_data(img_path, train, args) + img, kpoint = _load_data(img_path, train, args) pack = { - "img": img, - "gt_count": gt_count, - "fname": fname, + "img": img, + "kpoint": kpoint, + "fname": fname, } data_keys.append(pack) @@ -85,16 +85,16 @@ def convert_data(train_list, args: Namespace, train: bool): class ListDataset(Dataset): def __init__( - self, - root, + self, + root, shape = None, shuffle: bool = True, - transform = None, + transform = None, train: bool = False, - seen: int = 0, - batch_size: int = 1, - nr_workers: int = 4, - args: Namespace = None, + seen: int = 0, + batch_size: int = 1, + nr_workers: int = 4, + args: Namespace = None, ): if train: random.shuffle(root) @@ -109,25 +109,25 @@ class ListDataset(Dataset): self.nr_workers = nr_workers self.args = args - + def __len__(self): return self.nr_samples - + def __getitem__(self, index): assert index <= len(self), "Index out-of-bounds" fname = self.lines[index]["fname"] img = self.lines[index]["img"] - gt_count = self.lines[index]["gt_count"] + kpoint = self.lines[index]["kpoint"] # Data augmentation if self.train: if random.random() > .5: img = img.transpose(Image.FLIP_LEFT_RIGHT) # XXX: do random noise? - - gt_count = gt_count.copy() + + kpoint = kpoint.copy() img = img.copy() # Custom transform @@ -135,28 +135,29 @@ class ListDataset(Dataset): img = self.transform(img) - if self.train: - return fname, img, gt_count - else: - device = args.device - height, width = img.shape[1], img.shape[2] - m = int(width / 384) - n = int(height / 384) - for i in range(m): - for j in range(n): - if i == 0 and j == 0: - img_ret = img[ - :, # C - j * 384 : 384 * (j + 1), # H - i * 384 : 384 * (i + 1), # W - ].to(device).unsqueeze(0) - else: - cropped = img[ - :, # C - j * 384 : 384 * (j + 1), # H - i * 384 : 384 * (i + 1), # W - ].to(device).unsqueeze(0) - img_ret = torch.cat([img_ret, cropped], 0).to(device) - return fname, img_ret, gt_count + return fname, img, kpoint + + # if self.train: + # return fname, img, gt_count + # else: + # device = args.device + # height, width = img.shape[1], img.shape[2] + # m = int(width / 384) + # n = int(height / 384) + # for i in range(m): + # for j in range(n): + # if i == 0 and j == 0: + # img_ret = img[ + # :, # C + # j * 384 : 384 * (j + 1), # H + # i * 384 : 384 * (i + 1), # W + # ].to(device).unsqueeze(0) + # else: + # cropped = img[ + # :, # C + # j * 384 : 384 * (j + 1), # H + # i * 384 : 384 * (i + 1), # W + # ].to(device).unsqueeze(0) + # img_ret = torch.cat([img_ret, cropped], 0).to(device) + # return fname, img_ret, gt_count - \ No newline at end of file diff --git a/model/glue.py b/model/glue.py index 2750683..7f0b8ba 100644 --- a/model/glue.py +++ b/model/glue.py @@ -46,7 +46,7 @@ class SquareCropTransformLayer(nn.Module): torch.tensor_split( torch.cat( torch.tensor_split( - t_, + kpoints_, h_split_count, dim=1 ) diff --git a/model/stn.py b/model/stn.py index 53fa8ff..1b6099f 100644 --- a/model/stn.py +++ b/model/stn.py @@ -24,6 +24,7 @@ class STNet(nn.Module): _dummy_size_ = input_size # shape checking + print("STN: dummy_size {}".format(_dummy_size_)) _dummy_x_ = torch.zeros(_dummy_size_) # (3.1) Spatial transformer localization-network @@ -81,6 +82,7 @@ class STNet(nn.Module): def forward(self, x, t): + # print("STN: {} | {}".format(x.shape, t.shape)) # transform the input, do nothing else return self.stn(x, t) diff --git a/model/transcrowd_gap.py b/model/transcrowd_gap.py index a733b79..47e203c 100644 --- a/model/transcrowd_gap.py +++ b/model/transcrowd_gap.py @@ -25,16 +25,8 @@ from .stn import STNet from .glue import SquareCropTransformLayer class VisionTransformerGAP(VisionTransformer): - # [XXX] It might be a bad idea to use vision transformer for small datasets. - # ref: ViT paper -- "transformers lack some of the inductive biases inherent - # to CNNs, such as translation equivariance and locality". - # convolution is specifically equivariant in translation (linear and - # shift-equivariant), specifically. - # tl;dr: CNNs might perform better for small datasets AND should perform - # better for embedded systems. - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, img_size: int, *args, **kwargs): + super().__init__(img_size=img_size, *args, **kwargs) num_patches = self.patch_embed.num_patches # That {p_1, p_2, ..., p_N} pos embedding @@ -45,17 +37,17 @@ class VisionTransformerGAP(VisionTransformer): trunc_normal_(self.pos_embed, std=.02) # The "regression head" - self.output1 = nn.ModuleDict({ - "output1.relu0": nn.ReLU(), - "output1.linear0": nn.Linear(in_features=6912 * 4, out_features=128), - "output1.relu1": nn.ReLU(), - "output1.dropout0": nn.Dropout(p=0.5), - "output1.linear1": nn.Linear(in_features=128, out_features=1), - }) + self.output1 = nn.Sequential( + nn.ReLU(), + nn.Linear(in_features=6912 * 4, out_features=128), + nn.ReLU(), + nn.Dropout(p=0.5), + nn.Linear(in_features=128, out_features=1), + ) self.output1.apply(self._init_weights) - # Attention map, which we use to train - self.attention_map = torch.Tensor(np.zeros((1152, 768))) # (3, 2) resized imgs + # glue layer -- since we delay image cropping here + self.glue = SquareCropTransformLayer(img_size) def forward_features(self, x): B = x.shape[0] @@ -90,25 +82,26 @@ class VisionTransformerGAP(VisionTransformer): return x - def forward(self, x): + def forward(self, x, t): + with torch.no_grad(): + x, t = self.glue(x, t) + print(f"Glue: {x.shape} | {t.shape}") x = self.forward_features(x) # Compute encoding x = F.adaptive_avg_pool1d(x, (48)) x = x.view(x.shape[0], -1) # Move data for regression head # Resized to ??? x = self.output1(x) # Regression head - return x + return x, t class STNet_VisionTransformerGAP(VisionTransformerGAP): - def __init__(self, img_shape: torch.Size, *args, **kwargs): - super(STNet_VisionTransformerGAP, self).__init__(*args, **kwargs) + def __init__(self, img_shape: torch.Size, img_size: int, *args, **kwargs): + super(STNet_VisionTransformerGAP, self).__init__(img_size, *args, **kwargs) self.stnet = STNet(img_shape) - self.glue = SquareCropTransformLayer(img_size) def forward(self, x, t): x, t = self.stnet(x, t) - x, t = self.glue(x, t) - return super(STNet_VisionTransformerGAP, self).forward(x), t + return super(STNet_VisionTransformerGAP, self).forward(x, t) @register_model @@ -131,7 +124,7 @@ def base_patch16_384_gap(pth_tar: Optional[str] = None, **kwargs): @register_model def stn_patch16_384_gap(pth_tar: Optional[str] = None, **kwargs): model = STNet_VisionTransformerGAP( - img_shape=torch.Size((3, 384, 384)), + img_shape=torch.Size((3, 1152, 768)), img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs diff --git a/make_dataset.py b/preprocess_data.py similarity index 91% rename from make_dataset.py rename to preprocess_data.py index 5e3380c..767dbaa 100644 --- a/make_dataset.py +++ b/preprocess_data.py @@ -1,17 +1,19 @@ """ -The TransCrowd paper lists ShanghaiTech dataset as from here: +The TransCrowd paper lists ShanghaiTech dataset as from here: https://drive.google.com/file/d/1CkYppr_IqR1s6wi53l2gKoGqm7LkJ-Lc/view -Alternatively, you could prob. download from here: +Alternatively, you could prob. download from here: https://www.kaggle.com/datasets/tthien/shanghaitech?resource=download +It seems the directories are all wrong, though. + After downloading, execute: $ unzip -d /synchronous/dataset/ -To unzip the dataset correctly prior to running this script. +To unzip the dataset correctly prior to running this script. """ import os @@ -59,17 +61,24 @@ def pre_dataset_sh(): gt_data = mat["image_info"][0][0][0][0][0] # Resize to 1152x768 + is_portrait = False if img_data.shape[1] >= img_data.shape[0]: # landscape rate_x = 1152.0 / img_data.shape[1] rate_y = 768.0 / img_data.shape[0] else: # portrait rate_x = 768.0 / img_data.shape[1] rate_y = 1152.0 / img_data.shape[0] + is_portrait = True img_data = cv2.resize(img_data, (0, 0), fx=rate_x, fy=rate_y) gt_data[:, 0] = gt_data[:, 0] * rate_x gt_data[:, 1] = gt_data[:, 1] * rate_y + if is_portrait: + print("Portrait img: \'{}\' -- rotating 90 deg clockwise...".format(img_path)) + img_data = cv2.rotate(img_data, cv2.ROTATE_90_CLOCKWISE) + + # Compute 0/1 counts from density map kpoint = np.zeros((img_data.shape[0], img_data.shape[1])) for i in range(len(gt_data)): diff --git a/train.py b/train.py index c1b9d0e..e3216da 100644 --- a/train.py +++ b/train.py @@ -29,7 +29,7 @@ def setup_process_group( master_addr: str = "localhost", master_port: Optional[np.ushort] = None ): - os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_ADDR"] = master_addr os.environ["MASTER_PORT"] = ( str(random.randint(40000, 65545)) if master_port is None @@ -121,7 +121,6 @@ def worker(rank: int, args: Namespace): train_loader = build_train_loader(train_data, args) test_loader = build_test_loader(test_data, args) - # Instantiate model if args.model == "stn": model = stn_patch16_384_gap(args.pth_tar).to(device) @@ -229,11 +228,14 @@ def train_one_epoch( model.train() # In one epoch, for each training sample - for i, (fname, img, gt_count) in enumerate(train_loader): + for i, (fname, img, kpoint) in enumerate(train_loader): + kpoint = kpoint.type(torch.FloatTensor) + print("Training: img {} | kpoint {}".format(img.shape, kpoint.shape)) # fpass img = img.to(device) - out = model(img) - gt_count = gt_count.type(torch.FloatTensor).to(device).unsqueeze(1) + kpoint = kpoint.to(device) + out, gt_count = model(img, kpoint) + # gt_count = gt_count.type(torch.FloatTensor).to(device).unsqueeze(1) # loss loss = criterion(out, gt_count) @@ -288,7 +290,7 @@ def valid_one_epoch(test_loader, model, device, args): mae = mae * 1.0 / (len(test_loader) * batch_size) mse = np.sqrt(mse / (len(test_loader)) * batch_size) - nni.report_intermediate_result(mae) + # nni.report_intermediate_result(mae) print("* MAE {mae:.3f} | MSE {mse:.3f} *".format( mae=mae, mse=mse )) @@ -297,11 +299,10 @@ def valid_one_epoch(test_loader, model, device, args): if __name__ == "__main__": - tuner_params = nni.get_next_parameter() - logger.debug("Generated hyperparameters: {}", tuner_params) - combined_params = Namespace( - nni.utils.merge_parameter(ret_args, tuner_params) - ) # Namespaces have better ergonomics, notably a struct-like access syntax. + # tuner_params = nni.get_next_parameter() + # logger.debug("Generated hyperparameters: {}", tuner_params) + # combined_params = nni.utils.merge_parameter(ret_args, tuner_params) + combined_params = args logger.debug("Parameters: {}", combined_params) if combined_params.use_ddp: diff --git a/transform_img.py b/transform_img.py deleted file mode 100644 index c6d711e..0000000 --- a/transform_img.py +++ /dev/null @@ -1,6 +0,0 @@ -# If we cannot get revpersnet running, -# we still ought to do some sort of information-preserving perspective transformation -# e.g., randomized transformation -# and let transcrowd to crunch through these transformed image instead. -# After training, we obtain the attention map and put it in our paper. -# I just want to get things done... \ No newline at end of file