diff --git a/.gitignore b/.gitignore
index acf9ecf..6f51f96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 baseline-experiments/
 synchronous/
+npydata/
\ No newline at end of file
diff --git a/make_dataset.py b/make_dataset.py
index 2d1fc6b..ac9ee40 100644
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -10,6 +10,7 @@ import h5py
 CWD = os.getcwd()
 
 
 def pre_dataset_sh():
+    dataset_name = "ShanghaiTech"
     root = CWD + "/synchronous/dataset/" + dataset_name + "/"
     part_A_train = os.path.join(root, "part_A_final/train_data", "images")
@@ -53,8 +54,7 @@ def pre_dataset_sh():
         gt_data[:, 0] = gt_data[:, 0] * rate_x
         gt_data[:, 1] = gt_data[:, 1] * rate_y
-        # Compute gt_count from density map (gt_data)
-        # XXX: what does it do exactly?
+        # Build a 0/1 head-location map (kpoint) from the annotated points
         kpoint = np.zeros((img_data.shape[0], img_data.shape[1]))
         for i in range(len(gt_data)):
             if (
                 int(gt_data[i][1]) < img_data.shape[0]
@@ -65,15 +65,14 @@ def pre_dataset_sh():
         root_path = img_path.split("IMG_")[0].replace("images", "images_crop")
 
         # Likewise, we do not crop to patched sequences here...
-        # Skip directly to saving fixed-size data & gt_count.
+        # Skip directly to saving fixed-size data & kpoint.
         img_path = img_path.replace("images", "images_crop")
         cv2.imwrite(img_path, img_data)
 
-        gt_count = np.sum(kpoint)
         with h5py.File(
             img_path.replace('.jpg', '.h5').replace('images', 'gt_density_map'),
-            'w'
+            mode='w'
         ) as hf:
-            hf["gt_count"] = gt_count
+            hf["kpoint"] = kpoint
 
 def make_npydata():
diff --git a/model/glue.py b/model/glue.py
index afdbb21..2750683 100644
--- a/model/glue.py
+++ b/model/glue.py
@@ -1,5 +1,7 @@
-# Glue layer for transforming whole pictures into 384x384 sequence for encoder
-# input
+# Glue layer for transforming whole pictures into a sequence of 384x384 crops for encoder input
+from dataclasses import dataclass
+from itertools import product
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -7,3 +9,59 @@ import torchvision
 from torchvision import transforms
 import numpy as np
 
+from torchvision.transforms import v2
+
+# The v2 way of writing a custom transform, apparently. [1]
+class SquareCropTransformLayer(nn.Module):
+    def __init__(self, crop_size: int):
+        super(SquareCropTransformLayer, self).__init__()
+        self.crop_size = crop_size
+
+    def forward(
+        self,
+        x_: torch.Tensor,
+        kpoints_: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # x_ & kpoints_ are expected to have the affine transform already applied.
+        assert x_.dim() == 4, "expected a batched NCHW image tensor"
+        channels, height, width = x_.shape[1:]
+        h_split_count = height // self.crop_size
+        w_split_count = width // self.crop_size
+
+        # Perform identical splits on both tensors (assumes H and W are multiples of crop_size) -- note kpoints_ has no C dimension!
+        ret_x = torch.cat(
+            torch.tensor_split(
+                torch.cat(
+                    torch.tensor_split(
+                        x_,
+                        h_split_count,
+                        dim=2
+                    )
+                ),
+                w_split_count,
+                dim=3
+            )
+        )  # Performance should be acceptable, but the nested split/cat is clunky; is there a better way?
+        split_kpoints = torch.cat(
+            torch.tensor_split(
+                torch.cat(
+                    torch.tensor_split(
+                        kpoints_,
+                        h_split_count,
+                        dim=1
+                    )
+                ),
+                w_split_count,
+                dim=2
+            )
+        )
+
+        # Sum each kpoint crop into its gt_count
+        ret_gt_count = torch.sum(split_kpoints.view(split_kpoints.size(0), -1), dim=1)
+
+        return ret_x, ret_gt_count
+
+"""
+References:
+[1] https://pytorch.org/vision/stable/auto_examples/transforms/plot_custom_transforms.html
+"""
\ No newline at end of file
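
For reference, a minimal smoke test along the following lines exercises the new SquareCropTransformLayer and checks that the per-crop gt_count values sum back to the total head count. This is a sketch, not part of the patch: the import path and crop_size=384 come from the diff, while the input shapes and the dummy head locations are illustrative assumptions.

# Hypothetical smoke test; shapes and annotated points below are assumptions.
import torch

from model.glue import SquareCropTransformLayer

layer = SquareCropTransformLayer(crop_size=384)

# One RGB image (NCHW) and its binary head-location map (NHW), both already
# resized so that height and width are exact multiples of crop_size.
images = torch.rand(1, 3, 768, 1152)
kpoints = torch.zeros(1, 768, 1152)
kpoints[0, 100, 200] = 1.0  # two annotated heads
kpoints[0, 500, 900] = 1.0

crops, gt_counts = layer(images, kpoints)
print(crops.shape)      # torch.Size([6, 3, 384, 384]) -- a 2x3 grid of crops
print(gt_counts.shape)  # torch.Size([6])
print(gt_counts.sum())  # tensor(2.) -- total head count is preserved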