diff --git a/model/reverse_perspective.py b/model/reverse_perspective.py
index 3bdcaaf..21b2c57 100644
--- a/model/reverse_perspective.py
+++ b/model/reverse_perspective.py
@@ -82,7 +82,7 @@ class PerspectiveEstimator(nn.Module):
         self.epsilon = epsilon
         self.input_shape = input_shape
         self.layer_dict = nn.ModuleDict({
-            'dilated_conv': nn.Conv2d(
+            'revpers_dilated_conv0': nn.Conv2d(
                 in_channels=self.input_shape[1], out_channels=1,
                 kernel_size=conv_kernel_shape, padding=conv_padding,
@@ -90,11 +90,11 @@ class PerspectiveEstimator(nn.Module):
                 stride=conv_stride, dilation=conv_dilation,
             ),  # (N, 1, H, W)
-            'avg_pooling': nn.AdaptiveAvgPool2d(
+            'revpers_avg_pooling0': nn.AdaptiveAvgPool2d(
                 output_size=(pool_capacity, 1)
             ),  # (N, 1, K, 1)
             # [?] Do we need to explicitly translate to (N, K) here?
-            'fc': nn.Linear(
+            'revpers_fc0': nn.Linear(
                 in_features=pool_capacity, out_features=1,
             ),
@@ -115,3 +115,5 @@ class PerspectiveEstimator(nn.Module):
 
 
 # def unsupervised_loss(predictions, targets):
+# [TODO] We need a modified loss -- one that takes advantage of the attention map
+# instead of the feature map. They should behave similarly, but that still needs verifying.
\ No newline at end of file
diff --git a/model/transcrowd_gap.py b/model/transcrowd_gap.py
index 5624b9f..c9ae9c0 100644
--- a/model/transcrowd_gap.py
+++ b/model/transcrowd_gap.py
@@ -20,13 +20,14 @@ from timm.models.vision_transformer import VisionTransformer, _cfg
 from timm.models.registry import register_model
 from timm.models.layers import trunc_normal_
 
-class VisionTransformer_GAP(VisionTransformer):
+class VisionTransformerGAPwithFeatureMap(VisionTransformer):
     # [XXX] It might be a bad idea to use vision transformer for small datasets.
     # ref: ViT paper -- "transformers lack some of the inductive biases inherent
     # to CNNs, such as translation equivariance and locality".
     # convolution is specifically equivariant in translation (linear and
     # shift-equivariant), specifically.
-    # tl;dr: CNNs might perform better for small datasets. Not sure abt performance.
+    # tl;dr: CNNs might perform better for small datasets AND should perform
+    # better for embedded systems.
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -39,7 +40,7 @@ class VisionTransformer_GAP(VisionTransformer):
         # Fill self.pos_embed with N(0, 1) truncated to |0.2 * std|.
         trunc_normal_(self.pos_embed, std=.02)
 
-        # The "regression head" (I think? [XXX])
+        # The "regression head"
         self.output1 = nn.ModuleDict({
             "output1.relu0": nn.ReLU(),
             "output1.linear0": nn.Linear(in_features=6912 * 4, out_features=128),
@@ -49,35 +50,46 @@ class VisionTransformer_GAP(VisionTransformer):
         })
         self.output1.apply(self._init_weights)
 
+        # Attention map, which we use for training
+        self.attention_map = torch.Tensor(np.zeros((1152, 768)))  # (3, 2) resized imgs
+
     def forward_features(self, x):
         B = x.shape[0]
+
+        # 3.2 Patch embed
         x = self.patch_embed(x)
 
-        # [XXX] Why do we need class token here? (ref. prev papers)
+        # ViT: Classification token
+        # This idea originated from BERT.
+        # Essentially, because we are performing encoding without decoding, we
+        # cannot fix the output dimensionality -- which the classification
+        # problem absolutely needs. Instead, we use the classification token as
+        # the sole input which the transformer would need to learn to encode
+        # whatever it learnt from the input into that token.
+        # Source: https://datascience.stackexchange.com/a/110637
+        # That said, I don't think this is useful in this case...
         cls_tokens = self.cls_token.expand(B, -1, -1)
-        x = torch.cat((cls_tokens, x), dim=1)  # Concatenate along j
+        x = torch.cat((cls_tokens, x), dim=1)  # [[cls_token, x_i, ...]...]
 
-        # 3.2 Patch embedding
         x = x + self.pos_embed
-        x = self.pos_drop(x) # [XXX] Drop some patches out -- or not?
+        x = self.pos_drop(x)  # [XXX] Drop some patches out -- or not?
 
         # 3.3 Transformer-encoder
         for block in self.blocks:
             x = block(x)
-        # [TODO] Interpret
+        # Normalize
         x = self.norm(x)
 
+        # Remove the classification token
         x = x[:, 1:]
 
         return x
 
     def forward(self, x):
-        x = self.forward_features(x)
+        x = self.forward_features(x)  # Compute encoding
         x = F.adaptive_avg_pool1d(x, (48))
-        x = x.view(x.shape[0], -1)
-        x = self.output1(x)
+        x = x.view(x.shape[0], -1)  # Flatten for the regression head
+        # Resized to ???
+        x = self.output1(x)  # Regression head
         return x
-
-
-
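As a sanity check on the renamed PerspectiveEstimator layers, here is a minimal sketch of how they chain together; the kernel size, padding, dilation, and pool_capacity below are placeholder values, not the module's real hyperparameters. It also suggests an answer to the [?] above: the (N, 1, K, 1) pooling output does need an explicit reshape to (N, K) before revpers_fc0, since nn.Linear acts on the last dimension.

    import torch
    import torch.nn as nn

    N, C, H, W = 2, 768, 24, 32   # placeholder input shape; the real one comes from input_shape
    K = 16                        # placeholder pool_capacity

    layer_dict = nn.ModuleDict({
        'revpers_dilated_conv0': nn.Conv2d(C, 1, kernel_size=3, padding=2, dilation=2),
        'revpers_avg_pooling0': nn.AdaptiveAvgPool2d(output_size=(K, 1)),
        'revpers_fc0': nn.Linear(in_features=K, out_features=1),
    })

    x = torch.randn(N, C, H, W)
    x = layer_dict['revpers_dilated_conv0'](x)  # (N, 1, H, W)
    x = layer_dict['revpers_avg_pooling0'](x)   # (N, 1, K, 1)
    x = x.view(N, -1)                           # (N, K): flatten before the linear layer
    x = layer_dict['revpers_fc0'](x)            # (N, 1)
    print(x.shape)                              # torch.Size([2, 1])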
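For the GAP head in forward, a minimal shape walk-through, assuming 576 patch tokens of dimension 768 (e.g. a 384x384 input with 16x16 patches -- an assumption, not stated in this diff): pooling each token from 768 down to 48 and flattening gives 576 * 48 = 27648 = 6912 * 4 features, which matches output1.linear0's in_features.

    import torch
    import torch.nn.functional as F

    B = 2                              # arbitrary batch size
    x = torch.randn(B, 576, 768)       # forward_features output, classification token removed
    x = F.adaptive_avg_pool1d(x, 48)   # (B, 576, 48): each token pooled 768 -> 48
    x = x.view(x.shape[0], -1)         # (B, 27648) == (B, 6912 * 4), fed to the regression head
    print(x.shape)                     # torch.Size([2, 27648])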