diff --git a/model/reverse_perspective.py b/model/reverse_perspective.py
index 3bdcaaf..21b2c57 100644
--- a/model/reverse_perspective.py
+++ b/model/reverse_perspective.py
@@ -82,7 +82,7 @@ class PerspectiveEstimator(nn.Module):
         self.epsilon = epsilon
         self.input_shape = input_shape
         self.layer_dict = nn.ModuleDict({
-            'dilated_conv': nn.Conv2d(
+            'revpers_dilated_conv0': nn.Conv2d(
                 in_channels=self.input_shape[1], out_channels=1,
                 kernel_size=conv_kernel_shape, padding=conv_padding,
@@ -90,11 +90,11 @@ class PerspectiveEstimator(nn.Module):
                 stride=conv_stride, dilation=conv_dilation,
             ),  # (N, 1, H, W)
-            'avg_pooling': nn.AdaptiveAvgPool2d(
+            'revpers_avg_pooling0': nn.AdaptiveAvgPool2d(
                 output_size=(pool_capacity, 1)
             ),  # (N, 1, K, 1)
             # [?] Do we need to explicitly translate to (N, K) here?
-            'fc': nn.Linear(
+            'revpers_fc0': nn.Linear(
                 in_features=pool_capacity, out_features=1,
             ),
@@ -115,3 +115,5 @@ class PerspectiveEstimator(nn.Module):
 
 
 # def unsupervised_loss(predictions, targets):
+# [TODO] We need a modified loss -- one that takes advantage of the attention map
+# instead of the feature map. They should behave similarly, but that still needs verifying.
\ No newline at end of file
diff --git a/model/transcrowd_gap.py b/model/transcrowd_gap.py
index 5624b9f..c9ae9c0 100644
--- a/model/transcrowd_gap.py
+++ b/model/transcrowd_gap.py
@@ -20,13 +20,14 @@ from timm.models.vision_transformer import VisionTransformer, _cfg
 from timm.models.registry import register_model
 from timm.models.layers import trunc_normal_
 
-class VisionTransformer_GAP(VisionTransformer):
+class VisionTransformerGAPwithFeatureMap(VisionTransformer):
     # [XXX] It might be a bad idea to use vision transformer for small datasets.
     # ref: ViT paper -- "transformers lack some of the inductive biases inherent
     # to CNNs, such as translation equivariance and locality".
     # convolution is specifically equivariant in translation (linear and
     # shift-equivariant), specifically.
-    # tl;dr: CNNs might perform better for small datasets. Not sure abt performance.
+    # tl;dr: CNNs might perform better for small datasets AND should perform
+    # better for embedded systems.
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -39,7 +40,7 @@ class VisionTransformer_GAP(VisionTransformer):
         # Fill self.pos_embed with N(0, 1) truncated to |0.2 * std|.
         trunc_normal_(self.pos_embed, std=.02)
 
-        # The "regression head" (I think? [XXX])
+        # The "regression head"
         self.output1 = nn.ModuleDict({
             "output1.relu0": nn.ReLU(),
             "output1.linear0": nn.Linear(in_features=6912 * 4, out_features=128),
@@ -49,35 +50,46 @@ class VisionTransformer_GAP(VisionTransformer):
         })
         self.output1.apply(self._init_weights)
 
+        # Attention map, which we use for training
+        self.attention_map = torch.Tensor(np.zeros((1152, 768)))  # (3, 2) resized imgs
+
     def forward_features(self, x):
         B = x.shape[0]
+
+        # 3.2 Patch embed
         x = self.patch_embed(x)
 
-        # [XXX] Why do we need class token here? (ref. prev papers)
+        # ViT: Classification token
+        # This idea originated from BERT.
+        # Essentially, because we are performing encoding without decoding, we
+        # cannot fix the output dimensionality -- which the classification
+        # problem absolutely needs. Instead, we use the classification token as
+        # the sole input which the transformer would need to learn to encode
+        # whatever it learnt from the input into that token.
+        # Source: https://datascience.stackexchange.com/a/110637
+        # That said, I don't think this is useful in this case...
         cls_tokens = self.cls_token.expand(B, -1, -1)
-        x = torch.cat((cls_tokens, x), dim=1)  # Concatenate along j
+        x = torch.cat((cls_tokens, x), dim=1)  # [[cls_token, x_i, ...]...]
 
-        # 3.2 Patch embedding
         x = x + self.pos_embed
-        x = self.pos_drop(x) # [XXX] Drop some patches out -- or not?
+        x = self.pos_drop(x)  # [XXX] Drop some patches out -- or not?
 
         # 3.3 Transformer-encoder
         for block in self.blocks:
             x = block(x)
-        # [TODO] Interpret
+        # Normalize
         x = self.norm(x)
 
+        # Remove the classification token
         x = x[:, 1:]
 
         return x
 
     def forward(self, x):
-        x = self.forward_features(x)
+        x = self.forward_features(x)  # Compute encoding
         x = F.adaptive_avg_pool1d(x, (48))
-        x = x.view(x.shape[0], -1)
-        x = self.output1(x)
+        x = x.view(x.shape[0], -1)  # Flatten for the regression head
+        # Resized to ???
+        x = self.output1(x)  # Regression head
         return x
-
-
-
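As a sanity check on the renamed PerspectiveEstimator layers, here is a minimal sketch of how they chain together; the kernel size, padding, dilation, and pool_capacity below are placeholder values, not the module's real hyperparameters. It also suggests an answer to the [?] above: the (N, 1, K, 1) pooling output does need an explicit reshape to (N, K) before revpers_fc0, since nn.Linear acts on the last dimension.

    import torch
    import torch.nn as nn

    N, C, H, W = 2, 768, 24, 32   # placeholder input shape; the real one comes from input_shape
    K = 16                        # placeholder pool_capacity

    layer_dict = nn.ModuleDict({
        'revpers_dilated_conv0': nn.Conv2d(C, 1, kernel_size=3, padding=2, dilation=2),
        'revpers_avg_pooling0': nn.AdaptiveAvgPool2d(output_size=(K, 1)),
        'revpers_fc0': nn.Linear(in_features=K, out_features=1),
    })

    x = torch.randn(N, C, H, W)
    x = layer_dict['revpers_dilated_conv0'](x)  # (N, 1, H, W)
    x = layer_dict['revpers_avg_pooling0'](x)   # (N, 1, K, 1)
    x = x.view(N, -1)                           # (N, K): flatten before the linear layer
    x = layer_dict['revpers_fc0'](x)            # (N, 1)
    print(x.shape)                              # torch.Size([2, 1])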
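For the GAP head in forward, a minimal shape walk-through, assuming 576 patch tokens of dimension 768 (e.g. a 384x384 input with 16x16 patches -- an assumption, not stated in this diff): pooling each token from 768 down to 48 and flattening gives 576 * 48 = 27648 = 6912 * 4 features, which matches output1.linear0's in_features.

    import torch
    import torch.nn.functional as F

    B = 2                              # arbitrary batch size
    x = torch.randn(B, 576, 768)       # forward_features output, classification token removed
    x = F.adaptive_avg_pool1d(x, 48)   # (B, 576, 48): each token pooled 768 -> 48
    x = x.view(x.shape[0], -1)         # (B, 27648) == (B, 6912 * 4), fed to the regression head
    print(x.shape)                     # torch.Size([2, 27648])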