From 8e0e82f67a2098024aadf34200fe77ddad23c310 Mon Sep 17 00:00:00 2001
From: rubberhead
Date: Mon, 5 Feb 2024 14:39:00 +0000
Subject: [PATCH] Added comment

---
 model/reverse_perspective.py | 2 +-
 model/transcrowd_gap.py      | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/model/reverse_perspective.py b/model/reverse_perspective.py
index d807c89..3bdcaaf 100644
--- a/model/reverse_perspective.py
+++ b/model/reverse_perspective.py
@@ -24,7 +24,7 @@ class PerspectiveEstimator(nn.Module):
     Perspective estimator submodule of the wider reverse-perspective network.
 
     Input: Pre-processed, uniformly-sized image data
-    Output: Perspective factor
+    Output: Perspective factor :math:`\\in \\mathbb{R}`
 
     **Note**
     --------
diff --git a/model/transcrowd_gap.py b/model/transcrowd_gap.py
index 48fc520..5624b9f 100644
--- a/model/transcrowd_gap.py
+++ b/model/transcrowd_gap.py
@@ -21,6 +21,13 @@ from timm.models.registry import register_model
 from timm.models.layers import trunc_normal_
 
 class VisionTransformer_GAP(VisionTransformer):
+    # [XXX] It might be a bad idea to use a vision transformer for small datasets.
+    # ref: ViT paper -- "transformers lack some of the inductive biases inherent
+    # to CNNs, such as translation equivariance and locality".
+    # Convolution in particular is translation-equivariant (it is linear and
+    # shift-equivariant).
+    # tl;dr: CNNs might perform better for small datasets. Not sure about performance.
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         num_patches = self.patch_embed.num_patches
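
A minimal sketch of the translation-equivariance point made in the comment above, assuming a PyTorch environment (the `conv` layer, tensor sizes, and shift amount are illustrative only, not taken from the repo). It checks that convolving a shifted image gives, away from the padded border, the shifted result of convolving the original image; a ViT patch embedding with learned absolute position embeddings does not provide this property for arbitrary pixel shifts.

    # Illustrative check: convolution commutes with spatial shifts
    # (translation equivariance), up to effects at the padded image border.
    import torch
    import torch.nn as nn

    conv = nn.Conv2d(1, 1, kernel_size=3, padding=1, bias=False)
    x = torch.randn(1, 1, 8, 8)

    with torch.no_grad():
        conv_then_shift = torch.roll(conv(x), shifts=2, dims=-1)
        shift_then_conv = conv(torch.roll(x, shifts=2, dims=-1))

    # Interior columns (away from the zero-padded / wrapped border) agree.
    print(torch.allclose(conv_then_shift[..., 3:-1],
                         shift_then_conv[..., 3:-1], atol=1e-6))

This built-in equivariance is the inductive bias the quoted ViT paper refers to; a transformer has to learn it from data, which is harder when the dataset is small.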