Added comment
This commit is contained in:
parent 8e0e82f67a
commit b6d2460060
2 changed files with 31 additions and 17 deletions
@@ -82,7 +82,7 @@ class PerspectiveEstimator(nn.Module):
         self.epsilon = epsilon
         self.input_shape = input_shape
         self.layer_dict = nn.ModuleDict({
-            'dilated_conv': nn.Conv2d(
+            'revpers_dilated_conv0': nn.Conv2d(
                 in_channels=self.input_shape[1], out_channels=1,
                 kernel_size=conv_kernel_shape,
                 padding=conv_padding,
@@ -90,11 +90,11 @@ class PerspectiveEstimator(nn.Module):
                 stride=conv_stride,
                 dilation=conv_dilation,
             ),  # (N, 1, H, W)
-            'avg_pooling': nn.AdaptiveAvgPool2d(
+            'revpers_avg_pooling0': nn.AdaptiveAvgPool2d(
                 output_size=(pool_capacity, 1)
             ),  # (N, 1, K, 1)
             # [?] Do we need to explicitly translate to (N, K) here?
-            'fc': nn.Linear(
+            'revpers_fc0': nn.Linear(
                 in_features=pool_capacity,
                 out_features=1,
             ),
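For reference, a minimal sketch (not part of this commit) of how the three renamed layers could be chained end to end. The channel count, kernel size, and K = 16 below are made-up stand-ins for conv_kernel_shape, pool_capacity, and friends. It also suggests an answer to the "[?]" above: nn.Linear consumes the last dimension, so an explicit flatten from (N, 1, K, 1) to (N, K) is needed before 'revpers_fc0'.

import torch
import torch.nn as nn

# Hypothetical instantiation; all hyperparameters here are illustrative only.
layer_dict = nn.ModuleDict({
    'revpers_dilated_conv0': nn.Conv2d(in_channels=3, out_channels=1,
                                       kernel_size=3, padding=2,
                                       stride=1, dilation=2),
    'revpers_avg_pooling0': nn.AdaptiveAvgPool2d(output_size=(16, 1)),  # K = 16
    'revpers_fc0': nn.Linear(in_features=16, out_features=1),
})

x = torch.randn(4, 3, 64, 64)                   # (N, C, H, W)
x = layer_dict['revpers_dilated_conv0'](x)      # (N, 1, H, W)
x = layer_dict['revpers_avg_pooling0'](x)       # (N, 1, K, 1)
x = x.flatten(start_dim=1)                      # (N, K) -- the explicit reshape
x = layer_dict['revpers_fc0'](x)                # (N, 1)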
@@ -115,3 +115,5 @@ class PerspectiveEstimator(nn.Module):

 # def unsupervised_loss(predictions, targets):

+# [TODO] We need a modified loss -- one that takes advantage of attention
+# instead of feature maps. I expect the two to behave similarly, but that is untested.
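One loudly hypothetical reading of that TODO, sketched below: supervise the model's attention map directly against a target map, rather than comparing feature maps. Everything here (the attention_loss name, the (B, H, W) map shapes, the KL formulation) is invented for illustration; the commit defines none of it.

import torch
import torch.nn.functional as F

def attention_loss(pred_attention: torch.Tensor,
                   target_attention: torch.Tensor) -> torch.Tensor:
    """Hypothetical loss: match predicted vs. target attention maps.

    Both tensors are assumed to be (B, H, W); each map is normalized to a
    distribution so the loss compares *where* the model attends, not raw
    magnitudes.
    """
    B = pred_attention.shape[0]
    pred = F.log_softmax(pred_attention.view(B, -1), dim=1)
    target = F.softmax(target_attention.view(B, -1), dim=1)
    return F.kl_div(pred, target, reduction='batchmean')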
@@ -20,13 +20,14 @@ from timm.models.vision_transformer import VisionTransformer, _cfg
 from timm.models.registry import register_model
 from timm.models.layers import trunc_normal_

-class VisionTransformer_GAP(VisionTransformer):
+class VisionTransformerGAPwithFeatureMap(VisionTransformer):
     # [XXX] It might be a bad idea to use a vision transformer for small datasets.
     # ref: ViT paper -- "transformers lack some of the inductive biases inherent
     # to CNNs, such as translation equivariance and locality".
     # Convolution specifically is translation-equivariant (linear and
     # shift-equivariant).
-    # tl;dr: CNNs might perform better for small datasets. Not sure abt performance.
+    # tl;dr: CNNs might perform better for small datasets AND should perform
+    # better on embedded systems.

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -39,7 +40,7 @@ class VisionTransformer_GAP(VisionTransformer):
         # Fill self.pos_embed from N(0, 0.02^2), truncated to [-2, 2].
         trunc_normal_(self.pos_embed, std=.02)

-        # The "regression head" (I think? [XXX])
+        # The "regression head"
         self.output1 = nn.ModuleDict({
             "output1.relu0": nn.ReLU(),
             "output1.linear0": nn.Linear(in_features=6912 * 4, out_features=128),
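A quick sanity check of that init comment, assuming timm's default truncation bounds a=-2, b=2 (absolute cutoffs, not multiples of std):

import torch
from timm.models.layers import trunc_normal_

# Sample the same init as above and inspect it: values are drawn from
# N(0, 0.02^2) and clipped to timm's default bounds [-2, 2].
pos_embed = torch.empty(1, 197, 768)
trunc_normal_(pos_embed, std=.02)
print(pos_embed.std().item())              # roughly 0.02
print(pos_embed.abs().max().item() <= 2.)  # True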
@@ -49,15 +50,27 @@ class VisionTransformer_GAP(VisionTransformer):
         })
         self.output1.apply(self._init_weights)

+        # Attention map, which we use to train
+        self.attention_map = torch.Tensor(np.zeros((1152, 768)))  # (3, 2) resized imgs
+
     def forward_features(self, x):
         B = x.shape[0]

+        # 3.2 Patch embedding
         x = self.patch_embed(x)

-        # [XXX] Why do we need class token here? (ref. prev papers)
+        # ViT: Classification token
+        # The idea originated from BERT.
+        # Essentially, because we are performing encoding without decoding, we
+        # cannot fix the output dimensionality -- which the classification
+        # problem absolutely needs. Instead, we use the classification token as
+        # a fixed slot into which the transformer must learn to encode whatever
+        # it has learnt from the input.
+        # Source: https://datascience.stackexchange.com/a/110637
+        # That said, I don't think this is useful in this case...
         cls_tokens = self.cls_token.expand(B, -1, -1)
-        x = torch.cat((cls_tokens, x), dim=1)  # Concatenate along j
+        x = torch.cat((cls_tokens, x), dim=1)  # [[cls_token, x_i, ...] ...]

-        # 3.2 Patch embedding
         x = x + self.pos_embed
         x = self.pos_drop(x)  # [XXX] Drop some patches out -- or not?
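A standalone shape walkthrough of the token concatenation above. B=2, 196 patches, and embed dim 768 are illustrative numbers, not taken from the commit:

import torch

cls_token = torch.zeros(1, 1, 768)          # a learned nn.Parameter in the real model
x = torch.randn(2, 196, 768)                # (B, num_patches, D) after patch_embed
cls_tokens = cls_token.expand(2, -1, -1)    # (B, 1, D): broadcast view, no copy
x = torch.cat((cls_tokens, x), dim=1)       # (B, 1 + num_patches, D)
print(x.shape)                              # torch.Size([2, 197, 768])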
@@ -65,19 +78,18 @@ class VisionTransformer_GAP(VisionTransformer):
         for block in self.blocks:
             x = block(x)

-        # [TODO] Interpret
+        # Normalize
         x = self.norm(x)

+        # Remove the classification token
         x = x[:, 1:]

         return x

     def forward(self, x):
-        x = self.forward_features(x)
+        x = self.forward_features(x)  # Compute the encoding
         x = F.adaptive_avg_pool1d(x, (48))
-        x = x.view(x.shape[0], -1)
-        x = self.output1(x)
+        x = x.view(x.shape[0], -1)  # Flatten for the regression head
+        # Resized to ???
+        x = self.output1(x)  # Regression head
         return x
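Two hedged observations on that forward, sketched below. First, the shapes only line up if forward_features returns 576 patch tokens: (B, 576, 768) is pooled to (B, 576, 48), then flattened to (B, 27648), which matches nn.Linear(in_features=6912 * 4). Second, nn.ModuleDict has no forward, so calling self.output1(x) directly raises a TypeError; the sketch swaps in nn.Sequential, which is my assumption, not what the commit does, and the final 128 -> 1 layer is likewise hypothetical (the diff truncates the head's definition).

import torch
import torch.nn as nn
import torch.nn.functional as F

# Stand-in regression head: nn.Sequential *is* callable, unlike nn.ModuleDict.
output1 = nn.Sequential(
    nn.ReLU(),
    nn.Linear(in_features=6912 * 4, out_features=128),
    nn.Linear(in_features=128, out_features=1),   # hypothetical final layer
)

x = torch.randn(2, 576, 768)        # assumed forward_features output (B, N, D)
x = F.adaptive_avg_pool1d(x, 48)    # (B, 576, 48): pools over the last dimension
x = x.view(x.shape[0], -1)          # (B, 27648) == (B, 6912 * 4)
x = output1(x)                      # (B, 1)
print(x.shape)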