Loss revamp & Renamed model to network
parent 0d35d607fe
commit 9d2a30a226
7 changed files with 44 additions and 32 deletions
@@ -1,162 +0,0 @@
r"""Reverse Perspective Network Architectural Layers.
|
||||
|
||||
The *Reverse Perspective Network* [#]_ is a general approach to input
|
||||
pre-processing for instance segmentation / density map generation tasks.
|
||||
Roughly speaking, it models the input image into a elliptic coordinate system
|
||||
and tries to learn a foci length modifier parameter to perform perspective
|
||||
transformation on input images.
|
||||
|
||||
.. [#] Yang, Y., Li, G., Wu, Z., Su, L., Huang, Q., & Sebe, N. (2020).
|
||||
Reverse perspective network for perspective-aware object counting.
|
||||
In Proceedings of the IEEE/CVF conference on computer vision and pattern
|
||||
recognition (pp. 4374-4383).
|
||||
"""
|
||||
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
class PerspectiveEstimator(nn.Module):
    """
    Perspective estimator submodule of the wider reverse-perspective network.

    Input: Pre-processed, uniformly-sized image data
    Output: Perspective factor :math:`\\in \\mathbb{R}`

    Note
    ----
    The loss input needs to be computed from the output of the **entire**
    reverse-perspective network. It therefore needs to compute:

    - The effective pixels of each row after transformation.
    - The feature density (count) along each row, summed over columns.

    The loss is computed as the variance over row feature densities (see
    Sec. 3.2 of the paper). After all, it is reasonable to say that you see
    more when you look at faraway places.

    The paper utilizes an unsupervised loss -- "row feature density" refers to
    the density of features computed from a pre-trained counting network's
    output (CSRNet in the paper; see the notes above ``warped_output_loss``).

    :param input_shape: (N, C, H, W)
    :param conv_kernel_shape: Oriented as (H, W)
    :param conv_dilation: equidistant dilation factor along H, W
    :param pool_capacity: number of bins K that each (H, W) map is adaptively
        pooled into
    :param conv_padding: padding of the dilated convolution (must preserve H, W)
    :param conv_padding_mode: padding mode of the dilated convolution
    :param conv_stride: stride of the dilated convolution
    :param epsilon: small constant keeping the output factor strictly positive
    """
    def __init__(
        self,
        input_shape: Tuple[int, int, int, int],
        conv_kernel_shape: Tuple[int, int],
        conv_dilation: int,  # We will do equidistant dilation along H, W for now
        pool_capacity: int,
        conv_padding: int = 0,
        conv_padding_mode: str = 'zeros',
        conv_stride: int = 1,
        epsilon: float = 1e-5,
        *args, **kwargs
    ) -> None:
        # N.B. input_shape has size (N, C_in, H_in, W_in)
        (_, _, height, width) = input_shape

        # Sanity check: the dilated convolution below must preserve H and W.
        # [TODO] Maybe this is unnecessary -- we could automatically suggest
        # compatible parameters -- but for now let's just assert.
        (_conv_height, _conv_width) = (
            np.floor(
                (height + 2 * conv_padding - conv_dilation * (conv_kernel_shape[0] - 1) - 1)
                / conv_stride
                + 1
            ),
            np.floor(
                (width + 2 * conv_padding - conv_dilation * (conv_kernel_shape[1] - 1) - 1)
                / conv_stride
                + 1
            )
        )
        assert height == _conv_height and width == _conv_width, \
            'convolution parameters must preserve the input spatial size'
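        # Worked example: with conv_kernel_shape=(3, 3), conv_dilation=1,
        # conv_padding=1, conv_stride=1, the standard Conv2d shape formula above
        # gives floor((H + 2*1 - 1*(3 - 1) - 1) / 1 + 1) = H, so the spatial
        # size is preserved and the assert passes.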

        super().__init__(*args, **kwargs)
        self.epsilon = epsilon
        self.input_shape = input_shape
        self.layer_dict = nn.ModuleDict({
            'revpers_dilated_conv0': nn.Conv2d(
                in_channels=self.input_shape[1], out_channels=1,
                kernel_size=conv_kernel_shape,
                padding=conv_padding,
                padding_mode=conv_padding_mode,
                stride=conv_stride,
                dilation=conv_dilation,
            ),  # (N, 1, H, W)
            'revpers_avg_pool0': nn.AdaptiveAvgPool2d(
                output_size=(pool_capacity, 1)
            ),  # (N, 1, K, 1)
            # The (N, 1, K, 1) pool output is flattened to (N, K) in forward()
            # before this fully-connected layer.
            'revpers_fc0': nn.Linear(
                in_features=pool_capacity,
                out_features=1,
            ),
        })

    def forward(self, x):
        # Forward through the layers -- there are no activations etc.
        # in-between, only a flatten so the pooled map fits the
        # fully-connected layer.
        out = self.layer_dict['revpers_dilated_conv0'](x)  # (N, 1, H, W)
        out = self.layer_dict['revpers_avg_pool0'](out)    # (N, 1, K, 1)
        out = out.flatten(start_dim=1)                     # (N, K)
        out = self.layer_dict['revpers_fc0'](out)          # (N, 1)

        # Normalize into (0, 1]: relu clamps to [0, inf), exp(-x) maps that
        # onto (0, 1], and epsilon keeps the factor strictly positive.
        out = F.relu(out, inplace=True)
        out = torch.exp(-out) + self.epsilon

        return out
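

# Minimal usage sketch: parameters are chosen so the shape assert in __init__
# holds, i.e. 2 * conv_padding == conv_dilation * (kernel - 1) with stride 1:
#
#     estimator = PerspectiveEstimator(
#         input_shape=(8, 3, 224, 224),
#         conv_kernel_shape=(3, 3),
#         conv_dilation=1,
#         pool_capacity=16,
#         conv_padding=1,
#     )
#     factors = estimator(torch.rand(8, 3, 224, 224))  # (8, 1), in (epsilon, 1 + epsilon]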


# def unsupervised_loss(predictions, targets):

# [TODO] We need a modified loss -- one that takes advantage of attention
# instead of the feature map. They should arguably behave alike, but who knows.
# [XXX] No, forget it: we pre-train the reverse-perspective module as described
# in the 2020 paper, i.e., using CSRNet.
# Not sure which part the feature map is derived from. Maybe after the front-end?
# In any case we can always just use the CSRNet output (inferred density map)
# as the feature map, from which we compute, for each image:
#   criterion = Variance([output.sum(axis=W) * effective_pixel_per_row])
# (sketched below). In other cases one sums over channels, i.e., over each
# feature map / filter output. Not sure what "channel" means in this case...
def warped_output_loss(csrnet_pred):
    N, H, W = csrnet_pred.shape
    row_density = csrnet_pred.sum(dim=2)  # (N, H): feature density of each row
    return row_density.var(dim=1).mean()  # variance over rows, averaged over the batch
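

# A fuller sketch of the unsupervised loss described above, weighting each
# row's density by its effective pixel count after the warp, per the
# criterion noted earlier. The argument name `effective_pixels_per_row` is
# hypothetical -- it would have to be derived from the perspective
# transformation itself.
def warped_output_loss_weighted(
    csrnet_pred: torch.Tensor,               # (N, H, W) inferred density maps
    effective_pixels_per_row: torch.Tensor,  # (N, H) or (H,) pixels surviving the warp
) -> torch.Tensor:
    row_density = csrnet_pred.sum(dim=2) * effective_pixels_per_row  # (N, H)
    return row_density.var(dim=1).mean()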


def transform_coordinates(
    img: torch.Tensor,  # (C, W, H)
    factor: float,
    in_place: bool = True  # [TODO] currently ignored; the ops below always mutate img
):
    dev_of_img = img.device

    # Normalize X coords to [0, pi].
    # The per-channel bounds are reshaped to (3, 1, 1) so they broadcast over
    # the channel dimension of the (C, W, H) image (assumes C == 3).
    min_x = torch.tensor([0., 0., 0.], device=dev_of_img).view(-1, 1, 1)
    max_x = torch.tensor([0., np.pi, 0.], device=dev_of_img).view(-1, 1, 1)
    min_xdim = torch.min(img, dim=1, keepdim=True)[0]
    max_xdim = torch.max(img, dim=1, keepdim=True)[0]
    (img.sub_(min_xdim)
        .div_(max_xdim - min_xdim)
        .mul_(max_x - min_x)
        .add_(min_x))

    # Normalize Y coords to [0, 1].
    # [?] As written this rescales the same (second) channel as the X pass;
    # [0., 0., 1.] may have been intended.
    min_y = torch.tensor([0., 0., 0.], device=dev_of_img).view(-1, 1, 1)
    max_y = torch.tensor([0., 1., 0.], device=dev_of_img).view(-1, 1, 1)
    min_ydim = torch.min(img, dim=2, keepdim=True)[0]
    max_ydim = torch.max(img, dim=2, keepdim=True)[0]
    (img.sub_(min_ydim)
        .div_(max_ydim - min_ydim)
        .mul_(max_y - min_y)
        .add_(min_y))

    # Do elliptical transformation
    # [TODO] not implemented yet -- see the sketch below for one possible version.
    tmp = img.clone().detach()
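

# Sketch of the missing elliptical step, using standard elliptic coordinates
# x = c * cosh(mu) * cos(nu), y = c * sinh(mu) * sin(nu) with foci at (+-c, 0).
# Illustration only: the exact warp of the RPN paper may differ. It assumes the
# second channel holds the angular coordinate nu (normalized to [0, pi] above)
# and the third the radial coordinate mu (normalized to [0, 1]); `factor` plays
# the role of the learned foci-length modifier scaling c. The helper name
# `elliptic_warp` is hypothetical.
def elliptic_warp(coords: torch.Tensor, factor: float, c: float = 1.0) -> torch.Tensor:
    nu = coords[1]  # angular coordinate in [0, pi]
    mu = coords[2]  # radial coordinate in [0, 1]
    x = factor * c * torch.cosh(mu) * torch.cos(nu)
    y = factor * c * torch.sinh(mu) * torch.sin(nu)
    return torch.stack((coords[0], x, y), dim=0)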