r"""Reverse Perspective Network Architectural Layers.
|
|
|
|
The *Reverse Perspective Network* [#]_ is a general approach to input
|
|
pre-processing for instance segmentation / density map generation tasks.
|
|
Roughly speaking, it models the input image into a elliptic coordinate system
|
|
and tries to learn a foci length modifier parameter to perform perspective
|
|
transformation on input images.
|
|
|
|
.. [#] Yang, Y., Li, G., Wu, Z., Su, L., Huang, Q., & Sebe, N. (2020).
|
|
Reverse perspective network for perspective-aware object counting.
|
|
In Proceedings of the IEEE/CVF conference on computer vision and pattern
|
|
recognition (pp. 4374-4383).
|
|
"""
|
|
|
|
from typing import List, Tuple
|
|
|
|
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
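

# The helper below is only an illustrative sketch of the kind of row-wise
# perspective warp alluded to in the module docstring; it is *not* the
# transform from the paper. It assumes one scalar ``factor`` per image with
# shape (N, 1), as produced by ``PerspectiveEstimator.forward``, and magnifies
# rows near the top of the image (the far-away scene) more strongly than rows
# near the bottom, via ``F.grid_sample``.
def _example_row_wise_warp(image: torch.Tensor, factor: torch.Tensor) -> torch.Tensor:
    """Warp ``image`` (N, C, H, W) with a row-dependent horizontal magnification."""
    (n, _, height, width) = image.shape
    # Normalised sampling coordinates in [-1, 1], as expected by grid_sample.
    ys = torch.linspace(-1.0, 1.0, height, device=image.device)
    xs = torch.linspace(-1.0, 1.0, width, device=image.device)
    (grid_y, grid_x) = torch.meshgrid(ys, xs, indexing='ij')       # each (H, W)
    # Horizontal scale per row: 1.0 at the bottom row (grid_y = 1) and
    # ``factor`` at the top row (grid_y = -1); smaller factors magnify the
    # far-away rows more.
    row_scale = 1.0 - (1.0 - grid_y) / 2.0 * (1.0 - factor.view(n, 1, 1))
    grid_x = grid_x.unsqueeze(0) * row_scale                        # (N, H, W)
    grid_y = grid_y.unsqueeze(0).expand(n, -1, -1)                  # (N, H, W)
    grid = torch.stack((grid_x, grid_y), dim=-1)                    # (N, H, W, 2)
    return F.grid_sample(image, grid, mode='bilinear', align_corners=True)
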
class PerspectiveEstimator(nn.Module):
    """
    Perspective estimator submodule of the wider reverse-perspective network.

    Input: pre-processed, uniformly-sized image data.
    Output: a perspective factor per image.

    **Note**
    --------
    The loss input needs to be computed from the output of the **entire**
    reverse-perspective network, not from this submodule alone. It therefore
    needs to compute:

    - the effective number of pixels in each row after the transformation;
    - the feature density (count) along each row, summed over the columns.

    The loss is computed as the variance of the per-row feature densities
    (see Sec. 3.2 of the paper); a hedged sketch of that computation is given
    in ``_example_row_variance_loss`` at the bottom of this module. After all,
    it is reasonable to say that you see more when you look at faraway places.

    This does imply that **we need to obtain a reasonably good feature
    extractor for general images before training this submodule**. Hence, for
    now, we should probably work on the transformer first.

    :param input_shape: Input tensor shape, oriented as (N, C, H, W).
    :param conv_kernel_shape: Convolution kernel shape, oriented as (H, W).
    :param conv_dilation: Equidistant dilation factor along both H and W.
    :param pool_capacity: Number K of row bins each (H, W) map is pooled into.
    :param conv_padding: Padding applied along both H and W.
    :param conv_padding_mode: Padding mode passed to :class:`torch.nn.Conv2d`.
    :param conv_stride: Convolution stride along both H and W.
    :param epsilon: Small constant added to the output to keep the perspective
        factor bounded away from zero.
    """
    def __init__(
        self,
        input_shape: Tuple[int, int, int, int],
        conv_kernel_shape: Tuple[int, int],
        conv_dilation: int,  # We will do equidistant dilation along H, W for now
        pool_capacity: int,
        conv_padding: int = 0,
        conv_padding_mode: str = 'zeros',
        conv_stride: int = 1,
        epsilon: float = 1e-5,
        *args, **kwargs
    ) -> None:
        # N.B. input_shape has size (N, C_in, H_in, W_in)
        (_, _, height, width) = input_shape

        # Sanity checking: the dilated convolution below must preserve the
        # spatial size, i.e. the standard Conv2d output-size formula
        # floor((H_in + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1)
        # must give back (H_in, W_in).
        (_conv_height, _conv_width) = (
            np.floor(
                (height + 2 * conv_padding - conv_dilation * (conv_kernel_shape[0] - 1) - 1)
                / conv_stride
                + 1
            ),
            np.floor(
                (width + 2 * conv_padding - conv_dilation * (conv_kernel_shape[1] - 1) - 1)
                / conv_stride
                + 1
            )
        )
        assert height == _conv_height and width == _conv_width

        super().__init__(*args, **kwargs)
        self.epsilon = epsilon
        self.input_shape = input_shape
        self.layer_dict = nn.ModuleDict({
            'dilated_conv': nn.Conv2d(
                in_channels=self.input_shape[1], out_channels=1,
                kernel_size=conv_kernel_shape,
                padding=conv_padding,
                padding_mode=conv_padding_mode,
                stride=conv_stride,
                dilation=conv_dilation,
            ),  # (N, 1, H, W)
            'avg_pooling': nn.AdaptiveAvgPool2d(
                output_size=(pool_capacity, 1)
            ),  # (N, 1, K, 1)
            # The (N, 1, K, 1) pooling output is flattened to (N, K) in
            # ``forward`` before this fully-connected layer is applied.
            'fc': nn.Linear(
                in_features=pool_capacity,
                out_features=1,
            ),
        })

    def forward(self, x):
        out = x

        # Forward through the layers -- there are no activations etc. in-between.
        for (name, layer) in self.layer_dict.items():
            out = layer(out)
            if name == 'avg_pooling':
                # (N, 1, K, 1) -> (N, K), so that the final linear layer sees
                # ``pool_capacity`` input features.
                out = torch.flatten(out, start_dim=1)

        # Squash the raw score: relu_ clamps to [0, inf) in place, so exp(-out)
        # lies in (0, 1]; epsilon bounds the factor away from zero.
        F.relu_(out)  # in-place
        out = torch.exp(-out) + self.epsilon

        return out
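

# A hedged sketch of the loss described in the ``PerspectiveEstimator`` Note
# section (cf. Sec. 3.2 of the paper). It is an assumption about how that loss
# could look, not a faithful re-implementation: ``density_map`` stands for a
# feature / density map produced downstream of the full reverse-perspective
# network, and ``valid_mask`` marks the effective pixels of each row after the
# perspective transformation.
def _example_row_variance_loss(
    density_map: torch.Tensor,   # (N, H, W)
    valid_mask: torch.Tensor,    # (N, H, W), 1.0 where a pixel is effective
    eps: float = 1e-6,
) -> torch.Tensor:
    # Feature density of each row: counts summed over the columns, normalised
    # by the number of effective pixels in that row.
    row_counts = (density_map * valid_mask).sum(dim=-1)         # (N, H)
    effective_pixels = valid_mask.sum(dim=-1).clamp(min=eps)    # (N, H)
    row_density = row_counts / effective_pixels                 # (N, H)
    # The loss is the variance of the per-row densities, averaged over the batch.
    return row_density.var(dim=-1).mean()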
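

if __name__ == '__main__':
    # Minimal usage sketch / smoke test. The shapes and hyperparameters below
    # are illustrative assumptions, not values from the paper. With a 3x3
    # kernel, stride 1 and padding equal to the dilation, the dilated
    # convolution preserves the spatial size, so the constructor's sanity
    # check passes.
    example_shape = (2, 3, 64, 64)  # (N, C, H, W)
    estimator = PerspectiveEstimator(
        input_shape=example_shape,
        conv_kernel_shape=(3, 3),
        conv_dilation=2,
        pool_capacity=8,
        conv_padding=2,
    )
    dummy = torch.rand(example_shape)
    factor = estimator(dummy)
    print(factor.shape)  # torch.Size([2, 1])
    print(float(factor.min()), float(factor.max()))  # strictly positive, <= 1 + epsilon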