r"""Reverse Perspective Network Architectural Layers. The *Reverse Perspective Network* [#]_ is a general approach to input pre-processing for instance segmentation / density map generation tasks. Roughly speaking, it models the input image into a elliptic coordinate system and tries to learn a foci length modifier parameter to perform perspective transformation on input images. .. [#] Yang, Y., Li, G., Wu, Z., Su, L., Huang, Q., & Sebe, N. (2020). Reverse perspective network for perspective-aware object counting. In Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (pp. 4374-4383). """ from typing import List, Tuple import numpy as np import torch import torch.nn as nn import torch.nn.functional as F class PerspectiveEstimator(nn.Module): """ Perspective estimator submodule of the wider reverse-perspective network. Input: Pre-processed, uniformly-sized image data Output: Perspective factor :math:`\\in \\mathbb{R}` **Note** -------- Loss input needs to be computed from beyond the **entire** rev-perspective network. Needs to therefore compute: - Effective pixel of each row after transformation. - Feature density (count) along row, summed over column. Loss is computed as a variance over row feature densities. Ref. paper 3.2. After all, it is reasonable to say that you see more when you look at faraway places. The paper utilizes a unsupervised loss -- "row feature density" refers to the density of features computed from ? :param input_shape: (N, C, H, W) :param conv_kernel_shape: Oriented as (H, W) :param conv_dilation: equidistance dilation factor along H, W :param pool_capacity: K-number of classes for each (H, W) to be pooled into :param epsilon: Hyperparameter. """ def __init__( self, input_shape: Tuple[int, int, int, int], conv_kernel_shape: Tuple[int, int], conv_dilation: int, # We will do equidistance dilation along H, W for now pool_capacity: int, conv_padding: int = 0, conv_padding_mode: str = 'zeros', conv_stride: int = 1, epsilon: float = 1e-5, *args, **kwargs ) -> None: # N.B. input_shape has size (N, C_in, H_in, W_in) (_, _, height, width) = input_shape # Sanity checking # [TODO] Maybe this is unnecessary, maybe we can automatically suggest new params, # but right now let's just do this... (_conv_height, _conv_width) = ( np.floor( (height + 2 * conv_padding - conv_dilation * (conv_kernel_shape[0] - 1) - 1) / conv_stride + 1 ), np.floor( (width + 2 * conv_padding - conv_dilation * (conv_kernel_shape[1] - 1) - 1) / conv_stride + 1 ) ) assert(height == _conv_height and width == _conv_width) super.__init__(self, *args, **kwargs) self.epsilon = epsilon self.input_shape = input_shape self.layer_dict = nn.ModuleDict({ 'revpers_dilated_conv0': nn.Conv2d( in_channels=self.input_shape[1], out_channels=1, kernel_size=conv_kernel_shape, padding=conv_padding, padding_mode = conv_padding_mode, stride=conv_stride, dilation=conv_dilation, ), # (N, 1, H, W) 'revpers_avg_pooling0': nn.AdaptiveAvgPool2d( output_size=(pool_capacity, 1) ), # (N, 1, K, 1) # [?] Do we need to explicitly translate to (N, K) here? 'revpers_fc0': nn.Linear( in_features=pool_capacity, out_features=1, ), }) def forward(self, x): out = x # Forward through layers -- there are no activations etc. 
in-between for (_, layer) in self.layer_dict: out = layer.forward(out) # Normalize in (0, 1] F.relu_(out) # in-place out = torch.exp(-out) + self.epsilon return out # def unsupervised_loss(predictions, targets): # [TODO] We need a modified loss -- one that takes advantage of attention instead # of feature map. I feel like they should work likewise but who knows
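

# The class docstring describes the unsupervised loss as the variance over
# row feature densities (paper Sec. 3.2). Below is a minimal sketch of that
# idea, assuming a post-transformation feature map and a validity mask as
# inputs; the function name, signature, and shapes are illustrative
# assumptions, not part of the reference implementation.
def row_density_variance_loss(
    feature_map: torch.Tensor,
    valid_mask: torch.Tensor,
    epsilon: float = 1e-5,
) -> torch.Tensor:
    """Variance of per-row feature densities (illustrative sketch).

    :param feature_map: (N, 1, H, W) feature / density map obtained *after*
        the reverse-perspective transformation (assumed shape).
    :param valid_mask: (N, 1, H, W) binary mask of pixels that remain valid
        after the transformation (assumed shape); its row sums give the
        effective pixels of each row.
    :param epsilon: numerical stabiliser for the per-row division.
    """
    # Feature count of each row, summed over the column (W) dimension.
    row_feature_sum = feature_map.sum(dim=3)        # (N, 1, H)
    # Effective pixels of each row after transformation.
    row_effective_pixels = valid_mask.sum(dim=3)    # (N, 1, H)
    # Row feature density = features per effective pixel.
    row_density = row_feature_sum / (row_effective_pixels + epsilon)
    # Loss: variance of the densities across rows, averaged over the batch.
    return row_density.var(dim=2).mean()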
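

# Illustrative smoke test only; the shapes and hyperparameters below are
# arbitrary examples, not values prescribed by the paper. Note that the
# sanity check in __init__ requires conv parameters that preserve the
# spatial size, e.g. a 3x3 kernel with dilation 1 and padding 1.
if __name__ == '__main__':
    estimator = PerspectiveEstimator(
        input_shape=(4, 3, 64, 64),
        conv_kernel_shape=(3, 3),
        conv_dilation=1,
        pool_capacity=16,
        conv_padding=1,
    )
    dummy_input = torch.randn(4, 3, 64, 64)
    perspective_factor = estimator(dummy_input)
    print(perspective_factor.shape)  # torch.Size([4, 1]), values in (0, 1]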