r"""Reverse Perspective Network Architectural Layers.
|
|
|
|
The *Reverse Perspective Network* [#]_ is a general approach to input
|
|
pre-processing for instance segmentation / density map generation tasks.
|
|
Roughly speaking, it models the input image into a elliptic coordinate system
|
|
and tries to learn a foci length modifier parameter to perform perspective
|
|
transformation on input images.
|
|
|
|
.. [#] Yang, Y., Li, G., Wu, Z., Su, L., Huang, Q., & Sebe, N. (2020).
|
|
Reverse perspective network for perspective-aware object counting.
|
|
In Proceedings of the IEEE/CVF conference on computer vision and pattern
|
|
recognition (pp. 4374-4383).
|
|
"""
|
|
|
|
from typing import List, Tuple
|
|
|
|
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
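

# The helper below is only an illustrative sketch of the kind of row-wise
# perspective warp alluded to in the module docstring; it is *not* the
# transform from the paper. It assumes one scalar ``factor`` per image with
# shape (N, 1), as produced by ``PerspectiveEstimator.forward``, and magnifies
# rows near the top of the image (the far-away scene) more strongly than rows
# near the bottom, via ``F.grid_sample``.
def _example_row_wise_warp(image: torch.Tensor, factor: torch.Tensor) -> torch.Tensor:
    """Warp ``image`` (N, C, H, W) with a row-dependent horizontal magnification."""
    (n, _, height, width) = image.shape
    # Normalised sampling coordinates in [-1, 1], as expected by grid_sample.
    ys = torch.linspace(-1.0, 1.0, height, device=image.device)
    xs = torch.linspace(-1.0, 1.0, width, device=image.device)
    (grid_y, grid_x) = torch.meshgrid(ys, xs, indexing='ij')       # each (H, W)
    # Horizontal scale per row: 1.0 at the bottom row (grid_y = 1) and
    # ``factor`` at the top row (grid_y = -1); smaller factors magnify the
    # far-away rows more.
    row_scale = 1.0 - (1.0 - grid_y) / 2.0 * (1.0 - factor.view(n, 1, 1))
    grid_x = grid_x.unsqueeze(0) * row_scale                        # (N, H, W)
    grid_y = grid_y.unsqueeze(0).expand(n, -1, -1)                  # (N, H, W)
    grid = torch.stack((grid_x, grid_y), dim=-1)                    # (N, H, W, 2)
    return F.grid_sample(image, grid, mode='bilinear', align_corners=True)
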
class PerspectiveEstimator(nn.Module):
    """
    Perspective estimator submodule of the wider reverse-perspective network.

    Input: pre-processed, uniformly-sized image data.
    Output: a perspective factor per image.

    **Note**
    --------
    The loss input needs to be computed from the output of the **entire**
    reverse-perspective network, not from this submodule alone. It therefore
    needs to compute:

    - the effective number of pixels in each row after the transformation;
    - the feature density (count) along each row, summed over the columns.

    The loss is computed as the variance of the per-row feature densities
    (see Sec. 3.2 of the paper); a hedged sketch of that computation is given
    in ``_example_row_variance_loss`` at the bottom of this module. After all,
    it is reasonable to say that you see more when you look at faraway places.

    This does imply that **we need to obtain a reasonably good feature
    extractor for general images before training this submodule**. Hence, for
    now, we should probably work on the transformer first.

    :param input_shape: Input tensor shape, oriented as (N, C, H, W).
    :param conv_kernel_shape: Convolution kernel shape, oriented as (H, W).
    :param conv_dilation: Equidistant dilation factor along both H and W.
    :param pool_capacity: Number K of row bins each (H, W) map is pooled into.
    :param conv_padding: Padding applied along both H and W.
    :param conv_padding_mode: Padding mode passed to :class:`torch.nn.Conv2d`.
    :param conv_stride: Convolution stride along both H and W.
    :param epsilon: Small constant added to the output to keep the perspective
        factor bounded away from zero.
    """
    def __init__(
        self,
        input_shape: Tuple[int, int, int, int],
        conv_kernel_shape: Tuple[int, int],
        conv_dilation: int,  # We will do equidistant dilation along H, W for now
        pool_capacity: int,
        conv_padding: int = 0,
        conv_padding_mode: str = 'zeros',
        conv_stride: int = 1,
        epsilon: float = 1e-5,
        *args, **kwargs
    ) -> None:
        # N.B. input_shape has size (N, C_in, H_in, W_in)
        (_, _, height, width) = input_shape

        # Sanity checking: the dilated convolution below must preserve the
        # spatial size, i.e. the standard Conv2d output-size formula
        # floor((H_in + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1)
        # must give back (H_in, W_in).
        (_conv_height, _conv_width) = (
            np.floor(
                (height + 2 * conv_padding - conv_dilation * (conv_kernel_shape[0] - 1) - 1)
                / conv_stride
                + 1
            ),
            np.floor(
                (width + 2 * conv_padding - conv_dilation * (conv_kernel_shape[1] - 1) - 1)
                / conv_stride
                + 1
            )
        )
        assert height == _conv_height and width == _conv_width

        super().__init__(*args, **kwargs)
        self.epsilon = epsilon
        self.input_shape = input_shape
        self.layer_dict = nn.ModuleDict({
            'dilated_conv': nn.Conv2d(
                in_channels=self.input_shape[1], out_channels=1,
                kernel_size=conv_kernel_shape,
                padding=conv_padding,
                padding_mode=conv_padding_mode,
                stride=conv_stride,
                dilation=conv_dilation,
            ),  # (N, 1, H, W)
            'avg_pooling': nn.AdaptiveAvgPool2d(
                output_size=(pool_capacity, 1)
            ),  # (N, 1, K, 1)
            # The (N, 1, K, 1) pooling output is flattened to (N, K) in
            # ``forward`` before this fully-connected layer is applied.
            'fc': nn.Linear(
                in_features=pool_capacity,
                out_features=1,
            ),
        })

    def forward(self, x):
        out = x

        # Forward through the layers -- there are no activations etc. in-between.
        for (name, layer) in self.layer_dict.items():
            out = layer(out)
            if name == 'avg_pooling':
                # (N, 1, K, 1) -> (N, K), so that the final linear layer sees
                # ``pool_capacity`` input features.
                out = torch.flatten(out, start_dim=1)

        # Squash the raw score: relu_ clamps to [0, inf) in place, so exp(-out)
        # lies in (0, 1]; epsilon bounds the factor away from zero.
        F.relu_(out)  # in-place
        out = torch.exp(-out) + self.epsilon

        return out
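

# A hedged sketch of the loss described in the ``PerspectiveEstimator`` Note
# section (cf. Sec. 3.2 of the paper). It is an assumption about how that loss
# could look, not a faithful re-implementation: ``density_map`` stands for a
# feature / density map produced downstream of the full reverse-perspective
# network, and ``valid_mask`` marks the effective pixels of each row after the
# perspective transformation.
def _example_row_variance_loss(
    density_map: torch.Tensor,   # (N, H, W)
    valid_mask: torch.Tensor,    # (N, H, W), 1.0 where a pixel is effective
    eps: float = 1e-6,
) -> torch.Tensor:
    # Feature density of each row: counts summed over the columns, normalised
    # by the number of effective pixels in that row.
    row_counts = (density_map * valid_mask).sum(dim=-1)         # (N, H)
    effective_pixels = valid_mask.sum(dim=-1).clamp(min=eps)    # (N, H)
    row_density = row_counts / effective_pixels                 # (N, H)
    # The loss is the variance of the per-row densities, averaged over the batch.
    return row_density.var(dim=-1).mean()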
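

if __name__ == '__main__':
    # Minimal usage sketch / smoke test. The shapes and hyperparameters below
    # are illustrative assumptions, not values from the paper. With a 3x3
    # kernel, stride 1 and padding equal to the dilation, the dilated
    # convolution preserves the spatial size, so the constructor's sanity
    # check passes.
    example_shape = (2, 3, 64, 64)  # (N, C, H, W)
    estimator = PerspectiveEstimator(
        input_shape=example_shape,
        conv_kernel_shape=(3, 3),
        conv_dilation=2,
        pool_capacity=8,
        conv_padding=2,
    )
    dummy = torch.rand(example_shape)
    factor = estimator(dummy)
    print(factor.shape)  # torch.Size([2, 1])
    print(float(factor.min()), float(factor.max()))  # strictly positive, <= 1 + epsilon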