From 38a16e75fe91ee430c7f3fb1494def741c5f09f1 Mon Sep 17 00:00:00 2001
From: rubberhead
Date: Tue, 30 Jan 2024 16:31:27 +0000
Subject: [PATCH] Starting work on reverse perspective network

---
 model/reverse_perspective.py | 106 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 model/reverse_perspective.py

diff --git a/model/reverse_perspective.py b/model/reverse_perspective.py
new file mode 100644
index 0000000..472551e
--- /dev/null
+++ b/model/reverse_perspective.py
@@ -0,0 +1,106 @@
+r"""Reverse Perspective Network Architectural Layers.
+
+The *Reverse Perspective Network* [#]_ is a general approach to input
+pre-processing for instance segmentation / density map generation tasks.
+Roughly speaking, it models the input image in an elliptic coordinate system
+and learns a focal-length modifier parameter with which to apply a
+perspective transformation to the input images.
+
+.. [#] Yang, Y., Li, G., Wu, Z., Su, L., Huang, Q., & Sebe, N. (2020).
+    Reverse perspective network for perspective-aware object counting.
+    In Proceedings of the IEEE/CVF Conference on Computer Vision and
+    Pattern Recognition (pp. 4374-4383).
+"""
+
+from typing import Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class PerspectiveEstimator(nn.Module):
+    """
+    Perspective estimator submodule of the wider reverse-perspective network.
+
+    Input: Pre-processed, uniformly-sized image data
+    Output: Perspective factor
+
+    :param input_shape: (N, C, H, W)
+    :param conv_kernel_shape: Oriented as (H, W)
+    :param conv_dilation: dilation factor, applied equally along H and W
+    :param pool_capacity: number of bins K that each (H, W) map is pooled into
+    :param conv_padding: must be chosen such that the dilated convolution
+        preserves (H, W), e.g. padding 2 for a 3x3 kernel at dilation 2
+    :param epsilon: small positive offset keeping the output factor bounded
+        away from zero. Hyperparameter.
+    """
+    def __init__(
+        self,
+        input_shape: Tuple[int, int, int, int],
+        conv_kernel_shape: Tuple[int, int],
+        conv_dilation: int,  # equal dilation along H and W for now
+        pool_capacity: int,
+        conv_padding: int = 0,
+        conv_padding_mode: str = 'zeros',
+        conv_stride: int = 1,
+        epsilon: float = 1e-5,
+        *args, **kwargs
+    ) -> None:
+        super().__init__(*args, **kwargs)
+
+        # N.B. input_shape has size (N, C_in, H_in, W_in)
+        (_, _, height, width) = input_shape
+
+        # Sanity check: the dilated convolution must preserve (H, W).
+        (_conv_height, _conv_width) = (
+            np.floor(
+                (height + 2 * conv_padding - conv_dilation * (conv_kernel_shape[0] - 1) - 1)
+                / conv_stride
+                + 1
+            ),
+            np.floor(
+                (width + 2 * conv_padding - conv_dilation * (conv_kernel_shape[1] - 1) - 1)
+                / conv_stride
+                + 1
+            )
+        )
+        assert height == _conv_height and width == _conv_width, \
+            "conv parameters do not preserve the (H, W) of the input"
+
+        self.epsilon = epsilon
+        self.input_shape = input_shape
+        self.layer_dict = nn.ModuleDict({
+            'dilated_conv': nn.Conv2d(
+                in_channels=self.input_shape[1], out_channels=1,
+                kernel_size=conv_kernel_shape,
+                padding=conv_padding,
+                padding_mode=conv_padding_mode,
+                stride=conv_stride,
+                dilation=conv_dilation,
+            ),  # (N, 1, H, W)
+            'avg_pooling': nn.AdaptiveAvgPool2d(
+                output_size=(pool_capacity, 1)
+            ),  # (N, 1, K, 1)
+            # nn.Linear consumes the last dimension, so flatten (N, 1, K, 1)
+            # down to (N, K) before the fully-connected layer.
+            'flatten': nn.Flatten(start_dim=1),  # (N, K)
+            'fc': nn.Linear(
+                in_features=pool_capacity,
+                out_features=1,
+            ),  # (N, 1)
+        })
+
+    def forward(self, x):
+        out = x
+
+        # Forward through layers -- there are no activations etc. in-between
+        for layer in self.layer_dict.values():
+            out = layer(out)
+
+        # Squash to (0, 1] via exp(-ReLU(x)), then add epsilon so the factor
+        # stays strictly positive, i.e. lies in (epsilon, 1 + epsilon].
+        F.relu_(out)  # in-place
+        out = torch.exp(-out) + self.epsilon
+
+        return out
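
For reviewers, a minimal usage sketch of the module added above (the batch
shape and hyperparameters here are illustrative assumptions, not values fixed
by this patch). At stride 1 the sanity check requires
2 * conv_padding == conv_dilation * (kernel_size - 1), e.g. padding 2 for a
3x3 kernel at dilation 2:

    import torch

    from model.reverse_perspective import PerspectiveEstimator

    # Illustrative input: a batch of 8 RGB images at 64x64.
    estimator = PerspectiveEstimator(
        input_shape=(8, 3, 64, 64),
        conv_kernel_shape=(3, 3),
        conv_dilation=2,
        pool_capacity=16,
        conv_padding=2,  # 2 * 2 == 2 * (3 - 1), so (H, W) is preserved
    )

    x = torch.randn(8, 3, 64, 64)
    factor = estimator(x)
    print(factor.shape)              # torch.Size([8, 1]): one factor per image
    print(bool((factor > 0).all()))  # True: the factor is strictly positive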
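
On the final squashing step: it maps a raw score s to exp(-max(s, 0)) +
epsilon, i.e. into (epsilon, 1 + epsilon]. Non-positive scores saturate at
1 + epsilon, while increasingly positive scores push the perspective factor
toward epsilon. A standalone check of that mapping, independent of the module
(the concrete score values are illustrative):

    import torch

    eps = 1e-5
    scores = torch.tensor([-3.0, 0.0, 1.0, 10.0])
    factors = torch.exp(-torch.relu(scores)) + eps
    print(factors)  # ~[1.0000, 1.0000, 0.3679, 0.0000] (each + eps),
                    # non-increasing once the score turns positive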