From 38a16e75fe91ee430c7f3fb1494def741c5f09f1 Mon Sep 17 00:00:00 2001
From: rubberhead
Date: Tue, 30 Jan 2024 16:31:27 +0000
Subject: [PATCH] Starting work on reverse perspective network

---
 model/reverse_perspective.py | 106 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 model/reverse_perspective.py

diff --git a/model/reverse_perspective.py b/model/reverse_perspective.py
new file mode 100644
index 0000000..472551e
--- /dev/null
+++ b/model/reverse_perspective.py
@@ -0,0 +1,106 @@
+r"""Reverse Perspective Network Architectural Layers.
+
+The *Reverse Perspective Network* [#]_ is a general approach to input
+pre-processing for instance segmentation / density map generation tasks.
+Roughly speaking, it models the input image in an elliptic coordinate system
+and learns a focal-length modifier parameter with which to apply a
+perspective transformation to the input images.
+
+.. [#] Yang, Y., Li, G., Wu, Z., Su, L., Huang, Q., & Sebe, N. (2020).
+    Reverse perspective network for perspective-aware object counting.
+    In Proceedings of the IEEE/CVF Conference on Computer Vision and
+    Pattern Recognition (pp. 4374-4383).
+"""
+
+from typing import Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class PerspectiveEstimator(nn.Module):
+    """
+    Perspective estimator submodule of the wider reverse-perspective network.
+
+    Input: Pre-processed, uniformly-sized image data
+    Output: Perspective factor
+
+    :param input_shape: (N, C, H, W)
+    :param conv_kernel_shape: Oriented as (H, W)
+    :param conv_dilation: dilation factor, applied equally along H and W
+    :param pool_capacity: number of bins K that each (H, W) map is pooled into
+    :param conv_padding: must be chosen such that the dilated convolution
+        preserves (H, W), e.g. padding 2 for a 3x3 kernel at dilation 2
+    :param epsilon: small positive offset keeping the output factor bounded
+        away from zero. Hyperparameter.
+    """
+    def __init__(
+        self,
+        input_shape: Tuple[int, int, int, int],
+        conv_kernel_shape: Tuple[int, int],
+        conv_dilation: int,  # equal dilation along H and W for now
+        pool_capacity: int,
+        conv_padding: int = 0,
+        conv_padding_mode: str = 'zeros',
+        conv_stride: int = 1,
+        epsilon: float = 1e-5,
+        *args, **kwargs
+    ) -> None:
+        super().__init__(*args, **kwargs)
+
+        # N.B. input_shape has size (N, C_in, H_in, W_in)
+        (_, _, height, width) = input_shape
+
+        # Sanity check: the dilated convolution must preserve (H, W).
+        (_conv_height, _conv_width) = (
+            np.floor(
+                (height + 2 * conv_padding - conv_dilation * (conv_kernel_shape[0] - 1) - 1)
+                / conv_stride
+                + 1
+            ),
+            np.floor(
+                (width + 2 * conv_padding - conv_dilation * (conv_kernel_shape[1] - 1) - 1)
+                / conv_stride
+                + 1
+            )
+        )
+        assert height == _conv_height and width == _conv_width, \
+            "conv parameters do not preserve the (H, W) of the input"
+
+        self.epsilon = epsilon
+        self.input_shape = input_shape
+        self.layer_dict = nn.ModuleDict({
+            'dilated_conv': nn.Conv2d(
+                in_channels=self.input_shape[1], out_channels=1,
+                kernel_size=conv_kernel_shape,
+                padding=conv_padding,
+                padding_mode=conv_padding_mode,
+                stride=conv_stride,
+                dilation=conv_dilation,
+            ),  # (N, 1, H, W)
+            'avg_pooling': nn.AdaptiveAvgPool2d(
+                output_size=(pool_capacity, 1)
+            ),  # (N, 1, K, 1)
+            # nn.Linear consumes the last dimension, so flatten (N, 1, K, 1)
+            # down to (N, K) before the fully-connected layer.
+            'flatten': nn.Flatten(start_dim=1),  # (N, K)
+            'fc': nn.Linear(
+                in_features=pool_capacity,
+                out_features=1,
+            ),  # (N, 1)
+        })
+
+    def forward(self, x):
+        out = x
+
+        # Forward through layers -- there are no activations etc. in-between
+        for layer in self.layer_dict.values():
+            out = layer(out)
+
+        # Squash to (0, 1] via exp(-ReLU(x)), then add epsilon so the factor
+        # stays strictly positive, i.e. lies in (epsilon, 1 + epsilon].
+        F.relu_(out)  # in-place
+        out = torch.exp(-out) + self.epsilon
+
+        return out
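
For reviewers, a minimal usage sketch of the module added above (the batch
shape and hyperparameters here are illustrative assumptions, not values fixed
by this patch). At stride 1 the sanity check requires
2 * conv_padding == conv_dilation * (kernel_size - 1), e.g. padding 2 for a
3x3 kernel at dilation 2:

    import torch

    from model.reverse_perspective import PerspectiveEstimator

    # Illustrative input: a batch of 8 RGB images at 64x64.
    estimator = PerspectiveEstimator(
        input_shape=(8, 3, 64, 64),
        conv_kernel_shape=(3, 3),
        conv_dilation=2,
        pool_capacity=16,
        conv_padding=2,  # 2 * 2 == 2 * (3 - 1), so (H, W) is preserved
    )

    x = torch.randn(8, 3, 64, 64)
    factor = estimator(x)
    print(factor.shape)              # torch.Size([8, 1]): one factor per image
    print(bool((factor > 0).all()))  # True: the factor is strictly positive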
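
On the final squashing step: it maps a raw score s to exp(-max(s, 0)) +
epsilon, i.e. into (epsilon, 1 + epsilon]. Non-positive scores saturate at
1 + epsilon, while increasingly positive scores push the perspective factor
toward epsilon. A standalone check of that mapping, independent of the module
(the concrete score values are illustrative):

    import torch

    eps = 1e-5
    scores = torch.tensor([-3.0, 0.0, 1.0, 10.0])
    factors = torch.exp(-torch.relu(scores)) + eps
    print(factors)  # ~[1.0000, 1.0000, 0.3679, 0.0000] (each + eps),
                    # non-increasing once the score turns positive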