Loss revamp & Renamed model to network
parent 0d35d607fe
commit 9d2a30a226
7 changed files with 44 additions and 32 deletions
@@ -1,162 +0,0 @@
r"""Reverse Perspective Network Architectural Layers.
|
||||
|
||||
The *Reverse Perspective Network* [#]_ is a general approach to input
|
||||
pre-processing for instance segmentation / density map generation tasks.
|
||||
Roughly speaking, it models the input image into a elliptic coordinate system
|
||||
and tries to learn a foci length modifier parameter to perform perspective
|
||||
transformation on input images.
|
||||
|
||||
.. [#] Yang, Y., Li, G., Wu, Z., Su, L., Huang, Q., & Sebe, N. (2020).
|
||||
Reverse perspective network for perspective-aware object counting.
|
||||
In Proceedings of the IEEE/CVF conference on computer vision and pattern
|
||||
recognition (pp. 4374-4383).
|
||||
"""
|
||||
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
class PerspectiveEstimator(nn.Module):
    """
    Perspective estimator submodule of the wider reverse-perspective network.

    Input: Pre-processed, uniformly-sized image data
    Output: Perspective factor :math:`\\in \\mathbb{R}`

    Note
    ----
    The loss input needs to be computed from the output of the **entire**
    reverse-perspective network. It therefore needs to compute:

    - The effective pixels of each row after transformation.
    - The feature density (count) along each row, summed over columns.

    The loss is computed as the variance over row feature densities (see
    Sec. 3.2 of the paper). After all, it is reasonable to say that you see
    more when you look at faraway places.

    The paper utilizes an unsupervised loss -- "row feature density" refers to
    the density of features computed from a pre-trained counting network's
    output (CSRNet in the paper; see the notes above ``warped_output_loss``).

    :param input_shape: (N, C, H, W)
    :param conv_kernel_shape: Oriented as (H, W)
    :param conv_dilation: equidistant dilation factor along H, W
    :param pool_capacity: number of bins K that each (H, W) map is adaptively
        pooled into
    :param conv_padding: padding of the dilated convolution (must preserve H, W)
    :param conv_padding_mode: padding mode of the dilated convolution
    :param conv_stride: stride of the dilated convolution
    :param epsilon: small constant keeping the output factor strictly positive
    """
    def __init__(
        self,
        input_shape: Tuple[int, int, int, int],
        conv_kernel_shape: Tuple[int, int],
        conv_dilation: int,  # We will do equidistant dilation along H, W for now
        pool_capacity: int,
        conv_padding: int = 0,
        conv_padding_mode: str = 'zeros',
        conv_stride: int = 1,
        epsilon: float = 1e-5,
        *args, **kwargs
    ) -> None:
        # N.B. input_shape has size (N, C_in, H_in, W_in)
        (_, _, height, width) = input_shape

        # Sanity check: the dilated convolution below must preserve H and W.
        # [TODO] Maybe this is unnecessary -- we could automatically suggest
        # compatible parameters -- but for now let's just assert.
        (_conv_height, _conv_width) = (
            np.floor(
                (height + 2 * conv_padding - conv_dilation * (conv_kernel_shape[0] - 1) - 1)
                / conv_stride
                + 1
            ),
            np.floor(
                (width + 2 * conv_padding - conv_dilation * (conv_kernel_shape[1] - 1) - 1)
                / conv_stride
                + 1
            )
        )
        assert height == _conv_height and width == _conv_width, \
            'convolution parameters must preserve the input spatial size'
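        # Worked example: with conv_kernel_shape=(3, 3), conv_dilation=1,
        # conv_padding=1, conv_stride=1, the standard Conv2d shape formula above
        # gives floor((H + 2*1 - 1*(3 - 1) - 1) / 1 + 1) = H, so the spatial
        # size is preserved and the assert passes.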

        super().__init__(*args, **kwargs)
        self.epsilon = epsilon
        self.input_shape = input_shape
        self.layer_dict = nn.ModuleDict({
            'revpers_dilated_conv0': nn.Conv2d(
                in_channels=self.input_shape[1], out_channels=1,
                kernel_size=conv_kernel_shape,
                padding=conv_padding,
                padding_mode=conv_padding_mode,
                stride=conv_stride,
                dilation=conv_dilation,
            ),  # (N, 1, H, W)
            'revpers_avg_pool0': nn.AdaptiveAvgPool2d(
                output_size=(pool_capacity, 1)
            ),  # (N, 1, K, 1)
            # The (N, 1, K, 1) pool output is flattened to (N, K) in forward()
            # before this fully-connected layer.
            'revpers_fc0': nn.Linear(
                in_features=pool_capacity,
                out_features=1,
            ),
        })

    def forward(self, x):
        # Forward through the layers -- there are no activations etc.
        # in-between, only a flatten so the pooled map fits the
        # fully-connected layer.
        out = self.layer_dict['revpers_dilated_conv0'](x)  # (N, 1, H, W)
        out = self.layer_dict['revpers_avg_pool0'](out)    # (N, 1, K, 1)
        out = out.flatten(start_dim=1)                     # (N, K)
        out = self.layer_dict['revpers_fc0'](out)          # (N, 1)

        # Normalize into (0, 1]: relu clamps to [0, inf), exp(-x) maps that
        # onto (0, 1], and epsilon keeps the factor strictly positive.
        out = F.relu(out, inplace=True)
        out = torch.exp(-out) + self.epsilon

        return out
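

# Minimal usage sketch: parameters are chosen so the shape assert in __init__
# holds, i.e. 2 * conv_padding == conv_dilation * (kernel - 1) with stride 1:
#
#     estimator = PerspectiveEstimator(
#         input_shape=(8, 3, 224, 224),
#         conv_kernel_shape=(3, 3),
#         conv_dilation=1,
#         pool_capacity=16,
#         conv_padding=1,
#     )
#     factors = estimator(torch.rand(8, 3, 224, 224))  # (8, 1), in (epsilon, 1 + epsilon]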


# def unsupervised_loss(predictions, targets):

# [TODO] We need a modified loss -- one that takes advantage of attention
# instead of the feature map. They should arguably behave alike, but who knows.
# [XXX] No, forget it: we pre-train the reverse-perspective module as described
# in the 2020 paper, i.e., using CSRNet.
# Not sure which part the feature map is derived from. Maybe after the front-end?
# In any case we can always just use the CSRNet output (inferred density map)
# as the feature map, from which we compute, for each image:
#   criterion = Variance([output.sum(axis=W) * effective_pixel_per_row])
# (sketched below). In other cases one sums over channels, i.e., over each
# feature map / filter output. Not sure what "channel" means in this case...
def warped_output_loss(csrnet_pred):
    N, H, W = csrnet_pred.shape
    row_density = csrnet_pred.sum(dim=2)  # (N, H): feature density of each row
    return row_density.var(dim=1).mean()  # variance over rows, averaged over the batch
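

# A fuller sketch of the unsupervised loss described above, weighting each
# row's density by its effective pixel count after the warp, per the
# criterion noted earlier. The argument name `effective_pixels_per_row` is
# hypothetical -- it would have to be derived from the perspective
# transformation itself.
def warped_output_loss_weighted(
    csrnet_pred: torch.Tensor,               # (N, H, W) inferred density maps
    effective_pixels_per_row: torch.Tensor,  # (N, H) or (H,) pixels surviving the warp
) -> torch.Tensor:
    row_density = csrnet_pred.sum(dim=2) * effective_pixels_per_row  # (N, H)
    return row_density.var(dim=1).mean()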


def transform_coordinates(
    img: torch.Tensor,  # (C, W, H)
    factor: float,
    in_place: bool = True  # [TODO] currently ignored; the ops below always mutate img
):
    dev_of_img = img.device

    # Normalize X coords to [0, pi].
    # The per-channel bounds are reshaped to (3, 1, 1) so they broadcast over
    # the channel dimension of the (C, W, H) image (assumes C == 3).
    min_x = torch.tensor([0., 0., 0.], device=dev_of_img).view(-1, 1, 1)
    max_x = torch.tensor([0., np.pi, 0.], device=dev_of_img).view(-1, 1, 1)
    min_xdim = torch.min(img, dim=1, keepdim=True)[0]
    max_xdim = torch.max(img, dim=1, keepdim=True)[0]
    (img.sub_(min_xdim)
        .div_(max_xdim - min_xdim)
        .mul_(max_x - min_x)
        .add_(min_x))

    # Normalize Y coords to [0, 1].
    # [?] As written this rescales the same (second) channel as the X pass;
    # [0., 0., 1.] may have been intended.
    min_y = torch.tensor([0., 0., 0.], device=dev_of_img).view(-1, 1, 1)
    max_y = torch.tensor([0., 1., 0.], device=dev_of_img).view(-1, 1, 1)
    min_ydim = torch.min(img, dim=2, keepdim=True)[0]
    max_ydim = torch.max(img, dim=2, keepdim=True)[0]
    (img.sub_(min_ydim)
        .div_(max_ydim - min_ydim)
        .mul_(max_y - min_y)
        .add_(min_y))

    # Do elliptical transformation
    # [TODO] not implemented yet -- see the sketch below for one possible version.
    tmp = img.clone().detach()
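

# Sketch of the missing elliptical step, using standard elliptic coordinates
# x = c * cosh(mu) * cos(nu), y = c * sinh(mu) * sin(nu) with foci at (+-c, 0).
# Illustration only: the exact warp of the RPN paper may differ. It assumes the
# second channel holds the angular coordinate nu (normalized to [0, pi] above)
# and the third the radial coordinate mu (normalized to [0, 1]); `factor` plays
# the role of the learned foci-length modifier scaling c. The helper name
# `elliptic_warp` is hypothetical.
def elliptic_warp(coords: torch.Tensor, factor: float, c: float = 1.0) -> torch.Tensor:
    nu = coords[1]  # angular coordinate in [0, pi]
    mu = coords[2]  # radial coordinate in [0, 1]
    x = factor * c * torch.cosh(mu) * torch.cos(nu)
    y = factor * c * torch.sinh(mu) * torch.sin(nu)
    return torch.stack((coords[0], x, y), dim=0)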