From fc941ebaf76759f56fd4486479f2d82436c1660a Mon Sep 17 00:00:00 2001
From: Zhengyi Chen
Date: Sun, 3 Mar 2024 21:59:58 +0000
Subject: [PATCH] Yay, works in DP via CPU

---
 .gitignore              |  2 ++
 _ShanghaiA-train.sh     |  3 ++-
 arguments.py            |  2 +-
 model/glue.py           |  5 ++++-
 model/transcrowd_gap.py |  1 -
 train.py                | 14 +++++++++-----
 6 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index 25a3c78..0a910aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@ synchronous/
 npydata/
 **/__pycache__/**
 slurm-*
+save/
+.vscode/
diff --git a/_ShanghaiA-train.sh b/_ShanghaiA-train.sh
index eb29e4e..24f5e21 100644
--- a/_ShanghaiA-train.sh
+++ b/_ShanghaiA-train.sh
@@ -22,7 +22,8 @@ export TMP=/disk/scratch/${STUDENT_ID}/
 
 source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
 python train.py \
+    --debug=True \
     --model='stn' \
     --save_path ./save_file/ShanghaiA \
     --batch_size 4 \
-    --gpus 0,1,2,3,4,5 \
\ No newline at end of file
+    # --gpus 0,1,2,3,4,5 \
\ No newline at end of file
diff --git a/arguments.py b/arguments.py
index ac1bebf..9947483 100644
--- a/arguments.py
+++ b/arguments.py
@@ -76,7 +76,7 @@ parser.add_argument(
     "--ddp_world_size", type=int, default=1,
     help="DDP: Number of processes in Pytorch process group"
 )
-parse.add_argument(
+parser.add_argument(
     "--debug", type=bool, default=False
 )
 
diff --git a/model/glue.py b/model/glue.py
index 7f0b8ba..de01198 100644
--- a/model/glue.py
+++ b/model/glue.py
@@ -57,7 +57,10 @@ class SquareCropTransformLayer(nn.Module):
         )
 
         # Sum into gt_count
-        ret_gt_count = torch.sum(split_t.view(split_t.size(0), -1), dim=1)
+        ret_gt_count = (torch
+            .sum(split_t.view(split_t.size(0), -1), dim=1)
+            .unsqueeze(1)
+        )
 
         return ret_x, ret_gt_count
 
diff --git a/model/transcrowd_gap.py b/model/transcrowd_gap.py
index 47e203c..c72555d 100644
--- a/model/transcrowd_gap.py
+++ b/model/transcrowd_gap.py
@@ -85,7 +85,6 @@ class VisionTransformerGAP(VisionTransformer):
     def forward(self, x, t):
         with torch.no_grad():
             x, t = self.glue(x, t)
-            print(f"Glue: {x.shape} | {t.shape}")
         x = self.forward_features(x)  # Compute encoding
         x = F.adaptive_avg_pool1d(x, (48))
         x = x.view(x.shape[0], -1)  # Move data for regression head
diff --git a/train.py b/train.py
index ca337c0..e04b5f2 100644
--- a/train.py
+++ b/train.py
@@ -22,7 +22,6 @@ from checkpoint import save_checkpoint
 
 logger = logging.getLogger("train")
 
-
 def setup_process_group(
     rank: int,
     world_size: int,
@@ -242,7 +241,6 @@ def train_one_epoch(
     # In one epoch, for each training sample
     for i, (fname, img, kpoint) in enumerate(train_loader):
         kpoint = kpoint.type(torch.FloatTensor)
-        print("Training: img {} | kpoint {}".format(img.shape, kpoint.shape))
         # fpass
         if device is not None:
             img = img.to(device)
@@ -251,7 +249,6 @@ def train_one_epoch(
             img = img.cuda()
             kpoint = kpoint.cuda()
         out, gt_count = model(img, kpoint)
-        # gt_count = gt_count.type(torch.FloatTensor).to(device).unsqueeze(1)
 
         # loss
         loss = criterion(out, gt_count)
@@ -269,6 +266,9 @@ def train_one_epoch(
         if i % args.print_freq == 0:
             print("Epoch {}: {}/{}".format(epoch, i, len(train_loader)))
 
+        if args.debug:
+            break
+
     scheduler.step()
 
 
@@ -283,6 +283,7 @@ def valid_one_epoch(test_loader, model, device, args):
     index = 0
 
     for i, (fname, img, kpoint) in enumerate(test_loader):
+        kpoint = kpoint.type(torch.FloatTensor)
         if device is not None:
             img = img.to(device)
             kpoint = kpoint.to(device)
@@ -301,8 +302,8 @@ def valid_one_epoch(test_loader, model, device, args):
         count = torch.sum(out).item()
         gt_count = torch.sum(gt_count).item()
 
-        mae += abs(kpoint - count)
-        mse += abs(kpoint - count) ** 2
+        mae += abs(gt_count - count)
+        mse += abs(gt_count - count) ** 2
 
         if i % 15 == 0:
             print("[valid_one_epoch] {} Gt {:.2f} Pred {}".format(
@@ -324,6 +325,9 @@ if __name__ == "__main__":
     # tuner_params = nni.get_next_parameter()
     # logger.debug("Generated hyperparameters: {}", tuner_params)
     # combined_params = nni.utils.merge_parameter(ret_args, tuner_params)
 
+    if args.debug:
+        os.nice(15)
+
     combined_params = args
     logger.debug("Parameters: {}", combined_params)