From fc941ebaf76759f56fd4486479f2d82436c1660a Mon Sep 17 00:00:00 2001
From: Zhengyi Chen
Date: Sun, 3 Mar 2024 21:59:58 +0000
Subject: [PATCH] Yay, works in DP via CPU

---
 .gitignore              |  2 ++
 _ShanghaiA-train.sh     |  3 ++-
 arguments.py            |  2 +-
 model/glue.py           |  5 ++++-
 model/transcrowd_gap.py |  1 -
 train.py                | 14 +++++++++-----
 6 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index 25a3c78..0a910aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,5 @@ synchronous/
 npydata/
 **/__pycache__/**
 slurm-*
+save/
+.vscode/
diff --git a/_ShanghaiA-train.sh b/_ShanghaiA-train.sh
index eb29e4e..24f5e21 100644
--- a/_ShanghaiA-train.sh
+++ b/_ShanghaiA-train.sh
@@ -22,7 +22,8 @@ export TMP=/disk/scratch/${STUDENT_ID}/
 
 source /home/${STUDENT_ID}/miniconda3/bin/activate mlp-cuda
 python train.py \
+    --debug=True \
     --model='stn' \
     --save_path ./save_file/ShanghaiA \
     --batch_size 4 \
-    --gpus 0,1,2,3,4,5 \
\ No newline at end of file
+    # --gpus 0,1,2,3,4,5 \
\ No newline at end of file
diff --git a/arguments.py b/arguments.py
index ac1bebf..9947483 100644
--- a/arguments.py
+++ b/arguments.py
@@ -76,7 +76,7 @@ parser.add_argument(
     "--ddp_world_size", type=int, default=1,
     help="DDP: Number of processes in Pytorch process group"
 )
-parse.add_argument(
+parser.add_argument(
     "--debug", type=bool, default=False
 )
 
diff --git a/model/glue.py b/model/glue.py
index 7f0b8ba..de01198 100644
--- a/model/glue.py
+++ b/model/glue.py
@@ -57,7 +57,10 @@ class SquareCropTransformLayer(nn.Module):
         )
 
         # Sum into gt_count
-        ret_gt_count = torch.sum(split_t.view(split_t.size(0), -1), dim=1)
+        ret_gt_count = (torch
+            .sum(split_t.view(split_t.size(0), -1), dim=1)
+            .unsqueeze(1)
+        )
 
         return ret_x, ret_gt_count
 
diff --git a/model/transcrowd_gap.py b/model/transcrowd_gap.py
index 47e203c..c72555d 100644
--- a/model/transcrowd_gap.py
+++ b/model/transcrowd_gap.py
@@ -85,7 +85,6 @@ class VisionTransformerGAP(VisionTransformer):
     def forward(self, x, t):
         with torch.no_grad():
             x, t = self.glue(x, t)
-            print(f"Glue: {x.shape} | {t.shape}")
         x = self.forward_features(x)  # Compute encoding
         x = F.adaptive_avg_pool1d(x, (48))
         x = x.view(x.shape[0], -1)  # Move data for regression head
diff --git a/train.py b/train.py
index ca337c0..e04b5f2 100644
--- a/train.py
+++ b/train.py
@@ -22,7 +22,6 @@ from checkpoint import save_checkpoint
 
 logger = logging.getLogger("train")
 
-
 def setup_process_group(
     rank: int,
     world_size: int,
@@ -242,7 +241,6 @@ def train_one_epoch(
     # In one epoch, for each training sample
     for i, (fname, img, kpoint) in enumerate(train_loader):
         kpoint = kpoint.type(torch.FloatTensor)
-        print("Training: img {} | kpoint {}".format(img.shape, kpoint.shape))
         # fpass
         if device is not None:
             img = img.to(device)
@@ -251,7 +249,6 @@ def train_one_epoch(
             img = img.cuda()
             kpoint = kpoint.cuda()
         out, gt_count = model(img, kpoint)
-        # gt_count = gt_count.type(torch.FloatTensor).to(device).unsqueeze(1)
 
         # loss
         loss = criterion(out, gt_count)
@@ -269,6 +266,9 @@ def train_one_epoch(
         if i % args.print_freq == 0:
             print("Epoch {}: {}/{}".format(epoch, i, len(train_loader)))
 
+        if args.debug:
+            break
+
     scheduler.step()
 
 
@@ -283,6 +283,7 @@ def valid_one_epoch(test_loader, model, device, args):
     index = 0
 
     for i, (fname, img, kpoint) in enumerate(test_loader):
+        kpoint = kpoint.type(torch.FloatTensor)
         if device is not None:
             img = img.to(device)
             kpoint = kpoint.to(device)
@@ -301,8 +302,8 @@ def valid_one_epoch(test_loader, model, device, args):
         count = torch.sum(out).item()
         gt_count = torch.sum(gt_count).item()
 
-        mae += abs(kpoint - count)
-        mse += abs(kpoint - count) ** 2
+        mae += abs(gt_count - count)
+        mse += abs(gt_count - count) ** 2
 
         if i % 15 == 0:
             print("[valid_one_epoch] {} Gt {:.2f} Pred {}".format(
@@ -324,6 +325,9 @@ if __name__ == "__main__":
     # tuner_params = nni.get_next_parameter()
     # logger.debug("Generated hyperparameters: {}", tuner_params)
     # combined_params = nni.utils.merge_parameter(ret_args, tuner_params)
 
+    if args.debug:
+        os.nice(15)
+
     combined_params = args
     logger.debug("Parameters: {}", combined_params)