TEST: use autocast for mixed-precision training

2024-03-04 18:36:40 +00:00 · 2024-03-04 18:36:40 +00:00 · 2d31162c58
commit 2d31162c58
parent 94867bd8bf
1 changed files with 22 additions and 21 deletions
--- a/train.py
+++ b/train.py
@@ -243,17 +243,21 @@ def train_one_epoch(
        kpoint = kpoint.type(torch.FloatTensor)
        gt_count_whole = gt_count_whole.type(torch.FloatTensor).unsqueeze(1)
        batch_size = img.size(0)
-        # fpass
+        # send to device
        if device is not None:
            img = img.to(device)
            kpoint = kpoint.to(device)
            gt_count_whole = gt_count_whole.to(device)
            device_type = device.type
        elif torch.cuda.is_available():
            img = img.cuda()
            kpoint = kpoint.cuda()
            gt_count_whole = gt_count_whole.cuda()
-        out, gt_count = model(img, kpoint)
+            device_type = "cuda"
        with torch.autocast(device_type):
            # fpass
            out, gt_count = model(img, kpoint)
            # loss
            loss = criterion(out, gt_count) # wrt. transformer
            loss += (
@@ -268,7 +272,7 @@ def train_one_epoch(
            )
        # free grad from mem
-        optimizer.zero_grad()
+        optimizer.zero_grad(set_to_none=True)
        # bpass
        loss.backward()
@@ -276,10 +280,6 @@ def train_one_epoch(
        # optimizer
        optimizer.step()
        # periodic message
        # if i % args.print_freq == 0:
        #     print("Epoch {}: {}/{}".format(epoch, i, len(train_loader)))
        if args.debug:
            break
@@ -324,9 +324,10 @@ def valid_one_epoch(test_loader, model, device, args):
        mse += diff ** 2
        if i % 5 == 0:
-            print("[valid_one_epoch] {} | Gt {:.2f} Pred {:.4f} | mae {:.4f} mse {:.4f} |".format(
+            print("[valid_one_epoch] {} | Gt {:.2f} Pred {:.4f} |".format(
-                fname[0], torch.sum(gt_count_whole).item(), torch.sum(pred_count).item(),
+                fname[0],
-                mae, mse
+                torch.sum(gt_count_whole).item(),
                torch.sum(pred_count).item()
            ))
    mae = mae * 1.0 / (len(test_loader) * batch_size)