"""Prepare the ShanghaiTech crowd-counting dataset.

Resizes every image to 1152x768 (landscape) or 768x1152 (portrait),
rescales the head annotations to match, stores the ground-truth count in
an .h5 file next to each resized image, and finally writes sorted .npy
lists of the processed image paths for the train/test splits.

The raw ShanghaiTech dataset (part_A_final / part_B_final) must be
downloaded manually into <cwd>/synchronous/dataset/ShanghaiTech/ first.
"""
import os
import glob
import random

import numpy as np
import cv2
import scipy.io as io
import h5py

CWD = os.getcwd()
# Fix: the original referenced an undefined name `dataset_name` (NameError).
# The directory layout used by make_npydata() shows the intended value.
DATASET_NAME = "ShanghaiTech"


def pre_dataset_sh():
    """Resize all ShanghaiTech images and save per-image ground-truth counts.

    For every .jpg under part_{A,B}_final/{train,test}_data/images:
      * resize to 1152x768 (landscape) or 768x1152 (portrait),
      * rescale the annotated head coordinates by the same factors,
      * write the resized image to the sibling images_crop directory,
      * write the head count to gt_density_map_crop/<name>.h5 under key
        "gt_count".
    """
    root = CWD + "/synchronous/dataset/" + DATASET_NAME + "/"

    part_dirs = [
        os.path.join(root, "part_A_final/train_data", "images"),
        os.path.join(root, "part_A_final/test_data", "images"),
        os.path.join(root, "part_B_final/train_data", "images"),
        os.path.join(root, "part_B_final/test_data", "images"),
    ]

    # Create output directories (exist_ok avoids the check-then-create race).
    for base_path in part_dirs:
        for replacement in ("images_crop", "gt_density_map_crop"):
            os.makedirs(base_path.replace("images", replacement), exist_ok=True)

    # Gather all jpg paths in part A & B, train & test.
    img_paths = []
    for path in part_dirs:
        img_paths.extend(glob.glob(os.path.join(path, "*.jpg")))

    # np.random.seed(0)
    # random.seed(0)
    for img_path in img_paths:
        img_data = cv2.imread(img_path)
        # Annotation .mat lives in ground_truth/ with a GT_IMG_ prefix.
        mat = io.loadmat(
            img_path
            .replace(".jpg", ".mat")
            .replace("images", "ground_truth")
            .replace("IMG_", "GT_IMG_")
        )
        # Head annotations: one (x, y) row per annotated person.
        gt_data = mat["image_info"][0][0][0][0][0]

        # Scale factors for the fixed 1152x768 / 768x1152 target size.
        if img_data.shape[1] >= img_data.shape[0]:  # landscape
            rate_x = 1152.0 / img_data.shape[1]
            rate_y = 768.0 / img_data.shape[0]
        else:  # portrait
            rate_x = 768.0 / img_data.shape[1]
            rate_y = 1152.0 / img_data.shape[0]

        img_data = cv2.resize(img_data, (0, 0), fx=rate_x, fy=rate_y)
        gt_data[:, 0] = gt_data[:, 0] * rate_x
        gt_data[:, 1] = gt_data[:, 1] * rate_y

        # Binary annotation map: 1 at each rescaled head location that still
        # falls inside the resized image. Its sum is the ground-truth count
        # (coincident heads collapse to a single pixel, as in the original).
        kpoint = np.zeros(img_data.shape[:2])
        for point in gt_data:
            col, row = int(point[0]), int(point[1])
            if row < img_data.shape[0] and col < img_data.shape[1]:
                kpoint[row, col] = 1

        # No patch-cropping here; save fixed-size image & gt_count directly.
        out_img_path = img_path.replace("images", "images_crop")
        cv2.imwrite(out_img_path, img_data)
        gt_count = np.sum(kpoint)
        h5_path = (
            out_img_path
            .replace(".jpg", ".h5")
            .replace("images_crop", "gt_density_map_crop")
        )
        with h5py.File(h5_path, "w") as hf:
            hf["gt_count"] = gt_count


def make_npydata():
    """Write sorted .npy lists of the cropped image paths.

    Produces ./npydata/Shanghai{A,B}_{train,test}.npy, each containing the
    sorted absolute paths of the .jpg files in the matching images_crop
    directory. A missing/incorrect dataset directory is reported, not fatal.
    """
    os.makedirs("./npydata", exist_ok=True)

    # Part A and part B differ only in directory name and label.
    for part, label in (("part_A_final", "ShanghaiA"), ("part_B_final", "ShanghaiB")):
        try:
            counts = {}
            for split in ("train", "test"):
                dir_path = (
                    CWD + "/synchronous/dataset/" + DATASET_NAME + "/"
                    + part + "/" + split + "_data/images_crop/"
                )
                # endswith() is robust to names with no dot or several dots,
                # unlike the original split(".")[1] check.
                fpaths = sorted(
                    dir_path + fname
                    for fname in os.listdir(dir_path)
                    if fname.endswith(".jpg")
                )
                np.save("./npydata/{}_{}.npy".format(label, split), fpaths)
                counts[split] = len(fpaths)

            print("Saved {} image list (test: {} | train: {})".format(
                label, counts["test"], counts["train"]
            ))
        except OSError:
            # listdir on a missing directory; keep going with the other part.
            print("The {} dataset path is wrong.".format(label))


if __name__ == "__main__":
    # The raw dataset must be downloaded manually beforehand.
    pre_dataset_sh()  # XXX: preliminary
    make_npydata()