"""
|
|
The TransCrowd paper lists ShanghaiTech dataset as from here:
|
|
|
|
https://drive.google.com/file/d/1CkYppr_IqR1s6wi53l2gKoGqm7LkJ-Lc/view
|
|
|
|
Alternatively, you could prob. download from here:
|
|
|
|
https://www.kaggle.com/datasets/tthien/shanghaitech?resource=download
|
|
|
|
It seems the directories are all wrong, though.
|
|
|
|
After downloading, execute:
|
|
|
|
$ unzip <downloaded-zip-file> -d <repo-dir>/synchronous/dataset/
|
|
|
|
To unzip the dataset correctly prior to running this script.
|
|
"""
|
|
|
|
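# Expected layout after unzipping (inferred from the paths built in
# pre_dataset_sh() below; the actual archive contents may differ slightly):
#
#   <repo-dir>/synchronous/dataset/ShanghaiTech/
#       part_A_final/{train_data,test_data}/{images,ground_truth}
#       part_B_final/{train_data,test_data}/{images,ground_truth}
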
import os
import glob
import random

import numpy as np
import cv2
import scipy.io as io
import scipy.sparse as sparse
import h5py
from tqdm.auto import tqdm

CWD = os.getcwd()


def pre_dataset_sh():
    dataset_name = "ShanghaiTech"
    root = CWD + "/synchronous/dataset/" + dataset_name + "/"

    part_A_train = os.path.join(root, "part_A_final/train_data", "images")
    part_A_test = os.path.join(root, "part_A_final/test_data", "images")
    part_B_train = os.path.join(root, "part_B_final/train_data", "images")
    part_B_test = os.path.join(root, "part_B_final/test_data", "images")

    # Create output directories for the resized (1152x768) images and
    # ground-truth maps
    for base_path in part_A_train, part_A_test, part_B_train, part_B_test:
        for replacement in "images_crop", "gt_density_map_crop":
            if not os.path.exists(base_path.replace("images", replacement)):
                os.makedirs(base_path.replace("images", replacement))

    # Gather all jpg paths in part A & B, train & test
    img_paths = []
    for path in part_A_train, part_A_test, part_B_train, part_B_test:
        for img_path in glob.glob(os.path.join(path, "*.jpg")):
            img_paths.append(img_path)

    # np.random.seed(0)
    # random.seed(0)
    for img_path in tqdm(img_paths, desc="Preprocessing Data"):
        img_data = cv2.imread(img_path)
        mat = io.loadmat(
            img_path
            .replace(".jpg", ".mat")
            .replace("images", "ground_truth")
            .replace("IMG_", "GT_IMG_")
        )
        # (N, 2) array of annotated head positions as (x, y) pixel coordinates
        gt_data = mat["image_info"][0][0][0][0][0]

        # Resize to 1152x768 (landscape) or 768x1152 (portrait)
        is_portrait = False
        if img_data.shape[1] >= img_data.shape[0]:  # landscape
            rate_x = 1152.0 / img_data.shape[1]
            rate_y = 768.0 / img_data.shape[0]
        else:  # portrait
            rate_x = 768.0 / img_data.shape[1]
            rate_y = 1152.0 / img_data.shape[0]
            is_portrait = True

        img_data = cv2.resize(img_data, (0, 0), fx=rate_x, fy=rate_y)
        gt_data[:, 0] = gt_data[:, 0] * rate_x
        gt_data[:, 1] = gt_data[:, 1] * rate_y

        if is_portrait:
            print("Portrait img: '{}' -- transposing...".format(img_path))
            img_data = cv2.transpose(img_data)
            gt_data = gt_data[:, ::-1]  # swap (x, y) to match the transpose

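        # For example (sketch): a portrait image with shape (1024, 768, 3),
        # i.e. 768 wide x 1024 tall, gets rate_x = 768/768 = 1.0 and
        # rate_y = 1152/1024 = 1.125, is resized to 768x1152, and is then
        # transposed back to the common 1152x768 landscape target, with the
        # annotation columns swapped accordingly.
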
        # Build a per-pixel head-count map ("kpoint") from the annotations;
        # values are almost always 0 or 1, but can exceed 1 where several
        # heads round to the same pixel.
        assert img_data.shape[:2] == (768, 1152)
        coordinates = gt_data.round().astype(int)  # to integer pixel coords
        coordinates[:, 0] = np.clip(coordinates[:, 0], a_min=0, a_max=1151)
        coordinates[:, 1] = np.clip(coordinates[:, 1], a_min=0, a_max=767)
        assert max(coordinates[:, 0]) < 1152
        assert max(coordinates[:, 1]) < 768
        sparse_mat = sparse.coo_matrix(
            (
                np.ones(coordinates.shape[0]),           # data
                (coordinates[:, 1], coordinates[:, 0]),  # (i, j)
            ),  # N.B. for all k: ret[i[k], j[k]] = data[k]
            shape=(768, 1152),  # same shape as the image, so (i, j) is
                                # (row, col), flipped wrt. the (x, y) coords
            dtype=int,
        )
        kpoint = sparse_mat.toarray()

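        # SciPy's COO construction keeps duplicate (i, j) entries separately
        # and sums them when converting to a dense array.  Sketch: if two
        # heads round to (x=5, y=3) and one to (x=7, y=4), then
        # kpoint[3, 5] == 2, kpoint[4, 7] == 1, and sparse_mat.nnz == 3.
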
        # Total head count as ground truth (we need to train STN, remember?)
        gt_count = sparse_mat.nnz

        fname = img_path.split("/")[-1]
        root_path = img_path.split("IMG_")[0].replace("images", "images_crop")

        # Likewise, we do not crop into patched sequences here; skip directly
        # to saving the fixed-size image and its kpoint map.
        img_path = img_path.replace("images", "images_crop")
        cv2.imwrite(img_path, img_data)
        with h5py.File(
            img_path.replace(".jpg", ".h5").replace("images", "gt_density_map"),
            mode="w",
        ) as hf:
            hf["kpoint"] = kpoint
            hf["gt_count"] = gt_count

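
# A downstream data loader would typically read the saved files back roughly
# like this (a sketch; `h5_path` is hypothetical, but the key names match
# what pre_dataset_sh() writes above):
#
#     with h5py.File(h5_path, "r") as hf:
#         kpoint = hf["kpoint"][()]           # (768, 1152) per-pixel head counts
#         gt_count = int(hf["gt_count"][()])  # total number of annotated heads
#
# where h5_path is the image path with "images_crop" replaced by
# "gt_density_map_crop" and ".jpg" by ".h5".

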
def make_npydata():
    if not os.path.exists("./npydata"):
        os.makedirs("./npydata")

    try:
        sh_A_train_path = CWD + "/synchronous/dataset/ShanghaiTech/part_A_final/train_data/images_crop/"
        sh_A_test_path = CWD + "/synchronous/dataset/ShanghaiTech/part_A_final/test_data/images_crop/"

        train_fpaths = []
        for fname in os.listdir(sh_A_train_path):
            if fname.endswith(".jpg"):
                train_fpaths.append(sh_A_train_path + fname)
        train_fpaths.sort()
        np.save("./npydata/ShanghaiA_train.npy", train_fpaths)

        test_fpaths = []
        for fname in os.listdir(sh_A_test_path):
            if fname.endswith(".jpg"):
                test_fpaths.append(sh_A_test_path + fname)
        test_fpaths.sort()
        np.save("./npydata/ShanghaiA_test.npy", test_fpaths)

        print("Saved ShanghaiA image list (test: {} | train: {})".format(
            len(test_fpaths), len(train_fpaths)
        ))
    except Exception:
        print("The ShanghaiA dataset path is wrong.")

    try:
        sh_B_train_path = CWD + "/synchronous/dataset/ShanghaiTech/part_B_final/train_data/images_crop/"
        sh_B_test_path = CWD + "/synchronous/dataset/ShanghaiTech/part_B_final/test_data/images_crop/"

        train_fpaths = []
        for fname in os.listdir(sh_B_train_path):
            if fname.endswith(".jpg"):
                train_fpaths.append(sh_B_train_path + fname)
        train_fpaths.sort()
        np.save("./npydata/ShanghaiB_train.npy", train_fpaths)

        test_fpaths = []
        for fname in os.listdir(sh_B_test_path):
            if fname.endswith(".jpg"):
                test_fpaths.append(sh_B_test_path + fname)
        test_fpaths.sort()
        np.save("./npydata/ShanghaiB_test.npy", test_fpaths)

        print("Saved ShanghaiB image list (test: {} | train: {})".format(
            len(test_fpaths), len(train_fpaths)
        ))
    except Exception:
        print("The ShanghaiB dataset path is wrong.")

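
# The .npy files written above hold sorted arrays of absolute image paths;
# training code can recover them roughly as follows (a sketch):
#
#     train_list = np.load("./npydata/ShanghaiA_train.npy").tolist()

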
if __name__ == "__main__":
    # The dataset has to be downloaded manually first (see module docstring).
    pre_dataset_sh()  # XXX: preliminary
    print("Storing dataset paths...")
    make_npydata()