From 5e951ccac36113f49fc5db28101f2104156f1f84 Mon Sep 17 00:00:00 2001
From: Paul Park <paul.park@twosixtech.com>
Date: Fri, 27 Jan 2023 21:53:01 +0000
Subject: [PATCH] updated dapricot_dev; uploaded dataset to S3

---
 .../adversarial/dapricot_dev/__init__.py      |   3 +
 .../adversarial/dapricot_dev/checksums.tsv    |   1 +
 .../adversarial/dapricot_dev/dapricot_dev.py  | 369 ++++++++++++++++++
 .../dapricot_dev/dapricot_dev_test.py         |  25 ++
 .../TODO-add_fake_data_in_this_directory.txt  |   0
 armory/datasets/cached_datasets.json          |   7 +
 6 files changed, 405 insertions(+)
 create mode 100644 armory/datasets/adversarial/dapricot_dev/__init__.py
 create mode 100644 armory/datasets/adversarial/dapricot_dev/checksums.tsv
 create mode 100644 armory/datasets/adversarial/dapricot_dev/dapricot_dev.py
 create mode 100644 armory/datasets/adversarial/dapricot_dev/dapricot_dev_test.py
 create mode 100644 armory/datasets/adversarial/dapricot_dev/dummy_data/TODO-add_fake_data_in_this_directory.txt

diff --git a/armory/datasets/adversarial/dapricot_dev/__init__.py b/armory/datasets/adversarial/dapricot_dev/__init__.py
new file mode 100644
index 000000000..12702e4e3
--- /dev/null
+++ b/armory/datasets/adversarial/dapricot_dev/__init__.py
@@ -0,0 +1,3 @@
+"""dapricot_dev dataset."""
+
+from .dapricot_dev import DapricotDev
diff --git a/armory/datasets/adversarial/dapricot_dev/checksums.tsv b/armory/datasets/adversarial/dapricot_dev/checksums.tsv
new file mode 100644
index 000000000..55e66eca4
--- /dev/null
+++ b/armory/datasets/adversarial/dapricot_dev/checksums.tsv
@@ -0,0 +1 @@
+https://armory-public-data.s3.us-east-2.amazonaws.com/adversarial-datasets/dapricot_dev.tar.gz	79101937	f657cbb237878e28bee63ce7bcb15e9781d6c399a63e8487f7199ec84dae3956	dapricot_dev.tar.gz
diff --git a/armory/datasets/adversarial/dapricot_dev/dapricot_dev.py b/armory/datasets/adversarial/dapricot_dev/dapricot_dev.py
new file mode 100644
index 000000000..c7fadcc7d
--- /dev/null
+++ b/armory/datasets/adversarial/dapricot_dev/dapricot_dev.py
@@ -0,0 +1,369 @@
+"""dapricot_dev dataset."""
+
+import collections
+import json
+import os
+
+import tensorflow.compat.v1 as tf
+import tensorflow_datasets as tfds
+
+from armory.data.adversarial import pandas_proxy
+
+_DESCRIPTION = """
+LEGAL
+-----
+Copyright 2021 The MITRE Corporation. All rights reserved.
+"""
+
+_CITATION = """
+Dataset is unpublished at this time.
+"""
+# Archive that is downloaded and extracted; its checksum is in checksums.tsv.
+_URLS = "https://armory-public-data.s3.us-east-2.amazonaws.com/adversarial-datasets/dapricot_dev.tar.gz"
+
+
+class DapricotDev(tfds.core.GeneratorBasedBuilder):
+    """DatasetBuilder for the dapricot_dev dataset.
+
+    Each example bundles the images and annotations of the three cameras for
+    one capture, i.e. three files whose names differ only in the camera
+    number. The "large", "medium" and "small" splits are selected by the
+    distance token that ends each image file name ("dist5", "dist10" and
+    "dist15" respectively).
+    """
+
+    VERSION = tfds.core.Version("1.0.1")
+    RELEASE_NOTES = {
+        "1.0.0": "Initial release.",
+        "1.0.1": "Updated to access full dev dataset",
+    }
+
+    def _info(self) -> tfds.core.DatasetInfo:
+        """Return the dataset metadata.
+
+        Every top-level feature is a length-3 sequence with one entry per
+        camera, ordered [camera_1, camera_2, camera_3]:
+
+        * "image": the image itself; the generator supplies it as a file path.
+        * "images": file name, height, width and id of the image.
+        * "categories": the (id, name) categories of the camera's annotations.
+        * "objects": one entry per annotation of the image.
+        * "patch_metadata": green screen geometry and colorchecker colors.
+        """
+        features = {
+            "image": tfds.features.Sequence(
+                tfds.features.Image(shape=(None, None, 3)),
+                length=3,
+            ),
+            "images": tfds.features.Sequence(
+                tfds.features.FeaturesDict(
+                    {
+                        "file_name": tfds.features.Text(),
+                        "height": tf.int64,
+                        "width": tf.int64,
+                        "id": tf.int64,
+                    }
+                ),
+                length=3,
+            ),
+            "categories": tfds.features.Sequence(
+                tfds.features.Sequence(
+                    tfds.features.FeaturesDict(
+                        {
+                            "id": tf.int64,  # {'octagon':12, 'diamond':26, 'rect':29}
+                            "name": tfds.features.Text(),
+                        }
+                    )
+                ),
+                length=3,
+            ),
+            "objects": tfds.features.Sequence(
+                tfds.features.Sequence(
+                    {
+                        "id": tf.int64,
+                        "image_id": tf.int64,
+                        "area": tf.int64,  # un-normalized area
+                        "boxes": tfds.features.BBoxFeature(),  # normalized bounding box [ymin, xmin, ymax, xmax]
+                        "labels": tfds.features.ClassLabel(num_classes=91),
+                        "is_crowd": tf.bool,
+                    }
+                ),
+                length=3,
+            ),
+            "patch_metadata": tfds.features.Sequence(
+                # these data only apply to the "green screen patch" objects
+                tfds.features.FeaturesDict(
+                    {
+                        "gs_coords": tfds.features.Sequence(
+                            tfds.features.Tensor(
+                                shape=[2], dtype=tf.int64
+                            ),  # green screen vertices in (x,y)
+                        ),
+                        "cc_ground_truth": tfds.features.Tensor(
+                            shape=[24, 3], dtype=tf.float32
+                        ),  # colorchecker color ground truth
+                        "cc_scene": tfds.features.Tensor(
+                            shape=[24, 3], dtype=tf.float32
+                        ),  # colorchecker colors in a scene
+                        "shape": tfds.features.Text(),  # "diamond", "rect", "octagon"
+                    }
+                ),
+                length=3,
+            ),
+        }
+
+        return tfds.core.DatasetInfo(
+            builder=self,
+            description=_DESCRIPTION,
+            features=tfds.features.FeaturesDict(features),
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
+        """Download the archive and return one SplitGenerator per patch size.
+
+        Args:
+            dl_manager: download manager used to fetch and extract ``_URLS``.
+
+        Returns:
+            A list of SplitGenerators named "large", "medium" and "small". All
+            of them read the same extracted "dev" directory; the split name is
+            forwarded as ``size`` so the generator can filter by distance.
+        """
+        paths = dl_manager.download_and_extract(_URLS)
+
+        return [
+            tfds.core.SplitGenerator(
+                name=patch_size,
+                gen_kwargs={"path": os.path.join(paths, "dev"), "size": patch_size},
+            )
+            for patch_size in ["large", "medium", "small"]
+        ]
+
+    def _generate_examples(self, path, size):
+        """Yield ``(key, example)`` pairs for the split named ``size``.
+
+        Args:
+            path: extracted "dev" directory holding ``annotations/`` and the
+                ``scene_<N>/camera_<M>/`` image folders.
+            size: split name; one of "small", "medium" or "large".
+
+        Yields:
+            Tuples of an integer key and an example dict matching the features
+            declared in ``_info``. Keys count every image triple across all
+            scenes, including triples that belong to another split, so a
+            triple gets the same key no matter which split is generated.
+
+        Raises:
+            KeyError: if ``size`` is not a known split name.
+            AssertionError: if the file names of the three cameras disagree.
+        """
+        scenes = ["01", "06", "14"]
+        size_dist = {"small": "dist15", "medium": "dist10", "large": "dist5"}
+        # green screen shape code in the file name -> shape name expected by
+        # downstream algorithms
+        shape_names = {"stp": "octagon", "pxg": "diamond", "spd": "rect"}
+        # Resolve the split up front so an unknown name fails before any I/O.
+        wanted_dist = size_dist[size]
+
+        # Parsed colorchecker CSVs, keyed by file name.
+        cc_cache = {}
+
+        def read_cc(csv_name):
+            """Return the colorchecker values stored in ``annotations/<csv_name>``.
+
+            There are 24 color boxes, so the declared feature shape is (24, 3).
+            The same few CSVs feed every example, so each one is parsed a
+            single time instead of once per camera of every image.
+
+            Args:
+                csv_name: file name of the CSV inside ``annotations/``.
+
+            Returns:
+                Whatever ``pandas_proxy.read_csv_to_numpy_float32`` returns for
+                the file; the same object is reused on later calls.
+            """
+            if csv_name not in cc_cache:
+                cc_cache[csv_name] = pandas_proxy.read_csv_to_numpy_float32(
+                    os.path.join(path, "annotations", csv_name),
+                    header=None,
+                )
+            return cc_cache[csv_name]
+
+        def build_bbox(image, x, y, width, height):
+            """Convert an (x, y, width, height) box to a normalized tfds BBox.
+
+            The box is normalized by the size of the image it belongs to,
+            which equals normalizing by camera_1 whenever all images share
+            one size.
+
+            Args:
+                image: image dict providing "height" and "width".
+                x, y, width, height: un-normalized box; the max corner is
+                    (x + width, y + height).
+
+            Returns:
+                A ``tfds.features.BBox`` with ymin, xmin, ymax and xmax.
+            """
+            return tfds.features.BBox(
+                ymin=y / image["height"],
+                xmin=x / image["width"],
+                ymax=(y + height) / image["height"],
+                xmax=(x + width) / image["width"],
+            )
+
+        def build_coords(segmentation):
+            """Convert flat coordinates to a list of rounded [x, y] pairs.
+
+            Args:
+                segmentation: flat sequence (x0, y0, x1, y1, ...).
+
+            Returns:
+                [[x0, y0], [x1, y1], ...] with every value rounded to int.
+            """
+            xs = segmentation[::2]
+            ys = segmentation[1::2]
+            return [[int(round(x)), int(round(y))] for x, y in zip(xs, ys)]
+
+        # For each scene, read the annotation JSONs of all three cameras.
+        # Then walk the image triples in file-name order and yield the ones
+        # whose distance token matches this split.
+        yield_id = 0
+        for scene in scenes:
+            # One annotation JSON per camera of the scene.
+            cameras = [
+                DapricotAnnotation(
+                    os.path.join(
+                        path,
+                        "annotations/labels_scene_{}_camera_{}.json".format(
+                            scene, camera
+                        ),
+                    )
+                )
+                for camera in (1, 2, 3)
+            ]
+
+            # Sort by file name so position i is the same capture in all three
+            # cameras. NOTE(review): zip() below stops at the shortest list, so
+            # surplus images of one camera are dropped without an error.
+            images_per_camera = [
+                sorted(cam.images(), key=lambda im: im["file_name"].lower())
+                for cam in cameras
+            ]
+
+            for image_triple in zip(*images_per_camera):
+                # a list, like every other per-camera value of the example
+                images = list(image_triple)
+
+                # File names have the format
+                # "scene_#_camera_#_<SHAPE>_<HEIGHT>_<DIST>.JPG" and must match
+                # across cameras once the camera number is replaced by "1".
+                fname1 = images[0]["file_name"]
+                for other in images[1:]:
+                    parts = other["file_name"].split("_")
+                    assert fname1 == "_".join(
+                        parts[:3] + ["1"] + parts[4:]
+                    ), "{} and {} are inconsistent".format(
+                        fname1, other["file_name"]
+                    )
+
+                # Count every triple, even skipped ones, so keys stay stable.
+                yield_id += 1
+
+                # <DIST> is the last "_" token of the name without extension.
+                patch_size = fname1.split(".")[0].split("_")[-1].lower()
+                if patch_size != wanted_dist:
+                    # Belongs to another split: skip it before doing any of
+                    # the per-example work.
+                    continue
+
+                # object annotations of each camera's image
+                annotations = [
+                    cam.get_annotations(im["id"])
+                    for cam, im in zip(cameras, images)
+                ]
+
+                patch_metadata = []
+                for camera, (annos, im) in enumerate(zip(annotations, images), 1):
+                    # Presumably only the green screen patch annotation has a
+                    # segmentation; the first one found is used, and [0]
+                    # raises IndexError when an image has none.
+                    patch = [a for a in annos if len(a["segmentation"]) > 0][0]
+                    patch_metadata.append(
+                        {
+                            "gs_coords": build_coords(*patch["segmentation"]),
+                            "cc_ground_truth": read_cc(
+                                "xrite_passport_colors_sRGB-GMB-2005.csv"
+                            ),
+                            "cc_scene": read_cc(
+                                "scene_{}_camera_{}_CC_values.csv".format(
+                                    scene, camera
+                                )
+                            ),
+                            # token 4 of the file name is the <SHAPE> code
+                            "shape": shape_names[
+                                im["file_name"].split("_")[4].lower()
+                            ],
+                        }
+                    )
+
+                # Every value below is ordered [camera_1, camera_2, camera_3].
+                example = {
+                    "image": [
+                        os.path.join(
+                            path,
+                            "scene_{}/camera_{}".format(scene, camera),
+                            im["file_name"],
+                        )
+                        for camera, im in enumerate(images, 1)
+                    ],
+                    "images": images,
+                    "categories": [cam.categories() for cam in cameras],
+                    "objects": [
+                        [
+                            {
+                                "id": anno["id"],
+                                "image_id": anno["image_id"],
+                                "area": anno["area"],
+                                "boxes": build_bbox(im, *anno["bbox"]),
+                                "labels": anno["category_id"],
+                                "is_crowd": bool(anno["iscrowd"]),
+                            }
+                            for anno in annos
+                        ]
+                        for annos, im in zip(annotations, images)
+                    ],
+                    "patch_metadata": patch_metadata,
+                }
+
+                yield yield_id, example
+
+
+class DapricotAnnotation(object):
+    """Read-only view of one Dapricot annotation JSON file."""
+
+    def __init__(self, annotation_path):
+        with tf.io.gfile.GFile(annotation_path) as handle:
+            self._data = json.load(handle)
+
+        # Group annotations by the image they belong to (image_id == images["id"]),
+        # keeping each group ordered by annotation id.
+        grouped = collections.defaultdict(list)
+        for annotation in self._data["annotations"]:
+            grouped[annotation["image_id"]].append(annotation)
+        self._img_id2annotations = {}
+        for image_id, members in grouped.items():
+            members.sort(key=lambda annotation: annotation["id"])
+            self._img_id2annotations[image_id] = members
+
+    def categories(self):
+        """Return the "categories" list exactly as stored in the file."""
+        return self._data["categories"]
+
+    def images(self):
+        """Return the "images" list exactly as stored in the file."""
+        return self._data["images"]
+
+    def get_annotations(self, img_id):
+        """Return the annotations of ``img_id``, or an empty list if none exist."""
+        return self._img_id2annotations.get(img_id, [])
diff --git a/armory/datasets/adversarial/dapricot_dev/dapricot_dev_test.py b/armory/datasets/adversarial/dapricot_dev/dapricot_dev_test.py
new file mode 100644
index 000000000..7346f0df3
--- /dev/null
+++ b/armory/datasets/adversarial/dapricot_dev/dapricot_dev_test.py
@@ -0,0 +1,25 @@
+"""dapricot_dev dataset."""
+
+import tensorflow_datasets as tfds
+from . import dapricot_dev
+
+
+class DapricotDevTest(tfds.testing.DatasetBuilderTestCase):
+    """Tests for dapricot_dev dataset."""
+
+    DATASET_CLASS = dapricot_dev.DapricotDev
+    # Keys must be the split names produced by DapricotDev._split_generators.
+    # TODO(dapricot_dev): counts are placeholders until dummy_data is added;
+    # set each one to the number of fake examples of that split.
+    SPLITS = {
+        "large": 1,
+        "medium": 1,
+        "small": 1,
+    }
+
+    # download_and_extract is called with a single URL and DL_EXTRACT_RESULT is
+    # not set, so dummy_data itself must contain the "dev" directory.
+
+
+if __name__ == "__main__":
+    tfds.testing.test_main()
diff --git a/armory/datasets/adversarial/dapricot_dev/dummy_data/TODO-add_fake_data_in_this_directory.txt b/armory/datasets/adversarial/dapricot_dev/dummy_data/TODO-add_fake_data_in_this_directory.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/armory/datasets/cached_datasets.json b/armory/datasets/cached_datasets.json
index 23c3a76a8..7f1e9b619 100644
--- a/armory/datasets/cached_datasets.json
+++ b/armory/datasets/cached_datasets.json
@@ -27,6 +27,13 @@
         "url": null,
         "version": "3.0.2"
     },
+    "dapricot_dev": {
+        "sha256": "5051af98235e50c4c2cc6050479c4a153b8dab0b6a782e773f18c4ce95742f97",
+        "size": 76512992,
+        "subdir": "dapricot_dev/1.0.1",
+        "url": null,
+        "version": "1.0.1"
+    },
     "digit": {
         "sha256": "805fb5e33caf2029e13f4146c9d06fdb437ac5b0f0aa9668e3201922b617c559",
         "size": 8349857,