From 5e951ccac36113f49fc5db28101f2104156f1f84 Mon Sep 17 00:00:00 2001
From: Paul Park
Date: Fri, 27 Jan 2023 21:53:01 +0000
Subject: [PATCH] updated dapricot_dev; uploaded dataset to S3

---
 .../adversarial/dapricot_dev/__init__.py      |   3 +
 .../adversarial/dapricot_dev/checksums.tsv    |   1 +
 .../adversarial/dapricot_dev/dapricot_dev.py  | 323 ++++++++++++++++++
 .../dapricot_dev/dapricot_dev_test.py         |  27 ++
 .../TODO-add_fake_data_in_this_directory.txt  |   0
 armory/datasets/cached_datasets.json          |   7 +
 6 files changed, 361 insertions(+)
 create mode 100644 armory/datasets/adversarial/dapricot_dev/__init__.py
 create mode 100644 armory/datasets/adversarial/dapricot_dev/checksums.tsv
 create mode 100644 armory/datasets/adversarial/dapricot_dev/dapricot_dev.py
 create mode 100644 armory/datasets/adversarial/dapricot_dev/dapricot_dev_test.py
 create mode 100644 armory/datasets/adversarial/dapricot_dev/dummy_data/TODO-add_fake_data_in_this_directory.txt

diff --git a/armory/datasets/adversarial/dapricot_dev/__init__.py b/armory/datasets/adversarial/dapricot_dev/__init__.py
new file mode 100644
index 000000000..12702e4e3
--- /dev/null
+++ b/armory/datasets/adversarial/dapricot_dev/__init__.py
@@ -0,0 +1,3 @@
+"""dapricot_dev dataset."""
+
+from .dapricot_dev import DapricotDev
diff --git a/armory/datasets/adversarial/dapricot_dev/checksums.tsv b/armory/datasets/adversarial/dapricot_dev/checksums.tsv
new file mode 100644
index 000000000..55e66eca4
--- /dev/null
+++ b/armory/datasets/adversarial/dapricot_dev/checksums.tsv
@@ -0,0 +1 @@
+https://armory-public-data.s3.us-east-2.amazonaws.com/adversarial-datasets/dapricot_dev.tar.gz	79101937	f657cbb237878e28bee63ce7bcb15e9781d6c399a63e8487f7199ec84dae3956	dapricot_dev.tar.gz
diff --git a/armory/datasets/adversarial/dapricot_dev/dapricot_dev.py b/armory/datasets/adversarial/dapricot_dev/dapricot_dev.py
new file mode 100644
index 000000000..c7fadcc7d
--- /dev/null
+++ b/armory/datasets/adversarial/dapricot_dev/dapricot_dev.py
@@ -0,0 +1,323 @@
+"""dapricot_dev dataset."""
+
+import collections
+import functools
+import json
+import os
+
+import tensorflow.compat.v1 as tf
+import tensorflow_datasets as tfds
+
+from armory.data.adversarial import pandas_proxy
+
+_DESCRIPTION = """
+LEGAL
+-----
+Copyright 2021 The MITRE Corporation. All rights reserved.
+"""
+
+_CITATION = """
+Dataset is unpublished at this time.
+"""
+
+_URLS = "https://armory-public-data.s3.us-east-2.amazonaws.com/adversarial-datasets/dapricot_dev.tar.gz"
+
+_SCENES = ["01", "06", "14"]
+_CAMERAS = (1, 2, 3)
+
+# split name -> distance tag at the end of the image file names
+_SIZE_DIST = {"small": "dist15", "medium": "dist10", "large": "dist5"}
+
+# green screen shape given in file name -> shape expected in downstream algorithms
+_GREEN_SCREEN_SHAPES = {"stp": "octagon", "pxg": "diamond", "spd": "rect"}
+
+
+@functools.lru_cache(maxsize=None)
+def _read_cc(csv_path):
+    """Return the colorchecker values stored in a headerless CSV file.
+
+    Results are cached per path: the same few files are needed for every
+    example, so each one is read once instead of once per image.
+    """
+    return pandas_proxy.read_csv_to_numpy_float32(csv_path, header=None)
+
+
+def _build_bbox(bbox, image_height, image_width):
+    """Convert an un-normalized (x, y, width, height) box to a normalized BBox."""
+    x, y, width, height = bbox
+    return tfds.features.BBox(
+        ymin=y / image_height,
+        xmin=x / image_width,
+        ymax=(y + height) / image_height,
+        xmax=(x + width) / image_width,
+    )
+
+
+def _build_coords(segmentation):
+    """Convert (x0, y0, x1, y1, ...) to [[x0, y0], [x1, y1], ...] rounded to int."""
+    xs = segmentation[::2]
+    ys = segmentation[1::2]
+    return [[int(round(x)), int(round(y))] for (x, y) in zip(xs, ys)]
+
+
+def _first_segmentation(annotations, file_name):
+    """Return the segmentation of the first annotation that has a non-empty one.
+
+    Raises:
+      ValueError: if no annotation of the image carries a segmentation.
+    """
+    for anno in annotations:
+        if len(anno["segmentation"]) > 0:
+            return anno["segmentation"]
+    raise ValueError("no segmentation found for {}".format(file_name))
+
+
+def _check_consistent_file_names(images):
+    """Raise ValueError unless the images of all cameras belong together.
+
+    File names have the format "scene_#_camera_#_<SHAPE>_..._<DIST>.JPG"; with
+    the camera number replaced by "1" each of them must equal the camera 1 name.
+    """
+    fname1 = images[0]["file_name"]
+    for image in images[1:]:
+        fields = image["file_name"].split("_")
+        if fname1 != "_".join(fields[:3] + ["1"] + fields[4:]):
+            raise ValueError(
+                "{} and {} are inconsistent".format(fname1, image["file_name"])
+            )
+
+
+class DapricotDev(tfds.core.GeneratorBasedBuilder):
+    """DatasetBuilder for dapricot_dev dataset."""
+
+    VERSION = tfds.core.Version("1.0.1")
+    RELEASE_NOTES = {
+        "1.0.0": "Initial release.",
+        "1.0.1": "Updated to access full dev dataset",
+    }
+
+    def _info(self) -> tfds.core.DatasetInfo:
+        """Returns the dataset metadata."""
+        features = {
+            # all Sequences are for [camera_1, camera_2, camera_3]
+            "image": tfds.features.Sequence(
+                tfds.features.Image(shape=(None, None, 3)),  # encoding_format="jpeg"),
+                length=3,
+            ),
+            "images": tfds.features.Sequence(
+                tfds.features.FeaturesDict(
+                    {
+                        "file_name": tfds.features.Text(),
+                        "height": tf.int64,
+                        "width": tf.int64,
+                        "id": tf.int64,
+                    }
+                ),
+                length=3,
+            ),
+            "categories": tfds.features.Sequence(
+                tfds.features.Sequence(
+                    tfds.features.FeaturesDict(
+                        {
+                            "id": tf.int64,  # {'octagon':12, 'diamond':26, 'rect':29}
+                            "name": tfds.features.Text(),
+                        }
+                    )
+                ),
+                length=3,
+            ),
+            "objects": tfds.features.Sequence(
+                tfds.features.Sequence(
+                    {
+                        "id": tf.int64,
+                        "image_id": tf.int64,
+                        "area": tf.int64,  # un-normalized area
+                        "boxes": tfds.features.BBoxFeature(),  # normalized bounding box [ymin, xmin, ymax, xmax]
+                        "labels": tfds.features.ClassLabel(num_classes=91),
+                        "is_crowd": tf.bool,
+                    }
+                ),
+                length=3,
+            ),
+            "patch_metadata": tfds.features.Sequence(
+                # these data only apply to the "green screen patch" objects
+                tfds.features.FeaturesDict(
+                    {
+                        "gs_coords": tfds.features.Sequence(
+                            tfds.features.Tensor(
+                                shape=[2], dtype=tf.int64
+                            ),  # green screen vertices in (x,y)
+                        ),
+                        "cc_ground_truth": tfds.features.Tensor(
+                            shape=[24, 3], dtype=tf.float32
+                        ),  # colorchecker color ground truth
+                        "cc_scene": tfds.features.Tensor(
+                            shape=[24, 3], dtype=tf.float32
+                        ),  # colorchecker colors in a scene
+                        "shape": tfds.features.Text(),  # "diamond", "rect", "octagon"
+                    }
+                ),
+                length=3,
+            ),
+        }
+
+        return tfds.core.DatasetInfo(
+            builder=self,
+            description=_DESCRIPTION,
+            features=tfds.features.FeaturesDict(features),
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
+        """Returns one SplitGenerator per patch size."""
+        paths = dl_manager.download_and_extract(_URLS)
+
+        return [
+            tfds.core.SplitGenerator(
+                name=patch_size,
+                gen_kwargs={"path": os.path.join(paths, "dev"), "size": patch_size},
+            )
+            for patch_size in ["large", "medium", "small"]
+        ]
+
+    def _generate_examples(self, path, size):
+        """Yields (key, example) tuples for the split named `size`.
+
+        Args:
+          path: directory holding "annotations" and the "scene_*/camera_*" images.
+          size: split name, one of "small", "medium" or "large".
+
+        Raises:
+          ValueError: if the three cameras disagree on a file name or an image
+            has no annotation with a segmentation.
+        """
+        annotation_dir = os.path.join(path, "annotations")
+
+        yield_id = 0
+        # For each scene, read JSONs for all cameras.
+        # For each group of three images (one per camera), get the annotations
+        # and yield the relevant data.
+        for scene in _SCENES:
+            dapricots = [
+                DapricotAnnotation(
+                    os.path.join(
+                        annotation_dir,
+                        "labels_scene_{}_camera_{}.json".format(scene, camera),
+                    )
+                )
+                for camera in _CAMERAS
+            ]
+
+            # sort images alphabetically so all three cameras are consistent
+            images_per_camera = [
+                sorted(dapricot.images(), key=lambda x: x["file_name"].lower())
+                for dapricot in dapricots
+            ]
+
+            for images in zip(*images_per_camera):
+                _check_consistent_file_names(images)
+
+                # every image group consumes an id, whether or not it is yielded
+                yield_id = yield_id + 1
+
+                # file names end in "_<DIST>.JPG"; skip other patch sizes
+                dist = images[0]["file_name"].split(".")[0].split("_")[-1].lower()
+                if _SIZE_DIST[size] != dist:
+                    continue
+
+                # get object annotations for each image
+                annotations = [
+                    dapricot.get_annotations(image["id"])
+                    for dapricot, image in zip(dapricots, images)
+                ]
+
+                # all images are the same size, so using camera 1 is fine
+                height = images[0]["height"]
+                width = images[0]["width"]
+
+                example = {
+                    "image": [
+                        os.path.join(
+                            path,
+                            "scene_{}/camera_{}".format(scene, camera),
+                            image["file_name"],
+                        )
+                        for camera, image in zip(_CAMERAS, images)
+                    ],
+                    "images": list(images),
+                    "categories": [dapricot.categories() for dapricot in dapricots],
+                    "objects": [
+                        [
+                            {
+                                "id": anno["id"],
+                                "image_id": anno["image_id"],
+                                "area": anno["area"],
+                                "boxes": _build_bbox(anno["bbox"], height, width),
+                                "labels": anno["category_id"],
+                                "is_crowd": bool(anno["iscrowd"]),
+                            }
+                            for anno in annos
+                        ]
+                        for annos in annotations
+                    ],
+                    "patch_metadata": [
+                        {
+                            "gs_coords": _build_coords(
+                                *_first_segmentation(annos, image["file_name"])
+                            ),
+                            "cc_ground_truth": _read_cc(
+                                os.path.join(
+                                    annotation_dir,
+                                    "xrite_passport_colors_sRGB-GMB-2005.csv",
+                                )
+                            ),
+                            "cc_scene": _read_cc(
+                                os.path.join(
+                                    annotation_dir,
+                                    "scene_{}_camera_{}_CC_values.csv".format(
+                                        scene, camera
+                                    ),
+                                )
+                            ),
+                            # file_name has format
+                            # "scene_#_camera_#_<SHAPE>_..._<DIST>.JPG"
+                            "shape": _GREEN_SCREEN_SHAPES[
+                                image["file_name"].split("_")[4].lower()
+                            ],
+                        }
+                        for camera, annos, image in zip(_CAMERAS, annotations, images)
+                    ],
+                }
+
+                yield yield_id, example
+
+
+class DapricotAnnotation:
+    """Dapricot annotation helper class."""
+
+    def __init__(self, annotation_path):
+        """Load the annotation JSON and index its annotations by image id."""
+        with tf.io.gfile.GFile(annotation_path) as f:
+            data = json.load(f)
+        self._data = data
+
+        # for each images["id"], find all annotations such that annotations["image_id"] == images["id"]
+        img_id2annotations = collections.defaultdict(list)
+        for a in self._data["annotations"]:
+            img_id2annotations[a["image_id"]].append(a)
+        self._img_id2annotations = {
+            k: sorted(v, key=lambda a: a["id"])
+            for k, v in img_id2annotations.items()
+        }
+
+    def categories(self):
+        """Return the category dicts, as sorted in the file."""
+        return self._data["categories"]
+
+    def images(self):
+        """Return the image dicts, as sorted in the file."""
+        return self._data["images"]
+
+    def get_annotations(self, img_id):
+        """Return the annotations of the image id, sorted by id ([] if none)."""
+        return self._img_id2annotations.get(img_id, [])
diff --git a/armory/datasets/adversarial/dapricot_dev/dapricot_dev_test.py b/armory/datasets/adversarial/dapricot_dev/dapricot_dev_test.py
new file mode 100644
index 000000000..7346f0df3
--- /dev/null
+++ b/armory/datasets/adversarial/dapricot_dev/dapricot_dev_test.py
@@ -0,0 +1,27 @@
+"""dapricot_dev dataset."""
+
+import tensorflow_datasets as tfds
+from . import dapricot_dev
+
+
+class DapricotDevTest(tfds.testing.DatasetBuilderTestCase):
+    """Tests for dapricot_dev dataset."""
+
+    # TODO(dapricot_dev): add fake data to dummy_data/ and update the counts below
+    DATASET_CLASS = dapricot_dev.DapricotDev
+    # split names must match the ones returned by DapricotDev._split_generators
+    SPLITS = {
+        "large": 1,  # Number of fake large-patch examples
+        "medium": 1,  # Number of fake medium-patch examples
+        "small": 1,  # Number of fake small-patch examples
+    }
+
+    # If you are calling `download/download_and_extract` with a dict, like:
+    #   dl_manager.download({'some_key': 'http://a.org/out.txt', ...})
+    # then the tests need to provide the fake output paths relative to the
+    # fake data directory
+    # DL_EXTRACT_RESULT = {'some_key': 'output_file1.txt', ...}
+
+
+if __name__ == "__main__":
+    tfds.testing.test_main()
diff --git a/armory/datasets/adversarial/dapricot_dev/dummy_data/TODO-add_fake_data_in_this_directory.txt b/armory/datasets/adversarial/dapricot_dev/dummy_data/TODO-add_fake_data_in_this_directory.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/armory/datasets/cached_datasets.json b/armory/datasets/cached_datasets.json
index 23c3a76a8..7f1e9b619 100644
--- a/armory/datasets/cached_datasets.json
+++ b/armory/datasets/cached_datasets.json
@@ -27,6 +27,13 @@
         "url": null,
         "version": "3.0.2"
     },
+    "dapricot_dev": {
+        "sha256": "5051af98235e50c4c2cc6050479c4a153b8dab0b6a782e773f18c4ce95742f97",
+        "size": 76512992,
+        "subdir": "dapricot_dev/1.0.1",
+        "url": null,
+        "version": "1.0.1"
+    },
     "digit": {
         "sha256": "805fb5e33caf2029e13f4146c9d06fdb437ac5b0f0aa9668e3201922b617c559",
         "size": 8349857,