diff --git a/.gitignore b/.gitignore index b6e4761..0eb3510 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,5 @@ dmypy.json # Pyre type checker .pyre/ + +**/.DS_Store \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f5f55c..7fcd6ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,17 @@ All notable changes to this project will be documented in this file. The format is (loosely) based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). -## [v0.1.3] - 2022-01-23 - +## [v0.2.0] - 2022-02-02 - 2022-02-19 +### Added +- Import main validator as stac-validator was updated to 2.3.0 +- Added best practices docuument to repo +- Recommend 'self' link in links +- Check catalogs and collections use 'catalog.json' or 'collection.json' as a file name +- Check that links in collections and catalogs have a title field +- Recommend that eo:bands or similar information is provided in collection summaries +- Check for small thumbnail image file type + +## [v0.1.3] - 2022-01-23 ### Added - Check for bloated metadata, too many fields in properties - Check for geometry field, recommend that STAC not be used for non-spatial data diff --git a/sample_files/1.0.0/collection_no_summaries.json b/sample_files/1.0.0/collection-no-summaries.json similarity index 100% rename from sample_files/1.0.0/collection_no_summaries.json rename to sample_files/1.0.0/collection-no-summaries.json diff --git a/sample_files/1.0.0/collection-no-title.json b/sample_files/1.0.0/collection-no-title.json new file mode 100644 index 0000000..972a297 --- /dev/null +++ b/sample_files/1.0.0/collection-no-title.json @@ -0,0 +1,106 @@ +{ + "id": "simple-collection", + "type": "Collection", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.0.0/schema.json", + "https://stac-extensions.github.io/projection/v1.0.0/schema.json", + "https://stac-extensions.github.io/view/v1.0.0/schema.json" + ], + "stac_version": "1.0.0", + "description": "A simple collection demonstrating core catalog fields with links to a couple of items", + "title": "Simple Example Collection", + "providers": [ + { + "name": "Remote Data, Inc", + "description": "Producers of awesome spatiotemporal assets", + "roles": [ + "producer", + "processor" + ], + "url": "http://remotedata.io" + } + ], + "extent": { + "spatial": { + "bbox": [ + [ + 172.91173669923782, + 1.3438851951615003, + 172.95469614953714, + 1.3690476620161975 + ] + ] + }, + "temporal": { + "interval": [ + [ + "2020-12-11T22:38:32.125Z", + "2020-12-14T18:02:31.437Z" + ] + ] + } + }, + "license": "CC-BY-4.0", + "summaries": { + "platform": [ + "cool_sat1", + "cool_sat2" + ], + "constellation": [ + "ion" + ], + "instruments": [ + "cool_sensor_v1", + "cool_sensor_v2" + ], + "gsd": { + "minimum": 0.512, + "maximum": 0.66 + }, + "eo:cloud_cover": { + "minimum": 1.2, + "maximum": 1.2 + }, + "proj:epsg": { + "minimum": 32659, + "maximum": 32659 + }, + "view:sun_elevation": { + "minimum": 54.9, + "maximum": 54.9 + }, + "view:off_nadir": { + "minimum": 3.8, + "maximum": 3.8 + }, + "view:sun_azimuth": { + "minimum": 135.7, + "maximum": 135.7 + } + }, + "links": [ + { + "rel": "root", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "item", + "href": "./simple-item.json", + "type": "application/geo+json", + "title": "Simple Item" + }, + { + "rel": "item", + "href": "./core-item.json", + "type": "application/geo+json" + }, + { + "rel": "item", 
+ "href": "./extended-item.json", + "type": "application/geo+json", + "title": "Extended Item" + } + ] + } \ No newline at end of file diff --git a/sample_files/1.0.0/core-item-large-thumbnail.json b/sample_files/1.0.0/core-item-large-thumbnail.json new file mode 100644 index 0000000..d1a60ef --- /dev/null +++ b/sample_files/1.0.0/core-item-large-thumbnail.json @@ -0,0 +1,125 @@ +{ + "stac_version": "1.0.0", + "stac_extensions": [], + "type": "Feature", + "id": "20201211_223832_CS2", + "bbox": [ + 172.91173669923782, + 1.3438851951615003, + 172.95469614953714, + 1.3690476620161975 + ], + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + 172.91173669923782, + 1.3438851951615003 + ], + [ + 172.95469614953714, + 1.3438851951615003 + ], + [ + 172.95469614953714, + 1.3690476620161975 + ], + [ + 172.91173669923782, + 1.3690476620161975 + ], + [ + 172.91173669923782, + 1.3438851951615003 + ] + ] + ] + }, + "properties": { + "title": "Core Item", + "description": "A sample STAC Item that includes examples of all common metadata", + "datetime": null, + "start_datetime": "2020-12-11T22:38:32.125Z", + "end_datetime": "2020-12-11T22:38:32.327Z", + "created": "2020-12-12T01:48:13.725Z", + "updated": "2020-12-12T01:48:13.725Z", + "platform": "cool_sat1", + "instruments": [ + "cool_sensor_v1" + ], + "constellation": "ion", + "mission": "collection 5624", + "gsd": 0.512 + }, + "collection": "simple-collection", + "links": [ + { + "rel": "collection", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "root", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "parent", + "href": "./collection.json", + "type": "application/json", + "title": "Simple Example Collection" + }, + { + "rel": "alternate", + "type": "text/html", + "href": "http://remotedata.io/catalog/20201211_223832_CS2/index.html", + "title": "HTML version of this STAC Item" + } + ], + "assets": { + "analytic": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_analytic.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "4-Band Analytic", + "roles": [ + "data" + ] + }, + "thumbnail": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2.jpg", + "title": "Thumbnail", + "type": "image/avi", + "roles": [ + "thumbnail" + ] + }, + "visual": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2.tif", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "title": "3-Band Visual", + "roles": [ + "visual" + ] + }, + "udm": { + "href": "https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2_analytic_udm.tif", + "title": "Unusable Data Mask", + "type": "image/tiff; application=geotiff;" + }, + "json-metadata": { + "href": "http://remotedata.io/catalog/20201211_223832_CS2/extended-metadata.json", + "title": "Extended Metadata", + "type": "application/json", + "roles": [ + "metadata" + ] + }, + "ephemeris": { + "href": "http://cool-sat.com/catalog/20201211_223832_CS2/20201211_223832_CS2.EPH", + "title": "Satellite Ephemeris Metadata" + } + } + } \ No newline at end of file diff --git a/sample_files/1.0.0/core-item.json b/sample_files/1.0.0/core-item.json index 6bb2ded..89e479c 100644 --- a/sample_files/1.0.0/core-item.json +++ b/sample_files/1.0.0/core-item.json @@ -91,7 +91,7 @@ "thumbnail": { "href": 
"https://storage.googleapis.com/open-cogs/stac-examples/20201211_223832_CS2.jpg", "title": "Thumbnail", - "type": "image/png", + "type": "image/jpg", "roles": [ "thumbnail" ] diff --git a/setup.py b/setup.py index f293421..9fdceb2 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ """ from setuptools import setup, find_packages -__version__ = "0.1.3" +__version__ = "0.2.0" with open("README.md", "r") as fh: long_description = fh.read() @@ -19,7 +19,8 @@ "click>=7.1.2", "requests>=2.19.1", "jsonschema>=3.1.2b0", - "pytest" + "pytest", + "stac-validator==2.3.0" ], entry_points={ 'console_scripts': ['stac_check=stac_check.cli:main'] diff --git a/stac-best-practices.md b/stac-best-practices.md new file mode 100644 index 0000000..fdeb83a --- /dev/null +++ b/stac-best-practices.md @@ -0,0 +1,754 @@ +# STAC Best Practices + +## Table of Contents + +- **[Web Best Practices](#web-practices)** + - [Enable Cross-origin resource sharing (CORS)](#enable-cross-origin-resource-sharing-cors) + - [STAC on the Web](#stac-on-the-web) + - [Schema.org, JSON-LD, DCAT, microformats, etc](#schemaorg-json-ld-dcat-microformats-etc) + - [Deploying STAC Browser](#deploying-stac-browser) + - [Requester Pays](#requester-pays) +- **[Item Best Practices](#item-practices)** + - [Field and ID formatting](#item-ids) + - [Searchable Identifiers](#searchable-identifiers) + - [Field selection and Metadata Linking](#field-selection-and-metadata-linking) + - [Datetime selection](#datetime-selection) + - [Unlocated Items](#unlocated-items) + - [Unrectified Satellite Data](#unrectified-satellite-data) + - [Data that is not spatial](#data-that-is-not-spatial) + - [Representing Vector Layers in STAC](#representing-vector-layers-in-stac) +- **[Asset Best Practices](#asset-practices)** + - [Common Use Cases of Additional Fields for Assets](#common-use-cases-of-additional-fields-for-assets) + - [Working with Media Types](#working-with-media-types) + - [Common Media Types in STAC](#common-media-types-in-stac) + - [Formats with no registered media type](#formats-with-no-registered-media-type) + - [Asset Roles](#asset-roles) + - [List of Asset Roles](#list-of-asset-roles) + - [Thumbnail](#thumbnail) + - [Overview](#overview) + - [Visual](#visual) +- **[Catalog & Collection Best Practices](#catalog--collection-practices)** + - [Static and Dynamic Catalogs](#static-and-dynamic-catalogs) + - [Static Catalogs](#static-catalogs) + - [Dynamic Catalogs](#dynamic-catalogs) + - [Catalog Layout](#catalog-layout) + - [Dynamic Catalog Layout](#dynamic-catalog-layout) + - [Mixing STAC Versions](#mixing-stac-versions) + - [Using Summaries in Collections](#using-summaries-in-collections) + - [Use of links](#use-of-links) + - [Self-contained Catalogs](#self-contained-catalogs) + - [Published Catalogs](#published-catalogs) + - [Using Relation Types](#using-relation-types) + - [Versioning for Catalogs](#versioning-for-catalogs) + - [Example](#example) + - [Static to Dynamic best practices](#static-to-dynamic-best-practices) + - [Ingestion and links](#ingestion-and-links) + - [Keep catalogs in sync with cloud notification and queue services](#keep-catalogs-in-sync-with-cloud-notification-and-queue-services) + - [How to Differentiate STAC Files](#how-to-differentiate-stac-files) + + +This document makes a number of recommendations for creating real world SpatioTemporal Asset Catalogs. None of them +are required to meet the core specification, but following these practices will make life easier for client tooling +and for users. 
They come about from practical experience of implementors and introduce a bit more 'constraint' for +those who are creating STAC objects representing their data or creating tools to work with STAC. + +While the current goal of the core is to remain quite flexible and simple to meet a wide variety of use cases, +in time some of these may evolve to become part of the core specification. + +## Web Practices + +### Enable Cross-origin resource sharing (CORS) + +STAC strives to make geospatial information more accessible, by putting it on the web. Fundamental to STAC's vision is that +different tools will be able to load and display public-facing STAC data. But the web runs on a [Same origin +policy](https://en.wikipedia.org/wiki/Same-origin_policy), preventing web pages from loading information from other web locations +to prevent malicious scripts from accessing sensitive data. This means that by default a web page would only be able to load STAC +[Item](item-spec/item-spec.md) objects from the same server the page is on. +[Cross-origin resource sharing](https://en.wikipedia.org/wiki/Cross-origin_resource_sharing), +also known as 'CORS' is a protocol to enable safe communication across origins. But most web services turn it off by default. This +is generally a good thing, but unfortunately if CORS is not enabled then any browser-based STAC tool will not work. + +So to enable all the great web tools (like [stacindex.org](http://stacindex.org)) to work with your STAC implementation it is essential to +'enable CORS'. Most services have good resources on how to do this, like on [AWS S3](https://docs.aws.amazon.com/AmazonS3/latest/dev/cors.html), +[Google Cloud Storage](https://cloud.google.com/storage/docs/cross-origin), or [Apache Server](https://enable-cors.org/server_apache.html). +Many more are listed on [enable-cors.org](https://enable-cors.org/server.html). We recommend enabling CORS for all requests ('\*'), +so that diverse online tools can access your data. If you aren't sure if your server has CORS enabled you can use +[test-cors.org](https://www.test-cors.org/). Enter the URL of your STAC root [Catalog](catalog-spec/catalog-spec.md) or +[Collection](collection-spec/collection-spec.md) JSON and make sure it gets a response. + +### STAC on the Web + +One of the primary goals of STAC is to make spatiotemporal data more accessible on the web. One would have a right to be +surprised that there is nothing about HTML in the entire specification. This is because it is difficult to specify what +should be on web pages without ending up with very bad looking pages. But the importance of having web-accessible versions +of every STAC Item is paramount. + +The main recommendation is to have an HTML page for every single STAC Item, Catalog and Collection. They should be visually pleasing, +crawlable by search engines and ideally interactive. The current best practice is to use a tool in the STAC ecosystem called +[STAC Browser](https://github.com/radiantearth/stac-browser/). It can crawl most any valid STAC implementation and generate unique web +pages for each Item and Catalog (or Collection). While it has a default look and feel, the design can easily be +modified to match an existing web presence. And it will automatically turn any Item with a [Cloud Optimized +GeoTIFF](http://cogeo.org) asset into an interactive, zoomable web map (using [tiles.rdnt.io](http://tiles.rdnt.io/) to render +the tiles on a [leaflet](https://leafletjs.com/) map). 
It also attempts to encapsulate a number of best practices that enable +STAC Items to show up in search engines, though that part is still a work in progress - contributions to STAC Browser to help +are welcome! + +Implementors are welcome to generate their own web pages, and additional tools that automatically transform STAC JSON into +html sites are encouraged. In time there will likely emerge a set of best practices from an array of tools, and we may be +able to specify in the core standard how to make the right HTML pages. But for now it is useful for STAC implementations to focus on +making data available as JSON, and then leverage tools that can evolve at the same time to make the best HTML experience. This +enables innovation on the web generation and search engine optimization to evolve independently from the core data. + +#### Schema.org, JSON-LD, DCAT, microformats, etc + +There is a strong desire to align STAC with the various web standards for data. These include [schema.org](http://schema.org) +tags, [JSON-LD](https://json-ld.org/) (particularly for Google's [dataset +search](https://developers.google.com/search/docs/data-types/dataset)), [DCAT](https://www.w3.org/TR/vocab-dcat/) +and [microformats](http://microformats.org/wiki/about). STAC aims to work with as many as possible. Thusfar it has not seemed +to make sense to include any of them directly in the core STAC standard. They are all more intended to be a part of the HTML +pages that search engines crawl, so the logical place to do the integration is by leveraging a tool that generates HTML +from STAC like [STAC Browser](https://github.com/radiantearth/stac-browser/). STAC Browser has implemented a [mapping to +schema.org](https://github.com/radiantearth/stac-spec/issues/378) fields using JSON-LD, but the exact output is still being +refined. It is on the roadmap to add in more mapping and do more testing of search engines crawling the HTML pages. + +#### Deploying STAC Browser + +Most public STAC implementations have a STAC Browser hosted at [stacindex.org](https://stacindex.org/catalogs). +Anyone with a public STAC implementation is welcome to have a STAC Browser instance hosted for free, +just submit it to [stacindex.org](https://stacindex.org/add). +But the stronger recommendation is to host a STAC Browser on your own domain, and to customize its +design to look and feel like your main web presence. STAC aims to be decentralized, so each STAC-compliant data catalog +should have its own location and just be part of the wider web. + +### Requester Pays + +It is very common that large, freely available datasets are set up with a 'requester pays' configuration. This is an option +[on AWS](https://docs.aws.amazon.com/AmazonS3/latest/userguide/RequesterPaysBuckets.html) and [on +Google Cloud](https://cloud.google.com/storage/docs/requester-pays), that enables data providers to make their data +available to everyone, while the cloud platform charges access costs +(such as per-request and data '[egress](https://www.hostdime.com/blog/data-egress-fees-cloud/)') to the user accessing the data. +For popular datasets that are large in size the egress costs can be substantial, to the point where much +less data would be available if the cost of distribution was always on the data provider. + +For data providers using STAC with requester pays buckets, there are two main recommendations: + +1. Put the STAC JSON in a separate bucket that is public for everyone and **not** requestor pays. 
This enables the STAC metadata + to be far more crawlable and searchable, but the cost of the egress of STAC files should be miniscule compared to that of + the actual data. The STAC community can help you work with cloud providers for potential free hosting if you are doing open + data as requestor pays and aren't able to pay the costs of a completely open STAC bucket, as they are most all supportive of + STAC (but no guarantees and it may be on an alternate cloud). +2. For Asset href values to resources in a requestor pays bucket, use the cloud provider-specific protocol + (e.g., `s3://` on AWS and `gs://` on Google Cloud) instead of an `https://` url. + Most clients do not have special handling for `https://` links to cloud provider resources that require a requestor pays flag and authentication, + so they simply fail. Many clients have special handling for `s3://` or `gs://` URLs + that will add a requestor pays parameter and will apply appropriate authentication to the request. + Using cloud-specific protocols will at least give users an option to register a paid account and + allow the data provider to properly charge for access. + STAC-specific tools in turn can look for the cloud-specific protocols and know to use the requestor pays feature for that specific cloud platform. + +## Item Practices + +### Item IDs + +When defining one's STAC properties and fields there are many choices to make on how to name various aspects of one's +data. One of the key properties is the ID. The specification is quite flexible on ID's, primarily so that existing +providers can easily use their same ID when they translate their data into STAC - they just need to be sure it is globally +unique, so may need a prefix. But the use of URI or file path reserved characters such as `:` or `/` is discouraged since this will +result in [percented encoded](https://tools.ietf.org/html/rfc3986#section-2) [STAC API](https://github.com/radiantearth/stac-api-spec) +endpoints and it prevents the use of IDs as file names as recommended in the [catalog layout](#catalog-layout) best practices. + +### Searchable Identifiers + +When coming up with values for fields that contain searchable identifiers of some sort, like `constellation` or `platform`, +it is recommended that the identifiers consist of only lowercase characters, numbers, `_`, and `-`. +Examples include `sentinel-1a` (Sentinel-1), `landsat-8` (Landsat-8) and `envisat` (Envisat). +This is to provide consistency for search across Collections, so that people can just search for `landsat-8`, +instead of thinking through all the ways providers might have chosen to name it. + +### Field selection and Metadata Linking + +In general STAC aims to be oriented around **search**, centered on the core fields that users will want to search on to find +imagery. The core is space and time, but there are often other metadata fields that are useful. While the specification is +flexible enough that providers can fill it with tens or even hundreds of fields of metadata, that is not recommended. If +providers have lots of metadata then that can be linked to in the [Asset Object](item-spec/item-spec.md#asset-object) +(recommended) or in a [Link Object](item-spec/item-spec.md#link-object). There is a lot of metadata that is only of relevance +to loading and processing data, and while STAC does not prohibit providers from putting those type of fields in their items, +it is not recommended. 
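 As an illustrative sketch (the asset key and file name here are hypothetical, not taken from the spec), a provider can expose an extensive metadata document as a `metadata` asset instead of copying its fields into `properties`:

```json
{
  "assets": {
    "provider-metadata": {
      "href": "./20201211_223832_CS2_full_metadata.xml",
      "type": "application/xml",
      "title": "Full provider metadata record (illustrative)",
      "roles": ["metadata"]
    }
  }
}
```
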
For very large catalogs (hundreds of millions of records), +every additional field that is indexed will cost substantial money, so data providers are advised to just put the fields to be searched in STAC and +[STAC API](https://github.com/radiantearth/stac-api-spec) providers don't have bloated indices that no one actually uses. + +### Datetime selection + +The `datetime` field in a STAC Item's properties is one of the most important parts of a STAC Item, providing the T (temporal) of +STAC. And it can also be one of the most confusing, especially for data that covers a range of times. For many types of data it +is straightforward - it is the capture or acquisition time. But often data is processed from a range of captures - drones usually +gather a set of images over an hour and put them into a single image, mosaics combine data from several months, and data cubes +represent slices of data over a range of time. For all these cases the recommended path is to use `start_datetime` and +`end_datetime` fields from [common metadata](item-spec/common-metadata.md#date-and-time-range). The specification does allow one to set the +`datetime` field to `null`, but it is strongly recommended to populate the single `datetime` field, as that is what many clients +will search on. If it is at all possible to pick a nominal or representative datetime then that should be used. But sometimes that +is not possible, like a data cube that covers a time range from 1900 to 2000. Setting the datetime as 1950 would lead to it not +being found if you searched 1990 to 2000. + +Extensions that describe particular types of data can and should define their `datetime` field to be more specific. For example +a MODIS 8 day composite image can define the `datetime` to be the nominal date halfway between the two ranges. Another data type +might choose to have `datetime` be the start. The key is to put in a date and time that will be useful for search, as that is +the focus of STAC. If `datetime` is set to `null` then it is strongly recommended to use it in conjunction with an extension +that explains why it should not be set for that type of data. + +### Unlocated Items + +Though the [GeoJSON standard](https://tools.ietf.org/html/rfc7946) allows null geometries, in STAC we strongly recommend +that every item have a geometry, since the general expectation of someone using a SpatioTemporal Catalog is to be able to query +all data by space and time. But there are some use cases where it can make sense to create a STAC Item before it gets +a geometry. The most common of these is 'level 1' satellite data, where an image is downlinked and cataloged before it has +been geospatially located. + +The recommendation for data that does not yet have a location is to follow the GeoJSON concept that it is an ['unlocated' +feature](https://tools.ietf.org/html/rfc7946#section-3.2). So if the catalog has data that is not located then it can follow +GeoJSON and set the geometry to null. Though normally required, in this case the `bbox` field should not be included. + +Note that this recommendation is only for cases where data does not yet have a geometry and it cannot be estimated. There +are further details on the two most commonly requested desired use cases for setting geometry to null: + +#### Unrectified Satellite Data + +Most satellite data is downlinked without information that precisely describes where it is located on Earth. 
A satellite +imagery processing pipeline will always attempt to locate it, but often that process takes a number of hours, or never +quite completes (like when it is too cloudy). It can be useful to start to populate the Item before it has a geometry. +In this case the recommendation is to use the 'estimated' position from the satellite, to populate at least the bounding box, +and use the same broad bounds for the geometry (or leaving it null) until there is precise ground lock. This estimation is +usually done by onboard equipment, like GPS or star trackers, but can be off by kilometers or more. But it is very useful for +STAC users to be able to at least find approximate area in their searches. A commonly used field for communicating ground lock +is not yet established, but likely should be (an extension proposal would be appreciated). If there is no way to provide an +estimate then the data can be assigned a null geometry and no `bbox`, as described above. But the data will likely not +show up in STAC API searches, as most will at least implicitly use a geometry. Though this section is written with +satellite data in mind, one can easily imagine other data types that start with a less precise geometry but have it +refined after processing. + +#### Data that is not spatial + +The other case that often comes up is people who love STAC and want to use it to catalog everything they have, even if it is +not spatial. This use case is not currently supported by STAC, as we are focused on data that is both temporal and spatial +in nature. The [OGC API - Records](https://github.com/opengeospatial/ogcapi-records) is an emerging standard that likely +will be able to handle a wider range of data than STAC. It builds on [OGC API - +Features](https://github.com/opengeospatial/ogcapi-features) just like [STAC API](https://github.com/radiantearth/stac-api-spec/) +does. Using [Collection Assets](collection-spec/collection-spec.md#asset-object) may also provide an option for some +use cases. + +### Representing Vector Layers in STAC + +Many implementors are tempted to try to use STAC for 'everything', using it as a universal catalog of all their 'stuff'. +The main route considered is to use STAC to describe vector layers, putting a shapefile or [geopackage](http://geopackage.org) +as the `asset`. Though there is nothing in the specification that *prevents* this, it is not really the right level of +abstraction. A shapefile or geopackage corresponds to a Collection, not a single Item. The ideal thing to do with +one of those is to serve it with [OGC API - Features](https://github.com/opengeospatial/ogcapi-features) standard. This +allows each feature in the shapefile/geopackage to be represented online, and enables querying of the actual data. If +that is not possible then the appropriate way to handle Collection-level search is with the +[OGC API - Records](https://github.com/opengeospatial/ogcapi-records) standard, which is a 'brother' specification of STAC API. +Both are compliant with OGC API - Features, adding richer search capabilities to enable finding of data. + +## Asset Practices + +### Common Use Cases of Additional Fields for Assets + +As [described in the Item spec](item-spec/item-spec.md#additional-fields-for-assets), it is possible to use fields typically +found in Item properties at the asset level. This mechanism of overriding or providing Item Properties only in the Assets +makes discovery more difficult and should generally be avoided. 
However, there are some core and extension fields for which providing them at the Asset level can prove to be very useful when working with the data.

- `datetime`: Provide an individual timestamp for an Asset, in case the Item has a `start_datetime` and `end_datetime`, but the Asset is for one specific time.
- `gsd` ([Common Metadata](item-spec/common-metadata.md#instrument)): Specify some assets that represent instruments with a different spatial resolution than the overall best resolution. Note this should not be used for different spatial resolutions due to specific processing of assets - look into the [raster extension](https://github.com/stac-extensions/raster) for that use case.
- `eo:bands` ([EO extension](https://github.com/stac-extensions/eo/)): Provide spectral band information, and order of bands, within an individual asset.
- `proj:epsg`/`proj:wkt2`/`proj:projjson` ([projection extension](https://github.com/stac-extensions/projection/)): Specify a different projection for some assets. If the projection is different for all assets it should probably not be provided as an Item property. If most assets are one projection, and there is a single reprojected version (such as a Web Mercator preview image), it is sensible to specify the main projection in the Item and the alternate projection for the affected asset(s).
- `proj:shape`/`proj:transform` ([projection extension](https://github.com/stac-extensions/projection/)): If assets have different spatial resolutions and slightly different exact bounding boxes, specify these per asset to indicate the size of the asset in pixels and its exact GeoTransform in the native projection.
- `sar:polarizations` ([sar extension](https://github.com/stac-extensions/sar)): Provide the polarization content and ordering of a specific asset, similar to `eo:bands`.
- `sar:product_type` ([sar extension](https://github.com/stac-extensions/sar)): If mixing multiple product types within a single Item, this can be used to specify the product_type for each asset.

### Working with Media Types

[Media Types](https://en.wikipedia.org/wiki/Media_type) are a key element that enables STAC to be a rich source of information for clients. The best practice is to use as specific a media type as possible (so if a file is GeoJSON, don't use a plain JSON media type), and to use [registered](https://www.iana.org/assignments/media-types/media-types.xhtml) IANA types as much as possible. The following table lists types that commonly show up in STAC assets, and the [section](#formats-with-no-registered-media-type) after it gives recommendations on what to do if a format in your asset does not have an IANA registered type.

#### Common Media Types in STAC

The following table lists a number of commonly used media types in STAC. The first two (GeoTIFF and COG) are not fully standardized yet, but reflect the community consensus direction; the rest are IANA registered types that commonly show up in STAC implementations.

| Media Type | Description |
| ------------------------------------------------------- | ------------------------------------------------------------ |
| `image/tiff; application=geotiff` | GeoTIFF with standardized georeferencing metadata |
| `image/tiff; application=geotiff; profile=cloud-optimized` | [Cloud Optimized GeoTIFF](https://www.cogeo.org/) (unofficial).
Once there is an [official media type](http://osgeo-org.1560.x6.nabble.com/Media-type-tc5411498.html) it will be added and the custom media type here will be deprecated. | +| `image/jp2` | JPEG 2000 | +| `image/png` | Visual PNGs (e.g. thumbnails) | +| `image/jpeg` | Visual JPEGs (e.g. thumbnails, oblique) | +| `text/xml` or `application/xml` | XML metadata [RFC 7303](https://www.ietf.org/rfc/rfc7303.txt) | +| `application/json` | A JSON file (often metadata, or [labels](https://github.com/radiantearth/stac-spec/tree/master/extensions/label#labels-required)) | +| `text/plain` | Plain text (often metadata) | +| `application/geo+json` | [GeoJSON](https://geojson.org/) | +| `application/geopackage+sqlite3` | [GeoPackage](https://www.geopackage.org/) | +| `application/x-hdf5` | Hierarchical Data Format version 5 | +| `application/x-hdf` | Hierarchical Data Format versions 4 and earlier. | + +*Deprecation notice: GeoTiff previously used the media type `image/vnd.stac.geotiff` and +Cloud Optimized GeoTiffs used `image/vnd.stac.geotiff; profile=cloud-optimized`. +Both can still appear in old STAC implementations, but are deprecated and should be replaced. This will, unfortunately, likely shift in the future as +[OGC sorts out the media types](https://github.com/opengeospatial/geotiff/issues/34).* + +#### Formats with no registered media type + +Ideally every media type used is on the [IANA registry](https://www.iana.org/assignments/media-types/media-types.xhtml). If +you are using a format that is not on that list we recommend you use [custom content +type](https://restcookbook.com/Resources/using-custom-content-types/). These typically use the `vnd.` prefix, see [RFC 6838 +section-3.2](https://tools.ietf.org/html/rfc6838#section-3.2). Ideally the format provider will actually +register the media type with IANA, so that other STAC clients can find it easily. But if you are only using it internally it is +[acceptable to not register](https://stackoverflow.com/questions/29121241/custom-content-type-is-registering-with-iana-mandatory) +it. It is relatively easy to [register](https://www.iana.org/form/media-types) a `vnd` media type. + +### Asset Roles + +[Asset roles](item-spec/item-spec.md#asset-roles) are used to describe what each asset is used for. They are particular useful +when several assets have the same media type, such as when an Item has a multispectral analytic asset, a 3-band full resolution +visual asset, a down-sampled preview asset, and a cloud mask asset, all stored as Cloud Optimized GeoTIFF (COG) images. It is +recommended to use at least one role for every asset available, and using multiple roles often makes sense. For example you'd use +both `data` and `reflectance` if your main data asset is processed to reflectance, or `metadata` and `cloud` for an asset that +is a cloud mask, since a mask is considered a form of metadata (it's information about the data). Or if a single asset represents +several types of 'unusable data' it might include `metadata`, `cloud`, `cloud-shadow` and `snow-ice`. If there is not a clear +role in the [Asset Role Types](item-spec/item-spec.md#asset-role-types) or the following list then just pick a sensible name for +the role. And you are encouraged to add it to the list below and/or in an extension if you think the new role will have broader +applicability. 
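 As a minimal sketch (asset keys and hrefs are invented for illustration), roles are what distinguish assets that share the same COG media type:

```json
{
  "assets": {
    "visual": {
      "href": "./scene_visual.tif",
      "type": "image/tiff; application=geotiff; profile=cloud-optimized",
      "title": "Visual composite (illustrative example)",
      "roles": ["visual"]
    },
    "cloud_mask": {
      "href": "./scene_cloud_mask.tif",
      "type": "image/tiff; application=geotiff; profile=cloud-optimized",
      "title": "Cloud mask (illustrative example)",
      "roles": ["metadata", "cloud"]
    }
  }
}
```
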
+ +#### List of Asset Roles + +In addition to the thumbnail, data and overview [roles listed](item-spec/item-spec.md#asset-role-types) in the Item spec, there +are a number of roles that are emerging in practice, but don't have enough widespread use to justify standardizing them. So if +you want to re-use other roles then try to find them on the list below, and also feel free to suggest more to include here. + +The 'source' field lists where the role comes from. The ones the say Item Spec are the only 'official' roles that are fully +standardized. In time others on this list may migrate to a more 'official' list. Those that say 'best practice' are just from this +doc, the listing is the table below. The ones from extensions are mostly just 'best practices' in the extensions, as there are few +actual role requirements. + +| Role Name | Source | Description | +| --------- | -------------|----------------------------------------------------------------------- | +| thumbnail | [Item Spec](item-spec/item-spec.md#asset-role-types) | An asset that represents a thumbnail of the item, typically a true color image (for items with assets in the visible wavelengths), lower-resolution (typically smaller 600x600 pixels), and typically a JPEG or PNG (suitable for display in a web browser). Multiple assets may have this purpose, but it recommended that the `type` and `roles` be unique tuples. For example, Sentinel-2 L2A provides thumbnail images in both JPEG and JPEG2000 formats, and would be distinguished by their media types. | +| data | [Item Spec](item-spec/item-spec.md#asset-role-types) | The data itself. This is a suggestion for a common role for data files to be used in case data providers don't come up with their own names and semantics. | +| metadata | [Item Spec](item-spec/item-spec.md#asset-role-types) | A metadata sidecar file describing the data in this item, for example the Landsat-8 MTL file. | +| overview | Best Practice | An asset that represents a possibly larger view than the thumbnail of the Item, for example, a true color composite of multi-band data. | +| visual | Best Practice | An asset that is a full resolution version of the data, processed for visual use (RGB only, often sharpened ([pan-sharpened](https://en.wikipedia.org/wiki/Pansharpened_image) and/or using an [unsharp mask](https://en.wikipedia.org/wiki/Unsharp_masking))). | +| date | Best Practice | An asset that provides per-pixel acquisition timestamps, typically serving as metadata to another asset | +| graphic | Best Practice | Supporting plot, illustration, or graph associated with the Item | +| data-mask | Best Practice | File indicating if corresponding pixels have Valid data and various types of invalid data | +| snow-ice | Best Practice | Points to a file that indicates whether a pixel is assessed as being snow/ice or not. | +| land-water | Best Practice | Points to a file that indicates whether a pixel is assessed as being land or water. | +| water-mask | Best Practice | Points to a file that indicates whether a pixel is assessed as being water (e.g. flooding map). 
| iso-19139 | Best Practice | Points to an [ISO 19139](https://www.iso.org/standard/67253.html) metadata xml file | +| iso-19115 | Best Practice | Points to an [ISO 19115](https://www.iso.org/standard/53798.html) metadata file | +| reflectance, temperature, saturation, cloud, cloud-shadow | [EO Extension](https://github.com/stac-extensions/eo/blob/main/README.md#best-practices) | See the [table](https://github.com/stac-extensions/eo/blob/main/README.md#best-practices) in EO for more information, and the definitive list of roles related to EO. | +| incidence-angle, azimuth, sun-azimuth, sun-elevation, terrain-shadow, terrain-occlusion, terrain-illumination | [View Extension](https://github.com/stac-extensions/view/blob/main/README.md#best-practices) | See the [table](https://github.com/stac-extensions/view/blob/main/README.md#best-practices) in View for more information, and the definitive list of roles related to viewing angles. | +| local-incidence-angle, noise-power, amplitude, magnitude, sigma0, beta0, gamma0, date-offset, covmat, prd | [SAR Extension](https://github.com/stac-extensions/sar/blob/main/README.md#best-practices) | See the [table](https://github.com/stac-extensions/sar/blob/main/README.md#best-practices) in SAR for more information. , and the definitive list of roles related to SAR. | + +Some of the particular asset roles also have some best practices: + +##### Thumbnail + +Thumbnails are typically used to give quick overview, often embedded in a list of items. So think small with these, as +keeping the size down helps it load fast, and the typical display of a thumbnail won't benefit from a large size. Often 256 by +256 pixels is used as a default. Generally they should be no more than 600 by 600 pixels. Some implementors provide different sizes +of thumbnails - using something like thumbnail-small and thumbnail-large, with a small one being 100x100 pixels or less, for truly +fast rendering in a small image. Be sure to name one just 'thumbnail' though, as that's the default most STAC clients will look for. + +Thumbnails should be PNG, JPEG, or WebP, so that they can easily display in browsers, and they should be a true color composite +(red, green, and blue bands) if there are multiple bands. + +If your data for the Item does not come with a thumbnail already we do recommend generating one, which can be done quite easily. +[GDAL](https://gdal.org/) and [Rasterio](https://rasterio.readthedocs.io/en/latest/) both make this very easy - if you need help +just ask on the [STAC Gitter](https://gitter.im/SpatioTemporal-Asset-Catalog/Lobby). + +##### Overview + +An overview is a high-definition browse image of the dataset, giving the user more of a sense of the data than a thumbnail could. +It's something that can be easily displayed on a map without tiling, or viewed at full screen resolution (but not zoomed in). Similar +to a thumbnail it should be PNG, JPEG or WebP, for easy display in browsers, and should be a true color composite +(red, green, and blue bands) if there are multiple bands. The sizes could range from the high end of a thumbnail (600 by 600 pixels) +to a few thousand pixels on each side. + +###### Visual + +A visual asset is a full-resolution version of the data, but one that is optimized for display purposes. It can be in any file format, +but Cloud Optimized GeoTIFF's are preferred, since the inner pyramids and tiles enable faster display of the full resolution data. 
It is typically a composite of red, green, and blue bands, often with a nice color curve and sharpening for enhanced display. It should be possible to open it in non-specialist software and have it display just fine. It can complement assets where each band is in its own file (like Landsat), by providing the key display bands combined, or can complement assets where many non-visible bands are included, by being a lighter-weight file that has just the bands needed for display.

## Catalog & Collection Practices

*Note: This section uses the term 'Catalog' (with an uppercase C) to refer to the JSON entity specified in the [Catalog spec](catalog-spec/catalog-spec.md), and 'catalog' (with a lowercase c) to refer to any full STAC implementation, which can be any mix of Catalogs, Collections, and Items.*

### Static and Dynamic Catalogs

As mentioned in the main [overview](overview.md), there are two main types of catalogs - static and dynamic. This section explains each of them in more depth and shares some best practices for each.

#### Static Catalogs

A static catalog is an implementation of the STAC specification that does not respond dynamically to requests. It is simply a set of files on a web server that link to one another in a way that can be crawled, often stored in a cloud storage service like [Amazon S3](https://aws.amazon.com/s3/), [Azure Storage](https://azure.microsoft.com/en-us/services/storage/) or [Google Cloud Storage](https://cloud.google.com/storage/). But any HTTP server could expose a static catalog as files. The core JSON documents and link structures are encoded in the files, and work as long as things are structured properly. A static catalog can only really be crawled by search engines and active catalogs; it cannot respond to queries. But it is incredibly reliable, as there are no moving parts, no clusters or databases to maintain. The goal of STAC is to expose as much asset metadata online as possible, so the static catalog offers a very low barrier to entry for anyone with geospatial assets to make their data searchable.

Static catalogs tend to make extensive use of *sub-catalogs* to organize their Items into sensible browsing structures, as they can only have a single representation of their catalog, since the static nature means the structure is baked in. While it is up to the implementor to organize the catalog, it is recommended to arrange it in a way that would make sense for a human to browse a set of STAC Items in an intuitive manner.

Users indicate their intent for a file to be parsed as a Collection or Catalog using the required `type` field on each entity. For Collections, this field must have the value `Collection`, while for Catalogs, it must have the value `Catalog`. Additionally, we recommend that static STACs indicate their contents using the filenames `catalog.json` or `collection.json`, to distinguish the Catalog from other JSON files. In order to support multiple catalogs, the recommended practice is to place each Catalog file in its own namespace "directory". For example:

- current/catalog.json
- archive/catalog.json

#### Dynamic Catalogs

A dynamic catalog is implemented in software as an HTTP-based API, following the same specified JSON structure for Items, Catalogs and Collections. Its structure and responses are usually generated dynamically, instead of relying on a set of already defined files. But the result is the same, enabling the same discovery from people browsing and search engines crawling.
It generally indexes data for efficient responses, and aims to be easy for existing APIs to implement as a more standard interface for clients to consume. A dynamic catalog will sometimes be populated by a static catalog, or at least may have a 'backup' of its fields stored as a cached static catalog.

Dynamic catalogs often also implement the [STAC API](https://github.com/radiantearth/stac-api-spec/) specification, which responds to search queries (like "give me all imagery in Oahu gathered on January 15, 2017"). But they are not required to. One can have a dynamic service that only implements the core STAC specification, and is crawled by STAC API implementations that provide 'search'. For example, a content management system like Drupal or an open data catalog like CKAN could choose to expose its content as linked STAC Items by implementing a dynamic catalog.

One benefit of a dynamic catalog is that it can generate various 'views' of the catalog, exposing the same Items in different sub-catalog organization structures. For example, one catalog could divide sub-catalogs by date and another by providers, and users could browse down either path. The leaf Items should just be linked to in a single canonical location (or at least use a `rel` link that indicates the location of the canonical one).

### Catalog Layout

Creating a catalog involves a number of decisions as to what folder structure to use to represent sub-catalogs, Items and assets, and how to name them. The specification leaves this totally open, and you can link things however you want. But it is recommended to be thoughtful about the organization of sub-catalogs, putting them into a structure that a person might reasonably browse (since they likely will, following the [STAC on the Web](#stac-on-the-web) recommendations). For example, start with location, like a normal grid (path+row in Landsat) or administrative boundaries (country -> state-level), and then year, month, day. Or do the opposite - date first and then location. Making a huge unordered list is technically allowed, but not helpful for discovery of data. Thus it is generally considered a best practice to use sub-catalogs to keep the size of each sub-catalog under a megabyte. If a sub-catalog lists tens of thousands of child Items then you should consider an additional way to break it up.

We encourage people to explore new structures for linking data, but the following list is what a number of implementors have ended up doing. Following these recommendations makes for more legible catalogs, and many tools operate more efficiently when they are followed.

1. Root documents (Catalogs / Collections) should be at the root of a directory tree containing the static catalog.
2. Catalogs should be named `catalog.json` and Collections should be named `collection.json`.
3. Items should be named `<id>.json`.
4. Sub-Catalogs or sub-Collections should be stored in subdirectories of their parent (and only 1 subdirectory deeper than a document's parent, e.g. `.../sample/sub1/catalog.json`).
5. Items should be stored in subdirectories of their parent Catalog or Collection. This means that each Item and its assets are contained in a unique subdirectory.
6. Limit the number of Items in a Catalog or Collection, grouping / partitioning as relevant to the dataset.
7. Use structural elements (Catalog and Collection) consistently across each 'level' of your hierarchy.
+ For example, if levels 2 and 4 of the hierarchy only contain Collections, + don't add a Catalog at levels 2 and 4. + +One further recommendation to help tools is to always include the 'title' field when including a link, especially in the +`item`, `child`, `parent` and `root` links, even if it repeats several times. This should be the same as the 'title' in the +link destination. Having this enables clients to display a nice human readable name of the link without having to open the +link destination. + +#### Dynamic Catalog Layout + +While these recommendations were primarily written for [static catalogs](#static-catalogs), they apply +equally well to [dynamic catalogs](#dynamic-catalogs). Subdirectories of course would just be URL paths +generated dynamically, but the structure would be the same as is recommended. + +One benefit of a dynamic catalog is that it can generate various 'views' of the catalog, exposing the same Items in +different sub-catalog organization structures. For example one catalog could divide sub-catalogs by date and another +by providers, and users could browse down to both. The leaf Items should just be linked to in a single canonical location +(or at least use a rel link that indicates the location of the canonical one). It is recommended that dynamic catalogs +provide multiple 'views' to allow users to navigate in a way that makes sense to them, providing multiple 'sub-catalogs' +from the root that enable different paths to browse (country/state, date/time, constellation/satellite, etc). But the +canonical 'rel' link should be used to designate the primary location of the Item to search engine crawlers. + +#### Mixing STAC Versions + +Although it is allowed to mix STAC versions, it should be used carefully as clients may not support all versions so that +the catalog could be of limited use to users. A Catalog or Collection linking to differently versioned Sub-Catalogs or Sub-Collections +is a common use case when multiple data source are combined. Client developers should be aware of this use case. Nevertheless, it +is strongly recommended that Catalogs don't contain differently versioned Items so that users/clients can at least use and/or download +consistent (Sub-)Catalogs containing either all or no data. Collections that are referenced from Items should always use the same +STAC version. Otherwise some behaviour of functionality may be unpredictable (e.g. merging common fields into Items or reading summaries). + +### Using Summaries in Collections + +One of the strongest recommendations for STAC is to always provide [summaries](collection-spec/collection-spec.md#summaries) in +your Collections. The core team decided to not require them, in case there are future situations where providing a summary +is too difficult. The idea behind them is not to exhaustively summarize every single field in the Collection, but to provide +a bit of a 'curated' view. + +Some general thinking on what to summarize is as follows: + +- Any field that is a range of data (like numbers or dates) is a great candidate to summarize, to give people a sense what values +the data might be. For example in overhead imagery, a +[`view:off_nadir`](https://github.com/stac-extensions/view/blob/main/README.md#item-properties-and-item-asset-fields) +with a range of 0 to 3 would tell people this imagery is all pretty much straight down, +while a value of 15 to 40 would tell them that it's oblique imagery, or 0 to 60 that it's +a Collection with lots of different look angles. 
+ +- Fields that have only one or a handful of values are also great to summarize. Collections with a single satellite may +use a single [`gsd`](item-spec/common-metadata.md#instrument) field in the summary, and it's quite useful for users to know +that all data is going to be the same resolution. Similarly it's useful to know the names of all the +[`platform` values](item-spec/common-metadata.md#instrument) that are used in the Collection. + +- It is less useful to summarize fields that have numerous different discrete values that can't easily be represented +in a range. These will mostly be string values, when there aren't just a handful of options. For example if you had a +'location' field that gave 3 levels of administrative region (like 'San Francisco, California, United States') to help people +understand more intuitively where a shot was taken. If your Collection has millions of Items, or even hundreds, you don't want +to include all the different location string values in a summary. + +- Fields that consist of arrays are more of a judgement call. For example [`instruments`](item-spec/common-metadata.md#instrument) +is straightforward and recommended, as the elements of the array are a discrete set of options. On the other hand +[`proj:transform`](https://github.com/stac-extensions/projection/blob/main/README.md#projtransform) +makes no sense to summarize, as the union of all the values +in the array are meaningless, as each Item is describing its transform, so combining them would just be a bunch of random numbers. +So if the values contained in the array are independently meaningful (not interconnected) and there aren't hundreds of potential +values then it is likely a good candidate to summarize. + +We do highly recommend including an [`eo:bands`](https://github.com/stac-extensions/eo/blob/main/README.md#eobands) +summary if your Items implement `eo:bands`, +especially if it represents just one satellite or constellation. This should be a union of all the potential bands that you +have in assets. It is ok to only add the summary at the Collection level without putting an explicit `eo:bands` summary at the +`properties` level of an Item, since that is optional. This gives users of the Collection a sense of the sensor capabilities without +having to examine specific Items or aggregate across every Item. + +Note that the ranges of summaries don't have to be exact. If you are publishing a catalog that is constantly updating with +data from a high agility satellite you can put the `view:off_nadir` range to be the expected values, based on the satellite +design, instead of having it only represent the off nadir angles that are Items for assets already captured in the catalog. +We don't want growing catalogs to have to constantly check and recalculate their summaries whenever new data comes in - its +just meant to give users a sense of what types of values they could expect. + +### Use of links + +The STAC specifications allow both relative and absolute links, and says that `self` links are not required, but are +strongly recommended. This is what the spec must say to enable the various use cases, but there is more subtlety for when it +is essential to use different link types. The best practice is to use one of the below catalog types, applying the link +recommendations consistently, instead of just haphazardly applying relative links in some places and absolute ones in other places. + +#### Self-contained Catalogs + +A 'self-contained catalog' is one that is designed for portability. 
Users may want to download a catalog from online and be +able to use it on their local computer, so all links need to be relative. Or a tool that creates catalogs may need to work +without knowing the final location that it will live at online, so it isn't possible to set absolute 'self' URL's. These use +cases should utilize a catalog that follows the listed principles: + +- **Only relative href's in structural `links`**: The full catalog structure of links down to sub-catalogs and Items, and their +links back to their parents and roots, should be done with relative URL's. The structural rel types include `root`, `parent`, +`child`, `item`, and `collection`. Other links can be absolute, especially if they describe a resource that makes less sense in +the catalog, like [sci:doi](https://github.com/stac-extensions/scientific/blob/main/README.md#item-and-collection-fields), +`derived_from` or even `license` (it can be nice to include the license in the catalog, but some licenses live at a canonical +online location which makes more sense to refer to directly). This enables the full catalog to be downloaded or +copied to another location and to still be valid. This also implies no `self` link, as that link must be absolute. + +- **Use Asset `href` links consistently**: The links to the actual assets are allowed to be either relative or absolute. There +are two types of 'self-contained catalogs'. + +#### Self-contained Metadata Only + +These consist of just the STAC metadata (Collection, Catalog and Item files), and uses absolute href +links to refer to the online locations of the assets. + +#### Self-contained with Assets + +These use relative href links for the assets, and includes them in the folder structure. +This enables offline use of a catalog, by including all the actual data, referenced locally. + +Self-contained catalogs tend to be used more as static catalogs, where they can be easily passed around. But often they will +be generated by a more dynamic STAC service, enabling a subset of a catalog or a portion of a search criteria to be downloaded +and used in other contexts. That catalog could be used offline, or even published in another location. + +Self-contained catalogs are not just for offline use, however - they are designed to be able to be published online and to live +on the cloud in object storage. They just aim to ease the burden of publishing, by not requiring lots of updating of links. +Adding a single `self` link at the root is recommended for online catalogs, +turning it into a 'relative published catalog', as detailed below. +This anchors it in an online location and enables provenance tracking. + +#### Published Catalogs + +While STAC is useful as a portable format to move between systems, the goal is really to enable search. While any combination +of absolute and relative links is technically allowed by the specification, it is strongly recommended to follow one of the +patterns described below when publishing online. Many clients will not properly handle arbitrary mixes of absolute and relative +href's. + +We refer to a 'published catalog' as one that lives online in a stable location, and uses `self` links to establish its location and +enable easy provenance tracking. There are two types of published catalogs: + +#### Absolute Published Catalog + +This is a catalog that uses absolute links for everything, both in the `links` objects and in the +`asset` hrefs. It includes `self` links for every Item. 
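 For illustration only (the domain and paths are hypothetical), the `links` of an Item in an absolute published catalog might look like:

```json
{
  "links": [
    {"rel": "self", "href": "https://stac.example.com/sentinel-2/items/item-001.json", "type": "application/geo+json"},
    {"rel": "root", "href": "https://stac.example.com/catalog.json", "type": "application/json"},
    {"rel": "parent", "href": "https://stac.example.com/sentinel-2/collection.json", "type": "application/json"},
    {"rel": "collection", "href": "https://stac.example.com/sentinel-2/collection.json", "type": "application/json"}
  ]
}
```
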

#### Relative Published Catalog

This is a self-contained catalog as described above, except it includes an absolute `self` link at
the root to identify its online location. This is designed so that a self-contained catalog (of either type, with its
assets or just metadata) can be 'published' online
by just adding one field (the self link) to its root (Catalog or Collection). All the other links should remain the same. The resulting catalog
is no longer compliant with the self-contained catalog recommendations, but instead becomes a 'relative published catalog'.
With this, a client may resolve Item and sub-catalog self links by traversing parent and root links, but this requires reading
multiple sources.

So if you are writing a STAC client it is recommended to start with just supporting these two types of published catalogs. In
turn, if your data is published online publicly or for use on an intranet then following these recommendations will ensure
that a wider range of clients will work with it.

### Using Relation Types

Implementors of STAC are highly recommended to be quite liberal with their `links`, and to use the `rel` field (in conjunction
with the `type` field) to communicate the structure and content of related entities. While each STAC spec describes some of the
'custom' relations STAC has set, the ideal is to reuse official [IANA Link Relation
Types](https://www.iana.org/assignments/link-relations/link-relations.xhtml) as much as possible. The following table describes
a number of the common official relations that are used in production STAC implementations.

| Type | Description |
| --------- | ------------------------------------------------------------ |
| alternate | It is recommended that STAC Items are also available as HTML, and should use this rel with `"type" : "text/html"` to tell clients where they can get a version of the Item or Collection to view in a browser. See [STAC on the Web in Best Practices](#stac-on-the-web) for more information. |
| canonical | The URL of the [canonical](https://en.wikipedia.org/wiki/Canonical_link_element) version of the Item or Collection. API responses and copies of catalogs should use this to inform users that they are a direct copy of another STAC Item, using the canonical rel to refer back to the primary location. |
| via | The URL of the source metadata that this STAC Item or Collection is created from. Used similarly to canonical, but refers back to a non-STAC record (Landsat MTL, Sentinel tileInfo.json, etc.). |
| prev | Indicates that the link's context is part of a series, and that the previous in the series is the link target. Typically used in STAC by APIs, to return smaller groups of Items or Catalogs/Collections. |
| next | Indicates that the link's context is part of a series, and that the next in the series is the link target. Typically used in STAC by APIs, to return smaller groups of Items or Catalogs/Collections. |
| preview | Refers to a resource that serves as a preview (see [RFC 6903, sec. 3](https://tools.ietf.org/html/rfc6903#section-3)), usually a lower resolution thumbnail. In STAC this would usually be the same URL as the [thumbnail](#thumbnail) asset, but adding it as a link in addition enables OGC API clients that can't read assets to make use of it. It also adds support for thumbnails to STAC Catalogs, as they can't list assets. |

Being liberal with the `links` also means that it's ok to have repeated links with the same `href`. For example the
`parent` and `root` relation types will point at the same file when the child is directly below the root, and it is
recommended to include both.

### Versioning for Catalogs

In the Item and Collection STAC JSON, versions and deprecation can be indicated with the
[Versioning Indicators Extension](https://github.com/stac-extensions/version).

The [Items and Collections API Version Extension](https://github.com/stac-extensions/version/) provides endpoints and
semantics for keeping and accessing previous versions of Collections and Items. The same semantics can be used in static
catalogs to preserve previous versions of the documents and link them together.

In order to achieve this, the static catalog must make sure that for every record created, a copy of the record is also
created in a separate location and named with the version id adopted by the catalog. See
[here](https://github.com/stac-extensions/version/blob/main/README.md#version-id) for recommendations on the version id scheme.

The main record should also provide a link to the versioned record following the linking patterns described
[here](https://github.com/stac-extensions/version/blob/main/README.md#relation-types). For every update to the record, the same
cycle is repeated:

1. Add a link from the updated record to the previous version
2. Create a copy of the updated record and name it correctly

#### Example

When the record `my_item.json` is created, a copy of it is also created. `my_item.json` includes a permalink to `my_item_01.json`.
The version suffix of the file name is taken from the version field of the record when it is available.

- `root / collections / example_collection / items / my_item / my_item.json`
- `root / collections / example_collection / items / my_item / my_item_01.json`

When `my_item.json` is updated, the new `my_item.json` includes a link to `my_item_01.json` and is also copied to `my_item_02.json`.
This ensures that `my_item_02.json` includes a link to `my_item_01.json`.

- `root / collections / example_collection / items / my_item / my_item.json`
- `root / collections / example_collection / items / my_item / my_item_01.json`
- `root / collections / example_collection / items / my_item / my_item_02.json`

### Static to Dynamic best practices

Many implementors are using static catalogs to be the reliable core of their dynamic services, or layering their STAC API
on top of any static catalog that is published. These are some recommendations on how to handle this:

#### Ingestion and links

Implementors have found that it's best to 'ingest' a static STAC into an internal datastore (often Elasticsearch, but a
traditional database could work fine too) and then generate the full STAC API responses from that internal representation.
There are instances that have the API refer directly to the static STAC Items, but this only works well if the static STAC
catalog is an 'absolute published catalog'. So the recommendation is to always use absolute links - either by publishing the
static catalog as an absolute published catalog, or by creating new absolute links for the STAC `search/` endpoint
responses, with the API's location as the base URL. The `/` endpoint with the catalog could either link directly
to the static catalog, or follow the 'dynamic catalog layout' recommendations above with a new set of URLs.
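
As a minimal sketch of that ingestion pattern (all names, paths and URLs here are hypothetical; a production system would typically use Elasticsearch or a database rather than an in-memory dict), ingested Items get fresh absolute links rooted at the API's location:

```python
from urllib.parse import urljoin

API_BASE = "https://api.example.com/"  # hypothetical API location

item_store = {}  # item id -> STAC Item; stands in for Elasticsearch or a database


def ingest_item(item: dict) -> None:
    """Store a static STAC Item and re-root its links at the API."""
    item = dict(item)
    item["links"] = [
        # hypothetical path; a real API would follow its own layout
        {"rel": "self",
         "href": urljoin(API_BASE, f"items/{item['id']}"),
         "type": "application/geo+json"},
        {"rel": "root", "href": API_BASE, "type": "application/json"},
        # a link back to the Item's canonical static location can be kept
        # as well, as discussed below
    ]
    item_store[item["id"]] = item


def search_response(item_ids: list) -> dict:
    """Build an ItemCollection-style response from the internal store."""
    return {
        "type": "FeatureCollection",
        "features": [item_store[i] for i in item_ids if i in item_store],
    }
```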

Ideally each Item would use its `links` to provide a reference back to the static location. The location of the static
Item should be treated as the canonical location, as the generated API is more likely to move or be temporarily down. The
spec provides the `derived_from` rel type, which fits well enough, but `canonical` is likely the more appropriate one,
as everything but the links should be the same.

#### Keep catalogs in sync with cloud notification and queue services

There is a set of emerging practices to use services like Amazon's Simple Queue Service (SQS)
and Simple Notification Service (SNS) to keep catalogs in sync.
There is a great [blog post](https://aws.amazon.com/blogs/publicsector/keeping-a-spatiotemporal-asset-catalog-stac-up-to-date-with-sns-sqs/)
on the CBERS STAC implementation on AWS.
The core idea is that a static catalog should emit a notification whenever it changes. The recommendation for SNS is to use the STAC
Item JSON as the message body, with some fields such as a scene's datetime and geographic bounding box that allow
basic geographic filtering by listeners.

The dynamic STAC API would then listen to the notifications and update its internal datastore whenever new data comes into
the static catalog. Implementors have had success using AWS Lambda to do a full 'serverless' update of the Elasticsearch
database, but it could just as easily be a server-based process.

## How to Differentiate STAC Files

Any tool that crawls a STAC implementation or encounters a STAC file in the wild needs a clear way to determine if it is an Item,
Collection or Catalog. As of 1.0.0 this is done primarily
with the `type` field, and secondarily in Items with `stac_version`, or optionally the `rel` of the link to it.

```text
if type is 'Collection'
  => Collection
else if type is 'Catalog'
  => Catalog
else if type is 'Feature' and stac_version is defined
  => Item
else
  => Invalid (JSON)
```

When crawling a STAC implementation, one can also make use of the
[relation type](catalog-spec/catalog-spec.md#relation-types) (`rel` field) when following a link. If it is an `item` rel type then the file must be a STAC Item. If it is `child`, `parent` or
`root` then it must be a Catalog or a Collection, though the final determination between the two requires looking at the `type` field
in the Catalog or Collection JSON that is linked to. Note that there is also a `type` field in STAC Link and Asset objects, but that
is for the media type, and there are no specific media types for Catalog and Collection. See the sections on [STAC media
types](catalog-spec/catalog-spec.md#media-types) and [Asset media types](item-spec/item-spec.md#asset-media-type) for more information.

In versions of STAC prior to 1.0 the process was a bit more complicated, as there was no `type` field for Catalogs and Collections.
See [this issue comment](https://github.com/radiantearth/stac-spec/issues/889#issuecomment-684529444) for a heuristic that works
for older STAC versions.
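
For reference, here is a minimal Python sketch of the same logic (operating on an already-parsed JSON dict; this is illustrative only and not the stac-check or PySTAC implementation):

```python
from typing import Any, Dict


def classify_stac(stac: Dict[str, Any]) -> str:
    """Classify a parsed STAC 1.0.0 JSON object, mirroring the pseudocode above."""
    obj_type = stac.get("type")
    if obj_type == "Collection":
        return "Collection"
    if obj_type == "Catalog":
        return "Catalog"
    if obj_type == "Feature" and "stac_version" in stac:
        return "Item"
    return "Invalid (JSON)"


print(classify_stac({"type": "Feature", "stac_version": "1.0.0"}))  # -> Item
```

When following a `child`, `parent` or `root` link, the same check on the fetched JSON distinguishes a Catalog from a Collection.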
\ No newline at end of file diff --git a/stac_check/lint.py b/stac_check/lint.py index d4e29bf..e98e2f6 100644 --- a/stac_check/lint.py +++ b/stac_check/lint.py @@ -1,5 +1,5 @@ -from .validate import StacValidate -from .utilities import is_valid_url +from stac_validator.validate import StacValidate +from stac_validator.utilities import is_valid_url import json import os from dataclasses import dataclass @@ -19,7 +19,7 @@ def __post_init__(self): self.message = self.validate_file(self.item) self.asset_type = self.check_asset_type() self.version = self.check_version() - self.validator_version = "2.4.0" + self.validator_version = "2.3.0" self.update_msg = self.set_update_message() self.valid_stac = self.message["valid_stac"] self.error_type = self.check_error_type() @@ -162,6 +162,42 @@ def check_searchable_identifiers(self): def check_percent_encoded(self): return self.asset_type == "ITEM" and "/" in self.object_id or ":" in self.object_id + def check_thumbnail(self): + if "assets" in self.data: + if "thumbnail" in self.data["assets"]: + if "type" in self.data["assets"]["thumbnail"]: + if "png" in self.data["assets"]["thumbnail"]["type"] or "jpeg" in self.data["assets"]["thumbnail"]["type"] or \ + "jpg" in self.data["assets"]["thumbnail"]["type"] or "webp" in self.data["assets"]["thumbnail"]["type"]: + return True + + def check_links_title_field(self): + if self.asset_type == "COLLECTION" or self.asset_type == "CATALOG": + for link in self.data["links"]: + if "title" not in link and link["rel"] != "self": + return False + return True + + def check_links_self(self): + if self.asset_type == "COLLECTION" or self.asset_type == "CATALOG": + for link in self.data["links"]: + if "self" in link["rel"]: + return True + return False + + def check_item_id_file_name(self): + if self.asset_type == "ITEM" and self.object_id != self.file_name: + return False + else: + return True + + def check_catalog_id_file_name(self): + if self.asset_type == "CATALOG" and self.file_name != 'catalog.json': + return False + elif self.asset_type == "COLLECTION" and self.file_name != 'collection.json': + return False + else: + return True + def create_best_practices_msg(self): best_practices = list() base_string = "STAC Best Practices: " @@ -181,14 +217,19 @@ def create_best_practices_msg(self): best_practices.extend([string_1, string_2, ""]) # best practices - item ids should match file names - if self.asset_type == "ITEM" and self.object_id != self.file_name: + if not self.check_item_id_file_name(): string_1 = f" Item file names should match their ids: '{self.file_name}' not equal to '{self.object_id}" best_practices.extend([string_1, ""]) + # best practices - collection and catalog file names should be collection.json and catalog.json + if not self.check_catalog_id_file_name(): + string_1 = f" Object should be called '{self.asset_type.lower()}.json' not '{self.file_name}.json'" + best_practices.extend([string_1, ""]) + # best practices - collections should contain summaries if self.asset_type == "COLLECTION" and self.summaries == False: string_1 = f" A STAC collection should contain a summaries field" - string_2 = f" https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md" + string_2 = f" It is recommended to store information like eo:bands in summaries" best_practices.extend([string_1, string_2, ""]) # best practices - datetime files should not be set to null @@ -217,4 +258,19 @@ def create_best_practices_msg(self): string_1 = f" You have {len(self.data['properties'])} properties. 
Please consider using links to avoid bloated metadata" best_practices.extend([string_1, ""]) + # best practices - ensure thumbnail is a small file size ["png", "jpeg", "jpg", "webp"] + if not self.check_thumbnail() and self.asset_type == "ITEM": + string_1 = f" A thumbnail should have a small file size ie. png, jpeg, jpg, webp" + best_practices.extend([string_1, ""]) + + # best practices - ensure that links in catalogs and collections include a title field + if not self.check_links_title_field(): + string_1 = f" Links in catalogs and collections should always have a 'title' field" + best_practices.extend([string_1, ""]) + + # best practices - ensure that links in catalogs and collections include self link + if not self.check_links_self(): + string_1 = f" A link to 'self' in links is strongly recommended" + best_practices.extend([string_1, ""]) + return best_practices \ No newline at end of file diff --git a/stac_check/utilities.py b/stac_check/utilities.py deleted file mode 100644 index dcf6c3f..0000000 --- a/stac_check/utilities.py +++ /dev/null @@ -1,82 +0,0 @@ -import json -from urllib.parse import urlparse -from urllib.request import urlopen - -import requests # type: ignore -from pystac.serialization import identify_stac_object # type: ignore - -NEW_VERSIONS = [ - "1.0.0-beta.2", - "1.0.0-rc.1", - "1.0.0-rc.2", - "1.0.0-rc.3", - "1.0.0-rc.4", - "1.0.0", -] - - -def is_url(url: str): - try: - result = urlparse(url) - return all([result.scheme, result.netloc]) - except ValueError: - return False - - -def is_valid_url(url: str) -> bool: - result = urlparse(url) - if result.scheme in ("http", "https"): - return True - else: - return False - - -def get_stac_type(stac_content) -> str: - try: - content_types = ["Item", "Catalog", "Collection"] - if "type" in stac_content and stac_content["type"] == "Feature": - return "Item" - if "type" in stac_content and stac_content["type"] in content_types: - return stac_content["type"] - stac_object = identify_stac_object(stac_content) - return stac_object.object_type - except TypeError as e: - return str(e) - - -def fetch_and_parse_file(input_path) -> dict: - data = None - if is_valid_url(input_path): - resp = requests.get(input_path) - data = resp.json() - else: - with open(input_path) as f: - data = json.load(f) - - return data - - -# validate new versions at schemas.stacspec.org -def set_schema_addr(version, stac_type: str): - if version in NEW_VERSIONS: - return f"https://schemas.stacspec.org/v{version}/{stac_type}-spec/json-schema/{stac_type}.json" - else: - return f"https://cdn.staclint.com/v{version}/{stac_type}.json" - - -def link_request( - link, - initial_message, -): - if is_url(link["href"]): - try: - response = urlopen(link["href"]) - status_code = response.getcode() - if status_code == 200: - initial_message["request_valid"].append(link["href"]) - except Exception: - initial_message["request_invalid"].append(link["href"]) - initial_message["format_valid"].append(link["href"]) - else: - initial_message["request_invalid"].append(link["href"]) - initial_message["format_invalid"].append(link["href"]) \ No newline at end of file diff --git a/stac_check/validate.py b/stac_check/validate.py deleted file mode 100644 index 7ce3c2b..0000000 --- a/stac_check/validate.py +++ /dev/null @@ -1,325 +0,0 @@ -import json -import os -from json.decoder import JSONDecodeError -from typing import List -from urllib.error import HTTPError, URLError - -import click # type: ignore -import jsonschema # type: ignore -from jsonschema import RefResolver -from requests 
import exceptions # type: ignore - -from .utilities import ( - fetch_and_parse_file, - get_stac_type, - link_request, - set_schema_addr, -) - - -class StacValidate: - def __init__( - self, - stac_file: str = None, - recursive: int = -2, - core: bool = False, - links: bool = False, - assets: bool = False, - extensions: bool = False, - custom: str = "", - verbose: bool = False, - no_output: bool = False, - log: str = "", - ): - self.stac_file = stac_file - self.message: list = [] - self.custom = custom - self.links = links - self.assets = assets - self.recursive = recursive - self.extensions = extensions - self.core = core - self.stac_content: dict = {} - self.version = "" - self.depth: int = 0 - self.skip_val = False - self.verbose = verbose - self.no_output = False - self.valid = False - self.log = log - - def create_err_msg(self, err_type: str, err_msg: str) -> dict: - self.valid = False - return { - "version": self.version, - "path": self.stac_file, - "schema": [self.custom], - "valid_stac": False, - "error_type": err_type, - "error_message": err_msg, - } - - def create_links_message(self): - format_valid: List[str] = [] - format_invalid: List[str] = [] - request_valid: List[str] = [] - request_invalid: List[str] = [] - return { - "format_valid": format_valid, - "format_invalid": format_invalid, - "request_valid": request_valid, - "request_invalid": request_invalid, - } - - def create_message(self, stac_type: str, val_type: str) -> dict: - return { - "version": self.version, - "path": self.stac_file, - "schema": [self.custom], - "valid_stac": False, - "asset_type": stac_type.upper(), - "validation_method": val_type, - } - - def assets_validator(self) -> dict: - initial_message = self.create_links_message() - for _, value in self.stac_content["assets"].items(): - link_request(value, initial_message) - return initial_message - - def links_validator(self) -> dict: - initial_message = self.create_links_message() - # get root_url for checking relative links - root_url = "" - for link in self.stac_content["links"]: - if link["rel"] == "self" and link["href"][0:4] == "http": - root_url = ( - link["href"].split("/")[0] + "//" + link["href"].split("/")[2] - ) - elif link["rel"] == "alternate" and link["href"][0:4] == "http": - root_url = ( - link["href"].split("/")[0] + "//" + link["href"].split("/")[2] - ) - for link in self.stac_content["links"]: - if link["href"][0:4] != "http": - link["href"] = root_url + link["href"][1:] - link_request(link, initial_message) - - return initial_message - - def extensions_validator(self, stac_type: str) -> dict: - message = self.create_message(stac_type, "extensions") - message["schema"] = [] - valid = True - if stac_type == "ITEM": - try: - if "stac_extensions" in self.stac_content: - # error with the 'proj' extension not being 'projection' in older stac - if "proj" in self.stac_content["stac_extensions"]: - index = self.stac_content["stac_extensions"].index("proj") - self.stac_content["stac_extensions"][index] = "projection" - schemas = self.stac_content["stac_extensions"] - for extension in schemas: - if "http" not in extension: - # where are the extensions for 1.0.0-beta.2 on cdn.staclint.com? 
- if self.version == "1.0.0-beta.2": - self.stac_content["stac_version"] = "1.0.0-beta.1" - self.version = self.stac_content["stac_version"] - extension = f"https://cdn.staclint.com/v{self.version}/extension/{extension}.json" - self.custom = extension - self.custom_validator() - message["schema"].append(extension) - except jsonschema.exceptions.ValidationError as e: - valid = False - if e.absolute_path: - err_msg = f"{e.message}. Error is in {' -> '.join([str(i) for i in e.absolute_path])}" - else: - err_msg = f"{e.message} of the root of the STAC object" - message = self.create_err_msg("ValidationError", err_msg) - return message - except Exception as e: - valid = False - err_msg = f"{e}. Error in Extensions." - return self.create_err_msg("Exception", err_msg) - else: - self.core_validator(stac_type) - message["schema"] = [self.custom] - self.valid = valid - return message - - def custom_validator(self): - # in case the path to custom json schema is local - # it may contain relative references - schema = fetch_and_parse_file(self.custom) - if os.path.exists(self.custom): - custom_abspath = os.path.abspath(self.custom) - custom_dir = os.path.dirname(custom_abspath).replace("\\", "/") - custom_uri = f"file:///{custom_dir}/" - resolver = RefResolver(custom_uri, self.custom) - jsonschema.validate(self.stac_content, schema, resolver=resolver) - else: - schema = fetch_and_parse_file(self.custom) - jsonschema.validate(self.stac_content, schema) - - def core_validator(self, stac_type: str): - stac_type = stac_type.lower() - self.custom = set_schema_addr(self.version, stac_type.lower()) - self.custom_validator() - - def default_validator(self, stac_type: str) -> dict: - message = self.create_message(stac_type, "default") - message["schema"] = [] - self.core_validator(stac_type) - core_schema = self.custom - message["schema"].append(core_schema) - stac_type = stac_type.upper() - if stac_type == "ITEM": - message = self.extensions_validator(stac_type) - message["validation_method"] = "default" - message["schema"].append(core_schema) - if self.links: - message["links_validated"] = self.links_validator() - if self.assets: - message["assets_validated"] = self.assets_validator() - return message - - def recursive_validator(self, stac_type: str): - if self.skip_val is False: - self.custom = set_schema_addr(self.version, stac_type.lower()) - message = self.create_message(stac_type, "recursive") - message["valid_stac"] = False - try: - _ = self.default_validator(stac_type) - - except jsonschema.exceptions.ValidationError as e: - if e.absolute_path: - err_msg = f"{e.message}. 
Error is in {' -> '.join([str(i) for i in e.absolute_path])}" - else: - err_msg = f"{e.message} of the root of the STAC object" - message.update(self.create_err_msg("ValidationError", err_msg)) - self.message.append(message) - return - message["valid_stac"] = True - self.message.append(message) - self.depth = self.depth + 1 - if self.recursive > -1: - if self.depth >= int(self.recursive): - self.skip_val = True - base_url = self.stac_file - for link in self.stac_content["links"]: - if link["rel"] == "child" or link["rel"] == "item": - address = link["href"] - if "http" not in address: - x = str(base_url).split("/") - x.pop(-1) - st = x[0] - for i in range(len(x)): - if i > 0: - st = st + "/" + x[i] - self.stac_file = st + "/" + address - else: - self.stac_file = address - self.stac_content = fetch_and_parse_file(self.stac_file) - self.stac_content["stac_version"] = self.version - stac_type = get_stac_type(self.stac_content).lower() - - if link["rel"] == "child": - - if self.verbose is True: - click.echo(json.dumps(message, indent=4)) - self.recursive_validator(stac_type) - - if link["rel"] == "item": - self.custom = set_schema_addr(self.version, stac_type.lower()) - message = self.create_message(stac_type, "recursive") - if self.version == "0.7.0": - schema = fetch_and_parse_file(self.custom) - # this next line prevents this: unknown url type: 'geojson.json' ?? - schema["allOf"] = [{}] - jsonschema.validate(self.stac_content, schema) - else: - msg = self.default_validator(stac_type) - message["schema"] = msg["schema"] - message["valid_stac"] = True - - if self.log != "": - self.message.append(message) - if self.recursive < 5: - self.message.append(message) - if self.verbose is True: - click.echo(json.dumps(message, indent=4)) - - def validate_dict(cls, stac_content): - cls.stac_content = stac_content - return cls.run() - - def run(cls): - message = {} - try: - if cls.stac_file is not None: - cls.stac_content = fetch_and_parse_file(cls.stac_file) - stac_type = get_stac_type(cls.stac_content).upper() - cls.version = cls.stac_content["stac_version"] - - if cls.core is True: - message = cls.create_message(stac_type, "core") - cls.core_validator(stac_type) - message["schema"] = [cls.custom] - cls.valid = True - elif cls.custom != "": - message = cls.create_message(stac_type, "custom") - message["schema"] = [cls.custom] - cls.custom_validator() - cls.valid = True - elif cls.recursive > -2: - cls.recursive_validator(stac_type) - cls.valid = True - elif cls.extensions is True: - message = cls.extensions_validator(stac_type) - else: - cls.valid = True - message = cls.default_validator(stac_type) - - except ValueError as e: - message.update(cls.create_err_msg("ValueError", str(e))) - except URLError as e: - message.update(cls.create_err_msg("URLError", str(e))) - except JSONDecodeError as e: - message.update(cls.create_err_msg("JSONDecodeError", str(e))) - except TypeError as e: - message.update(cls.create_err_msg("TypeError", str(e))) - except FileNotFoundError as e: - message.update(cls.create_err_msg("FileNotFoundError", str(e))) - except ConnectionError as e: - message.update(cls.create_err_msg("ConnectionError", str(e))) - except exceptions.SSLError as e: - message.update(cls.create_err_msg("SSLError", str(e))) - except OSError as e: - message.update(cls.create_err_msg("OSError", str(e))) - except jsonschema.exceptions.ValidationError as e: - if e.absolute_path: - err_msg = f"{e.message}. 
Error is in {' -> '.join([str(i) for i in e.absolute_path])}" - else: - err_msg = f"{e.message} of the root of the STAC object" - message.update(cls.create_err_msg("ValidationError", err_msg)) - except KeyError as e: - message.update(cls.create_err_msg("KeyError", str(e))) - except HTTPError as e: - message.update(cls.create_err_msg("HTTPError", str(e))) - except Exception as e: - message.update(cls.create_err_msg("Exception", str(e))) - - message["valid_stac"] = cls.valid - - if cls.recursive < -1: - cls.message.append(message) - - if cls.log != "": - f = open(cls.log, "w") - f.write(json.dumps(cls.message, indent=4)) - f.close() - - if cls.valid: - return True - else: - return False \ No newline at end of file diff --git a/tests/test_lint.py b/tests/test_lint.py index 2414772..434a06a 100644 --- a/tests/test_lint.py +++ b/tests/test_lint.py @@ -1,3 +1,4 @@ +from re import L from stac_check.lint import Linter import pytest @@ -79,7 +80,7 @@ def test_linter_collection(): assert linter.summaries == True def test_linter_collection_no_summaries(): - file = "sample_files/1.0.0/collection_no_summaries.json" + file = "sample_files/1.0.0/collection-no-summaries.json" linter = Linter(file, assets=False, links=False) assert linter.version == "1.0.0" assert linter.valid_stac == True @@ -87,8 +88,10 @@ def test_linter_collection_no_summaries(): assert linter.summaries == False assert linter.best_practices_msg == [ "STAC Best Practices: ", + " Object should be called 'collection.json' not 'collection-no-summaries.json'", + "", " A STAC collection should contain a summaries field", - " https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md", + " It is recommended to store information like eo:bands in summaries", "" ] @@ -113,6 +116,12 @@ def test_linter_item_id_not_matching_file_name(): assert linter.file_name == "core-item" assert linter.object_id == "20201211_223832_CS2" assert linter.file_name != linter.object_id + assert linter.check_item_id_file_name() == False + +def test_linter_collection_catalog_id(): + file = "sample_files/1.0.0/collection-no-title.json" + linter = Linter(file) + assert linter.check_catalog_id_file_name() == False def test_linter_item_id_format_best_practices(): file = "sample_files/1.0.0/core-item-invalid-id.json" @@ -140,4 +149,27 @@ def test_bloated_item(): assert linter.bloated_links == True assert len(linter.data["links"]) > 20 + +def test_small_thumbnail(): + file = "sample_files/1.0.0/core-item-large-thumbnail.json" + linter = Linter(file) + + assert linter.check_thumbnail() != True + + file = "sample_files/1.0.0/core-item.json" + linter = Linter(file) + + assert linter.check_thumbnail() == True + +def test_title_field(): + file = "sample_files/1.0.0/collection-no-title.json" + linter = Linter(file) + + assert linter.check_links_title_field() == False + +def test_self_in_links(): + file = "sample_files/1.0.0/collection-no-title.json" + linter = Linter(file) + + assert linter.check_links_self() == False