feat: Generate and move Metadata (TypedDict) to datasets._typing
dangotbanned committed Nov 8, 2024
1 parent 2051410 commit 0ea4e21
Showing 4 changed files with 101 additions and 18 deletions.
47 changes: 46 additions & 1 deletion tools/datasets/__init__.py
@@ -140,8 +140,12 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None


def generate_datasets_typing(application: Application, output: Path, /) -> None:
from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT

app = application
tags = app.scan("gh_tags").select("tag").collect().to_series()
metadata_schema = app.scan("gh_trees").collect_schema().to_python()

DATASET_NAME = "dataset_name"
names = (
app.scan("gh_trees")
@@ -152,20 +156,61 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None:
.collect()
.to_series()
)
indent = " " * 4
NAME = "DatasetName"
TAG = "VersionTag"
EXT = "Extension"
METADATA_TD = "Metadata"
DESCRIPTION_DEFAULT = "_description_"
NOTE_SEP = f"\n\n{indent * 2}.. note::\n{indent * 3}"

name_collision = (
f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}"
"Requires specifying a preference in calls to ``data(ext=...)``."
)
sha = (
f"Unique hash for the dataset.{NOTE_SEP}"
f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``,\n\n{indent * 3}"
f"then all ``tag``(s) in this range would **share** this value."
)
descriptions: dict[str, str] = {
"dataset_name": "Equivalent to ``pathlib.Path.stem``.",
"ext_supported": "Dataset can be read as tabular data.",
"file_name": "Equivalent to ``pathlib.Path.name``.",
"name_collision": name_collision,
"sha": sha,
"size": "File size (*bytes*).",
"suffix": f"File extension.{NOTE_SEP}Equivalent to ``pathlib.Path.suffix``.",
"tag": "``vega-datasets`` release version.",
"url_npm": "Remote URL used to access the dataset.",
}
metadata_doc = f"\n{indent}".join(
f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}"
for param in metadata_schema
)

contents = (
f"{HEADER_COMMENT}",
"from __future__ import annotations\n",
"import sys",
"from typing import Literal, TYPE_CHECKING",
utils.import_typing_extensions((3, 14), "TypedDict"),
utils.import_typing_extensions((3, 10), "TypeAlias"),
"\n",
f"__all__ = {[NAME, TAG, EXT]}\n\n"
f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n"
f"{NAME}: TypeAlias = {utils.spell_literal(names)}",
f"{TAG}: TypeAlias = {utils.spell_literal(tags)}",
f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}',
UNIVERSAL_TYPED_DICT.format(
name=METADATA_TD,
metaclass_kwds=", total=False",
td_args=f"\n{indent}".join(
f"{param}: {tp.__name__}" for param, tp in metadata_schema.items()
),
summary="Full schema for ``metadata.parquet``.",
doc=metadata_doc,
comment="",
),
)
ruff.write_lint_format(output, contents)
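
Note: the two ``utils`` helpers used above come from ``tools.schemapi.utils`` and are not part of this diff. Judging from the blocks they emit into the generated ``_typing.py`` (shown below), they plausibly behave like this minimal sketch — assumed shapes, not the actual implementations:

from collections.abc import Iterable


def spell_literal(it: Iterable[str], /) -> str:
    # Render an iterable of strings as a ``Literal[...]`` annotation,
    # e.g. [".csv", ".json"] -> 'Literal[".csv", ".json"]'.
    members = ", ".join(f'"{s}"' for s in it)
    return f"Literal[{members}]"


def import_typing_extensions(version: tuple[int, int], name: str, /) -> str:
    # Emit a version-gated import that falls back to ``typing_extensions``
    # on older interpreters, matching the ``if sys.version_info >= ...``
    # blocks in the generated file.
    major, minor = version
    return (
        f"if sys.version_info >= ({major}, {minor}):\n"
        f"    from typing import {name}\n"
        f"else:\n"
        f"    from typing_extensions import {name}\n"
    )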

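``UNIVERSAL_TYPED_DICT`` itself lives in ``tools/generate_schema_wrapper`` and is likewise not shown here. Reconstructed from the ``Metadata`` class it renders into ``_typing.py`` below, the template is presumably shaped roughly like:

# Assumed shape only -- inferred from the generated output, not copied
# from tools/generate_schema_wrapper.
UNIVERSAL_TYPED_DICT = '''
class {name}(TypedDict{metaclass_kwds}):{comment}
    """
    {summary}

    Parameters
    ----------
    {doc}
    """

    {td_args}
'''

A hypothetical invocation, with the output path matching the generated file in this diff, would then be ``generate_datasets_typing(app, Path("tools/datasets/_typing.py"))``.
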
3 changes: 1 addition & 2 deletions tools/datasets/_io.py
@@ -56,8 +56,7 @@
else:
from typing_extensions import TypeAlias

- from tools.datasets._typing import DatasetName, Extension, VersionTag
- from tools.datasets.models import Metadata
+ from tools.datasets._typing import DatasetName, Extension, Metadata, VersionTag
from tools.schemapi.utils import OneOrSeq

_ExtensionScan: TypeAlias = Literal[".parquet"]
55 changes: 54 additions & 1 deletion tools/datasets/_typing.py
@@ -6,13 +6,18 @@
import sys
from typing import Literal

if sys.version_info >= (3, 14):
from typing import TypedDict
else:
from typing_extensions import TypedDict

if sys.version_info >= (3, 10):
from typing import TypeAlias
else:
from typing_extensions import TypeAlias


__all__ = ["DatasetName", "Extension", "VersionTag"]
__all__ = ["DatasetName", "Extension", "Metadata", "VersionTag"]

DatasetName: TypeAlias = Literal[
"airports",
@@ -135,3 +140,51 @@
"v1.5.0",
]
Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"]


class Metadata(TypedDict, total=False):
    """
    Full schema for ``metadata.parquet``.

    Parameters
    ----------
    dataset_name
        Equivalent to ``pathlib.Path.stem``.
    ext_supported
        Dataset can be read as tabular data.
    file_name
        Equivalent to ``pathlib.Path.name``.
    name_collision
        Dataset is available via multiple ``suffix``(s).

        .. note::
            Requires specifying a preference in calls to ``data(ext=...)``.
    sha
        Unique hash for the dataset.

        .. note::
            If the dataset did *not* change between ``v1.0.0``-``v2.0.0``,

            then all ``tag``(s) in this range would **share** this value.
    size
        File size (*bytes*).
    suffix
        File extension.

        .. note::
            Equivalent to ``pathlib.Path.suffix``.
    tag
        ``vega-datasets`` release version.
    url_npm
        Remote URL used to access the dataset.
    """

    dataset_name: str
    ext_supported: bool
    file_name: str
    name_collision: bool
    sha: str
    size: int
    suffix: str
    tag: str
    url_npm: str
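
Because the class is declared with ``total=False``, every key is optional and partially-populated rows type-check. A hypothetical usage (field values invented for illustration, though ``"airports"`` and ``"v1.5.0"`` do appear in the aliases above):

row: Metadata = {
    "dataset_name": "airports",
    "file_name": "airports.csv",
    "suffix": ".csv",
    "tag": "v1.5.0",
}
# ``total=False`` lets the remaining fields be filled in incrementally
# without tripping the type checker:
row["ext_supported"] = True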
14 changes: 0 additions & 14 deletions tools/datasets/models.py
@@ -127,20 +127,6 @@ class ParsedTreesResponse(TypedDict):
tree: list[ParsedTree]


- class Metadata(TypedDict, total=False):
-     """Full schema for `metadata.parquet`."""
-
-     dataset_name: str
-     ext_supported: bool
-     file_name: str
-     name_collision: bool
-     sha: str
-     size: int
-     suffix: str
-     tag: str
-     url_npm: str


class GitHubRateLimit(TypedDict):
limit: int
used: int
