feat: Generate and move Metadata (TypedDict) to datasets._typing
dangotbanned committed Nov 8, 2024
1 parent 2051410 commit 0ea4e21
Showing 4 changed files with 101 additions and 18 deletions.
47 changes: 46 additions & 1 deletion tools/datasets/__init__.py
@@ -140,8 +140,12 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None


def generate_datasets_typing(application: Application, output: Path, /) -> None:
from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT

app = application
tags = app.scan("gh_tags").select("tag").collect().to_series()
metadata_schema = app.scan("gh_trees").collect_schema().to_python()

DATASET_NAME = "dataset_name"
names = (
app.scan("gh_trees")
@@ -152,20 +156,61 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None:
.collect()
.to_series()
)
indent = " " * 4
NAME = "DatasetName"
TAG = "VersionTag"
EXT = "Extension"
METADATA_TD = "Metadata"
DESCRIPTION_DEFAULT = "_description_"
NOTE_SEP = f"\n\n{indent * 2}.. note::\n{indent * 3}"

name_collision = (
f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}"
"Requires specifying a preference in calls to ``data(ext=...)``."
)
sha = (
f"Unique hash for the dataset.{NOTE_SEP}"
f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``,\n\n{indent * 3}"
f"then all ``tag``(s) in this range would **share** this value."
)
descriptions: dict[str, str] = {
"dataset_name": "Equivalent to ``pathlib.Path.stem``.",
"ext_supported": "Dataset can be read as tabular data.",
"file_name": "Equivalent to ``pathlib.Path.name``.",
"name_collision": name_collision,
"sha": sha,
"size": "File size (*bytes*).",
"suffix": f"File extension.{NOTE_SEP}Equivalent to ``pathlib.Path.suffix``.",
"tag": "``vega-datasets`` release version.",
"url_npm": "Remote URL used to access the dataset.",
}
metadata_doc = f"\n{indent}".join(
f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}"
for param in metadata_schema
)

contents = (
f"{HEADER_COMMENT}",
"from __future__ import annotations\n",
"import sys",
"from typing import Literal, TYPE_CHECKING",
utils.import_typing_extensions((3, 14), "TypedDict"),
utils.import_typing_extensions((3, 10), "TypeAlias"),
"\n",
f"__all__ = {[NAME, TAG, EXT]}\n\n"
f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n"
f"{NAME}: TypeAlias = {utils.spell_literal(names)}",
f"{TAG}: TypeAlias = {utils.spell_literal(tags)}",
f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}',
UNIVERSAL_TYPED_DICT.format(
name=METADATA_TD,
metaclass_kwds=", total=False",
td_args=f"\n{indent}".join(
f"{param}: {tp.__name__}" for param, tp in metadata_schema.items()
),
summary="Full schema for ``metadata.parquet``.",
doc=metadata_doc,
comment="",
),
)
ruff.write_lint_format(output, contents)
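
Note: the two ``utils`` helpers used above come from ``tools.schemapi.utils`` and are not part of this diff. Judging from the blocks they emit into the generated ``_typing.py`` (shown below), they plausibly behave like this minimal sketch — assumed shapes, not the actual implementations:

from collections.abc import Iterable


def spell_literal(it: Iterable[str], /) -> str:
    # Render an iterable of strings as a ``Literal[...]`` annotation,
    # e.g. [".csv", ".json"] -> 'Literal[".csv", ".json"]'.
    members = ", ".join(f'"{s}"' for s in it)
    return f"Literal[{members}]"


def import_typing_extensions(version: tuple[int, int], name: str, /) -> str:
    # Emit a version-gated import that falls back to ``typing_extensions``
    # on older interpreters, matching the ``if sys.version_info >= ...``
    # blocks in the generated file.
    major, minor = version
    return (
        f"if sys.version_info >= ({major}, {minor}):\n"
        f"    from typing import {name}\n"
        f"else:\n"
        f"    from typing_extensions import {name}\n"
    )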

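``UNIVERSAL_TYPED_DICT`` itself lives in ``tools/generate_schema_wrapper`` and is likewise not shown here. Reconstructed from the ``Metadata`` class it renders into ``_typing.py`` below, the template is presumably shaped roughly like:

# Assumed shape only -- inferred from the generated output, not copied
# from tools/generate_schema_wrapper.
UNIVERSAL_TYPED_DICT = '''
class {name}(TypedDict{metaclass_kwds}):{comment}
    """
    {summary}

    Parameters
    ----------
    {doc}
    """

    {td_args}
'''

A hypothetical invocation, with the output path matching the generated file in this diff, would then be ``generate_datasets_typing(app, Path("tools/datasets/_typing.py"))``.
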
3 changes: 1 addition & 2 deletions tools/datasets/_io.py
@@ -56,8 +56,7 @@
else:
from typing_extensions import TypeAlias

- from tools.datasets._typing import DatasetName, Extension, VersionTag
- from tools.datasets.models import Metadata
+ from tools.datasets._typing import DatasetName, Extension, Metadata, VersionTag
from tools.schemapi.utils import OneOrSeq

_ExtensionScan: TypeAlias = Literal[".parquet"]
55 changes: 54 additions & 1 deletion tools/datasets/_typing.py
@@ -6,13 +6,18 @@
import sys
from typing import Literal

if sys.version_info >= (3, 14):
from typing import TypedDict
else:
from typing_extensions import TypedDict

if sys.version_info >= (3, 10):
from typing import TypeAlias
else:
from typing_extensions import TypeAlias


__all__ = ["DatasetName", "Extension", "VersionTag"]
__all__ = ["DatasetName", "Extension", "Metadata", "VersionTag"]

DatasetName: TypeAlias = Literal[
"airports",
@@ -135,3 +140,51 @@
"v1.5.0",
]
Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"]


class Metadata(TypedDict, total=False):
    """
    Full schema for ``metadata.parquet``.

    Parameters
    ----------
    dataset_name
        Equivalent to ``pathlib.Path.stem``.
    ext_supported
        Dataset can be read as tabular data.
    file_name
        Equivalent to ``pathlib.Path.name``.
    name_collision
        Dataset is available via multiple ``suffix``(s).

        .. note::
            Requires specifying a preference in calls to ``data(ext=...)``.
    sha
        Unique hash for the dataset.

        .. note::
            If the dataset did *not* change between ``v1.0.0``-``v2.0.0``,

            then all ``tag``(s) in this range would **share** this value.
    size
        File size (*bytes*).
    suffix
        File extension.

        .. note::
            Equivalent to ``pathlib.Path.suffix``.
    tag
        ``vega-datasets`` release version.
    url_npm
        Remote URL used to access the dataset.
    """

    dataset_name: str
    ext_supported: bool
    file_name: str
    name_collision: bool
    sha: str
    size: int
    suffix: str
    tag: str
    url_npm: str
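
Because the class is declared with ``total=False``, every key is optional and partially-populated rows type-check. A hypothetical usage (field values invented for illustration, though ``"airports"`` and ``"v1.5.0"`` do appear in the aliases above):

row: Metadata = {
    "dataset_name": "airports",
    "file_name": "airports.csv",
    "suffix": ".csv",
    "tag": "v1.5.0",
}
# ``total=False`` lets the remaining fields be filled in incrementally
# without tripping the type checker:
row["ext_supported"] = True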
14 changes: 0 additions & 14 deletions tools/datasets/models.py
@@ -127,20 +127,6 @@ class ParsedTreesResponse(TypedDict):
tree: list[ParsedTree]


- class Metadata(TypedDict, total=False):
-     """Full schema for `metadata.parquet`."""
-
-     dataset_name: str
-     ext_supported: bool
-     file_name: str
-     name_collision: bool
-     sha: str
-     size: int
-     suffix: str
-     tag: str
-     url_npm: str


class GitHubRateLimit(TypedDict):
limit: int
used: int
