Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

serializers: add datapackage serializer #1742

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions invenio_rdm_records/resources/serializers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from .csl import CSLJSONSerializer, StringCitationSerializer
from .csv import CSVRecordSerializer
from .datacite import DataCite43JSONSerializer, DataCite43XMLSerializer
from .datapackage import DataPackageSerializer
from .dcat import DCATSerializer
from .dublincore import DublinCoreJSONSerializer, DublinCoreXMLSerializer
from .geojson import GeoJSONSerializer
Expand All @@ -43,6 +44,7 @@
"CSVRecordSerializer",
"DataCite43JSONSerializer",
"DataCite43XMLSerializer",
"DataPackageSerializer",
"DublinCoreJSONSerializer",
"DublinCoreXMLSerializer",
"FAIRSignpostingProfileLvl2Serializer",
Expand Down
26 changes: 26 additions & 0 deletions invenio_rdm_records/resources/serializers/datapackage/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 Open Knowledge Foundation
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Data Package Serializers for Invenio RDM Records."""

from flask_resources import BaseListSchema, MarshmallowSerializer
from flask_resources.serializers import JSONSerializer

from .schema import DataPackageSchema


class DataPackageSerializer(MarshmallowSerializer):
    """Serialize records as Data Package descriptors using Marshmallow."""

    def __init__(self, **options):
        """Wire the JSON formatter to the Data Package object/list schemas.

        :param options: extra keyword arguments forwarded unchanged to
            ``MarshmallowSerializer``.
        """
        # Fixed wiring for this serializer; caller options are passed through.
        wiring = {
            "format_serializer_cls": JSONSerializer,
            "object_schema_cls": DataPackageSchema,
            "list_schema_cls": BaseListSchema,
        }
        super().__init__(**wiring, **options)
84 changes: 84 additions & 0 deletions invenio_rdm_records/resources/serializers/datapackage/schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 Open Knowledge Foundation
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Data Package based Schema for Invenio RDM Records."""

from urllib.parse import quote

from marshmallow import Schema, fields, missing

PROFILE_URL = "https://datapackage.org/profiles/2.0/datapackage.json"


class DataPackageSchema(Schema):
    """Schema for Data Package in JSON.

    Dumps a record dictionary into a Data Package descriptor. Simple
    properties are read straight from the record through ``attribute`` paths;
    list-valued properties are built by the ``get_*`` methods below.
    Optional list properties are omitted from the output (``missing``) when
    they would be empty, while ``resources`` is always emitted.
    """

    profile = fields.Constant(PROFILE_URL, data_key="$schema")

    id = fields.Str(attribute="links.doi")
    name = fields.Str(attribute="id")
    title = fields.Str(attribute="metadata.title")
    description = fields.Str(attribute="metadata.description")
    version = fields.Str(attribute="metadata.version")
    created = fields.Str(attribute="created")
    homepage = fields.Str(attribute="links.self_html")
    keywords = fields.Method("get_keywords")
    resources = fields.Method("get_resources")
    licenses = fields.Method("get_licenses")
    contributors = fields.Method("get_contributors")

    @staticmethod
    def _drop_none(mapping):
        """Return a copy of ``mapping`` without entries whose value is None."""
        return {key: value for key, value in mapping.items() if value is not None}

    def get_keywords(self, obj):
        """Get the keywords from the record subjects.

        :param obj: record dictionary being dumped.
        :returns: list of non-empty ``subject`` strings found under
            ``metadata.subjects``, or ``missing`` when there are none.
        """
        # ``or {}`` / ``or []`` also cover keys that are present but None.
        metadata = obj.get("metadata") or {}
        keywords = [
            subject["subject"]
            for subject in metadata.get("subjects") or []
            if subject.get("subject")
        ]
        return keywords or missing

    def get_resources(self, obj):
        """Get one resource per file entry of the record.

        :param obj: record dictionary being dumped.
        :returns: list of resource dictionaries (possibly empty). No resource
            is produced when the record has no ``links.self_html``, because
            the resource ``path`` is built from it.
        """
        resources = []
        basepath = (obj.get("links") or {}).get("self_html")
        if not basepath:
            return resources

        entries = (obj.get("files") or {}).get("entries") or {}
        for entry in entries.values():
            key = entry.get("key")
            if not key:
                # A resource needs both a name and a path; both come from key.
                continue
            resources.append(
                self._drop_none(
                    {
                        "name": key,
                        # Percent-encode the key so names with spaces or
                        # reserved characters still give a valid URL ("/" is
                        # kept as a path separator).
                        "path": f"{basepath}/files/{quote(str(key))}",
                        "format": entry.get("ext"),
                        "mimetype": entry.get("mimetype"),
                        "bytes": entry.get("size"),
                        "hash": entry.get("checksum"),
                    }
                )
            )
        return resources

    def get_licenses(self, obj):
        """Get the licenses from the record rights.

        :param obj: record dictionary being dumped.
        :returns: list of license dictionaries, or ``missing`` when no right
            carries an ``id``.
        """
        licenses = []
        for right in (obj.get("metadata") or {}).get("rights") or []:
            name = right.get("id")
            if not name:
                # NOTE(review): rights without an ``id`` are skipped. The Data
                # Package spec appears to also accept licenses identified only
                # by ``path`` - confirm whether those should be emitted too.
                continue
            licenses.append(
                self._drop_none(
                    {
                        "name": name,
                        "path": right.get("link")
                        or (right.get("props") or {}).get("url"),
                        # Only the English title is exported.
                        "title": (right.get("title") or {}).get("en"),
                    }
                )
            )
        return licenses or missing

    def get_contributors(self, obj):
        """Get the contributors from the record creators and contributors.

        :param obj: record dictionary being dumped.
        :returns: list of contributor dictionaries (creators first), or
            ``missing`` when there are none.
        """
        metadata = obj.get("metadata") or {}
        contributors = []
        # The singular list name doubles as the fallback role.
        for default_role in ("creator", "contributor"):
            for item in metadata.get(f"{default_role}s") or []:
                entity = item.get("person_or_org") or {}
                # Only the first affiliation is exported.
                affiliation = (item.get("affiliations") or [{}])[0]
                role = (item.get("role") or {}).get("id") or default_role
                contributor = self._drop_none(
                    {
                        "title": entity.get("name"),
                        "givenName": entity.get("given_name"),
                        "familyName": entity.get("family_name"),
                        "roles": [role],
                        "organization": affiliation.get("name"),
                    }
                )
                # ``roles`` is always set, so require at least one other
                # field; an entry carrying only a role identifies nobody.
                if len(contributor) > 1:
                    contributors.append(contributor)
        return contributors or missing
108 changes: 108 additions & 0 deletions tests/resources/serializers/test_datapackage_serializer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 Open Knowledge Foundation
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Resources serializers tests."""

from invenio_rdm_records.resources.serializers.datapackage import DataPackageSerializer


def test_data_package_serializer_empty_record():
    """An empty record dumps to the profile URL and an empty resource list."""
    expected = {
        "$schema": "https://datapackage.org/profiles/2.0/datapackage.json",
        "resources": [],
    }
    dumped = DataPackageSerializer().dump_obj({})
    assert dumped == expected


def test_data_package_serializer_minimal_record(minimal_record_to_dict):
    """A minimal record dumps its identifiers, title, dates and creators."""
    expected_contributors = [
        {
            "familyName": "Brown",
            "givenName": "Troy",
            "roles": ["creator"],
        },
        {
            "roles": ["creator"],
            "title": "Troy Inc.",
        },
    ]
    expected = {
        "$schema": "https://datapackage.org/profiles/2.0/datapackage.json",
        "id": "https://handle.stage.datacite.org/10.1234/67890-fghij",
        "name": "67890-fghij",
        "title": "A Romans story",
        "created": "2023-11-14T19:33:09.837080+00:00",
        "homepage": "https://127.0.0.1:5000/records/67890-fghij",
        "resources": [],
        "contributors": expected_contributors,
    }

    dumped = DataPackageSerializer().dump_obj(minimal_record_to_dict)

    assert dumped == expected


def test_data_package_serializer_full_record(full_record_to_dict):
    """A full record dumps every supported Data Package property."""
    expected_resources = [
        {
            "name": "test.txt",
            "path": "https://127.0.0.1:5000/records/12345-abcde/files/test.txt",
            "format": "txt",
            "mimetype": "text/plain",
            "bytes": 9,
            "hash": "md5:e795abeef2c38de2b064be9f6364ceae",
        },
    ]
    expected_licenses = [
        {
            "name": "cc-by-4.0",
            "path": "https://creativecommons.org/licenses/by/4.0/legalcode",
            "title": "Creative Commons Attribution 4.0 International",
        },
    ]
    # Creators come first, followed by the contributors.
    expected_contributors = [
        {
            "familyName": "Nielsen",
            "givenName": "Lars Holm",
            "organization": "CERN",
            "roles": ["creator"],
            "title": "Nielsen, Lars Holm",
        },
        {
            "familyName": "Tom",
            "givenName": "Blabin",
            "roles": ["creator"],
            "title": "Tom, Blabin",
        },
        {
            "familyName": "Nielsen",
            "givenName": "Lars Holm",
            "organization": "CERN",
            "roles": ["other"],
            "title": "Nielsen, Lars Holm",
        },
        {
            "familyName": "Dirk",
            "givenName": "Dirkin",
            "roles": ["other"],
            "title": "Dirk, Dirkin",
        },
    ]
    expected = {
        "$schema": "https://datapackage.org/profiles/2.0/datapackage.json",
        "id": "https://handle.stage.datacite.org/10.1234/inveniordm.1234",
        "name": "12345-abcde",
        "title": "InvenioRDM",
        "description": "<h1>A description</h1> <p>with HTML tags</p>",
        "version": "v1.0",
        "created": "2023-11-14T18:30:55.738898+00:00",
        "homepage": "https://127.0.0.1:5000/records/12345-abcde",
        "keywords": [
            "Abdominal Injuries",
            "custom",
        ],
        "resources": expected_resources,
        "licenses": expected_licenses,
        "contributors": expected_contributors,
    }

    dumped = DataPackageSerializer().dump_obj(full_record_to_dict)

    assert dumped == expected