Skip to content

Commit

Permalink
services: implement file fetching from external storage
Browse files Browse the repository at this point in the history
  • Loading branch information
Pablo Panero committed Nov 2, 2022
1 parent 1f5c3ba commit 1787e47
Show file tree
Hide file tree
Showing 15 changed files with 776 additions and 81 deletions.
8 changes: 7 additions & 1 deletion invenio_records_resources/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2020 CERN.
# Copyright (C) 2020-2022 CERN.
# Copyright (C) 2020 Northwestern University.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
Expand All @@ -12,3 +12,9 @@
SITE_UI_URL = "https://127.0.0.1:5000"

SITE_API_URL = "https://127.0.0.1:5000/api"

RECORDS_RESOURCES_FILES_ALLOWED_DOMAINS = []
"""Explicitly allowed domains for external file fetching.
Only file URLs from these domains will be allowed to be fetched.
"""
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
}
}
},

"identifier": {
"description": "An identifier.",
"type": "string"
Expand All @@ -29,14 +30,53 @@
}
}
},
"file": {
"type": "object",
"additionalProperties": false,
"description": "A file object.",
"properties": {
"version_id": {
"description": "Object version ID.",
"type": "string"
},
"bucket_id": {
"description": "Object verison bucket ID.",
"type": "string"
},
"mimetype": {
"description": "File MIMEType.",
"type": "string"
},
"uri": {
"description": "File URI.",
"type": "string"
},
"storage_class": {
"description": "File storage class.",
"type": "string"
},
"checksum": {
"description": "Checksum of the file.",
"type": "string"
},
"size": {
"description": "Size of the file in bytes.",
"type": "number"
},
"key": {
"description": "Key (filename) of the file.",
"type": "string"
},
"file_id": {
"$ref": "local://definitions-v1.0.0.json#/identifier"
}
}
},
"internal-pid": {
"type": "object",
"description": "An internal persistent identifier object.",
"additionalProperties": false,
"required": [
"pk",
"status"
],
"required": ["pk", "status"],
"properties": {
"pk": {
"description": "Primary key of the PID object.",
Expand All @@ -45,13 +85,7 @@
"status": {
"description": "The status of the PID (from Invenio-PIDStore).",
"type": "string",
"enum": [
"N",
"K",
"R",
"M",
"D"
]
"enum": ["N", "K", "R", "M", "D"]
},
"pid_type": {
"description": "The type of the persistent identifier.",
Expand Down
38 changes: 29 additions & 9 deletions invenio_records_resources/records/systemfields/files/manager.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2020-2021 CERN.
# Copyright (C) 2020-2022 CERN.
# Copyright (C) 2020-2021 Northwestern University.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
Expand Down Expand Up @@ -60,8 +60,7 @@
from functools import wraps

from invenio_files_rest.errors import InvalidKeyError, InvalidOperationError
from invenio_files_rest.models import Bucket, ObjectVersion
from invenio_records.systemfields import SystemField
from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion


def ensure_enabled(func):
Expand Down Expand Up @@ -145,7 +144,7 @@ def unlock(self):

# TODO: "create" and "update" should be merged somehow...
@ensure_enabled
def create(self, key, obj=None, stream=None, data=None):
def create(self, key, obj=None, stream=None, data=None, **kwargs):
"""Create/initialize a file."""
assert not (obj and stream)

Expand All @@ -154,8 +153,12 @@ def create(self, key, obj=None, stream=None, data=None):

rf = self.file_cls.create({}, key=key, record_id=self.record.id)
if stream:
obj = ObjectVersion.create(self.bucket, key, stream=stream)
obj = ObjectVersion.create(self.bucket, key, stream=stream, **kwargs)
if obj:
if isinstance(obj, dict):
fi = FileInstance.create()
fi.set_uri(**obj.pop("file"))
obj = ObjectVersion.create(self.bucket, key, fi.id)
rf.object_version_id = obj.version_id
rf.object_version = obj
if data:
Expand All @@ -165,15 +168,23 @@ def create(self, key, obj=None, stream=None, data=None):
return rf

@ensure_enabled
def update(self, key, obj=None, stream=None, data=None):
"""Update a file."""
assert not (obj and stream)
def create_obj(self, key, stream, data=None, **kwargs):
"""Create an ObjectVersion but do not pop it to the top of the stack."""
# this is used on set_file_content, since the file is not yet commited
rf = self.get(key)
if rf is None:
raise InvalidKeyError(description=f"File with {key} does not exist.")

return ObjectVersion.create(self.bucket, key, stream=stream, **kwargs)

@ensure_enabled
def update(self, key, obj=None, stream=None, data=None, **kwargs):
"""Update a file."""
assert not (obj and stream)
rf = self.get(key)

if stream:
obj = ObjectVersion.create(self.bucket, key, stream=stream)
obj = self.create_obj(key, stream=stream, **kwargs)
if obj:
rf.object_version_id = obj.version_id
rf.object_version = obj
Expand All @@ -182,6 +193,15 @@ def update(self, key, obj=None, stream=None, data=None):
rf.commit()
return rf

@ensure_enabled
def commit(self, file_key):
"""Commit a file."""
# TODO: Add other checks here (e.g. verify checksum, S3 upload)
file_obj = ObjectVersion.get(self.bucket.id, file_key)
if not file_obj:
raise Exception(f"File with key {file_key} not uploaded yet.")
self[file_key] = file_obj

@ensure_enabled
def delete(self, key, remove_obj=True, softdelete_obj=False):
"""Delete a file."""
Expand Down
4 changes: 4 additions & 0 deletions invenio_records_resources/services/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,7 @@ class QuerystringValidationError(ValidationError):
"""Error thrown when there is an issue with the querystring."""

pass


class TransferException(Exception):
"""File transfer exception."""
32 changes: 6 additions & 26 deletions invenio_records_resources/services/files/components/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@

"""Files service components."""

from invenio_db import db
from invenio_files_rest.errors import FileSizeError
from invenio_files_rest.models import ObjectVersion

from ..transfer import Transfer
from .base import FileServiceComponent


Expand All @@ -26,29 +23,12 @@ def set_file_content(self, identity, id, file_key, stream, content_length, recor
file_record = record.files.get(file_key)
if file_record is None:
raise Exception(f'File with key "{file_key}" has not been initialized yet.')
if file_record.file:
raise Exception(f'File with key "{file_key}" is commited.')

# Check size limitations
bucket = record.bucket
size_limit = bucket.size_limit
if content_length and size_limit and content_length > size_limit:
desc = (
"File size limit exceeded."
if isinstance(size_limit, int)
else size_limit.reason
)
raise FileSizeError(description=desc)

# DB connection?
# re uploading failed upload?

with db.session.begin_nested():
# TODO: in case we want to update a file, this keeps the old
# FileInstance. It might be better to call ObjectVersion.remove()
# before or after the "set_content"
obj = ObjectVersion.create(bucket, file_key)
obj.set_contents(stream, size=content_length, size_limit=size_limit)
file_type = file_record.file.storage_class if file_record.file else None
transfer = Transfer.get_transfer(file_type)
transfer.set_file_content(
record, file_record.file, file_key, stream, content_length
)

def get_file_content(self, identity, id, file_key, record):
"""Get file content handler."""
Expand Down
18 changes: 9 additions & 9 deletions invenio_records_resources/services/files/components/metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2021 CERN.
# Copyright (C) 2021-2022 CERN.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
Expand All @@ -10,9 +10,8 @@

from copy import deepcopy

from invenio_files_rest.models import ObjectVersion

from ..schema import InitFileSchema
from ..transfer import Transfer
from .base import FileServiceComponent


Expand All @@ -25,17 +24,18 @@ def init_files(self, identity, id, record, data):
validated_data = schema.load(data)
for file_metadata in validated_data:
temporary_obj = deepcopy(file_metadata)
record.files.create(temporary_obj.pop("key"), data=temporary_obj)
file_type = temporary_obj.pop("storage_class", None)
transfer = Transfer.get_transfer(
file_type, service=self.service, uow=self.uow
)
_ = transfer.init_file(record, temporary_obj)

def update_file_metadata(self, identity, id, file_key, record, data):
"""Update file metadata handler."""
# FIXME: move this call to a transfer call
record.files.update(file_key, data=data)

# TODO: `commit_file` might vary based on your storage backend (e.g. S3)
def commit_file(self, identity, id, file_key, record):
"""Commit file handler."""
# TODO: Add other checks here (e.g. verify checksum, S3 upload)
file_obj = ObjectVersion.get(record.bucket.id, file_key)
if not file_obj:
raise Exception(f"File with key {file_key} not uploaded yet.")
record.files[file_key] = file_obj
Transfer.commit_file(record, file_key)
47 changes: 47 additions & 0 deletions invenio_records_resources/services/files/generators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 CERN.
#
# Invenio-Records-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""File permissions generators."""


from invenio_access.permissions import any_user, system_process
from invenio_records_permissions.generators import Generator
from invenio_search.engine import dsl

from .transfer import TransferType


class AnyUserIfFileIsLocal(Generator):
"""Allows any user."""

def needs(self, **kwargs):
"""Enabling Needs."""
record = kwargs["record"]
file_key = kwargs.get("file_key")
is_file_local = True
if file_key:
file_record = record.files.get(file_key)
# file_record __bool__ returns false for `if file_record`
file = file_record.file if file_record is not None else None
is_file_local = not file or file.storage_class == TransferType.LOCAL
else:
file_records = record.files.entries
for file_record in file_records:
file = file_record.file
if file and file.storage_class != TransferType.LOCAL:
is_file_local = False
break

if is_file_local:
return [any_user]
else:
return [system_process]

def query_filter(self, **kwargs):
"""Match all in search."""
return dsl.Q("match_all")
Loading

0 comments on commit 1787e47

Please sign in to comment.