Skip to content

Commit

Permalink
Merge pull request #152 from gwbischof/invalid_documents
Browse files Browse the repository at this point in the history
Added InvalidData and MismatchedDataKeys exceptions
  • Loading branch information
danielballan authored Mar 5, 2020
2 parents 08642bc + 9d8ceac commit 7cec2b3
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 64 deletions.
189 changes: 125 additions & 64 deletions event_model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,56 +645,79 @@ def fill_event(self, doc, include=None, exclude=None, inplace=None):
filled_doc = doc
else:
filled_doc = copy.deepcopy(doc)

descriptor = self._descriptor_cache[doc['descriptor']]
from_datakeys = False
self._current_state.descriptor = descriptor
try:
filled = doc['filled']
needs_filling = {key for key, val in doc['filled'].items()
if val is False}
except KeyError:
# This document is not telling us which, if any, keys are filled.
# Infer that none of the external data is filled.
filled = {key: 'external' in val
for key, val in descriptor['data_keys'].items()}
for key, is_filled in filled.items():
needs_filling = {key for key, val in descriptor['data_keys'].items()
if 'external' in val}
from_datakeys = True
for key in needs_filling:
self._current_state.key = key
if exclude is not None and key in exclude:
continue
if include is not None and key not in include:
continue
if not is_filled:
try:
datum_id = doc['data'][key]
# Look up the cached Datum doc.
try:
datum_doc = self._datum_cache[datum_id]
except KeyError as err:
raise UnresolvableForeignKeyError(
datum_id,
f"Event with uid {doc['uid']} refers to unknown Datum "
f"datum_id {datum_id}") from err
resource_uid = datum_doc['resource']
# Look up the cached Resource.
try:
resource = self._resource_cache[resource_uid]
except KeyError as err:
raise UnresolvableForeignKeyError(
resource_uid,
f"Datum with id {datum_id} refers to unknown Resource "
f"uid {resource_uid}") from err
handler = self._get_handler_maybe_cached(resource)
error_to_raise = DataNotAccessible(
f"Filler was unable to load the data referenced by "
f"the Datum document {datum_doc} and the Resource "
f"document {resource}.")
payload = _attempt_with_retries(
func=handler,
args=(),
kwargs=datum_doc['datum_kwargs'],
intervals=[0] + self.retry_intervals,
error_to_catch=IOError,
error_to_raise=error_to_raise)
# Here we are intentionally modifying doc in place.
filled_doc['data'][key] = payload
filled_doc['filled'][key] = datum_id
except KeyError as err:
if from_datakeys:
raise MismatchedDataKeys(
"The documents are not valid. Either because they "
"were recorded incorrectly in the first place, "
"corrupted since, or exercising a yet-undiscovered "
"bug in a reader. event['data'].keys() "
"must equal descriptor['data_keys'].keys(). "
f"event['data'].keys(): {doc['data'].keys()}, "
"descriptor['data_keys'].keys(): "
f"{descriptor['data_keys'].keys()}") from err
else:
raise MismatchedDataKeys(
"The documents are not valid. Either because they "
"were recorded incorrectly in the first place, "
"corrupted since, or exercising a yet-undiscovered "
"bug in a reader. event['filled'].keys() "
"must be a subset of event['data'].keys(). "
f"event['data'].keys(): {doc['data'].keys()}, "
"event['filled'].keys(): "
f"{doc['filled'].keys()}") from err
# Look up the cached Datum doc.
try:
datum_doc = self._datum_cache[datum_id]
except KeyError as err:
raise UnresolvableForeignKeyError(
datum_id,
f"Event with uid {doc['uid']} refers to unknown Datum "
f"datum_id {datum_id}") from err
resource_uid = datum_doc['resource']
# Look up the cached Resource.
try:
resource = self._resource_cache[resource_uid]
except KeyError as err:
raise UnresolvableForeignKeyError(
resource_uid,
f"Datum with id {datum_id} refers to unknown Resource "
f"uid {resource_uid}") from err
handler = self._get_handler_maybe_cached(resource)
error_to_raise = DataNotAccessible(
f"Filler was unable to load the data referenced by "
f"the Datum document {datum_doc} and the Resource "
f"document {resource}.")
payload = _attempt_with_retries(
func=handler,
args=(),
kwargs=datum_doc['datum_kwargs'],
intervals=[0] + self.retry_intervals,
error_to_catch=IOError,
error_to_raise=error_to_raise)
# Here we are intentionally modifying doc in place.
filled_doc['data'][key] = payload
filled_doc['filled'][key] = datum_id
self._current_state.key = None
self._current_state.descriptor = None
return filled_doc
Expand Down Expand Up @@ -784,40 +807,64 @@ def fill_event_page(self, doc, include=None, exclude=None):
return filled_doc

def fill_event(self, doc, include=None, exclude=None, inplace=None):
    """
    Check that the external references in an Event can be resolved.

    No data is loaded and ``doc`` is not modified. For each key that still
    needs filling, the Datum id is read from ``doc['data']``, the Datum is
    looked up in the datum cache, and the Resource it names is looked up
    in the resource cache.

    Parameters
    ----------
    doc : dict
        Event document. Must contain 'descriptor', 'data' and 'uid';
        'filled' is optional.
    include : container, optional
        If given, only keys contained in it are checked.
    exclude : container, optional
        If given, keys contained in it are skipped.
    inplace : optional
        Accepted but not used by this implementation.

    Returns
    -------
    doc : dict
        The same object that was passed in.

    Raises
    ------
    MismatchedDataKeys
        If a key that needs filling is missing from ``doc['data']``.
    UnresolvableForeignKeyError
        If the Datum, or the Resource the Datum refers to, is not cached.
    """
    descriptor = self._descriptor_cache[doc['descriptor']]
    from_datakeys = False
    try:
        needs_filling = {key for key, val in doc['filled'].items()
                         if val is False}
    except KeyError:
        # This document is not telling us which, if any, keys are filled.
        # Infer that none of the external data is filled.
        needs_filling = {key for key, val in descriptor['data_keys'].items()
                         if 'external' in val}
        from_datakeys = True
    for key in needs_filling:
        if exclude is not None and key in exclude:
            continue
        if include is not None and key not in include:
            continue
        # Keep the try body to the single lookup so that only a key missing
        # from doc['data'] is reported as a data-keys mismatch.
        try:
            datum_id = doc['data'][key]
        except KeyError as err:
            if from_datakeys:
                raise MismatchedDataKeys(
                    "The documents are not valid. Either because they "
                    "were recorded incorrectly in the first place, "
                    "corrupted since, or exercising a yet-undiscovered "
                    "bug in a reader. event['data'].keys() "
                    "must equal descriptor['data_keys'].keys(). "
                    f"event['data'].keys(): {doc['data'].keys()}, "
                    "descriptor['data_keys'].keys(): "
                    f"{descriptor['data_keys'].keys()}") from err
            else:
                raise MismatchedDataKeys(
                    "The documents are not valid. Either because they "
                    "were recorded incorrectly in the first place, "
                    "corrupted since, or exercising a yet-undiscovered "
                    "bug in a reader. event['filled'].keys() "
                    "must be a subset of event['data'].keys(). "
                    f"event['data'].keys(): {doc['data'].keys()}, "
                    "event['filled'].keys(): "
                    f"{doc['filled'].keys()}") from err
        # Look up the cached Datum doc.
        try:
            datum_doc = self._datum_cache[datum_id]
        except KeyError as err:
            err_with_key = UnresolvableForeignKeyError(
                datum_id,
                f"Event with uid {doc['uid']} refers to unknown Datum "
                f"datum_id {datum_id}")
            err_with_key.key = datum_id
            raise err_with_key from err
        resource_uid = datum_doc['resource']
        # Look up the cached Resource. Only its presence matters here.
        try:
            self._resource_cache[resource_uid]
        except KeyError as err:
            # NOTE(review): the key passed here is the datum_id, whereas the
            # data-loading variant of fill_event passes resource_uid for the
            # same failure. Left as-is because callers may rely on it;
            # confirm which key is intended.
            raise UnresolvableForeignKeyError(
                datum_id,
                f"Datum with id {datum_id} refers to unknown Resource "
                f"uid {resource_uid}") from err
    return doc


Expand Down Expand Up @@ -1119,7 +1166,21 @@ def __init__(self, key, message):


class DuplicateHandler(EventModelRuntimeError):
    """raised when a handler is already registered for a given spec"""
    ...


class InvalidData(EventModelError):
    """
    Raised when the data is invalid.

    Base class for more specific invalid-data errors, so callers can catch
    this type to handle all of them.
    """
    ...


class MismatchedDataKeys(InvalidData):
    """
    Raised when any of the data-key structures are out of sync.

    This includes event['data'].keys(), descriptor['data_keys'].keys(),
    event['timestamps'].keys(), and event['filled'].keys().
    """
    ...


Expand Down
22 changes: 22 additions & 0 deletions event_model/tests/test_em.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,28 @@ def __call__(self, c, d):
assert not filler.handler_registry
assert not filler._handler_cache # implementation detail

with pytest.raises(event_model.MismatchedDataKeys):
with event_model.NoFiller(reg) as filler:
filler('start', run_bundle.start_doc)
filler('descriptor', desc_bundle.descriptor_doc)
filler('descriptor', desc_bundle_baseline.descriptor_doc)
filler('resource', res_bundle.resource_doc)
filler('datum', datum_doc)
event = copy.deepcopy(raw_event)
del event['data']['image']
filler('event', event)

with pytest.raises(event_model.MismatchedDataKeys):
with event_model.Filler(reg) as filler:
filler('start', run_bundle.start_doc)
filler('descriptor', desc_bundle.descriptor_doc)
filler('descriptor', desc_bundle_baseline.descriptor_doc)
filler('resource', res_bundle.resource_doc)
filler('datum', datum_doc)
event = copy.deepcopy(raw_event)
del event['data']['image']
filler('event', event)


def test_rechunk_event_pages():

Expand Down

0 comments on commit 7cec2b3

Please sign in to comment.