diff --git a/docs/source/topics/provenance/caching.rst b/docs/source/topics/provenance/caching.rst
index 88c8a925c2..6a923d8931 100644
--- a/docs/source/topics/provenance/caching.rst
+++ b/docs/source/topics/provenance/caching.rst
@@ -57,6 +57,18 @@ In order to figure out why a calculation is *not* being reused, the :meth:`~aiid
         }
     ]
 
+.. versionchanged:: 2.6
+    Version information removed from hash computation
+
+    Up until v2.6, the objects used to compute the hash of a ``ProcessNode`` included the ``version`` attribute.
+    This attribute stores a dictionary of the installed versions of ``aiida-core`` and, if relevant, the plugin package at the time of creation.
+    When the caching mechanism was first introduced, this information was intentionally added to the hash to err on the safe side and prevent false positives as much as possible.
+    This turned out to be too limiting, however, as it means that each time the version of ``aiida-core`` or a plugin package is updated, all existing valid cache sources are essentially invalidated.
+    Even if an identical process were run, its hash would be different, solely because the version information differs.
+    Therefore, as of v2.6, the version information is no longer part of the hash computation.
+    The most likely sources of false positives due to code changes are ``CalcJob`` and ``Parser`` plugins.
+    See :ref:`this section <topics:provenance:caching:control-hashing:calcjobs-parsers>` for a mechanism to control the caching of ``CalcJob`` plugins.
+
 
 .. _topics:provenance:caching:control-hashing:
 
@@ -82,6 +94,37 @@ Process nodes
 The hashing of *Process nodes* is fixed and can only be influenced indirectly via the hashes of their inputs.
 For implementation details of the hashing mechanism for process nodes, see :ref:`here `.
 
+
+.. _topics:provenance:caching:control-hashing:calcjobs-parsers:
+
+Calculation jobs and parsers
+............................
+
+.. versionadded:: 2.6
+    Resetting the calculation job cache
+
+    When the implementation of a ``CalcJob`` or ``Parser`` plugin changes significantly, it can be the case that, for identical inputs, significantly different outputs are expected.
+    The following non-exhaustive list provides some examples:
+
+    * The ``CalcJob.prepare_for_submission`` method changes the input files it writes, independently of the input nodes
+    * The ``Parser`` adds an output node for identical output files produced by the calculation
+    * The ``Parser`` changes an existing output node even for identical output files produced by the calculation
+
+    In such cases, existing completed nodes of the ``CalcJob`` plugin in question should be invalidated as cache sources, because they could constitute false positives.
+    For that reason, the ``CalcJob`` and ``Parser`` base classes each have the ``CACHE_VERSION`` class attribute.
+    By default it is set to ``None``, but when set to an integer, it is included in the computed hash of their nodes.
+    This allows a plugin developer to invalidate the cache of existing nodes by simply incrementing this attribute, for example:
+
+    .. code-block:: python
+
+        class SomeCalcJob(CalcJob):
+
+            CACHE_VERSION = 1
+
+    Note that the exact value of ``CACHE_VERSION`` does not really matter; all that matters is that changing it invalidates the existing cache.
+    To keep things simple, it is recommended to treat it as a counter and simply increment it by 1 each time.
+
+
 .. _topics:provenance:caching:control-caching:
 
 Controlling Caching
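As an illustration of the documentation added above (not part of the patch itself), a plugin package would typically bump the attribute on whichever class changed; ``SomeCalcJob`` and ``SomeParser`` below are hypothetical names and their ``define``/``prepare_for_submission``/``parse`` implementations are omitted:

    from aiida.engine import CalcJob
    from aiida.parsers import Parser


    class SomeCalcJob(CalcJob):
        """Hypothetical calculation job plugin."""

        # Incremented because `prepare_for_submission` now writes different input files
        # for identical input nodes, so previously stored nodes should no longer be
        # valid cache sources.
        CACHE_VERSION = 2


    class SomeParser(Parser):
        """Hypothetical parser plugin."""

        # Incremented because the parser now attaches an additional output node
        # for identical retrieved files.
        CACHE_VERSION = 1
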
diff --git a/src/aiida/engine/processes/calcjobs/calcjob.py b/src/aiida/engine/processes/calcjobs/calcjob.py
index de4fbd8663..70a72ba307 100644
--- a/src/aiida/engine/processes/calcjobs/calcjob.py
+++ b/src/aiida/engine/processes/calcjobs/calcjob.py
@@ -178,6 +178,8 @@ class CalcJob(Process):
     _node_class = orm.CalcJobNode
     _spec_class = CalcJobProcessSpec
     link_label_retrieved: str = 'retrieved'
+    KEY_CACHE_VERSION: str = 'cache_version'
+    CACHE_VERSION: int | None = None
 
     def __init__(self, *args, **kwargs) -> None:
         """Construct a CalcJob instance.
@@ -568,6 +570,40 @@ def prepare_for_submission(self, folder: Folder) -> CalcInfo:
         """
         raise NotImplementedError()
 
+    def _setup_version_info(self) -> dict[str, Any]:
+        """Store relevant plugin version information."""
+        from aiida.plugins.entry_point import format_entry_point_string
+        from aiida.plugins.factories import ParserFactory
+
+        version_info = super()._setup_version_info()
+
+        for key, monitor in self.inputs.get('monitors', {}).items():
+            entry_point = monitor.base.attributes.get('entry_point')
+            entry_point_string = format_entry_point_string('aiida.calculations.monitors', entry_point)
+            monitor_version_info = self.runner.plugin_version_provider.get_version_info(entry_point_string)
+            version_info['version'].setdefault('monitors', {})[key] = monitor_version_info['version']['plugin']
+
+        cache_version_info = {}
+
+        if self.CACHE_VERSION is not None:
+            cache_version_info['calc_job'] = self.CACHE_VERSION
+
+        parser_entry_point = self.inputs.metadata.options.get('parser_name')
+
+        if parser_entry_point is not None:
+            try:
+                parser = ParserFactory(self.inputs.metadata.options.parser_name)
+            except exceptions.EntryPointError:
+                self.logger.warning(f'Could not load the `parser_name` entry point `{parser_entry_point}`.')
+            else:
+                if parser.CACHE_VERSION is not None:
+                    cache_version_info['parser'] = parser.CACHE_VERSION
+
+        if cache_version_info:
+            self.node.base.attributes.set(self.KEY_CACHE_VERSION, cache_version_info)
+
+        return version_info
+
     def _setup_metadata(self, metadata: dict) -> None:
         """Store the metadata on the ProcessNode."""
         computer = metadata.pop('computer', None)
diff --git a/src/aiida/engine/processes/process.py b/src/aiida/engine/processes/process.py
index 5ee3657d06..f4dc9f9d69 100644
--- a/src/aiida/engine/processes/process.py
+++ b/src/aiida/engine/processes/process.py
@@ -711,19 +711,11 @@ def _setup_db_record(self) -> None:
         self._setup_version_info()
         self._setup_inputs()
 
-    def _setup_version_info(self) -> None:
+    def _setup_version_info(self) -> dict[str, Any]:
         """Store relevant plugin version information."""
-        from aiida.plugins.entry_point import format_entry_point_string
-
         version_info = self.runner.plugin_version_provider.get_version_info(self.__class__)
-
-        for key, monitor in self.inputs.get('monitors', {}).items():
-            entry_point = monitor.base.attributes.get('entry_point')
-            entry_point_string = format_entry_point_string('aiida.calculations.monitors', entry_point)
-            monitor_version_info = self.runner.plugin_version_provider.get_version_info(entry_point_string)
-            version_info['version'].setdefault('monitors', {})[key] = monitor_version_info['version']['plugin']
-
         self.node.base.attributes.set_many(version_info)
+        return version_info
 
     def _setup_metadata(self, metadata: dict) -> None:
         """Store the metadata on the ProcessNode."""
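With the refactoring above, ``Process._setup_version_info`` keeps storing the ``version`` attribute (and now returns the dictionary), while the ``CalcJob`` override adds the monitor versions and, only when a ``CACHE_VERSION`` is defined on the calculation or parser class, the separate ``cache_version`` attribute. A rough inspection sketch, assuming a configured profile and a completed calculation job; the pk ``1234`` is a placeholder:

    from aiida import load_profile, orm

    load_profile()

    node = orm.load_node(1234)  # placeholder pk of a completed CalcJobNode

    # Written by Process._setup_version_info(): still recorded for provenance,
    # but as of v2.6 no longer part of the hash.
    print(node.base.attributes.get('version'))

    # Written by CalcJob._setup_version_info() only if the CalcJob and/or Parser
    # plugin defines CACHE_VERSION; this dictionary does enter the hash.
    print(node.base.attributes.get('cache_version', None))
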
--- a/src/aiida/orm/nodes/node.py
+++ b/src/aiida/orm/nodes/node.py
@@ -8,6 +8,8 @@
 ###########################################################################
 """Package for node ORM classes."""
 
+from __future__ import annotations
+
 from datetime import datetime
 from functools import cached_property
 from logging import Logger
diff --git a/src/aiida/parsers/parser.py b/src/aiida/parsers/parser.py
index 4902aecb40..08e1402501 100644
--- a/src/aiida/parsers/parser.py
+++ b/src/aiida/parsers/parser.py
@@ -10,6 +10,8 @@
 to allow the reading of the outputs of a calculation.
 """
 
+from __future__ import annotations
+
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
 
@@ -29,6 +31,8 @@
 class Parser(ABC):
     """Base class for a Parser that can parse the outputs produced by a CalcJob process."""
 
+    CACHE_VERSION: int | None = None
+
     def __init__(self, node: 'CalcJobNode'):
         """Construct the Parser instance.
 
diff --git a/tests/engine/processes/calcjobs/test_calc_job.py b/tests/engine/processes/calcjobs/test_calc_job.py
index 456e450a57..af1b323200 100644
--- a/tests/engine/processes/calcjobs/test_calc_job.py
+++ b/tests/engine/processes/calcjobs/test_calc_job.py
@@ -27,9 +27,10 @@
 from aiida.engine.processes.calcjobs.monitors import CalcJobMonitorAction, CalcJobMonitorResult
 from aiida.engine.processes.ports import PortNamespace
 from aiida.engine.utils import instantiate_process
-from aiida.plugins import CalculationFactory
+from aiida.plugins import CalculationFactory, ParserFactory
 
 ArithmeticAddCalculation = CalculationFactory('core.arithmetic.add')
+ArithmeticAddParser = ParserFactory('core.arithmetic.add')
 
 
 def raise_exception(exception, *args, **kwargs):
@@ -1468,3 +1469,30 @@ def test_file_copy_operation_order_invalid(fixture_sandbox, runner, aiida_local_
     process = instantiate_process(runner, FileCopyOperationOrderInvalid, **inputs)
     with pytest.raises(exceptions.PluginInternalError, match=r'calc_info.file_copy_operation_order is not a list .*'):
         process.presubmit(fixture_sandbox)
+
+
+def test_cache_version_attribute(arithmetic_add_inputs, monkeypatch):
+    """Test that the ``CalcJob.CACHE_VERSION`` and ``Parser.CACHE_VERSION`` attributes can be used to control hashes.
+
+    If the implementation of a ``CalcJob`` or ``Parser`` plugin changes significantly, a plugin developer can change
+    the ``CACHE_VERSION`` attribute to cause the hash to change, ensuring old completed instances of the class are no
+    longer valid cache sources.
+    """
+    _, node_a = launch.run_get_node(ArithmeticAddCalculation, arithmetic_add_inputs)
+
+    monkeypatch.setattr(ArithmeticAddCalculation, 'CACHE_VERSION', 1)
+
+    _, node_b = launch.run_get_node(ArithmeticAddCalculation, arithmetic_add_inputs)
+    assert node_b.base.attributes.get(ArithmeticAddCalculation.KEY_CACHE_VERSION) == {'calc_job': 1}
+    assert node_a.base.caching.get_hash() != node_b.base.caching.get_hash()
+    assert not node_b.base.caching.is_created_from_cache
+
+    monkeypatch.setattr(ArithmeticAddParser, 'CACHE_VERSION', 2)
+
+    _, node_c = launch.run_get_node(ArithmeticAddCalculation, arithmetic_add_inputs)
+    assert node_c.base.attributes.get(ArithmeticAddCalculation.KEY_CACHE_VERSION) == {
+        'calc_job': 1,
+        'parser': 2,
+    }
+    assert node_b.base.caching.get_hash() != node_c.base.caching.get_hash()
+    assert not node_c.base.caching.is_created_from_cache
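Beyond the unit test, the end-to-end effect can be sketched as follows (not part of the patch); it assumes a configured profile and the ``core.arithmetic.add`` plugin with a code set up under the hypothetical label ``add@localhost``:

    from aiida import engine, load_profile, orm
    from aiida.manage.caching import enable_caching
    from aiida.plugins import CalculationFactory

    load_profile()

    ArithmeticAddCalculation = CalculationFactory('core.arithmetic.add')

    builder = ArithmeticAddCalculation.get_builder()
    builder.code = orm.load_code('add@localhost')  # assumed code label
    builder.x = orm.Int(1)
    builder.y = orm.Int(2)

    # First run populates the provenance graph and becomes a valid cache source.
    _, first = engine.run_get_node(builder)

    # With caching enabled, an identical run should be taken from the cache ...
    with enable_caching(identifier='aiida.calculations:core.arithmetic.add'):
        _, second = engine.run_get_node(builder)
    print(second.base.caching.is_created_from_cache)  # expected: True

    # ... whereas bumping CACHE_VERSION on the CalcJob or Parser plugin changes the
    # hash, so existing nodes are no longer matched and the run executes again.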