diff --git a/.github/workflows/ci-integration.yml b/.github/workflows/ci-integration.yml
index 36e1d380..a91229af 100644
--- a/.github/workflows/ci-integration.yml
+++ b/.github/workflows/ci-integration.yml
@@ -27,8 +27,9 @@ jobs:
       matrix:
         os: ['ubuntu']
         python-version:
-          - "3.9"
           - "3.10"
+          - "3.11"
+          - "3.12"

     steps:
       - uses: actions/checkout@v2
@@ -52,10 +53,11 @@ jobs:
           pytest -v --cov=alchemiscale --cov-report=xml alchemiscale/tests

       - name: codecov
-        if: ${{ github.repository == 'openforcefield/alchemiscale'
+        if: ${{ github.repository == 'OpenFreeEnergy/alchemiscale'
                 && github.event != 'schedule' }}
-        uses: codecov/codecov-action@v2
+        uses: codecov/codecov-action@v4
         with:
+          token: ${{ secrets.CODECOV_TOKEN }}
           file: coverage.xml
           fail_ci_if_error: False
           verbose: True
diff --git a/.github/workflows/deploy-conda-envs.yml b/.github/workflows/deploy-conda-envs.yml
deleted file mode 100644
index 15964545..00000000
--- a/.github/workflows/deploy-conda-envs.yml
+++ /dev/null
@@ -1,75 +0,0 @@
----
-
-name: Deployment - conda environments
-
-on:
-  push:
-    branches:
-      - master
-    paths:
-      - 'devtools/conda-envs/alchemiscale-*.yml'
-  workflow_dispatch:
-
-jobs:
-  deploy-conda-env:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        env-name:
-          - alchemiscale-client
-          - alchemiscale-server
-          - alchemiscale-compute
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v2
-
-      - name: ensure we only have one instance running
-        uses: softprops/turnstyle@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GH_DANGERBOT_TOKEN_LIMITED }}
-        with:
-          abort-after-seconds: 60
-
-      #- name: Cache conda
-      #  uses: actions/cache@v2
-      #  env:
-      #    CACHE_NUMBER: 0
-      #  with:
-      #    path: ~/conda_pkgs_dir
-      #    key:
-      #      ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{
-      #      hashFiles('devtools/conda-envs/${{ matrix.env-name }}.yml') }}
-
-      - name: Additional info about the build
-        shell: bash
-        run: |
-          uname -a
-          df -h
-          ulimit -a
-
-      - name: Configure conda; test creation of environment
-        uses: conda-incubator/setup-miniconda@v2
-        with:
-          #python-version: 3.9
-          auto-update-conda: true
-          use-mamba: true
-          miniforge-variant: Mambaforge
-          activate-environment: ${{ matrix.env-name }}
-          environment-file: devtools/conda-envs/${{ matrix.env-name }}.yml
-          #use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
-          #auto-activate-base: false
-
-      - name: Environment Information
-        shell: bash -l {0}
-        run: |
-          conda info
-          conda list
-
-      - name: Deploy conda env
-        shell: bash -l {0}
-        env:
-          ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }}
-        run: |
-          mamba install -y anaconda-client
-          anaconda -t ${ANACONDA_TOKEN} upload --user openforcefield devtools/conda-envs/${{ matrix.env-name }}.yml
diff --git a/.github/workflows/deploy-docker.yml b/.github/workflows/deploy-docker.yml
index f79e29cb..a060d8a5 100644
--- a/.github/workflows/deploy-docker.yml
+++ b/.github/workflows/deploy-docker.yml
@@ -16,7 +16,7 @@ on:

 env:
   REGISTRY: ghcr.io
-  NAMESPACE: openforcefield
+  NAMESPACE: OpenFreeEnergy

 jobs:
   build-and-push-image:
@@ -39,7 +39,7 @@ jobs:
         uses: actions/checkout@v3

       - name: Log in to the Container registry
-        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
+        uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
           username: ${{ github.actor }}
@@ -47,7 +47,7 @@ jobs:

       - name: Extract metadata (tags, labels) for Docker
         id: meta
-        uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
+        uses: docker/metadata-action@v5
         with:
           images: ${{ env.REGISTRY }}/${{ env.NAMESPACE }}/${{ matrix.image }}
           tags: |
diff --git a/README.md b/README.md
index c83ade3d..fef337aa 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,15 @@

 ---

-[![build](https://github.com/openforcefield/alchemiscale/actions/workflows/ci-integration.yml/badge.svg)](https://github.com/openforcefield/alchemiscale/actions/workflows/ci-integration.yml)
-[![coverage](https://codecov.io/gh/openforcefield/alchemiscale/branch/main/graph/badge.svg)](https://codecov.io/gh/openforcefield/alchemiscale)
+[![build](https://github.com/OpenFreeEnergy/alchemiscale/actions/workflows/ci-integration.yml/badge.svg)](https://github.com/OpenFreeEnergy/alchemiscale/actions/workflows/ci-integration.yml)
+[![coverage](https://codecov.io/gh/OpenFreeEnergy/alchemiscale/branch/main/graph/badge.svg)](https://codecov.io/gh/OpenFreeEnergy/alchemiscale)
 [![Documentation Status](https://readthedocs.org/projects/alchemiscale/badge/?version=latest)](https://alchemiscale.readthedocs.io/en/latest/?badge=latest)

 **alchemiscale**: a high-throughput alchemical free energy execution system for use with HPC, cloud, bare metal, and Folding@Home

+Learn more about the project, including how to get involved, at: https://alchemiscale.org
+
 ---

 alchemiscale logo by Jenke Scheen is marked with CC0 1.0
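The cli.py change that follows replaces the compute service's long list of keyword arguments with a single ComputeServiceSettings object. A minimal sketch of the resulting startup path, assuming a config file shaped like devtools/configs/synchronous-compute-settings.yaml; the filename and the final start() call are illustrative assumptions, not part of this patch:

import yaml

from alchemiscale.models import Scope
from alchemiscale.compute.service import SynchronousComputeService
from alchemiscale.compute.settings import ComputeServiceSettings

# read the `init` block of the settings file, as the `synchronous` CLI command does
with open("synchronous-compute-settings.yaml") as f:  # filename is an assumption
    params_init = yaml.safe_load(f)["init"]

# scopes arrive as strings like '*-*-*' and must become Scope objects
params_init["scopes"] = [Scope.from_str(scope) for scope in params_init["scopes"]]

service = SynchronousComputeService(ComputeServiceSettings(**params_init))
service.start()  # assumed entry point; the real CLI also wires up signal handling first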
diff --git a/alchemiscale/cli.py b/alchemiscale/cli.py
index 1999ef57..c6d096ae 100644
--- a/alchemiscale/cli.py
+++ b/alchemiscale/cli.py
@@ -362,6 +362,7 @@ def get_settings_override():
 def synchronous(config_file):
     from alchemiscale.models import Scope
     from alchemiscale.compute.service import SynchronousComputeService
+    from alchemiscale.compute.settings import ComputeServiceSettings

     params = yaml.safe_load(config_file)

@@ -373,7 +374,7 @@ def synchronous(config_file):
         Scope.from_str(scope) for scope in params_init["scopes"]
     ]

-    service = SynchronousComputeService(**params_init)
+    service = SynchronousComputeService(ComputeServiceSettings(**params_init))

     # add signal handling
     for signame in {"SIGHUP", "SIGINT", "SIGTERM"}:
diff --git a/alchemiscale/compute/api.py b/alchemiscale/compute/api.py
index f3bff55c..c1f1a7f4 100644
--- a/alchemiscale/compute/api.py
+++ b/alchemiscale/compute/api.py
@@ -8,6 +8,7 @@
 import os
 import json
 from datetime import datetime, timedelta
+import random

 from fastapi import FastAPI, APIRouter, Body, Depends
 from fastapi.middleware.gzip import GZipMiddleware
@@ -24,6 +25,7 @@
     get_cred_entity,
     validate_scopes,
     validate_scopes_query,
+    minimize_scope_space,
     _check_store_connectivity,
     gufe_to_json,
     GzipRoute,
@@ -178,6 +180,7 @@ def claim_taskhub_tasks(
     *,
     compute_service_id: str = Body(),
     count: int = Body(),
+    protocols: Optional[List[str]] = Body(None, embed=True),
     n4js: Neo4jStore = Depends(get_n4js_depends),
     token: TokenData = Depends(get_token_data_depends),
 ):
@@ -188,13 +191,91 @@ def claim_taskhub_tasks(
         taskhub=taskhub_scoped_key,
         compute_service_id=ComputeServiceID(compute_service_id),
         count=count,
+        protocols=protocols,
     )

     return [str(t) if t is not None else None for t in tasks]


+@router.post("/claim")
+def claim_tasks(
+    scopes: List[Scope] = Body(),
+    compute_service_id: str = Body(),
+    count: int = Body(),
+    protocols: Optional[List[str]] = Body(None, embed=True),
+    n4js: Neo4jStore = Depends(get_n4js_depends),
+    token: TokenData = Depends(get_token_data_depends),
+):
+    # intersect query scopes with accessible scopes in the token
+    scopes_reduced = minimize_scope_space(scopes)
+    query_scopes = []
+    for scope in scopes_reduced:
+        query_scopes.extend(validate_scopes_query(scope, token))
+
+    taskhubs = dict()
+    # query each scope for available taskhubs
+    # this loop could be removed in the future with a Union-like operator on scopes
+    for single_query_scope in set(query_scopes):
+        taskhubs.update(n4js.query_taskhubs(scope=single_query_scope, return_gufe=True))
+
+    # list of tasks to return
+    tasks = []
+
+    if len(taskhubs) == 0:
+        return []
+
+    # claim tasks from taskhubs based on weight; keep going until we hit our
+    # total desired task count, or we run out of taskhubs to draw from
+    while len(tasks) < count and len(taskhubs) > 0:
+        weights = [th.weight for th in taskhubs.values()]
+
+        if sum(weights) == 0:
+            break
+
+        # based on weights, choose taskhub to draw from
+        taskhub: ScopedKey = random.choices(list(taskhubs.keys()), weights=weights)[0]
+
+        # claim tasks from the taskhub
+        claimed_tasks = n4js.claim_taskhub_tasks(
+            taskhub,
+            compute_service_id=ComputeServiceID(compute_service_id),
+            count=(count - len(tasks)),
+            protocols=protocols,
+        )
+
+        # gather up claimed tasks, if present
+        for t in claimed_tasks:
+            if t is not None:
+                tasks.append(t)
+
+        # remove this taskhub from the options available; repeat
+        taskhubs.pop(taskhub)
+
+    return [str(t) for t in tasks] + [None] * (count - len(tasks))
+
+
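The /claim endpoint above draws Tasks from TaskHubs in proportion to their weight, dropping each consulted hub until the requested count is met or no weighted hubs remain. A self-contained sketch of that selection strategy, with toy hubs standing in for the Neo4j-backed TaskHubs:

import random

def weighted_claim(hubs: dict, count: int) -> list:
    """hubs maps name -> (weight, tasks available); mirrors the loop above."""
    claimed, remaining = [], dict(hubs)
    while len(claimed) < count and remaining:
        weights = [weight for weight, _ in remaining.values()]
        if sum(weights) == 0:
            break  # only zero-weight hubs left; nothing more to draw
        # weighted choice of a hub, exactly as random.choices is used above
        name = random.choices(list(remaining.keys()), weights=weights)[0]
        weight, available = remaining.pop(name)  # each hub is consulted at most once
        claimed.extend(
            f"{name}-task-{i}" for i in range(min(available, count - len(claimed)))
        )
    return claimed

# "hub-b" is favored 2:1 over "hub-a"; "hub-c" can never be chosen
print(weighted_claim({"hub-a": (0.5, 2), "hub-b": (1.0, 5), "hub-c": (0.0, 3)}, count=4))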
@router.get("/tasks/{task_scoped_key}/transformation") def get_task_transformation( + task_scoped_key, + *, + n4js: Neo4jStore = Depends(get_n4js_depends), + token: TokenData = Depends(get_token_data_depends), +): + sk = ScopedKey.from_str(task_scoped_key) + validate_scopes(sk.scope, token) + + transformation: ScopedKey + + transformation, _ = n4js.get_task_transformation( + task=task_scoped_key, + return_gufe=False, + ) + + return str(transformation) + + +@router.get("/tasks/{task_scoped_key}/transformation/gufe") +def retrieve_task_transformation( task_scoped_key, *, n4js: Neo4jStore = Depends(get_n4js_depends), diff --git a/alchemiscale/compute/client.py b/alchemiscale/compute/client.py index fcc870ca..901a7516 100644 --- a/alchemiscale/compute/client.py +++ b/alchemiscale/compute/client.py @@ -35,15 +35,17 @@ class AlchemiscaleComputeClient(AlchemiscaleBaseClient): _exception = AlchemiscaleComputeClientError def register(self, compute_service_id: ComputeServiceID): - res = self._post_resource(f"computeservice/{compute_service_id}/register", {}) + res = self._post_resource(f"/computeservice/{compute_service_id}/register", {}) return ComputeServiceID(res) def deregister(self, compute_service_id: ComputeServiceID): - res = self._post_resource(f"computeservice/{compute_service_id}/deregister", {}) + res = self._post_resource( + f"/computeservice/{compute_service_id}/deregister", {} + ) return ComputeServiceID(res) def heartbeat(self, compute_service_id: ComputeServiceID): - res = self._post_resource(f"computeservice/{compute_service_id}/heartbeat", {}) + res = self._post_resource(f"/computeservice/{compute_service_id}/heartbeat", {}) return ComputeServiceID(res) def list_scopes(self) -> List[Scope]: @@ -71,19 +73,48 @@ def query_taskhubs( return taskhubs def claim_taskhub_tasks( - self, taskhub: ScopedKey, compute_service_id: ComputeServiceID, count: int = 1 + self, + taskhub: ScopedKey, + compute_service_id: ComputeServiceID, + count: int = 1, + protocols: Optional[List[str]] = None, ) -> Task: """Claim a `Task` from the specified `TaskHub`""" - data = dict(compute_service_id=str(compute_service_id), count=count) - tasks = self._post_resource(f"taskhubs/{taskhub}/claim", data) + data = dict( + compute_service_id=str(compute_service_id), count=count, protocols=protocols + ) + tasks = self._post_resource(f"/taskhubs/{taskhub}/claim", data) + + return [ScopedKey.from_str(t) if t is not None else None for t in tasks] + + def claim_tasks( + self, + scopes: List[Scope], + compute_service_id: ComputeServiceID, + count: int = 1, + protocols: Optional[List[str]] = None, + ): + """Claim Tasks from TaskHubs within a list of Scopes.""" + data = dict( + scopes=[scope.dict() for scope in scopes], + compute_service_id=str(compute_service_id), + count=count, + protocols=protocols, + ) + tasks = self._post_resource("/claim", data) return [ScopedKey.from_str(t) if t is not None else None for t in tasks] - def get_task_transformation( + def get_task_transformation(self, task: ScopedKey) -> ScopedKey: + """Get the Transformation associated with the given Task.""" + transformation = self._get_resource(f"/tasks/{task}/transformation") + return ScopedKey.from_str(transformation) + + def retrieve_task_transformation( self, task: ScopedKey ) -> Tuple[Transformation, Optional[ProtocolDAGResult]]: transformation, protocoldagresult = self._get_resource( - f"tasks/{task}/transformation" + f"/tasks/{task}/transformation/gufe" ) return ( @@ -104,6 +135,6 @@ def set_task_result( 
             compute_service_id=str(compute_service_id),
         )

-        pdr_sk = self._post_resource(f"tasks/{task}/results", data)
+        pdr_sk = self._post_resource(f"/tasks/{task}/results", data)

         return ScopedKey.from_dict(pdr_sk)
diff --git a/alchemiscale/compute/service.py b/alchemiscale/compute/service.py
index 50897ce1..2955555d 100644
--- a/alchemiscale/compute/service.py
+++ b/alchemiscale/compute/service.py
@@ -24,6 +24,7 @@
 from gufe.protocols.protocoldag import execute_DAG, ProtocolDAG, ProtocolDAGResult

 from .client import AlchemiscaleComputeClient
+from .settings import ComputeServiceSettings
 from ..storage.models import Task, TaskHub, ComputeServiceID
 from ..models import Scope, ScopedKey

@@ -73,114 +74,38 @@ class SynchronousComputeService:

     """

-    def __init__(
-        self,
-        api_url: str,
-        identifier: str,
-        key: str,
-        name: str,
-        shared_basedir: os.PathLike,
-        scratch_basedir: os.PathLike,
-        keep_shared: bool = False,
-        keep_scratch: bool = False,
-        n_retries: int = 3,
-        sleep_interval: int = 30,
-        heartbeat_interval: int = 300,
-        scopes: Optional[List[Scope]] = None,
-        claim_limit: int = 1,
-        loglevel="WARN",
-        logfile: Optional[Path] = None,
-        client_max_retries=5,
-        client_retry_base_seconds=2.0,
-        client_retry_max_seconds=60.0,
-        client_verify=True,
-    ):
-        """Create a `SynchronousComputeService` instance.
+    def __init__(self, settings: ComputeServiceSettings):
+        """Create a `SynchronousComputeService` instance."""
+        self.settings = settings

-        Parameters
-        ----------
-        api_url
-            URL of the compute API to execute Tasks for.
-        identifier
-            Identifier for the compute identity used for authentication.
-        key
-            Credential for the compute identity used for authentication.
-        name
-            The name to give this compute service; used for Task provenance, so
-            typically set to a distinct value to distinguish different compute
-            resources, e.g. different hosts or HPC clusters.
-        shared_basedir
-            Filesystem path to use for `ProtocolDAG` `shared` space.
-        scratch_basedir
-            Filesystem path to use for `ProtocolUnit` `scratch` space.
-        keep_shared
-            If True, don't remove shared directories for `ProtocolDAG`s after
-            completion.
-        keep_scratch
-            If True, don't remove scratch directories for `ProtocolUnit`s after
-            completion.
-        n_retries
-            Number of times to attempt a given Task on failure.
-        sleep_interval
-            Time in seconds to sleep if no Tasks claimed from compute API.
-        heartbeat_interval
-            Frequency at which to send heartbeats to compute API.
-        scopes
-            Scopes to limit Task claiming to; defaults to all Scopes accessible
-            by compute identity.
-        claim_limit
-            Maximum number of Tasks to claim at a time from a TaskHub.
-        loglevel
-            The loglevel at which to report; see the :mod:`logging` docs for
-            available levels.
-        logfile
-            Path to file for logging output; if not set, logging will only go
-            to STDOUT.
-        client_max_retries
-            Maximum number of times to retry a request. In the case the API
-            service is unresponsive an expoenential backoff is applied with
-            retries until this number is reached. If set to -1, retries will
-            continue indefinitely until success.
-        client_retry_base_seconds
-            The base number of seconds to use for exponential backoff.
-            Must be greater than 1.0.
-        client_retry_max_seconds
-            Maximum number of seconds to sleep between retries; avoids runaway
-            exponential backoff while allowing for many retries.
-        client_verify
-            Whether to verify SSL certificate presented by the API server.
-
-        """
-        self.api_url = api_url
-        self.name = name
-        self.sleep_interval = sleep_interval
-        self.heartbeat_interval = heartbeat_interval
-        self.claim_limit = claim_limit
+        self.api_url = self.settings.api_url
+        self.name = self.settings.name
+        self.sleep_interval = self.settings.sleep_interval
+        self.heartbeat_interval = self.settings.heartbeat_interval
+        self.claim_limit = self.settings.claim_limit

         self.client = AlchemiscaleComputeClient(
-            api_url,
-            identifier,
-            key,
-            max_retries=client_max_retries,
-            retry_base_seconds=client_retry_base_seconds,
-            retry_max_seconds=client_retry_max_seconds,
-            verify=client_verify,
+            self.settings.api_url,
+            self.settings.identifier,
+            self.settings.key,
+            max_retries=self.settings.client_max_retries,
+            retry_base_seconds=self.settings.client_retry_base_seconds,
+            retry_max_seconds=self.settings.client_retry_max_seconds,
+            verify=self.settings.client_verify,
         )

-        if scopes is None:
+        if self.settings.scopes is None:
             self.scopes = [Scope()]
         else:
-            self.scopes = scopes
+            self.scopes = self.settings.scopes

-        self.shared_basedir = Path(shared_basedir).absolute()
+        self.shared_basedir = Path(self.settings.shared_basedir).absolute()
         self.shared_basedir.mkdir(exist_ok=True)
-        self.keep_shared = keep_shared
+        self.keep_shared = self.settings.keep_shared

-        self.scratch_basedir = Path(scratch_basedir).absolute()
+        self.scratch_basedir = Path(self.settings.scratch_basedir).absolute()
         self.scratch_basedir.mkdir(exist_ok=True)
-        self.keep_scratch = keep_scratch
-
-        self.n_retries = n_retries
+        self.keep_scratch = self.settings.keep_scratch

         self.scheduler = sched.scheduler(time.monotonic, time.sleep)

@@ -193,7 +118,7 @@ def __init__(
         # logging
         extra = {"compute_service_id": str(self.compute_service_id)}
         logger = logging.getLogger("AlchemiscaleSynchronousComputeService")
-        logger.setLevel(loglevel)
+        logger.setLevel(self.settings.loglevel)

         formatter = logging.Formatter(
             "[%(asctime)s] [%(compute_service_id)s] [%(levelname)s] %(message)s"
@@ -204,8 +129,8 @@ def __init__(
         sh.setFormatter(formatter)
         logger.addHandler(sh)

-        if logfile is not None:
-            fh = logging.FileHandler(logfile)
+        if self.settings.logfile is not None:
+            fh = logging.FileHandler(self.settings.logfile)
             fh.setFormatter(formatter)
             logger.addHandler(fh)

@@ -232,50 +157,30 @@ def heartbeat(self):
             self.beat()
             time.sleep(self.heartbeat_interval)

-    def claim_tasks(self, count=1) -> List[Optional[ScopedKey]]:
+    def claim_tasks(
+        self, count=1, protocols: Optional[List[str]] = None
+    ) -> List[Optional[ScopedKey]]:
         """Get a Task to execute from compute API.

         Returns `None` if no Task was available matching service configuration.

+        Parameters
+        ----------
+        count
+            The maximum number of Tasks to claim.
+        protocols
+            Protocol names to restrict Task claiming to. `None` means no restriction.
+            Regex patterns are allowed.
+
         """
-        # list of tasks to return
-        tasks = []
-        taskhubs: Dict[ScopedKey, TaskHub] = self.client.query_taskhubs(
-            scopes=self.scopes, return_gufe=True
+        tasks = self.client.claim_tasks(
+            scopes=self.scopes,
+            compute_service_id=self.compute_service_id,
+            count=count,
+            protocols=protocols,
         )

-        if len(taskhubs) == 0:
-            return []
-
-        # claim tasks from taskhubs based on weight; keep going till we hit our
-        # total desired task count, or we run out of taskhubs to draw from
-        while len(tasks) < count and len(taskhubs) > 0:
-            weights = [th.weight for th in taskhubs.values()]
-
-            if sum(weights) == 0:
-                break
-
-            # based on weights, choose taskhub to draw from
-            taskhub: List[ScopedKey] = random.choices(
-                list(taskhubs.keys()), weights=weights
-            )[0]
-
-            # claim tasks from the taskhub
-            claimed_tasks = self.client.claim_taskhub_tasks(
-                taskhub,
-                compute_service_id=self.compute_service_id,
-                count=(count - len(tasks)),
-            )
-
-            # gather up claimed tasks, if present
-            for t in claimed_tasks:
-                if t is not None:
-                    tasks.append(t)
-
-            # remove this taskhub from the options available; repeat
-            taskhubs.pop(taskhub)
-
         return tasks

     def task_to_protocoldag(
@@ -289,9 +194,10 @@ def task_to_protocoldag(

         """

-        transformation, extends_protocoldagresult = self.client.get_task_transformation(
-            task
-        )
+        (
+            transformation,
+            extends_protocoldagresult,
+        ) = self.client.retrieve_task_transformation(task)

         protocoldag = transformation.create(
             extends=extends_protocoldagresult,
@@ -346,7 +252,7 @@ def execute(self, task: ScopedKey) -> ScopedKey:
                 scratch_basedir=scratch,
                 keep_scratch=self.keep_scratch,
                 raise_error=False,
-                n_retries=self.n_retries,
+                n_retries=self.settings.n_retries,
             )
         finally:
             if not self.keep_shared:
diff --git a/alchemiscale/compute/settings.py b/alchemiscale/compute/settings.py
new file mode 100644
index 00000000..4e97adba
--- /dev/null
+++ b/alchemiscale/compute/settings.py
@@ -0,0 +1,94 @@
+from pathlib import Path
+from typing import Union, Optional, List, Dict, Tuple
+from pydantic import BaseModel, Field
+
+from ..models import Scope, ScopedKey
+
+
+class ComputeServiceSettings(BaseModel):
+    """Core settings schema for a compute service."""
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    api_url: str = Field(
+        ..., description="URL of the compute API to execute Tasks for."
+    )
+    identifier: str = Field(
+        ..., description="Identifier for the compute identity used for authentication."
+    )
+    key: str = Field(
+        ..., description="Credential for the compute identity used for authentication."
+    )
+    name: str = Field(
+        ...,
+        description=(
+            "The name to give this compute service; used for Task provenance, so "
+            "typically set to a distinct value to distinguish different compute "
+            "resources, e.g. different hosts or HPC clusters."
+        ),
+    )
+    shared_basedir: Path = Field(
+        ..., description="Filesystem path to use for `ProtocolDAG` `shared` space."
+    )
+    scratch_basedir: Path = Field(
+        ..., description="Filesystem path to use for `ProtocolUnit` `scratch` space."
+    )
+    keep_shared: bool = Field(
+        False,
+        description="If True, don't remove shared directories for `ProtocolDAG`s after completion.",
+    )
+    keep_scratch: bool = Field(
+        False,
+        description="If True, don't remove scratch directories for `ProtocolUnit`s after completion.",
+    )
+    n_retries: int = Field(
+        3,
+        description="Number of times to attempt a given Task on failure.",
+    )
+    sleep_interval: int = Field(
+        30, description="Time in seconds to sleep if no Tasks claimed from compute API."
+    )
+    heartbeat_interval: int = Field(
+        300, description="Frequency at which to send heartbeats to compute API."
+    )
+    scopes: Optional[List[Scope]] = Field(
+        None,
+        description="Scopes to limit Task claiming to; defaults to all Scopes accessible by compute identity.",
+    )
+    protocols: Optional[List[str]] = Field(
+        None,
+        description="Names of Protocols to run with this service; `None` means no restriction.",
+    )
+    claim_limit: int = Field(
+        1, description="Maximum number of Tasks to claim at a time from a TaskHub."
+    )
+    loglevel: str = Field(
+        "WARN",
+        description="The loglevel at which to report; see the :mod:`logging` docs for available levels.",
+    )
+    logfile: Optional[Path] = Field(
+        None,
+        description="Path to file for logging output; if not set, logging will only go to STDOUT.",
+    )
+    client_max_retries: int = Field(
+        5,
+        description=(
+            "Maximum number of times to retry a request. "
+            "In the case the API service is unresponsive, an exponential backoff "
+            "is applied with retries until this number is reached. "
+            "If set to -1, retries will continue indefinitely until success."
+        ),
+    )
+    client_retry_base_seconds: float = Field(
+        2.0,
+        description="The base number of seconds to use for exponential backoff. Must be greater than 1.0.",
+    )
+    client_retry_max_seconds: float = Field(
+        60.0,
+        description="Maximum number of seconds to sleep between retries; avoids runaway exponential backoff while allowing for many retries.",
+    )
+    client_verify: bool = Field(
+        True,
+        description="Whether to verify SSL certificate presented by the API server.",
+    )
diff --git a/alchemiscale/storage/cypher.py b/alchemiscale/storage/cypher.py
index 5fda7b03..91d91152 100644
--- a/alchemiscale/storage/cypher.py
+++ b/alchemiscale/storage/cypher.py
@@ -24,3 +24,7 @@ def cypher_list_from_scoped_keys(scoped_keys: List[Optional[ScopedKey]]) -> str:
         if scoped_key:
             data.append('"' + str(scoped_key) + '"')
     return "[" + ", ".join(data) + "]"
+
+
+def cypher_or(items):
+    return "|".join(items)
diff --git a/alchemiscale/storage/statestore.py b/alchemiscale/storage/statestore.py
index a85d718d..7af7f624 100644
--- a/alchemiscale/storage/statestore.py
+++ b/alchemiscale/storage/statestore.py
@@ -17,7 +17,13 @@
 import numpy as np
 import networkx as nx

-from gufe import AlchemicalNetwork, Transformation, NonTransformation, Settings
+from gufe import (
+    AlchemicalNetwork,
+    Transformation,
+    NonTransformation,
+    Settings,
+    Protocol,
+)
 from gufe.tokenization import GufeTokenizable, GufeKey, JSON_HANDLER
 from gufe.protocols import ProtocolUnitFailure

@@ -37,7 +43,7 @@
 )
 from ..strategies import Strategy
 from ..models import Scope, ScopedKey
-from .cypher import cypher_list_from_scoped_keys
+from .cypher import cypher_list_from_scoped_keys, cypher_or
 from ..security.models import CredentialedEntity
 from ..settings import Neo4jStoreSettings

@@ -1652,33 +1658,32 @@ def cancel_tasks(
             none at all.

         """
-        canceled_sks = []
-        for task in tasks:
-            query = """
-            // get our task hub, as well as the task :ACTIONS relationship we want to remove
-            MATCH (th:TaskHub {_scoped_key: $taskhub_scoped_key})-[ar:ACTIONS]->(task:Task {_scoped_key: $task_scoped_key})
-            DELETE ar
+        query = """
+        UNWIND $task_scoped_keys AS task_scoped_key
+        MATCH (th:TaskHub {_scoped_key: $taskhub_scoped_key})-[ar:ACTIONS]->(task:Task {_scoped_key: task_scoped_key})
+        DELETE ar
+
+        WITH task, th
+        CALL {
             WITH task, th
-            CALL {
-                WITH task, th
-                MATCH (task)<-[applies:APPLIES]-(:TaskRestartPattern)-[:ENFORCES]->(th)
-                DELETE applies
-            }
+            MATCH (task)<-[applies:APPLIES]-(:TaskRestartPattern)-[:ENFORCES]->(th)
+            DELETE applies
+        }

-            RETURN task
-            """
-            _task = tx.run(
-                query, taskhub_scoped_key=str(taskhub), task_scoped_key=str(task)
-            ).to_eager_result()
+        RETURN task._scoped_key as task_scoped_key
+        """
+        results = tx.run(
+            query,
+            task_scoped_keys=list(map(str, tasks)),
+            taskhub_scoped_key=str(taskhub),
+        ).to_eager_result()

-            if _task.records:
-                sk = _task.records[0].data()["task"]["_scoped_key"]
-                canceled_sks.append(ScopedKey.from_str(sk))
-            else:
-                canceled_sks.append(None)
+        returned_keys = {record["task_scoped_key"] for record in results.records}
+        filtered_tasks = [
+            task if str(task) in returned_keys else None for task in tasks
+        ]

-        return canceled_sks
+        return filtered_tasks

     def get_taskhub_tasks(
         self, taskhub: ScopedKey, return_gufe=False
@@ -1737,7 +1742,11 @@ def get_taskhub_unclaimed_tasks(
         return [ScopedKey.from_str(t["_scoped_key"]) for t in tasks]

     def claim_taskhub_tasks(
-        self, taskhub: ScopedKey, compute_service_id: ComputeServiceID, count: int = 1
+        self,
+        taskhub: ScopedKey,
+        compute_service_id: ComputeServiceID,
+        count: int = 1,
+        protocols: Optional[List[Union[Protocol, str]]] = None,
     ) -> List[Union[ScopedKey, None]]:
         """Claim a TaskHub Task.

@@ -1758,8 +1767,13 @@ def claim_taskhub_tasks(
             Unique identifier for the compute service claiming the Tasks for execution.
         count
             Claim the given number of Tasks in a single transaction.
+        protocols
+            Protocols to restrict Task claiming to. `None` means no restriction.
+            If an empty list, raises ValueError.

         """
+        if protocols is not None and len(protocols) == 0:
+            raise ValueError("`protocols` must be either `None` or a non-empty list")

         q = f"""
         MATCH (th:TaskHub {{`_scoped_key`: '{taskhub}'}})-[actions:ACTIONS]-(task:Task)
@@ -1768,6 +1782,22 @@ def claim_taskhub_tasks(

         OPTIONAL MATCH (task)-[:EXTENDS]->(other_task:Task)
         WITH task, other_task, actions
+        """
+
+        # filter down to `protocols`, if specified
+        if protocols is not None:
+            # need to extract qualnames if given protocol classes
+            protocols = [
+                protocol.__qualname__ if isinstance(protocol, Protocol) else protocol
+                for protocol in protocols
+            ]
+
+            q += f"""
+            MATCH (task)-[:PERFORMS]->(:Transformation|NonTransformation)-[:DEPENDS_ON]->(protocol:{cypher_or(protocols)})
+            WITH task, other_task, actions
+            """
+
+        q += f"""
         WHERE other_task.status = '{TaskStatusEnum.complete.value}' OR other_task IS NULL

         RETURN task.`_scoped_key`, task.priority, actions.weight
@@ -2873,7 +2903,7 @@ def add_task_restart_patterns(
         RETURN th
         """
         results = self.execute_query(q, taskhub=str(taskhub))
-
+        # raise error if taskhub not found
         if not results.records:
             raise KeyError("No such TaskHub in the database")
diff --git a/alchemiscale/tests/__init__.py b/alchemiscale/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/alchemiscale/tests/integration/__init__.py b/alchemiscale/tests/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/alchemiscale/tests/integration/compute/__init__.py b/alchemiscale/tests/integration/compute/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/alchemiscale/tests/integration/compute/client/__init__.py b/alchemiscale/tests/integration/compute/client/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/alchemiscale/tests/integration/compute/client/conftest.py b/alchemiscale/tests/integration/compute/client/conftest.py
index f4c92f8a..aebc5fe1 100644
--- a/alchemiscale/tests/integration/compute/client/conftest.py
+++ b/alchemiscale/tests/integration/compute/client/conftest.py
@@ -3,7 +3,6 @@
 from time import sleep

 import uvicorn
-import requests

 from alchemiscale.settings import get_base_api_settings
 from alchemiscale.base.api import get_n4js_depends, get_s3os_depends
diff --git a/alchemiscale/tests/integration/compute/client/test_compute_client.py b/alchemiscale/tests/integration/compute/client/test_compute_client.py
index f324a9ec..99777c41 100644
--- a/alchemiscale/tests/integration/compute/client/test_compute_client.py
+++ b/alchemiscale/tests/integration/compute/client/test_compute_client.py
@@ -189,6 +189,36 @@ def test_claim_taskhub_task(
         assert task_sks2[0] in remaining_tasks
         assert task_sks2[1] in remaining_tasks

+    def test_claim_tasks(
+        self,
+        scope_test,
+        n4js_preloaded,
+        compute_client: client.AlchemiscaleComputeClient,
+        compute_service_id,
+        uvicorn_server,
+    ):
+        # register compute service id
+        compute_client.register(compute_service_id)
+
+        # claim a single task; should get highest priority task
+        task_sks = compute_client.claim_tasks(
+            scopes=[scope_test],
+            compute_service_id=compute_service_id,
+        )
+        all_tasks = n4js_preloaded.query_tasks(scope=scope_test)
+        priorities = {
+            task_sk: priority
+            for task_sk, priority in zip(
+                all_tasks, n4js_preloaded.get_task_priority(all_tasks)
+            )
+        }
+
+        assert len(task_sks) == 1
+        assert task_sks[0] in all_tasks
+        assert [t.gufe_key for t in task_sks] == [
+            t.gufe_key for t in all_tasks if priorities[t] == 1
+        ]
+
     def test_get_task_transformation(
         self,
         scope_test,
@@ -215,7 +245,7 @@ def test_get_task_transformation(
         (
             transformation_,
             extends_protocoldagresult,
-        ) = compute_client.get_task_transformation(task_sks[0])
+        ) = compute_client.retrieve_task_transformation(task_sks[0])

         assert transformation_ == transformation
         assert extends_protocoldagresult is None
@@ -249,7 +279,7 @@ def test_set_task_result(
         (
             transformation_,
             extends_protocoldagresult,
-        ) = compute_client.get_task_transformation(task_sks[0])
+        ) = compute_client.retrieve_task_transformation(task_sks[0])

         assert transformation_ == transformation
         assert extends_protocoldagresult is None
@@ -265,7 +295,7 @@ def test_set_task_result(
         (
             transformation2,
             extends_protocoldagresult2,
-        ) = compute_client.get_task_transformation(task_sk2)
+        ) = compute_client.retrieve_task_transformation(task_sk2)

         assert transformation2 == transformation_
         assert extends_protocoldagresult2 == protocoldagresults[0]
diff --git a/alchemiscale/tests/integration/compute/client/test_compute_service.py b/alchemiscale/tests/integration/compute/client/test_compute_service.py
index 9ae4d738..bb097257 100644
--- a/alchemiscale/tests/integration/compute/client/test_compute_service.py
+++ b/alchemiscale/tests/integration/compute/client/test_compute_service.py
@@ -11,6 +11,7 @@
 from alchemiscale.storage.statestore import Neo4jStore
 from alchemiscale.storage.objectstore import S3ObjectStore
 from alchemiscale.compute.service import SynchronousComputeService
+from alchemiscale.compute.settings import ComputeServiceSettings


 class TestSynchronousComputeService:
@@ -20,14 +21,16 @@ class TestSynchronousComputeService:
     def service(self, n4js_preloaded, compute_client, tmpdir):
         with tmpdir.as_cwd():
             return SynchronousComputeService(
-                api_url=compute_client.api_url,
-                identifier=compute_client.identifier,
-                key=compute_client.key,
-                name="test_compute_service",
-                shared_basedir=Path("shared").absolute(),
-                scratch_basedir=Path("scratch").absolute(),
-                heartbeat_interval=1,
-                sleep_interval=1,
+                ComputeServiceSettings(
+                    api_url=compute_client.api_url,
+                    identifier=compute_client.identifier,
+                    key=compute_client.key,
+                    name="test_compute_service",
+                    shared_basedir=Path("shared").absolute(),
+                    scratch_basedir=Path("scratch").absolute(),
+                    heartbeat_interval=1,
+                    sleep_interval=1,
+                )
             )

     def test_heartbeat(self, n4js_preloaded, service):
diff --git a/alchemiscale/tests/integration/compute/conftest.py b/alchemiscale/tests/integration/compute/conftest.py
index e75a55ab..d66b20c5 100644
--- a/alchemiscale/tests/integration/compute/conftest.py
+++ b/alchemiscale/tests/integration/compute/conftest.py
@@ -140,7 +140,9 @@ def get_token_data_depends_override():


 @pytest.fixture
-def compute_api_no_auth(s3os, scope_consistent_token_data_depends_override):
+def compute_api_no_auth(
+    n4js_preloaded, s3os, scope_consistent_token_data_depends_override
+):
     def get_s3os_override():
         return s3os
diff --git a/alchemiscale/tests/integration/compute/test_compute_api.py b/alchemiscale/tests/integration/compute/test_compute_api.py
index 8ab1c7a5..19cda547 100644
--- a/alchemiscale/tests/integration/compute/test_compute_api.py
+++ b/alchemiscale/tests/integration/compute/test_compute_api.py
@@ -63,13 +63,15 @@ def out_of_scoped_keys(self, n4js_preloaded, network_tyk2, multiple_scopes):
         assert len(task_sks) > 0
         return {"network": network_sk, "taskhub": tq_sk, "tasks": task_sks}

-    def test_get_task_transformation(
+    def test_retrieve_task_transformation(
         self,
         n4js_preloaded,
         test_client,
         scoped_keys,
     ):
-        response = test_client.get(f"/tasks/{scoped_keys['tasks'][0]}/transformation")
+        response = test_client.get(
+            f"/tasks/{scoped_keys['tasks'][0]}/transformation/gufe"
+        )
         assert response.status_code == 200
         data = response.json()
         assert len(data) == 2
diff --git a/alchemiscale/tests/integration/conftest.py b/alchemiscale/tests/integration/conftest.py
index ed9e2b31..d7fb1c96 100644
--- a/alchemiscale/tests/integration/conftest.py
+++ b/alchemiscale/tests/integration/conftest.py
@@ -264,6 +264,20 @@ def s3os(s3objectstore_settings):

 # test alchemical networks

+
+## define varying protocols to simulate protocol variety
+class DummyProtocolA(DummyProtocol):
+    pass
+
+
+class DummyProtocolB(DummyProtocol):
+    pass
+
+
+class DummyProtocolC(DummyProtocol):
+    pass
+
+
 # TODO: add in atom mapping once `gufe`#35 is settled


@@ -294,7 +308,7 @@ def network_tyk2():
             Transformation(
                 stateA=complexes[edge[0]],
                 stateB=complexes[edge[1]],
-                protocol=DummyProtocol(settings=DummyProtocol.default_settings()),
+                protocol=DummyProtocolA(settings=DummyProtocolA.default_settings()),
                 name=f"{edge[0]}_to_{edge[1]}_complex",
             )
             for edge in tyk2s.connections
@@ -303,7 +317,7 @@ def network_tyk2():
             Transformation(
                 stateA=solvated[edge[0]],
                 stateB=solvated[edge[1]],
-                protocol=DummyProtocol(settings=DummyProtocol.default_settings()),
+                protocol=DummyProtocolB(settings=DummyProtocolB.default_settings()),
                 name=f"{edge[0]}_to_{edge[1]}_solvent",
             )
             for edge in tyk2s.connections
@@ -313,7 +327,7 @@ def network_tyk2():
     for cs in list(solvated.values()) + list(complexes.values()):
         nt = NonTransformation(
             system=cs,
-            protocol=DummyProtocol(DummyProtocol.default_settings()),
+            protocol=DummyProtocolC(DummyProtocolC.default_settings()),
             name=f"f{cs.name}_nt",
         )
         nontransformations.append(nt)
diff --git a/alchemiscale/tests/integration/interface/__init__.py b/alchemiscale/tests/integration/interface/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/alchemiscale/tests/integration/interface/client/__init__.py b/alchemiscale/tests/integration/interface/client/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/alchemiscale/tests/integration/storage/__init__.py b/alchemiscale/tests/integration/storage/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/alchemiscale/tests/integration/storage/test_statestore.py b/alchemiscale/tests/integration/storage/test_statestore.py
index 9a5e71ef..9d192d6c 100644
--- a/alchemiscale/tests/integration/storage/test_statestore.py
+++ b/alchemiscale/tests/integration/storage/test_statestore.py
@@ -38,6 +38,7 @@
     tasks_are_not_actioned_on_taskhub,
     tasks_are_waiting,
 )
+from ..conftest import DummyProtocolA, DummyProtocolB, DummyProtocolC


 class TestStateStore: ...
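The protocol-split tests below exercise the new protocols filter end to end. The filter itself rests on the small cypher_or helper added in alchemiscale/storage/cypher.py above: protocol names are joined into a Cypher label alternation, so the claim query in statestore.py only matches Tasks whose Transformation DEPENDS_ON one of the named Protocol nodes. A minimal sketch of the fragment it produces:

def cypher_or(items):
    # same one-liner as the helper added in alchemiscale/storage/cypher.py
    return "|".join(items)

protocols = ["DummyProtocolA", "DummyProtocolB"]

# spliced into the claim query built by claim_taskhub_tasks in statestore.py
fragment = (
    "MATCH (task)-[:PERFORMS]->(:Transformation|NonTransformation)"
    f"-[:DEPENDS_ON]->(protocol:{cypher_or(protocols)})"
)
print(fragment)
# MATCH (task)-[:PERFORMS]->(:Transformation|NonTransformation)-[:DEPENDS_ON]->(protocol:DummyProtocolA|DummyProtocolB)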
@@ -1290,17 +1291,18 @@ def test_cancel_task(self, n4js, network_tyk2, scope_test):
         assert fake_canceled[0] is None

         # check that the hub has the contents we expect
-        q = f"""MATCH (tq:TaskHub {{_scoped_key: '{taskhub_sk}'}})-[:ACTIONS]->(task:Task)
-        return task
-        """
+        q = """
+        MATCH (:TaskHub {_scoped_key: $taskhub_scoped_key})-[:ACTIONS]->(task:Task)
+        RETURN task._scoped_key AS task_scoped_key
+        """

-        tasks = n4js.execute_query(q)
-        tasks = [record["task"] for record in tasks.records]
+        tasks = n4js.execute_query(q, taskhub_scoped_key=str(taskhub_sk))
+        tasks = [
+            ScopedKey.from_str(record["task_scoped_key"]) for record in tasks.records
+        ]

         assert len(tasks) == 8
-        assert set([ScopedKey.from_str(t["_scoped_key"]) for t in tasks]) == set(
-            actioned
-        ) - set(canceled)
+        assert set(tasks) == set(actioned) - set(canceled)

         # create a TaskRestartPattern
         n4js.add_task_restart_patterns(taskhub_sk, ["Test pattern"], 1)
@@ -1428,6 +1430,52 @@ def test_claim_taskhub_tasks(self, n4js: Neo4jStore, network_tyk2, scope_test):
         claimed6 = n4js.claim_taskhub_tasks(taskhub_sk, csid, count=2)
         assert claimed6 == [None] * 2

+    def test_claim_taskhub_tasks_protocol_split(
+        self, n4js: Neo4jStore, network_tyk2, scope_test
+    ):
+        an = network_tyk2
+        network_sk, taskhub_sk, _ = n4js.assemble_network(an, scope_test)
+
+        def reducer(collection, transformation):
+            protocol = transformation.protocol.__class__
+            if len(collection[protocol]) >= 3:
+                return collection
+            sk = n4js.get_scoped_key(transformation, scope_test)
+            collection[protocol].append(sk)
+            return collection
+
+        transformations = reduce(
+            reducer,
+            an.edges,
+            {DummyProtocolA: [], DummyProtocolB: [], DummyProtocolC: []},
+        )
+
+        transformation_sks = [
+            value for _, values in transformations.items() for value in values
+        ]
+
+        task_sks = n4js.create_tasks(transformation_sks)
+        assert len(task_sks) == 9
+
+        # action the tasks
+        n4js.action_tasks(task_sks, taskhub_sk)
+        assert len(n4js.get_taskhub_unclaimed_tasks(taskhub_sk)) == 9
+
+        csid = ComputeServiceID("another task handler")
+        n4js.register_computeservice(ComputeServiceRegistration.from_now(csid))
+
+        claimedA = n4js.claim_taskhub_tasks(
+            taskhub_sk, csid, protocols=["DummyProtocolA"], count=9
+        )
+
+        assert len([sk for sk in claimedA if sk]) == 3
+
+        claimedBC = n4js.claim_taskhub_tasks(
+            taskhub_sk, csid, protocols=["DummyProtocolB", "DummyProtocolC"], count=9
+        )
+
+        assert len([sk for sk in claimedBC if sk]) == 6
+
     def test_claim_taskhub_tasks_deregister(
         self, n4js: Neo4jStore, network_tyk2, scope_test
     ):
diff --git a/alchemiscale/tests/unit/__init__.py b/alchemiscale/tests/unit/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/devtools/conda-envs/alchemiscale-client.yml b/devtools/conda-envs/alchemiscale-client.yml
index 6f583986..fc462e9c 100644
--- a/devtools/conda-envs/alchemiscale-client.yml
+++ b/devtools/conda-envs/alchemiscale-client.yml
@@ -2,35 +2,34 @@ name: alchemiscale-client
 channels:
   - jaimergp/label/unsupported-cudatoolkit-shim
   - conda-forge
-  - openeye
 dependencies:
   - pip
-  - python =3.10
+  - python=3.12

   # alchemiscale dependencies
-  - gufe=0.9.5
-  - openfe=0.14.0
-  - openmmforcefields>=0.12.0
+  - gufe=1.0.0
+  - openfe=1.1.0
   - requests
   - click
   - httpx
   - pydantic<2.0
+  - async-lru

-  ## user client printing
+  ## user client
   - rich
+  - nest-asyncio

-  # perses dependencies
-  - openeye-toolkits
-  - openmoltools
-  - cloudpathlib
-  - dask
-  - distributed
-  - numba
-  - pymbar >=3.0.6,<4
+  # openmm protocols
+  - feflow=0.1.0
+
+  # additional pins
+  - openmm=8.1.2
+  - openmmforcefields>=0.14.1
+
+  # alchemiscale-fah dependencies
+  - cryptography
+  - plyvel

   - pip:
-    - nest_asyncio
-    - async_lru
-    - git+https://github.com/openforcefield/alchemiscale.git@v0.4.0
-    - git+https://github.com/choderalab/perses.git@protocol-neqcyc
+    - git+https://github.com/OpenFreeEnergy/alchemiscale.git@v0.5.1
diff --git a/devtools/conda-envs/alchemiscale-compute.yml b/devtools/conda-envs/alchemiscale-compute.yml
index 56b21cef..eb4dc7b7 100644
--- a/devtools/conda-envs/alchemiscale-compute.yml
+++ b/devtools/conda-envs/alchemiscale-compute.yml
@@ -1,32 +1,27 @@
 name: alchemiscale-compute
 channels:
   - conda-forge
-  - openeye
 dependencies:
   - pip
-  - python =3.10
-  - cudatoolkit <=11.7 # many actual compute resources are not yet compatible with cudatoolkit >=11.8
-
+  - python =3.12
+  - cudatoolkit =11.8
+
   # alchemiscale dependencies
-  - gufe=0.9.5
-  - openfe=0.14.0
-  - openmmforcefields>=0.12.0
+  - gufe=1.0.0
+  - openfe=1.1.0
   - requests
   - click
   - httpx
   - pydantic<2.0
+  - async-lru
+
+  # openmm protocols
+  - feflow=0.1.0

-  # perses dependencies
-  - openeye-toolkits
-  - openmoltools
-  - cloudpathlib
-  - dask
-  - distributed
-  - numba
-  - pymbar >=3.0.6,<4
+  # additional pins
+  - openmm=8.1.2
+  - openmmforcefields>=0.14.1

   - pip:
-    - async_lru
-    - git+https://github.com/openforcefield/alchemiscale.git@v0.4.0
-    - git+https://github.com/choderalab/perses.git@protocol-neqcyc
+    - git+https://github.com/OpenFreeEnergy/alchemiscale.git@v0.5.1
diff --git a/devtools/conda-envs/alchemiscale-server.yml b/devtools/conda-envs/alchemiscale-server.yml
index f8909d40..e7205497 100644
--- a/devtools/conda-envs/alchemiscale-server.yml
+++ b/devtools/conda-envs/alchemiscale-server.yml
@@ -2,20 +2,19 @@ name: alchemiscale-server
 channels:
   - jaimergp/label/unsupported-cudatoolkit-shim
   - conda-forge
-  - openeye
 dependencies:
   - pip
-  - python =3.10
+  - python=3.12

   # alchemiscale dependencies
-  - gufe=0.9.5
-  - openfe=0.14.0
-
-  - openmmforcefields>=0.12.0
+  - gufe=1.0.0
+  - openfe=1.1.0
   - requests
   - click
   - pydantic<2.0
+  - async-lru

   ## state store
   - neo4j-python-driver
@@ -37,16 +36,18 @@ dependencies:
   - httpx
   - cryptography

-  # perses dependencies
-  - openeye-toolkits
-  - openmoltools
-  - cloudpathlib
-  - dask
-  - distributed
-  - numba
-  - pymbar >=3.0.6,<4
+  # openmm protocols
+  - feflow=0.1.0
+
+  # additional pins
+  - openmm=8.1.2
+  - openmmforcefields>=0.14.1
+
+  # deployment
+  - curl # used in healthchecks for API services
+
+  # alchemiscale-fah dependencies
+  - plyvel

   - pip:
-    - async_lru
-    - git+https://github.com/openforcefield/alchemiscale.git@v0.4.0
-    - git+https://github.com/choderalab/perses.git@protocol-neqcyc
+    - git+https://github.com/OpenFreeEnergy/alchemiscale.git@v0.5.1
diff --git a/devtools/conda-envs/test.yml b/devtools/conda-envs/test.yml
index fdab69d7..ec9a6955 100644
--- a/devtools/conda-envs/test.yml
+++ b/devtools/conda-envs/test.yml
@@ -8,8 +8,7 @@ dependencies:

   # alchemiscale dependencies
   - gufe>=1.0.0
-  - openfe>=1.0.1
-  - openmmforcefields>=0.12.0
+  - openfe>=1.1.0
   - pydantic<2.0

   ## state store
@@ -36,6 +35,9 @@ dependencies:
   - httpx
   - cryptography

+  # openmm protocols
+  - feflow>=0.1.0
+
   ## cli
   - click

@@ -46,6 +48,10 @@ dependencies:
   - coverage
   - moto

+  # additional pins
+  - openmm=8.1.2
+  - openmmforcefields>=0.14.1
+
   - pip:
     - async_lru
     - git+https://github.com/datryllic/grolt@neo4j-5.x # neo4j test server deployment
diff --git a/devtools/configs/synchronous-compute-settings.yaml b/devtools/configs/synchronous-compute-settings.yaml
index a9c29ab5..23a9d9f2 100644
--- a/devtools/configs/synchronous-compute-settings.yaml
+++ b/devtools/configs/synchronous-compute-settings.yaml
@@ -44,6 +44,9 @@ init:
     scopes:
       - '*-*-*'

+  # Names of Protocols to run with this service; `None` means no restriction
+  protocols: null
+
   # Maximum number of Tasks to claim at a time from a TaskHub.
   claim_limit: 1
diff --git a/docker/alchemiscale-compute/Dockerfile b/docker/alchemiscale-compute/Dockerfile
index eb023100..cd56ffc9 100644
--- a/docker/alchemiscale-compute/Dockerfile
+++ b/docker/alchemiscale-compute/Dockerfile
@@ -1,7 +1,7 @@
-FROM mambaorg/micromamba:1.4.1
+FROM mambaorg/micromamba:1.5.10

-LABEL org.opencontainers.image.source=https://github.com/openforcefield/alchemiscale-compute
-LABEL org.opencontainers.image.description="deployable compute services for an alchemiscale server"
+LABEL org.opencontainers.image.source=https://github.com/OpenFreeEnergy/alchemiscale
+LABEL org.opencontainers.image.description="deployable compute services for alchemiscale"
 LABEL org.opencontainers.image.licenses=MIT

 # Don't buffer stdout & stderr streams, so if there is a crash no partial buffer output is lost
diff --git a/docker/alchemiscale-server/.env.testing b/docker/alchemiscale-server/.env.testing
index 67fe7fbf..2cf76a75 100644
--- a/docker/alchemiscale-server/.env.testing
+++ b/docker/alchemiscale-server/.env.testing
@@ -24,4 +24,4 @@ ACME_EMAIL=foo@bar.com
 HOST_DOMAIN=localhost

 # alchemiscale
-ALCHEMISCALE_DOCKER_IMAGE=ghcr.io/openforcefield/alchemiscale:feat-add_docker_compose
+ALCHEMISCALE_DOCKER_IMAGE=ghcr.io/OpenFreeEnergy/alchemiscale:latest
diff --git a/docker/alchemiscale-server/Dockerfile b/docker/alchemiscale-server/Dockerfile
index 5882240f..0ca1b2c5 100644
--- a/docker/alchemiscale-server/Dockerfile
+++ b/docker/alchemiscale-server/Dockerfile
@@ -1,6 +1,6 @@
-FROM mambaorg/micromamba:1.4.1
+FROM mambaorg/micromamba:1.5.10

-LABEL org.opencontainers.image.source=https://github.com/openforcefield/alchemiscale
+LABEL org.opencontainers.image.source=https://github.com/OpenFreeEnergy/alchemiscale
 LABEL org.opencontainers.image.description="a high-throughput alchemical free energy execution system for use with HPC, cloud, bare metal, and Folding@Home"
 LABEL org.opencontainers.image.licenses=MIT
diff --git a/docker/alchemiscale-server/docker-compose.yml b/docker/alchemiscale-server/docker-compose.yml
index a9a7e9e6..ee1c3111 100644
--- a/docker/alchemiscale-server/docker-compose.yml
+++ b/docker/alchemiscale-server/docker-compose.yml
@@ -20,6 +20,7 @@ services:
     ports:
       - 7687:7687
      - 7474:7474
+    restart: unless-stopped
     # Uncomment the volumes to be mounted to make them accessible from outside the container.
     volumes:
       #- ./neo4j.conf:/conf/neo4j.conf # This is the main configuration file.
@@ -75,13 +76,14 @@ services:
     depends_on:
       - neo4j
       - alchemiscale-db-init
+    restart: unless-stopped
     command: "api --host 0.0.0.0 --port 1840 --workers 2"
     labels:
       - "traefik.enable=true"
       - "traefik.http.routers.alchemiscale-client-API.rule=Host(`api.${HOST_DOMAIN:?err}`)"
       - "traefik.http.routers.alchemiscale-client-API.entrypoints=websecure"
       - "traefik.http.routers.alchemiscale-client-API.tls.certresolver=myresolver"
-      - "traefik.docker.network=docker_internal"
+      - "traefik.docker.network=alchemiscale-server_internal"
     healthcheck:
       test: ["CMD", "/opt/conda/bin/curl", "-f", "http://localhost:1840/ping"]
       interval: 1m
@@ -121,11 +123,12 @@ services:
         condition: service_healthy
       alchemiscale-db-init:
         condition: service_completed_successfully
+    restart: unless-stopped
     command: "compute api --host 0.0.0.0 --port 1841 --workers 2"
     labels:
       - "traefik.enable=true"
       - "traefik.http.routers.alchemiscale-compute-API.rule=Host(`compute.${HOST_DOMAIN:?err}`)"
-      - "traefik.docker.network=docker_internal"
+      - "traefik.docker.network=alchemiscale-server_internal"
       - "traefik.http.routers.alchemiscale-compute-API.entrypoints=websecure"
       - "traefik.http.routers.alchemiscale-compute-API.tls.certresolver=myresolver"
     healthcheck:
@@ -179,6 +182,7 @@ services:
     depends_on:
       - alchemiscale-client-API
       - alchemiscale-compute-API
+    restart: unless-stopped
     command:
       - "--log.level=DEBUG"
       - "--providers.docker"
diff --git a/docs/compute.rst b/docs/compute.rst
index d93bca58..29724a41 100644
--- a/docs/compute.rst
+++ b/docs/compute.rst
@@ -14,7 +14,7 @@ This documentation will expand over time as these variants become available; fo
 In all cases, you will need to define a configuration file for your compute services to consume on startup.
 A template for this file can be found here; replace ``$ALCHEMISCALE_VERSION`` with the version tag, e.g. ``v0.1.4``, you have deployed for your server::

-    https://raw.githubusercontent.com/openforcefield/alchemiscale/$ALCHEMISCALE_VERSION/devtools/configs/synchronous-compute-settings.yaml
+    https://raw.githubusercontent.com/OpenFreeEnergy/alchemiscale/$ALCHEMISCALE_VERSION/devtools/configs/synchronous-compute-settings.yaml


 ***********
@@ -35,7 +35,7 @@ Deploying with conda/mamba
 To deploy via ``conda``/``mamba``, first create an environment (we recommend ``mamba`` for its performance)::

     mamba env create -n alchemiscale-compute-$ALCHEMISCALE_VERSION \
-      -f https://raw.githubusercontent.com/openforcefield/alchemiscale/$ALCHEMISCALE_VERSION/devtools/conda-envs/alchemiscale-compute.yml
+      -f https://raw.githubusercontent.com/OpenFreeEnergy/alchemiscale/$ALCHEMISCALE_VERSION/devtools/conda-envs/alchemiscale-compute.yml

 Once created, activate the environment in your current shell::

@@ -55,7 +55,7 @@ Assuming your configuration file is in the current working directory, to deploy

     docker run --gpus all \
                --rm \
-               -v $(pwd):/mnt ghcr.io/openforcefield/alchemiscale-compute:$ALCHEMISCALE_VERSION \
+               -v $(pwd):/mnt ghcr.io/OpenFreeEnergy/alchemiscale-compute:$ALCHEMISCALE_VERSION \
                compute synchronous -c /mnt/synchronous-compute-settings.yaml


@@ -157,7 +157,7 @@ We define a k8s `Deployment`_ featuring a single container spec as the file ``co
       spec:
         containers:
         - name: alchemiscale-synchronous-container
-          image: ghcr.io/openforcefield/alchemiscale-compute:$ALCHEMISCALE_VERSION
+          image: ghcr.io/OpenFreeEnergy/alchemiscale-compute:$ALCHEMISCALE_VERSION
           args: ["compute", "synchronous", "-c", "/mnt/settings/synchronous-compute-settings.yaml"]
           resources:
             limits:
diff --git a/docs/deployment.rst b/docs/deployment.rst
index c6efb46b..9c6766cb 100644
--- a/docs/deployment.rst
+++ b/docs/deployment.rst
@@ -36,7 +36,7 @@
 First install the `docker engine