From 12b1ce87dc805fee447c70666602c7173ac78d0e Mon Sep 17 00:00:00 2001 From: "Genheden, Samuel" Date: Mon, 27 May 2024 10:05:33 +0200 Subject: [PATCH 1/3] Adding updates for 1.5.0 release --- CHANGELOG.md | 11 ++ docs/conf.py | 2 +- pyproject.toml | 2 +- .../disconnection_sites/atom_map_tagging.py | 139 +++++++++++++ .../disconnection_sites/tag_converting.py | 185 ++++++++++++++++++ rxnutils/chem/rinchi/download_rinchi.py | 16 +- rxnutils/chem/utils.py | 3 +- rxnutils/data/batch_utils.py | 3 + rxnutils/pipeline/actions/reaction_mod.py | 66 ++++++- rxnutils/routes/base.py | 52 ++--- tests/data/mapped_tests_reactions.csv | 2 +- tests/test_batch_utils.py | 32 +++ tests/test_product_tagging.py | 77 ++++++++ tests/test_reaction_mods_actions.py | 47 +++++ tests/test_rinchi.py | 10 +- tests/test_routes_base.py | 43 ++++ 16 files changed, 649 insertions(+), 41 deletions(-) create mode 100644 rxnutils/chem/disconnection_sites/atom_map_tagging.py create mode 100644 rxnutils/chem/disconnection_sites/tag_converting.py create mode 100644 tests/test_product_tagging.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 67ff150..103f4ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # CHANGELOG +## Version 1.5.0 - 2024-05-27 + +### Features + +- Adding support for tagging reaction sites in SMILES + +### Miscellaneous + +- Improving batch routines +- Updating InChI tools download URL + ## Version 1.4.0 - 2024-03-12 ### Features diff --git a/docs/conf.py b/docs/conf.py index 8800d57..55d1936 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -6,7 +6,7 @@ project = "ReactionUtils" copyright = "2022, Molecular AI group" author = "Molecular AI group" -release = "1.4.0" +release = "1.5.0" extensions = [ "sphinx.ext.autodoc", diff --git a/pyproject.toml b/pyproject.toml index 3d4afb5..e7543e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "reaction_utils" -version = "1.4.0" +version = "1.5.0" description = "Utilities for working with 
reactions, reaction templates and template extraction" authors = ["Genheden, Samuel ", "Kannas, Christos "] license = "Apache-2.0" diff --git a/rxnutils/chem/disconnection_sites/atom_map_tagging.py b/rxnutils/chem/disconnection_sites/atom_map_tagging.py new file mode 100644 index 0000000..83f7c2b --- /dev/null +++ b/rxnutils/chem/disconnection_sites/atom_map_tagging.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +import argparse +from collections import OrderedDict +from typing import List, Optional, Sequence + +import pandas as pd +from rdkit import Chem + + +def _get_atom_identifier(atom: Chem.rdchem.Atom) -> str: + """ + Get atom identifier for neighborhood identification. + The identifier is either the atom-map number if available, otherwise the symbol. + :param atom: rdkit atom + :return: an atom identifier string + """ + atom_id = atom.GetAtomMapNum() + if atom_id == 0: + atom_id = atom.GetSymbol() + return str(atom_id) + + +def _get_bond_environment_identifier( + atoms: Sequence[Chem.rdchem.Atom], bond: Chem.rdchem.Bond +) -> str: + """ + Get the environment of a specific bond. + + :param atoms: atoms in the molecule. + :param bond: bond for which the environment should be specified + :return: string representation of the bond environment + """ + atom_map1 = _get_atom_identifier(atoms[bond.GetBeginAtomIdx()]) + atom_map2 = _get_atom_identifier(atoms[bond.GetEndAtomIdx()]) + bond_order = bond.GetBondType() + atom_map1, atom_map2 = sorted([atom_map1, atom_map2]) + return f"{atom_map1}_{atom_map2}_{bond_order}" + + +def _get_atomic_neighborhoods(smiles: str) -> OrderedDict[int, List[str]]: + """ + Obtains a dictionary containing each atom (atomIdx) and a list of its + bonding environment. + + :param smiles: Atom-mapped SMILES string + :return: A dictionary containing each atom (atomIdx) and a list of its + bonding environment identifiers. 
+ """ + + mol = Chem.MolFromSmiles(smiles) + atoms = mol.GetAtoms() + + neighbor_dict = {} + for atom in atoms: + bonds_list = [] + if atom.GetAtomMapNum() != 0: + for bond in atom.GetBonds(): + + bonds_list.append(_get_bond_environment_identifier(atoms, bond)) + + neighbor_dict[atom.GetAtomMapNum()] = sorted(bonds_list) + ordered_neighbor_dict = OrderedDict(sorted(neighbor_dict.items())) + + return ordered_neighbor_dict + + +def get_atom_list(reactants_smiles: str, product_smiles: str) -> List[int]: + """ + Given two sets of SMILES strings corresponding to a set of reactants and products, + returns a list of atomIdxs for which the atomic environment has changed, + as defined by a change in the bonds. + + :param reactants_smiles: Atom-mapped SMILES string for the reactant(s) + :param product_smiles: Atom-mapped SMILES string for the product(s) + :return: List of atoms (atomIdx) for which the atomic environment has changed + """ + + ordered_reactant_neighbor_dict = _get_atomic_neighborhoods(reactants_smiles) + ordered_product_neighbor_dict = _get_atomic_neighborhoods(product_smiles) + + all_indices = set(ordered_product_neighbor_dict.keys()) | set( + ordered_reactant_neighbor_dict.keys() + ) + + # Checks to see equivlence of atomic enviroments. + # If environment changed, then add atom to list + atom_list = [ + atom_map + for atom_map in all_indices + if ordered_reactant_neighbor_dict.get(atom_map, []) + != ordered_product_neighbor_dict.get(atom_map, []) + ] + + return atom_list + + +def atom_map_tag_products(mapped_rxn: str) -> str: + """ + Given atom-mapped reaction, returns disconnection site-tagged product where atoms + with changed atom environment are represented by [:1]. + + :param mapped_rxn: Atom-mapped reaction SMILES + :return: SMILES of the product containing tags corresponding to atoms changed in the + reaction. 
+ """ + reactants_smiles, _, product_smiles = mapped_rxn.split(">") + + product_mol = Chem.MolFromSmiles(product_smiles) + atom_list = get_atom_list(reactants_smiles, product_smiles) + + # Set atoms in product with a different environment in reactants to 1 + for atom in product_mol.GetAtoms(): + if atom.GetAtomMapNum() in atom_list: + atom.SetAtomMapNum(1) + else: + atom.SetAtomMapNum(0) + + return Chem.MolToSmiles(product_mol) + + +def main(args: Optional[Sequence[str]] = None) -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--input") + parser.add_argument("--in_column", default="RxnSmilesClean") + parser.add_argument("--out_column", default="products_atom_map_tagged") + parser.add_argument("--output") + + args = parser.parse_args(args) + + data = pd.read_csv(args.input, sep="\t") + + smiles_col = data[args.in_column].apply(atom_map_tag_products) + data = data.assign(**{args.out_column: smiles_col}) + data.to_csv(args.output, sep="\t", index=False) + + +if __name__ == "__main__": + main() diff --git a/rxnutils/chem/disconnection_sites/tag_converting.py b/rxnutils/chem/disconnection_sites/tag_converting.py new file mode 100644 index 0000000..152e789 --- /dev/null +++ b/rxnutils/chem/disconnection_sites/tag_converting.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +import re +from typing import List, Tuple + +from rdkit import Chem + +from rxnutils.chem.utils import remove_atom_mapping + + +def smiles_tokens(smiles: str) -> List[str]: + """ + Tokenize SMILES using basic regex pattern for Chemformer. + + :param smiles: SMILES to tokenize + :return: List of tokens identified in SMILES. 
+ """ + pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])" + regex = re.compile(pattern) + tokens = [token for token in regex.findall(smiles)] + assert smiles == "".join(tokens) + return tokens + + +def _next_tagged_token( + product_tagged_tokens: List[str], untagged_token: str, tagged_token_idx: int +) -> Tuple[str, int]: + """ + Get the next tagged token in the sequence. Includes checks and fixes for + stereochemistry changes due to removing atom mapping. + + :param product_tagged_tokens: tokens of product tagged with [:1] + :param untagged_token: the current token from the untagged product + :param tagged_token_idx: the current token index of the tagged product + :return: the next (tagged-product) token and the corresponding token index + """ + tagged_token = product_tagged_tokens[tagged_token_idx] + + # Check if the stereo chemistry has changed after removing atom-mapping and + # handle each specific case. + if tagged_token != untagged_token and (tagged_token == "/" or tagged_token == "\\"): + if untagged_token == "/" or untagged_token == "\\": + return untagged_token, tagged_token_idx + else: + tagged_token_idx += 1 + return product_tagged_tokens[tagged_token_idx], tagged_token_idx + + if ( + tagged_token != untagged_token + and not ":1" in tagged_token + and "@" in tagged_token + ): + return untagged_token, tagged_token_idx + + return tagged_token, tagged_token_idx + + +def tagged_smiles_from_tokens( + product_tagged_tokens: List[str], product_untagged_tokens: List[str] +) -> Tuple[str, str]: + """ + Convert the tagged SMILES from atom-mapping to unmapped-token + '!' 
+ + :param product_tagged_tokens: tokens of product tagged with [:1] + :param product_untagged_tokens: tokens of the untagged product + + :return: Tuple of SMILES of the product containing tags corresponding to atoms changed in the + reaction using "!", and SMILES of the (reconstructed) untagged product + """ + + print(product_tagged_tokens) + + product_converted = "" + product_untagged = "" + + tagged_token_idx = 0 + + for untagged_token in product_untagged_tokens: + + tagged_token, tagged_token_idx = _next_tagged_token( + product_tagged_tokens, untagged_token, tagged_token_idx + ) + + if tagged_token != untagged_token and ( + untagged_token == "/" or untagged_token == "\\" + ): + continue + + if tagged_token == untagged_token: + product_converted += untagged_token + else: + # Remove brackets around a single letter + if ( + len(untagged_token) == 3 + and untagged_token.startswith("[") + and untagged_token.endswith("]") + ): + untagged_token = untagged_token[1] + product_converted += untagged_token + "!" + + product_untagged += untagged_token + + tagged_token_idx += 1 + + return product_converted, product_untagged + + +def _canonicalize_tagged_smiles( + product_tagged: str, product_untagged: str = None +) -> Tuple[str, str]: + """ + Reorder the tagged-product SMILES on canonical form using the canonicalized + untagged product. 
+ + :param product_tagged: SMILES of tagged product + :param product_untagged: SMILES of untagged product + :return: canonicalized untagged and tagged product SMILES + """ + mol = Chem.MolFromSmiles(product_tagged) + mol_untagged = Chem.MolFromSmiles(product_untagged) + + _, canonical_atom_order = tuple( + zip( + *sorted( + [(j, i) for i, j in enumerate(Chem.CanonicalRankAtoms(mol_untagged))] + ) + ) + ) + + mol = Chem.RenumberAtoms(mol, canonical_atom_order) + mol_untagged = Chem.RenumberAtoms(mol_untagged, canonical_atom_order) + return Chem.MolToSmiles(mol, canonical=False), Chem.MolToSmiles(mol_untagged) + + +def convert_atom_map_tag(product_atom_map_tagged: str) -> str: + """ + Replace product tagged by atom-mapping [:1] to product tagged by "!". + Returns empty string if no atom-map tagging or the failed to create untagged product. + + :param product_tagged: SMILES of the product containing tags corresponding to + atoms changed in the reaction using [:1] + :return: SMILES of the product containing tags corresponding to atoms changed in the + reaction using "!" + """ + + # Check number of tags + n_tags = len(re.findall(r"\[[^\]]+:1]", product_atom_map_tagged)) + + if n_tags < 1: + return "" + + product_untagged = remove_atom_mapping(product_atom_map_tagged, canonical=False) + + if not Chem.MolFromSmiles(product_untagged): + return "" + + product_tagged, product_untagged = _canonicalize_tagged_smiles( + product_atom_map_tagged, product_untagged + ) + + # Update the SMILES string to remove atom-mapping brackets and explicit [H]:s and + # replace by ! + product_tagged_tokens = smiles_tokens(product_tagged) + product_untagged_tokens = smiles_tokens(product_untagged) + + product_tagged_converted, product_untagged = tagged_smiles_from_tokens( + product_tagged_tokens, product_untagged_tokens + ) + + n_new_tags = product_tagged_converted.count("!") + + if n_new_tags != n_tags: + raise AssertionError( + f"The number of tags is not the same after converting to '!' 
tagging. " + f"product_tagged_atom_map: {product_atom_map_tagged}" + f"product_tagged_converted: {product_tagged_converted}." + ) + + if product_tagged_converted.replace("!", "") != product_untagged: + raise AssertionError( + f"product_tagged.replace('!', '') != product_untagged." + f"product_tagged: {product_tagged_converted}, product_untagged: {product_untagged}" + ) + + return product_tagged_converted diff --git a/rxnutils/chem/rinchi/download_rinchi.py b/rxnutils/chem/rinchi/download_rinchi.py index 32fc3dc..bd2c993 100644 --- a/rxnutils/chem/rinchi/download_rinchi.py +++ b/rxnutils/chem/rinchi/download_rinchi.py @@ -16,7 +16,7 @@ CONFIG = { "download_folder": ".", - "download_url": "http://www.inchi-trust.org/download/RInChI/RInChI-V1-00.zip", + "download_url": "https://www.inchi-trust.org/wp/download/RInChI/RInChI-V1-00.zip", } PATH = os.path.dirname(__file__) @@ -35,9 +35,9 @@ def main() -> str: if sys.platform not in PLATFORM2FOLDER: raise RInChIError("RInChI software not supported on this platform") - rinchi_url = CONFIG.get("download_url") + rinchi_url = CONFIG.get("download_url", "") rinchi_fn = rinchi_url.split("/")[-1] - download_loc = CONFIG.get("download_folder") + download_loc = CONFIG.get("download_folder", "") download_loc = os.path.join(PATH, download_loc) rinchi_fn = os.path.join(download_loc, rinchi_fn) if not os.path.exists(rinchi_fn): @@ -50,18 +50,18 @@ def main() -> str: logging.debug(f"{req.headers}") req.raise_for_status() logging.info(f"Creating: {rinchi_fn}") - with open(rinchi_fn, "wb") as fileobj: - fileobj.write(req.content) + with open(rinchi_fn, "wb") as in_fp: + in_fp.write(req.content) logging.info("Download completed...") logging.info(f"Unziping: {rinchi_fn}") - with ZipFile(rinchi_fn, "r") as fileobj: + with ZipFile(rinchi_fn, "r") as out_fp: bin_path = [ x - for x in fileobj.namelist() + for x in out_fp.namelist() if x.endswith(_exec_folder_ending(os_sep=False) + "/") ] logging.debug(bin_path) - 
fileobj.extractall(download_loc) + out_fp.extractall(download_loc) logging.info("Completed...") rinchi_cli_path = os.path.join(download_loc, bin_path[0]) logging.info(f"RInChI CLI: {rinchi_cli_path}") diff --git a/rxnutils/chem/utils.py b/rxnutils/chem/utils.py index 257fb14..bc3982d 100644 --- a/rxnutils/chem/utils.py +++ b/rxnutils/chem/utils.py @@ -1,4 +1,5 @@ """Module containing various chemical utility routines""" + import logging import functools from typing import List, Tuple @@ -291,7 +292,7 @@ def get_special_groups(mol) -> List[Tuple[Tuple[int, ...], Tuple[int, ...]]]: # Build list groups = [] - for (add_if_match, template) in group_templates: + for add_if_match, template in group_templates: matches = mol.GetSubstructMatches( Chem.MolFromSmarts(template), useChirality=True ) diff --git a/rxnutils/data/batch_utils.py b/rxnutils/data/batch_utils.py index c62a036..f5622b6 100644 --- a/rxnutils/data/batch_utils.py +++ b/rxnutils/data/batch_utils.py @@ -101,6 +101,9 @@ def create_csv_batches( file_size = ( nlines(filename) - 1 ) # Header should not be counted for batch size calculations + nbatches = min( + file_size, nbatches + ) # Adjust the number of batches to the size of the file batch_size, remainder = divmod(file_size, nbatches) stop = 1 # 1-indexed to account for header in the .csv file batches = [] diff --git a/rxnutils/pipeline/actions/reaction_mod.py b/rxnutils/pipeline/actions/reaction_mod.py index 4b4eeb9..5fe65c6 100644 --- a/rxnutils/pipeline/actions/reaction_mod.py +++ b/rxnutils/pipeline/actions/reaction_mod.py @@ -1,12 +1,12 @@ """Module containing actions on reactions that modify the reaction in some way""" + from __future__ import annotations import os -import re import subprocess import sys import tempfile -from dataclasses import dataclass, field +from dataclasses import dataclass from pathlib import Path from typing import ClassVar, List, Tuple @@ -14,13 +14,13 @@ from rdkit import Chem, RDLogger from rdkit.Chem import RDConfig +from 
rxnutils.chem.disconnection_sites.atom_map_tagging import atom_map_tag_products +from rxnutils.chem.disconnection_sites.tag_converting import convert_atom_map_tag from rxnutils.chem.utils import ( atom_mapping_numbers, neutralize_molecules, remove_atom_mapping, desalt_molecules, - split_smiles_from_reaction, - join_smiles_from_reaction, ) from rxnutils.pipeline.base import ReactionActionMixIn, action, global_apply @@ -543,6 +543,64 @@ def _apply_row(self, row: pd.Series) -> pd.Series: ) +@action +@dataclass +class AtomMapTagDisconnectionSite(ReactionActionMixIn): + """Action for tagging disconnection site in products with atom-map '[:1]'.""" + + pretty_name: ClassVar[str] = "atom_map_tag_disconnection_site" + in_column: str = "RxnSmilesClean" + out_column: str = "products_atom_map_tagged" + + def __call__(self, data: pd.DataFrame) -> pd.DataFrame: + smiles_col = global_apply(data, self._row_action, axis=1) + return data.assign(**{self.out_column: smiles_col}) + + def __str__(self) -> str: + return f"{self.pretty_name} (tag disconnection sites in products with '[:1]')" + + def _row_action(self, row: pd.Series) -> str: + return atom_map_tag_products(row[self.in_column]) + + +@action +@dataclass +class ConvertAtomMapDisconnectionTag(ReactionActionMixIn): + """Action for converting atom-map tagging to exclamation mark tagging. 
+ + yaml example: + + convert_atom_map_disconnection_tag: + in_column_tagged: products_atom_map_tagged + in_column_untagged: products + out_column_tagged: products_tagged + out_column_reconstructed: products_reconstructed + """ + + pretty_name: ClassVar[str] = "convert_atom_map_disconnection_tag" + in_column: str = "products_atom_map_tagged" + out_column_tagged: str = "products_tagged" + out_column_reconstructed: str = "products_reconstructed" + + def __call__(self, data: pd.DataFrame) -> pd.DataFrame: + smiles_tagged_col = global_apply(data, self._row_action, axis=1) + smiles_reconstructed_col = smiles_tagged_col.str.replace("!", "") + + return data.assign( + **{ + self.out_column_tagged: smiles_tagged_col, + self.out_column_reconstructed: smiles_reconstructed_col, + } + ) + + def __str__(self) -> str: + return f"{self.pretty_name} (convert disconnection tagging '[:1]' to '!')" + + def _row_action(self, row: pd.Series) -> str: + product_tagged = convert_atom_map_tag(row[self.in_column]) + return product_tagged + + @action @dataclass class TrimRxnSmiles: diff --git a/rxnutils/routes/base.py b/rxnutils/routes/base.py index 71327c7..7974d61 100644 --- a/rxnutils/routes/base.py +++ b/rxnutils/routes/base.py @@ -4,7 +4,7 @@ and drawing the route """ -from typing import Dict, Any, List, Callable +from typing import Dict, Any, List, Callable, Union from copy import deepcopy from operator import itemgetter @@ -24,14 +24,14 @@ class SynthesisRoute: """ This encapsulates a synthesis route or a reaction tree. - It provides convenient methods for assigning atom-mapping + It provide convinient methods for assigning atom-mapping to the reactions, and for providing reaction-level data of the route It is typically initiallized by one of the readers in the `rxnutils.routes.readers` module. - The tree depth and the forward step are automatically assigned + The tree depth and the forward step is automatically assigned to each reaction node. 
:param reaction_tree: the tree structure representing the route @@ -68,11 +68,13 @@ def atom_mapped_reaction_smiles(self) -> List[str]: return smiles def assign_atom_mapping( - self, overwrite: bool = False, only_rxnmapper: bool = False + self, + overwrite: bool = False, + only_rxnmapper: bool = False, ) -> None: """ Assign atom-mapping to each reaction in the route and - ensure that is consistent from root compound and throughout + ensure that is is consistent from root compound and throughout the route. It will use NameRxn to assign classification and possiblty atom-mapping, @@ -92,7 +94,7 @@ def chains( """ Returns linear sequences or chains extracted from the route. - Each chain is a list of dictionaries representing the molecules, only the most + Each chain is a list of a dictionary representing the molecules, only the most complex molecule is kept for each reaction - making the chain a sequence of molecule to molecule transformation. @@ -168,19 +170,25 @@ def reaction_smiles(self) -> List[str]: def remap(self, other: "SynthesisRoute") -> None: """ Remap the reaction so that it follows the mapping of a - root compound in a reference routes + 1) root compound in a reference route, 2) a ref compound given + as a SMILES, or 3) using a raw mapping - :param other: the reference route + :param other: the reference for re-mapping """ - try: - ref = other.mapped_root_smiles - other = self.mapped_root_smiles - except ValueError as err: - # For single-compound routes, we can just ignore this - if str(err).startswith("Single"): + if isinstance(other, SynthesisRoute): + if len(self.reaction_smiles()) == 0 or len(other.reaction_smiles()) == 0: + return + mapping_dict = _find_remapping( + other.mapped_root_smiles, self.mapped_root_smiles + ) + elif isinstance(other, str): + if len(self.reaction_smiles()) == 0: return - raise - mapping_dict = _find_remapping(ref, other) + mapping_dict = _find_remapping(other, self.mapped_root_smiles) + elif isinstance(other, dict): + 
mapping_dict = other + else: + raise ValueError(f"Cannot perform re-mapping using a {type(other)}") _remap_reactions(self.reaction_tree, mapping_dict) def _assign_mapping( @@ -196,11 +204,14 @@ def _assign_mapping( return df = pd.DataFrame({"smiles": list(set(self.reaction_smiles()))}) - nextmove_action = NameRxn(in_column="smiles") + nextmove_action = NameRxn(in_column="smiles", nm_rxn_column="mapped_smiles") rxnmapper_action = RxnMapper(in_column="smiles") df = rxnmapper_action(nextmove_action(df)) if only_rxnmapper: - df["NextMoveRxnSmiles"] = df["RxnmapperRxnSmiles"] + df["mapped_smiles"] = df["RxnmapperRxnSmiles"] + else: + sel = df["NMC"] == "0.0" + df["mapped_smiles"].mask(sel, df["RxnmapperRxnSmiles"], inplace=True) datamap = df.set_index("smiles").to_dict("index") _copy_mapping_from_datamap(self.reaction_tree, datamap) @@ -489,10 +500,7 @@ def _copy_mapping_from_datamap( rxnsmi = f"{reactants}>>{tree_dict['smiles']}" metadata = children[0].get("metadata", {}) metadata["classification"] = datamap[rxnsmi]["NMC"] - if datamap[rxnsmi]["NMC"] == "0.0": - metadata["mapped_reaction_smiles"] = datamap[rxnsmi]["RxnmapperRxnSmiles"] - else: - metadata["mapped_reaction_smiles"] = datamap[rxnsmi]["NextMoveRxnSmiles"] + metadata["mapped_reaction_smiles"] = datamap[rxnsmi]["mapped_smiles"] metadata = children[0]["metadata"] = metadata for grandchild in grandchildren: _copy_mapping_from_datamap(grandchild, datamap) diff --git a/tests/data/mapped_tests_reactions.csv b/tests/data/mapped_tests_reactions.csv index 8658ff5..7e23bdc 100644 --- a/tests/data/mapped_tests_reactions.csv +++ b/tests/data/mapped_tests_reactions.csv @@ -1,3 +1,3 @@ -smiles NextMoveRxnSmiles NMC RxnmapperRxnSmiles +smiles mapped_smiles NMC RxnmapperRxnSmiles Cl.c1ccccc1>>Clc1ccccc1 [cH:3]1[cH:2][cH:1][cH:6][cH:5][cH:4]1.Cl>>[cH:4]1[cH:3][cH:2][c:1]([cH:6][cH:5]1)Cl 10.1.2 [ClH:1].[cH:2]1[cH:3][cH:4][cH:5][cH:6][cH:7]1>>[Cl:1][c:2]1[cH:3][cH:4][cH:5][cH:6][cH:7]1 CO.Clc1ccccc1>>COc1ccccc1 
[CH3:1][OH:2].[cH:6]1[cH:5][cH:4][c:3]([cH:8][cH:7]1)Cl>>[CH3:1][O:2][c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1 1.7.11 Cl[c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1.[CH3:1][OH:2]>>[CH3:1][O:2][c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1 diff --git a/tests/test_batch_utils.py b/tests/test_batch_utils.py index 11e0509..bee7183 100644 --- a/tests/test_batch_utils.py +++ b/tests/test_batch_utils.py @@ -44,6 +44,36 @@ def test_nlines(line_count, create_dummy_file): [ (2, [(0, 1, 6), (1, 6, 11)]), (3, [(0, 1, 5), (1, 5, 8), (2, 8, 11)]), + ( + 10, + [ + (0, 1, 2), + (1, 2, 3), + (2, 3, 4), + (3, 4, 5), + (4, 5, 6), + (5, 6, 7), + (6, 7, 8), + (7, 8, 9), + (8, 9, 10), + (9, 10, 11), + ], + ), + ( + 15, + [ + (0, 1, 2), + (1, 2, 3), + (2, 3, 4), + (3, 4, 5), + (4, 5, 6), + (5, 6, 7), + (6, 7, 8), + (7, 8, 9), + (8, 9, 10), + (9, 10, 11), + ], + ), ], ) def test_csv_chunks(nbatches, expected, create_dummy_file): @@ -57,6 +87,8 @@ def test_csv_chunks(nbatches, expected, create_dummy_file): [ (2, [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]), (3, [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]), + (10, [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]), + (15, [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]), ], ) def test_csv_chunks_end2end(nbatches, expected, create_dummy_file): diff --git a/tests/test_product_tagging.py b/tests/test_product_tagging.py new file mode 100644 index 0000000..71577f9 --- /dev/null +++ b/tests/test_product_tagging.py @@ -0,0 +1,77 @@ +import pytest +from rxnutils.chem.disconnection_sites.atom_map_tagging import ( + atom_map_tag_products, + get_atom_list, +) +from rxnutils.chem.disconnection_sites.tag_converting import ( + convert_atom_map_tag, + tagged_smiles_from_tokens, +) + + +@pytest.mark.parametrize( + ("reactants_smiles", "product_smiles", "expected"), + [ + ( + "[Cl:2].[CH:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1", + "[Cl:2][C:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1", + [1, 2], + ), + ( + "Cl.[CH:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1", + 
"Cl[C:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1", + [1], + ), + ], +) +def test_get_atom_list(reactants_smiles, product_smiles, expected): + atom_list = get_atom_list(reactants_smiles, product_smiles) + assert sorted(atom_list) == expected + + +@pytest.mark.parametrize( + ("reactants_smiles", "product_smiles", "expected"), + [ + ( + "[Cl:2].[CH:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1", + "[Cl:2][C:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1", + "c1cc[c:1]([Cl:1])cc1", + ), + ( + "Cl.[CH:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1", + "Cl[C:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1", + "Cl[c:1]1ccccc1", + ), + ], +) +def test_atom_map_tag_products(reactants_smiles, product_smiles, expected): + tagged_product = atom_map_tag_products(f"{reactants_smiles}>>{product_smiles}") + assert tagged_product == expected + + +@pytest.mark.parametrize( + ("product_smiles", "expected"), + [ + ("c1cc[c:1]([Cl:1])cc1", "Cl!c!1ccccc1"), + ("Cl[c:1]1ccccc1", "Clc!1ccccc1"), + ("Clc1ccccc1", ""), + ], +) +def test_tag_converting(product_smiles, expected): + tagged_product = convert_atom_map_tag(product_smiles) + assert tagged_product == expected + + +@pytest.mark.parametrize( + ("tagged_tokens", "untagged_tokens", "expected"), + [ + (["[C:1]", "[C@H]", "O"], ["C", "[C@@H]", "O"], ("C![C@@H]O", "C[C@@H]O")), + (["[C:1]", "/", "C", "O"], ["C", "C", "O"], ("C!CO", "CCO")), + (["[C:1]", "C", "O"], ["C", "/", "C", "O"], ("C!CO", "CCO")), + (["[C:1]", "/", "C", "O"], ["C", "\\", "C", "O"], ("C!\\CO", "C\\CO")), + (["[C:1]", "C", "O"], ["[C]", "C", "O"], ("C!CO", "CCO")), + ], +) +def test_tagged_smiles_from_tokens(tagged_tokens, untagged_tokens, expected): + output = tagged_smiles_from_tokens(tagged_tokens, untagged_tokens) + assert output == expected diff --git a/tests/test_reaction_mods_actions.py b/tests/test_reaction_mods_actions.py index 1d7c052..5a6c91d 100644 --- a/tests/test_reaction_mods_actions.py +++ b/tests/test_reaction_mods_actions.py @@ -12,9 +12,12 @@ SplitReaction, RemoveUnchangedProducts, 
IsotopeInfo, + AtomMapTagDisconnectionSite, + ConvertAtomMapDisconnectionTag, DesaltMolecules, CONTRIB_INSTALLED, ) +from rxnutils.chem.disconnection_sites.tag_converting import smiles_tokens from rxnutils.pipeline.base import global_apply global_apply.max_workers = 1 @@ -217,6 +220,50 @@ def test_isotope_info(): ] +def test_disconnection_tagging(shared_datadir): + + df = pd.read_csv(shared_datadir / "mapped_tests_reactions.csv", sep="\t") + + action_atom_map_tag = AtomMapTagDisconnectionSite(in_column="RxnmapperRxnSmiles") + action_convert_tag = ConvertAtomMapDisconnectionTag() + + df_atom_map_tag = action_atom_map_tag(df) + df_tag = action_convert_tag(df_atom_map_tag) + + df_ground_truth = pd.Series( + ["Cl!c!1ccccc1", "CO!c!1ccccc1"], name="products_tagged" + ) + + assert df_ground_truth.equals(df_tag["products_tagged"]) + + +def test_smiles_tokenization_unknown_token_error(shared_datadir): + + df = pd.read_csv(shared_datadir / "mapped_tests_reactions.csv", sep="\t") + + action_atom_map_tag = AtomMapTagDisconnectionSite(in_column="RxnmapperRxnSmiles") + df_atom_map_tag = action_atom_map_tag(df) + + product_atom_map_tagged = df_atom_map_tag["products_atom_map_tagged"].values[0] + + with pytest.raises(AssertionError): + smiles_tokens(product_atom_map_tagged + "{") + + +def test_converting_no_atom_map_tag(shared_datadir): + + df = pd.read_csv(shared_datadir / "mapped_tests_reactions.csv", sep="\t") + df["products"] = [rxn.split(">")[-1] for rxn in df.smiles] + + action_convert_tag = ConvertAtomMapDisconnectionTag(in_column="products") + + df_tag = action_convert_tag(df) + + df_ground_truth = pd.Series(["", ""], name="products_tagged") + + assert df_ground_truth.equals(df_tag["products_tagged"]) + + def test_desalting(): smi1 = "OCC.(C.[Na+].[Cl-])>>OC(=O)CC" smi2 = "OCC>>OC(=O)CC" diff --git a/tests/test_rinchi.py b/tests/test_rinchi.py index 5224972..dc30607 100644 --- a/tests/test_rinchi.py +++ b/tests/test_rinchi.py @@ -17,11 +17,13 @@ def rinchi_download(): 
@pytest.mark.xfail(sys.platform not in PLATFORM2FOLDER, reason="Platform not supported") -@pytest.mark.xfail(raises=HTTPError) +@pytest.mark.xfail( + raises=HTTPError, +) # Expect a failure incase of failure to download RInChI archive def test_download_rinchi(mocker, tmpdir): config = { "download_folder": str(tmpdir), - "download_url": "http://www.inchi-trust.org/download/RInChI/RInChI-V1-00.zip", + "download_url": "https://www.inchi-trust.org/wp/download/RInChI/RInChI-V1-00.zip", } mocker.patch("rxnutils.chem.rinchi.download_rinchi.CONFIG", config) @@ -31,7 +33,9 @@ def test_download_rinchi(mocker, tmpdir): @pytest.mark.xfail(sys.platform not in PLATFORM2FOLDER, reason="Platform not supported") -@pytest.mark.xfail(raises=HTTPError) +@pytest.mark.xfail( + raises=HTTPError, +) # Expect a failure incase of failure to download RInChI archive def test_generate_rinchi(rinchi_download): rsmi = ( "[ClH;D0;+0:1]>>" diff --git a/tests/test_routes_base.py b/tests/test_routes_base.py index 15166b5..960a631 100644 --- a/tests/test_routes_base.py +++ b/tests/test_routes_base.py @@ -2,6 +2,7 @@ import pytest import pandas as pd +from rdkit import Chem from rxnutils.routes.base import SynthesisRoute @@ -71,6 +72,48 @@ def test_remap(synthesis_route, setup_mapper): assert route1.atom_mapped_reaction_smiles() != old_reaction_smiles +def test_remap_ref_smiles(synthesis_route, setup_mapper): + route1 = synthesis_route + route1.assign_atom_mapping() + old_reaction_smiles = route1.atom_mapped_reaction_smiles() + reactants, products = old_reaction_smiles[0].split(">>") + rsmi_old = Chem.MolToSmiles(Chem.MolFromSmiles(reactants)) + psmi_old = Chem.MolToSmiles(Chem.MolFromSmiles(products)) + + route1.remap(products) + + reactants, products = route1.atom_mapped_reaction_smiles()[0].split(">>") + rsmi = Chem.MolToSmiles(Chem.MolFromSmiles(reactants)) + psmi = Chem.MolToSmiles(Chem.MolFromSmiles(products)) + assert rsmi == rsmi_old + assert psmi == psmi_old + + 
route1.remap("[CH3:10][O:2][c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1") + + reactants, products = route1.atom_mapped_reaction_smiles()[0].split(">>") + rsmi = Chem.MolToSmiles(Chem.MolFromSmiles(reactants)) + psmi = Chem.MolToSmiles(Chem.MolFromSmiles(products)) + assert rsmi != rsmi_old + assert psmi != psmi_old + + +def test_remap_ref_dict(synthesis_route, setup_mapper): + route1 = synthesis_route + route1.assign_atom_mapping() + old_reaction_smiles = route1.atom_mapped_reaction_smiles() + reactants, products = old_reaction_smiles[0].split(">>") + rsmi_old = Chem.MolToSmiles(Chem.MolFromSmiles(reactants)) + psmi_old = Chem.MolToSmiles(Chem.MolFromSmiles(products)) + + route1.remap({1: 10, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8}) + + reactants, products = route1.atom_mapped_reaction_smiles()[0].split(">>") + rsmi = Chem.MolToSmiles(Chem.MolFromSmiles(reactants)) + psmi = Chem.MolToSmiles(Chem.MolFromSmiles(products)) + assert rsmi != rsmi_old + assert psmi != psmi_old + + def test_extract_chains(synthesis_route): complexity = {"COc1ccccc1": 5, "CO": 0, "Clc1ccccc1": 1, "c1ccccc1": 1, "Cl": 0} From cfd8a6113c48c3aa01b5622a8e48b9f5a087f233 Mon Sep 17 00:00:00 2001 From: "Genheden, Samuel" Date: Mon, 27 May 2024 10:07:25 +0200 Subject: [PATCH 2/3] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 103f4ea..c618ede 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Features - Adding support for tagging reaction sites in SMILES +- Adding more options for re-mapping routes ### Miscellaneous From 7baecdd796351e4a187d3e9fade6570ab4b0a1fc Mon Sep 17 00:00:00 2001 From: "Genheden, Samuel" Date: Mon, 27 May 2024 13:06:18 +0200 Subject: [PATCH 3/3] Fixing typos --- rxnutils/routes/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rxnutils/routes/base.py b/rxnutils/routes/base.py index 7974d61..6f0425f 100644 --- a/rxnutils/routes/base.py +++ b/rxnutils/routes/base.py @@ 
-24,14 +24,14 @@ class SynthesisRoute: """ This encapsulates a synthesis route or a reaction tree. - It provide convinient methods for assigning atom-mapping + It provides convenient methods for assigning atom-mapping to the reactions, and for providing reaction-level data of the route It is typically initiallized by one of the readers in the `rxnutils.routes.readers` module. - The tree depth and the forward step is automatically assigned + The tree depth and the forward step are automatically assigned to each reaction node. :param reaction_tree: the tree structure representing the route @@ -74,7 +74,7 @@ def assign_atom_mapping( ) -> None: """ Assign atom-mapping to each reaction in the route and - ensure that is is consistent from root compound and throughout + ensure that it is consistent from root compound and throughout the route. It will use NameRxn to assign classification and possiblty atom-mapping,