From 12b1ce87dc805fee447c70666602c7173ac78d0e Mon Sep 17 00:00:00 2001
From: "Genheden, Samuel" <samuel.genheden@astrazeneca.com>
Date: Mon, 27 May 2024 10:05:33 +0200
Subject: [PATCH 1/3] Adding updates for 1.5.0 release

---
 CHANGELOG.md                                  |  11 ++
 docs/conf.py                                  |   2 +-
 pyproject.toml                                |   2 +-
 .../disconnection_sites/atom_map_tagging.py   | 139 +++++++++++++
 .../disconnection_sites/tag_converting.py     | 185 ++++++++++++++++++
 rxnutils/chem/rinchi/download_rinchi.py       |  16 +-
 rxnutils/chem/utils.py                        |   3 +-
 rxnutils/data/batch_utils.py                  |   3 +
 rxnutils/pipeline/actions/reaction_mod.py     |  66 ++++++-
 rxnutils/routes/base.py                       |  52 ++---
 tests/data/mapped_tests_reactions.csv         |   2 +-
 tests/test_batch_utils.py                     |  32 +++
 tests/test_product_tagging.py                 |  77 ++++++++
 tests/test_reaction_mods_actions.py           |  47 +++++
 tests/test_rinchi.py                          |  10 +-
 tests/test_routes_base.py                     |  43 ++++
 16 files changed, 649 insertions(+), 41 deletions(-)
 create mode 100644 rxnutils/chem/disconnection_sites/atom_map_tagging.py
 create mode 100644 rxnutils/chem/disconnection_sites/tag_converting.py
 create mode 100644 tests/test_product_tagging.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 67ff150..103f4ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,16 @@
 # CHANGELOG
 
+## Version 1.5.0 - 2024-05-27
+
+### Features
+
+- Adding support for tagging reaction sites in SMILES
+
+### Miscellaneous
+
+- Improving batch routines
+- Updating InChI tools download URL
+
 ## Version 1.4.0 - 2024-03-12
 
 ### Features
diff --git a/docs/conf.py b/docs/conf.py
index 8800d57..55d1936 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -6,7 +6,7 @@
 project = "ReactionUtils"
 copyright = "2022, Molecular AI group"
 author = "Molecular AI group"
-release = "1.4.0"
+release = "1.5.0"
 
 extensions = [
     "sphinx.ext.autodoc",
diff --git a/pyproject.toml b/pyproject.toml
index 3d4afb5..e7543e7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "reaction_utils"
-version = "1.4.0"
+version = "1.5.0"
 description = "Utilities for working with reactions, reaction templates and template extraction"
 authors = ["Genheden, Samuel <samuel.genheden@astrazeneca.com>", "Kannas, Christos <christos.kannas@astrazeneca.com>"]
 license = "Apache-2.0"
diff --git a/rxnutils/chem/disconnection_sites/atom_map_tagging.py b/rxnutils/chem/disconnection_sites/atom_map_tagging.py
new file mode 100644
index 0000000..83f7c2b
--- /dev/null
+++ b/rxnutils/chem/disconnection_sites/atom_map_tagging.py
@@ -0,0 +1,139 @@
+from __future__ import annotations
+
+import argparse
+from collections import OrderedDict
+from typing import List, Optional, Sequence
+
+import pandas as pd
+from rdkit import Chem
+
+
+def _get_atom_identifier(atom: Chem.rdchem.Atom) -> str:
+    """
+    Get atom identifier for neighborhood identification.
+    The identifier is either the atom-map number if available, otherwise the symbol.
+    :param atom: rdkit atom
+    :return: an atom identifier string
+    """
+    atom_id = atom.GetAtomMapNum()
+    if atom_id == 0:
+        atom_id = atom.GetSymbol()
+    return str(atom_id)
+
+
+def _get_bond_environment_identifier(
+    atoms: Sequence[Chem.rdchem.Atom], bond: Chem.rdchem.Bond
+) -> str:
+    """
+    Get the environment of a specific bond.
+
+    :param atoms: atoms in the molecule.
+    :param bond: bond for which the environment should be specified
+    :return: string representation of the bond environment
+    """
+    atom_map1 = _get_atom_identifier(atoms[bond.GetBeginAtomIdx()])
+    atom_map2 = _get_atom_identifier(atoms[bond.GetEndAtomIdx()])
+    bond_order = bond.GetBondType()
+    atom_map1, atom_map2 = sorted([atom_map1, atom_map2])
+    return f"{atom_map1}_{atom_map2}_{bond_order}"
+
+
+def _get_atomic_neighborhoods(smiles: str) -> OrderedDict[int, List[str]]:
+    """
+    Obtains a dictionary containing each mapped atom (keyed by its atom-map
+    number) and a list of its bonding environment.
+
+    :param smiles: Atom-mapped SMILES string
+    :return: A dictionary from atom-map number to the sorted list of bonding
+        environment identifiers of that atom; atoms without atom-map are left out.
+    """
+
+    mol = Chem.MolFromSmiles(smiles)
+    atoms = mol.GetAtoms()
+
+    neighbor_dict = {}
+    for atom in atoms:
+        bonds_list = []
+        if atom.GetAtomMapNum() != 0:
+            for bond in atom.GetBonds():
+
+                bonds_list.append(_get_bond_environment_identifier(atoms, bond))
+
+            neighbor_dict[atom.GetAtomMapNum()] = sorted(bonds_list)
+    ordered_neighbor_dict = OrderedDict(sorted(neighbor_dict.items()))
+
+    return ordered_neighbor_dict
+
+
+def get_atom_list(reactants_smiles: str, product_smiles: str) -> List[int]:
+    """
+    Given two sets of SMILES strings corresponding to a set of reactants and products,
+    returns a list of atom-map numbers for which the atomic environment has changed,
+    as defined by a change in the bonds.
+
+    :param reactants_smiles: Atom-mapped SMILES string for the reactant(s)
+    :param product_smiles: Atom-mapped SMILES string for the product(s)
+    :return: List of atom-map numbers for which the atomic environment has changed
+    """
+
+    ordered_reactant_neighbor_dict = _get_atomic_neighborhoods(reactants_smiles)
+    ordered_product_neighbor_dict = _get_atomic_neighborhoods(product_smiles)
+
+    all_indices = set(ordered_product_neighbor_dict.keys()) | set(
+        ordered_reactant_neighbor_dict.keys()
+    )
+
+    # Checks to see equivalence of atomic environments.
+    # If environment changed, then add atom to list
+    atom_list = [
+        atom_map
+        for atom_map in all_indices
+        if ordered_reactant_neighbor_dict.get(atom_map, [])
+        != ordered_product_neighbor_dict.get(atom_map, [])
+    ]
+
+    return atom_list
+
+
+def atom_map_tag_products(mapped_rxn: str) -> str:
+    """
+    Given atom-mapped reaction, returns disconnection site-tagged product where atoms
+    with changed atom environment are represented by [<atom>:1].
+
+    :param mapped_rxn: Atom-mapped reaction SMILES
+    :return: SMILES of the product containing tags corresponding to atoms changed in the
+        reaction.
+    """
+    reactants_smiles, _, product_smiles = mapped_rxn.split(">")
+
+    product_mol = Chem.MolFromSmiles(product_smiles)
+    atom_list = get_atom_list(reactants_smiles, product_smiles)
+
+    # Set atoms in product with a different environment in reactants to 1
+    for atom in product_mol.GetAtoms():
+        if atom.GetAtomMapNum() in atom_list:
+            atom.SetAtomMapNum(1)
+        else:
+            atom.SetAtomMapNum(0)
+
+    return Chem.MolToSmiles(product_mol)
+
+
+def main(args: Optional[Sequence[str]] = None) -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input")
+    parser.add_argument("--in_column", default="RxnSmilesClean")
+    parser.add_argument("--out_column", default="products_atom_map_tagged")
+    parser.add_argument("--output")
+
+    args = parser.parse_args(args)
+
+    data = pd.read_csv(args.input, sep="\t")
+
+    smiles_col = data[args.in_column].apply(atom_map_tag_products)
+    data = data.assign(**{args.out_column: smiles_col})
+    data.to_csv(args.output, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/rxnutils/chem/disconnection_sites/tag_converting.py b/rxnutils/chem/disconnection_sites/tag_converting.py
new file mode 100644
index 0000000..152e789
--- /dev/null
+++ b/rxnutils/chem/disconnection_sites/tag_converting.py
@@ -0,0 +1,185 @@
+from __future__ import annotations
+
+import re
+from typing import List, Tuple
+
+from rdkit import Chem
+
+from rxnutils.chem.utils import remove_atom_mapping
+
+
+def smiles_tokens(smiles: str) -> List[str]:
+    """
+    Tokenize SMILES using basic regex pattern for Chemformer.
+    :param smiles: SMILES to tokenize
+    :return: List of tokens identified in SMILES.
+    :raises AssertionError: if any part of the SMILES is not matched by a token
+    """
+    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
+    tokens = re.findall(pattern, smiles)
+    if smiles != "".join(tokens):  # explicit raise: assert would vanish under -O
+        raise AssertionError(f"SMILES contains untokenizable characters: {smiles}")
+    return tokens
+
+
+def _next_tagged_token(
+    product_tagged_tokens: List[str], untagged_token: str, tagged_token_idx: int
+) -> Tuple[str, int]:
+    """
+    Get the next tagged token in the sequence. Includes checks and fixes for
+    stereochemistry changes due to removing atom mapping.
+
+    :param product_tagged_tokens: tokens of product tagged with [<atom>:1]
+    :param untagged_token: the current token from the untagged product
+    :param tagged_token_idx: the current token index of the tagged product
+    :return: the next (tagged-product) token and the corresponding token index
+    """
+    tagged_token = product_tagged_tokens[tagged_token_idx]
+
+    # Check if the stereo chemistry has changed after removing atom-mapping and
+    # handle each specific case.
+    if tagged_token != untagged_token and (tagged_token == "/" or tagged_token == "\\"):
+        if untagged_token == "/" or untagged_token == "\\":
+            return untagged_token, tagged_token_idx
+        else:
+            tagged_token_idx += 1
+            return product_tagged_tokens[tagged_token_idx], tagged_token_idx
+
+    if (
+        tagged_token != untagged_token
+        and not ":1" in tagged_token
+        and "@" in tagged_token
+    ):
+        return untagged_token, tagged_token_idx
+
+    return tagged_token, tagged_token_idx
+
+
+def tagged_smiles_from_tokens(
+    product_tagged_tokens: List[str], product_untagged_tokens: List[str]
+) -> Tuple[str, str]:
+    """
+    Convert the tagged SMILES from atom-mapping to unmapped-token + '!'
+
+    :param product_tagged_tokens: tokens of product tagged with [<atom>:1]
+    :param product_untagged_tokens: tokens of the untagged product
+
+    :return: Tuple of SMILES of the product containing tags corresponding to atoms changed in the
+        reaction using "<atom>!", and SMILES of the (reconstructed) untagged product
+    """
+
+    product_converted = ""
+    product_untagged = ""
+
+    # Cursor into the tagged tokens; _next_tagged_token may move it an extra step
+    # when the tagged SMILES has a bond-direction token the untagged one lacks.
+    tagged_token_idx = 0
+
+    for untagged_token in product_untagged_tokens:
+
+        tagged_token, tagged_token_idx = _next_tagged_token(
+            product_tagged_tokens, untagged_token, tagged_token_idx
+        )
+
+        if tagged_token != untagged_token and (
+            untagged_token == "/" or untagged_token == "\\"
+        ):
+            continue
+
+        if tagged_token == untagged_token:
+            product_converted += untagged_token
+        else:
+            # Remove brackets around a single letter
+            if (
+                len(untagged_token) == 3
+                and untagged_token.startswith("[")
+                and untagged_token.endswith("]")
+            ):
+                untagged_token = untagged_token[1]
+            product_converted += untagged_token + "!"
+
+        product_untagged += untagged_token
+
+        tagged_token_idx += 1
+
+    return product_converted, product_untagged
+
+
+def _canonicalize_tagged_smiles(
+    product_tagged: str, product_untagged: str = None
+) -> Tuple[str, str]:
+    """
+    Reorder the tagged-product SMILES into canonical form using the canonicalized
+    untagged product.
+
+    :param product_tagged: SMILES of tagged product
+    :param product_untagged: SMILES of untagged product
+    :return: tagged SMILES in the untagged product's canonical atom order, and the untagged SMILES
+    """
+    mol = Chem.MolFromSmiles(product_tagged)
+    mol_untagged = Chem.MolFromSmiles(product_untagged)
+
+    _, canonical_atom_order = tuple(
+        zip(
+            *sorted(
+                [(j, i) for i, j in enumerate(Chem.CanonicalRankAtoms(mol_untagged))]
+            )
+        )
+    )
+
+    mol = Chem.RenumberAtoms(mol, canonical_atom_order)
+    mol_untagged = Chem.RenumberAtoms(mol_untagged, canonical_atom_order)
+    return Chem.MolToSmiles(mol, canonical=False), Chem.MolToSmiles(mol_untagged)
+
+
+def convert_atom_map_tag(product_atom_map_tagged: str) -> str:
+    """
+    Replace product tagged by atom-mapping [<atom>:1] to product tagged by "<atom>!".
+    Returns "" if there is no atom-map tag or the untagged product cannot be parsed.
+
+    :param product_atom_map_tagged: SMILES of the product containing tags corresponding
+        to atoms changed in the reaction using [<atom>:1]
+    :return: SMILES of the product with the changed atoms tagged using "<atom>!"
+    :raises AssertionError: if the converted SMILES is inconsistent with the input
+    """
+
+    # Check number of tags
+    n_tags = len(re.findall(r"\[[^\]]+:1]", product_atom_map_tagged))
+
+    if n_tags < 1:
+        return ""
+
+    product_untagged = remove_atom_mapping(product_atom_map_tagged, canonical=False)
+
+    if not Chem.MolFromSmiles(product_untagged):
+        return ""
+
+    product_tagged, product_untagged = _canonicalize_tagged_smiles(
+        product_atom_map_tagged, product_untagged
+    )
+
+    # Update the SMILES string to remove atom-mapping brackets and explicit [H]:s and
+    # replace by <atom>!
+    product_tagged_tokens = smiles_tokens(product_tagged)
+    product_untagged_tokens = smiles_tokens(product_untagged)
+
+    product_tagged_converted, product_untagged = tagged_smiles_from_tokens(
+        product_tagged_tokens, product_untagged_tokens
+    )
+
+    n_new_tags = product_tagged_converted.count("!")
+
+    if n_new_tags != n_tags:
+        raise AssertionError(
+            f"The number of tags is not the same after converting to '!' tagging. "
+            f"product_tagged_atom_map: {product_atom_map_tagged}, "
+            f"product_tagged_converted: {product_tagged_converted}."
+        )
+
+    if product_tagged_converted.replace("!", "") != product_untagged:
+        raise AssertionError(
+            f"product_tagged.replace('!', '') != product_untagged. "
+            f"product_tagged: {product_tagged_converted}, product_untagged: {product_untagged}"
+        )
+
+    return product_tagged_converted
diff --git a/rxnutils/chem/rinchi/download_rinchi.py b/rxnutils/chem/rinchi/download_rinchi.py
index 32fc3dc..bd2c993 100644
--- a/rxnutils/chem/rinchi/download_rinchi.py
+++ b/rxnutils/chem/rinchi/download_rinchi.py
@@ -16,7 +16,7 @@
 
 CONFIG = {
     "download_folder": ".",
-    "download_url": "http://www.inchi-trust.org/download/RInChI/RInChI-V1-00.zip",
+    "download_url": "https://www.inchi-trust.org/wp/download/RInChI/RInChI-V1-00.zip",
 }
 PATH = os.path.dirname(__file__)
 
@@ -35,9 +35,9 @@ def main() -> str:
     if sys.platform not in PLATFORM2FOLDER:
         raise RInChIError("RInChI software not supported on this platform")
 
-    rinchi_url = CONFIG.get("download_url")
+    rinchi_url = CONFIG.get("download_url", "")
     rinchi_fn = rinchi_url.split("/")[-1]
-    download_loc = CONFIG.get("download_folder")
+    download_loc = CONFIG.get("download_folder", "")
     download_loc = os.path.join(PATH, download_loc)
     rinchi_fn = os.path.join(download_loc, rinchi_fn)
     if not os.path.exists(rinchi_fn):
@@ -50,18 +50,18 @@ def main() -> str:
         logging.debug(f"{req.headers}")
         req.raise_for_status()
         logging.info(f"Creating: {rinchi_fn}")
-        with open(rinchi_fn, "wb") as fileobj:
-            fileobj.write(req.content)
+        with open(rinchi_fn, "wb") as in_fp:
+            in_fp.write(req.content)
         logging.info("Download completed...")
         logging.info(f"Unziping: {rinchi_fn}")
-        with ZipFile(rinchi_fn, "r") as fileobj:
+        with ZipFile(rinchi_fn, "r") as out_fp:
             bin_path = [
                 x
-                for x in fileobj.namelist()
+                for x in out_fp.namelist()
                 if x.endswith(_exec_folder_ending(os_sep=False) + "/")
             ]
             logging.debug(bin_path)
-            fileobj.extractall(download_loc)
+            out_fp.extractall(download_loc)
         logging.info("Completed...")
         rinchi_cli_path = os.path.join(download_loc, bin_path[0])
         logging.info(f"RInChI CLI: {rinchi_cli_path}")
diff --git a/rxnutils/chem/utils.py b/rxnutils/chem/utils.py
index 257fb14..bc3982d 100644
--- a/rxnutils/chem/utils.py
+++ b/rxnutils/chem/utils.py
@@ -1,4 +1,5 @@
 """Module containing various chemical utility routines"""
+
 import logging
 import functools
 from typing import List, Tuple
@@ -291,7 +292,7 @@ def get_special_groups(mol) -> List[Tuple[Tuple[int, ...], Tuple[int, ...]]]:
 
     # Build list
     groups = []
-    for (add_if_match, template) in group_templates:
+    for add_if_match, template in group_templates:
         matches = mol.GetSubstructMatches(
             Chem.MolFromSmarts(template), useChirality=True
         )
diff --git a/rxnutils/data/batch_utils.py b/rxnutils/data/batch_utils.py
index c62a036..f5622b6 100644
--- a/rxnutils/data/batch_utils.py
+++ b/rxnutils/data/batch_utils.py
@@ -101,6 +101,9 @@ def create_csv_batches(
     file_size = (
         nlines(filename) - 1
     )  # Header should not be counted for batch size calculations
+    nbatches = min(
+        file_size, nbatches
+    )  # Adjust the number of batches to the size of the file
     batch_size, remainder = divmod(file_size, nbatches)
     stop = 1  # 1-indexed to account for header in the .csv file
     batches = []
diff --git a/rxnutils/pipeline/actions/reaction_mod.py b/rxnutils/pipeline/actions/reaction_mod.py
index 4b4eeb9..5fe65c6 100644
--- a/rxnutils/pipeline/actions/reaction_mod.py
+++ b/rxnutils/pipeline/actions/reaction_mod.py
@@ -1,12 +1,12 @@
 """Module containing actions on reactions that modify the reaction in some way"""
+
 from __future__ import annotations
 
 import os
-import re
 import subprocess
 import sys
 import tempfile
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from pathlib import Path
 from typing import ClassVar, List, Tuple
 
@@ -14,13 +14,13 @@
 from rdkit import Chem, RDLogger
 from rdkit.Chem import RDConfig
 
+from rxnutils.chem.disconnection_sites.atom_map_tagging import atom_map_tag_products
+from rxnutils.chem.disconnection_sites.tag_converting import convert_atom_map_tag
 from rxnutils.chem.utils import (
     atom_mapping_numbers,
     neutralize_molecules,
     remove_atom_mapping,
     desalt_molecules,
-    split_smiles_from_reaction,
-    join_smiles_from_reaction,
 )
 from rxnutils.pipeline.base import ReactionActionMixIn, action, global_apply
 
@@ -543,6 +543,64 @@ def _apply_row(self, row: pd.Series) -> pd.Series:
         )
 
 
+@action
+@dataclass
+class AtomMapTagDisconnectionSite(ReactionActionMixIn):
+    """Action for tagging disconnection site in products with atom-map '[<atom>:1]'."""
+
+    pretty_name: ClassVar[str] = "atom_map_tag_disconnection_site"
+    in_column: str = "RxnSmilesClean"
+    out_column: str = "products_atom_map_tagged"
+
+    def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
+        smiles_col = global_apply(data, self._row_action, axis=1)
+        return data.assign(**{self.out_column: smiles_col})
+
+    def __str__(self) -> str:
+        return f"{self.pretty_name} (tag disconnection sites in products with '[<atom>:1]')"
+
+    def _row_action(self, row: pd.Series) -> str:
+        return atom_map_tag_products(row[self.in_column])
+
+
+@action
+@dataclass
+class ConvertAtomMapDisconnectionTag(ReactionActionMixIn):
+    """Action for converting atom-map tagging to exclamation mark tagging.
+    The reconstructed column is the tagged SMILES with every "!" removed.
+
+    yaml example:
+
+    convert_atom_map_disconnection_tag:
+        in_column: products_atom_map_tagged
+        out_column_tagged: products_tagged
+        out_column_reconstructed: products_reconstructed
+    """
+
+    pretty_name: ClassVar[str] = "convert_atom_map_disconnection_tag"
+    in_column: str = "products_atom_map_tagged"
+    out_column_tagged: str = "products_tagged"
+    out_column_reconstructed: str = "products_reconstructed"
+
+    def __call__(self, data: pd.DataFrame) -> pd.DataFrame:
+        smiles_tagged_col = global_apply(data, self._row_action, axis=1)
+        smiles_reconstructed_col = smiles_tagged_col.str.replace("!", "")
+
+        return data.assign(
+            **{
+                self.out_column_tagged: smiles_tagged_col,
+                self.out_column_reconstructed: smiles_reconstructed_col,
+            }
+        )
+
+    def __str__(self) -> str:
+        return f"{self.pretty_name} (convert disconnection tagging '[<atom>:1]' to '<atom>!')"
+
+    def _row_action(self, row: pd.Series) -> str:
+        product_tagged = convert_atom_map_tag(row[self.in_column])
+        return product_tagged
+
+
 @action
 @dataclass
 class TrimRxnSmiles:
diff --git a/rxnutils/routes/base.py b/rxnutils/routes/base.py
index 71327c7..7974d61 100644
--- a/rxnutils/routes/base.py
+++ b/rxnutils/routes/base.py
@@ -4,7 +4,7 @@
 and drawing the route
 """
 
-from typing import Dict, Any, List, Callable
+from typing import Dict, Any, List, Callable, Union
 from copy import deepcopy
 from operator import itemgetter
 
@@ -24,14 +24,14 @@
 class SynthesisRoute:
     """
     This encapsulates a synthesis route or a reaction tree.
-    It provides convenient methods for assigning atom-mapping
+    It provides convenient methods for assigning atom-mapping
     to the reactions, and for providing reaction-level data
     of the route
 
     It is typically initiallized by one of the readers in the
     `rxnutils.routes.readers` module.
 
-    The tree depth and the forward step are automatically assigned
+    The tree depth and the forward step are automatically assigned
     to each reaction node.
 
     :param reaction_tree: the tree structure representing the route
@@ -68,11 +68,13 @@ def atom_mapped_reaction_smiles(self) -> List[str]:
         return smiles
 
     def assign_atom_mapping(
-        self, overwrite: bool = False, only_rxnmapper: bool = False
+        self,
+        overwrite: bool = False,
+        only_rxnmapper: bool = False,
     ) -> None:
         """
         Assign atom-mapping to each reaction in the route and
-        ensure that is consistent from root compound and throughout
+        ensure that it is consistent from root compound and throughout
         the route.
 
         It will use NameRxn to assign classification and possiblty atom-mapping,
@@ -92,7 +94,7 @@ def chains(
         """
         Returns linear sequences or chains extracted from the route.
 
-        Each chain is a list of dictionaries representing the molecules, only the most
+        Each chain is a list of dictionaries representing the molecules, only the most
         complex molecule is kept for each reaction - making the chain a sequence of molecule
         to molecule transformation.
 
@@ -168,19 +170,25 @@ def reaction_smiles(self) -> List[str]:
     def remap(self, other: "SynthesisRoute") -> None:
         """
         Remap the reaction so that it follows the mapping of a
-        root compound in a reference routes
+        1) root compound in a reference route, 2) a ref compound given
+        as a SMILES, or 3) using a raw mapping
 
-        :param other: the reference route
+        :param other: the reference for re-mapping
         """
-        try:
-            ref = other.mapped_root_smiles
-            other = self.mapped_root_smiles
-        except ValueError as err:
-            # For single-compound routes, we can just ignore this
-            if str(err).startswith("Single"):
+        if isinstance(other, SynthesisRoute):
+            if len(self.reaction_smiles()) == 0 or len(other.reaction_smiles()) == 0:
+                return
+            mapping_dict = _find_remapping(
+                other.mapped_root_smiles, self.mapped_root_smiles
+            )
+        elif isinstance(other, str):
+            if len(self.reaction_smiles()) == 0:
                 return
-            raise
-        mapping_dict = _find_remapping(ref, other)
+            mapping_dict = _find_remapping(other, self.mapped_root_smiles)
+        elif isinstance(other, dict):
+            mapping_dict = other
+        else:
+            raise ValueError(f"Cannot perform re-mapping using a {type(other)}")
         _remap_reactions(self.reaction_tree, mapping_dict)
 
     def _assign_mapping(
@@ -196,11 +204,14 @@ def _assign_mapping(
                 return
 
         df = pd.DataFrame({"smiles": list(set(self.reaction_smiles()))})
-        nextmove_action = NameRxn(in_column="smiles")
+        nextmove_action = NameRxn(in_column="smiles", nm_rxn_column="mapped_smiles")
         rxnmapper_action = RxnMapper(in_column="smiles")
         df = rxnmapper_action(nextmove_action(df))
         if only_rxnmapper:
-            df["NextMoveRxnSmiles"] = df["RxnmapperRxnSmiles"]
+            df["mapped_smiles"] = df["RxnmapperRxnSmiles"]
+        else:
+            sel = df["NMC"] == "0.0"
+            df.loc[sel, "mapped_smiles"] = df.loc[sel, "RxnmapperRxnSmiles"]
         datamap = df.set_index("smiles").to_dict("index")
         _copy_mapping_from_datamap(self.reaction_tree, datamap)
 
@@ -489,10 +500,7 @@ def _copy_mapping_from_datamap(
     rxnsmi = f"{reactants}>>{tree_dict['smiles']}"
     metadata = children[0].get("metadata", {})
     metadata["classification"] = datamap[rxnsmi]["NMC"]
-    if datamap[rxnsmi]["NMC"] == "0.0":
-        metadata["mapped_reaction_smiles"] = datamap[rxnsmi]["RxnmapperRxnSmiles"]
-    else:
-        metadata["mapped_reaction_smiles"] = datamap[rxnsmi]["NextMoveRxnSmiles"]
+    metadata["mapped_reaction_smiles"] = datamap[rxnsmi]["mapped_smiles"]
     metadata = children[0]["metadata"] = metadata
     for grandchild in grandchildren:
         _copy_mapping_from_datamap(grandchild, datamap)
diff --git a/tests/data/mapped_tests_reactions.csv b/tests/data/mapped_tests_reactions.csv
index 8658ff5..7e23bdc 100644
--- a/tests/data/mapped_tests_reactions.csv
+++ b/tests/data/mapped_tests_reactions.csv
@@ -1,3 +1,3 @@
-smiles	NextMoveRxnSmiles	NMC	RxnmapperRxnSmiles
+smiles	mapped_smiles	NMC	RxnmapperRxnSmiles
 Cl.c1ccccc1>>Clc1ccccc1	[cH:3]1[cH:2][cH:1][cH:6][cH:5][cH:4]1.Cl>>[cH:4]1[cH:3][cH:2][c:1]([cH:6][cH:5]1)Cl	10.1.2	[ClH:1].[cH:2]1[cH:3][cH:4][cH:5][cH:6][cH:7]1>>[Cl:1][c:2]1[cH:3][cH:4][cH:5][cH:6][cH:7]1
 CO.Clc1ccccc1>>COc1ccccc1	[CH3:1][OH:2].[cH:6]1[cH:5][cH:4][c:3]([cH:8][cH:7]1)Cl>>[CH3:1][O:2][c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1	1.7.11	Cl[c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1.[CH3:1][OH:2]>>[CH3:1][O:2][c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1
diff --git a/tests/test_batch_utils.py b/tests/test_batch_utils.py
index 11e0509..bee7183 100644
--- a/tests/test_batch_utils.py
+++ b/tests/test_batch_utils.py
@@ -44,6 +44,36 @@ def test_nlines(line_count, create_dummy_file):
     [
         (2, [(0, 1, 6), (1, 6, 11)]),
         (3, [(0, 1, 5), (1, 5, 8), (2, 8, 11)]),
+        (
+            10,
+            [
+                (0, 1, 2),
+                (1, 2, 3),
+                (2, 3, 4),
+                (3, 4, 5),
+                (4, 5, 6),
+                (5, 6, 7),
+                (6, 7, 8),
+                (7, 8, 9),
+                (8, 9, 10),
+                (9, 10, 11),
+            ],
+        ),
+        (
+            15,
+            [
+                (0, 1, 2),
+                (1, 2, 3),
+                (2, 3, 4),
+                (3, 4, 5),
+                (4, 5, 6),
+                (5, 6, 7),
+                (6, 7, 8),
+                (7, 8, 9),
+                (8, 9, 10),
+                (9, 10, 11),
+            ],
+        ),
     ],
 )
 def test_csv_chunks(nbatches, expected, create_dummy_file):
@@ -57,6 +87,8 @@ def test_csv_chunks(nbatches, expected, create_dummy_file):
     [
         (2, [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]),
         (3, [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]),
+        (10, [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]),
+        (15, [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]),
     ],
 )
 def test_csv_chunks_end2end(nbatches, expected, create_dummy_file):
diff --git a/tests/test_product_tagging.py b/tests/test_product_tagging.py
new file mode 100644
index 0000000..71577f9
--- /dev/null
+++ b/tests/test_product_tagging.py
@@ -0,0 +1,77 @@
+import pytest
+from rxnutils.chem.disconnection_sites.atom_map_tagging import (
+    atom_map_tag_products,
+    get_atom_list,
+)
+from rxnutils.chem.disconnection_sites.tag_converting import (
+    convert_atom_map_tag,
+    tagged_smiles_from_tokens,
+)
+
+
+@pytest.mark.parametrize(
+    ("reactants_smiles", "product_smiles", "expected"),
+    [
+        (
+            "[Cl:2].[CH:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1",
+            "[Cl:2][C:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1",
+            [1, 2],
+        ),
+        (
+            "Cl.[CH:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1",
+            "Cl[C:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1",
+            [1],
+        ),
+    ],
+)
+def test_get_atom_list(reactants_smiles, product_smiles, expected):
+    atom_list = get_atom_list(reactants_smiles, product_smiles)
+    assert sorted(atom_list) == expected
+
+
+@pytest.mark.parametrize(
+    ("reactants_smiles", "product_smiles", "expected"),
+    [
+        (
+            "[Cl:2].[CH:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1",
+            "[Cl:2][C:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1",
+            "c1cc[c:1]([Cl:1])cc1",
+        ),
+        (
+            "Cl.[CH:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1",
+            "Cl[C:1]1=[CH:7][CH:6]=[CH:5][CH:4]=[CH:3]1",
+            "Cl[c:1]1ccccc1",
+        ),
+    ],
+)
+def test_atom_map_tag_products(reactants_smiles, product_smiles, expected):
+    tagged_product = atom_map_tag_products(f"{reactants_smiles}>>{product_smiles}")
+    assert tagged_product == expected
+
+
+@pytest.mark.parametrize(
+    ("product_smiles", "expected"),
+    [
+        ("c1cc[c:1]([Cl:1])cc1", "Cl!c!1ccccc1"),
+        ("Cl[c:1]1ccccc1", "Clc!1ccccc1"),
+        ("Clc1ccccc1", ""),
+    ],
+)
+def test_tag_converting(product_smiles, expected):
+    tagged_product = convert_atom_map_tag(product_smiles)
+    assert tagged_product == expected
+
+
+@pytest.mark.parametrize(
+    ("tagged_tokens", "untagged_tokens", "expected"),
+    [
+        (["[C:1]", "[C@H]", "O"], ["C", "[C@@H]", "O"], ("C![C@@H]O", "C[C@@H]O")),
+        (["[C:1]", "/", "C", "O"], ["C", "C", "O"], ("C!CO", "CCO")),
+        (["[C:1]", "C", "O"], ["C", "/", "C", "O"], ("C!CO", "CCO")),
+        (["[C:1]", "/", "C", "O"], ["C", "\\", "C", "O"], ("C!\\CO", "C\\CO")),
+        (["[C:1]", "C", "O"], ["[C]", "C", "O"], ("C!CO", "CCO")),
+    ],
+)
+def test_tagged_smiles_from_tokens(tagged_tokens, untagged_tokens, expected):
+    output = tagged_smiles_from_tokens(tagged_tokens, untagged_tokens)
+    assert output == expected
diff --git a/tests/test_reaction_mods_actions.py b/tests/test_reaction_mods_actions.py
index 1d7c052..5a6c91d 100644
--- a/tests/test_reaction_mods_actions.py
+++ b/tests/test_reaction_mods_actions.py
@@ -12,9 +12,12 @@
     SplitReaction,
     RemoveUnchangedProducts,
     IsotopeInfo,
+    AtomMapTagDisconnectionSite,
+    ConvertAtomMapDisconnectionTag,
     DesaltMolecules,
     CONTRIB_INSTALLED,
 )
+from rxnutils.chem.disconnection_sites.tag_converting import smiles_tokens
 from rxnutils.pipeline.base import global_apply
 
 global_apply.max_workers = 1
@@ -217,6 +220,50 @@ def test_isotope_info():
     ]
 
 
+def test_disconnection_tagging(shared_datadir):
+
+    df = pd.read_csv(shared_datadir / "mapped_tests_reactions.csv", sep="\t")
+
+    action_atom_map_tag = AtomMapTagDisconnectionSite(in_column="RxnmapperRxnSmiles")
+    action_convert_tag = ConvertAtomMapDisconnectionTag()
+
+    df_atom_map_tag = action_atom_map_tag(df)
+    df_tag = action_convert_tag(df_atom_map_tag)
+
+    df_ground_truth = pd.Series(
+        ["Cl!c!1ccccc1", "CO!c!1ccccc1"], name="products_tagged"
+    )
+
+    assert df_ground_truth.equals(df_tag["products_tagged"])
+
+
+def test_smiles_tokenization_unknown_token_error(shared_datadir):
+
+    df = pd.read_csv(shared_datadir / "mapped_tests_reactions.csv", sep="\t")
+
+    action_atom_map_tag = AtomMapTagDisconnectionSite(in_column="RxnmapperRxnSmiles")
+    df_atom_map_tag = action_atom_map_tag(df)
+
+    product_atom_map_tagged = df_atom_map_tag["products_atom_map_tagged"].values[0]
+
+    with pytest.raises(AssertionError):
+        smiles_tokens(product_atom_map_tagged + "{")
+
+
+def test_converting_no_atom_map_tag(shared_datadir):
+
+    df = pd.read_csv(shared_datadir / "mapped_tests_reactions.csv", sep="\t")
+    df["products"] = [rxn.split(">")[-1] for rxn in df.smiles]
+
+    action_convert_tag = ConvertAtomMapDisconnectionTag(in_column="products")
+
+    df_tag = action_convert_tag(df)
+
+    df_ground_truth = pd.Series(["", ""], name="products_tagged")
+
+    assert df_ground_truth.equals(df_tag["products_tagged"])
+
+
 def test_desalting():
     smi1 = "OCC.(C.[Na+].[Cl-])>>OC(=O)CC"
     smi2 = "OCC>>OC(=O)CC"
diff --git a/tests/test_rinchi.py b/tests/test_rinchi.py
index 5224972..dc30607 100644
--- a/tests/test_rinchi.py
+++ b/tests/test_rinchi.py
@@ -17,11 +17,13 @@ def rinchi_download():
 
 
 @pytest.mark.xfail(sys.platform not in PLATFORM2FOLDER, reason="Platform not supported")
-@pytest.mark.xfail(raises=HTTPError)
+@pytest.mark.xfail(
+    raises=HTTPError,
+)  # Expect a failure in case the RInChI archive cannot be downloaded
 def test_download_rinchi(mocker, tmpdir):
     config = {
         "download_folder": str(tmpdir),
-        "download_url": "http://www.inchi-trust.org/download/RInChI/RInChI-V1-00.zip",
+        "download_url": "https://www.inchi-trust.org/wp/download/RInChI/RInChI-V1-00.zip",
     }
     mocker.patch("rxnutils.chem.rinchi.download_rinchi.CONFIG", config)
 
@@ -31,7 +33,9 @@ def test_download_rinchi(mocker, tmpdir):
 
 
 @pytest.mark.xfail(sys.platform not in PLATFORM2FOLDER, reason="Platform not supported")
-@pytest.mark.xfail(raises=HTTPError)
+@pytest.mark.xfail(
+    raises=HTTPError,
+)  # Expect a failure in case the RInChI archive cannot be downloaded
 def test_generate_rinchi(rinchi_download):
     rsmi = (
         "[ClH;D0;+0:1]>>"
diff --git a/tests/test_routes_base.py b/tests/test_routes_base.py
index 15166b5..960a631 100644
--- a/tests/test_routes_base.py
+++ b/tests/test_routes_base.py
@@ -2,6 +2,7 @@
 
 import pytest
 import pandas as pd
+from rdkit import Chem
 
 from rxnutils.routes.base import SynthesisRoute
 
@@ -71,6 +72,48 @@ def test_remap(synthesis_route, setup_mapper):
     assert route1.atom_mapped_reaction_smiles() != old_reaction_smiles
 
 
+def test_remap_ref_smiles(synthesis_route, setup_mapper):
+    route1 = synthesis_route
+    route1.assign_atom_mapping()
+    old_reaction_smiles = route1.atom_mapped_reaction_smiles()
+    reactants, products = old_reaction_smiles[0].split(">>")
+    rsmi_old = Chem.MolToSmiles(Chem.MolFromSmiles(reactants))
+    psmi_old = Chem.MolToSmiles(Chem.MolFromSmiles(products))
+
+    route1.remap(products)
+
+    reactants, products = route1.atom_mapped_reaction_smiles()[0].split(">>")
+    rsmi = Chem.MolToSmiles(Chem.MolFromSmiles(reactants))
+    psmi = Chem.MolToSmiles(Chem.MolFromSmiles(products))
+    assert rsmi == rsmi_old
+    assert psmi == psmi_old
+
+    route1.remap("[CH3:10][O:2][c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1")
+
+    reactants, products = route1.atom_mapped_reaction_smiles()[0].split(">>")
+    rsmi = Chem.MolToSmiles(Chem.MolFromSmiles(reactants))
+    psmi = Chem.MolToSmiles(Chem.MolFromSmiles(products))
+    assert rsmi != rsmi_old
+    assert psmi != psmi_old
+
+
+def test_remap_ref_dict(synthesis_route, setup_mapper):
+    route1 = synthesis_route
+    route1.assign_atom_mapping()
+    old_reaction_smiles = route1.atom_mapped_reaction_smiles()
+    reactants, products = old_reaction_smiles[0].split(">>")
+    rsmi_old = Chem.MolToSmiles(Chem.MolFromSmiles(reactants))
+    psmi_old = Chem.MolToSmiles(Chem.MolFromSmiles(products))
+
+    route1.remap({1: 10, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8})
+
+    reactants, products = route1.atom_mapped_reaction_smiles()[0].split(">>")
+    rsmi = Chem.MolToSmiles(Chem.MolFromSmiles(reactants))
+    psmi = Chem.MolToSmiles(Chem.MolFromSmiles(products))
+    assert rsmi != rsmi_old
+    assert psmi != psmi_old
+
+
 def test_extract_chains(synthesis_route):
     complexity = {"COc1ccccc1": 5, "CO": 0, "Clc1ccccc1": 1, "c1ccccc1": 1, "Cl": 0}
 

From cfd8a6113c48c3aa01b5622a8e48b9f5a087f233 Mon Sep 17 00:00:00 2001
From: "Genheden, Samuel" <samuel.genheden@astrazeneca.com>
Date: Mon, 27 May 2024 10:07:25 +0200
Subject: [PATCH 2/3] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 103f4ea..c618ede 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Features
 
 - Adding support for tagging reaction sites in SMILES
+- Adding more options for re-mapping routes
 
 ### Miscellaneous
 

From 7baecdd796351e4a187d3e9fade6570ab4b0a1fc Mon Sep 17 00:00:00 2001
From: "Genheden, Samuel" <samuel.genheden@astrazeneca.com>
Date: Mon, 27 May 2024 13:06:18 +0200
Subject: [PATCH 3/3] Fixing typos

---
 rxnutils/routes/base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rxnutils/routes/base.py b/rxnutils/routes/base.py
index 7974d61..6f0425f 100644
--- a/rxnutils/routes/base.py
+++ b/rxnutils/routes/base.py
@@ -24,14 +24,14 @@
 class SynthesisRoute:
     """
     This encapsulates a synthesis route or a reaction tree.
-    It provide convinient methods for assigning atom-mapping
+    It provides convenient methods for assigning atom-mapping
     to the reactions, and for providing reaction-level data
     of the route
 
     It is typically initiallized by one of the readers in the
     `rxnutils.routes.readers` module.
 
-    The tree depth and the forward step is automatically assigned
+    The tree depth and the forward step are automatically assigned
     to each reaction node.
 
     :param reaction_tree: the tree structure representing the route
@@ -74,7 +74,7 @@ def assign_atom_mapping(
     ) -> None:
         """
         Assign atom-mapping to each reaction in the route and
-        ensure that is is consistent from root compound and throughout
+        ensure that it is consistent from root compound and throughout
         the route.
 
         It will use NameRxn to assign classification and possiblty atom-mapping,