Modify graphs.py to handle insertions when {'insertions': True} (#223

) * update conversion * fix coord issues * update graphs.py to handle insertions * fix affected test * accommodate insertion in hbond donor and acceptor * update changelog * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add control flow for insertion labelling * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: manon Réau <[email protected]> Co-authored-by: Arian Jamasb <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
a-r-j · Dec 8, 2022 · 75d5862 · 75d5862
1 parent 4882b7a
commit 75d5862
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 45 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,8 @@
 * [Logging] - [#221](https://github.com/a-r-j/graphein/pull/221) Adds global control of logging with `graphein.verbose(enabled=False)`.
 
 #### Protein
+
+* [Bugfix] - [#223](https://github.com/a-r-j/graphein/pull/223) Fix handling of insertions in protein graphs. Insertions are now given IDs like: `A:SER:12:A`. Contribution by @manonreau.
 * [Bugfix] - [#226](https://github.com/a-r-j/graphein/pull/226) Catches failed AF2 structure downloads [#225](https://github.com/a-r-j/graphein/issues/225)
 * [Feature] - [#229](https://github.com/a-r-j/graphein/pull/220) Adds support for filtering KNN edges based on self-loops and chain membership. Contribution by @anton-bushuiev.
 * [Bugfix] - [#229](https://github.com/a-r-j/graphein/pull/220) Fixes bug in KNN edge computation. Contribution by @anton-bushuiev.

diff --git a/graphein/protein/edges/distance.py b/graphein/protein/edges/distance.py
@@ -388,12 +388,10 @@ def add_ionic_interactions(
                 G.nodes[r1]["residue_name"] in POS_AA
                 and G.nodes[r2]["residue_name"] in NEG_AA
             )
-
             condition2 = (
                 G.nodes[r2]["residue_name"] in POS_AA
                 and G.nodes[r1]["residue_name"] in NEG_AA
             )
-
             is_ionic = condition1 or condition2
             if not is_ionic:
                 G.edges[r1, r2]["kind"].remove("ionic")
@@ -650,7 +648,6 @@ def add_pi_stacking_interactions(
         distmat.columns = aromatic_df["node_id"]
         distmat = distmat[distmat <= centroid_distance].fillna(0)
         indices = np.where(distmat > 0)
-
         interacting_resis = [
             (distmat.index[r], distmat.index[c])
             for r, c in zip(indices[0], indices[1])
@@ -659,7 +656,6 @@ def add_pi_stacking_interactions(
         for n1, n2 in interacting_resis:
             assert G.nodes[n1]["residue_name"] in PI_RESIS
             assert G.nodes[n2]["residue_name"] in PI_RESIS
-
             n1_centroid = aromatic_df.loc[aromatic_df["node_id"] == n1][
                 ["x_coord", "y_coord", "z_coord"]
             ].values[0]
@@ -715,7 +711,6 @@ def add_t_stacking(G: nx.Graph, pdb_df: Optional[pd.DataFrame] = None):
         distmat.columns = aromatic_df["node_id"]
         distmat = distmat[distmat <= 7].fillna(0)
         indices = np.where(distmat > 0)
-
         interacting_resis = [
             (distmat.index[r], distmat.index[c])
             for r, c in zip(indices[0], indices[1])
@@ -724,7 +719,6 @@ def add_t_stacking(G: nx.Graph, pdb_df: Optional[pd.DataFrame] = None):
         for n1, n2 in interacting_resis:
             assert G.nodes[n1]["residue_name"] in PI_RESIS
             assert G.nodes[n2]["residue_name"] in PI_RESIS
-
             n1_centroid = aromatic_df.loc[aromatic_df["node_id"] == n1][
                 ["x_coord", "y_coord", "z_coord"]
             ].values[0]
@@ -841,12 +835,10 @@ def add_salt_bridges(
                 G.nodes[r1]["residue_name"] in SALT_BRIDGE_ANIONS
                 and G.nodes[r2]["residue_name"] in SALT_BRIDGE_CATIONS
             )
-
             condition2 = (
                 G.nodes[r2]["residue_name"] in SALT_BRIDGE_ANIONS
                 and G.nodes[r1]["residue_name"] in SALT_BRIDGE_CATIONS
             )
-
             is_ionic = condition1 or condition2
             if not is_ionic:
                 G.edges[r1, r2]["kind"].remove("salt_bridge")

diff --git a/graphein/protein/features/nodes/amino_acid.py b/graphein/protein/features/nodes/amino_acid.py
@@ -202,12 +202,17 @@ def hydrogen_bond_donor(
     """
     node_id = n.split(":")
     res = node_id[1]
+
     if len(node_id) == 4:  # Atomic graph
         atom = node_id[-1]
         try:
             features = HYDROGEN_BOND_DONORS[res][atom]
         except KeyError:
-            features = 0
+            try:  # Handle insertions
+                atom = node_id[-2]
+                features = HYDROGEN_BOND_DONORS[res][atom]
+            except KeyError:
+                features = 0
     elif len(node_id) == 3:  # Residue graph
         if res not in HYDROGEN_BOND_DONORS.keys():
             features = 0
@@ -249,7 +254,11 @@ def hydrogen_bond_acceptor(
         try:
             features = HYDROGEN_BOND_ACCEPTORS[res][atom]
         except KeyError:
-            features = 0
+            try:  # Handle insertions
+                atom = node_id[-2]
+                features = HYDROGEN_BOND_ACCEPTORS[res][atom]
+            except KeyError:
+                features = 0
     elif len(node_id) == 3:  # Residue graph
         if res not in HYDROGEN_BOND_ACCEPTORS.keys():
             features = 0

diff --git a/graphein/protein/graphs.py b/graphein/protein/graphs.py
@@ -117,15 +117,26 @@ def read_pdb_to_dataframe(
     return pd.concat([atomic_df.df["ATOM"], atomic_df.df["HETATM"]])
 
 
-def label_node_id(df: pd.DataFrame, granularity: str) -> pd.DataFrame:
+def label_node_id(
+    df: pd.DataFrame, granularity: str, insertions: bool = False
+) -> pd.DataFrame:
     """Assigns a ``node_id`` column to the atomic dataframe. Node IDs are of the
     form: ``"<CHAIN>:<RESIDUE_NAME>:<RESIDUE_NUMBER>:<ATOM_NAME>"`` for atomic
     graphs or ``"<CHAIN>:<RESIDUE_NAME>:<RESIDUE_NUMBER>"`` for residue graphs.
 
+    If ``insertions=True``, the insertion code is appended after the residue
+    number, and before the atom name for atomic graphs (e.g.
+    ``"<CHAIN>:<RESIDUE_NAME>:<RESIDUE_NUMBER>:<INSERTION>:<ATOM_NAME>"``).
+
     :param df: Protein structure DataFrame.
     :type df: pd.DataFrame
-    :param granularity: Granularity of graph.
+    :param granularity: Granularity of graph. Atom-level,
+        residue (e.g. ``CA``) or ``centroids``. See:
+        :const:`~graphein.protein.config.GRAPH_ATOMS` and
+        :const:`~graphein.protein.config.GRANULARITY_OPTS`.
     :type granularity: str
+    :param insertions: Whether or not to include insertion codes in the node id.
+        Default is ``False``.
+    :type insertions: bool
     :return: Protein structure DataFrame with ``node_id`` column.
     :rtype: pd.DataFrame
     """
@@ -136,6 +147,9 @@ def label_node_id(df: pd.DataFrame, granularity: str) -> pd.DataFrame:
         + ":"
         + df["residue_number"].apply(str)
     )
+
+    if insertions:
+        df["node_id"] = df["node_id"] + ":" + df["insertion"].apply(str)
     df["residue_id"] = df["node_id"]
     if granularity == "atom":
         df["node_id"] = df["node_id"] + ":" + df["atom_name"]
@@ -304,7 +318,9 @@ def process_dataframe(
         construction functions.
     :rtype: pd.DataFrame
     """
-    protein_df = label_node_id(protein_df, granularity=granularity)
+    protein_df = label_node_id(
+        protein_df, granularity=granularity, insertions=insertions
+    )
     # TODO: Need to properly define what "granularity" is supposed to do.
     atoms = filter_dataframe(
         protein_df,
@@ -381,36 +397,9 @@ def sort_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     :return: Sorted protein dataframe.
     :rtype: pd.DataFrame
     """
-    return df.sort_values(by=["chain_id", "residue_number", "atom_number"])
-
-
-def assign_node_id_to_dataframe(
-    protein_df: pd.DataFrame, granularity: str
-) -> pd.DataFrame:
-    """
-    Assigns the node ID back to the ``pdb_df`` dataframe
-
-    :param protein_df: Structure Dataframe
-    :type protein_df: pd.DataFrame
-    :param granularity: Granularity of graph. Atom-level,
-        residue (e.g. ``CA``) or ``centroids``. See:
-        :const:`~graphein.protein.config.GRAPH_ATOMS` and
-        :const:`~graphein.protein.config.GRANULARITY_OPTS`.
-    :type granularity: str
-    :return: Returns dataframe with added ``node_ids``
-    :rtype: pd.DataFrame
-    """
-    protein_df["node_id"] = (
-        protein_df["chain_id"].apply(str)
-        + ":"
-        + protein_df["residue_name"]
-        + ":"
-        + protein_df["residue_number"].apply(str)
+    return df.sort_values(
+        by=["chain_id", "residue_number", "atom_number", "insertion"]
     )
-    if granularity in {"atom", "rna_atom"}:
-        protein_df[
-            "node_id"
-        ] = f'{protein_df["node_id"]}:{protein_df["atom_name"]}'
 
 
 def select_chains(
@@ -580,7 +569,9 @@ def calculate_centroid_positions(
     :rtype: pd.DataFrame
     """
     centroids = (
-        atoms.groupby(["residue_number", "chain_id", "residue_name"])
+        atoms.groupby(
+            ["residue_number", "chain_id", "residue_name", "insertion"]
+        )
         .mean()[["x_coord", "y_coord", "z_coord"]]
         .reset_index()
     )

diff --git a/tests/protein/edges/test_distance.py b/tests/protein/edges/test_distance.py
@@ -190,7 +190,7 @@ def test_add_peptide_bonds():
     G = construct_graph(pdb_path=str(file_path))
 
     for u, v in G.edges():
-        assert abs(int(u.split(":")[-1]) - int(v.split(":")[-1])) == 1
+        assert abs(int(u.split(":")[2]) - int(v.split(":")[2])) == 1
 
 
 def test_add_sequence_distance_edges():
@@ -205,7 +205,7 @@ def test_add_sequence_distance_edges():
         )
         G = construct_graph(pdb_path=str(file_path), config=config)
         for u, v in G.edges():
-            assert abs(int(u.split(":")[-1]) - int(v.split(":")[-1])) == d
+            assert abs(int(u.split(":")[2]) - int(v.split(":")[2])) == d
 
 
 def test_salt_bridge_interactions():