Skip to content

Commit

Permalink
Modify graphs.py to handle insertions when {'insertions': True} (#223
Browse files Browse the repository at this point in the history
)

* update conversion

* fix coord issues

* update graphs.py to handle insertions

* fix affected test

* accommodate insertion in hbond donor and acceptor

* update changelog

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add control flow for insertion labelling

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Co-authored-by: manon Réau <[email protected]>
Co-authored-by: Arian Jamasb <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
4 people authored Dec 8, 2022
1 parent 4882b7a commit 75d5862
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 45 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
* [Logging] - [#221](https://github.com/a-r-j/graphein/pull/221) Adds global control of logging with `graphein.verbose(enabled=False)`.

#### Protein

* [Bugfix] - [#223](https://github.com/a-r-j/graphein/pull/223) Fix handling of insertions in protein graphs. Insertions are now given IDs like: `A:SER:12:A`. Contribution by @manonreau.
* [Bugfix] - [#226](https://github.com/a-r-j/graphein/pull/226) Catches failed AF2 structure downloads [#225](https://github.com/a-r-j/graphein/issues/225)
* [Feature] - [#229](https://github.com/a-r-j/graphein/pull/229) Adds support for filtering KNN edges based on self-loops and chain membership. Contribution by @anton-bushuiev.
* [Bugfix] - [#229](https://github.com/a-r-j/graphein/pull/229) Fixes bug in KNN edge computation. Contribution by @anton-bushuiev.
Expand Down
8 changes: 0 additions & 8 deletions graphein/protein/edges/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,12 +388,10 @@ def add_ionic_interactions(
G.nodes[r1]["residue_name"] in POS_AA
and G.nodes[r2]["residue_name"] in NEG_AA
)

condition2 = (
G.nodes[r2]["residue_name"] in POS_AA
and G.nodes[r1]["residue_name"] in NEG_AA
)

is_ionic = condition1 or condition2
if not is_ionic:
G.edges[r1, r2]["kind"].remove("ionic")
Expand Down Expand Up @@ -650,7 +648,6 @@ def add_pi_stacking_interactions(
distmat.columns = aromatic_df["node_id"]
distmat = distmat[distmat <= centroid_distance].fillna(0)
indices = np.where(distmat > 0)

interacting_resis = [
(distmat.index[r], distmat.index[c])
for r, c in zip(indices[0], indices[1])
Expand All @@ -659,7 +656,6 @@ def add_pi_stacking_interactions(
for n1, n2 in interacting_resis:
assert G.nodes[n1]["residue_name"] in PI_RESIS
assert G.nodes[n2]["residue_name"] in PI_RESIS

n1_centroid = aromatic_df.loc[aromatic_df["node_id"] == n1][
["x_coord", "y_coord", "z_coord"]
].values[0]
Expand Down Expand Up @@ -715,7 +711,6 @@ def add_t_stacking(G: nx.Graph, pdb_df: Optional[pd.DataFrame] = None):
distmat.columns = aromatic_df["node_id"]
distmat = distmat[distmat <= 7].fillna(0)
indices = np.where(distmat > 0)

interacting_resis = [
(distmat.index[r], distmat.index[c])
for r, c in zip(indices[0], indices[1])
Expand All @@ -724,7 +719,6 @@ def add_t_stacking(G: nx.Graph, pdb_df: Optional[pd.DataFrame] = None):
for n1, n2 in interacting_resis:
assert G.nodes[n1]["residue_name"] in PI_RESIS
assert G.nodes[n2]["residue_name"] in PI_RESIS

n1_centroid = aromatic_df.loc[aromatic_df["node_id"] == n1][
["x_coord", "y_coord", "z_coord"]
].values[0]
Expand Down Expand Up @@ -841,12 +835,10 @@ def add_salt_bridges(
G.nodes[r1]["residue_name"] in SALT_BRIDGE_ANIONS
and G.nodes[r2]["residue_name"] in SALT_BRIDGE_CATIONS
)

condition2 = (
G.nodes[r2]["residue_name"] in SALT_BRIDGE_ANIONS
and G.nodes[r1]["residue_name"] in SALT_BRIDGE_CATIONS
)

is_ionic = condition1 or condition2
if not is_ionic:
G.edges[r1, r2]["kind"].remove("salt_bridge")
Expand Down
13 changes: 11 additions & 2 deletions graphein/protein/features/nodes/amino_acid.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,12 +202,17 @@ def hydrogen_bond_donor(
"""
node_id = n.split(":")
res = node_id[1]

if len(node_id) == 4: # Atomic graph
atom = node_id[-1]
try:
features = HYDROGEN_BOND_DONORS[res][atom]
except KeyError:
features = 0
try: # Handle insertions
atom = node_id[-2]
features = HYDROGEN_BOND_DONORS[res][atom]
except KeyError:
features = 0
elif len(node_id) == 3: # Residue graph
if res not in HYDROGEN_BOND_DONORS.keys():
features = 0
Expand Down Expand Up @@ -249,7 +254,11 @@ def hydrogen_bond_acceptor(
try:
features = HYDROGEN_BOND_ACCEPTORS[res][atom]
except KeyError:
features = 0
try: # Handle insertions
atom = node_id[-2]
features = HYDROGEN_BOND_ACCEPTORS[res][atom]
except KeyError:
features = 0
elif len(node_id) == 3: # Residue graph
if res not in HYDROGEN_BOND_ACCEPTORS.keys():
features = 0
Expand Down
57 changes: 24 additions & 33 deletions graphein/protein/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,26 @@ def read_pdb_to_dataframe(
return pd.concat([atomic_df.df["ATOM"], atomic_df.df["HETATM"]])


def label_node_id(df: pd.DataFrame, granularity: str) -> pd.DataFrame:
def label_node_id(
df: pd.DataFrame, granularity: str, insertions: bool = False
) -> pd.DataFrame:
"""Assigns a ``node_id`` column to the atomic dataframe. Node IDs are of the
form: ``"<CHAIN>:<RESIDUE_NAME>:<RESIDUE_NUMBER>:<ATOM_NAME>"`` for atomic
graphs or ``"<CHAIN>:<RESIDUE_NAME>:<RESIDUE_NUMBER>"`` for residue graphs.
If ``insertions=True``, the insertion code is appended after the residue
number and before any atom name (e.g. ``"<CHAIN>:<RESIDUE_NAME>:<RESIDUE_NUMBER>:<INSERTION_CODE>:<ATOM_NAME>"``)
:param df: Protein structure DataFrame.
:type df: pd.DataFrame
:param granularity: Granularity of graph.
:param granularity: Granularity of graph. Atom-level,
residue (e.g. ``CA``) or ``centroids``. See:
:const:`~graphein.protein.config.GRAPH_ATOMS` and
:const:`~graphein.protein.config.GRANULARITY_OPTS`.
:type granularity: str
:param insertions: Whether or not to include insertion codes in the node id.
Default is ``False``.
:type insertions: bool
:return: Protein structure DataFrame with ``node_id`` column.
:rtype: pd.DataFrame
"""
Expand All @@ -136,6 +147,9 @@ def label_node_id(df: pd.DataFrame, granularity: str) -> pd.DataFrame:
+ ":"
+ df["residue_number"].apply(str)
)

if insertions:
df["node_id"] = df["node_id"] + ":" + df["insertion"].apply(str)
df["residue_id"] = df["node_id"]
if granularity == "atom":
df["node_id"] = df["node_id"] + ":" + df["atom_name"]
Expand Down Expand Up @@ -304,7 +318,9 @@ def process_dataframe(
construction functions.
:rtype: pd.DataFrame
"""
protein_df = label_node_id(protein_df, granularity=granularity)
protein_df = label_node_id(
protein_df, granularity=granularity, insertions=insertions
)
# TODO: Need to properly define what "granularity" is supposed to do.
atoms = filter_dataframe(
protein_df,
Expand Down Expand Up @@ -381,36 +397,9 @@ def sort_dataframe(df: pd.DataFrame) -> pd.DataFrame:
:return: Sorted protein dataframe.
:rtype: pd.DataFrame
"""
return df.sort_values(by=["chain_id", "residue_number", "atom_number"])


def assign_node_id_to_dataframe(
protein_df: pd.DataFrame, granularity: str
) -> pd.DataFrame:
"""
Assigns the node ID back to the ``pdb_df`` dataframe
:param protein_df: Structure Dataframe
:type protein_df: pd.DataFrame
:param granularity: Granularity of graph. Atom-level,
residue (e.g. ``CA``) or ``centroids``. See:
:const:`~graphein.protein.config.GRAPH_ATOMS` and
:const:`~graphein.protein.config.GRANULARITY_OPTS`.
:type granularity: str
:return: Returns dataframe with added ``node_ids``
:rtype: pd.DataFrame
"""
protein_df["node_id"] = (
protein_df["chain_id"].apply(str)
+ ":"
+ protein_df["residue_name"]
+ ":"
+ protein_df["residue_number"].apply(str)
return df.sort_values(
by=["chain_id", "residue_number", "atom_number", "insertion"]
)
if granularity in {"atom", "rna_atom"}:
protein_df[
"node_id"
] = f'{protein_df["node_id"]}:{protein_df["atom_name"]}'


def select_chains(
Expand Down Expand Up @@ -580,7 +569,9 @@ def calculate_centroid_positions(
:rtype: pd.DataFrame
"""
centroids = (
atoms.groupby(["residue_number", "chain_id", "residue_name"])
atoms.groupby(
["residue_number", "chain_id", "residue_name", "insertion"]
)
.mean()[["x_coord", "y_coord", "z_coord"]]
.reset_index()
)
Expand Down
4 changes: 2 additions & 2 deletions tests/protein/edges/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def test_add_peptide_bonds():
G = construct_graph(pdb_path=str(file_path))

for u, v in G.edges():
assert abs(int(u.split(":")[-1]) - int(v.split(":")[-1])) == 1
assert abs(int(u.split(":")[2]) - int(v.split(":")[2])) == 1


def test_add_sequence_distance_edges():
Expand All @@ -205,7 +205,7 @@ def test_add_sequence_distance_edges():
)
G = construct_graph(pdb_path=str(file_path), config=config)
for u, v in G.edges():
assert abs(int(u.split(":")[-1]) - int(v.split(":")[-1])) == d
assert abs(int(u.split(":")[2]) - int(v.split(":")[2])) == d


def test_salt_bridge_interactions():
Expand Down

0 comments on commit 75d5862

Please sign in to comment.