Squashed 'modules/core/dependency/python-ihm/' changes from 487413476…

…a..3b2640aaa2

3b2640aaa2 Prune duplicate poly residue features
a8c87d287f Prune duplicate pseudo site features
38edd8e1e5 Prune duplicate pseudo sites on output
29bab0b748 Prune duplicate references; closes ihmwg/python-ihm#148
1ce7bc607a Disallow empty sequence for ResidueFeature, closes ihmwg/python-ihm#147
834719e21c Support primary citation in from_pubmed_id()
75b2985515 Use author-provided values for template asym_id
87ba03fb6f Clarify that template asym is author-provided
f3beb7c07c Handle ModelCIF templates not in alignments
9094ec889f Test parsing metadata from AlphaFold model
a0c05086e3 Extract ModBase database ID from models
39c8996f69 Handle mmCIF files in PDB-Dev or AFDB
cb01e6161f Read template information from ModelCIF files
a7a7700325 Extract software info directly from _software table
3acbf2d9d6 Fix typos
65c0d8dd81 CIFParser *does* now handle MODELLER outputs
768d979445 Handle empty template seq_id_range on output
91daf61a5b Allow for empty starting model seq_id range
ca484dc131 Link to GitHub issue
1b8c33de9b Update for 1.3 release
5298982896 Update to match latest dictionary
00b3d275e5 Atoms aren't "duplicates" if alt_id differs
1a564f6f92 Merge pull request ihmwg/python-ihm#142 from ihmwg/remediation
c1d558b41a Add support for the ProteomeXchange database
6f1d10fd4e Map both crosslinking data types to same class
144ba143cf Read old ihm_ordered_ensemble category
603760afb0 Rename CX-MS to Crosslinking-MS
bc92ae5029 Rename ihm_ordered_ensemble to ihm_ordered_model

git-subtree-dir: modules/core/dependency/python-ihm
git-subtree-split: 3b2640aaa220501d490bbd28efae3661414c2151
salilab · Aug 30, 2024 · 2f7cc81 · 2f7cc81
1 parent 379b5a2
commit 2f7cc81
Show file tree

Hide file tree

Showing 26 changed files with 6,904 additions and 69 deletions.
diff --git a/modules/core/dependency/python-ihm/ChangeLog.rst b/modules/core/dependency/python-ihm/ChangeLog.rst
@@ -1,3 +1,18 @@
+1.3 - 2024-07-16
+================
+  - The new class :class:`ihm.location.ProteomeXchangeLocation` can be used
+    for datasets stored in the ProteomeXchange database.
+  - Support is added for changes in the IHMCIF dictionary, specifically
+    the renaming of "CX-MS data" to "Crosslinking-MS data" and the
+    ``_ihm_ordered_ensemble`` category to ``_ihm_ordered_model``. python-ihm
+    will output the new names, but for backwards compatibility will read both
+    old and new names.
+  - :class:`ihm.protocol.Protocol` can now be given additional text to
+    describe the protocol.
+  - :class:`ihm.model.Atom` now takes an ``alt_id`` argument to support
+    alternate conformations (#146).
+  - Support added for NumPy 2.0.
+
 1.2 - 2024-06-12
 ================
   - :class:`ihm.format.CifTokenReader` allows for reading an mmCIF file

diff --git a/modules/core/dependency/python-ihm/MANIFEST.in b/modules/core/dependency/python-ihm/MANIFEST.in
@@ -4,4 +4,4 @@ include examples/*
 include util/make-mmcif.py
 include src/ihm_format.h
 include src/ihm_format.i
-include src/ihm_format_wrap_1.2.c
+include src/ihm_format_wrap_1.3.c
diff --git a/modules/core/dependency/python-ihm/docs/location.rst b/modules/core/dependency/python-ihm/docs/location.rst
@@ -55,6 +55,9 @@ The :mod:`ihm.location` Python module
 .. autoclass:: AlphaFoldDBLocation
    :members:
 
+.. autoclass:: ProteomeXchangeLocation
+   :members:
+
 .. autoclass:: FileLocation
    :members:
 

diff --git a/modules/core/dependency/python-ihm/ihm/__init__.py b/modules/core/dependency/python-ihm/ihm/__init__.py
@@ -20,7 +20,7 @@
 import json
 from . import util
 
-__version__ = '1.2'
+__version__ = '1.3'
 
 
 class __UnknownValue(object):
@@ -795,11 +795,13 @@ def __init__(self, pmid, title, journal, volume, page_range, year, authors,
         self.is_primary = is_primary
 
     @classmethod
-    def from_pubmed_id(cls, pubmed_id):
+    def from_pubmed_id(cls, pubmed_id, is_primary=False):
         """Create a Citation from just a PubMed ID.
            This is done by querying NCBI's web API, so requires network access.
 
            :param int pubmed_id: The PubMed identifier.
+           :param bool is_primary: Denotes the most pertinent publication for
+                  the modeling itself; see :class:`Citation` for more info.
            :return: A new Citation for the given identifier.
            :rtype: :class:`Citation`
         """
@@ -851,7 +853,8 @@ def auth_sub(m):
                    volume=enc(ref['volume']) or None,
                    page_range=get_page_range(ref),
                    year=enc(ref['pubdate']).split()[0],
-                   authors=authors, doi=get_doi(ref))
+                   authors=authors, doi=get_doi(ref),
+                   is_primary=is_primary)
 
 
 class ChemComp(object):

diff --git a/modules/core/dependency/python-ihm/ihm/dataset.py b/modules/core/dependency/python-ihm/ihm/dataset.py
@@ -104,7 +104,7 @@ def __init__(self, elements=(), name=None, application=None, details=None):
 
 class CXMSDataset(Dataset):
     """Processed cross-links from a CX-MS experiment"""
-    data_type = 'CX-MS data'
+    data_type = 'Crosslinking-MS data'
 
 
 class MassSpecDataset(Dataset):

diff --git a/modules/core/dependency/python-ihm/ihm/dumper.py b/modules/core/dependency/python-ihm/ihm/dumper.py
@@ -94,8 +94,8 @@ class _AuditConformDumper(Dumper):
     def dump(self, system, writer):
         with writer.category("_audit_conform") as lp:
             # Update to match the version of the IHM dictionary we support:
-            lp.write(dict_name="mmcif_ihm.dic", dict_version="1.25",
-                     dict_location=self.URL % "460a278")
+            lp.write(dict_name="mmcif_ihm.dic", dict_version="1.26",
+                     dict_location=self.URL % "1a0ec62")
 
 
 class _StructDumper(Dumper):
@@ -422,11 +422,19 @@ def _prettyprint_seq(seq, width):
 
 class _StructRefDumper(Dumper):
     def finalize(self, system):
-        ref_id = itertools.count(1)
+        self._refs_by_id = {}
         align_id = itertools.count(1)
         for e in system.entities:
             for r in e.references:
-                r._id = next(ref_id)
+                util._remove_id(r)
+
+        for e in system.entities:
+            seen_refs = {}
+            # Two refs are not considered duplicated if they relate to
+            # different entities, so keep list per entity
+            self._refs_by_id[id(e)] = by_id = []
+            for r in e.references:
+                util._assign_id(r, seen_refs, by_id, seen_obj=r._signature())
                 for a in r._get_alignments():
                     a._id = next(align_id)
 
@@ -513,7 +521,7 @@ def dump(self, system, writer):
                  "pdbx_align_begin", "pdbx_seq_one_letter_code",
                  "details"]) as lp:
             for e in system.entities:
-                for r in e.references:
+                for r in self._refs_by_id[id(e)]:
                     self._check_reference_sequence(e, r)
                     db_begin = min(a.db_begin for a in r._get_alignments())
                     lp.write(id=r._id, entity_id=e._id, db_name=r.db_name,
@@ -526,7 +534,7 @@ def dump(self, system, writer):
     def dump_seq(self, system, writer):
         def _all_alignments():
             for e in system.entities:
-                for r in e.references:
+                for r in self._refs_by_id[id(e)]:
                     for a in r._get_alignments():
                         yield e, r, a
         with writer.loop(
@@ -550,7 +558,7 @@ def dump_seq_dif(self, system, writer):
                 ["pdbx_ordinal", "align_id", "seq_num", "db_mon_id", "mon_id",
                  "details"]) as lp:
             for e in system.entities:
-                for r in e.references:
+                for r in self._refs_by_id[id(e)]:
                     for a in r._get_alignments():
                         for sd in a.seq_dif:
                             lp.write(pdbx_ordinal=next(ordinal),
@@ -1282,11 +1290,18 @@ def _dump_template(self, template, sm, lp, ordinal):
         denom = template.sequence_identity.denominator
         if denom is not None and denom is not ihm.unknown:
             denom = int(denom)
+        # Add offset only if seq_id_range isn't . or ?
+        seq_id_begin = template.seq_id_range[0]
+        if isinstance(template.seq_id_range[0], int):
+            seq_id_begin += off
+        seq_id_end = template.seq_id_range[1]
+        if isinstance(template.seq_id_range[1], int):
+            seq_id_end += off
         lp.write(id=next(ordinal),
                  starting_model_id=sm._id,
                  starting_model_auth_asym_id=sm.asym_id,
-                 starting_model_seq_id_begin=template.seq_id_range[0] + off,
-                 starting_model_seq_id_end=template.seq_id_range[1] + off,
+                 starting_model_seq_id_begin=seq_id_begin,
+                 starting_model_seq_id_end=seq_id_end,
                  template_auth_asym_id=template.asym_id,
                  template_seq_id_begin=template.template_seq_id_range[0],
                  template_seq_id_end=template.template_seq_id_range[1],
@@ -1516,12 +1531,12 @@ def _check_duplicate_atom(self, atom):
         # e.g. multiple bulk water oxygen atoms can have "same" seq_id (None)
         if atom.seq_id is None:
             return
-        k = (atom.asym_unit._id, atom.atom_id, atom.seq_id)
+        k = (atom.asym_unit._id, atom.atom_id, atom.seq_id, atom.alt_id)
         if k in self._seen_atoms:
             raise ValueError(
-                "Multiple atoms with same atom_id (%s) and seq_id (%d) "
-                "found in asym ID %s"
-                % (atom.atom_id, atom.seq_id, atom.asym_unit._id))
+                "Multiple atoms with same atom_id (%s), seq_id (%d) "
+                "and alt_id (%s) found in asym ID %s"
+                % (atom.atom_id, atom.seq_id, atom.alt_id, atom.asym_unit._id))
         self._seen_atoms.add(k)
 
     def _check_assembly(self, obj, asym, seq_id_range):
@@ -1868,7 +1883,7 @@ def finalize(self, system):
                     edge._id = next(edge_id)
 
     def dump(self, system, writer):
-        with writer.loop("_ihm_ordered_ensemble",
+        with writer.loop("_ihm_ordered_model",
                          ["process_id", "process_description", "ordered_by",
                           "step_id", "step_description",
                           "edge_id", "edge_description",
@@ -2019,7 +2034,9 @@ def finalize(self, system):
         for f in system._all_features():
             util._remove_id(f)
         for f in system._all_features():
-            util._assign_id(f, seen_features, self._features_by_id)
+            util._assign_id(f, seen_features, self._features_by_id,
+                            seen_obj=f._signature()
+                            if hasattr(f, '_signature') else f)
 
     def dump(self, system, writer):
         self.dump_list(writer)
@@ -2052,6 +2069,8 @@ def _get_asym_id(x):
             for f in self._features_by_id:
                 if not isinstance(f, restraint.ResidueFeature):
                     continue
+                if not f.ranges:
+                    raise ValueError("%s selects no residues" % f)
                 for r in f.ranges:
                     entity = _get_entity(r)
                     seq = entity.sequence
@@ -2127,7 +2146,8 @@ def finalize(self, system):
         for f in system._all_pseudo_sites():
             util._remove_id(f)
         for f in system._all_pseudo_sites():
-            util._assign_id(f, seen_sites, self._sites_by_id)
+            util._assign_id(f, seen_sites, self._sites_by_id,
+                            seen_obj=f._signature())
 
     def dump(self, system, writer):
         with writer.loop("_ihm_pseudo_site",

diff --git a/modules/core/dependency/python-ihm/ihm/format.py b/modules/core/dependency/python-ihm/ihm/format.py
@@ -812,7 +812,7 @@ def read_file(self, filters=None):
            of the tokens is subject to change and is not currently documented;
            however, each token or group object has an ``as_mmcif`` method
            which returns the corresponding text in mmCIF format. Thus, the
-           file can be reconstructed by concatentating the result of
+           file can be reconstructed by concatenating the result of
            ``as_mmcif`` for all tokens.
 
            :exc:`CifParserError` will be raised if the file cannot be parsed.
@@ -982,7 +982,7 @@ class CifReader(_Reader, _CifTokenizer):
               and ] characters, since these are not valid for Python
               identifiers). The object will be called with the data from
               the file as a set of strings, or `not_in_file`, `omitted` or
-              `unkonwn` for any keyword that is not present in the file,
+              `unknown` for any keyword that is not present in the file,
               the mmCIF omitted value (.), or mmCIF unknown value (?)
               respectively. (mmCIF keywords are case insensitive, so this
               class always treats them as lowercase regardless of the

diff --git a/modules/core/dependency/python-ihm/ihm/location.py b/modules/core/dependency/python-ihm/ihm/location.py
@@ -182,6 +182,13 @@ class AlphaFoldDBLocation(DatabaseLocation):
     db_name = 'AlphaFoldDB'
 
 
+class ProteomeXchangeLocation(DatabaseLocation):
+    """Something stored in the ProteomeXchange database.
+       See :class:`DatabaseLocation` for a description of the parameters
+       and :class:`Location` for discussion of the usage of these objects."""
+    db_name = 'ProteomeXchange'
+
+
 class FileLocation(Location):
     """Base class for an individual file or directory stored externally.