2 Scoring of quaternary structures (QS). The QS scoring is according to the paper
3 by `Bertoni et al. <https://dx.doi.org/10.1038/s41598-017-09654-8>`_.
9 - A default :class:`compound library <ost.conop.CompoundLib>` must be defined
10 and accessible via :func:`~ost.conop.GetDefaultLib`. This is set by default
11 when executing scripts with ``ost``. Otherwise, you must set this with
12 :func:`~ost.conop.SetDefaultLib`.
13 - ClustalW must be installed (unless you provide chain mappings)
14 - Python modules `numpy` and `scipy` must be installed and available
15 (e.g. use ``pip install scipy numpy``)
17 Authors: Gerardo Tauriello, Martino Bertoni
20 from ost
import mol, geom, conop, seq, settings, PushVerbosityLevel
21 from ost
import LogError, LogWarning, LogScript, LogInfo, LogVerbose, LogDebug
26 from scipy.misc
import factorial
27 from scipy.special
import binom
28 from scipy.cluster.hierarchy
import fclusterdata
36 """Exception to be raised for "acceptable" exceptions in QS scoring.
38 Those are cases we might want to capture for default behavior.
43 """Object to compute QS scores.
45 Simple usage without any precomputed contacts, symmetries and mappings:
47 .. code-block:: python
50 from ost.mol.alg import qsscoring
52 # load two biounits to compare
53 ent_full = ost.io.LoadPDB('3ia3', remote=True)
54 ent_1 = ent_full.Select('cname=A,D')
55 ent_2 = ent_full.Select('cname=B,C')
57 ost.PushVerbosityLevel(3)
59 qs_scorer = qsscoring.QSscorer(ent_1, ent_2)
60 ost.LogScript('QSscore:', str(qs_scorer.global_score))
61 ost.LogScript('Chain mapping used:', str(qs_scorer.chain_mapping))
62 # commonly you want the QS global score as output
63 qs_score = qs_scorer.global_score
64 except qsscoring.QSscoreError as ex:
65 # default handling: report failure and set score to 0
66 ost.LogError('QSscore failed:', str(ex))
69 For maximal performance when computing QS scores of the same entity with many
70 others, it is advisable to construct and reuse :class:`QSscoreEntity` objects.
72 Any known / precomputed information can be filled into the appropriate
73 attribute here (no checks done!). Otherwise most quantities are computed on
74 first access and cached (lazy evaluation). Setters are provided to set values
75 with extra checks (e.g. :func:`SetSymmetries`).
77 All necessary seq. alignments are done by global BLOSUM62-based alignment. A
78 multiple sequence alignment is performed with ClustalW unless
79 :attr:`chain_mapping` is provided manually. You will need to have an
80 executable ``clustalw`` or ``clustalw2`` in your ``PATH`` or you must set
81 :attr:`clustalw_bin` accordingly. Otherwise an exception
82 (:class:`ost.settings.FileNotFound`) is thrown.
84 Formulas for QS scores:
88 - QS_best = weighted_scores / (weight_sum + weight_extra_mapped)
89 - QS_global = weighted_scores / (weight_sum + weight_extra_all)
90 -> weighted_scores = sum(w(min(d1,d2)) * (1 - abs(d1-d2)/12)) for shared
91 -> weight_sum = sum(w(min(d1,d2))) for shared
92 -> weight_extra_mapped = sum(w(d)) for all mapped but non-shared
93 -> weight_extra_all = sum(w(d)) for all non-shared
94 -> w(d) = 1 if d <= 5, exp(-2 * ((d-5.0)/4.28)^2) else
96 In the formulas above:
98 * "d": CA/CB-CA/CB distance of an "inter-chain contact" ("d1", "d2" for
100 * "mapped": we could map chains of two structures and align residues in
102 * "shared": pairs of residues which are "mapped" and have
103 "inter-chain contact" in both structures.
104 * "inter-chain contact": CB-CB pairs (CA for GLY) with distance <= 12 A
105 (fallback to CA-CA if :attr:`calpha_only` is True).
106 * "w(d)": weighting function (prob. of 2 res. to interact given CB distance)
107 from `Xu et al. 2009 <https://dx.doi.org/10.1016%2Fj.jmb.2008.06.002>`_.
109 :param ent_1: First structure to be scored.
110 :type ent_1: :class:`QSscoreEntity`, :class:`~ost.mol.EntityHandle` or
111 :class:`~ost.mol.EntityView`
112 :param ent_2: Second structure to be scored.
113 :type ent_2: :class:`QSscoreEntity`, :class:`~ost.mol.EntityHandle` or
114 :class:`~ost.mol.EntityView`
115 :param res_num_alignment: Sets :attr:`res_num_alignment`
117 :raises: :class:`QSscoreError` if input structures are invalid or are monomers
118 or have issues that make it impossible for a QS score to be computed.
120 .. attribute:: qs_ent_1
122 :class:`QSscoreEntity` object for *ent_1* given at construction.
123 If entity names (:attr:`~QSscoreEntity.original_name`) are not unique, we
124 set it to 'pdb_1' using :func:`~QSscoreEntity.SetName`.
126 .. attribute:: qs_ent_2
128 :class:`QSscoreEntity` object for *ent_2* given at construction.
129 If entity names (:attr:`~QSscoreEntity.original_name`) are not unique, we
130 set it to 'pdb_2' using :func:`~QSscoreEntity.SetName`.
132 .. attribute:: calpha_only
134 True if any of the two structures is CA-only (after cleanup).
138 .. attribute:: max_ca_per_chain_for_cm
140 Maximal number of CA atoms to use in each chain to determine chain mappings.
141 Setting this to -1 disables the limit. Limiting it speeds up determination
142 of symmetries and chain mappings. By default it is set to 100.
146 .. attribute:: res_num_alignment
148 Forces each alignment in :attr:`alignments` to be based on residue numbers
149 instead of using a global BLOSUM62-based alignment.
153 def __init__(self, ent_1, ent_2, res_num_alignment=False):
155 if isinstance(ent_1, QSscoreEntity):
159 if isinstance(ent_2, QSscoreEntity):
164 if not self.qs_ent_1.is_valid
or not self.qs_ent_2.is_valid:
167 if self.qs_ent_1.original_name == self.qs_ent_2.original_name:
168 self.qs_ent_1.SetName(
'pdb_1')
169 self.qs_ent_2.SetName(
'pdb_2')
171 self.qs_ent_1.SetName(self.qs_ent_1.original_name)
172 self.qs_ent_2.SetName(self.qs_ent_2.original_name)
175 self.
calpha_only = self.qs_ent_1.calpha_only
or self.qs_ent_2.calpha_only
194 """Inter-complex mapping of chemical groups.
196 Each group (see :attr:`QSscoreEntity.chem_groups`) is mapped according to
197 highest sequence identity. Alignment is between longest sequences in groups.
201 - If different numbers of groups, we map only the groups for the complex
202 with less groups (rest considered unmapped and shown as warning)
203 - The mapping is forced: the "best" mapping will be chosen independently of
204 how low the seq. identity may be
206 :getter: Computed on first use (cached)
207 :type: :class:`dict` with key = :class:`tuple` of chain names in
208 :attr:`qs_ent_1` and value = :class:`tuple` of chain names in
211 :raises: :class:`QSscoreError` if we end up having no chains for either
212 entity in the mapping (can happen if chains do not have CA atoms).
220 """Subset of :attr:`qs_ent_1` used to compute chain mapping and symmetries.
224 - Includes only residues aligned according to :attr:`chem_mapping`
225 - Includes only 1 CA atom per residue
226 - Has at least 5 and at most :attr:`max_ca_per_chain_for_cm` atoms per chain
227 - All chains of the same chemical group have the same number of atoms
228 (also in :attr:`ent_to_cm_2` according to :attr:`chem_mapping`)
229 - All chains appearing in :attr:`chem_mapping` appear in this entity
230 (so the two can be safely used together)
232 This entity might be transformed (i.e. all positions rotated/translated by
233 same transformation matrix) if this can speed up computations. So do not
234 assume fixed global positions (but relative distances will remain fixed).
236 :getter: Computed on first use (cached)
237 :type: :class:`~ost.mol.EntityHandle`
239 :raises: :class:`QSscoreError` if any chain ends up having less than 5 res.
247 """Subset of :attr:`qs_ent_1` used to compute chain mapping and symmetries
248 (see :attr:`ent_to_cm_1` for details).
256 """Symmetry groups for :attr:`qs_ent_1` used to speed up chain mapping.
258 This is a list of chain-lists where each chain-list can be used reconstruct
259 the others via cyclic C or dihedral D symmetry. The first chain-list is used
260 as a representative symmetry group. For heteromers, the group-members must
261 contain all different seqres in oligomer.
263 Example: symm. groups [(A,B,C), (D,E,F), (G,H,I)] means that there are
264 symmetry transformations to get (D,E,F) and (G,H,I) from (A,B,C).
268 - All symmetry group tuples have the same length (num. of chains)
269 - All chains in :attr:`ent_to_cm_1` appear (w/o duplicates)
270 - For heteros: symmetry group tuples have all different chem. groups
271 - Trivial symmetry group = one tuple with all chains (used if inconsistent
272 data provided or if no symmetry is found)
273 - Either compatible to :attr:`symm_2` or trivial symmetry groups used.
274 Compatibility requires same lengths of symmetry group tuples and it must
275 be possible to get an overlap (80% of residues covered within 6 A of a
276 (chem. mapped) chain) of all chains in representative symmetry groups by
277 superposing one pair of chains.
279 :getter: Computed on first use (cached)
280 :type: :class:`list` of :class:`tuple` of :class:`str` (chain names)
288 """Symmetry groups for :attr:`qs_ent_2` (see :attr:`symm_1` for details)."""
294 """Set user-provided symmetry groups.
296 These groups are restricted to chain names appearing in :attr:`ent_to_cm_1`
297 and :attr:`ent_to_cm_2` respectively. They are only valid if they cover all
298 chains and both *symm_1* and *symm_2* have same lengths of symmetry group
299 tuples. Otherwise trivial symmetry group used (see :attr:`symm_1`).
301 :param symm_1: Value to set for :attr:`symm_1`.
302 :param symm_2: Value to set for :attr:`symm_2`.
309 self.
_symm_1 = [tuple(ch.name
for ch
in self.ent_to_cm_1.chains)]
310 self.
_symm_2 = [tuple(ch.name
for ch
in self.ent_to_cm_2.chains)]
314 """Mapping from :attr:`ent_to_cm_1` to :attr:`ent_to_cm_2`.
318 - Mapping is between chains of same chem. group (see :attr:`chem_mapping`)
319 - Each chain can appear only once in mapping
320 - All chains of complex with less chains are mapped
321 - Symmetry (:attr:`symm_1`, :attr:`symm_2`) is taken into account
323 Details on algorithms used to find mapping:
325 - We try all pairs of chem. mapped chains within symmetry group and get
326 superpose-transformation for them
327 - First option: check for "sufficient overlap" of other chain-pairs
329 - For each chain-pair defined above: apply superposition to full oligomer
330 and map chains based on structural overlap
331 - Structural overlap = X% of residues in second oligomer covered within Y
332 Angstrom of a (chem. mapped) chain in first oligomer. We successively
333 try (X,Y) = (80,4), (40,6) and (20,8) to be less and less strict in
334 mapping (warning shown for most permissive one).
335 - If multiple possible mappings are found, we choose the one which leads
336 to the lowest multi-chain-RMSD given the superposition
338 - Fallback option: try all mappings to find minimal multi-chain-RMSD
341 - For each chain-pair defined above: apply superposition, try all (!)
342 possible chain mappings (within symmetry group) and keep mapping with
343 lowest multi-chain-RMSD
344 - Repeat procedure above to resolve symmetry. Within the symmetry group we
345 can use the chain mapping computed before and we just need to find which
346 symmetry group in first oligomer maps to which in the second one. We
347 again try all possible combinations...
350 - Trying all possible mappings is a combinatorial nightmare (factorial).
351 We throw an exception if too many combinations (e.g. octomer vs
352 octomer with no usable symmetry)
353 - The mapping is forced: the "best" mapping will be chosen independently
354 of how badly they fit in terms of multi-chain-RMSD
355 - As a result, such a forced mapping can lead to a large range of
356 resulting QS scores. An extreme example was observed between 1on3.1
357 and 3u9r.1, where :attr:`global_score` can range from 0.12 to 0.43
358 for mappings with very similar multi-chain-RMSD.
360 :getter: Computed on first use (cached)
361 :type: :class:`dict` with key / value = :class:`str` (chain names, key
362 for :attr:`ent_to_cm_1`, value for :attr:`ent_to_cm_2`)
363 :raises: :class:`QSscoreError` if there are too many combinations to check
364 to find a chain mapping.
375 """Mapping scheme used to get :attr:`chain_mapping`.
379 - 'strict': 80% overlap needed within 4 Angstrom (overlap based mapping).
380 - 'tolerant': 40% overlap needed within 6 Angstrom (overlap based mapping).
381 - 'permissive': 20% overlap needed within 8 Angstrom (overlap based
382 mapping). It's best if you check mapping manually!
383 - 'extensive': Extensive search used for mapping detection (fallback). This
384 approach has known limitations and may be removed in future versions.
385 Mapping should be checked manually!
386 - 'user': :attr:`chain_mapping` was set by user before first use of this
389 :getter: Computed with :attr:`chain_mapping` on first use (cached)
391 :raises: :class:`QSscoreError` as in :attr:`chain_mapping`.
404 """List of successful sequence alignments using :attr:`chain_mapping`.
406 There will be one alignment for each mapped chain and they are ordered by
407 their chain names in :attr:`qs_ent_1`.
409 The first sequence of each alignment belongs to :attr:`qs_ent_1` and the
410 second one to :attr:`qs_ent_2`. The sequences are named according to the
411 mapped chain names and have views attached into :attr:`QSscoreEntity.ent`
412 of :attr:`qs_ent_1` and :attr:`qs_ent_2`.
414 If :attr:`res_num_alignment` is False, each alignment is performed using a
415 global BLOSUM62-based alignment. Otherwise, the positions in the alignment
416 sequences are simply given by the residue number so that residues with
417 matching numbers are aligned.
419 :getter: Computed on first use (cached)
420 :type: :class:`list` of :class:`~ost.seq.AlignmentHandle`
423 self.
_alignments = _GetMappedAlignments(self.qs_ent_1.ent,
431 """Mapping of shared residues in :attr:`alignments`.
433 :getter: Computed on first use (cached)
434 :type: :class:`dict` *mapped_residues[c1][r1] = r2* with:
435 *c1* = Chain name in first entity (= first sequence in aln),
436 *r1* = Residue number in first entity,
437 *r2* = Residue number in second entity
445 """QS-score with penalties.
447 The range of the score is between 0 (i.e. no interface residues are shared
448 between biounits) and 1 (i.e. the interfaces are identical).
450 The global QS-score is computed applying penalties when interface residues
451 or entire chains are missing (i.e. anything that is not mapped in
452 :attr:`mapped_residues` / :attr:`chain_mapping`) in one of the biounits.
454 :getter: Computed on first use (cached)
455 :type: :class:`float`
456 :raises: :class:`QSscoreError` if only one chain is mapped
464 """QS-score without penalties.
466 Like :attr:`global_score`, but neglecting additional residues or chains in
467 one of the biounits (i.e. the score is calculated considering only mapped
468 chains and residues).
470 :getter: Computed on first use (cached)
471 :type: :class:`float`
472 :raises: :class:`QSscoreError` if only one chain is mapped
480 """Superposition result based on shared CA atoms in :attr:`alignments`.
482 The superposition can be used to map :attr:`QSscoreEntity.ent` of
483 :attr:`qs_ent_1` onto the one of :attr:`qs_ent_2`. Use
484 :func:`ost.geom.Invert` if you need the opposite transformation.
486 :getter: Computed on first use (cached)
487 :type: :class:`ost.mol.alg.SuperpositionResult`
492 sup_rmsd = self._superposition.rmsd
493 cmp_view = self._superposition.view1
494 LogInfo(
'CA RMSD for %s aligned residues on %s chains: %.2f' \
495 % (cmp_view.residue_count, cmp_view.chain_count, sup_rmsd))
501 Full path to ``clustalw`` or ``clustalw2`` executable to use for multiple
502 sequence alignments (unless :attr:`chain_mapping` is provided manually).
504 :getter: Located in path on first use (cached)
508 self.
_clustalw_bin = settings.Locate((
'clustalw',
'clustalw2'))
513 :return: :class:`OligoLDDTScorer` object, setup for this QS scoring problem.
514 :param settings: Passed to :class:`OligoLDDTScorer` constructor.
515 :param penalize_extra_chains: Passed to :class:`OligoLDDTScorer` constructor.
517 if penalize_extra_chains:
530 def _ComputeAlignedEntities(self):
531 """Fills cached ent_to_cm_1 and ent_to_cm_2."""
541 self._ent_to_cm_1.SetName(self.qs_ent_1.GetName())
542 self._ent_to_cm_2.SetName(self.qs_ent_2.GetName())
544 def _ComputeSymmetry(self):
545 """Fills cached symm_1 and symm_2."""
552 self.
_symm_1 = [tuple(ch.name
for ch
in self.ent_to_cm_1.chains)]
553 self.
_symm_2 = [tuple(ch.name
for ch
in self.ent_to_cm_2.chains)]
555 def _ComputeScores(self):
556 """Fills cached global_score and best_score."""
558 raise QSscoreError(
"QS-score is not defined for monomers")
561 contacts_1 = self.qs_ent_1.contacts_ca
562 contacts_2 = self.qs_ent_2.contacts_ca
564 contacts_1 = self.qs_ent_1.contacts
565 contacts_2 = self.qs_ent_2.contacts
572 LogInfo(
'QSscore %s, %s: best: %.2f, global: %.2f' \
573 % (self.qs_ent_1.GetName(), self.qs_ent_2.GetName(),
582 """Entity with cached entries for QS scoring.
584 Any known / precomputed information can be filled into the appropriate
585 attribute here as long as they are labelled as read/write. Otherwise the
586 quantities are computed on first access and cached (lazy evaluation). The
587 heaviest load is expected when computing :attr:`contacts` and
590 :param ent: Entity to be used for QS scoring. A copy of it will be processed.
591 :type ent: :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
593 .. attribute:: is_valid
595 True, if successfully initialized. False, if input structure is monomer or
596 has less than 2 protein chains with >= 20 residues.
600 .. attribute:: original_name
602 Name set for *ent* when object was created.
608 Cleaned version of *ent* passed at construction. Hydrogens are removed, the
609 entity is processed with a :class:`~ost.conop.RuleBasedProcessor` and chains
610 listed in :attr:`removed_chains` have been removed. The name of this entity
611 might change during scoring (see :func:`GetName`). Otherwise, this will be
614 :type: :class:`~ost.mol.EntityHandle`
616 .. attribute:: removed_chains
618 Chains removed from *ent* passed at construction. These are ligand and water
619 chains as well as small (< 20 res.) peptides or chains with no amino acids
620 (determined by chem. type, which is set by rule based processor).
622 :type: :class:`list` of :class:`str`
624 .. attribute:: calpha_only
626 Whether entity is CA-only (i.e. it has 0 CB atoms)
633 ent = mol.CreateEntityFromView(ent.Select(
'ele!=H and aname!=HN'),
True)
634 if not conop.GetDefaultLib():
635 raise RuntimeError(
"QSscore computation requires a compound library!")
638 self.ent, self.removed_chains, self.
calpha_only = _CleanInputEntity(ent)
640 if self.ent.chain_count == 0:
641 LogError(
'Bad input file: ' + ent.GetName() +
'. No chains left after '
642 'removing water, ligands and small peptides.')
644 elif self.ent.chain_count == 1:
645 LogWarning(
'Structure ' + ent.GetName() +
' is a monomer.')
660 """Wrapper to :func:`~ost.mol.EntityHandle.GetName` of :attr:`ent`.
661 This is used to uniquely identify the entity while scoring. The name may
662 therefore change while :attr:`original_name` remains fixed.
665 return self.ent.GetName()
668 """Wrapper to :func:`~ost.mol.EntityHandle.SetName` of :attr:`ent`.
669 Use this to change unique identifier while scoring (see :func:`GetName`).
672 self.ent.SetName(new_name)
677 Reduced representation of :attr:`ent` with only CA atoms.
678 This guarantees that each included residue has exactly one atom.
680 :getter: Computed on first use (cached)
681 :type: :class:`~ost.mol.EntityHandle`
690 Map of chain names in :attr:`ent` to sequences with attached view to CA-only
691 chains (into :attr:`ca_entity`). Useful for alignments and superpositions.
693 :getter: Computed on first use (cached)
694 :type: :class:`dict` (key = :class:`str`,
695 value = :class:`~ost.seq.SequenceHandle`)
700 for ch
in ca_entity.chains:
701 self.
_ca_chains[ch.name] = seq.SequenceFromChain(ch.name, ch)
705 """Get sequence alignment of chain *c1* with chain *c2*.
706 Computed on first use based on :attr:`ca_chains` (cached).
708 :param c1: Chain name for first chain to align.
709 :type c1: :class:`str`
710 :param c2: Chain name for second chain to align.
711 :type c2: :class:`str`
712 :rtype: :class:`~ost.seq.AlignmentHandle` or None if it failed.
716 self.
_alignments[(c1,c2)] = _AlignAtomSeqs(ca_chains[c1], ca_chains[c2])
722 Intra-complex group of chemically identical (seq. id. > 95%) polypeptide
723 chains as extracted from :attr:`ca_chains`. First chain in group is the one
724 with the longest sequence.
726 :getter: Computed on first use (cached)
727 :type: :class:`list` of :class:`list` of :class:`str` (chain names)
731 LogInfo(
'Chemically equivalent chain-groups in %s: %s' \
736 """Get Euler angles from superposition of chain *c1* with chain *c2*.
737 Computed on first use based on :attr:`ca_chains` (cached).
739 :param c1: Chain name for first chain to superpose.
740 :type c1: :class:`str`
741 :param c2: Chain name for second chain to superpose.
742 :type c2: :class:`str`
743 :return: 3 Euler angles (may contain nan if something fails).
744 :rtype: :class:`numpy.array`
746 if (c1,c2)
not in self.
_angles:
751 """Get axis of symmetry from superposition of chain *c1* with chain *c2*.
752 Computed on first use based on :attr:`ca_chains` (cached).
754 :param c1: Chain name for first chain to superpose.
755 :type c1: :class:`str`
756 :param c2: Chain name for second chain to superpose.
757 :type c2: :class:`str`
758 :return: Rotational axis (may contain nan if something fails).
759 :rtype: :class:`numpy.array`
761 if (c1,c2)
not in self.
_axis:
763 return self.
_axis[(c1,c2)]
768 Connectivity dictionary (**read/write**).
769 As given by :func:`GetContacts` with *calpha_only* = False on :attr:`ent`.
771 :getter: Computed on first use (cached)
772 :setter: Uses :func:`FilterContacts` to ensure that we only keep contacts
773 for chains in the cleaned entity.
774 :type: See return type of :func:`GetContacts`
782 chain_names = set([ch.name
for ch
in self.ent.chains])
788 CA-only connectivity dictionary (**read/write**).
789 Like :attr:`contacts` but with *calpha_only* = True in :func:`GetContacts`.
797 chain_names = set([ch.name
for ch
in self.ent.chains])
804 def _GetSuperposeData(self, c1, c2):
805 """Fill _angles and _axis from superposition of CA chains of c1 and c2."""
810 self.
_angles[(c1,c2)] = np.empty(3) * np.nan
811 self.
_axis[(c1,c2)] = np.empty(3) * np.nan
813 v1, v2 = seq.ViewsFromAlignment(aln)
814 if v1.atom_count < 3:
816 self.
_angles[(c1,c2)] = np.empty(3) * np.nan
817 self.
_axis[(c1,c2)] = np.empty(3) * np.nan
820 sup_res = mol.alg.SuperposeSVD(v1, v2, apply_transform=
False)
821 Rt = sup_res.transformation
823 a,b,c = _GetAngles(Rt)
824 self.
_angles[(c1,c2)] = np.asarray([a,b,c])
827 self.
_axis[(c1,c2)] = np.asarray(R3.GetRotationAxis().data)
834 """Filter contacts to contain only contacts for chains in *chain_names*.
836 :param contacts: Connectivity dictionary as produced by :func:`GetContacts`.
837 :type contacts: :class:`dict`
838 :param chain_names: Chain names to keep.
839 :type chain_names: :class:`list` or (better) :class:`set`
840 :return: New connectivity dictionary (format as in :func:`GetContacts`)
841 :rtype: :class:`dict`
844 filtered_contacts = dict()
846 if c1
in chain_names:
847 new_contacts = dict()
848 for c2
in contacts[c1]:
849 if c2
in chain_names:
850 new_contacts[c2] = contacts[c1][c2]
853 filtered_contacts[c1] = new_contacts
854 return filtered_contacts
857 """Get inter-chain contacts of a macromolecular entity.
859 Contacts are pairs of residues within a given distance belonging to different
860 chains. They are stored once per pair and include the CA/CB-CA/CB distance.
862 :param entity: An entity to check connectivity for.
863 :type entity: :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
864 :param calpha_only: If True, we only consider CA-CA distances. Else, we use CB
865 unless the residue is a GLY.
866 :type calpha_only: :class:`bool`
867 :param dist_thr: Maximal CA/CB-CA/CB distance to be considered in contact.
868 :type dist_thr: :class:`float`
869 :return: A connectivity dictionary. A pair of residues with chain names
870 *ch_name1* & *ch_name2* (*ch_name1* < *ch_name2*), residue numbers
871 *res_num1* & *res_num2* and distance *dist* (<= *dist_thr*) are
872 stored as *result[ch_name1][ch_name2][res_num1][res_num2]* = *dist*.
873 :rtype: :class:`dict`
877 ev = entity.Select(
"aname=CA")
879 ev = entity.Select(
"(rname=GLY and aname=CA) or aname=CB")
880 ent = mol.CreateEntityFromView(ev,
True)
883 for atom
in ent.atoms:
884 ch_name1 = atom.chain.name
885 res_num1 = atom.residue.number.num
886 close_atoms = ent.FindWithin(atom.pos, dist_thr)
887 for close_atom
in close_atoms:
888 ch_name2 = close_atom.chain.name
889 if ch_name2 > ch_name1:
890 res_num2 = close_atom.residue.number.num
893 if ch_name1
not in contacts:
894 contacts[ch_name1] = dict()
895 if ch_name2
not in contacts[ch_name1]:
896 contacts[ch_name1][ch_name2] = dict()
897 if res_num1
not in contacts[ch_name1][ch_name2]:
898 contacts[ch_name1][ch_name2][res_num1] = dict()
899 contacts[ch_name1][ch_name2][res_num1][res_num2] = round(dist, 3)
908 """Helper class to calculate oligomeric lDDT scores.
910 This class can be used independently, but commonly it will be created by
911 calling :func:`QSscorer.GetOligoLDDTScorer`.
915 By construction, lDDT scores are not symmetric and hence it matters which
916 structure is the reference (:attr:`ref`) and which one is the model
917 (:attr:`mdl`). Extra residues in the model are generally not considered.
918 Extra chains in both model and reference can be considered by setting the
919 :attr:`penalize_extra_chains` flag to True.
921 :param ref: Sets :attr:`ref`
922 :param mdl: Sets :attr:`mdl`
923 :param alignments: Sets :attr:`alignments`
924 :param calpha_only: Sets :attr:`calpha_only`
925 :param settings: Sets :attr:`settings`
926 :param penalize_extra_chains: Sets :attr:`penalize_extra_chains`
927 :param chem_mapping: Sets :attr:`chem_mapping`. Must be given if
928 *penalize_extra_chains* is True.
933 Full reference/model entity to be scored. The entity must contain all chains
934 mapped in :attr:`alignments` and may also contain additional ones which are
935 considered if :attr:`penalize_extra_chains` is True.
937 :type: :class:`~ost.mol.EntityHandle`
939 .. attribute:: alignments
941 One alignment for each mapped chain of :attr:`ref`/:attr:`mdl` as defined in
942 :attr:`QSscorer.alignments`. The first sequence of each alignment belongs to
943 :attr:`ref` and the second one to :attr:`mdl`. Sequences must have sequence
944 naming and attached views as defined in :attr:`QSscorer.alignments`.
946 :type: :class:`list` of :class:`~ost.seq.AlignmentHandle`
948 .. attribute:: calpha_only
950 If True, restricts lDDT score to CA only.
954 .. attribute:: settings
956 Settings to use for lDDT scoring.
958 :type: :class:`~ost.mol.alg.lDDTSettings`
960 .. attribute:: penalize_extra_chains
962 If True, extra chains in both :attr:`ref` and :attr:`mdl` will penalize the
967 .. attribute:: chem_mapping
969 Inter-complex mapping of chemical groups as defined in
970 :attr:`QSscorer.chem_mapping`. Used to find "chem-mapped" chains in
971 :attr:`ref` for unmapped chains in :attr:`mdl` when penalizing scores.
972 Each unmapped model chain can add extra reference-contacts according to the
973 average total contacts of each single "chem-mapped" reference chain. If
974 there is no "chem-mapped" reference chain, a warning is shown and the model
978 Only relevant if :attr:`penalize_extra_chains` is True.
980 :type: :class:`dict` with key = :class:`tuple` of chain names in
981 :attr:`ref` and value = :class:`tuple` of chain names in
988 def __init__(self, ref, mdl, alignments, calpha_only, settings,
989 penalize_extra_chains=
False, chem_mapping=
None):
991 if chem_mapping
is None and penalize_extra_chains:
992 raise RuntimeError(
"Must provide chem_mapping when requesting penalty "
994 if not penalize_extra_chains:
997 if unmapped_mdl_chains:
998 LogWarning(
'MODEL contains chains unmapped to REFERENCE, '
999 'lDDT is not considering MODEL chains %s' \
1000 % str(list(unmapped_mdl_chains)))
1002 ref_chains = set(ch.name
for ch
in ref.chains)
1003 mapped_ref_chains = set(aln.GetSequence(0).GetName()
for aln
in alignments)
1004 unmapped_ref_chains = (ref_chains - mapped_ref_chains)
1005 if unmapped_ref_chains:
1006 LogWarning(
'REFERENCE contains chains unmapped to MODEL, '
1007 'lDDT is not considering REFERENCE chains %s' \
1008 % str(list(unmapped_ref_chains)))
1029 """Oligomeric lDDT score.
1031 The score is computed as conserved contacts divided by the total contacts
1032 in the reference using the :attr:`oligo_lddt_scorer`, which uses the full
1033 complex as reference/model structure. If :attr:`penalize_extra_chains` is
1034 True, the reference/model complexes contain all chains (otherwise only the
1035 mapped ones) and additional contacts are added to the reference's total
1036 contacts for unmapped model chains according to the :attr:`chem_mapping`.
1038 The main difference with :attr:`weighted_lddt` is that the lDDT scorer
1039 "sees" the full complex here (incl. inter-chain contacts), while the
1040 weighted single chain score looks at each chain separately.
1042 :getter: Computed on first use (cached)
1043 :type: :class:`float`
1046 LogInfo(
'Reference %s has: %s chains' \
1047 % (self.ref.GetName(), self.ref.chain_count))
1048 LogInfo(
'Model %s has: %s chains' \
1049 % (self.mdl.GetName(), self.mdl.chain_count))
1053 denominator = self.oligo_lddt_scorer.total_contacts
1056 oligo_lddt = self.oligo_lddt_scorer.conserved_contacts \
1057 / float(denominator)
1061 oligo_lddt = self.oligo_lddt_scorer.global_score
1067 """Weighted average of single chain lDDT scores.
1069 The score is computed as a weighted average of single chain lDDT scores
1070 (see :attr:`sc_lddt_scorers`) using the total contacts of each single
1071 reference chain as weights. If :attr:`penalize_extra_chains` is True,
1072 unmapped chains are added with a 0 score and total contacts taken from
1073 the actual reference chains or (for unmapped model chains) using the
1074 :attr:`chem_mapping`.
1076 See :attr:`oligo_lddt` for a comparison of the two scores.
1078 :getter: Computed on first use (cached)
1079 :type: :class:`float`
1084 nominator = sum([s * w
for s, w
in zip(scores, weights)])
1087 denominator = sum(s.total_contacts
for s
in ref_scorers.values())
1090 denominator = sum(weights)
1099 """The reference entity used for oligomeric lDDT scoring
1100 (:attr:`oligo_lddt` / :attr:`oligo_lddt_scorer`).
1102 Since the lDDT computation requires a single chain with mapped residue
1103 numbering, all chains of :attr:`ref` are appended into a single chain X with
1104 unique residue numbers according to the column-index in the alignment. The
1105 alignments are in the same order as they appear in :attr:`alignments`.
1106 Additional residues are appended at the end of the chain with unique residue
1107 numbers. Unmapped chains are only added if :attr:`penalize_extra_chains` is
1108 True. Only CA atoms are considered if :attr:`calpha_only` is True.
1110 :getter: Computed on first use (cached)
1111 :type: :class:`~ost.mol.EntityHandle`
1119 """The model entity used for oligomeric lDDT scoring
1120 (:attr:`oligo_lddt` / :attr:`oligo_lddt_scorer`).
1122 Like :attr:`lddt_ref`, this is a single chain X containing all chains of
1123 :attr:`mdl`. The residue numbers match the ones in :attr:`lddt_ref` where
1124 aligned and have unique numbers for additional residues.
1126 :getter: Computed on first use (cached)
1127 :type: :class:`~ost.mol.EntityHandle`
1135 """lDDT Scorer object for :attr:`lddt_ref` and :attr:`lddt_mdl`.
1137 :getter: Computed on first use (cached)
1138 :type: :class:`~ost.mol.alg.lDDTScorer`
1142 references=[self.lddt_ref.Select(
"")],
1143 model=self.lddt_mdl.Select(
""),
1149 """List of scorer objects for each chain mapped in :attr:`alignments`.
1151 :getter: Computed on first use (cached)
1152 :type: :class:`list` of :class:`MappedLDDTScorer`
1159 self._mapped_lddt_scorers.append(mapped_lddt_scorer)
1164 """List of lDDT scorer objects extracted from :attr:`mapped_lddt_scorers`.
1166 :type: :class:`list` of :class:`~ost.mol.alg.lDDTScorer`
1172 """List of global scores extracted from :attr:`sc_lddt_scorers`.
1174 If scoring for a mapped chain fails, an error is displayed and a score of 0
1177 :getter: Computed on first use (cached)
1178 :type: :class:`list` of :class:`float`
1184 self._sc_lddt.append(lddt_scorer.global_score)
1185 except Exception
as ex:
1186 LogError(
'Single chain lDDT failed:', str(ex))
1187 self._sc_lddt.append(0.0)
1194 def _PrepareOligoEntities(self):
1201 def _GetUnmappedMdlChains(mdl, alignments):
1203 mdl_chains = set(ch.name
for ch
in mdl.chains)
1204 mapped_mdl_chains = set(aln.GetSequence(1).GetName()
for aln
in alignments)
1205 return (mdl_chains - mapped_mdl_chains)
1207 def _GetRefScorers(self):
1211 ref_scorers = dict()
1213 ref_ch_name = mapped_lddt_scorer.reference_chain_name
1214 ref_scorers[ref_ch_name] = mapped_lddt_scorer.lddt_scorer
1216 for ch
in self.ref.chains:
1217 if ch.name
not in ref_scorers:
1219 ref_chain = ch.Select(
'aname=CA')
1221 ref_chain = ch.Select(
'')
1223 references=[ref_chain],
1231 def _GetModelPenalty(self):
1237 raise RuntimeError(
"Must provide chem_mapping when requesting penalty "
1238 "for extra model chains!")
1245 for ch_name_mdl
in sorted(unmapped_mdl_chains):
1248 for cm_ref, cm_mdl
in self.chem_mapping.iteritems():
1249 if ch_name_mdl
in cm_mdl:
1252 for ch_name_ref
in cm_ref:
1254 cur_penalty += ref_scorers[ch_name_ref].total_contacts
1255 cur_penalty /= float(len(cm_ref))
1258 if cur_penalty
is None:
1259 LogWarning(
'Extra MODEL chain %s could not be chemically mapped to '
1260 'any chain in REFERENCE, lDDT cannot consider it!' \
1263 LogScript(
'Extra MODEL chain %s added to lDDT score by considering '
1264 'chemically mapped chains in REFERENCE.' % ch_name_mdl)
1265 model_penalty += cur_penalty
1273 """A simple class to calculate a single-chain lDDT score on a given chain to
1274 chain mapping as extracted from :class:`OligoLDDTScorer`.
1276 :param alignment: Sets :attr:`alignment`
1277 :param calpha_only: Sets :attr:`calpha_only`
1278 :param settings: Sets :attr:`settings`
1280 .. attribute:: alignment
1282 Alignment with two sequences named according to the mapped chains and with
1283 views attached to both sequences (e.g. one of the items of
1284 :attr:`QSscorer.alignments`).
1286 The first sequence is assumed to be the reference and the second one the
1287 model. Since the lDDT score is not symmetric (extra residues in model are
1288 ignored), the order is important.
1290 :type: :class:`~ost.seq.AlignmentHandle`
1292 .. attribute:: calpha_only
1294 If True, restricts lDDT score to CA only.
1296 :type: :class:`bool`
1298 .. attribute:: settings
1300 Settings to use for lDDT scoring.
1302 :type: :class:`~ost.mol.alg.lDDTSettings`
1304 .. attribute:: lddt_scorer
1306 lDDT Scorer object for the given chains.
1308 :type: :class:`~ost.mol.alg.lDDTScorer`
1310 .. attribute:: reference_chain_name
1312 Chain name of the reference.
1316 .. attribute:: model_chain_name
1318 Chain name of the model.
1337 :return: Scores for each residue
1338 :rtype: :class:`list` of :class:`dict` with one item for each residue
1339 existing in model and reference:
1341 - "residue_number": Residue number in reference chain
1342 - "residue_name": Residue name in reference chain
1343 - "lddt": local lDDT
1344 - "conserved_contacts": number of conserved contacts
1345 - "total_contacts": total number of contacts
1348 assigned_residues = list()
1350 self.lddt_scorer.global_score
1352 if col[0] !=
"-" and col.GetResidue(3).IsValid():
1353 ref_res = col.GetResidue(0)
1354 mdl_res = col.GetResidue(1)
1355 ref_res_renum = col.GetResidue(2)
1356 mdl_res_renum = col.GetResidue(3)
1357 if ref_res.one_letter_code != ref_res_renum.one_letter_code:
1358 raise RuntimeError(
"Reference residue name mapping inconsistent: %s != %s" %
1359 (ref_res.one_letter_code,
1360 ref_res_renum.one_letter_code))
1361 if mdl_res.one_letter_code != mdl_res_renum.one_letter_code:
1362 raise RuntimeError(
"Model residue name mapping inconsistent: %s != %s" %
1363 (mdl_res.one_letter_code,
1364 mdl_res_renum.one_letter_code))
1366 raise RuntimeError(
"Reference residue number mapping inconsistent: %s != %s" %
1367 (ref_res.GetNumber().num,
1370 raise RuntimeError(
"Model residue number mapping inconsistent: %s != %s" %
1371 (mdl_res.GetNumber().num,
1373 if ref_res.qualified_name
in assigned_residues:
1374 raise RuntimeError(
"Duplicated residue in reference: " %
1375 (ref_res.qualified_name))
1377 assigned_residues.append(ref_res.qualified_name)
1379 if mdl_res_renum.HasProp(self.settings.label):
1381 "residue_number": ref_res.GetNumber().num,
1382 "residue_name": ref_res.name,
1383 "lddt": mdl_res_renum.GetFloatProp(self.settings.label),
1384 "conserved_contacts": mdl_res_renum.GetFloatProp(self.settings.label +
"_conserved"),
1385 "total_contacts": mdl_res_renum.GetFloatProp(self.settings.label +
"_total")})
1392 def _InitScorer(self):
1394 aln = self.alignment.Copy()
1399 refseq = seq.CreateSequence(
1400 "reference_renumbered",
1401 aln.GetSequence(0).GetString())
1402 refseq.AttachView(reference)
1403 aln.AddSequence(refseq)
1407 modelseq = seq.CreateSequence(
1409 aln.GetSequence(1).GetString())
1410 modelseq.AttachView(model)
1411 aln.AddSequence(modelseq)
1415 references=[reference.Select(
'aname=CA')],
1416 model=model.Select(
'aname=CA'),
1420 references=[reference],
1432 def _AlignAtomSeqs(seq_1, seq_2):
1434 :type seq_1: :class:`ost.seq.SequenceHandle`
1435 :type seq_2: :class:`ost.seq.SequenceHandle`
1436 :return: Alignment of two sequences using a global alignment. Views attached
1437 to the input sequences will remain attached in the aln.
1438 :rtype: :class:`~ost.seq.AlignmentHandle` or None if it failed.
1443 alns = seq.alg.GlobalAlign(seq_1, seq_2, seq.alg.BLOSUM62)
1444 if alns: aln = alns[0]
1446 LogWarning(
'Failed to align %s to %s' % (seq_1.name, seq_2.name))
1447 LogWarning(
'%s: %s' % (seq_1.name, seq_1.string))
1448 LogWarning(
'%s: %s' % (seq_2.name, seq_2.string))
1451 def _FixSelectChainName(ch_name):
1453 :return: String to be used with Select(cname=<RETURN>). Takes care of putting
1454 quotation marks where needed.
1455 :rtype: :class:`str`
1456 :param ch_name: Single chain name (:class:`str`).
1458 if ch_name
in [
'-',
'_',
' ']:
1459 return '"%c"' % ch_name
1463 def _FixSelectChainNames(ch_names):
1465 :return: String to be used with Select(cname=<RETURN>). Takes care of joining
1466 and putting quotation marks where needed.
1467 :rtype: :class:`str`
1468 :param ch_names: Some iterable list of chain names (:class:`str` items).
1470 chain_set = set([_FixSelectChainName(ch_name)
for ch_name
in ch_names])
1471 return ','.join(chain_set)
1475 def _CleanInputEntity(ent):
1477 :param ent: The OST entity to be cleaned.
1478 :type ent: :class:`EntityHandle` or :class:`EntityView`
1479 :return: A tuple of 3 items: :attr:`QSscoreEntity.ent`,
1480 :attr:`QSscoreEntity.removed_chains`,
1481 :attr:`QSscoreEntity.calpha_only`
1485 for ch
in ent.chains:
1488 if ch.name
in [
'-',
'_'] \
1489 or ch.residue_count < 20 \
1490 or not any(r.chem_type.IsAminoAcid()
for r
in ch.residues) \
1491 or not (set(r.one_letter_code
for r
in ch.residues) - {
'?',
'X'}):
1492 removed_chains.append(ch.name)
1496 view = ent.Select(
'cname!=%s' % _FixSelectChainNames(removed_chains))
1497 ent_new = mol.CreateEntityFromView(view,
True)
1498 ent_new.SetName(ent.GetName())
1504 if ent_new.atom_count > 0
and ent_new.Select(
'aname=CB').atom_count == 0:
1505 LogInfo(
'Structure %s is a CA only structure!' % ent_new.GetName())
1510 LogInfo(
'Chains removed from %s: %s' \
1511 % (ent_new.GetName(),
''.join(removed_chains)))
1512 LogInfo(
'Chains in %s: %s' \
1513 % (ent_new.GetName(),
''.join([c.name
for c
in ent_new.chains])))
1514 return ent_new, removed_chains, calpha_only
1516 def _GetCAOnlyEntity(ent):
1518 :param ent: Entity to process
1519 :type ent: :class:`EntityHandle` or :class:`EntityView`
1520 :return: New entity with only CA and only one atom per residue
1521 (see :attr:`QSscoreEntity.ca_entity`)
1524 ca_view = ent.CreateEmptyView()
1526 for res
in ent.residues:
1527 ca_atom = res.FindAtom(
"CA")
1528 if ca_atom.IsValid():
1529 ca_view.AddAtom(ca_atom)
1531 return mol.CreateEntityFromView(ca_view,
False)
1533 def _GetChemGroups(qs_ent, seqid_thr=95.):
1535 :return: Intra-complex group of chemically identical polypeptide chains
1536 (see :attr:`QSscoreEntity.chem_groups`)
1538 :param qs_ent: Entity to process
1539 :type qs_ent: :class:`QSscoreEntity`
1540 :param seqid_thr: Threshold used to decide when two chains are identical.
1541 95 percent tolerates the few mutations crystallographers
1543 :type seqid_thr: :class:`float`
1546 ca_chains = qs_ent.ca_chains
1547 chain_names = sorted(ca_chains.keys())
1552 for ch_1, ch_2
in itertools.combinations(chain_names, 2):
1553 aln = qs_ent.GetAlignment(ch_1, ch_2)
1554 if aln
and seq.alg.SequenceIdentity(aln) > seqid_thr:
1555 id_seqs.append((ch_1, ch_2))
1558 return [[name]
for name
in chain_names]
1562 for ch_1, ch_2
in id_seqs:
1565 if ch_1
in g
or ch_2
in g:
1570 groups.append(set([ch_1, ch_2]))
1574 ranked_g = sorted([(-ca_chains[ch].length, ch)
for ch
in g])
1575 chem_groups.append([ch
for _,ch
in ranked_g])
1577 for ch
in chain_names:
1578 if not any(ch
in g
for g
in chem_groups):
1579 chem_groups.append([ch])
1584 """Computes the Euler angles given a transformation matrix.
1586 :param Rt: Rt operator.
1587 :type Rt: :class:`ost.geom.Mat4`
1588 :return: A :class:`tuple` of angles for each axis (x,y,z)
1590 rot = np.asmatrix(Rt.ExtractRotation().data).reshape(3,3)
1591 tx = np.arctan2(rot[2,1], rot[2,2])
1594 ty = np.arctan2(rot[2,0], np.sqrt(rot[2,1]**2 + rot[2,2]**2))
1597 tz = np.arctan2(rot[1,0], rot[0,0])
1604 def _GetChemGroupsMapping(qs_ent_1, qs_ent_2):
1606 :return: Inter-complex mapping of chemical groups
1607 (see :attr:`QSscorer.chem_mapping`)
1609 :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
1610 :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
1613 chem_groups_1 = qs_ent_1.chem_groups
1614 chem_groups_2 = qs_ent_2.chem_groups
1615 repr_chains_1 = {x[0]: tuple(x)
for x
in chem_groups_1}
1616 repr_chains_2 = {x[0]: tuple(x)
for x
in chem_groups_2}
1621 if len(repr_chains_2) < len(repr_chains_1):
1622 repr_chains_1, repr_chains_2 = repr_chains_2, repr_chains_1
1623 qs_ent_1, qs_ent_2 = qs_ent_2, qs_ent_1
1633 ca_chains_1 = qs_ent_1.ca_chains
1634 ca_chains_2 = qs_ent_2.ca_chains
1635 for ch_1
in repr_chains_1.keys():
1636 for ch_2
in repr_chains_2.keys():
1637 aln = _AlignAtomSeqs(ca_chains_1[ch_1], ca_chains_2[ch_2])
1639 chains_seqid = seq.alg.SequenceIdentity(aln)
1640 LogVerbose(
'Sequence identity', ch_1, ch_2,
'seqid=%.2f' % chains_seqid)
1641 chain_pairs.append((chains_seqid, ch_1, ch_2))
1644 chain_pairs = sorted(chain_pairs, reverse=
True)
1646 for _, c1, c2
in chain_pairs:
1648 for a,b
in chem_mapping.iteritems():
1649 if repr_chains_1[c1] == a
or repr_chains_2[c2] == b:
1653 chem_mapping[repr_chains_1[c1]] = repr_chains_2[c2]
1655 chem_mapping = {y: x
for x, y
in chem_mapping.iteritems()}
1656 qs_ent_1, qs_ent_2 = qs_ent_2, qs_ent_1
1659 mapped_1 = set([i
for s
in chem_mapping.keys()
for i
in s])
1660 chains_1 = set([c.name
for c
in qs_ent_1.ent.chains])
1661 if chains_1 - mapped_1:
1662 LogWarning(
'Unmapped Chains in %s: %s'
1663 % (qs_ent_1.GetName(),
','.join(list(chains_1 - mapped_1))))
1665 mapped_2 = set([i
for s
in chem_mapping.values()
for i
in s])
1666 chains_2 = set([c.name
for c
in qs_ent_2.ent.chains])
1667 if chains_2 - mapped_2:
1668 LogWarning(
'Unmapped Chains in %s: %s'
1669 % (qs_ent_2.GetName(),
','.join(list(chains_2 - mapped_2))))
1672 LogInfo(
'Chemical chain-groups mapping: ' + str(chem_mapping))
1673 if len(mapped_1) < 1
or len(mapped_2) < 1:
1674 raise QSscoreError(
'Less than 1 chains left in chem_mapping.')
1677 def _SelectFew(l, max_elements):
1678 """Return l or copy of l with at most *max_elements* entries."""
1680 if n_el <= max_elements:
1684 d_el = ((n_el - 1) // max_elements) + 1
1686 for i
in range(0, n_el, d_el):
1690 def _GetAlignedResidues(qs_ent_1, qs_ent_2, chem_mapping, max_ca_per_chain,
1693 :return: Tuple of two :class:`~ost.mol.EntityView` objects containing subsets
1694 of *qs_ent_1* and *qs_ent_2*. Two entities are later created from
1695 those views (see :attr:`QSscorer.ent_to_cm_1` and
1696 :attr:`QSscorer.ent_to_cm_2`)
1698 :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
1699 :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
1700 :param chem_mapping: See :attr:`QSscorer.chem_mapping`
1701 :param max_ca_per_chain: See :attr:`QSscorer.max_ca_per_chain_for_cm`
1704 def _FixName(seq_name, seq_names):
1706 seq_name = seq_name.replace(
' ',
'-')
1707 while seq_name
in seq_names:
1711 ent_view_1 = qs_ent_1.ca_entity.CreateEmptyView()
1712 ent_view_2 = qs_ent_2.ca_entity.CreateEmptyView()
1713 ca_chains_1 = qs_ent_1.ca_chains
1714 ca_chains_2 = qs_ent_2.ca_chains
1716 for group_1, group_2
in chem_mapping.iteritems():
1718 seq_list = seq.CreateSequenceList()
1720 seq_to_empty_view = dict()
1722 sequence = ca_chains_1[ch].Copy()
1723 sequence.name = _FixName(qs_ent_1.GetName() +
'.' + ch, seq_to_empty_view)
1724 seq_to_empty_view[sequence.name] = ent_view_1
1725 seq_list.AddSequence(sequence)
1727 sequence = ca_chains_2[ch].Copy()
1728 sequence.name = _FixName(qs_ent_2.GetName() +
'.' + ch, seq_to_empty_view)
1729 seq_to_empty_view[sequence.name] = ent_view_2
1730 seq_list.AddSequence(sequence)
1731 alnc =
ClustalW(seq_list, clustalw=clustalw_bin)
1734 residue_lists = list()
1735 sequence_count = alnc.sequence_count
1737 residues = [col.GetResidue(ir)
for ir
in range(sequence_count)]
1738 if all([r.IsValid()
for r
in residues]):
1739 residue_lists.append(residues)
1741 if len(residue_lists) < 5:
1743 elif max_ca_per_chain > 0:
1744 residue_lists = _SelectFew(residue_lists, max_ca_per_chain)
1746 res_views = [seq_to_empty_view[alnc.sequences[ir].name] \
1747 for ir
in range(sequence_count)]
1749 for res_list
in residue_lists:
1750 for res, view
in zip(res_list, res_views):
1751 view.AddResidue(res, mol.ViewAddFlag.INCLUDE_ATOMS)
1753 return ent_view_1, ent_view_2
1756 def _FindSymmetry(qs_ent_1, qs_ent_2, ent_to_cm_1, ent_to_cm_2, chem_mapping):
1758 :return: A pair of comparable symmetry groups (for :attr:`QSscorer.symm_1`
1759 and :attr:`QSscorer.symm_2`) between the two structures.
1760 Empty lists if no symmetry identified.
1762 :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
1763 :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
1764 :param ent_to_cm_1: See :attr:`QSscorer.ent_to_cm_1`
1765 :param ent_to_cm_2: See :attr:`QSscorer.ent_to_cm_2`
1766 :param chem_mapping: See :attr:`QSscorer.chem_mapping`
1768 LogInfo(
'Identifying Symmetry Groups...')
1771 symm_subg_1 = _GetSymmetrySubgroups(qs_ent_1, ent_to_cm_1,
1772 chem_mapping.keys())
1773 symm_subg_2 = _GetSymmetrySubgroups(qs_ent_2, ent_to_cm_2,
1774 chem_mapping.values())
1777 LogInfo(
'Selecting Symmetry Groups...')
1783 for symm_1, symm_2
in itertools.product(symm_subg_1, symm_subg_2):
1786 if len(s1) != len(s2):
1787 LogDebug(
'Discarded', str(symm_1), str(symm_2),
1788 ': different size of symmetry groups')
1790 count = _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping)
1791 nr_mapp = count[
'intra'][
'mappings'] + count[
'inter'][
'mappings']
1792 LogDebug(
'OK', str(symm_1), str(symm_2),
': %s mappings' % nr_mapp)
1793 best_symm.append((nr_mapp, symm_1, symm_2))
1795 for _, symm_1, symm_2
in sorted(best_symm):
1798 group_1 = ent_to_cm_1.Select(
'cname=%s' % _FixSelectChainNames(s1))
1799 group_2 = ent_to_cm_2.Select(
'cname=%s' % _FixSelectChainNames(s2))
1803 closed_symm = _CheckClosedSymmetry(group_1, group_2, symm_1, symm_2,
1804 chem_mapping, 6, 0.8, find_best=
False)
1807 return symm_1, symm_2
1810 LogInfo(
'No coherent symmetry identified between structures')
1814 def _GetChainMapping(ent_1, ent_2, symm_1, symm_2, chem_mapping):
1816 :return: Tuple with mapping from *ent_1* to *ent_2* (see
1817 :attr:`QSscorer.chain_mapping`) and scheme used (see
1818 :attr:`QSscorer.chain_mapping_scheme`)
1820 :param ent_1: See :attr:`QSscorer.ent_to_cm_1`
1821 :param ent_2: See :attr:`QSscorer.ent_to_cm_2`
1822 :param symm_1: See :attr:`QSscorer.symm_1`
1823 :param symm_2: See :attr:`QSscorer.symm_2`
1824 :param chem_mapping: See :attr:`QSscorer.chem_mapping`
1826 LogInfo(
'Symmetry-groups used in %s: %s' % (ent_1.GetName(), str(symm_1)))
1827 LogInfo(
'Symmetry-groups used in %s: %s' % (ent_2.GetName(), str(symm_2)))
1830 thresholds = [(
'strict', 4, 0.8),
1831 (
'tolerant', 6, 0.4),
1832 (
'permissive', 8, 0.2)]
1833 for scheme, sup_thr, sup_fract
in thresholds:
1837 mapping = _CheckClosedSymmetry(ent_1, ent_2, symm_1, symm_2,
1838 chem_mapping, sup_thr, sup_fract)
1840 LogInfo(
'Closed Symmetry with %s parameters' % scheme)
1841 if scheme ==
'permissive':
1842 LogWarning(
'Permissive thresholds used for overlap based mapping ' + \
1843 'detection: check mapping manually: %s' % mapping)
1844 return mapping, scheme
1858 count = _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping)
1859 LogInfo(
'Intra Symmetry-group mappings to check: %s' \
1860 % count[
'intra'][
'mappings'])
1861 LogInfo(
'Inter Symmetry-group mappings to check: %s' \
1862 % count[
'inter'][
'mappings'])
1863 nr_mapp = count[
'intra'][
'mappings'] + count[
'inter'][
'mappings']
1864 if nr_mapp > 100000:
1865 raise QSscoreError(
'Too many possible mappings: %s' % nr_mapp)
1874 ref_symm_1 = symm_1[0]
1875 ref_symm_2 = symm_2[0]
1876 asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
1878 for g1, g2
in asu_chem_mapping.iteritems():
1880 for c1, c2
in itertools.product(g1, g2):
1882 LogVerbose(
'Superposing chains: %s, %s' % (c1, c2))
1883 res = cached_rmsd.GetSuperposition(c1, c2)
1886 for mapping
in _ListPossibleMappings(c1, c2, asu_chem_mapping):
1888 chains_rmsd = cached_rmsd.GetMappedRMSD(mapping, res.transformation)
1889 cur_mappings.append((chains_rmsd, mapping))
1890 LogVerbose(str(mapping), str(chains_rmsd))
1891 best_rmsd, best_mapp = min(cur_mappings)
1892 intra_mappings.append((best_rmsd, c1, c2, best_mapp))
1894 _, intra_asu_c1, intra_asu_c2, intra_asu_mapp = min(intra_mappings)
1897 if len(symm_1) == 1
or len(symm_2) == 1:
1898 mapping = intra_asu_mapp
1903 index_asu_c1 = ref_symm_1.index(intra_asu_c1)
1904 index_asu_c2 = ref_symm_2.index(intra_asu_c2)
1905 index_mapp = {ref_symm_1.index(s1): ref_symm_2.index(s2) \
1906 for s1, s2
in intra_asu_mapp.iteritems()}
1907 LogInfo(
'Intra symmetry-group mapping: %s' % str(intra_asu_mapp))
1911 if len(symm_1) < len(symm_2):
1912 symm_combinations = list(itertools.product(symm_1, [symm_2[0]]))
1914 symm_combinations = list(itertools.product([symm_1[0]], symm_2))
1916 full_asu_mapp = {tuple(symm_1): tuple(symm_2)}
1918 for s1, s2
in symm_combinations:
1920 LogVerbose(
'Superposing symmetry-group: %s, %s in %s, %s' \
1921 % (s1[index_asu_c1], s2[index_asu_c2], s1, s2))
1922 res = cached_rmsd.GetSuperposition(s1[index_asu_c1], s2[index_asu_c2])
1924 for asu_mapp
in _ListPossibleMappings(s1, s2, full_asu_mapp):
1928 for a, b
in asu_mapp.iteritems():
1929 for id_a, id_b
in index_mapp.iteritems():
1930 mapping[a[id_a]] = b[id_b]
1931 chains_rmsd = cached_rmsd.GetMappedRMSD(mapping, res.transformation)
1932 full_mappings.append((chains_rmsd, mapping))
1933 LogVerbose(str(mapping), str(chains_rmsd))
1935 if check != nr_mapp:
1936 LogWarning(
'Mapping number estimation was wrong')
1939 mapping = min(full_mappings)[1]
1941 LogWarning(
'Extensive search used for mapping detection (fallback). This ' + \
1942 'approach has known limitations. Check mapping manually: %s' \
1944 return mapping,
'extensive'
1947 def _GetSymmetrySubgroups(qs_ent, ent, chem_groups):
1949 Identify the symmetry (either cyclic C or dihedral D) of the protein and find
1950 all possible symmetry subgroups. This is done testing all combination of chain
1951 superposition and clustering them by the angles (D) or the axis (C) of the Rt
1954 Groups of superposition which can fully reconstruct the structure are possible
1957 :param qs_ent: Entity with cached angles and axis.
1958 :type qs_ent: :class:`QSscoreEntity`
1959 :param ent: Entity from which to extract chains (perfect alignment assumed
1960 for superposition as in :attr:`QSscorer.ent_to_cm_1`).
1961 :type ent: :class:`EntityHandle` or :class:`EntityView`
1962 :param chem_groups: List of tuples/lists of chain names in *ent*. Each list
1963 contains all chains belonging to a chem. group (could be
1964 from :attr:`QSscoreEntity.chem_groups` or from "keys()"
1965 or "values()" of :attr:`QSscorer.chem_mapping`)
1967 :return: A list of possible symmetry subgroups (each in same format as
1968 :attr:`QSscorer.symm_1`). If no symmetry is found, we return a list
1969 with a single symmetry subgroup with a single group with all chains.
1974 for cnames
in chem_groups:
1975 for c1, c2
in itertools.combinations(cnames, 2):
1976 cur_angles = qs_ent.GetAngles(c1, c2)
1977 if not np.any(np.isnan(cur_angles)):
1978 angles[(c1,c2)] = cur_angles
1979 cur_axis = qs_ent.GetAxis(c1, c2)
1980 if not np.any(np.isnan(cur_axis)):
1981 axis[(c1,c2)] = cur_axis
1985 LogVerbose(
'Possible symmetry-groups for: %s' % ent.GetName())
1986 for angle_thr
in np.arange(0.1, 1, 0.1):
1987 d_symm_groups = _GetDihedralSubgroups(ent, chem_groups, angles, angle_thr)
1989 for group
in d_symm_groups:
1990 if group
not in symm_groups:
1991 symm_groups.append(group)
1992 LogVerbose(
'Dihedral: %s' % group)
1993 LogInfo(
'Symmetry threshold %.1f used for angles of %s' \
1994 % (angle_thr, ent.GetName()))
1998 for axis_thr
in np.arange(0.1, 1, 0.1):
1999 c_symm_groups = _GetCyclicSubgroups(ent, chem_groups, axis, axis_thr)
2001 for group
in c_symm_groups:
2002 if group
not in symm_groups:
2003 symm_groups.append(group)
2004 LogVerbose(
'Cyclic: %s' % group)
2005 LogInfo(
'Symmetry threshold %.1f used for axis of %s' \
2006 % (axis_thr, ent.GetName()))
2011 LogInfo(
'No symmetry-group detected in %s' % ent.GetName())
2012 symm_groups = [[tuple([c
for g
in chem_groups
for c
in g])]]
2016 def _GetDihedralSubgroups(ent, chem_groups, angles, angle_thr):
2018 :return: Dihedral subgroups for :func:`_GetSymmetrySubgroups`
2019 (same return type as there). Empty list if fail.
2021 :param ent: See :func:`_GetSymmetrySubgroups`
2022 :param chem_groups: See :func:`_GetSymmetrySubgroups`
2023 :param angles: :class:`dict` (key = chain-pair-tuple, value = Euler angles)
2024 :param angle_thr: Euler angles distance threshold for clustering (float).
2027 if len(angles) == 0:
2031 same_angles = _ClusterData(angles, angle_thr, _AngleArrayDistance)
2035 for clst
in same_angles.values():
2037 if _ValidChainGroup(group, ent):
2038 if len(chem_groups) > 1:
2040 symm_groups.append(zip(*group))
2043 symm_groups.append(group)
2044 symm_groups.append(zip(*group))
2047 def _GetCyclicSubgroups(ent, chem_groups, axis, axis_thr):
2049 :return: Cyclic subgroups for :func:`_GetSymmetrySubgroups`
2050 (same return type as there). Empty list if fail.
2052 :param ent: See :func:`_GetSymmetrySubgroups`
2053 :param chem_groups: See :func:`_GetSymmetrySubgroups`
2054 :param angles: :class:`dict` (key = chain-pair-tuple, value = rotation axis)
2055 :param angle_thr: Axis distance threshold for clustering (float).
2062 same_axis = _ClusterData(axis, axis_thr, _AxisDistance)
2066 for clst
in same_axis.values():
2067 all_chain = [item
for sublist
in clst.keys()
for item
in sublist]
2068 if len(set(all_chain)) == ent.chain_count:
2070 if len(chem_groups) > 1:
2071 ref_chem_group = chem_groups[0]
2073 cyclic_group_closest = []
2074 cyclic_group_iface = []
2075 for c1
in ref_chem_group:
2076 group_closest = [c1]
2078 for chains
in chem_groups[1:]:
2080 closest = _GetClosestChain(ent, c1, chains)
2082 closest_iface = _GetClosestChainInterface(ent, c1, chains)
2083 group_closest.append(closest)
2084 group_iface.append(closest_iface)
2085 cyclic_group_closest.append(tuple(group_closest))
2086 cyclic_group_iface.append(tuple(group_iface))
2087 if _ValidChainGroup(cyclic_group_closest, ent):
2088 symm_groups.append(cyclic_group_closest)
2089 if _ValidChainGroup(cyclic_group_iface, ent):
2090 symm_groups.append(cyclic_group_iface)
2093 symm_groups.append(chem_groups)
2096 def _ClusterData(data, thr, metric):
2097 """Wrapper for fclusterdata to get dict of clusters.
2099 :param data: :class:`dict` (keys for ID, values used for clustering)
2100 :return: :class:`dict` {cluster_idx: {data-key: data-value}}
2104 return {0: {data.keys()[0]: data.values()[0]} }
2106 cluster_indices = fclusterdata(np.asarray(data.values()), thr,
2107 method=
'complete', criterion=
'distance',
2111 for cluster_idx, data_key
in zip(cluster_indices, data.keys()):
2112 if not cluster_idx
in res:
2113 res[cluster_idx] = {}
2114 res[cluster_idx][data_key] = data[data_key]
2117 def _AngleArrayDistance(u, v):
2119 :return: Average angular distance of two arrays of angles.
2120 :param u: Euler angles.
2121 :param v: Euler angles.
2124 for a,b
in zip(u,v):
2127 delta = abs(2*np.pi - delta)
2129 return np.mean(dist)
2131 def _AxisDistance(u, v):
2133 :return: Euclidean distance between two rotation axes. Axes can point in
2134 either direction so we ensure to use the closer one.
2135 :param u: Rotation axis.
2136 :param v: Rotation axis.
2144 d1 = np.dot(dv1, dv1)
2145 d2 = np.dot(dv2, dv2)
2146 return np.sqrt(min(d1, d2))
2148 def _GetClosestChain(ent, ref_chain, chains):
2150 :return: Chain closest to *ref_chain* based on center of atoms distance.
2151 :rtype: :class:`str`
2152 :param ent: See :func:`_GetSymmetrySubgroups`
2153 :param ref_chain: We look for chains closest to this one
2154 :type ref_chain: :class:`str`
2155 :param chains: We only consider these chains
2156 :type chains: :class:`list` of :class:`str`
2159 ref_center = ent.FindChain(ref_chain).center_of_atoms
2161 center = ent.FindChain(ch).center_of_atoms
2163 closest_chain = min(contacts)[1]
2164 return closest_chain
2166 def _GetClosestChainInterface(ent, ref_chain, chains):
2168 :return: Chain with biggest interface (within 10 A) to *ref_chain*.
2169 :rtype: :class:`str`
2170 :param ent: See :func:`_GetSymmetrySubgroups`
2171 :param ref_chain: We look for chains closest to this one
2172 :type ref_chain: :class:`str`
2173 :param chains: We only consider these chains
2174 :type chains: :class:`list` of :class:`str`
2180 iface_view = ent.Select(
'cname="%s" and 10 <> [cname="%s"]' \
2182 nr_res = iface_view.residue_count
2183 closest.append((nr_res, ch))
2184 closest_chain = max(closest)[1]
2185 return closest_chain
2187 def _ValidChainGroup(group, ent):
2189 :return: True, if *group* has unique chain names and as many chains as *ent*
2190 :rtype: :class:`bool`
2191 :param group: Symmetry groups to check
2192 :type group: Same as :attr:`QSscorer.symm_1`
2193 :param ent: See :func:`_GetSymmetrySubgroups`
2195 all_chain = [item
for sublist
in group
for item
in sublist]
2196 if len(all_chain) == len(set(all_chain))
and\
2197 len(all_chain) == ent.chain_count:
2201 def _LimitChemMapping(chem_mapping, limit_1, limit_2):
2203 :return: Chem. mapping containing only chains in *limit_1* and *limit_2*
2204 :rtype: Same as :attr:`QSscorer.chem_mapping`
2205 :param chem_mapping: See :attr:`QSscorer.chem_mapping`
2206 :param limit_1: Limits chain names in chem_mapping.keys()
2207 :type limit_1: List/tuple of strings
2208 :param limit_2: Limits chain names in chem_mapping.values()
2209 :type limit_2: List/tuple of strings
2212 limit_1_set = set(limit_1)
2213 limit_2_set = set(limit_2)
2214 asu_chem_mapping = dict()
2215 for key, value
in chem_mapping.iteritems():
2216 new_key = tuple(set(key) & limit_1_set)
2218 asu_chem_mapping[new_key] = tuple(set(value) & limit_2_set)
2219 return asu_chem_mapping
2222 def _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping):
2224 :return: Dictionary of number of mappings and superpositions to be performed.
2225 Returned as *result[X][Y] = number* with X = "intra" or "inter" and
2226 Y = "mappings" or "superpositions". The idea is that for each
2227 pairwise superposition we check all possible mappings.
2228 We can check the combinations within (intra) a symmetry group and
2229 once established, we check the combinations between different (inter)
2231 :param symm_1: See :attr:`QSscorer.symm_1`
2232 :param symm_2: See :attr:`QSscorer.symm_2`
2233 :param chem_mapping: See :attr:`QSscorer.chem_mapping`
2239 c[
'intra'][
'mappings'] = 0
2240 c[
'intra'][
'superpositions'] = 0
2241 c[
'inter'][
'mappings'] = 0
2242 c[
'inter'][
'superpositions'] = 0
2244 ref_symm_1 = symm_1[0]
2245 ref_symm_2 = symm_2[0]
2246 asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
2247 for g1, g2
in asu_chem_mapping.iteritems():
2248 chain_superpositions = len(g1) * len(g2)
2249 c[
'intra'][
'superpositions'] += chain_superpositions
2250 map_per_sup = _PermutationOrCombinations(g1[0], g2[0], asu_chem_mapping)
2251 c[
'intra'][
'mappings'] += chain_superpositions * map_per_sup
2252 if len(symm_1) == 1
or len(symm_2) == 1:
2255 asu_superposition = min(len(symm_1), len(symm_2))
2256 c[
'inter'][
'superpositions'] = asu_superposition
2257 asu = {tuple(symm_1): tuple(symm_2)}
2258 map_per_sup = _PermutationOrCombinations(ref_symm_1, ref_symm_2, asu)
2259 c[
'inter'][
'mappings'] = asu_superposition * map_per_sup
2262 def _PermutationOrCombinations(sup1, sup2, chem_mapping):
2263 """Should match len(_ListPossibleMappings(sup1, sup2, chem_mapping))."""
2266 for a,b
in chem_mapping.iteritems():
2267 new_a = tuple([x
for x
in a
if x != sup1])
2268 new_b = tuple([x
for x
in b
if x != sup2])
2269 chem_map[new_a] = new_b
2271 for a, b
in chem_map.iteritems():
2272 if len(a) == len(b):
2273 mapp_nr *= factorial(len(a))
2274 elif len(a) < len(b):
2275 mapp_nr *= binom(len(b), len(a))
2276 elif len(a) > len(b):
2277 mapp_nr *= binom(len(a), len(b))
2280 def _ListPossibleMappings(sup1, sup2, chem_mapping):
2282 Return a flat list of all possible mappings given *chem_mapping* and keeping
2283 mapping of *sup1* and *sup2* fixed. For instance if elements are chain names
2284 this is all the mappings to check for a given superposition.
2286 Elements in first complex are defined by *chem_mapping.keys()* (list of list
2287 of elements) and elements in second complex by *chem_mapping.values()*. If
2288 complexes don't have same number of elements, we map only elements for the one
2289 with less. Also mapping is only between elements of mapped groups according to
2292 :rtype: :class:`list` of :class:`dict` (key = element in chem_mapping-key,
2293 value = element in chem_mapping-value)
2294 :param sup1: Element for which mapping is fixed.
2295 :type sup1: Like element in chem_mapping-key
2296 :param sup2: Element for which mapping is fixed.
2297 :type sup2: Like element in chem_mapping-value
2298 :param chem_mapping: Defines mapping between groups of elements (e.g. result
2299 of :func:`_LimitChemMapping`).
2300 :type chem_mapping: :class:`dict` with key / value = :class:`tuple`
2302 :raises: :class:`QSscoreError` if reference complex (first one or one with
2303 less elements) has more elements for any given mapped group.
2306 chain1 = [i
for s
in chem_mapping.keys()
for i
in s]
2307 chain2 = [i
for s
in chem_mapping.values()
for i
in s]
2309 if len(chain1) > len(chain2):
2313 for a, b
in chem_mapping.iteritems():
2314 new_a = tuple([x
for x
in a
if x != sup1])
2315 new_b = tuple([x
for x
in b
if x != sup2])
2317 chem_map[new_b] = new_a
2319 chem_map[new_a] = new_b
2324 for a, b
in chem_map.iteritems():
2325 if len(a) == len(b):
2326 chem_perm.append(list(itertools.permutations(b)))
2328 elif len(a) < len(b):
2329 chem_perm.append(list(itertools.combinations(b, len(a))))
2332 raise QSscoreError(
'Impossible to define reference group: %s' \
2336 flat_ref = [i
for s
in chem_ref
for i
in s]
2337 for perm
in itertools.product(*chem_perm):
2338 flat_perm = [i
for s
in perm
for i
in s]
2339 d = {c1: c2
for c1, c2
in zip(flat_ref, flat_perm)}
2341 d = {v: k
for k, v
in d.items()}
2342 d.update({sup1: sup2})
2347 def _CheckClosedSymmetry(ent_1, ent_2, symm_1, symm_2, chem_mapping,
2348 sup_thr=4, sup_fract=0.8, find_best=
True):
2350 Quick check if we can superpose two chains and get a mapping for all other
2351 chains using the same transformation. The mapping is defined by sufficient
2352 overlap of the transformed chain of *ent_1* onto another chain in *ent_2*.
2354 :param ent_1: Entity to map to *ent_2* (perfect alignment assumed between
2355 chains of same chem. group as in :attr:`QSscorer.ent_to_cm_1`).
2356 Views are ok but only to select full chains!
2357 :param ent_2: Entity to map to (perfect alignment assumed between
2358 chains of same chem. group as in :attr:`QSscorer.ent_to_cm_2`).
2359 Views are ok but only to select full chains!
2360 :param symm_1: Symmetry groups to use. We only superpose chains within
2361 reference symmetry group of *symm_1* and *symm_2*.
2362 See :attr:`QSscorer.symm_1`
2363 :param symm_2: See :attr:`QSscorer.symm_2`
2364 :param chem_mapping: See :attr:`QSscorer.chem_mapping`.
2365 All chains in *ent_1* / *ent_2* must be listed here!
2366 :param sup_thr: Distance around transformed chain in *ent_1* to check for
2368 :type sup_thr: :class:`float`
2369 :param sup_fract: Fraction of atoms in chain of *ent_2* that must be
2370 overlapped for overlap to be sufficient.
2371 :type sup_fract: :class:`float`
2372 :param find_best: If True, we look for best mapping according to
2373 :func:`_GetMappedRMSD`. Otherwise, we return first suitable
2375 :type find_best: :class:`bool`
2377 :return: Mapping from *ent_1* to *ent_2* or None if none found. Mapping, if
2378 found, is as in :attr:`QSscorer.chain_mapping`.
2379 :rtype: :class:`dict` (:class:`str` / :class:`str`)
2382 ref_symm_1 = symm_1[0]
2383 ref_symm_2 = symm_2[0]
2384 asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
2387 for g1, g2
in asu_chem_mapping.iteritems():
2391 for c1, c2
in itertools.product(g1, g2):
2393 chain_1 = ent_1.Select(
'cname="%s"' % c1)
2394 chain_2 = ent_2.Select(
'cname="%s"' % c2)
2395 res = mol.alg.SuperposeSVD(chain_1, chain_2, apply_transform=
False)
2397 mapping = _GetSuperpositionMapping(ent_1, ent_2, chem_mapping,
2398 res.transformation, sup_thr,
2407 rmsd = _GetMappedRMSD(ent_1, ent_2, mapping, res.transformation)
2408 rmsd_mappings.append((rmsd, mapping))
2411 return min(rmsd_mappings)[1]
2415 def _GetSuperpositionMapping(ent_1, ent_2, chem_mapping, transformation,
2416 sup_thr, sup_fract):
2418 :return: Dict with chain mapping from *ent_1* to *ent_2* or None if failed
2419 (see :func:`_CheckClosedSymmetry`).
2420 :param ent_1: See :func:`_CheckClosedSymmetry`
2421 :param ent_2: See :func:`_CheckClosedSymmetry`
2422 :param chem_mapping: See :func:`_CheckClosedSymmetry`
2423 :param transformation: Superposition transformation to be applied to *ent_1*.
2424 :param sup_thr: See :func:`_CheckClosedSymmetry`
2425 :param sup_fract: See :func:`_CheckClosedSymmetry`
2432 if ent_1.chain_count > ent_2.chain_count:
2434 ent_1, ent_2 = ent_2, ent_1
2436 chem_pairs = zip(chem_mapping.values(), chem_mapping.keys())
2439 chem_pairs = chem_mapping.iteritems()
2441 if ent_1.chain_count == 0:
2444 chem_partners = dict()
2445 for cg1, cg2
in chem_pairs:
2447 chem_partners[ch] = set(cg2)
2450 mapped_chains = set()
2451 for ch_1
in ent_1.chains:
2453 ch_atoms = {ch_2.name: set()
for ch_2
in ent_2.chains}
2454 for a_1
in ch_1.handle.atoms:
2456 a_within = ent_2.FindWithin(transformed_pos, sup_thr)
2457 for a_2
in a_within:
2458 ch_atoms[a_2.chain.name].add(a_2.hash_code)
2460 max_num, max_name = max((len(atoms), name)
2461 for name, atoms
in ch_atoms.iteritems())
2463 ch_2 = ent_2.FindChain(max_name)
2464 if max_num == 0
or max_num / float(ch_2.handle.atom_count) < sup_fract:
2467 ch_1_name = ch_1.name
2468 if ch_1_name
not in chem_partners:
2469 raise RuntimeError(
"Chem. mapping doesn't contain all chains!")
2470 if max_name
not in chem_partners[ch_1_name]:
2473 if max_name
in mapped_chains:
2476 mapping[ch_1_name] = max_name
2477 mapped_chains.add(max_name)
2480 mapping = {v: k
for k, v
in mapping.iteritems()}
2483 def _GetMappedRMSD(ent_1, ent_2, chain_mapping, transformation):
2485 :return: RMSD between complexes considering chain mapping.
2486 :param ent_1: Entity mapped to *ent_2* (perfect alignment assumed between
2487 mapped chains as in :attr:`QSscorer.ent_to_cm_1`).
2488 :param ent_2: Entity which was mapped to (perfect alignment assumed between
2489 mapped chains as in :attr:`QSscorer.ent_to_cm_2`).
2490 :param chain_mapping: See :attr:`QSscorer.chain_mapping`
2491 :param transformation: Superposition transformation to be applied to *ent_1*.
2496 for c1, c2
in chain_mapping.iteritems():
2498 chain_1 = ent_1.Select(
'cname="%s"' % c1)
2499 chain_2 = ent_2.Select(
'cname="%s"' % c2)
2500 atom_count = chain_1.atom_count
2501 if atom_count != chain_2.atom_count:
2502 raise RuntimeError(
'Chains in _GetMappedRMSD must be perfectly aligned!')
2504 rmsd = mol.alg.CalculateRMSD(chain_1, chain_2, transformation)
2507 atoms.append(float(atom_count))
2509 rmsd = np.sqrt( sum([a * r**2
for a, r
in zip(atoms, rmsds)]) / sum(atoms) )
2513 """Helper object for repetitive RMSD calculations.
2514 Meant to speed up :func:`_GetChainMapping` but could also be used to replace
2515 :func:`_GetMappedRMSD` in :func:`_CheckClosedSymmetry`.
2517 :param ent_1: See :attr:`QSscorer.ent_to_cm_1`
2518 :param ent_2: See :attr:`QSscorer.ent_to_cm_2`
2529 """Get cached view on chain *cname* for :attr:`ent_1`."""
2531 self.
_chain_views_1[cname] = self.ent_1.Select(
'cname="%s"' % cname)
2535 """Get cached view on chain *cname* for :attr:`ent_2`."""
2537 self.
_chain_views_2[cname] = self.ent_2.Select(
'cname="%s"' % cname)
2541 """Get superposition result (no change in entities!) for *c1* to *c2*.
2542 This invalidates cached RMSD results used in :func:`GetMappedRMSD`.
2544 :param c1: chain name for :attr:`ent_1`.
2545 :param c2: chain name for :attr:`ent_2`.
2552 if chain_1.atom_count != chain_2.atom_count:
2553 raise RuntimeError(
'Chains in GetSuperposition not perfectly aligned!')
2554 return mol.alg.SuperposeSVD(chain_1, chain_2, apply_transform=
False)
2558 :return: RMSD between complexes considering chain mapping.
2559 :param chain_mapping: See :attr:`QSscorer.chain_mapping`.
2560 :param transformation: Superposition transformation (e.g. res.transformation
2561 for res = :func:`GetSuperposition`).
2566 for c1, c2
in chain_mapping.iteritems():
2574 atom_count = chain_1.atom_count
2575 if atom_count != chain_2.atom_count:
2576 raise RuntimeError(
'Chains in GetMappedRMSD not perfectly aligned!')
2578 rmsd = mol.alg.CalculateRMSD(chain_1, chain_2, transformation)
2579 self.
_pair_rmsd[(c1, c2)] = (atom_count, rmsd)
2582 atoms.append(float(atom_count))
2584 rmsd = np.sqrt( sum([a * r**2
for a, r
in zip(atoms, rmsds)]) / sum(atoms) )
2588 def _CleanUserSymmetry(symm, ent):
2589 """Clean-up user provided symmetry.
2591 :param symm: Symmetry group as in :attr:`QSscorer.symm_1`
2592 :param ent: Entity from which to extract chain names
2593 :type ent: :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
2594 :return: New symmetry group limited to chains in sub-structure *ent*
2595 (see :attr:`QSscorer.symm_1`). Empty list if invalid symmetry.
2598 chain_names = set([ch.name
for ch
in ent.chains])
2600 for symm_group
in symm:
2601 new_group = tuple(ch
for ch
in symm_group
if ch
in chain_names)
2603 new_symm.append(new_group)
2605 if new_symm != symm:
2606 LogInfo(
"Cleaned user symmetry for %s: %s" % (ent.GetName(), new_symm))
2608 lengths = [len(symm_group)
for symm_group
in new_symm]
2609 if lengths[1:] != lengths[:-1]:
2610 LogWarning(
'User symmetry group of different sizes for %s. Ignoring it!' \
2614 s_chain_names = set([ch
for symm_group
in new_symm
for ch
in symm_group])
2615 if len(s_chain_names) != sum(lengths):
2616 LogWarning(
'User symmetry group for %s has duplicate chains. Ignoring it!' \
2619 if s_chain_names != chain_names:
2620 LogWarning(
'User symmetry group for %s is missing chains. Ignoring it!' \
2626 def _AreValidSymmetries(symm_1, symm_2):
2627 """Check symmetry pair for major problems.
2629 :return: False if any of the two is empty or if they're compatible in size.
2630 :rtype: :class:`bool`
2632 if not symm_1
or not symm_2:
2634 if len(symm_1) != 1
or len(symm_2) != 1:
2635 if not len(symm_1[0]) == len(symm_2[0]):
2636 LogWarning(
'Symmetry groups of different sizes. Ignoring them!')
2640 def _GetMappedAlignments(ent_1, ent_2, chain_mapping, res_num_alignment):
2642 :return: Alignments of 2 structures given chain mapping
2643 (see :attr:`QSscorer.alignments`).
2644 :param ent_1: Entity containing all chains in *chain_mapping.keys()*.
2645 Views to this entity attached to first sequence of each aln.
2646 :param ent_2: Entity containing all chains in *chain_mapping.values()*.
2647 Views to this entity attached to second sequence of each aln.
2648 :param chain_mapping: See :attr:`QSscorer.chain_mapping`
2649 :param res_num_alignment: See :attr:`QSscorer.res_num_alignment`
2652 for ch_1_name
in sorted(chain_mapping):
2654 ch_1 = ent_1.FindChain(ch_1_name)
2655 ch_2 = ent_2.FindChain(chain_mapping[ch_1_name])
2656 if res_num_alignment:
2657 max_res_num = max([r.number.GetNum()
for r
in ch_1.residues] +
2658 [r.number.GetNum()
for r
in ch_2.residues])
2659 ch1_aln = [
"-"] * max_res_num
2660 ch2_aln = [
"-"] * max_res_num
2661 for res
in ch_1.residues:
2662 ch1_aln[res.number.GetNum() - 1] = res.GetOneLetterCode()
2663 ch1_aln =
"".join(ch1_aln)
2664 seq_1 = seq.CreateSequence(ch_1.name, str(ch1_aln))
2665 seq_1.AttachView(ch_1.Select(
""))
2666 for res
in ch_2.residues:
2667 ch2_aln[res.number.GetNum() - 1] = res.GetOneLetterCode()
2668 ch2_aln =
"".join(ch2_aln)
2669 seq_2 = seq.CreateSequence(ch_2.name, str(ch2_aln))
2670 seq_2.AttachView(ch_2.Select(
""))
2672 aln = seq.CreateAlignment()
2673 aln.AddSequence(seq_1)
2674 aln.AddSequence(seq_2)
2676 seq_1 = seq.SequenceFromChain(ch_1.name, ch_1)
2677 seq_2 = seq.SequenceFromChain(ch_2.name, ch_2)
2679 aln = _AlignAtomSeqs(seq_1, seq_2)
2684 def _GetMappedResidues(alns):
2686 :return: Mapping of shared residues in *alns* (with views attached)
2687 (see :attr:`QSscorer.mapped_residues`).
2688 :param alns: See :attr:`QSscorer.alignments`
2690 mapped_residues = dict()
2693 c1 = aln.GetSequence(0).name
2694 mapped_residues[c1] = dict()
2696 v1, v2 = seq.ViewsFromAlignment(aln)
2697 for res_1, res_2
in zip(v1.residues, v2.residues):
2698 r1 = res_1.number.num
2699 r2 = res_2.number.num
2700 mapped_residues[c1][r1] = r2
2702 return mapped_residues
2704 def _GetExtraWeights(contacts, done, mapped_residues):
2705 """Return sum of extra weights for contacts of chains in set and not in done.
2706 :return: Tuple (weight_extra_mapped, weight_extra_all).
2707 weight_extra_mapped only sums if both cX,rX in mapped_residues
2708 weight_extra_all sums all
2709 :param contacts: See :func:`GetContacts` for first entity
2710 :param done: List of (c1, c2, r1, r2) tuples to ignore
2711 :param mapped_residues: See :func:`_GetMappedResidues`
2713 weight_extra_mapped = 0
2714 weight_extra_non_mapped = 0
2716 for c2
in contacts[c1]:
2717 for r1
in contacts[c1][c2]:
2718 for r2
in contacts[c1][c2][r1]:
2719 if (c1, c2, r1, r2)
not in done:
2720 weight = _weight(contacts[c1][c2][r1][r2])
2721 if c1
in mapped_residues
and r1
in mapped_residues[c1] \
2722 and c2
in mapped_residues
and r2
in mapped_residues[c2]:
2723 weight_extra_mapped += weight
2725 weight_extra_non_mapped += weight
2726 return weight_extra_mapped, weight_extra_mapped + weight_extra_non_mapped
2728 def _GetScores(contacts_1, contacts_2, mapped_residues, chain_mapping):
2729 """Get QS scores (see :class:`QSscorer`).
2731 Note that if some chains are not to be considered at all, they must be removed
2732 from *contacts_1* / *contacts_2* prior to calling this.
2734 :param contacts_1: See :func:`GetContacts` for first entity
2735 :param contacts_2: See :func:`GetContacts` for second entity
2736 :param mapped_residues: See :func:`_GetMappedResidues`
2737 :param chain_mapping: Maps any chain name in *mapped_residues* to chain name
2739 :type chain_mapping: :class:`dict` (:class:`str` / :class:`str`)
2740 :return: Tuple (QS_best, QS_global) (see :attr:`QSscorer.best_score` and
2741 see :attr:`QSscorer.global_score`)
2750 for c11
in contacts_1:
2752 if c11
not in mapped_residues:
continue
2753 c1T = chain_mapping[c11]
2755 for c21
in contacts_1[c11]:
2757 if c21
not in mapped_residues:
continue
2758 c2T = chain_mapping[c21]
2760 flipped_chains = (c1T > c2T)
2766 if c12
not in contacts_2
or c22
not in contacts_2[c12]:
continue
2768 for r11
in contacts_1[c11][c21]:
2770 if r11
not in mapped_residues[c11]:
continue
2771 r1T = mapped_residues[c11][r11]
2773 for r21
in contacts_1[c11][c21][r11]:
2775 if r21
not in mapped_residues[c21]:
continue
2776 r2T = mapped_residues[c21][r21]
2783 if r12
not in contacts_2[c12][c22]:
continue
2784 if r22
not in contacts_2[c12][c22][r12]:
continue
2786 d1 = contacts_1[c11][c21][r11][r21]
2787 d2 = contacts_2[c12][c22][r12][r22]
2788 weight = _weight(min(d1, d2))
2789 weighted_scores += weight * (1 - abs(d1 - d2) / 12)
2790 weight_sum += weight
2792 done_1.add((c11, c21, r11, r21))
2793 done_2.add((c12, c22, r12, r22))
2795 LogVerbose(
"Shared contacts: %d" % len(done_1))
2798 weights_extra_1 = _GetExtraWeights(contacts_1, done_1, mapped_residues)
2799 mapped_residues_2 = dict()
2800 for c1
in mapped_residues:
2801 c2 = chain_mapping[c1]
2802 mapped_residues_2[c2] = set()
2803 for r1
in mapped_residues[c1]:
2804 mapped_residues_2[c2].add(mapped_residues[c1][r1])
2805 weights_extra_2 = _GetExtraWeights(contacts_2, done_2, mapped_residues_2)
2806 weight_extra_mapped = weights_extra_1[0] + weights_extra_2[0]
2807 weight_extra_all = weights_extra_1[1] + weights_extra_2[1]
2810 denom_best = weight_sum + weight_extra_mapped
2811 denom_all = weight_sum + weight_extra_all
2815 QS_best = weighted_scores / denom_best
2819 QS_global = weighted_scores / denom_all
2820 return QS_best, QS_global
2824 This weight expresses the probability of two residues to interact given the CB
2825 distance (from Xu et al. http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2573399/)
2829 x = (dist-5.0)/4.28;
2830 return np.exp(-2*x*x)
2833 def _GetQsSuperposition(alns):
2835 :return: Superposition result based on shared CA atoms in *alns*
2836 (with views attached) (see :attr:`QSscorer.superposition`).
2837 :param alns: See :attr:`QSscorer.alignments`
2841 raise QSscoreError(
'No successful alignments for superposition!')
2843 view_1 = alns[0].GetSequence(0).attached_view.CreateEmptyView()
2844 view_2 = alns[0].GetSequence(1).attached_view.CreateEmptyView()
2848 res_1 = col.GetResidue(0)
2849 res_2 = col.GetResidue(1)
2850 if res_1.IsValid()
and res_2.IsValid():
2851 ca_1 = res_1.FindAtom(
'CA')
2852 ca_2 = res_2.FindAtom(
'CA')
2853 if ca_1.IsValid()
and ca_2.IsValid():
2854 view_1.AddAtom(ca_1)
2855 view_2.AddAtom(ca_2)
2857 res = mol.alg.SuperposeSVD(view_1, view_2, apply_transform=
False)
2861 def _AddResidue(edi, res, rnum, chain, calpha_only):
2863 Add residue *res* with res. num. *run* to given *chain* using editor *edi*.
2864 Either all atoms added or (if *calpha_only*) only CA.
2867 ca_atom = res.FindAtom(
'CA')
2868 if ca_atom.IsValid():
2869 new_res = edi.AppendResidue(chain, res.name, rnum)
2870 edi.InsertAtom(new_res, ca_atom.name, ca_atom.pos)
2872 new_res = edi.AppendResidue(chain, res.name, rnum)
2873 for atom
in res.atoms:
2874 edi.InsertAtom(new_res, atom.name, atom.pos)
2876 def _MergeAlignedChains(alns, ent_1, ent_2, calpha_only, penalize_extra_chains):
2878 Create two new entities (based on the alignments attached views) where all
2879 residues have same numbering (when they're aligned) and they are all pushed to
2880 a single chain X. Also append extra chains contained in *ent_1* and *ent_2*
2881 but not contained in *alns*.
2883 Used for :attr:`QSscorer.lddt_ref` and :attr:`QSscorer.lddt_mdl`
2885 :param alns: List of alignments with attached views (first sequence: *ent_1*,
2886 second: *ent_2*). Residue number in single chain is column index
2887 of current alignment + sum of lengths of all previous alignments
2888 (order of alignments as in input list).
2889 :type alns: See :attr:`QSscorer.alignments`
2890 :param ent_1: First entity to process.
2891 :type ent_1: :class:`~ost.mol.EntityHandle`
2892 :param ent_2: Second entity to process.
2893 :type ent_2: :class:`~ost.mol.EntityHandle`
2894 :param calpha_only: If True, we only include CA atoms instead of all.
2895 :type calpha_only: :class:`bool`
2896 :param penalize_extra_chains: If True, extra chains are added to model and
2897 reference. Otherwise, only mapped ones.
2898 :type penalize_extra_chains: :class:`bool`
2900 :return: Tuple of two single chain entities (from *ent_1* and from *ent_2*)
2901 :rtype: :class:`tuple` of :class:`~ost.mol.EntityHandle`
2904 ent_ren_1 = mol.CreateEntity()
2905 ed_1 = ent_ren_1.EditXCS()
2906 new_chain_1 = ed_1.InsertChain(
'X')
2908 ent_ren_2 = mol.CreateEntity()
2909 ed_2 = ent_ren_2.EditXCS()
2910 new_chain_2 = ed_2.InsertChain(
'X')
2913 chain_done_1 = set()
2914 chain_done_2 = set()
2916 chain_done_1.add(aln.GetSequence(0).name)
2917 chain_done_2.add(aln.GetSequence(1).name)
2921 res_1 = col.GetResidue(0)
2923 _AddResidue(ed_1, res_1, rnum, new_chain_1, calpha_only)
2924 res_2 = col.GetResidue(1)
2926 _AddResidue(ed_2, res_2, rnum, new_chain_2, calpha_only)
2928 if penalize_extra_chains:
2929 for chain
in ent_1.chains:
2930 if chain.name
in chain_done_1:
2932 for res
in chain.residues:
2934 _AddResidue(ed_1, res, rnum, new_chain_1, calpha_only)
2935 for chain
in ent_2.chains:
2936 if chain.name
in chain_done_2:
2938 for res
in chain.residues:
2940 _AddResidue(ed_2, res, rnum, new_chain_2, calpha_only)
2942 ent_ren_1.SetName(aln.GetSequence(0).GetAttachedView().GetName())
2943 ent_ren_2.SetName(aln.GetSequence(1).GetAttachedView().GetName())
2945 if not conop.GetDefaultLib():
2946 raise RuntimeError(
"QSscore computation requires a compound library!")
2948 pr.Process(ent_ren_1)
2950 pr.Process(ent_ren_2)
2952 return ent_ren_1, ent_ren_2
2956 __all__ = (
'QSscoreError',
'QSscorer',
'QSscoreEntity',
'FilterContacts',
2957 'GetContacts',
'OligoLDDTScorer',
'MappedLDDTScorer')
def _ComputeAlignedEntities
Class internal helpers (anything that doesnt easily work without this class)
def _GetSuperposeData
Class internal helpers (anything that doesnt easily work without this class)
Mat4 DLLEXPORT_OST_GEOM Invert(const Mat4 &m)
def _InitScorer
Class internal helpers (anything that doesnt easily work without this class)
Real Distance(const Vec3 &p1, const Vec3 &p2)
returns the distance between two points
Three dimensional vector class, using Real precision.
def _GetUnmappedMdlChains
def _PrepareOligoEntities
Class internal helpers.