OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
qsscoring.py
Go to the documentation of this file.
1 """
2 Scoring of quaternary structures (QS). The QS scoring is according to the paper
3 by `Bertoni et al. <https://dx.doi.org/10.1038/s41598-017-09654-8>`_.
4 
5 .. warning::
6 
7  The `qsscoring` module is deprecated. Consider using the newer implementation
8  in :mod:`~ost.mol.alg.qsscore` instead.
9 
10 .. note ::
11 
12  Requirements for use:
13 
14  - A default :class:`compound library <ost.conop.CompoundLib>` must be defined
15  and accessible via :func:`~ost.conop.GetDefaultLib`. This is set by default
16  when executing scripts with ``ost``. Otherwise, you must set this with
17  :func:`~ost.conop.SetDefaultLib`.
18  - ClustalW must be installed (unless you provide chain mappings)
19  - Python modules `numpy` and `scipy` must be installed and available
20  (e.g. use ``pip install scipy numpy``)
21 """
22 # Original authors: Gerardo Tauriello, Martino Bertoni
23 
24 from ost import mol, geom, conop, seq, settings, PushVerbosityLevel
25 from ost import LogError, LogWarning, LogScript, LogInfo, LogVerbose, LogDebug
26 from ost.bindings.clustalw import ClustalW
27 from ost.mol.alg import lDDTScorer
28 from ost.seq.alg.renumber import Renumber
29 import numpy as np
30 from scipy.special import factorial
31 from scipy.special import binom
32 from scipy.cluster.hierarchy import fclusterdata
33 import itertools
34 
35 ###############################################################################
36 # QS scoring
37 ###############################################################################
38 
39 class QSscoreError(Exception):
40  """Exception to be raised for "acceptable" exceptions in QS scoring.
41 
42  Those are cases we might want to capture for default behavior.
43  """
44  pass
45 
46 class QSscorer:
47  """Object to compute QS scores.
48 
49  Simple usage without any precomputed contacts, symmetries and mappings:
50 
51  .. code-block:: python
52 
53  import ost
54  from ost.mol.alg import qsscoring
55 
56  # load two biounits to compare
57  ent_full = ost.io.LoadPDB('3ia3', remote=True)
58  ent_1 = ent_full.Select('cname=A,D')
59  ent_2 = ent_full.Select('cname=B,C')
60  # get score
61  ost.PushVerbosityLevel(3)
62  try:
63  qs_scorer = qsscoring.QSscorer(ent_1, ent_2)
64  ost.LogScript('QSscore:', str(qs_scorer.global_score))
65  ost.LogScript('Chain mapping used:', str(qs_scorer.chain_mapping))
66  # commonly you want the QS global score as output
67  qs_score = qs_scorer.global_score
68  except qsscoring.QSscoreError as ex:
69  # default handling: report failure and set score to 0
70  ost.LogError('QSscore failed:', str(ex))
71  qs_score = 0
72 
73  For maximal performance when computing QS scores of the same entity with many
74  others, it is advisable to construct and reuse :class:`QSscoreEntity` objects.
75 
76  Any known / precomputed information can be filled into the appropriate
77  attribute here (no checks done!). Otherwise most quantities are computed on
78  first access and cached (lazy evaluation). Setters are provided to set values
79  with extra checks (e.g. :func:`SetSymmetries`).
80 
81  All necessary seq. alignments are done by global BLOSUM62-based alignment. A
82  multiple sequence alignment is performed with ClustalW unless
83  :attr:`chain_mapping` is provided manually. You will need to have an
84  executable ``clustalw`` or ``clustalw2`` in your ``PATH`` or you must set
85  :attr:`clustalw_bin` accordingly. Otherwise an exception
86  (:class:`ost.settings.FileNotFound`) is thrown.
87 
88  Formulas for QS scores:
89 
90  ::
91 
92  - QS_best = weighted_scores / (weight_sum + weight_extra_mapped)
93  - QS_global = weighted_scores / (weight_sum + weight_extra_all)
94  -> weighted_scores = sum(w(min(d1,d2)) * (1 - abs(d1-d2)/12)) for shared
95  -> weight_sum = sum(w(min(d1,d2))) for shared
96  -> weight_extra_mapped = sum(w(d)) for all mapped but non-shared
97  -> weight_extra_all = sum(w(d)) for all non-shared
98  -> w(d) = 1 if d <= 5, exp(-2 * ((d-5.0)/4.28)^2) else
99 
100  In the formulas above:
101 
102  * "d": CA/CB-CA/CB distance of an "inter-chain contact" ("d1", "d2" for
103  "shared" contacts).
104  * "mapped": we could map chains of two structures and align residues in
105  :attr:`alignments`.
106  * "shared": pairs of residues which are "mapped" and have
107  "inter-chain contact" in both structures.
108  * "inter-chain contact": CB-CB pairs (CA for GLY) with distance <= 12 A
109  (fallback to CA-CA if :attr:`calpha_only` is True).
110  * "w(d)": weighting function (prob. of 2 res. to interact given CB distance)
111  from `Xu et al. 2009 <https://dx.doi.org/10.1016%2Fj.jmb.2008.06.002>`_.
112 
113  :param ent_1: First structure to be scored.
114  :type ent_1: :class:`QSscoreEntity`, :class:`~ost.mol.EntityHandle` or
115  :class:`~ost.mol.EntityView`
116  :param ent_2: Second structure to be scored.
117  :type ent_2: :class:`QSscoreEntity`, :class:`~ost.mol.EntityHandle` or
118  :class:`~ost.mol.EntityView`
119  :param res_num_alignment: Sets :attr:`res_num_alignment`
120 
121  :raises: :class:`QSscoreError` if input structures are invalid or are monomers
122  or have issues that make it impossible for a QS score to be computed.
123 
124  .. attribute:: qs_ent_1
125 
126  :class:`QSscoreEntity` object for *ent_1* given at construction.
127  If entity names (:attr:`~QSscoreEntity.original_name`) are not unique, we
128  set it to 'pdb_1' using :func:`~QSscoreEntity.SetName`.
129 
130  .. attribute:: qs_ent_2
131 
132  :class:`QSscoreEntity` object for *ent_2* given at construction.
133  If entity names (:attr:`~QSscoreEntity.original_name`) are not unique, we
134  set it to 'pdb_2' using :func:`~QSscoreEntity.SetName`.
135 
136  .. attribute:: calpha_only
137 
138  True if any of the two structures is CA-only (after cleanup).
139 
140  :type: :class:`bool`
141 
142  .. attribute:: max_ca_per_chain_for_cm
143 
144  Maximal number of CA atoms to use in each chain to determine chain mappings.
145  Setting this to -1 disables the limit. Limiting it speeds up determination
146  of symmetries and chain mappings. By default it is set to 100.
147 
148  :type: :class:`int`
149 
150  .. attribute:: max_mappings_extensive
151 
152  Maximal number of chain mappings to test for 'extensive'
153  :attr:`chain_mapping_scheme`. The extensive chain mapping search must in the
154  worst case check O(N^2) * O(N!) possible mappings for complexes with N
155  chains. Two octamers without symmetry would require 322560 mappings to be
156  checked. To limit computations, a :class:`QSscoreError` is thrown if we try
157  more than the maximal number of chain mappings.
158  The value must be set before the first use of :attr:`chain_mapping`.
159  By default it is set to 100000.
160 
161  :type: :class:`int`
162 
163  .. attribute:: res_num_alignment
164 
165  Forces each alignment in :attr:`alignments` to be based on residue numbers
166  instead of using a global BLOSUM62-based alignment.
167 
168  :type: :class:`bool`
169  """
170  def __init__(self, ent_1, ent_2, res_num_alignment=False):
171  # generate QSscoreEntity objects?
172  if isinstance(ent_1, QSscoreEntity):
173  self.qs_ent_1 = ent_1
174  else:
175  self.qs_ent_1 = QSscoreEntity(ent_1)
176  if isinstance(ent_2, QSscoreEntity):
177  self.qs_ent_2 = ent_2
178  else:
179  self.qs_ent_2 = QSscoreEntity(ent_2)
180  # check validity of inputs
181  if not self.qs_ent_1.is_valid or not self.qs_ent_2.is_valid:
182  raise QSscoreError("Invalid input in QSscorer!")
183  # set names for structures
184  if self.qs_ent_1.original_name == self.qs_ent_2.original_name:
185  self.qs_ent_1.SetName('pdb_1')
186  self.qs_ent_2.SetName('pdb_2')
187  else:
188  self.qs_ent_1.SetName(self.qs_ent_1.original_name)
189  self.qs_ent_2.SetName(self.qs_ent_2.original_name)
190  # set other public attributes
191  self.res_num_alignment = res_num_alignment
192  self.calpha_only = self.qs_ent_1.calpha_only or self.qs_ent_2.calpha_only
195  # init cached stuff
196  self._chem_mapping = None
197  self._ent_to_cm_1 = None
198  self._ent_to_cm_2 = None
199  self._symm_1 = None
200  self._symm_2 = None
201  self._chain_mapping = None
202  self._chain_mapping_scheme = None
203  self._alignments = None
204  self._mapped_residues = None
205  self._global_score = None
206  self._best_score = None
207  self._superposition = None
208  self._clustalw_bin = None
209 
210  @property
211  def chem_mapping(self):
212  """Inter-complex mapping of chemical groups.
213 
214  Each group (see :attr:`QSscoreEntity.chem_groups`) is mapped according to
215  highest sequence identity. Alignment is between longest sequences in groups.
216 
217  Limitations:
218 
219  - If different numbers of groups, we map only the groups for the complex
220  with less groups (rest considered unmapped and shown as warning)
221  - The mapping is forced: the "best" mapping will be chosen independently of
222  how low the seq. identity may be
223 
224  :getter: Computed on first use (cached)
225  :type: :class:`dict` with key = :class:`tuple` of chain names in
226  :attr:`qs_ent_1` and value = :class:`tuple` of chain names in
227  :attr:`qs_ent_2`.
228 
229  :raises: :class:`QSscoreError` if we end up having no chains for either
230  entity in the mapping (can happen if chains do not have CA atoms).
231  """
232  if self._chem_mapping is None:
233  self._chem_mapping = _GetChemGroupsMapping(self.qs_ent_1, self.qs_ent_2)
234  return self._chem_mapping
235 
236  @chem_mapping.setter
237  def chem_mapping(self, chem_mapping):
238  self._chem_mapping = chem_mapping
239 
240  @property
241  def ent_to_cm_1(self):
242  """Subset of :attr:`qs_ent_1` used to compute chain mapping and symmetries.
243 
244  Properties:
245 
246  - Includes only residues aligned according to :attr:`chem_mapping`
247  - Includes only 1 CA atom per residue
248  - Has at least 5 and at most :attr:`max_ca_per_chain_for_cm` atoms per chain
249  - All chains of the same chemical group have the same number of atoms
250  (also in :attr:`ent_to_cm_2` according to :attr:`chem_mapping`)
251  - All chains appearing in :attr:`chem_mapping` appear in this entity
252  (so the two can be safely used together)
253 
254  This entity might be transformed (i.e. all positions rotated/translated by
255  same transformation matrix) if this can speed up computations. So do not
256  assume fixed global positions (but relative distances will remain fixed).
257 
258  :getter: Computed on first use (cached)
259  :type: :class:`~ost.mol.EntityHandle`
260 
261  :raises: :class:`QSscoreError` if any chain ends up having less than 5 res.
262  """
263  if self._ent_to_cm_1 is None:
265  return self._ent_to_cm_1
266 
267  @ent_to_cm_1.setter
268  def ent_to_cm_1(self, ent_to_cm_1):
269  self._ent_to_cm_1 = ent_to_cm_1
270 
271  @property
272  def ent_to_cm_2(self):
273  """Subset of :attr:`qs_ent_1` used to compute chain mapping and symmetries
274  (see :attr:`ent_to_cm_1` for details).
275  """
276  if self._ent_to_cm_2 is None:
278  return self._ent_to_cm_2
279 
280  @ent_to_cm_2.setter
281  def ent_to_cm_2(self, ent_to_cm_2):
282  self._ent_to_cm_2 = ent_to_cm_2
283 
284  @property
285  def symm_1(self):
286  """Symmetry groups for :attr:`qs_ent_1` used to speed up chain mapping.
287 
288  This is a list of chain-lists where each chain-list can be used reconstruct
289  the others via cyclic C or dihedral D symmetry. The first chain-list is used
290  as a representative symmetry group. For heteromers, the group-members must
291  contain all different seqres in oligomer.
292 
293  Example: symm. groups [(A,B,C), (D,E,F), (G,H,I)] means that there are
294  symmetry transformations to get (D,E,F) and (G,H,I) from (A,B,C).
295 
296  Properties:
297 
298  - All symmetry group tuples have the same length (num. of chains)
299  - All chains in :attr:`ent_to_cm_1` appear (w/o duplicates)
300  - For heteros: symmetry group tuples have all different chem. groups
301  - Trivial symmetry group = one tuple with all chains (used if inconsistent
302  data provided or if no symmetry is found)
303  - Either compatible to :attr:`symm_2` or trivial symmetry groups used.
304  Compatibility requires same lengths of symmetry group tuples and it must
305  be possible to get an overlap (80% of residues covered within 6 A of a
306  (chem. mapped) chain) of all chains in representative symmetry groups by
307  superposing one pair of chains.
308 
309  :getter: Computed on first use (cached)
310  :type: :class:`list` of :class:`tuple` of :class:`str` (chain names)
311  """
312  if self._symm_1 is None:
313  self._ComputeSymmetry()
314  return self._symm_1
315 
316  @property
317  def symm_2(self):
318  """Symmetry groups for :attr:`qs_ent_2` (see :attr:`symm_1` for details)."""
319  if self._symm_2 is None:
320  self._ComputeSymmetry()
321  return self._symm_2
322 
323  def SetSymmetries(self, symm_1, symm_2):
324  """Set user-provided symmetry groups.
325 
326  These groups are restricted to chain names appearing in :attr:`ent_to_cm_1`
327  and :attr:`ent_to_cm_2` respectively. They are only valid if they cover all
328  chains and both *symm_1* and *symm_2* have same lengths of symmetry group
329  tuples. Otherwise trivial symmetry group used (see :attr:`symm_1`).
330 
331  :param symm_1: Value to set for :attr:`symm_1`.
332  :param symm_2: Value to set for :attr:`symm_2`.
333  """
334  # restrict chain names
335  self._symm_1 = _CleanUserSymmetry(symm_1, self.ent_to_cm_1)
336  self._symm_2 = _CleanUserSymmetry(symm_2, self.ent_to_cm_2)
337  # check that we have reasonable symmetries set (fallback: all chains)
338  if not _AreValidSymmetries(self._symm_1, self._symm_2):
339  self._symm_1 = [tuple(ch.name for ch in self.ent_to_cm_1.chains)]
340  self._symm_2 = [tuple(ch.name for ch in self.ent_to_cm_2.chains)]
341 
342  @property
343  def chain_mapping(self):
344  """Mapping from :attr:`ent_to_cm_1` to :attr:`ent_to_cm_2`.
345 
346  Properties:
347 
348  - Mapping is between chains of same chem. group (see :attr:`chem_mapping`)
349  - Each chain can appear only once in mapping
350  - All chains of complex with less chains are mapped
351  - Symmetry (:attr:`symm_1`, :attr:`symm_2`) is taken into account
352 
353  Details on algorithms used to find mapping:
354 
355  - We try all pairs of chem. mapped chains within symmetry group and get
356  superpose-transformation for them
357  - First option: check for "sufficient overlap" of other chain-pairs
358 
359  - For each chain-pair defined above: apply superposition to full oligomer
360  and map chains based on structural overlap
361  - Structural overlap = X% of residues in second oligomer covered within Y
362  Angstrom of a (chem. mapped) chain in first oligomer. We successively
363  try (X,Y) = (80,4), (40,6) and (20,8) to be less and less strict in
364  mapping (warning shown for most permissive one).
365  - If multiple possible mappings are found, we choose the one which leads
366  to the lowest multi-chain-RMSD given the superposition
367 
368  - Fallback option: try all mappings to find minimal multi-chain-RMSD
369  (warning shown)
370 
371  - For each chain-pair defined above: apply superposition, try all (!)
372  possible chain mappings (within symmetry group) and keep mapping with
373  lowest multi-chain-RMSD
374  - Repeat procedure above to resolve symmetry. Within the symmetry group we
375  can use the chain mapping computed before and we just need to find which
376  symmetry group in first oligomer maps to which in the second one. We
377  again try all possible combinations...
378  - Limitations:
379 
380  - Trying all possible mappings is a combinatorial nightmare (factorial).
381  We throw an exception if too many combinations (e.g. octomer vs
382  octomer with no usable symmetry)
383  - The mapping is forced: the "best" mapping will be chosen independently
384  of how badly they fit in terms of multi-chain-RMSD
385  - As a result, such a forced mapping can lead to a large range of
386  resulting QS scores. An extreme example was observed between 1on3.1
387  and 3u9r.1, where :attr:`global_score` can range from 0.12 to 0.43
388  for mappings with very similar multi-chain-RMSD.
389 
390  :getter: Computed on first use (cached)
391  :type: :class:`dict` with key / value = :class:`str` (chain names, key
392  for :attr:`ent_to_cm_1`, value for :attr:`ent_to_cm_2`)
393  :raises: :class:`QSscoreError` if there are too many combinations to check
394  to find a chain mapping (see :attr:`max_mappings_extensive`).
395  """
396  if self._chain_mapping is None:
398  _GetChainMapping(self.ent_to_cm_1, self.ent_to_cm_2, self.symm_1,
399  self.symm_2, self.chem_mapping,
401  LogInfo('Mapping found: %s' % str(self._chain_mapping))
402  return self._chain_mapping
403 
404  @chain_mapping.setter
405  def chain_mapping(self, chain_mapping):
406  self._chain_mapping = chain_mapping
407 
408  @property
410  """Mapping scheme used to get :attr:`chain_mapping`.
411 
412  Possible values:
413 
414  - 'strict': 80% overlap needed within 4 Angstrom (overlap based mapping).
415  - 'tolerant': 40% overlap needed within 6 Angstrom (overlap based mapping).
416  - 'permissive': 20% overlap needed within 8 Angstrom (overlap based
417  mapping). It's best if you check mapping manually!
418  - 'extensive': Extensive search used for mapping detection (fallback). This
419  approach has known limitations and may be removed in future versions.
420  Mapping should be checked manually!
421  - 'user': :attr:`chain_mapping` was set by user before first use of this
422  attribute.
423 
424  :getter: Computed with :attr:`chain_mapping` on first use (cached)
425  :type: :class:`str`
426  :raises: :class:`QSscoreError` as in :attr:`chain_mapping`.
427  """
428  if self._chain_mapping_scheme is None:
429  # default: user provided
430  self._chain_mapping_scheme = 'user'
431  # get chain mapping and make sure internal variable is set
432  # -> will not compute and only update _chain_mapping if user provided
433  # -> will compute and overwrite _chain_mapping_scheme else
434  self._chain_mapping = self.chain_mapping
435  return self._chain_mapping_scheme
436 
437  @property
438  def alignments(self):
439  """List of successful sequence alignments using :attr:`chain_mapping`.
440 
441  There will be one alignment for each mapped chain and they are ordered by
442  their chain names in :attr:`qs_ent_1`.
443 
444  The first sequence of each alignment belongs to :attr:`qs_ent_1` and the
445  second one to :attr:`qs_ent_2`. The sequences are named according to the
446  mapped chain names and have views attached into :attr:`QSscoreEntity.ent`
447  of :attr:`qs_ent_1` and :attr:`qs_ent_2`.
448 
449  If :attr:`res_num_alignment` is False, each alignment is performed using a
450  global BLOSUM62-based alignment. Otherwise, the positions in the alignment
451  sequences are simply given by the residue number so that residues with
452  matching numbers are aligned.
453 
454  :getter: Computed on first use (cached)
455  :type: :class:`list` of :class:`~ost.seq.AlignmentHandle`
456  """
457  if self._alignments is None:
458  self._alignments = _GetMappedAlignments(self.qs_ent_1.ent,
459  self.qs_ent_2.ent,
460  self.chain_mapping,
461  self.res_num_alignment)
462  return self._alignments
463 
464  @alignments.setter
465  def alignments(self, alignments):
466  self._alignments = alignments
467 
468  @property
469  def mapped_residues(self):
470  """Mapping of shared residues in :attr:`alignments`.
471 
472  :getter: Computed on first use (cached)
473  :type: :class:`dict` *mapped_residues[c1][r1] = r2* with:
474  *c1* = Chain name in first entity (= first sequence in aln),
475  *r1* = Residue number in first entity,
476  *r2* = Residue number in second entity
477  """
478  if self._mapped_residues is None:
479  self._mapped_residues = _GetMappedResidues(self.alignments)
480  return self._mapped_residues
481 
482  @mapped_residues.setter
483  def mapped_residues(self, mapped_residues):
484  self._mapped_residues = mapped_residues
485 
486  @property
487  def global_score(self):
488  """QS-score with penalties.
489 
490  The range of the score is between 0 (i.e. no interface residues are shared
491  between biounits) and 1 (i.e. the interfaces are identical).
492 
493  The global QS-score is computed applying penalties when interface residues
494  or entire chains are missing (i.e. anything that is not mapped in
495  :attr:`mapped_residues` / :attr:`chain_mapping`) in one of the biounits.
496 
497  :getter: Computed on first use (cached)
498  :type: :class:`float`
499  :raises: :class:`QSscoreError` if only one chain is mapped
500  """
501  if self._global_score is None:
502  self._ComputeScores()
503  return self._global_score
504 
505  @property
506  def best_score(self):
507  """QS-score without penalties.
508 
509  Like :attr:`global_score`, but neglecting additional residues or chains in
510  one of the biounits (i.e. the score is calculated considering only mapped
511  chains and residues).
512 
513  :getter: Computed on first use (cached)
514  :type: :class:`float`
515  :raises: :class:`QSscoreError` if only one chain is mapped
516  """
517  if self._best_score is None:
518  self._ComputeScores()
519  return self._best_score
520 
521  @property
522  def superposition(self):
523  """Superposition result based on shared CA atoms in :attr:`alignments`.
524 
525  The superposition can be used to map :attr:`QSscoreEntity.ent` of
526  :attr:`qs_ent_1` onto the one of :attr:`qs_ent_2`. Use
527  :func:`ost.geom.Invert` if you need the opposite transformation.
528 
529  :getter: Computed on first use (cached)
530  :type: :class:`ost.mol.alg.SuperpositionResult`
531  """
532  if self._superposition is None:
533  self._superposition = _GetQsSuperposition(self.alignments)
534  # report it
535  sup_rmsd = self._superposition.rmsd
536  cmp_view = self._superposition.view1
537  LogInfo('CA RMSD for %s aligned residues on %s chains: %.2f' \
538  % (cmp_view.residue_count, cmp_view.chain_count, sup_rmsd))
539  return self._superposition
540 
541  @property
542  def clustalw_bin(self):
543  """
544  Full path to ``clustalw`` or ``clustalw2`` executable to use for multiple
545  sequence alignments (unless :attr:`chain_mapping` is provided manually).
546 
547  :getter: Located in path on first use (cached)
548  :type: :class:`str`
549  """
550  if self._clustalw_bin is None:
551  self._clustalw_bin = settings.Locate(('clustalw', 'clustalw2'))
552  return self._clustalw_bin
553 
554  @clustalw_bin.setter
555  def clustalw_bin(self, clustalw_bin):
556  self._clustalw_bin = clustalw_bin
557 
558  def GetOligoLDDTScorer(self, settings, penalize_extra_chains=True):
559  """
560  :return: :class:`OligoLDDTScorer` object, setup for this QS scoring problem.
561  The scorer is set up with :attr:`qs_ent_1` as the reference and
562  :attr:`qs_ent_2` as the model.
563  :param settings: Passed to :class:`OligoLDDTScorer` constructor.
564  :param penalize_extra_chains: Passed to :class:`OligoLDDTScorer` constructor.
565  """
566  if penalize_extra_chains:
567  return OligoLDDTScorer(self.qs_ent_1.ent, self.qs_ent_2.ent,
568  self.alignments, self.calpha_only, settings,
569  True, self.chem_mapping)
570  else:
571  return OligoLDDTScorer(self.qs_ent_1.ent, self.qs_ent_2.ent,
572  self.alignments, self.calpha_only, settings, False)
573 
574 
575  ##############################################################################
576  # Class internal helpers (anything that doesnt easily work without this class)
577  ##############################################################################
578 
579  def _ComputeAlignedEntities(self):
580  """Fills cached ent_to_cm_1 and ent_to_cm_2."""
581  # get aligned residues via MSA
582  ev1, ev2 = _GetAlignedResidues(self.qs_ent_1, self.qs_ent_2,
583  self.chem_mapping,
585  self.clustalw_bin)
586  # extract new entities
587  self._ent_to_cm_1 = mol.CreateEntityFromView(ev1, True)
588  self._ent_to_cm_2 = mol.CreateEntityFromView(ev2, True)
589  # name them
590  self._ent_to_cm_1.SetName(self.qs_ent_1.GetName())
591  self._ent_to_cm_2.SetName(self.qs_ent_2.GetName())
592 
593  def _ComputeSymmetry(self):
594  """Fills cached symm_1 and symm_2."""
595  # get them
596  self._symm_1, self._symm_2 = \
597  _FindSymmetry(self.qs_ent_1, self.qs_ent_2, self.ent_to_cm_1,
598  self.ent_to_cm_2, self.chem_mapping)
599  # check that we have reasonable symmetries set (fallback: all chains)
600  if not _AreValidSymmetries(self._symm_1, self._symm_2):
601  self._symm_1 = [tuple(ch.name for ch in self.ent_to_cm_1.chains)]
602  self._symm_2 = [tuple(ch.name for ch in self.ent_to_cm_2.chains)]
603 
604  def _ComputeScores(self):
605  """Fills cached global_score and best_score."""
606  if len(self.chain_mapping) < 2:
607  raise QSscoreError("QS-score is not defined for monomers")
608  # get contacts
609  if self.calpha_only:
610  contacts_1 = self.qs_ent_1.contacts_ca
611  contacts_2 = self.qs_ent_2.contacts_ca
612  else:
613  contacts_1 = self.qs_ent_1.contacts
614  contacts_2 = self.qs_ent_2.contacts
615  # get scores
616  scores = _GetScores(contacts_1, contacts_2, self.mapped_residues,
617  self.chain_mapping)
618  self._best_score = scores[0]
619  self._global_score = scores[1]
620  # report scores
621  LogInfo('QSscore %s, %s: best: %.2f, global: %.2f' \
622  % (self.qs_ent_1.GetName(), self.qs_ent_2.GetName(),
623  self._best_score, self._global_score))
624 
625 
626 ###############################################################################
627 # Entity with cached entries for QS scoring
628 ###############################################################################
629 
630 class QSscoreEntity(object):
631  """Entity with cached entries for QS scoring.
632 
633  Any known / precomputed information can be filled into the appropriate
634  attribute here as long as they are labelled as read/write. Otherwise the
635  quantities are computed on first access and cached (lazy evaluation). The
636  heaviest load is expected when computing :attr:`contacts` and
637  :attr:`contacts_ca`.
638 
639  :param ent: Entity to be used for QS scoring. A copy of it will be processed.
640  :type ent: :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
641 
642  .. attribute:: is_valid
643 
644  True, if successfully initialized. False, if input structure has no protein
645  chains with >= 20 residues.
646 
647  :type: :class:`bool`
648 
649  .. attribute:: original_name
650 
651  Name set for *ent* when object was created.
652 
653  :type: :class:`str`
654 
655  .. attribute:: ent
656 
657  Cleaned version of *ent* passed at construction. Hydrogens are removed, the
658  entity is processed with a :class:`~ost.conop.RuleBasedProcessor` and chains
659  listed in :attr:`removed_chains` have been removed. The name of this entity
660  might change during scoring (see :func:`GetName`). Otherwise, this will be
661  fixed.
662 
663  :type: :class:`~ost.mol.EntityHandle`
664 
665  .. attribute:: removed_chains
666 
667  Chains removed from *ent* passed at construction. These are ligand and water
668  chains as well as small (< 20 res.) peptides or chains with no amino acids
669  (determined by chem. type, which is set by rule based processor).
670 
671  :type: :class:`list` of :class:`str`
672 
673  .. attribute:: calpha_only
674 
675  Whether entity is CA-only (i.e. it has 0 CB atoms)
676 
677  :type: :class:`bool`
678  """
679  def __init__(self, ent):
680  # copy entity and process/clean it
681  self.original_name = ent.GetName()
682  ent = mol.CreateEntityFromView(ent.Select('ele!=H and aname!=HN'), True)
683  if not conop.GetDefaultLib():
684  raise RuntimeError("QSscore computation requires a compound library!")
685  pr = conop.RuleBasedProcessor(conop.GetDefaultLib())
686  pr.Process(ent)
687  self.ent, self.removed_chains, self.calpha_only = _CleanInputEntity(ent)
688  # check if it's suitable for QS scoring
689  if self.ent.chain_count == 0:
690  LogError('Bad input file: ' + ent.GetName() + '. No chains left after '
691  'removing water, ligands and small peptides.')
692  self.is_valid = False
693  elif self.ent.chain_count == 1:
694  LogWarning('Structure ' + ent.GetName() + ' is a monomer.')
695  self.is_valid = True
696  else:
697  self.is_valid = True
698  # init cached stuff
699  self._ca_entity = None
700  self._ca_chains = None
701  self._alignments = {}
702  self._chem_groups = None
703  self._angles = {}
704  self._axis = {}
705  self._contacts = None
706  self._contacts_ca = None
707 
708  def GetName(self):
709  """Wrapper to :func:`~ost.mol.EntityHandle.GetName` of :attr:`ent`.
710  This is used to uniquely identify the entity while scoring. The name may
711  therefore change while :attr:`original_name` remains fixed.
712  """
713  # for duck-typing and convenience
714  return self.ent.GetName()
715 
716  def SetName(self, new_name):
717  """Wrapper to :func:`~ost.mol.EntityHandle.SetName` of :attr:`ent`.
718  Use this to change unique identifier while scoring (see :func:`GetName`).
719  """
720  # for duck-typing and convenience
721  self.ent.SetName(new_name)
722 
723  @property
724  def ca_entity(self):
725  """
726  Reduced representation of :attr:`ent` with only CA atoms.
727  This guarantees that each included residue has exactly one atom.
728 
729  :getter: Computed on first use (cached)
730  :type: :class:`~ost.mol.EntityHandle`
731  """
732  if self._ca_entity is None:
733  self._ca_entity = _GetCAOnlyEntity(self.ent)
734  return self._ca_entity
735 
736  @property
737  def ca_chains(self):
738  """
739  Map of chain names in :attr:`ent` to sequences with attached view to CA-only
740  chains (into :attr:`ca_entity`). Useful for alignments and superpositions.
741 
742  :getter: Computed on first use (cached)
743  :type: :class:`dict` (key = :class:`str`,
744  value = :class:`~ost.seq.SequenceHandle`)
745  """
746  if self._ca_chains is None:
747  self._ca_chains = dict()
748  ca_entity = self.ca_entity
749  for ch in ca_entity.chains:
750  self._ca_chains[ch.name] = seq.SequenceFromChain(ch.name, ch)
751  return self._ca_chains
752 
753  def GetAlignment(self, c1, c2):
754  """Get sequence alignment of chain *c1* with chain *c2*.
755  Computed on first use based on :attr:`ca_chains` (cached).
756 
757  :param c1: Chain name for first chain to align.
758  :type c1: :class:`str`
759  :param c2: Chain name for second chain to align.
760  :type c2: :class:`str`
761  :rtype: :class:`~ost.seq.AlignmentHandle` or None if it failed.
762  """
763  if (c1,c2) not in self._alignments:
764  ca_chains = self.ca_chains
765  self._alignments[(c1,c2)] = _AlignAtomSeqs(ca_chains[c1], ca_chains[c2])
766  return self._alignments[(c1,c2)]
767 
768  @property
769  def chem_groups(self):
770  """
771  Intra-complex group of chemically identical (seq. id. > 95%) polypeptide
772  chains as extracted from :attr:`ca_chains`. First chain in group is the one
773  with the longest sequence.
774 
775  :getter: Computed on first use (cached)
776  :type: :class:`list` of :class:`list` of :class:`str` (chain names)
777  """
778  if self._chem_groups is None:
779  self._chem_groups = _GetChemGroups(self, 95)
780  LogInfo('Chemically equivalent chain-groups in %s: %s' \
781  % (self.GetName(), str(self._chem_groups)))
782  return self._chem_groups
783 
784  def GetAngles(self, c1, c2):
785  """Get Euler angles from superposition of chain *c1* with chain *c2*.
786  Computed on first use based on :attr:`ca_chains` (cached).
787 
788  :param c1: Chain name for first chain to superpose.
789  :type c1: :class:`str`
790  :param c2: Chain name for second chain to superpose.
791  :type c2: :class:`str`
792  :return: 3 Euler angles (may contain nan if something fails).
793  :rtype: :class:`numpy.array`
794  """
795  if (c1,c2) not in self._angles:
796  self._GetSuperposeData(c1, c2)
797  return self._angles[(c1,c2)]
798 
799  def GetAxis(self, c1, c2):
800  """Get axis of symmetry from superposition of chain *c1* with chain *c2*.
801  Computed on first use based on :attr:`ca_chains` (cached).
802 
803  :param c1: Chain name for first chain to superpose.
804  :type c1: :class:`str`
805  :param c2: Chain name for second chain to superpose.
806  :type c2: :class:`str`
807  :return: Rotational axis (may contain nan if something fails).
808  :rtype: :class:`numpy.array`
809  """
810  if (c1,c2) not in self._axis:
811  self._GetSuperposeData(c1, c2)
812  return self._axis[(c1,c2)]
813 
814  @property
815  def contacts(self):
816  """
817  Connectivity dictionary (**read/write**).
818  As given by :func:`GetContacts` with *calpha_only* = False on :attr:`ent`.
819 
820  :getter: Computed on first use (cached)
821  :setter: Uses :func:`FilterContacts` to ensure that we only keep contacts
822  for chains in the cleaned entity.
823  :type: See return type of :func:`GetContacts`
824  """
825  if self._contacts is None:
826  self._contacts = GetContacts(self.ent, False)
827  return self._contacts
828 
829  @contacts.setter
830  def contacts(self, new_contacts):
831  chain_names = set([ch.name for ch in self.ent.chains])
832  self._contacts = FilterContacts(new_contacts, chain_names)
833 
834  @property
835  def contacts_ca(self):
836  """
837  CA-only connectivity dictionary (**read/write**).
838  Like :attr:`contacts` but with *calpha_only* = True in :func:`GetContacts`.
839  """
840  if self._contacts_ca is None:
841  self._contacts_ca = GetContacts(self.ent, True)
842  return self._contacts_ca
843 
844  @contacts_ca.setter
845  def contacts_ca(self, new_contacts):
846  chain_names = set([ch.name for ch in self.ent.chains])
847  self._contacts_ca = FilterContacts(new_contacts, chain_names)
848 
849  ##############################################################################
850  # Class internal helpers (anything that doesnt easily work without this class)
851  ##############################################################################
852 
853  def _GetSuperposeData(self, c1, c2):
854  """Fill _angles and _axis from superposition of CA chains of c1 and c2."""
855  # get aligned views (must contain identical numbers of atoms!)
856  aln = self.GetAlignment(c1, c2)
857  if not aln:
858  # fallback for non-aligned stuff (nan)
859  self._angles[(c1,c2)] = np.empty(3) * np.nan
860  self._axis[(c1,c2)] = np.empty(3) * np.nan
861  return
862  v1, v2 = seq.ViewsFromAlignment(aln)
863  if v1.atom_count < 3:
864  # fallback for non-aligned stuff (nan)
865  self._angles[(c1,c2)] = np.empty(3) * np.nan
866  self._axis[(c1,c2)] = np.empty(3) * np.nan
867  return
868  # get transformation
869  sup_res = mol.alg.SuperposeSVD(v1, v2, apply_transform=False)
870  Rt = sup_res.transformation
871  # extract angles
872  a,b,c = _GetAngles(Rt)
873  self._angles[(c1,c2)] = np.asarray([a,b,c])
874  # extract axis of symmetry
875  R3 = geom.Rotation3(Rt.ExtractRotation())
876  self._axis[(c1,c2)] = np.asarray(R3.GetRotationAxis().data)
877 
878 ###############################################################################
879 # Contacts computations
880 ###############################################################################
881 
882 def FilterContacts(contacts, chain_names):
883  """Filter contacts to contain only contacts for chains in *chain_names*.
884 
885  :param contacts: Connectivity dictionary as produced by :func:`GetContacts`.
886  :type contacts: :class:`dict`
887  :param chain_names: Chain names to keep.
888  :type chain_names: :class:`list` or (better) :class:`set`
889  :return: New connectivity dictionary (format as in :func:`GetContacts`)
890  :rtype: :class:`dict`
891  """
892  # create new dict with subset
893  filtered_contacts = dict()
894  for c1 in contacts:
895  if c1 in chain_names:
896  new_contacts = dict()
897  for c2 in contacts[c1]:
898  if c2 in chain_names:
899  new_contacts[c2] = contacts[c1][c2]
900  # avoid adding empty dicts
901  if new_contacts:
902  filtered_contacts[c1] = new_contacts
903  return filtered_contacts
904 
905 def GetContacts(entity, calpha_only, dist_thr=12.0):
906  """Get inter-chain contacts of a macromolecular entity.
907 
908  Contacts are pairs of residues within a given distance belonging to different
909  chains. They are stored once per pair and include the CA/CB-CA/CB distance.
910 
911  :param entity: An entity to check connectivity for.
912  :type entity: :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
913  :param calpha_only: If True, we only consider CA-CA distances. Else, we use CB
914  unless the residue is a GLY.
915  :type calpha_only: :class:`bool`
916  :param dist_thr: Maximal CA/CB-CA/CB distance to be considered in contact.
917  :type dist_thr: :class:`float`
918  :return: A connectivity dictionary. A pair of residues with chain names
919  *ch_name1* & *ch_name2* (*ch_name1* < *ch_name2*), residue numbers
920  *res_num1* & *res_num2* and distance *dist* (<= *dist_thr*) are
921  stored as *result[ch_name1][ch_name2][res_num1][res_num2]* = *dist*.
922  :rtype: :class:`dict`
923  """
924  # get ent copy to search on
925  if calpha_only:
926  ev = entity.Select("aname=CA")
927  else:
928  ev = entity.Select("(rname=GLY and aname=CA) or aname=CB")
929  ent = mol.CreateEntityFromView(ev, True)
930  # search all vs all
931  contacts = dict()
932  for atom in ent.atoms:
933  ch_name1 = atom.chain.name
934  res_num1 = atom.residue.number.num
935  close_atoms = ent.FindWithin(atom.pos, dist_thr)
936  for close_atom in close_atoms:
937  ch_name2 = close_atom.chain.name
938  if ch_name2 > ch_name1:
939  res_num2 = close_atom.residue.number.num
940  dist = geom.Distance(atom.pos, close_atom.pos)
941  # add to contacts
942  if ch_name1 not in contacts:
943  contacts[ch_name1] = dict()
944  if ch_name2 not in contacts[ch_name1]:
945  contacts[ch_name1][ch_name2] = dict()
946  if res_num1 not in contacts[ch_name1][ch_name2]:
947  contacts[ch_name1][ch_name2][res_num1] = dict()
948  contacts[ch_name1][ch_name2][res_num1][res_num2] = round(dist, 3)
949  # DONE
950  return contacts
951 
952 ###############################################################################
953 # Oligo-lDDT scores
954 ###############################################################################
955 
956 class OligoLDDTScorer(object):
957  """Helper class to calculate oligomeric lDDT scores.
958 
959  This class can be used independently, but commonly it will be created by
960  calling :func:`QSscorer.GetOligoLDDTScorer`.
961 
962  .. note::
963 
964  By construction, lDDT scores are not symmetric and hence it matters which
965  structure is the reference (:attr:`ref`) and which one is the model
966  (:attr:`mdl`). Extra residues in the model are generally not considered.
967  Extra chains in both model and reference can be considered by setting the
968  :attr:`penalize_extra_chains` flag to True.
969 
970  :param ref: Sets :attr:`ref`
971  :param mdl: Sets :attr:`mdl`
972  :param alignments: Sets :attr:`alignments`
973  :param calpha_only: Sets :attr:`calpha_only`
974  :param settings: Sets :attr:`settings`
975  :param penalize_extra_chains: Sets :attr:`penalize_extra_chains`
976  :param chem_mapping: Sets :attr:`chem_mapping`. Must be given if
977  *penalize_extra_chains* is True.
978 
979  .. attribute:: ref
980  mdl
981 
982  Full reference/model entity to be scored. The entity must contain all chains
983  mapped in :attr:`alignments` and may also contain additional ones which are
984  considered if :attr:`penalize_extra_chains` is True.
985 
986  :type: :class:`~ost.mol.EntityHandle`
987 
988  .. attribute:: alignments
989 
990  One alignment for each mapped chain of :attr:`ref`/:attr:`mdl` as defined in
991  :attr:`QSscorer.alignments`. The first sequence of each alignment belongs to
992  :attr:`ref` and the second one to :attr:`mdl`. Sequences must have sequence
993  naming and attached views as defined in :attr:`QSscorer.alignments`.
994 
995  :type: :class:`list` of :class:`~ost.seq.AlignmentHandle`
996 
997  .. attribute:: calpha_only
998 
999  If True, restricts lDDT score to CA only.
1000 
1001  :type: :class:`bool`
1002 
1003  .. attribute:: settings
1004 
1005  Settings to use for lDDT scoring.
1006 
1007  :type: :class:`~ost.mol.alg.lDDTSettings`
1008 
1009  .. attribute:: penalize_extra_chains
1010 
1011  If True, extra chains in both :attr:`ref` and :attr:`mdl` will penalize the
1012  lDDT scores.
1013 
1014  :type: :class:`bool`
1015 
1016  .. attribute:: chem_mapping
1017 
1018  Inter-complex mapping of chemical groups as defined in
1019  :attr:`QSscorer.chem_mapping`. Used to find "chem-mapped" chains in
1020  :attr:`ref` for unmapped chains in :attr:`mdl` when penalizing scores.
1021  Each unmapped model chain can add extra reference-contacts according to the
1022  average total contacts of each single "chem-mapped" reference chain. If
1023  there is no "chem-mapped" reference chain, a warning is shown and the model
1024  chain is ignored.
1025 
1026 
1027  Only relevant if :attr:`penalize_extra_chains` is True.
1028 
1029  :type: :class:`dict` with key = :class:`tuple` of chain names in
1030  :attr:`ref` and value = :class:`tuple` of chain names in
1031  :attr:`mdl`.
1032  """
1033 
1034  # NOTE: one could also allow computation of both penalized and unpenalized
1035  # in same object -> must regenerate lddt_ref / lddt_mdl though
1036 
1037  def __init__(self, ref, mdl, alignments, calpha_only, settings,
1038  penalize_extra_chains=False, chem_mapping=None):
1039  # sanity checks
1040  if chem_mapping is None and penalize_extra_chains:
1041  raise RuntimeError("Must provide chem_mapping when requesting penalty "
1042  "for extra chains!")
1043  if not penalize_extra_chains:
1044  # warn for unmapped model chains
1045  unmapped_mdl_chains = self._GetUnmappedMdlChains(mdl, alignments)
1046  if unmapped_mdl_chains:
1047  LogWarning('MODEL contains chains unmapped to REFERENCE, '
1048  'lDDT is not considering MODEL chains %s' \
1049  % str(list(unmapped_mdl_chains)))
1050  # warn for unmapped reference chains
1051  ref_chains = set(ch.name for ch in ref.chains)
1052  mapped_ref_chains = set(aln.GetSequence(0).GetName() for aln in alignments)
1053  unmapped_ref_chains = (ref_chains - mapped_ref_chains)
1054  if unmapped_ref_chains:
1055  LogWarning('REFERENCE contains chains unmapped to MODEL, '
1056  'lDDT is not considering REFERENCE chains %s' \
1057  % str(list(unmapped_ref_chains)))
1058  # prepare fields
1059  self.ref = ref
1060  self.mdl = mdl
1061  self.alignments = alignments
1062  self.calpha_only = calpha_only
1063  self.settings = settings
1064  self.penalize_extra_chains = penalize_extra_chains
1065  self.chem_mapping = chem_mapping
1066  self._sc_lddt = None
1067  self._oligo_lddt = None
1068  self._weighted_lddt = None
1069  self._lddt_ref = None
1070  self._lddt_mdl = None
1071  self._oligo_lddt_scorer = None
1072  self._mapped_lddt_scorers = None
1073  self._ref_scorers = None
1074  self._model_penalty = None
1075 
1076  @property
1077  def oligo_lddt(self):
1078  """Oligomeric lDDT score.
1079 
1080  The score is computed as conserved contacts divided by the total contacts
1081  in the reference using the :attr:`oligo_lddt_scorer`, which uses the full
1082  complex as reference/model structure. If :attr:`penalize_extra_chains` is
1083  True, the reference/model complexes contain all chains (otherwise only the
1084  mapped ones) and additional contacts are added to the reference's total
1085  contacts for unmapped model chains according to the :attr:`chem_mapping`.
1086 
1087  The main difference with :attr:`weighted_lddt` is that the lDDT scorer
1088  "sees" the full complex here (incl. inter-chain contacts), while the
1089  weighted single chain score looks at each chain separately.
1090 
1091  :getter: Computed on first use (cached)
1092  :type: :class:`float`
1093  """
1094  if self._oligo_lddt is None:
1095  LogInfo('Reference %s has: %s chains' \
1096  % (self.ref.GetName(), self.ref.chain_count))
1097  LogInfo('Model %s has: %s chains' \
1098  % (self.mdl.GetName(), self.mdl.chain_count))
1099 
1100  # score with or w/o extra-chain penalty
1101  if self.penalize_extra_chains:
1102  denominator = self.oligo_lddt_scorer.total_contacts
1103  denominator += self._GetModelPenalty()
1104  if denominator > 0:
1105  oligo_lddt = self.oligo_lddt_scorer.conserved_contacts \
1106  / float(denominator)
1107  else:
1108  oligo_lddt = 0.0
1109  else:
1110  oligo_lddt = self.oligo_lddt_scorer.global_score
1111  self._oligo_lddt = oligo_lddt
1112  return self._oligo_lddt
1113 
1114  @property
1115  def weighted_lddt(self):
1116  """Weighted average of single chain lDDT scores.
1117 
1118  The score is computed as a weighted average of single chain lDDT scores
1119  (see :attr:`sc_lddt_scorers`) using the total contacts of each single
1120  reference chain as weights. If :attr:`penalize_extra_chains` is True,
1121  unmapped chains are added with a 0 score and total contacts taken from
1122  the actual reference chains or (for unmapped model chains) using the
1123  :attr:`chem_mapping`.
1124 
1125  See :attr:`oligo_lddt` for a comparison of the two scores.
1126 
1127  :getter: Computed on first use (cached)
1128  :type: :class:`float`
1129  """
1130  if self._weighted_lddt is None:
1131  scores = [s.global_score for s in self.sc_lddt_scorers]
1132  weights = [s.total_contacts for s in self.sc_lddt_scorers]
1133  nominator = sum([s * w for s, w in zip(scores, weights)])
1134  if self.penalize_extra_chains:
1135  ref_scorers = self._GetRefScorers()
1136  denominator = sum(s.total_contacts for s in list(ref_scorers.values()))
1137  denominator += self._GetModelPenalty()
1138  else:
1139  denominator = sum(weights)
1140  if denominator > 0:
1141  self._weighted_lddt = nominator / float(denominator)
1142  else:
1143  self._weighted_lddt = 0.0
1144  return self._weighted_lddt
1145 
1146  @property
1147  def lddt_ref(self):
1148  """The reference entity used for oligomeric lDDT scoring
1149  (:attr:`oligo_lddt` / :attr:`oligo_lddt_scorer`).
1150 
1151  Since the lDDT computation requires a single chain with mapped residue
1152  numbering, all chains of :attr:`ref` are appended into a single chain X with
1153  unique residue numbers according to the column-index in the alignment. The
1154  alignments are in the same order as they appear in :attr:`alignments`.
1155  Additional residues are appended at the end of the chain with unique residue
1156  numbers. Unmapped chains are only added if :attr:`penalize_extra_chains` is
1157  True. Only CA atoms are considered if :attr:`calpha_only` is True.
1158 
1159  :getter: Computed on first use (cached)
1160  :type: :class:`~ost.mol.EntityHandle`
1161  """
1162  if self._lddt_ref is None:
1163  self._PrepareOligoEntities()
1164  return self._lddt_ref
1165 
1166  @property
1167  def lddt_mdl(self):
1168  """The model entity used for oligomeric lDDT scoring
1169  (:attr:`oligo_lddt` / :attr:`oligo_lddt_scorer`).
1170 
1171  Like :attr:`lddt_ref`, this is a single chain X containing all chains of
1172  :attr:`mdl`. The residue numbers match the ones in :attr:`lddt_ref` where
1173  aligned and have unique numbers for additional residues.
1174 
1175  :getter: Computed on first use (cached)
1176  :type: :class:`~ost.mol.EntityHandle`
1177  """
1178  if self._lddt_mdl is None:
1179  self._PrepareOligoEntities()
1180  return self._lddt_mdl
1181 
1182  @property
1184  """lDDT Scorer object for :attr:`lddt_ref` and :attr:`lddt_mdl`.
1185 
1186  :getter: Computed on first use (cached)
1187  :type: :class:`~ost.mol.alg.lDDTScorer`
1188  """
1189  if self._oligo_lddt_scorer is None:
1191  references=[self.lddt_ref.Select("")],
1192  model=self.lddt_mdl.Select(""),
1193  settings=self.settings)
1194  return self._oligo_lddt_scorer
1195 
1196  @property
1198  """List of scorer objects for each chain mapped in :attr:`alignments`.
1199 
1200  :getter: Computed on first use (cached)
1201  :type: :class:`list` of :class:`MappedLDDTScorer`
1202  """
1203  if self._mapped_lddt_scorers is None:
1204  self._mapped_lddt_scorers = list()
1205  for aln in self.alignments:
1206  mapped_lddt_scorer = MappedLDDTScorer(aln, self.calpha_only,
1207  self.settings)
1208  self._mapped_lddt_scorers.append(mapped_lddt_scorer)
1209  return self._mapped_lddt_scorers
1210 
1211  @property
1212  def sc_lddt_scorers(self):
1213  """List of lDDT scorer objects extracted from :attr:`mapped_lddt_scorers`.
1214 
1215  :type: :class:`list` of :class:`~ost.mol.alg.lDDTScorer`
1216  """
1217  return [mls.lddt_scorer for mls in self.mapped_lddt_scorers]
1218 
1219  @property
1220  def sc_lddt(self):
1221  """List of global scores extracted from :attr:`sc_lddt_scorers`.
1222 
1223  If scoring for a mapped chain fails, an error is displayed and a score of 0
1224  is assigned.
1225 
1226  :getter: Computed on first use (cached)
1227  :type: :class:`list` of :class:`float`
1228  """
1229  if self._sc_lddt is None:
1230  self._sc_lddt = list()
1231  for lddt_scorer in self.sc_lddt_scorers:
1232  try:
1233  self._sc_lddt.append(lddt_scorer.global_score)
1234  except Exception as ex:
1235  LogError('Single chain lDDT failed:', str(ex))
1236  self._sc_lddt.append(0.0)
1237  return self._sc_lddt
1238 
1239  ##############################################################################
1240  # Class internal helpers
1241  ##############################################################################
1242 
1243  def _PrepareOligoEntities(self):
1244  # simple wrapper to avoid code duplication
1245  self._lddt_ref, self._lddt_mdl = _MergeAlignedChains(
1246  self.alignments, self.ref, self.mdl, self.calpha_only,
1247  self.penalize_extra_chains)
1248 
1249  @staticmethod
1250  def _GetUnmappedMdlChains(mdl, alignments):
1251  # assume model is second sequence in alignment and is named by chain
1252  mdl_chains = set(ch.name for ch in mdl.chains)
1253  mapped_mdl_chains = set(aln.GetSequence(1).GetName() for aln in alignments)
1254  return (mdl_chains - mapped_mdl_chains)
1255 
1256  def _GetRefScorers(self):
1257  # single chain lddt scorers for each reference chain (key = chain name)
1258  if self._ref_scorers is None:
1259  # collect from mapped_lddt_scorers
1260  ref_scorers = dict()
1261  for mapped_lddt_scorer in self.mapped_lddt_scorers:
1262  ref_ch_name = mapped_lddt_scorer.reference_chain_name
1263  ref_scorers[ref_ch_name] = mapped_lddt_scorer.lddt_scorer
1264  # add new ones where needed
1265  for ch in self.ref.chains:
1266  if ch.name not in ref_scorers:
1267  if self.calpha_only:
1268  ref_chain = ch.Select('aname=CA')
1269  else:
1270  ref_chain = ch.Select('')
1271  ref_scorers[ch.name] = lDDTScorer(
1272  references=[ref_chain],
1273  model=ref_chain,
1274  settings=self.settings)
1275  # store in cache
1276  self._ref_scorers = ref_scorers
1277  # fetch from cache
1278  return self._ref_scorers
1279 
1280  def _GetModelPenalty(self):
1281  # extra value to add to total number of distances for extra model chains
1282  # -> estimated from chem-mapped reference chains
1283  if self._model_penalty is None:
1284  # sanity check
1285  if self.chem_mapping is None:
1286  raise RuntimeError("Must provide chem_mapping when requesting penalty "
1287  "for extra model chains!")
1288  # get cached ref_scorers
1289  ref_scorers = self._GetRefScorers()
1290  # get unmapped model chains
1291  unmapped_mdl_chains = self._GetUnmappedMdlChains(self.mdl, self.alignments)
1292  # map extra chains to ref. chains
1293  model_penalty = 0
1294  for ch_name_mdl in sorted(unmapped_mdl_chains):
1295  # get penalty for chain
1296  cur_penalty = None
1297  for cm_ref, cm_mdl in self.chem_mapping.items():
1298  if ch_name_mdl in cm_mdl:
1299  # penalize by an average of the chem. mapped ref. chains
1300  cur_penalty = 0
1301  for ch_name_ref in cm_ref:
1302  # assumes that total_contacts is cached (for speed)
1303  cur_penalty += ref_scorers[ch_name_ref].total_contacts
1304  cur_penalty /= float(len(cm_ref))
1305  break
1306  # report penalty
1307  if cur_penalty is None:
1308  LogWarning('Extra MODEL chain %s could not be chemically mapped to '
1309  'any chain in REFERENCE, lDDT cannot consider it!' \
1310  % ch_name_mdl)
1311  else:
1312  LogScript('Extra MODEL chain %s added to lDDT score by considering '
1313  'chemically mapped chains in REFERENCE.' % ch_name_mdl)
1314  model_penalty += cur_penalty
1315  # store in cache
1316  self._model_penalty = model_penalty
1317  # fetch from cache
1318  return self._model_penalty
1319 
1320 
1321 class MappedLDDTScorer(object):
1322  """A simple class to calculate a single-chain lDDT score on a given chain to
1323  chain mapping as extracted from :class:`OligoLDDTScorer`.
1324 
1325  :param alignment: Sets :attr:`alignment`
1326  :param calpha_only: Sets :attr:`calpha_only`
1327  :param settings: Sets :attr:`settings`
1328 
1329  .. attribute:: alignment
1330 
1331  Alignment with two sequences named according to the mapped chains and with
1332  views attached to both sequences (e.g. one of the items of
1333  :attr:`QSscorer.alignments`).
1334 
1335  The first sequence is assumed to be the reference and the second one the
1336  model. Since the lDDT score is not symmetric (extra residues in model are
1337  ignored), the order is important.
1338 
1339  :type: :class:`~ost.seq.AlignmentHandle`
1340 
1341  .. attribute:: calpha_only
1342 
1343  If True, restricts lDDT score to CA only.
1344 
1345  :type: :class:`bool`
1346 
1347  .. attribute:: settings
1348 
1349  Settings to use for lDDT scoring.
1350 
1351  :type: :class:`~ost.mol.alg.lDDTSettings`
1352 
1353  .. attribute:: lddt_scorer
1354 
1355  lDDT Scorer object for the given chains.
1356 
1357  :type: :class:`~ost.mol.alg.lDDTScorer`
1358 
1359  .. attribute:: reference_chain_name
1360 
1361  Chain name of the reference.
1362 
1363  :type: :class:`str`
1364 
1365  .. attribute:: model_chain_name
1366 
1367  Chain name of the model.
1368 
1369  :type: :class:`str`
1370  """
1371  def __init__(self, alignment, calpha_only, settings):
1372  # prepare fields
1373  self.alignment = alignment
1374  self.calpha_only = calpha_only
1375  self.settings = settings
1376  self.lddt_scorer = None # set in _InitScorer
1377  self.reference_chain_name = alignment.sequences[0].name
1378  self.model_chain_name = alignment.sequences[1].name
1379  self._old_number_label = "old_num"
1380  self._extended_alignment = None # set in _InitScorer
1381  # initialize lDDT scorer
1382  self._InitScorer()
1383 
1385  """
1386  :return: Scores for each residue
1387  :rtype: :class:`list` of :class:`dict` with one item for each residue
1388  existing in model and reference:
1389 
1390  - "residue_number": Residue number in reference chain
1391  - "residue_name": Residue name in reference chain
1392  - "lddt": local lDDT
1393  - "conserved_contacts": number of conserved contacts
1394  - "total_contacts": total number of contacts
1395  """
1396  scores = list()
1397  assigned_residues = list()
1398  # Make sure the score is calculated
1399  self.lddt_scorer.global_score
1400  for col in self._extended_alignment:
1401  if col[0] != "-" and col.GetResidue(3).IsValid():
1402  ref_res = col.GetResidue(0)
1403  mdl_res = col.GetResidue(1)
1404  ref_res_renum = col.GetResidue(2)
1405  mdl_res_renum = col.GetResidue(3)
1406  if ref_res.one_letter_code != ref_res_renum.one_letter_code:
1407  raise RuntimeError("Reference residue name mapping inconsistent: %s != %s" %
1408  (ref_res.one_letter_code,
1409  ref_res_renum.one_letter_code))
1410  if mdl_res.one_letter_code != mdl_res_renum.one_letter_code:
1411  raise RuntimeError("Model residue name mapping inconsistent: %s != %s" %
1412  (mdl_res.one_letter_code,
1413  mdl_res_renum.one_letter_code))
1414  if ref_res.GetNumber().num != ref_res_renum.GetIntProp(self._old_number_label):
1415  raise RuntimeError("Reference residue number mapping inconsistent: %s != %s" %
1416  (ref_res.GetNumber().num,
1417  ref_res_renum.GetIntProp(self._old_number_label)))
1418  if mdl_res.GetNumber().num != mdl_res_renum.GetIntProp(self._old_number_label):
1419  raise RuntimeError("Model residue number mapping inconsistent: %s != %s" %
1420  (mdl_res.GetNumber().num,
1421  mdl_res_renum.GetIntProp(self._old_number_label)))
1422  if ref_res.qualified_name in assigned_residues:
1423  raise RuntimeError("Duplicated residue in reference: " %
1424  (ref_res.qualified_name))
1425  else:
1426  assigned_residues.append(ref_res.qualified_name)
1427  # check if property there (may be missing for CA-only)
1428  if mdl_res_renum.HasProp(self.settings.label):
1429  scores.append({
1430  "residue_number": ref_res.GetNumber().num,
1431  "residue_name": ref_res.name,
1432  "lddt": mdl_res_renum.GetFloatProp(self.settings.label),
1433  "conserved_contacts": mdl_res_renum.GetFloatProp(self.settings.label + "_conserved"),
1434  "total_contacts": mdl_res_renum.GetFloatProp(self.settings.label + "_total")})
1435  return scores
1436 
1437  ##############################################################################
1438  # Class internal helpers (anything that doesnt easily work without this class)
1439  ##############################################################################
1440 
1441  def _InitScorer(self):
1442  # Use copy of alignment (extended by 2 extra sequences for renumbering)
1443  aln = self.alignment.Copy()
1444  # Get chains and renumber according to alignment (for lDDT)
1445  reference = Renumber(
1446  aln.GetSequence(0),
1447  old_number_label=self._old_number_label).CreateFullView()
1448  refseq = seq.CreateSequence(
1449  "reference_renumbered",
1450  aln.GetSequence(0).GetString())
1451  refseq.AttachView(reference)
1452  aln.AddSequence(refseq)
1453  model = Renumber(
1454  aln.GetSequence(1),
1455  old_number_label=self._old_number_label).CreateFullView()
1456  modelseq = seq.CreateSequence(
1457  "model_renumbered",
1458  aln.GetSequence(1).GetString())
1459  modelseq.AttachView(model)
1460  aln.AddSequence(modelseq)
1461  # Filter to CA-only if desired (done after AttachView to not mess it up)
1462  if self.calpha_only:
1463  self.lddt_scorer = lDDTScorer(
1464  references=[reference.Select('aname=CA')],
1465  model=model.Select('aname=CA'),
1466  settings=self.settings)
1467  else:
1468  self.lddt_scorer = lDDTScorer(
1469  references=[reference],
1470  model=model,
1471  settings=self.settings)
1472  # Store alignment for later
1473  self._extended_alignment = aln
1474 
1475 ###############################################################################
1476 # HELPERS
1477 ###############################################################################
1478 
1479 # general
1480 
1481 def _AlignAtomSeqs(seq_1, seq_2):
1482  """
1483  :type seq_1: :class:`ost.seq.SequenceHandle`
1484  :type seq_2: :class:`ost.seq.SequenceHandle`
1485  :return: Alignment of two sequences using a global alignment. Views attached
1486  to the input sequences will remain attached in the aln.
1487  :rtype: :class:`~ost.seq.AlignmentHandle` or None if it failed.
1488  """
1489  # NOTE: If the two sequence have a greatly different length
1490  # a local alignment could be a better choice...
1491  aln = None
1492  alns = seq.alg.GlobalAlign(seq_1, seq_2, seq.alg.BLOSUM62)
1493  if alns: aln = alns[0]
1494  if not aln:
1495  LogWarning('Failed to align %s to %s' % (seq_1.name, seq_2.name))
1496  LogWarning('%s: %s' % (seq_1.name, seq_1.string))
1497  LogWarning('%s: %s' % (seq_2.name, seq_2.string))
1498  return aln
1499 
1500 def _FixSelectChainNames(ch_names):
1501  """
1502  :return: String to be used with Select(cname=<RETURN>). Takes care of joining
1503  and putting quotation marks where needed.
1504  :rtype: :class:`str`
1505  :param ch_names: Some iterable list of chain names (:class:`str` items).
1506  """
1507  return ','.join(mol.QueryQuoteName(ch_name) for ch_name in ch_names)
1508 
1509 # QS entity
1510 
1511 def _CleanInputEntity(ent):
1512  """
1513  :param ent: The OST entity to be cleaned.
1514  :type ent: :class:`EntityHandle` or :class:`EntityView`
1515  :return: A tuple of 3 items: :attr:`QSscoreEntity.ent`,
1516  :attr:`QSscoreEntity.removed_chains`,
1517  :attr:`QSscoreEntity.calpha_only`
1518  """
1519  # find chains to remove
1520  removed_chains = []
1521  for ch in ent.chains:
1522  # we remove chains if they are small-peptides or if the contain no aa
1523  # or if they contain only unknown or modified residues
1524  if ch.name in ['-', '_'] \
1525  or ch.residue_count < 20 \
1526  or not any(r.chem_type.IsAminoAcid() for r in ch.residues) \
1527  or not (set(r.one_letter_code for r in ch.residues) - {'?', 'X'}):
1528  removed_chains.append(ch.name)
1529 
1530  # remove them from *ent*
1531  if removed_chains:
1532  view = ent.Select('cname!=%s' % _FixSelectChainNames(set(removed_chains)))
1533  ent_new = mol.CreateEntityFromView(view, True)
1534  ent_new.SetName(ent.GetName())
1535  else:
1536  ent_new = ent
1537 
1538  # check if CA only
1539  calpha_only = False
1540  if ent_new.atom_count > 0 and ent_new.Select('aname=CB').atom_count == 0:
1541  LogInfo('Structure %s is a CA only structure!' % ent_new.GetName())
1542  calpha_only = True
1543 
1544  # report and return
1545  if removed_chains:
1546  LogInfo('Chains removed from %s: %s' \
1547  % (ent_new.GetName(), ''.join(removed_chains)))
1548  LogInfo('Chains in %s: %s' \
1549  % (ent_new.GetName(), ''.join([c.name for c in ent_new.chains])))
1550  return ent_new, removed_chains, calpha_only
1551 
1552 def _GetCAOnlyEntity(ent):
1553  """
1554  :param ent: Entity to process
1555  :type ent: :class:`EntityHandle` or :class:`EntityView`
1556  :return: New entity with only CA and only one atom per residue
1557  (see :attr:`QSscoreEntity.ca_entity`)
1558  """
1559  # cook up CA only view (diff from Select = guaranteed 1 atom per residue)
1560  ca_view = ent.CreateEmptyView()
1561  # add chain by chain
1562  for res in ent.residues:
1563  ca_atom = res.FindAtom("CA")
1564  if ca_atom.IsValid():
1565  ca_view.AddAtom(ca_atom)
1566  # finalize
1567  return mol.CreateEntityFromView(ca_view, False)
1568 
1569 def _GetChemGroups(qs_ent, seqid_thr=95.):
1570  """
1571  :return: Intra-complex group of chemically identical polypeptide chains
1572  (see :attr:`QSscoreEntity.chem_groups`)
1573 
1574  :param qs_ent: Entity to process
1575  :type qs_ent: :class:`QSscoreEntity`
1576  :param seqid_thr: Threshold used to decide when two chains are identical.
1577  95 percent tolerates the few mutations crystallographers
1578  like to do.
1579  :type seqid_thr: :class:`float`
1580  """
1581  # get data from qs_ent
1582  ca_chains = qs_ent.ca_chains
1583  chain_names = sorted(ca_chains.keys())
1584  # get pairs of identical chains
1585  # NOTE: this scales quadratically with number of chains and may be optimized
1586  # -> one could merge it with "merge transitive pairs" below...
1587  id_seqs = []
1588  for ch_1, ch_2 in itertools.combinations(chain_names, 2):
1589  aln = qs_ent.GetAlignment(ch_1, ch_2)
1590  if aln and seq.alg.SequenceIdentity(aln) > seqid_thr:
1591  id_seqs.append((ch_1, ch_2))
1592  # trivial case: no matching pairs
1593  if not id_seqs:
1594  return [[name] for name in chain_names]
1595 
1596  # merge transitive pairs
1597  groups = []
1598  for ch_1, ch_2 in id_seqs:
1599  found = False
1600  for g in groups:
1601  if ch_1 in g or ch_2 in g:
1602  found = True
1603  g.add(ch_1)
1604  g.add(ch_2)
1605  if not found:
1606  groups.append(set([ch_1, ch_2]))
1607  # sort internally based on sequence length
1608  chem_groups = []
1609  for g in groups:
1610  ranked_g = sorted([(-ca_chains[ch].length, ch) for ch in g])
1611  chem_groups.append([ch for _,ch in ranked_g])
1612  # add other dissimilar chains
1613  for ch in chain_names:
1614  if not any(ch in g for g in chem_groups):
1615  chem_groups.append([ch])
1616 
1617  return chem_groups
1618 
1619 def _GetAngles(Rt):
1620  """Computes the Euler angles given a transformation matrix.
1621 
1622  :param Rt: Rt operator.
1623  :type Rt: :class:`ost.geom.Mat4`
1624  :return: A :class:`tuple` of angles for each axis (x,y,z)
1625  """
1626  rot = np.asarray(Rt.ExtractRotation().data).reshape(3,3)
1627  tx = np.arctan2(rot[2,1], rot[2,2])
1628  if tx < 0:
1629  tx += 2*np.pi
1630  ty = np.arctan2(rot[2,0], np.sqrt(rot[2,1]**2 + rot[2,2]**2))
1631  if ty < 0:
1632  ty += 2*np.pi
1633  tz = np.arctan2(rot[1,0], rot[0,0])
1634  if tz < 0:
1635  tz += 2*np.pi
1636  return tx,ty,tz
1637 
1638 # QS scorer
1639 
1640 def _GetChemGroupsMapping(qs_ent_1, qs_ent_2):
1641  """
1642  :return: Inter-complex mapping of chemical groups
1643  (see :attr:`QSscorer.chem_mapping`)
1644 
1645  :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
1646  :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
1647  """
1648  # get chem. groups and unique representative
1649  chem_groups_1 = qs_ent_1.chem_groups
1650  chem_groups_2 = qs_ent_2.chem_groups
1651  repr_chains_1 = {x[0]: tuple(x) for x in chem_groups_1}
1652  repr_chains_2 = {x[0]: tuple(x) for x in chem_groups_2}
1653 
1654  # if entities do not have different number of unique chains, we get the
1655  # mapping for the smaller set
1656  swapped = False
1657  if len(repr_chains_2) < len(repr_chains_1):
1658  repr_chains_1, repr_chains_2 = repr_chains_2, repr_chains_1
1659  qs_ent_1, qs_ent_2 = qs_ent_2, qs_ent_1
1660  swapped = True
1661 
1662  # find the closest to each chain between the two entities
1663  # NOTE: this may still be sensible to orthology problem
1664  # -> currently we use a global alignment and seq. id. to rank pairs
1665  # -> we also tried local alignments and weighting the seq. id. by the
1666  # coverages of the alignments (gapless string in aln. / seq. length)
1667  # but global aln performed better...
1668  chain_pairs = []
1669  ca_chains_1 = qs_ent_1.ca_chains
1670  ca_chains_2 = qs_ent_2.ca_chains
1671  for ch_1 in list(repr_chains_1.keys()):
1672  for ch_2 in list(repr_chains_2.keys()):
1673  aln = _AlignAtomSeqs(ca_chains_1[ch_1], ca_chains_2[ch_2])
1674  if aln:
1675  chains_seqid = seq.alg.SequenceIdentity(aln)
1676  LogVerbose('Sequence identity', ch_1, ch_2, 'seqid=%.2f' % chains_seqid)
1677  chain_pairs.append((chains_seqid, ch_1, ch_2))
1678 
1679  # get top matching groups first
1680  chain_pairs = sorted(chain_pairs, reverse=True)
1681  chem_mapping = {}
1682  for _, c1, c2 in chain_pairs:
1683  skip = False
1684  for a,b in chem_mapping.items():
1685  if repr_chains_1[c1] == a or repr_chains_2[c2] == b:
1686  skip = True
1687  break
1688  if not skip:
1689  chem_mapping[repr_chains_1[c1]] = repr_chains_2[c2]
1690  if swapped:
1691  chem_mapping = {y: x for x, y in chem_mapping.items()}
1692  qs_ent_1, qs_ent_2 = qs_ent_2, qs_ent_1
1693 
1694  # notify chains without partner
1695  mapped_1 = set([i for s in list(chem_mapping.keys()) for i in s])
1696  chains_1 = set([c.name for c in qs_ent_1.ent.chains])
1697  if chains_1 - mapped_1:
1698  LogWarning('Unmapped Chains in %s: %s'
1699  % (qs_ent_1.GetName(), ','.join(list(chains_1 - mapped_1))))
1700 
1701  mapped_2 = set([i for s in list(chem_mapping.values()) for i in s])
1702  chains_2 = set([c.name for c in qs_ent_2.ent.chains])
1703  if chains_2 - mapped_2:
1704  LogWarning('Unmapped Chains in %s: %s'
1705  % (qs_ent_2.GetName(), ','.join(list(chains_2 - mapped_2))))
1706 
1707  # check if we have any chains left
1708  LogInfo('Chemical chain-groups mapping: ' + str(chem_mapping))
1709  if len(mapped_1) < 1 or len(mapped_2) < 1:
1710  raise QSscoreError('Less than 1 chains left in chem_mapping.')
1711  return chem_mapping
1712 
1713 def _SelectFew(l, max_elements):
1714  """Return l or copy of l with at most *max_elements* entries."""
1715  n_el = len(l)
1716  if n_el <= max_elements:
1717  return l
1718  else:
1719  # cheap integer ceiling (-1 to ensure that x*max_elements gets d_el = x)
1720  d_el = ((n_el - 1) // max_elements) + 1
1721  new_l = list()
1722  for i in range(0, n_el, d_el):
1723  new_l.append(l[i])
1724  return new_l
1725 
1726 def _GetAlignedResidues(qs_ent_1, qs_ent_2, chem_mapping, max_ca_per_chain,
1727  clustalw_bin):
1728  """
1729  :return: Tuple of two :class:`~ost.mol.EntityView` objects containing subsets
1730  of *qs_ent_1* and *qs_ent_2*. Two entities are later created from
1731  those views (see :attr:`QSscorer.ent_to_cm_1` and
1732  :attr:`QSscorer.ent_to_cm_2`)
1733 
1734  :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
1735  :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
1736  :param chem_mapping: See :attr:`QSscorer.chem_mapping`
1737  :param max_ca_per_chain: See :attr:`QSscorer.max_ca_per_chain_for_cm`
1738  """
1739  # make sure name doesn't contain spaces and is unique
1740  def _FixName(seq_name, seq_names):
1741  # get rid of spaces and make it unique
1742  seq_name = seq_name.replace(' ', '-')
1743  while seq_name in seq_names:
1744  seq_name += '-'
1745  return seq_name
1746  # resulting views into CA entities using CA chain sequences
1747  ent_view_1 = qs_ent_1.ca_entity.CreateEmptyView()
1748  ent_view_2 = qs_ent_2.ca_entity.CreateEmptyView()
1749  ca_chains_1 = qs_ent_1.ca_chains
1750  ca_chains_2 = qs_ent_2.ca_chains
1751  # go through all mapped chemical groups
1752  for group_1, group_2 in chem_mapping.items():
1753  # MSA with ClustalW
1754  seq_list = seq.CreateSequenceList()
1755  # keep sequence-name (must be unique) to view mapping
1756  seq_to_empty_view = dict()
1757  for ch in group_1:
1758  sequence = ca_chains_1[ch].Copy()
1759  sequence.name = _FixName(qs_ent_1.GetName() + '.' + ch, seq_to_empty_view)
1760  seq_to_empty_view[sequence.name] = ent_view_1
1761  seq_list.AddSequence(sequence)
1762  for ch in group_2:
1763  sequence = ca_chains_2[ch].Copy()
1764  sequence.name = _FixName(qs_ent_2.GetName() + '.' + ch, seq_to_empty_view)
1765  seq_to_empty_view[sequence.name] = ent_view_2
1766  seq_list.AddSequence(sequence)
1767  alnc = ClustalW(seq_list, clustalw=clustalw_bin)
1768 
1769  # collect aligned residues (list of lists of sequence_count valid residues)
1770  residue_lists = list()
1771  sequence_count = alnc.sequence_count
1772  for col in alnc:
1773  residues = [col.GetResidue(ir) for ir in range(sequence_count)]
1774  if all([r.IsValid() for r in residues]):
1775  residue_lists.append(residues)
1776  # check number of aligned residues
1777  if len(residue_lists) < 5:
1778  raise QSscoreError('Not enough aligned residues.')
1779  elif max_ca_per_chain > 0:
1780  residue_lists = _SelectFew(residue_lists, max_ca_per_chain)
1781  # check what view is for which residue
1782  res_views = [seq_to_empty_view[alnc.sequences[ir].name] \
1783  for ir in range(sequence_count)]
1784  # add to view (note: only 1 CA atom per residue in here)
1785  for res_list in residue_lists:
1786  for res, view in zip(res_list, res_views):
1787  view.AddResidue(res, mol.ViewAddFlag.INCLUDE_ATOMS)
1788  # done
1789  return ent_view_1, ent_view_2
1790 
1791 
1792 def _FindSymmetry(qs_ent_1, qs_ent_2, ent_to_cm_1, ent_to_cm_2, chem_mapping):
1793  """
1794  :return: A pair of comparable symmetry groups (for :attr:`QSscorer.symm_1`
1795  and :attr:`QSscorer.symm_2`) between the two structures.
1796  Empty lists if no symmetry identified.
1797 
1798  :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
1799  :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
1800  :param ent_to_cm_1: See :attr:`QSscorer.ent_to_cm_1`
1801  :param ent_to_cm_2: See :attr:`QSscorer.ent_to_cm_2`
1802  :param chem_mapping: See :attr:`QSscorer.chem_mapping`
1803  """
1804  LogInfo('Identifying Symmetry Groups...')
1805 
1806  # get possible symmetry groups
1807  symm_subg_1 = _GetSymmetrySubgroups(qs_ent_1, ent_to_cm_1,
1808  list(chem_mapping.keys()))
1809  symm_subg_2 = _GetSymmetrySubgroups(qs_ent_2, ent_to_cm_2,
1810  list(chem_mapping.values()))
1811 
1812  # check combination of groups
1813  LogInfo('Selecting Symmetry Groups...')
1814  # check how many mappings are to be checked for each pair of symmetry groups
1815  # -> lower number here will speed up worst-case runtime of _GetChainMapping
1816  # NOTE: this is tailored to speed up brute force all vs all mapping test
1817  # for preferred _CheckClosedSymmetry this is suboptimal!
1818  best_symm = []
1819  for symm_1, symm_2 in itertools.product(symm_subg_1, symm_subg_2):
1820  s1 = symm_1[0]
1821  s2 = symm_2[0]
1822  if len(s1) != len(s2):
1823  LogDebug('Discarded', str(symm_1), str(symm_2),
1824  ': different size of symmetry groups')
1825  continue
1826  count = _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping)
1827  nr_mapp = count['intra']['mappings'] + count['inter']['mappings']
1828  LogDebug('OK', str(symm_1), str(symm_2), ': %s mappings' % nr_mapp)
1829  best_symm.append((nr_mapp, symm_1, symm_2))
1830 
1831  for _, symm_1, symm_2 in sorted(best_symm):
1832  s1 = symm_1[0]
1833  s2 = symm_2[0]
1834  group_1 = ent_to_cm_1.Select('cname=%s' % _FixSelectChainNames(s1))
1835  group_2 = ent_to_cm_2.Select('cname=%s' % _FixSelectChainNames(s2))
1836  # check if by superposing a pair of chains within the symmetry group to
1837  # superpose all chains within the symmetry group
1838  # -> if successful, the symmetry groups are compatible
1839  closed_symm = _CheckClosedSymmetry(group_1, group_2, symm_1, symm_2,
1840  chem_mapping, 6, 0.8, find_best=False)
1841 
1842  if closed_symm:
1843  return symm_1, symm_2
1844 
1845  # nothing found
1846  LogInfo('No coherent symmetry identified between structures')
1847  return [], []
1848 
1849 
1850 def _GetChainMapping(ent_1, ent_2, symm_1, symm_2, chem_mapping,
1851  max_mappings_extensive):
1852  """
1853  :return: Tuple with mapping from *ent_1* to *ent_2* (see
1854  :attr:`QSscorer.chain_mapping`) and scheme used (see
1855  :attr:`QSscorer.chain_mapping_scheme`)
1856 
1857  :param ent_1: See :attr:`QSscorer.ent_to_cm_1`
1858  :param ent_2: See :attr:`QSscorer.ent_to_cm_2`
1859  :param symm_1: See :attr:`QSscorer.symm_1`
1860  :param symm_2: See :attr:`QSscorer.symm_2`
1861  :param chem_mapping: See :attr:`QSscorer.chem_mapping`
1862  :param max_mappings_extensive: See :attr:`QSscorer.max_mappings_extensive`
1863  """
1864  LogInfo('Symmetry-groups used in %s: %s' % (ent_1.GetName(), str(symm_1)))
1865  LogInfo('Symmetry-groups used in %s: %s' % (ent_2.GetName(), str(symm_2)))
1866 
1867  # quick check for closed symmetries
1868  thresholds = [( 'strict', 4, 0.8),
1869  ( 'tolerant', 6, 0.4),
1870  ('permissive', 8, 0.2)]
1871  for scheme, sup_thr, sup_fract in thresholds:
1872  # check if by superposing a pair of chains within the symmetry group to
1873  # superpose all chains of the two oligomers
1874  # -> this also returns the resulting mapping of chains
1875  mapping = _CheckClosedSymmetry(ent_1, ent_2, symm_1, symm_2,
1876  chem_mapping, sup_thr, sup_fract)
1877  if mapping:
1878  LogInfo('Closed Symmetry with %s parameters' % scheme)
1879  if scheme == 'permissive':
1880  LogWarning('Permissive thresholds used for overlap based mapping ' + \
1881  'detection: check mapping manually: %s' % mapping)
1882  return mapping, scheme
1883 
1884  # NOTE that what follows below is sub-optimal:
1885  # - if the two structures don't fit at all, we may map chains rather randomly
1886  # - we may need a lot of operations to find globally "best" mapping
1887  # -> COST = O(N^2) * O(N!)
1888  # (first = all pairwise chain-superpose, latter = all chain mappings)
1889  # -> a greedy chain mapping choice algo (pick best RMSD for each one-by-one)
1890  # could reduce this to O(N^2) * O(N^2) but it would need to be validated
1891  # - one could also try some other heuristic based on center of atoms of chains
1892  # -> at this point we get a bad mapping anyways...
1893 
1894  # if we get here, we will need to check many combinations of mappings
1895  # -> check how many mappings must be checked and limit
1896  count = _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping)
1897  LogInfo('Intra Symmetry-group mappings to check: %s' \
1898  % count['intra']['mappings'])
1899  LogInfo('Inter Symmetry-group mappings to check: %s' \
1900  % count['inter']['mappings'])
1901  nr_mapp = count['intra']['mappings'] + count['inter']['mappings']
1902  if nr_mapp > max_mappings_extensive:
1903  raise QSscoreError('Too many possible mappings: %s' % nr_mapp)
1904 
1905  # to speed up the computations we cache chain views and RMSDs
1906  cached_rmsd = _CachedRMSD(ent_1, ent_2)
1907 
1908  # get INTRA symmetry group chain mapping
1909  check = 0
1910  intra_mappings = [] # list of (RMSD, c1, c2, mapping) tuples (best superpose)
1911  # limit chem mapping to reference symmetry group
1912  ref_symm_1 = symm_1[0]
1913  ref_symm_2 = symm_2[0]
1914  asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
1915  # for each chemically identical group
1916  for g1, g2 in asu_chem_mapping.items():
1917  # try to superpose all possible pairs
1918  for c1, c2 in itertools.product(g1, g2):
1919  # get superposition transformation
1920  LogVerbose('Superposing chains: %s, %s' % (c1, c2))
1921  res = cached_rmsd.GetSuperposition(c1, c2)
1922  # compute RMSD of possible mappings
1923  cur_mappings = [] # list of (RMSD, mapping) tuples
1924  for mapping in _ListPossibleMappings(c1, c2, asu_chem_mapping):
1925  check += 1
1926  chains_rmsd = cached_rmsd.GetMappedRMSD(mapping, res.transformation)
1927  cur_mappings.append((chains_rmsd, mapping))
1928  LogVerbose(str(mapping), str(chains_rmsd))
1929  best_rmsd, best_mapp = min(cur_mappings)
1930  intra_mappings.append((best_rmsd, c1, c2, best_mapp))
1931  # best chain-chain superposition defines the intra asu mapping
1932  _, intra_asu_c1, intra_asu_c2, intra_asu_mapp = min(intra_mappings)
1933 
1934  # if only one asu is present in any of the complexes, we're done
1935  if len(symm_1) == 1 or len(symm_2) == 1:
1936  mapping = intra_asu_mapp
1937  else:
1938  # the mapping is the element position within the asu chain list
1939  # -> this speed up a lot, assuming that the order of chain in asu groups
1940  # is following the order of symmetry operations
1941  index_asu_c1 = ref_symm_1.index(intra_asu_c1)
1942  index_asu_c2 = ref_symm_2.index(intra_asu_c2)
1943  index_mapp = {ref_symm_1.index(s1): ref_symm_2.index(s2) \
1944  for s1, s2 in intra_asu_mapp.items()}
1945  LogInfo('Intra symmetry-group mapping: %s' % str(intra_asu_mapp))
1946 
1947  # get INTER symmetry group chain mapping
1948  # we take the first symmetry group from the complex having the most
1949  if len(symm_1) < len(symm_2):
1950  symm_combinations = list(itertools.product(symm_1, [symm_2[0]]))
1951  else:
1952  symm_combinations = list(itertools.product([symm_1[0]], symm_2))
1953 
1954  full_asu_mapp = {tuple(symm_1): tuple(symm_2)}
1955  full_mappings = [] # list of (RMSD, mapping) tuples
1956  for s1, s2 in symm_combinations:
1957  # get superposition transformation (we take best chain-pair in asu)
1958  LogVerbose('Superposing symmetry-group: %s, %s in %s, %s' \
1959  % (s1[index_asu_c1], s2[index_asu_c2], s1, s2))
1960  res = cached_rmsd.GetSuperposition(s1[index_asu_c1], s2[index_asu_c2])
1961  # compute RMSD of possible mappings
1962  for asu_mapp in _ListPossibleMappings(s1, s2, full_asu_mapp):
1963  check += 1
1964  # need to extract full chain mapping (use indexing)
1965  mapping = {}
1966  for a, b in asu_mapp.items():
1967  for id_a, id_b in index_mapp.items():
1968  mapping[a[id_a]] = b[id_b]
1969  chains_rmsd = cached_rmsd.GetMappedRMSD(mapping, res.transformation)
1970  full_mappings.append((chains_rmsd, mapping))
1971  LogVerbose(str(mapping), str(chains_rmsd))
1972 
1973  if check != nr_mapp:
1974  LogWarning('Mapping number estimation was wrong') # sanity check
1975 
1976  # return best (lowest RMSD) mapping
1977  mapping = min(full_mappings, key=lambda x: x[0])[1]
1978 
1979  LogWarning('Extensive search used for mapping detection (fallback). This ' + \
1980  'approach has known limitations. Check mapping manually: %s' \
1981  % mapping)
1982  return mapping, 'extensive'
1983 
1984 
1985 def _GetSymmetrySubgroups(qs_ent, ent, chem_groups):
1986  """
1987  Identify the symmetry (either cyclic C or dihedral D) of the protein and find
1988  all possible symmetry subgroups. This is done testing all combination of chain
1989  superposition and clustering them by the angles (D) or the axis (C) of the Rt
1990  operator.
1991 
1992  Groups of superposition which can fully reconstruct the structure are possible
1993  symmetry subgroups.
1994 
1995  :param qs_ent: Entity with cached angles and axis.
1996  :type qs_ent: :class:`QSscoreEntity`
1997  :param ent: Entity from which to extract chains (perfect alignment assumed
1998  for superposition as in :attr:`QSscorer.ent_to_cm_1`).
1999  :type ent: :class:`EntityHandle` or :class:`EntityView`
2000  :param chem_groups: List of tuples/lists of chain names in *ent*. Each list
2001  contains all chains belonging to a chem. group (could be
2002  from :attr:`QSscoreEntity.chem_groups` or from "keys()"
2003  or "values()" of :attr:`QSscorer.chem_mapping`)
2004 
2005  :return: A list of possible symmetry subgroups (each in same format as
2006  :attr:`QSscorer.symm_1`). If no symmetry is found, we return a list
2007  with a single symmetry subgroup with a single group with all chains.
2008  """
2009  # get angles / axis for pairwise transformations within same chem. group
2010  angles = {}
2011  axis = {}
2012  for cnames in chem_groups:
2013  for c1, c2 in itertools.combinations(cnames, 2):
2014  cur_angles = qs_ent.GetAngles(c1, c2)
2015  if not np.any(np.isnan(cur_angles)):
2016  angles[(c1,c2)] = cur_angles
2017  cur_axis = qs_ent.GetAxis(c1, c2)
2018  if not np.any(np.isnan(cur_axis)):
2019  axis[(c1,c2)] = cur_axis
2020 
2021  # cluster superpositions angles at different thresholds
2022  symm_groups = []
2023  LogVerbose('Possible symmetry-groups for: %s' % ent.GetName())
2024  for angle_thr in np.arange(0.1, 1, 0.1):
2025  d_symm_groups = _GetDihedralSubgroups(ent, chem_groups, angles, angle_thr)
2026  if d_symm_groups:
2027  for group in d_symm_groups:
2028  if group not in symm_groups:
2029  symm_groups.append(group)
2030  LogVerbose('Dihedral: %s' % group)
2031  LogInfo('Symmetry threshold %.1f used for angles of %s' \
2032  % (angle_thr, ent.GetName()))
2033  break
2034 
2035  # cluster superpositions axis at different thresholds
2036  for axis_thr in np.arange(0.1, 1, 0.1):
2037  c_symm_groups = _GetCyclicSubgroups(ent, chem_groups, axis, axis_thr)
2038  if c_symm_groups:
2039  for group in c_symm_groups:
2040  if group not in symm_groups:
2041  symm_groups.append(group)
2042  LogVerbose('Cyclic: %s' % group)
2043  LogInfo('Symmetry threshold %.1f used for axis of %s' \
2044  % (axis_thr, ent.GetName()))
2045  break
2046 
2047  # fall back to single "group" containing all chains if none found
2048  if not symm_groups:
2049  LogInfo('No symmetry-group detected in %s' % ent.GetName())
2050  symm_groups = [[tuple([c for g in chem_groups for c in g])]]
2051 
2052  return symm_groups
2053 
2054 def _GetDihedralSubgroups(ent, chem_groups, angles, angle_thr):
2055  """
2056  :return: Dihedral subgroups for :func:`_GetSymmetrySubgroups`
2057  (same return type as there). Empty list if fail.
2058 
2059  :param ent: See :func:`_GetSymmetrySubgroups`
2060  :param chem_groups: See :func:`_GetSymmetrySubgroups`
2061  :param angles: :class:`dict` (key = chain-pair-tuple, value = Euler angles)
2062  :param angle_thr: Euler angles distance threshold for clustering (float).
2063  """
2064  # cluster superpositions angles
2065  if len(angles) == 0:
2066  # nothing to be done here
2067  return []
2068  else:
2069  same_angles = _ClusterData(angles, angle_thr, _AngleArrayDistance)
2070 
2071  # get those which are non redundant and covering all chains
2072  symm_groups = []
2073  for clst in list(same_angles.values()):
2074  group = list(clst.keys())
2075  if _ValidChainGroup(group, ent):
2076  if len(chem_groups) > 1:
2077  # if hetero, we want to group toghether different chains only
2078  symm_groups.append(list(zip(*group)))
2079  else:
2080  # if homo, we also want pairs
2081  symm_groups.append(group)
2082  symm_groups.append(list(zip(*group)))
2083  return symm_groups
2084 
2085 def _GetCyclicSubgroups(ent, chem_groups, axis, axis_thr):
2086  """
2087  :return: Cyclic subgroups for :func:`_GetSymmetrySubgroups`
2088  (same return type as there). Empty list if fail.
2089 
2090  :param ent: See :func:`_GetSymmetrySubgroups`
2091  :param chem_groups: See :func:`_GetSymmetrySubgroups`
2092  :param angles: :class:`dict` (key = chain-pair-tuple, value = rotation axis)
2093  :param angle_thr: Axis distance threshold for clustering (float).
2094  """
2095  # cluster superpositions axis
2096  if len(axis) == 0:
2097  # nothing to be done here
2098  return []
2099  else:
2100  same_axis = _ClusterData(axis, axis_thr, _AxisDistance)
2101 
2102  # use to get grouping
2103  symm_groups = []
2104  for clst in list(same_axis.values()):
2105  all_chain = [item for sublist in list(clst.keys()) for item in sublist]
2106  if len(set(all_chain)) == ent.chain_count:
2107  # if we have an hetero we can exploit cyclic symmetry for grouping
2108  if len(chem_groups) > 1:
2109  ref_chem_group = chem_groups[0]
2110  # try two criteria for grouping
2111  cyclic_group_closest = []
2112  cyclic_group_iface = []
2113  for c1 in ref_chem_group:
2114  group_closest = [c1]
2115  group_iface = [c1]
2116  for chains in chem_groups[1:]:
2117  # center of atoms distance
2118  closest = _GetClosestChain(ent, c1, chains)
2119  # chain with bigger interface
2120  closest_iface = _GetClosestChainInterface(ent, c1, chains)
2121  group_closest.append(closest)
2122  group_iface.append(closest_iface)
2123  cyclic_group_closest.append(tuple(group_closest))
2124  cyclic_group_iface.append(tuple(group_iface))
2125  if _ValidChainGroup(cyclic_group_closest, ent):
2126  symm_groups.append(cyclic_group_closest)
2127  if _ValidChainGroup(cyclic_group_iface, ent):
2128  symm_groups.append(cyclic_group_iface)
2129  # if it is a homo, there's not much we can group
2130  else:
2131  symm_groups.append(chem_groups)
2132  return symm_groups
2133 
2134 def _ClusterData(data, thr, metric):
2135  """Wrapper for fclusterdata to get dict of clusters.
2136 
2137  :param data: :class:`dict` (keys for ID, values used for clustering)
2138  :return: :class:`dict` {cluster_idx: {data-key: data-value}}
2139  """
2140  # special case length 1
2141  if len(data) == 1:
2142  return {0: {list(data.keys())[0]: list(data.values())[0]} }
2143  # do the clustering
2144  cluster_indices = fclusterdata(np.asarray(list(data.values())), thr,
2145  method='complete', criterion='distance',
2146  metric=metric)
2147  # fclusterdata output is cluster ids -> put into dict of clusters
2148  res = {}
2149  for cluster_idx, data_key in zip(cluster_indices, list(data.keys())):
2150  if not cluster_idx in res:
2151  res[cluster_idx] = {}
2152  res[cluster_idx][data_key] = data[data_key]
2153  return res
2154 
2155 def _AngleArrayDistance(u, v):
2156  """
2157  :return: Average angular distance of two arrays of angles.
2158  :param u: Euler angles.
2159  :param v: Euler angles.
2160  """
2161  dist = []
2162  for a,b in zip(u,v):
2163  delta = abs(a-b)
2164  if delta > np.pi:
2165  delta = abs(2*np.pi - delta)
2166  dist.append(delta)
2167  return np.mean(dist)
2168 
2169 def _AxisDistance(u, v):
2170  """
2171  :return: Euclidean distance between two rotation axes. Axes can point in
2172  either direction so we ensure to use the closer one.
2173  :param u: Rotation axis.
2174  :param v: Rotation axis.
2175  """
2176  # get axes as arrays (for convenience)
2177  u = np.asarray(u)
2178  v = np.asarray(v)
2179  # get shorter of two possible distances (using v or -v)
2180  dv1 = u - v
2181  dv2 = u + v
2182  d1 = np.dot(dv1, dv1)
2183  d2 = np.dot(dv2, dv2)
2184  return np.sqrt(min(d1, d2))
2185 
2186 def _GetClosestChain(ent, ref_chain, chains):
2187  """
2188  :return: Chain closest to *ref_chain* based on center of atoms distance.
2189  :rtype: :class:`str`
2190  :param ent: See :func:`_GetSymmetrySubgroups`
2191  :param ref_chain: We look for chains closest to this one
2192  :type ref_chain: :class:`str`
2193  :param chains: We only consider these chains
2194  :type chains: :class:`list` of :class:`str`
2195  """
2196  contacts = []
2197  ref_center = ent.FindChain(ref_chain).center_of_atoms
2198  for ch in chains:
2199  center = ent.FindChain(ch).center_of_atoms
2200  contacts.append((geom.Distance(ref_center, center), ch))
2201  closest_chain = min(contacts)[1]
2202  return closest_chain
2203 
2204 def _GetClosestChainInterface(ent, ref_chain, chains):
2205  """
2206  :return: Chain with biggest interface (within 10 A) to *ref_chain*.
2207  :rtype: :class:`str`
2208  :param ent: See :func:`_GetSymmetrySubgroups`
2209  :param ref_chain: We look for chains closest to this one
2210  :type ref_chain: :class:`str`
2211  :param chains: We only consider these chains
2212  :type chains: :class:`list` of :class:`str`
2213  """
2214  # NOTE: this is computed on a subset of the full biounit and might be
2215  # inaccurate. Also it could be extracted from QSscoreEntity.contacts.
2216  closest = []
2217  for ch in chains:
2218  iface_view = ent.Select('cname="%s" and 10 <> [cname="%s"]' \
2219  % (ref_chain, ch))
2220  nr_res = iface_view.residue_count
2221  closest.append((nr_res, ch))
2222  closest_chain = max(closest)[1]
2223  return closest_chain
2224 
2225 def _ValidChainGroup(group, ent):
2226  """
2227  :return: True, if *group* has unique chain names and as many chains as *ent*
2228  :rtype: :class:`bool`
2229  :param group: Symmetry groups to check
2230  :type group: Same as :attr:`QSscorer.symm_1`
2231  :param ent: See :func:`_GetSymmetrySubgroups`
2232  """
2233  all_chain = [item for sublist in group for item in sublist]
2234  if len(all_chain) == len(set(all_chain)) and\
2235  len(all_chain) == ent.chain_count:
2236  return True
2237  return False
2238 
2239 def _LimitChemMapping(chem_mapping, limit_1, limit_2):
2240  """
2241  :return: Chem. mapping containing only chains in *limit_1* and *limit_2*
2242  :rtype: Same as :attr:`QSscorer.chem_mapping`
2243  :param chem_mapping: See :attr:`QSscorer.chem_mapping`
2244  :param limit_1: Limits chain names in chem_mapping.keys()
2245  :type limit_1: List/tuple of strings
2246  :param limit_2: Limits chain names in chem_mapping.values()
2247  :type limit_2: List/tuple of strings
2248  """
2249  # use set intersection to keep only mapping for chains in limit_X
2250  limit_1_set = set(limit_1)
2251  limit_2_set = set(limit_2)
2252  asu_chem_mapping = dict()
2253  for key, value in chem_mapping.items():
2254  new_key = tuple(set(key) & limit_1_set)
2255  if new_key:
2256  asu_chem_mapping[new_key] = tuple(set(value) & limit_2_set)
2257  return asu_chem_mapping
2258 
2259 
2260 def _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping):
2261  """
2262  :return: Dictionary of number of mappings and superpositions to be performed.
2263  Returned as *result[X][Y] = number* with X = "intra" or "inter" and
2264  Y = "mappings" or "superpositions". The idea is that for each
2265  pairwise superposition we check all possible mappings.
2266  We can check the combinations within (intra) a symmetry group and
2267  once established, we check the combinations between different (inter)
2268  symmetry groups.
2269  :param symm_1: See :attr:`QSscorer.symm_1`
2270  :param symm_2: See :attr:`QSscorer.symm_2`
2271  :param chem_mapping: See :attr:`QSscorer.chem_mapping`
2272  """
2273  # setup
2274  c = {}
2275  c['intra'] = {}
2276  c['inter'] = {}
2277  c['intra']['mappings'] = 0
2278  c['intra']['superpositions'] = 0
2279  c['inter']['mappings'] = 0
2280  c['inter']['superpositions'] = 0
2281  # intra ASU mappings within reference symmetry groups
2282  ref_symm_1 = symm_1[0]
2283  ref_symm_2 = symm_2[0]
2284  asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
2285  for g1, g2 in asu_chem_mapping.items():
2286  chain_superpositions = len(g1) * len(g2)
2287  c['intra']['superpositions'] += chain_superpositions
2288  map_per_sup = _PermutationOrCombinations(g1[0], g2[0], asu_chem_mapping)
2289  c['intra']['mappings'] += chain_superpositions * map_per_sup
2290  if len(symm_1) == 1 or len(symm_2) == 1:
2291  return c
2292  # inter ASU mappings
2293  asu_superposition = min(len(symm_1), len(symm_2))
2294  c['inter']['superpositions'] = asu_superposition
2295  asu = {tuple(symm_1): tuple(symm_2)}
2296  map_per_sup = _PermutationOrCombinations(ref_symm_1, ref_symm_2, asu)
2297  c['inter']['mappings'] = asu_superposition * map_per_sup
2298  return c
2299 
2300 def _PermutationOrCombinations(sup1, sup2, chem_mapping):
2301  """Should match len(_ListPossibleMappings(sup1, sup2, chem_mapping))."""
2302  # remove superposed elements, put smallest complex as key
2303  chem_map = {}
2304  for a,b in chem_mapping.items():
2305  new_a = tuple([x for x in a if x != sup1])
2306  new_b = tuple([x for x in b if x != sup2])
2307  chem_map[new_a] = new_b
2308  mapp_nr = 1.0
2309  for a, b in chem_map.items():
2310  if len(a) == len(b):
2311  mapp_nr *= factorial(len(a))
2312  elif len(a) < len(b):
2313  mapp_nr *= binom(len(b), len(a))
2314  elif len(a) > len(b):
2315  mapp_nr *= binom(len(a), len(b))
2316  return int(mapp_nr)
2317 
2318 def _ListPossibleMappings(sup1, sup2, chem_mapping):
2319  """
2320  Return a flat list of all possible mappings given *chem_mapping* and keeping
2321  mapping of *sup1* and *sup2* fixed. For instance if elements are chain names
2322  this is all the mappings to check for a given superposition.
2323 
2324  Elements in first complex are defined by *chem_mapping.keys()* (list of list
2325  of elements) and elements in second complex by *chem_mapping.values()*. If
2326  complexes don't have same number of elements, we map only elements for the one
2327  with less. Also mapping is only between elements of mapped groups according to
2328  *chem_mapping*.
2329 
2330  :rtype: :class:`list` of :class:`dict` (key = element in chem_mapping-key,
2331  value = element in chem_mapping-value)
2332  :param sup1: Element for which mapping is fixed.
2333  :type sup1: Like element in chem_mapping-key
2334  :param sup2: Element for which mapping is fixed.
2335  :type sup2: Like element in chem_mapping-value
2336  :param chem_mapping: Defines mapping between groups of elements (e.g. result
2337  of :func:`_LimitChemMapping`).
2338  :type chem_mapping: :class:`dict` with key / value = :class:`tuple`
2339 
2340  :raises: :class:`QSscoreError` if reference complex (first one or one with
2341  less elements) has more elements for any given mapped group.
2342  """
2343  # find smallest complex
2344  chain1 = [i for s in list(chem_mapping.keys()) for i in s]
2345  chain2 = [i for s in list(chem_mapping.values()) for i in s]
2346  swap = False
2347  if len(chain1) > len(chain2):
2348  swap = True
2349  # remove superposed elements, put smallest complex as key
2350  chem_map = {}
2351  for a, b in chem_mapping.items():
2352  new_a = tuple([x for x in a if x != sup1])
2353  new_b = tuple([x for x in b if x != sup2])
2354  if swap:
2355  chem_map[new_b] = new_a
2356  else:
2357  chem_map[new_a] = new_b
2358  # generate permutations or combinations of chemically
2359  # equivalent chains
2360  chem_perm = []
2361  chem_ref = []
2362  for a, b in chem_map.items():
2363  if len(a) == len(b):
2364  chem_perm.append(list(itertools.permutations(b)))
2365  chem_ref.append(a)
2366  elif len(a) < len(b):
2367  chem_perm.append(list(itertools.combinations(b, len(a))))
2368  chem_ref.append(a)
2369  else:
2370  raise QSscoreError('Impossible to define reference group: %s' \
2371  % str(chem_map))
2372  # save the list of possible mappings
2373  mappings = []
2374  flat_ref = [i for s in chem_ref for i in s]
2375  for perm in itertools.product(*chem_perm):
2376  flat_perm = [i for s in perm for i in s]
2377  d = {c1: c2 for c1, c2 in zip(flat_ref, flat_perm)}
2378  if swap:
2379  d = {v: k for k, v in list(d.items())}
2380  d.update({sup1: sup2})
2381  mappings.append(d)
2382  return mappings
2383 
2384 
2385 def _CheckClosedSymmetry(ent_1, ent_2, symm_1, symm_2, chem_mapping,
2386  sup_thr=4, sup_fract=0.8, find_best=True):
2387  """
2388  Quick check if we can superpose two chains and get a mapping for all other
2389  chains using the same transformation. The mapping is defined by sufficient
2390  overlap of the transformed chain of *ent_1* onto another chain in *ent_2*.
2391 
2392  :param ent_1: Entity to map to *ent_2* (perfect alignment assumed between
2393  chains of same chem. group as in :attr:`QSscorer.ent_to_cm_1`).
2394  Views are ok but only to select full chains!
2395  :param ent_2: Entity to map to (perfect alignment assumed between
2396  chains of same chem. group as in :attr:`QSscorer.ent_to_cm_2`).
2397  Views are ok but only to select full chains!
2398  :param symm_1: Symmetry groups to use. We only superpose chains within
2399  reference symmetry group of *symm_1* and *symm_2*.
2400  See :attr:`QSscorer.symm_1`
2401  :param symm_2: See :attr:`QSscorer.symm_2`
2402  :param chem_mapping: See :attr:`QSscorer.chem_mapping`.
2403  All chains in *ent_1* / *ent_2* must be listed here!
2404  :param sup_thr: Distance around transformed chain in *ent_1* to check for
2405  overlap.
2406  :type sup_thr: :class:`float`
2407  :param sup_fract: Fraction of atoms in chain of *ent_2* that must be
2408  overlapped for overlap to be sufficient.
2409  :type sup_fract: :class:`float`
2410  :param find_best: If True, we look for best mapping according to
2411  :func:`_GetMappedRMSD`. Otherwise, we return first suitable
2412  mapping.
2413  :type find_best: :class:`bool`
2414 
2415  :return: Mapping from *ent_1* to *ent_2* or None if none found. Mapping, if
2416  found, is as in :attr:`QSscorer.chain_mapping`.
2417  :rtype: :class:`dict` (:class:`str` / :class:`str`)
2418  """
2419  # limit chem mapping to reference symmetry group
2420  ref_symm_1 = symm_1[0]
2421  ref_symm_2 = symm_2[0]
2422  asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
2423  # for each chemically identical group
2424  rmsd_mappings = []
2425  for g1, g2 in asu_chem_mapping.items():
2426  # try to superpose all possible pairs
2427  # -> note that some chain-chain combinations may work better than others
2428  # to superpose the full oligomer (e.g. if some chains are open/closed)
2429  for c1, c2 in itertools.product(g1, g2):
2430  # get superposition transformation
2431  chain_1 = ent_1.Select('cname="%s"' % c1)
2432  chain_2 = ent_2.Select('cname="%s"' % c2)
2433  res = mol.alg.SuperposeSVD(chain_1, chain_2, apply_transform=False)
2434  # look for overlaps
2435  mapping = _GetSuperpositionMapping(ent_1, ent_2, chem_mapping,
2436  res.transformation, sup_thr,
2437  sup_fract)
2438  if not mapping:
2439  continue
2440  # early abort if we only want the first one
2441  if not find_best:
2442  return mapping
2443  else:
2444  # get RMSD for sorting
2445  rmsd = _GetMappedRMSD(ent_1, ent_2, mapping, res.transformation)
2446  rmsd_mappings.append((rmsd, mapping))
2447  # return best mapping
2448  if rmsd_mappings:
2449  return min(rmsd_mappings, key=lambda x: x[0])[1]
2450  else:
2451  return None
2452 
2453 def _GetSuperpositionMapping(ent_1, ent_2, chem_mapping, transformation,
2454  sup_thr, sup_fract):
2455  """
2456  :return: Dict with chain mapping from *ent_1* to *ent_2* or None if failed
2457  (see :func:`_CheckClosedSymmetry`).
2458  :param ent_1: See :func:`_CheckClosedSymmetry`
2459  :param ent_2: See :func:`_CheckClosedSymmetry`
2460  :param chem_mapping: See :func:`_CheckClosedSymmetry`
2461  :param transformation: Superposition transformation to be applied to *ent_1*.
2462  :param sup_thr: See :func:`_CheckClosedSymmetry`
2463  :param sup_fract: See :func:`_CheckClosedSymmetry`
2464  """
2465  # NOTE: this seems to be the comput. most expensive part in QS scoring
2466  # -> it could be moved to C++ for speed-up
2467  # -> the next expensive bits are ClustalW and GetContacts btw...
2468 
2469  # swap if needed (ent_1 must be shorter or same)
2470  if ent_1.chain_count > ent_2.chain_count:
2471  swap = True
2472  ent_1, ent_2 = ent_2, ent_1
2473  transformation = geom.Invert(transformation)
2474  chem_pairs = list(zip(list(chem_mapping.values()), list(chem_mapping.keys())))
2475  else:
2476  swap = False
2477  chem_pairs = iter(chem_mapping.items())
2478  # sanity check
2479  if ent_1.chain_count == 0:
2480  return None
2481  # extract valid partners for each chain in ent_1
2482  chem_partners = dict()
2483  for cg1, cg2 in chem_pairs:
2484  for ch in cg1:
2485  chem_partners[ch] = set(cg2)
2486  # get mapping for each chain in ent_1
2487  mapping = dict()
2488  mapped_chains = set()
2489  for ch_1 in ent_1.chains:
2490  # get all neighbors split by chain (NOTE: this muight be moved to C++)
2491  ch_atoms = {ch_2.name: set() for ch_2 in ent_2.chains}
2492  for a_1 in ch_1.handle.atoms:
2493  transformed_pos = geom.Vec3(transformation * geom.Vec4(a_1.pos))
2494  a_within = ent_2.FindWithin(transformed_pos, sup_thr)
2495  for a_2 in a_within:
2496  ch_atoms[a_2.chain.name].add(a_2.hash_code)
2497  # get one with most atoms in overlap
2498  max_num, max_name = max((len(atoms), name)
2499  for name, atoms in ch_atoms.items())
2500  # early abort if overlap fraction not good enough
2501  ch_2 = ent_2.FindChain(max_name)
2502  if max_num == 0 or max_num / float(ch_2.handle.atom_count) < sup_fract:
2503  return None
2504  # early abort if mapped to chain of different chem. group
2505  ch_1_name = ch_1.name
2506  if ch_1_name not in chem_partners:
2507  raise RuntimeError("Chem. mapping doesn't contain all chains!")
2508  if max_name not in chem_partners[ch_1_name]:
2509  return None
2510  # early abort if multiple ones mapped to same chain
2511  if max_name in mapped_chains:
2512  return None
2513  # set mapping
2514  mapping[ch_1_name] = max_name
2515  mapped_chains.add(max_name)
2516  # unswap if needed and return
2517  if swap:
2518  mapping = {v: k for k, v in mapping.items()}
2519  return mapping
2520 
2521 def _GetMappedRMSD(ent_1, ent_2, chain_mapping, transformation):
2522  """
2523  :return: RMSD between complexes considering chain mapping.
2524  :param ent_1: Entity mapped to *ent_2* (perfect alignment assumed between
2525  mapped chains as in :attr:`QSscorer.ent_to_cm_1`).
2526  :param ent_2: Entity which was mapped to (perfect alignment assumed between
2527  mapped chains as in :attr:`QSscorer.ent_to_cm_2`).
2528  :param chain_mapping: See :attr:`QSscorer.chain_mapping`
2529  :param transformation: Superposition transformation to be applied to *ent_1*.
2530  """
2531  # collect RMSDs and atom counts chain by chain and combine afterwards
2532  rmsds = []
2533  atoms = []
2534  for c1, c2 in chain_mapping.items():
2535  # get views and atom counts
2536  chain_1 = ent_1.Select('cname="%s"' % c1)
2537  chain_2 = ent_2.Select('cname="%s"' % c2)
2538  atom_count = chain_1.atom_count
2539  if atom_count != chain_2.atom_count:
2540  raise RuntimeError('Chains in _GetMappedRMSD must be perfectly aligned!')
2541  # get RMSD
2542  rmsd = mol.alg.CalculateRMSD(chain_1, chain_2, transformation)
2543  # update lists
2544  rmsds.append(rmsd)
2545  atoms.append(float(atom_count))
2546  # combine (reminder: RMSD = sqrt(sum(atom_distance^2)/atom_count))
2547  rmsd = np.sqrt( sum([a * r**2 for a, r in zip(atoms, rmsds)]) / sum(atoms) )
2548  return rmsd
2549 
2551  """Helper object for repetitive RMSD calculations.
2552  Meant to speed up :func:`_GetChainMapping` but could also be used to replace
2553  :func:`_GetMappedRMSD` in :func:`_CheckClosedSymmetry`.
2554 
2555  :param ent_1: See :attr:`QSscorer.ent_to_cm_1`
2556  :param ent_2: See :attr:`QSscorer.ent_to_cm_2`
2557  """
2558  def __init__(self, ent_1, ent_2):
2559  # initialize caches and keep entities
2560  self.ent_1 = ent_1
2561  self.ent_2 = ent_2
2562  self._chain_views_1 = dict()
2563  self._chain_views_2 = dict()
2564  self._pair_rmsd = dict() # key = (c1,c2), value = (atom_count,rmsd)
2565 
2566  def GetChainView1(self, cname):
2567  """Get cached view on chain *cname* for :attr:`ent_1`."""
2568  if cname not in self._chain_views_1:
2569  self._chain_views_1[cname] = self.ent_1.Select('cname="%s"' % cname)
2570  return self._chain_views_1[cname]
2571 
2572  def GetChainView2(self, cname):
2573  """Get cached view on chain *cname* for :attr:`ent_2`."""
2574  if cname not in self._chain_views_2:
2575  self._chain_views_2[cname] = self.ent_2.Select('cname="%s"' % cname)
2576  return self._chain_views_2[cname]
2577 
2578  def GetSuperposition(self, c1, c2):
2579  """Get superposition result (no change in entities!) for *c1* to *c2*.
2580  This invalidates cached RMSD results used in :func:`GetMappedRMSD`.
2581 
2582  :param c1: chain name for :attr:`ent_1`.
2583  :param c2: chain name for :attr:`ent_2`.
2584  """
2585  # invalidate _pair_rmsd
2586  self._pair_rmsd = dict()
2587  # superpose
2588  chain_1 = self.GetChainView1(c1)
2589  chain_2 = self.GetChainView2(c2)
2590  if chain_1.atom_count != chain_2.atom_count:
2591  raise RuntimeError('Chains in GetSuperposition not perfectly aligned!')
2592  return mol.alg.SuperposeSVD(chain_1, chain_2, apply_transform=False)
2593 
2594  def GetMappedRMSD(self, chain_mapping, transformation):
2595  """
2596  :return: RMSD between complexes considering chain mapping.
2597  :param chain_mapping: See :attr:`QSscorer.chain_mapping`.
2598  :param transformation: Superposition transformation (e.g. res.transformation
2599  for res = :func:`GetSuperposition`).
2600  """
2601  # collect RMSDs and atom counts chain by chain and combine afterwards
2602  rmsds = []
2603  atoms = []
2604  for c1, c2 in chain_mapping.items():
2605  # cached?
2606  if (c1, c2) in self._pair_rmsd:
2607  atom_count, rmsd = self._pair_rmsd[(c1, c2)]
2608  else:
2609  # get views and atom counts
2610  chain_1 = self.GetChainView1(c1)
2611  chain_2 = self.GetChainView2(c2)
2612  atom_count = chain_1.atom_count
2613  if atom_count != chain_2.atom_count:
2614  raise RuntimeError('Chains in GetMappedRMSD not perfectly aligned!')
2615  # get RMSD
2616  rmsd = mol.alg.CalculateRMSD(chain_1, chain_2, transformation)
2617  self._pair_rmsd[(c1, c2)] = (atom_count, rmsd)
2618  # update lists
2619  rmsds.append(rmsd)
2620  atoms.append(float(atom_count))
2621  # combine (reminder: RMSD = sqrt(sum(atom_distance^2)/atom_count))
2622  rmsd = np.sqrt( sum([a * r**2 for a, r in zip(atoms, rmsds)]) / sum(atoms) )
2623  return rmsd
2624 
2625 
2626 def _CleanUserSymmetry(symm, ent):
2627  """Clean-up user provided symmetry.
2628 
2629  :param symm: Symmetry group as in :attr:`QSscorer.symm_1`
2630  :param ent: Entity from which to extract chain names
2631  :type ent: :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
2632  :return: New symmetry group limited to chains in sub-structure *ent*
2633  (see :attr:`QSscorer.symm_1`). Empty list if invalid symmetry.
2634  """
2635  # restrict symm to only contain chains in ent
2636  chain_names = set([ch.name for ch in ent.chains])
2637  new_symm = list()
2638  for symm_group in symm:
2639  new_group = tuple(ch for ch in symm_group if ch in chain_names)
2640  if new_group:
2641  new_symm.append(new_group)
2642  # report it
2643  if new_symm != symm:
2644  LogInfo("Cleaned user symmetry for %s: %s" % (ent.GetName(), new_symm))
2645  # do all groups have same length?
2646  lengths = [len(symm_group) for symm_group in new_symm]
2647  if lengths[1:] != lengths[:-1]:
2648  LogWarning('User symmetry group of different sizes for %s. Ignoring it!' \
2649  % (ent.GetName()))
2650  return []
2651  # do we cover all chains?
2652  s_chain_names = set([ch for symm_group in new_symm for ch in symm_group])
2653  if len(s_chain_names) != sum(lengths):
2654  LogWarning('User symmetry group for %s has duplicate chains. Ignoring it!' \
2655  % (ent.GetName()))
2656  return []
2657  if s_chain_names != chain_names:
2658  LogWarning('User symmetry group for %s is missing chains. Ignoring it!' \
2659  % (ent.GetName()))
2660  return []
2661  # ok all good
2662  return new_symm
2663 
2664 def _AreValidSymmetries(symm_1, symm_2):
2665  """Check symmetry pair for major problems.
2666 
2667  :return: False if any of the two is empty or if they're compatible in size.
2668  :rtype: :class:`bool`
2669  """
2670  if not symm_1 or not symm_2:
2671  return False
2672  if len(symm_1) != 1 or len(symm_2) != 1:
2673  if not len(symm_1[0]) == len(symm_2[0]):
2674  LogWarning('Symmetry groups of different sizes. Ignoring them!')
2675  return False
2676  return True
2677 
2678 def _GetMappedAlignments(ent_1, ent_2, chain_mapping, res_num_alignment):
2679  """
2680  :return: Alignments of 2 structures given chain mapping
2681  (see :attr:`QSscorer.alignments`).
2682  :param ent_1: Entity containing all chains in *chain_mapping.keys()*.
2683  Views to this entity attached to first sequence of each aln.
2684  :param ent_2: Entity containing all chains in *chain_mapping.values()*.
2685  Views to this entity attached to second sequence of each aln.
2686  :param chain_mapping: See :attr:`QSscorer.chain_mapping`
2687  :param res_num_alignment: See :attr:`QSscorer.res_num_alignment`
2688  """
2689  alns = list()
2690  for ch_1_name in sorted(chain_mapping):
2691  # get both sequences incl. attached view
2692  ch_1 = ent_1.FindChain(ch_1_name)
2693  ch_2 = ent_2.FindChain(chain_mapping[ch_1_name])
2694  if res_num_alignment:
2695  max_res_num = max([r.number.GetNum() for r in ch_1.residues] +
2696  [r.number.GetNum() for r in ch_2.residues])
2697  ch1_aln = ["-"] * max_res_num
2698  ch2_aln = ["-"] * max_res_num
2699  for res in ch_1.residues:
2700  ch1_aln[res.number.GetNum() - 1] = res.GetOneLetterCode()
2701  ch1_aln = "".join(ch1_aln)
2702  seq_1 = seq.CreateSequence(ch_1.name, str(ch1_aln))
2703  seq_1.AttachView(ch_1.Select(""))
2704  for res in ch_2.residues:
2705  ch2_aln[res.number.GetNum() - 1] = res.GetOneLetterCode()
2706  ch2_aln = "".join(ch2_aln)
2707  seq_2 = seq.CreateSequence(ch_2.name, str(ch2_aln))
2708  seq_2.AttachView(ch_2.Select(""))
2709  # Create alignment
2710  aln = seq.CreateAlignment()
2711  aln.AddSequence(seq_1)
2712  aln.AddSequence(seq_2)
2713  else:
2714  seq_1 = seq.SequenceFromChain(ch_1.name, ch_1)
2715  seq_2 = seq.SequenceFromChain(ch_2.name, ch_2)
2716  # align them
2717  aln = _AlignAtomSeqs(seq_1, seq_2)
2718  if aln:
2719  alns.append(aln)
2720  return alns
2721 
2722 def _GetMappedResidues(alns):
2723  """
2724  :return: Mapping of shared residues in *alns* (with views attached)
2725  (see :attr:`QSscorer.mapped_residues`).
2726  :param alns: See :attr:`QSscorer.alignments`
2727  """
2728  mapped_residues = dict()
2729  for aln in alns:
2730  # prepare dict for c1
2731  c1 = aln.GetSequence(0).name
2732  mapped_residues[c1] = dict()
2733  # get shared residues
2734  v1, v2 = seq.ViewsFromAlignment(aln)
2735  for res_1, res_2 in zip(v1.residues, v2.residues):
2736  r1 = res_1.number.num
2737  r2 = res_2.number.num
2738  mapped_residues[c1][r1] = r2
2739 
2740  return mapped_residues
2741 
2742 def _GetExtraWeights(contacts, done, mapped_residues):
2743  """Return sum of extra weights for contacts of chains in set and not in done.
2744  :return: Tuple (weight_extra_mapped, weight_extra_all).
2745  weight_extra_mapped only sums if both cX,rX in mapped_residues
2746  weight_extra_all sums all
2747  :param contacts: See :func:`GetContacts` for first entity
2748  :param done: List of (c1, c2, r1, r2) tuples to ignore
2749  :param mapped_residues: See :func:`_GetMappedResidues`
2750  """
2751  weight_extra_mapped = 0
2752  weight_extra_non_mapped = 0
2753  for c1 in contacts:
2754  for c2 in contacts[c1]:
2755  for r1 in contacts[c1][c2]:
2756  for r2 in contacts[c1][c2][r1]:
2757  if (c1, c2, r1, r2) not in done:
2758  weight = _weight(contacts[c1][c2][r1][r2])
2759  if c1 in mapped_residues and r1 in mapped_residues[c1] \
2760  and c2 in mapped_residues and r2 in mapped_residues[c2]:
2761  weight_extra_mapped += weight
2762  else:
2763  weight_extra_non_mapped += weight
2764  return weight_extra_mapped, weight_extra_mapped + weight_extra_non_mapped
2765 
2766 def _GetScores(contacts_1, contacts_2, mapped_residues, chain_mapping):
2767  """Get QS scores (see :class:`QSscorer`).
2768 
2769  Note that if some chains are not to be considered at all, they must be removed
2770  from *contacts_1* / *contacts_2* prior to calling this.
2771 
2772  :param contacts_1: See :func:`GetContacts` for first entity
2773  :param contacts_2: See :func:`GetContacts` for second entity
2774  :param mapped_residues: See :func:`_GetMappedResidues`
2775  :param chain_mapping: Maps any chain name in *mapped_residues* to chain name
2776  for *contacts_2*.
2777  :type chain_mapping: :class:`dict` (:class:`str` / :class:`str`)
2778  :return: Tuple (QS_best, QS_global) (see :attr:`QSscorer.best_score` and
2779  see :attr:`QSscorer.global_score`)
2780  """
2781  # keep track of considered contacts as set of (c1,c2,r1,r2) tuples
2782  done_1 = set()
2783  done_2 = set()
2784  weighted_scores = 0
2785  weight_sum = 0
2786  # naming cXY, rXY: X = 1/2 for contact, Y = 1/2/T for mapping (T = tmp)
2787  # -> d1 = contacts_1[c11][c21][r11][r21], d2 = contacts_2[c12][c22][r12][r22]
2788  for c11 in contacts_1:
2789  # map to other one
2790  if c11 not in mapped_residues: continue
2791  c1T = chain_mapping[c11]
2792  # second chain
2793  for c21 in contacts_1[c11]:
2794  # map to other one
2795  if c21 not in mapped_residues: continue
2796  c2T = chain_mapping[c21]
2797  # flip if needed
2798  flipped_chains = (c1T > c2T)
2799  if flipped_chains:
2800  c12, c22 = c2T, c1T
2801  else:
2802  c12, c22 = c1T, c2T
2803  # skip early if no contacts there
2804  if c12 not in contacts_2 or c22 not in contacts_2[c12]: continue
2805  # loop over res.num. in c11
2806  for r11 in contacts_1[c11][c21]:
2807  # map to other one and skip if no contacts there
2808  if r11 not in mapped_residues[c11]: continue
2809  r1T = mapped_residues[c11][r11]
2810  # loop over res.num. in c21
2811  for r21 in contacts_1[c11][c21][r11]:
2812  # map to other one and skip if no contacts there
2813  if r21 not in mapped_residues[c21]: continue
2814  r2T = mapped_residues[c21][r21]
2815  # flip if needed
2816  if flipped_chains:
2817  r12, r22 = r2T, r1T
2818  else:
2819  r12, r22 = r1T, r2T
2820  # skip early if no contacts there
2821  if r12 not in contacts_2[c12][c22]: continue
2822  if r22 not in contacts_2[c12][c22][r12]: continue
2823  # ok now we can finally do the scoring
2824  d1 = contacts_1[c11][c21][r11][r21]
2825  d2 = contacts_2[c12][c22][r12][r22]
2826  weight = _weight(min(d1, d2))
2827  weighted_scores += weight * (1 - abs(d1 - d2) / 12)
2828  weight_sum += weight
2829  # keep track of done ones
2830  done_1.add((c11, c21, r11, r21))
2831  done_2.add((c12, c22, r12, r22))
2832 
2833  LogVerbose("Shared contacts: %d" % len(done_1))
2834 
2835  # add the extra weights
2836  weights_extra_1 = _GetExtraWeights(contacts_1, done_1, mapped_residues)
2837  mapped_residues_2 = dict()
2838  for c1 in mapped_residues:
2839  c2 = chain_mapping[c1]
2840  mapped_residues_2[c2] = set()
2841  for r1 in mapped_residues[c1]:
2842  mapped_residues_2[c2].add(mapped_residues[c1][r1])
2843  weights_extra_2 = _GetExtraWeights(contacts_2, done_2, mapped_residues_2)
2844  weight_extra_mapped = weights_extra_1[0] + weights_extra_2[0]
2845  weight_extra_all = weights_extra_1[1] + weights_extra_2[1]
2846 
2847  # get scores
2848  denom_best = weight_sum + weight_extra_mapped
2849  denom_all = weight_sum + weight_extra_all
2850  if denom_best == 0:
2851  QS_best = 0
2852  else:
2853  QS_best = weighted_scores / denom_best
2854  if denom_all == 0:
2855  QS_global = 0
2856  else:
2857  QS_global = weighted_scores / denom_all
2858  return QS_best, QS_global
2859 
2860 def _weight(dist):
2861  """
2862  This weight expresses the probability of two residues to interact given the CB
2863  distance (from Xu et al. http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2573399/)
2864  """
2865  if dist <= 5.0:
2866  return 1
2867  x = (dist-5.0)/4.28;
2868  return np.exp(-2*x*x)
2869 
2870 
2871 def _GetQsSuperposition(alns):
2872  """
2873  :return: Superposition result based on shared CA atoms in *alns*
2874  (with views attached) (see :attr:`QSscorer.superposition`).
2875  :param alns: See :attr:`QSscorer.alignments`
2876  """
2877  # check input
2878  if not alns:
2879  raise QSscoreError('No successful alignments for superposition!')
2880  # prepare views
2881  view_1 = alns[0].GetSequence(0).attached_view.CreateEmptyView()
2882  view_2 = alns[0].GetSequence(1).attached_view.CreateEmptyView()
2883  # collect CA from alignments in proper order
2884  for aln in alns:
2885  for col in aln:
2886  res_1 = col.GetResidue(0)
2887  res_2 = col.GetResidue(1)
2888  if res_1.IsValid() and res_2.IsValid():
2889  ca_1 = res_1.FindAtom('CA')
2890  ca_2 = res_2.FindAtom('CA')
2891  if ca_1.IsValid() and ca_2.IsValid():
2892  view_1.AddAtom(ca_1)
2893  view_2.AddAtom(ca_2)
2894  # superpose them without chainging entities
2895  res = mol.alg.SuperposeSVD(view_1, view_2, apply_transform=False)
2896  return res
2897 
2898 
2899 def _AddResidue(edi, res, rnum, chain, calpha_only):
2900  """
2901  Add residue *res* with res. num. *run* to given *chain* using editor *edi*.
2902  Either all atoms added or (if *calpha_only*) only CA.
2903  """
2904  if calpha_only:
2905  ca_atom = res.FindAtom('CA')
2906  if ca_atom.IsValid():
2907  new_res = edi.AppendResidue(chain, res.name, rnum)
2908  edi.InsertAtom(new_res, ca_atom.name, ca_atom.pos)
2909  else:
2910  new_res = edi.AppendResidue(chain, res.name, rnum)
2911  for atom in res.atoms:
2912  edi.InsertAtom(new_res, atom.name, atom.pos)
2913 
2914 def _MergeAlignedChains(alns, ent_1, ent_2, calpha_only, penalize_extra_chains):
2915  """
2916  Create two new entities (based on the alignments attached views) where all
2917  residues have same numbering (when they're aligned) and they are all pushed to
2918  a single chain X. Also append extra chains contained in *ent_1* and *ent_2*
2919  but not contained in *alns*.
2920 
2921  Used for :attr:`QSscorer.lddt_ref` and :attr:`QSscorer.lddt_mdl`
2922 
2923  :param alns: List of alignments with attached views (first sequence: *ent_1*,
2924  second: *ent_2*). Residue number in single chain is column index
2925  of current alignment + sum of lengths of all previous alignments
2926  (order of alignments as in input list).
2927  :type alns: See :attr:`QSscorer.alignments`
2928  :param ent_1: First entity to process.
2929  :type ent_1: :class:`~ost.mol.EntityHandle`
2930  :param ent_2: Second entity to process.
2931  :type ent_2: :class:`~ost.mol.EntityHandle`
2932  :param calpha_only: If True, we only include CA atoms instead of all.
2933  :type calpha_only: :class:`bool`
2934  :param penalize_extra_chains: If True, extra chains are added to model and
2935  reference. Otherwise, only mapped ones.
2936  :type penalize_extra_chains: :class:`bool`
2937 
2938  :return: Tuple of two single chain entities (from *ent_1* and from *ent_2*)
2939  :rtype: :class:`tuple` of :class:`~ost.mol.EntityHandle`
2940  """
2941  # first new entity
2942  ent_ren_1 = mol.CreateEntity()
2943  ed_1 = ent_ren_1.EditXCS()
2944  new_chain_1 = ed_1.InsertChain('X')
2945  # second one
2946  ent_ren_2 = mol.CreateEntity()
2947  ed_2 = ent_ren_2.EditXCS()
2948  new_chain_2 = ed_2.InsertChain('X')
2949  # the alignment already contains sorted chains
2950  rnum = 0
2951  chain_done_1 = set()
2952  chain_done_2 = set()
2953  for aln in alns:
2954  chain_done_1.add(aln.GetSequence(0).name)
2955  chain_done_2.add(aln.GetSequence(1).name)
2956  for col in aln:
2957  rnum += 1
2958  # add valid residues to single chain entities
2959  res_1 = col.GetResidue(0)
2960  if res_1.IsValid():
2961  _AddResidue(ed_1, res_1, rnum, new_chain_1, calpha_only)
2962  res_2 = col.GetResidue(1)
2963  if res_2.IsValid():
2964  _AddResidue(ed_2, res_2, rnum, new_chain_2, calpha_only)
2965  # extra chains?
2966  if penalize_extra_chains:
2967  for chain in ent_1.chains:
2968  if chain.name in chain_done_1:
2969  continue
2970  for res in chain.residues:
2971  rnum += 1
2972  _AddResidue(ed_1, res, rnum, new_chain_1, calpha_only)
2973  for chain in ent_2.chains:
2974  if chain.name in chain_done_2:
2975  continue
2976  for res in chain.residues:
2977  rnum += 1
2978  _AddResidue(ed_2, res, rnum, new_chain_2, calpha_only)
2979  # get entity names
2980  ent_ren_1.SetName(aln.GetSequence(0).GetAttachedView().GetName())
2981  ent_ren_2.SetName(aln.GetSequence(1).GetAttachedView().GetName())
2982  # connect atoms
2983  if not conop.GetDefaultLib():
2984  raise RuntimeError("QSscore computation requires a compound library!")
2985  pr = conop.RuleBasedProcessor(conop.GetDefaultLib())
2986  pr.Process(ent_ren_1)
2987  ed_1.UpdateICS()
2988  pr.Process(ent_ren_2)
2989  ed_2.UpdateICS()
2990  return ent_ren_1, ent_ren_2
2991 
2992 
2993 # specify public interface
2994 __all__ = ('QSscoreError', 'QSscorer', 'QSscoreEntity', 'FilterContacts',
2995  'GetContacts', 'OligoLDDTScorer', 'MappedLDDTScorer')
def _ComputeAlignedEntities
Class internal helpers (anything that doesnt easily work without this class)
Definition: qsscoring.py:579
def _GetSuperposeData
Class internal helpers (anything that doesnt easily work without this class)
Definition: qsscoring.py:853
Mat4 DLLEXPORT_OST_GEOM Invert(const Mat4 &m)
def _InitScorer
Class internal helpers (anything that doesnt easily work without this class)
Definition: qsscoring.py:1441
Real Distance(const Vec3 &p1, const Vec3 &p2)
returns the distance between two points
Definition: vecmat3_op.hh:199
Three dimensional vector class, using Real precision.
Definition: vec3.hh:43
def _PrepareOligoEntities
Class internal helpers.
Definition: qsscoring.py:1243