OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
qsscoring.py
Go to the documentation of this file.
1 """
2 Scoring of quaternary structures (QS). The QS scoring is according to the paper
3 by `Bertoni et al. <https://dx.doi.org/10.1038/s41598-017-09654-8>`_.
4 
5 .. note ::
6 
7  Requirements for use:
8 
9  - A default :class:`compound library <ost.conop.CompoundLib>` must be defined
10  and accessible via :func:`~ost.conop.GetDefaultLib`. This is set by default
11  when executing scripts with ``ost``. Otherwise, you must set this with
12  :func:`~ost.conop.SetDefaultLib`.
13  - ClustalW must be installed (unless you provide chain mappings)
14  - Python modules `numpy` and `scipy` must be installed and available
15  (e.g. use ``pip install scipy numpy``)
16 """
17 # Original authors: Gerardo Tauriello, Martino Bertoni
18 
19 from ost import mol, geom, conop, seq, settings, PushVerbosityLevel
20 from ost import LogError, LogWarning, LogScript, LogInfo, LogVerbose, LogDebug
21 from ost.bindings.clustalw import ClustalW
22 from ost.mol.alg import lDDTScorer
23 from ost.seq.alg.renumber import Renumber
24 import numpy as np
25 from scipy.misc import factorial
26 from scipy.special import binom
27 from scipy.cluster.hierarchy import fclusterdata
28 import itertools
29 
30 ###############################################################################
31 # QS scoring
32 ###############################################################################
33 
34 class QSscoreError(Exception):
35  """Exception to be raised for "acceptable" exceptions in QS scoring.
36 
37  Those are cases we might want to capture for default behavior.
38  """
39  pass
40 
41 class QSscorer:
42  """Object to compute QS scores.
43 
44  Simple usage without any precomputed contacts, symmetries and mappings:
45 
46  .. code-block:: python
47 
48  import ost
49  from ost.mol.alg import qsscoring
50 
51  # load two biounits to compare
52  ent_full = ost.io.LoadPDB('3ia3', remote=True)
53  ent_1 = ent_full.Select('cname=A,D')
54  ent_2 = ent_full.Select('cname=B,C')
55  # get score
56  ost.PushVerbosityLevel(3)
57  try:
58  qs_scorer = qsscoring.QSscorer(ent_1, ent_2)
59  ost.LogScript('QSscore:', str(qs_scorer.global_score))
60  ost.LogScript('Chain mapping used:', str(qs_scorer.chain_mapping))
61  # commonly you want the QS global score as output
62  qs_score = qs_scorer.global_score
63  except qsscoring.QSscoreError as ex:
64  # default handling: report failure and set score to 0
65  ost.LogError('QSscore failed:', str(ex))
66  qs_score = 0
67 
68  For maximal performance when computing QS scores of the same entity with many
69  others, it is advisable to construct and reuse :class:`QSscoreEntity` objects.
70 
71  Any known / precomputed information can be filled into the appropriate
72  attribute here (no checks done!). Otherwise most quantities are computed on
73  first access and cached (lazy evaluation). Setters are provided to set values
74  with extra checks (e.g. :func:`SetSymmetries`).
75 
76  All necessary seq. alignments are done by global BLOSUM62-based alignment. A
77  multiple sequence alignment is performed with ClustalW unless
78  :attr:`chain_mapping` is provided manually. You will need to have an
79  executable ``clustalw`` or ``clustalw2`` in your ``PATH`` or you must set
80  :attr:`clustalw_bin` accordingly. Otherwise an exception
81  (:class:`ost.settings.FileNotFound`) is thrown.
82 
83  Formulas for QS scores:
84 
85  ::
86 
87  - QS_best = weighted_scores / (weight_sum + weight_extra_mapped)
88  - QS_global = weighted_scores / (weight_sum + weight_extra_all)
89  -> weighted_scores = sum(w(min(d1,d2)) * (1 - abs(d1-d2)/12)) for shared
90  -> weight_sum = sum(w(min(d1,d2))) for shared
91  -> weight_extra_mapped = sum(w(d)) for all mapped but non-shared
92  -> weight_extra_all = sum(w(d)) for all non-shared
93  -> w(d) = 1 if d <= 5, exp(-2 * ((d-5.0)/4.28)^2) else
94 
95  In the formulas above:
96 
97  * "d": CA/CB-CA/CB distance of an "inter-chain contact" ("d1", "d2" for
98  "shared" contacts).
99  * "mapped": we could map chains of two structures and align residues in
100  :attr:`alignments`.
101  * "shared": pairs of residues which are "mapped" and have
102  "inter-chain contact" in both structures.
103  * "inter-chain contact": CB-CB pairs (CA for GLY) with distance <= 12 A
104  (fallback to CA-CA if :attr:`calpha_only` is True).
105  * "w(d)": weighting function (prob. of 2 res. to interact given CB distance)
106  from `Xu et al. 2009 <https://dx.doi.org/10.1016%2Fj.jmb.2008.06.002>`_.
107 
108  :param ent_1: First structure to be scored.
109  :type ent_1: :class:`QSscoreEntity`, :class:`~ost.mol.EntityHandle` or
110  :class:`~ost.mol.EntityView`
111  :param ent_2: Second structure to be scored.
112  :type ent_2: :class:`QSscoreEntity`, :class:`~ost.mol.EntityHandle` or
113  :class:`~ost.mol.EntityView`
114  :param res_num_alignment: Sets :attr:`res_num_alignment`
115 
116  :raises: :class:`QSscoreError` if input structures are invalid or are monomers
117  or have issues that make it impossible for a QS score to be computed.
118 
119  .. attribute:: qs_ent_1
120 
121  :class:`QSscoreEntity` object for *ent_1* given at construction.
122  If entity names (:attr:`~QSscoreEntity.original_name`) are not unique, we
123  set it to 'pdb_1' using :func:`~QSscoreEntity.SetName`.
124 
125  .. attribute:: qs_ent_2
126 
127  :class:`QSscoreEntity` object for *ent_2* given at construction.
128  If entity names (:attr:`~QSscoreEntity.original_name`) are not unique, we
129  set it to 'pdb_2' using :func:`~QSscoreEntity.SetName`.
130 
131  .. attribute:: calpha_only
132 
133  True if any of the two structures is CA-only (after cleanup).
134 
135  :type: :class:`bool`
136 
137  .. attribute:: max_ca_per_chain_for_cm
138 
139  Maximal number of CA atoms to use in each chain to determine chain mappings.
140  Setting this to -1 disables the limit. Limiting it speeds up determination
141  of symmetries and chain mappings. By default it is set to 100.
142 
143  :type: :class:`int`
144 
145  .. attribute:: max_mappings_extensive
146 
147  Maximal number of chain mappings to test for 'extensive'
148  :attr:`chain_mapping_scheme`. The extensive chain mapping search must in the
149  worst case check O(N^2) * O(N!) possible mappings for complexes with N
150  chains. Two octamers without symmetry would require 322560 mappings to be
151  checked. To limit computations, a :class:`QSscoreError` is thrown if we try
152  more than the maximal number of chain mappings.
153  The value must be set before the first use of :attr:`chain_mapping`.
154  By default it is set to 100000.
155 
156  :type: :class:`int`
157 
158  .. attribute:: res_num_alignment
159 
160  Forces each alignment in :attr:`alignments` to be based on residue numbers
161  instead of using a global BLOSUM62-based alignment.
162 
163  :type: :class:`bool`
164  """
165  def __init__(self, ent_1, ent_2, res_num_alignment=False):
166  # generate QSscoreEntity objects?
167  if isinstance(ent_1, QSscoreEntity):
168  self.qs_ent_1 = ent_1
169  else:
170  self.qs_ent_1 = QSscoreEntity(ent_1)
171  if isinstance(ent_2, QSscoreEntity):
172  self.qs_ent_2 = ent_2
173  else:
174  self.qs_ent_2 = QSscoreEntity(ent_2)
175  # check validity of inputs
176  if not self.qs_ent_1.is_valid or not self.qs_ent_2.is_valid:
177  raise QSscoreError("Invalid input in QSscorer!")
178  # set names for structures
179  if self.qs_ent_1.original_name == self.qs_ent_2.original_name:
180  self.qs_ent_1.SetName('pdb_1')
181  self.qs_ent_2.SetName('pdb_2')
182  else:
183  self.qs_ent_1.SetName(self.qs_ent_1.original_name)
184  self.qs_ent_2.SetName(self.qs_ent_2.original_name)
185  # set other public attributes
186  self.res_num_alignment = res_num_alignment
187  self.calpha_only = self.qs_ent_1.calpha_only or self.qs_ent_2.calpha_only
190  # init cached stuff
191  self._chem_mapping = None
192  self._ent_to_cm_1 = None
193  self._ent_to_cm_2 = None
194  self._symm_1 = None
195  self._symm_2 = None
196  self._chain_mapping = None
197  self._chain_mapping_scheme = None
198  self._alignments = None
199  self._mapped_residues = None
200  self._global_score = None
201  self._best_score = None
202  self._superposition = None
203  self._clustalw_bin = None
204 
205  @property
206  def chem_mapping(self):
207  """Inter-complex mapping of chemical groups.
208 
209  Each group (see :attr:`QSscoreEntity.chem_groups`) is mapped according to
210  highest sequence identity. Alignment is between longest sequences in groups.
211 
212  Limitations:
213 
214  - If different numbers of groups, we map only the groups for the complex
215  with less groups (rest considered unmapped and shown as warning)
216  - The mapping is forced: the "best" mapping will be chosen independently of
217  how low the seq. identity may be
218 
219  :getter: Computed on first use (cached)
220  :type: :class:`dict` with key = :class:`tuple` of chain names in
221  :attr:`qs_ent_1` and value = :class:`tuple` of chain names in
222  :attr:`qs_ent_2`.
223 
224  :raises: :class:`QSscoreError` if we end up having no chains for either
225  entity in the mapping (can happen if chains do not have CA atoms).
226  """
227  if self._chem_mapping is None:
228  self._chem_mapping = _GetChemGroupsMapping(self.qs_ent_1, self.qs_ent_2)
229  return self._chem_mapping
230 
231  @property
232  def ent_to_cm_1(self):
233  """Subset of :attr:`qs_ent_1` used to compute chain mapping and symmetries.
234 
235  Properties:
236 
237  - Includes only residues aligned according to :attr:`chem_mapping`
238  - Includes only 1 CA atom per residue
239  - Has at least 5 and at most :attr:`max_ca_per_chain_for_cm` atoms per chain
240  - All chains of the same chemical group have the same number of atoms
241  (also in :attr:`ent_to_cm_2` according to :attr:`chem_mapping`)
242  - All chains appearing in :attr:`chem_mapping` appear in this entity
243  (so the two can be safely used together)
244 
245  This entity might be transformed (i.e. all positions rotated/translated by
246  same transformation matrix) if this can speed up computations. So do not
247  assume fixed global positions (but relative distances will remain fixed).
248 
249  :getter: Computed on first use (cached)
250  :type: :class:`~ost.mol.EntityHandle`
251 
252  :raises: :class:`QSscoreError` if any chain ends up having less than 5 res.
253  """
254  if self._ent_to_cm_1 is None:
256  return self._ent_to_cm_1
257 
258  @property
259  def ent_to_cm_2(self):
260  """Subset of :attr:`qs_ent_1` used to compute chain mapping and symmetries
261  (see :attr:`ent_to_cm_1` for details).
262  """
263  if self._ent_to_cm_2 is None:
265  return self._ent_to_cm_2
266 
267  @property
268  def symm_1(self):
269  """Symmetry groups for :attr:`qs_ent_1` used to speed up chain mapping.
270 
271  This is a list of chain-lists where each chain-list can be used reconstruct
272  the others via cyclic C or dihedral D symmetry. The first chain-list is used
273  as a representative symmetry group. For heteromers, the group-members must
274  contain all different seqres in oligomer.
275 
276  Example: symm. groups [(A,B,C), (D,E,F), (G,H,I)] means that there are
277  symmetry transformations to get (D,E,F) and (G,H,I) from (A,B,C).
278 
279  Properties:
280 
281  - All symmetry group tuples have the same length (num. of chains)
282  - All chains in :attr:`ent_to_cm_1` appear (w/o duplicates)
283  - For heteros: symmetry group tuples have all different chem. groups
284  - Trivial symmetry group = one tuple with all chains (used if inconsistent
285  data provided or if no symmetry is found)
286  - Either compatible to :attr:`symm_2` or trivial symmetry groups used.
287  Compatibility requires same lengths of symmetry group tuples and it must
288  be possible to get an overlap (80% of residues covered within 6 A of a
289  (chem. mapped) chain) of all chains in representative symmetry groups by
290  superposing one pair of chains.
291 
292  :getter: Computed on first use (cached)
293  :type: :class:`list` of :class:`tuple` of :class:`str` (chain names)
294  """
295  if self._symm_1 is None:
296  self._ComputeSymmetry()
297  return self._symm_1
298 
299  @property
300  def symm_2(self):
301  """Symmetry groups for :attr:`qs_ent_2` (see :attr:`symm_1` for details)."""
302  if self._symm_2 is None:
303  self._ComputeSymmetry()
304  return self._symm_2
305 
306  def SetSymmetries(self, symm_1, symm_2):
307  """Set user-provided symmetry groups.
308 
309  These groups are restricted to chain names appearing in :attr:`ent_to_cm_1`
310  and :attr:`ent_to_cm_2` respectively. They are only valid if they cover all
311  chains and both *symm_1* and *symm_2* have same lengths of symmetry group
312  tuples. Otherwise trivial symmetry group used (see :attr:`symm_1`).
313 
314  :param symm_1: Value to set for :attr:`symm_1`.
315  :param symm_2: Value to set for :attr:`symm_2`.
316  """
317  # restrict chain names
318  self._symm_1 = _CleanUserSymmetry(symm_1, self.ent_to_cm_1)
319  self._symm_2 = _CleanUserSymmetry(symm_2, self.ent_to_cm_2)
320  # check that we have reasonable symmetries set (fallback: all chains)
321  if not _AreValidSymmetries(self._symm_1, self._symm_2):
322  self._symm_1 = [tuple(ch.name for ch in self.ent_to_cm_1.chains)]
323  self._symm_2 = [tuple(ch.name for ch in self.ent_to_cm_2.chains)]
324 
325  @property
326  def chain_mapping(self):
327  """Mapping from :attr:`ent_to_cm_1` to :attr:`ent_to_cm_2`.
328 
329  Properties:
330 
331  - Mapping is between chains of same chem. group (see :attr:`chem_mapping`)
332  - Each chain can appear only once in mapping
333  - All chains of complex with less chains are mapped
334  - Symmetry (:attr:`symm_1`, :attr:`symm_2`) is taken into account
335 
336  Details on algorithms used to find mapping:
337 
338  - We try all pairs of chem. mapped chains within symmetry group and get
339  superpose-transformation for them
340  - First option: check for "sufficient overlap" of other chain-pairs
341 
342  - For each chain-pair defined above: apply superposition to full oligomer
343  and map chains based on structural overlap
344  - Structural overlap = X% of residues in second oligomer covered within Y
345  Angstrom of a (chem. mapped) chain in first oligomer. We successively
346  try (X,Y) = (80,4), (40,6) and (20,8) to be less and less strict in
347  mapping (warning shown for most permissive one).
348  - If multiple possible mappings are found, we choose the one which leads
349  to the lowest multi-chain-RMSD given the superposition
350 
351  - Fallback option: try all mappings to find minimal multi-chain-RMSD
352  (warning shown)
353 
354  - For each chain-pair defined above: apply superposition, try all (!)
355  possible chain mappings (within symmetry group) and keep mapping with
356  lowest multi-chain-RMSD
357  - Repeat procedure above to resolve symmetry. Within the symmetry group we
358  can use the chain mapping computed before and we just need to find which
359  symmetry group in first oligomer maps to which in the second one. We
360  again try all possible combinations...
361  - Limitations:
362 
363  - Trying all possible mappings is a combinatorial nightmare (factorial).
364  We throw an exception if too many combinations (e.g. octomer vs
365  octomer with no usable symmetry)
366  - The mapping is forced: the "best" mapping will be chosen independently
367  of how badly they fit in terms of multi-chain-RMSD
368  - As a result, such a forced mapping can lead to a large range of
369  resulting QS scores. An extreme example was observed between 1on3.1
370  and 3u9r.1, where :attr:`global_score` can range from 0.12 to 0.43
371  for mappings with very similar multi-chain-RMSD.
372 
373  :getter: Computed on first use (cached)
374  :type: :class:`dict` with key / value = :class:`str` (chain names, key
375  for :attr:`ent_to_cm_1`, value for :attr:`ent_to_cm_2`)
376  :raises: :class:`QSscoreError` if there are too many combinations to check
377  to find a chain mapping (see :attr:`max_mappings_extensive`).
378  """
379  if self._chain_mapping is None:
381  _GetChainMapping(self.ent_to_cm_1, self.ent_to_cm_2, self.symm_1,
382  self.symm_2, self.chem_mapping,
384  LogInfo('Mapping found: %s' % str(self._chain_mapping))
385  return self._chain_mapping
386 
387  @property
389  """Mapping scheme used to get :attr:`chain_mapping`.
390 
391  Possible values:
392 
393  - 'strict': 80% overlap needed within 4 Angstrom (overlap based mapping).
394  - 'tolerant': 40% overlap needed within 6 Angstrom (overlap based mapping).
395  - 'permissive': 20% overlap needed within 8 Angstrom (overlap based
396  mapping). It's best if you check mapping manually!
397  - 'extensive': Extensive search used for mapping detection (fallback). This
398  approach has known limitations and may be removed in future versions.
399  Mapping should be checked manually!
400  - 'user': :attr:`chain_mapping` was set by user before first use of this
401  attribute.
402 
403  :getter: Computed with :attr:`chain_mapping` on first use (cached)
404  :type: :class:`str`
405  :raises: :class:`QSscoreError` as in :attr:`chain_mapping`.
406  """
407  if self._chain_mapping_scheme is None:
408  # default: user provided
409  self._chain_mapping_scheme = 'user'
410  # get chain mapping and make sure internal variable is set
411  # -> will not compute and only update _chain_mapping if user provided
412  # -> will compute and overwrite _chain_mapping_scheme else
413  self._chain_mapping = self.chain_mapping
414  return self._chain_mapping_scheme
415 
416  @property
417  def alignments(self):
418  """List of successful sequence alignments using :attr:`chain_mapping`.
419 
420  There will be one alignment for each mapped chain and they are ordered by
421  their chain names in :attr:`qs_ent_1`.
422 
423  The first sequence of each alignment belongs to :attr:`qs_ent_1` and the
424  second one to :attr:`qs_ent_2`. The sequences are named according to the
425  mapped chain names and have views attached into :attr:`QSscoreEntity.ent`
426  of :attr:`qs_ent_1` and :attr:`qs_ent_2`.
427 
428  If :attr:`res_num_alignment` is False, each alignment is performed using a
429  global BLOSUM62-based alignment. Otherwise, the positions in the alignment
430  sequences are simply given by the residue number so that residues with
431  matching numbers are aligned.
432 
433  :getter: Computed on first use (cached)
434  :type: :class:`list` of :class:`~ost.seq.AlignmentHandle`
435  """
436  if self._alignments is None:
437  self._alignments = _GetMappedAlignments(self.qs_ent_1.ent,
438  self.qs_ent_2.ent,
439  self.chain_mapping,
440  self.res_num_alignment)
441  return self._alignments
442 
443  @property
444  def mapped_residues(self):
445  """Mapping of shared residues in :attr:`alignments`.
446 
447  :getter: Computed on first use (cached)
448  :type: :class:`dict` *mapped_residues[c1][r1] = r2* with:
449  *c1* = Chain name in first entity (= first sequence in aln),
450  *r1* = Residue number in first entity,
451  *r2* = Residue number in second entity
452  """
453  if self._mapped_residues is None:
454  self._mapped_residues = _GetMappedResidues(self.alignments)
455  return self._mapped_residues
456 
457  @property
458  def global_score(self):
459  """QS-score with penalties.
460 
461  The range of the score is between 0 (i.e. no interface residues are shared
462  between biounits) and 1 (i.e. the interfaces are identical).
463 
464  The global QS-score is computed applying penalties when interface residues
465  or entire chains are missing (i.e. anything that is not mapped in
466  :attr:`mapped_residues` / :attr:`chain_mapping`) in one of the biounits.
467 
468  :getter: Computed on first use (cached)
469  :type: :class:`float`
470  :raises: :class:`QSscoreError` if only one chain is mapped
471  """
472  if self._global_score is None:
473  self._ComputeScores()
474  return self._global_score
475 
476  @property
477  def best_score(self):
478  """QS-score without penalties.
479 
480  Like :attr:`global_score`, but neglecting additional residues or chains in
481  one of the biounits (i.e. the score is calculated considering only mapped
482  chains and residues).
483 
484  :getter: Computed on first use (cached)
485  :type: :class:`float`
486  :raises: :class:`QSscoreError` if only one chain is mapped
487  """
488  if self._best_score is None:
489  self._ComputeScores()
490  return self._best_score
491 
492  @property
493  def superposition(self):
494  """Superposition result based on shared CA atoms in :attr:`alignments`.
495 
496  The superposition can be used to map :attr:`QSscoreEntity.ent` of
497  :attr:`qs_ent_1` onto the one of :attr:`qs_ent_2`. Use
498  :func:`ost.geom.Invert` if you need the opposite transformation.
499 
500  :getter: Computed on first use (cached)
501  :type: :class:`ost.mol.alg.SuperpositionResult`
502  """
503  if self._superposition is None:
504  self._superposition = _GetQsSuperposition(self.alignments)
505  # report it
506  sup_rmsd = self._superposition.rmsd
507  cmp_view = self._superposition.view1
508  LogInfo('CA RMSD for %s aligned residues on %s chains: %.2f' \
509  % (cmp_view.residue_count, cmp_view.chain_count, sup_rmsd))
510  return self._superposition
511 
512  @property
513  def clustalw_bin(self):
514  """
515  Full path to ``clustalw`` or ``clustalw2`` executable to use for multiple
516  sequence alignments (unless :attr:`chain_mapping` is provided manually).
517 
518  :getter: Located in path on first use (cached)
519  :type: :class:`str`
520  """
521  if self._clustalw_bin is None:
522  self._clustalw_bin = settings.Locate(('clustalw', 'clustalw2'))
523  return self._clustalw_bin
524 
525  def GetOligoLDDTScorer(self, settings, penalize_extra_chains=True):
526  """
527  :return: :class:`OligoLDDTScorer` object, setup for this QS scoring problem.
528  :param settings: Passed to :class:`OligoLDDTScorer` constructor.
529  :param penalize_extra_chains: Passed to :class:`OligoLDDTScorer` constructor.
530  """
531  if penalize_extra_chains:
532  return OligoLDDTScorer(self.qs_ent_1.ent, self.qs_ent_2.ent,
533  self.alignments, self.calpha_only, settings,
534  True, self.chem_mapping)
535  else:
536  return OligoLDDTScorer(self.qs_ent_1.ent, self.qs_ent_2.ent,
537  self.alignments, self.calpha_only, settings, False)
538 
539 
540  ##############################################################################
541  # Class internal helpers (anything that doesnt easily work without this class)
542  ##############################################################################
543 
544  def _ComputeAlignedEntities(self):
545  """Fills cached ent_to_cm_1 and ent_to_cm_2."""
546  # get aligned residues via MSA
547  ev1, ev2 = _GetAlignedResidues(self.qs_ent_1, self.qs_ent_2,
548  self.chem_mapping,
550  self.clustalw_bin)
551  # extract new entities
552  self._ent_to_cm_1 = mol.CreateEntityFromView(ev1, True)
553  self._ent_to_cm_2 = mol.CreateEntityFromView(ev2, True)
554  # name them
555  self._ent_to_cm_1.SetName(self.qs_ent_1.GetName())
556  self._ent_to_cm_2.SetName(self.qs_ent_2.GetName())
557 
558  def _ComputeSymmetry(self):
559  """Fills cached symm_1 and symm_2."""
560  # get them
561  self._symm_1, self._symm_2 = \
562  _FindSymmetry(self.qs_ent_1, self.qs_ent_2, self.ent_to_cm_1,
563  self.ent_to_cm_2, self.chem_mapping)
564  # check that we have reasonable symmetries set (fallback: all chains)
565  if not _AreValidSymmetries(self._symm_1, self._symm_2):
566  self._symm_1 = [tuple(ch.name for ch in self.ent_to_cm_1.chains)]
567  self._symm_2 = [tuple(ch.name for ch in self.ent_to_cm_2.chains)]
568 
569  def _ComputeScores(self):
570  """Fills cached global_score and best_score."""
571  if len(self.chain_mapping) < 2:
572  raise QSscoreError("QS-score is not defined for monomers")
573  # get contacts
574  if self.calpha_only:
575  contacts_1 = self.qs_ent_1.contacts_ca
576  contacts_2 = self.qs_ent_2.contacts_ca
577  else:
578  contacts_1 = self.qs_ent_1.contacts
579  contacts_2 = self.qs_ent_2.contacts
580  # get scores
581  scores = _GetScores(contacts_1, contacts_2, self.mapped_residues,
582  self.chain_mapping)
583  self._best_score = scores[0]
584  self._global_score = scores[1]
585  # report scores
586  LogInfo('QSscore %s, %s: best: %.2f, global: %.2f' \
587  % (self.qs_ent_1.GetName(), self.qs_ent_2.GetName(),
588  self._best_score, self._global_score))
589 
590 
591 ###############################################################################
592 # Entity with cached entries for QS scoring
593 ###############################################################################
594 
595 class QSscoreEntity(object):
596  """Entity with cached entries for QS scoring.
597 
598  Any known / precomputed information can be filled into the appropriate
599  attribute here as long as they are labelled as read/write. Otherwise the
600  quantities are computed on first access and cached (lazy evaluation). The
601  heaviest load is expected when computing :attr:`contacts` and
602  :attr:`contacts_ca`.
603 
604  :param ent: Entity to be used for QS scoring. A copy of it will be processed.
605  :type ent: :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
606 
607  .. attribute:: is_valid
608 
609  True, if successfully initialized. False, if input structure is monomer or
610  has less than 2 protein chains with >= 20 residues.
611 
612  :type: :class:`bool`
613 
614  .. attribute:: original_name
615 
616  Name set for *ent* when object was created.
617 
618  :type: :class:`str`
619 
620  .. attribute:: ent
621 
622  Cleaned version of *ent* passed at construction. Hydrogens are removed, the
623  entity is processed with a :class:`~ost.conop.RuleBasedProcessor` and chains
624  listed in :attr:`removed_chains` have been removed. The name of this entity
625  might change during scoring (see :func:`GetName`). Otherwise, this will be
626  fixed.
627 
628  :type: :class:`~ost.mol.EntityHandle`
629 
630  .. attribute:: removed_chains
631 
632  Chains removed from *ent* passed at construction. These are ligand and water
633  chains as well as small (< 20 res.) peptides or chains with no amino acids
634  (determined by chem. type, which is set by rule based processor).
635 
636  :type: :class:`list` of :class:`str`
637 
638  .. attribute:: calpha_only
639 
640  Whether entity is CA-only (i.e. it has 0 CB atoms)
641 
642  :type: :class:`bool`
643  """
644  def __init__(self, ent):
645  # copy entity and process/clean it
646  self.original_name = ent.GetName()
647  ent = mol.CreateEntityFromView(ent.Select('ele!=H and aname!=HN'), True)
648  if not conop.GetDefaultLib():
649  raise RuntimeError("QSscore computation requires a compound library!")
650  pr = conop.RuleBasedProcessor(conop.GetDefaultLib())
651  pr.Process(ent)
652  self.ent, self.removed_chains, self.calpha_only = _CleanInputEntity(ent)
653  # check if it's suitable for QS scoring
654  if self.ent.chain_count == 0:
655  LogError('Bad input file: ' + ent.GetName() + '. No chains left after '
656  'removing water, ligands and small peptides.')
657  self.is_valid = False
658  elif self.ent.chain_count == 1:
659  LogWarning('Structure ' + ent.GetName() + ' is a monomer.')
660  self.is_valid = True
661  else:
662  self.is_valid = True
663  # init cached stuff
664  self._ca_entity = None
665  self._ca_chains = None
666  self._alignments = {}
667  self._chem_groups = None
668  self._angles = {}
669  self._axis = {}
670  self._contacts = None
671  self._contacts_ca = None
672 
673  def GetName(self):
674  """Wrapper to :func:`~ost.mol.EntityHandle.GetName` of :attr:`ent`.
675  This is used to uniquely identify the entity while scoring. The name may
676  therefore change while :attr:`original_name` remains fixed.
677  """
678  # for duck-typing and convenience
679  return self.ent.GetName()
680 
681  def SetName(self, new_name):
682  """Wrapper to :func:`~ost.mol.EntityHandle.SetName` of :attr:`ent`.
683  Use this to change unique identifier while scoring (see :func:`GetName`).
684  """
685  # for duck-typing and convenience
686  self.ent.SetName(new_name)
687 
688  @property
689  def ca_entity(self):
690  """
691  Reduced representation of :attr:`ent` with only CA atoms.
692  This guarantees that each included residue has exactly one atom.
693 
694  :getter: Computed on first use (cached)
695  :type: :class:`~ost.mol.EntityHandle`
696  """
697  if self._ca_entity is None:
698  self._ca_entity = _GetCAOnlyEntity(self.ent)
699  return self._ca_entity
700 
701  @property
702  def ca_chains(self):
703  """
704  Map of chain names in :attr:`ent` to sequences with attached view to CA-only
705  chains (into :attr:`ca_entity`). Useful for alignments and superpositions.
706 
707  :getter: Computed on first use (cached)
708  :type: :class:`dict` (key = :class:`str`,
709  value = :class:`~ost.seq.SequenceHandle`)
710  """
711  if self._ca_chains is None:
712  self._ca_chains = dict()
713  ca_entity = self.ca_entity
714  for ch in ca_entity.chains:
715  self._ca_chains[ch.name] = seq.SequenceFromChain(ch.name, ch)
716  return self._ca_chains
717 
718  def GetAlignment(self, c1, c2):
719  """Get sequence alignment of chain *c1* with chain *c2*.
720  Computed on first use based on :attr:`ca_chains` (cached).
721 
722  :param c1: Chain name for first chain to align.
723  :type c1: :class:`str`
724  :param c2: Chain name for second chain to align.
725  :type c2: :class:`str`
726  :rtype: :class:`~ost.seq.AlignmentHandle` or None if it failed.
727  """
728  if (c1,c2) not in self._alignments:
729  ca_chains = self.ca_chains
730  self._alignments[(c1,c2)] = _AlignAtomSeqs(ca_chains[c1], ca_chains[c2])
731  return self._alignments[(c1,c2)]
732 
733  @property
734  def chem_groups(self):
735  """
736  Intra-complex group of chemically identical (seq. id. > 95%) polypeptide
737  chains as extracted from :attr:`ca_chains`. First chain in group is the one
738  with the longest sequence.
739 
740  :getter: Computed on first use (cached)
741  :type: :class:`list` of :class:`list` of :class:`str` (chain names)
742  """
743  if self._chem_groups is None:
744  self._chem_groups = _GetChemGroups(self, 95)
745  LogInfo('Chemically equivalent chain-groups in %s: %s' \
746  % (self.GetName(), str(self._chem_groups)))
747  return self._chem_groups
748 
749  def GetAngles(self, c1, c2):
750  """Get Euler angles from superposition of chain *c1* with chain *c2*.
751  Computed on first use based on :attr:`ca_chains` (cached).
752 
753  :param c1: Chain name for first chain to superpose.
754  :type c1: :class:`str`
755  :param c2: Chain name for second chain to superpose.
756  :type c2: :class:`str`
757  :return: 3 Euler angles (may contain nan if something fails).
758  :rtype: :class:`numpy.array`
759  """
760  if (c1,c2) not in self._angles:
761  self._GetSuperposeData(c1, c2)
762  return self._angles[(c1,c2)]
763 
764  def GetAxis(self, c1, c2):
765  """Get axis of symmetry from superposition of chain *c1* with chain *c2*.
766  Computed on first use based on :attr:`ca_chains` (cached).
767 
768  :param c1: Chain name for first chain to superpose.
769  :type c1: :class:`str`
770  :param c2: Chain name for second chain to superpose.
771  :type c2: :class:`str`
772  :return: Rotational axis (may contain nan if something fails).
773  :rtype: :class:`numpy.array`
774  """
775  if (c1,c2) not in self._axis:
776  self._GetSuperposeData(c1, c2)
777  return self._axis[(c1,c2)]
778 
779  @property
780  def contacts(self):
781  """
782  Connectivity dictionary (**read/write**).
783  As given by :func:`GetContacts` with *calpha_only* = False on :attr:`ent`.
784 
785  :getter: Computed on first use (cached)
786  :setter: Uses :func:`FilterContacts` to ensure that we only keep contacts
787  for chains in the cleaned entity.
788  :type: See return type of :func:`GetContacts`
789  """
790  if self._contacts is None:
791  self._contacts = GetContacts(self.ent, False)
792  return self._contacts
793 
794  @contacts.setter
795  def contacts(self, new_contacts):
796  chain_names = set([ch.name for ch in self.ent.chains])
797  self._contacts = FilterContacts(new_contacts, chain_names)
798 
799  @property
800  def contacts_ca(self):
801  """
802  CA-only connectivity dictionary (**read/write**).
803  Like :attr:`contacts` but with *calpha_only* = True in :func:`GetContacts`.
804  """
805  if self._contacts_ca is None:
806  self._contacts_ca = GetContacts(self.ent, True)
807  return self._contacts_ca
808 
809  @contacts_ca.setter
810  def contacts_ca(self, new_contacts):
811  chain_names = set([ch.name for ch in self.ent.chains])
812  self._contacts_ca = FilterContacts(new_contacts, chain_names)
813 
814  ##############################################################################
815  # Class internal helpers (anything that doesnt easily work without this class)
816  ##############################################################################
817 
818  def _GetSuperposeData(self, c1, c2):
819  """Fill _angles and _axis from superposition of CA chains of c1 and c2."""
820  # get aligned views (must contain identical numbers of atoms!)
821  aln = self.GetAlignment(c1, c2)
822  if not aln:
823  # fallback for non-aligned stuff (nan)
824  self._angles[(c1,c2)] = np.empty(3) * np.nan
825  self._axis[(c1,c2)] = np.empty(3) * np.nan
826  return
827  v1, v2 = seq.ViewsFromAlignment(aln)
828  if v1.atom_count < 3:
829  # fallback for non-aligned stuff (nan)
830  self._angles[(c1,c2)] = np.empty(3) * np.nan
831  self._axis[(c1,c2)] = np.empty(3) * np.nan
832  return
833  # get transformation
834  sup_res = mol.alg.SuperposeSVD(v1, v2, apply_transform=False)
835  Rt = sup_res.transformation
836  # extract angles
837  a,b,c = _GetAngles(Rt)
838  self._angles[(c1,c2)] = np.asarray([a,b,c])
839  # extract axis of symmetry
840  R3 = geom.Rotation3(Rt.ExtractRotation())
841  self._axis[(c1,c2)] = np.asarray(R3.GetRotationAxis().data)
842 
843 ###############################################################################
844 # Contacts computations
845 ###############################################################################
846 
847 def FilterContacts(contacts, chain_names):
848  """Filter contacts to contain only contacts for chains in *chain_names*.
849 
850  :param contacts: Connectivity dictionary as produced by :func:`GetContacts`.
851  :type contacts: :class:`dict`
852  :param chain_names: Chain names to keep.
853  :type chain_names: :class:`list` or (better) :class:`set`
854  :return: New connectivity dictionary (format as in :func:`GetContacts`)
855  :rtype: :class:`dict`
856  """
857  # create new dict with subset
858  filtered_contacts = dict()
859  for c1 in contacts:
860  if c1 in chain_names:
861  new_contacts = dict()
862  for c2 in contacts[c1]:
863  if c2 in chain_names:
864  new_contacts[c2] = contacts[c1][c2]
865  # avoid adding empty dicts
866  if new_contacts:
867  filtered_contacts[c1] = new_contacts
868  return filtered_contacts
869 
870 def GetContacts(entity, calpha_only, dist_thr=12.0):
871  """Get inter-chain contacts of a macromolecular entity.
872 
873  Contacts are pairs of residues within a given distance belonging to different
874  chains. They are stored once per pair and include the CA/CB-CA/CB distance.
875 
876  :param entity: An entity to check connectivity for.
877  :type entity: :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
878  :param calpha_only: If True, we only consider CA-CA distances. Else, we use CB
879  unless the residue is a GLY.
880  :type calpha_only: :class:`bool`
881  :param dist_thr: Maximal CA/CB-CA/CB distance to be considered in contact.
882  :type dist_thr: :class:`float`
883  :return: A connectivity dictionary. A pair of residues with chain names
884  *ch_name1* & *ch_name2* (*ch_name1* < *ch_name2*), residue numbers
885  *res_num1* & *res_num2* and distance *dist* (<= *dist_thr*) are
886  stored as *result[ch_name1][ch_name2][res_num1][res_num2]* = *dist*.
887  :rtype: :class:`dict`
888  """
889  # get ent copy to search on
890  if calpha_only:
891  ev = entity.Select("aname=CA")
892  else:
893  ev = entity.Select("(rname=GLY and aname=CA) or aname=CB")
894  ent = mol.CreateEntityFromView(ev, True)
895  # search all vs all
896  contacts = dict()
897  for atom in ent.atoms:
898  ch_name1 = atom.chain.name
899  res_num1 = atom.residue.number.num
900  close_atoms = ent.FindWithin(atom.pos, dist_thr)
901  for close_atom in close_atoms:
902  ch_name2 = close_atom.chain.name
903  if ch_name2 > ch_name1:
904  res_num2 = close_atom.residue.number.num
905  dist = geom.Distance(atom.pos, close_atom.pos)
906  # add to contacts
907  if ch_name1 not in contacts:
908  contacts[ch_name1] = dict()
909  if ch_name2 not in contacts[ch_name1]:
910  contacts[ch_name1][ch_name2] = dict()
911  if res_num1 not in contacts[ch_name1][ch_name2]:
912  contacts[ch_name1][ch_name2][res_num1] = dict()
913  contacts[ch_name1][ch_name2][res_num1][res_num2] = round(dist, 3)
914  # DONE
915  return contacts
916 
917 ###############################################################################
918 # Oligo-lDDT scores
919 ###############################################################################
920 
921 class OligoLDDTScorer(object):
922  """Helper class to calculate oligomeric lDDT scores.
923 
924  This class can be used independently, but commonly it will be created by
925  calling :func:`QSscorer.GetOligoLDDTScorer`.
926 
927  .. note::
928 
929  By construction, lDDT scores are not symmetric and hence it matters which
930  structure is the reference (:attr:`ref`) and which one is the model
931  (:attr:`mdl`). Extra residues in the model are generally not considered.
932  Extra chains in both model and reference can be considered by setting the
933  :attr:`penalize_extra_chains` flag to True.
934 
935  :param ref: Sets :attr:`ref`
936  :param mdl: Sets :attr:`mdl`
937  :param alignments: Sets :attr:`alignments`
938  :param calpha_only: Sets :attr:`calpha_only`
939  :param settings: Sets :attr:`settings`
940  :param penalize_extra_chains: Sets :attr:`penalize_extra_chains`
941  :param chem_mapping: Sets :attr:`chem_mapping`. Must be given if
942  *penalize_extra_chains* is True.
943 
944  .. attribute:: ref
945  mdl
946 
947  Full reference/model entity to be scored. The entity must contain all chains
948  mapped in :attr:`alignments` and may also contain additional ones which are
949  considered if :attr:`penalize_extra_chains` is True.
950 
951  :type: :class:`~ost.mol.EntityHandle`
952 
953  .. attribute:: alignments
954 
955  One alignment for each mapped chain of :attr:`ref`/:attr:`mdl` as defined in
956  :attr:`QSscorer.alignments`. The first sequence of each alignment belongs to
957  :attr:`ref` and the second one to :attr:`mdl`. Sequences must have sequence
958  naming and attached views as defined in :attr:`QSscorer.alignments`.
959 
960  :type: :class:`list` of :class:`~ost.seq.AlignmentHandle`
961 
962  .. attribute:: calpha_only
963 
964  If True, restricts lDDT score to CA only.
965 
966  :type: :class:`bool`
967 
968  .. attribute:: settings
969 
970  Settings to use for lDDT scoring.
971 
972  :type: :class:`~ost.mol.alg.lDDTSettings`
973 
974  .. attribute:: penalize_extra_chains
975 
976  If True, extra chains in both :attr:`ref` and :attr:`mdl` will penalize the
977  lDDT scores.
978 
979  :type: :class:`bool`
980 
981  .. attribute:: chem_mapping
982 
983  Inter-complex mapping of chemical groups as defined in
984  :attr:`QSscorer.chem_mapping`. Used to find "chem-mapped" chains in
985  :attr:`ref` for unmapped chains in :attr:`mdl` when penalizing scores.
986  Each unmapped model chain can add extra reference-contacts according to the
987  average total contacts of each single "chem-mapped" reference chain. If
988  there is no "chem-mapped" reference chain, a warning is shown and the model
989  chain is ignored.
990 
991 
992  Only relevant if :attr:`penalize_extra_chains` is True.
993 
994  :type: :class:`dict` with key = :class:`tuple` of chain names in
995  :attr:`ref` and value = :class:`tuple` of chain names in
996  :attr:`mdl`.
997  """
998 
999  # NOTE: one could also allow computation of both penalized and unpenalized
1000  # in same object -> must regenerate lddt_ref / lddt_mdl though
1001 
1002  def __init__(self, ref, mdl, alignments, calpha_only, settings,
1003  penalize_extra_chains=False, chem_mapping=None):
1004  # sanity checks
1005  if chem_mapping is None and penalize_extra_chains:
1006  raise RuntimeError("Must provide chem_mapping when requesting penalty "
1007  "for extra chains!")
1008  if not penalize_extra_chains:
1009  # warn for unmapped model chains
1010  unmapped_mdl_chains = self._GetUnmappedMdlChains(mdl, alignments)
1011  if unmapped_mdl_chains:
1012  LogWarning('MODEL contains chains unmapped to REFERENCE, '
1013  'lDDT is not considering MODEL chains %s' \
1014  % str(list(unmapped_mdl_chains)))
1015  # warn for unmapped reference chains
1016  ref_chains = set(ch.name for ch in ref.chains)
1017  mapped_ref_chains = set(aln.GetSequence(0).GetName() for aln in alignments)
1018  unmapped_ref_chains = (ref_chains - mapped_ref_chains)
1019  if unmapped_ref_chains:
1020  LogWarning('REFERENCE contains chains unmapped to MODEL, '
1021  'lDDT is not considering REFERENCE chains %s' \
1022  % str(list(unmapped_ref_chains)))
1023  # prepare fields
1024  self.ref = ref
1025  self.mdl = mdl
1026  self.alignments = alignments
1027  self.calpha_only = calpha_only
1028  self.settings = settings
1029  self.penalize_extra_chains = penalize_extra_chains
1030  self.chem_mapping = chem_mapping
1031  self._sc_lddt = None
1032  self._oligo_lddt = None
1033  self._weighted_lddt = None
1034  self._lddt_ref = None
1035  self._lddt_mdl = None
1036  self._oligo_lddt_scorer = None
1037  self._mapped_lddt_scorers = None
1038  self._ref_scorers = None
1039  self._model_penalty = None
1040 
1041  @property
1042  def oligo_lddt(self):
1043  """Oligomeric lDDT score.
1044 
1045  The score is computed as conserved contacts divided by the total contacts
1046  in the reference using the :attr:`oligo_lddt_scorer`, which uses the full
1047  complex as reference/model structure. If :attr:`penalize_extra_chains` is
1048  True, the reference/model complexes contain all chains (otherwise only the
1049  mapped ones) and additional contacts are added to the reference's total
1050  contacts for unmapped model chains according to the :attr:`chem_mapping`.
1051 
1052  The main difference with :attr:`weighted_lddt` is that the lDDT scorer
1053  "sees" the full complex here (incl. inter-chain contacts), while the
1054  weighted single chain score looks at each chain separately.
1055 
1056  :getter: Computed on first use (cached)
1057  :type: :class:`float`
1058  """
1059  if self._oligo_lddt is None:
1060  LogInfo('Reference %s has: %s chains' \
1061  % (self.ref.GetName(), self.ref.chain_count))
1062  LogInfo('Model %s has: %s chains' \
1063  % (self.mdl.GetName(), self.mdl.chain_count))
1064 
1065  # score with or w/o extra-chain penalty
1066  if self.penalize_extra_chains:
1067  denominator = self.oligo_lddt_scorer.total_contacts
1068  denominator += self._GetModelPenalty()
1069  if denominator > 0:
1070  oligo_lddt = self.oligo_lddt_scorer.conserved_contacts \
1071  / float(denominator)
1072  else:
1073  oligo_lddt = 0.0
1074  else:
1075  oligo_lddt = self.oligo_lddt_scorer.global_score
1076  self._oligo_lddt = oligo_lddt
1077  return self._oligo_lddt
1078 
1079  @property
1080  def weighted_lddt(self):
1081  """Weighted average of single chain lDDT scores.
1082 
1083  The score is computed as a weighted average of single chain lDDT scores
1084  (see :attr:`sc_lddt_scorers`) using the total contacts of each single
1085  reference chain as weights. If :attr:`penalize_extra_chains` is True,
1086  unmapped chains are added with a 0 score and total contacts taken from
1087  the actual reference chains or (for unmapped model chains) using the
1088  :attr:`chem_mapping`.
1089 
1090  See :attr:`oligo_lddt` for a comparison of the two scores.
1091 
1092  :getter: Computed on first use (cached)
1093  :type: :class:`float`
1094  """
1095  if self._weighted_lddt is None:
1096  scores = [s.global_score for s in self.sc_lddt_scorers]
1097  weights = [s.total_contacts for s in self.sc_lddt_scorers]
1098  nominator = sum([s * w for s, w in zip(scores, weights)])
1099  if self.penalize_extra_chains:
1100  ref_scorers = self._GetRefScorers()
1101  denominator = sum(s.total_contacts for s in ref_scorers.values())
1102  denominator += self._GetModelPenalty()
1103  else:
1104  denominator = sum(weights)
1105  if denominator > 0:
1106  self._weighted_lddt = nominator / float(denominator)
1107  else:
1108  self._weighted_lddt = 0.0
1109  return self._weighted_lddt
1110 
1111  @property
1112  def lddt_ref(self):
1113  """The reference entity used for oligomeric lDDT scoring
1114  (:attr:`oligo_lddt` / :attr:`oligo_lddt_scorer`).
1115 
1116  Since the lDDT computation requires a single chain with mapped residue
1117  numbering, all chains of :attr:`ref` are appended into a single chain X with
1118  unique residue numbers according to the column-index in the alignment. The
1119  alignments are in the same order as they appear in :attr:`alignments`.
1120  Additional residues are appended at the end of the chain with unique residue
1121  numbers. Unmapped chains are only added if :attr:`penalize_extra_chains` is
1122  True. Only CA atoms are considered if :attr:`calpha_only` is True.
1123 
1124  :getter: Computed on first use (cached)
1125  :type: :class:`~ost.mol.EntityHandle`
1126  """
1127  if self._lddt_ref is None:
1128  self._PrepareOligoEntities()
1129  return self._lddt_ref
1130 
1131  @property
1132  def lddt_mdl(self):
1133  """The model entity used for oligomeric lDDT scoring
1134  (:attr:`oligo_lddt` / :attr:`oligo_lddt_scorer`).
1135 
1136  Like :attr:`lddt_ref`, this is a single chain X containing all chains of
1137  :attr:`mdl`. The residue numbers match the ones in :attr:`lddt_ref` where
1138  aligned and have unique numbers for additional residues.
1139 
1140  :getter: Computed on first use (cached)
1141  :type: :class:`~ost.mol.EntityHandle`
1142  """
1143  if self._lddt_mdl is None:
1144  self._PrepareOligoEntities()
1145  return self._lddt_mdl
1146 
1147  @property
1149  """lDDT Scorer object for :attr:`lddt_ref` and :attr:`lddt_mdl`.
1150 
1151  :getter: Computed on first use (cached)
1152  :type: :class:`~ost.mol.alg.lDDTScorer`
1153  """
1154  if self._oligo_lddt_scorer is None:
1156  references=[self.lddt_ref.Select("")],
1157  model=self.lddt_mdl.Select(""),
1158  settings=self.settings)
1159  return self._oligo_lddt_scorer
1160 
1161  @property
1163  """List of scorer objects for each chain mapped in :attr:`alignments`.
1164 
1165  :getter: Computed on first use (cached)
1166  :type: :class:`list` of :class:`MappedLDDTScorer`
1167  """
1168  if self._mapped_lddt_scorers is None:
1169  self._mapped_lddt_scorers = list()
1170  for aln in self.alignments:
1171  mapped_lddt_scorer = MappedLDDTScorer(aln, self.calpha_only,
1172  self.settings)
1173  self._mapped_lddt_scorers.append(mapped_lddt_scorer)
1174  return self._mapped_lddt_scorers
1175 
1176  @property
1177  def sc_lddt_scorers(self):
1178  """List of lDDT scorer objects extracted from :attr:`mapped_lddt_scorers`.
1179 
1180  :type: :class:`list` of :class:`~ost.mol.alg.lDDTScorer`
1181  """
1182  return [mls.lddt_scorer for mls in self.mapped_lddt_scorers]
1183 
1184  @property
1185  def sc_lddt(self):
1186  """List of global scores extracted from :attr:`sc_lddt_scorers`.
1187 
1188  If scoring for a mapped chain fails, an error is displayed and a score of 0
1189  is assigned.
1190 
1191  :getter: Computed on first use (cached)
1192  :type: :class:`list` of :class:`float`
1193  """
1194  if self._sc_lddt is None:
1195  self._sc_lddt = list()
1196  for lddt_scorer in self.sc_lddt_scorers:
1197  try:
1198  self._sc_lddt.append(lddt_scorer.global_score)
1199  except Exception as ex:
1200  LogError('Single chain lDDT failed:', str(ex))
1201  self._sc_lddt.append(0.0)
1202  return self._sc_lddt
1203 
1204  ##############################################################################
1205  # Class internal helpers
1206  ##############################################################################
1207 
1208  def _PrepareOligoEntities(self):
1209  # simple wrapper to avoid code duplication
1210  self._lddt_ref, self._lddt_mdl = _MergeAlignedChains(
1211  self.alignments, self.ref, self.mdl, self.calpha_only,
1212  self.penalize_extra_chains)
1213 
1214  @staticmethod
1215  def _GetUnmappedMdlChains(mdl, alignments):
1216  # assume model is second sequence in alignment and is named by chain
1217  mdl_chains = set(ch.name for ch in mdl.chains)
1218  mapped_mdl_chains = set(aln.GetSequence(1).GetName() for aln in alignments)
1219  return (mdl_chains - mapped_mdl_chains)
1220 
1221  def _GetRefScorers(self):
1222  # single chain lddt scorers for each reference chain (key = chain name)
1223  if self._ref_scorers is None:
1224  # collect from mapped_lddt_scorers
1225  ref_scorers = dict()
1226  for mapped_lddt_scorer in self.mapped_lddt_scorers:
1227  ref_ch_name = mapped_lddt_scorer.reference_chain_name
1228  ref_scorers[ref_ch_name] = mapped_lddt_scorer.lddt_scorer
1229  # add new ones where needed
1230  for ch in self.ref.chains:
1231  if ch.name not in ref_scorers:
1232  if self.calpha_only:
1233  ref_chain = ch.Select('aname=CA')
1234  else:
1235  ref_chain = ch.Select('')
1236  ref_scorers[ch.name] = lDDTScorer(
1237  references=[ref_chain],
1238  model=ref_chain,
1239  settings=self.settings)
1240  # store in cache
1241  self._ref_scorers = ref_scorers
1242  # fetch from cache
1243  return self._ref_scorers
1244 
1245  def _GetModelPenalty(self):
1246  # extra value to add to total number of distances for extra model chains
1247  # -> estimated from chem-mapped reference chains
1248  if self._model_penalty is None:
1249  # sanity check
1250  if self.chem_mapping is None:
1251  raise RuntimeError("Must provide chem_mapping when requesting penalty "
1252  "for extra model chains!")
1253  # get cached ref_scorers
1254  ref_scorers = self._GetRefScorers()
1255  # get unmapped model chains
1256  unmapped_mdl_chains = self._GetUnmappedMdlChains(self.mdl, self.alignments)
1257  # map extra chains to ref. chains
1258  model_penalty = 0
1259  for ch_name_mdl in sorted(unmapped_mdl_chains):
1260  # get penalty for chain
1261  cur_penalty = None
1262  for cm_ref, cm_mdl in self.chem_mapping.iteritems():
1263  if ch_name_mdl in cm_mdl:
1264  # penalize by an average of the chem. mapped ref. chains
1265  cur_penalty = 0
1266  for ch_name_ref in cm_ref:
1267  # assumes that total_contacts is cached (for speed)
1268  cur_penalty += ref_scorers[ch_name_ref].total_contacts
1269  cur_penalty /= float(len(cm_ref))
1270  break
1271  # report penalty
1272  if cur_penalty is None:
1273  LogWarning('Extra MODEL chain %s could not be chemically mapped to '
1274  'any chain in REFERENCE, lDDT cannot consider it!' \
1275  % ch_name_mdl)
1276  else:
1277  LogScript('Extra MODEL chain %s added to lDDT score by considering '
1278  'chemically mapped chains in REFERENCE.' % ch_name_mdl)
1279  model_penalty += cur_penalty
1280  # store in cache
1281  self._model_penalty = model_penalty
1282  # fetch from cache
1283  return self._model_penalty
1284 
1285 
1286 class MappedLDDTScorer(object):
1287  """A simple class to calculate a single-chain lDDT score on a given chain to
1288  chain mapping as extracted from :class:`OligoLDDTScorer`.
1289 
1290  :param alignment: Sets :attr:`alignment`
1291  :param calpha_only: Sets :attr:`calpha_only`
1292  :param settings: Sets :attr:`settings`
1293 
1294  .. attribute:: alignment
1295 
1296  Alignment with two sequences named according to the mapped chains and with
1297  views attached to both sequences (e.g. one of the items of
1298  :attr:`QSscorer.alignments`).
1299 
1300  The first sequence is assumed to be the reference and the second one the
1301  model. Since the lDDT score is not symmetric (extra residues in model are
1302  ignored), the order is important.
1303 
1304  :type: :class:`~ost.seq.AlignmentHandle`
1305 
1306  .. attribute:: calpha_only
1307 
1308  If True, restricts lDDT score to CA only.
1309 
1310  :type: :class:`bool`
1311 
1312  .. attribute:: settings
1313 
1314  Settings to use for lDDT scoring.
1315 
1316  :type: :class:`~ost.mol.alg.lDDTSettings`
1317 
1318  .. attribute:: lddt_scorer
1319 
1320  lDDT Scorer object for the given chains.
1321 
1322  :type: :class:`~ost.mol.alg.lDDTScorer`
1323 
1324  .. attribute:: reference_chain_name
1325 
1326  Chain name of the reference.
1327 
1328  :type: :class:`str`
1329 
1330  .. attribute:: model_chain_name
1331 
1332  Chain name of the model.
1333 
1334  :type: :class:`str`
1335  """
1336  def __init__(self, alignment, calpha_only, settings):
1337  # prepare fields
1338  self.alignment = alignment
1339  self.calpha_only = calpha_only
1340  self.settings = settings
1341  self.lddt_scorer = None # set in _InitScorer
1342  self.reference_chain_name = alignment.sequences[0].name
1343  self.model_chain_name = alignment.sequences[1].name
1344  self._old_number_label = "old_num"
1345  self._extended_alignment = None # set in _InitScorer
1346  # initialize lDDT scorer
1347  self._InitScorer()
1348 
1350  """
1351  :return: Scores for each residue
1352  :rtype: :class:`list` of :class:`dict` with one item for each residue
1353  existing in model and reference:
1354 
1355  - "residue_number": Residue number in reference chain
1356  - "residue_name": Residue name in reference chain
1357  - "lddt": local lDDT
1358  - "conserved_contacts": number of conserved contacts
1359  - "total_contacts": total number of contacts
1360  """
1361  scores = list()
1362  assigned_residues = list()
1363  # Make sure the score is calculated
1364  self.lddt_scorer.global_score
1365  for col in self._extended_alignment:
1366  if col[0] != "-" and col.GetResidue(3).IsValid():
1367  ref_res = col.GetResidue(0)
1368  mdl_res = col.GetResidue(1)
1369  ref_res_renum = col.GetResidue(2)
1370  mdl_res_renum = col.GetResidue(3)
1371  if ref_res.one_letter_code != ref_res_renum.one_letter_code:
1372  raise RuntimeError("Reference residue name mapping inconsistent: %s != %s" %
1373  (ref_res.one_letter_code,
1374  ref_res_renum.one_letter_code))
1375  if mdl_res.one_letter_code != mdl_res_renum.one_letter_code:
1376  raise RuntimeError("Model residue name mapping inconsistent: %s != %s" %
1377  (mdl_res.one_letter_code,
1378  mdl_res_renum.one_letter_code))
1379  if ref_res.GetNumber().num != ref_res_renum.GetIntProp(self._old_number_label):
1380  raise RuntimeError("Reference residue number mapping inconsistent: %s != %s" %
1381  (ref_res.GetNumber().num,
1382  ref_res_renum.GetIntProp(self._old_number_label)))
1383  if mdl_res.GetNumber().num != mdl_res_renum.GetIntProp(self._old_number_label):
1384  raise RuntimeError("Model residue number mapping inconsistent: %s != %s" %
1385  (mdl_res.GetNumber().num,
1386  mdl_res_renum.GetIntProp(self._old_number_label)))
1387  if ref_res.qualified_name in assigned_residues:
1388  raise RuntimeError("Duplicated residue in reference: " %
1389  (ref_res.qualified_name))
1390  else:
1391  assigned_residues.append(ref_res.qualified_name)
1392  # check if property there (may be missing for CA-only)
1393  if mdl_res_renum.HasProp(self.settings.label):
1394  scores.append({
1395  "residue_number": ref_res.GetNumber().num,
1396  "residue_name": ref_res.name,
1397  "lddt": mdl_res_renum.GetFloatProp(self.settings.label),
1398  "conserved_contacts": mdl_res_renum.GetFloatProp(self.settings.label + "_conserved"),
1399  "total_contacts": mdl_res_renum.GetFloatProp(self.settings.label + "_total")})
1400  return scores
1401 
1402  ##############################################################################
1403  # Class internal helpers (anything that doesnt easily work without this class)
1404  ##############################################################################
1405 
1406  def _InitScorer(self):
1407  # Use copy of alignment (extended by 2 extra sequences for renumbering)
1408  aln = self.alignment.Copy()
1409  # Get chains and renumber according to alignment (for lDDT)
1410  reference = Renumber(
1411  aln.GetSequence(0),
1412  old_number_label=self._old_number_label).CreateFullView()
1413  refseq = seq.CreateSequence(
1414  "reference_renumbered",
1415  aln.GetSequence(0).GetString())
1416  refseq.AttachView(reference)
1417  aln.AddSequence(refseq)
1418  model = Renumber(
1419  aln.GetSequence(1),
1420  old_number_label=self._old_number_label).CreateFullView()
1421  modelseq = seq.CreateSequence(
1422  "model_renumbered",
1423  aln.GetSequence(1).GetString())
1424  modelseq.AttachView(model)
1425  aln.AddSequence(modelseq)
1426  # Filter to CA-only if desired (done after AttachView to not mess it up)
1427  if self.calpha_only:
1428  self.lddt_scorer = lDDTScorer(
1429  references=[reference.Select('aname=CA')],
1430  model=model.Select('aname=CA'),
1431  settings=self.settings)
1432  else:
1433  self.lddt_scorer = lDDTScorer(
1434  references=[reference],
1435  model=model,
1436  settings=self.settings)
1437  # Store alignment for later
1438  self._extended_alignment = aln
1439 
1440 ###############################################################################
1441 # HELPERS
1442 ###############################################################################
1443 
1444 # general
1445 
1446 def _AlignAtomSeqs(seq_1, seq_2):
1447  """
1448  :type seq_1: :class:`ost.seq.SequenceHandle`
1449  :type seq_2: :class:`ost.seq.SequenceHandle`
1450  :return: Alignment of two sequences using a global alignment. Views attached
1451  to the input sequences will remain attached in the aln.
1452  :rtype: :class:`~ost.seq.AlignmentHandle` or None if it failed.
1453  """
1454  # NOTE: If the two sequence have a greatly different length
1455  # a local alignment could be a better choice...
1456  aln = None
1457  alns = seq.alg.GlobalAlign(seq_1, seq_2, seq.alg.BLOSUM62)
1458  if alns: aln = alns[0]
1459  if not aln:
1460  LogWarning('Failed to align %s to %s' % (seq_1.name, seq_2.name))
1461  LogWarning('%s: %s' % (seq_1.name, seq_1.string))
1462  LogWarning('%s: %s' % (seq_2.name, seq_2.string))
1463  return aln
1464 
1465 def _FixSelectChainName(ch_name):
1466  """
1467  :return: String to be used with Select(cname=<RETURN>). Takes care of putting
1468  quotation marks where needed.
1469  :rtype: :class:`str`
1470  :param ch_name: Single chain name (:class:`str`).
1471  """
1472  if ch_name in ['-', '_', ' ']:
1473  return '"%c"' % ch_name
1474  else:
1475  return ch_name
1476 
1477 def _FixSelectChainNames(ch_names):
1478  """
1479  :return: String to be used with Select(cname=<RETURN>). Takes care of joining
1480  and putting quotation marks where needed.
1481  :rtype: :class:`str`
1482  :param ch_names: Some iterable list of chain names (:class:`str` items).
1483  """
1484  chain_set = set([_FixSelectChainName(ch_name) for ch_name in ch_names])
1485  return ','.join(chain_set)
1486 
1487 # QS entity
1488 
1489 def _CleanInputEntity(ent):
1490  """
1491  :param ent: The OST entity to be cleaned.
1492  :type ent: :class:`EntityHandle` or :class:`EntityView`
1493  :return: A tuple of 3 items: :attr:`QSscoreEntity.ent`,
1494  :attr:`QSscoreEntity.removed_chains`,
1495  :attr:`QSscoreEntity.calpha_only`
1496  """
1497  # find chains to remove
1498  removed_chains = []
1499  for ch in ent.chains:
1500  # we remove chains if they are small-peptides or if the contain no aa
1501  # or if they contain only unknown or modified residues
1502  if ch.name in ['-', '_'] \
1503  or ch.residue_count < 20 \
1504  or not any(r.chem_type.IsAminoAcid() for r in ch.residues) \
1505  or not (set(r.one_letter_code for r in ch.residues) - {'?', 'X'}):
1506  removed_chains.append(ch.name)
1507 
1508  # remove them from *ent*
1509  if removed_chains:
1510  view = ent.Select('cname!=%s' % _FixSelectChainNames(removed_chains))
1511  ent_new = mol.CreateEntityFromView(view, True)
1512  ent_new.SetName(ent.GetName())
1513  else:
1514  ent_new = ent
1515 
1516  # check if CA only
1517  calpha_only = False
1518  if ent_new.atom_count > 0 and ent_new.Select('aname=CB').atom_count == 0:
1519  LogInfo('Structure %s is a CA only structure!' % ent_new.GetName())
1520  calpha_only = True
1521 
1522  # report and return
1523  if removed_chains:
1524  LogInfo('Chains removed from %s: %s' \
1525  % (ent_new.GetName(), ''.join(removed_chains)))
1526  LogInfo('Chains in %s: %s' \
1527  % (ent_new.GetName(), ''.join([c.name for c in ent_new.chains])))
1528  return ent_new, removed_chains, calpha_only
1529 
1530 def _GetCAOnlyEntity(ent):
1531  """
1532  :param ent: Entity to process
1533  :type ent: :class:`EntityHandle` or :class:`EntityView`
1534  :return: New entity with only CA and only one atom per residue
1535  (see :attr:`QSscoreEntity.ca_entity`)
1536  """
1537  # cook up CA only view (diff from Select = guaranteed 1 atom per residue)
1538  ca_view = ent.CreateEmptyView()
1539  # add chain by chain
1540  for res in ent.residues:
1541  ca_atom = res.FindAtom("CA")
1542  if ca_atom.IsValid():
1543  ca_view.AddAtom(ca_atom)
1544  # finalize
1545  return mol.CreateEntityFromView(ca_view, False)
1546 
1547 def _GetChemGroups(qs_ent, seqid_thr=95.):
1548  """
1549  :return: Intra-complex group of chemically identical polypeptide chains
1550  (see :attr:`QSscoreEntity.chem_groups`)
1551 
1552  :param qs_ent: Entity to process
1553  :type qs_ent: :class:`QSscoreEntity`
1554  :param seqid_thr: Threshold used to decide when two chains are identical.
1555  95 percent tolerates the few mutations crystallographers
1556  like to do.
1557  :type seqid_thr: :class:`float`
1558  """
1559  # get data from qs_ent
1560  ca_chains = qs_ent.ca_chains
1561  chain_names = sorted(ca_chains.keys())
1562  # get pairs of identical chains
1563  # NOTE: this scales quadratically with number of chains and may be optimized
1564  # -> one could merge it with "merge transitive pairs" below...
1565  id_seqs = []
1566  for ch_1, ch_2 in itertools.combinations(chain_names, 2):
1567  aln = qs_ent.GetAlignment(ch_1, ch_2)
1568  if aln and seq.alg.SequenceIdentity(aln) > seqid_thr:
1569  id_seqs.append((ch_1, ch_2))
1570  # trivial case: no matching pairs
1571  if not id_seqs:
1572  return [[name] for name in chain_names]
1573 
1574  # merge transitive pairs
1575  groups = []
1576  for ch_1, ch_2 in id_seqs:
1577  found = False
1578  for g in groups:
1579  if ch_1 in g or ch_2 in g:
1580  found = True
1581  g.add(ch_1)
1582  g.add(ch_2)
1583  if not found:
1584  groups.append(set([ch_1, ch_2]))
1585  # sort internally based on sequence length
1586  chem_groups = []
1587  for g in groups:
1588  ranked_g = sorted([(-ca_chains[ch].length, ch) for ch in g])
1589  chem_groups.append([ch for _,ch in ranked_g])
1590  # add other dissimilar chains
1591  for ch in chain_names:
1592  if not any(ch in g for g in chem_groups):
1593  chem_groups.append([ch])
1594 
1595  return chem_groups
1596 
1597 def _GetAngles(Rt):
1598  """Computes the Euler angles given a transformation matrix.
1599 
1600  :param Rt: Rt operator.
1601  :type Rt: :class:`ost.geom.Mat4`
1602  :return: A :class:`tuple` of angles for each axis (x,y,z)
1603  """
1604  rot = np.asmatrix(Rt.ExtractRotation().data).reshape(3,3)
1605  tx = np.arctan2(rot[2,1], rot[2,2])
1606  if tx < 0:
1607  tx += 2*np.pi
1608  ty = np.arctan2(rot[2,0], np.sqrt(rot[2,1]**2 + rot[2,2]**2))
1609  if ty < 0:
1610  ty += 2*np.pi
1611  tz = np.arctan2(rot[1,0], rot[0,0])
1612  if tz < 0:
1613  tz += 2*np.pi
1614  return tx,ty,tz
1615 
1616 # QS scorer
1617 
1618 def _GetChemGroupsMapping(qs_ent_1, qs_ent_2):
1619  """
1620  :return: Inter-complex mapping of chemical groups
1621  (see :attr:`QSscorer.chem_mapping`)
1622 
1623  :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
1624  :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
1625  """
1626  # get chem. groups and unique representative
1627  chem_groups_1 = qs_ent_1.chem_groups
1628  chem_groups_2 = qs_ent_2.chem_groups
1629  repr_chains_1 = {x[0]: tuple(x) for x in chem_groups_1}
1630  repr_chains_2 = {x[0]: tuple(x) for x in chem_groups_2}
1631 
1632  # if entities do not have different number of unique chains, we get the
1633  # mapping for the smaller set
1634  swapped = False
1635  if len(repr_chains_2) < len(repr_chains_1):
1636  repr_chains_1, repr_chains_2 = repr_chains_2, repr_chains_1
1637  qs_ent_1, qs_ent_2 = qs_ent_2, qs_ent_1
1638  swapped = True
1639 
1640  # find the closest to each chain between the two entities
1641  # NOTE: this may still be sensible to orthology problem
1642  # -> currently we use a global alignment and seq. id. to rank pairs
1643  # -> we also tried local alignments and weighting the seq. id. by the
1644  # coverages of the alignments (gapless string in aln. / seq. length)
1645  # but global aln performed better...
1646  chain_pairs = []
1647  ca_chains_1 = qs_ent_1.ca_chains
1648  ca_chains_2 = qs_ent_2.ca_chains
1649  for ch_1 in repr_chains_1.keys():
1650  for ch_2 in repr_chains_2.keys():
1651  aln = _AlignAtomSeqs(ca_chains_1[ch_1], ca_chains_2[ch_2])
1652  if aln:
1653  chains_seqid = seq.alg.SequenceIdentity(aln)
1654  LogVerbose('Sequence identity', ch_1, ch_2, 'seqid=%.2f' % chains_seqid)
1655  chain_pairs.append((chains_seqid, ch_1, ch_2))
1656 
1657  # get top matching groups first
1658  chain_pairs = sorted(chain_pairs, reverse=True)
1659  chem_mapping = {}
1660  for _, c1, c2 in chain_pairs:
1661  skip = False
1662  for a,b in chem_mapping.iteritems():
1663  if repr_chains_1[c1] == a or repr_chains_2[c2] == b:
1664  skip = True
1665  break
1666  if not skip:
1667  chem_mapping[repr_chains_1[c1]] = repr_chains_2[c2]
1668  if swapped:
1669  chem_mapping = {y: x for x, y in chem_mapping.iteritems()}
1670  qs_ent_1, qs_ent_2 = qs_ent_2, qs_ent_1
1671 
1672  # notify chains without partner
1673  mapped_1 = set([i for s in chem_mapping.keys() for i in s])
1674  chains_1 = set([c.name for c in qs_ent_1.ent.chains])
1675  if chains_1 - mapped_1:
1676  LogWarning('Unmapped Chains in %s: %s'
1677  % (qs_ent_1.GetName(), ','.join(list(chains_1 - mapped_1))))
1678 
1679  mapped_2 = set([i for s in chem_mapping.values() for i in s])
1680  chains_2 = set([c.name for c in qs_ent_2.ent.chains])
1681  if chains_2 - mapped_2:
1682  LogWarning('Unmapped Chains in %s: %s'
1683  % (qs_ent_2.GetName(), ','.join(list(chains_2 - mapped_2))))
1684 
1685  # check if we have any chains left
1686  LogInfo('Chemical chain-groups mapping: ' + str(chem_mapping))
1687  if len(mapped_1) < 1 or len(mapped_2) < 1:
1688  raise QSscoreError('Less than 1 chains left in chem_mapping.')
1689  return chem_mapping
1690 
1691 def _SelectFew(l, max_elements):
1692  """Return l or copy of l with at most *max_elements* entries."""
1693  n_el = len(l)
1694  if n_el <= max_elements:
1695  return l
1696  else:
1697  # cheap integer ceiling (-1 to ensure that x*max_elements gets d_el = x)
1698  d_el = ((n_el - 1) // max_elements) + 1
1699  new_l = list()
1700  for i in range(0, n_el, d_el):
1701  new_l.append(l[i])
1702  return new_l
1703 
1704 def _GetAlignedResidues(qs_ent_1, qs_ent_2, chem_mapping, max_ca_per_chain,
1705  clustalw_bin):
1706  """
1707  :return: Tuple of two :class:`~ost.mol.EntityView` objects containing subsets
1708  of *qs_ent_1* and *qs_ent_2*. Two entities are later created from
1709  those views (see :attr:`QSscorer.ent_to_cm_1` and
1710  :attr:`QSscorer.ent_to_cm_2`)
1711 
1712  :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
1713  :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
1714  :param chem_mapping: See :attr:`QSscorer.chem_mapping`
1715  :param max_ca_per_chain: See :attr:`QSscorer.max_ca_per_chain_for_cm`
1716  """
1717  # make sure name doesn't contain spaces and is unique
1718  def _FixName(seq_name, seq_names):
1719  # get rid of spaces and make it unique
1720  seq_name = seq_name.replace(' ', '-')
1721  while seq_name in seq_names:
1722  seq_name += '-'
1723  return seq_name
1724  # resulting views into CA entities using CA chain sequences
1725  ent_view_1 = qs_ent_1.ca_entity.CreateEmptyView()
1726  ent_view_2 = qs_ent_2.ca_entity.CreateEmptyView()
1727  ca_chains_1 = qs_ent_1.ca_chains
1728  ca_chains_2 = qs_ent_2.ca_chains
1729  # go through all mapped chemical groups
1730  for group_1, group_2 in chem_mapping.iteritems():
1731  # MSA with ClustalW
1732  seq_list = seq.CreateSequenceList()
1733  # keep sequence-name (must be unique) to view mapping
1734  seq_to_empty_view = dict()
1735  for ch in group_1:
1736  sequence = ca_chains_1[ch].Copy()
1737  sequence.name = _FixName(qs_ent_1.GetName() + '.' + ch, seq_to_empty_view)
1738  seq_to_empty_view[sequence.name] = ent_view_1
1739  seq_list.AddSequence(sequence)
1740  for ch in group_2:
1741  sequence = ca_chains_2[ch].Copy()
1742  sequence.name = _FixName(qs_ent_2.GetName() + '.' + ch, seq_to_empty_view)
1743  seq_to_empty_view[sequence.name] = ent_view_2
1744  seq_list.AddSequence(sequence)
1745  alnc = ClustalW(seq_list, clustalw=clustalw_bin)
1746 
1747  # collect aligned residues (list of lists of sequence_count valid residues)
1748  residue_lists = list()
1749  sequence_count = alnc.sequence_count
1750  for col in alnc:
1751  residues = [col.GetResidue(ir) for ir in range(sequence_count)]
1752  if all([r.IsValid() for r in residues]):
1753  residue_lists.append(residues)
1754  # check number of aligned residues
1755  if len(residue_lists) < 5:
1756  raise QSscoreError('Not enough aligned residues.')
1757  elif max_ca_per_chain > 0:
1758  residue_lists = _SelectFew(residue_lists, max_ca_per_chain)
1759  # check what view is for which residue
1760  res_views = [seq_to_empty_view[alnc.sequences[ir].name] \
1761  for ir in range(sequence_count)]
1762  # add to view (note: only 1 CA atom per residue in here)
1763  for res_list in residue_lists:
1764  for res, view in zip(res_list, res_views):
1765  view.AddResidue(res, mol.ViewAddFlag.INCLUDE_ATOMS)
1766  # done
1767  return ent_view_1, ent_view_2
1768 
1769 
1770 def _FindSymmetry(qs_ent_1, qs_ent_2, ent_to_cm_1, ent_to_cm_2, chem_mapping):
1771  """
1772  :return: A pair of comparable symmetry groups (for :attr:`QSscorer.symm_1`
1773  and :attr:`QSscorer.symm_2`) between the two structures.
1774  Empty lists if no symmetry identified.
1775 
1776  :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
1777  :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
1778  :param ent_to_cm_1: See :attr:`QSscorer.ent_to_cm_1`
1779  :param ent_to_cm_2: See :attr:`QSscorer.ent_to_cm_2`
1780  :param chem_mapping: See :attr:`QSscorer.chem_mapping`
1781  """
1782  LogInfo('Identifying Symmetry Groups...')
1783 
1784  # get possible symmetry groups
1785  symm_subg_1 = _GetSymmetrySubgroups(qs_ent_1, ent_to_cm_1,
1786  chem_mapping.keys())
1787  symm_subg_2 = _GetSymmetrySubgroups(qs_ent_2, ent_to_cm_2,
1788  chem_mapping.values())
1789 
1790  # check combination of groups
1791  LogInfo('Selecting Symmetry Groups...')
1792  # check how many mappings are to be checked for each pair of symmetry groups
1793  # -> lower number here will speed up worst-case runtime of _GetChainMapping
1794  # NOTE: this is tailored to speed up brute force all vs all mapping test
1795  # for preferred _CheckClosedSymmetry this is suboptimal!
1796  best_symm = []
1797  for symm_1, symm_2 in itertools.product(symm_subg_1, symm_subg_2):
1798  s1 = symm_1[0]
1799  s2 = symm_2[0]
1800  if len(s1) != len(s2):
1801  LogDebug('Discarded', str(symm_1), str(symm_2),
1802  ': different size of symmetry groups')
1803  continue
1804  count = _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping)
1805  nr_mapp = count['intra']['mappings'] + count['inter']['mappings']
1806  LogDebug('OK', str(symm_1), str(symm_2), ': %s mappings' % nr_mapp)
1807  best_symm.append((nr_mapp, symm_1, symm_2))
1808 
1809  for _, symm_1, symm_2 in sorted(best_symm):
1810  s1 = symm_1[0]
1811  s2 = symm_2[0]
1812  group_1 = ent_to_cm_1.Select('cname=%s' % _FixSelectChainNames(s1))
1813  group_2 = ent_to_cm_2.Select('cname=%s' % _FixSelectChainNames(s2))
1814  # check if by superposing a pair of chains within the symmetry group to
1815  # superpose all chains within the symmetry group
1816  # -> if successful, the symmetry groups are compatible
1817  closed_symm = _CheckClosedSymmetry(group_1, group_2, symm_1, symm_2,
1818  chem_mapping, 6, 0.8, find_best=False)
1819 
1820  if closed_symm:
1821  return symm_1, symm_2
1822 
1823  # nothing found
1824  LogInfo('No coherent symmetry identified between structures')
1825  return [], []
1826 
1827 
1828 def _GetChainMapping(ent_1, ent_2, symm_1, symm_2, chem_mapping,
1829  max_mappings_extensive):
1830  """
1831  :return: Tuple with mapping from *ent_1* to *ent_2* (see
1832  :attr:`QSscorer.chain_mapping`) and scheme used (see
1833  :attr:`QSscorer.chain_mapping_scheme`)
1834 
1835  :param ent_1: See :attr:`QSscorer.ent_to_cm_1`
1836  :param ent_2: See :attr:`QSscorer.ent_to_cm_2`
1837  :param symm_1: See :attr:`QSscorer.symm_1`
1838  :param symm_2: See :attr:`QSscorer.symm_2`
1839  :param chem_mapping: See :attr:`QSscorer.chem_mapping`
1840  :param max_mappings_extensive: See :attr:`QSscorer.max_mappings_extensive`
1841  """
1842  LogInfo('Symmetry-groups used in %s: %s' % (ent_1.GetName(), str(symm_1)))
1843  LogInfo('Symmetry-groups used in %s: %s' % (ent_2.GetName(), str(symm_2)))
1844 
1845  # quick check for closed symmetries
1846  thresholds = [( 'strict', 4, 0.8),
1847  ( 'tolerant', 6, 0.4),
1848  ('permissive', 8, 0.2)]
1849  for scheme, sup_thr, sup_fract in thresholds:
1850  # check if by superposing a pair of chains within the symmetry group to
1851  # superpose all chains of the two oligomers
1852  # -> this also returns the resulting mapping of chains
1853  mapping = _CheckClosedSymmetry(ent_1, ent_2, symm_1, symm_2,
1854  chem_mapping, sup_thr, sup_fract)
1855  if mapping:
1856  LogInfo('Closed Symmetry with %s parameters' % scheme)
1857  if scheme == 'permissive':
1858  LogWarning('Permissive thresholds used for overlap based mapping ' + \
1859  'detection: check mapping manually: %s' % mapping)
1860  return mapping, scheme
1861 
1862  # NOTE that what follows below is sub-optimal:
1863  # - if the two structures don't fit at all, we may map chains rather randomly
1864  # - we may need a lot of operations to find globally "best" mapping
1865  # -> COST = O(N^2) * O(N!)
1866  # (first = all pairwise chain-superpose, latter = all chain mappings)
1867  # -> a greedy chain mapping choice algo (pick best RMSD for each one-by-one)
1868  # could reduce this to O(N^2) * O(N^2) but it would need to be validated
1869  # - one could also try some other heuristic based on center of atoms of chains
1870  # -> at this point we get a bad mapping anyways...
1871 
1872  # if we get here, we will need to check many combinations of mappings
1873  # -> check how many mappings must be checked and limit
1874  count = _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping)
1875  LogInfo('Intra Symmetry-group mappings to check: %s' \
1876  % count['intra']['mappings'])
1877  LogInfo('Inter Symmetry-group mappings to check: %s' \
1878  % count['inter']['mappings'])
1879  nr_mapp = count['intra']['mappings'] + count['inter']['mappings']
1880  if nr_mapp > max_mappings_extensive:
1881  raise QSscoreError('Too many possible mappings: %s' % nr_mapp)
1882 
1883  # to speed up the computations we cache chain views and RMSDs
1884  cached_rmsd = _CachedRMSD(ent_1, ent_2)
1885 
1886  # get INTRA symmetry group chain mapping
1887  check = 0
1888  intra_mappings = [] # list of (RMSD, c1, c2, mapping) tuples (best superpose)
1889  # limit chem mapping to reference symmetry group
1890  ref_symm_1 = symm_1[0]
1891  ref_symm_2 = symm_2[0]
1892  asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
1893  # for each chemically identical group
1894  for g1, g2 in asu_chem_mapping.iteritems():
1895  # try to superpose all possible pairs
1896  for c1, c2 in itertools.product(g1, g2):
1897  # get superposition transformation
1898  LogVerbose('Superposing chains: %s, %s' % (c1, c2))
1899  res = cached_rmsd.GetSuperposition(c1, c2)
1900  # compute RMSD of possible mappings
1901  cur_mappings = [] # list of (RMSD, mapping) tuples
1902  for mapping in _ListPossibleMappings(c1, c2, asu_chem_mapping):
1903  check += 1
1904  chains_rmsd = cached_rmsd.GetMappedRMSD(mapping, res.transformation)
1905  cur_mappings.append((chains_rmsd, mapping))
1906  LogVerbose(str(mapping), str(chains_rmsd))
1907  best_rmsd, best_mapp = min(cur_mappings)
1908  intra_mappings.append((best_rmsd, c1, c2, best_mapp))
1909  # best chain-chain superposition defines the intra asu mapping
1910  _, intra_asu_c1, intra_asu_c2, intra_asu_mapp = min(intra_mappings)
1911 
1912  # if only one asu is present in any of the complexes, we're done
1913  if len(symm_1) == 1 or len(symm_2) == 1:
1914  mapping = intra_asu_mapp
1915  else:
1916  # the mapping is the element position within the asu chain list
1917  # -> this speed up a lot, assuming that the order of chain in asu groups
1918  # is following the order of symmetry operations
1919  index_asu_c1 = ref_symm_1.index(intra_asu_c1)
1920  index_asu_c2 = ref_symm_2.index(intra_asu_c2)
1921  index_mapp = {ref_symm_1.index(s1): ref_symm_2.index(s2) \
1922  for s1, s2 in intra_asu_mapp.iteritems()}
1923  LogInfo('Intra symmetry-group mapping: %s' % str(intra_asu_mapp))
1924 
1925  # get INTER symmetry group chain mapping
1926  # we take the first symmetry group from the complex having the most
1927  if len(symm_1) < len(symm_2):
1928  symm_combinations = list(itertools.product(symm_1, [symm_2[0]]))
1929  else:
1930  symm_combinations = list(itertools.product([symm_1[0]], symm_2))
1931 
1932  full_asu_mapp = {tuple(symm_1): tuple(symm_2)}
1933  full_mappings = [] # list of (RMSD, mapping) tuples
1934  for s1, s2 in symm_combinations:
1935  # get superposition transformation (we take best chain-pair in asu)
1936  LogVerbose('Superposing symmetry-group: %s, %s in %s, %s' \
1937  % (s1[index_asu_c1], s2[index_asu_c2], s1, s2))
1938  res = cached_rmsd.GetSuperposition(s1[index_asu_c1], s2[index_asu_c2])
1939  # compute RMSD of possible mappings
1940  for asu_mapp in _ListPossibleMappings(s1, s2, full_asu_mapp):
1941  check += 1
1942  # need to extract full chain mapping (use indexing)
1943  mapping = {}
1944  for a, b in asu_mapp.iteritems():
1945  for id_a, id_b in index_mapp.iteritems():
1946  mapping[a[id_a]] = b[id_b]
1947  chains_rmsd = cached_rmsd.GetMappedRMSD(mapping, res.transformation)
1948  full_mappings.append((chains_rmsd, mapping))
1949  LogVerbose(str(mapping), str(chains_rmsd))
1950 
1951  if check != nr_mapp:
1952  LogWarning('Mapping number estimation was wrong') # sanity check
1953 
1954  # return best (lowest RMSD) mapping
1955  mapping = min(full_mappings)[1]
1956 
1957  LogWarning('Extensive search used for mapping detection (fallback). This ' + \
1958  'approach has known limitations. Check mapping manually: %s' \
1959  % mapping)
1960  return mapping, 'extensive'
1961 
1962 
1963 def _GetSymmetrySubgroups(qs_ent, ent, chem_groups):
1964  """
1965  Identify the symmetry (either cyclic C or dihedral D) of the protein and find
1966  all possible symmetry subgroups. This is done testing all combination of chain
1967  superposition and clustering them by the angles (D) or the axis (C) of the Rt
1968  operator.
1969 
1970  Groups of superposition which can fully reconstruct the structure are possible
1971  symmetry subgroups.
1972 
1973  :param qs_ent: Entity with cached angles and axis.
1974  :type qs_ent: :class:`QSscoreEntity`
1975  :param ent: Entity from which to extract chains (perfect alignment assumed
1976  for superposition as in :attr:`QSscorer.ent_to_cm_1`).
1977  :type ent: :class:`EntityHandle` or :class:`EntityView`
1978  :param chem_groups: List of tuples/lists of chain names in *ent*. Each list
1979  contains all chains belonging to a chem. group (could be
1980  from :attr:`QSscoreEntity.chem_groups` or from "keys()"
1981  or "values()" of :attr:`QSscorer.chem_mapping`)
1982 
1983  :return: A list of possible symmetry subgroups (each in same format as
1984  :attr:`QSscorer.symm_1`). If no symmetry is found, we return a list
1985  with a single symmetry subgroup with a single group with all chains.
1986  """
1987  # get angles / axis for pairwise transformations within same chem. group
1988  angles = {}
1989  axis = {}
1990  for cnames in chem_groups:
1991  for c1, c2 in itertools.combinations(cnames, 2):
1992  cur_angles = qs_ent.GetAngles(c1, c2)
1993  if not np.any(np.isnan(cur_angles)):
1994  angles[(c1,c2)] = cur_angles
1995  cur_axis = qs_ent.GetAxis(c1, c2)
1996  if not np.any(np.isnan(cur_axis)):
1997  axis[(c1,c2)] = cur_axis
1998 
1999  # cluster superpositions angles at different thresholds
2000  symm_groups = []
2001  LogVerbose('Possible symmetry-groups for: %s' % ent.GetName())
2002  for angle_thr in np.arange(0.1, 1, 0.1):
2003  d_symm_groups = _GetDihedralSubgroups(ent, chem_groups, angles, angle_thr)
2004  if d_symm_groups:
2005  for group in d_symm_groups:
2006  if group not in symm_groups:
2007  symm_groups.append(group)
2008  LogVerbose('Dihedral: %s' % group)
2009  LogInfo('Symmetry threshold %.1f used for angles of %s' \
2010  % (angle_thr, ent.GetName()))
2011  break
2012 
2013  # cluster superpositions axis at different thresholds
2014  for axis_thr in np.arange(0.1, 1, 0.1):
2015  c_symm_groups = _GetCyclicSubgroups(ent, chem_groups, axis, axis_thr)
2016  if c_symm_groups:
2017  for group in c_symm_groups:
2018  if group not in symm_groups:
2019  symm_groups.append(group)
2020  LogVerbose('Cyclic: %s' % group)
2021  LogInfo('Symmetry threshold %.1f used for axis of %s' \
2022  % (axis_thr, ent.GetName()))
2023  break
2024 
2025  # fall back to single "group" containing all chains if none found
2026  if not symm_groups:
2027  LogInfo('No symmetry-group detected in %s' % ent.GetName())
2028  symm_groups = [[tuple([c for g in chem_groups for c in g])]]
2029 
2030  return symm_groups
2031 
2032 def _GetDihedralSubgroups(ent, chem_groups, angles, angle_thr):
2033  """
2034  :return: Dihedral subgroups for :func:`_GetSymmetrySubgroups`
2035  (same return type as there). Empty list if fail.
2036 
2037  :param ent: See :func:`_GetSymmetrySubgroups`
2038  :param chem_groups: See :func:`_GetSymmetrySubgroups`
2039  :param angles: :class:`dict` (key = chain-pair-tuple, value = Euler angles)
2040  :param angle_thr: Euler angles distance threshold for clustering (float).
2041  """
2042  # cluster superpositions angles
2043  if len(angles) == 0:
2044  # nothing to be done here
2045  return []
2046  else:
2047  same_angles = _ClusterData(angles, angle_thr, _AngleArrayDistance)
2048 
2049  # get those which are non redundant and covering all chains
2050  symm_groups = []
2051  for clst in same_angles.values():
2052  group = clst.keys()
2053  if _ValidChainGroup(group, ent):
2054  if len(chem_groups) > 1:
2055  # if hetero, we want to group toghether different chains only
2056  symm_groups.append(zip(*group))
2057  else:
2058  # if homo, we also want pairs
2059  symm_groups.append(group)
2060  symm_groups.append(zip(*group))
2061  return symm_groups
2062 
2063 def _GetCyclicSubgroups(ent, chem_groups, axis, axis_thr):
2064  """
2065  :return: Cyclic subgroups for :func:`_GetSymmetrySubgroups`
2066  (same return type as there). Empty list if fail.
2067 
2068  :param ent: See :func:`_GetSymmetrySubgroups`
2069  :param chem_groups: See :func:`_GetSymmetrySubgroups`
2070  :param angles: :class:`dict` (key = chain-pair-tuple, value = rotation axis)
2071  :param angle_thr: Axis distance threshold for clustering (float).
2072  """
2073  # cluster superpositions axis
2074  if len(axis) == 0:
2075  # nothing to be done here
2076  return []
2077  else:
2078  same_axis = _ClusterData(axis, axis_thr, _AxisDistance)
2079 
2080  # use to get grouping
2081  symm_groups = []
2082  for clst in same_axis.values():
2083  all_chain = [item for sublist in clst.keys() for item in sublist]
2084  if len(set(all_chain)) == ent.chain_count:
2085  # if we have an hetero we can exploit cyclic symmetry for grouping
2086  if len(chem_groups) > 1:
2087  ref_chem_group = chem_groups[0]
2088  # try two criteria for grouping
2089  cyclic_group_closest = []
2090  cyclic_group_iface = []
2091  for c1 in ref_chem_group:
2092  group_closest = [c1]
2093  group_iface = [c1]
2094  for chains in chem_groups[1:]:
2095  # center of atoms distance
2096  closest = _GetClosestChain(ent, c1, chains)
2097  # chain with bigger interface
2098  closest_iface = _GetClosestChainInterface(ent, c1, chains)
2099  group_closest.append(closest)
2100  group_iface.append(closest_iface)
2101  cyclic_group_closest.append(tuple(group_closest))
2102  cyclic_group_iface.append(tuple(group_iface))
2103  if _ValidChainGroup(cyclic_group_closest, ent):
2104  symm_groups.append(cyclic_group_closest)
2105  if _ValidChainGroup(cyclic_group_iface, ent):
2106  symm_groups.append(cyclic_group_iface)
2107  # if it is a homo, there's not much we can group
2108  else:
2109  symm_groups.append(chem_groups)
2110  return symm_groups
2111 
2112 def _ClusterData(data, thr, metric):
2113  """Wrapper for fclusterdata to get dict of clusters.
2114 
2115  :param data: :class:`dict` (keys for ID, values used for clustering)
2116  :return: :class:`dict` {cluster_idx: {data-key: data-value}}
2117  """
2118  # special case length 1
2119  if len(data) == 1:
2120  return {0: {data.keys()[0]: data.values()[0]} }
2121  # do the clustering
2122  cluster_indices = fclusterdata(np.asarray(data.values()), thr,
2123  method='complete', criterion='distance',
2124  metric=metric)
2125  # fclusterdata output is cluster ids -> put into dict of clusters
2126  res = {}
2127  for cluster_idx, data_key in zip(cluster_indices, data.keys()):
2128  if not cluster_idx in res:
2129  res[cluster_idx] = {}
2130  res[cluster_idx][data_key] = data[data_key]
2131  return res
2132 
2133 def _AngleArrayDistance(u, v):
2134  """
2135  :return: Average angular distance of two arrays of angles.
2136  :param u: Euler angles.
2137  :param v: Euler angles.
2138  """
2139  dist = []
2140  for a,b in zip(u,v):
2141  delta = abs(a-b)
2142  if delta > np.pi:
2143  delta = abs(2*np.pi - delta)
2144  dist.append(delta)
2145  return np.mean(dist)
2146 
2147 def _AxisDistance(u, v):
2148  """
2149  :return: Euclidean distance between two rotation axes. Axes can point in
2150  either direction so we ensure to use the closer one.
2151  :param u: Rotation axis.
2152  :param v: Rotation axis.
2153  """
2154  # get axes as arrays (for convenience)
2155  u = np.asarray(u)
2156  v = np.asarray(v)
2157  # get shorter of two possible distances (using v or -v)
2158  dv1 = u - v
2159  dv2 = u + v
2160  d1 = np.dot(dv1, dv1)
2161  d2 = np.dot(dv2, dv2)
2162  return np.sqrt(min(d1, d2))
2163 
2164 def _GetClosestChain(ent, ref_chain, chains):
2165  """
2166  :return: Chain closest to *ref_chain* based on center of atoms distance.
2167  :rtype: :class:`str`
2168  :param ent: See :func:`_GetSymmetrySubgroups`
2169  :param ref_chain: We look for chains closest to this one
2170  :type ref_chain: :class:`str`
2171  :param chains: We only consider these chains
2172  :type chains: :class:`list` of :class:`str`
2173  """
2174  contacts = []
2175  ref_center = ent.FindChain(ref_chain).center_of_atoms
2176  for ch in chains:
2177  center = ent.FindChain(ch).center_of_atoms
2178  contacts.append((geom.Distance(ref_center, center), ch))
2179  closest_chain = min(contacts)[1]
2180  return closest_chain
2181 
2182 def _GetClosestChainInterface(ent, ref_chain, chains):
2183  """
2184  :return: Chain with biggest interface (within 10 A) to *ref_chain*.
2185  :rtype: :class:`str`
2186  :param ent: See :func:`_GetSymmetrySubgroups`
2187  :param ref_chain: We look for chains closest to this one
2188  :type ref_chain: :class:`str`
2189  :param chains: We only consider these chains
2190  :type chains: :class:`list` of :class:`str`
2191  """
2192  # NOTE: this is computed on a subset of the full biounit and might be
2193  # inaccurate. Also it could be extracted from QSscoreEntity.contacts.
2194  closest = []
2195  for ch in chains:
2196  iface_view = ent.Select('cname="%s" and 10 <> [cname="%s"]' \
2197  % (ref_chain, ch))
2198  nr_res = iface_view.residue_count
2199  closest.append((nr_res, ch))
2200  closest_chain = max(closest)[1]
2201  return closest_chain
2202 
2203 def _ValidChainGroup(group, ent):
2204  """
2205  :return: True, if *group* has unique chain names and as many chains as *ent*
2206  :rtype: :class:`bool`
2207  :param group: Symmetry groups to check
2208  :type group: Same as :attr:`QSscorer.symm_1`
2209  :param ent: See :func:`_GetSymmetrySubgroups`
2210  """
2211  all_chain = [item for sublist in group for item in sublist]
2212  if len(all_chain) == len(set(all_chain)) and\
2213  len(all_chain) == ent.chain_count:
2214  return True
2215  return False
2216 
2217 def _LimitChemMapping(chem_mapping, limit_1, limit_2):
2218  """
2219  :return: Chem. mapping containing only chains in *limit_1* and *limit_2*
2220  :rtype: Same as :attr:`QSscorer.chem_mapping`
2221  :param chem_mapping: See :attr:`QSscorer.chem_mapping`
2222  :param limit_1: Limits chain names in chem_mapping.keys()
2223  :type limit_1: List/tuple of strings
2224  :param limit_2: Limits chain names in chem_mapping.values()
2225  :type limit_2: List/tuple of strings
2226  """
2227  # use set intersection to keep only mapping for chains in limit_X
2228  limit_1_set = set(limit_1)
2229  limit_2_set = set(limit_2)
2230  asu_chem_mapping = dict()
2231  for key, value in chem_mapping.iteritems():
2232  new_key = tuple(set(key) & limit_1_set)
2233  if new_key:
2234  asu_chem_mapping[new_key] = tuple(set(value) & limit_2_set)
2235  return asu_chem_mapping
2236 
2237 
2238 def _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping):
2239  """
2240  :return: Dictionary of number of mappings and superpositions to be performed.
2241  Returned as *result[X][Y] = number* with X = "intra" or "inter" and
2242  Y = "mappings" or "superpositions". The idea is that for each
2243  pairwise superposition we check all possible mappings.
2244  We can check the combinations within (intra) a symmetry group and
2245  once established, we check the combinations between different (inter)
2246  symmetry groups.
2247  :param symm_1: See :attr:`QSscorer.symm_1`
2248  :param symm_2: See :attr:`QSscorer.symm_2`
2249  :param chem_mapping: See :attr:`QSscorer.chem_mapping`
2250  """
2251  # setup
2252  c = {}
2253  c['intra'] = {}
2254  c['inter'] = {}
2255  c['intra']['mappings'] = 0
2256  c['intra']['superpositions'] = 0
2257  c['inter']['mappings'] = 0
2258  c['inter']['superpositions'] = 0
2259  # intra ASU mappings within reference symmetry groups
2260  ref_symm_1 = symm_1[0]
2261  ref_symm_2 = symm_2[0]
2262  asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
2263  for g1, g2 in asu_chem_mapping.iteritems():
2264  chain_superpositions = len(g1) * len(g2)
2265  c['intra']['superpositions'] += chain_superpositions
2266  map_per_sup = _PermutationOrCombinations(g1[0], g2[0], asu_chem_mapping)
2267  c['intra']['mappings'] += chain_superpositions * map_per_sup
2268  if len(symm_1) == 1 or len(symm_2) == 1:
2269  return c
2270  # inter ASU mappings
2271  asu_superposition = min(len(symm_1), len(symm_2))
2272  c['inter']['superpositions'] = asu_superposition
2273  asu = {tuple(symm_1): tuple(symm_2)}
2274  map_per_sup = _PermutationOrCombinations(ref_symm_1, ref_symm_2, asu)
2275  c['inter']['mappings'] = asu_superposition * map_per_sup
2276  return c
2277 
2278 def _PermutationOrCombinations(sup1, sup2, chem_mapping):
2279  """Should match len(_ListPossibleMappings(sup1, sup2, chem_mapping))."""
2280  # remove superposed elements, put smallest complex as key
2281  chem_map = {}
2282  for a,b in chem_mapping.iteritems():
2283  new_a = tuple([x for x in a if x != sup1])
2284  new_b = tuple([x for x in b if x != sup2])
2285  chem_map[new_a] = new_b
2286  mapp_nr = 1.0
2287  for a, b in chem_map.iteritems():
2288  if len(a) == len(b):
2289  mapp_nr *= factorial(len(a))
2290  elif len(a) < len(b):
2291  mapp_nr *= binom(len(b), len(a))
2292  elif len(a) > len(b):
2293  mapp_nr *= binom(len(a), len(b))
2294  return int(mapp_nr)
2295 
2296 def _ListPossibleMappings(sup1, sup2, chem_mapping):
2297  """
2298  Return a flat list of all possible mappings given *chem_mapping* and keeping
2299  mapping of *sup1* and *sup2* fixed. For instance if elements are chain names
2300  this is all the mappings to check for a given superposition.
2301 
2302  Elements in first complex are defined by *chem_mapping.keys()* (list of list
2303  of elements) and elements in second complex by *chem_mapping.values()*. If
2304  complexes don't have same number of elements, we map only elements for the one
2305  with less. Also mapping is only between elements of mapped groups according to
2306  *chem_mapping*.
2307 
2308  :rtype: :class:`list` of :class:`dict` (key = element in chem_mapping-key,
2309  value = element in chem_mapping-value)
2310  :param sup1: Element for which mapping is fixed.
2311  :type sup1: Like element in chem_mapping-key
2312  :param sup2: Element for which mapping is fixed.
2313  :type sup2: Like element in chem_mapping-value
2314  :param chem_mapping: Defines mapping between groups of elements (e.g. result
2315  of :func:`_LimitChemMapping`).
2316  :type chem_mapping: :class:`dict` with key / value = :class:`tuple`
2317 
2318  :raises: :class:`QSscoreError` if reference complex (first one or one with
2319  less elements) has more elements for any given mapped group.
2320  """
2321  # find smallest complex
2322  chain1 = [i for s in chem_mapping.keys() for i in s]
2323  chain2 = [i for s in chem_mapping.values() for i in s]
2324  swap = False
2325  if len(chain1) > len(chain2):
2326  swap = True
2327  # remove superposed elements, put smallest complex as key
2328  chem_map = {}
2329  for a, b in chem_mapping.iteritems():
2330  new_a = tuple([x for x in a if x != sup1])
2331  new_b = tuple([x for x in b if x != sup2])
2332  if swap:
2333  chem_map[new_b] = new_a
2334  else:
2335  chem_map[new_a] = new_b
2336  # generate permutations or combinations of chemically
2337  # equivalent chains
2338  chem_perm = []
2339  chem_ref = []
2340  for a, b in chem_map.iteritems():
2341  if len(a) == len(b):
2342  chem_perm.append(list(itertools.permutations(b)))
2343  chem_ref.append(a)
2344  elif len(a) < len(b):
2345  chem_perm.append(list(itertools.combinations(b, len(a))))
2346  chem_ref.append(a)
2347  else:
2348  raise QSscoreError('Impossible to define reference group: %s' \
2349  % str(chem_map))
2350  # save the list of possible mappings
2351  mappings = []
2352  flat_ref = [i for s in chem_ref for i in s]
2353  for perm in itertools.product(*chem_perm):
2354  flat_perm = [i for s in perm for i in s]
2355  d = {c1: c2 for c1, c2 in zip(flat_ref, flat_perm)}
2356  if swap:
2357  d = {v: k for k, v in d.items()}
2358  d.update({sup1: sup2})
2359  mappings.append(d)
2360  return mappings
2361 
2362 
2363 def _CheckClosedSymmetry(ent_1, ent_2, symm_1, symm_2, chem_mapping,
2364  sup_thr=4, sup_fract=0.8, find_best=True):
2365  """
2366  Quick check if we can superpose two chains and get a mapping for all other
2367  chains using the same transformation. The mapping is defined by sufficient
2368  overlap of the transformed chain of *ent_1* onto another chain in *ent_2*.
2369 
2370  :param ent_1: Entity to map to *ent_2* (perfect alignment assumed between
2371  chains of same chem. group as in :attr:`QSscorer.ent_to_cm_1`).
2372  Views are ok but only to select full chains!
2373  :param ent_2: Entity to map to (perfect alignment assumed between
2374  chains of same chem. group as in :attr:`QSscorer.ent_to_cm_2`).
2375  Views are ok but only to select full chains!
2376  :param symm_1: Symmetry groups to use. We only superpose chains within
2377  reference symmetry group of *symm_1* and *symm_2*.
2378  See :attr:`QSscorer.symm_1`
2379  :param symm_2: See :attr:`QSscorer.symm_2`
2380  :param chem_mapping: See :attr:`QSscorer.chem_mapping`.
2381  All chains in *ent_1* / *ent_2* must be listed here!
2382  :param sup_thr: Distance around transformed chain in *ent_1* to check for
2383  overlap.
2384  :type sup_thr: :class:`float`
2385  :param sup_fract: Fraction of atoms in chain of *ent_2* that must be
2386  overlapped for overlap to be sufficient.
2387  :type sup_fract: :class:`float`
2388  :param find_best: If True, we look for best mapping according to
2389  :func:`_GetMappedRMSD`. Otherwise, we return first suitable
2390  mapping.
2391  :type find_best: :class:`bool`
2392 
2393  :return: Mapping from *ent_1* to *ent_2* or None if none found. Mapping, if
2394  found, is as in :attr:`QSscorer.chain_mapping`.
2395  :rtype: :class:`dict` (:class:`str` / :class:`str`)
2396  """
2397  # limit chem mapping to reference symmetry group
2398  ref_symm_1 = symm_1[0]
2399  ref_symm_2 = symm_2[0]
2400  asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
2401  # for each chemically identical group
2402  rmsd_mappings = []
2403  for g1, g2 in asu_chem_mapping.iteritems():
2404  # try to superpose all possible pairs
2405  # -> note that some chain-chain combinations may work better than others
2406  # to superpose the full oligomer (e.g. if some chains are open/closed)
2407  for c1, c2 in itertools.product(g1, g2):
2408  # get superposition transformation
2409  chain_1 = ent_1.Select('cname="%s"' % c1)
2410  chain_2 = ent_2.Select('cname="%s"' % c2)
2411  res = mol.alg.SuperposeSVD(chain_1, chain_2, apply_transform=False)
2412  # look for overlaps
2413  mapping = _GetSuperpositionMapping(ent_1, ent_2, chem_mapping,
2414  res.transformation, sup_thr,
2415  sup_fract)
2416  if not mapping:
2417  continue
2418  # early abort if we only want the first one
2419  if not find_best:
2420  return mapping
2421  else:
2422  # get RMSD for sorting
2423  rmsd = _GetMappedRMSD(ent_1, ent_2, mapping, res.transformation)
2424  rmsd_mappings.append((rmsd, mapping))
2425  # return best mapping
2426  if rmsd_mappings:
2427  return min(rmsd_mappings)[1]
2428  else:
2429  return None
2430 
2431 def _GetSuperpositionMapping(ent_1, ent_2, chem_mapping, transformation,
2432  sup_thr, sup_fract):
2433  """
2434  :return: Dict with chain mapping from *ent_1* to *ent_2* or None if failed
2435  (see :func:`_CheckClosedSymmetry`).
2436  :param ent_1: See :func:`_CheckClosedSymmetry`
2437  :param ent_2: See :func:`_CheckClosedSymmetry`
2438  :param chem_mapping: See :func:`_CheckClosedSymmetry`
2439  :param transformation: Superposition transformation to be applied to *ent_1*.
2440  :param sup_thr: See :func:`_CheckClosedSymmetry`
2441  :param sup_fract: See :func:`_CheckClosedSymmetry`
2442  """
2443  # NOTE: this seems to be the comput. most expensive part in QS scoring
2444  # -> it could be moved to C++ for speed-up
2445  # -> the next expensive bits are ClustalW and GetContacts btw...
2446 
2447  # swap if needed (ent_1 must be shorter or same)
2448  if ent_1.chain_count > ent_2.chain_count:
2449  swap = True
2450  ent_1, ent_2 = ent_2, ent_1
2451  transformation = geom.Invert(transformation)
2452  chem_pairs = zip(chem_mapping.values(), chem_mapping.keys())
2453  else:
2454  swap = False
2455  chem_pairs = chem_mapping.iteritems()
2456  # sanity check
2457  if ent_1.chain_count == 0:
2458  return None
2459  # extract valid partners for each chain in ent_1
2460  chem_partners = dict()
2461  for cg1, cg2 in chem_pairs:
2462  for ch in cg1:
2463  chem_partners[ch] = set(cg2)
2464  # get mapping for each chain in ent_1
2465  mapping = dict()
2466  mapped_chains = set()
2467  for ch_1 in ent_1.chains:
2468  # get all neighbors split by chain (NOTE: this muight be moved to C++)
2469  ch_atoms = {ch_2.name: set() for ch_2 in ent_2.chains}
2470  for a_1 in ch_1.handle.atoms:
2471  transformed_pos = geom.Vec3(transformation * geom.Vec4(a_1.pos))
2472  a_within = ent_2.FindWithin(transformed_pos, sup_thr)
2473  for a_2 in a_within:
2474  ch_atoms[a_2.chain.name].add(a_2.hash_code)
2475  # get one with most atoms in overlap
2476  max_num, max_name = max((len(atoms), name)
2477  for name, atoms in ch_atoms.iteritems())
2478  # early abort if overlap fraction not good enough
2479  ch_2 = ent_2.FindChain(max_name)
2480  if max_num == 0 or max_num / float(ch_2.handle.atom_count) < sup_fract:
2481  return None
2482  # early abort if mapped to chain of different chem. group
2483  ch_1_name = ch_1.name
2484  if ch_1_name not in chem_partners:
2485  raise RuntimeError("Chem. mapping doesn't contain all chains!")
2486  if max_name not in chem_partners[ch_1_name]:
2487  return None
2488  # early abort if multiple ones mapped to same chain
2489  if max_name in mapped_chains:
2490  return None
2491  # set mapping
2492  mapping[ch_1_name] = max_name
2493  mapped_chains.add(max_name)
2494  # unswap if needed and return
2495  if swap:
2496  mapping = {v: k for k, v in mapping.iteritems()}
2497  return mapping
2498 
2499 def _GetMappedRMSD(ent_1, ent_2, chain_mapping, transformation):
2500  """
2501  :return: RMSD between complexes considering chain mapping.
2502  :param ent_1: Entity mapped to *ent_2* (perfect alignment assumed between
2503  mapped chains as in :attr:`QSscorer.ent_to_cm_1`).
2504  :param ent_2: Entity which was mapped to (perfect alignment assumed between
2505  mapped chains as in :attr:`QSscorer.ent_to_cm_2`).
2506  :param chain_mapping: See :attr:`QSscorer.chain_mapping`
2507  :param transformation: Superposition transformation to be applied to *ent_1*.
2508  """
2509  # collect RMSDs and atom counts chain by chain and combine afterwards
2510  rmsds = []
2511  atoms = []
2512  for c1, c2 in chain_mapping.iteritems():
2513  # get views and atom counts
2514  chain_1 = ent_1.Select('cname="%s"' % c1)
2515  chain_2 = ent_2.Select('cname="%s"' % c2)
2516  atom_count = chain_1.atom_count
2517  if atom_count != chain_2.atom_count:
2518  raise RuntimeError('Chains in _GetMappedRMSD must be perfectly aligned!')
2519  # get RMSD
2520  rmsd = mol.alg.CalculateRMSD(chain_1, chain_2, transformation)
2521  # update lists
2522  rmsds.append(rmsd)
2523  atoms.append(float(atom_count))
2524  # combine (reminder: RMSD = sqrt(sum(atom_distance^2)/atom_count))
2525  rmsd = np.sqrt( sum([a * r**2 for a, r in zip(atoms, rmsds)]) / sum(atoms) )
2526  return rmsd
2527 
2529  """Helper object for repetitive RMSD calculations.
2530  Meant to speed up :func:`_GetChainMapping` but could also be used to replace
2531  :func:`_GetMappedRMSD` in :func:`_CheckClosedSymmetry`.
2532 
2533  :param ent_1: See :attr:`QSscorer.ent_to_cm_1`
2534  :param ent_2: See :attr:`QSscorer.ent_to_cm_2`
2535  """
2536  def __init__(self, ent_1, ent_2):
2537  # initialize caches and keep entities
2538  self.ent_1 = ent_1
2539  self.ent_2 = ent_2
2540  self._chain_views_1 = dict()
2541  self._chain_views_2 = dict()
2542  self._pair_rmsd = dict() # key = (c1,c2), value = (atom_count,rmsd)
2543 
2544  def GetChainView1(self, cname):
2545  """Get cached view on chain *cname* for :attr:`ent_1`."""
2546  if cname not in self._chain_views_1:
2547  self._chain_views_1[cname] = self.ent_1.Select('cname="%s"' % cname)
2548  return self._chain_views_1[cname]
2549 
2550  def GetChainView2(self, cname):
2551  """Get cached view on chain *cname* for :attr:`ent_2`."""
2552  if cname not in self._chain_views_2:
2553  self._chain_views_2[cname] = self.ent_2.Select('cname="%s"' % cname)
2554  return self._chain_views_2[cname]
2555 
2556  def GetSuperposition(self, c1, c2):
2557  """Get superposition result (no change in entities!) for *c1* to *c2*.
2558  This invalidates cached RMSD results used in :func:`GetMappedRMSD`.
2559 
2560  :param c1: chain name for :attr:`ent_1`.
2561  :param c2: chain name for :attr:`ent_2`.
2562  """
2563  # invalidate _pair_rmsd
2564  self._pair_rmsd = dict()
2565  # superpose
2566  chain_1 = self.GetChainView1(c1)
2567  chain_2 = self.GetChainView2(c2)
2568  if chain_1.atom_count != chain_2.atom_count:
2569  raise RuntimeError('Chains in GetSuperposition not perfectly aligned!')
2570  return mol.alg.SuperposeSVD(chain_1, chain_2, apply_transform=False)
2571 
2572  def GetMappedRMSD(self, chain_mapping, transformation):
2573  """
2574  :return: RMSD between complexes considering chain mapping.
2575  :param chain_mapping: See :attr:`QSscorer.chain_mapping`.
2576  :param transformation: Superposition transformation (e.g. res.transformation
2577  for res = :func:`GetSuperposition`).
2578  """
2579  # collect RMSDs and atom counts chain by chain and combine afterwards
2580  rmsds = []
2581  atoms = []
2582  for c1, c2 in chain_mapping.iteritems():
2583  # cached?
2584  if (c1, c2) in self._pair_rmsd:
2585  atom_count, rmsd = self._pair_rmsd[(c1, c2)]
2586  else:
2587  # get views and atom counts
2588  chain_1 = self.GetChainView1(c1)
2589  chain_2 = self.GetChainView2(c2)
2590  atom_count = chain_1.atom_count
2591  if atom_count != chain_2.atom_count:
2592  raise RuntimeError('Chains in GetMappedRMSD not perfectly aligned!')
2593  # get RMSD
2594  rmsd = mol.alg.CalculateRMSD(chain_1, chain_2, transformation)
2595  self._pair_rmsd[(c1, c2)] = (atom_count, rmsd)
2596  # update lists
2597  rmsds.append(rmsd)
2598  atoms.append(float(atom_count))
2599  # combine (reminder: RMSD = sqrt(sum(atom_distance^2)/atom_count))
2600  rmsd = np.sqrt( sum([a * r**2 for a, r in zip(atoms, rmsds)]) / sum(atoms) )
2601  return rmsd
2602 
2603 
2604 def _CleanUserSymmetry(symm, ent):
2605  """Clean-up user provided symmetry.
2606 
2607  :param symm: Symmetry group as in :attr:`QSscorer.symm_1`
2608  :param ent: Entity from which to extract chain names
2609  :type ent: :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
2610  :return: New symmetry group limited to chains in sub-structure *ent*
2611  (see :attr:`QSscorer.symm_1`). Empty list if invalid symmetry.
2612  """
2613  # restrict symm to only contain chains in ent
2614  chain_names = set([ch.name for ch in ent.chains])
2615  new_symm = list()
2616  for symm_group in symm:
2617  new_group = tuple(ch for ch in symm_group if ch in chain_names)
2618  if new_group:
2619  new_symm.append(new_group)
2620  # report it
2621  if new_symm != symm:
2622  LogInfo("Cleaned user symmetry for %s: %s" % (ent.GetName(), new_symm))
2623  # do all groups have same length?
2624  lengths = [len(symm_group) for symm_group in new_symm]
2625  if lengths[1:] != lengths[:-1]:
2626  LogWarning('User symmetry group of different sizes for %s. Ignoring it!' \
2627  % (ent.GetName()))
2628  return []
2629  # do we cover all chains?
2630  s_chain_names = set([ch for symm_group in new_symm for ch in symm_group])
2631  if len(s_chain_names) != sum(lengths):
2632  LogWarning('User symmetry group for %s has duplicate chains. Ignoring it!' \
2633  % (ent.GetName()))
2634  return []
2635  if s_chain_names != chain_names:
2636  LogWarning('User symmetry group for %s is missing chains. Ignoring it!' \
2637  % (ent.GetName()))
2638  return []
2639  # ok all good
2640  return new_symm
2641 
2642 def _AreValidSymmetries(symm_1, symm_2):
2643  """Check symmetry pair for major problems.
2644 
2645  :return: False if any of the two is empty or if they're compatible in size.
2646  :rtype: :class:`bool`
2647  """
2648  if not symm_1 or not symm_2:
2649  return False
2650  if len(symm_1) != 1 or len(symm_2) != 1:
2651  if not len(symm_1[0]) == len(symm_2[0]):
2652  LogWarning('Symmetry groups of different sizes. Ignoring them!')
2653  return False
2654  return True
2655 
2656 def _GetMappedAlignments(ent_1, ent_2, chain_mapping, res_num_alignment):
2657  """
2658  :return: Alignments of 2 structures given chain mapping
2659  (see :attr:`QSscorer.alignments`).
2660  :param ent_1: Entity containing all chains in *chain_mapping.keys()*.
2661  Views to this entity attached to first sequence of each aln.
2662  :param ent_2: Entity containing all chains in *chain_mapping.values()*.
2663  Views to this entity attached to second sequence of each aln.
2664  :param chain_mapping: See :attr:`QSscorer.chain_mapping`
2665  :param res_num_alignment: See :attr:`QSscorer.res_num_alignment`
2666  """
2667  alns = list()
2668  for ch_1_name in sorted(chain_mapping):
2669  # get both sequences incl. attached view
2670  ch_1 = ent_1.FindChain(ch_1_name)
2671  ch_2 = ent_2.FindChain(chain_mapping[ch_1_name])
2672  if res_num_alignment:
2673  max_res_num = max([r.number.GetNum() for r in ch_1.residues] +
2674  [r.number.GetNum() for r in ch_2.residues])
2675  ch1_aln = ["-"] * max_res_num
2676  ch2_aln = ["-"] * max_res_num
2677  for res in ch_1.residues:
2678  ch1_aln[res.number.GetNum() - 1] = res.GetOneLetterCode()
2679  ch1_aln = "".join(ch1_aln)
2680  seq_1 = seq.CreateSequence(ch_1.name, str(ch1_aln))
2681  seq_1.AttachView(ch_1.Select(""))
2682  for res in ch_2.residues:
2683  ch2_aln[res.number.GetNum() - 1] = res.GetOneLetterCode()
2684  ch2_aln = "".join(ch2_aln)
2685  seq_2 = seq.CreateSequence(ch_2.name, str(ch2_aln))
2686  seq_2.AttachView(ch_2.Select(""))
2687  # Create alignment
2688  aln = seq.CreateAlignment()
2689  aln.AddSequence(seq_1)
2690  aln.AddSequence(seq_2)
2691  else:
2692  seq_1 = seq.SequenceFromChain(ch_1.name, ch_1)
2693  seq_2 = seq.SequenceFromChain(ch_2.name, ch_2)
2694  # align them
2695  aln = _AlignAtomSeqs(seq_1, seq_2)
2696  if aln:
2697  alns.append(aln)
2698  return alns
2699 
2700 def _GetMappedResidues(alns):
2701  """
2702  :return: Mapping of shared residues in *alns* (with views attached)
2703  (see :attr:`QSscorer.mapped_residues`).
2704  :param alns: See :attr:`QSscorer.alignments`
2705  """
2706  mapped_residues = dict()
2707  for aln in alns:
2708  # prepare dict for c1
2709  c1 = aln.GetSequence(0).name
2710  mapped_residues[c1] = dict()
2711  # get shared residues
2712  v1, v2 = seq.ViewsFromAlignment(aln)
2713  for res_1, res_2 in zip(v1.residues, v2.residues):
2714  r1 = res_1.number.num
2715  r2 = res_2.number.num
2716  mapped_residues[c1][r1] = r2
2717 
2718  return mapped_residues
2719 
2720 def _GetExtraWeights(contacts, done, mapped_residues):
2721  """Return sum of extra weights for contacts of chains in set and not in done.
2722  :return: Tuple (weight_extra_mapped, weight_extra_all).
2723  weight_extra_mapped only sums if both cX,rX in mapped_residues
2724  weight_extra_all sums all
2725  :param contacts: See :func:`GetContacts` for first entity
2726  :param done: List of (c1, c2, r1, r2) tuples to ignore
2727  :param mapped_residues: See :func:`_GetMappedResidues`
2728  """
2729  weight_extra_mapped = 0
2730  weight_extra_non_mapped = 0
2731  for c1 in contacts:
2732  for c2 in contacts[c1]:
2733  for r1 in contacts[c1][c2]:
2734  for r2 in contacts[c1][c2][r1]:
2735  if (c1, c2, r1, r2) not in done:
2736  weight = _weight(contacts[c1][c2][r1][r2])
2737  if c1 in mapped_residues and r1 in mapped_residues[c1] \
2738  and c2 in mapped_residues and r2 in mapped_residues[c2]:
2739  weight_extra_mapped += weight
2740  else:
2741  weight_extra_non_mapped += weight
2742  return weight_extra_mapped, weight_extra_mapped + weight_extra_non_mapped
2743 
2744 def _GetScores(contacts_1, contacts_2, mapped_residues, chain_mapping):
2745  """Get QS scores (see :class:`QSscorer`).
2746 
2747  Note that if some chains are not to be considered at all, they must be removed
2748  from *contacts_1* / *contacts_2* prior to calling this.
2749 
2750  :param contacts_1: See :func:`GetContacts` for first entity
2751  :param contacts_2: See :func:`GetContacts` for second entity
2752  :param mapped_residues: See :func:`_GetMappedResidues`
2753  :param chain_mapping: Maps any chain name in *mapped_residues* to chain name
2754  for *contacts_2*.
2755  :type chain_mapping: :class:`dict` (:class:`str` / :class:`str`)
2756  :return: Tuple (QS_best, QS_global) (see :attr:`QSscorer.best_score` and
2757  see :attr:`QSscorer.global_score`)
2758  """
2759  # keep track of considered contacts as set of (c1,c2,r1,r2) tuples
2760  done_1 = set()
2761  done_2 = set()
2762  weighted_scores = 0
2763  weight_sum = 0
2764  # naming cXY, rXY: X = 1/2 for contact, Y = 1/2/T for mapping (T = tmp)
2765  # -> d1 = contacts_1[c11][c21][r11][r21], d2 = contacts_2[c12][c22][r12][r22]
2766  for c11 in contacts_1:
2767  # map to other one
2768  if c11 not in mapped_residues: continue
2769  c1T = chain_mapping[c11]
2770  # second chain
2771  for c21 in contacts_1[c11]:
2772  # map to other one
2773  if c21 not in mapped_residues: continue
2774  c2T = chain_mapping[c21]
2775  # flip if needed
2776  flipped_chains = (c1T > c2T)
2777  if flipped_chains:
2778  c12, c22 = c2T, c1T
2779  else:
2780  c12, c22 = c1T, c2T
2781  # skip early if no contacts there
2782  if c12 not in contacts_2 or c22 not in contacts_2[c12]: continue
2783  # loop over res.num. in c11
2784  for r11 in contacts_1[c11][c21]:
2785  # map to other one and skip if no contacts there
2786  if r11 not in mapped_residues[c11]: continue
2787  r1T = mapped_residues[c11][r11]
2788  # loop over res.num. in c21
2789  for r21 in contacts_1[c11][c21][r11]:
2790  # map to other one and skip if no contacts there
2791  if r21 not in mapped_residues[c21]: continue
2792  r2T = mapped_residues[c21][r21]
2793  # flip if needed
2794  if flipped_chains:
2795  r12, r22 = r2T, r1T
2796  else:
2797  r12, r22 = r1T, r2T
2798  # skip early if no contacts there
2799  if r12 not in contacts_2[c12][c22]: continue
2800  if r22 not in contacts_2[c12][c22][r12]: continue
2801  # ok now we can finally do the scoring
2802  d1 = contacts_1[c11][c21][r11][r21]
2803  d2 = contacts_2[c12][c22][r12][r22]
2804  weight = _weight(min(d1, d2))
2805  weighted_scores += weight * (1 - abs(d1 - d2) / 12)
2806  weight_sum += weight
2807  # keep track of done ones
2808  done_1.add((c11, c21, r11, r21))
2809  done_2.add((c12, c22, r12, r22))
2810 
2811  LogVerbose("Shared contacts: %d" % len(done_1))
2812 
2813  # add the extra weights
2814  weights_extra_1 = _GetExtraWeights(contacts_1, done_1, mapped_residues)
2815  mapped_residues_2 = dict()
2816  for c1 in mapped_residues:
2817  c2 = chain_mapping[c1]
2818  mapped_residues_2[c2] = set()
2819  for r1 in mapped_residues[c1]:
2820  mapped_residues_2[c2].add(mapped_residues[c1][r1])
2821  weights_extra_2 = _GetExtraWeights(contacts_2, done_2, mapped_residues_2)
2822  weight_extra_mapped = weights_extra_1[0] + weights_extra_2[0]
2823  weight_extra_all = weights_extra_1[1] + weights_extra_2[1]
2824 
2825  # get scores
2826  denom_best = weight_sum + weight_extra_mapped
2827  denom_all = weight_sum + weight_extra_all
2828  if denom_best == 0:
2829  QS_best = 0
2830  else:
2831  QS_best = weighted_scores / denom_best
2832  if denom_all == 0:
2833  QS_global = 0
2834  else:
2835  QS_global = weighted_scores / denom_all
2836  return QS_best, QS_global
2837 
2838 def _weight(dist):
2839  """
2840  This weight expresses the probability of two residues to interact given the CB
2841  distance (from Xu et al. http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2573399/)
2842  """
2843  if dist <= 5.0:
2844  return 1
2845  x = (dist-5.0)/4.28;
2846  return np.exp(-2*x*x)
2847 
2848 
2849 def _GetQsSuperposition(alns):
2850  """
2851  :return: Superposition result based on shared CA atoms in *alns*
2852  (with views attached) (see :attr:`QSscorer.superposition`).
2853  :param alns: See :attr:`QSscorer.alignments`
2854  """
2855  # check input
2856  if not alns:
2857  raise QSscoreError('No successful alignments for superposition!')
2858  # prepare views
2859  view_1 = alns[0].GetSequence(0).attached_view.CreateEmptyView()
2860  view_2 = alns[0].GetSequence(1).attached_view.CreateEmptyView()
2861  # collect CA from alignments in proper order
2862  for aln in alns:
2863  for col in aln:
2864  res_1 = col.GetResidue(0)
2865  res_2 = col.GetResidue(1)
2866  if res_1.IsValid() and res_2.IsValid():
2867  ca_1 = res_1.FindAtom('CA')
2868  ca_2 = res_2.FindAtom('CA')
2869  if ca_1.IsValid() and ca_2.IsValid():
2870  view_1.AddAtom(ca_1)
2871  view_2.AddAtom(ca_2)
2872  # superpose them without chainging entities
2873  res = mol.alg.SuperposeSVD(view_1, view_2, apply_transform=False)
2874  return res
2875 
2876 
2877 def _AddResidue(edi, res, rnum, chain, calpha_only):
2878  """
2879  Add residue *res* with res. num. *run* to given *chain* using editor *edi*.
2880  Either all atoms added or (if *calpha_only*) only CA.
2881  """
2882  if calpha_only:
2883  ca_atom = res.FindAtom('CA')
2884  if ca_atom.IsValid():
2885  new_res = edi.AppendResidue(chain, res.name, rnum)
2886  edi.InsertAtom(new_res, ca_atom.name, ca_atom.pos)
2887  else:
2888  new_res = edi.AppendResidue(chain, res.name, rnum)
2889  for atom in res.atoms:
2890  edi.InsertAtom(new_res, atom.name, atom.pos)
2891 
2892 def _MergeAlignedChains(alns, ent_1, ent_2, calpha_only, penalize_extra_chains):
2893  """
2894  Create two new entities (based on the alignments attached views) where all
2895  residues have same numbering (when they're aligned) and they are all pushed to
2896  a single chain X. Also append extra chains contained in *ent_1* and *ent_2*
2897  but not contained in *alns*.
2898 
2899  Used for :attr:`QSscorer.lddt_ref` and :attr:`QSscorer.lddt_mdl`
2900 
2901  :param alns: List of alignments with attached views (first sequence: *ent_1*,
2902  second: *ent_2*). Residue number in single chain is column index
2903  of current alignment + sum of lengths of all previous alignments
2904  (order of alignments as in input list).
2905  :type alns: See :attr:`QSscorer.alignments`
2906  :param ent_1: First entity to process.
2907  :type ent_1: :class:`~ost.mol.EntityHandle`
2908  :param ent_2: Second entity to process.
2909  :type ent_2: :class:`~ost.mol.EntityHandle`
2910  :param calpha_only: If True, we only include CA atoms instead of all.
2911  :type calpha_only: :class:`bool`
2912  :param penalize_extra_chains: If True, extra chains are added to model and
2913  reference. Otherwise, only mapped ones.
2914  :type penalize_extra_chains: :class:`bool`
2915 
2916  :return: Tuple of two single chain entities (from *ent_1* and from *ent_2*)
2917  :rtype: :class:`tuple` of :class:`~ost.mol.EntityHandle`
2918  """
2919  # first new entity
2920  ent_ren_1 = mol.CreateEntity()
2921  ed_1 = ent_ren_1.EditXCS()
2922  new_chain_1 = ed_1.InsertChain('X')
2923  # second one
2924  ent_ren_2 = mol.CreateEntity()
2925  ed_2 = ent_ren_2.EditXCS()
2926  new_chain_2 = ed_2.InsertChain('X')
2927  # the alignment already contains sorted chains
2928  rnum = 0
2929  chain_done_1 = set()
2930  chain_done_2 = set()
2931  for aln in alns:
2932  chain_done_1.add(aln.GetSequence(0).name)
2933  chain_done_2.add(aln.GetSequence(1).name)
2934  for col in aln:
2935  rnum += 1
2936  # add valid residues to single chain entities
2937  res_1 = col.GetResidue(0)
2938  if res_1.IsValid():
2939  _AddResidue(ed_1, res_1, rnum, new_chain_1, calpha_only)
2940  res_2 = col.GetResidue(1)
2941  if res_2.IsValid():
2942  _AddResidue(ed_2, res_2, rnum, new_chain_2, calpha_only)
2943  # extra chains?
2944  if penalize_extra_chains:
2945  for chain in ent_1.chains:
2946  if chain.name in chain_done_1:
2947  continue
2948  for res in chain.residues:
2949  rnum += 1
2950  _AddResidue(ed_1, res, rnum, new_chain_1, calpha_only)
2951  for chain in ent_2.chains:
2952  if chain.name in chain_done_2:
2953  continue
2954  for res in chain.residues:
2955  rnum += 1
2956  _AddResidue(ed_2, res, rnum, new_chain_2, calpha_only)
2957  # get entity names
2958  ent_ren_1.SetName(aln.GetSequence(0).GetAttachedView().GetName())
2959  ent_ren_2.SetName(aln.GetSequence(1).GetAttachedView().GetName())
2960  # connect atoms
2961  if not conop.GetDefaultLib():
2962  raise RuntimeError("QSscore computation requires a compound library!")
2963  pr = conop.RuleBasedProcessor(conop.GetDefaultLib())
2964  pr.Process(ent_ren_1)
2965  ed_1.UpdateICS()
2966  pr.Process(ent_ren_2)
2967  ed_2.UpdateICS()
2968  return ent_ren_1, ent_ren_2
2969 
2970 
2971 # specify public interface
2972 __all__ = ('QSscoreError', 'QSscorer', 'QSscoreEntity', 'FilterContacts',
2973  'GetContacts', 'OligoLDDTScorer', 'MappedLDDTScorer')
def _ComputeAlignedEntities
Class internal helpers (anything that doesnt easily work without this class)
Definition: qsscoring.py:544
def _GetSuperposeData
Class internal helpers (anything that doesnt easily work without this class)
Definition: qsscoring.py:818
Mat4 DLLEXPORT_OST_GEOM Invert(const Mat4 &m)
def _InitScorer
Class internal helpers (anything that doesnt easily work without this class)
Definition: qsscoring.py:1406
Real Distance(const Vec3 &p1, const Vec3 &p2)
returns the distance between two points
Definition: vecmat3_op.hh:199
Three dimensional vector class, using Real precision.
Definition: vec3.hh:42
def _PrepareOligoEntities
Class internal helpers.
Definition: qsscoring.py:1208