OpenStructure: stage/lib64/python2.7/site-packages/ost/mol/alg/qsscoring.py Source File

00001 """
00002 Scoring of quaternary structures as in Martino's 2017 paper.
00003 
00004 .. note ::
00005 
00006   Requirements for use:
00007 
00008   - A default :class:`compound library <ost.conop.CompoundLib>` must be defined
00009     and accessible via :func:`~ost.conop.GetDefaultLib`. This is set by default
00010     when executing scripts with ``ost``. Otherwise, you must set this with
00011     :func:`~ost.conop.SetDefaultLib`.
00012   - ClustalW must be installed (unless you provide chain mappings)
00013   - Python modules `numpy` and `scipy` must be installed and available
00014     (e.g. use ``pip install scipy numpy``)
00015 
00016 Authors: Gerardo Tauriello, Martino Bertoni
00017 """
00018 
00019 from ost import mol, geom, conop, seq, settings
00020 from ost import LogError, LogWarning, LogScript, LogInfo, LogVerbose, LogDebug
00021 from ost.bindings.clustalw import ClustalW
00022 import numpy as np
00023 from scipy.misc import factorial
00024 from scipy.special import binom
00025 from scipy.cluster.hierarchy import fclusterdata
00026 import itertools
00027 
00028 ###############################################################################
00029 # QS scoring
00030 ###############################################################################
00031 
00032 class QSscoreError(Exception):
00033   """Exception to be raised for "acceptable" exceptions in QS scoring.
00034 
00035   Those are cases we might want to capture for default behavior.
00036   """
00037   pass
00038 
00039 class QSscorer:
00040   """Object to compute QS scores.
00041 
00042   Simple usage without any precomputed contacts, symmetries and mappings:
00043 
00044   .. code-block:: python
00045 
00046     import ost
00047     from ost.mol.alg import qsscoring
00048 
00049     # load two biounits to compare
00050     ent_full = ost.io.LoadPDB('3ia3', remote=True)
00051     ent_1 = ent_full.Select('cname=A,D')
00052     ent_2 = ent_full.Select('cname=B,C')
00053     # get score
00054     ost.PushVerbosityLevel(3)
00055     try:
00056       qs_scorer = qsscoring.QSscorer(ent_1, ent_2)
00057       ost.LogScript('QSscore:', str(qs_scorer.global_score))
00058       ost.LogScript('Chain mapping used:', str(qs_scorer.chain_mapping))
00059     except qsscoring.QSscoreError as ex:
00060       # default handling: report failure and set score to 0
00061       ost.LogError('QSscore failed:', str(ex))
00062       qs_score = 0
00063 
00064   For maximal performance when computing QS scores of the same entity with many
00065   others, it is advisable to construct and reuse :class:`QSscoreEntity` objects.
00066 
00067   Any known / precomputed information can be filled into the appropriate
00068   attribute here (no checks done!). Otherwise most quantities are computed on
00069   first access and cached (lazy evaluation). Setters are provided to set values
00070   with extra checks (e.g. :func:`SetSymmetries`).
00071 
00072   All necessary seq. alignments are done by global BLOSUM62-based alignment. A
00073   multiple sequence alignment is performed with ClustalW unless
00074   :attr:`chain_mapping` is provided manually. You will need to have an
00075   executable ``clustalw`` or ``clustalw2`` in your ``PATH`` or you must set
00076   :attr:`clustalw_bin` accordingly. Otherwise an exception
00077   (:class:`ost.settings.FileNotFound`) is thrown.
00078 
00079   Formulas for QS scores:
00080 
00081   ::
00082   
00083     - QS_best = weighted_scores / (weight_sum + weight_extra_mapped)
00084     - QS_global = weighted_scores / (weight_sum + weight_extra_all)
00085     -> weighted_scores = sum(w(min(d1,d2)) * (1 - abs(d1-d2)/12)) for shared
00086     -> weight_sum = sum(w(min(d1,d2))) for shared
00087     -> weight_extra_mapped = sum(w(d)) for all mapped but non-shared
00088     -> weight_extra_all = sum(w(d)) for all non-shared
00089     -> w(d) = 1 if d <= 5, exp(-2 * ((d-5.0)/4.28)^2) else
00090 
00091   :param ent_1: First structure to be scored.
00092   :type ent_1:  :class:`QSscoreEntity`, :class:`~ost.mol.EntityHandle` or
00093                 :class:`~ost.mol.EntityView`
00094   :param ent_2: Second structure to be scored.
00095   :type ent_2:  :class:`QSscoreEntity`, :class:`~ost.mol.EntityHandle` or
00096                 :class:`~ost.mol.EntityView`
00097 
00098   :raises: :class:`QSscoreError` if input structures are invalid or are monomers
00099            or have issues that make it impossible for a QS score to be computed.
00100 
00101   .. attribute:: qs_ent_1
00102 
00103     :class:`QSscoreEntity` object for *ent_1* given at construction.
00104     If entity names (:attr:`~QSscoreEntity.original_name`) are not unique, we
00105     set it to 'pdb_1' using :func:`~QSscoreEntity.SetName`.
00106 
00107   .. attribute:: qs_ent_2
00108 
00109     :class:`QSscoreEntity` object for *ent_2* given at construction.
00110     If entity names (:attr:`~QSscoreEntity.original_name`) are not unique, we
00111     set it to 'pdb_2' using :func:`~QSscoreEntity.SetName`.
00112 
00113   .. attribute:: calpha_only
00114 
00115     True if any of the two structures is CA-only (after cleanup).
00116 
00117     :type: :class:`bool`
00118 
00119   .. attribute:: max_ca_per_chain_for_cm
00120 
00121     Maximal number of CA atoms to use in each chain to determine chain mappings.
00122     Setting this to -1 disables the limit. Limiting it speeds up determination
00123     of symmetries and chain mappings. By default it is set to 100.
00124 
00125     :type: :class:`int`
00126   """
00127   def __init__(self, ent_1, ent_2):
00128     # generate QSscoreEntity objects?
00129     if isinstance(ent_1, QSscoreEntity):
00130       self.qs_ent_1 = ent_1
00131     else:
00132       self.qs_ent_1 = QSscoreEntity(ent_1)
00133     if isinstance(ent_2, QSscoreEntity):
00134       self.qs_ent_2 = ent_2
00135     else:
00136       self.qs_ent_2 = QSscoreEntity(ent_2)
00137     # check validity of inputs
00138     if not self.qs_ent_1.is_valid or not self.qs_ent_2.is_valid:
00139       raise QSscoreError("Invalid input in QSscorer!")
00140     # set names for structures
00141     if self.qs_ent_1.original_name == self.qs_ent_2.original_name:
00142       self.qs_ent_1.SetName('pdb_1')
00143       self.qs_ent_2.SetName('pdb_2')
00144     else:
00145       self.qs_ent_1.SetName(self.qs_ent_1.original_name)
00146       self.qs_ent_2.SetName(self.qs_ent_2.original_name)
00147     # set other public attributes
00148     self.calpha_only = self.qs_ent_1.calpha_only or self.qs_ent_2.calpha_only
00149     self.max_ca_per_chain_for_cm = 100
00150     # init cached stuff
00151     self._chem_mapping = None
00152     self._ent_to_cm_1 = None
00153     self._ent_to_cm_2 = None
00154     self._symm_1 = None
00155     self._symm_2 = None
00156     self._chain_mapping = None
00157     self._alignments = None
00158     self._mapped_residues = None
00159     self._global_score = None
00160     self._best_score = None
00161     self._superposition = None
00162     self._lddt_score = None
00163     self._lddt_mdl = None
00164     self._lddt_ref = None
00165     self._clustalw_bin = None
00166 
00167   @property
00168   def chem_mapping(self):
00169     """Inter-complex mapping of chemical groups.
00170 
00171     Each group (see :attr:`QSscoreEntity.chem_groups`) is mapped according to
00172     highest sequence identity. Alignment is between longest sequences in groups.
00173 
00174     Limitations:
00175 
00176     - If different numbers of groups, we map only the groups for the complex
00177       with less groups (rest considered unmapped and shown as warning)
00178     - The mapping is forced: the "best" mapping will be chosen independently of
00179       how low the seq. identity may be
00180 
00181     :getter: Computed on first use (cached)
00182     :type: :class:`dict` with key = :class:`tuple` of chain names in
00183            :attr:`qs_ent_1` and value = :class:`tuple` of chain names in
00184            :attr:`qs_ent_2`.
00185 
00186     :raises: :class:`QSscoreError` if we end up having less than 2 chains for
00187              either entity in the mapping (can happen if chains do not have CA
00188              atoms).
00189     """
00190     if self._chem_mapping is None:
00191       self._chem_mapping = _GetChemGroupsMapping(self.qs_ent_1, self.qs_ent_2)
00192     return self._chem_mapping
00193 
00194   @property
00195   def ent_to_cm_1(self):
00196     """Subset of :attr:`qs_ent_1` used to compute chain mapping and symmetries.
00197 
00198     Properties:
00199 
00200     - Includes only residues aligned according to :attr:`chem_mapping`
00201     - Includes only 1 CA atom per residue
00202     - Has at least 5 and at most :attr:`max_ca_per_chain_for_cm` atoms per chain
00203     - All chains of the same chemical group have the same number of atoms
00204       (also in :attr:`ent_to_cm_2` according to :attr:`chem_mapping`)
00205     - All chains appearing in :attr:`chem_mapping` appear in this entity
00206       (so the two can be safely used together)
00207 
00208     This entity might be transformed (i.e. all positions rotated/translated by
00209     same transformation matrix) if this can speed up computations. So do not
00210     assume fixed global positions (but relative distances will remain fixed).
00211 
00212     :getter: Computed on first use (cached)
00213     :type: :class:`~ost.mol.EntityHandle`
00214 
00215     :raises: :class:`QSscoreError` if any chain ends up having less than 5 res.
00216     """
00217     if self._ent_to_cm_1 is None:
00218       self._ComputeAlignedEntities()
00219     return self._ent_to_cm_1
00220 
00221   @property
00222   def ent_to_cm_2(self):
00223     """Subset of :attr:`qs_ent_1` used to compute chain mapping and symmetries
00224     (see :attr:`ent_to_cm_1` for details).
00225     """
00226     if self._ent_to_cm_2 is None:
00227       self._ComputeAlignedEntities()
00228     return self._ent_to_cm_2
00229 
00230   @property
00231   def symm_1(self):
00232     """Symmetry groups for :attr:`qs_ent_1` used to speed up chain mapping.
00233 
00234     This is a list of chain-lists where each chain-list can be used reconstruct
00235     the others via cyclic C or dihedral D symmetry. The first chain-list is used
00236     as a representative symmetry group. For heteromers, the group-members must
00237     contain all different seqres in oligomer.
00238 
00239     Example: symm. groups [(A,B,C), (D,E,F), (G,H,I)] means that there are
00240     symmetry transformations to get (D,E,F) and (G,H,I) from (A,B,C).
00241 
00242     Properties:
00243 
00244     - All symmetry group tuples have the same length (num. of chains)
00245     - All chains in :attr:`ent_to_cm_1` appear (w/o duplicates)
00246     - For heteros: symmetry group tuples have all different chem. groups
00247     - Trivial symmetry group = one tuple with all chains (used if inconsistent
00248       data provided or if no symmetry is found)
00249     - Either compatible to :attr:`symm_2` or trivial symmetry groups used.
00250       Compatibility requires same lengths of symmetry group tuples and it must
00251       be possible to get an overlap (80% of residues covered within 6 A of a
00252       (chem. mapped) chain) of all chains in representative symmetry groups by
00253       superposing one pair of chains.
00254 
00255     :getter: Computed on first use (cached)
00256     :type: :class:`list` of :class:`tuple` of :class:`str` (chain names)
00257     """
00258     if self._symm_1 is None:
00259       self._ComputeSymmetry()
00260     return self._symm_1
00261 
00262   @property
00263   def symm_2(self):
00264     """Symmetry groups for :attr:`qs_ent_2` (see :attr:`symm_1` for details)."""
00265     if self._symm_2 is None:
00266       self._ComputeSymmetry()
00267     return self._symm_2
00268 
00269   def SetSymmetries(self, symm_1, symm_2):
00270     """Set user-provided symmetry groups.
00271 
00272     These groups are restricted to chain names appearing in :attr:`ent_to_cm_1`
00273     and :attr:`ent_to_cm_2` respectively. They are only valid if they cover all
00274     chains and both *symm_1* and *symm_2* have same lengths of symmetry group
00275     tuples. Otherwise trivial symmetry group used (see :attr:`symm_1`).
00276 
00277     :param symm_1: Value to set for :attr:`symm_1`.
00278     :param symm_2: Value to set for :attr:`symm_2`.
00279     """
00280     # restrict chain names
00281     self._symm_1 = _CleanUserSymmetry(symm_1, self.ent_to_cm_1)
00282     self._symm_2 = _CleanUserSymmetry(symm_2, self.ent_to_cm_2)
00283     # check that we have reasonable symmetries set (fallback: all chains)
00284     if not _AreValidSymmetries(self._symm_1, self._symm_2):
00285       self._symm_1 = [tuple(ch.name for ch in self.ent_to_cm_1.chains)]
00286       self._symm_2 = [tuple(ch.name for ch in self.ent_to_cm_2.chains)]
00287 
00288   @property
00289   def chain_mapping(self):
00290     """Mapping from :attr:`ent_to_cm_1` to :attr:`ent_to_cm_2`.
00291 
00292     Properties:
00293 
00294     - Mapping is between chains of same chem. group (see :attr:`chem_mapping`)
00295     - Each chain can appear only once in mapping
00296     - All chains of complex with less chains are mapped
00297     - Symmetry (:attr:`symm_1`, :attr:`symm_2`) is taken into account
00298 
00299     Details on algorithms used to find mapping:
00300 
00301     - We try all pairs of chem. mapped chains within symmetry group and get
00302       superpose-transformation for them
00303     - First option: check for "sufficient overlap" of other chain-pairs
00304 
00305       - For each chain-pair defined above: apply superposition to full oligomer
00306         and map chains based on structural overlap
00307       - Structural overlap = X% of residues in second oligomer covered within Y
00308         Angstrom of a (chem. mapped) chain in first oligomer. We successively
00309         try (X,Y) = (80,4), (40,6) and (20,8) to be less and less strict in
00310         mapping (warning shown for most permissive one).
00311       - If multiple possible mappings are found, we choose the one which leads
00312         to the lowest multi-chain-RMSD given the superposition
00313 
00314     - Fallback option: try all mappings to find minimal multi-chain-RMSD
00315       (warning shown)
00316 
00317       - For each chain-pair defined above: apply superposition, try all (!)
00318         possible chain mappings (within symmetry group) and keep mapping with
00319         lowest multi-chain-RMSD
00320       - Repeat procedure above to resolve symmetry. Within the symmetry group we
00321         can use the chain mapping computed before and we just need to find which
00322         symmetry group in first oligomer maps to which in the second one. We
00323         again try all possible combinations...
00324       - Limitations:
00325         
00326         - Trying all possible mappings is a combinatorial nightmare (factorial).
00327           We throw an exception if too many combinations (e.g. octomer vs
00328           octomer with no usable symmetry)
00329         - The mapping is forced: the "best" mapping will be chosen independently
00330           of how badly they fit in terms of multi-chain-RMSD
00331         - As a result, such a forced mapping can lead to a large range of
00332           resulting QS scores. An extreme example was observed between 1on3.1
00333           and 3u9r.1, where :attr:`global_score` can range from 0.12 to 0.43
00334           for mappings with very similar multi-chain-RMSD.
00335 
00336     :getter: Computed on first use (cached)
00337     :type: :class:`dict` with key / value = :class:`str` (chain names, key
00338            for :attr:`ent_to_cm_1`, value for :attr:`ent_to_cm_2`)
00339     :raises: :class:`QSscoreError` if there are too many combinations to check
00340              to find a chain mapping.
00341     """
00342     if self._chain_mapping is None:
00343       self._chain_mapping = _GetChainMapping(self.ent_to_cm_1, self.ent_to_cm_2,
00344                                              self.symm_1, self.symm_2,
00345                                              self.chem_mapping)
00346       LogInfo('Mapping found: %s' % str(self._chain_mapping))
00347     return self._chain_mapping
00348 
00349   @property
00350   def alignments(self):
00351     """List of successful sequence alignments using :attr:`chain_mapping`.
00352 
00353     There will be one alignment for each mapped chain and they are ordered by
00354     their chain names in :attr:`qs_ent_1`.
00355 
00356     The sequences of the alignments have views attached into
00357     :attr:`QSscoreEntity.ent` of :attr:`qs_ent_1` and :attr:`qs_ent_2`.
00358 
00359     :getter: Computed on first use (cached)
00360     :type: :class:`list` of :class:`~ost.seq.AlignmentHandle`
00361     """
00362     if self._alignments is None:
00363       self._alignments = _GetMappedAlignments(self.qs_ent_1.ent,
00364                                               self.qs_ent_2.ent,
00365                                               self.chain_mapping)
00366     return self._alignments
00367 
00368   @property
00369   def mapped_residues(self):
00370     """Mapping of shared residues in :attr:`alignments`.
00371 
00372     :getter: Computed on first use (cached)
00373     :type: :class:`dict` *mapped_residues[c1][r1] = r2* with:
00374            *c1* = Chain name in first entity (= first sequence in aln),
00375            *r1* = Residue number in first entity,
00376            *r2* = Residue number in second entity
00377     """
00378     if self._mapped_residues is None:
00379       self._mapped_residues = _GetMappedResidues(self.alignments)
00380     return self._mapped_residues
00381 
00382   @property
00383   def global_score(self):
00384     """QS-score with penalties.
00385     
00386     The range of the score is between 0 (i.e. no interface residues are shared
00387     between biounits) and 1 (i.e. the interfaces are identical).
00388     
00389     The global QS-score is computed applying penalties when interface residues
00390     or entire chains are missing (i.e. anything that is not mapped in
00391     :attr:`mapped_residues` / :attr:`chain_mapping`) in one of the biounits.
00392 
00393     :getter: Computed on first use (cached)
00394     :type: :class:`float`
00395     """
00396     if self._global_score is None:
00397       self._ComputeScores()
00398     return self._global_score
00399 
00400   @property
00401   def best_score(self):
00402     """QS-score without penalties.
00403 
00404     Like :attr:`global_score`, but neglecting additional residues or chains in
00405     one of the biounits (i.e. the score is calculated considering only mapped
00406     chains and residues).
00407 
00408     :getter: Computed on first use (cached)
00409     :type: :class:`float`
00410     """
00411     if self._best_score is None:
00412       self._ComputeScores()
00413     return self._best_score
00414 
00415   @property
00416   def superposition(self):
00417     """Superposition result based on shared CA atoms in :attr:`alignments`.
00418 
00419     The superposition can be used to map :attr:`QSscoreEntity.ent` of
00420     :attr:`qs_ent_1` onto the one of :attr:`qs_ent_2`. Use
00421     :func:`ost.geom.Invert` if you need the opposite transformation.
00422 
00423     :getter: Computed on first use (cached)
00424     :type: :class:`ost.mol.alg.SuperpositionResult`
00425     """
00426     if self._superposition is None:
00427       self._superposition = _GetQsSuperposition(self.alignments)
00428       # report it
00429       sup_rmsd = self._superposition.rmsd
00430       cmp_view = self._superposition.view1
00431       LogInfo('CA RMSD for %s aligned residues on %s chains: %.2f' \
00432               % (cmp_view.residue_count, cmp_view.chain_count, sup_rmsd))
00433     return self._superposition
00434 
00435   @property
00436   def lddt_score(self):
00437     """The multi-chain lDDT score.
00438 
00439     .. note::
00440 
00441       lDDT is not considering over-prediction (i.e. extra chains) and hence is
00442       not symmetric. Here, we consider :attr:`qs_ent_1` as the reference and
00443       :attr:`qs_ent_2` as the model. The alignments from :attr:`alignments` are
00444       used to map residue numbers and chains.
00445 
00446     The score is computed with OST's :func:`~ost.mol.alg.LocalDistDiffTest`
00447     function with a single distance threshold of 2 A and an inclusion radius of
00448     8 A. You can use :attr:`lddt_mdl` and :attr:`lddt_ref` to get entities on
00449     which you can call any other lDDT function with any other set of parameters.
00450 
00451     :getter: Computed on first use (cached)
00452     :type: :class:`float`
00453     """
00454     if self._lddt_score is None:
00455       self._ComputeLDDT()
00456     return self._lddt_score
00457 
00458   @property
00459   def lddt_mdl(self):
00460     """The model entity used for lDDT scoring (:attr:`lddt_score`) and annotated
00461     with local scores.
00462 
00463     Local scores are available as residue properties named 'lddt' and on each
00464     atom as a B-factor. Only CA atoms are considered if :attr:`calpha_only` is
00465     True, otherwise this is an all-atom score.
00466     
00467     Since, the lDDT computation requires a single chain with mapped residue
00468     numbering, all chains are appended into a single chain X with unique residue
00469     numbers according to the column-index in the alignment. The alignments are
00470     in the same order as they appear in :attr:`alignments`. Additional residues
00471     are appended at the end of the chain with unique residue numbers.
00472 
00473     :getter: Computed on first use (cached)
00474     :type: :class:`~ost.mol.EntityHandle`
00475     """
00476     if self._lddt_mdl is None:
00477       self._ComputeLDDT()
00478     return self._lddt_mdl
00479 
00480   @property
00481   def lddt_ref(self):
00482     """The reference entity used for lDDT scoring (:attr:`lddt_score`).
00483 
00484     This is a single chain X with residue numbers matching ones in
00485     :attr:`lddt_mdl` where aligned and unique numbers for additional residues.
00486 
00487     :getter: Computed on first use (cached)
00488     :type: :class:`~ost.mol.EntityHandle`
00489     """
00490     if self._lddt_ref is None:
00491       self._ComputeLDDT()
00492     return self._lddt_ref
00493 
00494   @property
00495   def clustalw_bin(self):
00496     """
00497     Full path to ``clustalw`` or ``clustalw2`` executable to use for multiple
00498     sequence alignments (unless :attr:`chain_mapping` is provided manually).
00499 
00500     :getter: Located in path on first use (cached)
00501     :type: :class:`str`
00502     """
00503     if self._clustalw_bin is None:
00504       self._clustalw_bin = settings.Locate(('clustalw', 'clustalw2'))
00505     return self._clustalw_bin
00506 
00507   ##############################################################################
00508   # Class internal helpers (anything that doesnt easily work without this class)
00509   ##############################################################################
00510 
00511   def _ComputeAlignedEntities(self):
00512     """Fills cached ent_to_cm_1 and ent_to_cm_2."""
00513     # get aligned residues via MSA
00514     ev1, ev2 = _GetAlignedResidues(self.qs_ent_1, self.qs_ent_2,
00515                                    self.chem_mapping,
00516                                    self.max_ca_per_chain_for_cm,
00517                                    self.clustalw_bin)
00518     # extract new entities
00519     self._ent_to_cm_1 = mol.CreateEntityFromView(ev1, True)
00520     self._ent_to_cm_2 = mol.CreateEntityFromView(ev2, True)
00521     # name them
00522     self._ent_to_cm_1.SetName(self.qs_ent_1.GetName())
00523     self._ent_to_cm_2.SetName(self.qs_ent_2.GetName())
00524 
00525   def _ComputeSymmetry(self):
00526     """Fills cached symm_1 and symm_2."""
00527     # get them
00528     self._symm_1, self._symm_2 = \
00529       _FindSymmetry(self.qs_ent_1, self.qs_ent_2, self.ent_to_cm_1,
00530                     self.ent_to_cm_2, self.chem_mapping)
00531     # check that we have reasonable symmetries set (fallback: all chains)
00532     if not _AreValidSymmetries(self._symm_1, self._symm_2):
00533       self._symm_1 = [tuple(ch.name for ch in self.ent_to_cm_1.chains)]
00534       self._symm_2 = [tuple(ch.name for ch in self.ent_to_cm_2.chains)]
00535 
00536   def _ComputeScores(self):
00537     """Fills cached global_score and best_score."""
00538     # get contacts
00539     if self.calpha_only:
00540       contacts_1 = self.qs_ent_1.contacts_ca
00541       contacts_2 = self.qs_ent_2.contacts_ca
00542     else:
00543       contacts_1 = self.qs_ent_1.contacts
00544       contacts_2 = self.qs_ent_2.contacts
00545     # get scores
00546     scores = _GetScores(contacts_1, contacts_2, self.mapped_residues,
00547                         self.chain_mapping)
00548     self._best_score = scores[0]
00549     self._global_score = scores[1]
00550     # report scores
00551     LogInfo('QSscore %s, %s: best: %.2f, global: %.2f' \
00552             % (self.qs_ent_1.GetName(), self.qs_ent_2.GetName(),
00553                self._best_score, self._global_score))
00554 
00555   def _ComputeLDDT(self):
00556     """Fills cached lddt_score, lddt_mdl and lddt_ref."""
00557     LogInfo('Computing lDDT score')
00558     # check reference and model
00559     ref, mdl = self.qs_ent_1.ent, self.qs_ent_2.ent
00560     LogInfo('Reference %s has: %s chains' % (ref.GetName(), ref.chain_count))
00561     LogInfo('Model %s has: %s chains' % (mdl.GetName(), mdl.chain_count))
00562     if mdl.chain_count > ref.chain_count:
00563       LogWarning('MODEL contains more chains than REFERENCE, '
00564                  'lDDT is not considering them')
00565     # get single chain reference and model
00566     self._lddt_ref, self._lddt_mdl = \
00567       _MergeAlignedChains(self.alignments, ref, mdl, self.calpha_only)
00568     # score them (mdl and ref changed) and keep results
00569     self._lddt_score = _ComputeLDDTScore(self._lddt_ref, self._lddt_mdl)
00570 
00571 
00572 ###############################################################################
00573 # Entity with cached entries for QS scoring
00574 ###############################################################################
00575 
00576 class QSscoreEntity(object):
00577   """Entity with cached entries for QS scoring.
00578 
00579   Any known / precomputed information can be filled into the appropriate
00580   attribute here as long as they are labelled as read/write. Otherwise the
00581   quantities are computed on first access and cached (lazy evaluation). The
00582   heaviest load is expected when computing :attr:`contacts` and
00583   :attr:`contacts_ca`.
00584 
00585   :param ent: Entity to be used for QS scoring. A copy of it will be processed.
00586   :type ent:  :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
00587 
00588   .. attribute:: is_valid
00589 
00590     True, if successfully initialized. False, if input structure is monomer or
00591     has less than 2 protein chains with >= 20 residues.
00592 
00593     :type: :class:`bool`
00594 
00595   .. attribute:: original_name
00596 
00597     Name set for *ent* when object was created.
00598 
00599     :type: :class:`str`
00600 
00601   .. attribute:: ent
00602 
00603     Cleaned version of *ent* passed at construction. Hydrogens are removed, the
00604     entity is processed with a :class:`~ost.conop.RuleBasedProcessor` and chains
00605     listed in :attr:`removed_chains` have been removed. The name of this entity
00606     might change during scoring (see :func:`GetName`). Otherwise, this will be
00607     fixed.
00608 
00609     :type: :class:`~ost.mol.EntityHandle`
00610 
00611   .. attribute:: removed_chains
00612 
00613     Chains removed from *ent* passed at construction. These are ligand and water
00614     chains as well as small (< 20 res.) peptides or chains with no amino acids
00615     (determined by chem. type, which is set by rule based processor).
00616 
00617     :type: :class:`list` of :class:`str`
00618 
00619   .. attribute:: calpha_only
00620 
00621     Whether entity is CA-only (i.e. it has 0 CB atoms)
00622 
00623     :type: :class:`bool`
00624   """
00625   def __init__(self, ent):
00626     # copy entity and process/clean it
00627     self.original_name = ent.GetName()
00628     ent = mol.CreateEntityFromView(ent.Select('ele!=H and aname!=HN'), True)
00629     if not conop.GetDefaultLib():
00630       raise RuntimeError("QSscore computation requires a compound library!")
00631     pr = conop.RuleBasedProcessor(conop.GetDefaultLib())
00632     pr.Process(ent)
00633     self.ent, self.removed_chains, self.calpha_only = _CleanInputEntity(ent)
00634     # check if it's suitable for QS scoring
00635     if self.ent.chain_count == 0:
00636       LogError('Bad input file: ' + ent.GetName() + '. No chains left after '
00637                'removing water, ligands and small peptides.')
00638       self.is_valid = False
00639     elif self.ent.chain_count == 1:
00640       LogError('Structure ' + ent.GetName() + ' is a monomer. '
00641                'QSscore is not defined for monomers.')
00642       self.is_valid = False
00643     else:
00644       self.is_valid = True
00645     # init cached stuff
00646     self._ca_entity = None
00647     self._ca_chains = None
00648     self._alignments = {}
00649     self._chem_groups = None
00650     self._angles = {}
00651     self._axis = {}
00652     self._contacts = None
00653     self._contacts_ca = None
00654 
00655   def GetName(self):
00656     """Wrapper to :func:`~ost.mol.EntityHandle.GetName` of :attr:`ent`.
00657     This is used to uniquely identify the entity while scoring. The name may
00658     therefore change while :attr:`original_name` remains fixed.
00659     """
00660     # for duck-typing and convenience
00661     return self.ent.GetName()
00662 
00663   def SetName(self, new_name):
00664     """Wrapper to :func:`~ost.mol.EntityHandle.SetName` of :attr:`ent`.
00665     Use this to change unique identifier while scoring (see :func:`GetName`).
00666     """
00667     # for duck-typing and convenience
00668     self.ent.SetName(new_name)
00669 
00670   @property
00671   def ca_entity(self):
00672     """
00673     Reduced representation of :attr:`ent` with only CA atoms.
00674     This guarantees that each included residue has exactly one atom.
00675 
00676     :getter: Computed on first use (cached)
00677     :type: :class:`~ost.mol.EntityHandle`
00678     """
00679     if self._ca_entity is None:
00680       self._ca_entity = _GetCAOnlyEntity(self.ent)
00681     return self._ca_entity
00682 
00683   @property
00684   def ca_chains(self):
00685     """
00686     Map of chain names in :attr:`ent` to sequences with attached view to CA-only
00687     chains (into :attr:`ca_entity`). Useful for alignments and superpositions.
00688 
00689     :getter: Computed on first use (cached)
00690     :type: :class:`dict` (key = :class:`str`,
00691            value = :class:`~ost.seq.SequenceHandle`)
00692     """
00693     if self._ca_chains is None:
00694       self._ca_chains = dict()
00695       ca_entity = self.ca_entity
00696       for ch in ca_entity.chains:
00697         self._ca_chains[ch.name] = seq.SequenceFromChain(ch.name, ch)
00698     return self._ca_chains
00699 
00700   def GetAlignment(self, c1, c2):
00701     """Get sequence alignment of chain *c1* with chain *c2*.
00702     Computed on first use based on :attr:`ca_chains` (cached).
00703 
00704     :param c1: Chain name for first chain to align.
00705     :type c1:  :class:`str`
00706     :param c2: Chain name for second chain to align.
00707     :type c2:  :class:`str`
00708     :rtype: :class:`~ost.seq.AlignmentHandle` or None if it failed.
00709     """
00710     if (c1,c2) not in self._alignments:
00711       ca_chains = self.ca_chains
00712       self._alignments[(c1,c2)] = _AlignAtomSeqs(ca_chains[c1], ca_chains[c2])
00713     return self._alignments[(c1,c2)]
00714 
00715   @property
00716   def chem_groups(self):
00717     """
00718     Intra-complex group of chemically identical (seq. id. > 95%) polypeptide
00719     chains as extracted from :attr:`ca_chains`. First chain in group is the one
00720     with the longest sequence.
00721 
00722     :getter: Computed on first use (cached)
00723     :type: :class:`list` of :class:`list` of :class:`str` (chain names)
00724     """
00725     if self._chem_groups is None:
00726       self._chem_groups = _GetChemGroups(self, 95)
00727       LogInfo('Chemically equivalent chain-groups in %s: %s' \
00728               % (self.GetName(), str(self._chem_groups)))
00729     return self._chem_groups
00730 
00731   def GetAngles(self, c1, c2):
00732     """Get Euler angles from superposition of chain *c1* with chain *c2*.
00733     Computed on first use based on :attr:`ca_chains` (cached).
00734 
00735     :param c1: Chain name for first chain to superpose.
00736     :type c1:  :class:`str`
00737     :param c2: Chain name for second chain to superpose.
00738     :type c2:  :class:`str`
00739     :return: 3 Euler angles (may contain nan if something fails).
00740     :rtype:  :class:`numpy.array`
00741     """
00742     if (c1,c2) not in self._angles:
00743       self._GetSuperposeData(c1, c2)
00744     return self._angles[(c1,c2)]
00745 
00746   def GetAxis(self, c1, c2):
00747     """Get axis of symmetry from superposition of chain *c1* with chain *c2*.
00748     Computed on first use based on :attr:`ca_chains` (cached).
00749 
00750     :param c1: Chain name for first chain to superpose.
00751     :type c1:  :class:`str`
00752     :param c2: Chain name for second chain to superpose.
00753     :type c2:  :class:`str`
00754     :return: Rotational axis (may contain nan if something fails).
00755     :rtype:  :class:`numpy.array`
00756     """
00757     if (c1,c2) not in self._axis:
00758       self._GetSuperposeData(c1, c2)
00759     return self._axis[(c1,c2)]
00760 
00761   @property
00762   def contacts(self):
00763     """
00764     Connectivity dictionary (**read/write**).
00765     As given by :func:`GetContacts` with *calpha_only* = False on :attr:`ent`.
00766 
00767     :getter: Computed on first use (cached)
00768     :setter: Uses :func:`FilterContacts` to ensure that we only keep contacts
00769              for chains in the cleaned entity.
00770     :type: See return type of :func:`GetContacts`
00771     """
00772     if self._contacts is None:
00773       self._contacts = GetContacts(self.ent, False)
00774     return self._contacts
00775 
00776   @contacts.setter
00777   def contacts(self, new_contacts):
00778     chain_names = set([ch.name for ch in self.ent.chains])
00779     self._contacts = FilterContacts(new_contacts, chain_names)
00780   
00781   @property
00782   def contacts_ca(self):
00783     """
00784     CA-only connectivity dictionary (**read/write**).
00785     Like :attr:`contacts` but with *calpha_only* = True in :func:`GetContacts`.
00786     """
00787     if self._contacts_ca is None:
00788       self._contacts_ca = GetContacts(self.ent, True)
00789     return self._contacts_ca
00790   
00791   @contacts_ca.setter
00792   def contacts_ca(self, new_contacts):
00793     chain_names = set([ch.name for ch in self.ent.chains])
00794     self._contacts_ca = FilterContacts(new_contacts, chain_names)
00795 
00796   ##############################################################################
00797   # Class internal helpers (anything that doesnt easily work without this class)
00798   ##############################################################################
00799 
00800   def _GetSuperposeData(self, c1, c2):
00801     """Fill _angles and _axis from superposition of CA chains of c1 and c2."""
00802     # get aligned views (must contain identical numbers of atoms!)
00803     aln = self.GetAlignment(c1, c2)
00804     if not aln:
00805       # fallback for non-aligned stuff (nan)
00806       self._angles[(c1,c2)] = np.empty(3) * np.nan
00807       self._axis[(c1,c2)] = np.empty(3) * np.nan
00808       return
00809     v1, v2 = seq.ViewsFromAlignment(aln)
00810     if v1.atom_count < 3:
00811       # fallback for non-aligned stuff (nan)
00812       self._angles[(c1,c2)] = np.empty(3) * np.nan
00813       self._axis[(c1,c2)] = np.empty(3) * np.nan
00814       return
00815     # get transformation
00816     sup_res = mol.alg.SuperposeSVD(v1, v2, apply_transform=False)
00817     Rt = sup_res.transformation
00818     # extract angles
00819     a,b,c = _GetAngles(Rt)
00820     self._angles[(c1,c2)] = np.asarray([a,b,c])
00821     # extract axis of symmetry
00822     R3 = geom.Rotation3(Rt.ExtractRotation())
00823     self._axis[(c1,c2)] = np.asarray(R3.GetRotationAxis().data)
00824 
00825 ###############################################################################
00826 # Contacts computations
00827 ###############################################################################
00828 
00829 def FilterContacts(contacts, chain_names):
00830   """Filter contacts to contain only contacts for chains in *chain_names*.
00831 
00832   :param contacts: Connectivity dictionary as produced by :func:`GetContacts`.
00833   :type contacts:  :class:`dict`
00834   :param chain_names: Chain names to keep.
00835   :type chain_names:  :class:`list` or (better) :class:`set`
00836   :return: New connectivity dictionary (format as in :func:`GetContacts`)
00837   :rtype:  :class:`dict`
00838   """
00839   # create new dict with subset
00840   filtered_contacts = dict()
00841   for c1 in contacts:
00842     if c1 in chain_names:
00843       new_contacts = dict()
00844       for c2 in contacts[c1]:
00845         if c2 in chain_names:
00846           new_contacts[c2] = contacts[c1][c2]
00847       # avoid adding empty dicts
00848       if new_contacts:
00849         filtered_contacts[c1] = new_contacts
00850   return filtered_contacts
00851 
00852 def GetContacts(entity, calpha_only, dist_thr=12.0):
00853   """Get inter-chain contacts of a macromolecular entity.
00854 
00855   Contacts are pairs of residues within a given distance belonging to different
00856   chains. They are stored once per pair and include the CA/CB-CA/CB distance.
00857 
00858   :param entity: An entity to check connectivity for.
00859   :type entity:  :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
00860   :param calpha_only: If True, we only consider CA-CA distances. Else, we use CB
00861                       unless the residue is a GLY.
00862   :type calpha_only:  :class:`bool`
00863   :param dist_thr: Maximal CA/CB-CA/CB distance to be considered in contact.
00864   :type dist_thr:  :class:`float`
00865   :return: A connectivity dictionary. A pair of residues with chain names
00866            *ch_name1* & *ch_name2* (*ch_name1* < *ch_name2*), residue numbers
00867            *res_num1* & *res_num2* and distance *dist* (<= *dist_thr*) are
00868            stored as *result[ch_name1][ch_name2][res_num1][res_num2]* = *dist*.
00869   :rtype:  :class:`dict`
00870   """
00871   # get ent copy to search on
00872   if calpha_only:
00873     ev = entity.Select("aname=CA")
00874   else:
00875     ev = entity.Select("(rname=GLY and aname=CA) or aname=CB")
00876   ent = mol.CreateEntityFromView(ev, True)
00877   # search all vs all
00878   contacts = dict()
00879   for atom in ent.atoms:
00880     ch_name1 = atom.chain.name
00881     res_num1 = atom.residue.number.num
00882     close_atoms = ent.FindWithin(atom.pos, dist_thr)
00883     for close_atom in close_atoms:
00884       ch_name2 = close_atom.chain.name
00885       if ch_name2 > ch_name1:
00886         res_num2 = close_atom.residue.number.num
00887         dist = geom.Distance(atom.pos, close_atom.pos)
00888         # add to contacts
00889         if ch_name1 not in contacts:
00890           contacts[ch_name1] = dict()
00891         if ch_name2 not in contacts[ch_name1]:
00892           contacts[ch_name1][ch_name2] = dict()
00893         if res_num1 not in contacts[ch_name1][ch_name2]:
00894           contacts[ch_name1][ch_name2][res_num1] = dict()
00895         contacts[ch_name1][ch_name2][res_num1][res_num2] = round(dist, 3)
00896   # DONE
00897   return contacts
00898 
00899 
00900 ###############################################################################
00901 # HELPERS
00902 ###############################################################################
00903 
00904 # general
00905 
00906 def _AlignAtomSeqs(seq_1, seq_2):
00907   """
00908   :type seq_1: :class:`ost.seq.SequenceHandle`
00909   :type seq_2: :class:`ost.seq.SequenceHandle`
00910   :return: Alignment of two sequences using a global aignment. Views attached
00911            to the input sequences will remain attached in the aln.
00912   :rtype:  :class:`~ost.seq.AlignmentHandle` or None if it failed.
00913   """
00914   # NOTE: If the two sequence have a greatly different length
00915   #       a local alignment could be a better choice...
00916   aln = None
00917   alns = seq.alg.GlobalAlign(seq_1, seq_2, seq.alg.BLOSUM62)
00918   if alns: aln = alns[0]
00919   if not aln:
00920     LogWarning('Failed to align %s to %s' % (seq_1.name, seq_2.name))
00921     LogWarning('%s:  %s' % (seq_1.name, seq_1.string))
00922     LogWarning('%s:  %s' % (seq_2.name, seq_2.string))
00923   return aln
00924 
00925 # QS entity
00926 
00927 def _CleanInputEntity(ent):
00928   """
00929   :param ent: The OST entity to be cleaned.
00930   :type ent:  :class:`EntityHandle` or :class:`EntityView`
00931   :return: A tuple of 3 items: :attr:`QSscoreEntity.ent`,
00932                                :attr:`QSscoreEntity.removed_chains`,
00933                                :attr:`QSscoreEntity.calpha_only`
00934   """
00935   # find chains to remove
00936   removed_chains = []
00937   for ch in ent.chains:
00938     # we remove chains if they are small-peptides or if the contain no aa
00939     # or if they contain only unknown or modified residues
00940     if    ch.name in ['-', '_'] \
00941        or ch.residue_count < 20 \
00942        or not any(r.chem_type.IsAminoAcid() for r in ch.residues) \
00943        or not (set(r.one_letter_code for r in ch.residues) - {'?', 'X'}):
00944       removed_chains.append(ch.name)
00945 
00946   # remove them from *ent*
00947   if removed_chains:
00948     chain_set = set()
00949     for ch_name in removed_chains:
00950       if ch_name in ['-', '_', ' ']:
00951         chain_set.add('"%c"' % ch_name)
00952       else:
00953         chain_set.add(ch_name)
00954     view = ent.Select('cname!=%s' % ','.join(chain_set))
00955     ent_new = mol.CreateEntityFromView(view, True)
00956     ent_new.SetName(ent.GetName())
00957   else:
00958     ent_new = ent
00959 
00960   # check if CA only
00961   calpha_only = False
00962   if ent_new.Select('aname=CB').atom_count == 0:
00963     LogInfo('Structure %s is a CA only structure!' % ent_new.GetName())
00964     calpha_only = True
00965 
00966   # report and return
00967   if removed_chains:
00968     LogInfo('Chains removed from %s: %s' \
00969             % (ent_new.GetName(), ''.join(removed_chains)))
00970   LogInfo('Chains in %s: %s' \
00971           % (ent_new.GetName(), ''.join([c.name for c in ent_new.chains])))
00972   return ent_new, removed_chains, calpha_only
00973 
00974 def _GetCAOnlyEntity(ent):
00975   """
00976   :param ent: Entity to process
00977   :type ent:  :class:`EntityHandle` or :class:`EntityView`
00978   :return: New entity with only CA and only one atom per residue
00979            (see :attr:`QSscoreEntity.ca_entity`)
00980   """
00981   # cook up CA only view (diff from Select = guaranteed 1 atom per residue)
00982   ca_view = ent.CreateEmptyView()
00983   # add chain by chain
00984   for res in ent.residues:
00985     ca_atom = res.FindAtom("CA")
00986     if ca_atom.IsValid():
00987       ca_view.AddAtom(ca_atom)
00988   # finalize
00989   return mol.CreateEntityFromView(ca_view, False)
00990 
00991 def _GetChemGroups(qs_ent, seqid_thr=95.):
00992   """
00993   :return: Intra-complex group of chemically identical polypeptide chains
00994            (see :attr:`QSscoreEntity.chem_groups`)
00995 
00996   :param qs_ent: Entity to process
00997   :type qs_ent:  :class:`QSscoreEntity`
00998   :param seqid_thr: Threshold used to decide when two chains are identical.
00999                     95 percent tolerates the few mutations crystallographers
01000                     like to do.
01001   :type seqid_thr:  :class:`float`
01002   """
01003   # get data from qs_ent
01004   ca_chains = qs_ent.ca_chains
01005   chain_names = sorted(ca_chains.keys())
01006   # get pairs of identical chains
01007   # NOTE: this scales quadratically with number of chains and may be optimized
01008   #       -> one could merge it with "merge transitive pairs" below...
01009   id_seqs = []
01010   for ch_1, ch_2 in itertools.combinations(chain_names, 2):
01011     aln = qs_ent.GetAlignment(ch_1, ch_2)
01012     if aln and seq.alg.SequenceIdentity(aln) > seqid_thr:
01013       id_seqs.append((ch_1, ch_2))
01014   # trivial case: no matching pairs
01015   if not id_seqs:
01016     return [[name] for name in chain_names]
01017 
01018   # merge transitive pairs
01019   groups = []
01020   for ch_1, ch_2 in id_seqs:
01021     found = False
01022     for g in groups:
01023       if ch_1 in g or ch_2 in g:
01024         found = True
01025         g.add(ch_1)
01026         g.add(ch_2)
01027     if not found:
01028       groups.append(set([ch_1, ch_2]))
01029   # sort internally based on sequence length
01030   chem_groups = []
01031   for g in groups:
01032     ranked_g = sorted([(-ca_chains[ch].length, ch) for ch in g])
01033     chem_groups.append([ch for _,ch in ranked_g])
01034   # add other dissimilar chains
01035   for ch in chain_names:
01036     if not any(ch in g for g in chem_groups):
01037       chem_groups.append([ch])
01038   
01039   return chem_groups
01040 
01041 def _GetAngles(Rt):
01042   """Computes the Euler angles given a transformation matrix.
01043 
01044   :param Rt: Rt operator.
01045   :type Rt:  :class:`ost.geom.Mat4`
01046   :return: A :class:`tuple` of angles for each axis (x,y,z)
01047   """
01048   rot = np.asmatrix(Rt.ExtractRotation().data).reshape(3,3)
01049   tx = np.arctan2(rot[2,1], rot[2,2])
01050   if tx < 0:
01051     tx += 2*np.pi
01052   ty = np.arctan2(rot[2,0], np.sqrt(rot[2,1]**2 + rot[2,2]**2))
01053   if ty < 0:
01054     ty += 2*np.pi
01055   tz = np.arctan2(rot[1,0], rot[0,0])
01056   if tz < 0:
01057     tz += 2*np.pi
01058   return tx,ty,tz
01059 
01060 # QS scorer
01061 
01062 def _GetChemGroupsMapping(qs_ent_1, qs_ent_2):
01063   """
01064   :return: Inter-complex mapping of chemical groups
01065            (see :attr:`QSscorer.chem_mapping`)
01066 
01067   :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
01068   :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
01069   """
01070   # get chem. groups and unique representative
01071   chem_groups_1 = qs_ent_1.chem_groups
01072   chem_groups_2 = qs_ent_2.chem_groups
01073   repr_chains_1 = {x[0]: tuple(x) for x in chem_groups_1}
01074   repr_chains_2 = {x[0]: tuple(x) for x in chem_groups_2}
01075 
01076   # if entities do not have different number of unique chains, we get the
01077   # mapping for the smaller set
01078   swapped = False
01079   if len(repr_chains_2) < len(repr_chains_1):
01080     repr_chains_1, repr_chains_2 = repr_chains_2, repr_chains_1
01081     qs_ent_1, qs_ent_2 = qs_ent_2, qs_ent_1
01082     swapped = True
01083 
01084   # find the closest to each chain between the two entities
01085   # NOTE: this may still be sensible to orthology problem
01086   # -> currently we use a global alignment and seq. id. to rank pairs
01087   # -> we also tried local alignments and weighting the seq. id. by the
01088   #    coverages of the alignments (gapless string in aln. / seq. length)
01089   #    but global aln performed better...
01090   chain_pairs = []
01091   ca_chains_1 = qs_ent_1.ca_chains
01092   ca_chains_2 = qs_ent_2.ca_chains
01093   for ch_1 in repr_chains_1.keys():
01094     for ch_2 in repr_chains_2.keys():
01095       aln = _AlignAtomSeqs(ca_chains_1[ch_1], ca_chains_2[ch_2])
01096       if aln:
01097         chains_seqid = seq.alg.SequenceIdentity(aln)
01098         LogVerbose('Sequence identity', ch_1, ch_2, 'seqid=%.2f' % chains_seqid)
01099         chain_pairs.append((chains_seqid, ch_1, ch_2))
01100 
01101   # get top matching groups first
01102   chain_pairs = sorted(chain_pairs, reverse=True)
01103   chem_mapping = {}
01104   for _, c1, c2 in chain_pairs:
01105     skip = False
01106     for a,b in chem_mapping.iteritems():
01107       if repr_chains_1[c1] == a or repr_chains_2[c2] == b:
01108         skip = True
01109         break
01110     if not skip:
01111       chem_mapping[repr_chains_1[c1]] = repr_chains_2[c2]
01112   if swapped:
01113     chem_mapping = {y: x for x, y in chem_mapping.iteritems()}
01114     qs_ent_1, qs_ent_2 = qs_ent_2, qs_ent_1
01115 
01116   # notify chains without partner
01117   mapped_1 = set([i for s in chem_mapping.keys() for i in s])
01118   chains_1 = set([c.name for c in qs_ent_1.ent.chains])
01119   if chains_1 - mapped_1:
01120     LogWarning('Unmapped Chains in %s: %s'
01121                % (qs_ent_1.GetName(), ','.join(list(chains_1 - mapped_1))))
01122 
01123   mapped_2 = set([i for s in chem_mapping.values() for i in s])
01124   chains_2 = set([c.name for c in qs_ent_2.ent.chains])
01125   if chains_2 - mapped_2:
01126     LogWarning('Unmapped Chains in %s: %s'
01127                % (qs_ent_2.GetName(), ','.join(list(chains_2 - mapped_2))))
01128   
01129   # check if we have any chains left
01130   LogInfo('Chemical chain-groups mapping: ' + str(chem_mapping))
01131   if len(mapped_1) < 2 or len(mapped_2) < 2:
01132     raise QSscoreError('Less than 2 chains left in chem_mapping.')
01133   return chem_mapping
01134 
01135 def _SelectFew(l, max_elements):
01136   """Return l or copy of l with at most *max_elements* entries."""
01137   n_el = len(l)
01138   if n_el <= max_elements:
01139     return l
01140   else:
01141     # cheap integer ceiling (-1 to ensure that x*max_elements gets d_el = x)
01142     d_el = ((n_el - 1) // max_elements) + 1
01143     new_l = list()
01144     for i in range(0, n_el, d_el):
01145       new_l.append(l[i])
01146     return new_l
01147 
01148 def _GetAlignedResidues(qs_ent_1, qs_ent_2, chem_mapping, max_ca_per_chain,
01149                         clustalw_bin):
01150   """
01151   :return: Tuple of two :class:`~ost.mol.EntityView` objects containing subsets
01152            of *qs_ent_1* and *qs_ent_2*. Two entities are later created from
01153            those views (see :attr:`QSscorer.ent_to_cm_1` and
01154            :attr:`QSscorer.ent_to_cm_2`)
01155 
01156   :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
01157   :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
01158   :param chem_mapping: See :attr:`QSscorer.chem_mapping`
01159   :param max_ca_per_chain: See :attr:`QSscorer.max_ca_per_chain_for_cm`
01160   """
01161   # resulting views into CA entities using CA chain sequences
01162   ent_view_1 = qs_ent_1.ca_entity.CreateEmptyView()
01163   ent_view_2 = qs_ent_2.ca_entity.CreateEmptyView()
01164   ca_chains_1 = qs_ent_1.ca_chains
01165   ca_chains_2 = qs_ent_2.ca_chains
01166   # go through all mapped chemical groups
01167   for group_1, group_2 in chem_mapping.iteritems():
01168     # MSA with ClustalW
01169     seq_list = seq.CreateSequenceList()
01170     # keep sequence-name (must be unique) to view mapping
01171     seq_to_empty_view = dict()
01172     for ch in group_1:
01173       sequence = ca_chains_1[ch].Copy()
01174       sequence.name = qs_ent_1.GetName() + '.' + ch
01175       seq_to_empty_view[sequence.name] = ent_view_1
01176       seq_list.AddSequence(sequence)
01177     for ch in group_2:
01178       sequence = ca_chains_2[ch].Copy()
01179       sequence.name = qs_ent_2.GetName() + '.' + ch
01180       seq_to_empty_view[sequence.name] = ent_view_2
01181       seq_list.AddSequence(sequence)
01182     alnc = ClustalW(seq_list, clustalw=clustalw_bin)
01183 
01184     # collect aligned residues (list of lists of sequence_count valid residues)
01185     residue_lists = list()
01186     sequence_count = alnc.sequence_count
01187     for col in alnc:
01188       residues = [col.GetResidue(ir) for ir in range(sequence_count)]
01189       if all([r.IsValid() for r in residues]):
01190         residue_lists.append(residues)
01191     # check number of aligned residues
01192     if len(residue_lists) < 5:
01193       raise QSscoreError('Not enough aligned residues.')
01194     elif max_ca_per_chain > 0:
01195       residue_lists = _SelectFew(residue_lists, max_ca_per_chain)
01196     # check what view is for which residue
01197     res_views = [seq_to_empty_view[alnc.sequences[ir].name] \
01198                  for ir in range(sequence_count)]
01199     # add to view (note: only 1 CA atom per residue in here)
01200     for res_list in residue_lists:
01201       for res, view in zip(res_list, res_views):
01202         view.AddResidue(res, mol.ViewAddFlag.INCLUDE_ATOMS)
01203   # done
01204   return ent_view_1, ent_view_2
01205 
01206 
01207 def _FindSymmetry(qs_ent_1, qs_ent_2, ent_to_cm_1, ent_to_cm_2, chem_mapping):
01208   """
01209   :return: A pair of comparable symmetry groups (for :attr:`QSscorer.symm_1`
01210            and :attr:`QSscorer.symm_2`) between the two structures.
01211            Empty lists if no symmetry identified.
01212   
01213   :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
01214   :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
01215   :param ent_to_cm_1: See :attr:`QSscorer.ent_to_cm_1`
01216   :param ent_to_cm_2: See :attr:`QSscorer.ent_to_cm_2`
01217   :param chem_mapping: See :attr:`QSscorer.chem_mapping`
01218   """
01219   LogInfo('Identifying Symmetry Groups...')
01220 
01221   # get possible symmetry groups
01222   symm_subg_1 = _GetSymmetrySubgroups(qs_ent_1, ent_to_cm_1,
01223                                       chem_mapping.keys())
01224   symm_subg_2 = _GetSymmetrySubgroups(qs_ent_2, ent_to_cm_2,
01225                                       chem_mapping.values())
01226 
01227   # check combination of groups
01228   LogInfo('Selecting Symmetry Groups...')
01229   # check how many mappings are to be checked for each pair of symmetry groups
01230   # -> lower number here will speed up worst-case runtime of _GetChainMapping
01231   # NOTE: this is tailored to speed up brute force all vs all mapping test
01232   #       for preferred _CheckClosedSymmetry this is suboptimal!
01233   best_symm = []
01234   for symm_1, symm_2 in itertools.product(symm_subg_1, symm_subg_2):
01235     s1 = symm_1[0]
01236     s2 = symm_2[0]
01237     if len(s1) != len(s2):
01238       LogDebug('Discarded', str(symm_1), str(symm_2),
01239                ': different size of symmetry groups')
01240       continue
01241     count = _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping)
01242     nr_mapp = count['intra']['mappings'] + count['inter']['mappings']
01243     LogDebug('OK', str(symm_1), str(symm_2), ': %s mappings' % nr_mapp)
01244     best_symm.append((nr_mapp, symm_1, symm_2))
01245 
01246   for _, symm_1, symm_2 in sorted(best_symm):
01247     s1 = symm_1[0]
01248     s2 = symm_2[0]
01249     group_1 = ent_to_cm_1.Select('cname=%s' % ','.join(s1))
01250     group_2 = ent_to_cm_2.Select('cname=%s' % ','.join(s2))
01251     # check if by superposing a pair of chains within the symmetry group to
01252     # superpose all chains within the symmetry group
01253     # -> if successful, the symmetry groups are compatible
01254     closed_symm = _CheckClosedSymmetry(group_1, group_2, symm_1, symm_2,
01255                                        chem_mapping, 6, 0.8, find_best=False)
01256 
01257     if closed_symm:
01258       return symm_1, symm_2
01259 
01260   # nothing found
01261   LogInfo('No coherent symmetry identified between structures')
01262   return [], []
01263 
01264 
01265 def _GetChainMapping(ent_1, ent_2, symm_1, symm_2, chem_mapping):
01266   """
01267   :return: Mapping from *ent_1* to *ent_2* (see :attr:`QSscorer.chain_mapping`)  
01268 
01269   :param ent_1: See :attr:`QSscorer.ent_to_cm_1`
01270   :param ent_2: See :attr:`QSscorer.ent_to_cm_2`
01271   :param symm_1: See :attr:`QSscorer.symm_1`
01272   :param symm_2: See :attr:`QSscorer.symm_2`
01273   :param chem_mapping: See :attr:`QSscorer.chem_mapping`
01274   """
01275   LogInfo('Symmetry-groups used in %s: %s' % (ent_1.GetName(), str(symm_1)))
01276   LogInfo('Symmetry-groups used in %s: %s' % (ent_2.GetName(), str(symm_2)))
01277 
01278   # quick check for closed symmetries
01279   thresholds = [(    'strict', 4, 0.8),
01280                 (  'tolerant', 6, 0.4),
01281                 ('permissive', 8, 0.2)]
01282   for scheme, sup_thr, sup_fract in thresholds:
01283     # check if by superposing a pair of chains within the symmetry group to
01284     # superpose all chains of the two oligomers
01285     # -> this also returns the resulting mapping of chains
01286     mapping = _CheckClosedSymmetry(ent_1, ent_2, symm_1, symm_2,
01287                                    chem_mapping, sup_thr, sup_fract)
01288     if mapping:
01289       LogInfo('Closed Symmetry with %s parameters' % scheme)
01290       if scheme == 'permissive':
01291         LogWarning('Permissive thresholds used for overlap based mapping ' + \
01292                    'detection: check mapping manually: %s' % mapping)
01293       return mapping
01294   
01295   # NOTE that what follows below is sub-optimal:
01296   # - if the two structures don't fit at all, we may map chains rather randomly
01297   # - we may need a lot of operations to find globally "best" mapping
01298   # -> COST = O(N^2) * O(N!)
01299   #    (first = all pairwise chain-superpose, latter = all chain mappings)
01300   # -> a greedy chain mapping choice algo (pick best RMSD for each one-by-one)
01301   #    could reduce this to O(N^2) * O(N^2) but it would need to be validated
01302   # - one could also try some other heuristic based on center of atoms of chains
01303   #   -> at this point we get a bad mapping anyways...
01304 
01305   # if we get here, we will need to check many combinations of mappings
01306   # -> check how many mappings must be checked and limit
01307   count = _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping)
01308   LogInfo('Intra Symmetry-group mappings to check: %s' \
01309           % count['intra']['mappings'])
01310   LogInfo('Inter Symmetry-group mappings to check: %s' \
01311           % count['inter']['mappings'])
01312   nr_mapp = count['intra']['mappings'] + count['inter']['mappings']
01313   if nr_mapp > 100000: # 322560 is octamer vs octamer
01314     raise QSscoreError('Too many possible mappings: %s' % nr_mapp)
01315 
01316   # to speed up the computations we cache chain views and RMSDs
01317   cached_rmsd = _CachedRMSD(ent_1, ent_2)
01318 
01319   # get INTRA symmetry group chain mapping
01320   check = 0
01321   intra_mappings = [] # list of (RMSD, c1, c2, mapping) tuples (best superpose)
01322   # limit chem mapping to reference symmetry group
01323   ref_symm_1 = symm_1[0]
01324   ref_symm_2 = symm_2[0]
01325   asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
01326   # for each chemically identical group
01327   for g1, g2 in asu_chem_mapping.iteritems():
01328     # try to superpose all possible pairs
01329     for c1, c2 in itertools.product(g1, g2):
01330       # get superposition transformation
01331       LogVerbose('Superposing chains: %s, %s' % (c1, c2))
01332       res = cached_rmsd.GetSuperposition(c1, c2)
01333       # compute RMSD of possible mappings
01334       cur_mappings = [] # list of (RMSD, mapping) tuples
01335       for mapping in _ListPossibleMappings(c1, c2, asu_chem_mapping):
01336         check += 1
01337         chains_rmsd = cached_rmsd.GetMappedRMSD(mapping, res.transformation)
01338         cur_mappings.append((chains_rmsd, mapping))
01339         LogVerbose(str(mapping), str(chains_rmsd))
01340       best_rmsd, best_mapp = min(cur_mappings)
01341       intra_mappings.append((best_rmsd, c1, c2, best_mapp))
01342   # best chain-chain superposition defines the intra asu mapping
01343   _, intra_asu_c1, intra_asu_c2, intra_asu_mapp = min(intra_mappings)
01344 
01345   # if only one asu is present in any of the complexes, we're done
01346   if len(symm_1) == 1 or len(symm_2) == 1:
01347     mapping = intra_asu_mapp
01348   else:
01349     # the mapping is the element position within the asu chain list
01350     # -> this speed up a lot, assuming that the order of chain in asu groups
01351     #    is following the order of symmetry operations
01352     index_asu_c1 = ref_symm_1.index(intra_asu_c1)
01353     index_asu_c2 = ref_symm_2.index(intra_asu_c2)
01354     index_mapp = {ref_symm_1.index(s1): ref_symm_2.index(s2) \
01355                   for s1, s2 in intra_asu_mapp.iteritems()}
01356     LogInfo('Intra symmetry-group mapping: %s' % str(intra_asu_mapp))
01357 
01358     # get INTER symmetry group chain mapping
01359     # we take the first symmetry group from the complex having the most
01360     if len(symm_1) < len(symm_2):
01361       symm_combinations = list(itertools.product(symm_1, [symm_2[0]]))
01362     else:
01363       symm_combinations = list(itertools.product([symm_1[0]], symm_2))
01364 
01365     full_asu_mapp = {tuple(symm_1): tuple(symm_2)}
01366     full_mappings = [] # list of (RMSD, mapping) tuples
01367     for s1, s2 in symm_combinations:
01368       # get superposition transformation (we take best chain-pair in asu)
01369       LogVerbose('Superposing symmetry-group: %s, %s in %s, %s' \
01370                  % (s1[index_asu_c1], s2[index_asu_c2], s1, s2))
01371       res = cached_rmsd.GetSuperposition(s1[index_asu_c1], s2[index_asu_c2])
01372       # compute RMSD of possible mappings
01373       for asu_mapp in _ListPossibleMappings(s1, s2, full_asu_mapp):
01374         check += 1
01375         # need to extract full chain mapping (use indexing)
01376         mapping = {}
01377         for a, b in asu_mapp.iteritems():
01378           for id_a, id_b in index_mapp.iteritems():
01379             mapping[a[id_a]] = b[id_b]
01380         chains_rmsd = cached_rmsd.GetMappedRMSD(mapping, res.transformation)
01381         full_mappings.append((chains_rmsd, mapping))
01382         LogVerbose(str(mapping), str(chains_rmsd))
01383 
01384     if check != nr_mapp:
01385       LogWarning('Mapping number estimation was wrong') # sanity check
01386 
01387     # return best (lowest RMSD) mapping
01388     mapping = min(full_mappings)[1]
01389 
01390   LogWarning('Extensive search used for mapping detection (fallback). This ' + \
01391              'approach has known limitations. Check mapping manually: %s' \
01392              % mapping)
01393   return mapping
01394 
01395 
01396 def _GetSymmetrySubgroups(qs_ent, ent, chem_groups):
01397   """
01398   Identify the symmetry (either cyclic C or dihedral D) of the protein and find
01399   all possible symmetry subgroups. This is done testing all combination of chain
01400   superposition and clustering them by the angles (D) or the axis (C) of the Rt
01401   operator.
01402 
01403   Groups of superposition which can fully reconstruct the structure are possible
01404   symmetry subgroups.
01405 
01406   :param qs_ent: Entity with cached angles and axis.
01407   :type qs_ent:  :class:`QSscoreEntity`
01408   :param ent: Entity from which to extract chains (perfect alignment assumed
01409               for superposition as in :attr:`QSscorer.ent_to_cm_1`).
01410   :type ent:  :class:`EntityHandle` or :class:`EntityView`
01411   :param chem_groups: List of tuples/lists of chain names in *ent*. Each list
01412                       contains all chains belonging to a chem. group (could be
01413                       from :attr:`QSscoreEntity.chem_groups` or from "keys()"
01414                       or "values()" of :attr:`QSscorer.chem_mapping`)
01415 
01416   :return: A list of possible symmetry subgroups (each in same format as
01417            :attr:`QSscorer.symm_1`). If no symmetry is found, we return a list
01418            with a single symmetry subgroup with a single group with all chains.
01419   """
01420   # get angles / axis for pairwise transformations within same chem. group
01421   angles = {}
01422   axis = {}
01423   for cnames in chem_groups:
01424     for c1, c2 in itertools.combinations(cnames, 2):
01425       cur_angles = qs_ent.GetAngles(c1, c2)
01426       if not np.any(np.isnan(cur_angles)):
01427         angles[(c1,c2)] = cur_angles
01428       cur_axis = qs_ent.GetAxis(c1, c2)
01429       if not np.any(np.isnan(cur_axis)):
01430         axis[(c1,c2)] = cur_axis
01431 
01432   # cluster superpositions angles at different thresholds
01433   symm_groups = []
01434   LogVerbose('Possible symmetry-groups for: %s' % ent.GetName())
01435   for angle_thr in np.arange(0.1, 1, 0.1):
01436     d_symm_groups = _GetDihedralSubgroups(ent, chem_groups, angles, angle_thr)
01437     if d_symm_groups:
01438       for group in d_symm_groups:
01439         if group not in symm_groups:
01440           symm_groups.append(group)
01441           LogVerbose('Dihedral: %s' % group)
01442       LogInfo('Symmetry threshold %.1f used for angles of %s' \
01443               % (angle_thr, ent.GetName()))
01444       break
01445   
01446   # cluster superpositions axis at different thresholds
01447   for axis_thr in np.arange(0.1, 1, 0.1):
01448     c_symm_groups = _GetCyclicSubgroups(ent, chem_groups, axis, axis_thr)
01449     if c_symm_groups:
01450       for group in c_symm_groups:
01451         if group not in symm_groups:
01452           symm_groups.append(group)
01453           LogVerbose('Cyclic: %s' % group)
01454       LogInfo('Symmetry threshold %.1f used for axis of %s' \
01455               % (axis_thr, ent.GetName()))
01456       break
01457 
01458   # fall back to single "group" containing all chains if none found
01459   if not symm_groups:
01460     LogInfo('No symmetry-group detected in %s' % ent.GetName())
01461     symm_groups = [[tuple([c for g in chem_groups for c in g])]]
01462 
01463   return symm_groups
01464 
01465 def _GetDihedralSubgroups(ent, chem_groups, angles, angle_thr):
01466   """
01467   :return: Dihedral subgroups for :func:`_GetSymmetrySubgroups`
01468            (same return type as there). Empty list if fail.
01469 
01470   :param ent: See :func:`_GetSymmetrySubgroups`
01471   :param chem_groups: See :func:`_GetSymmetrySubgroups`
01472   :param angles: :class:`dict` (key = chain-pair-tuple, value = Euler angles)
01473   :param angle_thr: Euler angles distance threshold for clustering (float).
01474   """
01475   # cluster superpositions angles
01476   if len(angles) == 0:
01477     # nothing to be done here
01478     return []
01479   else:
01480     same_angles = _ClusterData(angles, angle_thr, _AngleArrayDistance)
01481 
01482   # get those which are non redundant and covering all chains
01483   symm_groups = []
01484   for clst in same_angles.values():
01485     group = clst.keys()
01486     if _ValidChainGroup(group, ent):
01487       if len(chem_groups) > 1:
01488         # if hetero, we want to group toghether different chains only
01489         symm_groups.append(zip(*group))
01490       else:
01491         # if homo, we also want pairs
01492         symm_groups.append(group)
01493         symm_groups.append(zip(*group))
01494   return symm_groups
01495 
01496 def _GetCyclicSubgroups(ent, chem_groups, axis, axis_thr):
01497   """
01498   :return: Cyclic subgroups for :func:`_GetSymmetrySubgroups`
01499            (same return type as there). Empty list if fail.
01500 
01501   :param ent: See :func:`_GetSymmetrySubgroups`
01502   :param chem_groups: See :func:`_GetSymmetrySubgroups`
01503   :param angles: :class:`dict` (key = chain-pair-tuple, value = rotation axis)
01504   :param angle_thr: Axis distance threshold for clustering (float).
01505   """
01506   # cluster superpositions axis
01507   if len(axis) == 0:
01508     # nothing to be done here
01509     return []
01510   else:
01511     same_axis = _ClusterData(axis, axis_thr, _AxisDistance)
01512 
01513   # use to get grouping
01514   symm_groups = []
01515   for clst in same_axis.values():
01516     all_chain = [item for sublist in clst.keys() for item in sublist]
01517     if len(set(all_chain)) == ent.chain_count:
01518       # if we have an hetero we can exploit cyclic symmetry for grouping
01519       if len(chem_groups) > 1:
01520         ref_chem_group = chem_groups[0]
01521         # try two criteria for grouping
01522         cyclic_group_closest = []
01523         cyclic_group_iface = []
01524         for c1 in ref_chem_group:
01525           group_closest = [c1]
01526           group_iface = [c1]
01527           for chains in chem_groups[1:]:
01528             # center of atoms distance
01529             closest = _GetClosestChain(ent, c1, chains)
01530             # chain with bigger interface
01531             closest_iface = _GetClosestChainInterface(ent, c1, chains)
01532             group_closest.append(closest)
01533             group_iface.append(closest_iface)
01534           cyclic_group_closest.append(tuple(group_closest))
01535           cyclic_group_iface.append(tuple(group_iface))
01536         if _ValidChainGroup(cyclic_group_closest, ent):
01537           symm_groups.append(cyclic_group_closest)
01538         if _ValidChainGroup(cyclic_group_iface, ent):
01539           symm_groups.append(cyclic_group_iface)
01540       # if it is a homo, there's not much we can group
01541       else:
01542         symm_groups.append(chem_groups)
01543   return symm_groups
01544 
01545 def _ClusterData(data, thr, metric):
01546   """Wrapper for fclusterdata to get dict of clusters.
01547   
01548   :param data: :class:`dict` (keys for ID, values used for clustering)
01549   :return: :class:`dict` {cluster_idx: {data-key: data-value}}
01550   """
01551   # special case length 1
01552   if len(data) == 1:
01553     return {0: {data.keys()[0]: data.values()[0]} }
01554   # do the clustering
01555   cluster_indices = fclusterdata(np.asarray(data.values()), thr,
01556                                  method='complete', criterion='distance',
01557                                  metric=metric)
01558   # fclusterdata output is cluster ids -> put into dict of clusters
01559   res = {}
01560   for cluster_idx, data_key in zip(cluster_indices, data.keys()):
01561     if not cluster_idx in res:
01562       res[cluster_idx] = {}
01563     res[cluster_idx][data_key] = data[data_key]
01564   return res
01565 
01566 def _AngleArrayDistance(u, v):
01567   """
01568   :return: Average angular distance of two arrays of angles.
01569   :param u: Euler angles.
01570   :param v: Euler angles.
01571   """
01572   dist = []
01573   for a,b in zip(u,v):
01574     delta = abs(a-b)
01575     if delta > np.pi:
01576       delta = abs(2*np.pi - delta)
01577     dist.append(delta)
01578   return np.mean(dist)
01579 
01580 def _AxisDistance(u, v):
01581   """
01582   :return: Euclidean distance between two rotation axes. Axes can point in
01583            either direction so we ensure to use the closer one.
01584   :param u: Rotation axis.
01585   :param v: Rotation axis.
01586   """
01587   # get axes as arrays (for convenience)
01588   u = np.asarray(u)
01589   v = np.asarray(v)
01590   # get shorter of two possible distances (using v or -v)
01591   dv1 = u - v
01592   dv2 = u + v
01593   d1 = np.dot(dv1, dv1)
01594   d2 = np.dot(dv2, dv2)
01595   return np.sqrt(min(d1, d2))
01596 
01597 def _GetClosestChain(ent, ref_chain, chains):
01598   """
01599   :return: Chain closest to *ref_chain* based on center of atoms distance.
01600   :rtype:  :class:`str`
01601   :param ent: See :func:`_GetSymmetrySubgroups`
01602   :param ref_chain: We look for chains closest to this one
01603   :type ref_chain:  :class:`str`
01604   :param chains: We only consider these chains
01605   :type chains:  :class:`list` of :class:`str`
01606   """
01607   contacts = []
01608   ref_center = ent.FindChain(ref_chain).center_of_atoms
01609   for ch in chains:
01610     center = ent.FindChain(ch).center_of_atoms
01611     contacts.append((geom.Distance(ref_center, center), ch))
01612   closest_chain = min(contacts)[1]
01613   return closest_chain
01614 
01615 def _GetClosestChainInterface(ent, ref_chain, chains):
01616   """
01617   :return: Chain with biggest interface (within 10 A) to *ref_chain*.
01618   :rtype:  :class:`str`
01619   :param ent: See :func:`_GetSymmetrySubgroups`
01620   :param ref_chain: We look for chains closest to this one
01621   :type ref_chain:  :class:`str`
01622   :param chains: We only consider these chains
01623   :type chains:  :class:`list` of :class:`str`
01624   """
01625   # NOTE: this is computed on a subset of the full biounit and might be
01626   # inaccurate. Also it could be extracted from QSscoreEntity.contacts.
01627   closest = []
01628   for ch in chains:
01629     iface_view = ent.Select('cname=%s and 10 <> [cname=%s]' % (ref_chain, ch))
01630     nr_res = iface_view.residue_count
01631     closest.append((nr_res, ch))
01632   closest_chain = max(closest)[1]
01633   return closest_chain
01634 
01635 def _ValidChainGroup(group, ent):
01636   """
01637   :return: True, if *group* has unique chain names and as many chains as *ent*
01638   :rtype:  :class:`bool`
01639   :param group: Symmetry groups to check
01640   :type group:  Same as :attr:`QSscorer.symm_1`
01641   :param ent: See :func:`_GetSymmetrySubgroups`
01642   """
01643   all_chain = [item for sublist in group for item in sublist]
01644   if len(all_chain) == len(set(all_chain)) and\
01645      len(all_chain) == ent.chain_count:
01646     return True
01647   return False
01648 
01649 def _LimitChemMapping(chem_mapping, limit_1, limit_2):
01650   """
01651   :return: Chem. mapping containing only chains in *limit_1* and *limit_2*
01652   :rtype:  Same as :attr:`QSscorer.chem_mapping`
01653   :param chem_mapping: See :attr:`QSscorer.chem_mapping`
01654   :param limit_1: Limits chain names in chem_mapping.keys()
01655   :type limit_1:  List/tuple of strings
01656   :param limit_2: Limits chain names in chem_mapping.values()
01657   :type limit_2:  List/tuple of strings
01658   """
01659   # use set intersection to keep only mapping for chains in limit_X
01660   limit_1_set = set(limit_1)
01661   limit_2_set = set(limit_2)
01662   asu_chem_mapping = dict()
01663   for key, value in chem_mapping.iteritems():
01664     new_key = tuple(set(key) & limit_1_set)
01665     if new_key:
01666       asu_chem_mapping[new_key] = tuple(set(value) & limit_2_set)
01667   return asu_chem_mapping
01668 
01669 
01670 def _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping):
01671   """
01672   :return: Dictionary of number of mappings and superpositions to be performed.
01673            Returned as *result[X][Y] = number* with X = "intra" or "inter" and
01674            Y = "mappings" or "superpositions". The idea is that for each
01675            pairwise superposition we check all possible mappings.
01676            We can check the combinations within (intra) a symmetry group and
01677            once established, we check the combinations between different (inter)
01678            symmetry groups.
01679   :param symm_1: See :attr:`QSscorer.symm_1`
01680   :param symm_2: See :attr:`QSscorer.symm_2`
01681   :param chem_mapping: See :attr:`QSscorer.chem_mapping`
01682   """
01683   # setup
01684   c = {}
01685   c['intra'] = {}
01686   c['inter'] = {}
01687   c['intra']['mappings'] = 0
01688   c['intra']['superpositions'] = 0
01689   c['inter']['mappings'] = 0
01690   c['inter']['superpositions'] = 0
01691   # intra ASU mappings within reference symmetry groups
01692   ref_symm_1 = symm_1[0]
01693   ref_symm_2 = symm_2[0]
01694   asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
01695   for g1, g2 in asu_chem_mapping.iteritems():
01696     chain_superpositions = len(g1) * len(g2)
01697     c['intra']['superpositions'] += chain_superpositions
01698     map_per_sup = _PermutationOrCombinations(g1[0], g2[0], asu_chem_mapping)
01699     c['intra']['mappings'] += chain_superpositions * map_per_sup
01700   if len(symm_1) == 1 or len(symm_2) == 1:
01701     return c
01702   # inter ASU mappings
01703   asu_superposition = min(len(symm_1), len(symm_2))
01704   c['inter']['superpositions'] = asu_superposition
01705   asu = {tuple(symm_1): tuple(symm_2)}
01706   map_per_sup = _PermutationOrCombinations(ref_symm_1, ref_symm_2, asu)
01707   c['inter']['mappings'] = asu_superposition * map_per_sup
01708   return c
01709 
01710 def _PermutationOrCombinations(sup1, sup2, chem_mapping):
01711   """Should match len(_ListPossibleMappings(sup1, sup2, chem_mapping))."""
01712   # remove superposed elements, put smallest complex as key
01713   chem_map = {}
01714   for a,b in chem_mapping.iteritems():
01715     new_a = tuple([x for x in a if x != sup1])
01716     new_b = tuple([x for x in b if x != sup2])
01717     chem_map[new_a] = new_b
01718   mapp_nr = 1.0
01719   for a, b in chem_map.iteritems():
01720     if len(a) == len(b):
01721       mapp_nr *= factorial(len(a))
01722     elif len(a) < len(b):
01723       mapp_nr *= binom(len(b), len(a))
01724     elif len(a) > len(b):
01725       mapp_nr *= binom(len(a), len(b))
01726   return int(mapp_nr)
01727 
01728 def _ListPossibleMappings(sup1, sup2, chem_mapping):
01729   """
01730   Return a flat list of all possible mappings given *chem_mapping* and keeping
01731   mapping of *sup1* and *sup2* fixed. For instance if elements are chain names
01732   this is all the mappings to check for a given superposition.
01733 
01734   Elements in first complex are defined by *chem_mapping.keys()* (list of list
01735   of elements) and elements in second complex by *chem_mapping.values()*. If
01736   complexes don't have same number of elements, we map only elements for the one
01737   with less. Also mapping is only between elements of mapped groups according to
01738   *chem_mapping*.
01739            
01740   :rtype:  :class:`list` of :class:`dict` (key = element in chem_mapping-key,
01741            value = element in chem_mapping-value)
01742   :param sup1: Element for which mapping is fixed.
01743   :type sup1:  Like element in chem_mapping-key
01744   :param sup2: Element for which mapping is fixed.
01745   :type sup2:  Like element in chem_mapping-value
01746   :param chem_mapping: Defines mapping between groups of elements (e.g. result
01747                        of :func:`_LimitChemMapping`).
01748   :type chem_mapping:  :class:`dict` with key / value = :class:`tuple`
01749 
01750   :raises: :class:`QSscoreError` if reference complex (first one or one with
01751            less elements) has more elements for any given mapped group.
01752   """
01753   # find smallest complex
01754   chain1 = [i for s in chem_mapping.keys() for i in s]
01755   chain2 = [i for s in chem_mapping.values() for i in s]
01756   swap = False
01757   if len(chain1) > len(chain2):
01758     swap = True
01759   # remove superposed elements, put smallest complex as key
01760   chem_map = {}
01761   for a, b in chem_mapping.iteritems():
01762     new_a = tuple([x for x in a if x != sup1])
01763     new_b = tuple([x for x in b if x != sup2])
01764     if swap:
01765       chem_map[new_b] = new_a
01766     else:
01767       chem_map[new_a] = new_b
01768   # generate permutations or combinations of chemically
01769   # equivalent chains
01770   chem_perm = []
01771   chem_ref = []
01772   for a, b in chem_map.iteritems():
01773     if len(a) == len(b):
01774       chem_perm.append(list(itertools.permutations(b)))
01775       chem_ref.append(a)
01776     elif len(a) < len(b):
01777       chem_perm.append(list(itertools.combinations(b, len(a))))
01778       chem_ref.append(a)
01779     else:
01780       raise QSscoreError('Impossible to define reference group: %s' \
01781                          % str(chem_map))
01782   # save the list of possible mappings
01783   mappings = []
01784   flat_ref = [i for s in chem_ref for i in s]
01785   for perm in itertools.product(*chem_perm):
01786     flat_perm = [i for s in perm for i in s]
01787     d = {c1: c2 for c1, c2 in zip(flat_ref, flat_perm)}
01788     if swap:
01789       d = {v: k for k, v in d.items()}
01790     d.update({sup1: sup2})
01791     mappings.append(d)
01792   return mappings
01793 
01794 
01795 def _CheckClosedSymmetry(ent_1, ent_2, symm_1, symm_2, chem_mapping,
01796                          sup_thr=4, sup_fract=0.8, find_best=True):
01797   """
01798   Quick check if we can superpose two chains and get a mapping for all other
01799   chains using the same transformation. The mapping is defined by sufficient
01800   overlap of the transformed chain of *ent_1* onto another chain in *ent_2*.
01801 
01802   :param ent_1: Entity to map to *ent_2* (perfect alignment assumed between
01803                 chains of same chem. group as in :attr:`QSscorer.ent_to_cm_1`).
01804                 Views are ok but only to select full chains!
01805   :param ent_2: Entity to map to (perfect alignment assumed between
01806                 chains of same chem. group as in :attr:`QSscorer.ent_to_cm_2`).
01807                 Views are ok but only to select full chains!
01808   :param symm_1: Symmetry groups to use. We only superpose chains within
01809                  reference symmetry group of *symm_1* and *symm_2*.
01810                  See :attr:`QSscorer.symm_1`
01811   :param symm_2: See :attr:`QSscorer.symm_2`
01812   :param chem_mapping: See :attr:`QSscorer.chem_mapping`.
01813                        All chains in *ent_1* / *ent_2* must be listed here!
01814   :param sup_thr: Distance around transformed chain in *ent_1* to check for
01815                   overlap.
01816   :type sup_thr:  :class:`float`
01817   :param sup_fract: Fraction of atoms in chain of *ent_2* that must be
01818                     overlapped for overlap to be sufficient.
01819   :type sup_fract:  :class:`float`
01820   :param find_best: If True, we look for best mapping according to
01821                     :func:`_ChainRMSD`. Otherwise, we return first suitable
01822                     mapping.
01823   :type find_best:  :class:`bool`
01824 
01825   :return: Mapping from *ent_1* to *ent_2* or None if none found. Mapping, if
01826            found, is as in :attr:`QSscorer.chain_mapping`.
01827   :rtype:  :class:`dict` (:class:`str` / :class:`str`)
01828   """
01829   # limit chem mapping to reference symmetry group
01830   ref_symm_1 = symm_1[0]
01831   ref_symm_2 = symm_2[0]
01832   asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
01833   # for each chemically identical group
01834   rmsd_mappings = []
01835   for g1, g2 in asu_chem_mapping.iteritems():
01836     # try to superpose all possible pairs
01837     # -> note that some chain-chain combinations may work better than others
01838     #    to superpose the full oligomer (e.g. if some chains are open/closed)
01839     for c1, c2 in itertools.product(g1, g2):
01840       # get superposition transformation
01841       chain_1 = ent_1.Select('cname=%s' % c1)
01842       chain_2 = ent_2.Select('cname=%s' % c2)
01843       res = mol.alg.SuperposeSVD(chain_1, chain_2, apply_transform=False)
01844       # look for overlaps
01845       mapping = _GetSuperpositionMapping(ent_1, ent_2, chem_mapping,
01846                                          res.transformation, sup_thr,
01847                                          sup_fract)
01848       if not mapping:
01849         continue
01850       # early abort if we only want the first one
01851       if not find_best:
01852         return mapping
01853       else:
01854         # get RMSD for sorting
01855         rmsd = _GetMappedRMSD(ent_1, ent_2, mapping, res.transformation)
01856         rmsd_mappings.append((rmsd, mapping))
01857   # return best mapping
01858   if rmsd_mappings:
01859     return min(rmsd_mappings)[1]
01860   else:
01861     return None
01862 
01863 def _GetSuperpositionMapping(ent_1, ent_2, chem_mapping, transformation,
01864                              sup_thr, sup_fract):
01865   """
01866   :return: Dict with chain mapping from *ent_1* to *ent_2* or None if failed
01867            (see :func:`_CheckClosedSymmetry`).
01868   :param ent_1: See :func:`_CheckClosedSymmetry`
01869   :param ent_2: See :func:`_CheckClosedSymmetry`
01870   :param chem_mapping: See :func:`_CheckClosedSymmetry`
01871   :param transformation: Superposition transformation to be applied to *ent_1*.
01872   :param sup_thr: See :func:`_CheckClosedSymmetry`
01873   :param sup_fract: See :func:`_CheckClosedSymmetry`
01874   """
01875   # NOTE: this seems to be the comput. most expensive part in QS scoring
01876   # -> it could be moved to C++ for speed-up
01877   # -> the next expensive bits are ClustalW and GetContacts btw...
01878 
01879   # swap if needed (ent_1 must be shorter or same)
01880   if ent_1.chain_count > ent_2.chain_count:
01881     swap = True
01882     ent_1, ent_2 = ent_2, ent_1
01883     transformation = geom.Invert(transformation)
01884     chem_pairs = zip(chem_mapping.values(), chem_mapping.keys())
01885   else:
01886     swap = False
01887     chem_pairs = chem_mapping.iteritems()
01888   # sanity check
01889   if ent_1.chain_count == 0:
01890     return None
01891   # extract valid partners for each chain in ent_1
01892   chem_partners = dict()
01893   for cg1, cg2 in chem_pairs:
01894     for ch in cg1:
01895       chem_partners[ch] = set(cg2)
01896   # get mapping for each chain in ent_1
01897   mapping = dict()
01898   mapped_chains = set()
01899   for ch_1 in ent_1.chains:
01900     # get all neighbors split by chain (NOTE: this muight be moved to C++)
01901     ch_atoms = {ch_2.name: set() for ch_2 in ent_2.chains}
01902     for a_1 in ch_1.handle.atoms:
01903       transformed_pos = geom.Vec3(transformation * geom.Vec4(a_1.pos))
01904       a_within = ent_2.FindWithin(transformed_pos, sup_thr)
01905       for a_2 in a_within:
01906         ch_atoms[a_2.chain.name].add(a_2.hash_code)
01907     # get one with most atoms in overlap
01908     max_num, max_name = max((len(atoms), name)
01909                             for name, atoms in ch_atoms.iteritems())
01910     # early abort if overlap fraction not good enough
01911     ch_2 = ent_2.FindChain(max_name)
01912     if max_num == 0 or max_num / float(ch_2.handle.atom_count) < sup_fract:
01913       return None
01914     # early abort if mapped to chain of different chem. group
01915     ch_1_name = ch_1.name
01916     if ch_1_name not in chem_partners:
01917       raise RuntimeError("Chem. mapping doesn't contain all chains!")
01918     if max_name not in chem_partners[ch_1_name]:
01919       return None
01920     # early abort if multiple ones mapped to same chain
01921     if max_name in mapped_chains:
01922       return None
01923     # set mapping
01924     mapping[ch_1_name] = max_name
01925     mapped_chains.add(max_name)
01926   # unswap if needed and return
01927   if swap:
01928     mapping = {v: k for k, v in mapping.iteritems()}
01929   return mapping
01930 
01931 def _GetMappedRMSD(ent_1, ent_2, chain_mapping, transformation):
01932   """
01933   :return: RMSD between complexes considering chain mapping.
01934   :param ent_1: Entity mapped to *ent_2* (perfect alignment assumed between
01935                 mapped chains as in :attr:`QSscorer.ent_to_cm_1`).
01936   :param ent_2: Entity which was mapped to (perfect alignment assumed between
01937                 mapped chains as in :attr:`QSscorer.ent_to_cm_2`).
01938   :param chain_mapping: See :attr:`QSscorer.chain_mapping`
01939   :param transformation: Superposition transformation to be applied to *ent_1*.
01940   """
01941   # collect RMSDs and atom counts chain by chain and combine afterwards
01942   rmsds = []
01943   atoms = []
01944   for c1, c2 in chain_mapping.iteritems():
01945     # get views and atom counts
01946     chain_1 = ent_1.Select('cname=%s' % c1)
01947     chain_2 = ent_2.Select('cname=%s' % c2)
01948     atom_count = chain_1.atom_count
01949     if atom_count != chain_2.atom_count:
01950       raise RuntimeError('Chains in _GetMappedRMSD must be perfectly aligned!')
01951     # get RMSD
01952     rmsd = mol.alg.CalculateRMSD(chain_1, chain_2, transformation)
01953     # update lists
01954     rmsds.append(rmsd)
01955     atoms.append(float(atom_count))
01956   # combine (reminder: RMSD = sqrt(sum(atom_distance^2)/atom_count))
01957   rmsd = np.sqrt( sum([a * r**2 for a, r in zip(atoms, rmsds)]) / sum(atoms) )
01958   return rmsd
01959 
01960 class _CachedRMSD:
01961   """Helper object for repetitive RMSD calculations.
01962   Meant to speed up :func:`_GetChainMapping` but could also be used to replace
01963   :func:`_GetMappedRMSD` in :func:`_CheckClosedSymmetry`.
01964 
01965   :param ent_1: See :attr:`QSscorer.ent_to_cm_1`
01966   :param ent_2: See :attr:`QSscorer.ent_to_cm_2`
01967   """
01968   def __init__(self, ent_1, ent_2):
01969     # initialize caches and keep entities
01970     self.ent_1 = ent_1
01971     self.ent_2 = ent_2
01972     self._chain_views_1 = dict()
01973     self._chain_views_2 = dict()
01974     self._pair_rmsd = dict()  # key = (c1,c2), value = (atom_count,rmsd)
01975 
01976   def GetChainView1(self, cname):
01977     """Get cached view on chain *cname* for :attr:`ent_1`."""
01978     if cname not in self._chain_views_1:
01979       self._chain_views_1[cname] = self.ent_1.Select('cname=%s' % cname)
01980     return self._chain_views_1[cname]
01981 
01982   def GetChainView2(self, cname):
01983     """Get cached view on chain *cname* for :attr:`ent_2`."""
01984     if cname not in self._chain_views_2:
01985       self._chain_views_2[cname] = self.ent_2.Select('cname=%s' % cname)
01986     return self._chain_views_2[cname]
01987 
01988   def GetSuperposition(self, c1, c2):
01989     """Get superposition result (no change in entities!) for *c1* to *c2*.
01990     This invalidates cached RMSD results used in :func:`GetMappedRMSD`.
01991 
01992     :param c1: chain name for :attr:`ent_1`.
01993     :param c2: chain name for :attr:`ent_2`.
01994     """
01995     # invalidate _pair_rmsd
01996     self._pair_rmsd = dict()
01997     # superpose
01998     chain_1 = self.GetChainView1(c1)
01999     chain_2 = self.GetChainView2(c2)
02000     if chain_1.atom_count != chain_2.atom_count:
02001       raise RuntimeError('Chains in GetSuperposition not perfectly aligned!')
02002     return mol.alg.SuperposeSVD(chain_1, chain_2, apply_transform=False)
02003 
02004   def GetMappedRMSD(self, chain_mapping, transformation):
02005     """
02006     :return: RMSD between complexes considering chain mapping.
02007     :param chain_mapping: See :attr:`QSscorer.chain_mapping`.
02008     :param transformation: Superposition transformation (e.g. res.transformation
02009                            for res = :func:`GetSuperposition`).
02010     """
02011     # collect RMSDs and atom counts chain by chain and combine afterwards
02012     rmsds = []
02013     atoms = []
02014     for c1, c2 in chain_mapping.iteritems():
02015       # cached?
02016       if (c1, c2) in self._pair_rmsd:
02017         atom_count, rmsd = self._pair_rmsd[(c1, c2)]
02018       else:
02019         # get views and atom counts
02020         chain_1 = self.GetChainView1(c1)
02021         chain_2 = self.GetChainView2(c2)
02022         atom_count = chain_1.atom_count
02023         if atom_count != chain_2.atom_count:
02024           raise RuntimeError('Chains in GetMappedRMSD not perfectly aligned!')
02025         # get RMSD
02026         rmsd = mol.alg.CalculateRMSD(chain_1, chain_2, transformation)
02027         self._pair_rmsd[(c1, c2)] = (atom_count, rmsd)
02028       # update lists
02029       rmsds.append(rmsd)
02030       atoms.append(float(atom_count))
02031     # combine (reminder: RMSD = sqrt(sum(atom_distance^2)/atom_count))
02032     rmsd = np.sqrt( sum([a * r**2 for a, r in zip(atoms, rmsds)]) / sum(atoms) )
02033     return rmsd
02034 
02035 
02036 def _CleanUserSymmetry(symm, ent):
02037   """Clean-up user provided symmetry.
02038 
02039   :param symm: Symmetry group as in :attr:`QSscorer.symm_1`
02040   :param ent: Entity from which to extract chain names
02041   :type ent:  :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
02042   :return: New symmetry group limited to chains in sub-structure *ent*
02043            (see :attr:`QSscorer.symm_1`). Empty list if invalid symmetry.
02044   """
02045   # restrict symm to only contain chains in ent
02046   chain_names = set([ch.name for ch in ent.chains])
02047   new_symm = list()
02048   for symm_group in symm:
02049     new_group = tuple(ch for ch in symm_group if ch in chain_names)
02050     if new_group:
02051       new_symm.append(new_group)
02052   # report it
02053   if new_symm != symm:
02054     LogInfo("Cleaned user symmetry for %s: %s" % (ent.GetName(), new_symm))
02055   # do all groups have same length?
02056   lengths = [len(symm_group) for symm_group in new_symm]
02057   if lengths[1:] != lengths[:-1]:
02058     LogWarning('User symmetry group of different sizes for %s. Ignoring it!' \
02059                % (ent.GetName()))
02060     return []
02061   # do we cover all chains?
02062   s_chain_names = set([ch for symm_group in new_symm for ch in symm_group])
02063   if len(s_chain_names) != sum(lengths):
02064     LogWarning('User symmetry group for %s has duplicate chains. Ignoring it!' \
02065                % (ent.GetName()))
02066     return []
02067   if s_chain_names != chain_names:
02068     LogWarning('User symmetry group for %s is missing chains. Ignoring it!' \
02069                % (ent.GetName()))
02070     return []
02071   # ok all good
02072   return new_symm
02073 
02074 def _AreValidSymmetries(symm_1, symm_2):
02075   """Check symmetry pair for major problems.
02076 
02077   :return: False if any of the two is empty or if they're compatible in size.
02078   :rtype:  :class:`bool`
02079   """
02080   if not symm_1 or not symm_2:
02081     return False
02082   if len(symm_1) != 1 or len(symm_2) != 1:
02083     if not len(symm_1[0]) == len(symm_2[0]):
02084       LogWarning('Symmetry groups of different sizes. Ignoring them!')
02085       return False
02086   return True
02087 
02088 def _GetMappedAlignments(ent_1, ent_2, chain_mapping):
02089   """
02090   :return: Alignments of 2 structures given chain mapping
02091            (see :attr:`QSscorer.alignments`).
02092   :param ent_1: Entity containing all chains in *chain_mapping.keys()*.
02093                 Views to this entity attached to first sequence of each aln.
02094   :param ent_2: Entity containing all chains in *chain_mapping.values()*.
02095                 Views to this entity attached to second sequence of each aln.
02096   :param chain_mapping: See :attr:`QSscorer.chain_mapping`
02097   """
02098   alns = []
02099   for ch_1_name in sorted(chain_mapping):
02100     # get both sequences incl. attached view
02101     ch_1 = ent_1.FindChain(ch_1_name)
02102     seq_1 = seq.SequenceFromChain(ch_1.name, ch_1)
02103     ch_2 = ent_2.FindChain(chain_mapping[ch_1_name])
02104     seq_2 = seq.SequenceFromChain(ch_2.name, ch_2)
02105     # align them
02106     aln = _AlignAtomSeqs(seq_1, seq_2)
02107     if aln: alns.append(aln)
02108   return alns
02109 
02110 def _GetMappedResidues(alns):
02111   """
02112   :return: Mapping of shared residues in *alns* (with views attached)
02113            (see :attr:`QSscorer.mapped_residues`).
02114   :param alns: See :attr:`QSscorer.alignments`
02115   """
02116   mapped_residues = dict()
02117   for aln in alns:
02118     # prepare dict for c1
02119     c1 = aln.GetSequence(0).name
02120     mapped_residues[c1] = dict()
02121     # get shared residues
02122     v1, v2 = seq.ViewsFromAlignment(aln)
02123     for res_1, res_2 in zip(v1.residues, v2.residues):
02124       r1 = res_1.number.num
02125       r2 = res_2.number.num
02126       mapped_residues[c1][r1] = r2
02127 
02128   return mapped_residues
02129 
02130 def _GetExtraWeights(contacts, done, mapped_residues):
02131   """Return sum of extra weights for contacts of chains in set and not in done.
02132   :return: Tuple (weight_extra_mapped, weight_extra_all).
02133            weight_extra_mapped only sums if both cX,rX in mapped_residues
02134            weight_extra_all sums all
02135   :param contacts: See :func:`GetContacts` for first entity
02136   :param done: List of (c1, c2, r1, r2) tuples to ignore
02137   :param mapped_residues: See :func:`_GetMappedResidues`
02138   """
02139   weight_extra_mapped = 0
02140   weight_extra_non_mapped = 0
02141   for c1 in contacts:
02142     for c2 in contacts[c1]:
02143       for r1 in contacts[c1][c2]:
02144         for r2 in contacts[c1][c2][r1]:
02145           if (c1, c2, r1, r2) not in done:
02146             weight = _weight(contacts[c1][c2][r1][r2])
02147             if     c1 in mapped_residues and r1 in mapped_residues[c1] \
02148                and c2 in mapped_residues and r2 in mapped_residues[c2]:
02149               weight_extra_mapped += weight
02150             else:
02151               weight_extra_non_mapped += weight
02152   return weight_extra_mapped, weight_extra_mapped + weight_extra_non_mapped
02153 
02154 def _GetScores(contacts_1, contacts_2, mapped_residues, chain_mapping):
02155   """Get QS scores (see :class:`QSscorer`).
02156 
02157   Note that if some chains are not to be considered at all, they must be removed
02158   from *contacts_1* / *contacts_2* prior to calling this.
02159 
02160   :param contacts_1: See :func:`GetContacts` for first entity
02161   :param contacts_2: See :func:`GetContacts` for second entity
02162   :param mapped_residues: See :func:`_GetMappedResidues`
02163   :param chain_mapping: Maps any chain name in *mapped_residues* to chain name
02164                         for *contacts_2*.
02165   :type chain_mapping:  :class:`dict` (:class:`str` / :class:`str`)
02166   :return: Tuple (QS_best, QS_global) (see :attr:`QSscorer.best_score` and
02167            see :attr:`QSscorer.global_score`)
02168   """
02169   # keep track of considered contacts as set of (c1,c2,r1,r2) tuples
02170   done_1 = set()
02171   done_2 = set()
02172   weighted_scores = 0
02173   weight_sum = 0
02174   # naming cXY, rXY: X = 1/2 for contact, Y = 1/2/T for mapping (T = tmp)
02175   # -> d1 = contacts_1[c11][c21][r11][r21], d2 = contacts_2[c12][c22][r12][r22]
02176   for c11 in contacts_1:
02177     # map to other one
02178     if c11 not in mapped_residues: continue
02179     c1T = chain_mapping[c11]
02180     # second chain
02181     for c21 in contacts_1[c11]:
02182       # map to other one
02183       if c21 not in mapped_residues: continue
02184       c2T = chain_mapping[c21]
02185       # flip if needed
02186       flipped_chains = (c1T > c2T)
02187       if flipped_chains:
02188         c12, c22 = c2T, c1T
02189       else:
02190         c12, c22 = c1T, c2T
02191       # skip early if no contacts there
02192       if c12 not in contacts_2 or c22 not in contacts_2[c12]: continue
02193       # loop over res.num. in c11
02194       for r11 in contacts_1[c11][c21]:
02195         # map to other one and skip if no contacts there
02196         if r11 not in mapped_residues[c11]: continue
02197         r1T = mapped_residues[c11][r11]
02198         # loop over res.num. in c21
02199         for r21 in contacts_1[c11][c21][r11]:
02200           # map to other one and skip if no contacts there
02201           if r21 not in mapped_residues[c21]: continue
02202           r2T = mapped_residues[c21][r21]
02203           # flip if needed
02204           if flipped_chains:
02205             r12, r22 = r2T, r1T
02206           else:
02207             r12, r22 = r1T, r2T
02208           # skip early if no contacts there
02209           if r12 not in contacts_2[c12][c22]: continue
02210           if r22 not in contacts_2[c12][c22][r12]: continue
02211           # ok now we can finally do the scoring
02212           d1 = contacts_1[c11][c21][r11][r21]
02213           d2 = contacts_2[c12][c22][r12][r22]
02214           weight = _weight(min(d1, d2))
02215           weighted_scores += weight * (1 - abs(d1 - d2) / 12)
02216           weight_sum += weight
02217           # keep track of done ones
02218           done_1.add((c11, c21, r11, r21))
02219           done_2.add((c12, c22, r12, r22))
02220 
02221   LogVerbose("Shared contacts: %d" % len(done_1))
02222 
02223   # add the extra weights
02224   weights_extra_1 = _GetExtraWeights(contacts_1, done_1, mapped_residues)
02225   mapped_residues_2 = dict()
02226   for c1 in mapped_residues:
02227     c2 = chain_mapping[c1]
02228     mapped_residues_2[c2] = set()
02229     for r1 in mapped_residues[c1]:
02230       mapped_residues_2[c2].add(mapped_residues[c1][r1])
02231   weights_extra_2 = _GetExtraWeights(contacts_2, done_2, mapped_residues_2)
02232   weight_extra_mapped = weights_extra_1[0] + weights_extra_2[0]
02233   weight_extra_all    = weights_extra_1[1] + weights_extra_2[1]
02234   
02235   # get scores
02236   denom_best = weight_sum + weight_extra_mapped
02237   denom_all  = weight_sum + weight_extra_all
02238   if denom_best == 0:
02239     QS_best = 0
02240   else:
02241     QS_best = weighted_scores / denom_best
02242   if denom_all == 0:
02243     QS_global = 0
02244   else:
02245     QS_global = weighted_scores / denom_all
02246   return QS_best, QS_global
02247 
02248 def _weight(dist):
02249   """
02250   This weight expresses the probability of two residues to interact given the CB
02251   distance (from Xu et al. http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2573399/)
02252   """
02253   if dist <= 5.0:
02254     return 1
02255   x = (dist-5.0)/4.28;
02256   return np.exp(-2*x*x)
02257 
02258 
02259 def _GetQsSuperposition(alns):
02260   """
02261   :return: Superposition result based on shared CA atoms in *alns*
02262            (with views attached) (see :attr:`QSscorer.superposition`).
02263   :param alns: See :attr:`QSscorer.alignments`
02264   """
02265   # check input
02266   if not alns:
02267     raise QSscoreError('No successful alignments for superposition!')
02268   # prepare views
02269   view_1 = alns[0].GetSequence(0).attached_view.CreateEmptyView()
02270   view_2 = alns[0].GetSequence(1).attached_view.CreateEmptyView()
02271   # collect CA from alignments in proper order
02272   for aln in alns:
02273     for col in aln:
02274       res_1 = col.GetResidue(0)
02275       res_2 = col.GetResidue(1)
02276       if res_1.IsValid() and res_2.IsValid():
02277         ca_1 = res_1.FindAtom('CA')
02278         ca_2 = res_2.FindAtom('CA')
02279         if ca_1.IsValid() and ca_2.IsValid():
02280           view_1.AddAtom(ca_1)
02281           view_2.AddAtom(ca_2)
02282   # superpose them without chainging entities
02283   res = mol.alg.SuperposeSVD(view_1, view_2, apply_transform=False)
02284   return res
02285 
02286 
02287 def _AddResidue(edi, res, rnum, chain, calpha_only):
02288   """
02289   Add residue *res* with res. num. *run* to given *chain* using editor *edi*.
02290   Either all atoms added or (if *calpha_only*) only CA.
02291   """
02292   if calpha_only:
02293     ca_atom = res.FindAtom('CA')
02294     if ca_atom.IsValid():
02295       new_res = edi.AppendResidue(chain, res.name, rnum)
02296       edi.InsertAtom(new_res, ca_atom.name, ca_atom.pos)
02297   else:
02298     new_res = edi.AppendResidue(chain, res.name, rnum)
02299     for atom in res.atoms:
02300       edi.InsertAtom(new_res, atom.name, atom.pos)
02301 
02302 def _MergeAlignedChains(alns, ent_1, ent_2, calpha_only):
02303   """
02304   Create two new entities (based on the alignments attached views) where all
02305   residues have same numbering (when they're aligned) and they are all pushed to
02306   a single chain X. Also append extra chains contained in *ent_1* and *ent_2*
02307   but not contained in *alns*.
02308 
02309   Used for :attr:`QSscorer.lddt_ref` and :attr:`QSscorer.lddt_mdl`
02310 
02311   :param alns: List of alignments with attached views (first sequence: *ent_1*,
02312                second: *ent_2*). Residue number in single chain is column index
02313                of current alignment + sum of lengths of all previous alignments
02314                (order of alignments as in input list).
02315   :type alns:  See :attr:`QSscorer.alignments`
02316   :param ent_1: First entity to process.
02317   :type ent_1:  :class:`~ost.mol.EntityHandle`
02318   :param ent_2: Second entity to process.
02319   :type ent_2:  :class:`~ost.mol.EntityHandle`
02320   :param calpha_only: If True, we only include CA atoms instead of all.
02321   :type calpha_only:  :class:`bool`
02322 
02323   :return: Tuple of two single chain entities (from *ent_1* and from *ent_2*)
02324   :rtype:  :class:`tuple` of :class:`~ost.mol.EntityHandle`
02325   """
02326   # first new entity
02327   ent_ren_1 = mol.CreateEntity()
02328   ed_1 = ent_ren_1.EditXCS()
02329   new_chain_1 = ed_1.InsertChain('X')
02330   # second one
02331   ent_ren_2 = mol.CreateEntity()
02332   ed_2 = ent_ren_2.EditXCS()
02333   new_chain_2 = ed_2.InsertChain('X')
02334   # the alignment already contains sorted chains
02335   rnum = 0
02336   chain_done_1 = set()
02337   chain_done_2 = set()
02338   for aln in alns:
02339     chain_done_1.add(aln.GetSequence(0).name)
02340     chain_done_2.add(aln.GetSequence(1).name)
02341     for col in aln:
02342       rnum += 1
02343       # add valid residues to single chain entities
02344       res_1 = col.GetResidue(0)
02345       if res_1.IsValid():
02346         _AddResidue(ed_1, res_1, rnum, new_chain_1, calpha_only)
02347       res_2 = col.GetResidue(1)
02348       if res_2.IsValid():
02349         _AddResidue(ed_2, res_2, rnum, new_chain_2, calpha_only)
02350   # extra chains
02351   for chain in ent_1.chains:
02352     if chain.name in chain_done_1:
02353       continue
02354     for res in chain.residues:
02355       rnum += 1
02356       _AddResidue(ed_1, res, rnum, new_chain_1, calpha_only)
02357   for chain in ent_2.chains:
02358     if chain.name in chain_done_2:
02359       continue
02360     for res in chain.residues:
02361       rnum += 1
02362       _AddResidue(ed_2, res, rnum, new_chain_2, calpha_only)
02363   # get entity names
02364   ent_ren_1.SetName(aln.GetSequence(0).GetAttachedView().GetName())
02365   ent_ren_2.SetName(aln.GetSequence(1).GetAttachedView().GetName())
02366   # connect atoms
02367   if not conop.GetDefaultLib():
02368     raise RuntimeError("QSscore computation requires a compound library!")
02369   pr = conop.RuleBasedProcessor(conop.GetDefaultLib())
02370   pr.Process(ent_ren_1)  
02371   ed_1.UpdateICS()
02372   pr.Process(ent_ren_2)  
02373   ed_2.UpdateICS()
02374   return ent_ren_1, ent_ren_2
02375 
02376 def _ComputeLDDTScore(ref, mdl):
02377   """
02378   :return: lDDT of *mdl* vs *ref* (see :attr:`QSscorer.lddt_score`).
02379   :param mdl: Reference entity (see :attr:`QSscorer.lddt_mdl`)
02380   :param ref: Model entity (see :attr:`QSscorer.lddt_ref`)
02381   """
02382   # check input
02383   LogInfo('Reference %s has: %s residues' % (ref.GetName(), ref.residue_count))
02384   LogInfo('Model %s has: %s residues' % (mdl.GetName(), mdl.residue_count))
02385   # get lddt score with fixed settings
02386   lddt_score = mol.alg.LocalDistDiffTest(mdl.Select(''), ref.Select(''),
02387                                          2., 8., 'lddt')
02388   LogInfo('lDDT score: %.3f' % lddt_score)
02389   # add lDDT as B-factor to model
02390   for r in mdl.residues:
02391     if r.HasProp('lddt'):
02392       for a in r.atoms:
02393         a.SetBFactor(r.GetFloatProp('lddt'))
02394     else:
02395       for a in r.atoms:
02396         a.SetBFactor(0.0)
02397   
02398   return lddt_score
02399 
02400 # specify public interface
02401 __all__ = ('QSscoreError', 'QSscorer', 'QSscoreEntity', 'FilterContacts',
02402            'GetContacts')