OpenStructure: stage/lib64/python2.7/site-packages/ost/mol/alg/qsscoring.py Source File

00001 """
00002 Scoring of quaternary structures as in Martino's 2017 paper.
00003 
00004 .. note ::
00005 
00006   Requirements for use:
00007 
00008   - A default :class:`compound library <ost.conop.CompoundLib>` must be defined
00009     and accessible via :func:`~ost.conop.GetDefaultLib`. This is set by default
00010     when executing scripts with ``ost``. Otherwise, you must set this with
00011     :func:`~ost.conop.SetDefaultLib`.
00012   - ClustalW must be installed (unless you provide chain mappings)
00013   - Python modules `numpy` and `scipy` must be installed and available
00014     (e.g. use ``pip install scipy numpy``)
00015 
00016 Authors: Gerardo Tauriello, Martino Bertoni
00017 """
00018 
00019 from ost import mol, geom, conop, seq, settings
00020 from ost import LogError, LogWarning, LogScript, LogInfo, LogVerbose, LogDebug
00021 from ost.bindings.clustalw import ClustalW
00022 import numpy as np
00023 from scipy.misc import factorial
00024 from scipy.special import binom
00025 from scipy.cluster.hierarchy import fclusterdata
00026 import itertools
00027 
00028 ###############################################################################
00029 # QS scoring
00030 ###############################################################################
00031 
00032 class QSscoreError(Exception):
00033   """Exception to be raised for "acceptable" exceptions in QS scoring.
00034 
00035   Those are cases we might want to capture for default behavior.
00036   """
00037   pass
00038 
00039 class QSscorer:
00040   """Object to compute QS scores.
00041 
00042   Simple usage without any precomputed contacts, symmetries and mappings:
00043 
00044   .. code-block:: python
00045 
00046     import ost
00047     from ost.mol.alg import qsscoring
00048 
00049     # load two biounits to compare
00050     ent_full = ost.io.LoadPDB('3ia3', remote=True)
00051     ent_1 = ent_full.Select('cname=A,D')
00052     ent_2 = ent_full.Select('cname=B,C')
00053     # get score
00054     ost.PushVerbosityLevel(3)
00055     try:
00056       qs_scorer = qsscoring.QSscorer(ent_1, ent_2)
00057       ost.LogScript('QSscore:', str(qs_scorer.global_score))
00058       ost.LogScript('Chain mapping used:', str(qs_scorer.chain_mapping))
00059       # commonly you want the QS global score as output
00060       qs_score = qs_scorer.global_score
00061     except qsscoring.QSscoreError as ex:
00062       # default handling: report failure and set score to 0
00063       ost.LogError('QSscore failed:', str(ex))
00064       qs_score = 0
00065 
00066   For maximal performance when computing QS scores of the same entity with many
00067   others, it is advisable to construct and reuse :class:`QSscoreEntity` objects.
00068 
00069   Any known / precomputed information can be filled into the appropriate
00070   attribute here (no checks done!). Otherwise most quantities are computed on
00071   first access and cached (lazy evaluation). Setters are provided to set values
00072   with extra checks (e.g. :func:`SetSymmetries`).
00073 
00074   All necessary seq. alignments are done by global BLOSUM62-based alignment. A
00075   multiple sequence alignment is performed with ClustalW unless
00076   :attr:`chain_mapping` is provided manually. You will need to have an
00077   executable ``clustalw`` or ``clustalw2`` in your ``PATH`` or you must set
00078   :attr:`clustalw_bin` accordingly. Otherwise an exception
00079   (:class:`ost.settings.FileNotFound`) is thrown.
00080 
00081   Formulas for QS scores:
00082 
00083   ::
00084   
00085     - QS_best = weighted_scores / (weight_sum + weight_extra_mapped)
00086     - QS_global = weighted_scores / (weight_sum + weight_extra_all)
00087     -> weighted_scores = sum(w(min(d1,d2)) * (1 - abs(d1-d2)/12)) for shared
00088     -> weight_sum = sum(w(min(d1,d2))) for shared
00089     -> weight_extra_mapped = sum(w(d)) for all mapped but non-shared
00090     -> weight_extra_all = sum(w(d)) for all non-shared
00091     -> w(d) = 1 if d <= 5, exp(-2 * ((d-5.0)/4.28)^2) else
00092 
00093   :param ent_1: First structure to be scored.
00094   :type ent_1:  :class:`QSscoreEntity`, :class:`~ost.mol.EntityHandle` or
00095                 :class:`~ost.mol.EntityView`
00096   :param ent_2: Second structure to be scored.
00097   :type ent_2:  :class:`QSscoreEntity`, :class:`~ost.mol.EntityHandle` or
00098                 :class:`~ost.mol.EntityView`
00099 
00100   :raises: :class:`QSscoreError` if input structures are invalid or are monomers
00101            or have issues that make it impossible for a QS score to be computed.
00102 
00103   .. attribute:: qs_ent_1
00104 
00105     :class:`QSscoreEntity` object for *ent_1* given at construction.
00106     If entity names (:attr:`~QSscoreEntity.original_name`) are not unique, we
00107     set it to 'pdb_1' using :func:`~QSscoreEntity.SetName`.
00108 
00109   .. attribute:: qs_ent_2
00110 
00111     :class:`QSscoreEntity` object for *ent_2* given at construction.
00112     If entity names (:attr:`~QSscoreEntity.original_name`) are not unique, we
00113     set it to 'pdb_2' using :func:`~QSscoreEntity.SetName`.
00114 
00115   .. attribute:: calpha_only
00116 
00117     True if any of the two structures is CA-only (after cleanup).
00118 
00119     :type: :class:`bool`
00120 
00121   .. attribute:: max_ca_per_chain_for_cm
00122 
00123     Maximal number of CA atoms to use in each chain to determine chain mappings.
00124     Setting this to -1 disables the limit. Limiting it speeds up determination
00125     of symmetries and chain mappings. By default it is set to 100.
00126 
00127     :type: :class:`int`
00128   """
00129   def __init__(self, ent_1, ent_2):
00130     # generate QSscoreEntity objects?
00131     if isinstance(ent_1, QSscoreEntity):
00132       self.qs_ent_1 = ent_1
00133     else:
00134       self.qs_ent_1 = QSscoreEntity(ent_1)
00135     if isinstance(ent_2, QSscoreEntity):
00136       self.qs_ent_2 = ent_2
00137     else:
00138       self.qs_ent_2 = QSscoreEntity(ent_2)
00139     # check validity of inputs
00140     if not self.qs_ent_1.is_valid or not self.qs_ent_2.is_valid:
00141       raise QSscoreError("Invalid input in QSscorer!")
00142     # set names for structures
00143     if self.qs_ent_1.original_name == self.qs_ent_2.original_name:
00144       self.qs_ent_1.SetName('pdb_1')
00145       self.qs_ent_2.SetName('pdb_2')
00146     else:
00147       self.qs_ent_1.SetName(self.qs_ent_1.original_name)
00148       self.qs_ent_2.SetName(self.qs_ent_2.original_name)
00149     # set other public attributes
00150     self.calpha_only = self.qs_ent_1.calpha_only or self.qs_ent_2.calpha_only
00151     self.max_ca_per_chain_for_cm = 100
00152     # init cached stuff
00153     self._chem_mapping = None
00154     self._ent_to_cm_1 = None
00155     self._ent_to_cm_2 = None
00156     self._symm_1 = None
00157     self._symm_2 = None
00158     self._chain_mapping = None
00159     self._alignments = None
00160     self._mapped_residues = None
00161     self._global_score = None
00162     self._best_score = None
00163     self._superposition = None
00164     self._lddt_score = None
00165     self._lddt_mdl = None
00166     self._lddt_ref = None
00167     self._clustalw_bin = None
00168 
00169   @property
00170   def chem_mapping(self):
00171     """Inter-complex mapping of chemical groups.
00172 
00173     Each group (see :attr:`QSscoreEntity.chem_groups`) is mapped according to
00174     highest sequence identity. Alignment is between longest sequences in groups.
00175 
00176     Limitations:
00177 
00178     - If different numbers of groups, we map only the groups for the complex
00179       with less groups (rest considered unmapped and shown as warning)
00180     - The mapping is forced: the "best" mapping will be chosen independently of
00181       how low the seq. identity may be
00182 
00183     :getter: Computed on first use (cached)
00184     :type: :class:`dict` with key = :class:`tuple` of chain names in
00185            :attr:`qs_ent_1` and value = :class:`tuple` of chain names in
00186            :attr:`qs_ent_2`.
00187 
00188     :raises: :class:`QSscoreError` if we end up having less than 2 chains for
00189              either entity in the mapping (can happen if chains do not have CA
00190              atoms).
00191     """
00192     if self._chem_mapping is None:
00193       self._chem_mapping = _GetChemGroupsMapping(self.qs_ent_1, self.qs_ent_2)
00194     return self._chem_mapping
00195 
00196   @property
00197   def ent_to_cm_1(self):
00198     """Subset of :attr:`qs_ent_1` used to compute chain mapping and symmetries.
00199 
00200     Properties:
00201 
00202     - Includes only residues aligned according to :attr:`chem_mapping`
00203     - Includes only 1 CA atom per residue
00204     - Has at least 5 and at most :attr:`max_ca_per_chain_for_cm` atoms per chain
00205     - All chains of the same chemical group have the same number of atoms
00206       (also in :attr:`ent_to_cm_2` according to :attr:`chem_mapping`)
00207     - All chains appearing in :attr:`chem_mapping` appear in this entity
00208       (so the two can be safely used together)
00209 
00210     This entity might be transformed (i.e. all positions rotated/translated by
00211     same transformation matrix) if this can speed up computations. So do not
00212     assume fixed global positions (but relative distances will remain fixed).
00213 
00214     :getter: Computed on first use (cached)
00215     :type: :class:`~ost.mol.EntityHandle`
00216 
00217     :raises: :class:`QSscoreError` if any chain ends up having less than 5 res.
00218     """
00219     if self._ent_to_cm_1 is None:
00220       self._ComputeAlignedEntities()
00221     return self._ent_to_cm_1
00222 
00223   @property
00224   def ent_to_cm_2(self):
00225     """Subset of :attr:`qs_ent_1` used to compute chain mapping and symmetries
00226     (see :attr:`ent_to_cm_1` for details).
00227     """
00228     if self._ent_to_cm_2 is None:
00229       self._ComputeAlignedEntities()
00230     return self._ent_to_cm_2
00231 
00232   @property
00233   def symm_1(self):
00234     """Symmetry groups for :attr:`qs_ent_1` used to speed up chain mapping.
00235 
00236     This is a list of chain-lists where each chain-list can be used reconstruct
00237     the others via cyclic C or dihedral D symmetry. The first chain-list is used
00238     as a representative symmetry group. For heteromers, the group-members must
00239     contain all different seqres in oligomer.
00240 
00241     Example: symm. groups [(A,B,C), (D,E,F), (G,H,I)] means that there are
00242     symmetry transformations to get (D,E,F) and (G,H,I) from (A,B,C).
00243 
00244     Properties:
00245 
00246     - All symmetry group tuples have the same length (num. of chains)
00247     - All chains in :attr:`ent_to_cm_1` appear (w/o duplicates)
00248     - For heteros: symmetry group tuples have all different chem. groups
00249     - Trivial symmetry group = one tuple with all chains (used if inconsistent
00250       data provided or if no symmetry is found)
00251     - Either compatible to :attr:`symm_2` or trivial symmetry groups used.
00252       Compatibility requires same lengths of symmetry group tuples and it must
00253       be possible to get an overlap (80% of residues covered within 6 A of a
00254       (chem. mapped) chain) of all chains in representative symmetry groups by
00255       superposing one pair of chains.
00256 
00257     :getter: Computed on first use (cached)
00258     :type: :class:`list` of :class:`tuple` of :class:`str` (chain names)
00259     """
00260     if self._symm_1 is None:
00261       self._ComputeSymmetry()
00262     return self._symm_1
00263 
00264   @property
00265   def symm_2(self):
00266     """Symmetry groups for :attr:`qs_ent_2` (see :attr:`symm_1` for details)."""
00267     if self._symm_2 is None:
00268       self._ComputeSymmetry()
00269     return self._symm_2
00270 
00271   def SetSymmetries(self, symm_1, symm_2):
00272     """Set user-provided symmetry groups.
00273 
00274     These groups are restricted to chain names appearing in :attr:`ent_to_cm_1`
00275     and :attr:`ent_to_cm_2` respectively. They are only valid if they cover all
00276     chains and both *symm_1* and *symm_2* have same lengths of symmetry group
00277     tuples. Otherwise trivial symmetry group used (see :attr:`symm_1`).
00278 
00279     :param symm_1: Value to set for :attr:`symm_1`.
00280     :param symm_2: Value to set for :attr:`symm_2`.
00281     """
00282     # restrict chain names
00283     self._symm_1 = _CleanUserSymmetry(symm_1, self.ent_to_cm_1)
00284     self._symm_2 = _CleanUserSymmetry(symm_2, self.ent_to_cm_2)
00285     # check that we have reasonable symmetries set (fallback: all chains)
00286     if not _AreValidSymmetries(self._symm_1, self._symm_2):
00287       self._symm_1 = [tuple(ch.name for ch in self.ent_to_cm_1.chains)]
00288       self._symm_2 = [tuple(ch.name for ch in self.ent_to_cm_2.chains)]
00289 
00290   @property
00291   def chain_mapping(self):
00292     """Mapping from :attr:`ent_to_cm_1` to :attr:`ent_to_cm_2`.
00293 
00294     Properties:
00295 
00296     - Mapping is between chains of same chem. group (see :attr:`chem_mapping`)
00297     - Each chain can appear only once in mapping
00298     - All chains of complex with less chains are mapped
00299     - Symmetry (:attr:`symm_1`, :attr:`symm_2`) is taken into account
00300 
00301     Details on algorithms used to find mapping:
00302 
00303     - We try all pairs of chem. mapped chains within symmetry group and get
00304       superpose-transformation for them
00305     - First option: check for "sufficient overlap" of other chain-pairs
00306 
00307       - For each chain-pair defined above: apply superposition to full oligomer
00308         and map chains based on structural overlap
00309       - Structural overlap = X% of residues in second oligomer covered within Y
00310         Angstrom of a (chem. mapped) chain in first oligomer. We successively
00311         try (X,Y) = (80,4), (40,6) and (20,8) to be less and less strict in
00312         mapping (warning shown for most permissive one).
00313       - If multiple possible mappings are found, we choose the one which leads
00314         to the lowest multi-chain-RMSD given the superposition
00315 
00316     - Fallback option: try all mappings to find minimal multi-chain-RMSD
00317       (warning shown)
00318 
00319       - For each chain-pair defined above: apply superposition, try all (!)
00320         possible chain mappings (within symmetry group) and keep mapping with
00321         lowest multi-chain-RMSD
00322       - Repeat procedure above to resolve symmetry. Within the symmetry group we
00323         can use the chain mapping computed before and we just need to find which
00324         symmetry group in first oligomer maps to which in the second one. We
00325         again try all possible combinations...
00326       - Limitations:
00327         
00328         - Trying all possible mappings is a combinatorial nightmare (factorial).
00329           We throw an exception if too many combinations (e.g. octomer vs
00330           octomer with no usable symmetry)
00331         - The mapping is forced: the "best" mapping will be chosen independently
00332           of how badly they fit in terms of multi-chain-RMSD
00333         - As a result, such a forced mapping can lead to a large range of
00334           resulting QS scores. An extreme example was observed between 1on3.1
00335           and 3u9r.1, where :attr:`global_score` can range from 0.12 to 0.43
00336           for mappings with very similar multi-chain-RMSD.
00337 
00338     :getter: Computed on first use (cached)
00339     :type: :class:`dict` with key / value = :class:`str` (chain names, key
00340            for :attr:`ent_to_cm_1`, value for :attr:`ent_to_cm_2`)
00341     :raises: :class:`QSscoreError` if there are too many combinations to check
00342              to find a chain mapping.
00343     """
00344     if self._chain_mapping is None:
00345       self._chain_mapping = _GetChainMapping(self.ent_to_cm_1, self.ent_to_cm_2,
00346                                              self.symm_1, self.symm_2,
00347                                              self.chem_mapping)
00348       LogInfo('Mapping found: %s' % str(self._chain_mapping))
00349     return self._chain_mapping
00350 
00351   @property
00352   def alignments(self):
00353     """List of successful sequence alignments using :attr:`chain_mapping`.
00354 
00355     There will be one alignment for each mapped chain and they are ordered by
00356     their chain names in :attr:`qs_ent_1`.
00357 
00358     The sequences of the alignments have views attached into
00359     :attr:`QSscoreEntity.ent` of :attr:`qs_ent_1` and :attr:`qs_ent_2`.
00360 
00361     :getter: Computed on first use (cached)
00362     :type: :class:`list` of :class:`~ost.seq.AlignmentHandle`
00363     """
00364     if self._alignments is None:
00365       self._alignments = _GetMappedAlignments(self.qs_ent_1.ent,
00366                                               self.qs_ent_2.ent,
00367                                               self.chain_mapping)
00368     return self._alignments
00369 
00370   @property
00371   def mapped_residues(self):
00372     """Mapping of shared residues in :attr:`alignments`.
00373 
00374     :getter: Computed on first use (cached)
00375     :type: :class:`dict` *mapped_residues[c1][r1] = r2* with:
00376            *c1* = Chain name in first entity (= first sequence in aln),
00377            *r1* = Residue number in first entity,
00378            *r2* = Residue number in second entity
00379     """
00380     if self._mapped_residues is None:
00381       self._mapped_residues = _GetMappedResidues(self.alignments)
00382     return self._mapped_residues
00383 
00384   @property
00385   def global_score(self):
00386     """QS-score with penalties.
00387     
00388     The range of the score is between 0 (i.e. no interface residues are shared
00389     between biounits) and 1 (i.e. the interfaces are identical).
00390     
00391     The global QS-score is computed applying penalties when interface residues
00392     or entire chains are missing (i.e. anything that is not mapped in
00393     :attr:`mapped_residues` / :attr:`chain_mapping`) in one of the biounits.
00394 
00395     :getter: Computed on first use (cached)
00396     :type: :class:`float`
00397     """
00398     if self._global_score is None:
00399       self._ComputeScores()
00400     return self._global_score
00401 
00402   @property
00403   def best_score(self):
00404     """QS-score without penalties.
00405 
00406     Like :attr:`global_score`, but neglecting additional residues or chains in
00407     one of the biounits (i.e. the score is calculated considering only mapped
00408     chains and residues).
00409 
00410     :getter: Computed on first use (cached)
00411     :type: :class:`float`
00412     """
00413     if self._best_score is None:
00414       self._ComputeScores()
00415     return self._best_score
00416 
00417   @property
00418   def superposition(self):
00419     """Superposition result based on shared CA atoms in :attr:`alignments`.
00420 
00421     The superposition can be used to map :attr:`QSscoreEntity.ent` of
00422     :attr:`qs_ent_1` onto the one of :attr:`qs_ent_2`. Use
00423     :func:`ost.geom.Invert` if you need the opposite transformation.
00424 
00425     :getter: Computed on first use (cached)
00426     :type: :class:`ost.mol.alg.SuperpositionResult`
00427     """
00428     if self._superposition is None:
00429       self._superposition = _GetQsSuperposition(self.alignments)
00430       # report it
00431       sup_rmsd = self._superposition.rmsd
00432       cmp_view = self._superposition.view1
00433       LogInfo('CA RMSD for %s aligned residues on %s chains: %.2f' \
00434               % (cmp_view.residue_count, cmp_view.chain_count, sup_rmsd))
00435     return self._superposition
00436 
00437   @property
00438   def lddt_score(self):
00439     """The multi-chain lDDT score.
00440 
00441     .. note::
00442 
00443       lDDT is not considering over-prediction (i.e. extra chains) and hence is
00444       not symmetric. Here, we consider :attr:`qs_ent_1` as the reference and
00445       :attr:`qs_ent_2` as the model. The alignments from :attr:`alignments` are
00446       used to map residue numbers and chains.
00447 
00448     The score is computed with OST's :func:`~ost.mol.alg.LocalDistDiffTest`
00449     function with a single distance threshold of 2 A and an inclusion radius of
00450     8 A. You can use :attr:`lddt_mdl` and :attr:`lddt_ref` to get entities on
00451     which you can call any other lDDT function with any other set of parameters.
00452 
00453     :getter: Computed on first use (cached)
00454     :type: :class:`float`
00455     """
00456     if self._lddt_score is None:
00457       self._ComputeLDDT()
00458     return self._lddt_score
00459 
00460   @property
00461   def lddt_mdl(self):
00462     """The model entity used for lDDT scoring (:attr:`lddt_score`) and annotated
00463     with local scores.
00464 
00465     Local scores are available as residue properties named 'lddt' and on each
00466     atom as a B-factor. Only CA atoms are considered if :attr:`calpha_only` is
00467     True, otherwise this is an all-atom score.
00468     
00469     Since, the lDDT computation requires a single chain with mapped residue
00470     numbering, all chains are appended into a single chain X with unique residue
00471     numbers according to the column-index in the alignment. The alignments are
00472     in the same order as they appear in :attr:`alignments`. Additional residues
00473     are appended at the end of the chain with unique residue numbers.
00474 
00475     :getter: Computed on first use (cached)
00476     :type: :class:`~ost.mol.EntityHandle`
00477     """
00478     if self._lddt_mdl is None:
00479       self._ComputeLDDT()
00480     return self._lddt_mdl
00481 
00482   @property
00483   def lddt_ref(self):
00484     """The reference entity used for lDDT scoring (:attr:`lddt_score`).
00485 
00486     This is a single chain X with residue numbers matching ones in
00487     :attr:`lddt_mdl` where aligned and unique numbers for additional residues.
00488 
00489     :getter: Computed on first use (cached)
00490     :type: :class:`~ost.mol.EntityHandle`
00491     """
00492     if self._lddt_ref is None:
00493       self._ComputeLDDT()
00494     return self._lddt_ref
00495 
00496   @property
00497   def clustalw_bin(self):
00498     """
00499     Full path to ``clustalw`` or ``clustalw2`` executable to use for multiple
00500     sequence alignments (unless :attr:`chain_mapping` is provided manually).
00501 
00502     :getter: Located in path on first use (cached)
00503     :type: :class:`str`
00504     """
00505     if self._clustalw_bin is None:
00506       self._clustalw_bin = settings.Locate(('clustalw', 'clustalw2'))
00507     return self._clustalw_bin
00508 
00509   ##############################################################################
00510   # Class internal helpers (anything that doesnt easily work without this class)
00511   ##############################################################################
00512 
00513   def _ComputeAlignedEntities(self):
00514     """Fills cached ent_to_cm_1 and ent_to_cm_2."""
00515     # get aligned residues via MSA
00516     ev1, ev2 = _GetAlignedResidues(self.qs_ent_1, self.qs_ent_2,
00517                                    self.chem_mapping,
00518                                    self.max_ca_per_chain_for_cm,
00519                                    self.clustalw_bin)
00520     # extract new entities
00521     self._ent_to_cm_1 = mol.CreateEntityFromView(ev1, True)
00522     self._ent_to_cm_2 = mol.CreateEntityFromView(ev2, True)
00523     # name them
00524     self._ent_to_cm_1.SetName(self.qs_ent_1.GetName())
00525     self._ent_to_cm_2.SetName(self.qs_ent_2.GetName())
00526 
00527   def _ComputeSymmetry(self):
00528     """Fills cached symm_1 and symm_2."""
00529     # get them
00530     self._symm_1, self._symm_2 = \
00531       _FindSymmetry(self.qs_ent_1, self.qs_ent_2, self.ent_to_cm_1,
00532                     self.ent_to_cm_2, self.chem_mapping)
00533     # check that we have reasonable symmetries set (fallback: all chains)
00534     if not _AreValidSymmetries(self._symm_1, self._symm_2):
00535       self._symm_1 = [tuple(ch.name for ch in self.ent_to_cm_1.chains)]
00536       self._symm_2 = [tuple(ch.name for ch in self.ent_to_cm_2.chains)]
00537 
00538   def _ComputeScores(self):
00539     """Fills cached global_score and best_score."""
00540     # get contacts
00541     if self.calpha_only:
00542       contacts_1 = self.qs_ent_1.contacts_ca
00543       contacts_2 = self.qs_ent_2.contacts_ca
00544     else:
00545       contacts_1 = self.qs_ent_1.contacts
00546       contacts_2 = self.qs_ent_2.contacts
00547     # get scores
00548     scores = _GetScores(contacts_1, contacts_2, self.mapped_residues,
00549                         self.chain_mapping)
00550     self._best_score = scores[0]
00551     self._global_score = scores[1]
00552     # report scores
00553     LogInfo('QSscore %s, %s: best: %.2f, global: %.2f' \
00554             % (self.qs_ent_1.GetName(), self.qs_ent_2.GetName(),
00555                self._best_score, self._global_score))
00556 
00557   def _ComputeLDDT(self):
00558     """Fills cached lddt_score, lddt_mdl and lddt_ref."""
00559     LogInfo('Computing lDDT score')
00560     # check reference and model
00561     ref, mdl = self.qs_ent_1.ent, self.qs_ent_2.ent
00562     LogInfo('Reference %s has: %s chains' % (ref.GetName(), ref.chain_count))
00563     LogInfo('Model %s has: %s chains' % (mdl.GetName(), mdl.chain_count))
00564     if mdl.chain_count > ref.chain_count:
00565       LogWarning('MODEL contains more chains than REFERENCE, '
00566                  'lDDT is not considering them')
00567     # get single chain reference and model
00568     self._lddt_ref, self._lddt_mdl = \
00569       _MergeAlignedChains(self.alignments, ref, mdl, self.calpha_only)
00570     # score them (mdl and ref changed) and keep results
00571     self._lddt_score = _ComputeLDDTScore(self._lddt_ref, self._lddt_mdl)
00572 
00573 
00574 ###############################################################################
00575 # Entity with cached entries for QS scoring
00576 ###############################################################################
00577 
00578 class QSscoreEntity(object):
00579   """Entity with cached entries for QS scoring.
00580 
00581   Any known / precomputed information can be filled into the appropriate
00582   attribute here as long as they are labelled as read/write. Otherwise the
00583   quantities are computed on first access and cached (lazy evaluation). The
00584   heaviest load is expected when computing :attr:`contacts` and
00585   :attr:`contacts_ca`.
00586 
00587   :param ent: Entity to be used for QS scoring. A copy of it will be processed.
00588   :type ent:  :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
00589 
00590   .. attribute:: is_valid
00591 
00592     True, if successfully initialized. False, if input structure is monomer or
00593     has less than 2 protein chains with >= 20 residues.
00594 
00595     :type: :class:`bool`
00596 
00597   .. attribute:: original_name
00598 
00599     Name set for *ent* when object was created.
00600 
00601     :type: :class:`str`
00602 
00603   .. attribute:: ent
00604 
00605     Cleaned version of *ent* passed at construction. Hydrogens are removed, the
00606     entity is processed with a :class:`~ost.conop.RuleBasedProcessor` and chains
00607     listed in :attr:`removed_chains` have been removed. The name of this entity
00608     might change during scoring (see :func:`GetName`). Otherwise, this will be
00609     fixed.
00610 
00611     :type: :class:`~ost.mol.EntityHandle`
00612 
00613   .. attribute:: removed_chains
00614 
00615     Chains removed from *ent* passed at construction. These are ligand and water
00616     chains as well as small (< 20 res.) peptides or chains with no amino acids
00617     (determined by chem. type, which is set by rule based processor).
00618 
00619     :type: :class:`list` of :class:`str`
00620 
00621   .. attribute:: calpha_only
00622 
00623     Whether entity is CA-only (i.e. it has 0 CB atoms)
00624 
00625     :type: :class:`bool`
00626   """
00627   def __init__(self, ent):
00628     # copy entity and process/clean it
00629     self.original_name = ent.GetName()
00630     ent = mol.CreateEntityFromView(ent.Select('ele!=H and aname!=HN'), True)
00631     if not conop.GetDefaultLib():
00632       raise RuntimeError("QSscore computation requires a compound library!")
00633     pr = conop.RuleBasedProcessor(conop.GetDefaultLib())
00634     pr.Process(ent)
00635     self.ent, self.removed_chains, self.calpha_only = _CleanInputEntity(ent)
00636     # check if it's suitable for QS scoring
00637     if self.ent.chain_count == 0:
00638       LogError('Bad input file: ' + ent.GetName() + '. No chains left after '
00639                'removing water, ligands and small peptides.')
00640       self.is_valid = False
00641     elif self.ent.chain_count == 1:
00642       LogError('Structure ' + ent.GetName() + ' is a monomer. '
00643                'QSscore is not defined for monomers.')
00644       self.is_valid = False
00645     else:
00646       self.is_valid = True
00647     # init cached stuff
00648     self._ca_entity = None
00649     self._ca_chains = None
00650     self._alignments = {}
00651     self._chem_groups = None
00652     self._angles = {}
00653     self._axis = {}
00654     self._contacts = None
00655     self._contacts_ca = None
00656 
00657   def GetName(self):
00658     """Wrapper to :func:`~ost.mol.EntityHandle.GetName` of :attr:`ent`.
00659     This is used to uniquely identify the entity while scoring. The name may
00660     therefore change while :attr:`original_name` remains fixed.
00661     """
00662     # for duck-typing and convenience
00663     return self.ent.GetName()
00664 
00665   def SetName(self, new_name):
00666     """Wrapper to :func:`~ost.mol.EntityHandle.SetName` of :attr:`ent`.
00667     Use this to change unique identifier while scoring (see :func:`GetName`).
00668     """
00669     # for duck-typing and convenience
00670     self.ent.SetName(new_name)
00671 
00672   @property
00673   def ca_entity(self):
00674     """
00675     Reduced representation of :attr:`ent` with only CA atoms.
00676     This guarantees that each included residue has exactly one atom.
00677 
00678     :getter: Computed on first use (cached)
00679     :type: :class:`~ost.mol.EntityHandle`
00680     """
00681     if self._ca_entity is None:
00682       self._ca_entity = _GetCAOnlyEntity(self.ent)
00683     return self._ca_entity
00684 
00685   @property
00686   def ca_chains(self):
00687     """
00688     Map of chain names in :attr:`ent` to sequences with attached view to CA-only
00689     chains (into :attr:`ca_entity`). Useful for alignments and superpositions.
00690 
00691     :getter: Computed on first use (cached)
00692     :type: :class:`dict` (key = :class:`str`,
00693            value = :class:`~ost.seq.SequenceHandle`)
00694     """
00695     if self._ca_chains is None:
00696       self._ca_chains = dict()
00697       ca_entity = self.ca_entity
00698       for ch in ca_entity.chains:
00699         self._ca_chains[ch.name] = seq.SequenceFromChain(ch.name, ch)
00700     return self._ca_chains
00701 
00702   def GetAlignment(self, c1, c2):
00703     """Get sequence alignment of chain *c1* with chain *c2*.
00704     Computed on first use based on :attr:`ca_chains` (cached).
00705 
00706     :param c1: Chain name for first chain to align.
00707     :type c1:  :class:`str`
00708     :param c2: Chain name for second chain to align.
00709     :type c2:  :class:`str`
00710     :rtype: :class:`~ost.seq.AlignmentHandle` or None if it failed.
00711     """
00712     if (c1,c2) not in self._alignments:
00713       ca_chains = self.ca_chains
00714       self._alignments[(c1,c2)] = _AlignAtomSeqs(ca_chains[c1], ca_chains[c2])
00715     return self._alignments[(c1,c2)]
00716 
00717   @property
00718   def chem_groups(self):
00719     """
00720     Intra-complex group of chemically identical (seq. id. > 95%) polypeptide
00721     chains as extracted from :attr:`ca_chains`. First chain in group is the one
00722     with the longest sequence.
00723 
00724     :getter: Computed on first use (cached)
00725     :type: :class:`list` of :class:`list` of :class:`str` (chain names)
00726     """
00727     if self._chem_groups is None:
00728       self._chem_groups = _GetChemGroups(self, 95)
00729       LogInfo('Chemically equivalent chain-groups in %s: %s' \
00730               % (self.GetName(), str(self._chem_groups)))
00731     return self._chem_groups
00732 
00733   def GetAngles(self, c1, c2):
00734     """Get Euler angles from superposition of chain *c1* with chain *c2*.
00735     Computed on first use based on :attr:`ca_chains` (cached).
00736 
00737     :param c1: Chain name for first chain to superpose.
00738     :type c1:  :class:`str`
00739     :param c2: Chain name for second chain to superpose.
00740     :type c2:  :class:`str`
00741     :return: 3 Euler angles (may contain nan if something fails).
00742     :rtype:  :class:`numpy.array`
00743     """
00744     if (c1,c2) not in self._angles:
00745       self._GetSuperposeData(c1, c2)
00746     return self._angles[(c1,c2)]
00747 
00748   def GetAxis(self, c1, c2):
00749     """Get axis of symmetry from superposition of chain *c1* with chain *c2*.
00750     Computed on first use based on :attr:`ca_chains` (cached).
00751 
00752     :param c1: Chain name for first chain to superpose.
00753     :type c1:  :class:`str`
00754     :param c2: Chain name for second chain to superpose.
00755     :type c2:  :class:`str`
00756     :return: Rotational axis (may contain nan if something fails).
00757     :rtype:  :class:`numpy.array`
00758     """
00759     if (c1,c2) not in self._axis:
00760       self._GetSuperposeData(c1, c2)
00761     return self._axis[(c1,c2)]
00762 
00763   @property
00764   def contacts(self):
00765     """
00766     Connectivity dictionary (**read/write**).
00767     As given by :func:`GetContacts` with *calpha_only* = False on :attr:`ent`.
00768 
00769     :getter: Computed on first use (cached)
00770     :setter: Uses :func:`FilterContacts` to ensure that we only keep contacts
00771              for chains in the cleaned entity.
00772     :type: See return type of :func:`GetContacts`
00773     """
00774     if self._contacts is None:
00775       self._contacts = GetContacts(self.ent, False)
00776     return self._contacts
00777 
00778   @contacts.setter
00779   def contacts(self, new_contacts):
00780     chain_names = set([ch.name for ch in self.ent.chains])
00781     self._contacts = FilterContacts(new_contacts, chain_names)
00782   
00783   @property
00784   def contacts_ca(self):
00785     """
00786     CA-only connectivity dictionary (**read/write**).
00787     Like :attr:`contacts` but with *calpha_only* = True in :func:`GetContacts`.
00788     """
00789     if self._contacts_ca is None:
00790       self._contacts_ca = GetContacts(self.ent, True)
00791     return self._contacts_ca
00792   
00793   @contacts_ca.setter
00794   def contacts_ca(self, new_contacts):
00795     chain_names = set([ch.name for ch in self.ent.chains])
00796     self._contacts_ca = FilterContacts(new_contacts, chain_names)
00797 
00798   ##############################################################################
00799   # Class internal helpers (anything that doesnt easily work without this class)
00800   ##############################################################################
00801 
00802   def _GetSuperposeData(self, c1, c2):
00803     """Fill _angles and _axis from superposition of CA chains of c1 and c2."""
00804     # get aligned views (must contain identical numbers of atoms!)
00805     aln = self.GetAlignment(c1, c2)
00806     if not aln:
00807       # fallback for non-aligned stuff (nan)
00808       self._angles[(c1,c2)] = np.empty(3) * np.nan
00809       self._axis[(c1,c2)] = np.empty(3) * np.nan
00810       return
00811     v1, v2 = seq.ViewsFromAlignment(aln)
00812     if v1.atom_count < 3:
00813       # fallback for non-aligned stuff (nan)
00814       self._angles[(c1,c2)] = np.empty(3) * np.nan
00815       self._axis[(c1,c2)] = np.empty(3) * np.nan
00816       return
00817     # get transformation
00818     sup_res = mol.alg.SuperposeSVD(v1, v2, apply_transform=False)
00819     Rt = sup_res.transformation
00820     # extract angles
00821     a,b,c = _GetAngles(Rt)
00822     self._angles[(c1,c2)] = np.asarray([a,b,c])
00823     # extract axis of symmetry
00824     R3 = geom.Rotation3(Rt.ExtractRotation())
00825     self._axis[(c1,c2)] = np.asarray(R3.GetRotationAxis().data)
00826 
00827 ###############################################################################
00828 # Contacts computations
00829 ###############################################################################
00830 
00831 def FilterContacts(contacts, chain_names):
00832   """Filter contacts to contain only contacts for chains in *chain_names*.
00833 
00834   :param contacts: Connectivity dictionary as produced by :func:`GetContacts`.
00835   :type contacts:  :class:`dict`
00836   :param chain_names: Chain names to keep.
00837   :type chain_names:  :class:`list` or (better) :class:`set`
00838   :return: New connectivity dictionary (format as in :func:`GetContacts`)
00839   :rtype:  :class:`dict`
00840   """
00841   # create new dict with subset
00842   filtered_contacts = dict()
00843   for c1 in contacts:
00844     if c1 in chain_names:
00845       new_contacts = dict()
00846       for c2 in contacts[c1]:
00847         if c2 in chain_names:
00848           new_contacts[c2] = contacts[c1][c2]
00849       # avoid adding empty dicts
00850       if new_contacts:
00851         filtered_contacts[c1] = new_contacts
00852   return filtered_contacts
00853 
00854 def GetContacts(entity, calpha_only, dist_thr=12.0):
00855   """Get inter-chain contacts of a macromolecular entity.
00856 
00857   Contacts are pairs of residues within a given distance belonging to different
00858   chains. They are stored once per pair and include the CA/CB-CA/CB distance.
00859 
00860   :param entity: An entity to check connectivity for.
00861   :type entity:  :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
00862   :param calpha_only: If True, we only consider CA-CA distances. Else, we use CB
00863                       unless the residue is a GLY.
00864   :type calpha_only:  :class:`bool`
00865   :param dist_thr: Maximal CA/CB-CA/CB distance to be considered in contact.
00866   :type dist_thr:  :class:`float`
00867   :return: A connectivity dictionary. A pair of residues with chain names
00868            *ch_name1* & *ch_name2* (*ch_name1* < *ch_name2*), residue numbers
00869            *res_num1* & *res_num2* and distance *dist* (<= *dist_thr*) are
00870            stored as *result[ch_name1][ch_name2][res_num1][res_num2]* = *dist*.
00871   :rtype:  :class:`dict`
00872   """
00873   # get ent copy to search on
00874   if calpha_only:
00875     ev = entity.Select("aname=CA")
00876   else:
00877     ev = entity.Select("(rname=GLY and aname=CA) or aname=CB")
00878   ent = mol.CreateEntityFromView(ev, True)
00879   # search all vs all
00880   contacts = dict()
00881   for atom in ent.atoms:
00882     ch_name1 = atom.chain.name
00883     res_num1 = atom.residue.number.num
00884     close_atoms = ent.FindWithin(atom.pos, dist_thr)
00885     for close_atom in close_atoms:
00886       ch_name2 = close_atom.chain.name
00887       if ch_name2 > ch_name1:
00888         res_num2 = close_atom.residue.number.num
00889         dist = geom.Distance(atom.pos, close_atom.pos)
00890         # add to contacts
00891         if ch_name1 not in contacts:
00892           contacts[ch_name1] = dict()
00893         if ch_name2 not in contacts[ch_name1]:
00894           contacts[ch_name1][ch_name2] = dict()
00895         if res_num1 not in contacts[ch_name1][ch_name2]:
00896           contacts[ch_name1][ch_name2][res_num1] = dict()
00897         contacts[ch_name1][ch_name2][res_num1][res_num2] = round(dist, 3)
00898   # DONE
00899   return contacts
00900 
00901 
00902 ###############################################################################
00903 # HELPERS
00904 ###############################################################################
00905 
00906 # general
00907 
00908 def _AlignAtomSeqs(seq_1, seq_2):
00909   """
00910   :type seq_1: :class:`ost.seq.SequenceHandle`
00911   :type seq_2: :class:`ost.seq.SequenceHandle`
00912   :return: Alignment of two sequences using a global aignment. Views attached
00913            to the input sequences will remain attached in the aln.
00914   :rtype:  :class:`~ost.seq.AlignmentHandle` or None if it failed.
00915   """
00916   # NOTE: If the two sequence have a greatly different length
00917   #       a local alignment could be a better choice...
00918   aln = None
00919   alns = seq.alg.GlobalAlign(seq_1, seq_2, seq.alg.BLOSUM62)
00920   if alns: aln = alns[0]
00921   if not aln:
00922     LogWarning('Failed to align %s to %s' % (seq_1.name, seq_2.name))
00923     LogWarning('%s:  %s' % (seq_1.name, seq_1.string))
00924     LogWarning('%s:  %s' % (seq_2.name, seq_2.string))
00925   return aln
00926 
00927 # QS entity
00928 
00929 def _CleanInputEntity(ent):
00930   """
00931   :param ent: The OST entity to be cleaned.
00932   :type ent:  :class:`EntityHandle` or :class:`EntityView`
00933   :return: A tuple of 3 items: :attr:`QSscoreEntity.ent`,
00934                                :attr:`QSscoreEntity.removed_chains`,
00935                                :attr:`QSscoreEntity.calpha_only`
00936   """
00937   # find chains to remove
00938   removed_chains = []
00939   for ch in ent.chains:
00940     # we remove chains if they are small-peptides or if the contain no aa
00941     # or if they contain only unknown or modified residues
00942     if    ch.name in ['-', '_'] \
00943        or ch.residue_count < 20 \
00944        or not any(r.chem_type.IsAminoAcid() for r in ch.residues) \
00945        or not (set(r.one_letter_code for r in ch.residues) - {'?', 'X'}):
00946       removed_chains.append(ch.name)
00947 
00948   # remove them from *ent*
00949   if removed_chains:
00950     chain_set = set()
00951     for ch_name in removed_chains:
00952       if ch_name in ['-', '_', ' ']:
00953         chain_set.add('"%c"' % ch_name)
00954       else:
00955         chain_set.add(ch_name)
00956     view = ent.Select('cname!=%s' % ','.join(chain_set))
00957     ent_new = mol.CreateEntityFromView(view, True)
00958     ent_new.SetName(ent.GetName())
00959   else:
00960     ent_new = ent
00961 
00962   # check if CA only
00963   calpha_only = False
00964   if ent_new.Select('aname=CB').atom_count == 0:
00965     LogInfo('Structure %s is a CA only structure!' % ent_new.GetName())
00966     calpha_only = True
00967 
00968   # report and return
00969   if removed_chains:
00970     LogInfo('Chains removed from %s: %s' \
00971             % (ent_new.GetName(), ''.join(removed_chains)))
00972   LogInfo('Chains in %s: %s' \
00973           % (ent_new.GetName(), ''.join([c.name for c in ent_new.chains])))
00974   return ent_new, removed_chains, calpha_only
00975 
00976 def _GetCAOnlyEntity(ent):
00977   """
00978   :param ent: Entity to process
00979   :type ent:  :class:`EntityHandle` or :class:`EntityView`
00980   :return: New entity with only CA and only one atom per residue
00981            (see :attr:`QSscoreEntity.ca_entity`)
00982   """
00983   # cook up CA only view (diff from Select = guaranteed 1 atom per residue)
00984   ca_view = ent.CreateEmptyView()
00985   # add chain by chain
00986   for res in ent.residues:
00987     ca_atom = res.FindAtom("CA")
00988     if ca_atom.IsValid():
00989       ca_view.AddAtom(ca_atom)
00990   # finalize
00991   return mol.CreateEntityFromView(ca_view, False)
00992 
00993 def _GetChemGroups(qs_ent, seqid_thr=95.):
00994   """
00995   :return: Intra-complex group of chemically identical polypeptide chains
00996            (see :attr:`QSscoreEntity.chem_groups`)
00997 
00998   :param qs_ent: Entity to process
00999   :type qs_ent:  :class:`QSscoreEntity`
01000   :param seqid_thr: Threshold used to decide when two chains are identical.
01001                     95 percent tolerates the few mutations crystallographers
01002                     like to do.
01003   :type seqid_thr:  :class:`float`
01004   """
01005   # get data from qs_ent
01006   ca_chains = qs_ent.ca_chains
01007   chain_names = sorted(ca_chains.keys())
01008   # get pairs of identical chains
01009   # NOTE: this scales quadratically with number of chains and may be optimized
01010   #       -> one could merge it with "merge transitive pairs" below...
01011   id_seqs = []
01012   for ch_1, ch_2 in itertools.combinations(chain_names, 2):
01013     aln = qs_ent.GetAlignment(ch_1, ch_2)
01014     if aln and seq.alg.SequenceIdentity(aln) > seqid_thr:
01015       id_seqs.append((ch_1, ch_2))
01016   # trivial case: no matching pairs
01017   if not id_seqs:
01018     return [[name] for name in chain_names]
01019 
01020   # merge transitive pairs
01021   groups = []
01022   for ch_1, ch_2 in id_seqs:
01023     found = False
01024     for g in groups:
01025       if ch_1 in g or ch_2 in g:
01026         found = True
01027         g.add(ch_1)
01028         g.add(ch_2)
01029     if not found:
01030       groups.append(set([ch_1, ch_2]))
01031   # sort internally based on sequence length
01032   chem_groups = []
01033   for g in groups:
01034     ranked_g = sorted([(-ca_chains[ch].length, ch) for ch in g])
01035     chem_groups.append([ch for _,ch in ranked_g])
01036   # add other dissimilar chains
01037   for ch in chain_names:
01038     if not any(ch in g for g in chem_groups):
01039       chem_groups.append([ch])
01040   
01041   return chem_groups
01042 
01043 def _GetAngles(Rt):
01044   """Computes the Euler angles given a transformation matrix.
01045 
01046   :param Rt: Rt operator.
01047   :type Rt:  :class:`ost.geom.Mat4`
01048   :return: A :class:`tuple` of angles for each axis (x,y,z)
01049   """
01050   rot = np.asmatrix(Rt.ExtractRotation().data).reshape(3,3)
01051   tx = np.arctan2(rot[2,1], rot[2,2])
01052   if tx < 0:
01053     tx += 2*np.pi
01054   ty = np.arctan2(rot[2,0], np.sqrt(rot[2,1]**2 + rot[2,2]**2))
01055   if ty < 0:
01056     ty += 2*np.pi
01057   tz = np.arctan2(rot[1,0], rot[0,0])
01058   if tz < 0:
01059     tz += 2*np.pi
01060   return tx,ty,tz
01061 
01062 # QS scorer
01063 
01064 def _GetChemGroupsMapping(qs_ent_1, qs_ent_2):
01065   """
01066   :return: Inter-complex mapping of chemical groups
01067            (see :attr:`QSscorer.chem_mapping`)
01068 
01069   :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
01070   :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
01071   """
01072   # get chem. groups and unique representative
01073   chem_groups_1 = qs_ent_1.chem_groups
01074   chem_groups_2 = qs_ent_2.chem_groups
01075   repr_chains_1 = {x[0]: tuple(x) for x in chem_groups_1}
01076   repr_chains_2 = {x[0]: tuple(x) for x in chem_groups_2}
01077 
01078   # if entities do not have different number of unique chains, we get the
01079   # mapping for the smaller set
01080   swapped = False
01081   if len(repr_chains_2) < len(repr_chains_1):
01082     repr_chains_1, repr_chains_2 = repr_chains_2, repr_chains_1
01083     qs_ent_1, qs_ent_2 = qs_ent_2, qs_ent_1
01084     swapped = True
01085 
01086   # find the closest to each chain between the two entities
01087   # NOTE: this may still be sensible to orthology problem
01088   # -> currently we use a global alignment and seq. id. to rank pairs
01089   # -> we also tried local alignments and weighting the seq. id. by the
01090   #    coverages of the alignments (gapless string in aln. / seq. length)
01091   #    but global aln performed better...
01092   chain_pairs = []
01093   ca_chains_1 = qs_ent_1.ca_chains
01094   ca_chains_2 = qs_ent_2.ca_chains
01095   for ch_1 in repr_chains_1.keys():
01096     for ch_2 in repr_chains_2.keys():
01097       aln = _AlignAtomSeqs(ca_chains_1[ch_1], ca_chains_2[ch_2])
01098       if aln:
01099         chains_seqid = seq.alg.SequenceIdentity(aln)
01100         LogVerbose('Sequence identity', ch_1, ch_2, 'seqid=%.2f' % chains_seqid)
01101         chain_pairs.append((chains_seqid, ch_1, ch_2))
01102 
01103   # get top matching groups first
01104   chain_pairs = sorted(chain_pairs, reverse=True)
01105   chem_mapping = {}
01106   for _, c1, c2 in chain_pairs:
01107     skip = False
01108     for a,b in chem_mapping.iteritems():
01109       if repr_chains_1[c1] == a or repr_chains_2[c2] == b:
01110         skip = True
01111         break
01112     if not skip:
01113       chem_mapping[repr_chains_1[c1]] = repr_chains_2[c2]
01114   if swapped:
01115     chem_mapping = {y: x for x, y in chem_mapping.iteritems()}
01116     qs_ent_1, qs_ent_2 = qs_ent_2, qs_ent_1
01117 
01118   # notify chains without partner
01119   mapped_1 = set([i for s in chem_mapping.keys() for i in s])
01120   chains_1 = set([c.name for c in qs_ent_1.ent.chains])
01121   if chains_1 - mapped_1:
01122     LogWarning('Unmapped Chains in %s: %s'
01123                % (qs_ent_1.GetName(), ','.join(list(chains_1 - mapped_1))))
01124 
01125   mapped_2 = set([i for s in chem_mapping.values() for i in s])
01126   chains_2 = set([c.name for c in qs_ent_2.ent.chains])
01127   if chains_2 - mapped_2:
01128     LogWarning('Unmapped Chains in %s: %s'
01129                % (qs_ent_2.GetName(), ','.join(list(chains_2 - mapped_2))))
01130   
01131   # check if we have any chains left
01132   LogInfo('Chemical chain-groups mapping: ' + str(chem_mapping))
01133   if len(mapped_1) < 2 or len(mapped_2) < 2:
01134     raise QSscoreError('Less than 2 chains left in chem_mapping.')
01135   return chem_mapping
01136 
01137 def _SelectFew(l, max_elements):
01138   """Return l or copy of l with at most *max_elements* entries."""
01139   n_el = len(l)
01140   if n_el <= max_elements:
01141     return l
01142   else:
01143     # cheap integer ceiling (-1 to ensure that x*max_elements gets d_el = x)
01144     d_el = ((n_el - 1) // max_elements) + 1
01145     new_l = list()
01146     for i in range(0, n_el, d_el):
01147       new_l.append(l[i])
01148     return new_l
01149 
01150 def _GetAlignedResidues(qs_ent_1, qs_ent_2, chem_mapping, max_ca_per_chain,
01151                         clustalw_bin):
01152   """
01153   :return: Tuple of two :class:`~ost.mol.EntityView` objects containing subsets
01154            of *qs_ent_1* and *qs_ent_2*. Two entities are later created from
01155            those views (see :attr:`QSscorer.ent_to_cm_1` and
01156            :attr:`QSscorer.ent_to_cm_2`)
01157 
01158   :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
01159   :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
01160   :param chem_mapping: See :attr:`QSscorer.chem_mapping`
01161   :param max_ca_per_chain: See :attr:`QSscorer.max_ca_per_chain_for_cm`
01162   """
01163   # resulting views into CA entities using CA chain sequences
01164   ent_view_1 = qs_ent_1.ca_entity.CreateEmptyView()
01165   ent_view_2 = qs_ent_2.ca_entity.CreateEmptyView()
01166   ca_chains_1 = qs_ent_1.ca_chains
01167   ca_chains_2 = qs_ent_2.ca_chains
01168   # go through all mapped chemical groups
01169   for group_1, group_2 in chem_mapping.iteritems():
01170     # MSA with ClustalW
01171     seq_list = seq.CreateSequenceList()
01172     # keep sequence-name (must be unique) to view mapping
01173     seq_to_empty_view = dict()
01174     for ch in group_1:
01175       sequence = ca_chains_1[ch].Copy()
01176       sequence.name = qs_ent_1.GetName() + '.' + ch
01177       seq_to_empty_view[sequence.name] = ent_view_1
01178       seq_list.AddSequence(sequence)
01179     for ch in group_2:
01180       sequence = ca_chains_2[ch].Copy()
01181       sequence.name = qs_ent_2.GetName() + '.' + ch
01182       seq_to_empty_view[sequence.name] = ent_view_2
01183       seq_list.AddSequence(sequence)
01184     alnc = ClustalW(seq_list, clustalw=clustalw_bin)
01185 
01186     # collect aligned residues (list of lists of sequence_count valid residues)
01187     residue_lists = list()
01188     sequence_count = alnc.sequence_count
01189     for col in alnc:
01190       residues = [col.GetResidue(ir) for ir in range(sequence_count)]
01191       if all([r.IsValid() for r in residues]):
01192         residue_lists.append(residues)
01193     # check number of aligned residues
01194     if len(residue_lists) < 5:
01195       raise QSscoreError('Not enough aligned residues.')
01196     elif max_ca_per_chain > 0:
01197       residue_lists = _SelectFew(residue_lists, max_ca_per_chain)
01198     # check what view is for which residue
01199     res_views = [seq_to_empty_view[alnc.sequences[ir].name] \
01200                  for ir in range(sequence_count)]
01201     # add to view (note: only 1 CA atom per residue in here)
01202     for res_list in residue_lists:
01203       for res, view in zip(res_list, res_views):
01204         view.AddResidue(res, mol.ViewAddFlag.INCLUDE_ATOMS)
01205   # done
01206   return ent_view_1, ent_view_2
01207 
01208 
01209 def _FindSymmetry(qs_ent_1, qs_ent_2, ent_to_cm_1, ent_to_cm_2, chem_mapping):
01210   """
01211   :return: A pair of comparable symmetry groups (for :attr:`QSscorer.symm_1`
01212            and :attr:`QSscorer.symm_2`) between the two structures.
01213            Empty lists if no symmetry identified.
01214   
01215   :param qs_ent_1: See :attr:`QSscorer.qs_ent_1`
01216   :param qs_ent_2: See :attr:`QSscorer.qs_ent_2`
01217   :param ent_to_cm_1: See :attr:`QSscorer.ent_to_cm_1`
01218   :param ent_to_cm_2: See :attr:`QSscorer.ent_to_cm_2`
01219   :param chem_mapping: See :attr:`QSscorer.chem_mapping`
01220   """
01221   LogInfo('Identifying Symmetry Groups...')
01222 
01223   # get possible symmetry groups
01224   symm_subg_1 = _GetSymmetrySubgroups(qs_ent_1, ent_to_cm_1,
01225                                       chem_mapping.keys())
01226   symm_subg_2 = _GetSymmetrySubgroups(qs_ent_2, ent_to_cm_2,
01227                                       chem_mapping.values())
01228 
01229   # check combination of groups
01230   LogInfo('Selecting Symmetry Groups...')
01231   # check how many mappings are to be checked for each pair of symmetry groups
01232   # -> lower number here will speed up worst-case runtime of _GetChainMapping
01233   # NOTE: this is tailored to speed up brute force all vs all mapping test
01234   #       for preferred _CheckClosedSymmetry this is suboptimal!
01235   best_symm = []
01236   for symm_1, symm_2 in itertools.product(symm_subg_1, symm_subg_2):
01237     s1 = symm_1[0]
01238     s2 = symm_2[0]
01239     if len(s1) != len(s2):
01240       LogDebug('Discarded', str(symm_1), str(symm_2),
01241                ': different size of symmetry groups')
01242       continue
01243     count = _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping)
01244     nr_mapp = count['intra']['mappings'] + count['inter']['mappings']
01245     LogDebug('OK', str(symm_1), str(symm_2), ': %s mappings' % nr_mapp)
01246     best_symm.append((nr_mapp, symm_1, symm_2))
01247 
01248   for _, symm_1, symm_2 in sorted(best_symm):
01249     s1 = symm_1[0]
01250     s2 = symm_2[0]
01251     group_1 = ent_to_cm_1.Select('cname=%s' % ','.join(s1))
01252     group_2 = ent_to_cm_2.Select('cname=%s' % ','.join(s2))
01253     # check if by superposing a pair of chains within the symmetry group to
01254     # superpose all chains within the symmetry group
01255     # -> if successful, the symmetry groups are compatible
01256     closed_symm = _CheckClosedSymmetry(group_1, group_2, symm_1, symm_2,
01257                                        chem_mapping, 6, 0.8, find_best=False)
01258 
01259     if closed_symm:
01260       return symm_1, symm_2
01261 
01262   # nothing found
01263   LogInfo('No coherent symmetry identified between structures')
01264   return [], []
01265 
01266 
01267 def _GetChainMapping(ent_1, ent_2, symm_1, symm_2, chem_mapping):
01268   """
01269   :return: Mapping from *ent_1* to *ent_2* (see :attr:`QSscorer.chain_mapping`)  
01270 
01271   :param ent_1: See :attr:`QSscorer.ent_to_cm_1`
01272   :param ent_2: See :attr:`QSscorer.ent_to_cm_2`
01273   :param symm_1: See :attr:`QSscorer.symm_1`
01274   :param symm_2: See :attr:`QSscorer.symm_2`
01275   :param chem_mapping: See :attr:`QSscorer.chem_mapping`
01276   """
01277   LogInfo('Symmetry-groups used in %s: %s' % (ent_1.GetName(), str(symm_1)))
01278   LogInfo('Symmetry-groups used in %s: %s' % (ent_2.GetName(), str(symm_2)))
01279 
01280   # quick check for closed symmetries
01281   thresholds = [(    'strict', 4, 0.8),
01282                 (  'tolerant', 6, 0.4),
01283                 ('permissive', 8, 0.2)]
01284   for scheme, sup_thr, sup_fract in thresholds:
01285     # check if by superposing a pair of chains within the symmetry group to
01286     # superpose all chains of the two oligomers
01287     # -> this also returns the resulting mapping of chains
01288     mapping = _CheckClosedSymmetry(ent_1, ent_2, symm_1, symm_2,
01289                                    chem_mapping, sup_thr, sup_fract)
01290     if mapping:
01291       LogInfo('Closed Symmetry with %s parameters' % scheme)
01292       if scheme == 'permissive':
01293         LogWarning('Permissive thresholds used for overlap based mapping ' + \
01294                    'detection: check mapping manually: %s' % mapping)
01295       return mapping
01296   
01297   # NOTE that what follows below is sub-optimal:
01298   # - if the two structures don't fit at all, we may map chains rather randomly
01299   # - we may need a lot of operations to find globally "best" mapping
01300   # -> COST = O(N^2) * O(N!)
01301   #    (first = all pairwise chain-superpose, latter = all chain mappings)
01302   # -> a greedy chain mapping choice algo (pick best RMSD for each one-by-one)
01303   #    could reduce this to O(N^2) * O(N^2) but it would need to be validated
01304   # - one could also try some other heuristic based on center of atoms of chains
01305   #   -> at this point we get a bad mapping anyways...
01306 
01307   # if we get here, we will need to check many combinations of mappings
01308   # -> check how many mappings must be checked and limit
01309   count = _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping)
01310   LogInfo('Intra Symmetry-group mappings to check: %s' \
01311           % count['intra']['mappings'])
01312   LogInfo('Inter Symmetry-group mappings to check: %s' \
01313           % count['inter']['mappings'])
01314   nr_mapp = count['intra']['mappings'] + count['inter']['mappings']
01315   if nr_mapp > 100000: # 322560 is octamer vs octamer
01316     raise QSscoreError('Too many possible mappings: %s' % nr_mapp)
01317 
01318   # to speed up the computations we cache chain views and RMSDs
01319   cached_rmsd = _CachedRMSD(ent_1, ent_2)
01320 
01321   # get INTRA symmetry group chain mapping
01322   check = 0
01323   intra_mappings = [] # list of (RMSD, c1, c2, mapping) tuples (best superpose)
01324   # limit chem mapping to reference symmetry group
01325   ref_symm_1 = symm_1[0]
01326   ref_symm_2 = symm_2[0]
01327   asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
01328   # for each chemically identical group
01329   for g1, g2 in asu_chem_mapping.iteritems():
01330     # try to superpose all possible pairs
01331     for c1, c2 in itertools.product(g1, g2):
01332       # get superposition transformation
01333       LogVerbose('Superposing chains: %s, %s' % (c1, c2))
01334       res = cached_rmsd.GetSuperposition(c1, c2)
01335       # compute RMSD of possible mappings
01336       cur_mappings = [] # list of (RMSD, mapping) tuples
01337       for mapping in _ListPossibleMappings(c1, c2, asu_chem_mapping):
01338         check += 1
01339         chains_rmsd = cached_rmsd.GetMappedRMSD(mapping, res.transformation)
01340         cur_mappings.append((chains_rmsd, mapping))
01341         LogVerbose(str(mapping), str(chains_rmsd))
01342       best_rmsd, best_mapp = min(cur_mappings)
01343       intra_mappings.append((best_rmsd, c1, c2, best_mapp))
01344   # best chain-chain superposition defines the intra asu mapping
01345   _, intra_asu_c1, intra_asu_c2, intra_asu_mapp = min(intra_mappings)
01346 
01347   # if only one asu is present in any of the complexes, we're done
01348   if len(symm_1) == 1 or len(symm_2) == 1:
01349     mapping = intra_asu_mapp
01350   else:
01351     # the mapping is the element position within the asu chain list
01352     # -> this speed up a lot, assuming that the order of chain in asu groups
01353     #    is following the order of symmetry operations
01354     index_asu_c1 = ref_symm_1.index(intra_asu_c1)
01355     index_asu_c2 = ref_symm_2.index(intra_asu_c2)
01356     index_mapp = {ref_symm_1.index(s1): ref_symm_2.index(s2) \
01357                   for s1, s2 in intra_asu_mapp.iteritems()}
01358     LogInfo('Intra symmetry-group mapping: %s' % str(intra_asu_mapp))
01359 
01360     # get INTER symmetry group chain mapping
01361     # we take the first symmetry group from the complex having the most
01362     if len(symm_1) < len(symm_2):
01363       symm_combinations = list(itertools.product(symm_1, [symm_2[0]]))
01364     else:
01365       symm_combinations = list(itertools.product([symm_1[0]], symm_2))
01366 
01367     full_asu_mapp = {tuple(symm_1): tuple(symm_2)}
01368     full_mappings = [] # list of (RMSD, mapping) tuples
01369     for s1, s2 in symm_combinations:
01370       # get superposition transformation (we take best chain-pair in asu)
01371       LogVerbose('Superposing symmetry-group: %s, %s in %s, %s' \
01372                  % (s1[index_asu_c1], s2[index_asu_c2], s1, s2))
01373       res = cached_rmsd.GetSuperposition(s1[index_asu_c1], s2[index_asu_c2])
01374       # compute RMSD of possible mappings
01375       for asu_mapp in _ListPossibleMappings(s1, s2, full_asu_mapp):
01376         check += 1
01377         # need to extract full chain mapping (use indexing)
01378         mapping = {}
01379         for a, b in asu_mapp.iteritems():
01380           for id_a, id_b in index_mapp.iteritems():
01381             mapping[a[id_a]] = b[id_b]
01382         chains_rmsd = cached_rmsd.GetMappedRMSD(mapping, res.transformation)
01383         full_mappings.append((chains_rmsd, mapping))
01384         LogVerbose(str(mapping), str(chains_rmsd))
01385 
01386     if check != nr_mapp:
01387       LogWarning('Mapping number estimation was wrong') # sanity check
01388 
01389     # return best (lowest RMSD) mapping
01390     mapping = min(full_mappings)[1]
01391 
01392   LogWarning('Extensive search used for mapping detection (fallback). This ' + \
01393              'approach has known limitations. Check mapping manually: %s' \
01394              % mapping)
01395   return mapping
01396 
01397 
01398 def _GetSymmetrySubgroups(qs_ent, ent, chem_groups):
01399   """
01400   Identify the symmetry (either cyclic C or dihedral D) of the protein and find
01401   all possible symmetry subgroups. This is done testing all combination of chain
01402   superposition and clustering them by the angles (D) or the axis (C) of the Rt
01403   operator.
01404 
01405   Groups of superposition which can fully reconstruct the structure are possible
01406   symmetry subgroups.
01407 
01408   :param qs_ent: Entity with cached angles and axis.
01409   :type qs_ent:  :class:`QSscoreEntity`
01410   :param ent: Entity from which to extract chains (perfect alignment assumed
01411               for superposition as in :attr:`QSscorer.ent_to_cm_1`).
01412   :type ent:  :class:`EntityHandle` or :class:`EntityView`
01413   :param chem_groups: List of tuples/lists of chain names in *ent*. Each list
01414                       contains all chains belonging to a chem. group (could be
01415                       from :attr:`QSscoreEntity.chem_groups` or from "keys()"
01416                       or "values()" of :attr:`QSscorer.chem_mapping`)
01417 
01418   :return: A list of possible symmetry subgroups (each in same format as
01419            :attr:`QSscorer.symm_1`). If no symmetry is found, we return a list
01420            with a single symmetry subgroup with a single group with all chains.
01421   """
01422   # get angles / axis for pairwise transformations within same chem. group
01423   angles = {}
01424   axis = {}
01425   for cnames in chem_groups:
01426     for c1, c2 in itertools.combinations(cnames, 2):
01427       cur_angles = qs_ent.GetAngles(c1, c2)
01428       if not np.any(np.isnan(cur_angles)):
01429         angles[(c1,c2)] = cur_angles
01430       cur_axis = qs_ent.GetAxis(c1, c2)
01431       if not np.any(np.isnan(cur_axis)):
01432         axis[(c1,c2)] = cur_axis
01433 
01434   # cluster superpositions angles at different thresholds
01435   symm_groups = []
01436   LogVerbose('Possible symmetry-groups for: %s' % ent.GetName())
01437   for angle_thr in np.arange(0.1, 1, 0.1):
01438     d_symm_groups = _GetDihedralSubgroups(ent, chem_groups, angles, angle_thr)
01439     if d_symm_groups:
01440       for group in d_symm_groups:
01441         if group not in symm_groups:
01442           symm_groups.append(group)
01443           LogVerbose('Dihedral: %s' % group)
01444       LogInfo('Symmetry threshold %.1f used for angles of %s' \
01445               % (angle_thr, ent.GetName()))
01446       break
01447   
01448   # cluster superpositions axis at different thresholds
01449   for axis_thr in np.arange(0.1, 1, 0.1):
01450     c_symm_groups = _GetCyclicSubgroups(ent, chem_groups, axis, axis_thr)
01451     if c_symm_groups:
01452       for group in c_symm_groups:
01453         if group not in symm_groups:
01454           symm_groups.append(group)
01455           LogVerbose('Cyclic: %s' % group)
01456       LogInfo('Symmetry threshold %.1f used for axis of %s' \
01457               % (axis_thr, ent.GetName()))
01458       break
01459 
01460   # fall back to single "group" containing all chains if none found
01461   if not symm_groups:
01462     LogInfo('No symmetry-group detected in %s' % ent.GetName())
01463     symm_groups = [[tuple([c for g in chem_groups for c in g])]]
01464 
01465   return symm_groups
01466 
01467 def _GetDihedralSubgroups(ent, chem_groups, angles, angle_thr):
01468   """
01469   :return: Dihedral subgroups for :func:`_GetSymmetrySubgroups`
01470            (same return type as there). Empty list if fail.
01471 
01472   :param ent: See :func:`_GetSymmetrySubgroups`
01473   :param chem_groups: See :func:`_GetSymmetrySubgroups`
01474   :param angles: :class:`dict` (key = chain-pair-tuple, value = Euler angles)
01475   :param angle_thr: Euler angles distance threshold for clustering (float).
01476   """
01477   # cluster superpositions angles
01478   if len(angles) == 0:
01479     # nothing to be done here
01480     return []
01481   else:
01482     same_angles = _ClusterData(angles, angle_thr, _AngleArrayDistance)
01483 
01484   # get those which are non redundant and covering all chains
01485   symm_groups = []
01486   for clst in same_angles.values():
01487     group = clst.keys()
01488     if _ValidChainGroup(group, ent):
01489       if len(chem_groups) > 1:
01490         # if hetero, we want to group toghether different chains only
01491         symm_groups.append(zip(*group))
01492       else:
01493         # if homo, we also want pairs
01494         symm_groups.append(group)
01495         symm_groups.append(zip(*group))
01496   return symm_groups
01497 
01498 def _GetCyclicSubgroups(ent, chem_groups, axis, axis_thr):
01499   """
01500   :return: Cyclic subgroups for :func:`_GetSymmetrySubgroups`
01501            (same return type as there). Empty list if fail.
01502 
01503   :param ent: See :func:`_GetSymmetrySubgroups`
01504   :param chem_groups: See :func:`_GetSymmetrySubgroups`
01505   :param angles: :class:`dict` (key = chain-pair-tuple, value = rotation axis)
01506   :param angle_thr: Axis distance threshold for clustering (float).
01507   """
01508   # cluster superpositions axis
01509   if len(axis) == 0:
01510     # nothing to be done here
01511     return []
01512   else:
01513     same_axis = _ClusterData(axis, axis_thr, _AxisDistance)
01514 
01515   # use to get grouping
01516   symm_groups = []
01517   for clst in same_axis.values():
01518     all_chain = [item for sublist in clst.keys() for item in sublist]
01519     if len(set(all_chain)) == ent.chain_count:
01520       # if we have an hetero we can exploit cyclic symmetry for grouping
01521       if len(chem_groups) > 1:
01522         ref_chem_group = chem_groups[0]
01523         # try two criteria for grouping
01524         cyclic_group_closest = []
01525         cyclic_group_iface = []
01526         for c1 in ref_chem_group:
01527           group_closest = [c1]
01528           group_iface = [c1]
01529           for chains in chem_groups[1:]:
01530             # center of atoms distance
01531             closest = _GetClosestChain(ent, c1, chains)
01532             # chain with bigger interface
01533             closest_iface = _GetClosestChainInterface(ent, c1, chains)
01534             group_closest.append(closest)
01535             group_iface.append(closest_iface)
01536           cyclic_group_closest.append(tuple(group_closest))
01537           cyclic_group_iface.append(tuple(group_iface))
01538         if _ValidChainGroup(cyclic_group_closest, ent):
01539           symm_groups.append(cyclic_group_closest)
01540         if _ValidChainGroup(cyclic_group_iface, ent):
01541           symm_groups.append(cyclic_group_iface)
01542       # if it is a homo, there's not much we can group
01543       else:
01544         symm_groups.append(chem_groups)
01545   return symm_groups
01546 
01547 def _ClusterData(data, thr, metric):
01548   """Wrapper for fclusterdata to get dict of clusters.
01549   
01550   :param data: :class:`dict` (keys for ID, values used for clustering)
01551   :return: :class:`dict` {cluster_idx: {data-key: data-value}}
01552   """
01553   # special case length 1
01554   if len(data) == 1:
01555     return {0: {data.keys()[0]: data.values()[0]} }
01556   # do the clustering
01557   cluster_indices = fclusterdata(np.asarray(data.values()), thr,
01558                                  method='complete', criterion='distance',
01559                                  metric=metric)
01560   # fclusterdata output is cluster ids -> put into dict of clusters
01561   res = {}
01562   for cluster_idx, data_key in zip(cluster_indices, data.keys()):
01563     if not cluster_idx in res:
01564       res[cluster_idx] = {}
01565     res[cluster_idx][data_key] = data[data_key]
01566   return res
01567 
01568 def _AngleArrayDistance(u, v):
01569   """
01570   :return: Average angular distance of two arrays of angles.
01571   :param u: Euler angles.
01572   :param v: Euler angles.
01573   """
01574   dist = []
01575   for a,b in zip(u,v):
01576     delta = abs(a-b)
01577     if delta > np.pi:
01578       delta = abs(2*np.pi - delta)
01579     dist.append(delta)
01580   return np.mean(dist)
01581 
01582 def _AxisDistance(u, v):
01583   """
01584   :return: Euclidean distance between two rotation axes. Axes can point in
01585            either direction so we ensure to use the closer one.
01586   :param u: Rotation axis.
01587   :param v: Rotation axis.
01588   """
01589   # get axes as arrays (for convenience)
01590   u = np.asarray(u)
01591   v = np.asarray(v)
01592   # get shorter of two possible distances (using v or -v)
01593   dv1 = u - v
01594   dv2 = u + v
01595   d1 = np.dot(dv1, dv1)
01596   d2 = np.dot(dv2, dv2)
01597   return np.sqrt(min(d1, d2))
01598 
01599 def _GetClosestChain(ent, ref_chain, chains):
01600   """
01601   :return: Chain closest to *ref_chain* based on center of atoms distance.
01602   :rtype:  :class:`str`
01603   :param ent: See :func:`_GetSymmetrySubgroups`
01604   :param ref_chain: We look for chains closest to this one
01605   :type ref_chain:  :class:`str`
01606   :param chains: We only consider these chains
01607   :type chains:  :class:`list` of :class:`str`
01608   """
01609   contacts = []
01610   ref_center = ent.FindChain(ref_chain).center_of_atoms
01611   for ch in chains:
01612     center = ent.FindChain(ch).center_of_atoms
01613     contacts.append((geom.Distance(ref_center, center), ch))
01614   closest_chain = min(contacts)[1]
01615   return closest_chain
01616 
01617 def _GetClosestChainInterface(ent, ref_chain, chains):
01618   """
01619   :return: Chain with biggest interface (within 10 A) to *ref_chain*.
01620   :rtype:  :class:`str`
01621   :param ent: See :func:`_GetSymmetrySubgroups`
01622   :param ref_chain: We look for chains closest to this one
01623   :type ref_chain:  :class:`str`
01624   :param chains: We only consider these chains
01625   :type chains:  :class:`list` of :class:`str`
01626   """
01627   # NOTE: this is computed on a subset of the full biounit and might be
01628   # inaccurate. Also it could be extracted from QSscoreEntity.contacts.
01629   closest = []
01630   for ch in chains:
01631     iface_view = ent.Select('cname=%s and 10 <> [cname=%s]' % (ref_chain, ch))
01632     nr_res = iface_view.residue_count
01633     closest.append((nr_res, ch))
01634   closest_chain = max(closest)[1]
01635   return closest_chain
01636 
01637 def _ValidChainGroup(group, ent):
01638   """
01639   :return: True, if *group* has unique chain names and as many chains as *ent*
01640   :rtype:  :class:`bool`
01641   :param group: Symmetry groups to check
01642   :type group:  Same as :attr:`QSscorer.symm_1`
01643   :param ent: See :func:`_GetSymmetrySubgroups`
01644   """
01645   all_chain = [item for sublist in group for item in sublist]
01646   if len(all_chain) == len(set(all_chain)) and\
01647      len(all_chain) == ent.chain_count:
01648     return True
01649   return False
01650 
01651 def _LimitChemMapping(chem_mapping, limit_1, limit_2):
01652   """
01653   :return: Chem. mapping containing only chains in *limit_1* and *limit_2*
01654   :rtype:  Same as :attr:`QSscorer.chem_mapping`
01655   :param chem_mapping: See :attr:`QSscorer.chem_mapping`
01656   :param limit_1: Limits chain names in chem_mapping.keys()
01657   :type limit_1:  List/tuple of strings
01658   :param limit_2: Limits chain names in chem_mapping.values()
01659   :type limit_2:  List/tuple of strings
01660   """
01661   # use set intersection to keep only mapping for chains in limit_X
01662   limit_1_set = set(limit_1)
01663   limit_2_set = set(limit_2)
01664   asu_chem_mapping = dict()
01665   for key, value in chem_mapping.iteritems():
01666     new_key = tuple(set(key) & limit_1_set)
01667     if new_key:
01668       asu_chem_mapping[new_key] = tuple(set(value) & limit_2_set)
01669   return asu_chem_mapping
01670 
01671 
01672 def _CountSuperpositionsAndMappings(symm_1, symm_2, chem_mapping):
01673   """
01674   :return: Dictionary of number of mappings and superpositions to be performed.
01675            Returned as *result[X][Y] = number* with X = "intra" or "inter" and
01676            Y = "mappings" or "superpositions". The idea is that for each
01677            pairwise superposition we check all possible mappings.
01678            We can check the combinations within (intra) a symmetry group and
01679            once established, we check the combinations between different (inter)
01680            symmetry groups.
01681   :param symm_1: See :attr:`QSscorer.symm_1`
01682   :param symm_2: See :attr:`QSscorer.symm_2`
01683   :param chem_mapping: See :attr:`QSscorer.chem_mapping`
01684   """
01685   # setup
01686   c = {}
01687   c['intra'] = {}
01688   c['inter'] = {}
01689   c['intra']['mappings'] = 0
01690   c['intra']['superpositions'] = 0
01691   c['inter']['mappings'] = 0
01692   c['inter']['superpositions'] = 0
01693   # intra ASU mappings within reference symmetry groups
01694   ref_symm_1 = symm_1[0]
01695   ref_symm_2 = symm_2[0]
01696   asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
01697   for g1, g2 in asu_chem_mapping.iteritems():
01698     chain_superpositions = len(g1) * len(g2)
01699     c['intra']['superpositions'] += chain_superpositions
01700     map_per_sup = _PermutationOrCombinations(g1[0], g2[0], asu_chem_mapping)
01701     c['intra']['mappings'] += chain_superpositions * map_per_sup
01702   if len(symm_1) == 1 or len(symm_2) == 1:
01703     return c
01704   # inter ASU mappings
01705   asu_superposition = min(len(symm_1), len(symm_2))
01706   c['inter']['superpositions'] = asu_superposition
01707   asu = {tuple(symm_1): tuple(symm_2)}
01708   map_per_sup = _PermutationOrCombinations(ref_symm_1, ref_symm_2, asu)
01709   c['inter']['mappings'] = asu_superposition * map_per_sup
01710   return c
01711 
01712 def _PermutationOrCombinations(sup1, sup2, chem_mapping):
01713   """Should match len(_ListPossibleMappings(sup1, sup2, chem_mapping))."""
01714   # remove superposed elements, put smallest complex as key
01715   chem_map = {}
01716   for a,b in chem_mapping.iteritems():
01717     new_a = tuple([x for x in a if x != sup1])
01718     new_b = tuple([x for x in b if x != sup2])
01719     chem_map[new_a] = new_b
01720   mapp_nr = 1.0
01721   for a, b in chem_map.iteritems():
01722     if len(a) == len(b):
01723       mapp_nr *= factorial(len(a))
01724     elif len(a) < len(b):
01725       mapp_nr *= binom(len(b), len(a))
01726     elif len(a) > len(b):
01727       mapp_nr *= binom(len(a), len(b))
01728   return int(mapp_nr)
01729 
01730 def _ListPossibleMappings(sup1, sup2, chem_mapping):
01731   """
01732   Return a flat list of all possible mappings given *chem_mapping* and keeping
01733   mapping of *sup1* and *sup2* fixed. For instance if elements are chain names
01734   this is all the mappings to check for a given superposition.
01735 
01736   Elements in first complex are defined by *chem_mapping.keys()* (list of list
01737   of elements) and elements in second complex by *chem_mapping.values()*. If
01738   complexes don't have same number of elements, we map only elements for the one
01739   with less. Also mapping is only between elements of mapped groups according to
01740   *chem_mapping*.
01741            
01742   :rtype:  :class:`list` of :class:`dict` (key = element in chem_mapping-key,
01743            value = element in chem_mapping-value)
01744   :param sup1: Element for which mapping is fixed.
01745   :type sup1:  Like element in chem_mapping-key
01746   :param sup2: Element for which mapping is fixed.
01747   :type sup2:  Like element in chem_mapping-value
01748   :param chem_mapping: Defines mapping between groups of elements (e.g. result
01749                        of :func:`_LimitChemMapping`).
01750   :type chem_mapping:  :class:`dict` with key / value = :class:`tuple`
01751 
01752   :raises: :class:`QSscoreError` if reference complex (first one or one with
01753            less elements) has more elements for any given mapped group.
01754   """
01755   # find smallest complex
01756   chain1 = [i for s in chem_mapping.keys() for i in s]
01757   chain2 = [i for s in chem_mapping.values() for i in s]
01758   swap = False
01759   if len(chain1) > len(chain2):
01760     swap = True
01761   # remove superposed elements, put smallest complex as key
01762   chem_map = {}
01763   for a, b in chem_mapping.iteritems():
01764     new_a = tuple([x for x in a if x != sup1])
01765     new_b = tuple([x for x in b if x != sup2])
01766     if swap:
01767       chem_map[new_b] = new_a
01768     else:
01769       chem_map[new_a] = new_b
01770   # generate permutations or combinations of chemically
01771   # equivalent chains
01772   chem_perm = []
01773   chem_ref = []
01774   for a, b in chem_map.iteritems():
01775     if len(a) == len(b):
01776       chem_perm.append(list(itertools.permutations(b)))
01777       chem_ref.append(a)
01778     elif len(a) < len(b):
01779       chem_perm.append(list(itertools.combinations(b, len(a))))
01780       chem_ref.append(a)
01781     else:
01782       raise QSscoreError('Impossible to define reference group: %s' \
01783                          % str(chem_map))
01784   # save the list of possible mappings
01785   mappings = []
01786   flat_ref = [i for s in chem_ref for i in s]
01787   for perm in itertools.product(*chem_perm):
01788     flat_perm = [i for s in perm for i in s]
01789     d = {c1: c2 for c1, c2 in zip(flat_ref, flat_perm)}
01790     if swap:
01791       d = {v: k for k, v in d.items()}
01792     d.update({sup1: sup2})
01793     mappings.append(d)
01794   return mappings
01795 
01796 
01797 def _CheckClosedSymmetry(ent_1, ent_2, symm_1, symm_2, chem_mapping,
01798                          sup_thr=4, sup_fract=0.8, find_best=True):
01799   """
01800   Quick check if we can superpose two chains and get a mapping for all other
01801   chains using the same transformation. The mapping is defined by sufficient
01802   overlap of the transformed chain of *ent_1* onto another chain in *ent_2*.
01803 
01804   :param ent_1: Entity to map to *ent_2* (perfect alignment assumed between
01805                 chains of same chem. group as in :attr:`QSscorer.ent_to_cm_1`).
01806                 Views are ok but only to select full chains!
01807   :param ent_2: Entity to map to (perfect alignment assumed between
01808                 chains of same chem. group as in :attr:`QSscorer.ent_to_cm_2`).
01809                 Views are ok but only to select full chains!
01810   :param symm_1: Symmetry groups to use. We only superpose chains within
01811                  reference symmetry group of *symm_1* and *symm_2*.
01812                  See :attr:`QSscorer.symm_1`
01813   :param symm_2: See :attr:`QSscorer.symm_2`
01814   :param chem_mapping: See :attr:`QSscorer.chem_mapping`.
01815                        All chains in *ent_1* / *ent_2* must be listed here!
01816   :param sup_thr: Distance around transformed chain in *ent_1* to check for
01817                   overlap.
01818   :type sup_thr:  :class:`float`
01819   :param sup_fract: Fraction of atoms in chain of *ent_2* that must be
01820                     overlapped for overlap to be sufficient.
01821   :type sup_fract:  :class:`float`
01822   :param find_best: If True, we look for best mapping according to
01823                     :func:`_ChainRMSD`. Otherwise, we return first suitable
01824                     mapping.
01825   :type find_best:  :class:`bool`
01826 
01827   :return: Mapping from *ent_1* to *ent_2* or None if none found. Mapping, if
01828            found, is as in :attr:`QSscorer.chain_mapping`.
01829   :rtype:  :class:`dict` (:class:`str` / :class:`str`)
01830   """
01831   # limit chem mapping to reference symmetry group
01832   ref_symm_1 = symm_1[0]
01833   ref_symm_2 = symm_2[0]
01834   asu_chem_mapping = _LimitChemMapping(chem_mapping, ref_symm_1, ref_symm_2)
01835   # for each chemically identical group
01836   rmsd_mappings = []
01837   for g1, g2 in asu_chem_mapping.iteritems():
01838     # try to superpose all possible pairs
01839     # -> note that some chain-chain combinations may work better than others
01840     #    to superpose the full oligomer (e.g. if some chains are open/closed)
01841     for c1, c2 in itertools.product(g1, g2):
01842       # get superposition transformation
01843       chain_1 = ent_1.Select('cname=%s' % c1)
01844       chain_2 = ent_2.Select('cname=%s' % c2)
01845       res = mol.alg.SuperposeSVD(chain_1, chain_2, apply_transform=False)
01846       # look for overlaps
01847       mapping = _GetSuperpositionMapping(ent_1, ent_2, chem_mapping,
01848                                          res.transformation, sup_thr,
01849                                          sup_fract)
01850       if not mapping:
01851         continue
01852       # early abort if we only want the first one
01853       if not find_best:
01854         return mapping
01855       else:
01856         # get RMSD for sorting
01857         rmsd = _GetMappedRMSD(ent_1, ent_2, mapping, res.transformation)
01858         rmsd_mappings.append((rmsd, mapping))
01859   # return best mapping
01860   if rmsd_mappings:
01861     return min(rmsd_mappings)[1]
01862   else:
01863     return None
01864 
01865 def _GetSuperpositionMapping(ent_1, ent_2, chem_mapping, transformation,
01866                              sup_thr, sup_fract):
01867   """
01868   :return: Dict with chain mapping from *ent_1* to *ent_2* or None if failed
01869            (see :func:`_CheckClosedSymmetry`).
01870   :param ent_1: See :func:`_CheckClosedSymmetry`
01871   :param ent_2: See :func:`_CheckClosedSymmetry`
01872   :param chem_mapping: See :func:`_CheckClosedSymmetry`
01873   :param transformation: Superposition transformation to be applied to *ent_1*.
01874   :param sup_thr: See :func:`_CheckClosedSymmetry`
01875   :param sup_fract: See :func:`_CheckClosedSymmetry`
01876   """
01877   # NOTE: this seems to be the comput. most expensive part in QS scoring
01878   # -> it could be moved to C++ for speed-up
01879   # -> the next expensive bits are ClustalW and GetContacts btw...
01880 
01881   # swap if needed (ent_1 must be shorter or same)
01882   if ent_1.chain_count > ent_2.chain_count:
01883     swap = True
01884     ent_1, ent_2 = ent_2, ent_1
01885     transformation = geom.Invert(transformation)
01886     chem_pairs = zip(chem_mapping.values(), chem_mapping.keys())
01887   else:
01888     swap = False
01889     chem_pairs = chem_mapping.iteritems()
01890   # sanity check
01891   if ent_1.chain_count == 0:
01892     return None
01893   # extract valid partners for each chain in ent_1
01894   chem_partners = dict()
01895   for cg1, cg2 in chem_pairs:
01896     for ch in cg1:
01897       chem_partners[ch] = set(cg2)
01898   # get mapping for each chain in ent_1
01899   mapping = dict()
01900   mapped_chains = set()
01901   for ch_1 in ent_1.chains:
01902     # get all neighbors split by chain (NOTE: this muight be moved to C++)
01903     ch_atoms = {ch_2.name: set() for ch_2 in ent_2.chains}
01904     for a_1 in ch_1.handle.atoms:
01905       transformed_pos = geom.Vec3(transformation * geom.Vec4(a_1.pos))
01906       a_within = ent_2.FindWithin(transformed_pos, sup_thr)
01907       for a_2 in a_within:
01908         ch_atoms[a_2.chain.name].add(a_2.hash_code)
01909     # get one with most atoms in overlap
01910     max_num, max_name = max((len(atoms), name)
01911                             for name, atoms in ch_atoms.iteritems())
01912     # early abort if overlap fraction not good enough
01913     ch_2 = ent_2.FindChain(max_name)
01914     if max_num == 0 or max_num / float(ch_2.handle.atom_count) < sup_fract:
01915       return None
01916     # early abort if mapped to chain of different chem. group
01917     ch_1_name = ch_1.name
01918     if ch_1_name not in chem_partners:
01919       raise RuntimeError("Chem. mapping doesn't contain all chains!")
01920     if max_name not in chem_partners[ch_1_name]:
01921       return None
01922     # early abort if multiple ones mapped to same chain
01923     if max_name in mapped_chains:
01924       return None
01925     # set mapping
01926     mapping[ch_1_name] = max_name
01927     mapped_chains.add(max_name)
01928   # unswap if needed and return
01929   if swap:
01930     mapping = {v: k for k, v in mapping.iteritems()}
01931   return mapping
01932 
01933 def _GetMappedRMSD(ent_1, ent_2, chain_mapping, transformation):
01934   """
01935   :return: RMSD between complexes considering chain mapping.
01936   :param ent_1: Entity mapped to *ent_2* (perfect alignment assumed between
01937                 mapped chains as in :attr:`QSscorer.ent_to_cm_1`).
01938   :param ent_2: Entity which was mapped to (perfect alignment assumed between
01939                 mapped chains as in :attr:`QSscorer.ent_to_cm_2`).
01940   :param chain_mapping: See :attr:`QSscorer.chain_mapping`
01941   :param transformation: Superposition transformation to be applied to *ent_1*.
01942   """
01943   # collect RMSDs and atom counts chain by chain and combine afterwards
01944   rmsds = []
01945   atoms = []
01946   for c1, c2 in chain_mapping.iteritems():
01947     # get views and atom counts
01948     chain_1 = ent_1.Select('cname=%s' % c1)
01949     chain_2 = ent_2.Select('cname=%s' % c2)
01950     atom_count = chain_1.atom_count
01951     if atom_count != chain_2.atom_count:
01952       raise RuntimeError('Chains in _GetMappedRMSD must be perfectly aligned!')
01953     # get RMSD
01954     rmsd = mol.alg.CalculateRMSD(chain_1, chain_2, transformation)
01955     # update lists
01956     rmsds.append(rmsd)
01957     atoms.append(float(atom_count))
01958   # combine (reminder: RMSD = sqrt(sum(atom_distance^2)/atom_count))
01959   rmsd = np.sqrt( sum([a * r**2 for a, r in zip(atoms, rmsds)]) / sum(atoms) )
01960   return rmsd
01961 
01962 class _CachedRMSD:
01963   """Helper object for repetitive RMSD calculations.
01964   Meant to speed up :func:`_GetChainMapping` but could also be used to replace
01965   :func:`_GetMappedRMSD` in :func:`_CheckClosedSymmetry`.
01966 
01967   :param ent_1: See :attr:`QSscorer.ent_to_cm_1`
01968   :param ent_2: See :attr:`QSscorer.ent_to_cm_2`
01969   """
01970   def __init__(self, ent_1, ent_2):
01971     # initialize caches and keep entities
01972     self.ent_1 = ent_1
01973     self.ent_2 = ent_2
01974     self._chain_views_1 = dict()
01975     self._chain_views_2 = dict()
01976     self._pair_rmsd = dict()  # key = (c1,c2), value = (atom_count,rmsd)
01977 
01978   def GetChainView1(self, cname):
01979     """Get cached view on chain *cname* for :attr:`ent_1`."""
01980     if cname not in self._chain_views_1:
01981       self._chain_views_1[cname] = self.ent_1.Select('cname=%s' % cname)
01982     return self._chain_views_1[cname]
01983 
01984   def GetChainView2(self, cname):
01985     """Get cached view on chain *cname* for :attr:`ent_2`."""
01986     if cname not in self._chain_views_2:
01987       self._chain_views_2[cname] = self.ent_2.Select('cname=%s' % cname)
01988     return self._chain_views_2[cname]
01989 
01990   def GetSuperposition(self, c1, c2):
01991     """Get superposition result (no change in entities!) for *c1* to *c2*.
01992     This invalidates cached RMSD results used in :func:`GetMappedRMSD`.
01993 
01994     :param c1: chain name for :attr:`ent_1`.
01995     :param c2: chain name for :attr:`ent_2`.
01996     """
01997     # invalidate _pair_rmsd
01998     self._pair_rmsd = dict()
01999     # superpose
02000     chain_1 = self.GetChainView1(c1)
02001     chain_2 = self.GetChainView2(c2)
02002     if chain_1.atom_count != chain_2.atom_count:
02003       raise RuntimeError('Chains in GetSuperposition not perfectly aligned!')
02004     return mol.alg.SuperposeSVD(chain_1, chain_2, apply_transform=False)
02005 
02006   def GetMappedRMSD(self, chain_mapping, transformation):
02007     """
02008     :return: RMSD between complexes considering chain mapping.
02009     :param chain_mapping: See :attr:`QSscorer.chain_mapping`.
02010     :param transformation: Superposition transformation (e.g. res.transformation
02011                            for res = :func:`GetSuperposition`).
02012     """
02013     # collect RMSDs and atom counts chain by chain and combine afterwards
02014     rmsds = []
02015     atoms = []
02016     for c1, c2 in chain_mapping.iteritems():
02017       # cached?
02018       if (c1, c2) in self._pair_rmsd:
02019         atom_count, rmsd = self._pair_rmsd[(c1, c2)]
02020       else:
02021         # get views and atom counts
02022         chain_1 = self.GetChainView1(c1)
02023         chain_2 = self.GetChainView2(c2)
02024         atom_count = chain_1.atom_count
02025         if atom_count != chain_2.atom_count:
02026           raise RuntimeError('Chains in GetMappedRMSD not perfectly aligned!')
02027         # get RMSD
02028         rmsd = mol.alg.CalculateRMSD(chain_1, chain_2, transformation)
02029         self._pair_rmsd[(c1, c2)] = (atom_count, rmsd)
02030       # update lists
02031       rmsds.append(rmsd)
02032       atoms.append(float(atom_count))
02033     # combine (reminder: RMSD = sqrt(sum(atom_distance^2)/atom_count))
02034     rmsd = np.sqrt( sum([a * r**2 for a, r in zip(atoms, rmsds)]) / sum(atoms) )
02035     return rmsd
02036 
02037 
02038 def _CleanUserSymmetry(symm, ent):
02039   """Clean-up user provided symmetry.
02040 
02041   :param symm: Symmetry group as in :attr:`QSscorer.symm_1`
02042   :param ent: Entity from which to extract chain names
02043   :type ent:  :class:`~ost.mol.EntityHandle` or :class:`~ost.mol.EntityView`
02044   :return: New symmetry group limited to chains in sub-structure *ent*
02045            (see :attr:`QSscorer.symm_1`). Empty list if invalid symmetry.
02046   """
02047   # restrict symm to only contain chains in ent
02048   chain_names = set([ch.name for ch in ent.chains])
02049   new_symm = list()
02050   for symm_group in symm:
02051     new_group = tuple(ch for ch in symm_group if ch in chain_names)
02052     if new_group:
02053       new_symm.append(new_group)
02054   # report it
02055   if new_symm != symm:
02056     LogInfo("Cleaned user symmetry for %s: %s" % (ent.GetName(), new_symm))
02057   # do all groups have same length?
02058   lengths = [len(symm_group) for symm_group in new_symm]
02059   if lengths[1:] != lengths[:-1]:
02060     LogWarning('User symmetry group of different sizes for %s. Ignoring it!' \
02061                % (ent.GetName()))
02062     return []
02063   # do we cover all chains?
02064   s_chain_names = set([ch for symm_group in new_symm for ch in symm_group])
02065   if len(s_chain_names) != sum(lengths):
02066     LogWarning('User symmetry group for %s has duplicate chains. Ignoring it!' \
02067                % (ent.GetName()))
02068     return []
02069   if s_chain_names != chain_names:
02070     LogWarning('User symmetry group for %s is missing chains. Ignoring it!' \
02071                % (ent.GetName()))
02072     return []
02073   # ok all good
02074   return new_symm
02075 
02076 def _AreValidSymmetries(symm_1, symm_2):
02077   """Check symmetry pair for major problems.
02078 
02079   :return: False if any of the two is empty or if they're compatible in size.
02080   :rtype:  :class:`bool`
02081   """
02082   if not symm_1 or not symm_2:
02083     return False
02084   if len(symm_1) != 1 or len(symm_2) != 1:
02085     if not len(symm_1[0]) == len(symm_2[0]):
02086       LogWarning('Symmetry groups of different sizes. Ignoring them!')
02087       return False
02088   return True
02089 
02090 def _GetMappedAlignments(ent_1, ent_2, chain_mapping):
02091   """
02092   :return: Alignments of 2 structures given chain mapping
02093            (see :attr:`QSscorer.alignments`).
02094   :param ent_1: Entity containing all chains in *chain_mapping.keys()*.
02095                 Views to this entity attached to first sequence of each aln.
02096   :param ent_2: Entity containing all chains in *chain_mapping.values()*.
02097                 Views to this entity attached to second sequence of each aln.
02098   :param chain_mapping: See :attr:`QSscorer.chain_mapping`
02099   """
02100   alns = []
02101   for ch_1_name in sorted(chain_mapping):
02102     # get both sequences incl. attached view
02103     ch_1 = ent_1.FindChain(ch_1_name)
02104     seq_1 = seq.SequenceFromChain(ch_1.name, ch_1)
02105     ch_2 = ent_2.FindChain(chain_mapping[ch_1_name])
02106     seq_2 = seq.SequenceFromChain(ch_2.name, ch_2)
02107     # align them
02108     aln = _AlignAtomSeqs(seq_1, seq_2)
02109     if aln: alns.append(aln)
02110   return alns
02111 
02112 def _GetMappedResidues(alns):
02113   """
02114   :return: Mapping of shared residues in *alns* (with views attached)
02115            (see :attr:`QSscorer.mapped_residues`).
02116   :param alns: See :attr:`QSscorer.alignments`
02117   """
02118   mapped_residues = dict()
02119   for aln in alns:
02120     # prepare dict for c1
02121     c1 = aln.GetSequence(0).name
02122     mapped_residues[c1] = dict()
02123     # get shared residues
02124     v1, v2 = seq.ViewsFromAlignment(aln)
02125     for res_1, res_2 in zip(v1.residues, v2.residues):
02126       r1 = res_1.number.num
02127       r2 = res_2.number.num
02128       mapped_residues[c1][r1] = r2
02129 
02130   return mapped_residues
02131 
02132 def _GetExtraWeights(contacts, done, mapped_residues):
02133   """Return sum of extra weights for contacts of chains in set and not in done.
02134   :return: Tuple (weight_extra_mapped, weight_extra_all).
02135            weight_extra_mapped only sums if both cX,rX in mapped_residues
02136            weight_extra_all sums all
02137   :param contacts: See :func:`GetContacts` for first entity
02138   :param done: List of (c1, c2, r1, r2) tuples to ignore
02139   :param mapped_residues: See :func:`_GetMappedResidues`
02140   """
02141   weight_extra_mapped = 0
02142   weight_extra_non_mapped = 0
02143   for c1 in contacts:
02144     for c2 in contacts[c1]:
02145       for r1 in contacts[c1][c2]:
02146         for r2 in contacts[c1][c2][r1]:
02147           if (c1, c2, r1, r2) not in done:
02148             weight = _weight(contacts[c1][c2][r1][r2])
02149             if     c1 in mapped_residues and r1 in mapped_residues[c1] \
02150                and c2 in mapped_residues and r2 in mapped_residues[c2]:
02151               weight_extra_mapped += weight
02152             else:
02153               weight_extra_non_mapped += weight
02154   return weight_extra_mapped, weight_extra_mapped + weight_extra_non_mapped
02155 
02156 def _GetScores(contacts_1, contacts_2, mapped_residues, chain_mapping):
02157   """Get QS scores (see :class:`QSscorer`).
02158 
02159   Note that if some chains are not to be considered at all, they must be removed
02160   from *contacts_1* / *contacts_2* prior to calling this.
02161 
02162   :param contacts_1: See :func:`GetContacts` for first entity
02163   :param contacts_2: See :func:`GetContacts` for second entity
02164   :param mapped_residues: See :func:`_GetMappedResidues`
02165   :param chain_mapping: Maps any chain name in *mapped_residues* to chain name
02166                         for *contacts_2*.
02167   :type chain_mapping:  :class:`dict` (:class:`str` / :class:`str`)
02168   :return: Tuple (QS_best, QS_global) (see :attr:`QSscorer.best_score` and
02169            see :attr:`QSscorer.global_score`)
02170   """
02171   # keep track of considered contacts as set of (c1,c2,r1,r2) tuples
02172   done_1 = set()
02173   done_2 = set()
02174   weighted_scores = 0
02175   weight_sum = 0
02176   # naming cXY, rXY: X = 1/2 for contact, Y = 1/2/T for mapping (T = tmp)
02177   # -> d1 = contacts_1[c11][c21][r11][r21], d2 = contacts_2[c12][c22][r12][r22]
02178   for c11 in contacts_1:
02179     # map to other one
02180     if c11 not in mapped_residues: continue
02181     c1T = chain_mapping[c11]
02182     # second chain
02183     for c21 in contacts_1[c11]:
02184       # map to other one
02185       if c21 not in mapped_residues: continue
02186       c2T = chain_mapping[c21]
02187       # flip if needed
02188       flipped_chains = (c1T > c2T)
02189       if flipped_chains:
02190         c12, c22 = c2T, c1T
02191       else:
02192         c12, c22 = c1T, c2T
02193       # skip early if no contacts there
02194       if c12 not in contacts_2 or c22 not in contacts_2[c12]: continue
02195       # loop over res.num. in c11
02196       for r11 in contacts_1[c11][c21]:
02197         # map to other one and skip if no contacts there
02198         if r11 not in mapped_residues[c11]: continue
02199         r1T = mapped_residues[c11][r11]
02200         # loop over res.num. in c21
02201         for r21 in contacts_1[c11][c21][r11]:
02202           # map to other one and skip if no contacts there
02203           if r21 not in mapped_residues[c21]: continue
02204           r2T = mapped_residues[c21][r21]
02205           # flip if needed
02206           if flipped_chains:
02207             r12, r22 = r2T, r1T
02208           else:
02209             r12, r22 = r1T, r2T
02210           # skip early if no contacts there
02211           if r12 not in contacts_2[c12][c22]: continue
02212           if r22 not in contacts_2[c12][c22][r12]: continue
02213           # ok now we can finally do the scoring
02214           d1 = contacts_1[c11][c21][r11][r21]
02215           d2 = contacts_2[c12][c22][r12][r22]
02216           weight = _weight(min(d1, d2))
02217           weighted_scores += weight * (1 - abs(d1 - d2) / 12)
02218           weight_sum += weight
02219           # keep track of done ones
02220           done_1.add((c11, c21, r11, r21))
02221           done_2.add((c12, c22, r12, r22))
02222 
02223   LogVerbose("Shared contacts: %d" % len(done_1))
02224 
02225   # add the extra weights
02226   weights_extra_1 = _GetExtraWeights(contacts_1, done_1, mapped_residues)
02227   mapped_residues_2 = dict()
02228   for c1 in mapped_residues:
02229     c2 = chain_mapping[c1]
02230     mapped_residues_2[c2] = set()
02231     for r1 in mapped_residues[c1]:
02232       mapped_residues_2[c2].add(mapped_residues[c1][r1])
02233   weights_extra_2 = _GetExtraWeights(contacts_2, done_2, mapped_residues_2)
02234   weight_extra_mapped = weights_extra_1[0] + weights_extra_2[0]
02235   weight_extra_all    = weights_extra_1[1] + weights_extra_2[1]
02236   
02237   # get scores
02238   denom_best = weight_sum + weight_extra_mapped
02239   denom_all  = weight_sum + weight_extra_all
02240   if denom_best == 0:
02241     QS_best = 0
02242   else:
02243     QS_best = weighted_scores / denom_best
02244   if denom_all == 0:
02245     QS_global = 0
02246   else:
02247     QS_global = weighted_scores / denom_all
02248   return QS_best, QS_global
02249 
02250 def _weight(dist):
02251   """
02252   This weight expresses the probability of two residues to interact given the CB
02253   distance (from Xu et al. http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2573399/)
02254   """
02255   if dist <= 5.0:
02256     return 1
02257   x = (dist-5.0)/4.28;
02258   return np.exp(-2*x*x)
02259 
02260 
02261 def _GetQsSuperposition(alns):
02262   """
02263   :return: Superposition result based on shared CA atoms in *alns*
02264            (with views attached) (see :attr:`QSscorer.superposition`).
02265   :param alns: See :attr:`QSscorer.alignments`
02266   """
02267   # check input
02268   if not alns:
02269     raise QSscoreError('No successful alignments for superposition!')
02270   # prepare views
02271   view_1 = alns[0].GetSequence(0).attached_view.CreateEmptyView()
02272   view_2 = alns[0].GetSequence(1).attached_view.CreateEmptyView()
02273   # collect CA from alignments in proper order
02274   for aln in alns:
02275     for col in aln:
02276       res_1 = col.GetResidue(0)
02277       res_2 = col.GetResidue(1)
02278       if res_1.IsValid() and res_2.IsValid():
02279         ca_1 = res_1.FindAtom('CA')
02280         ca_2 = res_2.FindAtom('CA')
02281         if ca_1.IsValid() and ca_2.IsValid():
02282           view_1.AddAtom(ca_1)
02283           view_2.AddAtom(ca_2)
02284   # superpose them without chainging entities
02285   res = mol.alg.SuperposeSVD(view_1, view_2, apply_transform=False)
02286   return res
02287 
02288 
02289 def _AddResidue(edi, res, rnum, chain, calpha_only):
02290   """
02291   Add residue *res* with res. num. *run* to given *chain* using editor *edi*.
02292   Either all atoms added or (if *calpha_only*) only CA.
02293   """
02294   if calpha_only:
02295     ca_atom = res.FindAtom('CA')
02296     if ca_atom.IsValid():
02297       new_res = edi.AppendResidue(chain, res.name, rnum)
02298       edi.InsertAtom(new_res, ca_atom.name, ca_atom.pos)
02299   else:
02300     new_res = edi.AppendResidue(chain, res.name, rnum)
02301     for atom in res.atoms:
02302       edi.InsertAtom(new_res, atom.name, atom.pos)
02303 
02304 def _MergeAlignedChains(alns, ent_1, ent_2, calpha_only):
02305   """
02306   Create two new entities (based on the alignments attached views) where all
02307   residues have same numbering (when they're aligned) and they are all pushed to
02308   a single chain X. Also append extra chains contained in *ent_1* and *ent_2*
02309   but not contained in *alns*.
02310 
02311   Used for :attr:`QSscorer.lddt_ref` and :attr:`QSscorer.lddt_mdl`
02312 
02313   :param alns: List of alignments with attached views (first sequence: *ent_1*,
02314                second: *ent_2*). Residue number in single chain is column index
02315                of current alignment + sum of lengths of all previous alignments
02316                (order of alignments as in input list).
02317   :type alns:  See :attr:`QSscorer.alignments`
02318   :param ent_1: First entity to process.
02319   :type ent_1:  :class:`~ost.mol.EntityHandle`
02320   :param ent_2: Second entity to process.
02321   :type ent_2:  :class:`~ost.mol.EntityHandle`
02322   :param calpha_only: If True, we only include CA atoms instead of all.
02323   :type calpha_only:  :class:`bool`
02324 
02325   :return: Tuple of two single chain entities (from *ent_1* and from *ent_2*)
02326   :rtype:  :class:`tuple` of :class:`~ost.mol.EntityHandle`
02327   """
02328   # first new entity
02329   ent_ren_1 = mol.CreateEntity()
02330   ed_1 = ent_ren_1.EditXCS()
02331   new_chain_1 = ed_1.InsertChain('X')
02332   # second one
02333   ent_ren_2 = mol.CreateEntity()
02334   ed_2 = ent_ren_2.EditXCS()
02335   new_chain_2 = ed_2.InsertChain('X')
02336   # the alignment already contains sorted chains
02337   rnum = 0
02338   chain_done_1 = set()
02339   chain_done_2 = set()
02340   for aln in alns:
02341     chain_done_1.add(aln.GetSequence(0).name)
02342     chain_done_2.add(aln.GetSequence(1).name)
02343     for col in aln:
02344       rnum += 1
02345       # add valid residues to single chain entities
02346       res_1 = col.GetResidue(0)
02347       if res_1.IsValid():
02348         _AddResidue(ed_1, res_1, rnum, new_chain_1, calpha_only)
02349       res_2 = col.GetResidue(1)
02350       if res_2.IsValid():
02351         _AddResidue(ed_2, res_2, rnum, new_chain_2, calpha_only)
02352   # extra chains
02353   for chain in ent_1.chains:
02354     if chain.name in chain_done_1:
02355       continue
02356     for res in chain.residues:
02357       rnum += 1
02358       _AddResidue(ed_1, res, rnum, new_chain_1, calpha_only)
02359   for chain in ent_2.chains:
02360     if chain.name in chain_done_2:
02361       continue
02362     for res in chain.residues:
02363       rnum += 1
02364       _AddResidue(ed_2, res, rnum, new_chain_2, calpha_only)
02365   # get entity names
02366   ent_ren_1.SetName(aln.GetSequence(0).GetAttachedView().GetName())
02367   ent_ren_2.SetName(aln.GetSequence(1).GetAttachedView().GetName())
02368   # connect atoms
02369   if not conop.GetDefaultLib():
02370     raise RuntimeError("QSscore computation requires a compound library!")
02371   pr = conop.RuleBasedProcessor(conop.GetDefaultLib())
02372   pr.Process(ent_ren_1)  
02373   ed_1.UpdateICS()
02374   pr.Process(ent_ren_2)  
02375   ed_2.UpdateICS()
02376   return ent_ren_1, ent_ren_2
02377 
02378 def _ComputeLDDTScore(ref, mdl):
02379   """
02380   :return: lDDT of *mdl* vs *ref* (see :attr:`QSscorer.lddt_score`).
02381   :param mdl: Reference entity (see :attr:`QSscorer.lddt_mdl`)
02382   :param ref: Model entity (see :attr:`QSscorer.lddt_ref`)
02383   """
02384   # check input
02385   LogInfo('Reference %s has: %s residues' % (ref.GetName(), ref.residue_count))
02386   LogInfo('Model %s has: %s residues' % (mdl.GetName(), mdl.residue_count))
02387   # get lddt score with fixed settings
02388   lddt_score = mol.alg.LocalDistDiffTest(mdl.Select(''), ref.Select(''),
02389                                          2., 8., 'lddt')
02390   LogInfo('lDDT score: %.3f' % lddt_score)
02391   # add lDDT as B-factor to model
02392   for r in mdl.residues:
02393     if r.HasProp('lddt'):
02394       for a in r.atoms:
02395         a.SetBFactor(r.GetFloatProp('lddt'))
02396     else:
02397       for a in r.atoms:
02398         a.SetBFactor(0.0)
02399   
02400   return lddt_score
02401 
02402 # specify public interface
02403 __all__ = ('QSscoreError', 'QSscorer', 'QSscoreEntity', 'FilterContacts',
02404            'GetContacts')