2 Chain mapping aims to identify a one-to-one relationship between chains in a
3 reference structure and a model.
11 from scipy.special
import factorial
12 from scipy.special
import binom
24 def _CSel(ent, cnames):
25 """ Returns view with specified chains
27 Ensures that quotation marks are around chain names to not confuse
28 OST query language with weird special characters.
30 query =
"cname=" +
','.join([mol.QueryQuoteName(cname)
for cname
in cnames])
31 return ent.Select(query)
34 """ Result object for the chain mapping functions in :class:`ChainMapper`
36 Constructor is directly called within the functions, no need to construct
37 such objects yourself.
39 def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns,
46 self.
_alns_alns = alns
51 """ Target/reference structure, i.e. :attr:`ChainMapper.target`
53 :type: :class:`ost.mol.EntityView`
59 """ Model structure that gets mapped onto :attr:`~target`
61 Underwent same processing as :attr:`ChainMapper.target`, i.e.
62 only contains peptide/nucleotide chains of sufficient size.
64 :type: :class:`ost.mol.EntityView`
70 """ Groups of chemically equivalent chains in :attr:`~target`
72 Same as :attr:`ChainMapper.chem_group`
74 :class:`list` of :class:`list` of :class:`str` (chain names)
80 """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`.
82 :class:`list` of :class:`list` of :class:`str` (chain names)
88 """ Mapping of :attr:`~model` chains onto :attr:`~target`
90 Exact same shape as :attr:`~chem_groups` but containing the names of the
91 mapped chains in :attr:`~model`. May contain None for :attr:`~target`
92 chains that are not covered. No guarantee that all chains in
93 :attr:`~model` are mapped.
95 :class:`list` of :class:`list` of :class:`str` (chain names)
101 """ Alignments of mapped chains in :attr:`~target` and :attr:`~model`
103 Each alignment is accessible with ``alns[(t_chain,m_chain)]``. First
104 sequence is the sequence of :attr:`target` chain, second sequence the
105 one from :attr:`~model`. The respective :class:`ost.mol.EntityView` are
106 attached with :func:`ost.seq.ConstSequenceHandle.AttachView`.
108 :type: :class:`dict` with key: :class:`tuple` of :class:`str`, value:
109 :class:`ost.seq.AlignmentHandle`
111 return self.
_alns_alns
115 """ Placeholder property without any guarantee of being set
117 Different scores get optimized in the various chain mapping algorithms.
118 Some of them may set their final optimal score in that property.
119 Consult the documentation of the respective chain mapping algorithm
120 for more information. Won't be in the return dict of
126 """ Returns flat mapping as :class:`dict` for all mapable chains
128 :param mdl_as_key: Default is target chain name as key and model chain
129 name as value. This can be reversed with this flag.
130 :returns: :class:`dict` with :class:`str` as key/value that describe
133 flat_mapping = dict()
134 for trg_chem_group, mdl_chem_group
in zip(self.
chem_groupschem_groups,
136 for a,b
in zip(trg_chem_group, mdl_chem_group):
137 if a
is not None and b
is not None:
145 """ Returns JSON serializable summary of results
148 json_dict[
"chem_groups"] = self.
chem_groupschem_groups
149 json_dict[
"mapping"] = self.
mappingmapping
151 json_dict[
"alns"] = list()
152 for aln
in self.
alnsalns.values():
153 trg_seq = aln.GetSequence(0)
154 mdl_seq = aln.GetSequence(1)
155 aln_dict = {
"trg_ch": trg_seq.GetName(),
"trg_seq": str(trg_seq),
156 "mdl_ch": mdl_seq.GetName(),
"mdl_seq": str(mdl_seq)}
157 json_dict[
"alns"].append(aln_dict)
163 """ Result object for :func:`ChainMapper.GetRepr`
165 Constructor is directly called within the function, no need to construct
166 such objects yourself.
168 :param lDDT: lDDT for this mapping. Depends on how you call
169 :func:`ChainMapper.GetRepr` whether this is backbone only or
171 :type lDDT: :class:`float`
172 :param substructure: The full substructure for which we searched for a
174 :type substructure: :class:`ost.mol.EntityView`
175 :param ref_view: View pointing to the same underlying entity as
176 *substructure* but only contains the stuff that is mapped
177 :type ref_view: :class:`mol.EntityView`
178 :param mdl_view: The matching counterpart in model
179 :type mdl_view: :class:`mol.EntityView`
181 def __init__(self, lDDT, substructure, ref_view, mdl_view):
182 self.
_lDDT_lDDT = lDDT
184 assert(len(ref_view.residues) == len(mdl_view.residues))
201 """ lDDT of representation result
203 Depends on how you call :func:`ChainMapper.GetRepr` whether this is
204 backbone only or full atom lDDT.
206 :type: :class:`float`
208 return self.
_lDDT_lDDT
212 """ The full substructure for which we searched for a
215 :type: :class:`ost.mol.EntityView`
221 """ View which contains the mapped subset of :attr:`substructure`
223 :type: :class:`ost.mol.EntityView`
229 """ The :attr:`ref_view` representation in the model
231 :type: :class:`ost.mol.EntityView`
237 """ The reference residues
239 :type: class:`mol.ResidueViewList`
241 return self.
ref_viewref_view.residues
245 """ The model residues
247 :type: :class:`mol.ResidueViewList`
249 return self.
mdl_viewmdl_view.residues
253 """ A list of mapped residue whose names do not match (eg. ALA in the
254 reference and LEU in the model).
256 The mismatches are reported as a tuple of :class:`~ost.mol.ResidueView`
257 (reference, model), or as an empty list if all the residue names match.
268 """ Representative backbone positions for reference residues.
270 Thats CA positions for peptides and C3' positions for Nucleotides.
272 :type: :class:`geom.Vec3List`
280 """ Representative backbone positions for model residues.
282 Thats CA positions for peptides and C3' positions for Nucleotides.
284 :type: :class:`geom.Vec3List`
292 """ Representative backbone positions for reference residues.
294 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
295 positions for Nucleotides.
297 :type: :class:`geom.Vec3List`
305 """ Representative backbone positions for reference residues.
307 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
308 positions for Nucleotides.
310 :type: :class:`geom.Vec3List`
319 """ Superposition of mdl residues onto ref residues
321 Superposition computed as minimal RMSD superposition on
322 :attr:`ref_bb_pos` and :attr:`mdl_bb_pos`. If number of positions is
323 smaller 3, the full_bb_pos equivalents are used instead.
325 :type: :class:`ost.mol.alg.SuperpositionResult`
338 """ Transformation to superpose mdl residues onto ref residues
340 :type: :class:`ost.geom.Mat4`
346 """ :attr:`mdl_bb_pos` with :attr:`transform applied`
348 :type: :class:`geom.Vec3List`
357 """ RMSD of the binding site backbone atoms after :attr:`superposition`
359 :type: :class:`float`
366 """ query for mdl residues in OpenStructure query language
368 Repr can be selected as ``full_mdl.Select(ost_query)``
370 Returns invalid query if residue numbers have insertion codes.
377 chname = r.GetChain().GetName()
378 rnum = r.GetNumber().GetNum()
379 if chname
not in chain_rnums:
380 chain_rnums[chname] = list()
381 chain_rnums[chname].append(str(rnum))
382 chain_queries = list()
383 for k,v
in chain_rnums.items():
384 q = f
"(cname={mol.QueryQuoteName(k)} and "
385 q += f
"rnum={','.join(v)})"
386 chain_queries.append(q)
387 self.
_ost_query_ost_query =
" or ".join(chain_queries)
391 """ Returns JSON serializable summary of results
394 json_dict[
"lDDT"] = self.
lDDTlDDT
395 json_dict[
"ref_residues"] = [r.GetQualifiedName()
for r
in \
397 json_dict[
"mdl_residues"] = [r.GetQualifiedName()
for r
in \
399 json_dict[
"transform"] = list(self.
transformtransform.data)
400 json_dict[
"bb_rmsd"] = self.
bb_rmsdbb_rmsd
401 json_dict[
"ost_query"] = self.
ost_queryost_query
406 """ Returns flat mapping of all chains in the representation
408 :param mdl_as_key: Default is target chain name as key and model chain
409 name as value. This can be reversed with this flag.
410 :returns: :class:`dict` with :class:`str` as key/value that describe
413 flat_mapping = dict()
416 flat_mapping[mdl_res.chain.name] = trg_res.chain.name
418 flat_mapping[trg_res.chain.name] = mdl_res.chain.name
421 def _GetFullBBPos(self, residues):
422 """ Helper to extract full backbone positions
424 exp_pep_atoms = [
"N",
"CA",
"C"]
425 exp_nuc_atoms = [
"\"O5'\"",
"\"C5'\"",
"\"C4'\"",
"\"C3'\"",
"\"O3'\""]
428 if r.GetChemType() == mol.ChemType.NUCLEOTIDES:
429 exp_atoms = exp_nuc_atoms
430 elif r.GetChemType() == mol.ChemType.AMINOACIDS:
431 exp_atoms = exp_pep_atoms
433 raise RuntimeError(
"Something terrible happened... RUN...")
434 for aname
in exp_atoms:
435 a = r.FindAtom(aname)
437 raise RuntimeError(
"Something terrible happened... "
439 bb_pos.append(a.GetPos())
442 def _GetBBPos(self, residues):
443 """ Helper to extract single representative position for each residue
447 at = r.FindAtom(
"CA")
449 at = r.FindAtom(
"C3'")
451 raise RuntimeError(
"Something terrible happened... RUN...")
452 bb_pos.append(at.GetPos())
455 def _GetInconsistentResidues(self, ref_residues, mdl_residues):
456 """ Helper to extract a list of inconsistent residues.
458 if len(ref_residues) != len(mdl_residues):
459 raise ValueError(
"Something terrible happened... Reference and "
460 "model lengths differ... RUN...")
461 inconsistent_residues = list()
462 for ref_residue, mdl_residue
in zip(ref_residues, mdl_residues):
463 if ref_residue.name != mdl_residue.name:
464 inconsistent_residues.append((ref_residue, mdl_residue))
465 return inconsistent_residues
469 """ Class to compute chain mappings
471 All algorithms are performed on processed structures which fulfill
472 criteria as given in constructor arguments (*min_pep_length*,
473 "min_nuc_length") and only contain residues which have all required backbone
474 atoms. for peptide residues thats N, CA, C and CB (no CB for GLY), for
475 nucleotide residues thats O5', C5', C4', C3' and O3'.
477 Chain mapping is a three step process:
479 * Group chemically identical chains in *target* using pairwise
480 alignments that are either computed with Needleman-Wunsch (NW) or
481 simply derived from residue numbers (*resnum_alignments* flag).
482 In case of NW, *pep_subst_mat*, *pep_gap_open* and *pep_gap_ext*
483 and their nucleotide equivalents are relevant. Two chains are
484 considered identical if they fulfill the *pep_seqid_thr* and
485 have at least *min_pep_length* aligned residues. Same logic
486 is applied for nucleotides with respective thresholds.
488 * Map chains in an input model to these groups. Generating alignments
489 and the similarity criteria are the same as above. You can either
490 get the group mapping with :func:`GetChemMapping` or directly call
491 one of the full fletched one-to-one chain mapping functions which
492 execute that step internally.
494 * Obtain one-to-one mapping for chains in an input model and
495 *target* with one of the available mapping functions. Just to get an
496 idea of complexity. If *target* and *model* are octamers, there are
497 ``8! = 40320`` possible chain mappings.
499 :param target: Target structure onto which models are mapped.
500 Computations happen on a selection only containing
501 polypeptides and polynucleotides.
502 :type target: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
503 :param resnum_alignments: Use residue numbers instead of
504 Needleman-Wunsch to compute pairwise
505 alignments. Relevant for :attr:`~chem_groups`
506 and related attributes.
507 :type resnum_alignments: :class:`bool`
508 :param pep_seqid_thr: Threshold used to decide when two chains are
509 identical. 95 percent tolerates the few mutations
510 crystallographers like to do.
511 :type pep_seqid_thr: :class:`float`
512 :param nuc_seqid_thr: Nucleotide equivalent for *pep_seqid_thr*
513 :type nuc_seqid_thr: :class:`float`
514 :param pep_subst_mat: Substitution matrix to align peptide sequences,
515 irrelevant if *resnum_alignments* is True,
516 defaults to seq.alg.BLOSUM62
517 :type pep_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
518 :param pep_gap_open: Gap open penalty to align peptide sequences,
519 irrelevant if *resnum_alignments* is True
520 :type pep_gap_open: :class:`int`
521 :param pep_gap_ext: Gap extension penalty to align peptide sequences,
522 irrelevant if *resnum_alignments* is True
523 :type pep_gap_ext: :class:`int`
524 :param nuc_subst_mat: Nucleotide equivalent for *pep_subst_mat*,
525 defaults to seq.alg.NUC44
526 :type nuc_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
527 :param nuc_gap_open: Nucleotide equivalent for *pep_gap_open*
528 :type nuc_gap_open: :class:`int`
529 :param nuc_gap_ext: Nucleotide equivalent for *pep_gap_ext*
530 :type nuc_gap_ext: :class:`int`
531 :param min_pep_length: Minimal number of residues for a peptide chain to be
532 considered in target and in models.
533 :type min_pep_length: :class:`int`
534 :param min_nuc_length: Minimal number of residues for a nucleotide chain to be
535 considered in target and in models.
536 :type min_nuc_length: :class:`int`
537 :param n_max_naive: Max possible chain mappings that are enumerated in
538 :func:`~GetNaivelDDTMapping` /
539 :func:`~GetDecomposerlDDTMapping`. A
540 :class:`RuntimeError` is raised in case of bigger
542 :type n_max_naive: :class:`int`
544 def __init__(self, target, resnum_alignments=False,
545 pep_seqid_thr = 95., nuc_seqid_thr = 95.,
546 pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -11,
547 pep_gap_ext = -1, nuc_subst_mat = seq.alg.NUC44,
548 nuc_gap_open = -4, nuc_gap_ext = -4,
549 min_pep_length = 6, min_nuc_length = 4,
568 pep_subst_mat = pep_subst_mat,
569 pep_gap_open = pep_gap_open,
570 pep_gap_ext = pep_gap_ext,
571 nuc_subst_mat = nuc_subst_mat,
572 nuc_gap_open = nuc_gap_open,
573 nuc_gap_ext = nuc_gap_ext)
576 self._target, self._polypep_seqs, self.
_polynuc_seqs_polynuc_seqs = \
581 """Target structure that only contains peptides/nucleotides
583 Contains only residues that have the backbone representatives
584 (CA for peptide and C3' for nucleotides) to avoid ATOMSEQ alignment
585 inconsistencies when switching between all atom and backbone only
588 :type: :class:`ost.mol.EntityView`
594 """Sequences of peptide chains in :attr:`~target`
596 Respective :class:`EntityView` from *target* for each sequence s are
597 available as ``s.GetAttachedView()``
599 :type: :class:`ost.seq.SequenceList`
601 return self._polypep_seqs
605 """Sequences of nucleotide chains in :attr:`~target`
607 Respective :class:`EntityView` from *target* for each sequence s are
608 available as ``s.GetAttachedView()``
610 :type: :class:`ost.seq.SequenceList`
616 """Groups of chemically equivalent chains in :attr:`~target`
618 First chain in group is the one with longest sequence.
620 :getter: Computed on first use (cached)
621 :type: :class:`list` of :class:`list` of :class:`str` (chain names)
626 self.
_chem_groups_chem_groups.append([s.GetName()
for s
in a.sequences])
631 """MSA for each group in :attr:`~chem_groups`
633 Sequences in MSAs exhibit same order as in :attr:`~chem_groups` and
634 have the respective :class:`ost.mol.EntityView` from *target* attached.
636 :getter: Computed on first use (cached)
637 :type: :class:`ost.seq.AlignmentList`
652 """Reference (longest) sequence for each group in :attr:`~chem_groups`
654 Respective :class:`EntityView` from *target* for each sequence s are
655 available as ``s.GetAttachedView()``
657 :getter: Computed on first use (cached)
658 :type: :class:`ost.seq.SequenceList`
663 s = seq.CreateSequence(a.GetSequence(0).GetName(),
664 a.GetSequence(0).GetGaplessString())
665 s.AttachView(a.GetSequence(0).GetAttachedView())
671 """ChemType of each group in :attr:`~chem_groups`
673 Specifying if groups are poly-peptides/nucleotides, i.e.
674 :class:`ost.mol.ChemType.AMINOACIDS` or
675 :class:`ost.mol.ChemType.NUCLEOTIDES`
677 :getter: Computed on first use (cached)
678 :type: :class:`list` of :class:`ost.mol.ChemType`
692 """Maps sequences in *model* to chem_groups of target
694 :param model: Model from which to extract sequences, a
695 selection that only includes peptides and nucleotides
696 is performed and returned along other results.
697 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
698 :returns: Tuple with two lists of length `len(self.chem_groups)` and
699 an :class:`ost.mol.EntityView` representing *model*:
700 1) Each element is a :class:`list` with mdl chain names that
701 map to the chem group at that position.
702 2) Each element is a :class:`ost.seq.AlignmentList` aligning
703 these mdl chain sequences to the chem group ref sequences.
704 3) A selection of *model* that only contains polypeptides and
705 polynucleotides whose ATOMSEQ exactly matches the sequence
706 info in the returned alignments.
708 mdl, mdl_pep_seqs, mdl_nuc_seqs = self.
ProcessStructureProcessStructure(model)
709 mapping = [list()
for x
in self.
chem_groupschem_groups]
710 alns = [seq.AlignmentList()
for x
in self.
chem_groupschem_groups]
712 for s
in mdl_pep_seqs:
715 s, mol.ChemType.AMINOACIDS,
718 mapping[idx].append(s.GetName())
719 alns[idx].append(aln)
721 for s
in mdl_nuc_seqs:
724 s, mol.ChemType.NUCLEOTIDES,
727 mapping[idx].append(s.GetName())
728 alns[idx].append(aln)
730 return (mapping, alns, mdl)
734 thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic",
735 steep_opt_rate = None, block_seed_size = 5,
736 block_blocks_per_chem_group = 5,
737 chem_mapping_result = None,
738 heuristic_n_max_naive = 40320):
739 """ Identify chain mapping by optimizing lDDT score
741 Maps *model* chain sequences to :attr:`~chem_groups` and find mapping
742 based on backbone only lDDT score (CA for amino acids C3' for
745 Either performs a naive search, i.e. enumerate all possible mappings or
746 executes a greedy strategy that tries to identify a (close to) optimal
747 mapping in an iterative way by starting from a start mapping (seed). In
748 each iteration, the one-to-one mapping that leads to highest increase
749 in number of conserved contacts is added with the additional requirement
750 that this added mapping must have non-zero interface counts towards the
751 already mapped chains. So basically we're "growing" the mapped structure
752 by only adding connected stuff.
754 The available strategies:
756 * **naive**: Enumerates all possible mappings and returns best
758 * **greedy_fast**: perform all vs. all single chain lDDTs within the
759 respective ref/mdl chem groups. The mapping with highest number of
760 conserved contacts is selected as seed for greedy extension
762 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
763 all ref/mdl chain combinations within the respective chem groups and
764 retain the mapping leading to the best lDDT.
766 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
767 all ref/mdl chain combinations within the respective chem groups and
768 extend them to *block_seed_size*. *block_blocks_per_chem_group*
769 for each chem group are selected for exhaustive extension.
771 * **heuristic**: Uses *naive* strategy if number of possible mappings
772 is within *heuristic_n_max_naive*. The default of 40320 corresponds
773 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
774 6!*2!=1440 etc. If the number of possible mappings is larger,
775 *greedy_full* is used.
777 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
780 :param model: Model to map
781 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
782 :param inclusion_radius: Inclusion radius for lDDT
783 :type inclusion_radius: :class:`float`
784 :param thresholds: Thresholds for lDDT
785 :type thresholds: :class:`list` of :class:`float`
786 :param strategy: Strategy to find mapping. Must be in ["naive",
787 "greedy_fast", "greedy_full", "greedy_block"]
788 :type strategy: :class:`str`
789 :param steep_opt_rate: Only relevant for greedy strategies.
790 If set, every *steep_opt_rate* mappings, a simple
791 optimization is executed with the goal of
792 avoiding local minima. The optimization
793 iteratively checks all possible swaps of mappings
794 within their respective chem groups and accepts
795 swaps that improve lDDT score. Iteration stops as
796 soon as no improvement can be achieved anymore.
797 :type steep_opt_rate: :class:`int`
798 :param block_seed_size: Param for *greedy_block* strategy - Initial seeds
799 are extended by that number of chains.
800 :type block_seed_size: :class:`int`
801 :param block_blocks_per_chem_group: Param for *greedy_block* strategy -
802 Number of blocks per chem group that
803 are extended in an initial search
804 for high scoring local solutions.
805 :type block_blocks_per_chem_group: :class:`int`
806 :param chem_mapping_result: Pro param. The result of
807 :func:`~GetChemMapping` where you provided
808 *model*. If set, *model* parameter is not
810 :type chem_mapping_result: :class:`tuple`
811 :returns: A :class:`MappingResult`
814 strategies = [
"naive",
"greedy_fast",
"greedy_full",
"greedy_block",
816 if strategy
not in strategies:
817 raise RuntimeError(f
"Strategy must be in {strategies}")
819 if chem_mapping_result
is None:
820 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
822 chem_mapping, chem_group_alns, mdl = chem_mapping_result
824 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
830 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
831 if one_to_one
is not None:
833 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
834 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
835 if ref_ch
is not None and mdl_ch
is not None:
836 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
837 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
838 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
839 alns[(ref_ch, mdl_ch)] = aln
843 if strategy ==
"heuristic":
844 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
845 heuristic_n_max_naive):
848 strategy =
"greedy_full"
853 if strategy ==
"naive":
854 mapping, opt_lddt = _lDDTNaive(self.
targettarget, mdl, inclusion_radius,
856 chem_mapping, ref_mdl_alns,
861 chem_mapping, ref_mdl_alns,
862 inclusion_radius=inclusion_radius,
863 thresholds=thresholds,
864 steep_opt_rate=steep_opt_rate)
865 if strategy ==
"greedy_fast":
866 mapping = _lDDTGreedyFast(the_greed)
867 elif strategy ==
"greedy_full":
868 mapping = _lDDTGreedyFull(the_greed)
869 elif strategy ==
"greedy_block":
870 mapping = _lDDTGreedyBlock(the_greed, block_seed_size,
871 block_blocks_per_chem_group)
873 opt_lddt = the_greed.Score(mapping)
876 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, mapping):
877 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
878 if ref_ch
is not None and mdl_ch
is not None:
879 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
880 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
881 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
882 alns[(ref_ch, mdl_ch)] = aln
885 mapping, alns, opt_score = opt_lddt)
889 block_seed_size = 5, block_blocks_per_chem_group = 5,
890 heuristic_n_max_naive = 40320, steep_opt_rate = None,
891 chem_mapping_result = None,
892 greedy_prune_contact_map = True):
893 """ Identify chain mapping based on QSScore
895 Scoring is based on CA/C3' positions which are present in all chains of
896 a :attr:`chem_groups` as well as the *model* chains which are mapped to
897 that respective chem group.
899 The following strategies are available:
901 * **naive**: Naively iterate all possible mappings and return best based
904 * **greedy_fast**: perform all vs. all single chain lDDTs within the
905 respective ref/mdl chem groups. The mapping with highest number of
906 conserved contacts is selected as seed for greedy extension.
907 Extension is based on QS-score.
909 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
910 all ref/mdl chain combinations within the respective chem groups and
911 retain the mapping leading to the best QS-score.
913 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
914 all ref/mdl chain combinations within the respective chem groups and
915 extend them to *block_seed_size*. *block_blocks_per_chem_group*
916 for each chem group are selected for exhaustive extension.
918 * **heuristic**: Uses *naive* strategy if number of possible mappings
919 is within *heuristic_n_max_naive*. The default of 40320 corresponds
920 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
921 6!*2!=1440 etc. If the number of possible mappings is larger,
922 *greedy_full* is used.
924 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
927 :param model: Model to map
928 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
929 :param contact_d: Max distance between two residues to be considered as
930 contact in qs scoring
931 :type contact_d: :class:`float`
932 :param strategy: Strategy for sampling, must be in ["naive",
933 "greedy_fast", "greedy_full", "greedy_block"]
934 :type strategy: :class:`str`
935 :param chem_mapping_result: Pro param. The result of
936 :func:`~GetChemMapping` where you provided
937 *model*. If set, *model* parameter is not
939 :type chem_mapping_result: :class:`tuple`
940 :param greedy_prune_contact_map: Relevant for all strategies that use
941 greedy extensions. If True, only chains
942 with at least 3 contacts (8A CB
943 distance) towards already mapped chains
944 in trg/mdl are considered for
945 extension. All chains that give a
946 potential non-zero QS-score increase
947 are used otherwise (at least one
948 contact within 12A). The consequence
949 is reduced runtime and usually no
950 real reduction in accuracy.
951 :returns: A :class:`MappingResult`
954 strategies = [
"naive",
"greedy_fast",
"greedy_full",
"greedy_block",
"heuristic"]
955 if strategy
not in strategies:
956 raise RuntimeError(f
"strategy must be {strategies}")
958 if chem_mapping_result
is None:
959 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
961 chem_mapping, chem_group_alns, mdl = chem_mapping_result
962 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
967 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
968 if one_to_one
is not None:
970 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
971 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
972 if ref_ch
is not None and mdl_ch
is not None:
973 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
974 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
975 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
976 alns[(ref_ch, mdl_ch)] = aln
980 if strategy ==
"heuristic":
981 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
982 heuristic_n_max_naive):
985 strategy =
"greedy_full"
990 if strategy ==
"naive":
991 mapping, opt_qsscore = _QSScoreNaive(self.
targettarget, mdl,
993 chem_mapping, ref_mdl_alns,
999 chem_mapping, ref_mdl_alns,
1000 contact_d = contact_d,
1001 steep_opt_rate=steep_opt_rate,
1002 greedy_prune_contact_map = greedy_prune_contact_map)
1003 if strategy ==
"greedy_fast":
1004 mapping = _QSScoreGreedyFast(the_greed)
1005 elif strategy ==
"greedy_full":
1006 mapping = _QSScoreGreedyFull(the_greed)
1007 elif strategy ==
"greedy_block":
1008 mapping = _QSScoreGreedyBlock(the_greed, block_seed_size,
1009 block_blocks_per_chem_group)
1011 opt_qsscore = the_greed.Score(mapping, check=
False).QS_global
1014 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, mapping):
1015 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1016 if ref_ch
is not None and mdl_ch
is not None:
1017 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1018 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1019 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1020 alns[(ref_ch, mdl_ch)] = aln
1023 mapping, alns, opt_score = opt_qsscore)
1026 chem_mapping_result = None, heuristic_n_max_naive = 120):
1027 """Identify chain mapping based on minimal RMSD superposition
1029 Superposition and scoring is based on CA/C3' positions which are present
1030 in all chains of a :attr:`chem_groups` as well as the *model*
1031 chains which are mapped to that respective chem group.
1033 The following strategies are available:
1035 * **naive**: Naively iterate all possible mappings and return the one
1038 * **greedy_single**: perform all vs. all single chain superpositions
1039 within the respective ref/mdl chem groups to use as starting points.
1040 For each starting point, iteratively add the model/target chain pair
1041 with lowest RMSD until a full mapping is achieved. The mapping with
1042 lowest RMSD is returned.
1044 * **greedy_iterative**: Same as greedy_single_rmsd exept that the
1045 transformation gets updated with each added chain pair.
1047 * **heuristic**: Uses *naive* strategy if number of possible mappings
1048 is within *heuristic_n_max_naive*. The default of 120 corresponds
1049 to a homo-pentamer (5!=120). If the number of possible mappings is
1050 larger, *greedy_iterative* is used.
1052 :param model: Model to map
1053 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1054 :param strategy: Strategy for sampling. Must be in ["naive",
1055 "greedy_single", "greedy_iterative"]
1056 :type strategy: :class:`str`
1057 :param subsampling: If given, only an equally distributed subset
1058 of CA/C3' positions in each chain are used for
1059 superposition/scoring.
1060 :type subsampling: :class:`int`
1061 :param chem_mapping_result: Pro param. The result of
1062 :func:`~GetChemMapping` where you provided
1063 *model*. If set, *model* parameter is not
1065 :type chem_mapping_result: :class:`tuple`
1066 :returns: A :class:`MappingResult`
1069 strategies = [
"naive",
"greedy_single",
"greedy_iterative",
"heuristic"]
1071 if strategy
not in strategies:
1072 raise RuntimeError(f
"strategy must be {strategies}")
1074 if chem_mapping_result
is None:
1075 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1077 chem_mapping, chem_group_alns, mdl = chem_mapping_result
1078 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1084 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
1085 if one_to_one
is not None:
1087 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
1088 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1089 if ref_ch
is not None and mdl_ch
is not None:
1090 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1091 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1092 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1093 alns[(ref_ch, mdl_ch)] = aln
1097 trg_group_pos, mdl_group_pos = _GetRefPos(self.
targettarget, mdl,
1100 max_pos = subsampling)
1102 if strategy ==
"heuristic":
1103 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
1104 heuristic_n_max_naive):
1107 strategy =
"greedy_iterative"
1111 if strategy.startswith(
"greedy"):
1114 initial_transforms = list()
1115 initial_mappings = list()
1116 for trg_pos, trg_chains, mdl_pos, mdl_chains
in zip(trg_group_pos,
1120 for t_pos, t
in zip(trg_pos, trg_chains):
1121 for m_pos, m
in zip(mdl_pos, mdl_chains):
1122 if len(t_pos) >= 3
and len(m_pos) >= 3:
1123 transform = _GetSuperposition(m_pos, t_pos,
1124 False).transformation
1125 initial_transforms.append(transform)
1126 initial_mappings.append((t,m))
1128 if strategy ==
"greedy_single":
1129 mapping = _SingleRigidRMSD(initial_transforms,
1137 elif strategy ==
"greedy_iterative":
1138 mapping = _IterativeRigidRMSD(initial_transforms,
1141 trg_group_pos, mdl_group_pos)
1142 elif strategy ==
"naive":
1143 mapping = _NaiveRMSD(self.
chem_groupschem_groups, chem_mapping,
1144 trg_group_pos, mdl_group_pos,
1148 final_mapping = list()
1150 mapped_mdl_chains = list()
1151 for ref_ch
in ref_chains:
1152 if ref_ch
in mapping:
1153 mapped_mdl_chains.append(mapping[ref_ch])
1155 mapped_mdl_chains.append(
None)
1156 final_mapping.append(mapped_mdl_chains)
1159 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, final_mapping):
1160 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1161 if ref_ch
is not None and mdl_ch
is not None:
1162 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1163 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1164 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1165 alns[(ref_ch, mdl_ch)] = aln
1168 final_mapping, alns)
1171 """ Convenience function to get mapping with currently preferred method
1173 If number of possible chain mappings is <= *n_max_naive*, a naive
1174 QS-score mapping is performed and optimal QS-score is guaranteed.
1175 For anything else, a QS-score mapping with the greedy_full strategy is
1176 performed (greedy_prune_contact_map = True). The default for
1177 *n_max_naive* of 40320 corresponds to an octamer (8!=40320). A
1178 structure with stoichiometry A6B2 would be 6!*2!=1440 etc.
1180 If :attr:`~target` has nucleotide sequences, the QS-score target
1181 function is replaced with a backbone only lDDT score that has
1182 an inclusion radius of 30A.
1185 return self.
GetlDDTMappingGetlDDTMapping(model, strategy =
"heuristic",
1186 inclusion_radius = 30.0,
1187 heuristic_n_max_naive = n_max_naive)
1190 greedy_prune_contact_map=
True,
1191 heuristic_n_max_naive = n_max_naive)
1193 def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0,
1194 thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False,
1195 only_interchain=False, chem_mapping_result = None,
1196 global_mapping = None):
1197 """ Identify *topn* representations of *substructure* in *model*
1199 *substructure* defines a subset of :attr:`~target` for which one
1200 wants the *topn* representations in *model*. Representations are scored
1203 :param substructure: A :class:`ost.mol.EntityView` which is a subset of
1204 :attr:`~target`. Should be selected with the
1205 OpenStructure query language. Example: if you're
1206 interested in residues with number 42,43 and 85 in
1208 ``substructure=mapper.target.Select("cname=A and rnum=42,43,85")``
1209 A :class:`RuntimeError` is raised if *substructure*
1210 does not refer to the same underlying
1211 :class:`ost.mol.EntityHandle` as :attr:`~target`.
1212 :type substructure: :class:`ost.mol.EntityView`
1213 :param model: Structure in which one wants to find representations for
1215 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1216 :param topn: Max number of representations that are returned
1217 :type topn: :class:`int`
1218 :param inclusion_radius: Inclusion radius for lDDT
1219 :type inclusion_radius: :class:`float`
1220 :param thresholds: Thresholds for lDDT
1221 :type thresholds: :class:`list` of :class:`float`
1222 :param bb_only: Only consider backbone atoms in lDDT computation
1223 :type bb_only: :class:`bool`
1224 :param only_interchain: Only score interchain contacts in lDDT. Useful
1225 if you want to identify interface patches.
1226 :type only_interchain: :class:`bool`
1227 :param chem_mapping_result: Pro param. The result of
1228 :func:`~GetChemMapping` where you provided
1229 *model*. If set, *model* parameter is not
1231 :type chem_mapping_result: :class:`tuple`
1232 :param global_mapping: Pro param. Specify a global mapping result. This
1233 fully defines the desired representation in the
1234 model but extracts it and enriches it with all
1235 the nice attributes of :class:`ReprResult`.
1236 The target attribute in *global_mapping* must be
1237 of the same entity as self.target and the model
1238 attribute of *global_mapping* must be of the same
1240 :type global_mapping: :class:`MappingResult`
1241 :returns: :class:`list` of :class:`ReprResult`
1245 raise RuntimeError(
"topn must be >= 1")
1247 if global_mapping
is not None:
1249 if global_mapping.target.handle.GetHashCode() != \
1250 self.
targettarget.handle.GetHashCode():
1251 raise RuntimeError(
"global_mapping.target must be the same "
1252 "entity as self.target")
1253 if global_mapping.model.handle.GetHashCode() != \
1254 model.handle.GetHashCode():
1255 raise RuntimeError(
"global_mapping.model must be the same "
1256 "entity as model param")
1259 for r
in substructure.residues:
1260 ch_name = r.GetChain().GetName()
1261 rnum = r.GetNumber()
1262 target_r = self.
targettarget.FindResidue(ch_name, rnum)
1263 if not target_r.IsValid():
1264 raise RuntimeError(f
"substructure has residue "
1265 f
"{r.GetQualifiedName()} which is not in "
1267 if target_r.handle.GetHashCode() != r.handle.GetHashCode():
1268 raise RuntimeError(f
"substructure has residue "
1269 f
"{r.GetQualifiedName()} which has an "
1270 f
"equivalent in self.target but it does "
1271 f
"not refer to the same underlying "
1274 target_a = target_r.FindAtom(a.GetName())
1275 if not target_a.IsValid():
1276 raise RuntimeError(f
"substructure has atom "
1277 f
"{a.GetQualifiedName()} which is not "
1279 if a.handle.GetHashCode() != target_a.handle.GetHashCode():
1280 raise RuntimeError(f
"substructure has atom "
1281 f
"{a.GetQualifiedName()} which has an "
1282 f
"equivalent in self.target but it does "
1283 f
"not refer to the same underlying "
1287 ca = r.FindAtom(
"CA")
1288 c3 = r.FindAtom(
"C3'")
1290 if not ca.IsValid()
and not c3.IsValid():
1291 raise RuntimeError(
"All residues in substructure must contain "
1292 "a backbone atom named CA or C3\'")
1295 if chem_mapping_result
is None:
1296 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1298 chem_mapping, chem_group_alns, mdl = chem_mapping_result
1299 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1305 substructure_res_indices = dict()
1306 for ch
in substructure.chains:
1307 full_ch = self.
targettarget.FindChain(ch.GetName())
1308 idx = [full_ch.GetResidueIndex(r.GetNumber())
for r
in ch.residues]
1309 substructure_res_indices[ch.GetName()] = idx
1313 substructure_chem_groups = list()
1314 substructure_chem_mapping = list()
1316 chnames = set([ch.GetName()
for ch
in substructure.chains])
1317 for chem_group, mapping
in zip(self.
chem_groupschem_groups, chem_mapping):
1318 substructure_chem_group = [ch
for ch
in chem_group
if ch
in chnames]
1319 if len(substructure_chem_group) > 0:
1320 substructure_chem_groups.append(substructure_chem_group)
1321 substructure_chem_mapping.append(mapping)
1324 n_mapped_mdl_chains = sum([len(m)
for m
in substructure_chem_mapping])
1325 if n_mapped_mdl_chains == 0:
1330 substructure_ref_mdl_alns = dict()
1332 for ch
in mdl.chains:
1333 mdl_views[ch.GetName()] = _CSel(mdl, [ch.GetName()])
1334 for chem_group, mapping
in zip(substructure_chem_groups,
1335 substructure_chem_mapping):
1336 for ref_ch
in chem_group:
1337 for mdl_ch
in mapping:
1338 full_aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1339 ref_seq = full_aln.GetSequence(0)
1343 tmp = [
'-'] * len(full_aln)
1344 for idx
in substructure_res_indices[ref_ch]:
1345 idx_in_seq = ref_seq.GetPos(idx)
1346 tmp[idx_in_seq] = ref_seq[idx_in_seq]
1347 ref_seq = seq.CreateSequence(ref_ch,
''.join(tmp))
1348 ref_seq.AttachView(_CSel(substructure, [ref_ch]))
1349 mdl_seq = full_aln.GetSequence(1)
1350 mdl_seq = seq.CreateSequence(mdl_seq.GetName(),
1351 mdl_seq.GetString())
1352 mdl_seq.AttachView(mdl_views[mdl_ch])
1353 aln = seq.CreateAlignment()
1354 aln.AddSequence(ref_seq)
1355 aln.AddSequence(mdl_seq)
1356 substructure_ref_mdl_alns[(ref_ch, mdl_ch)] = aln
1359 inclusion_radius = inclusion_radius,
1361 scored_mappings = list()
1365 flat_mapping = global_mapping.GetFlatMapping()
1367 for chem_group, chem_mapping
in zip(substructure_chem_groups,
1368 substructure_chem_mapping):
1369 chem_group_mapping = list()
1370 for ch
in chem_group:
1371 if ch
in flat_mapping:
1372 mdl_ch = flat_mapping[ch]
1373 if mdl_ch
in chem_mapping:
1374 chem_group_mapping.append(mdl_ch)
1376 chem_group_mapping.append(
None)
1378 chem_group_mapping.append(
None)
1379 mapping.append(chem_group_mapping)
1380 mappings = [mapping]
1382 mappings = list(_ChainMappings(substructure_chem_groups,
1383 substructure_chem_mapping,
1387 msg =
"Computing initial ranking of %d chain mappings" % len(mappings)
1388 (ost.LogWarning
if len(mappings) > 10000
else ost.LogInfo)(msg)
1390 for mapping
in mappings:
1392 lddt_chain_mapping = dict()
1395 for ref_chem_group, mdl_chem_group
in zip(substructure_chem_groups,
1397 for ref_ch, mdl_ch
in zip(ref_chem_group, mdl_chem_group):
1399 if mdl_ch
is not None:
1400 lddt_chain_mapping[mdl_ch] = ref_ch
1401 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1402 lddt_alns[mdl_ch] = aln
1403 tmp = [int(c[0] !=
'-' and c[1] !=
'-')
for c
in aln]
1404 n_res_aln += sum(tmp)
1409 lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds,
1410 chain_mapping=lddt_chain_mapping,
1411 residue_mapping = lddt_alns,
1412 check_resnames =
False,
1413 no_intrachain = only_interchain)
1416 ost.LogVerbose(
"No valid contacts in the reference")
1421 if len(scored_mappings) == 0:
1422 scored_mappings.append((lDDT, mapping))
1423 elif len(scored_mappings) < topn:
1424 scored_mappings.append((lDDT, mapping))
1425 scored_mappings.sort(reverse=
True, key=
lambda x: x[0])
1426 elif lDDT > scored_mappings[-1][0]:
1427 scored_mappings.append((lDDT, mapping))
1428 scored_mappings.sort(reverse=
True, key=
lambda x: x[0])
1429 scored_mappings = scored_mappings[:topn]
1433 for scored_mapping
in scored_mappings:
1434 ref_view = substructure.handle.CreateEmptyView()
1435 mdl_view = mdl.handle.CreateEmptyView()
1436 for ref_ch_group, mdl_ch_group
in zip(substructure_chem_groups,
1438 for ref_ch, mdl_ch
in zip(ref_ch_group, mdl_ch_group):
1439 if ref_ch
is not None and mdl_ch
is not None:
1440 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1442 if col[0] !=
'-' and col[1] !=
'-':
1443 ref_view.AddResidue(col.GetResidue(0),
1444 mol.ViewAddFlag.INCLUDE_ALL)
1445 mdl_view.AddResidue(col.GetResidue(1),
1446 mol.ViewAddFlag.INCLUDE_ALL)
1447 results.append(
ReprResult(scored_mapping[0], substructure,
1448 ref_view, mdl_view))
1452 """ Returns number of possible mappings
1454 :param model: Model with chains that are mapped onto
1456 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1458 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1459 return _NMappings(self.
chem_groupschem_groups, chem_mapping)
1462 """ Entity processing for chain mapping
1464 * Selects view containing peptide and nucleotide residues which have
1465 required backbone atoms present - for peptide residues thats
1466 N, CA, C and CB (no CB for GLY), for nucleotide residues thats
1467 O5', C5', C4', C3' and O3'.
1468 * filters view by chain lengths, see *min_pep_length* and
1469 *min_nuc_length* in constructor
1470 * Extracts atom sequences for each chain in that view
1471 * Attaches corresponding :class:`ost.mol.EntityView` to each sequence
1472 * If residue number alignments are used, strictly increasing residue
1473 numbers without insertion codes are ensured in each chain
1475 :param ent: Entity to process
1476 :type ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1477 :returns: Tuple with 3 elements: 1) :class:`ost.mol.EntityView`
1478 containing peptide and nucleotide residues 2)
1479 :class:`ost.seq.SequenceList` containing ATOMSEQ sequences
1480 for each polypeptide chain in returned view, sequences have
1481 :class:`ost.mol.EntityView` of according chains attached
1482 3) same for polynucleotide chains
1484 view = ent.CreateEmptyView()
1485 exp_pep_atoms = [
"N",
"CA",
"C",
"CB"]
1486 exp_nuc_atoms = [
"\"O5'\"",
"\"C5'\"",
"\"C4'\"",
"\"C3'\"",
"\"O3'\""]
1487 pep_query =
"peptide=true and aname=" +
','.join(exp_pep_atoms)
1488 nuc_query =
"nucleotide=true and aname=" +
','.join(exp_nuc_atoms)
1490 pep_sel = ent.Select(pep_query)
1491 for r
in pep_sel.residues:
1492 if len(r.atoms) == 4:
1493 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1494 elif r.name ==
"GLY" and len(r.atoms) == 3:
1495 atom_names = [a.GetName()
for a
in r.atoms]
1496 if sorted(atom_names) == [
"C",
"CA",
"N"]:
1497 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1499 nuc_sel = ent.Select(nuc_query)
1500 for r
in nuc_sel.residues:
1501 if len(r.atoms) == 5:
1502 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1504 polypep_seqs = seq.CreateSequenceList()
1505 polynuc_seqs = seq.CreateSequenceList()
1507 if len(view.residues) == 0:
1509 return (view, polypep_seqs, polynuc_seqs)
1511 for ch
in view.chains:
1512 n_res = len(ch.residues)
1513 n_pep = sum([r.IsPeptideLinking()
for r
in ch.residues])
1514 n_nuc = sum([r.IsNucleotideLinking()
for r
in ch.residues])
1517 if n_pep > 0
and n_nuc > 0:
1518 raise RuntimeError(f
"Must not mix peptide and nucleotide linking "
1519 f
"residues in same chain ({ch.GetName()})")
1521 if (n_pep + n_nuc) != n_res:
1522 raise RuntimeError(
"All residues must either be peptide_linking "
1523 "or nucleotide_linking")
1537 ins_codes = [r.GetNumber().GetInsCode()
for r
in ch.residues]
1538 if len(set(ins_codes)) != 1
or ins_codes[0] !=
'\0':
1539 raise RuntimeError(
"Residue numbers in input structures must not "
1540 "contain insertion codes")
1543 nums = [r.GetNumber().GetNum()
for r
in ch.residues]
1544 if not all(i < j
for i, j
in zip(nums, nums[1:])):
1545 raise RuntimeError(
"Residue numbers in input structures must be "
1546 "strictly increasing for each chain")
1548 s =
''.join([r.one_letter_code
for r
in ch.residues])
1549 s = seq.CreateSequence(ch.GetName(), s)
1550 s.AttachView(_CSel(view, [ch.GetName()]))
1552 polypep_seqs.AddSequence(s)
1553 elif n_nuc == n_res:
1554 polynuc_seqs.AddSequence(s)
1556 raise RuntimeError(
"This shouldnt happen")
1558 if len(polypep_seqs) == 0
and len(polynuc_seqs) == 0:
1559 raise RuntimeError(f
"No chain fulfilled minimum length requirement "
1560 f
"to be considered in chain mapping "
1561 f
"({self.min_pep_length} for peptide chains, "
1562 f
"{self.min_nuc_length} for nucleotide chains) "
1563 f
"- mapping failed")
1566 chain_names = [s.GetAttachedView().chains[0].name
for s
in polypep_seqs]
1567 chain_names += [s.GetAttachedView().chains[0].name
for s
in polynuc_seqs]
1568 view = _CSel(view, chain_names)
1570 return (view, polypep_seqs, polynuc_seqs)
1573 """ Access to internal sequence alignment functionality
1575 Alignment parameterization is setup at ChainMapper construction
1577 :param s1: First sequence to align - must have view attached in case
1578 of resnum_alignments
1579 :type s1: :class:`ost.seq.SequenceHandle`
1580 :param s2: Second sequence to align - must have view attached in case
1581 of resnum_alignments
1582 :type s2: :class:`ost.seq.SequenceHandle`
1583 :param stype: Type of sequences to align, must be in
1584 [:class:`ost.mol.ChemType.AMINOACIDS`,
1585 :class:`ost.mol.ChemType.NUCLEOTIDES`]
1586 :returns: Pairwise alignment of s1 and s2
1588 if stype
not in [mol.ChemType.AMINOACIDS, mol.ChemType.NUCLEOTIDES]:
1589 raise RuntimeError(
"stype must be ost.mol.ChemType.AMINOACIDS or "
1590 "ost.mol.ChemType.NUCLEOTIDES")
1591 return self.
aligneraligner.
Align(s1, s2, chem_type = stype)
1597 def __init__(self, pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -5,
1598 pep_gap_ext = -2, nuc_subst_mat = seq.alg.NUC44,
1599 nuc_gap_open = -4, nuc_gap_ext = -4, resnum_aln = False):
1600 """ Helper class to compute alignments
1602 Sets default values for substitution matrix, gap open and gap extension
1603 penalties. They are only used in default mode (Needleman-Wunsch aln).
1604 If *resnum_aln* is True, only residue numbers of views that are attached
1605 to input sequences are considered.
1615 def Align(self, s1, s2, chem_type=None):
1619 if chem_type
is None:
1620 raise RuntimeError(
"Must specify chem_type for NW alignment")
1621 return self.
NWAlignNWAlign(s1, s2, chem_type)
1624 """ Returns pairwise alignment using Needleman-Wunsch algorithm
1626 :param s1: First sequence to align
1627 :type s1: :class:`ost.seq.SequenceHandle`
1628 :param s2: Second sequence to align
1629 :type s2: :class:`ost.seq.SequenceHandle`
1630 :param chem_type: Must be in [:class:`ost.mol.ChemType.AMINOACIDS`,
1631 :class:`ost.mol.ChemType.NUCLEOTIDES`], determines
1632 substitution matrix and gap open/extension penalties
1633 :type chem_type: :class:`ost.mol.ChemType`
1634 :returns: Alignment with s1 as first and s2 as second sequence
1636 if chem_type == mol.ChemType.AMINOACIDS:
1637 return seq.alg.SemiGlobalAlign(s1, s2, self.
pep_subst_matpep_subst_mat,
1640 elif chem_type == mol.ChemType.NUCLEOTIDES:
1641 return seq.alg.SemiGlobalAlign(s1, s2, self.
nuc_subst_matnuc_subst_mat,
1645 raise RuntimeError(
"Invalid ChemType")
1649 """ Returns pairwise alignment using residue numbers of attached views
1651 Assumes that there are no insertion codes (alignment only on numerical
1652 component) and that resnums are strictly increasing (fast min/max
1653 identification). These requirements are assured if a structure has been
1654 processed by :class:`ChainMapper`.
1656 :param s1: First sequence to align, must have :class:`ost.mol.EntityView`
1658 :type s1: :class:`ost.seq.SequenceHandle`
1659 :param s2: Second sequence to align, must have :class:`ost.mol.EntityView`
1661 :type s2: :class:`ost.seq.SequenceHandle`
1663 assert(s1.HasAttachedView())
1664 assert(s2.HasAttachedView())
1665 v1 = s1.GetAttachedView()
1666 rnums1 = [r.GetNumber().GetNum()
for r
in v1.residues]
1667 v2 = s2.GetAttachedView()
1668 rnums2 = [r.GetNumber().GetNum()
for r
in v2.residues]
1670 min_num = min(rnums1[0], rnums2[0])
1671 max_num = max(rnums1[-1], rnums2[-1])
1672 aln_length = max_num - min_num + 1
1674 aln_s1 = [
'-'] * aln_length
1675 for r, rnum
in zip(v1.residues, rnums1):
1676 aln_s1[rnum-min_num] = r.one_letter_code
1678 aln_s2 = [
'-'] * aln_length
1679 for r, rnum
in zip(v2.residues, rnums2):
1680 aln_s2[rnum-min_num] = r.one_letter_code
1682 aln = seq.CreateAlignment()
1683 aln.AddSequence(seq.CreateSequence(s1.GetName(),
''.join(aln_s1)))
1684 aln.AddSequence(seq.CreateSequence(s2.GetName(),
''.join(aln_s2)))
1687 def _GetAlnPropsTwo(aln):
1688 """Returns basic properties of *aln* version two...
1690 :param aln: Alignment to compute properties
1691 :type aln: :class:`seq.AlignmentHandle`
1692 :returns: Tuple with 2 elements. 1) sequence identify in range [0, 100]
1693 considering aligned columns 2) Fraction of non-gap characters
1694 in first sequence that are covered by non-gap characters in
1697 assert(aln.GetCount() == 2)
1698 n_tot = sum([1
for col
in aln
if col[0] !=
'-'])
1699 n_aligned = sum([1
for col
in aln
if (col[0] !=
'-' and col[1] !=
'-')])
1700 return (seq.alg.SequenceIdentity(aln), float(n_aligned)/n_tot)
1702 def _GetAlnPropsOne(aln):
1704 """Returns basic properties of *aln* version one...
1706 :param aln: Alignment to compute properties
1707 :type aln: :class:`seq.AlignmentHandle`
1708 :returns: Tuple with 2 elements. 1) sequence identify in range [0, 100]
1709 considering aligned columns 2) Number of aligned columns.
1711 assert(aln.GetCount() == 2)
1712 seqid = seq.alg.SequenceIdentity(aln)
1713 n_aligned = sum([1
for col
in aln
if (col[0] !=
'-' and col[1] !=
'-')])
1714 return (seqid, n_aligned)
1716 def _GetChemGroupAlignments(pep_seqs, nuc_seqs, aligner, pep_seqid_thr=95.,
1717 min_pep_length=6, nuc_seqid_thr=95.,
1719 """Returns alignments with groups of chemically equivalent chains
1721 :param pep_seqs: List of polypeptide sequences
1722 :type pep_seqs: :class:`seq.SequenceList`
1723 :param nuc_seqs: List of polynucleotide sequences
1724 :type nuc_seqs: :class:`seq.SequenceList`
1725 :param aligner: Helper class to generate pairwise alignments
1726 :type aligner: :class:`_Aligner`
1727 :param pep_seqid_thr: Threshold used to decide when two peptide chains are
1728 identical. 95 percent tolerates the few mutations
1729 crystallographers like to do.
1730 :type pep_seqid_thr: :class:`float`
1731 :param min_pep_length: Additional threshold to avoid gappy alignments with high
1732 seqid. Number of aligned columns must be at least this
1734 :type min_pep_length: :class:`int`
1735 :param nuc_seqid_thr: Nucleotide equivalent of *pep_seqid_thr*
1736 :type nuc_seqid_thr: :class:`float`
1737 :param min_nuc_length: Nucleotide equivalent of *min_pep_length*
1738 :type min_nuc_length: :class:`int`
1739 :returns: Tuple with first element being an AlignmentList. Each alignment
1740 represents a group of chemically equivalent chains and the first
1741 sequence is the longest. Second element is a list of equivalent
1742 length specifying the types of the groups. List elements are in
1743 [:class:`ost.ChemType.AMINOACIDS`,
1744 :class:`ost.ChemType.NUCLEOTIDES`]
1746 pep_groups = _GroupSequences(pep_seqs, pep_seqid_thr, min_pep_length, aligner,
1747 mol.ChemType.AMINOACIDS)
1748 nuc_groups = _GroupSequences(nuc_seqs, nuc_seqid_thr, min_nuc_length, aligner,
1749 mol.ChemType.NUCLEOTIDES)
1750 group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
1751 group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
1753 groups.extend(nuc_groups)
1754 return (groups, group_types)
1756 def _GroupSequences(seqs, seqid_thr, min_length, aligner, chem_type):
1757 """Get list of alignments representing groups of equivalent sequences
1759 :param seqid_thr: Threshold used to decide when two chains are identical.
1760 :type seqid_thr: :class:`float`
1761 :param gap_thr: Additional threshold to avoid gappy alignments with high
1762 seqid. Number of aligned columns must be at least this
1764 :type gap_thr: :class:`int`
1765 :param aligner: Helper class to generate pairwise alignments
1766 :type aligner: :class:`_Aligner`
1767 :param chem_type: ChemType of seqs which is passed to *aligner*, must be in
1768 [:class:`ost.mol.ChemType.AMINOACIDS`,
1769 :class:`ost.mol.ChemType.NUCLEOTIDES`]
1770 :type chem_type: :class:`ost.mol.ChemType`
1771 :returns: A list of alignments, one alignment for each group
1772 with longest sequence (reference) as first sequence.
1773 :rtype: :class:`ost.seq.AlignmentList`
1776 for s_idx
in range(len(seqs)):
1777 matching_group =
None
1778 for g_idx
in range(len(groups)):
1779 for g_s_idx
in range(len(groups[g_idx])):
1780 aln = aligner.Align(seqs[s_idx], seqs[groups[g_idx][g_s_idx]],
1782 sid, n_aligned = _GetAlnPropsOne(aln)
1783 if sid >= seqid_thr
and n_aligned >= min_length:
1784 matching_group = g_idx
1786 if matching_group
is not None:
1789 if matching_group
is None:
1790 groups.append([s_idx])
1792 groups[matching_group].append(s_idx)
1795 sorted_groups = list()
1798 tmp = sorted([[len(seqs[i]), i]
for i
in g], reverse=
True)
1799 sorted_groups.append([x[1]
for x
in tmp])
1801 sorted_groups.append(g)
1805 aln_list = seq.AlignmentList()
1806 for g
in sorted_groups:
1809 aln_list.append(seq.CreateAlignment(seqs[g[0]]))
1812 alns = seq.AlignmentList()
1815 alns.append(aligner.Align(seqs[i], seqs[j], chem_type))
1817 aln_list.append(seq.alg.MergePairwiseAlignments(alns, seqs[i]))
1820 seq_dict = {s.GetName(): s
for s
in seqs}
1821 for aln_idx
in range(len(aln_list)):
1822 for aln_s_idx
in range(aln_list[aln_idx].GetCount()):
1823 s_name = aln_list[aln_idx].GetSequence(aln_s_idx).GetName()
1824 s = seq_dict[s_name]
1825 aln_list[aln_idx].AttachView(aln_s_idx, s.GetAttachedView())
1829 def _MapSequence(ref_seqs, ref_types, s, s_type, aligner):
1830 """Tries top map *s* onto any of the sequences in *ref_seqs*
1832 Computes alignments of *s* to each of the reference sequences of equal type
1833 and sorts them by seqid*fraction_covered (seqid: sequence identity of
1834 aligned columns in alignment, fraction_covered: Fraction of non-gap
1835 characters in reference sequence that are covered by non-gap characters in
1836 *s*). Best scoring mapping is returned.
1838 :param ref_seqs: Reference sequences
1839 :type ref_seqs: :class:`ost.seq.SequenceList`
1840 :param ref_types: Types of reference sequences, e.g.
1841 ost.mol.ChemType.AminoAcids
1842 :type ref_types: :class:`list` of :class:`ost.mol.ChemType`
1843 :param s: Sequence to map
1844 :type s: :class:`ost.seq.SequenceHandle`
1845 :param s_type: Type of *s*, only try mapping to sequences in *ref_seqs*
1846 with equal type as defined in *ref_types*
1847 :param aligner: Helper class to generate pairwise alignments
1848 :type aligner: :class:`_Aligner`
1849 :returns: Tuple with two elements. 1) index of sequence in *ref_seqs* to
1850 which *s* can be mapped 2) Pairwise sequence alignment with
1851 sequence from *ref_seqs* as first sequence. Both elements are
1852 None if no mapping can be found.
1853 :raises: :class:`RuntimeError` if mapping is ambiguous, i.e. *s*
1854 successfully maps to more than one sequence in *ref_seqs*
1856 scored_alns = list()
1857 for ref_idx, ref_seq
in enumerate(ref_seqs):
1858 if ref_types[ref_idx] == s_type:
1859 aln = aligner.Align(ref_seq, s, s_type)
1860 seqid, fraction_covered = _GetAlnPropsTwo(aln)
1861 score = seqid * fraction_covered
1862 scored_alns.append((score, ref_idx, aln))
1864 if len(scored_alns) == 0:
1867 scored_alns = sorted(scored_alns, key=
lambda x: x[0], reverse=
True)
1868 return (scored_alns[0][1], scored_alns[0][2])
1870 def _GetRefMdlAlns(ref_chem_groups, ref_chem_group_msas, mdl_chem_groups,
1871 mdl_chem_group_alns, pairs=None):
1872 """ Get all possible ref/mdl chain alignments given chem group mapping
1874 :param ref_chem_groups: :attr:`ChainMapper.chem_groups`
1875 :type ref_chem_groups: :class:`list` of :class:`list` of :class:`str`
1876 :param ref_chem_group_msas: :attr:`ChainMapper.chem_group_alignments`
1877 :type ref_chem_group_msas: :class:`ost.seq.AlignmentList`
1878 :param mdl_chem_groups: Groups of model chains that are mapped to
1879 *ref_chem_groups*. Return value of
1880 :func:`ChainMapper.GetChemMapping`.
1881 :type mdl_chem_groups: :class:`list` of :class:`list` of :class:`str`
1882 :param mdl_chem_group_alns: A pairwise sequence alignment for every chain
1883 in *mdl_chem_groups* that aligns these sequences
1884 to the respective reference sequence.
1886 :func:`ChainMapper.GetChemMapping`.
1887 :type mdl_chem_group_alns: :class:`list` of :class:`ost.seq.AlignmentList`
1888 :param pairs: Pro param - restrict return dict to specified pairs. A set of
1889 tuples in form (<trg_ch>, <mdl_ch>)
1890 :type pairs: :class:`set`
1891 :returns: A dictionary holding all possible ref/mdl chain alignments. Keys
1892 in that dictionary are tuples of the form (ref_ch, mdl_ch) and
1893 values are the respective pairwise alignments with first sequence
1894 being from ref, the second from mdl.
1898 for alns
in mdl_chem_group_alns:
1900 mdl_chain_name = aln.GetSequence(1).GetName()
1901 mdl_alns[mdl_chain_name] = aln
1905 ref_mdl_alns = dict()
1906 for ref_chains, mdl_chains, ref_aln
in zip(ref_chem_groups, mdl_chem_groups,
1907 ref_chem_group_msas):
1908 for ref_ch
in ref_chains:
1909 for mdl_ch
in mdl_chains:
1910 if pairs
is not None and (ref_ch, mdl_ch)
not in pairs:
1914 aln_list = seq.AlignmentList()
1916 s1 = ref_aln.GetSequence(0)
1917 s2 = ref_aln.GetSequence(ref_chains.index(ref_ch))
1918 aln_list.append(seq.CreateAlignment(s1, s2))
1920 aln_list.append(mdl_alns[mdl_ch])
1922 ref_seq = seq.CreateSequence(s1.GetName(),
1923 s1.GetGaplessString())
1924 merged_aln = seq.alg.MergePairwiseAlignments(aln_list,
1931 s2 = merged_aln.GetSequence(1)
1932 s3 = merged_aln.GetSequence(2)
1935 for idx
in range(len(s2)):
1936 if s2[idx] !=
'-' or s3[idx] !=
'-':
1940 for idx
in reversed(range(len(s2))):
1941 if s2[idx] !=
'-' or s3[idx] !=
'-':
1944 s2 = seq.CreateSequence(s2.GetName(), s2[a: len(s2)-b])
1945 s3 = seq.CreateSequence(s3.GetName(), s3[a: len(s3)-b])
1946 ref_mdl_alns[(ref_ch, mdl_ch)] = seq.CreateAlignment(s2, s3)
1950 def _CheckOneToOneMapping(ref_chains, mdl_chains):
1951 """ Checks whether we already have a perfect one to one mapping
1953 That means each list in *ref_chains* has exactly one element and each
1954 list in *mdl_chains* has either one element (it's mapped) or is empty
1955 (ref chain has no mapped mdl chain). Returns None if no such mapping
1958 :param ref_chains: corresponds to :attr:`ChainMapper.chem_groups`
1959 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
1960 :param mdl_chains: mdl chains mapped to chem groups in *ref_chains*, i.e.
1961 the return value of :func:`ChainMapper.GetChemMapping`
1962 :type mdl_chains: class:`list` of :class:`list` of :class:`str`
1963 :returns: A :class:`list` of :class:`list` if a one to one mapping is found,
1966 only_one_to_one =
True
1968 for ref, mdl
in zip(ref_chains, mdl_chains):
1969 if len(ref) == 1
and len(mdl) == 1:
1970 one_to_one.append(mdl)
1971 elif len(ref) == 1
and len(mdl) == 0:
1972 one_to_one.append([
None])
1974 only_one_to_one =
False
1982 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
1983 ref_mdl_alns, inclusion_radius = 15.0,
1984 thresholds = [0.5, 1.0, 2.0, 4.0],
1985 steep_opt_rate = None):
1987 """ Greedy extension of already existing but incomplete chain mappings
1989 super().
__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
1990 dist_thresh=inclusion_radius,
1991 dist_diff_thresholds=thresholds)
1997 for interface
in self.
trgtrg.interacting_chains:
1998 self.
neighborsneighbors[interface[0]].add(interface[1])
1999 self.
neighborsneighbors[interface[1]].add(interface[0])
2001 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2006 for g_idx, (ref_g, mdl_g)
in enumerate(zip(ref_chem_groups,
2016 for interface
in self.
mdlmdl.potentially_contributing_chains:
2017 self.
mdl_neighborsmdl_neighbors[interface[0]].add(interface[1])
2018 self.
mdl_neighborsmdl_neighbors[interface[1]].add(interface[0])
2022 if len(mapping) == 0:
2023 raise RuntimError(
"Mapping must contain a starting point")
2030 for ref_ch
in mapping.keys():
2031 map_targets.update(self.
neighborsneighbors[ref_ch])
2034 for ref_ch
in mapping.keys():
2035 map_targets.discard(ref_ch)
2037 if len(map_targets) == 0:
2041 free_mdl_chains = list()
2043 tmp = [x
for x
in chem_group
if x
not in mapping.values()]
2044 free_mdl_chains.append(set(tmp))
2047 newly_mapped_ref_chains = list()
2049 something_happened =
True
2050 while something_happened:
2051 something_happened=
False
2054 n_chains = len(newly_mapped_ref_chains)
2055 if n_chains > 0
and n_chains % self.
steep_opt_ratesteep_opt_rate == 0:
2056 mapping = self.
_SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2058 if max_ext
is not None and len(newly_mapped_ref_chains) >= max_ext:
2063 for ref_ch
in map_targets:
2065 for mdl_ch
in free_mdl_chains[chem_group_idx]:
2067 n_single = self.
_NSCConserved_NSCConserved(ref_ch, mdl_ch).sum()
2070 for neighbor
in self.
neighborsneighbors[ref_ch]:
2071 if neighbor
in mapping
and mapping[neighbor]
in \
2074 (mdl_ch, mapping[neighbor])).sum()
2075 n = n_single + n_inter
2077 if n_inter > 0
and n > max_n:
2082 max_mapping = (ref_ch, mdl_ch)
2085 something_happened =
True
2087 mapping[max_mapping[0]] = max_mapping[1]
2091 for neighbor
in self.
neighborsneighbors[max_mapping[0]]:
2092 if neighbor
not in mapping:
2093 map_targets.add(neighbor)
2096 map_targets.remove(max_mapping[0])
2100 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2103 newly_mapped_ref_chains.append(max_mapping[0])
2107 def _SteepOpt(self, mapping, chains_to_optimize=None):
2110 if chains_to_optimize
is None:
2111 chains_to_optimize = mapping.keys()
2114 ref_chains = [x
for x
in chains_to_optimize
if mapping[x]
is not None]
2118 for ch
in ref_chains:
2120 if chem_group_idx
in tmp:
2121 tmp[chem_group_idx].append(ch)
2123 tmp[chem_group_idx] = [ch]
2124 chem_groups = list(tmp.values())
2129 something_happened =
True
2130 while something_happened:
2131 something_happened =
False
2132 for chem_group
in chem_groups:
2133 if something_happened:
2135 for ch1, ch2
in itertools.combinations(chem_group, 2):
2136 swapped_mapping = dict(mapping)
2137 swapped_mapping[ch1] = mapping[ch2]
2138 swapped_mapping[ch2] = mapping[ch1]
2140 if score > current_lddt:
2141 something_happened =
True
2142 mapping = swapped_mapping
2143 current_lddt = score
2148 def _lDDTNaive(trg, mdl, inclusion_radius, thresholds, chem_groups,
2149 chem_mapping, ref_mdl_alns, n_max_naive):
2150 """ Naively iterates all possible chain mappings and returns the best
2157 dist_thresh=inclusion_radius,
2158 dist_diff_thresholds=thresholds)
2159 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2160 lDDT = lddt_scorer.Score(mapping, check=
False)
2161 if lDDT > best_lddt:
2162 best_mapping = mapping
2165 return (best_mapping, best_lddt)
2168 def _GetSeeds(ref_chem_groups, mdl_chem_groups, mapped_ref_chains = set(),
2169 mapped_mdl_chains = set()):
2171 for ref_chains, mdl_chains
in zip(ref_chem_groups,
2173 for ref_ch
in ref_chains:
2174 if ref_ch
not in mapped_ref_chains:
2175 for mdl_ch
in mdl_chains:
2176 if mdl_ch
not in mapped_mdl_chains:
2177 seeds.append((ref_ch, mdl_ch))
2181 def _lDDTGreedyFast(the_greed):
2183 something_happened =
True
2186 while something_happened:
2187 something_happened =
False
2188 seeds = _GetSeeds(the_greed.ref_chem_groups,
2189 the_greed.mdl_chem_groups,
2190 mapped_ref_chains = set(mapping.keys()),
2191 mapped_mdl_chains = set(mapping.values()))
2196 n = the_greed._NSCConserved(seed[0], seed[1]).sum()
2203 mapping[best_seed[0]] = best_seed[1]
2204 mapping = the_greed.ExtendMapping(mapping)
2205 something_happened =
True
2208 final_mapping = list()
2209 for ref_chains
in the_greed.ref_chem_groups:
2210 mapped_mdl_chains = list()
2211 for ref_ch
in ref_chains:
2212 if ref_ch
in mapping:
2213 mapped_mdl_chains.append(mapping[ref_ch])
2215 mapped_mdl_chains.append(
None)
2216 final_mapping.append(mapped_mdl_chains)
2218 return final_mapping
2221 def _lDDTGreedyFull(the_greed):
2222 """ Uses each reference chain as starting point for expansion
2225 seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2226 best_overall_score = -1.0
2227 best_overall_mapping = dict()
2232 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2235 something_happened =
True
2236 while something_happened:
2237 something_happened =
False
2238 remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2239 the_greed.mdl_chem_groups,
2240 mapped_ref_chains = set(mapping.keys()),
2241 mapped_mdl_chains = set(mapping.values()))
2242 if len(remnant_seeds) > 0:
2246 for remnant_seed
in remnant_seeds:
2247 tmp_mapping = dict(mapping)
2248 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2249 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2250 score = the_greed.FromFlatMapping(tmp_mapping)
2251 if score > best_score:
2253 best_mapping = tmp_mapping
2254 if best_mapping
is not None:
2255 something_happened =
True
2256 mapping = best_mapping
2258 score = the_greed.FromFlatMapping(mapping)
2259 if score > best_overall_score:
2260 best_overall_score = score
2261 best_overall_mapping = mapping
2263 mapping = best_overall_mapping
2266 final_mapping = list()
2267 for ref_chains
in the_greed.ref_chem_groups:
2268 mapped_mdl_chains = list()
2269 for ref_ch
in ref_chains:
2270 if ref_ch
in mapping:
2271 mapped_mdl_chains.append(mapping[ref_ch])
2273 mapped_mdl_chains.append(
None)
2274 final_mapping.append(mapped_mdl_chains)
2276 return final_mapping
2279 def _lDDTGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2280 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2281 respective chem groups and compute single chain lDDTs. The
2282 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2283 and the best scoring one is exhaustively extended.
2286 if seed_size
is None or seed_size < 1:
2287 raise RuntimeError(f
"seed_size must be an int >= 1 (got {seed_size})")
2289 if blocks_per_chem_group
is None or blocks_per_chem_group < 1:
2290 raise RuntimeError(f
"blocks_per_chem_group must be an int >= 1 "
2291 f
"(got {blocks_per_chem_group})")
2293 max_ext = seed_size - 1
2295 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2296 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2299 something_happened =
True
2300 while something_happened:
2301 something_happened =
False
2302 starting_blocks = list()
2303 for ref_chains, mdl_chains
in zip(ref_chem_groups, mdl_chem_groups):
2304 if len(mdl_chains) == 0:
2306 ref_chains_copy = list(ref_chains)
2307 for i
in range(blocks_per_chem_group):
2308 if len(ref_chains_copy) == 0:
2311 for ref_ch
in ref_chains_copy:
2312 seeds += [(ref_ch, mdl_ch)
for mdl_ch
in mdl_chains]
2319 seed = dict(mapping)
2320 seed.update({s[0]: s[1]})
2321 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2322 seed_lddt = the_greed.FromFlatMapping(seed)
2323 if seed_lddt > best_score:
2324 best_score = seed_lddt
2327 if best_mapping !=
None:
2328 starting_blocks.append(best_mapping)
2329 if best_seed[0]
in ref_chains_copy:
2331 ref_chains_copy.remove(best_seed[0])
2336 for seed
in starting_blocks:
2337 seed = the_greed.ExtendMapping(seed)
2338 seed_lddt = the_greed.FromFlatMapping(seed)
2339 if seed_lddt > best_lddt:
2340 best_lddt = seed_lddt
2343 if best_lddt == 0.0:
2346 something_happened =
True
2347 mapping.update(best_mapping)
2348 for ref_ch, mdl_ch
in best_mapping.items():
2349 for group_idx
in range(len(ref_chem_groups)):
2350 if ref_ch
in ref_chem_groups[group_idx]:
2351 ref_chem_groups[group_idx].remove(ref_ch)
2352 if mdl_ch
in mdl_chem_groups[group_idx]:
2353 mdl_chem_groups[group_idx].remove(mdl_ch)
2356 final_mapping = list()
2357 for ref_chains
in the_greed.ref_chem_groups:
2358 mapped_mdl_chains = list()
2359 for ref_ch
in ref_chains:
2360 if ref_ch
in mapping:
2361 mapped_mdl_chains.append(mapping[ref_ch])
2363 mapped_mdl_chains.append(
None)
2364 final_mapping.append(mapped_mdl_chains)
2366 return final_mapping
2370 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2371 ref_mdl_alns, contact_d = 12.0,
2372 steep_opt_rate = None, greedy_prune_contact_map=False):
2373 """ Greedy extension of already existing but incomplete chain mappings
2375 super().
__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2376 contact_d = contact_d)
2382 if greedy_prune_contact_map:
2384 for p
in self.
qsent1qsent1.interacting_chains:
2385 if np.count_nonzero(self.
qsent1qsent1.PairDist(p[0], p[1])<=8) >= 3:
2390 for p
in self.
qsent2qsent2.interacting_chains:
2391 if np.count_nonzero(self.
qsent2qsent2.PairDist(p[0], p[1])<=8) >= 3:
2397 self.
neighborsneighbors = {k: set()
for k
in self.
qsent1qsent1.chain_names}
2398 for p
in self.
qsent1qsent1.interacting_chains:
2403 for p
in self.
qsent2qsent2.interacting_chains:
2407 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2412 for g_idx, (ref_g, mdl_g)
in enumerate(zip(ref_chem_groups,
2424 for ch
in self.
refref.chains:
2425 single_chain_ref = _CSel(self.
refref, [ch.GetName()])
2432 alns[mdl_ch] = self.
ref_mdl_alnsref_mdl_alns[(ref_ch, mdl_ch)]
2433 mdl_sel = _CSel(self.
mdlmdl, [mdl_ch])
2435 _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2436 residue_mapping=alns,
2437 return_dist_test=
True,
2439 chain_mapping={mdl_ch: ref_ch},
2440 check_resnames=
False)
2446 if len(mapping) == 0:
2447 raise RuntimError(
"Mapping must contain a starting point")
2454 for ref_ch
in mapping.keys():
2455 map_targets.update(self.
neighborsneighbors[ref_ch])
2458 for ref_ch
in mapping.keys():
2459 map_targets.discard(ref_ch)
2461 if len(map_targets) == 0:
2465 free_mdl_chains = list()
2467 tmp = [x
for x
in chem_group
if x
not in mapping.values()]
2468 free_mdl_chains.append(set(tmp))
2471 newly_mapped_ref_chains = list()
2473 something_happened =
True
2474 while something_happened:
2475 something_happened=
False
2478 n_chains = len(newly_mapped_ref_chains)
2479 if n_chains > 0
and n_chains % self.
steep_opt_ratesteep_opt_rate == 0:
2480 mapping = self.
_SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2482 if max_ext
is not None and len(newly_mapped_ref_chains) >= max_ext:
2486 old_score = score_result.QS_global
2487 nominator = score_result.weighted_scores
2488 denominator = score_result.weight_sum + score_result.weight_extra_all
2492 for ref_ch
in map_targets:
2494 for mdl_ch
in free_mdl_chains[chem_group_idx]:
2497 nominator_diff = 0.0
2498 denominator_diff = 0.0
2499 for neighbor
in self.
neighborsneighbors[ref_ch]:
2500 if neighbor
in mapping
and mapping[neighbor]
in \
2504 int1 = (ref_ch, neighbor)
2505 int2 = (mdl_ch, mapping[neighbor])
2508 denominator_diff += b
2509 denominator_diff += d
2515 if nominator_diff > 0:
2518 new_nominator = nominator + nominator_diff
2519 new_denominator = denominator + denominator_diff
2521 if new_denominator != 0.0:
2522 new_score = new_nominator/new_denominator
2523 diff = new_score - old_score
2526 max_mapping = (ref_ch, mdl_ch)
2528 if max_mapping
is not None:
2529 something_happened =
True
2531 mapping[max_mapping[0]] = max_mapping[1]
2535 for neighbor
in self.
neighborsneighbors[max_mapping[0]]:
2536 if neighbor
not in mapping:
2537 map_targets.add(neighbor)
2540 map_targets.remove(max_mapping[0])
2544 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2547 newly_mapped_ref_chains.append(max_mapping[0])
2551 def _SteepOpt(self, mapping, chains_to_optimize=None):
2554 if chains_to_optimize
is None:
2555 chains_to_optimize = mapping.keys()
2558 ref_chains = [x
for x
in chains_to_optimize
if mapping[x]
is not None]
2562 for ch
in ref_chains:
2564 if chem_group_idx
in tmp:
2565 tmp[chem_group_idx].append(ch)
2567 tmp[chem_group_idx] = [ch]
2568 chem_groups = list(tmp.values())
2573 current_score = score_result.QS_global
2574 something_happened =
True
2575 while something_happened:
2576 something_happened =
False
2577 for chem_group
in chem_groups:
2578 if something_happened:
2580 for ch1, ch2
in itertools.combinations(chem_group, 2):
2581 swapped_mapping = dict(mapping)
2582 swapped_mapping[ch1] = mapping[ch2]
2583 swapped_mapping[ch2] = mapping[ch1]
2585 if score_result.QS_global > current_score:
2586 something_happened =
True
2587 mapping = swapped_mapping
2588 current_score = score_result.QS_global
2593 def _QSScoreNaive(trg, mdl, chem_groups, chem_mapping, ref_mdl_alns, contact_d,
2600 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2601 score_result = qs_scorer.Score(mapping, check=
False)
2602 if score_result.QS_global > best_score:
2603 best_mapping = mapping
2604 best_score = score_result.QS_global
2605 return (best_mapping, best_score)
2608 def _QSScoreGreedyFast(the_greed):
2610 something_happened =
True
2612 while something_happened:
2613 something_happened =
False
2617 seeds = _GetSeeds(the_greed.ref_chem_groups,
2618 the_greed.mdl_chem_groups,
2619 mapped_ref_chains = set(mapping.keys()),
2620 mapped_mdl_chains = set(mapping.values()))
2622 n = the_greed.SCCounts(seed[0], seed[1])
2629 mapping[best_seed[0]] = best_seed[1]
2630 mapping = the_greed.ExtendMapping(mapping)
2631 something_happened =
True
2634 final_mapping = list()
2635 for ref_chains
in the_greed.ref_chem_groups:
2636 mapped_mdl_chains = list()
2637 for ref_ch
in ref_chains:
2638 if ref_ch
in mapping:
2639 mapped_mdl_chains.append(mapping[ref_ch])
2641 mapped_mdl_chains.append(
None)
2642 final_mapping.append(mapped_mdl_chains)
2644 return final_mapping
2647 def _QSScoreGreedyFull(the_greed):
2648 """ Uses each reference chain as starting point for expansion
2651 seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2652 best_overall_score = -1.0
2653 best_overall_mapping = dict()
2658 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2661 something_happened =
True
2662 while something_happened:
2663 something_happened =
False
2664 remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2665 the_greed.mdl_chem_groups,
2666 mapped_ref_chains = set(mapping.keys()),
2667 mapped_mdl_chains = set(mapping.values()))
2668 if len(remnant_seeds) > 0:
2672 for remnant_seed
in remnant_seeds:
2673 tmp_mapping = dict(mapping)
2674 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2675 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2676 score_result = the_greed.FromFlatMapping(tmp_mapping)
2677 if score_result.QS_global > best_score:
2678 best_score = score_result.QS_global
2679 best_mapping = tmp_mapping
2680 if best_mapping
is not None:
2681 something_happened =
True
2682 mapping = best_mapping
2684 score_result = the_greed.FromFlatMapping(mapping)
2685 if score_result.QS_global > best_overall_score:
2686 best_overall_score = score_result.QS_global
2687 best_overall_mapping = mapping
2689 mapping = best_overall_mapping
2692 final_mapping = list()
2693 for ref_chains
in the_greed.ref_chem_groups:
2694 mapped_mdl_chains = list()
2695 for ref_ch
in ref_chains:
2696 if ref_ch
in mapping:
2697 mapped_mdl_chains.append(mapping[ref_ch])
2699 mapped_mdl_chains.append(
None)
2700 final_mapping.append(mapped_mdl_chains)
2702 return final_mapping
2705 def _QSScoreGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2706 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2707 respective chem groups and compute single chain lDDTs. The
2708 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2709 and the best scoring one with respect to QS score is exhaustively extended.
2712 if seed_size
is None or seed_size < 1:
2713 raise RuntimeError(f
"seed_size must be an int >= 1 (got {seed_size})")
2715 if blocks_per_chem_group
is None or blocks_per_chem_group < 1:
2716 raise RuntimeError(f
"blocks_per_chem_group must be an int >= 1 "
2717 f
"(got {blocks_per_chem_group})")
2719 max_ext = seed_size - 1
2721 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2722 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2726 something_happened =
True
2727 while something_happened:
2728 something_happened =
False
2729 starting_blocks = list()
2730 for ref_chains, mdl_chains
in zip(ref_chem_groups, mdl_chem_groups):
2731 if len(mdl_chains) == 0:
2733 ref_chains_copy = list(ref_chains)
2734 for i
in range(blocks_per_chem_group):
2735 if len(ref_chains_copy) == 0:
2738 for ref_ch
in ref_chains_copy:
2739 seeds += [(ref_ch, mdl_ch)
for mdl_ch
in mdl_chains]
2746 seed = dict(mapping)
2747 seed.update({s[0]: s[1]})
2748 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2749 score_result = the_greed.FromFlatMapping(seed)
2750 if score_result.QS_global > best_score:
2751 best_score = score_result.QS_global
2754 if best_mapping !=
None:
2755 starting_blocks.append(best_mapping)
2756 if best_seed[0]
in ref_chains_copy:
2758 ref_chains_copy.remove(best_seed[0])
2763 for seed
in starting_blocks:
2764 seed = the_greed.ExtendMapping(seed)
2765 score_result = the_greed.FromFlatMapping(seed)
2766 if score_result.QS_global > best_score:
2767 best_score = score_result.QS_global
2770 if best_mapping
is not None and len(best_mapping) > len(mapping):
2773 something_happened =
True
2774 mapping.update(best_mapping)
2775 for ref_ch, mdl_ch
in best_mapping.items():
2776 for group_idx
in range(len(ref_chem_groups)):
2777 if ref_ch
in ref_chem_groups[group_idx]:
2778 ref_chem_groups[group_idx].remove(ref_ch)
2779 if mdl_ch
in mdl_chem_groups[group_idx]:
2780 mdl_chem_groups[group_idx].remove(mdl_ch)
2783 final_mapping = list()
2784 for ref_chains
in the_greed.ref_chem_groups:
2785 mapped_mdl_chains = list()
2786 for ref_ch
in ref_chains:
2787 if ref_ch
in mapping:
2788 mapped_mdl_chains.append(mapping[ref_ch])
2790 mapped_mdl_chains.append(
None)
2791 final_mapping.append(mapped_mdl_chains)
2793 return final_mapping
2795 def _SingleRigidRMSD(initial_transforms, initial_mappings, chem_groups,
2796 chem_mapping, trg_group_pos, mdl_group_pos):
2798 Takes initial transforms and sequentially adds chain pairs with lowest RMSD.
2799 The mapping from the transform that leads to lowest overall RMSD is
2802 best_mapping = dict()
2803 best_ssd = float(
"inf")
2807 for transform
in initial_transforms:
2809 mapped_mdl_chains = set()
2811 for trg_chains, mdl_chains, trg_pos, mdl_pos,
in zip(chem_groups,
2815 if len(trg_pos) == 0
or len(mdl_pos) == 0:
2819 for m_pos
in mdl_pos:
2821 t_m_pos.ApplyTransform(transform)
2822 t_mdl_pos.append(t_m_pos)
2823 for t_pos, t
in zip(trg_pos, trg_chains):
2824 for t_m_pos, m
in zip(t_mdl_pos, mdl_chains):
2825 ssd = t_pos.GetSummedSquaredDistances(t_m_pos)
2826 ssds.append((ssd, (t,m)))
2830 if p[0]
not in mapping
and p[1]
not in mapped_mdl_chains:
2831 mapping[p[0]] = p[1]
2832 mapped_mdl_chains.add(p[1])
2837 best_mapping = mapping
2841 def _IterativeRigidRMSD(initial_transforms, initial_mappings, chem_groups,
2842 chem_mapping, trg_group_pos, mdl_group_pos):
2843 """ Takes initial transforms and sequentially adds chain pairs with
2844 lowest RMSD. With each added chain pair, the transform gets updated.
2845 Thus the naming iterative. The mapping from the initial transform that
2846 leads to best overall RMSD score is returned.
2850 trg_pos_dict = dict()
2851 for trg_pos, trg_chains
in zip(trg_group_pos, chem_groups):
2852 for t_pos, t
in zip(trg_pos, trg_chains):
2853 trg_pos_dict[t] = t_pos
2854 mdl_pos_dict = dict()
2855 for mdl_pos, mdl_chains
in zip(mdl_group_pos, chem_mapping):
2856 for m_pos, m
in zip(mdl_pos, mdl_chains):
2857 mdl_pos_dict[m] = m_pos
2859 best_mapping = dict()
2860 best_rmsd = float(
"inf")
2861 for initial_transform, initial_mapping
in zip(initial_transforms,
2863 mapping = {initial_mapping[0]: initial_mapping[1]}
2864 transform =
geom.Mat4(initial_transform)
2865 mapped_trg_pos =
geom.Vec3List(trg_pos_dict[initial_mapping[0]])
2866 mapped_mdl_pos =
geom.Vec3List(mdl_pos_dict[initial_mapping[1]])
2870 trg_chain_groups = [set(group)
for group
in chem_groups]
2871 mdl_chain_groups = [set(group)
for group
in chem_mapping]
2874 for group
in trg_chain_groups:
2875 if initial_mapping[0]
in group:
2876 group.remove(initial_mapping[0])
2878 for group
in mdl_chain_groups:
2879 if initial_mapping[1]
in group:
2880 group.remove(initial_mapping[1])
2883 something_happened =
True
2884 while something_happened:
2886 something_happened=
False
2887 best_sc_mapping =
None
2888 best_sc_group_idx =
None
2889 best_sc_rmsd = float(
"inf")
2891 for trg_chains, mdl_chains
in zip(trg_chain_groups, mdl_chain_groups):
2892 for t
in trg_chains:
2893 t_pos = trg_pos_dict[t]
2894 for m
in mdl_chains:
2895 m_pos = mdl_pos_dict[m]
2897 t_m_pos.ApplyTransform(transform)
2898 rmsd = t_pos.GetRMSD(t_m_pos)
2899 if rmsd < best_sc_rmsd:
2901 best_sc_mapping = (t,m)
2902 best_sc_group_idx = group_idx
2905 if best_sc_mapping
is not None:
2906 something_happened =
True
2907 mapping[best_sc_mapping[0]] = best_sc_mapping[1]
2908 mapped_trg_pos.extend(trg_pos_dict[best_sc_mapping[0]])
2909 mapped_mdl_pos.extend(mdl_pos_dict[best_sc_mapping[1]])
2910 trg_chain_groups[best_sc_group_idx].remove(best_sc_mapping[0])
2911 mdl_chain_groups[best_sc_group_idx].remove(best_sc_mapping[1])
2913 transform = _GetSuperposition(mapped_mdl_pos, mapped_trg_pos,
2914 False).transformation
2917 mapped_mdl_pos.ApplyTransform(transform)
2918 rmsd = mapped_trg_pos.GetRMSD(mapped_mdl_pos)
2920 if rmsd < best_rmsd:
2922 best_mapping = mapping
2926 def _NaiveRMSD(chem_groups, chem_mapping, trg_group_pos, mdl_group_pos,
2930 trg_pos_dict = dict()
2931 for trg_pos, trg_chains
in zip(trg_group_pos, chem_groups):
2932 for t_pos, t
in zip(trg_pos, trg_chains):
2933 trg_pos_dict[t] = t_pos
2934 mdl_pos_dict = dict()
2935 for mdl_pos, mdl_chains
in zip(mdl_group_pos, chem_mapping):
2936 for m_pos, m
in zip(mdl_pos, mdl_chains):
2937 mdl_pos_dict[m] = m_pos
2939 best_mapping = dict()
2940 best_rmsd = float(
"inf")
2942 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2945 for trg_group, mdl_group
in zip(chem_groups, mapping):
2946 for trg_ch, mdl_ch
in zip(trg_group, mdl_group):
2947 if trg_ch
is not None and mdl_ch
is not None:
2948 trg_pos.extend(trg_pos_dict[trg_ch])
2949 mdl_pos.extend(mdl_pos_dict[mdl_ch])
2950 superpose_res = mol.alg.SuperposeSVD(mdl_pos, trg_pos)
2951 if superpose_res.rmsd < best_rmsd:
2952 best_rmsd = superpose_res.rmsd
2953 best_mapping = mapping
2957 for chem_group, mapping
in zip(chem_groups, best_mapping):
2958 for trg_ch, mdl_ch
in zip(chem_group, mapping):
2959 tmp[trg_ch] = mdl_ch
2964 def _GetRefPos(trg, mdl, trg_msas, mdl_alns, max_pos = None):
2965 """ Extracts reference positions which are present in trg and mdl
2970 bb_trg = trg.Select(
"aname=\"CA\",\"C3'\"")
2971 bb_mdl = mdl.Select(
"aname=\"CA\",\"C3'\"")
2975 for aln_list
in mdl_alns:
2976 if len(aln_list) > 0:
2977 tmp = aln_list[0].GetSequence(0)
2978 ref_seq = seq.CreateSequence(tmp.GetName(), tmp.GetGaplessString())
2979 mdl_msas.append(seq.alg.MergePairwiseAlignments(aln_list, ref_seq))
2981 mdl_msas.append(seq.CreateAlignment())
2986 for trg_msa, mdl_msa
in zip(trg_msas, mdl_msas):
2988 if mdl_msa.GetCount() > 0:
2990 assert(trg_msa.GetSequence(0).GetGaplessString() == \
2991 mdl_msa.GetSequence(0).GetGaplessString())
3000 trg_indices = _GetFullyCoveredIndices(trg_msa)
3001 mdl_indices = _GetFullyCoveredIndices(mdl_msa)
3004 indices = sorted(list(trg_indices.intersection(mdl_indices)))
3007 if max_pos
is not None and len(indices) > max_pos:
3008 step = int(len(indices)/max_pos)
3009 indices = [indices[i]
for i
in range(0, len(indices), step)]
3012 trg_indices = _RefIndicesToColumnIndices(trg_msa, indices)
3013 mdl_indices = _RefIndicesToColumnIndices(mdl_msa, indices)
3016 trg_pos.append(list())
3017 mdl_pos.append(list())
3018 for s_idx
in range(trg_msa.GetCount()):
3019 trg_pos[-1].append(_ExtractMSAPos(trg_msa, s_idx, trg_indices,
3022 for s_idx
in range(1, mdl_msa.GetCount()):
3023 mdl_pos[-1].append(_ExtractMSAPos(mdl_msa, s_idx, mdl_indices,
3026 return (trg_pos, mdl_pos)
3028 def _GetFullyCoveredIndices(msa):
3029 """ Helper for _GetRefPos
3031 Returns a set containing the indices relative to first sequence in msa which
3032 are fully covered in all other sequences
3043 if sum([1
for olc
in col
if olc !=
'-']) == col.GetRowCount():
3044 indices.add(ref_idx)
3049 def _RefIndicesToColumnIndices(msa, indices):
3050 """ Helper for _GetRefPos
3052 Returns a list of mapped indices. indices refer to non-gap one letter
3053 codes in the first msa sequence. The returnes mapped indices are translated
3054 to the according msa column indices
3058 for col_idx, col
in enumerate(msa):
3060 mapping[ref_idx] = col_idx
3062 return [mapping[i]
for i
in indices]
3064 def _ExtractMSAPos(msa, s_idx, indices, view):
3065 """ Helper for _GetRefPos
3067 Returns a geom.Vec3List containing positions refering to given msa sequence.
3068 => Chain with corresponding name is mapped onto sequence and the position of
3069 the first atom of each residue specified in indices is extracted.
3070 Indices refers to column indices in msa!
3072 s = msa.GetSequence(s_idx)
3073 s_v = _CSel(view, [s.GetName()])
3076 assert(len(s.GetGaplessString()) == len(s_v.residues))
3078 residue_idx = [s.GetResidueIndex(i)
for i
in indices]
3079 return geom.Vec3List([s_v.residues[i].atoms[0].pos
for i
in residue_idx])
3081 def _NChemGroupMappings(ref_chains, mdl_chains):
3082 """ Number of mappings within one chem group
3084 :param ref_chains: Reference chains
3085 :type ref_chains: :class:`list` of :class:`str`
3086 :param mdl_chains: Model chains that are mapped onto *ref_chains*
3087 :type mdl_chains: :class:`list` of :class:`str`
3088 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3090 n_ref = len(ref_chains)
3091 n_mdl = len(mdl_chains)
3093 return factorial(n_ref)
3095 n_choose_k = binom(n_ref, n_mdl)
3096 return n_choose_k * factorial(n_mdl)
3098 n_choose_k = binom(n_mdl, n_ref)
3099 return n_choose_k * factorial(n_ref)
3101 def _NMappings(ref_chains, mdl_chains):
3102 """ Number of mappings for a full chem mapping
3104 :param ref_chains: Chem groups of reference
3105 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3106 :param mdl_chains: Model chains that map onto those chem groups
3107 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3108 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3110 assert(len(ref_chains) == len(mdl_chains))
3112 for a,b
in zip(ref_chains, mdl_chains):
3113 n *= _NChemGroupMappings(a,b)
3116 def _NMappingsWithin(ref_chains, mdl_chains, max_mappings):
3117 """ Check whether total number of mappings is smaller than given maximum
3119 In principle the same as :func:`_NMappings` but it stops as soon as the
3122 :param ref_chains: Chem groups of reference
3123 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3124 :param mdl_chains: Model chains that map onto those chem groups
3125 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3126 :param max_mappings: Number of max allowed mappings
3127 :returns: Whether number of possible mappings of *mdl_chains* onto
3128 *ref_chains* is below or equal *max_mappings*.
3130 assert(len(ref_chains) == len(mdl_chains))
3132 for a,b
in zip(ref_chains, mdl_chains):
3133 n *= _NChemGroupMappings(a,b)
3134 if n > max_mappings:
3138 def _RefSmallerGenerator(ref_chains, mdl_chains):
3139 """ Returns all possible ways to map mdl_chains onto ref_chains
3141 Specific for the case where len(ref_chains) < len(mdl_chains)
3143 for c
in itertools.combinations(mdl_chains, len(ref_chains)):
3144 for p
in itertools.permutations(c):
3147 def _RefLargerGenerator(ref_chains, mdl_chains):
3148 """ Returns all possible ways to map mdl_chains onto ref_chains
3150 Specific for the case where len(ref_chains) > len(mdl_chains)
3151 Ref chains without mapped mdl chain are assigned None
3153 n_ref = len(ref_chains)
3154 n_mdl = len(mdl_chains)
3155 for c
in itertools.combinations(range(n_ref), n_mdl):
3156 for p
in itertools.permutations(mdl_chains):
3157 ret_list = [
None] * n_ref
3158 for idx, ch
in zip(c, p):
3162 def _RefEqualGenerator(ref_chains, mdl_chains):
3163 """ Returns all possible ways to map mdl_chains onto ref_chains
3165 Specific for the case where len(ref_chains) == len(mdl_chains)
3167 for p
in itertools.permutations(mdl_chains):
3170 def _RefEmptyGenerator(ref_chains, mdl_chains):
3173 def _ConcatIterators(iterators):
3174 for item
in itertools.product(*iterators):
3177 def _ChainMappings(ref_chains, mdl_chains, n_max=None):
3178 """Returns all possible ways to map *mdl_chains* onto fixed *ref_chains*
3180 :param ref_chains: List of list of chemically equivalent chains in reference
3181 :type ref_chains: :class:`list` of :class:`list`
3182 :param mdl_chains: Equally long list of list of chemically equivalent chains
3183 in model that map on those ref chains.
3184 :type mdl_chains: :class:`list` of :class:`list`
3185 :param n_max: Aborts and raises :class:`RuntimeError` if max number of
3186 mappings is above this threshold.
3187 :type n_max: :class:`int`
3188 :returns: Iterator over all possible mappings of *mdl_chains* onto fixed
3189 *ref_chains*. Potentially contains None as padding when number of
3190 model chains for a certain mapping is smaller than the according
3192 Example: _ChainMappings([['A', 'B', 'C'], ['D', 'E']],
3193 [['x', 'y'], ['i', 'j']])
3194 gives an iterator over: [[['x', 'y', None], ['i', 'j']],
3195 [['x', 'y', None], ['j', 'i']],
3196 [['y', 'x', None], ['i', 'j']],
3197 [['y', 'x', None], ['j', 'i']],
3198 [['x', None, 'y'], ['i', 'j']],
3199 [['x', None, 'y'], ['j', 'i']],
3200 [['y', None, 'x'], ['i', 'j']],
3201 [['y', None, 'x'], ['j', 'i']],
3202 [[None, 'x', 'y'], ['i', 'j']],
3203 [[None, 'x', 'y'], ['j', 'i']],
3204 [[None, 'y', 'x'], ['i', 'j']],
3205 [[None, 'y', 'x'], ['j', 'i']]]
3207 assert(len(ref_chains) == len(mdl_chains))
3209 if n_max
is not None:
3210 if not _NMappingsWithin(ref_chains, mdl_chains, n_max):
3211 raise RuntimeError(f
"Too many mappings. Max allowed: {n_max}")
3216 for ref, mdl
in zip(ref_chains, mdl_chains):
3218 iterators.append(_RefEmptyGenerator(ref, mdl))
3219 elif len(ref) == len(mdl):
3220 iterators.append(_RefEqualGenerator(ref, mdl))
3221 elif len(ref) < len(mdl):
3222 iterators.append(_RefSmallerGenerator(ref, mdl))
3224 iterators.append(_RefLargerGenerator(ref, mdl))
3226 return _ConcatIterators(iterators)
3229 def _GetSuperposition(pos_one, pos_two, iterative):
3230 """ Computes minimal RMSD superposition for pos_one onto pos_two
3232 :param pos_one: Positions that should be superposed onto *pos_two*
3233 :type pos_one: :class:`geom.Vec3List`
3234 :param pos_two: Reference positions
3235 :type pos_two: :class:`geom.Vec3List`
3236 :iterative: Whether iterative superposition should be used. Iterative
3237 potentially raises, uses standard superposition as fallback.
3238 :type iterative: :class:`bool`
3239 :returns: Transformation matrix to superpose *pos_one* onto *pos_two*
3240 :rtype: :class:`ost.mol.alg.SuperpositionResult`
3245 res = mol.alg.IterativeSuperposeSVD(pos_one, pos_two)
3249 res = mol.alg.SuperposeSVD(pos_one, pos_two)
3253 __all__ = (
'ChainMapper',
'ReprResult',
'MappingResult')
def _NSCConserved(self, trg_ch, mdl_ch)
def _NPairConserved(self, trg_int, mdl_int)
def FromFlatMapping(self, flat_mapping)
def __init__(self, pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-5, pep_gap_ext=-2, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, resnum_aln=False)
def Align(self, s1, s2, chem_type=None)
def NWAlign(self, s1, s2, chem_type)
def ResNumAlign(self, s1, s2)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, contact_d=12.0, steep_opt_rate=None, greedy_prune_contact_map=False)
def SCCounts(self, ref_ch, mdl_ch)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], steep_opt_rate=None)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def chem_group_ref_seqs(self)
def chem_group_alignments(self)
def GetlDDTMapping(self, model, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic", steep_opt_rate=None, block_seed_size=5, block_blocks_per_chem_group=5, chem_mapping_result=None, heuristic_n_max_naive=40320)
def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False, only_interchain=False, chem_mapping_result=None, global_mapping=None)
def GetNMappings(self, model)
def GetChemMapping(self, model)
def __init__(self, target, resnum_alignments=False, pep_seqid_thr=95., nuc_seqid_thr=95., pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-11, pep_gap_ext=-1, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, min_pep_length=6, min_nuc_length=4, n_max_naive=1e8)
def GetMapping(self, model, n_max_naive=40320)
def ProcessStructure(self, ent)
def Align(self, s1, s2, stype)
def GetQSScoreMapping(self, model, contact_d=12.0, strategy="heuristic", block_seed_size=5, block_blocks_per_chem_group=5, heuristic_n_max_naive=40320, steep_opt_rate=None, chem_mapping_result=None, greedy_prune_contact_map=True)
def chem_group_types(self)
def GetRMSDMapping(self, model, strategy="heuristic", subsampling=50, chem_mapping_result=None, heuristic_n_max_naive=120)
def GetFlatMapping(self, mdl_as_key=False)
def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns, opt_score=None)
def mdl_full_bb_pos(self)
def _GetFullBBPos(self, residues)
def superposed_mdl_bb_pos(self)
def _GetBBPos(self, residues)
def inconsistent_residues(self)
def GetFlatChainMapping(self, mdl_as_key=False)
def ref_full_bb_pos(self)
def __init__(self, lDDT, substructure, ref_view, mdl_view)
def _GetInconsistentResidues(self, ref_residues, mdl_residues)
def _InterfacePenalty1(self, interface)
def _InterfacePenalty2(self, interface)
def _MappedInterfaceScores(self, int1, int2)
def FromFlatMapping(self, flat_mapping)