2 Chain mapping aims to identify a one-to-one relationship between chains in a
3 reference structure and a model.
11 from scipy.special
import factorial
12 from scipy.special
import binom
24 def _CSel(ent, cnames):
25 """ Returns view with specified chains
27 Ensures that quotation marks are around chain names to not confuse
28 OST query language with weird special characters.
30 query =
"cname=" +
','.join([mol.QueryQuoteName(cname)
for cname
in cnames])
31 return ent.Select(query)
34 """ Result object for the chain mapping functions in :class:`ChainMapper`
36 Constructor is directly called within the functions, no need to construct
37 such objects yourself.
39 def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns,
46 self.
_alns_alns = alns
51 """ Target/reference structure, i.e. :attr:`ChainMapper.target`
53 :type: :class:`ost.mol.EntityView`
59 """ Model structure that gets mapped onto :attr:`~target`
61 Underwent same processing as :attr:`ChainMapper.target`, i.e.
62 only contains peptide/nucleotide chains of sufficient size.
64 :type: :class:`ost.mol.EntityView`
70 """ Groups of chemically equivalent chains in :attr:`~target`
72 Same as :attr:`ChainMapper.chem_group`
74 :class:`list` of :class:`list` of :class:`str` (chain names)
80 """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`.
82 :class:`list` of :class:`list` of :class:`str` (chain names)
88 """ Mapping of :attr:`~model` chains onto :attr:`~target`
90 Exact same shape as :attr:`~chem_groups` but containing the names of the
91 mapped chains in :attr:`~model`. May contain None for :attr:`~target`
92 chains that are not covered. No guarantee that all chains in
93 :attr:`~model` are mapped.
95 :class:`list` of :class:`list` of :class:`str` (chain names)
101 """ Alignments of mapped chains in :attr:`~target` and :attr:`~model`
103 Each alignment is accessible with ``alns[(t_chain,m_chain)]``. First
104 sequence is the sequence of :attr:`target` chain, second sequence the
105 one from :attr:`~model`. The respective :class:`ost.mol.EntityView` are
106 attached with :func:`ost.seq.ConstSequenceHandle.AttachView`.
108 :type: :class:`dict` with key: :class:`tuple` of :class:`str`, value:
109 :class:`ost.seq.AlignmentHandle`
111 return self.
_alns_alns
115 """ Placeholder property without any guarantee of being set
117 Different scores get optimized in the various chain mapping algorithms.
118 Some of them may set their final optimal score in that property.
119 Consult the documentation of the respective chain mapping algorithm
120 for more information. Won't be in the return dict of
126 """ Returns flat mapping as :class:`dict` for all mapable chains
128 :param mdl_as_key: Default is target chain name as key and model chain
129 name as value. This can be reversed with this flag.
130 :returns: :class:`dict` with :class:`str` as key/value that describe
133 flat_mapping = dict()
134 for trg_chem_group, mdl_chem_group
in zip(self.
chem_groupschem_groups,
136 for a,b
in zip(trg_chem_group, mdl_chem_group):
137 if a
is not None and b
is not None:
145 """ Returns JSON serializable summary of results
148 json_dict[
"chem_groups"] = self.
chem_groupschem_groups
149 json_dict[
"mapping"] = self.
mappingmapping
151 json_dict[
"alns"] = list()
152 for aln
in self.
alnsalns.values():
153 trg_seq = aln.GetSequence(0)
154 mdl_seq = aln.GetSequence(1)
155 aln_dict = {
"trg_ch": trg_seq.GetName(),
"trg_seq": str(trg_seq),
156 "mdl_ch": mdl_seq.GetName(),
"mdl_seq": str(mdl_seq)}
157 json_dict[
"alns"].append(aln_dict)
163 """ Result object for :func:`ChainMapper.GetRepr`
165 Constructor is directly called within the function, no need to construct
166 such objects yourself.
168 :param lDDT: lDDT for this mapping. Depends on how you call
169 :func:`ChainMapper.GetRepr` whether this is backbone only or
171 :type lDDT: :class:`float`
172 :param substructure: The full substructure for which we searched for a
174 :type substructure: :class:`ost.mol.EntityView`
175 :param ref_view: View pointing to the same underlying entity as
176 *substructure* but only contains the stuff that is mapped
177 :type ref_view: :class:`mol.EntityView`
178 :param mdl_view: The matching counterpart in model
179 :type mdl_view: :class:`mol.EntityView`
181 def __init__(self, lDDT, substructure, ref_view, mdl_view):
182 self.
_lDDT_lDDT = lDDT
184 assert(len(ref_view.residues) == len(mdl_view.residues))
206 """ lDDT of representation result
208 Depends on how you call :func:`ChainMapper.GetRepr` whether this is
209 backbone only or full atom lDDT.
211 :type: :class:`float`
213 return self.
_lDDT_lDDT
217 """ The full substructure for which we searched for a
220 :type: :class:`ost.mol.EntityView`
226 """ View which contains the mapped subset of :attr:`substructure`
228 :type: :class:`ost.mol.EntityView`
234 """ The :attr:`ref_view` representation in the model
236 :type: :class:`ost.mol.EntityView`
242 """ The reference residues
244 :type: class:`mol.ResidueViewList`
246 return self.
ref_viewref_view.residues
250 """ The model residues
252 :type: :class:`mol.ResidueViewList`
254 return self.
mdl_viewmdl_view.residues
258 """ A list of mapped residue whose names do not match (eg. ALA in the
259 reference and LEU in the model).
261 The mismatches are reported as a tuple of :class:`~ost.mol.ResidueView`
262 (reference, model), or as an empty list if all the residue names match.
273 """ Representative backbone positions for reference residues.
275 Thats CA positions for peptides and C3' positions for Nucleotides.
277 :type: :class:`geom.Vec3List`
285 """ Representative backbone positions for model residues.
287 Thats CA positions for peptides and C3' positions for Nucleotides.
289 :type: :class:`geom.Vec3List`
297 """ Representative backbone positions for reference residues.
299 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
300 positions for Nucleotides.
302 :type: :class:`geom.Vec3List`
310 """ Representative backbone positions for reference residues.
312 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
313 positions for Nucleotides.
315 :type: :class:`geom.Vec3List`
323 """ Transformation to superpose mdl residues onto ref residues
325 Superposition computed as minimal RMSD superposition on
326 :attr:`ref_bb_pos` and :attr:`mdl_bb_pos`. If number of positions is
327 smaller 3, the full_bb_pos equivalents are used instead.
329 :type: :class:`ost.geom.Mat4`
342 """ :attr:`mdl_bb_pos` with :attr:`transform applied`
344 :type: :class:`geom.Vec3List`
353 """ RMSD between :attr:`ref_bb_pos` and :attr:`superposed_mdl_bb_pos`
355 :type: :class:`float`
363 """ GDT with one single threshold: 8.0
365 :type: :class:`float`
367 if self.
_gdt_8_gdt_8
is None:
373 """ GDT with one single threshold: 4.0
375 :type: :class:`float`
377 if self.
_gdt_4_gdt_4
is None:
383 """ GDT with one single threshold: 2.0
385 :type: :class:`float`
387 if self.
_gdt_2_gdt_2
is None:
393 """ GDT with one single threshold: 1.0
395 :type: :class:`float`
397 if self.
_gdt_1_gdt_1
is None:
403 """ query for mdl residues in OpenStructure query language
405 Repr can be selected as ``full_mdl.Select(ost_query)``
407 Returns invalid query if residue numbers have insertion codes.
414 chname = r.GetChain().GetName()
415 rnum = r.GetNumber().GetNum()
416 if chname
not in chain_rnums:
417 chain_rnums[chname] = list()
418 chain_rnums[chname].append(str(rnum))
419 chain_queries = list()
420 for k,v
in chain_rnums.items():
421 q = f
"(cname={mol.QueryQuoteName(k)} and "
422 q += f
"rnum={','.join(v)})"
423 chain_queries.append(q)
424 self.
_ost_query_ost_query =
" or ".join(chain_queries)
428 """ Returns JSON serializable summary of results
431 json_dict[
"lDDT"] = self.
lDDTlDDT
432 json_dict[
"ref_residues"] = [r.GetQualifiedName()
for r
in \
434 json_dict[
"mdl_residues"] = [r.GetQualifiedName()
for r
in \
436 json_dict[
"transform"] = list(self.
transformtransform.data)
437 json_dict[
"bb_rmsd"] = self.
bb_rmsdbb_rmsd
438 json_dict[
"gdt_8"] = self.
gdt_8gdt_8
439 json_dict[
"gdt_4"] = self.
gdt_4gdt_4
440 json_dict[
"gdt_2"] = self.
gdt_2gdt_2
441 json_dict[
"gdt_1"] = self.
gdt_1gdt_1
442 json_dict[
"ost_query"] = self.
ost_queryost_query
447 """ Returns flat mapping of all chains in the representation
449 :param mdl_as_key: Default is target chain name as key and model chain
450 name as value. This can be reversed with this flag.
451 :returns: :class:`dict` with :class:`str` as key/value that describe
454 flat_mapping = dict()
457 flat_mapping[mdl_res.chain.name] = trg_res.chain.name
459 flat_mapping[trg_res.chain.name] = mdl_res.chain.name
462 def _GetFullBBPos(self, residues):
463 """ Helper to extract full backbone positions
465 exp_pep_atoms = [
"N",
"CA",
"C"]
466 exp_nuc_atoms = [
"\"O5'\"",
"\"C5'\"",
"\"C4'\"",
"\"C3'\"",
"\"O3'\""]
469 if r.GetChemType() == mol.ChemType.NUCLEOTIDES:
470 exp_atoms = exp_nuc_atoms
471 elif r.GetChemType() == mol.ChemType.AMINOACIDS:
472 exp_atoms = exp_pep_atoms
474 raise RuntimeError(
"Something terrible happened... RUN...")
475 for aname
in exp_atoms:
476 a = r.FindAtom(aname)
478 raise RuntimeError(
"Something terrible happened... "
480 bb_pos.append(a.GetPos())
483 def _GetBBPos(self, residues):
484 """ Helper to extract single representative position for each residue
488 at = r.FindAtom(
"CA")
490 at = r.FindAtom(
"C3'")
492 raise RuntimeError(
"Something terrible happened... RUN...")
493 bb_pos.append(at.GetPos())
496 def _GetInconsistentResidues(self, ref_residues, mdl_residues):
497 """ Helper to extract a list of inconsistent residues.
499 if len(ref_residues) != len(mdl_residues):
500 raise ValueError(
"Something terrible happened... Reference and "
501 "model lengths differ... RUN...")
502 inconsistent_residues = list()
503 for ref_residue, mdl_residue
in zip(ref_residues, mdl_residues):
504 if ref_residue.name != mdl_residue.name:
505 inconsistent_residues.append((ref_residue, mdl_residue))
506 return inconsistent_residues
510 """ Class to compute chain mappings
512 All algorithms are performed on processed structures which fulfill
513 criteria as given in constructor arguments (*min_pep_length*,
514 "min_nuc_length") and only contain residues which have all required backbone
515 atoms. for peptide residues thats N, CA, C and CB (no CB for GLY), for
516 nucleotide residues thats O5', C5', C4', C3' and O3'.
518 Chain mapping is a three step process:
520 * Group chemically identical chains in *target* using pairwise
521 alignments that are either computed with Needleman-Wunsch (NW) or
522 simply derived from residue numbers (*resnum_alignments* flag).
523 In case of NW, *pep_subst_mat*, *pep_gap_open* and *pep_gap_ext*
524 and their nucleotide equivalents are relevant. Two chains are
525 considered identical if they fulfill the thresholds given by
526 *pep_seqid_thr*, *pep_gap_thr*, their nucleotide equivalents
527 respectively. The grouping information is available as
528 attributes of this class.
530 * Map chains in an input model to these groups. Generating alignments
531 and the similarity criteria are the same as above. You can either
532 get the group mapping with :func:`GetChemMapping` or directly call
533 one of the full fletched one-to-one chain mapping functions which
534 execute that step internally.
536 * Obtain one-to-one mapping for chains in an input model and
537 *target* with one of the available mapping functions. Just to get an
538 idea of complexity. If *target* and *model* are octamers, there are
539 ``8! = 40320`` possible chain mappings.
541 :param target: Target structure onto which models are mapped.
542 Computations happen on a selection only containing
543 polypeptides and polynucleotides.
544 :type target: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
545 :param resnum_alignments: Use residue numbers instead of
546 Needleman-Wunsch to compute pairwise
547 alignments. Relevant for :attr:`~chem_groups`
548 and related attributes.
549 :type resnum_alignments: :class:`bool`
550 :param pep_seqid_thr: Threshold used to decide when two chains are
551 identical. 95 percent tolerates the few mutations
552 crystallographers like to do.
553 :type pep_seqid_thr: :class:`float`
554 :param pep_gap_thr: Additional threshold to avoid gappy alignments with
555 high seqid. By default this is disabled (set to 1.0).
556 This threshold checks for a maximum allowed fraction
557 of gaps in any of the two sequences after stripping
558 terminal gaps. The reason for not just normalizing
559 seqid by the longer sequence is that one sequence
560 might be a perfect subsequence of the other but only
562 :type pep_gap_thr: :class:`float`
563 :param nuc_seqid_thr: Nucleotide equivalent for *pep_seqid_thr*
564 :type nuc_seqid_thr: :class:`float`
565 :param nuc_gap_thr: Nucleotide equivalent for *nuc_gap_thr*
566 :type nuc_gap_thr: :class:`float`
567 :param pep_subst_mat: Substitution matrix to align peptide sequences,
568 irrelevant if *resnum_alignments* is True,
569 defaults to seq.alg.BLOSUM62
570 :type pep_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
571 :param pep_gap_open: Gap open penalty to align peptide sequences,
572 irrelevant if *resnum_alignments* is True
573 :type pep_gap_open: :class:`int`
574 :param pep_gap_ext: Gap extension penalty to align peptide sequences,
575 irrelevant if *resnum_alignments* is True
576 :type pep_gap_ext: :class:`int`
577 :param nuc_subst_mat: Nucleotide equivalent for *pep_subst_mat*,
578 defaults to seq.alg.NUC44
579 :type nuc_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
580 :param nuc_gap_open: Nucleotide equivalent for *pep_gap_open*
581 :type nuc_gap_open: :class:`int`
582 :param nuc_gap_ext: Nucleotide equivalent for *pep_gap_ext*
583 :type nuc_gap_ext: :class:`int`
584 :param min_pep_length: Minimal number of residues for a peptide chain to be
585 considered in target and in models.
586 :type min_pep_length: :class:`int`
587 :param min_nuc_length: Minimal number of residues for a nucleotide chain to be
588 considered in target and in models.
589 :type min_nuc_length: :class:`int`
590 :param n_max_naive: Max possible chain mappings that are enumerated in
591 :func:`~GetNaivelDDTMapping` /
592 :func:`~GetDecomposerlDDTMapping`. A
593 :class:`RuntimeError` is raised in case of bigger
595 :type n_max_naive: :class:`int`
597 def __init__(self, target, resnum_alignments=False,
598 pep_seqid_thr = 95., pep_gap_thr = 1.0,
599 nuc_seqid_thr = 95., nuc_gap_thr = 1.0,
600 pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -11,
601 pep_gap_ext = -1, nuc_subst_mat = seq.alg.NUC44,
602 nuc_gap_open = -4, nuc_gap_ext = -4,
603 min_pep_length = 6, min_nuc_length = 4,
624 pep_subst_mat = pep_subst_mat,
625 pep_gap_open = pep_gap_open,
626 pep_gap_ext = pep_gap_ext,
627 nuc_subst_mat = nuc_subst_mat,
628 nuc_gap_open = nuc_gap_open,
629 nuc_gap_ext = nuc_gap_ext)
632 self._target, self._polypep_seqs, self.
_polynuc_seqs_polynuc_seqs = \
637 """Target structure that only contains peptides/nucleotides
639 Contains only residues that have the backbone representatives
640 (CA for peptide and C3' for nucleotides) to avoid ATOMSEQ alignment
641 inconsistencies when switching between all atom and backbone only
644 :type: :class:`ost.mol.EntityView`
650 """Sequences of peptide chains in :attr:`~target`
652 Respective :class:`EntityView` from *target* for each sequence s are
653 available as ``s.GetAttachedView()``
655 :type: :class:`ost.seq.SequenceList`
657 return self._polypep_seqs
661 """Sequences of nucleotide chains in :attr:`~target`
663 Respective :class:`EntityView` from *target* for each sequence s are
664 available as ``s.GetAttachedView()``
666 :type: :class:`ost.seq.SequenceList`
672 """Groups of chemically equivalent chains in :attr:`~target`
674 First chain in group is the one with longest sequence.
676 :getter: Computed on first use (cached)
677 :type: :class:`list` of :class:`list` of :class:`str` (chain names)
682 self.
_chem_groups_chem_groups.append([s.GetName()
for s
in a.sequences])
687 """MSA for each group in :attr:`~chem_groups`
689 Sequences in MSAs exhibit same order as in :attr:`~chem_groups` and
690 have the respective :class:`ost.mol.EntityView` from *target* attached.
692 :getter: Computed on first use (cached)
693 :type: :class:`ost.seq.AlignmentList`
708 """Reference (longest) sequence for each group in :attr:`~chem_groups`
710 Respective :class:`EntityView` from *target* for each sequence s are
711 available as ``s.GetAttachedView()``
713 :getter: Computed on first use (cached)
714 :type: :class:`ost.seq.SequenceList`
719 s = seq.CreateSequence(a.GetSequence(0).GetName(),
720 a.GetSequence(0).GetGaplessString())
721 s.AttachView(a.GetSequence(0).GetAttachedView())
727 """ChemType of each group in :attr:`~chem_groups`
729 Specifying if groups are poly-peptides/nucleotides, i.e.
730 :class:`ost.mol.ChemType.AMINOACIDS` or
731 :class:`ost.mol.ChemType.NUCLEOTIDES`
733 :getter: Computed on first use (cached)
734 :type: :class:`list` of :class:`ost.mol.ChemType`
748 """Maps sequences in *model* to chem_groups of target
750 :param model: Model from which to extract sequences, a
751 selection that only includes peptides and nucleotides
752 is performed and returned along other results.
753 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
754 :returns: Tuple with two lists of length `len(self.chem_groups)` and
755 an :class:`ost.mol.EntityView` representing *model*:
756 1) Each element is a :class:`list` with mdl chain names that
757 map to the chem group at that position.
758 2) Each element is a :class:`ost.seq.AlignmentList` aligning
759 these mdl chain sequences to the chem group ref sequences.
760 3) A selection of *model* that only contains polypeptides and
761 polynucleotides whose ATOMSEQ exactly matches the sequence
762 info in the returned alignments.
764 mdl, mdl_pep_seqs, mdl_nuc_seqs = self.
ProcessStructureProcessStructure(model)
765 mapping = [list()
for x
in self.
chem_groupschem_groups]
766 alns = [seq.AlignmentList()
for x
in self.
chem_groupschem_groups]
768 for s
in mdl_pep_seqs:
771 s, mol.ChemType.AMINOACIDS,
774 mapping[idx].append(s.GetName())
775 alns[idx].append(aln)
777 for s
in mdl_nuc_seqs:
780 s, mol.ChemType.NUCLEOTIDES,
783 mapping[idx].append(s.GetName())
784 alns[idx].append(aln)
786 return (mapping, alns, mdl)
790 thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic",
791 steep_opt_rate = None, block_seed_size = 5,
792 block_blocks_per_chem_group = 5,
793 chem_mapping_result = None,
794 heuristic_n_max_naive = 40320):
795 """ Identify chain mapping by optimizing lDDT score
797 Maps *model* chain sequences to :attr:`~chem_groups` and find mapping
798 based on backbone only lDDT score (CA for amino acids C3' for
801 Either performs a naive search, i.e. enumerate all possible mappings or
802 executes a greedy strategy that tries to identify a (close to) optimal
803 mapping in an iterative way by starting from a start mapping (seed). In
804 each iteration, the one-to-one mapping that leads to highest increase
805 in number of conserved contacts is added with the additional requirement
806 that this added mapping must have non-zero interface counts towards the
807 already mapped chains. So basically we're "growing" the mapped structure
808 by only adding connected stuff.
810 The available strategies:
812 * **naive**: Enumerates all possible mappings and returns best
814 * **greedy_fast**: perform all vs. all single chain lDDTs within the
815 respective ref/mdl chem groups. The mapping with highest number of
816 conserved contacts is selected as seed for greedy extension
818 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
819 all ref/mdl chain combinations within the respective chem groups and
820 retain the mapping leading to the best lDDT.
822 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
823 all ref/mdl chain combinations within the respective chem groups and
824 extend them to *block_seed_size*. *block_blocks_per_chem_group*
825 for each chem group are selected for exhaustive extension.
827 * **heuristic**: Uses *naive* strategy if number of possible mappings
828 is within *heuristic_n_max_naive*. The default of 40320 corresponds
829 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
830 6!*2!=1440 etc. If the number of possible mappings is larger,
831 *greedy_full* is used.
833 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
836 :param model: Model to map
837 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
838 :param inclusion_radius: Inclusion radius for lDDT
839 :type inclusion_radius: :class:`float`
840 :param thresholds: Thresholds for lDDT
841 :type thresholds: :class:`list` of :class:`float`
842 :param strategy: Strategy to find mapping. Must be in ["naive",
843 "greedy_fast", "greedy_full", "greedy_block"]
844 :type strategy: :class:`str`
845 :param steep_opt_rate: Only relevant for greedy strategies.
846 If set, every *steep_opt_rate* mappings, a simple
847 optimization is executed with the goal of
848 avoiding local minima. The optimization
849 iteratively checks all possible swaps of mappings
850 within their respective chem groups and accepts
851 swaps that improve lDDT score. Iteration stops as
852 soon as no improvement can be achieved anymore.
853 :type steep_opt_rate: :class:`int`
854 :param block_seed_size: Param for *greedy_block* strategy - Initial seeds
855 are extended by that number of chains.
856 :type block_seed_size: :class:`int`
857 :param block_blocks_per_chem_group: Param for *greedy_block* strategy -
858 Number of blocks per chem group that
859 are extended in an initial search
860 for high scoring local solutions.
861 :type block_blocks_per_chem_group: :class:`int`
862 :param chem_mapping_result: Pro param. The result of
863 :func:`~GetChemMapping` where you provided
864 *model*. If set, *model* parameter is not
866 :type chem_mapping_result: :class:`tuple`
867 :returns: A :class:`MappingResult`
870 strategies = [
"naive",
"greedy_fast",
"greedy_full",
"greedy_block",
872 if strategy
not in strategies:
873 raise RuntimeError(f
"Strategy must be in {strategies}")
875 if chem_mapping_result
is None:
876 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
878 chem_mapping, chem_group_alns, mdl = chem_mapping_result
880 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
886 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
887 if one_to_one
is not None:
889 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
890 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
891 if ref_ch
is not None and mdl_ch
is not None:
892 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
893 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
894 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
895 alns[(ref_ch, mdl_ch)] = aln
899 if strategy ==
"heuristic":
900 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
901 heuristic_n_max_naive):
904 strategy =
"greedy_full"
909 if strategy ==
"naive":
910 mapping, opt_lddt = _lDDTNaive(self.
targettarget, mdl, inclusion_radius,
912 chem_mapping, ref_mdl_alns,
917 chem_mapping, ref_mdl_alns,
918 inclusion_radius=inclusion_radius,
919 thresholds=thresholds,
920 steep_opt_rate=steep_opt_rate)
921 if strategy ==
"greedy_fast":
922 mapping = _lDDTGreedyFast(the_greed)
923 elif strategy ==
"greedy_full":
924 mapping = _lDDTGreedyFull(the_greed)
925 elif strategy ==
"greedy_block":
926 mapping = _lDDTGreedyBlock(the_greed, block_seed_size,
927 block_blocks_per_chem_group)
929 opt_lddt = the_greed.Score(mapping)
932 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, mapping):
933 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
934 if ref_ch
is not None and mdl_ch
is not None:
935 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
936 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
937 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
938 alns[(ref_ch, mdl_ch)] = aln
941 mapping, alns, opt_score = opt_lddt)
945 block_seed_size = 5, block_blocks_per_chem_group = 5,
946 heuristic_n_max_naive = 40320, steep_opt_rate = None,
947 chem_mapping_result = None,
948 greedy_prune_contact_map = True):
949 """ Identify chain mapping based on QSScore
951 Scoring is based on CA/C3' positions which are present in all chains of
952 a :attr:`chem_groups` as well as the *model* chains which are mapped to
953 that respective chem group.
955 The following strategies are available:
957 * **naive**: Naively iterate all possible mappings and return best based
960 * **greedy_fast**: perform all vs. all single chain lDDTs within the
961 respective ref/mdl chem groups. The mapping with highest number of
962 conserved contacts is selected as seed for greedy extension.
963 Extension is based on QS-score.
965 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
966 all ref/mdl chain combinations within the respective chem groups and
967 retain the mapping leading to the best QS-score.
969 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
970 all ref/mdl chain combinations within the respective chem groups and
971 extend them to *block_seed_size*. *block_blocks_per_chem_group*
972 for each chem group are selected for exhaustive extension.
974 * **heuristic**: Uses *naive* strategy if number of possible mappings
975 is within *heuristic_n_max_naive*. The default of 40320 corresponds
976 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
977 6!*2!=1440 etc. If the number of possible mappings is larger,
978 *greedy_full* is used.
980 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
983 :param model: Model to map
984 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
985 :param contact_d: Max distance between two residues to be considered as
986 contact in qs scoring
987 :type contact_d: :class:`float`
988 :param strategy: Strategy for sampling, must be in ["naive",
989 "greedy_fast", "greedy_full", "greedy_block"]
990 :type strategy: :class:`str`
991 :param chem_mapping_result: Pro param. The result of
992 :func:`~GetChemMapping` where you provided
993 *model*. If set, *model* parameter is not
995 :type chem_mapping_result: :class:`tuple`
996 :param greedy_prune_contact_map: Relevant for all strategies that use
997 greedy extensions. If True, only chains
998 with at least 3 contacts (8A CB
999 distance) towards already mapped chains
1000 in trg/mdl are considered for
1001 extension. All chains that give a
1002 potential non-zero QS-score increase
1003 are used otherwise (at least one
1004 contact within 12A). The consequence
1005 is reduced runtime and usually no
1006 real reduction in accuracy.
1007 :returns: A :class:`MappingResult`
1010 strategies = [
"naive",
"greedy_fast",
"greedy_full",
"greedy_block",
"heuristic"]
1011 if strategy
not in strategies:
1012 raise RuntimeError(f
"strategy must be {strategies}")
1014 if chem_mapping_result
is None:
1015 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1017 chem_mapping, chem_group_alns, mdl = chem_mapping_result
1018 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1023 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
1024 if one_to_one
is not None:
1026 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
1027 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1028 if ref_ch
is not None and mdl_ch
is not None:
1029 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1030 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1031 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1032 alns[(ref_ch, mdl_ch)] = aln
1036 if strategy ==
"heuristic":
1037 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
1038 heuristic_n_max_naive):
1041 strategy =
"greedy_full"
1046 if strategy ==
"naive":
1047 mapping, opt_qsscore = _QSScoreNaive(self.
targettarget, mdl,
1049 chem_mapping, ref_mdl_alns,
1055 chem_mapping, ref_mdl_alns,
1056 contact_d = contact_d,
1057 steep_opt_rate=steep_opt_rate,
1058 greedy_prune_contact_map = greedy_prune_contact_map)
1059 if strategy ==
"greedy_fast":
1060 mapping = _QSScoreGreedyFast(the_greed)
1061 elif strategy ==
"greedy_full":
1062 mapping = _QSScoreGreedyFull(the_greed)
1063 elif strategy ==
"greedy_block":
1064 mapping = _QSScoreGreedyBlock(the_greed, block_seed_size,
1065 block_blocks_per_chem_group)
1067 opt_qsscore = the_greed.Score(mapping, check=
False).QS_global
1070 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, mapping):
1071 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1072 if ref_ch
is not None and mdl_ch
is not None:
1073 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1074 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1075 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1076 alns[(ref_ch, mdl_ch)] = aln
1079 mapping, alns, opt_score = opt_qsscore)
1082 chem_mapping_result = None, heuristic_n_max_naive = 120):
1083 """Identify chain mapping based on minimal RMSD superposition
1085 Superposition and scoring is based on CA/C3' positions which are present
1086 in all chains of a :attr:`chem_groups` as well as the *model*
1087 chains which are mapped to that respective chem group.
1089 The following strategies are available:
1091 * **naive**: Naively iterate all possible mappings and return the one
1094 * **greedy_single**: perform all vs. all single chain superpositions
1095 within the respective ref/mdl chem groups to use as starting points.
1096 For each starting point, iteratively add the model/target chain pair
1097 with lowest RMSD until a full mapping is achieved. The mapping with
1098 lowest RMSD is returned.
1100 * **greedy_iterative**: Same as greedy_single_rmsd exept that the
1101 transformation gets updated with each added chain pair.
1103 * **heuristic**: Uses *naive* strategy if number of possible mappings
1104 is within *heuristic_n_max_naive*. The default of 120 corresponds
1105 to a homo-pentamer (5!=120). If the number of possible mappings is
1106 larger, *greedy_iterative* is used.
1108 :param model: Model to map
1109 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1110 :param strategy: Strategy for sampling. Must be in ["naive",
1111 "greedy_single", "greedy_iterative"]
1112 :type strategy: :class:`str`
1113 :param subsampling: If given, only an equally distributed subset
1114 of CA/C3' positions in each chain are used for
1115 superposition/scoring.
1116 :type subsampling: :class:`int`
1117 :param chem_mapping_result: Pro param. The result of
1118 :func:`~GetChemMapping` where you provided
1119 *model*. If set, *model* parameter is not
1121 :type chem_mapping_result: :class:`tuple`
1122 :returns: A :class:`MappingResult`
1125 strategies = [
"naive",
"greedy_single",
"greedy_iterative",
"heuristic"]
1127 if strategy
not in strategies:
1128 raise RuntimeError(f
"strategy must be {strategies}")
1130 if chem_mapping_result
is None:
1131 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1133 chem_mapping, chem_group_alns, mdl = chem_mapping_result
1134 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1140 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
1141 if one_to_one
is not None:
1143 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
1144 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1145 if ref_ch
is not None and mdl_ch
is not None:
1146 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1147 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1148 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1149 alns[(ref_ch, mdl_ch)] = aln
1153 trg_group_pos, mdl_group_pos = _GetRefPos(self.
targettarget, mdl,
1156 max_pos = subsampling)
1158 if strategy ==
"heuristic":
1159 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
1160 heuristic_n_max_naive):
1163 strategy =
"greedy_iterative"
1167 if strategy.startswith(
"greedy"):
1170 initial_transforms = list()
1171 initial_mappings = list()
1172 for trg_pos, trg_chains, mdl_pos, mdl_chains
in zip(trg_group_pos,
1176 for t_pos, t
in zip(trg_pos, trg_chains):
1177 for m_pos, m
in zip(mdl_pos, mdl_chains):
1178 if len(t_pos) >= 3
and len(m_pos) >= 3:
1179 transform = _GetTransform(m_pos, t_pos,
False)
1180 initial_transforms.append(transform)
1181 initial_mappings.append((t,m))
1183 if strategy ==
"greedy_single":
1184 mapping = _SingleRigidRMSD(initial_transforms,
1192 elif strategy ==
"greedy_iterative":
1193 mapping = _IterativeRigidRMSD(initial_transforms,
1196 trg_group_pos, mdl_group_pos)
1197 elif strategy ==
"naive":
1198 mapping = _NaiveRMSD(self.
chem_groupschem_groups, chem_mapping,
1199 trg_group_pos, mdl_group_pos,
1203 final_mapping = list()
1205 mapped_mdl_chains = list()
1206 for ref_ch
in ref_chains:
1207 if ref_ch
in mapping:
1208 mapped_mdl_chains.append(mapping[ref_ch])
1210 mapped_mdl_chains.append(
None)
1211 final_mapping.append(mapped_mdl_chains)
1214 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, final_mapping):
1215 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1216 if ref_ch
is not None and mdl_ch
is not None:
1217 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1218 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1219 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1220 alns[(ref_ch, mdl_ch)] = aln
1223 final_mapping, alns)
1226 """ Convenience function to get mapping with currently preferred method
1228 If number of possible chain mappings is <= *n_max_naive*, a naive
1229 QS-score mapping is performed and optimal QS-score is guaranteed.
1230 For anything else, a QS-score mapping with the greedy_full strategy is
1231 performed (greedy_prune_contact_map = True). The default for
1232 *n_max_naive* of 40320 corresponds to an octamer (8!=40320). A
1233 structure with stoichiometry A6B2 would be 6!*2!=1440 etc.
1235 If :attr:`~target` has nucleotide sequences, the QS-score target
1236 function is replaced with a backbone only lDDT score that has
1237 an inclusion radius of 30A.
1240 return self.
GetlDDTMappingGetlDDTMapping(model, strategy =
"heuristic",
1241 inclusion_radius = 30.0,
1242 heuristic_n_max_naive = n_max_naive)
1245 greedy_prune_contact_map=
True,
1246 heuristic_n_max_naive = n_max_naive)
1248 def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0,
1249 thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False,
1250 only_interchain=False, chem_mapping_result = None,
1251 global_mapping = None):
1252 """ Identify *topn* representations of *substructure* in *model*
1254 *substructure* defines a subset of :attr:`~target` for which one
1255 wants the *topn* representations in *model*. Representations are scored
1258 :param substructure: A :class:`ost.mol.EntityView` which is a subset of
1259 :attr:`~target`. Should be selected with the
1260 OpenStructure query language. Example: if you're
1261 interested in residues with number 42,43 and 85 in
1263 ``substructure=mapper.target.Select("cname=A and rnum=42,43,85")``
1264 A :class:`RuntimeError` is raised if *substructure*
1265 does not refer to the same underlying
1266 :class:`ost.mol.EntityHandle` as :attr:`~target`.
1267 :type substructure: :class:`ost.mol.EntityView`
1268 :param model: Structure in which one wants to find representations for
1270 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1271 :param topn: Max number of representations that are returned
1272 :type topn: :class:`int`
1273 :param inclusion_radius: Inclusion radius for lDDT
1274 :type inclusion_radius: :class:`float`
1275 :param thresholds: Thresholds for lDDT
1276 :type thresholds: :class:`list` of :class:`float`
1277 :param bb_only: Only consider backbone atoms in lDDT computation
1278 :type bb_only: :class:`bool`
1279 :param only_interchain: Only score interchain contacts in lDDT. Useful
1280 if you want to identify interface patches.
1281 :type only_interchain: :class:`bool`
1282 :param chem_mapping_result: Pro param. The result of
1283 :func:`~GetChemMapping` where you provided
1284 *model*. If set, *model* parameter is not
1286 :type chem_mapping_result: :class:`tuple`
1287 :param global_mapping: Pro param. Specify a global mapping result. This
1288 fully defines the desired representation in the
1289 model but extracts it and enriches it with all
1290 the nice attributes of :class:`ReprResult`.
1291 The target attribute in *global_mapping* must be
1292 of the same entity as self.target and the model
1293 attribute of *global_mapping* must be of the same
1295 :type global_mapping: :class:`MappingResult`
1296 :returns: :class:`list` of :class:`ReprResult`
1300 raise RuntimeError(
"topn must be >= 1")
1302 if global_mapping
is not None:
1304 if global_mapping.target.handle.GetHashCode() != \
1305 self.
targettarget.handle.GetHashCode():
1306 raise RuntimeError(
"global_mapping.target must be the same "
1307 "entity as self.target")
1308 if global_mapping.model.handle.GetHashCode() != \
1309 model.handle.GetHashCode():
1310 raise RuntimeError(
"global_mapping.model must be the same "
1311 "entity as model param")
1314 for r
in substructure.residues:
1315 ch_name = r.GetChain().GetName()
1316 rnum = r.GetNumber()
1317 target_r = self.
targettarget.FindResidue(ch_name, rnum)
1318 if not target_r.IsValid():
1319 raise RuntimeError(f
"substructure has residue "
1320 f
"{r.GetQualifiedName()} which is not in "
1322 if target_r.handle.GetHashCode() != r.handle.GetHashCode():
1323 raise RuntimeError(f
"substructure has residue "
1324 f
"{r.GetQualifiedName()} which has an "
1325 f
"equivalent in self.target but it does "
1326 f
"not refer to the same underlying "
1329 target_a = target_r.FindAtom(a.GetName())
1330 if not target_a.IsValid():
1331 raise RuntimeError(f
"substructure has atom "
1332 f
"{a.GetQualifiedName()} which is not "
1334 if a.handle.GetHashCode() != target_a.handle.GetHashCode():
1335 raise RuntimeError(f
"substructure has atom "
1336 f
"{a.GetQualifiedName()} which has an "
1337 f
"equivalent in self.target but it does "
1338 f
"not refer to the same underlying "
1342 ca = r.FindAtom(
"CA")
1343 c3 = r.FindAtom(
"C3'")
1345 if not ca.IsValid()
and not c3.IsValid():
1346 raise RuntimeError(
"All residues in substructure must contain "
1347 "a backbone atom named CA or C3\'")
1350 if chem_mapping_result
is None:
1351 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1353 chem_mapping, chem_group_alns, mdl = chem_mapping_result
1354 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1360 substructure_res_indices = dict()
1361 for ch
in substructure.chains:
1362 full_ch = self.
targettarget.FindChain(ch.GetName())
1363 idx = [full_ch.GetResidueIndex(r.GetNumber())
for r
in ch.residues]
1364 substructure_res_indices[ch.GetName()] = idx
1368 substructure_chem_groups = list()
1369 substructure_chem_mapping = list()
1371 chnames = set([ch.GetName()
for ch
in substructure.chains])
1372 for chem_group, mapping
in zip(self.
chem_groupschem_groups, chem_mapping):
1373 substructure_chem_group = [ch
for ch
in chem_group
if ch
in chnames]
1374 if len(substructure_chem_group) > 0:
1375 substructure_chem_groups.append(substructure_chem_group)
1376 substructure_chem_mapping.append(mapping)
1379 n_mapped_mdl_chains = sum([len(m)
for m
in substructure_chem_mapping])
1380 if n_mapped_mdl_chains == 0:
1385 substructure_ref_mdl_alns = dict()
1387 for ch
in mdl.chains:
1388 mdl_views[ch.GetName()] = _CSel(mdl, [ch.GetName()])
1389 for chem_group, mapping
in zip(substructure_chem_groups,
1390 substructure_chem_mapping):
1391 for ref_ch
in chem_group:
1392 for mdl_ch
in mapping:
1393 full_aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1394 ref_seq = full_aln.GetSequence(0)
1398 tmp = [
'-'] * len(full_aln)
1399 for idx
in substructure_res_indices[ref_ch]:
1400 idx_in_seq = ref_seq.GetPos(idx)
1401 tmp[idx_in_seq] = ref_seq[idx_in_seq]
1402 ref_seq = seq.CreateSequence(ref_ch,
''.join(tmp))
1403 ref_seq.AttachView(_CSel(substructure, [ref_ch]))
1404 mdl_seq = full_aln.GetSequence(1)
1405 mdl_seq = seq.CreateSequence(mdl_seq.GetName(),
1406 mdl_seq.GetString())
1407 mdl_seq.AttachView(mdl_views[mdl_ch])
1408 aln = seq.CreateAlignment()
1409 aln.AddSequence(ref_seq)
1410 aln.AddSequence(mdl_seq)
1411 substructure_ref_mdl_alns[(ref_ch, mdl_ch)] = aln
1414 inclusion_radius = inclusion_radius,
1416 scored_mappings = list()
1420 flat_mapping = global_mapping.GetFlatMapping()
1422 for chem_group, chem_mapping
in zip(substructure_chem_groups,
1423 substructure_chem_mapping):
1424 chem_group_mapping = list()
1425 for ch
in chem_group:
1426 if ch
in flat_mapping:
1427 mdl_ch = flat_mapping[ch]
1428 if mdl_ch
in chem_mapping:
1429 chem_group_mapping.append(mdl_ch)
1431 chem_group_mapping.append(
None)
1433 chem_group_mapping.append(
None)
1434 mapping.append(chem_group_mapping)
1435 mappings = [mapping]
1437 mappings = list(_ChainMappings(substructure_chem_groups,
1438 substructure_chem_mapping,
1441 for mapping
in mappings:
1443 lddt_chain_mapping = dict()
1446 for ref_chem_group, mdl_chem_group
in zip(substructure_chem_groups,
1448 for ref_ch, mdl_ch
in zip(ref_chem_group, mdl_chem_group):
1450 if mdl_ch
is not None:
1451 lddt_chain_mapping[mdl_ch] = ref_ch
1452 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1453 lddt_alns[mdl_ch] = aln
1454 tmp = [int(c[0] !=
'-' and c[1] !=
'-')
for c
in aln]
1455 n_res_aln += sum(tmp)
1460 lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds,
1461 chain_mapping=lddt_chain_mapping,
1462 residue_mapping = lddt_alns,
1463 check_resnames =
False,
1464 no_intrachain = only_interchain)
1467 ost.LogVerbose(
"No valid contacts in the reference")
1472 if len(scored_mappings) == 0:
1473 scored_mappings.append((lDDT, mapping))
1474 elif len(scored_mappings) < topn:
1475 scored_mappings.append((lDDT, mapping))
1476 scored_mappings.sort(reverse=
True, key=
lambda x: x[0])
1477 elif lDDT > scored_mappings[-1][0]:
1478 scored_mappings.append((lDDT, mapping))
1479 scored_mappings.sort(reverse=
True, key=
lambda x: x[0])
1480 scored_mappings = scored_mappings[:topn]
1484 for scored_mapping
in scored_mappings:
1485 ref_view = substructure.handle.CreateEmptyView()
1486 mdl_view = mdl.handle.CreateEmptyView()
1487 for ref_ch_group, mdl_ch_group
in zip(substructure_chem_groups,
1489 for ref_ch, mdl_ch
in zip(ref_ch_group, mdl_ch_group):
1490 if ref_ch
is not None and mdl_ch
is not None:
1491 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1493 if col[0] !=
'-' and col[1] !=
'-':
1494 ref_view.AddResidue(col.GetResidue(0),
1495 mol.ViewAddFlag.INCLUDE_ALL)
1496 mdl_view.AddResidue(col.GetResidue(1),
1497 mol.ViewAddFlag.INCLUDE_ALL)
1498 results.append(
ReprResult(scored_mapping[0], substructure,
1499 ref_view, mdl_view))
1503 """ Returns number of possible mappings
1505 :param model: Model with chains that are mapped onto
1507 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1509 chem_mapping, chem_group_alns, mdl = self.
GetChemMappingGetChemMapping(model)
1510 return _NMappings(self.
chem_groupschem_groups, chem_mapping)
1513 """ Entity processing for chain mapping
1515 * Selects view containing peptide and nucleotide residues which have
1516 required backbone atoms present - for peptide residues thats
1517 N, CA, C and CB (no CB for GLY), for nucleotide residues thats
1518 O5', C5', C4', C3' and O3'.
1519 * filters view by chain lengths, see *min_pep_length* and
1520 *min_nuc_length* in constructor
1521 * Extracts atom sequences for each chain in that view
1522 * Attaches corresponding :class:`ost.mol.EntityView` to each sequence
1523 * If residue number alignments are used, strictly increasing residue
1524 numbers without insertion codes are ensured in each chain
1526 :param ent: Entity to process
1527 :type ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1528 :returns: Tuple with 3 elements: 1) :class:`ost.mol.EntityView`
1529 containing peptide and nucleotide residues 2)
1530 :class:`ost.seq.SequenceList` containing ATOMSEQ sequences
1531 for each polypeptide chain in returned view, sequences have
1532 :class:`ost.mol.EntityView` of according chains attached
1533 3) same for polynucleotide chains
1535 view = ent.CreateEmptyView()
1536 exp_pep_atoms = [
"N",
"CA",
"C",
"CB"]
1537 exp_nuc_atoms = [
"\"O5'\"",
"\"C5'\"",
"\"C4'\"",
"\"C3'\"",
"\"O3'\""]
1538 pep_query =
"peptide=true and aname=" +
','.join(exp_pep_atoms)
1539 nuc_query =
"nucleotide=true and aname=" +
','.join(exp_nuc_atoms)
1541 pep_sel = ent.Select(pep_query)
1542 for r
in pep_sel.residues:
1543 if len(r.atoms) == 4:
1544 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1545 elif r.name ==
"GLY" and len(r.atoms) == 3:
1546 atom_names = [a.GetName()
for a
in r.atoms]
1547 if sorted(atom_names) == [
"C",
"CA",
"N"]:
1548 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1550 nuc_sel = ent.Select(nuc_query)
1551 for r
in nuc_sel.residues:
1552 if len(r.atoms) == 5:
1553 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1555 polypep_seqs = seq.CreateSequenceList()
1556 polynuc_seqs = seq.CreateSequenceList()
1558 if len(view.residues) == 0:
1560 return (view, polypep_seqs, polynuc_seqs)
1562 for ch
in view.chains:
1563 n_res = len(ch.residues)
1564 n_pep = sum([r.IsPeptideLinking()
for r
in ch.residues])
1565 n_nuc = sum([r.IsNucleotideLinking()
for r
in ch.residues])
1568 if n_pep > 0
and n_nuc > 0:
1569 raise RuntimeError(f
"Must not mix peptide and nucleotide linking "
1570 f
"residues in same chain ({ch.GetName()})")
1572 if (n_pep + n_nuc) != n_res:
1573 raise RuntimeError(
"All residues must either be peptide_linking "
1574 "or nucleotide_linking")
1588 ins_codes = [r.GetNumber().GetInsCode()
for r
in ch.residues]
1589 if len(set(ins_codes)) != 1
or ins_codes[0] !=
'\0':
1590 raise RuntimeError(
"Residue numbers in input structures must not "
1591 "contain insertion codes")
1594 nums = [r.GetNumber().GetNum()
for r
in ch.residues]
1595 if not all(i < j
for i, j
in zip(nums, nums[1:])):
1596 raise RuntimeError(
"Residue numbers in input structures must be "
1597 "strictly increasing for each chain")
1599 s =
''.join([r.one_letter_code
for r
in ch.residues])
1600 s = seq.CreateSequence(ch.GetName(), s)
1601 s.AttachView(_CSel(view, [ch.GetName()]))
1603 polypep_seqs.AddSequence(s)
1604 elif n_nuc == n_res:
1605 polynuc_seqs.AddSequence(s)
1607 raise RuntimeError(
"This shouldnt happen")
1609 if len(polypep_seqs) == 0
and len(polynuc_seqs) == 0:
1610 raise RuntimeError(f
"No chain fulfilled minimum length requirement "
1611 f
"to be considered in chain mapping "
1612 f
"({self.min_pep_length} for peptide chains, "
1613 f
"{self.min_nuc_length} for nucleotide chains) "
1614 f
"- mapping failed")
1617 chain_names = [s.GetAttachedView().chains[0].name
for s
in polypep_seqs]
1618 chain_names += [s.GetAttachedView().chains[0].name
for s
in polynuc_seqs]
1619 view = _CSel(view, chain_names)
1621 return (view, polypep_seqs, polynuc_seqs)
1624 """ Access to internal sequence alignment functionality
1626 Alignment parameterization is setup at ChainMapper construction
1628 :param s1: First sequence to align - must have view attached in case
1629 of resnum_alignments
1630 :type s1: :class:`ost.seq.SequenceHandle`
1631 :param s2: Second sequence to align - must have view attached in case
1632 of resnum_alignments
1633 :type s2: :class:`ost.seq.SequenceHandle`
1634 :param stype: Type of sequences to align, must be in
1635 [:class:`ost.mol.ChemType.AMINOACIDS`,
1636 :class:`ost.mol.ChemType.NUCLEOTIDES`]
1637 :returns: Pairwise alignment of s1 and s2
1639 if stype
not in [mol.ChemType.AMINOACIDS, mol.ChemType.NUCLEOTIDES]:
1640 raise RuntimeError(
"stype must be ost.mol.ChemType.AMINOACIDS or "
1641 "ost.mol.ChemType.NUCLEOTIDES")
1642 return self.
aligneraligner.
Align(s1, s2, chem_type = stype)
1648 def __init__(self, pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -5,
1649 pep_gap_ext = -2, nuc_subst_mat = seq.alg.NUC44,
1650 nuc_gap_open = -4, nuc_gap_ext = -4, resnum_aln = False):
1651 """ Helper class to compute alignments
1653 Sets default values for substitution matrix, gap open and gap extension
1654 penalties. They are only used in default mode (Needleman-Wunsch aln).
1655 If *resnum_aln* is True, only residue numbers of views that are attached
1656 to input sequences are considered.
1666 def Align(self, s1, s2, chem_type=None):
1670 if chem_type
is None:
1671 raise RuntimeError(
"Must specify chem_type for NW alignment")
1672 return self.
NWAlignNWAlign(s1, s2, chem_type)
1675 """ Returns pairwise alignment using Needleman-Wunsch algorithm
1677 :param s1: First sequence to align
1678 :type s1: :class:`ost.seq.SequenceHandle`
1679 :param s2: Second sequence to align
1680 :type s2: :class:`ost.seq.SequenceHandle`
1681 :param chem_type: Must be in [:class:`ost.mol.ChemType.AMINOACIDS`,
1682 :class:`ost.mol.ChemType.NUCLEOTIDES`], determines
1683 substitution matrix and gap open/extension penalties
1684 :type chem_type: :class:`ost.mol.ChemType`
1685 :returns: Alignment with s1 as first and s2 as second sequence
1687 if chem_type == mol.ChemType.AMINOACIDS:
1688 return seq.alg.SemiGlobalAlign(s1, s2, self.
pep_subst_matpep_subst_mat,
1691 elif chem_type == mol.ChemType.NUCLEOTIDES:
1692 return seq.alg.SemiGlobalAlign(s1, s2, self.
nuc_subst_matnuc_subst_mat,
1696 raise RuntimeError(
"Invalid ChemType")
1700 """ Returns pairwise alignment using residue numbers of attached views
1702 Assumes that there are no insertion codes (alignment only on numerical
1703 component) and that resnums are strictly increasing (fast min/max
1704 identification). These requirements are assured if a structure has been
1705 processed by :class:`ChainMapper`.
1707 :param s1: First sequence to align, must have :class:`ost.mol.EntityView`
1709 :type s1: :class:`ost.seq.SequenceHandle`
1710 :param s2: Second sequence to align, must have :class:`ost.mol.EntityView`
1712 :type s2: :class:`ost.seq.SequenceHandle`
1714 assert(s1.HasAttachedView())
1715 assert(s2.HasAttachedView())
1716 v1 = s1.GetAttachedView()
1717 rnums1 = [r.GetNumber().GetNum()
for r
in v1.residues]
1718 v2 = s2.GetAttachedView()
1719 rnums2 = [r.GetNumber().GetNum()
for r
in v2.residues]
1721 min_num = min(rnums1[0], rnums2[0])
1722 max_num = max(rnums1[-1], rnums2[-1])
1723 aln_length = max_num - min_num + 1
1725 aln_s1 = [
'-'] * aln_length
1726 for r, rnum
in zip(v1.residues, rnums1):
1727 aln_s1[rnum-min_num] = r.one_letter_code
1729 aln_s2 = [
'-'] * aln_length
1730 for r, rnum
in zip(v2.residues, rnums2):
1731 aln_s2[rnum-min_num] = r.one_letter_code
1733 aln = seq.CreateAlignment()
1734 aln.AddSequence(seq.CreateSequence(s1.GetName(),
''.join(aln_s1)))
1735 aln.AddSequence(seq.CreateSequence(s2.GetName(),
''.join(aln_s2)))
1738 def _GetAlnPropsTwo(aln):
1739 """Returns basic properties of *aln* version two...
1741 :param aln: Alignment to compute properties
1742 :type aln: :class:`seq.AlignmentHandle`
1743 :returns: Tuple with 2 elements. 1) sequence identify in range [0, 100]
1744 considering aligned columns 2) Fraction of non-gap characters
1745 in first sequence that are covered by non-gap characters in
1748 assert(aln.GetCount() == 2)
1749 n_tot = sum([1
for col
in aln
if col[0] !=
'-'])
1750 n_aligned = sum([1
for col
in aln
if (col[0] !=
'-' and col[1] !=
'-')])
1751 return (seq.alg.SequenceIdentity(aln), float(n_aligned)/n_tot)
1753 def _GetAlnPropsOne(aln):
1755 """Returns basic properties of *aln* version one...
1757 :param aln: Alignment to compute properties
1758 :type aln: :class:`seq.AlignmentHandle`
1759 :returns: Tuple with 3 elements. 1) sequence identify in range [0, 100]
1760 considering aligned columns 2) Fraction of gaps between
1761 first and last aligned column in s1 3) same for s2.
1763 assert(aln.GetCount() == 2)
1764 n_gaps_1 = str(aln.GetSequence(0)).strip(
'-').count(
'-')
1765 n_gaps_2 = str(aln.GetSequence(1)).strip(
'-').count(
'-')
1766 gap_frac_1 = float(n_gaps_1)/len(aln.GetSequence(0).GetGaplessString())
1767 gap_frac_2 = float(n_gaps_2)/len(aln.GetSequence(1).GetGaplessString())
1768 return (seq.alg.SequenceIdentity(aln), gap_frac_1, gap_frac_2)
1770 def _GetChemGroupAlignments(pep_seqs, nuc_seqs, aligner, pep_seqid_thr=95.,
1771 pep_gap_thr=0.1, nuc_seqid_thr=95.,
1773 """Returns alignments with groups of chemically equivalent chains
1775 :param pep_seqs: List of polypeptide sequences
1776 :type pep_seqs: :class:`seq.SequenceList`
1777 :param nuc_seqs: List of polynucleotide sequences
1778 :type nuc_seqs: :class:`seq.SequenceList`
1779 :param aligner: Helper class to generate pairwise alignments
1780 :type aligner: :class:`_Aligner`
1781 :param pep_seqid_thr: Threshold used to decide when two peptide chains are
1782 identical. 95 percent tolerates the few mutations
1783 crystallographers like to do.
1784 :type pep_seqid_thr: :class:`float`
1785 :param pep_gap_thr: Additional threshold to avoid gappy alignments with high
1786 seqid. The reason for not just normalizing seqid by the
1787 longer sequence is that one sequence might be a perfect
1788 subsequence of the other but only cover half of it. This
1789 threshold checks for a maximum allowed fraction of gaps
1790 in any of the two sequences after stripping terminal gaps.
1791 :type pep_gap_thr: :class:`float`
1792 :param nuc_seqid_thr: Nucleotide equivalent of *pep_seqid_thr*
1793 :type nuc_seqid_thr: :class:`float`
1794 :param nuc_gap_thr: Nucleotide equivalent of *nuc_gap_thr*
1795 :type nuc_gap_thr: :class:`float`
1796 :returns: Tuple with first element being an AlignmentList. Each alignment
1797 represents a group of chemically equivalent chains and the first
1798 sequence is the longest. Second element is a list of equivalent
1799 length specifying the types of the groups. List elements are in
1800 [:class:`ost.ChemType.AMINOACIDS`,
1801 :class:`ost.ChemType.NUCLEOTIDES`]
1803 pep_groups = _GroupSequences(pep_seqs, pep_seqid_thr, pep_gap_thr, aligner,
1804 mol.ChemType.AMINOACIDS)
1805 nuc_groups = _GroupSequences(nuc_seqs, nuc_seqid_thr, nuc_gap_thr, aligner,
1806 mol.ChemType.NUCLEOTIDES)
1807 group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
1808 group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
1810 groups.extend(nuc_groups)
1811 return (groups, group_types)
1813 def _GroupSequences(seqs, seqid_thr, gap_thr, aligner, chem_type):
1814 """Get list of alignments representing groups of equivalent sequences
1816 :param seqid_thr: Threshold used to decide when two chains are identical.
1817 :type seqid_thr: :class:`float`
1818 :param gap_thr: Additional threshold to avoid gappy alignments with high
1819 seqid. The reason for not just normalizing seqid by the
1820 longer sequence is that one sequence might be a perfect
1821 subsequence of the other but only cover half of it. This
1822 threshold checks for a maximum allowed fraction of gaps
1823 in any of the two sequences after stripping terminal gaps.
1824 :type gap_thr: :class:`float`
1825 :param aligner: Helper class to generate pairwise alignments
1826 :type aligner: :class:`_Aligner`
1827 :param chem_type: ChemType of seqs which is passed to *aligner*, must be in
1828 [:class:`ost.mol.ChemType.AMINOACIDS`,
1829 :class:`ost.mol.ChemType.NUCLEOTIDES`]
1830 :type chem_type: :class:`ost.mol.ChemType`
1831 :returns: A list of alignments, one alignment for each group
1832 with longest sequence (reference) as first sequence.
1833 :rtype: :class:`ost.seq.AlignmentList`
1836 for s_idx
in range(len(seqs)):
1837 matching_group =
None
1838 for g_idx
in range(len(groups)):
1839 for g_s_idx
in range(len(groups[g_idx])):
1840 aln = aligner.Align(seqs[s_idx], seqs[groups[g_idx][g_s_idx]],
1842 sid, frac_i, frac_j = _GetAlnPropsOne(aln)
1843 if sid >= seqid_thr
and frac_i < gap_thr
and frac_j < gap_thr:
1844 matching_group = g_idx
1846 if matching_group
is not None:
1849 if matching_group
is None:
1850 groups.append([s_idx])
1852 groups[matching_group].append(s_idx)
1855 sorted_groups = list()
1858 tmp = sorted([[len(seqs[i]), i]
for i
in g], reverse=
True)
1859 sorted_groups.append([x[1]
for x
in tmp])
1861 sorted_groups.append(g)
1865 aln_list = seq.AlignmentList()
1866 for g
in sorted_groups:
1869 aln_list.append(seq.CreateAlignment(seqs[g[0]]))
1872 alns = seq.AlignmentList()
1875 alns.append(aligner.Align(seqs[i], seqs[j], chem_type))
1877 aln_list.append(seq.alg.MergePairwiseAlignments(alns, seqs[i]))
1880 seq_dict = {s.GetName(): s
for s
in seqs}
1881 for aln_idx
in range(len(aln_list)):
1882 for aln_s_idx
in range(aln_list[aln_idx].GetCount()):
1883 s_name = aln_list[aln_idx].GetSequence(aln_s_idx).GetName()
1884 s = seq_dict[s_name]
1885 aln_list[aln_idx].AttachView(aln_s_idx, s.GetAttachedView())
1889 def _MapSequence(ref_seqs, ref_types, s, s_type, aligner):
1890 """Tries top map *s* onto any of the sequences in *ref_seqs*
1892 Computes alignments of *s* to each of the reference sequences of equal type
1893 and sorts them by seqid*fraction_covered (seqid: sequence identity of
1894 aligned columns in alignment, fraction_covered: Fraction of non-gap
1895 characters in reference sequence that are covered by non-gap characters in
1896 *s*). Best scoring mapping is returned.
1898 :param ref_seqs: Reference sequences
1899 :type ref_seqs: :class:`ost.seq.SequenceList`
1900 :param ref_types: Types of reference sequences, e.g.
1901 ost.mol.ChemType.AminoAcids
1902 :type ref_types: :class:`list` of :class:`ost.mol.ChemType`
1903 :param s: Sequence to map
1904 :type s: :class:`ost.seq.SequenceHandle`
1905 :param s_type: Type of *s*, only try mapping to sequences in *ref_seqs*
1906 with equal type as defined in *ref_types*
1907 :param aligner: Helper class to generate pairwise alignments
1908 :type aligner: :class:`_Aligner`
1909 :returns: Tuple with two elements. 1) index of sequence in *ref_seqs* to
1910 which *s* can be mapped 2) Pairwise sequence alignment with
1911 sequence from *ref_seqs* as first sequence. Both elements are
1912 None if no mapping can be found.
1913 :raises: :class:`RuntimeError` if mapping is ambiguous, i.e. *s*
1914 successfully maps to more than one sequence in *ref_seqs*
1916 scored_alns = list()
1917 for ref_idx, ref_seq
in enumerate(ref_seqs):
1918 if ref_types[ref_idx] == s_type:
1919 aln = aligner.Align(ref_seq, s, s_type)
1920 seqid, fraction_covered = _GetAlnPropsTwo(aln)
1921 score = seqid * fraction_covered
1922 scored_alns.append((score, ref_idx, aln))
1924 if len(scored_alns) == 0:
1927 scored_alns = sorted(scored_alns, key=
lambda x: x[0], reverse=
True)
1928 return (scored_alns[0][1], scored_alns[0][2])
1930 def _GetRefMdlAlns(ref_chem_groups, ref_chem_group_msas, mdl_chem_groups,
1931 mdl_chem_group_alns, pairs=None):
1932 """ Get all possible ref/mdl chain alignments given chem group mapping
1934 :param ref_chem_groups: :attr:`ChainMapper.chem_groups`
1935 :type ref_chem_groups: :class:`list` of :class:`list` of :class:`str`
1936 :param ref_chem_group_msas: :attr:`ChainMapper.chem_group_alignments`
1937 :type ref_chem_group_msas: :class:`ost.seq.AlignmentList`
1938 :param mdl_chem_groups: Groups of model chains that are mapped to
1939 *ref_chem_groups*. Return value of
1940 :func:`ChainMapper.GetChemMapping`.
1941 :type mdl_chem_groups: :class:`list` of :class:`list` of :class:`str`
1942 :param mdl_chem_group_alns: A pairwise sequence alignment for every chain
1943 in *mdl_chem_groups* that aligns these sequences
1944 to the respective reference sequence.
1946 :func:`ChainMapper.GetChemMapping`.
1947 :type mdl_chem_group_alns: :class:`list` of :class:`ost.seq.AlignmentList`
1948 :param pairs: Pro param - restrict return dict to specified pairs. A set of
1949 tuples in form (<trg_ch>, <mdl_ch>)
1950 :type pairs: :class:`set`
1951 :returns: A dictionary holding all possible ref/mdl chain alignments. Keys
1952 in that dictionary are tuples of the form (ref_ch, mdl_ch) and
1953 values are the respective pairwise alignments with first sequence
1954 being from ref, the second from mdl.
1958 for alns
in mdl_chem_group_alns:
1960 mdl_chain_name = aln.GetSequence(1).GetName()
1961 mdl_alns[mdl_chain_name] = aln
1965 ref_mdl_alns = dict()
1966 for ref_chains, mdl_chains, ref_aln
in zip(ref_chem_groups, mdl_chem_groups,
1967 ref_chem_group_msas):
1968 for ref_ch
in ref_chains:
1969 for mdl_ch
in mdl_chains:
1970 if pairs
is not None and (ref_ch, mdl_ch)
not in pairs:
1974 aln_list = seq.AlignmentList()
1976 s1 = ref_aln.GetSequence(0)
1977 s2 = ref_aln.GetSequence(ref_chains.index(ref_ch))
1978 aln_list.append(seq.CreateAlignment(s1, s2))
1980 aln_list.append(mdl_alns[mdl_ch])
1982 ref_seq = seq.CreateSequence(s1.GetName(),
1983 s1.GetGaplessString())
1984 merged_aln = seq.alg.MergePairwiseAlignments(aln_list,
1991 s2 = merged_aln.GetSequence(1)
1992 s3 = merged_aln.GetSequence(2)
1995 for idx
in range(len(s2)):
1996 if s2[idx] !=
'-' or s3[idx] !=
'-':
2000 for idx
in reversed(range(len(s2))):
2001 if s2[idx] !=
'-' or s3[idx] !=
'-':
2004 s2 = seq.CreateSequence(s2.GetName(), s2[a: len(s2)-b])
2005 s3 = seq.CreateSequence(s3.GetName(), s3[a: len(s3)-b])
2006 ref_mdl_alns[(ref_ch, mdl_ch)] = seq.CreateAlignment(s2, s3)
2010 def _CheckOneToOneMapping(ref_chains, mdl_chains):
2011 """ Checks whether we already have a perfect one to one mapping
2013 That means each list in *ref_chains* has exactly one element and each
2014 list in *mdl_chains* has either one element (it's mapped) or is empty
2015 (ref chain has no mapped mdl chain). Returns None if no such mapping
2018 :param ref_chains: corresponds to :attr:`ChainMapper.chem_groups`
2019 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
2020 :param mdl_chains: mdl chains mapped to chem groups in *ref_chains*, i.e.
2021 the return value of :func:`ChainMapper.GetChemMapping`
2022 :type mdl_chains: class:`list` of :class:`list` of :class:`str`
2023 :returns: A :class:`list` of :class:`list` if a one to one mapping is found,
2026 only_one_to_one =
True
2028 for ref, mdl
in zip(ref_chains, mdl_chains):
2029 if len(ref) == 1
and len(mdl) == 1:
2030 one_to_one.append(mdl)
2031 elif len(ref) == 1
and len(mdl) == 0:
2032 one_to_one.append([
None])
2034 only_one_to_one =
False
2042 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2043 ref_mdl_alns, inclusion_radius = 15.0,
2044 thresholds = [0.5, 1.0, 2.0, 4.0],
2045 steep_opt_rate = None):
2047 """ Greedy extension of already existing but incomplete chain mappings
2049 super().
__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2050 dist_thresh=inclusion_radius,
2051 dist_diff_thresholds=thresholds)
2057 for interface
in self.
trgtrg.interacting_chains:
2058 self.
neighborsneighbors[interface[0]].add(interface[1])
2059 self.
neighborsneighbors[interface[1]].add(interface[0])
2061 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2066 for g_idx, (ref_g, mdl_g)
in enumerate(zip(ref_chem_groups,
2076 for interface
in self.
mdlmdl.potentially_contributing_chains:
2077 self.
mdl_neighborsmdl_neighbors[interface[0]].add(interface[1])
2078 self.
mdl_neighborsmdl_neighbors[interface[1]].add(interface[0])
2082 if len(mapping) == 0:
2083 raise RuntimError(
"Mapping must contain a starting point")
2090 for ref_ch
in mapping.keys():
2091 map_targets.update(self.
neighborsneighbors[ref_ch])
2094 for ref_ch
in mapping.keys():
2095 map_targets.discard(ref_ch)
2097 if len(map_targets) == 0:
2101 free_mdl_chains = list()
2103 tmp = [x
for x
in chem_group
if x
not in mapping.values()]
2104 free_mdl_chains.append(set(tmp))
2107 newly_mapped_ref_chains = list()
2109 something_happened =
True
2110 while something_happened:
2111 something_happened=
False
2114 n_chains = len(newly_mapped_ref_chains)
2115 if n_chains > 0
and n_chains % self.
steep_opt_ratesteep_opt_rate == 0:
2116 mapping = self.
_SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2118 if max_ext
is not None and len(newly_mapped_ref_chains) >= max_ext:
2123 for ref_ch
in map_targets:
2125 for mdl_ch
in free_mdl_chains[chem_group_idx]:
2127 n_single = self.
_NSCConserved_NSCConserved(ref_ch, mdl_ch).sum()
2130 for neighbor
in self.
neighborsneighbors[ref_ch]:
2131 if neighbor
in mapping
and mapping[neighbor]
in \
2134 (mdl_ch, mapping[neighbor])).sum()
2135 n = n_single + n_inter
2137 if n_inter > 0
and n > max_n:
2142 max_mapping = (ref_ch, mdl_ch)
2145 something_happened =
True
2147 mapping[max_mapping[0]] = max_mapping[1]
2151 for neighbor
in self.
neighborsneighbors[max_mapping[0]]:
2152 if neighbor
not in mapping:
2153 map_targets.add(neighbor)
2156 map_targets.remove(max_mapping[0])
2160 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2163 newly_mapped_ref_chains.append(max_mapping[0])
2167 def _SteepOpt(self, mapping, chains_to_optimize=None):
2170 if chains_to_optimize
is None:
2171 chains_to_optimize = mapping.keys()
2174 ref_chains = [x
for x
in chains_to_optimize
if mapping[x]
is not None]
2178 for ch
in ref_chains:
2180 if chem_group_idx
in tmp:
2181 tmp[chem_group_idx].append(ch)
2183 tmp[chem_group_idx] = [ch]
2184 chem_groups = list(tmp.values())
2189 something_happened =
True
2190 while something_happened:
2191 something_happened =
False
2192 for chem_group
in chem_groups:
2193 if something_happened:
2195 for ch1, ch2
in itertools.combinations(chem_group, 2):
2196 swapped_mapping = dict(mapping)
2197 swapped_mapping[ch1] = mapping[ch2]
2198 swapped_mapping[ch2] = mapping[ch1]
2200 if score > current_lddt:
2201 something_happened =
True
2202 mapping = swapped_mapping
2203 current_lddt = score
2208 def _lDDTNaive(trg, mdl, inclusion_radius, thresholds, chem_groups,
2209 chem_mapping, ref_mdl_alns, n_max_naive):
2210 """ Naively iterates all possible chain mappings and returns the best
2217 dist_thresh=inclusion_radius,
2218 dist_diff_thresholds=thresholds)
2219 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2220 lDDT = lddt_scorer.Score(mapping, check=
False)
2221 if lDDT > best_lddt:
2222 best_mapping = mapping
2225 return (best_mapping, best_lddt)
2228 def _GetSeeds(ref_chem_groups, mdl_chem_groups, mapped_ref_chains = set(),
2229 mapped_mdl_chains = set()):
2231 for ref_chains, mdl_chains
in zip(ref_chem_groups,
2233 for ref_ch
in ref_chains:
2234 if ref_ch
not in mapped_ref_chains:
2235 for mdl_ch
in mdl_chains:
2236 if mdl_ch
not in mapped_mdl_chains:
2237 seeds.append((ref_ch, mdl_ch))
2241 def _lDDTGreedyFast(the_greed):
2243 something_happened =
True
2246 while something_happened:
2247 something_happened =
False
2248 seeds = _GetSeeds(the_greed.ref_chem_groups,
2249 the_greed.mdl_chem_groups,
2250 mapped_ref_chains = set(mapping.keys()),
2251 mapped_mdl_chains = set(mapping.values()))
2256 n = the_greed._NSCConserved(seed[0], seed[1]).sum()
2263 mapping[best_seed[0]] = best_seed[1]
2264 mapping = the_greed.ExtendMapping(mapping)
2265 something_happened =
True
2268 final_mapping = list()
2269 for ref_chains
in the_greed.ref_chem_groups:
2270 mapped_mdl_chains = list()
2271 for ref_ch
in ref_chains:
2272 if ref_ch
in mapping:
2273 mapped_mdl_chains.append(mapping[ref_ch])
2275 mapped_mdl_chains.append(
None)
2276 final_mapping.append(mapped_mdl_chains)
2278 return final_mapping
2281 def _lDDTGreedyFull(the_greed):
2282 """ Uses each reference chain as starting point for expansion
2285 seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2286 best_overall_score = -1.0
2287 best_overall_mapping = dict()
2292 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2295 something_happened =
True
2296 while something_happened:
2297 something_happened =
False
2298 remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2299 the_greed.mdl_chem_groups,
2300 mapped_ref_chains = set(mapping.keys()),
2301 mapped_mdl_chains = set(mapping.values()))
2302 if len(remnant_seeds) > 0:
2306 for remnant_seed
in remnant_seeds:
2307 tmp_mapping = dict(mapping)
2308 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2309 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2310 score = the_greed.FromFlatMapping(tmp_mapping)
2311 if score > best_score:
2313 best_mapping = tmp_mapping
2314 if best_mapping
is not None:
2315 something_happened =
True
2316 mapping = best_mapping
2318 score = the_greed.FromFlatMapping(mapping)
2319 if score > best_overall_score:
2320 best_overall_score = score
2321 best_overall_mapping = mapping
2323 mapping = best_overall_mapping
2326 final_mapping = list()
2327 for ref_chains
in the_greed.ref_chem_groups:
2328 mapped_mdl_chains = list()
2329 for ref_ch
in ref_chains:
2330 if ref_ch
in mapping:
2331 mapped_mdl_chains.append(mapping[ref_ch])
2333 mapped_mdl_chains.append(
None)
2334 final_mapping.append(mapped_mdl_chains)
2336 return final_mapping
2339 def _lDDTGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2340 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2341 respective chem groups and compute single chain lDDTs. The
2342 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2343 and the best scoring one is exhaustively extended.
2346 if seed_size
is None or seed_size < 1:
2347 raise RuntimeError(f
"seed_size must be an int >= 1 (got {seed_size})")
2349 if blocks_per_chem_group
is None or blocks_per_chem_group < 1:
2350 raise RuntimeError(f
"blocks_per_chem_group must be an int >= 1 "
2351 f
"(got {blocks_per_chem_group})")
2353 max_ext = seed_size - 1
2355 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2356 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2359 something_happened =
True
2360 while something_happened:
2361 something_happened =
False
2362 starting_blocks = list()
2363 for ref_chains, mdl_chains
in zip(ref_chem_groups, mdl_chem_groups):
2364 if len(mdl_chains) == 0:
2366 ref_chains_copy = list(ref_chains)
2367 for i
in range(blocks_per_chem_group):
2368 if len(ref_chains_copy) == 0:
2371 for ref_ch
in ref_chains_copy:
2372 seeds += [(ref_ch, mdl_ch)
for mdl_ch
in mdl_chains]
2379 seed = dict(mapping)
2380 seed.update({s[0]: s[1]})
2381 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2382 seed_lddt = the_greed.FromFlatMapping(seed)
2383 if seed_lddt > best_score:
2384 best_score = seed_lddt
2387 if best_mapping !=
None:
2388 starting_blocks.append(best_mapping)
2389 if best_seed[0]
in ref_chains_copy:
2391 ref_chains_copy.remove(best_seed[0])
2396 for seed
in starting_blocks:
2397 seed = the_greed.ExtendMapping(seed)
2398 seed_lddt = the_greed.FromFlatMapping(seed)
2399 if seed_lddt > best_lddt:
2400 best_lddt = seed_lddt
2403 if best_lddt == 0.0:
2406 something_happened =
True
2407 mapping.update(best_mapping)
2408 for ref_ch, mdl_ch
in best_mapping.items():
2409 for group_idx
in range(len(ref_chem_groups)):
2410 if ref_ch
in ref_chem_groups[group_idx]:
2411 ref_chem_groups[group_idx].remove(ref_ch)
2412 if mdl_ch
in mdl_chem_groups[group_idx]:
2413 mdl_chem_groups[group_idx].remove(mdl_ch)
2416 final_mapping = list()
2417 for ref_chains
in the_greed.ref_chem_groups:
2418 mapped_mdl_chains = list()
2419 for ref_ch
in ref_chains:
2420 if ref_ch
in mapping:
2421 mapped_mdl_chains.append(mapping[ref_ch])
2423 mapped_mdl_chains.append(
None)
2424 final_mapping.append(mapped_mdl_chains)
2426 return final_mapping
2430 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2431 ref_mdl_alns, contact_d = 12.0,
2432 steep_opt_rate = None, greedy_prune_contact_map=False):
2433 """ Greedy extension of already existing but incomplete chain mappings
2435 super().
__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2436 contact_d = contact_d)
2442 if greedy_prune_contact_map:
2444 for p
in self.
qsent1qsent1.interacting_chains:
2445 if np.count_nonzero(self.
qsent1qsent1.PairDist(p[0], p[1])<=8) >= 3:
2450 for p
in self.
qsent2qsent2.interacting_chains:
2451 if np.count_nonzero(self.
qsent2qsent2.PairDist(p[0], p[1])<=8) >= 3:
2457 self.
neighborsneighbors = {k: set()
for k
in self.
qsent1qsent1.chain_names}
2458 for p
in self.
qsent1qsent1.interacting_chains:
2463 for p
in self.
qsent2qsent2.interacting_chains:
2467 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2472 for g_idx, (ref_g, mdl_g)
in enumerate(zip(ref_chem_groups,
2484 for ch
in self.
refref.chains:
2485 single_chain_ref = _CSel(self.
refref, [ch.GetName()])
2492 alns[mdl_ch] = self.
ref_mdl_alnsref_mdl_alns[(ref_ch, mdl_ch)]
2493 mdl_sel = _CSel(self.
mdlmdl, [mdl_ch])
2495 _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2496 residue_mapping=alns,
2497 return_dist_test=
True,
2499 chain_mapping={mdl_ch: ref_ch},
2500 check_resnames=
False)
2506 if len(mapping) == 0:
2507 raise RuntimError(
"Mapping must contain a starting point")
2514 for ref_ch
in mapping.keys():
2515 map_targets.update(self.
neighborsneighbors[ref_ch])
2518 for ref_ch
in mapping.keys():
2519 map_targets.discard(ref_ch)
2521 if len(map_targets) == 0:
2525 free_mdl_chains = list()
2527 tmp = [x
for x
in chem_group
if x
not in mapping.values()]
2528 free_mdl_chains.append(set(tmp))
2531 newly_mapped_ref_chains = list()
2533 something_happened =
True
2534 while something_happened:
2535 something_happened=
False
2538 n_chains = len(newly_mapped_ref_chains)
2539 if n_chains > 0
and n_chains % self.
steep_opt_ratesteep_opt_rate == 0:
2540 mapping = self.
_SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2542 if max_ext
is not None and len(newly_mapped_ref_chains) >= max_ext:
2546 old_score = score_result.QS_global
2547 nominator = score_result.weighted_scores
2548 denominator = score_result.weight_sum + score_result.weight_extra_all
2552 for ref_ch
in map_targets:
2554 for mdl_ch
in free_mdl_chains[chem_group_idx]:
2557 nominator_diff = 0.0
2558 denominator_diff = 0.0
2559 for neighbor
in self.
neighborsneighbors[ref_ch]:
2560 if neighbor
in mapping
and mapping[neighbor]
in \
2564 int1 = (ref_ch, neighbor)
2565 int2 = (mdl_ch, mapping[neighbor])
2568 denominator_diff += b
2569 denominator_diff += d
2575 if nominator_diff > 0:
2578 new_nominator = nominator + nominator_diff
2579 new_denominator = denominator + denominator_diff
2581 if new_denominator != 0.0:
2582 new_score = new_nominator/new_denominator
2583 diff = new_score - old_score
2586 max_mapping = (ref_ch, mdl_ch)
2588 if max_mapping
is not None:
2589 something_happened =
True
2591 mapping[max_mapping[0]] = max_mapping[1]
2595 for neighbor
in self.
neighborsneighbors[max_mapping[0]]:
2596 if neighbor
not in mapping:
2597 map_targets.add(neighbor)
2600 map_targets.remove(max_mapping[0])
2604 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2607 newly_mapped_ref_chains.append(max_mapping[0])
2611 def _SteepOpt(self, mapping, chains_to_optimize=None):
2614 if chains_to_optimize
is None:
2615 chains_to_optimize = mapping.keys()
2618 ref_chains = [x
for x
in chains_to_optimize
if mapping[x]
is not None]
2622 for ch
in ref_chains:
2624 if chem_group_idx
in tmp:
2625 tmp[chem_group_idx].append(ch)
2627 tmp[chem_group_idx] = [ch]
2628 chem_groups = list(tmp.values())
2633 current_score = score_result.QS_global
2634 something_happened =
True
2635 while something_happened:
2636 something_happened =
False
2637 for chem_group
in chem_groups:
2638 if something_happened:
2640 for ch1, ch2
in itertools.combinations(chem_group, 2):
2641 swapped_mapping = dict(mapping)
2642 swapped_mapping[ch1] = mapping[ch2]
2643 swapped_mapping[ch2] = mapping[ch1]
2645 if score_result.QS_global > current_score:
2646 something_happened =
True
2647 mapping = swapped_mapping
2648 current_score = score_result.QS_global
2653 def _QSScoreNaive(trg, mdl, chem_groups, chem_mapping, ref_mdl_alns, contact_d,
2660 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2661 score_result = qs_scorer.Score(mapping, check=
False)
2662 if score_result.QS_global > best_score:
2663 best_mapping = mapping
2664 best_score = score_result.QS_global
2665 return (best_mapping, best_score)
2668 def _QSScoreGreedyFast(the_greed):
2670 something_happened =
True
2672 while something_happened:
2673 something_happened =
False
2677 seeds = _GetSeeds(the_greed.ref_chem_groups,
2678 the_greed.mdl_chem_groups,
2679 mapped_ref_chains = set(mapping.keys()),
2680 mapped_mdl_chains = set(mapping.values()))
2682 n = the_greed.SCCounts(seed[0], seed[1])
2689 mapping[best_seed[0]] = best_seed[1]
2690 mapping = the_greed.ExtendMapping(mapping)
2691 something_happened =
True
2694 final_mapping = list()
2695 for ref_chains
in the_greed.ref_chem_groups:
2696 mapped_mdl_chains = list()
2697 for ref_ch
in ref_chains:
2698 if ref_ch
in mapping:
2699 mapped_mdl_chains.append(mapping[ref_ch])
2701 mapped_mdl_chains.append(
None)
2702 final_mapping.append(mapped_mdl_chains)
2704 return final_mapping
2707 def _QSScoreGreedyFull(the_greed):
2708 """ Uses each reference chain as starting point for expansion
2711 seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2712 best_overall_score = -1.0
2713 best_overall_mapping = dict()
2718 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2721 something_happened =
True
2722 while something_happened:
2723 something_happened =
False
2724 remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2725 the_greed.mdl_chem_groups,
2726 mapped_ref_chains = set(mapping.keys()),
2727 mapped_mdl_chains = set(mapping.values()))
2728 if len(remnant_seeds) > 0:
2732 for remnant_seed
in remnant_seeds:
2733 tmp_mapping = dict(mapping)
2734 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2735 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2736 score_result = the_greed.FromFlatMapping(tmp_mapping)
2737 if score_result.QS_global > best_score:
2738 best_score = score_result.QS_global
2739 best_mapping = tmp_mapping
2740 if best_mapping
is not None:
2741 something_happened =
True
2742 mapping = best_mapping
2744 score_result = the_greed.FromFlatMapping(mapping)
2745 if score_result.QS_global > best_overall_score:
2746 best_overall_score = score_result.QS_global
2747 best_overall_mapping = mapping
2749 mapping = best_overall_mapping
2752 final_mapping = list()
2753 for ref_chains
in the_greed.ref_chem_groups:
2754 mapped_mdl_chains = list()
2755 for ref_ch
in ref_chains:
2756 if ref_ch
in mapping:
2757 mapped_mdl_chains.append(mapping[ref_ch])
2759 mapped_mdl_chains.append(
None)
2760 final_mapping.append(mapped_mdl_chains)
2762 return final_mapping
2765 def _QSScoreGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2766 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2767 respective chem groups and compute single chain lDDTs. The
2768 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2769 and the best scoring one with respect to QS score is exhaustively extended.
2772 if seed_size
is None or seed_size < 1:
2773 raise RuntimeError(f
"seed_size must be an int >= 1 (got {seed_size})")
2775 if blocks_per_chem_group
is None or blocks_per_chem_group < 1:
2776 raise RuntimeError(f
"blocks_per_chem_group must be an int >= 1 "
2777 f
"(got {blocks_per_chem_group})")
2779 max_ext = seed_size - 1
2781 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2782 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2786 something_happened =
True
2787 while something_happened:
2788 something_happened =
False
2789 starting_blocks = list()
2790 for ref_chains, mdl_chains
in zip(ref_chem_groups, mdl_chem_groups):
2791 if len(mdl_chains) == 0:
2793 ref_chains_copy = list(ref_chains)
2794 for i
in range(blocks_per_chem_group):
2795 if len(ref_chains_copy) == 0:
2798 for ref_ch
in ref_chains_copy:
2799 seeds += [(ref_ch, mdl_ch)
for mdl_ch
in mdl_chains]
2806 seed = dict(mapping)
2807 seed.update({s[0]: s[1]})
2808 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2809 score_result = the_greed.FromFlatMapping(seed)
2810 if score_result.QS_global > best_score:
2811 best_score = score_result.QS_global
2814 if best_mapping !=
None:
2815 starting_blocks.append(best_mapping)
2816 if best_seed[0]
in ref_chains_copy:
2818 ref_chains_copy.remove(best_seed[0])
2823 for seed
in starting_blocks:
2824 seed = the_greed.ExtendMapping(seed)
2825 score_result = the_greed.FromFlatMapping(seed)
2826 if score_result.QS_global > best_score:
2827 best_score = score_result.QS_global
2830 if best_mapping
is not None and len(best_mapping) > len(mapping):
2833 something_happened =
True
2834 mapping.update(best_mapping)
2835 for ref_ch, mdl_ch
in best_mapping.items():
2836 for group_idx
in range(len(ref_chem_groups)):
2837 if ref_ch
in ref_chem_groups[group_idx]:
2838 ref_chem_groups[group_idx].remove(ref_ch)
2839 if mdl_ch
in mdl_chem_groups[group_idx]:
2840 mdl_chem_groups[group_idx].remove(mdl_ch)
2843 final_mapping = list()
2844 for ref_chains
in the_greed.ref_chem_groups:
2845 mapped_mdl_chains = list()
2846 for ref_ch
in ref_chains:
2847 if ref_ch
in mapping:
2848 mapped_mdl_chains.append(mapping[ref_ch])
2850 mapped_mdl_chains.append(
None)
2851 final_mapping.append(mapped_mdl_chains)
2853 return final_mapping
2855 def _SingleRigidRMSD(initial_transforms, initial_mappings, chem_groups,
2856 chem_mapping, trg_group_pos, mdl_group_pos):
2858 Takes initial transforms and sequentially adds chain pairs with lowest RMSD.
2859 The mapping from the transform that leads to lowest overall RMSD is
2862 best_mapping = dict()
2863 best_ssd = float(
"inf")
2867 for transform
in initial_transforms:
2869 mapped_mdl_chains = set()
2871 for trg_chains, mdl_chains, trg_pos, mdl_pos,
in zip(chem_groups,
2875 if len(trg_pos) == 0
or len(mdl_pos) == 0:
2879 for m_pos
in mdl_pos:
2881 t_m_pos.ApplyTransform(transform)
2882 t_mdl_pos.append(t_m_pos)
2883 for t_pos, t
in zip(trg_pos, trg_chains):
2884 for t_m_pos, m
in zip(t_mdl_pos, mdl_chains):
2885 ssd = t_pos.GetSummedSquaredDistances(t_m_pos)
2886 ssds.append((ssd, (t,m)))
2890 if p[0]
not in mapping
and p[1]
not in mapped_mdl_chains:
2891 mapping[p[0]] = p[1]
2892 mapped_mdl_chains.add(p[1])
2897 best_mapping = mapping
2901 def _IterativeRigidRMSD(initial_transforms, initial_mappings, chem_groups,
2902 chem_mapping, trg_group_pos, mdl_group_pos):
2903 """ Takes initial transforms and sequentially adds chain pairs with
2904 lowest RMSD. With each added chain pair, the transform gets updated.
2905 Thus the naming iterative. The mapping from the initial transform that
2906 leads to best overall RMSD score is returned.
2910 trg_pos_dict = dict()
2911 for trg_pos, trg_chains
in zip(trg_group_pos, chem_groups):
2912 for t_pos, t
in zip(trg_pos, trg_chains):
2913 trg_pos_dict[t] = t_pos
2914 mdl_pos_dict = dict()
2915 for mdl_pos, mdl_chains
in zip(mdl_group_pos, chem_mapping):
2916 for m_pos, m
in zip(mdl_pos, mdl_chains):
2917 mdl_pos_dict[m] = m_pos
2919 best_mapping = dict()
2920 best_rmsd = float(
"inf")
2921 for initial_transform, initial_mapping
in zip(initial_transforms,
2923 mapping = {initial_mapping[0]: initial_mapping[1]}
2924 transform =
geom.Mat4(initial_transform)
2925 mapped_trg_pos =
geom.Vec3List(trg_pos_dict[initial_mapping[0]])
2926 mapped_mdl_pos =
geom.Vec3List(mdl_pos_dict[initial_mapping[1]])
2930 trg_chain_groups = [set(group)
for group
in chem_groups]
2931 mdl_chain_groups = [set(group)
for group
in chem_mapping]
2934 for group
in trg_chain_groups:
2935 if initial_mapping[0]
in group:
2936 group.remove(initial_mapping[0])
2938 for group
in mdl_chain_groups:
2939 if initial_mapping[1]
in group:
2940 group.remove(initial_mapping[1])
2943 something_happened =
True
2944 while something_happened:
2946 something_happened=
False
2947 best_sc_mapping =
None
2948 best_sc_group_idx =
None
2949 best_sc_rmsd = float(
"inf")
2951 for trg_chains, mdl_chains
in zip(trg_chain_groups, mdl_chain_groups):
2952 for t
in trg_chains:
2953 t_pos = trg_pos_dict[t]
2954 for m
in mdl_chains:
2955 m_pos = mdl_pos_dict[m]
2957 t_m_pos.ApplyTransform(transform)
2958 rmsd = t_pos.GetRMSD(t_m_pos)
2959 if rmsd < best_sc_rmsd:
2961 best_sc_mapping = (t,m)
2962 best_sc_group_idx = group_idx
2965 if best_sc_mapping
is not None:
2966 something_happened =
True
2967 mapping[best_sc_mapping[0]] = best_sc_mapping[1]
2968 mapped_trg_pos.extend(trg_pos_dict[best_sc_mapping[0]])
2969 mapped_mdl_pos.extend(mdl_pos_dict[best_sc_mapping[1]])
2970 trg_chain_groups[best_sc_group_idx].remove(best_sc_mapping[0])
2971 mdl_chain_groups[best_sc_group_idx].remove(best_sc_mapping[1])
2973 transform = _GetTransform(mapped_mdl_pos, mapped_trg_pos,
2977 mapped_mdl_pos.ApplyTransform(transform)
2978 rmsd = mapped_trg_pos.GetRMSD(mapped_mdl_pos)
2980 if rmsd < best_rmsd:
2982 best_mapping = mapping
2986 def _NaiveRMSD(chem_groups, chem_mapping, trg_group_pos, mdl_group_pos,
2990 trg_pos_dict = dict()
2991 for trg_pos, trg_chains
in zip(trg_group_pos, chem_groups):
2992 for t_pos, t
in zip(trg_pos, trg_chains):
2993 trg_pos_dict[t] = t_pos
2994 mdl_pos_dict = dict()
2995 for mdl_pos, mdl_chains
in zip(mdl_group_pos, chem_mapping):
2996 for m_pos, m
in zip(mdl_pos, mdl_chains):
2997 mdl_pos_dict[m] = m_pos
2999 best_mapping = dict()
3000 best_rmsd = float(
"inf")
3002 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
3005 for trg_group, mdl_group
in zip(chem_groups, mapping):
3006 for trg_ch, mdl_ch
in zip(trg_group, mdl_group):
3007 if trg_ch
is not None and mdl_ch
is not None:
3008 trg_pos.extend(trg_pos_dict[trg_ch])
3009 mdl_pos.extend(mdl_pos_dict[mdl_ch])
3010 superpose_res = mol.alg.SuperposeSVD(mdl_pos, trg_pos)
3011 if superpose_res.rmsd < best_rmsd:
3012 best_rmsd = superpose_res.rmsd
3013 best_mapping = mapping
3017 for chem_group, mapping
in zip(chem_groups, best_mapping):
3018 for trg_ch, mdl_ch
in zip(chem_group, mapping):
3019 tmp[trg_ch] = mdl_ch
3024 def _GetRefPos(trg, mdl, trg_msas, mdl_alns, max_pos = None):
3025 """ Extracts reference positions which are present in trg and mdl
3030 bb_trg = trg.Select(
"aname=\"CA\",\"C3'\"")
3031 bb_mdl = mdl.Select(
"aname=\"CA\",\"C3'\"")
3035 for aln_list
in mdl_alns:
3036 if len(aln_list) > 0:
3037 tmp = aln_list[0].GetSequence(0)
3038 ref_seq = seq.CreateSequence(tmp.GetName(), tmp.GetGaplessString())
3039 mdl_msas.append(seq.alg.MergePairwiseAlignments(aln_list, ref_seq))
3041 mdl_msas.append(seq.CreateAlignment())
3046 for trg_msa, mdl_msa
in zip(trg_msas, mdl_msas):
3048 if mdl_msa.GetCount() > 0:
3050 assert(trg_msa.GetSequence(0).GetGaplessString() == \
3051 mdl_msa.GetSequence(0).GetGaplessString())
3060 trg_indices = _GetFullyCoveredIndices(trg_msa)
3061 mdl_indices = _GetFullyCoveredIndices(mdl_msa)
3064 indices = sorted(list(trg_indices.intersection(mdl_indices)))
3067 if max_pos
is not None and len(indices) > max_pos:
3068 step = int(len(indices)/max_pos)
3069 indices = [indices[i]
for i
in range(0, len(indices), step)]
3072 trg_indices = _RefIndicesToColumnIndices(trg_msa, indices)
3073 mdl_indices = _RefIndicesToColumnIndices(mdl_msa, indices)
3076 trg_pos.append(list())
3077 mdl_pos.append(list())
3078 for s_idx
in range(trg_msa.GetCount()):
3079 trg_pos[-1].append(_ExtractMSAPos(trg_msa, s_idx, trg_indices,
3082 for s_idx
in range(1, mdl_msa.GetCount()):
3083 mdl_pos[-1].append(_ExtractMSAPos(mdl_msa, s_idx, mdl_indices,
3086 return (trg_pos, mdl_pos)
3088 def _GetFullyCoveredIndices(msa):
3089 """ Helper for _GetRefPos
3091 Returns a set containing the indices relative to first sequence in msa which
3092 are fully covered in all other sequences
3103 if sum([1
for olc
in col
if olc !=
'-']) == col.GetRowCount():
3104 indices.add(ref_idx)
3109 def _RefIndicesToColumnIndices(msa, indices):
3110 """ Helper for _GetRefPos
3112 Returns a list of mapped indices. indices refer to non-gap one letter
3113 codes in the first msa sequence. The returnes mapped indices are translated
3114 to the according msa column indices
3118 for col_idx, col
in enumerate(msa):
3120 mapping[ref_idx] = col_idx
3122 return [mapping[i]
for i
in indices]
3124 def _ExtractMSAPos(msa, s_idx, indices, view):
3125 """ Helper for _GetRefPos
3127 Returns a geom.Vec3List containing positions refering to given msa sequence.
3128 => Chain with corresponding name is mapped onto sequence and the position of
3129 the first atom of each residue specified in indices is extracted.
3130 Indices refers to column indices in msa!
3132 s = msa.GetSequence(s_idx)
3133 s_v = _CSel(view, [s.GetName()])
3136 assert(len(s.GetGaplessString()) == len(s_v.residues))
3138 residue_idx = [s.GetResidueIndex(i)
for i
in indices]
3139 return geom.Vec3List([s_v.residues[i].atoms[0].pos
for i
in residue_idx])
3141 def _NChemGroupMappings(ref_chains, mdl_chains):
3142 """ Number of mappings within one chem group
3144 :param ref_chains: Reference chains
3145 :type ref_chains: :class:`list` of :class:`str`
3146 :param mdl_chains: Model chains that are mapped onto *ref_chains*
3147 :type mdl_chains: :class:`list` of :class:`str`
3148 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3150 n_ref = len(ref_chains)
3151 n_mdl = len(mdl_chains)
3153 return factorial(n_ref)
3155 n_choose_k = binom(n_ref, n_mdl)
3156 return n_choose_k * factorial(n_mdl)
3158 n_choose_k = binom(n_mdl, n_ref)
3159 return n_choose_k * factorial(n_ref)
3161 def _NMappings(ref_chains, mdl_chains):
3162 """ Number of mappings for a full chem mapping
3164 :param ref_chains: Chem groups of reference
3165 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3166 :param mdl_chains: Model chains that map onto those chem groups
3167 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3168 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3170 assert(len(ref_chains) == len(mdl_chains))
3172 for a,b
in zip(ref_chains, mdl_chains):
3173 n *= _NChemGroupMappings(a,b)
3176 def _NMappingsWithin(ref_chains, mdl_chains, max_mappings):
3177 """ Check whether total number of mappings is smaller than given maximum
3179 In principle the same as :func:`_NMappings` but it stops as soon as the
3182 :param ref_chains: Chem groups of reference
3183 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3184 :param mdl_chains: Model chains that map onto those chem groups
3185 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3186 :param max_mappings: Number of max allowed mappings
3187 :returns: Whether number of possible mappings of *mdl_chains* onto
3188 *ref_chains* is below or equal *max_mappings*.
3190 assert(len(ref_chains) == len(mdl_chains))
3192 for a,b
in zip(ref_chains, mdl_chains):
3193 n *= _NChemGroupMappings(a,b)
3194 if n > max_mappings:
3198 def _RefSmallerGenerator(ref_chains, mdl_chains):
3199 """ Returns all possible ways to map mdl_chains onto ref_chains
3201 Specific for the case where len(ref_chains) < len(mdl_chains)
3203 for c
in itertools.combinations(mdl_chains, len(ref_chains)):
3204 for p
in itertools.permutations(c):
3207 def _RefLargerGenerator(ref_chains, mdl_chains):
3208 """ Returns all possible ways to map mdl_chains onto ref_chains
3210 Specific for the case where len(ref_chains) > len(mdl_chains)
3211 Ref chains without mapped mdl chain are assigned None
3213 n_ref = len(ref_chains)
3214 n_mdl = len(mdl_chains)
3215 for c
in itertools.combinations(range(n_ref), n_mdl):
3216 for p
in itertools.permutations(mdl_chains):
3217 ret_list = [
None] * n_ref
3218 for idx, ch
in zip(c, p):
3222 def _RefEqualGenerator(ref_chains, mdl_chains):
3223 """ Returns all possible ways to map mdl_chains onto ref_chains
3225 Specific for the case where len(ref_chains) == len(mdl_chains)
3227 for p
in itertools.permutations(mdl_chains):
3230 def _RefEmptyGenerator(ref_chains, mdl_chains):
3233 def _ConcatIterators(iterators):
3234 for item
in itertools.product(*iterators):
3237 def _ChainMappings(ref_chains, mdl_chains, n_max=None):
3238 """Returns all possible ways to map *mdl_chains* onto fixed *ref_chains*
3240 :param ref_chains: List of list of chemically equivalent chains in reference
3241 :type ref_chains: :class:`list` of :class:`list`
3242 :param mdl_chains: Equally long list of list of chemically equivalent chains
3243 in model that map on those ref chains.
3244 :type mdl_chains: :class:`list` of :class:`list`
3245 :param n_max: Aborts and raises :class:`RuntimeError` if max number of
3246 mappings is above this threshold.
3247 :type n_max: :class:`int`
3248 :returns: Iterator over all possible mappings of *mdl_chains* onto fixed
3249 *ref_chains*. Potentially contains None as padding when number of
3250 model chains for a certain mapping is smaller than the according
3252 Example: _ChainMappings([['A', 'B', 'C'], ['D', 'E']],
3253 [['x', 'y'], ['i', 'j']])
3254 gives an iterator over: [[['x', 'y', None], ['i', 'j']],
3255 [['x', 'y', None], ['j', 'i']],
3256 [['y', 'x', None], ['i', 'j']],
3257 [['y', 'x', None], ['j', 'i']],
3258 [['x', None, 'y'], ['i', 'j']],
3259 [['x', None, 'y'], ['j', 'i']],
3260 [['y', None, 'x'], ['i', 'j']],
3261 [['y', None, 'x'], ['j', 'i']],
3262 [[None, 'x', 'y'], ['i', 'j']],
3263 [[None, 'x', 'y'], ['j', 'i']],
3264 [[None, 'y', 'x'], ['i', 'j']],
3265 [[None, 'y', 'x'], ['j', 'i']]]
3267 assert(len(ref_chains) == len(mdl_chains))
3269 if n_max
is not None:
3270 if not _NMappingsWithin(ref_chains, mdl_chains, n_max):
3271 raise RuntimeError(f
"Too many mappings. Max allowed: {n_max}")
3276 for ref, mdl
in zip(ref_chains, mdl_chains):
3278 iterators.append(_RefEmptyGenerator(ref, mdl))
3279 elif len(ref) == len(mdl):
3280 iterators.append(_RefEqualGenerator(ref, mdl))
3281 elif len(ref) < len(mdl):
3282 iterators.append(_RefSmallerGenerator(ref, mdl))
3284 iterators.append(_RefLargerGenerator(ref, mdl))
3286 return _ConcatIterators(iterators)
3289 def _GetTransform(pos_one, pos_two, iterative):
3290 """ Computes minimal RMSD superposition for pos_one onto pos_two
3292 :param pos_one: Positions that should be superposed onto *pos_two*
3293 :type pos_one: :class:`geom.Vec3List`
3294 :param pos_two: Reference positions
3295 :type pos_two: :class:`geom.Vec3List`
3296 :iterative: Whether iterative superposition should be used. Iterative
3297 potentially raises, uses standard superposition as fallback.
3298 :type iterative: :class:`bool`
3299 :returns: Transformation matrix to superpose *pos_one* onto *pos_two*
3300 :rtype: :class:`geom.Mat4`
3305 res = mol.alg.IterativeSuperposeSVD(pos_one, pos_two)
3309 res = mol.alg.SuperposeSVD(pos_one, pos_two)
3310 return res.transformation
3313 __all__ = (
'ChainMapper',
'ReprResult',
'MappingResult')
def _NSCConserved(self, trg_ch, mdl_ch)
def _NPairConserved(self, trg_int, mdl_int)
def FromFlatMapping(self, flat_mapping)
def __init__(self, pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-5, pep_gap_ext=-2, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, resnum_aln=False)
def Align(self, s1, s2, chem_type=None)
def NWAlign(self, s1, s2, chem_type)
def ResNumAlign(self, s1, s2)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, contact_d=12.0, steep_opt_rate=None, greedy_prune_contact_map=False)
def SCCounts(self, ref_ch, mdl_ch)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], steep_opt_rate=None)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def chem_group_ref_seqs(self)
def chem_group_alignments(self)
def GetlDDTMapping(self, model, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic", steep_opt_rate=None, block_seed_size=5, block_blocks_per_chem_group=5, chem_mapping_result=None, heuristic_n_max_naive=40320)
def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False, only_interchain=False, chem_mapping_result=None, global_mapping=None)
def GetNMappings(self, model)
def GetChemMapping(self, model)
def GetMapping(self, model, n_max_naive=40320)
def __init__(self, target, resnum_alignments=False, pep_seqid_thr=95., pep_gap_thr=1.0, nuc_seqid_thr=95., nuc_gap_thr=1.0, pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-11, pep_gap_ext=-1, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, min_pep_length=6, min_nuc_length=4, n_max_naive=1e8)
def ProcessStructure(self, ent)
def Align(self, s1, s2, stype)
def GetQSScoreMapping(self, model, contact_d=12.0, strategy="heuristic", block_seed_size=5, block_blocks_per_chem_group=5, heuristic_n_max_naive=40320, steep_opt_rate=None, chem_mapping_result=None, greedy_prune_contact_map=True)
def chem_group_types(self)
def GetRMSDMapping(self, model, strategy="heuristic", subsampling=50, chem_mapping_result=None, heuristic_n_max_naive=120)
def GetFlatMapping(self, mdl_as_key=False)
def __init__(self, target, model, chem_groups, chem_mapping, mapping, alns, opt_score=None)
def mdl_full_bb_pos(self)
def _GetFullBBPos(self, residues)
def superposed_mdl_bb_pos(self)
def _GetBBPos(self, residues)
def inconsistent_residues(self)
def GetFlatChainMapping(self, mdl_as_key=False)
def ref_full_bb_pos(self)
def __init__(self, lDDT, substructure, ref_view, mdl_view)
def _GetInconsistentResidues(self, ref_residues, mdl_residues)
def _InterfacePenalty1(self, interface)
def _InterfacePenalty2(self, interface)
def _MappedInterfaceScores(self, int1, int2)
def FromFlatMapping(self, flat_mapping)