2Chain mapping aims to identify a one-to-one relationship between chains in a
3reference structure and a model.
11from scipy.special
import factorial
12from scipy.special
import binom
25 """ Returns view with specified chains
27 Ensures that quotation marks are around chain names to not confuse
28 OST query language with weird special characters.
30 query =
"cname=" +
','.join([mol.QueryQuoteName(cname)
for cname
in cnames])
31 return ent.Select(query)
34 """ Result object for the chain mapping functions in :class:`ChainMapper`
36 Constructor is directly called within the functions, no need to construct
37 such objects yourself.
39 def __init__(self, target, model, chem_groups, chem_mapping,
40 mdl_chains_without_chem_mapping, mapping, alns, opt_score=None):
52 """ Target/reference structure, i.e. :attr:`ChainMapper.target`
54 :type: :class:`ost.mol.EntityView`
60 """ Model structure that gets mapped onto :attr:`~target`
62 Underwent same processing as :attr:`ChainMapper.target`, i.e.
63 only contains peptide/nucleotide chains of sufficient size.
65 :type: :class:`ost.mol.EntityView`
71 """ Groups of chemically equivalent chains in :attr:`~target`
73 Same as :attr:`ChainMapper.chem_group`
75 :class:`list` of :class:`list` of :class:`str` (chain names)
81 """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`.
83 :class:`list` of :class:`list` of :class:`str` (chain names)
89 """ Model chains that cannot be mapped to :attr:`chem_groups`
91 Depends on parameterization of :class:`ChainMapper`
93 :class:`list` of class:`str` (chain names)
99 """ Mapping of :attr:`~model` chains onto :attr:`~target`
101 Exact same shape as :attr:`~chem_groups` but containing the names of the
102 mapped chains in :attr:`~model`. May contain None for :attr:`~target`
103 chains that are not covered. No guarantee that all chains in
104 :attr:`~model` are mapped.
106 :class:`list` of :class:`list` of :class:`str` (chain names)
112 """ Alignments of mapped chains in :attr:`~target` and :attr:`~model`
114 Each alignment is accessible with ``alns[(t_chain,m_chain)]``. First
115 sequence is the sequence of :attr:`target` chain, second sequence the
116 one from :attr:`~model`.
118 :type: :class:`dict` with key: :class:`tuple` of :class:`str`, value:
119 :class:`ost.seq.AlignmentHandle`
125 """ Placeholder property without any guarantee of being set
127 Different scores get optimized in the various chain mapping algorithms.
128 Some of them may set their final optimal score in that property.
129 Consult the documentation of the respective chain mapping algorithm
130 for more information. Won't be in the return dict of
136 """ Returns flat mapping as :class:`dict` for all mapable chains
138 :param mdl_as_key: Default is target chain name as key and model chain
139 name as value. This can be reversed with this flag.
140 :returns: :class:`dict` with :class:`str` as key/value that describe
143 flat_mapping = dict()
146 for a,b
in zip(trg_chem_group, mdl_chem_group):
147 if a
is not None and b
is not None:
155 """ Returns JSON serializable summary of results
161 json_dict[
"alns"] = list()
162 for aln
in self.
alns.values():
163 trg_seq = aln.GetSequence(0)
164 mdl_seq = aln.GetSequence(1)
165 aln_dict = {
"trg_ch": trg_seq.GetName(),
"trg_seq": str(trg_seq),
166 "mdl_ch": mdl_seq.GetName(),
"mdl_seq": str(mdl_seq)}
167 json_dict[
"alns"].append(aln_dict)
173 """ Result object for :func:`ChainMapper.GetRepr`
175 Constructor is directly called within the function, no need to construct
176 such objects yourself.
178 :param lDDT: lDDT for this mapping. Depends on how you call
179 :func:`ChainMapper.GetRepr` whether this is backbone only or
181 :type lDDT: :class:`float`
182 :param substructure: The full substructure for which we searched for a
184 :type substructure: :class:`ost.mol.EntityView`
185 :param ref_view: View pointing to the same underlying entity as
186 *substructure* but only contains the stuff that is mapped
187 :type ref_view: :class:`mol.EntityView`
188 :param mdl_view: The matching counterpart in model
189 :type mdl_view: :class:`mol.EntityView`
191 def __init__(self, lDDT, substructure, ref_view, mdl_view):
194 assert(len(ref_view.residues) == len(mdl_view.residues))
211 """ lDDT of representation result
213 Depends on how you call :func:`ChainMapper.GetRepr` whether this is
214 backbone only or full atom lDDT.
216 :type: :class:`float`
222 """ The full substructure for which we searched for a
225 :type: :class:`ost.mol.EntityView`
231 """ View which contains the mapped subset of :attr:`substructure`
233 :type: :class:`ost.mol.EntityView`
239 """ The :attr:`ref_view` representation in the model
241 :type: :class:`ost.mol.EntityView`
247 """ The reference residues
249 :type: class:`mol.ResidueViewList`
255 """ The model residues
257 :type: :class:`mol.ResidueViewList`
263 """ A list of mapped residue whose names do not match (eg. ALA in the
264 reference and LEU in the model).
266 The mismatches are reported as a tuple of :class:`~ost.mol.ResidueView`
267 (reference, model), or as an empty list if all the residue names match.
278 """ Representative backbone positions for reference residues.
280 Thats CA positions for peptides and C3' positions for Nucleotides.
282 :type: :class:`geom.Vec3List`
290 """ Representative backbone positions for model residues.
292 Thats CA positions for peptides and C3' positions for Nucleotides.
294 :type: :class:`geom.Vec3List`
302 """ Representative backbone positions for reference residues.
304 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
305 positions for Nucleotides.
307 :type: :class:`geom.Vec3List`
315 """ Representative backbone positions for reference residues.
317 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
318 positions for Nucleotides.
320 :type: :class:`geom.Vec3List`
329 """ Superposition of mdl residues onto ref residues
331 Superposition computed as minimal RMSD superposition on
332 :attr:`ref_bb_pos` and :attr:`mdl_bb_pos`. If number of positions is
333 smaller 3, the full_bb_pos equivalents are used instead.
335 :type: :class:`ost.mol.alg.SuperpositionResult`
348 """ Transformation to superpose mdl residues onto ref residues
350 :type: :class:`ost.geom.Mat4`
356 """ :attr:`mdl_bb_pos` with :attr:`transform applied`
358 :type: :class:`geom.Vec3List`
367 """ RMSD of the binding site backbone atoms after :attr:`superposition`
369 :type: :class:`float`
376 """ query for mdl residues in OpenStructure query language
378 Repr can be selected as ``full_mdl.Select(ost_query)``
380 Returns invalid query if residue numbers have insertion codes.
387 chname = r.GetChain().GetName()
388 rnum = r.GetNumber().GetNum()
389 if chname
not in chain_rnums:
390 chain_rnums[chname] = list()
391 chain_rnums[chname].append(str(rnum))
392 chain_queries = list()
393 for k,v
in chain_rnums.items():
394 q = f
"(cname={mol.QueryQuoteName(k)} and "
395 q += f
"rnum={','.join(v)})"
396 chain_queries.append(q)
401 """ Returns JSON serializable summary of results
404 json_dict[
"lDDT"] = self.
lDDT
405 json_dict[
"ref_residues"] = [r.GetQualifiedName()
for r
in \
407 json_dict[
"mdl_residues"] = [r.GetQualifiedName()
for r
in \
410 json_dict[
"bb_rmsd"] = self.
bb_rmsd
416 """ Returns flat mapping of all chains in the representation
418 :param mdl_as_key: Default is target chain name as key and model chain
419 name as value. This can be reversed with this flag.
420 :returns: :class:`dict` with :class:`str` as key/value that describe
423 flat_mapping = dict()
426 flat_mapping[mdl_res.chain.name] = trg_res.chain.name
428 flat_mapping[trg_res.chain.name] = mdl_res.chain.name
432 """ Helper to extract full backbone positions
434 exp_pep_atoms = [
"N",
"CA",
"C"]
435 exp_nuc_atoms = [
"O5'",
"C5'",
"C4'",
"C3'",
"O3'"]
438 if r.GetChemType() == mol.ChemType.NUCLEOTIDES:
439 exp_atoms = exp_nuc_atoms
440 elif r.GetChemType() == mol.ChemType.AMINOACIDS:
441 exp_atoms = exp_pep_atoms
443 raise RuntimeError(f
"Residue {r.qualified_name} has unexpected"
444 f
" chem type {r.GetChemType()}")
445 for aname
in exp_atoms:
446 a = r.FindAtom(aname)
448 raise RuntimeError(f
"Expected atom {aname} missing from "
449 f
"{r.qualified_name}")
450 bb_pos.append(a.GetPos())
454 """ Helper to extract single representative position for each residue
458 at = r.FindAtom(
"CA")
460 at = r.FindAtom(
"C3'")
462 raise RuntimeError(f
"Residue {r.qualified_name} missing "
463 f
"expected CA/C3' atom")
464 bb_pos.append(at.GetPos())
468 """ Helper to extract a list of inconsistent residues.
470 if len(ref_residues) != len(mdl_residues):
471 raise ValueError(
"Something terrible happened... Reference and "
472 "model lengths differ... RUN...")
473 inconsistent_residues = list()
474 for ref_residue, mdl_residue
in zip(ref_residues, mdl_residues):
475 if ref_residue.name != mdl_residue.name:
476 inconsistent_residues.append((ref_residue, mdl_residue))
477 return inconsistent_residues
481 """ Class to compute chain mappings
483 All algorithms are performed on processed structures which fulfill
484 criteria as given in constructor arguments (*min_pep_length*,
485 "min_nuc_length") and only contain residues which have all required backbone
486 atoms. for peptide residues thats N, CA, C and CB (no CB for GLY), for
487 nucleotide residues thats O5', C5', C4', C3' and O3'.
489 Chain mapping is a three step process:
491 * Group chemically identical chains in *target* into so called chem
492 groups. There are two options to achieve this:
493 1) using pairwise alignments that are either computed with
494 Needleman-Wunsch (NW) or simply derived from residue numbers
495 (*resnum_alignments* flag). In case of NW, *pep_subst_mat*, *pep_gap_open*
496 and *pep_gap_ext* and their nucleotide equivalents are relevant. Two
497 chains are considered identical if they fulfill the *pep_seqid_thr* and
498 have at least *min_pep_length* aligned residues. Same logic is applied
499 for nucleotides with respective thresholds.
500 2) specify seqres sequences and provide the mapping manually. This can
501 be useful if you're loading data from an mmCIF file where you have mmCIF
502 entity information. Alignments are performed based on residue numbers
503 in this case and don't consider the *resnum_alignments* flag. Any
504 mismatch of one letter code in the structure and the respective SEQRES
507 * Map chains in an input model to these groups. Parameters for generating
508 alignments are the same as above. Model chains are mapped to the chem
509 group with highest sequence identity. Optionally, you can set a minimum
510 sequence identity threshold with *mdl_map_pep_seqid_thr*/
511 *mdl_map_nuc_seqid_thr* to avoid nonsensical mappings. You can either
512 get the group mapping with :func:`GetChemMapping` or directly call
513 one of the full fletched one-to-one chain mapping functions which
514 execute that step internally.
516 * Obtain one-to-one mapping for chains in an input model and
517 *target* with one of the available mapping functions. Just to get an
518 idea of complexity. If *target* and *model* are octamers, there are
519 ``8! = 40320`` possible chain mappings.
521 :param target: Target structure onto which models are mapped.
522 Computations happen on a selection only containing
523 polypeptides and polynucleotides.
524 :type target: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
525 :param resnum_alignments: Use residue numbers instead of
526 Needleman-Wunsch to compute pairwise
527 alignments. Relevant for :attr:`~chem_groups`
528 if *seqres* is not specified explicitely.
529 :type resnum_alignments: :class:`bool`
530 :param pep_seqid_thr: Sequence identity threshold (in percent of aligned
531 columns) used to decide when two chains are
532 identical. 95 percent tolerates the few mutations
533 crystallographers like to do. Range: [0-100].
534 :type pep_seqid_thr: :class:`float`
535 :param nuc_seqid_thr: Nucleotide equivalent for *pep_seqid_thr*
536 :type nuc_seqid_thr: :class:`float`
537 :param pep_subst_mat: Substitution matrix to align peptide sequences,
538 irrelevant if *resnum_alignments* is True,
539 defaults to seq.alg.BLOSUM62
540 :type pep_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
541 :param pep_gap_open: Gap open penalty to align peptide sequences,
542 irrelevant if *resnum_alignments* is True
543 :type pep_gap_open: :class:`int`
544 :param pep_gap_ext: Gap extension penalty to align peptide sequences,
545 irrelevant if *resnum_alignments* is True
546 :type pep_gap_ext: :class:`int`
547 :param nuc_subst_mat: Nucleotide equivalent for *pep_subst_mat*,
548 defaults to seq.alg.NUC44
549 :type nuc_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
550 :param nuc_gap_open: Nucleotide equivalent for *pep_gap_open*
551 :type nuc_gap_open: :class:`int`
552 :param nuc_gap_ext: Nucleotide equivalent for *pep_gap_ext*
553 :type nuc_gap_ext: :class:`int`
554 :param min_pep_length: Minimal number of residues for a peptide chain to be
555 considered in target and in models.
556 :type min_pep_length: :class:`int`
557 :param min_nuc_length: Minimal number of residues for a nucleotide chain to be
558 considered in target and in models.
559 :type min_nuc_length: :class:`int`
560 :param n_max_naive: Max possible chain mappings that are enumerated in
561 :func:`~GetNaivelDDTMapping` /
562 :func:`~GetDecomposerlDDTMapping`. A
563 :class:`RuntimeError` is raised in case of bigger
565 :type n_max_naive: :class:`int`
566 :param mdl_map_pep_seqid_thr: Sequence identity threshold (in percent of
567 aligned columns) to avoid non-sensical
568 mapping of model chains to reference chem
569 groups. If a value larger than 0.0 is
570 provided, minimal criteria for assignment
571 becomes a sequence identity of
572 *mdl_map_pep_seqid_thr* and at least
573 *min_pep_length* aligned residues. If set to
574 zero, it simply assigns a model chain to the
575 chem group with highest sequence identity.
577 :type mdl_map_pep_seqid_thr: :class:`float`
578 :param mdl_map_nuc_seqid_thr: Nucleotide equivalent of
579 *mdl_map_pep_seqid_thr*
580 :type mdl_map_nuc_seqid_thr: :class:`float`
581 :param seqres: SEQRES sequences. All polymer chains in *target* will be
582 aligned to these sequences using a residue number based
583 alignment, effectively ignoring *resnum_alignments* in
584 the chem grouping step. Additionally, you need to
585 manually specify a mapping of the polymer chains using
586 *trg_seqres_mapping* and an error is raised otherwise.
587 The one letter codes in
588 the structure must exactly match the respective characters
589 in seqres and an error is raised if not. These seqres
590 sequences are set as :attr:`~chem_group_ref_seqs` and will
591 thus influence the mapping of model chains.
592 :type seqres: :class:`ost.seq.SequenceList`
593 :param trg_seqres_mapping: Maps each polymer chain in *target* to a sequence
594 in *seqres*. It's a :class:`dict` with chain name
595 as key and seqres sequence name as value.
596 :type trg_seqres_mapping: :class:`dict`
598 def __init__(self, target, resnum_alignments=False,
599 pep_seqid_thr = 95., nuc_seqid_thr = 95.,
600 pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -11,
601 pep_gap_ext = -1, nuc_subst_mat = seq.alg.NUC44,
602 nuc_gap_open = -4, nuc_gap_ext = -4,
603 min_pep_length = 6, min_nuc_length = 4,
605 mdl_map_pep_seqid_thr = 0.,
606 mdl_map_nuc_seqid_thr = 0.,
608 trg_seqres_mapping = None):
611 seqres_params_set = sum([seqres
is not None,
612 trg_seqres_mapping
is not None])
613 if seqres_params_set
not in [0, 2]:
614 raise RuntimeError(
"Must either give both, seqres and "
615 "trg_seqres_mapping, or none of them.")
651 if sname
in entity_ids:
652 raise RuntimeError(f
"Sequence names in seqres param must "
653 f
"be unique and refer to entity ids. "
654 f
"Duplicate sequence name: \"{sname}\"")
655 entity_ids.add(sname)
659 if entity_id
not in entity_ids:
660 raise RuntimeError(f
"trg_seqres_mapping contains entity id "
661 f
"for which no seqres is given: "
662 f
"cname: \"{cname}\", entity id: "
668 raise RuntimeError(f
"If seqres information is provided, "
669 f
"all polymer chains must be covered "
670 f
"by trg_seqres_mapping. Missing "
671 f
"mapping for chain \"{s.GetName()}\"")
674 raise RuntimeError(f
"If seqres information is provided, "
675 f
"all polymer chains must be covered "
676 f
"by trg_seqres_mapping. Missing "
677 f
"mapping for chain \"{s.GetName()}\"")
682 """Target structure that only contains peptides/nucleotides
684 Contains only residues that have the backbone representatives
685 (CA for peptide and C3' for nucleotides) to avoid ATOMSEQ alignment
686 inconsistencies when switching between all atom and backbone only
689 :type: :class:`ost.mol.EntityView`
695 """Sequences of peptide chains in :attr:`~target`
697 :type: :class:`ost.seq.SequenceList`
703 """Sequences of nucleotide chains in :attr:`~target`
705 :type: :class:`ost.seq.SequenceList`
711 """Groups of chemically equivalent chains in :attr:`~target`
713 First chain in group is the one with longest sequence.
715 :getter: Computed on first use (cached)
716 :type: :class:`list` of :class:`list` of :class:`str` (chain names)
724 """MSA for each group in :attr:`~chem_groups`
726 The first sequence is the reference sequence.
727 The subsequent sequences represent the ATOMSEQ sequences in
728 :attr:`~target` in same order as in :attr:`~chem_groups`.
730 :getter: Computed on first use (cached)
731 :type: :class:`ost.seq.AlignmentList`
739 """Reference sequence for each group in :attr:`~chem_groups`
741 :getter: Computed on first use (cached)
742 :type: :class:`ost.seq.SequenceList`
750 """ChemType of each group in :attr:`~chem_groups`
752 Specifying if groups are poly-peptides/nucleotides, i.e.
753 :class:`ost.mol.ChemType.AMINOACIDS` or
754 :class:`ost.mol.ChemType.NUCLEOTIDES`
756 :getter: Computed on first use (cached)
757 :type: :class:`list` of :class:`ost.mol.ChemType`
764 """Maps sequences in *model* to chem_groups of target
766 :param model: Model from which to extract sequences, a
767 selection that only includes peptides and nucleotides
768 is performed and returned along other results.
769 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
770 :returns: Tuple with two lists of length `len(self.chem_groups)`, a list
771 and an :class:`ost.mol.EntityView` representing *model*:
772 1) Each element is a :class:`list` with mdl chain names that
773 map to the chem group at that position.
774 2) Each element is a :class:`ost.seq.AlignmentList` aligning
775 these mdl chain sequences to the chem group ref sequences.
776 3) The model chains that could not be mapped to the reference
777 4) A selection of *model* that only contains polypeptides and
778 polynucleotides whose ATOMSEQ exactly matches the sequence
779 info in the returned alignments.
783 mdl_chains_without_chem_mapping = list()
786 for s
in mdl_pep_seqs:
789 s, mol.ChemType.AMINOACIDS, mdl,
793 ost.LogWarning(
"Could not map model chain %s to any chem group"
794 " in the target" % s.name)
795 mdl_chains_without_chem_mapping.append(s.GetName())
797 mapping[idx].append(s.GetName())
798 alns[idx].append(aln)
800 for s
in mdl_nuc_seqs:
803 s, mol.ChemType.NUCLEOTIDES, mdl,
807 ost.LogWarning(
"Could not map model chain %s to any chem group"
808 " in the target" % s.name)
809 mdl_chains_without_chem_mapping.append(s.GetName())
811 mapping[idx].append(s.GetName())
812 alns[idx].append(aln)
814 return (mapping, alns, mdl_chains_without_chem_mapping, mdl)
818 thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic",
819 steep_opt_rate = None, block_seed_size = 5,
820 block_blocks_per_chem_group = 5,
821 chem_mapping_result = None,
822 heuristic_n_max_naive = 40320):
823 """ Identify chain mapping by optimizing lDDT score
825 Maps *model* chain sequences to :attr:`~chem_groups` and find mapping
826 based on backbone only lDDT score (CA for amino acids C3' for
829 Either performs a naive search, i.e. enumerate all possible mappings or
830 executes a greedy strategy that tries to identify a (close to) optimal
831 mapping in an iterative way by starting from a start mapping (seed). In
832 each iteration, the one-to-one mapping that leads to highest increase
833 in number of conserved contacts is added with the additional requirement
834 that this added mapping must have non-zero interface counts towards the
835 already mapped chains. So basically we're "growing" the mapped structure
836 by only adding connected stuff.
838 The available strategies:
840 * **naive**: Enumerates all possible mappings and returns best
842 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
843 all ref/mdl chain combinations within the respective chem groups and
844 retain the mapping leading to the best lDDT.
846 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
847 all ref/mdl chain combinations within the respective chem groups and
848 extend them to *block_seed_size*. *block_blocks_per_chem_group*
849 for each chem group are selected for exhaustive extension.
851 * **heuristic**: Uses *naive* strategy if number of possible mappings
852 is within *heuristic_n_max_naive*. The default of 40320 corresponds
853 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
854 6!*2!=1440 etc. If the number of possible mappings is larger,
855 *greedy_full* is used.
857 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
860 :param model: Model to map
861 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
862 :param inclusion_radius: Inclusion radius for lDDT
863 :type inclusion_radius: :class:`float`
864 :param thresholds: Thresholds for lDDT
865 :type thresholds: :class:`list` of :class:`float`
866 :param strategy: Strategy to find mapping. Must be in ["naive",
867 "greedy_full", "greedy_block"]
868 :type strategy: :class:`str`
869 :param steep_opt_rate: Only relevant for greedy strategies.
870 If set, every *steep_opt_rate* mappings, a simple
871 optimization is executed with the goal of
872 avoiding local minima. The optimization
873 iteratively checks all possible swaps of mappings
874 within their respective chem groups and accepts
875 swaps that improve lDDT score. Iteration stops as
876 soon as no improvement can be achieved anymore.
877 :type steep_opt_rate: :class:`int`
878 :param block_seed_size: Param for *greedy_block* strategy - Initial seeds
879 are extended by that number of chains.
880 :type block_seed_size: :class:`int`
881 :param block_blocks_per_chem_group: Param for *greedy_block* strategy -
882 Number of blocks per chem group that
883 are extended in an initial search
884 for high scoring local solutions.
885 :type block_blocks_per_chem_group: :class:`int`
886 :param chem_mapping_result: Pro param. The result of
887 :func:`~GetChemMapping` where you provided
888 *model*. If set, *model* parameter is not
890 :type chem_mapping_result: :class:`tuple`
891 :returns: A :class:`MappingResult`
894 strategies = [
"naive",
"greedy_full",
"greedy_block",
"heuristic"]
895 if strategy
not in strategies:
896 raise RuntimeError(f
"Strategy must be in {strategies}")
898 if chem_mapping_result
is None:
899 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
902 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
912 if one_to_one
is not None:
915 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
916 if ref_ch
is not None and mdl_ch
is not None:
917 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
918 alns[(ref_ch, mdl_ch)] = aln
920 mdl_chains_without_chem_mapping, one_to_one, alns)
922 if strategy ==
"heuristic":
924 heuristic_n_max_naive):
927 strategy =
"greedy_full"
932 if strategy ==
"naive":
935 chem_mapping, ref_mdl_alns,
940 chem_mapping, ref_mdl_alns,
941 inclusion_radius=inclusion_radius,
942 thresholds=thresholds,
943 steep_opt_rate=steep_opt_rate)
944 if strategy ==
"greedy_full":
946 elif strategy ==
"greedy_block":
948 block_blocks_per_chem_group)
950 opt_lddt = the_greed.Score(mapping)
954 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
955 if ref_ch
is not None and mdl_ch
is not None:
956 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
957 alns[(ref_ch, mdl_ch)] = aln
960 mdl_chains_without_chem_mapping, mapping, alns,
961 opt_score = opt_lddt)
965 block_seed_size = 5, block_blocks_per_chem_group = 5,
966 heuristic_n_max_naive = 40320, steep_opt_rate = None,
967 chem_mapping_result = None,
968 greedy_prune_contact_map = True):
969 """ Identify chain mapping based on QSScore
971 Scoring is based on CA/C3' positions which are present in all chains of
972 a :attr:`chem_groups` as well as the *model* chains which are mapped to
973 that respective chem group.
975 The following strategies are available:
977 * **naive**: Naively iterate all possible mappings and return best based
980 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
981 all ref/mdl chain combinations within the respective chem groups and
982 retain the mapping leading to the best QS-score.
984 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
985 all ref/mdl chain combinations within the respective chem groups and
986 extend them to *block_seed_size*. *block_blocks_per_chem_group*
987 for each chem group are selected for exhaustive extension.
989 * **heuristic**: Uses *naive* strategy if number of possible mappings
990 is within *heuristic_n_max_naive*. The default of 40320 corresponds
991 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
992 6!*2!=1440 etc. If the number of possible mappings is larger,
993 *greedy_full* is used.
995 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
998 :param model: Model to map
999 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1000 :param contact_d: Max distance between two residues to be considered as
1001 contact in qs scoring
1002 :type contact_d: :class:`float`
1003 :param strategy: Strategy for sampling, must be in ["naive",
1004 greedy_full", "greedy_block"]
1005 :type strategy: :class:`str`
1006 :param chem_mapping_result: Pro param. The result of
1007 :func:`~GetChemMapping` where you provided
1008 *model*. If set, *model* parameter is not
1010 :type chem_mapping_result: :class:`tuple`
1011 :param greedy_prune_contact_map: Relevant for all strategies that use
1012 greedy extensions. If True, only chains
1013 with at least 3 contacts (8A CB
1014 distance) towards already mapped chains
1015 in trg/mdl are considered for
1016 extension. All chains that give a
1017 potential non-zero QS-score increase
1018 are used otherwise (at least one
1019 contact within 12A). The consequence
1020 is reduced runtime and usually no
1021 real reduction in accuracy.
1022 :returns: A :class:`MappingResult`
1025 strategies = [
"naive",
"greedy_full",
"greedy_block",
"heuristic"]
1026 if strategy
not in strategies:
1027 raise RuntimeError(f
"strategy must be {strategies}")
1029 if chem_mapping_result
is None:
1030 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1033 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1041 if one_to_one
is not None:
1044 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1045 if ref_ch
is not None and mdl_ch
is not None:
1046 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1047 alns[(ref_ch, mdl_ch)] = aln
1049 mdl_chains_without_chem_mapping, one_to_one, alns)
1051 if strategy ==
"heuristic":
1053 heuristic_n_max_naive):
1056 strategy =
"greedy_full"
1061 if strategy ==
"naive":
1064 chem_mapping, ref_mdl_alns,
1070 chem_mapping, ref_mdl_alns,
1071 contact_d = contact_d,
1072 steep_opt_rate=steep_opt_rate,
1073 greedy_prune_contact_map = greedy_prune_contact_map)
1074 if strategy ==
"greedy_full":
1076 elif strategy ==
"greedy_block":
1078 block_blocks_per_chem_group)
1080 opt_qsscore = the_greed.Score(mapping, check=
False).QS_global
1084 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1085 if ref_ch
is not None and mdl_ch
is not None:
1086 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1087 alns[(ref_ch, mdl_ch)] = aln
1090 mdl_chains_without_chem_mapping, mapping, alns,
1091 opt_score = opt_qsscore)
1094 chem_mapping_result = None, heuristic_n_max_naive = 120):
1095 """Identify chain mapping based on minimal RMSD superposition
1097 Superposition and scoring is based on CA/C3' positions which are present
1098 in all chains of a :attr:`chem_groups` as well as the *model*
1099 chains which are mapped to that respective chem group.
1101 The following strategies are available:
1103 * **naive**: Naively iterate all possible mappings and return the one
1106 * **greedy_single**: perform all vs. all single chain superpositions
1107 within the respective ref/mdl chem groups to use as starting points.
1108 For each starting point, iteratively add the model/target chain pair
1109 with lowest RMSD until a full mapping is achieved. The mapping with
1110 lowest RMSD is returned.
1112 * **greedy_single_centroid**: Same as *greedy_single* but iteratively
1113 adds model/target chain pairs with lowest centroid distance. The
1114 mapping with the lowest centroid RMSD is returned. This is mainly for
1115 evaluation purposes. One known failure mode is that intertwined
1116 structures may have multiple centroids sitting on top of each other.
1117 RMSD and thus *greedy_single* are better able to disentangle this
1120 * **greedy_iterative**: Same as greedy_single_rmsd exept that the
1121 transformation gets updated with each added chain pair.
1123 * **heuristic**: Uses *naive* strategy if number of possible mappings
1124 is within *heuristic_n_max_naive*. The default of 120 corresponds
1125 to a homo-pentamer (5!=120). If the number of possible mappings is
1126 larger, *greedy_iterative* is used.
1128 :param model: Model to map
1129 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1130 :param strategy: Strategy for sampling. Must be in ["naive",
1131 "greedy_single", "greedy_iterative"]
1132 :type strategy: :class:`str`
1133 :param subsampling: If given, only an equally distributed subset
1134 of CA/C3' positions in each chain are used for
1135 superposition/scoring.
1136 :type subsampling: :class:`int`
1137 :param chem_mapping_result: Pro param. The result of
1138 :func:`~GetChemMapping` where you provided
1139 *model*. If set, *model* parameter is not
1141 :type chem_mapping_result: :class:`tuple`
1142 :returns: A :class:`MappingResult`
1145 strategies = [
"naive",
"greedy_single",
"greedy_single_centroid",
1146 "greedy_iterative",
"heuristic"]
1148 if strategy
not in strategies:
1149 raise RuntimeError(f
"strategy must be {strategies}")
1151 if chem_mapping_result
is None:
1152 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1155 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1164 if one_to_one
is not None:
1167 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1168 if ref_ch
is not None and mdl_ch
is not None:
1169 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1170 alns[(ref_ch, mdl_ch)] = aln
1172 mdl_chains_without_chem_mapping, one_to_one, alns)
1177 max_pos = subsampling)
1179 if strategy ==
"heuristic":
1181 heuristic_n_max_naive):
1184 strategy =
"greedy_iterative"
1188 if strategy.startswith(
"greedy"):
1191 initial_transforms = list()
1192 initial_mappings = list()
1193 for trg_pos, trg_chains, mdl_pos, mdl_chains
in zip(trg_group_pos,
1197 for t_pos, t
in zip(trg_pos, trg_chains):
1198 for m_pos, m
in zip(mdl_pos, mdl_chains):
1199 if len(t_pos) >= 3
and len(m_pos) >= 3:
1201 False).transformation
1202 initial_transforms.append(transform)
1203 initial_mappings.append((t,m))
1205 if strategy ==
"greedy_single":
1213 elif strategy ==
"greedy_single_centroid":
1221 elif strategy ==
"greedy_iterative":
1225 trg_group_pos, mdl_group_pos)
1226 elif strategy ==
"naive":
1228 trg_group_pos, mdl_group_pos,
1232 final_mapping = list()
1234 mapped_mdl_chains = list()
1235 for ref_ch
in ref_chains:
1236 if ref_ch
in mapping:
1237 mapped_mdl_chains.append(mapping[ref_ch])
1239 mapped_mdl_chains.append(
None)
1240 final_mapping.append(mapped_mdl_chains)
1244 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1245 if ref_ch
is not None and mdl_ch
is not None:
1246 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1247 alns[(ref_ch, mdl_ch)] = aln
1250 mdl_chains_without_chem_mapping, final_mapping, alns)
1254 """Identify chain mapping as in AlphaFold-Multimer (AFM)
1256 Described in section 7.3 of
1258 Richard Evans, Michael O’Neill, Alexander Pritzel, Natasha Antropova,
1259 Andrew Senior, Tim Green, Augustin ŽÃdek, Russ Bates, Sam Blackwell,
1260 Jason Yim, et al. Protein complex prediction with AlphaFold-Multimer.
1261 bioRxiv preprint bioRxiv:10.1101/2021.10.04.463034, 2021.
1263 To summarize (largely follows the description in the AF-Multimer paper):
1264 One anchor chain in the ground truth (reference) is selected. If the
1265 reference has stoichiometry A3B2, an arbitary chain in B is selected
1266 as it has smaller ambiguity. In a tie, for example A2B2, the longest
1267 chain in A, B is selected.
1269 Given a model chain with same sequence as the reference anchor chain,
1270 a CA-RMSD (C3' for nucleotides) based superposition is performed. Chains
1271 are then greedily assigned by minimum distance of their geometric
1272 centers. This procedure is repeated for every model chain with same
1273 sequence and the assignment with minimal RMSD of the geometric centers
1276 A modification of this algorithm allows to deal with different
1277 stoichiometries in reference and model. In the anchor selection, this
1278 function ensures that there is at least one model chain that can be
1279 mapped to this anchor. If the logic above does not identify a mappable
1280 chain pair for the smallest group, it continues with the second smallest
1283 :param model: Model to map
1284 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1285 :param chem_mapping_result: Pro param. The result of
1286 :func:`~GetChemMapping` where you provided
1287 *model*. If set, *model* parameter is not
1289 :type chem_mapping_result: :class:`tuple`
1290 :returns: A :class:`MappingResult`
1292 if chem_mapping_result
is None:
1293 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1296 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1305 if one_to_one
is not None:
1308 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1309 if ref_ch
is not None and mdl_ch
is not None:
1310 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1311 alns[(ref_ch, mdl_ch)] = aln
1313 mdl_chains_without_chem_mapping, one_to_one, alns)
1316 ref_group_idx =
None
1320 chem_group_sizes.sort()
1322 for min_size
in chem_group_sizes:
1323 min_chem_groups = list()
1325 if len(chem_group) == min_size:
1326 min_chem_groups.append(chem_group_idx)
1327 if len(min_chem_groups) == 1:
1328 if len(chem_mapping[min_chem_groups[0]]) > 0:
1332 ref_group_idx = min_chem_groups[0]
1339 for chem_group_idx
in min_chem_groups:
1340 if len(chem_mapping[chem_group_idx]) > 0:
1345 ref_group_idx = chem_group_idx
1347 if ref_group_idx
is not None and ref_ch
is not None:
1354 transformations = list()
1355 for mdl_ch
in chem_mapping[ref_group_idx]:
1356 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1358 l1 = len(ref_bb_pos[ref_ch])
1359 l2 = len(aln.GetSequence(0).GetGaplessString())
1361 l1 = len(mdl_bb_pos[mdl_ch])
1362 l2 = len(aln.GetSequence(1).GetGaplessString())
1370 if col[0] !=
'-' and col[1] !=
'-':
1371 ref_pos.append(ref_bb_pos[ref_ch][ref_idx])
1372 mdl_pos.append(mdl_bb_pos[mdl_ch][mdl_idx])
1378 transformations.append(t)
1380 ref_centers = {k: v.center
for k,v
in ref_bb_pos.items()}
1382 mdl_centers = list()
1383 for k,v
in mdl_bb_pos.items():
1384 mdl_chains.append(k)
1385 mdl_centers.append(v.center)
1388 best_rmsd = float(
"inf")
1391 for t
in transformations:
1393 tmp.ApplyTransform(t)
1395 t_mdl_centers = dict()
1396 for i
in range(len(tmp)):
1397 t_mdl_centers[mdl_chains[i]] = tmp[i]
1405 d =
geom.Length2(ref_centers[ref_ch]-t_mdl_centers[mdl_ch])
1406 tmp.append((d, ref_ch, mdl_ch))
1410 mapped_mdl_chains = set()
1412 if item[1]
not in mapping
and item[2]
not in mapped_mdl_chains:
1413 mapping[item[1]] = item[2]
1414 mapped_mdl_chains.add(item[2])
1416 if len(mapping) == 0:
1417 raise RuntimeError(
"Empty mapping in GetAFMMapping")
1418 elif len(mapping) == 1:
1421 elif len(mapping) == 2:
1425 ref_p = [ref_centers[k]
for k
in mapping.keys()]
1426 mdl_p = [t_mdl_centers[v]
for v
in mapping.values()]
1432 dd = abs(ref_d-mdl_d)
1440 for ref_ch, mdl_ch
in mapping.items():
1441 mapped_ref_centers.append(ref_centers[ref_ch])
1442 mapped_mdl_centers.append(t_mdl_centers[mdl_ch])
1444 mapped_mdl_centers,
False).rmsd
1445 if rmsd < best_rmsd:
1447 best_mapping = mapping
1450 final_mapping = list()
1452 mapped_mdl_chains = list()
1453 for ref_ch
in ref_chains:
1454 if ref_ch
in best_mapping:
1455 mapped_mdl_chains.append(best_mapping[ref_ch])
1457 mapped_mdl_chains.append(
None)
1458 final_mapping.append(mapped_mdl_chains)
1462 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1463 if ref_ch
is not None and mdl_ch
is not None:
1464 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1465 alns[(ref_ch, mdl_ch)] = aln
1468 mdl_chains_without_chem_mapping, final_mapping, alns)
1472 """ Convenience function to get mapping with currently preferred method
1474 If number of possible chain mappings is <= *n_max_naive*, a naive
1475 QS-score mapping is performed and optimal QS-score is guaranteed.
1476 For anything else, a QS-score mapping with the greedy_full strategy is
1477 performed (greedy_prune_contact_map = True). The default for
1478 *n_max_naive* of 40320 corresponds to an octamer (8!=40320). A
1479 structure with stoichiometry A6B2 would be 6!*2!=1440 etc.
1481 If :attr:`~target` has nucleotide sequences, the QS-score target
1482 function is replaced with a backbone only lDDT score that has
1483 an inclusion radius of 30A.
1487 inclusion_radius = 30.0,
1488 heuristic_n_max_naive = n_max_naive)
1491 greedy_prune_contact_map=
True,
1492 heuristic_n_max_naive = n_max_naive)
1494 def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0,
1495 thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False,
1496 only_interchain=False, chem_mapping_result = None,
1497 global_mapping = None):
1498 """ Identify *topn* representations of *substructure* in *model*
1500 *substructure* defines a subset of :attr:`~target` for which one
1501 wants the *topn* representations in *model*. Representations are scored
1504 :param substructure: A :class:`ost.mol.EntityView` which is a subset of
1505 :attr:`~target`. Should be selected with the
1506 OpenStructure query language. Example: if you're
1507 interested in residues with number 42,43 and 85 in
1509 ``substructure=mapper.target.Select("cname=A and rnum=42,43,85")``
1510 A :class:`RuntimeError` is raised if *substructure*
1511 does not refer to the same underlying
1512 :class:`ost.mol.EntityHandle` as :attr:`~target`.
1513 :type substructure: :class:`ost.mol.EntityView`
1514 :param model: Structure in which one wants to find representations for
1516 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1517 :param topn: Max number of representations that are returned
1518 :type topn: :class:`int`
1519 :param inclusion_radius: Inclusion radius for lDDT
1520 :type inclusion_radius: :class:`float`
1521 :param thresholds: Thresholds for lDDT
1522 :type thresholds: :class:`list` of :class:`float`
1523 :param bb_only: Only consider backbone atoms in lDDT computation
1524 :type bb_only: :class:`bool`
1525 :param only_interchain: Only score interchain contacts in lDDT. Useful
1526 if you want to identify interface patches.
1527 :type only_interchain: :class:`bool`
1528 :param chem_mapping_result: Pro param. The result of
1529 :func:`~GetChemMapping` where you provided
1530 *model*. If set, *model* parameter is not
1532 :type chem_mapping_result: :class:`tuple`
1533 :param global_mapping: Pro param. Specify a global mapping result. This
1534 fully defines the desired representation in the
1535 model but extracts it and enriches it with all
1536 the nice attributes of :class:`ReprResult`.
1537 The target attribute in *global_mapping* must be
1538 of the same entity as self.target and the model
1539 attribute of *global_mapping* must be of the same
1541 :type global_mapping: :class:`MappingResult`
1542 :returns: :class:`list` of :class:`ReprResult`
1546 raise RuntimeError(
"topn must be >= 1")
1548 if global_mapping
is not None:
1550 if global_mapping.target.handle.GetHashCode() != \
1552 raise RuntimeError(
"global_mapping.target must be the same "
1553 "entity as self.target")
1554 if global_mapping.model.handle.GetHashCode() != \
1555 model.handle.GetHashCode():
1556 raise RuntimeError(
"global_mapping.model must be the same "
1557 "entity as model param")
1560 for r
in substructure.residues:
1561 ch_name = r.GetChain().GetName()
1562 rnum = r.GetNumber()
1563 target_r = self.
targettarget.FindResidue(ch_name, rnum)
1564 if not target_r.IsValid():
1565 raise RuntimeError(f
"substructure has residue "
1566 f
"{r.GetQualifiedName()} which is not in "
1568 if target_r.handle.GetHashCode() != r.handle.GetHashCode():
1569 raise RuntimeError(f
"substructure has residue "
1570 f
"{r.GetQualifiedName()} which has an "
1571 f
"equivalent in self.target but it does "
1572 f
"not refer to the same underlying "
1575 target_a = target_r.FindAtom(a.GetName())
1576 if not target_a.IsValid():
1577 raise RuntimeError(f
"substructure has atom "
1578 f
"{a.GetQualifiedName()} which is not "
1580 if a.handle.GetHashCode() != target_a.handle.GetHashCode():
1581 raise RuntimeError(f
"substructure has atom "
1582 f
"{a.GetQualifiedName()} which has an "
1583 f
"equivalent in self.target but it does "
1584 f
"not refer to the same underlying "
1588 ca = r.FindAtom(
"CA")
1589 c3 = r.FindAtom(
"C3'")
1591 if not ca.IsValid()
and not c3.IsValid():
1592 raise RuntimeError(
"All residues in substructure must contain "
1593 "a backbone atom named CA or C3\'")
1596 if chem_mapping_result
is None:
1597 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1600 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1608 substructure_res_indices = dict()
1609 for ch
in substructure.chains:
1611 idx = [full_ch.GetResidueIndex(r.GetNumber())
for r
in ch.residues]
1612 substructure_res_indices[ch.GetName()] = idx
1616 substructure_chem_groups = list()
1617 substructure_chem_mapping = list()
1619 chnames = set([ch.GetName()
for ch
in substructure.chains])
1621 substructure_chem_group = [ch
for ch
in chem_group
if ch
in chnames]
1622 if len(substructure_chem_group) > 0:
1623 substructure_chem_groups.append(substructure_chem_group)
1624 substructure_chem_mapping.append(mapping)
1627 n_mapped_mdl_chains = sum([len(m)
for m
in substructure_chem_mapping])
1628 if n_mapped_mdl_chains == 0:
1633 substructure_ref_mdl_alns = dict()
1635 for ch
in mdl.chains:
1636 mdl_views[ch.GetName()] =
_CSel(mdl, [ch.GetName()])
1637 for chem_group, mapping
in zip(substructure_chem_groups,
1638 substructure_chem_mapping):
1639 for ref_ch
in chem_group:
1640 for mdl_ch
in mapping:
1641 full_aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1642 ref_seq = full_aln.GetSequence(0)
1646 tmp = [
'-'] * len(full_aln)
1647 for idx
in substructure_res_indices[ref_ch]:
1648 idx_in_seq = ref_seq.GetPos(idx)
1649 tmp[idx_in_seq] = ref_seq[idx_in_seq]
1650 ref_seq = seq.CreateSequence(ref_ch,
''.join(tmp))
1651 ref_seq.AttachView(
_CSel(substructure, [ref_ch]))
1652 mdl_seq = full_aln.GetSequence(1)
1653 mdl_seq = seq.CreateSequence(mdl_seq.GetName(),
1654 mdl_seq.GetString())
1655 mdl_seq.AttachView(mdl_views[mdl_ch])
1656 aln = seq.CreateAlignment()
1657 aln.AddSequence(ref_seq)
1658 aln.AddSequence(mdl_seq)
1659 substructure_ref_mdl_alns[(ref_ch, mdl_ch)] = aln
1662 inclusion_radius = inclusion_radius,
1664 scored_mappings = list()
1668 flat_mapping = global_mapping.GetFlatMapping()
1670 for chem_group, chem_mapping
in zip(substructure_chem_groups,
1671 substructure_chem_mapping):
1672 chem_group_mapping = list()
1673 for ch
in chem_group:
1674 if ch
in flat_mapping:
1675 mdl_ch = flat_mapping[ch]
1676 if mdl_ch
in chem_mapping:
1677 chem_group_mapping.append(mdl_ch)
1679 chem_group_mapping.append(
None)
1681 chem_group_mapping.append(
None)
1682 mapping.append(chem_group_mapping)
1683 mappings = [mapping]
1686 substructure_chem_mapping,
1690 msg =
"Computing initial ranking of %d chain mappings" % len(mappings)
1691 (ost.LogWarning
if len(mappings) > 10000
else ost.LogInfo)(msg)
1693 for mapping
in mappings:
1695 lddt_chain_mapping = dict()
1698 for ref_chem_group, mdl_chem_group
in zip(substructure_chem_groups,
1700 for ref_ch, mdl_ch
in zip(ref_chem_group, mdl_chem_group):
1702 if mdl_ch
is not None:
1703 lddt_chain_mapping[mdl_ch] = ref_ch
1704 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1705 lddt_alns[mdl_ch] = aln
1706 tmp = [int(c[0] !=
'-' and c[1] !=
'-')
for c
in aln]
1707 n_res_aln += sum(tmp)
1712 lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds,
1713 chain_mapping=lddt_chain_mapping,
1714 residue_mapping = lddt_alns,
1715 check_resnames =
False,
1716 no_intrachain = only_interchain)
1719 ost.LogVerbose(
"No valid contacts in the reference")
1724 if len(scored_mappings) == 0:
1725 scored_mappings.append((lDDT, mapping))
1726 elif len(scored_mappings) < topn:
1727 scored_mappings.append((lDDT, mapping))
1728 scored_mappings.sort(reverse=
True, key=
lambda x: x[0])
1729 elif lDDT > scored_mappings[-1][0]:
1730 scored_mappings.append((lDDT, mapping))
1731 scored_mappings.sort(reverse=
True, key=
lambda x: x[0])
1732 scored_mappings = scored_mappings[:topn]
1736 for scored_mapping
in scored_mappings:
1737 ref_view = substructure.handle.CreateEmptyView()
1738 mdl_view = mdl.handle.CreateEmptyView()
1739 for ref_ch_group, mdl_ch_group
in zip(substructure_chem_groups,
1741 for ref_ch, mdl_ch
in zip(ref_ch_group, mdl_ch_group):
1742 if ref_ch
is not None and mdl_ch
is not None:
1743 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1745 if col[0] !=
'-' and col[1] !=
'-':
1746 ref_view.AddResidue(col.GetResidue(0),
1747 mol.ViewAddFlag.INCLUDE_ALL)
1748 mdl_view.AddResidue(col.GetResidue(1),
1749 mol.ViewAddFlag.INCLUDE_ALL)
1750 results.append(
ReprResult(scored_mapping[0], substructure,
1751 ref_view, mdl_view))
1755 """ Returns number of possible mappings
1757 :param model: Model with chains that are mapped onto
1759 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1761 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1766 """ Entity processing for chain mapping
1768 * Selects view containing peptide and nucleotide residues which have
1769 required backbone atoms present - for peptide residues thats
1770 N, CA, C and CB (no CB for GLY), for nucleotide residues thats
1771 O5', C5', C4', C3' and O3'.
1772 * filters view by chain lengths, see *min_pep_length* and
1773 *min_nuc_length* in constructor
1774 * Extracts atom sequences for each chain in that view
1775 * If residue number alignments are used, strictly increasing residue
1776 numbers without insertion codes are ensured in each chain
1778 :param ent: Entity to process
1779 :type ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1780 :returns: Tuple with 3 elements: 1) :class:`ost.mol.EntityView`
1781 containing peptide and nucleotide residues 2)
1782 :class:`ost.seq.SequenceList` containing ATOMSEQ sequences
1783 for each polypeptide chain in returned view
1784 3) same for polynucleotide chains
1786 view = ent.CreateEmptyView()
1787 exp_pep_atoms = [
"N",
"CA",
"C",
"CB"]
1788 exp_nuc_atoms = [
"\"O5'\"",
"\"C5'\"",
"\"C4'\"",
"\"C3'\"",
"\"O3'\""]
1789 pep_query =
"peptide=true and aname=" +
','.join(exp_pep_atoms)
1790 nuc_query =
"nucleotide=true and aname=" +
','.join(exp_nuc_atoms)
1792 pep_sel = ent.Select(pep_query)
1793 for r
in pep_sel.residues:
1794 if len(r.atoms) == 4:
1795 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1796 elif r.name ==
"GLY" and len(r.atoms) == 3:
1797 atom_names = [a.GetName()
for a
in r.atoms]
1798 if sorted(atom_names) == [
"C",
"CA",
"N"]:
1799 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1801 nuc_sel = ent.Select(nuc_query)
1802 for r
in nuc_sel.residues:
1803 if len(r.atoms) == 5:
1804 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1806 polypep_seqs = seq.CreateSequenceList()
1807 polynuc_seqs = seq.CreateSequenceList()
1809 if len(view.residues) == 0:
1811 return (view, polypep_seqs, polynuc_seqs)
1813 for ch
in view.chains:
1814 n_res = len(ch.residues)
1815 n_pep = sum([r.IsPeptideLinking()
for r
in ch.residues])
1816 n_nuc = sum([r.IsNucleotideLinking()
for r
in ch.residues])
1819 if n_pep > 0
and n_nuc > 0:
1820 raise RuntimeError(f
"Must not mix peptide and nucleotide linking "
1821 f
"residues in same chain ({ch.GetName()})")
1823 if (n_pep + n_nuc) != n_res:
1824 raise RuntimeError(
"All residues must either be peptide_linking "
1825 "or nucleotide_linking")
1829 ost.LogWarning(
"Skipping short peptide chain: %s" % ch.name)
1833 ost.LogWarning(
"Skipping short nucleotide chain: %s" % ch.name)
1841 ins_codes = [r.GetNumber().GetInsCode()
for r
in ch.residues]
1842 if len(set(ins_codes)) != 1
or ins_codes[0] !=
'\0':
1843 raise RuntimeError(
"Residue numbers in input structures must not "
1844 "contain insertion codes")
1847 nums = [r.GetNumber().GetNum()
for r
in ch.residues]
1848 if not all(i < j
for i, j
in zip(nums, nums[1:])):
1849 raise RuntimeError(
"Residue numbers in input structures must be "
1850 "strictly increasing for each chain")
1852 s =
''.join([r.one_letter_code
for r
in ch.residues])
1853 s = seq.CreateSequence(ch.GetName(), s)
1855 polypep_seqs.AddSequence(s)
1856 elif n_nuc == n_res:
1857 polynuc_seqs.AddSequence(s)
1859 raise RuntimeError(
"This shouldnt happen")
1861 if len(polypep_seqs) == 0
and len(polynuc_seqs) == 0:
1862 raise RuntimeError(f
"No chain fulfilled minimum length requirement "
1863 f
"to be considered in chain mapping "
1864 f
"({self.min_pep_length} for peptide chains, "
1865 f
"{self.min_nuc_length} for nucleotide chains) "
1866 f
"- mapping failed")
1869 chain_names = [s.name
for s
in polypep_seqs]
1870 chain_names += [s.name
for s
in polynuc_seqs]
1871 view =
_CSel(view, chain_names)
1873 return (view, polypep_seqs, polynuc_seqs)
1876 """ Access to internal sequence alignment functionality
1878 Performs Needleman-Wunsch alignment with parameterization
1879 setup at ChainMapper construction
1881 :param s1: First sequence to align
1882 :type s1: :class:`ost.seq.SequenceHandle`
1883 :param s2: Second sequence to align
1884 :type s2: :class:`ost.seq.SequenceHandle`
1885 :param stype: Type of sequences to align, must be in
1886 [:class:`ost.mol.ChemType.AMINOACIDS`,
1887 :class:`ost.mol.ChemType.NUCLEOTIDES`]
1888 :returns: Pairwise alignment of s1 and s2
1890 if stype == mol.ChemType.AMINOACIDS:
1894 elif stype == mol.ChemType.NUCLEOTIDES:
1899 raise RuntimeError(
"Invalid ChemType")
1903 """ Access to internal sequence alignment functionality
1905 Performs alignment on SEQRES. Residue numbers for *s* are
1906 extracted from *s_ent*. Residue numbers start from 1, i.e.
1907 1 aligns to the first element in *seqres*.
1909 :param seqres: SEQRES sequence
1910 :type seqres: :class:`ost.seq.SequenceHandle`
1911 :param s: Sequence to align
1912 :type s: :class:`ost.seq.SequenceHandle`
1913 :param s_ent: Structure which must contain a chain of same name as *s*.
1914 This chain must have the exact same number of residues
1915 as characters in *s*.
1916 :type s_ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1919 ch = s_ent.FindChain(s.name)
1921 if not ch.IsValid():
1922 raise RuntimeError(
"s_ent lacks required chain in SEQRESAlign")
1924 if len(s) != ch.GetResidueCount():
1925 raise RuntimeError(
"Sequence/structure mismatch in SEQRESAlign")
1927 rnums = [r.GetNumber().GetNum()
for r
in ch.residues]
1928 max_rnum = max(len(seqres), max(rnums))
1931 seqres_s = seqres.GetGaplessString() +
'-'*(max_rnum-len(seqres))
1933 olcs = [
'-'] * max_rnum
1934 for olc, num
in zip(s, rnums):
1937 aln = seq.CreateAlignment()
1938 aln.AddSequence(seq.CreateSequence(seqres.GetName(), seqres_s))
1939 aln.AddSequence(seq.CreateSequence(s.GetName(),
''.join(olcs)))
1943 """ Access to internal sequence alignment functionality
1945 Performs residue number based alignment. Residue numbers are extracted
1946 from *s1_ent*/*s2_ent*.
1948 :param s1: First sequence to align
1949 :type s1: :class:`ost.seq.SequenceHandle`
1950 :param s2: Second sequence to align
1951 :type s2: :class:`ost.seq.SequenceHandle`
1952 :param s1_ent: Structure as source for residue numbers in *s1*. Must
1953 contain a chain named after sequence name in *s1*. This
1954 chain must have the exact same number of residues as
1956 :type s1_ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1957 :param s2_ent: Same for *s2*.
1958 :type s2_ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1960 :returns: Pairwise alignment of s1 and s2
1962 ch1 = s1_ent.FindChain(s1.name)
1963 ch2 = s2_ent.FindChain(s2.name)
1965 if not ch1.IsValid():
1966 raise RuntimeError(
"s1_ent lacks requested chain in ResNumAlign")
1968 if not ch2.IsValid():
1969 raise RuntimeError(
"s2_ent lacks requested chain in ResNumAlign")
1971 if len(s1) != ch1.GetResidueCount():
1972 raise RuntimeError(
"Sequence/structure mismatch in ResNumAlign")
1974 if len(s2) != ch2.GetResidueCount():
1975 raise RuntimeError(
"Sequence/structure mismatch in ResNumAlign")
1977 rnums1 = [r.GetNumber().GetNum()
for r
in ch1.residues]
1978 rnums2 = [r.GetNumber().GetNum()
for r
in ch2.residues]
1980 min_num = min(rnums1[0], rnums2[0])
1981 max_num = max(rnums1[-1], rnums2[-1])
1982 aln_length = max_num - min_num + 1
1984 aln_s1 = [
'-'] * aln_length
1985 for olc, rnum
in zip(s1, rnums1):
1986 aln_s1[rnum-min_num] = olc
1988 aln_s2 = [
'-'] * aln_length
1989 for olc, rnum
in zip(s2, rnums2):
1990 aln_s2[rnum-min_num] = olc
1992 aln = seq.CreateAlignment()
1993 aln.AddSequence(seq.CreateSequence(s1.GetName(),
''.join(aln_s1)))
1994 aln.AddSequence(seq.CreateSequence(s2.GetName(),
''.join(aln_s2)))
1999 """ Sets properties :attr:`~chem_groups`,
2000 :attr:`~chem_group_alignments`, :attr:`~chem_group_ref_seqs`,
2001 :attr:`~chem_group_types`
2013 s = seq.CreateSequence(a.GetSequence(0).GetName(),
2014 a.GetSequence(0).GetGaplessString())
2020 for s_idx
in range(1, a.GetCount()):
2021 s = a.GetSequence(s_idx)
2022 group.append(s.GetName())
2027 """ Groups target sequences based on SEQRES
2029 returns tuple that can be set as self._chem_group_alignments and
2030 self._chem_group_types
2034 group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
2035 group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
2037 groups.extend(nuc_groups)
2039 return (groups, group_types)
2046 for poly_seq
in poly_seqs:
2047 sname = poly_seq.GetName()
2049 if seqres_name
not in alns:
2050 aln = seq.CreateAlignment()
2053 if s.GetName() == seqres_name:
2060 raise RuntimeError(
"You should never get here - contact "
2062 aln.AddSequence(seqres)
2063 alns[seqres_name] = aln
2064 seqres = alns[seqres_name].GetSequence(0)
2065 aln_length = seqres.GetLength()
2066 atomseq = [
'-'] * aln_length
2068 assert(trg_chain.IsValid())
2069 assert(trg_chain.GetResidueCount() == len(poly_seq))
2070 for olc, r
in zip(poly_seq, trg_chain.residues):
2071 num = r.GetNumber().GetNum()
2072 if num < 1
or num > aln_length:
2073 raise RuntimeError(f
"Observed residue with invalid number: "
2074 f
"\"{r}\" which cannot be aligned to "
2075 f
"seqres with id \"{seqres_name}\"")
2076 if olc != seqres[num-1]:
2077 raise RuntimeError(f
"Sequence mismatch of residue \"{r}\" "
2078 f
"with olc \"{olc}\" and residue number "
2079 f
"\"{num}\". Character at that location "
2080 f
"in seqres: \"{seqres[num-1]}\"."
2081 f
"Seqres name: \"{seqres_name}\"")
2082 atomseq[num-1] = olc
2083 atomseq =
''.join(atomseq)
2084 alns[seqres_name].AddSequence(seq.CreateSequence(sname, atomseq))
2086 return [aln
for aln
in alns.values()]
2090 """ Groups target sequences based on ATOMSEQ
2092 returns tuple that can be set as self._chem_group_alignments and
2093 self._chem_group_types
2097 mol.ChemType.AMINOACIDS)
2100 mol.ChemType.NUCLEOTIDES)
2120 for a
in pep_groups:
2121 new_a = seq.CreateAlignment()
2122 new_a.AddSequence(a.GetSequence(0))
2123 for s_idx
in range(a.GetCount()):
2124 new_a.AddSequence(a.GetSequence(s_idx))
2130 for a
in nuc_groups:
2131 new_a = seq.CreateAlignment()
2132 new_a.AddSequence(a.GetSequence(0))
2133 for s_idx
in range(a.GetCount()):
2134 new_a.AddSequence(a.GetSequence(s_idx))
2138 group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
2139 group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
2141 groups.extend(nuc_groups)
2143 return (groups, group_types)
2146 """Get list of alignments representing groups of equivalent sequences
2148 :param seqid_thr: Threshold used to decide when two chains are identical.
2149 :type seqid_thr: :class:`float`
2150 :param gap_thr: Additional threshold to avoid gappy alignments with high
2151 seqid. Number of aligned columns must be at least this
2153 :type gap_thr: :class:`int`
2154 :param aligner: Helper class to generate pairwise alignments
2155 :type aligner: :class:`_Aligner`
2156 :param chem_type: ChemType of seqs, must be in
2157 [:class:`ost.mol.ChemType.AMINOACIDS`,
2158 :class:`ost.mol.ChemType.NUCLEOTIDES`]
2159 :type chem_type: :class:`ost.mol.ChemType`
2160 :returns: A list of alignments, one alignment for each group
2161 with longest sequence (reference) as first sequence.
2162 :rtype: :class:`ost.seq.AlignmentList`
2165 for s_idx
in range(len(seqs)):
2166 matching_group =
None
2167 for g_idx
in range(len(groups)):
2168 for g_s_idx
in range(len(groups[g_idx])):
2171 seqs[groups[g_idx][g_s_idx]],
2174 aln = self.
NWAlign(seqs[s_idx],
2175 seqs[groups[g_idx][g_s_idx]],
2178 if sid >= seqid_thr
and n_aligned >= min_length:
2179 matching_group = g_idx
2181 if matching_group
is not None:
2184 if matching_group
is None:
2185 groups.append([s_idx])
2187 groups[matching_group].append(s_idx)
2190 sorted_groups = list()
2193 tmp = sorted([[len(seqs[i]), i]
for i
in g], reverse=
True)
2194 sorted_groups.append([x[1]
for x
in tmp])
2196 sorted_groups.append(g)
2200 aln_list = seq.AlignmentList()
2201 for g
in sorted_groups:
2204 aln_list.append(seq.CreateAlignment(seqs[g[0]]))
2207 alns = seq.AlignmentList()
2214 aln = self.
NWAlign(seqs[i], seqs[j], chem_type)
2217 aln_list.append(seq.alg.MergePairwiseAlignments(alns, seqs[i]))
2222 seq_id_thr=0.0, min_aln_length=0):
2223 """Tries top map *s* onto any of the sequences in *ref_seqs*
2225 Computes alignments of *s* to each of the reference sequences of equal type
2226 and sorts them by seqid*fraction_covered (seqid: sequence identity of
2227 aligned columns in alignment, fraction_covered: Fraction of non-gap
2228 characters in reference sequence that are covered by non-gap characters in
2229 *s*). Best scoring mapping is returned. Optionally, *seq_id*/
2230 *min_aln_length* thresholds can be enabled to avoid non-sensical mappings.
2231 However, *min_aln_length* only has an effect if *seq_id_thr* > 0!!!
2233 :param ref_seqs: Reference sequences
2234 :type ref_seqs: :class:`ost.seq.SequenceList`
2235 :param ref_types: Types of reference sequences, e.g.
2236 ost.mol.ChemType.AminoAcids
2237 :type ref_types: :class:`list` of :class:`ost.mol.ChemType`
2238 :param s: Sequence to map
2239 :type s: :class:`ost.seq.SequenceHandle`
2240 :param s_type: Type of *s*, only try mapping to sequences in *ref_seqs*
2241 with equal type as defined in *ref_types*
2242 :param s_ent: Entity which represents *s*. Only relevant in case of
2243 residue number alignments.
2244 :type s_ent: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
2245 :param seq_id_thr: Minimum sequence identity to be considered as match
2246 :type seq_id_thr: :class:`float`
2247 :param min_aln_length: Minimum number of aligned columns to be considered
2248 as match. Only has an effect if *seq_id_thr* > 0!
2249 :type min_aln_length: :class:`int`
2250 :returns: Tuple with two elements. 1) index of sequence in *ref_seqs* to
2251 which *s* can be mapped 2) Pairwise sequence alignment with
2252 sequence from *ref_seqs* as first sequence. Both elements are
2253 None if no mapping can be found or if thresholds are not
2254 fulfilled for any alignment.
2255 :raises: :class:`RuntimeError` if mapping is ambiguous, i.e. *s*
2256 successfully maps to more than one sequence in *ref_seqs*
2258 scored_alns = list()
2259 for ref_idx, ref_seq
in enumerate(ref_seqs):
2260 if ref_types[ref_idx] == s_type:
2266 aln = self.
NWAlign(ref_seq, s, s_type)
2269 if seqid >= seq_id_thr
and n_aligned >= min_aln_length:
2270 fraction_covered = float(n_aligned)/n_tot
2271 score = seqid * fraction_covered
2272 scored_alns.append((score, ref_idx, aln))
2274 fraction_covered = float(n_aligned)/n_tot
2275 score = seqid * fraction_covered
2276 scored_alns.append((score, ref_idx, aln))
2278 if len(scored_alns) == 0:
2281 scored_alns = sorted(scored_alns, key=
lambda x: x[0], reverse=
True)
2282 return (scored_alns[0][1], scored_alns[0][2])
2285 """Returns basic properties of *aln* version two...
2287 :param aln: Alignment to compute properties
2288 :type aln: :class:`seq.AlignmentHandle`
2289 :returns: Tuple with 3 elements. 1) sequence identity in range [0, 100]
2290 considering aligned columns 2) Number of non gap characters in
2291 first sequence 3) Number of aligned characters.
2293 assert(aln.GetCount() == 2)
2294 n_tot = sum([1
for col
in aln
if col[0] !=
'-'])
2295 n_aligned = sum([1
for col
in aln
if (col[0] !=
'-' and col[1] !=
'-')])
2296 return (seq.alg.SequenceIdentity(aln), n_tot, n_aligned)
2300 """Returns basic properties of *aln* version one...
2302 :param aln: Alignment to compute properties
2303 :type aln: :class:`seq.AlignmentHandle`
2304 :returns: Tuple with 2 elements. 1) sequence identify in range [0, 100]
2305 considering aligned columns 2) Number of aligned columns.
2307 assert(aln.GetCount() == 2)
2308 seqid = seq.alg.SequenceIdentity(aln)
2309 n_aligned = sum([1
for col
in aln
if (col[0] !=
'-' and col[1] !=
'-')])
2310 return (seqid, n_aligned)
2313 mdl_chem_group_alns, pairs=None):
2314 """ Get all possible ref/mdl chain alignments given chem group mapping
2316 :param ref_chem_groups: :attr:`ChainMapper.chem_groups`
2317 :type ref_chem_groups: :class:`list` of :class:`list` of :class:`str`
2318 :param ref_chem_group_msas: :attr:`ChainMapper.chem_group_alignments`
2319 :type ref_chem_group_msas: :class:`ost.seq.AlignmentList`
2320 :param mdl_chem_groups: Groups of model chains that are mapped to
2321 *ref_chem_groups*. Return value of
2322 :func:`ChainMapper.GetChemMapping`.
2323 :type mdl_chem_groups: :class:`list` of :class:`list` of :class:`str`
2324 :param mdl_chem_group_alns: A pairwise sequence alignment for every chain
2325 in *mdl_chem_groups* that aligns these sequences
2326 to the respective reference sequence.
2328 :func:`ChainMapper.GetChemMapping`.
2329 :type mdl_chem_group_alns: :class:`list` of :class:`ost.seq.AlignmentList`
2330 :param pairs: Pro param - restrict return dict to specified pairs. A set of
2331 tuples in form (<trg_ch>, <mdl_ch>)
2332 :type pairs: :class:`set`
2333 :returns: A dictionary holding all possible ref/mdl chain alignments. Keys
2334 in that dictionary are tuples of the form (ref_ch, mdl_ch) and
2335 values are the respective pairwise alignments with first sequence
2336 being from ref, the second from mdl.
2340 for alns
in mdl_chem_group_alns:
2342 mdl_chain_name = aln.GetSequence(1).GetName()
2343 mdl_alns[mdl_chain_name] = aln
2347 ref_mdl_alns = dict()
2348 for ref_chains, mdl_chains, ref_aln
in zip(ref_chem_groups, mdl_chem_groups,
2349 ref_chem_group_msas):
2350 for ref_ch
in ref_chains:
2351 for mdl_ch
in mdl_chains:
2352 if pairs
is not None and (ref_ch, mdl_ch)
not in pairs:
2356 aln_list = seq.AlignmentList()
2361 s1 = ref_aln.GetSequence(0)
2363 s2 = ref_aln.GetSequence(1+ref_chains.index(ref_ch))
2364 aln_list.append(seq.CreateAlignment(s1, s2))
2368 aln_list.append(mdl_alns[mdl_ch])
2372 ref_seq = seq.CreateSequence(s1.GetName(),
2373 s1.GetGaplessString())
2374 merged_aln = seq.alg.MergePairwiseAlignments(aln_list,
2381 s2 = merged_aln.GetSequence(1)
2382 s3 = merged_aln.GetSequence(2)
2385 for idx
in range(len(s2)):
2386 if s2[idx] !=
'-' or s3[idx] !=
'-':
2390 for idx
in reversed(range(len(s2))):
2391 if s2[idx] !=
'-' or s3[idx] !=
'-':
2394 s2 = seq.CreateSequence(s2.GetName(), s2[a: len(s2)-b])
2395 s3 = seq.CreateSequence(s3.GetName(), s3[a: len(s3)-b])
2396 ref_mdl_alns[(ref_ch, mdl_ch)] = seq.CreateAlignment(s2, s3)
2401 """ Checks whether we already have a perfect one to one mapping
2403 That means each list in *ref_chains* has either exactly one element
2404 and the respective list in *mdl_chains* has also one element or
2405 it has several elements and the respective list in *mdl_chains* is
2406 empty (ref chain(s) has no mapped mdl chain). Returns None if no such
2407 mapping can be found.
2409 :param ref_chains: corresponds to :attr:`ChainMapper.chem_groups`
2410 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
2411 :param mdl_chains: mdl chains mapped to chem groups in *ref_chains*, i.e.
2412 the return value of :func:`ChainMapper.GetChemMapping`
2413 :type mdl_chains: class:`list` of :class:`list` of :class:`str`
2414 :returns: A :class:`list` of :class:`list` if a one to one mapping is found,
2417 only_one_to_one =
True
2419 for ref, mdl
in zip(ref_chains, mdl_chains):
2420 if len(ref) == 1
and len(mdl) == 1:
2421 one_to_one.append(mdl)
2422 elif len(ref) >= 1
and len(mdl) == 0:
2423 one_to_one.append(len(ref)*[
None])
2425 only_one_to_one =
False
2433 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2434 ref_mdl_alns, inclusion_radius = 15.0,
2435 thresholds = [0.5, 1.0, 2.0, 4.0],
2436 steep_opt_rate = None):
2438 """ Greedy extension of already existing but incomplete chain mappings
2440 super().
__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2441 dist_thresh=inclusion_radius,
2442 dist_diff_thresholds=thresholds)
2448 for interface
in self.
trg.interacting_chains:
2449 self.
neighbors[interface[0]].add(interface[1])
2450 self.
neighbors[interface[1]].add(interface[0])
2452 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2457 for g_idx, (ref_g, mdl_g)
in enumerate(zip(ref_chem_groups,
2467 for interface
in self.
mdl.potentially_contributing_chains:
2473 if len(mapping) == 0:
2474 raise RuntimError(
"Mapping must contain a starting point")
2481 for ref_ch
in mapping.keys():
2482 map_targets.update(self.
neighbors[ref_ch])
2485 for ref_ch
in mapping.keys():
2486 map_targets.discard(ref_ch)
2488 if len(map_targets) == 0:
2492 free_mdl_chains = list()
2494 tmp = [x
for x
in chem_group
if x
not in mapping.values()]
2495 free_mdl_chains.append(set(tmp))
2498 newly_mapped_ref_chains = list()
2500 something_happened =
True
2501 while something_happened:
2502 something_happened=
False
2505 n_chains = len(newly_mapped_ref_chains)
2507 mapping = self.
_SteepOpt(mapping, newly_mapped_ref_chains)
2509 if max_ext
is not None and len(newly_mapped_ref_chains) >= max_ext:
2514 for ref_ch
in map_targets:
2516 for mdl_ch
in free_mdl_chains[chem_group_idx]:
2522 if neighbor
in mapping
and mapping[neighbor]
in \
2525 (mdl_ch, mapping[neighbor])).sum()
2526 n = n_single + n_inter
2528 if n_inter > 0
and n > max_n:
2533 max_mapping = (ref_ch, mdl_ch)
2536 something_happened =
True
2538 mapping[max_mapping[0]] = max_mapping[1]
2542 for neighbor
in self.
neighbors[max_mapping[0]]:
2543 if neighbor
not in mapping:
2544 map_targets.add(neighbor)
2547 map_targets.remove(max_mapping[0])
2551 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2554 newly_mapped_ref_chains.append(max_mapping[0])
2561 if chains_to_optimize
is None:
2562 chains_to_optimize = mapping.keys()
2565 ref_chains = [x
for x
in chains_to_optimize
if mapping[x]
is not None]
2569 for ch
in ref_chains:
2571 if chem_group_idx
in tmp:
2572 tmp[chem_group_idx].append(ch)
2574 tmp[chem_group_idx] = [ch]
2575 chem_groups = list(tmp.values())
2580 something_happened =
True
2581 while something_happened:
2582 something_happened =
False
2583 for chem_group
in chem_groups:
2584 if something_happened:
2586 for ch1, ch2
in itertools.combinations(chem_group, 2):
2587 swapped_mapping = dict(mapping)
2588 swapped_mapping[ch1] = mapping[ch2]
2589 swapped_mapping[ch2] = mapping[ch1]
2591 if score > current_lddt:
2592 something_happened =
True
2593 mapping = swapped_mapping
2594 current_lddt = score
2599def _lDDTNaive(trg, mdl, inclusion_radius, thresholds, chem_groups,
2600 chem_mapping, ref_mdl_alns, n_max_naive):
2601 """ Naively iterates all possible chain mappings and returns the best
2608 dist_thresh=inclusion_radius,
2609 dist_diff_thresholds=thresholds)
2610 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2611 lDDT = lddt_scorer.Score(mapping, check=
False)
2612 if lDDT > best_lddt:
2613 best_mapping = mapping
2616 return (best_mapping, best_lddt)
2619def _GetSeeds(ref_chem_groups, mdl_chem_groups, mapped_ref_chains = set(),
2620 mapped_mdl_chains = set()):
2622 for ref_chains, mdl_chains
in zip(ref_chem_groups,
2624 for ref_ch
in ref_chains:
2625 if ref_ch
not in mapped_ref_chains:
2626 for mdl_ch
in mdl_chains:
2627 if mdl_ch
not in mapped_mdl_chains:
2628 seeds.append((ref_ch, mdl_ch))
2632 """ Uses each reference chain as starting point for expansion
2635 seeds =
_GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2636 best_overall_score = -1.0
2637 best_overall_mapping = dict()
2642 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2645 something_happened =
True
2646 while something_happened:
2647 something_happened =
False
2648 remnant_seeds =
_GetSeeds(the_greed.ref_chem_groups,
2649 the_greed.mdl_chem_groups,
2650 mapped_ref_chains = set(mapping.keys()),
2651 mapped_mdl_chains = set(mapping.values()))
2652 if len(remnant_seeds) > 0:
2656 for remnant_seed
in remnant_seeds:
2657 tmp_mapping = dict(mapping)
2658 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2659 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2660 score = the_greed.FromFlatMapping(tmp_mapping)
2661 if score > best_score:
2663 best_mapping = tmp_mapping
2664 if best_mapping
is not None:
2665 something_happened =
True
2666 mapping = best_mapping
2668 score = the_greed.FromFlatMapping(mapping)
2669 if score > best_overall_score:
2670 best_overall_score = score
2671 best_overall_mapping = mapping
2673 mapping = best_overall_mapping
2676 final_mapping = list()
2677 for ref_chains
in the_greed.ref_chem_groups:
2678 mapped_mdl_chains = list()
2679 for ref_ch
in ref_chains:
2680 if ref_ch
in mapping:
2681 mapped_mdl_chains.append(mapping[ref_ch])
2683 mapped_mdl_chains.append(
None)
2684 final_mapping.append(mapped_mdl_chains)
2686 return final_mapping
2690 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2691 respective chem groups and compute single chain lDDTs. The
2692 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2693 and the best scoring one is exhaustively extended.
2696 if seed_size
is None or seed_size < 1:
2697 raise RuntimeError(f
"seed_size must be an int >= 1 (got {seed_size})")
2699 if blocks_per_chem_group
is None or blocks_per_chem_group < 1:
2700 raise RuntimeError(f
"blocks_per_chem_group must be an int >= 1 "
2701 f
"(got {blocks_per_chem_group})")
2703 max_ext = seed_size - 1
2705 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2706 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2709 something_happened =
True
2710 while something_happened:
2711 something_happened =
False
2712 starting_blocks = list()
2713 for ref_chains, mdl_chains
in zip(ref_chem_groups, mdl_chem_groups):
2714 if len(mdl_chains) == 0:
2716 ref_chains_copy = list(ref_chains)
2717 for i
in range(blocks_per_chem_group):
2718 if len(ref_chains_copy) == 0:
2721 for ref_ch
in ref_chains_copy:
2722 seeds += [(ref_ch, mdl_ch)
for mdl_ch
in mdl_chains]
2729 seed = dict(mapping)
2730 seed.update({s[0]: s[1]})
2731 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2732 seed_lddt = the_greed.FromFlatMapping(seed)
2733 if seed_lddt > best_score:
2734 best_score = seed_lddt
2737 if best_mapping !=
None:
2738 starting_blocks.append(best_mapping)
2739 if best_seed[0]
in ref_chains_copy:
2741 ref_chains_copy.remove(best_seed[0])
2746 for seed
in starting_blocks:
2747 seed = the_greed.ExtendMapping(seed)
2748 seed_lddt = the_greed.FromFlatMapping(seed)
2749 if seed_lddt > best_lddt:
2750 best_lddt = seed_lddt
2753 if best_lddt == 0.0:
2756 something_happened =
True
2757 mapping.update(best_mapping)
2758 for ref_ch, mdl_ch
in best_mapping.items():
2759 for group_idx
in range(len(ref_chem_groups)):
2760 if ref_ch
in ref_chem_groups[group_idx]:
2761 ref_chem_groups[group_idx].remove(ref_ch)
2762 if mdl_ch
in mdl_chem_groups[group_idx]:
2763 mdl_chem_groups[group_idx].remove(mdl_ch)
2766 final_mapping = list()
2767 for ref_chains
in the_greed.ref_chem_groups:
2768 mapped_mdl_chains = list()
2769 for ref_ch
in ref_chains:
2770 if ref_ch
in mapping:
2771 mapped_mdl_chains.append(mapping[ref_ch])
2773 mapped_mdl_chains.append(
None)
2774 final_mapping.append(mapped_mdl_chains)
2776 return final_mapping
2780 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2781 ref_mdl_alns, contact_d = 12.0,
2782 steep_opt_rate = None, greedy_prune_contact_map=False):
2783 """ Greedy extension of already existing but incomplete chain mappings
2785 super().
__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2786 contact_d = contact_d)
2792 if greedy_prune_contact_map:
2795 if np.count_nonzero(self.
qsent1qsent1.PairDist(p[0], p[1])<=8) >= 3:
2801 if np.count_nonzero(self.
qsent2qsent2.PairDist(p[0], p[1])<=8) >= 3:
2817 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2822 for g_idx, (ref_g, mdl_g)
in enumerate(zip(ref_chem_groups,
2834 for ch
in self.
ref.chains:
2835 single_chain_ref =
_CSel(self.
ref, [ch.GetName()])
2843 mdl_sel =
_CSel(self.
mdl, [mdl_ch])
2845 _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2846 residue_mapping=alns,
2847 return_dist_test=
True,
2849 chain_mapping={mdl_ch: ref_ch},
2850 check_resnames=
False)
2856 if len(mapping) == 0:
2857 raise RuntimError(
"Mapping must contain a starting point")
2864 for ref_ch
in mapping.keys():
2865 map_targets.update(self.
neighbors[ref_ch])
2868 for ref_ch
in mapping.keys():
2869 map_targets.discard(ref_ch)
2871 if len(map_targets) == 0:
2875 free_mdl_chains = list()
2877 tmp = [x
for x
in chem_group
if x
not in mapping.values()]
2878 free_mdl_chains.append(set(tmp))
2881 newly_mapped_ref_chains = list()
2883 something_happened =
True
2884 while something_happened:
2885 something_happened=
False
2888 n_chains = len(newly_mapped_ref_chains)
2890 mapping = self.
_SteepOpt(mapping, newly_mapped_ref_chains)
2892 if max_ext
is not None and len(newly_mapped_ref_chains) >= max_ext:
2896 old_score = score_result.QS_global
2897 nominator = score_result.weighted_scores
2898 denominator = score_result.weight_sum + score_result.weight_extra_all
2902 for ref_ch
in map_targets:
2904 for mdl_ch
in free_mdl_chains[chem_group_idx]:
2907 nominator_diff = 0.0
2908 denominator_diff = 0.0
2910 if neighbor
in mapping
and mapping[neighbor]
in \
2914 int1 = (ref_ch, neighbor)
2915 int2 = (mdl_ch, mapping[neighbor])
2918 denominator_diff += b
2919 denominator_diff += d
2925 if nominator_diff > 0:
2928 new_nominator = nominator + nominator_diff
2929 new_denominator = denominator + denominator_diff
2931 if new_denominator != 0.0:
2932 new_score = new_nominator/new_denominator
2933 diff = new_score - old_score
2936 max_mapping = (ref_ch, mdl_ch)
2938 if max_mapping
is not None:
2939 something_happened =
True
2941 mapping[max_mapping[0]] = max_mapping[1]
2945 for neighbor
in self.
neighbors[max_mapping[0]]:
2946 if neighbor
not in mapping:
2947 map_targets.add(neighbor)
2950 map_targets.remove(max_mapping[0])
2954 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2957 newly_mapped_ref_chains.append(max_mapping[0])
2964 if chains_to_optimize
is None:
2965 chains_to_optimize = mapping.keys()
2968 ref_chains = [x
for x
in chains_to_optimize
if mapping[x]
is not None]
2972 for ch
in ref_chains:
2974 if chem_group_idx
in tmp:
2975 tmp[chem_group_idx].append(ch)
2977 tmp[chem_group_idx] = [ch]
2978 chem_groups = list(tmp.values())
2983 current_score = score_result.QS_global
2984 something_happened =
True
2985 while something_happened:
2986 something_happened =
False
2987 for chem_group
in chem_groups:
2988 if something_happened:
2990 for ch1, ch2
in itertools.combinations(chem_group, 2):
2991 swapped_mapping = dict(mapping)
2992 swapped_mapping[ch1] = mapping[ch2]
2993 swapped_mapping[ch2] = mapping[ch1]
2995 if score_result.QS_global > current_score:
2996 something_happened =
True
2997 mapping = swapped_mapping
2998 current_score = score_result.QS_global
3010 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
3011 score_result = qs_scorer.Score(mapping, check=
False)
3012 if score_result.QS_global > best_score:
3013 best_mapping = mapping
3014 best_score = score_result.QS_global
3015 return (best_mapping, best_score)
3018 """ Uses each reference chain as starting point for expansion
3021 seeds =
_GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
3022 best_overall_score = -1.0
3023 best_overall_mapping = dict()
3028 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
3031 something_happened =
True
3032 while something_happened:
3033 something_happened =
False
3034 remnant_seeds =
_GetSeeds(the_greed.ref_chem_groups,
3035 the_greed.mdl_chem_groups,
3036 mapped_ref_chains = set(mapping.keys()),
3037 mapped_mdl_chains = set(mapping.values()))
3038 if len(remnant_seeds) > 0:
3042 for remnant_seed
in remnant_seeds:
3043 tmp_mapping = dict(mapping)
3044 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
3045 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
3046 score_result = the_greed.FromFlatMapping(tmp_mapping)
3047 if score_result.QS_global > best_score:
3048 best_score = score_result.QS_global
3049 best_mapping = tmp_mapping
3050 if best_mapping
is not None:
3051 something_happened =
True
3052 mapping = best_mapping
3054 score_result = the_greed.FromFlatMapping(mapping)
3055 if score_result.QS_global > best_overall_score:
3056 best_overall_score = score_result.QS_global
3057 best_overall_mapping = mapping
3059 mapping = best_overall_mapping
3062 final_mapping = list()
3063 for ref_chains
in the_greed.ref_chem_groups:
3064 mapped_mdl_chains = list()
3065 for ref_ch
in ref_chains:
3066 if ref_ch
in mapping:
3067 mapped_mdl_chains.append(mapping[ref_ch])
3069 mapped_mdl_chains.append(
None)
3070 final_mapping.append(mapped_mdl_chains)
3072 return final_mapping
3076 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
3077 respective chem groups and compute single chain lDDTs. The
3078 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
3079 and the best scoring one with respect to QS score is exhaustively extended.
3082 if seed_size
is None or seed_size < 1:
3083 raise RuntimeError(f
"seed_size must be an int >= 1 (got {seed_size})")
3085 if blocks_per_chem_group
is None or blocks_per_chem_group < 1:
3086 raise RuntimeError(f
"blocks_per_chem_group must be an int >= 1 "
3087 f
"(got {blocks_per_chem_group})")
3089 max_ext = seed_size - 1
3091 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
3092 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
3096 something_happened =
True
3097 while something_happened:
3098 something_happened =
False
3099 starting_blocks = list()
3100 for ref_chains, mdl_chains
in zip(ref_chem_groups, mdl_chem_groups):
3101 if len(mdl_chains) == 0:
3103 ref_chains_copy = list(ref_chains)
3104 for i
in range(blocks_per_chem_group):
3105 if len(ref_chains_copy) == 0:
3108 for ref_ch
in ref_chains_copy:
3109 seeds += [(ref_ch, mdl_ch)
for mdl_ch
in mdl_chains]
3116 seed = dict(mapping)
3117 seed.update({s[0]: s[1]})
3118 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
3119 score_result = the_greed.FromFlatMapping(seed)
3120 if score_result.QS_global > best_score:
3121 best_score = score_result.QS_global
3124 if best_mapping !=
None:
3125 starting_blocks.append(best_mapping)
3126 if best_seed[0]
in ref_chains_copy:
3128 ref_chains_copy.remove(best_seed[0])
3133 for seed
in starting_blocks:
3134 seed = the_greed.ExtendMapping(seed)
3135 score_result = the_greed.FromFlatMapping(seed)
3136 if score_result.QS_global > best_score:
3137 best_score = score_result.QS_global
3140 if best_mapping
is not None and len(best_mapping) > len(mapping):
3143 something_happened =
True
3144 mapping.update(best_mapping)
3145 for ref_ch, mdl_ch
in best_mapping.items():
3146 for group_idx
in range(len(ref_chem_groups)):
3147 if ref_ch
in ref_chem_groups[group_idx]:
3148 ref_chem_groups[group_idx].remove(ref_ch)
3149 if mdl_ch
in mdl_chem_groups[group_idx]:
3150 mdl_chem_groups[group_idx].remove(mdl_ch)
3153 final_mapping = list()
3154 for ref_chains
in the_greed.ref_chem_groups:
3155 mapped_mdl_chains = list()
3156 for ref_ch
in ref_chains:
3157 if ref_ch
in mapping:
3158 mapped_mdl_chains.append(mapping[ref_ch])
3160 mapped_mdl_chains.append(
None)
3161 final_mapping.append(mapped_mdl_chains)
3163 return final_mapping
3166 chem_mapping, trg_group_pos, mdl_group_pos):
3168 Takes initial transforms and sequentially adds chain pairs with lowest RMSD.
3169 The mapping from the transform that leads to lowest overall RMSD is
3172 best_mapping = dict()
3173 best_ssd = float(
"inf")
3177 for transform
in initial_transforms:
3179 mapped_mdl_chains = set()
3181 for trg_chains, mdl_chains, trg_pos, mdl_pos,
in zip(chem_groups,
3185 if len(trg_pos) == 0
or len(mdl_pos) == 0:
3189 for m_pos
in mdl_pos:
3191 t_m_pos.ApplyTransform(transform)
3192 t_mdl_pos.append(t_m_pos)
3193 for t_pos, t
in zip(trg_pos, trg_chains):
3194 for t_m_pos, m
in zip(t_mdl_pos, mdl_chains):
3195 ssd = t_pos.GetSummedSquaredDistances(t_m_pos)
3196 ssds.append((ssd, (t,m)))
3200 if p[0]
not in mapping
and p[1]
not in mapped_mdl_chains:
3201 mapping[p[0]] = p[1]
3202 mapped_mdl_chains.add(p[1])
3207 best_mapping = mapping
3213 chem_mapping, trg_group_pos, mdl_group_pos):
3215 Takes initial transforms and sequentially adds chain pairs with lowest
3217 The mapping from the transform that leads to lowest overall RMSD of
3218 the centroids is returned.
3220 best_mapping = dict()
3221 best_rmsd = float(
"inf")
3223 trg_group_centroids = list()
3224 mdl_group_centroids = list()
3226 for trg_pos, mdl_pos,
in zip(trg_group_pos, mdl_group_pos):
3227 trg_group_centroids.append(
geom.Vec3List([p.center
for p
in trg_pos]))
3228 mdl_group_centroids.append(
geom.Vec3List([p.center
for p
in mdl_pos]))
3230 for transform
in initial_transforms:
3232 mapped_mdl_chains = set()
3233 mapped_trg_centroids = list()
3234 mapped_mdl_centroids = list()
3235 for trg_chains, mdl_chains, trg_centroids, mdl_centroids,
in zip(chem_groups,
3237 trg_group_centroids,
3238 mdl_group_centroids):
3239 if len(trg_centroids) == 0
or len(mdl_centroids) == 0:
3243 t_mdl_centroids.ApplyTransform(transform)
3245 for trg_idx
in range(len(trg_chains)):
3246 for mdl_idx
in range(len(mdl_chains)):
3247 distances.append((
geom.Length2(trg_centroids[trg_idx]-t_mdl_centroids[mdl_idx]),
3248 (trg_chains[trg_idx], mdl_chains[mdl_idx]),
3249 (trg_centroids[trg_idx], t_mdl_centroids[mdl_idx])))
3252 for item
in distances:
3254 if p[0]
not in mapping
and p[1]
not in mapped_mdl_chains:
3255 mapping[p[0]] = p[1]
3256 mapped_mdl_chains.add(p[1])
3257 mapped_trg_centroids.append(item[2][0])
3258 mapped_mdl_centroids.append(item[2][1])
3261 if len(mapping) == 0:
3262 raise RuntimeError(
"Empty mapping in _SingleRigidCentroid")
3263 elif len(mapping) == 1:
3266 elif len(mapping) == 2:
3271 mapped_trg_centroids[1])
3273 mapped_mdl_centroids[1])
3277 dd = abs(ref_d-mdl_d)
3286 mapped_mdl_centroids,
False).rmsd
3288 if rmsd < best_rmsd:
3290 best_mapping = mapping
3296 chem_mapping, trg_group_pos, mdl_group_pos):
3297 """ Takes initial transforms and sequentially adds chain pairs with
3298 lowest RMSD. With each added chain pair, the transform gets updated.
3299 Thus the naming iterative. The mapping from the initial transform that
3300 leads to best overall RMSD score is returned.
3304 trg_pos_dict = dict()
3305 for trg_pos, trg_chains
in zip(trg_group_pos, chem_groups):
3306 for t_pos, t
in zip(trg_pos, trg_chains):
3307 trg_pos_dict[t] = t_pos
3308 mdl_pos_dict = dict()
3309 for mdl_pos, mdl_chains
in zip(mdl_group_pos, chem_mapping):
3310 for m_pos, m
in zip(mdl_pos, mdl_chains):
3311 mdl_pos_dict[m] = m_pos
3313 best_mapping = dict()
3314 best_rmsd = float(
"inf")
3315 for initial_transform, initial_mapping
in zip(initial_transforms,
3317 mapping = {initial_mapping[0]: initial_mapping[1]}
3318 transform =
geom.Mat4(initial_transform)
3319 mapped_trg_pos =
geom.Vec3List(trg_pos_dict[initial_mapping[0]])
3320 mapped_mdl_pos =
geom.Vec3List(mdl_pos_dict[initial_mapping[1]])
3324 trg_chain_groups = [set(group)
for group
in chem_groups]
3325 mdl_chain_groups = [set(group)
for group
in chem_mapping]
3328 for group
in trg_chain_groups:
3329 if initial_mapping[0]
in group:
3330 group.remove(initial_mapping[0])
3332 for group
in mdl_chain_groups:
3333 if initial_mapping[1]
in group:
3334 group.remove(initial_mapping[1])
3337 something_happened =
True
3338 while something_happened:
3340 something_happened=
False
3341 best_sc_mapping =
None
3342 best_sc_group_idx =
None
3343 best_sc_rmsd = float(
"inf")
3345 for trg_chains, mdl_chains
in zip(trg_chain_groups, mdl_chain_groups):
3346 for t
in trg_chains:
3347 t_pos = trg_pos_dict[t]
3348 for m
in mdl_chains:
3349 m_pos = mdl_pos_dict[m]
3351 t_m_pos.ApplyTransform(transform)
3352 rmsd = t_pos.GetRMSD(t_m_pos)
3353 if rmsd < best_sc_rmsd:
3355 best_sc_mapping = (t,m)
3356 best_sc_group_idx = group_idx
3359 if best_sc_mapping
is not None:
3360 something_happened =
True
3361 mapping[best_sc_mapping[0]] = best_sc_mapping[1]
3362 mapped_trg_pos.extend(trg_pos_dict[best_sc_mapping[0]])
3363 mapped_mdl_pos.extend(mdl_pos_dict[best_sc_mapping[1]])
3364 trg_chain_groups[best_sc_group_idx].remove(best_sc_mapping[0])
3365 mdl_chain_groups[best_sc_group_idx].remove(best_sc_mapping[1])
3368 False).transformation
3371 mapped_mdl_pos.ApplyTransform(transform)
3372 rmsd = mapped_trg_pos.GetRMSD(mapped_mdl_pos)
3374 if rmsd < best_rmsd:
3376 best_mapping = mapping
3380def _NaiveRMSD(chem_groups, chem_mapping, trg_group_pos, mdl_group_pos,
3384 trg_pos_dict = dict()
3385 for trg_pos, trg_chains
in zip(trg_group_pos, chem_groups):
3386 for t_pos, t
in zip(trg_pos, trg_chains):
3387 trg_pos_dict[t] = t_pos
3388 mdl_pos_dict = dict()
3389 for mdl_pos, mdl_chains
in zip(mdl_group_pos, chem_mapping):
3390 for m_pos, m
in zip(mdl_pos, mdl_chains):
3391 mdl_pos_dict[m] = m_pos
3393 best_mapping = [[
None]*len(x)
for x
in chem_groups]
3394 best_rmsd = float(
"inf")
3396 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
3399 for trg_group, mdl_group
in zip(chem_groups, mapping):
3400 for trg_ch, mdl_ch
in zip(trg_group, mdl_group):
3401 if trg_ch
is not None and mdl_ch
is not None:
3402 trg_pos.extend(trg_pos_dict[trg_ch])
3403 mdl_pos.extend(mdl_pos_dict[mdl_ch])
3404 if len(mdl_pos) >= 3:
3405 superpose_res = mol.alg.SuperposeSVD(mdl_pos, trg_pos)
3406 if superpose_res.rmsd < best_rmsd:
3407 best_rmsd = superpose_res.rmsd
3408 best_mapping = mapping
3412 for chem_group, mapping
in zip(chem_groups, best_mapping):
3413 for trg_ch, mdl_ch
in zip(chem_group, mapping):
3414 tmp[trg_ch] = mdl_ch
3420 """ Extracts reference positions which are present in trg and mdl
3425 for aln_list
in mdl_alns:
3426 if len(aln_list) > 0:
3427 tmp = aln_list[0].GetSequence(0)
3428 ref_seq = seq.CreateSequence(tmp.GetName(), tmp.GetGaplessString())
3429 mdl_msas.append(seq.alg.MergePairwiseAlignments(aln_list, ref_seq))
3431 mdl_msas.append(seq.CreateAlignment())
3441 for trg_msa, mdl_msa
in zip(trg_msas, mdl_msas):
3443 if mdl_msa.GetCount() > 0:
3445 assert(trg_msa.GetSequence(0).GetGaplessString() == \
3446 mdl_msa.GetSequence(0).GetGaplessString())
3459 indices = sorted(list(trg_indices.intersection(mdl_indices)))
3462 if max_pos
is not None and len(indices) > max_pos:
3463 step = int(len(indices)/max_pos)
3464 indices = [indices[i]
for i
in range(0, len(indices), step)]
3471 trg_pos.append(list())
3472 mdl_pos.append(list())
3475 for s_idx
in range(1, trg_msa.GetCount()):
3479 for s_idx
in range(1, mdl_msa.GetCount()):
3483 return (trg_pos, mdl_pos)
3486 """ Helper for _GetRefPos
3488 Returns a dict with key: chain name and value: a geom.Vec3List of backbone
3489 atom positions. Assumes that either CA or C3' is always there.
3492 for ch
in ent.chains:
3494 for r
in ch.residues:
3495 a = r.FindAtom(
"CA")
3497 a = r.FindAtom(
"C3'")
3498 l.append(a.GetPos())
3503 """ Helper for _GetRefPos
3505 Returns a set containing the indices relative to first sequence in msa which
3506 are fully covered in all other sequences
3517 if sum([1
for olc
in col
if olc !=
'-']) == col.GetRowCount():
3518 indices.add(ref_idx)
3524 """ Helper for _GetRefPos
3526 Returns a list of mapped indices. indices refer to non-gap one letter
3527 codes in the first msa sequence. The returnes mapped indices are translated
3528 to the according msa column indices
3532 for col_idx, col
in enumerate(msa):
3534 mapping[ref_idx] = col_idx
3536 return [mapping[i]
for i
in indices]
3539 """ Helper for _GetRefPos
3541 Returns a geom.Vec3List containing positions refering to given msa sequence.
3542 => Chain with corresponding name is mapped onto sequence and the position of
3543 the first atom of each residue specified in indices is extracted.
3544 Indices refers to column indices in msa!
3546 s = msa.GetSequence(s_idx)
3547 ch_bb = bb[s.GetName()]
3550 assert(len(s.GetGaplessString()) == len(ch_bb))
3552 residue_idx = [s.GetResidueIndex(i)
for i
in indices]
3557 """ Number of mappings within one chem group
3559 :param ref_chains: Reference chains
3560 :type ref_chains: :class:`list` of :class:`str`
3561 :param mdl_chains: Model chains that are mapped onto *ref_chains*
3562 :type mdl_chains: :class:`list` of :class:`str`
3563 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3565 n_ref = len(ref_chains)
3566 n_mdl = len(mdl_chains)
3568 return factorial(n_ref)
3570 n_choose_k = binom(n_ref, n_mdl)
3571 return n_choose_k * factorial(n_mdl)
3573 n_choose_k = binom(n_mdl, n_ref)
3574 return n_choose_k * factorial(n_ref)
3577 """ Number of mappings for a full chem mapping
3579 :param ref_chains: Chem groups of reference
3580 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3581 :param mdl_chains: Model chains that map onto those chem groups
3582 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3583 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3585 assert(len(ref_chains) == len(mdl_chains))
3587 for a,b
in zip(ref_chains, mdl_chains):
3592 """ Check whether total number of mappings is smaller than given maximum
3594 In principle the same as :func:`_NMappings` but it stops as soon as the
3597 :param ref_chains: Chem groups of reference
3598 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3599 :param mdl_chains: Model chains that map onto those chem groups
3600 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3601 :param max_mappings: Number of max allowed mappings
3602 :returns: Whether number of possible mappings of *mdl_chains* onto
3603 *ref_chains* is below or equal *max_mappings*.
3605 assert(len(ref_chains) == len(mdl_chains))
3607 for a,b
in zip(ref_chains, mdl_chains):
3609 if n > max_mappings:
3614 """ Returns all possible ways to map mdl_chains onto ref_chains
3616 Specific for the case where len(ref_chains) < len(mdl_chains)
3618 for c
in itertools.combinations(mdl_chains, len(ref_chains)):
3619 for p
in itertools.permutations(c):
3623 """ Returns all possible ways to map mdl_chains onto ref_chains
3625 Specific for the case where len(ref_chains) > len(mdl_chains)
3626 Ref chains without mapped mdl chain are assigned None
3628 n_ref = len(ref_chains)
3629 n_mdl = len(mdl_chains)
3630 for c
in itertools.combinations(range(n_ref), n_mdl):
3631 for p
in itertools.permutations(mdl_chains):
3632 ret_list = [
None] * n_ref
3633 for idx, ch
in zip(c, p):
3638 """ Returns all possible ways to map mdl_chains onto ref_chains
3640 Specific for the case where len(ref_chains) == len(mdl_chains)
3642 for p
in itertools.permutations(mdl_chains):
3649 for item
in itertools.product(*iterators):
3653 """Returns all possible ways to map *mdl_chains* onto fixed *ref_chains*
3655 :param ref_chains: List of list of chemically equivalent chains in reference
3656 :type ref_chains: :class:`list` of :class:`list`
3657 :param mdl_chains: Equally long list of list of chemically equivalent chains
3658 in model that map on those ref chains.
3659 :type mdl_chains: :class:`list` of :class:`list`
3660 :param n_max: Aborts and raises :class:`RuntimeError` if max number of
3661 mappings is above this threshold.
3662 :type n_max: :class:`int`
3663 :returns: Iterator over all possible mappings of *mdl_chains* onto fixed
3664 *ref_chains*. Potentially contains None as padding when number of
3665 model chains for a certain mapping is smaller than the according
3667 Example: _ChainMappings([['A', 'B', 'C'], ['D', 'E']],
3668 [['x', 'y'], ['i', 'j']])
3669 gives an iterator over: [[['x', 'y', None], ['i', 'j']],
3670 [['x', 'y', None], ['j', 'i']],
3671 [['y', 'x', None], ['i', 'j']],
3672 [['y', 'x', None], ['j', 'i']],
3673 [['x', None, 'y'], ['i', 'j']],
3674 [['x', None, 'y'], ['j', 'i']],
3675 [['y', None, 'x'], ['i', 'j']],
3676 [['y', None, 'x'], ['j', 'i']],
3677 [[None, 'x', 'y'], ['i', 'j']],
3678 [[None, 'x', 'y'], ['j', 'i']],
3679 [[None, 'y', 'x'], ['i', 'j']],
3680 [[None, 'y', 'x'], ['j', 'i']]]
3682 assert(len(ref_chains) == len(mdl_chains))
3684 if n_max
is not None:
3686 raise RuntimeError(f
"Too many mappings. Max allowed: {n_max}")
3691 for ref, mdl
in zip(ref_chains, mdl_chains):
3694 elif len(ref) == len(mdl):
3696 elif len(ref) < len(mdl):
3705 """ Computes minimal RMSD superposition for pos_one onto pos_two
3707 :param pos_one: Positions that should be superposed onto *pos_two*
3708 :type pos_one: :class:`geom.Vec3List`
3709 :param pos_two: Reference positions
3710 :type pos_two: :class:`geom.Vec3List`
3711 :iterative: Whether iterative superposition should be used. Iterative
3712 potentially raises, uses standard superposition as fallback.
3713 :type iterative: :class:`bool`
3714 :returns: Transformation matrix to superpose *pos_one* onto *pos_two*
3715 :rtype: :class:`ost.mol.alg.SuperpositionResult`
3720 res = mol.alg.IterativeSuperposeSVD(pos_one, pos_two)
3724 res = mol.alg.SuperposeSVD(pos_one, pos_two)
3728__all__ = (
'ChainMapper',
'ReprResult',
'MappingResult')
FromFlatMapping(self, flat_mapping)
_NPairConserved(self, trg_int, mdl_int)
_NSCConserved(self, trg_ch, mdl_ch)
SCCounts(self, ref_ch, mdl_ch)
ExtendMapping(self, mapping, max_ext=None)
__init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, contact_d=12.0, steep_opt_rate=None, greedy_prune_contact_map=False)
_SteepOpt(self, mapping, chains_to_optimize=None)
ExtendMapping(self, mapping, max_ext=None)
_SteepOpt(self, mapping, chains_to_optimize=None)
__init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], steep_opt_rate=None)
chem_group_alignments(self)
GetMapping(self, model, n_max_naive=40320)
__init__(self, target, resnum_alignments=False, pep_seqid_thr=95., nuc_seqid_thr=95., pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-11, pep_gap_ext=-1, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, min_pep_length=6, min_nuc_length=4, n_max_naive=1e8, mdl_map_pep_seqid_thr=0., mdl_map_nuc_seqid_thr=0., seqres=None, trg_seqres_mapping=None)
GetRMSDMapping(self, model, strategy="heuristic", subsampling=50, chem_mapping_result=None, heuristic_n_max_naive=120)
_ChemGroupAlignmentsFromATOMSEQ(self)
NWAlign(self, s1, s2, stype)
ResNumAlign(self, s1, s2, s1_ent, s2_ent)
chem_group_ref_seqs(self)
_GroupSequences(self, seqs, seqid_thr, min_length, chem_type)
_MapSequence(self, ref_seqs, ref_types, s, s_type, s_ent, seq_id_thr=0.0, min_aln_length=0)
_GroupSequencesSEQRES(self, poly_seqs)
SEQRESAlign(self, seqres, s, s_ent)
_ChemGroupAlignmentsFromSEQRES(self)
GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False, only_interchain=False, chem_mapping_result=None, global_mapping=None)
GetQSScoreMapping(self, model, contact_d=12.0, strategy="heuristic", block_seed_size=5, block_blocks_per_chem_group=5, heuristic_n_max_naive=40320, steep_opt_rate=None, chem_mapping_result=None, greedy_prune_contact_map=True)
GetAFMMapping(self, model, chem_mapping_result=None)
GetlDDTMapping(self, model, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic", steep_opt_rate=None, block_seed_size=5, block_blocks_per_chem_group=5, chem_mapping_result=None, heuristic_n_max_naive=40320)
GetChemMapping(self, model)
ProcessStructure(self, ent)
GetNMappings(self, model)
GetFlatMapping(self, mdl_as_key=False)
mdl_chains_without_chem_mapping(self)
_mdl_chains_without_chem_mapping
__init__(self, target, model, chem_groups, chem_mapping, mdl_chains_without_chem_mapping, mapping, alns, opt_score=None)
__init__(self, lDDT, substructure, ref_view, mdl_view)
GetFlatChainMapping(self, mdl_as_key=False)
_GetInconsistentResidues(self, ref_residues, mdl_residues)
superposed_mdl_bb_pos(self)
_GetBBPos(self, residues)
inconsistent_residues(self)
_GetFullBBPos(self, residues)
FromFlatMapping(self, flat_mapping)
_MappedInterfaceScores(self, int1, int2)
_InterfacePenalty2(self, interface)
_InterfacePenalty1(self, interface)
Real DLLEXPORT_OST_GEOM Distance(const Line2 &l, const Vec2 &v)
Real Length2(const Vec2 &v)
returns squared length of vector
_ChainMappings(ref_chains, mdl_chains, n_max=None)
_GetSuperposition(pos_one, pos_two, iterative)
_lDDTGreedyFull(the_greed)
_IterativeRigidRMSD(initial_transforms, initial_mappings, chem_groups, chem_mapping, trg_group_pos, mdl_group_pos)
_RefSmallerGenerator(ref_chains, mdl_chains)
_RefLargerGenerator(ref_chains, mdl_chains)
_RefEmptyGenerator(ref_chains, mdl_chains)
_GetRefMdlAlns(ref_chem_groups, ref_chem_group_msas, mdl_chem_groups, mdl_chem_group_alns, pairs=None)
_lDDTGreedyBlock(the_greed, seed_size, blocks_per_chem_group)
_NaiveRMSD(chem_groups, chem_mapping, trg_group_pos, mdl_group_pos, n_max_naive)
_RefIndicesToColumnIndices(msa, indices)
_SingleRigidCentroid(initial_transforms, initial_mappings, chem_groups, chem_mapping, trg_group_pos, mdl_group_pos)
_CheckOneToOneMapping(ref_chains, mdl_chains)
_QSScoreNaive(trg, mdl, chem_groups, chem_mapping, ref_mdl_alns, contact_d, n_max_naive)
_NMappings(ref_chains, mdl_chains)
_ExtractMSAPos(msa, s_idx, indices, bb)
_QSScoreGreedyBlock(the_greed, seed_size, blocks_per_chem_group)
_lDDTNaive(trg, mdl, inclusion_radius, thresholds, chem_groups, chem_mapping, ref_mdl_alns, n_max_naive)
_GetRefPos(trg, mdl, trg_msas, mdl_alns, max_pos=None)
_GetFullyCoveredIndices(msa)
_NChemGroupMappings(ref_chains, mdl_chains)
_GetSeeds(ref_chem_groups, mdl_chem_groups, mapped_ref_chains=set(), mapped_mdl_chains=set())
_SingleRigidRMSD(initial_transforms, initial_mappings, chem_groups, chem_mapping, trg_group_pos, mdl_group_pos)
_RefEqualGenerator(ref_chains, mdl_chains)
_QSScoreGreedyFull(the_greed)
_ConcatIterators(iterators)
_NMappingsWithin(ref_chains, mdl_chains, max_mappings)