2 Chain mapping aims to identify a one-to-one relationship between chains in a
3 reference structure and a model.
11 from scipy.special
import factorial
12 from scipy.special
import binom
24 def _CSel(ent, cnames):
25 """ Returns view with specified chains
27 Ensures that quotation marks are around chain names to not confuse
28 OST query language with weird special characters.
30 query =
"cname=" +
','.join([mol.QueryQuoteName(cname)
for cname
in cnames])
31 return ent.Select(query)
34 """ Result object for the chain mapping functions in :class:`ChainMapper`
36 Constructor is directly called within the functions, no need to construct
37 such objects yourself.
39 def __init__(self, target, model, chem_groups, chem_mapping,
40 unmapped_mdl_chains, mapping, alns, opt_score=None):
47 self.
_alns_alns = alns
52 """ Target/reference structure, i.e. :attr:`ChainMapper.target`
54 :type: :class:`ost.mol.EntityView`
60 """ Model structure that gets mapped onto :attr:`~target`
62 Underwent same processing as :attr:`ChainMapper.target`, i.e.
63 only contains peptide/nucleotide chains of sufficient size.
65 :type: :class:`ost.mol.EntityView`
71 """ Groups of chemically equivalent chains in :attr:`~target`
73 Same as :attr:`ChainMapper.chem_group`
75 :class:`list` of :class:`list` of :class:`str` (chain names)
81 """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`.
83 :class:`list` of :class:`list` of :class:`str` (chain names)
89 """ Model chains that cannot be mapped to :attr:`chem_groups`
91 Depends on parameterization of :class:`ChainMapper`
93 :class:`list` of class:`str` (chain names)
99 """ Mapping of :attr:`~model` chains onto :attr:`~target`
101 Exact same shape as :attr:`~chem_groups` but containing the names of the
102 mapped chains in :attr:`~model`. May contain None for :attr:`~target`
103 chains that are not covered. No guarantee that all chains in
104 :attr:`~model` are mapped.
106 :class:`list` of :class:`list` of :class:`str` (chain names)
112 """ Alignments of mapped chains in :attr:`~target` and :attr:`~model`
114 Each alignment is accessible with ``alns[(t_chain,m_chain)]``. First
115 sequence is the sequence of :attr:`target` chain, second sequence the
116 one from :attr:`~model`. The respective :class:`ost.mol.EntityView` are
117 attached with :func:`ost.seq.ConstSequenceHandle.AttachView`.
119 :type: :class:`dict` with key: :class:`tuple` of :class:`str`, value:
120 :class:`ost.seq.AlignmentHandle`
122 return self.
_alns_alns
126 """ Placeholder property without any guarantee of being set
128 Different scores get optimized in the various chain mapping algorithms.
129 Some of them may set their final optimal score in that property.
130 Consult the documentation of the respective chain mapping algorithm
131 for more information. Won't be in the return dict of
137 """ Returns flat mapping as :class:`dict` for all mapable chains
139 :param mdl_as_key: Default is target chain name as key and model chain
140 name as value. This can be reversed with this flag.
141 :returns: :class:`dict` with :class:`str` as key/value that describe
144 flat_mapping = dict()
145 for trg_chem_group, mdl_chem_group
in zip(self.
chem_groupschem_groups,
147 for a,b
in zip(trg_chem_group, mdl_chem_group):
148 if a
is not None and b
is not None:
156 """ Returns JSON serializable summary of results
159 json_dict[
"chem_groups"] = self.
chem_groupschem_groups
160 json_dict[
"mapping"] = self.
mappingmapping
162 json_dict[
"alns"] = list()
163 for aln
in self.
alnsalns.values():
164 trg_seq = aln.GetSequence(0)
165 mdl_seq = aln.GetSequence(1)
166 aln_dict = {
"trg_ch": trg_seq.GetName(),
"trg_seq": str(trg_seq),
167 "mdl_ch": mdl_seq.GetName(),
"mdl_seq": str(mdl_seq)}
168 json_dict[
"alns"].append(aln_dict)
174 """ Result object for :func:`ChainMapper.GetRepr`
176 Constructor is directly called within the function, no need to construct
177 such objects yourself.
179 :param lDDT: lDDT for this mapping. Depends on how you call
180 :func:`ChainMapper.GetRepr` whether this is backbone only or
182 :type lDDT: :class:`float`
183 :param substructure: The full substructure for which we searched for a
185 :type substructure: :class:`ost.mol.EntityView`
186 :param ref_view: View pointing to the same underlying entity as
187 *substructure* but only contains the stuff that is mapped
188 :type ref_view: :class:`mol.EntityView`
189 :param mdl_view: The matching counterpart in model
190 :type mdl_view: :class:`mol.EntityView`
192 def __init__(self, lDDT, substructure, ref_view, mdl_view):
193 self.
_lDDT_lDDT = lDDT
195 assert(len(ref_view.residues) == len(mdl_view.residues))
212 """ lDDT of representation result
214 Depends on how you call :func:`ChainMapper.GetRepr` whether this is
215 backbone only or full atom lDDT.
217 :type: :class:`float`
219 return self.
_lDDT_lDDT
223 """ The full substructure for which we searched for a
226 :type: :class:`ost.mol.EntityView`
232 """ View which contains the mapped subset of :attr:`substructure`
234 :type: :class:`ost.mol.EntityView`
240 """ The :attr:`ref_view` representation in the model
242 :type: :class:`ost.mol.EntityView`
248 """ The reference residues
250 :type: class:`mol.ResidueViewList`
252 return self.
ref_viewref_view.residues
256 """ The model residues
258 :type: :class:`mol.ResidueViewList`
260 return self.
mdl_viewmdl_view.residues
264 """ A list of mapped residue whose names do not match (eg. ALA in the
265 reference and LEU in the model).
267 The mismatches are reported as a tuple of :class:`~ost.mol.ResidueView`
268 (reference, model), or as an empty list if all the residue names match.
279 """ Representative backbone positions for reference residues.
281 Thats CA positions for peptides and C3' positions for Nucleotides.
283 :type: :class:`geom.Vec3List`
291 """ Representative backbone positions for model residues.
293 Thats CA positions for peptides and C3' positions for Nucleotides.
295 :type: :class:`geom.Vec3List`
303 """ Representative backbone positions for reference residues.
305 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
306 positions for Nucleotides.
308 :type: :class:`geom.Vec3List`
316 """ Representative backbone positions for reference residues.
318 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
319 positions for Nucleotides.
321 :type: :class:`geom.Vec3List`
330 """ Superposition of mdl residues onto ref residues
332 Superposition computed as minimal RMSD superposition on
333 :attr:`ref_bb_pos` and :attr:`mdl_bb_pos`. If number of positions is
334 smaller 3, the full_bb_pos equivalents are used instead.
336 :type: :class:`ost.mol.alg.SuperpositionResult`
349 """ Transformation to superpose mdl residues onto ref residues
351 :type: :class:`ost.geom.Mat4`
357 """ :attr:`mdl_bb_pos` with :attr:`transform applied`
359 :type: :class:`geom.Vec3List`
368 """ RMSD of the binding site backbone atoms after :attr:`superposition`
370 :type: :class:`float`
377 """ query for mdl residues in OpenStructure query language
379 Repr can be selected as ``full_mdl.Select(ost_query)``
381 Returns invalid query if residue numbers have insertion codes.
388 chname = r.GetChain().GetName()
389 rnum = r.GetNumber().GetNum()
390 if chname
not in chain_rnums:
391 chain_rnums[chname] = list()
392 chain_rnums[chname].append(str(rnum))
393 chain_queries = list()
394 for k,v
in chain_rnums.items():
395 q = f
"(cname={mol.QueryQuoteName(k)} and "
396 q += f
"rnum={','.join(v)})"
397 chain_queries.append(q)
398 self.
_ost_query_ost_query =
" or ".join(chain_queries)
402 """ Returns JSON serializable summary of results
405 json_dict[
"lDDT"] = self.
lDDTlDDT
406 json_dict[
"ref_residues"] = [r.GetQualifiedName()
for r
in \
408 json_dict[
"mdl_residues"] = [r.GetQualifiedName()
for r
in \
410 json_dict[
"transform"] = list(self.
transformtransform.data)
411 json_dict[
"bb_rmsd"] = self.
bb_rmsdbb_rmsd
412 json_dict[
"ost_query"] = self.
ost_queryost_query
417 """ Returns flat mapping of all chains in the representation
419 :param mdl_as_key: Default is target chain name as key and model chain
420 name as value. This can be reversed with this flag.
421 :returns: :class:`dict` with :class:`str` as key/value that describe
424 flat_mapping = dict()
427 flat_mapping[mdl_res.chain.name] = trg_res.chain.name
429 flat_mapping[trg_res.chain.name] = mdl_res.chain.name
432 def _GetFullBBPos(self, residues):
433 """ Helper to extract full backbone positions
435 exp_pep_atoms = [
"N",
"CA",
"C"]
436 exp_nuc_atoms = [
"\"O5'\"",
"\"C5'\"",
"\"C4'\"",
"\"C3'\"",
"\"O3'\""]
439 if r.GetChemType() == mol.ChemType.NUCLEOTIDES:
440 exp_atoms = exp_nuc_atoms
441 elif r.GetChemType() == mol.ChemType.AMINOACIDS:
442 exp_atoms = exp_pep_atoms
444 raise RuntimeError(
"Something terrible happened... RUN...")
445 for aname
in exp_atoms:
446 a = r.FindAtom(aname)
448 raise RuntimeError(
"Something terrible happened... "
450 bb_pos.append(a.GetPos())
453 def _GetBBPos(self, residues):
454 """ Helper to extract single representative position for each residue
458 at = r.FindAtom(
"CA")
460 at = r.FindAtom(
"C3'")
462 raise RuntimeError(
"Something terrible happened... RUN...")
463 bb_pos.append(at.GetPos())
466 def _GetInconsistentResidues(self, ref_residues, mdl_residues):
467 """ Helper to extract a list of inconsistent residues.
469 if len(ref_residues) != len(mdl_residues):
470 raise ValueError(
"Something terrible happened... Reference and "
471 "model lengths differ... RUN...")
472 inconsistent_residues = list()
473 for ref_residue, mdl_residue
in zip(ref_residues, mdl_residues):
474 if ref_residue.name != mdl_residue.name:
475 inconsistent_residues.append((ref_residue, mdl_residue))
476 return inconsistent_residues
480 """ Class to compute chain mappings
482 All algorithms are performed on processed structures which fulfill
483 criteria as given in constructor arguments (*min_pep_length*,
484 "min_nuc_length") and only contain residues which have all required backbone
485 atoms. for peptide residues thats N, CA, C and CB (no CB for GLY), for
486 nucleotide residues thats O5', C5', C4', C3' and O3'.
488 Chain mapping is a three step process:
490 * Group chemically identical chains in *target* using pairwise
491 alignments that are either computed with Needleman-Wunsch (NW) or
492 simply derived from residue numbers (*resnum_alignments* flag).
493 In case of NW, *pep_subst_mat*, *pep_gap_open* and *pep_gap_ext*
494 and their nucleotide equivalents are relevant. Two chains are
495 considered identical if they fulfill the *pep_seqid_thr* and
496 have at least *min_pep_length* aligned residues. Same logic
497 is applied for nucleotides with respective thresholds.
499 * Map chains in an input model to these groups. Parameters for generating
500 alignments are the same as above. Model chains are mapped to the chem
501 group with highest sequence identity. Optionally, you can set a minimum
502 sequence identity threshold with *mdl_map_pep_seqid_thr*/
503 *mdl_map_nuc_seqid_thr* to avoid nonsensical mappings. You can either
504 get the group mapping with :func:`GetChemMapping` or directly call
505 one of the full fletched one-to-one chain mapping functions which
506 execute that step internally.
508 * Obtain one-to-one mapping for chains in an input model and
509 *target* with one of the available mapping functions. Just to get an
510 idea of complexity. If *target* and *model* are octamers, there are
511 ``8! = 40320`` possible chain mappings.
513 :param target: Target structure onto which models are mapped.
514 Computations happen on a selection only containing
515 polypeptides and polynucleotides.
516 :type target: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
517 :param resnum_alignments: Use residue numbers instead of
518 Needleman-Wunsch to compute pairwise
519 alignments. Relevant for :attr:`~chem_groups`
520 and related attributes.
521 :type resnum_alignments: :class:`bool`
522 :param pep_seqid_thr: Threshold used to decide when two chains are
523 identical. 95 percent tolerates the few mutations
524 crystallographers like to do.
525 :type pep_seqid_thr: :class:`float`
526 :param nuc_seqid_thr: Nucleotide equivalent for *pep_seqid_thr*
527 :type nuc_seqid_thr: :class:`float`
528 :param pep_subst_mat: Substitution matrix to align peptide sequences,
529 irrelevant if *resnum_alignments* is True,
530 defaults to seq.alg.BLOSUM62
531 :type pep_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
532 :param pep_gap_open: Gap open penalty to align peptide sequences,
533 irrelevant if *resnum_alignments* is True
534 :type pep_gap_open: :class:`int`
535 :param pep_gap_ext: Gap extension penalty to align peptide sequences,
536 irrelevant if *resnum_alignments* is True
537 :type pep_gap_ext: :class:`int`
538 :param nuc_subst_mat: Nucleotide equivalent for *pep_subst_mat*,
539 defaults to seq.alg.NUC44
540 :type nuc_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
541 :param nuc_gap_open: Nucleotide equivalent for *pep_gap_open*
542 :type nuc_gap_open: :class:`int`
543 :param nuc_gap_ext: Nucleotide equivalent for *pep_gap_ext*
544 :type nuc_gap_ext: :class:`int`
545 :param min_pep_length: Minimal number of residues for a peptide chain to be
546 considered in target and in models.
547 :type min_pep_length: :class:`int`
548 :param min_nuc_length: Minimal number of residues for a nucleotide chain to be
549 considered in target and in models.
550 :type min_nuc_length: :class:`int`
551 :param n_max_naive: Max possible chain mappings that are enumerated in
552 :func:`~GetNaivelDDTMapping` /
553 :func:`~GetDecomposerlDDTMapping`. A
554 :class:`RuntimeError` is raised in case of bigger
556 :type n_max_naive: :class:`int`
557 :param mdl_map_pep_seqid_thr: To avoid non-sensical mapping of model chains
558 to reference chem groups. If a value larger
559 than 0.0 is provided, minimal criteria for
560 assignment becomes a sequence identity of
561 *mdl_map_pep_seqid_thr* and at least
562 *min_pep_length* aligned residues. If set to
563 zero, it simply assigns a model chain to the
564 chem group with highest sequence identity.
565 :type mdl_map_pep_seqid_thr: :class:`float`
566 :param mdl_map_nuc_seqid_thr: Nucleotide equivalent of
567 *mdl_map_pep_seqid_thr*
568 :type mdl_map_nuc_seqid_thr: :class:`float`
570 def __init__(self, target, resnum_alignments=False,
571 pep_seqid_thr = 95., nuc_seqid_thr = 95.,
572 pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -11,
573 pep_gap_ext = -1, nuc_subst_mat = seq.alg.NUC44,
574 nuc_gap_open = -4, nuc_gap_ext = -4,
575 min_pep_length = 6, min_nuc_length = 4,
577 mdl_map_pep_seqid_thr = 0.,
578 mdl_map_nuc_seqid_thr = 0.):
598 pep_subst_mat = pep_subst_mat,
599 pep_gap_open = pep_gap_open,
600 pep_gap_ext = pep_gap_ext,
601 nuc_subst_mat = nuc_subst_mat,
602 nuc_gap_open = nuc_gap_open,
603 nuc_gap_ext = nuc_gap_ext)
606 self._target, self._polypep_seqs, self.
_polynuc_seqs_polynuc_seqs = \
611 """Target structure that only contains peptides/nucleotides
613 Contains only residues that have the backbone representatives
614 (CA for peptide and C3' for nucleotides) to avoid ATOMSEQ alignment
615 inconsistencies when switching between all atom and backbone only
618 :type: :class:`ost.mol.EntityView`
624 """Sequences of peptide chains in :attr:`~target`
626 Respective :class:`EntityView` from *target* for each sequence s are
627 available as ``s.GetAttachedView()``
629 :type: :class:`ost.seq.SequenceList`
631 return self._polypep_seqs
635 """Sequences of nucleotide chains in :attr:`~target`
637 Respective :class:`EntityView` from *target* for each sequence s are
638 available as ``s.GetAttachedView()``
640 :type: :class:`ost.seq.SequenceList`
646 """Groups of chemically equivalent chains in :attr:`~target`
648 First chain in group is the one with longest sequence.
650 :getter: Computed on first use (cached)
651 :type: :class:`list` of :class:`list` of :class:`str` (chain names)
656 self.
_chem_groups_chem_groups.append([s.GetName()
for s
in a.sequences])
661 """MSA for each group in :attr:`~chem_groups`
663 Sequences in MSAs exhibit same order as in :attr:`~chem_groups` and
664 have the respective :class:`ost.mol.EntityView` from *target* attached.
666 :getter: Computed on first use (cached)
667 :type: :class:`ost.seq.AlignmentList`
682 """Reference (longest) sequence for each group in :attr:`~chem_groups`
684 Respective :class:`EntityView` from *target* for each sequence s are
685 available as ``s.GetAttachedView()``
687 :getter: Computed on first use (cached)
688 :type: :class:`ost.seq.SequenceList`
693 s = seq.CreateSequence(a.GetSequence(0).GetName(),
694 a.GetSequence(0).GetGaplessString())
695 s.AttachView(a.GetSequence(0).GetAttachedView())
701 """ChemType of each group in :attr:`~chem_groups`
703 Specifying if groups are poly-peptides/nucleotides, i.e.
704 :class:`ost.mol.ChemType.AMINOACIDS` or
705 :class:`ost.mol.ChemType.NUCLEOTIDES`
707 :getter: Computed on first use (cached)
708 :type: :class:`list` of :class:`ost.mol.ChemType`
722 """Maps sequences in *model* to chem_groups of target
724 :param model: Model from which to extract sequences, a
725 selection that only includes peptides and nucleotides
726 is performed and returned along other results.
727 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
728 :returns: Tuple with two lists of length `len(self.chem_groups)`, a list
729 and an :class:`ost.mol.EntityView` representing *model*:
730 1) Each element is a :class:`list` with mdl chain names that
731 map to the chem group at that position.
732 2) Each element is a :class:`ost.seq.AlignmentList` aligning
733 these mdl chain sequences to the chem group ref sequences.
734 3) The model chains that could not be mapped to the reference
735 4) A selection of *model* that only contains polypeptides and
736 polynucleotides whose ATOMSEQ exactly matches the sequence
737 info in the returned alignments.
739 mdl, mdl_pep_seqs, mdl_nuc_seqs = self.
ProcessStructureProcessStructure(model)
740 mapping = [list()
for x
in self.
chem_groupschem_groups]
741 unmapped_mdl_chains = list()
742 alns = [seq.AlignmentList()
for x
in self.
chem_groupschem_groups]
744 for s
in mdl_pep_seqs:
747 s, mol.ChemType.AMINOACIDS,
752 unmapped_mdl_chains.append(s.GetName())
754 mapping[idx].append(s.GetName())
755 alns[idx].append(aln)
757 for s
in mdl_nuc_seqs:
760 s, mol.ChemType.NUCLEOTIDES,
765 unmapped_mdl_chains.append(s.GetName())
767 mapping[idx].append(s.GetName())
768 alns[idx].append(aln)
770 return (mapping, alns, unmapped_mdl_chains, mdl)
774 thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic",
775 steep_opt_rate = None, block_seed_size = 5,
776 block_blocks_per_chem_group = 5,
777 chem_mapping_result = None,
778 heuristic_n_max_naive = 40320):
779 """ Identify chain mapping by optimizing lDDT score
781 Maps *model* chain sequences to :attr:`~chem_groups` and find mapping
782 based on backbone only lDDT score (CA for amino acids C3' for
785 Either performs a naive search, i.e. enumerate all possible mappings or
786 executes a greedy strategy that tries to identify a (close to) optimal
787 mapping in an iterative way by starting from a start mapping (seed). In
788 each iteration, the one-to-one mapping that leads to highest increase
789 in number of conserved contacts is added with the additional requirement
790 that this added mapping must have non-zero interface counts towards the
791 already mapped chains. So basically we're "growing" the mapped structure
792 by only adding connected stuff.
794 The available strategies:
796 * **naive**: Enumerates all possible mappings and returns best
798 * **greedy_fast**: perform all vs. all single chain lDDTs within the
799 respective ref/mdl chem groups. The mapping with highest number of
800 conserved contacts is selected as seed for greedy extension
802 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
803 all ref/mdl chain combinations within the respective chem groups and
804 retain the mapping leading to the best lDDT.
806 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
807 all ref/mdl chain combinations within the respective chem groups and
808 extend them to *block_seed_size*. *block_blocks_per_chem_group*
809 for each chem group are selected for exhaustive extension.
811 * **heuristic**: Uses *naive* strategy if number of possible mappings
812 is within *heuristic_n_max_naive*. The default of 40320 corresponds
813 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
814 6!*2!=1440 etc. If the number of possible mappings is larger,
815 *greedy_full* is used.
817 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
820 :param model: Model to map
821 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
822 :param inclusion_radius: Inclusion radius for lDDT
823 :type inclusion_radius: :class:`float`
824 :param thresholds: Thresholds for lDDT
825 :type thresholds: :class:`list` of :class:`float`
826 :param strategy: Strategy to find mapping. Must be in ["naive",
827 "greedy_fast", "greedy_full", "greedy_block"]
828 :type strategy: :class:`str`
829 :param steep_opt_rate: Only relevant for greedy strategies.
830 If set, every *steep_opt_rate* mappings, a simple
831 optimization is executed with the goal of
832 avoiding local minima. The optimization
833 iteratively checks all possible swaps of mappings
834 within their respective chem groups and accepts
835 swaps that improve lDDT score. Iteration stops as
836 soon as no improvement can be achieved anymore.
837 :type steep_opt_rate: :class:`int`
838 :param block_seed_size: Param for *greedy_block* strategy - Initial seeds
839 are extended by that number of chains.
840 :type block_seed_size: :class:`int`
841 :param block_blocks_per_chem_group: Param for *greedy_block* strategy -
842 Number of blocks per chem group that
843 are extended in an initial search
844 for high scoring local solutions.
845 :type block_blocks_per_chem_group: :class:`int`
846 :param chem_mapping_result: Pro param. The result of
847 :func:`~GetChemMapping` where you provided
848 *model*. If set, *model* parameter is not
850 :type chem_mapping_result: :class:`tuple`
851 :returns: A :class:`MappingResult`
854 strategies = [
"naive",
"greedy_fast",
"greedy_full",
"greedy_block",
856 if strategy
not in strategies:
857 raise RuntimeError(f
"Strategy must be in {strategies}")
859 if chem_mapping_result
is None:
860 chem_mapping, chem_group_alns, unmapped_mdl_chains, mdl = \
863 chem_mapping, chem_group_alns, unmapped_mdl_chains, mdl = \
866 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
872 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
873 if one_to_one
is not None:
875 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
876 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
877 if ref_ch
is not None and mdl_ch
is not None:
878 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
879 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
880 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
881 alns[(ref_ch, mdl_ch)] = aln
883 unmapped_mdl_chains, one_to_one, alns)
885 if strategy ==
"heuristic":
886 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
887 heuristic_n_max_naive):
890 strategy =
"greedy_full"
895 if strategy ==
"naive":
896 mapping, opt_lddt = _lDDTNaive(self.
targettarget, mdl, inclusion_radius,
898 chem_mapping, ref_mdl_alns,
903 chem_mapping, ref_mdl_alns,
904 inclusion_radius=inclusion_radius,
905 thresholds=thresholds,
906 steep_opt_rate=steep_opt_rate)
907 if strategy ==
"greedy_fast":
908 mapping = _lDDTGreedyFast(the_greed)
909 elif strategy ==
"greedy_full":
910 mapping = _lDDTGreedyFull(the_greed)
911 elif strategy ==
"greedy_block":
912 mapping = _lDDTGreedyBlock(the_greed, block_seed_size,
913 block_blocks_per_chem_group)
915 opt_lddt = the_greed.Score(mapping)
918 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, mapping):
919 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
920 if ref_ch
is not None and mdl_ch
is not None:
921 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
922 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
923 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
924 alns[(ref_ch, mdl_ch)] = aln
927 unmapped_mdl_chains, mapping, alns,
928 opt_score = opt_lddt)
932 block_seed_size = 5, block_blocks_per_chem_group = 5,
933 heuristic_n_max_naive = 40320, steep_opt_rate = None,
934 chem_mapping_result = None,
935 greedy_prune_contact_map = True):
936 """ Identify chain mapping based on QSScore
938 Scoring is based on CA/C3' positions which are present in all chains of
939 a :attr:`chem_groups` as well as the *model* chains which are mapped to
940 that respective chem group.
942 The following strategies are available:
944 * **naive**: Naively iterate all possible mappings and return best based
947 * **greedy_fast**: perform all vs. all single chain lDDTs within the
948 respective ref/mdl chem groups. The mapping with highest number of
949 conserved contacts is selected as seed for greedy extension.
950 Extension is based on QS-score.
952 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
953 all ref/mdl chain combinations within the respective chem groups and
954 retain the mapping leading to the best QS-score.
956 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
957 all ref/mdl chain combinations within the respective chem groups and
958 extend them to *block_seed_size*. *block_blocks_per_chem_group*
959 for each chem group are selected for exhaustive extension.
961 * **heuristic**: Uses *naive* strategy if number of possible mappings
962 is within *heuristic_n_max_naive*. The default of 40320 corresponds
963 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
964 6!*2!=1440 etc. If the number of possible mappings is larger,
965 *greedy_full* is used.
967 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
970 :param model: Model to map
971 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
972 :param contact_d: Max distance between two residues to be considered as
973 contact in qs scoring
974 :type contact_d: :class:`float`
975 :param strategy: Strategy for sampling, must be in ["naive",
976 "greedy_fast", "greedy_full", "greedy_block"]
977 :type strategy: :class:`str`
978 :param chem_mapping_result: Pro param. The result of
979 :func:`~GetChemMapping` where you provided
980 *model*. If set, *model* parameter is not
982 :type chem_mapping_result: :class:`tuple`
983 :param greedy_prune_contact_map: Relevant for all strategies that use
984 greedy extensions. If True, only chains
985 with at least 3 contacts (8A CB
986 distance) towards already mapped chains
987 in trg/mdl are considered for
988 extension. All chains that give a
989 potential non-zero QS-score increase
990 are used otherwise (at least one
991 contact within 12A). The consequence
992 is reduced runtime and usually no
993 real reduction in accuracy.
994 :returns: A :class:`MappingResult`
997 strategies = [
"naive",
"greedy_fast",
"greedy_full",
"greedy_block",
"heuristic"]
998 if strategy
not in strategies:
999 raise RuntimeError(f
"strategy must be {strategies}")
1001 if chem_mapping_result
is None:
1002 chem_mapping, chem_group_alns, unmapped_mdl_chains, mdl = \
1005 chem_mapping, chem_group_alns, unmapped_mdl_chains, mdl = \
1007 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1012 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
1013 if one_to_one
is not None:
1015 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
1016 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1017 if ref_ch
is not None and mdl_ch
is not None:
1018 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1019 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1020 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1021 alns[(ref_ch, mdl_ch)] = aln
1023 unmapped_mdl_chains, one_to_one, alns)
1025 if strategy ==
"heuristic":
1026 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
1027 heuristic_n_max_naive):
1030 strategy =
"greedy_full"
1035 if strategy ==
"naive":
1036 mapping, opt_qsscore = _QSScoreNaive(self.
targettarget, mdl,
1038 chem_mapping, ref_mdl_alns,
1044 chem_mapping, ref_mdl_alns,
1045 contact_d = contact_d,
1046 steep_opt_rate=steep_opt_rate,
1047 greedy_prune_contact_map = greedy_prune_contact_map)
1048 if strategy ==
"greedy_fast":
1049 mapping = _QSScoreGreedyFast(the_greed)
1050 elif strategy ==
"greedy_full":
1051 mapping = _QSScoreGreedyFull(the_greed)
1052 elif strategy ==
"greedy_block":
1053 mapping = _QSScoreGreedyBlock(the_greed, block_seed_size,
1054 block_blocks_per_chem_group)
1056 opt_qsscore = the_greed.Score(mapping, check=
False).QS_global
1059 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, mapping):
1060 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1061 if ref_ch
is not None and mdl_ch
is not None:
1062 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1063 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1064 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1065 alns[(ref_ch, mdl_ch)] = aln
1068 unmapped_mdl_chains, mapping, alns,
1069 opt_score = opt_qsscore)
1072 chem_mapping_result = None, heuristic_n_max_naive = 120):
1073 """Identify chain mapping based on minimal RMSD superposition
1075 Superposition and scoring is based on CA/C3' positions which are present
1076 in all chains of a :attr:`chem_groups` as well as the *model*
1077 chains which are mapped to that respective chem group.
1079 The following strategies are available:
1081 * **naive**: Naively iterate all possible mappings and return the one
1084 * **greedy_single**: perform all vs. all single chain superpositions
1085 within the respective ref/mdl chem groups to use as starting points.
1086 For each starting point, iteratively add the model/target chain pair
1087 with lowest RMSD until a full mapping is achieved. The mapping with
1088 lowest RMSD is returned.
1090 * **greedy_iterative**: Same as greedy_single_rmsd exept that the
1091 transformation gets updated with each added chain pair.
1093 * **heuristic**: Uses *naive* strategy if number of possible mappings
1094 is within *heuristic_n_max_naive*. The default of 120 corresponds
1095 to a homo-pentamer (5!=120). If the number of possible mappings is
1096 larger, *greedy_iterative* is used.
1098 :param model: Model to map
1099 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1100 :param strategy: Strategy for sampling. Must be in ["naive",
1101 "greedy_single", "greedy_iterative"]
1102 :type strategy: :class:`str`
1103 :param subsampling: If given, only an equally distributed subset
1104 of CA/C3' positions in each chain are used for
1105 superposition/scoring.
1106 :type subsampling: :class:`int`
1107 :param chem_mapping_result: Pro param. The result of
1108 :func:`~GetChemMapping` where you provided
1109 *model*. If set, *model* parameter is not
1111 :type chem_mapping_result: :class:`tuple`
1112 :returns: A :class:`MappingResult`
1115 strategies = [
"naive",
"greedy_single",
"greedy_iterative",
"heuristic"]
1117 if strategy
not in strategies:
1118 raise RuntimeError(f
"strategy must be {strategies}")
1120 if chem_mapping_result
is None:
1121 chem_mapping, chem_group_alns, unmapped_mdl_chains, mdl = \
1124 chem_mapping, chem_group_alns, unmapped_mdl_chains, mdl = \
1126 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1132 one_to_one = _CheckOneToOneMapping(self.
chem_groupschem_groups, chem_mapping)
1133 if one_to_one
is not None:
1135 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, one_to_one):
1136 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1137 if ref_ch
is not None and mdl_ch
is not None:
1138 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1139 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1140 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1141 alns[(ref_ch, mdl_ch)] = aln
1143 unmapped_mdl_chains, one_to_one, alns)
1145 trg_group_pos, mdl_group_pos = _GetRefPos(self.
targettarget, mdl,
1148 max_pos = subsampling)
1150 if strategy ==
"heuristic":
1151 if _NMappingsWithin(self.
chem_groupschem_groups, chem_mapping,
1152 heuristic_n_max_naive):
1155 strategy =
"greedy_iterative"
1159 if strategy.startswith(
"greedy"):
1162 initial_transforms = list()
1163 initial_mappings = list()
1164 for trg_pos, trg_chains, mdl_pos, mdl_chains
in zip(trg_group_pos,
1168 for t_pos, t
in zip(trg_pos, trg_chains):
1169 for m_pos, m
in zip(mdl_pos, mdl_chains):
1170 if len(t_pos) >= 3
and len(m_pos) >= 3:
1171 transform = _GetSuperposition(m_pos, t_pos,
1172 False).transformation
1173 initial_transforms.append(transform)
1174 initial_mappings.append((t,m))
1176 if strategy ==
"greedy_single":
1177 mapping = _SingleRigidRMSD(initial_transforms,
1185 elif strategy ==
"greedy_iterative":
1186 mapping = _IterativeRigidRMSD(initial_transforms,
1189 trg_group_pos, mdl_group_pos)
1190 elif strategy ==
"naive":
1191 mapping = _NaiveRMSD(self.
chem_groupschem_groups, chem_mapping,
1192 trg_group_pos, mdl_group_pos,
1196 final_mapping = list()
1198 mapped_mdl_chains = list()
1199 for ref_ch
in ref_chains:
1200 if ref_ch
in mapping:
1201 mapped_mdl_chains.append(mapping[ref_ch])
1203 mapped_mdl_chains.append(
None)
1204 final_mapping.append(mapped_mdl_chains)
1207 for ref_group, mdl_group
in zip(self.
chem_groupschem_groups, final_mapping):
1208 for ref_ch, mdl_ch
in zip(ref_group, mdl_group):
1209 if ref_ch
is not None and mdl_ch
is not None:
1210 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1211 aln.AttachView(0, _CSel(self.
targettarget, [ref_ch]))
1212 aln.AttachView(1, _CSel(mdl, [mdl_ch]))
1213 alns[(ref_ch, mdl_ch)] = aln
1216 unmapped_mdl_chains, final_mapping, alns)
1219 """ Convenience function to get mapping with currently preferred method
1221 If number of possible chain mappings is <= *n_max_naive*, a naive
1222 QS-score mapping is performed and optimal QS-score is guaranteed.
1223 For anything else, a QS-score mapping with the greedy_full strategy is
1224 performed (greedy_prune_contact_map = True). The default for
1225 *n_max_naive* of 40320 corresponds to an octamer (8!=40320). A
1226 structure with stoichiometry A6B2 would be 6!*2!=1440 etc.
1228 If :attr:`~target` has nucleotide sequences, the QS-score target
1229 function is replaced with a backbone only lDDT score that has
1230 an inclusion radius of 30A.
1233 return self.
GetlDDTMappingGetlDDTMapping(model, strategy =
"heuristic",
1234 inclusion_radius = 30.0,
1235 heuristic_n_max_naive = n_max_naive)
1238 greedy_prune_contact_map=
True,
1239 heuristic_n_max_naive = n_max_naive)
1241 def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0,
1242 thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False,
1243 only_interchain=False, chem_mapping_result = None,
1244 global_mapping = None):
1245 """ Identify *topn* representations of *substructure* in *model*
1247 *substructure* defines a subset of :attr:`~target` for which one
1248 wants the *topn* representations in *model*. Representations are scored
1251 :param substructure: A :class:`ost.mol.EntityView` which is a subset of
1252 :attr:`~target`. Should be selected with the
1253 OpenStructure query language. Example: if you're
1254 interested in residues with number 42,43 and 85 in
1256 ``substructure=mapper.target.Select("cname=A and rnum=42,43,85")``
1257 A :class:`RuntimeError` is raised if *substructure*
1258 does not refer to the same underlying
1259 :class:`ost.mol.EntityHandle` as :attr:`~target`.
1260 :type substructure: :class:`ost.mol.EntityView`
1261 :param model: Structure in which one wants to find representations for
1263 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1264 :param topn: Max number of representations that are returned
1265 :type topn: :class:`int`
1266 :param inclusion_radius: Inclusion radius for lDDT
1267 :type inclusion_radius: :class:`float`
1268 :param thresholds: Thresholds for lDDT
1269 :type thresholds: :class:`list` of :class:`float`
1270 :param bb_only: Only consider backbone atoms in lDDT computation
1271 :type bb_only: :class:`bool`
1272 :param only_interchain: Only score interchain contacts in lDDT. Useful
1273 if you want to identify interface patches.
1274 :type only_interchain: :class:`bool`
1275 :param chem_mapping_result: Pro param. The result of
1276 :func:`~GetChemMapping` where you provided
1277 *model*. If set, *model* parameter is not
1279 :type chem_mapping_result: :class:`tuple`
1280 :param global_mapping: Pro param. Specify a global mapping result. This
1281 fully defines the desired representation in the
1282 model but extracts it and enriches it with all
1283 the nice attributes of :class:`ReprResult`.
1284 The target attribute in *global_mapping* must be
1285 of the same entity as self.target and the model
1286 attribute of *global_mapping* must be of the same
1288 :type global_mapping: :class:`MappingResult`
1289 :returns: :class:`list` of :class:`ReprResult`
1293 raise RuntimeError(
"topn must be >= 1")
1295 if global_mapping
is not None:
1297 if global_mapping.target.handle.GetHashCode() != \
1298 self.
targettarget.handle.GetHashCode():
1299 raise RuntimeError(
"global_mapping.target must be the same "
1300 "entity as self.target")
1301 if global_mapping.model.handle.GetHashCode() != \
1302 model.handle.GetHashCode():
1303 raise RuntimeError(
"global_mapping.model must be the same "
1304 "entity as model param")
1307 for r
in substructure.residues:
1308 ch_name = r.GetChain().GetName()
1309 rnum = r.GetNumber()
1310 target_r = self.
targettarget.FindResidue(ch_name, rnum)
1311 if not target_r.IsValid():
1312 raise RuntimeError(f
"substructure has residue "
1313 f
"{r.GetQualifiedName()} which is not in "
1315 if target_r.handle.GetHashCode() != r.handle.GetHashCode():
1316 raise RuntimeError(f
"substructure has residue "
1317 f
"{r.GetQualifiedName()} which has an "
1318 f
"equivalent in self.target but it does "
1319 f
"not refer to the same underlying "
1322 target_a = target_r.FindAtom(a.GetName())
1323 if not target_a.IsValid():
1324 raise RuntimeError(f
"substructure has atom "
1325 f
"{a.GetQualifiedName()} which is not "
1327 if a.handle.GetHashCode() != target_a.handle.GetHashCode():
1328 raise RuntimeError(f
"substructure has atom "
1329 f
"{a.GetQualifiedName()} which has an "
1330 f
"equivalent in self.target but it does "
1331 f
"not refer to the same underlying "
1335 ca = r.FindAtom(
"CA")
1336 c3 = r.FindAtom(
"C3'")
1338 if not ca.IsValid()
and not c3.IsValid():
1339 raise RuntimeError(
"All residues in substructure must contain "
1340 "a backbone atom named CA or C3\'")
1343 if chem_mapping_result
is None:
1344 chem_mapping, chem_group_alns, unmapped_mdl_chains, mdl = \
1347 chem_mapping, chem_group_alns, unmapped_mdl_chains, mdl = \
1349 ref_mdl_alns = _GetRefMdlAlns(self.
chem_groupschem_groups,
1355 substructure_res_indices = dict()
1356 for ch
in substructure.chains:
1357 full_ch = self.
targettarget.FindChain(ch.GetName())
1358 idx = [full_ch.GetResidueIndex(r.GetNumber())
for r
in ch.residues]
1359 substructure_res_indices[ch.GetName()] = idx
1363 substructure_chem_groups = list()
1364 substructure_chem_mapping = list()
1366 chnames = set([ch.GetName()
for ch
in substructure.chains])
1367 for chem_group, mapping
in zip(self.
chem_groupschem_groups, chem_mapping):
1368 substructure_chem_group = [ch
for ch
in chem_group
if ch
in chnames]
1369 if len(substructure_chem_group) > 0:
1370 substructure_chem_groups.append(substructure_chem_group)
1371 substructure_chem_mapping.append(mapping)
1374 n_mapped_mdl_chains = sum([len(m)
for m
in substructure_chem_mapping])
1375 if n_mapped_mdl_chains == 0:
1380 substructure_ref_mdl_alns = dict()
1382 for ch
in mdl.chains:
1383 mdl_views[ch.GetName()] = _CSel(mdl, [ch.GetName()])
1384 for chem_group, mapping
in zip(substructure_chem_groups,
1385 substructure_chem_mapping):
1386 for ref_ch
in chem_group:
1387 for mdl_ch
in mapping:
1388 full_aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1389 ref_seq = full_aln.GetSequence(0)
1393 tmp = [
'-'] * len(full_aln)
1394 for idx
in substructure_res_indices[ref_ch]:
1395 idx_in_seq = ref_seq.GetPos(idx)
1396 tmp[idx_in_seq] = ref_seq[idx_in_seq]
1397 ref_seq = seq.CreateSequence(ref_ch,
''.join(tmp))
1398 ref_seq.AttachView(_CSel(substructure, [ref_ch]))
1399 mdl_seq = full_aln.GetSequence(1)
1400 mdl_seq = seq.CreateSequence(mdl_seq.GetName(),
1401 mdl_seq.GetString())
1402 mdl_seq.AttachView(mdl_views[mdl_ch])
1403 aln = seq.CreateAlignment()
1404 aln.AddSequence(ref_seq)
1405 aln.AddSequence(mdl_seq)
1406 substructure_ref_mdl_alns[(ref_ch, mdl_ch)] = aln
1409 inclusion_radius = inclusion_radius,
1411 scored_mappings = list()
1415 flat_mapping = global_mapping.GetFlatMapping()
1417 for chem_group, chem_mapping
in zip(substructure_chem_groups,
1418 substructure_chem_mapping):
1419 chem_group_mapping = list()
1420 for ch
in chem_group:
1421 if ch
in flat_mapping:
1422 mdl_ch = flat_mapping[ch]
1423 if mdl_ch
in chem_mapping:
1424 chem_group_mapping.append(mdl_ch)
1426 chem_group_mapping.append(
None)
1428 chem_group_mapping.append(
None)
1429 mapping.append(chem_group_mapping)
1430 mappings = [mapping]
1432 mappings = list(_ChainMappings(substructure_chem_groups,
1433 substructure_chem_mapping,
1437 msg =
"Computing initial ranking of %d chain mappings" % len(mappings)
1438 (ost.LogWarning
if len(mappings) > 10000
else ost.LogInfo)(msg)
1440 for mapping
in mappings:
1442 lddt_chain_mapping = dict()
1445 for ref_chem_group, mdl_chem_group
in zip(substructure_chem_groups,
1447 for ref_ch, mdl_ch
in zip(ref_chem_group, mdl_chem_group):
1449 if mdl_ch
is not None:
1450 lddt_chain_mapping[mdl_ch] = ref_ch
1451 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1452 lddt_alns[mdl_ch] = aln
1453 tmp = [int(c[0] !=
'-' and c[1] !=
'-')
for c
in aln]
1454 n_res_aln += sum(tmp)
1459 lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds,
1460 chain_mapping=lddt_chain_mapping,
1461 residue_mapping = lddt_alns,
1462 check_resnames =
False,
1463 no_intrachain = only_interchain)
1466 ost.LogVerbose(
"No valid contacts in the reference")
1471 if len(scored_mappings) == 0:
1472 scored_mappings.append((lDDT, mapping))
1473 elif len(scored_mappings) < topn:
1474 scored_mappings.append((lDDT, mapping))
1475 scored_mappings.sort(reverse=
True, key=
lambda x: x[0])
1476 elif lDDT > scored_mappings[-1][0]:
1477 scored_mappings.append((lDDT, mapping))
1478 scored_mappings.sort(reverse=
True, key=
lambda x: x[0])
1479 scored_mappings = scored_mappings[:topn]
1483 for scored_mapping
in scored_mappings:
1484 ref_view = substructure.handle.CreateEmptyView()
1485 mdl_view = mdl.handle.CreateEmptyView()
1486 for ref_ch_group, mdl_ch_group
in zip(substructure_chem_groups,
1488 for ref_ch, mdl_ch
in zip(ref_ch_group, mdl_ch_group):
1489 if ref_ch
is not None and mdl_ch
is not None:
1490 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1492 if col[0] !=
'-' and col[1] !=
'-':
1493 ref_view.AddResidue(col.GetResidue(0),
1494 mol.ViewAddFlag.INCLUDE_ALL)
1495 mdl_view.AddResidue(col.GetResidue(1),
1496 mol.ViewAddFlag.INCLUDE_ALL)
1497 results.append(
ReprResult(scored_mapping[0], substructure,
1498 ref_view, mdl_view))
1502 """ Returns number of possible mappings
1504 :param model: Model with chains that are mapped onto
1506 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1508 chem_mapping, chem_group_alns, unmapped_mdl_chains, mdl = \
1510 return _NMappings(self.
chem_groupschem_groups, chem_mapping)
1513 """ Entity processing for chain mapping
1515 * Selects view containing peptide and nucleotide residues which have
1516 required backbone atoms present - for peptide residues thats
1517 N, CA, C and CB (no CB for GLY), for nucleotide residues thats
1518 O5', C5', C4', C3' and O3'.
1519 * filters view by chain lengths, see *min_pep_length* and
1520 *min_nuc_length* in constructor
1521 * Extracts atom sequences for each chain in that view
1522 * Attaches corresponding :class:`ost.mol.EntityView` to each sequence
1523 * If residue number alignments are used, strictly increasing residue
1524 numbers without insertion codes are ensured in each chain
1526 :param ent: Entity to process
1527 :type ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1528 :returns: Tuple with 3 elements: 1) :class:`ost.mol.EntityView`
1529 containing peptide and nucleotide residues 2)
1530 :class:`ost.seq.SequenceList` containing ATOMSEQ sequences
1531 for each polypeptide chain in returned view, sequences have
1532 :class:`ost.mol.EntityView` of according chains attached
1533 3) same for polynucleotide chains
1535 view = ent.CreateEmptyView()
1536 exp_pep_atoms = [
"N",
"CA",
"C",
"CB"]
1537 exp_nuc_atoms = [
"\"O5'\"",
"\"C5'\"",
"\"C4'\"",
"\"C3'\"",
"\"O3'\""]
1538 pep_query =
"peptide=true and aname=" +
','.join(exp_pep_atoms)
1539 nuc_query =
"nucleotide=true and aname=" +
','.join(exp_nuc_atoms)
1541 pep_sel = ent.Select(pep_query)
1542 for r
in pep_sel.residues:
1543 if len(r.atoms) == 4:
1544 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1545 elif r.name ==
"GLY" and len(r.atoms) == 3:
1546 atom_names = [a.GetName()
for a
in r.atoms]
1547 if sorted(atom_names) == [
"C",
"CA",
"N"]:
1548 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1550 nuc_sel = ent.Select(nuc_query)
1551 for r
in nuc_sel.residues:
1552 if len(r.atoms) == 5:
1553 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1555 polypep_seqs = seq.CreateSequenceList()
1556 polynuc_seqs = seq.CreateSequenceList()
1558 if len(view.residues) == 0:
1560 return (view, polypep_seqs, polynuc_seqs)
1562 for ch
in view.chains:
1563 n_res = len(ch.residues)
1564 n_pep = sum([r.IsPeptideLinking()
for r
in ch.residues])
1565 n_nuc = sum([r.IsNucleotideLinking()
for r
in ch.residues])
1568 if n_pep > 0
and n_nuc > 0:
1569 raise RuntimeError(f
"Must not mix peptide and nucleotide linking "
1570 f
"residues in same chain ({ch.GetName()})")
1572 if (n_pep + n_nuc) != n_res:
1573 raise RuntimeError(
"All residues must either be peptide_linking "
1574 "or nucleotide_linking")
1588 ins_codes = [r.GetNumber().GetInsCode()
for r
in ch.residues]
1589 if len(set(ins_codes)) != 1
or ins_codes[0] !=
'\0':
1590 raise RuntimeError(
"Residue numbers in input structures must not "
1591 "contain insertion codes")
1594 nums = [r.GetNumber().GetNum()
for r
in ch.residues]
1595 if not all(i < j
for i, j
in zip(nums, nums[1:])):
1596 raise RuntimeError(
"Residue numbers in input structures must be "
1597 "strictly increasing for each chain")
1599 s =
''.join([r.one_letter_code
for r
in ch.residues])
1600 s = seq.CreateSequence(ch.GetName(), s)
1601 s.AttachView(_CSel(view, [ch.GetName()]))
1603 polypep_seqs.AddSequence(s)
1604 elif n_nuc == n_res:
1605 polynuc_seqs.AddSequence(s)
1607 raise RuntimeError(
"This shouldnt happen")
1609 if len(polypep_seqs) == 0
and len(polynuc_seqs) == 0:
1610 raise RuntimeError(f
"No chain fulfilled minimum length requirement "
1611 f
"to be considered in chain mapping "
1612 f
"({self.min_pep_length} for peptide chains, "
1613 f
"{self.min_nuc_length} for nucleotide chains) "
1614 f
"- mapping failed")
1617 chain_names = [s.GetAttachedView().chains[0].name
for s
in polypep_seqs]
1618 chain_names += [s.GetAttachedView().chains[0].name
for s
in polynuc_seqs]
1619 view = _CSel(view, chain_names)
1621 return (view, polypep_seqs, polynuc_seqs)
1624 """ Access to internal sequence alignment functionality
1626 Alignment parameterization is setup at ChainMapper construction
1628 :param s1: First sequence to align - must have view attached in case
1629 of resnum_alignments
1630 :type s1: :class:`ost.seq.SequenceHandle`
1631 :param s2: Second sequence to align - must have view attached in case
1632 of resnum_alignments
1633 :type s2: :class:`ost.seq.SequenceHandle`
1634 :param stype: Type of sequences to align, must be in
1635 [:class:`ost.mol.ChemType.AMINOACIDS`,
1636 :class:`ost.mol.ChemType.NUCLEOTIDES`]
1637 :returns: Pairwise alignment of s1 and s2
1639 if stype
not in [mol.ChemType.AMINOACIDS, mol.ChemType.NUCLEOTIDES]:
1640 raise RuntimeError(
"stype must be ost.mol.ChemType.AMINOACIDS or "
1641 "ost.mol.ChemType.NUCLEOTIDES")
1642 return self.
aligneraligner.
Align(s1, s2, chem_type = stype)
1648 def __init__(self, pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -5,
1649 pep_gap_ext = -2, nuc_subst_mat = seq.alg.NUC44,
1650 nuc_gap_open = -4, nuc_gap_ext = -4, resnum_aln = False):
1651 """ Helper class to compute alignments
1653 Sets default values for substitution matrix, gap open and gap extension
1654 penalties. They are only used in default mode (Needleman-Wunsch aln).
1655 If *resnum_aln* is True, only residue numbers of views that are attached
1656 to input sequences are considered.
1666 def Align(self, s1, s2, chem_type=None):
1670 if chem_type
is None:
1671 raise RuntimeError(
"Must specify chem_type for NW alignment")
1672 return self.
NWAlignNWAlign(s1, s2, chem_type)
1675 """ Returns pairwise alignment using Needleman-Wunsch algorithm
1677 :param s1: First sequence to align
1678 :type s1: :class:`ost.seq.SequenceHandle`
1679 :param s2: Second sequence to align
1680 :type s2: :class:`ost.seq.SequenceHandle`
1681 :param chem_type: Must be in [:class:`ost.mol.ChemType.AMINOACIDS`,
1682 :class:`ost.mol.ChemType.NUCLEOTIDES`], determines
1683 substitution matrix and gap open/extension penalties
1684 :type chem_type: :class:`ost.mol.ChemType`
1685 :returns: Alignment with s1 as first and s2 as second sequence
1687 if chem_type == mol.ChemType.AMINOACIDS:
1688 return seq.alg.SemiGlobalAlign(s1, s2, self.
pep_subst_matpep_subst_mat,
1691 elif chem_type == mol.ChemType.NUCLEOTIDES:
1692 return seq.alg.SemiGlobalAlign(s1, s2, self.
nuc_subst_matnuc_subst_mat,
1696 raise RuntimeError(
"Invalid ChemType")
1700 """ Returns pairwise alignment using residue numbers of attached views
1702 Assumes that there are no insertion codes (alignment only on numerical
1703 component) and that resnums are strictly increasing (fast min/max
1704 identification). These requirements are assured if a structure has been
1705 processed by :class:`ChainMapper`.
1707 :param s1: First sequence to align, must have :class:`ost.mol.EntityView`
1709 :type s1: :class:`ost.seq.SequenceHandle`
1710 :param s2: Second sequence to align, must have :class:`ost.mol.EntityView`
1712 :type s2: :class:`ost.seq.SequenceHandle`
1714 assert(s1.HasAttachedView())
1715 assert(s2.HasAttachedView())
1716 v1 = s1.GetAttachedView()
1717 rnums1 = [r.GetNumber().GetNum()
for r
in v1.residues]
1718 v2 = s2.GetAttachedView()
1719 rnums2 = [r.GetNumber().GetNum()
for r
in v2.residues]
1721 min_num = min(rnums1[0], rnums2[0])
1722 max_num = max(rnums1[-1], rnums2[-1])
1723 aln_length = max_num - min_num + 1
1725 aln_s1 = [
'-'] * aln_length
1726 for r, rnum
in zip(v1.residues, rnums1):
1727 aln_s1[rnum-min_num] = r.one_letter_code
1729 aln_s2 = [
'-'] * aln_length
1730 for r, rnum
in zip(v2.residues, rnums2):
1731 aln_s2[rnum-min_num] = r.one_letter_code
1733 aln = seq.CreateAlignment()
1734 aln.AddSequence(seq.CreateSequence(s1.GetName(),
''.join(aln_s1)))
1735 aln.AddSequence(seq.CreateSequence(s2.GetName(),
''.join(aln_s2)))
1738 def _GetAlnPropsTwo(aln):
1739 """Returns basic properties of *aln* version two...
1741 :param aln: Alignment to compute properties
1742 :type aln: :class:`seq.AlignmentHandle`
1743 :returns: Tuple with 3 elements. 1) sequence identity in range [0, 100]
1744 considering aligned columns 2) Number of non gap characters in
1745 first sequence 3) Number of aligned characters.
1747 assert(aln.GetCount() == 2)
1748 n_tot = sum([1
for col
in aln
if col[0] !=
'-'])
1749 n_aligned = sum([1
for col
in aln
if (col[0] !=
'-' and col[1] !=
'-')])
1750 return (seq.alg.SequenceIdentity(aln), n_tot, n_aligned)
1752 def _GetAlnPropsOne(aln):
1754 """Returns basic properties of *aln* version one...
1756 :param aln: Alignment to compute properties
1757 :type aln: :class:`seq.AlignmentHandle`
1758 :returns: Tuple with 2 elements. 1) sequence identify in range [0, 100]
1759 considering aligned columns 2) Number of aligned columns.
1761 assert(aln.GetCount() == 2)
1762 seqid = seq.alg.SequenceIdentity(aln)
1763 n_aligned = sum([1
for col
in aln
if (col[0] !=
'-' and col[1] !=
'-')])
1764 return (seqid, n_aligned)
1766 def _GetChemGroupAlignments(pep_seqs, nuc_seqs, aligner, pep_seqid_thr=95.,
1767 min_pep_length=6, nuc_seqid_thr=95.,
1769 """Returns alignments with groups of chemically equivalent chains
1771 :param pep_seqs: List of polypeptide sequences
1772 :type pep_seqs: :class:`seq.SequenceList`
1773 :param nuc_seqs: List of polynucleotide sequences
1774 :type nuc_seqs: :class:`seq.SequenceList`
1775 :param aligner: Helper class to generate pairwise alignments
1776 :type aligner: :class:`_Aligner`
1777 :param pep_seqid_thr: Threshold used to decide when two peptide chains are
1778 identical. 95 percent tolerates the few mutations
1779 crystallographers like to do.
1780 :type pep_seqid_thr: :class:`float`
1781 :param min_pep_length: Additional threshold to avoid gappy alignments with high
1782 seqid. Number of aligned columns must be at least this
1784 :type min_pep_length: :class:`int`
1785 :param nuc_seqid_thr: Nucleotide equivalent of *pep_seqid_thr*
1786 :type nuc_seqid_thr: :class:`float`
1787 :param min_nuc_length: Nucleotide equivalent of *min_pep_length*
1788 :type min_nuc_length: :class:`int`
1789 :returns: Tuple with first element being an AlignmentList. Each alignment
1790 represents a group of chemically equivalent chains and the first
1791 sequence is the longest. Second element is a list of equivalent
1792 length specifying the types of the groups. List elements are in
1793 [:class:`ost.ChemType.AMINOACIDS`,
1794 :class:`ost.ChemType.NUCLEOTIDES`]
1796 pep_groups = _GroupSequences(pep_seqs, pep_seqid_thr, min_pep_length, aligner,
1797 mol.ChemType.AMINOACIDS)
1798 nuc_groups = _GroupSequences(nuc_seqs, nuc_seqid_thr, min_nuc_length, aligner,
1799 mol.ChemType.NUCLEOTIDES)
1800 group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
1801 group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
1803 groups.extend(nuc_groups)
1804 return (groups, group_types)
1806 def _GroupSequences(seqs, seqid_thr, min_length, aligner, chem_type):
1807 """Get list of alignments representing groups of equivalent sequences
1809 :param seqid_thr: Threshold used to decide when two chains are identical.
1810 :type seqid_thr: :class:`float`
1811 :param gap_thr: Additional threshold to avoid gappy alignments with high
1812 seqid. Number of aligned columns must be at least this
1814 :type gap_thr: :class:`int`
1815 :param aligner: Helper class to generate pairwise alignments
1816 :type aligner: :class:`_Aligner`
1817 :param chem_type: ChemType of seqs which is passed to *aligner*, must be in
1818 [:class:`ost.mol.ChemType.AMINOACIDS`,
1819 :class:`ost.mol.ChemType.NUCLEOTIDES`]
1820 :type chem_type: :class:`ost.mol.ChemType`
1821 :returns: A list of alignments, one alignment for each group
1822 with longest sequence (reference) as first sequence.
1823 :rtype: :class:`ost.seq.AlignmentList`
1826 for s_idx
in range(len(seqs)):
1827 matching_group =
None
1828 for g_idx
in range(len(groups)):
1829 for g_s_idx
in range(len(groups[g_idx])):
1830 aln = aligner.Align(seqs[s_idx], seqs[groups[g_idx][g_s_idx]],
1832 sid, n_aligned = _GetAlnPropsOne(aln)
1833 if sid >= seqid_thr
and n_aligned >= min_length:
1834 matching_group = g_idx
1836 if matching_group
is not None:
1839 if matching_group
is None:
1840 groups.append([s_idx])
1842 groups[matching_group].append(s_idx)
1845 sorted_groups = list()
1848 tmp = sorted([[len(seqs[i]), i]
for i
in g], reverse=
True)
1849 sorted_groups.append([x[1]
for x
in tmp])
1851 sorted_groups.append(g)
1855 aln_list = seq.AlignmentList()
1856 for g
in sorted_groups:
1859 aln_list.append(seq.CreateAlignment(seqs[g[0]]))
1862 alns = seq.AlignmentList()
1865 alns.append(aligner.Align(seqs[i], seqs[j], chem_type))
1867 aln_list.append(seq.alg.MergePairwiseAlignments(alns, seqs[i]))
1870 seq_dict = {s.GetName(): s
for s
in seqs}
1871 for aln_idx
in range(len(aln_list)):
1872 for aln_s_idx
in range(aln_list[aln_idx].GetCount()):
1873 s_name = aln_list[aln_idx].GetSequence(aln_s_idx).GetName()
1874 s = seq_dict[s_name]
1875 aln_list[aln_idx].AttachView(aln_s_idx, s.GetAttachedView())
1879 def _MapSequence(ref_seqs, ref_types, s, s_type, aligner,
1880 seq_id_thr=0.0, min_aln_length=0):
1881 """Tries top map *s* onto any of the sequences in *ref_seqs*
1883 Computes alignments of *s* to each of the reference sequences of equal type
1884 and sorts them by seqid*fraction_covered (seqid: sequence identity of
1885 aligned columns in alignment, fraction_covered: Fraction of non-gap
1886 characters in reference sequence that are covered by non-gap characters in
1887 *s*). Best scoring mapping is returned. Optionally, *seq_id*/
1888 *min_aln_length* thresholds can be enabled to avoid non-sensical mappings.
1889 However, *min_aln_length* only has an effect if *seq_id_thr* > 0!!!
1891 :param ref_seqs: Reference sequences
1892 :type ref_seqs: :class:`ost.seq.SequenceList`
1893 :param ref_types: Types of reference sequences, e.g.
1894 ost.mol.ChemType.AminoAcids
1895 :type ref_types: :class:`list` of :class:`ost.mol.ChemType`
1896 :param s: Sequence to map
1897 :type s: :class:`ost.seq.SequenceHandle`
1898 :param s_type: Type of *s*, only try mapping to sequences in *ref_seqs*
1899 with equal type as defined in *ref_types*
1900 :param aligner: Helper class to generate pairwise alignments
1901 :type aligner: :class:`_Aligner`
1902 :param seq_id_thr: Minimum sequence identity to be considered as match
1903 :type seq_id_thr: :class:`float`
1904 :param min_aln_length: Minimum number of aligned columns to be considered
1905 as match. Only has an effect if *seq_id_thr* > 0!
1906 :type min_aln_length: :class:`int`
1907 :returns: Tuple with two elements. 1) index of sequence in *ref_seqs* to
1908 which *s* can be mapped 2) Pairwise sequence alignment with
1909 sequence from *ref_seqs* as first sequence. Both elements are
1910 None if no mapping can be found or if thresholds are not
1911 fulfilled for any alignment.
1912 :raises: :class:`RuntimeError` if mapping is ambiguous, i.e. *s*
1913 successfully maps to more than one sequence in *ref_seqs*
1915 scored_alns = list()
1916 for ref_idx, ref_seq
in enumerate(ref_seqs):
1917 if ref_types[ref_idx] == s_type:
1918 aln = aligner.Align(ref_seq, s, s_type)
1919 seqid, n_tot, n_aligned = _GetAlnPropsTwo(aln)
1921 if seqid >= seq_id_thr
and n_aligned >= min_aln_length:
1922 fraction_covered = float(n_aligned)/n_tot
1923 score = seqid * fraction_covered
1924 scored_alns.append((score, ref_idx, aln))
1926 fraction_covered = float(n_aligned)/n_tot
1927 score = seqid * fraction_covered
1928 scored_alns.append((score, ref_idx, aln))
1930 if len(scored_alns) == 0:
1933 scored_alns = sorted(scored_alns, key=
lambda x: x[0], reverse=
True)
1934 return (scored_alns[0][1], scored_alns[0][2])
1936 def _GetRefMdlAlns(ref_chem_groups, ref_chem_group_msas, mdl_chem_groups,
1937 mdl_chem_group_alns, pairs=None):
1938 """ Get all possible ref/mdl chain alignments given chem group mapping
1940 :param ref_chem_groups: :attr:`ChainMapper.chem_groups`
1941 :type ref_chem_groups: :class:`list` of :class:`list` of :class:`str`
1942 :param ref_chem_group_msas: :attr:`ChainMapper.chem_group_alignments`
1943 :type ref_chem_group_msas: :class:`ost.seq.AlignmentList`
1944 :param mdl_chem_groups: Groups of model chains that are mapped to
1945 *ref_chem_groups*. Return value of
1946 :func:`ChainMapper.GetChemMapping`.
1947 :type mdl_chem_groups: :class:`list` of :class:`list` of :class:`str`
1948 :param mdl_chem_group_alns: A pairwise sequence alignment for every chain
1949 in *mdl_chem_groups* that aligns these sequences
1950 to the respective reference sequence.
1952 :func:`ChainMapper.GetChemMapping`.
1953 :type mdl_chem_group_alns: :class:`list` of :class:`ost.seq.AlignmentList`
1954 :param pairs: Pro param - restrict return dict to specified pairs. A set of
1955 tuples in form (<trg_ch>, <mdl_ch>)
1956 :type pairs: :class:`set`
1957 :returns: A dictionary holding all possible ref/mdl chain alignments. Keys
1958 in that dictionary are tuples of the form (ref_ch, mdl_ch) and
1959 values are the respective pairwise alignments with first sequence
1960 being from ref, the second from mdl.
1964 for alns
in mdl_chem_group_alns:
1966 mdl_chain_name = aln.GetSequence(1).GetName()
1967 mdl_alns[mdl_chain_name] = aln
1971 ref_mdl_alns = dict()
1972 for ref_chains, mdl_chains, ref_aln
in zip(ref_chem_groups, mdl_chem_groups,
1973 ref_chem_group_msas):
1974 for ref_ch
in ref_chains:
1975 for mdl_ch
in mdl_chains:
1976 if pairs
is not None and (ref_ch, mdl_ch)
not in pairs:
1980 aln_list = seq.AlignmentList()
1982 s1 = ref_aln.GetSequence(0)
1983 s2 = ref_aln.GetSequence(ref_chains.index(ref_ch))
1984 aln_list.append(seq.CreateAlignment(s1, s2))
1986 aln_list.append(mdl_alns[mdl_ch])
1988 ref_seq = seq.CreateSequence(s1.GetName(),
1989 s1.GetGaplessString())
1990 merged_aln = seq.alg.MergePairwiseAlignments(aln_list,
1997 s2 = merged_aln.GetSequence(1)
1998 s3 = merged_aln.GetSequence(2)
2001 for idx
in range(len(s2)):
2002 if s2[idx] !=
'-' or s3[idx] !=
'-':
2006 for idx
in reversed(range(len(s2))):
2007 if s2[idx] !=
'-' or s3[idx] !=
'-':
2010 s2 = seq.CreateSequence(s2.GetName(), s2[a: len(s2)-b])
2011 s3 = seq.CreateSequence(s3.GetName(), s3[a: len(s3)-b])
2012 ref_mdl_alns[(ref_ch, mdl_ch)] = seq.CreateAlignment(s2, s3)
2016 def _CheckOneToOneMapping(ref_chains, mdl_chains):
2017 """ Checks whether we already have a perfect one to one mapping
2019 That means each list in *ref_chains* has exactly one element and each
2020 list in *mdl_chains* has either one element (it's mapped) or is empty
2021 (ref chain has no mapped mdl chain). Returns None if no such mapping
2024 :param ref_chains: corresponds to :attr:`ChainMapper.chem_groups`
2025 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
2026 :param mdl_chains: mdl chains mapped to chem groups in *ref_chains*, i.e.
2027 the return value of :func:`ChainMapper.GetChemMapping`
2028 :type mdl_chains: class:`list` of :class:`list` of :class:`str`
2029 :returns: A :class:`list` of :class:`list` if a one to one mapping is found,
2032 only_one_to_one =
True
2034 for ref, mdl
in zip(ref_chains, mdl_chains):
2035 if len(ref) == 1
and len(mdl) == 1:
2036 one_to_one.append(mdl)
2037 elif len(ref) == 1
and len(mdl) == 0:
2038 one_to_one.append([
None])
2040 only_one_to_one =
False
2048 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2049 ref_mdl_alns, inclusion_radius = 15.0,
2050 thresholds = [0.5, 1.0, 2.0, 4.0],
2051 steep_opt_rate = None):
2053 """ Greedy extension of already existing but incomplete chain mappings
2055 super().
__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2056 dist_thresh=inclusion_radius,
2057 dist_diff_thresholds=thresholds)
2063 for interface
in self.
trgtrg.interacting_chains:
2064 self.
neighborsneighbors[interface[0]].add(interface[1])
2065 self.
neighborsneighbors[interface[1]].add(interface[0])
2067 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2072 for g_idx, (ref_g, mdl_g)
in enumerate(zip(ref_chem_groups,
2082 for interface
in self.
mdlmdl.potentially_contributing_chains:
2083 self.
mdl_neighborsmdl_neighbors[interface[0]].add(interface[1])
2084 self.
mdl_neighborsmdl_neighbors[interface[1]].add(interface[0])
2088 if len(mapping) == 0:
2089 raise RuntimError(
"Mapping must contain a starting point")
2096 for ref_ch
in mapping.keys():
2097 map_targets.update(self.
neighborsneighbors[ref_ch])
2100 for ref_ch
in mapping.keys():
2101 map_targets.discard(ref_ch)
2103 if len(map_targets) == 0:
2107 free_mdl_chains = list()
2109 tmp = [x
for x
in chem_group
if x
not in mapping.values()]
2110 free_mdl_chains.append(set(tmp))
2113 newly_mapped_ref_chains = list()
2115 something_happened =
True
2116 while something_happened:
2117 something_happened=
False
2120 n_chains = len(newly_mapped_ref_chains)
2121 if n_chains > 0
and n_chains % self.
steep_opt_ratesteep_opt_rate == 0:
2122 mapping = self.
_SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2124 if max_ext
is not None and len(newly_mapped_ref_chains) >= max_ext:
2129 for ref_ch
in map_targets:
2131 for mdl_ch
in free_mdl_chains[chem_group_idx]:
2133 n_single = self.
_NSCConserved_NSCConserved(ref_ch, mdl_ch).sum()
2136 for neighbor
in self.
neighborsneighbors[ref_ch]:
2137 if neighbor
in mapping
and mapping[neighbor]
in \
2140 (mdl_ch, mapping[neighbor])).sum()
2141 n = n_single + n_inter
2143 if n_inter > 0
and n > max_n:
2148 max_mapping = (ref_ch, mdl_ch)
2151 something_happened =
True
2153 mapping[max_mapping[0]] = max_mapping[1]
2157 for neighbor
in self.
neighborsneighbors[max_mapping[0]]:
2158 if neighbor
not in mapping:
2159 map_targets.add(neighbor)
2162 map_targets.remove(max_mapping[0])
2166 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2169 newly_mapped_ref_chains.append(max_mapping[0])
2173 def _SteepOpt(self, mapping, chains_to_optimize=None):
2176 if chains_to_optimize
is None:
2177 chains_to_optimize = mapping.keys()
2180 ref_chains = [x
for x
in chains_to_optimize
if mapping[x]
is not None]
2184 for ch
in ref_chains:
2186 if chem_group_idx
in tmp:
2187 tmp[chem_group_idx].append(ch)
2189 tmp[chem_group_idx] = [ch]
2190 chem_groups = list(tmp.values())
2195 something_happened =
True
2196 while something_happened:
2197 something_happened =
False
2198 for chem_group
in chem_groups:
2199 if something_happened:
2201 for ch1, ch2
in itertools.combinations(chem_group, 2):
2202 swapped_mapping = dict(mapping)
2203 swapped_mapping[ch1] = mapping[ch2]
2204 swapped_mapping[ch2] = mapping[ch1]
2206 if score > current_lddt:
2207 something_happened =
True
2208 mapping = swapped_mapping
2209 current_lddt = score
2214 def _lDDTNaive(trg, mdl, inclusion_radius, thresholds, chem_groups,
2215 chem_mapping, ref_mdl_alns, n_max_naive):
2216 """ Naively iterates all possible chain mappings and returns the best
2223 dist_thresh=inclusion_radius,
2224 dist_diff_thresholds=thresholds)
2225 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2226 lDDT = lddt_scorer.Score(mapping, check=
False)
2227 if lDDT > best_lddt:
2228 best_mapping = mapping
2231 return (best_mapping, best_lddt)
2234 def _GetSeeds(ref_chem_groups, mdl_chem_groups, mapped_ref_chains = set(),
2235 mapped_mdl_chains = set()):
2237 for ref_chains, mdl_chains
in zip(ref_chem_groups,
2239 for ref_ch
in ref_chains:
2240 if ref_ch
not in mapped_ref_chains:
2241 for mdl_ch
in mdl_chains:
2242 if mdl_ch
not in mapped_mdl_chains:
2243 seeds.append((ref_ch, mdl_ch))
2247 def _lDDTGreedyFast(the_greed):
2249 something_happened =
True
2252 while something_happened:
2253 something_happened =
False
2254 seeds = _GetSeeds(the_greed.ref_chem_groups,
2255 the_greed.mdl_chem_groups,
2256 mapped_ref_chains = set(mapping.keys()),
2257 mapped_mdl_chains = set(mapping.values()))
2262 n = the_greed._NSCConserved(seed[0], seed[1]).sum()
2269 mapping[best_seed[0]] = best_seed[1]
2270 mapping = the_greed.ExtendMapping(mapping)
2271 something_happened =
True
2274 final_mapping = list()
2275 for ref_chains
in the_greed.ref_chem_groups:
2276 mapped_mdl_chains = list()
2277 for ref_ch
in ref_chains:
2278 if ref_ch
in mapping:
2279 mapped_mdl_chains.append(mapping[ref_ch])
2281 mapped_mdl_chains.append(
None)
2282 final_mapping.append(mapped_mdl_chains)
2284 return final_mapping
2287 def _lDDTGreedyFull(the_greed):
2288 """ Uses each reference chain as starting point for expansion
2291 seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2292 best_overall_score = -1.0
2293 best_overall_mapping = dict()
2298 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2301 something_happened =
True
2302 while something_happened:
2303 something_happened =
False
2304 remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2305 the_greed.mdl_chem_groups,
2306 mapped_ref_chains = set(mapping.keys()),
2307 mapped_mdl_chains = set(mapping.values()))
2308 if len(remnant_seeds) > 0:
2312 for remnant_seed
in remnant_seeds:
2313 tmp_mapping = dict(mapping)
2314 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2315 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2316 score = the_greed.FromFlatMapping(tmp_mapping)
2317 if score > best_score:
2319 best_mapping = tmp_mapping
2320 if best_mapping
is not None:
2321 something_happened =
True
2322 mapping = best_mapping
2324 score = the_greed.FromFlatMapping(mapping)
2325 if score > best_overall_score:
2326 best_overall_score = score
2327 best_overall_mapping = mapping
2329 mapping = best_overall_mapping
2332 final_mapping = list()
2333 for ref_chains
in the_greed.ref_chem_groups:
2334 mapped_mdl_chains = list()
2335 for ref_ch
in ref_chains:
2336 if ref_ch
in mapping:
2337 mapped_mdl_chains.append(mapping[ref_ch])
2339 mapped_mdl_chains.append(
None)
2340 final_mapping.append(mapped_mdl_chains)
2342 return final_mapping
2345 def _lDDTGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2346 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2347 respective chem groups and compute single chain lDDTs. The
2348 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2349 and the best scoring one is exhaustively extended.
2352 if seed_size
is None or seed_size < 1:
2353 raise RuntimeError(f
"seed_size must be an int >= 1 (got {seed_size})")
2355 if blocks_per_chem_group
is None or blocks_per_chem_group < 1:
2356 raise RuntimeError(f
"blocks_per_chem_group must be an int >= 1 "
2357 f
"(got {blocks_per_chem_group})")
2359 max_ext = seed_size - 1
2361 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2362 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2365 something_happened =
True
2366 while something_happened:
2367 something_happened =
False
2368 starting_blocks = list()
2369 for ref_chains, mdl_chains
in zip(ref_chem_groups, mdl_chem_groups):
2370 if len(mdl_chains) == 0:
2372 ref_chains_copy = list(ref_chains)
2373 for i
in range(blocks_per_chem_group):
2374 if len(ref_chains_copy) == 0:
2377 for ref_ch
in ref_chains_copy:
2378 seeds += [(ref_ch, mdl_ch)
for mdl_ch
in mdl_chains]
2385 seed = dict(mapping)
2386 seed.update({s[0]: s[1]})
2387 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2388 seed_lddt = the_greed.FromFlatMapping(seed)
2389 if seed_lddt > best_score:
2390 best_score = seed_lddt
2393 if best_mapping !=
None:
2394 starting_blocks.append(best_mapping)
2395 if best_seed[0]
in ref_chains_copy:
2397 ref_chains_copy.remove(best_seed[0])
2402 for seed
in starting_blocks:
2403 seed = the_greed.ExtendMapping(seed)
2404 seed_lddt = the_greed.FromFlatMapping(seed)
2405 if seed_lddt > best_lddt:
2406 best_lddt = seed_lddt
2409 if best_lddt == 0.0:
2412 something_happened =
True
2413 mapping.update(best_mapping)
2414 for ref_ch, mdl_ch
in best_mapping.items():
2415 for group_idx
in range(len(ref_chem_groups)):
2416 if ref_ch
in ref_chem_groups[group_idx]:
2417 ref_chem_groups[group_idx].remove(ref_ch)
2418 if mdl_ch
in mdl_chem_groups[group_idx]:
2419 mdl_chem_groups[group_idx].remove(mdl_ch)
2422 final_mapping = list()
2423 for ref_chains
in the_greed.ref_chem_groups:
2424 mapped_mdl_chains = list()
2425 for ref_ch
in ref_chains:
2426 if ref_ch
in mapping:
2427 mapped_mdl_chains.append(mapping[ref_ch])
2429 mapped_mdl_chains.append(
None)
2430 final_mapping.append(mapped_mdl_chains)
2432 return final_mapping
2436 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2437 ref_mdl_alns, contact_d = 12.0,
2438 steep_opt_rate = None, greedy_prune_contact_map=False):
2439 """ Greedy extension of already existing but incomplete chain mappings
2441 super().
__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2442 contact_d = contact_d)
2448 if greedy_prune_contact_map:
2450 for p
in self.
qsent1qsent1.interacting_chains:
2451 if np.count_nonzero(self.
qsent1qsent1.PairDist(p[0], p[1])<=8) >= 3:
2456 for p
in self.
qsent2qsent2.interacting_chains:
2457 if np.count_nonzero(self.
qsent2qsent2.PairDist(p[0], p[1])<=8) >= 3:
2463 self.
neighborsneighbors = {k: set()
for k
in self.
qsent1qsent1.chain_names}
2464 for p
in self.
qsent1qsent1.interacting_chains:
2469 for p
in self.
qsent2qsent2.interacting_chains:
2473 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2478 for g_idx, (ref_g, mdl_g)
in enumerate(zip(ref_chem_groups,
2490 for ch
in self.
refref.chains:
2491 single_chain_ref = _CSel(self.
refref, [ch.GetName()])
2498 alns[mdl_ch] = self.
ref_mdl_alnsref_mdl_alns[(ref_ch, mdl_ch)]
2499 mdl_sel = _CSel(self.
mdlmdl, [mdl_ch])
2501 _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2502 residue_mapping=alns,
2503 return_dist_test=
True,
2505 chain_mapping={mdl_ch: ref_ch},
2506 check_resnames=
False)
2512 if len(mapping) == 0:
2513 raise RuntimError(
"Mapping must contain a starting point")
2520 for ref_ch
in mapping.keys():
2521 map_targets.update(self.
neighborsneighbors[ref_ch])
2524 for ref_ch
in mapping.keys():
2525 map_targets.discard(ref_ch)
2527 if len(map_targets) == 0:
2531 free_mdl_chains = list()
2533 tmp = [x
for x
in chem_group
if x
not in mapping.values()]
2534 free_mdl_chains.append(set(tmp))
2537 newly_mapped_ref_chains = list()
2539 something_happened =
True
2540 while something_happened:
2541 something_happened=
False
2544 n_chains = len(newly_mapped_ref_chains)
2545 if n_chains > 0
and n_chains % self.
steep_opt_ratesteep_opt_rate == 0:
2546 mapping = self.
_SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2548 if max_ext
is not None and len(newly_mapped_ref_chains) >= max_ext:
2552 old_score = score_result.QS_global
2553 nominator = score_result.weighted_scores
2554 denominator = score_result.weight_sum + score_result.weight_extra_all
2558 for ref_ch
in map_targets:
2560 for mdl_ch
in free_mdl_chains[chem_group_idx]:
2563 nominator_diff = 0.0
2564 denominator_diff = 0.0
2565 for neighbor
in self.
neighborsneighbors[ref_ch]:
2566 if neighbor
in mapping
and mapping[neighbor]
in \
2570 int1 = (ref_ch, neighbor)
2571 int2 = (mdl_ch, mapping[neighbor])
2574 denominator_diff += b
2575 denominator_diff += d
2581 if nominator_diff > 0:
2584 new_nominator = nominator + nominator_diff
2585 new_denominator = denominator + denominator_diff
2587 if new_denominator != 0.0:
2588 new_score = new_nominator/new_denominator
2589 diff = new_score - old_score
2592 max_mapping = (ref_ch, mdl_ch)
2594 if max_mapping
is not None:
2595 something_happened =
True
2597 mapping[max_mapping[0]] = max_mapping[1]
2601 for neighbor
in self.
neighborsneighbors[max_mapping[0]]:
2602 if neighbor
not in mapping:
2603 map_targets.add(neighbor)
2606 map_targets.remove(max_mapping[0])
2610 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2613 newly_mapped_ref_chains.append(max_mapping[0])
2617 def _SteepOpt(self, mapping, chains_to_optimize=None):
2620 if chains_to_optimize
is None:
2621 chains_to_optimize = mapping.keys()
2624 ref_chains = [x
for x
in chains_to_optimize
if mapping[x]
is not None]
2628 for ch
in ref_chains:
2630 if chem_group_idx
in tmp:
2631 tmp[chem_group_idx].append(ch)
2633 tmp[chem_group_idx] = [ch]
2634 chem_groups = list(tmp.values())
2639 current_score = score_result.QS_global
2640 something_happened =
True
2641 while something_happened:
2642 something_happened =
False
2643 for chem_group
in chem_groups:
2644 if something_happened:
2646 for ch1, ch2
in itertools.combinations(chem_group, 2):
2647 swapped_mapping = dict(mapping)
2648 swapped_mapping[ch1] = mapping[ch2]
2649 swapped_mapping[ch2] = mapping[ch1]
2651 if score_result.QS_global > current_score:
2652 something_happened =
True
2653 mapping = swapped_mapping
2654 current_score = score_result.QS_global
2659 def _QSScoreNaive(trg, mdl, chem_groups, chem_mapping, ref_mdl_alns, contact_d,
2666 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2667 score_result = qs_scorer.Score(mapping, check=
False)
2668 if score_result.QS_global > best_score:
2669 best_mapping = mapping
2670 best_score = score_result.QS_global
2671 return (best_mapping, best_score)
2674 def _QSScoreGreedyFast(the_greed):
2676 something_happened =
True
2678 while something_happened:
2679 something_happened =
False
2683 seeds = _GetSeeds(the_greed.ref_chem_groups,
2684 the_greed.mdl_chem_groups,
2685 mapped_ref_chains = set(mapping.keys()),
2686 mapped_mdl_chains = set(mapping.values()))
2688 n = the_greed.SCCounts(seed[0], seed[1])
2695 mapping[best_seed[0]] = best_seed[1]
2696 mapping = the_greed.ExtendMapping(mapping)
2697 something_happened =
True
2700 final_mapping = list()
2701 for ref_chains
in the_greed.ref_chem_groups:
2702 mapped_mdl_chains = list()
2703 for ref_ch
in ref_chains:
2704 if ref_ch
in mapping:
2705 mapped_mdl_chains.append(mapping[ref_ch])
2707 mapped_mdl_chains.append(
None)
2708 final_mapping.append(mapped_mdl_chains)
2710 return final_mapping
2713 def _QSScoreGreedyFull(the_greed):
2714 """ Uses each reference chain as starting point for expansion
2717 seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2718 best_overall_score = -1.0
2719 best_overall_mapping = dict()
2724 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2727 something_happened =
True
2728 while something_happened:
2729 something_happened =
False
2730 remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2731 the_greed.mdl_chem_groups,
2732 mapped_ref_chains = set(mapping.keys()),
2733 mapped_mdl_chains = set(mapping.values()))
2734 if len(remnant_seeds) > 0:
2738 for remnant_seed
in remnant_seeds:
2739 tmp_mapping = dict(mapping)
2740 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2741 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2742 score_result = the_greed.FromFlatMapping(tmp_mapping)
2743 if score_result.QS_global > best_score:
2744 best_score = score_result.QS_global
2745 best_mapping = tmp_mapping
2746 if best_mapping
is not None:
2747 something_happened =
True
2748 mapping = best_mapping
2750 score_result = the_greed.FromFlatMapping(mapping)
2751 if score_result.QS_global > best_overall_score:
2752 best_overall_score = score_result.QS_global
2753 best_overall_mapping = mapping
2755 mapping = best_overall_mapping
2758 final_mapping = list()
2759 for ref_chains
in the_greed.ref_chem_groups:
2760 mapped_mdl_chains = list()
2761 for ref_ch
in ref_chains:
2762 if ref_ch
in mapping:
2763 mapped_mdl_chains.append(mapping[ref_ch])
2765 mapped_mdl_chains.append(
None)
2766 final_mapping.append(mapped_mdl_chains)
2768 return final_mapping
2771 def _QSScoreGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2772 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2773 respective chem groups and compute single chain lDDTs. The
2774 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2775 and the best scoring one with respect to QS score is exhaustively extended.
2778 if seed_size
is None or seed_size < 1:
2779 raise RuntimeError(f
"seed_size must be an int >= 1 (got {seed_size})")
2781 if blocks_per_chem_group
is None or blocks_per_chem_group < 1:
2782 raise RuntimeError(f
"blocks_per_chem_group must be an int >= 1 "
2783 f
"(got {blocks_per_chem_group})")
2785 max_ext = seed_size - 1
2787 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2788 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2792 something_happened =
True
2793 while something_happened:
2794 something_happened =
False
2795 starting_blocks = list()
2796 for ref_chains, mdl_chains
in zip(ref_chem_groups, mdl_chem_groups):
2797 if len(mdl_chains) == 0:
2799 ref_chains_copy = list(ref_chains)
2800 for i
in range(blocks_per_chem_group):
2801 if len(ref_chains_copy) == 0:
2804 for ref_ch
in ref_chains_copy:
2805 seeds += [(ref_ch, mdl_ch)
for mdl_ch
in mdl_chains]
2812 seed = dict(mapping)
2813 seed.update({s[0]: s[1]})
2814 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2815 score_result = the_greed.FromFlatMapping(seed)
2816 if score_result.QS_global > best_score:
2817 best_score = score_result.QS_global
2820 if best_mapping !=
None:
2821 starting_blocks.append(best_mapping)
2822 if best_seed[0]
in ref_chains_copy:
2824 ref_chains_copy.remove(best_seed[0])
2829 for seed
in starting_blocks:
2830 seed = the_greed.ExtendMapping(seed)
2831 score_result = the_greed.FromFlatMapping(seed)
2832 if score_result.QS_global > best_score:
2833 best_score = score_result.QS_global
2836 if best_mapping
is not None and len(best_mapping) > len(mapping):
2839 something_happened =
True
2840 mapping.update(best_mapping)
2841 for ref_ch, mdl_ch
in best_mapping.items():
2842 for group_idx
in range(len(ref_chem_groups)):
2843 if ref_ch
in ref_chem_groups[group_idx]:
2844 ref_chem_groups[group_idx].remove(ref_ch)
2845 if mdl_ch
in mdl_chem_groups[group_idx]:
2846 mdl_chem_groups[group_idx].remove(mdl_ch)
2849 final_mapping = list()
2850 for ref_chains
in the_greed.ref_chem_groups:
2851 mapped_mdl_chains = list()
2852 for ref_ch
in ref_chains:
2853 if ref_ch
in mapping:
2854 mapped_mdl_chains.append(mapping[ref_ch])
2856 mapped_mdl_chains.append(
None)
2857 final_mapping.append(mapped_mdl_chains)
2859 return final_mapping
2861 def _SingleRigidRMSD(initial_transforms, initial_mappings, chem_groups,
2862 chem_mapping, trg_group_pos, mdl_group_pos):
2864 Takes initial transforms and sequentially adds chain pairs with lowest RMSD.
2865 The mapping from the transform that leads to lowest overall RMSD is
2868 best_mapping = dict()
2869 best_ssd = float(
"inf")
2873 for transform
in initial_transforms:
2875 mapped_mdl_chains = set()
2877 for trg_chains, mdl_chains, trg_pos, mdl_pos,
in zip(chem_groups,
2881 if len(trg_pos) == 0
or len(mdl_pos) == 0:
2885 for m_pos
in mdl_pos:
2887 t_m_pos.ApplyTransform(transform)
2888 t_mdl_pos.append(t_m_pos)
2889 for t_pos, t
in zip(trg_pos, trg_chains):
2890 for t_m_pos, m
in zip(t_mdl_pos, mdl_chains):
2891 ssd = t_pos.GetSummedSquaredDistances(t_m_pos)
2892 ssds.append((ssd, (t,m)))
2896 if p[0]
not in mapping
and p[1]
not in mapped_mdl_chains:
2897 mapping[p[0]] = p[1]
2898 mapped_mdl_chains.add(p[1])
2903 best_mapping = mapping
2907 def _IterativeRigidRMSD(initial_transforms, initial_mappings, chem_groups,
2908 chem_mapping, trg_group_pos, mdl_group_pos):
2909 """ Takes initial transforms and sequentially adds chain pairs with
2910 lowest RMSD. With each added chain pair, the transform gets updated.
2911 Thus the naming iterative. The mapping from the initial transform that
2912 leads to best overall RMSD score is returned.
2916 trg_pos_dict = dict()
2917 for trg_pos, trg_chains
in zip(trg_group_pos, chem_groups):
2918 for t_pos, t
in zip(trg_pos, trg_chains):
2919 trg_pos_dict[t] = t_pos
2920 mdl_pos_dict = dict()
2921 for mdl_pos, mdl_chains
in zip(mdl_group_pos, chem_mapping):
2922 for m_pos, m
in zip(mdl_pos, mdl_chains):
2923 mdl_pos_dict[m] = m_pos
2925 best_mapping = dict()
2926 best_rmsd = float(
"inf")
2927 for initial_transform, initial_mapping
in zip(initial_transforms,
2929 mapping = {initial_mapping[0]: initial_mapping[1]}
2930 transform =
geom.Mat4(initial_transform)
2931 mapped_trg_pos =
geom.Vec3List(trg_pos_dict[initial_mapping[0]])
2932 mapped_mdl_pos =
geom.Vec3List(mdl_pos_dict[initial_mapping[1]])
2936 trg_chain_groups = [set(group)
for group
in chem_groups]
2937 mdl_chain_groups = [set(group)
for group
in chem_mapping]
2940 for group
in trg_chain_groups:
2941 if initial_mapping[0]
in group:
2942 group.remove(initial_mapping[0])
2944 for group
in mdl_chain_groups:
2945 if initial_mapping[1]
in group:
2946 group.remove(initial_mapping[1])
2949 something_happened =
True
2950 while something_happened:
2952 something_happened=
False
2953 best_sc_mapping =
None
2954 best_sc_group_idx =
None
2955 best_sc_rmsd = float(
"inf")
2957 for trg_chains, mdl_chains
in zip(trg_chain_groups, mdl_chain_groups):
2958 for t
in trg_chains:
2959 t_pos = trg_pos_dict[t]
2960 for m
in mdl_chains:
2961 m_pos = mdl_pos_dict[m]
2963 t_m_pos.ApplyTransform(transform)
2964 rmsd = t_pos.GetRMSD(t_m_pos)
2965 if rmsd < best_sc_rmsd:
2967 best_sc_mapping = (t,m)
2968 best_sc_group_idx = group_idx
2971 if best_sc_mapping
is not None:
2972 something_happened =
True
2973 mapping[best_sc_mapping[0]] = best_sc_mapping[1]
2974 mapped_trg_pos.extend(trg_pos_dict[best_sc_mapping[0]])
2975 mapped_mdl_pos.extend(mdl_pos_dict[best_sc_mapping[1]])
2976 trg_chain_groups[best_sc_group_idx].remove(best_sc_mapping[0])
2977 mdl_chain_groups[best_sc_group_idx].remove(best_sc_mapping[1])
2979 transform = _GetSuperposition(mapped_mdl_pos, mapped_trg_pos,
2980 False).transformation
2983 mapped_mdl_pos.ApplyTransform(transform)
2984 rmsd = mapped_trg_pos.GetRMSD(mapped_mdl_pos)
2986 if rmsd < best_rmsd:
2988 best_mapping = mapping
2992 def _NaiveRMSD(chem_groups, chem_mapping, trg_group_pos, mdl_group_pos,
2996 trg_pos_dict = dict()
2997 for trg_pos, trg_chains
in zip(trg_group_pos, chem_groups):
2998 for t_pos, t
in zip(trg_pos, trg_chains):
2999 trg_pos_dict[t] = t_pos
3000 mdl_pos_dict = dict()
3001 for mdl_pos, mdl_chains
in zip(mdl_group_pos, chem_mapping):
3002 for m_pos, m
in zip(mdl_pos, mdl_chains):
3003 mdl_pos_dict[m] = m_pos
3005 best_mapping = dict()
3006 best_rmsd = float(
"inf")
3008 for mapping
in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
3011 for trg_group, mdl_group
in zip(chem_groups, mapping):
3012 for trg_ch, mdl_ch
in zip(trg_group, mdl_group):
3013 if trg_ch
is not None and mdl_ch
is not None:
3014 trg_pos.extend(trg_pos_dict[trg_ch])
3015 mdl_pos.extend(mdl_pos_dict[mdl_ch])
3016 superpose_res = mol.alg.SuperposeSVD(mdl_pos, trg_pos)
3017 if superpose_res.rmsd < best_rmsd:
3018 best_rmsd = superpose_res.rmsd
3019 best_mapping = mapping
3023 for chem_group, mapping
in zip(chem_groups, best_mapping):
3024 for trg_ch, mdl_ch
in zip(chem_group, mapping):
3025 tmp[trg_ch] = mdl_ch
3030 def _GetRefPos(trg, mdl, trg_msas, mdl_alns, max_pos = None):
3031 """ Extracts reference positions which are present in trg and mdl
3036 bb_trg = trg.Select(
"aname=\"CA\",\"C3'\"")
3037 bb_mdl = mdl.Select(
"aname=\"CA\",\"C3'\"")
3041 for aln_list
in mdl_alns:
3042 if len(aln_list) > 0:
3043 tmp = aln_list[0].GetSequence(0)
3044 ref_seq = seq.CreateSequence(tmp.GetName(), tmp.GetGaplessString())
3045 mdl_msas.append(seq.alg.MergePairwiseAlignments(aln_list, ref_seq))
3047 mdl_msas.append(seq.CreateAlignment())
3052 for trg_msa, mdl_msa
in zip(trg_msas, mdl_msas):
3054 if mdl_msa.GetCount() > 0:
3056 assert(trg_msa.GetSequence(0).GetGaplessString() == \
3057 mdl_msa.GetSequence(0).GetGaplessString())
3066 trg_indices = _GetFullyCoveredIndices(trg_msa)
3067 mdl_indices = _GetFullyCoveredIndices(mdl_msa)
3070 indices = sorted(list(trg_indices.intersection(mdl_indices)))
3073 if max_pos
is not None and len(indices) > max_pos:
3074 step = int(len(indices)/max_pos)
3075 indices = [indices[i]
for i
in range(0, len(indices), step)]
3078 trg_indices = _RefIndicesToColumnIndices(trg_msa, indices)
3079 mdl_indices = _RefIndicesToColumnIndices(mdl_msa, indices)
3082 trg_pos.append(list())
3083 mdl_pos.append(list())
3084 for s_idx
in range(trg_msa.GetCount()):
3085 trg_pos[-1].append(_ExtractMSAPos(trg_msa, s_idx, trg_indices,
3088 for s_idx
in range(1, mdl_msa.GetCount()):
3089 mdl_pos[-1].append(_ExtractMSAPos(mdl_msa, s_idx, mdl_indices,
3092 return (trg_pos, mdl_pos)
3094 def _GetFullyCoveredIndices(msa):
3095 """ Helper for _GetRefPos
3097 Returns a set containing the indices relative to first sequence in msa which
3098 are fully covered in all other sequences
3109 if sum([1
for olc
in col
if olc !=
'-']) == col.GetRowCount():
3110 indices.add(ref_idx)
3115 def _RefIndicesToColumnIndices(msa, indices):
3116 """ Helper for _GetRefPos
3118 Returns a list of mapped indices. indices refer to non-gap one letter
3119 codes in the first msa sequence. The returnes mapped indices are translated
3120 to the according msa column indices
3124 for col_idx, col
in enumerate(msa):
3126 mapping[ref_idx] = col_idx
3128 return [mapping[i]
for i
in indices]
3130 def _ExtractMSAPos(msa, s_idx, indices, view):
3131 """ Helper for _GetRefPos
3133 Returns a geom.Vec3List containing positions refering to given msa sequence.
3134 => Chain with corresponding name is mapped onto sequence and the position of
3135 the first atom of each residue specified in indices is extracted.
3136 Indices refers to column indices in msa!
3138 s = msa.GetSequence(s_idx)
3139 s_v = _CSel(view, [s.GetName()])
3142 assert(len(s.GetGaplessString()) == len(s_v.residues))
3144 residue_idx = [s.GetResidueIndex(i)
for i
in indices]
3145 return geom.Vec3List([s_v.residues[i].atoms[0].pos
for i
in residue_idx])
3147 def _NChemGroupMappings(ref_chains, mdl_chains):
3148 """ Number of mappings within one chem group
3150 :param ref_chains: Reference chains
3151 :type ref_chains: :class:`list` of :class:`str`
3152 :param mdl_chains: Model chains that are mapped onto *ref_chains*
3153 :type mdl_chains: :class:`list` of :class:`str`
3154 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3156 n_ref = len(ref_chains)
3157 n_mdl = len(mdl_chains)
3159 return factorial(n_ref)
3161 n_choose_k = binom(n_ref, n_mdl)
3162 return n_choose_k * factorial(n_mdl)
3164 n_choose_k = binom(n_mdl, n_ref)
3165 return n_choose_k * factorial(n_ref)
3167 def _NMappings(ref_chains, mdl_chains):
3168 """ Number of mappings for a full chem mapping
3170 :param ref_chains: Chem groups of reference
3171 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3172 :param mdl_chains: Model chains that map onto those chem groups
3173 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3174 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3176 assert(len(ref_chains) == len(mdl_chains))
3178 for a,b
in zip(ref_chains, mdl_chains):
3179 n *= _NChemGroupMappings(a,b)
3182 def _NMappingsWithin(ref_chains, mdl_chains, max_mappings):
3183 """ Check whether total number of mappings is smaller than given maximum
3185 In principle the same as :func:`_NMappings` but it stops as soon as the
3188 :param ref_chains: Chem groups of reference
3189 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3190 :param mdl_chains: Model chains that map onto those chem groups
3191 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3192 :param max_mappings: Number of max allowed mappings
3193 :returns: Whether number of possible mappings of *mdl_chains* onto
3194 *ref_chains* is below or equal *max_mappings*.
3196 assert(len(ref_chains) == len(mdl_chains))
3198 for a,b
in zip(ref_chains, mdl_chains):
3199 n *= _NChemGroupMappings(a,b)
3200 if n > max_mappings:
3204 def _RefSmallerGenerator(ref_chains, mdl_chains):
3205 """ Returns all possible ways to map mdl_chains onto ref_chains
3207 Specific for the case where len(ref_chains) < len(mdl_chains)
3209 for c
in itertools.combinations(mdl_chains, len(ref_chains)):
3210 for p
in itertools.permutations(c):
3213 def _RefLargerGenerator(ref_chains, mdl_chains):
3214 """ Returns all possible ways to map mdl_chains onto ref_chains
3216 Specific for the case where len(ref_chains) > len(mdl_chains)
3217 Ref chains without mapped mdl chain are assigned None
3219 n_ref = len(ref_chains)
3220 n_mdl = len(mdl_chains)
3221 for c
in itertools.combinations(range(n_ref), n_mdl):
3222 for p
in itertools.permutations(mdl_chains):
3223 ret_list = [
None] * n_ref
3224 for idx, ch
in zip(c, p):
3228 def _RefEqualGenerator(ref_chains, mdl_chains):
3229 """ Returns all possible ways to map mdl_chains onto ref_chains
3231 Specific for the case where len(ref_chains) == len(mdl_chains)
3233 for p
in itertools.permutations(mdl_chains):
3236 def _RefEmptyGenerator(ref_chains, mdl_chains):
3239 def _ConcatIterators(iterators):
3240 for item
in itertools.product(*iterators):
3243 def _ChainMappings(ref_chains, mdl_chains, n_max=None):
3244 """Returns all possible ways to map *mdl_chains* onto fixed *ref_chains*
3246 :param ref_chains: List of list of chemically equivalent chains in reference
3247 :type ref_chains: :class:`list` of :class:`list`
3248 :param mdl_chains: Equally long list of list of chemically equivalent chains
3249 in model that map on those ref chains.
3250 :type mdl_chains: :class:`list` of :class:`list`
3251 :param n_max: Aborts and raises :class:`RuntimeError` if max number of
3252 mappings is above this threshold.
3253 :type n_max: :class:`int`
3254 :returns: Iterator over all possible mappings of *mdl_chains* onto fixed
3255 *ref_chains*. Potentially contains None as padding when number of
3256 model chains for a certain mapping is smaller than the according
3258 Example: _ChainMappings([['A', 'B', 'C'], ['D', 'E']],
3259 [['x', 'y'], ['i', 'j']])
3260 gives an iterator over: [[['x', 'y', None], ['i', 'j']],
3261 [['x', 'y', None], ['j', 'i']],
3262 [['y', 'x', None], ['i', 'j']],
3263 [['y', 'x', None], ['j', 'i']],
3264 [['x', None, 'y'], ['i', 'j']],
3265 [['x', None, 'y'], ['j', 'i']],
3266 [['y', None, 'x'], ['i', 'j']],
3267 [['y', None, 'x'], ['j', 'i']],
3268 [[None, 'x', 'y'], ['i', 'j']],
3269 [[None, 'x', 'y'], ['j', 'i']],
3270 [[None, 'y', 'x'], ['i', 'j']],
3271 [[None, 'y', 'x'], ['j', 'i']]]
3273 assert(len(ref_chains) == len(mdl_chains))
3275 if n_max
is not None:
3276 if not _NMappingsWithin(ref_chains, mdl_chains, n_max):
3277 raise RuntimeError(f
"Too many mappings. Max allowed: {n_max}")
3282 for ref, mdl
in zip(ref_chains, mdl_chains):
3284 iterators.append(_RefEmptyGenerator(ref, mdl))
3285 elif len(ref) == len(mdl):
3286 iterators.append(_RefEqualGenerator(ref, mdl))
3287 elif len(ref) < len(mdl):
3288 iterators.append(_RefSmallerGenerator(ref, mdl))
3290 iterators.append(_RefLargerGenerator(ref, mdl))
3292 return _ConcatIterators(iterators)
3295 def _GetSuperposition(pos_one, pos_two, iterative):
3296 """ Computes minimal RMSD superposition for pos_one onto pos_two
3298 :param pos_one: Positions that should be superposed onto *pos_two*
3299 :type pos_one: :class:`geom.Vec3List`
3300 :param pos_two: Reference positions
3301 :type pos_two: :class:`geom.Vec3List`
3302 :iterative: Whether iterative superposition should be used. Iterative
3303 potentially raises, uses standard superposition as fallback.
3304 :type iterative: :class:`bool`
3305 :returns: Transformation matrix to superpose *pos_one* onto *pos_two*
3306 :rtype: :class:`ost.mol.alg.SuperpositionResult`
3311 res = mol.alg.IterativeSuperposeSVD(pos_one, pos_two)
3315 res = mol.alg.SuperposeSVD(pos_one, pos_two)
3319 __all__ = (
'ChainMapper',
'ReprResult',
'MappingResult')
def _NSCConserved(self, trg_ch, mdl_ch)
def _NPairConserved(self, trg_int, mdl_int)
def FromFlatMapping(self, flat_mapping)
def __init__(self, pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-5, pep_gap_ext=-2, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, resnum_aln=False)
def Align(self, s1, s2, chem_type=None)
def NWAlign(self, s1, s2, chem_type)
def ResNumAlign(self, s1, s2)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, contact_d=12.0, steep_opt_rate=None, greedy_prune_contact_map=False)
def SCCounts(self, ref_ch, mdl_ch)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], steep_opt_rate=None)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def chem_group_ref_seqs(self)
def chem_group_alignments(self)
def GetlDDTMapping(self, model, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic", steep_opt_rate=None, block_seed_size=5, block_blocks_per_chem_group=5, chem_mapping_result=None, heuristic_n_max_naive=40320)
def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False, only_interchain=False, chem_mapping_result=None, global_mapping=None)
def GetNMappings(self, model)
def GetChemMapping(self, model)
def GetMapping(self, model, n_max_naive=40320)
def ProcessStructure(self, ent)
def Align(self, s1, s2, stype)
def __init__(self, target, resnum_alignments=False, pep_seqid_thr=95., nuc_seqid_thr=95., pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-11, pep_gap_ext=-1, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, min_pep_length=6, min_nuc_length=4, n_max_naive=1e8, mdl_map_pep_seqid_thr=0., mdl_map_nuc_seqid_thr=0.)
def GetQSScoreMapping(self, model, contact_d=12.0, strategy="heuristic", block_seed_size=5, block_blocks_per_chem_group=5, heuristic_n_max_naive=40320, steep_opt_rate=None, chem_mapping_result=None, greedy_prune_contact_map=True)
def chem_group_types(self)
def GetRMSDMapping(self, model, strategy="heuristic", subsampling=50, chem_mapping_result=None, heuristic_n_max_naive=120)
def __init__(self, target, model, chem_groups, chem_mapping, unmapped_mdl_chains, mapping, alns, opt_score=None)
def GetFlatMapping(self, mdl_as_key=False)
def unmapped_mdl_chains(self)
def mdl_full_bb_pos(self)
def _GetFullBBPos(self, residues)
def superposed_mdl_bb_pos(self)
def _GetBBPos(self, residues)
def inconsistent_residues(self)
def GetFlatChainMapping(self, mdl_as_key=False)
def ref_full_bb_pos(self)
def __init__(self, lDDT, substructure, ref_view, mdl_view)
def _GetInconsistentResidues(self, ref_residues, mdl_residues)
def _InterfacePenalty1(self, interface)
def _InterfacePenalty2(self, interface)
def _MappedInterfaceScores(self, int1, int2)
def FromFlatMapping(self, flat_mapping)