OpenStructure
Loading...
Searching...
No Matches
chain_mapping.py
Go to the documentation of this file.
1"""
2Chain mapping aims to identify a one-to-one relationship between chains in a
3reference structure and a model.
4"""
5
6import itertools
7import copy
8
9import numpy as np
10
11from scipy.special import factorial
12from scipy.special import binom # as of Python 3.8, the math module implements
13 # comb, i.e. n choose k
14
15import ost
16from ost import seq
17from ost import mol
18from ost import geom
19
20from ost.mol.alg import lddt
21from ost.mol.alg import bb_lddt
22from ost.mol.alg import qsscore
23
24def _CSel(ent, cnames):
25 """ Returns view with specified chains
26
27 Ensures that quotation marks are around chain names to not confuse
28 OST query language with weird special characters.
29 """
30 query = "cname=" + ','.join([mol.QueryQuoteName(cname) for cname in cnames])
31 return ent.Select(query)
32
34 """ Result object for the chain mapping functions in :class:`ChainMapper`
35
36 Constructor is directly called within the functions, no need to construct
37 such objects yourself.
38 """
39 def __init__(self, target, model, chem_groups, chem_mapping,
40 mdl_chains_without_chem_mapping, mapping, alns, opt_score=None):
41 self._target = target
42 self._model = model
43 self._chem_groups = chem_groups
44 self._chem_mapping = chem_mapping
45 self._mdl_chains_without_chem_mapping = mdl_chains_without_chem_mapping
46 self._mapping = mapping
47 self._alns = alns
48 self._opt_score = opt_score
49
50 @property
51 def target(self):
52 """ Target/reference structure, i.e. :attr:`ChainMapper.target`
53
54 :type: :class:`ost.mol.EntityView`
55 """
56 return self._target
57
58 @property
59 def model(self):
60 """ Model structure that gets mapped onto :attr:`~target`
61
62 Underwent same processing as :attr:`ChainMapper.target`, i.e.
63 only contains peptide/nucleotide chains of sufficient size.
64
65 :type: :class:`ost.mol.EntityView`
66 """
67 return self._model
68
69 @property
70 def chem_groups(self):
71 """ Groups of chemically equivalent chains in :attr:`~target`
72
73 Same as :attr:`ChainMapper.chem_group`
74
75 :class:`list` of :class:`list` of :class:`str` (chain names)
76 """
77 return self._chem_groups
78
79 @property
80 def chem_mapping(self):
81 """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`.
82
83 :class:`list` of :class:`list` of :class:`str` (chain names)
84 """
85 return self._chem_mapping
86
87 @property
89 """ Model chains that cannot be mapped to :attr:`chem_groups`
90
91 Depends on parameterization of :class:`ChainMapper`
92
93 :class:`list` of class:`str` (chain names)
94 """
96
97 @property
98 def mapping(self):
99 """ Mapping of :attr:`~model` chains onto :attr:`~target`
100
101 Exact same shape as :attr:`~chem_groups` but containing the names of the
102 mapped chains in :attr:`~model`. May contain None for :attr:`~target`
103 chains that are not covered. No guarantee that all chains in
104 :attr:`~model` are mapped.
105
106 :class:`list` of :class:`list` of :class:`str` (chain names)
107 """
108 return self._mapping
109
110 @property
111 def alns(self):
112 """ Alignments of mapped chains in :attr:`~target` and :attr:`~model`
113
114 Each alignment is accessible with ``alns[(t_chain,m_chain)]``. First
115 sequence is the sequence of :attr:`target` chain, second sequence the
116 one from :attr:`~model`.
117
118 :type: :class:`dict` with key: :class:`tuple` of :class:`str`, value:
119 :class:`ost.seq.AlignmentHandle`
120 """
121 return self._alns
122
123 @property
124 def opt_score(self):
125 """ Placeholder property without any guarantee of being set
126
127 Different scores get optimized in the various chain mapping algorithms.
128 Some of them may set their final optimal score in that property.
129 Consult the documentation of the respective chain mapping algorithm
130 for more information. Won't be in the return dict of
131 :func:`JSONSummary`.
132 """
133 return self._opt_score
134
135 def GetFlatMapping(self, mdl_as_key=False):
136 """ Returns flat mapping as :class:`dict` for all mapable chains
137
138 :param mdl_as_key: Default is target chain name as key and model chain
139 name as value. This can be reversed with this flag.
140 :returns: :class:`dict` with :class:`str` as key/value that describe
141 one-to-one mapping
142 """
143 flat_mapping = dict()
144 for trg_chem_group, mdl_chem_group in zip(self.chem_groupschem_groups,
146 for a,b in zip(trg_chem_group, mdl_chem_group):
147 if a is not None and b is not None:
148 if mdl_as_key:
149 flat_mapping[b] = a
150 else:
151 flat_mapping[a] = b
152 return flat_mapping
153
154 def JSONSummary(self):
155 """ Returns JSON serializable summary of results
156 """
157 json_dict = dict()
158 json_dict["chem_groups"] = self.chem_groupschem_groups
159 json_dict["mapping"] = self.mappingmapping
160 json_dict["flat_mapping"] = self.GetFlatMapping()
161 json_dict["alns"] = list()
162 for aln in self.alns.values():
163 trg_seq = aln.GetSequence(0)
164 mdl_seq = aln.GetSequence(1)
165 aln_dict = {"trg_ch": trg_seq.GetName(), "trg_seq": str(trg_seq),
166 "mdl_ch": mdl_seq.GetName(), "mdl_seq": str(mdl_seq)}
167 json_dict["alns"].append(aln_dict)
168 return json_dict
169
170
172
173 """ Result object for :func:`ChainMapper.GetRepr`
174
175 Constructor is directly called within the function, no need to construct
176 such objects yourself.
177
178 :param lDDT: lDDT for this mapping. Depends on how you call
179 :func:`ChainMapper.GetRepr` whether this is backbone only or
180 full atom lDDT.
181 :type lDDT: :class:`float`
182 :param substructure: The full substructure for which we searched for a
183 representation
184 :type substructure: :class:`ost.mol.EntityView`
185 :param ref_view: View pointing to the same underlying entity as
186 *substructure* but only contains the stuff that is mapped
187 :type ref_view: :class:`mol.EntityView`
188 :param mdl_view: The matching counterpart in model
189 :type mdl_view: :class:`mol.EntityView`
190 """
191 def __init__(self, lDDT, substructure, ref_view, mdl_view):
192 self._lDDT = lDDT
193 self._substructure = substructure
194 assert(len(ref_view.residues) == len(mdl_view.residues))
195 self._ref_view = ref_view
196 self._mdl_view = mdl_view
197
198 # lazily evaluated attributes
199 self._ref_bb_pos = None
200 self._mdl_bb_pos = None
203 self._superposition = None
205 self._ost_query = None
206 self._flat_mapping = None
208
209 @property
210 def lDDT(self):
211 """ lDDT of representation result
212
213 Depends on how you call :func:`ChainMapper.GetRepr` whether this is
214 backbone only or full atom lDDT.
215
216 :type: :class:`float`
217 """
218 return self._lDDT
219
220 @property
221 def substructure(self):
222 """ The full substructure for which we searched for a
223 representation
224
225 :type: :class:`ost.mol.EntityView`
226 """
227 return self._substructure
228
229 @property
230 def ref_view(self):
231 """ View which contains the mapped subset of :attr:`substructure`
232
233 :type: :class:`ost.mol.EntityView`
234 """
235 return self._ref_view
236
237 @property
238 def mdl_view(self):
239 """ The :attr:`ref_view` representation in the model
240
241 :type: :class:`ost.mol.EntityView`
242 """
243 return self._mdl_view
244
245 @property
246 def ref_residues(self):
247 """ The reference residues
248
249 :type: class:`mol.ResidueViewList`
250 """
251 return self.ref_view.residues
252
253 @property
254 def mdl_residues(self):
255 """ The model residues
256
257 :type: :class:`mol.ResidueViewList`
258 """
259 return self.mdl_view.residues
260
261 @property
263 """ A list of mapped residue whose names do not match (eg. ALA in the
264 reference and LEU in the model).
265
266 The mismatches are reported as a tuple of :class:`~ost.mol.ResidueView`
267 (reference, model), or as an empty list if all the residue names match.
268
269 :type: :class:`list`
270 """
271 if self._inconsistent_residues is None:
274 return self._inconsistent_residues
275
276 @property
277 def ref_bb_pos(self):
278 """ Representative backbone positions for reference residues.
279
280 Thats CA positions for peptides and C3' positions for Nucleotides.
281
282 :type: :class:`geom.Vec3List`
283 """
284 if self._ref_bb_pos is None:
286 return self._ref_bb_pos
287
288 @property
289 def mdl_bb_pos(self):
290 """ Representative backbone positions for model residues.
291
292 Thats CA positions for peptides and C3' positions for Nucleotides.
293
294 :type: :class:`geom.Vec3List`
295 """
296 if self._mdl_bb_pos is None:
298 return self._mdl_bb_pos
299
300 @property
302 """ Representative backbone positions for reference residues.
303
304 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
305 positions for Nucleotides.
306
307 :type: :class:`geom.Vec3List`
308 """
309 if self._ref_full_bb_pos is None:
311 return self._ref_full_bb_pos
312
313 @property
315 """ Representative backbone positions for reference residues.
316
317 Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
318 positions for Nucleotides.
319
320 :type: :class:`geom.Vec3List`
321 """
322 if self._mdl_full_bb_pos is None:
324 return self._mdl_full_bb_pos
325
326
327 @property
328 def superposition(self):
329 """ Superposition of mdl residues onto ref residues
330
331 Superposition computed as minimal RMSD superposition on
332 :attr:`ref_bb_pos` and :attr:`mdl_bb_pos`. If number of positions is
333 smaller 3, the full_bb_pos equivalents are used instead.
334
335 :type: :class:`ost.mol.alg.SuperpositionResult`
336 """
337 if self._superposition is None:
338 if len(self.mdl_bb_posmdl_bb_pos) < 3:
341 else:
344 return self._superposition
345
346 @property
347 def transform(self):
348 """ Transformation to superpose mdl residues onto ref residues
349
350 :type: :class:`ost.geom.Mat4`
351 """
352 return self.superposition.transformation
353
354 @property
356 """ :attr:`mdl_bb_pos` with :attr:`transform applied`
357
358 :type: :class:`geom.Vec3List`
359 """
360 if self._superposed_mdl_bb_pos is None:
363 return self._superposed_mdl_bb_pos
364
365 @property
366 def bb_rmsd(self):
367 """ RMSD of the binding site backbone atoms after :attr:`superposition`
368
369 :type: :class:`float`
370 """
371 return self.superposition.rmsd
372
373
374 @property
375 def ost_query(self):
376 """ query for mdl residues in OpenStructure query language
377
378 Repr can be selected as ``full_mdl.Select(ost_query)``
379
380 Returns invalid query if residue numbers have insertion codes.
381
382 :type: :class:`str`
383 """
384 if self._ost_query is None:
385 chain_rnums = dict()
386 for r in self.mdl_residuesmdl_residues:
387 chname = r.GetChain().GetName()
388 rnum = r.GetNumber().GetNum()
389 if chname not in chain_rnums:
390 chain_rnums[chname] = list()
391 chain_rnums[chname].append(str(rnum))
392 chain_queries = list()
393 for k,v in chain_rnums.items():
394 q = f"(cname={mol.QueryQuoteName(k)} and "
395 q += f"rnum={','.join(v)})"
396 chain_queries.append(q)
397 self._ost_query = " or ".join(chain_queries)
398 return self._ost_query
399
400 def JSONSummary(self):
401 """ Returns JSON serializable summary of results
402 """
403 json_dict = dict()
404 json_dict["lDDT"] = self.lDDT
405 json_dict["ref_residues"] = [r.GetQualifiedName() for r in \
407 json_dict["mdl_residues"] = [r.GetQualifiedName() for r in \
409 json_dict["transform"] = list(self.transformtransform.data)
410 json_dict["bb_rmsd"] = self.bb_rmsd
411 json_dict["ost_query"] = self.ost_query
412 json_dict["flat_mapping"] = self.GetFlatChainMapping()
413 return json_dict
414
415 def GetFlatChainMapping(self, mdl_as_key=False):
416 """ Returns flat mapping of all chains in the representation
417
418 :param mdl_as_key: Default is target chain name as key and model chain
419 name as value. This can be reversed with this flag.
420 :returns: :class:`dict` with :class:`str` as key/value that describe
421 one-to-one mapping
422 """
423 flat_mapping = dict()
424 for trg_res, mdl_res in zip(self.ref_residuesref_residues, self.mdl_residuesmdl_residues):
425 if mdl_as_key:
426 flat_mapping[mdl_res.chain.name] = trg_res.chain.name
427 else:
428 flat_mapping[trg_res.chain.name] = mdl_res.chain.name
429 return flat_mapping
430
431 def _GetFullBBPos(self, residues):
432 """ Helper to extract full backbone positions
433 """
434 exp_pep_atoms = ["N", "CA", "C"]
435 exp_nuc_atoms = ["O5'", "C5'", "C4'", "C3'", "O3'"]
436 bb_pos = geom.Vec3List()
437 for r in residues:
438 if r.GetChemType() == mol.ChemType.NUCLEOTIDES:
439 exp_atoms = exp_nuc_atoms
440 elif r.GetChemType() == mol.ChemType.AMINOACIDS:
441 exp_atoms = exp_pep_atoms
442 else:
443 raise RuntimeError(f"Residue {r.qualified_name} has unexpected"
444 f" chem type {r.GetChemType()}")
445 for aname in exp_atoms:
446 a = r.FindAtom(aname)
447 if not a.IsValid():
448 raise RuntimeError(f"Expected atom {aname} missing from "
449 f"{r.qualified_name}")
450 bb_pos.append(a.GetPos())
451 return bb_pos
452
453 def _GetBBPos(self, residues):
454 """ Helper to extract single representative position for each residue
455 """
456 bb_pos = geom.Vec3List()
457 for r in residues:
458 at = r.FindAtom("CA")
459 if not at.IsValid():
460 at = r.FindAtom("C3'")
461 if not at.IsValid():
462 raise RuntimeError(f"Residue {r.qualified_name} missing "
463 f"expected CA/C3' atom")
464 bb_pos.append(at.GetPos())
465 return bb_pos
466
467 def _GetInconsistentResidues(self, ref_residues, mdl_residues):
468 """ Helper to extract a list of inconsistent residues.
469 """
470 if len(ref_residues) != len(mdl_residues):
471 raise ValueError("Something terrible happened... Reference and "
472 "model lengths differ... RUN...")
473 inconsistent_residues = list()
474 for ref_residue, mdl_residue in zip(ref_residues, mdl_residues):
475 if ref_residue.name != mdl_residue.name:
476 inconsistent_residues.append((ref_residue, mdl_residue))
477 return inconsistent_residues
478
479
481 """ Class to compute chain mappings
482
483 All algorithms are performed on processed structures which fulfill
484 criteria as given in constructor arguments (*min_pep_length*,
485 "min_nuc_length") and only contain residues which have all required backbone
486 atoms. for peptide residues thats N, CA, C and CB (no CB for GLY), for
487 nucleotide residues thats O5', C5', C4', C3' and O3'.
488
489 Chain mapping is a three step process:
490
491 * Group chemically identical chains in *target* into so called chem
492 groups. There are two options to achieve this:
493 1) using pairwise alignments that are either computed with
494 Needleman-Wunsch (NW) or simply derived from residue numbers
495 (*resnum_alignments* flag). In case of NW, *pep_subst_mat*, *pep_gap_open*
496 and *pep_gap_ext* and their nucleotide equivalents are relevant. Two
497 chains are considered identical if they fulfill the *pep_seqid_thr* and
498 have at least *min_pep_length* aligned residues. Same logic is applied
499 for nucleotides with respective thresholds.
500 2) specify seqres sequences and provide the mapping manually. This can
501 be useful if you're loading data from an mmCIF file where you have mmCIF
502 entity information. Alignments are performed based on residue numbers
503 in this case and don't consider the *resnum_alignments* flag. Any
504 mismatch of one letter code in the structure and the respective SEQRES
505 raises an error.
506
507 * Map chains in an input model to these groups. Parameters for generating
508 alignments are the same as above. Model chains are mapped to the chem
509 group with highest sequence identity. Optionally, you can set a minimum
510 sequence identity threshold with *mdl_map_pep_seqid_thr*/
511 *mdl_map_nuc_seqid_thr* to avoid nonsensical mappings. You can either
512 get the group mapping with :func:`GetChemMapping` or directly call
513 one of the full fletched one-to-one chain mapping functions which
514 execute that step internally.
515
516 * Obtain one-to-one mapping for chains in an input model and
517 *target* with one of the available mapping functions. Just to get an
518 idea of complexity. If *target* and *model* are octamers, there are
519 ``8! = 40320`` possible chain mappings.
520
521 :param target: Target structure onto which models are mapped.
522 Computations happen on a selection only containing
523 polypeptides and polynucleotides.
524 :type target: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
525 :param resnum_alignments: Use residue numbers instead of
526 Needleman-Wunsch to compute pairwise
527 alignments. Relevant for :attr:`~chem_groups`
528 if *seqres* is not specified explicitely.
529 :type resnum_alignments: :class:`bool`
530 :param pep_seqid_thr: Sequence identity threshold (in percent of aligned
531 columns) used to decide when two chains are
532 identical. 95 percent tolerates the few mutations
533 crystallographers like to do. Range: [0-100].
534 :type pep_seqid_thr: :class:`float`
535 :param nuc_seqid_thr: Nucleotide equivalent for *pep_seqid_thr*
536 :type nuc_seqid_thr: :class:`float`
537 :param pep_subst_mat: Substitution matrix to align peptide sequences,
538 irrelevant if *resnum_alignments* is True,
539 defaults to seq.alg.BLOSUM62
540 :type pep_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
541 :param pep_gap_open: Gap open penalty to align peptide sequences,
542 irrelevant if *resnum_alignments* is True
543 :type pep_gap_open: :class:`int`
544 :param pep_gap_ext: Gap extension penalty to align peptide sequences,
545 irrelevant if *resnum_alignments* is True
546 :type pep_gap_ext: :class:`int`
547 :param nuc_subst_mat: Nucleotide equivalent for *pep_subst_mat*,
548 defaults to seq.alg.NUC44
549 :type nuc_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
550 :param nuc_gap_open: Nucleotide equivalent for *pep_gap_open*
551 :type nuc_gap_open: :class:`int`
552 :param nuc_gap_ext: Nucleotide equivalent for *pep_gap_ext*
553 :type nuc_gap_ext: :class:`int`
554 :param min_pep_length: Minimal number of residues for a peptide chain to be
555 considered in target and in models.
556 :type min_pep_length: :class:`int`
557 :param min_nuc_length: Minimal number of residues for a nucleotide chain to be
558 considered in target and in models.
559 :type min_nuc_length: :class:`int`
560 :param n_max_naive: Max possible chain mappings that are enumerated in
561 :func:`~GetNaivelDDTMapping` /
562 :func:`~GetDecomposerlDDTMapping`. A
563 :class:`RuntimeError` is raised in case of bigger
564 complexity.
565 :type n_max_naive: :class:`int`
566 :param mdl_map_pep_seqid_thr: Sequence identity threshold (in percent of
567 aligned columns) to avoid non-sensical
568 mapping of model chains to reference chem
569 groups. If a value larger than 0.0 is
570 provided, minimal criteria for assignment
571 becomes a sequence identity of
572 *mdl_map_pep_seqid_thr* and at least
573 *min_pep_length* aligned residues. If set to
574 zero, it simply assigns a model chain to the
575 chem group with highest sequence identity.
576 Range: [0-100].
577 :type mdl_map_pep_seqid_thr: :class:`float`
578 :param mdl_map_nuc_seqid_thr: Nucleotide equivalent of
579 *mdl_map_pep_seqid_thr*
580 :type mdl_map_nuc_seqid_thr: :class:`float`
581 :param seqres: SEQRES sequences. All polymer chains in *target* will be
582 aligned to these sequences using a residue number based
583 alignment, effectively ignoring *resnum_alignments* in
584 the chem grouping step. Additionally, you need to
585 manually specify a mapping of the polymer chains using
586 *trg_seqres_mapping* and an error is raised otherwise.
587 The one letter codes in
588 the structure must exactly match the respective characters
589 in seqres and an error is raised if not. These seqres
590 sequences are set as :attr:`~chem_group_ref_seqs` and will
591 thus influence the mapping of model chains.
592 :type seqres: :class:`ost.seq.SequenceList`
593 :param trg_seqres_mapping: Maps each polymer chain in *target* to a sequence
594 in *seqres*. It's a :class:`dict` with chain name
595 as key and seqres sequence name as value.
596 :type trg_seqres_mapping: :class:`dict`
597 """
598 def __init__(self, target, resnum_alignments=False,
599 pep_seqid_thr = 95., nuc_seqid_thr = 95.,
600 pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -11,
601 pep_gap_ext = -1, nuc_subst_mat = seq.alg.NUC44,
602 nuc_gap_open = -4, nuc_gap_ext = -4,
603 min_pep_length = 6, min_nuc_length = 4,
604 n_max_naive = 1e8,
605 mdl_map_pep_seqid_thr = 0.,
606 mdl_map_nuc_seqid_thr = 0.,
607 seqres = None,
608 trg_seqres_mapping = None):
609
610 # input parameter check
611 seqres_params_set = sum([seqres is not None,
612 trg_seqres_mapping is not None])
613 if seqres_params_set not in [0, 2]:
614 raise RuntimeError("Must either give both, seqres and "
615 "trg_seqres_mapping, or none of them.")
616 # attributes
617 self.resnum_alignments = resnum_alignments
618 self.pep_seqid_thr = pep_seqid_thr
619 self.nuc_seqid_thr = nuc_seqid_thr
620 self.min_pep_length = min_pep_length
621 self.min_nuc_length = min_nuc_length
622 self.n_max_naive = n_max_naive
623 self.mdl_map_pep_seqid_thr = mdl_map_pep_seqid_thr
624 self.mdl_map_nuc_seqid_thr = mdl_map_nuc_seqid_thr
625 self.pep_subst_mat = pep_subst_mat
626 self.pep_gap_open = pep_gap_open
627 self.pep_gap_ext = pep_gap_ext
628 self.nuc_subst_mat = nuc_subst_mat
629 self.nuc_gap_open = nuc_gap_open
630 self.nuc_gap_ext = nuc_gap_ext
631 self.seqres = seqres
632 self.trg_seqres_mapping = trg_seqres_mapping
633 self.seqres_set = seqres_params_set == 2
634
635 # lazy computed attributes
636 self._chem_groups = None
640
641 # target structure preprocessing
643 self.ProcessStructure(target)
644
645 # input parameter check
646 if self.seqres_set:
647 # check if seqres names, i.e. entity ids, are unique
648 entity_ids = set()
649 for s in self.seqres:
650 sname = s.GetName()
651 if sname in entity_ids:
652 raise RuntimeError(f"Sequence names in seqres param must "
653 f"be unique and refer to entity ids. "
654 f"Duplicate sequence name: \"{sname}\"")
655 entity_ids.add(sname)
656 # check if entity ids in trg_seqres_mapping are all covered
657 # in seqres
658 for cname, entity_id in self.trg_seqres_mapping.items():
659 if entity_id not in entity_ids:
660 raise RuntimeError(f"trg_seqres_mapping contains entity id "
661 f"for which no seqres is given: "
662 f"cname: \"{cname}\", entity id: "
663 f"\"{entity_id}\"")
664 # check if all sequences in self.polypep_seqs and self.polynuc_seqs
665 # have a mapping in trg_seqres_mapping
666 for s in self.polypep_seqs:
667 if s.GetName() not in self.trg_seqres_mapping:
668 raise RuntimeError(f"If seqres information is provided, "
669 f"all polymer chains must be covered "
670 f"by trg_seqres_mapping. Missing "
671 f"mapping for chain \"{s.GetName()}\"")
672 for s in self.polynuc_seqspolynuc_seqs:
673 if s.GetName() not in self.trg_seqres_mapping:
674 raise RuntimeError(f"If seqres information is provided, "
675 f"all polymer chains must be covered "
676 f"by trg_seqres_mapping. Missing "
677 f"mapping for chain \"{s.GetName()}\"")
678
679
680 @property
681 def target(self):
682 """Target structure that only contains peptides/nucleotides
683
684 Contains only residues that have the backbone representatives
685 (CA for peptide and C3' for nucleotides) to avoid ATOMSEQ alignment
686 inconsistencies when switching between all atom and backbone only
687 representations.
688
689 :type: :class:`ost.mol.EntityView`
690 """
691 return self._target
692
693 @property
694 def polypep_seqs(self):
695 """Sequences of peptide chains in :attr:`~target`
696
697 :type: :class:`ost.seq.SequenceList`
698 """
699 return self._polypep_seqs
700
701 @property
702 def polynuc_seqs(self):
703 """Sequences of nucleotide chains in :attr:`~target`
704
705 :type: :class:`ost.seq.SequenceList`
706 """
707 return self._polynuc_seqs
708
709 @property
710 def chem_groups(self):
711 """Groups of chemically equivalent chains in :attr:`~target`
712
713 First chain in group is the one with longest sequence.
714
715 :getter: Computed on first use (cached)
716 :type: :class:`list` of :class:`list` of :class:`str` (chain names)
717 """
718 if self._chem_groups is None:
719 self._ComputeChemGroups()
720 return self._chem_groups
721
722 @property
724 """MSA for each group in :attr:`~chem_groups`
725
726 The first sequence is the reference sequence.
727 The subsequent sequences represent the ATOMSEQ sequences in
728 :attr:`~target` in same order as in :attr:`~chem_groups`.
729
730 :getter: Computed on first use (cached)
731 :type: :class:`ost.seq.AlignmentList`
732 """
733 if self._chem_group_alignments is None:
734 self._ComputeChemGroups()
735 return self._chem_group_alignments
736
737 @property
739 """Reference sequence for each group in :attr:`~chem_groups`
740
741 :getter: Computed on first use (cached)
742 :type: :class:`ost.seq.SequenceList`
743 """
744 if self._chem_group_ref_seqs is None:
745 self._ComputeChemGroups()
746 return self._chem_group_ref_seqs
747
748 @property
750 """ChemType of each group in :attr:`~chem_groups`
751
752 Specifying if groups are poly-peptides/nucleotides, i.e.
753 :class:`ost.mol.ChemType.AMINOACIDS` or
754 :class:`ost.mol.ChemType.NUCLEOTIDES`
755
756 :getter: Computed on first use (cached)
757 :type: :class:`list` of :class:`ost.mol.ChemType`
758 """
759 if self._chem_group_types is None:
760 self._ComputeChemGroups()
761 return self._chem_group_types
762
763 def GetChemMapping(self, model):
764 """Maps sequences in *model* to chem_groups of target
765
766 :param model: Model from which to extract sequences, a
767 selection that only includes peptides and nucleotides
768 is performed and returned along other results.
769 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
770 :returns: Tuple with two lists of length `len(self.chem_groups)`, a list
771 and an :class:`ost.mol.EntityView` representing *model*:
772 1) Each element is a :class:`list` with mdl chain names that
773 map to the chem group at that position.
774 2) Each element is a :class:`ost.seq.AlignmentList` aligning
775 these mdl chain sequences to the chem group ref sequences.
776 3) The model chains that could not be mapped to the reference
777 4) A selection of *model* that only contains polypeptides and
778 polynucleotides whose ATOMSEQ exactly matches the sequence
779 info in the returned alignments.
780 """
781 mdl, mdl_pep_seqs, mdl_nuc_seqs = self.ProcessStructure(model)
782 mapping = [list() for x in self.chem_groupschem_groups]
783 mdl_chains_without_chem_mapping = list()
784 alns = [seq.AlignmentList() for x in self.chem_groupschem_groups]
785
786 for s in mdl_pep_seqs:
787 idx, aln = self._MapSequence(self.chem_group_ref_seqs,
789 s, mol.ChemType.AMINOACIDS, mdl,
790 seq_id_thr = self.mdl_map_pep_seqid_thr,
791 min_aln_length = self.min_pep_length)
792 if idx is None:
793 ost.LogWarning("Could not map model chain %s to any chem group"
794 " in the target" % s.name)
795 mdl_chains_without_chem_mapping.append(s.GetName())
796 else:
797 mapping[idx].append(s.GetName())
798 alns[idx].append(aln)
799
800 for s in mdl_nuc_seqs:
801 idx, aln = self._MapSequence(self.chem_group_ref_seqs,
803 s, mol.ChemType.NUCLEOTIDES, mdl,
804 seq_id_thr = self.mdl_map_nuc_seqid_thr,
805 min_aln_length = self.min_nuc_length)
806 if idx is None:
807 ost.LogWarning("Could not map model chain %s to any chem group"
808 " in the target" % s.name)
809 mdl_chains_without_chem_mapping.append(s.GetName())
810 else:
811 mapping[idx].append(s.GetName())
812 alns[idx].append(aln)
813
814 return (mapping, alns, mdl_chains_without_chem_mapping, mdl)
815
816
817 def GetlDDTMapping(self, model, inclusion_radius=15.0,
818 thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic",
819 steep_opt_rate = None, block_seed_size = 5,
820 block_blocks_per_chem_group = 5,
821 chem_mapping_result = None,
822 heuristic_n_max_naive = 40320):
823 """ Identify chain mapping by optimizing lDDT score
824
825 Maps *model* chain sequences to :attr:`~chem_groups` and find mapping
826 based on backbone only lDDT score (CA for amino acids C3' for
827 Nucleotides).
828
829 Either performs a naive search, i.e. enumerate all possible mappings or
830 executes a greedy strategy that tries to identify a (close to) optimal
831 mapping in an iterative way by starting from a start mapping (seed). In
832 each iteration, the one-to-one mapping that leads to highest increase
833 in number of conserved contacts is added with the additional requirement
834 that this added mapping must have non-zero interface counts towards the
835 already mapped chains. So basically we're "growing" the mapped structure
836 by only adding connected stuff.
837
838 The available strategies:
839
840 * **naive**: Enumerates all possible mappings and returns best
841
842 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
843 all ref/mdl chain combinations within the respective chem groups and
844 retain the mapping leading to the best lDDT.
845
846 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
847 all ref/mdl chain combinations within the respective chem groups and
848 extend them to *block_seed_size*. *block_blocks_per_chem_group*
849 for each chem group are selected for exhaustive extension.
850
851 * **heuristic**: Uses *naive* strategy if number of possible mappings
852 is within *heuristic_n_max_naive*. The default of 40320 corresponds
853 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
854 6!*2!=1440 etc. If the number of possible mappings is larger,
855 *greedy_full* is used.
856
857 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
858 mapping.
859
860 :param model: Model to map
861 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
862 :param inclusion_radius: Inclusion radius for lDDT
863 :type inclusion_radius: :class:`float`
864 :param thresholds: Thresholds for lDDT
865 :type thresholds: :class:`list` of :class:`float`
866 :param strategy: Strategy to find mapping. Must be in ["naive",
867 "greedy_full", "greedy_block"]
868 :type strategy: :class:`str`
869 :param steep_opt_rate: Only relevant for greedy strategies.
870 If set, every *steep_opt_rate* mappings, a simple
871 optimization is executed with the goal of
872 avoiding local minima. The optimization
873 iteratively checks all possible swaps of mappings
874 within their respective chem groups and accepts
875 swaps that improve lDDT score. Iteration stops as
876 soon as no improvement can be achieved anymore.
877 :type steep_opt_rate: :class:`int`
878 :param block_seed_size: Param for *greedy_block* strategy - Initial seeds
879 are extended by that number of chains.
880 :type block_seed_size: :class:`int`
881 :param block_blocks_per_chem_group: Param for *greedy_block* strategy -
882 Number of blocks per chem group that
883 are extended in an initial search
884 for high scoring local solutions.
885 :type block_blocks_per_chem_group: :class:`int`
886 :param chem_mapping_result: Pro param. The result of
887 :func:`~GetChemMapping` where you provided
888 *model*. If set, *model* parameter is not
889 used.
890 :type chem_mapping_result: :class:`tuple`
891 :returns: A :class:`MappingResult`
892 """
893
894 strategies = ["naive", "greedy_full", "greedy_block", "heuristic"]
895 if strategy not in strategies:
896 raise RuntimeError(f"Strategy must be in {strategies}")
897
898 if chem_mapping_result is None:
899 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
900 self.GetChemMapping(model)
901 else:
902 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
903 chem_mapping_result
904
905 ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
907 chem_mapping,
908 chem_group_alns)
909
910 # check for the simplest case
911 one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
912 if one_to_one is not None:
913 alns = dict()
914 for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
915 for ref_ch, mdl_ch in zip(ref_group, mdl_group):
916 if ref_ch is not None and mdl_ch is not None:
917 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
918 alns[(ref_ch, mdl_ch)] = aln
919 return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
920 mdl_chains_without_chem_mapping, one_to_one, alns)
921
922 if strategy == "heuristic":
923 if _NMappingsWithin(self.chem_groupschem_groups, chem_mapping,
924 heuristic_n_max_naive):
925 strategy = "naive"
926 else:
927 strategy = "greedy_full"
928
929 mapping = None
930 opt_lddt = None
931
932 if strategy == "naive":
933 mapping, opt_lddt = _lDDTNaive(self.targettarget, mdl, inclusion_radius,
934 thresholds, self.chem_groupschem_groups,
935 chem_mapping, ref_mdl_alns,
936 self.n_max_naive)
937 else:
938 # its one of the greedy strategies - setup greedy searcher
939 the_greed = _lDDTGreedySearcher(self.targettarget, mdl, self.chem_groupschem_groups,
940 chem_mapping, ref_mdl_alns,
941 inclusion_radius=inclusion_radius,
942 thresholds=thresholds,
943 steep_opt_rate=steep_opt_rate)
944 if strategy == "greedy_full":
945 mapping = _lDDTGreedyFull(the_greed)
946 elif strategy == "greedy_block":
947 mapping = _lDDTGreedyBlock(the_greed, block_seed_size,
948 block_blocks_per_chem_group)
949 # cached => lDDT computation is fast here
950 opt_lddt = the_greed.Score(mapping)
951
952 alns = dict()
953 for ref_group, mdl_group in zip(self.chem_groupschem_groups, mapping):
954 for ref_ch, mdl_ch in zip(ref_group, mdl_group):
955 if ref_ch is not None and mdl_ch is not None:
956 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
957 alns[(ref_ch, mdl_ch)] = aln
958
959 return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
960 mdl_chains_without_chem_mapping, mapping, alns,
961 opt_score = opt_lddt)
962
963
964 def GetQSScoreMapping(self, model, contact_d = 12.0, strategy = "heuristic",
965 block_seed_size = 5, block_blocks_per_chem_group = 5,
966 heuristic_n_max_naive = 40320, steep_opt_rate = None,
967 chem_mapping_result = None,
968 greedy_prune_contact_map = True):
969 """ Identify chain mapping based on QSScore
970
971 Scoring is based on CA/C3' positions which are present in all chains of
972 a :attr:`chem_groups` as well as the *model* chains which are mapped to
973 that respective chem group.
974
975 The following strategies are available:
976
977 * **naive**: Naively iterate all possible mappings and return best based
978 on QS score.
979
980 * **greedy_full**: try multiple seeds for greedy extension, i.e. try
981 all ref/mdl chain combinations within the respective chem groups and
982 retain the mapping leading to the best QS-score.
983
984 * **greedy_block**: try multiple seeds for greedy extension, i.e. try
985 all ref/mdl chain combinations within the respective chem groups and
986 extend them to *block_seed_size*. *block_blocks_per_chem_group*
987 for each chem group are selected for exhaustive extension.
988
989 * **heuristic**: Uses *naive* strategy if number of possible mappings
990 is within *heuristic_n_max_naive*. The default of 40320 corresponds
991 to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
992 6!*2!=1440 etc. If the number of possible mappings is larger,
993 *greedy_full* is used.
994
995 Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
996 mapping.
997
998 :param model: Model to map
999 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1000 :param contact_d: Max distance between two residues to be considered as
1001 contact in qs scoring
1002 :type contact_d: :class:`float`
1003 :param strategy: Strategy for sampling, must be in ["naive",
1004 greedy_full", "greedy_block"]
1005 :type strategy: :class:`str`
1006 :param chem_mapping_result: Pro param. The result of
1007 :func:`~GetChemMapping` where you provided
1008 *model*. If set, *model* parameter is not
1009 used.
1010 :type chem_mapping_result: :class:`tuple`
1011 :param greedy_prune_contact_map: Relevant for all strategies that use
1012 greedy extensions. If True, only chains
1013 with at least 3 contacts (8A CB
1014 distance) towards already mapped chains
1015 in trg/mdl are considered for
1016 extension. All chains that give a
1017 potential non-zero QS-score increase
1018 are used otherwise (at least one
1019 contact within 12A). The consequence
1020 is reduced runtime and usually no
1021 real reduction in accuracy.
1022 :returns: A :class:`MappingResult`
1023 """
1024
1025 strategies = ["naive", "greedy_full", "greedy_block", "heuristic"]
1026 if strategy not in strategies:
1027 raise RuntimeError(f"strategy must be {strategies}")
1028
1029 if chem_mapping_result is None:
1030 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1031 self.GetChemMapping(model)
1032 else:
1033 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1034 chem_mapping_result
1035 ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1037 chem_mapping,
1038 chem_group_alns)
1039 # check for the simplest case
1040 one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
1041 if one_to_one is not None:
1042 alns = dict()
1043 for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
1044 for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1045 if ref_ch is not None and mdl_ch is not None:
1046 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1047 alns[(ref_ch, mdl_ch)] = aln
1048 return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1049 mdl_chains_without_chem_mapping, one_to_one, alns)
1050
1051 if strategy == "heuristic":
1052 if _NMappingsWithin(self.chem_groupschem_groups, chem_mapping,
1053 heuristic_n_max_naive):
1054 strategy = "naive"
1055 else:
1056 strategy = "greedy_full"
1057
1058 mapping = None
1059 opt_qsscore = None
1060
1061 if strategy == "naive":
1062 mapping, opt_qsscore = _QSScoreNaive(self.targettarget, mdl,
1064 chem_mapping, ref_mdl_alns,
1065 contact_d, self.n_max_naive)
1066 else:
1067 # its one of the greedy strategies - setup greedy searcher
1068 the_greed = _QSScoreGreedySearcher(self.targettarget, mdl,
1070 chem_mapping, ref_mdl_alns,
1071 contact_d = contact_d,
1072 steep_opt_rate=steep_opt_rate,
1073 greedy_prune_contact_map = greedy_prune_contact_map)
1074 if strategy == "greedy_full":
1075 mapping = _QSScoreGreedyFull(the_greed)
1076 elif strategy == "greedy_block":
1077 mapping = _QSScoreGreedyBlock(the_greed, block_seed_size,
1078 block_blocks_per_chem_group)
1079 # cached => QSScore computation is fast here
1080 opt_qsscore = the_greed.Score(mapping, check=False).QS_global
1081
1082 alns = dict()
1083 for ref_group, mdl_group in zip(self.chem_groupschem_groups, mapping):
1084 for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1085 if ref_ch is not None and mdl_ch is not None:
1086 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1087 alns[(ref_ch, mdl_ch)] = aln
1088
1089 return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1090 mdl_chains_without_chem_mapping, mapping, alns,
1091 opt_score = opt_qsscore)
1092
1093 def GetRMSDMapping(self, model, strategy = "heuristic", subsampling=50,
1094 chem_mapping_result = None, heuristic_n_max_naive = 120):
1095 """Identify chain mapping based on minimal RMSD superposition
1096
1097 Superposition and scoring is based on CA/C3' positions which are present
1098 in all chains of a :attr:`chem_groups` as well as the *model*
1099 chains which are mapped to that respective chem group.
1100
1101 The following strategies are available:
1102
1103 * **naive**: Naively iterate all possible mappings and return the one
1104 with lowes RMSD.
1105
1106 * **greedy_single**: perform all vs. all single chain superpositions
1107 within the respective ref/mdl chem groups to use as starting points.
1108 For each starting point, iteratively add the model/target chain pair
1109 with lowest RMSD until a full mapping is achieved. The mapping with
1110 lowest RMSD is returned.
1111
1112 * **greedy_single_centroid**: Same as *greedy_single* but iteratively
1113 adds model/target chain pairs with lowest centroid distance. The
1114 mapping with the lowest centroid RMSD is returned. This is mainly for
1115 evaluation purposes. One known failure mode is that intertwined
1116 structures may have multiple centroids sitting on top of each other.
1117 RMSD and thus *greedy_single* are better able to disentangle this
1118 situation.
1119
1120 * **greedy_iterative**: Same as greedy_single_rmsd exept that the
1121 transformation gets updated with each added chain pair.
1122
1123 * **heuristic**: Uses *naive* strategy if number of possible mappings
1124 is within *heuristic_n_max_naive*. The default of 120 corresponds
1125 to a homo-pentamer (5!=120). If the number of possible mappings is
1126 larger, *greedy_iterative* is used.
1127
1128 :param model: Model to map
1129 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1130 :param strategy: Strategy for sampling. Must be in ["naive",
1131 "greedy_single", "greedy_iterative"]
1132 :type strategy: :class:`str`
1133 :param subsampling: If given, only an equally distributed subset
1134 of CA/C3' positions in each chain are used for
1135 superposition/scoring.
1136 :type subsampling: :class:`int`
1137 :param chem_mapping_result: Pro param. The result of
1138 :func:`~GetChemMapping` where you provided
1139 *model*. If set, *model* parameter is not
1140 used.
1141 :type chem_mapping_result: :class:`tuple`
1142 :returns: A :class:`MappingResult`
1143 """
1144
1145 strategies = ["naive", "greedy_single", "greedy_single_centroid",
1146 "greedy_iterative", "heuristic"]
1147
1148 if strategy not in strategies:
1149 raise RuntimeError(f"strategy must be {strategies}")
1150
1151 if chem_mapping_result is None:
1152 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1153 self.GetChemMapping(model)
1154 else:
1155 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1156 chem_mapping_result
1157 ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1159 chem_mapping,
1160 chem_group_alns)
1161
1162 # check for the simplest case
1163 one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
1164 if one_to_one is not None:
1165 alns = dict()
1166 for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
1167 for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1168 if ref_ch is not None and mdl_ch is not None:
1169 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1170 alns[(ref_ch, mdl_ch)] = aln
1171 return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1172 mdl_chains_without_chem_mapping, one_to_one, alns)
1173
1174 trg_group_pos, mdl_group_pos = _GetRefPos(self.targettarget, mdl,
1176 chem_group_alns,
1177 max_pos = subsampling)
1178
1179 if strategy == "heuristic":
1180 if _NMappingsWithin(self.chem_groupschem_groups, chem_mapping,
1181 heuristic_n_max_naive):
1182 strategy = "naive"
1183 else:
1184 strategy = "greedy_iterative"
1185
1186 mapping = None
1187
1188 if strategy.startswith("greedy"):
1189 # get transforms of any mdl chain onto any trg chain in same chem
1190 # group that fulfills gdtts threshold
1191 initial_transforms = list()
1192 initial_mappings = list()
1193 for trg_pos, trg_chains, mdl_pos, mdl_chains in zip(trg_group_pos,
1195 mdl_group_pos,
1196 chem_mapping):
1197 for t_pos, t in zip(trg_pos, trg_chains):
1198 for m_pos, m in zip(mdl_pos, mdl_chains):
1199 if len(t_pos) >= 3 and len(m_pos) >= 3:
1200 transform = _GetSuperposition(m_pos, t_pos,
1201 False).transformation
1202 initial_transforms.append(transform)
1203 initial_mappings.append((t,m))
1204
1205 if strategy == "greedy_single":
1206 mapping = _SingleRigidRMSD(initial_transforms,
1207 initial_mappings,
1209 chem_mapping,
1210 trg_group_pos,
1211 mdl_group_pos)
1212
1213 elif strategy == "greedy_single_centroid":
1214 mapping = _SingleRigidCentroid(initial_transforms,
1215 initial_mappings,
1217 chem_mapping,
1218 trg_group_pos,
1219 mdl_group_pos)
1220
1221 elif strategy == "greedy_iterative":
1222 mapping = _IterativeRigidRMSD(initial_transforms,
1223 initial_mappings,
1224 self.chem_groupschem_groups, chem_mapping,
1225 trg_group_pos, mdl_group_pos)
1226 elif strategy == "naive":
1227 mapping = _NaiveRMSD(self.chem_groupschem_groups, chem_mapping,
1228 trg_group_pos, mdl_group_pos,
1229 self.n_max_naive)
1230
1231 # translate mapping format and return
1232 final_mapping = list()
1233 for ref_chains in self.chem_groupschem_groups:
1234 mapped_mdl_chains = list()
1235 for ref_ch in ref_chains:
1236 if ref_ch in mapping:
1237 mapped_mdl_chains.append(mapping[ref_ch])
1238 else:
1239 mapped_mdl_chains.append(None)
1240 final_mapping.append(mapped_mdl_chains)
1241
1242 alns = dict()
1243 for ref_group, mdl_group in zip(self.chem_groupschem_groups, final_mapping):
1244 for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1245 if ref_ch is not None and mdl_ch is not None:
1246 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1247 alns[(ref_ch, mdl_ch)] = aln
1248
1249 return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1250 mdl_chains_without_chem_mapping, final_mapping, alns)
1251
1252
1253 def GetAFMMapping(self, model, chem_mapping_result = None):
1254 """Identify chain mapping as in AlphaFold-Multimer (AFM)
1255
1256 Described in section 7.3 of
1257
1258 Richard Evans, Michael O’Neill, Alexander Pritzel, Natasha Antropova,
1259 Andrew Senior, Tim Green, Augustin Žídek, Russ Bates, Sam Blackwell,
1260 Jason Yim, et al. Protein complex prediction with AlphaFold-Multimer.
1261 bioRxiv preprint bioRxiv:10.1101/2021.10.04.463034, 2021.
1262
1263 To summarize (largely follows the description in the AF-Multimer paper):
1264 One anchor chain in the ground truth (reference) is selected. If the
1265 reference has stoichiometry A3B2, an arbitary chain in B is selected
1266 as it has smaller ambiguity. In a tie, for example A2B2, the longest
1267 chain in A, B is selected.
1268
1269 Given a model chain with same sequence as the reference anchor chain,
1270 a CA-RMSD (C3' for nucleotides) based superposition is performed. Chains
1271 are then greedily assigned by minimum distance of their geometric
1272 centers. This procedure is repeated for every model chain with same
1273 sequence and the assignment with minimal RMSD of the geometric centers
1274 is returned.
1275
1276 A modification of this algorithm allows to deal with different
1277 stoichiometries in reference and model. In the anchor selection, this
1278 function ensures that there is at least one model chain that can be
1279 mapped to this anchor. If the logic above does not identify a mappable
1280 chain pair for the smallest group, it continues with the second smallest
1281 group etc.
1282
1283 :param model: Model to map
1284 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1285 :param chem_mapping_result: Pro param. The result of
1286 :func:`~GetChemMapping` where you provided
1287 *model*. If set, *model* parameter is not
1288 used.
1289 :type chem_mapping_result: :class:`tuple`
1290 :returns: A :class:`MappingResult`
1291 """
1292 if chem_mapping_result is None:
1293 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1294 self.GetChemMapping(model)
1295 else:
1296 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1297 chem_mapping_result
1298 ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1300 chem_mapping,
1301 chem_group_alns)
1302
1303 # check for the simplest case
1304 one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
1305 if one_to_one is not None:
1306 alns = dict()
1307 for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
1308 for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1309 if ref_ch is not None and mdl_ch is not None:
1310 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1311 alns[(ref_ch, mdl_ch)] = aln
1312 return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1313 mdl_chains_without_chem_mapping, one_to_one, alns)
1314
1315 # Select anchor chain in reference
1316 ref_group_idx = None
1317 ref_ch = None
1318
1319 chem_group_sizes = list(set([len(g) for g in self.chem_groupschem_groups]))
1320 chem_group_sizes.sort()
1321
1322 for min_size in chem_group_sizes:
1323 min_chem_groups = list()
1324 for chem_group_idx, chem_group in enumerate(self.chem_groupschem_groups):
1325 if len(chem_group) == min_size:
1326 min_chem_groups.append(chem_group_idx)
1327 if len(min_chem_groups) == 1:
1328 if len(chem_mapping[min_chem_groups[0]]) > 0:
1329 # Unique smallest chem group that has at least one
1330 # model chain for mapping
1331 # => select arbitrary chain
1332 ref_group_idx = min_chem_groups[0]
1333 ref_ch = self.chem_groupschem_groups[min_chem_groups[0]][0]
1334 else:
1335 # No unique smallest chem group
1336 # => select the largest chain from any of these chem groups
1337 # that has at least one model chain for mapping
1338 n_max = 0
1339 for chem_group_idx in min_chem_groups:
1340 if len(chem_mapping[chem_group_idx]) > 0:
1341 for ch in self.chem_groupschem_groups[chem_group_idx]:
1342 n = self.targettarget.FindChain(ch).GetResidueCount()
1343 if n > n_max:
1344 n_max = n
1345 ref_group_idx = chem_group_idx
1346 ref_ch = ch
1347 if ref_group_idx is not None and ref_ch is not None:
1348 break
1349
1350 # One transformation for each model chain that can be mapped to this one
1351 # anchor in the reference
1352 ref_bb_pos = _GetBBPos(self.targettarget)
1353 mdl_bb_pos = _GetBBPos(mdl)
1354 transformations = list()
1355 for mdl_ch in chem_mapping[ref_group_idx]:
1356 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1357 # assertion can be removed after some testing
1358 l1 = len(ref_bb_pos[ref_ch])
1359 l2 = len(aln.GetSequence(0).GetGaplessString())
1360 assert(l1 == l2)
1361 l1 = len(mdl_bb_pos[mdl_ch])
1362 l2 = len(aln.GetSequence(1).GetGaplessString())
1363 assert(l1 == l2)
1364 # extract aligned positions
1365 ref_idx = 0
1366 mdl_idx = 0
1367 ref_pos = geom.Vec3List()
1368 mdl_pos = geom.Vec3List()
1369 for col in aln:
1370 if col[0] != '-' and col[1] != '-':
1371 ref_pos.append(ref_bb_pos[ref_ch][ref_idx])
1372 mdl_pos.append(mdl_bb_pos[mdl_ch][mdl_idx])
1373 if col[0] != '-':
1374 ref_idx += 1
1375 if col[1] != '-':
1376 mdl_idx += 1
1377 t = _GetSuperposition(mdl_pos, ref_pos, False).transformation
1378 transformations.append(t)
1379
1380 ref_centers = {k: v.center for k,v in ref_bb_pos.items()}
1381 mdl_chains = list()
1382 mdl_centers = list()
1383 for k,v in mdl_bb_pos.items():
1384 mdl_chains.append(k)
1385 mdl_centers.append(v.center)
1386 mdl_centers = geom.Vec3List(mdl_centers)
1387
1388 best_rmsd = float("inf")
1389 best_mapping = None # k: ref_ch, v: mdl_ch
1390
1391 for t in transformations:
1392 tmp = geom.Vec3List(mdl_centers)
1393 tmp.ApplyTransform(t)
1394 # somehow the Vec3List did not really work in a dict comprehension
1395 t_mdl_centers = dict()
1396 for i in range(len(tmp)):
1397 t_mdl_centers[mdl_chains[i]] = tmp[i]
1398
1399 # one entry for each possible assignment
1400 # (<squared_distance>, <ref_ch>, <mdl_ch>)
1401 tmp = list()
1402 for a, b in zip(self.chem_groupschem_groups, chem_mapping):
1403 for ref_ch in a:
1404 for mdl_ch in b:
1405 d = geom.Length2(ref_centers[ref_ch]-t_mdl_centers[mdl_ch])
1406 tmp.append((d, ref_ch, mdl_ch))
1407 tmp.sort()
1408
1409 mapping = dict()
1410 mapped_mdl_chains = set()
1411 for item in tmp:
1412 if item[1] not in mapping and item[2] not in mapped_mdl_chains:
1413 mapping[item[1]] = item[2]
1414 mapped_mdl_chains.add(item[2])
1415
1416 if len(mapping) == 0:
1417 raise RuntimeError("Empty mapping in GetAFMMapping")
1418 elif len(mapping) == 1:
1419 # by definition zero
1420 rmsd = 0.0
1421 elif len(mapping) == 2:
1422 # no superposition possible with two positions
1423 # derive rmsd from the distance difference between
1424 # the centroid pairs in reference and model
1425 ref_p = [ref_centers[k] for k in mapping.keys()]
1426 mdl_p = [t_mdl_centers[v] for v in mapping.values()]
1427 ref_d = geom.Distance(ref_p[0], ref_p[1])
1428 mdl_d = geom.Distance(mdl_p[0], mdl_p[1])
1429 # compute RMSD when placing pair of coordinates with smaller
1430 # distance in the middle of pair of cooridnates with larger
1431 # distance
1432 dd = abs(ref_d-mdl_d) # distance difference
1433 # rmsd = sqrt(1/2 * ((dd/2)**2 + (dd/2)**2))
1434 # => rmsd = dd/2
1435 rmsd = dd/2
1436 else:
1437 # go for classic Kabsch superposition
1438 mapped_ref_centers = geom.Vec3List()
1439 mapped_mdl_centers = geom.Vec3List()
1440 for ref_ch, mdl_ch in mapping.items():
1441 mapped_ref_centers.append(ref_centers[ref_ch])
1442 mapped_mdl_centers.append(t_mdl_centers[mdl_ch])
1443 rmsd = _GetSuperposition(mapped_ref_centers,
1444 mapped_mdl_centers, False).rmsd
1445 if rmsd < best_rmsd:
1446 best_rmsd = rmsd
1447 best_mapping = mapping
1448
1449 # translate mapping format and return
1450 final_mapping = list()
1451 for ref_chains in self.chem_groupschem_groups:
1452 mapped_mdl_chains = list()
1453 for ref_ch in ref_chains:
1454 if ref_ch in best_mapping:
1455 mapped_mdl_chains.append(best_mapping[ref_ch])
1456 else:
1457 mapped_mdl_chains.append(None)
1458 final_mapping.append(mapped_mdl_chains)
1459
1460 alns = dict()
1461 for ref_group, mdl_group in zip(self.chem_groupschem_groups, final_mapping):
1462 for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1463 if ref_ch is not None and mdl_ch is not None:
1464 aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1465 alns[(ref_ch, mdl_ch)] = aln
1466
1467 return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1468 mdl_chains_without_chem_mapping, final_mapping, alns)
1469
1470
1471 def GetMapping(self, model, n_max_naive = 40320):
1472 """ Convenience function to get mapping with currently preferred method
1473
1474 If number of possible chain mappings is <= *n_max_naive*, a naive
1475 QS-score mapping is performed and optimal QS-score is guaranteed.
1476 For anything else, a QS-score mapping with the greedy_full strategy is
1477 performed (greedy_prune_contact_map = True). The default for
1478 *n_max_naive* of 40320 corresponds to an octamer (8!=40320). A
1479 structure with stoichiometry A6B2 would be 6!*2!=1440 etc.
1480
1481 If :attr:`~target` has nucleotide sequences, the QS-score target
1482 function is replaced with a backbone only lDDT score that has
1483 an inclusion radius of 30A.
1484 """
1486 return self.GetlDDTMapping(model, strategy = "heuristic",
1487 inclusion_radius = 30.0,
1488 heuristic_n_max_naive = n_max_naive)
1489 else:
1490 return self.GetQSScoreMapping(model, strategy="heuristic",
1491 greedy_prune_contact_map=True,
1492 heuristic_n_max_naive = n_max_naive)
1493
1494 def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0,
1495 thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False,
1496 only_interchain=False, chem_mapping_result = None,
1497 global_mapping = None):
1498 """ Identify *topn* representations of *substructure* in *model*
1499
1500 *substructure* defines a subset of :attr:`~target` for which one
1501 wants the *topn* representations in *model*. Representations are scored
1502 and sorted by lDDT.
1503
1504 :param substructure: A :class:`ost.mol.EntityView` which is a subset of
1505 :attr:`~target`. Should be selected with the
1506 OpenStructure query language. Example: if you're
1507 interested in residues with number 42,43 and 85 in
1508 chain A:
1509 ``substructure=mapper.target.Select("cname=A and rnum=42,43,85")``
1510 A :class:`RuntimeError` is raised if *substructure*
1511 does not refer to the same underlying
1512 :class:`ost.mol.EntityHandle` as :attr:`~target`.
1513 :type substructure: :class:`ost.mol.EntityView`
1514 :param model: Structure in which one wants to find representations for
1515 *substructure*
1516 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1517 :param topn: Max number of representations that are returned
1518 :type topn: :class:`int`
1519 :param inclusion_radius: Inclusion radius for lDDT
1520 :type inclusion_radius: :class:`float`
1521 :param thresholds: Thresholds for lDDT
1522 :type thresholds: :class:`list` of :class:`float`
1523 :param bb_only: Only consider backbone atoms in lDDT computation
1524 :type bb_only: :class:`bool`
1525 :param only_interchain: Only score interchain contacts in lDDT. Useful
1526 if you want to identify interface patches.
1527 :type only_interchain: :class:`bool`
1528 :param chem_mapping_result: Pro param. The result of
1529 :func:`~GetChemMapping` where you provided
1530 *model*. If set, *model* parameter is not
1531 used.
1532 :type chem_mapping_result: :class:`tuple`
1533 :param global_mapping: Pro param. Specify a global mapping result. This
1534 fully defines the desired representation in the
1535 model but extracts it and enriches it with all
1536 the nice attributes of :class:`ReprResult`.
1537 The target attribute in *global_mapping* must be
1538 of the same entity as self.target and the model
1539 attribute of *global_mapping* must be of the same
1540 entity as *model*.
1541 :type global_mapping: :class:`MappingResult`
1542 :returns: :class:`list` of :class:`ReprResult`
1543 """
1544
1545 if topn < 1:
1546 raise RuntimeError("topn must be >= 1")
1547
1548 if global_mapping is not None:
1549 # ensure that this mapping is derived from the same structures
1550 if global_mapping.target.handle.GetHashCode() != \
1551 self.targettarget.handle.GetHashCode():
1552 raise RuntimeError("global_mapping.target must be the same "
1553 "entity as self.target")
1554 if global_mapping.model.handle.GetHashCode() != \
1555 model.handle.GetHashCode():
1556 raise RuntimeError("global_mapping.model must be the same "
1557 "entity as model param")
1558
1559 # check whether substructure really is a subset of self.target
1560 for r in substructure.residues:
1561 ch_name = r.GetChain().GetName()
1562 rnum = r.GetNumber()
1563 target_r = self.targettarget.FindResidue(ch_name, rnum)
1564 if not target_r.IsValid():
1565 raise RuntimeError(f"substructure has residue "
1566 f"{r.GetQualifiedName()} which is not in "
1567 f"self.target")
1568 if target_r.handle.GetHashCode() != r.handle.GetHashCode():
1569 raise RuntimeError(f"substructure has residue "
1570 f"{r.GetQualifiedName()} which has an "
1571 f"equivalent in self.target but it does "
1572 f"not refer to the same underlying "
1573 f"EntityHandle")
1574 for a in r.atoms:
1575 target_a = target_r.FindAtom(a.GetName())
1576 if not target_a.IsValid():
1577 raise RuntimeError(f"substructure has atom "
1578 f"{a.GetQualifiedName()} which is not "
1579 f"in self.target")
1580 if a.handle.GetHashCode() != target_a.handle.GetHashCode():
1581 raise RuntimeError(f"substructure has atom "
1582 f"{a.GetQualifiedName()} which has an "
1583 f"equivalent in self.target but it does "
1584 f"not refer to the same underlying "
1585 f"EntityHandle")
1586
1587 # check whether it contains either CA or C3'
1588 ca = r.FindAtom("CA")
1589 c3 = r.FindAtom("C3'") # FindAtom with prime in string is tested
1590 # and works
1591 if not ca.IsValid() and not c3.IsValid():
1592 raise RuntimeError("All residues in substructure must contain "
1593 "a backbone atom named CA or C3\'")
1594
1595 # perform mapping and alignments on full structures
1596 if chem_mapping_result is None:
1597 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1598 self.GetChemMapping(model)
1599 else:
1600 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1601 chem_mapping_result
1602 ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1604 chem_mapping,
1605 chem_group_alns)
1606
1607 # Get residue indices relative to full target chain
1608 substructure_res_indices = dict()
1609 for ch in substructure.chains:
1610 full_ch = self.targettarget.FindChain(ch.GetName())
1611 idx = [full_ch.GetResidueIndex(r.GetNumber()) for r in ch.residues]
1612 substructure_res_indices[ch.GetName()] = idx
1613
1614 # strip down variables to make them specific to substructure
1615 # keep only chem_groups which are present in substructure
1616 substructure_chem_groups = list()
1617 substructure_chem_mapping = list()
1618
1619 chnames = set([ch.GetName() for ch in substructure.chains])
1620 for chem_group, mapping in zip(self.chem_groupschem_groups, chem_mapping):
1621 substructure_chem_group = [ch for ch in chem_group if ch in chnames]
1622 if len(substructure_chem_group) > 0:
1623 substructure_chem_groups.append(substructure_chem_group)
1624 substructure_chem_mapping.append(mapping)
1625
1626 # early stopping if no mdl chain can be mapped to substructure
1627 n_mapped_mdl_chains = sum([len(m) for m in substructure_chem_mapping])
1628 if n_mapped_mdl_chains == 0:
1629 return list()
1630
1631 # strip the reference sequence in alignments to only contain
1632 # sequence from substructure
1633 substructure_ref_mdl_alns = dict()
1634 mdl_views = dict()
1635 for ch in mdl.chains:
1636 mdl_views[ch.GetName()] = _CSel(mdl, [ch.GetName()])
1637 for chem_group, mapping in zip(substructure_chem_groups,
1638 substructure_chem_mapping):
1639 for ref_ch in chem_group:
1640 for mdl_ch in mapping:
1641 full_aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1642 ref_seq = full_aln.GetSequence(0)
1643 # the ref sequence is tricky... we start with a gap only
1644 # sequence and only add olcs as defined by the residue
1645 # indices that we extracted before...
1646 tmp = ['-'] * len(full_aln)
1647 for idx in substructure_res_indices[ref_ch]:
1648 idx_in_seq = ref_seq.GetPos(idx)
1649 tmp[idx_in_seq] = ref_seq[idx_in_seq]
1650 ref_seq = seq.CreateSequence(ref_ch, ''.join(tmp))
1651 ref_seq.AttachView(_CSel(substructure, [ref_ch]))
1652 mdl_seq = full_aln.GetSequence(1)
1653 mdl_seq = seq.CreateSequence(mdl_seq.GetName(),
1654 mdl_seq.GetString())
1655 mdl_seq.AttachView(mdl_views[mdl_ch])
1656 aln = seq.CreateAlignment()
1657 aln.AddSequence(ref_seq)
1658 aln.AddSequence(mdl_seq)
1659 substructure_ref_mdl_alns[(ref_ch, mdl_ch)] = aln
1660
1661 lddt_scorer = lddt.lDDTScorer(substructure,
1662 inclusion_radius = inclusion_radius,
1663 bb_only = bb_only)
1664 scored_mappings = list()
1665
1666 if global_mapping:
1667 # construct mapping of substructure from global mapping
1668 flat_mapping = global_mapping.GetFlatMapping()
1669 mapping = list()
1670 for chem_group, chem_mapping in zip(substructure_chem_groups,
1671 substructure_chem_mapping):
1672 chem_group_mapping = list()
1673 for ch in chem_group:
1674 if ch in flat_mapping:
1675 mdl_ch = flat_mapping[ch]
1676 if mdl_ch in chem_mapping:
1677 chem_group_mapping.append(mdl_ch)
1678 else:
1679 chem_group_mapping.append(None)
1680 else:
1681 chem_group_mapping.append(None)
1682 mapping.append(chem_group_mapping)
1683 mappings = [mapping]
1684 else:
1685 mappings = list(_ChainMappings(substructure_chem_groups,
1686 substructure_chem_mapping,
1687 self.n_max_naive))
1688
1689 # This step can be slow so give some hints in logs
1690 msg = "Computing initial ranking of %d chain mappings" % len(mappings)
1691 (ost.LogWarning if len(mappings) > 10000 else ost.LogInfo)(msg)
1692
1693 for mapping in mappings:
1694 # chain_mapping and alns as input for lDDT computation
1695 lddt_chain_mapping = dict()
1696 lddt_alns = dict()
1697 n_res_aln = 0
1698 for ref_chem_group, mdl_chem_group in zip(substructure_chem_groups,
1699 mapping):
1700 for ref_ch, mdl_ch in zip(ref_chem_group, mdl_chem_group):
1701 # some mdl chains can be None
1702 if mdl_ch is not None:
1703 lddt_chain_mapping[mdl_ch] = ref_ch
1704 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1705 lddt_alns[mdl_ch] = aln
1706 tmp = [int(c[0] != '-' and c[1] != '-') for c in aln]
1707 n_res_aln += sum(tmp)
1708 # don't compute lDDT if no single residue in mdl and ref is aligned
1709 if n_res_aln == 0:
1710 continue
1711
1712 lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds,
1713 chain_mapping=lddt_chain_mapping,
1714 residue_mapping = lddt_alns,
1715 check_resnames = False,
1716 no_intrachain = only_interchain)
1717
1718 if lDDT is None:
1719 ost.LogVerbose("No valid contacts in the reference")
1720 lDDT = 0.0 # that means, that we have not a single valid contact
1721 # in lDDT. For the code below to work, we just set it
1722 # to a terrible score => 0.0
1723
1724 if len(scored_mappings) == 0:
1725 scored_mappings.append((lDDT, mapping))
1726 elif len(scored_mappings) < topn:
1727 scored_mappings.append((lDDT, mapping))
1728 scored_mappings.sort(reverse=True, key=lambda x: x[0])
1729 elif lDDT > scored_mappings[-1][0]:
1730 scored_mappings.append((lDDT, mapping))
1731 scored_mappings.sort(reverse=True, key=lambda x: x[0])
1732 scored_mappings = scored_mappings[:topn]
1733
1734 # finalize and return
1735 results = list()
1736 for scored_mapping in scored_mappings:
1737 ref_view = substructure.handle.CreateEmptyView()
1738 mdl_view = mdl.handle.CreateEmptyView()
1739 for ref_ch_group, mdl_ch_group in zip(substructure_chem_groups,
1740 scored_mapping[1]):
1741 for ref_ch, mdl_ch in zip(ref_ch_group, mdl_ch_group):
1742 if ref_ch is not None and mdl_ch is not None:
1743 aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1744 for col in aln:
1745 if col[0] != '-' and col[1] != '-':
1746 ref_view.AddResidue(col.GetResidue(0),
1747 mol.ViewAddFlag.INCLUDE_ALL)
1748 mdl_view.AddResidue(col.GetResidue(1),
1749 mol.ViewAddFlag.INCLUDE_ALL)
1750 results.append(ReprResult(scored_mapping[0], substructure,
1751 ref_view, mdl_view))
1752 return results
1753
1754 def GetNMappings(self, model):
1755 """ Returns number of possible mappings
1756
1757 :param model: Model with chains that are mapped onto
1758 :attr:`chem_groups`
1759 :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1760 """
1761 chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1762 self.GetChemMapping(model)
1763 return _NMappings(self.chem_groupschem_groups, chem_mapping)
1764
1765 def ProcessStructure(self, ent):
1766 """ Entity processing for chain mapping
1767
1768 * Selects view containing peptide and nucleotide residues which have
1769 required backbone atoms present - for peptide residues thats
1770 N, CA, C and CB (no CB for GLY), for nucleotide residues thats
1771 O5', C5', C4', C3' and O3'.
1772 * filters view by chain lengths, see *min_pep_length* and
1773 *min_nuc_length* in constructor
1774 * Extracts atom sequences for each chain in that view
1775 * If residue number alignments are used, strictly increasing residue
1776 numbers without insertion codes are ensured in each chain
1777
1778 :param ent: Entity to process
1779 :type ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1780 :returns: Tuple with 3 elements: 1) :class:`ost.mol.EntityView`
1781 containing peptide and nucleotide residues 2)
1782 :class:`ost.seq.SequenceList` containing ATOMSEQ sequences
1783 for each polypeptide chain in returned view
1784 3) same for polynucleotide chains
1785 """
1786 view = ent.CreateEmptyView()
1787 exp_pep_atoms = ["N", "CA", "C", "CB"]
1788 exp_nuc_atoms = ["\"O5'\"", "\"C5'\"", "\"C4'\"", "\"C3'\"", "\"O3'\""]
1789 pep_query = "peptide=true and aname=" + ','.join(exp_pep_atoms)
1790 nuc_query = "nucleotide=true and aname=" + ','.join(exp_nuc_atoms)
1791
1792 pep_sel = ent.Select(pep_query)
1793 for r in pep_sel.residues:
1794 if len(r.atoms) == 4:
1795 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1796 elif r.name == "GLY" and len(r.atoms) == 3:
1797 atom_names = [a.GetName() for a in r.atoms]
1798 if sorted(atom_names) == ["C", "CA", "N"]:
1799 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1800
1801 nuc_sel = ent.Select(nuc_query)
1802 for r in nuc_sel.residues:
1803 if len(r.atoms) == 5:
1804 view.AddResidue(r.handle, mol.INCLUDE_ALL)
1805
1806 polypep_seqs = seq.CreateSequenceList()
1807 polynuc_seqs = seq.CreateSequenceList()
1808
1809 if len(view.residues) == 0:
1810 # no residues survived => return
1811 return (view, polypep_seqs, polynuc_seqs)
1812
1813 for ch in view.chains:
1814 n_res = len(ch.residues)
1815 n_pep = sum([r.IsPeptideLinking() for r in ch.residues])
1816 n_nuc = sum([r.IsNucleotideLinking() for r in ch.residues])
1817
1818 # guarantee that we have either pep or nuc (no mix of the two)
1819 if n_pep > 0 and n_nuc > 0:
1820 raise RuntimeError(f"Must not mix peptide and nucleotide linking "
1821 f"residues in same chain ({ch.GetName()})")
1822
1823 if (n_pep + n_nuc) != n_res:
1824 raise RuntimeError("All residues must either be peptide_linking "
1825 "or nucleotide_linking")
1826
1827 # filter out short chains
1828 if n_pep > 0 and n_pep < self.min_pep_length:
1829 ost.LogWarning("Skipping short peptide chain: %s" % ch.name)
1830 continue
1831
1832 if n_nuc > 0 and n_nuc < self.min_nuc_length:
1833 ost.LogWarning("Skipping short nucleotide chain: %s" % ch.name)
1834 continue
1835
1836 # the superfast residue number based alignment adds some
1837 # restrictions on the numbers themselves:
1838 # 1) no insertion codes 2) strictly increasing
1839 if self.resnum_alignments:
1840 # check if no insertion codes are present in residue numbers
1841 ins_codes = [r.GetNumber().GetInsCode() for r in ch.residues]
1842 if len(set(ins_codes)) != 1 or ins_codes[0] != '\0':
1843 raise RuntimeError("Residue numbers in input structures must not "
1844 "contain insertion codes")
1845
1846 # check if residue numbers are strictly increasing
1847 nums = [r.GetNumber().GetNum() for r in ch.residues]
1848 if not all(i < j for i, j in zip(nums, nums[1:])):
1849 raise RuntimeError("Residue numbers in input structures must be "
1850 "strictly increasing for each chain")
1851
1852 s = ''.join([r.one_letter_code for r in ch.residues])
1853 s = seq.CreateSequence(ch.GetName(), s)
1854 if n_pep == n_res:
1855 polypep_seqs.AddSequence(s)
1856 elif n_nuc == n_res:
1857 polynuc_seqs.AddSequence(s)
1858 else:
1859 raise RuntimeError("This shouldnt happen")
1860
1861 if len(polypep_seqs) == 0 and len(polynuc_seqs) == 0:
1862 raise RuntimeError(f"No chain fulfilled minimum length requirement "
1863 f"to be considered in chain mapping "
1864 f"({self.min_pep_length} for peptide chains, "
1865 f"{self.min_nuc_length} for nucleotide chains) "
1866 f"- mapping failed")
1867
1868 # select for chains for which we actually extracted the sequence
1869 chain_names = [s.name for s in polypep_seqs]
1870 chain_names += [s.name for s in polynuc_seqs]
1871 view = _CSel(view, chain_names)
1872
1873 return (view, polypep_seqs, polynuc_seqs)
1874
1875 def NWAlign(self, s1, s2, stype):
1876 """ Access to internal sequence alignment functionality
1877
1878 Performs Needleman-Wunsch alignment with parameterization
1879 setup at ChainMapper construction
1880
1881 :param s1: First sequence to align
1882 :type s1: :class:`ost.seq.SequenceHandle`
1883 :param s2: Second sequence to align
1884 :type s2: :class:`ost.seq.SequenceHandle`
1885 :param stype: Type of sequences to align, must be in
1886 [:class:`ost.mol.ChemType.AMINOACIDS`,
1887 :class:`ost.mol.ChemType.NUCLEOTIDES`]
1888 :returns: Pairwise alignment of s1 and s2
1889 """
1890 if stype == mol.ChemType.AMINOACIDS:
1891 return seq.alg.SemiGlobalAlign(s1, s2, self.pep_subst_mat,
1892 gap_open=self.pep_gap_open,
1893 gap_ext=self.pep_gap_ext)[0]
1894 elif stype == mol.ChemType.NUCLEOTIDES:
1895 return seq.alg.SemiGlobalAlign(s1, s2, self.nuc_subst_mat,
1896 gap_open=self.nuc_gap_open,
1897 gap_ext=self.nuc_gap_ext)[0]
1898 else:
1899 raise RuntimeError("Invalid ChemType")
1900 return aln
1901
1902 def SEQRESAlign(self, seqres, s, s_ent):
1903 """ Access to internal sequence alignment functionality
1904
1905 Performs alignment on SEQRES. Residue numbers for *s* are
1906 extracted from *s_ent*. Residue numbers start from 1, i.e.
1907 1 aligns to the first element in *seqres*.
1908
1909 :param seqres: SEQRES sequence
1910 :type seqres: :class:`ost.seq.SequenceHandle`
1911 :param s: Sequence to align
1912 :type s: :class:`ost.seq.SequenceHandle`
1913 :param s_ent: Structure which must contain a chain of same name as *s*.
1914 This chain must have the exact same number of residues
1915 as characters in *s*.
1916 :type s_ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1917 """
1918
1919 ch = s_ent.FindChain(s.name)
1920
1921 if not ch.IsValid():
1922 raise RuntimeError("s_ent lacks required chain in SEQRESAlign")
1923
1924 if len(s) != ch.GetResidueCount():
1925 raise RuntimeError("Sequence/structure mismatch in SEQRESAlign")
1926
1927 rnums = [r.GetNumber().GetNum() for r in ch.residues]
1928 max_rnum = max(len(seqres), max(rnums))
1929
1930 # potentially add some trailing gaps
1931 seqres_s = seqres.GetGaplessString() + '-'*(max_rnum-len(seqres))
1932
1933 olcs = ['-'] * max_rnum
1934 for olc, num in zip(s, rnums):
1935 olcs[num-1] = olc
1936
1937 aln = seq.CreateAlignment()
1938 aln.AddSequence(seq.CreateSequence(seqres.GetName(), seqres_s))
1939 aln.AddSequence(seq.CreateSequence(s.GetName(), ''.join(olcs)))
1940 return aln
1941
1942 def ResNumAlign(self, s1, s2, s1_ent, s2_ent):
1943 """ Access to internal sequence alignment functionality
1944
1945 Performs residue number based alignment. Residue numbers are extracted
1946 from *s1_ent*/*s2_ent*.
1947
1948 :param s1: First sequence to align
1949 :type s1: :class:`ost.seq.SequenceHandle`
1950 :param s2: Second sequence to align
1951 :type s2: :class:`ost.seq.SequenceHandle`
1952 :param s1_ent: Structure as source for residue numbers in *s1*. Must
1953 contain a chain named after sequence name in *s1*. This
1954 chain must have the exact same number of residues as
1955 characters in s1.
1956 :type s1_ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1957 :param s2_ent: Same for *s2*.
1958 :type s2_ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1959
1960 :returns: Pairwise alignment of s1 and s2
1961 """
1962 ch1 = s1_ent.FindChain(s1.name)
1963 ch2 = s2_ent.FindChain(s2.name)
1964
1965 if not ch1.IsValid():
1966 raise RuntimeError("s1_ent lacks requested chain in ResNumAlign")
1967
1968 if not ch2.IsValid():
1969 raise RuntimeError("s2_ent lacks requested chain in ResNumAlign")
1970
1971 if len(s1) != ch1.GetResidueCount():
1972 raise RuntimeError("Sequence/structure mismatch in ResNumAlign")
1973
1974 if len(s2) != ch2.GetResidueCount():
1975 raise RuntimeError("Sequence/structure mismatch in ResNumAlign")
1976
1977 rnums1 = [r.GetNumber().GetNum() for r in ch1.residues]
1978 rnums2 = [r.GetNumber().GetNum() for r in ch2.residues]
1979
1980 min_num = min(rnums1[0], rnums2[0])
1981 max_num = max(rnums1[-1], rnums2[-1])
1982 aln_length = max_num - min_num + 1
1983
1984 aln_s1 = ['-'] * aln_length
1985 for olc, rnum in zip(s1, rnums1):
1986 aln_s1[rnum-min_num] = olc
1987
1988 aln_s2 = ['-'] * aln_length
1989 for olc, rnum in zip(s2, rnums2):
1990 aln_s2[rnum-min_num] = olc
1991
1992 aln = seq.CreateAlignment()
1993 aln.AddSequence(seq.CreateSequence(s1.GetName(), ''.join(aln_s1)))
1994 aln.AddSequence(seq.CreateSequence(s2.GetName(), ''.join(aln_s2)))
1995 return aln
1996
1997
1999 """ Sets properties :attr:`~chem_groups`,
2000 :attr:`~chem_group_alignments`, :attr:`~chem_group_ref_seqs`,
2001 :attr:`~chem_group_types`
2002 """
2003
2004 if self.seqres_set:
2007 else:
2010
2011 self._chem_group_ref_seqs = seq.CreateSequenceList()
2013 s = seq.CreateSequence(a.GetSequence(0).GetName(),
2014 a.GetSequence(0).GetGaplessString())
2015 self._chem_group_ref_seqs.AddSequence(s)
2016
2017 self._chem_groups = list()
2019 group = list()
2020 for s_idx in range(1, a.GetCount()):
2021 s = a.GetSequence(s_idx)
2022 group.append(s.GetName())
2023 self._chem_groups.append(group)
2024
2025
2027 """ Groups target sequences based on SEQRES
2028
2029 returns tuple that can be set as self._chem_group_alignments and
2030 self._chem_group_types
2031 """
2032 pep_groups = self._GroupSequencesSEQRES(self.polypep_seqs)
2033 nuc_groups = self._GroupSequencesSEQRES(self.polynuc_seqspolynuc_seqs)
2034 group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
2035 group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
2036 groups = pep_groups
2037 groups.extend(nuc_groups)
2038
2039 return (groups, group_types)
2040
2041
2042 def _GroupSequencesSEQRES(self, poly_seqs):
2043
2044 alns = dict()
2045
2046 for poly_seq in poly_seqs:
2047 sname = poly_seq.GetName()
2048 seqres_name = self.trg_seqres_mapping[sname]
2049 if seqres_name not in alns:
2050 aln = seq.CreateAlignment()
2051 seqres = None
2052 for s in self.seqres:
2053 if s.GetName() == seqres_name:
2054 seqres = s
2055 break
2056 if seqres is None:
2057 # each element in self.trg_seqres_mapping is guaranteed
2058 # to have a SEQRES counterpart as checked in ChainMapper
2059 # constructor. So we should never end up here.
2060 raise RuntimeError("You should never get here - contact "
2061 "OST developer")
2062 aln.AddSequence(seqres)
2063 alns[seqres_name] = aln
2064 seqres = alns[seqres_name].GetSequence(0)
2065 aln_length = seqres.GetLength()
2066 atomseq = ['-'] * aln_length
2067 trg_chain = self.targettarget.FindChain(sname)
2068 assert(trg_chain.IsValid())
2069 assert(trg_chain.GetResidueCount() == len(poly_seq))
2070 for olc, r in zip(poly_seq, trg_chain.residues):
2071 num = r.GetNumber().GetNum()
2072 if num < 1 or num > aln_length:
2073 raise RuntimeError(f"Observed residue with invalid number: "
2074 f"\"{r}\" which cannot be aligned to "
2075 f"seqres with id \"{seqres_name}\"")
2076 if olc != seqres[num-1]:
2077 raise RuntimeError(f"Sequence mismatch of residue \"{r}\" "
2078 f"with olc \"{olc}\" and residue number "
2079 f"\"{num}\". Character at that location "
2080 f"in seqres: \"{seqres[num-1]}\"."
2081 f"Seqres name: \"{seqres_name}\"")
2082 atomseq[num-1] = olc
2083 atomseq = ''.join(atomseq)
2084 alns[seqres_name].AddSequence(seq.CreateSequence(sname, atomseq))
2085
2086 return [aln for aln in alns.values()]
2087
2088
2090 """ Groups target sequences based on ATOMSEQ
2091
2092 returns tuple that can be set as self._chem_group_alignments and
2093 self._chem_group_types
2094 """
2095 pep_groups = self._GroupSequences(self.polypep_seqs, self.pep_seqid_thr,
2096 self.min_pep_length,
2097 mol.ChemType.AMINOACIDS)
2098 nuc_groups = self._GroupSequences(self.polynuc_seqspolynuc_seqs, self.nuc_seqid_thr,
2099 self.min_nuc_length,
2100 mol.ChemType.NUCLEOTIDES)
2101
2102 # pep_groups and nuc_groups give us alignments based on ATOMSEQ.
2103 # For example: If you have polymer chains A,B and C in the same
2104 # group and A is the longest one, you get an alignment that looks
2105 # like:
2106 # A: ASDFE
2107 # B: ASDF-
2108 # C: -SDFE
2109 #
2110 # however, the first sequence in chem group alignments must not be
2111 # bound to any ATOMSEQ and represent the reference sequence. In the
2112 # case of this function, this is simply a copy of sequence A:
2113 # REF: ASDFE
2114 # A: ASDFE
2115 # B: ASDF-
2116 # C: -SDFE
2117
2118 # do pep_groups
2119 tmp = list()
2120 for a in pep_groups:
2121 new_a = seq.CreateAlignment()
2122 new_a.AddSequence(a.GetSequence(0))
2123 for s_idx in range(a.GetCount()):
2124 new_a.AddSequence(a.GetSequence(s_idx))
2125 tmp.append(new_a)
2126 pep_groups = tmp
2127
2128 # do nuc groups
2129 tmp = list()
2130 for a in nuc_groups:
2131 new_a = seq.CreateAlignment()
2132 new_a.AddSequence(a.GetSequence(0))
2133 for s_idx in range(a.GetCount()):
2134 new_a.AddSequence(a.GetSequence(s_idx))
2135 tmp.append(new_a)
2136 nuc_groups = tmp
2137
2138 group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
2139 group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
2140 groups = pep_groups
2141 groups.extend(nuc_groups)
2142
2143 return (groups, group_types)
2144
2145 def _GroupSequences(self, seqs, seqid_thr, min_length, chem_type):
2146 """Get list of alignments representing groups of equivalent sequences
2147
2148 :param seqid_thr: Threshold used to decide when two chains are identical.
2149 :type seqid_thr: :class:`float`
2150 :param gap_thr: Additional threshold to avoid gappy alignments with high
2151 seqid. Number of aligned columns must be at least this
2152 number.
2153 :type gap_thr: :class:`int`
2154 :param aligner: Helper class to generate pairwise alignments
2155 :type aligner: :class:`_Aligner`
2156 :param chem_type: ChemType of seqs, must be in
2157 [:class:`ost.mol.ChemType.AMINOACIDS`,
2158 :class:`ost.mol.ChemType.NUCLEOTIDES`]
2159 :type chem_type: :class:`ost.mol.ChemType`
2160 :returns: A list of alignments, one alignment for each group
2161 with longest sequence (reference) as first sequence.
2162 :rtype: :class:`ost.seq.AlignmentList`
2163 """
2164 groups = list()
2165 for s_idx in range(len(seqs)):
2166 matching_group = None
2167 for g_idx in range(len(groups)):
2168 for g_s_idx in range(len(groups[g_idx])):
2169 if self.resnum_alignments:
2170 aln = self.ResNumAlign(seqs[s_idx],
2171 seqs[groups[g_idx][g_s_idx]],
2172 self.targettarget, self.targettarget)
2173 else:
2174 aln = self.NWAlign(seqs[s_idx],
2175 seqs[groups[g_idx][g_s_idx]],
2176 chem_type)
2177 sid, n_aligned = _GetAlnPropsOne(aln)
2178 if sid >= seqid_thr and n_aligned >= min_length:
2179 matching_group = g_idx
2180 break
2181 if matching_group is not None:
2182 break
2183
2184 if matching_group is None:
2185 groups.append([s_idx])
2186 else:
2187 groups[matching_group].append(s_idx)
2188
2189 # sort based on sequence length
2190 sorted_groups = list()
2191 for g in groups:
2192 if len(g) > 1:
2193 tmp = sorted([[len(seqs[i]), i] for i in g], reverse=True)
2194 sorted_groups.append([x[1] for x in tmp])
2195 else:
2196 sorted_groups.append(g)
2197
2198 # translate from indices back to sequences and directly generate alignments
2199 # of the groups with the longest (first) sequence as reference
2200 aln_list = seq.AlignmentList()
2201 for g in sorted_groups:
2202 if len(g) == 1:
2203 # aln with one single sequence
2204 aln_list.append(seq.CreateAlignment(seqs[g[0]]))
2205 else:
2206 # obtain pairwise aln of first sequence (reference) to all others
2207 alns = seq.AlignmentList()
2208 i = g[0]
2209 for j in g[1:]:
2210 if self.resnum_alignments:
2211 aln = self.ResNumAlign(seqs[i], seqs[j],
2212 self.targettarget, self.targettarget)
2213 else:
2214 aln = self.NWAlign(seqs[i], seqs[j], chem_type)
2215 alns.append(aln)
2216 # and merge
2217 aln_list.append(seq.alg.MergePairwiseAlignments(alns, seqs[i]))
2218
2219 return aln_list
2220
2221 def _MapSequence(self, ref_seqs, ref_types, s, s_type, s_ent,
2222 seq_id_thr=0.0, min_aln_length=0):
2223 """Tries top map *s* onto any of the sequences in *ref_seqs*
2224
2225 Computes alignments of *s* to each of the reference sequences of equal type
2226 and sorts them by seqid*fraction_covered (seqid: sequence identity of
2227 aligned columns in alignment, fraction_covered: Fraction of non-gap
2228 characters in reference sequence that are covered by non-gap characters in
2229 *s*). Best scoring mapping is returned. Optionally, *seq_id*/
2230 *min_aln_length* thresholds can be enabled to avoid non-sensical mappings.
2231 However, *min_aln_length* only has an effect if *seq_id_thr* > 0!!!
2232
2233 :param ref_seqs: Reference sequences
2234 :type ref_seqs: :class:`ost.seq.SequenceList`
2235 :param ref_types: Types of reference sequences, e.g.
2236 ost.mol.ChemType.AminoAcids
2237 :type ref_types: :class:`list` of :class:`ost.mol.ChemType`
2238 :param s: Sequence to map
2239 :type s: :class:`ost.seq.SequenceHandle`
2240 :param s_type: Type of *s*, only try mapping to sequences in *ref_seqs*
2241 with equal type as defined in *ref_types*
2242 :param s_ent: Entity which represents *s*. Only relevant in case of
2243 residue number alignments.
2244 :type s_ent: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
2245 :param seq_id_thr: Minimum sequence identity to be considered as match
2246 :type seq_id_thr: :class:`float`
2247 :param min_aln_length: Minimum number of aligned columns to be considered
2248 as match. Only has an effect if *seq_id_thr* > 0!
2249 :type min_aln_length: :class:`int`
2250 :returns: Tuple with two elements. 1) index of sequence in *ref_seqs* to
2251 which *s* can be mapped 2) Pairwise sequence alignment with
2252 sequence from *ref_seqs* as first sequence. Both elements are
2253 None if no mapping can be found or if thresholds are not
2254 fulfilled for any alignment.
2255 :raises: :class:`RuntimeError` if mapping is ambiguous, i.e. *s*
2256 successfully maps to more than one sequence in *ref_seqs*
2257 """
2258 scored_alns = list()
2259 for ref_idx, ref_seq in enumerate(ref_seqs):
2260 if ref_types[ref_idx] == s_type:
2261 if self.seqres_set:
2262 aln = self.SEQRESAlign(ref_seq, s, s_ent)
2263 elif self.resnum_alignments:
2264 aln = self.ResNumAlign(ref_seq, s, self.targettarget, s_ent)
2265 else:
2266 aln = self.NWAlign(ref_seq, s, s_type)
2267 seqid, n_tot, n_aligned = _GetAlnPropsTwo(aln)
2268 if seq_id_thr > 0:
2269 if seqid >= seq_id_thr and n_aligned >= min_aln_length:
2270 fraction_covered = float(n_aligned)/n_tot
2271 score = seqid * fraction_covered
2272 scored_alns.append((score, ref_idx, aln))
2273 else:
2274 fraction_covered = float(n_aligned)/n_tot
2275 score = seqid * fraction_covered
2276 scored_alns.append((score, ref_idx, aln))
2277
2278 if len(scored_alns) == 0:
2279 return (None, None) # no mapping possible...
2280
2281 scored_alns = sorted(scored_alns, key=lambda x: x[0], reverse=True)
2282 return (scored_alns[0][1], scored_alns[0][2])
2283
2285 """Returns basic properties of *aln* version two...
2286
2287 :param aln: Alignment to compute properties
2288 :type aln: :class:`seq.AlignmentHandle`
2289 :returns: Tuple with 3 elements. 1) sequence identity in range [0, 100]
2290 considering aligned columns 2) Number of non gap characters in
2291 first sequence 3) Number of aligned characters.
2292 """
2293 assert(aln.GetCount() == 2)
2294 n_tot = sum([1 for col in aln if col[0] != '-'])
2295 n_aligned = sum([1 for col in aln if (col[0] != '-' and col[1] != '-')])
2296 return (seq.alg.SequenceIdentity(aln), n_tot, n_aligned)
2297
2299
2300 """Returns basic properties of *aln* version one...
2301
2302 :param aln: Alignment to compute properties
2303 :type aln: :class:`seq.AlignmentHandle`
2304 :returns: Tuple with 2 elements. 1) sequence identify in range [0, 100]
2305 considering aligned columns 2) Number of aligned columns.
2306 """
2307 assert(aln.GetCount() == 2)
2308 seqid = seq.alg.SequenceIdentity(aln)
2309 n_aligned = sum([1 for col in aln if (col[0] != '-' and col[1] != '-')])
2310 return (seqid, n_aligned)
2311
2312def _GetRefMdlAlns(ref_chem_groups, ref_chem_group_msas, mdl_chem_groups,
2313 mdl_chem_group_alns, pairs=None):
2314 """ Get all possible ref/mdl chain alignments given chem group mapping
2315
2316 :param ref_chem_groups: :attr:`ChainMapper.chem_groups`
2317 :type ref_chem_groups: :class:`list` of :class:`list` of :class:`str`
2318 :param ref_chem_group_msas: :attr:`ChainMapper.chem_group_alignments`
2319 :type ref_chem_group_msas: :class:`ost.seq.AlignmentList`
2320 :param mdl_chem_groups: Groups of model chains that are mapped to
2321 *ref_chem_groups*. Return value of
2322 :func:`ChainMapper.GetChemMapping`.
2323 :type mdl_chem_groups: :class:`list` of :class:`list` of :class:`str`
2324 :param mdl_chem_group_alns: A pairwise sequence alignment for every chain
2325 in *mdl_chem_groups* that aligns these sequences
2326 to the respective reference sequence.
2327 Return values of
2328 :func:`ChainMapper.GetChemMapping`.
2329 :type mdl_chem_group_alns: :class:`list` of :class:`ost.seq.AlignmentList`
2330 :param pairs: Pro param - restrict return dict to specified pairs. A set of
2331 tuples in form (<trg_ch>, <mdl_ch>)
2332 :type pairs: :class:`set`
2333 :returns: A dictionary holding all possible ref/mdl chain alignments. Keys
2334 in that dictionary are tuples of the form (ref_ch, mdl_ch) and
2335 values are the respective pairwise alignments with first sequence
2336 being from ref, the second from mdl.
2337 """
2338 # alignment of each model chain to chem_group reference sequence
2339 mdl_alns = dict()
2340 for alns in mdl_chem_group_alns:
2341 for aln in alns:
2342 mdl_chain_name = aln.GetSequence(1).GetName()
2343 mdl_alns[mdl_chain_name] = aln
2344
2345 # generate all alignments between ref/mdl chain atomseqs that we will
2346 # ever observe
2347 ref_mdl_alns = dict()
2348 for ref_chains, mdl_chains, ref_aln in zip(ref_chem_groups, mdl_chem_groups,
2349 ref_chem_group_msas):
2350 for ref_ch in ref_chains:
2351 for mdl_ch in mdl_chains:
2352 if pairs is not None and (ref_ch, mdl_ch) not in pairs:
2353 continue
2354 # obtain alignments of mdl and ref chains towards chem
2355 # group ref sequence and merge them
2356 aln_list = seq.AlignmentList()
2357
2358 # do ref aln
2359
2361 s1 = ref_aln.GetSequence(0)
2362 # ATOMSEQ of ref_ch
2363 s2 = ref_aln.GetSequence(1+ref_chains.index(ref_ch))
2364 aln_list.append(seq.CreateAlignment(s1, s2))
2365
2366 # do mdl aln
2367
2368 aln_list.append(mdl_alns[mdl_ch])
2369
2370 # merge
2371
2372 ref_seq = seq.CreateSequence(s1.GetName(),
2373 s1.GetGaplessString())
2374 merged_aln = seq.alg.MergePairwiseAlignments(aln_list,
2375 ref_seq)
2376 # merged_aln:
2377 # seq1: ref seq of chem group
2378 # seq2: seq of ref chain
2379 # seq3: seq of mdl chain
2380 # => we need the alignment between seq2 and seq3
2381 s2 = merged_aln.GetSequence(1)
2382 s3 = merged_aln.GetSequence(2)
2383 # cut leading and trailing gap columns
2384 a = 0 # number of leading gap columns
2385 for idx in range(len(s2)):
2386 if s2[idx] != '-' or s3[idx] != '-':
2387 break
2388 a += 1
2389 b = 0 # number of trailing gap columns
2390 for idx in reversed(range(len(s2))):
2391 if s2[idx] != '-' or s3[idx] != '-':
2392 break
2393 b += 1
2394 s2 = seq.CreateSequence(s2.GetName(), s2[a: len(s2)-b])
2395 s3 = seq.CreateSequence(s3.GetName(), s3[a: len(s3)-b])
2396 ref_mdl_alns[(ref_ch, mdl_ch)] = seq.CreateAlignment(s2, s3)
2397
2398 return ref_mdl_alns
2399
2400def _CheckOneToOneMapping(ref_chains, mdl_chains):
2401 """ Checks whether we already have a perfect one to one mapping
2402
2403 That means each list in *ref_chains* has either exactly one element
2404 and the respective list in *mdl_chains* has also one element or
2405 it has several elements and the respective list in *mdl_chains* is
2406 empty (ref chain(s) has no mapped mdl chain). Returns None if no such
2407 mapping can be found.
2408
2409 :param ref_chains: corresponds to :attr:`ChainMapper.chem_groups`
2410 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
2411 :param mdl_chains: mdl chains mapped to chem groups in *ref_chains*, i.e.
2412 the return value of :func:`ChainMapper.GetChemMapping`
2413 :type mdl_chains: class:`list` of :class:`list` of :class:`str`
2414 :returns: A :class:`list` of :class:`list` if a one to one mapping is found,
2415 None otherwise
2416 """
2417 only_one_to_one = True
2418 one_to_one = list()
2419 for ref, mdl in zip(ref_chains, mdl_chains):
2420 if len(ref) == 1 and len(mdl) == 1:
2421 one_to_one.append(mdl)
2422 elif len(ref) >= 1 and len(mdl) == 0:
2423 one_to_one.append(len(ref)*[None])
2424 else:
2425 only_one_to_one = False
2426 break
2427 if only_one_to_one:
2428 return one_to_one
2429 else:
2430 return None
2431
2433 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2434 ref_mdl_alns, inclusion_radius = 15.0,
2435 thresholds = [0.5, 1.0, 2.0, 4.0],
2436 steep_opt_rate = None):
2437
2438 """ Greedy extension of already existing but incomplete chain mappings
2439 """
2440 super().__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2441 dist_thresh=inclusion_radius,
2442 dist_diff_thresholds=thresholds)
2443
2444 self.mdl_chem_groups = mdl_chem_groups
2445 self.steep_opt_rate = steep_opt_rate
2446
2447 self.neighbors = {k: set() for k in self.trg.chain_names}
2448 for interface in self.trg.interacting_chains:
2449 self.neighbors[interface[0]].add(interface[1])
2450 self.neighbors[interface[1]].add(interface[0])
2451
2452 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2453 self.ref_chem_groups = ref_chem_groups
2454 self.mdl_chem_groups = mdl_chem_groups
2457 for g_idx, (ref_g, mdl_g) in enumerate(zip(ref_chem_groups,
2458 mdl_chem_groups)):
2459 for ch in ref_g:
2460 self.ref_ch_group_mapper[ch] = g_idx
2461 for ch in mdl_g:
2462 self.mdl_ch_group_mapper[ch] = g_idx
2463
2464 # keep track of mdl chains that potentially give lDDT contributions,
2465 # i.e. they have locations within inclusion_radius + max(thresholds)
2466 self.mdl_neighbors = {k: set() for k in self.mdl.chain_names}
2467 for interface in self.mdl.potentially_contributing_chains:
2468 self.mdl_neighbors[interface[0]].add(interface[1])
2469 self.mdl_neighbors[interface[1]].add(interface[0])
2470
2471 def ExtendMapping(self, mapping, max_ext = None):
2472
2473 if len(mapping) == 0:
2474 raise RuntimError("Mapping must contain a starting point")
2475
2476 # Ref chains onto which we can map. The algorithm starts with a mapping
2477 # on ref_ch. From there we can start to expand to connected neighbors.
2478 # All neighbors that we can reach from the already mapped chains are
2479 # stored in this set which will be updated during runtime
2480 map_targets = set()
2481 for ref_ch in mapping.keys():
2482 map_targets.update(self.neighbors[ref_ch])
2483
2484 # remove the already mapped chains
2485 for ref_ch in mapping.keys():
2486 map_targets.discard(ref_ch)
2487
2488 if len(map_targets) == 0:
2489 return mapping # nothing to extend
2490
2491 # keep track of what model chains are not yet mapped for each chem group
2492 free_mdl_chains = list()
2493 for chem_group in self.mdl_chem_groups:
2494 tmp = [x for x in chem_group if x not in mapping.values()]
2495 free_mdl_chains.append(set(tmp))
2496
2497 # keep track of what ref chains got a mapping
2498 newly_mapped_ref_chains = list()
2499
2500 something_happened = True
2501 while something_happened:
2502 something_happened=False
2503
2504 if self.steep_opt_rate is not None:
2505 n_chains = len(newly_mapped_ref_chains)
2506 if n_chains > 0 and n_chains % self.steep_opt_rate == 0:
2507 mapping = self._SteepOpt(mapping, newly_mapped_ref_chains)
2508
2509 if max_ext is not None and len(newly_mapped_ref_chains) >= max_ext:
2510 break
2511
2512 max_n = 0
2513 max_mapping = None
2514 for ref_ch in map_targets:
2515 chem_group_idx = self.ref_ch_group_mapper[ref_ch]
2516 for mdl_ch in free_mdl_chains[chem_group_idx]:
2517 # single chain score
2518 n_single = self._NSCConserved(ref_ch, mdl_ch).sum()
2519 # scores towards neighbors that are already mapped
2520 n_inter = 0
2521 for neighbor in self.neighbors[ref_ch]:
2522 if neighbor in mapping and mapping[neighbor] in \
2523 self.mdl_neighbors[mdl_ch]:
2524 n_inter += self._NPairConserved((ref_ch, neighbor),
2525 (mdl_ch, mapping[neighbor])).sum()
2526 n = n_single + n_inter
2527
2528 if n_inter > 0 and n > max_n:
2529 # Only accept a new solution if its actually connected
2530 # i.e. n_inter > 0. Otherwise we could just map a big
2531 # fat mdl chain sitting somewhere in Nirvana
2532 max_n = n
2533 max_mapping = (ref_ch, mdl_ch)
2534
2535 if max_n > 0:
2536 something_happened = True
2537 # assign new found mapping
2538 mapping[max_mapping[0]] = max_mapping[1]
2539
2540 # add all neighboring chains to map targets as they are now
2541 # reachable
2542 for neighbor in self.neighbors[max_mapping[0]]:
2543 if neighbor not in mapping:
2544 map_targets.add(neighbor)
2545
2546 # remove the ref chain from map targets
2547 map_targets.remove(max_mapping[0])
2548
2549 # remove the mdl chain from free_mdl_chains - its taken...
2550 chem_group_idx = self.ref_ch_group_mapper[max_mapping[0]]
2551 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2552
2553 # keep track of what ref chains got a mapping
2554 newly_mapped_ref_chains.append(max_mapping[0])
2555
2556 return mapping
2557
2558 def _SteepOpt(self, mapping, chains_to_optimize=None):
2559
2560 # just optimize ALL ref chains if nothing specified
2561 if chains_to_optimize is None:
2562 chains_to_optimize = mapping.keys()
2563
2564 # make sure that we only have ref chains which are actually mapped
2565 ref_chains = [x for x in chains_to_optimize if mapping[x] is not None]
2566
2567 # group ref chains to be optimized into chem groups
2568 tmp = dict()
2569 for ch in ref_chains:
2570 chem_group_idx = self.ref_ch_group_mapper[ch]
2571 if chem_group_idx in tmp:
2572 tmp[chem_group_idx].append(ch)
2573 else:
2574 tmp[chem_group_idx] = [ch]
2575 chem_groups = list(tmp.values())
2576
2577 # try all possible mapping swaps. Swaps that improve the score are
2578 # immediately accepted and we start all over again
2579 current_lddt = self.FromFlatMapping(mapping)
2580 something_happened = True
2581 while something_happened:
2582 something_happened = False
2583 for chem_group in chem_groups:
2584 if something_happened:
2585 break
2586 for ch1, ch2 in itertools.combinations(chem_group, 2):
2587 swapped_mapping = dict(mapping)
2588 swapped_mapping[ch1] = mapping[ch2]
2589 swapped_mapping[ch2] = mapping[ch1]
2590 score = self.FromFlatMapping(swapped_mapping)
2591 if score > current_lddt:
2592 something_happened = True
2593 mapping = swapped_mapping
2594 current_lddt = score
2595 break
2596 return mapping
2597
2598
2599def _lDDTNaive(trg, mdl, inclusion_radius, thresholds, chem_groups,
2600 chem_mapping, ref_mdl_alns, n_max_naive):
2601 """ Naively iterates all possible chain mappings and returns the best
2602 """
2603 best_mapping = None
2604 best_lddt = -1.0
2605
2606 # Setup scoring
2607 lddt_scorer = bb_lddt.BBlDDTScorer(trg, chem_groups, mdl, ref_mdl_alns,
2608 dist_thresh=inclusion_radius,
2609 dist_diff_thresholds=thresholds)
2610 for mapping in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2611 lDDT = lddt_scorer.Score(mapping, check=False)
2612 if lDDT > best_lddt:
2613 best_mapping = mapping
2614 best_lddt = lDDT
2615
2616 return (best_mapping, best_lddt)
2617
2618
2619def _GetSeeds(ref_chem_groups, mdl_chem_groups, mapped_ref_chains = set(),
2620 mapped_mdl_chains = set()):
2621 seeds = list()
2622 for ref_chains, mdl_chains in zip(ref_chem_groups,
2623 mdl_chem_groups):
2624 for ref_ch in ref_chains:
2625 if ref_ch not in mapped_ref_chains:
2626 for mdl_ch in mdl_chains:
2627 if mdl_ch not in mapped_mdl_chains:
2628 seeds.append((ref_ch, mdl_ch))
2629 return seeds
2630
2631def _lDDTGreedyFull(the_greed):
2632 """ Uses each reference chain as starting point for expansion
2633 """
2634
2635 seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2636 best_overall_score = -1.0
2637 best_overall_mapping = dict()
2638
2639 for seed in seeds:
2640
2641 # do initial extension
2642 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2643
2644 # repeat the process until we have a full mapping
2645 something_happened = True
2646 while something_happened:
2647 something_happened = False
2648 remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2649 the_greed.mdl_chem_groups,
2650 mapped_ref_chains = set(mapping.keys()),
2651 mapped_mdl_chains = set(mapping.values()))
2652 if len(remnant_seeds) > 0:
2653 # still more mapping to be done
2654 best_score = -1.0
2655 best_mapping = None
2656 for remnant_seed in remnant_seeds:
2657 tmp_mapping = dict(mapping)
2658 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2659 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2660 score = the_greed.FromFlatMapping(tmp_mapping)
2661 if score > best_score:
2662 best_score = score
2663 best_mapping = tmp_mapping
2664 if best_mapping is not None:
2665 something_happened = True
2666 mapping = best_mapping
2667
2668 score = the_greed.FromFlatMapping(mapping)
2669 if score > best_overall_score:
2670 best_overall_score = score
2671 best_overall_mapping = mapping
2672
2673 mapping = best_overall_mapping
2674
2675 # translate mapping format and return
2676 final_mapping = list()
2677 for ref_chains in the_greed.ref_chem_groups:
2678 mapped_mdl_chains = list()
2679 for ref_ch in ref_chains:
2680 if ref_ch in mapping:
2681 mapped_mdl_chains.append(mapping[ref_ch])
2682 else:
2683 mapped_mdl_chains.append(None)
2684 final_mapping.append(mapped_mdl_chains)
2685
2686 return final_mapping
2687
2688
2689def _lDDTGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2690 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2691 respective chem groups and compute single chain lDDTs. The
2692 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2693 and the best scoring one is exhaustively extended.
2694 """
2695
2696 if seed_size is None or seed_size < 1:
2697 raise RuntimeError(f"seed_size must be an int >= 1 (got {seed_size})")
2698
2699 if blocks_per_chem_group is None or blocks_per_chem_group < 1:
2700 raise RuntimeError(f"blocks_per_chem_group must be an int >= 1 "
2701 f"(got {blocks_per_chem_group})")
2702
2703 max_ext = seed_size - 1 # -1 => start seed already has size 1
2704
2705 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2706 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2707
2708 mapping = dict()
2709 something_happened = True
2710 while something_happened:
2711 something_happened = False
2712 starting_blocks = list()
2713 for ref_chains, mdl_chains in zip(ref_chem_groups, mdl_chem_groups):
2714 if len(mdl_chains) == 0:
2715 continue # nothing to map
2716 ref_chains_copy = list(ref_chains)
2717 for i in range(blocks_per_chem_group):
2718 if len(ref_chains_copy) == 0:
2719 break
2720 seeds = list()
2721 for ref_ch in ref_chains_copy:
2722 seeds += [(ref_ch, mdl_ch) for mdl_ch in mdl_chains]
2723 # extend starting seeds to *seed_size* and retain best scoring
2724 # block for further extension
2725 best_score = -1.0
2726 best_mapping = None
2727 best_seed = None
2728 for s in seeds:
2729 seed = dict(mapping)
2730 seed.update({s[0]: s[1]})
2731 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2732 seed_lddt = the_greed.FromFlatMapping(seed)
2733 if seed_lddt > best_score:
2734 best_score = seed_lddt
2735 best_mapping = seed
2736 best_seed = s
2737 if best_mapping != None:
2738 starting_blocks.append(best_mapping)
2739 if best_seed[0] in ref_chains_copy:
2740 # remove that ref chain to enforce diversity
2741 ref_chains_copy.remove(best_seed[0])
2742
2743 # fully expand initial starting blocks
2744 best_lddt = 0.0
2745 best_mapping = None
2746 for seed in starting_blocks:
2747 seed = the_greed.ExtendMapping(seed)
2748 seed_lddt = the_greed.FromFlatMapping(seed)
2749 if seed_lddt > best_lddt:
2750 best_lddt = seed_lddt
2751 best_mapping = seed
2752
2753 if best_lddt == 0.0:
2754 break # no proper mapping found anymore
2755
2756 something_happened = True
2757 mapping.update(best_mapping)
2758 for ref_ch, mdl_ch in best_mapping.items():
2759 for group_idx in range(len(ref_chem_groups)):
2760 if ref_ch in ref_chem_groups[group_idx]:
2761 ref_chem_groups[group_idx].remove(ref_ch)
2762 if mdl_ch in mdl_chem_groups[group_idx]:
2763 mdl_chem_groups[group_idx].remove(mdl_ch)
2764
2765 # translate mapping format and return
2766 final_mapping = list()
2767 for ref_chains in the_greed.ref_chem_groups:
2768 mapped_mdl_chains = list()
2769 for ref_ch in ref_chains:
2770 if ref_ch in mapping:
2771 mapped_mdl_chains.append(mapping[ref_ch])
2772 else:
2773 mapped_mdl_chains.append(None)
2774 final_mapping.append(mapped_mdl_chains)
2775
2776 return final_mapping
2777
2778
2780 def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2781 ref_mdl_alns, contact_d = 12.0,
2782 steep_opt_rate = None, greedy_prune_contact_map=False):
2783 """ Greedy extension of already existing but incomplete chain mappings
2784 """
2785 super().__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2786 contact_d = contact_d)
2787 self.ref = ref
2788 self.mdl = mdl
2789 self.ref_mdl_alns = ref_mdl_alns
2790 self.steep_opt_rate = steep_opt_rate
2791
2792 if greedy_prune_contact_map:
2793 self.neighbors = {k: set() for k in self.qsent1qsent1.chain_names}
2794 for p in self.qsent1qsent1.interacting_chains:
2795 if np.count_nonzero(self.qsent1qsent1.PairDist(p[0], p[1])<=8) >= 3:
2796 self.neighbors[p[0]].add(p[1])
2797 self.neighbors[p[1]].add(p[0])
2798
2799 self.mdl_neighbors = {k: set() for k in self.qsent2qsent2.chain_names}
2800 for p in self.qsent2qsent2.interacting_chains:
2801 if np.count_nonzero(self.qsent2qsent2.PairDist(p[0], p[1])<=8) >= 3:
2802 self.mdl_neighbors[p[0]].add(p[1])
2803 self.mdl_neighbors[p[1]].add(p[0])
2804
2805
2806 else:
2807 self.neighbors = {k: set() for k in self.qsent1qsent1.chain_names}
2808 for p in self.qsent1qsent1.interacting_chains:
2809 self.neighbors[p[0]].add(p[1])
2810 self.neighbors[p[1]].add(p[0])
2811
2812 self.mdl_neighbors = {k: set() for k in self.qsent2qsent2.chain_names}
2813 for p in self.qsent2qsent2.interacting_chains:
2814 self.mdl_neighbors[p[0]].add(p[1])
2815 self.mdl_neighbors[p[1]].add(p[0])
2816
2817 assert(len(ref_chem_groups) == len(mdl_chem_groups))
2818 self.ref_chem_groups = ref_chem_groups
2819 self.mdl_chem_groups = mdl_chem_groups
2822 for g_idx, (ref_g, mdl_g) in enumerate(zip(ref_chem_groups,
2823 mdl_chem_groups)):
2824 for ch in ref_g:
2825 self.ref_ch_group_mapper[ch] = g_idx
2826 for ch in mdl_g:
2827 self.mdl_ch_group_mapper[ch] = g_idx
2828
2829 # cache for lDDT based single chain conserved contacts
2830 # used to identify starting points for further extension by QS score
2831 # key: tuple (ref_ch, mdl_ch) value: number of conserved contacts
2834 for ch in self.ref.chains:
2835 single_chain_ref = _CSel(self.ref, [ch.GetName()])
2836 self.single_chain_scorer[ch.GetName()] = \
2837 lddt.lDDTScorer(single_chain_ref, bb_only = True)
2838
2839 def SCCounts(self, ref_ch, mdl_ch):
2840 if not (ref_ch, mdl_ch) in self.single_chain_cache:
2841 alns = dict()
2842 alns[mdl_ch] = self.ref_mdl_alns[(ref_ch, mdl_ch)]
2843 mdl_sel = _CSel(self.mdl, [mdl_ch])
2844 s = self.single_chain_scorer[ref_ch]
2845 _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2846 residue_mapping=alns,
2847 return_dist_test=True,
2848 no_interchain=True,
2849 chain_mapping={mdl_ch: ref_ch},
2850 check_resnames=False)
2851 self.single_chain_cache[(ref_ch, mdl_ch)] = conserved
2852 return self.single_chain_cache[(ref_ch, mdl_ch)]
2853
2854 def ExtendMapping(self, mapping, max_ext = None):
2855
2856 if len(mapping) == 0:
2857 raise RuntimError("Mapping must contain a starting point")
2858
2859 # Ref chains onto which we can map. The algorithm starts with a mapping
2860 # on ref_ch. From there we can start to expand to connected neighbors.
2861 # All neighbors that we can reach from the already mapped chains are
2862 # stored in this set which will be updated during runtime
2863 map_targets = set()
2864 for ref_ch in mapping.keys():
2865 map_targets.update(self.neighbors[ref_ch])
2866
2867 # remove the already mapped chains
2868 for ref_ch in mapping.keys():
2869 map_targets.discard(ref_ch)
2870
2871 if len(map_targets) == 0:
2872 return mapping # nothing to extend
2873
2874 # keep track of what model chains are not yet mapped for each chem group
2875 free_mdl_chains = list()
2876 for chem_group in self.mdl_chem_groups:
2877 tmp = [x for x in chem_group if x not in mapping.values()]
2878 free_mdl_chains.append(set(tmp))
2879
2880 # keep track of what ref chains got a mapping
2881 newly_mapped_ref_chains = list()
2882
2883 something_happened = True
2884 while something_happened:
2885 something_happened=False
2886
2887 if self.steep_opt_rate is not None:
2888 n_chains = len(newly_mapped_ref_chains)
2889 if n_chains > 0 and n_chains % self.steep_opt_rate == 0:
2890 mapping = self._SteepOpt(mapping, newly_mapped_ref_chains)
2891
2892 if max_ext is not None and len(newly_mapped_ref_chains) >= max_ext:
2893 break
2894
2895 score_result = self.FromFlatMapping(mapping)
2896 old_score = score_result.QS_global
2897 nominator = score_result.weighted_scores
2898 denominator = score_result.weight_sum + score_result.weight_extra_all
2899
2900 max_diff = 0.0
2901 max_mapping = None
2902 for ref_ch in map_targets:
2903 chem_group_idx = self.ref_ch_group_mapper[ref_ch]
2904 for mdl_ch in free_mdl_chains[chem_group_idx]:
2905 # we're not computing full QS-score here, we directly hack
2906 # into the QS-score formula to compute a diff
2907 nominator_diff = 0.0
2908 denominator_diff = 0.0
2909 for neighbor in self.neighbors[ref_ch]:
2910 if neighbor in mapping and mapping[neighbor] in \
2911 self.mdl_neighbors[mdl_ch]:
2912 # it's a newly added interface if (ref_ch, mdl_ch)
2913 # are added to mapping
2914 int1 = (ref_ch, neighbor)
2915 int2 = (mdl_ch, mapping[neighbor])
2916 a, b, c, d = self._MappedInterfaceScores(int1, int2)
2917 nominator_diff += a # weighted_scores
2918 denominator_diff += b # weight_sum
2919 denominator_diff += d # weight_extra_all
2920 # the respective interface penalties are subtracted
2921 # from denominator
2922 denominator_diff -= self._InterfacePenalty1(int1)
2923 denominator_diff -= self._InterfacePenalty2(int2)
2924
2925 if nominator_diff > 0:
2926 # Only accept a new solution if its actually connected
2927 # i.e. nominator_diff > 0.
2928 new_nominator = nominator + nominator_diff
2929 new_denominator = denominator + denominator_diff
2930 new_score = 0.0
2931 if new_denominator != 0.0:
2932 new_score = new_nominator/new_denominator
2933 diff = new_score - old_score
2934 if diff > max_diff:
2935 max_diff = diff
2936 max_mapping = (ref_ch, mdl_ch)
2937
2938 if max_mapping is not None:
2939 something_happened = True
2940 # assign new found mapping
2941 mapping[max_mapping[0]] = max_mapping[1]
2942
2943 # add all neighboring chains to map targets as they are now
2944 # reachable
2945 for neighbor in self.neighbors[max_mapping[0]]:
2946 if neighbor not in mapping:
2947 map_targets.add(neighbor)
2948
2949 # remove the ref chain from map targets
2950 map_targets.remove(max_mapping[0])
2951
2952 # remove the mdl chain from free_mdl_chains - its taken...
2953 chem_group_idx = self.ref_ch_group_mapper[max_mapping[0]]
2954 free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2955
2956 # keep track of what ref chains got a mapping
2957 newly_mapped_ref_chains.append(max_mapping[0])
2958
2959 return mapping
2960
2961 def _SteepOpt(self, mapping, chains_to_optimize=None):
2962
2963 # just optimize ALL ref chains if nothing specified
2964 if chains_to_optimize is None:
2965 chains_to_optimize = mapping.keys()
2966
2967 # make sure that we only have ref chains which are actually mapped
2968 ref_chains = [x for x in chains_to_optimize if mapping[x] is not None]
2969
2970 # group ref chains to be optimized into chem groups
2971 tmp = dict()
2972 for ch in ref_chains:
2973 chem_group_idx = self.ref_ch_group_mapper[ch]
2974 if chem_group_idx in tmp:
2975 tmp[chem_group_idx].append(ch)
2976 else:
2977 tmp[chem_group_idx] = [ch]
2978 chem_groups = list(tmp.values())
2979
2980 # try all possible mapping swaps. Swaps that improve the score are
2981 # immediately accepted and we start all over again
2982 score_result = self.FromFlatMapping(mapping)
2983 current_score = score_result.QS_global
2984 something_happened = True
2985 while something_happened:
2986 something_happened = False
2987 for chem_group in chem_groups:
2988 if something_happened:
2989 break
2990 for ch1, ch2 in itertools.combinations(chem_group, 2):
2991 swapped_mapping = dict(mapping)
2992 swapped_mapping[ch1] = mapping[ch2]
2993 swapped_mapping[ch2] = mapping[ch1]
2994 score_result = self.FromFlatMapping(swapped_mapping)
2995 if score_result.QS_global > current_score:
2996 something_happened = True
2997 mapping = swapped_mapping
2998 current_score = score_result.QS_global
2999 break
3000 return mapping
3001
3002
3003def _QSScoreNaive(trg, mdl, chem_groups, chem_mapping, ref_mdl_alns, contact_d,
3004 n_max_naive):
3005 best_mapping = None
3006 best_score = -1.0
3007 # qs_scorer implements caching, score calculation is thus as fast as it gets
3008 # you'll just hit a wall when the number of possible mappings becomes large
3009 qs_scorer = qsscore.QSScorer(trg, chem_groups, mdl, ref_mdl_alns)
3010 for mapping in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
3011 score_result = qs_scorer.Score(mapping, check=False)
3012 if score_result.QS_global > best_score:
3013 best_mapping = mapping
3014 best_score = score_result.QS_global
3015 return (best_mapping, best_score)
3016
3017def _QSScoreGreedyFull(the_greed):
3018 """ Uses each reference chain as starting point for expansion
3019 """
3020
3021 seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
3022 best_overall_score = -1.0
3023 best_overall_mapping = dict()
3024
3025 for seed in seeds:
3026
3027 # do initial extension
3028 mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
3029
3030 # repeat the process until we have a full mapping
3031 something_happened = True
3032 while something_happened:
3033 something_happened = False
3034 remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
3035 the_greed.mdl_chem_groups,
3036 mapped_ref_chains = set(mapping.keys()),
3037 mapped_mdl_chains = set(mapping.values()))
3038 if len(remnant_seeds) > 0:
3039 # still more mapping to be done
3040 best_score = -1.0
3041 best_mapping = None
3042 for remnant_seed in remnant_seeds:
3043 tmp_mapping = dict(mapping)
3044 tmp_mapping[remnant_seed[0]] = remnant_seed[1]
3045 tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
3046 score_result = the_greed.FromFlatMapping(tmp_mapping)
3047 if score_result.QS_global > best_score:
3048 best_score = score_result.QS_global
3049 best_mapping = tmp_mapping
3050 if best_mapping is not None:
3051 something_happened = True
3052 mapping = best_mapping
3053
3054 score_result = the_greed.FromFlatMapping(mapping)
3055 if score_result.QS_global > best_overall_score:
3056 best_overall_score = score_result.QS_global
3057 best_overall_mapping = mapping
3058
3059 mapping = best_overall_mapping
3060
3061 # translate mapping format and return
3062 final_mapping = list()
3063 for ref_chains in the_greed.ref_chem_groups:
3064 mapped_mdl_chains = list()
3065 for ref_ch in ref_chains:
3066 if ref_ch in mapping:
3067 mapped_mdl_chains.append(mapping[ref_ch])
3068 else:
3069 mapped_mdl_chains.append(None)
3070 final_mapping.append(mapped_mdl_chains)
3071
3072 return final_mapping
3073
3074
3075def _QSScoreGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
3076 """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
3077 respective chem groups and compute single chain lDDTs. The
3078 *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
3079 and the best scoring one with respect to QS score is exhaustively extended.
3080 """
3081
3082 if seed_size is None or seed_size < 1:
3083 raise RuntimeError(f"seed_size must be an int >= 1 (got {seed_size})")
3084
3085 if blocks_per_chem_group is None or blocks_per_chem_group < 1:
3086 raise RuntimeError(f"blocks_per_chem_group must be an int >= 1 "
3087 f"(got {blocks_per_chem_group})")
3088
3089 max_ext = seed_size - 1 # -1 => start seed already has size 1
3090
3091 ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
3092 mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
3093
3094 mapping = dict()
3095
3096 something_happened = True
3097 while something_happened:
3098 something_happened = False
3099 starting_blocks = list()
3100 for ref_chains, mdl_chains in zip(ref_chem_groups, mdl_chem_groups):
3101 if len(mdl_chains) == 0:
3102 continue # nothing to map
3103 ref_chains_copy = list(ref_chains)
3104 for i in range(blocks_per_chem_group):
3105 if len(ref_chains_copy) == 0:
3106 break
3107 seeds = list()
3108 for ref_ch in ref_chains_copy:
3109 seeds += [(ref_ch, mdl_ch) for mdl_ch in mdl_chains]
3110 # extend starting seeds to *seed_size* and retain best scoring block
3111 # for further extension
3112 best_score = -1.0
3113 best_mapping = None
3114 best_seed = None
3115 for s in seeds:
3116 seed = dict(mapping)
3117 seed.update({s[0]: s[1]})
3118 seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
3119 score_result = the_greed.FromFlatMapping(seed)
3120 if score_result.QS_global > best_score:
3121 best_score = score_result.QS_global
3122 best_mapping = seed
3123 best_seed = s
3124 if best_mapping != None:
3125 starting_blocks.append(best_mapping)
3126 if best_seed[0] in ref_chains_copy:
3127 # remove selected ref chain to enforce diversity
3128 ref_chains_copy.remove(best_seed[0])
3129
3130 # fully expand initial starting blocks
3131 best_score = -1.0
3132 best_mapping = None
3133 for seed in starting_blocks:
3134 seed = the_greed.ExtendMapping(seed)
3135 score_result = the_greed.FromFlatMapping(seed)
3136 if score_result.QS_global > best_score:
3137 best_score = score_result.QS_global
3138 best_mapping = seed
3139
3140 if best_mapping is not None and len(best_mapping) > len(mapping):
3141 # this even accepts extensions that lead to no increase in QS-score
3142 # at least they make sense from an lDDT perspective
3143 something_happened = True
3144 mapping.update(best_mapping)
3145 for ref_ch, mdl_ch in best_mapping.items():
3146 for group_idx in range(len(ref_chem_groups)):
3147 if ref_ch in ref_chem_groups[group_idx]:
3148 ref_chem_groups[group_idx].remove(ref_ch)
3149 if mdl_ch in mdl_chem_groups[group_idx]:
3150 mdl_chem_groups[group_idx].remove(mdl_ch)
3151
3152 # translate mapping format and return
3153 final_mapping = list()
3154 for ref_chains in the_greed.ref_chem_groups:
3155 mapped_mdl_chains = list()
3156 for ref_ch in ref_chains:
3157 if ref_ch in mapping:
3158 mapped_mdl_chains.append(mapping[ref_ch])
3159 else:
3160 mapped_mdl_chains.append(None)
3161 final_mapping.append(mapped_mdl_chains)
3162
3163 return final_mapping
3164
3165def _SingleRigidRMSD(initial_transforms, initial_mappings, chem_groups,
3166 chem_mapping, trg_group_pos, mdl_group_pos):
3167 """
3168 Takes initial transforms and sequentially adds chain pairs with lowest RMSD.
3169 The mapping from the transform that leads to lowest overall RMSD is
3170 returned.
3171 """
3172 best_mapping = dict()
3173 best_ssd = float("inf") # we're actually going for summed squared distances
3174 # Since all positions have same lengths and we do a
3175 # full mapping, lowest SSD has a guarantee of also
3176 # being lowest RMSD
3177 for transform in initial_transforms:
3178 mapping = dict()
3179 mapped_mdl_chains = set()
3180 ssd = 0.0
3181 for trg_chains, mdl_chains, trg_pos, mdl_pos, in zip(chem_groups,
3182 chem_mapping,
3183 trg_group_pos,
3184 mdl_group_pos):
3185 if len(trg_pos) == 0 or len(mdl_pos) == 0:
3186 continue # cannot compute valid rmsd
3187 ssds = list()
3188 t_mdl_pos = list()
3189 for m_pos in mdl_pos:
3190 t_m_pos = geom.Vec3List(m_pos)
3191 t_m_pos.ApplyTransform(transform)
3192 t_mdl_pos.append(t_m_pos)
3193 for t_pos, t in zip(trg_pos, trg_chains):
3194 for t_m_pos, m in zip(t_mdl_pos, mdl_chains):
3195 ssd = t_pos.GetSummedSquaredDistances(t_m_pos)
3196 ssds.append((ssd, (t,m)))
3197 ssds.sort()
3198 for item in ssds:
3199 p = item[1]
3200 if p[0] not in mapping and p[1] not in mapped_mdl_chains:
3201 mapping[p[0]] = p[1]
3202 mapped_mdl_chains.add(p[1])
3203 ssd += item[0]
3204
3205 if ssd < best_ssd:
3206 best_ssd = ssd
3207 best_mapping = mapping
3208
3209 return best_mapping
3210
3211
3212def _SingleRigidCentroid(initial_transforms, initial_mappings, chem_groups,
3213 chem_mapping, trg_group_pos, mdl_group_pos):
3214 """
3215 Takes initial transforms and sequentially adds chain pairs with lowest
3216 centroid distance.
3217 The mapping from the transform that leads to lowest overall RMSD of
3218 the centroids is returned.
3219 """
3220 best_mapping = dict()
3221 best_rmsd = float("inf")
3222
3223 trg_group_centroids = list()
3224 mdl_group_centroids = list()
3225
3226 for trg_pos, mdl_pos, in zip(trg_group_pos, mdl_group_pos):
3227 trg_group_centroids.append(geom.Vec3List([p.center for p in trg_pos]))
3228 mdl_group_centroids.append(geom.Vec3List([p.center for p in mdl_pos]))
3229
3230 for transform in initial_transforms:
3231 mapping = dict()
3232 mapped_mdl_chains = set()
3233 mapped_trg_centroids = list()
3234 mapped_mdl_centroids = list()
3235 for trg_chains, mdl_chains, trg_centroids, mdl_centroids, in zip(chem_groups,
3236 chem_mapping,
3237 trg_group_centroids,
3238 mdl_group_centroids):
3239 if len(trg_centroids) == 0 or len(mdl_centroids) == 0:
3240 continue # cannot compute valid rmsd
3241 distances = list()
3242 t_mdl_centroids = geom.Vec3List(mdl_centroids)
3243 t_mdl_centroids.ApplyTransform(transform)
3244
3245 for trg_idx in range(len(trg_chains)):
3246 for mdl_idx in range(len(mdl_chains)):
3247 distances.append((geom.Length2(trg_centroids[trg_idx]-t_mdl_centroids[mdl_idx]),
3248 (trg_chains[trg_idx], mdl_chains[mdl_idx]),
3249 (trg_centroids[trg_idx], t_mdl_centroids[mdl_idx])))
3250
3251 distances.sort()
3252 for item in distances:
3253 p = item[1]
3254 if p[0] not in mapping and p[1] not in mapped_mdl_chains:
3255 mapping[p[0]] = p[1]
3256 mapped_mdl_chains.add(p[1])
3257 mapped_trg_centroids.append(item[2][0])
3258 mapped_mdl_centroids.append(item[2][1])
3259
3260
3261 if len(mapping) == 0:
3262 raise RuntimeError("Empty mapping in _SingleRigidCentroid")
3263 elif len(mapping) == 1:
3264 # by definition zero
3265 rmsd = 0.0
3266 elif len(mapping) == 2:
3267 # no superposition possible with two positions
3268 # derive rmsd from the distance difference between
3269 # the centroid pairs in reference and model
3270 ref_d = geom.Distance(mapped_trg_centroids[0],
3271 mapped_trg_centroids[1])
3272 mdl_d = geom.Distance(mapped_mdl_centroids[0],
3273 mapped_mdl_centroids[1])
3274 # compute RMSD when placing pair of coordinates with smaller
3275 # distance in the middle of pair of cooridnates with larger
3276 # distance
3277 dd = abs(ref_d-mdl_d) # distance difference
3278 # rmsd = sqrt(1/2 * ((dd/2)**2 + (dd/2)**2))
3279 # => rmsd = dd/2
3280 rmsd = dd/2
3281 else:
3282 # go for classic Kabsch superposition
3283 mapped_trg_centroids = geom.Vec3List(mapped_trg_centroids)
3284 mapped_mdl_centroids = geom.Vec3List(mapped_mdl_centroids)
3285 rmsd = _GetSuperposition(mapped_trg_centroids,
3286 mapped_mdl_centroids, False).rmsd
3287
3288 if rmsd < best_rmsd:
3289 best_rmsd = rmsd
3290 best_mapping = mapping
3291
3292 return best_mapping
3293
3294
3295def _IterativeRigidRMSD(initial_transforms, initial_mappings, chem_groups,
3296 chem_mapping, trg_group_pos, mdl_group_pos):
3297 """ Takes initial transforms and sequentially adds chain pairs with
3298 lowest RMSD. With each added chain pair, the transform gets updated.
3299 Thus the naming iterative. The mapping from the initial transform that
3300 leads to best overall RMSD score is returned.
3301 """
3302
3303 # to directly retrieve positions using chain names
3304 trg_pos_dict = dict()
3305 for trg_pos, trg_chains in zip(trg_group_pos, chem_groups):
3306 for t_pos, t in zip(trg_pos, trg_chains):
3307 trg_pos_dict[t] = t_pos
3308 mdl_pos_dict = dict()
3309 for mdl_pos, mdl_chains in zip(mdl_group_pos, chem_mapping):
3310 for m_pos, m in zip(mdl_pos, mdl_chains):
3311 mdl_pos_dict[m] = m_pos
3312
3313 best_mapping = dict()
3314 best_rmsd = float("inf")
3315 for initial_transform, initial_mapping in zip(initial_transforms,
3316 initial_mappings):
3317 mapping = {initial_mapping[0]: initial_mapping[1]}
3318 transform = geom.Mat4(initial_transform)
3319 mapped_trg_pos = geom.Vec3List(trg_pos_dict[initial_mapping[0]])
3320 mapped_mdl_pos = geom.Vec3List(mdl_pos_dict[initial_mapping[1]])
3321
3322 # the following variables contain the chains which are
3323 # available for mapping
3324 trg_chain_groups = [set(group) for group in chem_groups]
3325 mdl_chain_groups = [set(group) for group in chem_mapping]
3326
3327 # search and kick out inital mapping
3328 for group in trg_chain_groups:
3329 if initial_mapping[0] in group:
3330 group.remove(initial_mapping[0])
3331 break
3332 for group in mdl_chain_groups:
3333 if initial_mapping[1] in group:
3334 group.remove(initial_mapping[1])
3335 break
3336
3337 something_happened = True
3338 while something_happened:
3339 # search for best mapping given current transform
3340 something_happened=False
3341 best_sc_mapping = None
3342 best_sc_group_idx = None
3343 best_sc_rmsd = float("inf")
3344 group_idx = 0
3345 for trg_chains, mdl_chains in zip(trg_chain_groups, mdl_chain_groups):
3346 for t in trg_chains:
3347 t_pos = trg_pos_dict[t]
3348 for m in mdl_chains:
3349 m_pos = mdl_pos_dict[m]
3350 t_m_pos = geom.Vec3List(m_pos)
3351 t_m_pos.ApplyTransform(transform)
3352 rmsd = t_pos.GetRMSD(t_m_pos)
3353 if rmsd < best_sc_rmsd:
3354 best_sc_rmsd = rmsd
3355 best_sc_mapping = (t,m)
3356 best_sc_group_idx = group_idx
3357 group_idx += 1
3358
3359 if best_sc_mapping is not None:
3360 something_happened = True
3361 mapping[best_sc_mapping[0]] = best_sc_mapping[1]
3362 mapped_trg_pos.extend(trg_pos_dict[best_sc_mapping[0]])
3363 mapped_mdl_pos.extend(mdl_pos_dict[best_sc_mapping[1]])
3364 trg_chain_groups[best_sc_group_idx].remove(best_sc_mapping[0])
3365 mdl_chain_groups[best_sc_group_idx].remove(best_sc_mapping[1])
3366
3367 transform = _GetSuperposition(mapped_mdl_pos, mapped_trg_pos,
3368 False).transformation
3369
3370 # compute overall RMSD for current transform
3371 mapped_mdl_pos.ApplyTransform(transform)
3372 rmsd = mapped_trg_pos.GetRMSD(mapped_mdl_pos)
3373
3374 if rmsd < best_rmsd:
3375 best_rmsd = rmsd
3376 best_mapping = mapping
3377
3378 return best_mapping
3379
3380def _NaiveRMSD(chem_groups, chem_mapping, trg_group_pos, mdl_group_pos,
3381 n_max_naive):
3382
3383 # to directly retrieve positions using chain names
3384 trg_pos_dict = dict()
3385 for trg_pos, trg_chains in zip(trg_group_pos, chem_groups):
3386 for t_pos, t in zip(trg_pos, trg_chains):
3387 trg_pos_dict[t] = t_pos
3388 mdl_pos_dict = dict()
3389 for mdl_pos, mdl_chains in zip(mdl_group_pos, chem_mapping):
3390 for m_pos, m in zip(mdl_pos, mdl_chains):
3391 mdl_pos_dict[m] = m_pos
3392
3393 best_mapping = [[None]*len(x) for x in chem_groups]
3394 best_rmsd = float("inf")
3395
3396 for mapping in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
3397 trg_pos = geom.Vec3List()
3398 mdl_pos = geom.Vec3List()
3399 for trg_group, mdl_group in zip(chem_groups, mapping):
3400 for trg_ch, mdl_ch in zip(trg_group, mdl_group):
3401 if trg_ch is not None and mdl_ch is not None:
3402 trg_pos.extend(trg_pos_dict[trg_ch])
3403 mdl_pos.extend(mdl_pos_dict[mdl_ch])
3404 if len(mdl_pos) >= 3:
3405 superpose_res = mol.alg.SuperposeSVD(mdl_pos, trg_pos)
3406 if superpose_res.rmsd < best_rmsd:
3407 best_rmsd = superpose_res.rmsd
3408 best_mapping = mapping
3409
3410 # this is stupid...
3411 tmp = dict()
3412 for chem_group, mapping in zip(chem_groups, best_mapping):
3413 for trg_ch, mdl_ch in zip(chem_group, mapping):
3414 tmp[trg_ch] = mdl_ch
3415
3416 return tmp
3417
3418
3419def _GetRefPos(trg, mdl, trg_msas, mdl_alns, max_pos = None):
3420 """ Extracts reference positions which are present in trg and mdl
3421 """
3422
3423 # mdl_alns are pairwise, let's construct MSAs
3424 mdl_msas = list()
3425 for aln_list in mdl_alns:
3426 if len(aln_list) > 0:
3427 tmp = aln_list[0].GetSequence(0)
3428 ref_seq = seq.CreateSequence(tmp.GetName(), tmp.GetGaplessString())
3429 mdl_msas.append(seq.alg.MergePairwiseAlignments(aln_list, ref_seq))
3430 else:
3431 mdl_msas.append(seq.CreateAlignment())
3432
3433 # fetch positions of all backbone atoms
3434 bb_trg = _GetBBPos(trg)
3435 bb_mdl = _GetBBPos(mdl)
3436
3437 # there are the actual return values
3438 trg_pos = list()
3439 mdl_pos = list()
3440
3441 for trg_msa, mdl_msa in zip(trg_msas, mdl_msas):
3442
3443 if mdl_msa.GetCount() > 0:
3444 # make sure they have the same ref sequence (should be a given...)
3445 assert(trg_msa.GetSequence(0).GetGaplessString() == \
3446 mdl_msa.GetSequence(0).GetGaplessString())
3447 else:
3448 # if mdl_msa is empty, i.e. no model chain maps to the chem group
3449 # represented by trg_msa, we just continue. The result will be
3450 # empty position lists added to trg_pos and mdl_pos.
3451 pass
3452
3453 # check which columns in MSAs are fully covered (indices relative to
3454 # first sequence)
3455 trg_indices = _GetFullyCoveredIndices(trg_msa)
3456 mdl_indices = _GetFullyCoveredIndices(mdl_msa)
3457
3458 # get indices where both, mdl and trg, are fully covered
3459 indices = sorted(list(trg_indices.intersection(mdl_indices)))
3460
3461 # subsample if necessary
3462 if max_pos is not None and len(indices) > max_pos:
3463 step = int(len(indices)/max_pos)
3464 indices = [indices[i] for i in range(0, len(indices), step)]
3465
3466 # translate to column indices in the respective MSAs
3467 trg_indices = _RefIndicesToColumnIndices(trg_msa, indices)
3468 mdl_indices = _RefIndicesToColumnIndices(mdl_msa, indices)
3469
3470 # extract positions
3471 trg_pos.append(list())
3472 mdl_pos.append(list())
3473 # first seq in trg_msa is ref sequence and does not refer to any
3474 # ATOMSEQ
3475 for s_idx in range(1, trg_msa.GetCount()):
3476 trg_pos[-1].append(_ExtractMSAPos(trg_msa, s_idx, trg_indices,
3477 bb_trg))
3478 # first seq in mdl_msa is ref sequence in trg and does not belong to mdl
3479 for s_idx in range(1, mdl_msa.GetCount()):
3480 mdl_pos[-1].append(_ExtractMSAPos(mdl_msa, s_idx, mdl_indices,
3481 bb_mdl))
3482
3483 return (trg_pos, mdl_pos)
3484
3485def _GetBBPos(ent):
3486 """ Helper for _GetRefPos
3487
3488 Returns a dict with key: chain name and value: a geom.Vec3List of backbone
3489 atom positions. Assumes that either CA or C3' is always there.
3490 """
3491 bb = dict()
3492 for ch in ent.chains:
3493 l = geom.Vec3List()
3494 for r in ch.residues:
3495 a = r.FindAtom("CA")
3496 if not a.IsValid():
3497 a = r.FindAtom("C3'")
3498 l.append(a.GetPos())
3499 bb[ch.name] = l
3500 return bb
3501
3503 """ Helper for _GetRefPos
3504
3505 Returns a set containing the indices relative to first sequence in msa which
3506 are fully covered in all other sequences
3507
3508 --AA-A-A
3509 -BBBB-BB
3510 CCCC-C-C
3511
3512 => (0,1,3)
3513 """
3514 indices = set()
3515 ref_idx = 0
3516 for col in msa:
3517 if sum([1 for olc in col if olc != '-']) == col.GetRowCount():
3518 indices.add(ref_idx)
3519 if col[0] != '-':
3520 ref_idx += 1
3521 return indices
3522
3524 """ Helper for _GetRefPos
3525
3526 Returns a list of mapped indices. indices refer to non-gap one letter
3527 codes in the first msa sequence. The returnes mapped indices are translated
3528 to the according msa column indices
3529 """
3530 ref_idx = 0
3531 mapping = dict()
3532 for col_idx, col in enumerate(msa):
3533 if col[0] != '-':
3534 mapping[ref_idx] = col_idx
3535 ref_idx += 1
3536 return [mapping[i] for i in indices]
3537
3538def _ExtractMSAPos(msa, s_idx, indices, bb):
3539 """ Helper for _GetRefPos
3540
3541 Returns a geom.Vec3List containing positions refering to given msa sequence.
3542 => Chain with corresponding name is mapped onto sequence and the position of
3543 the first atom of each residue specified in indices is extracted.
3544 Indices refers to column indices in msa!
3545 """
3546 s = msa.GetSequence(s_idx)
3547 ch_bb = bb[s.GetName()]
3548
3549 # sanity check
3550 assert(len(s.GetGaplessString()) == len(ch_bb))
3551
3552 residue_idx = [s.GetResidueIndex(i) for i in indices]
3553 return geom.Vec3List([ch_bb[i] for i in residue_idx])
3554
3555
3556def _NChemGroupMappings(ref_chains, mdl_chains):
3557 """ Number of mappings within one chem group
3558
3559 :param ref_chains: Reference chains
3560 :type ref_chains: :class:`list` of :class:`str`
3561 :param mdl_chains: Model chains that are mapped onto *ref_chains*
3562 :type mdl_chains: :class:`list` of :class:`str`
3563 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3564 """
3565 n_ref = len(ref_chains)
3566 n_mdl = len(mdl_chains)
3567 if n_ref == n_mdl:
3568 return factorial(n_ref)
3569 elif n_ref > n_mdl:
3570 n_choose_k = binom(n_ref, n_mdl)
3571 return n_choose_k * factorial(n_mdl)
3572 else:
3573 n_choose_k = binom(n_mdl, n_ref)
3574 return n_choose_k * factorial(n_ref)
3575
3576def _NMappings(ref_chains, mdl_chains):
3577 """ Number of mappings for a full chem mapping
3578
3579 :param ref_chains: Chem groups of reference
3580 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3581 :param mdl_chains: Model chains that map onto those chem groups
3582 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3583 :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3584 """
3585 assert(len(ref_chains) == len(mdl_chains))
3586 n = 1
3587 for a,b in zip(ref_chains, mdl_chains):
3588 n *= _NChemGroupMappings(a,b)
3589 return n
3590
3591def _NMappingsWithin(ref_chains, mdl_chains, max_mappings):
3592 """ Check whether total number of mappings is smaller than given maximum
3593
3594 In principle the same as :func:`_NMappings` but it stops as soon as the
3595 maximum is hit.
3596
3597 :param ref_chains: Chem groups of reference
3598 :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3599 :param mdl_chains: Model chains that map onto those chem groups
3600 :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3601 :param max_mappings: Number of max allowed mappings
3602 :returns: Whether number of possible mappings of *mdl_chains* onto
3603 *ref_chains* is below or equal *max_mappings*.
3604 """
3605 assert(len(ref_chains) == len(mdl_chains))
3606 n = 1
3607 for a,b in zip(ref_chains, mdl_chains):
3608 n *= _NChemGroupMappings(a,b)
3609 if n > max_mappings:
3610 return False
3611 return True
3612
3613def _RefSmallerGenerator(ref_chains, mdl_chains):
3614 """ Returns all possible ways to map mdl_chains onto ref_chains
3615
3616 Specific for the case where len(ref_chains) < len(mdl_chains)
3617 """
3618 for c in itertools.combinations(mdl_chains, len(ref_chains)):
3619 for p in itertools.permutations(c):
3620 yield list(p)
3621
3622def _RefLargerGenerator(ref_chains, mdl_chains):
3623 """ Returns all possible ways to map mdl_chains onto ref_chains
3624
3625 Specific for the case where len(ref_chains) > len(mdl_chains)
3626 Ref chains without mapped mdl chain are assigned None
3627 """
3628 n_ref = len(ref_chains)
3629 n_mdl = len(mdl_chains)
3630 for c in itertools.combinations(range(n_ref), n_mdl):
3631 for p in itertools.permutations(mdl_chains):
3632 ret_list = [None] * n_ref
3633 for idx, ch in zip(c, p):
3634 ret_list[idx] = ch
3635 yield ret_list
3636
3637def _RefEqualGenerator(ref_chains, mdl_chains):
3638 """ Returns all possible ways to map mdl_chains onto ref_chains
3639
3640 Specific for the case where len(ref_chains) == len(mdl_chains)
3641 """
3642 for p in itertools.permutations(mdl_chains):
3643 yield list(p)
3644
3645def _RefEmptyGenerator(ref_chains, mdl_chains):
3646 yield list()
3647
3648def _ConcatIterators(iterators):
3649 for item in itertools.product(*iterators):
3650 yield list(item)
3651
3652def _ChainMappings(ref_chains, mdl_chains, n_max=None):
3653 """Returns all possible ways to map *mdl_chains* onto fixed *ref_chains*
3654
3655 :param ref_chains: List of list of chemically equivalent chains in reference
3656 :type ref_chains: :class:`list` of :class:`list`
3657 :param mdl_chains: Equally long list of list of chemically equivalent chains
3658 in model that map on those ref chains.
3659 :type mdl_chains: :class:`list` of :class:`list`
3660 :param n_max: Aborts and raises :class:`RuntimeError` if max number of
3661 mappings is above this threshold.
3662 :type n_max: :class:`int`
3663 :returns: Iterator over all possible mappings of *mdl_chains* onto fixed
3664 *ref_chains*. Potentially contains None as padding when number of
3665 model chains for a certain mapping is smaller than the according
3666 reference chains.
3667 Example: _ChainMappings([['A', 'B', 'C'], ['D', 'E']],
3668 [['x', 'y'], ['i', 'j']])
3669 gives an iterator over: [[['x', 'y', None], ['i', 'j']],
3670 [['x', 'y', None], ['j', 'i']],
3671 [['y', 'x', None], ['i', 'j']],
3672 [['y', 'x', None], ['j', 'i']],
3673 [['x', None, 'y'], ['i', 'j']],
3674 [['x', None, 'y'], ['j', 'i']],
3675 [['y', None, 'x'], ['i', 'j']],
3676 [['y', None, 'x'], ['j', 'i']],
3677 [[None, 'x', 'y'], ['i', 'j']],
3678 [[None, 'x', 'y'], ['j', 'i']],
3679 [[None, 'y', 'x'], ['i', 'j']],
3680 [[None, 'y', 'x'], ['j', 'i']]]
3681 """
3682 assert(len(ref_chains) == len(mdl_chains))
3683
3684 if n_max is not None:
3685 if not _NMappingsWithin(ref_chains, mdl_chains, n_max):
3686 raise RuntimeError(f"Too many mappings. Max allowed: {n_max}")
3687
3688 # one iterator per mapping representing all mdl combinations relative to
3689 # reference
3690 iterators = list()
3691 for ref, mdl in zip(ref_chains, mdl_chains):
3692 if len(ref) == 0:
3693 iterators.append(_RefEmptyGenerator(ref, mdl))
3694 elif len(ref) == len(mdl):
3695 iterators.append(_RefEqualGenerator(ref, mdl))
3696 elif len(ref) < len(mdl):
3697 iterators.append(_RefSmallerGenerator(ref, mdl))
3698 else:
3699 iterators.append(_RefLargerGenerator(ref, mdl))
3700
3701 return _ConcatIterators(iterators)
3702
3703
3704def _GetSuperposition(pos_one, pos_two, iterative):
3705 """ Computes minimal RMSD superposition for pos_one onto pos_two
3706
3707 :param pos_one: Positions that should be superposed onto *pos_two*
3708 :type pos_one: :class:`geom.Vec3List`
3709 :param pos_two: Reference positions
3710 :type pos_two: :class:`geom.Vec3List`
3711 :iterative: Whether iterative superposition should be used. Iterative
3712 potentially raises, uses standard superposition as fallback.
3713 :type iterative: :class:`bool`
3714 :returns: Transformation matrix to superpose *pos_one* onto *pos_two*
3715 :rtype: :class:`ost.mol.alg.SuperpositionResult`
3716 """
3717 res = None
3718 if iterative:
3719 try:
3720 res = mol.alg.IterativeSuperposeSVD(pos_one, pos_two)
3721 except:
3722 pass # triggers fallback below
3723 if res is None:
3724 res = mol.alg.SuperposeSVD(pos_one, pos_two)
3725 return res
3726
3727# specify public interface
3728__all__ = ('ChainMapper', 'ReprResult', 'MappingResult')
FromFlatMapping(self, flat_mapping)
Definition bb_lddt.py:457
_NPairConserved(self, trg_int, mdl_int)
Definition bb_lddt.py:504
_NSCConserved(self, trg_ch, mdl_ch)
Definition bb_lddt.py:481
__init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, contact_d=12.0, steep_opt_rate=None, greedy_prune_contact_map=False)
_SteepOpt(self, mapping, chains_to_optimize=None)
_SteepOpt(self, mapping, chains_to_optimize=None)
__init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], steep_opt_rate=None)
GetMapping(self, model, n_max_naive=40320)
__init__(self, target, resnum_alignments=False, pep_seqid_thr=95., nuc_seqid_thr=95., pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-11, pep_gap_ext=-1, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, min_pep_length=6, min_nuc_length=4, n_max_naive=1e8, mdl_map_pep_seqid_thr=0., mdl_map_nuc_seqid_thr=0., seqres=None, trg_seqres_mapping=None)
GetRMSDMapping(self, model, strategy="heuristic", subsampling=50, chem_mapping_result=None, heuristic_n_max_naive=120)
ResNumAlign(self, s1, s2, s1_ent, s2_ent)
_GroupSequences(self, seqs, seqid_thr, min_length, chem_type)
_MapSequence(self, ref_seqs, ref_types, s, s_type, s_ent, seq_id_thr=0.0, min_aln_length=0)
GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False, only_interchain=False, chem_mapping_result=None, global_mapping=None)
GetQSScoreMapping(self, model, contact_d=12.0, strategy="heuristic", block_seed_size=5, block_blocks_per_chem_group=5, heuristic_n_max_naive=40320, steep_opt_rate=None, chem_mapping_result=None, greedy_prune_contact_map=True)
GetAFMMapping(self, model, chem_mapping_result=None)
GetlDDTMapping(self, model, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic", steep_opt_rate=None, block_seed_size=5, block_blocks_per_chem_group=5, chem_mapping_result=None, heuristic_n_max_naive=40320)
__init__(self, target, model, chem_groups, chem_mapping, mdl_chains_without_chem_mapping, mapping, alns, opt_score=None)
__init__(self, lDDT, substructure, ref_view, mdl_view)
GetFlatChainMapping(self, mdl_as_key=False)
_GetInconsistentResidues(self, ref_residues, mdl_residues)
FromFlatMapping(self, flat_mapping)
Definition qsscore.py:528
_MappedInterfaceScores(self, int1, int2)
Definition qsscore.py:583
_InterfacePenalty2(self, interface)
Definition qsscore.py:740
_InterfacePenalty1(self, interface)
Definition qsscore.py:734
Real DLLEXPORT_OST_GEOM Distance(const Line2 &l, const Vec2 &v)
Real Length2(const Vec2 &v)
returns squared length of vector
Definition vecmat2_op.hh:35
_ChainMappings(ref_chains, mdl_chains, n_max=None)
_GetSuperposition(pos_one, pos_two, iterative)
_IterativeRigidRMSD(initial_transforms, initial_mappings, chem_groups, chem_mapping, trg_group_pos, mdl_group_pos)
_RefSmallerGenerator(ref_chains, mdl_chains)
_RefLargerGenerator(ref_chains, mdl_chains)
_RefEmptyGenerator(ref_chains, mdl_chains)
_GetRefMdlAlns(ref_chem_groups, ref_chem_group_msas, mdl_chem_groups, mdl_chem_group_alns, pairs=None)
_lDDTGreedyBlock(the_greed, seed_size, blocks_per_chem_group)
_NaiveRMSD(chem_groups, chem_mapping, trg_group_pos, mdl_group_pos, n_max_naive)
_RefIndicesToColumnIndices(msa, indices)
_SingleRigidCentroid(initial_transforms, initial_mappings, chem_groups, chem_mapping, trg_group_pos, mdl_group_pos)
_CheckOneToOneMapping(ref_chains, mdl_chains)
_QSScoreNaive(trg, mdl, chem_groups, chem_mapping, ref_mdl_alns, contact_d, n_max_naive)
_NMappings(ref_chains, mdl_chains)
_ExtractMSAPos(msa, s_idx, indices, bb)
_QSScoreGreedyBlock(the_greed, seed_size, blocks_per_chem_group)
_lDDTNaive(trg, mdl, inclusion_radius, thresholds, chem_groups, chem_mapping, ref_mdl_alns, n_max_naive)
_GetRefPos(trg, mdl, trg_msas, mdl_alns, max_pos=None)
_NChemGroupMappings(ref_chains, mdl_chains)
_GetSeeds(ref_chem_groups, mdl_chem_groups, mapped_ref_chains=set(), mapped_mdl_chains=set())
_SingleRigidRMSD(initial_transforms, initial_mappings, chem_groups, chem_mapping, trg_group_pos, mdl_group_pos)
_RefEqualGenerator(ref_chains, mdl_chains)
_NMappingsWithin(ref_chains, mdl_chains, max_mappings)