OpenStructure
chain_mapping.py
Go to the documentation of this file.
1 """
2 Chain mapping aims to identify a one-to-one relationship between chains in a
3 reference structure and a model.
4 """
5 
6 import itertools
7 import copy
8 
9 import numpy as np
10 
11 from scipy.special import factorial
12 from scipy.special import binom # as of Python 3.8, the math module implements
13  # comb, i.e. n choose k
14 
15 import ost
16 from ost import seq
17 from ost import mol
18 from ost import geom
19 
20 from ost.mol.alg import lddt
21 from ost.mol.alg import bb_lddt
22 from ost.mol.alg import qsscore
23 
24 def _CSel(ent, cnames):
25  """ Returns view with specified chains
26 
27  Ensures that quotation marks are around chain names to not confuse
28  OST query language with weird special characters.
29  """
30  query = "cname=" + ','.join([mol.QueryQuoteName(cname) for cname in cnames])
31  return ent.Select(query)
32 
34  """ Result object for the chain mapping functions in :class:`ChainMapper`
35 
36  Constructor is directly called within the functions, no need to construct
37  such objects yourself.
38  """
39  def __init__(self, target, model, chem_groups, chem_mapping,
40  mdl_chains_without_chem_mapping, mapping, alns, opt_score=None):
41  self._target_target = target
42  self._model_model = model
43  self._chem_groups_chem_groups = chem_groups
44  self._chem_mapping_chem_mapping = chem_mapping
45  self._mdl_chains_without_chem_mapping_mdl_chains_without_chem_mapping = mdl_chains_without_chem_mapping
46  self._mapping_mapping = mapping
47  self._alns_alns = alns
48  self._opt_score_opt_score = opt_score
49 
50  @property
51  def target(self):
52  """ Target/reference structure, i.e. :attr:`ChainMapper.target`
53 
54  :type: :class:`ost.mol.EntityView`
55  """
56  return self._target_target
57 
58  @property
59  def model(self):
60  """ Model structure that gets mapped onto :attr:`~target`
61 
62  Underwent same processing as :attr:`ChainMapper.target`, i.e.
63  only contains peptide/nucleotide chains of sufficient size.
64 
65  :type: :class:`ost.mol.EntityView`
66  """
67  return self._model_model
68 
69  @property
70  def chem_groups(self):
71  """ Groups of chemically equivalent chains in :attr:`~target`
72 
73  Same as :attr:`ChainMapper.chem_group`
74 
75  :class:`list` of :class:`list` of :class:`str` (chain names)
76  """
77  return self._chem_groups_chem_groups
78 
79  @property
80  def chem_mapping(self):
81  """ Assigns chains in :attr:`~model` to :attr:`~chem_groups`.
82 
83  :class:`list` of :class:`list` of :class:`str` (chain names)
84  """
85  return self._chem_mapping_chem_mapping
86 
87  @property
89  """ Model chains that cannot be mapped to :attr:`chem_groups`
90 
91  Depends on parameterization of :class:`ChainMapper`
92 
93  :class:`list` of class:`str` (chain names)
94  """
95  return self._mdl_chains_without_chem_mapping_mdl_chains_without_chem_mapping
96 
97  @property
98  def mapping(self):
99  """ Mapping of :attr:`~model` chains onto :attr:`~target`
100 
101  Exact same shape as :attr:`~chem_groups` but containing the names of the
102  mapped chains in :attr:`~model`. May contain None for :attr:`~target`
103  chains that are not covered. No guarantee that all chains in
104  :attr:`~model` are mapped.
105 
106  :class:`list` of :class:`list` of :class:`str` (chain names)
107  """
108  return self._mapping_mapping
109 
110  @property
111  def alns(self):
112  """ Alignments of mapped chains in :attr:`~target` and :attr:`~model`
113 
114  Each alignment is accessible with ``alns[(t_chain,m_chain)]``. First
115  sequence is the sequence of :attr:`target` chain, second sequence the
116  one from :attr:`~model`.
117 
118  :type: :class:`dict` with key: :class:`tuple` of :class:`str`, value:
119  :class:`ost.seq.AlignmentHandle`
120  """
121  return self._alns_alns
122 
123  @property
124  def opt_score(self):
125  """ Placeholder property without any guarantee of being set
126 
127  Different scores get optimized in the various chain mapping algorithms.
128  Some of them may set their final optimal score in that property.
129  Consult the documentation of the respective chain mapping algorithm
130  for more information. Won't be in the return dict of
131  :func:`JSONSummary`.
132  """
133  return self._opt_score_opt_score
134 
135  def GetFlatMapping(self, mdl_as_key=False):
136  """ Returns flat mapping as :class:`dict` for all mapable chains
137 
138  :param mdl_as_key: Default is target chain name as key and model chain
139  name as value. This can be reversed with this flag.
140  :returns: :class:`dict` with :class:`str` as key/value that describe
141  one-to-one mapping
142  """
143  flat_mapping = dict()
144  for trg_chem_group, mdl_chem_group in zip(self.chem_groupschem_groups,
145  self.mappingmapping):
146  for a,b in zip(trg_chem_group, mdl_chem_group):
147  if a is not None and b is not None:
148  if mdl_as_key:
149  flat_mapping[b] = a
150  else:
151  flat_mapping[a] = b
152  return flat_mapping
153 
154  def JSONSummary(self):
155  """ Returns JSON serializable summary of results
156  """
157  json_dict = dict()
158  json_dict["chem_groups"] = self.chem_groupschem_groups
159  json_dict["mapping"] = self.mappingmapping
160  json_dict["flat_mapping"] = self.GetFlatMappingGetFlatMapping()
161  json_dict["alns"] = list()
162  for aln in self.alnsalns.values():
163  trg_seq = aln.GetSequence(0)
164  mdl_seq = aln.GetSequence(1)
165  aln_dict = {"trg_ch": trg_seq.GetName(), "trg_seq": str(trg_seq),
166  "mdl_ch": mdl_seq.GetName(), "mdl_seq": str(mdl_seq)}
167  json_dict["alns"].append(aln_dict)
168  return json_dict
169 
170 
172 
173  """ Result object for :func:`ChainMapper.GetRepr`
174 
175  Constructor is directly called within the function, no need to construct
176  such objects yourself.
177 
178  :param lDDT: lDDT for this mapping. Depends on how you call
179  :func:`ChainMapper.GetRepr` whether this is backbone only or
180  full atom lDDT.
181  :type lDDT: :class:`float`
182  :param substructure: The full substructure for which we searched for a
183  representation
184  :type substructure: :class:`ost.mol.EntityView`
185  :param ref_view: View pointing to the same underlying entity as
186  *substructure* but only contains the stuff that is mapped
187  :type ref_view: :class:`mol.EntityView`
188  :param mdl_view: The matching counterpart in model
189  :type mdl_view: :class:`mol.EntityView`
190  """
191  def __init__(self, lDDT, substructure, ref_view, mdl_view):
192  self._lDDT_lDDT = lDDT
193  self._substructure_substructure = substructure
194  assert(len(ref_view.residues) == len(mdl_view.residues))
195  self._ref_view_ref_view = ref_view
196  self._mdl_view_mdl_view = mdl_view
197 
198  # lazily evaluated attributes
199  self._ref_bb_pos_ref_bb_pos = None
200  self._mdl_bb_pos_mdl_bb_pos = None
201  self._ref_full_bb_pos_ref_full_bb_pos = None
202  self._mdl_full_bb_pos_mdl_full_bb_pos = None
203  self._superposition_superposition = None
204  self._superposed_mdl_bb_pos_superposed_mdl_bb_pos = None
205  self._ost_query_ost_query = None
206  self._flat_mapping_flat_mapping = None
207  self._inconsistent_residues_inconsistent_residues = None
208 
209  @property
210  def lDDT(self):
211  """ lDDT of representation result
212 
213  Depends on how you call :func:`ChainMapper.GetRepr` whether this is
214  backbone only or full atom lDDT.
215 
216  :type: :class:`float`
217  """
218  return self._lDDT_lDDT
219 
220  @property
221  def substructure(self):
222  """ The full substructure for which we searched for a
223  representation
224 
225  :type: :class:`ost.mol.EntityView`
226  """
227  return self._substructure_substructure
228 
229  @property
230  def ref_view(self):
231  """ View which contains the mapped subset of :attr:`substructure`
232 
233  :type: :class:`ost.mol.EntityView`
234  """
235  return self._ref_view_ref_view
236 
237  @property
238  def mdl_view(self):
239  """ The :attr:`ref_view` representation in the model
240 
241  :type: :class:`ost.mol.EntityView`
242  """
243  return self._mdl_view_mdl_view
244 
245  @property
246  def ref_residues(self):
247  """ The reference residues
248 
249  :type: class:`mol.ResidueViewList`
250  """
251  return self.ref_viewref_view.residues
252 
253  @property
254  def mdl_residues(self):
255  """ The model residues
256 
257  :type: :class:`mol.ResidueViewList`
258  """
259  return self.mdl_viewmdl_view.residues
260 
261  @property
263  """ A list of mapped residue whose names do not match (eg. ALA in the
264  reference and LEU in the model).
265 
266  The mismatches are reported as a tuple of :class:`~ost.mol.ResidueView`
267  (reference, model), or as an empty list if all the residue names match.
268 
269  :type: :class:`list`
270  """
271  if self._inconsistent_residues_inconsistent_residues is None:
272  self._inconsistent_residues_inconsistent_residues = self._GetInconsistentResidues_GetInconsistentResidues(
273  self.ref_residuesref_residues, self.mdl_residuesmdl_residues)
274  return self._inconsistent_residues_inconsistent_residues
275 
276  @property
277  def ref_bb_pos(self):
278  """ Representative backbone positions for reference residues.
279 
280  Thats CA positions for peptides and C3' positions for Nucleotides.
281 
282  :type: :class:`geom.Vec3List`
283  """
284  if self._ref_bb_pos_ref_bb_pos is None:
285  self._ref_bb_pos_ref_bb_pos = self._GetBBPos_GetBBPos(self.ref_residuesref_residues)
286  return self._ref_bb_pos_ref_bb_pos
287 
288  @property
289  def mdl_bb_pos(self):
290  """ Representative backbone positions for model residues.
291 
292  Thats CA positions for peptides and C3' positions for Nucleotides.
293 
294  :type: :class:`geom.Vec3List`
295  """
296  if self._mdl_bb_pos_mdl_bb_pos is None:
297  self._mdl_bb_pos_mdl_bb_pos = self._GetBBPos_GetBBPos(self.mdl_residuesmdl_residues)
298  return self._mdl_bb_pos_mdl_bb_pos
299 
300  @property
301  def ref_full_bb_pos(self):
302  """ Representative backbone positions for reference residues.
303 
304  Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
305  positions for Nucleotides.
306 
307  :type: :class:`geom.Vec3List`
308  """
309  if self._ref_full_bb_pos_ref_full_bb_pos is None:
310  self._ref_full_bb_pos_ref_full_bb_pos = self._GetFullBBPos_GetFullBBPos(self.ref_residuesref_residues)
311  return self._ref_full_bb_pos_ref_full_bb_pos
312 
313  @property
314  def mdl_full_bb_pos(self):
315  """ Representative backbone positions for reference residues.
316 
317  Thats N, CA and C positions for peptides and O5', C5', C4', C3', O3'
318  positions for Nucleotides.
319 
320  :type: :class:`geom.Vec3List`
321  """
322  if self._mdl_full_bb_pos_mdl_full_bb_pos is None:
323  self._mdl_full_bb_pos_mdl_full_bb_pos = self._GetFullBBPos_GetFullBBPos(self.mdl_residuesmdl_residues)
324  return self._mdl_full_bb_pos_mdl_full_bb_pos
325 
326 
327  @property
328  def superposition(self):
329  """ Superposition of mdl residues onto ref residues
330 
331  Superposition computed as minimal RMSD superposition on
332  :attr:`ref_bb_pos` and :attr:`mdl_bb_pos`. If number of positions is
333  smaller 3, the full_bb_pos equivalents are used instead.
334 
335  :type: :class:`ost.mol.alg.SuperpositionResult`
336  """
337  if self._superposition_superposition is None:
338  if len(self.mdl_bb_posmdl_bb_pos) < 3:
339  self._superposition_superposition = _GetSuperposition(self.mdl_full_bb_posmdl_full_bb_pos,
340  self.ref_full_bb_posref_full_bb_pos, False)
341  else:
342  self._superposition_superposition = _GetSuperposition(self.mdl_bb_posmdl_bb_pos,
343  self.ref_bb_posref_bb_pos, False)
344  return self._superposition_superposition
345 
346  @property
347  def transform(self):
348  """ Transformation to superpose mdl residues onto ref residues
349 
350  :type: :class:`ost.geom.Mat4`
351  """
352  return self.superpositionsuperposition.transformation
353 
354  @property
356  """ :attr:`mdl_bb_pos` with :attr:`transform applied`
357 
358  :type: :class:`geom.Vec3List`
359  """
360  if self._superposed_mdl_bb_pos_superposed_mdl_bb_pos is None:
361  self._superposed_mdl_bb_pos_superposed_mdl_bb_pos = geom.Vec3List(self.mdl_bb_posmdl_bb_pos)
362  self._superposed_mdl_bb_pos_superposed_mdl_bb_pos.ApplyTransform(self.transformtransform)
363  return self._superposed_mdl_bb_pos_superposed_mdl_bb_pos
364 
365  @property
366  def bb_rmsd(self):
367  """ RMSD of the binding site backbone atoms after :attr:`superposition`
368 
369  :type: :class:`float`
370  """
371  return self.superpositionsuperposition.rmsd
372 
373 
374  @property
375  def ost_query(self):
376  """ query for mdl residues in OpenStructure query language
377 
378  Repr can be selected as ``full_mdl.Select(ost_query)``
379 
380  Returns invalid query if residue numbers have insertion codes.
381 
382  :type: :class:`str`
383  """
384  if self._ost_query_ost_query is None:
385  chain_rnums = dict()
386  for r in self.mdl_residuesmdl_residues:
387  chname = r.GetChain().GetName()
388  rnum = r.GetNumber().GetNum()
389  if chname not in chain_rnums:
390  chain_rnums[chname] = list()
391  chain_rnums[chname].append(str(rnum))
392  chain_queries = list()
393  for k,v in chain_rnums.items():
394  q = f"(cname={mol.QueryQuoteName(k)} and "
395  q += f"rnum={','.join(v)})"
396  chain_queries.append(q)
397  self._ost_query_ost_query = " or ".join(chain_queries)
398  return self._ost_query_ost_query
399 
400  def JSONSummary(self):
401  """ Returns JSON serializable summary of results
402  """
403  json_dict = dict()
404  json_dict["lDDT"] = self.lDDTlDDT
405  json_dict["ref_residues"] = [r.GetQualifiedName() for r in \
406  self.ref_residuesref_residues]
407  json_dict["mdl_residues"] = [r.GetQualifiedName() for r in \
408  self.mdl_residuesmdl_residues]
409  json_dict["transform"] = list(self.transformtransform.data)
410  json_dict["bb_rmsd"] = self.bb_rmsdbb_rmsd
411  json_dict["ost_query"] = self.ost_queryost_query
412  json_dict["flat_mapping"] = self.GetFlatChainMappingGetFlatChainMapping()
413  return json_dict
414 
415  def GetFlatChainMapping(self, mdl_as_key=False):
416  """ Returns flat mapping of all chains in the representation
417 
418  :param mdl_as_key: Default is target chain name as key and model chain
419  name as value. This can be reversed with this flag.
420  :returns: :class:`dict` with :class:`str` as key/value that describe
421  one-to-one mapping
422  """
423  flat_mapping = dict()
424  for trg_res, mdl_res in zip(self.ref_residuesref_residues, self.mdl_residuesmdl_residues):
425  if mdl_as_key:
426  flat_mapping[mdl_res.chain.name] = trg_res.chain.name
427  else:
428  flat_mapping[trg_res.chain.name] = mdl_res.chain.name
429  return flat_mapping
430 
431  def _GetFullBBPos(self, residues):
432  """ Helper to extract full backbone positions
433  """
434  exp_pep_atoms = ["N", "CA", "C"]
435  exp_nuc_atoms = ["O5'", "C5'", "C4'", "C3'", "O3'"]
436  bb_pos = geom.Vec3List()
437  for r in residues:
438  if r.GetChemType() == mol.ChemType.NUCLEOTIDES:
439  exp_atoms = exp_nuc_atoms
440  elif r.GetChemType() == mol.ChemType.AMINOACIDS:
441  exp_atoms = exp_pep_atoms
442  else:
443  raise RuntimeError(f"Residue {r.qualified_name} has unexpected"
444  f" chem type {r.GetChemType()}")
445  for aname in exp_atoms:
446  a = r.FindAtom(aname)
447  if not a.IsValid():
448  raise RuntimeError(f"Expected atom {aname} missing from "
449  f"{r.qualified_name}")
450  bb_pos.append(a.GetPos())
451  return bb_pos
452 
453  def _GetBBPos(self, residues):
454  """ Helper to extract single representative position for each residue
455  """
456  bb_pos = geom.Vec3List()
457  for r in residues:
458  at = r.FindAtom("CA")
459  if not at.IsValid():
460  at = r.FindAtom("C3'")
461  if not at.IsValid():
462  raise RuntimeError(f"Residue {r.qualified_name} missing "
463  f"expected CA/C3' atom")
464  bb_pos.append(at.GetPos())
465  return bb_pos
466 
467  def _GetInconsistentResidues(self, ref_residues, mdl_residues):
468  """ Helper to extract a list of inconsistent residues.
469  """
470  if len(ref_residues) != len(mdl_residues):
471  raise ValueError("Something terrible happened... Reference and "
472  "model lengths differ... RUN...")
473  inconsistent_residues = list()
474  for ref_residue, mdl_residue in zip(ref_residues, mdl_residues):
475  if ref_residue.name != mdl_residue.name:
476  inconsistent_residues.append((ref_residue, mdl_residue))
477  return inconsistent_residues
478 
479 
481  """ Class to compute chain mappings
482 
483  All algorithms are performed on processed structures which fulfill
484  criteria as given in constructor arguments (*min_pep_length*,
485  "min_nuc_length") and only contain residues which have all required backbone
486  atoms. for peptide residues thats N, CA, C and CB (no CB for GLY), for
487  nucleotide residues thats O5', C5', C4', C3' and O3'.
488 
489  Chain mapping is a three step process:
490 
491  * Group chemically identical chains in *target* into so called chem
492  groups. There are two options to achieve this:
493  1) using pairwise alignments that are either computed with
494  Needleman-Wunsch (NW) or simply derived from residue numbers
495  (*resnum_alignments* flag). In case of NW, *pep_subst_mat*, *pep_gap_open*
496  and *pep_gap_ext* and their nucleotide equivalents are relevant. Two
497  chains are considered identical if they fulfill the *pep_seqid_thr* and
498  have at least *min_pep_length* aligned residues. Same logic is applied
499  for nucleotides with respective thresholds.
500  2) specify seqres sequences and provide the mapping manually. This can
501  be useful if you're loading data from an mmCIF file where you have mmCIF
502  entity information. Alignments are performed based on residue numbers
503  in this case and don't consider the *resnum_alignments* flag. Any
504  mismatch of one letter code in the structure and the respective SEQRES
505  raises an error.
506 
507  * Map chains in an input model to these groups. Parameters for generating
508  alignments are the same as above. Model chains are mapped to the chem
509  group with highest sequence identity. Optionally, you can set a minimum
510  sequence identity threshold with *mdl_map_pep_seqid_thr*/
511  *mdl_map_nuc_seqid_thr* to avoid nonsensical mappings. You can either
512  get the group mapping with :func:`GetChemMapping` or directly call
513  one of the full fletched one-to-one chain mapping functions which
514  execute that step internally.
515 
516  * Obtain one-to-one mapping for chains in an input model and
517  *target* with one of the available mapping functions. Just to get an
518  idea of complexity. If *target* and *model* are octamers, there are
519  ``8! = 40320`` possible chain mappings.
520 
521  :param target: Target structure onto which models are mapped.
522  Computations happen on a selection only containing
523  polypeptides and polynucleotides.
524  :type target: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
525  :param resnum_alignments: Use residue numbers instead of
526  Needleman-Wunsch to compute pairwise
527  alignments. Relevant for :attr:`~chem_groups`
528  if *seqres* is not specified explicitely.
529  :type resnum_alignments: :class:`bool`
530  :param pep_seqid_thr: Sequence identity threshold (in percent of aligned
531  columns) used to decide when two chains are
532  identical. 95 percent tolerates the few mutations
533  crystallographers like to do. Range: [0-100].
534  :type pep_seqid_thr: :class:`float`
535  :param nuc_seqid_thr: Nucleotide equivalent for *pep_seqid_thr*
536  :type nuc_seqid_thr: :class:`float`
537  :param pep_subst_mat: Substitution matrix to align peptide sequences,
538  irrelevant if *resnum_alignments* is True,
539  defaults to seq.alg.BLOSUM62
540  :type pep_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
541  :param pep_gap_open: Gap open penalty to align peptide sequences,
542  irrelevant if *resnum_alignments* is True
543  :type pep_gap_open: :class:`int`
544  :param pep_gap_ext: Gap extension penalty to align peptide sequences,
545  irrelevant if *resnum_alignments* is True
546  :type pep_gap_ext: :class:`int`
547  :param nuc_subst_mat: Nucleotide equivalent for *pep_subst_mat*,
548  defaults to seq.alg.NUC44
549  :type nuc_subst_mat: :class:`ost.seq.alg.SubstWeightMatrix`
550  :param nuc_gap_open: Nucleotide equivalent for *pep_gap_open*
551  :type nuc_gap_open: :class:`int`
552  :param nuc_gap_ext: Nucleotide equivalent for *pep_gap_ext*
553  :type nuc_gap_ext: :class:`int`
554  :param min_pep_length: Minimal number of residues for a peptide chain to be
555  considered in target and in models.
556  :type min_pep_length: :class:`int`
557  :param min_nuc_length: Minimal number of residues for a nucleotide chain to be
558  considered in target and in models.
559  :type min_nuc_length: :class:`int`
560  :param n_max_naive: Max possible chain mappings that are enumerated in
561  :func:`~GetNaivelDDTMapping` /
562  :func:`~GetDecomposerlDDTMapping`. A
563  :class:`RuntimeError` is raised in case of bigger
564  complexity.
565  :type n_max_naive: :class:`int`
566  :param mdl_map_pep_seqid_thr: Sequence identity threshold (in percent of
567  aligned columns) to avoid non-sensical
568  mapping of model chains to reference chem
569  groups. If a value larger than 0.0 is
570  provided, minimal criteria for assignment
571  becomes a sequence identity of
572  *mdl_map_pep_seqid_thr* and at least
573  *min_pep_length* aligned residues. If set to
574  zero, it simply assigns a model chain to the
575  chem group with highest sequence identity.
576  Range: [0-100].
577  :type mdl_map_pep_seqid_thr: :class:`float`
578  :param mdl_map_nuc_seqid_thr: Nucleotide equivalent of
579  *mdl_map_pep_seqid_thr*
580  :type mdl_map_nuc_seqid_thr: :class:`float`
581  :param seqres: SEQRES sequences. All polymer chains in *target* will be
582  aligned to these sequences using a residue number based
583  alignment, effectively ignoring *resnum_alignments* in
584  the chem grouping step. Additionally, you need to
585  manually specify a mapping of the polymer chains using
586  *trg_seqres_mapping* and an error is raised otherwise.
587  The one letter codes in
588  the structure must exactly match the respective characters
589  in seqres and an error is raised if not. These seqres
590  sequences are set as :attr:`~chem_group_ref_seqs` and will
591  thus influence the mapping of model chains.
592  :type seqres: :class:`ost.seq.SequenceList`
593  :param trg_seqres_mapping: Maps each polymer chain in *target* to a sequence
594  in *seqres*. It's a :class:`dict` with chain name
595  as key and seqres sequence name as value.
596  :type trg_seqres_mapping: :class:`dict`
597  """
598  def __init__(self, target, resnum_alignments=False,
599  pep_seqid_thr = 95., nuc_seqid_thr = 95.,
600  pep_subst_mat = seq.alg.BLOSUM62, pep_gap_open = -11,
601  pep_gap_ext = -1, nuc_subst_mat = seq.alg.NUC44,
602  nuc_gap_open = -4, nuc_gap_ext = -4,
603  min_pep_length = 6, min_nuc_length = 4,
604  n_max_naive = 1e8,
605  mdl_map_pep_seqid_thr = 0.,
606  mdl_map_nuc_seqid_thr = 0.,
607  seqres = None,
608  trg_seqres_mapping = None):
609 
610  # input parameter check
611  seqres_params_set = sum([seqres is not None,
612  trg_seqres_mapping is not None])
613  if seqres_params_set not in [0, 2]:
614  raise RuntimeError("Must either give both, seqres and "
615  "trg_seqres_mapping, or none of them.")
616  # attributes
617  self.resnum_alignmentsresnum_alignments = resnum_alignments
618  self.pep_seqid_thrpep_seqid_thr = pep_seqid_thr
619  self.nuc_seqid_thrnuc_seqid_thr = nuc_seqid_thr
620  self.min_pep_lengthmin_pep_length = min_pep_length
621  self.min_nuc_lengthmin_nuc_length = min_nuc_length
622  self.n_max_naiven_max_naive = n_max_naive
623  self.mdl_map_pep_seqid_thrmdl_map_pep_seqid_thr = mdl_map_pep_seqid_thr
624  self.mdl_map_nuc_seqid_thrmdl_map_nuc_seqid_thr = mdl_map_nuc_seqid_thr
625  self.pep_subst_matpep_subst_mat = pep_subst_mat
626  self.pep_gap_openpep_gap_open = pep_gap_open
627  self.pep_gap_extpep_gap_ext = pep_gap_ext
628  self.nuc_subst_matnuc_subst_mat = nuc_subst_mat
629  self.nuc_gap_opennuc_gap_open = nuc_gap_open
630  self.nuc_gap_extnuc_gap_ext = nuc_gap_ext
631  self.seqresseqres = seqres
632  self.trg_seqres_mappingtrg_seqres_mapping = trg_seqres_mapping
633  self.seqres_setseqres_set = seqres_params_set == 2
634 
635  # lazy computed attributes
636  self._chem_groups_chem_groups = None
637  self._chem_group_alignments_chem_group_alignments = None
638  self._chem_group_ref_seqs_chem_group_ref_seqs = None
639  self._chem_group_types_chem_group_types = None
640 
641  # target structure preprocessing
642  self._target, self._polypep_seqs, self._polynuc_seqs_polynuc_seqs = \
643  self.ProcessStructureProcessStructure(target)
644 
645  # input parameter check
646  if self.seqres_setseqres_set:
647  # check if seqres names, i.e. entity ids, are unique
648  entity_ids = set()
649  for s in self.seqresseqres:
650  sname = s.GetName()
651  if sname in entity_ids:
652  raise RuntimeError(f"Sequence names in seqres param must "
653  f"be unique and refer to entity ids. "
654  f"Duplicate sequence name: \"{sname}\"")
655  entity_ids.add(sname)
656  # check if entity ids in trg_seqres_mapping are all covered
657  # in seqres
658  for cname, entity_id in self.trg_seqres_mappingtrg_seqres_mapping.items():
659  if entity_id not in entity_ids:
660  raise RuntimeError(f"trg_seqres_mapping contains entity id "
661  f"for which no seqres is given: "
662  f"cname: \"{cname}\", entity id: "
663  f"\"{entity_id}\"")
664  # check if all sequences in self.polypep_seqs and self.polynuc_seqs
665  # have a mapping in trg_seqres_mapping
666  for s in self.polypep_seqspolypep_seqs:
667  if s.GetName() not in self.trg_seqres_mappingtrg_seqres_mapping:
668  raise RuntimeError(f"If seqres information is provided, "
669  f"all polymer chains must be covered "
670  f"by trg_seqres_mapping. Missing "
671  f"mapping for chain \"{s.GetName()}\"")
672  for s in self.polynuc_seqspolynuc_seqs:
673  if s.GetName() not in self.trg_seqres_mappingtrg_seqres_mapping:
674  raise RuntimeError(f"If seqres information is provided, "
675  f"all polymer chains must be covered "
676  f"by trg_seqres_mapping. Missing "
677  f"mapping for chain \"{s.GetName()}\"")
678 
679 
680  @property
681  def target(self):
682  """Target structure that only contains peptides/nucleotides
683 
684  Contains only residues that have the backbone representatives
685  (CA for peptide and C3' for nucleotides) to avoid ATOMSEQ alignment
686  inconsistencies when switching between all atom and backbone only
687  representations.
688 
689  :type: :class:`ost.mol.EntityView`
690  """
691  return self._target
692 
693  @property
694  def polypep_seqs(self):
695  """Sequences of peptide chains in :attr:`~target`
696 
697  :type: :class:`ost.seq.SequenceList`
698  """
699  return self._polypep_seqs
700 
701  @property
702  def polynuc_seqs(self):
703  """Sequences of nucleotide chains in :attr:`~target`
704 
705  :type: :class:`ost.seq.SequenceList`
706  """
707  return self._polynuc_seqs_polynuc_seqs
708 
709  @property
710  def chem_groups(self):
711  """Groups of chemically equivalent chains in :attr:`~target`
712 
713  First chain in group is the one with longest sequence.
714 
715  :getter: Computed on first use (cached)
716  :type: :class:`list` of :class:`list` of :class:`str` (chain names)
717  """
718  if self._chem_groups_chem_groups is None:
719  self._ComputeChemGroups_ComputeChemGroups()
720  return self._chem_groups_chem_groups
721 
722  @property
724  """MSA for each group in :attr:`~chem_groups`
725 
726  The first sequence is the reference sequence.
727  The subsequent sequences represent the ATOMSEQ sequences in
728  :attr:`~target` in same order as in :attr:`~chem_groups`.
729 
730  :getter: Computed on first use (cached)
731  :type: :class:`ost.seq.AlignmentList`
732  """
733  if self._chem_group_alignments_chem_group_alignments is None:
734  self._ComputeChemGroups_ComputeChemGroups()
735  return self._chem_group_alignments_chem_group_alignments
736 
737  @property
739  """Reference sequence for each group in :attr:`~chem_groups`
740 
741  :getter: Computed on first use (cached)
742  :type: :class:`ost.seq.SequenceList`
743  """
744  if self._chem_group_ref_seqs_chem_group_ref_seqs is None:
745  self._ComputeChemGroups_ComputeChemGroups()
746  return self._chem_group_ref_seqs_chem_group_ref_seqs
747 
748  @property
749  def chem_group_types(self):
750  """ChemType of each group in :attr:`~chem_groups`
751 
752  Specifying if groups are poly-peptides/nucleotides, i.e.
753  :class:`ost.mol.ChemType.AMINOACIDS` or
754  :class:`ost.mol.ChemType.NUCLEOTIDES`
755 
756  :getter: Computed on first use (cached)
757  :type: :class:`list` of :class:`ost.mol.ChemType`
758  """
759  if self._chem_group_types_chem_group_types is None:
760  self._ComputeChemGroups_ComputeChemGroups()
761  return self._chem_group_types_chem_group_types
762 
763  def GetChemMapping(self, model):
764  """Maps sequences in *model* to chem_groups of target
765 
766  :param model: Model from which to extract sequences, a
767  selection that only includes peptides and nucleotides
768  is performed and returned along other results.
769  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
770  :returns: Tuple with two lists of length `len(self.chem_groups)`, a list
771  and an :class:`ost.mol.EntityView` representing *model*:
772  1) Each element is a :class:`list` with mdl chain names that
773  map to the chem group at that position.
774  2) Each element is a :class:`ost.seq.AlignmentList` aligning
775  these mdl chain sequences to the chem group ref sequences.
776  3) The model chains that could not be mapped to the reference
777  4) A selection of *model* that only contains polypeptides and
778  polynucleotides whose ATOMSEQ exactly matches the sequence
779  info in the returned alignments.
780  """
781  mdl, mdl_pep_seqs, mdl_nuc_seqs = self.ProcessStructureProcessStructure(model)
782  mapping = [list() for x in self.chem_groupschem_groups]
783  mdl_chains_without_chem_mapping = list()
784  alns = [seq.AlignmentList() for x in self.chem_groupschem_groups]
785 
786  for s in mdl_pep_seqs:
787  idx, aln = self._MapSequence_MapSequence(self.chem_group_ref_seqschem_group_ref_seqs,
788  self.chem_group_typeschem_group_types,
789  s, mol.ChemType.AMINOACIDS, mdl,
790  seq_id_thr = self.mdl_map_pep_seqid_thrmdl_map_pep_seqid_thr,
791  min_aln_length = self.min_pep_lengthmin_pep_length)
792  if idx is None:
793  ost.LogWarning("Could not map model chain %s to any chem group"
794  " in the target" % s.name)
795  mdl_chains_without_chem_mapping.append(s.GetName())
796  else:
797  mapping[idx].append(s.GetName())
798  alns[idx].append(aln)
799 
800  for s in mdl_nuc_seqs:
801  idx, aln = self._MapSequence_MapSequence(self.chem_group_ref_seqschem_group_ref_seqs,
802  self.chem_group_typeschem_group_types,
803  s, mol.ChemType.NUCLEOTIDES, mdl,
804  seq_id_thr = self.mdl_map_nuc_seqid_thrmdl_map_nuc_seqid_thr,
805  min_aln_length = self.min_nuc_lengthmin_nuc_length)
806  if idx is None:
807  ost.LogWarning("Could not map model chain %s to any chem group"
808  " in the target" % s.name)
809  mdl_chains_without_chem_mapping.append(s.GetName())
810  else:
811  mapping[idx].append(s.GetName())
812  alns[idx].append(aln)
813 
814  return (mapping, alns, mdl_chains_without_chem_mapping, mdl)
815 
816 
817  def GetlDDTMapping(self, model, inclusion_radius=15.0,
818  thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic",
819  steep_opt_rate = None, block_seed_size = 5,
820  block_blocks_per_chem_group = 5,
821  chem_mapping_result = None,
822  heuristic_n_max_naive = 40320):
823  """ Identify chain mapping by optimizing lDDT score
824 
825  Maps *model* chain sequences to :attr:`~chem_groups` and find mapping
826  based on backbone only lDDT score (CA for amino acids C3' for
827  Nucleotides).
828 
829  Either performs a naive search, i.e. enumerate all possible mappings or
830  executes a greedy strategy that tries to identify a (close to) optimal
831  mapping in an iterative way by starting from a start mapping (seed). In
832  each iteration, the one-to-one mapping that leads to highest increase
833  in number of conserved contacts is added with the additional requirement
834  that this added mapping must have non-zero interface counts towards the
835  already mapped chains. So basically we're "growing" the mapped structure
836  by only adding connected stuff.
837 
838  The available strategies:
839 
840  * **naive**: Enumerates all possible mappings and returns best
841 
842  * **greedy_full**: try multiple seeds for greedy extension, i.e. try
843  all ref/mdl chain combinations within the respective chem groups and
844  retain the mapping leading to the best lDDT.
845 
846  * **greedy_block**: try multiple seeds for greedy extension, i.e. try
847  all ref/mdl chain combinations within the respective chem groups and
848  extend them to *block_seed_size*. *block_blocks_per_chem_group*
849  for each chem group are selected for exhaustive extension.
850 
851  * **heuristic**: Uses *naive* strategy if number of possible mappings
852  is within *heuristic_n_max_naive*. The default of 40320 corresponds
853  to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
854  6!*2!=1440 etc. If the number of possible mappings is larger,
855  *greedy_full* is used.
856 
857  Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
858  mapping.
859 
860  :param model: Model to map
861  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
862  :param inclusion_radius: Inclusion radius for lDDT
863  :type inclusion_radius: :class:`float`
864  :param thresholds: Thresholds for lDDT
865  :type thresholds: :class:`list` of :class:`float`
866  :param strategy: Strategy to find mapping. Must be in ["naive",
867  "greedy_full", "greedy_block"]
868  :type strategy: :class:`str`
869  :param steep_opt_rate: Only relevant for greedy strategies.
870  If set, every *steep_opt_rate* mappings, a simple
871  optimization is executed with the goal of
872  avoiding local minima. The optimization
873  iteratively checks all possible swaps of mappings
874  within their respective chem groups and accepts
875  swaps that improve lDDT score. Iteration stops as
876  soon as no improvement can be achieved anymore.
877  :type steep_opt_rate: :class:`int`
878  :param block_seed_size: Param for *greedy_block* strategy - Initial seeds
879  are extended by that number of chains.
880  :type block_seed_size: :class:`int`
881  :param block_blocks_per_chem_group: Param for *greedy_block* strategy -
882  Number of blocks per chem group that
883  are extended in an initial search
884  for high scoring local solutions.
885  :type block_blocks_per_chem_group: :class:`int`
886  :param chem_mapping_result: Pro param. The result of
887  :func:`~GetChemMapping` where you provided
888  *model*. If set, *model* parameter is not
889  used.
890  :type chem_mapping_result: :class:`tuple`
891  :returns: A :class:`MappingResult`
892  """
893 
894  strategies = ["naive", "greedy_full", "greedy_block", "heuristic"]
895  if strategy not in strategies:
896  raise RuntimeError(f"Strategy must be in {strategies}")
897 
898  if chem_mapping_result is None:
899  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
900  self.GetChemMappingGetChemMapping(model)
901  else:
902  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
903  chem_mapping_result
904 
905  ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
906  self.chem_group_alignmentschem_group_alignments,
907  chem_mapping,
908  chem_group_alns)
909 
910  # check for the simplest case
911  one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
912  if one_to_one is not None:
913  alns = dict()
914  for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
915  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
916  if ref_ch is not None and mdl_ch is not None:
917  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
918  alns[(ref_ch, mdl_ch)] = aln
919  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
920  mdl_chains_without_chem_mapping, one_to_one, alns)
921 
922  if strategy == "heuristic":
923  if _NMappingsWithin(self.chem_groupschem_groups, chem_mapping,
924  heuristic_n_max_naive):
925  strategy = "naive"
926  else:
927  strategy = "greedy_full"
928 
929  mapping = None
930  opt_lddt = None
931 
932  if strategy == "naive":
933  mapping, opt_lddt = _lDDTNaive(self.targettarget, mdl, inclusion_radius,
934  thresholds, self.chem_groupschem_groups,
935  chem_mapping, ref_mdl_alns,
936  self.n_max_naiven_max_naive)
937  else:
938  # its one of the greedy strategies - setup greedy searcher
939  the_greed = _lDDTGreedySearcher(self.targettarget, mdl, self.chem_groupschem_groups,
940  chem_mapping, ref_mdl_alns,
941  inclusion_radius=inclusion_radius,
942  thresholds=thresholds,
943  steep_opt_rate=steep_opt_rate)
944  if strategy == "greedy_full":
945  mapping = _lDDTGreedyFull(the_greed)
946  elif strategy == "greedy_block":
947  mapping = _lDDTGreedyBlock(the_greed, block_seed_size,
948  block_blocks_per_chem_group)
949  # cached => lDDT computation is fast here
950  opt_lddt = the_greed.Score(mapping)
951 
952  alns = dict()
953  for ref_group, mdl_group in zip(self.chem_groupschem_groups, mapping):
954  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
955  if ref_ch is not None and mdl_ch is not None:
956  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
957  alns[(ref_ch, mdl_ch)] = aln
958 
959  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
960  mdl_chains_without_chem_mapping, mapping, alns,
961  opt_score = opt_lddt)
962 
963 
964  def GetQSScoreMapping(self, model, contact_d = 12.0, strategy = "heuristic",
965  block_seed_size = 5, block_blocks_per_chem_group = 5,
966  heuristic_n_max_naive = 40320, steep_opt_rate = None,
967  chem_mapping_result = None,
968  greedy_prune_contact_map = True):
969  """ Identify chain mapping based on QSScore
970 
971  Scoring is based on CA/C3' positions which are present in all chains of
972  a :attr:`chem_groups` as well as the *model* chains which are mapped to
973  that respective chem group.
974 
975  The following strategies are available:
976 
977  * **naive**: Naively iterate all possible mappings and return best based
978  on QS score.
979 
980  * **greedy_full**: try multiple seeds for greedy extension, i.e. try
981  all ref/mdl chain combinations within the respective chem groups and
982  retain the mapping leading to the best QS-score.
983 
984  * **greedy_block**: try multiple seeds for greedy extension, i.e. try
985  all ref/mdl chain combinations within the respective chem groups and
986  extend them to *block_seed_size*. *block_blocks_per_chem_group*
987  for each chem group are selected for exhaustive extension.
988 
989  * **heuristic**: Uses *naive* strategy if number of possible mappings
990  is within *heuristic_n_max_naive*. The default of 40320 corresponds
991  to an octamer (8!=40320). A structure with stoichiometry A6B2 would be
992  6!*2!=1440 etc. If the number of possible mappings is larger,
993  *greedy_full* is used.
994 
995  Sets :attr:`MappingResult.opt_score` in case of no trivial one-to-one
996  mapping.
997 
998  :param model: Model to map
999  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1000  :param contact_d: Max distance between two residues to be considered as
1001  contact in qs scoring
1002  :type contact_d: :class:`float`
1003  :param strategy: Strategy for sampling, must be in ["naive",
1004  greedy_full", "greedy_block"]
1005  :type strategy: :class:`str`
1006  :param chem_mapping_result: Pro param. The result of
1007  :func:`~GetChemMapping` where you provided
1008  *model*. If set, *model* parameter is not
1009  used.
1010  :type chem_mapping_result: :class:`tuple`
1011  :param greedy_prune_contact_map: Relevant for all strategies that use
1012  greedy extensions. If True, only chains
1013  with at least 3 contacts (8A CB
1014  distance) towards already mapped chains
1015  in trg/mdl are considered for
1016  extension. All chains that give a
1017  potential non-zero QS-score increase
1018  are used otherwise (at least one
1019  contact within 12A). The consequence
1020  is reduced runtime and usually no
1021  real reduction in accuracy.
1022  :returns: A :class:`MappingResult`
1023  """
1024 
1025  strategies = ["naive", "greedy_full", "greedy_block", "heuristic"]
1026  if strategy not in strategies:
1027  raise RuntimeError(f"strategy must be {strategies}")
1028 
1029  if chem_mapping_result is None:
1030  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1031  self.GetChemMappingGetChemMapping(model)
1032  else:
1033  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1034  chem_mapping_result
1035  ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1036  self.chem_group_alignmentschem_group_alignments,
1037  chem_mapping,
1038  chem_group_alns)
1039  # check for the simplest case
1040  one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
1041  if one_to_one is not None:
1042  alns = dict()
1043  for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
1044  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1045  if ref_ch is not None and mdl_ch is not None:
1046  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1047  alns[(ref_ch, mdl_ch)] = aln
1048  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1049  mdl_chains_without_chem_mapping, one_to_one, alns)
1050 
1051  if strategy == "heuristic":
1052  if _NMappingsWithin(self.chem_groupschem_groups, chem_mapping,
1053  heuristic_n_max_naive):
1054  strategy = "naive"
1055  else:
1056  strategy = "greedy_full"
1057 
1058  mapping = None
1059  opt_qsscore = None
1060 
1061  if strategy == "naive":
1062  mapping, opt_qsscore = _QSScoreNaive(self.targettarget, mdl,
1063  self.chem_groupschem_groups,
1064  chem_mapping, ref_mdl_alns,
1065  contact_d, self.n_max_naiven_max_naive)
1066  else:
1067  # its one of the greedy strategies - setup greedy searcher
1068  the_greed = _QSScoreGreedySearcher(self.targettarget, mdl,
1069  self.chem_groupschem_groups,
1070  chem_mapping, ref_mdl_alns,
1071  contact_d = contact_d,
1072  steep_opt_rate=steep_opt_rate,
1073  greedy_prune_contact_map = greedy_prune_contact_map)
1074  if strategy == "greedy_full":
1075  mapping = _QSScoreGreedyFull(the_greed)
1076  elif strategy == "greedy_block":
1077  mapping = _QSScoreGreedyBlock(the_greed, block_seed_size,
1078  block_blocks_per_chem_group)
1079  # cached => QSScore computation is fast here
1080  opt_qsscore = the_greed.Score(mapping, check=False).QS_global
1081 
1082  alns = dict()
1083  for ref_group, mdl_group in zip(self.chem_groupschem_groups, mapping):
1084  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1085  if ref_ch is not None and mdl_ch is not None:
1086  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1087  alns[(ref_ch, mdl_ch)] = aln
1088 
1089  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1090  mdl_chains_without_chem_mapping, mapping, alns,
1091  opt_score = opt_qsscore)
1092 
1093  def GetRMSDMapping(self, model, strategy = "heuristic", subsampling=50,
1094  chem_mapping_result = None, heuristic_n_max_naive = 120):
1095  """Identify chain mapping based on minimal RMSD superposition
1096 
1097  Superposition and scoring is based on CA/C3' positions which are present
1098  in all chains of a :attr:`chem_groups` as well as the *model*
1099  chains which are mapped to that respective chem group.
1100 
1101  The following strategies are available:
1102 
1103  * **naive**: Naively iterate all possible mappings and return the one
1104  with lowes RMSD.
1105 
1106  * **greedy_single**: perform all vs. all single chain superpositions
1107  within the respective ref/mdl chem groups to use as starting points.
1108  For each starting point, iteratively add the model/target chain pair
1109  with lowest RMSD until a full mapping is achieved. The mapping with
1110  lowest RMSD is returned.
1111 
1112  * **greedy_single_centroid**: Same as *greedy_single* but iteratively
1113  adds model/target chain pairs with lowest centroid distance. The
1114  mapping with the lowest centroid RMSD is returned. This is mainly for
1115  evaluation purposes. One known failure mode is that intertwined
1116  structures may have multiple centroids sitting on top of each other.
1117  RMSD and thus *greedy_single* are better able to disentangle this
1118  situation.
1119 
1120  * **greedy_iterative**: Same as greedy_single_rmsd exept that the
1121  transformation gets updated with each added chain pair.
1122 
1123  * **heuristic**: Uses *naive* strategy if number of possible mappings
1124  is within *heuristic_n_max_naive*. The default of 120 corresponds
1125  to a homo-pentamer (5!=120). If the number of possible mappings is
1126  larger, *greedy_iterative* is used.
1127 
1128  :param model: Model to map
1129  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1130  :param strategy: Strategy for sampling. Must be in ["naive",
1131  "greedy_single", "greedy_iterative"]
1132  :type strategy: :class:`str`
1133  :param subsampling: If given, only an equally distributed subset
1134  of CA/C3' positions in each chain are used for
1135  superposition/scoring.
1136  :type subsampling: :class:`int`
1137  :param chem_mapping_result: Pro param. The result of
1138  :func:`~GetChemMapping` where you provided
1139  *model*. If set, *model* parameter is not
1140  used.
1141  :type chem_mapping_result: :class:`tuple`
1142  :returns: A :class:`MappingResult`
1143  """
1144 
1145  strategies = ["naive", "greedy_single", "greedy_single_centroid",
1146  "greedy_iterative", "heuristic"]
1147 
1148  if strategy not in strategies:
1149  raise RuntimeError(f"strategy must be {strategies}")
1150 
1151  if chem_mapping_result is None:
1152  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1153  self.GetChemMappingGetChemMapping(model)
1154  else:
1155  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1156  chem_mapping_result
1157  ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1158  self.chem_group_alignmentschem_group_alignments,
1159  chem_mapping,
1160  chem_group_alns)
1161 
1162  # check for the simplest case
1163  one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
1164  if one_to_one is not None:
1165  alns = dict()
1166  for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
1167  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1168  if ref_ch is not None and mdl_ch is not None:
1169  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1170  alns[(ref_ch, mdl_ch)] = aln
1171  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1172  mdl_chains_without_chem_mapping, one_to_one, alns)
1173 
1174  trg_group_pos, mdl_group_pos = _GetRefPos(self.targettarget, mdl,
1175  self.chem_group_alignmentschem_group_alignments,
1176  chem_group_alns,
1177  max_pos = subsampling)
1178 
1179  if strategy == "heuristic":
1180  if _NMappingsWithin(self.chem_groupschem_groups, chem_mapping,
1181  heuristic_n_max_naive):
1182  strategy = "naive"
1183  else:
1184  strategy = "greedy_iterative"
1185 
1186  mapping = None
1187 
1188  if strategy.startswith("greedy"):
1189  # get transforms of any mdl chain onto any trg chain in same chem
1190  # group that fulfills gdtts threshold
1191  initial_transforms = list()
1192  initial_mappings = list()
1193  for trg_pos, trg_chains, mdl_pos, mdl_chains in zip(trg_group_pos,
1194  self.chem_groupschem_groups,
1195  mdl_group_pos,
1196  chem_mapping):
1197  for t_pos, t in zip(trg_pos, trg_chains):
1198  for m_pos, m in zip(mdl_pos, mdl_chains):
1199  if len(t_pos) >= 3 and len(m_pos) >= 3:
1200  transform = _GetSuperposition(m_pos, t_pos,
1201  False).transformation
1202  initial_transforms.append(transform)
1203  initial_mappings.append((t,m))
1204 
1205  if strategy == "greedy_single":
1206  mapping = _SingleRigidRMSD(initial_transforms,
1207  initial_mappings,
1208  self.chem_groupschem_groups,
1209  chem_mapping,
1210  trg_group_pos,
1211  mdl_group_pos)
1212 
1213  elif strategy == "greedy_single_centroid":
1214  mapping = _SingleRigidCentroid(initial_transforms,
1215  initial_mappings,
1216  self.chem_groupschem_groups,
1217  chem_mapping,
1218  trg_group_pos,
1219  mdl_group_pos)
1220 
1221  elif strategy == "greedy_iterative":
1222  mapping = _IterativeRigidRMSD(initial_transforms,
1223  initial_mappings,
1224  self.chem_groupschem_groups, chem_mapping,
1225  trg_group_pos, mdl_group_pos)
1226  elif strategy == "naive":
1227  mapping = _NaiveRMSD(self.chem_groupschem_groups, chem_mapping,
1228  trg_group_pos, mdl_group_pos,
1229  self.n_max_naiven_max_naive)
1230 
1231  # translate mapping format and return
1232  final_mapping = list()
1233  for ref_chains in self.chem_groupschem_groups:
1234  mapped_mdl_chains = list()
1235  for ref_ch in ref_chains:
1236  if ref_ch in mapping:
1237  mapped_mdl_chains.append(mapping[ref_ch])
1238  else:
1239  mapped_mdl_chains.append(None)
1240  final_mapping.append(mapped_mdl_chains)
1241 
1242  alns = dict()
1243  for ref_group, mdl_group in zip(self.chem_groupschem_groups, final_mapping):
1244  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1245  if ref_ch is not None and mdl_ch is not None:
1246  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1247  alns[(ref_ch, mdl_ch)] = aln
1248 
1249  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1250  mdl_chains_without_chem_mapping, final_mapping, alns)
1251 
1252 
1253  def GetAFMMapping(self, model, chem_mapping_result = None):
1254  """Identify chain mapping as in AlphaFold-Multimer (AFM)
1255 
1256  Described in section 7.3 of
1257 
1258  Richard Evans, Michael O’Neill, Alexander Pritzel, Natasha Antropova,
1259  Andrew Senior, Tim Green, Augustin Žídek, Russ Bates, Sam Blackwell,
1260  Jason Yim, et al. Protein complex prediction with AlphaFold-Multimer.
1261  bioRxiv preprint bioRxiv:10.1101/2021.10.04.463034, 2021.
1262 
1263  To summarize (largely follows the description in the AF-Multimer paper):
1264  One anchor chain in the ground truth (reference) is selected. If the
1265  reference has stoichiometry A3B2, an arbitary chain in B is selected
1266  as it has smaller ambiguity. In a tie, for example A2B2, the longest
1267  chain in A, B is selected.
1268 
1269  Given a model chain with same sequence as the reference anchor chain,
1270  a CA-RMSD (C3' for nucleotides) based superposition is performed. Chains
1271  are then greedily assigned by minimum distance of their geometric
1272  centers. This procedure is repeated for every model chain with same
1273  sequence and the assignment with minimal RMSD of the geometric centers
1274  is returned.
1275 
1276  A modification of this algorithm allows to deal with different
1277  stoichiometries in reference and model. In the anchor selection, this
1278  function ensures that there is at least one model chain that can be
1279  mapped to this anchor. If the logic above does not identify a mappable
1280  chain pair for the smallest group, it continues with the second smallest
1281  group etc.
1282 
1283  :param model: Model to map
1284  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1285  :param chem_mapping_result: Pro param. The result of
1286  :func:`~GetChemMapping` where you provided
1287  *model*. If set, *model* parameter is not
1288  used.
1289  :type chem_mapping_result: :class:`tuple`
1290  :returns: A :class:`MappingResult`
1291  """
1292  if chem_mapping_result is None:
1293  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1294  self.GetChemMappingGetChemMapping(model)
1295  else:
1296  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1297  chem_mapping_result
1298  ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1299  self.chem_group_alignmentschem_group_alignments,
1300  chem_mapping,
1301  chem_group_alns)
1302 
1303  # check for the simplest case
1304  one_to_one = _CheckOneToOneMapping(self.chem_groupschem_groups, chem_mapping)
1305  if one_to_one is not None:
1306  alns = dict()
1307  for ref_group, mdl_group in zip(self.chem_groupschem_groups, one_to_one):
1308  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1309  if ref_ch is not None and mdl_ch is not None:
1310  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1311  alns[(ref_ch, mdl_ch)] = aln
1312  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1313  mdl_chains_without_chem_mapping, one_to_one, alns)
1314 
1315  # Select anchor chain in reference
1316  ref_group_idx = None
1317  ref_ch = None
1318 
1319  chem_group_sizes = list(set([len(g) for g in self.chem_groupschem_groups]))
1320  chem_group_sizes.sort()
1321 
1322  for min_size in chem_group_sizes:
1323  min_chem_groups = list()
1324  for chem_group_idx, chem_group in enumerate(self.chem_groupschem_groups):
1325  if len(chem_group) == min_size:
1326  min_chem_groups.append(chem_group_idx)
1327  if len(min_chem_groups) == 1:
1328  if len(chem_mapping[min_chem_groups[0]]) > 0:
1329  # Unique smallest chem group that has at least one
1330  # model chain for mapping
1331  # => select arbitrary chain
1332  ref_group_idx = min_chem_groups[0]
1333  ref_ch = self.chem_groupschem_groups[min_chem_groups[0]][0]
1334  else:
1335  # No unique smallest chem group
1336  # => select the largest chain from any of these chem groups
1337  # that has at least one model chain for mapping
1338  n_max = 0
1339  for chem_group_idx in min_chem_groups:
1340  if len(chem_mapping[chem_group_idx]) > 0:
1341  for ch in self.chem_groupschem_groups[chem_group_idx]:
1342  n = self.targettarget.FindChain(ch).GetResidueCount()
1343  if n > n_max:
1344  n_max = n
1345  ref_group_idx = chem_group_idx
1346  ref_ch = ch
1347  if ref_group_idx is not None and ref_ch is not None:
1348  break
1349 
1350  # One transformation for each model chain that can be mapped to this one
1351  # anchor in the reference
1352  ref_bb_pos = _GetBBPos(self.targettarget)
1353  mdl_bb_pos = _GetBBPos(mdl)
1354  transformations = list()
1355  for mdl_ch in chem_mapping[ref_group_idx]:
1356  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1357  # assertion can be removed after some testing
1358  l1 = len(ref_bb_pos[ref_ch])
1359  l2 = len(aln.GetSequence(0).GetGaplessString())
1360  assert(l1 == l2)
1361  l1 = len(mdl_bb_pos[mdl_ch])
1362  l2 = len(aln.GetSequence(1).GetGaplessString())
1363  assert(l1 == l2)
1364  # extract aligned positions
1365  ref_idx = 0
1366  mdl_idx = 0
1367  ref_pos = geom.Vec3List()
1368  mdl_pos = geom.Vec3List()
1369  for col in aln:
1370  if col[0] != '-' and col[1] != '-':
1371  ref_pos.append(ref_bb_pos[ref_ch][ref_idx])
1372  mdl_pos.append(mdl_bb_pos[mdl_ch][mdl_idx])
1373  if col[0] != '-':
1374  ref_idx += 1
1375  if col[1] != '-':
1376  mdl_idx += 1
1377  t = _GetSuperposition(mdl_pos, ref_pos, False).transformation
1378  transformations.append(t)
1379 
1380  ref_centers = {k: v.center for k,v in ref_bb_pos.items()}
1381  mdl_chains = list()
1382  mdl_centers = list()
1383  for k,v in mdl_bb_pos.items():
1384  mdl_chains.append(k)
1385  mdl_centers.append(v.center)
1386  mdl_centers = geom.Vec3List(mdl_centers)
1387 
1388  best_rmsd = float("inf")
1389  best_mapping = None # k: ref_ch, v: mdl_ch
1390 
1391  for t in transformations:
1392  tmp = geom.Vec3List(mdl_centers)
1393  tmp.ApplyTransform(t)
1394  # somehow the Vec3List did not really work in a dict comprehension
1395  t_mdl_centers = dict()
1396  for i in range(len(tmp)):
1397  t_mdl_centers[mdl_chains[i]] = tmp[i]
1398 
1399  # one entry for each possible assignment
1400  # (<squared_distance>, <ref_ch>, <mdl_ch>)
1401  tmp = list()
1402  for a, b in zip(self.chem_groupschem_groups, chem_mapping):
1403  for ref_ch in a:
1404  for mdl_ch in b:
1405  d = geom.Length2(ref_centers[ref_ch]-t_mdl_centers[mdl_ch])
1406  tmp.append((d, ref_ch, mdl_ch))
1407  tmp.sort()
1408 
1409  mapping = dict()
1410  mapped_mdl_chains = set()
1411  for item in tmp:
1412  if item[1] not in mapping and item[2] not in mapped_mdl_chains:
1413  mapping[item[1]] = item[2]
1414  mapped_mdl_chains.add(item[2])
1415 
1416  if len(mapping) == 0:
1417  raise RuntimeError("Empty mapping in GetAFMMapping")
1418  elif len(mapping) == 1:
1419  # by definition zero
1420  rmsd = 0.0
1421  elif len(mapping) == 2:
1422  # no superposition possible with two positions
1423  # derive rmsd from the distance difference between
1424  # the centroid pairs in reference and model
1425  ref_p = [ref_centers[k] for k in mapping.keys()]
1426  mdl_p = [t_mdl_centers[v] for v in mapping.values()]
1427  ref_d = geom.Distance(ref_p[0], ref_p[1])
1428  mdl_d = geom.Distance(mdl_p[0], mdl_p[1])
1429  # compute RMSD when placing pair of coordinates with smaller
1430  # distance in the middle of pair of cooridnates with larger
1431  # distance
1432  dd = abs(ref_d-mdl_d) # distance difference
1433  # rmsd = sqrt(1/2 * ((dd/2)**2 + (dd/2)**2))
1434  # => rmsd = dd/2
1435  rmsd = dd/2
1436  else:
1437  # go for classic Kabsch superposition
1438  mapped_ref_centers = geom.Vec3List()
1439  mapped_mdl_centers = geom.Vec3List()
1440  for ref_ch, mdl_ch in mapping.items():
1441  mapped_ref_centers.append(ref_centers[ref_ch])
1442  mapped_mdl_centers.append(t_mdl_centers[mdl_ch])
1443  rmsd = _GetSuperposition(mapped_ref_centers,
1444  mapped_mdl_centers, False).rmsd
1445  if rmsd < best_rmsd:
1446  best_rmsd = rmsd
1447  best_mapping = mapping
1448 
1449  # translate mapping format and return
1450  final_mapping = list()
1451  for ref_chains in self.chem_groupschem_groups:
1452  mapped_mdl_chains = list()
1453  for ref_ch in ref_chains:
1454  if ref_ch in best_mapping:
1455  mapped_mdl_chains.append(best_mapping[ref_ch])
1456  else:
1457  mapped_mdl_chains.append(None)
1458  final_mapping.append(mapped_mdl_chains)
1459 
1460  alns = dict()
1461  for ref_group, mdl_group in zip(self.chem_groupschem_groups, final_mapping):
1462  for ref_ch, mdl_ch in zip(ref_group, mdl_group):
1463  if ref_ch is not None and mdl_ch is not None:
1464  aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1465  alns[(ref_ch, mdl_ch)] = aln
1466 
1467  return MappingResult(self.targettarget, mdl, self.chem_groupschem_groups, chem_mapping,
1468  mdl_chains_without_chem_mapping, final_mapping, alns)
1469 
1470 
1471  def GetMapping(self, model, n_max_naive = 40320):
1472  """ Convenience function to get mapping with currently preferred method
1473 
1474  If number of possible chain mappings is <= *n_max_naive*, a naive
1475  QS-score mapping is performed and optimal QS-score is guaranteed.
1476  For anything else, a QS-score mapping with the greedy_full strategy is
1477  performed (greedy_prune_contact_map = True). The default for
1478  *n_max_naive* of 40320 corresponds to an octamer (8!=40320). A
1479  structure with stoichiometry A6B2 would be 6!*2!=1440 etc.
1480 
1481  If :attr:`~target` has nucleotide sequences, the QS-score target
1482  function is replaced with a backbone only lDDT score that has
1483  an inclusion radius of 30A.
1484  """
1485  if len(self.polynuc_seqspolynuc_seqs) > 0:
1486  return self.GetlDDTMappingGetlDDTMapping(model, strategy = "heuristic",
1487  inclusion_radius = 30.0,
1488  heuristic_n_max_naive = n_max_naive)
1489  else:
1490  return self.GetQSScoreMappingGetQSScoreMapping(model, strategy="heuristic",
1491  greedy_prune_contact_map=True,
1492  heuristic_n_max_naive = n_max_naive)
1493 
1494  def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0,
1495  thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False,
1496  only_interchain=False, chem_mapping_result = None,
1497  global_mapping = None):
1498  """ Identify *topn* representations of *substructure* in *model*
1499 
1500  *substructure* defines a subset of :attr:`~target` for which one
1501  wants the *topn* representations in *model*. Representations are scored
1502  and sorted by lDDT.
1503 
1504  :param substructure: A :class:`ost.mol.EntityView` which is a subset of
1505  :attr:`~target`. Should be selected with the
1506  OpenStructure query language. Example: if you're
1507  interested in residues with number 42,43 and 85 in
1508  chain A:
1509  ``substructure=mapper.target.Select("cname=A and rnum=42,43,85")``
1510  A :class:`RuntimeError` is raised if *substructure*
1511  does not refer to the same underlying
1512  :class:`ost.mol.EntityHandle` as :attr:`~target`.
1513  :type substructure: :class:`ost.mol.EntityView`
1514  :param model: Structure in which one wants to find representations for
1515  *substructure*
1516  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1517  :param topn: Max number of representations that are returned
1518  :type topn: :class:`int`
1519  :param inclusion_radius: Inclusion radius for lDDT
1520  :type inclusion_radius: :class:`float`
1521  :param thresholds: Thresholds for lDDT
1522  :type thresholds: :class:`list` of :class:`float`
1523  :param bb_only: Only consider backbone atoms in lDDT computation
1524  :type bb_only: :class:`bool`
1525  :param only_interchain: Only score interchain contacts in lDDT. Useful
1526  if you want to identify interface patches.
1527  :type only_interchain: :class:`bool`
1528  :param chem_mapping_result: Pro param. The result of
1529  :func:`~GetChemMapping` where you provided
1530  *model*. If set, *model* parameter is not
1531  used.
1532  :type chem_mapping_result: :class:`tuple`
1533  :param global_mapping: Pro param. Specify a global mapping result. This
1534  fully defines the desired representation in the
1535  model but extracts it and enriches it with all
1536  the nice attributes of :class:`ReprResult`.
1537  The target attribute in *global_mapping* must be
1538  of the same entity as self.target and the model
1539  attribute of *global_mapping* must be of the same
1540  entity as *model*.
1541  :type global_mapping: :class:`MappingResult`
1542  :returns: :class:`list` of :class:`ReprResult`
1543  """
1544 
1545  if topn < 1:
1546  raise RuntimeError("topn must be >= 1")
1547 
1548  if global_mapping is not None:
1549  # ensure that this mapping is derived from the same structures
1550  if global_mapping.target.handle.GetHashCode() != \
1551  self.targettarget.handle.GetHashCode():
1552  raise RuntimeError("global_mapping.target must be the same "
1553  "entity as self.target")
1554  if global_mapping.model.handle.GetHashCode() != \
1555  model.handle.GetHashCode():
1556  raise RuntimeError("global_mapping.model must be the same "
1557  "entity as model param")
1558 
1559  # check whether substructure really is a subset of self.target
1560  for r in substructure.residues:
1561  ch_name = r.GetChain().GetName()
1562  rnum = r.GetNumber()
1563  target_r = self.targettarget.FindResidue(ch_name, rnum)
1564  if not target_r.IsValid():
1565  raise RuntimeError(f"substructure has residue "
1566  f"{r.GetQualifiedName()} which is not in "
1567  f"self.target")
1568  if target_r.handle.GetHashCode() != r.handle.GetHashCode():
1569  raise RuntimeError(f"substructure has residue "
1570  f"{r.GetQualifiedName()} which has an "
1571  f"equivalent in self.target but it does "
1572  f"not refer to the same underlying "
1573  f"EntityHandle")
1574  for a in r.atoms:
1575  target_a = target_r.FindAtom(a.GetName())
1576  if not target_a.IsValid():
1577  raise RuntimeError(f"substructure has atom "
1578  f"{a.GetQualifiedName()} which is not "
1579  f"in self.target")
1580  if a.handle.GetHashCode() != target_a.handle.GetHashCode():
1581  raise RuntimeError(f"substructure has atom "
1582  f"{a.GetQualifiedName()} which has an "
1583  f"equivalent in self.target but it does "
1584  f"not refer to the same underlying "
1585  f"EntityHandle")
1586 
1587  # check whether it contains either CA or C3'
1588  ca = r.FindAtom("CA")
1589  c3 = r.FindAtom("C3'") # FindAtom with prime in string is tested
1590  # and works
1591  if not ca.IsValid() and not c3.IsValid():
1592  raise RuntimeError("All residues in substructure must contain "
1593  "a backbone atom named CA or C3\'")
1594 
1595  # perform mapping and alignments on full structures
1596  if chem_mapping_result is None:
1597  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1598  self.GetChemMappingGetChemMapping(model)
1599  else:
1600  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1601  chem_mapping_result
1602  ref_mdl_alns = _GetRefMdlAlns(self.chem_groupschem_groups,
1603  self.chem_group_alignmentschem_group_alignments,
1604  chem_mapping,
1605  chem_group_alns)
1606 
1607  # Get residue indices relative to full target chain
1608  substructure_res_indices = dict()
1609  for ch in substructure.chains:
1610  full_ch = self.targettarget.FindChain(ch.GetName())
1611  idx = [full_ch.GetResidueIndex(r.GetNumber()) for r in ch.residues]
1612  substructure_res_indices[ch.GetName()] = idx
1613 
1614  # strip down variables to make them specific to substructure
1615  # keep only chem_groups which are present in substructure
1616  substructure_chem_groups = list()
1617  substructure_chem_mapping = list()
1618 
1619  chnames = set([ch.GetName() for ch in substructure.chains])
1620  for chem_group, mapping in zip(self.chem_groupschem_groups, chem_mapping):
1621  substructure_chem_group = [ch for ch in chem_group if ch in chnames]
1622  if len(substructure_chem_group) > 0:
1623  substructure_chem_groups.append(substructure_chem_group)
1624  substructure_chem_mapping.append(mapping)
1625 
1626  # early stopping if no mdl chain can be mapped to substructure
1627  n_mapped_mdl_chains = sum([len(m) for m in substructure_chem_mapping])
1628  if n_mapped_mdl_chains == 0:
1629  return list()
1630 
1631  # strip the reference sequence in alignments to only contain
1632  # sequence from substructure
1633  substructure_ref_mdl_alns = dict()
1634  mdl_views = dict()
1635  for ch in mdl.chains:
1636  mdl_views[ch.GetName()] = _CSel(mdl, [ch.GetName()])
1637  for chem_group, mapping in zip(substructure_chem_groups,
1638  substructure_chem_mapping):
1639  for ref_ch in chem_group:
1640  for mdl_ch in mapping:
1641  full_aln = ref_mdl_alns[(ref_ch, mdl_ch)]
1642  ref_seq = full_aln.GetSequence(0)
1643  # the ref sequence is tricky... we start with a gap only
1644  # sequence and only add olcs as defined by the residue
1645  # indices that we extracted before...
1646  tmp = ['-'] * len(full_aln)
1647  for idx in substructure_res_indices[ref_ch]:
1648  idx_in_seq = ref_seq.GetPos(idx)
1649  tmp[idx_in_seq] = ref_seq[idx_in_seq]
1650  ref_seq = seq.CreateSequence(ref_ch, ''.join(tmp))
1651  ref_seq.AttachView(_CSel(substructure, [ref_ch]))
1652  mdl_seq = full_aln.GetSequence(1)
1653  mdl_seq = seq.CreateSequence(mdl_seq.GetName(),
1654  mdl_seq.GetString())
1655  mdl_seq.AttachView(mdl_views[mdl_ch])
1656  aln = seq.CreateAlignment()
1657  aln.AddSequence(ref_seq)
1658  aln.AddSequence(mdl_seq)
1659  substructure_ref_mdl_alns[(ref_ch, mdl_ch)] = aln
1660 
1661  lddt_scorer = lddt.lDDTScorer(substructure,
1662  inclusion_radius = inclusion_radius,
1663  bb_only = bb_only)
1664  scored_mappings = list()
1665 
1666  if global_mapping:
1667  # construct mapping of substructure from global mapping
1668  flat_mapping = global_mapping.GetFlatMapping()
1669  mapping = list()
1670  for chem_group, chem_mapping in zip(substructure_chem_groups,
1671  substructure_chem_mapping):
1672  chem_group_mapping = list()
1673  for ch in chem_group:
1674  if ch in flat_mapping:
1675  mdl_ch = flat_mapping[ch]
1676  if mdl_ch in chem_mapping:
1677  chem_group_mapping.append(mdl_ch)
1678  else:
1679  chem_group_mapping.append(None)
1680  else:
1681  chem_group_mapping.append(None)
1682  mapping.append(chem_group_mapping)
1683  mappings = [mapping]
1684  else:
1685  mappings = list(_ChainMappings(substructure_chem_groups,
1686  substructure_chem_mapping,
1687  self.n_max_naiven_max_naive))
1688 
1689  # This step can be slow so give some hints in logs
1690  msg = "Computing initial ranking of %d chain mappings" % len(mappings)
1691  (ost.LogWarning if len(mappings) > 10000 else ost.LogInfo)(msg)
1692 
1693  for mapping in mappings:
1694  # chain_mapping and alns as input for lDDT computation
1695  lddt_chain_mapping = dict()
1696  lddt_alns = dict()
1697  n_res_aln = 0
1698  for ref_chem_group, mdl_chem_group in zip(substructure_chem_groups,
1699  mapping):
1700  for ref_ch, mdl_ch in zip(ref_chem_group, mdl_chem_group):
1701  # some mdl chains can be None
1702  if mdl_ch is not None:
1703  lddt_chain_mapping[mdl_ch] = ref_ch
1704  aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1705  lddt_alns[mdl_ch] = aln
1706  tmp = [int(c[0] != '-' and c[1] != '-') for c in aln]
1707  n_res_aln += sum(tmp)
1708  # don't compute lDDT if no single residue in mdl and ref is aligned
1709  if n_res_aln == 0:
1710  continue
1711 
1712  lDDT, _ = lddt_scorer.lDDT(mdl, thresholds=thresholds,
1713  chain_mapping=lddt_chain_mapping,
1714  residue_mapping = lddt_alns,
1715  check_resnames = False,
1716  no_intrachain = only_interchain)
1717 
1718  if lDDT is None:
1719  ost.LogVerbose("No valid contacts in the reference")
1720  lDDT = 0.0 # that means, that we have not a single valid contact
1721  # in lDDT. For the code below to work, we just set it
1722  # to a terrible score => 0.0
1723 
1724  if len(scored_mappings) == 0:
1725  scored_mappings.append((lDDT, mapping))
1726  elif len(scored_mappings) < topn:
1727  scored_mappings.append((lDDT, mapping))
1728  scored_mappings.sort(reverse=True, key=lambda x: x[0])
1729  elif lDDT > scored_mappings[-1][0]:
1730  scored_mappings.append((lDDT, mapping))
1731  scored_mappings.sort(reverse=True, key=lambda x: x[0])
1732  scored_mappings = scored_mappings[:topn]
1733 
1734  # finalize and return
1735  results = list()
1736  for scored_mapping in scored_mappings:
1737  ref_view = substructure.handle.CreateEmptyView()
1738  mdl_view = mdl.handle.CreateEmptyView()
1739  for ref_ch_group, mdl_ch_group in zip(substructure_chem_groups,
1740  scored_mapping[1]):
1741  for ref_ch, mdl_ch in zip(ref_ch_group, mdl_ch_group):
1742  if ref_ch is not None and mdl_ch is not None:
1743  aln = substructure_ref_mdl_alns[(ref_ch, mdl_ch)]
1744  for col in aln:
1745  if col[0] != '-' and col[1] != '-':
1746  ref_view.AddResidue(col.GetResidue(0),
1747  mol.ViewAddFlag.INCLUDE_ALL)
1748  mdl_view.AddResidue(col.GetResidue(1),
1749  mol.ViewAddFlag.INCLUDE_ALL)
1750  results.append(ReprResult(scored_mapping[0], substructure,
1751  ref_view, mdl_view))
1752  return results
1753 
1754  def GetNMappings(self, model):
1755  """ Returns number of possible mappings
1756 
1757  :param model: Model with chains that are mapped onto
1758  :attr:`chem_groups`
1759  :type model: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1760  """
1761  chem_mapping, chem_group_alns, mdl_chains_without_chem_mapping, mdl = \
1762  self.GetChemMappingGetChemMapping(model)
1763  return _NMappings(self.chem_groupschem_groups, chem_mapping)
1764 
1765  def ProcessStructure(self, ent):
1766  """ Entity processing for chain mapping
1767 
1768  * Selects view containing peptide and nucleotide residues which have
1769  required backbone atoms present - for peptide residues thats
1770  N, CA, C and CB (no CB for GLY), for nucleotide residues thats
1771  O5', C5', C4', C3' and O3'.
1772  * filters view by chain lengths, see *min_pep_length* and
1773  *min_nuc_length* in constructor
1774  * Extracts atom sequences for each chain in that view
1775  * If residue number alignments are used, strictly increasing residue
1776  numbers without insertion codes are ensured in each chain
1777 
1778  :param ent: Entity to process
1779  :type ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1780  :returns: Tuple with 3 elements: 1) :class:`ost.mol.EntityView`
1781  containing peptide and nucleotide residues 2)
1782  :class:`ost.seq.SequenceList` containing ATOMSEQ sequences
1783  for each polypeptide chain in returned view
1784  3) same for polynucleotide chains
1785  """
1786  view = ent.CreateEmptyView()
1787  exp_pep_atoms = ["N", "CA", "C", "CB"]
1788  exp_nuc_atoms = ["\"O5'\"", "\"C5'\"", "\"C4'\"", "\"C3'\"", "\"O3'\""]
1789  pep_query = "peptide=true and aname=" + ','.join(exp_pep_atoms)
1790  nuc_query = "nucleotide=true and aname=" + ','.join(exp_nuc_atoms)
1791 
1792  pep_sel = ent.Select(pep_query)
1793  for r in pep_sel.residues:
1794  if len(r.atoms) == 4:
1795  view.AddResidue(r.handle, mol.INCLUDE_ALL)
1796  elif r.name == "GLY" and len(r.atoms) == 3:
1797  atom_names = [a.GetName() for a in r.atoms]
1798  if sorted(atom_names) == ["C", "CA", "N"]:
1799  view.AddResidue(r.handle, mol.INCLUDE_ALL)
1800 
1801  nuc_sel = ent.Select(nuc_query)
1802  for r in nuc_sel.residues:
1803  if len(r.atoms) == 5:
1804  view.AddResidue(r.handle, mol.INCLUDE_ALL)
1805 
1806  polypep_seqs = seq.CreateSequenceList()
1807  polynuc_seqs = seq.CreateSequenceList()
1808 
1809  if len(view.residues) == 0:
1810  # no residues survived => return
1811  return (view, polypep_seqs, polynuc_seqs)
1812 
1813  for ch in view.chains:
1814  n_res = len(ch.residues)
1815  n_pep = sum([r.IsPeptideLinking() for r in ch.residues])
1816  n_nuc = sum([r.IsNucleotideLinking() for r in ch.residues])
1817 
1818  # guarantee that we have either pep or nuc (no mix of the two)
1819  if n_pep > 0 and n_nuc > 0:
1820  raise RuntimeError(f"Must not mix peptide and nucleotide linking "
1821  f"residues in same chain ({ch.GetName()})")
1822 
1823  if (n_pep + n_nuc) != n_res:
1824  raise RuntimeError("All residues must either be peptide_linking "
1825  "or nucleotide_linking")
1826 
1827  # filter out short chains
1828  if n_pep > 0 and n_pep < self.min_pep_lengthmin_pep_length:
1829  ost.LogWarning("Skipping short peptide chain: %s" % ch.name)
1830  continue
1831 
1832  if n_nuc > 0 and n_nuc < self.min_nuc_lengthmin_nuc_length:
1833  ost.LogWarning("Skipping short nucleotide chain: %s" % ch.name)
1834  continue
1835 
1836  # the superfast residue number based alignment adds some
1837  # restrictions on the numbers themselves:
1838  # 1) no insertion codes 2) strictly increasing
1839  if self.resnum_alignmentsresnum_alignments:
1840  # check if no insertion codes are present in residue numbers
1841  ins_codes = [r.GetNumber().GetInsCode() for r in ch.residues]
1842  if len(set(ins_codes)) != 1 or ins_codes[0] != '\0':
1843  raise RuntimeError("Residue numbers in input structures must not "
1844  "contain insertion codes")
1845 
1846  # check if residue numbers are strictly increasing
1847  nums = [r.GetNumber().GetNum() for r in ch.residues]
1848  if not all(i < j for i, j in zip(nums, nums[1:])):
1849  raise RuntimeError("Residue numbers in input structures must be "
1850  "strictly increasing for each chain")
1851 
1852  s = ''.join([r.one_letter_code for r in ch.residues])
1853  s = seq.CreateSequence(ch.GetName(), s)
1854  if n_pep == n_res:
1855  polypep_seqs.AddSequence(s)
1856  elif n_nuc == n_res:
1857  polynuc_seqs.AddSequence(s)
1858  else:
1859  raise RuntimeError("This shouldnt happen")
1860 
1861  if len(polypep_seqs) == 0 and len(polynuc_seqs) == 0:
1862  raise RuntimeError(f"No chain fulfilled minimum length requirement "
1863  f"to be considered in chain mapping "
1864  f"({self.min_pep_length} for peptide chains, "
1865  f"{self.min_nuc_length} for nucleotide chains) "
1866  f"- mapping failed")
1867 
1868  # select for chains for which we actually extracted the sequence
1869  chain_names = [s.name for s in polypep_seqs]
1870  chain_names += [s.name for s in polynuc_seqs]
1871  view = _CSel(view, chain_names)
1872 
1873  return (view, polypep_seqs, polynuc_seqs)
1874 
1875  def NWAlign(self, s1, s2, stype):
1876  """ Access to internal sequence alignment functionality
1877 
1878  Performs Needleman-Wunsch alignment with parameterization
1879  setup at ChainMapper construction
1880 
1881  :param s1: First sequence to align
1882  :type s1: :class:`ost.seq.SequenceHandle`
1883  :param s2: Second sequence to align
1884  :type s2: :class:`ost.seq.SequenceHandle`
1885  :param stype: Type of sequences to align, must be in
1886  [:class:`ost.mol.ChemType.AMINOACIDS`,
1887  :class:`ost.mol.ChemType.NUCLEOTIDES`]
1888  :returns: Pairwise alignment of s1 and s2
1889  """
1890  if stype == mol.ChemType.AMINOACIDS:
1891  return seq.alg.SemiGlobalAlign(s1, s2, self.pep_subst_matpep_subst_mat,
1892  gap_open=self.pep_gap_openpep_gap_open,
1893  gap_ext=self.pep_gap_extpep_gap_ext)[0]
1894  elif stype == mol.ChemType.NUCLEOTIDES:
1895  return seq.alg.SemiGlobalAlign(s1, s2, self.nuc_subst_matnuc_subst_mat,
1896  gap_open=self.nuc_gap_opennuc_gap_open,
1897  gap_ext=self.nuc_gap_extnuc_gap_ext)[0]
1898  else:
1899  raise RuntimeError("Invalid ChemType")
1900  return aln
1901 
1902  def SEQRESAlign(self, seqres, s, s_ent):
1903  """ Access to internal sequence alignment functionality
1904 
1905  Performs alignment on SEQRES. Residue numbers for *s* are
1906  extracted from *s_ent*. Residue numbers start from 1, i.e.
1907  1 aligns to the first element in *seqres*.
1908 
1909  :param seqres: SEQRES sequence
1910  :type seqres: :class:`ost.seq.SequenceHandle`
1911  :param s: Sequence to align
1912  :type s: :class:`ost.seq.SequenceHandle`
1913  :param s_ent: Structure which must contain a chain of same name as *s*.
1914  This chain must have the exact same number of residues
1915  as characters in *s*.
1916  :type s_ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1917  """
1918 
1919  ch = s_ent.FindChain(s.name)
1920 
1921  if not ch.IsValid():
1922  raise RuntimeError("s_ent lacks required chain in SEQRESAlign")
1923 
1924  if len(s) != ch.GetResidueCount():
1925  raise RuntimeError("Sequence/structure mismatch in SEQRESAlign")
1926 
1927  rnums = [r.GetNumber().GetNum() for r in ch.residues]
1928  max_rnum = max(len(seqres), max(rnums))
1929 
1930  # potentially add some trailing gaps
1931  seqres_s = seqres.GetGaplessString() + '-'*(max_rnum-len(seqres))
1932 
1933  olcs = ['-'] * max_rnum
1934  for olc, num in zip(s, rnums):
1935  olcs[num-1] = olc
1936 
1937  aln = seq.CreateAlignment()
1938  aln.AddSequence(seq.CreateSequence(seqres.GetName(), seqres_s))
1939  aln.AddSequence(seq.CreateSequence(s.GetName(), ''.join(olcs)))
1940  return aln
1941 
1942  def ResNumAlign(self, s1, s2, s1_ent, s2_ent):
1943  """ Access to internal sequence alignment functionality
1944 
1945  Performs residue number based alignment. Residue numbers are extracted
1946  from *s1_ent*/*s2_ent*.
1947 
1948  :param s1: First sequence to align
1949  :type s1: :class:`ost.seq.SequenceHandle`
1950  :param s2: Second sequence to align
1951  :type s2: :class:`ost.seq.SequenceHandle`
1952  :param s1_ent: Structure as source for residue numbers in *s1*. Must
1953  contain a chain named after sequence name in *s1*. This
1954  chain must have the exact same number of residues as
1955  characters in s1.
1956  :type s1_ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1957  :param s2_ent: Same for *s2*.
1958  :type s2_ent: :class:`ost.mol.EntityView`/:class:`ost.mol.EntityHandle`
1959 
1960  :returns: Pairwise alignment of s1 and s2
1961  """
1962  ch1 = s1_ent.FindChain(s1.name)
1963  ch2 = s2_ent.FindChain(s2.name)
1964 
1965  if not ch1.IsValid():
1966  raise RuntimeError("s1_ent lacks requested chain in ResNumAlign")
1967 
1968  if not ch2.IsValid():
1969  raise RuntimeError("s2_ent lacks requested chain in ResNumAlign")
1970 
1971  if len(s1) != ch1.GetResidueCount():
1972  raise RuntimeError("Sequence/structure mismatch in ResNumAlign")
1973 
1974  if len(s2) != ch2.GetResidueCount():
1975  raise RuntimeError("Sequence/structure mismatch in ResNumAlign")
1976 
1977  rnums1 = [r.GetNumber().GetNum() for r in ch1.residues]
1978  rnums2 = [r.GetNumber().GetNum() for r in ch2.residues]
1979 
1980  min_num = min(rnums1[0], rnums2[0])
1981  max_num = max(rnums1[-1], rnums2[-1])
1982  aln_length = max_num - min_num + 1
1983 
1984  aln_s1 = ['-'] * aln_length
1985  for olc, rnum in zip(s1, rnums1):
1986  aln_s1[rnum-min_num] = olc
1987 
1988  aln_s2 = ['-'] * aln_length
1989  for olc, rnum in zip(s2, rnums2):
1990  aln_s2[rnum-min_num] = olc
1991 
1992  aln = seq.CreateAlignment()
1993  aln.AddSequence(seq.CreateSequence(s1.GetName(), ''.join(aln_s1)))
1994  aln.AddSequence(seq.CreateSequence(s2.GetName(), ''.join(aln_s2)))
1995  return aln
1996 
1997 
1998  def _ComputeChemGroups(self):
1999  """ Sets properties :attr:`~chem_groups`,
2000  :attr:`~chem_group_alignments`, :attr:`~chem_group_ref_seqs`,
2001  :attr:`~chem_group_types`
2002  """
2003 
2004  if self.seqres_setseqres_set:
2005  self._chem_group_alignments_chem_group_alignments, self._chem_group_types_chem_group_types =\
2006  self._ChemGroupAlignmentsFromSEQRES_ChemGroupAlignmentsFromSEQRES()
2007  else:
2008  self._chem_group_alignments_chem_group_alignments, self._chem_group_types_chem_group_types =\
2009  self._ChemGroupAlignmentsFromATOMSEQ_ChemGroupAlignmentsFromATOMSEQ()
2010 
2011  self._chem_group_ref_seqs_chem_group_ref_seqs = seq.CreateSequenceList()
2012  for a in self.chem_group_alignmentschem_group_alignments:
2013  s = seq.CreateSequence(a.GetSequence(0).GetName(),
2014  a.GetSequence(0).GetGaplessString())
2015  self._chem_group_ref_seqs_chem_group_ref_seqs.AddSequence(s)
2016 
2017  self._chem_groups_chem_groups = list()
2018  for a in self.chem_group_alignmentschem_group_alignments:
2019  group = list()
2020  for s_idx in range(1, a.GetCount()):
2021  s = a.GetSequence(s_idx)
2022  group.append(s.GetName())
2023  self._chem_groups_chem_groups.append(group)
2024 
2025 
2026  def _ChemGroupAlignmentsFromSEQRES(self):
2027  """ Groups target sequences based on SEQRES
2028 
2029  returns tuple that can be set as self._chem_group_alignments and
2030  self._chem_group_types
2031  """
2032  pep_groups = self._GroupSequencesSEQRES_GroupSequencesSEQRES(self.polypep_seqspolypep_seqs)
2033  nuc_groups = self._GroupSequencesSEQRES_GroupSequencesSEQRES(self.polynuc_seqspolynuc_seqs)
2034  group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
2035  group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
2036  groups = pep_groups
2037  groups.extend(nuc_groups)
2038 
2039  return (groups, group_types)
2040 
2041 
2042  def _GroupSequencesSEQRES(self, poly_seqs):
2043 
2044  alns = dict()
2045 
2046  for poly_seq in poly_seqs:
2047  sname = poly_seq.GetName()
2048  seqres_name = self.trg_seqres_mappingtrg_seqres_mapping[sname]
2049  if seqres_name not in alns:
2050  aln = seq.CreateAlignment()
2051  seqres = None
2052  for s in self.seqresseqres:
2053  if s.GetName() == seqres_name:
2054  seqres = s
2055  break
2056  if seqres is None:
2057  # each element in self.trg_seqres_mapping is guaranteed
2058  # to have a SEQRES counterpart as checked in ChainMapper
2059  # constructor. So we should never end up here.
2060  raise RuntimeError("You should never get here - contact "
2061  "OST developer")
2062  aln.AddSequence(seqres)
2063  alns[seqres_name] = aln
2064  seqres = alns[seqres_name].GetSequence(0)
2065  aln_length = seqres.GetLength()
2066  atomseq = ['-'] * aln_length
2067  trg_chain = self.targettarget.FindChain(sname)
2068  assert(trg_chain.IsValid())
2069  assert(trg_chain.GetResidueCount() == len(poly_seq))
2070  for olc, r in zip(poly_seq, trg_chain.residues):
2071  num = r.GetNumber().GetNum()
2072  if num < 1 or num > aln_length:
2073  raise RuntimeError(f"Observed residue with invalid number: "
2074  f"\"{r}\" which cannot be aligned to "
2075  f"seqres with id \"{seqres_name}\"")
2076  if olc != seqres[num-1]:
2077  raise RuntimeError(f"Sequence mismatch of residue \"{r}\" "
2078  f"with olc \"{olc}\" and residue number "
2079  f"\"{num}\". Character at that location "
2080  f"in seqres: \"{seqres[num-1]}\"."
2081  f"Seqres name: \"{seqres_name}\"")
2082  atomseq[num-1] = olc
2083  atomseq = ''.join(atomseq)
2084  alns[seqres_name].AddSequence(seq.CreateSequence(sname, atomseq))
2085 
2086  return [aln for aln in alns.values()]
2087 
2088 
2089  def _ChemGroupAlignmentsFromATOMSEQ(self):
2090  """ Groups target sequences based on ATOMSEQ
2091 
2092  returns tuple that can be set as self._chem_group_alignments and
2093  self._chem_group_types
2094  """
2095  pep_groups = self._GroupSequences_GroupSequences(self.polypep_seqspolypep_seqs, self.pep_seqid_thrpep_seqid_thr,
2096  self.min_pep_lengthmin_pep_length,
2097  mol.ChemType.AMINOACIDS)
2098  nuc_groups = self._GroupSequences_GroupSequences(self.polynuc_seqspolynuc_seqs, self.nuc_seqid_thrnuc_seqid_thr,
2099  self.min_nuc_lengthmin_nuc_length,
2100  mol.ChemType.NUCLEOTIDES)
2101 
2102  # pep_groups and nuc_groups give us alignments based on ATOMSEQ.
2103  # For example: If you have polymer chains A,B and C in the same
2104  # group and A is the longest one, you get an alignment that looks
2105  # like:
2106  # A: ASDFE
2107  # B: ASDF-
2108  # C: -SDFE
2109  #
2110  # however, the first sequence in chem group alignments must not be
2111  # bound to any ATOMSEQ and represent the reference sequence. In the
2112  # case of this function, this is simply a copy of sequence A:
2113  # REF: ASDFE
2114  # A: ASDFE
2115  # B: ASDF-
2116  # C: -SDFE
2117 
2118  # do pep_groups
2119  tmp = list()
2120  for a in pep_groups:
2121  new_a = seq.CreateAlignment()
2122  new_a.AddSequence(a.GetSequence(0))
2123  for s_idx in range(a.GetCount()):
2124  new_a.AddSequence(a.GetSequence(s_idx))
2125  tmp.append(new_a)
2126  pep_groups = tmp
2127 
2128  # do nuc groups
2129  tmp = list()
2130  for a in nuc_groups:
2131  new_a = seq.CreateAlignment()
2132  new_a.AddSequence(a.GetSequence(0))
2133  for s_idx in range(a.GetCount()):
2134  new_a.AddSequence(a.GetSequence(s_idx))
2135  tmp.append(new_a)
2136  nuc_groups = tmp
2137 
2138  group_types = [mol.ChemType.AMINOACIDS] * len(pep_groups)
2139  group_types += [mol.ChemType.NUCLEOTIDES] * len(nuc_groups)
2140  groups = pep_groups
2141  groups.extend(nuc_groups)
2142 
2143  return (groups, group_types)
2144 
2145  def _GroupSequences(self, seqs, seqid_thr, min_length, chem_type):
2146  """Get list of alignments representing groups of equivalent sequences
2147 
2148  :param seqid_thr: Threshold used to decide when two chains are identical.
2149  :type seqid_thr: :class:`float`
2150  :param gap_thr: Additional threshold to avoid gappy alignments with high
2151  seqid. Number of aligned columns must be at least this
2152  number.
2153  :type gap_thr: :class:`int`
2154  :param aligner: Helper class to generate pairwise alignments
2155  :type aligner: :class:`_Aligner`
2156  :param chem_type: ChemType of seqs, must be in
2157  [:class:`ost.mol.ChemType.AMINOACIDS`,
2158  :class:`ost.mol.ChemType.NUCLEOTIDES`]
2159  :type chem_type: :class:`ost.mol.ChemType`
2160  :returns: A list of alignments, one alignment for each group
2161  with longest sequence (reference) as first sequence.
2162  :rtype: :class:`ost.seq.AlignmentList`
2163  """
2164  groups = list()
2165  for s_idx in range(len(seqs)):
2166  matching_group = None
2167  for g_idx in range(len(groups)):
2168  for g_s_idx in range(len(groups[g_idx])):
2169  if self.resnum_alignmentsresnum_alignments:
2170  aln = self.ResNumAlignResNumAlign(seqs[s_idx],
2171  seqs[groups[g_idx][g_s_idx]],
2172  self.targettarget, self.targettarget)
2173  else:
2174  aln = self.NWAlignNWAlign(seqs[s_idx],
2175  seqs[groups[g_idx][g_s_idx]],
2176  chem_type)
2177  sid, n_aligned = _GetAlnPropsOne(aln)
2178  if sid >= seqid_thr and n_aligned >= min_length:
2179  matching_group = g_idx
2180  break
2181  if matching_group is not None:
2182  break
2183 
2184  if matching_group is None:
2185  groups.append([s_idx])
2186  else:
2187  groups[matching_group].append(s_idx)
2188 
2189  # sort based on sequence length
2190  sorted_groups = list()
2191  for g in groups:
2192  if len(g) > 1:
2193  tmp = sorted([[len(seqs[i]), i] for i in g], reverse=True)
2194  sorted_groups.append([x[1] for x in tmp])
2195  else:
2196  sorted_groups.append(g)
2197 
2198  # translate from indices back to sequences and directly generate alignments
2199  # of the groups with the longest (first) sequence as reference
2200  aln_list = seq.AlignmentList()
2201  for g in sorted_groups:
2202  if len(g) == 1:
2203  # aln with one single sequence
2204  aln_list.append(seq.CreateAlignment(seqs[g[0]]))
2205  else:
2206  # obtain pairwise aln of first sequence (reference) to all others
2207  alns = seq.AlignmentList()
2208  i = g[0]
2209  for j in g[1:]:
2210  if self.resnum_alignmentsresnum_alignments:
2211  aln = self.ResNumAlignResNumAlign(seqs[i], seqs[j],
2212  self.targettarget, self.targettarget)
2213  else:
2214  aln = self.NWAlignNWAlign(seqs[i], seqs[j], chem_type)
2215  alns.append(aln)
2216  # and merge
2217  aln_list.append(seq.alg.MergePairwiseAlignments(alns, seqs[i]))
2218 
2219  return aln_list
2220 
2221  def _MapSequence(self, ref_seqs, ref_types, s, s_type, s_ent,
2222  seq_id_thr=0.0, min_aln_length=0):
2223  """Tries top map *s* onto any of the sequences in *ref_seqs*
2224 
2225  Computes alignments of *s* to each of the reference sequences of equal type
2226  and sorts them by seqid*fraction_covered (seqid: sequence identity of
2227  aligned columns in alignment, fraction_covered: Fraction of non-gap
2228  characters in reference sequence that are covered by non-gap characters in
2229  *s*). Best scoring mapping is returned. Optionally, *seq_id*/
2230  *min_aln_length* thresholds can be enabled to avoid non-sensical mappings.
2231  However, *min_aln_length* only has an effect if *seq_id_thr* > 0!!!
2232 
2233  :param ref_seqs: Reference sequences
2234  :type ref_seqs: :class:`ost.seq.SequenceList`
2235  :param ref_types: Types of reference sequences, e.g.
2236  ost.mol.ChemType.AminoAcids
2237  :type ref_types: :class:`list` of :class:`ost.mol.ChemType`
2238  :param s: Sequence to map
2239  :type s: :class:`ost.seq.SequenceHandle`
2240  :param s_type: Type of *s*, only try mapping to sequences in *ref_seqs*
2241  with equal type as defined in *ref_types*
2242  :param s_ent: Entity which represents *s*. Only relevant in case of
2243  residue number alignments.
2244  :type s_ent: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
2245  :param seq_id_thr: Minimum sequence identity to be considered as match
2246  :type seq_id_thr: :class:`float`
2247  :param min_aln_length: Minimum number of aligned columns to be considered
2248  as match. Only has an effect if *seq_id_thr* > 0!
2249  :type min_aln_length: :class:`int`
2250  :returns: Tuple with two elements. 1) index of sequence in *ref_seqs* to
2251  which *s* can be mapped 2) Pairwise sequence alignment with
2252  sequence from *ref_seqs* as first sequence. Both elements are
2253  None if no mapping can be found or if thresholds are not
2254  fulfilled for any alignment.
2255  :raises: :class:`RuntimeError` if mapping is ambiguous, i.e. *s*
2256  successfully maps to more than one sequence in *ref_seqs*
2257  """
2258  scored_alns = list()
2259  for ref_idx, ref_seq in enumerate(ref_seqs):
2260  if ref_types[ref_idx] == s_type:
2261  if self.seqres_setseqres_set:
2262  aln = self.SEQRESAlignSEQRESAlign(ref_seq, s, s_ent)
2263  elif self.resnum_alignmentsresnum_alignments:
2264  aln = self.ResNumAlignResNumAlign(ref_seq, s, self.targettarget, s_ent)
2265  else:
2266  aln = self.NWAlignNWAlign(ref_seq, s, s_type)
2267  seqid, n_tot, n_aligned = _GetAlnPropsTwo(aln)
2268  if seq_id_thr > 0:
2269  if seqid >= seq_id_thr and n_aligned >= min_aln_length:
2270  fraction_covered = float(n_aligned)/n_tot
2271  score = seqid * fraction_covered
2272  scored_alns.append((score, ref_idx, aln))
2273  else:
2274  fraction_covered = float(n_aligned)/n_tot
2275  score = seqid * fraction_covered
2276  scored_alns.append((score, ref_idx, aln))
2277 
2278  if len(scored_alns) == 0:
2279  return (None, None) # no mapping possible...
2280 
2281  scored_alns = sorted(scored_alns, key=lambda x: x[0], reverse=True)
2282  return (scored_alns[0][1], scored_alns[0][2])
2283 
2284 def _GetAlnPropsTwo(aln):
2285  """Returns basic properties of *aln* version two...
2286 
2287  :param aln: Alignment to compute properties
2288  :type aln: :class:`seq.AlignmentHandle`
2289  :returns: Tuple with 3 elements. 1) sequence identity in range [0, 100]
2290  considering aligned columns 2) Number of non gap characters in
2291  first sequence 3) Number of aligned characters.
2292  """
2293  assert(aln.GetCount() == 2)
2294  n_tot = sum([1 for col in aln if col[0] != '-'])
2295  n_aligned = sum([1 for col in aln if (col[0] != '-' and col[1] != '-')])
2296  return (seq.alg.SequenceIdentity(aln), n_tot, n_aligned)
2297 
2298 def _GetAlnPropsOne(aln):
2299 
2300  """Returns basic properties of *aln* version one...
2301 
2302  :param aln: Alignment to compute properties
2303  :type aln: :class:`seq.AlignmentHandle`
2304  :returns: Tuple with 2 elements. 1) sequence identify in range [0, 100]
2305  considering aligned columns 2) Number of aligned columns.
2306  """
2307  assert(aln.GetCount() == 2)
2308  seqid = seq.alg.SequenceIdentity(aln)
2309  n_aligned = sum([1 for col in aln if (col[0] != '-' and col[1] != '-')])
2310  return (seqid, n_aligned)
2311 
2312 def _GetRefMdlAlns(ref_chem_groups, ref_chem_group_msas, mdl_chem_groups,
2313  mdl_chem_group_alns, pairs=None):
2314  """ Get all possible ref/mdl chain alignments given chem group mapping
2315 
2316  :param ref_chem_groups: :attr:`ChainMapper.chem_groups`
2317  :type ref_chem_groups: :class:`list` of :class:`list` of :class:`str`
2318  :param ref_chem_group_msas: :attr:`ChainMapper.chem_group_alignments`
2319  :type ref_chem_group_msas: :class:`ost.seq.AlignmentList`
2320  :param mdl_chem_groups: Groups of model chains that are mapped to
2321  *ref_chem_groups*. Return value of
2322  :func:`ChainMapper.GetChemMapping`.
2323  :type mdl_chem_groups: :class:`list` of :class:`list` of :class:`str`
2324  :param mdl_chem_group_alns: A pairwise sequence alignment for every chain
2325  in *mdl_chem_groups* that aligns these sequences
2326  to the respective reference sequence.
2327  Return values of
2328  :func:`ChainMapper.GetChemMapping`.
2329  :type mdl_chem_group_alns: :class:`list` of :class:`ost.seq.AlignmentList`
2330  :param pairs: Pro param - restrict return dict to specified pairs. A set of
2331  tuples in form (<trg_ch>, <mdl_ch>)
2332  :type pairs: :class:`set`
2333  :returns: A dictionary holding all possible ref/mdl chain alignments. Keys
2334  in that dictionary are tuples of the form (ref_ch, mdl_ch) and
2335  values are the respective pairwise alignments with first sequence
2336  being from ref, the second from mdl.
2337  """
2338  # alignment of each model chain to chem_group reference sequence
2339  mdl_alns = dict()
2340  for alns in mdl_chem_group_alns:
2341  for aln in alns:
2342  mdl_chain_name = aln.GetSequence(1).GetName()
2343  mdl_alns[mdl_chain_name] = aln
2344 
2345  # generate all alignments between ref/mdl chain atomseqs that we will
2346  # ever observe
2347  ref_mdl_alns = dict()
2348  for ref_chains, mdl_chains, ref_aln in zip(ref_chem_groups, mdl_chem_groups,
2349  ref_chem_group_msas):
2350  for ref_ch in ref_chains:
2351  for mdl_ch in mdl_chains:
2352  if pairs is not None and (ref_ch, mdl_ch) not in pairs:
2353  continue
2354  # obtain alignments of mdl and ref chains towards chem
2355  # group ref sequence and merge them
2356  aln_list = seq.AlignmentList()
2357 
2358  # do ref aln
2359 
2361  s1 = ref_aln.GetSequence(0)
2362  # ATOMSEQ of ref_ch
2363  s2 = ref_aln.GetSequence(1+ref_chains.index(ref_ch))
2364  aln_list.append(seq.CreateAlignment(s1, s2))
2365 
2366  # do mdl aln
2367 
2368  aln_list.append(mdl_alns[mdl_ch])
2369 
2370  # merge
2371 
2372  ref_seq = seq.CreateSequence(s1.GetName(),
2373  s1.GetGaplessString())
2374  merged_aln = seq.alg.MergePairwiseAlignments(aln_list,
2375  ref_seq)
2376  # merged_aln:
2377  # seq1: ref seq of chem group
2378  # seq2: seq of ref chain
2379  # seq3: seq of mdl chain
2380  # => we need the alignment between seq2 and seq3
2381  s2 = merged_aln.GetSequence(1)
2382  s3 = merged_aln.GetSequence(2)
2383  # cut leading and trailing gap columns
2384  a = 0 # number of leading gap columns
2385  for idx in range(len(s2)):
2386  if s2[idx] != '-' or s3[idx] != '-':
2387  break
2388  a += 1
2389  b = 0 # number of trailing gap columns
2390  for idx in reversed(range(len(s2))):
2391  if s2[idx] != '-' or s3[idx] != '-':
2392  break
2393  b += 1
2394  s2 = seq.CreateSequence(s2.GetName(), s2[a: len(s2)-b])
2395  s3 = seq.CreateSequence(s3.GetName(), s3[a: len(s3)-b])
2396  ref_mdl_alns[(ref_ch, mdl_ch)] = seq.CreateAlignment(s2, s3)
2397 
2398  return ref_mdl_alns
2399 
2400 def _CheckOneToOneMapping(ref_chains, mdl_chains):
2401  """ Checks whether we already have a perfect one to one mapping
2402 
2403  That means each list in *ref_chains* has either exactly one element
2404  and the respective list in *mdl_chains* has also one element or
2405  it has several elements and the respective list in *mdl_chains* is
2406  empty (ref chain(s) has no mapped mdl chain). Returns None if no such
2407  mapping can be found.
2408 
2409  :param ref_chains: corresponds to :attr:`ChainMapper.chem_groups`
2410  :type ref_chains: :class:`list` of :class:`list` of :class:`str`
2411  :param mdl_chains: mdl chains mapped to chem groups in *ref_chains*, i.e.
2412  the return value of :func:`ChainMapper.GetChemMapping`
2413  :type mdl_chains: class:`list` of :class:`list` of :class:`str`
2414  :returns: A :class:`list` of :class:`list` if a one to one mapping is found,
2415  None otherwise
2416  """
2417  only_one_to_one = True
2418  one_to_one = list()
2419  for ref, mdl in zip(ref_chains, mdl_chains):
2420  if len(ref) == 1 and len(mdl) == 1:
2421  one_to_one.append(mdl)
2422  elif len(ref) >= 1 and len(mdl) == 0:
2423  one_to_one.append(len(ref)*[None])
2424  else:
2425  only_one_to_one = False
2426  break
2427  if only_one_to_one:
2428  return one_to_one
2429  else:
2430  return None
2431 
2433  def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2434  ref_mdl_alns, inclusion_radius = 15.0,
2435  thresholds = [0.5, 1.0, 2.0, 4.0],
2436  steep_opt_rate = None):
2437 
2438  """ Greedy extension of already existing but incomplete chain mappings
2439  """
2440  super().__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2441  dist_thresh=inclusion_radius,
2442  dist_diff_thresholds=thresholds)
2443 
2444  self.mdl_chem_groupsmdl_chem_groups = mdl_chem_groups
2445  self.steep_opt_ratesteep_opt_rate = steep_opt_rate
2446 
2447  self.neighborsneighbors = {k: set() for k in self.trgtrg.chain_names}
2448  for interface in self.trgtrg.interacting_chains:
2449  self.neighborsneighbors[interface[0]].add(interface[1])
2450  self.neighborsneighbors[interface[1]].add(interface[0])
2451 
2452  assert(len(ref_chem_groups) == len(mdl_chem_groups))
2453  self.ref_chem_groupsref_chem_groups = ref_chem_groups
2454  self.mdl_chem_groupsmdl_chem_groups = mdl_chem_groups
2455  self.ref_ch_group_mapperref_ch_group_mapper = dict()
2456  self.mdl_ch_group_mappermdl_ch_group_mapper = dict()
2457  for g_idx, (ref_g, mdl_g) in enumerate(zip(ref_chem_groups,
2458  mdl_chem_groups)):
2459  for ch in ref_g:
2460  self.ref_ch_group_mapperref_ch_group_mapper[ch] = g_idx
2461  for ch in mdl_g:
2462  self.mdl_ch_group_mappermdl_ch_group_mapper[ch] = g_idx
2463 
2464  # keep track of mdl chains that potentially give lDDT contributions,
2465  # i.e. they have locations within inclusion_radius + max(thresholds)
2466  self.mdl_neighborsmdl_neighbors = {k: set() for k in self.mdlmdl.chain_names}
2467  for interface in self.mdlmdl.potentially_contributing_chains:
2468  self.mdl_neighborsmdl_neighbors[interface[0]].add(interface[1])
2469  self.mdl_neighborsmdl_neighbors[interface[1]].add(interface[0])
2470 
2471  def ExtendMapping(self, mapping, max_ext = None):
2472 
2473  if len(mapping) == 0:
2474  raise RuntimError("Mapping must contain a starting point")
2475 
2476  # Ref chains onto which we can map. The algorithm starts with a mapping
2477  # on ref_ch. From there we can start to expand to connected neighbors.
2478  # All neighbors that we can reach from the already mapped chains are
2479  # stored in this set which will be updated during runtime
2480  map_targets = set()
2481  for ref_ch in mapping.keys():
2482  map_targets.update(self.neighborsneighbors[ref_ch])
2483 
2484  # remove the already mapped chains
2485  for ref_ch in mapping.keys():
2486  map_targets.discard(ref_ch)
2487 
2488  if len(map_targets) == 0:
2489  return mapping # nothing to extend
2490 
2491  # keep track of what model chains are not yet mapped for each chem group
2492  free_mdl_chains = list()
2493  for chem_group in self.mdl_chem_groupsmdl_chem_groups:
2494  tmp = [x for x in chem_group if x not in mapping.values()]
2495  free_mdl_chains.append(set(tmp))
2496 
2497  # keep track of what ref chains got a mapping
2498  newly_mapped_ref_chains = list()
2499 
2500  something_happened = True
2501  while something_happened:
2502  something_happened=False
2503 
2504  if self.steep_opt_ratesteep_opt_rate is not None:
2505  n_chains = len(newly_mapped_ref_chains)
2506  if n_chains > 0 and n_chains % self.steep_opt_ratesteep_opt_rate == 0:
2507  mapping = self._SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2508 
2509  if max_ext is not None and len(newly_mapped_ref_chains) >= max_ext:
2510  break
2511 
2512  max_n = 0
2513  max_mapping = None
2514  for ref_ch in map_targets:
2515  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[ref_ch]
2516  for mdl_ch in free_mdl_chains[chem_group_idx]:
2517  # single chain score
2518  n_single = self._NSCConserved_NSCConserved(ref_ch, mdl_ch).sum()
2519  # scores towards neighbors that are already mapped
2520  n_inter = 0
2521  for neighbor in self.neighborsneighbors[ref_ch]:
2522  if neighbor in mapping and mapping[neighbor] in \
2523  self.mdl_neighborsmdl_neighbors[mdl_ch]:
2524  n_inter += self._NPairConserved_NPairConserved((ref_ch, neighbor),
2525  (mdl_ch, mapping[neighbor])).sum()
2526  n = n_single + n_inter
2527 
2528  if n_inter > 0 and n > max_n:
2529  # Only accept a new solution if its actually connected
2530  # i.e. n_inter > 0. Otherwise we could just map a big
2531  # fat mdl chain sitting somewhere in Nirvana
2532  max_n = n
2533  max_mapping = (ref_ch, mdl_ch)
2534 
2535  if max_n > 0:
2536  something_happened = True
2537  # assign new found mapping
2538  mapping[max_mapping[0]] = max_mapping[1]
2539 
2540  # add all neighboring chains to map targets as they are now
2541  # reachable
2542  for neighbor in self.neighborsneighbors[max_mapping[0]]:
2543  if neighbor not in mapping:
2544  map_targets.add(neighbor)
2545 
2546  # remove the ref chain from map targets
2547  map_targets.remove(max_mapping[0])
2548 
2549  # remove the mdl chain from free_mdl_chains - its taken...
2550  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[max_mapping[0]]
2551  free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2552 
2553  # keep track of what ref chains got a mapping
2554  newly_mapped_ref_chains.append(max_mapping[0])
2555 
2556  return mapping
2557 
2558  def _SteepOpt(self, mapping, chains_to_optimize=None):
2559 
2560  # just optimize ALL ref chains if nothing specified
2561  if chains_to_optimize is None:
2562  chains_to_optimize = mapping.keys()
2563 
2564  # make sure that we only have ref chains which are actually mapped
2565  ref_chains = [x for x in chains_to_optimize if mapping[x] is not None]
2566 
2567  # group ref chains to be optimized into chem groups
2568  tmp = dict()
2569  for ch in ref_chains:
2570  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[ch]
2571  if chem_group_idx in tmp:
2572  tmp[chem_group_idx].append(ch)
2573  else:
2574  tmp[chem_group_idx] = [ch]
2575  chem_groups = list(tmp.values())
2576 
2577  # try all possible mapping swaps. Swaps that improve the score are
2578  # immediately accepted and we start all over again
2579  current_lddt = self.FromFlatMappingFromFlatMapping(mapping)
2580  something_happened = True
2581  while something_happened:
2582  something_happened = False
2583  for chem_group in chem_groups:
2584  if something_happened:
2585  break
2586  for ch1, ch2 in itertools.combinations(chem_group, 2):
2587  swapped_mapping = dict(mapping)
2588  swapped_mapping[ch1] = mapping[ch2]
2589  swapped_mapping[ch2] = mapping[ch1]
2590  score = self.FromFlatMappingFromFlatMapping(swapped_mapping)
2591  if score > current_lddt:
2592  something_happened = True
2593  mapping = swapped_mapping
2594  current_lddt = score
2595  break
2596  return mapping
2597 
2598 
2599 def _lDDTNaive(trg, mdl, inclusion_radius, thresholds, chem_groups,
2600  chem_mapping, ref_mdl_alns, n_max_naive):
2601  """ Naively iterates all possible chain mappings and returns the best
2602  """
2603  best_mapping = None
2604  best_lddt = -1.0
2605 
2606  # Setup scoring
2607  lddt_scorer = bb_lddt.BBlDDTScorer(trg, chem_groups, mdl, ref_mdl_alns,
2608  dist_thresh=inclusion_radius,
2609  dist_diff_thresholds=thresholds)
2610  for mapping in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
2611  lDDT = lddt_scorer.Score(mapping, check=False)
2612  if lDDT > best_lddt:
2613  best_mapping = mapping
2614  best_lddt = lDDT
2615 
2616  return (best_mapping, best_lddt)
2617 
2618 
2619 def _GetSeeds(ref_chem_groups, mdl_chem_groups, mapped_ref_chains = set(),
2620  mapped_mdl_chains = set()):
2621  seeds = list()
2622  for ref_chains, mdl_chains in zip(ref_chem_groups,
2623  mdl_chem_groups):
2624  for ref_ch in ref_chains:
2625  if ref_ch not in mapped_ref_chains:
2626  for mdl_ch in mdl_chains:
2627  if mdl_ch not in mapped_mdl_chains:
2628  seeds.append((ref_ch, mdl_ch))
2629  return seeds
2630 
2631 def _lDDTGreedyFull(the_greed):
2632  """ Uses each reference chain as starting point for expansion
2633  """
2634 
2635  seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
2636  best_overall_score = -1.0
2637  best_overall_mapping = dict()
2638 
2639  for seed in seeds:
2640 
2641  # do initial extension
2642  mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
2643 
2644  # repeat the process until we have a full mapping
2645  something_happened = True
2646  while something_happened:
2647  something_happened = False
2648  remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
2649  the_greed.mdl_chem_groups,
2650  mapped_ref_chains = set(mapping.keys()),
2651  mapped_mdl_chains = set(mapping.values()))
2652  if len(remnant_seeds) > 0:
2653  # still more mapping to be done
2654  best_score = -1.0
2655  best_mapping = None
2656  for remnant_seed in remnant_seeds:
2657  tmp_mapping = dict(mapping)
2658  tmp_mapping[remnant_seed[0]] = remnant_seed[1]
2659  tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
2660  score = the_greed.FromFlatMapping(tmp_mapping)
2661  if score > best_score:
2662  best_score = score
2663  best_mapping = tmp_mapping
2664  if best_mapping is not None:
2665  something_happened = True
2666  mapping = best_mapping
2667 
2668  score = the_greed.FromFlatMapping(mapping)
2669  if score > best_overall_score:
2670  best_overall_score = score
2671  best_overall_mapping = mapping
2672 
2673  mapping = best_overall_mapping
2674 
2675  # translate mapping format and return
2676  final_mapping = list()
2677  for ref_chains in the_greed.ref_chem_groups:
2678  mapped_mdl_chains = list()
2679  for ref_ch in ref_chains:
2680  if ref_ch in mapping:
2681  mapped_mdl_chains.append(mapping[ref_ch])
2682  else:
2683  mapped_mdl_chains.append(None)
2684  final_mapping.append(mapped_mdl_chains)
2685 
2686  return final_mapping
2687 
2688 
2689 def _lDDTGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
2690  """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
2691  respective chem groups and compute single chain lDDTs. The
2692  *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
2693  and the best scoring one is exhaustively extended.
2694  """
2695 
2696  if seed_size is None or seed_size < 1:
2697  raise RuntimeError(f"seed_size must be an int >= 1 (got {seed_size})")
2698 
2699  if blocks_per_chem_group is None or blocks_per_chem_group < 1:
2700  raise RuntimeError(f"blocks_per_chem_group must be an int >= 1 "
2701  f"(got {blocks_per_chem_group})")
2702 
2703  max_ext = seed_size - 1 # -1 => start seed already has size 1
2704 
2705  ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
2706  mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
2707 
2708  mapping = dict()
2709  something_happened = True
2710  while something_happened:
2711  something_happened = False
2712  starting_blocks = list()
2713  for ref_chains, mdl_chains in zip(ref_chem_groups, mdl_chem_groups):
2714  if len(mdl_chains) == 0:
2715  continue # nothing to map
2716  ref_chains_copy = list(ref_chains)
2717  for i in range(blocks_per_chem_group):
2718  if len(ref_chains_copy) == 0:
2719  break
2720  seeds = list()
2721  for ref_ch in ref_chains_copy:
2722  seeds += [(ref_ch, mdl_ch) for mdl_ch in mdl_chains]
2723  # extend starting seeds to *seed_size* and retain best scoring
2724  # block for further extension
2725  best_score = -1.0
2726  best_mapping = None
2727  best_seed = None
2728  for s in seeds:
2729  seed = dict(mapping)
2730  seed.update({s[0]: s[1]})
2731  seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
2732  seed_lddt = the_greed.FromFlatMapping(seed)
2733  if seed_lddt > best_score:
2734  best_score = seed_lddt
2735  best_mapping = seed
2736  best_seed = s
2737  if best_mapping != None:
2738  starting_blocks.append(best_mapping)
2739  if best_seed[0] in ref_chains_copy:
2740  # remove that ref chain to enforce diversity
2741  ref_chains_copy.remove(best_seed[0])
2742 
2743  # fully expand initial starting blocks
2744  best_lddt = 0.0
2745  best_mapping = None
2746  for seed in starting_blocks:
2747  seed = the_greed.ExtendMapping(seed)
2748  seed_lddt = the_greed.FromFlatMapping(seed)
2749  if seed_lddt > best_lddt:
2750  best_lddt = seed_lddt
2751  best_mapping = seed
2752 
2753  if best_lddt == 0.0:
2754  break # no proper mapping found anymore
2755 
2756  something_happened = True
2757  mapping.update(best_mapping)
2758  for ref_ch, mdl_ch in best_mapping.items():
2759  for group_idx in range(len(ref_chem_groups)):
2760  if ref_ch in ref_chem_groups[group_idx]:
2761  ref_chem_groups[group_idx].remove(ref_ch)
2762  if mdl_ch in mdl_chem_groups[group_idx]:
2763  mdl_chem_groups[group_idx].remove(mdl_ch)
2764 
2765  # translate mapping format and return
2766  final_mapping = list()
2767  for ref_chains in the_greed.ref_chem_groups:
2768  mapped_mdl_chains = list()
2769  for ref_ch in ref_chains:
2770  if ref_ch in mapping:
2771  mapped_mdl_chains.append(mapping[ref_ch])
2772  else:
2773  mapped_mdl_chains.append(None)
2774  final_mapping.append(mapped_mdl_chains)
2775 
2776  return final_mapping
2777 
2778 
2780  def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups,
2781  ref_mdl_alns, contact_d = 12.0,
2782  steep_opt_rate = None, greedy_prune_contact_map=False):
2783  """ Greedy extension of already existing but incomplete chain mappings
2784  """
2785  super().__init__(ref, ref_chem_groups, mdl, ref_mdl_alns,
2786  contact_d = contact_d)
2787  self.refref = ref
2788  self.mdlmdl = mdl
2789  self.ref_mdl_alnsref_mdl_alns = ref_mdl_alns
2790  self.steep_opt_ratesteep_opt_rate = steep_opt_rate
2791 
2792  if greedy_prune_contact_map:
2793  self.neighborsneighbors = {k: set() for k in self.qsent1qsent1.chain_names}
2794  for p in self.qsent1qsent1.interacting_chains:
2795  if np.count_nonzero(self.qsent1qsent1.PairDist(p[0], p[1])<=8) >= 3:
2796  self.neighborsneighbors[p[0]].add(p[1])
2797  self.neighborsneighbors[p[1]].add(p[0])
2798 
2799  self.mdl_neighborsmdl_neighbors = {k: set() for k in self.qsent2qsent2.chain_names}
2800  for p in self.qsent2qsent2.interacting_chains:
2801  if np.count_nonzero(self.qsent2qsent2.PairDist(p[0], p[1])<=8) >= 3:
2802  self.mdl_neighborsmdl_neighbors[p[0]].add(p[1])
2803  self.mdl_neighborsmdl_neighbors[p[1]].add(p[0])
2804 
2805 
2806  else:
2807  self.neighborsneighbors = {k: set() for k in self.qsent1qsent1.chain_names}
2808  for p in self.qsent1qsent1.interacting_chains:
2809  self.neighborsneighbors[p[0]].add(p[1])
2810  self.neighborsneighbors[p[1]].add(p[0])
2811 
2812  self.mdl_neighborsmdl_neighbors = {k: set() for k in self.qsent2qsent2.chain_names}
2813  for p in self.qsent2qsent2.interacting_chains:
2814  self.mdl_neighborsmdl_neighbors[p[0]].add(p[1])
2815  self.mdl_neighborsmdl_neighbors[p[1]].add(p[0])
2816 
2817  assert(len(ref_chem_groups) == len(mdl_chem_groups))
2818  self.ref_chem_groupsref_chem_groups = ref_chem_groups
2819  self.mdl_chem_groupsmdl_chem_groups = mdl_chem_groups
2820  self.ref_ch_group_mapperref_ch_group_mapper = dict()
2821  self.mdl_ch_group_mappermdl_ch_group_mapper = dict()
2822  for g_idx, (ref_g, mdl_g) in enumerate(zip(ref_chem_groups,
2823  mdl_chem_groups)):
2824  for ch in ref_g:
2825  self.ref_ch_group_mapperref_ch_group_mapper[ch] = g_idx
2826  for ch in mdl_g:
2827  self.mdl_ch_group_mappermdl_ch_group_mapper[ch] = g_idx
2828 
2829  # cache for lDDT based single chain conserved contacts
2830  # used to identify starting points for further extension by QS score
2831  # key: tuple (ref_ch, mdl_ch) value: number of conserved contacts
2832  self.single_chain_scorersingle_chain_scorer = dict()
2833  self.single_chain_cachesingle_chain_cache = dict()
2834  for ch in self.refref.chains:
2835  single_chain_ref = _CSel(self.refref, [ch.GetName()])
2836  self.single_chain_scorersingle_chain_scorer[ch.GetName()] = \
2837  lddt.lDDTScorer(single_chain_ref, bb_only = True)
2838 
2839  def SCCounts(self, ref_ch, mdl_ch):
2840  if not (ref_ch, mdl_ch) in self.single_chain_cachesingle_chain_cache:
2841  alns = dict()
2842  alns[mdl_ch] = self.ref_mdl_alnsref_mdl_alns[(ref_ch, mdl_ch)]
2843  mdl_sel = _CSel(self.mdlmdl, [mdl_ch])
2844  s = self.single_chain_scorersingle_chain_scorer[ref_ch]
2845  _,_,_,conserved,_,_,_ = s.lDDT(mdl_sel,
2846  residue_mapping=alns,
2847  return_dist_test=True,
2848  no_interchain=True,
2849  chain_mapping={mdl_ch: ref_ch},
2850  check_resnames=False)
2851  self.single_chain_cachesingle_chain_cache[(ref_ch, mdl_ch)] = conserved
2852  return self.single_chain_cachesingle_chain_cache[(ref_ch, mdl_ch)]
2853 
2854  def ExtendMapping(self, mapping, max_ext = None):
2855 
2856  if len(mapping) == 0:
2857  raise RuntimError("Mapping must contain a starting point")
2858 
2859  # Ref chains onto which we can map. The algorithm starts with a mapping
2860  # on ref_ch. From there we can start to expand to connected neighbors.
2861  # All neighbors that we can reach from the already mapped chains are
2862  # stored in this set which will be updated during runtime
2863  map_targets = set()
2864  for ref_ch in mapping.keys():
2865  map_targets.update(self.neighborsneighbors[ref_ch])
2866 
2867  # remove the already mapped chains
2868  for ref_ch in mapping.keys():
2869  map_targets.discard(ref_ch)
2870 
2871  if len(map_targets) == 0:
2872  return mapping # nothing to extend
2873 
2874  # keep track of what model chains are not yet mapped for each chem group
2875  free_mdl_chains = list()
2876  for chem_group in self.mdl_chem_groupsmdl_chem_groups:
2877  tmp = [x for x in chem_group if x not in mapping.values()]
2878  free_mdl_chains.append(set(tmp))
2879 
2880  # keep track of what ref chains got a mapping
2881  newly_mapped_ref_chains = list()
2882 
2883  something_happened = True
2884  while something_happened:
2885  something_happened=False
2886 
2887  if self.steep_opt_ratesteep_opt_rate is not None:
2888  n_chains = len(newly_mapped_ref_chains)
2889  if n_chains > 0 and n_chains % self.steep_opt_ratesteep_opt_rate == 0:
2890  mapping = self._SteepOpt_SteepOpt(mapping, newly_mapped_ref_chains)
2891 
2892  if max_ext is not None and len(newly_mapped_ref_chains) >= max_ext:
2893  break
2894 
2895  score_result = self.FromFlatMappingFromFlatMapping(mapping)
2896  old_score = score_result.QS_global
2897  nominator = score_result.weighted_scores
2898  denominator = score_result.weight_sum + score_result.weight_extra_all
2899 
2900  max_diff = 0.0
2901  max_mapping = None
2902  for ref_ch in map_targets:
2903  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[ref_ch]
2904  for mdl_ch in free_mdl_chains[chem_group_idx]:
2905  # we're not computing full QS-score here, we directly hack
2906  # into the QS-score formula to compute a diff
2907  nominator_diff = 0.0
2908  denominator_diff = 0.0
2909  for neighbor in self.neighborsneighbors[ref_ch]:
2910  if neighbor in mapping and mapping[neighbor] in \
2911  self.mdl_neighborsmdl_neighbors[mdl_ch]:
2912  # it's a newly added interface if (ref_ch, mdl_ch)
2913  # are added to mapping
2914  int1 = (ref_ch, neighbor)
2915  int2 = (mdl_ch, mapping[neighbor])
2916  a, b, c, d = self._MappedInterfaceScores_MappedInterfaceScores(int1, int2)
2917  nominator_diff += a # weighted_scores
2918  denominator_diff += b # weight_sum
2919  denominator_diff += d # weight_extra_all
2920  # the respective interface penalties are subtracted
2921  # from denominator
2922  denominator_diff -= self._InterfacePenalty1_InterfacePenalty1(int1)
2923  denominator_diff -= self._InterfacePenalty2_InterfacePenalty2(int2)
2924 
2925  if nominator_diff > 0:
2926  # Only accept a new solution if its actually connected
2927  # i.e. nominator_diff > 0.
2928  new_nominator = nominator + nominator_diff
2929  new_denominator = denominator + denominator_diff
2930  new_score = 0.0
2931  if new_denominator != 0.0:
2932  new_score = new_nominator/new_denominator
2933  diff = new_score - old_score
2934  if diff > max_diff:
2935  max_diff = diff
2936  max_mapping = (ref_ch, mdl_ch)
2937 
2938  if max_mapping is not None:
2939  something_happened = True
2940  # assign new found mapping
2941  mapping[max_mapping[0]] = max_mapping[1]
2942 
2943  # add all neighboring chains to map targets as they are now
2944  # reachable
2945  for neighbor in self.neighborsneighbors[max_mapping[0]]:
2946  if neighbor not in mapping:
2947  map_targets.add(neighbor)
2948 
2949  # remove the ref chain from map targets
2950  map_targets.remove(max_mapping[0])
2951 
2952  # remove the mdl chain from free_mdl_chains - its taken...
2953  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[max_mapping[0]]
2954  free_mdl_chains[chem_group_idx].remove(max_mapping[1])
2955 
2956  # keep track of what ref chains got a mapping
2957  newly_mapped_ref_chains.append(max_mapping[0])
2958 
2959  return mapping
2960 
2961  def _SteepOpt(self, mapping, chains_to_optimize=None):
2962 
2963  # just optimize ALL ref chains if nothing specified
2964  if chains_to_optimize is None:
2965  chains_to_optimize = mapping.keys()
2966 
2967  # make sure that we only have ref chains which are actually mapped
2968  ref_chains = [x for x in chains_to_optimize if mapping[x] is not None]
2969 
2970  # group ref chains to be optimized into chem groups
2971  tmp = dict()
2972  for ch in ref_chains:
2973  chem_group_idx = self.ref_ch_group_mapperref_ch_group_mapper[ch]
2974  if chem_group_idx in tmp:
2975  tmp[chem_group_idx].append(ch)
2976  else:
2977  tmp[chem_group_idx] = [ch]
2978  chem_groups = list(tmp.values())
2979 
2980  # try all possible mapping swaps. Swaps that improve the score are
2981  # immediately accepted and we start all over again
2982  score_result = self.FromFlatMappingFromFlatMapping(mapping)
2983  current_score = score_result.QS_global
2984  something_happened = True
2985  while something_happened:
2986  something_happened = False
2987  for chem_group in chem_groups:
2988  if something_happened:
2989  break
2990  for ch1, ch2 in itertools.combinations(chem_group, 2):
2991  swapped_mapping = dict(mapping)
2992  swapped_mapping[ch1] = mapping[ch2]
2993  swapped_mapping[ch2] = mapping[ch1]
2994  score_result = self.FromFlatMappingFromFlatMapping(swapped_mapping)
2995  if score_result.QS_global > current_score:
2996  something_happened = True
2997  mapping = swapped_mapping
2998  current_score = score_result.QS_global
2999  break
3000  return mapping
3001 
3002 
3003 def _QSScoreNaive(trg, mdl, chem_groups, chem_mapping, ref_mdl_alns, contact_d,
3004  n_max_naive):
3005  best_mapping = None
3006  best_score = -1.0
3007  # qs_scorer implements caching, score calculation is thus as fast as it gets
3008  # you'll just hit a wall when the number of possible mappings becomes large
3009  qs_scorer = qsscore.QSScorer(trg, chem_groups, mdl, ref_mdl_alns)
3010  for mapping in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
3011  score_result = qs_scorer.Score(mapping, check=False)
3012  if score_result.QS_global > best_score:
3013  best_mapping = mapping
3014  best_score = score_result.QS_global
3015  return (best_mapping, best_score)
3016 
3017 def _QSScoreGreedyFull(the_greed):
3018  """ Uses each reference chain as starting point for expansion
3019  """
3020 
3021  seeds = _GetSeeds(the_greed.ref_chem_groups, the_greed.mdl_chem_groups)
3022  best_overall_score = -1.0
3023  best_overall_mapping = dict()
3024 
3025  for seed in seeds:
3026 
3027  # do initial extension
3028  mapping = the_greed.ExtendMapping({seed[0]: seed[1]})
3029 
3030  # repeat the process until we have a full mapping
3031  something_happened = True
3032  while something_happened:
3033  something_happened = False
3034  remnant_seeds = _GetSeeds(the_greed.ref_chem_groups,
3035  the_greed.mdl_chem_groups,
3036  mapped_ref_chains = set(mapping.keys()),
3037  mapped_mdl_chains = set(mapping.values()))
3038  if len(remnant_seeds) > 0:
3039  # still more mapping to be done
3040  best_score = -1.0
3041  best_mapping = None
3042  for remnant_seed in remnant_seeds:
3043  tmp_mapping = dict(mapping)
3044  tmp_mapping[remnant_seed[0]] = remnant_seed[1]
3045  tmp_mapping = the_greed.ExtendMapping(tmp_mapping)
3046  score_result = the_greed.FromFlatMapping(tmp_mapping)
3047  if score_result.QS_global > best_score:
3048  best_score = score_result.QS_global
3049  best_mapping = tmp_mapping
3050  if best_mapping is not None:
3051  something_happened = True
3052  mapping = best_mapping
3053 
3054  score_result = the_greed.FromFlatMapping(mapping)
3055  if score_result.QS_global > best_overall_score:
3056  best_overall_score = score_result.QS_global
3057  best_overall_mapping = mapping
3058 
3059  mapping = best_overall_mapping
3060 
3061  # translate mapping format and return
3062  final_mapping = list()
3063  for ref_chains in the_greed.ref_chem_groups:
3064  mapped_mdl_chains = list()
3065  for ref_ch in ref_chains:
3066  if ref_ch in mapping:
3067  mapped_mdl_chains.append(mapping[ref_ch])
3068  else:
3069  mapped_mdl_chains.append(None)
3070  final_mapping.append(mapped_mdl_chains)
3071 
3072  return final_mapping
3073 
3074 
3075 def _QSScoreGreedyBlock(the_greed, seed_size, blocks_per_chem_group):
3076  """ try multiple seeds, i.e. try all ref/mdl chain combinations within the
3077  respective chem groups and compute single chain lDDTs. The
3078  *blocks_per_chem_group* best scoring ones are extend by *seed_size* chains
3079  and the best scoring one with respect to QS score is exhaustively extended.
3080  """
3081 
3082  if seed_size is None or seed_size < 1:
3083  raise RuntimeError(f"seed_size must be an int >= 1 (got {seed_size})")
3084 
3085  if blocks_per_chem_group is None or blocks_per_chem_group < 1:
3086  raise RuntimeError(f"blocks_per_chem_group must be an int >= 1 "
3087  f"(got {blocks_per_chem_group})")
3088 
3089  max_ext = seed_size - 1 # -1 => start seed already has size 1
3090 
3091  ref_chem_groups = copy.deepcopy(the_greed.ref_chem_groups)
3092  mdl_chem_groups = copy.deepcopy(the_greed.mdl_chem_groups)
3093 
3094  mapping = dict()
3095 
3096  something_happened = True
3097  while something_happened:
3098  something_happened = False
3099  starting_blocks = list()
3100  for ref_chains, mdl_chains in zip(ref_chem_groups, mdl_chem_groups):
3101  if len(mdl_chains) == 0:
3102  continue # nothing to map
3103  ref_chains_copy = list(ref_chains)
3104  for i in range(blocks_per_chem_group):
3105  if len(ref_chains_copy) == 0:
3106  break
3107  seeds = list()
3108  for ref_ch in ref_chains_copy:
3109  seeds += [(ref_ch, mdl_ch) for mdl_ch in mdl_chains]
3110  # extend starting seeds to *seed_size* and retain best scoring block
3111  # for further extension
3112  best_score = -1.0
3113  best_mapping = None
3114  best_seed = None
3115  for s in seeds:
3116  seed = dict(mapping)
3117  seed.update({s[0]: s[1]})
3118  seed = the_greed.ExtendMapping(seed, max_ext = max_ext)
3119  score_result = the_greed.FromFlatMapping(seed)
3120  if score_result.QS_global > best_score:
3121  best_score = score_result.QS_global
3122  best_mapping = seed
3123  best_seed = s
3124  if best_mapping != None:
3125  starting_blocks.append(best_mapping)
3126  if best_seed[0] in ref_chains_copy:
3127  # remove selected ref chain to enforce diversity
3128  ref_chains_copy.remove(best_seed[0])
3129 
3130  # fully expand initial starting blocks
3131  best_score = -1.0
3132  best_mapping = None
3133  for seed in starting_blocks:
3134  seed = the_greed.ExtendMapping(seed)
3135  score_result = the_greed.FromFlatMapping(seed)
3136  if score_result.QS_global > best_score:
3137  best_score = score_result.QS_global
3138  best_mapping = seed
3139 
3140  if best_mapping is not None and len(best_mapping) > len(mapping):
3141  # this even accepts extensions that lead to no increase in QS-score
3142  # at least they make sense from an lDDT perspective
3143  something_happened = True
3144  mapping.update(best_mapping)
3145  for ref_ch, mdl_ch in best_mapping.items():
3146  for group_idx in range(len(ref_chem_groups)):
3147  if ref_ch in ref_chem_groups[group_idx]:
3148  ref_chem_groups[group_idx].remove(ref_ch)
3149  if mdl_ch in mdl_chem_groups[group_idx]:
3150  mdl_chem_groups[group_idx].remove(mdl_ch)
3151 
3152  # translate mapping format and return
3153  final_mapping = list()
3154  for ref_chains in the_greed.ref_chem_groups:
3155  mapped_mdl_chains = list()
3156  for ref_ch in ref_chains:
3157  if ref_ch in mapping:
3158  mapped_mdl_chains.append(mapping[ref_ch])
3159  else:
3160  mapped_mdl_chains.append(None)
3161  final_mapping.append(mapped_mdl_chains)
3162 
3163  return final_mapping
3164 
3165 def _SingleRigidRMSD(initial_transforms, initial_mappings, chem_groups,
3166  chem_mapping, trg_group_pos, mdl_group_pos):
3167  """
3168  Takes initial transforms and sequentially adds chain pairs with lowest RMSD.
3169  The mapping from the transform that leads to lowest overall RMSD is
3170  returned.
3171  """
3172  best_mapping = dict()
3173  best_ssd = float("inf") # we're actually going for summed squared distances
3174  # Since all positions have same lengths and we do a
3175  # full mapping, lowest SSD has a guarantee of also
3176  # being lowest RMSD
3177  for transform in initial_transforms:
3178  mapping = dict()
3179  mapped_mdl_chains = set()
3180  ssd = 0.0
3181  for trg_chains, mdl_chains, trg_pos, mdl_pos, in zip(chem_groups,
3182  chem_mapping,
3183  trg_group_pos,
3184  mdl_group_pos):
3185  if len(trg_pos) == 0 or len(mdl_pos) == 0:
3186  continue # cannot compute valid rmsd
3187  ssds = list()
3188  t_mdl_pos = list()
3189  for m_pos in mdl_pos:
3190  t_m_pos = geom.Vec3List(m_pos)
3191  t_m_pos.ApplyTransform(transform)
3192  t_mdl_pos.append(t_m_pos)
3193  for t_pos, t in zip(trg_pos, trg_chains):
3194  for t_m_pos, m in zip(t_mdl_pos, mdl_chains):
3195  ssd = t_pos.GetSummedSquaredDistances(t_m_pos)
3196  ssds.append((ssd, (t,m)))
3197  ssds.sort()
3198  for item in ssds:
3199  p = item[1]
3200  if p[0] not in mapping and p[1] not in mapped_mdl_chains:
3201  mapping[p[0]] = p[1]
3202  mapped_mdl_chains.add(p[1])
3203  ssd += item[0]
3204 
3205  if ssd < best_ssd:
3206  best_ssd = ssd
3207  best_mapping = mapping
3208 
3209  return best_mapping
3210 
3211 
3212 def _SingleRigidCentroid(initial_transforms, initial_mappings, chem_groups,
3213  chem_mapping, trg_group_pos, mdl_group_pos):
3214  """
3215  Takes initial transforms and sequentially adds chain pairs with lowest
3216  centroid distance.
3217  The mapping from the transform that leads to lowest overall RMSD of
3218  the centroids is returned.
3219  """
3220  best_mapping = dict()
3221  best_rmsd = float("inf")
3222 
3223  trg_group_centroids = list()
3224  mdl_group_centroids = list()
3225 
3226  for trg_pos, mdl_pos, in zip(trg_group_pos, mdl_group_pos):
3227  trg_group_centroids.append(geom.Vec3List([p.center for p in trg_pos]))
3228  mdl_group_centroids.append(geom.Vec3List([p.center for p in mdl_pos]))
3229 
3230  for transform in initial_transforms:
3231  mapping = dict()
3232  mapped_mdl_chains = set()
3233  mapped_trg_centroids = list()
3234  mapped_mdl_centroids = list()
3235  for trg_chains, mdl_chains, trg_centroids, mdl_centroids, in zip(chem_groups,
3236  chem_mapping,
3237  trg_group_centroids,
3238  mdl_group_centroids):
3239  if len(trg_centroids) == 0 or len(mdl_centroids) == 0:
3240  continue # cannot compute valid rmsd
3241  distances = list()
3242  t_mdl_centroids = geom.Vec3List(mdl_centroids)
3243  t_mdl_centroids.ApplyTransform(transform)
3244 
3245  for trg_idx in range(len(trg_chains)):
3246  for mdl_idx in range(len(mdl_chains)):
3247  distances.append((geom.Length2(trg_centroids[trg_idx]-t_mdl_centroids[mdl_idx]),
3248  (trg_chains[trg_idx], mdl_chains[mdl_idx]),
3249  (trg_centroids[trg_idx], t_mdl_centroids[mdl_idx])))
3250 
3251  distances.sort()
3252  for item in distances:
3253  p = item[1]
3254  if p[0] not in mapping and p[1] not in mapped_mdl_chains:
3255  mapping[p[0]] = p[1]
3256  mapped_mdl_chains.add(p[1])
3257  mapped_trg_centroids.append(item[2][0])
3258  mapped_mdl_centroids.append(item[2][1])
3259 
3260 
3261  if len(mapping) == 0:
3262  raise RuntimeError("Empty mapping in _SingleRigidCentroid")
3263  elif len(mapping) == 1:
3264  # by definition zero
3265  rmsd = 0.0
3266  elif len(mapping) == 2:
3267  # no superposition possible with two positions
3268  # derive rmsd from the distance difference between
3269  # the centroid pairs in reference and model
3270  ref_d = geom.Distance(mapped_trg_centroids[0],
3271  mapped_trg_centroids[1])
3272  mdl_d = geom.Distance(mapped_mdl_centroids[0],
3273  mapped_mdl_centroids[1])
3274  # compute RMSD when placing pair of coordinates with smaller
3275  # distance in the middle of pair of cooridnates with larger
3276  # distance
3277  dd = abs(ref_d-mdl_d) # distance difference
3278  # rmsd = sqrt(1/2 * ((dd/2)**2 + (dd/2)**2))
3279  # => rmsd = dd/2
3280  rmsd = dd/2
3281  else:
3282  # go for classic Kabsch superposition
3283  mapped_trg_centroids = geom.Vec3List(mapped_trg_centroids)
3284  mapped_mdl_centroids = geom.Vec3List(mapped_mdl_centroids)
3285  rmsd = _GetSuperposition(mapped_trg_centroids,
3286  mapped_mdl_centroids, False).rmsd
3287 
3288  if rmsd < best_rmsd:
3289  best_rmsd = rmsd
3290  best_mapping = mapping
3291 
3292  return best_mapping
3293 
3294 
3295 def _IterativeRigidRMSD(initial_transforms, initial_mappings, chem_groups,
3296  chem_mapping, trg_group_pos, mdl_group_pos):
3297  """ Takes initial transforms and sequentially adds chain pairs with
3298  lowest RMSD. With each added chain pair, the transform gets updated.
3299  Thus the naming iterative. The mapping from the initial transform that
3300  leads to best overall RMSD score is returned.
3301  """
3302 
3303  # to directly retrieve positions using chain names
3304  trg_pos_dict = dict()
3305  for trg_pos, trg_chains in zip(trg_group_pos, chem_groups):
3306  for t_pos, t in zip(trg_pos, trg_chains):
3307  trg_pos_dict[t] = t_pos
3308  mdl_pos_dict = dict()
3309  for mdl_pos, mdl_chains in zip(mdl_group_pos, chem_mapping):
3310  for m_pos, m in zip(mdl_pos, mdl_chains):
3311  mdl_pos_dict[m] = m_pos
3312 
3313  best_mapping = dict()
3314  best_rmsd = float("inf")
3315  for initial_transform, initial_mapping in zip(initial_transforms,
3316  initial_mappings):
3317  mapping = {initial_mapping[0]: initial_mapping[1]}
3318  transform = geom.Mat4(initial_transform)
3319  mapped_trg_pos = geom.Vec3List(trg_pos_dict[initial_mapping[0]])
3320  mapped_mdl_pos = geom.Vec3List(mdl_pos_dict[initial_mapping[1]])
3321 
3322  # the following variables contain the chains which are
3323  # available for mapping
3324  trg_chain_groups = [set(group) for group in chem_groups]
3325  mdl_chain_groups = [set(group) for group in chem_mapping]
3326 
3327  # search and kick out inital mapping
3328  for group in trg_chain_groups:
3329  if initial_mapping[0] in group:
3330  group.remove(initial_mapping[0])
3331  break
3332  for group in mdl_chain_groups:
3333  if initial_mapping[1] in group:
3334  group.remove(initial_mapping[1])
3335  break
3336 
3337  something_happened = True
3338  while something_happened:
3339  # search for best mapping given current transform
3340  something_happened=False
3341  best_sc_mapping = None
3342  best_sc_group_idx = None
3343  best_sc_rmsd = float("inf")
3344  group_idx = 0
3345  for trg_chains, mdl_chains in zip(trg_chain_groups, mdl_chain_groups):
3346  for t in trg_chains:
3347  t_pos = trg_pos_dict[t]
3348  for m in mdl_chains:
3349  m_pos = mdl_pos_dict[m]
3350  t_m_pos = geom.Vec3List(m_pos)
3351  t_m_pos.ApplyTransform(transform)
3352  rmsd = t_pos.GetRMSD(t_m_pos)
3353  if rmsd < best_sc_rmsd:
3354  best_sc_rmsd = rmsd
3355  best_sc_mapping = (t,m)
3356  best_sc_group_idx = group_idx
3357  group_idx += 1
3358 
3359  if best_sc_mapping is not None:
3360  something_happened = True
3361  mapping[best_sc_mapping[0]] = best_sc_mapping[1]
3362  mapped_trg_pos.extend(trg_pos_dict[best_sc_mapping[0]])
3363  mapped_mdl_pos.extend(mdl_pos_dict[best_sc_mapping[1]])
3364  trg_chain_groups[best_sc_group_idx].remove(best_sc_mapping[0])
3365  mdl_chain_groups[best_sc_group_idx].remove(best_sc_mapping[1])
3366 
3367  transform = _GetSuperposition(mapped_mdl_pos, mapped_trg_pos,
3368  False).transformation
3369 
3370  # compute overall RMSD for current transform
3371  mapped_mdl_pos.ApplyTransform(transform)
3372  rmsd = mapped_trg_pos.GetRMSD(mapped_mdl_pos)
3373 
3374  if rmsd < best_rmsd:
3375  best_rmsd = rmsd
3376  best_mapping = mapping
3377 
3378  return best_mapping
3379 
3380 def _NaiveRMSD(chem_groups, chem_mapping, trg_group_pos, mdl_group_pos,
3381  n_max_naive):
3382 
3383  # to directly retrieve positions using chain names
3384  trg_pos_dict = dict()
3385  for trg_pos, trg_chains in zip(trg_group_pos, chem_groups):
3386  for t_pos, t in zip(trg_pos, trg_chains):
3387  trg_pos_dict[t] = t_pos
3388  mdl_pos_dict = dict()
3389  for mdl_pos, mdl_chains in zip(mdl_group_pos, chem_mapping):
3390  for m_pos, m in zip(mdl_pos, mdl_chains):
3391  mdl_pos_dict[m] = m_pos
3392 
3393  best_mapping = [[None]*len(x) for x in chem_groups]
3394  best_rmsd = float("inf")
3395 
3396  for mapping in _ChainMappings(chem_groups, chem_mapping, n_max_naive):
3397  trg_pos = geom.Vec3List()
3398  mdl_pos = geom.Vec3List()
3399  for trg_group, mdl_group in zip(chem_groups, mapping):
3400  for trg_ch, mdl_ch in zip(trg_group, mdl_group):
3401  if trg_ch is not None and mdl_ch is not None:
3402  trg_pos.extend(trg_pos_dict[trg_ch])
3403  mdl_pos.extend(mdl_pos_dict[mdl_ch])
3404  if len(mdl_pos) >= 3:
3405  superpose_res = mol.alg.SuperposeSVD(mdl_pos, trg_pos)
3406  if superpose_res.rmsd < best_rmsd:
3407  best_rmsd = superpose_res.rmsd
3408  best_mapping = mapping
3409 
3410  # this is stupid...
3411  tmp = dict()
3412  for chem_group, mapping in zip(chem_groups, best_mapping):
3413  for trg_ch, mdl_ch in zip(chem_group, mapping):
3414  tmp[trg_ch] = mdl_ch
3415 
3416  return tmp
3417 
3418 
3419 def _GetRefPos(trg, mdl, trg_msas, mdl_alns, max_pos = None):
3420  """ Extracts reference positions which are present in trg and mdl
3421  """
3422 
3423  # mdl_alns are pairwise, let's construct MSAs
3424  mdl_msas = list()
3425  for aln_list in mdl_alns:
3426  if len(aln_list) > 0:
3427  tmp = aln_list[0].GetSequence(0)
3428  ref_seq = seq.CreateSequence(tmp.GetName(), tmp.GetGaplessString())
3429  mdl_msas.append(seq.alg.MergePairwiseAlignments(aln_list, ref_seq))
3430  else:
3431  mdl_msas.append(seq.CreateAlignment())
3432 
3433  # fetch positions of all backbone atoms
3434  bb_trg = _GetBBPos(trg)
3435  bb_mdl = _GetBBPos(mdl)
3436 
3437  # there are the actual return values
3438  trg_pos = list()
3439  mdl_pos = list()
3440 
3441  for trg_msa, mdl_msa in zip(trg_msas, mdl_msas):
3442 
3443  if mdl_msa.GetCount() > 0:
3444  # make sure they have the same ref sequence (should be a given...)
3445  assert(trg_msa.GetSequence(0).GetGaplessString() == \
3446  mdl_msa.GetSequence(0).GetGaplessString())
3447  else:
3448  # if mdl_msa is empty, i.e. no model chain maps to the chem group
3449  # represented by trg_msa, we just continue. The result will be
3450  # empty position lists added to trg_pos and mdl_pos.
3451  pass
3452 
3453  # check which columns in MSAs are fully covered (indices relative to
3454  # first sequence)
3455  trg_indices = _GetFullyCoveredIndices(trg_msa)
3456  mdl_indices = _GetFullyCoveredIndices(mdl_msa)
3457 
3458  # get indices where both, mdl and trg, are fully covered
3459  indices = sorted(list(trg_indices.intersection(mdl_indices)))
3460 
3461  # subsample if necessary
3462  if max_pos is not None and len(indices) > max_pos:
3463  step = int(len(indices)/max_pos)
3464  indices = [indices[i] for i in range(0, len(indices), step)]
3465 
3466  # translate to column indices in the respective MSAs
3467  trg_indices = _RefIndicesToColumnIndices(trg_msa, indices)
3468  mdl_indices = _RefIndicesToColumnIndices(mdl_msa, indices)
3469 
3470  # extract positions
3471  trg_pos.append(list())
3472  mdl_pos.append(list())
3473  # first seq in trg_msa is ref sequence and does not refer to any
3474  # ATOMSEQ
3475  for s_idx in range(1, trg_msa.GetCount()):
3476  trg_pos[-1].append(_ExtractMSAPos(trg_msa, s_idx, trg_indices,
3477  bb_trg))
3478  # first seq in mdl_msa is ref sequence in trg and does not belong to mdl
3479  for s_idx in range(1, mdl_msa.GetCount()):
3480  mdl_pos[-1].append(_ExtractMSAPos(mdl_msa, s_idx, mdl_indices,
3481  bb_mdl))
3482 
3483  return (trg_pos, mdl_pos)
3484 
3485 def _GetBBPos(ent):
3486  """ Helper for _GetRefPos
3487 
3488  Returns a dict with key: chain name and value: a geom.Vec3List of backbone
3489  atom positions. Assumes that either CA or C3' is always there.
3490  """
3491  bb = dict()
3492  for ch in ent.chains:
3493  l = geom.Vec3List()
3494  for r in ch.residues:
3495  a = r.FindAtom("CA")
3496  if not a.IsValid():
3497  a = r.FindAtom("C3'")
3498  l.append(a.GetPos())
3499  bb[ch.name] = l
3500  return bb
3501 
3502 def _GetFullyCoveredIndices(msa):
3503  """ Helper for _GetRefPos
3504 
3505  Returns a set containing the indices relative to first sequence in msa which
3506  are fully covered in all other sequences
3507 
3508  --AA-A-A
3509  -BBBB-BB
3510  CCCC-C-C
3511 
3512  => (0,1,3)
3513  """
3514  indices = set()
3515  ref_idx = 0
3516  for col in msa:
3517  if sum([1 for olc in col if olc != '-']) == col.GetRowCount():
3518  indices.add(ref_idx)
3519  if col[0] != '-':
3520  ref_idx += 1
3521  return indices
3522 
3523 def _RefIndicesToColumnIndices(msa, indices):
3524  """ Helper for _GetRefPos
3525 
3526  Returns a list of mapped indices. indices refer to non-gap one letter
3527  codes in the first msa sequence. The returnes mapped indices are translated
3528  to the according msa column indices
3529  """
3530  ref_idx = 0
3531  mapping = dict()
3532  for col_idx, col in enumerate(msa):
3533  if col[0] != '-':
3534  mapping[ref_idx] = col_idx
3535  ref_idx += 1
3536  return [mapping[i] for i in indices]
3537 
3538 def _ExtractMSAPos(msa, s_idx, indices, bb):
3539  """ Helper for _GetRefPos
3540 
3541  Returns a geom.Vec3List containing positions refering to given msa sequence.
3542  => Chain with corresponding name is mapped onto sequence and the position of
3543  the first atom of each residue specified in indices is extracted.
3544  Indices refers to column indices in msa!
3545  """
3546  s = msa.GetSequence(s_idx)
3547  ch_bb = bb[s.GetName()]
3548 
3549  # sanity check
3550  assert(len(s.GetGaplessString()) == len(ch_bb))
3551 
3552  residue_idx = [s.GetResidueIndex(i) for i in indices]
3553  return geom.Vec3List([ch_bb[i] for i in residue_idx])
3554 
3555 
3556 def _NChemGroupMappings(ref_chains, mdl_chains):
3557  """ Number of mappings within one chem group
3558 
3559  :param ref_chains: Reference chains
3560  :type ref_chains: :class:`list` of :class:`str`
3561  :param mdl_chains: Model chains that are mapped onto *ref_chains*
3562  :type mdl_chains: :class:`list` of :class:`str`
3563  :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3564  """
3565  n_ref = len(ref_chains)
3566  n_mdl = len(mdl_chains)
3567  if n_ref == n_mdl:
3568  return factorial(n_ref)
3569  elif n_ref > n_mdl:
3570  n_choose_k = binom(n_ref, n_mdl)
3571  return n_choose_k * factorial(n_mdl)
3572  else:
3573  n_choose_k = binom(n_mdl, n_ref)
3574  return n_choose_k * factorial(n_ref)
3575 
3576 def _NMappings(ref_chains, mdl_chains):
3577  """ Number of mappings for a full chem mapping
3578 
3579  :param ref_chains: Chem groups of reference
3580  :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3581  :param mdl_chains: Model chains that map onto those chem groups
3582  :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3583  :returns: Number of possible mappings of *mdl_chains* onto *ref_chains*
3584  """
3585  assert(len(ref_chains) == len(mdl_chains))
3586  n = 1
3587  for a,b in zip(ref_chains, mdl_chains):
3588  n *= _NChemGroupMappings(a,b)
3589  return n
3590 
3591 def _NMappingsWithin(ref_chains, mdl_chains, max_mappings):
3592  """ Check whether total number of mappings is smaller than given maximum
3593 
3594  In principle the same as :func:`_NMappings` but it stops as soon as the
3595  maximum is hit.
3596 
3597  :param ref_chains: Chem groups of reference
3598  :type ref_chains: :class:`list` of :class:`list` of :class:`str`
3599  :param mdl_chains: Model chains that map onto those chem groups
3600  :type mdl_chains: :class:`list` of :class:`list` of :class:`str`
3601  :param max_mappings: Number of max allowed mappings
3602  :returns: Whether number of possible mappings of *mdl_chains* onto
3603  *ref_chains* is below or equal *max_mappings*.
3604  """
3605  assert(len(ref_chains) == len(mdl_chains))
3606  n = 1
3607  for a,b in zip(ref_chains, mdl_chains):
3608  n *= _NChemGroupMappings(a,b)
3609  if n > max_mappings:
3610  return False
3611  return True
3612 
3613 def _RefSmallerGenerator(ref_chains, mdl_chains):
3614  """ Returns all possible ways to map mdl_chains onto ref_chains
3615 
3616  Specific for the case where len(ref_chains) < len(mdl_chains)
3617  """
3618  for c in itertools.combinations(mdl_chains, len(ref_chains)):
3619  for p in itertools.permutations(c):
3620  yield list(p)
3621 
3622 def _RefLargerGenerator(ref_chains, mdl_chains):
3623  """ Returns all possible ways to map mdl_chains onto ref_chains
3624 
3625  Specific for the case where len(ref_chains) > len(mdl_chains)
3626  Ref chains without mapped mdl chain are assigned None
3627  """
3628  n_ref = len(ref_chains)
3629  n_mdl = len(mdl_chains)
3630  for c in itertools.combinations(range(n_ref), n_mdl):
3631  for p in itertools.permutations(mdl_chains):
3632  ret_list = [None] * n_ref
3633  for idx, ch in zip(c, p):
3634  ret_list[idx] = ch
3635  yield ret_list
3636 
3637 def _RefEqualGenerator(ref_chains, mdl_chains):
3638  """ Returns all possible ways to map mdl_chains onto ref_chains
3639 
3640  Specific for the case where len(ref_chains) == len(mdl_chains)
3641  """
3642  for p in itertools.permutations(mdl_chains):
3643  yield list(p)
3644 
3645 def _RefEmptyGenerator(ref_chains, mdl_chains):
3646  yield list()
3647 
3648 def _ConcatIterators(iterators):
3649  for item in itertools.product(*iterators):
3650  yield list(item)
3651 
3652 def _ChainMappings(ref_chains, mdl_chains, n_max=None):
3653  """Returns all possible ways to map *mdl_chains* onto fixed *ref_chains*
3654 
3655  :param ref_chains: List of list of chemically equivalent chains in reference
3656  :type ref_chains: :class:`list` of :class:`list`
3657  :param mdl_chains: Equally long list of list of chemically equivalent chains
3658  in model that map on those ref chains.
3659  :type mdl_chains: :class:`list` of :class:`list`
3660  :param n_max: Aborts and raises :class:`RuntimeError` if max number of
3661  mappings is above this threshold.
3662  :type n_max: :class:`int`
3663  :returns: Iterator over all possible mappings of *mdl_chains* onto fixed
3664  *ref_chains*. Potentially contains None as padding when number of
3665  model chains for a certain mapping is smaller than the according
3666  reference chains.
3667  Example: _ChainMappings([['A', 'B', 'C'], ['D', 'E']],
3668  [['x', 'y'], ['i', 'j']])
3669  gives an iterator over: [[['x', 'y', None], ['i', 'j']],
3670  [['x', 'y', None], ['j', 'i']],
3671  [['y', 'x', None], ['i', 'j']],
3672  [['y', 'x', None], ['j', 'i']],
3673  [['x', None, 'y'], ['i', 'j']],
3674  [['x', None, 'y'], ['j', 'i']],
3675  [['y', None, 'x'], ['i', 'j']],
3676  [['y', None, 'x'], ['j', 'i']],
3677  [[None, 'x', 'y'], ['i', 'j']],
3678  [[None, 'x', 'y'], ['j', 'i']],
3679  [[None, 'y', 'x'], ['i', 'j']],
3680  [[None, 'y', 'x'], ['j', 'i']]]
3681  """
3682  assert(len(ref_chains) == len(mdl_chains))
3683 
3684  if n_max is not None:
3685  if not _NMappingsWithin(ref_chains, mdl_chains, n_max):
3686  raise RuntimeError(f"Too many mappings. Max allowed: {n_max}")
3687 
3688  # one iterator per mapping representing all mdl combinations relative to
3689  # reference
3690  iterators = list()
3691  for ref, mdl in zip(ref_chains, mdl_chains):
3692  if len(ref) == 0:
3693  iterators.append(_RefEmptyGenerator(ref, mdl))
3694  elif len(ref) == len(mdl):
3695  iterators.append(_RefEqualGenerator(ref, mdl))
3696  elif len(ref) < len(mdl):
3697  iterators.append(_RefSmallerGenerator(ref, mdl))
3698  else:
3699  iterators.append(_RefLargerGenerator(ref, mdl))
3700 
3701  return _ConcatIterators(iterators)
3702 
3703 
3704 def _GetSuperposition(pos_one, pos_two, iterative):
3705  """ Computes minimal RMSD superposition for pos_one onto pos_two
3706 
3707  :param pos_one: Positions that should be superposed onto *pos_two*
3708  :type pos_one: :class:`geom.Vec3List`
3709  :param pos_two: Reference positions
3710  :type pos_two: :class:`geom.Vec3List`
3711  :iterative: Whether iterative superposition should be used. Iterative
3712  potentially raises, uses standard superposition as fallback.
3713  :type iterative: :class:`bool`
3714  :returns: Transformation matrix to superpose *pos_one* onto *pos_two*
3715  :rtype: :class:`ost.mol.alg.SuperpositionResult`
3716  """
3717  res = None
3718  if iterative:
3719  try:
3720  res = mol.alg.IterativeSuperposeSVD(pos_one, pos_two)
3721  except:
3722  pass # triggers fallback below
3723  if res is None:
3724  res = mol.alg.SuperposeSVD(pos_one, pos_two)
3725  return res
3726 
3727 # specify public interface
3728 __all__ = ('ChainMapper', 'ReprResult', 'MappingResult')
def _NSCConserved(self, trg_ch, mdl_ch)
Definition: bb_lddt.py:481
def _NPairConserved(self, trg_int, mdl_int)
Definition: bb_lddt.py:504
def FromFlatMapping(self, flat_mapping)
Definition: bb_lddt.py:457
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, contact_d=12.0, steep_opt_rate=None, greedy_prune_contact_map=False)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def __init__(self, ref, mdl, ref_chem_groups, mdl_chem_groups, ref_mdl_alns, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], steep_opt_rate=None)
def ExtendMapping(self, mapping, max_ext=None)
def _SteepOpt(self, mapping, chains_to_optimize=None)
def GetlDDTMapping(self, model, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], strategy="heuristic", steep_opt_rate=None, block_seed_size=5, block_blocks_per_chem_group=5, chem_mapping_result=None, heuristic_n_max_naive=40320)
def GetRepr(self, substructure, model, topn=1, inclusion_radius=15.0, thresholds=[0.5, 1.0, 2.0, 4.0], bb_only=False, only_interchain=False, chem_mapping_result=None, global_mapping=None)
def GetAFMMapping(self, model, chem_mapping_result=None)
def _GroupSequences(self, seqs, seqid_thr, min_length, chem_type)
def GetMapping(self, model, n_max_naive=40320)
def ResNumAlign(self, s1, s2, s1_ent, s2_ent)
def SEQRESAlign(self, seqres, s, s_ent)
def GetQSScoreMapping(self, model, contact_d=12.0, strategy="heuristic", block_seed_size=5, block_blocks_per_chem_group=5, heuristic_n_max_naive=40320, steep_opt_rate=None, chem_mapping_result=None, greedy_prune_contact_map=True)
def __init__(self, target, resnum_alignments=False, pep_seqid_thr=95., nuc_seqid_thr=95., pep_subst_mat=seq.alg.BLOSUM62, pep_gap_open=-11, pep_gap_ext=-1, nuc_subst_mat=seq.alg.NUC44, nuc_gap_open=-4, nuc_gap_ext=-4, min_pep_length=6, min_nuc_length=4, n_max_naive=1e8, mdl_map_pep_seqid_thr=0., mdl_map_nuc_seqid_thr=0., seqres=None, trg_seqres_mapping=None)
def _MapSequence(self, ref_seqs, ref_types, s, s_type, s_ent, seq_id_thr=0.0, min_aln_length=0)
def GetRMSDMapping(self, model, strategy="heuristic", subsampling=50, chem_mapping_result=None, heuristic_n_max_naive=120)
def GetFlatMapping(self, mdl_as_key=False)
def __init__(self, target, model, chem_groups, chem_mapping, mdl_chains_without_chem_mapping, mapping, alns, opt_score=None)
def GetFlatChainMapping(self, mdl_as_key=False)
def __init__(self, lDDT, substructure, ref_view, mdl_view)
def _GetInconsistentResidues(self, ref_residues, mdl_residues)
def _InterfacePenalty1(self, interface)
Definition: qsscore.py:734
def _InterfacePenalty2(self, interface)
Definition: qsscore.py:740
def _MappedInterfaceScores(self, int1, int2)
Definition: qsscore.py:583
def FromFlatMapping(self, flat_mapping)
Definition: qsscore.py:528
Real DLLEXPORT_OST_GEOM Distance(const Line2 &l, const Vec2 &v)
Real Length2(const Vec2 &v)
returns squared length of vector
Definition: vecmat2_op.hh:35