OpenStructure
ligand_scoring_scrmsd.py
Go to the documentation of this file.
1 import numpy as np
2 
3 from ost import LogWarning, LogScript, LogInfo, LogVerbose
4 from ost import geom
5 from ost import mol
6 
7 from ost.mol.alg import ligand_scoring_base
8 
9 
11  """ :class:`LigandScorer` implementing symmetry corrected RMSD (BiSyRMSD).
12 
13  :class:`SCRMSDScorer` computes a score for a specific pair of target/model
14  ligands.
15 
16  The returned RMSD is based on a binding site superposition.
17  The binding site of the target structure is defined as all residues with at
18  least one atom within `bs_radius` around the target ligand.
19  It only contains protein and nucleic acid residues from chains that
20  pass the criteria for the
21  :class:`chain mapping <ost.mol.alg.chain_mapping>`. This means ignoring
22  other ligands, waters, short polymers as well as any incorrectly connected
23  chains that may be in proximity.
24  The respective model binding site for superposition is identified by
25  naively enumerating all possible mappings of model chains onto their
26  chemically equivalent target counterparts from the target binding site.
27  The `binding_sites_topn` with respect to lDDT score are evaluated and
28  an RMSD is computed.
29  You can either try to map ALL model chains onto the target binding site by
30  enabling `full_bs_search` or restrict the model chains for a specific
31  target/model ligand pair to the chains with at least one atom within
32  *model_bs_radius* around the model ligand. The latter can be significantly
33  faster in case of large complexes.
34  Symmetry correction is achieved by simply computing an RMSD value for
35  each symmetry, i.e. atom-atom assignments of the ligand as given by
36  :class:`LigandScorer`. The lowest RMSD value is returned.
37 
38  Populates :attr:`LigandScorer.aux_data` with following :class:`dict` keys:
39 
40  * rmsd: The BiSyRMSD score
41  * lddt_lp: lDDT of the binding pocket used for superposition (lDDT-LP)
42  * bs_ref_res: :class:`list` of binding site residues in target
43  * bs_ref_res_mapped: :class:`list` of target binding site residues that
44  are mapped to model
45  * bs_mdl_res_mapped: :class:`list` of same length with respective model
46  residues
47  * bb_rmsd: Backbone RMSD (CA, C3' for nucleotides; full backbone for
48  binding sites with fewer than 3 residues) for mapped binding site
49  residues after superposition
50  * target_ligand: The actual target ligand for which the score was computed
51  * model_ligand: The actual model ligand for which the score was computed
52  * chain_mapping: :class:`dict` with a chain mapping of chains involved in
53  binding site - key: trg chain name, value: mdl chain name
54  * transform: :class:`geom.Mat4` to transform model binding site onto target
55  binding site
56  * inconsistent_residues: :class:`list` of :class:`tuple` representing
57  residues with inconsistent residue names upon mapping (which is given by
58  bs_ref_res_mapped and bs_mdl_res_mapped). Tuples have two elements:
59  1) trg residue 2) mdl residue
60 
61  :param model: Passed to parent constructor - see :class:`LigandScorer`.
62  :type model: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
63  :param target: Passed to parent constructor - see :class:`LigandScorer`.
64  :type target: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
65  :param model_ligands: Passed to parent constructor - see
66  :class:`LigandScorer`.
67  :type model_ligands: :class:`list`
68  :param target_ligands: Passed to parent constructor - see
69  :class:`LigandScorer`.
70  :type target_ligands: :class:`list`
71  :param resnum_alignments: Passed to parent constructor - see
72  :class:`LigandScorer`.
73  :type resnum_alignments: :class:`bool`
74  :param rename_ligand_chain: Passed to parent constructor - see
75  :class:`LigandScorer`.
76  :type rename_ligand_chain: :class:`bool`
77  :param substructure_match: Passed to parent constructor - see
78  :class:`LigandScorer`.
79  :type substructure_match: :class:`bool`
80  :param coverage_delta: Passed to parent constructor - see
81  :class:`LigandScorer`.
82  :type coverage_delta: :class:`float`
83  :param max_symmetries: Passed to parent constructor - see
84  :class:`LigandScorer`.
85  :type max_symmetries: :class:`int`
86  :param bs_radius: Inclusion radius for the binding site. Residues with
87  atoms within this distance of the ligand will be considered
88  for inclusion in the binding site.
89  :type bs_radius: :class:`float`
90  :param lddt_lp_radius: lDDT inclusion radius for lDDT-LP.
91  :type lddt_lp_radius: :class:`float`
92  :param model_bs_radius: inclusion radius for model binding sites.
93  Only used when full_bs_search=False, otherwise the
94  radius is effectively infinite. Only chains with
95  atoms within this distance of a model ligand will
96  be considered in the chain mapping.
97  :type model_bs_radius: :class:`float`
98  :param binding_sites_topn: maximum number of model binding site
99  representations to assess per target binding
100  site.
101  :type binding_sites_topn: :class:`int`
102  :param full_bs_search: If True, all potential binding sites in the model
103  are searched for each target binding site. If False,
104  the search space in the model is reduced to chains
105  around (`model_bs_radius` Å) model ligands.
106  This speeds up computations, but may result in
107  ligands not being scored if the predicted ligand
108  pose is too far from the actual binding site.
109  :type full_bs_search: :class:`bool`
110  :param min_pep_length: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`.
111  :type min_pep_length: :class:`int`
112  :param min_nuc_length: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
113  :type min_nuc_length: :class:`int`
114  :param pep_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
115  :type pep_seqid_thr: :class:`float`
116  :param nuc_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
117  :type nuc_seqid_thr: :class:`float`
118  :param mdl_map_pep_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
119  :type mdl_map_pep_seqid_thr: :class:`float`
120  :param mdl_map_nuc_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
121  :type mdl_map_nuc_seqid_thr: :class:`float`
122  :param seqres: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
123  :type seqres: :class:`ost.seq.SequenceList`
124  :param trg_seqres_mapping: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
125  :type trg_seqres_mapping: :class:`dict`
126  """
127  def __init__(self, model, target, model_ligands, target_ligands,
128  resnum_alignments=False, rename_ligand_chain=False,
129  substructure_match=False, coverage_delta=0.2,
130  max_symmetries=1e5, bs_radius=4.0, lddt_lp_radius=15.0,
131  model_bs_radius=25, binding_sites_topn=100000,
132  full_bs_search=False, min_pep_length = 6,
133  min_nuc_length = 4, pep_seqid_thr = 95.,
134  nuc_seqid_thr = 95.,
135  mdl_map_pep_seqid_thr = 0.,
136  mdl_map_nuc_seqid_thr = 0.,
137  seqres=None,
138  trg_seqres_mapping=None):
139 
140  super().__init__(model, target, model_ligands, target_ligands,
141  resnum_alignments = resnum_alignments,
142  rename_ligand_chain = rename_ligand_chain,
143  substructure_match = substructure_match,
144  coverage_delta = coverage_delta,
145  max_symmetries = max_symmetries,
146  min_pep_length = min_pep_length,
147  min_nuc_length = min_nuc_length,
148  pep_seqid_thr = pep_seqid_thr,
149  nuc_seqid_thr = nuc_seqid_thr,
150  mdl_map_pep_seqid_thr = mdl_map_pep_seqid_thr,
151  mdl_map_nuc_seqid_thr = mdl_map_nuc_seqid_thr,
152  seqres = seqres,
153  trg_seqres_mapping = trg_seqres_mapping)
154 
155  self.bs_radiusbs_radius = bs_radius
156  self.lddt_lp_radiuslddt_lp_radius = lddt_lp_radius
157  self.model_bs_radiusmodel_bs_radius = model_bs_radius
158  self.binding_sites_topnbinding_sites_topn = binding_sites_topn
159  self.full_bs_searchfull_bs_search = full_bs_search
160 
161  # Residues that are in contact with a ligand => binding site
162  # defined as all residues with at least one atom within self.radius
163  # key: ligand.handle.hash_code, value: EntityView of whatever
164  # entity ligand belongs to
165  self._binding_sites_binding_sites = dict()
166 
167  # cache for GetRepr chain mapping calls
168  self._repr_repr = dict()
169 
170  # lazily precomputed variables to speedup GetRepr chain mapping calls
171  # for localized GetRepr searches
172  self._get_repr_input_get_repr_input = dict()
173 
174  # update state decoding from parent with subclass specific stuff
175  self.state_decodingstate_decoding[10] = ("target_binding_site",
176  "No residues were in proximity of the "
177  "target ligand.")
178  self.state_decodingstate_decoding[11] = ("model_binding_site", "Binding site was not"
179  " found in the model, i.e. the binding site"
180  " was not modeled or the model ligand was "
181  "positioned too far in combination with "
182  "full_bs_search=False.")
183  self.state_decodingstate_decoding[20] = ("unknown",
184  "Unknown error occured in SCRMSDScorer")
185 
186  def _compute(self, symmetries, target_ligand, model_ligand):
187  """ Implements interface from parent
188  """
189  # set default to invalid scores
190  best_rmsd_result = {"rmsd": None,
191  "lddt_lp": None,
192  "bs_ref_res": list(),
193  "bs_ref_res_mapped": list(),
194  "bs_mdl_res_mapped": list(),
195  "bb_rmsd": None,
196  "target_ligand": target_ligand,
197  "model_ligand": model_ligand,
198  "chain_mapping": dict(),
199  "transform": geom.Mat4(),
200  "inconsistent_residues": list()}
201 
202  representations = self._get_repr_get_repr(target_ligand, model_ligand)
203  # This step can be slow so give some hints in logs
204  msg = "Computing BiSyRMSD with %d chain mappings" % len(representations)
205  (LogWarning if len(representations) > 10000 else LogInfo)(msg)
206 
207  for r in representations:
208  rmsd = _SCRMSD_symmetries(symmetries, model_ligand,
209  target_ligand, transformation=r.transform)
210 
211  if best_rmsd_result["rmsd"] is None or \
212  rmsd < best_rmsd_result["rmsd"]:
213  best_rmsd_result = {"rmsd": rmsd,
214  "lddt_lp": r.lDDT,
215  "bs_ref_res": r.substructure.residues,
216  "bs_ref_res_mapped": r.ref_residues,
217  "bs_mdl_res_mapped": r.mdl_residues,
218  "bb_rmsd": r.bb_rmsd,
219  "target_ligand": target_ligand,
220  "model_ligand": model_ligand,
221  "chain_mapping": r.GetFlatChainMapping(),
222  "transform": r.transform,
223  "inconsistent_residues":
224  r.inconsistent_residues}
225 
226  target_ligand_state = 0
227  model_ligand_state = 0
228  pair_state = 0
229 
230  if best_rmsd_result["rmsd"] is not None:
231  best_rmsd = best_rmsd_result["rmsd"]
232  else:
233  # try to identify error states
234  best_rmsd = np.nan
235  error_state = 20 # unknown error
236  N = self._get_target_binding_site_get_target_binding_site(target_ligand).GetResidueCount()
237  if N == 0:
238  pair_state = 6 # binding_site
239  target_ligand_state = 10
240  elif len(representations) == 0:
241  pair_state = 11
242 
243  return (best_rmsd, pair_state, target_ligand_state, model_ligand_state,
244  best_rmsd_result)
245 
246  def _score_dir(self):
247  """ Implements interface from parent
248  """
249  return '-'
250 
251  def _get_repr(self, target_ligand, model_ligand):
252 
253  key = None
254  if self.full_bs_searchfull_bs_search:
255  # all possible binding sites, independent from actual model ligand
256  key = (target_ligand.handle.hash_code, 0)
257  else:
258  key = (target_ligand.handle.hash_code,
259  model_ligand.handle.hash_code)
260 
261  if key not in self._repr_repr:
262  ref_bs = self._get_target_binding_site_get_target_binding_site(target_ligand)
263  LogVerbose("%d chains are in proximity of the target ligand: %s" % (
264  ref_bs.chain_count, ", ".join([c.name for c in ref_bs.chains])))
265  if self.full_bs_searchfull_bs_search:
266  reprs = self._chain_mapper_chain_mapper.GetRepr(
267  ref_bs, self.modelmodel, inclusion_radius=self.lddt_lp_radiuslddt_lp_radius,
268  topn=self.binding_sites_topnbinding_sites_topn)
269  else:
270  repr_in = self._get_get_repr_input_get_get_repr_input(model_ligand)
271  radius = self.lddt_lp_radiuslddt_lp_radius
272  reprs = self._chain_mapper_chain_mapper.GetRepr(ref_bs, self.modelmodel,
273  inclusion_radius=radius,
274  topn=self.binding_sites_topnbinding_sites_topn,
275  chem_mapping_result=repr_in)
276  self._repr_repr[key] = reprs
277 
278  return self._repr_repr[key]
279 
280  def _get_target_binding_site(self, target_ligand):
281 
282  if target_ligand.handle.hash_code not in self._binding_sites_binding_sites:
283 
284  # create view of reference binding site
285  ref_residues_hashes = set() # helper to keep track of added residues
286  ignored_residue_hashes = {target_ligand.hash_code}
287  for ligand_at in target_ligand.atoms:
288  close_atoms = self.targettarget.FindWithin(ligand_at.GetPos(),
289  self.bs_radiusbs_radius)
290  for close_at in close_atoms:
291  # Skip any residue not in the chain mapping target
292  ref_res = close_at.GetResidue()
293  h = ref_res.handle.GetHashCode()
294  if h not in ref_residues_hashes and \
295  h not in ignored_residue_hashes:
296  with ligand_scoring_base._SinkVerbosityLevel(1):
297  view = self._chain_mapper_chain_mapper.target.ViewForHandle(ref_res)
298  if view.IsValid():
299  h = ref_res.handle.GetHashCode()
300  ref_residues_hashes.add(h)
301  elif ref_res.is_ligand:
302  msg = f"Ignoring ligand {ref_res.qualified_name} "
303  msg += "in binding site of "
304  msg += str(target_ligand.qualified_name)
305  LogWarning(msg)
306  ignored_residue_hashes.add(h)
307  elif ref_res.chem_type == mol.ChemType.WATERS:
308  pass # That's ok, no need to warn
309  else:
310  msg = f"Ignoring residue {ref_res.qualified_name} "
311  msg += "in binding site of "
312  msg += str(target_ligand.qualified_name)
313  LogWarning(msg)
314  ignored_residue_hashes.add(h)
315 
316  ref_bs = self.targettarget.CreateEmptyView()
317  if ref_residues_hashes:
318  # reason for doing that separately is to guarantee same ordering
319  # of residues as in underlying entity. (Reorder by ResNum seems
320  # only available on ChainHandles)
321  for ch in self.targettarget.chains:
322  for r in ch.residues:
323  if r.handle.GetHashCode() in ref_residues_hashes:
324  ref_bs.AddResidue(r, mol.ViewAddFlag.INCLUDE_ALL)
325  if len(ref_bs.residues) == 0:
326  raise RuntimeError("Failed to add proximity residues to "
327  "the reference binding site entity")
328 
329  self._binding_sites_binding_sites[target_ligand.handle.hash_code] = ref_bs
330 
331  return self._binding_sites_binding_sites[target_ligand.handle.hash_code]
332 
333  def _get_get_repr_input(self, mdl_ligand):
334  if mdl_ligand.handle.hash_code not in self._get_repr_input_get_repr_input:
335 
336  # figure out what chains in the model are in contact with the ligand
337  # that may give a non-zero contribution to lDDT in
338  # chain_mapper.GetRepr
339  radius = self.model_bs_radiusmodel_bs_radius
340  chains = set()
341  for at in mdl_ligand.atoms:
342  with ligand_scoring_base._SinkVerbosityLevel(1):
343  close_atoms = self._chain_mapping_mdl_chain_mapping_mdl.FindWithin(at.GetPos(),
344  radius)
345  for close_at in close_atoms:
346  chains.add(close_at.GetChain().GetName())
347 
348  if len(chains) > 0:
349  LogVerbose("%d chains are in proximity of the model ligand: %s" % (
350  len(chains), ", ".join(chains)))
351 
352  # the chain mapping model which only contains close chains
353  query = "cname="
354  query += ','.join([mol.QueryQuoteName(x) for x in chains])
355  mdl = self._chain_mapping_mdl_chain_mapping_mdl.Select(query)
356 
357  # chem mapping which is reduced to the respective chains
358  chem_mapping = list()
359  for m in self._chem_mapping_chem_mapping:
360  chem_mapping.append([x for x in m if x in chains])
361 
362  self._get_repr_input_get_repr_input[mdl_ligand.handle.hash_code] = \
363  (mdl, chem_mapping)
364 
365  else:
366  self._get_repr_input_get_repr_input[mdl_ligand.handle.hash_code] = \
367  (self._chain_mapping_mdl_chain_mapping_mdl.CreateEmptyView(),
368  [list() for _ in self._chem_mapping_chem_mapping])
369 
370  return (self._get_repr_input_get_repr_input[mdl_ligand.hash_code][1],
371  self._chem_group_alns_chem_group_alns,
372  self._mdl_chains_without_chem_mapping_mdl_chains_without_chem_mapping,
373  self._get_repr_input_get_repr_input[mdl_ligand.hash_code][0])
374 
375 
376 def SCRMSD(model_ligand, target_ligand, transformation=geom.Mat4(),
377  substructure_match=False, max_symmetries=1e6):
378  """Calculate symmetry-corrected RMSD.
379 
380  Binding site superposition must be computed separately and passed as
381  `transformation`.
382 
383  :param model_ligand: The model ligand
384  :type model_ligand: :class:`ost.mol.ResidueHandle` or
385  :class:`ost.mol.ResidueView`
386  :param target_ligand: The target ligand
387  :type target_ligand: :class:`ost.mol.ResidueHandle` or
388  :class:`ost.mol.ResidueView`
389  :param transformation: Optional transformation to apply on each atom
390  position of model_ligand.
391  :type transformation: :class:`ost.geom.Mat4`
392  :param substructure_match: Set this to True to allow partial target
393  ligand.
394  :type substructure_match: :class:`bool`
395  :param max_symmetries: If more than that many isomorphisms exist, raise
396  a :class:`TooManySymmetriesError`. This can only be assessed by
397  generating at least that many isomorphisms and can take some time.
398  :type max_symmetries: :class:`int`
399  :rtype: :class:`float`
400  :raises: :class:`ost.mol.alg.ligand_scoring_base.NoSymmetryError` when no
401  symmetry can be found,
402  :class:`ost.mol.alg.ligand_scoring_base.DisconnectedGraphError`
403  when ligand graph is disconnected,
404  :class:`ost.mol.alg.ligand_scoring_base.TooManySymmetriesError`
405  when more than *max_symmetries* isomorphisms are found.
406  """
407 
408  symmetries = ligand_scoring_base.ComputeSymmetries(model_ligand,
409  target_ligand,
410  substructure_match=substructure_match,
411  by_atom_index=True,
412  max_symmetries=max_symmetries)
413  return _SCRMSD_symmetries(symmetries, model_ligand, target_ligand,
414  transformation)
415 
416 
417 def _SCRMSD_symmetries(symmetries, model_ligand, target_ligand,
418  transformation):
419  """Compute SCRMSD with pre-computed symmetries. Internal. """
420 
421  # setup numpy positions for model ligand and apply transformation
422  mdl_ligand_pos = np.ones((model_ligand.GetAtomCount(), 4))
423  for a_idx, a in enumerate(model_ligand.atoms):
424  p = a.GetPos()
425  mdl_ligand_pos[a_idx, 0] = p[0]
426  mdl_ligand_pos[a_idx, 1] = p[1]
427  mdl_ligand_pos[a_idx, 2] = p[2]
428  np_transformation = np.zeros((4,4))
429  for i in range(4):
430  for j in range(4):
431  np_transformation[i,j] = transformation[i,j]
432  mdl_ligand_pos = mdl_ligand_pos.dot(np_transformation.T)[:,:3]
433 
434  # setup numpy positions for target ligand
435  trg_ligand_pos = np.zeros((target_ligand.GetAtomCount(), 3))
436  for a_idx, a in enumerate(target_ligand.atoms):
437  p = a.GetPos()
438  trg_ligand_pos[a_idx, 0] = p[0]
439  trg_ligand_pos[a_idx, 1] = p[1]
440  trg_ligand_pos[a_idx, 2] = p[2]
441 
442  # position matrices to iterate symmetries
443  # there is a guarantee that
444  # target_ligand.GetAtomCount() <= model_ligand.GetAtomCount()
445  # and that each target ligand atom is part of every symmetry
446  # => target_ligand.GetAtomCount() is size of both position matrices
447  rmsd_mdl_pos = np.zeros((target_ligand.GetAtomCount(), 3))
448  rmsd_trg_pos = np.zeros((target_ligand.GetAtomCount(), 3))
449 
450  # iterate symmetries and find the one with lowest RMSD
451  best_rmsd = np.inf
452  for i, (trg_sym, mdl_sym) in enumerate(symmetries):
453  for idx, (mdl_anum, trg_anum) in enumerate(zip(mdl_sym, trg_sym)):
454  rmsd_mdl_pos[idx,:] = mdl_ligand_pos[mdl_anum, :]
455  rmsd_trg_pos[idx,:] = trg_ligand_pos[trg_anum, :]
456  rmsd = np.sqrt(((rmsd_mdl_pos - rmsd_trg_pos)**2).sum(-1).mean())
457  if rmsd < best_rmsd:
458  best_rmsd = rmsd
459 
460  return best_rmsd
461 
462 # specify public interface
463 __all__ = ('SCRMSDScorer', 'SCRMSD')
def __init__(self, model, target, model_ligands, target_ligands, resnum_alignments=False, rename_ligand_chain=False, substructure_match=False, coverage_delta=0.2, max_symmetries=1e5, bs_radius=4.0, lddt_lp_radius=15.0, model_bs_radius=25, binding_sites_topn=100000, full_bs_search=False, min_pep_length=6, min_nuc_length=4, pep_seqid_thr=95., nuc_seqid_thr=95., mdl_map_pep_seqid_thr=0., mdl_map_nuc_seqid_thr=0., seqres=None, trg_seqres_mapping=None)
def _get_repr(self, target_ligand, model_ligand)
def SCRMSD(model_ligand, target_ligand, transformation=geom.Mat4(), substructure_match=False, max_symmetries=1e6)