OpenStructure
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Modules Pages
ligand_scoring_scrmsd.py
Go to the documentation of this file.
1 import numpy as np
2 
3 from ost import LogWarning, LogScript, LogInfo, LogVerbose
4 from ost import geom
5 from ost import mol
6 
7 from ost.mol.alg import ligand_scoring_base
8 
9 
11  """ :class:`LigandScorer` implementing symmetry corrected RMSD (BiSyRMSD).
12 
13  :class:`SCRMSDScorer` computes a score for a specific pair of target/model
14  ligands.
15 
16  The returned RMSD is based on a binding site superposition.
17  The binding site of the target structure is defined as all residues with at
18  least one atom within `bs_radius` around the target ligand.
19  It only contains protein and nucleic acid residues from chains that
20  pass the criteria for the
21  :class:`chain mapping <ost.mol.alg.chain_mapping>`. This means ignoring
22  other ligands, waters, short polymers as well as any incorrectly connected
23  chains that may be in proximity.
24  The respective model binding site for superposition is identified by
25  naively enumerating all possible mappings of model chains onto their
26  chemically equivalent target counterparts from the target binding site.
27  The `binding_sites_topn` with respect to lDDT score are evaluated and
28  an RMSD is computed.
29  You can either try to map ALL model chains onto the target binding site by
30  enabling `full_bs_search` or restrict the model chains for a specific
31  target/model ligand pair to the chains with at least one atom within
32  *model_bs_radius* around the model ligand. The latter can be significantly
33  faster in case of large complexes.
34  Symmetry correction is achieved by simply computing an RMSD value for
35  each symmetry, i.e. atom-atom assignments of the ligand as given by
36  :class:`LigandScorer`. The lowest RMSD value is returned.
37 
38  Populates :attr:`LigandScorer.aux_data` with following :class:`dict` keys:
39 
40  * rmsd: The BiSyRMSD score
41  * lddt_lp: lDDT of the binding pocket used for superposition (lDDT-LP)
42  * bs_ref_res: :class:`list` of binding site residues in target
43  * bs_ref_res_mapped: :class:`list` of target binding site residues that
44  are mapped to model
45  * bs_mdl_res_mapped: :class:`list` of same length with respective model
46  residues
47  * bb_rmsd: Backbone RMSD (CA, C3' for nucleotides; full backbone for
48  binding sites with fewer than 3 residues) for mapped binding site
49  residues after superposition
50  * target_ligand: The actual target ligand for which the score was computed
51  * model_ligand: The actual model ligand for which the score was computed
52  * chain_mapping: :class:`dict` with a chain mapping of chains involved in
53  binding site - key: trg chain name, value: mdl chain name
54  * transform: :class:`geom.Mat4` to transform model binding site onto target
55  binding site
56  * inconsistent_residues: :class:`list` of :class:`tuple` representing
57  residues with inconsistent residue names upon mapping (which is given by
58  bs_ref_res_mapped and bs_mdl_res_mapped). Tuples have two elements:
59  1) trg residue 2) mdl residue
60 
61  :param model: Passed to parent constructor - see :class:`LigandScorer`.
62  :type model: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
63  :param target: Passed to parent constructor - see :class:`LigandScorer`.
64  :type target: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
65  :param model_ligands: Passed to parent constructor - see
66  :class:`LigandScorer`.
67  :type model_ligands: :class:`list`
68  :param target_ligands: Passed to parent constructor - see
69  :class:`LigandScorer`.
70  :type target_ligands: :class:`list`
71  :param resnum_alignments: Passed to parent constructor - see
72  :class:`LigandScorer`.
73  :type resnum_alignments: :class:`bool`
74  :param rename_ligand_chain: Passed to parent constructor - see
75  :class:`LigandScorer`.
76  :type rename_ligand_chain: :class:`bool`
77  :param substructure_match: Passed to parent constructor - see
78  :class:`LigandScorer`.
79  :type substructure_match: :class:`bool`
80  :param coverage_delta: Passed to parent constructor - see
81  :class:`LigandScorer`.
82  :type coverage_delta: :class:`float`
83  :param max_symmetries: Passed to parent constructor - see
84  :class:`LigandScorer`.
85  :type max_symmetries: :class:`int`
86  :param bs_radius: Inclusion radius for the binding site. Residues with
87  atoms within this distance of the ligand will be considered
88  for inclusion in the binding site.
89  :type bs_radius: :class:`float`
90  :param lddt_lp_radius: lDDT inclusion radius for lDDT-LP.
91  :type lddt_lp_radius: :class:`float`
92  :param model_bs_radius: inclusion radius for model binding sites.
93  Only used when full_bs_search=False, otherwise the
94  radius is effectively infinite. Only chains with
95  atoms within this distance of a model ligand will
96  be considered in the chain mapping.
97  :type model_bs_radius: :class:`float`
98  :param binding_sites_topn: maximum number of model binding site
99  representations to assess per target binding
100  site.
101  :type binding_sites_topn: :class:`int`
102  :param full_bs_search: If True, all potential binding sites in the model
103  are searched for each target binding site. If False,
104  the search space in the model is reduced to chains
105  around (`model_bs_radius` Å) model ligands.
106  This speeds up computations, but may result in
107  ligands not being scored if the predicted ligand
108  pose is too far from the actual binding site.
109  :type full_bs_search: :class:`bool`
110  :param min_pep_length: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`.
111  :type min_pep_length: :class:`int`
112  :param min_nuc_length: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
113  :type min_nuc_length: :class:`int`
114  :param pep_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
115  :type pep_seqid_thr: :class:`float`
116  :param nuc_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
117  :type nuc_seqid_thr: :class:`float`
118  :param mdl_map_pep_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
119  :type mdl_map_pep_seqid_thr: :class:`float`
120  :param mdl_map_nuc_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
121  :type mdl_map_nuc_seqid_thr: :class:`float`
122  """
123  def __init__(self, model, target, model_ligands, target_ligands,
124  resnum_alignments=False, rename_ligand_chain=False,
125  substructure_match=False, coverage_delta=0.2,
126  max_symmetries=1e5, bs_radius=4.0, lddt_lp_radius=15.0,
127  model_bs_radius=25, binding_sites_topn=100000,
128  full_bs_search=False, min_pep_length = 6,
129  min_nuc_length = 4, pep_seqid_thr = 95.,
130  nuc_seqid_thr = 95.,
131  mdl_map_pep_seqid_thr = 0.,
132  mdl_map_nuc_seqid_thr = 0.):
133 
134  super().__init__(model, target, model_ligands, target_ligands,
135  resnum_alignments = resnum_alignments,
136  rename_ligand_chain = rename_ligand_chain,
137  substructure_match = substructure_match,
138  coverage_delta = coverage_delta,
139  max_symmetries = max_symmetries,
140  min_pep_length = min_pep_length,
141  min_nuc_length = min_nuc_length,
142  pep_seqid_thr = pep_seqid_thr,
143  nuc_seqid_thr = nuc_seqid_thr,
144  mdl_map_pep_seqid_thr = mdl_map_pep_seqid_thr,
145  mdl_map_nuc_seqid_thr = mdl_map_nuc_seqid_thr)
146 
147  self.bs_radiusbs_radius = bs_radius
148  self.lddt_lp_radiuslddt_lp_radius = lddt_lp_radius
149  self.model_bs_radiusmodel_bs_radius = model_bs_radius
150  self.binding_sites_topnbinding_sites_topn = binding_sites_topn
151  self.full_bs_searchfull_bs_search = full_bs_search
152 
153  # Residues that are in contact with a ligand => binding site
154  # defined as all residues with at least one atom within self.radius
155  # key: ligand.handle.hash_code, value: EntityView of whatever
156  # entity ligand belongs to
157  self._binding_sites_binding_sites = dict()
158 
159  # cache for GetRepr chain mapping calls
160  self._repr_repr = dict()
161 
162  # lazily precomputed variables to speedup GetRepr chain mapping calls
163  # for localized GetRepr searches
164  self._get_repr_input_get_repr_input = dict()
165 
166  # update state decoding from parent with subclass specific stuff
167  self.state_decodingstate_decoding[10] = ("target_binding_site",
168  "No residues were in proximity of the "
169  "target ligand.")
170  self.state_decodingstate_decoding[11] = ("model_binding_site", "Binding site was not"
171  " found in the model, i.e. the binding site"
172  " was not modeled or the model ligand was "
173  "positioned too far in combination with "
174  "full_bs_search=False.")
175  self.state_decodingstate_decoding[20] = ("unknown",
176  "Unknown error occured in SCRMSDScorer")
177 
178  def _compute(self, symmetries, target_ligand, model_ligand):
179  """ Implements interface from parent
180  """
181  # set default to invalid scores
182  best_rmsd_result = {"rmsd": None,
183  "lddt_lp": None,
184  "bs_ref_res": list(),
185  "bs_ref_res_mapped": list(),
186  "bs_mdl_res_mapped": list(),
187  "bb_rmsd": None,
188  "target_ligand": target_ligand,
189  "model_ligand": model_ligand,
190  "chain_mapping": dict(),
191  "transform": geom.Mat4(),
192  "inconsistent_residues": list()}
193 
194  representations = self._get_repr_get_repr(target_ligand, model_ligand)
195  # This step can be slow so give some hints in logs
196  msg = "Computing BiSyRMSD with %d chain mappings" % len(representations)
197  (LogWarning if len(representations) > 10000 else LogInfo)(msg)
198 
199  for r in representations:
200  rmsd = _SCRMSD_symmetries(symmetries, model_ligand,
201  target_ligand, transformation=r.transform)
202 
203  if best_rmsd_result["rmsd"] is None or \
204  rmsd < best_rmsd_result["rmsd"]:
205  best_rmsd_result = {"rmsd": rmsd,
206  "lddt_lp": r.lDDT,
207  "bs_ref_res": r.substructure.residues,
208  "bs_ref_res_mapped": r.ref_residues,
209  "bs_mdl_res_mapped": r.mdl_residues,
210  "bb_rmsd": r.bb_rmsd,
211  "target_ligand": target_ligand,
212  "model_ligand": model_ligand,
213  "chain_mapping": r.GetFlatChainMapping(),
214  "transform": r.transform,
215  "inconsistent_residues":
216  r.inconsistent_residues}
217 
218  target_ligand_state = 0
219  model_ligand_state = 0
220  pair_state = 0
221 
222  if best_rmsd_result["rmsd"] is not None:
223  best_rmsd = best_rmsd_result["rmsd"]
224  else:
225  # try to identify error states
226  best_rmsd = np.nan
227  error_state = 20 # unknown error
228  N = self._get_target_binding_site_get_target_binding_site(target_ligand).GetResidueCount()
229  if N == 0:
230  pair_state = 6 # binding_site
231  target_ligand_state = 10
232  elif len(representations) == 0:
233  pair_state = 11
234 
235  return (best_rmsd, pair_state, target_ligand_state, model_ligand_state,
236  best_rmsd_result)
237 
238  def _score_dir(self):
239  """ Implements interface from parent
240  """
241  return '-'
242 
243  def _get_repr(self, target_ligand, model_ligand):
244 
245  key = None
246  if self.full_bs_searchfull_bs_search:
247  # all possible binding sites, independent from actual model ligand
248  key = (target_ligand.handle.hash_code, 0)
249  else:
250  key = (target_ligand.handle.hash_code,
251  model_ligand.handle.hash_code)
252 
253  if key not in self._repr_repr:
254  ref_bs = self._get_target_binding_site_get_target_binding_site(target_ligand)
255  LogVerbose("%d chains are in proximity of the target ligand: %s" % (
256  ref_bs.chain_count, ", ".join([c.name for c in ref_bs.chains])))
257  if self.full_bs_searchfull_bs_search:
258  reprs = self._chain_mapper_chain_mapper.GetRepr(
259  ref_bs, self.modelmodel, inclusion_radius=self.lddt_lp_radiuslddt_lp_radius,
260  topn=self.binding_sites_topnbinding_sites_topn)
261  else:
262  repr_in = self._get_get_repr_input_get_get_repr_input(model_ligand)
263  radius = self.lddt_lp_radiuslddt_lp_radius
264  reprs = self._chain_mapper_chain_mapper.GetRepr(ref_bs, self.modelmodel,
265  inclusion_radius=radius,
266  topn=self.binding_sites_topnbinding_sites_topn,
267  chem_mapping_result=repr_in)
268  self._repr_repr[key] = reprs
269 
270  return self._repr_repr[key]
271 
272  def _get_target_binding_site(self, target_ligand):
273 
274  if target_ligand.handle.hash_code not in self._binding_sites_binding_sites:
275 
276  # create view of reference binding site
277  ref_residues_hashes = set() # helper to keep track of added residues
278  ignored_residue_hashes = {target_ligand.hash_code}
279  for ligand_at in target_ligand.atoms:
280  close_atoms = self.targettarget.FindWithin(ligand_at.GetPos(),
281  self.bs_radiusbs_radius)
282  for close_at in close_atoms:
283  # Skip any residue not in the chain mapping target
284  ref_res = close_at.GetResidue()
285  h = ref_res.handle.GetHashCode()
286  if h not in ref_residues_hashes and \
287  h not in ignored_residue_hashes:
288  with ligand_scoring_base._SinkVerbosityLevel(1):
289  view = self._chain_mapper_chain_mapper.target.ViewForHandle(ref_res)
290  if view.IsValid():
291  h = ref_res.handle.GetHashCode()
292  ref_residues_hashes.add(h)
293  elif ref_res.is_ligand:
294  msg = f"Ignoring ligand {ref_res.qualified_name} "
295  msg += "in binding site of "
296  msg += str(target_ligand.qualified_name)
297  LogWarning(msg)
298  ignored_residue_hashes.add(h)
299  elif ref_res.chem_type == mol.ChemType.WATERS:
300  pass # That's ok, no need to warn
301  else:
302  msg = f"Ignoring residue {ref_res.qualified_name} "
303  msg += "in binding site of "
304  msg += str(target_ligand.qualified_name)
305  LogWarning(msg)
306  ignored_residue_hashes.add(h)
307 
308  ref_bs = self.targettarget.CreateEmptyView()
309  if ref_residues_hashes:
310  # reason for doing that separately is to guarantee same ordering
311  # of residues as in underlying entity. (Reorder by ResNum seems
312  # only available on ChainHandles)
313  for ch in self.targettarget.chains:
314  for r in ch.residues:
315  if r.handle.GetHashCode() in ref_residues_hashes:
316  ref_bs.AddResidue(r, mol.ViewAddFlag.INCLUDE_ALL)
317  if len(ref_bs.residues) == 0:
318  raise RuntimeError("Failed to add proximity residues to "
319  "the reference binding site entity")
320 
321  self._binding_sites_binding_sites[target_ligand.handle.hash_code] = ref_bs
322 
323  return self._binding_sites_binding_sites[target_ligand.handle.hash_code]
324 
325  def _get_get_repr_input(self, mdl_ligand):
326  if mdl_ligand.handle.hash_code not in self._get_repr_input_get_repr_input:
327 
328  # figure out what chains in the model are in contact with the ligand
329  # that may give a non-zero contribution to lDDT in
330  # chain_mapper.GetRepr
331  radius = self.model_bs_radiusmodel_bs_radius
332  chains = set()
333  for at in mdl_ligand.atoms:
334  with ligand_scoring_base._SinkVerbosityLevel(1):
335  close_atoms = self._chain_mapping_mdl_chain_mapping_mdl.FindWithin(at.GetPos(),
336  radius)
337  for close_at in close_atoms:
338  chains.add(close_at.GetChain().GetName())
339 
340  if len(chains) > 0:
341  LogVerbose("%d chains are in proximity of the model ligand: %s" % (
342  len(chains), ", ".join(chains)))
343 
344  # the chain mapping model which only contains close chains
345  query = "cname="
346  query += ','.join([mol.QueryQuoteName(x) for x in chains])
347  mdl = self._chain_mapping_mdl_chain_mapping_mdl.Select(query)
348 
349  # chem mapping which is reduced to the respective chains
350  chem_mapping = list()
351  for m in self._chem_mapping_chem_mapping:
352  chem_mapping.append([x for x in m if x in chains])
353 
354  self._get_repr_input_get_repr_input[mdl_ligand.handle.hash_code] = \
355  (mdl, chem_mapping)
356 
357  else:
358  self._get_repr_input_get_repr_input[mdl_ligand.handle.hash_code] = \
359  (self._chain_mapping_mdl_chain_mapping_mdl.CreateEmptyView(),
360  [list() for _ in self._chem_mapping_chem_mapping])
361 
362  return (self._get_repr_input_get_repr_input[mdl_ligand.hash_code][1],
363  self._chem_group_alns_chem_group_alns,
364  self._unmapped_mdl_chains_unmapped_mdl_chains,
365  self._get_repr_input_get_repr_input[mdl_ligand.hash_code][0])
366 
367 
368 def SCRMSD(model_ligand, target_ligand, transformation=geom.Mat4(),
369  substructure_match=False, max_symmetries=1e6):
370  """Calculate symmetry-corrected RMSD.
371 
372  Binding site superposition must be computed separately and passed as
373  `transformation`.
374 
375  :param model_ligand: The model ligand
376  :type model_ligand: :class:`ost.mol.ResidueHandle` or
377  :class:`ost.mol.ResidueView`
378  :param target_ligand: The target ligand
379  :type target_ligand: :class:`ost.mol.ResidueHandle` or
380  :class:`ost.mol.ResidueView`
381  :param transformation: Optional transformation to apply on each atom
382  position of model_ligand.
383  :type transformation: :class:`ost.geom.Mat4`
384  :param substructure_match: Set this to True to allow partial target
385  ligand.
386  :type substructure_match: :class:`bool`
387  :param max_symmetries: If more than that many isomorphisms exist, raise
388  a :class:`TooManySymmetriesError`. This can only be assessed by
389  generating at least that many isomorphisms and can take some time.
390  :type max_symmetries: :class:`int`
391  :rtype: :class:`float`
392  :raises: :class:`ost.mol.alg.ligand_scoring_base.NoSymmetryError` when no
393  symmetry can be found,
394  :class:`ost.mol.alg.ligand_scoring_base.DisconnectedGraphError`
395  when ligand graph is disconnected,
396  :class:`ost.mol.alg.ligand_scoring_base.TooManySymmetriesError`
397  when more than *max_symmetries* isomorphisms are found.
398  """
399 
400  symmetries = ligand_scoring_base.ComputeSymmetries(model_ligand,
401  target_ligand,
402  substructure_match=substructure_match,
403  by_atom_index=True,
404  max_symmetries=max_symmetries)
405  return _SCRMSD_symmetries(symmetries, model_ligand, target_ligand,
406  transformation)
407 
408 
409 def _SCRMSD_symmetries(symmetries, model_ligand, target_ligand,
410  transformation):
411  """Compute SCRMSD with pre-computed symmetries. Internal. """
412 
413  # setup numpy positions for model ligand and apply transformation
414  mdl_ligand_pos = np.ones((model_ligand.GetAtomCount(), 4))
415  for a_idx, a in enumerate(model_ligand.atoms):
416  p = a.GetPos()
417  mdl_ligand_pos[a_idx, 0] = p[0]
418  mdl_ligand_pos[a_idx, 1] = p[1]
419  mdl_ligand_pos[a_idx, 2] = p[2]
420  np_transformation = np.zeros((4,4))
421  for i in range(4):
422  for j in range(4):
423  np_transformation[i,j] = transformation[i,j]
424  mdl_ligand_pos = mdl_ligand_pos.dot(np_transformation.T)[:,:3]
425 
426  # setup numpy positions for target ligand
427  trg_ligand_pos = np.zeros((target_ligand.GetAtomCount(), 3))
428  for a_idx, a in enumerate(target_ligand.atoms):
429  p = a.GetPos()
430  trg_ligand_pos[a_idx, 0] = p[0]
431  trg_ligand_pos[a_idx, 1] = p[1]
432  trg_ligand_pos[a_idx, 2] = p[2]
433 
434  # position matrices to iterate symmetries
435  # there is a guarantee that
436  # target_ligand.GetAtomCount() <= model_ligand.GetAtomCount()
437  # and that each target ligand atom is part of every symmetry
438  # => target_ligand.GetAtomCount() is size of both position matrices
439  rmsd_mdl_pos = np.zeros((target_ligand.GetAtomCount(), 3))
440  rmsd_trg_pos = np.zeros((target_ligand.GetAtomCount(), 3))
441 
442  # iterate symmetries and find the one with lowest RMSD
443  best_rmsd = np.inf
444  for i, (trg_sym, mdl_sym) in enumerate(symmetries):
445  for idx, (mdl_anum, trg_anum) in enumerate(zip(mdl_sym, trg_sym)):
446  rmsd_mdl_pos[idx,:] = mdl_ligand_pos[mdl_anum, :]
447  rmsd_trg_pos[idx,:] = trg_ligand_pos[trg_anum, :]
448  rmsd = np.sqrt(((rmsd_mdl_pos - rmsd_trg_pos)**2).sum(-1).mean())
449  if rmsd < best_rmsd:
450  best_rmsd = rmsd
451 
452  return best_rmsd
453 
454 # specify public interface
455 __all__ = ('SCRMSDScorer', 'SCRMSD')
def __init__(self, model, target, model_ligands, target_ligands, resnum_alignments=False, rename_ligand_chain=False, substructure_match=False, coverage_delta=0.2, max_symmetries=1e5, bs_radius=4.0, lddt_lp_radius=15.0, model_bs_radius=25, binding_sites_topn=100000, full_bs_search=False, min_pep_length=6, min_nuc_length=4, pep_seqid_thr=95., nuc_seqid_thr=95., mdl_map_pep_seqid_thr=0., mdl_map_nuc_seqid_thr=0.)
def _get_repr(self, target_ligand, model_ligand)
def SCRMSD(model_ligand, target_ligand, transformation=geom.Mat4(), substructure_match=False, max_symmetries=1e6)