OpenStructure
Loading...
Searching...
No Matches
ligand_scoring_scrmsd.py
Go to the documentation of this file.
1import numpy as np
2
3from ost import LogWarning, LogScript, LogInfo, LogVerbose
4from ost import geom
5from ost import mol
6
7from ost.mol.alg import ligand_scoring_base
8
9
11 """ :class:`LigandScorer` implementing symmetry corrected RMSD (BiSyRMSD).
12
13 :class:`SCRMSDScorer` computes a score for a specific pair of target/model
14 ligands.
15
16 The returned RMSD is based on a binding site superposition.
17 The binding site of the target structure is defined as all residues with at
18 least one atom within `bs_radius` around the target ligand.
19 It only contains protein and nucleic acid residues from chains that
20 pass the criteria for the
21 :class:`chain mapping <ost.mol.alg.chain_mapping>`. This means ignoring
22 other ligands, waters, short polymers as well as any incorrectly connected
23 chains that may be in proximity.
24 The respective model binding site for superposition is identified by
25 naively enumerating all possible mappings of model chains onto their
26 chemically equivalent target counterparts from the target binding site.
27 The `binding_sites_topn` with respect to lDDT score are evaluated and
28 an RMSD is computed.
29 You can either try to map ALL model chains onto the target binding site by
30 enabling `full_bs_search` or restrict the model chains for a specific
31 target/model ligand pair to the chains with at least one atom within
32 *model_bs_radius* around the model ligand. The latter can be significantly
33 faster in case of large complexes.
34 Symmetry correction is achieved by simply computing an RMSD value for
35 each symmetry, i.e. atom-atom assignments of the ligand as given by
36 :class:`LigandScorer`. The lowest RMSD value is returned.
37
38 Populates :attr:`LigandScorer.aux_data` with following :class:`dict` keys:
39
40 * rmsd: The BiSyRMSD score
41 * lddt_lp: lDDT of the binding pocket used for superposition (lDDT-LP)
42 * bs_ref_res: :class:`list` of binding site residues in target
43 * bs_ref_res_mapped: :class:`list` of target binding site residues that
44 are mapped to model
45 * bs_mdl_res_mapped: :class:`list` of same length with respective model
46 residues
47 * bb_rmsd: Backbone RMSD (CA, C3' for nucleotides; full backbone for
48 binding sites with fewer than 3 residues) for mapped binding site
49 residues after superposition
50 * target_ligand: The actual target ligand for which the score was computed
51 * model_ligand: The actual model ligand for which the score was computed
52 * chain_mapping: :class:`dict` with a chain mapping of chains involved in
53 binding site - key: trg chain name, value: mdl chain name
54 * transform: :class:`geom.Mat4` to transform model binding site onto target
55 binding site
56 * inconsistent_residues: :class:`list` of :class:`tuple` representing
57 residues with inconsistent residue names upon mapping (which is given by
58 bs_ref_res_mapped and bs_mdl_res_mapped). Tuples have two elements:
59 1) trg residue 2) mdl residue
60
61 :param model: Passed to parent constructor - see :class:`LigandScorer`.
62 :type model: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
63 :param target: Passed to parent constructor - see :class:`LigandScorer`.
64 :type target: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
65 :param model_ligands: Passed to parent constructor - see
66 :class:`LigandScorer`.
67 :type model_ligands: :class:`list`
68 :param target_ligands: Passed to parent constructor - see
69 :class:`LigandScorer`.
70 :type target_ligands: :class:`list`
71 :param resnum_alignments: Passed to parent constructor - see
72 :class:`LigandScorer`.
73 :type resnum_alignments: :class:`bool`
74 :param rename_ligand_chain: Passed to parent constructor - see
75 :class:`LigandScorer`.
76 :type rename_ligand_chain: :class:`bool`
77 :param substructure_match: Passed to parent constructor - see
78 :class:`LigandScorer`.
79 :type substructure_match: :class:`bool`
80 :param coverage_delta: Passed to parent constructor - see
81 :class:`LigandScorer`.
82 :type coverage_delta: :class:`float`
83 :param max_symmetries: Passed to parent constructor - see
84 :class:`LigandScorer`.
85 :type max_symmetries: :class:`int`
86 :param bs_radius: Inclusion radius for the binding site. Residues with
87 atoms within this distance of the ligand will be considered
88 for inclusion in the binding site.
89 :type bs_radius: :class:`float`
90 :param lddt_lp_radius: lDDT inclusion radius for lDDT-LP.
91 :type lddt_lp_radius: :class:`float`
92 :param model_bs_radius: inclusion radius for model binding sites.
93 Only used when full_bs_search=False, otherwise the
94 radius is effectively infinite. Only chains with
95 atoms within this distance of a model ligand will
96 be considered in the chain mapping.
97 :type model_bs_radius: :class:`float`
98 :param binding_sites_topn: maximum number of model binding site
99 representations to assess per target binding
100 site.
101 :type binding_sites_topn: :class:`int`
102 :param full_bs_search: If True, all potential binding sites in the model
103 are searched for each target binding site. If False,
104 the search space in the model is reduced to chains
105 around (`model_bs_radius` Å) model ligands.
106 This speeds up computations, but may result in
107 ligands not being scored if the predicted ligand
108 pose is too far from the actual binding site.
109 :type full_bs_search: :class:`bool`
110 :param min_pep_length: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`.
111 :type min_pep_length: :class:`int`
112 :param min_nuc_length: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
113 :type min_nuc_length: :class:`int`
114 :param pep_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
115 :type pep_seqid_thr: :class:`float`
116 :param nuc_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
117 :type nuc_seqid_thr: :class:`float`
118 :param mdl_map_pep_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
119 :type mdl_map_pep_seqid_thr: :class:`float`
120 :param mdl_map_nuc_seqid_thr: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
121 :type mdl_map_nuc_seqid_thr: :class:`float`
122 :param seqres: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
123 :type seqres: :class:`ost.seq.SequenceList`
124 :param trg_seqres_mapping: See :class:`ost.mol.alg.ligand_scoring_base.LigandScorer`
125 :type trg_seqres_mapping: :class:`dict`
126 """
127 def __init__(self, model, target, model_ligands, target_ligands,
128 resnum_alignments=False, rename_ligand_chain=False,
129 substructure_match=False, coverage_delta=0.2,
130 max_symmetries=1e5, bs_radius=4.0, lddt_lp_radius=15.0,
131 model_bs_radius=25, binding_sites_topn=100000,
132 full_bs_search=False, min_pep_length = 6,
133 min_nuc_length = 4, pep_seqid_thr = 95.,
134 nuc_seqid_thr = 95.,
135 mdl_map_pep_seqid_thr = 0.,
136 mdl_map_nuc_seqid_thr = 0.,
137 seqres=None,
138 trg_seqres_mapping=None):
139
140 super().__init__(model, target, model_ligands, target_ligands,
141 resnum_alignments = resnum_alignments,
142 rename_ligand_chain = rename_ligand_chain,
143 substructure_match = substructure_match,
144 coverage_delta = coverage_delta,
145 max_symmetries = max_symmetries,
146 min_pep_length = min_pep_length,
147 min_nuc_length = min_nuc_length,
148 pep_seqid_thr = pep_seqid_thr,
149 nuc_seqid_thr = nuc_seqid_thr,
150 mdl_map_pep_seqid_thr = mdl_map_pep_seqid_thr,
151 mdl_map_nuc_seqid_thr = mdl_map_nuc_seqid_thr,
152 seqres = seqres,
153 trg_seqres_mapping = trg_seqres_mapping)
154
155 self.bs_radius = bs_radius
156 self.lddt_lp_radius = lddt_lp_radius
157 self.model_bs_radius = model_bs_radius
158 self.binding_sites_topn = binding_sites_topn
159 self.full_bs_search = full_bs_search
160
161 # Residues that are in contact with a ligand => binding site
162 # defined as all residues with at least one atom within self.radius
163 # key: ligand.handle.hash_code, value: EntityView of whatever
164 # entity ligand belongs to
165 self._binding_sites = dict()
166
167 # cache for GetRepr chain mapping calls
168 self._repr = dict()
169
170 # lazily precomputed variables to speedup GetRepr chain mapping calls
171 # for localized GetRepr searches
172 self._get_repr_input = dict()
173
174 # update state decoding from parent with subclass specific stuff
175 self.state_decoding[10] = ("target_binding_site",
176 "No residues were in proximity of the "
177 "target ligand.")
178 self.state_decoding[11] = ("model_binding_site", "Binding site was not"
179 " found in the model, i.e. the binding site"
180 " was not modeled or the model ligand was "
181 "positioned too far in combination with "
182 "full_bs_search=False.")
183 self.state_decoding[20] = ("unknown",
184 "Unknown error occured in SCRMSDScorer")
185
186 def _compute(self, symmetries, target_ligand, model_ligand):
187 """ Implements interface from parent
188 """
189 # set default to invalid scores
190 best_rmsd_result = {"rmsd": None,
191 "lddt_lp": None,
192 "bs_ref_res": list(),
193 "bs_ref_res_mapped": list(),
194 "bs_mdl_res_mapped": list(),
195 "bb_rmsd": None,
196 "target_ligand": target_ligand,
197 "model_ligand": model_ligand,
198 "chain_mapping": dict(),
199 "transform": geom.Mat4(),
200 "inconsistent_residues": list()}
201
202 representations = self._get_repr(target_ligand, model_ligand)
203 # This step can be slow so give some hints in logs
204 msg = "Computing BiSyRMSD with %d chain mappings" % len(representations)
205 (LogWarning if len(representations) > 10000 else LogInfo)(msg)
206
207 for r in representations:
208 rmsd = _SCRMSD_symmetries(symmetries, model_ligand,
209 target_ligand, transformation=r.transform)
210
211 if best_rmsd_result["rmsd"] is None or \
212 rmsd < best_rmsd_result["rmsd"]:
213 best_rmsd_result = {"rmsd": rmsd,
214 "lddt_lp": r.lDDT,
215 "bs_ref_res": r.substructure.residues,
216 "bs_ref_res_mapped": r.ref_residues,
217 "bs_mdl_res_mapped": r.mdl_residues,
218 "bb_rmsd": r.bb_rmsd,
219 "target_ligand": target_ligand,
220 "model_ligand": model_ligand,
221 "chain_mapping": r.GetFlatChainMapping(),
222 "transform": r.transform,
223 "inconsistent_residues":
224 r.inconsistent_residues}
225
226 target_ligand_state = 0
227 model_ligand_state = 0
228 pair_state = 0
229
230 if best_rmsd_result["rmsd"] is not None:
231 best_rmsd = best_rmsd_result["rmsd"]
232 else:
233 # try to identify error states
234 best_rmsd = np.nan
235 error_state = 20 # unknown error
236 N = self._get_target_binding_site(target_ligand).GetResidueCount()
237 if N == 0:
238 pair_state = 6 # binding_site
239 target_ligand_state = 10
240 elif len(representations) == 0:
241 pair_state = 11
242
243 return (best_rmsd, pair_state, target_ligand_state, model_ligand_state,
244 best_rmsd_result)
245
246 def _score_dir(self):
247 """ Implements interface from parent
248 """
249 return '-'
250
251 def _get_repr(self, target_ligand, model_ligand):
252
253 key = None
254 if self.full_bs_search:
255 # all possible binding sites, independent from actual model ligand
256 key = (target_ligand.handle.hash_code, 0)
257 else:
258 key = (target_ligand.handle.hash_code,
259 model_ligand.handle.hash_code)
260
261 if key not in self._repr:
262 ref_bs = self._get_target_binding_site(target_ligand)
263 LogVerbose("%d chains are in proximity of the target ligand: %s" % (
264 ref_bs.chain_count, ", ".join([c.name for c in ref_bs.chains])))
265 if self.full_bs_search:
266 reprs = self._chain_mapper.GetRepr(
267 ref_bs, self.modelmodelmodel, inclusion_radius=self.lddt_lp_radius,
268 topn=self.binding_sites_topn)
269 else:
270 repr_in = self._get_get_repr_input(model_ligand)
271 radius = self.lddt_lp_radius
272 reprs = self._chain_mapper.GetRepr(ref_bs, self.modelmodelmodel,
273 inclusion_radius=radius,
274 topn=self.binding_sites_topn,
275 chem_mapping_result=repr_in)
276 self._repr[key] = reprs
277
278 return self._repr[key]
279
280 def _get_target_binding_site(self, target_ligand):
281
282 if target_ligand.handle.hash_code not in self._binding_sites:
283
284 # create view of reference binding site
285 ref_residues_hashes = set() # helper to keep track of added residues
286 ignored_residue_hashes = {target_ligand.hash_code}
287 for ligand_at in target_ligand.atoms:
288 close_atoms = self.targettarget.FindWithin(ligand_at.GetPos(),
289 self.bs_radius)
290 for close_at in close_atoms:
291 # Skip any residue not in the chain mapping target
292 ref_res = close_at.GetResidue()
293 h = ref_res.handle.GetHashCode()
294 if h not in ref_residues_hashes and \
295 h not in ignored_residue_hashes:
296 with ligand_scoring_base._SinkVerbosityLevel(1):
297 view = self._chain_mapper.target.ViewForHandle(ref_res)
298 if view.IsValid():
299 h = ref_res.handle.GetHashCode()
300 ref_residues_hashes.add(h)
301 elif ref_res.is_ligand:
302 msg = f"Ignoring ligand {ref_res.qualified_name} "
303 msg += "in binding site of "
304 msg += str(target_ligand.qualified_name)
305 LogWarning(msg)
306 ignored_residue_hashes.add(h)
307 elif ref_res.chem_type == mol.ChemType.WATERS:
308 pass # That's ok, no need to warn
309 else:
310 msg = f"Ignoring residue {ref_res.qualified_name} "
311 msg += "in binding site of "
312 msg += str(target_ligand.qualified_name)
313 LogWarning(msg)
314 ignored_residue_hashes.add(h)
315
316 ref_bs = self.targettarget.CreateEmptyView()
317 if ref_residues_hashes:
318 # reason for doing that separately is to guarantee same ordering
319 # of residues as in underlying entity. (Reorder by ResNum seems
320 # only available on ChainHandles)
321 for ch in self.targettarget.chains:
322 for r in ch.residues:
323 if r.handle.GetHashCode() in ref_residues_hashes:
324 ref_bs.AddResidue(r, mol.ViewAddFlag.INCLUDE_ALL)
325 if len(ref_bs.residues) == 0:
326 raise RuntimeError("Failed to add proximity residues to "
327 "the reference binding site entity")
328
329 self._binding_sites[target_ligand.handle.hash_code] = ref_bs
330
331 return self._binding_sites[target_ligand.handle.hash_code]
332
333 def _get_get_repr_input(self, mdl_ligand):
334 if mdl_ligand.handle.hash_code not in self._get_repr_input:
335
336 # figure out what chains in the model are in contact with the ligand
337 # that may give a non-zero contribution to lDDT in
338 # chain_mapper.GetRepr
339 radius = self.model_bs_radius
340 chains = set()
341 for at in mdl_ligand.atoms:
342 with ligand_scoring_base._SinkVerbosityLevel(1):
343 close_atoms = self._chain_mapping_mdl.FindWithin(at.GetPos(),
344 radius)
345 for close_at in close_atoms:
346 chains.add(close_at.GetChain().GetName())
347
348 if len(chains) > 0:
349 LogVerbose("%d chains are in proximity of the model ligand: %s" % (
350 len(chains), ", ".join(chains)))
351
352 # the chain mapping model which only contains close chains
353 query = "cname="
354 query += ','.join([mol.QueryQuoteName(x) for x in chains])
355 mdl = self._chain_mapping_mdl.Select(query)
356
357 # chem mapping which is reduced to the respective chains
358 chem_mapping = list()
359 for m in self._chem_mapping_chem_mapping:
360 chem_mapping.append([x for x in m if x in chains])
361
362 self._get_repr_input[mdl_ligand.handle.hash_code] = \
363 (mdl, chem_mapping)
364
365 else:
366 self._get_repr_input[mdl_ligand.handle.hash_code] = \
367 (self._chain_mapping_mdl.CreateEmptyView(),
368 [list() for _ in self._chem_mapping_chem_mapping])
369
370 return (self._get_repr_input[mdl_ligand.hash_code][1],
373 self._get_repr_input[mdl_ligand.hash_code][0])
374
375
376def SCRMSD(model_ligand, target_ligand, transformation=geom.Mat4(),
377 substructure_match=False, max_symmetries=1e6):
378 """Calculate symmetry-corrected RMSD.
379
380 Binding site superposition must be computed separately and passed as
381 `transformation`.
382
383 :param model_ligand: The model ligand
384 :type model_ligand: :class:`ost.mol.ResidueHandle` or
385 :class:`ost.mol.ResidueView`
386 :param target_ligand: The target ligand
387 :type target_ligand: :class:`ost.mol.ResidueHandle` or
388 :class:`ost.mol.ResidueView`
389 :param transformation: Optional transformation to apply on each atom
390 position of model_ligand.
391 :type transformation: :class:`ost.geom.Mat4`
392 :param substructure_match: Set this to True to allow partial target
393 ligand.
394 :type substructure_match: :class:`bool`
395 :param max_symmetries: If more than that many isomorphisms exist, raise
396 a :class:`TooManySymmetriesError`. This can only be assessed by
397 generating at least that many isomorphisms and can take some time.
398 :type max_symmetries: :class:`int`
399 :rtype: :class:`float`
400 :raises: :class:`ost.mol.alg.ligand_scoring_base.NoSymmetryError` when no
401 symmetry can be found,
402 :class:`ost.mol.alg.ligand_scoring_base.DisconnectedGraphError`
403 when ligand graph is disconnected,
404 :class:`ost.mol.alg.ligand_scoring_base.TooManySymmetriesError`
405 when more than *max_symmetries* isomorphisms are found.
406 """
407
408 symmetries = ligand_scoring_base.ComputeSymmetries(model_ligand,
409 target_ligand,
410 substructure_match=substructure_match,
411 by_atom_index=True,
412 max_symmetries=max_symmetries)
413 return _SCRMSD_symmetries(symmetries, model_ligand, target_ligand,
414 transformation)
415
416
417def _SCRMSD_symmetries(symmetries, model_ligand, target_ligand,
418 transformation):
419 """Compute SCRMSD with pre-computed symmetries. Internal. """
420
421 # setup numpy positions for model ligand and apply transformation
422 mdl_ligand_pos = np.ones((model_ligand.GetAtomCount(), 4))
423 for a_idx, a in enumerate(model_ligand.atoms):
424 p = a.GetPos()
425 mdl_ligand_pos[a_idx, 0] = p[0]
426 mdl_ligand_pos[a_idx, 1] = p[1]
427 mdl_ligand_pos[a_idx, 2] = p[2]
428 np_transformation = np.zeros((4,4))
429 for i in range(4):
430 for j in range(4):
431 np_transformation[i,j] = transformation[i,j]
432 mdl_ligand_pos = mdl_ligand_pos.dot(np_transformation.T)[:,:3]
433
434 # setup numpy positions for target ligand
435 trg_ligand_pos = np.zeros((target_ligand.GetAtomCount(), 3))
436 for a_idx, a in enumerate(target_ligand.atoms):
437 p = a.GetPos()
438 trg_ligand_pos[a_idx, 0] = p[0]
439 trg_ligand_pos[a_idx, 1] = p[1]
440 trg_ligand_pos[a_idx, 2] = p[2]
441
442 # position matrices to iterate symmetries
443 # there is a guarantee that
444 # target_ligand.GetAtomCount() <= model_ligand.GetAtomCount()
445 # and that each target ligand atom is part of every symmetry
446 # => target_ligand.GetAtomCount() is size of both position matrices
447 rmsd_mdl_pos = np.zeros((target_ligand.GetAtomCount(), 3))
448 rmsd_trg_pos = np.zeros((target_ligand.GetAtomCount(), 3))
449
450 # iterate symmetries and find the one with lowest RMSD
451 best_rmsd = np.inf
452 for i, (trg_sym, mdl_sym) in enumerate(symmetries):
453 for idx, (mdl_anum, trg_anum) in enumerate(zip(mdl_sym, trg_sym)):
454 rmsd_mdl_pos[idx,:] = mdl_ligand_pos[mdl_anum, :]
455 rmsd_trg_pos[idx,:] = trg_ligand_pos[trg_anum, :]
456 rmsd = np.sqrt(((rmsd_mdl_pos - rmsd_trg_pos)**2).sum(-1).mean())
457 if rmsd < best_rmsd:
458 best_rmsd = rmsd
459
460 return best_rmsd
461
462# specify public interface
463__all__ = ('SCRMSDScorer', 'SCRMSD')
__init__(self, model, target, model_ligands, target_ligands, resnum_alignments=False, rename_ligand_chain=False, substructure_match=False, coverage_delta=0.2, max_symmetries=1e5, bs_radius=4.0, lddt_lp_radius=15.0, model_bs_radius=25, binding_sites_topn=100000, full_bs_search=False, min_pep_length=6, min_nuc_length=4, pep_seqid_thr=95., nuc_seqid_thr=95., mdl_map_pep_seqid_thr=0., mdl_map_nuc_seqid_thr=0., seqres=None, trg_seqres_mapping=None)
_compute(self, symmetries, target_ligand, model_ligand)
_SCRMSD_symmetries(symmetries, model_ligand, target_ligand, transformation)
SCRMSD(model_ligand, target_ligand, transformation=geom.Mat4(), substructure_match=False, max_symmetries=1e6)