OpenStructure
Loading...
Searching...
No Matches
scoring_base.py
Go to the documentation of this file.
1import ost
2from ost import io
3from ost import conop
4from ost import mol
5from ost import seq
6
7
8def CleanHydrogens(ent, clib):
9 """ Scoring helper - Returns copy of *ent* without hydrogens
10
11 Non-standard hydrogen naming can cause trouble in residue property
12 assignment which is done by the :class:`ost.conop.RuleBasedProcessor` when
13 loading. In fact, residue property assignment is not done for every residue
14 that has unknown atoms according to the chemical component dictionary. This
15 function therefore re-processes the entity after removing hydrogens.
16
17 :param ent: Entity to clean
18 :type ent: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
19 :param clib: Compound library to perform re-processing after hydrogen
20 removal.
21 :type clib: :class:`ost.conop.CompoundLib`
22 :returns: Cleaned and re-processed ent
23 """
24 cleaned_ent = mol.CreateEntityFromView(ent.Select(
25 "ele != H and ele != D"), include_exlusive_atoms=False)
26 # process again to set missing residue properties due to non standard
27 # hydrogens
28 processor = conop.RuleBasedProcessor(clib)
29 processor.Process(cleaned_ent)
30 return cleaned_ent
31
32
33def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
34 fault_tolerant=False, extract_seqres_mapping=False):
35 """ Scoring helper - Prepares input from mmCIF
36
37 Only performs gentle cleanup of hydrogen atoms. Further cleanup is delegated
38 to scoring classes.
39
40 Depending on input flags, the following outputs can be retrieved:
41
42 * poly_ent (:class:`ost.mol.EntityHandle`): An OpenStructure entity with only
43 polymer chains. This is based on _entity.type extracted from *mmcif_path*.
44 If _entity.type is not defined for every chain, a warning is logged
45 and the returned poly_ent is a selection for peptide and nucleotide
46 residues as defined in the chemical component dictionary.
47 * non_poly_entities (:class:`list` of :class:`ost.mol.EntityHandle`):
48 OpenStructure entities representing all non-polymer (ligand) entities.
49 This is based on _entity.type extracted from *mmcif_path*. If _entity.type
50 is not defined for every chain, an error is raised.
51 * seqres (:class:`ost.seq.SequenceList`): Seqres sequences with entity id
52 as sequence names and the respective canonical seqres as sequence. Set to
53 None and triggers a warning if information is missing in *mmcif_path*.
54 * trg_seqres_mapping (:class:`dict`): Dictionary with chain names in
55 poly_ent as keys and the respective entity ids as values.
56 Set to None and triggers a warning if information is missing in
57 *mmcif_path*.
58
59 :param mmcif_path: Path to mmCIF file that contains polymer and optionally
60 non-polymer entities
61 :type mmcif_path: :class:`str`
62 :param biounit: If given, construct specified biounit from mmCIF AU
63 :type biounit: :class:`str`
64 :param extract_nonpoly: Controls return value
65 :type extract_nonpoly: :class:`bool`
66 :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadMMCIF`
67 :type fault_tolerant: :class:`bool`
68 :param extract_seqres_mapping: Controls return value
69 :type extract_seqres_mapping: :class:`bool`
70 :returns: poly_ent if *extract_nonpoly*/*extract_seqres_mapping* are False.
71 (poly_ent, non_poly_entities) if *extract_nonpoly* is True.
72 (poly_ent, seqres, trg_seqres_mapping) if *extract_seqres_mapping*
73 is True.
74 (poly_ent, non_poly_entities, seqres, trg_seqres_mapping) if both
75 flags are True.
76 """
77 clib = conop.GetDefaultLib()
78 if not clib:
79 ost.LogError("A compound library is required. "
80 "Please refer to the OpenStructure website: "
81 "https://openstructure.org/docs/conop/compoundlib/.")
82 raise RuntimeError("No compound library found")
83
84 # return variables that will be defined depending on input flags
85 poly_ent = None
86 non_poly_entities = None
87 seqres = None
88 trg_seqres_mapping = None
89
90 # increase loglevel, as we would pollute the info log with weird stuff
91 ost.PushVerbosityLevel(ost.LogLevel.Error)
92 mmcif_entity, mmcif_seqres, mmcif_info = io.LoadMMCIF(mmcif_path, seqres=True, info=True,
93 fault_tolerant=fault_tolerant)
94 # restore old loglevel and return
95 ost.PopVerbosityLevel()
96
97 mmcif_entity = CleanHydrogens(mmcif_entity, clib)
98
99 # construct biounit if necessary
100 if biounit is not None:
101 biounit_found = False
102 for bu in mmcif_info.biounits:
103 if bu.id == biounit:
104 mmcif_entity = mol.alg.CreateBU(mmcif_entity, bu)
105 biounit_found = True
106 break
107 if not biounit_found:
108 raise RuntimeError(f"Specified biounit '{biounit}' not in "
109 f"{mmcif_path}")
110
111 # check if we have entity types defined for each chain
112 missing_entity_types = list()
113 for ch in mmcif_entity.chains:
114 cname = None
115 if biounit is not None:
116 # if a biounit is constructed, you get chain names like: 1.YOLO
117 # we cannot simply split by '.' since '.' is an allowed character
118 # in chain names. => split by first occurence
119 dot_index = ch.name.find('.')
120 if dot_index == -1:
121 cname = ch.name
122 else:
123 cname = ch.name[dot_index+1:]
124 else:
125 cname = ch.name
126 try:
127 entity_id = mmcif_info.GetMMCifEntityIdTr(cname)
128 # the following raises if there is no desc for entity_id
129 entity_desc = mmcif_info.GetEntityDesc(entity_id)
130 except:
131 missing_entity_types.append(cname)
132
133 if len(missing_entity_types) > 0:
134 msg = f"mmCIF file does not define _entity.type for chains "
135 msg += f"{missing_entity_types}. "
136
137 if not fault_tolerant:
138 msg += f"Use fault tolerant mode to ignore this error and "
139 msg += f"fallback to select polymers based on peptide/nucleotide "
140 msg += f"residues as defined in chemical component dictionary."
141 raise RuntimeError(msg)
142
143 msg += f"Fallback to select polymers based on peptide/nucleotide "
144 msg += f"residues as defined in chemical component dictionary (fault "
145 msg += f"tolerant mode)."
146 ost.LogWarning(msg)
147
148 poly_sel = mmcif_entity.Select("peptide=true or nucleotide=true")
149 poly_ent = mol.CreateEntityFromView(poly_sel, True)
150 else:
151 polymer_entity_ids = mmcif_info.GetEntityIdsOfType("polymer")
152 for ch in mmcif_entity.chains:
153 cname = None
154 if biounit is not None:
155 # if a biounit is constructed, you get chain names like: 1.YOLO
156 # we cannot simply split by '.' since '.' is an allowed character
157 # in chain names. => split by first occurence
158 dot_index = ch.name.find('.')
159 if dot_index == -1:
160 cname = ch.name
161 else:
162 cname = ch.name[dot_index+1:]
163 else:
164 cname = ch.name
165 if mmcif_info.GetMMCifEntityIdTr(cname) in polymer_entity_ids:
166 ch.SetIntProp("poly", 1)
167 poly_sel = mmcif_entity.Select("gcpoly:0=1")
168 poly_ent = mol.CreateEntityFromView(poly_sel, True)
169
170 if extract_nonpoly:
171 if len(missing_entity_types) > 0:
172 msg = f"mmCIF file does not contain _entity.type for the following "
173 msg += f"chain(s): {missing_entity_types}. Extracting non-polymers "
174 msg += f"from mmCIF files requires _entity.type to be set for all "
175 msg += f"chains."
176 raise RuntimeError(msg)
177
178 non_polymer_entity_ids = mmcif_info.GetEntityIdsOfType("non-polymer")
179 nonpoly_id = 1
180 for ch in mmcif_entity.chains:
181 cname = None
182 if biounit is not None:
183 # if a biounit is constructed, you get chain names like: 1.YOLO
184 # we cannot simply split by '.' since '.' is an allowed character
185 # in chain names. => split by first occurence
186 dot_index = ch.name.find('.')
187 if dot_index == -1:
188 cname = ch.name
189 else:
190 cname = ch.name[dot_index+1:]
191 else:
192 cname = ch.name
193 if mmcif_info.GetMMCifEntityIdTr(cname) in non_polymer_entity_ids:
194 ch.SetIntProp("nonpolyid", nonpoly_id)
195 nonpoly_id += 1
196
197 non_poly_entities = list()
198 for i in range(1,nonpoly_id):
199 view = mmcif_entity.Select(f"gcnonpolyid:0={i}")
200 if view.GetResidueCount() != 1:
201 raise RuntimeError(f"Expected non-polymer entities in "
202 f"{mmcif_path} to contain exactly 1 "
203 f"residue. Got {view.GetResidueCount()} "
204 f"in chain {view.chains[0].name}")
205 compound = clib.FindCompound(view.residues[0].name)
206 if compound is None:
207 error_msg = f"\"{view.residues[0].name}\" is not available in " \
208 f"the compound library."
209 if fault_tolerant:
210 error_msg += f"A distance-based heuristic was used to " \
211 f"connect the ligand atoms (fault tolerant " \
212 f"mode)."
213 ost.LogWarning(error_msg)
214 else:
215 error_msg += f"Use fault tolerant mode to ignore this " \
216 f"error and use a distance based heuristic " \
217 f"to connect the ligand atoms."
218 raise RuntimeError(error_msg)
219
220 non_poly_entities.append(mol.CreateEntityFromView(view, True))
221
222 if extract_seqres_mapping:
223 # mmcif seqres is a list of sequences that relates to
224 # chain names in the assymetric unit. What we want is a list
225 # of sequences that relate to the underlying entities.
226 seqres = seq.CreateSequenceList()
227 seqres_processed = set()
228
229 for s in mmcif_seqres:
230 entity_id = mmcif_info.GetMMCifEntityIdTr(s.GetName())
231 if entity_id not in seqres_processed:
232 seqres_processed.add(entity_id)
233 seqres.AddSequence(seq.CreateSequence(entity_id, s.GetGaplessString()))
234
235 # check if we have SEQRES defined for each polymer chain
236 missing_seqres = list()
237 for ch in poly_ent.chains:
238 cname = None
239 if biounit is not None:
240 # if a biounit is constructed, you get chain names like: 1.YOLO
241 # we cannot simply split by '.' since '.' is an allowed character
242 # in chain names. => split by first occurence
243 dot_index = ch.name.find('.')
244 if dot_index == -1:
245 cname = ch.name
246 else:
247 cname = ch.name[dot_index+1:]
248 else:
249 cname = ch.name
250
251 entity_id = mmcif_info.GetMMCifEntityIdTr(cname)
252 if entity_id not in seqres_processed:
253 missing_seqres.append(cname)
254
255 if len(missing_seqres) > 0:
256 msg = f"Extracting chem grouping from mmCIF file requires all "
257 msg += f"SEQRES information set. SEQRES is missing for polymer "
258 msg += f"chain(s) {missing_seqres}. "
259
260 if not fault_tolerant:
261 msg += f"Use fault tolerant mode to ignore this error and "
262 msg += f"fallback sequence identity-based chem grouping."
263 raise RuntimeError(msg)
264
265 msg += f"Chem grouping will be based on sequence identity (fault "
266 msg += f"tolerant mode)."
267 ost.LogWarning(msg)
268
269 seqres = None
270 trg_seqres_mapping = None
271 else:
272 trg_seqres_mapping = dict()
273 if biounit is None:
274 cnames = [ch.name for ch in poly_ent.chains]
275 for cname in cnames:
276 trg_seqres_mapping[cname] = mmcif_info.GetMMCifEntityIdTr(cname)
277 else:
278 bu_cnames = [ch.name for ch in poly_ent.chains]
279 au_cnames = list()
280 for bu_cname in bu_cnames:
281 dot_idx = bu_cname.index(".")
282 au_cnames.append(bu_cname[dot_idx + 1 :])
283 for au_cname, bu_cname in zip(au_cnames, bu_cnames):
284 trg_seqres_mapping[bu_cname] = mmcif_info.GetMMCifEntityIdTr(au_cname)
285
286 if extract_nonpoly and extract_seqres_mapping:
287 return (poly_ent, non_poly_entities, seqres, trg_seqres_mapping)
288 elif extract_nonpoly:
289 return (poly_ent, non_poly_entities)
290 elif extract_seqres_mapping:
291 return (poly_ent, seqres, trg_seqres_mapping)
292 else:
293 return poly_ent
294
295
296def PDBPrep(pdb_path, fault_tolerant=False):
297 """ Scoring helper - Prepares scoring input from PDB
298
299 Only performs gentle cleanup of hydrogen atoms. Further cleanup is delegated
300 to scoring classes. There is no logic to extract ligands from PDB
301 files. Ligands must be provided separately as SDF files in these cases.
302
303 :param pdb_path: Path to PDB file that contains polymer entities
304 :type pdb_path: :class:`str`
305 :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadPDB`
306 :type fault_tolerant: :class:`bool`
307 :returns: :class:`EntityHandle` from loaded file.
308 """
309 clib = conop.GetDefaultLib()
310 if not clib:
311 ost.LogError("A compound library is required. "
312 "Please refer to the OpenStructure website: "
313 "https://openstructure.org/docs/conop/compoundlib/.")
314 raise RuntimeError("No compound library found")
315
316 # increase loglevel, as we would pollute the info log with weird stuff
317 ost.PushVerbosityLevel(ost.LogLevel.Error)
318 pdb_entity = io.LoadPDB(pdb_path, fault_tolerant=fault_tolerant)
319 # restore old loglevel and return
320 ost.PopVerbosityLevel()
321 pdb_entity = CleanHydrogens(pdb_entity, clib)
322
323 return pdb_entity
324
325__all__ = ('CleanHydrogens', 'MMCIFPrep', 'PDBPrep')
MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, fault_tolerant=False, extract_seqres_mapping=False)
PDBPrep(pdb_path, fault_tolerant=False)