33def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
34 fault_tolerant=False, extract_seqres_mapping=False):
35 """ Scoring helper - Prepares input from mmCIF
37 Only performs gentle cleanup of hydrogen atoms. Further cleanup is delegated
40 Depending on input flags, the following outputs can be retrieved:
42 * poly_ent (:class:`ost.mol.EntityHandle`): An OpenStructure entity with only
43 polymer chains. This is based on _entity.type extracted from *mmcif_path*.
44 If _entity.type is not defined for every chain, a warning is logged
45 and the returned poly_ent is a selection for peptide and nucleotide
46 residues as defined in the chemical component dictionary.
47 * non_poly_entities (:class:`list` of :class:`ost.mol.EntityHandle`):
48 OpenStructure entities representing all non-polymer (ligand) entities.
49 This is based on _entity.type extracted from *mmcif_path*. If _entity.type
50 is not defined for every chain, an error is raised.
51 * seqres (:class:`ost.seq.SequenceList`): Seqres sequences with entity id
52 as sequence names and the respective canonical seqres as sequence. Set to
53 None and triggers a warning if information is missing in *mmcif_path*.
54 * trg_seqres_mapping (:class:`dict`): Dictionary with chain names in
55 poly_ent as keys and the respective entity ids as values.
56 Set to None and triggers a warning if information is missing in
59 :param mmcif_path: Path to mmCIF file that contains polymer and optionally
61 :type mmcif_path: :class:`str`
62 :param biounit: If given, construct specified biounit from mmCIF AU
63 :type biounit: :class:`str`
64 :param extract_nonpoly: Controls return value
65 :type extract_nonpoly: :class:`bool`
66 :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadMMCIF`
67 :type fault_tolerant: :class:`bool`
68 :param extract_seqres_mapping: Controls return value
69 :type extract_seqres_mapping: :class:`bool`
70 :returns: poly_ent if *extract_nonpoly*/*extract_seqres_mapping* are False.
71 (poly_ent, non_poly_entities) if *extract_nonpoly* is True.
72 (poly_ent, seqres, trg_seqres_mapping) if *extract_seqres_mapping*
74 (poly_ent, non_poly_entities, seqres, trg_seqres_mapping) if both
77 clib = conop.GetDefaultLib()
79 ost.LogError(
"A compound library is required. "
80 "Please refer to the OpenStructure website: "
81 "https://openstructure.org/docs/conop/compoundlib/.")
82 raise RuntimeError(
"No compound library found")
86 non_poly_entities =
None
88 trg_seqres_mapping =
None
91 ost.PushVerbosityLevel(ost.LogLevel.Error)
92 mmcif_entity, mmcif_seqres, mmcif_info = io.LoadMMCIF(mmcif_path, seqres=
True, info=
True,
93 fault_tolerant=fault_tolerant)
95 ost.PopVerbosityLevel()
100 if biounit
is not None:
101 biounit_found =
False
102 for bu
in mmcif_info.biounits:
104 mmcif_entity = mol.alg.CreateBU(mmcif_entity, bu)
107 if not biounit_found:
108 raise RuntimeError(f
"Specified biounit '{biounit}' not in "
112 missing_entity_types = list()
113 for ch
in mmcif_entity.chains:
115 if biounit
is not None:
119 dot_index = ch.name.find(
'.')
123 cname = ch.name[dot_index+1:]
127 entity_id = mmcif_info.GetMMCifEntityIdTr(cname)
129 entity_desc = mmcif_info.GetEntityDesc(entity_id)
131 missing_entity_types.append(cname)
133 if len(missing_entity_types) > 0:
134 msg = f
"mmCIF file does not define _entity.type for chains "
135 msg += f
"{missing_entity_types}. "
137 if not fault_tolerant:
138 msg += f
"Use fault tolerant mode to ignore this error and "
139 msg += f
"fallback to select polymers based on peptide/nucleotide "
140 msg += f
"residues as defined in chemical component dictionary."
141 raise RuntimeError(msg)
143 msg += f
"Fallback to select polymers based on peptide/nucleotide "
144 msg += f
"residues as defined in chemical component dictionary (fault "
145 msg += f
"tolerant mode)."
148 poly_sel = mmcif_entity.Select(
"peptide=true or nucleotide=true")
149 poly_ent = mol.CreateEntityFromView(poly_sel,
True)
151 polymer_entity_ids = mmcif_info.GetEntityIdsOfType(
"polymer")
152 for ch
in mmcif_entity.chains:
154 if biounit
is not None:
158 dot_index = ch.name.find(
'.')
162 cname = ch.name[dot_index+1:]
165 if mmcif_info.GetMMCifEntityIdTr(cname)
in polymer_entity_ids:
166 ch.SetIntProp(
"poly", 1)
167 poly_sel = mmcif_entity.Select(
"gcpoly:0=1")
168 poly_ent = mol.CreateEntityFromView(poly_sel,
True)
171 if len(missing_entity_types) > 0:
172 msg = f
"mmCIF file does not contain _entity.type for the following "
173 msg += f
"chain(s): {missing_entity_types}. Extracting non-polymers "
174 msg += f
"from mmCIF files requires _entity.type to be set for all "
176 raise RuntimeError(msg)
178 non_polymer_entity_ids = mmcif_info.GetEntityIdsOfType(
"non-polymer")
180 for ch
in mmcif_entity.chains:
182 if biounit
is not None:
186 dot_index = ch.name.find(
'.')
190 cname = ch.name[dot_index+1:]
193 if mmcif_info.GetMMCifEntityIdTr(cname)
in non_polymer_entity_ids:
194 ch.SetIntProp(
"nonpolyid", nonpoly_id)
197 non_poly_entities = list()
198 for i
in range(1,nonpoly_id):
199 view = mmcif_entity.Select(f
"gcnonpolyid:0={i}")
200 if view.GetResidueCount() != 1:
201 raise RuntimeError(f
"Expected non-polymer entities in "
202 f
"{mmcif_path} to contain exactly 1 "
203 f
"residue. Got {view.GetResidueCount()} "
204 f
"in chain {view.chains[0].name}")
205 compound = clib.FindCompound(view.residues[0].name)
207 error_msg = f
"\"{view.residues[0].name}\" is not available in " \
208 f
"the compound library."
210 error_msg += f
"A distance-based heuristic was used to " \
211 f
"connect the ligand atoms (fault tolerant " \
213 ost.LogWarning(error_msg)
215 error_msg += f
"Use fault tolerant mode to ignore this " \
216 f
"error and use a distance based heuristic " \
217 f
"to connect the ligand atoms."
218 raise RuntimeError(error_msg)
220 non_poly_entities.append(mol.CreateEntityFromView(view,
True))
222 if extract_seqres_mapping:
226 seqres = seq.CreateSequenceList()
227 seqres_processed = set()
229 for s
in mmcif_seqres:
230 entity_id = mmcif_info.GetMMCifEntityIdTr(s.GetName())
231 if entity_id
not in seqres_processed:
232 seqres_processed.add(entity_id)
233 seqres.AddSequence(seq.CreateSequence(entity_id, s.GetGaplessString()))
236 missing_seqres = list()
237 for ch
in poly_ent.chains:
239 if biounit
is not None:
243 dot_index = ch.name.find(
'.')
247 cname = ch.name[dot_index+1:]
251 entity_id = mmcif_info.GetMMCifEntityIdTr(cname)
252 if entity_id
not in seqres_processed:
253 missing_seqres.append(cname)
255 if len(missing_seqres) > 0:
256 msg = f
"Extracting chem grouping from mmCIF file requires all "
257 msg += f
"SEQRES information set. SEQRES is missing for polymer "
258 msg += f
"chain(s) {missing_seqres}. "
260 if not fault_tolerant:
261 msg += f
"Use fault tolerant mode to ignore this error and "
262 msg += f
"fallback sequence identity-based chem grouping."
263 raise RuntimeError(msg)
265 msg += f
"Chem grouping will be based on sequence identity (fault "
266 msg += f
"tolerant mode)."
270 trg_seqres_mapping =
None
272 trg_seqres_mapping = dict()
274 cnames = [ch.name
for ch
in poly_ent.chains]
276 trg_seqres_mapping[cname] = mmcif_info.GetMMCifEntityIdTr(cname)
278 bu_cnames = [ch.name
for ch
in poly_ent.chains]
280 for bu_cname
in bu_cnames:
281 dot_idx = bu_cname.index(
".")
282 au_cnames.append(bu_cname[dot_idx + 1 :])
283 for au_cname, bu_cname
in zip(au_cnames, bu_cnames):
284 trg_seqres_mapping[bu_cname] = mmcif_info.GetMMCifEntityIdTr(au_cname)
286 if extract_nonpoly
and extract_seqres_mapping:
287 return (poly_ent, non_poly_entities, seqres, trg_seqres_mapping)
288 elif extract_nonpoly:
289 return (poly_ent, non_poly_entities)
290 elif extract_seqres_mapping:
291 return (poly_ent, seqres, trg_seqres_mapping)