OpenStructure
scoring_base.py
Go to the documentation of this file.
1 import ost
2 from ost import io
3 from ost import conop
4 from ost import mol
5 from ost import seq
6 
7 
def CleanHydrogens(ent, clib):
    """ Scoring helper - Strips hydrogens from *ent* and re-processes the copy

    The :class:`ost.conop.RuleBasedProcessor` that runs when loading a
    structure does not assign residue properties to residues containing atoms
    that are unknown to the chemical component dictionary. Non-standard
    hydrogen naming triggers exactly that. This function therefore builds a
    new entity without hydrogen/deuterium atoms and processes it once more.

    :param ent: Entity to clean
    :type ent: :class:`ost.mol.EntityHandle`/:class:`ost.mol.EntityView`
    :param clib: Compound library used for the re-processing that follows
                 hydrogen removal.
    :type clib: :class:`ost.conop.CompoundLib`
    :returns: Cleaned and re-processed copy of *ent*
    """
    heavy_atom_view = ent.Select("ele != H and ele != D")
    result = mol.CreateEntityFromView(heavy_atom_view,
                                      include_exlusive_atoms=False)
    # second processing pass: fills in the residue properties that were
    # skipped because of non-standard hydrogens
    conop.RuleBasedProcessor(clib).Process(result)
    return result
31 
32 
def _AUChainName(chain_name, from_biounit):
    """ Returns the asymmetric unit chain name that *chain_name* refers to

    :param chain_name: Chain name as found in the processed entity
    :type chain_name: :class:`str`
    :param from_biounit: Whether the entity was constructed as biounit
    :type from_biounit: :class:`bool`
    :returns: *chain_name* itself if *from_biounit* is False or if it
              contains no '.', everything after the first '.' otherwise.
    """
    if not from_biounit:
        return chain_name
    # biounit chain names look like: 1.YOLO
    # '.' is an allowed character in chain names, so only the first
    # occurrence separates the prefix from the AU chain name
    _, dot, au_name = chain_name.partition('.')
    return au_name if dot else chain_name


def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False,
              fault_tolerant=False, allow_heuristic_conn=False,
              extract_seqres_mapping=False):
    """ Scoring helper - Prepares input from mmCIF

    Only performs gentle cleanup of hydrogen atoms. Further cleanup is
    delegated to scoring classes.

    Depending on input flags, the following outputs can be retrieved:

    * poly_ent (:class:`ost.mol.EntityHandle`): An OpenStructure entity with
      only polymer chains.
    * non_poly_entities (:class:`list` of :class:`ost.mol.EntityHandle`):
      OpenStructure entities representing all non-polymer (ligand) entities.
    * seqres (:class:`ost.seq.SequenceList`): Seqres sequences with entity id
      as sequence names and the respective canonical seqres as sequence.
    * trg_seqres_mapping (:class:`dict`): Dictionary with chain names in
      poly_ent as keys and the respective entity ids as values.

    :param mmcif_path: Path to mmCIF file that contains polymer and optionally
                       non-polymer entities
    :type mmcif_path: :class:`str`
    :param biounit: If given, construct specified biounit from mmCIF AU
    :type biounit: :class:`str`
    :param extract_nonpoly: Controls return value
    :type extract_nonpoly: :class:`bool`
    :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadMMCIF`
    :type fault_tolerant: :class:`bool`
    :param allow_heuristic_conn: Only relevant if extract_nonpoly is True.
                                 The chemical component dictionary is relevant
                                 for connectivity information. By default, we
                                 enforce the presence of each non-polymer in
                                 the dictionary to ensure correct
                                 connectivity. If you enable this flag, you
                                 allow the use of a distance based heuristic
                                 as fallback. With all its consequences in
                                 ligand matching.
    :type allow_heuristic_conn: :class:`bool`
    :param extract_seqres_mapping: Controls return value
    :type extract_seqres_mapping: :class:`bool`
    :returns: poly_ent if *extract_nonpoly*/*extract_seqres_mapping* are False.
              (poly_ent, non_poly_entities) if *extract_nonpoly* is True.
              (poly_ent, seqres, trg_seqres_mapping) if
              *extract_seqres_mapping* is True.
              (poly_ent, non_poly_entities, seqres, trg_seqres_mapping) if
              both flags are True.
    :raises: :class:`RuntimeError` if no default compound library is set, if
             *biounit* is not defined in the mmCIF file, if a non-polymer
             chain does not consist of exactly one residue or if
             *allow_heuristic_conn* is False and a non-polymer residue is
             missing in the compound library.
    """
    clib = conop.GetDefaultLib()
    if not clib:
        ost.LogError("A compound library is required. "
                     "Please refer to the OpenStructure website: "
                     "https://openstructure.org/docs/conop/compoundlib/.")
        raise RuntimeError("No compound library found")

    mmcif_entity, mmcif_seqres, mmcif_info = io.LoadMMCIF(
        mmcif_path, seqres=True, info=True, fault_tolerant=fault_tolerant)
    mmcif_entity = CleanHydrogens(mmcif_entity, clib)

    # get AU chain names representing polymer / non-polymer entities
    # sets, since membership is tested once per chain further down
    polymer_entity_ids = mmcif_info.GetEntityIdsOfType("polymer")
    non_polymer_entity_ids = mmcif_info.GetEntityIdsOfType("non-polymer")
    polymer_chain_names = set()
    non_polymer_chain_names = set()
    for ch in mmcif_entity.chains:
        entity_id = mmcif_info.GetMMCifEntityIdTr(ch.name)
        if entity_id in polymer_entity_ids:
            polymer_chain_names.add(ch.name)
        if entity_id in non_polymer_entity_ids:
            non_polymer_chain_names.add(ch.name)

    # construct biounit if necessary
    from_biounit = biounit is not None
    if from_biounit:
        for bu in mmcif_info.biounits:
            if bu.id == biounit:
                mmcif_entity = mol.alg.CreateBU(mmcif_entity, bu)
                break
        else:
            # loop ended without break => no biounit with matching id
            raise RuntimeError(f"Specified biounit '{biounit}' not in "
                               f"{mmcif_path}")

    # assign generic properties for selection later on
    # tagged_non_poly_cnames[i] is the name of the chain with nonpolyid i
    tagged_non_poly_cnames = list()
    for ch in mmcif_entity.chains:
        cname = _AUChainName(ch.name, from_biounit)
        if cname in polymer_chain_names:
            ch.SetIntProp("poly", 1)
        if cname in non_polymer_chain_names:
            ch.SetIntProp("nonpolyid", len(tagged_non_poly_cnames))
            tagged_non_poly_cnames.append(ch.name)
    non_poly_id = len(tagged_non_poly_cnames)

    poly_ent = mol.CreateEntityFromView(mmcif_entity.Select("gcpoly:0=1"),
                                        True)

    non_poly_entities = None
    if extract_nonpoly:
        non_poly_entities = list()
        for i in range(non_poly_id):
            # non_poly_id itself is never assigned to a chain, which makes it
            # a default value that cannot match any i for chains without the
            # property
            view = mmcif_entity.Select(f"gcnonpolyid:{non_poly_id}={i}")
            if view.GetResidueCount() != 1:
                # report the offending chain, not whatever chain a previous
                # loop happened to end on
                raise RuntimeError(f"Expect non-polymer entities in "
                                   f"{mmcif_path} to contain exactly 1 "
                                   f"residue. Got {view.GetResidueCount()} "
                                   f"in chain {tagged_non_poly_cnames[i]}")
            if not allow_heuristic_conn:
                compound = clib.FindCompound(view.residues[0].name)
                if compound is None:
                    raise RuntimeError(f"Can only extract non-polymer entities if "
                                       f"respective residues are available in PDB "
                                       f"component dictionary. Can't find "
                                       f"\"{view.residues[0].name}\"")

            non_poly_entities.append(mol.CreateEntityFromView(view, True))

    seqres = None
    trg_seqres_mapping = None
    if extract_seqres_mapping:
        # mmcif seqres is a list of sequences that relates to
        # chain names in the asymmetric unit. What we want is a list
        # of sequences that relate to the underlying entities.
        seqres = seq.CreateSequenceList()
        seqres_processed = set()
        for s in mmcif_seqres:
            entity_id = mmcif_info.GetMMCifEntityIdTr(s.GetName())
            if entity_id not in seqres_processed:
                seqres_processed.add(entity_id)
                seqres.AddSequence(seq.CreateSequence(entity_id,
                                                      s.GetGaplessString()))

        # entity ids are looked up with AU chain names, using the same name
        # translation as in the property assignment above
        trg_seqres_mapping = dict()
        for ch in poly_ent.chains:
            au_cname = _AUChainName(ch.name, from_biounit)
            trg_seqres_mapping[ch.name] = mmcif_info.GetMMCifEntityIdTr(au_cname)

    if extract_nonpoly and extract_seqres_mapping:
        return (poly_ent, non_poly_entities, seqres, trg_seqres_mapping)
    elif extract_nonpoly:
        return (poly_ent, non_poly_entities)
    elif extract_seqres_mapping:
        return (poly_ent, seqres, trg_seqres_mapping)
    else:
        return poly_ent
204 
205 
def PDBPrep(pdb_path, fault_tolerant=False):
    """ Scoring helper - Loads a PDB file as scoring input

    Cleanup is limited to the removal of hydrogen atoms, anything beyond that
    is left to the scoring classes. Ligands are not extracted from PDB files,
    they must be provided separately as SDF files in these cases.

    :param pdb_path: Path to PDB file that contains polymer entities
    :type pdb_path: :class:`str`
    :param fault_tolerant: Passed as parameter to :func:`ost.io.LoadPDB`
    :type fault_tolerant: :class:`bool`
    :returns: :class:`EntityHandle` from loaded file.
    :raises: :class:`RuntimeError` if no default compound library is set
    """
    compound_lib = conop.GetDefaultLib()
    if not compound_lib:
        # nothing can be processed without a compound library
        ost.LogError("A compound library is required. "
                     "Please refer to the OpenStructure website: "
                     "https://openstructure.org/docs/conop/compoundlib/.")
        raise RuntimeError("No compound library found")

    loaded = io.LoadPDB(pdb_path, fault_tolerant=fault_tolerant)
    return CleanHydrogens(loaded, compound_lib)
230 
# names exported by "from <module> import *"
__all__ = (
    'CleanHydrogens',
    'MMCIFPrep',
    'PDBPrep',
)
def CleanHydrogens(ent, clib)
Definition: scoring_base.py:8
def PDBPrep(pdb_path, fault_tolerant=False)
def MMCIFPrep(mmcif_path, biounit=None, extract_nonpoly=False, fault_tolerant=False, allow_heuristic_conn=False, extract_seqres_mapping=False)
Definition: scoring_base.py:35