OpenStructure
mmcif_reader.hh
Go to the documentation of this file.
1 //------------------------------------------------------------------------------
2 // This file is part of the OpenStructure project <www.openstructure.org>
3 //
4 // Copyright (C) 2008-2020 by the OpenStructure authors
5 //
6 // This library is free software; you can redistribute it and/or modify it under
7 // the terms of the GNU Lesser General Public License as published by the Free
8 // Software Foundation; either version 3.0 of the License, or (at your option)
9 // any later version.
10 // This library is distributed in the hope that it will be useful, but WITHOUT
11 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with this library; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 //------------------------------------------------------------------------------
19 #ifndef OST_MMCIF_READER_HH
20 #define OST_MMCIF_READER_HH
21 
22 #include <map>
23 
24 #include <ost/geom/geom.hh>
25 #include <ost/seq/sequence_list.hh>
27 #include <ost/mol/chain_type.hh>
29 #include <ost/io/mol/io_profile.hh>
30 #include <ost/io/io_exception.hh>
32 #include <ost/io/mol/mmcif_info.hh>
33 
34 namespace ost { namespace io {
35 
/// \brief Reader for mmCIF files, derived from StarParser.
///
/// The class declares the virtual parser callbacks OnBeginData(),
/// OnBeginLoop(), OnDataRow() and OnEndData(), plus one Parse*() handler per
/// supported mmCIF category. Atoms go into the mol::EntityHandle given to the
/// constructor (presumably - only the reference member ent_handle_ is visible
/// here); additional file information is collected in an MMCifInfo object
/// available through GetInfo().
///
/// NOTE(review): this listing has gaps in its line numbering; where a gap
/// appears to hide a declaration, a review note marks the spot.
61 class DLLEXPORT_OST_IO MMCifReader : public StarParser {
62 public:
68
    /// \brief Create a MMCifReader from an input stream.
69  MMCifReader(std::istream& stream, mol::EntityHandle& ent_handle,
70  const IOProfile& profile);
71
    /// \brief Create a MMCifReader from a file name.
77  MMCifReader(const String& filename, mol::EntityHandle& ent_handle,
78  const IOProfile& profile);
79
    /// \brief Initialise the reader.
82  void Init();
83
    /// \brief Set up a fresh instance.
85  void ClearState();
86
    /// \brief Set names of restricted chains for the reader.
90  void SetRestrictChains(const String& restrict_chains);
91
    /// \brief Return the restricted-chains string held in restrict_chains_.
92  const String& GetRestrictChains() const
93  {
94  return restrict_chains_;
95  }
96
    /// \brief Enable or disable reading of auth_chain_id instead of
    ///        label_chain_id (default).
    ///
    /// Only stores \p id in auth_chain_id_; the flag is evaluated elsewhere.
101  void SetAuthChainID(bool id)
102  {
103  auth_chain_id_ = id;
104  }
105
    /// \brief Check mmCIF input to be read. Substitutional function for
    ///        StarParser.
112  virtual bool OnBeginData(const StringRef& data_name);
113
    /// \brief Check if a current loop is to be parsed.
119  virtual bool OnBeginLoop(const StarLoopDesc& header); // tested
120
    /// \brief Read a row of data.
125  virtual void OnDataRow(const StarLoopDesc& header,
126  const std::vector<StringRef>& columns);
127
    /// \brief Finalise parsing.
129  virtual void OnEndData();
130
    // NOTE(review): listing line 134 is missing here. The page's member index
    // lists `seq::SequenceList GetSeqRes() const` ("Return sequences."), which
    // is presumably the declaration that was dropped - confirm against the
    // real header.
135
    /// \brief Get additional information of the mmCIF file.
    // NOTE(review): not const-qualified although it only returns a const
    // reference to info_.
139  const MMCifInfo& GetInfo() { return info_; }
140
141 protected:
    /// \brief Store an item index from the loop header in preparation for
    ///        reading a row.
    ///
    /// Looks up \p item with header.GetIndex() and saves the result in
    /// indices_[mapping].
    ///
    /// \param mapping slot in indices_ to write to
    /// \param item    name of the item to look up
    /// \param header  loop header to search
    /// \throws IOException with a formatted diagnostic when GetIndex() yields
    ///         -1, i.e. the item is not part of the header.
    // NOTE(review): `mapping` is not range-checked against MAX_ITEMS_IN_ROW.
148  void TryStoreIdx(const int mapping,
149  const String& item,
150  const StarLoopDesc& header)
151  {
152  indices_[mapping] = header.GetIndex(item);
153
154  if (indices_[mapping] == -1) {
155  throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
156  "No item '" + item +
157  "' found in '" +
158  header.GetCategory()+
159  "' header",
160  this->GetCurrentLinenum()));
161  }
162  } // tested
163
    /// \brief Fetch values identifying atoms.
    ///
    /// All parameters except \p columns are non-const references, i.e.
    /// outputs. NOTE(review): the meaning of the bool return value is not
    /// visible in this header.
177  bool ParseAtomIdent(const std::vector<StringRef>& columns,
178  String& auth_chain_name,
179  String& cif_chain_name,
180  StringRef& res_name,
181  mol::ResNum& resnum,
182  bool& valid_res_num,
183  StringRef& atom_name,
184  char& alt_loc);
185
    /// \brief Fetch atom information and store it.
189  void ParseAndAddAtom(const std::vector<StringRef>& columns);
190
    /// \brief Fetch mmCIF entity information.
194  void ParseEntity(const std::vector<StringRef>& columns);
195
    /// \brief Fetch mmCIF entity_poly information.
199  void ParseEntityPoly(const std::vector<StringRef>& columns);
200
    /// \brief Fetch mmCIF citation information.
204  void ParseCitation(const std::vector<StringRef>& columns);
205
    /// \brief Return the struct_refs_ member.
206  const MMCifInfoStructRefs& GetStructRefs() const { return struct_refs_; }
207
    /// \brief Fetch mmCIF citation_author information.
211  void ParseCitationAuthor(const std::vector<StringRef>& columns);
212
    /// \brief Parse a row in the struct_ref category.
214  void ParseStructRef(const std::vector<StringRef>& columns);
215
    /// \brief Parse a row in the struct_ref_seq category.
217  void ParseStructRefSeq(const std::vector<StringRef>& columns);
218
    /// \brief Parse a row in the struct_ref_seq_dif category.
220  void ParseStructRefSeqDif(const std::vector<StringRef>& columns);
    /// \brief Fetch mmCIF exptl information.
224  void ParseExptl(const std::vector<StringRef>& columns);
225
    /// \brief Fetch mmCIF refine information.
229  void ParseRefine(const std::vector<StringRef>& columns);
230
    /// \brief Fetch mmCIF pdbx_struct_assembly information.
234  void ParsePdbxStructAssembly(const std::vector<StringRef>& columns);
235
    /// \brief Fetch mmCIF pdbx_struct_assembly_gen information.
239  void ParsePdbxStructAssemblyGen(const std::vector<StringRef>& columns);
240
    // NOTE(review): the next three helpers carry no documentation in this
    // listing. Judging by name and signature they presumably split an
    // oper_expression value into blocks of operation IDs - confirm in the
    // implementation file.
241  std::vector<std::vector<String> > UnPackOperExperession(StringRef expression);
242
243  void StoreExpression(const char* l, const char* s,
244  bool& is_range, int lborder,
245  std::vector<String>& single_block);
246
247  void StoreRange(const char*& l, const char* s, bool& is_range, int& lborder,
248  std::vector<String>& single_block);
249
    /// \brief Fetch mmCIF pdbx_struct_oper_list information.
253  void ParsePdbxStructOperList(const std::vector<StringRef>& columns);
254
    /// \brief Fetch mmCIF database_PDB_rev information.
258  void ParseDatabasePDBRev(const std::vector<StringRef>& columns);
259
    /// \brief Fetch mmCIF pdbx_audit_revision_history information.
263  void ParsePdbxAuditRevisionHistory(const std::vector<StringRef>& columns);
264
    /// \brief Fetch mmCIF pdbx_audit_revision_details information.
268  void ParsePdbxAuditRevisionDetails(const std::vector<StringRef>& columns);
269
    /// \brief Fetch mmCIF pdbx_database_status information.
273  void ParsePdbxDatabaseStatus(const std::vector<StringRef>& columns);
274
    /// \brief Fetch mmCIF struct information.
278  void ParseStruct(const std::vector<StringRef>& columns);
279
    /// \brief Fetch mmCIF struct_conf (secondary structure) information.
283  void ParseStructConf(const std::vector<StringRef>& columns);
284
    /// \brief Fetch mmCIF struct_sheet_range (beta sheets) information.
288  void ParseStructSheetRange(const std::vector<StringRef>& columns);
289
    /// \brief Fetch mmCIF pdbx_database_PDB_obs_spr information.
293  void ParsePdbxDatabasePdbObsSpr(const std::vector<StringRef>& columns);
294
    /// \brief Fetch mmCIF pdbx_entity_branch information.
298  void ParsePdbxEntityBranch(const std::vector<StringRef>& columns);
299
    /// \brief Fetch mmCIF pdbx_entity_branch_link information.
303  void ParsePdbxEntityBranchLink(const std::vector<StringRef>& columns);
304
    /// \brief Fetch mmCIF entity_poly_seq information.
308  void ParseEntityPolySeq(const std::vector<StringRef>& columns);
309
    /// \brief Fetch mmCIF em_3d_reconstruction information.
313  void ParseEm3DReconstruction(const std::vector<StringRef>& columns);
314
    /// \brief Kinds of secondary structure element: helix, strand, turn, coil.
316  typedef enum {
317  MMCIF_HELIX,
318  MMCIF_STRAND,
319  MMCIF_TURN,
320  MMCIF_COIL
321  } MMCifSecStructElement;
322
    // NOTE(review): listing lines 326 and 331 are missing below. The page's
    // member index lists
    //   MMCifSecStructElement DetermineSecStructType(const StringRef& type) const
    //     ("Check whether an element was classified sheet or helix.") and
    //   void AssignSecStructure(mol::EntityHandle ent)
    //     ("Transform data from struct_conf entry into secondary structure."),
    // which are presumably the dropped declarations - confirm against the
    // real header.
327
332
333 private:
    /// \brief Magic numbers; MAX_ITEMS_IN_ROW is the size of indices_.
335  typedef enum {
336  MAX_ITEMS_IN_ROW=19
337  } MMCifMagicNos;
338
    // The *Items enums that follow each name the items of one mmCIF category
    // (category inferred from the enum name). Presumably their values are the
    // `mapping` slots handed to TryStoreIdx() - confirm in the implementation.

    /// \enum items of the atom_site category (19 entries, which matches
    ///       MAX_ITEMS_IN_ROW)
340  typedef enum {
341  AUTH_ASYM_ID,
342  AS_ID,
343  LABEL_ALT_ID,
344  LABEL_ASYM_ID,
345  LABEL_ATOM_ID,
346  LABEL_COMP_ID,
347  LABEL_ENTITY_ID,
348  LABEL_SEQ_ID,
349  AUTH_SEQ_ID,
350  TYPE_SYMBOL,
351  CARTN_X,
352  CARTN_Y,
353  CARTN_Z,
354  OCCUPANCY,
355  B_ISO_OR_EQUIV,
356  PDBX_PDB_INS_CODE,
357  GROUP_PDB,
358  PDBX_PDB_MODEL_NUM,
359  FORMAL_CHARGE
360  } AtomSiteItems;
361
    /// \enum items of the entity category
363  typedef enum {
364  E_ID,
365  E_TYPE,
366  PDBX_DESCRIPTION
367  } EntityItems;
368
    /// \enum items of the entity_poly category
370  typedef enum {
371  ENTITY_ID,
372  EP_TYPE,
373  PDBX_SEQ_ONE_LETTER_CODE,
374  PDBX_SEQ_ONE_LETTER_CODE_CAN
375  } EntityPolyItems;
376
    /// \enum items of the citation category
378  typedef enum {
379  CITATION_ID,
380  ABSTRACT_ID_CAS,
381  BOOK_ID_ISBN,
382  BOOK_TITLE,
383  BOOK_PUBLISHER,
384  BOOK_PUBLISHER_CITY,
385  JOURNAL_ABBREV,
386  JOURNAL_VOLUME,
387  PAGE_FIRST,
388  PAGE_LAST,
389  PDBX_DATABASE_ID_DOI,
390  PDBX_DATABASE_ID_PUBMED,
391  YEAR,
392  TITLE
393  } CitationItems;
394
    /// \enum items of the citation_author category
396  typedef enum {
397  AUTHOR_CITATION_ID,
398  AUTHOR_NAME,
399  ORDINAL
400  } CitationAuthorItems;
401
    /// \enum items of the exptl category
403  typedef enum {
404  EXPTL_ENTRY_ID,
405  METHOD
406  } ExptlItems;
407
    /// \enum items of the refine category
409  typedef enum {
410  REFINE_ENTRY_ID,
411  LS_D_RES_HIGH,
412  LS_D_RES_LOW,
413  LS_R_FACTOR_R_WORK,
414  LS_R_FACTOR_R_FREE
415  } RefineItems;
416
    /// \enum items of the pdbx_struct_assembly category
418  typedef enum {
419  PSA_DETAILS,
420  PSA_ID,
421  METHOD_DETAILS
422  } PdbxStructAssemblyItems;
423
424  // \enum items of the struct_ref category
425  typedef enum {
426  SR_ENTITY_ID,
427  SR_ID,
428  SR_DB_CODE,
429  SR_DB_NAME,
430  SR_DB_ACCESS
431  } StructRefItems;
432
    /// \enum items of the struct_ref_seq category
434  typedef enum {
435  SRS_ALIGN_ID,
436  SRS_STRUCT_REF_ID,
437  SRS_PDBX_STRAND_ID,
438  SRS_DB_ALIGN_BEG,
439  SRS_DB_ALIGN_END,
440  SRS_ENT_ALIGN_BEG,
441  SRS_ENT_ALIGN_END
442  } StructRefSeqItems;
443
    /// \enum items of the struct_ref_seq_dif category
445  typedef enum {
446  SRSD_ALIGN_ID,
447  SRSD_SEQ_RNUM,
448  SRSD_DB_RNUM,
449  SRSD_DETAILS
450  } StructRefSeqDifItems;
451
    /// \enum items of the pdbx_struct_assembly_gen category
453  typedef enum {
454  ASSEMBLY_ID,
455  ASYM_ID_LIST,
456  OPER_EXPRESSION
457  } PdbxStructAssemblyGenItems;
458
    /// \enum items of the pdbx_struct_oper_list category (ID, type, a
    ///       3-component vector and a 3x3 matrix, going by the names)
460  typedef enum {
461  PSOL_ID,
462  PSOL_TYPE,
463  VECTOR_1,
464  VECTOR_2,
465  VECTOR_3,
466  MATRIX_1_1,
467  MATRIX_1_2,
468  MATRIX_1_3,
469  MATRIX_2_1,
470  MATRIX_2_2,
471  MATRIX_2_3,
472  MATRIX_3_1,
473  MATRIX_3_2,
474  MATRIX_3_3
475  } PdbxStructOperListItems;
476
    /// \enum items of the struct category
478  typedef enum {
479  STRUCT_ENTRY_ID,
480  PDBX_CASP_FLAG,
481  PDBX_DESCRIPTOR,
482  PDBX_FORMULA_WEIGHT,
483  PDBX_FORMULA_WEIGHT_METHOD,
484  PDBX_MODEL_DETAILS,
485  PDBX_MODEL_TYPE_DETAILS,
486  STRUCT_TITLE
487  } StructItems;
488
    /// \enum items of the struct_conf category
490  typedef enum {
491  SC_BEG_AUTH_ASYM_ID,
492  SC_BEG_LABEL_ASYM_ID,
493  SC_BEG_LABEL_COMP_ID,
494  SC_BEG_LABEL_SEQ_ID,
495  SC_CONF_TYPE_ID,
496  SC_END_AUTH_ASYM_ID,
497  SC_END_LABEL_ASYM_ID,
498  SC_END_LABEL_COMP_ID,
499  SC_END_LABEL_SEQ_ID,
500  SC_ID,
501  } StructConfItems;
502
    /// \enum items of the struct_sheet_range category
504  typedef enum {
505  SSR_BEG_LABEL_ASYM_ID,
506  SSR_BEG_LABEL_COMP_ID,
507  SSR_BEG_LABEL_SEQ_ID,
508  SSR_END_LABEL_ASYM_ID,
509  SSR_END_LABEL_COMP_ID,
510  SSR_END_LABEL_SEQ_ID,
511  SSR_SHEET_ID,
512  SSR_ID,
513  SSR_BEG_AUTH_ASYM_ID,
514  SSR_END_AUTH_ASYM_ID,
515  } StructSheetRangeItems;
516
    /// \enum items of the pdbx_database_PDB_obs_spr category
518  typedef enum {
519  DATE,
520  PDPOS_ID,
521  PDB_ID,
522  REPLACE_PDB_ID,
523  } PdbxDatabasePDBObsSpr;
524
    /// \enum items of the database_PDB_rev category
526  typedef enum {
527  DPI_NUM,
528  DPI_DATE,
529  DPI_DATE_ORIGINAL,
530  DPI_STATUS,
531  } DatabasePDBRevItems;
532
    /// \enum items of the pdbx_audit_revision_history category
534  typedef enum {
535  PARH_ORDINAL,
536  PARH_REVISION_DATE,
537  PARH_MAJOR,
538  PARH_MINOR,
539  } PdbxAuditRevisionHistoryItems;
540
    /// \enum items of the pdbx_audit_revision_details category
542  typedef enum {
543  PARD_REVISION_ORDINAL,
544  PARD_TYPE,
545  } PdbxAuditRevisionDetailsItems;
546
    /// \enum items of the pdbx_database_status category
548  typedef enum {
549  PDS_RECVD_INITIAL_DEPOSITION_DATE,
550  } PdbxDatabaseStatusItems;
551
    /// \enum items of the pdbx_entity_branch category
553  typedef enum {
554  BR_ENTITY_ID,
555  BR_ENTITY_TYPE
556  } EntityBranchItems;
557
    /// \enum items of the pdbx_entity_branch_link category
559  typedef enum {
560  BL_ENTITY_ID,
561  BL_ATOM_ID_1,
562  BL_ATOM_ID_2,
563  BL_COMP_ID_1,
564  BL_COMP_ID_2,
565  BL_ENTITY_BRANCH_LIST_NUM_1,
566  BL_ENTITY_BRANCH_LIST_NUM_2,
567  BL_ATOM_STEREO_CONFIG_1,
568  BL_ATOM_STEREO_CONFIG_2,
569  BL_VALUE_ORDER
570  } EntityBranchLinkItems;
571
    /// \enum items of the entity_poly_seq category
573  typedef enum {
574  EPS_ENTITY_ID,
575  EPS_MON_ID,
576  EPS_NUM,
577  EPS_HETERO
578  } EntityPolySeqItems;
579
    /// \enum items of the em_3d_reconstruction category
581  typedef enum {
582  EM_RESOLUTION
583  } Em3DReconstructionItems;
584
    /// \enum categories known to the reader.
    ///
    /// DONT_KNOW is the last enumerator; DONT_KNOW+1 is used as the size of
    /// category_counts_, so new categories must be added before it.
586  typedef enum {
587  ATOM_SITE,
588  ENTITY,
589  ENTITY_POLY,
590  CITATION,
591  CITATION_AUTHOR,
592  EXPTL,
593  REFINE,
594  PDBX_STRUCT_ASSEMBLY,
595  PDBX_STRUCT_ASSEMBLY_GEN,
596  PDBX_STRUCT_OPER_LIST,
597  STRUCT,
598  STRUCT_CONF,
599  STRUCT_SHEET_RANGE,
600  PDBX_DATABASE_PDB_OBS_SPR,
601  STRUCT_REF,
602  STRUCT_REF_SEQ,
603  STRUCT_REF_SEQ_DIF,
604  DATABASE_PDB_REV,
605  PDBX_AUDIT_REVISION_HISTORY,
606  PDBX_AUDIT_REVISION_DETAILS,
607  PDBX_DATABASE_STATUS,
608  PDBX_ENTITY_BRANCH,
609  PDBX_ENTITY_BRANCH_LINK,
610  ENTITY_POLY_SEQ,
611  EM_3D_RECONSTRUCTION,
612  DONT_KNOW
613  } MMCifCategory;
614
    /// \brief Get an iterator into an MMCifEntityDescMap for \p entity_id.
    // NOTE(review): whether a missing entry is created or end() is returned
    // is not visible here - check the implementation.
618  MMCifEntityDescMap::iterator GetEntityDescMapIterator(const String& entity_id);
619
    /// \struct one biounit assembly: its ID, a list of chains and a list of
    ///         operation blocks (field meanings inferred from the names).
621  typedef struct {
622  String biounit_id;
623  std::vector<String> chains;
625  std::vector<std::vector<String> > operations;
627  } MMCifBioUAssembly;
628  typedef std::vector<MMCifBioUAssembly> MMCifBioUAssemblyVector;
629
    // Type of authors_map_. Presumably keyed by citation ID and holding the
    // author ordinals and names - TODO confirm in ParseCitationAuthor().
630  typedef std::map<String, std::pair<std::vector<int>, std::vector<String> > >
631  MMCifCitationAuthorMap;
632
    /// \struct a residue range (start, end) on a named chain; element type of
    ///         helix_list_ and strand_list_.
634  typedef struct {
635  mol::ResNum start;
636  mol::ResNum end;
637  String chain_name;
638  } MMCifHSEntry;
639  typedef std::vector<MMCifHSEntry> MMCifHSVector;
640
    /// \struct details / method_details strings; value type of the
    ///         String-keyed map bu_origin_map_.
642  typedef struct {
643  String details;
644  String method_details;
645  } MMCifPSAEntry;
646  typedef std::map<String, MMCifPSAEntry> MMCifPSAMap;
647
    /// \struct one revision entry: number, date and major/minor version;
    ///         element type of revisions_.
649  struct MMCifRevisionDesc {
650  // silly GCC note: major() & minor() exist as macros...facepalm
    // Presumably this is why num/major/minor are assigned in the constructor
    // body instead of the initialiser list, where `major(_major)` would look
    // like a macro invocation - keep it that way unless the macros are ruled
    // out.
651  MMCifRevisionDesc(int _num, const String& _date, int _major, int _minor)
652  : date(_date) {
653  num = _num;
654  major = _major;
655  minor = _minor;
656  }
657  int num;
658  String date;
659  int major;
660  int minor;
661  };
662
    /// \struct one link between two atoms, each given by residue number,
    ///         compound name and atom name, plus a bond order; stored per
    ///         String key in entity_branch_link_map_.
664  typedef struct {
665  int res_num_1;
666  String cmp_1;
667  String atm_nm_1;
668  int res_num_2;
669  String cmp_2;
670  String atm_nm_2;
671  unsigned char bond_order;
672  } MMCifPdbxEntityBranchLink;
673  typedef std::map<String, std::vector<MMCifPdbxEntityBranchLink> >
674  MMCifPdbxEntityBranchLinkMap;
675
676  // members
    // category_counts_ has one slot per MMCifCategory value; indices_ is the
    // table written by TryStoreIdx().
677  MMCifCategory category_;
678  int category_counts_[DONT_KNOW+1];
679  int indices_[MAX_ITEMS_IN_ROW];
    // profile_ and ent_handle_ are references, so the referenced objects must
    // outlive the reader.
680  const IOProfile& profile_;
681  mol::EntityHandle& ent_handle_;
    // returned by GetRestrictChains()
682  String restrict_chains_;
    // set by SetAuthChainID()
683  bool auth_chain_id_;
684  mol::ChainHandle curr_chain_;
685  mol::ResidueHandle curr_residue_;
686  int chain_count_;
687  int residue_count_;
688  int atom_count_;
689  bool warned_name_mismatch_;
690  bool warned_rule_based_;
691  String subst_res_id_;
692  bool has_model_;
693  int curr_model_;
694  std::set<int> warned_ignored_model_; // keep track of ignored model warnings
695  std::vector<std::pair<mol::ChainHandle, String> > chain_id_pairs_;
697  MMCifEntityDescMap entity_desc_map_;
    // returned by GetInfo()
698  MMCifInfo info_;
699  MMCifCitationAuthorMap authors_map_;
700  MMCifBioUAssemblyVector bu_assemblies_;
701  MMCifPSAMap bu_origin_map_;
702  MMCifHSVector helix_list_;
703  MMCifHSVector strand_list_;
    // returned by GetStructRefs()
704  MMCifInfoStructRefs struct_refs_;
705  // for storing revisions
706  std::vector<MMCifRevisionDesc> revisions_;
707  std::map<int, String> revision_types_;
708  bool database_PDB_rev_added_;
709  // for entity_branch connections
710  MMCifPdbxEntityBranchLinkMap entity_branch_link_map_;
711  // for storing entity_poly_seq
712  std::map<String, std::map<int, String> > entity_poly_seq_map_;
713  std::map<String, std::vector<std::pair<int, String> > > entity_poly_seq_h_map_;
714 };
715 
721  const StringRef value_order);
722 
727  const unsigned char bond_order);
728 }}
729 
730 #endif
731 
732 // LocalWords: MMCifEntityDescMap
convenient datatype for referencing character data
Definition: string_ref.hh:39
container class for additional information from MMCif files
Definition: mmcif_info.hh:983
void ParseExptl(const std::vector< StringRef > &columns)
Fetch mmCIF exptl information.
MMCifReader(std::istream &stream, mol::EntityHandle &ent_handle, const IOProfile &profile)
create a MMCifReader
void ParsePdbxDatabaseStatus(const std::vector< StringRef > &columns)
Fetch mmCIF pdbx_database_status information.
virtual bool OnBeginData(const StringRef &data_name)
check mmcif input to be read. Substitutional function for StarParser.
void ParsePdbxStructAssembly(const std::vector< StringRef > &columns)
Fetch mmCIF pdbx_struct_assembly information.
void ParseStruct(const std::vector< StringRef > &columns)
Fetch mmCIF struct information.
void ParseStructRef(const std::vector< StringRef > &columns)
Parse a row in the struct_ref category.
void Init()
Initialise the reader.
void ParsePdbxAuditRevisionDetails(const std::vector< StringRef > &columns)
Fetch mmCIF pdbx_audit_revision_details information.
MMCifSecStructElement DetermineSecStructType(const StringRef &type) const
Check whether an element was classified sheet or helix.
const String & GetRestrictChains() const
Definition: mmcif_reader.hh:92
std::vector< std::vector< String > > UnPackOperExperession(StringRef expression)
bool ParseAtomIdent(const std::vector< StringRef > &columns, String &auth_chain_name, String &cif_chain_name, StringRef &res_name, mol::ResNum &resnum, bool &valid_res_num, StringRef &atom_name, char &alt_loc)
fetch values identifying atoms
virtual void OnEndData()
Finalise parsing.
void ParsePdbxStructOperList(const std::vector< StringRef > &columns)
Fetch mmCIF pdbx_struct_oper_list information.
void ParseEntityPoly(const std::vector< StringRef > &columns)
Fetch mmCIF entity_poly information.
void StoreExpression(const char *l, const char *s, bool &is_range, int lborder, std::vector< String > &single_block)
virtual void OnDataRow(const StarLoopDesc &header, const std::vector< StringRef > &columns)
read a row of data
void ParseEntity(const std::vector< StringRef > &columns)
Fetch mmCIF entity information.
void SetAuthChainID(bool id)
Enable or disable reading of auth_chain_id instead of label_chain_id (default)
void SetRestrictChains(const String &restrict_chains)
Set names of restricted chains for the reader.
void AssignSecStructure(mol::EntityHandle ent)
Transform data from struct_conf entry into secondary structure.
const MMCifInfo & GetInfo()
Get additional information of the mmCIF file.
void ParseEntityPolySeq(const std::vector< StringRef > &columns)
Fetch mmCIF entity_poly_seq information.
void ParseStructRefSeqDif(const std::vector< StringRef > &columns)
parse row in the struct_ref_seq_dif category
void ParsePdbxEntityBranchLink(const std::vector< StringRef > &columns)
Fetch mmCIF pdbx_entity_branch_link information.
const MMCifInfoStructRefs & GetStructRefs() const
seq::SequenceList GetSeqRes() const
Return sequences.
void ParsePdbxStructAssemblyGen(const std::vector< StringRef > &columns)
Fetch mmCIF pdbx_struct_assembly_gen information.
void ParseDatabasePDBRev(const std::vector< StringRef > &columns)
Fetch mmCIF database_PDB_rev information.
void StoreRange(const char *&l, const char *s, bool &is_range, int &lborder, std::vector< String > &single_block)
void ClearState()
Set up a fresh instance.
void ParsePdbxAuditRevisionHistory(const std::vector< StringRef > &columns)
Fetch mmCIF pdbx_audit_revision_history information.
void ParseCitationAuthor(const std::vector< StringRef > &columns)
Fetch mmCIF citation_author information.
void ParseStructConf(const std::vector< StringRef > &columns)
Fetch mmCIF struct_conf (secondary structure) information.
void ParsePdbxEntityBranch(const std::vector< StringRef > &columns)
Fetch mmCIF pdbx_entity_branch information.
void ParsePdbxDatabasePdbObsSpr(const std::vector< StringRef > &columns)
Fetch mmCIF pdbx_database_PDB_obs_spr information.
void ParseAndAddAtom(const std::vector< StringRef > &columns)
Fetch atom information and store it.
void TryStoreIdx(const int mapping, const String &item, const StarLoopDesc &header)
Store an item index from loop header in preparation for reading a row. Throws an exception if the ite...
MMCifReader(const String &filename, mol::EntityHandle &ent_handle, const IOProfile &profile)
create a MMCifReader
void ParseStructRefSeq(const std::vector< StringRef > &columns)
parse row in the struct_ref_seq category
virtual bool OnBeginLoop(const StarLoopDesc &header)
check if a current loop is to be parsed
void ParseCitation(const std::vector< StringRef > &columns)
Fetch mmCIF citation information.
void ParseRefine(const std::vector< StringRef > &columns)
Fetch mmCIF refine information.
void ParseStructSheetRange(const std::vector< StringRef > &columns)
Fetch mmCIF struct_sheet_range (beta sheets) information.
void ParseEm3DReconstruction(const std::vector< StringRef > &columns)
Fetch mmCIF em_3d_reconstruction information.
int GetIndex(const String &name) const
Definition: star_parser.hh:65
const String & GetCategory() const
Definition: star_parser.hh:92
Protein or molecule.
list of sequences.
#define DLLEXPORT_OST_IO
std::string String
Definition: base.hh:54
@ STAR_DIAG_ERROR
Definition: star_parser.hh:40
DLLEXPORT_OST_IO String OSTBondOrderToMMCifValueOrder(const unsigned char bond_order)
Translate an OST bond_order to mmCIF value_order.
std::vector< MMCifInfoStructRefPtr > MMCifInfoStructRefs
Definition: mmcif_info.hh:843
std::map< String, MMCifEntityDesc > MMCifEntityDescMap
Definition: mmcif_info.hh:967
DLLEXPORT_OST_IO unsigned char MMCifValueOrderToOSTBondOrder(const StringRef value_order)
Translate mmCIF info on bond type (e.g. pdbx_entity_branch_link.value_order) to OST bond_order.
pointer_it< T > end(const std::vector< T > &values)
Definition: base.dox:1