OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
mmcif_reader.hh
Go to the documentation of this file.
1 //------------------------------------------------------------------------------
2 // This file is part of the OpenStructure project <www.openstructure.org>
3 //
4 // Copyright (C) 2008-2020 by the OpenStructure authors
5 //
6 // This library is free software; you can redistribute it and/or modify it under
7 // the terms of the GNU Lesser General Public License as published by the Free
8 // Software Foundation; either version 3.0 of the License, or (at your option)
9 // any later version.
10 // This library is distributed in the hope that it will be useful, but WITHOUT
11 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with this library; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 //------------------------------------------------------------------------------
19 #ifndef OST_MMCIF_READER_HH
20 #define OST_MMCIF_READER_HH
21 
22 #include <map>
23 
24 #include <ost/geom/geom.hh>
25 #include <ost/seq/sequence_list.hh>
27 #include <ost/mol/chain_type.hh>
29 #include <ost/io/mol/io_profile.hh>
30 #include <ost/io/io_exception.hh>
32 #include <ost/io/mol/mmcif_info.hh>
33 
34 namespace ost { namespace io {
35 
62 public:
68 
69  MMCifReader(std::istream& stream, mol::EntityHandle& ent_handle,
70  const IOProfile& profile);
71 
77  MMCifReader(const String& filename, mol::EntityHandle& ent_handle,
78  const IOProfile& profile);
79 
82  void Init();
83 
85  void ClearState();
86 
90  void SetRestrictChains(const String& restrict_chains);
91 
97  void SetReadCanonicalSeqRes(bool flag)
98  {
99  seqres_can_ = flag;
100  }
101 
102  const String& GetRestrictChains() const
103  {
104  return restrict_chains_;
105  }
106 
111  void SetAuthChainID(bool id)
112  {
113  auth_chain_id_ = id;
114  }
115 
122  virtual bool OnBeginData(const StringRef& data_name);
123 
129  virtual bool OnBeginLoop(const StarLoopDesc& header); // tested
130 
135  virtual void OnDataRow(const StarLoopDesc& header,
136  const std::vector<StringRef>& columns);
137 
139  virtual void OnEndData();
140 
145  return seqres_;
146  }
147 
151  void SetReadSeqRes(bool flag)
152  {
153  read_seqres_ = flag;
154  }
155 
159  bool GetReadSeqRes() const
160  {
161  return read_seqres_;
162  }
163 
167  const MMCifInfo& GetInfo() { return info_; }
168 
169 protected:
176  void TryStoreIdx(const int mapping,
177  const String& item,
178  const StarLoopDesc& header)
179  {
180  indices_[mapping] = header.GetIndex(item);
181 
182  if (indices_[mapping] == -1) {
183  throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
184  "No item '" + item +
185  "' found in '" +
186  header.GetCategory()+
187  "' header",
188  this->GetCurrentLinenum()));
189  }
190  } // tested
191 
205  bool ParseAtomIdent(const std::vector<StringRef>& columns,
206  String& auth_chain_name,
207  String& cif_chain_name,
208  StringRef& res_name,
209  mol::ResNum& resnum,
210  bool& valid_res_num,
211  StringRef& atom_name,
212  char& alt_loc);
213 
217  void ParseAndAddAtom(const std::vector<StringRef>& columns);
218 
222  void ParseEntity(const std::vector<StringRef>& columns);
223 
227  void ParseEntityPoly(const std::vector<StringRef>& columns);
228 
232  void ParseCitation(const std::vector<StringRef>& columns);
233 
234  const MMCifInfoStructRefs& GetStructRefs() const { return struct_refs_; }
243  String ConvertSEQRES(const String& seqres, conop::CompoundLibBasePtr compound_lib);
247  void ParseCitationAuthor(const std::vector<StringRef>& columns);
248 
250  void ParseStructRef(const std::vector<StringRef>& columns);
251 
253  void ParseStructRefSeq(const std::vector<StringRef>& columns);
254 
256  void ParseStructRefSeqDif(const std::vector<StringRef>& columns);
260  void ParseExptl(const std::vector<StringRef>& columns);
261 
265  void ParseRefine(const std::vector<StringRef>& columns);
266 
270  void ParsePdbxStructAssembly(const std::vector<StringRef>& columns);
271 
275  void ParsePdbxStructAssemblyGen(const std::vector<StringRef>& columns);
276 
277  std::vector<std::vector<String> > UnPackOperExperession(StringRef expression);
278 
279  void StoreExpression(const char* l, const char* s,
280  bool& is_range, int lborder,
281  std::vector<String>& single_block);
282 
283  void StoreRange(const char*& l, const char* s, bool& is_range, int& lborder,
284  std::vector<String>& single_block);
285 
289  void ParsePdbxStructOperList(const std::vector<StringRef>& columns);
290 
294  void ParseDatabasePDBRev(const std::vector<StringRef>& columns);
295 
299  void ParsePdbxAuditRevisionHistory(const std::vector<StringRef>& columns);
300 
304  void ParsePdbxAuditRevisionDetails(const std::vector<StringRef>& columns);
305 
309  void ParsePdbxDatabaseStatus(const std::vector<StringRef>& columns);
310 
314  void ParseStruct(const std::vector<StringRef>& columns);
315 
319  void ParseStructConf(const std::vector<StringRef>& columns);
320 
324  void ParseStructSheetRange(const std::vector<StringRef>& columns);
325 
329  void ParsePdbxDatabasePdbObsSpr(const std::vector<StringRef>& columns);
330 
334  void ParsePdbxEntityBranch(const std::vector<StringRef>& columns);
335 
339  void ParsePdbxEntityBranchLink(const std::vector<StringRef>& columns);
340 
344  void ParseEntityPolySeq(const std::vector<StringRef>& columns);
345 
349  void ParseEm3DReconstruction(const std::vector<StringRef>& columns);
350 
352  typedef enum {
356  MMCIF_COIL
357  } MMCifSecStructElement;
358 
362  MMCifSecStructElement DetermineSecStructType(const StringRef& type) const;
363 
367  void AssignSecStructure(mol::EntityHandle ent);
368 
369 private:
371  typedef enum {
372  MAX_ITEMS_IN_ROW=19
373  } MMCifMagicNos;
374 
376  typedef enum {
377  AUTH_ASYM_ID,
378  AS_ID,
379  LABEL_ALT_ID,
380  LABEL_ASYM_ID,
381  LABEL_ATOM_ID,
382  LABEL_COMP_ID,
383  LABEL_ENTITY_ID,
384  LABEL_SEQ_ID,
385  AUTH_SEQ_ID,
386  TYPE_SYMBOL,
387  CARTN_X,
388  CARTN_Y,
389  CARTN_Z,
390  OCCUPANCY,
391  B_ISO_OR_EQUIV,
392  PDBX_PDB_INS_CODE,
393  GROUP_PDB,
394  PDBX_PDB_MODEL_NUM,
395  FORMAL_CHARGE
396  } AtomSiteItems;
397 
399  typedef enum {
400  E_ID,
401  E_TYPE,
402  PDBX_DESCRIPTION
403  } EntityItems;
404 
406  typedef enum {
407  ENTITY_ID,
408  EP_TYPE,
409  PDBX_SEQ_ONE_LETTER_CODE,
410  PDBX_SEQ_ONE_LETTER_CODE_CAN
411  } EntityPolyItems;
412 
414  typedef enum {
415  CITATION_ID,
416  ABSTRACT_ID_CAS,
417  BOOK_ID_ISBN,
418  BOOK_TITLE,
419  BOOK_PUBLISHER,
420  BOOK_PUBLISHER_CITY,
421  JOURNAL_ABBREV,
422  JOURNAL_VOLUME,
423  PAGE_FIRST,
424  PAGE_LAST,
425  PDBX_DATABASE_ID_DOI,
426  PDBX_DATABASE_ID_PUBMED,
427  YEAR,
428  TITLE
429  } CitationItems;
430 
432  typedef enum {
433  AUTHOR_CITATION_ID,
434  AUTHOR_NAME,
435  ORDINAL
436  } CitationAuthorItems;
437 
439  typedef enum {
440  EXPTL_ENTRY_ID,
441  METHOD
442  } ExptlItems;
443 
445  typedef enum {
446  REFINE_ENTRY_ID,
447  LS_D_RES_HIGH,
448  LS_D_RES_LOW,
449  LS_R_FACTOR_R_WORK,
450  LS_R_FACTOR_R_FREE
451  } RefineItems;
452 
454  typedef enum {
455  PSA_DETAILS,
456  PSA_ID,
457  METHOD_DETAILS
458  } PdbxStructAssemblyItems;
459 
460  // \enum items of the struct_ref category
461  typedef enum {
462  SR_ENTITY_ID,
463  SR_ID,
464  SR_DB_CODE,
465  SR_DB_NAME,
466  SR_DB_ACCESS
467  } StructRefItems;
468 
470  typedef enum {
471  SRS_ALIGN_ID,
472  SRS_STRUCT_REF_ID,
473  SRS_PDBX_STRAND_ID,
474  SRS_DB_ALIGN_BEG,
475  SRS_DB_ALIGN_END,
476  SRS_ENT_ALIGN_BEG,
477  SRS_ENT_ALIGN_END
478  } StructRefSeqItems;
479 
481  typedef enum {
482  SRSD_ALIGN_ID,
483  SRSD_SEQ_RNUM,
484  SRSD_DB_RNUM,
485  SRSD_DETAILS
486  } StructRefSeqDifItems;
487 
489  typedef enum {
490  ASSEMBLY_ID,
491  ASYM_ID_LIST,
492  OPER_EXPRESSION
493  } PdbxStructAssemblyGenItems;
494 
496  typedef enum {
497  PSOL_ID,
498  PSOL_TYPE,
499  VECTOR_1,
500  VECTOR_2,
501  VECTOR_3,
502  MATRIX_1_1,
503  MATRIX_1_2,
504  MATRIX_1_3,
505  MATRIX_2_1,
506  MATRIX_2_2,
507  MATRIX_2_3,
508  MATRIX_3_1,
509  MATRIX_3_2,
510  MATRIX_3_3
511  } PdbxStructOperListItems;
512 
514  typedef enum {
515  STRUCT_ENTRY_ID,
516  PDBX_CASP_FLAG,
517  PDBX_DESCRIPTOR,
518  PDBX_FORMULA_WEIGHT,
519  PDBX_FORMULA_WEIGHT_METHOD,
520  PDBX_MODEL_DETAILS,
521  PDBX_MODEL_TYPE_DETAILS,
522  STRUCT_TITLE
523  } StructItems;
524 
526  typedef enum {
527  SC_BEG_AUTH_ASYM_ID,
528  SC_BEG_LABEL_ASYM_ID,
529  SC_BEG_LABEL_COMP_ID,
530  SC_BEG_LABEL_SEQ_ID,
531  SC_CONF_TYPE_ID,
532  SC_END_AUTH_ASYM_ID,
533  SC_END_LABEL_ASYM_ID,
534  SC_END_LABEL_COMP_ID,
535  SC_END_LABEL_SEQ_ID,
536  SC_ID,
537  } StructConfItems;
538 
540  typedef enum {
541  SSR_BEG_LABEL_ASYM_ID,
542  SSR_BEG_LABEL_COMP_ID,
543  SSR_BEG_LABEL_SEQ_ID,
544  SSR_END_LABEL_ASYM_ID,
545  SSR_END_LABEL_COMP_ID,
546  SSR_END_LABEL_SEQ_ID,
547  SSR_SHEET_ID,
548  SSR_ID,
549  SSR_BEG_AUTH_ASYM_ID,
550  SSR_END_AUTH_ASYM_ID,
551  } StructSheetRangeItems;
552 
554  typedef enum {
555  DATE,
556  PDPOS_ID,
557  PDB_ID,
558  REPLACE_PDB_ID,
559  } PdbxDatabasePDBObsSpr;
560 
562  typedef enum {
563  DPI_NUM,
564  DPI_DATE,
565  DPI_DATE_ORIGINAL,
566  DPI_STATUS,
567  } DatabasePDBRevItems;
568 
570  typedef enum {
571  PARH_ORDINAL,
572  PARH_REVISION_DATE,
573  PARH_MAJOR,
574  PARH_MINOR,
575  } PdbxAuditRevisionHistoryItems;
576 
578  typedef enum {
579  PARD_REVISION_ORDINAL,
580  PARD_TYPE,
581  } PdbxAuditRevisionDetailsItems;
582 
584  typedef enum {
585  PDS_RECVD_INITIAL_DEPOSITION_DATE,
586  } PdbxDatabaseStatusItems;
587 
589  typedef enum {
590  BR_ENTITY_ID,
591  BR_ENTITY_TYPE
592  } EntityBranchItems;
593 
595  typedef enum {
596  BL_ENTITY_ID,
597  BL_ATOM_ID_1,
598  BL_ATOM_ID_2,
599  BL_COMP_ID_1,
600  BL_COMP_ID_2,
601  BL_ENTITY_BRANCH_LIST_NUM_1,
602  BL_ENTITY_BRANCH_LIST_NUM_2,
603  BL_ATOM_STEREO_CONFIG_1,
604  BL_ATOM_STEREO_CONFIG_2,
605  BL_VALUE_ORDER
606  } EntityBranchLinkItems;
607 
609  typedef enum {
610  EPS_ENTITY_ID,
611  EPS_MON_ID,
612  EPS_NUM,
613  EPS_HETERO
614  } EntityPolySeqItems;
615 
617  typedef enum {
618  EM_RESOLUTION
619  } Em3DReconstructionItems;
620 
622  typedef enum {
623  ATOM_SITE,
624  ENTITY,
625  ENTITY_POLY,
626  CITATION,
627  CITATION_AUTHOR,
628  EXPTL,
629  REFINE,
630  PDBX_STRUCT_ASSEMBLY,
631  PDBX_STRUCT_ASSEMBLY_GEN,
632  PDBX_STRUCT_OPER_LIST,
633  STRUCT,
634  STRUCT_CONF,
635  STRUCT_SHEET_RANGE,
636  PDBX_DATABASE_PDB_OBS_SPR,
637  STRUCT_REF,
638  STRUCT_REF_SEQ,
639  STRUCT_REF_SEQ_DIF,
640  DATABASE_PDB_REV,
641  PDBX_AUDIT_REVISION_HISTORY,
642  PDBX_AUDIT_REVISION_DETAILS,
643  PDBX_DATABASE_STATUS,
644  PDBX_ENTITY_BRANCH,
645  PDBX_ENTITY_BRANCH_LINK,
646  ENTITY_POLY_SEQ,
647  EM_3D_RECONSTRUCTION,
648  DONT_KNOW
649  } MMCifCategory;
650 
654  MMCifEntityDescMap::iterator GetEntityDescMapIterator(const String& entity_id);
655 
657  typedef struct {
658  String biounit_id;
659  std::vector<String> chains;
660  std::vector<std::vector<String> > operations;
662  } MMCifBioUAssembly;
664  typedef std::vector<MMCifBioUAssembly> MMCifBioUAssemblyVector;
665 
666  typedef std::map<String, std::pair<std::vector<int>, std::vector<String> > >
667  MMCifCitationAuthorMap;
668 
670  typedef struct {
671  mol::ResNum start;
672  mol::ResNum end;
673  String chain_name;
674  } MMCifHSEntry;
675  typedef std::vector<MMCifHSEntry> MMCifHSVector;
676 
678  typedef struct {
679  String details;
680  String method_details;
681  } MMCifPSAEntry;
682  typedef std::map<String, MMCifPSAEntry> MMCifPSAMap;
683 
685  struct MMCifRevisionDesc {
686  // silly GCC note: major() & minor() exist as macros...facepalm
687  MMCifRevisionDesc(int _num, const String& _date, int _major, int _minor)
688  : date(_date) {
689  num = _num;
690  major = _major;
691  minor = _minor;
692  }
693  int num;
694  String date;
695  int major;
696  int minor;
697  };
698 
700  typedef struct {
701  int res_num_1;
702  String cmp_1;
703  String atm_nm_1;
704  int res_num_2;
705  String cmp_2;
706  String atm_nm_2;
707  unsigned char bond_order;
708  } MMCifPdbxEntityBranchLink;
709  typedef std::map<String, std::vector<MMCifPdbxEntityBranchLink> >
710  MMCifPdbxEntityBranchLinkMap;
711 
712  // members
713  MMCifCategory category_;
714  int category_counts_[DONT_KNOW+1];
715  int indices_[MAX_ITEMS_IN_ROW];
716  const IOProfile& profile_;
717  mol::EntityHandle& ent_handle_;
718  String restrict_chains_;
719  bool auth_chain_id_;
720  bool seqres_can_;
721  mol::ChainHandle curr_chain_;
722  mol::ResidueHandle curr_residue_;
723  int chain_count_;
724  int residue_count_;
725  int atom_count_;
726  bool warned_name_mismatch_;
727  bool warned_rule_based_;
728  String subst_res_id_;
729  bool has_model_;
730  int curr_model_;
731  std::vector<std::pair<mol::ChainHandle, String> > chain_id_pairs_;
733  MMCifEntityDescMap entity_desc_map_;
734  seq::SequenceList seqres_;
735  bool read_seqres_;
736  MMCifInfo info_;
737  MMCifCitationAuthorMap authors_map_;
738  MMCifBioUAssemblyVector bu_assemblies_;
739  MMCifPSAMap bu_origin_map_;
740  MMCifHSVector helix_list_;
741  MMCifHSVector strand_list_;
742  MMCifInfoStructRefs struct_refs_;
743  // for storing revisions
744  std::vector<MMCifRevisionDesc> revisions_;
745  std::map<int, String> revision_types_;
746  bool database_PDB_rev_added_;
747  // for entity_branch connections
748  MMCifPdbxEntityBranchLinkMap entity_branch_link_map_;
749  // for storing entity_poly_seq
750  std::map<String, std::map<int, String> > entity_poly_seq_map_;
751  std::map<String, std::vector<std::pair<int, String> > > entity_poly_seq_h_map_;
752 };
753 
759  const StringRef value_order);
760 
765  const unsigned char bond_order);
766 }}
767 
768 #endif
769 
770 // LocalWords: MMCifEntityDescMap
convenient datatype for referencing character data
Definition: string_ref.hh:39
void SetAuthChainID(bool id)
Enable or disable reading of auth_chain_id instead of label_chain_id (default)
container class for additional information from MMCif files
Definition: mmcif_info.hh:991
boost::shared_ptr< CompoundLibBase > CompoundLibBasePtr
std::string String
Definition: base.hh:54
const MMCifInfoStructRefs & GetStructRefs() const
std::map< String, MMCifEntityDesc > MMCifEntityDescMap
Definition: mmcif_info.hh:975
seq::SequenceList GetSeqRes() const
Return sequences.
const String & GetRestrictChains() const
Protein or molecule.
pointer_it< T > end(const std::vector< T > &values)
parser for the STAR file format
Definition: star_parser.hh:114
DLLEXPORT_OST_IO unsigned char MMCifValueOrderToOSTBondOrder(const StringRef value_order)
Translate mmCIF info on bond type (e.g. pdbx_entity_branch_link.value_order) to OST bond_order...
tuple compound_lib
Definition: init.py:184
std::vector< MMCifInfoStructRefPtr > MMCifInfoStructRefs
Definition: mmcif_info.hh:843
DLLEXPORT_OST_IO String OSTBondOrderToMMCifValueOrder(const unsigned char bond_order)
Translate an OST bond_order to mmCIF value_order.
std::vector< SequenceImplPtr > SequenceList
void SetReadSeqRes(bool flag)
Toggle reading of SEQRES.
reader for the mmcif file format
Definition: mmcif_reader.hh:61
const MMCifInfo & GetInfo()
Get additional information of the mmCIF file.
#define DLLEXPORT_OST_IO
list of sequences.
void TryStoreIdx(const int mapping, const String &item, const StarLoopDesc &header)
Store an item index from loop header in preparation for reading a row. Throws an exception if the ite...
const String & GetCategory() const
Definition: star_parser.hh:92
int GetIndex(const String &name) const
Definition: star_parser.hh:65
void SetReadCanonicalSeqRes(bool flag)
Toggle reading of canonical sequence residues (entity_poly.pdbx_seq_one_letter_code_can instead of en...
Definition: mmcif_reader.hh:97
bool GetReadSeqRes() const
Check if reading of SEQRES is enabled.