OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
mmcif_reader.hh
Go to the documentation of this file.
1 //------------------------------------------------------------------------------
2 // This file is part of the OpenStructure project <www.openstructure.org>
3 //
4 // Copyright (C) 2008-2020 by the OpenStructure authors
5 //
6 // This library is free software; you can redistribute it and/or modify it under
7 // the terms of the GNU Lesser General Public License as published by the Free
8 // Software Foundation; either version 3.0 of the License, or (at your option)
9 // any later version.
10 // This library is distributed in the hope that it will be useful, but WITHOUT
11 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with this library; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 //------------------------------------------------------------------------------
19 #ifndef OST_MMCIF_READER_HH
20 #define OST_MMCIF_READER_HH
21 
22 #include <map>
23 
24 #include <ost/geom/geom.hh>
25 #include <ost/seq/sequence_list.hh>
27 #include <ost/mol/chain_type.hh>
29 #include <ost/io/mol/io_profile.hh>
30 #include <ost/io/io_exception.hh>
32 #include <ost/io/mol/mmcif_info.hh>
33 
34 namespace ost { namespace io {
35 
62 public:
68 
69  MMCifReader(std::istream& stream, mol::EntityHandle& ent_handle,
70  const IOProfile& profile);
71 
77  MMCifReader(const String& filename, mol::EntityHandle& ent_handle,
78  const IOProfile& profile);
79 
82  void Init();
83 
85  void ClearState();
86 
90  void SetRestrictChains(const String& restrict_chains);
91 
/// \brief Toggle reading of canonical sequence residues
///        (entity_poly.pdbx_seq_one_letter_code_can).
///
/// \param flag value stored in seqres_can_
97  void SetReadCanonicalSeqRes(bool flag)
98  {
99  seqres_can_ = flag;
100  }
101 
/// \brief Get the currently stored chain restriction.
///
/// \return const reference to restrict_chains_; no copy is made
102  const String& GetRestrictChains() const
103  {
104  return restrict_chains_;
105  }
106 
/// \brief Enable or disable reading of auth_chain_id instead of the label
///        chain id.
///
/// \param id value stored in auth_chain_id_
111  void SetAuthChainID(bool id)
112  {
113  auth_chain_id_ = id;
114  }
115 
122  virtual bool OnBeginData(const StringRef& data_name);
123 
129  virtual bool OnBeginLoop(const StarLoopDesc& header); // tested
130 
135  virtual void OnDataRow(const StarLoopDesc& header,
136  const std::vector<StringRef>& columns);
137 
139  virtual void OnEndData();
140 
145  return seqres_;
146  }
147 
/// \brief Toggle reading of SEQRES.
///
/// \param flag value stored in read_seqres_
151  void SetReadSeqRes(bool flag)
152  {
153  read_seqres_ = flag;
154  }
155 
/// \brief Check if reading of SEQRES is enabled.
///
/// \return current value of read_seqres_
159  bool GetReadSeqRes() const
160  {
161  return read_seqres_;
162  }
163 
/// \brief Get additional information of the mmCIF file.
///
/// Only reads info_, so it is const-qualified like the other read-only
/// accessors of this class and can be called on a const reader.
///
/// \return const reference to info_
167  const MMCifInfo& GetInfo() const { return info_; }
168 
169 protected:
/// \brief Store an item index from the loop header in preparation for
///        reading a row.
///
/// Looks \p item up with header.GetIndex() and writes the result to
/// indices_[mapping].
///
/// \param mapping slot of indices_ to fill; it is not range-checked here and
///        indices_ is declared with MAX_ITEMS_IN_ROW entries
/// \param item name of the item to look up in the loop header
/// \param header loop header that is queried
///
/// \throws IOException if GetIndex() yields -1 for \p item; the message is
///         produced by FormatDiagnostic() with STAR_DIAG_ERROR and the
///         current line number
176  void TryStoreIdx(const int mapping,
177  const String& item,
178  const StarLoopDesc& header)
179  {
180  indices_[mapping] = header.GetIndex(item);
181 
     // -1 is handled as "item missing from this header"; note that the slot
     // already holds -1 by the time the exception is thrown
182  if (indices_[mapping] == -1) {
183  throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
184  "No item '" + item +
185  "' found in '" +
186  header.GetCategory()+
187  "' header",
188  this->GetCurrentLinenum()));
189  }
190  } // tested
191 
205  bool ParseAtomIdent(const std::vector<StringRef>& columns,
206  String& auth_chain_name,
207  String& cif_chain_name,
208  StringRef& res_name,
209  mol::ResNum& resnum,
210  bool& valid_res_num,
211  StringRef& atom_name,
212  char& alt_loc);
213 
217  void ParseAndAddAtom(const std::vector<StringRef>& columns);
218 
222  void ParseEntity(const std::vector<StringRef>& columns);
223 
227  void ParseEntityPoly(const std::vector<StringRef>& columns);
228 
232  void ParseCitation(const std::vector<StringRef>& columns);
233 
/// \brief Get the struct refs stored in this reader.
///
/// \return const reference to struct_refs_; no copy is made
234  const MMCifInfoStructRefs& GetStructRefs() const { return struct_refs_; }
243  String ConvertSEQRES(const String& seqres, conop::CompoundLibBasePtr compound_lib);
247  void ParseCitationAuthor(const std::vector<StringRef>& columns);
248 
250  void ParseStructRef(const std::vector<StringRef>& columns);
251 
253  void ParseStructRefSeq(const std::vector<StringRef>& columns);
254 
256  void ParseStructRefSeqDif(const std::vector<StringRef>& columns);
260  void ParseExptl(const std::vector<StringRef>& columns);
261 
265  void ParseRefine(const std::vector<StringRef>& columns);
266 
270  void ParsePdbxStructAssembly(const std::vector<StringRef>& columns);
271 
275  void ParsePdbxStructAssemblyGen(const std::vector<StringRef>& columns);
276 
277  std::vector<std::vector<String> > UnPackOperExperession(StringRef expression);
278 
279  void StoreExpression(const char* l, const char* s,
280  bool& is_range, int lborder,
281  std::vector<String>& single_block);
282 
283  void StoreRange(const char*& l, const char* s, bool& is_range, int& lborder,
284  std::vector<String>& single_block);
285 
289  void ParsePdbxStructOperList(const std::vector<StringRef>& columns);
290 
294  void ParseDatabasePDBRev(const std::vector<StringRef>& columns);
295 
299  void ParsePdbxAuditRevisionHistory(const std::vector<StringRef>& columns);
300 
304  void ParsePdbxAuditRevisionDetails(const std::vector<StringRef>& columns);
305 
309  void ParsePdbxDatabaseStatus(const std::vector<StringRef>& columns);
310 
314  void ParseStruct(const std::vector<StringRef>& columns);
315 
319  void ParseStructConf(const std::vector<StringRef>& columns);
320 
324  void ParseStructSheetRange(const std::vector<StringRef>& columns);
325 
329  void ParsePdbxDatabasePdbObsSpr(const std::vector<StringRef>& columns);
330 
334  void ParsePdbxEntityBranch(const std::vector<StringRef>& columns);
335 
339  void ParsePdbxEntityBranchLink(const std::vector<StringRef>& columns);
340 
342  typedef enum {
346  MMCIF_COIL
347  } MMCifSecStructElement;
348 
352  MMCifSecStructElement DetermineSecStructType(const StringRef& type) const;
353 
357  void AssignSecStructure(mol::EntityHandle ent);
358 
359 private:
// Sizing constant: indices_ is declared with MAX_ITEMS_IN_ROW entries.
// The largest item enum visible in this class, AtomSiteItems, has exactly
// 19 enumerators. If the item enums are what gets passed to TryStoreIdx() as
// 'mapping' (looks that way - TODO confirm in the implementation), growing
// any of them beyond 19 entries requires raising this value as well.
361  typedef enum {
362  MAX_ITEMS_IN_ROW=19
363  } MMCifMagicNos;
364 
// Item slots with implicit values 0..18, i.e. 19 enumerators, which is
// exactly MAX_ITEMS_IN_ROW. Presumably the items of the mmCIF 'atom_site'
// category (cf. ATOM_SITE in MMCifCategory) - TODO confirm in the
// implementation.
366  typedef enum {
367  AUTH_ASYM_ID,
368  AS_ID,
369  LABEL_ALT_ID,
370  LABEL_ASYM_ID,
371  LABEL_ATOM_ID,
372  LABEL_COMP_ID,
373  LABEL_ENTITY_ID,
374  LABEL_SEQ_ID,
375  AUTH_SEQ_ID,
376  TYPE_SYMBOL,
377  CARTN_X,
378  CARTN_Y,
379  CARTN_Z,
380  OCCUPANCY,
381  B_ISO_OR_EQUIV,
382  PDBX_PDB_INS_CODE,
383  GROUP_PDB,
384  PDBX_PDB_MODEL_NUM,
385  FORMAL_CHARGE
386  } AtomSiteItems;
387 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'entity' category (cf. ENTITY in MMCifCategory) - TODO confirm
// in the implementation.
389  typedef enum {
390  E_ID,
391  E_TYPE,
392  PDBX_DESCRIPTION
393  } EntityItems;
394 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'entity_poly' category (cf. ENTITY_POLY in MMCifCategory), with
// PDBX_SEQ_ONE_LETTER_CODE_CAN being the canonical variant that
// SetReadCanonicalSeqRes() refers to - TODO confirm in the implementation.
396  typedef enum {
397  ENTITY_ID,
398  EP_TYPE,
399  PDBX_SEQ_ONE_LETTER_CODE,
400  PDBX_SEQ_ONE_LETTER_CODE_CAN
401  } EntityPolyItems;
402 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'citation' category (cf. CITATION in MMCifCategory) - TODO
// confirm in the implementation.
404  typedef enum {
405  CITATION_ID,
406  ABSTRACT_ID_CAS,
407  BOOK_ID_ISBN,
408  BOOK_TITLE,
409  BOOK_PUBLISHER,
410  BOOK_PUBLISHER_CITY,
411  JOURNAL_ABBREV,
412  JOURNAL_VOLUME,
413  PAGE_FIRST,
414  PAGE_LAST,
415  PDBX_DATABASE_ID_DOI,
416  PDBX_DATABASE_ID_PUBMED,
417  YEAR,
418  TITLE
419  } CitationItems;
420 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'citation_author' category (cf. CITATION_AUTHOR in
// MMCifCategory) - TODO confirm in the implementation.
422  typedef enum {
423  AUTHOR_CITATION_ID,
424  AUTHOR_NAME,
425  ORDINAL
426  } CitationAuthorItems;
427 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'exptl' category (cf. EXPTL in MMCifCategory) - TODO confirm in
// the implementation.
429  typedef enum {
430  EXPTL_ENTRY_ID,
431  METHOD
432  } ExptlItems;
433 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'refine' category (cf. REFINE in MMCifCategory) - TODO confirm
// in the implementation.
435  typedef enum {
436  REFINE_ENTRY_ID,
437  LS_D_RES_HIGH,
438  LS_D_RES_LOW,
439  LS_R_FACTOR_R_WORK,
440  LS_R_FACTOR_R_FREE
441  } RefineItems;
442 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'pdbx_struct_assembly' category (cf. PDBX_STRUCT_ASSEMBLY in
// MMCifCategory) - TODO confirm in the implementation.
444  typedef enum {
445  PSA_DETAILS,
446  PSA_ID,
447  METHOD_DETAILS
448  } PdbxStructAssemblyItems;
449 
// Item slots with implicit values starting at 0; the SR_ prefix keeps the
// names apart from the other item enums of this class.
450  // \enum items of the struct_ref category
451  typedef enum {
452  SR_ENTITY_ID,
453  SR_ID,
454  SR_DB_CODE,
455  SR_DB_NAME,
456  SR_DB_ACCESS
457  } StructRefItems;
458 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'struct_ref_seq' category (cf. STRUCT_REF_SEQ in MMCifCategory)
// - TODO confirm in the implementation.
460  typedef enum {
461  SRS_ALIGN_ID,
462  SRS_STRUCT_REF_ID,
463  SRS_PDBX_STRAND_ID,
464  SRS_DB_ALIGN_BEG,
465  SRS_DB_ALIGN_END,
466  SRS_ENT_ALIGN_BEG,
467  SRS_ENT_ALIGN_END
468  } StructRefSeqItems;
469 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'struct_ref_seq_dif' category (cf. STRUCT_REF_SEQ_DIF in
// MMCifCategory) - TODO confirm in the implementation.
471  typedef enum {
472  SRSD_ALIGN_ID,
473  SRSD_SEQ_RNUM,
474  SRSD_DB_RNUM,
475  SRSD_DETAILS
476  } StructRefSeqDifItems;
477 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'pdbx_struct_assembly_gen' category (cf.
// PDBX_STRUCT_ASSEMBLY_GEN in MMCifCategory) - TODO confirm in the
// implementation.
479  typedef enum {
480  ASSEMBLY_ID,
481  ASYM_ID_LIST,
482  OPER_EXPRESSION
483  } PdbxStructAssemblyGenItems;
484 
// Item slots with implicit values starting at 0: an id, a type, three
// VECTOR_* entries and nine MATRIX_*_* entries. Presumably the items of the
// mmCIF 'pdbx_struct_oper_list' category (cf. PDBX_STRUCT_OPER_LIST in
// MMCifCategory) - TODO confirm in the implementation.
486  typedef enum {
487  PSOL_ID,
488  PSOL_TYPE,
489  VECTOR_1,
490  VECTOR_2,
491  VECTOR_3,
492  MATRIX_1_1,
493  MATRIX_1_2,
494  MATRIX_1_3,
495  MATRIX_2_1,
496  MATRIX_2_2,
497  MATRIX_2_3,
498  MATRIX_3_1,
499  MATRIX_3_2,
500  MATRIX_3_3
501  } PdbxStructOperListItems;
502 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'struct' category (cf. STRUCT in MMCifCategory) - TODO confirm
// in the implementation.
504  typedef enum {
505  STRUCT_ENTRY_ID,
506  PDBX_CASP_FLAG,
507  PDBX_DESCRIPTOR,
508  PDBX_FORMULA_WEIGHT,
509  PDBX_FORMULA_WEIGHT_METHOD,
510  PDBX_MODEL_DETAILS,
511  PDBX_MODEL_TYPE_DETAILS,
512  STRUCT_TITLE
513  } StructItems;
514 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'struct_conf' category (cf. STRUCT_CONF in MMCifCategory) -
// TODO confirm in the implementation.
516  typedef enum {
517  SC_BEG_AUTH_ASYM_ID,
518  SC_BEG_LABEL_ASYM_ID,
519  SC_BEG_LABEL_COMP_ID,
520  SC_BEG_LABEL_SEQ_ID,
521  SC_CONF_TYPE_ID,
522  SC_END_AUTH_ASYM_ID,
523  SC_END_LABEL_ASYM_ID,
524  SC_END_LABEL_COMP_ID,
525  SC_END_LABEL_SEQ_ID,
526  SC_ID,
527  } StructConfItems;
528 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'struct_sheet_range' category (cf. STRUCT_SHEET_RANGE in
// MMCifCategory) - TODO confirm in the implementation.
530  typedef enum {
531  SSR_BEG_LABEL_ASYM_ID,
532  SSR_BEG_LABEL_COMP_ID,
533  SSR_BEG_LABEL_SEQ_ID,
534  SSR_END_LABEL_ASYM_ID,
535  SSR_END_LABEL_COMP_ID,
536  SSR_END_LABEL_SEQ_ID,
537  SSR_SHEET_ID,
538  SSR_ID,
539  SSR_BEG_AUTH_ASYM_ID,
540  SSR_END_AUTH_ASYM_ID,
541  } StructSheetRangeItems;
542 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'pdbx_database_PDB_obs_spr' category (cf.
// PDBX_DATABASE_PDB_OBS_SPR in MMCifCategory) - TODO confirm in the
// implementation. Unlike its siblings the type name has no 'Items' suffix.
544  typedef enum {
545  DATE,
546  PDPOS_ID,
547  PDB_ID,
548  REPLACE_PDB_ID,
549  } PdbxDatabasePDBObsSpr;
550 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'database_PDB_rev' category (cf. DATABASE_PDB_REV in
// MMCifCategory) - TODO confirm in the implementation.
552  typedef enum {
553  DPI_NUM,
554  DPI_DATE,
555  DPI_DATE_ORIGINAL,
556  DPI_STATUS,
557  } DatabasePDBRevItems;
558 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'pdbx_audit_revision_history' category (cf.
// PDBX_AUDIT_REVISION_HISTORY in MMCifCategory) - TODO confirm in the
// implementation.
560  typedef enum {
561  PARH_ORDINAL,
562  PARH_REVISION_DATE,
563  PARH_MAJOR,
564  PARH_MINOR,
565  } PdbxAuditRevisionHistoryItems;
566 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'pdbx_audit_revision_details' category (cf.
// PDBX_AUDIT_REVISION_DETAILS in MMCifCategory) - TODO confirm in the
// implementation.
568  typedef enum {
569  PARD_REVISION_ORDINAL,
570  PARD_TYPE,
571  } PdbxAuditRevisionDetailsItems;
572 
// Single item slot (value 0). Presumably belongs to the mmCIF
// 'pdbx_database_status' category (cf. PDBX_DATABASE_STATUS in
// MMCifCategory) - TODO confirm in the implementation.
574  typedef enum {
575  PDS_RECVD_INITIAL_DEPOSITION_DATE,
576  } PdbxDatabaseStatusItems;
577 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'pdbx_entity_branch' category (cf. PDBX_ENTITY_BRANCH in
// MMCifCategory) - TODO confirm in the implementation.
579  typedef enum {
580  BR_ENTITY_ID,
581  BR_ENTITY_TYPE
582  } EntityBranchItems;
583 
// Item slots with implicit values starting at 0. Presumably the items of
// the mmCIF 'pdbx_entity_branch_link' category (cf. PDBX_ENTITY_BRANCH_LINK
// in MMCifCategory) - TODO confirm in the implementation.
585  typedef enum {
586  BL_ENTITY_ID,
587  BL_ATOM_ID_1,
588  BL_ATOM_ID_2,
589  BL_COMP_ID_1,
590  BL_COMP_ID_2,
591  BL_ENTITY_BRANCH_LIST_NUM_1,
592  BL_ENTITY_BRANCH_LIST_NUM_2,
593  BL_ATOM_STEREO_CONFIG_1,
594  BL_ATOM_STEREO_CONFIG_2,
595  BL_VALUE_ORDER
596  } EntityBranchLinkItems;
597 
// Identifiers for the categories this reader distinguishes; the current one
// is kept in category_.
// DONT_KNOW has to remain the last enumerator: category_counts_ is declared
// with DONT_KNOW+1 entries, so anything added after it would fall outside
// that array.
599  typedef enum {
600  ATOM_SITE,
601  ENTITY,
602  ENTITY_POLY,
603  CITATION,
604  CITATION_AUTHOR,
605  EXPTL,
606  REFINE,
607  PDBX_STRUCT_ASSEMBLY,
608  PDBX_STRUCT_ASSEMBLY_GEN,
609  PDBX_STRUCT_OPER_LIST,
610  STRUCT,
611  STRUCT_CONF,
612  STRUCT_SHEET_RANGE,
613  PDBX_DATABASE_PDB_OBS_SPR,
614  STRUCT_REF,
615  STRUCT_REF_SEQ,
616  STRUCT_REF_SEQ_DIF,
617  DATABASE_PDB_REV,
618  PDBX_AUDIT_REVISION_HISTORY,
619  PDBX_AUDIT_REVISION_DETAILS,
620  PDBX_DATABASE_STATUS,
621  PDBX_ENTITY_BRANCH,
622  PDBX_ENTITY_BRANCH_LINK,
623  DONT_KNOW
624  } MMCifCategory;
625 
// Per-entity record: a chain type plus two strings (details, seqres).
// Records are held in a String-keyed map (MMCifEntityDescMap, member
// entity_desc_map_); the key is presumably the entity id, as suggested by
// GetEntityDescMapIterator(entity_id) - TODO confirm in the implementation.
627  typedef struct {
628  mol::ChainType type;
629  String details;
630  String seqres;
631  } MMCifEntityDesc;
632  typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap;
633 
637  MMCifEntityDescMap::iterator GetEntityDescMapIterator(const String& entity_id);
638 
// Record for one biounit assembly: an id string, a list of chain names and
// a nested list of strings in 'operations' (same type as the return value
// of UnPackOperExperession()). Records are collected in bu_assemblies_.
640  typedef struct {
641  String biounit_id;
642  std::vector<String> chains;
643  std::vector<std::vector<String> > operations;
645  } MMCifBioUAssembly;
647  typedef std::vector<MMCifBioUAssembly> MMCifBioUAssemblyVector;
648 
649  typedef std::map<String, std::pair<std::vector<int>, std::vector<String> > >
650  MMCifCitationAuthorMap;
651 
// Residue range (start/end residue number) within a named chain. The same
// record type backs both helix_list_ and strand_list_.
653  typedef struct {
654  mol::ResNum start;
655  mol::ResNum end;
656  String chain_name;
657  } MMCifHSEntry;
658  typedef std::vector<MMCifHSEntry> MMCifHSVector;
659 
// Two strings, details and method_details; the field names line up with
// PSA_DETAILS and METHOD_DETAILS in PdbxStructAssemblyItems. Entries live in
// a String-keyed map (MMCifPSAMap, member bu_origin_map_); the key is
// presumably the assembly id (PSA_ID) - TODO confirm in the implementation.
661  typedef struct {
662  String details;
663  String method_details;
664  } MMCifPSAEntry;
665  typedef std::map<String, MMCifPSAEntry> MMCifPSAMap;
666 
// One revision record: a number, a date string and major/minor version
// numbers. Records are collected in the revisions_ vector.
668  struct MMCifRevisionDesc {
669  // GCC note: major() & minor() exist as macros there
     // Only 'date' goes through the initializer list; num, major and minor are
     // assigned in the body. Presumably this is deliberate: written as
     // initializers, 'major(_major)' / 'minor(_minor)' would have the shape of
     // a call to those macros - TODO confirm before converting this
     // constructor to a plain initializer list.
670  MMCifRevisionDesc(int _num, const String& _date, int _major, int _minor)
671  : date(_date) {
672  num = _num;
673  major = _major;
674  minor = _minor;
675  }
676  int num;
677  String date;
678  int major;
679  int minor;
680  };
681 
// One link between two atoms: for each end (suffix _1 / _2) an integer
// residue number, a component name and an atom name, plus the bond order as
// unsigned char (the same type MMCifValueOrderToOSTBondOrder() returns).
// Links are grouped per String key in MMCifPdbxEntityBranchLinkMap (member
// entity_branch_link_map_); the key is presumably the entity id
// (BL_ENTITY_ID) - TODO confirm in the implementation.
683  typedef struct {
684  int res_num_1;
685  String cmp_1;
686  String atm_nm_1;
687  int res_num_2;
688  String cmp_2;
689  String atm_nm_2;
690  unsigned char bond_order;
691  } MMCifPdbxEntityBranchLink;
692  typedef std::map<String, std::vector<MMCifPdbxEntityBranchLink> >
693  MMCifPdbxEntityBranchLinkMap;
694 
695  // members
696  MMCifCategory category_;
697  int category_counts_[DONT_KNOW+1];
698  int indices_[MAX_ITEMS_IN_ROW];
699  const IOProfile& profile_;
700  mol::EntityHandle& ent_handle_;
701  String restrict_chains_;
702  bool auth_chain_id_;
703  bool seqres_can_;
704  mol::ChainHandle curr_chain_;
705  mol::ResidueHandle curr_residue_;
706  int chain_count_;
707  int residue_count_;
708  int atom_count_;
709  bool warned_name_mismatch_;
710  bool warned_rule_based_;
711  String subst_res_id_;
712  bool has_model_;
713  int curr_model_;
714  std::vector<std::pair<mol::ChainHandle, String> > chain_id_pairs_;
716  MMCifEntityDescMap entity_desc_map_;
717  seq::SequenceList seqres_;
718  bool read_seqres_;
719  MMCifInfo info_;
720  MMCifCitationAuthorMap authors_map_;
721  MMCifBioUAssemblyVector bu_assemblies_;
722  MMCifPSAMap bu_origin_map_;
723  MMCifHSVector helix_list_;
724  MMCifHSVector strand_list_;
725  MMCifInfoStructRefs struct_refs_;
726  // for storing revisions
727  std::vector<MMCifRevisionDesc> revisions_;
728  std::map<int, String> revision_types_;
729  bool database_PDB_rev_added_;
730  // for entity_branch connections
731  MMCifPdbxEntityBranchLinkMap entity_branch_link_map_;
732 };
733 
739  const StringRef value_order);
740 
745  const unsigned char bond_order);
746 }}
747 
748 #endif
749 
750 // LocalWords: MMCifEntityDescMap
convenient datatype for referencing character data
Definition: string_ref.hh:39
void SetAuthChainID(bool id)
Enable or disable reading of auth_chain_id instead of label_chain_id (default)
container class for additional information from MMCif files
Definition: mmcif_info.hh:975
boost::shared_ptr< CompoundLibBase > CompoundLibBasePtr
std::string String
Definition: base.hh:54
const MMCifInfoStructRefs & GetStructRefs() const
seq::SequenceList GetSeqRes() const
Return sequences.
const String & GetRestrictChains() const
Protein or molecule.
pointer_it< T > end(const std::vector< T > &values)
parser for the STAR file format
Definition: star_parser.hh:114
DLLEXPORT_OST_IO unsigned char MMCifValueOrderToOSTBondOrder(const StringRef value_order)
Translate mmCIF info on bond type (e.g. pdbx_entity_branch_link.value_order) to OST bond_order...
tuple compound_lib
Definition: init.py:184
std::vector< MMCifInfoStructRefPtr > MMCifInfoStructRefs
Definition: mmcif_info.hh:843
DLLEXPORT_OST_IO String OSTBondOrderToMMCifValueOrder(const unsigned char bond_order)
Translate an OST bond_order to mmCIF value_order.
std::vector< SequenceImplPtr > SequenceList
void SetReadSeqRes(bool flag)
Toggle reading of SEQRES.
reader for the mmcif file format
Definition: mmcif_reader.hh:61
const MMCifInfo & GetInfo()
Get additional information of the mmCIF file.
#define DLLEXPORT_OST_IO
list of sequences.
void TryStoreIdx(const int mapping, const String &item, const StarLoopDesc &header)
Store an item index from loop header in preparation for reading a row. Throws an exception if the item is not found in the loop header.
const String & GetCategory() const
Definition: star_parser.hh:92
int GetIndex(const String &name) const
Definition: star_parser.hh:65
void SetReadCanonicalSeqRes(bool flag)
Toggle reading of canonical sequence residues (entity_poly.pdbx_seq_one_letter_code_can instead of entity_poly.pdbx_seq_one_letter_code).
Definition: mmcif_reader.hh:97
bool GetReadSeqRes() const
Check if reading of SEQRES is enabled.