OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
mmcif_reader.hh
Go to the documentation of this file.
1 //------------------------------------------------------------------------------
2 // This file is part of the OpenStructure project <www.openstructure.org>
3 //
4 // Copyright (C) 2008-2011 by the OpenStructure authors
5 //
6 // This library is free software; you can redistribute it and/or modify it under
7 // the terms of the GNU Lesser General Public License as published by the Free
8 // Software Foundation; either version 3.0 of the License, or (at your option)
9 // any later version.
10 // This library is distributed in the hope that it will be useful, but WITHOUT
11 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Lesser General Public License
16 // along with this library; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 //------------------------------------------------------------------------------
19 #ifndef OST_MMCIF_READER_HH
20 #define OST_MMCIF_READER_HH
21 
22 #include <map>
23 
24 #include <ost/geom/geom.hh>
25 #include <ost/seq/sequence_list.hh>
27 #include <ost/mol/chain_type.hh>
29 #include <ost/io/mol/io_profile.hh>
30 #include <ost/io/io_exception.hh>
32 #include <ost/io/mol/mmcif_info.hh>
33 
34 namespace ost { namespace io {
35 
60 public:
66 
/// \brief Create a reader working on an already opened input stream.
///
/// NOTE(review): the class keeps `profile_` and `ent_handle_` as reference
/// members, so \p profile and \p ent_handle presumably have to outlive the
/// reader - confirm against the constructor definition.
67  MMCifReader(std::istream& stream, mol::EntityHandle& ent_handle,
68  const IOProfile& profile);
69 
/// \brief Create a reader for the file called \p filename; the remaining
///        parameters are the same as for the stream based constructor.
75  MMCifReader(const String& filename, mol::EntityHandle& ent_handle,
76  const IOProfile& profile);
77 
/// \brief Initialisation routine; defined outside of this header.
80  void Init();
81 
/// \brief Reset routine for the reader state; defined outside of this header.
83  void ClearState();
84 
/// \brief Set the chain restriction string.
///
/// NOTE(review): presumably stored in `restrict_chains_`, which is what
/// GetRestrictChains() returns - the definition is not visible here.
88  void SetRestrictChains(const String& restrict_chains);
89 
/// \brief Toggle reading of canonical sequence residues
///        (entity_poly.pdbx_seq_one_letter_code_can).
///
/// \param flag new value of seqres_can_
95  void SetReadCanonicalSeqRes(bool flag) { this->seqres_can_ = flag; }
99 
/// \brief Read access to the chain restriction string.
///
/// \return reference to restrict_chains_, no copy is made
100  const String& GetRestrictChains() const { return this->restrict_chains_; }
104 
/// \brief Enable or disable reading of auth_chain_id instead of the label
///        chain id (default).
///
/// \param id new value of auth_chain_id_
109  void SetAuthChainID(bool id) { this->auth_chain_id_ = id; }
113 
// Parser callbacks.
// NOTE(review): the class head is not part of this listing; these are
// presumably overrides of the STAR file parser interface - confirm.

/// \brief Called with the name of a data block.
/// NOTE(review): meaning of the bool result is not visible in this header.
120  virtual bool OnBeginData(const StringRef& data_name);
121 
/// \brief Called with the header (category and items) of a loop.
/// NOTE(review): meaning of the bool result is not visible in this header.
127  virtual bool OnBeginLoop(const StarLoopDesc& header); // tested
128 
/// \brief Called with the loop header and the columns of a single data row.
133  virtual void OnDataRow(const StarLoopDesc& header,
134  const std::vector<StringRef>& columns);
135 
/// \brief Counterpart of OnBeginData(); takes no arguments.
137  virtual void OnEndData();
138 
143  return seqres_;
144  }
145 
/// \brief Toggle reading of SEQRES.
///
/// \param flag new value of read_seqres_
149  void SetReadSeqRes(bool flag) { this->read_seqres_ = flag; }
153 
/// \brief Check if reading of SEQRES is enabled.
///
/// \return current value of read_seqres_
157  bool GetReadSeqRes() const { return this->read_seqres_; }
161 
/// \brief Get additional information of the mmCIF file.
///
/// Const-qualified: the accessor only hands out a read-only reference and
/// does not modify the reader, so it is also callable on a const reader.
///
/// \return read-only reference to info_
165  const MMCifInfo& GetInfo() const { return info_; }
166 
167 protected:
/// \brief Store an item index from the loop header in preparation for
///        reading a row.
///
/// \param mapping slot of indices_ that receives the index
/// \param item    name of the item looked up in \p header
/// \param header  loop description queried through GetIndex()
///
/// \throws IOException if GetIndex() reports -1 for \p item
174  void TryStoreIdx(const int mapping,
175  const String& item,
176  const StarLoopDesc& header)
177  {
178  const int item_idx = header.GetIndex(item);
// stored before the check, a failed lookup therefore leaves -1 in the slot
179  indices_[mapping] = item_idx;
180  if (item_idx != -1) {
181  return;
182  }
183  const String what = "No item '" + item + "' found in '" +
184  header.GetCategory() + "' header";
185  throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR, what,
186  this->GetCurrentLinenum()));
187  } // tested
189 
/// \brief Extract the identifying fields of an atom from one data row.
///
/// \param columns fields of the current data row
///
/// NOTE(review): every parameter after \p columns is a non-const reference,
/// i.e. presumably an output filled from the row. The meaning of the bool
/// result is not visible in this header - confirm both in the .cc file.
203  bool ParseAtomIdent(const std::vector<StringRef>& columns,
204  String& auth_chain_name,
205  String& cif_chain_name,
206  StringRef& res_name,
207  mol::ResNum& resnum,
208  bool& valid_res_num,
209  StringRef& atom_name,
210  char& alt_loc);
211 
// Row handlers, each receives the fields of one data row.
// NOTE(review): the category a handler belongs to is inferred from its name
// only (atom_site, entity, entity_poly, citation) - confirm in the .cc file.
215  void ParseAndAddAtom(const std::vector<StringRef>& columns);
216 
220  void ParseEntity(const std::vector<StringRef>& columns);
221 
225  void ParseEntityPoly(const std::vector<StringRef>& columns);
226 
230  void ParseCitation(const std::vector<StringRef>& columns);
231 
/// \brief Read access to the struct_ref entries held by the reader.
///
/// \return reference to struct_refs_, no copy is made
232  const MMCifInfoStructRefs& GetStructRefs() const { return this->struct_refs_; }
/// \brief Convert a SEQRES string with the help of a compound library.
///
/// \param seqres       input sequence string
/// \param compound_lib compound library pointer, passed by value
/// \return the converted string
/// NOTE(review): the exact conversion is defined in the .cc file.
241  String ConvertSEQRES(const String& seqres, conop::CompoundLibPtr compound_lib);
// Row handlers, each receives the fields of one data row.
// NOTE(review): the category a handler belongs to is inferred from its name
// only - confirm in the .cc file.
245  void ParseCitationAuthor(const std::vector<StringRef>& columns);
246 
248  void ParseStructRef(const std::vector<StringRef>& columns);
249 
251  void ParseStructRefSeq(const std::vector<StringRef>& columns);
252 
254  void ParseStructRefSeqDif(const std::vector<StringRef>& columns);
258  void ParseExptl(const std::vector<StringRef>& columns);
259 
263  void ParseRefine(const std::vector<StringRef>& columns);
264 
268  void ParsePdbxStructAssembly(const std::vector<StringRef>& columns);
269 
273  void ParsePdbxStructAssemblyGen(const std::vector<StringRef>& columns);
274 
// Helpers around operator expressions.
// NOTE(review): presumably used for the oper_expression item of
// pdbx_struct_assembly_gen (see OPER_EXPRESSION) - confirm in the .cc file.

/// \brief Turn an expression into a list of string lists.
/// (The spelling "Experession" is part of the existing interface.)
275  std::vector<std::vector<String> > UnPackOperExperession(StringRef expression);
276 
/// \brief Helper taking two character pointers, a range flag (by reference),
///        a left border (by value) and the block to work on.
277  void StoreExpression(const char* l, const char* s,
278  bool& is_range, int lborder,
279  std::vector<String>& single_block);
280 
/// \brief Like StoreExpression(), but \p l and \p lborder are passed by
///        reference and may therefore be modified.
281  void StoreRange(const char*& l, const char* s, bool& is_range, int& lborder,
282  std::vector<String>& single_block);
283 
// Row handlers, each receives the fields of one data row.
// NOTE(review): the category a handler belongs to is inferred from its name
// only - confirm in the .cc file.
287  void ParsePdbxStructOperList(const std::vector<StringRef>& columns);
288 
292  void ParseDatabasePDBRev(const std::vector<StringRef>& columns);
293 
297  void ParsePdbxAuditRevisionHistory(const std::vector<StringRef>& columns);
298 
302  void ParsePdbxAuditRevisionDetails(const std::vector<StringRef>& columns);
303 
307  void ParsePdbxDatabaseStatus(const std::vector<StringRef>& columns);
308 
312  void ParseStruct(const std::vector<StringRef>& columns);
313 
317  void ParseStructConf(const std::vector<StringRef>& columns);
318 
322  void ParseStructSheetRange(const std::vector<StringRef>& columns);
323 
327  void ParsePdbxDatabasePdbObsSpr(const std::vector<StringRef>& columns);
328 
/// \enum kinds of secondary structure elements, returned by
///       DetermineSecStructType()
// NOTE(review): only MMCIF_TURN is visible; listing lines 331-332 are absent
// from this view, so further enumerators may exist in the real header.
330  typedef enum {
333  MMCIF_TURN
334  } MMCifSecStructElement;
335 
/// \brief Map a type string onto a MMCifSecStructElement value.
339  MMCifSecStructElement DetermineSecStructType(const StringRef& type) const;
340 
/// \brief Assign secondary structure to \p ent (handle passed by value).
/// NOTE(review): presumably based on helix_list_ / strand_list_ - confirm.
344  void AssignSecStructure(mol::EntityHandle ent);
345 
346 private:
/// \enum magic numbers of this class
// MAX_ITEMS_IN_ROW is the size of indices_. The largest item enum in this
// header, AtomSiteItems, has exactly 18 enumerators.
348  typedef enum {
349  MAX_ITEMS_IN_ROW=18
350  } MMCifMagicNos;
351 
/// \enum items of the atom_site category
// 18 enumerators (values 0-17), which matches MAX_ITEMS_IN_ROW.
// NOTE(review): category name inferred from the identifier; the enumerators
// are presumably the `mapping` slots handed to TryStoreIdx() - confirm.
353  typedef enum {
354  AUTH_ASYM_ID,
355  AS_ID,
356  LABEL_ALT_ID,
357  LABEL_ASYM_ID,
358  LABEL_ATOM_ID,
359  LABEL_COMP_ID,
360  LABEL_ENTITY_ID,
361  LABEL_SEQ_ID,
362  AUTH_SEQ_ID,
363  TYPE_SYMBOL,
364  CARTN_X,
365  CARTN_Y,
366  CARTN_Z,
367  OCCUPANCY,
368  B_ISO_OR_EQUIV,
369  PDBX_PDB_INS_CODE,
370  GROUP_PDB,
371  PDBX_PDB_MODEL_NUM
372  } AtomSiteItems;
373 
/// \enum items of the entity category
// NOTE(review): category name inferred from the identifier - confirm.
375  typedef enum {
376  E_ID,
377  E_TYPE,
378  PDBX_DESCRIPTION
379  } EntityItems;
380 
/// \enum items of the entity_poly category
// NOTE(review): category name inferred from the identifier - confirm.
382  typedef enum {
383  ENTITY_ID,
384  EP_TYPE,
385  PDBX_SEQ_ONE_LETTER_CODE,
386  PDBX_SEQ_ONE_LETTER_CODE_CAN
387  } EntityPolyItems;
388 
/// \enum items of the citation category
// NOTE(review): category name inferred from the identifier - confirm.
390  typedef enum {
391  CITATION_ID,
392  ABSTRACT_ID_CAS,
393  BOOK_ID_ISBN,
394  BOOK_TITLE,
395  BOOK_PUBLISHER,
396  BOOK_PUBLISHER_CITY,
397  JOURNAL_ABBREV,
398  JOURNAL_VOLUME,
399  PAGE_FIRST,
400  PAGE_LAST,
401  PDBX_DATABASE_ID_DOI,
402  PDBX_DATABASE_ID_PUBMED,
403  YEAR,
404  TITLE
405  } CitationItems;
406 
/// \enum items of the citation_author category
// NOTE(review): category name inferred from the identifier - confirm.
408  typedef enum {
409  AUTHOR_CITATION_ID,
410  AUTHOR_NAME,
411  ORDINAL
412  } CitationAuthorItems;
413 
/// \enum items of the exptl category
// NOTE(review): category name inferred from the identifier - confirm.
415  typedef enum {
416  EXPTL_ENTRY_ID,
417  METHOD
418  } ExptlItems;
419 
/// \enum items of the refine category
// NOTE(review): category name inferred from the identifier - confirm.
421  typedef enum {
422  REFINE_ENTRY_ID,
423  LS_D_RES_HIGH,
424  LS_D_RES_LOW,
425  LS_R_FACTOR_R_WORK,
426  LS_R_FACTOR_R_FREE
427  } RefineItems;
428 
/// \enum items of the pdbx_struct_assembly category
// NOTE(review): category name inferred from the identifier - confirm.
430  typedef enum {
431  PSA_DETAILS,
432  PSA_ID,
433  METHOD_DETAILS
434  } PdbxStructAssemblyItems;
435 
436  /// \enum items of the struct_ref category
437  typedef enum {
438  SR_ENTITY_ID,
439  SR_ID,
440  SR_DB_CODE,
441  SR_DB_NAME,
442  SR_DB_ACCESS
443  } StructRefItems;
444 
/// \enum items of the struct_ref_seq category
// NOTE(review): category name inferred from the identifier - confirm.
446  typedef enum {
447  SRS_ALIGN_ID,
448  SRS_STRUCT_REF_ID,
449  SRS_PDBX_STRAND_ID,
450  SRS_DB_ALIGN_BEG,
451  SRS_DB_ALIGN_END,
452  SRS_ENT_ALIGN_BEG,
453  SRS_ENT_ALIGN_END
454  } StructRefSeqItems;
455 
/// \enum items of the struct_ref_seq_dif category
// NOTE(review): category name inferred from the identifier - confirm.
457  typedef enum {
458  SRSD_ALIGN_ID,
459  SRSD_SEQ_RNUM,
460  SRSD_DB_RNUM,
461  SRSD_DETAILS
462  } StructRefSeqDifItems;
463 
/// \enum items of the pdbx_struct_assembly_gen category
// NOTE(review): category name inferred from the identifier - confirm.
465  typedef enum {
466  ASSEMBLY_ID,
467  ASYM_ID_LIST,
468  OPER_EXPRESSION
469  } PdbxStructAssemblyGenItems;
470 
/// \enum items of the pdbx_struct_oper_list category
// Besides id and type: three vector components and the nine entries of a
// 3x3 matrix, listed row by row (MATRIX_<row>_<column>).
// NOTE(review): category name inferred from the identifier - confirm.
472  typedef enum {
473  PSOL_ID,
474  PSOL_TYPE,
475  VECTOR_1,
476  VECTOR_2,
477  VECTOR_3,
478  MATRIX_1_1,
479  MATRIX_1_2,
480  MATRIX_1_3,
481  MATRIX_2_1,
482  MATRIX_2_2,
483  MATRIX_2_3,
484  MATRIX_3_1,
485  MATRIX_3_2,
486  MATRIX_3_3
487  } PdbxStructOperListItems;
488 
/// \enum items of the struct category
// NOTE(review): category name inferred from the identifier - confirm.
490  typedef enum {
491  STRUCT_ENTRY_ID,
492  PDBX_CASP_FLAG,
493  PDBX_DESCRIPTOR,
494  PDBX_FORMULA_WEIGHT,
495  PDBX_FORMULA_WEIGHT_METHOD,
496  PDBX_MODEL_DETAILS,
497  PDBX_MODEL_TYPE_DETAILS,
498  STRUCT_TITLE
499  } StructItems;
500 
/// \enum items of the struct_conf category
// NOTE(review): category name inferred from the identifier - confirm.
// NOTE(review): unlike most enums in this header the enumerator list ends
// with a trailing comma, which pre-C++11 compilers only accept as an
// extension.
502  typedef enum {
503  SC_BEG_AUTH_ASYM_ID,
504  SC_BEG_LABEL_ASYM_ID,
505  SC_BEG_LABEL_COMP_ID,
506  SC_BEG_LABEL_SEQ_ID,
507  SC_CONF_TYPE_ID,
508  SC_END_AUTH_ASYM_ID,
509  SC_END_LABEL_ASYM_ID,
510  SC_END_LABEL_COMP_ID,
511  SC_END_LABEL_SEQ_ID,
512  SC_ID,
513  } StructConfItems;
514 
/// \enum items of the struct_sheet_range category
// NOTE(review): category name inferred from the identifier - confirm.
516  typedef enum {
517  SSR_BEG_LABEL_ASYM_ID,
518  SSR_BEG_LABEL_COMP_ID,
519  SSR_BEG_LABEL_SEQ_ID,
520  SSR_END_LABEL_ASYM_ID,
521  SSR_END_LABEL_COMP_ID,
522  SSR_END_LABEL_SEQ_ID,
523  SSR_SHEET_ID,
524  SSR_ID,
525  SSR_BEG_AUTH_ASYM_ID,
526  SSR_END_AUTH_ASYM_ID,
527  } StructSheetRangeItems;
528 
/// \enum items of the pdbx_database_PDB_obs_spr category
// NOTE(review): category name inferred from the identifier - confirm.
530  typedef enum {
531  DATE,
532  PDPOS_ID,
533  PDB_ID,
534  REPLACE_PDB_ID,
535  } PdbxDatabasePDBObsSpr;
536 
/// \enum items of the database_PDB_rev category
// NOTE(review): category name inferred from the identifier - confirm.
538  typedef enum {
539  DPI_NUM,
540  DPI_DATE,
541  DPI_DATE_ORIGINAL,
542  DPI_STATUS,
543  } DatabasePDBRevItems;
544 
/// \enum items of the pdbx_audit_revision_history category
// NOTE(review): category name inferred from the identifier - confirm.
546  typedef enum {
547  PARH_ORDINAL,
548  PARH_REVISION_DATE,
549  PARH_MAJOR,
550  PARH_MINOR,
551  } PdbxAuditRevisionHistoryItems;
552 
/// \enum items of the pdbx_audit_revision_details category
// NOTE(review): category name inferred from the identifier - confirm.
554  typedef enum {
555  PARD_REVISION_ORDINAL,
556  PARD_TYPE,
557  } PdbxAuditRevisionDetailsItems;
558 
/// \enum items of the pdbx_database_status category
// NOTE(review): category name inferred from the identifier - confirm.
560  typedef enum {
561  PDS_RECVD_INITIAL_DEPOSITION_DATE,
562  } PdbxDatabaseStatusItems;
563 
/// \enum categories known to the reader
// DONT_KNOW has to stay the last enumerator: category_counts_ is declared
// with DONT_KNOW+1 elements, i.e. one per enumerator of this enum.
565  typedef enum {
566  ATOM_SITE,
567  ENTITY,
568  ENTITY_POLY,
569  CITATION,
570  CITATION_AUTHOR,
571  EXPTL,
572  REFINE,
573  PDBX_STRUCT_ASSEMBLY,
574  PDBX_STRUCT_ASSEMBLY_GEN,
575  PDBX_STRUCT_OPER_LIST,
576  STRUCT,
577  STRUCT_CONF,
578  STRUCT_SHEET_RANGE,
579  PDBX_DATABASE_PDB_OBS_SPR,
580  STRUCT_REF,
581  STRUCT_REF_SEQ,
582  STRUCT_REF_SEQ_DIF,
583  DATABASE_PDB_REV,
584  PDBX_AUDIT_REVISION_HISTORY,
585  PDBX_AUDIT_REVISION_DETAILS,
586  PDBX_DATABASE_STATUS,
587  DONT_KNOW
588  } MMCifCategory;
589 
/// \struct description of an entity: chain type, details and SEQRES string
591  typedef struct {
592  mol::ChainType type;
593  String details;
594  String seqres;
595  } MMCifEntityDesc;
// Entity descriptions looked up by a String key.
// NOTE(review): the key is presumably the entity id - confirm.
596  typedef std::map<String, MMCifEntityDesc> MMCifEntityDescMap;
597 
/// \struct biounit assembly data: the biounit id, a list of chains and
///         lists of operations
// NOTE(review): listing lines 603 and 605 are absent from this view.
599  typedef struct {
600  String biounit_id;
601  std::vector<String> chains;
602  std::vector<std::vector<String> > operations;
604  } MMCifBioUAssembly;
606  typedef std::vector<MMCifBioUAssembly> MMCifBioUAssemblyVector;
607 
// Maps a String key onto a pair of parallel-looking lists (ints and names).
// NOTE(review): presumably citation id -> (ordinals, author names), compare
// the CitationAuthorItems enum - confirm in the .cc file.
608  typedef std::map<String, std::pair<std::vector<int>, std::vector<String> > >
609  MMCifCitationAuthorMap;
610 
/// \struct residue range (start, end) on a named chain; element type of
///         helix_list_ and strand_list_
612  typedef struct {
613  mol::ResNum start;
614  mol::ResNum end;
615  String chain_name;
616  } MMCifHSEntry;
617  typedef std::vector<MMCifHSEntry> MMCifHSVector;
618 
/// \struct details and method details
// NOTE(review): the field names match the PSA_DETAILS and METHOD_DETAILS
// items of PdbxStructAssemblyItems, so this presumably stores one
// pdbx_struct_assembly row - confirm.
620  typedef struct {
621  String details;
622  String method_details;
623  } MMCifPSAEntry;
// PSA entries looked up by a String key; type of bu_origin_map_.
624  typedef std::map<String, MMCifPSAEntry> MMCifPSAMap;
625 
/// \struct one revision: number, date string and major/minor version
627  struct MMCifRevisionDesc {
628  // major() and minor() may exist as function-like macros (GCC/glibc).
// NOTE(review): this is presumably why only `date` uses the initializer list
// while the other members are assigned in the constructor body: a
// `major(_major)` initializer would be macro-expanded - keep it this way.
629  MMCifRevisionDesc(int _num, const String& _date, int _major, int _minor)
630  : date(_date) {
631  num = _num;
632  major = _major;
633  minor = _minor;
634  }
635  int num;
636  String date;
637  int major;
638  int minor;
639  };
640 
641  // members
642  MMCifCategory category_;
643  int category_counts_[DONT_KNOW+1]; // one slot per MMCifCategory enumerator
644  int indices_[MAX_ITEMS_IN_ROW]; // written by TryStoreIdx()
645  const IOProfile& profile_; // reference member, not owned
646  mol::EntityHandle& ent_handle_; // reference member, not owned
647  String restrict_chains_; // returned by GetRestrictChains()
648  bool auth_chain_id_; // set by SetAuthChainID()
649  bool seqres_can_; // set by SetReadCanonicalSeqRes()
650  mol::ChainHandle curr_chain_;
651  mol::ResidueHandle curr_residue_;
652  int chain_count_;
653  int residue_count_;
654  int atom_count_;
655  bool warned_name_mismatch_;
656  bool warned_rule_based_;
657  String subst_res_id_;
658  bool has_model_;
659  int curr_model_;
660  std::vector<std::pair<mol::ChainHandle, String> > chain_id_pairs_;
662  MMCifEntityDescMap entity_desc_map_;
663  seq::SequenceList seqres_;
664  bool read_seqres_; // set by SetReadSeqRes(), read by GetReadSeqRes()
665  MMCifInfo info_; // returned by GetInfo()
666  MMCifCitationAuthorMap authors_map_;
667  MMCifBioUAssemblyVector bu_assemblies_;
668  MMCifPSAMap bu_origin_map_;
669  MMCifHSVector helix_list_;
670  MMCifHSVector strand_list_;
671  MMCifInfoStructRefs struct_refs_; // returned by GetStructRefs()
672  // for storing revisions
673  std::vector<MMCifRevisionDesc> revisions_;
674  std::map<int, String> revision_types_;
675  bool database_PDB_rev_added_;
676 };
677 
678 }}
679 
680 #endif
convenient datatype for referencing character data
Definition: string_ref.hh:39
void SetAuthChainID(bool id)
Enable or disable reading of auth_chain_id instead of label_chain id (default)
container class for additional information from MMCif files
Definition: mmcif_info.hh:934
std::string String
Definition: base.hh:54
const MMCifInfoStructRefs & GetStructRefs() const
seq::SequenceList GetSeqRes() const
Return sequences.
const String & GetRestrictChains() const
Protein or molecule.
pointer_it< T > end(const std::vector< T > &values)
parser for the STAR file format
Definition: star_parser.hh:114
boost::shared_ptr< CompoundLib > CompoundLibPtr
Definition: compound_lib.hh:31
std::vector< MMCifInfoStructRefPtr > MMCifInfoStructRefs
Definition: mmcif_info.hh:841
std::vector< SequenceImplPtr > SequenceList
void SetReadSeqRes(bool flag)
Toggle reading of SEQRES.
reader for the mmcif file format
Definition: mmcif_reader.hh:59
const MMCifInfo & GetInfo()
Get additional information of the mmCIF file.
#define DLLEXPORT_OST_IO
list of sequences.
void TryStoreIdx(const int mapping, const String &item, const StarLoopDesc &header)
Store an item index from loop header in preparation for reading a row. Throws an exception if the ite...
const String & GetCategory() const
Definition: star_parser.hh:92
int GetIndex(const String &name) const
Definition: star_parser.hh:65
void SetReadCanonicalSeqRes(bool flag)
Toggle reading of canonical sequence residues (entity_poly.pdbx_seq_one_letter_code_can instead of en...
Definition: mmcif_reader.hh:95
bool GetReadSeqRes() const
Check if reading of SEQRES is enabled.