OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
aaindex.py
Go to the documentation of this file.
1 import os
2 from enum import Enum
3 
4 from ost import GetSharedDataPath
5 
6 def _StrToFloat(str_item):
7  """Returns None if *str_item* looks like invalid number, casts to
8  float otherwise"""
9  x = str_item.strip()
10  if x.lower() in ["na", "-", "none"]:
11  return None
12  return float(x)
13 
14 
15 class AnnoType(str, Enum):
16  """Possible types of aaindex entries"""
17  SINGLE = 'SINGLE' #: Single amino acid annotation
18  PAIR = 'PAIR' #: Pairwise amino acid annotation, substitution/pairwise scores
19 
20 
22  """Data object representing an annotation in aaindex, preferably
23  constructed from it's static :func:`Parse` method. The following
24  attributes are available:
25 
26  * key: aaindex accession number (e.g. ANDN920101)
27  * desc: descriptive title
28  * ref: Reference to article if available
29  * authors: Authors of article if available
30  * title: Title of article if available
31  * journal: Journal of article if available
32  * anno_type: Enum (:class:`AnnoType`) specifying whether we're dealing
33  with a single or pairwise amino acid annotation/score.
34  * anno: :class:`dict` with annotation. If *anno_type* is SINGLE,
35  keys are amino acid one letter codes (single character strings).
36  If *anno_type* is PAIR, keys are two one letter codes added
37  together (two character strings). Even when the thing is
38  symmetric, both keys exist. I.e. 'AB' AND 'BA'.
39  Values are of type :class:`float` (None if not available).
40  """
41  def __init__(self):
42  self.key = None
43  self.desc = None
44  self.ref = None
45  self.authors = None
46  self.title = None
47  self.journal = None
48  self.anno_type = AnnoType.SINGLE
49  self.anno = dict()
50 
51  @staticmethod
52  def Parse(data):
53  """Creates :class:`AAIndexData` from data.
54 
55  :param data: Iterable with strings in data format described for aaindex.
56  :returns: :class:`AAIndexData`, if iterable contains several entries,
57  parsing stops at separation sequence ('//'). None is returned
58  if nothing could be parsed.
59  :raises: descriptive error in case of corrupt data
60  """
61  key = ""
62  desc = ""
63  ref = ""
64  authors = ""
65  title = ""
66  journal = ""
67  anno = dict()
68  anno_type = None
69 
70  # temp variables, used for parsing
71  values = list()
72  pair_rows = None
73  pair_cols = None
74 
75  current_data_type = None
76  stuff_read = False
77  for line in data:
78  if line.startswith("//"):
79  break # we're done
80  elif line.strip() == "":
81  continue # nothing to read
82  elif line[0] in ["H", "D", "R", "A", "T", "J", "I", "M"]:
83  stuff_read = True
84  current_data_type = line[0] # something we're interested in
85  elif line.startswith(" "):
86  pass # continuation of previous stuff
87  else:
88  current_data_type = None # unkown data, skip...
89 
90  if current_data_type == "H":
91  key = line[2:].strip()
92  elif current_data_type == "D":
93  desc += line[2:]
94  elif current_data_type == "R":
95  ref += line[2:]
96  elif current_data_type == "A":
97  authors += line[2:]
98  elif current_data_type == "T":
99  title += line[2:]
100  elif current_data_type == "J":
101  journal += line[2:]
102  elif current_data_type == "I":
103  if anno_type == AnnoType.PAIR:
104  raise RuntimeError("Observed single AA and pairwise "
105  "features in the same aaindex entry")
106  anno_type = AnnoType.SINGLE
107  if line.startswith("I"):
108  # header, must match expected aa ordering
109  aakeys = [item.strip() for item in line[1:].split()]
110  exp_aa_keys = ["A/L", "R/K", "N/M", "D/F", "C/P", "Q/S",
111  "E/T", "G/W", "H/Y", "I/V"]
112  if aakeys != exp_aa_keys:
113  raise RuntimeError(f"Keys in single AA AAIndex entry "
114  "are expected to be "
115  "I A/L R/K N/M D/F C/P Q/S E/T G/W H/Y I/V "
116  "got {line}")
117  else:
118  # its numbers... will be added to anno dict below
119  values += [_StrToFloat(x) for x in line.split()]
120  elif current_data_type == "M":
121  if anno_type == AnnoType.SINGLE:
122  raise RuntimeError("Observed single AA and pairwise "
123  "features in the same aaindex entry")
124  anno_type = AnnoType.PAIR
125  if line.startswith("M"):
126  # header, don't expect strict one letter code ordering here
127  # also because some pair entries include gaps ('-').
128  # expect something like: "M rows = <x>, cols = <x>"
129  split_line = line[1:].split(',')
130  split_line = sorted([item.strip() for item in split_line])
131  # after sorting we should have exactly two elements, the
132  # first starting with cols and the second with rows
133  if len(split_line) != 2 or \
134  not split_line[0].startswith("cols") or \
135  not split_line[1].startswith("rows"):
136  raise RuntimeError(f"Expect value header in pair "
137  "AAIndex entry to be of form: "
138  "\"M rows = <x>, cols = <x>\" got: "
139  "{line}")
140  pair_cols = split_line[0].split("=")[1].strip()
141  pair_rows = split_line[1].split("=")[1].strip()
142  if len(pair_cols) != len(pair_cols):
143  raise RuntimeError(f"Expect rows and cols to have same "
144  "number of elements when parsing "
145  "pair AAIndex entry got {line}")
146  else:
147  # its numbers... will be added to anno dict below
148  values += [_StrToFloat(x) for x in line.split()]
149 
150  if not stuff_read:
151  return None
152 
153  if key == "":
154  raise RuntimeError("Cannot parse AAIndex entry without key...")
155 
156  if anno_type == AnnoType.SINGLE:
157  olcs = "ARNDCQEGHILKMFPSTWYV"
158  if len(olcs) != len(values):
159  raise RuntimeError(f"Expected {len(olcs)} values in single AA "
160  "AAIndex entry, got {len(values)}")
161  for olc, value in zip(olcs, values):
162  anno[olc] = value
163  elif anno_type == AnnoType.PAIR:
164  # number of values can differ as the provided matrix can either be
165  # diagonal (symmetric A -> B == B -> A) or rectangular (non-symmetric
166  # A -> B != B -> A)
167  # For the former, number of columns and rows must be equal, no such
168  # requirement for non-symmetric case
169  n_values_match = False
170  n_cols = len(pair_cols)
171  n_rows = len(pair_rows)
172  n_nonsym = n_cols * n_rows
173  if len(values) == n_nonsym:
174  n_values_match = True
175  value_idx = 0
176  for a in pair_rows:
177  for b in pair_cols:
178  anno[a+b] = values[value_idx]
179  value_idx += 1
180 
181  if n_cols == n_rows:
182  n_values_match = True
183  N = n_cols
184  n_sym = (N*N - N) / 2 # number of elements below diagonal
185  n_sym += N # add diagonal elements again
186  if len(values) == n_sym:
187  value_idx = 0
188  for row_idx, row in enumerate(pair_rows):
189  for col in pair_cols[: row_idx+1]:
190  anno[row+col] = values[value_idx]
191  anno[col+row] = values[value_idx]
192  value_idx += 1
193  if not n_values_match:
194  raise RuntimeError(f"Number of parsed values doesn't match "
195  "parsed rows and cols descriptors")
196  else:
197  raise RuntimeError("Cannot parse AAIndex entry without values...")
198 
199  data = AAIndexData()
200  data.key = key
201  data.title = title
202  data.ref = ref
203  data.authors = authors
204  data.title = title
205  data.journal = journal
206  data.anno_type = anno_type
207  data.anno = anno
208  return data
209 
210  def GetScore(self, olc):
211  """Score/Annotation getter
212 
213  :param olc: One letter code of amino acid
214  :type olc: :class:`string`
215  :returns: Annotation/score for *olc*
216  :raises: :class:`ValueError` if *olc* is not known or
217  :class:`RuntimeError` if anno_type of this
218  :class:`AAIndexData` object is not AnnoType.SINGLE.
219  """
220  if self.anno_type == AnnoType.SINGLE:
221  if olc in self.anno:
222  return self.anno[olc]
223  else:
224  raise ValueError(f"OLC not in AAIndex: {olc}")
225  raise RuntimeError("Cannot return score for single amino acid with "
226  "AAIndex of type PAIR")
227 
228  def GetPairScore(self, olc_one, olc_two):
229  """Score/Annotation getter
230 
231  :param olc_one: One letter code of first amino acid
232  :type olc_one: :class:`string`
233  :param olc_two: One letter code of second amino acid
234  :type olc_two: :class:`string`
235  :returns: Pairwise annotation/score for *olc_one*/*olc_two*
236  :raises: :class:`ValueError` if key constructed from *olc_one* and
237  *olc_two* is not known or
238  :class:`RuntimeError` if anno_type of this
239  :class:`AAIndexData` object is not AnnoType.PAIR.
240  """
241  if self.anno_type == AnnoType.PAIR:
242  anno_key = olc_one + olc_two
243  if anno_key in self.anno:
244  return self.anno[anno_key]
245  else:
246  raise ValueError(f"Cannot find annotation for following pairs "
247  "of olcs: {olc_one}, {olc_two}")
248  raise RuntimeError("Cannot return score for pair of amino acid "
249  "with AAIndex of type SINGLE")
250 
251 
252 class AAIndex:
253  """Provides access to data from the amino acid index database (aaindex):
254 
255  Kawashima, S. and Kanehisa, M.; AAindex: amino acid index database.
256  Nucleic Acids Res. 28, 374 (2000).
257 
258  Files are available `here <https://www.genome.jp/aaindex/>`_
259 
260  :param aaindex_files: Paths to aaindex files. If not given, the files
261  aaindex1, aaindex2 and aaindex3 from the specified
262  source are used (Release 9.2, Feb 2017).
263  :type aaindex_files: :class:`list` of :class:`str`
264  """
265  def __init__(self, aaindex_files = None):
266  if aaindex_files is None:
267  aaindex_dir = os.path.join(GetSharedDataPath(), 'aaindex')
268  self.files_to_load = [os.path.join(aaindex_dir, "aaindex1"),
269  os.path.join(aaindex_dir, "aaindex2"),
270  os.path.join(aaindex_dir, "aaindex3")]
271  else:
272  self.files_to_load = list(aaindex_files)
273  self.aaindex_entries = dict()
274 
275  def keys(self):
276  """Emulate dict like behvaiour and returns all available keys, accession
277  numbers respectively.
278 
279  :returns: keys (or accession numbers) of all available aaindex entries
280  """
281  self._LoadAll()
282  return self.aaindex_entries.keys()
283 
284  def values(self):
285  """Emulate dict like behvaiour and returns all available entries.
286 
287  :returns: iterable of entries (type :class:`AAIndexData`)
288  """
289  self._LoadAll()
290  return self.aaindex_entries.values()
291 
292  def __getitem__(self, key):
293  """Getter by aaindex accession number (e.g. ANDN920101)
294 
295  :param key: aaindex accession number
296  :type key: :class:`str`
297  :returns: :class:`AAIndexData` object
298  """
299  while True:
300  if key in self.aaindex_entries:
301  return self.aaindex_entries[key]
302  # key is not available let's load the next aaindex file
303  if not self._Load():
304  # all files loaded, there is entry with *key* in any of them
305  break
306  raise KeyError(f"{key} does not exist in provided aa_index files")
307 
308  def _LoadAll(self):
309  """Loads all remaining files specified in self.files_to_load
310  """
311  while self._Load():
312  pass
313 
314  def _Load(self):
315  """Loads and removes first element in self.files_to_load. Returns False
316  if there is no file to load anymore, True if one is successfully loaded.
317  """
318  if len(self.files_to_load) == 0:
319  return False
320  path = self.files_to_load.pop(0)
321  if not os.path.exists(path):
322  raise RuntimeError(f"Tried to load {path} but file doesnt exist.")
323  with open(path, 'r') as fh:
324  data = fh.readlines()
325  data_it = iter(data)
326  while True: # read as long as it spits out AAIndexData entries
327  entry = AAIndexData.Parse(data_it)
328  if entry is None:
329  break # nothing to read anymore...
330  else:
331  self.aaindex_entries[entry.key] = entry
332  return True
String DLLEXPORT_OST_BASE GetSharedDataPath()