OpenStructure
Loading...
Searching...
No Matches
aaindex.py
Go to the documentation of this file.
1import os
2from enum import Enum
3
4from ost import GetSharedDataPath
5
6def _StrToFloat(str_item):
7 """Returns None if *str_item* looks like invalid number, casts to
8 float otherwise"""
9 x = str_item.strip()
10 if x.lower() in ["na", "-", "none"]:
11 return None
12 return float(x)
13
14
15class AnnoType(str, Enum):
16 """Possible types of aaindex entries"""
17 SINGLE = 'SINGLE' #: Single amino acid annotation
18 PAIR = 'PAIR' #: Pairwise amino acid annotation, substitution/pairwise scores
19
20
22 """Data object representing an annotation in aaindex, preferably
23 constructed from it's static :func:`Parse` method. The following
24 attributes are available:
25
26 * key: aaindex accession number (e.g. ANDN920101)
27 * desc: descriptive title
28 * ref: Reference to article if available
29 * authors: Authors of article if available
30 * title: Title of article if available
31 * journal: Journal of article if available
32 * anno_type: Enum (:class:`AnnoType`) specifying whether we're dealing
33 with a single or pairwise amino acid annotation/score.
34 * anno: :class:`dict` with annotation. If *anno_type* is SINGLE,
35 keys are amino acid one letter codes (single character strings).
36 If *anno_type* is PAIR, keys are two one letter codes added
37 together (two character strings). Even when the thing is
38 symmetric, both keys exist. I.e. 'AB' AND 'BA'.
39 Values are of type :class:`float` (None if not available).
40 """
41 def __init__(self):
42 self.key = None
43 self.desc = None
44 self.ref = None
45 self.authors = None
46 self.title = None
47 self.journal = None
48 self.anno_type = AnnoType.SINGLE
49 self.anno = dict()
50
51 @staticmethod
52 def Parse(data):
53 """Creates :class:`AAIndexData` from data.
54
55 :param data: Iterable with strings in data format described for aaindex.
56 :returns: :class:`AAIndexData`, if iterable contains several entries,
57 parsing stops at separation sequence ('//'). None is returned
58 if nothing could be parsed.
59 :raises: descriptive error in case of corrupt data
60 """
61 key = ""
62 desc = ""
63 ref = ""
64 authors = ""
65 title = ""
66 journal = ""
67 anno = dict()
68 anno_type = None
69
70 # temp variables, used for parsing
71 values = list()
72 pair_rows = None
73 pair_cols = None
74
75 current_data_type = None
76 stuff_read = False
77 for line in data:
78 if line.startswith("//"):
79 break # we're done
80 elif line.strip() == "":
81 continue # nothing to read
82 elif line[0] in ["H", "D", "R", "A", "T", "J", "I", "M"]:
83 stuff_read = True
84 current_data_type = line[0] # something we're interested in
85 elif line.startswith(" "):
86 pass # continuation of previous stuff
87 else:
88 current_data_type = None # unkown data, skip...
89
90 if current_data_type == "H":
91 key = line[2:].strip()
92 elif current_data_type == "D":
93 desc += line[2:]
94 elif current_data_type == "R":
95 ref += line[2:]
96 elif current_data_type == "A":
97 authors += line[2:]
98 elif current_data_type == "T":
99 title += line[2:]
100 elif current_data_type == "J":
101 journal += line[2:]
102 elif current_data_type == "I":
103 if anno_type == AnnoType.PAIR:
104 raise RuntimeError("Observed single AA and pairwise "
105 "features in the same aaindex entry")
106 anno_type = AnnoType.SINGLE
107 if line.startswith("I"):
108 # header, must match expected aa ordering
109 aakeys = [item.strip() for item in line[1:].split()]
110 exp_aa_keys = ["A/L", "R/K", "N/M", "D/F", "C/P", "Q/S",
111 "E/T", "G/W", "H/Y", "I/V"]
112 if aakeys != exp_aa_keys:
113 raise RuntimeError(f"Keys in single AA AAIndex entry "
114 "are expected to be "
115 "I A/L R/K N/M D/F C/P Q/S E/T G/W H/Y I/V "
116 "got {line}")
117 else:
118 # its numbers... will be added to anno dict below
119 values += [_StrToFloat(x) for x in line.split()]
120 elif current_data_type == "M":
121 if anno_type == AnnoType.SINGLE:
122 raise RuntimeError("Observed single AA and pairwise "
123 "features in the same aaindex entry")
124 anno_type = AnnoType.PAIR
125 if line.startswith("M"):
126 # header, don't expect strict one letter code ordering here
127 # also because some pair entries include gaps ('-').
128 # expect something like: "M rows = <x>, cols = <x>"
129 split_line = line[1:].split(',')
130 split_line = sorted([item.strip() for item in split_line])
131 # after sorting we should have exactly two elements, the
132 # first starting with cols and the second with rows
133 if len(split_line) != 2 or \
134 not split_line[0].startswith("cols") or \
135 not split_line[1].startswith("rows"):
136 raise RuntimeError(f"Expect value header in pair "
137 "AAIndex entry to be of form: "
138 "\"M rows = <x>, cols = <x>\" got: "
139 "{line}")
140 pair_cols = split_line[0].split("=")[1].strip()
141 pair_rows = split_line[1].split("=")[1].strip()
142 if len(pair_cols) != len(pair_cols):
143 raise RuntimeError(f"Expect rows and cols to have same "
144 "number of elements when parsing "
145 "pair AAIndex entry got {line}")
146 else:
147 # its numbers... will be added to anno dict below
148 values += [_StrToFloat(x) for x in line.split()]
149
150 if not stuff_read:
151 return None
152
153 if key == "":
154 raise RuntimeError("Cannot parse AAIndex entry without key...")
155
156 if anno_type == AnnoType.SINGLE:
157 olcs = "ARNDCQEGHILKMFPSTWYV"
158 if len(olcs) != len(values):
159 raise RuntimeError(f"Expected {len(olcs)} values in single AA "
160 "AAIndex entry, got {len(values)}")
161 for olc, value in zip(olcs, values):
162 anno[olc] = value
163 elif anno_type == AnnoType.PAIR:
164 # number of values can differ as the provided matrix can either be
165 # diagonal (symmetric A -> B == B -> A) or rectangular (non-symmetric
166 # A -> B != B -> A)
167 # For the former, number of columns and rows must be equal, no such
168 # requirement for non-symmetric case
169 n_values_match = False
170 n_cols = len(pair_cols)
171 n_rows = len(pair_rows)
172 n_nonsym = n_cols * n_rows
173 if len(values) == n_nonsym:
174 n_values_match = True
175 value_idx = 0
176 for a in pair_rows:
177 for b in pair_cols:
178 anno[a+b] = values[value_idx]
179 value_idx += 1
180
181 if n_cols == n_rows:
182 n_values_match = True
183 N = n_cols
184 n_sym = (N*N - N) / 2 # number of elements below diagonal
185 n_sym += N # add diagonal elements again
186 if len(values) == n_sym:
187 value_idx = 0
188 for row_idx, row in enumerate(pair_rows):
189 for col in pair_cols[: row_idx+1]:
190 anno[row+col] = values[value_idx]
191 anno[col+row] = values[value_idx]
192 value_idx += 1
193 if not n_values_match:
194 raise RuntimeError(f"Number of parsed values doesn't match "
195 "parsed rows and cols descriptors")
196 else:
197 raise RuntimeError("Cannot parse AAIndex entry without values...")
198
199 data = AAIndexData()
200 data.key = key
201 data.title = title
202 data.ref = ref
203 data.authors = authors
204 data.title = title
205 data.journal = journal
206 data.anno_type = anno_type
207 data.anno = anno
208 return data
209
210 def GetScore(self, olc):
211 """Score/Annotation getter
212
213 :param olc: One letter code of amino acid
214 :type olc: :class:`string`
215 :returns: Annotation/score for *olc*
216 :raises: :class:`ValueError` if *olc* is not known or
217 :class:`RuntimeError` if anno_type of this
218 :class:`AAIndexData` object is not AnnoType.SINGLE.
219 """
220 if self.anno_type == AnnoType.SINGLE:
221 if olc in self.anno:
222 return self.anno[olc]
223 else:
224 raise ValueError(f"OLC not in AAIndex: {olc}")
225 raise RuntimeError("Cannot return score for single amino acid with "
226 "AAIndex of type PAIR")
227
228 def GetPairScore(self, olc_one, olc_two):
229 """Score/Annotation getter
230
231 :param olc_one: One letter code of first amino acid
232 :type olc_one: :class:`string`
233 :param olc_two: One letter code of second amino acid
234 :type olc_two: :class:`string`
235 :returns: Pairwise annotation/score for *olc_one*/*olc_two*
236 :raises: :class:`ValueError` if key constructed from *olc_one* and
237 *olc_two* is not known or
238 :class:`RuntimeError` if anno_type of this
239 :class:`AAIndexData` object is not AnnoType.PAIR.
240 """
241 if self.anno_type == AnnoType.PAIR:
242 anno_key = olc_one + olc_two
243 if anno_key in self.anno:
244 return self.anno[anno_key]
245 else:
246 raise ValueError(f"Cannot find annotation for following pairs "
247 "of olcs: {olc_one}, {olc_two}")
248 raise RuntimeError("Cannot return score for pair of amino acid "
249 "with AAIndex of type SINGLE")
250
251
253 """Provides access to data from the amino acid index database (aaindex):
254
255 Kawashima, S. and Kanehisa, M.; AAindex: amino acid index database.
256 Nucleic Acids Res. 28, 374 (2000).
257
258 Files are available `here <https://www.genome.jp/aaindex/>`_
259
260 :param aaindex_files: Paths to aaindex files. If not given, the files
261 aaindex1, aaindex2 and aaindex3 from the specified
262 source are used (Release 9.2, Feb 2017).
263 :type aaindex_files: :class:`list` of :class:`str`
264 """
265 def __init__(self, aaindex_files = None):
266 if aaindex_files is None:
267 aaindex_dir = os.path.join(GetSharedDataPath(), 'aaindex')
268 self.files_to_load = [os.path.join(aaindex_dir, "aaindex1"),
269 os.path.join(aaindex_dir, "aaindex2"),
270 os.path.join(aaindex_dir, "aaindex3")]
271 else:
272 self.files_to_load = list(aaindex_files)
273 self.aaindex_entries = dict()
274
275 def keys(self):
276 """Emulate dict like behvaiour and returns all available keys, accession
277 numbers respectively.
278
279 :returns: keys (or accession numbers) of all available aaindex entries
280 """
281 self._LoadAll()
282 return self.aaindex_entries.keys()
283
284 def values(self):
285 """Emulate dict like behvaiour and returns all available entries.
286
287 :returns: iterable of entries (type :class:`AAIndexData`)
288 """
289 self._LoadAll()
290 return self.aaindex_entries.values()
291
292 def __getitem__(self, key):
293 """Getter by aaindex accession number (e.g. ANDN920101)
294
295 :param key: aaindex accession number
296 :type key: :class:`str`
297 :returns: :class:`AAIndexData` object
298 """
299 while True:
300 if key in self.aaindex_entries:
301 return self.aaindex_entries[key]
302 # key is not available let's load the next aaindex file
303 if not self._Load():
304 # all files loaded, there is entry with *key* in any of them
305 break
306 raise KeyError(f"{key} does not exist in provided aa_index files")
307
308 def _LoadAll(self):
309 """Loads all remaining files specified in self.files_to_load
310 """
311 while self._Load():
312 pass
313
314 def _Load(self):
315 """Loads and removes first element in self.files_to_load. Returns False
316 if there is no file to load anymore, True if one is successfully loaded.
317 """
318 if len(self.files_to_load) == 0:
319 return False
320 path = self.files_to_load.pop(0)
321 if not os.path.exists(path):
322 raise RuntimeError(f"Tried to load {path} but file doesnt exist.")
323 with open(path, 'r') as fh:
324 data = fh.readlines()
325 data_it = iter(data)
326 while True: # read as long as it spits out AAIndexData entries
327 entry = AAIndexData.Parse(data_it)
328 if entry is None:
329 break # nothing to read anymore...
330 else:
331 self.aaindex_entries[entry.key] = entry
332 return True
GetPairScore(self, olc_one, olc_two)
Definition aaindex.py:228
__init__(self, aaindex_files=None)
Definition aaindex.py:265
_StrToFloat(str_item)
Definition aaindex.py:6