4 from ost
import GetSharedDataPath
6 def _StrToFloat(str_item):
7 """Returns None if *str_item* looks like invalid number, casts to
10 if x.lower()
in [
"na",
"-",
"none"]:
16 """Possible types of aaindex entries"""
22 """Data object representing an annotation in aaindex, preferably
23 constructed from it's static :func:`Parse` method. The following
24 attributes are available:
26 * key: aaindex accession number (e.g. ANDN920101)
27 * desc: descriptive title
28 * ref: Reference to article if available
29 * authors: Authors of article if available
30 * title: Title of article if available
31 * journal: Journal of article if available
32 * anno_type: Enum (:class:`AnnoType`) specifying whether we're dealing
33 with a single or pairwise amino acid annotation/score.
34 * anno: :class:`dict` with annotation. If *anno_type* is SINGLE,
35 keys are amino acid one letter codes (single character strings).
36 If *anno_type* is PAIR, keys are two one letter codes added
37 together (two character strings). Even when the thing is
38 symmetric, both keys exist. I.e. 'AB' AND 'BA'.
39 Values are of type :class:`float` (None if not available).
53 """Creates :class:`AAIndexData` from data.
55 :param data: Iterable with strings in data format described for aaindex.
56 :returns: :class:`AAIndexData`, if iterable contains several entries,
57 parsing stops at separation sequence ('//'). None is returned
58 if nothing could be parsed.
59 :raises: descriptive error in case of corrupt data
75 current_data_type =
None
78 if line.startswith(
"//"):
80 elif line.strip() ==
"":
82 elif line[0]
in [
"H",
"D",
"R", "A", "T", "J", "I", "M"]:
84 current_data_type = line[0]
85 elif line.startswith(
" "):
88 current_data_type =
None
90 if current_data_type ==
"H":
91 key = line[2:].strip()
92 elif current_data_type ==
"D":
94 elif current_data_type ==
"R":
96 elif current_data_type ==
"A":
98 elif current_data_type ==
"T":
100 elif current_data_type ==
"J":
102 elif current_data_type ==
"I":
103 if anno_type == AnnoType.PAIR:
104 raise RuntimeError(
"Observed single AA and pairwise "
105 "features in the same aaindex entry")
106 anno_type = AnnoType.SINGLE
107 if line.startswith(
"I"):
109 aakeys = [item.strip()
for item
in line[1:].split()]
110 exp_aa_keys = [
"A/L",
"R/K",
"N/M",
"D/F",
"C/P",
"Q/S",
111 "E/T",
"G/W",
"H/Y",
"I/V"]
112 if aakeys != exp_aa_keys:
113 raise RuntimeError(f
"Keys in single AA AAIndex entry "
114 "are expected to be "
115 "I A/L R/K N/M D/F C/P Q/S E/T G/W H/Y I/V "
119 values += [_StrToFloat(x)
for x
in line.split()]
120 elif current_data_type ==
"M":
121 if anno_type == AnnoType.SINGLE:
122 raise RuntimeError(
"Observed single AA and pairwise "
123 "features in the same aaindex entry")
124 anno_type = AnnoType.PAIR
125 if line.startswith(
"M"):
129 split_line = line[1:].split(
',')
130 split_line = sorted([item.strip()
for item
in split_line])
133 if len(split_line) != 2
or \
134 not split_line[0].startswith(
"cols")
or \
135 not split_line[1].startswith(
"rows"):
136 raise RuntimeError(f
"Expect value header in pair "
137 "AAIndex entry to be of form: "
138 "\"M rows = <x>, cols = <x>\" got: "
140 pair_cols = split_line[0].split(
"=")[1].strip()
141 pair_rows = split_line[1].split(
"=")[1].strip()
142 if len(pair_cols) != len(pair_cols):
143 raise RuntimeError(f
"Expect rows and cols to have same "
144 "number of elements when parsing "
145 "pair AAIndex entry got {line}")
148 values += [_StrToFloat(x)
for x
in line.split()]
154 raise RuntimeError(
"Cannot parse AAIndex entry without key...")
156 if anno_type == AnnoType.SINGLE:
157 olcs =
"ARNDCQEGHILKMFPSTWYV"
158 if len(olcs) != len(values):
159 raise RuntimeError(f
"Expected {len(olcs)} values in single AA "
160 "AAIndex entry, got {len(values)}")
161 for olc, value
in zip(olcs, values):
163 elif anno_type == AnnoType.PAIR:
169 n_values_match =
False
170 n_cols = len(pair_cols)
171 n_rows = len(pair_rows)
172 n_nonsym = n_cols * n_rows
173 if len(values) == n_nonsym:
174 n_values_match =
True
178 anno[a+b] = values[value_idx]
182 n_values_match =
True
184 n_sym = (N*N - N) / 2
186 if len(values) == n_sym:
188 for row_idx, row
in enumerate(pair_rows):
189 for col
in pair_cols[: row_idx+1]:
190 anno[row+col] = values[value_idx]
191 anno[col+row] = values[value_idx]
193 if not n_values_match:
194 raise RuntimeError(f
"Number of parsed values doesn't match "
195 "parsed rows and cols descriptors")
197 raise RuntimeError(
"Cannot parse AAIndex entry without values...")
203 data.authors = authors
205 data.journal = journal
206 data.anno_type = anno_type
211 """Score/Annotation getter
213 :param olc: One letter code of amino acid
214 :type olc: :class:`string`
215 :returns: Annotation/score for *olc*
216 :raises: :class:`ValueError` if *olc* is not known or
217 :class:`RuntimeError` if anno_type of this
218 :class:`AAIndexData` object is not AnnoType.SINGLE.
222 return self.
anno[olc]
224 raise ValueError(f
"OLC not in AAIndex: {olc}")
225 raise RuntimeError(
"Cannot return score for single amino acid with "
226 "AAIndex of type PAIR")
229 """Score/Annotation getter
231 :param olc_one: One letter code of first amino acid
232 :type olc_one: :class:`string`
233 :param olc_two: One letter code of second amino acid
234 :type olc_two: :class:`string`
235 :returns: Pairwise annotation/score for *olc_one*/*olc_two*
236 :raises: :class:`ValueError` if key constructed from *olc_one* and
237 *olc_two* is not known or
238 :class:`RuntimeError` if anno_type of this
239 :class:`AAIndexData` object is not AnnoType.PAIR.
242 anno_key = olc_one + olc_two
243 if anno_key
in self.
anno:
244 return self.
anno[anno_key]
246 raise ValueError(f
"Cannot find annotation for following pairs "
247 "of olcs: {olc_one}, {olc_two}")
248 raise RuntimeError(
"Cannot return score for pair of amino acid "
249 "with AAIndex of type SINGLE")
253 """Provides access to data from the amino acid index database (aaindex):
255 Kawashima, S. and Kanehisa, M.; AAindex: amino acid index database.
256 Nucleic Acids Res. 28, 374 (2000).
258 Files are available `here <https://www.genome.jp/aaindex/>`_
260 :param aaindex_files: Paths to aaindex files. If not given, the files
261 aaindex1, aaindex2 and aaindex3 from the specified
262 source are used (Release 9.2, Feb 2017).
263 :type aaindex_files: :class:`list` of :class:`str`
266 if aaindex_files
is None:
269 os.path.join(aaindex_dir,
"aaindex2"),
270 os.path.join(aaindex_dir,
"aaindex3")]
276 """Emulate dict like behvaiour and returns all available keys, accession
277 numbers respectively.
279 :returns: keys (or accession numbers) of all available aaindex entries
282 return self.aaindex_entries.keys()
285 """Emulate dict like behvaiour and returns all available entries.
287 :returns: iterable of entries (type :class:`AAIndexData`)
290 return self.aaindex_entries.values()
293 """Getter by aaindex accession number (e.g. ANDN920101)
295 :param key: aaindex accession number
296 :type key: :class:`str`
297 :returns: :class:`AAIndexData` object
306 raise KeyError(f
"{key} does not exist in provided aa_index files")
309 """Loads all remaining files specified in self.files_to_load
315 """Loads and removes first element in self.files_to_load. Returns False
316 if there is no file to load anymore, True if one is successfully loaded.
320 path = self.files_to_load.pop(0)
321 if not os.path.exists(path):
322 raise RuntimeError(f
"Tried to load {path} but file doesnt exist.")
323 with open(path,
'r') as fh:
324 data = fh.readlines()
327 entry = AAIndexData.Parse(data_it)
String DLLEXPORT_OST_BASE GetSharedDataPath()