1 '''HHblits wrapper classes and functions.
11 from ost
import settings, seq
16 A hit found by HHblits
20 String identifying the hit
26 Pairwise alignment containing the aligned part between the query and the
27 target. First sequence is the query, the second sequence the target.
29 :type: :class:`~ost.seq.AlignmentHandle`
37 .. attribute:: ss_score
39 The secondary structure score
45 The E-value of the alignment
51 The P-value of the alignment
57 The probability of the alignment (between 0 and 100)
61 def __init__(self, hit_id, aln, score, ss_score, evalue, pvalue, prob):
71 """Stats from the beginning of search output.
75 The name of the query sequence
79 .. attribute:: match_columns
81 Total of aligned Match columns
87 Value of the ``-neff`` option
91 .. attribute:: searched_hmms
93 Number of profiles searched
101 :type: :class:`datetime.datetime`
103 .. attribute:: command
118 '''Fetch header content.
120 First, we seek the start of the identifier, that is, the first whitespace
121 after the hit number + 1. Since the identifier may contain whitespaces
122 itself, we cannot split the whole line
124 :param line: Line from the output header.
125 :type line: :class:`str`
127 :return: Hit information and query/template offsets
128 :rtype: (:class:`HHblitsHit`, (:class:`int`, :class:`int`))
130 for i
in range(0, len(line)):
131 if line[i].isdigit():
133 for i
in range(i, len(line)):
136 assert len(line)-i >= 31
and line[i+1] !=
' '
137 hit_id = line[i+1:i+31].strip()
138 fields = line[i+32:].split()
139 prob = float(fields[0])
140 evalue = float(fields[1])
141 pvalue = float(fields[2])
142 score = float(fields[3])
143 ss_score = float(fields[4])
144 offsets = (int(fields[6].split(
'-')[0]), int(fields[7].split(
'-')[0]))
145 return (
HHblitsHit(hit_id,
None, score, ss_score, evalue, pvalue, prob),
150 Parses the HHblits output as produced by :meth:`HHblits.Search` and returns
151 the header of the search results and a list of hits.
153 :param output: Iterable containing the lines of the HHblits output file
154 :type output: iterable (e.g. an open file handle)
156 :return: a tuple of the header of the search results and the hits
157 :rtype: (:class:`HHblitsHeader`, :class:`list` of :class:`HHblitsHit`)
160 def _ParseHeaderSection(lines):
161 value_start_column = 14
162 date_pattern =
'%a %b %d %H:%M:%S %Y'
165 assert line.startswith(
'Query')
166 header.query = line[value_start_column:].strip()
168 assert line.startswith(
'Match_columns')
169 header.match_columns = int(line[value_start_column:].strip())
172 assert line.startswith(
'No_of_seqs')
175 assert line.startswith(
'Neff')
176 header.n_eff = float(line[value_start_column:].strip())
179 assert line.startswith(
'Searched_HMMs')
180 header.searched_hmms = int(line[value_start_column:].strip())
183 assert line.startswith(
'Date')
184 value = line[value_start_column:].strip()
185 header.date = datetime.datetime.strptime(value, date_pattern)
188 assert line.startswith(
'Command')
189 header.command = line[value_start_column:].strip()
192 assert len(line.strip()) == 0
195 def _ParseTableOfContents(lines):
196 assert lines.next().startswith(
' No Hit')
200 if len(line.strip()) == 0:
205 def _ParseResultBody(query_id, hits, lines):
209 def _MakeAln(query_id, hit_id, query_string, templ_string,
211 s1 = seq.CreateSequence(query_id, query_string)
212 s1.offset = q_offset-1
213 s2 = seq.CreateSequence(hit_id, templ_string)
214 s2.offset = t_offset-1
215 return seq.CreateAlignment(s1, s2)
225 if len(line.strip()) == 0:
227 if line.startswith(
'Done!'):
228 if len(query_str) > 0:
229 hits[entry_index][0].aln = _MakeAln(\
230 query_id, hits[entry_index][0].hit_id,
231 query_str, templ_str, *hits[entry_index][1])
232 return [h
for h, o
in hits]
233 if line.startswith(
'No '):
234 if len(query_str) > 0:
235 hits[entry_index][0].aln = _MakeAln(\
236 query_id, hits[entry_index][0].hit_id,
237 query_str, templ_str, *hits[entry_index][1])
238 entry_index = int(line[3:].strip())-1
239 hits[entry_index][0].hit_id = lines.next()[1:].strip()
246 assert entry_index !=
None
248 if line[1:].startswith(
' Consensus'):
250 if line[1:].startswith(
' ss_pred'):
252 if line[1:].startswith(
' ss_conf'):
254 if line[1:].startswith(
' ss_dssp'):
256 if line.startswith(
'T '):
257 for start_pos
in range(22, len(line)):
258 if line[start_pos].isalpha()
or line[start_pos] ==
'-':
260 end_pos = line.find(
' ', start_pos)
263 error_str =
"Unparsable line '%s' for entry No %d" \
264 % (line.strip(), entry_index + 1)
265 raise AssertionError(error_str)
266 templ_str += line[start_pos:end_pos]
267 if line.startswith(
'Q '):
268 for start_pos
in range(22, len(line)):
269 if line[start_pos].isalpha()
or line[start_pos] ==
'-':
271 end_pos = line.find(
' ', start_pos)
274 error_str =
"Unparsable line '%s' for entry No %d" \
275 % (line.strip(), entry_index + 1)
276 raise AssertionError(error_str)
277 query_str += line[start_pos:end_pos]
278 except StopIteration:
279 if len(query_str) > 0:
280 hits[entry_index][0].aln = _MakeAln(query_id,
281 hits[entry_index][0].hit_id,
282 query_str, templ_str,
283 *hits[entry_index][1])
284 return [h
for h, o
in hits]
285 header = _ParseHeaderSection(lines)
290 hits = _ParseTableOfContents(lines)
291 return header, _ParseResultBody(header.query, hits, lines)
295 Parse secondary structure information and the multiple sequence alignment
296 out of an A3M file as produced by :meth:`HHblits.BuildQueryMSA`.
298 :param a3m_file: Iterable containing the lines of the A3M file
299 :type a3m_file: iterable (e.g. an open file handle)
301 :return: Dictionary containing "ss_pred" (:class:`list`), "ss_conf"
302 (:class:`list`) and "msa" (:class:`~ost.seq.AlignmentHandle`).
304 profile_dict = dict()
310 for line
in a3m_file:
311 if len(line.rstrip()) == 0:
313 elif line.startswith(
'>ss_pred'):
316 elif line.startswith(
'>ss_conf'):
320 if state ==
'ssconf' or state ==
'msa':
322 msa_head.append(line[1:].rstrip())
324 raise IOError(
'The A3M file is missing the "ss_conf" section')
328 if state ==
'sspred':
329 pred_seq_txt += line.rstrip()
330 elif state ==
'ssconf':
331 conf_seq_txt += line.rstrip()
333 msa_seq[len(msa_seq)-1] += line.rstrip()
335 profile_dict[
'ss_pred'] = list()
336 profile_dict[
'ss_conf'] = list()
337 for i
in range(0, len(pred_seq_txt)):
338 profile_dict[
'ss_pred'].append(pred_seq_txt[i])
339 profile_dict[
'ss_conf'].append(int(conf_seq_txt[i]))
343 profile_dict[
'msa'] =
None
346 al = seq.AlignmentList()
347 for i
in range(1, len(msa_seq)):
359 nl = seq.CreateAlignment(seq.CreateSequence(msa_head[0], qs),
360 seq.CreateSequence(msa_head[i], ts))
362 profile_dict[
'msa'] = seq.alg.MergePairwiseAlignments(\
363 al, seq.CreateSequence(msa_head[0], t))
369 Parse secondary structure information and the MSA out of an HHM profile as
370 produced by :meth:`HHblits.A3MToProfile`.
372 :param profile: Opened file handle holding the profile.
373 :type profile: :class:`file`
375 :return: Dictionary containing "ss_pred" (:class:`list`), "ss_conf"
376 (:class:`list`), "msa" (:class:`~ost.seq.AlignmentHandle`) and
377 "consensus" (:class:`~ost.seq.SequenceHandle`).
379 profile_dict = dict()
387 if len(line.rstrip()) == 0:
389 if line.rstrip() ==
'>ss_pred PSIPRED predicted secondary structure':
392 elif line.rstrip() ==
'>ss_conf PSIPRED confidence values':
395 elif line.rstrip() ==
'>Consensus':
399 if state ==
'consensus' or state ==
'msa':
401 msa_head.append(line[1:].rstrip())
403 raise IOError(
'Profile file "%s" is missing ' % profile.name+
404 'the "Consensus" section')
411 if state ==
'sspred':
412 pred_seq_txt += line.rstrip()
413 elif state ==
'ssconf':
414 conf_seq_txt += line.rstrip()
416 msa_seq[len(msa_seq)-1] += line.rstrip()
417 elif state ==
'consensus':
418 consensus_txt += line.rstrip()
420 profile_dict[
'ss_pred'] = list()
421 profile_dict[
'ss_conf'] = list()
422 for i
in range(0, len(pred_seq_txt)):
423 profile_dict[
'ss_pred'].append(pred_seq_txt[i])
424 profile_dict[
'ss_conf'].append(int(conf_seq_txt[i]))
428 profile_dict[
'msa'] =
None
431 al = seq.AlignmentList()
432 for i
in range(1, len(msa_seq)):
444 nl = seq.CreateAlignment(seq.CreateSequence(msa_head[0], qs),
445 seq.CreateSequence(msa_head[i], ts))
447 profile_dict[
'msa'] = seq.alg.MergePairwiseAlignments(\
448 al, seq.CreateSequence(msa_head[0], t))
451 profile_dict[
'consensus'] = seq.CreateSequence(
'Consensus', consensus_txt)
458 Initialise a new HHblits "search" for the given query. Query may either
459 be a :class:`~ost.seq.SequenceHandle` or a string. In the former case, the
460 query is the actual query sequence, in the latter case, the query is the
461 filename to the file containing the query.
463 :param query: Query sequence as file or sequence.
464 :type query: :class:`~ost.seq.SequenceHandle` or :class:`str`
465 :param hhsuite_root: Path to the top-level directory of your hhsuite
467 :type hhsuite_root: :class:`str`
468 :param hhblits_bin: Name of the hhblits binary. Will only be used if
469 :attr:`hhsuite_root`:file:`/bin/hhblits` does not exist.
470 :type hhblits_bin: :class:`str`
471 :param working_dir: Directory for temporary files. Will be created if not
472 present but **not** automatically deleted.
473 :type working_dir: :class:`str`
475 OUTPUT_PREFIX =
'query_hhblits'
476 def __init__(self, query, hhsuite_root, hhblits_bin=None, working_dir=None):
479 if os.path.exists(os.path.join(self.
hhsuite_root,
'bin/hhblits')):
484 explicit_file_name=hhblits_bin)
492 if not os.path.exists(working_dir):
493 os.mkdir(working_dir)
494 if isinstance(query, str):
497 if self.
filename != os.path.abspath(query):
501 '%s.fasta' % HHblits.OUTPUT_PREFIX)
505 if isinstance(query, str):
507 self.
filename = os.path.abspath(os.path.join(
516 """Builds the MSA for the query sequence.
518 This function directly uses hhblits of hhtools. While in theory it would
519 be possible to do this by PSI-blasting on our own, hhblits is supposed
520 to be faster. Also it is supposed to prevent alignment corruption. The
521 alignment corruption is caused by low-scoring terminal alignments that
522 draw the sequences found by PSI-blast away from the optimum. By removing
523 these low scoring ends, part of the alignment corruption can be
526 hhblits does **not** call PSIPRED on the MSA to predict the secondary
527 structure of the query sequence. This is done by addss.pl of hhtools.
528 The predicted secondary structure is stored together with the sequences
529 identified by hhblits.
531 The produced A3M file can be parsed by :func:`ParseA3M`. If the file was
532 already produced, hhblits is not called again and the existing file path
535 :param nrdb: Database to be align against; has to be an hhblits database
536 :type nrdb: :class:`str`
538 :param options: Dictionary of options to *hhblits*, one "-" is added in
539 front of every key. Boolean True values add flag without
540 value. Merged with default options {'cpu': 1, 'n': 1},
541 where 'n' defines the number of iterations.
542 :type options: :class:`dict`
544 :param a3m_file: a path of a3m_file to be used, optional
545 :type a3m_file: :class:`str`
547 :return: The path to the A3M file containing the MSA
551 a3m_file =
'%s.a3m' % os.path.splitext(self.
filename)[0]
553 a3m_file = os.path.abspath(a3m_file)
554 if os.path.exists(a3m_file):
555 ost.LogInfo(
'Reusing already existing query alignment (%s)' % a3m_file)
557 ost.LogInfo(
'Using hhblits from "%s"' % self.
hhsuite_root)
558 full_nrdb = os.path.join(os.path.abspath(os.path.split(nrdb)[0]),
559 os.path.split(nrdb)[1])
564 opt_cmd, _ = _ParseOptions(opts)
565 hhblits_cmd =
'%s -e 0.001 -i %s -oa3m %s -d %s %s' % \
568 job = subprocess.Popen(hhblits_cmd, shell=
True, cwd=self.
working_dir,
569 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
570 sout, _ = job.communicate()
571 lines = sout.splitlines()
573 ost.LogVerbose(line.strip())
574 if not os.path.exists(a3m_file):
575 ost.LogWarning(
'Building query profile failed, no output')
579 'lib/hh/scripts/addss.pl'),
581 env = dict(os.environ)
582 env.update({
'PERL5LIB' : os.path.join(self.
hhsuite_root,
585 'PATH' :
'%s:%s' % (os.path.join(self.
hhsuite_root,
'bin'),
586 os.environ[
'PATH'])})
587 job = subprocess.Popen(addss_cmd, shell=
True, cwd=self.
working_dir,
588 env=env, stdout=subprocess.PIPE,
589 stderr=subprocess.PIPE)
590 sout, serr = job.communicate()
591 lines = sout.splitlines()
593 if 'error' in line.lower():
594 ost.LogWarning(
'Predicting secondary structure for MSA '+
595 '(%s) failed, on command: %s' % (a3m_file, line))
601 Converts the A3M alignment file to a hhm profile. If hhm_file is not
602 given, the output file will be set to <:attr:`a3m_file`-basename>.hhm.
604 The produced HHM file can be parsed by :func:`ParseHHM`.
606 If the file was already produced, the existing file path is returned
607 without recomputing it.
609 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
610 :type a3m_file: :class:`str`
612 :param hhm_file: Desired output file name
613 :type hhm_file: :class:`str`
615 :return: Path to the profile file
618 hhmake = os.path.join(self.
bin_dir,
'hhmake')
620 hhm_file =
'%s.hhm' % os.path.splitext(a3m_file)[0]
621 if os.path.exists(hhm_file):
623 ost.LogVerbose(
'converting %s to %s' % (a3m_file, hhm_file))
625 if subprocess.call(
'%s -i %s -o %s' % (hhmake, a3m_file, hhm_file),
627 raise IOError(
'could not convert a3m to hhm file')
630 def A3MToCS(self, a3m_file, cs_file=None, options={}):
632 Converts the A3M alignment file to a column state sequence file. If
633 cs_file is not given, the output file will be set to
634 <:attr:`a3m_file`-basename>.seq219.
636 If the file was already produced, the existing file path is returned
637 without recomputing it.
639 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
640 :type a3m_file: :class:`str`
642 :param cs_file: Output file name (may be omitted)
643 :type cs_file: :class:`str`
645 :param options: Dictionary of options to *cstranslate*, one "-" is added
646 in front of every key. Boolean True values add flag
648 :type options: :class:`dict`
650 :return: Path to the column state sequence file
653 cstranslate = os.path.join(self.
hhlib_dir,
'bin',
'cstranslate')
655 cs_file =
'%s.seq219' % os.path.splitext(a3m_file)[0]
656 if os.path.exists(cs_file):
658 opt_cmd, _ = _ParseOptions(options)
659 cs_cmd =
'%s -i %s -o %s %s' % (
661 os.path.abspath(a3m_file),
662 os.path.abspath(cs_file),
664 ost.LogVerbose(
'converting %s to %s' % (a3m_file, cs_file))
665 job = subprocess.Popen(cs_cmd, shell=
True, cwd=self.
working_dir,
666 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
667 sout, _ = job.communicate()
668 lines = sout.splitlines()
670 if 'Wrote abstract state sequence to' in line:
672 ost.LogWarning(
'Creating column state sequence file (%s) failed' % \
676 """Delete temporary data.
678 Delete temporary data if no working dir was given. Controlled by
679 :attr:`needs_cleanup`.
685 '''In case something went wrong, call to make sure everything is clean.
687 This will delete the working dir independently of :attr:`needs_cleanup`.
694 def Search(self, a3m_file, database, options={}, prefix=''):
696 Searches for templates in the given database. Before running the search,
697 the hhm file is copied. This makes it possible to launch several hhblits
698 instances at once. Upon success, the filename of the result file is
699 returned. This file may be parsed with :func:`ParseHHblitsOutput`.
701 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
702 :type a3m_file: :class:`str`
704 :param database: Search database, needs to be the common prefix of the
706 :type database: :class:`str`
708 :param options: Dictionary of options to *hhblits*, one "-" is added in
709 front of every key. Boolean True values add flag without
710 value. Merged with default options {'cpu': 1, 'n': 1},
711 where 'n' defines the number of iterations.
712 :type options: :class:`dict`
714 :param prefix: Prefix to the result file
715 :type prefix: :class:`str`
717 :return: The path to the result file
723 opt_cmd, opt_str = _ParseOptions(opts)
724 base = os.path.basename(os.path.splitext(a3m_file)[0])
725 hhr_file =
'%s%s_%s.hhr' % (prefix, base, opt_str)
726 hhr_file = os.path.join(self.
working_dir, hhr_file)
727 search_cmd =
'%s %s -e 0.001 -Z 10000 -B 10000 -i %s -o %s -d %s' % (
730 os.path.abspath(a3m_file),
731 os.path.abspath(hhr_file),
732 os.path.join(os.path.abspath(os.path.split(database)[0]),
733 os.path.split(database)[1]))
734 ost.LogInfo(
'searching %s' % database)
735 ost.LogVerbose(search_cmd)
736 job = subprocess.Popen(search_cmd, shell=
True, cwd=self.
working_dir,
737 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
738 sout, serr = job.communicate()
739 if job.returncode != 0:
740 lines = sout.splitlines()
742 ost.LogError(line.strip())
743 lines = serr.splitlines()
745 ost.LogError(line.strip())
750 def _ParseOptions(opts):
752 :return: Tuple of strings (opt_cmd, opt_str), where opt_cmd can be
753 passed to command ("-" added in front of keys, options
754 separated by space) and opt_str (options separated by "_")
755 can be used for filenames.
756 :param opts: Dictionary of options, one "-" is added in front of every
757 key. Boolean True values add flag without value.
761 for k, val
in opts.iteritems():
762 if type(val) == type(
True):
764 opt_cmd.append(
'-%s' % str(k))
765 opt_str.append(str(k))
767 opt_cmd.append(
'-%s %s' % (str(k), str(val)))
768 opt_str.append(
'%s%s' % (str(k), str(val)))
769 opt_cmd =
' '.join(opt_cmd)
770 opt_str =
'_'.join(opt_str)
771 return opt_cmd, opt_str
774 __all__ = [
'HHblits',
'HHblitsHit',
'HHblitsHeader',
775 'ParseHHblitsOutput',
'ParseA3M',
'ParseHHM',
void DLLEXPORT_OST_IO SaveSequence(const seq::ConstSequenceHandle &sequence, const String &filename, const String &format="auto")