1 '''HHblits wrapper classes and functions.
11 from ost
import settings, seq
16 A hit found by HHblits
20 String identifying the hit
26 Pairwise alignment containing the aligned part between the query and the
27 target. First sequence is the query, the second sequence the target.
29 :type: :class:`~ost.seq.AlignmentHandle`
37 .. attribute:: ss_score
39 The secondary structure score
45 The E-value of the alignment
51 The P-value of the alignment
57 The probability of the alignment (between 0 and 100)
61 def __init__(self, hit_id, aln, score, ss_score, evalue, pvalue, prob):
71 """Stats from the beginning of search output.
75 The name of the query sequence
79 .. attribute:: match_columns
81 Total of aligned Match columns
87 Value of the ``-neff`` option
91 .. attribute:: searched_hmms
93 Number of profiles searched
101 :type: :class:`datetime.datetime`
103 .. attribute:: command
118 '''Fetch header content.
120 First, we seek the start of the identifier, that is, the first whitespace
121 after the hit number + 1. Since the identifier may contain whitespaces
122 itself, we cannot split the whole line
124 :param line: Line from the output header.
125 :type line: :class:`str`
127 :return: Hit information and query/template offsets
128 :rtype: (:class:`HHblitsHit`, (:class:`int`, :class:`int`))
130 for i
in range(0, len(line)):
131 if line[i].isdigit():
133 for i
in range(i, len(line)):
136 assert len(line)-i >= 31
and line[i+1] !=
' '
137 hit_id = line[i+1:i+31].strip()
138 fields = line[i+32:].split()
139 prob = float(fields[0])
140 evalue = float(fields[1])
141 pvalue = float(fields[2])
142 score = float(fields[3])
143 ss_score = float(fields[4])
144 offsets = (int(fields[6].split(
'-')[0]), int(fields[7].split(
'-')[0]))
145 return (
HHblitsHit(hit_id,
None, score, ss_score, evalue, pvalue, prob),
150 Parses the HHblits output as produced by :meth:`HHblits.Search` and returns
151 the header of the search results and a list of hits.
153 :param output: Iterable containing the lines of the HHblits output file
154 :type output: iterable (e.g. an open file handle)
156 :return: a tuple of the header of the search results and the hits
157 :rtype: (:class:`HHblitsHeader`, :class:`list` of :class:`HHblitsHit`)
160 def _ParseHeaderSection(lines):
161 value_start_column = 14
162 date_pattern =
'%a %b %d %H:%M:%S %Y'
165 assert line.startswith(
'Query')
166 header.query = line[value_start_column:].strip()
168 assert line.startswith(
'Match_columns')
169 header.match_columns = int(line[value_start_column:].strip())
172 assert line.startswith(
'No_of_seqs')
175 assert line.startswith(
'Neff')
176 header.n_eff = float(line[value_start_column:].strip())
179 assert line.startswith(
'Searched_HMMs')
180 header.searched_hmms = int(line[value_start_column:].strip())
183 assert line.startswith(
'Date')
184 value = line[value_start_column:].strip()
185 header.date = datetime.datetime.strptime(value, date_pattern)
188 assert line.startswith(
'Command')
189 header.command = line[value_start_column:].strip()
192 assert len(line.strip()) == 0
195 def _ParseTableOfContents(lines):
196 assert lines.next().startswith(
' No Hit')
200 if len(line.strip()) == 0:
205 def _ParseResultBody(query_id, hits, lines):
209 def _MakeAln(query_id, hit_id, query_string, templ_string,
211 s1 = seq.CreateSequence(query_id, query_string)
212 s1.offset = q_offset-1
213 s2 = seq.CreateSequence(hit_id, templ_string)
214 s2.offset = t_offset-1
215 return seq.CreateAlignment(s1, s2)
225 if len(line.strip()) == 0:
227 if line.startswith(
'Done!'):
228 if len(query_str) > 0:
229 hits[entry_index][0].aln = _MakeAln(\
230 query_id, hits[entry_index][0].hit_id,
231 query_str, templ_str, *hits[entry_index][1])
232 return [h
for h, o
in hits]
233 if line.startswith(
'No '):
234 if len(query_str) > 0:
235 hits[entry_index][0].aln = _MakeAln(\
236 query_id, hits[entry_index][0].hit_id,
237 query_str, templ_str, *hits[entry_index][1])
238 entry_index = int(line[3:].strip())-1
239 hits[entry_index][0].hit_id = lines.next()[1:].strip()
246 assert entry_index !=
None
248 if line[1:].startswith(
' Consensus'):
250 if line[1:].startswith(
' ss_pred'):
252 if line[1:].startswith(
' ss_conf'):
254 if line[1:].startswith(
' ss_dssp'):
256 if line.startswith(
'T '):
257 end_pos = line.find(
' ', 22)
260 error_str =
"Unparsable line '%s' for entry No %d" \
261 % (line.strip(), entry_index + 1)
262 raise AssertionError(error_str)
263 templ_str += line[22:end_pos]
264 if line.startswith(
'Q '):
265 end_pos = line.find(
' ', 22)
268 error_str =
"Unparsable line '%s' for entry No %d" \
269 % (line.strip(), entry_index + 1)
270 raise AssertionError(error_str)
271 query_str += line[22:end_pos]
272 except StopIteration:
273 if len(query_str) > 0:
274 hits[entry_index][0].aln = _MakeAln(query_id,
275 hits[entry_index][0].hit_id,
276 query_str, templ_str,
277 *hits[entry_index][1])
278 return [h
for h, o
in hits]
279 header = _ParseHeaderSection(lines)
284 hits = _ParseTableOfContents(lines)
285 return header, _ParseResultBody(header.query, hits, lines)
289 Parse secondary structure information and the multiple sequence alignment
290 out of an A3M file as produced by :meth:`HHblits.BuildQueryMSA`.
292 :param a3m_file: Iterable containing the lines of the A3M file
293 :type a3m_file: iterable (e.g. an open file handle)
295 :return: Dictionary containing "ss_pred" (:class:`list`), "ss_conf"
296 (:class:`list`) and "msa" (:class:`~ost.seq.AlignmentHandle`).
298 profile_dict = dict()
304 for line
in a3m_file:
305 if len(line.rstrip()) == 0:
307 elif line.startswith(
'>ss_pred'):
310 elif line.startswith(
'>ss_conf'):
314 if state ==
'ssconf' or state ==
'msa':
316 msa_head.append(line[1:].rstrip())
318 raise IOError(
'The A3M file is missing the "ss_conf" section')
322 if state ==
'sspred':
323 pred_seq_txt += line.rstrip()
324 elif state ==
'ssconf':
325 conf_seq_txt += line.rstrip()
327 msa_seq[len(msa_seq)-1] += line.rstrip()
329 profile_dict[
'ss_pred'] = list()
330 profile_dict[
'ss_conf'] = list()
331 for i
in range(0, len(pred_seq_txt)):
332 profile_dict[
'ss_pred'].append(pred_seq_txt[i])
333 profile_dict[
'ss_conf'].append(int(conf_seq_txt[i]))
337 profile_dict[
'msa'] =
None
340 al = seq.AlignmentList()
341 for i
in range(1, len(msa_seq)):
353 nl = seq.CreateAlignment(seq.CreateSequence(msa_head[0], qs),
354 seq.CreateSequence(msa_head[i], ts))
356 profile_dict[
'msa'] = seq.alg.MergePairwiseAlignments(\
357 al, seq.CreateSequence(msa_head[0], t))
363 Parse secondary structure information and the MSA out of an HHM profile as
364 produced by :meth:`HHblits.A3MToProfile`.
366 :param profile: Opened file handle holding the profile.
367 :type profile: :class:`file`
369 :return: Dictionary containing "ss_pred" (:class:`list`), "ss_conf"
370 (:class:`list`), "msa" (:class:`~ost.seq.AlignmentHandle`) and
371 "consensus" (:class:`~ost.seq.SequenceHandle`).
373 profile_dict = dict()
381 if len(line.rstrip()) == 0:
383 if line.rstrip() ==
'>ss_pred PSIPRED predicted secondary structure':
386 elif line.rstrip() ==
'>ss_conf PSIPRED confidence values':
389 elif line.rstrip() ==
'>Consensus':
393 if state ==
'consensus' or state ==
'msa':
395 msa_head.append(line[1:].rstrip())
397 raise IOError(
'Profile file "%s" is missing ' % profile.name+
398 'the "Consensus" section')
405 if state ==
'sspred':
406 pred_seq_txt += line.rstrip()
407 elif state ==
'ssconf':
408 conf_seq_txt += line.rstrip()
410 msa_seq[len(msa_seq)-1] += line.rstrip()
411 elif state ==
'consensus':
412 consensus_txt += line.rstrip()
414 profile_dict[
'ss_pred'] = list()
415 profile_dict[
'ss_conf'] = list()
416 for i
in range(0, len(pred_seq_txt)):
417 profile_dict[
'ss_pred'].append(pred_seq_txt[i])
418 profile_dict[
'ss_conf'].append(int(conf_seq_txt[i]))
422 profile_dict[
'msa'] =
None
425 al = seq.AlignmentList()
426 for i
in range(1, len(msa_seq)):
438 nl = seq.CreateAlignment(seq.CreateSequence(msa_head[0], qs),
439 seq.CreateSequence(msa_head[i], ts))
441 profile_dict[
'msa'] = seq.alg.MergePairwiseAlignments(\
442 al, seq.CreateSequence(msa_head[0], t))
445 profile_dict[
'consensus'] = seq.CreateSequence(
'Consensus', consensus_txt)
452 Initialise a new HHblits "search" for the given query. Query may either
453 be a :class:`~ost.seq.SequenceHandle` or a string. In the former case, the
454 query is the actual query sequence, in the latter case, the query is the
455 filename to the file containing the query.
457 :param query: Query sequence as file or sequence.
458 :type query: :class:`~ost.seq.SequenceHandle` or :class:`str`
459 :param hhsuite_root: Path to the top-level directory of your hhsuite
461 :type hhsuite_root: :class:`str`
462 :param hhblits_bin: Name of the hhblits binary. Will only be used if
463 :attr:`hhsuite_root`:file:`/bin/hhblits` does not exist.
464 :type hhblits_bin: :class:`str`
465 :param working_dir: Directory for temporary files. Will be created if not
466 present but **not** automatically deleted.
467 :type working_dir: :class:`str`
469 OUTPUT_PREFIX =
'query_hhblits'
470 def __init__(self, query, hhsuite_root, hhblits_bin=None, working_dir=None):
473 if os.path.exists(os.path.join(self.
hhsuite_root,
'bin/hhblits')):
478 explicit_file_name=hhblits_bin)
486 if not os.path.exists(working_dir):
487 os.mkdir(working_dir)
488 if isinstance(query, str):
491 if self.
filename != os.path.abspath(query):
495 '%s.fasta' % HHblits.OUTPUT_PREFIX)
499 if isinstance(query, str):
501 self.
filename = os.path.abspath(os.path.join(
510 """Builds the MSA for the query sequence.
512 This function directly uses hhblits of hhtools. While in theory it would
513 be possible to do this by PSI-blasting on our own, hhblits is supposed
514 to be faster. Also it is supposed to prevent alignment corruption. The
515 alignment corruption is caused by low-scoring terminal alignments that
516 draw the sequences found by PSI-blast away from the optimum. By removing
517 these low scoring ends, part of the alignment corruption can be
520 hhblits does **not** call PSIPRED on the MSA to predict the secondary
521 structure of the query sequence. This is done by addss.pl of hhtools.
522 The predicted secondary structure is stored together with the sequences
523 identified by hhblits.
525 The produced A3M file can be parsed by :func:`ParseA3M`. If the file was
526 already produced, hhblits is not called again and the existing file path
529 :param nrdb: Database to be align against; has to be an hhblits database
530 :type nrdb: :class:`str`
532 :param options: Dictionary of options to *hhblits*, one "-" is added in
533 front of every key. Boolean True values add flag without
534 value. Merged with default options {'cpu': 1, 'n': 1},
535 where 'n' defines the number of iterations.
536 :type options: :class:`dict`
538 :param a3m_file: a path of a3m_file to be used, optional
539 :type a3m_file: :class:`str`
541 :return: The path to the A3M file containing the MSA
545 a3m_file =
'%s.a3m' % os.path.splitext(self.
filename)[0]
546 if os.path.exists(a3m_file):
547 ost.LogInfo(
'Reusing already existing query alignment (%s)' % a3m_file)
549 ost.LogInfo(
'Using hhblits from "%s"' % self.
hhsuite_root)
550 full_nrdb = os.path.join(os.path.abspath(os.path.split(nrdb)[0]),
551 os.path.split(nrdb)[1])
556 opt_cmd, _ = _ParseOptions(opts)
557 hhblits_cmd =
'%s -e 0.001 -i %s -oa3m %s -d %s %s' % \
560 job = subprocess.Popen(hhblits_cmd, shell=
True, cwd=self.
working_dir,
561 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
562 sout, _ = job.communicate()
563 lines = sout.splitlines()
565 ost.LogVerbose(line.strip())
566 if not os.path.exists(a3m_file):
567 ost.LogWarning(
'Building query profile failed, no output')
571 'lib/hh/scripts/addss.pl'),
573 env = dict(os.environ)
574 env.update({
'PERL5LIB' : os.path.join(self.
hhsuite_root,
577 'PATH' :
'%s:%s' % (os.path.join(self.
hhsuite_root,
'bin'),
578 os.environ[
'PATH'])})
579 job = subprocess.Popen(addss_cmd, shell=
True, cwd=self.
working_dir,
580 env=env, stdout=subprocess.PIPE,
581 stderr=subprocess.PIPE)
582 sout, serr = job.communicate()
583 lines = sout.splitlines()
585 if 'error' in line.lower():
586 ost.LogWarning(
'Predicting secondary structure for MSA '+
587 '(%s) failed, on command: %s' % (a3m_file, line))
593 Converts the A3M alignment file to a hhm profile. If hhm_file is not
594 given, the output file will be set to <:attr:`a3m_file`-basename>.hhm.
596 The produced A3M file can be parsed by :func:`ParseHHM`.
598 If the file was already produced, the existing file path is returned
599 without recomputing it.
601 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
602 :type a3m_file: :class:`str`
604 :param hhm_file: Desired output file name
605 :type hhm_file: :class:`str`
607 :return: Path to the profile file
610 hhmake = os.path.join(self.
bin_dir,
'hhmake')
612 hhm_file =
'%s.hhm' % os.path.splitext(a3m_file)[0]
613 if os.path.exists(hhm_file):
615 ost.LogVerbose(
'converting %s to %s' % (a3m_file, hhm_file))
617 if subprocess.call(
'%s -i %s -o %s' % (hhmake, a3m_file, hhm_file),
619 raise IOError(
'could not convert a3m to hhm file')
622 def A3MToCS(self, a3m_file, cs_file=None, options={}):
624 Converts the A3M alignment file to a column state sequence file. If
625 cs_file is not given, the output file will be set to
626 <:attr:`a3m_file`-basename>.seq219.
628 If the file was already produced, the existing file path is returned
629 without recomputing it.
631 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
632 :type a3m_file: :class:`str`
634 :param cs_file: Output file name (may be omitted)
635 :type cs_file: :class:`str`
637 :param options: Dictionary of options to *cstranslate*, one "-" is added
638 in front of every key. Boolean True values add flag
640 :type options: :class:`dict`
642 :return: Path to the column state sequence file
645 cstranslate = os.path.join(self.
hhlib_dir,
'bin',
'cstranslate')
647 cs_file =
'%s.seq219' % os.path.splitext(a3m_file)[0]
648 if os.path.exists(cs_file):
650 opt_cmd, _ = _ParseOptions(options)
651 cs_cmd =
'%s -i %s -o %s %s' % (
653 os.path.abspath(a3m_file),
654 os.path.abspath(cs_file),
656 ost.LogVerbose(
'converting %s to %s' % (a3m_file, cs_file))
657 job = subprocess.Popen(cs_cmd, shell=
True, cwd=self.
working_dir,
658 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
659 sout, _ = job.communicate()
660 lines = sout.splitlines()
662 if 'Wrote abstract state sequence to' in line:
664 ost.LogWarning(
'Creating column state sequence file (%s) failed' % \
668 """Delete temporary data.
670 Delete temporary data if no working dir was given. Controlled by
671 :attr:`needs_cleanup`.
677 '''In case something went wrong, call to make sure everything is clean.
679 This will delete the working dir independently of :attr:`needs_cleanup`.
686 def Search(self, a3m_file, database, options={}, prefix=''):
688 Searches for templates in the given database. Before running the search,
689 the hhm file is copied. This makes it possible to launch several hhblits
690 instances at once. Upon success, the filename of the result file is
691 returned. This file may be parsed with :func:`ParseHHblitsOutput`.
693 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
694 :type a3m_file: :class:`str`
696 :param database: Search database, needs to be the common prefix of the
698 :type database: :class:`str`
700 :param options: Dictionary of options to *hhblits*, one "-" is added in
701 front of every key. Boolean True values add flag without
702 value. Merged with default options {'cpu': 1, 'n': 1},
703 where 'n' defines the number of iterations.
704 :type options: :class:`dict`
706 :param prefix: Prefix to the result file
707 :type prefix: :class:`str`
709 :return: The path to the result file
715 opt_cmd, opt_str = _ParseOptions(opts)
716 base = os.path.basename(os.path.splitext(a3m_file)[0])
717 hhr_file =
'%s%s_%s.hhr' % (prefix, base, opt_str)
718 hhr_file = os.path.join(self.
working_dir, hhr_file)
719 search_cmd =
'%s %s -e 0.001 -Z 10000 -B 10000 -i %s -o %s -d %s' % (
722 os.path.abspath(a3m_file),
724 os.path.join(os.path.abspath(os.path.split(database)[0]),
725 os.path.split(database)[1]))
726 ost.LogInfo(
'searching %s' % database)
727 ost.LogVerbose(search_cmd)
728 job = subprocess.Popen(search_cmd, shell=
True, cwd=self.
working_dir,
729 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
730 sout, serr = job.communicate()
731 if job.returncode != 0:
732 lines = sout.splitlines()
734 ost.LogError(line.strip())
735 lines = serr.splitlines()
737 ost.LogError(line.strip())
742 def _ParseOptions(opts):
744 :return: Tuple of strings (opt_cmd, opt_str), where opt_cmd can be
745 passed to command ("-" added in front of keys, options
746 separated by space) and opt_str (options separated by "_")
747 can be used for filenames.
748 :param opts: Dictionary of options, one "-" is added in front of every
749 key. Boolean True values add flag without value.
753 for k, val
in opts.iteritems():
754 if type(val) == type(
True):
756 opt_cmd.append(
'-%s' % str(k))
757 opt_str.append(str(k))
759 opt_cmd.append(
'-%s %s' % (str(k), str(val)))
760 opt_str.append(
'%s%s' % (str(k), str(val)))
761 opt_cmd =
' '.join(opt_cmd)
762 opt_str =
'_'.join(opt_str)
763 return opt_cmd, opt_str
766 __all__ = [
'HHblits',
'HHblitsHit',
'HHblitsHeader',
767 'ParseHHblitsOutput',
'ParseA3M',
'ParseHHM',
void DLLEXPORT_OST_IO SaveSequence(const seq::ConstSequenceHandle &sequence, const String &filename, const String &format="auto")