1 '''HHblits wrapper classes and functions.
11 from ost
import settings, seq
16 A hit found by HHblits
20 String identifying the hit
26 Pairwise alignment containing the aligned part between the query and the
27 target. First sequence is the query, the second sequence the target.
29 :type: :class:`~ost.seq.AlignmentHandle`
37 .. attribute:: ss_score
39 The secondary structure score
45 The E-value of the alignment
51 The P-value of the alignment
57 The probability of the alignment (between 0 and 100)
61 def __init__(self, hit_id, aln, score, ss_score, evalue, pvalue, prob):
71 """Stats from the beginning of search output.
75 The name of the query sequence
79 .. attribute:: match_columns
81 Total of aligned Match columns
87 Value of the ``-neff`` option
91 .. attribute:: searched_hmms
93 Number of profiles searched
101 :type: :class:`datetime.datetime`
103 .. attribute:: command
118 '''Fetch header content.
120 First, we seek the start of the identifier, that is, the first whitespace
121 after the hit number + 1. Since the identifier may contain whitespaces
122 itself, we cannot split the whole line
124 :param line: Line from the output header.
125 :type line: :class:`str`
127 :return: Hit information and query/template offsets
128 :rtype: (:class:`HHblitsHit`, (:class:`int`, :class:`int`))
130 for i
in range(0, len(line)):
131 if line[i].isdigit():
133 for i
in range(i, len(line)):
136 assert len(line)-i >= 31
and line[i+1] !=
' '
137 hit_id = line[i+1:i+31].strip()
138 fields = line[i+32:].split()
139 prob = float(fields[0])
140 evalue = float(fields[1])
141 pvalue = float(fields[2])
142 score = float(fields[3])
143 ss_score = float(fields[4])
144 offsets = (int(fields[6].split(
'-')[0]), int(fields[7].split(
'-')[0]))
145 return (
HHblitsHit(hit_id,
None, score, ss_score, evalue, pvalue, prob),
150 Parses the HHblits output as produced by :meth:`HHblits.Search` and returns
151 the header of the search results and a list of hits.
153 :param output: Iterable containing the lines of the HHblits output file
154 :type output: iterable (e.g. an open file handle)
156 :return: a tuple of the header of the search results and the hits
157 :rtype: (:class:`HHblitsHeader`, :class:`list` of :class:`HHblitsHit`)
160 def _ParseHeaderSection(lines):
161 value_start_column = 14
162 date_pattern =
'%a %b %d %H:%M:%S %Y'
165 assert line.startswith(
'Query')
166 header.query = line[value_start_column:].strip()
168 assert line.startswith(
'Match_columns')
169 header.match_columns = int(line[value_start_column:].strip())
172 assert line.startswith(
'No_of_seqs')
175 assert line.startswith(
'Neff')
176 header.n_eff = float(line[value_start_column:].strip())
179 assert line.startswith(
'Searched_HMMs')
180 header.searched_hmms = int(line[value_start_column:].strip())
183 assert line.startswith(
'Date')
184 value = line[value_start_column:].strip()
185 header.date = datetime.datetime.strptime(value, date_pattern)
188 assert line.startswith(
'Command')
189 header.command = line[value_start_column:].strip()
192 assert len(line.strip()) == 0
195 def _ParseTableOfContents(lines):
197 assert line.startswith(
' No Hit')
201 if len(line.strip()) == 0:
206 def _ParseResultBody(query_id, hits, lines):
210 def _MakeAln(query_id, hit_id, query_string, templ_string,
212 s1 = seq.CreateSequence(query_id, query_string)
213 s1.offset = q_offset-1
214 s2 = seq.CreateSequence(hit_id, templ_string)
215 s2.offset = t_offset-1
216 return seq.CreateAlignment(s1, s2)
226 if len(line.strip()) == 0:
228 if line.startswith(
'Done!'):
229 if len(query_str) > 0:
230 hits[entry_index][0].aln = _MakeAln(\
231 query_id, hits[entry_index][0].hit_id,
232 query_str, templ_str, *hits[entry_index][1])
233 return [h
for h, o
in hits]
234 if line.startswith(
'No '):
235 if len(query_str) > 0:
236 hits[entry_index][0].aln = _MakeAln(\
237 query_id, hits[entry_index][0].hit_id,
238 query_str, templ_str, *hits[entry_index][1])
239 entry_index = int(line[3:].strip())-1
241 hits[entry_index][0].hit_id = line[1:].strip()
248 assert entry_index !=
None
250 if line[1:].startswith(
' Consensus'):
252 if line[1:].startswith(
' ss_pred'):
254 if line[1:].startswith(
' ss_conf'):
256 if line[1:].startswith(
' ss_dssp'):
258 if line.startswith(
'T '):
259 for start_pos
in range(22, len(line)):
260 if line[start_pos].isalpha()
or line[start_pos] ==
'-':
262 end_pos = line.find(
' ', start_pos)
265 error_str =
"Unparsable line '%s' for entry No %d" \
266 % (line.strip(), entry_index + 1)
267 raise AssertionError(error_str)
268 templ_str += line[start_pos:end_pos]
269 if line.startswith(
'Q '):
270 for start_pos
in range(22, len(line)):
271 if line[start_pos].isalpha()
or line[start_pos] ==
'-':
273 end_pos = line.find(
' ', start_pos)
276 error_str =
"Unparsable line '%s' for entry No %d" \
277 % (line.strip(), entry_index + 1)
278 raise AssertionError(error_str)
279 query_str += line[start_pos:end_pos]
280 except StopIteration:
281 if len(query_str) > 0:
282 hits[entry_index][0].aln = _MakeAln(query_id,
283 hits[entry_index][0].hit_id,
284 query_str, templ_str,
285 *hits[entry_index][1])
286 return [h
for h, o
in hits]
287 header = _ParseHeaderSection(lines)
292 hits = _ParseTableOfContents(lines)
293 return header, _ParseResultBody(header.query, hits, lines)
297 Parse secondary structure information and the multiple sequence alignment
298 out of an A3M file as produced by :meth:`HHblits.BuildQueryMSA`.
300 :param a3m_file: Iterable containing the lines of the A3M file
301 :type a3m_file: iterable (e.g. an open file handle)
303 :return: Dictionary containing "ss_pred" (:class:`list`), "ss_conf"
304 (:class:`list`) and "msa" (:class:`~ost.seq.AlignmentHandle`).
305 If not available, "ss_pred" and "ss_conf" entries are set to None.
307 profile_dict = dict()
313 for line
in a3m_file:
314 if len(line.rstrip()) == 0:
316 elif line.startswith(
'>ss_pred'):
319 elif line.startswith(
'>ss_conf'):
324 msa_head.append(line[1:].rstrip())
328 if state ==
'sspred':
329 pred_seq_txt += line.rstrip()
330 elif state ==
'ssconf':
331 conf_seq_txt += line.rstrip()
333 msa_seq[len(msa_seq)-1] += line.rstrip()
335 if len(pred_seq_txt) > 0:
336 profile_dict[
'ss_pred'] = list()
337 profile_dict[
'ss_conf'] = list()
338 for i
in range(0, len(pred_seq_txt)):
339 profile_dict[
'ss_pred'].append(pred_seq_txt[i])
340 profile_dict[
'ss_conf'].append(int(conf_seq_txt[i]))
342 profile_dict[
'ss_pred'] =
None
343 profile_dict[
'ss_conf'] =
None
347 profile_dict[
'msa'] =
None
350 al = seq.AlignmentList()
351 for i
in range(1, len(msa_seq)):
363 nl = seq.CreateAlignment(seq.CreateSequence(msa_head[0], qs),
364 seq.CreateSequence(msa_head[i], ts))
366 profile_dict[
'msa'] = seq.alg.MergePairwiseAlignments(\
367 al, seq.CreateSequence(msa_head[0], t))
373 Parse secondary structure information and the MSA out of an HHM profile as
374 produced by :meth:`HHblits.A3MToProfile`.
376 :param profile: Opened file handle holding the profile.
377 :type profile: :class:`file`
379 :return: Dictionary containing "ss_pred" (:class:`list`), "ss_conf"
380 (:class:`list`), "msa" (:class:`~ost.seq.AlignmentHandle`) and
381 "consensus" (:class:`~ost.seq.SequenceHandle`).
382 If not available, "ss_pred" and "ss_conf" entries are set to None.
384 profile_dict = dict()
392 if len(line.rstrip()) == 0:
394 if line.rstrip() ==
'>ss_pred PSIPRED predicted secondary structure':
397 elif line.rstrip() ==
'>ss_conf PSIPRED confidence values':
400 elif line.rstrip() ==
'>Consensus':
404 if state ==
'consensus' or state ==
'msa':
406 msa_head.append(line[1:].rstrip())
408 raise IOError(
'Profile file "%s" is missing ' % profile.name+
409 'the "Consensus" section')
416 if state ==
'sspred':
417 pred_seq_txt += line.rstrip()
418 elif state ==
'ssconf':
419 conf_seq_txt += line.rstrip()
421 msa_seq[len(msa_seq)-1] += line.rstrip()
422 elif state ==
'consensus':
423 consensus_txt += line.rstrip()
425 if len(pred_seq_txt) > 0:
426 profile_dict[
'ss_pred'] = list()
427 profile_dict[
'ss_conf'] = list()
428 for i
in range(0, len(pred_seq_txt)):
429 profile_dict[
'ss_pred'].append(pred_seq_txt[i])
430 profile_dict[
'ss_conf'].append(int(conf_seq_txt[i]))
432 profile_dict[
'ss_pred'] =
None
433 profile_dict[
'ss_conf'] =
None
437 profile_dict[
'msa'] =
None
440 al = seq.AlignmentList()
441 for i
in range(1, len(msa_seq)):
453 nl = seq.CreateAlignment(seq.CreateSequence(msa_head[0], qs),
454 seq.CreateSequence(msa_head[i], ts))
456 profile_dict[
'msa'] = seq.alg.MergePairwiseAlignments(\
457 al, seq.CreateSequence(msa_head[0], t))
459 profile_dict[
'consensus'] = seq.CreateSequence(
'Consensus', consensus_txt)
466 Initialise a new HHblits "search" for the given query. Query may either
467 be a :class:`~ost.seq.SequenceHandle` or a string. In the former case, the
468 query is the actual query sequence, in the latter case, the query is the
469 filename to the file containing the query.
471 :param query: Query sequence as file or sequence.
472 :type query: :class:`~ost.seq.SequenceHandle` or :class:`str`
473 :param hhsuite_root: Path to the top-level directory of your hhsuite
475 :type hhsuite_root: :class:`str`
476 :param hhblits_bin: Name of the hhblits binary. Will only be used if
477 :attr:`hhsuite_root`:file:`/bin/hhblits` does not exist.
478 :type hhblits_bin: :class:`str`
479 :param working_dir: Directory for temporary files. Will be created if not
480 present but **not** automatically deleted.
481 :type working_dir: :class:`str`
483 OUTPUT_PREFIX =
'query_hhblits'
484 def __init__(self, query, hhsuite_root, hhblits_bin=None, working_dir=None):
487 if os.path.exists(os.path.join(self.
hhsuite_roothhsuite_root,
'bin/hhblits')):
491 self.
hhblits_binhhblits_bin = settings.Locate(
'hhblits',
492 explicit_file_name=hhblits_bin)
500 if not os.path.exists(working_dir):
501 os.mkdir(working_dir)
502 if isinstance(query, str):
503 self.
filenamefilename = os.path.abspath(os.path.join(
504 self.
working_dirworking_dir, os.path.basename(query)))
505 if self.
filenamefilename != os.path.abspath(query):
506 shutil.copy(query, self.
filenamefilename)
509 '%s.fasta' % HHblits.OUTPUT_PREFIX))
513 if isinstance(query, str):
515 self.
filenamefilename = os.path.abspath(os.path.join(
516 self.
working_dirworking_dir, os.path.basename(query)))
517 shutil.copy(query, self.
filenamefilename)
521 self.
filenamefilename = tmp_dir.files[0]
524 """Builds the MSA for the query sequence.
526 The produced A3M file can be parsed by :func:`ParseA3M`. If the file was
527 already produced, hhblits is not called again and the existing file path
528 is returned (neglecting the *assign_ss* flag!!!).
530 :param nrdb: Database to be align against; has to be an hhblits database
531 :type nrdb: :class:`str`
533 :param options: Dictionary of options to *hhblits*, one "-" is added in
534 front of every key. Boolean True values add flag without
535 value. Merged with default options
536 {'cpu': 1, 'n': 1, 'e': 0.001}, where 'n' defines the
537 number of iterations and 'e' the E-value cutoff for
538 inclusion of sequences in result alignment.
539 :type options: :class:`dict`
541 :param a3m_file: a path of a3m_file to be used, optional
542 :type a3m_file: :class:`str`
544 :param assign_ss: HHblits does not assign predicted secondary structure
545 by default. You can optionally assign it with the
546 addss.pl script provided by the HH-suite. However,
547 your HH-suite installation requires you to specify
548 paths to PSIRED etc. We refer to the HH-suite user
549 guide for further instructions. Assignment is done
550 by calling :func:`HHblits.AssignSSToA3M`
552 :type assign_ss: :class:`bool`
554 :return: The path to the A3M file containing the MSA
558 a3m_file =
'%s.a3m' % os.path.splitext(self.
filenamefilename)[0]
560 a3m_file = os.path.abspath(a3m_file)
561 if os.path.exists(a3m_file):
562 ost.LogInfo(
'Reusing already existing query alignment (%s)' % a3m_file)
564 ost.LogInfo(
'Using hhblits from "%s"' % self.
hhsuite_roothhsuite_root)
565 full_nrdb = os.path.join(os.path.abspath(os.path.split(nrdb)[0]),
566 os.path.split(nrdb)[1])
572 opt_cmd, _ = _ParseOptions(opts)
573 hhblits_cmd =
'%s -i %s -oa3m %s -d %s %s' % \
577 p = subprocess.run(hhblits_cmd, shell=
True, cwd=self.
working_dirworking_dir,
578 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
580 lines = p.stdout.decode().splitlines()
582 ost.LogVerbose(line.strip())
584 lines = p.stderr.decode().splitlines()
586 ost.LogError(line.strip())
588 if not os.path.exists(a3m_file):
589 raise RuntimeError(
'Building query profile failed, no output')
598 HHblits does not assign predicted secondary structure by default.
599 You can optionally assign it with the addss.pl script provided by the
600 HH-suite. However, your HH-suite installation requires you to specify
601 paths to PSIRED etc. We refer to the HH-suite user guide for further
604 :param a3m_file: Path to file you want to assign secondary structure to
605 :type a3m_file: :class:`str`
608 a3m_file = os.path.abspath(a3m_file)
609 addss_cmd =
"perl %s %s" % (os.path.join(self.
hhsuite_roothhsuite_root,
612 env = dict(os.environ)
613 env.update({
'PERL5LIB' : os.path.join(self.
hhsuite_roothhsuite_root,
'scripts'),
615 'PATH' :
'%s:%s' % (os.path.join(self.
hhsuite_roothhsuite_root,
'bin'),
616 os.environ[
'PATH'])})
618 p = subprocess.run(addss_cmd, shell=
True, cwd=self.
working_dirworking_dir,
619 env=env, stdout=subprocess.PIPE,
620 stderr=subprocess.PIPE)
622 lines = p.stdout.decode().splitlines()
624 ost.LogVerbose(line.strip())
625 if 'error' in line.lower()
or 'bad interpreter' in line.lower():
626 raise RuntimeError(
'Predicting secondary structure for MSA '+
627 '(%s) failed, on command: %s' % (a3m_file, line))
629 lines = p.stderr.decode().splitlines()
631 ost.LogError(line.strip())
632 if 'error' in line.lower()
or 'bad interpreter' in line.lower():
633 raise RuntimeError(
'Predicting secondary structure for MSA '+
634 '(%s) failed, on command: %s' % (a3m_file, line))
640 Converts the A3M alignment file to a hhm profile. If hhm_file is not
641 given, the output file will be set to <:attr:`a3m_file`-basename>.hhm.
643 The produced HHM file can be parsed by :func:`ParseHHM`.
645 If the file was already produced, the existing file path is returned
646 without recomputing it.
648 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
649 :type a3m_file: :class:`str`
651 :param hhm_file: Desired output file name
652 :type hhm_file: :class:`str`
654 :return: Path to the profile file
657 hhmake = os.path.join(self.
bin_dirbin_dir,
'hhmake')
659 hhm_file =
'%s.hhm' % os.path.splitext(a3m_file)[0]
660 if os.path.exists(hhm_file):
662 ost.LogVerbose(
'converting %s to %s' % (a3m_file, hhm_file))
663 p = subprocess.run(
'%s -i %s -o %s' % (hhmake, a3m_file, hhm_file),
664 shell=
True, stdout=subprocess.PIPE,
665 stderr=subprocess.PIPE)
666 lines = p.stdout.decode().splitlines()
668 ost.LogVerbose(line.strip())
669 lines = p.stderr.decode().splitlines()
671 ost.LogError(line.strip())
673 if p.returncode != 0:
674 raise IOError(
'could not convert a3m to hhm file')
676 if not os.path.exists(hhm_file):
677 raise RuntimeError(
'could not convert a3m to hhm file, no output')
681 def A3MToCS(self, a3m_file, cs_file=None, options={}):
683 Converts the A3M alignment file to a column state sequence file. If
684 cs_file is not given, the output file will be set to
685 <:attr:`a3m_file`-basename>.seq219.
687 If the file was already produced, the existing file path is returned
688 without recomputing it.
690 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
691 :type a3m_file: :class:`str`
693 :param cs_file: Output file name (may be omitted)
694 :type cs_file: :class:`str`
696 :param options: Dictionary of options to *cstranslate*, one "-" is added
697 in front of every key. Boolean True values add flag
699 :type options: :class:`dict`
701 :return: Path to the column state sequence file
704 cstranslate = os.path.join(self.
bin_dirbin_dir,
'cstranslate')
706 cs_file =
'%s.seq219' % os.path.splitext(a3m_file)[0]
707 if os.path.exists(cs_file):
709 opt_cmd, _ = _ParseOptions(options)
710 cs_cmd =
'%s -i %s -o %s %s' % (
712 os.path.abspath(a3m_file),
713 os.path.abspath(cs_file),
715 ost.LogVerbose(
'converting %s to %s' % (a3m_file, cs_file))
716 p = subprocess.run(cs_cmd, shell=
True, cwd=self.
working_dirworking_dir,
717 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
719 if not os.path.exists(cs_file):
720 raise RuntimeError(
'Creating column state sequence file failed, ' +
723 if b
'Wrote abstract state sequence to' in p.stdout:
726 raise RuntimeError(
'Creating column state sequence file failed')
729 """Delete temporary data.
731 Delete temporary data if no working dir was given. Controlled by
732 :attr:`needs_cleanup`.
738 '''In case something went wrong, call to make sure everything is clean.
740 This will delete the working dir independently of :attr:`needs_cleanup`.
747 def Search(self, a3m_file, database, options={}, prefix=''):
749 Searches for templates in the given database. Before running the search,
750 the hhm file is copied. This makes it possible to launch several hhblits
751 instances at once. Upon success, the filename of the result file is
752 returned. This file may be parsed with :func:`ParseHHblitsOutput`.
754 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
755 :type a3m_file: :class:`str`
757 :param database: Search database, needs to be the common prefix of the
759 :type database: :class:`str`
761 :param options: Dictionary of options to *hhblits*, one "-" is added in
762 front of every key. Boolean True values add flag without
763 value. Merged with default options {'cpu': 1, 'n': 1},
764 where 'n' defines the number of iterations.
765 :type options: :class:`dict`
767 :param prefix: Prefix to the result file
768 :type prefix: :class:`str`
770 :return: The path to the result file
776 opt_cmd, opt_str = _ParseOptions(opts)
777 base = os.path.basename(os.path.splitext(a3m_file)[0])
778 hhr_file =
'%s%s_%s.hhr' % (prefix, base, opt_str)
779 hhr_file = os.path.join(self.
working_dirworking_dir, hhr_file)
780 search_cmd =
'%s %s -e 0.001 -Z 10000 -B 10000 -i %s -o %s -d %s' % (
783 os.path.abspath(a3m_file),
784 os.path.abspath(hhr_file),
785 os.path.join(os.path.abspath(os.path.split(database)[0]),
786 os.path.split(database)[1]))
787 ost.LogInfo(
'searching %s' % database)
788 ost.LogVerbose(search_cmd)
789 p = subprocess.run(search_cmd, shell=
True, cwd=self.
working_dirworking_dir,
790 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
791 lines = p.stdout.decode().splitlines()
793 ost.LogVerbose(line.strip())
794 lines = p.stderr.decode().splitlines()
796 ost.LogError(line.strip())
798 if p.returncode != 0:
799 raise RuntimeError(
'Sequence search failed')
801 if not os.path.exists(hhr_file):
802 raise RuntimeError(
'Sequence search failed, no output')
807 def _ParseOptions(opts):
809 :return: Tuple of strings (opt_cmd, opt_str), where opt_cmd can be
810 passed to command ("-" added in front of keys, options
811 separated by space) and opt_str (options separated by "_")
812 can be used for filenames.
813 :param opts: Dictionary of options, one "-" is added in front of every
814 key. Boolean True values add flag without value.
818 for k, val
in opts.items():
819 if type(val) == type(
True):
821 opt_cmd.append(
'-%s' % str(k))
822 opt_str.append(str(k))
824 opt_cmd.append(
'-%s %s' % (str(k), str(val)))
825 opt_str.append(
'%s%s' % (str(k), str(val)))
826 opt_cmd =
' '.join(opt_cmd)
827 opt_str =
'_'.join(opt_str)
828 return opt_cmd, opt_str
831 __all__ = [
'HHblits',
'HHblitsHit',
'HHblitsHeader',
832 'ParseHHblitsOutput',
'ParseA3M',
'ParseHHM',
def __init__(self, hit_id, aln, score, ss_score, evalue, pvalue, prob)
def A3MToCS(self, a3m_file, cs_file=None, options={})
def AssignSSToA3M(self, a3m_file)
def __init__(self, query, hhsuite_root, hhblits_bin=None, working_dir=None)
def Search(self, a3m_file, database, options={}, prefix='')
def BuildQueryMSA(self, nrdb, options={}, a3m_file=None, assign_ss=True)
def A3MToProfile(self, a3m_file, hhm_file=None)
def ParseHHblitsOutput(output)
def ParseHeaderLine(line)
void DLLEXPORT_OST_IO SaveSequence(const seq::ConstSequenceHandle &sequence, const String &filename, const String &format="auto")