1 '''HHblits wrapper classes and functions.
11 from ost
import settings, seq
16 A hit found by HHblits
20 String identifying the hit
26 Pairwise alignment containing the aligned part between the query and the
27 target. First sequence is the query, the second sequence the target.
29 :type: :class:`~ost.seq.AlignmentHandle`
37 .. attribute:: ss_score
39 The secondary structure score
45 The E-value of the alignment
51 The P-value of the alignment
57 The probability of the alignment (between 0 and 100)
61 def __init__(self, hit_id, aln, score, ss_score, evalue, pvalue, prob):
71 """Stats from the beginning of search output.
75 The name of the query sequence
79 .. attribute:: match_columns
81 Total of aligned Match columns
87 Value of the ``-neff`` option
91 .. attribute:: searched_hmms
93 Number of profiles searched
101 :type: :class:`datetime.datetime`
103 .. attribute:: command
118 '''Fetch header content.
120 First, we seek the start of the identifier, that is, the first whitespace
121 after the hit number + 1. Since the identifier may contain whitespaces
122 itself, we cannot split the whole line
124 :param line: Line from the output header.
125 :type line: :class:`str`
127 :return: Hit information and query/template offsets
128 :rtype: (:class:`HHblitsHit`, (:class:`int`, :class:`int`))
130 for i
in range(0, len(line)):
131 if line[i].isdigit():
133 for i
in range(i, len(line)):
136 assert len(line)-i >= 31
and line[i+1] !=
' '
137 hit_id = line[i+1:i+31].strip()
138 fields = line[i+32:].split()
139 prob = float(fields[0])
140 evalue = float(fields[1])
141 pvalue = float(fields[2])
142 score = float(fields[3])
143 ss_score = float(fields[4])
144 offsets = (int(fields[6].split(
'-')[0]), int(fields[7].split(
'-')[0]))
145 return (
HHblitsHit(hit_id,
None, score, ss_score, evalue, pvalue, prob),
150 Parses the HHblits output as produced by :meth:`HHblits.Search` and returns
151 the header of the search results and a list of hits.
153 :param output: Iterable containing the lines of the HHblits output file
154 :type output: iterable (e.g. an open file handle)
156 :return: a tuple of the header of the search results and the hits
157 :rtype: (:class:`HHblitsHeader`, :class:`list` of :class:`HHblitsHit`)
160 def _ParseHeaderSection(lines):
161 value_start_column = 14
162 date_pattern =
'%a %b %d %H:%M:%S %Y'
165 assert line.startswith(
'Query')
166 header.query = line[value_start_column:].strip()
168 assert line.startswith(
'Match_columns')
169 header.match_columns = int(line[value_start_column:].strip())
172 assert line.startswith(
'No_of_seqs')
175 assert line.startswith(
'Neff')
176 header.n_eff = float(line[value_start_column:].strip())
179 assert line.startswith(
'Searched_HMMs')
180 header.searched_hmms = int(line[value_start_column:].strip())
183 assert line.startswith(
'Date')
184 value = line[value_start_column:].strip()
185 header.date = datetime.datetime.strptime(value, date_pattern)
188 assert line.startswith(
'Command')
189 header.command = line[value_start_column:].strip()
192 assert len(line.strip()) == 0
195 def _ParseTableOfContents(lines):
197 assert line.startswith(
' No Hit')
201 if len(line.strip()) == 0:
206 def _ParseResultBody(query_id, hits, lines):
210 def _MakeAln(query_id, hit_id, query_string, templ_string,
212 s1 = seq.CreateSequence(query_id, query_string)
213 s1.offset = q_offset-1
214 s2 = seq.CreateSequence(hit_id, templ_string)
215 s2.offset = t_offset-1
216 return seq.CreateAlignment(s1, s2)
226 if len(line.strip()) == 0:
228 if line.startswith(
'Done!'):
229 if len(query_str) > 0:
230 hits[entry_index][0].aln = _MakeAln(\
231 query_id, hits[entry_index][0].hit_id,
232 query_str, templ_str, *hits[entry_index][1])
233 return [h
for h, o
in hits]
234 if line.startswith(
'No '):
235 if len(query_str) > 0:
236 hits[entry_index][0].aln = _MakeAln(\
237 query_id, hits[entry_index][0].hit_id,
238 query_str, templ_str, *hits[entry_index][1])
239 entry_index = int(line[3:].strip())-1
241 hits[entry_index][0].hit_id = line[1:].strip()
248 assert entry_index !=
None
250 if line[1:].startswith(
' Consensus'):
252 if line[1:].startswith(
' ss_pred'):
254 if line[1:].startswith(
' ss_conf'):
256 if line[1:].startswith(
' ss_dssp'):
258 if line.startswith(
'T '):
259 for start_pos
in range(22, len(line)):
260 if line[start_pos].isalpha()
or line[start_pos] ==
'-':
262 end_pos = line.find(
' ', start_pos)
265 error_str =
"Unparsable line '%s' for entry No %d" \
266 % (line.strip(), entry_index + 1)
267 raise AssertionError(error_str)
268 templ_str += line[start_pos:end_pos]
269 if line.startswith(
'Q '):
270 for start_pos
in range(22, len(line)):
271 if line[start_pos].isalpha()
or line[start_pos] ==
'-':
273 end_pos = line.find(
' ', start_pos)
276 error_str =
"Unparsable line '%s' for entry No %d" \
277 % (line.strip(), entry_index + 1)
278 raise AssertionError(error_str)
279 query_str += line[start_pos:end_pos]
280 except StopIteration:
281 if len(query_str) > 0:
282 hits[entry_index][0].aln = _MakeAln(query_id,
283 hits[entry_index][0].hit_id,
284 query_str, templ_str,
285 *hits[entry_index][1])
286 return [h
for h, o
in hits]
287 header = _ParseHeaderSection(lines)
292 hits = _ParseTableOfContents(lines)
293 return header, _ParseResultBody(header.query, hits, lines)
297 Parse secondary structure information and the multiple sequence alignment
298 out of an A3M file as produced by :meth:`HHblits.BuildQueryMSA`.
300 :param a3m_file: Iterable containing the lines of the A3M file
301 :type a3m_file: iterable (e.g. an open file handle)
303 :return: Dictionary containing "ss_pred" (:class:`list`), "ss_conf"
304 (:class:`list`) and "msa" (:class:`~ost.seq.AlignmentHandle`).
306 profile_dict = dict()
312 for line
in a3m_file:
313 if len(line.rstrip()) == 0:
315 elif line.startswith(
'>ss_pred'):
318 elif line.startswith(
'>ss_conf'):
322 if state ==
'ssconf' or state ==
'msa':
324 msa_head.append(line[1:].rstrip())
326 raise IOError(
'The A3M file is missing the "ss_conf" section')
330 if state ==
'sspred':
331 pred_seq_txt += line.rstrip()
332 elif state ==
'ssconf':
333 conf_seq_txt += line.rstrip()
335 msa_seq[len(msa_seq)-1] += line.rstrip()
337 profile_dict[
'ss_pred'] = list()
338 profile_dict[
'ss_conf'] = list()
339 for i
in range(0, len(pred_seq_txt)):
340 profile_dict[
'ss_pred'].append(pred_seq_txt[i])
341 profile_dict[
'ss_conf'].append(int(conf_seq_txt[i]))
345 profile_dict[
'msa'] =
None
348 al = seq.AlignmentList()
349 for i
in range(1, len(msa_seq)):
361 nl = seq.CreateAlignment(seq.CreateSequence(msa_head[0], qs),
362 seq.CreateSequence(msa_head[i], ts))
364 profile_dict[
'msa'] = seq.alg.MergePairwiseAlignments(\
365 al, seq.CreateSequence(msa_head[0], t))
371 Parse secondary structure information and the MSA out of an HHM profile as
372 produced by :meth:`HHblits.A3MToProfile`.
374 :param profile: Opened file handle holding the profile.
375 :type profile: :class:`file`
377 :return: Dictionary containing "ss_pred" (:class:`list`), "ss_conf"
378 (:class:`list`), "msa" (:class:`~ost.seq.AlignmentHandle`) and
379 "consensus" (:class:`~ost.seq.SequenceHandle`).
381 profile_dict = dict()
389 if len(line.rstrip()) == 0:
391 if line.rstrip() ==
'>ss_pred PSIPRED predicted secondary structure':
394 elif line.rstrip() ==
'>ss_conf PSIPRED confidence values':
397 elif line.rstrip() ==
'>Consensus':
401 if state ==
'consensus' or state ==
'msa':
403 msa_head.append(line[1:].rstrip())
405 raise IOError(
'Profile file "%s" is missing ' % profile.name+
406 'the "Consensus" section')
413 if state ==
'sspred':
414 pred_seq_txt += line.rstrip()
415 elif state ==
'ssconf':
416 conf_seq_txt += line.rstrip()
418 msa_seq[len(msa_seq)-1] += line.rstrip()
419 elif state ==
'consensus':
420 consensus_txt += line.rstrip()
422 profile_dict[
'ss_pred'] = list()
423 profile_dict[
'ss_conf'] = list()
424 for i
in range(0, len(pred_seq_txt)):
425 profile_dict[
'ss_pred'].append(pred_seq_txt[i])
426 profile_dict[
'ss_conf'].append(int(conf_seq_txt[i]))
430 profile_dict[
'msa'] =
None
433 al = seq.AlignmentList()
434 for i
in range(1, len(msa_seq)):
446 nl = seq.CreateAlignment(seq.CreateSequence(msa_head[0], qs),
447 seq.CreateSequence(msa_head[i], ts))
449 profile_dict[
'msa'] = seq.alg.MergePairwiseAlignments(\
450 al, seq.CreateSequence(msa_head[0], t))
453 profile_dict[
'consensus'] = seq.CreateSequence(
'Consensus', consensus_txt)
460 Initialise a new HHblits "search" for the given query. Query may either
461 be a :class:`~ost.seq.SequenceHandle` or a string. In the former case, the
462 query is the actual query sequence, in the latter case, the query is the
463 filename to the file containing the query.
465 :param query: Query sequence as file or sequence.
466 :type query: :class:`~ost.seq.SequenceHandle` or :class:`str`
467 :param hhsuite_root: Path to the top-level directory of your hhsuite
469 :type hhsuite_root: :class:`str`
470 :param hhblits_bin: Name of the hhblits binary. Will only be used if
471 :attr:`hhsuite_root`:file:`/bin/hhblits` does not exist.
472 :type hhblits_bin: :class:`str`
473 :param working_dir: Directory for temporary files. Will be created if not
474 present but **not** automatically deleted.
475 :type working_dir: :class:`str`
477 OUTPUT_PREFIX =
'query_hhblits'
478 def __init__(self, query, hhsuite_root, hhblits_bin=None, working_dir=None):
481 if os.path.exists(os.path.join(self.
hhsuite_roothhsuite_root,
'bin/hhblits')):
485 self.
hhblits_binhhblits_bin = settings.Locate(
'hhblits',
486 explicit_file_name=hhblits_bin)
494 if not os.path.exists(working_dir):
495 os.mkdir(working_dir)
496 if isinstance(query, str):
497 self.
filenamefilename = os.path.abspath(os.path.join(
498 self.
working_dirworking_dir, os.path.basename(query)))
499 if self.
filenamefilename != os.path.abspath(query):
500 shutil.copy(query, self.
filenamefilename)
503 '%s.fasta' % HHblits.OUTPUT_PREFIX))
507 if isinstance(query, str):
509 self.
filenamefilename = os.path.abspath(os.path.join(
510 self.
working_dirworking_dir, os.path.basename(query)))
511 shutil.copy(query, self.
filenamefilename)
515 self.
filenamefilename = tmp_dir.files[0]
518 """Builds the MSA for the query sequence.
520 This function directly uses hhblits of hhtools. While in theory it would
521 be possible to do this by PSI-blasting on our own, hhblits is supposed
522 to be faster. Also it is supposed to prevent alignment corruption. The
523 alignment corruption is caused by low-scoring terminal alignments that
524 draw the sequences found by PSI-blast away from the optimum. By removing
525 these low scoring ends, part of the alignment corruption can be
528 hhblits does **not** call PSIPRED on the MSA to predict the secondary
529 structure of the query sequence. This is done by addss.pl of hhtools.
530 The predicted secondary structure is stored together with the sequences
531 identified by hhblits.
533 The produced A3M file can be parsed by :func:`ParseA3M`. If the file was
534 already produced, hhblits is not called again and the existing file path
537 :param nrdb: Database to be align against; has to be an hhblits database
538 :type nrdb: :class:`str`
540 :param options: Dictionary of options to *hhblits*, one "-" is added in
541 front of every key. Boolean True values add flag without
542 value. Merged with default options {'cpu': 1, 'n': 1},
543 where 'n' defines the number of iterations.
544 :type options: :class:`dict`
546 :param a3m_file: a path of a3m_file to be used, optional
547 :type a3m_file: :class:`str`
549 :return: The path to the A3M file containing the MSA
553 a3m_file =
'%s.a3m' % os.path.splitext(self.
filenamefilename)[0]
555 a3m_file = os.path.abspath(a3m_file)
556 if os.path.exists(a3m_file):
557 ost.LogInfo(
'Reusing already existing query alignment (%s)' % a3m_file)
559 ost.LogInfo(
'Using hhblits from "%s"' % self.
hhsuite_roothhsuite_root)
560 full_nrdb = os.path.join(os.path.abspath(os.path.split(nrdb)[0]),
561 os.path.split(nrdb)[1])
566 opt_cmd, _ = _ParseOptions(opts)
567 hhblits_cmd =
'%s -e 0.001 -i %s -oa3m %s -d %s %s' % \
570 job = subprocess.Popen(hhblits_cmd, shell=
True, cwd=self.
working_dirworking_dir,
571 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
572 sout, _ = job.communicate()
573 lines = sout.decode().splitlines()
575 ost.LogVerbose(line.strip())
576 if not os.path.exists(a3m_file):
577 ost.LogWarning(
'Building query profile failed, no output')
580 addss_cmd =
"%s %s" % (os.path.join(self.
hhsuite_roothhsuite_root,
581 'lib/hh/scripts/addss.pl'),
583 env = dict(os.environ)
584 env.update({
'PERL5LIB' : os.path.join(self.
hhsuite_roothhsuite_root,
587 'PATH' :
'%s:%s' % (os.path.join(self.
hhsuite_roothhsuite_root,
'bin'),
588 os.environ[
'PATH'])})
589 job = subprocess.Popen(addss_cmd, shell=
True, cwd=self.
working_dirworking_dir,
590 env=env, stdout=subprocess.PIPE,
591 stderr=subprocess.PIPE)
592 sout, serr = job.communicate()
593 lines = sout.decode().splitlines()
595 if 'error' in line.lower():
596 ost.LogWarning(
'Predicting secondary structure for MSA '+
597 '(%s) failed, on command: %s' % (a3m_file, line))
603 Converts the A3M alignment file to a hhm profile. If hhm_file is not
604 given, the output file will be set to <:attr:`a3m_file`-basename>.hhm.
606 The produced HHM file can be parsed by :func:`ParseHHM`.
608 If the file was already produced, the existing file path is returned
609 without recomputing it.
611 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
612 :type a3m_file: :class:`str`
614 :param hhm_file: Desired output file name
615 :type hhm_file: :class:`str`
617 :return: Path to the profile file
620 hhmake = os.path.join(self.
bin_dirbin_dir,
'hhmake')
622 hhm_file =
'%s.hhm' % os.path.splitext(a3m_file)[0]
623 if os.path.exists(hhm_file):
625 ost.LogVerbose(
'converting %s to %s' % (a3m_file, hhm_file))
626 os.putenv(
'HHLIB', self.
hhlib_dirhhlib_dir)
627 job = subprocess.Popen(
'%s -i %s -o %s' % (hhmake, a3m_file, hhm_file),
628 shell=
True, stdout=subprocess.PIPE,
629 stderr=subprocess.PIPE)
630 sout, serr = job.communicate()
631 lines = serr.decode().splitlines()
634 lines = sout.decode().splitlines()
637 if job.returncode !=0:
638 raise IOError(
'could not convert a3m to hhm file')
641 def A3MToCS(self, a3m_file, cs_file=None, options={}):
643 Converts the A3M alignment file to a column state sequence file. If
644 cs_file is not given, the output file will be set to
645 <:attr:`a3m_file`-basename>.seq219.
647 If the file was already produced, the existing file path is returned
648 without recomputing it.
650 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
651 :type a3m_file: :class:`str`
653 :param cs_file: Output file name (may be omitted)
654 :type cs_file: :class:`str`
656 :param options: Dictionary of options to *cstranslate*, one "-" is added
657 in front of every key. Boolean True values add flag
659 :type options: :class:`dict`
661 :return: Path to the column state sequence file
664 cstranslate = os.path.join(self.
hhlib_dirhhlib_dir,
'bin',
'cstranslate')
666 cs_file =
'%s.seq219' % os.path.splitext(a3m_file)[0]
667 if os.path.exists(cs_file):
669 opt_cmd, _ = _ParseOptions(options)
670 cs_cmd =
'%s -i %s -o %s %s' % (
672 os.path.abspath(a3m_file),
673 os.path.abspath(cs_file),
675 ost.LogVerbose(
'converting %s to %s' % (a3m_file, cs_file))
676 job = subprocess.Popen(cs_cmd, shell=
True, cwd=self.
working_dirworking_dir,
677 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
678 sout, _ = job.communicate()
679 if b
'Wrote abstract state sequence to' in sout:
682 ost.LogWarning(
'Creating column state sequence file (%s) failed' % \
686 """Delete temporary data.
688 Delete temporary data if no working dir was given. Controlled by
689 :attr:`needs_cleanup`.
695 '''In case something went wrong, call to make sure everything is clean.
697 This will delete the working dir independently of :attr:`needs_cleanup`.
704 def Search(self, a3m_file, database, options={}, prefix=''):
706 Searches for templates in the given database. Before running the search,
707 the hhm file is copied. This makes it possible to launch several hhblits
708 instances at once. Upon success, the filename of the result file is
709 returned. This file may be parsed with :func:`ParseHHblitsOutput`.
711 :param a3m_file: Path to input MSA as produced by :meth:`BuildQueryMSA`
712 :type a3m_file: :class:`str`
714 :param database: Search database, needs to be the common prefix of the
716 :type database: :class:`str`
718 :param options: Dictionary of options to *hhblits*, one "-" is added in
719 front of every key. Boolean True values add flag without
720 value. Merged with default options {'cpu': 1, 'n': 1},
721 where 'n' defines the number of iterations.
722 :type options: :class:`dict`
724 :param prefix: Prefix to the result file
725 :type prefix: :class:`str`
727 :return: The path to the result file
733 opt_cmd, opt_str = _ParseOptions(opts)
734 base = os.path.basename(os.path.splitext(a3m_file)[0])
735 hhr_file =
'%s%s_%s.hhr' % (prefix, base, opt_str)
736 hhr_file = os.path.join(self.
working_dirworking_dir, hhr_file)
737 search_cmd =
'%s %s -e 0.001 -Z 10000 -B 10000 -i %s -o %s -d %s' % (
740 os.path.abspath(a3m_file),
741 os.path.abspath(hhr_file),
742 os.path.join(os.path.abspath(os.path.split(database)[0]),
743 os.path.split(database)[1]))
744 ost.LogInfo(
'searching %s' % database)
745 ost.LogVerbose(search_cmd)
746 job = subprocess.Popen(search_cmd, shell=
True, cwd=self.
working_dirworking_dir,
747 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
748 sout, serr = job.communicate()
749 if job.returncode != 0:
750 lines = sout.decode().splitlines()
752 ost.LogError(line.strip())
753 lines = serr.decode().splitlines()
755 ost.LogError(line.strip())
760 def _ParseOptions(opts):
762 :return: Tuple of strings (opt_cmd, opt_str), where opt_cmd can be
763 passed to command ("-" added in front of keys, options
764 separated by space) and opt_str (options separated by "_")
765 can be used for filenames.
766 :param opts: Dictionary of options, one "-" is added in front of every
767 key. Boolean True values add flag without value.
771 for k, val
in opts.items():
772 if type(val) == type(
True):
774 opt_cmd.append(
'-%s' % str(k))
775 opt_str.append(str(k))
777 opt_cmd.append(
'-%s %s' % (str(k), str(val)))
778 opt_str.append(
'%s%s' % (str(k), str(val)))
779 opt_cmd =
' '.join(opt_cmd)
780 opt_str =
'_'.join(opt_str)
781 return opt_cmd, opt_str
784 __all__ = [
'HHblits',
'HHblitsHit',
'HHblitsHeader',
785 'ParseHHblitsOutput',
'ParseA3M',
'ParseHHM',
def __init__(self, hit_id, aln, score, ss_score, evalue, pvalue, prob)
def A3MToCS(self, a3m_file, cs_file=None, options={})
def __init__(self, query, hhsuite_root, hhblits_bin=None, working_dir=None)
def Search(self, a3m_file, database, options={}, prefix='')
def BuildQueryMSA(self, nrdb, options={}, a3m_file=None)
def A3MToProfile(self, a3m_file, hhm_file=None)
def ParseHHblitsOutput(output)
def ParseHeaderLine(line)
void DLLEXPORT_OST_IO SaveSequence(const seq::ConstSequenceHandle &sequence, const String &filename, const String &format="auto")