OpenStructure
blast.py
Go to the documentation of this file.
1 from ost import settings
2 import subprocess
3 from xml.dom import minidom
4 from ost import io, seq
5 import ost
6 import re
7 import os
9  def __init__(self, aln, bit_score, score, evalue):
10  self.aln=aln
11  self.bit_score=bit_score
12  self.score=score
13  self.evalue=evalue
14 
15 class BlastHit:
16  """
17  A positive match found by BLAST.
18 
19  Each blast hit consists of one or more aligned patches.
20 
21  """
22  def __init__(self, identifier, aligned_patches):
23  self.identifier=identifier
24  self.aligned_patches=aligned_patches
25 
26 def ParseBlastOutput(string):
27  """
28  Parses the blast output and returns a list of BlastHits
29  """
30  def _GetText(node):
31  rc=''
32  for child in node.childNodes:
33  if child.nodeType==child.TEXT_NODE:
34  rc+=child.data
35  return rc
36 
37  def _GetValue(node, tag_name):
38  tag=node.getElementsByTagName(tag_name)
39  assert len(tag)==1
40  return _GetText(tag[0])
41 
42  def _GetInt(node, tag_name):
43  return int(_GetValue(node, tag_name))
44 
45  def _ParseHsp(query_id, hit_id, hsp):
46  bit_score=float(_GetValue(hsp, 'Hsp_bit-score'))
47  score=float(_GetValue(hsp, 'Hsp_score'))
48  evalue=float(_GetValue(hsp, 'Hsp_evalue'))
49  query_offset=_GetInt(hsp, 'Hsp_query-from')-1
50  hit_offset=_GetInt(hsp, 'Hsp_hit-from')-1
51  query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp, 'Hsp_qseq')))
52  query_seq.offset=query_offset
53  hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp, 'Hsp_hseq')))
54  hit_seq.offset=hit_offset
55  aln=seq.CreateAlignment(query_seq, hit_seq)
56  return AlignedPatch(aln, bit_score, score, evalue)
57  try:
58  doc=minidom.parseString(string)
59  except Exception, e:
60  ost.LogError('Error while parsing BLAST output: %s' % str(e))
61  return None
62  hits=[]
63  query_id=_GetValue(doc, 'BlastOutput_query-def')
64  for hit in doc.getElementsByTagName('Hit'):
65  hit_id=_GetValue(hit, 'Hit_def')
66  hsps=hit.getElementsByTagName('Hsp')
67  aligned_patches=[_ParseHsp(query_id, hit_id, hsp) for hsp in hsps]
68  hits.append(BlastHit(hit_id, aligned_patches))
69  return hits
70 
71 
72 
73 class BlastError(RuntimeError):
74  def __init__(self, brief, details):
75  self.brief=brief
76  self.details=details
77 
78  def __str__(self):
79  if self.details:
80  return '%s\n%s' % (self.brief, self.details)
81  else:
82  return self.brief
83 
84 def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62',
85  blast_location=None):
86  """
87  Runs a protein vs. protein blast search. The results are returned as a
88  list of :class:`BlastHit` instances.
89 
90  :param query: the query sequence
91  :type query: :class:`seq.ConstSequenceHandle`
92 
93  :param database: The filename of the sequence database. Make sure that
94  formatdb has been run on the database and the <database>.pin file exists.
95  :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45',
96  'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'.
97  :param gap_open: Gap opening penalty. Note that only a subset of gap opening
98  penalties is supported for each substitutition matrix. Consult the blast
99  docs for more information.
100  :param gap_ext: Gap extension penalty. Only a subset of gap extension
101  penalties are supported for each of the substitution matrices. Consult the
102  blast docs for more information.
103  """
104  subst_mats=('BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70',)
105  if matrix not in subst_mats:
106  raise ValueError('matrix must be one of %s' % ', '.join(subst_mats))
107  if not os.path.exists('%s.pin' % database):
108  raise IOError("Database %s does not exist" % database)
109  blastall_exe=settings.Locate('blastall', explicit_file_name=blast_location)
110  args=[blastall_exe, '-d', database, '-p', 'blastp',
111  '-m', '7', '-M', matrix, '-G', str(gap_open), '-E', str(gap_ext)]
112  ost.LogInfo('running BLAST (%s)' % ' '.join(args))
113  blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE,
114  stdout=subprocess.PIPE, stdin=subprocess.PIPE)
115  stdout, stderr=blast_pipe.communicate(io.SequenceToString(query, 'fasta'))
116  if len(stderr)>0:
117  pattern=re.compile(r'^\[.*\]\s+ERROR:\s+(.*)')
118  lines=stderr.split('\n')
119  error_message=pattern.match(lines[0])
120  if error_message:
121  raise BlastError(error_message.group(1), '\n'.join(lines[1:]))
122  return ParseBlastOutput(stdout)