OpenStructure
Loading...
Searching...
No Matches
blast.py
Go to the documentation of this file.
1from ost import settings
2import subprocess
3from xml.dom import minidom
4from ost import io, seq
5import ost
6import os, re, sys
7
9 """
10 An aligned patch, aka. HSP
11
12 .. attribute:: aln
13
14 The local alignment. Sequence offset of both sequences in the alignment are
15 set to the starting position in the query and target sequence, respectively.
16
17 :type: :class:`~ost.seq.AlignmentHandle`
18
19 .. attribute:: bit_score
20
21 The bit score of the HSP
22
23 .. attribute:: score
24
25 The score of the HSP
26
27 .. attribute:: evalue
28
29 The E-value of the HSP
30
31 .. attribute:: seqid
32
33 The sequence identity of the HSP
34 """
35 def __init__(self, aln, bit_score, score, evalue, seqid):
36 self.aln=aln
37 self.bit_score=bit_score
38 self.score=score
39 self.evalue=evalue
40 self.seqid=seqid
41
43 """
44 A positive match found by BLAST.
45
46 Each blast hit consists of one or more HSPs, encoded by the
47 :class:`AlignedPatch` class.
48
49 .. attribute:: identifier
50
51 The identifier of the matching sequence
52
53 .. attribute:: aligned_patches
54
55 list of :class:`AlignedPatch` instances holding the actual HSPs.
56 """
57 def __init__(self, identifier, aligned_patches):
58 self.identifier=identifier
59 self.aligned_patches=aligned_patches
60
61
62def ParseBlastOutput(string, seqid_thres=0, evalue_thres=float("infinity")):
63 """
64 Parses the blast output and returns a list of BlastHits
65 setting no seqid_thres or evalue_thres, restores default behaviour without filtering
66 """
67 def _GetText(node):
68 rc=''
69 for child in node.childNodes:
70 if child.nodeType==child.TEXT_NODE:
71 rc+=child.data
72 return rc
73
74 def _GetValue(node, tag_name):
75 tag=node.getElementsByTagName(tag_name)
76 assert len(tag)==1
77 return _GetText(tag[0])
78
79 def _GetInt(node, tag_name):
80 return int(_GetValue(node, tag_name))
81
82 def _ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres=0, evalue_thres=float("infinity")):
83 bit_score=float(_GetValue(hsp, 'Hsp_bit-score'))
84 score=float(_GetValue(hsp, 'Hsp_score'))
85 evalue=float(_GetValue(hsp, 'Hsp_evalue'))
86 try:
87 identity=float(_GetValue(hsp, 'Hsp_identity'))
88 except AssertionError:
89 # The Hsp_identity tag is not a 'must' in the BLAST XML format. It
90 # describes the number of matching characters. Hence we assume, if it is
91 # missing, there are 0 matches.
92 identity=0
93 hsp_align_len=float(_GetValue(hsp, 'Hsp_align-len'))
94 seqid=identity/hsp_align_len
95 query_offset=_GetInt(hsp, 'Hsp_query-from')-1
96 hit_offset=_GetInt(hsp, 'Hsp_hit-from')-1
97 query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp, 'Hsp_qseq')))
98 query_seq.offset=query_offset
99 hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp, 'Hsp_hseq')))
100 hit_seq.offset=hit_offset
101 try:
102 if seqid > float(seqid_thres) and evalue < evalue_thres:
103 aln=seq.CreateAlignment(query_seq, hit_seq)
104 return AlignedPatch(aln, bit_score, score, evalue, seqid)
105
106 except Exception as e:
107 print(str(e), query_seq, hit_seq)
108
109 try:
110 doc=minidom.parseString(string)
111 except Exception as e:
112 ost.LogError('Error while parsing BLAST output: %s' % str(e))
113 return None
114 hits=[]
115 query_id=_GetValue(doc, 'BlastOutput_query-def')
116 tot_query_len=_GetValue(doc, 'BlastOutput_query-len')
117 for hit in doc.getElementsByTagName('Hit'):
118 hit_id=_GetValue(hit, 'Hit_def')
119 hsps=hit.getElementsByTagName('Hsp')
120 aligned_patches=[_ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres, evalue_thres) for hsp in hsps]
121 hits.append(BlastHit(hit_id, aligned_patches))
122 return hits
123
124
125
126class BlastError(RuntimeError):
127 def __init__(self, brief, details):
128 self.brief=brief
129 self.details=details
130
131 def __str__(self):
132 if self.details:
133 return '%s\n%s' % (self.brief, self.details)
134 else:
135 return self.brief
136
137def CreateDB(infasta, dbout, mkdb_cmd=None):
138 """
139 Create a blast DB from a fasta file
140
141 :param infasta: the pdb fasta from which the database will be created
142 :type infasta: :class:`string`
143
144 :param dbout: output location for blastDB file
145 :type dbout: :class:`string`
146
147
148 """
149 if mkdb_cmd==None:
150 try:
151 exe=settings.Locate('makeblastdb')
152 args=[exe, '-in', infasta, '-out', dbout, '-dbtype', 'prot']
153 except:
154 try:
155 exe=settings.Locate('formatdb')
156 args=[exe, '-i', infasta, '-n', dbout]
157 except:
158 raise RuntimeError('could not find makeblastdb/formatdb executable')
159 else:
160 if os.path.basename(mkdb_cmd)=='makeblastdb':
161 exe=settings.Locate('makeblastdb',explicit_file_name=mkdb_cmd)
162 args=[exe, '-in', infasta, '-out', dbout, '-dbtype', 'prot']
163 elif os.path.basename(mkdb_cmd)=='formatdb':
164 exe=settings.Locate('formatdb',explicit_filename=mkdb_cmd)
165 args=[exe, '-i', infasta, '-n', dbout]
166 else:
167 raise IOError('mkdb command must either be the path to formatdb or makeblastdb!')
168
169 ost.LogInfo('creating blast DB (%s)' % ' '.join(args))
170 blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
171 stderr=subprocess.PIPE)
172 blast_pipe.communicate()
173
174def BlastVersion(blast_location=None):
175 """
176 Returns the version of the BLAST executable, e.g. 2.2.24 as a string
177 """
178
179 try:
180 blast_exe=settings.Locate('blastp',explicit_file_name=blast_location)
181 except:
182 try:
183 blast_exe=settings.Locate('blastall', explicit_file_name=blast_location)
184 except:
185 raise RuntimeError('could not find blast executable')
186
187 if os.path.basename(blast_exe)=='blastall':
188 args=[blast_exe]
189 pattern=re.compile(r'\s*blastall (\d+\.\d+\.\d+)\s+arguments:\s*')
190
191 else:
192 args=[blast_exe, '-version']
193 pattern=re.compile(r'\s*Package: blast (\d+\.\d+\.\d+),\s+')
194
195 blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
196 stderr=subprocess.PIPE)
197 stdout, _ = blast_pipe.communicate()
198 lines=stdout.decode().splitlines()
199
200 for line in lines:
201 m=pattern.match(line)
202 if m:
203 return m.group(1)
204 raise IOError("could not determine blast version for '%s'" % blast_exe)
205
206
207
208def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62',
209 blast_location=None, outfmt=0, filter_low_complexity=True):
210 """
211 Runs a protein vs. protein blast search. The results are returned
212 according to the value of the ``outfmt`` parameter.
213
214 :param query: the query sequence
215 :type query: :class:`seq.ConstSequenceHandle`
216
217 :param database: The filename of the sequence database. Make sure that
218 formatdb has been run on the database and the <database>.pin file exists.
219 :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45',
220 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'.
221 :param gap_open: Gap opening penalty. Note that only a subset of gap opening
222 penalties is supported for each substitutition matrix. Consult the blast
223 docs for more information.
224 :param gap_ext: Gap extension penalty. Only a subset of gap extension
225 penalties are supported for each of the substitution matrices. Consult the
226 blast docs for more information.
227 :param outfmt: output format, where '0' corresponds to default output (parsed
228 blast output and 1 to raw string output).
229 :param filter_low_complexity: Mask off segments of the query sequence that
230 have low compositional complexity, as determined by the SEG program of
231 Wootton & Federhen (Computers and Chemistry, 1993)
232 :rtype: :class:`BlastHit` (with ``outfmt=0``) or :class:`str`
233 (with ``outfmt=1``)
234 """
235 subst_mats=('BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70',)
236 if matrix not in subst_mats:
237 raise ValueError('matrix must be one of %s' % ', '.join(subst_mats))
238 if not os.path.exists('%s.pin' % database) and not os.path.exists('%s.pal' % database):
239 raise IOError("Database %s does not exist" % database)
240 if blast_location!=None and not os.path.exists(blast_location):
241 ost.LogScript('Could not find %s' %blast_location)
242
243 if blast_location==None:
244 try:
245 blast_exe=settings.Locate('blastp')
246 except:
247 try:
248 blast_exe=settings.Locate('blastall')
249 except:
250 raise RuntimeError('could not find blast executable')
251 else:
252 blast_exe=settings.Locate(os.path.basename(blast_location),explicit_file_name=blast_location)
253
254 if os.path.basename(blast_exe)=='blastall':
255 args=[blast_exe, '-d', database, '-p', 'blastp',
256 '-m', '7', '-M', matrix, '-G', str(gap_open), '-E', str(gap_ext)]
257 if filter_low_complexity==False:
258 args.append('-F')
259 args.append('F')
260
261 else:
262 complexity_opt='-seg'
263 if filter_low_complexity==True:
264 complexity_arg='yes'
265 else:
266 complexity_arg='no'
267 args=[blast_exe, '-db', database, '-matrix', matrix,
268 '-gapopen', str(gap_open), '-gapextend', str(gap_ext), '-outfmt', '5', complexity_opt, complexity_arg ]
269
270 ost.LogInfo('running BLAST (%s)' % ' '.join(args))
271 blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE,
272 stdout=subprocess.PIPE, stdin=subprocess.PIPE)
273 if isinstance(query, str):
274 stdout, stderr=blast_pipe.communicate(query.encode())
275 else:
276 stdout, stderr=blast_pipe.communicate(io.SequenceToString(query, 'fasta').encode())
277
278 if len(stderr)>0:
279 pattern=re.compile(r'^\[.*\]\s+ERROR:\s+(.*)')
280 lines=stderr.decode().split('\n')
281 error_message=pattern.match(lines[0])
282 if error_message:
283 raise BlastError(error_message.group(1), '\n'.join(lines[1:]))
284 if outfmt==0:
285 return ParseBlastOutput(stdout.decode())
286 else:
287 return stdout.decode()
__init__(self, aln, bit_score, score, evalue, seqid)
Definition blast.py:35
__init__(self, brief, details)
Definition blast.py:127
__init__(self, identifier, aligned_patches)
Definition blast.py:57
ParseBlastOutput(string, seqid_thres=0, evalue_thres=float("infinity"))
Definition blast.py:62
Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62', blast_location=None, outfmt=0, filter_low_complexity=True)
Definition blast.py:209
BlastVersion(blast_location=None)
Definition blast.py:174
CreateDB(infasta, dbout, mkdb_cmd=None)
Definition blast.py:137