64 Parses the blast output and returns a list of BlastHits
65 setting no seqid_thres or evalue_thres, restores default behaviour without filtering
69 for child
in node.childNodes:
70 if child.nodeType==child.TEXT_NODE:
74 def _GetValue(node, tag_name):
75 tag=node.getElementsByTagName(tag_name)
77 return _GetText(tag[0])
79 def _GetInt(node, tag_name):
80 return int(_GetValue(node, tag_name))
82 def _ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres=0, evalue_thres=float(
"infinity")):
83 bit_score=float(_GetValue(hsp,
'Hsp_bit-score'))
84 score=float(_GetValue(hsp,
'Hsp_score'))
85 evalue=float(_GetValue(hsp,
'Hsp_evalue'))
87 identity=float(_GetValue(hsp,
'Hsp_identity'))
88 except AssertionError:
93 hsp_align_len=float(_GetValue(hsp,
'Hsp_align-len'))
94 seqid=identity/hsp_align_len
95 query_offset=_GetInt(hsp,
'Hsp_query-from')-1
96 hit_offset=_GetInt(hsp,
'Hsp_hit-from')-1
97 query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp,
'Hsp_qseq')))
98 query_seq.offset=query_offset
99 hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp,
'Hsp_hseq')))
100 hit_seq.offset=hit_offset
102 if seqid > float(seqid_thres)
and evalue < evalue_thres:
103 aln=seq.CreateAlignment(query_seq, hit_seq)
104 return AlignedPatch(aln, bit_score, score, evalue, seqid)
106 except Exception
as e:
107 print(str(e), query_seq, hit_seq)
110 doc=minidom.parseString(string)
111 except Exception
as e:
112 ost.LogError(
'Error while parsing BLAST output: %s' % str(e))
115 query_id=_GetValue(doc,
'BlastOutput_query-def')
116 tot_query_len=_GetValue(doc,
'BlastOutput_query-len')
117 for hit
in doc.getElementsByTagName(
'Hit'):
118 hit_id=_GetValue(hit,
'Hit_def')
119 hsps=hit.getElementsByTagName(
'Hsp')
120 aligned_patches=[_ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres, evalue_thres)
for hsp
in hsps]
121 hits.append(
BlastHit(hit_id, aligned_patches))
139 Create a blast DB from a fasta file
141 :param infasta: the pdb fasta from which the database will be created
142 :type infasta: :class:`string`
144 :param dbout: output location for blastDB file
145 :type dbout: :class:`string`
151 exe=settings.Locate(
'makeblastdb')
152 args=[exe,
'-in', infasta,
'-out', dbout,
'-dbtype',
'prot']
155 exe=settings.Locate(
'formatdb')
156 args=[exe,
'-i', infasta,
'-n', dbout]
158 raise RuntimeError(
'could not find makeblastdb/formatdb executable')
160 if os.path.basename(mkdb_cmd)==
'makeblastdb':
161 exe=settings.Locate(
'makeblastdb',explicit_file_name=mkdb_cmd)
162 args=[exe,
'-in', infasta,
'-out', dbout,
'-dbtype',
'prot']
163 elif os.path.basename(mkdb_cmd)==
'formatdb':
164 exe=settings.Locate(
'formatdb',explicit_filename=mkdb_cmd)
165 args=[exe,
'-i', infasta,
'-n', dbout]
167 raise IOError(
'mkdb command must either be the path to formatdb or makeblastdb!')
169 ost.LogInfo(
'creating blast DB (%s)' %
' '.join(args))
170 blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
171 stderr=subprocess.PIPE)
172 blast_pipe.communicate()
176 Returns the version of the BLAST executable, e.g. 2.2.24 as a string
180 blast_exe=settings.Locate(
'blastp',explicit_file_name=blast_location)
183 blast_exe=settings.Locate(
'blastall', explicit_file_name=blast_location)
185 raise RuntimeError(
'could not find blast executable')
187 if os.path.basename(blast_exe)==
'blastall':
189 pattern=re.compile(
r'\s*blastall (\d+\.\d+\.\d+)\s+arguments:\s*')
192 args=[blast_exe,
'-version']
193 pattern=re.compile(
r'\s*Package: blast (\d+\.\d+\.\d+),\s+')
195 blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
196 stderr=subprocess.PIPE)
197 stdout, _ = blast_pipe.communicate()
198 lines=stdout.decode().splitlines()
201 m=pattern.match(line)
204 raise IOError(
"could not determine blast version for '%s'" % blast_exe)
208def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62',
209 blast_location=None, outfmt=0, filter_low_complexity=True):
211 Runs a protein vs. protein blast search. The results are returned
212 according to the value of the ``outfmt`` parameter.
214 :param query: the query sequence
215 :type query: :class:`seq.ConstSequenceHandle`
217 :param database: The filename of the sequence database. Make sure that
218 formatdb has been run on the database and the <database>.pin file exists.
219 :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45',
220 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'.
221 :param gap_open: Gap opening penalty. Note that only a subset of gap opening
222 penalties is supported for each substitutition matrix. Consult the blast
223 docs for more information.
224 :param gap_ext: Gap extension penalty. Only a subset of gap extension
225 penalties are supported for each of the substitution matrices. Consult the
226 blast docs for more information.
227 :param outfmt: output format, where '0' corresponds to default output (parsed
228 blast output and 1 to raw string output).
229 :param filter_low_complexity: Mask off segments of the query sequence that
230 have low compositional complexity, as determined by the SEG program of
231 Wootton & Federhen (Computers and Chemistry, 1993)
232 :rtype: :class:`BlastHit` (with ``outfmt=0``) or :class:`str`
235 subst_mats=(
'BLOSUM45',
'BLOSUM62',
'BLOSUM80',
'PAM30',
'PAM70',)
236 if matrix
not in subst_mats:
237 raise ValueError(
'matrix must be one of %s' %
', '.join(subst_mats))
238 if not os.path.exists(
'%s.pin' % database)
and not os.path.exists(
'%s.pal' % database):
239 raise IOError(
"Database %s does not exist" % database)
240 if blast_location!=
None and not os.path.exists(blast_location):
241 ost.LogScript(
'Could not find %s' %blast_location)
243 if blast_location==
None:
245 blast_exe=settings.Locate(
'blastp')
248 blast_exe=settings.Locate(
'blastall')
250 raise RuntimeError(
'could not find blast executable')
252 blast_exe=settings.Locate(os.path.basename(blast_location),explicit_file_name=blast_location)
254 if os.path.basename(blast_exe)==
'blastall':
255 args=[blast_exe,
'-d', database,
'-p',
'blastp',
256 '-m',
'7',
'-M', matrix,
'-G', str(gap_open),
'-E', str(gap_ext)]
257 if filter_low_complexity==
False:
262 complexity_opt=
'-seg'
263 if filter_low_complexity==
True:
267 args=[blast_exe,
'-db', database,
'-matrix', matrix,
268 '-gapopen', str(gap_open),
'-gapextend', str(gap_ext),
'-outfmt',
'5', complexity_opt, complexity_arg ]
270 ost.LogInfo(
'running BLAST (%s)' %
' '.join(args))
271 blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE,
272 stdout=subprocess.PIPE, stdin=subprocess.PIPE)
273 if isinstance(query, str):
274 stdout, stderr=blast_pipe.communicate(query.encode())
276 stdout, stderr=blast_pipe.communicate(io.SequenceToString(query,
'fasta').encode())
279 pattern=re.compile(
r'^\[.*\]\s+ERROR:\s+(.*)')
280 lines=stderr.decode().split(
'\n')
281 error_message=pattern.match(lines[0])
283 raise BlastError(error_message.group(1),
'\n'.join(lines[1:]))
287 return stdout.decode()
Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62', blast_location=None, outfmt=0, filter_low_complexity=True)