1 from ost
import settings
3 from xml.dom
import minidom
4 from ost
import io, seq
10 An aligned patch, aka. HSP
14 The local alignment. Sequence offset of both sequences in the alignment are
15 set to the starting position in the query and target sequence, respectively.
17 :type: :class:`~ost.seq.AlignmentHandle`
19 .. attribute:: bit_score
21 The bit score of the HSP
29 The E-value of the HSP
33 The sequence identity of the HSP
35 def __init__(self, aln, bit_score, score, evalue, seqid):
44 A positive match found by BLAST.
46 Each blast hit consists of one or more HSPs, encoded by the
47 :class:`AlignedPatch` class.
49 .. attribute:: identifier
51 The identifier of the matching sequence
53 .. attribute:: aligned_patches
55 list of :class:`AlignedPatch` instances holding the actual HSPs.
57 def __init__(self, identifier, aligned_patches):
64 Parses the blast output and returns a list of BlastHits
65 setting no seqid_thres or evalue_thres, restores default behaviour without filtering
69 for child
in node.childNodes:
70 if child.nodeType==child.TEXT_NODE:
74 def _GetValue(node, tag_name):
75 tag=node.getElementsByTagName(tag_name)
77 return _GetText(tag[0])
79 def _GetInt(node, tag_name):
80 return int(_GetValue(node, tag_name))
82 def _ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres=0, evalue_thres=float(
"infinity")):
83 bit_score=float(_GetValue(hsp,
'Hsp_bit-score'))
84 score=float(_GetValue(hsp,
'Hsp_score'))
85 evalue=float(_GetValue(hsp,
'Hsp_evalue'))
87 identity=float(_GetValue(hsp,
'Hsp_identity'))
88 except AssertionError:
93 hsp_align_len=float(_GetValue(hsp,
'Hsp_align-len'))
94 seqid=identity/hsp_align_len
95 query_offset=_GetInt(hsp,
'Hsp_query-from')-1
96 hit_offset=_GetInt(hsp,
'Hsp_hit-from')-1
97 query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp,
'Hsp_qseq')))
98 query_seq.offset=query_offset
99 hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp,
'Hsp_hseq')))
100 hit_seq.offset=hit_offset
102 if seqid > float(seqid_thres)
and evalue < evalue_thres:
103 aln=seq.CreateAlignment(query_seq, hit_seq)
104 return AlignedPatch(aln, bit_score, score, evalue, seqid)
107 print str(e), query_seq, hit_seq
110 doc=minidom.parseString(string)
112 ost.LogError(
'Error while parsing BLAST output: %s' % str(e))
115 query_id=_GetValue(doc,
'BlastOutput_query-def')
116 tot_query_len=_GetValue(doc,
'BlastOutput_query-len')
117 for hit
in doc.getElementsByTagName(
'Hit'):
118 hit_id=_GetValue(hit,
'Hit_def')
119 hsps=hit.getElementsByTagName(
'Hsp')
120 aligned_patches=[_ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres, evalue_thres)
for hsp
in hsps]
121 hits.append(
BlastHit(hit_id, aligned_patches))
139 Create a blast DB from a fasta file
141 :param infasta: the pdb fasta from which the database will be created
142 :type infasta: :class:`string`
144 :param dbout: output location for blastDB file
145 :type dbout: :class:`string`
151 exe=settings.Locate(
'formatdb')
152 args=[exe,
'-i', infasta,
'-v',str(10000),
'-n',dbout]
155 exe=settings.Locate(
'makeblastdb')
156 args=[exe,
'-in', infasta,
' -max_file_sz 10GB -out', dbout,
'-dbtype',
'prot']
159 raise RuntimeError(
'could not find makeblastdb/formatdb executable')
161 if os.path.basename(mkdb_cmd)==
'makeblastdb':
162 exe=settings.Locate(
'makeblastdb',explicit_file_name=mkdb_cmd)
163 args=[exe,
'-in', infasta,
' -max_file_sz 10GB -out', dbout,
'-dbtype',
'prot']
164 elif os.path.basename(mkdb_cmd)==
'formatdb':
165 exe=settings.Locate(
'formatdb',explicit_filename=mkdb_cmd)
166 args=[exe,
'-i', infasta,
'-v',str(10000),
'-n',dbout]
168 raise IOError(
'mkdb command must either be the path to formatdb or makeblastdb!')
171 ost.LogInfo(
'creating blast DB (%s)' % cmd)
176 Returns the version of the BLAST executable, e.g. 2.2.24 as a string
180 blast_exe=settings.Locate(
'blastall',explicit_file_name=blast_location)
183 blast_exe=settings.Locate(
'blastp', explicit_file_name=blast_location)
185 raise RuntimeError(
'could not find blast executable')
187 if os.path.basename(blast_exe)==
'blastall':
189 pattern=re.compile(
r'blastall (\d+\.\d+\.\d+)\s+arguments:\s*')
192 args=[blast_exe,
'-version']
193 pattern=re.compile(
r'Package: blast (\d+\.\d+\.\d+),\s+')
195 blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
196 stderr=subprocess.PIPE)
197 lines=blast_pipe.stdout.readlines()
200 m=pattern.match(line)
203 raise IOError(
"could not determine blast version for '%s'" % blast_exe)
207 def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62',
208 blast_location=
None, outfmt=0, filter_low_complexity=
True):
210 Runs a protein vs. protein blast search. The results are returned as a
211 list of :class:`BlastHit` instances.
213 :param query: the query sequence
214 :type query: :class:`seq.ConstSequenceHandle`
216 :param database: The filename of the sequence database. Make sure that
217 formatdb has been run on the database and the <database>.pin file exists.
218 :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45',
219 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'.
220 :param gap_open: Gap opening penalty. Note that only a subset of gap opening
221 penalties is supported for each substitutition matrix. Consult the blast
222 docs for more information.
223 :param gap_ext: Gap extension penalty. Only a subset of gap extension
224 penalties are supported for each of the substitution matrices. Consult the
225 blast docs for more information.
226 :param outfmt: output format, where '0' corresponds to default output (parsed blast output and 1 to raw output)
227 :param filter_low_complexity: Mask off segments of the query sequence that
228 have low compositional complexity, as determined by the SEG program of
229 Wootton & Federhen (Computers and Chemistry, 1993)
231 subst_mats=(
'BLOSUM45',
'BLOSUM62',
'BLOSUM80',
'PAM30',
'PAM70',)
232 if matrix
not in subst_mats:
233 raise ValueError(
'matrix must be one of %s' %
', '.join(subst_mats))
234 if not os.path.exists(
'%s.pin' % database)
and not os.path.exists(
'%s.pal' % database):
235 raise IOError(
"Database %s does not exist" % database)
236 if blast_location!=
None and not os.path.exists(blast_location):
237 ost.LogScript(
'Could not find %s' %blast_location)
239 if blast_location==
None:
241 blast_exe=settings.Locate(
'blastall')
244 blast_exe=settings.Locate(
'blastp')
246 raise RuntimeError(
'could not find blast executable')
248 blast_exe=settings.Locate(os.path.basename(blast_location),explicit_file_name=blast_location)
250 if os.path.basename(blast_exe)==
'blastall':
251 args=[blast_exe,
'-d', database,
'-p',
'blastp',
252 '-m',
'7',
'-M', matrix,
'-G', str(gap_open),
'-E', str(gap_ext)]
253 if filter_low_complexity==
False:
258 complexity_opt=
'-seg'
259 if filter_low_complexity==
True:
263 args=[blast_exe,
'-db', database,
'-matrix', matrix,
264 '-gapopen', str(gap_open),
'-gapextend', str(gap_ext),
'-outfmt',
'5', complexity_opt, complexity_arg ]
266 ost.LogInfo(
'running BLAST (%s)' %
' '.join(args))
267 blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE,
268 stdout=subprocess.PIPE, stdin=subprocess.PIPE)
269 if isinstance(query, str):
270 stdout, stderr=blast_pipe.communicate(query)
272 stdout, stderr=blast_pipe.communicate(io.SequenceToString(query,
'fasta'))
275 pattern=re.compile(
r'^\[.*\]\s+ERROR:\s+(.*)')
276 lines=stderr.split(
'\n')
277 error_message=pattern.match(lines[0])
279 raise BlastError(error_message.group(1),
'\n'.join(lines[1:]))