1 from ost
import settings
3 from xml.dom
import minidom
4 from ost
import io, seq
7 sys.path.append(
'PYMODULES')
12 An aligned patch, aka. HSP
16 The local alignment. Sequence offset of both sequences in the alignment are
17 set to the starting position in the query and target sequence, respectively.
19 :type: :class:`~ost.seq.AlignmentHandle`
21 .. attribute:: bit_score
23 The bit score of the HSP
31 The E-value of the HSP
35 The sequence identity of the HSP
37 def __init__(self, aln, bit_score, score, evalue, seqid):
46 A positive match found by BLAST.
48 Each blast hit consists of one or more HSPs, encoded by the
49 :class:`AlignedPatch` class.
51 .. attribute:: identifier
53 The identifier of the matching sequence
55 .. attribute:: aligned_patches
57 list of :class:`AlignedPatch` instances holding the actual HSPs.
59 def __init__(self, identifier, aligned_patches):
66 Parses the blast output and returns a list of BlastHits
67 setting no seqid_thres or evalue_thres, restores default behaviour without filtering
71 for child
in node.childNodes:
72 if child.nodeType==child.TEXT_NODE:
76 def _GetValue(node, tag_name):
77 tag=node.getElementsByTagName(tag_name)
79 return _GetText(tag[0])
81 def _GetInt(node, tag_name):
82 return int(_GetValue(node, tag_name))
84 def _ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres=0, evalue_thres=float(
"infinity")):
85 bit_score=float(_GetValue(hsp,
'Hsp_bit-score'))
86 score=float(_GetValue(hsp,
'Hsp_score'))
87 evalue=float(_GetValue(hsp,
'Hsp_evalue'))
88 identity=float(_GetValue(hsp,
'Hsp_identity'))
89 hsp_align_len=float(_GetValue(hsp,
'Hsp_align-len'))
90 seqid=identity/hsp_align_len
91 query_offset=_GetInt(hsp,
'Hsp_query-from')-1
92 hit_offset=_GetInt(hsp,
'Hsp_hit-from')-1
93 query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp,
'Hsp_qseq')))
94 query_seq.offset=query_offset
95 hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp,
'Hsp_hseq')))
96 hit_seq.offset=hit_offset
98 if seqid > float(seqid_thres)
and evalue < evalue_thres:
99 aln=seq.CreateAlignment(query_seq, hit_seq)
100 return AlignedPatch(aln, bit_score, score, evalue, seqid)
103 print str(e), query_seq, hit_seq
106 doc=minidom.parseString(string)
108 ost.LogError(
'Error while parsing BLAST output: %s' % str(e))
111 query_id=_GetValue(doc,
'BlastOutput_query-def')
112 tot_query_len=_GetValue(doc,
'BlastOutput_query-len')
113 for hit
in doc.getElementsByTagName(
'Hit'):
114 hit_id=_GetValue(hit,
'Hit_def')
115 hsps=hit.getElementsByTagName(
'Hsp')
116 aligned_patches=[_ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres, evalue_thres)
for hsp
in hsps]
117 hits.append(
BlastHit(hit_id, aligned_patches))
135 Create a blast DB from a fasta file
137 :param infasta: the pdb fasta from which the database will be created
138 :type infasta: :class:`string`
140 :param dbout: output location for blastDB file
141 :type infasta: :class:`string`
147 exe=settings.Locate(
'formatdb')
148 args=[exe,
'-i', infasta,
'-v',str(10000),
'-n',dbout]
151 exe=settings.Locate(
'makeblastdb')
152 args=[exe,
'-in', infasta,
' -max_file_sz 10GB -out', dbout,
'-dbtype',
'prot']
155 raise RuntimeError(
'could not find makeblastdb/formatdb executable')
157 if os.path.basename(mkdb_cmd)==
'makeblastdb':
158 exe=settings.Locate(
'makeblastdb',explicit_file_name=mkdb_cmd)
159 args=[exe,
'-in', infasta,
' -max_file_sz 10GB -out', dbout,
'-dbtype',
'prot']
160 elif os.path.basename(mkdb_cmd)==
'formatdb':
161 exe=settings.Locate(
'formatdb',explicit_filename=mkdb_cmd)
162 args=[exe,
'-i', infasta,
'-v',str(10000),
'-n',dbout]
164 raise IOError(
'mkdb command must either be the path to formatdb or makeblastdb!')
167 ost.LogInfo(
'creating blast DB (%s)' % cmd)
172 Returns the version of the BLAST executable, e.g. 2.2.24 as a string
176 blast_exe=settings.Locate(
'blastall',explicit_file_name=blast_location)
179 blast_exe=settings.Locate(
'blastp', explicit_file_name=blast_location)
181 raise RuntimeError(
'could not find blast executable')
183 if os.path.basename(blast_exe)==
'blastall':
185 pattern=re.compile(
r'blastall (\d+\.\d+\.\d+)\s+arguments:\s*')
188 args=[blast_exe,
'-version']
189 pattern=re.compile(
r'Package: blast (\d+\.\d+\.\d+),\s+')
191 blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
192 stderr=subprocess.PIPE)
193 lines=blast_pipe.stdout.readlines()
196 m=pattern.match(line)
199 raise IOError(
"could not determine blast version for '%s'" % blast_exe)
203 def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62',
204 blast_location=
None, outfmt=0, filter_low_complexity=
True):
206 Runs a protein vs. protein blast search. The results are returned as a
207 list of :class:`BlastHit` instances.
209 :param query: the query sequence
210 :type query: :class:`seq.ConstSequenceHandle`
212 :param database: The filename of the sequence database. Make sure that
213 formatdb has been run on the database and the <database>.pin file exists.
214 :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45',
215 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'.
216 :param gap_open: Gap opening penalty. Note that only a subset of gap opening
217 penalties is supported for each substitutition matrix. Consult the blast
218 docs for more information.
219 :param gap_ext: Gap extension penalty. Only a subset of gap extension
220 penalties are supported for each of the substitution matrices. Consult the
221 blast docs for more information.
222 :param outfmt: output format, where '0' corresponds to default output (parsed blast output and 1 to raw output)
223 :param filter_low_complexity: Mask off segments of the query sequence that
224 have low compositional complexity, as determined by the SEG program of
225 Wootton & Federhen (Computers and Chemistry, 1993)
227 subst_mats=(
'BLOSUM45',
'BLOSUM62',
'BLOSUM80',
'PAM30',
'PAM70',)
228 if matrix
not in subst_mats:
229 raise ValueError(
'matrix must be one of %s' %
', '.join(subst_mats))
230 if not os.path.exists(
'%s.pin' % database)
and not os.path.exists(
'%s.pal' % database):
231 raise IOError(
"Database %s does not exist" % database)
232 if blast_location!=
None and not os.path.exists(blast_location):
233 ost.LogScript(
'Could not find %s' %blast_location)
235 if blast_location==
None:
237 blast_exe=settings.Locate(
'blastall')
240 blast_exe=settings.Locate(
'blastp')
242 raise RuntimeError(
'could not find blast executable')
244 blast_exe=settings.Locate(os.path.basename(blast_location),explicit_file_name=blast_location)
246 if os.path.basename(blast_exe)==
'blastall':
247 args=[blast_exe,
'-d', database,
'-p',
'blastp',
248 '-m',
'7',
'-M', matrix,
'-G', str(gap_open),
'-E', str(gap_ext)]
249 if filter_low_complexity==
False:
254 complexity_opt=
'-seg'
255 if filter_low_complexity==
True:
259 args=[blast_exe,
'-db', database,
'-matrix', matrix,
260 '-gapopen', str(gap_open),
'-gapextend', str(gap_ext),
'-outfmt',
'5', complexity_opt, complexity_arg ]
262 ost.LogInfo(
'running BLAST (%s)' %
' '.join(args))
263 blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE,
264 stdout=subprocess.PIPE, stdin=subprocess.PIPE)
265 if isinstance(query, str):
266 stdout, stderr=blast_pipe.communicate(query)
268 stdout, stderr=blast_pipe.communicate(io.SequenceToString(query,
'fasta'))
271 pattern=re.compile(
r'^\[.*\]\s+ERROR:\s+(.*)')
272 lines=stderr.split(
'\n')
273 error_message=pattern.match(lines[0])
275 raise BlastError(error_message.group(1),
'\n'.join(lines[1:]))