1 from ost
import settings
3 from xml.dom
import minidom
4 from ost
import io, seq
10 An aligned patch, aka. HSP
14 The local alignment. Sequence offset of both sequences in the alignment are
15 set to the starting position in the query and target sequence, respectively.
17 :type: :class:`~ost.seq.AlignmentHandle`
19 .. attribute:: bit_score
21 The bit score of the HSP
29 The E-value of the HSP
33 The sequence identity of the HSP
35 def __init__(self, aln, bit_score, score, evalue, seqid):
44 A positive match found by BLAST.
46 Each blast hit consists of one or more HSPs, encoded by the
47 :class:`AlignedPatch` class.
49 .. attribute:: identifier
51 The identifier of the matching sequence
53 .. attribute:: aligned_patches
55 list of :class:`AlignedPatch` instances holding the actual HSPs.
57 def __init__(self, identifier, aligned_patches):
64 Parses the blast output and returns a list of BlastHits
65 setting no seqid_thres or evalue_thres, restores default behaviour without filtering
69 for child
in node.childNodes:
70 if child.nodeType==child.TEXT_NODE:
74 def _GetValue(node, tag_name):
75 tag=node.getElementsByTagName(tag_name)
77 return _GetText(tag[0])
79 def _GetInt(node, tag_name):
80 return int(_GetValue(node, tag_name))
82 def _ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres=0, evalue_thres=float(
"infinity")):
83 bit_score=float(_GetValue(hsp,
'Hsp_bit-score'))
84 score=float(_GetValue(hsp,
'Hsp_score'))
85 evalue=float(_GetValue(hsp,
'Hsp_evalue'))
87 identity=float(_GetValue(hsp,
'Hsp_identity'))
88 except AssertionError:
93 hsp_align_len=float(_GetValue(hsp,
'Hsp_align-len'))
94 seqid=identity/hsp_align_len
95 query_offset=_GetInt(hsp,
'Hsp_query-from')-1
96 hit_offset=_GetInt(hsp,
'Hsp_hit-from')-1
97 query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp,
'Hsp_qseq')))
98 query_seq.offset=query_offset
99 hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp,
'Hsp_hseq')))
100 hit_seq.offset=hit_offset
102 if seqid > float(seqid_thres)
and evalue < evalue_thres:
103 aln=seq.CreateAlignment(query_seq, hit_seq)
104 return AlignedPatch(aln, bit_score, score, evalue, seqid)
106 except Exception
as e:
107 print(str(e), query_seq, hit_seq)
110 doc=minidom.parseString(string)
111 except Exception
as e:
112 ost.LogError(
'Error while parsing BLAST output: %s' % str(e))
115 query_id=_GetValue(doc,
'BlastOutput_query-def')
116 tot_query_len=_GetValue(doc,
'BlastOutput_query-len')
117 for hit
in doc.getElementsByTagName(
'Hit'):
118 hit_id=_GetValue(hit,
'Hit_def')
119 hsps=hit.getElementsByTagName(
'Hsp')
120 aligned_patches=[_ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres, evalue_thres)
for hsp
in hsps]
121 hits.append(
BlastHit(hit_id, aligned_patches))
139 Create a blast DB from a fasta file
141 :param infasta: the pdb fasta from which the database will be created
142 :type infasta: :class:`string`
144 :param dbout: output location for blastDB file
145 :type dbout: :class:`string`
151 exe=settings.Locate(
'makeblastdb')
152 args=[exe,
'-in', infasta,
'-out', dbout,
'-dbtype',
'prot']
155 exe=settings.Locate(
'formatdb')
156 args=[exe,
'-i', infasta,
'-n', dbout]
158 raise RuntimeError(
'could not find makeblastdb/formatdb executable')
160 if os.path.basename(mkdb_cmd)==
'makeblastdb':
161 exe=settings.Locate(
'makeblastdb',explicit_file_name=mkdb_cmd)
162 args=[exe,
'-in', infasta,
'-out', dbout,
'-dbtype',
'prot']
163 elif os.path.basename(mkdb_cmd)==
'formatdb':
164 exe=settings.Locate(
'formatdb',explicit_filename=mkdb_cmd)
165 args=[exe,
'-i', infasta,
'-n', dbout]
167 raise IOError(
'mkdb command must either be the path to formatdb or makeblastdb!')
169 ost.LogInfo(
'creating blast DB (%s)' %
' '.join(args))
170 blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
171 stderr=subprocess.PIPE)
172 blast_pipe.communicate()
176 Returns the version of the BLAST executable, e.g. 2.2.24 as a string
180 blast_exe=settings.Locate(
'blastp',explicit_file_name=blast_location)
183 blast_exe=settings.Locate(
'blastall', explicit_file_name=blast_location)
185 raise RuntimeError(
'could not find blast executable')
187 if os.path.basename(blast_exe)==
'blastall':
189 pattern=re.compile(
r'\s*blastall (\d+\.\d+\.\d+)\s+arguments:\s*')
192 args=[blast_exe,
'-version']
193 pattern=re.compile(
r'\s*Package: blast (\d+\.\d+\.\d+),\s+')
195 blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
196 stderr=subprocess.PIPE)
197 stdout, _ = blast_pipe.communicate()
198 lines=stdout.decode().splitlines()
201 m=pattern.match(line)
204 raise IOError(
"could not determine blast version for '%s'" % blast_exe)
208 def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62',
209 blast_location=
None, outfmt=0, filter_low_complexity=
True):
211 Runs a protein vs. protein blast search. The results are returned
212 according to the value of the ``outfmt`` parameter.
214 :param query: the query sequence
215 :type query: :class:`seq.ConstSequenceHandle`
217 :param database: The filename of the sequence database. Make sure that
218 formatdb has been run on the database and the <database>.pin file exists.
219 :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45',
220 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'.
221 :param gap_open: Gap opening penalty. Note that only a subset of gap opening
222 penalties is supported for each substitutition matrix. Consult the blast
223 docs for more information.
224 :param gap_ext: Gap extension penalty. Only a subset of gap extension
225 penalties are supported for each of the substitution matrices. Consult the
226 blast docs for more information.
227 :param outfmt: output format, where '0' corresponds to default output (parsed
228 blast output and 1 to raw string output).
229 :param filter_low_complexity: Mask off segments of the query sequence that
230 have low compositional complexity, as determined by the SEG program of
231 Wootton & Federhen (Computers and Chemistry, 1993)
232 :rtype: :class:`BlastHit` (with ``outfmt=0``) or :class:`str`
235 subst_mats=(
'BLOSUM45',
'BLOSUM62',
'BLOSUM80',
'PAM30',
'PAM70',)
236 if matrix
not in subst_mats:
237 raise ValueError(
'matrix must be one of %s' %
', '.join(subst_mats))
238 if not os.path.exists(
'%s.pin' % database)
and not os.path.exists(
'%s.pal' % database):
239 raise IOError(
"Database %s does not exist" % database)
240 if blast_location!=
None and not os.path.exists(blast_location):
241 ost.LogScript(
'Could not find %s' %blast_location)
243 if blast_location==
None:
245 blast_exe=settings.Locate(
'blastp')
248 blast_exe=settings.Locate(
'blastall')
250 raise RuntimeError(
'could not find blast executable')
252 blast_exe=settings.Locate(os.path.basename(blast_location),explicit_file_name=blast_location)
254 if os.path.basename(blast_exe)==
'blastall':
255 args=[blast_exe,
'-d', database,
'-p',
'blastp',
256 '-m',
'7',
'-M', matrix,
'-G', str(gap_open),
'-E', str(gap_ext)]
257 if filter_low_complexity==
False:
262 complexity_opt=
'-seg'
263 if filter_low_complexity==
True:
267 args=[blast_exe,
'-db', database,
'-matrix', matrix,
268 '-gapopen', str(gap_open),
'-gapextend', str(gap_ext),
'-outfmt',
'5', complexity_opt, complexity_arg ]
270 ost.LogInfo(
'running BLAST (%s)' %
' '.join(args))
271 blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE,
272 stdout=subprocess.PIPE, stdin=subprocess.PIPE)
273 if isinstance(query, str):
274 stdout, stderr=blast_pipe.communicate(query.encode())
276 stdout, stderr=blast_pipe.communicate(io.SequenceToString(query,
'fasta').encode())
279 pattern=re.compile(
r'^\[.*\]\s+ERROR:\s+(.*)')
280 lines=stderr.decode().split(
'\n')
281 error_message=pattern.match(lines[0])
283 raise BlastError(error_message.group(1),
'\n'.join(lines[1:]))
287 return stdout.decode()