00001 from ost import settings
00002 import subprocess
00003 from xml.dom import minidom
00004 from ost import io, seq
00005 import ost
00006 import os, re, sys
00007
00008 class AlignedPatch:
00009 """
00010 An aligned patch, aka. HSP
00011
00012 .. attribute:: aln
00013
00014 The local alignment. Sequence offset of both sequences in the alignment are
00015 set to the starting position in the query and target sequence, respectively.
00016
00017 :type: :class:`~ost.seq.AlignmentHandle`
00018
00019 .. attribute:: bit_score
00020
00021 The bit score of the HSP
00022
00023 .. attribute:: score
00024
00025 The score of the HSP
00026
00027 .. attribute:: evalue
00028
00029 The E-value of the HSP
00030
00031 .. attribute:: seqid
00032
00033 The sequence identity of the HSP
00034 """
00035 def __init__(self, aln, bit_score, score, evalue, seqid):
00036 self.aln=aln
00037 self.bit_score=bit_score
00038 self.score=score
00039 self.evalue=evalue
00040 self.seqid=seqid
00041
00042 class BlastHit:
00043 """
00044 A positive match found by BLAST.
00045
00046 Each blast hit consists of one or more HSPs, encoded by the
00047 :class:`AlignedPatch` class.
00048
00049 .. attribute:: identifier
00050
00051 The identifier of the matching sequence
00052
00053 .. attribute:: aligned_patches
00054
00055 list of :class:`AlignedPatch` instances holding the actual HSPs.
00056 """
00057 def __init__(self, identifier, aligned_patches):
00058 self.identifier=identifier
00059 self.aligned_patches=aligned_patches
00060
00061
00062 def ParseBlastOutput(string, seqid_thres=0, evalue_thres=float("infinity")):
00063 """
00064 Parses the blast output and returns a list of BlastHits
00065 setting no seqid_thres or evalue_thres, restores default behaviour without filtering
00066 """
00067 def _GetText(node):
00068 rc=''
00069 for child in node.childNodes:
00070 if child.nodeType==child.TEXT_NODE:
00071 rc+=child.data
00072 return rc
00073
00074 def _GetValue(node, tag_name):
00075 tag=node.getElementsByTagName(tag_name)
00076 assert len(tag)==1
00077 return _GetText(tag[0])
00078
00079 def _GetInt(node, tag_name):
00080 return int(_GetValue(node, tag_name))
00081
00082 def _ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres=0, evalue_thres=float("infinity")):
00083 bit_score=float(_GetValue(hsp, 'Hsp_bit-score'))
00084 score=float(_GetValue(hsp, 'Hsp_score'))
00085 evalue=float(_GetValue(hsp, 'Hsp_evalue'))
00086 try:
00087 identity=float(_GetValue(hsp, 'Hsp_identity'))
00088 except AssertionError:
00089
00090
00091
00092 identity=0
00093 hsp_align_len=float(_GetValue(hsp, 'Hsp_align-len'))
00094 seqid=identity/hsp_align_len
00095 query_offset=_GetInt(hsp, 'Hsp_query-from')-1
00096 hit_offset=_GetInt(hsp, 'Hsp_hit-from')-1
00097 query_seq=seq.CreateSequence(str(query_id), str(_GetValue(hsp, 'Hsp_qseq')))
00098 query_seq.offset=query_offset
00099 hit_seq=seq.CreateSequence(str(hit_id), str(_GetValue(hsp, 'Hsp_hseq')))
00100 hit_seq.offset=hit_offset
00101 try:
00102 if seqid > float(seqid_thres) and evalue < evalue_thres:
00103 aln=seq.CreateAlignment(query_seq, hit_seq)
00104 return AlignedPatch(aln, bit_score, score, evalue, seqid)
00105
00106 except Exception, e:
00107 print str(e), query_seq, hit_seq
00108
00109 try:
00110 doc=minidom.parseString(string)
00111 except Exception, e:
00112 ost.LogError('Error while parsing BLAST output: %s' % str(e))
00113 return None
00114 hits=[]
00115 query_id=_GetValue(doc, 'BlastOutput_query-def')
00116 tot_query_len=_GetValue(doc, 'BlastOutput_query-len')
00117 for hit in doc.getElementsByTagName('Hit'):
00118 hit_id=_GetValue(hit, 'Hit_def')
00119 hsps=hit.getElementsByTagName('Hsp')
00120 aligned_patches=[_ParseHsp(query_id, hit_id, hsp, tot_query_len, seqid_thres, evalue_thres) for hsp in hsps]
00121 hits.append(BlastHit(hit_id, aligned_patches))
00122 return hits
00123
00124
00125
00126 class BlastError(RuntimeError):
00127 def __init__(self, brief, details):
00128 self.brief=brief
00129 self.details=details
00130
00131 def __str__(self):
00132 if self.details:
00133 return '%s\n%s' % (self.brief, self.details)
00134 else:
00135 return self.brief
00136
00137 def CreateDB(infasta, dbout, mkdb_cmd=None):
00138 """
00139 Create a blast DB from a fasta file
00140
00141 :param infasta: the pdb fasta from which the database will be created
00142 :type infasta: :class:`string`
00143
00144 :param dbout: output location for blastDB file
00145 :type dbout: :class:`string`
00146
00147
00148 """
00149 if mkdb_cmd==None:
00150 try:
00151 exe=settings.Locate('formatdb')
00152 args=[exe, '-i', infasta, '-v',str(10000),'-n',dbout]
00153 except:
00154 try:
00155 exe=settings.Locate('makeblastdb')
00156 args=[exe, '-in', infasta, ' -max_file_sz 10GB -out', dbout,'-dbtype','prot']
00157
00158 except:
00159 raise RuntimeError('could not find makeblastdb/formatdb executable')
00160 else:
00161 if os.path.basename(mkdb_cmd)=='makeblastdb':
00162 exe=settings.Locate('makeblastdb',explicit_file_name=mkdb_cmd)
00163 args=[exe, '-in', infasta, ' -max_file_sz 10GB -out', dbout,'-dbtype','prot']
00164 elif os.path.basename(mkdb_cmd)=='formatdb':
00165 exe=settings.Locate('formatdb',explicit_filename=mkdb_cmd)
00166 args=[exe, '-i', infasta, '-v',str(10000),'-n',dbout]
00167 else:
00168 raise IOError('mkdb command must either be the path to formatdb or makeblastdb!')
00169
00170 cmd=' '.join(args)
00171 ost.LogInfo('creating blast DB (%s)' % cmd)
00172 os.system(cmd)
00173
00174 def BlastVersion(blast_location=None):
00175 """
00176 Returns the version of the BLAST executable, e.g. 2.2.24 as a string
00177 """
00178
00179 try:
00180 blast_exe=settings.Locate('blastall',explicit_file_name=blast_location)
00181 except:
00182 try:
00183 blast_exe=settings.Locate('blastp', explicit_file_name=blast_location)
00184 except:
00185 raise RuntimeError('could not find blast executable')
00186
00187 if os.path.basename(blast_exe)=='blastall':
00188 args=[blast_exe]
00189 pattern=re.compile(r'blastall (\d+\.\d+\.\d+)\s+arguments:\s*')
00190
00191 else:
00192 args=[blast_exe, '-version']
00193 pattern=re.compile(r'Package: blast (\d+\.\d+\.\d+),\s+')
00194
00195 blast_pipe=subprocess.Popen(args, stdout=subprocess.PIPE,
00196 stderr=subprocess.PIPE)
00197 lines=blast_pipe.stdout.readlines()
00198
00199 for line in lines:
00200 m=pattern.match(line)
00201 if m:
00202 return m.group(1)
00203 raise IOError("could not determine blast version for '%s'" % blast_exe)
00204
00205
00206
00207 def Blast(query, database, gap_open=11, gap_ext=1, matrix='BLOSUM62',
00208 blast_location=None, outfmt=0, filter_low_complexity=True):
00209 """
00210 Runs a protein vs. protein blast search. The results are returned as a
00211 list of :class:`BlastHit` instances.
00212
00213 :param query: the query sequence
00214 :type query: :class:`seq.ConstSequenceHandle`
00215
00216 :param database: The filename of the sequence database. Make sure that
00217 formatdb has been run on the database and the <database>.pin file exists.
00218 :param matrix: The substitution matrix to be used. Must be one of 'BLOSUM45',
00219 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70'.
00220 :param gap_open: Gap opening penalty. Note that only a subset of gap opening
00221 penalties is supported for each substitutition matrix. Consult the blast
00222 docs for more information.
00223 :param gap_ext: Gap extension penalty. Only a subset of gap extension
00224 penalties are supported for each of the substitution matrices. Consult the
00225 blast docs for more information.
00226 :param outfmt: output format, where '0' corresponds to default output (parsed blast output and 1 to raw output)
00227 :param filter_low_complexity: Mask off segments of the query sequence that
00228 have low compositional complexity, as determined by the SEG program of
00229 Wootton & Federhen (Computers and Chemistry, 1993)
00230 """
00231 subst_mats=('BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30', 'PAM70',)
00232 if matrix not in subst_mats:
00233 raise ValueError('matrix must be one of %s' % ', '.join(subst_mats))
00234 if not os.path.exists('%s.pin' % database) and not os.path.exists('%s.pal' % database):
00235 raise IOError("Database %s does not exist" % database)
00236 if blast_location!=None and not os.path.exists(blast_location):
00237 ost.LogScript('Could not find %s' %blast_location)
00238
00239 if blast_location==None:
00240 try:
00241 blast_exe=settings.Locate('blastall')
00242 except:
00243 try:
00244 blast_exe=settings.Locate('blastp')
00245 except:
00246 raise RuntimeError('could not find blast executable')
00247 else:
00248 blast_exe=settings.Locate(os.path.basename(blast_location),explicit_file_name=blast_location)
00249
00250 if os.path.basename(blast_exe)=='blastall':
00251 args=[blast_exe, '-d', database, '-p', 'blastp',
00252 '-m', '7', '-M', matrix, '-G', str(gap_open), '-E', str(gap_ext)]
00253 if filter_low_complexity==False:
00254 args.append('-F')
00255 args.append('F')
00256
00257 else:
00258 complexity_opt='-seg'
00259 if filter_low_complexity==True:
00260 complexity_arg='yes'
00261 else:
00262 complexity_arg='no'
00263 args=[blast_exe, '-db', database, '-matrix', matrix,
00264 '-gapopen', str(gap_open), '-gapextend', str(gap_ext), '-outfmt', '5', complexity_opt, complexity_arg ]
00265
00266 ost.LogInfo('running BLAST (%s)' % ' '.join(args))
00267 blast_pipe=subprocess.Popen(args, stderr=subprocess.PIPE,
00268 stdout=subprocess.PIPE, stdin=subprocess.PIPE)
00269 if isinstance(query, str):
00270 stdout, stderr=blast_pipe.communicate(query)
00271 else:
00272 stdout, stderr=blast_pipe.communicate(io.SequenceToString(query, 'fasta'))
00273
00274 if len(stderr)>0:
00275 pattern=re.compile(r'^\[.*\]\s+ERROR:\s+(.*)')
00276 lines=stderr.split('\n')
00277 error_message=pattern.match(lines[0])
00278 if error_message:
00279 raise BlastError(error_message.group(1), '\n'.join(lines[1:]))
00280 if outfmt==0:
00281 return ParseBlastOutput(stdout)
00282 else:
00283 return stdout