00001 from ost.bindings import utils
00002 from ost import settings, io, seq, LogError
00003 import os
00004 import subprocess
00005
00006 def ClustalW(seq1, seq2=None, clustalw=None, keep_files=False, nopgap=False,
00007 clustalw_option_string=False):
00008 '''
00009 Runs a clustalw multiple sequence alignment. The results are returned as a
00010 :class:`~ost.seq.AlignmentHandle` instance.
00011
00012 There are two ways to use this function:
00013
00014 - align exactly two sequences:
00015
00016 :param seq1: sequence_one
00017 :type seq1: :class:`~ost.seq.SequenceHandle` or :class:`str`
00018
00019 :param seq2: sequence_two
00020 :type seq2: :class:`~ost.seq.SequenceHandle` or :class:`str`
00021
00022 The two sequences can be specified as two separate function parameters
00023 (`seq1`, `seq2`). The type of both parameters can be either
00024 :class:`~ost.seq.SequenceHandle` or :class:`str`, but must be the same for
00025 both parameters.
00026
00027 - align two or more sequences:
00028
00029 :param seq1: sequence_list
00030 :type seq1: :class:`~ost.seq.SequenceList`
00031
00032 :param seq2: must be :class:`None`
00033
00034 Two or more sequences can be specified by using a
00035 :class:`~ost.seq.SequenceList`. It is then passed as the first function
00036 parameter (`seq1`). The second parameter (`seq2`) must be :class:`None`.
00037
00038
00039 :param clustalw: path to clustalw executable (used in :func:`~ost.settings.Locate`)
00040 :type clustalw: :class:`str`
00041 :param nopgap: turn residue-specific gaps off
00042 :type nopgap: :class:`bool`
00043 :param clustalw_option_string: additional clustalw flags (see http://toolkit.tuebingen.mpg.de/clustalw/help_params)
00044 :type clustalw_option_string: :class:`str`
00045 :param keep_files: do not delete temporary files
00046 :type keep_files: :class:`bool`
00047
00048 Note: ClustalW will convert lowercase to uppercase, and change all '.' to '-'.
00049 OST will convert and '?' to 'X' before aligning sequences with Clustalw.
00050
00051 ClustalW will accept only IUB/IUPAC amino acid and nucleic acid codes:
00052
00053 ======= ======================= ======= ============================
00054 Residue Name Residue Name
00055 ======= ======================= ======= ============================
00056 A alanine P proline
00057 B aspartate or asparagine Q glutamine
00058 C cystine R arginine
00059 D aspartate S serine
00060 E glutamate T threonine
00061 F phenylalanine U selenocysteine
00062 G glycine V valine
00063 H histidine W tryptophan
00064 I isoleucine Y tyrosine
00065 K lysine Z glutamate or glutamine
00066 L leucine X any
00067 M methionine \* translation stop
00068 N asparagine \- gap of indeterminate length
00069 ======= ======================= ======= ============================
00070
00071 '''
00072 clustalw_path=settings.Locate(('clustalw', 'clustalw2'),
00073 explicit_file_name=clustalw)
00074
00075 if seq2!=None:
00076 if isinstance(seq1, seq.SequenceHandle) and isinstance(seq2, seq.SequenceHandle):
00077 seq_list=seq.CreateSequenceList()
00078 seq_list.AddSequence(seq1)
00079 seq_list.AddSequence(seq2)
00080 elif isinstance(seq1, str) and isinstance(seq2, str):
00081 seqh1=seq.CreateSequence("seq1", seq1)
00082 seqh2=seq.CreateSequence("seq2", seq2)
00083 seq_list=seq.CreateSequenceList()
00084 seq_list.AddSequence(seqh1)
00085 seq_list.AddSequence(seqh2)
00086 else:
00087 LogError("WARNING: Specify at least two Sequences")
00088 return
00089 elif isinstance(seq1, seq.SequenceList):
00090 seq_list=seq1
00091 else:
00092 LogError("WARNING: Specify either two SequenceHandles or one SequenceList")
00093 return
00094
00095 sequence_names = set()
00096 for s in seq_list:
00097 sequence_names.add(s.GetName())
00098 if len(sequence_names) < len(seq_list):
00099 raise ValueError("ClustalW can only process sequences with unique identifiers!")
00100
00101
00102 new_list = seq.CreateSequenceList()
00103 for s in seq_list:
00104 ss = s.Copy()
00105 for i,c in enumerate(ss):
00106 if c=='?':
00107 ss[i]='X'
00108 new_list.AddSequence(ss)
00109
00110 seq_list = new_list
00111
00112
00113 temp_dir=utils.TempDirWithFiles((seq_list,))
00114 out=os.path.join(temp_dir.dirname, 'out.fasta')
00115 command='%s -infile="%s" -output=fasta -outfile="%s"' % (clustalw_path,
00116 temp_dir.files[0],
00117 out)
00118 if nopgap:
00119 command+=" -nopgap"
00120 if clustalw_option_string!=False:
00121 command=command+" "+clustalw_option_string
00122
00123 ps=subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
00124 ps.stdout.readlines()
00125 aln=io.LoadAlignment(out)
00126
00127
00128 for sequence in seq_list:
00129 for seq_num,aln_seq in enumerate(aln.sequences):
00130 if aln_seq.GetName()==sequence.GetName():
00131 break
00132 aln.SetSequenceOffset(seq_num,sequence.offset)
00133 if sequence.HasAttachedView():
00134 aln.AttachView(seq_num,sequence.GetAttachedView().Copy())
00135
00136 if not keep_files:
00137 temp_dir.Cleanup()
00138
00139 return aln