OpenStructure
Loading...
Searching...
No Matches
clustalw.py
Go to the documentation of this file.
1from ost.bindings import utils
2from ost import settings, io, seq, LogError
3import os
4import subprocess
5
6def ClustalW(seq1, seq2=None, clustalw=None, keep_files=False, nopgap=False,
7 clustalw_option_string=False):
8 '''
9 Runs a ClustalW multiple sequence alignment. The results are returned as a
10 :class:`~ost.seq.AlignmentHandle` instance.
11
12 There are two ways to use this function:
13
14 - align exactly two sequences:
15
16 :param seq1: sequence_one
17 :type seq1: :class:`~ost.seq.SequenceHandle` or :class:`str`
18
19 :param seq2: sequence_two
20 :type seq2: :class:`~ost.seq.SequenceHandle` or :class:`str`
21
22 The two sequences can be specified as two separate function parameters
23 (`seq1`, `seq2`). The type of both parameters can be either
24 :class:`~ost.seq.SequenceHandle` or :class:`str`, but must be the same for
25 both parameters.
26
27 - align two or more sequences:
28
29 :param seq1: sequence_list
30 :type seq1: :class:`~ost.seq.SequenceList`
31
32 :param seq2: must be :class:`None`
33
34 Two or more sequences can be specified by using a
35 :class:`~ost.seq.SequenceList`. It is then passed as the first function
36 parameter (`seq1`). The second parameter (`seq2`) must be :class:`None`.
37
38
39 :param clustalw: path to ClustalW executable (used in :func:`~ost.settings.Locate`)
40 :type clustalw: :class:`str`
41 :param nopgap: turn residue-specific gaps off
42 :type nopgap: :class:`bool`
43 :param clustalw_option_string: additional ClustalW flags (see http://www.clustal.org/download/clustalw_help.txt)
44 :type clustalw_option_string: :class:`str`
45 :param keep_files: do not delete temporary files
46 :type keep_files: :class:`bool`
47
48 .. note ::
49
50 - In the passed sequences ClustalW will convert lowercase to uppercase, and
51 change all '.' to '-'. OST will convert and '?' to 'X' before aligning
52 sequences with ClustalW.
53 - If a :attr:`sequence name <ost.seq.SequenceHandle.name>` contains spaces,
54 only the part before the space is considered as sequence name. To avoid
55 surprises, you should remove spaces from the sequence name.
56 - Sequence names must be unique (:class:`ValueError` exception raised
57 otherwise).
58
59 ClustalW will accept only IUB/IUPAC amino acid and nucleic acid codes:
60
61 ======= ======================= ======= ============================
62 Residue Name Residue Name
63 ======= ======================= ======= ============================
64 A alanine P proline
65 B aspartate or asparagine Q glutamine
66 C cystine R arginine
67 D aspartate S serine
68 E glutamate T threonine
69 F phenylalanine U selenocysteine
70 G glycine V valine
71 H histidine W tryptophan
72 I isoleucine Y tyrosine
73 K lysine Z glutamate or glutamine
74 L leucine X any
75 M methionine \\* translation stop
76 N asparagine \\- gap of indeterminate length
77 ======= ======================= ======= ============================
78
79 '''
80 clustalw_path=settings.Locate(('clustalw', 'clustalw2'),
81 explicit_file_name=clustalw)
82
83 if seq2!=None:
84 if isinstance(seq1, seq.SequenceHandle) and isinstance(seq2, seq.SequenceHandle):
85 seq_list=seq.CreateSequenceList()
86 seq_list.AddSequence(seq1)
87 seq_list.AddSequence(seq2)
88 elif isinstance(seq1, str) and isinstance(seq2, str):
89 seqh1=seq.CreateSequence("seq1", seq1)
90 seqh2=seq.CreateSequence("seq2", seq2)
91 seq_list=seq.CreateSequenceList()
92 seq_list.AddSequence(seqh1)
93 seq_list.AddSequence(seqh2)
94 else:
95 LogError("WARNING: Specify at least two Sequences")
96 return
97 elif isinstance(seq1, seq.SequenceList):
98 seq_list=seq1
99 else:
100 LogError("WARNING: Specify either two SequenceHandles or one SequenceList")
101 return
102
103 sequence_names = set()
104 for s in seq_list:
105 # we cut out anything after a space to be consistent with ClustalW behaviour
106 sequence_names.add(s.GetName().split(' ')[0])
107 if len(sequence_names) < len(seq_list):
108 raise ValueError("ClustalW can only process sequences with unique identifiers!")
109
110
111 new_list = seq.CreateSequenceList()
112 for s in seq_list:
113 ss = s.Copy()
114 for i,c in enumerate(ss):
115 if c=='?':
116 ss[i]='X'
117 new_list.AddSequence(ss)
118
119 seq_list = new_list
120
121
122 temp_dir=utils.TempDirWithFiles((seq_list,))
123 out=os.path.join(temp_dir.dirname, 'out.fasta')
124 command='%s -infile="%s" -output=fasta -outfile="%s"' % (clustalw_path,
125 temp_dir.files[0],
126 out)
127 if nopgap:
128 command+=" -nopgap"
129 if clustalw_option_string!=False:
130 command=command+" "+clustalw_option_string #see useful flags: http://toolkit.tuebingen.mpg.de/clustalw/help_params
131
132 subprocess.run(command, shell=True, stdout=subprocess.DEVNULL)
133
134 aln=io.LoadAlignment(out)
135
136
137 for sequence in seq_list:
138 for seq_num,aln_seq in enumerate(aln.sequences):
139 if aln_seq.GetName()==sequence.GetName():
140 break
141 aln.SetSequenceOffset(seq_num,sequence.offset)
142 if sequence.HasAttachedView():
143 aln.AttachView(seq_num,sequence.GetAttachedView().Copy())
144
145 if not keep_files:
146 temp_dir.Cleanup()
147
148 return aln
mutable sequence handle.
list of sequences.