OpenStructure
clustalw.py
Go to the documentation of this file.
1 from ost.bindings import utils
2 from ost import settings, io, seq, LogError
3 import os
4 import subprocess
5 
6 def ClustalW(seq1, seq2=None, clustalw=None, keep_files=False, nopgap=False,
7  clustalw_option_string=False):
8  '''
9  Runs a ClustalW multiple sequence alignment. The results are returned as a
10  :class:`~ost.seq.AlignmentHandle` instance.
11 
12  There are two ways to use this function:
13 
14  - align exactly two sequences:
15 
16  :param seq1: sequence_one
17  :type seq1: :class:`~ost.seq.SequenceHandle` or :class:`str`
18 
19  :param seq2: sequence_two
20  :type seq2: :class:`~ost.seq.SequenceHandle` or :class:`str`
21 
22  The two sequences can be specified as two separate function parameters
23  (`seq1`, `seq2`). The type of both parameters can be either
24  :class:`~ost.seq.SequenceHandle` or :class:`str`, but must be the same for
25  both parameters.
26 
27  - align two or more sequences:
28 
29  :param seq1: sequence_list
30  :type seq1: :class:`~ost.seq.SequenceList`
31 
32  :param seq2: must be :class:`None`
33 
34  Two or more sequences can be specified by using a
35  :class:`~ost.seq.SequenceList`. It is then passed as the first function
36  parameter (`seq1`). The second parameter (`seq2`) must be :class:`None`.
37 
38 
39  :param clustalw: path to ClustalW executable (used in :func:`~ost.settings.Locate`)
40  :type clustalw: :class:`str`
41  :param nopgap: turn residue-specific gaps off
42  :type nopgap: :class:`bool`
43  :param clustalw_option_string: additional ClustalW flags (see http://www.clustal.org/download/clustalw_help.txt)
44  :type clustalw_option_string: :class:`str`
45  :param keep_files: do not delete temporary files
46  :type keep_files: :class:`bool`
47 
48  .. note ::
49 
50  - In the passed sequences ClustalW will convert lowercase to uppercase, and
51  change all '.' to '-'. OST will convert and '?' to 'X' before aligning
52  sequences with ClustalW.
53  - If a :attr:`sequence name <ost.seq.SequenceHandle.name>` contains spaces,
54  only the part before the space is considered as sequence name. To avoid
55  surprises, you should remove spaces from the sequence name.
56  - Sequence names must be unique (:class:`ValueError` exception raised
57  otherwise).
58 
59  ClustalW will accept only IUB/IUPAC amino acid and nucleic acid codes:
60 
61  ======= ======================= ======= ============================
62  Residue Name Residue Name
63  ======= ======================= ======= ============================
64  A alanine P proline
65  B aspartate or asparagine Q glutamine
66  C cystine R arginine
67  D aspartate S serine
68  E glutamate T threonine
69  F phenylalanine U selenocysteine
70  G glycine V valine
71  H histidine W tryptophan
72  I isoleucine Y tyrosine
73  K lysine Z glutamate or glutamine
74  L leucine X any
75  M methionine \\* translation stop
76  N asparagine \\- gap of indeterminate length
77  ======= ======================= ======= ============================
78 
79  '''
80  clustalw_path=settings.Locate(('clustalw', 'clustalw2'),
81  explicit_file_name=clustalw)
82 
83  if seq2!=None:
84  if isinstance(seq1, seq.SequenceHandle) and isinstance(seq2, seq.SequenceHandle):
85  seq_list=seq.CreateSequenceList()
86  seq_list.AddSequence(seq1)
87  seq_list.AddSequence(seq2)
88  elif isinstance(seq1, str) and isinstance(seq2, str):
89  seqh1=seq.CreateSequence("seq1", seq1)
90  seqh2=seq.CreateSequence("seq2", seq2)
91  seq_list=seq.CreateSequenceList()
92  seq_list.AddSequence(seqh1)
93  seq_list.AddSequence(seqh2)
94  else:
95  LogError("WARNING: Specify at least two Sequences")
96  return
97  elif isinstance(seq1, seq.SequenceList):
98  seq_list=seq1
99  else:
100  LogError("WARNING: Specify either two SequenceHandles or one SequenceList")
101  return
102 
103  sequence_names = set()
104  for s in seq_list:
105  # we cut out anything after a space to be consistent with ClustalW behaviour
106  sequence_names.add(s.GetName().split(' ')[0])
107  if len(sequence_names) < len(seq_list):
108  raise ValueError("ClustalW can only process sequences with unique identifiers!")
109 
110 
111  new_list = seq.CreateSequenceList()
112  for s in seq_list:
113  ss = s.Copy()
114  for i,c in enumerate(ss):
115  if c=='?':
116  ss[i]='X'
117  new_list.AddSequence(ss)
118 
119  seq_list = new_list
120 
121 
122  temp_dir=utils.TempDirWithFiles((seq_list,))
123  out=os.path.join(temp_dir.dirname, 'out.fasta')
124  command='%s -infile="%s" -output=fasta -outfile="%s"' % (clustalw_path,
125  temp_dir.files[0],
126  out)
127  if nopgap:
128  command+=" -nopgap"
129  if clustalw_option_string!=False:
130  command=command+" "+clustalw_option_string #see useful flags: http://toolkit.tuebingen.mpg.de/clustalw/help_params
131 
132  subprocess.run(command, shell=True, stdout=subprocess.DEVNULL)
133 
134  aln=io.LoadAlignment(out)
135 
136 
137  for sequence in seq_list:
138  for seq_num,aln_seq in enumerate(aln.sequences):
139  if aln_seq.GetName()==sequence.GetName():
140  break
141  aln.SetSequenceOffset(seq_num,sequence.offset)
142  if sequence.HasAttachedView():
143  aln.AttachView(seq_num,sequence.GetAttachedView().Copy())
144 
145  if not keep_files:
146  temp_dir.Cleanup()
147 
148  return aln
mutable sequence handle.
list of sequences.
def ClustalW(seq1, seq2=None, clustalw=None, keep_files=False, nopgap=False, clustalw_option_string=False)
Definition: clustalw.py:7