OpenStructure
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
kclust.py
Go to the documentation of this file.
1 import subprocess, os, tempfile
2 from ost import io, seq, settings
3 
4 """
5 Wrapper for kClust a protein sequence clustering algorithm
6 
7 unpublished, but mentioned in:
8 
9 Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
10 HHblits: lighting-fast iterative protein sequence searching by
11 HMM-HMM alignment
12 Nature Methods 9, 173-175 (2012)
13 
14 source code can be downloaded from:
15 
16 ftp://toolkit.genzentrum.lmu.de/pub/
17 """
18 
19 class cluster:
20 
21  """
22  Holds the information of one cluster
23 
24  .. attribute:: sequences
25 
26  SequenceList containing all sequences of the cluster
27 
28  .. attribute:: representative_id
29 
30  a string, that contains the name of the representative sequence of the cluster
31  as estimated by kClust.
32 
33  .. attribute:: alignment
34 
35  alignment handle containing a multiple sequence alignment of all sequences of
36  the cluster. Gets only calculated when binding is called with appropriate flag.
37  """
38 
39  def __init__(self, sequences, representative_id):
40 
41  self.sequences=sequences
42  self.representative_id=representative_id
43  self.alignment=None
44 
45 def _SetupFiles(sequences):
46 
47  # create temporary directory
48  tmp_dir_name=tempfile.mkdtemp()
49  io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,'fastadb.fasta'))
50  return tmp_dir_name
51 
52 def _CleanupFiles(tmp_dir_name):
53 
54  import shutil
55  shutil.rmtree(tmp_dir_name)
56 
57 def _ParseOutput(tmp_dir_name):
58 
59  with open(os.path.join(tmp_dir_name,'headers.dmp'),'r') as f:
60  header_data=f.readlines()
61  with open(os.path.join(tmp_dir_name,'clusters.dmp'),'r') as f:
62  cluster_data=f.readlines()
63  sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))
64 
65  clusters=dict()
66  header_mapper=dict()
67  for line in header_data:
68  header_mapper[int(line.split()[0])]=line.split()[1].strip().strip('>')
69 
70  #find numeric ids of the representatives of the clusters
71  unique_representatives=list()
72  for line in cluster_data[1:]:
73  actual_cluster=int(line.split()[1])
74  try:
75  unique_representatives.index(actual_cluster)
76  except:
77  unique_representatives.append(actual_cluster)
78 
79  #assign every header to its corresponding cluster, where the
80  #cluster id is given by the id of the representative of the cluster
81  for idx in unique_representatives:
82  clusters[idx]=seq.CreateSequenceList()
83  for line in cluster_data[1:]:
84  clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))
85 
86  #translate into final output
87 
88  res=list()
89  for k, v in clusters.items():
90  res.append(cluster(v, header_mapper[k]))
91 
92  return res
93 
94 def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):
95 
96  bitscore=clustering_thresh*0.060269-0.68498
97 
98  executable=settings.Locate('kClust')
99 
100  cmd=[]
101  cmd.append(executable)
102  cmd.append('-i')
103  cmd.append(os.path.join(tmp_dir_name,'fastadb.fasta'))
104  cmd.append('-d')
105  cmd.append(tmp_dir_name)
106  cmd.append('-s')
107  cmd.append(str(bitscore))
108 
109  cmd=' '.join(cmd)
110  ps=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
111  stdout, stderr = ps.communicate()
112 
113  result=_ParseOutput(tmp_dir_name)
114 
115  if(create_alignments):
116  from ost.bindings import clustalw
117  for c in result:
118  if len(c.sequences)>1:
119  c.alignment=clustalw.ClustalW(c.sequences)
120  else:
121  aln=seq.CreateAlignment()
122  aln.AddSequence(c.sequences[0])
123  c.alignment=aln
124 
125  return result
126 
127 def kClust(sequences, clustering_thresh=30, create_alignments=False):
128 
129  """
130  Uses kClust to generate clusters of amino acid sequences.
131 
132  :param sequences: All sequences you want to cluster.
133  :type sequences: :class:`ost.seq.SequenceList`
134 
135  :param clustering_thres: Sequence identity threshold to build clusters. Note,
136  that clustering_thresh is more a rule of thumb, since
137  compositional bias in the sequence can also play a role.
138  The value gets transformed in a bitscore, that is used
139  as an input parameter of kClust
140 
141  :param create_alignments: Flag, wether the alignments of the clusters get calculated.
142  Requires clustalw in the path.
143 
144  :returns: A list of cluster instances
145 
146  :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
147  """
148 
149  tmp_dir_name=_SetupFiles(sequences)
150  result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
151  _CleanupFiles(tmp_dir_name)
152  return result