OpenStructure
Loading...
Searching...
No Matches
kclust.py
Go to the documentation of this file.
1import subprocess, os, tempfile
2from ost import io, seq, settings
3
4"""
5Wrapper for kClust a protein sequence clustering algorithm
6
7unpublished, but mentioned in:
8
9Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
10HHblits: lighting-fast iterative protein sequence searching by
11HMM-HMM alignment
12Nature Methods 9, 173-175 (2012)
13
14source code can be downloaded from:
15
16ftp://toolkit.genzentrum.lmu.de/pub/
17"""
18
19class cluster:
20
21 """
22 Holds the information of one cluster
23
24 .. attribute:: sequences
25
26 SequenceList containing all sequences of the cluster
27
28 .. attribute:: representative_id
29
30 a string, that contains the name of the representative sequence of the cluster
31 as estimated by kClust.
32
33 .. attribute:: alignment
34
35 alignment handle containing a multiple sequence alignment of all sequences of
36 the cluster. Gets only calculated when binding is called with appropriate flag.
37 """
38
39 def __init__(self, sequences, representative_id):
40
41 self.sequences=sequences
42 self.representative_id=representative_id
43 self.alignment=None
44
45def _SetupFiles(sequences):
46
47 # create temporary directory
48 tmp_dir_name=tempfile.mkdtemp()
49 io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,'fastadb.fasta'))
50 return tmp_dir_name
51
52def _CleanupFiles(tmp_dir_name):
53
54 import shutil
55 shutil.rmtree(tmp_dir_name)
56
57def _ParseOutput(tmp_dir_name):
58
59 with open(os.path.join(tmp_dir_name,'headers.dmp'),'r') as f:
60 header_data=f.readlines()
61 with open(os.path.join(tmp_dir_name,'clusters.dmp'),'r') as f:
62 cluster_data=f.readlines()
63 sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))
64
65 clusters=dict()
66 header_mapper=dict()
67 for line in header_data:
68 header_mapper[int(line.split()[0])]=line.split()[1].strip().strip('>')
69
70 #find numeric ids of the representatives of the clusters
71 unique_representatives=list()
72 for line in cluster_data[1:]:
73 actual_cluster=int(line.split()[1])
74 try:
75 unique_representatives.index(actual_cluster)
76 except:
77 unique_representatives.append(actual_cluster)
78
79 #assign every header to its corresponding cluster, where the
80 #cluster id is given by the id of the representative of the cluster
81 for idx in unique_representatives:
82 clusters[idx]=seq.CreateSequenceList()
83 for line in cluster_data[1:]:
84 clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))
85
86 #translate into final output
87
88 res=list()
89 for k, v in clusters.items():
90 res.append(cluster(v, header_mapper[k]))
91
92 return res
93
94def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):
95
96 bitscore=clustering_thresh*0.060269-0.68498
97
98 executable=settings.Locate('kClust')
99
100 cmd=[]
101 cmd.append(executable)
102 cmd.append('-i')
103 cmd.append(os.path.join(tmp_dir_name,'fastadb.fasta'))
104 cmd.append('-d')
105 cmd.append(tmp_dir_name)
106 cmd.append('-s')
107 cmd.append(str(bitscore))
108
109 cmd=' '.join(cmd)
110 ps=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
111 stdout, stderr = ps.communicate()
112
113 result=_ParseOutput(tmp_dir_name)
114
115 if(create_alignments):
116 from ost.bindings import clustalw
117 for c in result:
118 if len(c.sequences)>1:
119 c.alignment=clustalw.ClustalW(c.sequences)
120 else:
121 aln=seq.CreateAlignment()
122 aln.AddSequence(c.sequences[0])
123 c.alignment=aln
124
125 return result
126
127def kClust(sequences, clustering_thresh=30, create_alignments=False):
128
129 """
130 Uses kClust to generate clusters of amino acid sequences.
131
132 :param sequences: All sequences you want to cluster.
133 :type sequences: :class:`ost.seq.SequenceList`
134
135 :param clustering_thres: Sequence identity threshold to build clusters. Note,
136 that clustering_thresh is more a rule of thumb, since
137 compositional bias in the sequence can also play a role.
138 The value gets transformed in a bitscore, that is used
139 as an input parameter of kClust
140
141 :param create_alignments: Flag, wether the alignments of the clusters get calculated.
142 Requires clustalw in the path.
143
144 :returns: A list of cluster instances
145
146 :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
147 """
148
149 tmp_dir_name=_SetupFiles(sequences)
150 result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
151 _CleanupFiles(tmp_dir_name)
152 return result
__init__(self, sequences, representative_id)
Definition kclust.py:39
_CleanupFiles(tmp_dir_name)
Definition kclust.py:52
_ParseOutput(tmp_dir_name)
Definition kclust.py:57
kClust(sequences, clustering_thresh=30, create_alignments=False)
Definition kclust.py:127
_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
Definition kclust.py:94
_SetupFiles(sequences)
Definition kclust.py:45