1 import subprocess, os, tempfile
2 from ost
import io, seq, settings
5 Wrapper for kClust a protein sequence clustering algorithm
7 unpublished, but mentioned in:
9 Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
10 HHblits: lighting-fast iterative protein sequence searching by
12 Nature Methods 9, 173-175 (2012)
14 source code can be downloaded from:
16 ftp://toolkit.genzentrum.lmu.de/pub/
22 Holds the information of one cluster
24 .. attribute:: sequences
26 SequenceList containing all sequences of the cluster
28 .. attribute:: representative_id
30 a string, that contains the name of the representative sequence of the cluster
31 as estimated by kClust.
33 .. attribute:: alignment
35 alignment handle containing a multiple sequence alignment of all sequences of
36 the cluster. Gets only calculated when binding is called with appropriate flag.
39 def __init__(self, sequences, representative_id):
45 def _SetupFiles(sequences):
48 tmp_dir_name=tempfile.mkdtemp()
49 io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,
'fastadb.fasta'))
52 def _CleanupFiles(tmp_dir_name):
55 shutil.rmtree(tmp_dir_name)
57 def _ParseOutput(tmp_dir_name):
59 header_data=open(os.path.join(tmp_dir_name,
'headers.dmp'),
'r').readlines()
60 cluster_data=open(os.path.join(tmp_dir_name,'clusters.dmp'),
'r').readlines()
61 sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))
65 for line
in header_data:
66 header_mapper[int(line.split()[0])]=line.split()[1].strip().strip(
'>')
69 unique_representatives=list()
70 for line
in cluster_data[1:]:
71 actual_cluster=int(line.split()[1])
73 unique_representatives.index(actual_cluster)
75 unique_representatives.append(actual_cluster)
79 for idx
in unique_representatives:
80 clusters[idx]=seq.CreateSequenceList()
81 for line
in cluster_data[1:]:
82 clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))
87 for k, v
in clusters.iteritems():
88 res.append(
cluster(v, header_mapper[k]))
92 def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):
94 bitscore=clustering_thresh*0.060269-0.68498
96 executable=settings.Locate(
'kClust')
99 cmd.append(executable)
101 cmd.append(os.path.join(tmp_dir_name,
'fastadb.fasta'))
103 cmd.append(tmp_dir_name)
105 cmd.append(str(bitscore))
108 ps=subprocess.Popen(cmd, shell=
True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
109 stdout, stderr = ps.communicate()
111 result=_ParseOutput(tmp_dir_name)
113 if(create_alignments):
116 if len(c.sequences)>1:
117 c.alignment=clustalw.ClustalW(c.sequences)
119 aln=seq.CreateAlignment()
120 aln.AddSequence(c.sequences[0])
125 def kClust(sequences, clustering_thresh=30, create_alignments=False):
128 Uses kClust to generate clusters of amino acid sequences.
130 :param sequences: All sequences you want to cluster.
131 :type sequences: :class:`ost.seq.SequenceList`
133 :param clustering_thres: Sequence identity threshold to build clusters. Note,
134 that clustering_thresh is more a rule of thumb, since
135 compositional bias in the sequence can also play a role.
136 The value gets transformed in a bitscore, that is used
137 as an input parameter of kClust
139 :param create_alignments: Flag, wether the alignments of the clusters get calculated.
140 Requires clustalw in the path.
142 :returns: A list of cluster instances
144 :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
147 tmp_dir_name=_SetupFiles(sequences)
148 result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
149 _CleanupFiles(tmp_dir_name)