1 import subprocess, os, tempfile
2 from ost
import io, seq, settings
5 Wrapper for kClust a protein sequence clustering algorithm
7 unpublished, but mentioned in:
9 Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
10 HHblits: lighting-fast iterative protein sequence searching by
12 Nature Methods 9, 173-175 (2012)
14 source code can be downloaded from:
16 ftp://toolkit.genzentrum.lmu.de/pub/
22 Holds the information of one cluster
24 .. attribute:: sequences
26 SequenceList containing all sequences of the cluster
28 .. attribute:: representative_id
30 a string, that contains the name of the representative sequence of the cluster
31 as estimated by kClust.
33 .. attribute:: alignment
35 alignment handle containing a multiple sequence alignment of all sequences of
36 the cluster. Gets only calculated when binding is called with appropriate flag.
39 def __init__(self, sequences, representative_id):
45 def _SetupFiles(sequences):
48 tmp_dir_name=tempfile.mkdtemp()
49 io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,
'fastadb.fasta'))
52 def _CleanupFiles(tmp_dir_name):
55 shutil.rmtree(tmp_dir_name)
57 def _ParseOutput(tmp_dir_name):
59 with open(os.path.join(tmp_dir_name,
'headers.dmp'),
'r') as f:
60 header_data=f.readlines()
61 with open(os.path.join(tmp_dir_name,'clusters.dmp'),
'r') as f:
62 cluster_data=f.readlines()
63 sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))
67 for line
in header_data:
68 header_mapper[int(line.split()[0])]=line.split()[1].strip().strip(
'>')
71 unique_representatives=list()
72 for line
in cluster_data[1:]:
73 actual_cluster=int(line.split()[1])
75 unique_representatives.index(actual_cluster)
77 unique_representatives.append(actual_cluster)
81 for idx
in unique_representatives:
82 clusters[idx]=seq.CreateSequenceList()
83 for line
in cluster_data[1:]:
84 clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))
89 for k, v
in clusters.items():
90 res.append(
cluster(v, header_mapper[k]))
94 def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):
96 bitscore=clustering_thresh*0.060269-0.68498
98 executable=settings.Locate(
'kClust')
101 cmd.append(executable)
103 cmd.append(os.path.join(tmp_dir_name,
'fastadb.fasta'))
105 cmd.append(tmp_dir_name)
107 cmd.append(str(bitscore))
110 ps=subprocess.Popen(cmd, shell=
True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
111 stdout, stderr = ps.communicate()
113 result=_ParseOutput(tmp_dir_name)
115 if(create_alignments):
118 if len(c.sequences)>1:
119 c.alignment=clustalw.ClustalW(c.sequences)
121 aln=seq.CreateAlignment()
122 aln.AddSequence(c.sequences[0])
127 def kClust(sequences, clustering_thresh=30, create_alignments=False):
130 Uses kClust to generate clusters of amino acid sequences.
132 :param sequences: All sequences you want to cluster.
133 :type sequences: :class:`ost.seq.SequenceList`
135 :param clustering_thres: Sequence identity threshold to build clusters. Note,
136 that clustering_thresh is more a rule of thumb, since
137 compositional bias in the sequence can also play a role.
138 The value gets transformed in a bitscore, that is used
139 as an input parameter of kClust
141 :param create_alignments: Flag, wether the alignments of the clusters get calculated.
142 Requires clustalw in the path.
144 :returns: A list of cluster instances
146 :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
149 tmp_dir_name=_SetupFiles(sequences)
150 result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
151 _CleanupFiles(tmp_dir_name)