00001 import subprocess, os, tempfile
00002 from ost import io, seq, settings
00003
00004 """
00005 Wrapper for kClust a protein sequence clustering algorithm
00006
00007 unpublished, but mentioned in:
00008
00009 Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
00010 HHblits: lighting-fast iterative protein sequence searching by
00011 HMM-HMM alignment
00012 Nature Methods 9, 173-175 (2012)
00013
00014 source code can be downloaded from:
00015
00016 ftp://toolkit.genzentrum.lmu.de/pub/
00017 """
00018
00019 class cluster:
00020
00021 """
00022 Holds the information of one cluster
00023
00024 .. attribute:: sequences
00025
00026 SequenceList containing all sequences of the cluster
00027
00028 .. attribute:: representative_id
00029
00030 a string, that contains the name of the representative sequence of the cluster
00031 as estimated by kClust.
00032
00033 .. attribute:: alignment
00034
00035 alignment handle containing a multiple sequence alignment of all sequences of
00036 the cluster. Gets only calculated when binding is called with appropriate flag.
00037 """
00038
00039 def __init__(self, sequences, representative_id):
00040
00041 self.sequences=sequences
00042 self.representative_id=representative_id
00043 self.alignment=None
00044
00045 def _SetupFiles(sequences):
00046
00047
00048 tmp_dir_name=tempfile.mkdtemp()
00049 io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,'fastadb.fasta'))
00050 return tmp_dir_name
00051
00052 def _CleanupFiles(tmp_dir_name):
00053
00054 import shutil
00055 shutil.rmtree(tmp_dir_name)
00056
00057 def _ParseOutput(tmp_dir_name):
00058
00059 header_data=open(os.path.join(tmp_dir_name,'headers.dmp'),'r').readlines()
00060 cluster_data=open(os.path.join(tmp_dir_name,'clusters.dmp'),'r').readlines()
00061 sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))
00062
00063 clusters=dict()
00064 header_mapper=dict()
00065 for line in header_data:
00066 header_mapper[int(line.split()[0])]=line.split()[1].strip().strip('>')
00067
00068
00069 unique_representatives=list()
00070 for line in cluster_data[1:]:
00071 actual_cluster=int(line.split()[1])
00072 try:
00073 unique_representatives.index(actual_cluster)
00074 except:
00075 unique_representatives.append(actual_cluster)
00076
00077
00078
00079 for idx in unique_representatives:
00080 clusters[idx]=seq.CreateSequenceList()
00081 for line in cluster_data[1:]:
00082 clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))
00083
00084
00085
00086 res=list()
00087 for k, v in clusters.iteritems():
00088 res.append(cluster(v, header_mapper[k]))
00089
00090 return res
00091
00092 def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):
00093
00094 bitscore=clustering_thresh*0.060269-0.68498
00095
00096 executable=settings.Locate('kClust')
00097
00098 cmd=[]
00099 cmd.append(executable)
00100 cmd.append('-i')
00101 cmd.append(os.path.join(tmp_dir_name,'fastadb.fasta'))
00102 cmd.append('-d')
00103 cmd.append(tmp_dir_name)
00104 cmd.append('-s')
00105 cmd.append(str(bitscore))
00106
00107 cmd=' '.join(cmd)
00108 ps=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
00109 stdout, stderr = ps.communicate()
00110
00111 result=_ParseOutput(tmp_dir_name)
00112
00113 if(create_alignments):
00114 from ost.bindings import clustalw
00115 for c in result:
00116 if len(c.sequences)>1:
00117 c.alignment=clustalw.ClustalW(c.sequences)
00118 else:
00119 aln=seq.CreateAlignment()
00120 aln.AddSequence(c.sequences[0])
00121 c.alignment=aln
00122
00123 return result
00124
00125 def kClust(sequences, clustering_thresh=30, create_alignments=False):
00126
00127 """
00128 Uses kClust to generate clusters of amino acid sequences.
00129
00130 :param sequences: All sequences you want to cluster.
00131 :type sequences: :class:`ost.seq.SequenceList`
00132
00133 :param clustering_thres: Sequence identity threshold to build clusters. Note,
00134 that clustering_thresh is more a rule of thumb, since
00135 compositional bias in the sequence can also play a role.
00136 The value gets transformed in a bitscore, that is used
00137 as an input parameter of kClust
00138
00139 :param create_alignments: Flag, wether the alignments of the clusters get calculated.
00140 Requires clustalw in the path.
00141
00142 :returns: A list of cluster instances
00143
00144 :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
00145 """
00146
00147 tmp_dir_name=_SetupFiles(sequences)
00148 result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
00149 _CleanupFiles(tmp_dir_name)
00150 return result