d6/df0/kclust_8py_source.html

import subprocess, os, tempfile

from ost import io, seq, settings


"""

Wrapper for kClust a protein sequence clustering algorithm


unpublished, but mentioned in:


Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding

HHblits: lighting-fast iterative protein sequence searching by

HMM-HMM alignment

Nature Methods 9, 173-175 (2012)


source code can be downloaded from:


ftp://toolkit.genzentrum.lmu.de/pub/

"""


class cluster:


  """

  Holds the information of one cluster


  .. attribute:: sequences


    SequenceList containing all sequences of the cluster


  .. attribute:: representative_id


    a string, that contains the name of the representative sequence of the cluster

    as estimated by kClust.


  .. attribute:: alignment


    alignment handle containing a multiple sequence alignment of all sequences of

    the cluster. Gets only calculated when binding is called with appropriate flag.

  """


  def __init__(self, sequences, representative_id):


    self.sequences=sequences

    self.representative_id=representative_id

    self.alignment=None


def _SetupFiles(sequences):


  # create temporary directory

  tmp_dir_name=tempfile.mkdtemp()

  io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,'fastadb.fasta'))

  return tmp_dir_name


def _CleanupFiles(tmp_dir_name):


  import shutil

  shutil.rmtree(tmp_dir_name)


def _ParseOutput(tmp_dir_name):


  with open(os.path.join(tmp_dir_name,'headers.dmp'),'r') as f:

    header_data=f.readlines()

  with open(os.path.join(tmp_dir_name,'clusters.dmp'),'r') as f:

    cluster_data=f.readlines()

  sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))


  clusters=dict()

  header_mapper=dict()

  for line in header_data:

    header_mapper[int(line.split()[0])]=line.split()[1].strip().strip('>')


  #find numeric ids of the representatives of the clusters

  unique_representatives=list()

  for line in cluster_data[1:]:

    actual_cluster=int(line.split()[1])

    try:

      unique_representatives.index(actual_cluster)

    except:

      unique_representatives.append(actual_cluster)


  #assign every header to its corresponding cluster, where the

  #cluster id is given by the id of the representative of the cluster

  for idx in unique_representatives:

    clusters[idx]=seq.CreateSequenceList()

  for line in cluster_data[1:]:

    clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))


  #translate into final output


  res=list()

  for k, v in clusters.items():

    res.append(cluster(v, header_mapper[k]))


  return res


def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):


  bitscore=clustering_thresh*0.060269-0.68498


  executable=settings.Locate('kClust')


  cmd=[]

  cmd.append(executable)

  cmd.append('-i')

  cmd.append(os.path.join(tmp_dir_name,'fastadb.fasta'))

  cmd.append('-d')

  cmd.append(tmp_dir_name)

  cmd.append('-s')

  cmd.append(str(bitscore))


  cmd=' '.join(cmd)

  ps=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

  stdout, stderr = ps.communicate()


  result=_ParseOutput(tmp_dir_name)


  if(create_alignments):

    from ost.bindings import clustalw

    for c in result:

      if len(c.sequences)>1:

        c.alignment=clustalw.ClustalW(c.sequences)

      else:

        aln=seq.CreateAlignment()

        aln.AddSequence(c.sequences[0])

        c.alignment=aln


  return result


def kClust(sequences, clustering_thresh=30, create_alignments=False):


  """

  Uses kClust to generate clusters of amino acid sequences.


  :param sequences: All sequences you want to cluster.

  :type sequences: :class:`ost.seq.SequenceList`


  :param clustering_thres: Sequence identity threshold to build clusters. Note,

                           that clustering_thresh is more a rule of thumb, since

                           compositional bias in the sequence can also play a role.

                           The value gets transformed in a bitscore, that is used

                           as an input parameter of kClust


  :param create_alignments: Flag, wether the alignments of the clusters get calculated.

                            Requires clustalw in the path.


  :returns: A list of cluster instances


  :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.

  """


  tmp_dir_name=_SetupFiles(sequences)

  result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)

  _CleanupFiles(tmp_dir_name)

  return result


ost::bindings.kclust.cluster
Definition kclust.py:19

ost::bindings.kclust.cluster.sequences
sequences
Definition kclust.py:41

ost::bindings.kclust.cluster.alignment
alignment
Definition kclust.py:43

ost::bindings.kclust.cluster.__init__
__init__(self, sequences, representative_id)
Definition kclust.py:39

ost::bindings.kclust.cluster.representative_id
representative_id
Definition kclust.py:42

ost::bindings.kclust._CleanupFiles
_CleanupFiles(tmp_dir_name)
Definition kclust.py:52

ost::bindings.kclust._ParseOutput
_ParseOutput(tmp_dir_name)
Definition kclust.py:57

ost::bindings.kclust.kClust
kClust(sequences, clustering_thresh=30, create_alignments=False)
Definition kclust.py:127

ost::bindings.kclust._RunkClust
_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
Definition kclust.py:94

ost::bindings.kclust._SetupFiles
_SetupFiles(sequences)
Definition kclust.py:45

ost::bindings
Definition wrap_tmalign.hh:28