00001 import csv
00002 import re
00003 import math
00004 from ost import stutil
00005 import itertools
00006 import operator
00007 import cPickle
00008 import weakref
00009 from ost import LogError, LogWarning, LogInfo, LogVerbose
00010
00011 def MakeTitle(col_name):
00012 return col_name.replace('_', ' ')
00013
00014 def IsStringLike(value):
00015 if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
00016 return False
00017 try:
00018 value+''
00019 return True
00020 except:
00021 return False
00022
00023 def IsNullString(value):
00024 value=value.strip().upper()
00025 return value in ('', 'NULL', 'NONE', 'NA')
00026
00027 def IsScalar(value):
00028 if IsStringLike(value):
00029 return True
00030 try:
00031 if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
00032 return False
00033 iter(value)
00034 return False
00035 except:
00036 return True
00037
00038 def GuessColumnType(iterator):
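  """
  Guess a column type ('bool', 'int', 'float' or 'string') for the values
  yielded by *iterator*. Null-like strings (see :func:`IsNullString`) are
  skipped; if all values are null-like, or no numeric/boolean interpretation
  fits every value, 'string' is returned.
  """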
00039 empty=True
00040 possibilities=set(['bool', 'int', 'float'])
00041 for ele in iterator:
00042 str_ele=str(ele).upper()
00043 if IsNullString(str_ele):
00044 continue
00045 empty=False
00046 if 'int' in possibilities:
00047 try:
00048 int(str_ele)
00049 except ValueError:
00050 possibilities.remove('int')
00051
00052 if 'float' in possibilities:
00053 try:
00054 float(str_ele)
00055 except ValueError:
00056 possibilities.remove('float')
00057 if 'bool' in possibilities:
00058 if str_ele not in set(['YES', 'NO', 'TRUE', 'FALSE']):
00059 possibilities.remove('bool')
00060
00061 if len(possibilities)==0:
00062 return 'string'
00063 if len(possibilities)==2:
00064 return 'int'
00065 if empty:
00066 return 'string'
00067
00068 return possibilities.pop()
00069
00070 class BinaryColExpr:
00071 def __init__(self, op, lhs, rhs):
00072 self.op=op
00073 self.lhs=lhs
00074 self.rhs=rhs
00075 if IsScalar(lhs):
00076 self.lhs=itertools.cycle([self.lhs])
00077 if IsScalar(rhs):
00078 self.rhs=itertools.cycle([self.rhs])
00079 def __iter__(self):
00080 for l, r in zip(self.lhs, self.rhs):
00081 if l!=None and r!=None:
00082 yield self.op(l, r)
00083 else:
00084 yield None
00085 def __add__(self, rhs):
00086 return BinaryColExpr(operator.add, self, rhs)
00087
00088 def __sub__(self, rhs):
00089 return BinaryColExpr(operator.sub, self, rhs)
00090
00091 def __mul__(self, rhs):
00092 return BinaryColExpr(operator.mul, self, rhs)
00093
00094 def __div__(self, rhs):
00095 return BinaryColExpr(operator.div, self, rhs)
00096
00097 class TableCol:
00098 def __init__(self, table, col):
00099 self._table=table
00100 if type(col)==str:
00101 self.col_index=self._table.GetColIndex(col)
00102 else:
00103 self.col_index=col
00104
00105 def __iter__(self):
00106 for row in self._table.rows:
00107 yield row[self.col_index]
00108
00109 def __len__(self):
00110 return len(self._table.rows)
00111
00112 def __getitem__(self, index):
00113 return self._table.rows[index][self.col_index]
00114
00115 def __setitem__(self, index, value):
00116 self._table.rows[index][self.col_index]=value
00117
00118 def __add__(self, rhs):
00119 return BinaryColExpr(operator.add, self, rhs)
00120
00121 def __sub__(self, rhs):
00122 return BinaryColExpr(operator.sub, self, rhs)
00123
00124 def __mul__(self, rhs):
00125 return BinaryColExpr(operator.mul, self, rhs)
00126
00127 def __div__(self, rhs):
00128 return BinaryColExpr(operator.div, self, rhs)
00129
00130 class TableRow:
00131 """
00132 Essentially a named tuple, but allows column names that are not valid
00133 python variable names.
00134 """
00135 def __init__(self, row_data, tab):
00136 self.__dict__['tab'] = weakref.proxy(tab)
00137 self.__dict__['row_data'] = row_data
00138
00139 def __getitem__(self, col_name):
00140 if type(col_name)==int:
00141 return self.row_data[col_name]
00142 return self.row_data[self.tab.GetColIndex(col_name)]
00143
00144 def __str__(self):
00145 s = []
00146 for k, v in zip(self.__dict__['tab'].col_names, self.__dict__['row_data']):
00147 s.append('%s=%s' % (k, str(v)))
00148 return ', '.join(s)
00149
00150
00151 def __len__(self):
00152 return len(self.row_data)
00153
00154 def __setitem__(self, col_name, val):
00155 if type(col_name)==int:
00156 self.row_data[col_name] = val
00157 else:
00158 self.row_data[self.tab.GetColIndex(col_name)] = val
00159
00160 def __getattr__(self, col_name):
00161 if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
00162 raise AttributeError(col_name)
00163 return self.row_data[self.tab.GetColIndex(col_name)]
00164
00165 def __setattr__(self, col_name, val):
00166 if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
00167 raise AttributeError(col_name)
00168 self.row_data[self.tab.GetColIndex(col_name)] = val
00169
00170 class Table(object):
00171 """
00172
00173 The table class provides convenient access to data in tabular form. An empty
00174 table can be easily constructed as follows
00175
00176 .. code-block:: python
00177
00178 tab = Table()
00179
00180 If you want to add columns directly when creating the table, column names
00181 and *column types* can be specified as follows
00182
00183 .. code-block:: python
00184
00185 tab = Table(['nameX','nameY','nameZ'], 'sfb')
00186
00187 This will create three columns called nameX, nameY and nameZ of type string,
00188 float and bool, respectively. There will be no data in the table and thus
00189 the table will not contain any rows.
00190
00191 The following *column types* are supported:
00192
00193 ======= ========
00194 name abbrev
00195 ======= ========
00196 string s
00197 float f
00198 int i
00199 bool b
00200 ======= ========
00201
00202 If you want to add data to the table in addition, use the following:
00203
00204 .. code-block:: python
00205
00206 tab=Table(['nameX','nameY','nameZ'],
00207 'sfb',
00208 nameX = ['a','b','c'],
00209 nameY = [0.1, 1.2, 3.414],
00210 nameZ = [True, False, False])
00211
00212 If values for one column are left out, they will be filled with NA. If
00213 values are specified, the same number of values must be specified for
00214 every column.
00215
00216 """
00217
00218 SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
00219
00220
00221 def __init__(self, col_names=[], col_types=None, **kwargs):
00222
00223 self.col_names=list(col_names)
00224 self.comment=''
00225 self.name=''
00226
00227 self.col_types = self._ParseColTypes(col_types)
00228 self.rows=[]
00229 if len(kwargs)>=0:
00230 if not col_names:
00231 self.col_names=[v for v in kwargs.keys()]
00232 if not self.col_types:
00233 self.col_types=['string' for u in range(len(self.col_names))]
00234 if len(kwargs)>0:
00235 self._AddRowsFromDict(kwargs)
00236
00237 def __getattr__(self, col_name):
00238
00239
00240
00241
00242 if 'col_names' not in self.__dict__ or col_name not in self.col_names:
00243 raise AttributeError(col_name)
00244 return TableCol(self, col_name)
00245
00246 @staticmethod
00247 def _ParseColTypes(types, exp_num=None):
00248 if types==None:
00249 return None
00250
00251 short2long = {'s' : 'string', 'i': 'int', 'b' : 'bool', 'f' : 'float'}
00252 allowed_short = short2long.keys()
00253 allowed_long = short2long.values()
00254
00255 type_list = []
00256
00257
00258 if IsScalar(types):
00259 if type(types)==str:
00260 types = types.lower()
00261
00262
00263 if types in allowed_long:
00264 type_list.append(types)
00265 elif types in allowed_short:
00266 type_list.append(short2long[types])
00267
00268
00269 elif types.find(',')!=-1:
00270 for t in types.split(','):
00271 if t in allowed_long:
00272 type_list.append(t)
00273 elif t in allowed_short:
00274 type_list.append(short2long[t])
00275 else:
00276 raise ValueError('Unknown type %s in types %s'%(t,types))
00277
00278
00279 else:
00280 for t in types:
00281 if t in allowed_short:
00282 type_list.append(short2long[t])
00283 else:
00284 raise ValueError('Unknown type %s in types %s'%(t,types))
00285
00286
00287 else:
00288 raise ValueError('Col type %s must be string or list'%types)
00289
00290
00291 else:
00292 for t in types:
00293
00294 if type(t)==str:
00295 t = t.lower()
00296 if t in allowed_long:
00297 type_list.append(t)
00298 elif t in allowed_short:
00299 type_list.append(short2long[t])
00300 else:
00301 raise ValueError('Unknown type %s in types %s'%(t,types))
00302
00303
00304 else:
00305 raise ValueError('Col type %s must be string or list'%types)
00306
00307 if exp_num:
00308 if len(type_list)!=exp_num:
00309 raise ValueError(('Parsed number of col types (%i) differs from ' + \
00310 'expected (%i) in types %s')%(len(type_list),exp_num,types))
00311
00312 return type_list
00313
00314 def SetName(self, name):
00315 '''
00316 Set name of the table
00317
00318 :param name: name
00319 :type name: :class:`str`
00320 '''
00321 self.name = name
00322
00323 def GetName(self):
00324 '''
00325 Get name of table
00326 '''
00327 return self.name
00328
00329 def RenameCol(self, old_name, new_name):
00330 """
00331 Rename column *old_name* to *new_name*.
00332
00333 :param old_name: Name of the old column
00334 :param new_name: Name of the new column
00335 :raises: :exc:`ValueError` when *old_name* is not a valid column
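
    **Example** (a minimal sketch; the column names are placeholders):

    .. code-block:: python

      tab = Table(['old_name'], 'f', old_name=[1.0, 2.0])
      tab.RenameCol('old_name', 'new_name')
      print tab.col_names   # ['new_name']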
00336 """
00337 if old_name==new_name:
00338 return
00339 self.AddCol(new_name, self.col_types[self.GetColIndex(old_name)],
00340 self[old_name])
00341 self.RemoveCol(old_name)
00342 def _Coerce(self, value, ty):
00343 '''
00344 Try to convert values (e.g. from :class:`str` type) to the specified type
00345
00346 :param value: the value
00347 :type value: any type
00348
00349 :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
00350 *bool*)
00351 :type ty: :class:`str`
00352 '''
00353 if value=='NA' or value==None:
00354 return None
00355 if ty=='int':
00356 return int(value)
00357 if ty=='float':
00358 return float(value)
00359 if ty=='string':
00360 return str(value)
00361 if ty=='bool':
00362 if isinstance(value, str) or isinstance(value, unicode):
00363 if value.upper() in ('FALSE', 'NO',):
00364 return False
00365 return True
00366 return bool(value)
00367 raise ValueError('Unknown type %s' % ty)
00368
00369 def GetColIndex(self, col):
00370 '''
00371 Returns the column index for the column with the given name.
00372
00373 :raises: ValueError if no column with the name is found.
00374 '''
00375 if col not in self.col_names:
00376 raise ValueError('Table has no column named "%s"' % col)
00377 return self.col_names.index(col)
00378
00379 def GetColNames(self):
00380 '''
00381 Returns a list containing all column names.
00382 '''
00383 return self.col_names
00384
00385 def SearchColNames(self, regex):
00386 '''
00387 Returns a list of column names matching the regex.
00388
00389 :param regex: regex pattern
00390 :type regex: :class:`str`
00391
00392 :returns: :class:`list` of column names (:class:`str`)
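
    **Example** (a sketch with made-up column names):

    .. code-block:: python

      tab = Table(['x_mean', 'x_std', 'label'], 'ffs')
      print tab.SearchColNames('^x_')   # ['x_mean', 'x_std']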
00393 '''
00394 matching_names = []
00395 for name in self.col_names:
00396 matches = re.search(regex, name)
00397 if matches:
00398 matching_names.append(name)
00399 return matching_names
00400
00401 def HasCol(self, col):
00402 '''
00403 Checks if the column with a given name is present in the table.
00404 '''
00405 return col in self.col_names
00406
00407 def __getitem__(self, k):
00408 if type(k)==int:
00409 return TableCol(self, self.col_names[k])
00410 else:
00411 return TableCol(self, k)
00412
00413 def __setitem__(self, k, value):
00414 col_index=k
00415 if type(k)!=int:
00416 col_index=self.GetColIndex(k)
00417 if IsScalar(value):
00418 value=itertools.cycle([value])
00419 for r, v in zip(self.rows, value):
00420 r[col_index]=v
00421
00422 def ToString(self, float_format='%.3f', int_format='%d', rows=None):
00423 '''
00424 Convert the table into a string representation.
00425
00426 The output format can be modified for int and float type columns by
00427 specifying a formatting string for the parameters *float_format* and
00428 *int_format*.
00429
00430 The option *rows* specifies the range of rows to be printed. The parameter
00431 must be a type that supports indexing (e.g. a :class:`list`) containing the
00432 start and end row *index*, e.g. [start_row_idx, end_row_idx].
00433
00434 :param float_format: formatting string for float columns
00435 :type float_format: :class:`str`
00436
00437 :param int_format: formatting string for int columns
00438 :type int_format: :class:`str`
00439
00440 :param rows: iterable containing start and end row *index*
00441 :type rows: iterable containing :class:`ints <int>`
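
    **Example** (a small sketch):

    .. code-block:: python

      tab = Table(['x', 'y'], 'if', x=[1, 2, 3], y=[0.5, 1.25, 2.0])
      # print only the first two rows, with two digits after the decimal point
      print tab.ToString(float_format='%.2f', rows=[0, 2])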
00442 '''
00443 widths=[len(cn) for cn in self.col_names]
00444 sel_rows=self.rows
00445 if rows:
00446 sel_rows=self.rows[rows[0]:rows[1]]
00447 for row in sel_rows:
00448 for i, (ty, col) in enumerate(zip(self.col_types, row)):
00449 if col==None:
00450 widths[i]=max(widths[i], len('NA'))
00451 elif ty=='float':
00452 widths[i]=max(widths[i], len(float_format % col))
00453 elif ty=='int':
00454 widths[i]=max(widths[i], len(int_format % col))
00455 else:
00456 widths[i]=max(widths[i], len(str(col)))
00457 s=''
00458 if self.comment:
00459 s+=''.join(['# %s\n' % l for l in self.comment.split('\n')])
00460 total_width=sum(widths)+2*len(widths)
00461 for width, col_name in zip(widths, self.col_names):
00462 s+=col_name.center(width+2)
00463 s+='\n%s\n' % ('-'*total_width)
00464 for row in sel_rows:
00465 for width, ty, col in zip(widths, self.col_types, row):
00466 cs=''
00467 if col==None:
00468 cs='NA'.center(width+2)
00469 elif ty=='float':
00470 cs=(float_format % col).rjust(width+2)
00471 elif ty=='int':
00472 cs=(int_format % col).rjust(width+2)
00473 else:
00474 cs=' '+str(col).ljust(width+1)
00475 s+=cs
00476 s+='\n'
00477 return s
00478
00479 def __str__(self):
00480 return self.ToString()
00481
00482 def Stats(self, col):
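    '''
    Return a summary string for the numeric column *col*: number of rows,
    number of non-None cells, mean, median, standard deviation, minimum and
    maximum.
    '''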
00483 idx = self.GetColIndex(col)
00484 text ='''
00485 Statistics for column %(col)s
00486
00487 Number of Rows : %(num)d
00488 Number of Rows Not None: %(num_non_null)d
00489 Mean : %(mean)f
00490 Median : %(median)f
00491 Standard Deviation : %(stddev)f
00492 Min : %(min)f
00493 Max : %(max)f
00494 '''
00495 data = {
00496 'col' : col,
00497 'num' : len(self.rows),
00498 'num_non_null' : self.Count(col),
00499 'median' : self.Median(col),
00500 'mean' : self.Mean(col),
00501 'stddev' : self.StdDev(col),
00502 'min' : self.Min(col),
00503 'max' : self.Max(col),
00504 }
00505 return text % data
00506
00507 def _AddRowsFromDict(self, d, overwrite=None):
00508 '''
00509 Add one or more rows from a :class:`dictionary <dict>`.
00510
00511 If *overwrite* is not None and set to an existing column name, the specified
00512 column in the table is searched for the first occurrence of a value matching
00513 the value of the column with the same name in the dictionary. If a matching
00514 value is found, the row is overwritten with the dictionary. If no matching
00515 row is found, a new row is appended to the table.
00516
00517 :param d: dictionary containing the data
00518 :type d: :class:`dict`
00519
00520 :param overwrite: column name to overwrite existing row if value in
00521 column *overwrite* matches
00522 :type overwrite: :class:`str`
00523
00524 :raises: :class:`ValueError` if multiple rows are added but the number of
00525 data items is different for different columns.
00526 '''
00527
00528 idxs = [self.GetColIndex(k) for k in d.keys()]
00529
00530
00531 old_len = None
00532 for k,v in d.iteritems():
00533 if IsScalar(v):
00534 v = [v]
00535 d[k] = v
00536 if not old_len:
00537 old_len = len(v)
00538 elif old_len!=len(v):
00539 raise ValueError("Cannot add rows: length of data must be equal " + \
00540 "for all columns in %s"%str(d))
00541
00542
00543 for i,data in enumerate(zip(*d.values())):
00544 new_row = [None for a in range(len(self.col_names))]
00545 for idx,v in zip(idxs,data):
00546 new_row[idx] = self._Coerce(v, self.col_types[idx])
00547
00548
00549 if overwrite:
00550 overwrite_idx = self.GetColIndex(overwrite)
00551 added = False
00552 for i,r in enumerate(self.rows):
00553 if r[overwrite_idx]==new_row[overwrite_idx]:
00554 for j,e in enumerate(self.rows[i]):
00555 if new_row[j]==None:
00556 new_row[j] = e
00557 self.rows[i] = new_row
00558 added = True
00559 break
00560
00561
00562 if not overwrite or not added:
00563 self.rows.append(new_row)
00564
00565 def PairedTTest(self, col_a, col_b):
00566 """
00567 Two-sided test for the null-hypothesis that two related samples
00568 have the same average (expected values).
00569
00570 :param col_a: First column
00571 :type col_a: :class:`str`
00572 :param col_b: Second column
00573 :type col_b: :class:`str`
00574
00575 :returns: P-value of the two-sided test that the two columns have the
00576 same average. The smaller the value, the stronger the evidence that
00577 the two averages differ.
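
    A usage sketch (assumes SciPy is installed; the column names are
    placeholders):

    .. code-block:: python

      tab = Table(['method_a', 'method_b'], 'ff',
                  method_a=[0.81, 0.72, 0.65],
                  method_b=[0.79, 0.70, 0.66])
      p_value = tab.PairedTTest('method_a', 'method_b')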
00578 """
00579 from scipy.stats import ttest_rel
00580 xs = []
00581 ys = []
00582 for x, y in self.Zip(col_a, col_b):
00583 if x!=None and y!=None:
00584 xs.append(x)
00585 ys.append(y)
00586 result = ttest_rel(xs, ys)
00587 return result[1]
00588
00589 def AddRow(self, data, overwrite=None):
00590 """
00591 Add a row to the table.
00592
00593 *data* may either be a dictionary or a list-like object:
00594
00595 - If *data* is a dictionary, the keys in the dictionary must match the
00596 column names. Columns not found in the dict will be initialized to None.
00597 If the dict contains list-like objects, multiple rows will be added, if
00598 the number of items in all list-like objects is the same, otherwise a
00599 :class:`ValueError` is raised.
00600
00601 - If *data* is a list-like object, the row is initialized from the values
00602 in *data*. The number of items in *data* must match the number of
00603 columns in the table. A :class:`ValueError` is raised otherwise. The
00604 values are added in the order specified in the list, thus, the order of
00605 the data must match the columns.
00606
00607 If *overwrite* is not None and set to an existing column name, the specified
00608 column in the table is searched for the first occurrence of a value matching
00609 the value of the column with the same name in the dictionary. If a matching
00610 value is found, the row is overwritten with the dictionary. If no matching
00611 row is found, a new row is appended to the table.
00612
00613 :param data: data to add
00614 :type data: :class:`dict` or *list-like* object
00615
00616 :param overwrite: column name to overwrite existing row if value in
00617 column *overwrite* matches
00618 :type overwrite: :class:`str`
00619
00620 :raises: :class:`ValueError` if *list-like* object is used and number of
00621 items does *not* match number of columns in table.
00622
00623 :raises: :class:`ValueError` if *dict* is used and multiple rows are added
00624 but the number of data items is different for different columns.
00625
00626 **Example:** add multiple data rows to a subset of columns using a dictionary
00627
00628 .. code-block:: python
00629
00630 # create table with three float columns
00631 tab = Table(['x','y','z'], 'fff')
00632
00633 # add rows from dict
00634 data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
00635 tab.AddRow(data)
00636 print tab
00637
00638 '''
00639 will produce the table
00640
00641 ==== ==== ====
00642 x y z
00643 ==== ==== ====
00644 1.20 NA 1.60
00645 1.60 NA 5.30
00646 ==== ==== ====
00647 '''
00648
00649 # overwrite the row with x=1.2 and add row with x=1.9
00650 data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
00651 tab.AddRow(data, overwrite='x')
00652 print tab
00653
00654 '''
00655 will produce the table
00656
00657 ==== ==== ====
00658 x y z
00659 ==== ==== ====
00660 1.20 NA 7.90
00661 1.60 NA 5.30
00662 1.90 NA 3.50
00663 ==== ==== ====
00664 '''
00665 """
00666 if type(data)==dict:
00667 self._AddRowsFromDict(data, overwrite)
00668 else:
00669 if len(data)!=len(self.col_names):
00670 msg='data array must have %d elements, not %d'
00671 raise ValueError(msg % (len(self.col_names), len(data)))
00672 new_row = [self._Coerce(v, t) for v, t in zip(data, self.col_types)]
00673
00674
00675 if overwrite:
00676 overwrite_idx = self.GetColIndex(overwrite)
00677 added = False
00678 for i,r in enumerate(self.rows):
00679 if r[overwrite_idx]==new_row[overwrite_idx]:
00680 self.rows[i] = new_row
00681 added = True
00682 break
00683
00684
00685 if not overwrite or not added:
00686 self.rows.append(new_row)
00687
00688 def RemoveCol(self, col):
00689 """
00690 Remove column with the given name from the table.
00691
00692 :param col: name of column to remove
00693 :type col: :class:`str`
00694 """
00695 idx = self.GetColIndex(col)
00696 del self.col_names[idx]
00697 del self.col_types[idx]
00698 for row in self.rows:
00699 del row[idx]
00700
00701 def AddCol(self, col_name, col_type, data=None):
00702 """
00703 Add a column to the right of the table.
00704
00705 :param col_name: name of new column
00706 :type col_name: :class:`str`
00707
00708 :param col_type: type of new column (long versions: *int*, *float*, *bool*,
00709 *string* or short versions: *i*, *f*, *b*, *s*)
00710 :type col_type: :class:`str`
00711
00712 :param data: data to add to new column
00713 :type data: scalar or iterable
00714
00715 **Example:**
00716
00717 .. code-block:: python
00718
00719 tab = Table(['x'], 'f', x=range(5))
00720 tab.AddCol('even', 'bool', itertools.cycle([True, False]))
00721 print tab
00722
00723 '''
00724 will produce the table
00725
00726 ==== ====
00727 x even
00728 ==== ====
00729 0 True
00730 1 False
00731 2 True
00732 3 False
00733 4 True
00734 ==== ====
00735 '''
00736
00737 If data is a constant instead of an iterable object, it's value
00738 will be written into each row:
00739
00740 .. code-block:: python
00741
00742 tab = Table(['x'], 'f', x=range(5))
00743 tab.AddCol('num', 'i', 1)
00744 print tab
00745
00746 '''
00747 will produce the table
00748
00749 ==== ====
00750 x num
00751 ==== ====
00752 0 1
00753 1 1
00754 2 1
00755 3 1
00756 4 1
00757 ==== ====
00758 '''
00759
00760 As a special case, if there are no previous rows, and data is not
00761 None, rows are added for every item in data.
00762 """
00763
00764 if col_name in self.col_names:
00765 raise ValueError('Column with name %s already exists'%col_name)
00766
00767 col_type = self._ParseColTypes(col_type, exp_num=1)[0]
00768 self.col_names.append(col_name)
00769 self.col_types.append(col_type)
00770
00771 if len(self.rows)>0:
00772 if IsScalar(data):
00773 for row in self.rows:
00774 row.append(data)
00775 else:
00776 if hasattr(data, '__len__') and len(data)!=len(self.rows):
00777 self.col_names.pop()
00778 self.col_types.pop()
00779 raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
00780 'existing rows (%i)'%len(self.rows))
00781 for row, d in zip(self.rows, data):
00782 row.append(d)
00783
00784 elif data!=None and len(self.col_names)==1:
00785 if IsScalar(data):
00786 self.AddRow({col_name : data})
00787 else:
00788 for v in data:
00789 self.AddRow({col_name : v})
00790
00791 def Filter(self, *args, **kwargs):
00792 """
00793 Returns a filtered table only containing rows matching all the predicates
00794 in kwargs and args. For example,
00795
00796 .. code-block:: python
00797
00798 tab.Filter(town='Basel')
00799
00800 will return all the rows where the value of the column "town" is equal to
00801 "Basel". Several predicates may be combined, i.e.
00802
00803 .. code-block:: python
00804
00805 tab.Filter(town='Basel', male=True)
00806
00807 will return the rows with "town" equal to "Basel" and "male" equal to true.
00808 Positional arguments in *args* are unary callables that return True if the row
00809 should be included in the result and False if not.
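
    For example (a sketch, assuming the table has a numeric column named 'age'):

    .. code-block:: python

      # keep only rows where the value in column 'age' is above 18
      adults = tab.Filter(lambda row: row[tab.GetColIndex('age')] > 18)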
00810 """
00811 filt_tab=Table(list(self.col_names), list(self.col_types))
00812 for row in self.rows:
00813 matches=True
00814 for func in args:
00815 if not func(row):
00816 matches=False
00817 break
00818 for key, val in kwargs.iteritems():
00819 if row[self.GetColIndex(key)]!=val:
00820 matches=False
00821 break
00822 if matches:
00823 filt_tab.AddRow(row)
00824 return filt_tab
00825
00826
00827 def Select(self, query):
00828
00829 """
00830 Returns a new table object containing all rows matching a logical query
00831 expression.
00832
00833 *query* is a string containing the logical expression, that will be
00834 evaluated for every row.
00835
00836 Operands have to be the name of a column or an expression that can be
00837 parsed to float, int, bool or string.
00838 Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, \*, /
00839
00840 .. code-block:: python
00841
00842 subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
00843
00844 The selection query should be self-explanatory. Allowed parentheses are
00845 (), [] and {}; mismatched parentheses are detected. Expressions like
00846 '3<=col_a>=col_b' throw an error, since the evaluation order cannot be
00847 determined unambiguously.
00848
00849 There are two special expressions:
00850
00851 .. code-block:: python
00852
00853 #selects rows, where 1.0<=col_a<=1.5
00854 subtab = tab.Select('col_a=1.0:1.5')
00855
00856 #selects rows, where col_a=1 or col_a=2 or col_a=3
00857 subtab = tab.Select('col_a=1,2,3')
00858
00859 Only consistent types can be compared. If col_a is of type string and col_b
00860 is of type int, the following expression would throw an error: 'col_a<col_b'
00861 """
00862
00863 try:
00864 from table_selector import TableSelector
00865 except:
00866 raise ImportError("Tried to import from the file table_selector.py, but could not find it!")
00867
00868 selector=TableSelector(self.col_types, self.col_names, query)
00869
00870 selected_tab=Table(list(self.col_names), list(self.col_types))
00871
00872 for row in self.rows:
00873 if selector.EvaluateRow(row):
00874 selected_tab.AddRow(row)
00875
00876 return selected_tab
00877
00878
00879 @staticmethod
00880 def _LoadOST(stream_or_filename):
00881 fieldname_pattern=re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
00882 values_pattern=re.compile("([^\" ]+|\"[^\"]*\")+")
00883 if not hasattr(stream_or_filename, 'read'):
00884 stream=open(stream_or_filename, 'r')
00885 else:
00886 stream=stream_or_filename
00887 header=False
00888 num_lines=0
00889 for line in stream:
00890 line=line.strip()
00891 if line.startswith('#'):
00892 continue
00893 if len(line)==0:
00894 continue
00895 num_lines+=1
00896 if not header:
00897 fieldnames=[]
00898 fieldtypes=[]
00899 for col in line.split():
00900 match=fieldname_pattern.match(col)
00901 if match:
00902 if match.group('type'):
00903 fieldtypes.append(match.group('type'))
00904 else:
00905 fieldtypes.append('string')
00906 fieldnames.append(match.group('name'))
00907 tab=Table(fieldnames, fieldtypes)
00908 header=True
00909 continue
00910 tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
00911 if num_lines==0:
00912 raise IOError("Cannot read table from empty stream")
00913 return tab
00914
00915 def _GuessColumnTypes(self):
00916 for col_idx in range(len(self.col_names)):
00917 self.col_types[col_idx]=GuessColumnType(self[self.col_names[col_idx]])
00918 for row in self.rows:
00919 for idx in range(len(row)):
00920 row[idx]=self._Coerce(row[idx], self.col_types[idx])
00921
00922 @staticmethod
00923 def _LoadCSV(stream_or_filename, sep):
00924 if not hasattr(stream_or_filename, 'read'):
00925 stream=open(stream_or_filename, 'r')
00926 else:
00927 stream=stream_or_filename
00928 reader=csv.reader(stream, delimiter=sep)
00929 first=True
00930 for row in reader:
00931 if first:
00932 header=row
00933 types='s'*len(row)
00934 tab=Table(header, types)
00935 first=False
00936 else:
00937 tab.AddRow(row)
00938 if first:
00939 raise IOError('trying to load table from empty CSV stream/file')
00940
00941 tab._GuessColumnTypes()
00942 return tab
00943
00944 @staticmethod
00945 def _LoadPickle(stream_or_filename):
00946 if not hasattr(stream_or_filename, 'read'):
00947 stream=open(stream_or_filename, 'rb')
00948 else:
00949 stream=stream_or_filename
00950 return cPickle.load(stream)
00951
00952 @staticmethod
00953 def _GuessFormat(filename):
00954 try:
00955 filename = filename.name
00956 except AttributeError, e:
00957 pass
00958 if filename.endswith('.csv'):
00959 return 'csv'
00960 elif filename.endswith('.pickle'):
00961 return 'pickle'
00962 else:
00963 return 'ost'
00964
00965
00966 @staticmethod
00967 def Load(stream_or_filename, format='auto', sep=','):
00968 """
00969 Load table from stream or file with given name.
00970
00971 By default, the file format is set to *auto*, which tries to guess the file
00972 format from the file extension. The following file extensions are
00973 recognized:
00974
00975 ============ ======================
00976 extension recognized format
00977 ============ ======================
00978 .csv comma separated values
00979 .pickle pickled byte stream
00980 <all others> ost-specific format
00981 ============ ======================
00982
00983 Thus, *format* must be specified when reading files with other filename
00984 extensions.
00985
00986 The following file formats are understood:
00987
00988 - ost
00989
00990 This is an ost-specific, but still human readable file format. The file
00991 (stream) must start with header line of the form
00992
00993 col_name1[type1] <col_name2[type2]>...
00994
00995 The types given in brackets must be one of the data types the
00996 :class:`Table` class understands. Each following line in the file then must
00997 contain exactly the same number of data items as listed in the header. The
00998 data items are automatically converted to the column format. Lines starting
00999 with a '#' and empty lines are ignored.
01000
01001 - pickle
01002
01003 Deserializes the table from a pickled byte stream.
01004
01005 - csv
01006
01007 Reads the table from comma separated values stream. Since there is no
01008 explicit type information in the csv file, the column types are guessed,
01009 using the following simple rules:
01010
01011 * if all values are either NA/NULL/NONE the type is set to string.
01012 * if all non-null values are convertible to float/int the type is set to
01013 float/int.
01014 * if all non-null values are true/false/yes/no, the type is set to bool.
01015 * for all other cases, the column type is set to string.
01016
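    A short loading sketch (the file names here are hypothetical):

    .. code-block:: python

      tab1 = Table.Load('scores.csv')                          # format guessed from extension
      tab2 = Table.Load('scores.tsv', format='csv', sep='\t')  # tab-separated values
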
01017 :returns: A new :class:`Table` instance
01018 """
01019 format=format.lower()
01020 if format=='auto':
01021 format = Table._GuessFormat(stream_or_filename)
01022
01023 if format=='ost':
01024 return Table._LoadOST(stream_or_filename)
01025 if format=='csv':
01026 return Table._LoadCSV(stream_or_filename, sep=sep)
01027 if format=='pickle':
01028 return Table._LoadPickle(stream_or_filename)
01029 raise ValueError('unknown format "%s"' % format)
01030
01031 def Sort(self, by, order='+'):
01032 """
01033 Performs an in-place sort of the table, based on column *by*.
01034
01035 :param by: column name by which to sort
01036 :type by: :class:`str`
01037
01038 :param order: ascending (``-``) or descending (``+``) order
01039 :type order: :class:`str` (i.e. *+*, *-*)
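
    **Example** (a sketch; 'score' stands for any column of the table):

    .. code-block:: python

      tab.Sort('score')        # descending order (default '+')
      tab.Sort('score', '-')   # ascending order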
01040 """
01041 sign=-1
01042 if order=='-':
01043 sign=1
01044 key_index=self.GetColIndex(by)
01045 def _key_cmp(lhs, rhs):
01046 return sign*cmp(lhs[key_index], rhs[key_index])
01047 self.rows=sorted(self.rows, _key_cmp)
01048
01049 def GetUnique(self, col, ignore_nan=True):
01050 """
01051 Extract a list of all unique values from one column.
01052
01053 :param col: column name
01054 :type col: :class:`str`
01055
01056 :param ignore_nan: ignore all *None* values
01057 :type ignore_nan: :class:`bool`
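
    **Example** (a small sketch):

    .. code-block:: python

      tab = Table(['city'], 's', city=['Basel', 'Bern', None, 'Basel'])
      print tab.GetUnique('city')   # ['Basel', 'Bern']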
01058 """
01059 idx = self.GetColIndex(col)
01060 seen = {}
01061 result = []
01062 for row in self.rows:
01063 item = row[idx]
01064 if item!=None or ignore_nan==False:
01065 if item in seen: continue
01066 seen[item] = 1
01067 result.append(item)
01068 return result
01069
01070 def Zip(self, *args):
01071 """
01072 Allows convenient iteration over a selection of columns, e.g.
01073
01074 .. code-block:: python
01075
01076 tab = Table.Load('...')
01077 for col1, col2 in tab.Zip('col1', 'col2'):
01078 print col1, col2
01079
01080 is a shortcut for
01081
01082 .. code-block:: python
01083
01084 tab = Table.Load('...')
01085 for col1, col2 in zip(tab['col1'], tab['col2']):
01086 print col1, col2
01087 """
01088 return zip(*[self[arg] for arg in args])
01089
01090 def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
01091 z_title=None, x_range=None, y_range=None, z_range=None,
01092 color=None, plot_if=None, legend=None,
01093 num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
01094 labels=None, max_num_labels=None, title=None, clear=True, save=False,
01095 **kwargs):
01096 """
01097 Function to plot values from your table in 1, 2 or 3 dimensions using
01098 `Matplotlib <http://matplotlib.sourceforge.net>`__
01099
01100 :param x: column name for first dimension
01101 :type x: :class:`str`
01102
01103 :param y: column name for second dimension
01104 :type y: :class:`str`
01105
01106 :param z: column name for third dimension
01107 :type z: :class:`str`
01108
01109 :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
01110 complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
01111 :type style: :class:`str`
01112
01113 :param x_title: title for first dimension, if not specified it is
01114 automatically derived from column name
01115 :type x_title: :class:`str`
01116
01117 :param y_title: title for second dimension, if not specified it is
01118 automatically derived from column name
01119 :type y_title: :class:`str`
01120
01121 :param z_title: title for third dimension, if not specified it is
01122 automatically derived from column name
01123 :type z_title: :class:`str`
01124
01125 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01126 :type x_range: :class:`list` of length two
01127
01128 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01129 :type y_range: :class:`list` of length two
01130
01131 :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
01132 :type z_range: :class:`list` of length two
01133
01134 :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
01135 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
01136 :type color: :class:`str`
01137
01138 :param plot_if: callable which returns *True* if row should be plotted. Is
01139 invoked like ``plot_if(self, row)``
01140 :type plot_if: callable
01141
01142 :param legend: legend label for data series
01143 :type legend: :class:`str`
01144
01145 :param num_z_levels: number of levels for third dimension
01146 :type num_z_levels: :class:`int`
01147
01148 :param diag_line: draw diagonal line
01149 :type diag_line: :class:`bool`
01150
01151 :param labels: column name containing labels to put on x-axis for one
01152 dimensional plot
01153 :type labels: :class:`str`
01154
01155 :param max_num_labels: limit maximum number of labels
01156 :type max_num_labels: :class:`int`
01157
01158 :param title: plot title, if not specified it is automatically derived from
01159 plotted column names
01160 :type title: :class:`str`
01161
01162 :param clear: clear old data from plot
01163 :type clear: :class:`bool`
01164
01165 :param save: filename for saving plot
01166 :type save: :class:`str`
01167
01168 :param z_contour: draw contour lines
01169 :type z_contour: :class:`bool`
01170
01171 :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
01172 'linear')
01173 :type z_interpol: :class:`str`
01174
01175 :param \*\*kwargs: additional arguments passed to matplotlib
01176
01177 :returns: the ``matplotlib.pyplot`` module
01178
01179 **Examples:** simple plotting functions
01180
01181 .. code-block:: python
01182
01183 tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
01184 b=[x/2.0 for x in range(1,6)],
01185 c=[math.cos(x) for x in range(0,5)],
01186 d=range(3,8))
01187
01188 # one dimensional plot of column 'd' vs. index
01189 plt = tab.Plot('d')
01190 plt.show()
01191
01192 # two dimensional plot of 'a' vs. 'c'
01193 plt = tab.Plot('a', y='c', style='o-')
01194 plt.show()
01195
01196 # three dimensional plot of 'a' vs. 'c' with values 'b'
01197 plt = tab.Plot('a', y='c', z='b')
01198 # manually save plot to file
01199 plt.savefig("plot.png")
01200 """
01201 try:
01202 import matplotlib.pyplot as plt
01203 import matplotlib.mlab as mlab
01204 import numpy as np
01205 idx1 = self.GetColIndex(x)
01206 xs = []
01207 ys = []
01208 zs = []
01209
01210 if clear:
01211 plt.figure(figsize=[8, 6])
01212
01213 if x_title!=None:
01214 nice_x=x_title
01215 else:
01216 nice_x=MakeTitle(x)
01217
01218 if y_title!=None:
01219 nice_y=y_title
01220 else:
01221 if y:
01222 nice_y=MakeTitle(y)
01223 else:
01224 nice_y=None
01225
01226 if z_title!=None:
01227 nice_z = z_title
01228 else:
01229 if z:
01230 nice_z = MakeTitle(z)
01231 else:
01232 nice_z = None
01233
01234 if x_range and (IsScalar(x_range) or len(x_range)!=2):
01235 raise ValueError('parameter x_range must contain exactly two elements')
01236 if y_range and (IsScalar(y_range) or len(y_range)!=2):
01237 raise ValueError('parameter y_range must contain exactly two elements')
01238 if z_range and (IsScalar(z_range) or len(z_range)!=2):
01239 raise ValueError('parameter z_range must contain exactly two elements')
01240
01241 if color:
01242 kwargs['color']=color
01243 if legend:
01244 kwargs['label']=legend
01245 if y and z:
01246 idx3 = self.GetColIndex(z)
01247 idx2 = self.GetColIndex(y)
01248 for row in self.rows:
01249 if row[idx1]!=None and row[idx2]!=None and row[idx3]!=None:
01250 if plot_if and not plot_if(self, row):
01251 continue
01252 xs.append(row[idx1])
01253 ys.append(row[idx2])
01254 zs.append(row[idx3])
01255 levels = []
01256 if z_range:
01257 z_spacing = (z_range[1] - z_range[0]) / num_z_levels
01258 l = z_range[0]
01259 else:
01260 l = self.Min(z)
01261 z_spacing = (self.Max(z) - l) / num_z_levels
01262
01263 for i in range(0,num_z_levels+1):
01264 levels.append(l)
01265 l += z_spacing
01266
01267 xi = np.linspace(min(xs),max(xs),len(xs)*10)
01268 yi = np.linspace(min(ys),max(ys),len(ys)*10)
01269 zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
01270
01271 if z_contour:
01272 plt.contour(xi,yi,zi,levels,linewidths=0.5,colors='k')
01273
01274 plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
01275 plt.colorbar(ticks=levels)
01276
01277 elif y:
01278 idx2=self.GetColIndex(y)
01279 for row in self.rows:
01280 if row[idx1]!=None and row[idx2]!=None:
01281 if plot_if and not plot_if(self, row):
01282 continue
01283 xs.append(row[idx1])
01284 ys.append(row[idx2])
01285 plt.plot(xs, ys, style, **kwargs)
01286
01287 else:
01288 label_vals=[]
01289
01290 if labels:
01291 label_idx=self.GetColIndex(labels)
01292 for row in self.rows:
01293 if row[idx1]!=None:
01294 if plot_if and not plot_if(self, row):
01295 continue
01296 xs.append(row[idx1])
01297 if labels:
01298 label_vals.append(row[label_idx])
01299 plt.plot(xs, style, **kwargs)
01300 if labels:
01301 interval = 1
01302 if max_num_labels:
01303 if len(label_vals)>max_num_labels:
01304 interval = int(math.ceil(float(len(label_vals))/max_num_labels))
01305 label_vals = label_vals[::interval]
01306 plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
01307 size='x-small')
01308
01309 if title==None:
01310 if nice_z:
01311 title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
01312 elif nice_y:
01313 title = '%s vs. %s' % (nice_x, nice_y)
01314 else:
01315 title = nice_x
01316
01317 plt.title(title, size='x-large', fontweight='bold',
01318 verticalalignment='bottom')
01319
01320 if legend:
01321 plt.legend(loc=0)
01322
01323 if x and y:
01324 plt.xlabel(nice_x, size='x-large')
01325 if x_range:
01326 plt.xlim(x_range[0], x_range[1])
01327 if y_range:
01328 plt.ylim(y_range[0], y_range[1])
01329 if diag_line:
01330 plt.plot(x_range, y_range, '-', color='black')
01331
01332 plt.ylabel(nice_y, size='x-large')
01333 else:
01334 if y_range:
01335 plt.ylim(y_range[0], y_range[1])
01336 if x_title:
01337 plt.xlabel(x_title, size='x-large')
01338 plt.ylabel(nice_y, size='x-large')
01339 if save:
01340 plt.savefig(save)
01341 return plt
01342 except ImportError:
01343 LogError("Function needs numpy and matplotlib, but I could not import them.")
01344 raise
01345
01346 def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
01347 histtype='stepfilled', align='mid', x_title=None,
01348 y_title=None, title=None, clear=True, save=False,
01349 color=None, y_range=None):
01350 """
01351 Create a histogram of the data in col for the range *x_range*, split into
01352 *num_bins* bins and plot it using Matplotlib.
01353
01354 :param col: column name with data
01355 :type col: :class:`str`
01356
01357 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01358 :type x_range: :class:`list` of length two
01359
01360 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01361 :type y_range: :class:`list` of length two
01362
01363 :param num_bins: number of bins in range
01364 :type num_bins: :class:`int`
01365
01366 :param color: Color to be used for the histogram. If not set, color will be
01367 determined by matplotlib
01368 :type color: :class:`str`
01369
01370 :param normed: normalize histogram
01371 :type normed: :class:`bool`
01372
01373 :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
01374 *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
01375 :type histtype: :class:`str`
01376
01377 :param align: style of histogram (*left*, *mid*, *right*). See
01378 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
01379 :type align: :class:`str`
01380
01381 :param x_title: title for first dimension, if not specified it is
01382 automatically derived from column name
01383 :type x_title: :class:`str`
01384
01385 :param y_title: title for second dimension, if not specified it is
01386 automatically derived from column name
01387 :type y_title: :class:`str`
01388
01389 :param title: plot title, if not specified it is automatically derived from
01390 plotted column names
01391 :type title: :class:`str`
01392
01393 :param clear: clear old data from plot
01394 :type clear: :class:`bool`
01395
01396 :param save: filename for saving plot
01397 :type save: :class:`str`
01398
01399 **Examples:** simple plotting functions
01400
01401 .. code-block:: python
01402
01403 tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
01404
01405 # histogram of the values in column 'a'
01406 plt = tab.PlotHistogram('a')
01407 plt.show()
01408
01409 """
01410 try:
01411 import matplotlib.pyplot as plt
01412 import numpy as np
01413
01414 if len(self.rows)==0:
01415 return None
01416 kwargs={}
01417 if color:
01418 kwargs['color']=color
01419 idx = self.GetColIndex(col)
01420 data = []
01421 for r in self.rows:
01422 if r[idx]!=None:
01423 data.append(r[idx])
01424
01425 if clear:
01426 plt.clf()
01427
01428 n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
01429 normed=normed, histtype=histtype, align=align,
01430 **kwargs)
01431
01432 if x_title!=None:
01433 nice_x=x_title
01434 else:
01435 nice_x=MakeTitle(col)
01436 plt.xlabel(nice_x, size='x-large')
01437 if y_range:
01438 plt.ylim(y_range)
01439 if y_title!=None:
01440 nice_y=y_title
01441 else:
01442 nice_y="bin count"
01443 plt.ylabel(nice_y, size='x-large')
01444
01445 if title!=None:
01446 nice_title=title
01447 else:
01448 nice_title="Histogram of %s"%nice_x
01449 plt.title(nice_title, size='x-large', fontweight='bold')
01450
01451 if save:
01452 plt.savefig(save)
01453 return plt
01454 except ImportError:
01455 LogError("Function needs numpy and matplotlib, but I could not import them.")
01456 raise
01457
01458 def _Max(self, col):
01459 if len(self.rows)==0:
01460 return None, None
01461 idx = self.GetColIndex(col)
01462 col_type = self.col_types[idx]
01463 if col_type=='int' or col_type=='float':
01464 max_val = -float('inf')
01465 elif col_type=='bool':
01466 max_val = False
01467 elif col_type=='string':
01468 max_val = chr(0)
01469 max_idx = None
01470 for i in range(0, len(self.rows)):
01471 if self.rows[i][idx]>max_val:
01472 max_val = self.rows[i][idx]
01473 max_idx = i
01474 return max_val, max_idx
01475
01476 def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None,
01477 colors=None, width=0.8, bottom=0, legend=False, legend_names=None, show=False, save=False):
01478
01479 """
01480 Create a barplot of the data in cols. Every column will be represented
01481 at one position. If there are several rows, each column will be grouped
01482 together.
01483
01484 :param cols: List of column names. Every column will be represented as a
01485 single bar. If cols is None, every column of the table gets
01486 plotted.
01487 :type cols: :class:`list`
01488
01489 :param rows: List of row indices. Values from given rows will be plotted
01490 in parallel at one column position. If set to None, all rows
01491 of the table will be plotted. Note, that the maximum number
01492 of rows is 7.
01493 :type rows: :class:`list`
01494
01495 :param xlabels: Label for every col on x-axis. If set to None, the column
01496 names are used. The xlabel plotting can be suppressed by
01497 the parameter set_xlabels.
01498 :type xlabels: :class:`list`
01499
01500 :param set_xlabels: Controls whether xlabels are plotted or not.
01501 :type set_xlabels: :class:`bool`
01502
01503 :param xlabels_rotation: Can either be 'horizontal', 'vertical' or an
01504 integer that describes the rotation in degrees.
01505
01506 :param y_title: Y-axis description
01507 :type y_title: :class:`str`
01508
01509 :param title: Title of the plot. No title appears if set to None.
01510 :type title: :class:`str`
01511
01512 :param colors: Colors of the different bars in each group. Must be a list
01513 of valid colors in matplotlib. Length of color and rows must
01514 be consistent.
01515 :type colors: :class:`list`
01516
01517 :param width: The available space for the groups on the x-axis is divided
01518 by the exact number of groups. The parameters width is the
01519 fraction of what is actually used. If it would be 1.0 the
01520 bars of the different groups would touch each other.
01521 Value must be between [0;1]
01522 :type width: :class:`float`
01523
01524 :param bottom: y-value at which the bars start (baseline of the bars)
01525 :type bottom: :class:`float`
01526
01527 :param legend: Legend for color explanation, the corresponding row
01528 respectively. If set to True, legend_names must be provided.
01529 :type legend: :class:`bool`
01530
01531 :param legend_names: List of names, that describe the differently colored
01532 bars. Length must be consistent with number of rows.
01533
01534 :param show: If set to True, the plot is directly displayed.
01535
01536 :param save: If set, a png image with name save in the current working
01537 directory will be saved.
01538 :type save: :class:`str`
01539
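    **Example** (a minimal sketch with made-up values):

    .. code-block:: python

      tab = Table(['a', 'b', 'c'], 'fff', a=[1.0], b=[2.5], c=[1.7])
      plt = tab.PlotBar(y_title='score', legend=True, legend_names=['run 1'])
      plt.show()
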
01540 """
01541 try:
01542 import numpy as np
01543 import matplotlib.pyplot as plt
01544 except:
01545 raise ImportError('PlotBar relies on numpy and matplotlib, but I could ' \
01546 'not import them!')
01547
01548 standard_colors=['b','g','y','c','m','r','k']
01549 data=[]
01550
01551 if cols==None:
01552 cols=self.col_names
01553
01554 if width<=0 or width>1:
01555 raise ValueError('Width must be in [0;1]')
01556
01557 if rows==None:
01558 if len(self.rows)>7:
01559 raise ValueError('Table contains too many rows to represent them at one '\
01560 'bar position in parallel. You can Select a Subtable or '\
01561 'specify the parameter rows with a list of row indices '\
01562 '(max 7)')
01563 else:
01564 rows=range(len(self.rows))
01565 else:
01566 if not isinstance(rows,list):
01567 rows=[rows]
01568 if len(rows)>7:
01569 raise ValueError('Too many rows to represent (max 7). Please note, that '\
01570 'data from multiple rows from one column gets '\
01571 'represented at one position in parallel.')
01572
01573 for r_idx in rows:
01574 row=self.rows[r_idx]
01575 temp=list()
01576 for c in cols:
01577 try:
01578 c_idx=self.GetColIndex(c)
01579 except:
01580 raise ValueError('Cannot find column with name '+str(c))
01581 temp.append(row[c_idx])
01582 data.append(temp)
01583
01584 if colors==None:
01585 colors=standard_colors[:len(rows)]
01586
01587 if len(rows)!=len(colors):
01588 raise ValueError("Number of rows and number of colors must be consistent!")
01589
01590 ind=np.arange(len(data[0]))
01591 single_bar_width=float(width)/len(data)
01592
01593 fig=plt.figure()
01594 ax=fig.add_subplot(111)
01595 legend_data=[]
01596
01597 for i in range(len(data)):
01598 legend_data.append(ax.bar(ind+i*single_bar_width+(1-width)/2,data[i],single_bar_width,bottom=bottom,color=colors[i])[0])
01599
01600 if title!=None:
01601 ax.set_title(title, size='x-large', fontweight='bold')
01602
01603 if y_title!=None:
01604 nice_y=y_title
01605 else:
01606 nice_y="value"
01607 ax.set_ylabel(nice_y)
01608
01609 if xlabels:
01610 if len(data[0])!=len(xlabels):
01611 raise ValueError('Number of xlabels is not consistent with number of cols!')
01612 else:
01613 xlabels=cols
01614
01615 if set_xlabels:
01616 ax.set_xticks(ind+0.5)
01617 ax.set_xticklabels(xlabels, rotation = xlabels_rotation)
01618 else:
01619 ax.set_xticks([])
01620
01621 if legend == True:
01622 if legend_names==None:
01623 raise ValueError('You must provide legend names! e.g. names for the rows, '\
01624 'that are printed in parallel.')
01625 if len(legend_names)!=len(data):
01626 raise ValueError('length of legend_names must be consistent with number '\
01627 'of plotted rows!')
01628 ax.legend(legend_data, legend_names)
01629
01630 if save:
01631 plt.savefig(save)
01632
01633 if show:
01634 plt.show()
01635
01636 return plt
01637
01638 def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
01639 colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False):
01640
01641 """
01642 Create a heatplot of the data in col x vs the data in col y using matplotlib
01643
01644 :param x: column name with x data
01645 :type x: :class:`str`
01646
01647 :param y: column name with y data
01648 :type y: :class:`str`
01649
01650 :param title: title of the plot, will be generated automatically if set to None
01651 :type title: :class:`str`
01652
01653 :param x_title: label of x-axis, will be generated automatically if set to None
01654 :type x_title: :class:`str`
01655
01656 :param y_title: label of y-axis, will be generated automatically if set to None
01657 :type y_title: :class:`str`
01658
01659 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01660 :type x_range: :class:`list` of length two
01661
01662 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01663 :type y_range: :class:`list` of length two
01664
01665 :param binning: type of binning. If set to None, the value of a hexbin will
01666 correspond to the number of datapoints falling into it. If
01667 set to 'log', the value will be the log with base 10 of the above
01668 value (log(i+1)). If an integer is provided, the number of a
01669 hexbin is equal the number of datapoints falling into it divided
01670 by the integer. If a list of values is provided, these values
01671 will be the lower bounds of the bins.
01672
01673 :param colormap: colormap, that will be used. Value can be every colormap defined
01674 in matplotlib or an own defined colormap. You can either pass a
01675 string with the name of the matplotlib colormap or a colormap
01676 object.
01677
01678 :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
01679 :type show_scalebar: :class:`bool`
01680
01681 :param scalebar_label: Label of the scalebar
01682 :type scalebar_label: :class:`str`
01683
01684 :param clear: clear old data from plot
01685 :type clear: :class:`bool`
01686
01687 :param save: filename for saving plot
01688 :type save: :class:`str`
01689
01690 :param show: directly show plot
01691 :type show: :class:`bool`
01692
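    **Example** (a sketch; the column names are placeholders):

    .. code-block:: python

      # hexbin density plot of column 'x' vs. column 'y'
      plt = tab.PlotHexbin('x', 'y', show_scalebar=True)
      plt.show()
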
01693 """
01694
01695 try:
01696 import matplotlib.pyplot as plt
01697 import matplotlib.cm as cm
01698 except:
01699 raise ImportError('PlotHexbin relies on matplotlib, but I could not import it')
01700
01701 idx=self.GetColIndex(x)
01702 idy=self.GetColIndex(y)
01703 xdata=[]
01704 ydata=[]
01705
01706 for r in self.rows:
01707 if r[idx]!=None and r[idy]!=None:
01708 xdata.append(r[idx])
01709 ydata.append(r[idy])
01710
01711 if clear:
01712 plt.clf()
01713
01714 if x_title!=None:
01715 nice_x=x_title
01716 else:
01717 nice_x=MakeTitle(x)
01718
01719 if y_title!=None:
01720 nice_y=y_title
01721 else:
01722 nice_y=MakeTitle(y)
01723
01724 if title==None:
01725 title = '%s vs. %s' % (nice_x, nice_y)
01726
01727 if IsStringLike(colormap):
01728 colormap=getattr(cm, colormap)
01729
01730 if x_range and (IsScalar(x_range) or len(x_range)!=2):
01731 raise ValueError('parameter x_range must contain exactly two elements')
01732 if y_range and (IsScalar(y_range) or len(y_range)!=2):
01733 raise ValueError('parameter y_range must contain exactly two elements')
01734
01735 ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
01736
01737 if x_range:
01738 plt.xlim((x_range[0], x_range[1]))
01739 ext[0]=x_range[0]
01740 ext[1]=x_range[1]
01741 if y_range:
01742 plt.ylim(y_range[0], y_range[1])
01743 ext[2]=y_range[0]
01744 ext[3]=y_range[1]
01745
01746
01747 plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
01748
01749 plt.title(title, size='x-large', fontweight='bold',
01750 verticalalignment='bottom')
01751
01752 plt.xlabel(nice_x)
01753 plt.ylabel(nice_y)
01754
01755 if show_scalebar:
01756 cb=plt.colorbar()
01757 if scalebar_label:
01758 cb.set_label(scalebar_label)
01759
01760 if save:
01761 plt.savefig(save)
01762
01763 if show:
01764 plt.show()
01765
01766 return plt
01767
01768 def MaxRow(self, col):
01769 """
01770 Returns the row containing the cell with the maximal value in col. If
01771 several rows have the highest value, only the first one is returned.
01772 ''None'' values are ignored.
01773
01774 :param col: column name
01775 :type col: :class:`str`
01776
01777 :returns: row with maximal col value or None if the table is empty
01778 """
01779 val, idx = self._Max(col)
01780 if idx!=None:
01781 return self.rows[idx]
01782
01783 def Max(self, col):
01784 """
01785 Returns the maximum value in col. If several rows have the highest value,
01786 only the first one is returned. ''None'' values are ignored.
01787
01788 :param col: column name
01789 :type col: :class:`str`
01790 """
01791 val, idx = self._Max(col)
01792 return val
01793
01794 def MaxIdx(self, col):
01795 """
01796 Returns the row index of the cell with the maximal value in col. If
01797 several rows have the highest value, only the first one is returned.
01798 ''None'' values are ignored.
01799
01800 :param col: column name
01801 :type col: :class:`str`
01802 """
01803 val, idx = self._Max(col)
01804 return idx
01805
01806 def _Min(self, col):
01807 if len(self.rows)==0:
01808 return None, None
01809 idx=self.GetColIndex(col)
01810 col_type = self.col_types[idx]
01811 if col_type=='int' or col_type=='float':
01812 min_val=float('inf')
01813 elif col_type=='bool':
01814 min_val=True
01815 elif col_type=='string':
01816 min_val=chr(255)
01817 min_idx=None
01818 for i,row in enumerate(self.rows):
01819 if row[idx]!=None and row[idx]<min_val:
01820 min_val=row[idx]
01821 min_idx=i
01822 return min_val, min_idx
01823
01824 def Min(self, col):
01825 """
01826 Returns the minimal value in col. If several rows have the lowest value,
01827 only the first one is returned. ''None'' values are ignored.
01828
01829 :param col: column name
01830 :type col: :class:`str`
01831 """
01832 val, idx = self._Min(col)
01833 return val
01834
01835 def MinRow(self, col):
01836 """
01837 Returns the row containing the cell with the minimal value in col. If
01838 several rows have the lowest value, only the first one is returned.
01839 ''None'' values are ignored.
01840
01841 :param col: column name
01842 :type col: :class:`str`
01843
01844 :returns: row with minimal col value or None if the table is empty
01845 """
01846 val, idx = self._Min(col)
01847 if idx!=None:
01848 return self.rows[idx]
01849
01850 def MinIdx(self, col):
01851 """
01852 Returns the row index of the cell with the minimal value in col. If
01853 several rows have the lowest value, only the first one is returned.
01854 ''None'' values are ignored.
01855
01856 :param col: column name
01857 :type col: :class:`str`
01858 """
01859 val, idx = self._Min(col)
01860 return idx
01861
01862 def Sum(self, col):
01863 """
01864 Returns the sum of the given column. Cells with ''None'' are ignored. Returns
01865 0.0, if the column doesn't contain any elements. Col must be of numeric
01866 column type ('float', 'int') or boolean column type.
01867
01868 :param col: column name
01869 :type col: :class:`str`
01870
01871 :raises: :class:`TypeError` if column type is ``string``
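
    **Example:** a minimal sketch, assuming a numeric column named 'weight'
    exists in the table ``tab``:

    .. code-block:: python

      total = tab.Sum('weight')  # 0.0 if the column has no non-None cells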
01872 """
01873 idx = self.GetColIndex(col)
01874 col_type = self.col_types[idx]
01875 if col_type!='int' and col_type!='float' and col_type!='bool':
01876 raise TypeError("Sum can only be used on numeric column types")
01877 s = 0.0
01878 for r in self.rows:
01879 if r[idx]!=None:
01880 s += r[idx]
01881 return s
01882
01883 def Mean(self, col):
01884 """
01885 Returns the mean of the given column. Cells with ''None'' are ignored. Returns
01886 None, if the column doesn't contain any elements. Col must be of numeric
01887 ('float', 'int') or boolean column type.
01888
01889     If the column type is *bool*, the function returns the ratio of the
01890     number of 'Trues' to the total number of elements.
01891
01892 :param col: column name
01893 :type col: :class:`str`
01894
01895 :raises: :class:`TypeError` if column type is ``string``
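
    **Example:** a minimal sketch; 'weight' and 'is_active' are assumed column
    names of type float and bool, respectively:

    .. code-block:: python

      avg  = tab.Mean('weight')     # arithmetic mean of non-None cells
      frac = tab.Mean('is_active')  # fraction of True values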
01896 """
01897 idx = self.GetColIndex(col)
01898 col_type = self.col_types[idx]
01899 if col_type!='int' and col_type!='float' and col_type!='bool':
01900 raise TypeError("Mean can only be used on numeric or bool column types")
01901
01902 vals=[]
01903 for v in self[col]:
01904 if v!=None:
01905 vals.append(v)
01906 try:
01907 return stutil.Mean(vals)
01908 except:
01909 return None
01910
01911 def RowMean(self, mean_col_name, cols):
01912 """
01913 Adds a new column of type 'float' with a specified name (*mean_col_name*),
01914 containing the mean of all specified columns for each row.
01915
01916 Cols are specified by their names and must be of numeric column
01917 type ('float', 'int') or boolean column type. Cells with None are ignored.
01918 Adds ''None'' if the row doesn't contain any values.
01919
01920 :param mean_col_name: name of new column containing mean values
01921 :type mean_col_name: :class:`str`
01922
01923 :param cols: name or list of names of columns to include in computation of
01924 mean
01925 :type cols: :class:`str` or :class:`list` of strings
01926
01927 :raises: :class:`TypeError` if column type of columns in *col* is ``string``
01928
01929 == Example ==
01930
01931     Starting with the following table:
01932
01933 ==== ==== ====
01934 x y u
01935 ==== ==== ====
01936 1 10 100
01937 2 15 None
01938 3 20 400
01939 ==== ==== ====
01940
01941 the code here adds a column with the name 'mean' to yield the table below:
01942
01943     .. code-block:: python
01944
01945 tab.RowMean('mean', ['x', 'u'])
01946
01947
01948 ==== ==== ==== =====
01949 x y u mean
01950 ==== ==== ==== =====
01951 1 10 100 50.5
01952 2 15 None 2
01953 3 20 400 201.5
01954 ==== ==== ==== =====
01955
01956 """
01957
01958 if IsScalar(cols):
01959 cols = [cols]
01960
01961 cols_idxs = []
01962 for col in cols:
01963 idx = self.GetColIndex(col)
01964 col_type = self.col_types[idx]
01965 if col_type!='int' and col_type!='float' and col_type!='bool':
01966 raise TypeError("RowMean can only be used on numeric column types")
01967 cols_idxs.append(idx)
01968
01969 mean_rows = []
01970 for row in self.rows:
01971 vals = []
01972 for idx in cols_idxs:
01973 v = row[idx]
01974 if v!=None:
01975 vals.append(v)
01976 try:
01977 mean = stutil.Mean(vals)
01978 mean_rows.append(mean)
01979 except:
01980 mean_rows.append(None)
01981
01982 self.AddCol(mean_col_name, 'f', mean_rows)
01983
01984 def Percentiles(self, col, nths):
01985 """
01986 Returns the percentiles of column *col* given in *nths*.
01987
01988 The percentiles are calculated as
01989
01990 .. code-block:: python
01991
01992 values[min(len(values), int(round(len(values)*nth/100+0.5)-1))]
01993
01994 where values are the sorted values of *col* not equal to ''None''
01995
01996 :param col: column name
01997 :type col: :class:`str`
01998 :param nths: list of percentiles to be calculated. Each percentile is a
01999 number between 0 and 100.
02000 :type nths: :class:`list` of numbers
02001
02002 :raises: :class:`TypeError` if column type is ``string``
02003 :returns: List of percentiles in the same order as given in *nths*
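
    **Example:** a minimal sketch, assuming a numeric column named 'score':

    .. code-block:: python

      q25, q50, q75 = tab.Percentiles('score', [25, 50, 75])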
02004 """
02005 idx = self.GetColIndex(col)
02006 col_type = self.col_types[idx]
02007 if col_type!='int' and col_type!='float' and col_type!='bool':
02008       raise TypeError("Percentiles can only be used on numeric column types")
02009
02010 for nth in nths:
02011 if nth < 0 or nth > 100:
02012 raise ValueError("percentiles must be between 0 and 100")
02013 vals=[]
02014 for v in self[col]:
02015 if v!=None:
02016 vals.append(v)
02017 vals=sorted(vals)
02018 if len(vals)==0:
02019 return [None]*len(nths)
02020 percentiles=[]
02021
02022 for nth in nths:
02023 p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
02024 percentiles.append(p)
02025 return percentiles
02026
02027 def Median(self, col):
02028 """
02029 Returns the median of the given column. Cells with ''None'' are ignored. Returns
02030 ''None'', if the column doesn't contain any elements. Col must be of numeric
02031 column type ('float', 'int') or boolean column type.
02032
02033 :param col: column name
02034 :type col: :class:`str`
02035
02036 :raises: :class:`TypeError` if column type is ``string``
02037 """
02038 idx = self.GetColIndex(col)
02039 col_type = self.col_types[idx]
02040 if col_type!='int' and col_type!='float' and col_type!='bool':
02041 raise TypeError("Median can only be used on numeric column types")
02042
02043 vals=[]
02044 for v in self[col]:
02045 if v!=None:
02046 vals.append(v)
02048 try:
02049 return stutil.Median(vals)
02050 except:
02051 return None
02052
02053 def StdDev(self, col):
02054 """
02055 Returns the standard deviation of the given column. Cells with ''None'' are
02056 ignored. Returns ''None'', if the column doesn't contain any elements. Col must
02057 be of numeric column type ('float', 'int') or boolean column type.
02058
02059 :param col: column name
02060 :type col: :class:`str`
02061
02062 :raises: :class:`TypeError` if column type is ``string``
02063 """
02064 idx = self.GetColIndex(col)
02065 col_type = self.col_types[idx]
02066 if col_type!='int' and col_type!='float' and col_type!='bool':
02067 raise TypeError("StdDev can only be used on numeric column types")
02068
02069 vals=[]
02070 for v in self[col]:
02071 if v!=None:
02072 vals.append(v)
02073 try:
02074 return stutil.StdDev(vals)
02075 except:
02076 return None
02077
02078 def Count(self, col, ignore_nan=True):
02079 """
02080 Count the number of cells in column that are not equal to ''None''.
02081
02082 :param col: column name
02083 :type col: :class:`str`
02084
02085 :param ignore_nan: ignore all *None* values
02086 :type ignore_nan: :class:`bool`
02087 """
02088 count=0
02089 idx=self.GetColIndex(col)
02090 for r in self.rows:
02091 if ignore_nan:
02092 if r[idx]!=None:
02093 count+=1
02094 else:
02095 count+=1
02096 return count
02097
02098 def Correl(self, col1, col2):
02099 """
02100 Calculate the Pearson correlation coefficient between *col1* and *col2*, only
02101 taking rows into account where both of the values are not equal to *None*.
02102 If there are not enough data points to calculate a correlation coefficient,
02103 *None* is returned.
02104
02105 :param col1: column name for first column
02106 :type col1: :class:`str`
02107
02108 :param col2: column name for second column
02109 :type col2: :class:`str`
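
    **Example:** a minimal sketch; 'predicted' and 'measured' are assumed
    numeric column names:

    .. code-block:: python

      r = tab.Correl('predicted', 'measured')  # None if too few data points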
02110 """
02111 if IsStringLike(col1) and IsStringLike(col2):
02112 col1 = self.GetColIndex(col1)
02113 col2 = self.GetColIndex(col2)
02114 vals1, vals2=([],[])
02115 for v1, v2 in zip(self[col1], self[col2]):
02116 if v1!=None and v2!=None:
02117 vals1.append(v1)
02118 vals2.append(v2)
02119 try:
02120 return stutil.Correl(vals1, vals2)
02121 except:
02122 return None
02123
02124 def SpearmanCorrel(self, col1, col2):
02125 """
02126 Calculate the Spearman correlation coefficient between col1 and col2, only
02127 taking rows into account where both of the values are not equal to None. If
02128 there are not enough data points to calculate a correlation coefficient,
02129 None is returned.
02130
02131 :warning: The function depends on the following module: *scipy.stats.mstats*
02132
02133 :param col1: column name for first column
02134 :type col1: :class:`str`
02135
02136 :param col2: column name for second column
02137 :type col2: :class:`str`
02138 """
02139 try:
02140 import scipy.stats.mstats
02141
02142 if IsStringLike(col1) and IsStringLike(col2):
02143 col1 = self.GetColIndex(col1)
02144 col2 = self.GetColIndex(col2)
02145 vals1, vals2=([],[])
02146 for v1, v2 in zip(self[col1], self[col2]):
02147 if v1!=None and v2!=None:
02148 vals1.append(v1)
02149 vals2.append(v2)
02150 try:
02151 correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
02152 if scipy.isnan(correl):
02153 return None
02154 return correl
02155 except:
02156 return None
02157
02158 except ImportError:
02159 LogError("Function needs scipy.stats.mstats, but I could not import it.")
02160 raise
02161
02162
02163 def Save(self, stream_or_filename, format='ost', sep=','):
02164 """
02165     Save the table to stream or filename. The following file formats
02166     are supported (for more information on file formats, see :meth:`Load`):
02167
02168 ============= =======================================
02169 ost ost-specific format (human readable)
02170 csv comma separated values (human readable)
02171 pickle pickled byte stream (binary)
02172 html HTML table
02173 context ConTeXt table
02174 ============= =======================================
02175
02176 :param stream_or_filename: filename or stream for writing output
02177 :type stream_or_filename: :class:`str` or :class:`file`
02178
02179     :param format: output format (*ost*, *csv*, *pickle*, *html* or *context*)
02180 :type format: :class:`str`
02181
02182 :raises: :class:`ValueError` if format is unknown
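
    **Example:** a minimal sketch writing the same table in two formats; the
    file names are purely illustrative:

    .. code-block:: python

      tab.Save('results.ost')                         # ost format (default)
      tab.Save('results.csv', format='csv', sep=';')  # csv with ';' separator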
02183 """
02184 format=format.lower()
02185 if format=='ost':
02186 return self._SaveOST(stream_or_filename)
02187 if format=='csv':
02188 return self._SaveCSV(stream_or_filename, sep=sep)
02189 if format=='pickle':
02190 return self._SavePickle(stream_or_filename)
02191 if format=='html':
02192 return self._SaveHTML(stream_or_filename)
02193 if format=='context':
02194 return self._SaveContext(stream_or_filename)
02195 raise ValueError('unknown format "%s"' % format)
02196
02197 def _SavePickle(self, stream):
02198 if not hasattr(stream, 'write'):
02199 stream=open(stream, 'wb')
02200 cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
02201
02202 def _SaveHTML(self, stream_or_filename):
02203 def _escape(s):
02204       return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
02205
02206 file_opened = False
02207 if not hasattr(stream_or_filename, 'write'):
02208 stream = open(stream_or_filename, 'w')
02209 file_opened = True
02210 else:
02211 stream = stream_or_filename
02212 stream.write('<table>')
02213 stream.write('<tr>')
02214 for col_name in self.col_names:
02215 stream.write('<th>%s</th>' % _escape(col_name))
02216 stream.write('</tr>')
02217 for row in self.rows:
02218 stream.write('<tr>')
02219 for i, col in enumerate(row):
02220 val = ''
02221 if col != None:
02222 if self.col_types[i] == 'float':
02223 val = '%.3f' % col
02224 elif self.col_types[i] == 'int':
02225 val = '%d' % col
02226 elif self.col_types[i] == 'bool':
02227 val = col and 'true' or 'false'
02228 else:
02229 val = str(col)
02230 stream.write('<td>%s</td>' % _escape(val))
02231 stream.write('</tr>')
02232 stream.write('</table>')
02233 if file_opened:
02234 stream.close()
02235 def _SaveContext(self, stream_or_filename):
02236 file_opened = False
02237 if not hasattr(stream_or_filename, 'write'):
02238 stream = open(stream_or_filename, 'w')
02239 file_opened = True
02240 else:
02241 stream = stream_or_filename
02242 stream.write('\\starttable[')
02243 for col_type in self.col_types:
02244 if col_type =='string':
02245 stream.write('l|')
02246 elif col_type=='int':
02247 stream.write('r|')
02248 elif col_type =='float':
02249 stream.write('i3r|')
02250 else:
02251 stream.write('l|')
02252 stream.write(']\n\\HL\n')
02253 for col_name in self.col_names:
02254 stream.write('\\NC \\bf %s' % col_name)
02255 stream.write(' \\AR\\HL\n')
02256 for row in self.rows:
02257 for i, col in enumerate(row):
02258 val = '---'
02259 if col != None:
02260 if self.col_types[i] == 'float':
02261 val = '%.3f' % col
02262 elif self.col_types[i] == 'int':
02263 val = '%d' % col
02264 elif self.col_types[i] == 'bool':
02265 val = col and 'true' or 'false'
02266 else:
02267 val = str(col)
02268 stream.write('\\NC %s' % val)
02269 stream.write(' \\AR\n')
02270 stream.write('\\HL\n')
02271 stream.write('\\stoptable')
02272 if file_opened:
02273 stream.close()
02274
02275 def _SaveCSV(self, stream, sep):
02276 if not hasattr(stream, 'write'):
02277 stream=open(stream, 'wb')
02278
02279 writer=csv.writer(stream, delimiter=sep)
02280 writer.writerow(['%s' % n for n in self.col_names])
02281 for row in self.rows:
02282 row=list(row)
02283 for i, c in enumerate(row):
02284 if c==None:
02285 row[i]='NA'
02286 writer.writerow(row)
02287
02288 def _SaveOST(self, stream):
02289 if hasattr(stream, 'write'):
02290 writer=csv.writer(stream, delimiter=' ')
02291 else:
02292 stream=open(stream, 'w')
02293 writer=csv.writer(stream, delimiter=' ')
02294 if self.comment:
02295 stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
02296 writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
02297 for row in self.rows:
02298 row=list(row)
02299 for i, c in enumerate(row):
02300 if c==None:
02301 row[i]='NA'
02302 writer.writerow(row)
02303
02304
02305 def GetNumpyMatrix(self, *args):
02306 '''
02307 Returns a numpy matrix containing the selected columns from the table as
02308 columns in the matrix.
02309
02310 Only columns of type *int* or *float* are supported. *NA* values in the
02311 table will be converted to *None* values.
02312
02313 :param \*args: column names to include in numpy matrix
02314
02315 :warning: The function depends on *numpy*
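
    **Example:** a minimal sketch, assuming numeric columns 'x' and 'y':

    .. code-block:: python

      m = tab.GetNumpyMatrix('x', 'y')  # N x 2 matrix, one matrix column per table column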
02316 '''
02317 try:
02318 import numpy as np
02319
02320 if len(args)==0:
02321 raise RuntimeError("At least one column must be specified.")
02322
02323 idxs = []
02324 for arg in args:
02325 idx = self.GetColIndex(arg)
02326 col_type = self.col_types[idx]
02327 if col_type!='int' and col_type!='float':
02328 raise TypeError("Numpy matrix can only be generated from numeric column types")
02329 idxs.append(idx)
02330 m = np.matrix([list(self[i]) for i in idxs])
02331 return m.T
02332
02333 except ImportError:
02334 LogError("Function needs numpy, but I could not import it.")
02335 raise
02336
02337
02338
02339 def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
02340
02341 '''
02342     In-place Gaussian smoothing of a column in the table with a given standard
02343     deviation. All ''None'' values are set to *na_value* before smoothing.
02344
02345 :param col: column name
02346 :type col: :class:`str`
02347
02348 :param std: standard deviation for gaussian kernel
02349 :type std: `scalar`
02350
02351     :param na_value: all NA (None) values of the specified column are set to na_value before smoothing
02352 :type na_value: `scalar`
02353
02354     :param padding: padding behaviour; see the scipy ndimage.gaussian_filter1d documentation for the available modes. The default is 'reflect'
02355 :type padding: :class:`str`
02356
02357 :param c: constant value used for padding if padding mode is constant
02358 :type c: `scalar`
02359
02360
02361
02362 :warning: The function depends on *scipy*
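
    **Example:** a minimal sketch, assuming a float column named 'signal':

    .. code-block:: python

      tab.GaussianSmooth('signal', std=2.0, na_value=0.0, padding='reflect')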
02363 '''
02364
02365 try:
02366 from scipy import ndimage
02367 import numpy as np
02368 except ImportError:
02369       LogError("I need scipy.ndimage and numpy, but could not import them")
02370 raise
02371
02372 idx = self.GetColIndex(col)
02373 col_type = self.col_types[idx]
02374 if col_type!='int' and col_type!='float':
02375 raise TypeError("GaussianSmooth can only be used on numeric column types")
02376
02377 vals=[]
02378 for v in self[col]:
02379 if v!=None:
02380 vals.append(v)
02381 else:
02382 vals.append(na_value)
02383
02384
02385 smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
02386
02387 result=[]
02388
02389 for v in smoothed_values_ndarray:
02390 result.append(v)
02391
02392 self[col]=result
02393
02394
02395 def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
02396 '''
02397 This returns the optimal prefactor values (i.e. a, b, c, ...) for the
02398 following equation
02399
02400 .. math::
02401 :label: op1
02402
02403 a*u + b*v + c*w + ... = z
02404
02405 where u, v, w and z are vectors. In matrix notation
02406
02407 .. math::
02408 :label: op2
02409
02410 A*p = z
02411
02412 where A contains the data from the table (u,v,w,...), p are the prefactors
02413 to optimize (a,b,c,...) and z is the vector containing the result of
02414 equation :eq:`op1`.
02415
02416     The parameter *ref_col* corresponds to z in both equations, and \*args are the
02417     columns u, v and w (or A in :eq:`op2`). All columns must be specified by their names.
02418
02419 **Example:**
02420
02421 .. code-block:: python
02422
02423 tab.GetOptimalPrefactors('colC', 'colA', 'colB')
02424
02425     The function returns a list containing the prefactors a, b, c, ... in the
02426     same order as the columns were specified in \*args.
02427
02428 Weighting:
02429 If the kwarg weights="columX" is specified, the equations are weighted by
02430 the values in that column. Each row is multiplied by the weight in that row,
02431 which leads to :eq:`op3`:
02432
02433 .. math::
02434 :label: op3
02435
02436 weight*a*u + weight*b*v + weight*c*w + ... = weight*z
02437
02438 Weights must be float or int and can have any value. A value of 0 ignores
02439 this equation, a value of 1 means the same as no weight. If all weights are
02440 the same for each row, the same result will be obtained as with no weights.
02441
02442 **Example:**
02443
02444 .. code-block:: python
02445
02446 tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
02447
02448 '''
02449 try:
02450 import numpy as np
02451
02452 if len(args)==0:
02453 raise RuntimeError("At least one column must be specified.")
02454
02455 b = self.GetNumpyMatrix(ref_col)
02456 a = self.GetNumpyMatrix(*args)
02457
02458 if len(kwargs)!=0:
02459 if kwargs.has_key('weights'):
02460 w = self.GetNumpyMatrix(kwargs['weights'])
02461 b = np.multiply(b,w)
02462 a = np.multiply(a,w)
02463
02464 else:
02465           raise RuntimeError("unrecognized kwargs given; only 'weights' is supported")
02466
02467 k = (a.T*a).I*a.T*b
02468 return list(np.array(k.T).reshape(-1))
02469
02470 except ImportError:
02471 LogError("Function needs numpy, but I could not import it.")
02472 raise
02473
02474 def PlotEnrichment(self, score_col, class_col, score_dir='-',
02475 class_dir='-', class_cutoff=2.0,
02476 style='-', title=None, x_title=None, y_title=None,
02477 clear=True, save=None):
02478 '''
02479 Plot an enrichment curve using matplotlib of column *score_col* classified
02480 according to *class_col*.
02481
02482 For more information about parameters of the enrichment, see
02483 :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
02484
02485 :warning: The function depends on *matplotlib*
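
    **Example:** a minimal sketch; 'score' and 'is_active' are assumed column
    names (numeric score, bool classification):

    .. code-block:: python

      p = tab.PlotEnrichment('score', 'is_active', save='enrichment.png')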
02486 '''
02487 try:
02488 import matplotlib.pyplot as plt
02489
02490 enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
02491 class_dir, class_cutoff)
02492
02493 if not title:
02494 title = 'Enrichment of %s'%score_col
02495
02496 if not x_title:
02497 x_title = '% database'
02498
02499 if not y_title:
02500 y_title = '% positives'
02501
02502 if clear:
02503 plt.clf()
02504
02505 plt.plot(enrx, enry, style)
02506
02507 plt.title(title, size='x-large', fontweight='bold')
02508 plt.ylabel(y_title, size='x-large')
02509 plt.xlabel(x_title, size='x-large')
02510
02511 if save:
02512 plt.savefig(save)
02513
02514 return plt
02515 except ImportError:
02516 LogError("Function needs matplotlib, but I could not import it.")
02517 raise
02518
02519 def ComputeEnrichment(self, score_col, class_col, score_dir='-',
02520 class_dir='-', class_cutoff=2.0):
02521 '''
02522 Computes the enrichment of column *score_col* classified according to
02523 *class_col*.
02524
02525 For this it is necessary, that the datapoints are classified into positive
02526 and negative points. This can be done in two ways:
02527
02528 - by using one 'bool' type column (*class_col*) which contains *True* for
02529 positives and *False* for negatives
02530
02531 - by specifying a classification column (*class_col*), a cutoff value
02532 (*class_cutoff*) and the classification columns direction (*class_dir*).
02533 This will generate the classification on the fly
02534
02535 * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
02536 * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
02537
02538 During the calculation, the table will be sorted according to *score_dir*,
02539     where a '-' value means smallest values first, i.e. the smaller the value,
02540     the better.
02541
02542 :warning: If either the value of *class_col* or *score_col* is *None*, the
02543 data in this row is ignored.
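
    **Example:** a minimal sketch; 'score' is an assumed numeric column and
    'is_active' an assumed bool classification column:

    .. code-block:: python

      enr = tab.ComputeEnrichment('score', 'is_active', score_dir='-')
      if enr:
        frac_db, frac_positives = enr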
02544 '''
02545
02546 ALLOWED_DIR = ['+','-']
02547
02548 score_idx = self.GetColIndex(score_col)
02549 score_type = self.col_types[score_idx]
02550 if score_type!='int' and score_type!='float':
02551 raise TypeError("Score column must be numeric type")
02552
02553 class_idx = self.GetColIndex(class_col)
02554 class_type = self.col_types[class_idx]
02555 if class_type!='int' and class_type!='float' and class_type!='bool':
02556 raise TypeError("Classifier column must be numeric or bool type")
02557
02558 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02559 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02560
02561 self.Sort(score_col, score_dir)
02562
02563 x = [0]
02564 y = [0]
02565 enr = 0
02566 old_score_val = None
02567 i = 0
02568
02569 for row in self.rows:
02570 class_val = row[class_idx]
02571 score_val = row[score_idx]
02572 if class_val==None or score_val==None:
02573 continue
02574 if class_val!=None:
02575 if old_score_val==None:
02576 old_score_val = score_val
02577 if score_val!=old_score_val:
02578 x.append(i)
02579 y.append(enr)
02580 old_score_val = score_val
02581 i+=1
02582 if class_type=='bool':
02583 if class_val==True:
02584 enr += 1
02585 else:
02586 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
02587 enr += 1
02588 x.append(i)
02589 y.append(enr)
02590
02591     # if no datapoints or no positives are found, the enrichment is undefined
02592 if x[-1]==0 or y[-1]==0:
02593 return None
02594
02595 x = [float(v)/x[-1] for v in x]
02596 y = [float(v)/y[-1] for v in y]
02597 return x,y
02598
02599 def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
02600 class_dir='-', class_cutoff=2.0):
02601 '''
02602 Computes the area under the curve of the enrichment using the trapezoidal
02603 rule.
02604
02605 For more information about parameters of the enrichment, see
02606 :meth:`ComputeEnrichment`.
02607
02608 :warning: The function depends on *numpy*
02609 '''
02610 try:
02611 import numpy as np
02612
02613 enr = self.ComputeEnrichment(score_col, class_col, score_dir,
02614 class_dir, class_cutoff)
02615
02616 if enr==None:
02617 return None
02618 return np.trapz(enr[1], enr[0])
02619 except ImportError:
02620 LogError("Function needs numpy, but I could not import it.")
02621 raise
02622
02623 def ComputeROC(self, score_col, class_col, score_dir='-',
02624 class_dir='-', class_cutoff=2.0):
02625 '''
02626 Computes the receiver operating characteristics (ROC) of column *score_col*
02627 classified according to *class_col*.
02628
02629 For this it is necessary, that the datapoints are classified into positive
02630 and negative points. This can be done in two ways:
02631
02632 - by using one 'bool' column (*class_col*) which contains True for positives
02633 and False for negatives
02634 - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
02635 and the classification columns direction (*class_dir*). This will generate
02636 the classification on the fly
02637
02638 - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
02639 - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
02640
02641 During the calculation, the table will be sorted according to *score_dir*,
02642     where a '-' value means smallest values first, i.e. the smaller the value,
02643     the better.
02644
02645     If *class_col* does not contain any positives (i.e. no value is True for a
02646     bool column, or no value passes the *class_dir*/*class_cutoff* test for an
02647     int or float column), the ROC is not defined and the function will
02648     return *None*.
02649
02650 :warning: If either the value of *class_col* or *score_col* is *None*, the
02651 data in this row is ignored.
02652 '''
02653
02654 ALLOWED_DIR = ['+','-']
02655
02656 score_idx = self.GetColIndex(score_col)
02657 score_type = self.col_types[score_idx]
02658 if score_type!='int' and score_type!='float':
02659 raise TypeError("Score column must be numeric type")
02660
02661 class_idx = self.GetColIndex(class_col)
02662 class_type = self.col_types[class_idx]
02663 if class_type!='int' and class_type!='float' and class_type!='bool':
02664 raise TypeError("Classifier column must be numeric or bool type")
02665
02666 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02667 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02668
02669 self.Sort(score_col, score_dir)
02670
02671 x = [0]
02672 y = [0]
02673 tp = 0
02674 fp = 0
02675 old_score_val = None
02676
02677 for i,row in enumerate(self.rows):
02678 class_val = row[class_idx]
02679 score_val = row[score_idx]
02680 if class_val==None or score_val==None:
02681 continue
02682 if class_val!=None:
02683 if old_score_val==None:
02684 old_score_val = score_val
02685 if score_val!=old_score_val:
02686 x.append(fp)
02687 y.append(tp)
02688 old_score_val = score_val
02689 if class_type=='bool':
02690 if class_val==True:
02691 tp += 1
02692 else:
02693 fp += 1
02694 else:
02695 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
02696 tp += 1
02697 else:
02698 fp += 1
02699 x.append(fp)
02700 y.append(tp)
02701
02702     # if no false positives or no true positives are found, the ROC is undefined
02703 if x[-1]==0 or y[-1]==0:
02704 return None
02705
02706 x = [float(v)/x[-1] for v in x]
02707 y = [float(v)/y[-1] for v in y]
02708 return x,y
02709
02710 def ComputeROCAUC(self, score_col, class_col, score_dir='-',
02711 class_dir='-', class_cutoff=2.0):
02712 '''
02713 Computes the area under the curve of the receiver operating characteristics
02714 using the trapezoidal rule.
02715
02716 For more information about parameters of the ROC, see
02717 :meth:`ComputeROC`.
02718
02719 :warning: The function depends on *numpy*
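
    **Example:** a minimal sketch; 'score' and 'is_active' are assumed column
    names (numeric score, bool classification):

    .. code-block:: python

      auc = tab.ComputeROCAUC('score', 'is_active', score_dir='-')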
02720 '''
02721 try:
02722 import numpy as np
02723
02724 roc = self.ComputeROC(score_col, class_col, score_dir,
02725 class_dir, class_cutoff)
02726
02727 if not roc:
02728 return None
02729 return np.trapz(roc[1], roc[0])
02730 except ImportError:
02731 LogError("Function needs numpy, but I could not import it.")
02732 raise
02733
02734 def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
02735 class_dir='-', class_cutoff=2.0):
02736 '''
02737 Computes the area under the curve of the log receiver operating
02738 characteristics (logROC) where the x-axis is semilogarithmic
02739 using the trapezoidal rule.
02740
02741     The logROC is computed with a lambda of 0.001 according to
02742     Mysinger M. and Shoichet B., "Rapid Context-Dependent Ligand Desolvation in
02743     Molecular Docking", Journal of Chemical Information and Modeling,
02744     2010, 50 (9), 1561-1573.
02745
02746 For more information about parameters of the ROC, see
02747 :meth:`ComputeROC`.
02748
02749 :warning: The function depends on *numpy*
02750 '''
02751 try:
02752 import numpy as np
02753
02754 roc = self.ComputeROC(score_col, class_col, score_dir,
02755 class_dir, class_cutoff)
02756
02757 if not roc:
02758 return None
02759
02760 rocxt, rocyt = roc
02761 rocx=[]
02762 rocy=[]
02763
02764 # define lambda
02765 l=0.001
02766
02767       # clamp zero x-values to lambda, then drop duplicate x-values
02768 rocxt = [x if x>0 else l for x in rocxt]
02769 for i in range(len(rocxt)-1):
02770 if rocxt[i]==rocxt[i+1]:
02771 continue
02772 rocx.append(rocxt[i])
02773 rocy.append(rocyt[i])
02774 rocx.append(1.0)
02775 rocy.append(1.0)
02776
02777 # compute logauc
02778 value = 0
02779 for i in range(len(rocx)-1):
02780 x = rocx[i]
02781 if rocx[i]==rocx[i+1]:
02782 continue
02783 b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
02784 value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
02785 return value/math.log10(1.0/l)
02786
02787 except ImportError:
02788 LogError("Function needs numpy, but I could not import it.")
02789 raise
02790
02791 def PlotROC(self, score_col, class_col, score_dir='-',
02792 class_dir='-', class_cutoff=2.0,
02793 style='-', title=None, x_title=None, y_title=None,
02794 clear=True, save=None):
02795 '''
02796 Plot an ROC curve using matplotlib.
02797
02798 For more information about parameters of the ROC, see
02799 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
02800
02801 :warning: The function depends on *matplotlib*
02802 '''
02803
02804 try:
02805 import matplotlib.pyplot as plt
02806
02807 roc = self.ComputeROC(score_col, class_col, score_dir,
02808 class_dir, class_cutoff)
02809
02810 if not roc:
02811 return None
02812
02813 enrx, enry = roc
02814
02815 if not title:
02816 title = 'ROC of %s'%score_col
02817
02818 if not x_title:
02819 x_title = 'false positive rate'
02820
02821 if not y_title:
02822 y_title = 'true positive rate'
02823
02824 if clear:
02825 plt.clf()
02826
02827 plt.plot(enrx, enry, style)
02828
02829 plt.title(title, size='x-large', fontweight='bold')
02830 plt.ylabel(y_title, size='x-large')
02831 plt.xlabel(x_title, size='x-large')
02832
02833 if save:
02834 plt.savefig(save)
02835
02836 return plt
02837 except ImportError:
02838 LogError("Function needs matplotlib, but I could not import it.")
02839 raise
02840
02841 def PlotLogROC(self, score_col, class_col, score_dir='-',
02842 class_dir='-', class_cutoff=2.0,
02843 style='-', title=None, x_title=None, y_title=None,
02844 clear=True, save=None):
02845 '''
02846     Plot a logROC curve where the x-axis is semilogarithmic using matplotlib.
02847
02848 For more information about parameters of the ROC, see
02849 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
02850
02851 :warning: The function depends on *matplotlib*
02852 '''
02853
02854 try:
02855 import matplotlib.pyplot as plt
02856
02857 roc = self.ComputeROC(score_col, class_col, score_dir,
02858 class_dir, class_cutoff)
02859
02860 if not roc:
02861 return None
02862
02863 rocx, rocy = roc
02864
02865 if not title:
02866 title = 'logROC of %s'%score_col
02867
02868 if not x_title:
02869 x_title = 'false positive rate'
02870
02871 if not y_title:
02872 y_title = 'true positive rate'
02873
02874 if clear:
02875 plt.clf()
02876
02877 rocx = [x if x>0 else 0.001 for x in rocx]
02878
02879
02880 plt.plot(rocx, rocy, style)
02881
02882 plt.title(title, size='x-large', fontweight='bold')
02883 plt.ylabel(y_title, size='x-large')
02884 plt.xlabel(x_title, size='x-large')
02885
02886 plt.xscale('log', basex=10)
02887 plt.xlim(0.001, 1.0)
02888
02889
02890 if save:
02891 plt.savefig(save)
02892
02893 return plt
02894 except ImportError:
02895 LogError("Function needs matplotlib, but I could not import it.")
02896 raise
02897
02898 def ComputeMCC(self, score_col, class_col, score_dir='-',
02899 class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
02900 '''
02901 Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
02902 with the points classified into true positives, false positives, true
02903 negatives and false negatives according to a specified classification
02904 column (*class_col*).
02905
02906 The datapoints in *score_col* and *class_col* are classified into
02907 positive and negative points. This can be done in two ways:
02908
02909 - by using 'bool' columns which contains True for positives and False
02910 for negatives
02911
02912 - by using 'float' or 'int' columns and specifying a cutoff value and the
02913 columns direction. This will generate the classification on the fly
02914
02915 * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
02916 * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
02917
02918 The two possibilities can be used together, i.e. 'bool' type for one column
02919 and 'float'/'int' type and cutoff/direction for the other column.
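
    **Example:** a minimal sketch; 'score' (numeric) and 'is_active' (bool) are
    assumed column names, and the score cutoff is purely illustrative:

    .. code-block:: python

      mcc = tab.ComputeMCC('score', 'is_active', score_dir='-', score_cutoff=1.5)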
02920 '''
02921 ALLOWED_DIR = ['+','-']
02922
02923 score_idx = self.GetColIndex(score_col)
02924 score_type = self.col_types[score_idx]
02925 if score_type!='int' and score_type!='float' and score_type!='bool':
02926 raise TypeError("Score column must be numeric or bool type")
02927
02928 class_idx = self.GetColIndex(class_col)
02929 class_type = self.col_types[class_idx]
02930 if class_type!='int' and class_type!='float' and class_type!='bool':
02931 raise TypeError("Classifier column must be numeric or bool type")
02932
02933 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02934 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02935
02936 tp = 0
02937 fp = 0
02938 fn = 0
02939 tn = 0
02940
02941 for i,row in enumerate(self.rows):
02942 class_val = row[class_idx]
02943 score_val = row[score_idx]
02944 if class_val!=None:
02945 if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
02946 if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
02947 tp += 1
02948 else:
02949 fn += 1
02950 else:
02951 if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
02952 tn += 1
02953 else:
02954 fp += 1
02955
02956 mcc = None
02957 msg = None
02958 if (tp+fn)==0:
02959 msg = 'factor (tp + fn) is zero'
02960 elif (tp+fp)==0:
02961 msg = 'factor (tp + fp) is zero'
02962 elif (tn+fn)==0:
02963 msg = 'factor (tn + fn) is zero'
02964 elif (tn+fp)==0:
02965 msg = 'factor (tn + fp) is zero'
02966
02967 if msg:
02968 LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
02969 else:
02970 mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
02971 return mcc
02972
02973
02974 def IsEmpty(self, col_name=None, ignore_nan=True):
02975 '''
02976 Checks if a table is empty.
02977
02978 If no column name is specified, the whole table is checked for being empty,
02979 whereas if a column name is specified, only this column is checked.
02980
02981 By default, all NAN (or None) values are ignored, and thus, a table
02982 containing only NAN values is considered as empty. By specifying the
02983 option ignore_nan=False, NAN values are counted as 'normal' values.
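
    **Example:** a minimal sketch; the column name 'score' is illustrative:

    .. code-block:: python

      table_empty = tab.IsEmpty()        # whole table
      col_empty = tab.IsEmpty('score')   # single column, None values ignored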
02984 '''
02985
02986 # table with no columns and no rows
02987 if len(self.col_names)==0:
02988 if col_name:
02989 raise ValueError('Table has no column named "%s"' % col_name)
02990 return True
02991
02992 # column name specified
02993 if col_name:
02994 if self.Count(col_name, ignore_nan=ignore_nan)==0:
02995 return True
02996 else:
02997 return False
02998
02999 # no column name specified -> test whole table
03000 else:
03001 for row in self.rows:
03002 for cell in row:
03003 if ignore_nan:
03004 if cell!=None:
03005 return False
03006 else:
03007 return False
03008 return True
03009
03010
03011 def Extend(self, tab, overwrite=None):
03012 """
03013 Append each row of *tab* to the current table. The data is appended based
03014 on the column names, thus the order of the table columns is *not* relevant,
03015 only the header names.
03016
03017 If there is a column in *tab* that is not present in the current table,
03018 it is added to the current table and filled with *None* for all the rows
03019 present in the current table.
03020
03021 If the type of any column in *tab* is not the same as in the current table
03022 a *TypeError* is raised.
03023
03024     If *overwrite* is not None and set to an existing column name, the specified
03025     column in the current table is searched for the first occurrence of a value
03026     matching the value of the corresponding column in the row to be added. If a
03027     matching value is found, that row is overwritten with the new row. If no
03028     matching row is found, a new row is appended to the table.
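
    **Example:** a minimal sketch; ``other_tab`` is assumed to be a second
    :class:`Table` sharing (some of) the column names of ``tab``, and 'id' is an
    illustrative column name:

    .. code-block:: python

      tab.Extend(other_tab)                  # plain append of all rows
      tab.Extend(other_tab, overwrite='id')  # overwrite rows with matching 'id'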
03029 """
03030 # add column to current table if it doesn't exist
03031 for name,typ in zip(tab.col_names, tab.col_types):
03032 if not name in self.col_names:
03033 self.AddCol(name, typ)
03034
03035 # check that column types are the same in current and new table
03036 for name in self.col_names:
03037 if name in tab.col_names:
03038 curr_type = self.col_types[self.GetColIndex(name)]
03039 new_type = tab.col_types[tab.GetColIndex(name)]
03040 if curr_type!=new_type:
03041 raise TypeError('cannot extend table, column %s in new '%name +\
03042 'table different type (%s) than in '%new_type +\
03043 'current table (%s)'%curr_type)
03044
03045 num_rows = len(tab.rows)
03046 for i in range(0,num_rows):
03047 row = tab.rows[i]
03048 data = dict(zip(tab.col_names,row))
03049 self.AddRow(data, overwrite)
03050
03051
03052 def Merge(table1, table2, by, only_matching=False):
03053 """
03054 Returns a new table containing the data from both tables. The rows are
03055 combined based on the common values in the column(s) by. The option 'by' can
03056 be a list of column names. When this is the case, merging is based on
03057 multiple columns.
03058 For example, the two tables below
03059
03060 ==== ====
03061 x y
03062 ==== ====
03063 1 10
03064 2 15
03065 3 20
03066 ==== ====
03067
03068 ==== ====
03069 x u
03070 ==== ====
03071 1 100
03072 3 200
03073 4 400
03074 ==== ====
03075
03076 when merged by column x, produce the following output:
03077
03078 ===== ===== =====
03079 x y u
03080 ===== ===== =====
03081 1 10 100
03082 2 15 None
03083 3 20 200
03084 4 None 400
03085 ===== ===== =====
03086
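  A call producing the merged table above could look like this, assuming the
  two input tables are available as ``tab1`` and ``tab2``:

  .. code-block:: python

    merged = Merge(tab1, tab2, by='x')
    inner  = Merge(tab1, tab2, by='x', only_matching=True)  # only common x values
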
03087
03088 """
03089 def _key(row, indices):
03090 return tuple([row[i] for i in indices])
03091 def _keep(indices, cn, ct, ni):
03092 ncn, nct, nni=([],[],[])
03093 for i in range(len(cn)):
03094 if i not in indices:
03095 ncn.append(cn[i])
03096 nct.append(ct[i])
03097 nni.append(ni[i])
03098 return ncn, nct, nni
03099 col_names=list(table2.col_names)
03100 col_types=list(table2.col_types)
03101 new_index=[i for i in range(len(col_names))]
03102 if isinstance(by, str):
03103 common2_indices=[col_names.index(by)]
03104 else:
03105 common2_indices=[col_names.index(b) for b in by]
03106 col_names, col_types, new_index=_keep(common2_indices, col_names,
03107 col_types, new_index)
03108
03109 for i, name in enumerate(col_names):
03110 try_name=name
03111 counter=1
03112 while try_name in table1.col_names:
03113 counter+=1
03114 try_name='%s_%d' % (name, counter)
03115 col_names[i]=try_name
03116 common1={}
03117 if isinstance(by, str):
03118 common1_indices=[table1.col_names.index(by)]
03119 else:
03120 common1_indices=[table1.col_names.index(b) for b in by]
03121 for row in table1.rows:
03122 key=_key(row, common1_indices)
03123 if key in common1:
03124       raise ValueError('duplicate key "%s" in first table' % (str(key)))
03125 common1[key]=row
03126 common2={}
03127 for row in table2.rows:
03128 key=_key(row, common2_indices)
03129 if key in common2:
03130 raise ValueError('duplicate key "%s" in second table' % (str(key)))
03131 common2[key]=row
03132 new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
03133 for k, v in common1.iteritems():
03134 row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
03135 matched=False
03136 if k in common2:
03137 matched=True
03138 row2=common2[k]
03139 for i, index in enumerate(new_index):
03140 row[len(table1.col_names)+i]=row2[index]
03141 if only_matching and not matched:
03142 continue
03143 new_tab.AddRow(row)
03144 if only_matching:
03145 return new_tab
03146 for k, v in common2.iteritems():
03147 if not k in common1:
03148 v2=[v[i] for i in new_index]
03149 row=[None for i in range(len(table1.col_names))]+v2
03150 for common1_index, common2_index in zip(common1_indices, common2_indices):
03151 row[common1_index]=v[common2_index]
03152 new_tab.AddRow(row)
03153 return new_tab
03154