00001 import csv
00002 import re
00003 import math
00004 from ost import stutil
00005 import itertools
00006 import operator
00007 import cPickle
00008 import weakref
00009 from ost import LogError, LogWarning, LogInfo, LogVerbose
00010
00011 def MakeTitle(col_name):
00012 return col_name.replace('_', ' ')
00013
00014 def IsStringLike(value):
00015 if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
00016 return False
00017 try:
00018 value+''
00019 return True
00020 except:
00021 return False
00022
00023 def IsNullString(value):
00024 value=value.strip().upper()
00025 return value in ('', 'NULL', 'NONE', 'NA')
00026
00027 def IsScalar(value):
00028 if IsStringLike(value):
00029 return True
00030 try:
00031 if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
00032 return False
00033 iter(value)
00034 return False
00035 except:
00036 return True
00037
00038 def GuessColumnType(iterator):
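  '''
  Guess a column type for the values produced by *iterator* and return one of
  'int', 'float', 'bool' or 'string'. Null-like values ('', 'NULL', 'NONE',
  'NA') are ignored; if only such values (or no values at all) are present,
  'string' is returned.
  '''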
00039 empty=True
00040 possibilities=set(['bool', 'int', 'float'])
00041 for ele in iterator:
00042 str_ele=str(ele).upper()
00043 if IsNullString(str_ele):
00044 continue
00045 empty=False
00046 if 'int' in possibilities:
00047 try:
00048 int(str_ele)
00049 except ValueError:
00050 possibilities.remove('int')
00051
00052 if 'float' in possibilities:
00053 try:
00054 float(str_ele)
00055 except ValueError:
00056 possibilities.remove('float')
00057 if 'bool' in possibilities:
00058 if str_ele not in set(['YES', 'NO', 'TRUE', 'FALSE']):
00059 possibilities.remove('bool')
00060
00061 if len(possibilities)==0:
00062 return 'string'
00063 if len(possibilities)==2:
00064 return 'int'
00065 if empty:
00066 return 'string'
00067
00068 return possibilities.pop()
00069
00070 class BinaryColExpr:
00071 def __init__(self, op, lhs, rhs):
00072 self.op=op
00073 self.lhs=lhs
00074 self.rhs=rhs
00075 if IsScalar(lhs):
self.lhs=itertools.cycle([self.lhs])
00077 if IsScalar(rhs):
00078 self.rhs=itertools.cycle([self.rhs])
00079 def __iter__(self):
00080 for l, r in zip(self.lhs, self.rhs):
00081 if l!=None and r!=None:
00082 yield self.op(l, r)
00083 else:
00084 yield None
00085 def __add__(self, rhs):
00086 return BinaryColExpr(operator.add, self, rhs)
00087
00088 def __sub__(self, rhs):
00089 return BinaryColExpr(operator.sub, self, rhs)
00090
00091 def __mul__(self, rhs):
00092 return BinaryColExpr(operator.mul, self, rhs)
00093
00094 def __div__(self, rhs):
00095 return BinaryColExpr(operator.div, self, rhs)
00096
00097 class TableCol:
00098 def __init__(self, table, col):
00099 self._table=table
00100 if type(col)==str:
00101 self.col_index=self._table.GetColIndex(col)
00102 else:
00103 self.col_index=col
00104
00105 def __iter__(self):
00106 for row in self._table.rows:
00107 yield row[self.col_index]
00108
00109 def __len__(self):
00110 return len(self._table.rows)
00111
00112 def __getitem__(self, index):
00113 return self._table.rows[index][self.col_index]
00114
00115 def __setitem__(self, index, value):
00116 self._table.rows[index][self.col_index]=value
00117
00118 def __add__(self, rhs):
00119 return BinaryColExpr(operator.add, self, rhs)
00120
00121 def __sub__(self, rhs):
00122 return BinaryColExpr(operator.sub, self, rhs)
00123
00124 def __mul__(self, rhs):
00125 return BinaryColExpr(operator.mul, self, rhs)
00126
00127 def __div__(self, rhs):
00128 return BinaryColExpr(operator.div, self, rhs)
00129
00130 class TableRow:
00131 """
00132 Essentially a named tuple, but allows column names that are not valid
00133 python variable names.
00134 """
00135 def __init__(self, row_data, tab):
00136 self.__dict__['tab'] = weakref.proxy(tab)
00137 self.__dict__['row_data'] = row_data
00138
00139 def __getitem__(self, col_name):
00140 if type(col_name)==int:
00141 return self.row_data[col_name]
00142 return self.row_data[self.tab.GetColIndex(col_name)]
00143
00144 def __str__(self):
00145 s = []
00146 for k, v in zip(self.__dict__['tab'].col_names, self.__dict__['row_data']):
00147 s.append('%s=%s' % (k, str(v)))
00148 return ', '.join(s)
00149
00150
00151 def __len__(self):
00152 return len(self.row_data)
00153
00154 def __setitem__(self, col_name, val):
00155 if type(col_name)==int:
00156 self.row_data[col_name] = val
00157 else:
00158 self.row_data[self.tab.GetColIndex(col_name)] = val
00159
00160 def __getattr__(self, col_name):
00161 if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
00162 raise AttributeError(col_name)
00163 return self.row_data[self.tab.GetColIndex(col_name)]
00164
00165 def __setattr__(self, col_name, val):
00166 if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
00167 raise AttributeError(col_name)
00168 self.row_data[self.tab.GetColIndex(col_name)] = val
00169
00170 class Table(object):
00171 """
00172
00173 The table class provides convenient access to data in tabular form. An empty
00174 table can be easily constructed as follows
00175
00176 .. code-block:: python
00177
00178 tab = Table()
00179
00180 If you want to add columns directly when creating the table, column names
00181 and *column types* can be specified as follows
00182
00183 .. code-block:: python
00184
00185 tab = Table(['nameX','nameY','nameZ'], 'sfb')
00186
00187 this will create three columns called nameX, nameY and nameZ of type string,
00188 float and bool, respectively. There will be no data in the table and thus,
00189 the table will not contain any rows.
00190
00191 The following *column types* are supported:
00192
00193 ======= ========
00194 name abbrev
00195 ======= ========
00196 string s
00197 float f
00198 int i
00199 bool b
00200 ======= ========
00201
00202 If you want to add data to the table in addition, use the following:
00203
00204 .. code-block:: python
00205
00206 tab=Table(['nameX','nameY','nameZ'],
00207 'sfb',
00208 nameX = ['a','b','c'],
00209 nameY = [0.1, 1.2, 3.414],
00210 nameZ = [True, False, False])
00211
If the values for a column are left out, they will be filled with NA. If
values are specified, the same number of values must be given for each
column.
00215
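  For illustration, in the following example (with arbitrary values) the
  column nameY is not specified and is therefore filled with NA for every row:

  .. code-block:: python

    tab = Table(['nameX','nameY','nameZ'],
                'sfb',
                nameX = ['a','b'],
                nameZ = [True, False])
    print tab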
00216 """
00217
00218 SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
00219
00220
00221 def __init__(self, col_names=[], col_types=None, **kwargs):
00222
00223 self.col_names=list(col_names)
00224 self.comment=''
00225 self.name=''
00226
00227 self.col_types = self._ParseColTypes(col_types)
00228 self.rows=[]
00229 if len(kwargs)>=0:
00230 if not col_names:
00231 self.col_names=[v for v in kwargs.keys()]
00232 if not self.col_types:
00233 self.col_types=['string' for u in range(len(self.col_names))]
00234 if len(kwargs)>0:
00235 self._AddRowsFromDict(kwargs)
00236
00237 def __getattr__(self, col_name):
00238
00239
00240
00241
00242 if 'col_names' not in self.__dict__ or col_name not in self.col_names:
00243 raise AttributeError(col_name)
00244 return TableCol(self, col_name)
00245
00246 @staticmethod
00247 def _ParseColTypes(types, exp_num=None):
00248 if types==None:
00249 return None
00250
00251 short2long = {'s' : 'string', 'i': 'int', 'b' : 'bool', 'f' : 'float'}
00252 allowed_short = short2long.keys()
00253 allowed_long = short2long.values()
00254
00255 type_list = []
00256
00257
00258 if IsScalar(types):
00259 if type(types)==str:
00260 types = types.lower()
00261
00262
00263 if types in allowed_long:
00264 type_list.append(types)
00265 elif types in allowed_short:
00266 type_list.append(short2long[types])
00267
00268
00269 elif types.find(',')!=-1:
00270 for t in types.split(','):
00271 if t in allowed_long:
00272 type_list.append(t)
00273 elif t in allowed_short:
00274 type_list.append(short2long[t])
00275 else:
00276 raise ValueError('Unknown type %s in types %s'%(t,types))
00277
00278
00279 else:
00280 for t in types:
00281 if t in allowed_short:
00282 type_list.append(short2long[t])
00283 else:
00284 raise ValueError('Unknown type %s in types %s'%(t,types))
00285
00286
00287 else:
00288 raise ValueError('Col type %s must be string or list'%types)
00289
00290
00291 else:
00292 for t in types:
00293
00294 if type(t)==str:
00295 t = t.lower()
00296 if t in allowed_long:
00297 type_list.append(t)
00298 elif t in allowed_short:
00299 type_list.append(short2long[t])
00300 else:
00301 raise ValueError('Unknown type %s in types %s'%(t,types))
00302
00303
00304 else:
00305 raise ValueError('Col type %s must be string or list'%types)
00306
00307 if exp_num:
00308 if len(type_list)!=exp_num:
raise ValueError(('Parsed number of col types (%i) differs from '
'expected (%i) in types %s')%(len(type_list),exp_num,types))
00311
00312 return type_list
00313
00314 def SetName(self, name):
00315 '''
00316 Set name of the table
00317
00318 :param name: name
00319 :type name: :class:`str`
00320 '''
00321 self.name = name
00322
00323 def GetName(self):
00324 '''
00325 Get name of table
00326 '''
00327 return self.name
00328
00329 def RenameCol(self, old_name, new_name):
00330 """
00331 Rename column *old_name* to *new_name*.
00332
00333 :param old_name: Name of the old column
00334 :param new_name: Name of the new column
00335 :raises: :exc:`ValueError` when *old_name* is not a valid column
00336 """
00337 if old_name==new_name:
00338 return
00339 self.AddCol(new_name, self.col_types[self.GetColIndex(old_name)],
00340 self[old_name])
00341 self.RemoveCol(old_name)
00342 def _Coerce(self, value, ty):
00343 '''
00344 Try to convert values (e.g. from :class:`str` type) to the specified type
00345
00346 :param value: the value
00347 :type value: any type
00348
00349 :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
00350 *bool*)
00351 :type ty: :class:`str`
00352 '''
00353 if value=='NA' or value==None:
00354 return None
00355 if ty=='int':
00356 return int(value)
00357 if ty=='float':
00358 return float(value)
00359 if ty=='string':
00360 return str(value)
00361 if ty=='bool':
00362 if isinstance(value, str) or isinstance(value, unicode):
00363 if value.upper() in ('FALSE', 'NO',):
00364 return False
00365 return True
00366 return bool(value)
00367 raise ValueError('Unknown type %s' % ty)
00368
00369 def GetColIndex(self, col):
00370 '''
00371 Returns the column index for the column with the given name.
00372
00373 :raises: ValueError if no column with the name is found.
00374 '''
00375 if col not in self.col_names:
00376 raise ValueError('Table has no column named "%s"' % col)
00377 return self.col_names.index(col)
00378
00379 def GetColNames(self):
00380 '''
00381 Returns a list containing all column names.
00382 '''
00383 return self.col_names
00384
00385 def SearchColNames(self, regex):
00386 '''
00387 Returns a list of column names matching the regex.
00388
00389 :param regex: regex pattern
00390 :type regex: :class:`str`
00391
00392 :returns: :class:`list` of column names (:class:`str`)
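
    For illustration, assuming a table with columns 'foo_x', 'foo_y' and 'bar':

    .. code-block:: python

      tab = Table(['foo_x','foo_y','bar'], 'fff')
      print tab.SearchColNames('^foo_')   # prints ['foo_x', 'foo_y']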
00393 '''
00394 matching_names = []
00395 for name in self.col_names:
00396 matches = re.search(regex, name)
00397 if matches:
00398 matching_names.append(name)
00399 return matching_names
00400
00401 def HasCol(self, col):
00402 '''
00403 Checks if the column with a given name is present in the table.
00404 '''
00405 return col in self.col_names
00406
00407 def __getitem__(self, k):
00408 if type(k)==int:
00409 return TableCol(self, self.col_names[k])
00410 else:
00411 return TableCol(self, k)
00412
00413 def __setitem__(self, k, value):
00414 col_index=k
00415 if type(k)!=int:
00416 col_index=self.GetColIndex(k)
00417 if IsScalar(value):
00418 value=itertools.cycle([value])
00419 for r, v in zip(self.rows, value):
00420 r[col_index]=v
00421
00422 def ToString(self, float_format='%.3f', int_format='%d', rows=None):
00423 '''
00424 Convert the table into a string representation.
00425
00426 The output format can be modified for int and float type columns by
00427 specifying a formatting string for the parameters *float_format* and
00428 *int_format*.
00429
The option *rows* specifies the range of rows to be printed. The parameter
00431 must be a type that supports indexing (e.g. a :class:`list`) containing the
00432 start and end row *index*, e.g. [start_row_idx, end_row_idx].
00433
00434 :param float_format: formatting string for float columns
00435 :type float_format: :class:`str`
00436
00437 :param int_format: formatting string for int columns
00438 :type int_format: :class:`str`
00439
00440 :param rows: iterable containing start and end row *index*
00441 :type rows: iterable containing :class:`ints <int>`
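
    For example (column names and values are arbitrary):

    .. code-block:: python

      tab = Table(['x','y'], 'ff', x=[1.0, 2.0, 3.0], y=[0.5, 1.5, 2.5])
      # print only the first two rows, floats with two digits
      print tab.ToString(float_format='%.2f', rows=[0, 2])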
00442 '''
00443 widths=[len(cn) for cn in self.col_names]
00444 sel_rows=self.rows
00445 if rows:
00446 sel_rows=self.rows[rows[0]:rows[1]]
00447 for row in sel_rows:
00448 for i, (ty, col) in enumerate(zip(self.col_types, row)):
00449 if col==None:
00450 widths[i]=max(widths[i], len('NA'))
00451 elif ty=='float':
00452 widths[i]=max(widths[i], len(float_format % col))
00453 elif ty=='int':
00454 widths[i]=max(widths[i], len(int_format % col))
00455 else:
00456 widths[i]=max(widths[i], len(str(col)))
00457 s=''
00458 if self.comment:
00459 s+=''.join(['# %s\n' % l for l in self.comment.split('\n')])
00460 total_width=sum(widths)+2*len(widths)
00461 for width, col_name in zip(widths, self.col_names):
00462 s+=col_name.center(width+2)
00463 s+='\n%s\n' % ('-'*total_width)
00464 for row in sel_rows:
00465 for width, ty, col in zip(widths, self.col_types, row):
00466 cs=''
00467 if col==None:
00468 cs='NA'.center(width+2)
00469 elif ty=='float':
00470 cs=(float_format % col).rjust(width+2)
00471 elif ty=='int':
00472 cs=(int_format % col).rjust(width+2)
00473 else:
00474 cs=' '+str(col).ljust(width+1)
00475 s+=cs
00476 s+='\n'
00477 return s
00478
00479 def __str__(self):
00480 return self.ToString()
00481
00482 def Stats(self, col):
00483 idx = self.GetColIndex(col)
00484 text ='''
00485 Statistics for column %(col)s
00486
00487 Number of Rows : %(num)d
00488 Number of Rows Not None: %(num_non_null)d
00489 Mean : %(mean)f
00490 Median : %(median)f
00491 Standard Deviation : %(stddev)f
00492 Min : %(min)f
00493 Max : %(max)f
00494 '''
00495 data = {
00496 'col' : col,
00497 'num' : len(self.rows),
00498 'num_non_null' : self.Count(col),
00499 'median' : self.Median(col),
00500 'mean' : self.Mean(col),
00501 'stddev' : self.StdDev(col),
00502 'min' : self.Min(col),
00503 'max' : self.Max(col),
00504 }
00505 return text % data
00506
00507 def _AddRowsFromDict(self, d, overwrite=None):
00508 '''
00509 Add one or more rows from a :class:`dictionary <dict>`.
00510
00511 If *overwrite* is not None and set to an existing column name, the specified
00512 column in the table is searched for the first occurrence of a value matching
00513 the value of the column with the same name in the dictionary. If a matching
00514 value is found, the row is overwritten with the dictionary. If no matching
00515 row is found, a new row is appended to the table.
00516
00517 :param d: dictionary containing the data
00518 :type d: :class:`dict`
00519
00520 :param overwrite: column name to overwrite existing row if value in
00521 column *overwrite* matches
00522 :type overwrite: :class:`str`
00523
00524 :raises: :class:`ValueError` if multiple rows are added but the number of
00525 data items is different for different columns.
00526 '''
00527
00528 idxs = [self.GetColIndex(k) for k in d.keys()]
00529
00530
00531 old_len = None
00532 for k,v in d.iteritems():
00533 if IsScalar(v):
00534 v = [v]
00535 d[k] = v
00536 if not old_len:
00537 old_len = len(v)
00538 elif old_len!=len(v):
00539 raise ValueError("Cannot add rows: length of data must be equal " + \
00540 "for all columns in %s"%str(d))
00541
00542
00543 for i,data in enumerate(zip(*d.values())):
00544 new_row = [None for a in range(len(self.col_names))]
00545 for idx,v in zip(idxs,data):
00546 new_row[idx] = self._Coerce(v, self.col_types[idx])
00547
00548
00549 if overwrite:
00550 overwrite_idx = self.GetColIndex(overwrite)
00551 added = False
00552 for i,r in enumerate(self.rows):
00553 if r[overwrite_idx]==new_row[overwrite_idx]:
00554 for j,e in enumerate(self.rows[i]):
00555 if new_row[j]==None:
00556 new_row[j] = e
00557 self.rows[i] = new_row
00558 added = True
00559 break
00560
00561
00562 if not overwrite or not added:
00563 self.rows.append(new_row)
00564
00565 def PairedTTest(self, col_a, col_b):
00566 """
00567 Two-sided test for the null-hypothesis that two related samples
00568 have the same average (expected values).
00569
00570 :param col_a: First column
00571 :param col_b: Second column
00572
00573 :returns: P-value between 0 and 1 that the two columns have the
00574 same average. The smaller the value, the less related the two
00575 columns are.
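
    For illustration (requires scipy; column names and values are arbitrary):

    .. code-block:: python

      tab = Table(['a','b'], 'ff', a=[1.0, 2.0, 3.0], b=[1.1, 2.2, 2.9])
      p_value = tab.PairedTTest('a', 'b')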
00576 """
00577 from scipy.stats import ttest_rel
00578 xs = []
00579 ys = []
00580 for x, y in self.Zip(col_a, col_b):
00581 if x!=None and y!=None:
00582 xs.append(x)
00583 ys.append(y)
00584 result = ttest_rel(xs, ys)
00585 return result[1]
00586
00587 def AddRow(self, data, overwrite=None):
00588 """
00589 Add a row to the table.
00590
00591 *data* may either be a dictionary or a list-like object:
00592
00593 - If *data* is a dictionary, the keys in the dictionary must match the
00594 column names. Columns not found in the dict will be initialized to None.
00595 If the dict contains list-like objects, multiple rows will be added, if
00596 the number of items in all list-like objects is the same, otherwise a
00597 :class:`ValueError` is raised.
00598
00599 - If *data* is a list-like object, the row is initialized from the values
00600 in *data*. The number of items in *data* must match the number of
columns in the table. A :class:`ValueError` is raised otherwise. The
00602 values are added in the order specified in the list, thus, the order of
00603 the data must match the columns.
00604
00605 If *overwrite* is not None and set to an existing column name, the specified
00606 column in the table is searched for the first occurrence of a value matching
00607 the value of the column with the same name in the dictionary. If a matching
00608 value is found, the row is overwritten with the dictionary. If no matching
00609 row is found, a new row is appended to the table.
00610
00611 :param data: data to add
00612 :type data: :class:`dict` or *list-like* object
00613
00614 :param overwrite: column name to overwrite existing row if value in
00615 column *overwrite* matches
00616 :type overwrite: :class:`str`
00617
00618 :raises: :class:`ValueError` if *list-like* object is used and number of
00619 items does *not* match number of columns in table.
00620
00621 :raises: :class:`ValueError` if *dict* is used and multiple rows are added
00622 but the number of data items is different for different columns.
00623
00624 **Example:** add multiple data rows to a subset of columns using a dictionary
00625
00626 .. code-block:: python
00627
00628 # create table with three float columns
00629 tab = Table(['x','y','z'], 'fff')
00630
00631 # add rows from dict
00632 data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
00633 tab.AddRow(data)
00634 print tab
00635
00636 '''
00637 will produce the table
00638
00639 ==== ==== ====
00640 x y z
00641 ==== ==== ====
00642 1.20 NA 1.60
00643 1.60 NA 5.30
00644 ==== ==== ====
00645 '''
00646
00647 # overwrite the row with x=1.2 and add row with x=1.9
00648 data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
00649 tab.AddRow(data, overwrite='x')
00650 print tab
00651
00652 '''
00653 will produce the table
00654
00655 ==== ==== ====
00656 x y z
00657 ==== ==== ====
00658 1.20 NA 7.90
00659 1.60 NA 5.30
00660 1.90 NA 3.50
00661 ==== ==== ====
00662 '''
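
    A row can also be added from a list-like object; for illustration:

    .. code-block:: python

      # values must be given in column order x, y, z
      tab.AddRow([2.1, 3.0, 4.4])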
00663 """
00664 if type(data)==dict:
00665 self._AddRowsFromDict(data, overwrite)
00666 else:
00667 if len(data)!=len(self.col_names):
00668 msg='data array must have %d elements, not %d'
00669 raise ValueError(msg % (len(self.col_names), len(data)))
00670 new_row = [self._Coerce(v, t) for v, t in zip(data, self.col_types)]
00671
00672
00673 if overwrite:
00674 overwrite_idx = self.GetColIndex(overwrite)
00675 added = False
00676 for i,r in enumerate(self.rows):
00677 if r[overwrite_idx]==new_row[overwrite_idx]:
00678 self.rows[i] = new_row
00679 added = True
00680 break
00681
00682
00683 if not overwrite or not added:
00684 self.rows.append(new_row)
00685
00686 def RemoveCol(self, col):
00687 """
00688 Remove column with the given name from the table.
00689
00690 :param col: name of column to remove
00691 :type col: :class:`str`
00692 """
00693 idx = self.GetColIndex(col)
00694 del self.col_names[idx]
00695 del self.col_types[idx]
00696 for row in self.rows:
00697 del row[idx]
00698
00699 def AddCol(self, col_name, col_type, data=None):
00700 """
00701 Add a column to the right of the table.
00702
00703 :param col_name: name of new column
00704 :type col_name: :class:`str`
00705
00706 :param col_type: type of new column (long versions: *int*, *float*, *bool*,
00707 *string* or short versions: *i*, *f*, *b*, *s*)
00708 :type col_type: :class:`str`
00709
00710 :param data: data to add to new column
00711 :type data: scalar or iterable
00712
00713 **Example:**
00714
00715 .. code-block:: python
00716
00717 tab = Table(['x'], 'f', x=range(5))
00718 tab.AddCol('even', 'bool', itertools.cycle([True, False]))
00719 print tab
00720
00721 '''
00722 will produce the table
00723
00724 ==== ====
00725 x even
00726 ==== ====
00727 0 True
00728 1 False
00729 2 True
00730 3 False
00731 4 True
00732 ==== ====
00733 '''
00734
If data is a constant instead of an iterable object, its value
00736 will be written into each row:
00737
00738 .. code-block:: python
00739
00740 tab = Table(['x'], 'f', x=range(5))
00741 tab.AddCol('num', 'i', 1)
00742 print tab
00743
00744 '''
00745 will produce the table
00746
00747 ==== ====
00748 x num
00749 ==== ====
00750 0 1
00751 1 1
00752 2 1
00753 3 1
00754 4 1
00755 ==== ====
00756 '''
00757
00758 As a special case, if there are no previous rows, and data is not
00759 None, rows are added for every item in data.
00760 """
00761
00762 if col_name in self.col_names:
00763 raise ValueError('Column with name %s already exists'%col_name)
00764
00765 col_type = self._ParseColTypes(col_type, exp_num=1)[0]
00766 self.col_names.append(col_name)
00767 self.col_types.append(col_type)
00768
00769 if len(self.rows)>0:
00770 if IsScalar(data):
00771 for row in self.rows:
00772 row.append(data)
00773 else:
00774 if hasattr(data, '__len__') and len(data)!=len(self.rows):
00775 self.col_names.pop()
00776 self.col_types.pop()
00777 raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
00778 'existing rows (%i)'%len(self.rows))
00779 for row, d in zip(self.rows, data):
00780 row.append(d)
00781
00782 elif data!=None and len(self.col_names)==1:
00783 if IsScalar(data):
00784 self.AddRow({col_name : data})
00785 else:
00786 for v in data:
00787 self.AddRow({col_name : v})
00788
00789 def Filter(self, *args, **kwargs):
00790 """
00791 Returns a filtered table only containing rows matching all the predicates
in kwargs and args. For example,
00793
00794 .. code-block:: python
00795
00796 tab.Filter(town='Basel')
00797
00798 will return all the rows where the value of the column "town" is equal to
00799 "Basel". Several predicates may be combined, i.e.
00800
00801 .. code-block:: python
00802
00803 tab.Filter(town='Basel', male=True)
00804
00805 will return the rows with "town" equal to "Basel" and "male" equal to true.
00806 args are unary callables returning true if the row should be included in the
00807 result and false if not.
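
    For illustration, a callable predicate can be combined with keyword
    predicates as follows (column names are arbitrary):

    .. code-block:: python

      # rows where the value of 'age' is above 18 and 'town' equals 'Basel';
      # the callable receives the raw row list, so look up the index first
      age_idx = tab.GetColIndex('age')
      filt_tab = tab.Filter(lambda row: row[age_idx]>18, town='Basel')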
00808 """
00809 filt_tab=Table(list(self.col_names), list(self.col_types))
00810 for row in self.rows:
00811 matches=True
00812 for func in args:
00813 if not func(row):
00814 matches=False
00815 break
00816 for key, val in kwargs.iteritems():
00817 if row[self.GetColIndex(key)]!=val:
00818 matches=False
00819 break
00820 if matches:
00821 filt_tab.AddRow(row)
00822 return filt_tab
00823
00824
00825 def Select(self, query):
00826
00827 """
00828 Returns a new table object containing all rows matching a logical query expression.
00829
*query* is a string containing the logical expression that will be evaluated
for every row.
00832
00833 Operands have to be the name of a column or an expression that can be parsed to
00834 float, int, bool or string.
00835 Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, *, /
00836
00837 .. code-block:: python
00838
00839 subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
00840
The selection query should be self-explanatory. Allowed parentheses are (), [] and {};
mismatched parentheses are detected. Expressions like '3<=col_a>=col_b'
throw an error, since the evaluation order cannot be determined.
00844
00845 There are two special expressions:
00846
00847 .. code-block:: python
00848
00849 #selects rows, where 1.0<=col_a<=1.5
00850 subtab = tab.Select('col_a=1.0:1.5')
00851
00852 #selects rows, where col_a=1 or col_a=2 or col_a=3
00853 subtab = tab.Select('col_a=1,2,3')
00854
00855 Only consistent types can be compared. If col_a is of type string and col_b is of type int,
the following expression would throw an error: 'col_a<col_b'
00857
00858 """
00859
00860 try:
00861 from table_selector import TableSelector
00862 except:
00863 raise ImportError("Tried to import from the file table_selector.py, but could not find it!")
00864
00865 selector=TableSelector(self.col_types, self.col_names, query)
00866
00867 selected_tab=Table(list(self.col_names), list(self.col_types))
00868
00869 for row in self.rows:
00870 if selector.EvaluateRow(row):
00871 selected_tab.AddRow(row)
00872
00873 return selected_tab
00874
00875
00876 @staticmethod
00877 def _LoadOST(stream_or_filename):
00878 fieldname_pattern=re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
00879 values_pattern=re.compile("([^\" ]+|\"[^\"]*\")+")
00880 if not hasattr(stream_or_filename, 'read'):
00881 stream=open(stream_or_filename, 'r')
00882 else:
00883 stream=stream_or_filename
00884 header=False
00885 num_lines=0
00886 for line in stream:
00887 line=line.strip()
00888 if line.startswith('#'):
00889 continue
00890 if len(line)==0:
00891 continue
00892 num_lines+=1
00893 if not header:
00894 fieldnames=[]
00895 fieldtypes=[]
00896 for col in line.split():
00897 match=fieldname_pattern.match(col)
00898 if match:
00899 if match.group('type'):
00900 fieldtypes.append(match.group('type'))
00901 else:
00902 fieldtypes.append('string')
00903 fieldnames.append(match.group('name'))
00904 tab=Table(fieldnames, fieldtypes)
00905 header=True
00906 continue
00907 tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
00908 if num_lines==0:
00909 raise IOError("Cannot read table from empty stream")
00910 return tab
00911
00912 def _GuessColumnTypes(self):
00913 for col_idx in range(len(self.col_names)):
00914 self.col_types[col_idx]=GuessColumnType(self[self.col_names[col_idx]])
00915 for row in self.rows:
00916 for idx in range(len(row)):
00917 row[idx]=self._Coerce(row[idx], self.col_types[idx])
00918
00919 @staticmethod
00920 def _LoadCSV(stream_or_filename, sep):
00921 if not hasattr(stream_or_filename, 'read'):
00922 stream=open(stream_or_filename, 'r')
00923 else:
00924 stream=stream_or_filename
00925 reader=csv.reader(stream, delimiter=sep)
00926 first=True
00927 for row in reader:
00928 if first:
00929 header=row
00930 types='s'*len(row)
00931 tab=Table(header, types)
00932 first=False
00933 else:
00934 tab.AddRow(row)
00935 if first:
00936 raise IOError('trying to load table from empty CSV stream/file')
00937
00938 tab._GuessColumnTypes()
00939 return tab
00940
00941 @staticmethod
00942 def _LoadPickle(stream_or_filename):
00943 if not hasattr(stream_or_filename, 'read'):
00944 stream=open(stream_or_filename, 'rb')
00945 else:
00946 stream=stream_or_filename
00947 return cPickle.load(stream)
00948
00949 @staticmethod
00950 def _GuessFormat(filename):
00951 try:
00952 filename = filename.name
00953 except AttributeError, e:
00954 pass
00955 if filename.endswith('.csv'):
00956 return 'csv'
00957 elif filename.endswith('.pickle'):
00958 return 'pickle'
00959 else:
00960 return 'ost'
00961
00962
00963 @staticmethod
00964 def Load(stream_or_filename, format='auto', sep=','):
00965 """
00966 Load table from stream or file with given name.
00967
00968 By default, the file format is set to *auto*, which tries to guess the file
00969 format from the file extension. The following file extensions are
00970 recognized:
00971
00972 ============ ======================
00973 extension recognized format
00974 ============ ======================
00975 .csv comma separated values
00976 .pickle pickled byte stream
00977 <all others> ost-specific format
00978 ============ ======================
00979
Thus, *format* must be specified when reading files with other filename
00981 extensions.
00982
00983 The following file formats are understood:
00984
00985 - ost
00986
00987 This is an ost-specific, but still human readable file format. The file
(stream) must start with a header line of the form
00989
00990 col_name1[type1] <col_name2[type2]>...
00991
00992 The types given in brackets must be one of the data types the
:class:`Table` class understands. Each following line in the file must then
contain exactly the same number of data items as listed in the header. The
00995 data items are automatically converted to the column format. Lines starting
00996 with a '#' and empty lines are ignored.
00997
00998 - pickle
00999
01000 Deserializes the table from a pickled byte stream.
01001
01002 - csv
01003
Reads the table from a comma separated values stream. Since there is no
01005 explicit type information in the csv file, the column types are guessed,
01006 using the following simple rules:
01007
01008 * if all values are either NA/NULL/NONE the type is set to string.
01009 * if all non-null values are convertible to float/int the type is set to
01010 float/int.
* if all non-null values are true/false/yes/no, the type is set to bool.
01012 * for all other cases, the column type is set to string.
01013
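    For example, to load a comma separated file whose extension is not
    recognized automatically (the filename is just an illustration):

    .. code-block:: python

      tab = Table.Load('data.txt', format='csv', sep=',')
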
01014 :returns: A new :class:`Table` instance
01015 """
01016 format=format.lower()
01017 if format=='auto':
01018 format = Table._GuessFormat(stream_or_filename)
01019
01020 if format=='ost':
01021 return Table._LoadOST(stream_or_filename)
01022 if format=='csv':
01023 return Table._LoadCSV(stream_or_filename, sep=sep)
01024 if format=='pickle':
01025 return Table._LoadPickle(stream_or_filename)
raise ValueError('unknown format "%s"' % format)
01027
01028 def Sort(self, by, order='+'):
01029 """
01030 Performs an in-place sort of the table, based on column *by*.
01031
01032 :param by: column name by which to sort
01033 :type by: :class:`str`
01034
01035 :param order: ascending (``-``) or descending (``+``) order
01036 :type order: :class:`str` (i.e. *+*, *-*)
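
    For example, assuming a column named 'x':

    .. code-block:: python

      tab.Sort('x', order='-')   # sort the table by the values in column 'x'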
01037 """
01038 sign=-1
01039 if order=='-':
01040 sign=1
01041 key_index=self.GetColIndex(by)
01042 def _key_cmp(lhs, rhs):
01043 return sign*cmp(lhs[key_index], rhs[key_index])
01044 self.rows=sorted(self.rows, _key_cmp)
01045
01046 def GetUnique(self, col, ignore_nan=True):
01047 """
01048 Extract a list of all unique values from one column.
01049
01050 :param col: column name
01051 :type col: :class:`str`
01052
01053 :param ignore_nan: ignore all *None* values
01054 :type ignore_nan: :class:`bool`
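
    For illustration (column name and values are arbitrary):

    .. code-block:: python

      tab = Table(['town'], 's', town=['Basel', 'Zurich', 'Basel', None])
      print tab.GetUnique('town')                    # ['Basel', 'Zurich']
      print tab.GetUnique('town', ignore_nan=False)  # ['Basel', 'Zurich', None]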
01055 """
01056 idx = self.GetColIndex(col)
01057 seen = {}
01058 result = []
01059 for row in self.rows:
01060 item = row[idx]
01061 if item!=None or ignore_nan==False:
01062 if item in seen: continue
01063 seen[item] = 1
01064 result.append(item)
01065 return result
01066
01067 def Zip(self, *args):
01068 """
Allows you to conveniently iterate over a selection of columns, e.g.
01070
01071 .. code-block:: python
01072
01073 tab = Table.Load('...')
01074 for col1, col2 in tab.Zip('col1', 'col2'):
01075 print col1, col2
01076
01077 is a shortcut for
01078
01079 .. code-block:: python
01080
01081 tab = Table.Load('...')
01082 for col1, col2 in zip(tab['col1'], tab['col2']):
01083 print col1, col2
01084 """
01085 return zip(*[self[arg] for arg in args])
01086
01087 def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
01088 z_title=None, x_range=None, y_range=None, z_range=None,
01089 color=None, plot_if=None, legend=None,
01090 num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
01091 labels=None, max_num_labels=None, title=None, clear=True, save=False,
01092 **kwargs):
01093 """
01094 Function to plot values from your table in 1, 2 or 3 dimensions using
01095 `Matplotlib <http://matplotlib.sourceforge.net>`__
01096
01097 :param x: column name for first dimension
01098 :type x: :class:`str`
01099
01100 :param y: column name for second dimension
01101 :type y: :class:`str`
01102
01103 :param z: column name for third dimension
01104 :type z: :class:`str`
01105
01106 :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
01107 complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
01108 :type style: :class:`str`
01109
01110 :param x_title: title for first dimension, if not specified it is
01111 automatically derived from column name
01112 :type x_title: :class:`str`
01113
01114 :param y_title: title for second dimension, if not specified it is
01115 automatically derived from column name
01116 :type y_title: :class:`str`
01117
01118 :param z_title: title for third dimension, if not specified it is
01119 automatically derived from column name
01120 :type z_title: :class:`str`
01121
01122 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01123 :type x_range: :class:`list` of length two
01124
01125 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01126 :type y_range: :class:`list` of length two
01127
01128 :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
01129 :type z_range: :class:`list` of length two
01130
01131 :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
01132 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
01133 :type color: :class:`str`
01134
:param plot_if: callable which returns *True* if the row should be plotted. It is
01136 invoked like ``plot_if(self, row)``
01137 :type plot_if: callable
01138
01139 :param legend: legend label for data series
01140 :type legend: :class:`str`
01141
01142 :param num_z_levels: number of levels for third dimension
01143 :type num_z_levels: :class:`int`
01144
01145 :param diag_line: draw diagonal line
01146 :type diag_line: :class:`bool`
01147
01148 :param labels: column name containing labels to put on x-axis for one
01149 dimensional plot
01150 :type labels: :class:`str`
01151
01152 :param max_num_labels: limit maximum number of labels
01153 :type max_num_labels: :class:`int`
01154
01155 :param title: plot title, if not specified it is automatically derived from
01156 plotted column names
01157 :type title: :class:`str`
01158
01159 :param clear: clear old data from plot
01160 :type clear: :class:`bool`
01161
01162 :param save: filename for saving plot
01163 :type save: :class:`str`
01164
01165 :param z_contour: draw contour lines
01166 :type z_contour: :class:`bool`
01167
01168 :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
01169 'linear')
01170 :type z_interpol: :class:`str`
01171
01172 :param \*\*kwargs: additional arguments passed to matplotlib
01173
01174 :returns: the ``matplotlib.pyplot`` module
01175
01176 **Examples:** simple plotting functions
01177
01178 .. code-block:: python
01179
01180 tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
01181 b=[x/2.0 for x in range(1,6)],
01182 c=[math.cos(x) for x in range(0,5)],
01183 d=range(3,8))
01184
01185 # one dimensional plot of column 'd' vs. index
01186 plt = tab.Plot('d')
01187 plt.show()
01188
01189 # two dimensional plot of 'a' vs. 'c'
01190 plt = tab.Plot('a', y='c', style='o-')
01191 plt.show()
01192
01193 # three dimensional plot of 'a' vs. 'c' with values 'b'
01194 plt = tab.Plot('a', y='c', z='b')
01195 # manually save plot to file
01196 plt.savefig("plot.png")
01197 """
01198 try:
01199 import matplotlib.pyplot as plt
01200 import matplotlib.mlab as mlab
01201 import numpy as np
01202 idx1 = self.GetColIndex(x)
01203 xs = []
01204 ys = []
01205 zs = []
01206
01207 if clear:
01208 plt.figure(figsize=[8, 6])
01209
01210 if x_title!=None:
01211 nice_x=x_title
01212 else:
01213 nice_x=MakeTitle(x)
01214
01215 if y_title!=None:
01216 nice_y=y_title
01217 else:
01218 if y:
01219 nice_y=MakeTitle(y)
01220 else:
01221 nice_y=None
01222
01223 if z_title!=None:
01224 nice_z = z_title
01225 else:
01226 if z:
01227 nice_z = MakeTitle(z)
01228 else:
01229 nice_z = None
01230
01231 if x_range and (IsScalar(x_range) or len(x_range)!=2):
01232 raise ValueError('parameter x_range must contain exactly two elements')
01233 if y_range and (IsScalar(y_range) or len(y_range)!=2):
01234 raise ValueError('parameter y_range must contain exactly two elements')
01235 if z_range and (IsScalar(z_range) or len(z_range)!=2):
01236 raise ValueError('parameter z_range must contain exactly two elements')
01237
01238 if color:
01239 kwargs['color']=color
01240 if legend:
01241 kwargs['label']=legend
01242 if y and z:
01243 idx3 = self.GetColIndex(z)
01244 idx2 = self.GetColIndex(y)
01245 for row in self.rows:
01246 if row[idx1]!=None and row[idx2]!=None and row[idx3]!=None:
01247 if plot_if and not plot_if(self, row):
01248 continue
01249 xs.append(row[idx1])
01250 ys.append(row[idx2])
01251 zs.append(row[idx3])
01252 levels = []
01253 if z_range:
01254 z_spacing = (z_range[1] - z_range[0]) / num_z_levels
01255 l = z_range[0]
01256 else:
01257 l = self.Min(z)
01258 z_spacing = (self.Max(z) - l) / num_z_levels
01259
01260 for i in range(0,num_z_levels+1):
01261 levels.append(l)
01262 l += z_spacing
01263
01264 xi = np.linspace(min(xs),max(xs),len(xs)*10)
01265 yi = np.linspace(min(ys),max(ys),len(ys)*10)
01266 zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
01267
01268 if z_contour:
01269 plt.contour(xi,yi,zi,levels,linewidths=0.5,colors='k')
01270
01271 plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
01272 plt.colorbar(ticks=levels)
01273
01274 elif y:
01275 idx2=self.GetColIndex(y)
01276 for row in self.rows:
01277 if row[idx1]!=None and row[idx2]!=None:
01278 if plot_if and not plot_if(self, row):
01279 continue
01280 xs.append(row[idx1])
01281 ys.append(row[idx2])
01282 plt.plot(xs, ys, style, **kwargs)
01283
01284 else:
01285 label_vals=[]
01286
01287 if labels:
01288 label_idx=self.GetColIndex(labels)
01289 for row in self.rows:
01290 if row[idx1]!=None:
01291 if plot_if and not plot_if(self, row):
01292 continue
01293 xs.append(row[idx1])
01294 if labels:
01295 label_vals.append(row[label_idx])
01296 plt.plot(xs, style, **kwargs)
01297 if labels:
01298 interval = 1
01299 if max_num_labels:
01300 if len(label_vals)>max_num_labels:
01301 interval = int(math.ceil(float(len(label_vals))/max_num_labels))
01302 label_vals = label_vals[::interval]
01303 plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
01304 size='x-small')
01305
01306 if title==None:
01307 if nice_z:
01308 title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
01309 elif nice_y:
01310 title = '%s vs. %s' % (nice_x, nice_y)
01311 else:
01312 title = nice_x
01313
01314 plt.title(title, size='x-large', fontweight='bold',
01315 verticalalignment='bottom')
01316
01317 if legend:
01318 plt.legend(loc=0)
01319
01320 if x and y:
01321 plt.xlabel(nice_x, size='x-large')
01322 if x_range:
01323 plt.xlim(x_range[0], x_range[1])
01324 if y_range:
01325 plt.ylim(y_range[0], y_range[1])
01326 if diag_line:
01327 plt.plot(x_range, y_range, '-', color='black')
01328
01329 plt.ylabel(nice_y, size='x-large')
01330 else:
01331 if y_range:
01332 plt.ylim(y_range[0], y_range[1])
01333 if x_title:
01334 plt.xlabel(x_title, size='x-large')
01335 plt.ylabel(nice_y, size='x-large')
01336 if save:
01337 plt.savefig(save)
01338 return plt
01339 except ImportError:
01340 LogError("Function needs numpy and matplotlib, but I could not import it.")
01341 raise
01342
01343 def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
01344 histtype='stepfilled', align='mid', x_title=None,
01345 y_title=None, title=None, clear=True, save=False,
01346 color=None, y_range=None):
01347 """
01348 Create a histogram of the data in col for the range *x_range*, split into
01349 *num_bins* bins and plot it using Matplotlib.
01350
01351 :param col: column name with data
01352 :type col: :class:`str`
01353
01354 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01355 :type x_range: :class:`list` of length two
01356
01357 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01358 :type y_range: :class:`list` of length two
01359
01360 :param num_bins: number of bins in range
01361 :type num_bins: :class:`int`
01362
01363 :param color: Color to be used for the histogram. If not set, color will be
01364 determined by matplotlib
01365 :type color: :class:`str`
01366
01367 :param normed: normalize histogram
01368 :type normed: :class:`bool`
01369
01370 :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
01371 *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
01372 :type histtype: :class:`str`
01373
01374 :param align: style of histogram (*left*, *mid*, *right*). See
01375 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
01376 :type align: :class:`str`
01377
01378 :param x_title: title for first dimension, if not specified it is
01379 automatically derived from column name
01380 :type x_title: :class:`str`
01381
01382 :param y_title: title for second dimension, if not specified it is
01383 automatically derived from column name
01384 :type y_title: :class:`str`
01385
01386 :param title: plot title, if not specified it is automatically derived from
01387 plotted column names
01388 :type title: :class:`str`
01389
01390 :param clear: clear old data from plot
01391 :type clear: :class:`bool`
01392
01393 :param save: filename for saving plot
01394 :type save: :class:`str`
01395
01396 **Examples:** simple plotting functions
01397
01398 .. code-block:: python
01399
01400 tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
01401
01402 # one dimensional plot of column 'd' vs. index
01403 plt = tab.PlotHistogram('a')
01404 plt.show()
01405
01406 """
01407 try:
01408 import matplotlib.pyplot as plt
01409 import numpy as np
01410
01411 if len(self.rows)==0:
01412 return None
01413 kwargs={}
01414 if color:
01415 kwargs['color']=color
01416 idx = self.GetColIndex(col)
01417 data = []
01418 for r in self.rows:
01419 if r[idx]!=None:
01420 data.append(r[idx])
01421
01422 if clear:
01423 plt.clf()
01424
01425 n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
01426 normed=normed, histtype=histtype, align=align,
01427 **kwargs)
01428
01429 if x_title!=None:
01430 nice_x=x_title
01431 else:
01432 nice_x=MakeTitle(col)
01433 plt.xlabel(nice_x, size='x-large')
01434 if y_range:
01435 plt.ylim(y_range)
01436 if y_title!=None:
01437 nice_y=y_title
01438 else:
01439 nice_y="bin count"
01440 plt.ylabel(nice_y, size='x-large')
01441
01442 if title!=None:
01443 nice_title=title
01444 else:
01445 nice_title="Histogram of %s"%nice_x
01446 plt.title(nice_title, size='x-large', fontweight='bold')
01447
01448 if save:
01449 plt.savefig(save)
01450 return plt
01451 except ImportError:
01452 LogError("Function needs numpy and matplotlib, but I could not import it.")
01453 raise
01454
01455 def _Max(self, col):
01456 if len(self.rows)==0:
01457 return None, None
01458 idx = self.GetColIndex(col)
01459 col_type = self.col_types[idx]
01460 if col_type=='int' or col_type=='float':
01461 max_val = -float('inf')
01462 elif col_type=='bool':
01463 max_val = False
01464 elif col_type=='string':
01465 max_val = chr(0)
01466 max_idx = None
01467 for i in range(0, len(self.rows)):
01468 if self.rows[i][idx]>max_val:
01469 max_val = self.rows[i][idx]
01470 max_idx = i
01471 return max_val, max_idx
01472
01473 def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None,
01474 colors=None, width=0.8, bottom=0, legend=False, legend_names=None, show=False, save=False):
01475
01476 """
Create a barplot of the data in cols. Every column will be represented
at one position. If there are several rows, the values of each column
will be grouped together at that position.
01480
01481 :param cols: List of column names. Every column will be represented as a
01482 single bar. If cols is None, every column of the table gets
01483 plotted.
01484 :type cols: :class:`list`
01485
01486 :param rows: List of row indices. Values from given rows will be plotted
01487 in parallel at one column position. If set to None, all rows
of the table will be plotted. Note that the maximum number
01489 of rows is 7.
01490 :type rows: :class:`list`
01491
01492 :param xlabels: Label for every col on x-axis. If set to None, the column
names are used. The xlabel plotting can be suppressed with
the parameter set_xlabels.
01495 :type xlabels: :class:`list`
01496
01497 :param set_xlabels: Controls whether xlabels are plotted or not.
01498 :type set_xlabels: :class:`bool`
01499
:param xlabels_rotation: Can either be 'horizontal', 'vertical' or an
integer describing the rotation in degrees.
01502
01503 :param y_title: Y-axis description
01504 :type y_title: :class:`str`
01505
:param title: Title of the plot. No title appears if set to None
01507 :type title: :class:`str`
01508
01509 :param colors: Colors of the different bars in each group. Must be a list
of valid colors in matplotlib. Length of colors and rows must
01511 be consistent.
01512 :type colors: :class:`list`
01513
01514 :param width: The available space for the groups on the x-axis is divided
by the exact number of groups. The parameter width is the
01516 fraction of what is actually used. If it would be 1.0 the
01517 bars of the different groups would touch each other.
01518 Value must be between [0;1]
01519 :type width: :class:`float`
01520
01521 :param bottom: Bottom
01522 :type bottom: :class:`float`
01523
:param legend: If set to True, a legend explaining which color corresponds
to which row is added; legend_names must then be provided.
01526 :type legend: :class:`bool`
01527
01528 :param legend_names: List of names, that describe the differently colored
01529 bars. Length must be consistent with number of rows.
01530
01531 :param show: If set to True, the plot is directly displayed.
01532
01533 :param save: If set, a png image with name save in the current working
01534 directory will be saved.
01535 :type save: :class:`str`
01536
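    **Example:** a minimal sketch (column names and values are arbitrary)

    .. code-block:: python

      tab = Table(['a','b','c'], 'fff', a=[1.0, 2.0], b=[0.5, 1.5], c=[2.0, 1.0])
      plt = tab.PlotBar(cols=['a','b','c'], legend=True,
                        legend_names=['row 0', 'row 1'])
      plt.show()
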
01537 """
01538 try:
01539 import numpy as np
01540 import matplotlib.pyplot as plt
01541 except:
raise ImportError('PlotBar relies on numpy and matplotlib, but I could ' \
'not import it!')
01544
01545 standard_colors=['b','g','y','c','m','r','k']
01546 data=[]
01547
01548 if cols==None:
01549 cols=self.col_names
01550
01551 if width<=0 or width>1:
01552 raise ValueError('Width must be in [0;1]')
01553
01554 if rows==None:
01555 if len(self.rows)>7:
01556 raise ValueError('Table contains too many rows to represent them at one '\
01557 'bar position in parallel. You can Select a Subtable or '\
01558 'specify the parameter rows with a list of row indices '\
01559 '(max 7)')
01560 else:
01561 rows=range(len(self.rows))
01562 else:
01563 if not isinstance(rows,list):
01564 rows=[rows]
01565 if len(rows)>7:
01566 raise ValueError('Too many rows to represent (max 7). Please note, that '\
01567 'data from multiple rows from one column gets '\
01568 'represented at one position in parallel.')
01569
01570 for r_idx in rows:
01571 row=self.rows[r_idx]
01572 temp=list()
01573 for c in cols:
01574 try:
01575 c_idx=self.GetColIndex(c)
01576 except:
01577 raise ValueError('Cannot find column with name '+str(c))
01578 temp.append(row[c_idx])
01579 data.append(temp)
01580
01581 if colors==None:
01582 colors=standard_colors[:len(rows)]
01583
01584 if len(rows)!=len(colors):
01585 raise ValueError("Number of rows and number of colors must be consistent!")
01586
01587 ind=np.arange(len(data[0]))
01588 single_bar_width=float(width)/len(data)
01589
01590 fig=plt.figure()
01591 ax=fig.add_subplot(111)
01592 legend_data=[]
01593
01594 for i in range(len(data)):
01595 legend_data.append(ax.bar(ind+i*single_bar_width+(1-width)/2,data[i],single_bar_width,bottom=bottom,color=colors[i])[0])
01596
01597 if title!=None:
01598 ax.set_title(title, size='x-large', fontweight='bold')
01599
01600 if y_title!=None:
01601 nice_y=y_title
01602 else:
01603 nice_y="value"
01604 ax.set_ylabel(nice_y)
01605
01606 if xlabels:
01607 if len(data[0])!=len(xlabels):
01608 raise ValueError('Number of xlabels is not consistent with number of cols!')
01609 else:
01610 xlabels=cols
01611
01612 if set_xlabels:
01613 ax.set_xticks(ind+0.5)
01614 ax.set_xticklabels(xlabels, rotation = xlabels_rotation)
01615 else:
01616 ax.set_xticks([])
01617
01618 if legend == True:
01619 if legend_names==None:
01620 raise ValueError('You must provide legend names! e.g. names for the rows, '\
01621 'that are printed in parallel.')
01622 if len(legend_names)!=len(data):
01623 raise ValueError('length of legend_names must be consistent with number '\
01624 'of plotted rows!')
01625 ax.legend(legend_data, legend_names)
01626
01627 if save:
01628 plt.savefig(save)
01629
01630 if show:
01631 plt.show()
01632
01633 return plt
01634
01635 def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
01636 colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False):
01637
01638 """
01639 Create a heatplot of the data in col x vs the data in col y using matplotlib
01640
01641 :param x: column name with x data
01642 :type x: :class:`str`
01643
01644 :param y: column name with y data
01645 :type y: :class:`str`
01646
01647 :param title: title of the plot, will be generated automatically if set to None
01648 :type title: :class:`str`
01649
01650 :param x_title: label of x-axis, will be generated automatically if set to None
:type x_title: :class:`str`
01652
01653 :param y_title: label of y-axis, will be generated automatically if set to None
:type y_title: :class:`str`
01655
01656 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01657 :type x_range: :class:`list` of length two
01658
01659 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01660 :type y_range: :class:`list` of length two
01661
01662 :param binning: type of binning. If set to None, the value of a hexbin will
01663 correspond to the number of datapoints falling into it. If
01664 set to 'log', the value will be the log with base 10 of the above
01665 value (log(i+1)). If an integer is provided, the number of a
hexbin is equal to the number of datapoints falling into it divided
01667 by the integer. If a list of values is provided, these values
01668 will be the lower bounds of the bins.
01669
:param colormap: colormap that will be used. Can be any colormap defined
in matplotlib or a custom colormap. You can either pass a
string with the name of the matplotlib colormap or a colormap
object.
01674
01675 :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
01676 :type show_scalebar: :class:`bool`
01677
01678 :param scalebar_label: Label of the scalebar
01679 :type scalebar_label: :class:`str`
01680
01681 :param clear: clear old data from plot
01682 :type clear: :class:`bool`
01683
01684 :param save: filename for saving plot
01685 :type save: :class:`str`
01686
01687 :param show: directly show plot
01688 :type show: :class:`bool`
01689
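    **Example:** a minimal sketch (column names and values are arbitrary)

    .. code-block:: python

      tab = Table(['d1','d2'], 'ff',
                  d1=[math.cos(x*0.01) for x in range(1000)],
                  d2=[math.sin(x*0.01) for x in range(1000)])
      plt = tab.PlotHexbin('d1', 'd2', binning='log', colormap='jet')
      plt.show()
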
01690 """
01691
01692 try:
01693 import matplotlib.pyplot as plt
01694 import matplotlib.cm as cm
01695 except:
01696 raise ImportError('PlotHexbin relies on matplotlib, but I could not import it')
01697
01698 idx=self.GetColIndex(x)
01699 idy=self.GetColIndex(y)
01700 xdata=[]
01701 ydata=[]
01702
01703 for r in self.rows:
01704 if r[idx]!=None and r[idy]!=None:
01705 xdata.append(r[idx])
01706 ydata.append(r[idy])
01707
01708 if clear:
01709 plt.clf()
01710
01711 if x_title!=None:
01712 nice_x=x_title
01713 else:
01714 nice_x=MakeTitle(x)
01715
01716 if y_title!=None:
01717 nice_y=y_title
01718 else:
01719 nice_y=MakeTitle(y)
01720
01721 if title==None:
01722 title = '%s vs. %s' % (nice_x, nice_y)
01723
01724 if IsStringLike(colormap):
01725 colormap=getattr(cm, colormap)
01726
01727 if x_range and (IsScalar(x_range) or len(x_range)!=2):
01728 raise ValueError('parameter x_range must contain exactly two elements')
01729 if y_range and (IsScalar(y_range) or len(y_range)!=2):
01730 raise ValueError('parameter y_range must contain exactly two elements')
01731
01732 ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
01733
01734 if x_range:
01735 plt.xlim((x_range[0], x_range[1]))
01736 ext[0]=x_range[0]
01737 ext[1]=x_range[1]
01738 if y_range:
01739 plt.ylim(y_range[0], y_range[1])
01740 ext[2]=y_range[0]
01741 ext[3]=y_range[1]
01742
01743
01744 plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
01745
01746 plt.title(title, size='x-large', fontweight='bold',
01747 verticalalignment='bottom')
01748
01749 plt.xlabel(nice_x)
01750 plt.ylabel(nice_y)
01751
01752 if show_scalebar:
01753 cb=plt.colorbar()
01754 if scalebar_label:
01755 cb.set_label(scalebar_label)
01756
01757 if save:
01758 plt.savefig(save)
01759
01760 if show:
01761 plt.show()
01762
01763 return plt
01764
01765 def MaxRow(self, col):
01766 """
01767 Returns the row containing the cell with the maximal value in col. If
01768 several rows have the highest value, only the first one is returned.
01769 ''None'' values are ignored.
01770
01771 :param col: column name
01772 :type col: :class:`str`
01773
01774 :returns: row with maximal col value or None if the table is empty
01775 """
01776 val, idx = self._Max(col)
01777 if idx!=None:
01778 return self.rows[idx]
01779
01780 def Max(self, col):
01781 """
01782 Returns the maximum value in col. If several rows have the highest value,
01783 only the first one is returned. ''None'' values are ignored.
01784
01785 :param col: column name
01786 :type col: :class:`str`
01787 """
01788 val, idx = self._Max(col)
01789 return val
01790
01791 def MaxIdx(self, col):
01792 """
01793 Returns the row index of the cell with the maximal value in col. If
01794 several rows have the highest value, only the first one is returned.
01795 ''None'' values are ignored.
01796
01797 :param col: column name
01798 :type col: :class:`str`
01799 """
01800 val, idx = self._Max(col)
01801 return idx
01802
01803 def _Min(self, col):
01804 if len(self.rows)==0:
01805 return None, None
01806 idx=self.GetColIndex(col)
01807 col_type = self.col_types[idx]
01808 if col_type=='int' or col_type=='float':
01809 min_val=float('inf')
01810 elif col_type=='bool':
01811 min_val=True
01812 elif col_type=='string':
01813 min_val=chr(255)
01814 min_idx=None
01815 for i,row in enumerate(self.rows):
01816 if row[idx]!=None and row[idx]<min_val:
01817 min_val=row[idx]
01818 min_idx=i
01819 return min_val, min_idx
01820
01821 def Min(self, col):
01822 """
01823 Returns the minimal value in col. If several rows have the lowest value,
01824 only the first one is returned. ''None'' values are ignored.
01825
01826 :param col: column name
01827 :type col: :class:`str`
01828 """
01829 val, idx = self._Min(col)
01830 return val
01831
01832 def MinRow(self, col):
01833 """
01834 Returns the row containing the cell with the minimal value in col. If
01835 several rows have the lowest value, only the first one is returned.
01836 ''None'' values are ignored.
01837
01838 :param col: column name
01839 :type col: :class:`str`
01840
01841 :returns: row with minimal col value or None if the table is empty
01842 """
01843 val, idx = self._Min(col)
01844 if idx!=None:
01845 return self.rows[idx]
01846
01847 def MinIdx(self, col):
01848 """
01849 Returns the row index of the cell with the minimal value in col. If
01850 several rows have the lowest value, only the first one is returned.
01851 ''None'' values are ignored.
01852
01853 :param col: column name
01854 :type col: :class:`str`
01855 """
01856 val, idx = self._Min(col)
01857 return idx
01858
01859 def Sum(self, col):
01860 """
01861 Returns the sum of the given column. Cells with ''None'' are ignored. Returns
01862 0.0, if the column doesn't contain any elements. Col must be of numeric
01863 column type ('float', 'int') or boolean column type.
01864
01865 :param col: column name
01866 :type col: :class:`str`
01867
01868 :raises: :class:`TypeError` if column type is ``string``
01869 """
01870 idx = self.GetColIndex(col)
01871 col_type = self.col_types[idx]
01872 if col_type!='int' and col_type!='float' and col_type!='bool':
01873 raise TypeError("Sum can only be used on numeric column types")
01874 s = 0.0
01875 for r in self.rows:
01876 if r[idx]!=None:
01877 s += r[idx]
01878 return s
01879
01880 def Mean(self, col):
01881 """
01882 Returns the mean of the given column. Cells with ''None'' are ignored. Returns
01883 None, if the column doesn't contain any elements. Col must be of numeric
01884 ('float', 'int') or boolean column type.
01885
If the column type is *bool*, the function returns the ratio of the number
of *True* values to the total number of elements.
01888
01889 :param col: column name
01890 :type col: :class:`str`
01891
01892 :raises: :class:`TypeError` if column type is ``string``
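
A minimal usage sketch (the table ``tab`` and its columns ``'x'`` and
``'passed'`` are placeholders used for illustration):

.. code-block:: python

  tab.Mean('x')       # arithmetic mean of the numeric column 'x'
  tab.Mean('passed')  # for a bool column: fraction of True values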
01893 """
01894 idx = self.GetColIndex(col)
01895 col_type = self.col_types[idx]
01896 if col_type!='int' and col_type!='float' and col_type!='bool':
01897 raise TypeError("Mean can only be used on numeric or bool column types")
01898
01899 vals=[]
01900 for v in self[col]:
01901 if v!=None:
01902 vals.append(v)
01903 try:
01904 return stutil.Mean(vals)
01905 except:
01906 return None
01907
01908 def RowMean(self, mean_col_name, cols):
01909 """
01910 Adds a new column of type 'float' with a specified name (*mean_col_name*),
01911 containing the mean of all specified columns for each row.
01912
01913 Cols are specified by their names and must be of numeric column
01914 type ('float', 'int') or boolean column type. Cells with None are ignored.
01915 Adds ''None'' if the row doesn't contain any values.
01916
01917 :param mean_col_name: name of new column containing mean values
01918 :type mean_col_name: :class:`str`
01919
01920 :param cols: name or list of names of columns to include in computation of
01921 mean
01922 :type cols: :class:`str` or :class:`list` of strings
01923
:raises: :class:`TypeError` if column type of columns in *cols* is ``string``
01925
01926 == Example ==
01927
Starting with the following table:
01929
01930 ==== ==== ====
01931 x y u
01932 ==== ==== ====
01933 1 10 100
01934 2 15 None
01935 3 20 400
01936 ==== ==== ====
01937
01938 the code here adds a column with the name 'mean' to yield the table below:
01939
.. code-block:: python
01941
01942 tab.RowMean('mean', ['x', 'u'])
01943
01944
01945 ==== ==== ==== =====
01946 x y u mean
01947 ==== ==== ==== =====
01948 1 10 100 50.5
01949 2 15 None 2
01950 3 20 400 201.5
01951 ==== ==== ==== =====
01952
01953 """
01954
01955 if IsScalar(cols):
01956 cols = [cols]
01957
01958 cols_idxs = []
01959 for col in cols:
01960 idx = self.GetColIndex(col)
01961 col_type = self.col_types[idx]
01962 if col_type!='int' and col_type!='float' and col_type!='bool':
01963 raise TypeError("RowMean can only be used on numeric column types")
01964 cols_idxs.append(idx)
01965
01966 mean_rows = []
01967 for row in self.rows:
01968 vals = []
01969 for idx in cols_idxs:
01970 v = row[idx]
01971 if v!=None:
01972 vals.append(v)
01973 try:
01974 mean = stutil.Mean(vals)
01975 mean_rows.append(mean)
01976 except:
01977 mean_rows.append(None)
01978
01979 self.AddCol(mean_col_name, 'f', mean_rows)
01980
01981 def Percentiles(self, col, nths):
01982 """
01983 Returns the percentiles of column *col* given in *nths*.
01984
01985 The percentiles are calculated as
01986
01987 .. code-block:: python
01988
values[min(len(values)-1, int(round(len(values)*p/100.0+0.5)-1))]
01990
where values are the sorted values of *col* not equal to ''None''.

:param col: column name
:type col: :class:`str`

:param nths: list of percentiles to be calculated. Each percentile is a
  number between 0 and 100.
01994
01995 :raises: :class:`TypeError` if column type is ``string``
01996 :returns: List of percentiles in the same order as given in *nths*
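
A minimal usage sketch (assuming a table ``tab`` with a numeric column
``'x'`` holding the values 1 to 10):

.. code-block:: python

  tab.Percentiles('x', [25, 50, 75])
  # with the formula above this yields [3, 6, 8]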
01997 """
01998 idx = self.GetColIndex(col)
01999 col_type = self.col_types[idx]
02000 if col_type!='int' and col_type!='float' and col_type!='bool':
raise TypeError("Percentiles can only be used on numeric column types")
02002
02003 for nth in nths:
02004 if nth < 0 or nth > 100:
02005 raise ValueError("percentiles must be between 0 and 100")
02006 vals=[]
02007 for v in self[col]:
02008 if v!=None:
02009 vals.append(v)
02010 vals=sorted(vals)
02011 if len(vals)==0:
02012 return [None]*len(nths)
02013 percentiles=[]
02014
02015 for nth in nths:
02016 p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
02017 percentiles.append(p)
02018 return percentiles
02019
02020 def Median(self, col):
02021 """
02022 Returns the median of the given column. Cells with ''None'' are ignored. Returns
02023 ''None'', if the column doesn't contain any elements. Col must be of numeric
02024 column type ('float', 'int') or boolean column type.
02025
02026 :param col: column name
02027 :type col: :class:`str`
02028
02029 :raises: :class:`TypeError` if column type is ``string``
02030 """
02031 idx = self.GetColIndex(col)
02032 col_type = self.col_types[idx]
02033 if col_type!='int' and col_type!='float' and col_type!='bool':
02034 raise TypeError("Median can only be used on numeric column types")
02035
02036 vals=[]
02037 for v in self[col]:
02038 if v!=None:
02039 vals.append(v)
02041 try:
02042 return stutil.Median(vals)
02043 except:
02044 return None
02045
02046 def StdDev(self, col):
02047 """
02048 Returns the standard deviation of the given column. Cells with ''None'' are
02049 ignored. Returns ''None'', if the column doesn't contain any elements. Col must
02050 be of numeric column type ('float', 'int') or boolean column type.
02051
02052 :param col: column name
02053 :type col: :class:`str`
02054
02055 :raises: :class:`TypeError` if column type is ``string``
02056 """
02057 idx = self.GetColIndex(col)
02058 col_type = self.col_types[idx]
02059 if col_type!='int' and col_type!='float' and col_type!='bool':
02060 raise TypeError("StdDev can only be used on numeric column types")
02061
02062 vals=[]
02063 for v in self[col]:
02064 if v!=None:
02065 vals.append(v)
02066 try:
02067 return stutil.StdDev(vals)
02068 except:
02069 return None
02070
02071 def Count(self, col, ignore_nan=True):
02072 """
Count the number of cells in column *col* that are not equal to ''None''.
02074
02075 :param col: column name
02076 :type col: :class:`str`
02077
02078 :param ignore_nan: ignore all *None* values
02079 :type ignore_nan: :class:`bool`
02080 """
02081 count=0
02082 idx=self.GetColIndex(col)
02083 for r in self.rows:
02084 if ignore_nan:
02085 if r[idx]!=None:
02086 count+=1
02087 else:
02088 count+=1
02089 return count
02090
02091 def Correl(self, col1, col2):
02092 """
02093 Calculate the Pearson correlation coefficient between *col1* and *col2*, only
02094 taking rows into account where both of the values are not equal to *None*.
02095 If there are not enough data points to calculate a correlation coefficient,
02096 *None* is returned.
02097
02098 :param col1: column name for first column
02099 :type col1: :class:`str`
02100
02101 :param col2: column name for second column
02102 :type col2: :class:`str`
02103 """
02104 if IsStringLike(col1) and IsStringLike(col2):
02105 col1 = self.GetColIndex(col1)
02106 col2 = self.GetColIndex(col2)
02107 vals1, vals2=([],[])
02108 for v1, v2 in zip(self[col1], self[col2]):
02109 if v1!=None and v2!=None:
02110 vals1.append(v1)
02111 vals2.append(v2)
02112 try:
02113 return stutil.Correl(vals1, vals2)
02114 except:
02115 return None
02116
02117 def SpearmanCorrel(self, col1, col2):
02118 """
02119 Calculate the Spearman correlation coefficient between col1 and col2, only
02120 taking rows into account where both of the values are not equal to None. If
02121 there are not enough data points to calculate a correlation coefficient,
02122 None is returned.
02123
02124 :warning: The function depends on the following module: *scipy.stats.mstats*
02125
02126 :param col1: column name for first column
02127 :type col1: :class:`str`
02128
02129 :param col2: column name for second column
02130 :type col2: :class:`str`
02131 """
02132 try:
02133 import scipy.stats.mstats
02134
02135 if IsStringLike(col1) and IsStringLike(col2):
02136 col1 = self.GetColIndex(col1)
02137 col2 = self.GetColIndex(col2)
02138 vals1, vals2=([],[])
02139 for v1, v2 in zip(self[col1], self[col2]):
02140 if v1!=None and v2!=None:
02141 vals1.append(v1)
02142 vals2.append(v2)
02143 try:
02144 correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
02145 if scipy.isnan(correl):
02146 return None
02147 return correl
02148 except:
02149 return None
02150
02151 except ImportError:
02152 LogError("Function needs scipy.stats.mstats, but I could not import it.")
02153 raise
02154
02155
02156 def Save(self, stream_or_filename, format='ost', sep=','):
02157 """
Save the table to stream or filename. The following file formats
02159 are supported (for more information on file formats, see :meth:`Load`):
02160
02161 ============= =======================================
02162 ost ost-specific format (human readable)
02163 csv comma separated values (human readable)
02164 pickle pickled byte stream (binary)
02165 html HTML table
02166 context ConTeXt table
02167 ============= =======================================
02168
02169 :param stream_or_filename: filename or stream for writing output
02170 :type stream_or_filename: :class:`str` or :class:`file`
02171
:param format: output format (*ost*, *csv*, *pickle*, *html* or *context*)
:type format: :class:`str`

:param sep: column separator used for the *csv* format
:type sep: :class:`str`
02174
02175 :raises: :class:`ValueError` if format is unknown
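
A minimal usage sketch (the filenames are placeholders):

.. code-block:: python

  tab.Save('data.tab')                         # 'ost' format (default)
  tab.Save('data.csv', format='csv', sep=';')  # CSV with custom separator
  tab.Save(open('data.html', 'w'), format='html')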
02176 """
02177 format=format.lower()
02178 if format=='ost':
02179 return self._SaveOST(stream_or_filename)
02180 if format=='csv':
02181 return self._SaveCSV(stream_or_filename, sep=sep)
02182 if format=='pickle':
02183 return self._SavePickle(stream_or_filename)
02184 if format=='html':
02185 return self._SaveHTML(stream_or_filename)
02186 if format=='context':
02187 return self._SaveContext(stream_or_filename)
02188 raise ValueError('unknown format "%s"' % format)
02189
02190 def _SavePickle(self, stream):
02191 if not hasattr(stream, 'write'):
02192 stream=open(stream, 'wb')
02193 cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
02194
02195 def _SaveHTML(self, stream_or_filename):
02196 def _escape(s):
return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
02198
02199 file_opened = False
02200 if not hasattr(stream_or_filename, 'write'):
02201 stream = open(stream_or_filename, 'w')
02202 file_opened = True
02203 else:
02204 stream = stream_or_filename
02205 stream.write('<table>')
02206 stream.write('<tr>')
02207 for col_name in self.col_names:
02208 stream.write('<th>%s</th>' % _escape(col_name))
02209 stream.write('</tr>')
02210 for row in self.rows:
02211 stream.write('<tr>')
02212 for i, col in enumerate(row):
02213 val = ''
02214 if col != None:
02215 if self.col_types[i] == 'float':
02216 val = '%.3f' % col
02217 elif self.col_types[i] == 'int':
02218 val = '%d' % col
02219 elif self.col_types[i] == 'bool':
02220 val = col and 'true' or 'false'
02221 else:
02222 val = str(col)
02223 stream.write('<td>%s</td>' % _escape(val))
02224 stream.write('</tr>')
02225 stream.write('</table>')
02226 if file_opened:
02227 stream.close()
02228 def _SaveContext(self, stream_or_filename):
02229 file_opened = False
02230 if not hasattr(stream_or_filename, 'write'):
02231 stream = open(stream_or_filename, 'w')
02232 file_opened = True
02233 else:
02234 stream = stream_or_filename
02235 stream.write('\\starttable[')
02236 for col_type in self.col_types:
02237 if col_type =='string':
02238 stream.write('l|')
02239 elif col_type=='int':
02240 stream.write('r|')
02241 elif col_type =='float':
02242 stream.write('i3r|')
02243 else:
02244 stream.write('l|')
02245 stream.write(']\n\\HL\n')
02246 for col_name in self.col_names:
02247 stream.write('\\NC \\bf %s' % col_name)
02248 stream.write(' \\AR\\HL\n')
02249 for row in self.rows:
02250 for i, col in enumerate(row):
02251 val = '---'
02252 if col != None:
02253 if self.col_types[i] == 'float':
02254 val = '%.3f' % col
02255 elif self.col_types[i] == 'int':
02256 val = '%d' % col
02257 elif self.col_types[i] == 'bool':
02258 val = col and 'true' or 'false'
02259 else:
02260 val = str(col)
02261 stream.write('\\NC %s' % val)
02262 stream.write(' \\AR\n')
02263 stream.write('\\HL\n')
02264 stream.write('\\stoptable')
02265 if file_opened:
02266 stream.close()
02267
02268 def _SaveCSV(self, stream, sep):
02269 if not hasattr(stream, 'write'):
02270 stream=open(stream, 'wb')
02271
02272 writer=csv.writer(stream, delimiter=sep)
02273 writer.writerow(['%s' % n for n in self.col_names])
02274 for row in self.rows:
02275 row=list(row)
02276 for i, c in enumerate(row):
02277 if c==None:
02278 row[i]='NA'
02279 writer.writerow(row)
02280
02281 def _SaveOST(self, stream):
02282 if hasattr(stream, 'write'):
02283 writer=csv.writer(stream, delimiter=' ')
02284 else:
02285 stream=open(stream, 'w')
02286 writer=csv.writer(stream, delimiter=' ')
02287 if self.comment:
02288 stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
02289 writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
02290 for row in self.rows:
02291 row=list(row)
02292 for i, c in enumerate(row):
02293 if c==None:
02294 row[i]='NA'
02295 writer.writerow(row)
02296
02297
02298 def GetNumpyMatrix(self, *args):
02299 '''
02300 Returns a numpy matrix containing the selected columns from the table as
02301 columns in the matrix.
02302
02303 Only columns of type *int* or *float* are supported. *NA* values in the
02304 table will be converted to *None* values.
02305
02306 :param \*args: column names to include in numpy matrix
02307
02308 :warning: The function depends on *numpy*
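
A minimal usage sketch (assuming numeric columns ``'x'`` and ``'y'``):

.. code-block:: python

  m = tab.GetNumpyMatrix('x', 'y')
  # m is an Nx2 numpy matrix; m[:,0] holds column 'x', m[:,1] column 'y'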
02309 '''
02310 try:
02311 import numpy as np
02312
02313 if len(args)==0:
02314 raise RuntimeError("At least one column must be specified.")
02315
02316 idxs = []
02317 for arg in args:
02318 idx = self.GetColIndex(arg)
02319 col_type = self.col_types[idx]
02320 if col_type!='int' and col_type!='float':
02321 raise TypeError("Numpy matrix can only be generated from numeric column types")
02322 idxs.append(idx)
02323 m = np.matrix([list(self[i]) for i in idxs])
02324 return m.T
02325
02326 except ImportError:
02327 LogError("Function needs numpy, but I could not import it.")
02328 raise
02329
02330
02331
02332 def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
02333
02334 '''
In-place Gaussian smoothing of a column in the table with a given standard
deviation. All *None* values are set to *na_value* before smoothing.
02337
02338 :param col: column name
02339 :type col: :class:`str`
02340
02341 :param std: standard deviation for gaussian kernel
02342 :type std: `scalar`
02343
:param na_value: all *None* values of the specified column are set to *na_value* before smoothing
02345 :type na_value: `scalar`
02346
:param padding: padding mode; see the scipy ``ndimage.gaussian_filter1d`` documentation for more information. Default is 'reflect'.
02348 :type padding: :class:`str`
02349
02350 :param c: constant value used for padding if padding mode is constant
02351 :type c: `scalar`
02352
02355 :warning: The function depends on *scipy*
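
A minimal usage sketch (the column name ``'signal'`` is a placeholder):

.. code-block:: python

  tab.GaussianSmooth('signal', std=2.0, na_value=0.0, padding='constant', c=0.0)
  # the column 'signal' now holds the smoothed values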
02356 '''
02357
02358 try:
02359 from scipy import ndimage
02360 import numpy as np
02361 except ImportError:
LogError("I need scipy.ndimage and numpy, but could not import them.")
02363 raise
02364
02365 idx = self.GetColIndex(col)
02366 col_type = self.col_types[idx]
02367 if col_type!='int' and col_type!='float':
02368 raise TypeError("GaussianSmooth can only be used on numeric column types")
02369
02370 vals=[]
02371 for v in self[col]:
02372 if v!=None:
02373 vals.append(v)
02374 else:
02375 vals.append(na_value)
02376
02377
02378 smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
02379
02380 result=[]
02381
02382 for v in smoothed_values_ndarray:
02383 result.append(v)
02384
02385 self[col]=result
02386
02387
02388 def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
02389 '''
02390 This returns the optimal prefactor values (i.e. a, b, c, ...) for the
02391 following equation
02392
02393 .. math::
02394 :label: op1
02395
02396 a*u + b*v + c*w + ... = z
02397
02398 where u, v, w and z are vectors. In matrix notation
02399
02400 .. math::
02401 :label: op2
02402
02403 A*p = z
02404
02405 where A contains the data from the table (u,v,w,...), p are the prefactors
02406 to optimize (a,b,c,...) and z is the vector containing the result of
02407 equation :eq:`op1`.
02408
The parameter *ref_col* corresponds to z in both equations, and \*args are
the columns u, v and w (or A in :eq:`op2`). All columns must be specified by
their names.
02411
02412 **Example:**
02413
02414 .. code-block:: python
02415
02416 tab.GetOptimalPrefactors('colC', 'colA', 'colB')
02417
The function returns a list containing the prefactors a, b, c, ... in the
same order as the columns were specified in \*args.
02420
02421 Weighting:
02422 If the kwarg weights="columX" is specified, the equations are weighted by
02423 the values in that column. Each row is multiplied by the weight in that row,
02424 which leads to :eq:`op3`:
02425
02426 .. math::
02427 :label: op3
02428
02429 weight*a*u + weight*b*v + weight*c*w + ... = weight*z
02430
02431 Weights must be float or int and can have any value. A value of 0 ignores
02432 this equation, a value of 1 means the same as no weight. If all weights are
02433 the same for each row, the same result will be obtained as with no weights.
02434
02435 **Example:**
02436
02437 .. code-block:: python
02438
02439 tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
02440
02441 '''
02442 try:
02443 import numpy as np
02444
02445 if len(args)==0:
02446 raise RuntimeError("At least one column must be specified.")
02447
02448 b = self.GetNumpyMatrix(ref_col)
02449 a = self.GetNumpyMatrix(*args)
02450
02451 if len(kwargs)!=0:
02452 if kwargs.has_key('weights'):
02453 w = self.GetNumpyMatrix(kwargs['weights'])
02454 b = np.multiply(b,w)
02455 a = np.multiply(a,w)
02456
02457 else:
raise RuntimeError("unrecognized keyword argument; only 'weights' is supported")
02459
02460 k = (a.T*a).I*a.T*b
02461 return list(np.array(k.T).reshape(-1))
02462
02463 except ImportError:
02464 LogError("Function needs numpy, but I could not import it.")
02465 raise
02466
02467 def PlotEnrichment(self, score_col, class_col, score_dir='-',
02468 class_dir='-', class_cutoff=2.0,
02469 style='-', title=None, x_title=None, y_title=None,
02470 clear=True, save=None):
02471 '''
02472 Plot an enrichment curve using matplotlib of column *score_col* classified
02473 according to *class_col*.
02474
02475 For more information about parameters of the enrichment, see
02476 :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
02477
02478 :warning: The function depends on *matplotlib*
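
A minimal usage sketch (the column names 'score' and 'rmsd' and the output
filename are placeholders):

.. code-block:: python

  tab.PlotEnrichment('score', 'rmsd', class_dir='-', class_cutoff=2.0,
                     save='enrichment.png')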
02479 '''
02480 try:
02481 import matplotlib.pyplot as plt
02482
02483 enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
02484 class_dir, class_cutoff)
02485
02486 if not title:
02487 title = 'Enrichment of %s'%score_col
02488
02489 if not x_title:
02490 x_title = '% database'
02491
02492 if not y_title:
02493 y_title = '% positives'
02494
02495 if clear:
02496 plt.clf()
02497
02498 plt.plot(enrx, enry, style)
02499
02500 plt.title(title, size='x-large', fontweight='bold')
02501 plt.ylabel(y_title, size='x-large')
02502 plt.xlabel(x_title, size='x-large')
02503
02504 if save:
02505 plt.savefig(save)
02506
02507 return plt
02508 except ImportError:
02509 LogError("Function needs matplotlib, but I could not import it.")
02510 raise
02511
02512 def ComputeEnrichment(self, score_col, class_col, score_dir='-',
02513 class_dir='-', class_cutoff=2.0):
02514 '''
02515 Computes the enrichment of column *score_col* classified according to
02516 *class_col*.
02517
02518 For this it is necessary, that the datapoints are classified into positive
02519 and negative points. This can be done in two ways:
02520
02521 - by using one 'bool' type column (*class_col*) which contains *True* for
02522 positives and *False* for negatives
02523
02524 - by specifying a classification column (*class_col*), a cutoff value
02525 (*class_cutoff*) and the classification columns direction (*class_dir*).
02526 This will generate the classification on the fly
02527
02528 * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
02529 * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
02530
During the calculation, the table will be sorted according to *score_dir*,
where a '-' value means smallest values first (i.e. the smaller the value,
the better).
02534
02535 :warning: If either the value of *class_col* or *score_col* is *None*, the
02536 data in this row is ignored.
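
A minimal usage sketch (the column names 'score' and 'rmsd' are
placeholders; rows with rmsd <= 2.0 are counted as positives):

.. code-block:: python

  enr = tab.ComputeEnrichment('score', 'rmsd', score_dir='-',
                              class_dir='-', class_cutoff=2.0)
  if enr:
    enrx, enry = enr  # fraction of database vs. fraction of positives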
02537 '''
02538
02539 ALLOWED_DIR = ['+','-']
02540
02541 score_idx = self.GetColIndex(score_col)
02542 score_type = self.col_types[score_idx]
02543 if score_type!='int' and score_type!='float':
02544 raise TypeError("Score column must be numeric type")
02545
02546 class_idx = self.GetColIndex(class_col)
02547 class_type = self.col_types[class_idx]
02548 if class_type!='int' and class_type!='float' and class_type!='bool':
02549 raise TypeError("Classifier column must be numeric or bool type")
02550
02551 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02552 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02553
02554 self.Sort(score_col, score_dir)
02555
02556 x = [0]
02557 y = [0]
02558 enr = 0
02559 old_score_val = None
02560 i = 0
02561
02562 for row in self.rows:
02563 class_val = row[class_idx]
02564 score_val = row[score_idx]
02565 if class_val==None or score_val==None:
02566 continue
02567 if class_val!=None:
02568 if old_score_val==None:
02569 old_score_val = score_val
02570 if score_val!=old_score_val:
02571 x.append(i)
02572 y.append(enr)
02573 old_score_val = score_val
02574 i+=1
02575 if class_type=='bool':
02576 if class_val==True:
02577 enr += 1
02578 else:
02579 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
02580 enr += 1
02581 x.append(i)
02582 y.append(enr)
02583
# if no data points or no positives are found, return None
02585 if x[-1]==0 or y[-1]==0:
02586 return None
02587
02588 x = [float(v)/x[-1] for v in x]
02589 y = [float(v)/y[-1] for v in y]
02590 return x,y
02591
02592 def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
02593 class_dir='-', class_cutoff=2.0):
02594 '''
02595 Computes the area under the curve of the enrichment using the trapezoidal
02596 rule.
02597
02598 For more information about parameters of the enrichment, see
02599 :meth:`ComputeEnrichment`.
02600
02601 :warning: The function depends on *numpy*
02602 '''
02603 try:
02604 import numpy as np
02605
02606 enr = self.ComputeEnrichment(score_col, class_col, score_dir,
02607 class_dir, class_cutoff)
02608
02609 if enr==None:
02610 return None
02611 return np.trapz(enr[1], enr[0])
02612 except ImportError:
02613 LogError("Function needs numpy, but I could not import it.")
02614 raise
02615
02616 def ComputeROC(self, score_col, class_col, score_dir='-',
02617 class_dir='-', class_cutoff=2.0):
02618 '''
02619 Computes the receiver operating characteristics (ROC) of column *score_col*
02620 classified according to *class_col*.
02621
02622 For this it is necessary, that the datapoints are classified into positive
02623 and negative points. This can be done in two ways:
02624
02625 - by using one 'bool' column (*class_col*) which contains True for positives
02626 and False for negatives
02627 - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
02628 and the classification columns direction (*class_dir*). This will generate
02629 the classification on the fly
02630
02631 - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
02632 - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
02633
During the calculation, the table will be sorted according to *score_dir*,
where a '-' value means smallest values first (i.e. the smaller the value,
the better).
02637
If *class_col* does not contain any positives, i.e. no bool value is True
or no value passes the cutoff (depending on *class_dir* and *class_cutoff*),
the ROC is not defined and the function will return *None*.
02642
02643 :warning: If either the value of *class_col* or *score_col* is *None*, the
02644 data in this row is ignored.
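
A minimal usage sketch (the bool column 'active' is a placeholder marking
the positives):

.. code-block:: python

  roc = tab.ComputeROC('score', 'active')
  if roc:
    fpr, tpr = roc  # false positive rate vs. true positive rate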
02645 '''
02646
02647 ALLOWED_DIR = ['+','-']
02648
02649 score_idx = self.GetColIndex(score_col)
02650 score_type = self.col_types[score_idx]
02651 if score_type!='int' and score_type!='float':
02652 raise TypeError("Score column must be numeric type")
02653
02654 class_idx = self.GetColIndex(class_col)
02655 class_type = self.col_types[class_idx]
02656 if class_type!='int' and class_type!='float' and class_type!='bool':
02657 raise TypeError("Classifier column must be numeric or bool type")
02658
02659 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02660 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02661
02662 self.Sort(score_col, score_dir)
02663
02664 x = [0]
02665 y = [0]
02666 tp = 0
02667 fp = 0
02668 old_score_val = None
02669
02670 for i,row in enumerate(self.rows):
02671 class_val = row[class_idx]
02672 score_val = row[score_idx]
02673 if class_val==None or score_val==None:
02674 continue
02675 if class_val!=None:
02676 if old_score_val==None:
02677 old_score_val = score_val
02678 if score_val!=old_score_val:
02679 x.append(fp)
02680 y.append(tp)
02681 old_score_val = score_val
02682 if class_type=='bool':
02683 if class_val==True:
02684 tp += 1
02685 else:
02686 fp += 1
02687 else:
02688 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
02689 tp += 1
02690 else:
02691 fp += 1
02692 x.append(fp)
02693 y.append(tp)
02694
# if no false positives or no true positives are found, return None
02696 if x[-1]==0 or y[-1]==0:
02697 return None
02698
02699 x = [float(v)/x[-1] for v in x]
02700 y = [float(v)/y[-1] for v in y]
02701 return x,y
02702
02703 def ComputeROCAUC(self, score_col, class_col, score_dir='-',
02704 class_dir='-', class_cutoff=2.0):
02705 '''
02706 Computes the area under the curve of the receiver operating characteristics
02707 using the trapezoidal rule.
02708
02709 For more information about parameters of the ROC, see
02710 :meth:`ComputeROC`.
02711
02712 :warning: The function depends on *numpy*
02713 '''
02714 try:
02715 import numpy as np
02716
02717 roc = self.ComputeROC(score_col, class_col, score_dir,
02718 class_dir, class_cutoff)
02719
02720 if not roc:
02721 return None
02722 return np.trapz(roc[1], roc[0])
02723 except ImportError:
02724 LogError("Function needs numpy, but I could not import it.")
02725 raise
02726
02727 def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
02728 class_dir='-', class_cutoff=2.0):
02729 '''
02730 Computes the area under the curve of the log receiver operating
02731 characteristics (logROC) where the x-axis is semilogarithmic
02732 using the trapezoidal rule.
02733
02734 The logROC is computed with a lambda of 0.001 according to
02735 Rapid Context-Dependent Ligand Desolvation in Molecular Docking
02736 Mysinger M. and Shoichet B., Journal of Chemical Information and Modeling
02737 2010 50 (9), 1561-1573
02738
02739 For more information about parameters of the ROC, see
02740 :meth:`ComputeROC`.
02741
02742 :warning: The function depends on *numpy*
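
In terms of the ROC curve y(x) (true positive rate as a function of the
false positive rate), the value computed here corresponds to

.. math::

  logAUC = \int_{\lambda}^{1} y(x) \; d(\log_{10} x) \; / \; \log_{10}(1/\lambda)

with :math:`\lambda=0.001`; x-values of 0 are replaced by :math:`\lambda`
before integration.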
02743 '''
02744 try:
02745 import numpy as np
02746
02747 roc = self.ComputeROC(score_col, class_col, score_dir,
02748 class_dir, class_cutoff)
02749
02750 if not roc:
02751 return None
02752
02753 rocxt, rocyt = roc
02754 rocx=[]
02755 rocy=[]
02756
02757 # define lambda
02758 l=0.001
02759
# replace x-values of 0 by lambda, then remove duplicate x-values
rocxt = [x if x>0 else l for x in rocxt]
02762 for i in range(len(rocxt)-1):
02763 if rocxt[i]==rocxt[i+1]:
02764 continue
02765 rocx.append(rocxt[i])
02766 rocy.append(rocyt[i])
02767 rocx.append(1.0)
02768 rocy.append(1.0)
02769
02770 # compute logauc
02771 value = 0
02772 for i in range(len(rocx)-1):
02774 if rocx[i]==rocx[i+1]:
02775 continue
02776 b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
02777 value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
02778 return value/math.log10(1.0/l)
02779
02780 except ImportError:
02781 LogError("Function needs numpy, but I could not import it.")
02782 raise
02783
02784 def PlotROC(self, score_col, class_col, score_dir='-',
02785 class_dir='-', class_cutoff=2.0,
02786 style='-', title=None, x_title=None, y_title=None,
02787 clear=True, save=None):
02788 '''
02789 Plot an ROC curve using matplotlib.
02790
02791 For more information about parameters of the ROC, see
02792 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
02793
02794 :warning: The function depends on *matplotlib*
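
A minimal usage sketch (column names and filename are placeholders):

.. code-block:: python

  plt = tab.PlotROC('score', 'active', save='roc.png')
  if plt:
    plt.show()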
02795 '''
02796
02797 try:
02798 import matplotlib.pyplot as plt
02799
02800 roc = self.ComputeROC(score_col, class_col, score_dir,
02801 class_dir, class_cutoff)
02802
02803 if not roc:
02804 return None
02805
02806 enrx, enry = roc
02807
02808 if not title:
02809 title = 'ROC of %s'%score_col
02810
02811 if not x_title:
02812 x_title = 'false positive rate'
02813
02814 if not y_title:
02815 y_title = 'true positive rate'
02816
02817 if clear:
02818 plt.clf()
02819
02820 plt.plot(enrx, enry, style)
02821
02822 plt.title(title, size='x-large', fontweight='bold')
02823 plt.ylabel(y_title, size='x-large')
02824 plt.xlabel(x_title, size='x-large')
02825
02826 if save:
02827 plt.savefig(save)
02828
02829 return plt
02830 except ImportError:
02831 LogError("Function needs matplotlib, but I could not import it.")
02832 raise
02833
02834 def PlotLogROC(self, score_col, class_col, score_dir='-',
02835 class_dir='-', class_cutoff=2.0,
02836 style='-', title=None, x_title=None, y_title=None,
02837 clear=True, save=None):
02838 '''
Plot a logROC curve where the x-axis is semilogarithmic using matplotlib.
02840
02841 For more information about parameters of the ROC, see
02842 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
02843
02844 :warning: The function depends on *matplotlib*
02845 '''
02846
02847 try:
02848 import matplotlib.pyplot as plt
02849
02850 roc = self.ComputeROC(score_col, class_col, score_dir,
02851 class_dir, class_cutoff)
02852
02853 if not roc:
02854 return None
02855
02856 rocx, rocy = roc
02857
02858 if not title:
02859 title = 'logROC of %s'%score_col
02860
02861 if not x_title:
02862 x_title = 'false positive rate'
02863
02864 if not y_title:
02865 y_title = 'true positive rate'
02866
02867 if clear:
02868 plt.clf()
02869
02870 rocx = [x if x>0 else 0.001 for x in rocx]
02871
02872
02873 plt.plot(rocx, rocy, style)
02874
02875 plt.title(title, size='x-large', fontweight='bold')
02876 plt.ylabel(y_title, size='x-large')
02877 plt.xlabel(x_title, size='x-large')
02878
02879 plt.xscale('log', basex=10)
02880 plt.xlim(0.001, 1.0)
02881
02882
02883 if save:
02884 plt.savefig(save)
02885
02886 return plt
02887 except ImportError:
02888 LogError("Function needs matplotlib, but I could not import it.")
02889 raise
02890
02891 def ComputeMCC(self, score_col, class_col, score_dir='-',
02892 class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
02893 '''
02894 Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
02895 with the points classified into true positives, false positives, true
02896 negatives and false negatives according to a specified classification
02897 column (*class_col*).
02898
02899 The datapoints in *score_col* and *class_col* are classified into
02900 positive and negative points. This can be done in two ways:
02901
02902 - by using 'bool' columns which contains True for positives and False
02903 for negatives
02904
02905 - by using 'float' or 'int' columns and specifying a cutoff value and the
02906 columns direction. This will generate the classification on the fly
02907
02908 * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
02909 * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
02910
02911 The two possibilities can be used together, i.e. 'bool' type for one column
02912 and 'float'/'int' type and cutoff/direction for the other column.
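
With the counts of true positives (tp), false positives (fp), true
negatives (tn) and false negatives (fn) obtained from this classification,
the returned value is

.. math::

  MCC = (tp \cdot tn - fp \cdot fn) \; / \;
        \sqrt{(tp+fn)(tp+fp)(tn+fn)(tn+fp)}

If any of the four factors in the denominator is zero, the MCC is not
defined and *None* is returned.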
02913 '''
02914 ALLOWED_DIR = ['+','-']
02915
02916 score_idx = self.GetColIndex(score_col)
02917 score_type = self.col_types[score_idx]
02918 if score_type!='int' and score_type!='float' and score_type!='bool':
02919 raise TypeError("Score column must be numeric or bool type")
02920
02921 class_idx = self.GetColIndex(class_col)
02922 class_type = self.col_types[class_idx]
02923 if class_type!='int' and class_type!='float' and class_type!='bool':
02924 raise TypeError("Classifier column must be numeric or bool type")
02925
02926 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02927 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02928
02929 tp = 0
02930 fp = 0
02931 fn = 0
02932 tn = 0
02933
02934 for i,row in enumerate(self.rows):
02935 class_val = row[class_idx]
02936 score_val = row[score_idx]
02937 if class_val!=None:
02938 if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
02939 if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
02940 tp += 1
02941 else:
02942 fn += 1
02943 else:
02944 if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
02945 tn += 1
02946 else:
02947 fp += 1
02948
02949 mcc = None
02950 msg = None
02951 if (tp+fn)==0:
02952 msg = 'factor (tp + fn) is zero'
02953 elif (tp+fp)==0:
02954 msg = 'factor (tp + fp) is zero'
02955 elif (tn+fn)==0:
02956 msg = 'factor (tn + fn) is zero'
02957 elif (tn+fp)==0:
02958 msg = 'factor (tn + fp) is zero'
02959
02960 if msg:
02961 LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
02962 else:
02963 mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
02964 return mcc
02965
02966
02967 def IsEmpty(self, col_name=None, ignore_nan=True):
02968 '''
02969 Checks if a table is empty.
02970
02971 If no column name is specified, the whole table is checked for being empty,
02972 whereas if a column name is specified, only this column is checked.
02973
02974 By default, all NAN (or None) values are ignored, and thus, a table
02975 containing only NAN values is considered as empty. By specifying the
02976 option ignore_nan=False, NAN values are counted as 'normal' values.
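
A minimal usage sketch (the column name 'x' is a placeholder):

.. code-block:: python

  tab.IsEmpty()                       # whole table
  tab.IsEmpty('x')                    # only column 'x'
  tab.IsEmpty('x', ignore_nan=False)  # count None cells as values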
02977 '''
02978
02979 # table with no columns and no rows
02980 if len(self.col_names)==0:
02981 if col_name:
02982 raise ValueError('Table has no column named "%s"' % col_name)
02983 return True
02984
02985 # column name specified
02986 if col_name:
02987 if self.Count(col_name, ignore_nan=ignore_nan)==0:
02988 return True
02989 else:
02990 return False
02991
02992 # no column name specified -> test whole table
02993 else:
02994 for row in self.rows:
02995 for cell in row:
02996 if ignore_nan:
02997 if cell!=None:
02998 return False
02999 else:
03000 return False
03001 return True
03002
03003
03004 def Extend(self, tab, overwrite=None):
03005 """
03006 Append each row of *tab* to the current table. The data is appended based
03007 on the column names, thus the order of the table columns is *not* relevant,
03008 only the header names.
03009
03010 If there is a column in *tab* that is not present in the current table,
03011 it is added to the current table and filled with *None* for all the rows
03012 present in the current table.
03013
03014 If the type of any column in *tab* is not the same as in the current table
03015 a *TypeError* is raised.
03016
If *overwrite* is not None and set to an existing column name, the rows of
*tab* are matched against the current table on that column: for each row of
*tab*, the current table is searched for the first row with the same value
in the *overwrite* column. If such a row is found, it is overwritten with
the new data; otherwise a new row is appended to the table.
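
A minimal usage sketch (``tab1`` and ``tab2`` and the shared column 'name'
are placeholders):

.. code-block:: python

  tab1.Extend(tab2)                    # append all rows of tab2
  tab1.Extend(tab2, overwrite='name')  # overwrite rows with matching 'name'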
03022 """
03023 # add column to current table if it doesn't exist
03024 for name,typ in zip(tab.col_names, tab.col_types):
03025 if not name in self.col_names:
03026 self.AddCol(name, typ)
03027
03028 # check that column types are the same in current and new table
03029 for name in self.col_names:
03030 if name in tab.col_names:
03031 curr_type = self.col_types[self.GetColIndex(name)]
03032 new_type = tab.col_types[tab.GetColIndex(name)]
03033 if curr_type!=new_type:
03034 raise TypeError('cannot extend table, column %s in new '%name +\
03035 'table different type (%s) than in '%new_type +\
03036 'current table (%s)'%curr_type)
03037
03038 num_rows = len(tab.rows)
03039 for i in range(0,num_rows):
03040 row = tab.rows[i]
03041 data = dict(zip(tab.col_names,row))
03042 self.AddRow(data, overwrite)
03043
03044
03045 def Merge(table1, table2, by, only_matching=False):
03046 """
03047 Returns a new table containing the data from both tables. The rows are
03048 combined based on the common values in the column(s) by. The option 'by' can
03049 be a list of column names. When this is the case, merging is based on
03050 multiple columns.
03051 For example, the two tables below
03052
03053 ==== ====
03054 x y
03055 ==== ====
03056 1 10
03057 2 15
03058 3 20
03059 ==== ====
03060
03061 ==== ====
03062 x u
03063 ==== ====
03064 1 100
03065 3 200
03066 4 400
03067 ==== ====
03068
03069 when merged by column x, produce the following output:
03070
03071 ===== ===== =====
03072 x y u
03073 ===== ===== =====
03074 1 10 100
03075 2 15 None
03076 3 20 200
03077 4 None 400
03078 ===== ===== =====
03079
03080
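A sketch of the corresponding call (``tab1`` and ``tab2`` hold the two
tables above):

.. code-block:: python

  merged = Merge(tab1, tab2, by='x')
  # keep only rows present in both tables:
  matched_only = Merge(tab1, tab2, by='x', only_matching=True)
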
03081 """
03082 def _key(row, indices):
03083 return tuple([row[i] for i in indices])
03084 def _keep(indices, cn, ct, ni):
03085 ncn, nct, nni=([],[],[])
03086 for i in range(len(cn)):
03087 if i not in indices:
03088 ncn.append(cn[i])
03089 nct.append(ct[i])
03090 nni.append(ni[i])
03091 return ncn, nct, nni
03092 col_names=list(table2.col_names)
03093 col_types=list(table2.col_types)
03094 new_index=[i for i in range(len(col_names))]
03095 if isinstance(by, str):
03096 common2_indices=[col_names.index(by)]
03097 else:
03098 common2_indices=[col_names.index(b) for b in by]
03099 col_names, col_types, new_index=_keep(common2_indices, col_names,
03100 col_types, new_index)
03101
03102 for i, name in enumerate(col_names):
03103 try_name=name
03104 counter=1
03105 while try_name in table1.col_names:
03106 counter+=1
03107 try_name='%s_%d' % (name, counter)
03108 col_names[i]=try_name
03109 common1={}
03110 if isinstance(by, str):
03111 common1_indices=[table1.col_names.index(by)]
03112 else:
03113 common1_indices=[table1.col_names.index(b) for b in by]
03114 for row in table1.rows:
03115 key=_key(row, common1_indices)
03116 if key in common1:
raise ValueError('duplicate key "%s" in first table' % (str(key)))
03118 common1[key]=row
03119 common2={}
03120 for row in table2.rows:
03121 key=_key(row, common2_indices)
03122 if key in common2:
03123 raise ValueError('duplicate key "%s" in second table' % (str(key)))
03124 common2[key]=row
03125 new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
03126 for k, v in common1.iteritems():
03127 row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
03128 matched=False
03129 if k in common2:
03130 matched=True
03131 row2=common2[k]
03132 for i, index in enumerate(new_index):
03133 row[len(table1.col_names)+i]=row2[index]
03134 if only_matching and not matched:
03135 continue
03136 new_tab.AddRow(row)
03137 if only_matching:
03138 return new_tab
03139 for k, v in common2.iteritems():
03140 if not k in common1:
03141 v2=[v[i] for i in new_index]
03142 row=[None for i in range(len(table1.col_names))]+v2
03143 for common1_index, common2_index in zip(common1_indices, common2_indices):
03144 row[common1_index]=v[common2_index]
03145 new_tab.AddRow(row)
03146 return new_tab
03147