00001 import csv
00002 import re
00003 import math
00004 from ost import stutil
00005 import itertools
00006 import operator
00007 import cPickle
00008 import weakref
00009 from ost import LogError, LogWarning, LogInfo, LogVerbose
00010
00011 def MakeTitle(col_name):
00012 return col_name.replace('_', ' ')
00013
00014 def IsStringLike(value):
00015 if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
00016 return False
00017 try:
00018 value+''
00019 return True
00020 except:
00021 return False
00022
00023 def IsNullString(value):
00024 value=value.strip().upper()
00025 return value in ('', 'NULL', 'NONE', 'NA')
00026
00027 def IsScalar(value):
00028 if IsStringLike(value):
00029 return True
00030 try:
00031 if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
00032 return False
00033 iter(value)
00034 return False
00035 except:
00036 return True
00037
00038 def GuessColumnType(iterator):
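  '''
  Guess a column type for the values produced by *iterator* and return one of
  'int', 'float', 'bool' or 'string'. Null-like values ('', 'NULL', 'NONE',
  'NA') are ignored; if only such values (or no values at all) are present,
  'string' is returned.
  '''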
00039 empty=True
00040 possibilities=set(['bool', 'int', 'float'])
00041 for ele in iterator:
00042 str_ele=str(ele).upper()
00043 if IsNullString(str_ele):
00044 continue
00045 empty=False
00046 if 'int' in possibilities:
00047 try:
00048 int(str_ele)
00049 except ValueError:
00050 possibilities.remove('int')
00051
00052 if 'float' in possibilities:
00053 try:
00054 float(str_ele)
00055 except ValueError:
00056 possibilities.remove('float')
00057 if 'bool' in possibilities:
00058 if str_ele not in set(['YES', 'NO', 'TRUE', 'FALSE']):
00059 possibilities.remove('bool')
00060
00061 if len(possibilities)==0:
00062 return 'string'
00063 if len(possibilities)==2:
00064 return 'int'
00065 if empty:
00066 return 'string'
00067
00068 return possibilities.pop()
00069
00070 class BinaryColExpr:
00071 def __init__(self, op, lhs, rhs):
00072 self.op=op
00073 self.lhs=lhs
00074 self.rhs=rhs
00075 if IsScalar(lhs):
self.lhs=itertools.cycle([self.lhs])
00077 if IsScalar(rhs):
00078 self.rhs=itertools.cycle([self.rhs])
00079 def __iter__(self):
00080 for l, r in zip(self.lhs, self.rhs):
00081 if l!=None and r!=None:
00082 yield self.op(l, r)
00083 else:
00084 yield None
00085 def __add__(self, rhs):
00086 return BinaryColExpr(operator.add, self, rhs)
00087
00088 def __sub__(self, rhs):
00089 return BinaryColExpr(operator.sub, self, rhs)
00090
00091 def __mul__(self, rhs):
00092 return BinaryColExpr(operator.mul, self, rhs)
00093
00094 def __div__(self, rhs):
00095 return BinaryColExpr(operator.div, self, rhs)
00096
00097 class TableCol:
00098 def __init__(self, table, col):
00099 self._table=table
00100 if type(col)==str:
00101 self.col_index=self._table.GetColIndex(col)
00102 else:
00103 self.col_index=col
00104
00105 def __iter__(self):
00106 for row in self._table.rows:
00107 yield row[self.col_index]
00108
00109 def __len__(self):
00110 return len(self._table.rows)
00111
00112 def __getitem__(self, index):
00113 return self._table.rows[index][self.col_index]
00114
00115 def __setitem__(self, index, value):
00116 self._table.rows[index][self.col_index]=value
00117
00118 def __add__(self, rhs):
00119 return BinaryColExpr(operator.add, self, rhs)
00120
00121 def __sub__(self, rhs):
00122 return BinaryColExpr(operator.sub, self, rhs)
00123
00124 def __mul__(self, rhs):
00125 return BinaryColExpr(operator.mul, self, rhs)
00126
00127 def __div__(self, rhs):
00128 return BinaryColExpr(operator.div, self, rhs)
00129
00130 class TableRow:
00131 """
00132 Essentially a named tuple, but allows column names that are not valid
00133 python variable names.
00134 """
00135 def __init__(self, row_data, tab):
00136 self.__dict__['tab'] = weakref.proxy(tab)
00137 self.__dict__['row_data'] = row_data
00138
00139 def __getitem__(self, col_name):
00140 if type(col_name)==int:
00141 return self.row_data[col_name]
00142 return self.row_data[self.tab.GetColIndex(col_name)]
00143
00144 def __str__(self):
00145 s = []
00146 for k, v in zip(self.__dict__['tab'].col_names, self.__dict__['row_data']):
00147 s.append('%s=%s' % (k, str(v)))
00148 return ', '.join(s)
00149
00150
00151 def __len__(self):
00152 return len(self.row_data)
00153
00154 def __setitem__(self, col_name, val):
00155 if type(col_name)==int:
00156 self.row_data[col_name] = val
00157 else:
00158 self.row_data[self.tab.GetColIndex(col_name)] = val
00159
00160 def __getattr__(self, col_name):
00161 if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
00162 raise AttributeError(col_name)
00163 return self.row_data[self.tab.GetColIndex(col_name)]
00164
00165 def __setattr__(self, col_name, val):
00166 if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
00167 raise AttributeError(col_name)
00168 self.row_data[self.tab.GetColIndex(col_name)] = val
00169
00170 class Table(object):
00171 """
00172
00173 The table class provides convenient access to data in tabular form. An empty
00174 table can be easily constructed as follows
00175
00176 .. code-block:: python
00177
00178 tab = Table()
00179
00180 If you want to add columns directly when creating the table, column names
00181 and *column types* can be specified as follows
00182
00183 .. code-block:: python
00184
00185 tab = Table(['nameX','nameY','nameZ'], 'sfb')
00186
00187 this will create three columns called nameX, nameY and nameZ of type string,
00188 float and bool, respectively. There will be no data in the table and thus,
00189 the table will not contain any rows.
00190
00191 The following *column types* are supported:
00192
00193 ======= ========
00194 name abbrev
00195 ======= ========
00196 string s
00197 float f
00198 int i
00199 bool b
00200 ======= ========
00201
00202 If you want to add data to the table in addition, use the following:
00203
00204 .. code-block:: python
00205
00206 tab=Table(['nameX','nameY','nameZ'],
00207 'sfb',
00208 nameX = ['a','b','c'],
00209 nameY = [0.1, 1.2, 3.414],
00210 nameZ = [True, False, False])
00211
If the values for a column are left out, they will be filled with NA. If
values are specified, the same number of values must be given for each
column.
00215
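  For illustration, in the following example (with arbitrary values) the
  column nameY is not specified and is therefore filled with NA for every row:

  .. code-block:: python

    tab = Table(['nameX','nameY','nameZ'],
                'sfb',
                nameX = ['a','b'],
                nameZ = [True, False])
    print tab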
00216 """
00217
00218 SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
00219
00220
00221 def __init__(self, col_names=[], col_types=None, **kwargs):
00222
00223 self.col_names=list(col_names)
00224 self.comment=''
00225 self.name=''
00226
00227 self.col_types = self._ParseColTypes(col_types)
00228 self.rows=[]
00229 if len(kwargs)>=0:
00230 if not col_names:
00231 self.col_names=[v for v in kwargs.keys()]
00232 if not self.col_types:
00233 self.col_types=['string' for u in range(len(self.col_names))]
00234 if len(kwargs)>0:
00235 self._AddRowsFromDict(kwargs)
00236
00237 def __getattr__(self, col_name):
00238
00239
00240
00241
00242 if 'col_names' not in self.__dict__ or col_name not in self.col_names:
00243 raise AttributeError(col_name)
00244 return TableCol(self, col_name)
00245
00246 @staticmethod
00247 def _ParseColTypes(types, exp_num=None):
00248 if types==None:
00249 return None
00250
00251 short2long = {'s' : 'string', 'i': 'int', 'b' : 'bool', 'f' : 'float'}
00252 allowed_short = short2long.keys()
00253 allowed_long = short2long.values()
00254
00255 type_list = []
00256
00257
00258 if IsScalar(types):
00259 if type(types)==str:
00260 types = types.lower()
00261
00262
00263 if types in allowed_long:
00264 type_list.append(types)
00265 elif types in allowed_short:
00266 type_list.append(short2long[types])
00267
00268
00269 elif types.find(',')!=-1:
00270 for t in types.split(','):
00271 if t in allowed_long:
00272 type_list.append(t)
00273 elif t in allowed_short:
00274 type_list.append(short2long[t])
00275 else:
00276 raise ValueError('Unknown type %s in types %s'%(t,types))
00277
00278
00279 else:
00280 for t in types:
00281 if t in allowed_short:
00282 type_list.append(short2long[t])
00283 else:
00284 raise ValueError('Unknown type %s in types %s'%(t,types))
00285
00286
00287 else:
00288 raise ValueError('Col type %s must be string or list'%types)
00289
00290
00291 else:
00292 for t in types:
00293
00294 if type(t)==str:
00295 t = t.lower()
00296 if t in allowed_long:
00297 type_list.append(t)
00298 elif t in allowed_short:
00299 type_list.append(short2long[t])
00300 else:
00301 raise ValueError('Unknown type %s in types %s'%(t,types))
00302
00303
00304 else:
00305 raise ValueError('Col type %s must be string or list'%types)
00306
00307 if exp_num:
00308 if len(type_list)!=exp_num:
raise ValueError(('Parsed number of col types (%i) differs from '
'expected (%i) in types %s')%(len(type_list),exp_num,types))
00311
00312 return type_list
00313
00314 def SetName(self, name):
00315 '''
00316 Set name of the table
00317
00318 :param name: name
00319 :type name: :class:`str`
00320 '''
00321 self.name = name
00322
00323 def GetName(self):
00324 '''
00325 Get name of table
00326 '''
00327 return self.name
00328
00329 def RenameCol(self, old_name, new_name):
00330 """
00331 Rename column *old_name* to *new_name*.
00332
00333 :param old_name: Name of the old column
00334 :param new_name: Name of the new column
00335 :raises: :exc:`ValueError` when *old_name* is not a valid column
00336 """
00337 if old_name==new_name:
00338 return
00339 self.AddCol(new_name, self.col_types[self.GetColIndex(old_name)],
00340 self[old_name])
00341 self.RemoveCol(old_name)
00342 def _Coerce(self, value, ty):
00343 '''
00344 Try to convert values (e.g. from :class:`str` type) to the specified type
00345
00346 :param value: the value
00347 :type value: any type
00348
00349 :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
00350 *bool*)
00351 :type ty: :class:`str`
00352 '''
00353 if value=='NA' or value==None:
00354 return None
00355 if ty=='int':
00356 return int(value)
00357 if ty=='float':
00358 return float(value)
00359 if ty=='string':
00360 return str(value)
00361 if ty=='bool':
00362 if isinstance(value, str) or isinstance(value, unicode):
00363 if value.upper() in ('FALSE', 'NO',):
00364 return False
00365 return True
00366 return bool(value)
00367 raise ValueError('Unknown type %s' % ty)
00368
00369 def GetColIndex(self, col):
00370 '''
00371 Returns the column index for the column with the given name.
00372
00373 :raises: ValueError if no column with the name is found.
00374 '''
00375 if col not in self.col_names:
00376 raise ValueError('Table has no column named "%s"' % col)
00377 return self.col_names.index(col)
00378
00379 def GetColNames(self):
00380 '''
00381 Returns a list containing all column names.
00382 '''
00383 return self.col_names
00384
00385 def SearchColNames(self, regex):
00386 '''
00387 Returns a list of column names matching the regex.
00388
00389 :param regex: regex pattern
00390 :type regex: :class:`str`
00391
00392 :returns: :class:`list` of column names (:class:`str`)
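
    For illustration, assuming a table with columns 'foo_x', 'foo_y' and 'bar':

    .. code-block:: python

      tab = Table(['foo_x','foo_y','bar'], 'fff')
      print tab.SearchColNames('^foo_')   # prints ['foo_x', 'foo_y']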
00393 '''
00394 matching_names = []
00395 for name in self.col_names:
00396 matches = re.search(regex, name)
00397 if matches:
00398 matching_names.append(name)
00399 return matching_names
00400
00401 def HasCol(self, col):
00402 '''
00403 Checks if the column with a given name is present in the table.
00404 '''
00405 return col in self.col_names
00406
00407 def __getitem__(self, k):
00408 if type(k)==int:
00409 return TableCol(self, self.col_names[k])
00410 else:
00411 return TableCol(self, k)
00412
00413 def __setitem__(self, k, value):
00414 col_index=k
00415 if type(k)!=int:
00416 col_index=self.GetColIndex(k)
00417 if IsScalar(value):
00418 value=itertools.cycle([value])
00419 for r, v in zip(self.rows, value):
00420 r[col_index]=v
00421
00422 def ToString(self, float_format='%.3f', int_format='%d', rows=None):
00423 '''
00424 Convert the table into a string representation.
00425
00426 The output format can be modified for int and float type columns by
00427 specifying a formatting string for the parameters *float_format* and
00428 *int_format*.
00429
The option *rows* specifies the range of rows to be printed. The parameter
00431 must be a type that supports indexing (e.g. a :class:`list`) containing the
00432 start and end row *index*, e.g. [start_row_idx, end_row_idx].
00433
00434 :param float_format: formatting string for float columns
00435 :type float_format: :class:`str`
00436
00437 :param int_format: formatting string for int columns
00438 :type int_format: :class:`str`
00439
00440 :param rows: iterable containing start and end row *index*
00441 :type rows: iterable containing :class:`ints <int>`
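
    For example (column names and values are arbitrary):

    .. code-block:: python

      tab = Table(['x','y'], 'ff', x=[1.0, 2.0, 3.0], y=[0.5, 1.5, 2.5])
      # print only the first two rows, floats with two digits
      print tab.ToString(float_format='%.2f', rows=[0, 2])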
00442 '''
00443 widths=[len(cn) for cn in self.col_names]
00444 sel_rows=self.rows
00445 if rows:
00446 sel_rows=self.rows[rows[0]:rows[1]]
00447 for row in sel_rows:
00448 for i, (ty, col) in enumerate(zip(self.col_types, row)):
00449 if col==None:
00450 widths[i]=max(widths[i], len('NA'))
00451 elif ty=='float':
00452 widths[i]=max(widths[i], len(float_format % col))
00453 elif ty=='int':
00454 widths[i]=max(widths[i], len(int_format % col))
00455 else:
00456 widths[i]=max(widths[i], len(str(col)))
00457 s=''
00458 if self.comment:
00459 s+=''.join(['# %s\n' % l for l in self.comment.split('\n')])
00460 total_width=sum(widths)+2*len(widths)
00461 for width, col_name in zip(widths, self.col_names):
00462 s+=col_name.center(width+2)
00463 s+='\n%s\n' % ('-'*total_width)
00464 for row in sel_rows:
00465 for width, ty, col in zip(widths, self.col_types, row):
00466 cs=''
00467 if col==None:
00468 cs='NA'.center(width+2)
00469 elif ty=='float':
00470 cs=(float_format % col).rjust(width+2)
00471 elif ty=='int':
00472 cs=(int_format % col).rjust(width+2)
00473 else:
00474 cs=' '+str(col).ljust(width+1)
00475 s+=cs
00476 s+='\n'
00477 return s
00478
00479 def __str__(self):
00480 return self.ToString()
00481
00482 def Stats(self, col):
00483 idx = self.GetColIndex(col)
00484 text ='''
00485 Statistics for column %(col)s
00486
00487 Number of Rows : %(num)d
00488 Number of Rows Not None: %(num_non_null)d
00489 Mean : %(mean)f
00490 Median : %(median)f
00491 Standard Deviation : %(stddev)f
00492 Min : %(min)f
00493 Max : %(max)f
00494 '''
00495 data = {
00496 'col' : col,
00497 'num' : len(self.rows),
00498 'num_non_null' : self.Count(col),
00499 'median' : self.Median(col),
00500 'mean' : self.Mean(col),
00501 'stddev' : self.StdDev(col),
00502 'min' : self.Min(col),
00503 'max' : self.Max(col),
00504 }
00505 return text % data
00506
00507 def _AddRowsFromDict(self, d, overwrite=None):
00508 '''
00509 Add one or more rows from a :class:`dictionary <dict>`.
00510
00511 If *overwrite* is not None and set to an existing column name, the specified
00512 column in the table is searched for the first occurrence of a value matching
00513 the value of the column with the same name in the dictionary. If a matching
00514 value is found, the row is overwritten with the dictionary. If no matching
00515 row is found, a new row is appended to the table.
00516
00517 :param d: dictionary containing the data
00518 :type d: :class:`dict`
00519
00520 :param overwrite: column name to overwrite existing row if value in
00521 column *overwrite* matches
00522 :type overwrite: :class:`str`
00523
00524 :raises: :class:`ValueError` if multiple rows are added but the number of
00525 data items is different for different columns.
00526 '''
00527
00528 idxs = [self.GetColIndex(k) for k in d.keys()]
00529
00530
00531 old_len = None
00532 for k,v in d.iteritems():
00533 if IsScalar(v):
00534 v = [v]
00535 d[k] = v
00536 if not old_len:
00537 old_len = len(v)
00538 elif old_len!=len(v):
00539 raise ValueError("Cannot add rows: length of data must be equal " + \
00540 "for all columns in %s"%str(d))
00541
00542
00543 for i,data in enumerate(zip(*d.values())):
00544 new_row = [None for a in range(len(self.col_names))]
00545 for idx,v in zip(idxs,data):
00546 new_row[idx] = self._Coerce(v, self.col_types[idx])
00547
00548
00549 if overwrite:
00550 overwrite_idx = self.GetColIndex(overwrite)
00551 added = False
00552 for i,r in enumerate(self.rows):
00553 if r[overwrite_idx]==new_row[overwrite_idx]:
00554 for j,e in enumerate(self.rows[i]):
00555 if new_row[j]==None:
00556 new_row[j] = e
00557 self.rows[i] = new_row
00558 added = True
00559 break
00560
00561
00562 if not overwrite or not added:
00563 self.rows.append(new_row)
00564
00565 def PairedTTest(self, col_a, col_b):
00566 """
00567 Two-sided test for the null-hypothesis that two related samples
00568 have the same average (expected values).
00569
00570 :param col_a: First column
00571 :param col_b: Second column
00572
00573 :returns: P-value between 0 and 1 that the two columns have the
00574 same average. The smaller the value, the less related the two
00575 columns are.
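
    For illustration (requires scipy; column names and values are arbitrary):

    .. code-block:: python

      tab = Table(['a','b'], 'ff', a=[1.0, 2.0, 3.0], b=[1.1, 2.2, 2.9])
      p_value = tab.PairedTTest('a', 'b')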
00576 """
00577 from scipy.stats import ttest_rel
00578 xs = []
00579 ys = []
00580 for x, y in self.Zip(col_a, col_b):
00581 if x!=None and y!=None:
00582 xs.append(x)
00583 ys.append(y)
00584 result = ttest_rel(xs, ys)
00585 return result[1]
00586
00587 def AddRow(self, data, overwrite=None):
00588 """
00589 Add a row to the table.
00590
00591 *data* may either be a dictionary or a list-like object:
00592
00593 - If *data* is a dictionary, the keys in the dictionary must match the
00594 column names. Columns not found in the dict will be initialized to None.
00595 If the dict contains list-like objects, multiple rows will be added, if
00596 the number of items in all list-like objects is the same, otherwise a
00597 :class:`ValueError` is raised.
00598
00599 - If *data* is a list-like object, the row is initialized from the values
00600 in *data*. The number of items in *data* must match the number of
columns in the table. A :class:`ValueError` is raised otherwise. The
00602 values are added in the order specified in the list, thus, the order of
00603 the data must match the columns.
00604
00605 If *overwrite* is not None and set to an existing column name, the specified
00606 column in the table is searched for the first occurrence of a value matching
00607 the value of the column with the same name in the dictionary. If a matching
00608 value is found, the row is overwritten with the dictionary. If no matching
00609 row is found, a new row is appended to the table.
00610
00611 :param data: data to add
00612 :type data: :class:`dict` or *list-like* object
00613
00614 :param overwrite: column name to overwrite existing row if value in
00615 column *overwrite* matches
00616 :type overwrite: :class:`str`
00617
00618 :raises: :class:`ValueError` if *list-like* object is used and number of
00619 items does *not* match number of columns in table.
00620
00621 :raises: :class:`ValueError` if *dict* is used and multiple rows are added
00622 but the number of data items is different for different columns.
00623
00624 **Example:** add multiple data rows to a subset of columns using a dictionary
00625
00626 .. code-block:: python
00627
00628 # create table with three float columns
00629 tab = Table(['x','y','z'], 'fff')
00630
00631 # add rows from dict
00632 data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
00633 tab.AddRow(data)
00634 print tab
00635
00636 '''
00637 will produce the table
00638
00639 ==== ==== ====
00640 x y z
00641 ==== ==== ====
00642 1.20 NA 1.60
00643 1.60 NA 5.30
00644 ==== ==== ====
00645 '''
00646
00647 # overwrite the row with x=1.2 and add row with x=1.9
00648 data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
00649 tab.AddRow(data, overwrite='x')
00650 print tab
00651
00652 '''
00653 will produce the table
00654
00655 ==== ==== ====
00656 x y z
00657 ==== ==== ====
00658 1.20 NA 7.90
00659 1.60 NA 5.30
00660 1.90 NA 3.50
00661 ==== ==== ====
00662 '''
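
    A row can also be added from a list-like object; for illustration:

    .. code-block:: python

      # values must be given in column order x, y, z
      tab.AddRow([2.1, 3.0, 4.4])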
00663 """
00664 if type(data)==dict:
00665 self._AddRowsFromDict(data, overwrite)
00666 else:
00667 if len(data)!=len(self.col_names):
00668 msg='data array must have %d elements, not %d'
00669 raise ValueError(msg % (len(self.col_names), len(data)))
00670 new_row = [self._Coerce(v, t) for v, t in zip(data, self.col_types)]
00671
00672
00673 if overwrite:
00674 overwrite_idx = self.GetColIndex(overwrite)
00675 added = False
00676 for i,r in enumerate(self.rows):
00677 if r[overwrite_idx]==new_row[overwrite_idx]:
00678 self.rows[i] = new_row
00679 added = True
00680 break
00681
00682
00683 if not overwrite or not added:
00684 self.rows.append(new_row)
00685
00686 def RemoveCol(self, col):
00687 """
00688 Remove column with the given name from the table.
00689
00690 :param col: name of column to remove
00691 :type col: :class:`str`
00692 """
00693 idx = self.GetColIndex(col)
00694 del self.col_names[idx]
00695 del self.col_types[idx]
00696 for row in self.rows:
00697 del row[idx]
00698
00699 def AddCol(self, col_name, col_type, data=None):
00700 """
00701 Add a column to the right of the table.
00702
00703 :param col_name: name of new column
00704 :type col_name: :class:`str`
00705
00706 :param col_type: type of new column (long versions: *int*, *float*, *bool*,
00707 *string* or short versions: *i*, *f*, *b*, *s*)
00708 :type col_type: :class:`str`
00709
00710 :param data: data to add to new column
00711 :type data: scalar or iterable
00712
00713 **Example:**
00714
00715 .. code-block:: python
00716
00717 tab = Table(['x'], 'f', x=range(5))
00718 tab.AddCol('even', 'bool', itertools.cycle([True, False]))
00719 print tab
00720
00721 '''
00722 will produce the table
00723
00724 ==== ====
00725 x even
00726 ==== ====
00727 0 True
00728 1 False
00729 2 True
00730 3 False
00731 4 True
00732 ==== ====
00733 '''
00734
If data is a constant instead of an iterable object, its value
00736 will be written into each row:
00737
00738 .. code-block:: python
00739
00740 tab = Table(['x'], 'f', x=range(5))
00741 tab.AddCol('num', 'i', 1)
00742 print tab
00743
00744 '''
00745 will produce the table
00746
00747 ==== ====
00748 x num
00749 ==== ====
00750 0 1
00751 1 1
00752 2 1
00753 3 1
00754 4 1
00755 ==== ====
00756 '''
00757
00758 As a special case, if there are no previous rows, and data is not
00759 None, rows are added for every item in data.
00760 """
00761
00762 if col_name in self.col_names:
00763 raise ValueError('Column with name %s already exists'%col_name)
00764
00765 col_type = self._ParseColTypes(col_type, exp_num=1)[0]
00766 self.col_names.append(col_name)
00767 self.col_types.append(col_type)
00768
00769 if len(self.rows)>0:
00770 if IsScalar(data):
00771 for row in self.rows:
00772 row.append(data)
00773 else:
00774 if hasattr(data, '__len__') and len(data)!=len(self.rows):
00775 self.col_names.pop()
00776 self.col_types.pop()
00777 raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
00778 'existing rows (%i)'%len(self.rows))
00779 for row, d in zip(self.rows, data):
00780 row.append(d)
00781
00782 elif data!=None and len(self.col_names)==1:
00783 if IsScalar(data):
00784 self.AddRow({col_name : data})
00785 else:
00786 for v in data:
00787 self.AddRow({col_name : v})
00788
00789 def Filter(self, *args, **kwargs):
00790 """
00791 Returns a filtered table only containing rows matching all the predicates
in kwargs and args. For example,
00793
00794 .. code-block:: python
00795
00796 tab.Filter(town='Basel')
00797
00798 will return all the rows where the value of the column "town" is equal to
00799 "Basel". Several predicates may be combined, i.e.
00800
00801 .. code-block:: python
00802
00803 tab.Filter(town='Basel', male=True)
00804
00805 will return the rows with "town" equal to "Basel" and "male" equal to true.
00806 args are unary callables returning true if the row should be included in the
00807 result and false if not.
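
    For illustration, a callable predicate can be combined with keyword
    predicates as follows (column names are arbitrary):

    .. code-block:: python

      # rows where the value of 'age' is above 18 and 'town' equals 'Basel';
      # the callable receives the raw row list, so look up the index first
      age_idx = tab.GetColIndex('age')
      filt_tab = tab.Filter(lambda row: row[age_idx]>18, town='Basel')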
00808 """
00809 filt_tab=Table(list(self.col_names), list(self.col_types))
00810 for row in self.rows:
00811 matches=True
00812 for func in args:
00813 if not func(row):
00814 matches=False
00815 break
00816 for key, val in kwargs.iteritems():
00817 if row[self.GetColIndex(key)]!=val:
00818 matches=False
00819 break
00820 if matches:
00821 filt_tab.AddRow(row)
00822 return filt_tab
00823
00824
00825 def Select(self, query):
00826
00827 """
00828 Returns a new table object containing all rows matching a logical query expression.
00829
*query* is a string containing the logical expression that will be evaluated
for every row.
00832
00833 Operands have to be the name of a column or an expression that can be parsed to
00834 float, int, bool or string.
00835 Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, *, /
00836
00837 .. code-block:: python
00838
00839 subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
00840
The selection query should be self-explanatory. Allowed parentheses are (), [] and {};
mismatched parentheses are detected. Expressions like '3<=col_a>=col_b'
throw an error, since the evaluation order cannot be determined.
00844
00845 There are two special expressions:
00846
00847 .. code-block:: python
00848
00849 #selects rows, where 1.0<=col_a<=1.5
00850 subtab = tab.Select('col_a=1.0:1.5')
00851
00852 #selects rows, where col_a=1 or col_a=2 or col_a=3
00853 subtab = tab.Select('col_a=1,2,3')
00854
00855 Only consistent types can be compared. If col_a is of type string and col_b is of type int,
the following expression would throw an error: 'col_a<col_b'
00857
00858 """
00859
00860 try:
00861 from table_selector import TableSelector
00862 except:
00863 raise ImportError("Tried to import from the file table_selector.py, but could not find it!")
00864
00865 selector=TableSelector(self.col_types, self.col_names, query)
00866
00867 selected_tab=Table(list(self.col_names), list(self.col_types))
00868
00869 for row in self.rows:
00870 if selector.EvaluateRow(row):
00871 selected_tab.AddRow(row)
00872
00873 return selected_tab
00874
00875
00876 @staticmethod
00877 def _LoadOST(stream_or_filename):
00878 fieldname_pattern=re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
00879 values_pattern=re.compile("([^\" ]+|\"[^\"]*\")+")
00880 if not hasattr(stream_or_filename, 'read'):
00881 stream=open(stream_or_filename, 'r')
00882 else:
00883 stream=stream_or_filename
00884 header=False
00885 num_lines=0
00886 for line in stream:
00887 line=line.strip()
00888 if line.startswith('#'):
00889 continue
00890 if len(line)==0:
00891 continue
00892 num_lines+=1
00893 if not header:
00894 fieldnames=[]
00895 fieldtypes=[]
00896 for col in line.split():
00897 match=fieldname_pattern.match(col)
00898 if match:
00899 if match.group('type'):
00900 fieldtypes.append(match.group('type'))
00901 else:
00902 fieldtypes.append('string')
00903 fieldnames.append(match.group('name'))
00904 tab=Table(fieldnames, fieldtypes)
00905 header=True
00906 continue
00907 tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
00908 if num_lines==0:
00909 raise IOError("Cannot read table from empty stream")
00910 return tab
00911
00912 def _GuessColumnTypes(self):
00913 for col_idx in range(len(self.col_names)):
00914 self.col_types[col_idx]=GuessColumnType(self[self.col_names[col_idx]])
00915 for row in self.rows:
00916 for idx in range(len(row)):
00917 row[idx]=self._Coerce(row[idx], self.col_types[idx])
00918
00919 @staticmethod
00920 def _LoadCSV(stream_or_filename, sep):
00921 if not hasattr(stream_or_filename, 'read'):
00922 stream=open(stream_or_filename, 'r')
00923 else:
00924 stream=stream_or_filename
00925 reader=csv.reader(stream, delimiter=sep)
00926 first=True
00927 for row in reader:
00928 if first:
00929 header=row
00930 types='s'*len(row)
00931 tab=Table(header, types)
00932 first=False
00933 else:
00934 tab.AddRow(row)
00935 if first:
00936 raise IOError('trying to load table from empty CSV stream/file')
00937
00938 tab._GuessColumnTypes()
00939 return tab
00940
00941 @staticmethod
00942 def _LoadPickle(stream_or_filename):
00943 if not hasattr(stream_or_filename, 'read'):
00944 stream=open(stream_or_filename, 'rb')
00945 else:
00946 stream=stream_or_filename
00947 return cPickle.load(stream)
00948
00949 @staticmethod
00950 def _GuessFormat(filename):
00951 try:
00952 filename = filename.name
00953 except AttributeError, e:
00954 pass
00955 if filename.endswith('.csv'):
00956 return 'csv'
00957 elif filename.endswith('.pickle'):
00958 return 'pickle'
00959 else:
00960 return 'ost'
00961
00962
00963 @staticmethod
00964 def Load(stream_or_filename, format='auto', sep=','):
00965 """
00966 Load table from stream or file with given name.
00967
00968 By default, the file format is set to *auto*, which tries to guess the file
00969 format from the file extension. The following file extensions are
00970 recognized:
00971
00972 ============ ======================
00973 extension recognized format
00974 ============ ======================
00975 .csv comma separated values
00976 .pickle pickled byte stream
00977 <all others> ost-specific format
00978 ============ ======================
00979
Thus, *format* must be specified when reading files with other filename
00981 extensions.
00982
00983 The following file formats are understood:
00984
00985 - ost
00986
00987 This is an ost-specific, but still human readable file format. The file
(stream) must start with a header line of the form
00989
00990 col_name1[type1] <col_name2[type2]>...
00991
00992 The types given in brackets must be one of the data types the
:class:`Table` class understands. Each following line in the file must then
contain exactly the same number of data items as listed in the header. The
00995 data items are automatically converted to the column format. Lines starting
00996 with a '#' and empty lines are ignored.
00997
00998 - pickle
00999
01000 Deserializes the table from a pickled byte stream.
01001
01002 - csv
01003
Reads the table from a comma separated values stream. Since there is no
01005 explicit type information in the csv file, the column types are guessed,
01006 using the following simple rules:
01007
01008 * if all values are either NA/NULL/NONE the type is set to string.
01009 * if all non-null values are convertible to float/int the type is set to
01010 float/int.
* if all non-null values are true/false/yes/no, the type is set to bool.
01012 * for all other cases, the column type is set to string.
01013
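    For example, to load a comma separated file whose extension is not
    recognized automatically (the filename is just an illustration):

    .. code-block:: python

      tab = Table.Load('data.txt', format='csv', sep=',')
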
01014 :returns: A new :class:`Table` instance
01015 """
01016 format=format.lower()
01017 if format=='auto':
01018 format = Table._GuessFormat(stream_or_filename)
01019
01020 if format=='ost':
01021 return Table._LoadOST(stream_or_filename)
01022 if format=='csv':
01023 return Table._LoadCSV(stream_or_filename, sep=sep)
01024 if format=='pickle':
01025 return Table._LoadPickle(stream_or_filename)
raise ValueError('unknown format "%s"' % format)
01027
01028 def Sort(self, by, order='+'):
01029 """
01030 Performs an in-place sort of the table, based on column *by*.
01031
01032 :param by: column name by which to sort
01033 :type by: :class:`str`
01034
01035 :param order: ascending (``-``) or descending (``+``) order
01036 :type order: :class:`str` (i.e. *+*, *-*)
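
    For example, assuming a column named 'x':

    .. code-block:: python

      tab.Sort('x', order='-')   # sort the table by the values in column 'x'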
01037 """
01038 sign=-1
01039 if order=='-':
01040 sign=1
01041 key_index=self.GetColIndex(by)
01042 def _key_cmp(lhs, rhs):
01043 return sign*cmp(lhs[key_index], rhs[key_index])
01044 self.rows=sorted(self.rows, _key_cmp)
01045
01046 def GetUnique(self, col, ignore_nan=True):
01047 """
01048 Extract a list of all unique values from one column.
01049
01050 :param col: column name
01051 :type col: :class:`str`
01052
01053 :param ignore_nan: ignore all *None* values
01054 :type ignore_nan: :class:`bool`
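
    For illustration (column name and values are arbitrary):

    .. code-block:: python

      tab = Table(['town'], 's', town=['Basel', 'Zurich', 'Basel', None])
      print tab.GetUnique('town')                    # ['Basel', 'Zurich']
      print tab.GetUnique('town', ignore_nan=False)  # ['Basel', 'Zurich', None]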
01055 """
01056 idx = self.GetColIndex(col)
01057 seen = {}
01058 result = []
01059 for row in self.rows:
01060 item = row[idx]
01061 if item!=None or ignore_nan==False:
01062 if item in seen: continue
01063 seen[item] = 1
01064 result.append(item)
01065 return result
01066
01067 def Zip(self, *args):
01068 """
Allows you to conveniently iterate over a selection of columns, e.g.
01070
01071 .. code-block:: python
01072
01073 tab = Table.Load('...')
01074 for col1, col2 in tab.Zip('col1', 'col2'):
01075 print col1, col2
01076
01077 is a shortcut for
01078
01079 .. code-block:: python
01080
01081 tab = Table.Load('...')
01082 for col1, col2 in zip(tab['col1'], tab['col2']):
01083 print col1, col2
01084 """
01085 return zip(*[self[arg] for arg in args])
01086
01087 def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
01088 z_title=None, x_range=None, y_range=None, z_range=None,
01089 color=None, plot_if=None, legend=None,
01090 num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
01091 labels=None, max_num_labels=None, title=None, clear=True, save=False,
01092 **kwargs):
01093 """
01094 Function to plot values from your table in 1, 2 or 3 dimensions using
01095 `Matplotlib <http://matplotlib.sourceforge.net>`__
01096
01097 :param x: column name for first dimension
01098 :type x: :class:`str`
01099
01100 :param y: column name for second dimension
01101 :type y: :class:`str`
01102
01103 :param z: column name for third dimension
01104 :type z: :class:`str`
01105
01106 :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
01107 complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
01108 :type style: :class:`str`
01109
01110 :param x_title: title for first dimension, if not specified it is
01111 automatically derived from column name
01112 :type x_title: :class:`str`
01113
01114 :param y_title: title for second dimension, if not specified it is
01115 automatically derived from column name
01116 :type y_title: :class:`str`
01117
01118 :param z_title: title for third dimension, if not specified it is
01119 automatically derived from column name
01120 :type z_title: :class:`str`
01121
01122 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01123 :type x_range: :class:`list` of length two
01124
01125 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01126 :type y_range: :class:`list` of length two
01127
01128 :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
01129 :type z_range: :class:`list` of length two
01130
01131 :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
01132 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
01133 :type color: :class:`str`
01134
:param plot_if: callable which returns *True* if the row should be plotted. It is
01136 invoked like ``plot_if(self, row)``
01137 :type plot_if: callable
01138
01139 :param legend: legend label for data series
01140 :type legend: :class:`str`
01141
01142 :param num_z_levels: number of levels for third dimension
01143 :type num_z_levels: :class:`int`
01144
01145 :param diag_line: draw diagonal line
01146 :type diag_line: :class:`bool`
01147
01148 :param labels: column name containing labels to put on x-axis for one
01149 dimensional plot
01150 :type labels: :class:`str`
01151
01152 :param max_num_labels: limit maximum number of labels
01153 :type max_num_labels: :class:`int`
01154
01155 :param title: plot title, if not specified it is automatically derived from
01156 plotted column names
01157 :type title: :class:`str`
01158
01159 :param clear: clear old data from plot
01160 :type clear: :class:`bool`
01161
01162 :param save: filename for saving plot
01163 :type save: :class:`str`
01164
01165 :param z_contour: draw contour lines
01166 :type z_contour: :class:`bool`
01167
01168 :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
01169 'linear')
01170 :type z_interpol: :class:`str`
01171
01172 :param \*\*kwargs: additional arguments passed to matplotlib
01173
01174 :returns: the ``matplotlib.pyplot`` module
01175
01176 **Examples:** simple plotting functions
01177
01178 .. code-block:: python
01179
01180 tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
01181 b=[x/2.0 for x in range(1,6)],
01182 c=[math.cos(x) for x in range(0,5)],
01183 d=range(3,8))
01184
01185 # one dimensional plot of column 'd' vs. index
01186 plt = tab.Plot('d')
01187 plt.show()
01188
01189 # two dimensional plot of 'a' vs. 'c'
01190 plt = tab.Plot('a', y='c', style='o-')
01191 plt.show()
01192
01193 # three dimensional plot of 'a' vs. 'c' with values 'b'
01194 plt = tab.Plot('a', y='c', z='b')
01195 # manually save plot to file
01196 plt.savefig("plot.png")
01197 """
01198 try:
01199 import matplotlib.pyplot as plt
01200 import matplotlib.mlab as mlab
01201 import numpy as np
01202 idx1 = self.GetColIndex(x)
01203 xs = []
01204 ys = []
01205 zs = []
01206
01207 if clear:
01208 plt.figure(figsize=[8, 6])
01209
01210 if x_title!=None:
01211 nice_x=x_title
01212 else:
01213 nice_x=MakeTitle(x)
01214
01215 if y_title!=None:
01216 nice_y=y_title
01217 else:
01218 if y:
01219 nice_y=MakeTitle(y)
01220 else:
01221 nice_y=None
01222
01223 if z_title!=None:
01224 nice_z = z_title
01225 else:
01226 if z:
01227 nice_z = MakeTitle(z)
01228 else:
01229 nice_z = None
01230
01231 if x_range and (IsScalar(x_range) or len(x_range)!=2):
01232 raise ValueError('parameter x_range must contain exactly two elements')
01233 if y_range and (IsScalar(y_range) or len(y_range)!=2):
01234 raise ValueError('parameter y_range must contain exactly two elements')
01235 if z_range and (IsScalar(z_range) or len(z_range)!=2):
01236 raise ValueError('parameter z_range must contain exactly two elements')
01237
01238 if color:
01239 kwargs['color']=color
01240 if legend:
01241 kwargs['label']=legend
01242 if y and z:
01243 idx3 = self.GetColIndex(z)
01244 idx2 = self.GetColIndex(y)
01245 for row in self.rows:
01246 if row[idx1]!=None and row[idx2]!=None and row[idx3]!=None:
01247 if plot_if and not plot_if(self, row):
01248 continue
01249 xs.append(row[idx1])
01250 ys.append(row[idx2])
01251 zs.append(row[idx3])
01252 levels = []
01253 if z_range:
01254 z_spacing = (z_range[1] - z_range[0]) / num_z_levels
01255 l = z_range[0]
01256 else:
01257 l = self.Min(z)
01258 z_spacing = (self.Max(z) - l) / num_z_levels
01259
01260 for i in range(0,num_z_levels+1):
01261 levels.append(l)
01262 l += z_spacing
01263
01264 xi = np.linspace(min(xs),max(xs),len(xs)*10)
01265 yi = np.linspace(min(ys),max(ys),len(ys)*10)
01266 zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
01267
01268 if z_contour:
01269 plt.contour(xi,yi,zi,levels,linewidths=0.5,colors='k')
01270
01271 plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
01272 plt.colorbar(ticks=levels)
01273
01274 elif y:
01275 idx2=self.GetColIndex(y)
01276 for row in self.rows:
01277 if row[idx1]!=None and row[idx2]!=None:
01278 if plot_if and not plot_if(self, row):
01279 continue
01280 xs.append(row[idx1])
01281 ys.append(row[idx2])
01282 plt.plot(xs, ys, style, **kwargs)
01283
01284 else:
01285 label_vals=[]
01286
01287 if labels:
01288 label_idx=self.GetColIndex(labels)
01289 for row in self.rows:
01290 if row[idx1]!=None:
01291 if plot_if and not plot_if(self, row):
01292 continue
01293 xs.append(row[idx1])
01294 if labels:
01295 label_vals.append(row[label_idx])
01296 plt.plot(xs, style, **kwargs)
01297 if labels:
01298 interval = 1
01299 if max_num_labels:
01300 if len(label_vals)>max_num_labels:
01301 interval = int(math.ceil(float(len(label_vals))/max_num_labels))
01302 label_vals = label_vals[::interval]
01303 plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
01304 size='x-small')
01305
01306 if title==None:
01307 if nice_z:
01308 title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
01309 elif nice_y:
01310 title = '%s vs. %s' % (nice_x, nice_y)
01311 else:
01312 title = nice_x
01313
01314 plt.title(title, size='x-large', fontweight='bold',
01315 verticalalignment='bottom')
01316
01317 if legend:
01318 plt.legend(loc=0)
01319
01320 if x and y:
01321 plt.xlabel(nice_x, size='x-large')
01322 if x_range:
01323 plt.xlim(x_range[0], x_range[1])
01324 if y_range:
01325 plt.ylim(y_range[0], y_range[1])
01326 if diag_line:
01327 plt.plot(x_range, y_range, '-', color='black')
01328
01329 plt.ylabel(nice_y, size='x-large')
01330 else:
01331 if y_range:
01332 plt.ylim(y_range[0], y_range[1])
01333 if x_title:
01334 plt.xlabel(x_title, size='x-large')
01335 plt.ylabel(nice_y, size='x-large')
01336 if save:
01337 plt.savefig(save)
01338 return plt
01339 except ImportError:
01340 LogError("Function needs numpy and matplotlib, but I could not import it.")
01341 raise
01342
01343 def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
01344 histtype='stepfilled', align='mid', x_title=None,
01345 y_title=None, title=None, clear=True, save=False,
01346 color=None, y_range=None):
01347 """
01348 Create a histogram of the data in col for the range *x_range*, split into
01349 *num_bins* bins and plot it using Matplotlib.
01350
01351 :param col: column name with data
01352 :type col: :class:`str`
01353
01354 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01355 :type x_range: :class:`list` of length two
01356
01357 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01358 :type y_range: :class:`list` of length two
01359
01360 :param num_bins: number of bins in range
01361 :type num_bins: :class:`int`
01362
01363 :param color: Color to be used for the histogram. If not set, color will be
01364 determined by matplotlib
01365 :type color: :class:`str`
01366
01367 :param normed: normalize histogram
01368 :type normed: :class:`bool`
01369
01370 :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
01371 *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
01372 :type histtype: :class:`str`
01373
01374 :param align: style of histogram (*left*, *mid*, *right*). See
01375 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
01376 :type align: :class:`str`
01377
01378 :param x_title: title for first dimension, if not specified it is
01379 automatically derived from column name
01380 :type x_title: :class:`str`
01381
01382 :param y_title: title for second dimension, if not specified it is
01383 automatically derived from column name
01384 :type y_title: :class:`str`
01385
01386 :param title: plot title, if not specified it is automatically derived from
01387 plotted column names
01388 :type title: :class:`str`
01389
01390 :param clear: clear old data from plot
01391 :type clear: :class:`bool`
01392
01393 :param save: filename for saving plot
01394 :type save: :class:`str`
01395
01396 **Examples:** simple plotting functions
01397
01398 .. code-block:: python
01399
01400 tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
01401
01402 # one dimensional plot of column 'd' vs. index
01403 plt = tab.PlotHistogram('a')
01404 plt.show()
01405
01406 """
01407 try:
01408 import matplotlib.pyplot as plt
01409 import numpy as np
01410
01411 if len(self.rows)==0:
01412 return None
01413 kwargs={}
01414 if color:
01415 kwargs['color']=color
01416 idx = self.GetColIndex(col)
01417 data = []
01418 for r in self.rows:
01419 if r[idx]!=None:
01420 data.append(r[idx])
01421
01422 if clear:
01423 plt.clf()
01424
01425 n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
01426 normed=normed, histtype=histtype, align=align,
01427 **kwargs)
01428
01429 if x_title!=None:
01430 nice_x=x_title
01431 else:
01432 nice_x=MakeTitle(col)
01433 plt.xlabel(nice_x, size='x-large')
01434 if y_range:
01435 plt.ylim(y_range)
01436 if y_title!=None:
01437 nice_y=y_title
01438 else:
01439 nice_y="bin count"
01440 plt.ylabel(nice_y, size='x-large')
01441
01442 if title!=None:
01443 nice_title=title
01444 else:
01445 nice_title="Histogram of %s"%nice_x
01446 plt.title(nice_title, size='x-large', fontweight='bold')
01447
01448 if save:
01449 plt.savefig(save)
01450 return plt
01451 except ImportError:
01452 LogError("Function needs numpy and matplotlib, but I could not import it.")
01453 raise
01454
01455 def _Max(self, col):
01456 if len(self.rows)==0:
01457 return None, None
01458 idx = self.GetColIndex(col)
01459 col_type = self.col_types[idx]
01460 if col_type=='int' or col_type=='float':
01461 max_val = -float('inf')
01462 elif col_type=='bool':
01463 max_val = False
01464 elif col_type=='string':
01465 max_val = chr(0)
01466 max_idx = None
01467 for i in range(0, len(self.rows)):
01468 if self.rows[i][idx]>max_val:
01469 max_val = self.rows[i][idx]
01470 max_idx = i
01471 return max_val, max_idx
01472
01473 def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None,
01474 colors=None, width=0.8, bottom=0, legend=False, legend_names=None, show=False, save=False):
01475
01476 """
Create a barplot of the data in cols. Every column will be represented
at one position. If there are several rows, the values of each column
will be grouped together at that position.
01480
01481 :param cols: List of column names. Every column will be represented as a
01482 single bar. If cols is None, every column of the table gets
01483 plotted.
01484 :type cols: :class:`list`
01485
01486 :param rows: List of row indices. Values from given rows will be plotted
01487 in parallel at one column position. If set to None, all rows
of the table will be plotted. Note that the maximum number
01489 of rows is 7.
01490 :type rows: :class:`list`
01491
01492 :param xlabels: Label for every col on x-axis. If set to None, the column
names are used. The xlabel plotting can be suppressed with
the parameter set_xlabels.
01495 :type xlabels: :class:`list`
01496
01497 :param set_xlabels: Controls whether xlabels are plotted or not.
01498 :type set_xlabels: :class:`bool`
01499
:param xlabels_rotation: Can either be 'horizontal', 'vertical' or an
integer describing the rotation in degrees.
01502
01503 :param y_title: Y-axis description
01504 :type y_title: :class:`str`
01505
:param title: Title of the plot. No title appears if set to None
01507 :type title: :class:`str`
01508
01509 :param colors: Colors of the different bars in each group. Must be a list
of valid colors in matplotlib. Length of colors and rows must
01511 be consistent.
01512 :type colors: :class:`list`
01513
01514 :param width: The available space for the groups on the x-axis is divided
by the exact number of groups. The parameter width is the
01516 fraction of what is actually used. If it would be 1.0 the
01517 bars of the different groups would touch each other.
01518 Value must be between [0;1]
01519 :type width: :class:`float`
01520
01521 :param bottom: Bottom
01522 :type bottom: :class:`float`
01523
:param legend: If set to True, a legend explaining which color corresponds
to which row is added; legend_names must then be provided.
01526 :type legend: :class:`bool`
01527
01528 :param legend_names: List of names, that describe the differently colored
01529 bars. Length must be consistent with number of rows.
01530
01531 :param show: If set to True, the plot is directly displayed.
01532
01533 :param save: If set, a png image with name save in the current working
01534 directory will be saved.
01535 :type save: :class:`str`
01536
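    **Example:** a minimal sketch (column names and values are arbitrary)

    .. code-block:: python

      tab = Table(['a','b','c'], 'fff', a=[1.0, 2.0], b=[0.5, 1.5], c=[2.0, 1.0])
      plt = tab.PlotBar(cols=['a','b','c'], legend=True,
                        legend_names=['row 0', 'row 1'])
      plt.show()
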
01537 """
01538 try:
01539 import numpy as np
01540 import matplotlib.pyplot as plt
01541 except:
raise ImportError('PlotBar relies on numpy and matplotlib, but I could ' \
'not import it!')
01544
01545 standard_colors=['b','g','y','c','m','r','k']
01546 data=[]
01547
01548 if cols==None:
01549 cols=self.col_names
01550
01551 if width<=0 or width>1:
01552 raise ValueError('Width must be in [0;1]')
01553
01554 if rows==None:
01555 if len(self.rows)>7:
01556 raise ValueError('Table contains too many rows to represent them at one '\
01557 'bar position in parallel. You can Select a Subtable or '\
01558 'specify the parameter rows with a list of row indices '\
01559 '(max 7)')
01560 else:
01561 rows=range(len(self.rows))
01562 else:
01563 if not isinstance(rows,list):
01564 rows=[rows]
01565 if len(rows)>7:
01566 raise ValueError('Too many rows to represent (max 7). Please note, that '\
01567 'data from multiple rows from one column gets '\
01568 'represented at one position in parallel.')
01569
01570 for r_idx in rows:
01571 row=self.rows[r_idx]
01572 temp=list()
01573 for c in cols:
01574 try:
01575 c_idx=self.GetColIndex(c)
01576 except:
01577 raise ValueError('Cannot find column with name '+str(c))
01578 temp.append(row[c_idx])
01579 data.append(temp)
01580
01581 if colors==None:
01582 colors=standard_colors[:len(rows)]
01583
01584 if len(rows)!=len(colors):
01585 raise ValueError("Number of rows and number of colors must be consistent!")
01586
01587 ind=np.arange(len(data[0]))
01588 single_bar_width=float(width)/len(data)
01589
01590 fig=plt.figure()
01591 ax=fig.add_subplot(111)
01592 legend_data=[]
01593
01594 for i in range(len(data)):
01595 legend_data.append(ax.bar(ind+i*single_bar_width+(1-width)/2,data[i],single_bar_width,bottom=bottom,color=colors[i])[0])
01596
01597 if title!=None:
01598 ax.set_title(title, size='x-large', fontweight='bold')
01599
01600 if y_title!=None:
01601 nice_y=y_title
01602 else:
01603 nice_y="value"
01604 ax.set_ylabel(nice_y)
01605
01606 if xlabels:
01607 if len(data[0])!=len(xlabels):
01608 raise ValueError('Number of xlabels is not consistent with number of cols!')
01609 else:
01610 xlabels=cols
01611
01612 if set_xlabels:
01613 ax.set_xticks(ind+0.5)
01614 ax.set_xticklabels(xlabels, rotation = xlabels_rotation)
01615 else:
01616 ax.set_xticks([])
01617
01618 if legend == True:
01619 if legend_names==None:
01620 raise ValueError('You must provide legend names! e.g. names for the rows, '\
01621 'that are printed in parallel.')
01622 if len(legend_names)!=len(data):
01623 raise ValueError('length of legend_names must be consistent with number '\
01624 'of plotted rows!')
01625 ax.legend(legend_data, legend_names)
01626
01627 if save:
01628 plt.savefig(save)
01629
01630 if show:
01631 plt.show()
01632
01633 return plt
01634
01635 def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
01636 colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False):
01637
01638 """
01639 Create a heatplot of the data in col x vs the data in col y using matplotlib
01640
01641 :param x: column name with x data
01642 :type x: :class:`str`
01643
01644 :param y: column name with y data
01645 :type y: :class:`str`
01646
01647 :param title: title of the plot, will be generated automatically if set to None
01648 :type title: :class:`str`
01649
01650 :param x_title: label of x-axis, will be generated automatically if set to None
:type x_title: :class:`str`
01652
01653 :param y_title: label of y-axis, will be generated automatically if set to None
:type y_title: :class:`str`
01655
01656 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01657 :type x_range: :class:`list` of length two
01658
01659 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01660 :type y_range: :class:`list` of length two
01661
01662 :param binning: type of binning. If set to None, the value of a hexbin will
01663 correspond to the number of datapoints falling into it. If
01664 set to 'log', the value will be the log with base 10 of the above
01665 value (log(i+1)). If an integer is provided, the number of a
hexbin is equal to the number of datapoints falling into it divided
01667 by the integer. If a list of values is provided, these values
01668 will be the lower bounds of the bins.
01669
:param colormap: colormap that will be used. Can be any colormap defined
in matplotlib or a custom colormap. You can either pass a
string with the name of the matplotlib colormap or a colormap
object.
01674
01675 :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
01676 :type show_scalebar: :class:`bool`
01677
01678 :param scalebar_label: Label of the scalebar
01679 :type scalebar_label: :class:`str`
01680
01681 :param clear: clear old data from plot
01682 :type clear: :class:`bool`
01683
01684 :param save: filename for saving plot
01685 :type save: :class:`str`
01686
01687 :param show: directly show plot
01688 :type show: :class:`bool`
01689
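    **Example:** a minimal sketch (column names and values are arbitrary)

    .. code-block:: python

      tab = Table(['d1','d2'], 'ff',
                  d1=[math.cos(x*0.01) for x in range(1000)],
                  d2=[math.sin(x*0.01) for x in range(1000)])
      plt = tab.PlotHexbin('d1', 'd2', binning='log', colormap='jet')
      plt.show()
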
01690 """
01691
01692 try:
01693 import matplotlib.pyplot as plt
01694 import matplotlib.cm as cm
01695 except:
01696 raise ImportError('PlotHexbin relies on matplotlib, but I could not import it')
01697
01698 idx=self.GetColIndex(x)
01699 idy=self.GetColIndex(y)
01700 xdata=[]
01701 ydata=[]
01702
01703 for r in self.rows:
01704 if r[idx]!=None and r[idy]!=None:
01705 xdata.append(r[idx])
01706 ydata.append(r[idy])
01707
01708 if clear:
01709 plt.clf()
01710
01711 if x_title!=None:
01712 nice_x=x_title
01713 else:
01714 nice_x=MakeTitle(x)
01715
01716 if y_title!=None:
01717 nice_y=y_title
01718 else:
01719 nice_y=MakeTitle(y)
01720
01721 if title==None:
01722 title = '%s vs. %s' % (nice_x, nice_y)
01723
01724 if IsStringLike(colormap):
01725 colormap=getattr(cm, colormap)
01726
01727 if x_range and (IsScalar(x_range) or len(x_range)!=2):
01728 raise ValueError('parameter x_range must contain exactly two elements')
01729 if y_range and (IsScalar(y_range) or len(y_range)!=2):
01730 raise ValueError('parameter y_range must contain exactly two elements')
01731
01732 ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
01733
01734 if x_range:
01735 plt.xlim((x_range[0], x_range[1]))
01736 ext[0]=x_range[0]
01737 ext[1]=x_range[1]
01738 if y_range:
01739 plt.ylim(y_range[0], y_range[1])
01740 ext[2]=y_range[0]
01741 ext[3]=y_range[1]
01742
01743
01744 plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
01745
01746 plt.title(title, size='x-large', fontweight='bold',
01747 verticalalignment='bottom')
01748
01749 plt.xlabel(nice_x)
01750 plt.ylabel(nice_y)
01751
01752 if show_scalebar:
01753 cb=plt.colorbar()
01754 if scalebar_label:
01755 cb.set_label(scalebar_label)
01756
01757 if save:
01758 plt.savefig(save)
01759
01760 if show:
01761 plt.show()
01762
01763 return plt
01764
01765 def MaxRow(self, col):
01766 """
01767 Returns the row containing the cell with the maximal value in col. If
01768 several rows have the highest value, only the first one is returned.
01769 ''None'' values are ignored.
01770
01771 :param col: column name
01772 :type col: :class:`str`
01773
01774 :returns: row with maximal col value or None if the table is empty
01775 """
01776 val, idx = self._Max(col)
01777 if idx!=None:
01778 return self.rows[idx]
01779
01780 def Max(self, col):
01781 """
01782 Returns the maximum value in col. If several rows have the highest value,
01783 only the first one is returned. ''None'' values are ignored.
01784
01785 :param col: column name
01786 :type col: :class:`str`
01787 """
01788 val, idx = self._Max(col)
01789 return val
01790
01791 def MaxIdx(self, col):
01792 """
01793 Returns the row index of the cell with the maximal value in col. If
01794 several rows have the highest value, only the first one is returned.
01795 ''None'' values are ignored.
01796
01797 :param col: column name
01798 :type col: :class:`str`
01799 """
01800 val, idx = self._Max(col)
01801 return idx
01802
01803 def _Min(self, col):
01804 if len(self.rows)==0:
01805 return None, None
01806 idx=self.GetColIndex(col)
01807 col_type = self.col_types[idx]
01808 if col_type=='int' or col_type=='float':
01809 min_val=float('inf')
01810 elif col_type=='bool':
01811 min_val=True
01812 elif col_type=='string':
01813 min_val=chr(255)
01814 min_idx=None
01815 for i,row in enumerate(self.rows):
01816 if row[idx]!=None and row[idx]<min_val:
01817 min_val=row[idx]
01818 min_idx=i
01819 return min_val, min_idx
01820
01821 def Min(self, col):
01822 """
01823 Returns the minimal value in col. If several rows have the lowest value,
01824 only the first one is returned. ''None'' values are ignored.
01825
01826 :param col: column name
01827 :type col: :class:`str`
01828 """
01829 val, idx = self._Min(col)
01830 return val
01831
01832 def MinRow(self, col):
01833 """
01834 Returns the row containing the cell with the minimal value in col. If
01835 several rows have the lowest value, only the first one is returned.
01836 ''None'' values are ignored.
01837
01838 :param col: column name
01839 :type col: :class:`str`
01840
01841 :returns: row with minimal col value or None if the table is empty
01842 """
01843 val, idx = self._Min(col)
01844 if idx!=None:
01845 return self.rows[idx]
01846
01847 def MinIdx(self, col):
01848 """
01849 Returns the row index of the cell with the minimal value in col. If
01850 several rows have the lowest value, only the first one is returned.
01851 ''None'' values are ignored.
01852
01853 :param col: column name
01854 :type col: :class:`str`
01855 """
01856 val, idx = self._Min(col)
01857 return idx
01858
01859 def Sum(self, col):
01860 """
01861 Returns the sum of the given column. Cells with ''None'' are ignored. Returns
01862 0.0, if the column doesn't contain any elements. Col must be of numeric
01863 column type ('float', 'int') or boolean column type.
01864
01865 :param col: column name
01866 :type col: :class:`str`
01867
01868 :raises: :class:`TypeError` if column type is ``string``
01869 """
01870 idx = self.GetColIndex(col)
01871 col_type = self.col_types[idx]
01872 if col_type!='int' and col_type!='float' and col_type!='bool':
01873 raise TypeError("Sum can only be used on numeric column types")
01874 s = 0.0
01875 for r in self.rows:
01876 if r[idx]!=None:
01877 s += r[idx]
01878 return s
01879
01880 def Mean(self, col):
01881 """
01882 Returns the mean of the given column. Cells with ''None'' are ignored. Returns
01883 None, if the column doesn't contain any elements. Col must be of numeric
01884 ('float', 'int') or boolean column type.
01885
If the column type is *bool*, the function returns the ratio of the number
of *True* values to the total number of elements.
01888
01889 :param col: column name
01890 :type col: :class:`str`
01891
01892 :raises: :class:`TypeError` if column type is ``string``
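
A minimal usage sketch (the table ``tab`` and its columns ``'x'`` and
``'passed'`` are placeholders used for illustration):

.. code-block:: python

  tab.Mean('x')       # arithmetic mean of the numeric column 'x'
  tab.Mean('passed')  # for a bool column: fraction of True values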
01893 """
01894 idx = self.GetColIndex(col)
01895 col_type = self.col_types[idx]
01896 if col_type!='int' and col_type!='float' and col_type!='bool':
01897 raise TypeError("Mean can only be used on numeric or bool column types")
01898
01899 vals=[]
01900 for v in self[col]:
01901 if v!=None:
01902 vals.append(v)
01903 try:
01904 return stutil.Mean(vals)
01905 except:
01906 return None
01907
01908 def RowMean(self, mean_col_name, cols):
01909 """
01910 Adds a new column of type 'float' with a specified name (*mean_col_name*),
01911 containing the mean of all specified columns for each row.
01912
01913 Cols are specified by their names and must be of numeric column
01914 type ('float', 'int') or boolean column type. Cells with None are ignored.
01915 Adds ''None'' if the row doesn't contain any values.
01916
01917 :param mean_col_name: name of new column containing mean values
01918 :type mean_col_name: :class:`str`
01919
01920 :param cols: name or list of names of columns to include in computation of
01921 mean
01922 :type cols: :class:`str` or :class:`list` of strings
01923
:raises: :class:`TypeError` if column type of columns in *cols* is ``string``
01925
01926 == Example ==
01927
Starting with the following table:
01929
01930 ==== ==== ====
01931 x y u
01932 ==== ==== ====
01933 1 10 100
01934 2 15 None
01935 3 20 400
01936 ==== ==== ====
01937
01938 the code here adds a column with the name 'mean' to yield the table below:
01939
.. code-block:: python
01941
01942 tab.RowMean('mean', ['x', 'u'])
01943
01944
01945 ==== ==== ==== =====
01946 x y u mean
01947 ==== ==== ==== =====
01948 1 10 100 50.5
01949 2 15 None 2
01950 3 20 400 201.5
01951 ==== ==== ==== =====
01952
01953 """
01954
01955 if IsScalar(cols):
01956 cols = [cols]
01957
01958 cols_idxs = []
01959 for col in cols:
01960 idx = self.GetColIndex(col)
01961 col_type = self.col_types[idx]
01962 if col_type!='int' and col_type!='float' and col_type!='bool':
01963 raise TypeError("RowMean can only be used on numeric column types")
01964 cols_idxs.append(idx)
01965
01966 mean_rows = []
01967 for row in self.rows:
01968 vals = []
01969 for idx in cols_idxs:
01970 v = row[idx]
01971 if v!=None:
01972 vals.append(v)
01973 try:
01974 mean = stutil.Mean(vals)
01975 mean_rows.append(mean)
01976 except:
01977 mean_rows.append(None)
01978
01979 self.AddCol(mean_col_name, 'f', mean_rows)
01980
01981 def Percentiles(self, col, nths):
01982 """
01983 Returns the percentiles of column *col* given in *nths*.
01984
01985 The percentiles are calculated as
01986
01987 .. code-block:: python
01988
values[min(len(values)-1, int(round(len(values)*p/100.0+0.5)-1))]
01990
where values are the sorted values of *col* not equal to ''None''.

:param col: column name
:type col: :class:`str`

:param nths: list of percentiles to be calculated. Each percentile is a
  number between 0 and 100.
01994
01995 :raises: :class:`TypeError` if column type is ``string``
01996 :returns: List of percentiles in the same order as given in *nths*
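
A minimal usage sketch (assuming a table ``tab`` with a numeric column
``'x'`` holding the values 1 to 10):

.. code-block:: python

  tab.Percentiles('x', [25, 50, 75])
  # with the formula above this yields [3, 6, 8]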
01997 """
01998 idx = self.GetColIndex(col)
01999 col_type = self.col_types[idx]
02000 if col_type!='int' and col_type!='float' and col_type!='bool':
raise TypeError("Percentiles can only be used on numeric column types")
02002
02003 for nth in nths:
02004 if nth < 0 or nth > 100:
02005 raise ValueError("percentiles must be between 0 and 100")
02006 vals=[]
02007 for v in self[col]:
02008 if v!=None:
02009 vals.append(v)
02010 vals=sorted(vals)
02011 if len(vals)==0:
02012 return [None]*len(nths)
02013 percentiles=[]
02014
02015 for nth in nths:
02016 p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
02017 percentiles.append(p)
02018 return percentiles
02019
02020 def Median(self, col):
02021 """
02022 Returns the median of the given column. Cells with ''None'' are ignored. Returns
02023 ''None'', if the column doesn't contain any elements. Col must be of numeric
02024 column type ('float', 'int') or boolean column type.
02025
02026 :param col: column name
02027 :type col: :class:`str`
02028
02029 :raises: :class:`TypeError` if column type is ``string``
02030 """
02031 idx = self.GetColIndex(col)
02032 col_type = self.col_types[idx]
02033 if col_type!='int' and col_type!='float' and col_type!='bool':
02034 raise TypeError("Median can only be used on numeric column types")
02035
02036 vals=[]
02037 for v in self[col]:
02038 if v!=None:
02039 vals.append(v)
02041 try:
02042 return stutil.Median(vals)
02043 except:
02044 return None
02045
02046 def StdDev(self, col):
02047 """
02048 Returns the standard deviation of the given column. Cells with ''None'' are
02049 ignored. Returns ''None'', if the column doesn't contain any elements. Col must
02050 be of numeric column type ('float', 'int') or boolean column type.
02051
02052 :param col: column name
02053 :type col: :class:`str`
02054
02055 :raises: :class:`TypeError` if column type is ``string``
02056 """
02057 idx = self.GetColIndex(col)
02058 col_type = self.col_types[idx]
02059 if col_type!='int' and col_type!='float' and col_type!='bool':
02060 raise TypeError("StdDev can only be used on numeric column types")
02061
02062 vals=[]
02063 for v in self[col]:
02064 if v!=None:
02065 vals.append(v)
02066 try:
02067 return stutil.StdDev(vals)
02068 except:
02069 return None
02070
02071 def Count(self, col, ignore_nan=True):
02072 """
Count the number of cells in column *col* that are not equal to ''None''.
02074
02075 :param col: column name
02076 :type col: :class:`str`
02077
02078 :param ignore_nan: ignore all *None* values
02079 :type ignore_nan: :class:`bool`
02080 """
02081 count=0
02082 idx=self.GetColIndex(col)
02083 for r in self.rows:
02084 if ignore_nan:
02085 if r[idx]!=None:
02086 count+=1
02087 else:
02088 count+=1
02089 return count
02090
02091 def Correl(self, col1, col2):
02092 """
02093 Calculate the Pearson correlation coefficient between *col1* and *col2*, only
02094 taking rows into account where both of the values are not equal to *None*.
02095 If there are not enough data points to calculate a correlation coefficient,
02096 *None* is returned.
02097
02098 :param col1: column name for first column
02099 :type col1: :class:`str`
02100
02101 :param col2: column name for second column
02102 :type col2: :class:`str`
02103 """
02104 if IsStringLike(col1) and IsStringLike(col2):
02105 col1 = self.GetColIndex(col1)
02106 col2 = self.GetColIndex(col2)
02107 vals1, vals2=([],[])
02108 for v1, v2 in zip(self[col1], self[col2]):
02109 if v1!=None and v2!=None:
02110 vals1.append(v1)
02111 vals2.append(v2)
02112 try:
02113 return stutil.Correl(vals1, vals2)
02114 except:
02115 return None
02116
02117 def SpearmanCorrel(self, col1, col2):
02118 """
02119 Calculate the Spearman correlation coefficient between col1 and col2, only
02120 taking rows into account where both of the values are not equal to None. If
02121 there are not enough data points to calculate a correlation coefficient,
02122 None is returned.
02123
02124 :warning: The function depends on the following module: *scipy.stats.mstats*
02125
02126 :param col1: column name for first column
02127 :type col1: :class:`str`
02128
02129 :param col2: column name for second column
02130 :type col2: :class:`str`
02131 """
02132 try:
02133 import scipy.stats.mstats
02134
02135 if IsStringLike(col1) and IsStringLike(col2):
02136 col1 = self.GetColIndex(col1)
02137 col2 = self.GetColIndex(col2)
02138 vals1, vals2=([],[])
02139 for v1, v2 in zip(self[col1], self[col2]):
02140 if v1!=None and v2!=None:
02141 vals1.append(v1)
02142 vals2.append(v2)
02143 try:
02144 correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
02145 if scipy.isnan(correl):
02146 return None
02147 return correl
02148 except:
02149 return None
02150
02151 except ImportError:
02152 LogError("Function needs scipy.stats.mstats, but I could not import it.")
02153 raise
02154
02155
02156 def Save(self, stream_or_filename, format='ost', sep=','):
02157 """
Save the table to stream or filename. The following file formats
02159 are supported (for more information on file formats, see :meth:`Load`):
02160
02161 ============= =======================================
02162 ost ost-specific format (human readable)
02163 csv comma separated values (human readable)
02164 pickle pickled byte stream (binary)
02165 html HTML table
02166 context ConTeXt table
02167 ============= =======================================
02168
02169 :param stream_or_filename: filename or stream for writing output
02170 :type stream_or_filename: :class:`str` or :class:`file`
02171
:param format: output format (*ost*, *csv*, *pickle*, *html* or *context*)
:type format: :class:`str`

:param sep: column separator used for the *csv* format
:type sep: :class:`str`
02174
02175 :raises: :class:`ValueError` if format is unknown
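
A minimal usage sketch (the filenames are placeholders):

.. code-block:: python

  tab.Save('data.tab')                         # 'ost' format (default)
  tab.Save('data.csv', format='csv', sep=';')  # CSV with custom separator
  tab.Save(open('data.html', 'w'), format='html')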
02176 """
02177 format=format.lower()
02178 if format=='ost':
02179 return self._SaveOST(stream_or_filename)
02180 if format=='csv':
02181 return self._SaveCSV(stream_or_filename, sep=sep)
02182 if format=='pickle':
02183 return self._SavePickle(stream_or_filename)
02184 if format=='html':
02185 return self._SaveHTML(stream_or_filename)
02186 if format=='context':
02187 return self._SaveContext(stream_or_filename)
02188 raise ValueError('unknown format "%s"' % format)
02189
02190 def _SavePickle(self, stream):
02191 if not hasattr(stream, 'write'):
02192 stream=open(stream, 'wb')
02193 cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
02194
02195 def _SaveHTML(self, stream_or_filename):
02196 def _escape(s):
return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
02198
02199 file_opened = False
02200 if not hasattr(stream_or_filename, 'write'):
02201 stream = open(stream_or_filename, 'w')
02202 file_opened = True
02203 else:
02204 stream = stream_or_filename
02205 stream.write('<table>')
02206 stream.write('<tr>')
02207 for col_name in self.col_names:
02208 stream.write('<th>%s</th>' % _escape(col_name))
02209 stream.write('</tr>')
02210 for row in self.rows:
02211 stream.write('<tr>')
02212 for i, col in enumerate(row):
02213 val = ''
02214 if col != None:
02215 if self.col_types[i] == 'float':
02216 val = '%.3f' % col
02217 elif self.col_types[i] == 'int':
02218 val = '%d' % col
02219 elif self.col_types[i] == 'bool':
02220 val = col and 'true' or 'false'
02221 else:
02222 val = str(col)
02223 stream.write('<td>%s</td>' % _escape(val))
02224 stream.write('</tr>')
02225 stream.write('</table>')
02226 if file_opened:
02227 stream.close()
02228 def _SaveContext(self, stream_or_filename):
02229 file_opened = False
02230 if not hasattr(stream_or_filename, 'write'):
02231 stream = open(stream_or_filename, 'w')
02232 file_opened = True
02233 else:
02234 stream = stream_or_filename
02235 stream.write('\\starttable[')
02236 for col_type in self.col_types:
02237 if col_type =='string':
02238 stream.write('l|')
02239 elif col_type=='int':
02240 stream.write('r|')
02241 elif col_type =='float':
02242 stream.write('i3r|')
02243 else:
02244 stream.write('l|')
02245 stream.write(']\n\\HL\n')
02246 for col_name in self.col_names:
02247 stream.write('\\NC \\bf %s' % col_name)
02248 stream.write(' \\AR\\HL\n')
02249 for row in self.rows:
02250 for i, col in enumerate(row):
02251 val = '---'
02252 if col != None:
02253 if self.col_types[i] == 'float':
02254 val = '%.3f' % col
02255 elif self.col_types[i] == 'int':
02256 val = '%d' % col
02257 elif self.col_types[i] == 'bool':
02258 val = col and 'true' or 'false'
02259 else:
02260 val = str(col)
02261 stream.write('\\NC %s' % val)
02262 stream.write(' \\AR\n')
02263 stream.write('\\HL\n')
02264 stream.write('\\stoptable')
02265 if file_opened:
02266 stream.close()
02267
02268 def _SaveCSV(self, stream, sep):
02269 if not hasattr(stream, 'write'):
02270 stream=open(stream, 'wb')
02271
02272 writer=csv.writer(stream, delimiter=sep)
02273 writer.writerow(['%s' % n for n in self.col_names])
02274 for row in self.rows:
02275 row=list(row)
02276 for i, c in enumerate(row):
02277 if c==None:
02278 row[i]='NA'
02279 writer.writerow(row)
02280
02281 def _SaveOST(self, stream):
02282 if hasattr(stream, 'write'):
02283 writer=csv.writer(stream, delimiter=' ')
02284 else:
02285 stream=open(stream, 'w')
02286 writer=csv.writer(stream, delimiter=' ')
02287 if self.comment:
02288 stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
02289 writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
02290 for row in self.rows:
02291 row=list(row)
02292 for i, c in enumerate(row):
02293 if c==None:
02294 row[i]='NA'
02295 writer.writerow(row)
02296
02297
02298 def GetNumpyMatrix(self, *args):
02299 '''
02300 Returns a numpy matrix containing the selected columns from the table as
02301 columns in the matrix.
02302
02303 Only columns of type *int* or *float* are supported. *NA* values in the
02304 table will be converted to *None* values.
02305
02306 :param \*args: column names to include in numpy matrix
02307
02308 :warning: The function depends on *numpy*
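
A minimal usage sketch (assuming numeric columns ``'x'`` and ``'y'``):

.. code-block:: python

  m = tab.GetNumpyMatrix('x', 'y')
  # m is an Nx2 numpy matrix; m[:,0] holds column 'x', m[:,1] column 'y'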
02309 '''
02310 try:
02311 import numpy as np
02312
02313 if len(args)==0:
02314 raise RuntimeError("At least one column must be specified.")
02315
02316 idxs = []
02317 for arg in args:
02318 idx = self.GetColIndex(arg)
02319 col_type = self.col_types[idx]
02320 if col_type!='int' and col_type!='float':
02321 raise TypeError("Numpy matrix can only be generated from numeric column types")
02322 idxs.append(idx)
02323 m = np.matrix([list(self[i]) for i in idxs])
02324 return m.T
02325
02326 except ImportError:
02327 LogError("Function needs numpy, but I could not import it.")
02328 raise
02329
02330
02331
02332 def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
02333
02334 '''
In-place Gaussian smoothing of a column in the table with a given standard
deviation. All *None* values are set to *na_value* before smoothing.
02337
02338 :param col: column name
02339 :type col: :class:`str`
02340
02341 :param std: standard deviation for gaussian kernel
02342 :type std: `scalar`
02343
:param na_value: all *None* values of the specified column are set to *na_value* before smoothing
02345 :type na_value: `scalar`
02346
:param padding: padding mode; see the scipy ``ndimage.gaussian_filter1d`` documentation for more information. Default is 'reflect'.
02348 :type padding: :class:`str`
02349
02350 :param c: constant value used for padding if padding mode is constant
02351 :type c: `scalar`
02352
02355 :warning: The function depends on *scipy*
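
A minimal usage sketch (the column name ``'signal'`` is a placeholder):

.. code-block:: python

  tab.GaussianSmooth('signal', std=2.0, na_value=0.0, padding='constant', c=0.0)
  # the column 'signal' now holds the smoothed values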
02356 '''
02357
02358 try:
02359 from scipy import ndimage
02360 import numpy as np
02361 except ImportError:
LogError("I need scipy.ndimage and numpy, but could not import them.")
02363 raise
02364
02365 idx = self.GetColIndex(col)
02366 col_type = self.col_types[idx]
02367 if col_type!='int' and col_type!='float':
02368 raise TypeError("GaussianSmooth can only be used on numeric column types")
02369
02370 vals=[]
02371 for v in self[col]:
02372 if v!=None:
02373 vals.append(v)
02374 else:
02375 vals.append(na_value)
02376
02377
02378 smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
02379
02380 result=[]
02381
02382 for v in smoothed_values_ndarray:
02383 result.append(v)
02384
02385 self[col]=result
02386
02387
02388 def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
02389 '''
02390 This returns the optimal prefactor values (i.e. a, b, c, ...) for the
02391 following equation
02392
02393 .. math::
02394 :label: op1
02395
02396 a*u + b*v + c*w + ... = z
02397
02398 where u, v, w and z are vectors. In matrix notation
02399
02400 .. math::
02401 :label: op2
02402
02403 A*p = z
02404
02405 where A contains the data from the table (u,v,w,...), p are the prefactors
02406 to optimize (a,b,c,...) and z is the vector containing the result of
02407 equation :eq:`op1`.
02408
The parameter *ref_col* corresponds to z in both equations, and \*args are
the columns u, v and w (or A in :eq:`op2`). All columns must be specified by
their names.
02411
02412 **Example:**
02413
02414 .. code-block:: python
02415
02416 tab.GetOptimalPrefactors('colC', 'colA', 'colB')
02417
The function returns a list containing the prefactors a, b, c, ... in the
same order as the columns were specified in \*args.
02420
02421 Weighting:
02422 If the kwarg weights="columX" is specified, the equations are weighted by
02423 the values in that column. Each row is multiplied by the weight in that row,
02424 which leads to :eq:`op3`:
02425
02426 .. math::
02427 :label: op3
02428
02429 weight*a*u + weight*b*v + weight*c*w + ... = weight*z
02430
02431 Weights must be float or int and can have any value. A value of 0 ignores
02432 this equation, a value of 1 means the same as no weight. If all weights are
02433 the same for each row, the same result will be obtained as with no weights.
02434
02435 **Example:**
02436
02437 .. code-block:: python
02438
02439 tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
02440
02441 '''
02442 try:
02443 import numpy as np
02444
02445 if len(args)==0:
02446 raise RuntimeError("At least one column must be specified.")
02447
02448 b = self.GetNumpyMatrix(ref_col)
02449 a = self.GetNumpyMatrix(*args)
02450
02451 if len(kwargs)!=0:
02452 if kwargs.has_key('weights'):
02453 w = self.GetNumpyMatrix(kwargs['weights'])
02454 b = np.multiply(b,w)
02455 a = np.multiply(a,w)
02456
02457 else:
raise RuntimeError("unrecognized keyword argument; only 'weights' is supported")
02459
02460 k = (a.T*a).I*a.T*b
02461 return list(np.array(k.T).reshape(-1))
02462
02463 except ImportError:
02464 LogError("Function needs numpy, but I could not import it.")
02465 raise
02466
02467 def PlotEnrichment(self, score_col, class_col, score_dir='-',
02468 class_dir='-', class_cutoff=2.0,
02469 style='-', title=None, x_title=None, y_title=None,
02470 clear=True, save=None):
02471 '''
02472 Plot an enrichment curve using matplotlib of column *score_col* classified
02473 according to *class_col*.
02474
02475 For more information about parameters of the enrichment, see
02476 :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
02477
02478 :warning: The function depends on *matplotlib*
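
A minimal usage sketch (the column names 'score' and 'rmsd' and the output
filename are placeholders):

.. code-block:: python

  tab.PlotEnrichment('score', 'rmsd', class_dir='-', class_cutoff=2.0,
                     save='enrichment.png')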
02479 '''
02480 try:
02481 import matplotlib.pyplot as plt
02482
02483 enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
02484 class_dir, class_cutoff)
02485
02486 if not title:
02487 title = 'Enrichment of %s'%score_col
02488
02489 if not x_title:
02490 x_title = '% database'
02491
02492 if not y_title:
02493 y_title = '% positives'
02494
02495 if clear:
02496 plt.clf()
02497
02498 plt.plot(enrx, enry, style)
02499
02500 plt.title(title, size='x-large', fontweight='bold')
02501 plt.ylabel(y_title, size='x-large')
02502 plt.xlabel(x_title, size='x-large')
02503
02504 if save:
02505 plt.savefig(save)
02506
02507 return plt
02508 except ImportError:
02509 LogError("Function needs matplotlib, but I could not import it.")
02510 raise
02511
02512 def ComputeEnrichment(self, score_col, class_col, score_dir='-',
02513 class_dir='-', class_cutoff=2.0):
02514 '''
02515 Computes the enrichment of column *score_col* classified according to
02516 *class_col*.
02517
02518 For this it is necessary, that the datapoints are classified into positive
02519 and negative points. This can be done in two ways:
02520
02521 - by using one 'bool' type column (*class_col*) which contains *True* for
02522 positives and *False* for negatives
02523
02524 - by specifying a classification column (*class_col*), a cutoff value
02525 (*class_cutoff*) and the classification columns direction (*class_dir*).
02526 This will generate the classification on the fly
02527
02528 * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
02529 * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
02530
During the calculation, the table will be sorted according to *score_dir*,
where a '-' value means smallest values first (i.e. the smaller the value,
the better).
02534
02535 :warning: If either the value of *class_col* or *score_col* is *None*, the
02536 data in this row is ignored.
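
A minimal usage sketch (the column names 'score' and 'rmsd' are
placeholders; rows with rmsd <= 2.0 are counted as positives):

.. code-block:: python

  enr = tab.ComputeEnrichment('score', 'rmsd', score_dir='-',
                              class_dir='-', class_cutoff=2.0)
  if enr:
    enrx, enry = enr  # fraction of database vs. fraction of positives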
02537 '''
02538
02539 ALLOWED_DIR = ['+','-']
02540
02541 score_idx = self.GetColIndex(score_col)
02542 score_type = self.col_types[score_idx]
02543 if score_type!='int' and score_type!='float':
02544 raise TypeError("Score column must be numeric type")
02545
02546 class_idx = self.GetColIndex(class_col)
02547 class_type = self.col_types[class_idx]
02548 if class_type!='int' and class_type!='float' and class_type!='bool':
02549 raise TypeError("Classifier column must be numeric or bool type")
02550
02551 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02552 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02553
02554 self.Sort(score_col, score_dir)
02555
02556 x = [0]
02557 y = [0]
02558 enr = 0
02559 old_score_val = None
02560 i = 0
02561
02562 for row in self.rows:
02563 class_val = row[class_idx]
02564 score_val = row[score_idx]
02565 if class_val==None or score_val==None:
02566 continue
02567 if class_val!=None:
02568 if old_score_val==None:
02569 old_score_val = score_val
02570 if score_val!=old_score_val:
02571 x.append(i)
02572 y.append(enr)
02573 old_score_val = score_val
02574 i+=1
02575 if class_type=='bool':
02576 if class_val==True:
02577 enr += 1
02578 else:
02579 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
02580 enr += 1
02581 x.append(i)
02582 y.append(enr)
02583
# if no data points or no positives are found, return None
02585 if x[-1]==0 or y[-1]==0:
02586 return None
02587
02588 x = [float(v)/x[-1] for v in x]
02589 y = [float(v)/y[-1] for v in y]
02590 return x,y
02591
02592 def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
02593 class_dir='-', class_cutoff=2.0):
02594 '''
02595 Computes the area under the curve of the enrichment using the trapezoidal
02596 rule.
02597
02598 For more information about parameters of the enrichment, see
02599 :meth:`ComputeEnrichment`.
02600
02601 :warning: The function depends on *numpy*
02602 '''
02603 try:
02604 import numpy as np
02605
02606 enr = self.ComputeEnrichment(score_col, class_col, score_dir,
02607 class_dir, class_cutoff)
02608
02609 if enr==None:
02610 return None
02611 return np.trapz(enr[1], enr[0])
02612 except ImportError:
02613 LogError("Function needs numpy, but I could not import it.")
02614 raise
02615
02616 def ComputeROC(self, score_col, class_col, score_dir='-',
02617 class_dir='-', class_cutoff=2.0):
02618 '''
02619 Computes the receiver operating characteristics (ROC) of column *score_col*
02620 classified according to *class_col*.
02621
02622 For this it is necessary, that the datapoints are classified into positive
02623 and negative points. This can be done in two ways:
02624
02625 - by using one 'bool' column (*class_col*) which contains True for positives
02626 and False for negatives
02627 - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
02628 and the classification columns direction (*class_dir*). This will generate
02629 the classification on the fly
02630
02631 - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
02632 - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
02633
During the calculation, the table will be sorted according to *score_dir*,
where a '-' value means smallest values first (i.e. the smaller the value,
the better).
02637
If *class_col* does not contain any positives, i.e. no bool value is True
or no value passes the cutoff (depending on *class_dir* and *class_cutoff*),
the ROC is not defined and the function will return *None*.
02642
02643 :warning: If either the value of *class_col* or *score_col* is *None*, the
02644 data in this row is ignored.
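
A minimal usage sketch (the bool column 'active' is a placeholder marking
the positives):

.. code-block:: python

  roc = tab.ComputeROC('score', 'active')
  if roc:
    fpr, tpr = roc  # false positive rate vs. true positive rate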
02645 '''
02646
02647 ALLOWED_DIR = ['+','-']
02648
02649 score_idx = self.GetColIndex(score_col)
02650 score_type = self.col_types[score_idx]
02651 if score_type!='int' and score_type!='float':
02652 raise TypeError("Score column must be numeric type")
02653
02654 class_idx = self.GetColIndex(class_col)
02655 class_type = self.col_types[class_idx]
02656 if class_type!='int' and class_type!='float' and class_type!='bool':
02657 raise TypeError("Classifier column must be numeric or bool type")
02658
02659 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02660 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02661
02662 self.Sort(score_col, score_dir)
02663
02664 x = [0]
02665 y = [0]
02666 tp = 0
02667 fp = 0
02668 old_score_val = None
02669
02670 for i,row in enumerate(self.rows):
02671 class_val = row[class_idx]
02672 score_val = row[score_idx]
02673 if class_val==None or score_val==None:
02674 continue
02675 if class_val!=None:
02676 if old_score_val==None:
02677 old_score_val = score_val
02678 if score_val!=old_score_val:
02679 x.append(fp)
02680 y.append(tp)
02681 old_score_val = score_val
02682 if class_type=='bool':
02683 if class_val==True:
02684 tp += 1
02685 else:
02686 fp += 1
02687 else:
02688 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
02689 tp += 1
02690 else:
02691 fp += 1
02692 x.append(fp)
02693 y.append(tp)
02694
# if no false positives or no true positives are found, return None
02696 if x[-1]==0 or y[-1]==0:
02697 return None
02698
02699 x = [float(v)/x[-1] for v in x]
02700 y = [float(v)/y[-1] for v in y]
02701 return x,y
02702
02703 def ComputeROCAUC(self, score_col, class_col, score_dir='-',
02704 class_dir='-', class_cutoff=2.0):
02705 '''
02706 Computes the area under the curve of the receiver operating characteristics
02707 using the trapezoidal rule.
02708
02709 For more information about parameters of the ROC, see
02710 :meth:`ComputeROC`.
02711
02712 :warning: The function depends on *numpy*
02713 '''
02714 try:
02715 import numpy as np
02716
02717 roc = self.ComputeROC(score_col, class_col, score_dir,
02718 class_dir, class_cutoff)
02719
02720 if not roc:
02721 return None
02722 return np.trapz(roc[1], roc[0])
02723 except ImportError:
02724 LogError("Function needs numpy, but I could not import it.")
02725 raise
02726
02727 def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
02728 class_dir='-', class_cutoff=2.0):
02729 '''
02730 Computes the area under the curve of the log receiver operating
02731 characteristics (logROC) where the x-axis is semilogarithmic
02732 using the trapezoidal rule.
02733
02734 The logROC is computed with a lambda of 0.001 according to
02735 Rapid Context-Dependent Ligand Desolvation in Molecular Docking
02736 Mysinger M. and Shoichet B., Journal of Chemical Information and Modeling
02737 2010 50 (9), 1561-1573
02738
02739 For more information about parameters of the ROC, see
02740 :meth:`ComputeROC`.
02741
02742 :warning: The function depends on *numpy*
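
In terms of the ROC curve y(x) (true positive rate as a function of the
false positive rate), the value computed here corresponds to

.. math::

  logAUC = \int_{\lambda}^{1} y(x) \; d(\log_{10} x) \; / \; \log_{10}(1/\lambda)

with :math:`\lambda=0.001`; x-values of 0 are replaced by :math:`\lambda`
before integration.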
02743 '''
02744 try:
02745 import numpy as np
02746
02747 roc = self.ComputeROC(score_col, class_col, score_dir,
02748 class_dir, class_cutoff)
02749
02750 if not roc:
02751 return None
02752
02753 rocxt, rocyt = roc
02754 rocx=[]
02755 rocy=[]
02756
02757 # define lambda
02758 l=0.001
02759
# replace x-values of 0 by lambda, then remove duplicate x-values
rocxt = [x if x>0 else l for x in rocxt]
02762 for i in range(len(rocxt)-1):
02763 if rocxt[i]==rocxt[i+1]:
02764 continue
02765 rocx.append(rocxt[i])
02766 rocy.append(rocyt[i])
02767 rocx.append(1.0)
02768 rocy.append(1.0)
02769
02770 # compute logauc
02771 value = 0
02772 for i in range(len(rocx)-1):
02774 if rocx[i]==rocx[i+1]:
02775 continue
02776 b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
02777 value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
02778 return value/math.log10(1.0/l)
02779
02780 except ImportError:
02781 LogError("Function needs numpy, but I could not import it.")
02782 raise
02783
02784 def PlotROC(self, score_col, class_col, score_dir='-',
02785 class_dir='-', class_cutoff=2.0,
02786 style='-', title=None, x_title=None, y_title=None,
02787 clear=True, save=None):
02788 '''
02789 Plot an ROC curve using matplotlib.
02790
02791 For more information about parameters of the ROC, see
02792 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
02793
02794 :warning: The function depends on *matplotlib*
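
A minimal usage sketch (column names and filename are placeholders):

.. code-block:: python

  plt = tab.PlotROC('score', 'active', save='roc.png')
  if plt:
    plt.show()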
02795 '''
02796
02797 try:
02798 import matplotlib.pyplot as plt
02799
02800 roc = self.ComputeROC(score_col, class_col, score_dir,
02801 class_dir, class_cutoff)
02802
02803 if not roc:
02804 return None
02805
02806 enrx, enry = roc
02807
02808 if not title:
02809 title = 'ROC of %s'%score_col
02810
02811 if not x_title:
02812 x_title = 'false positive rate'
02813
02814 if not y_title:
02815 y_title = 'true positive rate'
02816
02817 if clear:
02818 plt.clf()
02819
02820 plt.plot(enrx, enry, style)
02821
02822 plt.title(title, size='x-large', fontweight='bold')
02823 plt.ylabel(y_title, size='x-large')
02824 plt.xlabel(x_title, size='x-large')
02825
02826 if save:
02827 plt.savefig(save)
02828
02829 return plt
02830 except ImportError:
02831 LogError("Function needs matplotlib, but I could not import it.")
02832 raise
02833
02834 def PlotLogROC(self, score_col, class_col, score_dir='-',
02835 class_dir='-', class_cutoff=2.0,
02836 style='-', title=None, x_title=None, y_title=None,
02837 clear=True, save=None):
02838 '''
Plot a logROC curve where the x-axis is semilogarithmic using matplotlib.
02840
02841 For more information about parameters of the ROC, see
02842 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
02843
02844 :warning: The function depends on *matplotlib*
02845 '''
02846
02847 try:
02848 import matplotlib.pyplot as plt
02849
02850 roc = self.ComputeROC(score_col, class_col, score_dir,
02851 class_dir, class_cutoff)
02852
02853 if not roc:
02854 return None
02855
02856 rocx, rocy = roc
02857
02858 if not title:
02859 title = 'logROC of %s'%score_col
02860
02861 if not x_title:
02862 x_title = 'false positive rate'
02863
02864 if not y_title:
02865 y_title = 'true positive rate'
02866
02867 if clear:
02868 plt.clf()
02869
02870 rocx = [x if x>0 else 0.001 for x in rocx]
02871
02872
02873 plt.plot(rocx, rocy, style)
02874
02875 plt.title(title, size='x-large', fontweight='bold')
02876 plt.ylabel(y_title, size='x-large')
02877 plt.xlabel(x_title, size='x-large')
02878
02879 plt.xscale('log', basex=10)
02880 plt.xlim(0.001, 1.0)
02881
02882
02883 if save:
02884 plt.savefig(save)
02885
02886 return plt
02887 except ImportError:
02888 LogError("Function needs matplotlib, but I could not import it.")
02889 raise
02890
02891 def ComputeMCC(self, score_col, class_col, score_dir='-',
02892 class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
02893 '''
02894 Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
02895 with the points classified into true positives, false positives, true
02896 negatives and false negatives according to a specified classification
02897 column (*class_col*).
02898
02899 The datapoints in *score_col* and *class_col* are classified into
02900 positive and negative points. This can be done in two ways:
02901
02902 - by using 'bool' columns which contains True for positives and False
02903 for negatives
02904
02905 - by using 'float' or 'int' columns and specifying a cutoff value and the
02906 columns direction. This will generate the classification on the fly
02907
02908 * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
02909 * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
02910
02911 The two possibilities can be used together, i.e. 'bool' type for one column
02912 and 'float'/'int' type and cutoff/direction for the other column.
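
With the counts of true positives (tp), false positives (fp), true
negatives (tn) and false negatives (fn) obtained from this classification,
the returned value is

.. math::

  MCC = (tp \cdot tn - fp \cdot fn) \; / \;
        \sqrt{(tp+fn)(tp+fp)(tn+fn)(tn+fp)}

If any of the four factors in the denominator is zero, the MCC is not
defined and *None* is returned.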
02913 '''
02914 ALLOWED_DIR = ['+','-']
02915
02916 score_idx = self.GetColIndex(score_col)
02917 score_type = self.col_types[score_idx]
02918 if score_type!='int' and score_type!='float' and score_type!='bool':
02919 raise TypeError("Score column must be numeric or bool type")
02920
02921 class_idx = self.GetColIndex(class_col)
02922 class_type = self.col_types[class_idx]
02923 if class_type!='int' and class_type!='float' and class_type!='bool':
02924 raise TypeError("Classifier column must be numeric or bool type")
02925
02926 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02927 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02928
02929 tp = 0
02930 fp = 0
02931 fn = 0
02932 tn = 0
02933
02934 for i,row in enumerate(self.rows):
02935 class_val = row[class_idx]
02936 score_val = row[score_idx]
02937 if class_val!=None:
02938 if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
02939 if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
02940 tp += 1
02941 else:
02942 fn += 1
02943 else:
02944 if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
02945 tn += 1
02946 else:
02947 fp += 1
02948
02949 mcc = None
02950 msg = None
02951 if (tp+fn)==0:
02952 msg = 'factor (tp + fn) is zero'
02953 elif (tp+fp)==0:
02954 msg = 'factor (tp + fp) is zero'
02955 elif (tn+fn)==0:
02956 msg = 'factor (tn + fn) is zero'
02957 elif (tn+fp)==0:
02958 msg = 'factor (tn + fp) is zero'
02959
02960 if msg:
02961 LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
02962 else:
02963 mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
02964 return mcc
02965
02966
02967 def IsEmpty(self, col_name=None, ignore_nan=True):
02968 '''
02969 Checks if a table is empty.
02970
02971 If no column name is specified, the whole table is checked for being empty,
02972 whereas if a column name is specified, only this column is checked.
02973
02974 By default, all NAN (or None) values are ignored, and thus, a table
02975 containing only NAN values is considered as empty. By specifying the
02976 option ignore_nan=False, NAN values are counted as 'normal' values.
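
A minimal usage sketch (the column name 'x' is a placeholder):

.. code-block:: python

  tab.IsEmpty()                       # whole table
  tab.IsEmpty('x')                    # only column 'x'
  tab.IsEmpty('x', ignore_nan=False)  # count None cells as values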
02977 '''
02978
02979 # table with no columns and no rows
02980 if len(self.col_names)==0:
02981 if col_name:
02982 raise ValueError('Table has no column named "%s"' % col_name)
02983 return True
02984
02985 # column name specified
02986 if col_name:
02987 if self.Count(col_name, ignore_nan=ignore_nan)==0:
02988 return True
02989 else:
02990 return False
02991
02992 # no column name specified -> test whole table
02993 else:
02994 for row in self.rows:
02995 for cell in row:
02996 if ignore_nan:
02997 if cell!=None:
02998 return False
02999 else:
03000 return False
03001 return True
03002
03003
03004 def Extend(self, tab, overwrite=None):
03005 """
03006 Append each row of *tab* to the current table. The data is appended based
03007 on the column names, thus the order of the table columns is *not* relevant,
03008 only the header names.
03009
03010 If there is a column in *tab* that is not present in the current table,
03011 it is added to the current table and filled with *None* for all the rows
03012 present in the current table.
03013
03014 If the type of any column in *tab* is not the same as in the current table
03015 a *TypeError* is raised.
03016
If *overwrite* is not None and set to an existing column name, the rows of
*tab* are matched against the current table on that column: for each row of
*tab*, the current table is searched for the first row with the same value
in the *overwrite* column. If such a row is found, it is overwritten with
the new data; otherwise a new row is appended to the table.
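
A minimal usage sketch (``tab1`` and ``tab2`` and the shared column 'name'
are placeholders):

.. code-block:: python

  tab1.Extend(tab2)                    # append all rows of tab2
  tab1.Extend(tab2, overwrite='name')  # overwrite rows with matching 'name'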
03022 """
03023 # add column to current table if it doesn't exist
03024 for name,typ in zip(tab.col_names, tab.col_types):
03025 if not name in self.col_names:
03026 self.AddCol(name, typ)
03027
03028 # check that column types are the same in current and new table
03029 for name in self.col_names:
03030 if name in tab.col_names:
03031 curr_type = self.col_types[self.GetColIndex(name)]
03032 new_type = tab.col_types[tab.GetColIndex(name)]
03033 if curr_type!=new_type:
03034 raise TypeError('cannot extend table, column %s in new '%name +\
03035 'table different type (%s) than in '%new_type +\
03036 'current table (%s)'%curr_type)
03037
03038 num_rows = len(tab.rows)
03039 for i in range(0,num_rows):
03040 row = tab.rows[i]
03041 data = dict(zip(tab.col_names,row))
03042 self.AddRow(data, overwrite)
03043
03044
03045 def Merge(table1, table2, by, only_matching=False):
03046 """
03047 Returns a new table containing the data from both tables. The rows are
03048 combined based on the common values in the column(s) by. The option 'by' can
03049 be a list of column names. When this is the case, merging is based on
03050 multiple columns.
03051 For example, the two tables below
03052
03053 ==== ====
03054 x y
03055 ==== ====
03056 1 10
03057 2 15
03058 3 20
03059 ==== ====
03060
03061 ==== ====
03062 x u
03063 ==== ====
03064 1 100
03065 3 200
03066 4 400
03067 ==== ====
03068
03069 when merged by column x, produce the following output:
03070
03071 ===== ===== =====
03072 x y u
03073 ===== ===== =====
03074 1 10 100
03075 2 15 None
03076 3 20 200
03077 4 None 400
03078 ===== ===== =====
03079
03080
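A sketch of the corresponding call (``tab1`` and ``tab2`` hold the two
tables above):

.. code-block:: python

  merged = Merge(tab1, tab2, by='x')
  # keep only rows present in both tables:
  matched_only = Merge(tab1, tab2, by='x', only_matching=True)
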
03081 """
03082 def _key(row, indices):
03083 return tuple([row[i] for i in indices])
03084 def _keep(indices, cn, ct, ni):
03085 ncn, nct, nni=([],[],[])
03086 for i in range(len(cn)):
03087 if i not in indices:
03088 ncn.append(cn[i])
03089 nct.append(ct[i])
03090 nni.append(ni[i])
03091 return ncn, nct, nni
03092 col_names=list(table2.col_names)
03093 col_types=list(table2.col_types)
03094 new_index=[i for i in range(len(col_names))]
03095 if isinstance(by, str):
03096 common2_indices=[col_names.index(by)]
03097 else:
03098 common2_indices=[col_names.index(b) for b in by]
03099 col_names, col_types, new_index=_keep(common2_indices, col_names,
03100 col_types, new_index)
03101
03102 for i, name in enumerate(col_names):
03103 try_name=name
03104 counter=1
03105 while try_name in table1.col_names:
03106 counter+=1
03107 try_name='%s_%d' % (name, counter)
03108 col_names[i]=try_name
03109 common1={}
03110 if isinstance(by, str):
03111 common1_indices=[table1.col_names.index(by)]
03112 else:
03113 common1_indices=[table1.col_names.index(b) for b in by]
03114 for row in table1.rows:
03115 key=_key(row, common1_indices)
03116 if key in common1:
raise ValueError('duplicate key "%s" in first table' % (str(key)))
03118 common1[key]=row
03119 common2={}
03120 for row in table2.rows:
03121 key=_key(row, common2_indices)
03122 if key in common2:
03123 raise ValueError('duplicate key "%s" in second table' % (str(key)))
03124 common2[key]=row
03125 new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
03126 for k, v in common1.iteritems():
03127 row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
03128 matched=False
03129 if k in common2:
03130 matched=True
03131 row2=common2[k]
03132 for i, index in enumerate(new_index):
03133 row[len(table1.col_names)+i]=row2[index]
03134 if only_matching and not matched:
03135 continue
03136 new_tab.AddRow(row)
03137 if only_matching:
03138 return new_tab
03139 for k, v in common2.iteritems():
03140 if not k in common1:
03141 v2=[v[i] for i in new_index]
03142 row=[None for i in range(len(table1.col_names))]+v2
03143 for common1_index, common2_index in zip(common1_indices, common2_indices):
03144 row[common1_index]=v[common2_index]
03145 new_tab.AddRow(row)
03146 return new_tab
03147