00001 import csv
00002 import re
00003 import math
00004 from ost import stutil
00005 import itertools
00006 import operator
00007 import cPickle
00008 import weakref
00009 from ost import LogError, LogWarning, LogInfo, LogVerbose
00010
00011 def MakeTitle(col_name):
00012 return col_name.replace('_', ' ')
00013
00014 def IsStringLike(value):
00015 if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
00016 return False
00017 try:
00018 value+''
00019 return True
00020 except:
00021 return False
00022
00023 def IsNullString(value):
00024 value=value.strip().upper()
00025 return value in ('', 'NULL', 'NONE', 'NA')
00026
00027 def IsScalar(value):
00028 if IsStringLike(value):
00029 return True
00030 try:
00031 if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
00032 return False
00033 iter(value)
00034 return False
00035 except:
00036 return True
00037
00038 def GuessColumnType(iterator):
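  """
  Guess a column type ('bool', 'int', 'float' or 'string') for the values
  yielded by *iterator*. Null-like strings (see :func:`IsNullString`) are
  skipped; if all values are null-like, or no numeric/boolean interpretation
  fits every value, 'string' is returned.
  """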
00039 empty=True
00040 possibilities=set(['bool', 'int', 'float'])
00041 for ele in iterator:
00042 str_ele=str(ele).upper()
00043 if IsNullString(str_ele):
00044 continue
00045 empty=False
00046 if 'int' in possibilities:
00047 try:
00048 int(str_ele)
00049 except ValueError:
00050 possibilities.remove('int')
00051
00052 if 'float' in possibilities:
00053 try:
00054 float(str_ele)
00055 except ValueError:
00056 possibilities.remove('float')
00057 if 'bool' in possibilities:
00058 if str_ele not in set(['YES', 'NO', 'TRUE', 'FALSE']):
00059 possibilities.remove('bool')
00060
00061 if len(possibilities)==0:
00062 return 'string'
00063 if len(possibilities)==2:
00064 return 'int'
00065 if empty:
00066 return 'string'
00067
00068 return possibilities.pop()
00069
00070 class BinaryColExpr:
00071 def __init__(self, op, lhs, rhs):
00072 self.op=op
00073 self.lhs=lhs
00074 self.rhs=rhs
00075 if IsScalar(lhs):
00076 self.lhs=itertools.cycle([self.lhs])
00077 if IsScalar(rhs):
00078 self.rhs=itertools.cycle([self.rhs])
00079 def __iter__(self):
00080 for l, r in zip(self.lhs, self.rhs):
00081 if l!=None and r!=None:
00082 yield self.op(l, r)
00083 else:
00084 yield None
00085 def __add__(self, rhs):
00086 return BinaryColExpr(operator.add, self, rhs)
00087
00088 def __sub__(self, rhs):
00089 return BinaryColExpr(operator.sub, self, rhs)
00090
00091 def __mul__(self, rhs):
00092 return BinaryColExpr(operator.mul, self, rhs)
00093
00094 def __div__(self, rhs):
00095 return BinaryColExpr(operator.div, self, rhs)
00096
00097 class TableCol:
00098 def __init__(self, table, col):
00099 self._table=table
00100 if type(col)==str:
00101 self.col_index=self._table.GetColIndex(col)
00102 else:
00103 self.col_index=col
00104
00105 def __iter__(self):
00106 for row in self._table.rows:
00107 yield row[self.col_index]
00108
00109 def __len__(self):
00110 return len(self._table.rows)
00111
00112 def __getitem__(self, index):
00113 return self._table.rows[index][self.col_index]
00114
00115 def __setitem__(self, index, value):
00116 self._table.rows[index][self.col_index]=value
00117
00118 def __add__(self, rhs):
00119 return BinaryColExpr(operator.add, self, rhs)
00120
00121 def __sub__(self, rhs):
00122 return BinaryColExpr(operator.sub, self, rhs)
00123
00124 def __mul__(self, rhs):
00125 return BinaryColExpr(operator.mul, self, rhs)
00126
00127 def __div__(self, rhs):
00128 return BinaryColExpr(operator.div, self, rhs)
00129
00130 class TableRow:
00131 """
00132 Essentially a named tuple, but allows column names that are not valid
00133 python variable names.
00134 """
00135 def __init__(self, row_data, tab):
00136 self.__dict__['tab'] = weakref.proxy(tab)
00137 self.__dict__['row_data'] = row_data
00138
00139 def __getitem__(self, col_name):
00140 if type(col_name)==int:
00141 return self.row_data[col_name]
00142 return self.row_data[self.tab.GetColIndex(col_name)]
00143
00144 def __str__(self):
00145 s = []
00146 for k, v in zip(self.__dict__['tab'].col_names, self.__dict__['row_data']):
00147 s.append('%s=%s' % (k, str(v)))
00148 return ', '.join(s)
00149
00150
00151 def __len__(self):
00152 return len(self.row_data)
00153
00154 def __setitem__(self, col_name, val):
00155 if type(col_name)==int:
00156 self.row_data[col_name] = val
00157 else:
00158 self.row_data[self.tab.GetColIndex(col_name)] = val
00159
00160 def __getattr__(self, col_name):
00161 if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
00162 raise AttributeError(col_name)
00163 return self.row_data[self.tab.GetColIndex(col_name)]
00164
00165 def __setattr__(self, col_name, val):
00166 if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
00167 raise AttributeError(col_name)
00168 self.row_data[self.tab.GetColIndex(col_name)] = val
00169
00170 class Table(object):
00171 """
00172
00173 The table class provides convenient access to data in tabular form. An empty
00174 table can be easily constructed as follows
00175
00176 .. code-block:: python
00177
00178 tab = Table()
00179
00180 If you want to add columns directly when creating the table, column names
00181 and *column types* can be specified as follows
00182
00183 .. code-block:: python
00184
00185 tab = Table(['nameX','nameY','nameZ'], 'sfb')
00186
00187 This will create three columns called nameX, nameY and nameZ of type string,
00188 float and bool, respectively. There will be no data in the table and thus
00189 the table will not contain any rows.
00190
00191 The following *column types* are supported:
00192
00193 ======= ========
00194 name abbrev
00195 ======= ========
00196 string s
00197 float f
00198 int i
00199 bool b
00200 ======= ========
00201
00202 If you want to add data to the table in addition, use the following:
00203
00204 .. code-block:: python
00205
00206 tab=Table(['nameX','nameY','nameZ'],
00207 'sfb',
00208 nameX = ['a','b','c'],
00209 nameY = [0.1, 1.2, 3.414],
00210 nameZ = [True, False, False])
00211
00212 If values for one column are left out, they will be filled with NA. If
00213 values are specified, the same number of values must be specified for
00214 every column.
00215
00216 """
00217
00218 SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
00219
00220
00221 def __init__(self, col_names=[], col_types=None, **kwargs):
00222
00223 self.col_names=list(col_names)
00224 self.comment=''
00225 self.name=''
00226
00227 self.col_types = self._ParseColTypes(col_types)
00228 self.rows=[]
00229 if len(kwargs)>=0:
00230 if not col_names:
00231 self.col_names=[v for v in kwargs.keys()]
00232 if not self.col_types:
00233 self.col_types=['string' for u in range(len(self.col_names))]
00234 if len(kwargs)>0:
00235 self._AddRowsFromDict(kwargs)
00236
00237 def __getattr__(self, col_name):
00238
00239
00240
00241
00242 if 'col_names' not in self.__dict__ or col_name not in self.col_names:
00243 raise AttributeError(col_name)
00244 return TableCol(self, col_name)
00245
00246 @staticmethod
00247 def _ParseColTypes(types, exp_num=None):
00248 if types==None:
00249 return None
00250
00251 short2long = {'s' : 'string', 'i': 'int', 'b' : 'bool', 'f' : 'float'}
00252 allowed_short = short2long.keys()
00253 allowed_long = short2long.values()
00254
00255 type_list = []
00256
00257
00258 if IsScalar(types):
00259 if type(types)==str:
00260 types = types.lower()
00261
00262
00263 if types in allowed_long:
00264 type_list.append(types)
00265 elif types in allowed_short:
00266 type_list.append(short2long[types])
00267
00268
00269 elif types.find(',')!=-1:
00270 for t in types.split(','):
00271 if t in allowed_long:
00272 type_list.append(t)
00273 elif t in allowed_short:
00274 type_list.append(short2long[t])
00275 else:
00276 raise ValueError('Unknown type %s in types %s'%(t,types))
00277
00278
00279 else:
00280 for t in types:
00281 if t in allowed_short:
00282 type_list.append(short2long[t])
00283 else:
00284 raise ValueError('Unknown type %s in types %s'%(t,types))
00285
00286
00287 else:
00288 raise ValueError('Col type %s must be string or list'%types)
00289
00290
00291 else:
00292 for t in types:
00293
00294 if type(t)==str:
00295 t = t.lower()
00296 if t in allowed_long:
00297 type_list.append(t)
00298 elif t in allowed_short:
00299 type_list.append(short2long[t])
00300 else:
00301 raise ValueError('Unknown type %s in types %s'%(t,types))
00302
00303
00304 else:
00305 raise ValueError('Col type %s must be string or list'%types)
00306
00307 if exp_num:
00308 if len(type_list)!=exp_num:
00309 raise ValueError(('Parsed number of col types (%i) differs from ' + \
00310 'expected (%i) in types %s')%(len(type_list),exp_num,types))
00311
00312 return type_list
00313
00314 def SetName(self, name):
00315 '''
00316 Set name of the table
00317
00318 :param name: name
00319 :type name: :class:`str`
00320 '''
00321 self.name = name
00322
00323 def GetName(self):
00324 '''
00325 Get name of table
00326 '''
00327 return self.name
00328
00329 def RenameCol(self, old_name, new_name):
00330 """
00331 Rename column *old_name* to *new_name*.
00332
00333 :param old_name: Name of the old column
00334 :param new_name: Name of the new column
00335 :raises: :exc:`ValueError` when *old_name* is not a valid column
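
    **Example** (a minimal sketch; the column names are placeholders):

    .. code-block:: python

      tab = Table(['old_name'], 'f', old_name=[1.0, 2.0])
      tab.RenameCol('old_name', 'new_name')
      print tab.col_names   # ['new_name']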
00336 """
00337 if old_name==new_name:
00338 return
00339 self.AddCol(new_name, self.col_types[self.GetColIndex(old_name)],
00340 self[old_name])
00341 self.RemoveCol(old_name)
00342 def _Coerce(self, value, ty):
00343 '''
00344 Try to convert values (e.g. from :class:`str` type) to the specified type
00345
00346 :param value: the value
00347 :type value: any type
00348
00349 :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
00350 *bool*)
00351 :type ty: :class:`str`
00352 '''
00353 if value=='NA' or value==None:
00354 return None
00355 if ty=='int':
00356 return int(value)
00357 if ty=='float':
00358 return float(value)
00359 if ty=='string':
00360 return str(value)
00361 if ty=='bool':
00362 if isinstance(value, str) or isinstance(value, unicode):
00363 if value.upper() in ('FALSE', 'NO',):
00364 return False
00365 return True
00366 return bool(value)
00367 raise ValueError('Unknown type %s' % ty)
00368
00369 def GetColIndex(self, col):
00370 '''
00371 Returns the column index for the column with the given name.
00372
00373 :raises: ValueError if no column with the name is found.
00374 '''
00375 if col not in self.col_names:
00376 raise ValueError('Table has no column named "%s"' % col)
00377 return self.col_names.index(col)
00378
00379 def GetColNames(self):
00380 '''
00381 Returns a list containing all column names.
00382 '''
00383 return self.col_names
00384
00385 def SearchColNames(self, regex):
00386 '''
00387 Returns a list of column names matching the regex.
00388
00389 :param regex: regex pattern
00390 :type regex: :class:`str`
00391
00392 :returns: :class:`list` of column names (:class:`str`)
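
    **Example** (a sketch with made-up column names):

    .. code-block:: python

      tab = Table(['x_mean', 'x_std', 'label'], 'ffs')
      print tab.SearchColNames('^x_')   # ['x_mean', 'x_std']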
00393 '''
00394 matching_names = []
00395 for name in self.col_names:
00396 matches = re.search(regex, name)
00397 if matches:
00398 matching_names.append(name)
00399 return matching_names
00400
00401 def HasCol(self, col):
00402 '''
00403 Checks if the column with a given name is present in the table.
00404 '''
00405 return col in self.col_names
00406
00407 def __getitem__(self, k):
00408 if type(k)==int:
00409 return TableCol(self, self.col_names[k])
00410 else:
00411 return TableCol(self, k)
00412
00413 def __setitem__(self, k, value):
00414 col_index=k
00415 if type(k)!=int:
00416 col_index=self.GetColIndex(k)
00417 if IsScalar(value):
00418 value=itertools.cycle([value])
00419 for r, v in zip(self.rows, value):
00420 r[col_index]=v
00421
00422 def ToString(self, float_format='%.3f', int_format='%d', rows=None):
00423 '''
00424 Convert the table into a string representation.
00425
00426 The output format can be modified for int and float type columns by
00427 specifying a formatting string for the parameters *float_format* and
00428 *int_format*.
00429
00430 The option *rows* specifies the range of rows to be printed. The parameter
00431 must be a type that supports indexing (e.g. a :class:`list`) containing the
00432 start and end row *index*, e.g. [start_row_idx, end_row_idx].
00433
00434 :param float_format: formatting string for float columns
00435 :type float_format: :class:`str`
00436
00437 :param int_format: formatting string for int columns
00438 :type int_format: :class:`str`
00439
00440 :param rows: iterable containing start and end row *index*
00441 :type rows: iterable containing :class:`ints <int>`
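
    **Example** (a small sketch):

    .. code-block:: python

      tab = Table(['x', 'y'], 'if', x=[1, 2, 3], y=[0.5, 1.25, 2.0])
      # print only the first two rows, with two digits after the decimal point
      print tab.ToString(float_format='%.2f', rows=[0, 2])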
00442 '''
00443 widths=[len(cn) for cn in self.col_names]
00444 sel_rows=self.rows
00445 if rows:
00446 sel_rows=self.rows[rows[0]:rows[1]]
00447 for row in sel_rows:
00448 for i, (ty, col) in enumerate(zip(self.col_types, row)):
00449 if col==None:
00450 widths[i]=max(widths[i], len('NA'))
00451 elif ty=='float':
00452 widths[i]=max(widths[i], len(float_format % col))
00453 elif ty=='int':
00454 widths[i]=max(widths[i], len(int_format % col))
00455 else:
00456 widths[i]=max(widths[i], len(str(col)))
00457 s=''
00458 if self.comment:
00459 s+=''.join(['# %s\n' % l for l in self.comment.split('\n')])
00460 total_width=sum(widths)+2*len(widths)
00461 for width, col_name in zip(widths, self.col_names):
00462 s+=col_name.center(width+2)
00463 s+='\n%s\n' % ('-'*total_width)
00464 for row in sel_rows:
00465 for width, ty, col in zip(widths, self.col_types, row):
00466 cs=''
00467 if col==None:
00468 cs='NA'.center(width+2)
00469 elif ty=='float':
00470 cs=(float_format % col).rjust(width+2)
00471 elif ty=='int':
00472 cs=(int_format % col).rjust(width+2)
00473 else:
00474 cs=' '+str(col).ljust(width+1)
00475 s+=cs
00476 s+='\n'
00477 return s
00478
00479 def __str__(self):
00480 return self.ToString()
00481
00482 def Stats(self, col):
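    '''
    Return a summary string for the numeric column *col*: number of rows,
    number of non-None cells, mean, median, standard deviation, minimum and
    maximum.
    '''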
00483 idx = self.GetColIndex(col)
00484 text ='''
00485 Statistics for column %(col)s
00486
00487 Number of Rows : %(num)d
00488 Number of Rows Not None: %(num_non_null)d
00489 Mean : %(mean)f
00490 Median : %(median)f
00491 Standard Deviation : %(stddev)f
00492 Min : %(min)f
00493 Max : %(max)f
00494 '''
00495 data = {
00496 'col' : col,
00497 'num' : len(self.rows),
00498 'num_non_null' : self.Count(col),
00499 'median' : self.Median(col),
00500 'mean' : self.Mean(col),
00501 'stddev' : self.StdDev(col),
00502 'min' : self.Min(col),
00503 'max' : self.Max(col),
00504 }
00505 return text % data
00506
00507 def _AddRowsFromDict(self, d, overwrite=None):
00508 '''
00509 Add one or more rows from a :class:`dictionary <dict>`.
00510
00511 If *overwrite* is not None and set to an existing column name, the specified
00512 column in the table is searched for the first occurrence of a value matching
00513 the value of the column with the same name in the dictionary. If a matching
00514 value is found, the row is overwritten with the dictionary. If no matching
00515 row is found, a new row is appended to the table.
00516
00517 :param d: dictionary containing the data
00518 :type d: :class:`dict`
00519
00520 :param overwrite: column name to overwrite existing row if value in
00521 column *overwrite* matches
00522 :type overwrite: :class:`str`
00523
00524 :raises: :class:`ValueError` if multiple rows are added but the number of
00525 data items is different for different columns.
00526 '''
00527
00528 idxs = [self.GetColIndex(k) for k in d.keys()]
00529
00530
00531 old_len = None
00532 for k,v in d.iteritems():
00533 if IsScalar(v):
00534 v = [v]
00535 d[k] = v
00536 if not old_len:
00537 old_len = len(v)
00538 elif old_len!=len(v):
00539 raise ValueError("Cannot add rows: length of data must be equal " + \
00540 "for all columns in %s"%str(d))
00541
00542
00543 for i,data in enumerate(zip(*d.values())):
00544 new_row = [None for a in range(len(self.col_names))]
00545 for idx,v in zip(idxs,data):
00546 new_row[idx] = self._Coerce(v, self.col_types[idx])
00547
00548
00549 if overwrite:
00550 overwrite_idx = self.GetColIndex(overwrite)
00551 added = False
00552 for i,r in enumerate(self.rows):
00553 if r[overwrite_idx]==new_row[overwrite_idx]:
00554 for j,e in enumerate(self.rows[i]):
00555 if new_row[j]==None:
00556 new_row[j] = e
00557 self.rows[i] = new_row
00558 added = True
00559 break
00560
00561
00562 if not overwrite or not added:
00563 self.rows.append(new_row)
00564
00565 def PairedTTest(self, col_a, col_b):
00566 """
00567 Two-sided test for the null-hypothesis that two related samples
00568 have the same average (expected values).
00569
00570 :param col_a: First column
00571 :type col_a: :class:`str`
00572 :param col_b: Second column
00573 :type col_b: :class:`str`
00574
00575 :returns: P-value of the two-sided test that the two columns have the
00576 same average. The smaller the value, the stronger the evidence that
00577 the two averages differ.
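
    A usage sketch (assumes SciPy is installed; the column names are
    placeholders):

    .. code-block:: python

      tab = Table(['method_a', 'method_b'], 'ff',
                  method_a=[0.81, 0.72, 0.65],
                  method_b=[0.79, 0.70, 0.66])
      p_value = tab.PairedTTest('method_a', 'method_b')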
00578 """
00579 from scipy.stats import ttest_rel
00580 xs = []
00581 ys = []
00582 for x, y in self.Zip(col_a, col_b):
00583 if x!=None and y!=None:
00584 xs.append(x)
00585 ys.append(y)
00586 result = ttest_rel(xs, ys)
00587 return result[1]
00588
00589 def AddRow(self, data, overwrite=None):
00590 """
00591 Add a row to the table.
00592
00593 *data* may either be a dictionary or a list-like object:
00594
00595 - If *data* is a dictionary, the keys in the dictionary must match the
00596 column names. Columns not found in the dict will be initialized to None.
00597 If the dict contains list-like objects, multiple rows will be added, if
00598 the number of items in all list-like objects is the same, otherwise a
00599 :class:`ValueError` is raised.
00600
00601 - If *data* is a list-like object, the row is initialized from the values
00602 in *data*. The number of items in *data* must match the number of
00603 columns in the table. A :class:`ValueError` is raised otherwise. The
00604 values are added in the order specified in the list, thus, the order of
00605 the data must match the columns.
00606
00607 If *overwrite* is not None and set to an existing column name, the specified
00608 column in the table is searched for the first occurrence of a value matching
00609 the value of the column with the same name in the dictionary. If a matching
00610 value is found, the row is overwritten with the dictionary. If no matching
00611 row is found, a new row is appended to the table.
00612
00613 :param data: data to add
00614 :type data: :class:`dict` or *list-like* object
00615
00616 :param overwrite: column name to overwrite existing row if value in
00617 column *overwrite* matches
00618 :type overwrite: :class:`str`
00619
00620 :raises: :class:`ValueError` if *list-like* object is used and number of
00621 items does *not* match number of columns in table.
00622
00623 :raises: :class:`ValueError` if *dict* is used and multiple rows are added
00624 but the number of data items is different for different columns.
00625
00626 **Example:** add multiple data rows to a subset of columns using a dictionary
00627
00628 .. code-block:: python
00629
00630 # create table with three float columns
00631 tab = Table(['x','y','z'], 'fff')
00632
00633 # add rows from dict
00634 data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
00635 tab.AddRow(data)
00636 print tab
00637
00638 '''
00639 will produce the table
00640
00641 ==== ==== ====
00642 x y z
00643 ==== ==== ====
00644 1.20 NA 1.60
00645 1.60 NA 5.30
00646 ==== ==== ====
00647 '''
00648
00649 # overwrite the row with x=1.2 and add row with x=1.9
00650 data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
00651 tab.AddRow(data, overwrite='x')
00652 print tab
00653
00654 '''
00655 will produce the table
00656
00657 ==== ==== ====
00658 x y z
00659 ==== ==== ====
00660 1.20 NA 7.90
00661 1.60 NA 5.30
00662 1.90 NA 3.50
00663 ==== ==== ====
00664 '''
00665 """
00666 if type(data)==dict:
00667 self._AddRowsFromDict(data, overwrite)
00668 else:
00669 if len(data)!=len(self.col_names):
00670 msg='data array must have %d elements, not %d'
00671 raise ValueError(msg % (len(self.col_names), len(data)))
00672 new_row = [self._Coerce(v, t) for v, t in zip(data, self.col_types)]
00673
00674
00675 if overwrite:
00676 overwrite_idx = self.GetColIndex(overwrite)
00677 added = False
00678 for i,r in enumerate(self.rows):
00679 if r[overwrite_idx]==new_row[overwrite_idx]:
00680 self.rows[i] = new_row
00681 added = True
00682 break
00683
00684
00685 if not overwrite or not added:
00686 self.rows.append(new_row)
00687
00688 def RemoveCol(self, col):
00689 """
00690 Remove column with the given name from the table.
00691
00692 :param col: name of column to remove
00693 :type col: :class:`str`
00694 """
00695 idx = self.GetColIndex(col)
00696 del self.col_names[idx]
00697 del self.col_types[idx]
00698 for row in self.rows:
00699 del row[idx]
00700
00701 def AddCol(self, col_name, col_type, data=None):
00702 """
00703 Add a column to the right of the table.
00704
00705 :param col_name: name of new column
00706 :type col_name: :class:`str`
00707
00708 :param col_type: type of new column (long versions: *int*, *float*, *bool*,
00709 *string* or short versions: *i*, *f*, *b*, *s*)
00710 :type col_type: :class:`str`
00711
00712 :param data: data to add to new column
00713 :type data: scalar or iterable
00714
00715 **Example:**
00716
00717 .. code-block:: python
00718
00719 tab = Table(['x'], 'f', x=range(5))
00720 tab.AddCol('even', 'bool', itertools.cycle([True, False]))
00721 print tab
00722
00723 '''
00724 will produce the table
00725
00726 ==== ====
00727 x even
00728 ==== ====
00729 0 True
00730 1 False
00731 2 True
00732 3 False
00733 4 True
00734 ==== ====
00735 '''
00736
00737 If data is a constant instead of an iterable object, it's value
00738 will be written into each row:
00739
00740 .. code-block:: python
00741
00742 tab = Table(['x'], 'f', x=range(5))
00743 tab.AddCol('num', 'i', 1)
00744 print tab
00745
00746 '''
00747 will produce the table
00748
00749 ==== ====
00750 x num
00751 ==== ====
00752 0 1
00753 1 1
00754 2 1
00755 3 1
00756 4 1
00757 ==== ====
00758 '''
00759
00760 As a special case, if there are no previous rows, and data is not
00761 None, rows are added for every item in data.
00762 """
00763
00764 if col_name in self.col_names:
00765 raise ValueError('Column with name %s already exists'%col_name)
00766
00767 col_type = self._ParseColTypes(col_type, exp_num=1)[0]
00768 self.col_names.append(col_name)
00769 self.col_types.append(col_type)
00770
00771 if len(self.rows)>0:
00772 if IsScalar(data):
00773 for row in self.rows:
00774 row.append(data)
00775 else:
00776 if hasattr(data, '__len__') and len(data)!=len(self.rows):
00777 self.col_names.pop()
00778 self.col_types.pop()
00779 raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
00780 'existing rows (%i)'%len(self.rows))
00781 for row, d in zip(self.rows, data):
00782 row.append(d)
00783
00784 elif data!=None and len(self.col_names)==1:
00785 if IsScalar(data):
00786 self.AddRow({col_name : data})
00787 else:
00788 for v in data:
00789 self.AddRow({col_name : v})
00790
00791 def Filter(self, *args, **kwargs):
00792 """
00793 Returns a filtered table only containing rows matching all the predicates
00794 in kwargs and args. For example,
00795
00796 .. code-block:: python
00797
00798 tab.Filter(town='Basel')
00799
00800 will return all the rows where the value of the column "town" is equal to
00801 "Basel". Several predicates may be combined, i.e.
00802
00803 .. code-block:: python
00804
00805 tab.Filter(town='Basel', male=True)
00806
00807 will return the rows with "town" equal to "Basel" and "male" equal to true.
00808 Positional arguments in *args* are unary callables that return True if the row
00809 should be included in the result and False if not.
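
    For example (a sketch, assuming the table has a numeric column named 'age'):

    .. code-block:: python

      # keep only rows where the value in column 'age' is above 18
      adults = tab.Filter(lambda row: row[tab.GetColIndex('age')] > 18)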
00810 """
00811 filt_tab=Table(list(self.col_names), list(self.col_types))
00812 for row in self.rows:
00813 matches=True
00814 for func in args:
00815 if not func(row):
00816 matches=False
00817 break
00818 for key, val in kwargs.iteritems():
00819 if row[self.GetColIndex(key)]!=val:
00820 matches=False
00821 break
00822 if matches:
00823 filt_tab.AddRow(row)
00824 return filt_tab
00825
00826
00827 def Select(self, query):
00828
00829 """
00830 Returns a new table object containing all rows matching a logical query
00831 expression.
00832
00833 *query* is a string containing the logical expression, that will be
00834 evaluated for every row.
00835
00836 Operands have to be the name of a column or an expression that can be
00837 parsed to float, int, bool or string.
00838 Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, \*, /
00839
00840 .. code-block:: python
00841
00842 subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
00843
00844 The selection query should be self-explanatory. Allowed parentheses are
00845 (), [] and {}; mismatched parentheses are detected. Expressions like
00846 '3<=col_a>=col_b' throw an error, since the evaluation order cannot be
00847 determined unambiguously.
00848
00849 There are two special expressions:
00850
00851 .. code-block:: python
00852
00853 #selects rows, where 1.0<=col_a<=1.5
00854 subtab = tab.Select('col_a=1.0:1.5')
00855
00856 #selects rows, where col_a=1 or col_a=2 or col_a=3
00857 subtab = tab.Select('col_a=1,2,3')
00858
00859 Only consistent types can be compared. If col_a is of type string and col_b
00860 is of type int, the following expression would throw an error: 'col_a<col_b'
00861 """
00862
00863 try:
00864 from table_selector import TableSelector
00865 except:
00866 raise ImportError("Tried to import from the file table_selector.py, but could not find it!")
00867
00868 selector=TableSelector(self.col_types, self.col_names, query)
00869
00870 selected_tab=Table(list(self.col_names), list(self.col_types))
00871
00872 for row in self.rows:
00873 if selector.EvaluateRow(row):
00874 selected_tab.AddRow(row)
00875
00876 return selected_tab
00877
00878
00879 @staticmethod
00880 def _LoadOST(stream_or_filename):
00881 fieldname_pattern=re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
00882 values_pattern=re.compile("([^\" ]+|\"[^\"]*\")+")
00883 if not hasattr(stream_or_filename, 'read'):
00884 stream=open(stream_or_filename, 'r')
00885 else:
00886 stream=stream_or_filename
00887 header=False
00888 num_lines=0
00889 for line in stream:
00890 line=line.strip()
00891 if line.startswith('#'):
00892 continue
00893 if len(line)==0:
00894 continue
00895 num_lines+=1
00896 if not header:
00897 fieldnames=[]
00898 fieldtypes=[]
00899 for col in line.split():
00900 match=fieldname_pattern.match(col)
00901 if match:
00902 if match.group('type'):
00903 fieldtypes.append(match.group('type'))
00904 else:
00905 fieldtypes.append('string')
00906 fieldnames.append(match.group('name'))
00907 tab=Table(fieldnames, fieldtypes)
00908 header=True
00909 continue
00910 tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
00911 if num_lines==0:
00912 raise IOError("Cannot read table from empty stream")
00913 return tab
00914
00915 def _GuessColumnTypes(self):
00916 for col_idx in range(len(self.col_names)):
00917 self.col_types[col_idx]=GuessColumnType(self[self.col_names[col_idx]])
00918 for row in self.rows:
00919 for idx in range(len(row)):
00920 row[idx]=self._Coerce(row[idx], self.col_types[idx])
00921
00922 @staticmethod
00923 def _LoadCSV(stream_or_filename, sep):
00924 if not hasattr(stream_or_filename, 'read'):
00925 stream=open(stream_or_filename, 'r')
00926 else:
00927 stream=stream_or_filename
00928 reader=csv.reader(stream, delimiter=sep)
00929 first=True
00930 for row in reader:
00931 if first:
00932 header=row
00933 types='s'*len(row)
00934 tab=Table(header, types)
00935 first=False
00936 else:
00937 tab.AddRow(row)
00938 if first:
00939 raise IOError('trying to load table from empty CSV stream/file')
00940
00941 tab._GuessColumnTypes()
00942 return tab
00943
00944 @staticmethod
00945 def _LoadPickle(stream_or_filename):
00946 if not hasattr(stream_or_filename, 'read'):
00947 stream=open(stream_or_filename, 'rb')
00948 else:
00949 stream=stream_or_filename
00950 return cPickle.load(stream)
00951
00952 @staticmethod
00953 def _GuessFormat(filename):
00954 try:
00955 filename = filename.name
00956 except AttributeError, e:
00957 pass
00958 if filename.endswith('.csv'):
00959 return 'csv'
00960 elif filename.endswith('.pickle'):
00961 return 'pickle'
00962 else:
00963 return 'ost'
00964
00965
00966 @staticmethod
00967 def Load(stream_or_filename, format='auto', sep=','):
00968 """
00969 Load table from stream or file with given name.
00970
00971 By default, the file format is set to *auto*, which tries to guess the file
00972 format from the file extension. The following file extensions are
00973 recognized:
00974
00975 ============ ======================
00976 extension recognized format
00977 ============ ======================
00978 .csv comma separated values
00979 .pickle pickled byte stream
00980 <all others> ost-specific format
00981 ============ ======================
00982
00983 Thus, *format* must be specified when reading files with other filename
00984 extensions.
00985
00986 The following file formats are understood:
00987
00988 - ost
00989
00990 This is an ost-specific, but still human readable file format. The file
00991 (stream) must start with header line of the form
00992
00993 col_name1[type1] <col_name2[type2]>...
00994
00995 The types given in brackets must be one of the data types the
00996 :class:`Table` class understands. Each following line in the file then must
00997 contain exactly the same number of data items as listed in the header. The
00998 data items are automatically converted to the column format. Lines starting
00999 with a '#' and empty lines are ignored.
01000
01001 - pickle
01002
01003 Deserializes the table from a pickled byte stream.
01004
01005 - csv
01006
01007 Reads the table from comma separated values stream. Since there is no
01008 explicit type information in the csv file, the column types are guessed,
01009 using the following simple rules:
01010
01011 * if all values are either NA/NULL/NONE the type is set to string.
01012 * if all non-null values are convertible to float/int the type is set to
01013 float/int.
01014 * if all non-null values are true/false/yes/no, the type is set to bool.
01015 * for all other cases, the column type is set to string.
01016
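    A short loading sketch (the file names here are hypothetical):

    .. code-block:: python

      tab1 = Table.Load('scores.csv')                          # format guessed from extension
      tab2 = Table.Load('scores.tsv', format='csv', sep='\t')  # tab-separated values
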
01017 :returns: A new :class:`Table` instance
01018 """
01019 format=format.lower()
01020 if format=='auto':
01021 format = Table._GuessFormat(stream_or_filename)
01022
01023 if format=='ost':
01024 return Table._LoadOST(stream_or_filename)
01025 if format=='csv':
01026 return Table._LoadCSV(stream_or_filename, sep=sep)
01027 if format=='pickle':
01028 return Table._LoadPickle(stream_or_filename)
01029 raise ValueError('unknown format "%s"' % format)
01030
01031 def Sort(self, by, order='+'):
01032 """
01033 Performs an in-place sort of the table, based on column *by*.
01034
01035 :param by: column name by which to sort
01036 :type by: :class:`str`
01037
01038 :param order: ascending (``-``) or descending (``+``) order
01039 :type order: :class:`str` (i.e. *+*, *-*)
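
    **Example** (a sketch; 'score' stands for any column of the table):

    .. code-block:: python

      tab.Sort('score')        # descending order (default '+')
      tab.Sort('score', '-')   # ascending order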
01040 """
01041 sign=-1
01042 if order=='-':
01043 sign=1
01044 key_index=self.GetColIndex(by)
01045 def _key_cmp(lhs, rhs):
01046 return sign*cmp(lhs[key_index], rhs[key_index])
01047 self.rows=sorted(self.rows, _key_cmp)
01048
01049 def GetUnique(self, col, ignore_nan=True):
01050 """
01051 Extract a list of all unique values from one column.
01052
01053 :param col: column name
01054 :type col: :class:`str`
01055
01056 :param ignore_nan: ignore all *None* values
01057 :type ignore_nan: :class:`bool`
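
    **Example** (a small sketch):

    .. code-block:: python

      tab = Table(['city'], 's', city=['Basel', 'Bern', None, 'Basel'])
      print tab.GetUnique('city')   # ['Basel', 'Bern']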
01058 """
01059 idx = self.GetColIndex(col)
01060 seen = {}
01061 result = []
01062 for row in self.rows:
01063 item = row[idx]
01064 if item!=None or ignore_nan==False:
01065 if item in seen: continue
01066 seen[item] = 1
01067 result.append(item)
01068 return result
01069
01070 def Zip(self, *args):
01071 """
01072 Allows convenient iteration over a selection of columns, e.g.
01073
01074 .. code-block:: python
01075
01076 tab = Table.Load('...')
01077 for col1, col2 in tab.Zip('col1', 'col2'):
01078 print col1, col2
01079
01080 is a shortcut for
01081
01082 .. code-block:: python
01083
01084 tab = Table.Load('...')
01085 for col1, col2 in zip(tab['col1'], tab['col2']):
01086 print col1, col2
01087 """
01088 return zip(*[self[arg] for arg in args])
01089
01090 def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
01091 z_title=None, x_range=None, y_range=None, z_range=None,
01092 color=None, plot_if=None, legend=None,
01093 num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
01094 labels=None, max_num_labels=None, title=None, clear=True, save=False,
01095 **kwargs):
01096 """
01097 Function to plot values from your table in 1, 2 or 3 dimensions using
01098 `Matplotlib <http://matplotlib.sourceforge.net>`__
01099
01100 :param x: column name for first dimension
01101 :type x: :class:`str`
01102
01103 :param y: column name for second dimension
01104 :type y: :class:`str`
01105
01106 :param z: column name for third dimension
01107 :type z: :class:`str`
01108
01109 :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
01110 complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
01111 :type style: :class:`str`
01112
01113 :param x_title: title for first dimension, if not specified it is
01114 automatically derived from column name
01115 :type x_title: :class:`str`
01116
01117 :param y_title: title for second dimension, if not specified it is
01118 automatically derived from column name
01119 :type y_title: :class:`str`
01120
01121 :param z_title: title for third dimension, if not specified it is
01122 automatically derived from column name
01123 :type z_title: :class:`str`
01124
01125 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01126 :type x_range: :class:`list` of length two
01127
01128 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01129 :type y_range: :class:`list` of length two
01130
01131 :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
01132 :type z_range: :class:`list` of length two
01133
01134 :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
01135 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
01136 :type color: :class:`str`
01137
01138 :param plot_if: callable which returns *True* if row should be plotted. Is
01139 invoked like ``plot_if(self, row)``
01140 :type plot_if: callable
01141
01142 :param legend: legend label for data series
01143 :type legend: :class:`str`
01144
01145 :param num_z_levels: number of levels for third dimension
01146 :type num_z_levels: :class:`int`
01147
01148 :param diag_line: draw diagonal line
01149 :type diag_line: :class:`bool`
01150
01151 :param labels: column name containing labels to put on x-axis for one
01152 dimensional plot
01153 :type labels: :class:`str`
01154
01155 :param max_num_labels: limit maximum number of labels
01156 :type max_num_labels: :class:`int`
01157
01158 :param title: plot title, if not specified it is automatically derived from
01159 plotted column names
01160 :type title: :class:`str`
01161
01162 :param clear: clear old data from plot
01163 :type clear: :class:`bool`
01164
01165 :param save: filename for saving plot
01166 :type save: :class:`str`
01167
01168 :param z_contour: draw contour lines
01169 :type z_contour: :class:`bool`
01170
01171 :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
01172 'linear')
01173 :type z_interpol: :class:`str`
01174
01175 :param \*\*kwargs: additional arguments passed to matplotlib
01176
01177 :returns: the ``matplotlib.pyplot`` module
01178
01179 **Examples:** simple plotting functions
01180
01181 .. code-block:: python
01182
01183 tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
01184 b=[x/2.0 for x in range(1,6)],
01185 c=[math.cos(x) for x in range(0,5)],
01186 d=range(3,8))
01187
01188 # one dimensional plot of column 'd' vs. index
01189 plt = tab.Plot('d')
01190 plt.show()
01191
01192 # two dimensional plot of 'a' vs. 'c'
01193 plt = tab.Plot('a', y='c', style='o-')
01194 plt.show()
01195
01196 # three dimensional plot of 'a' vs. 'c' with values 'b'
01197 plt = tab.Plot('a', y='c', z='b')
01198 # manually save plot to file
01199 plt.savefig("plot.png")
01200 """
01201 try:
01202 import matplotlib.pyplot as plt
01203 import matplotlib.mlab as mlab
01204 import numpy as np
01205 idx1 = self.GetColIndex(x)
01206 xs = []
01207 ys = []
01208 zs = []
01209
01210 if clear:
01211 plt.figure(figsize=[8, 6])
01212
01213 if x_title!=None:
01214 nice_x=x_title
01215 else:
01216 nice_x=MakeTitle(x)
01217
01218 if y_title!=None:
01219 nice_y=y_title
01220 else:
01221 if y:
01222 nice_y=MakeTitle(y)
01223 else:
01224 nice_y=None
01225
01226 if z_title!=None:
01227 nice_z = z_title
01228 else:
01229 if z:
01230 nice_z = MakeTitle(z)
01231 else:
01232 nice_z = None
01233
01234 if x_range and (IsScalar(x_range) or len(x_range)!=2):
01235 raise ValueError('parameter x_range must contain exactly two elements')
01236 if y_range and (IsScalar(y_range) or len(y_range)!=2):
01237 raise ValueError('parameter y_range must contain exactly two elements')
01238 if z_range and (IsScalar(z_range) or len(z_range)!=2):
01239 raise ValueError('parameter z_range must contain exactly two elements')
01240
01241 if color:
01242 kwargs['color']=color
01243 if legend:
01244 kwargs['label']=legend
01245 if y and z:
01246 idx3 = self.GetColIndex(z)
01247 idx2 = self.GetColIndex(y)
01248 for row in self.rows:
01249 if row[idx1]!=None and row[idx2]!=None and row[idx3]!=None:
01250 if plot_if and not plot_if(self, row):
01251 continue
01252 xs.append(row[idx1])
01253 ys.append(row[idx2])
01254 zs.append(row[idx3])
01255 levels = []
01256 if z_range:
01257 z_spacing = (z_range[1] - z_range[0]) / num_z_levels
01258 l = z_range[0]
01259 else:
01260 l = self.Min(z)
01261 z_spacing = (self.Max(z) - l) / num_z_levels
01262
01263 for i in range(0,num_z_levels+1):
01264 levels.append(l)
01265 l += z_spacing
01266
01267 xi = np.linspace(min(xs),max(xs),len(xs)*10)
01268 yi = np.linspace(min(ys),max(ys),len(ys)*10)
01269 zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
01270
01271 if z_contour:
01272 plt.contour(xi,yi,zi,levels,linewidths=0.5,colors='k')
01273
01274 plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
01275 plt.colorbar(ticks=levels)
01276
01277 elif y:
01278 idx2=self.GetColIndex(y)
01279 for row in self.rows:
01280 if row[idx1]!=None and row[idx2]!=None:
01281 if plot_if and not plot_if(self, row):
01282 continue
01283 xs.append(row[idx1])
01284 ys.append(row[idx2])
01285 plt.plot(xs, ys, style, **kwargs)
01286
01287 else:
01288 label_vals=[]
01289
01290 if labels:
01291 label_idx=self.GetColIndex(labels)
01292 for row in self.rows:
01293 if row[idx1]!=None:
01294 if plot_if and not plot_if(self, row):
01295 continue
01296 xs.append(row[idx1])
01297 if labels:
01298 label_vals.append(row[label_idx])
01299 plt.plot(xs, style, **kwargs)
01300 if labels:
01301 interval = 1
01302 if max_num_labels:
01303 if len(label_vals)>max_num_labels:
01304 interval = int(math.ceil(float(len(label_vals))/max_num_labels))
01305 label_vals = label_vals[::interval]
01306 plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
01307 size='x-small')
01308
01309 if title==None:
01310 if nice_z:
01311 title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
01312 elif nice_y:
01313 title = '%s vs. %s' % (nice_x, nice_y)
01314 else:
01315 title = nice_x
01316
01317 plt.title(title, size='x-large', fontweight='bold',
01318 verticalalignment='bottom')
01319
01320 if legend:
01321 plt.legend(loc=0)
01322
01323 if x and y:
01324 plt.xlabel(nice_x, size='x-large')
01325 if x_range:
01326 plt.xlim(x_range[0], x_range[1])
01327 if y_range:
01328 plt.ylim(y_range[0], y_range[1])
01329 if diag_line:
01330 plt.plot(x_range, y_range, '-', color='black')
01331
01332 plt.ylabel(nice_y, size='x-large')
01333 else:
01334 if y_range:
01335 plt.ylim(y_range[0], y_range[1])
01336 if x_title:
01337 plt.xlabel(x_title, size='x-large')
01338 plt.ylabel(nice_y, size='x-large')
01339 if save:
01340 plt.savefig(save)
01341 return plt
01342 except ImportError:
01343 LogError("Function needs numpy and matplotlib, but I could not import them.")
01344 raise
01345
01346 def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
01347 histtype='stepfilled', align='mid', x_title=None,
01348 y_title=None, title=None, clear=True, save=False,
01349 color=None, y_range=None):
01350 """
01351 Create a histogram of the data in col for the range *x_range*, split into
01352 *num_bins* bins and plot it using Matplotlib.
01353
01354 :param col: column name with data
01355 :type col: :class:`str`
01356
01357 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01358 :type x_range: :class:`list` of length two
01359
01360 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01361 :type y_range: :class:`list` of length two
01362
01363 :param num_bins: number of bins in range
01364 :type num_bins: :class:`int`
01365
01366 :param color: Color to be used for the histogram. If not set, color will be
01367 determined by matplotlib
01368 :type color: :class:`str`
01369
01370 :param normed: normalize histogram
01371 :type normed: :class:`bool`
01372
01373 :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
01374 *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
01375 :type histtype: :class:`str`
01376
01377 :param align: style of histogram (*left*, *mid*, *right*). See
01378 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
01379 :type align: :class:`str`
01380
01381 :param x_title: title for first dimension, if not specified it is
01382 automatically derived from column name
01383 :type x_title: :class:`str`
01384
01385 :param y_title: title for second dimension, if not specified it is
01386 automatically derived from column name
01387 :type y_title: :class:`str`
01388
01389 :param title: plot title, if not specified it is automatically derived from
01390 plotted column names
01391 :type title: :class:`str`
01392
01393 :param clear: clear old data from plot
01394 :type clear: :class:`bool`
01395
01396 :param save: filename for saving plot
01397 :type save: :class:`str`
01398
01399 **Examples:** simple plotting functions
01400
01401 .. code-block:: python
01402
01403 tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
01404
01405 # histogram of the values in column 'a'
01406 plt = tab.PlotHistogram('a')
01407 plt.show()
01408
01409 """
01410 try:
01411 import matplotlib.pyplot as plt
01412 import numpy as np
01413
01414 if len(self.rows)==0:
01415 return None
01416 kwargs={}
01417 if color:
01418 kwargs['color']=color
01419 idx = self.GetColIndex(col)
01420 data = []
01421 for r in self.rows:
01422 if r[idx]!=None:
01423 data.append(r[idx])
01424
01425 if clear:
01426 plt.clf()
01427
01428 n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
01429 normed=normed, histtype=histtype, align=align,
01430 **kwargs)
01431
01432 if x_title!=None:
01433 nice_x=x_title
01434 else:
01435 nice_x=MakeTitle(col)
01436 plt.xlabel(nice_x, size='x-large')
01437 if y_range:
01438 plt.ylim(y_range)
01439 if y_title!=None:
01440 nice_y=y_title
01441 else:
01442 nice_y="bin count"
01443 plt.ylabel(nice_y, size='x-large')
01444
01445 if title!=None:
01446 nice_title=title
01447 else:
01448 nice_title="Histogram of %s"%nice_x
01449 plt.title(nice_title, size='x-large', fontweight='bold')
01450
01451 if save:
01452 plt.savefig(save)
01453 return plt
01454 except ImportError:
01455 LogError("Function needs numpy and matplotlib, but I could not import them.")
01456 raise
01457
01458 def _Max(self, col):
01459 if len(self.rows)==0:
01460 return None, None
01461 idx = self.GetColIndex(col)
01462 col_type = self.col_types[idx]
01463 if col_type=='int' or col_type=='float':
01464 max_val = -float('inf')
01465 elif col_type=='bool':
01466 max_val = False
01467 elif col_type=='string':
01468 max_val = chr(0)
01469 max_idx = None
01470 for i in range(0, len(self.rows)):
01471 if self.rows[i][idx]>max_val:
01472 max_val = self.rows[i][idx]
01473 max_idx = i
01474 return max_val, max_idx
01475
01476 def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None,
01477 colors=None, width=0.8, bottom=0, legend=False, legend_names=None, show=False, save=False):
01478
01479 """
01480 Create a barplot of the data in cols. Every column will be represented
01481 at one position. If there are several rows, each column will be grouped
01482 together.
01483
01484 :param cols: List of column names. Every column will be represented as a
01485 single bar. If cols is None, every column of the table gets
01486 plotted.
01487 :type cols: :class:`list`
01488
01489 :param rows: List of row indices. Values from given rows will be plotted
01490 in parallel at one column position. If set to None, all rows
01491 of the table will be plotted. Note, that the maximum number
01492 of rows is 7.
01493 :type rows: :class:`list`
01494
01495 :param xlabels: Label for every col on x-axis. If set to None, the column
01496 names are used. The xlabel plotting can be suppressed by
01497 the parameter set_xlabels.
01498 :type xlabels: :class:`list`
01499
01500 :param set_xlabels: Controls whether xlabels are plotted or not.
01501 :type set_xlabels: :class:`bool`
01502
01503 :param xlabels_rotation: Can either be 'horizontal', 'vertical' or an
01504 integer that describes the rotation in degrees.
01505
01506 :param y_title: Y-axis description
01507 :type y_title: :class:`str`
01508
01509 :param title: Title of the plot. No title appears if set to None.
01510 :type title: :class:`str`
01511
01512 :param colors: Colors of the different bars in each group. Must be a list
01513 of valid colors in matplotlib. Length of color and rows must
01514 be consistent.
01515 :type colors: :class:`list`
01516
01517 :param width: The available space for the groups on the x-axis is divided
01518 by the exact number of groups. The parameters width is the
01519 fraction of what is actually used. If it would be 1.0 the
01520 bars of the different groups would touch each other.
01521 Value must be between [0;1]
01522 :type width: :class:`float`
01523
01524 :param bottom: y-value at which the bars start (baseline of the bars)
01525 :type bottom: :class:`float`
01526
01527 :param legend: Legend for color explanation, the corresponding row
01528 respectively. If set to True, legend_names must be provided.
01529 :type legend: :class:`bool`
01530
01531 :param legend_names: List of names, that describe the differently colored
01532 bars. Length must be consistent with number of rows.
01533
01534 :param show: If set to True, the plot is directly displayed.
01535
01536 :param save: If set, a png image with name save in the current working
01537 directory will be saved.
01538 :type save: :class:`str`
01539
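    **Example** (a minimal sketch with made-up values):

    .. code-block:: python

      tab = Table(['a', 'b', 'c'], 'fff', a=[1.0], b=[2.5], c=[1.7])
      plt = tab.PlotBar(y_title='score', legend=True, legend_names=['run 1'])
      plt.show()
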
01540 """
01541 try:
01542 import numpy as np
01543 import matplotlib.pyplot as plt
01544 except:
01545 raise ImportError('PlotBar relies on numpy and matplotlib, but I could ' \
01546 'not import them!')
01547
01548 standard_colors=['b','g','y','c','m','r','k']
01549 data=[]
01550
01551 if cols==None:
01552 cols=self.col_names
01553
01554 if width<=0 or width>1:
01555 raise ValueError('Width must be in [0;1]')
01556
01557 if rows==None:
01558 if len(self.rows)>7:
01559 raise ValueError('Table contains too many rows to represent them at one '\
01560 'bar position in parallel. You can Select a Subtable or '\
01561 'specify the parameter rows with a list of row indices '\
01562 '(max 7)')
01563 else:
01564 rows=range(len(self.rows))
01565 else:
01566 if not isinstance(rows,list):
01567 rows=[rows]
01568 if len(rows)>7:
01569 raise ValueError('Too many rows to represent (max 7). Please note, that '\
01570 'data from multiple rows from one column gets '\
01571 'represented at one position in parallel.')
01572
01573 for r_idx in rows:
01574 row=self.rows[r_idx]
01575 temp=list()
01576 for c in cols:
01577 try:
01578 c_idx=self.GetColIndex(c)
01579 except:
01580 raise ValueError('Cannot find column with name '+str(c))
01581 temp.append(row[c_idx])
01582 data.append(temp)
01583
01584 if colors==None:
01585 colors=standard_colors[:len(rows)]
01586
01587 if len(rows)!=len(colors):
01588 raise ValueError("Number of rows and number of colors must be consistent!")
01589
01590 ind=np.arange(len(data[0]))
01591 single_bar_width=float(width)/len(data)
01592
01593 fig=plt.figure()
01594 ax=fig.add_subplot(111)
01595 legend_data=[]
01596
01597 for i in range(len(data)):
01598 legend_data.append(ax.bar(ind+i*single_bar_width+(1-width)/2,data[i],single_bar_width,bottom=bottom,color=colors[i])[0])
01599
01600 if title!=None:
01601 ax.set_title(title, size='x-large', fontweight='bold')
01602
01603 if y_title!=None:
01604 nice_y=y_title
01605 else:
01606 nice_y="value"
01607 ax.set_ylabel(nice_y)
01608
01609 if xlabels:
01610 if len(data[0])!=len(xlabels):
01611 raise ValueError('Number of xlabels is not consistent with number of cols!')
01612 else:
01613 xlabels=cols
01614
01615 if set_xlabels:
01616 ax.set_xticks(ind+0.5)
01617 ax.set_xticklabels(xlabels, rotation = xlabels_rotation)
01618 else:
01619 ax.set_xticks([])
01620
01621 if legend == True:
01622 if legend_names==None:
01623 raise ValueError('You must provide legend names! e.g. names for the rows, '\
01624 'that are printed in parallel.')
01625 if len(legend_names)!=len(data):
01626 raise ValueError('length of legend_names must be consistent with number '\
01627 'of plotted rows!')
01628 ax.legend(legend_data, legend_names)
01629
01630 if save:
01631 plt.savefig(save)
01632
01633 if show:
01634 plt.show()
01635
01636 return plt
01637
01638 def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
01639 colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False):
01640
01641 """
01642 Create a heatplot of the data in col x vs the data in col y using matplotlib
01643
01644 :param x: column name with x data
01645 :type x: :class:`str`
01646
01647 :param y: column name with y data
01648 :type y: :class:`str`
01649
01650 :param title: title of the plot, will be generated automatically if set to None
01651 :type title: :class:`str`
01652
01653 :param x_title: label of x-axis, will be generated automatically if set to None
01654 :type x_title: :class:`str`
01655
01656 :param y_title: label of y-axis, will be generated automatically if set to None
01657 :type y_title: :class:`str`
01658
01659 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
01660 :type x_range: :class:`list` of length two
01661
01662 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
01663 :type y_range: :class:`list` of length two
01664
01665 :param binning: type of binning. If set to None, the value of a hexbin will
01666 correspond to the number of datapoints falling into it. If
01667 set to 'log', the value will be the log with base 10 of the above
01668 value (log(i+1)). If an integer is provided, the number of a
01669 hexbin is equal the number of datapoints falling into it divided
01670 by the integer. If a list of values is provided, these values
01671 will be the lower bounds of the bins.
01672
01673 :param colormap: colormap, that will be used. Value can be every colormap defined
01674 in matplotlib or an own defined colormap. You can either pass a
01675 string with the name of the matplotlib colormap or a colormap
01676 object.
01677
01678 :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
01679 :type show_scalebar: :class:`bool`
01680
01681 :param scalebar_label: Label of the scalebar
01682 :type scalebar_label: :class:`str`
01683
01684 :param clear: clear old data from plot
01685 :type clear: :class:`bool`
01686
01687 :param save: filename for saving plot
01688 :type save: :class:`str`
01689
01690 :param show: directly show plot
01691 :type show: :class:`bool`
01692
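    **Example** (a sketch; the column names are placeholders):

    .. code-block:: python

      # hexbin density plot of column 'x' vs. column 'y'
      plt = tab.PlotHexbin('x', 'y', show_scalebar=True)
      plt.show()
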
01693 """
01694
01695 try:
01696 import matplotlib.pyplot as plt
01697 import matplotlib.cm as cm
01698 except:
01699 raise ImportError('PlotHexbin relies on matplotlib, but I could not import it')
01700
01701 idx=self.GetColIndex(x)
01702 idy=self.GetColIndex(y)
01703 xdata=[]
01704 ydata=[]
01705
01706 for r in self.rows:
01707 if r[idx]!=None and r[idy]!=None:
01708 xdata.append(r[idx])
01709 ydata.append(r[idy])
01710
01711 if clear:
01712 plt.clf()
01713
01714 if x_title!=None:
01715 nice_x=x_title
01716 else:
01717 nice_x=MakeTitle(x)
01718
01719 if y_title!=None:
01720 nice_y=y_title
01721 else:
01722 nice_y=MakeTitle(y)
01723
01724 if title==None:
01725 title = '%s vs. %s' % (nice_x, nice_y)
01726
01727 if IsStringLike(colormap):
01728 colormap=getattr(cm, colormap)
01729
01730 if x_range and (IsScalar(x_range) or len(x_range)!=2):
01731 raise ValueError('parameter x_range must contain exactly two elements')
01732 if y_range and (IsScalar(y_range) or len(y_range)!=2):
01733 raise ValueError('parameter y_range must contain exactly two elements')
01734
01735 ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
01736
01737 if x_range:
01738 plt.xlim((x_range[0], x_range[1]))
01739 ext[0]=x_range[0]
01740 ext[1]=x_range[1]
01741 if y_range:
01742 plt.ylim(y_range[0], y_range[1])
01743 ext[2]=y_range[0]
01744 ext[3]=y_range[1]
01745
01746
01747 plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
01748
01749 plt.title(title, size='x-large', fontweight='bold',
01750 verticalalignment='bottom')
01751
01752 plt.xlabel(nice_x)
01753 plt.ylabel(nice_y)
01754
01755 if show_scalebar:
01756 cb=plt.colorbar()
01757 if scalebar_label:
01758 cb.set_label(scalebar_label)
01759
01760 if save:
01761 plt.savefig(save)
01762
01763 if show:
01764 plt.show()
01765
01766 return plt
01767
01768 def MaxRow(self, col):
01769 """
01770 Returns the row containing the cell with the maximal value in col. If
01771 several rows have the highest value, only the first one is returned.
01772 ''None'' values are ignored.
01773
01774 :param col: column name
01775 :type col: :class:`str`
01776
01777 :returns: row with maximal col value or None if the table is empty
01778 """
01779 val, idx = self._Max(col)
01780 if idx!=None:
01781 return self.rows[idx]
01782
01783 def Max(self, col):
01784 """
01785 Returns the maximum value in col. If several rows have the highest value,
01786 only the first one is returned. ''None'' values are ignored.
01787
01788 :param col: column name
01789 :type col: :class:`str`
01790 """
01791 val, idx = self._Max(col)
01792 return val
01793
01794 def MaxIdx(self, col):
01795 """
01796 Returns the row index of the cell with the maximal value in col. If
01797 several rows have the highest value, only the first one is returned.
01798 ''None'' values are ignored.
01799
01800 :param col: column name
01801 :type col: :class:`str`
01802 """
01803 val, idx = self._Max(col)
01804 return idx
01805
01806 def _Min(self, col):
01807 if len(self.rows)==0:
01808 return None, None
01809 idx=self.GetColIndex(col)
01810 col_type = self.col_types[idx]
01811 if col_type=='int' or col_type=='float':
01812 min_val=float('inf')
01813 elif col_type=='bool':
01814 min_val=True
01815 elif col_type=='string':
01816 min_val=chr(255)
01817 min_idx=None
01818 for i,row in enumerate(self.rows):
01819 if row[idx]!=None and row[idx]<min_val:
01820 min_val=row[idx]
01821 min_idx=i
01822 return min_val, min_idx
01823
01824 def Min(self, col):
01825 """
01826 Returns the minimal value in col. If several rows have the lowest value,
01827 only the first one is returned. ''None'' values are ignored.
01828
01829 :param col: column name
01830 :type col: :class:`str`
01831 """
01832 val, idx = self._Min(col)
01833 return val
01834
01835 def MinRow(self, col):
01836 """
01837 Returns the row containing the cell with the minimal value in col. If
01838 several rows have the lowest value, only the first one is returned.
01839 ''None'' values are ignored.
01840
01841 :param col: column name
01842 :type col: :class:`str`
01843
01844 :returns: row with minimal col value or None if the table is empty
01845 """
01846 val, idx = self._Min(col)
01847 if idx!=None:
01848 return self.rows[idx]
01849
01850 def MinIdx(self, col):
01851 """
01852 Returns the row index of the cell with the minimal value in col. If
01853 several rows have the lowest value, only the first one is returned.
01854 ''None'' values are ignored.
01855
01856 :param col: column name
01857 :type col: :class:`str`
01858 """
01859 val, idx = self._Min(col)
01860 return idx
01861
01862 def Sum(self, col):
01863 """
01864 Returns the sum of the given column. Cells with ''None'' are ignored. Returns
01865 0.0, if the column doesn't contain any elements. Col must be of numeric
01866 column type ('float', 'int') or boolean column type.
01867
01868 :param col: column name
01869 :type col: :class:`str`
01870
01871 :raises: :class:`TypeError` if column type is ``string``
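
    **Example:** a minimal sketch, assuming a numeric column named 'weight'
    exists in the table ``tab``:

    .. code-block:: python

      total = tab.Sum('weight')  # 0.0 if the column has no non-None cells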
01872 """
01873 idx = self.GetColIndex(col)
01874 col_type = self.col_types[idx]
01875 if col_type!='int' and col_type!='float' and col_type!='bool':
01876 raise TypeError("Sum can only be used on numeric column types")
01877 s = 0.0
01878 for r in self.rows:
01879 if r[idx]!=None:
01880 s += r[idx]
01881 return s
01882
01883 def Mean(self, col):
01884 """
01885 Returns the mean of the given column. Cells with ''None'' are ignored. Returns
01886 None, if the column doesn't contain any elements. Col must be of numeric
01887 ('float', 'int') or boolean column type.
01888
01889     If the column type is *bool*, the function returns the ratio of the
01890     number of 'Trues' to the total number of elements.
01891
01892 :param col: column name
01893 :type col: :class:`str`
01894
01895 :raises: :class:`TypeError` if column type is ``string``
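
    **Example:** a minimal sketch; 'weight' and 'is_active' are assumed column
    names of type float and bool, respectively:

    .. code-block:: python

      avg  = tab.Mean('weight')     # arithmetic mean of non-None cells
      frac = tab.Mean('is_active')  # fraction of True values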
01896 """
01897 idx = self.GetColIndex(col)
01898 col_type = self.col_types[idx]
01899 if col_type!='int' and col_type!='float' and col_type!='bool':
01900 raise TypeError("Mean can only be used on numeric or bool column types")
01901
01902 vals=[]
01903 for v in self[col]:
01904 if v!=None:
01905 vals.append(v)
01906 try:
01907 return stutil.Mean(vals)
01908 except:
01909 return None
01910
01911 def RowMean(self, mean_col_name, cols):
01912 """
01913 Adds a new column of type 'float' with a specified name (*mean_col_name*),
01914 containing the mean of all specified columns for each row.
01915
01916 Cols are specified by their names and must be of numeric column
01917 type ('float', 'int') or boolean column type. Cells with None are ignored.
01918 Adds ''None'' if the row doesn't contain any values.
01919
01920 :param mean_col_name: name of new column containing mean values
01921 :type mean_col_name: :class:`str`
01922
01923 :param cols: name or list of names of columns to include in computation of
01924 mean
01925 :type cols: :class:`str` or :class:`list` of strings
01926
01927 :raises: :class:`TypeError` if column type of columns in *col* is ``string``
01928
01929 == Example ==
01930
01931     Starting with the following table:
01932
01933 ==== ==== ====
01934 x y u
01935 ==== ==== ====
01936 1 10 100
01937 2 15 None
01938 3 20 400
01939 ==== ==== ====
01940
01941 the code here adds a column with the name 'mean' to yield the table below:
01942
01943     .. code-block:: python
01944
01945 tab.RowMean('mean', ['x', 'u'])
01946
01947
01948 ==== ==== ==== =====
01949 x y u mean
01950 ==== ==== ==== =====
01951 1 10 100 50.5
01952 2 15 None 2
01953 3 20 400 201.5
01954 ==== ==== ==== =====
01955
01956 """
01957
01958 if IsScalar(cols):
01959 cols = [cols]
01960
01961 cols_idxs = []
01962 for col in cols:
01963 idx = self.GetColIndex(col)
01964 col_type = self.col_types[idx]
01965 if col_type!='int' and col_type!='float' and col_type!='bool':
01966 raise TypeError("RowMean can only be used on numeric column types")
01967 cols_idxs.append(idx)
01968
01969 mean_rows = []
01970 for row in self.rows:
01971 vals = []
01972 for idx in cols_idxs:
01973 v = row[idx]
01974 if v!=None:
01975 vals.append(v)
01976 try:
01977 mean = stutil.Mean(vals)
01978 mean_rows.append(mean)
01979 except:
01980 mean_rows.append(None)
01981
01982 self.AddCol(mean_col_name, 'f', mean_rows)
01983
01984 def Percentiles(self, col, nths):
01985 """
01986 Returns the percentiles of column *col* given in *nths*.
01987
01988 The percentiles are calculated as
01989
01990 .. code-block:: python
01991
01992 values[min(len(values), int(round(len(values)*nth/100+0.5)-1))]
01993
01994 where values are the sorted values of *col* not equal to ''None''
01995
01996 :param col: column name
01997 :type col: :class:`str`
01998 :param nths: list of percentiles to be calculated. Each percentile is a
01999 number between 0 and 100.
02000 :type nths: :class:`list` of numbers
02001
02002 :raises: :class:`TypeError` if column type is ``string``
02003 :returns: List of percentiles in the same order as given in *nths*
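
    **Example:** a minimal sketch, assuming a numeric column named 'score':

    .. code-block:: python

      q25, q50, q75 = tab.Percentiles('score', [25, 50, 75])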
02004 """
02005 idx = self.GetColIndex(col)
02006 col_type = self.col_types[idx]
02007 if col_type!='int' and col_type!='float' and col_type!='bool':
02008       raise TypeError("Percentiles can only be used on numeric column types")
02009
02010 for nth in nths:
02011 if nth < 0 or nth > 100:
02012 raise ValueError("percentiles must be between 0 and 100")
02013 vals=[]
02014 for v in self[col]:
02015 if v!=None:
02016 vals.append(v)
02017 vals=sorted(vals)
02018 if len(vals)==0:
02019 return [None]*len(nths)
02020 percentiles=[]
02021
02022 for nth in nths:
02023 p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
02024 percentiles.append(p)
02025 return percentiles
02026
02027 def Median(self, col):
02028 """
02029 Returns the median of the given column. Cells with ''None'' are ignored. Returns
02030 ''None'', if the column doesn't contain any elements. Col must be of numeric
02031 column type ('float', 'int') or boolean column type.
02032
02033 :param col: column name
02034 :type col: :class:`str`
02035
02036 :raises: :class:`TypeError` if column type is ``string``
02037 """
02038 idx = self.GetColIndex(col)
02039 col_type = self.col_types[idx]
02040 if col_type!='int' and col_type!='float' and col_type!='bool':
02041 raise TypeError("Median can only be used on numeric column types")
02042
02043 vals=[]
02044 for v in self[col]:
02045 if v!=None:
02046 vals.append(v)
02048 try:
02049 return stutil.Median(vals)
02050 except:
02051 return None
02052
02053 def StdDev(self, col):
02054 """
02055 Returns the standard deviation of the given column. Cells with ''None'' are
02056 ignored. Returns ''None'', if the column doesn't contain any elements. Col must
02057 be of numeric column type ('float', 'int') or boolean column type.
02058
02059 :param col: column name
02060 :type col: :class:`str`
02061
02062 :raises: :class:`TypeError` if column type is ``string``
02063 """
02064 idx = self.GetColIndex(col)
02065 col_type = self.col_types[idx]
02066 if col_type!='int' and col_type!='float' and col_type!='bool':
02067 raise TypeError("StdDev can only be used on numeric column types")
02068
02069 vals=[]
02070 for v in self[col]:
02071 if v!=None:
02072 vals.append(v)
02073 try:
02074 return stutil.StdDev(vals)
02075 except:
02076 return None
02077
02078 def Count(self, col, ignore_nan=True):
02079 """
02080 Count the number of cells in column that are not equal to ''None''.
02081
02082 :param col: column name
02083 :type col: :class:`str`
02084
02085 :param ignore_nan: ignore all *None* values
02086 :type ignore_nan: :class:`bool`
02087 """
02088 count=0
02089 idx=self.GetColIndex(col)
02090 for r in self.rows:
02091 if ignore_nan:
02092 if r[idx]!=None:
02093 count+=1
02094 else:
02095 count+=1
02096 return count
02097
02098 def Correl(self, col1, col2):
02099 """
02100 Calculate the Pearson correlation coefficient between *col1* and *col2*, only
02101 taking rows into account where both of the values are not equal to *None*.
02102 If there are not enough data points to calculate a correlation coefficient,
02103 *None* is returned.
02104
02105 :param col1: column name for first column
02106 :type col1: :class:`str`
02107
02108 :param col2: column name for second column
02109 :type col2: :class:`str`
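
    **Example:** a minimal sketch; 'predicted' and 'measured' are assumed
    numeric column names:

    .. code-block:: python

      r = tab.Correl('predicted', 'measured')  # None if too few data points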
02110 """
02111 if IsStringLike(col1) and IsStringLike(col2):
02112 col1 = self.GetColIndex(col1)
02113 col2 = self.GetColIndex(col2)
02114 vals1, vals2=([],[])
02115 for v1, v2 in zip(self[col1], self[col2]):
02116 if v1!=None and v2!=None:
02117 vals1.append(v1)
02118 vals2.append(v2)
02119 try:
02120 return stutil.Correl(vals1, vals2)
02121 except:
02122 return None
02123
02124 def SpearmanCorrel(self, col1, col2):
02125 """
02126 Calculate the Spearman correlation coefficient between col1 and col2, only
02127 taking rows into account where both of the values are not equal to None. If
02128 there are not enough data points to calculate a correlation coefficient,
02129 None is returned.
02130
02131 :warning: The function depends on the following module: *scipy.stats.mstats*
02132
02133 :param col1: column name for first column
02134 :type col1: :class:`str`
02135
02136 :param col2: column name for second column
02137 :type col2: :class:`str`
02138 """
02139 try:
02140 import scipy.stats.mstats
02141
02142 if IsStringLike(col1) and IsStringLike(col2):
02143 col1 = self.GetColIndex(col1)
02144 col2 = self.GetColIndex(col2)
02145 vals1, vals2=([],[])
02146 for v1, v2 in zip(self[col1], self[col2]):
02147 if v1!=None and v2!=None:
02148 vals1.append(v1)
02149 vals2.append(v2)
02150 try:
02151 correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
02152 if scipy.isnan(correl):
02153 return None
02154 return correl
02155 except:
02156 return None
02157
02158 except ImportError:
02159 LogError("Function needs scipy.stats.mstats, but I could not import it.")
02160 raise
02161
02162
02163 def Save(self, stream_or_filename, format='ost', sep=','):
02164 """
02165     Save the table to stream or filename. The following file formats
02166     are supported (for more information on file formats, see :meth:`Load`):
02167
02168 ============= =======================================
02169 ost ost-specific format (human readable)
02170 csv comma separated values (human readable)
02171 pickle pickled byte stream (binary)
02172 html HTML table
02173 context ConTeXt table
02174 ============= =======================================
02175
02176 :param stream_or_filename: filename or stream for writing output
02177 :type stream_or_filename: :class:`str` or :class:`file`
02178
02179     :param format: output format (*ost*, *csv*, *pickle*, *html* or *context*)
02180 :type format: :class:`str`
02181
02182 :raises: :class:`ValueError` if format is unknown
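
    **Example:** a minimal sketch writing the same table in two formats; the
    file names are purely illustrative:

    .. code-block:: python

      tab.Save('results.ost')                         # ost format (default)
      tab.Save('results.csv', format='csv', sep=';')  # csv with ';' separator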
02183 """
02184 format=format.lower()
02185 if format=='ost':
02186 return self._SaveOST(stream_or_filename)
02187 if format=='csv':
02188 return self._SaveCSV(stream_or_filename, sep=sep)
02189 if format=='pickle':
02190 return self._SavePickle(stream_or_filename)
02191 if format=='html':
02192 return self._SaveHTML(stream_or_filename)
02193 if format=='context':
02194 return self._SaveContext(stream_or_filename)
02195 raise ValueError('unknown format "%s"' % format)
02196
02197 def _SavePickle(self, stream):
02198 if not hasattr(stream, 'write'):
02199 stream=open(stream, 'wb')
02200 cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
02201
02202 def _SaveHTML(self, stream_or_filename):
02203 def _escape(s):
02204       return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
02205
02206 file_opened = False
02207 if not hasattr(stream_or_filename, 'write'):
02208 stream = open(stream_or_filename, 'w')
02209 file_opened = True
02210 else:
02211 stream = stream_or_filename
02212 stream.write('<table>')
02213 stream.write('<tr>')
02214 for col_name in self.col_names:
02215 stream.write('<th>%s</th>' % _escape(col_name))
02216 stream.write('</tr>')
02217 for row in self.rows:
02218 stream.write('<tr>')
02219 for i, col in enumerate(row):
02220 val = ''
02221 if col != None:
02222 if self.col_types[i] == 'float':
02223 val = '%.3f' % col
02224 elif self.col_types[i] == 'int':
02225 val = '%d' % col
02226 elif self.col_types[i] == 'bool':
02227 val = col and 'true' or 'false'
02228 else:
02229 val = str(col)
02230 stream.write('<td>%s</td>' % _escape(val))
02231 stream.write('</tr>')
02232 stream.write('</table>')
02233 if file_opened:
02234 stream.close()
02235 def _SaveContext(self, stream_or_filename):
02236 file_opened = False
02237 if not hasattr(stream_or_filename, 'write'):
02238 stream = open(stream_or_filename, 'w')
02239 file_opened = True
02240 else:
02241 stream = stream_or_filename
02242 stream.write('\\starttable[')
02243 for col_type in self.col_types:
02244 if col_type =='string':
02245 stream.write('l|')
02246 elif col_type=='int':
02247 stream.write('r|')
02248 elif col_type =='float':
02249 stream.write('i3r|')
02250 else:
02251 stream.write('l|')
02252 stream.write(']\n\\HL\n')
02253 for col_name in self.col_names:
02254 stream.write('\\NC \\bf %s' % col_name)
02255 stream.write(' \\AR\\HL\n')
02256 for row in self.rows:
02257 for i, col in enumerate(row):
02258 val = '---'
02259 if col != None:
02260 if self.col_types[i] == 'float':
02261 val = '%.3f' % col
02262 elif self.col_types[i] == 'int':
02263 val = '%d' % col
02264 elif self.col_types[i] == 'bool':
02265 val = col and 'true' or 'false'
02266 else:
02267 val = str(col)
02268 stream.write('\\NC %s' % val)
02269 stream.write(' \\AR\n')
02270 stream.write('\\HL\n')
02271 stream.write('\\stoptable')
02272 if file_opened:
02273 stream.close()
02274
02275 def _SaveCSV(self, stream, sep):
02276 if not hasattr(stream, 'write'):
02277 stream=open(stream, 'wb')
02278
02279 writer=csv.writer(stream, delimiter=sep)
02280 writer.writerow(['%s' % n for n in self.col_names])
02281 for row in self.rows:
02282 row=list(row)
02283 for i, c in enumerate(row):
02284 if c==None:
02285 row[i]='NA'
02286 writer.writerow(row)
02287
02288 def _SaveOST(self, stream):
02289 if hasattr(stream, 'write'):
02290 writer=csv.writer(stream, delimiter=' ')
02291 else:
02292 stream=open(stream, 'w')
02293 writer=csv.writer(stream, delimiter=' ')
02294 if self.comment:
02295 stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
02296 writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
02297 for row in self.rows:
02298 row=list(row)
02299 for i, c in enumerate(row):
02300 if c==None:
02301 row[i]='NA'
02302 writer.writerow(row)
02303
02304
02305 def GetNumpyMatrix(self, *args):
02306 '''
02307 Returns a numpy matrix containing the selected columns from the table as
02308 columns in the matrix.
02309
02310 Only columns of type *int* or *float* are supported. *NA* values in the
02311 table will be converted to *None* values.
02312
02313 :param \*args: column names to include in numpy matrix
02314
02315 :warning: The function depends on *numpy*
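
    **Example:** a minimal sketch, assuming numeric columns 'x' and 'y':

    .. code-block:: python

      m = tab.GetNumpyMatrix('x', 'y')  # N x 2 matrix, one matrix column per table column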
02316 '''
02317 try:
02318 import numpy as np
02319
02320 if len(args)==0:
02321 raise RuntimeError("At least one column must be specified.")
02322
02323 idxs = []
02324 for arg in args:
02325 idx = self.GetColIndex(arg)
02326 col_type = self.col_types[idx]
02327 if col_type!='int' and col_type!='float':
02328 raise TypeError("Numpy matrix can only be generated from numeric column types")
02329 idxs.append(idx)
02330 m = np.matrix([list(self[i]) for i in idxs])
02331 return m.T
02332
02333 except ImportError:
02334 LogError("Function needs numpy, but I could not import it.")
02335 raise
02336
02337
02338
02339 def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
02340
02341 '''
02342     In-place Gaussian smoothing of a column in the table with a given standard
02343     deviation. All ''None'' values are set to *na_value* before smoothing.
02344
02345 :param col: column name
02346 :type col: :class:`str`
02347
02348 :param std: standard deviation for gaussian kernel
02349 :type std: `scalar`
02350
02351     :param na_value: all NA (None) values of the specified column are set to na_value before smoothing
02352 :type na_value: `scalar`
02353
02354     :param padding: padding behaviour; see the scipy ndimage.gaussian_filter1d documentation for the available modes. The default is 'reflect'
02355 :type padding: :class:`str`
02356
02357 :param c: constant value used for padding if padding mode is constant
02358 :type c: `scalar`
02359
02360
02361
02362 :warning: The function depends on *scipy*
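
    **Example:** a minimal sketch, assuming a float column named 'signal':

    .. code-block:: python

      tab.GaussianSmooth('signal', std=2.0, na_value=0.0, padding='reflect')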
02363 '''
02364
02365 try:
02366 from scipy import ndimage
02367 import numpy as np
02368 except ImportError:
02369       LogError("I need scipy.ndimage and numpy, but could not import them")
02370 raise
02371
02372 idx = self.GetColIndex(col)
02373 col_type = self.col_types[idx]
02374 if col_type!='int' and col_type!='float':
02375 raise TypeError("GaussianSmooth can only be used on numeric column types")
02376
02377 vals=[]
02378 for v in self[col]:
02379 if v!=None:
02380 vals.append(v)
02381 else:
02382 vals.append(na_value)
02383
02384
02385 smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
02386
02387 result=[]
02388
02389 for v in smoothed_values_ndarray:
02390 result.append(v)
02391
02392 self[col]=result
02393
02394
02395 def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
02396 '''
02397 This returns the optimal prefactor values (i.e. a, b, c, ...) for the
02398 following equation
02399
02400 .. math::
02401 :label: op1
02402
02403 a*u + b*v + c*w + ... = z
02404
02405 where u, v, w and z are vectors. In matrix notation
02406
02407 .. math::
02408 :label: op2
02409
02410 A*p = z
02411
02412 where A contains the data from the table (u,v,w,...), p are the prefactors
02413 to optimize (a,b,c,...) and z is the vector containing the result of
02414 equation :eq:`op1`.
02415
02416     The parameter *ref_col* corresponds to z in both equations, and \*args are the
02417     columns u, v and w (or A in :eq:`op2`). All columns must be specified by their names.
02418
02419 **Example:**
02420
02421 .. code-block:: python
02422
02423 tab.GetOptimalPrefactors('colC', 'colA', 'colB')
02424
02425     The function returns a list containing the prefactors a, b, c, ... in the
02426     same order as the columns were specified in \*args.
02427
02428 Weighting:
02429 If the kwarg weights="columX" is specified, the equations are weighted by
02430 the values in that column. Each row is multiplied by the weight in that row,
02431 which leads to :eq:`op3`:
02432
02433 .. math::
02434 :label: op3
02435
02436 weight*a*u + weight*b*v + weight*c*w + ... = weight*z
02437
02438 Weights must be float or int and can have any value. A value of 0 ignores
02439 this equation, a value of 1 means the same as no weight. If all weights are
02440 the same for each row, the same result will be obtained as with no weights.
02441
02442 **Example:**
02443
02444 .. code-block:: python
02445
02446 tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
02447
02448 '''
02449 try:
02450 import numpy as np
02451
02452 if len(args)==0:
02453 raise RuntimeError("At least one column must be specified.")
02454
02455 b = self.GetNumpyMatrix(ref_col)
02456 a = self.GetNumpyMatrix(*args)
02457
02458 if len(kwargs)!=0:
02459 if kwargs.has_key('weights'):
02460 w = self.GetNumpyMatrix(kwargs['weights'])
02461 b = np.multiply(b,w)
02462 a = np.multiply(a,w)
02463
02464 else:
02465           raise RuntimeError("unrecognized kwargs given; only 'weights' is supported")
02466
02467 k = (a.T*a).I*a.T*b
02468 return list(np.array(k.T).reshape(-1))
02469
02470 except ImportError:
02471 LogError("Function needs numpy, but I could not import it.")
02472 raise
02473
02474 def PlotEnrichment(self, score_col, class_col, score_dir='-',
02475 class_dir='-', class_cutoff=2.0,
02476 style='-', title=None, x_title=None, y_title=None,
02477 clear=True, save=None):
02478 '''
02479 Plot an enrichment curve using matplotlib of column *score_col* classified
02480 according to *class_col*.
02481
02482 For more information about parameters of the enrichment, see
02483 :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
02484
02485 :warning: The function depends on *matplotlib*
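
    **Example:** a minimal sketch; 'score' and 'is_active' are assumed column
    names (numeric score, bool classification):

    .. code-block:: python

      p = tab.PlotEnrichment('score', 'is_active', save='enrichment.png')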
02486 '''
02487 try:
02488 import matplotlib.pyplot as plt
02489
02490 enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
02491 class_dir, class_cutoff)
02492
02493 if not title:
02494 title = 'Enrichment of %s'%score_col
02495
02496 if not x_title:
02497 x_title = '% database'
02498
02499 if not y_title:
02500 y_title = '% positives'
02501
02502 if clear:
02503 plt.clf()
02504
02505 plt.plot(enrx, enry, style)
02506
02507 plt.title(title, size='x-large', fontweight='bold')
02508 plt.ylabel(y_title, size='x-large')
02509 plt.xlabel(x_title, size='x-large')
02510
02511 if save:
02512 plt.savefig(save)
02513
02514 return plt
02515 except ImportError:
02516 LogError("Function needs matplotlib, but I could not import it.")
02517 raise
02518
02519 def ComputeEnrichment(self, score_col, class_col, score_dir='-',
02520 class_dir='-', class_cutoff=2.0):
02521 '''
02522 Computes the enrichment of column *score_col* classified according to
02523 *class_col*.
02524
02525 For this it is necessary, that the datapoints are classified into positive
02526 and negative points. This can be done in two ways:
02527
02528 - by using one 'bool' type column (*class_col*) which contains *True* for
02529 positives and *False* for negatives
02530
02531 - by specifying a classification column (*class_col*), a cutoff value
02532 (*class_cutoff*) and the classification columns direction (*class_dir*).
02533 This will generate the classification on the fly
02534
02535 * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
02536 * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
02537
02538 During the calculation, the table will be sorted according to *score_dir*,
02539     where a '-' value means smallest values first, i.e. the smaller the value,
02540     the better.
02541
02542 :warning: If either the value of *class_col* or *score_col* is *None*, the
02543 data in this row is ignored.
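
    **Example:** a minimal sketch; 'score' is an assumed numeric column and
    'is_active' an assumed bool classification column:

    .. code-block:: python

      enr = tab.ComputeEnrichment('score', 'is_active', score_dir='-')
      if enr:
        frac_db, frac_positives = enr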
02544 '''
02545
02546 ALLOWED_DIR = ['+','-']
02547
02548 score_idx = self.GetColIndex(score_col)
02549 score_type = self.col_types[score_idx]
02550 if score_type!='int' and score_type!='float':
02551 raise TypeError("Score column must be numeric type")
02552
02553 class_idx = self.GetColIndex(class_col)
02554 class_type = self.col_types[class_idx]
02555 if class_type!='int' and class_type!='float' and class_type!='bool':
02556 raise TypeError("Classifier column must be numeric or bool type")
02557
02558 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02559 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02560
02561 self.Sort(score_col, score_dir)
02562
02563 x = [0]
02564 y = [0]
02565 enr = 0
02566 old_score_val = None
02567 i = 0
02568
02569 for row in self.rows:
02570 class_val = row[class_idx]
02571 score_val = row[score_idx]
02572 if class_val==None or score_val==None:
02573 continue
02574 if class_val!=None:
02575 if old_score_val==None:
02576 old_score_val = score_val
02577 if score_val!=old_score_val:
02578 x.append(i)
02579 y.append(enr)
02580 old_score_val = score_val
02581 i+=1
02582 if class_type=='bool':
02583 if class_val==True:
02584 enr += 1
02585 else:
02586 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
02587 enr += 1
02588 x.append(i)
02589 y.append(enr)
02590
02591     # if no datapoints or no positives are found, the enrichment is undefined
02592 if x[-1]==0 or y[-1]==0:
02593 return None
02594
02595 x = [float(v)/x[-1] for v in x]
02596 y = [float(v)/y[-1] for v in y]
02597 return x,y
02598
02599 def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
02600 class_dir='-', class_cutoff=2.0):
02601 '''
02602 Computes the area under the curve of the enrichment using the trapezoidal
02603 rule.
02604
02605 For more information about parameters of the enrichment, see
02606 :meth:`ComputeEnrichment`.
02607
02608 :warning: The function depends on *numpy*
02609 '''
02610 try:
02611 import numpy as np
02612
02613 enr = self.ComputeEnrichment(score_col, class_col, score_dir,
02614 class_dir, class_cutoff)
02615
02616 if enr==None:
02617 return None
02618 return np.trapz(enr[1], enr[0])
02619 except ImportError:
02620 LogError("Function needs numpy, but I could not import it.")
02621 raise
02622
02623 def ComputeROC(self, score_col, class_col, score_dir='-',
02624 class_dir='-', class_cutoff=2.0):
02625 '''
02626 Computes the receiver operating characteristics (ROC) of column *score_col*
02627 classified according to *class_col*.
02628
02629 For this it is necessary, that the datapoints are classified into positive
02630 and negative points. This can be done in two ways:
02631
02632 - by using one 'bool' column (*class_col*) which contains True for positives
02633 and False for negatives
02634 - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
02635 and the classification columns direction (*class_dir*). This will generate
02636 the classification on the fly
02637
02638 - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
02639 - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
02640
02641 During the calculation, the table will be sorted according to *score_dir*,
02642     where a '-' value means smallest values first, i.e. the smaller the value,
02643     the better.
02644
02645     If *class_col* does not contain any positives (i.e. no value is True for a
02646     bool column, or no value passes the *class_dir*/*class_cutoff* test for an
02647     int or float column), the ROC is not defined and the function will
02648     return *None*.
02649
02650 :warning: If either the value of *class_col* or *score_col* is *None*, the
02651 data in this row is ignored.
02652 '''
02653
02654 ALLOWED_DIR = ['+','-']
02655
02656 score_idx = self.GetColIndex(score_col)
02657 score_type = self.col_types[score_idx]
02658 if score_type!='int' and score_type!='float':
02659 raise TypeError("Score column must be numeric type")
02660
02661 class_idx = self.GetColIndex(class_col)
02662 class_type = self.col_types[class_idx]
02663 if class_type!='int' and class_type!='float' and class_type!='bool':
02664 raise TypeError("Classifier column must be numeric or bool type")
02665
02666 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02667 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02668
02669 self.Sort(score_col, score_dir)
02670
02671 x = [0]
02672 y = [0]
02673 tp = 0
02674 fp = 0
02675 old_score_val = None
02676
02677 for i,row in enumerate(self.rows):
02678 class_val = row[class_idx]
02679 score_val = row[score_idx]
02680 if class_val==None or score_val==None:
02681 continue
02682 if class_val!=None:
02683 if old_score_val==None:
02684 old_score_val = score_val
02685 if score_val!=old_score_val:
02686 x.append(fp)
02687 y.append(tp)
02688 old_score_val = score_val
02689 if class_type=='bool':
02690 if class_val==True:
02691 tp += 1
02692 else:
02693 fp += 1
02694 else:
02695 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
02696 tp += 1
02697 else:
02698 fp += 1
02699 x.append(fp)
02700 y.append(tp)
02701
02702     # if no false positives or no true positives are found, the ROC is undefined
02703 if x[-1]==0 or y[-1]==0:
02704 return None
02705
02706 x = [float(v)/x[-1] for v in x]
02707 y = [float(v)/y[-1] for v in y]
02708 return x,y
02709
02710 def ComputeROCAUC(self, score_col, class_col, score_dir='-',
02711 class_dir='-', class_cutoff=2.0):
02712 '''
02713 Computes the area under the curve of the receiver operating characteristics
02714 using the trapezoidal rule.
02715
02716 For more information about parameters of the ROC, see
02717 :meth:`ComputeROC`.
02718
02719 :warning: The function depends on *numpy*
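
    **Example:** a minimal sketch; 'score' and 'is_active' are assumed column
    names (numeric score, bool classification):

    .. code-block:: python

      auc = tab.ComputeROCAUC('score', 'is_active', score_dir='-')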
02720 '''
02721 try:
02722 import numpy as np
02723
02724 roc = self.ComputeROC(score_col, class_col, score_dir,
02725 class_dir, class_cutoff)
02726
02727 if not roc:
02728 return None
02729 return np.trapz(roc[1], roc[0])
02730 except ImportError:
02731 LogError("Function needs numpy, but I could not import it.")
02732 raise
02733
02734 def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
02735 class_dir='-', class_cutoff=2.0):
02736 '''
02737 Computes the area under the curve of the log receiver operating
02738 characteristics (logROC) where the x-axis is semilogarithmic
02739 using the trapezoidal rule.
02740
02741     The logROC is computed with a lambda of 0.001 according to
02742     Mysinger M. and Shoichet B., "Rapid Context-Dependent Ligand Desolvation in
02743     Molecular Docking", Journal of Chemical Information and Modeling,
02744     2010, 50 (9), 1561-1573.
02745
02746 For more information about parameters of the ROC, see
02747 :meth:`ComputeROC`.
02748
02749 :warning: The function depends on *numpy*
02750 '''
02751 try:
02752 import numpy as np
02753
02754 roc = self.ComputeROC(score_col, class_col, score_dir,
02755 class_dir, class_cutoff)
02756
02757 if not roc:
02758 return None
02759
02760 rocxt, rocyt = roc
02761 rocx=[]
02762 rocy=[]
02763
02764 # define lambda
02765 l=0.001
02766
02767       # clamp zero x-values to lambda, then drop duplicate x-values
02768 rocxt = [x if x>0 else l for x in rocxt]
02769 for i in range(len(rocxt)-1):
02770 if rocxt[i]==rocxt[i+1]:
02771 continue
02772 rocx.append(rocxt[i])
02773 rocy.append(rocyt[i])
02774 rocx.append(1.0)
02775 rocy.append(1.0)
02776
02777 # compute logauc
02778 value = 0
02779 for i in range(len(rocx)-1):
02780 x = rocx[i]
02781 if rocx[i]==rocx[i+1]:
02782 continue
02783 b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
02784 value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
02785 return value/math.log10(1.0/l)
02786
02787 except ImportError:
02788 LogError("Function needs numpy, but I could not import it.")
02789 raise
02790
02791 def PlotROC(self, score_col, class_col, score_dir='-',
02792 class_dir='-', class_cutoff=2.0,
02793 style='-', title=None, x_title=None, y_title=None,
02794 clear=True, save=None):
02795 '''
02796 Plot an ROC curve using matplotlib.
02797
02798 For more information about parameters of the ROC, see
02799 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
02800
02801 :warning: The function depends on *matplotlib*
02802 '''
02803
02804 try:
02805 import matplotlib.pyplot as plt
02806
02807 roc = self.ComputeROC(score_col, class_col, score_dir,
02808 class_dir, class_cutoff)
02809
02810 if not roc:
02811 return None
02812
02813 enrx, enry = roc
02814
02815 if not title:
02816 title = 'ROC of %s'%score_col
02817
02818 if not x_title:
02819 x_title = 'false positive rate'
02820
02821 if not y_title:
02822 y_title = 'true positive rate'
02823
02824 if clear:
02825 plt.clf()
02826
02827 plt.plot(enrx, enry, style)
02828
02829 plt.title(title, size='x-large', fontweight='bold')
02830 plt.ylabel(y_title, size='x-large')
02831 plt.xlabel(x_title, size='x-large')
02832
02833 if save:
02834 plt.savefig(save)
02835
02836 return plt
02837 except ImportError:
02838 LogError("Function needs matplotlib, but I could not import it.")
02839 raise
02840
02841 def PlotLogROC(self, score_col, class_col, score_dir='-',
02842 class_dir='-', class_cutoff=2.0,
02843 style='-', title=None, x_title=None, y_title=None,
02844 clear=True, save=None):
02845 '''
02846     Plot a logROC curve where the x-axis is semilogarithmic using matplotlib.
02847
02848 For more information about parameters of the ROC, see
02849 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
02850
02851 :warning: The function depends on *matplotlib*
02852 '''
02853
02854 try:
02855 import matplotlib.pyplot as plt
02856
02857 roc = self.ComputeROC(score_col, class_col, score_dir,
02858 class_dir, class_cutoff)
02859
02860 if not roc:
02861 return None
02862
02863 rocx, rocy = roc
02864
02865 if not title:
02866 title = 'logROC of %s'%score_col
02867
02868 if not x_title:
02869 x_title = 'false positive rate'
02870
02871 if not y_title:
02872 y_title = 'true positive rate'
02873
02874 if clear:
02875 plt.clf()
02876
02877 rocx = [x if x>0 else 0.001 for x in rocx]
02878
02879
02880 plt.plot(rocx, rocy, style)
02881
02882 plt.title(title, size='x-large', fontweight='bold')
02883 plt.ylabel(y_title, size='x-large')
02884 plt.xlabel(x_title, size='x-large')
02885
02886 plt.xscale('log', basex=10)
02887 plt.xlim(0.001, 1.0)
02888
02889
02890 if save:
02891 plt.savefig(save)
02892
02893 return plt
02894 except ImportError:
02895 LogError("Function needs matplotlib, but I could not import it.")
02896 raise
02897
02898 def ComputeMCC(self, score_col, class_col, score_dir='-',
02899 class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
02900 '''
02901 Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
02902 with the points classified into true positives, false positives, true
02903 negatives and false negatives according to a specified classification
02904 column (*class_col*).
02905
02906 The datapoints in *score_col* and *class_col* are classified into
02907 positive and negative points. This can be done in two ways:
02908
02909 - by using 'bool' columns which contains True for positives and False
02910 for negatives
02911
02912 - by using 'float' or 'int' columns and specifying a cutoff value and the
02913 columns direction. This will generate the classification on the fly
02914
02915 * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
02916 * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
02917
02918 The two possibilities can be used together, i.e. 'bool' type for one column
02919 and 'float'/'int' type and cutoff/direction for the other column.
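
    **Example:** a minimal sketch; 'score' (numeric) and 'is_active' (bool) are
    assumed column names, and the score cutoff is purely illustrative:

    .. code-block:: python

      mcc = tab.ComputeMCC('score', 'is_active', score_dir='-', score_cutoff=1.5)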
02920 '''
02921 ALLOWED_DIR = ['+','-']
02922
02923 score_idx = self.GetColIndex(score_col)
02924 score_type = self.col_types[score_idx]
02925 if score_type!='int' and score_type!='float' and score_type!='bool':
02926 raise TypeError("Score column must be numeric or bool type")
02927
02928 class_idx = self.GetColIndex(class_col)
02929 class_type = self.col_types[class_idx]
02930 if class_type!='int' and class_type!='float' and class_type!='bool':
02931 raise TypeError("Classifier column must be numeric or bool type")
02932
02933 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
02934 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
02935
02936 tp = 0
02937 fp = 0
02938 fn = 0
02939 tn = 0
02940
02941 for i,row in enumerate(self.rows):
02942 class_val = row[class_idx]
02943 score_val = row[score_idx]
02944 if class_val!=None:
02945 if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
02946 if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
02947 tp += 1
02948 else:
02949 fn += 1
02950 else:
02951 if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
02952 tn += 1
02953 else:
02954 fp += 1
02955
02956 mcc = None
02957 msg = None
02958 if (tp+fn)==0:
02959 msg = 'factor (tp + fn) is zero'
02960 elif (tp+fp)==0:
02961 msg = 'factor (tp + fp) is zero'
02962 elif (tn+fn)==0:
02963 msg = 'factor (tn + fn) is zero'
02964 elif (tn+fp)==0:
02965 msg = 'factor (tn + fp) is zero'
02966
02967 if msg:
02968 LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
02969 else:
02970 mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
02971 return mcc
02972
02973
02974 def IsEmpty(self, col_name=None, ignore_nan=True):
02975 '''
02976 Checks if a table is empty.
02977
02978 If no column name is specified, the whole table is checked for being empty,
02979 whereas if a column name is specified, only this column is checked.
02980
02981 By default, all NAN (or None) values are ignored, and thus, a table
02982 containing only NAN values is considered as empty. By specifying the
02983 option ignore_nan=False, NAN values are counted as 'normal' values.
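
    **Example:** a minimal sketch; the column name 'score' is illustrative:

    .. code-block:: python

      table_empty = tab.IsEmpty()        # whole table
      col_empty = tab.IsEmpty('score')   # single column, None values ignored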
02984 '''
02985
02986 # table with no columns and no rows
02987 if len(self.col_names)==0:
02988 if col_name:
02989 raise ValueError('Table has no column named "%s"' % col_name)
02990 return True
02991
02992 # column name specified
02993 if col_name:
02994 if self.Count(col_name, ignore_nan=ignore_nan)==0:
02995 return True
02996 else:
02997 return False
02998
02999 # no column name specified -> test whole table
03000 else:
03001 for row in self.rows:
03002 for cell in row:
03003 if ignore_nan:
03004 if cell!=None:
03005 return False
03006 else:
03007 return False
03008 return True
03009
03010
03011 def Extend(self, tab, overwrite=None):
03012 """
03013 Append each row of *tab* to the current table. The data is appended based
03014 on the column names, thus the order of the table columns is *not* relevant,
03015 only the header names.
03016
03017 If there is a column in *tab* that is not present in the current table,
03018 it is added to the current table and filled with *None* for all the rows
03019 present in the current table.
03020
03021 If the type of any column in *tab* is not the same as in the current table
03022 a *TypeError* is raised.
03023
03024     If *overwrite* is not None and set to an existing column name, the specified
03025     column in the current table is searched for the first occurrence of a value
03026     matching the value of the corresponding column in the row to be added. If a
03027     matching value is found, that row is overwritten with the new row. If no
03028     matching row is found, a new row is appended to the table.
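
    **Example:** a minimal sketch; ``other_tab`` is assumed to be a second
    :class:`Table` sharing (some of) the column names of ``tab``, and 'id' is an
    illustrative column name:

    .. code-block:: python

      tab.Extend(other_tab)                  # plain append of all rows
      tab.Extend(other_tab, overwrite='id')  # overwrite rows with matching 'id'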
03029 """
03030 # add column to current table if it doesn't exist
03031 for name,typ in zip(tab.col_names, tab.col_types):
03032 if not name in self.col_names:
03033 self.AddCol(name, typ)
03034
03035 # check that column types are the same in current and new table
03036 for name in self.col_names:
03037 if name in tab.col_names:
03038 curr_type = self.col_types[self.GetColIndex(name)]
03039 new_type = tab.col_types[tab.GetColIndex(name)]
03040 if curr_type!=new_type:
03041 raise TypeError('cannot extend table, column %s in new '%name +\
03042 'table different type (%s) than in '%new_type +\
03043 'current table (%s)'%curr_type)
03044
03045 num_rows = len(tab.rows)
03046 for i in range(0,num_rows):
03047 row = tab.rows[i]
03048 data = dict(zip(tab.col_names,row))
03049 self.AddRow(data, overwrite)
03050
03051
03052 def Merge(table1, table2, by, only_matching=False):
03053 """
03054 Returns a new table containing the data from both tables. The rows are
03055 combined based on the common values in the column(s) by. The option 'by' can
03056 be a list of column names. When this is the case, merging is based on
03057 multiple columns.
03058 For example, the two tables below
03059
03060 ==== ====
03061 x y
03062 ==== ====
03063 1 10
03064 2 15
03065 3 20
03066 ==== ====
03067
03068 ==== ====
03069 x u
03070 ==== ====
03071 1 100
03072 3 200
03073 4 400
03074 ==== ====
03075
03076 when merged by column x, produce the following output:
03077
03078 ===== ===== =====
03079 x y u
03080 ===== ===== =====
03081 1 10 100
03082 2 15 None
03083 3 20 200
03084 4 None 400
03085 ===== ===== =====
03086
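  A call producing the merged table above could look like this, assuming the
  two input tables are available as ``tab1`` and ``tab2``:

  .. code-block:: python

    merged = Merge(tab1, tab2, by='x')
    inner  = Merge(tab1, tab2, by='x', only_matching=True)  # only common x values
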
03087
03088 """
03089 def _key(row, indices):
03090 return tuple([row[i] for i in indices])
03091 def _keep(indices, cn, ct, ni):
03092 ncn, nct, nni=([],[],[])
03093 for i in range(len(cn)):
03094 if i not in indices:
03095 ncn.append(cn[i])
03096 nct.append(ct[i])
03097 nni.append(ni[i])
03098 return ncn, nct, nni
03099 col_names=list(table2.col_names)
03100 col_types=list(table2.col_types)
03101 new_index=[i for i in range(len(col_names))]
03102 if isinstance(by, str):
03103 common2_indices=[col_names.index(by)]
03104 else:
03105 common2_indices=[col_names.index(b) for b in by]
03106 col_names, col_types, new_index=_keep(common2_indices, col_names,
03107 col_types, new_index)
03108
03109 for i, name in enumerate(col_names):
03110 try_name=name
03111 counter=1
03112 while try_name in table1.col_names:
03113 counter+=1
03114 try_name='%s_%d' % (name, counter)
03115 col_names[i]=try_name
03116 common1={}
03117 if isinstance(by, str):
03118 common1_indices=[table1.col_names.index(by)]
03119 else:
03120 common1_indices=[table1.col_names.index(b) for b in by]
03121 for row in table1.rows:
03122 key=_key(row, common1_indices)
03123 if key in common1:
03124       raise ValueError('duplicate key "%s" in first table' % (str(key)))
03125 common1[key]=row
03126 common2={}
03127 for row in table2.rows:
03128 key=_key(row, common2_indices)
03129 if key in common2:
03130 raise ValueError('duplicate key "%s" in second table' % (str(key)))
03131 common2[key]=row
03132 new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
03133 for k, v in common1.iteritems():
03134 row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
03135 matched=False
03136 if k in common2:
03137 matched=True
03138 row2=common2[k]
03139 for i, index in enumerate(new_index):
03140 row[len(table1.col_names)+i]=row2[index]
03141 if only_matching and not matched:
03142 continue
03143 new_tab.AddRow(row)
03144 if only_matching:
03145 return new_tab
03146 for k, v in common2.iteritems():
03147 if not k in common1:
03148 v2=[v[i] for i in new_index]
03149 row=[None for i in range(len(table1.col_names))]+v2
03150 for common1_index, common2_index in zip(common1_indices, common2_indices):
03151 row[common1_index]=v[common2_index]
03152 new_tab.AddRow(row)
03153 return new_tab
03154