OpenStructure
table.py
Go to the documentation of this file.
1 import csv
2 import re
3 import math
4 from ost import stutil
5 import itertools
6 import operator
7 import pickle
8 import weakref
9 from ost import LogError, LogWarning, LogInfo, LogVerbose
10 
def MakeTitle(col_name):
  """
  Turn a column name into a human readable title.

  :param col_name: name of the column
  :type col_name: :class:`str`

  :returns: *col_name* with every underscore replaced by a single space
  """
  return ' '.join(col_name.split('_'))
13 
def IsStringLike(value):
  """
  Check whether *value* behaves like a string, i.e. whether a string can be
  concatenated to it.

  :class:`TableCol` and :class:`BinaryColExpr` instances are never considered
  string-like: they overload ``+`` to build column expressions, so the
  concatenation probe below would wrongly succeed for them.

  :returns: True if ``value+''`` succeeds, False otherwise
  """
  if isinstance(value, (TableCol, BinaryColExpr)):
    return False
  try:
    value+''
  except Exception:
    # only ordinary errors mean "not a string"; KeyboardInterrupt and
    # SystemExit must not be swallowed
    return False
  return True
22 
def IsNullString(value):
  """
  Check whether a string denotes a missing value.

  Leading/trailing whitespace and case are ignored.

  :param value: the string to test
  :type value: :class:`str`

  :returns: True if *value* is empty or one of NULL, NONE, NA
  """
  null_tokens = ('', 'NULL', 'NONE', 'NA')
  normalized = value.strip().upper()
  return normalized in null_tokens
26 
def IsScalar(value):
  """
  Check whether *value* is a scalar.

  Strings count as scalars although they are iterable. :class:`TableCol` and
  :class:`BinaryColExpr` instances are never scalars. Everything else is a
  scalar exactly if it cannot be iterated.

  :returns: True for scalars, False for iterables
  """
  if IsStringLike(value):
    return True
  if isinstance(value, (TableCol, BinaryColExpr)):
    return False
  try:
    iter(value)
  except Exception:
    # not iterable -> scalar; KeyboardInterrupt/SystemExit are not swallowed
    return True
  return False
37 
def GuessColumnType(iterator):
  """
  Guess the column type from the values produced by *iterator*.

  Every value is converted to an upper-case string. Null strings (see
  :func:`IsNullString`) are skipped. The candidate types *bool*, *int* and
  *float* are eliminated one by one as soon as a value does not fit.

  :returns: 'string' if no candidate survives or if there were no non-null
            values, 'int' if both *int* and *float* survive, otherwise the
            remaining candidate
  """
  bool_tokens = frozenset(('YES', 'NO', 'TRUE', 'FALSE'))
  converters = (('int', int), ('float', float))
  candidates = {'bool', 'int', 'float'}
  saw_value = False

  for element in iterator:
    token = str(element).upper()
    if IsNullString(token):
      continue
    saw_value = True
    for type_name, convert in converters:
      if type_name not in candidates:
        continue
      try:
        convert(token)
      except ValueError:
        candidates.discard(type_name)
    if 'bool' in candidates and token not in bool_tokens:
      candidates.discard('bool')

  if not candidates:
    return 'string'
  if len(candidates) == 2:
    # int and float both fit: prefer the narrower type
    return 'int'
  if not saw_value:
    return 'string'
  # exactly one candidate is left
  return candidates.pop()
69 
class BinaryColExpr:
  """
  Lazily evaluated, element-wise binary operation on two operands.

  Each operand is either an iterable (e.g. a :class:`TableCol` or another
  :class:`BinaryColExpr`) or a scalar as defined by :func:`IsScalar`. Scalars
  are repeated for every element of the other operand. Iterating the
  expression yields ``op(l, r)`` for every pair of elements, or None if
  either element is None.

  :param op: callable taking two arguments
  :param lhs: left operand
  :param rhs: right operand
  """
  def __init__(self, op, lhs, rhs):
    self.op=op
    self.lhs=lhs
    self.rhs=rhs
    # scalars are broadcast by cycling them endlessly; zip() in __iter__ stops
    # as soon as the non-scalar operand is exhausted. Note that two scalar
    # operands produce an endless iterator.
    if IsScalar(lhs):
      self.lhs=itertools.cycle([self.lhs])
    if IsScalar(rhs):
      self.rhs=itertools.cycle([self.rhs])

  def __iter__(self):
    for l, r in zip(self.lhs, self.rhs):
      if l is not None and r is not None:
        yield self.op(l, r)
      else:
        yield None

  def __add__(self, rhs):
    return BinaryColExpr(operator.add, self, rhs)

  def __sub__(self, rhs):
    return BinaryColExpr(operator.sub, self, rhs)

  def __mul__(self, rhs):
    return BinaryColExpr(operator.mul, self, rhs)

  def __truediv__(self, rhs):
    # Python 3 maps the / operator to __truediv__; operator.div is gone
    return BinaryColExpr(operator.truediv, self, rhs)

  # kept so that code calling __div__ explicitly continues to work
  __div__ = __truediv__
96 
class TableCol:
  """
  View on a single column of a table.

  The column does not copy any data: all accesses are forwarded to the
  ``rows`` attribute of the table.

  :param table: the table the column belongs to
  :param col: column name (:class:`str`), resolved through the table's
              ``GetColIndex``, or column index
  """
  def __init__(self, table, col):
    self._table=table
    if type(col)==str:
      self.col_index=self._table.GetColIndex(col)
    else:
      self.col_index=col

  def __iter__(self):
    for row in self._table.rows:
      yield row[self.col_index]

  def __len__(self):
    return len(self._table.rows)

  def __getitem__(self, index):
    return self._table.rows[index][self.col_index]

  def __setitem__(self, index, value):
    self._table.rows[index][self.col_index]=value

  def __add__(self, rhs):
    return BinaryColExpr(operator.add, self, rhs)

  def __sub__(self, rhs):
    return BinaryColExpr(operator.sub, self, rhs)

  def __mul__(self, rhs):
    return BinaryColExpr(operator.mul, self, rhs)

  def __truediv__(self, rhs):
    # Python 3 maps the / operator to __truediv__; operator.div is gone
    return BinaryColExpr(operator.truediv, self, rhs)

  # kept so that code calling __div__ explicitly continues to work
  __div__ = __truediv__
129 
class TableRow:
  """
  Essentially a named tuple, but allows column names that are not valid
  python variable names.

  Values can be accessed by column index, by column name using item access,
  or by column name using attribute access. Only a weak reference to the
  table is kept.

  :param row_data: the values of the row, one per column
  :param tab: the table the row belongs to
  """
  def __init__(self, row_data, tab):
    # written through __dict__ because __setattr__ only accepts column names
    self.__dict__['tab'] = weakref.proxy(tab)
    self.__dict__['row_data'] = row_data

  def __getitem__(self, col_name):
    if type(col_name)==int:
      return self.row_data[col_name]
    return self.row_data[self.tab.GetColIndex(col_name)]

  def __str__(self):
    s = []
    for k, v in zip(self.__dict__['tab'].col_names, self.__dict__['row_data']):
      s.append('%s=%s' % (k, str(v)))
    return ', '.join(s)

  def __len__(self):
    return len(self.row_data)

  def __setitem__(self, col_name, val):
    if type(col_name)==int:
      self.row_data[col_name] = val
    else:
      self.row_data[self.tab.GetColIndex(col_name)] = val

  def __getattr__(self, col_name):
    # __getattr__ is only called when the regular lookup failed. copy/pickle
    # probe attributes on instances whose __init__ has not run, in which case
    # 'tab' itself is missing; reading self.tab here would recurse endlessly.
    tab = self.__dict__.get('tab')
    if tab is None or 'col_names' not in tab.__dict__ or \
       col_name not in tab.col_names:
      raise AttributeError(col_name)
    return self.__dict__['row_data'][tab.GetColIndex(col_name)]

  def __setattr__(self, col_name, val):
    if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
      raise AttributeError(col_name)
    self.row_data[self.tab.GetColIndex(col_name)] = val
169 
170 class Table(object):
171  """
172 
173  The table class provides convenient access to data in tabular form. An empty
174  table can be easily constructed as follows
175 
176  .. code-block:: python
177 
178  tab = Table()
179 
180  If you want to add columns directly when creating the table, column names
181  and *column types* can be specified as follows
182 
183  .. code-block:: python
184 
185  tab = Table(['nameX','nameY','nameZ'], 'sfb')
186 
187  this will create three columns called nameX, nameY and nameZ of type string,
188  float and bool, respectively. There will be no data in the table and thus,
189  the table will not contain any rows.
190 
191  The following *column types* are supported:
192 
193  ======= ========
194  name abbrev
195  ======= ========
196  string s
197  float f
198  int i
199  bool b
200  ======= ========
201 
202  If you want to add data to the table in addition, use the following:
203 
204  .. code-block:: python
205 
206  tab=Table(['nameX','nameY','nameZ'],
207  'sfb',
208  nameX = ['a','b','c'],
209  nameY = [0.1, 1.2, 3.414],
210  nameZ = [True, False, False])
211 
212  if values for one column is left out, they will be filled with NA, but if
213  values are specified, all values must be specified (i.e. same number of
214  values per column)
215 
216  """
217 
218  SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
219 
220 
221  def __init__(self, col_names=[], col_types=None, **kwargs):
222 
223  self.col_namescol_names=list(col_names)
224  self.commentcomment=''
225  self.namename=''
226 
227  self.col_typescol_types = self._ParseColTypes_ParseColTypes(col_types)
228  self.rowsrows=[]
229  if len(kwargs)>=0:
230  if not col_names:
231  self.col_namescol_names=[v for v in list(kwargs.keys())]
232  if not self.col_typescol_types:
233  self.col_typescol_types=['string' for u in range(len(self.col_namescol_names))]
234  if len(kwargs)>0:
235  self._AddRowsFromDict_AddRowsFromDict(kwargs)
236 
237  def __getattr__(self, col_name):
238  # pickling doesn't call the standard __init__ defined above and thus
239  # col_names might not be defined. This leads to infinite recursions.
240  # Protect against it by checking that col_names is contained in
241  # __dict__
242  if 'col_names' not in self.__dict__ or col_name not in self.col_namescol_names:
243  raise AttributeError(col_name)
244  return TableCol(self, col_name)
245 
246  @staticmethod
247  def _ParseColTypes(types, exp_num=None):
248  if types==None:
249  return None
250 
251  short2long = {'s' : 'string', 'i': 'int', 'b' : 'bool', 'f' : 'float'}
252  allowed_short = list(short2long.keys())
253  allowed_long = list(short2long.values())
254 
255  type_list = []
256 
257  # string type
258  if IsScalar(types):
259  if type(types)==str:
260  types = types.lower()
261 
262  # single value
263  if types in allowed_long:
264  type_list.append(types)
265  elif types in allowed_short:
266  type_list.append(short2long[types])
267 
268  # comma separated list of long or short types
269  elif types.find(',')!=-1:
270  for t in types.split(','):
271  if t in allowed_long:
272  type_list.append(t)
273  elif t in allowed_short:
274  type_list.append(short2long[t])
275  else:
276  raise ValueError('Unknown type %s in types %s'%(t,types))
277 
278  # string of short types
279  else:
280  for t in types:
281  if t in allowed_short:
282  type_list.append(short2long[t])
283  else:
284  raise ValueError('Unknown type %s in types %s'%(t,types))
285 
286  # non-string type
287  else:
288  raise ValueError('Col type %s must be string or list'%types)
289 
290  # list type
291  else:
292  for t in types:
293  # must be string type
294  if type(t)==str:
295  t = t.lower()
296  if t in allowed_long:
297  type_list.append(t)
298  elif t in allowed_short:
299  type_list.append(short2long[t])
300  else:
301  raise ValueError('Unknown type %s in types %s'%(t,types))
302 
303  # non-string type
304  else:
305  raise ValueError('Col type %s must be string or list'%types)
306 
307  if exp_num:
308  if len(type_list)!=exp_num:
309  raise ValueError('Parsed number of col types (%i) differs from ' + \
310  'expected (%i) in types %s'%(len(type_list),exp_num,types))
311 
312  return type_list
313 
314  def SetName(self, name):
315  '''
316  Set name of the table
317 
318  :param name: name
319  :type name: :class:`str`
320  '''
321  self.namename = name
322 
323  def GetName(self):
324  '''
325  Get name of table
326  '''
327  return self.namename
328 
329  def RenameCol(self, old_name, new_name):
330  """
331  Rename column *old_name* to *new_name*.
332 
333  :param old_name: Name of the old column
334  :param new_name: Name of the new column
335  :raises: :exc:`ValueError` when *old_name* is not a valid column
336  """
337  if old_name==new_name:
338  return
339  self.AddColAddCol(new_name, self.col_typescol_types[self.GetColIndexGetColIndex(old_name)],
340  self[old_name])
341  self.RemoveColRemoveCol(old_name)
342  def _Coerce(self, value, ty):
343  '''
344  Try to convert values (e.g. from :class:`str` type) to the specified type
345 
346  :param value: the value
347  :type value: any type
348 
349  :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
350  *bool*)
351  :type ty: :class:`str`
352  '''
353  if value=='NA' or value==None:
354  return None
355  if ty=='int':
356  return int(value)
357  if ty=='float':
358  return float(value)
359  if ty=='string':
360  return str(value)
361  if ty=='bool':
362  if isinstance(value, str) or isinstance(value, str):
363  if value.upper() in ('FALSE', 'NO',):
364  return False
365  return True
366  return bool(value)
367  raise ValueError('Unknown type %s' % ty)
368 
369  def GetColIndex(self, col):
370  '''
371  Returns the column index for the column with the given name.
372 
373  :raises: ValueError if no column with the name is found.
374  '''
375  if col not in self.col_namescol_names:
376  raise ValueError('Table has no column named "%s"' % col)
377  return self.col_namescol_names.index(col)
378 
379  def GetColNames(self):
380  '''
381  Returns a list containing all column names.
382  '''
383  return self.col_namescol_names
384 
385  def SearchColNames(self, regex):
386  '''
387  Returns a list of column names matching the regex.
388 
389  :param regex: regex pattern
390  :type regex: :class:`str`
391 
392  :returns: :class:`list` of column names (:class:`str`)
393  '''
394  matching_names = []
395  for name in self.col_namescol_names:
396  matches = re.search(regex, name)
397  if matches:
398  matching_names.append(name)
399  return matching_names
400 
401  def HasCol(self, col):
402  '''
403  Checks if the column with a given name is present in the table.
404  '''
405  return col in self.col_namescol_names
406 
407  def __getitem__(self, k):
408  if type(k)==int:
409  return TableCol(self, self.col_namescol_names[k])
410  else:
411  return TableCol(self, k)
412 
413  def __setitem__(self, k, value):
414  col_index=k
415  if type(k)!=int:
416  col_index=self.GetColIndexGetColIndex(k)
417  if IsScalar(value):
418  value=itertools.cycle([value])
419  for r, v in zip(self.rowsrows, value):
420  r[col_index]=v
421 
422  def ToString(self, float_format='%.3f', int_format='%d', rows=None):
423  '''
424  Convert the table into a string representation.
425 
426  The output format can be modified for int and float type columns by
427  specifying a formatting string for the parameters *float_format* and
428  *int_format*.
429 
430  The option *rows* specify the range of rows to be printed. The parameter
431  must be a type that supports indexing (e.g. a :class:`list`) containing the
432  start and end row *index*, e.g. [start_row_idx, end_row_idx].
433 
434  :param float_format: formatting string for float columns
435  :type float_format: :class:`str`
436 
437  :param int_format: formatting string for int columns
438  :type int_format: :class:`str`
439 
440  :param rows: iterable containing start and end row *index*
441  :type rows: iterable containing :class:`ints <int>`
442  '''
443  widths=[len(cn) for cn in self.col_namescol_names]
444  sel_rows=self.rowsrows
445  if rows:
446  sel_rows=self.rowsrows[rows[0]:rows[1]]
447  for row in sel_rows:
448  for i, (ty, col) in enumerate(zip(self.col_typescol_types, row)):
449  if col==None:
450  widths[i]=max(widths[i], len('NA'))
451  elif ty=='float':
452  widths[i]=max(widths[i], len(float_format % col))
453  elif ty=='int':
454  widths[i]=max(widths[i], len(int_format % col))
455  else:
456  widths[i]=max(widths[i], len(str(col)))
457  s=''
458  if self.commentcomment:
459  s+=''.join(['# %s\n' % l for l in self.commentcomment.split('\n')])
460  total_width=sum(widths)+2*len(widths)
461  for width, col_name in zip(widths, self.col_namescol_names):
462  s+=col_name.center(width+2)
463  s+='\n%s\n' % ('-'*total_width)
464  for row in sel_rows:
465  for width, ty, col in zip(widths, self.col_typescol_types, row):
466  cs=''
467  if col==None:
468  cs='NA'.center(width+2)
469  elif ty=='float':
470  cs=(float_format % col).rjust(width+2)
471  elif ty=='int':
472  cs=(int_format % col).rjust(width+2)
473  else:
474  cs=' '+str(col).ljust(width+1)
475  s+=cs
476  s+='\n'
477  return s
478 
479  def __str__(self):
480  return self.ToStringToString()
481 
482  def Stats(self, col):
483  idx = self.GetColIndexGetColIndex(col)
484  text ='''
485 Statistics for column %(col)s
486 
487  Number of Rows : %(num)d
488  Number of Rows Not None: %(num_non_null)d
489  Mean : %(mean)f
490  Median : %(median)f
491  Standard Deviation : %(stddev)f
492  Min : %(min)f
493  Max : %(max)f
494 '''
495  data = {
496  'col' : col,
497  'num' : len(self.rowsrows),
498  'num_non_null' : self.CountCount(col),
499  'median' : self.MedianMedian(col),
500  'mean' : self.MeanMean(col),
501  'stddev' : self.StdDevStdDev(col),
502  'min' : self.MinMin(col),
503  'max' : self.MaxMax(col),
504  }
505  return text % data
506 
507  def _AddRowsFromDict(self, d, overwrite=None):
508  '''
509  Add one or more rows from a :class:`dictionary <dict>`.
510 
511  If *overwrite* is not None and set to an existing column name, the specified
512  column in the table is searched for the first occurrence of a value matching
513  the value of the column with the same name in the dictionary. If a matching
514  value is found, the row is overwritten with the dictionary. If no matching
515  row is found, a new row is appended to the table.
516 
517  :param d: dictionary containing the data
518  :type d: :class:`dict`
519 
520  :param overwrite: column name to overwrite existing row if value in
521  column *overwrite* matches
522  :type overwrite: :class:`str`
523 
524  :raises: :class:`ValueError` if multiple rows are added but the number of
525  data items is different for different columns.
526  '''
527  # get column indices
528  idxs = [self.GetColIndexGetColIndex(k) for k in list(d.keys())]
529 
530  # convert scalar values to list
531  old_len = None
532  for k,v in d.items():
533  if IsScalar(v):
534  v = [v]
535  d[k] = v
536  if not old_len:
537  old_len = len(v)
538  elif old_len!=len(v):
539  raise ValueError("Cannot add rows: length of data must be equal " + \
540  "for all columns in %s"%str(d))
541 
542  # convert column based dict to row based dict and create row and add data
543  for i,data in enumerate(zip(*list(d.values()))):
544  new_row = [None for a in range(len(self.col_namescol_names))]
545  for idx,v in zip(idxs,data):
546  new_row[idx] = self._Coerce_Coerce(v, self.col_typescol_types[idx])
547 
548  # partially overwrite existing row with new data
549  if overwrite:
550  overwrite_idx = self.GetColIndexGetColIndex(overwrite)
551  added = False
552  for i,r in enumerate(self.rowsrows):
553  if r[overwrite_idx]==new_row[overwrite_idx]:
554  for j,e in enumerate(self.rowsrows[i]):
555  if new_row[j]==None:
556  new_row[j] = e
557  self.rowsrows[i] = new_row
558  added = True
559  break
560 
561  # if not overwrite or overwrite did not find appropriate row
562  if not overwrite or not added:
563  self.rowsrows.append(new_row)
564 
565  def PairedTTest(self, col_a, col_b):
566  """
567  Two-sided test for the null-hypothesis that two related samples
568  have the same average (expected values).
569 
570  :param col_a: First column
571  :type col_a: :class:`str`
572  :param col_b: Second column
573  :type col_b: :class:`str`
574 
575  :returns: P-value between 0 and 1 that the two columns have the
576  same average. The smaller the value, the less related the two
577  columns are.
578  """
579  from scipy.stats import ttest_rel
580  xs = []
581  ys = []
582  for x, y in self.ZipZip(col_a, col_b):
583  if x!=None and y!=None:
584  xs.append(x)
585  ys.append(y)
586  result = ttest_rel(xs, ys)
587  return result[1]
588 
589  def AddRow(self, data, overwrite=None):
590  """
591  Add a row to the table.
592 
593  *data* may either be a dictionary or a list-like object:
594 
595  - If *data* is a dictionary, the keys in the dictionary must match the
596  column names. Columns not found in the dict will be initialized to None.
597  If the dict contains list-like objects, multiple rows will be added, if
598  the number of items in all list-like objects is the same, otherwise a
599  :class:`ValueError` is raised.
600 
601  - If *data* is a list-like object, the row is initialized from the values
602  in *data*. The number of items in *data* must match the number of
603  columns in the table. A :class:`ValuerError` is raised otherwise. The
604  values are added in the order specified in the list, thus, the order of
605  the data must match the columns.
606 
607  If *overwrite* is not None and set to an existing column name, the specified
608  column in the table is searched for the first occurrence of a value matching
609  the value of the column with the same name in the dictionary. If a matching
610  value is found, the row is overwritten with the dictionary. If no matching
611  row is found, a new row is appended to the table.
612 
613  :param data: data to add
614  :type data: :class:`dict` or *list-like* object
615 
616  :param overwrite: column name to overwrite existing row if value in
617  column *overwrite* matches
618  :type overwrite: :class:`str`
619 
620  :raises: :class:`ValueError` if *list-like* object is used and number of
621  items does *not* match number of columns in table.
622 
623  :raises: :class:`ValueError` if *dict* is used and multiple rows are added
624  but the number of data items is different for different columns.
625 
626  **Example:** add multiple data rows to a subset of columns using a dictionary
627 
628  .. code-block:: python
629 
630  # create table with three float columns
631  tab = Table(['x','y','z'], 'fff')
632 
633  # add rows from dict
634  data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
635  tab.AddRow(data)
636  print tab
637 
638  '''
639  will produce the table
640 
641  ==== ==== ====
642  x y z
643  ==== ==== ====
644  1.20 NA 1.60
645  1.60 NA 5.30
646  ==== ==== ====
647  '''
648 
649  # overwrite the row with x=1.2 and add row with x=1.9
650  data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
651  tab.AddRow(data, overwrite='x')
652  print tab
653 
654  '''
655  will produce the table
656 
657  ==== ==== ====
658  x y z
659  ==== ==== ====
660  1.20 NA 7.90
661  1.60 NA 5.30
662  1.90 NA 3.50
663  ==== ==== ====
664  '''
665  """
666  if type(data)==dict:
667  self._AddRowsFromDict_AddRowsFromDict(data, overwrite)
668  else:
669  if len(data)!=len(self.col_namescol_names):
670  msg='data array must have %d elements, not %d'
671  raise ValueError(msg % (len(self.col_namescol_names), len(data)))
672  new_row = [self._Coerce_Coerce(v, t) for v, t in zip(data, self.col_typescol_types)]
673 
674  # fully overwrite existing row with new data
675  if overwrite:
676  overwrite_idx = self.GetColIndexGetColIndex(overwrite)
677  added = False
678  for i,r in enumerate(self.rowsrows):
679  if r[overwrite_idx]==new_row[overwrite_idx]:
680  self.rowsrows[i] = new_row
681  added = True
682  break
683 
684  # if not overwrite or overwrite did not find appropriate row
685  if not overwrite or not added:
686  self.rowsrows.append(new_row)
687 
688  def RemoveCol(self, col):
689  """
690  Remove column with the given name from the table.
691 
692  :param col: name of column to remove
693  :type col: :class:`str`
694  """
695  idx = self.GetColIndexGetColIndex(col)
696  del self.col_namescol_names[idx]
697  del self.col_typescol_types[idx]
698  for row in self.rowsrows:
699  del row[idx]
700 
701  def AddCol(self, col_name, col_type, data=None):
702  """
703  Add a column to the right of the table.
704 
705  :param col_name: name of new column
706  :type col_name: :class:`str`
707 
708  :param col_type: type of new column (long versions: *int*, *float*, *bool*,
709  *string* or short versions: *i*, *f*, *b*, *s*)
710  :type col_type: :class:`str`
711 
712  :param data: data to add to new column
713  :type data: scalar or iterable
714 
715  **Example:**
716 
717  .. code-block:: python
718 
719  tab = Table(['x'], 'f', x=range(5))
720  tab.AddCol('even', 'bool', itertools.cycle([True, False]))
721  print tab
722 
723  '''
724  will produce the table
725 
726  ==== ====
727  x even
728  ==== ====
729  0 True
730  1 False
731  2 True
732  3 False
733  4 True
734  ==== ====
735  '''
736 
737  If data is a constant instead of an iterable object, it's value
738  will be written into each row:
739 
740  .. code-block:: python
741 
742  tab = Table(['x'], 'f', x=range(5))
743  tab.AddCol('num', 'i', 1)
744  print tab
745 
746  '''
747  will produce the table
748 
749  ==== ====
750  x num
751  ==== ====
752  0 1
753  1 1
754  2 1
755  3 1
756  4 1
757  ==== ====
758  '''
759 
760  As a special case, if there are no previous rows, and data is not
761  None, rows are added for every item in data.
762  """
763 
764  if col_name in self.col_namescol_names:
765  raise ValueError('Column with name %s already exists'%col_name)
766 
767  col_type = self._ParseColTypes_ParseColTypes(col_type, exp_num=1)[0]
768  self.col_namescol_names.append(col_name)
769  self.col_typescol_types.append(col_type)
770 
771  if len(self.rowsrows)>0:
772  if IsScalar(data):
773  for row in self.rowsrows:
774  row.append(data)
775  else:
776  if hasattr(data, '__len__') and len(data)!=len(self.rowsrows):
777  self.col_namescol_names.pop()
778  self.col_typescol_types.pop()
779  raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
780  'existing rows (%i)'%len(self.rowsrows))
781  for row, d in zip(self.rowsrows, data):
782  row.append(d)
783 
784  elif data!=None and len(self.col_namescol_names)==1:
785  if IsScalar(data):
786  self.AddRowAddRow({col_name : data})
787  else:
788  for v in data:
789  self.AddRowAddRow({col_name : v})
790 
791  def Filter(self, *args, **kwargs):
792  """
793  Returns a filtered table only containing rows matching all the predicates
794  in kwargs and args For example,
795 
796  .. code-block:: python
797 
798  tab.Filter(town='Basel')
799 
800  will return all the rows where the value of the column "town" is equal to
801  "Basel". Several predicates may be combined, i.e.
802 
803  .. code-block:: python
804 
805  tab.Filter(town='Basel', male=True)
806 
807  will return the rows with "town" equal to "Basel" and "male" equal to true.
808  args are unary callables returning true if the row should be included in the
809  result and false if not.
810  """
811  filt_tab=Table(list(self.col_namescol_names), list(self.col_typescol_types))
812  for row in self.rowsrows:
813  matches=True
814  for func in args:
815  if not func(row):
816  matches=False
817  break
818  for key, val in kwargs.items():
819  if row[self.GetColIndexGetColIndex(key)]!=val:
820  matches=False
821  break
822  if matches:
823  filt_tab.AddRow(row)
824  return filt_tab
825 
826 
827  def Select(self, query):
828 
829  """
830  Returns a new table object containing all rows matching a logical query
831  expression.
832 
833  *query* is a string containing the logical expression, that will be
834  evaluated for every row.
835 
836  Operands have to be the name of a column or an expression that can be
837  parsed to float, int, bool or string.
838  Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, \\*, /
839 
840  .. code-block:: python
841 
842  subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
843 
844  The selection query should be self explaining. Allowed parenthesis are:
845  (), [], {}, whereas parenthesis mismatches get recognized. Expressions like
846  '3<=col_a>=col_b' throw an error, due to problems in figuring out the
847  evaluation order.
848 
849  There are two special expressions:
850 
851  .. code-block:: python
852 
853  #selects rows, where 1.0<=col_a<=1.5
854  subtab = tab.Select('col_a=1.0:1.5')
855 
856  #selects rows, where col_a=1 or col_a=2 or col_a=3
857  subtab = tab.Select('col_a=1,2,3')
858 
859  Only consistent types can be compared. If col_a is of type string and col_b
860  is of type int, following expression would throw an error: 'col_a<col_b'
861  """
862 
863  try:
864  from .table_selector import TableSelector
865  except:
866  raise ImportError("Tried to import from the file table_selector.py, but could not find it!")
867 
868  selector=TableSelector(self.col_typescol_types, self.col_namescol_names, query)
869 
870  selected_tab=Table(list(self.col_namescol_names), list(self.col_typescol_types))
871 
872  for row in self.rowsrows:
873  if selector.EvaluateRow(row):
874  selected_tab.AddRow(row)
875 
876  return selected_tab
877 
878 
879  @staticmethod
880  def _LoadOST(stream_or_filename):
881  fieldname_pattern=re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
882  values_pattern=re.compile("([^\" ]+|\"[^\"]*\")+")
883  file_opened=False
884  if not hasattr(stream_or_filename, 'read'):
885  stream=open(stream_or_filename, 'r')
886  file_opened=True
887  else:
888  stream=stream_or_filename
889  header=False
890  num_lines=0
891  for line in stream:
892  line=line.strip()
893  if line.startswith('#'):
894  continue
895  if len(line)==0:
896  continue
897  num_lines+=1
898  if not header:
899  fieldnames=[]
900  fieldtypes=[]
901  for col in line.split():
902  match=fieldname_pattern.match(col)
903  if match:
904  if match.group('type'):
905  fieldtypes.append(match.group('type'))
906  else:
907  fieldtypes.append('string')
908  fieldnames.append(match.group('name'))
909  try:
910  tab=Table(fieldnames, fieldtypes)
911  except Exception as e:
912  # potentially fails if we read in crap... clean up and pass on error
913  if file_opened:
914  stream.close()
915  raise e
916  header=True
917  continue
918  tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
919  if file_opened:
920  stream.close()
921  if num_lines==0:
922  raise IOError("Cannot read table from empty stream")
923  return tab
924 
925  def _GuessColumnTypes(self):
926  for col_idx in range(len(self.col_namescol_names)):
927  self.col_typescol_types[col_idx]=GuessColumnType(self[self.col_namescol_names[col_idx]])
928  for row in self.rowsrows:
929  for idx in range(len(row)):
930  row[idx]=self._Coerce_Coerce(row[idx], self.col_typescol_types[idx])
931 
932  @staticmethod
933  def _LoadCSV(stream_or_filename, sep):
934  file_opened=False
935  if not hasattr(stream_or_filename, 'read'):
936  stream=open(stream_or_filename, 'r')
937  file_opened=True
938  else:
939  stream=stream_or_filename
940  reader=csv.reader(stream, delimiter=sep)
941  first=True
942  for row in reader:
943  if first:
944  header=row
945  types='s'*len(row)
946  tab=Table(header, types)
947  first=False
948  else:
949  tab.AddRow(row)
950  if file_opened:
951  stream.close()
952  if first:
953  raise IOError('trying to load table from empty CSV stream/file')
954 
955  tab._GuessColumnTypes()
956  return tab
957 
958  @staticmethod
959  def _LoadPickle(stream_or_filename):
960  file_opened=False
961  if not hasattr(stream_or_filename, 'read'):
962  stream=open(stream_or_filename, 'rb')
963  file_opened=True
964  else:
965  stream=stream_or_filename
966  tab = pickle.load(stream)
967  if file_opened:
968  stream.close()
969  return tab
970 
971  @staticmethod
972  def _GuessFormat(filename):
973  try:
974  filename = filename.name
975  except AttributeError as e:
976  pass
977  if filename.endswith('.csv'):
978  return 'csv'
979  elif filename.endswith('.pickle'):
980  return 'pickle'
981  else:
982  return 'ost'
983 
984 
985  @staticmethod
986  def Load(stream_or_filename, format='auto', sep=','):
987  """
988  Load table from stream or file with given name.
989 
990  By default, the file format is set to *auto*, which tries to guess the file
991  format from the file extension. The following file extensions are
992  recognized:
993 
994  ============ ======================
995  extension recognized format
996  ============ ======================
997  .csv comma separated values
998  .pickle pickled byte stream
999  <all others> ost-specific format
1000  ============ ======================
1001 
1002  Thus, *format* must be specified for reading file with different filename
1003  extensions.
1004 
1005  The following file formats are understood:
1006 
1007  - ost
1008 
1009  This is an ost-specific, but still human readable file format. The file
1010  (stream) must start with header line of the form
1011 
1012  col_name1[type1] <col_name2[type2]>...
1013 
1014  The types given in brackets must be one of the data types the
1015  :class:`Table` class understands. Each following line in the file then must
1016  contains exactly the same number of data items as listed in the header. The
1017  data items are automatically converted to the column format. Lines starting
1018  with a '#' and empty lines are ignored.
1019 
1020  - pickle
1021 
1022  Deserializes the table from a pickled byte stream.
1023 
1024  - csv
1025 
1026  Reads the table from comma separated values stream. Since there is no
1027  explicit type information in the csv file, the column types are guessed,
1028  using the following simple rules:
1029 
1030  * if all values are either NA/NULL/NONE the type is set to string.
1031  * if all non-null values are convertible to float/int the type is set to
1032  float/int.
1033  * if all non-null values are true/false/yes/no, the value is set to bool.
1034  * for all other cases, the column type is set to string.
1035 
1036  :returns: A new :class:`Table` instance
1037  """
1038  format=format.lower()
1039  if format=='auto':
1040  format = Table._GuessFormat(stream_or_filename)
1041 
1042  if format=='ost':
1043  return Table._LoadOST(stream_or_filename)
1044  if format=='csv':
1045  return Table._LoadCSV(stream_or_filename, sep=sep)
1046  if format=='pickle':
1047  return Table._LoadPickle(stream_or_filename)
1048  raise ValueError('unknown format ""' % format)
1049 
def Sort(self, by, order='+'):
    """
    Sorts the rows of the table in place, using the values of column *by*.

    ``None`` cells take part in the ordering as if they were smaller than any
    other value (the behaviour of Python 2's ``cmp``): they end up last for
    descending and first for ascending order. The sort is stable.

    :param by: column name by which to sort
    :type by: :class:`str`

    :param order: ``-`` for ascending order, anything else (default ``+``)
                  for descending order
    :type order: :class:`str` (i.e. *+*, *-*)
    """
    import functools

    direction = 1 if order == '-' else -1
    col = self.GetColIndex(by)

    def _compare(row_a, row_b):
        left, right = row_a[col], row_b[col]
        if left is None and right is None:
            return 0
        if left is None:
            return -direction
        if right is None:
            return direction
        # three-way comparison, flipped for descending order
        return direction * ((left > right) - (left < right))

    self.rows = sorted(self.rows, key=functools.cmp_to_key(_compare))
1079 
def GetUnique(self, col, ignore_nan=True):
    """
    Collects the distinct values of one column, in order of first appearance.

    :param col: column name
    :type col: :class:`str`

    :param ignore_nan: if set, ``None`` cells are left out of the result
    :type ignore_nan: :class:`bool`

    :returns: :class:`list` of unique values
    """
    col_idx = self.GetColIndex(col)
    keep_none = (ignore_nan == False)
    known = set()
    uniques = []
    for entry in self.rows:
        value = entry[col_idx]
        if not keep_none and not (value != None):
            continue
        if value not in known:
            known.add(value)
            uniques.append(value)
    return uniques
1100 
def Zip(self, *args):
    """
    Convenience function to iterate over several columns in lockstep, e.g.

    .. code-block:: python

      tab = Table.Load('...')
      for col1, col2 in tab.Zip('col1', 'col2'):
        print(col1, col2)

    which is equivalent to

    .. code-block:: python

      tab = Table.Load('...')
      for col1, col2 in zip(tab['col1'], tab['col2']):
        print(col1, col2)

    :param args: names of the columns to combine
    :returns: :class:`list` of tuples, one tuple per row
    """
    columns = (self[name] for name in args)
    return list(zip(*columns))
1120 
def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
         z_title=None, x_range=None, y_range=None, z_range=None,
         color=None, plot_if=None, legend=None,
         num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
         labels=None, max_num_labels=None, title=None, clear=True, save=False,
         **kwargs):
    """
    Function to plot values from your table in 1, 2 or 3 dimensions using
    `Matplotlib <http://matplotlib.sourceforge.net>`__

    :param x: column name for first dimension
    :type x: :class:`str`

    :param y: column name for second dimension
    :type y: :class:`str`

    :param z: column name for third dimension
    :type z: :class:`str`

    :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\\**). For a
                  complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
    :type style: :class:`str`

    :param x_title: title for first dimension, if not specified it is
                    automatically derived from column name
    :type x_title: :class:`str`

    :param y_title: title for second dimension, if not specified it is
                    automatically derived from column name
    :type y_title: :class:`str`

    :param z_title: title for third dimension, if not specified it is
                    automatically derived from column name
    :type z_title: :class:`str`

    :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
    :type x_range: :class:`list` of length two

    :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
    :type y_range: :class:`list` of length two

    :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
    :type z_range: :class:`list` of length two

    :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
                  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
    :type color: :class:`str`

    :param plot_if: callable which returns *True* if row should be plotted. Is
                    invoked like ``plot_if(self, row)``
    :type plot_if: callable

    :param legend: legend label for data series
    :type legend: :class:`str`

    :param num_z_levels: number of levels for third dimension
    :type num_z_levels: :class:`int`

    :param diag_line: draw diagonal line from the lower to the upper end of
                      the x/y ranges (two dimensional plots only). If a range
                      is not given, the current axis limits are used instead.
    :type diag_line: :class:`bool`

    :param labels: column name containing labels to put on x-axis for one
                   dimensional plot
    :type labels: :class:`str`

    :param max_num_labels: limit maximum number of labels
    :type max_num_labels: :class:`int`

    :param title: plot title, if not specified it is automatically derived from
                  plotted column names
    :type title: :class:`str`

    :param clear: clear old data from plot
    :type clear: :class:`bool`

    :param save: filename for saving plot
    :type save: :class:`str`

    :param z_contour: draw contour lines
    :type z_contour: :class:`bool`

    :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
                       'linear')
    :type z_interpol: :class:`str`

    :param \\*\\*kwargs: additional arguments passed to matplotlib

    :returns: the ``matplotlib.pyplot`` module

    :raises: :class:`ValueError` if one of the ranges does not contain exactly
             two elements, :class:`ImportError` if numpy or matplotlib are
             not available

    **Examples:** simple plotting functions

    .. code-block:: python

      tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
                                            b=[x/2.0 for x in range(1,6)],
                                            c=[math.cos(x) for x in range(0,5)],
                                            d=range(3,8))

      # one dimensional plot of column 'd' vs. index
      plt = tab.Plot('d')
      plt.show()

      # two dimensional plot of 'a' vs. 'c'
      plt = tab.Plot('a', y='c', style='o-')
      plt.show()

      # three dimensional plot of 'a' vs. 'c' with values 'b'
      plt = tab.Plot('a', y='c', z='b')
      # manually save plot to file
      plt.savefig("plot.png")
    """
    # only the imports are guarded, so that errors raised while plotting are
    # not mistaken for missing dependencies
    try:
        import matplotlib.pyplot as plt
        import matplotlib.mlab as mlab
        import numpy as np
    except ImportError:
        LogError("Function needs numpy and matplotlib, but I could not import it.")
        raise

    idx1 = self.GetColIndex(x)
    xs = []
    ys = []
    zs = []

    if clear:
        plt.figure(figsize=[8, 6])

    # axis titles: explicit title wins, otherwise derive it from the column name
    if x_title is not None:
        nice_x = x_title
    else:
        nice_x = MakeTitle(x)

    if y_title is not None:
        nice_y = y_title
    elif y:
        nice_y = MakeTitle(y)
    else:
        nice_y = None

    if z_title is not None:
        nice_z = z_title
    elif z:
        nice_z = MakeTitle(z)
    else:
        nice_z = None

    if x_range and (IsScalar(x_range) or len(x_range) != 2):
        raise ValueError('parameter x_range must contain exactly two elements')
    if y_range and (IsScalar(y_range) or len(y_range) != 2):
        raise ValueError('parameter y_range must contain exactly two elements')
    if z_range and (IsScalar(z_range) or len(z_range) != 2):
        raise ValueError('parameter z_range must contain exactly two elements')

    if color:
        kwargs['color'] = color
    if legend:
        kwargs['label'] = legend

    if y and z:
        # three dimensions: filled contour plot of z over the x/y plane
        idx3 = self.GetColIndex(z)
        idx2 = self.GetColIndex(y)
        for row in self.rows:
            if (row[idx1] is not None and row[idx2] is not None
                    and row[idx3] is not None):
                if plot_if and not plot_if(self, row):
                    continue
                xs.append(row[idx1])
                ys.append(row[idx2])
                zs.append(row[idx3])

        # equally spaced contour levels over z_range or, if not given, over
        # the full value range of column z
        if z_range:
            z_spacing = (z_range[1] - z_range[0]) / num_z_levels
            level = z_range[0]
        else:
            level = self.Min(z)
            z_spacing = (self.Max(z) - level) / num_z_levels
        levels = []
        for _ in range(num_z_levels + 1):
            levels.append(level)
            level += z_spacing

        xi = np.linspace(min(xs), max(xs), len(xs) * 10)
        yi = np.linspace(min(ys), max(ys), len(ys) * 10)
        # NOTE(review): mlab.griddata appears to have been removed in
        # matplotlib 3.1 -- confirm the supported matplotlib versions
        zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)

        if z_contour:
            plt.contour(xi, yi, zi, levels, linewidths=0.5, colors='k')

        plt.contourf(xi, yi, zi, levels, cmap=plt.cm.jet)
        plt.colorbar(ticks=levels)

    elif y:
        # two dimensions: x vs. y
        idx2 = self.GetColIndex(y)
        for row in self.rows:
            if row[idx1] is not None and row[idx2] is not None:
                if plot_if and not plot_if(self, row):
                    continue
                xs.append(row[idx1])
                ys.append(row[idx2])
        plt.plot(xs, ys, style, **kwargs)

    else:
        # one dimension: values of x vs. their index
        label_vals = []

        if labels:
            label_idx = self.GetColIndex(labels)
        for row in self.rows:
            if row[idx1] is not None:
                if plot_if and not plot_if(self, row):
                    continue
                xs.append(row[idx1])
                if labels:
                    label_vals.append(row[label_idx])
        plt.plot(xs, style, **kwargs)
        if labels:
            interval = 1
            if max_num_labels:
                if len(label_vals) > max_num_labels:
                    # thin out the labels so that at most max_num_labels remain
                    interval = int(math.ceil(float(len(label_vals)) / max_num_labels))
                    label_vals = label_vals[::interval]
            plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
                       size='x-small')

    if title is None:
        if nice_z:
            title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
        elif nice_y:
            title = '%s vs. %s' % (nice_x, nice_y)
        else:
            title = nice_x

    plt.title(title, size='x-large', fontweight='bold',
              verticalalignment='bottom')

    if legend:
        plt.legend(loc=0)

    if x and y:
        plt.xlabel(nice_x, size='x-large')
        if x_range:
            plt.xlim(x_range[0], x_range[1])
        if y_range:
            plt.ylim(y_range[0], y_range[1])
        if diag_line:
            # a missing range used to be passed to plot() as None, which
            # raises; fall back to the current axis limits instead
            diag_x = x_range if x_range else plt.xlim()
            diag_y = y_range if y_range else plt.ylim()
            plt.plot(diag_x, diag_y, '-', color='black')

        plt.ylabel(nice_y, size='x-large')
    else:
        if y_range:
            plt.ylim(y_range[0], y_range[1])
        if x_title:
            plt.xlabel(x_title, size='x-large')
        plt.ylabel(nice_y, size='x-large')
    if save:
        plt.savefig(save)
    return plt
1376 
def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
                  histtype='stepfilled', align='mid', x_title=None,
                  y_title=None, title=None, clear=True, save=False,
                  color=None, y_range=None):
    """
    Create a histogram of the data in col for the range *x_range*, split into
    *num_bins* bins and plot it using Matplotlib.

    :param col: column name with data
    :type col: :class:`str`

    :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
    :type x_range: :class:`list` of length two

    :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
    :type y_range: :class:`list` of length two

    :param num_bins: number of bins in range
    :type num_bins: :class:`int`

    :param color: Color to be used for the histogram. If not set, color will be
                  determined by matplotlib
    :type color: :class:`str`

    :param normed: normalize histogram (forwarded to matplotlib as *density*)
    :type normed: :class:`bool`

    :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
                     *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
    :type histtype: :class:`str`

    :param align: style of histogram (*left*, *mid*, *right*). See
                  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
    :type align: :class:`str`

    :param x_title: title for first dimension, if not specified it is
                    automatically derived from column name
    :type x_title: :class:`str`

    :param y_title: title for second dimension, if not specified it is
                    set to "bin count"
    :type y_title: :class:`str`

    :param title: plot title, if not specified it is automatically derived from
                  plotted column names
    :type title: :class:`str`

    :param clear: clear old data from plot
    :type clear: :class:`bool`

    :param save: filename for saving plot
    :type save: :class:`str`

    :returns: the ``matplotlib.pyplot`` module or None if the table has no rows

    **Examples:** simple plotting functions

    .. code-block:: python

      tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])

      # histogram of the values in column 'a'
      plt = tab.PlotHistogram('a')
      plt.show()

    """
    # only the imports are guarded, so that errors raised while plotting are
    # not mistaken for missing dependencies
    try:
        import matplotlib.pyplot as plt
        import numpy as np  # not used directly, kept as availability check
    except ImportError:
        LogError("Function needs numpy and matplotlib, but I could not import it.")
        raise

    if len(self.rows) == 0:
        return None

    kwargs = {}
    if color:
        kwargs['color'] = color

    idx = self.GetColIndex(col)
    data = [r[idx] for r in self.rows if r[idx] is not None]

    if clear:
        plt.clf()

    # the 'normed' keyword of hist() is no longer accepted by newer
    # matplotlib; 'density' is its replacement
    plt.hist(data, bins=num_bins, range=x_range, density=normed,
             histtype=histtype, align=align, **kwargs)

    if x_title is not None:
        nice_x = x_title
    else:
        nice_x = MakeTitle(col)
    plt.xlabel(nice_x, size='x-large')

    if y_range:
        plt.ylim(y_range)

    if y_title is not None:
        nice_y = y_title
    else:
        nice_y = "bin count"
    plt.ylabel(nice_y, size='x-large')

    if title is not None:
        nice_title = title
    else:
        nice_title = "Histogram of %s" % nice_x
    plt.title(nice_title, size='x-large', fontweight='bold')

    if save:
        plt.savefig(save)
    return plt
1488 
def _Max(self, col):
    """
    Finds the maximal value of column *col* and the index of its row.

    ``None`` cells are skipped. If several rows share the maximal value, the
    first one wins.

    :param col: column name
    :type col: :class:`str`

    :returns: tuple ``(value, row_index)``. ``(None, None)`` if the table has
              no rows. If the column only contains ``None``, the row index is
              None and the value is the type dependent start value (``-inf``
              for numeric, ``False`` for bool, ``chr(0)`` for string columns).
    """
    if len(self.rows) == 0:
        return None, None
    idx = self.GetColIndex(col)
    # start values returned when no usable cell is found
    start_values = {'int': -float('inf'), 'float': -float('inf'),
                    'bool': False, 'string': chr(0)}
    max_val = start_values.get(self.col_types[idx])
    max_idx = None
    for i, row in enumerate(self.rows):
        val = row[idx]
        # only None is ignored; 0, 0.0, False and '' are valid values
        if val is None:
            continue
        # the first usable value is always accepted, so that values equal to
        # the start value still yield a row index
        if max_idx is None or val > max_val:
            max_val = val
            max_idx = i
    return max_val, max_idx
1507 
def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True,
            xlabels_rotation='horizontal', y_title=None, title=None,
            colors=None, width=0.8, bottom=0, legend=False, legend_names=None,
            show=False, save=False):
    """
    Create a barplot of the data in cols. Every column will be represented
    at one position. If there are several rows, each column will be grouped
    together.

    :param cols: List of column names. Every column will be represented as a
                 single bar. If cols is None, every column of the table gets
                 plotted.
    :type cols: :class:`list`

    :param rows: List of row indices. Values from given rows will be plotted
                 in parallel at one column position. If set to None, all rows
                 of the table will be plotted. Note, that the maximum number
                 of rows is 7.
    :type rows: :class:`list`

    :param xlabels: Label for every col on x-axis. If set to None, the column
                    names are used. The xlabel plotting can be suppressed by
                    the parameter set_xlabels.
    :type xlabels: :class:`list`

    :param set_xlabels: Controls whether xlabels are plotted or not.
    :type set_xlabels: :class:`bool`

    :param xlabels_rotation: Can either be 'horizontal', 'vertical' or an
                             integer, that describes the rotation in degrees.

    :param y_title: Y-axis description
    :type y_title: :class:`str`

    :param title: Title of the plot. No title appears if set to None
    :type title: :class:`str`

    :param colors: Colors of the different bars in each group. Must be a list
                   of valid colors in matplotlib. Length of color and rows must
                   be consistent.
    :type colors: :class:`list`

    :param width: The available space for the groups on the x-axis is divided
                  by the exact number of groups. The parameters width is the
                  fraction of what is actually used. If it would be 1.0 the
                  bars of the different groups would touch each other.
                  Value must be in ]0;1]
    :type width: :class:`float`

    :param bottom: Bottom
    :type bottom: :class:`float`

    :param legend: Legend for color explanation, the corresponding row
                   respectively. If set to True, legend_names must be provided.
    :type legend: :class:`bool`

    :param legend_names: List of names, that describe the differently colored
                         bars. Length must be consistent with number of rows.

    :param show: If set to True, the plot is directly displayed.

    :param save: If set, a png image with name save in the current working
                 directory will be saved.
    :type save: :class:`str`

    :returns: the ``matplotlib.pyplot`` module

    :raises: :class:`ImportError` if numpy or matplotlib cannot be imported,
             :class:`ValueError` for inconsistent or invalid parameters

    """
    try:
        import numpy as np
        import matplotlib.pyplot as plt
    except Exception as err:
        # any import-time failure is reported as ImportError, as before, but
        # KeyboardInterrupt/SystemExit are no longer swallowed
        raise ImportError('PlotBar relies on numpy and matplotlib, but I could '
                          'not import it!') from err

    standard_colors = ['b', 'g', 'y', 'c', 'm', 'r', 'k']

    if cols is None:
        cols = self.col_names

    if width <= 0 or width > 1:
        raise ValueError('Width must be in [0;1]')

    if rows is None:
        if len(self.rows) > 7:
            raise ValueError('Table contains too many rows to represent them at one '
                             'bar position in parallel. You can Select a Subtable or '
                             'specify the parameter rows with a list of row indices '
                             '(max 7)')
        rows = list(range(len(self.rows)))
    else:
        if not isinstance(rows, list):
            rows = [rows]
        if len(rows) > 7:
            raise ValueError('Too many rows to represent (max 7). Please note, that '
                             'data from multiple rows from one column gets '
                             'represented at one position in parallel.')

    # resolve the column indices once instead of once per row
    col_indices = []
    for c in cols:
        try:
            col_indices.append(self.GetColIndex(c))
        except Exception as err:
            raise ValueError('Cannot find column with name ' + str(c)) from err

    # data[i] holds the bar heights of row rows[i], one value per column
    data = []
    for r_idx in rows:
        row = self.rows[r_idx]
        data.append([row[c_idx] for c_idx in col_indices])

    if not data:
        # data[0] below would otherwise fail with an unhelpful IndexError
        raise ValueError('No rows to plot')

    if colors is None:
        colors = standard_colors[:len(rows)]

    if len(rows) != len(colors):
        raise ValueError("Number of rows and number of colors must be consistent!")

    ind = np.arange(len(data[0]))
    single_bar_width = float(width) / len(data)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    legend_data = []

    for i in range(len(data)):
        # shift every row's bars so that each group is centered in its slot
        bars = ax.bar(ind + i * single_bar_width + (1 - width) / 2, data[i],
                      single_bar_width, bottom=bottom, color=colors[i])
        legend_data.append(bars[0])

    if title is not None:
        ax.set_title(title, size='x-large', fontweight='bold')

    if y_title is not None:
        nice_y = y_title
    else:
        nice_y = "value"
    ax.set_ylabel(nice_y)

    if xlabels:
        if len(data[0]) != len(xlabels):
            raise ValueError('Number of xlabels is not consistent with number of cols!')
    else:
        xlabels = cols

    if set_xlabels:
        ax.set_xticks(ind + 0.5)
        ax.set_xticklabels(xlabels, rotation=xlabels_rotation)
    else:
        ax.set_xticks([])

    # deliberately compared against True: other truthy values do not
    # enable the legend
    if legend == True:
        if legend_names is None:
            raise ValueError('You must provide legend names! e.g. names for the rows, '
                             'that are printed in parallel.')
        if len(legend_names) != len(data):
            raise ValueError('length of legend_names must be consistent with number '
                             'of plotted rows!')
        ax.legend(legend_data, legend_names)

    if save:
        plt.savefig(save)

    if show:
        plt.show()

    return plt
1669 
def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None,
               x_range=None, y_range=None, binning='log', colormap='jet',
               show_scalebar=False, scalebar_label=None, clear=True,
               save=False, show=False):
    """
    Create a heatplot of the data in col x vs the data in col y using matplotlib

    :param x: column name with x data
    :type x: :class:`str`

    :param y: column name with y data
    :type y: :class:`str`

    :param title: title of the plot, will be generated automatically if set to None
    :type title: :class:`str`

    :param x_title: label of x-axis, will be generated automatically if set to None
    :type x_title: :class:`str`

    :param y_title: label of y-axis, will be generated automatically if set to None
    :type y_title: :class:`str`

    :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
    :type x_range: :class:`list` of length two

    :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
    :type y_range: :class:`list` of length two

    :param binning: type of binning. If set to None, the value of a hexbin will
                    correspond to the number of datapoints falling into it. If
                    set to 'log', the value will be the log with base 10 of the above
                    value (log(i+1)). If an integer is provided, the number of a
                    hexbin is equal the number of datapoints falling into it divided
                    by the integer. If a list of values is provided, these values
                    will be the lower bounds of the bins.

    :param colormap: colormap, that will be used. Value can be every colormap defined
                     in matplotlib or an own defined colormap. You can either pass a
                     string with the name of the matplotlib colormap or a colormap
                     object.

    :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
    :type show_scalebar: :class:`bool`

    :param scalebar_label: Label of the scalebar
    :type scalebar_label: :class:`str`

    :param clear: clear old data from plot
    :type clear: :class:`bool`

    :param save: filename for saving plot
    :type save: :class:`str`

    :param show: directly show plot
    :type show: :class:`bool`

    :returns: the ``matplotlib.pyplot`` module

    :raises: :class:`ImportError` if matplotlib cannot be imported,
             :class:`ValueError` if a range does not contain exactly two
             elements

    """
    try:
        import matplotlib.pyplot as plt
        import matplotlib.cm as cm
    except Exception as err:
        # any import-time failure is reported as ImportError, as before, but
        # KeyboardInterrupt/SystemExit are no longer swallowed
        raise ImportError('PlotHexbin relies on matplotlib, but I could not '
                          'import it') from err

    idx = self.GetColIndex(x)
    idy = self.GetColIndex(y)
    xdata = []
    ydata = []

    # only rows with a value in both columns can be plotted
    for r in self.rows:
        if r[idx] is not None and r[idy] is not None:
            xdata.append(r[idx])
            ydata.append(r[idy])

    if clear:
        plt.clf()

    if x_title is not None:
        nice_x = x_title
    else:
        nice_x = MakeTitle(x)

    if y_title is not None:
        nice_y = y_title
    else:
        nice_y = MakeTitle(y)

    if title is None:
        title = '%s vs. %s' % (nice_x, nice_y)

    # colormaps given by name are looked up in matplotlib.cm
    if IsStringLike(colormap):
        colormap = getattr(cm, colormap)

    if x_range and (IsScalar(x_range) or len(x_range) != 2):
        raise ValueError('parameter x_range must contain exactly two elements')
    if y_range and (IsScalar(y_range) or len(y_range) != 2):
        raise ValueError('parameter y_range must contain exactly two elements')

    # extent of the binned area: data range, overridden by explicit ranges
    ext = [min(xdata), max(xdata), min(ydata), max(ydata)]

    if x_range:
        plt.xlim((x_range[0], x_range[1]))
        ext[0] = x_range[0]
        ext[1] = x_range[1]
    if y_range:
        plt.ylim(y_range[0], y_range[1])
        ext[2] = y_range[0]
        ext[3] = y_range[1]

    plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)

    plt.title(title, size='x-large', fontweight='bold',
              verticalalignment='bottom')

    plt.xlabel(nice_x)
    plt.ylabel(nice_y)

    if show_scalebar:
        cb = plt.colorbar()
        if scalebar_label:
            cb.set_label(scalebar_label)

    if save:
        plt.savefig(save)

    if show:
        plt.show()

    return plt
1799 
def MaxRow(self, col):
    """
    Returns the row that holds the maximal value of column *col*. ``None``
    values are ignored. If the maximum occurs several times, the first row
    containing it is returned.

    :param col: column name
    :type col: :class:`str`

    :returns: row with maximal col value or None if the table is empty
    """
    _, row_idx = self._Max(col)
    return self.rows[row_idx] if row_idx != None else None
1814 
def Max(self, col):
    """
    Returns the maximal value found in column *col*. ``None`` values are
    ignored.

    :param col: column name
    :type col: :class:`str`
    """
    max_value, _ = self._Max(col)
    return max_value
1825 
def MaxIdx(self, col):
    """
    Returns the index of the row that holds the maximal value of column
    *col*. ``None`` values are ignored. If the maximum occurs several times,
    the index of the first row containing it is returned.

    :param col: column name
    :type col: :class:`str`
    """
    _, row_idx = self._Max(col)
    return row_idx
1837 
def _Min(self, col):
    """
    Finds the minimal value of column *col* and the index of its row.

    ``None`` cells are skipped. If several rows share the minimal value, the
    first one wins.

    :param col: column name
    :type col: :class:`str`

    :returns: tuple ``(value, row_index)``. ``(None, None)`` if the table has
              no rows. If the column only contains ``None``, the row index is
              None and the value is the type dependent start value (``inf``
              for numeric, ``True`` for bool, ``chr(255)`` for string
              columns).
    """
    if len(self.rows) == 0:
        return None, None
    idx = self.GetColIndex(col)
    # start values returned when no usable cell is found
    start_values = {'int': float('inf'), 'float': float('inf'),
                    'bool': True, 'string': chr(255)}
    min_val = start_values.get(self.col_types[idx])
    min_idx = None
    for i, row in enumerate(self.rows):
        val = row[idx]
        if val is None:
            continue
        # the first usable value is always accepted, so that values which are
        # not smaller than the start value (all True, inf, strings beyond
        # chr(255)) still yield a row index
        if min_idx is None or val < min_val:
            min_val = val
            min_idx = i
    return min_val, min_idx
1855 
def Min(self, col):
    """
    Returns the minimal value found in column *col*. ``None`` values are
    ignored.

    :param col: column name
    :type col: :class:`str`
    """
    min_value, _ = self._Min(col)
    return min_value
1866 
def MinRow(self, col):
    """
    Returns the row that holds the minimal value of column *col*. ``None``
    values are ignored. If the minimum occurs several times, the first row
    containing it is returned.

    :param col: column name
    :type col: :class:`str`

    :returns: row with minimal col value or None if the table is empty
    """
    _, row_idx = self._Min(col)
    return self.rows[row_idx] if row_idx != None else None
1881 
def MinIdx(self, col):
    """
    Returns the index of the row that holds the minimal value of column
    *col*. ``None`` values are ignored. If the minimum occurs several times,
    the index of the first row containing it is returned.

    :param col: column name
    :type col: :class:`str`
    """
    _, row_idx = self._Min(col)
    return row_idx
1893 
def Sum(self, col):
    """
    Adds up all values of the given column, skipping ``None`` cells. The
    result is always a float; it is 0.0 if the column does not contain any
    values. Col must be of numeric ('float', 'int') or boolean column type.

    :param col: column name
    :type col: :class:`str`

    :raises: :class:`TypeError` if column type is ``string``
    """
    col_idx = self.GetColIndex(col)
    if self.col_types[col_idx] not in ('int', 'float', 'bool'):
        raise TypeError("Sum can only be used on numeric column types")
    total = 0.0
    for value in (row[col_idx] for row in self.rows):
        if value != None:
            total += value
    return total
1914 
def Mean(self, col):
    """
    Computes the mean of the given column, skipping ``None`` cells. Col must
    be of numeric ('float', 'int') or boolean column type. If the mean cannot
    be computed, e.g. because the column doesn't contain any elements, None
    is returned.

    For columns of type *bool*, the result is the ratio of the number of
    'Trues' to the total number of elements.

    :param col: column name
    :type col: :class:`str`

    :raises: :class:`TypeError` if column type is ``string``
    """
    col_idx = self.GetColIndex(col)
    if self.col_types[col_idx] not in ('int', 'float', 'bool'):
        raise TypeError("Mean can only be used on numeric or bool column types")

    present = [value for value in self[col] if value != None]
    try:
        return stutil.Mean(present)
    except:
        return None
1942 
def RowMean(self, mean_col_name, cols):
    """
    Appends a new column of type 'float' named *mean_col_name*, which holds
    for every row the mean over the given columns.

    Cols are specified by their names and must be of numeric column
    type ('float', 'int') or boolean column type. Cells with None are ignored.
    ``None`` is added for rows for which no mean can be computed, e.g. if the
    row doesn't contain any values.

    :param mean_col_name: name of new column containing mean values
    :type mean_col_name: :class:`str`

    :param cols: name or list of names of columns to include in computation of
                 mean
    :type cols: :class:`str` or :class:`list` of strings

    :raises: :class:`TypeError` if column type of columns in *col* is ``string``

    == Example ==

    Starting with the following table:

    ==== ==== ====
    x     y    u
    ==== ==== ====
     1    10  100
     2    15  None
     3    20  400
    ==== ==== ====

    the code here adds a column with the name 'mean' to yield the table below:

    .. code-block:: python

      tab.RowMean('mean', ['x', 'u'])


    ==== ==== ==== =====
    x     y    u   mean
    ==== ==== ==== =====
     1    10  100  50.5
     2    15  None 2
     3    20  400  201.5
    ==== ==== ==== =====

    """
    # accept a single column name as well as a list of names
    if IsScalar(cols):
        cols = [cols]

    indices = []
    for name in cols:
        col_idx = self.GetColIndex(name)
        if self.col_types[col_idx] not in ('int', 'float', 'bool'):
            raise TypeError("RowMean can only be used on numeric column types")
        indices.append(col_idx)

    means = []
    for row in self.rows:
        present = [v for v in (row[i] for i in indices) if v != None]
        try:
            means.append(stutil.Mean(present))
        except:
            means.append(None)

    self.AddCol(mean_col_name, 'f', means)
2015 
def Percentiles(self, col, nths):
    """
    Returns the percentiles of column *col* given in *nths*.

    The percentiles are calculated as

    .. code-block:: python

      values[min(len(values)-1, int(math.floor(len(values)*nth/100.0)))]

    where values are the sorted values of *col* not equal to ``None``

    :param col: column name
    :type col: :class:`str`
    :param nths: list of percentiles to be calculated. Each percentile is a
                 number between 0 and 100.
    :type nths: :class:`list` of numbers

    :raises: :class:`TypeError` if column type is ``string``,
             :class:`ValueError` if a percentile is not in the range 0 to 100
    :returns: List of percentiles in the same order as given in *nths*. If the
              column does not contain any values, a list of ``None`` of the
              same length as *nths* is returned.
    """
    idx = self.GetColIndex(col)
    if self.col_types[idx] not in ('int', 'float', 'bool'):
        raise TypeError("Percentiles can only be used on numeric column types")

    for nth in nths:
        if nth < 0 or nth > 100:
            raise ValueError("percentiles must be between 0 and 100")

    vals = sorted(v for v in self[col] if v is not None)
    if len(vals) == 0:
        return [None] * len(nths)

    last = len(vals) - 1
    # floor() instead of round(): the rounding behaviour changed between
    # Python 2 and Python 3
    return [vals[min(last, int(math.floor(len(vals) * nth / 100.0)))]
            for nth in nths]
2060 
def Median(self, col):
    """
    Returns the median of the given column. Cells with ``None`` are ignored.
    Returns ``None`` if the median cannot be computed, e.g. because the
    column doesn't contain any elements. Col must be of numeric column type
    ('float', 'int') or boolean column type.

    :param col: column name
    :type col: :class:`str`

    :raises: :class:`TypeError` if column type is ``string``
    """
    idx = self.GetColIndex(col)
    if self.col_types[idx] not in ('int', 'float', 'bool'):
        raise TypeError("Median can only be used on numeric column types")

    vals = [v for v in self[col] if v is not None]
    # the median is computed only here, inside the try: an additional
    # unguarded call used to make empty columns raise instead of
    # returning None
    try:
        return stutil.Median(vals)
    except Exception:
        return None
2086 
2087  def StdDev(self, col):
2088  """
2089  Returns the standard deviation of the given column. Cells with ''None'' are
2090  ignored. Returns ''None'', if the column doesn't contain any elements. Col must
2091  be of numeric column type ('float', 'int') or boolean column type.
2092 
2093  :param col: column name
2094  :type col: :class:`str`
2095 
2096  :raises: :class:`TypeError` if column type is ``string``
2097  """
2098  idx = self.GetColIndexGetColIndex(col)
2099  col_type = self.col_typescol_types[idx]
2100  if col_type!='int' and col_type!='float' and col_type!='bool':
2101  raise TypeError("StdDev can only be used on numeric column types")
2102 
2103  vals=[]
2104  for v in self[col]:
2105  if v!=None:
2106  vals.append(v)
2107  try:
2108  return stutil.StdDev(vals)
2109  except:
2110  return None
2111 
2112  def Count(self, col, ignore_nan=True):
2113  """
2114  Count the number of cells in column that are not equal to ''None''.
2115 
2116  :param col: column name
2117  :type col: :class:`str`
2118 
2119  :param ignore_nan: ignore all *None* values
2120  :type ignore_nan: :class:`bool`
2121  """
2122  count=0
2123  idx=self.GetColIndexGetColIndex(col)
2124  for r in self.rowsrows:
2125  if ignore_nan:
2126  if r[idx]!=None:
2127  count+=1
2128  else:
2129  count+=1
2130  return count
2131 
2132  def Correl(self, col1, col2):
2133  """
2134  Calculate the Pearson correlation coefficient between *col1* and *col2*, only
2135  taking rows into account where both of the values are not equal to *None*.
2136  If there are not enough data points to calculate a correlation coefficient,
2137  *None* is returned.
2138 
2139  :param col1: column name for first column
2140  :type col1: :class:`str`
2141 
2142  :param col2: column name for second column
2143  :type col2: :class:`str`
2144  """
2145  if IsStringLike(col1) and IsStringLike(col2):
2146  col1 = self.GetColIndexGetColIndex(col1)
2147  col2 = self.GetColIndexGetColIndex(col2)
2148  vals1, vals2=([],[])
2149  for v1, v2 in zip(self[col1], self[col2]):
2150  if v1!=None and v2!=None:
2151  vals1.append(v1)
2152  vals2.append(v2)
2153  try:
2154  return stutil.Correl(vals1, vals2)
2155  except:
2156  return None
2157 
2158  def SpearmanCorrel(self, col1, col2):
2159  """
2160  Calculate the Spearman correlation coefficient between col1 and col2, only
2161  taking rows into account where both of the values are not equal to None. If
2162  there are not enough data points to calculate a correlation coefficient,
2163  None is returned.
2164 
2165  :warning: The function depends on the following module: *scipy.stats.mstats*
2166 
2167  :param col1: column name for first column
2168  :type col1: :class:`str`
2169 
2170  :param col2: column name for second column
2171  :type col2: :class:`str`
2172  """
2173  try:
2174  import scipy.stats.mstats
2175  import numpy as np
2176 
2177  if IsStringLike(col1) and IsStringLike(col2):
2178  col1 = self.GetColIndexGetColIndex(col1)
2179  col2 = self.GetColIndexGetColIndex(col2)
2180  vals1, vals2=([],[])
2181  for v1, v2 in zip(self[col1], self[col2]):
2182  if v1!=None and v2!=None:
2183  vals1.append(v1)
2184  vals2.append(v2)
2185  try:
2186  correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
2187  if np.isnan(correl):
2188  return None
2189  return correl
2190  except:
2191  return None
2192 
2193  except ImportError:
2194  LogError("Function needs scipy.stats.mstats, but I could not import it.")
2195  raise
2196 
2197 
2198  def Save(self, stream_or_filename, format='ost', sep=','):
2199  """
2200  Save the table to stream or filename. The following three file formats
2201  are supported (for more information on file formats, see :meth:`Load`):
2202 
2203  ============= =======================================
2204  ost ost-specific format (human readable)
2205  csv comma separated values (human readable)
2206  pickle pickled byte stream (binary)
2207  html HTML table
2208  context ConTeXt table
2209  ============= =======================================
2210 
2211  :param stream_or_filename: filename or stream for writing output
2212  :type stream_or_filename: :class:`str` or :class:`file`
2213 
2214  :param format: output format (i.e. *ost*, *csv*, *pickle*)
2215  :type format: :class:`str`
2216 
2217  :raises: :class:`ValueError` if format is unknown
2218  """
2219  format=format.lower()
2220  if format=='ost':
2221  return self._SaveOST_SaveOST(stream_or_filename)
2222  if format=='csv':
2223  return self._SaveCSV_SaveCSV(stream_or_filename, sep=sep)
2224  if format=='pickle':
2225  return self._SavePickle_SavePickle(stream_or_filename)
2226  if format=='html':
2227  return self._SaveHTML_SaveHTML(stream_or_filename)
2228  if format=='context':
2229  return self._SaveContext_SaveContext(stream_or_filename)
2230  raise ValueError('unknown format "%s"' % format)
2231 
2232  def _SavePickle(self, stream):
2233  file_opened=False
2234  if not hasattr(stream, 'write'):
2235  stream=open(stream, 'wb')
2236  file_opened=True
2237  pickle.dump(self, stream, pickle.HIGHEST_PROTOCOL)
2238  if file_opened:
2239  stream.close()
2240 
2241  def _SaveHTML(self, stream_or_filename):
2242  def _escape(s):
2243  return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
2244 
2245  file_opened = False
2246  if not hasattr(stream_or_filename, 'write'):
2247  stream = open(stream_or_filename, 'w')
2248  file_opened = True
2249  else:
2250  stream = stream_or_filename
2251  stream.write('<table>')
2252  stream.write('<tr>')
2253  for col_name in self.col_namescol_names:
2254  stream.write('<th>%s</th>' % _escape(col_name))
2255  stream.write('</tr>')
2256  for row in self.rowsrows:
2257  stream.write('<tr>')
2258  for i, col in enumerate(row):
2259  val = ''
2260  if col != None:
2261  if self.col_typescol_types[i] == 'float':
2262  val = '%.3f' % col
2263  elif self.col_typescol_types[i] == 'int':
2264  val = '%d' % col
2265  elif self.col_typescol_types[i] == 'bool':
2266  val = col and 'true' or 'false'
2267  else:
2268  val = str(col)
2269  stream.write('<td>%s</td>' % _escape(val))
2270  stream.write('</tr>')
2271  stream.write('</table>')
2272  if file_opened:
2273  stream.close()
2274  def _SaveContext(self, stream_or_filename):
2275  file_opened = False
2276  if not hasattr(stream_or_filename, 'write'):
2277  stream = open(stream_or_filename, 'w')
2278  file_opened = True
2279  else:
2280  stream = stream_or_filename
2281  stream.write('\\starttable[')
2282  for col_type in self.col_typescol_types:
2283  if col_type =='string':
2284  stream.write('l|')
2285  elif col_type=='int':
2286  stream.write('r|')
2287  elif col_type =='float':
2288  stream.write('i3r|')
2289  else:
2290  stream.write('l|')
2291  stream.write(']\n\\HL\n')
2292  for col_name in self.col_namescol_names:
2293  stream.write('\\NC \\bf %s' % col_name)
2294  stream.write(' \\AR\\HL\n')
2295  for row in self.rowsrows:
2296  for i, col in enumerate(row):
2297  val = '---'
2298  if col != None:
2299  if self.col_typescol_types[i] == 'float':
2300  val = '%.3f' % col
2301  elif self.col_typescol_types[i] == 'int':
2302  val = '%d' % col
2303  elif self.col_typescol_types[i] == 'bool':
2304  val = col and 'true' or 'false'
2305  else:
2306  val = str(col)
2307  stream.write('\\NC %s' % val)
2308  stream.write(' \\AR\n')
2309  stream.write('\\HL\n')
2310  stream.write('\\stoptable')
2311  if file_opened:
2312  stream.close()
2313 
2314  def _SaveCSV(self, stream, sep):
2315  file_opened=False
2316  if not hasattr(stream, 'write'):
2317  stream=open(stream, 'w')
2318  file_opened=True
2319 
2320  writer=csv.writer(stream, delimiter=sep)
2321  writer.writerow(['%s' % n for n in self.col_namescol_names])
2322  for row in self.rowsrows:
2323  row=list(row)
2324  for i, c in enumerate(row):
2325  if c==None:
2326  row[i]='NA'
2327  writer.writerow(row)
2328  if file_opened:
2329  stream.close()
2330 
2331 
2332  def _SaveOST(self, stream):
2333  file_opened=False
2334  if hasattr(stream, 'write'):
2335  writer=csv.writer(stream, delimiter=' ')
2336  else:
2337  stream=open(stream, 'w')
2338  writer=csv.writer(stream, delimiter=' ')
2339  file_opened=True
2340  if self.commentcomment:
2341  stream.write(''.join(['# %s\n' % l for l in self.commentcomment.split('\n')]))
2342  writer.writerow(['%s[%s]' % t for t in zip(self.col_namescol_names, self.col_typescol_types)])
2343  for row in self.rowsrows:
2344  row=list(row)
2345  for i, c in enumerate(row):
2346  if c==None:
2347  row[i]='NA'
2348  writer.writerow(row)
2349  if file_opened:
2350  stream.close()
2351 
2352  def GetNumpyMatrixAsArray(self, *args):
2353  '''
2354  Returns a numpy array containing the selected columns from the table as
2355  columns as a matrix.
2356 
2357  Only columns of type *int* or *float* are supported. *NA* values in the
2358  table will be converted to *None* values.
2359 
2360  Originally the function used the numpy matrix class but that is going to be
2361  deprecated in the future. Numpy itself suggests replacing numpy matrix by
2362  numpy array.
2363 
2364  :param \\*args: column names to include in numpy array
2365 
2366  :warning: The function depends on *numpy*
2367  '''
2368  try:
2369  import numpy as np
2370 
2371  if len(args)==0:
2372  raise RuntimeError("At least one column must be specified.")
2373 
2374  idxs = []
2375  for arg in args:
2376  idx = self.GetColIndexGetColIndex(arg)
2377  col_type = self.col_typescol_types[idx]
2378  if col_type!='int' and col_type!='float':
2379  raise TypeError("Numpy matrix can only be generated from numeric "+\
2380  "column types")
2381  idxs.append(idx)
2382 
2383  a = np.array([list(self[i]) for i in idxs])
2384  return a.T
2385 
2386  except ImportError:
2387  LogError("Function needs numpy, but I could not import it.")
2388  raise
2389 
2390  def GetNumpyMatrix(self, *args):
2391  '''
2392  *Caution*: Numpy is deprecating the use of the numpy matrix class.
2393 
2394  Returns a numpy matrix containing the selected columns from the table as
2395  columns in the matrix.
2396 
2397  Only columns of type *int* or *float* are supported. *NA* values in the
2398  table will be converted to *None* values.
2399 
2400  :param \\*args: column names to include in numpy matrix
2401 
2402  :warning: The function depends on *numpy*
2403  '''
2404  LogWarning("table.GetNumpyMatrix is deprecated, please use "+
2405  "table.GetNumpyMatrixAsArray instead")
2406  try:
2407  import numpy as np
2408  m = self.GetNumpyMatrixAsArrayGetNumpyMatrixAsArray(*args)
2409  return np.matrix(m)
2410  except ImportError:
2411  LogError("Function needs numpy, but I could not import it.")
2412  raise
2413 
2414  def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
2415 
2416  '''
2417  In place Gaussian smooth of a column in the table with a given standard deviation.
2418  All nan are set to nan_value before smoothing.
2419 
2420  :param col: column name
2421  :type col: :class:`str`
2422 
2423  :param std: standard deviation for gaussian kernel
2424  :type std: `scalar`
2425 
2426  :param na_value: all na (None) values of the speciefied column are set to na_value before smoothing
2427  :type na_value: `scalar`
2428 
2429  :param padding: allows to handle padding behaviour see scipy ndimage.gaussian_filter1d documentation for more information. standard is reflect
2430  :type padding: :class:`str`
2431 
2432  :param c: constant value used for padding if padding mode is constant
2433  :type c: `scalar`
2434 
2435 
2436 
2437  :warning: The function depends on *scipy*
2438  '''
2439 
2440  try:
2441  from scipy import ndimage
2442  import numpy as np
2443  except ImportError:
2444  LogError("I need scipy.ndimage and numpy, but could not import it")
2445  raise
2446 
2447  idx = self.GetColIndexGetColIndex(col)
2448  col_type = self.col_typescol_types[idx]
2449  if col_type!='int' and col_type!='float':
2450  raise TypeError("GaussianSmooth can only be used on numeric column types")
2451 
2452  vals=[]
2453  for v in self[col]:
2454  if v!=None:
2455  vals.append(v)
2456  else:
2457  vals.append(na_value)
2458 
2459 
2460  smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
2461 
2462  result=[]
2463 
2464  for v in smoothed_values_ndarray:
2465  result.append(v)
2466 
2467  self[col]=result
2468 
2469 
2470  def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
2471  '''
2472  This returns the optimal prefactor values (i.e. :math:`a, b, c, ...`) for
2473  the following equation
2474 
2475  .. math::
2476  :label: op1
2477 
2478  a*u + b*v + c*w + ... = z
2479 
2480  where :math:`u, v, w` and :math:`z` are vectors. In matrix notation
2481 
2482  .. math::
2483  :label: op2
2484 
2485  A*p = z
2486 
2487  where :math:`A` contains the data from the table :math:`(u,v,w,...)`,
2488  :math:`p` are the prefactors to optimize :math:`(a,b,c,...)` and :math:`z`
2489  is the vector containing the result of equation :eq:`op1`.
2490 
2491  The parameter ref_col equals to :math:`z` in both equations, and \\*args
2492  are columns :math:`u`, :math:`v` and :math:`w` (or :math:`A` in :eq:`op2`).
2493  All columns must be specified by their names.
2494 
2495  **Example:**
2496 
2497  .. code-block:: python
2498 
2499  tab.GetOptimalPrefactors('colC', 'colA', 'colB')
2500 
2501  The function returns a list containing the prefactors
2502  :math:`a, b, c, ...` in the correct order (i.e. same as columns were
2503  specified in \\*args).
2504 
2505  Weighting:
2506  If the kwarg weights="columX" is specified, the equations are weighted by
2507  the values in that column. Each row is multiplied by the weight in that
2508  row, which leads to :eq:`op3`:
2509 
2510  .. math::
2511  :label: op3
2512 
2513  \\textit{weight}*a*u + \\textit{weight}*b*v + \\textit{weight}*c*w + ...
2514  = \\textit{weight}*z
2515 
2516  Weights must be float or int and can have any value. A value of 0 ignores
2517  this equation, a value of 1 means the same as no weight. If all weights are
2518  the same for each row, the same result will be obtained as with no weights.
2519 
2520  **Example:**
2521 
2522  .. code-block:: python
2523 
2524  tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
2525 
2526  '''
2527  try:
2528  import numpy as np
2529 
2530  if len(args)==0:
2531  raise RuntimeError("At least one column must be specified.")
2532 
2533  b = self.GetNumpyMatrixAsArrayGetNumpyMatrixAsArray(ref_col)
2534  a = self.GetNumpyMatrixAsArrayGetNumpyMatrixAsArray(*args)
2535 
2536  if len(kwargs)!=0:
2537  if 'weights' in kwargs:
2538  w = self.GetNumpyMatrixAsArrayGetNumpyMatrixAsArray(kwargs['weights'])
2539  b = np.multiply(b,w)
2540  a = np.multiply(a,w)
2541 
2542  else:
2543  raise RuntimeError("specified unrecognized kwargs, use weights as key")
2544 
2545  k = np.linalg.inv(a.T@a)@a.T@b
2546  return list(k.T.reshape(-1))
2547 
2548  except ImportError:
2549  LogError("Function needs numpy, but I could not import it.")
2550  raise
2551 
2552  def PlotEnrichment(self, score_col, class_col, score_dir='-',
2553  class_dir='-', class_cutoff=2.0,
2554  style='-', title=None, x_title=None, y_title=None,
2555  clear=True, save=None):
2556  '''
2557  Plot an enrichment curve using matplotlib of column *score_col* classified
2558  according to *class_col*.
2559 
2560  For more information about parameters of the enrichment, see
2561  :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
2562 
2563  :warning: The function depends on *matplotlib*
2564  '''
2565  try:
2566  import matplotlib.pyplot as plt
2567 
2568  enrx, enry = self.ComputeEnrichmentComputeEnrichment(score_col, class_col, score_dir,
2569  class_dir, class_cutoff)
2570 
2571  if not title:
2572  title = 'Enrichment of %s'%score_col
2573 
2574  if not x_title:
2575  x_title = '% database'
2576 
2577  if not y_title:
2578  y_title = '% positives'
2579 
2580  if clear:
2581  plt.clf()
2582 
2583  plt.plot(enrx, enry, style)
2584 
2585  plt.title(title, size='x-large', fontweight='bold')
2586  plt.ylabel(y_title, size='x-large')
2587  plt.xlabel(x_title, size='x-large')
2588 
2589  if save:
2590  plt.savefig(save)
2591 
2592  return plt
2593  except ImportError:
2594  LogError("Function needs matplotlib, but I could not import it.")
2595  raise
2596 
2597  def ComputeEnrichment(self, score_col, class_col, score_dir='-',
2598  class_dir='-', class_cutoff=2.0):
2599  '''
2600  Computes the enrichment of column *score_col* classified according to
2601  *class_col*.
2602 
2603  For this it is necessary, that the datapoints are classified into positive
2604  and negative points. This can be done in two ways:
2605 
2606  - by using one 'bool' type column (*class_col*) which contains *True* for
2607  positives and *False* for negatives
2608 
2609  - by specifying a classification column (*class_col*), a cutoff value
2610  (*class_cutoff*) and the classification columns direction (*class_dir*).
2611  This will generate the classification on the fly
2612 
2613  * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
2614  * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
2615 
2616  During the calculation, the table will be sorted according to *score_dir*,
2617  where a '-' values means smallest values first and therefore, the smaller
2618  the value, the better.
2619 
2620  :warning: If either the value of *class_col* or *score_col* is *None*, the
2621  data in this row is ignored.
2622  '''
2623 
2624  ALLOWED_DIR = ['+','-']
2625 
2626  score_idx = self.GetColIndexGetColIndex(score_col)
2627  score_type = self.col_typescol_types[score_idx]
2628  if score_type!='int' and score_type!='float':
2629  raise TypeError("Score column must be numeric type")
2630 
2631  class_idx = self.GetColIndexGetColIndex(class_col)
2632  class_type = self.col_typescol_types[class_idx]
2633  if class_type!='int' and class_type!='float' and class_type!='bool':
2634  raise TypeError("Classifier column must be numeric or bool type")
2635 
2636  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2637  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2638 
2639  self.SortSort(score_col, score_dir)
2640 
2641  x = [0]
2642  y = [0]
2643  enr = 0
2644  old_score_val = None
2645  i = 0
2646 
2647  for row in self.rowsrows:
2648  class_val = row[class_idx]
2649  score_val = row[score_idx]
2650  if class_val==None or score_val==None:
2651  continue
2652  if class_val!=None:
2653  if old_score_val==None:
2654  old_score_val = score_val
2655  if score_val!=old_score_val:
2656  x.append(i)
2657  y.append(enr)
2658  old_score_val = score_val
2659  i+=1
2660  if class_type=='bool':
2661  if class_val==True:
2662  enr += 1
2663  else:
2664  if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2665  enr += 1
2666  x.append(i)
2667  y.append(enr)
2668 
2669  # if no false positives or false negatives values are found return None
2670  if x[-1]==0 or y[-1]==0:
2671  return None
2672 
2673  x = [float(v)/x[-1] for v in x]
2674  y = [float(v)/y[-1] for v in y]
2675  return x,y
2676 
2677  def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
2678  class_dir='-', class_cutoff=2.0):
2679  '''
2680  Computes the area under the curve of the enrichment using the trapezoidal
2681  rule.
2682 
2683  For more information about parameters of the enrichment, see
2684  :meth:`ComputeEnrichment`.
2685 
2686  :warning: The function depends on *numpy*
2687  '''
2688  try:
2689  import numpy as np
2690 
2691  enr = self.ComputeEnrichmentComputeEnrichment(score_col, class_col, score_dir,
2692  class_dir, class_cutoff)
2693 
2694  if enr==None:
2695  return None
2696  return np.trapz(enr[1], enr[0])
2697  except ImportError:
2698  LogError("Function needs numpy, but I could not import it.")
2699  raise
2700 
2701  def ComputeROC(self, score_col, class_col, score_dir='-',
2702  class_dir='-', class_cutoff=2.0):
2703  '''
2704  Computes the receiver operating characteristics (ROC) of column *score_col*
2705  classified according to *class_col*.
2706 
2707  For this it is necessary, that the datapoints are classified into positive
2708  and negative points. This can be done in two ways:
2709 
2710  - by using one 'bool' column (*class_col*) which contains True for positives
2711  and False for negatives
2712  - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
2713  and the classification columns direction (*class_dir*). This will generate
2714  the classification on the fly
2715 
2716  - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
2717  - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
2718 
2719  During the calculation, the table will be sorted according to *score_dir*,
2720  where a '-' values means smallest values first and therefore, the smaller
2721  the value, the better.
2722 
2723  If *class_col* does not contain any positives (i.e. value is True (if column
2724  is of type bool) or evaluated to True (if column is of type int or float
2725  (depending on *class_dir* and *class_cutoff*))) the ROC is not defined and
2726  the function will return *None*.
2727 
2728  :warning: If either the value of *class_col* or *score_col* is *None*, the
2729  data in this row is ignored.
2730  '''
2731 
2732  ALLOWED_DIR = ['+','-']
2733 
2734  score_idx = self.GetColIndexGetColIndex(score_col)
2735  score_type = self.col_typescol_types[score_idx]
2736  if score_type!='int' and score_type!='float':
2737  raise TypeError("Score column must be numeric type")
2738 
2739  class_idx = self.GetColIndexGetColIndex(class_col)
2740  class_type = self.col_typescol_types[class_idx]
2741  if class_type!='int' and class_type!='float' and class_type!='bool':
2742  raise TypeError("Classifier column must be numeric or bool type")
2743 
2744  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2745  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2746 
2747  self.SortSort(score_col, score_dir)
2748 
2749  x = [0]
2750  y = [0]
2751  tp = 0
2752  fp = 0
2753  old_score_val = None
2754 
2755  for i,row in enumerate(self.rowsrows):
2756  class_val = row[class_idx]
2757  score_val = row[score_idx]
2758  if class_val==None or score_val==None:
2759  continue
2760  if class_val!=None:
2761  if old_score_val==None:
2762  old_score_val = score_val
2763  if score_val!=old_score_val:
2764  x.append(fp)
2765  y.append(tp)
2766  old_score_val = score_val
2767  if class_type=='bool':
2768  if class_val==True:
2769  tp += 1
2770  else:
2771  fp += 1
2772  else:
2773  if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2774  tp += 1
2775  else:
2776  fp += 1
2777  x.append(fp)
2778  y.append(tp)
2779 
2780  # if no false positives or false negatives values are found return None
2781  if x[-1]==0 or y[-1]==0:
2782  return None
2783 
2784  x = [float(v)/x[-1] for v in x]
2785  y = [float(v)/y[-1] for v in y]
2786  return x,y
2787 
2788  def ComputeROCAUC(self, score_col, class_col, score_dir='-',
2789  class_dir='-', class_cutoff=2.0):
2790  '''
2791  Computes the area under the curve of the receiver operating characteristics
2792  using the trapezoidal rule.
2793 
2794  For more information about parameters of the ROC, see
2795  :meth:`ComputeROC`.
2796 
2797  :warning: The function depends on *numpy*
2798  '''
2799  try:
2800  import numpy as np
2801 
2802  roc = self.ComputeROCComputeROC(score_col, class_col, score_dir,
2803  class_dir, class_cutoff)
2804 
2805  if not roc:
2806  return None
2807  return np.trapz(roc[1], roc[0])
2808  except ImportError:
2809  LogError("Function needs numpy, but I could not import it.")
2810  raise
2811 
2812  def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
2813  class_dir='-', class_cutoff=2.0):
2814  '''
2815  Computes the area under the curve of the log receiver operating
2816  characteristics (logROC) where the x-axis is semilogarithmic
2817  using the trapezoidal rule.
2818 
2819  The logROC is computed with a lambda of 0.001 according to
2820  Rapid Context-Dependent Ligand Desolvation in Molecular Docking
2821  Mysinger M. and Shoichet B., Journal of Chemical Information and Modeling
2822  2010 50 (9), 1561-1573
2823 
2824  For more information about parameters of the ROC, see
2825  :meth:`ComputeROC`.
2826 
2827  :warning: The function depends on *numpy*
2828  '''
2829  try:
2830  import numpy as np
2831 
2832  roc = self.ComputeROCComputeROC(score_col, class_col, score_dir,
2833  class_dir, class_cutoff)
2834 
2835  if not roc:
2836  return None
2837 
2838  rocxt, rocyt = roc
2839  rocx=[]
2840  rocy=[]
2841 
2842  # define lambda
2843  l=0.001
2844 
2845  # remove all duplicate x-values
2846  rocxt = [x if x>0 else l for x in rocxt]
2847  for i in range(len(rocxt)-1):
2848  if rocxt[i]==rocxt[i+1]:
2849  continue
2850  rocx.append(rocxt[i])
2851  rocy.append(rocyt[i])
2852  rocx.append(1.0)
2853  rocy.append(1.0)
2854 
2855  # compute logauc
2856  value = 0
2857  for i in range(len(rocx)-1):
2858  x = rocx[i]
2859  if rocx[i]==rocx[i+1]:
2860  continue
2861  b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
2862  value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
2863  return value/math.log10(1.0/l)
2864 
2865  except ImportError:
2866  LogError("Function needs numpy, but I could not import it.")
2867  raise
2868 
2869  def PlotROC(self, score_col, class_col, score_dir='-',
2870  class_dir='-', class_cutoff=2.0,
2871  style='-', title=None, x_title=None, y_title=None,
2872  clear=True, save=None):
2873  '''
2874  Plot an ROC curve using matplotlib.
2875 
2876  For more information about parameters of the ROC, see
2877  :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2878 
2879  :warning: The function depends on *matplotlib*
2880  '''
2881 
2882  try:
2883  import matplotlib.pyplot as plt
2884 
2885  roc = self.ComputeROCComputeROC(score_col, class_col, score_dir,
2886  class_dir, class_cutoff)
2887 
2888  if not roc:
2889  return None
2890 
2891  enrx, enry = roc
2892 
2893  if not title:
2894  title = 'ROC of %s'%score_col
2895 
2896  if not x_title:
2897  x_title = 'false positive rate'
2898 
2899  if not y_title:
2900  y_title = 'true positive rate'
2901 
2902  if clear:
2903  plt.clf()
2904 
2905  plt.plot(enrx, enry, style)
2906 
2907  plt.title(title, size='x-large', fontweight='bold')
2908  plt.ylabel(y_title, size='x-large')
2909  plt.xlabel(x_title, size='x-large')
2910 
2911  if save:
2912  plt.savefig(save)
2913 
2914  return plt
2915  except ImportError:
2916  LogError("Function needs matplotlib, but I could not import it.")
2917  raise
2918 
2919  def PlotLogROC(self, score_col, class_col, score_dir='-',
2920  class_dir='-', class_cutoff=2.0,
2921  style='-', title=None, x_title=None, y_title=None,
2922  clear=True, save=None):
2923  '''
2924  Plot an logROC curve where the x-axis is semilogarithmic using matplotlib
2925 
2926  For more information about parameters of the ROC, see
2927  :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2928 
2929  :warning: The function depends on *matplotlib*
2930  '''
2931 
2932  try:
2933  import matplotlib.pyplot as plt
2934 
2935  roc = self.ComputeROCComputeROC(score_col, class_col, score_dir,
2936  class_dir, class_cutoff)
2937 
2938  if not roc:
2939  return None
2940 
2941  rocx, rocy = roc
2942 
2943  if not title:
2944  title = 'logROC of %s'%score_col
2945 
2946  if not x_title:
2947  x_title = 'false positive rate'
2948 
2949  if not y_title:
2950  y_title = 'true positive rate'
2951 
2952  if clear:
2953  plt.clf()
2954 
2955  rocx = [x if x>0 else 0.001 for x in rocx]
2956 
2957 
2958  plt.plot(rocx, rocy, style)
2959 
2960  plt.title(title, size='x-large', fontweight='bold')
2961  plt.ylabel(y_title, size='x-large')
2962  plt.xlabel(x_title, size='x-large')
2963  try:
2964  plt.xscale('log', basex=10)
2965  except:
2966  plt.xscale('log', base=10) # breaking change in matplotlib 3.5
2967  plt.xlim(0.001, 1.0)
2968 
2969  if save:
2970  plt.savefig(save)
2971 
2972  return plt
2973  except ImportError:
2974  LogError("Function needs matplotlib, but I could not import it.")
2975  raise
2976 
2977  def ComputeMCC(self, score_col, class_col, score_dir='-',
2978  class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
2979  '''
2980  Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
2981  with the points classified into true positives, false positives, true
2982  negatives and false negatives according to a specified classification
2983  column (*class_col*).
2984 
2985  The datapoints in *score_col* and *class_col* are classified into
2986  positive and negative points. This can be done in two ways:
2987 
2988  - by using 'bool' columns which contains True for positives and False
2989  for negatives
2990 
2991  - by using 'float' or 'int' columns and specifying a cutoff value and the
2992  columns direction. This will generate the classification on the fly
2993 
2994  * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2995  * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2996 
2997  The two possibilities can be used together, i.e. 'bool' type for one column
2998  and 'float'/'int' type and cutoff/direction for the other column.
2999  '''
3000  ALLOWED_DIR = ['+','-']
3001 
3002  score_idx = self.GetColIndexGetColIndex(score_col)
3003  score_type = self.col_typescol_types[score_idx]
3004  if score_type!='int' and score_type!='float' and score_type!='bool':
3005  raise TypeError("Score column must be numeric or bool type")
3006 
3007  class_idx = self.GetColIndexGetColIndex(class_col)
3008  class_type = self.col_typescol_types[class_idx]
3009  if class_type!='int' and class_type!='float' and class_type!='bool':
3010  raise TypeError("Classifier column must be numeric or bool type")
3011 
3012  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
3013  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
3014 
3015  tp = 0
3016  fp = 0
3017  fn = 0
3018  tn = 0
3019 
3020  for i,row in enumerate(self.rowsrows):
3021  class_val = row[class_idx]
3022  score_val = row[score_idx]
3023  if class_val!=None:
3024  if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
3025  if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
3026  tp += 1
3027  else:
3028  fn += 1
3029  else:
3030  if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
3031  tn += 1
3032  else:
3033  fp += 1
3034 
3035  mcc = None
3036  msg = None
3037  if (tp+fn)==0:
3038  msg = 'factor (tp + fn) is zero'
3039  elif (tp+fp)==0:
3040  msg = 'factor (tp + fp) is zero'
3041  elif (tn+fn)==0:
3042  msg = 'factor (tn + fn) is zero'
3043  elif (tn+fp)==0:
3044  msg = 'factor (tn + fp) is zero'
3045 
3046  if msg:
3047  LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
3048  else:
3049  mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
3050  return mcc
3051 
3052 
def IsEmpty(self, col_name=None, ignore_nan=True):
  '''
  Checks if a table (or a single column of it) is empty.

  If no column name is specified, the whole table is checked for being empty,
  whereas if a column name is specified, only this column is checked.

  By default, all None values are ignored, and thus, a table containing only
  None values is considered as empty. By specifying the option
  ignore_nan=False, None values are counted as 'normal' values.

  :param col_name: name of the column to check; a falsy value (None, '')
                   means "check the whole table"
  :param ignore_nan: if True, cells holding None do not count as content

  :returns: True if nothing counts as content, False otherwise

  :raises: ValueError if the table has no columns at all but *col_name* was
           given
  '''

  # table with no columns (and therefore no cells) at all
  if len(self.col_names) == 0:
    if col_name:
      raise ValueError('Table has no column named "%s"' % col_name)
    return True

  # column name specified -> the answer is delegated to Count()
  if col_name:
    return self.Count(col_name, ignore_nan=ignore_nan) == 0

  # no column name specified -> test whole table
  if ignore_nan:
    # identity check: a cell only counts as missing if it really is None
    return all(cell is None for row in self.rows for cell in row)

  # every cell counts, so a single existing cell makes the table non-empty
  return not any(True for row in self.rows for _ in row)
3088 
3089 
def Extend(self, tab, overwrite=None):
  """
  Append each row of *tab* to the current table. The data is appended based
  on the column names, thus the order of the table columns is *not* relevant,
  only the header names.

  If there is a column in *tab* that is not present in the current table,
  it is added to the current table (via AddCol) with the type it has in
  *tab*.

  If the type of any column shared by *tab* and the current table differs,
  a *TypeError* is raised. The check runs before anything is modified, so a
  failed call leaves the current table untouched.

  *overwrite* is forwarded unchanged to AddRow for every row of *tab*. It is
  expected to be either None (always append) or the name of a column used by
  AddRow to find an existing row to overwrite instead of appending; see
  AddRow for the exact semantics.

  :param tab: table whose rows are appended; needs ``col_names``,
              ``col_types``, ``rows`` and ``GetColIndex``
  :param overwrite: passed through to AddRow

  :raises: TypeError if a shared column has different types in both tables
  """
  # check that column types agree *before* touching the current table, so
  # that a type clash cannot leave half of the new columns behind
  for name in self.col_names:
    if name in tab.col_names:
      curr_type = self.col_types[self.GetColIndex(name)]
      new_type = tab.col_types[tab.GetColIndex(name)]
      if curr_type != new_type:
        raise TypeError('cannot extend table, column %s in new ' % name +
                        'table different type (%s) than in ' % new_type +
                        'current table (%s)' % curr_type)

  # add columns to the current table that only exist in tab
  for name, typ in zip(tab.col_names, tab.col_types):
    if name not in self.col_names:
      self.AddCol(name, typ)

  # iterate over a snapshot: if tab is the current table itself, AddRow
  # grows tab.rows while we loop and a live iteration would never end
  for row in list(tab.rows):
    self.AddRow(dict(zip(tab.col_names, row)), overwrite)
3129 
3130 
def Merge(table1, table2, by, only_matching=False):
  """
  Returns a new table containing the data from both tables. The rows are
  combined based on the common values in the column(s) by. The option 'by' can
  be a list of column names. When this is the case, merging is based on
  multiple columns.
  For example, the two tables below

  ==== ====
  x     y
  ==== ====
   1   10
   2   15
   3   20
  ==== ====

  ==== ====
  x     u
  ==== ====
    1  100
    3  200
    4  400
  ==== ====

  when merged by column x, produce the following output:

  ===== ===== =====
  x      y     u
  ===== ===== =====
  1      10    100
  2      15    None
  3      20    200
  4      None  400
  ===== ===== =====

  The key column(s) appear once, at the position they have in *table1*. The
  remaining columns of *table2* are appended after the columns of *table1*;
  if such a column name already exists in *table1*, it is renamed to
  ``name_2``, ``name_3``, ... until it no longer clashes with *table1*.

  :param table1: first table
  :param table2: second table
  :param by: column name (string) or iterable of column names to merge on;
             every name must exist in both tables
  :param only_matching: if True, only rows whose key occurs in both tables
                        are kept

  :returns: a new Table; rows with keys of *table1* come first, followed
            (unless *only_matching*) by the rows whose key only occurs in
            *table2*

  :raises: ValueError if a column in *by* is missing in one of the tables or
           if a key value occurs more than once within one table
  """
  def _key(row, indices):
    return tuple(row[i] for i in indices)

  def _index_rows(rows, indices, message):
    # map key tuple -> row, refusing ambiguous (duplicated) keys
    by_key = {}
    for row in rows:
      key = _key(row, indices)
      if key in by_key:
        raise ValueError(message % (str(key)))
      by_key[key] = row
    return by_key

  # normalise once; also protects against one-shot iterables being consumed
  by_cols = [by] if isinstance(by, str) else list(by)

  names2 = list(table2.col_names)
  types2 = list(table2.col_types)
  common2_indices = [names2.index(b) for b in by_cols]

  # columns of table2 that are carried over (everything except the keys)
  new_index = [i for i in range(len(names2)) if i not in common2_indices]
  col_names = [names2[i] for i in new_index]
  col_types = [types2[i] for i in new_index]

  # avoid name clashes with table1 by appending a running number
  for i, name in enumerate(col_names):
    try_name = name
    counter = 1
    while try_name in table1.col_names:
      counter += 1
      try_name = '%s_%d' % (name, counter)
    col_names[i] = try_name

  common1_indices = [table1.col_names.index(b) for b in by_cols]
  common1 = _index_rows(table1.rows, common1_indices,
                        'duplicate key "%s" in first table')
  common2 = _index_rows(table2.rows, common2_indices,
                        'duplicate key "%s" in second table')

  new_tab = Table(table1.col_names+col_names, table1.col_types+col_types)
  width1 = len(table1.col_names)

  # rows of table1, completed with the matching data of table2 (or None)
  for key, row1 in common1.items():
    if key in common2:
      row2 = common2[key]
      extra = [row2[i] for i in new_index]
    elif only_matching:
      continue
    else:
      # pad by the number of carried-over columns, which stays correct even
      # if a key column is named more than once in 'by'
      extra = [None] * len(new_index)
    new_tab.AddRow(list(row1) + extra)

  if only_matching:
    return new_tab

  # rows that only exist in table2: table1 part is None except for the keys
  for key, row2 in common2.items():
    if key not in common1:
      row = [None] * width1 + [row2[i] for i in new_index]
      for common1_index, common2_index in zip(common1_indices,
                                              common2_indices):
        row[common1_index] = row2[common2_index]
      new_tab.AddRow(row)
  return new_tab
3233 
3234 
3235 # LocalWords: numpy Numpy
def __add__(self, rhs)
Definition: table.py:85
def __iter__(self)
Definition: table.py:79
def __init__(self, op, lhs, rhs)
Definition: table.py:71
def __div__(self, rhs)
Definition: table.py:94
def __mul__(self, rhs)
Definition: table.py:91
def __sub__(self, rhs)
Definition: table.py:88
def __init__(self, table, col)
Definition: table.py:98
def __add__(self, rhs)
Definition: table.py:118
def __iter__(self)
Definition: table.py:105
def __getitem__(self, index)
Definition: table.py:112
def __div__(self, rhs)
Definition: table.py:127
def __setitem__(self, index, value)
Definition: table.py:115
def __len__(self)
Definition: table.py:109
def __mul__(self, rhs)
Definition: table.py:124
def __sub__(self, rhs)
Definition: table.py:121
def Percentiles(self, col, nths)
Definition: table.py:2016
def __getitem__(self, k)
Definition: table.py:407
def _SaveOST(self, stream)
Definition: table.py:2332
def ComputeEnrichment(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0)
Definition: table.py:2598
def _Min(self, col)
Definition: table.py:1838
def Sort(self, by, order='+')
Definition: table.py:1050
def GetColIndex(self, col)
Definition: table.py:369
def _ParseColTypes(types, exp_num=None)
Definition: table.py:247
def HasCol(self, col)
Definition: table.py:401
def SpearmanCorrel(self, col1, col2)
Definition: table.py:2158
def __str__(self)
Definition: table.py:479
def GetNumpyMatrixAsArray(self, *args)
Definition: table.py:2352
def Extend(self, tab, overwrite=None)
Definition: table.py:3090
def Min(self, col)
Definition: table.py:1856
def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None, colors=None, width=0.8, bottom=0, legend=False, legend_names=None, show=False, save=False)
Definition: table.py:1509
def __init__(self, col_names=[], col_types=None, **kwargs)
Definition: table.py:221
def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log', colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False)
Definition: table.py:1671
def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0)
Definition: table.py:2414
def __getattr__(self, col_name)
Definition: table.py:237
def _SaveHTML(self, stream_or_filename)
Definition: table.py:2241
def RemoveCol(self, col)
Definition: table.py:688
def __setitem__(self, k, value)
Definition: table.py:413
def Median(self, col)
Definition: table.py:2061
def PlotROC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0, style='-', title=None, x_title=None, y_title=None, clear=True, save=None)
Definition: table.py:2872
def MaxIdx(self, col)
Definition: table.py:1826
def Filter(self, *args, **kwargs)
Definition: table.py:791
def GetColNames(self)
Definition: table.py:379
def AddCol(self, col_name, col_type, data=None)
Definition: table.py:701
def Mean(self, col)
Definition: table.py:1915
def Count(self, col, ignore_nan=True)
Definition: table.py:2112
def MaxRow(self, col)
Definition: table.py:1800
def MinRow(self, col)
Definition: table.py:1867
def Load(stream_or_filename, format='auto', sep=',')
Definition: table.py:986
def GetName(self)
Definition: table.py:323
def AddRow(self, data, overwrite=None)
Definition: table.py:589
def ToString(self, float_format='%.3f', int_format='%d', rows=None)
Definition: table.py:422
def _SaveContext(self, stream_or_filename)
Definition: table.py:2274
def PlotLogROC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0, style='-', title=None, x_title=None, y_title=None, clear=True, save=None)
Definition: table.py:2922
def Correl(self, col1, col2)
Definition: table.py:2132
def _Max(self, col)
Definition: table.py:1489
def PlotEnrichment(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0, style='-', title=None, x_title=None, y_title=None, clear=True, save=None)
Definition: table.py:2555
def SetName(self, name)
Definition: table.py:314
def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0)
Definition: table.py:2678
def _AddRowsFromDict(self, d, overwrite=None)
Definition: table.py:507
def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None, z_title=None, x_range=None, y_range=None, z_range=None, color=None, plot_if=None, legend=None, num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False, labels=None, max_num_labels=None, title=None, clear=True, save=False, **kwargs)
Definition: table.py:1126
def Save(self, stream_or_filename, format='ost', sep=',')
Definition: table.py:2198
def GetUnique(self, col, ignore_nan=True)
Definition: table.py:1080
def ComputeLogROCAUC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0)
Definition: table.py:2813
def _SavePickle(self, stream)
Definition: table.py:2232
def IsEmpty(self, col_name=None, ignore_nan=True)
Definition: table.py:3053
def RenameCol(self, old_name, new_name)
Definition: table.py:329
def Max(self, col)
Definition: table.py:1815
def Select(self, query)
Definition: table.py:827
def ComputeROC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0)
Definition: table.py:2702
def _Coerce(self, value, ty)
Definition: table.py:342
def GetNumpyMatrix(self, *args)
Definition: table.py:2390
def Zip(self, *args)
Definition: table.py:1101
def _SaveCSV(self, stream, sep)
Definition: table.py:2314
def ComputeROCAUC(self, score_col, class_col, score_dir='-', class_dir='-', class_cutoff=2.0)
Definition: table.py:2789
def StdDev(self, col)
Definition: table.py:2087
def MinIdx(self, col)
Definition: table.py:1882
def GetOptimalPrefactors(self, ref_col, *args, **kwargs)
Definition: table.py:2470
def PairedTTest(self, col_a, col_b)
Definition: table.py:565
def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False, histtype='stepfilled', align='mid', x_title=None, y_title=None, title=None, clear=True, save=False, color=None, y_range=None)
Definition: table.py:1380
def SearchColNames(self, regex)
Definition: table.py:385
def Stats(self, col)
Definition: table.py:482
def Sum(self, col)
Definition: table.py:1894
def ComputeMCC(self, score_col, class_col, score_dir='-', class_dir='-', score_cutoff=2.0, class_cutoff=2.0)
Definition: table.py:2978
def RowMean(self, mean_col_name, cols)
Definition: table.py:1943
def __str__(self)
Definition: table.py:144
def __setitem__(self, col_name, val)
Definition: table.py:154
def __getattr__(self, col_name)
Definition: table.py:160
def __len__(self)
Definition: table.py:151
def __setattr__(self, col_name, val)
Definition: table.py:165
def __init__(self, row_data, tab)
Definition: table.py:135
def __getitem__(self, col_name)
Definition: table.py:139
def MakeTitle(col_name)
Definition: table.py:11
def GuessColumnType(iterator)
Definition: table.py:38
def IsStringLike(value)
Definition: table.py:14
def IsScalar(value)
Definition: table.py:27
def Merge(table1, table2, by, only_matching=False)
Definition: table.py:3131
def IsNullString(value)
Definition: table.py:23