table.py
1import csv
2import re
3import math
4from ost import stutil
5import itertools
6import operator
7import pickle
8import weakref
9from ost import LogError, LogWarning, LogInfo, LogVerbose
10
11def MakeTitle(col_name):
12 return col_name.replace('_', ' ')
13
14def IsStringLike(value):
15 if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
16 return False
17 try:
18 value+''
19 return True
20 except:
21 return False
22
23def IsNullString(value):
24 value=value.strip().upper()
25 return value in ('', 'NULL', 'NONE', 'NA')
26
27def IsScalar(value):
28 if IsStringLike(value):
29 return True
30 try:
31 if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
32 return False
33 iter(value)
34 return False
35 except:
36 return True
37
38def GuessColumnType(iterator):
39 empty=True
40 possibilities=set(['bool', 'int', 'float'])
41 for ele in iterator:
42 str_ele=str(ele).upper()
43 if IsNullString(str_ele):
44 continue
45 empty=False
46 if 'int' in possibilities:
47 try:
48 int(str_ele)
49 except ValueError:
50 possibilities.remove('int')
51
52 if 'float' in possibilities:
53 try:
54 float(str_ele)
55 except ValueError:
56 possibilities.remove('float')
57 if 'bool' in possibilities:
58 if str_ele not in set(['YES', 'NO', 'TRUE', 'FALSE']):
59 possibilities.remove('bool')
60
61 if len(possibilities)==0:
62 return 'string'
63 if len(possibilities)==2:
64 return 'int'
65 if empty:
66 return 'string'
67 # return the last element available
68 return possibilities.pop()
69
70class BinaryColExpr:
 71 def __init__(self, op, lhs, rhs):
72 self.op=op
73 self.lhs=lhs
74 self.rhs=rhs
75 if IsScalar(lhs):
 76 self.lhs=itertools.cycle([self.lhs])
77 if IsScalar(rhs):
78 self.rhs=itertools.cycle([self.rhs])
79 def __iter__(self):
80 for l, r in zip(self.lhs, self.rhs):
81 if l!=None and r!=None:
82 yield self.op(l, r)
83 else:
84 yield None
85 def __add__(self, rhs):
86 return BinaryColExpr(operator.add, self, rhs)
87
88 def __sub__(self, rhs):
89 return BinaryColExpr(operator.sub, self, rhs)
90
91 def __mul__(self, rhs):
92 return BinaryColExpr(operator.mul, self, rhs)
93
 94 def __truediv__(self, rhs):
 95 return BinaryColExpr(operator.truediv, self, rhs)
96
97class TableCol:
 98 def __init__(self, table, col):
99 self._table=table
100 if type(col)==str:
101 self.col_index=self._table.GetColIndex(col)
102 else:
103 self.col_index=col
104
105 def __iter__(self):
106 for row in self._table.rows:
107 yield row[self.col_index]
108
109 def __len__(self):
110 return len(self._table.rows)
111
112 def __getitem__(self, index):
113 return self._table.rows[index][self.col_index]
114
115 def __setitem__(self, index, value):
116 self._table.rows[index][self.col_index]=value
117
118 def __add__(self, rhs):
119 return BinaryColExpr(operator.add, self, rhs)
120
121 def __sub__(self, rhs):
122 return BinaryColExpr(operator.sub, self, rhs)
123
124 def __mul__(self, rhs):
125 return BinaryColExpr(operator.mul, self, rhs)
126
 127 def __truediv__(self, rhs):
 128 return BinaryColExpr(operator.truediv, self, rhs)
129
131 """
132 Essentially a named tuple, but allows column names that are not valid
133 python variable names.
134 """
135 def __init__(self, row_data, tab):
136 self.__dict__['tab'] = weakref.proxy(tab)
137 self.__dict__['row_data'] = row_data
138
139 def __getitem__(self, col_name):
140 if type(col_name)==int:
141 return self.row_data[col_name]
142 return self.row_data[self.tab.GetColIndex(col_name)]
143
144 def __str__(self):
145 s = []
146 for k, v in zip(self.__dict__['tab'].col_names, self.__dict__['row_data']):
147 s.append('%s=%s' % (k, str(v)))
148 return ', '.join(s)
149
150
151 def __len__(self):
152 return len(self.row_data)
153
154 def __setitem__(self, col_name, val):
155 if type(col_name)==int:
156 self.row_data[col_name] = val
157 else:
158 self.row_data[self.tab.GetColIndex(col_name)] = val
159
160 def __getattr__(self, col_name):
161 if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
162 raise AttributeError(col_name)
163 return self.row_data[self.tab.GetColIndex(col_name)]
164
165 def __setattr__(self, col_name, val):
166 if 'col_names' not in self.tab.__dict__ or col_name not in self.tab.col_names:
167 raise AttributeError(col_name)
168 self.row_data[self.tab.GetColIndex(col_name)] = val
169
170class Table(object):
171 """
172
173 The table class provides convenient access to data in tabular form. An empty
174 table can be easily constructed as follows
175
176 .. code-block:: python
177
178 tab = Table()
179
180 If you want to add columns directly when creating the table, column names
181 and *column types* can be specified as follows
182
183 .. code-block:: python
184
185 tab = Table(['nameX','nameY','nameZ'], 'sfb')
186
 187 This will create three columns called nameX, nameY and nameZ of type string,
 188 float and bool, respectively. There will be no data in the table and thus,
 189 the table will not contain any rows.
190
191 The following *column types* are supported:
192
193 ======= ========
194 name abbrev
195 ======= ========
196 string s
197 float f
198 int i
199 bool b
200 ======= ========
201
202 If you want to add data to the table in addition, use the following:
203
204 .. code-block:: python
205
206 tab=Table(['nameX','nameY','nameZ'],
207 'sfb',
208 nameX = ['a','b','c'],
209 nameY = [0.1, 1.2, 3.414],
210 nameZ = [True, False, False])
211
 212 If values for one column are left out, they will be filled with NA. If values
 213 are specified, however, the same number of values must be given for every
 214 column.
215
216 """
217
218 SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)
219
220
221 def __init__(self, col_names=[], col_types=None, **kwargs):
222
223 self.col_names=list(col_names)
224 self.comment=''
225 self.name=''
226
227 self.col_types = self._ParseColTypes(col_types)
228 self.rows=[]
229 if len(kwargs)>=0:
230 if not col_names:
231 self.col_names=[v for v in list(kwargs.keys())]
232 if not self.col_types:
233 self.col_types=['string' for u in range(len(self.col_names))]
234 if len(kwargs)>0:
235 self._AddRowsFromDict(kwargs)
236
237 def __getattr__(self, col_name):
238 # pickling doesn't call the standard __init__ defined above and thus
239 # col_names might not be defined. This leads to infinite recursions.
240 # Protect against it by checking that col_names is contained in
241 # __dict__
242 if 'col_names' not in self.__dict__ or col_name not in self.col_names:
243 raise AttributeError(col_name)
244 return TableCol(self, col_name)
245
246 @staticmethod
247 def _ParseColTypes(types, exp_num=None):
248 if types==None:
249 return None
250
251 short2long = {'s' : 'string', 'i': 'int', 'b' : 'bool', 'f' : 'float'}
252 allowed_short = list(short2long.keys())
253 allowed_long = list(short2long.values())
254
255 type_list = []
256
257 # string type
258 if IsScalar(types):
259 if type(types)==str:
260 types = types.lower()
261
262 # single value
263 if types in allowed_long:
264 type_list.append(types)
265 elif types in allowed_short:
266 type_list.append(short2long[types])
267
268 # comma separated list of long or short types
269 elif types.find(',')!=-1:
270 for t in types.split(','):
271 if t in allowed_long:
272 type_list.append(t)
273 elif t in allowed_short:
274 type_list.append(short2long[t])
275 else:
276 raise ValueError('Unknown type %s in types %s'%(t,types))
277
278 # string of short types
279 else:
280 for t in types:
281 if t in allowed_short:
282 type_list.append(short2long[t])
283 else:
284 raise ValueError('Unknown type %s in types %s'%(t,types))
285
286 # non-string type
287 else:
288 raise ValueError('Col type %s must be string or list'%types)
289
290 # list type
291 else:
292 for t in types:
293 # must be string type
294 if type(t)==str:
295 t = t.lower()
296 if t in allowed_long:
297 type_list.append(t)
298 elif t in allowed_short:
299 type_list.append(short2long[t])
300 else:
301 raise ValueError('Unknown type %s in types %s'%(t,types))
302
303 # non-string type
304 else:
305 raise ValueError('Col type %s must be string or list'%types)
306
307 if exp_num:
308 if len(type_list)!=exp_num:
 309 raise ValueError('Parsed number of col types (%i) differs from '
 310 'expected (%i) in types %s'%(len(type_list),exp_num,types))
311
312 return type_list
313
314 def SetName(self, name):
315 '''
316 Set name of the table
317
318 :param name: name
319 :type name: :class:`str`
320 '''
321 self.name = name
322
323 def GetName(self):
324 '''
325 Get name of table
326 '''
327 return self.name
328
329 def RenameCol(self, old_name, new_name):
330 """
331 Rename column *old_name* to *new_name*.
332
333 :param old_name: Name of the old column
334 :param new_name: Name of the new column
335 :raises: :exc:`ValueError` when *old_name* is not a valid column
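
    **Example** (a minimal sketch; the column names are only illustrative):

    .. code-block:: python

      tab = Table(['old_col'], 'f', old_col=[1.0, 2.0])
      tab.RenameCol('old_col', 'new_col')
      print(tab.GetColNames())  # ['new_col']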
336 """
337 if old_name==new_name:
338 return
339 self.AddCol(new_name, self.col_types[self.GetColIndex(old_name)],
340 self[old_name])
341 self.RemoveCol(old_name)
342 def _Coerce(self, value, ty):
343 '''
344 Try to convert values (e.g. from :class:`str` type) to the specified type
345
346 :param value: the value
347 :type value: any type
348
349 :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
350 *bool*)
351 :type ty: :class:`str`
352 '''
353 if value=='NA' or value==None:
354 return None
355 if ty=='int':
356 return int(value)
357 if ty=='float':
358 return float(value)
359 if ty=='string':
360 return str(value)
361 if ty=='bool':
 362 if isinstance(value, str):
363 if value.upper() in ('FALSE', 'NO',):
364 return False
365 return True
366 return bool(value)
367 raise ValueError('Unknown type %s' % ty)
368
369 def GetColIndex(self, col):
370 '''
371 Returns the column index for the column with the given name.
372
373 :raises: ValueError if no column with the name is found.
374 '''
375 if col not in self.col_names:
376 raise ValueError('Table has no column named "%s"' % col)
377 return self.col_names.index(col)
378
379 def GetColNames(self):
380 '''
381 Returns a list containing all column names.
382 '''
383 return self.col_names
384
385 def SearchColNames(self, regex):
386 '''
387 Returns a list of column names matching the regex.
388
389 :param regex: regex pattern
390 :type regex: :class:`str`
391
392 :returns: :class:`list` of column names (:class:`str`)
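
    **Example** (an illustrative sketch; column names are hypothetical):

    .. code-block:: python

      tab = Table(['score_a', 'score_b', 'label'], 'ffs')
      tab.SearchColNames('^score_')  # ['score_a', 'score_b']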
393 '''
394 matching_names = []
395 for name in self.col_names:
396 matches = re.search(regex, name)
397 if matches:
398 matching_names.append(name)
399 return matching_names
400
401 def HasCol(self, col):
402 '''
403 Checks if the column with a given name is present in the table.
404 '''
405 return col in self.col_names
406
407 def __getitem__(self, k):
408 if type(k)==int:
409 return TableCol(self, self.col_names[k])
410 else:
411 return TableCol(self, k)
412
413 def __setitem__(self, k, value):
414 col_index=k
415 if type(k)!=int:
416 col_index=self.GetColIndex(k)
417 if IsScalar(value):
418 value=itertools.cycle([value])
419 for r, v in zip(self.rows, value):
420 r[col_index]=v
421
422 def ToString(self, float_format='%.3f', int_format='%d', rows=None):
423 '''
424 Convert the table into a string representation.
425
426 The output format can be modified for int and float type columns by
427 specifying a formatting string for the parameters *float_format* and
428 *int_format*.
429
430 The option *rows* specify the range of rows to be printed. The parameter
431 must be a type that supports indexing (e.g. a :class:`list`) containing the
432 start and end row *index*, e.g. [start_row_idx, end_row_idx].
433
434 :param float_format: formatting string for float columns
435 :type float_format: :class:`str`
436
437 :param int_format: formatting string for int columns
438 :type int_format: :class:`str`
439
440 :param rows: iterable containing start and end row *index*
441 :type rows: iterable containing :class:`ints <int>`
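
    **Example** (a small sketch; the data values are made up):

    .. code-block:: python

      tab = Table(['x', 'y'], 'if', x=[1, 2, 3], y=[0.5, 1.25, 2.0])
      # print only the first two rows, floats with two decimal places
      print(tab.ToString(float_format='%.2f', rows=[0, 2]))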
442 '''
443 widths=[len(cn) for cn in self.col_names]
444 sel_rows=self.rows
445 if rows:
446 sel_rows=self.rows[rows[0]:rows[1]]
447 for row in sel_rows:
448 for i, (ty, col) in enumerate(zip(self.col_types, row)):
449 if col==None:
450 widths[i]=max(widths[i], len('NA'))
451 elif ty=='float':
452 widths[i]=max(widths[i], len(float_format % col))
453 elif ty=='int':
454 widths[i]=max(widths[i], len(int_format % col))
455 else:
456 widths[i]=max(widths[i], len(str(col)))
457 s=''
458 if self.comment:
459 s+=''.join(['# %s\n' % l for l in self.comment.split('\n')])
460 total_width=sum(widths)+2*len(widths)
461 for width, col_name in zip(widths, self.col_names):
462 s+=col_name.center(width+2)
463 s+='\n%s\n' % ('-'*total_width)
464 for row in sel_rows:
465 for width, ty, col in zip(widths, self.col_types, row):
466 cs=''
467 if col==None:
468 cs='NA'.center(width+2)
469 elif ty=='float':
470 cs=(float_format % col).rjust(width+2)
471 elif ty=='int':
472 cs=(int_format % col).rjust(width+2)
473 else:
474 cs=' '+str(col).ljust(width+1)
475 s+=cs
476 s+='\n'
477 return s
478
479 def __str__(self):
480 return self.ToString()
481
482 def Stats(self, col):
483 idx = self.GetColIndex(col)
484 text ='''
485Statistics for column %(col)s
486
487 Number of Rows : %(num)d
488 Number of Rows Not None: %(num_non_null)d
489 Mean : %(mean)f
490 Median : %(median)f
491 Standard Deviation : %(stddev)f
492 Min : %(min)f
493 Max : %(max)f
494'''
495 data = {
496 'col' : col,
497 'num' : len(self.rows),
498 'num_non_null' : self.Count(col),
499 'median' : self.Median(col),
500 'mean' : self.Mean(col),
501 'stddev' : self.StdDev(col),
502 'min' : self.Min(col),
503 'max' : self.Max(col),
504 }
505 return text % data
506
507 def _AddRowsFromDict(self, d, overwrite=None):
508 '''
509 Add one or more rows from a :class:`dictionary <dict>`.
510
511 If *overwrite* is not None and set to an existing column name, the specified
512 column in the table is searched for the first occurrence of a value matching
513 the value of the column with the same name in the dictionary. If a matching
514 value is found, the row is overwritten with the dictionary. If no matching
515 row is found, a new row is appended to the table.
516
517 :param d: dictionary containing the data
518 :type d: :class:`dict`
519
520 :param overwrite: column name to overwrite existing row if value in
521 column *overwrite* matches
522 :type overwrite: :class:`str`
523
524 :raises: :class:`ValueError` if multiple rows are added but the number of
525 data items is different for different columns.
526 '''
527 # get column indices
528 idxs = [self.GetColIndex(k) for k in list(d.keys())]
529
530 # convert scalar values to list
531 old_len = None
532 for k,v in d.items():
533 if IsScalar(v):
534 v = [v]
535 d[k] = v
536 if not old_len:
537 old_len = len(v)
538 elif old_len!=len(v):
539 raise ValueError("Cannot add rows: length of data must be equal " + \
540 "for all columns in %s"%str(d))
541
542 # convert column based dict to row based dict and create row and add data
543 for i,data in enumerate(zip(*list(d.values()))):
544 new_row = [None for a in range(len(self.col_names))]
545 for idx,v in zip(idxs,data):
546 new_row[idx] = self._Coerce(v, self.col_types[idx])
547
548 # partially overwrite existing row with new data
549 if overwrite:
550 overwrite_idx = self.GetColIndex(overwrite)
551 added = False
552 for i,r in enumerate(self.rows):
553 if r[overwrite_idx]==new_row[overwrite_idx]:
554 for j,e in enumerate(self.rows[i]):
555 if new_row[j]==None:
556 new_row[j] = e
557 self.rows[i] = new_row
558 added = True
559 break
560
561 # if not overwrite or overwrite did not find appropriate row
562 if not overwrite or not added:
563 self.rows.append(new_row)
564
565 def PairedTTest(self, col_a, col_b):
566 """
567 Two-sided test for the null-hypothesis that two related samples
568 have the same average (expected values).
569
570 :param col_a: First column
571 :type col_a: :class:`str`
572 :param col_b: Second column
573 :type col_b: :class:`str`
574
575 :returns: P-value between 0 and 1 that the two columns have the
576 same average. The smaller the value, the less related the two
577 columns are.
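
    **Example** (an illustrative sketch; requires scipy, column names are
    hypothetical):

    .. code-block:: python

      tab = Table(['method_a', 'method_b'], 'ff',
                  method_a=[0.71, 0.65, 0.80],
                  method_b=[0.69, 0.66, 0.75])
      p_value = tab.PairedTTest('method_a', 'method_b')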
578 """
579 from scipy.stats import ttest_rel
580 xs = []
581 ys = []
582 for x, y in self.Zip(col_a, col_b):
583 if x!=None and y!=None:
584 xs.append(x)
585 ys.append(y)
586 result = ttest_rel(xs, ys)
587 return result[1]
588
589 def AddRow(self, data, overwrite=None):
590 """
591 Add a row to the table.
592
593 *data* may either be a dictionary or a list-like object:
594
595 - If *data* is a dictionary, the keys in the dictionary must match the
596 column names. Columns not found in the dict will be initialized to None.
 597 If the dict contains list-like objects, multiple rows will be added if
 598 the number of items in all list-like objects is the same; otherwise a
 599 :class:`ValueError` is raised.
600
601 - If *data* is a list-like object, the row is initialized from the values
602 in *data*. The number of items in *data* must match the number of
 603 columns in the table. A :class:`ValueError` is raised otherwise. The
604 values are added in the order specified in the list, thus, the order of
605 the data must match the columns.
606
607 If *overwrite* is not None and set to an existing column name, the specified
608 column in the table is searched for the first occurrence of a value matching
609 the value of the column with the same name in the dictionary. If a matching
610 value is found, the row is overwritten with the dictionary. If no matching
611 row is found, a new row is appended to the table.
612
613 :param data: data to add
614 :type data: :class:`dict` or *list-like* object
615
616 :param overwrite: column name to overwrite existing row if value in
617 column *overwrite* matches
618 :type overwrite: :class:`str`
619
620 :raises: :class:`ValueError` if *list-like* object is used and number of
621 items does *not* match number of columns in table.
622
623 :raises: :class:`ValueError` if *dict* is used and multiple rows are added
624 but the number of data items is different for different columns.
625
626 **Example:** add multiple data rows to a subset of columns using a dictionary
627
628 .. code-block:: python
629
630 # create table with three float columns
631 tab = Table(['x','y','z'], 'fff')
632
633 # add rows from dict
634 data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
635 tab.AddRow(data)
 636 print(tab)
637
638 '''
639 will produce the table
640
641 ==== ==== ====
642 x y z
643 ==== ==== ====
644 1.20 NA 1.60
645 1.60 NA 5.30
646 ==== ==== ====
647 '''
648
649 # overwrite the row with x=1.2 and add row with x=1.9
650 data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
651 tab.AddRow(data, overwrite='x')
 652 print(tab)
653
654 '''
655 will produce the table
656
657 ==== ==== ====
658 x y z
659 ==== ==== ====
660 1.20 NA 7.90
661 1.60 NA 5.30
662 1.90 NA 3.50
663 ==== ==== ====
664 '''
665 """
666 if type(data)==dict:
667 self._AddRowsFromDict(data, overwrite)
668 else:
669 if len(data)!=len(self.col_names):
670 msg='data array must have %d elements, not %d'
671 raise ValueError(msg % (len(self.col_names), len(data)))
672 new_row = [self._Coerce(v, t) for v, t in zip(data, self.col_types)]
673
674 # fully overwrite existing row with new data
675 if overwrite:
676 overwrite_idx = self.GetColIndex(overwrite)
677 added = False
678 for i,r in enumerate(self.rows):
679 if r[overwrite_idx]==new_row[overwrite_idx]:
680 self.rows[i] = new_row
681 added = True
682 break
683
684 # if not overwrite or overwrite did not find appropriate row
685 if not overwrite or not added:
686 self.rows.append(new_row)
687
688 def RemoveCol(self, col):
689 """
690 Remove column with the given name from the table.
691
692 :param col: name of column to remove
693 :type col: :class:`str`
694 """
695 idx = self.GetColIndex(col)
696 del self.col_names[idx]
697 del self.col_types[idx]
698 for row in self.rows:
699 del row[idx]
700
701 def AddCol(self, col_name, col_type, data=None):
702 """
703 Add a column to the right of the table.
704
705 :param col_name: name of new column
706 :type col_name: :class:`str`
707
708 :param col_type: type of new column (long versions: *int*, *float*, *bool*,
709 *string* or short versions: *i*, *f*, *b*, *s*)
710 :type col_type: :class:`str`
711
712 :param data: data to add to new column
713 :type data: scalar or iterable
714
715 **Example:**
716
717 .. code-block:: python
718
719 tab = Table(['x'], 'f', x=range(5))
720 tab.AddCol('even', 'bool', itertools.cycle([True, False]))
 721 print(tab)
722
723 '''
724 will produce the table
725
726 ==== ====
727 x even
728 ==== ====
729 0 True
730 1 False
731 2 True
732 3 False
733 4 True
734 ==== ====
735 '''
736
 737 If data is a constant instead of an iterable object, its value
738 will be written into each row:
739
740 .. code-block:: python
741
742 tab = Table(['x'], 'f', x=range(5))
743 tab.AddCol('num', 'i', 1)
 744 print(tab)
745
746 '''
747 will produce the table
748
749 ==== ====
750 x num
751 ==== ====
752 0 1
753 1 1
754 2 1
755 3 1
756 4 1
757 ==== ====
758 '''
759
760 As a special case, if there are no previous rows, and data is not
761 None, rows are added for every item in data.
762 """
763
764 if col_name in self.col_names:
765 raise ValueError('Column with name %s already exists'%col_name)
766
767 col_type = self._ParseColTypes(col_type, exp_num=1)[0]
768 self.col_names.append(col_name)
769 self.col_types.append(col_type)
770
771 if len(self.rows)>0:
772 if IsScalar(data):
773 for row in self.rows:
774 row.append(data)
775 else:
776 if hasattr(data, '__len__') and len(data)!=len(self.rows):
777 self.col_names.pop()
778 self.col_types.pop()
779 raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
780 'existing rows (%i)'%len(self.rows))
781 for row, d in zip(self.rows, data):
782 row.append(d)
783
784 elif data!=None and len(self.col_names)==1:
785 if IsScalar(data):
786 self.AddRow({col_name : data})
787 else:
788 for v in data:
789 self.AddRow({col_name : v})
790
791 def Filter(self, *args, **kwargs):
792 """
793 Returns a filtered table only containing rows matching all the predicates
 794 in kwargs and args. For example,
795
796 .. code-block:: python
797
798 tab.Filter(town='Basel')
799
800 will return all the rows where the value of the column "town" is equal to
801 "Basel". Several predicates may be combined, i.e.
802
803 .. code-block:: python
804
805 tab.Filter(town='Basel', male=True)
806
 807 will return the rows with "town" equal to "Basel" and "male" equal to True.
 808 Positional arguments (*args*) are unary callables that return True if the row
 809 should be included in the result and False if not.
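
    **Example** (a sketch combining a keyword predicate with a unary callable;
    column names and values are made up):

    .. code-block:: python

      tab = Table(['town', 'age'], 'si',
                  town=['Basel', 'Bern', 'Basel'],
                  age=[30, 45, 17])
      # rows where town=='Basel' and the callable returns True (age >= 18)
      adults_in_basel = tab.Filter(lambda row: row[tab.GetColIndex('age')] >= 18,
                                   town='Basel')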
810 """
811 filt_tab=Table(list(self.col_names), list(self.col_types))
812 for row in self.rows:
813 matches=True
814 for func in args:
815 if not func(row):
816 matches=False
817 break
818 for key, val in kwargs.items():
819 if row[self.GetColIndex(key)]!=val:
820 matches=False
821 break
822 if matches:
823 filt_tab.AddRow(row)
824 return filt_tab
825
826
827 def Select(self, query):
828
829 """
830 Returns a new table object containing all rows matching a logical query
831 expression.
832
833 *query* is a string containing the logical expression, that will be
834 evaluated for every row.
835
836 Operands have to be the name of a column or an expression that can be
837 parsed to float, int, bool or string.
838 Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, \\*, /
839
840 .. code-block:: python
841
842 subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
843
 844 The selection query should be self-explanatory. Allowed parentheses are
 845 (), [] and {}; mismatched parentheses are detected. Expressions like
 846 '3<=col_a>=col_b' throw an error, since the evaluation order cannot be
 847 determined unambiguously.
848
849 There are two special expressions:
850
851 .. code-block:: python
852
853 #selects rows, where 1.0<=col_a<=1.5
854 subtab = tab.Select('col_a=1.0:1.5')
855
856 #selects rows, where col_a=1 or col_a=2 or col_a=3
857 subtab = tab.Select('col_a=1,2,3')
858
 859 Only consistent types can be compared. If col_a is of type string and col_b
 860 is of type int, the following expression would throw an error: 'col_a<col_b'
861 """
862
863 try:
864 from .table_selector import TableSelector
865 except:
866 raise ImportError("Tried to import from the file table_selector.py, but could not find it!")
867
868 selector=TableSelector(self.col_types, self.col_names, query)
869
870 selected_tab=Table(list(self.col_names), list(self.col_types))
871
872 for row in self.rows:
873 if selector.EvaluateRow(row):
874 selected_tab.AddRow(row)
875
876 return selected_tab
877
878
879 @staticmethod
880 def _LoadOST(stream_or_filename):
881 fieldname_pattern=re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
882 values_pattern=re.compile("([^\" ]+|\"[^\"]*\")+")
883 file_opened=False
884 if not hasattr(stream_or_filename, 'read'):
885 stream=open(stream_or_filename, 'r')
886 file_opened=True
887 else:
888 stream=stream_or_filename
889 header=False
890 num_lines=0
891 for line in stream:
892 line=line.strip()
893 if line.startswith('#'):
894 continue
895 if len(line)==0:
896 continue
897 num_lines+=1
898 if not header:
899 fieldnames=[]
900 fieldtypes=[]
901 for col in line.split():
902 match=fieldname_pattern.match(col)
903 if match:
904 if match.group('type'):
905 fieldtypes.append(match.group('type'))
906 else:
907 fieldtypes.append('string')
908 fieldnames.append(match.group('name'))
909 try:
910 tab=Table(fieldnames, fieldtypes)
911 except Exception as e:
912 # potentially fails if we read in crap... clean up and pass on error
913 if file_opened:
914 stream.close()
915 raise e
916 header=True
917 continue
918 tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
919 if file_opened:
920 stream.close()
921 if num_lines==0:
922 raise IOError("Cannot read table from empty stream")
923 return tab
924
 925 def _GuessColumnTypes(self):
 926 for col_idx in range(len(self.col_names)):
927 self.col_types[col_idx]=GuessColumnType(self[self.col_names[col_idx]])
928 for row in self.rows:
929 for idx in range(len(row)):
930 row[idx]=self._Coerce(row[idx], self.col_types[idx])
931
932 @staticmethod
933 def _LoadCSV(stream_or_filename, sep):
934 file_opened=False
935 if not hasattr(stream_or_filename, 'read'):
936 stream=open(stream_or_filename, 'r')
937 file_opened=True
938 else:
939 stream=stream_or_filename
940 reader=csv.reader(stream, delimiter=sep)
941 first=True
942 for row in reader:
943 if first:
944 header=row
945 types='s'*len(row)
946 tab=Table(header, types)
947 first=False
948 else:
949 tab.AddRow(row)
950 if file_opened:
951 stream.close()
952 if first:
953 raise IOError('trying to load table from empty CSV stream/file')
954
955 tab._GuessColumnTypes()
956 return tab
957
958 @staticmethod
959 def _LoadPickle(stream_or_filename):
960 file_opened=False
961 if not hasattr(stream_or_filename, 'read'):
962 stream=open(stream_or_filename, 'rb')
963 file_opened=True
964 else:
965 stream=stream_or_filename
966 tab = pickle.load(stream)
967 if file_opened:
968 stream.close()
969 return tab
970
971 @staticmethod
972 def _GuessFormat(filename):
973 try:
974 filename = filename.name
975 except AttributeError as e:
976 pass
977 if filename.endswith('.csv'):
978 return 'csv'
979 elif filename.endswith('.pickle'):
980 return 'pickle'
981 else:
982 return 'ost'
983
984
985 @staticmethod
986 def Load(stream_or_filename, format='auto', sep=','):
987 """
988 Load table from stream or file with given name.
989
990 By default, the file format is set to *auto*, which tries to guess the file
991 format from the file extension. The following file extensions are
992 recognized:
993
994 ============ ======================
995 extension recognized format
996 ============ ======================
997 .csv comma separated values
998 .pickle pickled byte stream
999 <all others> ost-specific format
1000 ============ ======================
1001
 1002 Thus, *format* must be specified when reading files with other filename
 1003 extensions.
1004
1005 The following file formats are understood:
1006
1007 - ost
1008
1009 This is an ost-specific, but still human readable file format. The file
1010 (stream) must start with header line of the form
1011
1012 col_name1[type1] <col_name2[type2]>...
1013
1014 The types given in brackets must be one of the data types the
 1015 :class:`Table` class understands. Each following line in the file must then
 1016 contain exactly the same number of data items as listed in the header. The
1017 data items are automatically converted to the column format. Lines starting
1018 with a '#' and empty lines are ignored.
1019
1020 - pickle
1021
1022 Deserializes the table from a pickled byte stream.
1023
1024 - csv
1025
1026 Reads the table from comma separated values stream. Since there is no
1027 explicit type information in the csv file, the column types are guessed,
1028 using the following simple rules:
1029
1030 * if all values are either NA/NULL/NONE the type is set to string.
1031 * if all non-null values are convertible to float/int the type is set to
1032 float/int.
1033 * if all non-null values are true/false/yes/no, the value is set to bool.
1034 * for all other cases, the column type is set to string.
1035
1036 :returns: A new :class:`Table` instance
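
    **Example** (a minimal sketch; the file names below are only placeholders):

    .. code-block:: python

      # format guessed from the file extension
      tab1 = Table.Load('scores.csv')
      # explicit format for a non-standard extension, with ';' as separator
      tab2 = Table.Load('scores.txt', format='csv', sep=';')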
1037 """
1038 format=format.lower()
1039 if format=='auto':
1040 format = Table._GuessFormat(stream_or_filename)
1041
1042 if format=='ost':
1043 return Table._LoadOST(stream_or_filename)
1044 if format=='csv':
1045 return Table._LoadCSV(stream_or_filename, sep=sep)
1046 if format=='pickle':
1047 return Table._LoadPickle(stream_or_filename)
 1048 raise ValueError('unknown format "%s"' % format)
1049
1050 def Sort(self, by, order='+'):
1051 """
1052 Performs an in-place sort of the table, based on column *by*.
1053
1054 :param by: column name by which to sort
1055 :type by: :class:`str`
1056
1057 :param order: ascending (``-``) or descending (``+``) order
1058 :type order: :class:`str` (i.e. *+*, *-*)
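
    **Example** (an illustrative sketch; 'score' is a hypothetical column):

    .. code-block:: python

      tab = Table(['score'], 'f', score=[0.3, 1.2, 0.7])
      tab.Sort('score', order='-')   # sort in ascending order, in place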
1059 """
1060 sign=-1
1061 if order=='-':
1062 sign=1
1063 key_index=self.GetColIndex(by)
1064 def _key_cmp(lhs, rhs):
1065 a = lhs[key_index]
1066 b = rhs[key_index]
1067 # mimic behaviour of the cmp function from Python2 that happily
1068 # compared None values
1069 if a is None or b is None:
1070 if a is None and b is not None:
1071 return -1 * sign
1072 if b is None and a is not None:
1073 return 1 * sign
1074 return 0
1075 return sign*((a > b) - (a < b))
1076
1077 import functools
1078 self.rows=sorted(self.rows, key=functools.cmp_to_key(_key_cmp))
1079
1080 def GetUnique(self, col, ignore_nan=True):
1081 """
1082 Extract a list of all unique values from one column.
1083
1084 :param col: column name
1085 :type col: :class:`str`
1086
1087 :param ignore_nan: ignore all *None* values
1088 :type ignore_nan: :class:`bool`
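
    **Example** (a small sketch with made-up data):

    .. code-block:: python

      tab = Table(['town'], 's', town=['Basel', 'Bern', 'Basel', None])
      tab.GetUnique('town')                    # ['Basel', 'Bern']
      tab.GetUnique('town', ignore_nan=False)  # ['Basel', 'Bern', None]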
1089 """
1090 idx = self.GetColIndex(col)
1091 seen = {}
1092 result = []
1093 for row in self.rows:
1094 item = row[idx]
1095 if item!=None or ignore_nan==False:
1096 if item in seen: continue
1097 seen[item] = 1
1098 result.append(item)
1099 return result
1100
1101 def Zip(self, *args):
1102 """
 1103 Allows you to conveniently iterate over a selection of columns, e.g.
1104
1105 .. code-block:: python
1106
1107 tab = Table.Load('...')
1108 for col1, col2 in tab.Zip('col1', 'col2'):
 1109 print(col1, col2)
1110
1111 is a shortcut for
1112
1113 .. code-block:: python
1114
1115 tab = Table.Load('...')
1116 for col1, col2 in zip(tab['col1'], tab['col2']):
 1117 print(col1, col2)
1118 """
1119 return list(zip(*[self[arg] for arg in args]))
1120
1121 def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
1122 z_title=None, x_range=None, y_range=None, z_range=None,
1123 color=None, plot_if=None, legend=None,
1124 num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
1125 labels=None, max_num_labels=None, title=None, clear=True, save=False,
1126 **kwargs):
1127 """
1128 Function to plot values from your table in 1, 2 or 3 dimensions using
1129 `Matplotlib <http://matplotlib.sourceforge.net>`__
1130
1131 :param x: column name for first dimension
1132 :type x: :class:`str`
1133
1134 :param y: column name for second dimension
1135 :type y: :class:`str`
1136
1137 :param z: column name for third dimension
1138 :type z: :class:`str`
1139
1140 :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\\**). For a
1141 complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1142 :type style: :class:`str`
1143
1144 :param x_title: title for first dimension, if not specified it is
1145 automatically derived from column name
1146 :type x_title: :class:`str`
1147
1148 :param y_title: title for second dimension, if not specified it is
1149 automatically derived from column name
1150 :type y_title: :class:`str`
1151
1152 :param z_title: title for third dimension, if not specified it is
1153 automatically derived from column name
1154 :type z_title: :class:`str`
1155
1156 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1157 :type x_range: :class:`list` of length two
1158
1159 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1160 :type y_range: :class:`list` of length two
1161
1162 :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
1163 :type z_range: :class:`list` of length two
1164
1165 :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
1166 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1167 :type color: :class:`str`
1168
 1169 :param plot_if: callable which returns *True* if a row should be plotted. It is
 1170 invoked like ``plot_if(self, row)``
1171 :type plot_if: callable
1172
1173 :param legend: legend label for data series
1174 :type legend: :class:`str`
1175
1176 :param num_z_levels: number of levels for third dimension
1177 :type num_z_levels: :class:`int`
1178
1179 :param diag_line: draw diagonal line
1180 :type diag_line: :class:`bool`
1181
1182 :param labels: column name containing labels to put on x-axis for one
1183 dimensional plot
1184 :type labels: :class:`str`
1185
1186 :param max_num_labels: limit maximum number of labels
1187 :type max_num_labels: :class:`int`
1188
1189 :param title: plot title, if not specified it is automatically derived from
1190 plotted column names
1191 :type title: :class:`str`
1192
1193 :param clear: clear old data from plot
1194 :type clear: :class:`bool`
1195
1196 :param save: filename for saving plot
1197 :type save: :class:`str`
1198
1199 :param z_contour: draw contour lines
1200 :type z_contour: :class:`bool`
1201
1202 :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
1203 'linear')
1204 :type z_interpol: :class:`str`
1205
1206 :param \\*\\*kwargs: additional arguments passed to matplotlib
1207
1208 :returns: the ``matplotlib.pyplot`` module
1209
1210 **Examples:** simple plotting functions
1211
1212 .. code-block:: python
1213
1214 tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
1215 b=[x/2.0 for x in range(1,6)],
1216 c=[math.cos(x) for x in range(0,5)],
1217 d=range(3,8))
1218
1219 # one dimensional plot of column 'd' vs. index
1220 plt = tab.Plot('d')
1221 plt.show()
1222
1223 # two dimensional plot of 'a' vs. 'c'
1224 plt = tab.Plot('a', y='c', style='o-')
1225 plt.show()
1226
1227 # three dimensional plot of 'a' vs. 'c' with values 'b'
1228 plt = tab.Plot('a', y='c', z='b')
1229 # manually save plot to file
1230 plt.savefig("plot.png")
1231 """
1232 try:
1233 import matplotlib.pyplot as plt
1234 import matplotlib.mlab as mlab
1235 import numpy as np
1236 idx1 = self.GetColIndex(x)
1237 xs = []
1238 ys = []
1239 zs = []
1240
1241 if clear:
1242 plt.figure(figsize=[8, 6])
1243
1244 if x_title!=None:
1245 nice_x=x_title
1246 else:
1247 nice_x=MakeTitle(x)
1248
1249 if y_title!=None:
1250 nice_y=y_title
1251 else:
1252 if y:
1253 nice_y=MakeTitle(y)
1254 else:
1255 nice_y=None
1256
1257 if z_title!=None:
1258 nice_z = z_title
1259 else:
1260 if z:
1261 nice_z = MakeTitle(z)
1262 else:
1263 nice_z = None
1264
1265 if x_range and (IsScalar(x_range) or len(x_range)!=2):
1266 raise ValueError('parameter x_range must contain exactly two elements')
1267 if y_range and (IsScalar(y_range) or len(y_range)!=2):
1268 raise ValueError('parameter y_range must contain exactly two elements')
1269 if z_range and (IsScalar(z_range) or len(z_range)!=2):
1270 raise ValueError('parameter z_range must contain exactly two elements')
1271
1272 if color:
1273 kwargs['color']=color
1274 if legend:
1275 kwargs['label']=legend
1276 if y and z:
1277 idx3 = self.GetColIndex(z)
1278 idx2 = self.GetColIndex(y)
1279 for row in self.rows:
1280 if row[idx1]!=None and row[idx2]!=None and row[idx3]!=None:
1281 if plot_if and not plot_if(self, row):
1282 continue
1283 xs.append(row[idx1])
1284 ys.append(row[idx2])
1285 zs.append(row[idx3])
1286 levels = []
1287 if z_range:
1288 z_spacing = (z_range[1] - z_range[0]) / num_z_levels
1289 l = z_range[0]
1290 else:
1291 l = self.Min(z)
1292 z_spacing = (self.Max(z) - l) / num_z_levels
1293
1294 for i in range(0,num_z_levels+1):
1295 levels.append(l)
1296 l += z_spacing
1297
1298 xi = np.linspace(min(xs),max(xs),len(xs)*10)
1299 yi = np.linspace(min(ys),max(ys),len(ys)*10)
1300 zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
1301
1302 if z_contour:
1303 plt.contour(xi,yi,zi,levels,linewidths=0.5,colors='k')
1304
1305 plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
1306 plt.colorbar(ticks=levels)
1307
1308 elif y:
1309 idx2=self.GetColIndex(y)
1310 for row in self.rows:
1311 if row[idx1]!=None and row[idx2]!=None:
1312 if plot_if and not plot_if(self, row):
1313 continue
1314 xs.append(row[idx1])
1315 ys.append(row[idx2])
1316 plt.plot(xs, ys, style, **kwargs)
1317
1318 else:
1319 label_vals=[]
1320
1321 if labels:
1322 label_idx=self.GetColIndex(labels)
1323 for row in self.rows:
1324 if row[idx1]!=None:
1325 if plot_if and not plot_if(self, row):
1326 continue
1327 xs.append(row[idx1])
1328 if labels:
1329 label_vals.append(row[label_idx])
1330 plt.plot(xs, style, **kwargs)
1331 if labels:
1332 interval = 1
1333 if max_num_labels:
1334 if len(label_vals)>max_num_labels:
1335 interval = int(math.ceil(float(len(label_vals))/max_num_labels))
1336 label_vals = label_vals[::interval]
1337 plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
1338 size='x-small')
1339
1340 if title==None:
1341 if nice_z:
1342 title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
1343 elif nice_y:
1344 title = '%s vs. %s' % (nice_x, nice_y)
1345 else:
1346 title = nice_x
1347
1348 plt.title(title, size='x-large', fontweight='bold',
1349 verticalalignment='bottom')
1350
1351 if legend:
1352 plt.legend(loc=0)
1353
1354 if x and y:
1355 plt.xlabel(nice_x, size='x-large')
1356 if x_range:
1357 plt.xlim(x_range[0], x_range[1])
1358 if y_range:
1359 plt.ylim(y_range[0], y_range[1])
1360 if diag_line:
1361 plt.plot(x_range, y_range, '-', color='black')
1362
1363 plt.ylabel(nice_y, size='x-large')
1364 else:
1365 if y_range:
1366 plt.ylim(y_range[0], y_range[1])
1367 if x_title:
1368 plt.xlabel(x_title, size='x-large')
1369 plt.ylabel(nice_y, size='x-large')
1370 if save:
1371 plt.savefig(save)
1372 return plt
1373 except ImportError:
1374 LogError("Function needs numpy and matplotlib, but I could not import it.")
1375 raise
1376
1377 def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
1378 histtype='stepfilled', align='mid', x_title=None,
1379 y_title=None, title=None, clear=True, save=False,
1380 color=None, y_range=None):
1381 """
1382 Create a histogram of the data in col for the range *x_range*, split into
1383 *num_bins* bins and plot it using Matplotlib.
1384
1385 :param col: column name with data
1386 :type col: :class:`str`
1387
1388 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1389 :type x_range: :class:`list` of length two
1390
1391 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1392 :type y_range: :class:`list` of length two
1393
1394 :param num_bins: number of bins in range
1395 :type num_bins: :class:`int`
1396
1397 :param color: Color to be used for the histogram. If not set, color will be
1398 determined by matplotlib
1399 :type color: :class:`str`
1400
1401 :param normed: normalize histogram
1402 :type normed: :class:`bool`
1403
1404 :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
1405 *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1406 :type histtype: :class:`str`
1407
1408 :param align: style of histogram (*left*, *mid*, *right*). See
1409 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1410 :type align: :class:`str`
1411
1412 :param x_title: title for first dimension, if not specified it is
1413 automatically derived from column name
1414 :type x_title: :class:`str`
1415
1416 :param y_title: title for second dimension, if not specified it is
1417 automatically derived from column name
1418 :type y_title: :class:`str`
1419
1420 :param title: plot title, if not specified it is automatically derived from
1421 plotted column names
1422 :type title: :class:`str`
1423
1424 :param clear: clear old data from plot
1425 :type clear: :class:`bool`
1426
1427 :param save: filename for saving plot
1428 :type save: :class:`str`
1429
1430 **Examples:** simple plotting functions
1431
1432 .. code-block:: python
1433
1434 tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
1435
1436 # one dimensional plot of column 'd' vs. index
1437 plt = tab.PlotHistogram('a')
1438 plt.show()
1439
1440 """
1441 try:
1442 import matplotlib.pyplot as plt
1443 import numpy as np
1444
1445 if len(self.rows)==0:
1446 return None
1447 kwargs={}
1448 if color:
1449 kwargs['color']=color
1450 idx = self.GetColIndex(col)
1451 data = []
1452 for r in self.rows:
1453 if r[idx]!=None:
1454 data.append(r[idx])
1455
1456 if clear:
1457 plt.clf()
1458
1459 n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
1460 normed=normed, histtype=histtype, align=align,
1461 **kwargs)
1462
1463 if x_title!=None:
1464 nice_x=x_title
1465 else:
1466 nice_x=MakeTitle(col)
1467 plt.xlabel(nice_x, size='x-large')
1468 if y_range:
1469 plt.ylim(y_range)
1470 if y_title!=None:
1471 nice_y=y_title
1472 else:
1473 nice_y="bin count"
1474 plt.ylabel(nice_y, size='x-large')
1475
1476 if title!=None:
1477 nice_title=title
1478 else:
1479 nice_title="Histogram of %s"%nice_x
1480 plt.title(nice_title, size='x-large', fontweight='bold')
1481
1482 if save:
1483 plt.savefig(save)
1484 return plt
1485 except ImportError:
1486 LogError("Function needs numpy and matplotlib, but I could not import it.")
1487 raise
1488
1489 def _Max(self, col):
1490 if len(self.rows)==0:
1491 return None, None
1492 idx = self.GetColIndex(col)
1493 col_type = self.col_types[idx]
1494 if col_type=='int' or col_type=='float':
1495 max_val = -float('inf')
1496 elif col_type=='bool':
1497 max_val = False
1498 elif col_type=='string':
1499 max_val = chr(0)
1500 max_idx = None
1501 for i in range(0, len(self.rows)):
1502 val = self.rows[i][idx]
1503 if val and val > max_val:
1504 max_val = self.rows[i][idx]
1505 max_idx = i
1506 return max_val, max_idx
1507
1508 def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None,
1509 colors=None, width=0.8, bottom=0, legend=False, legend_names=None, show=False, save=False):
1510
1511 """
1512 Create a barplot of the data in cols. Every column will be represented
1513 at one position. If there are several rows, each column will be grouped
1514 together.
1515
1516 :param cols: List of column names. Every column will be represented as a
1517 single bar. If cols is None, every column of the table gets
1518 plotted.
1519 :type cols: :class:`list`
1520
1521 :param rows: List of row indices. Values from given rows will be plotted
1522 in parallel at one column position. If set to None, all rows
1523 of the table will be plotted. Note, that the maximum number
1524 of rows is 7.
1525 :type rows: :class:`list`
1526
1527 :param xlabels: Label for every col on x-axis. If set to None, the column
 1528 names are used. The xlabel plotting can be suppressed by
 1529 the parameter set_xlabels.
1530 :type xlabels: :class:`list`
1531
1532 :param set_xlabels: Controls whether xlabels are plotted or not.
1533 :type set_xlabels: :class:`bool`
1534
 1535 :param xlabels_rotation: Can either be 'horizontal', 'vertical' or an
 1536 integer that describes the rotation in degrees.
1537
1538 :param y_title: Y-axis description
1539 :type y_title: :class:`str`
1540
 1541 :param title: Title of the plot. No title appears if set to None
1542 :type title: :class:`str`
1543
1544 :param colors: Colors of the different bars in each group. Must be a list
1545 of valid colors in matplotlib. Length of color and rows must
1546 be consistent.
1547 :type colors: :class:`list`
1548
1549 :param width: The available space for the groups on the x-axis is divided
1550 by the exact number of groups. The parameters width is the
1551 fraction of what is actually used. If it would be 1.0 the
1552 bars of the different groups would touch each other.
1553 Value must be between [0;1]
1554 :type width: :class:`float`
1555
1556 :param bottom: Bottom
1557 :type bottom: :class:`float`
1558
1559 :param legend: Legend for color explanation, the corresponding row
1560 respectively. If set to True, legend_names must be provided.
1561 :type legend: :class:`bool`
1562
1563 :param legend_names: List of names, that describe the differently colored
1564 bars. Length must be consistent with number of rows.
1565
1566 :param show: If set to True, the plot is directly displayed.
1567
1568 :param save: If set, a png image with name save in the current working
1569 directory will be saved.
1570 :type save: :class:`str`
1571
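
    **Example** (an illustrative sketch; column names and values are made up):

    .. code-block:: python

      tab = Table(['precision', 'recall'], 'ff',
                  precision=[0.6, 0.8], recall=[0.5, 0.7])
      plt = tab.PlotBar(cols=['precision', 'recall'],
                        legend=True, legend_names=['method A', 'method B'],
                        y_title='score')
      plt.show()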
1572 """
1573 try:
1574 import numpy as np
1575 import matplotlib.pyplot as plt
1576 except:
 1577 raise ImportError('PlotBar relies on numpy and matplotlib, but I could '
 1578 'not import them!')
1579
1580 standard_colors=['b','g','y','c','m','r','k']
1581 data=[]
1582
1583 if cols==None:
1584 cols=self.col_names
1585
1586 if width<=0 or width>1:
1587 raise ValueError('Width must be in [0;1]')
1588
1589 if rows==None:
1590 if len(self.rows)>7:
1591 raise ValueError('Table contains too many rows to represent them at one '\
1592 'bar position in parallel. You can Select a Subtable or '\
1593 'specify the parameter rows with a list of row indices '\
1594 '(max 7)')
1595 else:
1596 rows=list(range(len(self.rows)))
1597 else:
1598 if not isinstance(rows,list):
1599 rows=[rows]
1600 if len(rows)>7:
1601 raise ValueError('Too many rows to represent (max 7). Please note, that '\
1602 'data from multiple rows from one column gets '\
1603 'represented at one position in parallel.')
1604
1605 for r_idx in rows:
1606 row=self.rows[r_idx]
1607 temp=list()
1608 for c in cols:
1609 try:
1610 c_idx=self.GetColIndex(c)
1611 except:
1612 raise ValueError('Cannot find column with name '+str(c))
1613 temp.append(row[c_idx])
1614 data.append(temp)
1615
1616 if colors==None:
1617 colors=standard_colors[:len(rows)]
1618
1619 if len(rows)!=len(colors):
1620 raise ValueError("Number of rows and number of colors must be consistent!")
1621
1622 ind=np.arange(len(data[0]))
1623 single_bar_width=float(width)/len(data)
1624
1625 fig=plt.figure()
1626 ax=fig.add_subplot(111)
1627 legend_data=[]
1628
1629 for i in range(len(data)):
1630 legend_data.append(ax.bar(ind+i*single_bar_width+(1-width)/2,data[i],single_bar_width,bottom=bottom,color=colors[i])[0])
1631
1632 if title!=None:
1633 ax.set_title(title, size='x-large', fontweight='bold')
1634
1635 if y_title!=None:
1636 nice_y=y_title
1637 else:
1638 nice_y="value"
1639 ax.set_ylabel(nice_y)
1640
1641 if xlabels:
1642 if len(data[0])!=len(xlabels):
1643 raise ValueError('Number of xlabels is not consistent with number of cols!')
1644 else:
1645 xlabels=cols
1646
1647 if set_xlabels:
1648 ax.set_xticks(ind+0.5)
1649 ax.set_xticklabels(xlabels, rotation = xlabels_rotation)
1650 else:
1651 ax.set_xticks([])
1652
1653 if legend == True:
1654 if legend_names==None:
1655 raise ValueError('You must provide legend names! e.g. names for the rows, '\
1656 'that are printed in parallel.')
1657 if len(legend_names)!=len(data):
1658 raise ValueError('length of legend_names must be consistent with number '\
1659 'of plotted rows!')
1660 ax.legend(legend_data, legend_names)
1661
1662 if save:
1663 plt.savefig(save)
1664
1665 if show:
1666 plt.show()
1667
1668 return plt
1669
1670 def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
1671 colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False):
1672
1673 """
1674 Create a heatplot of the data in col x vs the data in col y using matplotlib
1675
1676 :param x: column name with x data
1677 :type x: :class:`str`
1678
1679 :param y: column name with y data
1680 :type y: :class:`str`
1681
1682 :param title: title of the plot, will be generated automatically if set to None
1683 :type title: :class:`str`
1684
1685 :param x_title: label of x-axis, will be generated automatically if set to None
 1686 :type x_title: :class:`str`
1687
1688 :param y_title: label of y-axis, will be generated automatically if set to None
 1689 :type y_title: :class:`str`
1690
1691 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1692 :type x_range: :class:`list` of length two
1693
1694 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1695 :type y_range: :class:`list` of length two
1696
1697 :param binning: type of binning. If set to None, the value of a hexbin will
1698 correspond to the number of datapoints falling into it. If
1699 set to 'log', the value will be the log with base 10 of the above
1700 value (log(i+1)). If an integer is provided, the number of a
 1701 hexbin is equal to the number of datapoints falling into it divided
1702 by the integer. If a list of values is provided, these values
1703 will be the lower bounds of the bins.
1704
1705 :param colormap: colormap, that will be used. Value can be every colormap defined
1706 in matplotlib or an own defined colormap. You can either pass a
1707 string with the name of the matplotlib colormap or a colormap
1708 object.
1709
1710 :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
1711 :type show_scalebar: :class:`bool`
1712
1713 :param scalebar_label: Label of the scalebar
1714 :type scalebar_label: :class:`str`
1715
1716 :param clear: clear old data from plot
1717 :type clear: :class:`bool`
1718
1719 :param save: filename for saving plot
1720 :type save: :class:`str`
1721
1722 :param show: directly show plot
1723 :type show: :class:`bool`
1724
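
    **Example** (a minimal sketch with random data; requires matplotlib):

    .. code-block:: python

      import random
      xs = [random.gauss(0, 1) for _ in range(1000)]
      ys = [random.gauss(0, 1) for _ in range(1000)]
      tab = Table(['x', 'y'], 'ff', x=xs, y=ys)
      plt = tab.PlotHexbin('x', 'y', show_scalebar=True,
                           scalebar_label='log(count+1)')
      plt.show()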
1725 """
1726
1727 try:
1728 import matplotlib.pyplot as plt
1729 import matplotlib.cm as cm
1730 except:
1731 raise ImportError('PlotHexbin relies on matplotlib, but I could not import it')
1732
1733 idx=self.GetColIndex(x)
1734 idy=self.GetColIndex(y)
1735 xdata=[]
1736 ydata=[]
1737
1738 for r in self.rows:
1739 if r[idx]!=None and r[idy]!=None:
1740 xdata.append(r[idx])
1741 ydata.append(r[idy])
1742
1743 if clear:
1744 plt.clf()
1745
1746 if x_title!=None:
1747 nice_x=x_title
1748 else:
1749 nice_x=MakeTitle(x)
1750
1751 if y_title!=None:
1752 nice_y=y_title
1753 else:
1754 nice_y=MakeTitle(y)
1755
1756 if title==None:
1757 title = '%s vs. %s' % (nice_x, nice_y)
1758
1759 if IsStringLike(colormap):
1760 colormap=getattr(cm, colormap)
1761
1762 if x_range and (IsScalar(x_range) or len(x_range)!=2):
1763 raise ValueError('parameter x_range must contain exactly two elements')
1764 if y_range and (IsScalar(y_range) or len(y_range)!=2):
1765 raise ValueError('parameter y_range must contain exactly two elements')
1766
1767 ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
1768
1769 if x_range:
1770 plt.xlim((x_range[0], x_range[1]))
1771 ext[0]=x_range[0]
1772 ext[1]=x_range[1]
1773 if y_range:
1774 plt.ylim(y_range[0], y_range[1])
1775 ext[2]=y_range[0]
1776 ext[3]=y_range[1]
1777
1778
1779 plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
1780
1781 plt.title(title, size='x-large', fontweight='bold',
1782 verticalalignment='bottom')
1783
1784 plt.xlabel(nice_x)
1785 plt.ylabel(nice_y)
1786
1787 if show_scalebar:
1788 cb=plt.colorbar()
1789 if scalebar_label:
1790 cb.set_label(scalebar_label)
1791
1792 if save:
1793 plt.savefig(save)
1794
1795 if show:
1796 plt.show()
1797
1798 return plt
1799
1800 def MaxRow(self, col):
1801 """
1802 Returns the row containing the cell with the maximal value in col. If
1803 several rows have the highest value, only the first one is returned.
1804 ''None'' values are ignored.
1805
1806 :param col: column name
1807 :type col: :class:`str`
1808
1809 :returns: row with maximal col value or None if the table is empty
1810 """
1811 val, idx = self._Max(col)
1812 if idx!=None:
1813 return self.rows[idx]
1814
1815 def Max(self, col):
1816 """
1817 Returns the maximum value in col. If several rows have the highest value,
1818 only the first one is returned. ''None'' values are ignored.
1819
1820 :param col: column name
1821 :type col: :class:`str`
1822 """
1823 val, idx = self._Max(col)
1824 return val
1825
1826 def MaxIdx(self, col):
1827 """
1828 Returns the row index of the cell with the maximal value in col. If
1829 several rows have the highest value, only the first one is returned.
1830 ''None'' values are ignored.
1831
1832 :param col: column name
1833 :type col: :class:`str`
1834 """
1835 val, idx = self._Max(col)
1836 return idx
1837
1838 def _Min(self, col):
1839 if len(self.rows)==0:
1840 return None, None
1841 idx=self.GetColIndex(col)
1842 col_type = self.col_types[idx]
1843 if col_type=='int' or col_type=='float':
1844 min_val=float('inf')
1845 elif col_type=='bool':
1846 min_val=True
1847 elif col_type=='string':
1848 min_val=chr(255)
1849 min_idx=None
1850 for i,row in enumerate(self.rows):
1851 if row[idx]!=None and row[idx]<min_val:
1852 min_val=row[idx]
1853 min_idx=i
1854 return min_val, min_idx
1855
1856 def Min(self, col):
1857 """
1858 Returns the minimal value in col. If several rows have the lowest value,
1859 only the first one is returned. ''None'' values are ignored.
1860
1861 :param col: column name
1862 :type col: :class:`str`
1863 """
1864 val, idx = self._Min(col)
1865 return val
1866
1867 def MinRow(self, col):
1868 """
1869 Returns the row containing the cell with the minimal value in col. If
1870 several rows have the lowest value, only the first one is returned.
1871 ''None'' values are ignored.
1872
1873 :param col: column name
1874 :type col: :class:`str`
1875
1876 :returns: row with minimal col value or None if the table is empty
1877 """
1878 val, idx = self._Min(col)
1879 if idx!=None:
1880 return self.rows[idx]
1881
1882 def MinIdx(self, col):
1883 """
1884 Returns the row index of the cell with the minimal value in col. If
1885 several rows have the lowest value, only the first one is returned.
1886 ''None'' values are ignored.
1887
1888 :param col: column name
1889 :type col: :class:`str`
1890 """
1891 val, idx = self._Min(col)
1892 return idx
1893
1894 def Sum(self, col):
1895 """
1896 Returns the sum of the given column. Cells with ''None'' are ignored. Returns
1897 0.0, if the column doesn't contain any elements. Col must be of numeric
1898 column type ('float', 'int') or boolean column type.
1899
1900 :param col: column name
1901 :type col: :class:`str`
1902
1903 :raises: :class:`TypeError` if column type is ``string``
1904 """
1905 idx = self.GetColIndex(col)
1906 col_type = self.col_types[idx]
1907 if col_type!='int' and col_type!='float' and col_type!='bool':
1908 raise TypeError("Sum can only be used on numeric column types")
1909 s = 0.0
1910 for r in self.rows:
1911 if r[idx]!=None:
1912 s += r[idx]
1913 return s
1914
1915 def Mean(self, col):
1916 """
1917 Returns the mean of the given column. Cells with ''None'' are ignored. Returns
1918 None, if the column doesn't contain any elements. Col must be of numeric
1919 ('float', 'int') or boolean column type.
1920
 1921 If column type is *bool*, the function returns the ratio of the
 1922 number of True values to the total number of elements.
1923
1924 :param col: column name
1925 :type col: :class:`str`
1926
1927 :raises: :class:`TypeError` if column type is ``string``
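
    **Example** (a small sketch; None values are skipped):

    .. code-block:: python

      tab = Table(['x', 'ok'], 'fb', x=[1.0, 2.0, None], ok=[True, False, True])
      tab.Mean('x')    # 1.5
      tab.Mean('ok')   # 2/3, the fraction of True values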
1928 """
1929 idx = self.GetColIndex(col)
1930 col_type = self.col_types[idx]
1931 if col_type!='int' and col_type!='float' and col_type!='bool':
1932 raise TypeError("Mean can only be used on numeric or bool column types")
1933
1934 vals=[]
1935 for v in self[col]:
1936 if v!=None:
1937 vals.append(v)
1938 try:
1939 return stutil.Mean(vals)
1940 except:
1941 return None
1942
1943 def RowMean(self, mean_col_name, cols):
1944 """
1945 Adds a new column of type 'float' with a specified name (*mean_col_name*),
1946 containing the mean of all specified columns for each row.
1947
1948 Cols are specified by their names and must be of numeric column
1949 type ('float', 'int') or boolean column type. Cells with None are ignored.
1950 Adds ''None'' if the row doesn't contain any values.
1951
1952 :param mean_col_name: name of new column containing mean values
1953 :type mean_col_name: :class:`str`
1954
1955 :param cols: name or list of names of columns to include in computation of
1956 mean
1957 :type cols: :class:`str` or :class:`list` of strings
1958
1959 :raises: :class:`TypeError` if column type of columns in *cols* is ``string``
1960
1961 == Example ==
1962
1963 Starting with the following table:
1964
1965 ==== ==== ====
1966 x y u
1967 ==== ==== ====
1968 1 10 100
1969 2 15 None
1970 3 20 400
1971 ==== ==== ====
1972
1973 the code here adds a column with the name 'mean' to yield the table below:
1974
1975 .. code-block:: python
1976
1977 tab.RowMean('mean', ['x', 'u'])
1978
1979
1980 ==== ==== ==== =====
1981 x y u mean
1982 ==== ==== ==== =====
1983 1 10 100 50.5
1984 2 15 None 2
1985 3 20 400 201.5
1986 ==== ==== ==== =====
1987
1988 """
1989
1990 if IsScalar(cols):
1991 cols = [cols]
1992
1993 cols_idxs = []
1994 for col in cols:
1995 idx = self.GetColIndex(col)
1996 col_type = self.col_types[idx]
1997 if col_type!='int' and col_type!='float' and col_type!='bool':
1998 raise TypeError("RowMean can only be used on numeric column types")
1999 cols_idxs.append(idx)
2000
2001 mean_rows = []
2002 for row in self.rows:
2003 vals = []
2004 for idx in cols_idxs:
2005 v = row[idx]
2006 if v!=None:
2007 vals.append(v)
2008 try:
2009 mean = stutil.Mean(vals)
2010 mean_rows.append(mean)
2011 except:
2012 mean_rows.append(None)
2013
2014 self.AddCol(mean_col_name, 'f', mean_rows)
2015
2016 def Percentiles(self, col, nths):
2017 """
2018 Returns the percentiles of column *col* given in *nths*.
2019
2020 The percentiles are calculated as
2021
2022 .. code-block:: python
2023
2024 values[min(len(values)-1, int(math.floor(len(values)*nth/100.0)))]
2025
2026 where values are the sorted values of *col* not equal to ''None''
2027
2028 :param col: column name
2029 :type col: :class:`str`
2030 :param nths: list of percentiles to be calculated. Each percentile is a
2031 number between 0 and 100.
2032 :type nths: :class:`list` of numbers
2033
2034 :raises: :class:`TypeError` if column type is ``string``
2035 :returns: List of percentiles in the same order as given in *nths*
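
For example, following the formula above (the column name and data are
hypothetical; the results follow directly from the formula):

.. code-block:: python

  tab = Table(['x'], 'f', x=[1.0, 2.0, 3.0, 4.0])
  tab.Percentiles('x', [25, 50, 100])   # -> [2.0, 3.0, 4.0]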
2036 """
2037 idx = self.GetColIndex(col)
2038 col_type = self.col_types[idx]
2039 if col_type!='int' and col_type!='float' and col_type!='bool':
2040 raise TypeError("Percentiles can only be used on numeric column types")
2041
2042 for nth in nths:
2043 if nth < 0 or nth > 100:
2044 raise ValueError("percentiles must be between 0 and 100")
2045 vals=[]
2046 for v in self[col]:
2047 if v!=None:
2048 vals.append(v)
2049 vals=sorted(vals)
2050 if len(vals)==0:
2051 return [None]*len(nths)
2052 percentiles=[]
2053
2054 for nth in nths:
2055 # rounding behaviour between Python2 and Python3 changed....
2056 # p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
2057 p=vals[min(len(vals)-1, int(math.floor(len(vals)*nth/100.0)))]
2058 percentiles.append(p)
2059 return percentiles
2060
2061 def Median(self, col):
2062 """
2063 Returns the median of the given column. Cells with ''None'' are ignored. Returns
2064 ''None'', if the column doesn't contain any elements. Col must be of numeric
2065 column type ('float', 'int') or boolean column type.
2066
2067 :param col: column name
2068 :type col: :class:`str`
2069
2070 :raises: :class:`TypeError` if column type is ``string``
2071 """
2072 idx = self.GetColIndex(col)
2073 col_type = self.col_types[idx]
2074 if col_type!='int' and col_type!='float' and col_type!='bool':
2075 raise TypeError("Median can only be used on numeric column types")
2076
2077 vals=[]
2078 for v in self[col]:
2079 if v!=None:
2080 vals.append(v)
2082 try:
2083 return stutil.Median(vals)
2084 except:
2085 return None
2086
2087 def StdDev(self, col):
2088 """
2089 Returns the standard deviation of the given column. Cells with ''None'' are
2090 ignored. Returns ''None'', if the column doesn't contain any elements. Col must
2091 be of numeric column type ('float', 'int') or boolean column type.
2092
2093 :param col: column name
2094 :type col: :class:`str`
2095
2096 :raises: :class:`TypeError` if column type is ``string``
2097 """
2098 idx = self.GetColIndex(col)
2099 col_type = self.col_types[idx]
2100 if col_type!='int' and col_type!='float' and col_type!='bool':
2101 raise TypeError("StdDev can only be used on numeric column types")
2102
2103 vals=[]
2104 for v in self[col]:
2105 if v!=None:
2106 vals.append(v)
2107 try:
2108 return stutil.StdDev(vals)
2109 except:
2110 return None
2111
2112 def Count(self, col, ignore_nan=True):
2113 """
2114 Count the number of cells in column *col* that are not equal to ''None''.
2115
2116 :param col: column name
2117 :type col: :class:`str`
2118
2119 :param ignore_nan: ignore all *None* values
2120 :type ignore_nan: :class:`bool`
2121 """
2122 count=0
2123 idx=self.GetColIndex(col)
2124 for r in self.rows:
2125 if ignore_nan:
2126 if r[idx]!=None:
2127 count+=1
2128 else:
2129 count+=1
2130 return count
2131
2132 def Correl(self, col1, col2):
2133 """
2134 Calculate the Pearson correlation coefficient between *col1* and *col2*, only
2135 taking rows into account where both of the values are not equal to *None*.
2136 If there are not enough data points to calculate a correlation coefficient,
2137 *None* is returned.
2138
2139 :param col1: column name for first column
2140 :type col1: :class:`str`
2141
2142 :param col2: column name for second column
2143 :type col2: :class:`str`
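
A minimal sketch; the column names and data are hypothetical:

.. code-block:: python

  tab = Table(['a', 'b'], 'ff',
              a=[1.0, 2.0, 3.0, None],
              b=[2.1, 3.9, 6.2, 1.0])
  r = tab.Correl('a', 'b')   # Pearson r computed from the three complete rows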
2144 """
2145 if IsStringLike(col1) and IsStringLike(col2):
2146 col1 = self.GetColIndex(col1)
2147 col2 = self.GetColIndex(col2)
2148 vals1, vals2=([],[])
2149 for v1, v2 in zip(self[col1], self[col2]):
2150 if v1!=None and v2!=None:
2151 vals1.append(v1)
2152 vals2.append(v2)
2153 try:
2154 return stutil.Correl(vals1, vals2)
2155 except:
2156 return None
2157
2158 def SpearmanCorrel(self, col1, col2):
2159 """
2160 Calculate the Spearman correlation coefficient between col1 and col2, only
2161 taking rows into account where both of the values are not equal to None. If
2162 there are not enough data points to calculate a correlation coefficient,
2163 None is returned.
2164
2165 :warning: The function depends on the following module: *scipy.stats.mstats*
2166
2167 :param col1: column name for first column
2168 :type col1: :class:`str`
2169
2170 :param col2: column name for second column
2171 :type col2: :class:`str`
2172 """
2173 try:
2174 import scipy.stats.mstats
2175 import numpy as np
2176
2177 if IsStringLike(col1) and IsStringLike(col2):
2178 col1 = self.GetColIndex(col1)
2179 col2 = self.GetColIndex(col2)
2180 vals1, vals2=([],[])
2181 for v1, v2 in zip(self[col1], self[col2]):
2182 if v1!=None and v2!=None:
2183 vals1.append(v1)
2184 vals2.append(v2)
2185 try:
2186 correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
2187 if np.isnan(correl):
2188 return None
2189 return correl
2190 except:
2191 return None
2192
2193 except ImportError:
2194 LogError("Function needs scipy.stats.mstats, but I could not import it.")
2195 raise
2196
2197
2198 def Save(self, stream_or_filename, format='ost', sep=','):
2199 """
2200 Save the table to stream or filename. The following file formats
2201 are supported (for more information on file formats, see :meth:`Load`):
2202
2203 ============= =======================================
2204 ost ost-specific format (human readable)
2205 csv comma separated values (human readable)
2206 pickle pickled byte stream (binary)
2207 html HTML table
2208 context ConTeXt table
2209 ============= =======================================
2210
2211 :param stream_or_filename: filename or stream for writing output
2212 :type stream_or_filename: :class:`str` or :class:`file`
2213
2214 :param format: output format (*ost*, *csv*, *pickle*, *html* or *context*)
2215 :type format: :class:`str`
2216
2217 :raises: :class:`ValueError` if format is unknown
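
For example, to write the same table in two formats (the filenames are
hypothetical):

.. code-block:: python

  tab.Save('scores.csv', format='csv', sep=';')
  tab.Save('scores.pickle', format='pickle')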
2218 """
2219 format=format.lower()
2220 if format=='ost':
2221 return self._SaveOST(stream_or_filename)
2222 if format=='csv':
2223 return self._SaveCSV(stream_or_filename, sep=sep)
2224 if format=='pickle':
2225 return self._SavePickle(stream_or_filename)
2226 if format=='html':
2227 return self._SaveHTML(stream_or_filename)
2228 if format=='context':
2229 return self._SaveContext(stream_or_filename)
2230 raise ValueError('unknown format "%s"' % format)
2231
2232 def _SavePickle(self, stream):
2233 file_opened=False
2234 if not hasattr(stream, 'write'):
2235 stream=open(stream, 'wb')
2236 file_opened=True
2237 pickle.dump(self, stream, pickle.HIGHEST_PROTOCOL)
2238 if file_opened:
2239 stream.close()
2240
2241 def _SaveHTML(self, stream_or_filename):
2242 def _escape(s):
2243 return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
2244
2245 file_opened = False
2246 if not hasattr(stream_or_filename, 'write'):
2247 stream = open(stream_or_filename, 'w')
2248 file_opened = True
2249 else:
2250 stream = stream_or_filename
2251 stream.write('<table>')
2252 stream.write('<tr>')
2253 for col_name in self.col_names:
2254 stream.write('<th>%s</th>' % _escape(col_name))
2255 stream.write('</tr>')
2256 for row in self.rows:
2257 stream.write('<tr>')
2258 for i, col in enumerate(row):
2259 val = ''
2260 if col != None:
2261 if self.col_types[i] == 'float':
2262 val = '%.3f' % col
2263 elif self.col_types[i] == 'int':
2264 val = '%d' % col
2265 elif self.col_types[i] == 'bool':
2266 val = col and 'true' or 'false'
2267 else:
2268 val = str(col)
2269 stream.write('<td>%s</td>' % _escape(val))
2270 stream.write('</tr>')
2271 stream.write('</table>')
2272 if file_opened:
2273 stream.close()
2274 def _SaveContext(self, stream_or_filename):
2275 file_opened = False
2276 if not hasattr(stream_or_filename, 'write'):
2277 stream = open(stream_or_filename, 'w')
2278 file_opened = True
2279 else:
2280 stream = stream_or_filename
2281 stream.write('\\starttable[')
2282 for col_type in self.col_types:
2283 if col_type =='string':
2284 stream.write('l|')
2285 elif col_type=='int':
2286 stream.write('r|')
2287 elif col_type =='float':
2288 stream.write('i3r|')
2289 else:
2290 stream.write('l|')
2291 stream.write(']\n\\HL\n')
2292 for col_name in self.col_names:
2293 stream.write('\\NC \\bf %s' % col_name)
2294 stream.write(' \\AR\\HL\n')
2295 for row in self.rows:
2296 for i, col in enumerate(row):
2297 val = '---'
2298 if col != None:
2299 if self.col_types[i] == 'float':
2300 val = '%.3f' % col
2301 elif self.col_types[i] == 'int':
2302 val = '%d' % col
2303 elif self.col_types[i] == 'bool':
2304 val = col and 'true' or 'false'
2305 else:
2306 val = str(col)
2307 stream.write('\\NC %s' % val)
2308 stream.write(' \\AR\n')
2309 stream.write('\\HL\n')
2310 stream.write('\\stoptable')
2311 if file_opened:
2312 stream.close()
2313
2314 def _SaveCSV(self, stream, sep):
2315 file_opened=False
2316 if not hasattr(stream, 'write'):
2317 stream=open(stream, 'w')
2318 file_opened=True
2319
2320 writer=csv.writer(stream, delimiter=sep)
2321 writer.writerow(['%s' % n for n in self.col_names])
2322 for row in self.rows:
2323 row=list(row)
2324 for i, c in enumerate(row):
2325 if c==None:
2326 row[i]='NA'
2327 writer.writerow(row)
2328 if file_opened:
2329 stream.close()
2330
2331
2332 def _SaveOST(self, stream):
2333 file_opened=False
2334 if hasattr(stream, 'write'):
2335 writer=csv.writer(stream, delimiter=' ')
2336 else:
2337 stream=open(stream, 'w')
2338 writer=csv.writer(stream, delimiter=' ')
2339 file_opened=True
2340 if self.comment:
2341 stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
2342 writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
2343 for row in self.rows:
2344 row=list(row)
2345 for i, c in enumerate(row):
2346 if c==None:
2347 row[i]='NA'
2348 writer.writerow(row)
2349 if file_opened:
2350 stream.close()
2351
2352 def GetNumpyMatrixAsArray(self, *args):
2353 '''
2354 Returns a numpy array with the selected table columns as the columns of a
2355 matrix (one matrix column per selected table column).
2356
2357 Only columns of type *int* or *float* are supported. *NA* values in the
2358 table will be converted to *None* values.
2359
2360 Originally the function used the numpy matrix class but that is going to be
2361 deprecated in the future. Numpy itself suggests replacing numpy matrix by
2362 numpy array.
2363
2364 :param \\*args: column names to include in numpy array
2365
2366 :warning: The function depends on *numpy*
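
A minimal sketch, assuming a table ``tab`` with numeric columns 'x' and 'y'
(hypothetical names):

.. code-block:: python

  m = tab.GetNumpyMatrixAsArray('x', 'y')
  # m is a 2D numpy array with one row per table row and one column per
  # selected table column, i.e. shape (len(tab.rows), 2)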
2367 '''
2368 try:
2369 import numpy as np
2370
2371 if len(args)==0:
2372 raise RuntimeError("At least one column must be specified.")
2373
2374 idxs = []
2375 for arg in args:
2376 idx = self.GetColIndex(arg)
2377 col_type = self.col_types[idx]
2378 if col_type!='int' and col_type!='float':
2379 raise TypeError("Numpy matrix can only be generated from numeric "+\
2380 "column types")
2381 idxs.append(idx)
2382
2383 a = np.array([list(self[i]) for i in idxs])
2384 return a.T
2385
2386 except ImportError:
2387 LogError("Function needs numpy, but I could not import it.")
2388 raise
2389
2390 def GetNumpyMatrix(self, *args):
2391 '''
2392 *Caution*: Numpy is deprecating the use of the numpy matrix class.
2393
2394 Returns a numpy matrix containing the selected columns from the table as
2395 columns in the matrix.
2396
2397 Only columns of type *int* or *float* are supported. *NA* values in the
2398 table will be converted to *None* values.
2399
2400 :param \\*args: column names to include in numpy matrix
2401
2402 :warning: The function depends on *numpy*
2403 '''
2404 LogWarning("table.GetNumpyMatrix is deprecated, please use "+
2405 "table.GetNumpyMatrixAsArray instead")
2406 try:
2407 import numpy as np
2408 m = self.GetNumpyMatrixAsArray(*args)
2409 return np.matrix(m)
2410 except ImportError:
2411 LogError("Function needs numpy, but I could not import it.")
2412 raise
2413
2414 def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
2415
2416 '''
2417 In-place Gaussian smoothing of a column in the table with a given standard
2418 deviation. All ''None'' values are set to *na_value* before smoothing.
2419
2420 :param col: column name
2421 :type col: :class:`str`
2422
2423 :param std: standard deviation for gaussian kernel
2424 :type std: `scalar`
2425
2426 :param na_value: all NA (''None'') values of the specified column are set to *na_value* before smoothing
2427 :type na_value: `scalar`
2428
2429 :param padding: padding mode; see the scipy ndimage.gaussian_filter1d documentation for details. The default is 'reflect'
2430 :type padding: :class:`str`
2431
2432 :param c: constant value used for padding if padding mode is constant
2433 :type c: `scalar`
2434
2435
2436
2437 :warning: The function depends on *scipy*
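
A minimal sketch; the column name and parameter values are hypothetical:

.. code-block:: python

  # smooth column 'signal' in place with a kernel of std=2.0, treating
  # ''None'' cells as 0.0 and padding with a constant value of 0.0
  tab.GaussianSmooth('signal', std=2.0, na_value=0.0, padding='constant', c=0.0)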
2438 '''
2439
2440 try:
2441 from scipy import ndimage
2442 import numpy as np
2443 except ImportError:
2444 LogError("I need scipy.ndimage and numpy, but could not import it")
2445 raise
2446
2447 idx = self.GetColIndex(col)
2448 col_type = self.col_types[idx]
2449 if col_type!='int' and col_type!='float':
2450 raise TypeError("GaussianSmooth can only be used on numeric column types")
2451
2452 vals=[]
2453 for v in self[col]:
2454 if v!=None:
2455 vals.append(v)
2456 else:
2457 vals.append(na_value)
2458
2459
2460 smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
2461
2462 result=[]
2463
2464 for v in smoothed_values_ndarray:
2465 result.append(v)
2466
2467 self[col]=result
2468
2469
2470 def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
2471 '''
2472 This returns the optimal prefactor values (i.e. :math:`a, b, c, ...`) for
2473 the following equation
2474
2475 .. math::
2476 :label: op1
2477
2478 a*u + b*v + c*w + ... = z
2479
2480 where :math:`u, v, w` and :math:`z` are vectors. In matrix notation
2481
2482 .. math::
2483 :label: op2
2484
2485 A*p = z
2486
2487 where :math:`A` contains the data from the table :math:`(u,v,w,...)`,
2488 :math:`p` are the prefactors to optimize :math:`(a,b,c,...)` and :math:`z`
2489 is the vector containing the result of equation :eq:`op1`.
2490
2491 The parameter *ref_col* corresponds to :math:`z` in both equations, and \\*args
2492 are columns :math:`u`, :math:`v` and :math:`w` (or :math:`A` in :eq:`op2`).
2493 All columns must be specified by their names.
2494
2495 **Example:**
2496
2497 .. code-block:: python
2498
2499 tab.GetOptimalPrefactors('colC', 'colA', 'colB')
2500
2501 The function returns a list containing the prefactors
2502 :math:`a, b, c, ...` in the correct order (i.e. same as columns were
2503 specified in \\*args).
2504
2505 Weighting:
2506 If the kwarg weights="columX" is specified, the equations are weighted by
2507 the values in that column. Each row is multiplied by the weight in that
2508 row, which leads to :eq:`op3`:
2509
2510 .. math::
2511 :label: op3
2512
2513 \\textit{weight}*a*u + \\textit{weight}*b*v + \\textit{weight}*c*w + ...
2514 = \\textit{weight}*z
2515
2516 Weights must be float or int and can have any value. A value of 0 ignores
2517 this equation, a value of 1 means the same as no weight. If all weights are
2518 the same for each row, the same result will be obtained as with no weights.
2519
2520 **Example:**
2521
2522 .. code-block:: python
2523
2524 tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
2525
2526 '''
2527 try:
2528 import numpy as np
2529
2530 if len(args)==0:
2531 raise RuntimeError("At least one column must be specified.")
2532
2533 b = self.GetNumpyMatrixAsArray(ref_col)
2534 a = self.GetNumpyMatrixAsArray(*args)
2535
2536 if len(kwargs)!=0:
2537 if 'weights' in kwargs:
2538 w = self.GetNumpyMatrixAsArray(kwargs['weights'])
2539 b = np.multiply(b,w)
2540 a = np.multiply(a,w)
2541
2542 else:
2543 raise RuntimeError("specified unrecognized kwargs, use weights as key")
2544
2545 k = np.linalg.inv(a.T@a)@a.T@b
2546 return list(k.T.reshape(-1))
2547
2548 except ImportError:
2549 LogError("Function needs numpy, but I could not import it.")
2550 raise
2551
2552 def PlotEnrichment(self, score_col, class_col, score_dir='-',
2553 class_dir='-', class_cutoff=2.0,
2554 style='-', title=None, x_title=None, y_title=None,
2555 clear=True, save=None):
2556 '''
2557 Plot an enrichment curve of column *score_col*, classified according to
2558 *class_col*, using matplotlib.
2559
2560 For more information about parameters of the enrichment, see
2561 :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
2562
2563 :warning: The function depends on *matplotlib*
2564 '''
2565 try:
2566 import matplotlib.pyplot as plt
2567
2568 enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
2569 class_dir, class_cutoff)
2570
2571 if not title:
2572 title = 'Enrichment of %s'%score_col
2573
2574 if not x_title:
2575 x_title = '% database'
2576
2577 if not y_title:
2578 y_title = '% positives'
2579
2580 if clear:
2581 plt.clf()
2582
2583 plt.plot(enrx, enry, style)
2584
2585 plt.title(title, size='x-large', fontweight='bold')
2586 plt.ylabel(y_title, size='x-large')
2587 plt.xlabel(x_title, size='x-large')
2588
2589 if save:
2590 plt.savefig(save)
2591
2592 return plt
2593 except ImportError:
2594 LogError("Function needs matplotlib, but I could not import it.")
2595 raise
2596
2597 def ComputeEnrichment(self, score_col, class_col, score_dir='-',
2598 class_dir='-', class_cutoff=2.0):
2599 '''
2600 Computes the enrichment of column *score_col* classified according to
2601 *class_col*.
2602
2603 For this it is necessary that the data points are classified into positive
2604 and negative points. This can be done in two ways:
2605
2606 - by using one 'bool' type column (*class_col*) which contains *True* for
2607 positives and *False* for negatives
2608
2609 - by specifying a classification column (*class_col*), a cutoff value
2610 (*class_cutoff*) and the classification columns direction (*class_dir*).
2611 This will generate the classification on the fly
2612
2613 * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
2614 * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
2615
2616 During the calculation, the table will be sorted according to *score_dir*,
2617 where a '-' value means smallest values first and therefore, the smaller
2618 the value, the better.
2619
2620 :warning: If either the value of *class_col* or *score_col* is *None*, the
2621 data in this row is ignored.
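
A minimal sketch, assuming a numeric score column 'score' (smaller is better)
and a bool classifier column 'active' (both names are hypothetical):

.. code-block:: python

  enr = tab.ComputeEnrichment('score', 'active', score_dir='-')
  if enr is not None:
    frac_db, frac_pos = enr   # x- and y-values of the enrichment curve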
2622 '''
2623
2624 ALLOWED_DIR = ['+','-']
2625
2626 score_idx = self.GetColIndex(score_col)
2627 score_type = self.col_types[score_idx]
2628 if score_type!='int' and score_type!='float':
2629 raise TypeError("Score column must be numeric type")
2630
2631 class_idx = self.GetColIndex(class_col)
2632 class_type = self.col_types[class_idx]
2633 if class_type!='int' and class_type!='float' and class_type!='bool':
2634 raise TypeError("Classifier column must be numeric or bool type")
2635
2636 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2637 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2638
2639 self.Sort(score_col, score_dir)
2640
2641 x = [0]
2642 y = [0]
2643 enr = 0
2644 old_score_val = None
2645 i = 0
2646
2647 for row in self.rows:
2648 class_val = row[class_idx]
2649 score_val = row[score_idx]
2650 if class_val==None or score_val==None:
2651 continue
2652 if class_val!=None:
2653 if old_score_val==None:
2654 old_score_val = score_val
2655 if score_val!=old_score_val:
2656 x.append(i)
2657 y.append(enr)
2658 old_score_val = score_val
2659 i+=1
2660 if class_type=='bool':
2661 if class_val==True:
2662 enr += 1
2663 else:
2664 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2665 enr += 1
2666 x.append(i)
2667 y.append(enr)
2668
2669 # if no valid data points or no positives are found, return None
2670 if x[-1]==0 or y[-1]==0:
2671 return None
2672
2673 x = [float(v)/x[-1] for v in x]
2674 y = [float(v)/y[-1] for v in y]
2675 return x,y
2676
2677 def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
2678 class_dir='-', class_cutoff=2.0):
2679 '''
2680 Computes the area under the curve of the enrichment using the trapezoidal
2681 rule.
2682
2683 For more information about parameters of the enrichment, see
2684 :meth:`ComputeEnrichment`.
2685
2686 :warning: The function depends on *numpy*
2687 '''
2688 try:
2689 import numpy as np
2690
2691 enr = self.ComputeEnrichment(score_col, class_col, score_dir,
2692 class_dir, class_cutoff)
2693
2694 if enr==None:
2695 return None
2696 return np.trapz(enr[1], enr[0])
2697 except ImportError:
2698 LogError("Function needs numpy, but I could not import it.")
2699 raise
2700
2701 def ComputeROC(self, score_col, class_col, score_dir='-',
2702 class_dir='-', class_cutoff=2.0):
2703 '''
2704 Computes the receiver operating characteristics (ROC) of column *score_col*
2705 classified according to *class_col*.
2706
2707 For this it is necessary that the data points are classified into positive
2708 and negative points. This can be done in two ways:
2709
2710 - by using one 'bool' column (*class_col*) which contains True for positives
2711 and False for negatives
2712 - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
2713 and the classification columns direction (*class_dir*). This will generate
2714 the classification on the fly
2715
2716 - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
2717 - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
2718
2719 During the calculation, the table will be sorted according to *score_dir*,
2720 where a '-' value means smallest values first and therefore, the smaller
2721 the value, the better.
2722
2723 If *class_col* does not contain any positives, i.e. no value is True (if the
2724 column is of type bool) or no value is classified as positive (if the column
2725 is of type int or float, depending on *class_dir* and *class_cutoff*), the
2726 ROC is not defined and the function will return *None*.
2727
2728 :warning: If either the value of *class_col* or *score_col* is *None*, the
2729 data in this row is ignored.
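
A minimal sketch, using a numeric classifier column with an on-the-fly cutoff
(the column names and the cutoff value are hypothetical):

.. code-block:: python

  roc = tab.ComputeROC('score', 'rmsd', score_dir='-',
                       class_dir='-', class_cutoff=2.0)
  if roc is not None:
    fpr, tpr = roc   # false/true positive rates along the curve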
2730 '''
2731
2732 ALLOWED_DIR = ['+','-']
2733
2734 score_idx = self.GetColIndex(score_col)
2735 score_type = self.col_types[score_idx]
2736 if score_type!='int' and score_type!='float':
2737 raise TypeError("Score column must be numeric type")
2738
2739 class_idx = self.GetColIndex(class_col)
2740 class_type = self.col_types[class_idx]
2741 if class_type!='int' and class_type!='float' and class_type!='bool':
2742 raise TypeError("Classifier column must be numeric or bool type")
2743
2744 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2745 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2746
2747 self.Sort(score_col, score_dir)
2748
2749 x = [0]
2750 y = [0]
2751 tp = 0
2752 fp = 0
2753 old_score_val = None
2754
2755 for i,row in enumerate(self.rows):
2756 class_val = row[class_idx]
2757 score_val = row[score_idx]
2758 if class_val==None or score_val==None:
2759 continue
2760 if class_val!=None:
2761 if old_score_val==None:
2762 old_score_val = score_val
2763 if score_val!=old_score_val:
2764 x.append(fp)
2765 y.append(tp)
2766 old_score_val = score_val
2767 if class_type=='bool':
2768 if class_val==True:
2769 tp += 1
2770 else:
2771 fp += 1
2772 else:
2773 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2774 tp += 1
2775 else:
2776 fp += 1
2777 x.append(fp)
2778 y.append(tp)
2779
2780 # if no false positives or no true positives are found, return None
2781 if x[-1]==0 or y[-1]==0:
2782 return None
2783
2784 x = [float(v)/x[-1] for v in x]
2785 y = [float(v)/y[-1] for v in y]
2786 return x,y
2787
2788 def ComputeROCAUC(self, score_col, class_col, score_dir='-',
2789 class_dir='-', class_cutoff=2.0):
2790 '''
2791 Computes the area under the curve of the receiver operating characteristics
2792 using the trapezoidal rule.
2793
2794 For more information about parameters of the ROC, see
2795 :meth:`ComputeROC`.
2796
2797 :warning: The function depends on *numpy*
2798 '''
2799 try:
2800 import numpy as np
2801
2802 roc = self.ComputeROC(score_col, class_col, score_dir,
2803 class_dir, class_cutoff)
2804
2805 if not roc:
2806 return None
2807 return np.trapz(roc[1], roc[0])
2808 except ImportError:
2809 LogError("Function needs numpy, but I could not import it.")
2810 raise
2811
2812 def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
2813 class_dir='-', class_cutoff=2.0):
2814 '''
2815 Computes the area under the curve of the log receiver operating
2816 characteristics (logROC) where the x-axis is semilogarithmic
2817 using the trapezoidal rule.
2818
2819 The logROC is computed with a lambda of 0.001 according to
2820 Rapid Context-Dependent Ligand Desolvation in Molecular Docking
2821 Mysinger M. and Shoichet B., Journal of Chemical Information and Modeling
2822 2010 50 (9), 1561-1573
2823
2824 For more information about parameters of the ROC, see
2825 :meth:`ComputeROC`.
2826
2827 :warning: The function depends on *numpy*
2828 '''
2829 try:
2830 import numpy as np
2831
2832 roc = self.ComputeROC(score_col, class_col, score_dir,
2833 class_dir, class_cutoff)
2834
2835 if not roc:
2836 return None
2837
2838 rocxt, rocyt = roc
2839 rocx=[]
2840 rocy=[]
2841
2842 # define lambda
2843 l=0.001
2844
2845 # clamp zero x-values to lambda, then remove duplicate x-values
2846 rocxt = [x if x>0 else l for x in rocxt]
2847 for i in range(len(rocxt)-1):
2848 if rocxt[i]==rocxt[i+1]:
2849 continue
2850 rocx.append(rocxt[i])
2851 rocy.append(rocyt[i])
2852 rocx.append(1.0)
2853 rocy.append(1.0)
2854
2855 # compute logauc
2856 value = 0
2857 for i in range(len(rocx)-1):
2858 x = rocx[i]
2859 if rocx[i]==rocx[i+1]:
2860 continue
2861 b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
2862 value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
2863 return value/math.log10(1.0/l)
2864
2865 except ImportError:
2866 LogError("Function needs numpy, but I could not import it.")
2867 raise
2868
2869 def PlotROC(self, score_col, class_col, score_dir='-',
2870 class_dir='-', class_cutoff=2.0,
2871 style='-', title=None, x_title=None, y_title=None,
2872 clear=True, save=None):
2873 '''
2874 Plot an ROC curve using matplotlib.
2875
2876 For more information about parameters of the ROC, see
2877 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2878
2879 :warning: The function depends on *matplotlib*
2880 '''
2881
2882 try:
2883 import matplotlib.pyplot as plt
2884
2885 roc = self.ComputeROC(score_col, class_col, score_dir,
2886 class_dir, class_cutoff)
2887
2888 if not roc:
2889 return None
2890
2891 enrx, enry = roc
2892
2893 if not title:
2894 title = 'ROC of %s'%score_col
2895
2896 if not x_title:
2897 x_title = 'false positive rate'
2898
2899 if not y_title:
2900 y_title = 'true positive rate'
2901
2902 if clear:
2903 plt.clf()
2904
2905 plt.plot(enrx, enry, style)
2906
2907 plt.title(title, size='x-large', fontweight='bold')
2908 plt.ylabel(y_title, size='x-large')
2909 plt.xlabel(x_title, size='x-large')
2910
2911 if save:
2912 plt.savefig(save)
2913
2914 return plt
2915 except ImportError:
2916 LogError("Function needs matplotlib, but I could not import it.")
2917 raise
2918
2919 def PlotLogROC(self, score_col, class_col, score_dir='-',
2920 class_dir='-', class_cutoff=2.0,
2921 style='-', title=None, x_title=None, y_title=None,
2922 clear=True, save=None):
2923 '''
2924 Plot a logROC curve where the x-axis is semilogarithmic using matplotlib.
2925
2926 For more information about parameters of the ROC, see
2927 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2928
2929 :warning: The function depends on *matplotlib*
2930 '''
2931
2932 try:
2933 import matplotlib.pyplot as plt
2934
2935 roc = self.ComputeROC(score_col, class_col, score_dir,
2936 class_dir, class_cutoff)
2937
2938 if not roc:
2939 return None
2940
2941 rocx, rocy = roc
2942
2943 if not title:
2944 title = 'logROC of %s'%score_col
2945
2946 if not x_title:
2947 x_title = 'false positive rate'
2948
2949 if not y_title:
2950 y_title = 'true positive rate'
2951
2952 if clear:
2953 plt.clf()
2954
2955 rocx = [x if x>0 else 0.001 for x in rocx]
2956
2957
2958 plt.plot(rocx, rocy, style)
2959
2960 plt.title(title, size='x-large', fontweight='bold')
2961 plt.ylabel(y_title, size='x-large')
2962 plt.xlabel(x_title, size='x-large')
2963 try:
2964 plt.xscale('log', basex=10)
2965 except:
2966 plt.xscale('log', base=10) # breaking change in matplotlib 3.5
2967 plt.xlim(0.001, 1.0)
2968
2969 if save:
2970 plt.savefig(save)
2971
2972 return plt
2973 except ImportError:
2974 LogError("Function needs matplotlib, but I could not import it.")
2975 raise
2976
2977 def ComputeMCC(self, score_col, class_col, score_dir='-',
2978 class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
2979 '''
2980 Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
2981 with the points classified into true positives, false positives, true
2982 negatives and false negatives according to a specified classification
2983 column (*class_col*).
2984
2985 The datapoints in *score_col* and *class_col* are classified into
2986 positive and negative points. This can be done in two ways:
2987
2988 - by using 'bool' columns which contains True for positives and False
2989 for negatives
2990
2991 - by using 'float' or 'int' columns and specifying a cutoff value and the
2992 columns direction. This will generate the classification on the fly
2993
2994 * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2995 * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2996
2997 The two possibilities can be used together, i.e. 'bool' type for one column
2998 and 'float'/'int' type and cutoff/direction for the other column.
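
A minimal sketch, combining a bool classifier column with a numeric score
column and cutoff (the column names and the cutoff value are hypothetical):

.. code-block:: python

  mcc = tab.ComputeMCC('score', 'active', score_dir='-', score_cutoff=1.5)
  # mcc is None if any factor of the MCC denominator is zero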
2999 '''
3000 ALLOWED_DIR = ['+','-']
3001
3002 score_idx = self.GetColIndex(score_col)
3003 score_type = self.col_types[score_idx]
3004 if score_type!='int' and score_type!='float' and score_type!='bool':
3005 raise TypeError("Score column must be numeric or bool type")
3006
3007 class_idx = self.GetColIndex(class_col)
3008 class_type = self.col_types[class_idx]
3009 if class_type!='int' and class_type!='float' and class_type!='bool':
3010 raise TypeError("Classifier column must be numeric or bool type")
3011
3012 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
3013 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
3014
3015 tp = 0
3016 fp = 0
3017 fn = 0
3018 tn = 0
3019
3020 for i,row in enumerate(self.rows):
3021 class_val = row[class_idx]
3022 score_val = row[score_idx]
3023 if class_val!=None:
3024 if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
3025 if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
3026 tp += 1
3027 else:
3028 fn += 1
3029 else:
3030 if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
3031 tn += 1
3032 else:
3033 fp += 1
3034
3035 mcc = None
3036 msg = None
3037 if (tp+fn)==0:
3038 msg = 'factor (tp + fn) is zero'
3039 elif (tp+fp)==0:
3040 msg = 'factor (tp + fp) is zero'
3041 elif (tn+fn)==0:
3042 msg = 'factor (tn + fn) is zero'
3043 elif (tn+fp)==0:
3044 msg = 'factor (tn + fp) is zero'
3045
3046 if msg:
3047 LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
3048 else:
3049 mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
3050 return mcc
3051
3052
3053 def IsEmpty(self, col_name=None, ignore_nan=True):
3054 '''
3055 Checks if a table is empty.
3056
3057 If no column name is specified, the whole table is checked for being empty,
3058 whereas if a column name is specified, only this column is checked.
3059
3060 By default, all NAN (or None) values are ignored, and thus a table
3061 containing only NAN values is considered empty. By specifying the
3062 option ignore_nan=False, NAN values are counted as 'normal' values.
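
For example (the column name is hypothetical, and it is assumed that ''None''
cells can be passed to the constructor):

.. code-block:: python

  tab = Table(['x'], 'f', x=[None, None])
  tab.IsEmpty()                    # True: only None values present
  tab.IsEmpty(ignore_nan=False)    # False: None cells count as values
  tab.IsEmpty('x')                 # True: column 'x' holds no real values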
3063 '''
3064
3065 # table with no columns and no rows
3066 if len(self.col_names)==0:
3067 if col_name:
3068 raise ValueError('Table has no column named "%s"' % col_name)
3069 return True
3070
3071 # column name specified
3072 if col_name:
3073 if self.Count(col_name, ignore_nan=ignore_nan)==0:
3074 return True
3075 else:
3076 return False
3077
3078 # no column name specified -> test whole table
3079 else:
3080 for row in self.rows:
3081 for cell in row:
3082 if ignore_nan:
3083 if cell!=None:
3084 return False
3085 else:
3086 return False
3087 return True
3088
3089
3090 def Extend(self, tab, overwrite=None):
3091 """
3092 Append each row of *tab* to the current table. The data is appended based
3093 on the column names, thus the order of the table columns is *not* relevant,
3094 only the header names.
3095
3096 If there is a column in *tab* that is not present in the current table,
3097 it is added to the current table and filled with *None* for all the rows
3098 present in the current table.
3099
3100 If the type of any column in *tab* is not the same as in the current table
3101 a *TypeError* is raised.
3102
3103 If *overwrite* is not None and set to an existing column name, the specified
3104 column in the current table is searched for the first occurrence of a value
3105 matching the value of the same column in the row to be added. If a matching
3106 value is found, that row is overwritten with the new row. If no matching
3107 row is found, a new row is appended to the table.
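
A minimal sketch; the tables and column names are hypothetical:

.. code-block:: python

  tab1 = Table(['x', 'y'], 'if', x=[1, 2], y=[10.0, 15.0])
  tab2 = Table(['x', 'u'], 'if', x=[3], u=[100.0])
  tab1.Extend(tab2)
  # tab1 now has columns x, y, u; existing rows get u=None and the
  # appended row has y=None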
3108 """
3109 # add column to current table if it doesn't exist
3110 for name,typ in zip(tab.col_names, tab.col_types):
3111 if not name in self.col_names:
3112 self.AddCol(name, typ)
3113
3114 # check that column types are the same in current and new table
3115 for name in self.col_names:
3116 if name in tab.col_names:
3117 curr_type = self.col_types[self.GetColIndex(name)]
3118 new_type = tab.col_types[tab.GetColIndex(name)]
3119 if curr_type!=new_type:
3120 raise TypeError('cannot extend table, column %s in new '%name +\
3121 'table has a different type (%s) than in '%new_type +\
3122 'the current table (%s)'%curr_type)
3123
3124 num_rows = len(tab.rows)
3125 for i in range(0,num_rows):
3126 row = tab.rows[i]
3127 data = dict(list(zip(tab.col_names,row)))
3128 self.AddRow(data, overwrite)
3129
3130
3131def Merge(table1, table2, by, only_matching=False):
3132 """
3133 Returns a new table containing the data from both tables. The rows are
3134 combined based on the common values in the column(s) by. The option 'by' can
3135 be a list of column names. When this is the case, merging is based on
3136 multiple columns.
3137 For example, the two tables below
3138
3139 ==== ====
3140 x y
3141 ==== ====
3142 1 10
3143 2 15
3144 3 20
3145 ==== ====
3146
3147 ==== ====
3148 x u
3149 ==== ====
3150 1 100
3151 3 200
3152 4 400
3153 ==== ====
3154
3155 when merged by column x, produce the following output:
3156
3157 ===== ===== =====
3158 x y u
3159 ===== ===== =====
3160 1 10 100
3161 2 15 None
3162 3 20 200
3163 4 None 400
3164 ===== ===== =====
3165
3166
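The example above corresponds to the following call (the table names are
hypothetical):

.. code-block:: python

  tab1 = Table(['x', 'y'], 'ii', x=[1, 2, 3], y=[10, 15, 20])
  tab2 = Table(['x', 'u'], 'ii', x=[1, 3, 4], u=[100, 200, 400])
  merged = Merge(tab1, tab2, by='x')
  # merged contains columns x, y, u; rows without a partner get None.
  # Use only_matching=True to keep only rows present in both tables.
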
3167 """
3168 def _key(row, indices):
3169 return tuple([row[i] for i in indices])
3170 def _keep(indices, cn, ct, ni):
3171 ncn, nct, nni=([],[],[])
3172 for i in range(len(cn)):
3173 if i not in indices:
3174 ncn.append(cn[i])
3175 nct.append(ct[i])
3176 nni.append(ni[i])
3177 return ncn, nct, nni
3178 col_names=list(table2.col_names)
3179 col_types=list(table2.col_types)
3180 new_index=[i for i in range(len(col_names))]
3181 if isinstance(by, str):
3182 common2_indices=[col_names.index(by)]
3183 else:
3184 common2_indices=[col_names.index(b) for b in by]
3185 col_names, col_types, new_index=_keep(common2_indices, col_names,
3186 col_types, new_index)
3187
3188 for i, name in enumerate(col_names):
3189 try_name=name
3190 counter=1
3191 while try_name in table1.col_names:
3192 counter+=1
3193 try_name='%s_%d' % (name, counter)
3194 col_names[i]=try_name
3195 common1={}
3196 if isinstance(by, str):
3197 common1_indices=[table1.col_names.index(by)]
3198 else:
3199 common1_indices=[table1.col_names.index(b) for b in by]
3200 for row in table1.rows:
3201 key=_key(row, common1_indices)
3202 if key in common1:
3203 raise ValueError('duplicate key "%s" in first table' % (str(key)))
3204 common1[key]=row
3205 common2={}
3206 for row in table2.rows:
3207 key=_key(row, common2_indices)
3208 if key in common2:
3209 raise ValueError('duplicate key "%s" in second table' % (str(key)))
3210 common2[key]=row
3211 new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
3212 for k, v in common1.items():
3213 row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
3214 matched=False
3215 if k in common2:
3216 matched=True
3217 row2=common2[k]
3218 for i, index in enumerate(new_index):
3219 row[len(table1.col_names)+i]=row2[index]
3220 if only_matching and not matched:
3221 continue
3222 new_tab.AddRow(row)
3223 if only_matching:
3224 return new_tab
3225 for k, v in common2.items():
3226 if not k in common1:
3227 v2=[v[i] for i in new_index]
3228 row=[None for i in range(len(table1.col_names))]+v2
3229 for common1_index, common2_index in zip(common1_indices, common2_indices):
3230 row[common1_index]=v[common2_index]
3231 new_tab.AddRow(row)
3232 return new_tab
3233
3234
3235# LocalWords: numpy Numpy