9 from ost
import LogError, LogWarning, LogInfo, LogVerbose
12 return col_name.replace(
'_',
' ')
15 if isinstance(value, TableCol)
or isinstance(value, BinaryColExpr):
24 value=value.strip().upper()
25 return value
in (
'',
'NULL',
'NONE',
'NA')
31 if isinstance(value, TableCol)
or isinstance(value, BinaryColExpr):
40 possibilities=set([
'bool',
'int',
'float'])
42 str_ele=str(ele).upper()
46 if 'int' in possibilities:
50 possibilities.remove(
'int')
52 if 'float' in possibilities:
56 possibilities.remove(
'float')
57 if 'bool' in possibilities:
58 if str_ele
not in set([
'YES',
'NO',
'TRUE',
'FALSE']):
59 possibilities.remove(
'bool')
61 if len(possibilities)==0:
63 if len(possibilities)==2:
68 return possibilities.pop()
76 self.
lhs=itertools.cyle([self.
lhs])
78 self.
rhs=itertools.cycle([self.
rhs])
80 for l, r
in zip(self.
lhs, self.
rhs):
81 if l!=
None and r!=
None:
106 for row
in self._table.rows:
110 return len(self._table.rows)
113 return self._table.rows[index][self.
col_index]
116 self._table.rows[index][self.
col_index]=value
132 Essentially a named tuple, but allows column names that are not valid
133 python variable names.
136 self.__dict__[
'tab'] = weakref.proxy(tab)
137 self.__dict__[
'row_data'] = row_data
140 if type(col_name)==int:
141 return self.row_data[col_name]
142 return self.row_data[self.tab.GetColIndex(col_name)]
146 for k, v
in zip(self.__dict__[
'tab'].col_names, self.__dict__[
'row_data']):
147 s.append(
'%s=%s' % (k, str(v)))
152 return len(self.row_data)
155 if type(col_name)==int:
156 self.row_data[col_name] = val
158 self.row_data[self.tab.GetColIndex(col_name)] = val
161 if 'col_names' not in self.tab.__dict__
or col_name
not in self.tab.col_names:
162 raise AttributeError(col_name)
163 return self.row_data[self.tab.GetColIndex(col_name)]
166 if 'col_names' not in self.tab.__dict__
or col_name
not in self.tab.col_names:
167 raise AttributeError(col_name)
168 self.row_data[self.tab.GetColIndex(col_name)] = val
173 The table class provides convenient access to data in tabular form. An empty
174 table can be easily constructed as follows
176 .. code-block:: python
180 If you want to add columns directly when creating the table, column names
181 and *column types* can be specified as follows
183 .. code-block:: python
185 tab = Table(['nameX','nameY','nameZ'], 'sfb')
187 this will create three columns called nameX, nameY and nameZ of type string,
188 float and bool, respectively. There will be no data in the table and thus,
189 the table will not contain any rows.
191 The following *column types* are supported:
202 If you want to add data to the table in addition, use the following:
204 .. code-block:: python
206 tab=Table(['nameX','nameY','nameZ'],
208 nameX = ['a','b','c'],
209 nameY = [0.1, 1.2, 3.414],
210 nameZ = [True, False, False])
212 if values for one column is left out, they will be filled with NA, but if
213 values are specified, all values must be specified (i.e. same number of
218 SUPPORTED_TYPES=(
'int',
'float',
'bool',
'string',)
221 def __init__(self, col_names=[], col_types=None, **kwargs):
231 self.
col_names=[v
for v
in kwargs.keys()]
242 if 'col_names' not in self.__dict__
or col_name
not in self.
col_names:
243 raise AttributeError(col_name)
247 def _ParseColTypes(types, exp_num=None):
251 short2long = {
's' :
'string',
'i':
'int',
'b' :
'bool',
'f' :
'float'}
252 allowed_short = short2long.keys()
253 allowed_long = short2long.values()
260 types = types.lower()
263 if types
in allowed_long:
264 type_list.append(types)
265 elif types
in allowed_short:
266 type_list.append(short2long[types])
269 elif types.find(
',')!=-1:
270 for t
in types.split(
','):
271 if t
in allowed_long:
273 elif t
in allowed_short:
274 type_list.append(short2long[t])
276 raise ValueError(
'Unknown type %s in types %s'%(t,types))
281 if t
in allowed_short:
282 type_list.append(short2long[t])
284 raise ValueError(
'Unknown type %s in types %s'%(t,types))
288 raise ValueError(
'Col type %s must be string or list'%types)
296 if t
in allowed_long:
298 elif t
in allowed_short:
299 type_list.append(short2long[t])
301 raise ValueError(
'Unknown type %s in types %s'%(t,types))
305 raise ValueError(
'Col type %s must be string or list'%types)
308 if len(type_list)!=exp_num:
309 raise ValueError(
'Parsed number of col types (%i) differs from ' + \
310 'expected (%i) in types %s'%(len(type_list),exp_num,types))
316 Set name of the table
319 :type name: :class:`str`
331 Rename column *old_name* to *new_name*.
333 :param old_name: Name of the old column
334 :param new_name: Name of the new column
335 :raises: :exc:`ValueError` when *old_name* is not a valid column
337 if old_name==new_name:
342 def _Coerce(self, value, ty):
344 Try to convert values (e.g. from :class:`str` type) to the specified type
346 :param value: the value
347 :type value: any type
349 :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
351 :type ty: :class:`str`
353 if value==
'NA' or value==
None:
362 if isinstance(value, str)
or isinstance(value, unicode):
363 if value.upper()
in (
'FALSE',
'NO',):
367 raise ValueError(
'Unknown type %s' % ty)
371 Returns the column index for the column with the given name.
373 :raises: ValueError if no column with the name is found.
376 raise ValueError(
'Table has no column named "%s"' % col)
377 return self.col_names.index(col)
381 Returns a list containing all column names.
387 Returns a list of column names matching the regex.
389 :param regex: regex pattern
390 :type regex: :class:`str`
392 :returns: :class:`list` of column names (:class:`str`)
396 matches = re.search(regex, name)
398 matching_names.append(name)
399 return matching_names
403 Checks if the column with a given name is present in the table.
418 value=itertools.cycle([value])
419 for r, v
in zip(self.
rows, value):
422 def ToString(self, float_format='%.3f', int_format='%d', rows=None):
424 Convert the table into a string representation.
426 The output format can be modified for int and float type columns by
427 specifying a formatting string for the parameters *float_format* and
430 The option *rows* specify the range of rows to be printed. The parameter
431 must be a type that supports indexing (e.g. a :class:`list`) containing the
432 start and end row *index*, e.g. [start_row_idx, end_row_idx].
434 :param float_format: formatting string for float columns
435 :type float_format: :class:`str`
437 :param int_format: formatting string for int columns
438 :type int_format: :class:`str`
440 :param rows: iterable containing start and end row *index*
441 :type rows: iterable containing :class:`ints <int>`
443 widths=[len(cn)
for cn
in self.
col_names]
446 sel_rows=self.
rows[rows[0]:rows[1]]
448 for i, (ty, col)
in enumerate(zip(self.
col_types, row)):
450 widths[i]=max(widths[i], len(
'NA'))
452 widths[i]=max(widths[i], len(float_format % col))
454 widths[i]=max(widths[i], len(int_format % col))
456 widths[i]=max(widths[i], len(str(col)))
459 s+=
''.join([
'# %s\n' % l
for l
in self.comment.split(
'\n')])
460 total_width=sum(widths)+2*len(widths)
461 for width, col_name
in zip(widths, self.
col_names):
462 s+=col_name.center(width+2)
463 s+=
'\n%s\n' % (
'-'*total_width)
465 for width, ty, col
in zip(widths, self.
col_types, row):
468 cs=
'NA'.center(width+2)
470 cs=(float_format % col).rjust(width+2)
472 cs=(int_format % col).rjust(width+2)
474 cs=
' '+str(col).ljust(width+1)
485 Statistics for column %(col)s
487 Number of Rows : %(num)d
488 Number of Rows Not None: %(num_non_null)d
491 Standard Deviation : %(stddev)f
497 'num' : len(self.
rows),
498 'num_non_null' : self.
Count(col),
499 'median' : self.
Median(col),
500 'mean' : self.
Mean(col),
501 'stddev' : self.
StdDev(col),
502 'min' : self.
Min(col),
503 'max' : self.
Max(col),
507 def _AddRowsFromDict(self, d, overwrite=None):
509 Add one or more rows from a :class:`dictionary <dict>`.
511 If *overwrite* is not None and set to an existing column name, the specified
512 column in the table is searched for the first occurrence of a value matching
513 the value of the column with the same name in the dictionary. If a matching
514 value is found, the row is overwritten with the dictionary. If no matching
515 row is found, a new row is appended to the table.
517 :param d: dictionary containing the data
518 :type d: :class:`dict`
520 :param overwrite: column name to overwrite existing row if value in
521 column *overwrite* matches
522 :type overwrite: :class:`str`
524 :raises: :class:`ValueError` if multiple rows are added but the number of
525 data items is different for different columns.
532 for k,v
in d.iteritems():
538 elif old_len!=len(v):
539 raise ValueError(
"Cannot add rows: length of data must be equal " + \
540 "for all columns in %s"%str(d))
543 for i,data
in enumerate(zip(*d.values())):
544 new_row = [
None for a
in range(len(self.
col_names))]
545 for idx,v
in zip(idxs,data):
552 for i,r
in enumerate(self.
rows):
553 if r[overwrite_idx]==new_row[overwrite_idx]:
554 for j,e
in enumerate(self.
rows[i]):
557 self.
rows[i] = new_row
562 if not overwrite
or not added:
563 self.rows.append(new_row)
567 Two-sided test for the null-hypothesis that two related samples
568 have the same average (expected values).
570 :param col_a: First column
571 :param col_b: Second column
573 :returns: P-value between 0 and 1 that the two columns have the
574 same average. The smaller the value, the less related the two
577 from scipy.stats
import ttest_rel
580 for x, y
in self.
Zip(col_a, col_b):
581 if x!=
None and y!=
None:
584 result = ttest_rel(xs, ys)
589 Add a row to the table.
591 *data* may either be a dictionary or a list-like object:
593 - If *data* is a dictionary, the keys in the dictionary must match the
594 column names. Columns not found in the dict will be initialized to None.
595 If the dict contains list-like objects, multiple rows will be added, if
596 the number of items in all list-like objects is the same, otherwise a
597 :class:`ValueError` is raised.
599 - If *data* is a list-like object, the row is initialized from the values
600 in *data*. The number of items in *data* must match the number of
601 columns in the table. A :class:`ValuerError` is raised otherwise. The
602 values are added in the order specified in the list, thus, the order of
603 the data must match the columns.
605 If *overwrite* is not None and set to an existing column name, the specified
606 column in the table is searched for the first occurrence of a value matching
607 the value of the column with the same name in the dictionary. If a matching
608 value is found, the row is overwritten with the dictionary. If no matching
609 row is found, a new row is appended to the table.
611 :param data: data to add
612 :type data: :class:`dict` or *list-like* object
614 :param overwrite: column name to overwrite existing row if value in
615 column *overwrite* matches
616 :type overwrite: :class:`str`
618 :raises: :class:`ValueError` if *list-like* object is used and number of
619 items does *not* match number of columns in table.
621 :raises: :class:`ValueError` if *dict* is used and multiple rows are added
622 but the number of data items is different for different columns.
624 **Example:** add multiple data rows to a subset of columns using a dictionary
626 .. code-block:: python
628 # create table with three float columns
629 tab = Table(['x','y','z'], 'fff')
632 data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
637 will produce the table
647 # overwrite the row with x=1.2 and add row with x=1.9
648 data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
649 tab.AddRow(data, overwrite='x')
653 will produce the table
668 msg=
'data array must have %d elements, not %d'
669 raise ValueError(msg % (len(self.
col_names), len(data)))
676 for i,r
in enumerate(self.
rows):
677 if r[overwrite_idx]==new_row[overwrite_idx]:
678 self.
rows[i] = new_row
683 if not overwrite
or not added:
684 self.rows.append(new_row)
688 Remove column with the given name from the table.
690 :param col: name of column to remove
691 :type col: :class:`str`
696 for row
in self.
rows:
699 def AddCol(self, col_name, col_type, data=None):
701 Add a column to the right of the table.
703 :param col_name: name of new column
704 :type col_name: :class:`str`
706 :param col_type: type of new column (long versions: *int*, *float*, *bool*,
707 *string* or short versions: *i*, *f*, *b*, *s*)
708 :type col_type: :class:`str`
710 :param data: data to add to new column
711 :type data: scalar or iterable
715 .. code-block:: python
717 tab = Table(['x'], 'f', x=range(5))
718 tab.AddCol('even', 'bool', itertools.cycle([True, False]))
722 will produce the table
735 If data is a constant instead of an iterable object, it's value
736 will be written into each row:
738 .. code-block:: python
740 tab = Table(['x'], 'f', x=range(5))
741 tab.AddCol('num', 'i', 1)
745 will produce the table
758 As a special case, if there are no previous rows, and data is not
759 None, rows are added for every item in data.
763 raise ValueError(
'Column with name %s already exists'%col_name)
766 self.col_names.append(col_name)
767 self.col_types.append(col_type)
771 for row
in self.
rows:
774 if hasattr(data,
'__len__')
and len(data)!=len(self.
rows):
777 raise ValueError(
'Length of data (%i) must correspond to number of '%len(data) +\
778 'existing rows (%i)'%len(self.
rows))
779 for row, d
in zip(self.
rows, data):
782 elif data!=
None and len(self.
col_names)==1:
784 self.
AddRow({col_name : data})
787 self.
AddRow({col_name : v})
791 Returns a filtered table only containing rows matching all the predicates
792 in kwargs and args For example,
794 .. code-block:: python
796 tab.Filter(town='Basel')
798 will return all the rows where the value of the column "town" is equal to
799 "Basel". Several predicates may be combined, i.e.
801 .. code-block:: python
803 tab.Filter(town='Basel', male=True)
805 will return the rows with "town" equal to "Basel" and "male" equal to true.
806 args are unary callables returning true if the row should be included in the
807 result and false if not.
810 for row
in self.
rows:
816 for key, val
in kwargs.iteritems():
828 Returns a new table object containing all rows matching a logical query expression.
830 *query* is a string containing the logical expression, that will be evaluated
833 Operands have to be the name of a column or an expression that can be parsed to
834 float, int, bool or string.
835 Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, *, /
837 .. code-block:: python
839 subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
841 The selection query should be self explaining. Allowed parenthesis are: (), [], {},
842 whereas parenthesis mismatches get recognized. Expressions like '3<=col_a>=col_b'
843 throw an error, due to problems in figuring out the evaluation order.
845 There are two special expressions:
847 .. code-block:: python
849 #selects rows, where 1.0<=col_a<=1.5
850 subtab = tab.Select('col_a=1.0:1.5')
852 #selects rows, where col_a=1 or col_a=2 or col_a=3
853 subtab = tab.Select('col_a=1,2,3')
855 Only consistent types can be compared. If col_a is of type string and col_b is of type int,
856 following expression would throw an error: 'col_a<col_b'
861 from table_selector
import TableSelector
863 raise ImportError(
"Tried to import from the file table_selector.py, but could not find it!")
869 for row
in self.
rows:
870 if selector.EvaluateRow(row):
871 selected_tab.AddRow(row)
877 def _LoadOST(stream_or_filename):
878 fieldname_pattern=re.compile(
r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
879 values_pattern=re.compile(
"([^\" ]+|\"[^\"]*\")+")
880 if not hasattr(stream_or_filename,
'read'):
881 stream=open(stream_or_filename,
'r')
883 stream=stream_or_filename
888 if line.startswith(
'#'):
896 for col
in line.split():
897 match=fieldname_pattern.match(col)
899 if match.group(
'type'):
900 fieldtypes.append(match.group(
'type'))
902 fieldtypes.append(
'string')
903 fieldnames.append(match.group(
'name'))
904 tab=
Table(fieldnames, fieldtypes)
907 tab.AddRow([x.strip(
'"')
for x
in values_pattern.findall(line)])
909 raise IOError(
"Cannot read table from empty stream")
912 def _GuessColumnTypes(self):
913 for col_idx
in range(len(self.
col_names)):
915 for row
in self.
rows:
916 for idx
in range(len(row)):
920 def _LoadCSV(stream_or_filename, sep):
921 if not hasattr(stream_or_filename,
'read'):
922 stream=open(stream_or_filename,
'r')
924 stream=stream_or_filename
925 reader=csv.reader(stream, delimiter=sep)
931 tab=
Table(header, types)
936 raise IOError(
'trying to load table from empty CSV stream/file')
938 tab._GuessColumnTypes()
942 def _LoadPickle(stream_or_filename):
943 if not hasattr(stream_or_filename,
'read'):
944 stream=open(stream_or_filename,
'rb')
946 stream=stream_or_filename
947 return cPickle.load(stream)
950 def _GuessFormat(filename):
952 filename = filename.name
953 except AttributeError, e:
955 if filename.endswith(
'.csv'):
957 elif filename.endswith(
'.pickle'):
964 def Load(stream_or_filename, format='auto', sep=','):
966 Load table from stream or file with given name.
968 By default, the file format is set to *auto*, which tries to guess the file
969 format from the file extension. The following file extensions are
972 ============ ======================
973 extension recognized format
974 ============ ======================
975 .csv comma separated values
976 .pickle pickled byte stream
977 <all others> ost-specific format
978 ============ ======================
980 Thus, *format* must be specified for reading file with different filename
983 The following file formats are understood:
987 This is an ost-specific, but still human readable file format. The file
988 (stream) must start with header line of the form
990 col_name1[type1] <col_name2[type2]>...
992 The types given in brackets must be one of the data types the
993 :class:`Table` class understands. Each following line in the file then must
994 contains exactly the same number of data items as listed in the header. The
995 data items are automatically converted to the column format. Lines starting
996 with a '#' and empty lines are ignored.
1000 Deserializes the table from a pickled byte stream.
1004 Reads the table from comma separated values stream. Since there is no
1005 explicit type information in the csv file, the column types are guessed,
1006 using the following simple rules:
1008 * if all values are either NA/NULL/NONE the type is set to string.
1009 * if all non-null values are convertible to float/int the type is set to
1011 * if all non-null values are true/false/yes/no, the value is set to bool.
1012 * for all other cases, the column type is set to string.
1014 :returns: A new :class:`Table` instance
1016 format=format.lower()
1018 format = Table._GuessFormat(stream_or_filename)
1021 return Table._LoadOST(stream_or_filename)
1023 return Table._LoadCSV(stream_or_filename, sep=sep)
1024 if format==
'pickle':
1025 return Table._LoadPickle(stream_or_filename)
1026 raise ValueError(
'unknown format ""' % format)
1030 Performs an in-place sort of the table, based on column *by*.
1032 :param by: column name by which to sort
1033 :type by: :class:`str`
1035 :param order: ascending (``-``) or descending (``+``) order
1036 :type order: :class:`str` (i.e. *+*, *-*)
1042 def _key_cmp(lhs, rhs):
1043 return sign*cmp(lhs[key_index], rhs[key_index])
1044 self.
rows=sorted(self.
rows, _key_cmp)
1048 Extract a list of all unique values from one column.
1050 :param col: column name
1051 :type col: :class:`str`
1053 :param ignore_nan: ignore all *None* values
1054 :type ignore_nan: :class:`bool`
1059 for row
in self.
rows:
1061 if item!=
None or ignore_nan==
False:
1062 if item
in seen:
continue
1069 Allows to conveniently iterate over a selection of columns, e.g.
1071 .. code-block:: python
1073 tab = Table.Load('...')
1074 for col1, col2 in tab.Zip('col1', 'col2'):
1079 .. code-block:: python
1081 tab = Table.Load('...')
1082 for col1, col2 in zip(tab['col1'], tab['col2']):
1085 return zip(*[self[arg]
for arg
in args])
1087 def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
1088 z_title=
None, x_range=
None, y_range=
None, z_range=
None,
1089 color=
None, plot_if=
None, legend=
None,
1090 num_z_levels=10, z_contour=
True, z_interpol=
'nn', diag_line=
False,
1091 labels=
None, max_num_labels=
None, title=
None, clear=
True, save=
False,
1094 Function to plot values from your table in 1, 2 or 3 dimensions using
1095 `Matplotlib <http://matplotlib.sourceforge.net>`__
1097 :param x: column name for first dimension
1098 :type x: :class:`str`
1100 :param y: column name for second dimension
1101 :type y: :class:`str`
1103 :param z: column name for third dimension
1104 :type z: :class:`str`
1106 :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
1107 complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1108 :type style: :class:`str`
1110 :param x_title: title for first dimension, if not specified it is
1111 automatically derived from column name
1112 :type x_title: :class:`str`
1114 :param y_title: title for second dimension, if not specified it is
1115 automatically derived from column name
1116 :type y_title: :class:`str`
1118 :param z_title: title for third dimension, if not specified it is
1119 automatically derived from column name
1120 :type z_title: :class:`str`
1122 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1123 :type x_range: :class:`list` of length two
1125 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1126 :type y_range: :class:`list` of length two
1128 :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
1129 :type z_range: :class:`list` of length two
1131 :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
1132 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1133 :type color: :class:`str`
1135 :param plot_if: callable which returnes *True* if row should be plotted. Is
1136 invoked like ``plot_if(self, row)``
1137 :type plot_if: callable
1139 :param legend: legend label for data series
1140 :type legend: :class:`str`
1142 :param num_z_levels: number of levels for third dimension
1143 :type num_z_levels: :class:`int`
1145 :param diag_line: draw diagonal line
1146 :type diag_line: :class:`bool`
1148 :param labels: column name containing labels to put on x-axis for one
1150 :type labels: :class:`str`
1152 :param max_num_labels: limit maximum number of labels
1153 :type max_num_labels: :class:`int`
1155 :param title: plot title, if not specified it is automatically derived from
1156 plotted column names
1157 :type title: :class:`str`
1159 :param clear: clear old data from plot
1160 :type clear: :class:`bool`
1162 :param save: filename for saving plot
1163 :type save: :class:`str`
1165 :param z_contour: draw contour lines
1166 :type z_contour: :class:`bool`
1168 :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
1170 :type z_interpol: :class:`str`
1172 :param \*\*kwargs: additional arguments passed to matplotlib
1174 :returns: the ``matplotlib.pyplot`` module
1176 **Examples:** simple plotting functions
1178 .. code-block:: python
1180 tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
1181 b=[x/2.0 for x in range(1,6)],
1182 c=[math.cos(x) for x in range(0,5)],
1185 # one dimensional plot of column 'd' vs. index
1189 # two dimensional plot of 'a' vs. 'c'
1190 plt = tab.Plot('a', y='c', style='o-')
1193 # three dimensional plot of 'a' vs. 'c' with values 'b'
1194 plt = tab.Plot('a', y='c', z='b')
1195 # manually save plot to file
1196 plt.savefig("plot.png")
1199 import matplotlib.pyplot
as plt
1200 import matplotlib.mlab
as mlab
1208 plt.figure(figsize=[8, 6])
1231 if x_range
and (
IsScalar(x_range)
or len(x_range)!=2):
1232 raise ValueError(
'parameter x_range must contain exactly two elements')
1233 if y_range
and (
IsScalar(y_range)
or len(y_range)!=2):
1234 raise ValueError(
'parameter y_range must contain exactly two elements')
1235 if z_range
and (
IsScalar(z_range)
or len(z_range)!=2):
1236 raise ValueError(
'parameter z_range must contain exactly two elements')
1239 kwargs[
'color']=color
1241 kwargs[
'label']=legend
1245 for row
in self.
rows:
1246 if row[idx1]!=
None and row[idx2]!=
None and row[idx3]!=
None:
1247 if plot_if
and not plot_if(self, row):
1249 xs.append(row[idx1])
1250 ys.append(row[idx2])
1251 zs.append(row[idx3])
1254 z_spacing = (z_range[1] - z_range[0]) / num_z_levels
1258 z_spacing = (self.
Max(z) - l) / num_z_levels
1260 for i
in range(0,num_z_levels+1):
1264 xi = np.linspace(min(xs),max(xs),len(xs)*10)
1265 yi = np.linspace(min(ys),max(ys),len(ys)*10)
1266 zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
1269 plt.contour(xi,yi,zi,levels,linewidths=0.5,colors=
'k')
1271 plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
1272 plt.colorbar(ticks=levels)
1276 for row
in self.
rows:
1277 if row[idx1]!=
None and row[idx2]!=
None:
1278 if plot_if
and not plot_if(self, row):
1280 xs.append(row[idx1])
1281 ys.append(row[idx2])
1282 plt.plot(xs, ys, style, **kwargs)
1289 for row
in self.
rows:
1291 if plot_if
and not plot_if(self, row):
1293 xs.append(row[idx1])
1295 label_vals.append(row[label_idx])
1296 plt.plot(xs, style, **kwargs)
1300 if len(label_vals)>max_num_labels:
1301 interval = int(math.ceil(float(len(label_vals))/max_num_labels))
1302 label_vals = label_vals[::interval]
1303 plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
1308 title =
'%s of %s vs. %s' % (nice_z, nice_x, nice_y)
1310 title =
'%s vs. %s' % (nice_x, nice_y)
1314 plt.title(title, size=
'x-large', fontweight=
'bold',
1315 verticalalignment=
'bottom')
1321 plt.xlabel(nice_x, size=
'x-large')
1323 plt.xlim(x_range[0], x_range[1])
1325 plt.ylim(y_range[0], y_range[1])
1327 plt.plot(x_range, y_range,
'-', color=
'black')
1329 plt.ylabel(nice_y, size=
'x-large')
1332 plt.ylim(y_range[0], y_range[1])
1334 plt.xlabel(x_title, size=
'x-large')
1335 plt.ylabel(nice_y, size=
'x-large')
1340 LogError(
"Function needs numpy and matplotlib, but I could not import it.")
1343 def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
1344 histtype=
'stepfilled', align=
'mid', x_title=
None,
1345 y_title=
None, title=
None, clear=
True, save=
False,
1346 color=
None, y_range=
None):
1348 Create a histogram of the data in col for the range *x_range*, split into
1349 *num_bins* bins and plot it using Matplotlib.
1351 :param col: column name with data
1352 :type col: :class:`str`
1354 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1355 :type x_range: :class:`list` of length two
1357 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1358 :type y_range: :class:`list` of length two
1360 :param num_bins: number of bins in range
1361 :type num_bins: :class:`int`
1363 :param color: Color to be used for the histogram. If not set, color will be
1364 determined by matplotlib
1365 :type color: :class:`str`
1367 :param normed: normalize histogram
1368 :type normed: :class:`bool`
1370 :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
1371 *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1372 :type histtype: :class:`str`
1374 :param align: style of histogram (*left*, *mid*, *right*). See
1375 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1376 :type align: :class:`str`
1378 :param x_title: title for first dimension, if not specified it is
1379 automatically derived from column name
1380 :type x_title: :class:`str`
1382 :param y_title: title for second dimension, if not specified it is
1383 automatically derived from column name
1384 :type y_title: :class:`str`
1386 :param title: plot title, if not specified it is automatically derived from
1387 plotted column names
1388 :type title: :class:`str`
1390 :param clear: clear old data from plot
1391 :type clear: :class:`bool`
1393 :param save: filename for saving plot
1394 :type save: :class:`str`
1396 **Examples:** simple plotting functions
1398 .. code-block:: python
1400 tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
1402 # one dimensional plot of column 'd' vs. index
1403 plt = tab.PlotHistogram('a')
1408 import matplotlib.pyplot
as plt
1411 if len(self.
rows)==0:
1415 kwargs[
'color']=color
1425 n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
1426 normed=normed, histtype=histtype, align=align,
1433 plt.xlabel(nice_x, size=
'x-large')
1440 plt.ylabel(nice_y, size=
'x-large')
1445 nice_title=
"Histogram of %s"%nice_x
1446 plt.title(nice_title, size=
'x-large', fontweight=
'bold')
1452 LogError(
"Function needs numpy and matplotlib, but I could not import it.")
1455 def _Max(self, col):
1456 if len(self.
rows)==0:
1460 if col_type==
'int' or col_type==
'float':
1461 max_val = -float(
'inf')
1462 elif col_type==
'bool':
1464 elif col_type==
'string':
1467 for i
in range(0, len(self.
rows)):
1468 if self.
rows[i][idx]>max_val:
1469 max_val = self.
rows[i][idx]
1471 return max_val, max_idx
1473 def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None,
1474 colors=
None, width=0.8, bottom=0, legend=
False, legend_names=
None, show=
False, save=
False):
1477 Create a barplot of the data in cols. Every column will be represented
1478 at one position. If there are several rows, each column will be grouped
1481 :param cols: List of column names. Every column will be represented as a
1482 single bar. If cols is None, every column of the table gets
1484 :type cols: :class:`list`
1486 :param rows: List of row indices. Values from given rows will be plotted
1487 in parallel at one column position. If set to None, all rows
1488 of the table will be plotted. Note, that the maximum number
1490 :type rows: :class:`list`
1492 :param xlabels: Label for every col on x-axis. If set to None, the column
1493 names are used. The xlabel plotting can be supressed by
1494 the parameter set_xlabel.
1495 :type xlabels: :class:`list`
1497 :param set_xlabels: Controls whether xlabels are plotted or not.
1498 :type set_xlabels: :class:`bool`
1500 :param x_labels_rotation: Can either be 'horizontal', 'vertical' or an
1501 integer, that describes the rotation in degrees.
1503 :param y_title: Y-axis description
1504 :type y_title: :class:`str`
1506 :title: Title of the plot. No title appears if set to None
1507 :type title: :class:`str`
1509 :param colors: Colors of the different bars in each group. Must be a list
1510 of valid colors in matplotlib. Length of color and rows must
1512 :type colors: :class:`list`
1514 :param width: The available space for the groups on the x-axis is divided
1515 by the exact number of groups. The parameters width is the
1516 fraction of what is actually used. If it would be 1.0 the
1517 bars of the different groups would touch each other.
1518 Value must be between [0;1]
1519 :type width: :class:`float`
1521 :param bottom: Bottom
1522 :type bottom: :class:`float`
1524 :param legend: Legend for color explanation, the corresponding row
1525 respectively. If set to True, legend_names must be provided.
1526 :type legend: :class:`bool`
1528 :param legend_names: List of names, that describe the differently colored
1529 bars. Length must be consistent with number of rows.
1531 :param show: If set to True, the plot is directly displayed.
1533 :param save: If set, a png image with name save in the current working
1534 directory will be saved.
1535 :type save: :class:`str`
1540 import matplotlib.pyplot
as plt
1542 raise ImportError(
'PlotBar relies on numpy and matplotlib, but I could' \
1545 standard_colors=[
'b',
'g',
'y',
'c',
'm',
'r','k']
1551 if width<=0
or width>1:
1552 raise ValueError(
'Width must be in [0;1]')
1555 if len(self.
rows)>7:
1556 raise ValueError(
'Table contains too many rows to represent them at one '\
1557 'bar position in parallel. You can Select a Subtable or '\
1558 'specify the parameter rows with a list of row indices '\
1561 rows=range(len(self.
rows))
1563 if not isinstance(rows,list):
1566 raise ValueError(
'Too many rows to represent (max 7). Please note, that '\
1567 'data from multiple rows from one column gets '\
1568 'represented at one position in parallel.')
1571 row=self.
rows[r_idx]
1577 raise ValueError(
'Cannot find column with name '+str(c))
1578 temp.append(row[c_idx])
1582 colors=standard_colors[:len(rows)]
1584 if len(rows)!=len(colors):
1585 raise ValueError(
"Number of rows and number of colors must be consistent!")
1587 ind=np.arange(len(data[0]))
1588 single_bar_width=float(width)/len(data)
1591 ax=fig.add_subplot(111)
1594 for i
in range(len(data)):
1595 legend_data.append(ax.bar(ind+i*single_bar_width+(1-width)/2,data[i],single_bar_width,bottom=bottom,color=colors[i])[0])
1598 ax.set_title(title, size=
'x-large', fontweight=
'bold')
1604 ax.set_ylabel(nice_y)
1607 if len(data[0])!=len(xlabels):
1608 raise ValueError(
'Number of xlabels is not consistent with number of cols!')
1613 ax.set_xticks(ind+0.5)
1614 ax.set_xticklabels(xlabels, rotation = xlabels_rotation)
1619 if legend_names==
None:
1620 raise ValueError(
'You must provide legend names! e.g. names for the rows, '\
1621 'that are printed in parallel.')
1622 if len(legend_names)!=len(data):
1623 raise ValueError(
'length of legend_names must be consistent with number '\
1625 ax.legend(legend_data, legend_names)
1635 def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
1636 colormap=
'jet', show_scalebar=
False, scalebar_label=
None, clear=
True, save=
False, show=
False):
1639 Create a heatplot of the data in col x vs the data in col y using matplotlib
1641 :param x: column name with x data
1642 :type x: :class:`str`
1644 :param y: column name with y data
1645 :type y: :class:`str`
1647 :param title: title of the plot, will be generated automatically if set to None
1648 :type title: :class:`str`
1650 :param x_title: label of x-axis, will be generated automatically if set to None
1651 :type title: :class:`str`
1653 :param y_title: label of y-axis, will be generated automatically if set to None
1654 :type title: :class:`str`
1656 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1657 :type x_range: :class:`list` of length two
1659 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1660 :type y_range: :class:`list` of length two
1662 :param binning: type of binning. If set to None, the value of a hexbin will
1663 correspond to the number of datapoints falling into it. If
1664 set to 'log', the value will be the log with base 10 of the above
1665 value (log(i+1)). If an integer is provided, the number of a
1666 hexbin is equal the number of datapoints falling into it divided
1667 by the integer. If a list of values is provided, these values
1668 will be the lower bounds of the bins.
1670 :param colormap: colormap, that will be used. Value can be every colormap defined
1671 in matplotlib or an own defined colormap. You can either pass a
1672 string with the name of the matplotlib colormap or a colormap
1675 :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
1676 :type show_scalebar: :class:`bool`
1678 :param scalebar_label: Label of the scalebar
1679 :type scalebar_label: :class:`str`
1681 :param clear: clear old data from plot
1682 :type clear: :class:`bool`
1684 :param save: filename for saving plot
1685 :type save: :class:`str`
1687 :param show: directly show plot
1688 :type show: :class:`bool`
1693 import matplotlib.pyplot
as plt
1694 import matplotlib.cm
as cm
1696 raise ImportError(
'PlotHexbin relies on matplotlib, but I could not import it')
1704 if r[idx]!=
None and r[idy]!=
None:
1705 xdata.append(r[idx])
1706 ydata.append(r[idy])
1722 title =
'%s vs. %s' % (nice_x, nice_y)
1725 colormap=getattr(cm, colormap)
1727 if x_range
and (
IsScalar(x_range)
or len(x_range)!=2):
1728 raise ValueError(
'parameter x_range must contain exactly two elements')
1729 if y_range
and (
IsScalar(y_range)
or len(y_range)!=2):
1730 raise ValueError(
'parameter y_range must contain exactly two elements')
1732 ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
1735 plt.xlim((x_range[0], x_range[1]))
1739 plt.ylim(y_range[0], y_range[1])
1744 plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
1746 plt.title(title, size=
'x-large', fontweight=
'bold',
1747 verticalalignment=
'bottom')
1755 cb.set_label(scalebar_label)
1767 Returns the row containing the cell with the maximal value in col. If
1768 several rows have the highest value, only the first one is returned.
1769 ''None'' values are ignored.
1771 :param col: column name
1772 :type col: :class:`str`
1774 :returns: row with maximal col value or None if the table is empty
1776 val, idx = self.
_Max(col)
1778 return self.
rows[idx]
1782 Returns the maximum value in col. If several rows have the highest value,
1783 only the first one is returned. ''None'' values are ignored.
1785 :param col: column name
1786 :type col: :class:`str`
1788 val, idx = self.
_Max(col)
1793 Returns the row index of the cell with the maximal value in col. If
1794 several rows have the highest value, only the first one is returned.
1795 ''None'' values are ignored.
1797 :param col: column name
1798 :type col: :class:`str`
1800 val, idx = self.
_Max(col)
def _Min(self, col):
    """Return ``(min_value, row_index)`` for column *col*.

    ``None`` cells are ignored. If several rows share the minimum, the
    first one wins. Returns ``(None, None)`` for an empty table.

    :param col: column name
    :type col: :class:`str`
    """
    if len(self.rows) == 0:
        return None, None
    idx = self.GetColIndex(col)
    col_type = self.col_types[idx]
    # Seed with the largest possible value for the column type so that any
    # real cell value compares smaller.
    if col_type in ('int', 'float'):
        min_val = float('inf')
    elif col_type == 'bool':
        min_val = True
    elif col_type == 'string':
        # NOTE(review): chr(255) sentinel assumes cell strings sort below
        # it — confirm against stored string data.
        min_val = chr(255)
    min_idx = None
    for i, row in enumerate(self.rows):
        # 'is not None' instead of '!= None' (identity test for None).
        if row[idx] is not None and row[idx] < min_val:
            min_val = row[idx]
            min_idx = i
    return min_val, min_idx
1823 Returns the minimal value in col. If several rows have the lowest value,
1824 only the first one is returned. ''None'' values are ignored.
1826 :param col: column name
1827 :type col: :class:`str`
1829 val, idx = self.
_Min(col)
1834 Returns the row containing the cell with the minimal value in col. If
1835 several rows have the lowest value, only the first one is returned.
1836 ''None'' values are ignored.
1838 :param col: column name
1839 :type col: :class:`str`
1841 :returns: row with minimal col value or None if the table is empty
1843 val, idx = self.
_Min(col)
1845 return self.
rows[idx]
1849 Returns the row index of the cell with the minimal value in col. If
1850 several rows have the lowest value, only the first one is returned.
1851 ''None'' values are ignored.
1853 :param col: column name
1854 :type col: :class:`str`
1856 val, idx = self.
_Min(col)
1861 Returns the sum of the given column. Cells with ''None'' are ignored. Returns
1862 0.0, if the column doesn't contain any elements. Col must be of numeric
1863 column type ('float', 'int') or boolean column type.
1865 :param col: column name
1866 :type col: :class:`str`
1868 :raises: :class:`TypeError` if column type is ``string``
1872 if col_type!=
'int' and col_type!=
'float' and col_type!=
'bool':
1873 raise TypeError(
"Sum can only be used on numeric column types")
1882 Returns the mean of the given column. Cells with ''None'' are ignored. Returns
1883 None, if the column doesn't contain any elements. Col must be of numeric
1884 ('float', 'int') or boolean column type.
1886 If column type is *bool*, the function returns the ratio of
1887 number of 'Trues' by total number of elements.
1889 :param col: column name
1890 :type col: :class:`str`
1892 :raises: :class:`TypeError` if column type is ``string``
1896 if col_type!=
'int' and col_type!=
'float' and col_type!=
'bool':
1897 raise TypeError(
"Mean can only be used on numeric or bool column types")
1904 return stutil.Mean(vals)
1910 Adds a new column of type 'float' with a specified name (*mean_col_name*),
1911 containing the mean of all specified columns for each row.
1913 Cols are specified by their names and must be of numeric column
1914 type ('float', 'int') or boolean column type. Cells with None are ignored.
1915 Adds ''None'' if the row doesn't contain any values.
1917 :param mean_col_name: name of new column containing mean values
1918 :type mean_col_name: :class:`str`
1920 :param cols: name or list of names of columns to include in computation of
1922 :type cols: :class:`str` or :class:`list` of strings
1924 :raises: :class:`TypeError` if column type of columns in *col* is ``string``
1928 Staring with the following table:
1938 the code here adds a column with the name 'mean' to yield the table below:
1940 .. code-block::python
1942 tab.RowMean('mean', ['x', 'u'])
1945 ==== ==== ==== =====
1947 ==== ==== ==== =====
1951 ==== ==== ==== =====
1960 idx = self.GetColIndex(col)
1961 col_type = self.col_types[idx]
1962 if col_type!='int' and col_type!='float' and col_type!='bool':
1963 raise TypeError("RowMean can only be used on numeric column types")
1964 cols_idxs.append(idx)
1967 for row in self.rows:
1969 for idx in cols_idxs:
1974 mean = stutil.Mean(vals)
1975 mean_rows.append(mean)
1977 mean_rows.append(None)
1979 self.AddCol(mean_col_name, 'f', mean_rows)
def Percentiles(self, col, nths):
    """Return the percentiles of column *col* given in *nths*.

    Each requested percentile ``p`` is computed as

    .. code-block:: python

      values[min(len(values)-1, int(round(len(values)*p/100.0+0.5)-1))]

    where ``values`` are the sorted non-``None`` cells of *col*.

    :param col: column name
    :type col: :class:`str`
    :param nths: percentiles to calculate; each must be in [0, 100]
    :type nths: :class:`list`
    :raises: :class:`TypeError` if column type is ``string``,
             :class:`ValueError` if a percentile is outside [0, 100]
    :returns: list of percentiles in the same order as given in *nths*
    """
    idx = self.GetColIndex(col)
    col_type = self.col_types[idx]
    if col_type not in ('int', 'float', 'bool'):
        # Fixed: the message used to say "Median" (copy/paste from Median()).
        raise TypeError("Percentiles can only be used on numeric column types")
    # Validate every requested percentile before doing any work.
    for nth in nths:
        if nth < 0 or nth > 100:
            raise ValueError("percentiles must be between 0 and 100")
    vals = sorted(v for v in self[col] if v is not None)
    if len(vals) == 0:
        return [None] * len(nths)
    percentiles = []
    for nth in nths:
        p = vals[min(len(vals) - 1, int(round(len(vals) * nth / 100.0 + 0.5) - 1))]
        percentiles.append(p)
    return percentiles
def Median(self, col):
    """Return the median of the given column.

    ``None`` cells are ignored. Returns ``None`` if the column doesn't
    contain any elements. Col must be of numeric ('float', 'int') or
    boolean column type.

    :param col: column name
    :type col: :class:`str`
    :raises: :class:`TypeError` if column type is ``string``
    """
    idx = self.GetColIndex(col)
    col_type = self.col_types[idx]
    if col_type not in ('int', 'float', 'bool'):
        raise TypeError("Median can only be used on numeric column types")
    vals = [v for v in self[col] if v is not None]
    if len(vals) == 0:
        return None
    return stutil.Median(vals)
def StdDev(self, col):
    """Return the standard deviation of the given column.

    ``None`` cells are ignored. Returns ``None`` if the column doesn't
    contain any elements. Col must be of numeric ('float', 'int') or
    boolean column type.

    :param col: column name
    :type col: :class:`str`
    :raises: :class:`TypeError` if column type is ``string``
    """
    idx = self.GetColIndex(col)
    col_type = self.col_types[idx]
    if col_type not in ('int', 'float', 'bool'):
        raise TypeError("StdDev can only be used on numeric column types")
    vals = [v for v in self[col] if v is not None]
    if len(vals) == 0:
        return None
    return stutil.StdDev(vals)
def Count(self, col, ignore_nan=True):
    """Count the number of cells in column *col* that are not ``None``.

    :param col: column name
    :type col: :class:`str`
    :param ignore_nan: if ``False``, ``None`` cells are counted as well
      (i.e. the result is the total number of rows)
    :type ignore_nan: :class:`bool`
    """
    idx = self.GetColIndex(col)
    count = 0
    for r in self.rows:
        # A cell counts when it holds a value, or when None cells are
        # explicitly requested to be counted too.
        if r[idx] is not None or not ignore_nan:
            count += 1
    return count
def Correl(self, col1, col2):
    """Calculate the Pearson correlation coefficient between *col1* and
    *col2*, only taking rows into account where both values are not
    ``None``. Returns ``None`` if there are not enough data points.

    :param col1: column name for first column
    :type col1: :class:`str`
    :param col2: column name for second column
    :type col2: :class:`str`
    """
    if IsStringLike(col1) and IsStringLike(col2):
        col1 = self.GetColIndex(col1)
        col2 = self.GetColIndex(col2)
    vals1, vals2 = [], []
    for v1, v2 in zip(self[col1], self[col2]):
        if v1 is not None and v2 is not None:
            vals1.append(v1)
            vals2.append(v2)
    try:
        return stutil.Correl(vals1, vals2)
    except Exception:
        # Narrowed from a bare except: too few data points (or a
        # degenerate distribution) yields None instead of raising.
        return None
def SpearmanCorrel(self, col1, col2):
    """Calculate the Spearman correlation coefficient between col1 and
    col2, only taking rows into account where both of the values are not
    equal to None. If there are not enough data points to calculate a
    correlation coefficient, None is returned.

    :warning: The function depends on the following module:
      *scipy.stats.mstats*

    :param col1: column name for first column
    :type col1: :class:`str`
    :param col2: column name for second column
    :type col2: :class:`str`
    """
    try:
        import scipy.stats.mstats

        if IsStringLike(col1) and IsStringLike(col2):
            col1 = self.GetColIndex(col1)
            col2 = self.GetColIndex(col2)
        vals1, vals2 = [], []
        for v1, v2 in zip(self[col1], self[col2]):
            if v1 is not None and v2 is not None:
                vals1.append(v1)
                vals2.append(v2)
        # spearmanr returns (correlation, p-value); only the coefficient
        # is of interest here.
        correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
        if scipy.isnan(correl):
            return None
        return correl
    except ImportError:
        LogError("Function needs scipy.stats.mstats, but I could not import it.")
        raise
def Save(self, stream_or_filename, format='ost', sep=','):
    """Save the table to stream or filename. The following file formats
    are supported (for more information on file formats, see :meth:`Load`):

    ============= =======================================
    ost           ost-specific format (human readable)
    csv           comma separated values (human readable)
    pickle        pickled byte stream (binary)
    html          HTML table
    context       ConTeXt table
    ============= =======================================

    :param stream_or_filename: filename or stream for writing output
    :type stream_or_filename: :class:`str` or :class:`file`
    :param format: output format (i.e. *ost*, *csv*, *pickle*)
    :type format: :class:`str`
    :param sep: field separator used by the csv writer
    :raises: :class:`ValueError` if format is unknown
    """
    format = format.lower()
    # Dispatch table instead of an if-chain; each entry defers to the
    # matching private writer.
    savers = {
        'ost': lambda target: self._SaveOST(target),
        'csv': lambda target: self._SaveCSV(target, sep=sep),
        'pickle': lambda target: self._SavePickle(target),
        'html': lambda target: self._SaveHTML(target),
        'context': lambda target: self._SaveContext(target),
    }
    if format not in savers:
        raise ValueError('unknown format "%s"' % format)
    return savers[format](stream_or_filename)
def _SavePickle(self, stream):
    """Pickle the table to *stream* (a file-like object or a filename).

    When given a filename the file is opened in binary mode and closed
    after writing — the previous version leaked the handle.
    """
    if not hasattr(stream, 'write'):
        opened = open(stream, 'wb')
        try:
            cPickle.dump(self, opened, cPickle.HIGHEST_PROTOCOL)
        finally:
            opened.close()
    else:
        cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
2195 def _SaveHTML(self, stream_or_filename):
2197 return s.replace('&', '&').replace('>', '>').replace('<', '<')
2200 if not hasattr(stream_or_filename, 'write'):
2201 stream = open(stream_or_filename, 'w')
2204 stream = stream_or_filename
2205 stream.write('<table>')
2206 stream.write('<tr>')
2207 for col_name in self.col_names:
2208 stream.write('<th>%s</th>' % _escape(col_name))
2209 stream.write('</tr>')
2210 for row in self.rows:
2211 stream.write('<tr>')
2212 for i, col in enumerate(row):
2215 if self.col_types[i] == 'float':
2217 elif self.col_types[i] == 'int':
2219 elif self.col_types[i] == 'bool':
2220 val = col and 'true' or 'false'
2223 stream.write('<td>%s</td>' % _escape(val))
2224 stream.write('</tr>')
2225 stream.write('</table>')
2228 def _SaveContext(self, stream_or_filename):
2230 if not hasattr(stream_or_filename, 'write'):
2231 stream = open(stream_or_filename, 'w')
2234 stream = stream_or_filename
2235 stream.write('\\starttable[')
2236 for col_type in self.col_types:
2237 if col_type =='string':
2239 elif col_type=='int':
2241 elif col_type =='float':
2242 stream.write('i3r|')
2245 stream.write(']\n\\HL\n')
2246 for col_name in self.col_names:
2247 stream.write('\\NC \\bf %s' % col_name)
2248 stream.write(' \\AR\\HL\n')
2249 for row in self.rows:
2250 for i, col in enumerate(row):
2253 if self.col_types[i] == 'float':
2255 elif self.col_types[i] == 'int':
2257 elif self.col_types[i] == 'bool':
2258 val = col and 'true' or 'false'
2261 stream.write('\\NC %s' % val)
2262 stream.write(' \\AR\n')
2263 stream.write('\\HL\n')
2264 stream.write('\\stoptable')
2268 def _SaveCSV(self, stream, sep):
2269 if not hasattr(stream, 'write'):
2270 stream=open(stream, 'wb')
2272 writer=csv.writer(stream, delimiter=sep)
2273 writer.writerow(['%s' % n for n in self.col_names])
2274 for row in self.rows:
2276 for i, c in enumerate(row):
2279 writer.writerow(row)
2281 def _SaveOST(self, stream):
2282 if hasattr(stream, 'write'):
2283 writer=csv.writer(stream, delimiter=' ')
2285 stream=open(stream, 'w')
2286 writer=csv.writer(stream, delimiter=' ')
2288 stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
2289 writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
2290 for row in self.rows:
2292 for i, c in enumerate(row):
2295 writer.writerow(row)
def GetNumpyMatrix(self, *args):
    """Return a numpy matrix containing the selected columns from the
    table as columns in the matrix.

    Only columns of type *int* or *float* are supported.

    :param args: column names to include in numpy matrix
    :raises: :class:`RuntimeError` if no column is specified,
             :class:`TypeError` for non-numeric columns
    :warning: The function depends on *numpy*
    """
    try:
        import numpy as np

        if len(args) == 0:
            raise RuntimeError("At least one column must be specified.")
        idxs = []
        for arg in args:
            idx = self.GetColIndex(arg)
            col_type = self.col_types[idx]
            if col_type not in ('int', 'float'):
                raise TypeError("Numpy matrix can only be generated from numeric column types")
            idxs.append(idx)
        # Rows of the temporary matrix are table columns; transpose so
        # each requested column becomes a matrix column.
        m = np.matrix([list(self[i]) for i in idxs])
        return m.T
    except ImportError:
        LogError("Function needs numpy, but I could not import it.")
        raise
2332 def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
2335 In place Gaussian smooth of a column in the table with a given standard deviation.
2336 All nan are set to nan_value before smoothing.
2338 :param col: column name
2339 :type col: :class:`str`
2341 :param std: standard deviation for gaussian kernel
2344 :param na_value: all na (None) values of the speciefied column are set to na_value before smoothing
2345 :type na_value: `scalar`
2347 :param padding: allows to handle padding behaviour see scipy ndimage.gaussian_filter1d documentation for more information. standard is reflect
2348 :type padding: :class:`str`
2350 :param c: constant value used for padding if padding mode is constant
2355 :warning: The function depends on *scipy*
2359 from scipy import ndimage
2362 LogError("I need scipy.ndimage and numpy, but could not import it")
2365 idx = self.GetColIndex(col)
2366 col_type = self.col_types[idx]
2367 if col_type!='int' and col_type!='float':
2368 raise TypeError("GaussianSmooth can only be used on numeric column types")
2375 vals.append(na_value)
2378 smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
2382 for v in smoothed_values_ndarray:
2388 def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
2390 This returns the optimal prefactor values (i.e. a, b, c, ...) for the
2396 a*u + b*v + c*w + ... = z
2398 where u, v, w and z are vectors. In matrix notation
2405 where A contains the data from the table (u,v,w,...), p are the prefactors
2406 to optimize (a,b,c,...) and z is the vector containing the result of
2409 The parameter ref_col equals to z in both equations, and \*args are columns
2410 u, v and w (or A in :eq:`op2`). All columns must be specified by their names.
2414 .. code-block:: python
2416 tab.GetOptimalPrefactors('colC', 'colA', 'colB')
2418 The function returns a list of containing the prefactors a, b, c, ... in
2419 the correct order (i.e. same as columns were specified in \*args).
2422 If the kwarg weights="columX" is specified, the equations are weighted by
2423 the values in that column. Each row is multiplied by the weight in that row,
2424 which leads to :eq:`op3`:
2429 weight*a*u + weight*b*v + weight*c*w + ... = weight*z
2431 Weights must be float or int and can have any value. A value of 0 ignores
2432 this equation, a value of 1 means the same as no weight. If all weights are
2433 the same for each row, the same result will be obtained as with no weights.
2437 .. code-block:: python
2439 tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
2446 raise RuntimeError("At least one column must be specified.")
2448 b = self.GetNumpyMatrix(ref_col)
2449 a = self.GetNumpyMatrix(*args)
2452 if kwargs.has_key('weights'):
2453 w = self.GetNumpyMatrix(kwargs['weights'])
2454 b = np.multiply(b,w)
2455 a = np.multiply(a,w)
2458 raise RuntimeError("specified unrecognized kwargs, use weights as key")
2461 return list(np.array(k.T).reshape(-1))
2464 LogError("Function needs numpy, but I could not import it.")
2467 def PlotEnrichment(self, score_col, class_col, score_dir='-',
2468 class_dir='-', class_cutoff=2.0,
2469 style='-', title=None, x_title=None, y_title=None,
2470 clear=True, save=None):
2472 Plot an enrichment curve using matplotlib of column *score_col* classified
2473 according to *class_col*.
2475 For more information about parameters of the enrichment, see
2476 :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
2478 :warning: The function depends on *matplotlib*
2481 import matplotlib.pyplot as plt
2483 enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
2484 class_dir, class_cutoff)
2487 title = 'Enrichment of %s'%score_col
2490 x_title = '% database'
2493 y_title = '% positives'
2498 plt.plot(enrx, enry, style)
2500 plt.title(title, size='x-large', fontweight='bold')
2501 plt.ylabel(y_title, size='x-large')
2502 plt.xlabel(x_title, size='x-large')
2509 LogError("Function needs matplotlib, but I could not import it.")
2512 def ComputeEnrichment(self, score_col, class_col, score_dir='-',
2513 class_dir='-', class_cutoff=2.0):
2515 Computes the enrichment of column *score_col* classified according to
2518 For this it is necessary, that the datapoints are classified into positive
2519 and negative points. This can be done in two ways:
2521 - by using one 'bool' type column (*class_col*) which contains *True* for
2522 positives and *False* for negatives
2524 - by specifying a classification column (*class_col*), a cutoff value
2525 (*class_cutoff*) and the classification columns direction (*class_dir*).
2526 This will generate the classification on the fly
2528 * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
2529 * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
2531 During the calculation, the table will be sorted according to *score_dir*,
2532 where a '-' values means smallest values first and therefore, the smaller
2533 the value, the better.
2535 :warning: If either the value of *class_col* or *score_col* is *None*, the
2536 data in this row is ignored.
2539 ALLOWED_DIR = ['+','-']
2541 score_idx = self.GetColIndex(score_col)
2542 score_type = self.col_types[score_idx]
2543 if score_type!='int' and score_type!='float':
2544 raise TypeError("Score column must be numeric type")
2546 class_idx = self.GetColIndex(class_col)
2547 class_type = self.col_types[class_idx]
2548 if class_type!='int' and class_type!='float' and class_type!='bool':
2549 raise TypeError("Classifier column must be numeric or bool type")
2551 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2552 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2554 self.Sort(score_col, score_dir)
2559 old_score_val = None
2562 for row in self.rows:
2563 class_val = row[class_idx]
2564 score_val = row[score_idx]
2565 if class_val==None or score_val==None:
2568 if old_score_val==None:
2569 old_score_val = score_val
2570 if score_val!=old_score_val:
2573 old_score_val = score_val
2575 if class_type=='bool':
2579 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2584 # if no false positives or false negatives values are found return None
2585 if x[-1]==0 or y[-1]==0:
2588 x = [float(v)/x[-1] for v in x]
2589 y = [float(v)/y[-1] for v in y]
def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
                         class_dir='-', class_cutoff=2.0):
    """Compute the area under the enrichment curve using the trapezoidal
    rule.

    For the meaning of the parameters, see :meth:`ComputeEnrichment`.
    Returns ``None`` when the enrichment itself is undefined.

    :warning: The function depends on *numpy*
    """
    try:
        import numpy as np

        enr = self.ComputeEnrichment(score_col, class_col, score_dir,
                                     class_dir, class_cutoff)
        if enr is None:
            return None
        # NumPy 2.0 renamed trapz to trapezoid; support both.
        trapz = getattr(np, 'trapz', None)
        if trapz is None:
            trapz = np.trapezoid
        return trapz(enr[1], enr[0])
    except ImportError:
        LogError("Function needs numpy, but I could not import it.")
        raise
2616 def ComputeROC(self, score_col, class_col, score_dir='-',
2617 class_dir='-', class_cutoff=2.0):
2619 Computes the receiver operating characteristics (ROC) of column *score_col*
2620 classified according to *class_col*.
2622 For this it is necessary, that the datapoints are classified into positive
2623 and negative points. This can be done in two ways:
2625 - by using one 'bool' column (*class_col*) which contains True for positives
2626 and False for negatives
2627 - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
2628 and the classification columns direction (*class_dir*). This will generate
2629 the classification on the fly
2631 - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
2632 - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
2634 During the calculation, the table will be sorted according to *score_dir*,
2635 where a '-' values means smallest values first and therefore, the smaller
2636 the value, the better.
2638 If *class_col* does not contain any positives (i.e. value is True (if column
2639 is of type bool) or evaluated to True (if column is of type int or float
2640 (depending on *class_dir* and *class_cutoff*))) the ROC is not defined and
2641 the function will return *None*.
2643 :warning: If either the value of *class_col* or *score_col* is *None*, the
2644 data in this row is ignored.
2647 ALLOWED_DIR = ['+','-']
2649 score_idx = self.GetColIndex(score_col)
2650 score_type = self.col_types[score_idx]
2651 if score_type!='int' and score_type!='float':
2652 raise TypeError("Score column must be numeric type")
2654 class_idx = self.GetColIndex(class_col)
2655 class_type = self.col_types[class_idx]
2656 if class_type!='int' and class_type!='float' and class_type!='bool':
2657 raise TypeError("Classifier column must be numeric or bool type")
2659 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2660 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2662 self.Sort(score_col, score_dir)
2668 old_score_val = None
2670 for i,row in enumerate(self.rows):
2671 class_val = row[class_idx]
2672 score_val = row[score_idx]
2673 if class_val==None or score_val==None:
2676 if old_score_val==None:
2677 old_score_val = score_val
2678 if score_val!=old_score_val:
2681 old_score_val = score_val
2682 if class_type=='bool':
2688 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2695 # if no false positives or false negatives values are found return None
2696 if x[-1]==0 or y[-1]==0:
2699 x = [float(v)/x[-1] for v in x]
2700 y = [float(v)/y[-1] for v in y]
def ComputeROCAUC(self, score_col, class_col, score_dir='-',
                  class_dir='-', class_cutoff=2.0):
    """Compute the area under the curve of the receiver operating
    characteristics using the trapezoidal rule.

    For the meaning of the parameters, see :meth:`ComputeROC`. Returns
    ``None`` when the ROC itself is undefined.

    :warning: The function depends on *numpy*
    """
    try:
        import numpy as np

        roc = self.ComputeROC(score_col, class_col, score_dir,
                              class_dir, class_cutoff)
        if roc is None:
            return None
        # NumPy 2.0 renamed trapz to trapezoid; support both.
        trapz = getattr(np, 'trapz', None)
        if trapz is None:
            trapz = np.trapezoid
        return trapz(roc[1], roc[0])
    except ImportError:
        LogError("Function needs numpy, but I could not import it.")
        raise
2727 def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
2728 class_dir='-', class_cutoff=2.0):
2730 Computes the area under the curve of the log receiver operating
2731 characteristics (logROC) where the x-axis is semilogarithmic
2732 using the trapezoidal rule.
2734 The logROC is computed with a lambda of 0.001 according to
2735 Rapid Context-Dependent Ligand Desolvation in Molecular Docking
2736 Mysinger M. and Shoichet B., Journal of Chemical Information and Modeling
2737 2010 50 (9), 1561-1573
2739 For more information about parameters of the ROC, see
2742 :warning: The function depends on *numpy*
2747 roc = self.ComputeROC(score_col, class_col, score_dir,
2748 class_dir, class_cutoff)
2760 # remove all duplicate x-values
2761 rocxt = [x if x>0 else l for x in rocxt]
2762 for i in range(len(rocxt)-1):
2763 if rocxt[i]==rocxt[i+1]:
2765 rocx.append(rocxt[i])
2766 rocy.append(rocyt[i])
2772 for i in range(len(rocx)-1):
2774 if rocx[i]==rocx[i+1]:
2776 b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
2777 value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
2778 return value/math.log10(1.0/l)
2781 LogError("Function needs numpy, but I could not import it.")
2784 def PlotROC(self, score_col, class_col, score_dir='-',
2785 class_dir='-', class_cutoff=2.0,
2786 style='-', title=None, x_title=None, y_title=None,
2787 clear=True, save=None):
2789 Plot an ROC curve using matplotlib.
2791 For more information about parameters of the ROC, see
2792 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2794 :warning: The function depends on *matplotlib*
2798 import matplotlib.pyplot as plt
2800 roc = self.ComputeROC(score_col, class_col, score_dir,
2801 class_dir, class_cutoff)
2809 title = 'ROC of %s'%score_col
2812 x_title = 'false positive rate'
2815 y_title = 'true positive rate'
2820 plt.plot(enrx, enry, style)
2822 plt.title(title, size='x-large', fontweight='bold')
2823 plt.ylabel(y_title, size='x-large')
2824 plt.xlabel(x_title, size='x-large')
2831 LogError("Function needs matplotlib, but I could not import it.")
2834 def PlotLogROC(self, score_col, class_col, score_dir='-',
2835 class_dir='-', class_cutoff=2.0,
2836 style='-', title=None, x_title=None, y_title=None,
2837 clear=True, save=None):
2839 Plot an logROC curve where the x-axis is semilogarithmic using matplotlib
2841 For more information about parameters of the ROC, see
2842 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2844 :warning: The function depends on *matplotlib*
2848 import matplotlib.pyplot as plt
2850 roc = self.ComputeROC(score_col, class_col, score_dir,
2851 class_dir, class_cutoff)
2859 title = 'logROC of %s'%score_col
2862 x_title = 'false positive rate'
2865 y_title = 'true positive rate'
2870 rocx = [x if x>0 else 0.001 for x in rocx]
2873 plt.plot(rocx, rocy, style)
2875 plt.title(title, size='x-large', fontweight='bold')
2876 plt.ylabel(y_title, size='x-large')
2877 plt.xlabel(x_title, size='x-large')
2879 plt.xscale('log', basex=10)
2880 plt.xlim(0.001, 1.0)
2888 LogError("Function needs matplotlib, but I could not import it.")
def ComputeMCC(self, score_col, class_col, score_dir='-',
               class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
  """
  Compute the Matthews correlation coefficient (MCC) for one column
  (*score_col*) with the points classified into true positives, false
  positives, true negatives and false negatives according to a specified
  classification column (*class_col*).

  The datapoints in *score_col* and *class_col* are classified into
  positive and negative points. This can be done in two ways:

  - by using 'bool' columns which contain True for positives and False
    for negatives

  - by using 'float' or 'int' columns and specifying a cutoff value and
    the column's direction. This will generate the classification on the
    fly:

    * if ``class_dir``/``score_dir=='-'``: values that are less than or
      equal to *class_cutoff*/*score_cutoff* count as positives
    * if ``class_dir``/``score_dir=='+'``: values that are larger than or
      equal to *class_cutoff*/*score_cutoff* count as positives

  The two possibilities can be used together, i.e. 'bool' type for one
  column and 'float'/'int' type and cutoff/direction for the other column.

  :raises TypeError: if either column is not of 'int', 'float' or 'bool'
                     type
  :raises ValueError: if *score_dir* or *class_dir* is not '+' or '-'
  :returns: the MCC as a float, or *None* (with a warning logged) when
            any of the four factors of the denominator is zero
  """
  ALLOWED_DIR = ['+','-']

  score_idx = self.GetColIndex(score_col)
  score_type = self.col_types[score_idx]
  if score_type!='int' and score_type!='float' and score_type!='bool':
    raise TypeError("Score column must be numeric or bool type")

  class_idx = self.GetColIndex(class_col)
  class_type = self.col_types[class_idx]
  if class_type!='int' and class_type!='float' and class_type!='bool':
    raise TypeError("Classifier column must be numeric or bool type")

  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
    raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))

  tp = 0
  fp = 0
  fn = 0
  tn = 0

  for i,row in enumerate(self.rows):
    class_val = row[class_idx]
    score_val = row[score_idx]
    # rows without a classification value are skipped entirely
    if class_val!=None:
      # true class is positive?
      if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
        # predicted positive as well?
        if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
          tp += 1
        else:
          fn += 1
      else:
        # true class is negative; predicted negative as well?
        if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
          tn += 1
        else:
          fp += 1

  # MCC is undefined when any factor of the denominator is zero
  mcc = None
  msg = None
  if (tp+fn)==0:
    msg = 'factor (tp + fn) is zero'
  elif (tp+fp)==0:
    msg = 'factor (tp + fp) is zero'
  elif (tn+fn)==0:
    msg = 'factor (tn + fn) is zero'
  elif (tn+fp)==0:
    msg = 'factor (tn + fp) is zero'

  if msg:
    LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
  else:
    mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
  return mcc
def IsEmpty(self, col_name=None, ignore_nan=True):
  """
  Checks if a table is empty.

  If no column name is specified, the whole table is checked for being
  empty, whereas if a column name is specified, only this column is
  checked.

  By default, all NAN (or None) values are ignored, and thus, a table
  containing only NAN values is considered as empty. By specifying the
  option ``ignore_nan=False``, NAN values are counted as 'normal' values.

  :param col_name: restrict the check to this column (optional)
  :param ignore_nan: treat None values as absent
  :raises ValueError: if *col_name* is given but the table has no columns
  :returns: True if the table (or column) is empty, False otherwise
  """
  # table with no columns and no rows
  if len(self.col_names)==0:
    if col_name!=None:
      raise ValueError('Table has no column named "%s"' % col_name)
    return True

  # column name specified
  if col_name!=None:
    if self.Count(col_name, ignore_nan=ignore_nan)==0:
      return True
    else:
      return False

  # no column name specified -> test whole table
  else:
    for row in self.rows:
      for cell in row:
        if ignore_nan:
          # only a non-None cell makes the table non-empty
          if cell!=None:
            return False
        else:
          # any cell at all (even None) counts as content
          return False
    return True
def Extend(self, tab, overwrite=None):
  """
  Append each row of *tab* to the current table. The data is appended
  based on the column names, thus the order of the table columns is *not*
  relevant, only the header names.

  If there is a column in *tab* that is not present in the current table,
  it is added to the current table and filled with *None* for all the
  rows present in the current table.

  If the type of any column in *tab* is not the same as in the current
  table a :exc:`TypeError` is raised.

  If *overwrite* is not None and set to an existing column name, the
  specified column in the table is searched for the first occurrence of a
  value matching the value of the column with the same name in the
  dictionary. If a matching value is found, the row is overwritten with
  the dictionary. If no matching row is found, a new row is appended to
  the table.

  :param tab: table whose rows are appended to this one
  :param overwrite: column name used for the overwrite lookup (passed
    through to :meth:`AddRow`)
  :raises TypeError: on a column type mismatch between the two tables
  """
  # add column to current table if it doesn't exist
  for name, typ in zip(tab.col_names, tab.col_types):
    if not name in self.col_names:
      self.AddCol(name, typ)

  # check that column types are the same in current and new table
  for name in self.col_names:
    if name in tab.col_names:
      curr_type = self.col_types[self.GetColIndex(name)]
      new_type = tab.col_types[tab.GetColIndex(name)]
      if curr_type!=new_type:
        raise TypeError('cannot extend table, column %s in new '%name +\
                        'table different type (%s) than in '%new_type +\
                        'current table (%s)'%curr_type)

  # iterate rows directly instead of by index; AddRow handles the
  # per-row merge/overwrite logic
  for row in tab.rows:
    data = dict(zip(tab.col_names, row))
    self.AddRow(data, overwrite)
3045 def Merge(table1, table2, by, only_matching=False):
3047 Returns a new table containing the data
from both tables. The rows are
3048 combined based on the common values
in the column(s) by. The option
'by' can
3049 be a list of column names. When this
is the case, merging
is based on
3051 For example, the two tables below
3069 when merged by column x, produce the following output:
3082 def _key(row, indices):
3083 return tuple([row[i] for i in indices])
3084 def _keep(indices, cn, ct, ni):
3085 ncn, nct, nni=([],[],[])
3086 for i in range(len(cn)):
3087 if i not in indices:
3091 return ncn, nct, nni
3092 col_names=list(table2.col_names)
3093 col_types=list(table2.col_types)
3094 new_index=[i for i in range(len(col_names))]
3095 if isinstance(by, str):
3096 common2_indices=[col_names.index(by)]
3098 common2_indices=[col_names.index(b) for b in by]
3099 col_names, col_types, new_index=_keep(common2_indices, col_names,
3100 col_types, new_index)
3102 for i, name in enumerate(col_names):
3105 while try_name in table1.col_names:
3107 try_name='%s_%d' % (name, counter)
3108 col_names[i]=try_name
3110 if isinstance(by, str):
3111 common1_indices=[table1.col_names.index(by)]
3113 common1_indices=[table1.col_names.index(b) for b in by]
3114 for row in table1.rows:
3115 key=_key(row, common1_indices)
3117 raise ValueError('duplicate key "%s in first table"' % (str(key)))
3120 for row in table2.rows:
3121 key=_key(row, common2_indices)
3123 raise ValueError('duplicate key "%s" in second table' % (str(key)))
3125 new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
3126 for k, v in common1.iteritems():
3127 row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
3132 for i, index in enumerate(new_index):
3133 row[len(table1.col_names)+i]=row2[index]
3134 if only_matching and not matched:
3139 for k, v in common2.iteritems():
3140 if not k in common1:
3141 v2=[v[i] for i in new_index]
3142 row=[None for i in range(len(table1.col_names))]+v2
3143 for common1_index, common2_index in zip(common1_indices, common2_indices):
3144 row[common1_index]=v[common2_index]