9 from ost
import LogError, LogWarning, LogInfo, LogVerbose
12 return col_name.replace(
'_',
' ')
15 if isinstance(value, TableCol)
or isinstance(value, BinaryColExpr):
24 value=value.strip().upper()
25 return value
in (
'',
'NULL',
'NONE',
'NA')
31 if isinstance(value, TableCol)
or isinstance(value, BinaryColExpr):
40 possibilities=set([
'bool',
'int',
'float'])
42 str_ele=str(ele).upper()
46 if 'int' in possibilities:
50 possibilities.remove(
'int')
52 if 'float' in possibilities:
56 possibilities.remove(
'float')
57 if 'bool' in possibilities:
58 if str_ele
not in set([
'YES',
'NO',
'TRUE',
'FALSE']):
59 possibilities.remove(
'bool')
61 if len(possibilities)==0:
63 if len(possibilities)==2:
68 return possibilities.pop()
76 self.
lhs=itertools.cyle([self.
lhs])
78 self.
rhs=itertools.cycle([self.
rhs])
80 for l, r
in zip(self.
lhs, self.
rhs):
81 if l!=
None and r!=
None:
106 for row
in self._table.rows:
110 return len(self._table.rows)
113 return self._table.rows[index][self.
col_index]
116 self._table.rows[index][self.
col_index]=value
132 Essentially a named tuple, but allows column names that are not valid
133 python variable names.
136 self.__dict__[
'tab'] = weakref.proxy(tab)
137 self.__dict__[
'row_data'] = row_data
140 if type(col_name)==int:
141 return self.row_data[col_name]
142 return self.row_data[self.tab.GetColIndex(col_name)]
146 for k, v
in zip(self.__dict__[
'tab'].col_names, self.__dict__[
'row_data']):
147 s.append(
'%s=%s' % (k, str(v)))
152 return len(self.row_data)
155 if type(col_name)==int:
156 self.row_data[col_name] = val
158 self.row_data[self.tab.GetColIndex(col_name)] = val
161 if 'col_names' not in self.tab.__dict__
or col_name
not in self.tab.col_names:
162 raise AttributeError(col_name)
163 return self.row_data[self.tab.GetColIndex(col_name)]
166 if 'col_names' not in self.tab.__dict__
or col_name
not in self.tab.col_names:
167 raise AttributeError(col_name)
168 self.row_data[self.tab.GetColIndex(col_name)] = val
173 The table class provides convenient access to data in tabular form. An empty
174 table can be easily constructed as follows
176 .. code-block:: python
180 If you want to add columns directly when creating the table, column names
181 and *column types* can be specified as follows
183 .. code-block:: python
185 tab = Table(['nameX','nameY','nameZ'], 'sfb')
187 this will create three columns called nameX, nameY and nameZ of type string,
188 float and bool, respectively. There will be no data in the table and thus,
189 the table will not contain any rows.
191 The following *column types* are supported:
202 If you want to add data to the table in addition, use the following:
204 .. code-block:: python
206 tab=Table(['nameX','nameY','nameZ'],
208 nameX = ['a','b','c'],
209 nameY = [0.1, 1.2, 3.414],
210 nameZ = [True, False, False])
212 if values for one column are left out, they will be filled with NA, but if
213 values are specified, all values must be specified (i.e. same number of
218 SUPPORTED_TYPES=(
'int',
'float',
'bool',
'string',)
221 def __init__(self, col_names=[], col_types=None, **kwargs):
231 self.
col_names=[v
for v
in kwargs.keys()]
242 if 'col_names' not in self.__dict__
or col_name
not in self.
col_names:
243 raise AttributeError(col_name)
247 def _ParseColTypes(types, exp_num=None):
251 short2long = {
's' :
'string',
'i':
'int',
'b' :
'bool',
'f' :
'float'}
252 allowed_short = short2long.keys()
253 allowed_long = short2long.values()
260 types = types.lower()
263 if types
in allowed_long:
264 type_list.append(types)
265 elif types
in allowed_short:
266 type_list.append(short2long[types])
269 elif types.find(
',')!=-1:
270 for t
in types.split(
','):
271 if t
in allowed_long:
273 elif t
in allowed_short:
274 type_list.append(short2long[t])
276 raise ValueError(
'Unknown type %s in types %s'%(t,types))
281 if t
in allowed_short:
282 type_list.append(short2long[t])
284 raise ValueError(
'Unknown type %s in types %s'%(t,types))
288 raise ValueError(
'Col type %s must be string or list'%types)
296 if t
in allowed_long:
298 elif t
in allowed_short:
299 type_list.append(short2long[t])
301 raise ValueError(
'Unknown type %s in types %s'%(t,types))
305 raise ValueError(
'Col type %s must be string or list'%types)
308 if len(type_list)!=exp_num:
309 raise ValueError(
'Parsed number of col types (%i) differs from ' + \
310 'expected (%i) in types %s'%(len(type_list),exp_num,types))
316 Set name of the table
319 :type name: :class:`str`
331 Rename column *old_name* to *new_name*.
333 :param old_name: Name of the old column
334 :param new_name: Name of the new column
335 :raises: :exc:`ValueError` when *old_name* is not a valid column
337 if old_name==new_name:
342 def _Coerce(self, value, ty):
344 Try to convert values (e.g. from :class:`str` type) to the specified type
346 :param value: the value
347 :type value: any type
349 :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
351 :type ty: :class:`str`
353 if value==
'NA' or value==
None:
362 if isinstance(value, str)
or isinstance(value, unicode):
363 if value.upper()
in (
'FALSE',
'NO',):
367 raise ValueError(
'Unknown type %s' % ty)
371 Returns the column index for the column with the given name.
373 :raises: ValueError if no column with the name is found.
376 raise ValueError(
'Table has no column named "%s"' % col)
377 return self.col_names.index(col)
381 Returns a list containing all column names.
387 Returns a list of column names matching the regex.
389 :param regex: regex pattern
390 :type regex: :class:`str`
392 :returns: :class:`list` of column names (:class:`str`)
396 matches = re.search(regex, name)
398 matching_names.append(name)
399 return matching_names
403 Checks if the column with a given name is present in the table.
418 value=itertools.cycle([value])
419 for r, v
in zip(self.
rows, value):
422 def ToString(self, float_format='%.3f', int_format='%d', rows=None):
424 Convert the table into a string representation.
426 The output format can be modified for int and float type columns by
427 specifying a formatting string for the parameters *float_format* and
430 The option *rows* specifies the range of rows to be printed. The parameter
431 must be a type that supports indexing (e.g. a :class:`list`) containing the
432 start and end row *index*, e.g. [start_row_idx, end_row_idx].
434 :param float_format: formatting string for float columns
435 :type float_format: :class:`str`
437 :param int_format: formatting string for int columns
438 :type int_format: :class:`str`
440 :param rows: iterable containing start and end row *index*
441 :type rows: iterable containing :class:`ints <int>`
443 widths=[len(cn)
for cn
in self.
col_names]
446 sel_rows=self.
rows[rows[0]:rows[1]]
448 for i, (ty, col)
in enumerate(zip(self.
col_types, row)):
450 widths[i]=max(widths[i], len(
'NA'))
452 widths[i]=max(widths[i], len(float_format % col))
454 widths[i]=max(widths[i], len(int_format % col))
456 widths[i]=max(widths[i], len(str(col)))
459 s+=
''.join([
'# %s\n' % l
for l
in self.comment.split(
'\n')])
460 total_width=sum(widths)+2*len(widths)
461 for width, col_name
in zip(widths, self.
col_names):
462 s+=col_name.center(width+2)
463 s+=
'\n%s\n' % (
'-'*total_width)
465 for width, ty, col
in zip(widths, self.
col_types, row):
468 cs=
'NA'.center(width+2)
470 cs=(float_format % col).rjust(width+2)
472 cs=(int_format % col).rjust(width+2)
474 cs=
' '+str(col).ljust(width+1)
485 Statistics for column %(col)s
487 Number of Rows : %(num)d
488 Number of Rows Not None: %(num_non_null)d
491 Standard Deviation : %(stddev)f
497 'num' : len(self.
rows),
498 'num_non_null' : self.
Count(col),
499 'median' : self.
Median(col),
500 'mean' : self.
Mean(col),
501 'stddev' : self.
StdDev(col),
502 'min' : self.
Min(col),
503 'max' : self.
Max(col),
507 def _AddRowsFromDict(self, d, overwrite=None):
509 Add one or more rows from a :class:`dictionary <dict>`.
511 If *overwrite* is not None and set to an existing column name, the specified
512 column in the table is searched for the first occurrence of a value matching
513 the value of the column with the same name in the dictionary. If a matching
514 value is found, the row is overwritten with the dictionary. If no matching
515 row is found, a new row is appended to the table.
517 :param d: dictionary containing the data
518 :type d: :class:`dict`
520 :param overwrite: column name to overwrite existing row if value in
521 column *overwrite* matches
522 :type overwrite: :class:`str`
524 :raises: :class:`ValueError` if multiple rows are added but the number of
525 data items is different for different columns.
532 for k,v
in d.iteritems():
538 elif old_len!=len(v):
539 raise ValueError(
"Cannot add rows: length of data must be equal " + \
540 "for all columns in %s"%str(d))
543 for i,data
in enumerate(zip(*d.values())):
544 new_row = [
None for a
in range(len(self.
col_names))]
545 for idx,v
in zip(idxs,data):
552 for i,r
in enumerate(self.
rows):
553 if r[overwrite_idx]==new_row[overwrite_idx]:
554 for j,e
in enumerate(self.
rows[i]):
557 self.
rows[i] = new_row
562 if not overwrite
or not added:
563 self.rows.append(new_row)
567 Two-sided test for the null-hypothesis that two related samples
568 have the same average (expected values).
570 :param col_a: First column
571 :type col_a: :class:`str`
572 :param col_b: Second column
573 :type col_b: :class:`str`
575 :returns: P-value between 0 and 1 that the two columns have the
576 same average. The smaller the value, the less related the two
579 from scipy.stats
import ttest_rel
582 for x, y
in self.
Zip(col_a, col_b):
583 if x!=
None and y!=
None:
586 result = ttest_rel(xs, ys)
591 Add a row to the table.
593 *data* may either be a dictionary or a list-like object:
595 - If *data* is a dictionary, the keys in the dictionary must match the
596 column names. Columns not found in the dict will be initialized to None.
597 If the dict contains list-like objects, multiple rows will be added, if
598 the number of items in all list-like objects is the same, otherwise a
599 :class:`ValueError` is raised.
601 - If *data* is a list-like object, the row is initialized from the values
602 in *data*. The number of items in *data* must match the number of
603 columns in the table. A :class:`ValueError` is raised otherwise. The
604 values are added in the order specified in the list, thus, the order of
605 the data must match the columns.
607 If *overwrite* is not None and set to an existing column name, the specified
608 column in the table is searched for the first occurrence of a value matching
609 the value of the column with the same name in the dictionary. If a matching
610 value is found, the row is overwritten with the dictionary. If no matching
611 row is found, a new row is appended to the table.
613 :param data: data to add
614 :type data: :class:`dict` or *list-like* object
616 :param overwrite: column name to overwrite existing row if value in
617 column *overwrite* matches
618 :type overwrite: :class:`str`
620 :raises: :class:`ValueError` if *list-like* object is used and number of
621 items does *not* match number of columns in table.
623 :raises: :class:`ValueError` if *dict* is used and multiple rows are added
624 but the number of data items is different for different columns.
626 **Example:** add multiple data rows to a subset of columns using a dictionary
628 .. code-block:: python
630 # create table with three float columns
631 tab = Table(['x','y','z'], 'fff')
634 data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
639 will produce the table
649 # overwrite the row with x=1.2 and add row with x=1.9
650 data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
651 tab.AddRow(data, overwrite='x')
655 will produce the table
670 msg=
'data array must have %d elements, not %d'
671 raise ValueError(msg % (len(self.
col_names), len(data)))
678 for i,r
in enumerate(self.
rows):
679 if r[overwrite_idx]==new_row[overwrite_idx]:
680 self.
rows[i] = new_row
685 if not overwrite
or not added:
686 self.rows.append(new_row)
690 Remove column with the given name from the table.
692 :param col: name of column to remove
693 :type col: :class:`str`
698 for row
in self.
rows:
701 def AddCol(self, col_name, col_type, data=None):
703 Add a column to the right of the table.
705 :param col_name: name of new column
706 :type col_name: :class:`str`
708 :param col_type: type of new column (long versions: *int*, *float*, *bool*,
709 *string* or short versions: *i*, *f*, *b*, *s*)
710 :type col_type: :class:`str`
712 :param data: data to add to new column
713 :type data: scalar or iterable
717 .. code-block:: python
719 tab = Table(['x'], 'f', x=range(5))
720 tab.AddCol('even', 'bool', itertools.cycle([True, False]))
724 will produce the table
737 If data is a constant instead of an iterable object, its value
738 will be written into each row:
740 .. code-block:: python
742 tab = Table(['x'], 'f', x=range(5))
743 tab.AddCol('num', 'i', 1)
747 will produce the table
760 As a special case, if there are no previous rows, and data is not
761 None, rows are added for every item in data.
765 raise ValueError(
'Column with name %s already exists'%col_name)
768 self.col_names.append(col_name)
769 self.col_types.append(col_type)
773 for row
in self.
rows:
776 if hasattr(data,
'__len__')
and len(data)!=len(self.
rows):
779 raise ValueError(
'Length of data (%i) must correspond to number of '%len(data) +\
780 'existing rows (%i)'%len(self.
rows))
781 for row, d
in zip(self.
rows, data):
784 elif data!=
None and len(self.
col_names)==1:
786 self.
AddRow({col_name : data})
789 self.
AddRow({col_name : v})
793 Returns a filtered table only containing rows matching all the predicates
794 in kwargs and args. For example,
796 .. code-block:: python
798 tab.Filter(town='Basel')
800 will return all the rows where the value of the column "town" is equal to
801 "Basel". Several predicates may be combined, i.e.
803 .. code-block:: python
805 tab.Filter(town='Basel', male=True)
807 will return the rows with "town" equal to "Basel" and "male" equal to true.
808 args are unary callables returning true if the row should be included in the
809 result and false if not.
812 for row
in self.
rows:
818 for key, val
in kwargs.iteritems():
830 Returns a new table object containing all rows matching a logical query
833 *query* is a string containing the logical expression, that will be
834 evaluated for every row.
836 Operands have to be the name of a column or an expression that can be
837 parsed to float, int, bool or string.
838 Valid operators are: and, or, !=, !, <=, >=, ==, =, <, >, +, -, \*, /
840 .. code-block:: python
842 subtab = tab.Select('col_a>0.5 and (col_b=5 or col_c=5)')
844 The selection query should be self-explanatory. Allowed parentheses are:
845 (), [], {}, whereas parenthesis mismatches get recognized. Expressions like
846 '3<=col_a>=col_b' throw an error, due to problems in figuring out the
849 There are two special expressions:
851 .. code-block:: python
853 #selects rows, where 1.0<=col_a<=1.5
854 subtab = tab.Select('col_a=1.0:1.5')
856 #selects rows, where col_a=1 or col_a=2 or col_a=3
857 subtab = tab.Select('col_a=1,2,3')
859 Only consistent types can be compared. If col_a is of type string and col_b
860 is of type int, the following expression would throw an error: 'col_a<col_b'
864 from table_selector
import TableSelector
866 raise ImportError(
"Tried to import from the file table_selector.py, but could not find it!")
872 for row
in self.
rows:
873 if selector.EvaluateRow(row):
874 selected_tab.AddRow(row)
880 def _LoadOST(stream_or_filename):
881 fieldname_pattern=re.compile(
r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
882 values_pattern=re.compile(
"([^\" ]+|\"[^\"]*\")+")
883 if not hasattr(stream_or_filename,
'read'):
884 stream=open(stream_or_filename,
'r')
886 stream=stream_or_filename
891 if line.startswith(
'#'):
899 for col
in line.split():
900 match=fieldname_pattern.match(col)
902 if match.group(
'type'):
903 fieldtypes.append(match.group(
'type'))
905 fieldtypes.append(
'string')
906 fieldnames.append(match.group(
'name'))
907 tab=
Table(fieldnames, fieldtypes)
910 tab.AddRow([x.strip(
'"')
for x
in values_pattern.findall(line)])
912 raise IOError(
"Cannot read table from empty stream")
915 def _GuessColumnTypes(self):
916 for col_idx
in range(len(self.
col_names)):
918 for row
in self.
rows:
919 for idx
in range(len(row)):
923 def _LoadCSV(stream_or_filename, sep):
924 if not hasattr(stream_or_filename,
'read'):
925 stream=open(stream_or_filename,
'r')
927 stream=stream_or_filename
928 reader=csv.reader(stream, delimiter=sep)
934 tab=
Table(header, types)
939 raise IOError(
'trying to load table from empty CSV stream/file')
941 tab._GuessColumnTypes()
945 def _LoadPickle(stream_or_filename):
946 if not hasattr(stream_or_filename,
'read'):
947 stream=open(stream_or_filename,
'rb')
949 stream=stream_or_filename
950 return cPickle.load(stream)
953 def _GuessFormat(filename):
955 filename = filename.name
956 except AttributeError, e:
958 if filename.endswith(
'.csv'):
960 elif filename.endswith(
'.pickle'):
967 def Load(stream_or_filename, format='auto', sep=','):
969 Load table from stream or file with given name.
971 By default, the file format is set to *auto*, which tries to guess the file
972 format from the file extension. The following file extensions are
975 ============ ======================
976 extension recognized format
977 ============ ======================
978 .csv comma separated values
979 .pickle pickled byte stream
980 <all others> ost-specific format
981 ============ ======================
983 Thus, *format* must be specified for reading file with different filename
986 The following file formats are understood:
990 This is an ost-specific, but still human readable file format. The file
991 (stream) must start with a header line of the form
993 col_name1[type1] <col_name2[type2]>...
995 The types given in brackets must be one of the data types the
996 :class:`Table` class understands. Each following line in the file then must
997 contain exactly the same number of data items as listed in the header. The
998 data items are automatically converted to the column format. Lines starting
999 with a '#' and empty lines are ignored.
1003 Deserializes the table from a pickled byte stream.
1007 Reads the table from comma separated values stream. Since there is no
1008 explicit type information in the csv file, the column types are guessed,
1009 using the following simple rules:
1011 * if all values are either NA/NULL/NONE the type is set to string.
1012 * if all non-null values are convertible to float/int the type is set to
1014 * if all non-null values are true/false/yes/no, the type is set to bool.
1015 * for all other cases, the column type is set to string.
1017 :returns: A new :class:`Table` instance
1019 format=format.lower()
1021 format = Table._GuessFormat(stream_or_filename)
1024 return Table._LoadOST(stream_or_filename)
1026 return Table._LoadCSV(stream_or_filename, sep=sep)
1027 if format==
'pickle':
1028 return Table._LoadPickle(stream_or_filename)
1029 raise ValueError(
'unknown format ""' % format)
1033 Performs an in-place sort of the table, based on column *by*.
1035 :param by: column name by which to sort
1036 :type by: :class:`str`
1038 :param order: ascending (``-``) or descending (``+``) order
1039 :type order: :class:`str` (i.e. *+*, *-*)
1045 def _key_cmp(lhs, rhs):
1046 return sign*cmp(lhs[key_index], rhs[key_index])
1047 self.
rows=sorted(self.
rows, _key_cmp)
1051 Extract a list of all unique values from one column.
1053 :param col: column name
1054 :type col: :class:`str`
1056 :param ignore_nan: ignore all *None* values
1057 :type ignore_nan: :class:`bool`
1062 for row
in self.
rows:
1064 if item!=
None or ignore_nan==
False:
1065 if item
in seen:
continue
1072 Allows to conveniently iterate over a selection of columns, e.g.
1074 .. code-block:: python
1076 tab = Table.Load('...')
1077 for col1, col2 in tab.Zip('col1', 'col2'):
1082 .. code-block:: python
1084 tab = Table.Load('...')
1085 for col1, col2 in zip(tab['col1'], tab['col2']):
1088 return zip(*[self[arg]
for arg
in args])
1090 def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
1091 z_title=
None, x_range=
None, y_range=
None, z_range=
None,
1092 color=
None, plot_if=
None, legend=
None,
1093 num_z_levels=10, z_contour=
True, z_interpol=
'nn', diag_line=
False,
1094 labels=
None, max_num_labels=
None, title=
None, clear=
True, save=
False,
1097 Function to plot values from your table in 1, 2 or 3 dimensions using
1098 `Matplotlib <http://matplotlib.sourceforge.net>`__
1100 :param x: column name for first dimension
1101 :type x: :class:`str`
1103 :param y: column name for second dimension
1104 :type y: :class:`str`
1106 :param z: column name for third dimension
1107 :type z: :class:`str`
1109 :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
1110 complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1111 :type style: :class:`str`
1113 :param x_title: title for first dimension, if not specified it is
1114 automatically derived from column name
1115 :type x_title: :class:`str`
1117 :param y_title: title for second dimension, if not specified it is
1118 automatically derived from column name
1119 :type y_title: :class:`str`
1121 :param z_title: title for third dimension, if not specified it is
1122 automatically derived from column name
1123 :type z_title: :class:`str`
1125 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1126 :type x_range: :class:`list` of length two
1128 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1129 :type y_range: :class:`list` of length two
1131 :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
1132 :type z_range: :class:`list` of length two
1134 :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
1135 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
1136 :type color: :class:`str`
1138 :param plot_if: callable which returns *True* if row should be plotted. Is
1139 invoked like ``plot_if(self, row)``
1140 :type plot_if: callable
1142 :param legend: legend label for data series
1143 :type legend: :class:`str`
1145 :param num_z_levels: number of levels for third dimension
1146 :type num_z_levels: :class:`int`
1148 :param diag_line: draw diagonal line
1149 :type diag_line: :class:`bool`
1151 :param labels: column name containing labels to put on x-axis for one
1153 :type labels: :class:`str`
1155 :param max_num_labels: limit maximum number of labels
1156 :type max_num_labels: :class:`int`
1158 :param title: plot title, if not specified it is automatically derived from
1159 plotted column names
1160 :type title: :class:`str`
1162 :param clear: clear old data from plot
1163 :type clear: :class:`bool`
1165 :param save: filename for saving plot
1166 :type save: :class:`str`
1168 :param z_contour: draw contour lines
1169 :type z_contour: :class:`bool`
1171 :param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
1173 :type z_interpol: :class:`str`
1175 :param \*\*kwargs: additional arguments passed to matplotlib
1177 :returns: the ``matplotlib.pyplot`` module
1179 **Examples:** simple plotting functions
1181 .. code-block:: python
1183 tab = Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
1184 b=[x/2.0 for x in range(1,6)],
1185 c=[math.cos(x) for x in range(0,5)],
1188 # one dimensional plot of column 'd' vs. index
1192 # two dimensional plot of 'a' vs. 'c'
1193 plt = tab.Plot('a', y='c', style='o-')
1196 # three dimensional plot of 'a' vs. 'c' with values 'b'
1197 plt = tab.Plot('a', y='c', z='b')
1198 # manually save plot to file
1199 plt.savefig("plot.png")
1202 import matplotlib.pyplot
as plt
1203 import matplotlib.mlab
as mlab
1211 plt.figure(figsize=[8, 6])
1234 if x_range
and (
IsScalar(x_range)
or len(x_range)!=2):
1235 raise ValueError(
'parameter x_range must contain exactly two elements')
1236 if y_range
and (
IsScalar(y_range)
or len(y_range)!=2):
1237 raise ValueError(
'parameter y_range must contain exactly two elements')
1238 if z_range
and (
IsScalar(z_range)
or len(z_range)!=2):
1239 raise ValueError(
'parameter z_range must contain exactly two elements')
1242 kwargs[
'color']=color
1244 kwargs[
'label']=legend
1248 for row
in self.
rows:
1249 if row[idx1]!=
None and row[idx2]!=
None and row[idx3]!=
None:
1250 if plot_if
and not plot_if(self, row):
1252 xs.append(row[idx1])
1253 ys.append(row[idx2])
1254 zs.append(row[idx3])
1257 z_spacing = (z_range[1] - z_range[0]) / num_z_levels
1261 z_spacing = (self.
Max(z) - l) / num_z_levels
1263 for i
in range(0,num_z_levels+1):
1267 xi = np.linspace(min(xs),max(xs),len(xs)*10)
1268 yi = np.linspace(min(ys),max(ys),len(ys)*10)
1269 zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
1272 plt.contour(xi,yi,zi,levels,linewidths=0.5,colors=
'k')
1274 plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
1275 plt.colorbar(ticks=levels)
1279 for row
in self.
rows:
1280 if row[idx1]!=
None and row[idx2]!=
None:
1281 if plot_if
and not plot_if(self, row):
1283 xs.append(row[idx1])
1284 ys.append(row[idx2])
1285 plt.plot(xs, ys, style, **kwargs)
1292 for row
in self.
rows:
1294 if plot_if
and not plot_if(self, row):
1296 xs.append(row[idx1])
1298 label_vals.append(row[label_idx])
1299 plt.plot(xs, style, **kwargs)
1303 if len(label_vals)>max_num_labels:
1304 interval = int(math.ceil(float(len(label_vals))/max_num_labels))
1305 label_vals = label_vals[::interval]
1306 plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
1311 title =
'%s of %s vs. %s' % (nice_z, nice_x, nice_y)
1313 title =
'%s vs. %s' % (nice_x, nice_y)
1317 plt.title(title, size=
'x-large', fontweight=
'bold',
1318 verticalalignment=
'bottom')
1324 plt.xlabel(nice_x, size=
'x-large')
1326 plt.xlim(x_range[0], x_range[1])
1328 plt.ylim(y_range[0], y_range[1])
1330 plt.plot(x_range, y_range,
'-', color=
'black')
1332 plt.ylabel(nice_y, size=
'x-large')
1335 plt.ylim(y_range[0], y_range[1])
1337 plt.xlabel(x_title, size=
'x-large')
1338 plt.ylabel(nice_y, size=
'x-large')
1343 LogError(
"Function needs numpy and matplotlib, but I could not import it.")
1346 def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
1347 histtype=
'stepfilled', align=
'mid', x_title=
None,
1348 y_title=
None, title=
None, clear=
True, save=
False,
1349 color=
None, y_range=
None):
1351 Create a histogram of the data in col for the range *x_range*, split into
1352 *num_bins* bins and plot it using Matplotlib.
1354 :param col: column name with data
1355 :type col: :class:`str`
1357 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1358 :type x_range: :class:`list` of length two
1360 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1361 :type y_range: :class:`list` of length two
1363 :param num_bins: number of bins in range
1364 :type num_bins: :class:`int`
1366 :param color: Color to be used for the histogram. If not set, color will be
1367 determined by matplotlib
1368 :type color: :class:`str`
1370 :param normed: normalize histogram
1371 :type normed: :class:`bool`
1373 :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
1374 *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1375 :type histtype: :class:`str`
1377 :param align: style of histogram (*left*, *mid*, *right*). See
1378 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1379 :type align: :class:`str`
1381 :param x_title: title for first dimension, if not specified it is
1382 automatically derived from column name
1383 :type x_title: :class:`str`
1385 :param y_title: title for second dimension, if not specified it is
1386 automatically derived from column name
1387 :type y_title: :class:`str`
1389 :param title: plot title, if not specified it is automatically derived from
1390 plotted column names
1391 :type title: :class:`str`
1393 :param clear: clear old data from plot
1394 :type clear: :class:`bool`
1396 :param save: filename for saving plot
1397 :type save: :class:`str`
1399 **Examples:** simple plotting functions
1401 .. code-block:: python
1403 tab = Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
1405 # histogram of column 'a'
1406 plt = tab.PlotHistogram('a')
1411 import matplotlib.pyplot
as plt
1414 if len(self.
rows)==0:
1418 kwargs[
'color']=color
1428 n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
1429 normed=normed, histtype=histtype, align=align,
1436 plt.xlabel(nice_x, size=
'x-large')
1443 plt.ylabel(nice_y, size=
'x-large')
1448 nice_title=
"Histogram of %s"%nice_x
1449 plt.title(nice_title, size=
'x-large', fontweight=
'bold')
1455 LogError(
"Function needs numpy and matplotlib, but I could not import it.")
1458 def _Max(self, col):
1459 if len(self.
rows)==0:
1463 if col_type==
'int' or col_type==
'float':
1464 max_val = -float(
'inf')
1465 elif col_type==
'bool':
1467 elif col_type==
'string':
1470 for i
in range(0, len(self.
rows)):
1471 if self.
rows[i][idx]>max_val:
1472 max_val = self.
rows[i][idx]
1474 return max_val, max_idx
1476 def PlotBar(self, cols=None, rows=None, xlabels=None, set_xlabels=True, xlabels_rotation='horizontal', y_title=None, title=None,
1477 colors=
None, width=0.8, bottom=0, legend=
False, legend_names=
None, show=
False, save=
False):
1480 Create a barplot of the data in cols. Every column will be represented
1481 at one position. If there are several rows, each column will be grouped
1484 :param cols: List of column names. Every column will be represented as a
1485 single bar. If cols is None, every column of the table gets
1487 :type cols: :class:`list`
1489 :param rows: List of row indices. Values from given rows will be plotted
1490 in parallel at one column position. If set to None, all rows
1491 of the table will be plotted. Note, that the maximum number
1493 :type rows: :class:`list`
1495 :param xlabels: Label for every col on x-axis. If set to None, the column
1496 names are used. The xlabel plotting can be suppressed by
1497 the parameter set_xlabels.
1498 :type xlabels: :class:`list`
1500 :param set_xlabels: Controls whether xlabels are plotted or not.
1501 :type set_xlabels: :class:`bool`
1503 :param xlabels_rotation: Can either be 'horizontal', 'vertical' or an
1504 integer, that describes the rotation in degrees.
1506 :param y_title: Y-axis description
1507 :type y_title: :class:`str`
1509 :param title: Title of the plot. No title appears if set to None
1510 :type title: :class:`str`
1512 :param colors: Colors of the different bars in each group. Must be a list
1513 of valid colors in matplotlib. Length of color and rows must
1515 :type colors: :class:`list`
1517 :param width: The available space for the groups on the x-axis is divided
1518 by the exact number of groups. The parameter width is the
1519 fraction of what is actually used. If it would be 1.0 the
1520 bars of the different groups would touch each other.
1521 Value must be in (0;1]
1522 :type width: :class:`float`
1524 :param bottom: Bottom
1525 :type bottom: :class:`float`
1527 :param legend: Legend for color explanation, the corresponding row
1528 respectively. If set to True, legend_names must be provided.
1529 :type legend: :class:`bool`
1531 :param legend_names: List of names, that describe the differently colored
1532 bars. Length must be consistent with number of rows.
1534 :param show: If set to True, the plot is directly displayed.
1536 :param save: If set, a png image with name save in the current working
1537 directory will be saved.
1538 :type save: :class:`str`
1543 import matplotlib.pyplot
as plt
1545 raise ImportError(
'PlotBar relies on numpy and matplotlib, but I could' \
1548 standard_colors=[
'b',
'g',
'y',
'c',
'm',
'r','k']
1554 if width<=0
or width>1:
1555 raise ValueError(
'Width must be in [0;1]')
1558 if len(self.
rows)>7:
1559 raise ValueError(
'Table contains too many rows to represent them at one '\
1560 'bar position in parallel. You can Select a Subtable or '\
1561 'specify the parameter rows with a list of row indices '\
1564 rows=range(len(self.
rows))
1566 if not isinstance(rows,list):
1569 raise ValueError(
'Too many rows to represent (max 7). Please note, that '\
1570 'data from multiple rows from one column gets '\
1571 'represented at one position in parallel.')
1574 row=self.
rows[r_idx]
1580 raise ValueError(
'Cannot find column with name '+str(c))
1581 temp.append(row[c_idx])
1585 colors=standard_colors[:len(rows)]
1587 if len(rows)!=len(colors):
1588 raise ValueError(
"Number of rows and number of colors must be consistent!")
1590 ind=np.arange(len(data[0]))
1591 single_bar_width=float(width)/len(data)
1594 ax=fig.add_subplot(111)
1597 for i
in range(len(data)):
1598 legend_data.append(ax.bar(ind+i*single_bar_width+(1-width)/2,data[i],single_bar_width,bottom=bottom,color=colors[i])[0])
1601 ax.set_title(title, size=
'x-large', fontweight=
'bold')
1607 ax.set_ylabel(nice_y)
1610 if len(data[0])!=len(xlabels):
1611 raise ValueError(
'Number of xlabels is not consistent with number of cols!')
1616 ax.set_xticks(ind+0.5)
1617 ax.set_xticklabels(xlabels, rotation = xlabels_rotation)
1622 if legend_names==
None:
1623 raise ValueError(
'You must provide legend names! e.g. names for the rows, '\
1624 'that are printed in parallel.')
1625 if len(legend_names)!=len(data):
1626 raise ValueError(
'length of legend_names must be consistent with number '\
1628 ax.legend(legend_data, legend_names)
1638 def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
1639 colormap=
'jet', show_scalebar=
False, scalebar_label=
None, clear=
True, save=
False, show=
False):
1642 Create a heatplot of the data in col x vs the data in col y using matplotlib
1644 :param x: column name with x data
1645 :type x: :class:`str`
1647 :param y: column name with y data
1648 :type y: :class:`str`
1650 :param title: title of the plot, will be generated automatically if set to None
1651 :type title: :class:`str`
1653 :param x_title: label of x-axis, will be generated automatically if set to None
1654 :type x_title: :class:`str`
1656 :param y_title: label of y-axis, will be generated automatically if set to None
1657 :type y_title: :class:`str`
1659 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1660 :type x_range: :class:`list` of length two
1662 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
1663 :type y_range: :class:`list` of length two
1665 :param binning: type of binning. If set to None, the value of a hexbin will
1666 correspond to the number of datapoints falling into it. If
1667 set to 'log', the value will be the log with base 10 of the above
1668 value (log(i+1)). If an integer is provided, the number of a
1669 hexbin is equal the number of datapoints falling into it divided
1670 by the integer. If a list of values is provided, these values
1671 will be the lower bounds of the bins.
1673 :param colormap: colormap, that will be used. Value can be every colormap defined
1674 in matplotlib or an own defined colormap. You can either pass a
1675 string with the name of the matplotlib colormap or a colormap
1678 :param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
1679 :type show_scalebar: :class:`bool`
1681 :param scalebar_label: Label of the scalebar
1682 :type scalebar_label: :class:`str`
1684 :param clear: clear old data from plot
1685 :type clear: :class:`bool`
1687 :param save: filename for saving plot
1688 :type save: :class:`str`
1690 :param show: directly show plot
1691 :type show: :class:`bool`
1696 import matplotlib.pyplot
as plt
1697 import matplotlib.cm
as cm
1699 raise ImportError(
'PlotHexbin relies on matplotlib, but I could not import it')
1707 if r[idx]!=
None and r[idy]!=
None:
1708 xdata.append(r[idx])
1709 ydata.append(r[idy])
1725 title =
'%s vs. %s' % (nice_x, nice_y)
1728 colormap=getattr(cm, colormap)
1730 if x_range
and (
IsScalar(x_range)
or len(x_range)!=2):
1731 raise ValueError(
'parameter x_range must contain exactly two elements')
1732 if y_range
and (
IsScalar(y_range)
or len(y_range)!=2):
1733 raise ValueError(
'parameter y_range must contain exactly two elements')
1735 ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
1738 plt.xlim((x_range[0], x_range[1]))
1742 plt.ylim(y_range[0], y_range[1])
1747 plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
1749 plt.title(title, size=
'x-large', fontweight=
'bold',
1750 verticalalignment=
'bottom')
1758 cb.set_label(scalebar_label)
1770 Returns the row containing the cell with the maximal value in col. If
1771 several rows have the highest value, only the first one is returned.
1772 ''None'' values are ignored.
1774 :param col: column name
1775 :type col: :class:`str`
1777 :returns: row with maximal col value or None if the table is empty
1779 val, idx = self.
_Max(col)
1781 return self.
rows[idx]
1785 Returns the maximum value in col. If several rows have the highest value,
1786 only the first one is returned. ''None'' values are ignored.
1788 :param col: column name
1789 :type col: :class:`str`
1791 val, idx = self.
_Max(col)
1796 Returns the row index of the cell with the maximal value in col. If
1797 several rows have the highest value, only the first one is returned.
1798 ''None'' values are ignored.
1800 :param col: column name
1801 :type col: :class:`str`
1803 val, idx = self.
_Max(col)
1806 def _Min(self, col):
1807 if len(self.
rows)==0:
1811 if col_type==
'int' or col_type==
'float':
1812 min_val=float(
'inf')
1813 elif col_type==
'bool':
1815 elif col_type==
'string':
1818 for i,row
in enumerate(self.
rows):
1819 if row[idx]!=
None and row[idx]<min_val:
1822 return min_val, min_idx
1826 Returns the minimal value in col. If several rows have the lowest value,
1827 only the first one is returned. ''None'' values are ignored.
1829 :param col: column name
1830 :type col: :class:`str`
1832 val, idx = self.
_Min(col)
1837 Returns the row containing the cell with the minimal value in col. If
1838 several rows have the lowest value, only the first one is returned.
1839 ''None'' values are ignored.
1841 :param col: column name
1842 :type col: :class:`str`
1844 :returns: row with minimal col value or None if the table is empty
1846 val, idx = self.
_Min(col)
1848 return self.
rows[idx]
1852 Returns the row index of the cell with the minimal value in col. If
1853 several rows have the lowest value, only the first one is returned.
1854 ''None'' values are ignored.
1856 :param col: column name
1857 :type col: :class:`str`
1859 val, idx = self.
_Min(col)
1864 Returns the sum of the given column. Cells with ''None'' are ignored. Returns
1865 0.0, if the column doesn't contain any elements. Col must be of numeric
1866 column type ('float', 'int') or boolean column type.
1868 :param col: column name
1869 :type col: :class:`str`
1871 :raises: :class:`TypeError` if column type is ``string``
1875 if col_type!=
'int' and col_type!=
'float' and col_type!=
'bool':
1876 raise TypeError(
"Sum can only be used on numeric column types")
1885 Returns the mean of the given column. Cells with ''None'' are ignored. Returns
1886 None, if the column doesn't contain any elements. Col must be of numeric
1887 ('float', 'int') or boolean column type.
1889 If column type is *bool*, the function returns the ratio of
1890 number of 'Trues' by total number of elements.
1892 :param col: column name
1893 :type col: :class:`str`
1895 :raises: :class:`TypeError` if column type is ``string``
1899 if col_type!=
'int' and col_type!=
'float' and col_type!=
'bool':
1900 raise TypeError(
"Mean can only be used on numeric or bool column types")
1907 return stutil.Mean(vals)
1913 Adds a new column of type 'float' with a specified name (*mean_col_name*),
1914 containing the mean of all specified columns for each row.
1916 Cols are specified by their names and must be of numeric column
1917 type ('float', 'int') or boolean column type. Cells with None are ignored.
1918 Adds ''None'' if the row doesn't contain any values.
1920 :param mean_col_name: name of new column containing mean values
1921 :type mean_col_name: :class:`str`
1923 :param cols: name or list of names of columns to include in computation of
1925 :type cols: :class:`str` or :class:`list` of strings
1927 :raises: :class:`TypeError` if column type of columns in *col* is ``string``
1931 Starting with the following table:
1941 the code here adds a column with the name 'mean' to yield the table below:
1943 .. code-block:: python
1945 tab.RowMean('mean', ['x', 'u'])
1948 ==== ==== ==== =====
1950 ==== ==== ==== =====
1954 ==== ==== ==== =====
1963 idx = self.GetColIndex(col)
1964 col_type = self.col_types[idx]
1965 if col_type!='int' and col_type!='float' and col_type!='bool':
1966 raise TypeError("RowMean can only be used on numeric column types")
1967 cols_idxs.append(idx)
1970 for row in self.rows:
1972 for idx in cols_idxs:
1977 mean = stutil.Mean(vals)
1978 mean_rows.append(mean)
1980 mean_rows.append(None)
1982 self.AddCol(mean_col_name, 'f', mean_rows)
1984 def Percentiles(self, col, nths):
1986 Returns the percentiles of column *col* given
in *nths*.
1988 The percentiles are calculated
as
1990 .. code-block:: python
1992 values[min(len(values), int(round(len(values)*nth/100+0.5)-1))]
1994 where values are the sorted values of *col*
not equal to
''None''
1996 :param col: column name
1997 :type col: :
class:`str`
1998 :param nths: list of percentiles to be calculated. Each percentile
is a
1999 number between 0
and 100.
2000 :type nths: :
class:`list` of numbers
2002 :raises: :
class:`TypeError`
if column type
is ``string``
2003 :returns: List of percentiles
in the same order
as given
in *nths*
2005 idx = self.GetColIndex(col)
2006 col_type = self.col_types[idx]
2007 if col_type!='int' and col_type!='float' and col_type!='bool':
2008 raise TypeError("Median can only be used on numeric column types")
2011 if nth < 0 or nth > 100:
2012 raise ValueError("percentiles must be between 0 and 100")
2019 return [None]*len(nths)
2023 p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
2024 percentiles.append(p)
2027 def Median(self, col):
2029 Returns the median of the given column. Cells with
''None'' are ignored. Returns
2030 ''None'',
if the column doesn
't contain any elements. Col must be of numeric
2031 column type ('float',
'int')
or boolean column type.
2033 :param col: column name
2034 :type col: :
class:`str`
2036 :raises: :
class:`TypeError`
if column type
is ``string``
2038 idx = self.GetColIndex(col)
2039 col_type = self.col_types[idx]
2040 if col_type!='int' and col_type!='float' and col_type!='bool':
2041 raise TypeError("Median can only be used on numeric column types")
2049 return stutil.Median(vals)
2053 def StdDev(self, col):
2055 Returns the standard deviation of the given column. Cells with
''None'' are
2056 ignored. Returns
''None'',
if the column doesn
't contain any elements. Col must
2057 be of numeric column type ('float',
'int')
or boolean column type.
2059 :param col: column name
2060 :type col: :
class:`str`
2062 :raises: :
class:`TypeError`
if column type
is ``string``
2064 idx = self.GetColIndex(col)
2065 col_type = self.col_types[idx]
2066 if col_type!='int' and col_type!='float' and col_type!='bool':
2067 raise TypeError("StdDev can only be used on numeric column types")
2074 return stutil.StdDev(vals)
2078 def Count(self, col, ignore_nan=True):
2080 Count the number of cells
in column that are
not equal to
''None''.
2082 :param col: column name
2083 :type col: :
class:`str`
2085 :param ignore_nan: ignore all *
None* values
2086 :type ignore_nan: :
class:`bool`
2089 idx=self.GetColIndex(col)
2098 def Correl(self, col1, col2):
2100 Calculate the Pearson correlation coefficient between *col1*
and *col2*, only
2101 taking rows into account where both of the values are
not equal to *
None*.
2102 If there are
not enough data points to calculate a correlation coefficient,
2105 :param col1: column name
for first column
2106 :type col1: :
class:`str`
2108 :param col2: column name
for second column
2109 :type col2: :
class:`str`
2111 if IsStringLike(col1) and IsStringLike(col2):
2112 col1 = self.GetColIndex(col1)
2113 col2 = self.GetColIndex(col2)
2114 vals1, vals2=([],[])
2115 for v1, v2 in zip(self[col1], self[col2]):
2116 if v1!=None and v2!=None:
2120 return stutil.Correl(vals1, vals2)
2124 def SpearmanCorrel(self, col1, col2):
2126 Calculate the Spearman correlation coefficient between col1
and col2, only
2127 taking rows into account where both of the values are
not equal to
None. If
2128 there are
not enough data points to calculate a correlation coefficient,
2131 :warning: The function depends on the following module: *scipy.stats.mstats*
2133 :param col1: column name
for first column
2134 :type col1: :
class:`str`
2136 :param col2: column name
for second column
2137 :type col2: :
class:`str`
2140 import scipy.stats.mstats
2142 if IsStringLike(col1) and IsStringLike(col2):
2143 col1 = self.GetColIndex(col1)
2144 col2 = self.GetColIndex(col2)
2145 vals1, vals2=([],[])
2146 for v1, v2 in zip(self[col1], self[col2]):
2147 if v1!=None and v2!=None:
2151 correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
2152 if scipy.isnan(correl):
2159 LogError("Function needs scipy.stats.mstats, but I could not import it.")
def Save(self, stream_or_filename, format='ost', sep=','):
  """
  Save the table to stream or filename. The following file formats are
  supported (for more information on file formats, see :meth:`Load`):

  =============   =======================================
  ost             ost-specific format (human readable)
  csv             comma separated values (human readable)
  pickle          pickled byte stream (binary)
  html            HTML table
  context         ConTeXt table
  =============   =======================================

  :param stream_or_filename: filename or stream for writing output
  :type stream_or_filename: :class:`str` or :class:`file`

  :param format: output format (i.e. *ost*, *csv*, *pickle*, *html*,
                 *context*). The name is matched case-insensitively.
  :type format: :class:`str`

  :param sep: column separator, only forwarded to the *csv* writer
  :type sep: :class:`str`

  :returns: whatever the format-specific writer returns

  :raises: :class:`ValueError` if format is unknown
  """
  format = format.lower()
  # csv is the only writer that takes an extra argument, so it is handled
  # separately from the single-argument writers below.
  if format == 'csv':
    return self._SaveCSV(stream_or_filename, sep=sep)
  # Writers are looked up by name so only the requested one is resolved.
  writer_names = {
    'ost': '_SaveOST',
    'pickle': '_SavePickle',
    'html': '_SaveHTML',
    'context': '_SaveContext',
  }
  if format in writer_names:
    return getattr(self, writer_names[format])(stream_or_filename)
  raise ValueError('unknown format "%s"' % format)
def _SavePickle(self, stream):
  """
  Pickle this object with the highest available protocol.

  :param stream: object with a ``write`` method, or a filename. A filename
                 is opened in binary write mode.
  :type stream: :class:`file` or :class:`str`
  """
  if hasattr(stream, 'write'):
    # Stream belongs to the caller: write to it but leave it open.
    cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
    return
  # We opened the file ourselves, so make sure it is flushed and closed
  # even if pickling fails half-way.
  out = open(stream, 'wb')
  try:
    cPickle.dump(self, out, cPickle.HIGHEST_PROTOCOL)
  finally:
    out.close()
2202 def _SaveHTML(self, stream_or_filename):
2204 return s.replace('&', '&').replace('>', '>').replace('<', '<')
2207 if not hasattr(stream_or_filename, 'write'):
2208 stream = open(stream_or_filename, 'w')
2211 stream = stream_or_filename
2212 stream.write('<table>')
2213 stream.write('<tr>')
2214 for col_name in self.col_names:
2215 stream.write('<th>%s</th>' % _escape(col_name))
2216 stream.write('</tr>')
2217 for row in self.rows:
2218 stream.write('<tr>')
2219 for i, col in enumerate(row):
2222 if self.col_types[i] == 'float':
2224 elif self.col_types[i] == 'int':
2226 elif self.col_types[i] == 'bool':
2227 val = col and 'true' or 'false'
2230 stream.write('<td>%s</td>' % _escape(val))
2231 stream.write('</tr>')
2232 stream.write('</table>')
2235 def _SaveContext(self, stream_or_filename):
2237 if not hasattr(stream_or_filename, 'write'):
2238 stream = open(stream_or_filename, 'w')
2241 stream = stream_or_filename
2242 stream.write('\\starttable[')
2243 for col_type in self.col_types:
2244 if col_type =='string':
2246 elif col_type=='int':
2248 elif col_type =='float':
2249 stream.write('i3r|')
2252 stream.write(']\n\\HL\n')
2253 for col_name in self.col_names:
2254 stream.write('\\NC \\bf %s' % col_name)
2255 stream.write(' \\AR\\HL\n')
2256 for row in self.rows:
2257 for i, col in enumerate(row):
2260 if self.col_types[i] == 'float':
2262 elif self.col_types[i] == 'int':
2264 elif self.col_types[i] == 'bool':
2265 val = col and 'true' or 'false'
2268 stream.write('\\NC %s' % val)
2269 stream.write(' \\AR\n')
2270 stream.write('\\HL\n')
2271 stream.write('\\stoptable')
2275 def _SaveCSV(self, stream, sep):
2276 if not hasattr(stream, 'write'):
2277 stream=open(stream, 'wb')
2279 writer=csv.writer(stream, delimiter=sep)
2280 writer.writerow(['%s' % n for n in self.col_names])
2281 for row in self.rows:
2283 for i, c in enumerate(row):
2286 writer.writerow(row)
2288 def _SaveOST(self, stream):
2289 if hasattr(stream, 'write'):
2290 writer=csv.writer(stream, delimiter=' ')
2292 stream=open(stream, 'w')
2293 writer=csv.writer(stream, delimiter=' ')
2295 stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
2296 writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
2297 for row in self.rows:
2299 for i, c in enumerate(row):
2302 writer.writerow(row)
2305 def GetNumpyMatrix(self, *args):
2307 Returns a numpy matrix containing the selected columns from the table as
2308 columns in the matrix.
2310 Only columns of type *int* or *float* are supported. *NA* values in the
2311 table will be converted to *None* values.
2313 :param \*args: column names to include in numpy matrix
2315 :warning: The function depends on *numpy*
2321 raise RuntimeError("At least one column must be specified.")
2325 idx = self.GetColIndex(arg)
2326 col_type = self.col_types[idx]
2327 if col_type!='int' and col_type!='float':
2328 raise TypeError("Numpy matrix can only be generated from numeric column types")
2330 m = np.matrix([list(self[i]) for i in idxs])
2334 LogError("Function needs numpy, but I could not import it.")
2339 def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
2342 In place Gaussian smooth of a column in the table with a given standard deviation.
2343 All None values are set to na_value before smoothing.
2345 :param col: column name
2346 :type col: :class:`str`
2348 :param std: standard deviation for gaussian kernel
2351 :param na_value: all na (None) values of the specified column are set to na_value before smoothing
2352 :type na_value: `scalar`
2354 :param padding: allows to handle padding behaviour see scipy ndimage.gaussian_filter1d documentation for more information. standard is reflect
2355 :type padding: :class:`str`
2357 :param c: constant value used for padding if padding mode is constant
2362 :warning: The function depends on *scipy*
2366 from scipy import ndimage
2369 LogError("I need scipy.ndimage and numpy, but could not import it")
2372 idx = self.GetColIndex(col)
2373 col_type = self.col_types[idx]
2374 if col_type!='int' and col_type!='float':
2375 raise TypeError("GaussianSmooth can only be used on numeric column types")
2382 vals.append(na_value)
2385 smoothed_values_ndarray=ndimage.gaussian_filter1d(vals,std, mode=padding, cval=c)
2389 for v in smoothed_values_ndarray:
2395 def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
2397 This returns the optimal prefactor values (i.e. a, b, c, ...) for the
2403 a*u + b*v + c*w + ... = z
2405 where u, v, w and z are vectors. In matrix notation
2412 where A contains the data from the table (u,v,w,...), p are the prefactors
2413 to optimize (a,b,c,...) and z is the vector containing the result of
2416 The parameter ref_col equals to z in both equations, and \*args are columns
2417 u, v and w (or A in :eq:`op2`). All columns must be specified by their names.
2421 .. code-block:: python
2423 tab.GetOptimalPrefactors('colC', 'colA', 'colB')
2425 The function returns a list containing the prefactors a, b, c, ... in
2426 the correct order (i.e. same as columns were specified in \*args).
2429 If the kwarg weights="columX" is specified, the equations are weighted by
2430 the values in that column. Each row is multiplied by the weight in that row,
2431 which leads to :eq:`op3`:
2436 weight*a*u + weight*b*v + weight*c*w + ... = weight*z
2438 Weights must be float or int and can have any value. A value of 0 ignores
2439 this equation, a value of 1 means the same as no weight. If all weights are
2440 the same for each row, the same result will be obtained as with no weights.
2444 .. code-block:: python
2446 tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
2453 raise RuntimeError("At least one column must be specified.")
2455 b = self.GetNumpyMatrix(ref_col)
2456 a = self.GetNumpyMatrix(*args)
2459 if kwargs.has_key('weights'):
2460 w = self.GetNumpyMatrix(kwargs['weights'])
2461 b = np.multiply(b,w)
2462 a = np.multiply(a,w)
2465 raise RuntimeError("specified unrecognized kwargs, use weights as key")
2468 return list(np.array(k.T).reshape(-1))
2471 LogError("Function needs numpy, but I could not import it.")
2474 def PlotEnrichment(self, score_col, class_col, score_dir='-',
2475 class_dir='-', class_cutoff=2.0,
2476 style='-', title=None, x_title=None, y_title=None,
2477 clear=True, save=None):
2479 Plot an enrichment curve using matplotlib of column *score_col* classified
2480 according to *class_col*.
2482 For more information about parameters of the enrichment, see
2483 :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
2485 :warning: The function depends on *matplotlib*
2488 import matplotlib.pyplot as plt
2490 enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
2491 class_dir, class_cutoff)
2494 title = 'Enrichment of %s'%score_col
2497 x_title = '% database'
2500 y_title = '% positives'
2505 plt.plot(enrx, enry, style)
2507 plt.title(title, size='x-large', fontweight='bold')
2508 plt.ylabel(y_title, size='x-large')
2509 plt.xlabel(x_title, size='x-large')
2516 LogError("Function needs matplotlib, but I could not import it.")
2519 def ComputeEnrichment(self, score_col, class_col, score_dir='-',
2520 class_dir='-', class_cutoff=2.0):
2522 Computes the enrichment of column *score_col* classified according to
2525 For this it is necessary, that the datapoints are classified into positive
2526 and negative points. This can be done in two ways:
2528 - by using one 'bool' type column (*class_col*) which contains *True* for
2529 positives and *False* for negatives
2531 - by specifying a classification column (*class_col*), a cutoff value
2532 (*class_cutoff*) and the classification columns direction (*class_dir*).
2533 This will generate the classification on the fly
2535 * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
2536 * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
2538 During the calculation, the table will be sorted according to *score_dir*,
2539 where a '-' values means smallest values first and therefore, the smaller
2540 the value, the better.
2542 :warning: If either the value of *class_col* or *score_col* is *None*, the
2543 data in this row is ignored.
2546 ALLOWED_DIR = ['+','-']
2548 score_idx = self.GetColIndex(score_col)
2549 score_type = self.col_types[score_idx]
2550 if score_type!='int' and score_type!='float':
2551 raise TypeError("Score column must be numeric type")
2553 class_idx = self.GetColIndex(class_col)
2554 class_type = self.col_types[class_idx]
2555 if class_type!='int' and class_type!='float' and class_type!='bool':
2556 raise TypeError("Classifier column must be numeric or bool type")
2558 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2559 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2561 self.Sort(score_col, score_dir)
2566 old_score_val = None
2569 for row in self.rows:
2570 class_val = row[class_idx]
2571 score_val = row[score_idx]
2572 if class_val==None or score_val==None:
2575 if old_score_val==None:
2576 old_score_val = score_val
2577 if score_val!=old_score_val:
2580 old_score_val = score_val
2582 if class_type=='bool':
2586 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2591 # if no false positives or false negatives values are found return None
2592 if x[-1]==0 or y[-1]==0:
2595 x = [float(v)/x[-1] for v in x]
2596 y = [float(v)/y[-1] for v in y]
2599 def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
2600 class_dir='-', class_cutoff=2.0):
2602 Computes the area under the curve of the enrichment using the trapezoidal
2605 For more information about parameters of the enrichment, see
2606 :meth:`ComputeEnrichment`.
2608 :warning: The function depends on *numpy*
2613 enr = self.ComputeEnrichment(score_col, class_col, score_dir,
2614 class_dir, class_cutoff)
2618 return np.trapz(enr[1], enr[0])
2620 LogError("Function needs numpy, but I could not import it.")
2623 def ComputeROC(self, score_col, class_col, score_dir='-',
2624 class_dir='-', class_cutoff=2.0):
2626 Computes the receiver operating characteristics (ROC) of column *score_col*
2627 classified according to *class_col*.
2629 For this it is necessary, that the datapoints are classified into positive
2630 and negative points. This can be done in two ways:
2632 - by using one 'bool' column (*class_col*) which contains True for positives
2633 and False for negatives
2634 - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
2635 and the classification columns direction (*class_dir*). This will generate
2636 the classification on the fly
2638 - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
2639 - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
2641 During the calculation, the table will be sorted according to *score_dir*,
2642 where a '-' values means smallest values first and therefore, the smaller
2643 the value, the better.
2645 If *class_col* does not contain any positives (i.e. value is True (if column
2646 is of type bool) or evaluated to True (if column is of type int or float
2647 (depending on *class_dir* and *class_cutoff*))) the ROC is not defined and
2648 the function will return *None*.
2650 :warning: If either the value of *class_col* or *score_col* is *None*, the
2651 data in this row is ignored.
2654 ALLOWED_DIR = ['+','-']
2656 score_idx = self.GetColIndex(score_col)
2657 score_type = self.col_types[score_idx]
2658 if score_type!='int' and score_type!='float':
2659 raise TypeError("Score column must be numeric type")
2661 class_idx = self.GetColIndex(class_col)
2662 class_type = self.col_types[class_idx]
2663 if class_type!='int' and class_type!='float' and class_type!='bool':
2664 raise TypeError("Classifier column must be numeric or bool type")
2666 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2667 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2669 self.Sort(score_col, score_dir)
2675 old_score_val = None
2677 for i,row in enumerate(self.rows):
2678 class_val = row[class_idx]
2679 score_val = row[score_idx]
2680 if class_val==None or score_val==None:
2683 if old_score_val==None:
2684 old_score_val = score_val
2685 if score_val!=old_score_val:
2688 old_score_val = score_val
2689 if class_type=='bool':
2695 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
2702 # if no false positives or false negatives values are found return None
2703 if x[-1]==0 or y[-1]==0:
2706 x = [float(v)/x[-1] for v in x]
2707 y = [float(v)/y[-1] for v in y]
2710 def ComputeROCAUC(self, score_col, class_col, score_dir='-',
2711 class_dir='-', class_cutoff=2.0):
2713 Computes the area under the curve of the receiver operating characteristics
2714 using the trapezoidal rule.
2716 For more information about parameters of the ROC, see
2719 :warning: The function depends on *numpy*
2724 roc = self.ComputeROC(score_col, class_col, score_dir,
2725 class_dir, class_cutoff)
2729 return np.trapz(roc[1], roc[0])
2731 LogError("Function needs numpy, but I could not import it.")
2734 def ComputeLogROCAUC(self, score_col, class_col, score_dir='-',
2735 class_dir='-', class_cutoff=2.0):
2737 Computes the area under the curve of the log receiver operating
2738 characteristics (logROC) where the x-axis is semilogarithmic
2739 using the trapezoidal rule.
2741 The logROC is computed with a lambda of 0.001 according to
2742 Rapid Context-Dependent Ligand Desolvation in Molecular Docking
2743 Mysinger M. and Shoichet B., Journal of Chemical Information and Modeling
2744 2010 50 (9), 1561-1573
2746 For more information about parameters of the ROC, see
2749 :warning: The function depends on *numpy*
2754 roc = self.ComputeROC(score_col, class_col, score_dir,
2755 class_dir, class_cutoff)
2767 # remove all duplicate x-values
2768 rocxt = [x if x>0 else l for x in rocxt]
2769 for i in range(len(rocxt)-1):
2770 if rocxt[i]==rocxt[i+1]:
2772 rocx.append(rocxt[i])
2773 rocy.append(rocyt[i])
2779 for i in range(len(rocx)-1):
2781 if rocx[i]==rocx[i+1]:
2783 b = rocy[i+1]-rocx[i+1]*((rocy[i+1]-rocy[i])/(rocx[i+1]-rocx[i]))
2784 value += ((rocy[i+1]-rocy[i])/math.log(10))+b*(math.log10(rocx[i+1])-math.log10(rocx[i]))
2785 return value/math.log10(1.0/l)
2788 LogError("Function needs numpy, but I could not import it.")
2791 def PlotROC(self, score_col, class_col, score_dir='-',
2792 class_dir='-', class_cutoff=2.0,
2793 style='-', title=None, x_title=None, y_title=None,
2794 clear=True, save=None):
2796 Plot an ROC curve using matplotlib.
2798 For more information about parameters of the ROC, see
2799 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2801 :warning: The function depends on *matplotlib*
2805 import matplotlib.pyplot as plt
2807 roc = self.ComputeROC(score_col, class_col, score_dir,
2808 class_dir, class_cutoff)
2816 title = 'ROC of %s'%score_col
2819 x_title = 'false positive rate'
2822 y_title = 'true positive rate'
2827 plt.plot(enrx, enry, style)
2829 plt.title(title, size='x-large', fontweight='bold')
2830 plt.ylabel(y_title, size='x-large')
2831 plt.xlabel(x_title, size='x-large')
2838 LogError("Function needs matplotlib, but I could not import it.")
2841 def PlotLogROC(self, score_col, class_col, score_dir='-',
2842 class_dir='-', class_cutoff=2.0,
2843 style='-', title=None, x_title=None, y_title=None,
2844 clear=True, save=None):
2846 Plot a logROC curve where the x-axis is semilogarithmic using matplotlib
2848 For more information about parameters of the ROC, see
2849 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2851 :warning: The function depends on *matplotlib*
2855 import matplotlib.pyplot as plt
2857 roc = self.ComputeROC(score_col, class_col, score_dir,
2858 class_dir, class_cutoff)
2866 title = 'logROC of %s'%score_col
2869 x_title = 'false positive rate'
2872 y_title = 'true positive rate'
2877 rocx = [x if x>0 else 0.001 for x in rocx]
2880 plt.plot(rocx, rocy, style)
2882 plt.title(title, size='x-large', fontweight='bold')
2883 plt.ylabel(y_title, size='x-large')
2884 plt.xlabel(x_title, size='x-large')
2886 plt.xscale('log', basex=10)
2887 plt.xlim(0.001, 1.0)
2895 LogError("Function needs matplotlib, but I could not import it.")
def ComputeMCC(self, score_col, class_col, score_dir='-',
               class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
  '''
  Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
  with the points classified into true positives, false positives, true
  negatives and false negatives according to a specified classification
  column (*class_col*).

  The datapoints in *score_col* and *class_col* are classified into
  positive and negative points. This can be done in two ways:

   - by using 'bool' columns which contains True for positives and False
     for negatives

   - by using 'float' or 'int' columns and specifying a cutoff value and the
     columns direction. This will generate the classification on the fly

     * if ``class_dir``/``score_dir=='-'``: values in the classification
       column that are less than or equal to *class_cutoff*/*score_cutoff*
       will be counted as positives
     * if ``class_dir``/``score_dir=='+'``: values in the classification
       column that are larger than or equal to *class_cutoff*/*score_cutoff*
       will be counted as positives

  The two possibilities can be used together, i.e. 'bool' type for one column
  and 'float'/'int' type and cutoff/direction for the other column.

  Rows in which the score or the class value is *None* are ignored.

  :returns: the MCC as float, or *None* if one of the four marginal sums is
            zero and the MCC is therefore undefined (a warning is logged)

  :raises: :class:`TypeError` if one of the columns is neither numeric nor
           bool, :class:`ValueError` if a direction is not '+' or '-'
  '''
  ALLOWED_DIR = ['+','-']

  score_idx = self.GetColIndex(score_col)
  score_type = self.col_types[score_idx]
  if score_type!='int' and score_type!='float' and score_type!='bool':
    raise TypeError("Score column must be numeric or bool type")

  class_idx = self.GetColIndex(class_col)
  class_type = self.col_types[class_idx]
  if class_type!='int' and class_type!='float' and class_type!='bool':
    raise TypeError("Classifier column must be numeric or bool type")

  if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
    raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))

  def _is_positive(value, col_type, direction, cutoff):
    # bool columns carry the classification directly, numeric ones are
    # compared against the cutoff (inclusive) in the given direction
    if col_type=='bool':
      return value==True
    if direction=='-':
      return value<=cutoff
    return value>=cutoff

  def _is_negative(value, col_type, direction, cutoff):
    # strict counterpart of _is_positive; kept as a separate test so that
    # values satisfying neither predicate are counted as before
    if col_type=='bool':
      return value==False
    if direction=='-':
      return value>cutoff
    return value<cutoff

  tp = 0
  fp = 0
  fn = 0
  tn = 0

  for row in self.rows:
    class_val = row[class_idx]
    score_val = row[score_idx]
    # A missing value cannot be classified: comparing None with the cutoff
    # silently sorts it below every number in Python 2 and raises a
    # TypeError in Python 3.
    if class_val is None or score_val is None:
      continue
    if _is_positive(class_val, class_type, class_dir, class_cutoff):
      if _is_positive(score_val, score_type, score_dir, score_cutoff):
        tp += 1
      else:
        fn += 1
    else:
      if _is_negative(score_val, score_type, score_dir, score_cutoff):
        tn += 1
      else:
        fp += 1

  mcc = None
  msg = None
  # the MCC denominator is the product of these four sums
  if (tp+fn)==0:
    msg = 'factor (tp + fn) is zero'
  elif (tp+fp)==0:
    msg = 'factor (tp + fp) is zero'
  elif (tn+fn)==0:
    msg = 'factor (tn + fn) is zero'
  elif (tn+fp)==0:
    msg = 'factor (tn + fp) is zero'

  if msg:
    LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
  else:
    mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
  return mcc
def IsEmpty(self, col_name=None, ignore_nan=True):
  '''
  Tell whether the table, or a single column of it, holds no data.

  Without *col_name* every cell of the table is inspected; with *col_name*
  only that column is looked at.

  By default *None* (NAN) entries do not count as data, so a table that only
  contains such entries is reported as empty. With ``ignore_nan=False`` they
  are treated like any other value.

  :returns: True if no data was found, False otherwise

  :raises: :class:`ValueError` if *col_name* is given but the table has no
           columns at all
  '''
  # a table without columns cannot hold anything
  if not self.col_names:
    if col_name:
      raise ValueError('Table has no column named "%s"' % col_name)
    return True

  # single column requested: empty exactly when nothing is counted in it
  if col_name:
    return self.Count(col_name, ignore_nan=ignore_nan)==0

  # whole table: the first cell that counts as data settles the answer
  for row in self.rows:
    for cell in row:
      if not ignore_nan or cell!=None:
        return False
  return True
def Extend(self, tab, overwrite=None):
  """
  Append each row of *tab* to the current table. The data is appended based
  on the column names, thus the order of the table columns is *not* relevant,
  only the header names.

  If there is a column in *tab* that is not present in the current table,
  it is added to the current table and filled with *None* for all the rows
  present in the current table.

  If the type of any column in *tab* is not the same as in the current table
  a *TypeError* is raised.

  *overwrite* is handed on unchanged to :meth:`AddRow` for every row of
  *tab*: if it is not None and set to an existing column name, the row being
  added replaces the first existing row holding the same value in that
  column, and is appended as a new row if there is no such row.
  """
  # make sure every column of tab exists in this table
  for col_name, col_type in zip(tab.col_names, tab.col_types):
    if col_name not in self.col_names:
      self.AddCol(col_name, col_type)

  # shared columns must agree on their type
  for col_name in self.col_names:
    if col_name not in tab.col_names:
      continue
    own_type = self.col_types[self.GetColIndex(col_name)]
    other_type = tab.col_types[tab.GetColIndex(col_name)]
    if own_type!=other_type:
      raise TypeError('cannot extend table, column %s in new '
                      'table different type (%s) than in '
                      'current table (%s)' % (col_name, other_type, own_type))

  # The row count is taken once up front and rows are fetched by index, so
  # only the rows present in tab at this point are added.
  row_count = len(tab.rows)
  for row_idx in range(row_count):
    values = tab.rows[row_idx]
    self.AddRow(dict(zip(tab.col_names, values)), overwrite)
3052 def Merge(table1, table2, by, only_matching=False):
3054 Returns a new table containing the data
from both tables. The rows are
3055 combined based on the common values
in the column(s) by. The option
'by' can
3056 be a list of column names. When this
is the case, merging
is based on
3058 For example, the two tables below
3076 when merged by column x, produce the following output:
3089 def _key(row, indices):
3090 return tuple([row[i] for i in indices])
3091 def _keep(indices, cn, ct, ni):
3092 ncn, nct, nni=([],[],[])
3093 for i in range(len(cn)):
3094 if i not in indices:
3098 return ncn, nct, nni
3099 col_names=list(table2.col_names)
3100 col_types=list(table2.col_types)
3101 new_index=[i for i in range(len(col_names))]
3102 if isinstance(by, str):
3103 common2_indices=[col_names.index(by)]
3105 common2_indices=[col_names.index(b) for b in by]
3106 col_names, col_types, new_index=_keep(common2_indices, col_names,
3107 col_types, new_index)
3109 for i, name in enumerate(col_names):
3112 while try_name in table1.col_names:
3114 try_name='%s_%d' % (name, counter)
3115 col_names[i]=try_name
3117 if isinstance(by, str):
3118 common1_indices=[table1.col_names.index(by)]
3120 common1_indices=[table1.col_names.index(b) for b in by]
3121 for row in table1.rows:
3122 key=_key(row, common1_indices)
3124 raise ValueError('duplicate key "%s in first table"' % (str(key)))
3127 for row in table2.rows:
3128 key=_key(row, common2_indices)
3130 raise ValueError('duplicate key "%s" in second table' % (str(key)))
3132 new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
3133 for k, v in common1.iteritems():
3134 row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
3139 for i, index in enumerate(new_index):
3140 row[len(table1.col_names)+i]=row2[index]
3141 if only_matching and not matched:
3146 for k, v in common2.iteritems():
3147 if not k in common1:
3148 v2=[v[i] for i in new_index]
3149 row=[None for i in range(len(table1.col_names))]+v2
3150 for common1_index, common2_index in zip(common1_indices, common2_indices):
3151 row[common1_index]=v[common2_index]