from ost import LogError, LogWarning, LogInfo, LogVerbose
return col_name.replace('_', ' ')
if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
value=value.strip().upper()
return value in ('', 'NULL', 'NONE', 'NA')
if isinstance(value, TableCol) or isinstance(value, BinaryColExpr):
possibilities=set(['bool', 'int', 'float'])
str_ele=str(ele).upper()
# drop a candidate type as soon as one value cannot be interpreted as that type
if 'int' in possibilities:
  try:
    int(str_ele)
  except ValueError:
    possibilities.remove('int')
if 'float' in possibilities:
  try:
    float(str_ele)
  except ValueError:
    possibilities.remove('float')
if 'bool' in possibilities:
  if str_ele not in set(['YES', 'NO', 'TRUE', 'FALSE']):
    possibilities.remove('bool')
if len(possibilities)==0:
  return 'string'
if len(possibilities)==2:
  return 'int'
return possibilities.pop()
self.lhs=itertools.cycle([self.lhs])
self.rhs=itertools.cycle([self.rhs])
for l, r in zip(self.lhs, self.rhs):
if l!=None and r!=None:
for row in self._table.rows:
return len(self._table.rows)
return self._table.rows[index][self.col_index]
self._table.rows[index][self.col_index]=value
The table class provides convenient access to data in tabular form. An empty
table can be easily constructed as follows:

.. code-block:: python

  tab=Table()

If you want to add columns directly when creating the table, column names
and *column types* can be specified as follows:

.. code-block:: python

  tab=Table(['nameX','nameY','nameZ'], 'sfb')

This will create three columns called nameX, nameY and nameZ of type string,
float and bool, respectively. There will be no data in the table and thus,
the table will not contain any rows.

The following *column types* are supported:

If you want to add data to the table in addition, use the following:

.. code-block:: python

  tab=Table(['nameX','nameY','nameZ'], 'sfb',
            nameY=[0.1, 1.2, 3.414],
            nameZ=[True, False, False])

If the values for one column are left out, they will be filled with NA, but if
values are specified, all values must be specified (i.e. the same number of
values).
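As a quick sketch of that behaviour (the column names and values above are reused; the inline results are only indicative), the constructed table can be inspected right away:

.. code-block:: python

  tab=Table(['nameX','nameY','nameZ'], 'sfb',
            nameY=[0.1, 1.2, 3.414],
            nameZ=[True, False, False])
  tab.GetColNames()    # ['nameX', 'nameY', 'nameZ']
  tab['nameY'][0]      # 0.1
  # nameX received no data and is therefore filled with NA (None) in every row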
SUPPORTED_TYPES=('int', 'float', 'bool', 'string',)

def __init__(self, col_names=None, col_types=None, **kwargs):
self.col_names=[v for v in kwargs.keys()]
if 'col_names' not in self.__dict__ or col_name not in self.col_names:
  raise AttributeError(col_name)
def _ParseColTypes(types, exp_num=None):
short2long = {'s' : 'string', 'i' : 'int', 'b' : 'bool', 'f' : 'float'}
allowed_short = short2long.keys()
allowed_long = short2long.values()
types = types.lower()
if types in allowed_long:
  type_list.append(types)
elif types in allowed_short:
  type_list.append(short2long[types])
elif types.find(',')!=-1:
  for t in types.split(','):
    if t in allowed_long:
      type_list.append(t)
    elif t in allowed_short:
      type_list.append(short2long[t])
    else:
      raise ValueError('Unknown type %s in types %s'%(t,types))
if t in allowed_short:
  type_list.append(short2long[t])
else:
  raise ValueError('Unknown type %s in types %s'%(t,types))
raise ValueError('Col type %s must be string or list'%types)
if t in allowed_long:
  type_list.append(t)
elif t in allowed_short:
  type_list.append(short2long[t])
else:
  raise ValueError('Unknown type %s in types %s'%(t,types))
raise ValueError('Col type %s must be string or list'%types)
if len(type_list)!=exp_num:
  raise ValueError('Parsed number of col types (%i) differs from '
                   'expected (%i) in types %s'%(len(type_list),exp_num,types))
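For illustration, the type specifications accepted by this parser can be written in several equivalent ways; a short sketch (the column names are arbitrary):

.. code-block:: python

  tab=Table(['x','y','z'], 'string,float,bool')      # long names, comma separated
  tab=Table(['x','y','z'], 'sfb')                    # short names, one char per column
  tab=Table(['x','y','z'], ['string', 'f', 'bool'])  # list mixing long and short names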
Set name of the table.

:type name: :class:`str`

Rename column *old_name* to *new_name*.

:param old_name: Name of the old column
:param new_name: Name of the new column

:raises: :exc:`ValueError` when *old_name* is not a valid column
if old_name==new_name:
def _Coerce(self, value, ty):
Try to convert values (e.g. from :class:`str` type) to the specified type.

:param value: the value
:type value: any type

:param ty: name of the type to convert it to (i.e. *int*, *float*, *string* or
  *bool*)
:type ty: :class:`str`
if value=='NA' or value==None:
if isinstance(value, str) or isinstance(value, unicode):
  if value.upper() in ('FALSE', 'NO',):
raise ValueError('Unknown type %s' % ty)

Returns the column index for the column with the given name.

:raises: ValueError if no column with the name is found
raise ValueError('Table has no column named "%s"' % col)
return self.col_names.index(col)
Returns a list containing all column names.

Returns a list of column names matching the regex.

:param regex: regex pattern
:type regex: :class:`str`

:returns: :class:`list` of column names (:class:`str`)
matches = re.search(regex, name)
matching_names.append(name)
return matching_names

Checks if the column with a given name is present in the table.
value=itertools.cycle([value])
for r, v in zip(self.rows, value):
def ToString(self, float_format='%.3f', int_format='%d', rows=None):
Convert the table into a string representation.

The output format can be modified for int and float type columns by
specifying a formatting string for the parameters 'float_format' and
'int_format'.

The option 'rows' specifies the range of rows to be printed. The parameter
must be a type that supports indexing (e.g. a :class:`list`) containing the
start and end row *index*, e.g. [start_row_idx, end_row_idx].

:param float_format: formatting string for float columns
:type float_format: :class:`str`

:param int_format: formatting string for int columns
:type int_format: :class:`str`

:param rows: iterable containing start and end row *index*
:type rows: iterable containing :class:`ints <int>`
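A short usage sketch (the formatting string and row window are illustrative):

.. code-block:: python

  # print the first three rows, floats with two digits
  print tab.ToString(float_format='%.2f', rows=[0, 3])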
widths=[len(cn) for cn in self.col_names]
sel_rows=self.rows[rows[0]:rows[1]]
for i, (ty, col) in enumerate(zip(self.col_types, row)):
widths[i]=max(widths[i], len('NA'))
widths[i]=max(widths[i], len(float_format % col))
widths[i]=max(widths[i], len(int_format % col))
widths[i]=max(widths[i], len(str(col)))
s+=''.join(['# %s\n' % l for l in self.comment.split('\n')])
total_width=sum(widths)+2*len(widths)
for width, col_name in zip(widths, self.col_names):
  s+=col_name.center(width+2)
s+='\n%s\n' % ('-'*total_width)
for width, ty, col in zip(widths, self.col_types, row):
cs='NA'.center(width+2)
cs=(float_format % col).rjust(width+2)
cs=(int_format % col).rjust(width+2)
cs=' '+str(col).ljust(width+1)
Statistics for column %(col)s

Number of Rows          : %(num)d
Number of Rows Not None : %(num_non_null)d
Standard Deviation      : %(stddev)f
'num' : len(self.rows),
'num_non_null' : self.Count(col),
'median' : self.Median(col),
'mean' : self.Mean(col),
'stddev' : self.StdDev(col),
'min' : self.Min(col),
'max' : self.Max(col),
def _AddRowsFromDict(self, d, overwrite=None):
Add one or more rows from a :class:`dictionary <dict>`.

If *overwrite* is not None and set to an existing column name, the specified
column in the table is searched for the first occurrence of a value matching
the value of the column with the same name in the dictionary. If a matching
value is found, the row is overwritten with the dictionary. If no matching
row is found, a new row is appended to the table.

:param d: dictionary containing the data
:type d: :class:`dict`

:param overwrite: column name to overwrite existing row if value in
  column *overwrite* matches
:type overwrite: :class:`str`

:raises: :class:`ValueError` if multiple rows are added but the number of
  data items is different for different columns.
for k,v in d.iteritems():
elif old_len!=len(v):
  raise ValueError("Cannot add rows: length of data must be equal " + \
                   "for all columns in %s"%str(d))
for i,data in enumerate(zip(*d.values())):
new_row = [None for a in range(len(self.col_names))]
for idx,v in zip(idxs,data):
for i,r in enumerate(self.rows):
  if r[overwrite_idx]==new_row[overwrite_idx]:
    for j,e in enumerate(self.rows[i]):
    self.rows[i] = new_row
if not overwrite or not added:
  self.rows.append(new_row)
Two-sided test for the null-hypothesis that two related samples
have the same average (expected values).

:param col_a: First column
:param col_b: Second column

:returns: P-value between 0 and 1 that the two columns have the
  same average. The smaller the value, the less related the two
  columns are.
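A minimal usage sketch (the column names are illustrative):

.. code-block:: python

  p_value = tab.PairedTTest('method_a', 'method_b')
  if p_value < 0.05:
    # small p-value: equal averages are unlikely
    print 'the two columns differ significantly'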
from scipy.stats import ttest_rel
for x, y in self.Zip(col_a, col_b):
if x!=None and y!=None:
result = ttest_rel(xs, ys)
Add a row to the table.

*data* may either be a dictionary or a list-like object:

- If *data* is a dictionary, the keys in the dictionary must match the
  column names. Columns not found in the dict will be initialized to None.
  If the dict contains list-like objects, multiple rows will be added,
  provided the number of items in all list-like objects is the same;
  otherwise a :class:`ValueError` is raised.

- If *data* is a list-like object, the row is initialized from the values
  in *data*. The number of items in *data* must match the number of
  columns in the table. A :class:`ValueError` is raised otherwise. The
  values are added in the order specified in the list, thus, the order of
  the data must match the columns.

If *overwrite* is not None and set to an existing column name, the specified
column in the table is searched for the first occurrence of a value matching
the value of the column with the same name in the dictionary. If a matching
value is found, the row is overwritten with the dictionary. If no matching
row is found, a new row is appended to the table.

:param data: data to add
:type data: :class:`dict` or *list-like* object

:param overwrite: column name to overwrite existing row if value in
  column *overwrite* matches
:type overwrite: :class:`str`

:raises: :class:`ValueError` if *list-like* object is used and number of
  items does *not* match number of columns in table.
:raises: :class:`ValueError` if *dict* is used and multiple rows are added
  but the number of data items is different for different columns.

**Example:** add multiple data rows to a subset of columns using a dictionary

.. code-block:: python

  # create table with three float columns
  tab = Table(['x','y','z'], 'fff')

  data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
  tab.AddRow(data)

will produce the table

.. code-block:: python

  # overwrite the row with x=1.2 and add row with x=1.9
  data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
  tab.AddRow(data, overwrite='x')

will produce the table
msg='data array must have %d elements, not %d'
raise ValueError(msg % (len(self.col_names), len(data)))
for i,r in enumerate(self.rows):
  if r[overwrite_idx]==new_row[overwrite_idx]:
    self.rows[i] = new_row
if not overwrite or not added:
  self.rows.append(new_row)
Remove column with the given name from the table.

:param col: name of column to remove
:type col: :class:`str`
for row in self.rows:
def AddCol(self, col_name, col_type, data=None):
Add a column to the right of the table.

:param col_name: name of new column
:type col_name: :class:`str`

:param col_type: type of new column (long versions: *int*, *float*, *bool*,
  *string* or short versions: *i*, *f*, *b*, *s*)
:type col_type: :class:`str`

:param data: data to add to new column.
:type data: scalar or iterable

.. code-block:: python

  tab=Table(['x'], 'f', x=range(5))
  tab.AddCol('even', 'bool', itertools.cycle([True, False]))

will produce the table

If data is a constant instead of an iterable object, its value
will be written into each row:

.. code-block:: python

  tab=Table(['x'], 'f', x=range(5))
  tab.AddCol('num', 'i', 1)

will produce the table

As a special case, if there are no previous rows, and data is not
None, rows are added for every item in data.
raise ValueError('Column with name %s already exists'%col_name)
self.col_names.append(col_name)
self.col_types.append(col_type)
for row in self.rows:
if hasattr(data, '__len__') and len(data)!=len(self.rows):
raise ValueError('Length of data (%i) must correspond to number of '%len(data) +\
                 'existing rows (%i)'%len(self.rows))
for row, d in zip(self.rows, data):
elif data!=None and len(self.col_names)==1:
self.AddRow({col_name : data})
self.AddRow({col_name : v})
Returns a filtered table only containing rows matching all the predicates
in kwargs and args. For example,

.. code-block:: python

  tab.Filter(town='Basel')

will return all the rows where the value of the column "town" is equal to
"Basel". Several predicates may be combined, i.e.

.. code-block:: python

  tab.Filter(town='Basel', male=True)

will return the rows with "town" equal to "Basel" and "male" equal to true.
args are unary callables returning true if the row should be included in the
result and false if not.
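A sketch of using a positional predicate together with a keyword predicate (assuming each callable receives the raw row; the column names and values are illustrative):

.. code-block:: python

  # rows where 'town' equals 'Basel' and the first cell of the row is above 10
  sub_tab = tab.Filter(lambda row: row[0] > 10, town='Basel')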
for row in self.rows:
for key, val in kwargs.iteritems():
def _LoadOST(stream_or_filename):
fieldname_pattern=re.compile(r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
values_pattern=re.compile("([^\" ]+|\"[^\"]*\")+")
if not hasattr(stream_or_filename, 'read'):
  stream=open(stream_or_filename, 'r')
stream=stream_or_filename
if line.startswith('#'):
for col in line.split():
  match=fieldname_pattern.match(col)
if match.group('type'):
  fieldtypes.append(match.group('type'))
fieldtypes.append('string')
fieldnames.append(match.group('name'))
tab=Table(fieldnames, fieldtypes)
tab.AddRow([x.strip('"') for x in values_pattern.findall(line)])
raise IOError("Cannot read table from empty stream")
def _GuessColumnTypes(self):
for col_idx in range(len(self.col_names)):
for row in self.rows:
for idx in range(len(row)):
def _LoadCSV(stream_or_filename, sep):
if not hasattr(stream_or_filename, 'read'):
  stream=open(stream_or_filename, 'r')
stream=stream_or_filename
reader=csv.reader(stream, delimiter=sep)
tab=Table(header, types)
raise IOError('trying to load table from empty CSV stream/file')
tab._GuessColumnTypes()

def _LoadPickle(stream_or_filename):
if not hasattr(stream_or_filename, 'read'):
  stream=open(stream_or_filename, 'rb')
stream=stream_or_filename
return cPickle.load(stream)
def _GuessFormat(filename):
filename = filename.name
except AttributeError, e:
if filename.endswith('.csv'):
elif filename.endswith('.pickle'):
def Load(stream_or_filename, format='auto', sep=','):
Load table from stream or file with given name.

By default, the file format is set to *auto*, which tries to guess the file
format from the file extension. The following file extensions are
recognized:

============    ======================
extension       recognized format
============    ======================
.csv            comma separated values
.pickle         pickled byte stream
<all others>    ost-specific format
============    ======================

Thus, *format* must be specified for reading files with different filename
extensions.

The following file formats are understood:

This is an ost-specific, but still human readable file format. The file
(stream) must start with a header line of the form

  col_name1[type1] <col_name2[type2]>...

The types given in brackets must be one of the data types the
:class:`Table` class understands. Each following line in the file then must
contain exactly the same number of data items as listed in the header. The
data items are automatically converted to the column format. Lines starting
with a '#' and empty lines are ignored.

Deserializes the table from a pickled byte stream.

Reads the table from a comma separated values stream. Since there is no
explicit type information in the csv file, the column types are guessed,
using the following simple rules:

* if all values are either NA/NULL/NONE the type is set to string
* if all non-null values are convertible to float/int the type is set to
  float/int
* if all non-null values are true/false/yes/no, the value is set to bool
* for all other cases, the column type is set to string

:returns: A new :class:`Table` instance
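Typical invocations (the file names are illustrative):

.. code-block:: python

  tab = Table.Load('scores.csv')                      # format guessed from extension
  tab = Table.Load('scores.txt', format='csv', sep=';')
  tab = Table.Load('scores.tab')                      # ost-specific format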
format=format.lower()
format = Table._GuessFormat(stream_or_filename)
return Table._LoadOST(stream_or_filename)
return Table._LoadCSV(stream_or_filename, sep=sep)
return Table._LoadPickle(stream_or_filename)
raise ValueError('unknown format "%s"' % format)
Performs an in-place sort of the table, based on column *by*.

:param by: column name by which to sort
:type by: :class:`str`

:param order: ascending (``-``) or descending (``+``) order
:type order: :class:`str` (i.e. *+*, *-*)
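A short sketch (the column name is illustrative):

.. code-block:: python

  tab.Sort('score', '-')   # ascending: smallest 'score' values first
  tab.Sort('score', '+')   # descending: largest 'score' values first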
def _key_cmp(lhs, rhs):
  return sign*cmp(lhs[key_index], rhs[key_index])
self.rows=sorted(self.rows, _key_cmp)
Extract a list of all unique values from one column.

:param col: column name
:type col: :class:`str`

:param ignore_nan: ignore all *None* values
:type ignore_nan: :class:`bool`
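A short sketch (the column name is illustrative):

.. code-block:: python

  towns = tab.GetUnique('town')                    # None values are skipped
  towns = tab.GetUnique('town', ignore_nan=False)  # keep None as a value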
for row in self.rows:
if item!=None or ignore_nan==False:
  if item in seen:
    continue
Allows one to conveniently iterate over a selection of columns, e.g.

.. code-block:: python

  tab=Table.Load('...')
  for col1, col2 in tab.Zip('col1', 'col2'):

.. code-block:: python

  tab=Table.Load('...')
  for col1, col2 in zip(tab['col1'], tab['col2']):
return zip(*[self[arg] for arg in args])
def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
         z_title=None, x_range=None, y_range=None, z_range=None,
         color=None, plot_if=None, legend=None,
         num_z_levels=10, z_contour=True, z_interpol='nn', diag_line=False,
         labels=None, max_num_labels=None, title=None, clear=True, save=False,
         **kwargs):
Function to plot values from your table in 1, 2 or 3 dimensions using
`Matplotlib <http://matplotlib.sourceforge.net>`__

:param x: column name for first dimension
:type x: :class:`str`

:param y: column name for second dimension
:type y: :class:`str`

:param z: column name for third dimension
:type z: :class:`str`

:param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
  complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
:type style: :class:`str`

:param x_title: title for first dimension, if not specified it is
  automatically derived from column name
:type x_title: :class:`str`

:param y_title: title for second dimension, if not specified it is
  automatically derived from column name
:type y_title: :class:`str`

:param z_title: title for third dimension, if not specified it is
  automatically derived from column name
:type z_title: :class:`str`

:param x_range: start and end value for first dimension (e.g. [start_x, end_x])
:type x_range: :class:`list` of length two

:param y_range: start and end value for second dimension (e.g. [start_y, end_y])
:type y_range: :class:`list` of length two

:param z_range: start and end value for third dimension (e.g. [start_z, end_z])
:type z_range: :class:`list` of length two

:param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
:type color: :class:`str`

:param plot_if: callable which returns *True* if row should be plotted. Is
  invoked like ``plot_if(self, row)``
:type plot_if: callable

:param legend: legend label for data series
:type legend: :class:`str`

:param num_z_levels: number of levels for third dimension
:type num_z_levels: :class:`int`

:param diag_line: draw diagonal line
:type diag_line: :class:`bool`

:param labels: column name containing labels to put on x-axis for one
  dimensional plots
:type labels: :class:`str`

:param max_num_labels: limit maximum number of labels
:type max_num_labels: :class:`int`

:param title: plot title, if not specified it is automatically derived from
  plotted column names
:type title: :class:`str`

:param clear: clear old data from plot
:type clear: :class:`bool`

:param save: filename for saving plot
:type save: :class:`str`

:param z_contour: draw contour lines
:type z_contour: :class:`bool`

:param z_interpol: interpolation method for 3-dimensional plot (one of 'nn',
  'linear')
:type z_interpol: :class:`str`

:param \*\*kwargs: additional arguments passed to matplotlib

:returns: the ``matplotlib.pyplot`` module
**Examples:** simple plotting functions

.. code-block:: python

  tab=Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
            b=[x/2.0 for x in range(1,6)],
            c=[math.cos(x) for x in range(0,5)],
            d=range(0,5))

  # one dimensional plot of column 'd' vs. index
  plt=tab.Plot('d')

  # two dimensional plot of 'a' vs. 'c'
  plt=tab.Plot('a', y='c', style='o-')

  # three dimensional plot of 'a' vs. 'c' with values 'b'
  plt=tab.Plot('a', y='c', z='b')
  # manually save plot to file
  plt.savefig("plot.png")
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
plt.figure(figsize=[8, 6])
if x_range and (IsScalar(x_range) or len(x_range)!=2):
  raise ValueError('parameter x_range must contain exactly two elements')
if y_range and (IsScalar(y_range) or len(y_range)!=2):
  raise ValueError('parameter y_range must contain exactly two elements')
if z_range and (IsScalar(z_range) or len(z_range)!=2):
  raise ValueError('parameter z_range must contain exactly two elements')
kwargs['color']=color
kwargs['label']=legend
for row in self.rows:
if row[idx1]!=None and row[idx2]!=None and row[idx3]!=None:
if plot_if and not plot_if(self, row):
xs.append(row[idx1])
ys.append(row[idx2])
zs.append(row[idx3])
z_spacing = (z_range[1] - z_range[0]) / num_z_levels
z_spacing = (self.Max(z) - l) / num_z_levels
for i in range(0,num_z_levels+1):
xi = np.linspace(min(xs),max(xs),len(xs)*10)
yi = np.linspace(min(ys),max(ys),len(ys)*10)
zi = mlab.griddata(xs, ys, zs, xi, yi, interp=z_interpol)
plt.contour(xi,yi,zi,levels,linewidths=0.5,colors='k')
plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
plt.colorbar(ticks=levels)
for row in self.rows:
if row[idx1]!=None and row[idx2]!=None:
if plot_if and not plot_if(self, row):
xs.append(row[idx1])
ys.append(row[idx2])
plt.plot(xs, ys, style, **kwargs)
for row in self.rows:
if plot_if and not plot_if(self, row):
xs.append(row[idx1])
label_vals.append(row[label_idx])
plt.plot(xs, style, **kwargs)
if len(label_vals)>max_num_labels:
  interval = int(math.ceil(float(len(label_vals))/max_num_labels))
  label_vals = label_vals[::interval]
plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
title = '%s of %s vs. %s' % (nice_z, nice_x, nice_y)
title = '%s vs. %s' % (nice_x, nice_y)
plt.title(title, size='x-large', fontweight='bold',
          verticalalignment='bottom')
plt.xlabel(nice_x, size='x-large')
plt.xlim(x_range[0], x_range[1])
plt.ylim(y_range[0], y_range[1])
plt.plot(x_range, y_range, '-')
plt.ylabel(nice_y, size='x-large')
plt.ylim(y_range[0], y_range[1])
plt.xlabel(x_title, size='x-large')
plt.ylabel(nice_y, size='x-large')
LogError("Function needs numpy and matplotlib, but I could not import it.")
def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
                  histtype='stepfilled', align='mid', x_title=None,
                  y_title=None, title=None, clear=True, save=False,
                  color=None, y_range=None):
Create a histogram of the data in col for the range *x_range*, split into
*num_bins* bins and plot it using Matplotlib.

:param col: column name with data
:type col: :class:`str`

:param x_range: start and end value for first dimension (e.g. [start_x, end_x])
:type x_range: :class:`list` of length two

:param y_range: start and end value for second dimension (e.g. [start_y, end_y])
:type y_range: :class:`list` of length two

:param num_bins: number of bins in range
:type num_bins: :class:`int`

:param color: Color to be used for the histogram. If not set, color will be
  determined by matplotlib
:type color: :class:`str`

:param normed: normalize histogram
:type normed: :class:`bool`

:param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
  *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
:type histtype: :class:`str`

:param align: style of histogram (*left*, *mid*, *right*). See
  (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
:type align: :class:`str`

:param x_title: title for first dimension, if not specified it is
  automatically derived from column name
:type x_title: :class:`str`

:param y_title: title for second dimension, if not specified it is
  automatically derived from column name
:type y_title: :class:`str`

:param title: plot title, if not specified it is automatically derived from
  plotted column names
:type title: :class:`str`

:param clear: clear old data from plot
:type clear: :class:`bool`

:param save: filename for saving plot
:type save: :class:`str`
**Examples:** simple plotting functions

.. code-block:: python

  tab=Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])

  # one dimensional histogram of column 'a'
  plt=tab.PlotHistogram('a')
import matplotlib.pyplot as plt
if len(self.rows)==0:
kwargs['color']=color
n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
                            normed=normed, histtype=histtype, align=align,
plt.xlabel(nice_x, size='x-large')
plt.ylabel(nice_y, size='x-large')
nice_title="Histogram of %s"%nice_x
plt.title(nice_title, size='x-large', fontweight='bold')
LogError("Function needs numpy and matplotlib, but I could not import it.")
def _Max(self, col):
if len(self.rows)==0:
if col_type=='int' or col_type=='float':
  max_val = -float('inf')
elif col_type=='bool':
elif col_type=='string':
for i in range(0, len(self.rows)):
  if self.rows[i][idx]>max_val:
    max_val = self.rows[i][idx]
return max_val, max_idx
def PlotBar(self, cols, x_labels=None, x_labels_rotation='horizontal', y_title=None, title=None,
            colors=None, yerr_cols=None, width=0.8, bottom=0,
            legend=True, save=False):
Create a barplot of the data in cols. Every element of a column will be represented
as a single bar. If there are several columns, each row will be grouped together.

:param cols: Column names with data. If cols is a string, every element of that column
  will be represented as a single bar. If cols is a list, every row resulting
  from these columns will be grouped together. Every value of the table still
  is represented by a single bar.

:param x_labels: Label for every row on x-axis.
:type x_labels: :class:`list`

:param x_labels_rotation: Can either be 'horizontal', 'vertical' or a number that
  describes the rotation in degrees.

:param y_title: Y-axis description
:type y_title: :class:`str`

:type title: :class:`str`

:param colors: Colors of the different bars in each group. Must be a list of valid
  colornames in matplotlib. Length of colors and cols must be consistent.
:type colors: :class:`list`

:param yerr_cols: Columns containing the y-error information. Can either be a string
  if only one column is plotted or a list otherwise. Length of
  yerr_cols and cols must be consistent.

:param width: The available space for the groups on the x-axis is divided by the exact
  number of groups. The parameter width is the fraction of what is actually
  used. If it would be 1.0 the bars of the different groups would touch each other.
:type width: :class:`float`

:param bottom: Bottom
:type bottom: :class:`float`

:param legend: Legend for color explanation, i.e. the corresponding column names.
:type legend: :class:`bool`

:param save: If set, a png image with name *save* in the current working directory will be saved.
:type save: :class:`str`
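A minimal sketch (column names, labels and file name are illustrative):

.. code-block:: python

  tab.PlotBar(['score_a', 'score_b'],
              x_labels=['model 1', 'model 2', 'model 3'],
              y_title='score', save='barplot.png')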
import matplotlib.pyplot as plt
raise ImportError('PlotBar relies on numpy and matplotlib, but I could not import it!')
raise ValueError('More than seven bars at one position looks rather meaningless...')
standard_colors=['b','g','y','c','m','r','k']
if not isinstance(cols, list):
if not isinstance(yerr_cols, list):
  yerr_cols=[yerr_cols]
if len(yerr_cols)!=len(cols):
  raise RuntimeError('Number of cols and number of error columns must be consistent!')
yerr_data.append(temp)
for i in range(len(cols)):
  yerr_data.append(None)
colors=standard_colors[:len(cols)]
if len(cols)!=len(colors):
  raise RuntimeError("Number of columns and number of colors must be consistent!")
ind=np.arange(len(data[0]))
single_bar_width=float(width)/len(data)
ax=fig.add_subplot(111)
for i in range(len(data)):
  legend_data.append(ax.bar(ind+i*single_bar_width, data[i], single_bar_width,
                            bottom=bottom, color=colors[i], yerr=yerr_data[i],
                            ecolor='black')[0])
nice_title="coolest barplot on earth"
ax.set_title(nice_title, size='x-large', fontweight='bold')
ax.set_ylabel(nice_y)
if len(data[0])!=len(x_labels):
  raise ValueError('Number of xlabels is not consistent with number of rows!')
for i in range(1,len(data[0])+1):
  x_labels.append('Row '+str(i))
ax.set_xticks(ind+width*0.5)
ax.set_xticklabels(x_labels, rotation = x_labels_rotation)
ax.legend(legend_data, cols)
def PlotHexbin(self, x, y, title=None, x_title=None, y_title=None, x_range=None, y_range=None, binning='log',
               colormap='jet', show_scalebar=False, scalebar_label=None, clear=True, save=False, show=False):
Create a heatplot of the data in col x vs the data in col y using matplotlib.

:param x: column name with x data
:type x: :class:`str`

:param y: column name with y data
:type y: :class:`str`

:param title: title of the plot, will be generated automatically if set to None
:type title: :class:`str`

:param x_title: label of x-axis, will be generated automatically if set to None
:type x_title: :class:`str`

:param y_title: label of y-axis, will be generated automatically if set to None
:type y_title: :class:`str`

:param x_range: start and end value for first dimension (e.g. [start_x, end_x])
:type x_range: :class:`list` of length two

:param y_range: start and end value for second dimension (e.g. [start_y, end_y])
:type y_range: :class:`list` of length two

:param binning: type of binning. If set to None, the value of a hexbin will
  correspond to the number of datapoints falling into it. If
  set to 'log', the value will be the log with base 10 of the above
  value (log(i+1)). If an integer is provided, the value of a
  hexbin is equal to the number of datapoints falling into it divided
  by the integer. If a list of values is provided, these values
  will be the lower bounds of the bins.

:param colormap: colormap that will be used. Value can be every colormap defined
  in matplotlib or an own defined colormap. You can either pass a
  string with the name of the matplotlib colormap or a colormap
  object.

:param show_scalebar: If set to True, a scalebar according to the chosen colormap is shown
:type show_scalebar: :class:`bool`

:param scalebar_label: Label of the scalebar
:type scalebar_label: :class:`str`

:param clear: clear old data from plot
:type clear: :class:`bool`

:param save: filename for saving plot
:type save: :class:`str`

:param show: directly show plot
:type show: :class:`bool`
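A minimal sketch (column names and file name are illustrative):

.. code-block:: python

  tab.PlotHexbin('x', 'y', binning='log', colormap='jet',
                 show_scalebar=True, save='hexbin.png')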
import matplotlib.pyplot as plt
import matplotlib.cm as cm
raise ImportError('PlotHexbin relies on matplotlib, but I could not import it')
if r[idx]!=None and r[idy]!=None:
  xdata.append(r[idx])
  ydata.append(r[idy])
title = '%s vs. %s' % (nice_x, nice_y)
colormap=getattr(cm, colormap)
if x_range and (IsScalar(x_range) or len(x_range)!=2):
  raise ValueError('parameter x_range must contain exactly two elements')
if y_range and (IsScalar(y_range) or len(y_range)!=2):
  raise ValueError('parameter y_range must contain exactly two elements')
ext = [min(xdata),max(xdata),min(ydata),max(ydata)]
plt.xlim((x_range[0], x_range[1]))
plt.ylim(y_range[0], y_range[1])
plt.hexbin(xdata, ydata, bins=binning, cmap=colormap, extent=ext)
plt.title(title, size='x-large', fontweight='bold',
          verticalalignment='bottom')
cb.set_label(scalebar_label)
Returns the row containing the cell with the maximal value in col. If
several rows have the highest value, only the first one is returned.
None values are ignored.

:param col: column name
:type col: :class:`str`

:returns: row with maximal col value or None if the table is empty
val, idx = self._Max(col)
return self.rows[idx]
Returns the maximum value in col. If several rows have the highest value,
only the first one is returned. None values are ignored.

:param col: column name
:type col: :class:`str`
val, idx = self._Max(col)

Returns the row index of the cell with the maximal value in col. If
several rows have the highest value, only the first one is returned.
None values are ignored.

:param col: column name
:type col: :class:`str`
val, idx = self._Max(col)
def _Min(self, col):
if len(self.rows)==0:
if col_type=='int' or col_type=='float':
  min_val=float('inf')
elif col_type=='bool':
elif col_type=='string':
for i,row in enumerate(self.rows):
  if row[idx]!=None and row[idx]<min_val:
return min_val, min_idx
Returns the minimal value in col. If several rows have the lowest value,
only the first one is returned. None values are ignored.

:param col: column name
:type col: :class:`str`
val, idx = self._Min(col)

Returns the row containing the cell with the minimal value in col. If
several rows have the lowest value, only the first one is returned.
None values are ignored.

:param col: column name
:type col: :class:`str`

:returns: row with minimal col value or None if the table is empty
val, idx = self._Min(col)
return self.rows[idx]

Returns the row index of the cell with the minimal value in col. If
several rows have the lowest value, only the first one is returned.
None values are ignored.

:param col: column name
:type col: :class:`str`
val, idx = self._Min(col)
Returns the sum of the given column. Cells with None are ignored. Returns
0.0, if the column doesn't contain any elements. Col must be of numeric
column type ('float', 'int') or boolean column type.

:param col: column name
:type col: :class:`str`

:raises: :class:`TypeError` if column type is ``string``
if col_type!='int' and col_type!='float' and col_type!='bool':
  raise TypeError("Sum can only be used on numeric column types")
Returns the mean of the given column. Cells with None are ignored. Returns
None, if the column doesn't contain any elements. Col must be of numeric
('float', 'int') or boolean column type.

If the column type is *bool*, the function returns the ratio of the
number of 'Trues' to the total number of elements.

:param col: column name
:type col: :class:`str`

:raises: :class:`TypeError` if column type is ``string``
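A short sketch combining the basic statistics on a numeric column (the column name is illustrative):

.. code-block:: python

  print 'mean   %.3f' % tab.Mean('score')
  print 'median %.3f' % tab.Median('score')
  print 'stddev %.3f' % tab.StdDev('score')
  print 'min    %.3f' % tab.Min('score')
  print 'max    %.3f' % tab.Max('score')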
if col_type!='int' and col_type!='float' and col_type!='bool':
  raise TypeError("Mean can only be used on numeric or bool column types")
return stutil.Mean(vals)
Adds a new column of type 'float' with a specified name (*mean_col_name*),
containing the mean of all specified columns for each row.

Cols are specified by their names and must be of numeric column
type ('float', 'int') or boolean column type. Cells with None are ignored.
Adds None if the row doesn't contain any values.

:param mean_col_name: name of new column containing mean values
:type mean_col_name: :class:`str`

:param cols: name or list of names of columns to include in computation of
  the mean
:type cols: :class:`str` or :class:`list` of strings

:raises: :class:`TypeError` if column type of columns in *col* is ``string``

Starting with the following table:

the code here adds a column with the name 'mean' to yield the table below:

.. code-block:: python

  tab.RowMean('mean', ['x', 'u'])
idx = self.GetColIndex(col)
col_type = self.col_types[idx]
if col_type!='int' and col_type!='float' and col_type!='bool':
  raise TypeError("RowMean can only be used on numeric column types")
cols_idxs.append(idx)
for row in self.rows:
for idx in cols_idxs:
mean = stutil.Mean(vals)
mean_rows.append(mean)
mean_rows.append(None)
self.AddCol(mean_col_name, 'f', mean_rows)
def Percentiles(self, col, nths):
Returns the percentiles of column *col* given in *nths*.

The percentiles are calculated as

.. code-block:: python

  values[min(len(values), int(round(len(values)*p/100+0.5)-1))]

where values are the sorted values of *col* not equal to None.

:param nths: list of percentiles to be calculated. Each percentile is a number
  between 0 and 100.

:raises: :class:`TypeError` if column type is ``string``
:returns: List of percentiles in the same order as given in *nths*
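A short sketch (the column name is illustrative):

.. code-block:: python

  quartiles = tab.Percentiles('score', [25, 50, 75])
  # quartiles[1] is the 50th percentile (median) of 'score'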
idx = self.GetColIndex(col)
col_type = self.col_types[idx]
if col_type!='int' and col_type!='float' and col_type!='bool':
  raise TypeError("Percentiles can only be used on numeric column types")
if nth < 0 or nth > 100:
  raise ValueError("percentiles must be between 0 and 100")
return [None]*len(nths)
p=vals[min(len(vals)-1, int(round(len(vals)*nth/100.0+0.5)-1))]
percentiles.append(p)
def Median(self, col):
Returns the median of the given column. Cells with None are ignored. Returns
None, if the column doesn't contain any elements. Col must be of numeric
column type ('float', 'int') or boolean column type.

:param col: column name
:type col: :class:`str`

:raises: :class:`TypeError` if column type is ``string``
idx = self.GetColIndex(col)
col_type = self.col_types[idx]
if col_type!='int' and col_type!='float' and col_type!='bool':
  raise TypeError("Median can only be used on numeric column types")
return stutil.Median(vals)
def StdDev(self, col):
Returns the standard deviation of the given column. Cells with None are
ignored. Returns None, if the column doesn't contain any elements. Col must
be of numeric column type ('float', 'int') or boolean column type.

:param col: column name
:type col: :class:`str`

:raises: :class:`TypeError` if column type is ``string``
idx = self.GetColIndex(col)
col_type = self.col_types[idx]
if col_type!='int' and col_type!='float' and col_type!='bool':
  raise TypeError("StdDev can only be used on numeric column types")
return stutil.StdDev(vals)
def Count(self, col, ignore_nan=True):
Count the number of cells in column that are not equal to None.

:param col: column name
:type col: :class:`str`

:param ignore_nan: ignore all *None* values
:type ignore_nan: :class:`bool`
idx=self.GetColIndex(col)
def Correl(self, col1, col2):
Calculate the Pearson correlation coefficient between *col1* and *col2*, only
taking rows into account where both of the values are not equal to *None*.
If there are not enough data points to calculate a correlation coefficient,
None is returned.

:param col1: column name for first column
:type col1: :class:`str`

:param col2: column name for second column
:type col2: :class:`str`
if IsStringLike(col1) and IsStringLike(col2):
  col1 = self.GetColIndex(col1)
  col2 = self.GetColIndex(col2)
vals1, vals2=([],[])
for v1, v2 in zip(self[col1], self[col2]):
  if v1!=None and v2!=None:
return stutil.Correl(vals1, vals2)
def SpearmanCorrel(self, col1, col2):
Calculate the Spearman correlation coefficient between col1 and col2, only
taking rows into account where both of the values are not equal to None. If
there are not enough data points to calculate a correlation coefficient,
None is returned.

:warning: The function depends on the following module: *scipy.stats.mstats*

:param col1: column name for first column
:type col1: :class:`str`

:param col2: column name for second column
:type col2: :class:`str`
import scipy.stats.mstats
if IsStringLike(col1) and IsStringLike(col2):
  col1 = self.GetColIndex(col1)
  col2 = self.GetColIndex(col2)
vals1, vals2=([],[])
for v1, v2 in zip(self[col1], self[col2]):
  if v1!=None and v2!=None:
correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
if scipy.isnan(correl):
LogError("Function needs scipy.stats.mstats, but I could not import it.")
def Save(self, stream_or_filename, format='ost', sep=','):
Save the table to stream or filename. The following file formats
are supported (for more information on file formats, see :meth:`Load`):

=============   =======================================
ost             ost-specific format (human readable)
csv             comma separated values (human readable)
pickle          pickled byte stream (binary)
html            HTML table
context         ConTeXt table
=============   =======================================

:param stream_or_filename: filename or stream for writing output
:type stream_or_filename: :class:`str` or :class:`file`

:param format: output format (i.e. *ost*, *csv*, *pickle*)
:type format: :class:`str`

:raises: :class:`ValueError` if format is unknown
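Typical invocations (the file names are illustrative):

.. code-block:: python

  tab.Save('scores.tab')                        # ost format (default)
  tab.Save('scores.csv', format='csv', sep=';')
  tab.Save('scores.pickle', format='pickle')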
format=format.lower()
return self._SaveOST(stream_or_filename)
return self._SaveCSV(stream_or_filename, sep=sep)
if format=='pickle':
  return self._SavePickle(stream_or_filename)
return self._SaveHTML(stream_or_filename)
if format=='context':
  return self._SaveContext(stream_or_filename)
raise ValueError('unknown format "%s"' % format)

def _SavePickle(self, stream):
if not hasattr(stream, 'write'):
  stream=open(stream, 'wb')
cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
def _SaveHTML(self, stream_or_filename):
return s.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
if not hasattr(stream_or_filename, 'write'):
  stream = open(stream_or_filename, 'w')
stream = stream_or_filename
stream.write('<table>')
stream.write('<tr>')
for col_name in self.col_names:
  stream.write('<th>%s</th>' % _escape(col_name))
stream.write('</tr>')
for row in self.rows:
  stream.write('<tr>')
  for i, col in enumerate(row):
if self.col_types[i] == 'float':
elif self.col_types[i] == 'int':
elif self.col_types[i] == 'bool':
  val = col and 'true' or 'false'
stream.write('<td>%s</td>' % _escape(val))
stream.write('</tr>')
stream.write('</table>')
def _SaveContext(self, stream_or_filename):
if not hasattr(stream_or_filename, 'write'):
  stream = open(stream_or_filename, 'w')
stream = stream_or_filename
stream.write('\\starttable[')
for col_type in self.col_types:
  if col_type=='string':
elif col_type=='int':
elif col_type=='float':
  stream.write('i3r|')
stream.write(']\n\\HL\n')
for col_name in self.col_names:
  stream.write('\\NC \\bf %s' % col_name)
stream.write(' \\AR\\HL\n')
for row in self.rows:
  for i, col in enumerate(row):
if self.col_types[i] == 'float':
elif self.col_types[i] == 'int':
elif self.col_types[i] == 'bool':
  val = col and 'true' or 'false'
stream.write('\\NC %s' % val)
stream.write(' \\AR\n')
stream.write('\\HL\n')
stream.write('\\stoptable')
def _SaveCSV(self, stream, sep):
if not hasattr(stream, 'write'):
  stream=open(stream, 'wb')
writer=csv.writer(stream, delimiter=sep)
writer.writerow(['%s' % n for n in self.col_names])
for row in self.rows:
for i, c in enumerate(row):
writer.writerow(row)

def _SaveOST(self, stream):
if hasattr(stream, 'write'):
  writer=csv.writer(stream, delimiter=' ')
stream=open(stream, 'w')
writer=csv.writer(stream, delimiter=' ')
stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
for row in self.rows:
for i, c in enumerate(row):
writer.writerow(row)
def GetNumpyMatrix(self, *args):
Returns a numpy matrix containing the selected columns from the table as
columns in the matrix.

Only columns of type *int* or *float* are supported. *NA* values in the
table will be converted to *None* values.

:param \*args: column names to include in numpy matrix

:warning: The function depends on *numpy*
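A short sketch (the column names are illustrative):

.. code-block:: python

  m = tab.GetNumpyMatrix('x', 'y')   # one matrix column per selected table column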
raise RuntimeError("At least one column must be specified.")
idx = self.GetColIndex(arg)
col_type = self.col_types[idx]
if col_type!='int' and col_type!='float':
  raise TypeError("Numpy matrix can only be generated from numeric column types")
m = np.matrix([list(self[i]) for i in idxs])
LogError("Function needs numpy, but I could not import it.")
def GaussianSmooth(self, col, std=1.0, na_value=0.0, padding='reflect', c=0.0):
In-place Gaussian smooth of a column in the table with a given standard deviation.
All None values are set to *na_value* before smoothing.

:param col: column name
:type col: :class:`str`

:param std: standard deviation for gaussian kernel

:param na_value: all NA (None) values of the specified column are set to na_value before smoothing
:type na_value: `scalar`

:param padding: allows to handle padding behaviour, see the scipy ndimage.gaussian_filter1d documentation for more information. The default is 'reflect'.
:type padding: :class:`str`

:param c: constant value used for padding if padding mode is constant

:warning: The function depends on *scipy*
from scipy import ndimage
LogError("I need scipy.ndimage and numpy, but could not import it")
idx = self.GetColIndex(col)
col_type = self.col_types[idx]
if col_type!='int' and col_type!='float':
  raise TypeError("GaussianSmooth can only be used on numeric column types")
vals.append(na_value)
smoothed_values_ndarray=ndimage.gaussian_filter1d(vals, std, mode=padding, cval=c)
for v in smoothed_values_ndarray:
def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
This returns the optimal prefactor values (i.e. a, b, c, ...) for the
following equation

a*u + b*v + c*w + ... = z

where u, v, w and z are vectors. In matrix notation

where A contains the data from the table (u,v,w,...), p are the prefactors
to optimize (a,b,c,...) and z is the vector containing the result of
the equation above.

The parameter ref_col corresponds to z in both equations, and \*args are columns
u, v and w (or A in :eq:`op2`). All columns must be specified by their names.

.. code-block:: python

  tab.GetOptimalPrefactors('colC', 'colA', 'colB')

The function returns a list containing the prefactors a, b, c, ... in
the correct order (i.e. the same as the columns were specified in \*args).

If the kwarg weights="columX" is specified, the equations are weighted by
the values in that column. Each row is multiplied by the weight in that row,
which leads to :eq:`op3`:

weight*a*u + weight*b*v + weight*c*w + ... = weight*z

Weights must be float or int and can have any value. A value of 0 ignores
this equation, a value of 1 means the same as no weight. If all weights are
the same for each row, the same result will be obtained as with no weights.

.. code-block:: python

  tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
raise RuntimeError("At least one column must be specified.")
b = self.GetNumpyMatrix(ref_col)
a = self.GetNumpyMatrix(*args)
if kwargs.has_key('weights'):
  w = self.GetNumpyMatrix(kwargs['weights'])
  b = np.multiply(b,w)
  a = np.multiply(a,w)
raise RuntimeError("specified unrecognized kwargs, use weights as key")
return list(np.array(k.T).reshape(-1))
LogError("Function needs numpy, but I could not import it.")
def PlotEnrichment(self, score_col, class_col, score_dir='-',
                   class_dir='-', class_cutoff=2.0,
                   style='-', title=None, x_title=None, y_title=None,
                   clear=True, save=None):
Plot an enrichment curve using matplotlib of column *score_col* classified
according to *class_col*.

For more information about parameters of the enrichment, see
:meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.

:warning: The function depends on *matplotlib*
import matplotlib.pyplot as plt
enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
                                    class_dir, class_cutoff)
title = 'Enrichment of %s'%score_col
x_title = '% database'
y_title = '% positives'
plt.plot(enrx, enry, style)
plt.title(title, size='x-large', fontweight='bold')
plt.ylabel(y_title, size='x-large')
plt.xlabel(x_title, size='x-large')
LogError("Function needs matplotlib, but I could not import it.")
def ComputeEnrichment(self, score_col, class_col, score_dir='-',
                      class_dir='-', class_cutoff=2.0):
Computes the enrichment of column *score_col* classified according to
*class_col*.

For this it is necessary that the datapoints are classified into positive
and negative points. This can be done in two ways:

- by using one 'bool' type column (*class_col*) which contains *True* for
  positives and *False* for negatives

- by specifying a classification column (*class_col*), a cutoff value
  (*class_cutoff*) and the classification column's direction (*class_dir*).
  This will generate the classification on the fly

  * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
  * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives

During the calculation, the table will be sorted according to *score_dir*,
where a '-' value means smallest values first and therefore, the smaller
the value, the better.

:warning: If either the value of *class_col* or *score_col* is *None*, the
  data in this row is ignored.
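A minimal sketch (column names, direction and cutoff are illustrative):

.. code-block:: python

  # 'score': smaller is better; 'active' is a bool classification column
  enr_x, enr_y = tab.ComputeEnrichment('score', 'active', score_dir='-')
  auc = tab.ComputeEnrichmentAUC('score', 'active', score_dir='-')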
ALLOWED_DIR = ['+','-']
score_idx = self.GetColIndex(score_col)
score_type = self.col_types[score_idx]
if score_type!='int' and score_type!='float':
  raise TypeError("Score column must be numeric type")
class_idx = self.GetColIndex(class_col)
class_type = self.col_types[class_idx]
if class_type!='int' and class_type!='float' and class_type!='bool':
  raise TypeError("Classifier column must be numeric or bool type")
if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
self.Sort(score_col, score_dir)
old_score_val = None
for row in self.rows:
  class_val = row[class_idx]
  score_val = row[score_idx]
  if class_val==None or score_val==None:
if old_score_val==None:
  old_score_val = score_val
if score_val!=old_score_val:
old_score_val = score_val
if class_type=='bool':
if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
# if no false positives or false negatives are found, return None
if x[-1]==0 or y[-1]==0:
x = [float(v)/x[-1] for v in x]
y = [float(v)/y[-1] for v in y]
def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
                         class_dir='-', class_cutoff=2.0):
Computes the area under the curve of the enrichment using the trapezoidal
rule.

For more information about parameters of the enrichment, see
:meth:`ComputeEnrichment`.

:warning: The function depends on *numpy*
enr = self.ComputeEnrichment(score_col, class_col, score_dir,
                             class_dir, class_cutoff)
return np.trapz(enr[1], enr[0])
LogError("Function needs numpy, but I could not import it.")
def ComputeROC(self, score_col, class_col, score_dir='-',
               class_dir='-', class_cutoff=2.0):
Computes the receiver operating characteristics (ROC) of column *score_col*
classified according to *class_col*.

For this it is necessary that the datapoints are classified into positive
and negative points. This can be done in two ways:

- by using one 'bool' column (*class_col*) which contains True for positives
  and False for negatives
- by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
  and the classification column's direction (*class_dir*). This will generate
  the classification on the fly

  - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
  - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives

During the calculation, the table will be sorted according to *score_dir*,
where a '-' value means smallest values first and therefore, the smaller
the value, the better.

If *class_col* does not contain any positives (i.e. value is True (if column
is of type bool) or evaluated to True (if column is of type int or float
(depending on *class_dir* and *class_cutoff*))) the ROC is not defined and
the function will return *None*.

:warning: If either the value of *class_col* or *score_col* is *None*, the
  data in this row is ignored.
ALLOWED_DIR = ['+','-']
score_idx = self.GetColIndex(score_col)
score_type = self.col_types[score_idx]
if score_type!='int' and score_type!='float':
  raise TypeError("Score column must be numeric type")
class_idx = self.GetColIndex(class_col)
class_type = self.col_types[class_idx]
if class_type!='int' and class_type!='float' and class_type!='bool':
  raise TypeError("Classifier column must be numeric or bool type")
if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
  raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
self.Sort(score_col, score_dir)
old_score_val = None
for i,row in enumerate(self.rows):
  class_val = row[class_idx]
  score_val = row[score_idx]
  if class_val==None or score_val==None:
if old_score_val==None:
  old_score_val = score_val
if score_val!=old_score_val:
old_score_val = score_val
if class_type=='bool':
if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
# if no false positives or false negatives are found, return None
if x[-1]==0 or y[-1]==0:
x = [float(v)/x[-1] for v in x]
y = [float(v)/y[-1] for v in y]
def ComputeROCAUC(self, score_col, class_col, score_dir='-',
                  class_dir='-', class_cutoff=2.0):
Computes the area under the curve of the receiver operating characteristics
using the trapezoidal rule.

For more information about parameters of the ROC, see :meth:`ComputeROC`.

:warning: The function depends on *numpy*
roc = self.ComputeROC(score_col, class_col, score_dir,
                      class_dir, class_cutoff)
return np.trapz(roc[1], roc[0])
LogError("Function needs numpy, but I could not import it.")
  def PlotROC(self, score_col, class_col, score_dir='-',
              class_dir='-', class_cutoff=2.0,
              style='-', title=None, x_title=None, y_title=None,
              clear=True, save=None):
    """
    Plot an ROC curve using matplotlib.

    For more information about the parameters of the ROC, see
    :meth:`ComputeROC`, and for plotting see :meth:`Plot`.

    :warning: The function depends on *matplotlib*
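
    A minimal usage sketch (requires matplotlib; the column and file names
    are made up for illustration):

    .. code-block:: python

      # plot the ROC of the hypothetical 'score' column and save it to a file
      tab.PlotROC('score', 'rmsd', class_cutoff=2.0,
                  title='ROC of score', save='roc.png')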
    """
    try:
      import matplotlib.pyplot as plt
      roc = self.ComputeROC(score_col, class_col, score_dir,
                            class_dir, class_cutoff)
      if roc==None:
        return None
      enrx, enry = roc
      if title==None:
        title = 'ROC of %s'%score_col
      if x_title==None:
        x_title = 'false positive rate'
      if y_title==None:
        y_title = 'true positive rate'
      if clear:
        plt.clf()
      plt.plot(enrx, enry, style)
      plt.title(title, size='x-large', fontweight='bold')
      plt.ylabel(y_title, size='x-large')
      plt.xlabel(x_title, size='x-large')
      if save:
        plt.savefig(save)
      return plt
    except ImportError:
      LogError("Function needs matplotlib, but I could not import it.")
  def ComputeMCC(self, score_col, class_col, score_dir='-',
                 class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
    """
    Compute the Matthews correlation coefficient (MCC) for one column
    (*score_col*) with the points classified into true positives, false
    positives, true negatives and false negatives according to a specified
    classification column (*class_col*).

    The datapoints in *score_col* and *class_col* are classified into
    positive and negative points. This can be done in two ways:

    - by using 'bool' columns which contain True for positives and False
      for negatives
    - by using 'float' or 'int' columns and specifying a cutoff value and the
      column's direction. This will generate the classification on the fly:

      * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
      * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives

    The two possibilities can be used together, i.e. 'bool' type for one column
    and 'float'/'int' type and cutoff/direction for the other column.
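
    A minimal usage sketch (the column names are made up for illustration and
    assume a table ``tab`` containing them):

    .. code-block:: python

      # hypothetical columns: predicted 'score' and measured 'rmsd';
      # values <= the respective cutoff count as positives
      mcc = tab.ComputeMCC('score', 'rmsd', score_dir='-', class_dir='-',
                           score_cutoff=1.5, class_cutoff=2.0)
      if mcc is not None:
        print('MCC: %.3f' % mcc)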
    """
    ALLOWED_DIR = ['+','-']

    score_idx = self.GetColIndex(score_col)
    score_type = self.col_types[score_idx]
    if score_type!='int' and score_type!='float' and score_type!='bool':
      raise TypeError("Score column must be numeric or bool type")

    class_idx = self.GetColIndex(class_col)
    class_type = self.col_types[class_idx]
    if class_type!='int' and class_type!='float' and class_type!='bool':
      raise TypeError("Classifier column must be numeric or bool type")

    if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
      raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))

    tp = 0
    fp = 0
    fn = 0
    tn = 0

    for i,row in enumerate(self.rows):
      class_val = row[class_idx]
      score_val = row[score_idx]
      if class_val==None or score_val==None:
        continue
      # datapoint is a real positive
      if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
        if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
          tp += 1
        else:
          fn += 1
      # datapoint is a real negative
      else:
        if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
          tn += 1
        else:
          fp += 1

    msg = None
    if (tp+fn)==0:
      msg = 'factor (tp + fn) is zero'
    elif (tp+fp)==0:
      msg = 'factor (tp + fp) is zero'
    elif (tn+fn)==0:
      msg = 'factor (tn + fn) is zero'
    elif (tn+fp)==0:
      msg = 'factor (tn + fp) is zero'
    if msg:
      LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
      return None

    mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
    return mcc
  def IsEmpty(self, col_name=None, ignore_nan=True):
    """
    Checks if a table is empty.

    If no column name is specified, the whole table is checked for being empty,
    whereas if a column name is specified, only this column is checked.

    By default, all NAN (or None) values are ignored, and thus, a table
    containing only NAN values is considered empty. By specifying the option
    ignore_nan=False, NAN values are counted as 'normal' values.
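
    A minimal usage sketch (the column name ``score`` is made up for
    illustration):

    .. code-block:: python

      tab = Table(['score'], 'f')
      print(tab.IsEmpty())            # True: table has no rows
      tab.AddRow({'score': 1.0})
      print(tab.IsEmpty())            # False
      print(tab.IsEmpty('score'))     # False: the column contains a value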
    """
    # table with no columns and no rows
    if len(self.col_names)==0:
      if col_name:
        raise ValueError('Table has no column named "%s"' % col_name)
      return True

    # column name specified
    if col_name:
      return self.Count(col_name, ignore_nan=ignore_nan)==0

    # no column name specified -> test whole table
    for row in self.rows:
      for cell in row:
        if cell!=None or not ignore_nan:
          return False
    return True
  def Extend(self, tab, overwrite=None):
    """
    Append each row of *tab* to the current table. The data is appended based
    on the column names, thus the order of the table columns is *not* relevant,
    only the header names.

    If there is a column in *tab* that is not present in the current table,
    it is added to the current table and filled with *None* for all the rows
    present in the current table.

    If the type of any column in *tab* is not the same as in the current table,
    a *TypeError* is raised.

    If *overwrite* is not None and set to an existing column name, the specified
    column in the table is searched for the first occurrence of a value matching
    the value of the column with the same name in the dictionary. If a matching
    value is found, the row is overwritten with the dictionary. If no matching
    row is found, a new row is appended to the table.
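
    A minimal usage sketch (column names and values are made up for
    illustration):

    .. code-block:: python

      tab1 = Table(['x', 'y'], 'if', x=[1, 2], y=[0.5, 1.0])
      tab2 = Table(['x', 'z'], 'is', x=[3], z=['three'])
      # 'z' is added to tab1 and filled with None for the existing rows
      tab1.Extend(tab2)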
    """
    # add column to current table if it doesn't exist
    for name, typ in zip(tab.col_names, tab.col_types):
      if not name in self.col_names:
        self.AddCol(name, typ)

    # check that column types are the same in current and new table
    for name in self.col_names:
      if name in tab.col_names:
        curr_type = self.col_types[self.GetColIndex(name)]
        new_type = tab.col_types[tab.GetColIndex(name)]
        if curr_type!=new_type:
          raise TypeError('cannot extend table, column %s in new '%name +\
                          'table has different type (%s) than in '%new_type +\
                          'current table (%s)'%curr_type)

    num_rows = len(tab.rows)
    for i in range(0, num_rows):
      row = tab.rows[i]
      data = dict(zip(tab.col_names, row))
      self.AddRow(data, overwrite)
def Merge(table1, table2, by, only_matching=False):
  """
  Returns a new table containing the data from both tables. The rows are
  combined based on the common values in the column(s) *by*. The option *by*
  can be a list of column names. When this is the case, merging is based on
  the common values in all of the given columns.

  For example, two tables that both contain a column *x*, when merged by
  column *x*, produce a combined table in which rows with matching *x* values
  are joined and any remaining cells are filled with None.
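
  A minimal usage sketch (the table contents are made up for illustration):

  .. code-block:: python

    tab1 = Table(['x', 'y'], 'if', x=[1, 2, 3], y=[10.0, 15.0, 20.0])
    tab2 = Table(['x', 'z'], 'if', x=[1, 3, 4], z=[100.0, 200.0, 500.0])
    merged = Merge(tab1, tab2, by='x')
    # rows with x==2 and x==4 get None in the column they are missing from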
  """
  def _key(row, indices):
    return tuple([row[i] for i in indices])
  def _keep(indices, cn, ct, ni):
    ncn, nct, nni=([],[],[])
    for i in range(len(cn)):
      if i not in indices:
        ncn.append(cn[i])
        nct.append(ct[i])
        nni.append(ni[i])
    return ncn, nct, nni

  col_names=list(table2.col_names)
  col_types=list(table2.col_types)
  new_index=[i for i in range(len(col_names))]
  if isinstance(by, str):
    common2_indices=[col_names.index(by)]
  else:
    common2_indices=[col_names.index(b) for b in by]
  col_names, col_types, new_index=_keep(common2_indices, col_names,
                                        col_types, new_index)

  # rename columns of the second table that clash with columns of the first
  for i, name in enumerate(col_names):
    try_name=name
    counter=0
    while try_name in table1.col_names:
      counter+=1
      try_name='%s_%d' % (name, counter)
    col_names[i]=try_name

  if isinstance(by, str):
    common1_indices=[table1.col_names.index(by)]
  else:
    common1_indices=[table1.col_names.index(b) for b in by]

  # build lookup dictionaries keyed by the values of the 'by' column(s)
  common1={}
  for row in table1.rows:
    key=_key(row, common1_indices)
    if key in common1:
      raise ValueError('duplicate key "%s" in first table' % (str(key)))
    common1[key]=row
  common2={}
  for row in table2.rows:
    key=_key(row, common2_indices)
    if key in common2:
      raise ValueError('duplicate key "%s" in second table' % (str(key)))
    common2[key]=row

  new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
  for k, v in common1.iteritems():
    row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
    matched=False
    if k in common2:
      matched=True
      row2=common2[k]
      for i, index in enumerate(new_index):
        row[len(table1.col_names)+i]=row2[index]
    if only_matching and not matched:
      continue
    new_tab.AddRow(row)

  if not only_matching:
    for k, v in common2.iteritems():
      if not k in common1:
        v2=[v[i] for i in new_index]
        row=[None for i in range(len(table1.col_names))]+v2
        for common1_index, common2_index in zip(common1_indices, common2_indices):
          row[common1_index]=v[common2_index]