8 from ost
import LogError, LogWarning, LogInfo, LogVerbose
11 return col_name.replace(
'_',
' ')
14 if isinstance(value, TableCol)
or isinstance(value, BinaryColExpr):
23 value=value.strip().upper()
24 return value
in (
'',
'NULL',
'NONE',
'NA')
30 if isinstance(value, TableCol)
or isinstance(value, BinaryColExpr):
39 possibilities=set([
'bool',
'int',
'float'])
41 str_ele=str(ele).upper()
45 if 'int' in possibilities:
49 possibilities.remove(
'int')
51 if 'float' in possibilities:
55 possibilities.remove(
'float')
56 if 'bool' in possibilities:
57 if str_ele
not in set([
'YES',
'NO',
'TRUE',
'FALSE']):
58 possibilities.remove(
'bool')
60 if len(possibilities)==0:
62 if len(possibilities)==2:
67 return possibilities.pop()
75 self.
lhs=itertools.cyle([self.
lhs])
77 self.
rhs=itertools.cycle([self.
rhs])
79 for l, r
in zip(self.
lhs, self.
rhs):
80 if l!=
None and r!=
None:
102 for row
in self._table.rows:
106 return len(self._table.rows)
109 return self._table.rows[index][self.
col_index]
112 self._table.rows[index][self.
col_index]=value
129 The table class provides convenient access to data in tabular form. An empty
130 table can be easily constructed as follows
132 .. code-block:: python
136 If you want to add columns directly when creating the table, column names
137 and *column types* can be specified as follows
139 .. code-block:: python
141 tab=Table(['nameX','nameY','nameZ'], 'sfb')
143 this will create three columns called nameX, nameY and nameZ of type string,
144 float and bool, respectively. There will be no data in the table and thus,
145 the table will not contain any rows.
147 The following *column types* are supported:
158 If you want to add data to the table in addition, use the following:
160 .. code-block:: python
162 tab=Table(['nameX','nameY','nameZ'],
165 nameY=[0.1, 1.2, 3.414],
166 nameZ=[True, False, False])
168 if values for one column are left out, they will be filled with NA, but if
169 values are specified, all values must be specified (i.e. same number of
174 SUPPORTED_TYPES=(
'int',
'float',
'bool',
'string',)
177 def __init__(self, col_names=None, col_types=None, **kwargs):
186 self.
col_names=[v
for v
in kwargs.keys()]
193 def _ParseColTypes(types, exp_num=None):
197 short2long = {
's' :
'string',
'i':
'int',
'b' :
'bool',
'f' :
'float'}
198 allowed_short = short2long.keys()
199 allowed_long = short2long.values()
206 types = types.lower()
209 if types
in allowed_long:
210 type_list.append(types)
211 elif types
in allowed_short:
212 type_list.append(short2long[types])
215 elif types.find(
',')!=-1:
216 for t
in types.split(
','):
217 if t
in allowed_long:
219 elif t
in allowed_short:
220 type_list.append(short2long[t])
222 raise ValueError(
'Unknown type %s in types %s'%(t,types))
227 if t
in allowed_short:
228 type_list.append(short2long[t])
230 raise ValueError(
'Unknown type %s in types %s'%(t,types))
234 raise ValueError(
'Col type %s must be string or list'%types)
242 if t
in allowed_long:
244 elif t
in allowed_short:
245 type_list.append(short2long[t])
247 raise ValueError(
'Unknown type %s in types %s'%(t,types))
251 raise ValueError(
'Col type %s must be string or list'%types)
254 if len(type_list)!=exp_num:
255 raise ValueError(
'Parsed number of col types (%i) differs from ' + \
256 'expected (%i) in types %s'%(len(type_list),exp_num,types))
262 Set name of the table
264 :type name: :class:`str`
274 def _Coerce(self, value, ty):
276 Try to convert values (e.g. from :class:`str` type) to the specified type
278 :param value: the value
279 :type value: any type
281 :param ty: name of type to convert it to (i.e. *int*, *float*, *string*,
283 :type ty: :class:`str`
285 if value==
'NA' or value==
None:
294 if isinstance(value, str)
or isinstance(value, unicode):
295 if value.upper()
in (
'FALSE',
'NO',):
299 raise ValueError(
'Unknown type %s' % ty)
303 Returns the column index for the column with the given name.
305 :raises: ValueError if no column with the name is found
308 raise ValueError(
'Table has no column named "%s"' % col)
309 return self.col_names.index(col)
313 Returns a list containing all column names.
319 Checks if the column with a given name is present in the table.
334 value=itertools.cycle([value])
335 for r, v
in zip(self.
rows, value):
338 def ToString(self, float_format='%.3f', int_format='%d', rows=None):
340 Convert the table into a string representation.
342 The output format can be modified for int and float type columns by
343 specifying a formatting string for the parameters 'float_format' and
346 The option 'rows' specify the range of rows to be printed. The parameter
347 must be a type that supports indexing (e.g. a :class:`list`) containing the
348 start and end row *index*, e.g. [start_row_idx, end_row_idx].
350 :param float_format: formatting string for float columns
351 :type float_format: :class:`str`
353 :param int_format: formatting string for int columns
354 :type int_format: :class:`str`
356 :param rows: iterable containing start and end row *index*
357 :type rows: iterable containing :class:`ints <int>`
359 widths=[len(cn)
for cn
in self.
col_names]
362 sel_rows=self.
rows[rows[0]:rows[1]]
364 for i, (ty, col)
in enumerate(zip(self.
col_types, row)):
366 widths[i]=max(widths[i], len(
'NA'))
368 widths[i]=max(widths[i], len(float_format % col))
370 widths[i]=max(widths[i], len(int_format % col))
372 widths[i]=max(widths[i], len(str(col)))
375 s+=
''.join([
'# %s\n' % l
for l
in self.comment.split(
'\n')])
376 total_width=sum(widths)+2*len(widths)
377 for width, col_name
in zip(widths, self.
col_names):
378 s+=col_name.center(width+2)
379 s+=
'\n%s\n' % (
'-'*total_width)
381 for width, ty, col
in zip(widths, self.
col_types, row):
384 cs=
'NA'.center(width+2)
386 cs=(float_format % col).rjust(width+2)
388 cs=(int_format % col).rjust(width+2)
390 cs=
' '+str(col).ljust(width+1)
398 def _AddRowsFromDict(self, d, overwrite=None):
400 Add one or more rows from a :class:`dictionary <dict>`.
402 If *overwrite* is not None and set to an existing column name, the specified
403 column in the table is searched for the first occurrence of a value matching
404 the value of the column with the same name in the dictionary. If a matching
405 value is found, the row is overwritten with the dictionary. If no matching
406 row is found, a new row is appended to the table.
408 :param d: dictionary containing the data
409 :type d: :class:`dict`
411 :param overwrite: column name to overwrite existing row if value in
412 column *overwrite* matches
413 :type overwrite: :class:`str`
415 :raises: :class:`ValueError` if multiple rows are added but the number of
416 data items is different for different columns.
423 for k,v
in d.iteritems():
429 elif old_len!=len(v):
430 raise ValueError(
"Cannot add rows: length of data must be equal " + \
431 "for all columns in %s"%str(d))
434 for i,data
in enumerate(zip(*d.values())):
435 new_row = [
None for a
in range(len(self.
col_names))]
436 for idx,v
in zip(idxs,data):
443 for i,r
in enumerate(self.
rows):
444 if r[overwrite_idx]==new_row[overwrite_idx]:
445 for j,e
in enumerate(self.
rows[i]):
448 self.
rows[i] = new_row
453 if not overwrite
or not added:
454 self.rows.append(new_row)
459 Add a row to the table.
461 *data* may either be a dictionary or a list-like object:
463 - If *data* is a dictionary the keys in the dictionary must match the
464 column names. Columns not found in the dict will be initialized to None.
465 If the dict contains list-like objects, multiple rows will be added, if
466 the number of items in all list-like objects is the same, otherwise a
467 :class:`ValueError` is raised.
469 - If *data* is a list-like object, the row is initialized from the values
470 in *data*. The number of items in *data* must match the number of
471 columns in the table. A :class:`ValueError` is raised otherwise. The
472 values are added in the order specified in the list, thus, the order of
473 the data must match the columns.
475 If *overwrite* is not None and set to an existing column name, the specified
476 column in the table is searched for the first occurrence of a value matching
477 the value of the column with the same name in the dictionary. If a matching
478 value is found, the row is overwritten with the dictionary. If no matching
479 row is found, a new row is appended to the table.
481 :param data: data to add
482 :type data: :class:`dict` or *list-like* object
484 :param overwrite: column name to overwrite existing row if value in
485 column *overwrite* matches
486 :type overwrite: :class:`str`
488 :raises: :class:`ValueError` if *list-like* object is used and number of
489 items does *not* match number of columns in table.
491 :raises: :class:`ValueError` if *dict* is used and multiple rows are added
492 but the number of data items is different for different columns.
494 **Example:** add multiple data rows to a subset of columns using a dictionary
496 .. code-block:: python
498 # create table with three float columns
499 tab = Table(['x','y','z'], 'fff')
502 data = {'x': [1.2, 1.6], 'z': [1.6, 5.3]}
507 will produce the table
517 # overwrite the row with x=1.2 and add row with x=1.9
518 data = {'x': [1.2, 1.9], 'z': [7.9, 3.5]}
519 tab.AddRow(data, overwrite='x')
523 will produce the table
539 msg=
'data array must have %d elements, not %d'
540 raise ValueError(msg % (len(self.
col_names), len(data)))
547 for i,r
in enumerate(self.
rows):
548 if r[overwrite_idx]==new_row[overwrite_idx]:
549 self.
rows[i] = new_row
554 if not overwrite
or not added:
555 self.rows.append(new_row)
559 Remove column with the given name from the table
561 :param col: name of column to remove
562 :type col: :class:`str`
567 for row
in self.
rows:
570 def AddCol(self, col_name, col_type, data=None):
572 Add a column to the right of the table.
574 :param col_name: name of new column
575 :type col_name: :class:`str`
577 :param col_type: type of new column (long versions: *int*, *float*, *bool*,
578 *string* or short versions: *i*, *f*, *b*, *s*)
579 :type col_type: :class:`str`
581 :param data: data to add to new column.
582 :type data: scalar or iterable
586 .. code-block:: python
588 tab=Table(['x'], 'f', x=range(5))
589 tab.AddCol('even', 'bool', itertools.cycle([True, False]))
593 will produce the table
606 If data is a constant instead of an iterable object, its value
607 will be written into each row:
609 .. code-block:: python
611 tab=Table(['x'], 'f', x=range(5))
612 tab.AddCol('num', 'i', 1)
616 will produce the table
631 :meth:`AddCol` only adds data to existing rows and does *not*
632 add new rows. Use :meth:`AddRow` to do this. Therefore, the following code
633 snippet does not add any data items:
635 .. code-block:: python
638 tab.AddCol('even', 'int', [1,2,3,4,5])
642 will produce the empty table
651 self.col_names.append(col_name)
652 self.col_types.append(col_type)
654 for row
in self.
rows:
657 for row, d
in zip(self.
rows, data):
662 Returns a filtered table only containing rows matching all the predicates
663 in kwargs and args. For example,
665 .. code-block:: python
667 tab.Filter(town='Basel')
669 will return all the rows where the value of the column "town" is equal to
670 "Basel". Several predicates may be combined, i.e.
672 .. code-block:: python
674 tab.Filter(town='Basel', male=True)
676 will return the rows with "town" equal to "Basel" and "male" equal to true.
677 args are unary callables returning true if the row should be included in the
678 result and false if not.
681 for row
in self.
rows:
687 for key, val
in kwargs.iteritems():
696 def _LoadOST(stream_or_filename):
697 fieldname_pattern=re.compile(
r'(?P<name>[^[]+)(\[(?P<type>\w+)\])?')
698 values_pattern=re.compile(
"([^\" ]+|\"[^\"]*\")+")
699 if not hasattr(stream_or_filename,
'read'):
700 stream=open(stream_or_filename,
'r')
702 stream=stream_or_filename
707 if line.startswith(
'#'):
715 for col
in line.split():
716 match=fieldname_pattern.match(col)
718 if match.group(
'type'):
719 fieldtypes.append(match.group(
'type'))
721 fieldtypes.append(
'string')
722 fieldnames.append(match.group(
'name'))
723 tab=
Table(fieldnames, fieldtypes)
726 tab.AddRow([x.strip(
'"')
for x
in values_pattern.findall(line)])
728 raise IOError(
"Cannot read table from empty stream")
731 def _GuessColumnTypes(self):
732 for col_idx
in range(len(self.
col_names)):
734 for row
in self.
rows:
735 for idx
in range(len(row)):
739 def _LoadCSV(stream_or_filename, sep):
740 if not hasattr(stream_or_filename,
'read'):
741 stream=open(stream_or_filename,
'r')
743 stream=stream_or_filename
744 reader=csv.reader(stream, delimiter=sep)
750 tab=
Table(header, types)
755 raise IOError(
'trying to load table from empty CSV stream/file')
757 tab._GuessColumnTypes()
761 def _LoadPickle(stream_or_filename):
762 if not hasattr(stream_or_filename,
'read'):
763 stream=open(stream_or_filename,
'rb')
765 stream=stream_or_filename
766 return cPickle.load(stream)
769 def _GuessFormat(filename):
771 filename = filename.name
772 except AttributeError, e:
774 if filename.endswith(
'.csv'):
776 elif filename.endswith(
'.pickle'):
783 def Load(stream_or_filename, format='auto', sep=','):
785 Load table from stream or file with given name.
787 By default, the file format is set to *auto*, which tries to guess the file
788 format from the file extension. The following file extensions are
791 ============ ======================
792 extension recognized format
793 ============ ======================
794 .csv comma separated values
795 .pickle pickled byte stream
796 <all others> ost-specific format
797 ============ ======================
799 Thus, *format* must be specified for reading file with different filename
802 The following file formats are understood:
806 This is an ost-specific, but still human readable file format. The file
807 (stream) must start with header line of the form
809 col_name1[type1] <col_name2[type2]>...
811 The types given in brackets must be one of the data types the
812 :class:`Table` class understands. Each following line in the file then must
813 contain exactly the same number of data items as listed in the header. The
814 data items are automatically converted to the column format. Lines starting
815 with a '#' and empty lines are ignored.
819 Deserializes the table from a pickled byte stream
823 Reads the table from comma separated values stream. Since there is no
824 explicit type information in the csv file, the column types are guessed,
825 using the following simple rules:
827 * if all values are either NA/NULL/NONE the type is set to string
828 * if all non-null values are convertible to float/int the type is set to
830 * if all non-null values are true/false/yes/no, the type is set to bool
831 * for all other cases, the column type is set to string
833 :returns: A new :class:`Table` instance
835 format=format.lower()
837 format = Table._GuessFormat(stream_or_filename)
840 return Table._LoadOST(stream_or_filename)
842 return Table._LoadCSV(stream_or_filename, sep=sep)
844 return Table._LoadPickle(stream_or_filename)
845 raise ValueError(
'unknown format ""' % format)
849 Performs an in-place sort of the table, based on column *by*.
851 :param by: column name by which to sort
852 :type by: :class:`str`
854 :param order: ascending (``-``) or descending (``+``) order
855 :type order: :class:`str` (i.e. *+*, *-*)
861 def _key_cmp(lhs, rhs):
862 return sign*cmp(lhs[key_index], rhs[key_index])
863 self.
rows=sorted(self.
rows, _key_cmp)
867 Extract a list of all unique values from one column
869 :param col: column name
870 :type col: :class:`str`
872 :param ignore_nan: ignore all *None* values
873 :type ignore_nan: :class:`bool`
878 for row
in self.
rows:
880 if item!=
None or ignore_nan==
False:
881 if item
in seen:
continue
888 Allows to conveniently iterate over a selection of columns, e.g.
890 .. code-block:: python
892 tab=Table.Load('...')
893 for col1, col2 in tab.Zip('col1', 'col2'):
898 .. code-block:: python
900 tab=Table.Load('...')
901 for col1, col2 in zip(tab['col1'], tab['col2']):
904 return zip(*[self[arg]
for arg
in args])
906 def Plot(self, x, y=None, z=None, style='.', x_title=None, y_title=None,
907 z_title=
None, x_range=
None, y_range=
None, z_range=
None,
908 color=
None, plot_if=
None, legend=
None,
909 num_z_levels=10, diag_line=
False, labels=
None, max_num_labels=
None,
910 title=
None, clear=
True, save=
False, **kwargs):
912 Function to plot values from your table in 1, 2 or 3 dimensions using
913 `Matplotlib <http://matplotlib.sourceforge.net>`__
915 :param x: column name for first dimension
916 :type x: :class:`str`
918 :param y: column name for second dimension
919 :type y: :class:`str`
921 :param z: column name for third dimension
922 :type z: :class:`str`
924 :param style: symbol style (e.g. *.*, *-*, *x*, *o*, *+*, *\**). For a
925 complete list check (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
926 :type style: :class:`str`
928 :param x_title: title for first dimension, if not specified it is
929 automatically derived from column name
930 :type x_title: :class:`str`
932 :param y_title: title for second dimension, if not specified it is
933 automatically derived from column name
934 :type y_title: :class:`str`
936 :param z_title: title for third dimension, if not specified it is
937 automatically derived from column name
938 :type z_title: :class:`str`
940 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
941 :type x_range: :class:`list` of length two
943 :param y_range: start and end value for second dimension (e.g. [start_y, end_y])
944 :type y_range: :class:`list` of length two
946 :param z_range: start and end value for third dimension (e.g. [start_z, end_z])
947 :type z_range: :class:`list` of length two
949 :param color: color for data (e.g. *b*, *g*, *r*). For a complete list check
950 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.plot>`__).
951 :type color: :class:`str`
953 :param plot_if: callable which returns *True* if row should be plotted. Is
954 invoked like ``plot_if(self, row)``
955 :type plot_if: callable
957 :param legend: legend label for data series
958 :type legend: :class:`str`
960 :param num_z_levels: number of levels for third dimension
961 :type num_z_levels: :class:`int`
963 :param diag_line: draw diagonal line
964 :type diag_line: :class:`bool`
966 :param labels: column name containing labels to put on x-axis for one
968 :type labels: :class:`str`
970 :param max_num_labels: limit maximum number of labels
971 :type max_num_labels: :class:`int`
973 :param title: plot title, if not specified it is automatically derived from
975 :type title: :class:`str`
977 :param clear: clear old data from plot
978 :type clear: :class:`bool`
980 :param save: filename for saving plot
981 :type save: :class:`str`
983 :param \*\*kwargs: additional arguments passed to matplotlib
985 :returns: the ``matplotlib.pyplot`` module
987 **Examples:** simple plotting functions
989 .. code-block:: python
991 tab=Table(['a','b','c','d'],'iffi', a=range(5,0,-1),
992 b=[x/2.0 for x in range(1,6)],
993 c=[math.cos(x) for x in range(0,5)],
996 # one dimensional plot of column 'd' vs. index
1000 # two dimensional plot of 'a' vs. 'c'
1001 plt=tab.Plot('a', y='c', style='o-')
1004 # three dimensional plot of 'a' vs. 'c' with values 'b'
1005 plt=tab.Plot('a', y='c', z='b')
1006 # manually save plot to file
1007 plt.savefig("plot.png")
1010 import matplotlib.pyplot
as plt
1011 import matplotlib.mlab
as mlab
1019 plt.figure(figsize=[8, 6])
1042 if x_range
and (
IsScalar(x_range)
or len(x_range)!=2):
1043 raise ValueError(
'parameter x_range must contain exactly two elements')
1044 if y_range
and (
IsScalar(y_range)
or len(y_range)!=2):
1045 raise ValueError(
'parameter y_range must contain exactly two elements')
1046 if z_range
and (
IsScalar(z_range)
or len(z_range)!=2):
1047 raise ValueError(
'parameter z_range must contain exactly two elements')
1050 kwargs[
'color']=color
1052 kwargs[
'label']=legend
1056 for row
in self.
rows:
1057 if row[idx1]!=
None and row[idx2]!=
None and row[idx3]!=
None:
1058 if plot_if
and not plot_if(self, row):
1060 xs.append(row[idx1])
1061 ys.append(row[idx2])
1062 zs.append(row[idx3])
1065 z_spacing = (z_range[1] - z_range[0]) / num_z_levels
1069 z_spacing = (self.
Max(z) - l) / num_z_levels
1071 for i
in range(0,num_z_levels+1):
1075 xi = np.linspace(min(xs)-0.1,max(xs)+0.1,len(xs)*10)
1076 yi = np.linspace(min(ys)-0.1,max(ys)+0.1,len(ys)*10)
1077 zi = mlab.griddata(xs, ys, zs, xi, yi)
1079 plt.contour(xi,yi,zi,levels,linewidths=0.5,colors=
'k')
1080 plt.contourf(xi,yi,zi,levels,cmap=plt.cm.jet)
1081 plt.colorbar(ticks=levels)
1085 for row
in self.
rows:
1086 if row[idx1]!=
None and row[idx2]!=
None:
1087 if plot_if
and not plot_if(self, row):
1089 xs.append(row[idx1])
1090 ys.append(row[idx2])
1091 plt.plot(xs, ys, style, **kwargs)
1098 for row
in self.
rows:
1100 if plot_if
and not plot_if(self, row):
1102 xs.append(row[idx1])
1104 label_vals.append(row[label_idx])
1105 plt.plot(xs, style, **kwargs)
1109 if len(label_vals)>max_num_labels:
1110 interval = int(math.ceil(float(len(label_vals))/max_num_labels))
1111 label_vals = label_vals[::interval]
1112 plt.xticks(np.arange(0, len(xs), interval), label_vals, rotation=45,
1117 title =
'%s of %s vs. %s' % (nice_z, nice_x, nice_y)
1119 title =
'%s vs. %s' % (nice_x, nice_y)
1123 plt.title(title, size=
'x-large', fontweight=
'bold',
1124 verticalalignment=
'bottom')
1130 plt.xlabel(nice_x, size=
'x-large')
1132 plt.xlim(x_range[0], x_range[1])
1134 plt.ylim(y_range[0], y_range[1])
1136 plt.plot(x_range, y_range,
'-')
1138 plt.ylabel(nice_y, size=
'x-large')
1141 plt.ylim(y_range[0], y_range[1])
1143 plt.xlabel(x_title, size=
'x-large')
1144 plt.ylabel(nice_y, size=
'x-large')
1149 LogError(
"Function needs numpy and matplotlib, but I could not import it.")
1152 def PlotHistogram(self, col, x_range=None, num_bins=10, normed=False,
1153 histtype=
'stepfilled', align=
'mid', x_title=
None,
1154 y_title=
None, title=
None, clear=
True, save=
False):
1156 Create a histogram of the data in col for the range *x_range*, split into
1157 *num_bins* bins and plot it using Matplotlib.
1159 :param col: column name with data
1160 :type col: :class:`str`
1162 :param x_range: start and end value for first dimension (e.g. [start_x, end_x])
1163 :type x_range: :class:`list` of length two
1165 :param num_bins: number of bins in range
1166 :type num_bins: :class:`int`
1168 :param normed: normalize histogram
1169 :type normed: :class:`bool`
1171 :param histtype: type of histogram (i.e. *bar*, *barstacked*, *step*,
1172 *stepfilled*). See (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1173 :type histtype: :class:`str`
1175 :param align: style of histogram (*left*, *mid*, *right*). See
1176 (`matplotlib docu <http://matplotlib.sourceforge.net/api/pyplot_api.html#matplotlib.pyplot.hist>`__).
1177 :type align: :class:`str`
1179 :param x_title: title for first dimension, if not specified it is
1180 automatically derived from column name
1181 :type x_title: :class:`str`
1183 :param y_title: title for second dimension, if not specified it is
1184 automatically derived from column name
1185 :type y_title: :class:`str`
1187 :param title: plot title, if not specified it is automatically derived from
1188 plotted column names
1189 :type title: :class:`str`
1191 :param clear: clear old data from plot
1192 :type clear: :class:`bool`
1194 :param save: filename for saving plot
1195 :type save: :class:`str`
1197 **Examples:** simple plotting functions
1199 .. code-block:: python
1201 tab=Table(['a'],'f', a=[math.cos(x*0.01) for x in range(100)])
1203 # histogram of the values in column 'a'
1204 plt=tab.PlotHistogram('a')
1209 import matplotlib.pyplot
as plt
1212 if len(self.
rows)==0:
1224 n, bins, patches = plt.hist(data, bins=num_bins, range=x_range,
1225 normed=normed, histtype=histtype, align=align)
1231 plt.xlabel(nice_x, size=
'x-large')
1237 plt.ylabel(nice_y, size=
'x-large')
1242 nice_title=
"Histogram of %s"%nice_x
1243 plt.title(nice_title, size=
'x-large', fontweight=
'bold')
1249 LogError(
"Function needs numpy and matplotlib, but I could not import it.")
1252 def _Max(self, col):
1253 if len(self.
rows)==0:
1257 if col_type==
'int' or col_type==
'float':
1258 max_val = -float(
'inf')
1259 elif col_type==
'bool':
1261 elif col_type==
'string':
1264 for i
in range(0, len(self.
rows)):
1265 if self.
rows[i][idx]>max_val:
1266 max_val = self.
rows[i][idx]
1268 return max_val, max_idx
1272 Returns the row containing the cell with the maximal value in col. If
1273 several rows have the highest value, only the first one is returned.
1274 None values are ignored.
1276 :param col: column name
1277 :type col: :class:`str`
1279 val, idx = self.
_Max(col)
1280 return self.
rows[idx]
1284 Returns the maximum value in col. If several rows have the highest value,
1285 only the first one is returned. None values are ignored.
1287 :param col: column name
1288 :type col: :class:`str`
1290 val, idx = self.
_Max(col)
1295 Returns the row index of the cell with the maximal value in col. If
1296 several rows have the highest value, only the first one is returned.
1297 None values are ignored.
1299 :param col: column name
1300 :type col: :class:`str`
1302 val, idx = self.
_Max(col)
1305 def _Min(self, col):
1306 if len(self.
rows)==0:
1310 if col_type==
'int' or col_type==
'float':
1311 min_val=float(
'inf')
1312 elif col_type==
'bool':
1314 elif col_type==
'string':
1317 for i,row
in enumerate(self.
rows):
1318 if row[idx]!=
None and row[idx]<min_val:
1321 return min_val, min_idx
1325 Returns the minimal value in col. If several rows have the lowest value,
1326 only the first one is returned. None values are ignored.
1328 :param col: column name
1329 :type col: :class:`str`
1331 val, idx = self.
_Min(col)
1336 Returns the row containing the cell with the minimal value in col. If
1337 several rows have the lowest value, only the first one is returned.
1338 None values are ignored.
1340 :param col: column name
1341 :type col: :class:`str`
1343 val, idx = self.
_Min(col)
1344 return self.
rows[idx]
1348 Returns the row index of the cell with the minimal value in col. If
1349 several rows have the lowest value, only the first one is returned.
1350 None values are ignored.
1352 :param col: column name
1353 :type col: :class:`str`
1355 val, idx = self.
_Min(col)
1360 Returns the sum of the given column. Cells with None are ignored. Returns
1361 0.0, if the column doesn't contain any elements. Col must be of numeric
1362 column type ('float', 'int') or boolean column type.
1364 :param col: column name
1365 :type col: :class:`str`
1367 :raises: :class:`TypeError` if column type is ``string``
1371 if col_type!=
'int' and col_type!=
'float' and col_type!=
'bool':
1372 raise TypeError(
"Sum can only be used on numeric column types")
1381 Returns the mean of the given column. Cells with None are ignored. Returns
1382 None, if the column doesn't contain any elements. Col must be of numeric
1383 ('float', 'int') or boolean column type.
1385 If column type is *bool*, the function returns the ratio of
1386 number of 'Trues' by total number of elements.
1388 :param col: column name
1389 :type col: :class:`str`
1391 :raises: :class:`TypeError` if column type is ``string``
1395 if col_type!=
'int' and col_type!=
'float' and col_type!=
'bool':
1396 raise TypeError(
"Mean can only be used on numeric or bool column types")
1403 return stutil.Mean(vals)
1409 Adds a new column of type 'float' with a specified name (*mean_col_name*),
1410 containing the mean of all specified columns for each row.
1412 Cols are specified by their names and must be of numeric column
1413 type ('float', 'int') or boolean column type. Cells with None are ignored.
1414 Adds None if the row doesn't contain any values.
1416 :param mean_col_name: name of new column containing mean values
1417 :type mean_col_name: :class:`str`
1419 :param cols: name or list of names of columns to include in computation of
1421 :type cols: :class:`str` or :class:`list` of strings
1423 :raises: :class:`TypeError` if column type of columns in *col* is ``string``
1427 Starting with the following table:
1437 the code here adds a column with the name 'mean' to yield the table below:
1439 .. code-block:: python
1441 tab.RowMean('mean', ['x', 'u'])
1444 ==== ==== ==== =====
1446 ==== ==== ==== =====
1450 ==== ==== ==== =====
1459 idx = self.GetColIndex(col)
1460 col_type = self.col_types[idx]
1461 if col_type!='int' and col_type!='float' and col_type!='bool':
1462 raise TypeError("RowMean can only be used on numeric column types")
1463 cols_idxs.append(idx)
1466 for row in self.rows:
1468 for idx in cols_idxs:
1473 mean = stutil.Mean(vals)
1474 mean_rows.append(mean)
1476 mean_rows.append(None)
1478 self.AddCol(mean_col_name, 'f', mean_rows)
1480 def Median(self, col):
1482 Returns the median of the given column. Cells with
None are ignored. Returns
1483 None,
if the column doesn
't contain any elements. Col must be of numeric
1484 column type ('float',
'int')
or boolean column type.
1486 :param col: column name
1487 :type col: :
class:`str`
1489 :raises: :
class:`TypeError`
if column type
is ``string``
1491 idx = self.GetColIndex(col)
1492 col_type = self.col_types[idx]
1493 if col_type!='int' and col_type!='float' and col_type!='bool':
1494 raise TypeError("Median can only be used on numeric column types")
1502 return stutil.Median(vals)
1506 def StdDev(self, col):
1508 Returns the standard deviation of the given column. Cells with
None are
1509 ignored. Returns
None,
if the column doesn
't contain any elements. Col must
1510 be of numeric column type ('float',
'int')
or boolean column type.
1512 :param col: column name
1513 :type col: :
class:`str`
1515 :raises: :
class:`TypeError`
if column type
is ``string``
1517 idx = self.GetColIndex(col)
1518 col_type = self.col_types[idx]
1519 if col_type!='int' and col_type!='float' and col_type!='bool':
1520 raise TypeError("StdDev can only be used on numeric column types")
1527 return stutil.StdDev(vals)
1531 def Count(self, col, ignore_nan=True):
1533 Count the number of cells
in column that are
not equal to
None.
1535 :param col: column name
1536 :type col: :
class:`str`
1538 :param ignore_nan: ignore all *
None* values
1539 :type ignore_nan: :
class:`bool`
1542 idx=self.GetColIndex(col)
1551 def Correl(self, col1, col2):
1553 Calculate the Pearson correlation coefficient between *col1*
and *col2*, only
1554 taking rows into account where both of the values are
not equal to *
None*.
1555 If there are
not enough data points to calculate a correlation coefficient,
1558 :param col1: column name
for first column
1559 :type col1: :
class:`str`
1561 :param col2: column name
for second column
1562 :type col2: :
class:`str`
1564 if IsStringLike(col1) and IsStringLike(col2):
1565 col1 = self.GetColIndex(col1)
1566 col2 = self.GetColIndex(col2)
1567 vals1, vals2=([],[])
1568 for v1, v2 in zip(self[col1], self[col2]):
1569 if v1!=None and v2!=None:
1573 return stutil.Correl(vals1, vals2)
1577 def SpearmanCorrel(self, col1, col2):
1579 Calculate the Spearman correlation coefficient between col1
and col2, only
1580 taking rows into account where both of the values are
not equal to
None. If
1581 there are
not enough data points to calculate a correlation coefficient,
1584 :warning: The function depends on the following module: *scipy.stats.mstats*
1586 :param col1: column name
for first column
1587 :type col1: :
class:`str`
1589 :param col2: column name
for second column
1590 :type col2: :
class:`str`
1593 import scipy.stats.mstats
1595 if IsStringLike(col1) and IsStringLike(col2):
1596 col1 = self.GetColIndex(col1)
1597 col2 = self.GetColIndex(col2)
1598 vals1, vals2=([],[])
1599 for v1, v2 in zip(self[col1], self[col2]):
1600 if v1!=None and v2!=None:
1604 correl = scipy.stats.mstats.spearmanr(vals1, vals2)[0]
1605 if scipy.isnan(correl):
1612 LogError("Function needs scipy.stats.mstats, but I could not import it.")
def Save(self, stream_or_filename, format='ost', sep=','):
    """
    Write the table to a stream or to a file. The output format is selected by
    name (case-insensitive); see :meth:`Load` for details on the formats:

    =============   =======================================
    ost             ost-specific format (human readable)
    csv             comma separated values (human readable)
    pickle          pickled byte stream (binary)
    =============   =======================================

    :param stream_or_filename: filename or stream for writing output
    :type stream_or_filename: :class:`str` or :class:`file`

    :param format: output format (i.e. *ost*, *csv*, *pickle*)
    :type format: :class:`str`

    :param sep: column separator, only handed on to the *csv* writer
    :type sep: :class:`str`

    :raises: :class:`ValueError` if format is unknown
    """
    fmt = format.lower()
    if fmt not in ('ost', 'csv', 'pickle'):
        raise ValueError('unknown format "%s"' % fmt)
    # csv is the only writer that takes the separator
    if fmt == 'csv':
        return self._SaveCSV(stream_or_filename, sep=sep)
    if fmt == 'ost':
        return self._SaveOST(stream_or_filename)
    return self._SavePickle(stream_or_filename)
def _SavePickle(self, stream):
    """
    Dump the table object itself as a pickle, using the highest protocol.

    :param stream: writable binary stream, or name of the file to write
    :type stream: :class:`str` or :class:`file`

    A stream handed in by the caller is left open. A file that is opened here
    from a filename is closed again before returning, also when pickling fails.
    """
    opened_here = not hasattr(stream, 'write')
    if opened_here:
        stream = open(stream, 'wb')
    try:
        cPickle.dump(self, stream, cPickle.HIGHEST_PROTOCOL)
    finally:
        # only close what we opened ourselves; the caller owns its own stream
        if opened_here:
            stream.close()
def _SaveCSV(self, stream, sep):
    """
    Write the column names followed by all rows as delimiter separated values.

    :param stream: writable stream, or name of the file to write
    :type stream: :class:`str` or :class:`file`

    :param sep: delimiter placed between the cells of a row
    :type sep: :class:`str`

    Cells holding *None* are written as ``NA``; the rows of the table are not
    modified. A stream handed in by the caller is left open, a file opened here
    from a filename is closed before returning.
    """
    opened_here = not hasattr(stream, 'write')
    if opened_here:
        stream = open(stream, 'wb')
    try:
        writer = csv.writer(stream, delimiter=sep)
        writer.writerow(['%s' % n for n in self.col_names])
        for row in self.rows:
            # build a fresh list so the stored row keeps its None cells
            cells = list(row)
            for i, c in enumerate(cells):
                if c is None:
                    cells[i] = 'NA'
            writer.writerow(cells)
    finally:
        if opened_here:
            stream.close()
def _SaveOST(self, stream):
    """
    Write the table in the space separated ost format.

    :param stream: writable stream, or name of the file to write
    :type stream: :class:`str` or :class:`file`

    The output consists of the table comment (if there is one) with every line
    prefixed by ``# ``, a header row of ``name[type]`` entries and one line per
    row. Cells holding *None* are written as ``NA``; the rows of the table are
    not modified. A stream handed in by the caller is left open, a file opened
    here from a filename is closed before returning.
    """
    opened_here = not hasattr(stream, 'write')
    if opened_here:
        stream = open(stream, 'w')
    try:
        writer = csv.writer(stream, delimiter=' ')
        if self.comment:
            stream.write(''.join(['# %s\n' % l for l in self.comment.split('\n')]))
        writer.writerow(['%s[%s]' % t for t in zip(self.col_names, self.col_types)])
        for row in self.rows:
            # build a fresh list so the stored row keeps its None cells
            cells = list(row)
            for i, c in enumerate(cells):
                if c is None:
                    cells[i] = 'NA'
            writer.writerow(cells)
    finally:
        if opened_here:
            stream.close()
1679 def GetNumpyMatrix(self, *args):
1681 Returns a numpy matrix containing the selected columns from the table as
1682 columns in the matrix.
1683 Only columns of type *int* or *float* are supported. *NA* values in the
1684 table will be converted to *None* values.
1686 :param \*args: column names to include in numpy matrix
1688 :warning: The function depends on *numpy*
1694 raise RuntimeError("At least one column must be specified.")
1698 idx = self.GetColIndex(arg)
1699 col_type = self.col_types[idx]
1700 if col_type!='int' and col_type!='float':
1701 raise TypeError("Numpy matrix can only be generated from numeric column types")
1703 m = np.matrix([list(self[i]) for i in idxs])
1707 LogError("Function needs numpy, but I could not import it.")
1710 def GetOptimalPrefactors(self, ref_col, *args, **kwargs):
1712 This returns the optimal prefactor values (i.e. a, b, c, ...) for the
1718 a*u + b*v + c*w + ... = z
1720 where u, v, w and z are vectors. In matrix notation
1727 where A contains the data from the table (u,v,w,...), p are the prefactors
1728 to optimize (a,b,c,...) and z is the vector containing the result of
1731 The parameter ref_col equals to z in both equations, and \*args are columns
1732 u, v and w (or A in :eq:`op2`). All columns must be specified by their names.
1736 .. code-block:: python
1738 tab.GetOptimalPrefactors('colC', 'colA', 'colB')
1740 The function returns a list containing the prefactors a, b, c, ... in
1741 the correct order (i.e. same as columns were specified in \*args).
1744 If the kwarg weights="columX" is specified, the equations are weighted by
1745 the values in that column. Each row is multiplied by the weight in that row,
1746 which leads to :eq:`op3`:
1751 weight*a*u + weight*b*v + weight*c*w + ... = weight*z
1753 Weights must be float or int and can have any value. A value of 0 ignores
1754 this equation, a value of 1 means the same as no weight. If all weights are
1755 the same for each row, the same result will be obtained as with no weights.
1759 .. code-block:: python
1761 tab.GetOptimalPrefactors('colC', 'colA', 'colB', weights='colD')
1768 raise RuntimeError("At least one column must be specified.")
1770 b = self.GetNumpyMatrix(ref_col)
1771 a = self.GetNumpyMatrix(*args)
1774 if kwargs.has_key('weights'):
1775 w = self.GetNumpyMatrix(kwargs['weights'])
1776 b = np.multiply(b,w)
1777 a = np.multiply(a,w)
1780 raise RuntimeError("specified unrecognized kwargs, use weights as key")
1783 return list(np.array(k.T).reshape(-1))
1786 LogError("Function needs numpy, but I could not import it.")
1789 def PlotEnrichment(self, score_col, class_col, score_dir='-',
1790 class_dir='-', class_cutoff=2.0,
1791 style='-', title=None, x_title=None, y_title=None,
1792 clear=True, save=None):
1794 Plot an enrichment curve using matplotlib of column *score_col* classified
1795 according to *class_col*.
1797 For more information about parameters of the enrichment, see
1798 :meth:`ComputeEnrichment`, and for plotting see :meth:`Plot`.
1800 :warning: The function depends on *matplotlib*
1803 import matplotlib.pyplot as plt
1805 enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
1806 class_dir, class_cutoff)
1809 title = 'Enrichment of %s'%score_col
1812 x_title = '% database'
1815 y_title = '% positives'
1820 plt.plot(enrx, enry, style)
1822 plt.title(title, size='x-large', fontweight='bold')
1823 plt.ylabel(y_title, size='x-large')
1824 plt.xlabel(x_title, size='x-large')
1831 LogError("Function needs matplotlib, but I could not import it.")
1834 def ComputeEnrichment(self, score_col, class_col, score_dir='-',
1835 class_dir='-', class_cutoff=2.0):
1837 Computes the enrichment of column *score_col* classified according to
1840 For this it is necessary, that the datapoints are classified into positive
1841 and negative points. This can be done in two ways:
1843 - by using one 'bool' type column (*class_col*) which contains *True* for
1844 positives and *False* for negatives
1846 - by specifying a classification column (*class_col*), a cutoff value
1847 (*class_cutoff*) and the classification columns direction (*class_dir*).
1848 This will generate the classification on the fly
1850 * if ``class_dir=='-'``: values in the classification column that are less than or equal to class_cutoff will be counted as positives
1851 * if ``class_dir=='+'``: values in the classification column that are larger than or equal to class_cutoff will be counted as positives
1853 During the calculation, the table will be sorted according to *score_dir*,
1854 where a '-' value means smallest values first and therefore, the smaller
1855 the value, the better.
1859 ALLOWED_DIR = ['+','-']
1861 score_idx = self.GetColIndex(score_col)
1862 score_type = self.col_types[score_idx]
1863 if score_type!='int' and score_type!='float':
1864 raise TypeError("Score column must be numeric type")
1866 class_idx = self.GetColIndex(class_col)
1867 class_type = self.col_types[class_idx]
1868 if class_type!='int' and class_type!='float' and class_type!='bool':
1869 raise TypeError("Classifier column must be numeric or bool type")
1871 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
1872 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
1874 self.Sort(score_col, score_dir)
1879 for i,row in enumerate(self.rows):
1880 class_val = row[class_idx]
1882 if class_type=='bool':
1886 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
1890 x = [float(v)/x[-1] for v in x]
1891 y = [float(v)/y[-1] for v in y]
def ComputeEnrichmentAUC(self, score_col, class_col, score_dir='-',
                         class_dir='-', class_cutoff=2.0):
    """
    Computes the area under the curve of the enrichment using the trapezoidal
    rule.

    All parameters are handed on unchanged to :meth:`ComputeEnrichment`; see
    there for their meaning.

    :returns: area under the enrichment curve

    :raises: :class:`ImportError` if numpy is not available

    :warning: The function depends on *numpy*
    """
    # Guard the import only: an ImportError raised while the enrichment is
    # computed must not be reported as a missing numpy.
    try:
        import numpy as np
    except ImportError:
        LogError("Function needs numpy, but I could not import it.")
        raise

    enrx, enry = self.ComputeEnrichment(score_col, class_col, score_dir,
                                        class_dir, class_cutoff)

    # trapz is deprecated since NumPy 2.0 in favour of trapezoid; older
    # releases only know trapz
    integrate = getattr(np, 'trapezoid', None)
    if integrate is None:
        integrate = np.trapz
    return integrate(enry, enrx)
1916 def ComputeROC(self, score_col, class_col, score_dir='-',
1917 class_dir='-', class_cutoff=2.0):
1919 Computes the receiver operating characteristics (ROC) of column *score_col*
1920 classified according to *class_col*.
1922 For this it is necessary, that the datapoints are classified into positive
1923 and negative points. This can be done in two ways:
1925 - by using one 'bool' column (*class_col*) which contains True for positives
1926 and False for negatives
1927 - by using a non-bool column (*class_col*), a cutoff value (*class_cutoff*)
1928 and the classification columns direction (*class_dir*). This will generate
1929 the classification on the fly
1931 - if ``class_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff* will be counted as positives
1932 - if ``class_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff* will be counted as positives
1934 During the calculation, the table will be sorted according to *score_dir*,
1935 where a '-' value means smallest values first and therefore, the smaller
1936 the value, the better.
1938 If *class_col* does not contain any positives (i.e. value is True (if column
1939 is of type bool) or evaluated to True (if column is of type int or float
1940 (depending on *class_dir* and *class_cutoff*))) the ROC is not defined and
1941 the function will return *None*.
1944 ALLOWED_DIR = ['+','-']
1946 score_idx = self.GetColIndex(score_col)
1947 score_type = self.col_types[score_idx]
1948 if score_type!='int' and score_type!='float':
1949 raise TypeError("Score column must be numeric type")
1951 class_idx = self.GetColIndex(class_col)
1952 class_type = self.col_types[class_idx]
1953 if class_type!='int' and class_type!='float' and class_type!='bool':
1954 raise TypeError("Classifier column must be numeric or bool type")
1956 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
1957 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
1959 self.Sort(score_col, score_dir)
1965 old_score_val = None
1967 for i,row in enumerate(self.rows):
1968 class_val = row[class_idx]
1969 score_val = row[score_idx]
1971 if old_score_val==None:
1972 old_score_val = score_val
1973 if score_val!=old_score_val:
1976 old_score_val = score_val
1977 if class_type=='bool':
1983 if (class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff):
1990 # if no false positives or false negatives values are found return None
1991 if x[-1]==0 or y[-1]==0:
1994 x = [float(v)/x[-1] for v in x]
1995 y = [float(v)/y[-1] for v in y]
def ComputeROCAUC(self, score_col, class_col, score_dir='-',
                  class_dir='-', class_cutoff=2.0):
    """
    Computes the area under the curve of the receiver operating characteristics
    using the trapezoidal rule.

    All parameters are handed on unchanged to :meth:`ComputeROC`; see there for
    their meaning.

    :returns: area under the ROC curve, or *None* if :meth:`ComputeROC` does not
              return a curve

    :raises: :class:`ImportError` if numpy is not available

    :warning: The function depends on *numpy*
    """
    # Guard the import only: an ImportError raised while the ROC is computed
    # must not be reported as a missing numpy.
    try:
        import numpy as np
    except ImportError:
        LogError("Function needs numpy, but I could not import it.")
        raise

    roc = self.ComputeROC(score_col, class_col, score_dir,
                          class_dir, class_cutoff)
    if not roc:
        return None

    # trapz is deprecated since NumPy 2.0 in favour of trapezoid; older
    # releases only know trapz
    integrate = getattr(np, 'trapezoid', None)
    if integrate is None:
        integrate = np.trapz
    # roc[0] holds the x values, roc[1] the y values of the curve
    return integrate(roc[1], roc[0])
2022 def PlotROC(self, score_col, class_col, score_dir='-',
2023 class_dir='-', class_cutoff=2.0,
2024 style='-', title=None, x_title=None, y_title=None,
2025 clear=True, save=None):
2027 Plot an ROC curve using matplotlib.
2029 For more information about parameters of the ROC, see
2030 :meth:`ComputeROC`, and for plotting see :meth:`Plot`.
2032 :warning: The function depends on *matplotlib*
2036 import matplotlib.pyplot as plt
2038 roc = self.ComputeROC(score_col, class_col, score_dir,
2039 class_dir, class_cutoff)
2047 title = 'ROC of %s'%score_col
2050 x_title = 'false positive rate'
2053 y_title = 'true positive rate'
2058 plt.plot(enrx, enry, style)
2060 plt.title(title, size='x-large', fontweight='bold')
2061 plt.ylabel(y_title, size='x-large')
2062 plt.xlabel(x_title, size='x-large')
2069 LogError("Function needs matplotlib, but I could not import it.")
2072 def ComputeMCC(self, score_col, class_col, score_dir='-',
2073 class_dir='-', score_cutoff=2.0, class_cutoff=2.0):
2075 Compute Matthews correlation coefficient (MCC) for one column (*score_col*)
2076 with the points classified into true positives, false positives, true
2077 negatives and false negatives according to a specified classification
2078 column (*class_col*).
2080 The datapoints in *score_col* and *class_col* are classified into
2081 positive and negative points. This can be done in two ways:
2083 - by using 'bool' columns which contains True for positives and False
2086 - by using 'float' or 'int' columns and specifying a cutoff value and the
2087 columns direction. This will generate the classification on the fly
2089 * if ``class_dir``/``score_dir=='-'``: values in the classification column that are less than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2090 * if ``class_dir``/``score_dir=='+'``: values in the classification column that are larger than or equal to *class_cutoff*/*score_cutoff* will be counted as positives
2092 The two possibilities can be used together, i.e. 'bool' type for one column
2093 and 'float'/'int' type and cutoff/direction for the other column.
2095 ALLOWED_DIR = ['+','-']
2097 score_idx = self.GetColIndex(score_col)
2098 score_type = self.col_types[score_idx]
2099 if score_type!='int' and score_type!='float' and score_type!='bool':
2100 raise TypeError("Score column must be numeric or bool type")
2102 class_idx = self.GetColIndex(class_col)
2103 class_type = self.col_types[class_idx]
2104 if class_type!='int' and class_type!='float' and class_type!='bool':
2105 raise TypeError("Classifier column must be numeric or bool type")
2107 if (score_dir not in ALLOWED_DIR) or (class_dir not in ALLOWED_DIR):
2108 raise ValueError("Direction must be one of %s"%str(ALLOWED_DIR))
2115 for i,row in enumerate(self.rows):
2116 class_val = row[class_idx]
2117 score_val = row[score_idx]
2119 if (class_type=='bool' and class_val==True) or (class_type!='bool' and ((class_dir=='-' and class_val<=class_cutoff) or (class_dir=='+' and class_val>=class_cutoff))):
2120 if (score_type=='bool' and score_val==True) or (score_type!='bool' and ((score_dir=='-' and score_val<=score_cutoff) or (score_dir=='+' and score_val>=score_cutoff))):
2125 if (score_type=='bool' and score_val==False) or (score_type!='bool' and ((score_dir=='-' and score_val>score_cutoff) or (score_dir=='+' and score_val<score_cutoff))):
2133 msg = 'factor (tp + fn) is zero'
2135 msg = 'factor (tp + fp) is zero'
2137 msg = 'factor (tn + fn) is zero'
2139 msg = 'factor (tn + fp) is zero'
2142 LogWarning("Could not compute MCC: MCC is not defined since %s"%msg)
2144 mcc = ((tp*tn)-(fp*fn)) / math.sqrt((tp+fn)*(tp+fp)*(tn+fn)*(tn+fp))
2148 def IsEmpty(self, col_name=None, ignore_nan=True):
2150 Checks if a table is empty.
2152 If no column name is specified, the whole table is checked for being empty,
2153 whereas if a column name is specified, only this column is checked.
2155 By default, all NAN (or None) values are ignored, and thus, a table
2156 containing only NAN values is considered as empty. By specifying the
2157 option ignore_nan=False, NAN values are counted as 'normal' values.
2160 # table with no columns and no rows
2161 if len(self.col_names)==0:
2163 raise ValueError('Table has no column named "%s"' % col_name)
2166 # column name specified
2168 if self.Count(col_name, ignore_nan=ignore_nan)==0:
2173 # no column name specified -> test whole table
2175 for row in self.rows:
def Extend(self, tab, overwrite=None):
    """
    Append each row of *tab* to the current table. The data is appended based
    on the column names, thus the order of the table columns is *not* relevant,
    only the header names.

    If there is a column in *tab* that is not present in the current table,
    it is added to the current table and filled with *None* for all the rows
    present in the current table.

    If the type of any column in *tab* is not the same as in the current table
    a *TypeError* is raised. The check is done before anything is changed, so a
    failing call leaves the current table as it was.

    *overwrite* is handed on to :meth:`AddRow` for every row of *tab*: if it is
    not None and set to an existing column name, the specified column in the
    table is searched for the first occurrence of a value matching the value of
    that column in the row being added. If a matching value is found, the row
    is overwritten with the new row. If no matching row is found, a new row is
    appended to the table.

    :param tab: table whose rows are appended
    :param overwrite: column name used to find rows to overwrite, or None

    :raises: :class:`TypeError` if a shared column has different types
    """
    # check that column types are the same in current and new table; columns
    # only present in tab are created with tab's type and cannot clash
    for name in self.col_names:
        if name in tab.col_names:
            curr_type = self.col_types[self.GetColIndex(name)]
            new_type = tab.col_types[tab.GetColIndex(name)]
            if curr_type != new_type:
                raise TypeError('cannot extend table, column %s in new ' % name +
                                'table different type (%s) than in ' % new_type +
                                'current table (%s)' % curr_type)

    # add column to current table if it doesn't exist
    for name, typ in zip(tab.col_names, tab.col_types):
        if name not in self.col_names:
            self.AddCol(name, typ)

    # iterate over a snapshot: tab may be this very table, whose row list grows
    # while rows are appended
    for row in list(tab.rows):
        data = dict(zip(tab.col_names, row))
        self.AddRow(data, overwrite)
2225 def Merge(table1, table2, by, only_matching=False):
2227 Returns a new table containing the data
from both tables. The rows are
2228 combined based on the common values
in the column(s) by. The option
'by' can
2229 be a list of column names. When this
is the case, merging
is based on
2231 For example, the two tables below
2258 when merged by column x, produce the following output:
2260 def _key(row, indices):
2261 return tuple([row[i] for i in indices])
2262 def _keep(indices, cn, ct, ni):
2263 ncn, nct, nni=([],[],[])
2264 for i in range(len(cn)):
2265 if i not in indices:
2269 return ncn, nct, nni
2270 col_names=list(table2.col_names)
2271 col_types=list(table2.col_types)
2272 new_index=[i for i in range(len(col_names))]
2273 if isinstance(by, str):
2274 common2_indices=[col_names.index(by)]
2276 common2_indices=[col_names.index(b) for b in by]
2277 col_names, col_types, new_index=_keep(common2_indices, col_names,
2278 col_types, new_index)
2280 for i, name in enumerate(col_names):
2283 while try_name in table1.col_names:
2285 try_name='%s_%d' % (name, counter)
2286 col_names[i]=try_name
2288 if isinstance(by, str):
2289 common1_indices=[table1.col_names.index(by)]
2291 common1_indices=[table1.col_names.index(b) for b in by]
2292 for row in table1.rows:
2293 key=_key(row, common1_indices)
2295 raise ValueError('duplicate key "%s in first table"' % (str(key)))
2298 for row in table2.rows:
2299 key=_key(row, common2_indices)
2301 raise ValueError('duplicate key "%s" in second table' % (str(key)))
2303 new_tab=Table(table1.col_names+col_names, table1.col_types+col_types)
2304 for k, v in common1.iteritems():
2305 row=v+[None for i in range(len(table2.col_names)-len(common2_indices))]
2310 for i, index in enumerate(new_index):
2311 row[len(table1.col_names)+i]=row2[index]
2312 if only_matching and not matched:
2317 for k, v in common2.iteritems():
2318 if not k in common1:
2319 v2=[v[i] for i in new_index]
2320 row=[None for i in range(len(table1.col_names))]+v2
2321 for common1_index, common2_index in zip(common1_indices, common2_indices):
2322 row[common1_index]=v[common2_index]