Source code for pyexcel.sheets.nominablesheet

"""
    pyexcel.sheets.nominablesheet
    ~~~~~~~~~~~~~~~~~~~

    Building on top of filterablesheet, adding named columns and rows support

    :copyright: (c) 2014-2015 by Onni Software Ltd.
    :license: New BSD License, see LICENSE for more details
"""
from .matrix import Row, Column, Matrix
from .formattablesheet import FormattableSheet
from .filterablesheet import FilterableSheet
from ..formatters import (
    ColumnFormatter,
    RowFormatter,
    NamedColumnFormatter,
    NamedRowFormatter)
from .._compact import is_string, OrderedDict, PY2, is_array_type
from .._compact import is_tuple_consists_of_strings
from ..filters import ColumnIndexFilter, RowIndexFilter
from ..iterators import (
    ColumnIndexIterator,
    RowIndexIterator,
    NamedRowIterator,
    NamedColumnIterator
)
from ..constants import (
    MESSAGE_NOT_IMPLEMENTED_02,
    MESSAGE_DATA_ERROR_ORDEREDDICT_IS_EXPECTED,
    DEFAULT_NAME
)


def names_to_indices(names, series):
    if isinstance(names, str):
        indices = series.index(names)
    elif (isinstance(names, list) and
          isinstance(names[0], str)):
        # translate each row name to index
        indices = [series.index(astr) for astr in names]
    else:
        return names
    return indices


def make_names_unique(alist):
    duplicates = {}
    new_names = []
    for item in alist:
        if item in duplicates:
            duplicates[item] = duplicates[item] + 1
            new_names.append("%s-%d" % (item, duplicates[item]))
        else:
            duplicates[item] = 0
            new_names.append(str(item))
    return new_names


[docs]class NamedRow(Row):
    """Series Sheet would have Named Row instead of Row

    Here is an example to merge sheets. Suppose we have the
    following three files::

        >>> import pyexcel as pe
        >>> data = [[1,2,3],[4,5,6],[7,8,9]]
        >>> s = pe.Sheet(data)
        >>> s.save_as("1.csv")
        >>> data2 = [['a','b','c'],['d','e','f'],['g','h','i']]
        >>> s2 = pe.Sheet(data2)
        >>> s2.save_as("2.csv")
        >>> data3=[[1.1, 2.2, 3.3],[4.4, 5.5, 6.6],[7.7, 8.8, 9.9]]
        >>> s3=pe.Sheet(data3)
        >>> s3.save_as("3.csv")


        >>> merged = pe.Sheet()
        >>> for file in ["1.csv", "2.csv", "3.csv"]:
        ...     r = pe.get_sheet(file_name=file)
        ...     merged.row += r
        >>> merged.save_as("merged.csv")

    Now let's verify what we had::

        >>> sheet = pe.get_sheet(file_name="merged.csv")

    this is added to overcome doctest's inability to handle
    python 3's unicode::

        >>> sheet.format(lambda v: str(v))
        >>> sheet
        merged.csv:
        +-----+-----+-----+
        | 1   | 2   | 3   |
        +-----+-----+-----+
        | 4   | 5   | 6   |
        +-----+-----+-----+
        | 7   | 8   | 9   |
        +-----+-----+-----+
        | a   | b   | c   |
        +-----+-----+-----+
        | d   | e   | f   |
        +-----+-----+-----+
        | g   | h   | i   |
        +-----+-----+-----+
        | 1.1 | 2.2 | 3.3 |
        +-----+-----+-----+
        | 4.4 | 5.5 | 6.6 |
        +-----+-----+-----+
        | 7.7 | 8.8 | 9.9 |
        +-----+-----+-----+

    .. testcleanup::
        >>> import os
        >>> os.unlink("1.csv")
        >>> os.unlink("2.csv")
        >>> os.unlink("3.csv")
        >>> os.unlink("merged.csv")

    """
[docs]    def select(self, names):
        """Delete row indices other than specified

        Examples:

            >>> import pyexcel as pe
            >>> data = [[1],[2],[3],[4],[5],[6],[7],[9]]
            >>> sheet = pe.Sheet(data)
            >>> sheet
            pyexcel sheet:
            +---+
            | 1 |
            +---+
            | 2 |
            +---+
            | 3 |
            +---+
            | 4 |
            +---+
            | 5 |
            +---+
            | 6 |
            +---+
            | 7 |
            +---+
            | 9 |
            +---+
            >>> sheet.row.select([1,2,3,5])
            >>> sheet
            pyexcel sheet:
            +---+
            | 2 |
            +---+
            | 3 |
            +---+
            | 4 |
            +---+
            | 6 |
            +---+
            >>> data = [
            ...     ['a', 1],
            ...     ['b', 1],
            ...     ['c', 1]
            ... ]
            >>> sheet = pe.Sheet(data, name_rows_by_column=0)
            >>> sheet.row.select(['a', 'b'])
            >>> sheet
            pyexcel sheet:
            +---+---+
            | a | 1 |
            +---+---+
            | b | 1 |
            +---+---+

        """
        if is_array_type(names, str):
            indices = names_to_indices(names, self.ref.rownames)
            Row.select(self, indices)
        else:
            Row.select(self, names)

    def __delitem__(self, column_name):
        """

        Examples::

            >>> import pyexcel as pe
            >>> data = [
            ...     ['a', 1],
            ...     ['b', 1],
            ...     ['c', 1]
            ... ]
            >>> sheet = pe.Sheet(data, name_rows_by_column=0)
            >>> del sheet.row['a', 'b']
            >>> sheet
            pyexcel sheet:
            +---+---+
            | c | 1 |
            +---+---+

        """
        if is_string(type(column_name)):
            self.ref.delete_named_row_at(column_name)
        elif is_tuple_consists_of_strings(column_name):
            indices = names_to_indices(list(column_name), self.ref.rownames)
            Row.__delitem__(self, indices)
        else:
            Row.__delitem__(self, column_name)

    def __setitem__(self, str_or_aslice, c):
        if is_string(type(str_or_aslice)):
            self.ref.set_named_row_at(str_or_aslice, c)
        else:
            Row.__setitem__(self, str_or_aslice, c)

    def __getitem__(self, str_or_aslice):
        if is_string(type(str_or_aslice)):
            return self.ref.named_row_at(str_or_aslice)
        else:
            return Row.__getitem__(self, str_or_aslice)

    def __iadd__(self, other):
        """Overload += sign

        :param list other: the row header must be the first element.
        :return: self
        """
        if isinstance(other, OrderedDict):
            self.ref.extend_rows(other)
        else:
            Row.__iadd__(self, other)
        return self

    def __add__(self, other):
        """Overload += sign

        :return: self
        """
        self.__iadd__(other)
        return self.ref

[docs]    def format(self,
               row_index=None, formatter=None,
               format_specs=None, on_demand=False):
        """Format a row
        """
        def handle_one_formatter(rows, theformatter, on_demand):
            new_indices = rows
            if len(self.ref.rownames) > 0:
                new_indices = names_to_indices(rows, self.ref.rownames)
            aformatter = RowFormatter(new_indices, theformatter)
            if on_demand:
                self.ref.add_formatter(aformatter)
            else:
                self.ref.apply_formatter(aformatter)
        if row_index is not None:
            handle_one_formatter(row_index, formatter, on_demand)
        elif format_specs:
            for spec in format_specs:
                if len(spec) == 3:
                    handle_one_formatter(spec[0], spec[1],
                                         on_demand)
                else:
                    handle_one_formatter(spec[0], spec[1],
                                         on_demand)


[docs]class NamedColumn(Column):
    """Series Sheet would have Named Column instead of Column

    example::

        import pyexcel as pe

        r = pe.SeriesReader("example.csv")
        print(r.column["column 1"])

    """
[docs]    def select(self, names):
        """Delete columns other than specified

        Examples:

            >>> import pyexcel as pe
            >>> data = [[1,2,3,4,5,6,7,9]]
            >>> sheet = pe.Sheet(data)
            >>> sheet
            pyexcel sheet:
            +---+---+---+---+---+---+---+---+
            | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 |
            +---+---+---+---+---+---+---+---+
            >>> sheet.column.select([1,2,3,5])
            >>> sheet
            pyexcel sheet:
            +---+---+---+---+
            | 2 | 3 | 4 | 6 |
            +---+---+---+---+
            >>> data = [
            ...     ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
            ...     [1,2,3,4,5,6,7,9],
            ... ]
            >>> sheet = pe.Sheet(data, name_columns_by_row=0)
            >>> sheet
            pyexcel sheet:
            +---+---+---+---+---+---+---+---+
            | a | b | c | d | e | f | g | h |
            +===+===+===+===+===+===+===+===+
            | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 |
            +---+---+---+---+---+---+---+---+
            >>> del sheet.column['a', 'b', 'i', 'f'] # doctest:+ELLIPSIS
            Traceback (most recent call last):
                ...
            ValueError: ...
            >>> sheet.column.select(['a', 'c', 'e', 'h'])
            >>> sheet
            pyexcel sheet:
            +---+---+---+---+
            | a | c | e | h |
            +===+===+===+===+
            | 1 | 3 | 5 | 9 |
            +---+---+---+---+

        """
        if is_array_type(names, str):
            indices = names_to_indices(names, self.ref.colnames)
            Column.select(self, indices)
        else:
            Column.select(self, names)

    def __delitem__(self, str_or_aslice):
        """

        Example::

            >>> import pyexcel as pe
            >>> data = [
            ...     ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
            ...     [1,2,3,4,5,6,7,9],
            ... ]
            >>> sheet = pe.Sheet(data, name_columns_by_row=0)
            >>> sheet
            pyexcel sheet:
            +---+---+---+---+---+---+---+---+
            | a | b | c | d | e | f | g | h |
            +===+===+===+===+===+===+===+===+
            | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 |
            +---+---+---+---+---+---+---+---+
            >>> del sheet.column['a', 'b', 'i', 'f'] # doctest:+ELLIPSIS
            Traceback (most recent call last):
                ...
            ValueError: ...
            >>> del sheet.column['a', 'c', 'e', 'h']
            >>> sheet
            pyexcel sheet:
            +---+---+---+---+
            | b | d | f | g |
            +===+===+===+===+
            | 2 | 4 | 6 | 7 |
            +---+---+---+---+

        """
        if is_string(type(str_or_aslice)):
            self.ref.delete_named_column_at(str_or_aslice)
        elif is_tuple_consists_of_strings(str_or_aslice):
            indices = names_to_indices(list(str_or_aslice), self.ref.colnames)
            Column.__delitem__(self, indices)
        else:
            Column.__delitem__(self, str_or_aslice)

    def __setitem__(self, str_or_aslice, c):
        if is_string(type(str_or_aslice)):
            self.ref.set_named_column_at(str_or_aslice, c)
        else:
            Column.__setitem__(self, str_or_aslice, c)

    def __getitem__(self, str_or_aslice):
        if is_string(type(str_or_aslice)):
            return self.ref.named_column_at(str_or_aslice)
        else:
            return Column.__getitem__(self, str_or_aslice)

    def __iadd__(self, other):
        """Overload += sign

        :param list other: the column header must be the first element.
        :return: self
        """
        if isinstance(other, OrderedDict):
            self.ref.extend_columns(other)
        else:
            Column.__iadd__(self, other)
        return self

    def __add__(self, other):
        """Overload += sign

        :return: self
        """
        self.__iadd__(other)
        return self.ref

[docs]    def format(self,
               column_index=None, formatter=None,
               format_specs=None, on_demand=False):
        """Format a column
        """
        def handle_one_formatter(columns, aformatter, on_demand):
            new_indices = columns
            if len(self.ref.colnames) > 0:
                new_indices = names_to_indices(columns, self.ref.colnames)
            theformatter = ColumnFormatter(new_indices, aformatter)
            if on_demand:
                self.ref.add_formatter(theformatter)
            else:
                self.ref.apply_formatter(theformatter)
        if column_index is not None:
            handle_one_formatter(column_index, formatter, on_demand)
        elif format_specs:
            for spec in format_specs:
                if len(spec) == 3:
                    handle_one_formatter(spec[0], spec[1],
                                         on_demand)
                else:
                    handle_one_formatter(spec[0], spec[1],
                                         on_demand)


VALID_SHEET_PARAMETERS = ['name_columns_by_row',
                          'name_rows_by_column',
                          'colnames',
                          'rownames',
                          'transpose_before',
                          'transpose_after']


[docs]class NominableSheet(FilterableSheet):
    """Allow dictionary group of the content
    """
[docs]    def __init__(self, sheet=None, name=DEFAULT_NAME,
                 name_columns_by_row=-1,
                 name_rows_by_column=-1,
                 colnames=None,
                 rownames=None,
                 transpose_before=False,
                 transpose_after=False):
        """Constructor

        :param sheet: two dimensional array
        :param name: this becomes the sheet name.
        :param name_columns_by_row: use a row to name all columns
        :param name_rows_by_column: use a column to name all rows
        :param colnames: use an external list of strings to name the columns
        :param rownames: use an external list of strings to name the rows
        """
        # this get rid of phatom data by not specifying sheet
        if sheet is None:
            sheet = []
        FilterableSheet.__init__(self, sheet)
        if transpose_before:
            self.transpose()
        self.name = name
        self._column_names = []
        self._row_names = []
        self.named_row = NamedRow(self)
        self.named_column = NamedColumn(self)
        if name_columns_by_row != -1:
            if colnames:
                raise NotImplementedError(MESSAGE_NOT_IMPLEMENTED_02)
            self.name_columns_by_row(name_columns_by_row)
        else:
            if colnames:
                self._column_names = colnames
        if name_rows_by_column != -1:
            if rownames:
                raise NotImplementedError(MESSAGE_NOT_IMPLEMENTED_02)
            self.name_rows_by_column(name_rows_by_column)
        else:
            if rownames:
                self._row_names = rownames
        if transpose_after:
            self.transpose()

    @property
    def row(self):
        """Row representation. see :class:`NamedRow`

        examples::

            >>> import pyexcel as pe
            >>> data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
            >>> sheet = pe.Sheet(data)
            >>> sheet.row[1]
            [4, 5, 6]
            >>> sheet.row[0:3]
            [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
            >>> sheet.row += [11, 12, 13]
            >>> sheet.row[3]
            [11, 12, 13]
            >>> sheet.row[0:4] = [0, 0, 0] # set all to zero
            >>> sheet.row[3]
            [0, 0, 0]
            >>> sheet.row[0] = ['a', 'b', 'c'] # set one row
            >>> sheet.row[0]
            ['a', 'b', 'c']
            >>> del sheet.row[0] # delete first row
            >>> sheet.row[0] # now, second row becomes the first
            [0, 0, 0]
            >>> del sheet.row[0:]
            >>> sheet.row[0]  # nothing left
            Traceback (most recent call last):
                ...
            IndexError
        """
        return self.named_row

    @row.setter
    def row(self, value):
        # dummy setter to enable self.row += ..
        pass

    @property
    def column(self):
        """Column representation. see :class:`NamedColumn`"""
        return self.named_column

    @column.setter
    def column(self, value):
        # dummy setter to enable self.column += ..
        pass

    def name_columns_by_row(self, row_index):
        """Use the elements of a specified row to represent individual columns

        The specified row will be deleted from the data
        :param int row_index: the index of the row that has the column names
        """
        self.row_index = row_index
        self._column_names = make_names_unique(self.row_at(row_index))
        del self.row[row_index]

    def name_rows_by_column(self, column_index):
        """Use the elements of a specified column to represent individual rows

        The specified column will be deleted from the data
        :param int column_index: the index of the column that has the row names
        """
        self.column_index = column_index
        self._row_names = make_names_unique(self.column_at(column_index))
        del self.column[column_index]

    @property
    def colnames(self):
        """Return column names"""
        if len(self._filters) != 0:
            column_filters = [f for f in self._filters
                              if isinstance(f, ColumnIndexFilter)]
            if len(column_filters) != 0:
                indices = range(0, len(self._column_names))
                for f in column_filters:
                    indices = [i for i in indices if i not in f.indices]
                return [self._column_names[i] for i in indices]
            else:
                return self._column_names
        else:
            return self._column_names

    @colnames.setter
    def colnames(self, value):
        """Set column names"""
        self._column_names = make_names_unique(value)

    @property
    def rownames(self):
        """Return row names"""
        if len(self._filters) != 0:
            row_filters = [f for f in self._filters
                           if isinstance(f, RowIndexFilter)]
            if len(row_filters) != 0:
                indices = range(0, len(self._row_names))
                for f in row_filters:
                    indices = [i for i in indices if i not in f.indices]
                return [self._row_names[i] for i in indices]
            else:
                return self._row_names
        else:
            return self._row_names

    @rownames.setter
    def rownames(self, value):
        """Set row names"""
        self._row_names = make_names_unique(value)

    def named_column_at(self, name):
        """Get a column by its name """
        index = name
        if is_string(type(index)):
            index = self.colnames.index(name)
        column_array = self.column_at(index)
        return column_array

    def set_named_column_at(self, name, column_array):
        """
        Take the first row as column names

        Given name to identify the column index, set the column to
        the given array except the column name.
        """
        index = name
        if is_string(type(index)):
            index = self.colnames.index(name)
        self.set_column_at(index, column_array)

    def delete_columns(self, column_indices):
        """Delete one or more columns

        :param list column_indices: a list of column indices
        """
        FilterableSheet.delete_columns(self, column_indices)
        if len(self._column_names) > 0:
            new_series = [self._column_names[i]
                          for i in range(0, len(self._column_names))
                          if i not in column_indices]
            self._column_names = new_series

    def delete_rows(self, row_indices):
        """Delete one or more rows

        :param list row_indices: a list of row indices
        """
        FilterableSheet.delete_rows(self, row_indices)
        if len(self._row_names) > 0:
            new_series = [self._row_names[i]
                          for i in range(0, len(self._row_names))
                          if i not in row_indices]
            self._row_names = new_series

    def delete_named_column_at(self, name):
        """Works only after you named columns by a row

        Given name to identify the column index, set the column to
        the given array except the column name.
        :param str name: a column name
        """
        if isinstance(name, int):
            if len(self.rownames) > 0:
                self.rownames.pop(name)
            self.delete_columns([name])
        else:
            index = self.colnames.index(name)
            self.colnames.pop(index)
            FilterableSheet.delete_columns(self, [index])

    def named_row_at(self, name):
        """Get a row by its name """
        index = name
        # if is_string(type(index)):
        index = self.rownames.index(name)
        row_array = self.row_at(index)
        return row_array

    def set_named_row_at(self, name, row_array):
        """
        Take the first column as row names

        Given name to identify the row index, set the row to
        the given array except the row name.
        """
        index = name
        if is_string(type(index)):
            index = self.rownames.index(name)
        self.set_row_at(index, row_array)

    def delete_named_row_at(self, name):
        """Take the first column as row names

        Given name to identify the row index, set the row to
        the given array except the row name.
        """
        if isinstance(name, int):
            if len(self.rownames) > 0:
                self.rownames.pop(name)
            self.delete_rows([name])
        else:
            index = self.rownames.index(name)
            self.rownames.pop(index)
            FilterableSheet.delete_rows(self, [index])

    def apply_formatter(self, aformatter):
        """Apply the formatter immediately.

        :param Formatter aformatter: a custom formatter
        """
        aformatter = self._translate_named_formatter(aformatter)
        FormattableSheet.apply_formatter(self, aformatter)

    def _translate_named_formatter(self, aformatter):
        if isinstance(aformatter, NamedColumnFormatter):
            series = self.colnames
        elif isinstance(aformatter, NamedRowFormatter):
            series = self.rownames
        else:
            series = None
        if series:
            indices = names_to_indices(aformatter.indices, series)
            aformatter.update_index(indices)
        return aformatter

    def add_formatter(self, aformatter):
        """Add a lazy formatter.

        The formatter takes effect on the fly when a cell value is read
        This is cost effective when you have a big data table
        and you use only a few rows or columns. If you have farily modest
        data table, you can choose apply_formatter() too.

        :param Formatter aformatter: a custom formatter
        """
        aformatter = self._translate_named_formatter(aformatter)
        FormattableSheet.add_formatter(self, aformatter)

    def extend_rows(self, rows):
        """Take ordereddict to extend named rows

        :param ordereddist/list rows: a list of rows.
        """
        incoming_data = []
        if isinstance(rows, OrderedDict):
            keys = rows.keys()
            for k in keys:
                self.rownames.append(k)
                incoming_data.append(rows[k])
            FilterableSheet.extend_rows(self, incoming_data)
        elif len(self.rownames) > 0:
            raise TypeError(MESSAGE_DATA_ERROR_ORDEREDDICT_IS_EXPECTED)
        else:
            FilterableSheet.extend_rows(self, rows)

    def extend_columns_with_rows(self, rows):
        """Put rows on the right most side of the data"""
        if len(self.colnames) > 0:
            headers = rows.pop(self.row_index)
            self._column_names += headers
        FilterableSheet.extend_columns_with_rows(self, rows)

    def extend_columns(self, columns):
        """Take ordereddict to extend named columns

        :param ordereddist/list columns: a list of columns
        """
        incoming_data = []
        if isinstance(columns, OrderedDict):
            keys = columns.keys()
            for k in keys:
                self.colnames.append(k)
                incoming_data.append(columns[k])
            FilterableSheet.extend_columns(self, incoming_data)
        elif len(self.colnames) > 0:
            raise TypeError(MESSAGE_DATA_ERROR_ORDEREDDICT_IS_EXPECTED)
        else:
            FilterableSheet.extend_columns(self, columns)

    def __iter__(self):
        if len(self._column_names) > 0:
            return ColumnIndexIterator(self)
        elif len(self._row_names) > 0:
            return RowIndexIterator(self)
        else:
            return FilterableSheet.__iter__(self)

    def to_array(self):
        """Returns an array after filtering"""
        from ..utils import to_array
        ret = []
        ret += to_array(self.rows())
        if len(self.rownames) > 0:
            ret = map(lambda value: [value[0]] + value[1],
                      zip(self.rownames, ret))
            if not PY2:
                ret = list(ret)
        if len(self.colnames) > 0:
            if len(self.rownames) > 0:
                ret.insert(0, [""] + self.colnames)
            else:
                ret.insert(0, self.colnames)
        return ret

    def to_records(self, custom_headers=None):
        """Returns the content as an array of dictionaries

        """
        from ..utils import to_records
        return to_records(self, custom_headers)

    def to_dict(self, row=False):
        """Returns a dictionary"""
        from ..utils import to_dict
        if row:
            return to_dict(RowIndexIterator(self))
        else:
            return to_dict(ColumnIndexIterator(self))

    def __getitem__(self, aset):
        if isinstance(aset, tuple):
            if isinstance(aset[0], str):
                row = self.rownames.index(aset[0])
            else:
                row = aset[0]

            if isinstance(aset[1], str):
                column = self.colnames.index(aset[1])
            else:
                column = aset[1]
            return self.cell_value(row, column)
        else:
            return Matrix.__getitem__(self, aset)

    def named_rows(self):
        return NamedRowIterator(self)

    def named_columns(self):
        return NamedColumnIterator(self)