Source code for pyexcel.readers

"""
    pyexcel.readers
    ~~~~~~~~~~~~~~~~~~~

    Uniform interface for reading different excel file formats

    :copyright: (c) 2014 by C. W.
    :license: GPL v3
"""
from iterators import (HBRTLIterator,
                       HTLBRIterator,
                       VBRTLIterator,
                       VTLBRIterator,
                       RowIterator,
                       RowReverseIterator,
                       ColumnIterator,
                       ColumnReverseIterator,
                       SeriesColumnIterator)
from filters import (RowFilter,
                     ColumnIndexFilter)


[docs]class CSVReader: """ csv reader """ def __init__(self, file): import csv self.array = [] reader = csv.reader(open(file, 'rb'), dialect=csv.excel) self.array.extend(reader)
[docs] def number_of_rows(self): """ Number of rows in the csv file """ return len(self.array)
[docs] def number_of_columns(self): """ Number of columns in the csv file assuming the length of each row is uniform """ if len(self.array) > 1: return len(self.array[0]) else: return 0
[docs] def cell_value(self, row, column): """ Random access to the csv cells """ value = self.array[row][column] try: if "." in value: return float(value) else: return int(value) except ValueError: return value
[docs]class XLSReader: """ xls reader Currently only support first sheet in the file """ def __init__(self, file): import xlrd self.workbook = xlrd.open_workbook(file) self.worksheet = self.workbook.sheet_by_index(0)
[docs] def number_of_rows(self): """ Number of rows in the xls file """ return self.worksheet.nrows
[docs] def number_of_columns(self): """ Number of columns in the xls file """ return self.worksheet.ncols
[docs] def cell_value(self, row, column): """ Random access to the xls cells """ return self.worksheet.cell_value(row, column)
[docs]class ODSReaderImp(CSVReader): """ ods reader Currently only support first sheet in the file """ def __init__(self, file): import ext.odsreader as odsreader self.ods = odsreader.ODSReader(file) keys = self.ods.SHEETS.keys() self.array = self.ods.getSheet(keys[0])
[docs]class Reader: """ Wrapper class to unify csv, xls and xlsx reader """ def __init__(self, file): """ Reader constructor Selecting a specific reader according to file extension """ if (file.endswith(".xlsm") or file.endswith(".xlsx") or file.endswith(".xls")): self.reader = XLSReader(file) elif file.endswith(".csv"): self.reader = CSVReader(file) elif file.endswith(".ods"): self.reader = ODSReaderImp(file) else: raise NotImplementedError("can not open %s" % file) def __iter__(self): """ Default iterator to go through each cell one by one from top row to bottom row and from left to right """ return self.enumerate()
[docs] def enumerate(self): """ Default iterator to go through each cell one by one from top row to bottom row and from left to right """ return HTLBRIterator(self)
[docs] def reverse(self): """ Reverse iterator to go through each cell one by one from bottom row to top row and from right to left """ return HBRTLIterator(self)
[docs] def vertical(self): """ Default iterator to go through each cell one by one from leftmost column to rightmost row and from top to bottom """ return VTLBRIterator(self)
[docs] def rvertical(self): """ Default iterator to go through each cell one by one from rightmost column to leftmost row and from bottom to top """ return VBRTLIterator(self)
[docs] def rows(self): """ Returns a row iterator to go through each row from top to bottom """ return RowIterator(self)
[docs] def rrows(self): """ Returns a row iterator to go through each row from bottom to top """ return RowReverseIterator(self)
[docs] def columns(self): """ Returns a column iterator to go through each column from left to right """ return ColumnIterator(self)
[docs] def rcolumns(self): """ Returns a column iterator to go through each column from right to left """ return ColumnReverseIterator(self)
[docs] def number_of_rows(self): """ Number of rows in the data file """ return self.reader.number_of_rows()
[docs] def number_of_columns(self): """ Number of columns in the data file """ return self.reader.number_of_columns()
[docs] def cell_value(self, row, column): """ Random access to the data cells """ if row in self.row_range() and column in self.column_range(): return self.reader.cell_value(row, column) else: return None
[docs] def row_range(self): """ Utility function to get row range """ return range(0, self.reader.number_of_rows())
[docs] def column_range(self): """ Utility function to get column range """ return range(0, self.reader.number_of_columns())
[docs] def row_at(self, index): """ Returns an array that collects all data at the specified row """ if index in self.row_range(): cell_array = [] for i in self.column_range(): cell_array.append(self.cell_value(index, i)) return cell_array else: return None
[docs] def column_at(self, index): """ Returns an array that collects all data at the specified column """ if index in self.column_range(): cell_array = [] for i in self.row_range(): cell_array.append(self.cell_value(i, index)) return cell_array else: return None
[docs] def contains(self, predicate): for r in self.rows(): if predicate(r): return True else: return False
[docs]class FilterableReader(Reader): """ Reader that can be applied one filter """ _filter = None
[docs] def row_range(self): if self._filter: new_rows = self.reader.number_of_rows() - self._filter.rows() return range(0, new_rows) else: return range(0, self.reader.number_of_rows())
[docs] def column_range(self): if self._filter: new_cols = self.reader.number_of_columns() - self._filter.columns() return range(0, new_cols) else: return range(0, self.reader.number_of_columns())
[docs] def number_of_rows(self): """ Number of rows in the data file """ if self._filter: return self.reader.number_of_rows() - self._filter.rows() else: return self.reader.number_of_rows()
[docs] def number_of_columns(self): """ Number of columns in the data file """ if self._filter: return self.reader.number_of_columns() - self._filter.columns() else: return self.reader.number_of_columns()
[docs] def cell_value(self, row, column): """ Random access to the data cells """ if row in self.row_range() and column in self.column_range(): if self._filter: new_row, new_column = self._filter.translate(row, column) return self.reader.cell_value(new_row, new_column) else: return self.reader.cell_value(row, column) else: return None
[docs] def filter(self, afilter): afilter.validate_filter(self) self._filter = afilter return self
[docs]class GenericSeriesReader(FilterableReader): """ For data with column headers x y z 1 2 3 4 5 6 This class has a default filter that filter out row 0 as headers. Extra functions were added to return headers at row 0 """ def __init__(self, reader): self.reader = reader # filter out the first row self.filter(RowFilter([0])) self.headers = None def _headers(self): self.headers = [] for i in self.column_range(): self.headers.append(self.reader.cell_value(0, i))
[docs] def series(self): self._headers() return self.headers
[docs] def named_column_at(self, name): self._headers() index = self.headers.index(name) column_array = self.column_at(index) return {name: column_array}
def __iter__(self): return SeriesColumnIterator(self)
[docs]class StaticSeriesReader(GenericSeriesReader): """ Static Series Reader. No filters can be applied. """ def __init__(self, file): reader = Reader(file) GenericSeriesReader.__init__(self, reader)
[docs]class ColumnFilterableSeriesReader(GenericSeriesReader): """ Columns can be filtered but not rows """ def __init__(self, file): self.reader = FilterableReader(file) GenericSeriesReader.filter(self, RowFilter([0])) self.headers = None
[docs] def filter(self, filter): self.reader.filter(filter) self._filter.validate_filter(self)
[docs]class SeriesReader(GenericSeriesReader): """ rows other than header row can be filtered. row number has been shifted by 1 as header row is protected. columns can be filtered. """ def __init__(self, file): self.reader = ColumnFilterableSeriesReader(file)
[docs] def series(self): return self.reader.series()
[docs] def named_column_at(self, name): headers = self.series() index = headers.index(name) column_array = self.column_at(index) return {name: column_array}
[docs] def filter(self, afilter): if isinstance(afilter, ColumnIndexFilter): self.reader.filter(afilter) else: GenericSeriesReader.filter(self, afilter)