pandas-dev
diff --git a/‎pandas/io/opendocument/__init__.py
Lines changed: 3 additions & 0 deletions b/‎pandas/io/opendocument/__init__.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎pandas/io/opendocument/odfreader.py
Lines changed: 171 additions & 0 deletions b/‎pandas/io/opendocument/odfreader.py
Lines changed: 171 additions & 0 deletions
diff --git a/‎pandas/tests/io/data/blank-row-repeat.ods
13.3 KB b/‎pandas/tests/io/data/blank-row-repeat.ods
13.3 KB
diff --git a/‎pandas/tests/io/data/datatypes.ods
10.4 KB b/‎pandas/tests/io/data/datatypes.ods
10.4 KB
diff --git a/‎pandas/tests/io/data/headers.ods
8.13 KB b/‎pandas/tests/io/data/headers.ods
8.13 KB
diff --git a/‎pandas/tests/io/data/lowerdiagonal.ods
7.35 KB b/‎pandas/tests/io/data/lowerdiagonal.ods
7.35 KB
diff --git a/‎pandas/tests/io/data/runlengthencoding.ods
7.71 KB b/‎pandas/tests/io/data/runlengthencoding.ods
7.71 KB
diff --git a/‎pandas/tests/io/data/writertable.odt
10.1 KB b/‎pandas/tests/io/data/writertable.odt
10.1 KB
diff --git a/‎pandas/tests/io/test_opendocument.py
Lines changed: 149 additions & 0 deletions b/‎pandas/tests/io/test_opendocument.py
Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,3 @@
+from pandas.io.opendocument.odfreader import ODFReader
+
+__all__ = ['ODFReader']
@@ -0,0 +1,171 @@
+import pandas
+from pandas.io.parsers import TextParser
+
+
+class ODFReader:
+    """Read tables out of OpenDocument formatted files.
+
+    Parameters
+    ----------
+    filepath_or_stream: string, path to be parsed or
+        an open readable stream.
+
+    Raises
+    ------
+    ImportError
+        If the optional ``odf`` (odfpy) package cannot be imported.
+    """
+    def __init__(self, filepath_or_stream):
+        # odfpy is an optional dependency, so it is imported lazily here
+        # and in the helpers below rather than at module import time.
+        try:
+            from odf.opendocument import load as document_load
+            from odf.table import Table
+        except ImportError:
+            raise ImportError("Install odfpy for OpenDocument support")
+
+        self.filepath_or_stream = filepath_or_stream
+        self.document = document_load(filepath_or_stream)
+        # Every Table element found in the document is treated as a sheet.
+        self.tables = self.document.getElementsByType(Table)
+
+    @property
+    def sheet_names(self):
+        """Return the names of the tables in the document, in order"""
+        from odf.namespaces import TABLENS
+        return [t.attributes[(TABLENS, 'name')] for t in self.tables]
+
+    def get_sheet_by_index(self, index):
+        """Return the table at position ``index`` as a list of row lists
+
+        Raises IndexError when ``index`` is out of range.
+        """
+        return self.__get_table(self.tables[index])
+
+    def get_sheet_by_name(self, name):
+        """Return the table called ``name`` as a list of row lists
+
+        Raises KeyError when no table has that name.
+        """
+        # list.index signals a miss by raising ValueError (it never
+        # returns -1), so translate that into the KeyError callers expect.
+        try:
+            i = self.sheet_names.index(name)
+        except ValueError:
+            raise KeyError(name)
+        return self.__get_table(self.tables[i])
+
+    def get_sheet(self, name):
+        """Given a sheet name or index, return its rows as a list of lists
+
+        Raises ValueError when ``name`` is neither a string nor an integer.
+        """
+        if isinstance(name, str):
+            return self.get_sheet_by_name(name)
+        elif isinstance(name, int):
+            return self.get_sheet_by_index(name)
+        else:
+            raise ValueError(
+                'Unrecognized sheet identifier type {}. Please use '
+                'a string or integer'.format(type(name)))
+
+    def parse(self, sheet_name=0, **kwds):
+        """Read one sheet and return the result of ``TextParser.read()``
+
+        Parameters
+        ----------
+        sheet_name : str or int, default 0
+            Name or position of the table to read.
+        **kwds
+            Passed through unchanged to ``TextParser``.
+        """
+        data = self.get_sheet(sheet_name)
+        parser = TextParser(data, **kwds)
+        return parser.read()
+
+    def __get_table(self, sheet):
+        """Parse an ODF Table into a rectangular list of lists
+
+        Runs of empty cells (or rows) are only written out once a
+        non-empty cell (or row) follows them, so trailing empty cells
+        and trailing empty rows are dropped.  Short rows are padded
+        with None up to the length of the longest row.
+        """
+        from odf.table import TableCell, TableRow
+
+        table = []
+        empty_rows = 0
+        max_row_len = 0
+        for sheet_row in sheet.getElementsByType(TableRow):
+            table_row = []
+            empty_cells = 0
+            for sheet_cell in sheet_row.getElementsByType(TableCell):
+                value = self.__get_cell_value(sheet_cell)
+                column_repeat = self.__get_cell_repeat(sheet_cell)
+
+                if len(sheet_cell.childNodes) == 0:
+                    # Defer: only materialise the gap if data follows it.
+                    empty_cells += column_repeat
+                else:
+                    if empty_cells > 0:
+                        table_row.extend([None] * empty_cells)
+                        empty_cells = 0
+                    table_row.extend([value] * column_repeat)
+
+            max_row_len = max(max_row_len, len(table_row))
+
+            row_repeat = self.__get_row_repeat(sheet_row)
+            if self.__is_empty_row(sheet_row):
+                empty_rows += row_repeat
+                continue
+
+            if empty_rows > 0:
+                # Blank rows must be (distinct) lists, not bare None, so
+                # the padding loop below can extend them to full width.
+                table.extend([] for _ in range(empty_rows))
+                empty_rows = 0
+            # A repeated non-empty row stands for that many identical
+            # rows; emit independent copies instead of asserting.
+            table.append(table_row)
+            table.extend(list(table_row) for _ in range(row_repeat - 1))
+
+        # Make our table square
+        for row in table:
+            if len(row) < max_row_len:
+                row.extend([None] * (max_row_len - len(row)))
+
+        return table
+
+    def __get_row_repeat(self, row):
+        """Return number of times this row was repeated
+
+        Repeating an empty row appeared to be a common way
+        of representing sparse rows in the table.
+        Defaults to 1 when the attribute is absent.
+        """
+        from odf.namespaces import TABLENS
+        repeat = row.attributes.get((TABLENS, 'number-rows-repeated'))
+        if repeat is None:
+            return 1
+        return int(repeat)
+
+    def __get_cell_repeat(self, cell):
+        """Return number of adjacent columns this cell stands for
+
+        Defaults to 1 when the attribute is absent.
+        """
+        from odf.namespaces import TABLENS
+        repeat = cell.attributes.get((TABLENS, 'number-columns-repeated'))
+        if repeat is None:
+            return 1
+        return int(repeat)
+
+    def __is_empty_row(self, row):
+        """Return True when no child of the row has child nodes itself
+        """
+        for column in row.childNodes:
+            if len(column.childNodes) > 0:
+                return False
+
+        return True
+
+    def __get_cell_value(self, cell):
+        """Convert a table cell into a Python/pandas value
+
+        The conversion is chosen by the cell's ``office:value-type``
+        attribute.  Returns None for cells without a value type and
+        raises ValueError for a value type that is not handled.
+        """
+        from odf.namespaces import OFFICENS
+        cell_type = cell.attributes.get((OFFICENS, 'value-type'))
+        if cell_type is None:
+            return None
+        elif cell_type == 'boolean':
+            # The ODF attribute is office:boolean-value and holds the
+            # text "true" or "false"; bool() of either string would be
+            # True, so compare against the literal instead.
+            cell_value = cell.attributes.get((OFFICENS, 'boolean-value'))
+            return cell_value == 'true'
+        elif cell_type in ('float', 'percentage', 'currency'):
+            cell_value = cell.attributes.get((OFFICENS, 'value'))
+            return float(cell_value)
+        elif cell_type == 'string':
+            # FIXME: how do I actually get the string value?
+            return str(cell)
+        elif cell_type == 'date':
+            cell_value = cell.attributes.get((OFFICENS, 'date-value'))
+            return pandas.Timestamp(cell_value)
+        elif cell_type == 'time':
+            cell_value = cell.attributes.get((OFFICENS, 'time-value'))
+            return pandas_isoduration_compatibility(cell_value)
+        else:
+            raise ValueError(
+                'Unrecognized type {}'.format(cell_type))
+
+
+def pandas_isoduration_compatibility(duration):
+    """Build a pandas Timedelta from an ISO 8601 duration string
+
+    Libreoffice writes durations with no day component, for
+    example PT3H45M0S, while the pandas Timedelta parser this was
+    written against required one.  A zero-day component is
+    therefore inserted in front of the time part before parsing.
+    Workaround for https://github.com/pandas-dev/pandas/issues/25422
+    """
+    time_only_prefix = 'PT'
+    if not duration.startswith(time_only_prefix):
+        # Already carries a date part (or is not time-only): parse as is.
+        return pandas.Timedelta(duration)
+    time_part = duration[len(time_only_prefix):]
+    return pandas.Timedelta('P0DT' + time_part)
@@ -0,0 +1,149 @@
+from collections import OrderedDict
+import os
+from numpy import nan
+import pandas
+import pandas.util._test_decorators as td
+import pandas.util.testing as tm
+from pandas import DataFrame, Timestamp, Timedelta
+from pandas.io.opendocument import ODFReader
+import pytest
+
+
+@td.skip_if_no('odf')
+class TestOpenDocument(object):
+    """Tests for ODFReader against the fixture files under io/data.
+
+    The class is decorated with ``td.skip_if_no('odf')``; presumably the
+    tests are skipped when odfpy is not installed -- confirm against the
+    decorator's implementation.
+    """
+    @pytest.fixture(autouse=True)
+    def setup_method(self, datapath):
+        """Store the fixture directory on the instance before each test."""
+        self.dirpath = datapath("io", "data")
+
+    def get_opendocument(self, filename, *args, **kwargs):
+        """
+        Return ODFReader class containing tables from parsed OpenDocument file
+
+        Parameters
+        ----------
+        filename : str
+            File base name, joined onto ``self.dirpath``.
+        *args, **kwargs
+            Accepted but not used by this helper.
+
+        Returns
+        -------
+
+        document : ODFReader object
+        """
+        pth = os.path.join(self.dirpath, filename)
+        document = ODFReader(pth)
+        return document
+
+    def get_opendocumentdf(self, filename, *args, **kwargs):
+        """
+        Return DataFrame from named sheet in a parsed OpenDocument file
+
+        Parameters
+        ----------
+        filename : str
+            File base name
+        *args, **kwargs
+            Forwarded unchanged to ``ODFReader.parse``; the first
+            positional argument is therefore the sheet name or index.
+
+        Returns
+        -------
+
+        df : DataFrame
+        """
+        document = self.get_opendocument(filename)
+        return document.parse(*args, **kwargs)
+
+    def test_read_types(self):
+        """Make sure we read ODF data types correctly
+        """
+        book = self.get_opendocument('datatypes.ods')
+        assert len(book.sheet_names) == 1
+        assert book.sheet_names == ['Sheet1']
+        sheet = book.parse('Sheet1', header=None)
+
+        # One expected row per cell type.  The Timedelta row has three
+        # entries while every other row has one.
+        # NOTE(review): this relies on DataFrame padding the shorter
+        # rows -- confirm that matches what the reader produces.
+        expected = DataFrame(
+            [[1.0],
+             [1.25],
+             ['a'],
+             [Timestamp(2003, 1, 2)],
+             [False],
+             [0.35],
+             [Timedelta(hours=3, minutes=45),
+              Timedelta(hours=17, minutes=53),
+              Timedelta(hours=14, minutes=8)],
+             # though what should the value of a hyperlink be?
+             ['UBERON:0002101']])
+        tm.assert_equal(sheet, expected)
+
+    def test_read_lower_diagonal(self):
+        """TextParser failed when given an irregular list of lists
+
+        Make sure we can parse:
+        1
+        2 3
+        4 5 6
+        7 8 9 10
+        """
+        sheet = self.get_opendocumentdf(
+            'lowerdiagonal.ods', 'Sheet1',
+            index_col=None, header=None)
+
+        # Only the shape is checked: the ragged rows must come back as
+        # a 4 x 4 frame.
+        assert sheet.shape == (4, 4)
+
+    def test_read_headers(self):
+        """Do we read headers correctly?
+        """
+        sheet = self.get_opendocumentdf(
+            'headers.ods', 'Sheet1', index_col=0)
+
+        expected = DataFrame.from_dict(OrderedDict([
+            ("Header", ["Row 1", "Row 2"]),
+            ("Column 1", [1.0, 2.0]),
+            ("Column 2", [3.0, 4.0]),
+            # Empty Column
+            ("Column 4", [7.0, 8.0]),
+            # Empty Column 2
+            ("Column 6", [11.0, 12.0])]))
+        expected.set_index("Header", inplace=True)
+        # Compare only the named columns; the two header-less columns
+        # are checked separately below.
+        columns = ["Column 1", "Column 2", "Column 4", "Column 6"]
+        tm.assert_equal(sheet[columns], expected)
+        # Labels under which the two header-less columns are expected
+        # to appear in the parsed frame.
+        empties = [None, 'None.1']
+        for name in empties:
+            for value in sheet[name]:
+                assert pandas.isnull(value)
+
+    def test_read_writer_table(self):
+        """ODF reuses the same table tags in Writer and Presentation files
+
+        Test reading a table out of a text document
+        """
+        table = self.get_opendocumentdf(
+            'writertable.odt', 'Table1', index_col=0)
+
+        assert table.shape == (3, 3)
+        expected = DataFrame.from_dict(OrderedDict([
+            ("Header", ["Row 1", "Row 2", "Row 3"]),
+            ("Column 1", [1.0, 2.0, 3.0]),
+            ("Unnamed: 2", [nan, nan, nan]),
+            ("Column 3", [7.0, 8.0, 9.0])]))
+        expected.set_index("Header", inplace=True)
+        # The unnamed column is left out of the frame comparison and
+        # verified value by value below.
+        columns = ["Column 1", "Column 3"]
+        tm.assert_equal(table[columns], expected[columns])
+
+        # make sure pandas gives a name to the unnamed column
+        for i in range(3):
+            assert pandas.isnull(table["Unnamed: 2"][i])
+
+    def test_blank_row_repeat(self):
+        """Read a sheet from the blank-row-repeat.ods fixture.
+
+        Checks the resulting shape and one value in the last row of the
+        'biosample_accession' column.
+        """
+        table = self.get_opendocumentdf(
+            'blank-row-repeat.ods', 'Biosamples')
+
+        assert table.shape == (8, 9)
+        assert table['biosample_accession'][7] == 9.0
+
+    def test_runlengthencoding(self):
+        """Calc will use repeat when adjacent columns have the same value.
+        """
+        sheet = self.get_opendocumentdf(
+            'runlengthencoding.ods', 'Sheet1', header=None)
+        assert sheet.shape == (5, 3)
+        # check by column, not by row.
+        assert list(sheet[0]) ==  [1.0, 1.0, 2.0, 2.0, 2.0]
+        assert list(sheet[1]) ==  [1.0, 2.0, 2.0, 2.0, 2.0]
+        assert list(sheet[2]) ==  [1.0, 2.0, 2.0, 2.0, 2.0]
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from pandas.io.opendocument.odfreader import ODFReader`
	`2`	`+`
	`3`	`+__all__ = ['ODFReader']`