Skip to content

Commit 7e6a59c

Browse files
committed
Class to read OpenDocument Tables
This is primarily intended for LibreOffice Calc spreadsheets but will also work with LO Writer and probably with LO Impress documents.
1 parent 15d8178 commit 7e6a59c

9 files changed

+323
-0
lines changed

pandas/io/opendocument/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from pandas.io.opendocument.odfreader import ODFReader
2+
3+
__all__ = ['ODFReader']

pandas/io/opendocument/odfreader.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import pandas
2+
from pandas.io.parsers import TextParser
3+
4+
5+
class ODFReader:
    """Read tables out of OpenDocument formatted files.

    Parameters
    ----------
    filepath_or_stream : str or file-like
        Path of the document to be parsed, or an open readable stream.
        It is handed unchanged to ``odf.opendocument.load``.

    Raises
    ------
    ImportError
        If odfpy is not installed.
    """

    def __init__(self, filepath_or_stream):
        # odfpy is an optional dependency, so it is imported lazily.
        try:
            from odf.opendocument import load as document_load
            from odf.table import Table
        except ImportError:
            raise ImportError("Install odfpy for OpenDocument support")

        self.filepath_or_stream = filepath_or_stream
        self.document = document_load(filepath_or_stream)
        # Every table element found in the document, in document order.
        self.tables = self.document.getElementsByType(Table)

    @property
    def sheet_names(self):
        """Return the table names in the document."""
        from odf.namespaces import TABLENS
        return [t.attributes[(TABLENS, 'name')] for t in self.tables]

    def get_sheet_by_index(self, index):
        """Return the table at position ``index`` parsed into a list of rows.

        Raises
        ------
        IndexError
            If ``index`` is out of range for the document's tables.
        """
        return self.__get_table(self.tables[index])

    def get_sheet_by_name(self, name):
        """Return the table called ``name`` parsed into a list of rows.

        Raises
        ------
        KeyError
            If the document has no table with that name.
        """
        # list.index signals a missing item with ValueError (it never
        # returns -1); translate that into the documented KeyError.
        try:
            position = self.sheet_names.index(name)
        except ValueError:
            raise KeyError(name)
        return self.__get_table(self.tables[position])

    def get_sheet(self, name):
        """Given a sheet name or index, return the parsed table.

        Parameters
        ----------
        name : str or int
            A string selects the table by name, an integer by position.

        Returns
        -------
        list of list
            The table contents, one inner list per row.

        Raises
        ------
        ValueError
            If ``name`` is neither a string nor an integer.
        """
        if isinstance(name, str):
            return self.get_sheet_by_name(name)
        elif isinstance(name, int):
            return self.get_sheet_by_index(name)
        else:
            raise ValueError(
                'Unrecognized sheet identifier type {}. Please use '
                'a string or integer'.format(type(name)))

    def parse(self, sheet_name=0, **kwds):
        """Read one table of the document through ``TextParser``.

        Parameters
        ----------
        sheet_name : str or int, default 0
            Name or position of the table to read.
        **kwds
            Forwarded unchanged to ``TextParser``.

        Returns
        -------
        The result of ``TextParser(...).read()``.
        """
        data = self.get_sheet(sheet_name)
        parser = TextParser(data, **kwds)
        return parser.read()

    def __get_table(self, sheet):
        """Parse an ODF Table into a rectangular list of lists.

        Empty cells become ``None``. Runs of empty cells or rows are only
        materialised once a non-empty cell or row follows them, so
        trailing blank cells and rows are dropped. Rows are finally padded
        with ``None`` to the length of the longest row.
        """
        from odf.table import TableCell, TableRow

        table = []
        pending_empty_rows = 0
        max_row_len = 0
        for sheet_row in sheet.getElementsByType(TableRow):
            table_row = []
            pending_empty_cells = 0
            for sheet_cell in sheet_row.getElementsByType(TableCell):
                column_repeat = self.__get_cell_repeat(sheet_cell)

                if len(sheet_cell.childNodes) == 0:
                    # A cell without children is treated as empty; hold it
                    # back until we know a populated cell follows.
                    pending_empty_cells += column_repeat
                    continue

                if pending_empty_cells > 0:
                    table_row.extend([None] * pending_empty_cells)
                    pending_empty_cells = 0
                # Only populated cells are converted, so an empty cell
                # can never trigger a conversion error.
                value = self.__get_cell_value(sheet_cell)
                table_row.extend([value] * column_repeat)

            max_row_len = max(max_row_len, len(table_row))

            row_repeat = self.__get_row_repeat(sheet_row)
            if self.__is_empty_row(sheet_row):
                pending_empty_rows += row_repeat
                continue

            # Blank rows are emitted as (distinct) empty lists so the
            # padding step below can extend them like any other row.
            table.extend([] for _ in range(pending_empty_rows))
            pending_empty_rows = 0
            # A repeated data row stands for that many identical rows;
            # give each its own list so later padding stays independent.
            table.extend(list(table_row) for _ in range(row_repeat))

        # Make our table square
        for row in table:
            if len(row) < max_row_len:
                row.extend([None] * (max_row_len - len(row)))

        return table

    def __get_row_repeat(self, row):
        """Return number of times this row was repeated.

        Repeating an empty row appeared to be a common way
        of representing sparse rows in the table. A missing
        ``number-rows-repeated`` attribute counts as 1.
        """
        from odf.namespaces import TABLENS
        repeat = row.attributes.get((TABLENS, 'number-rows-repeated'))
        if repeat is None:
            return 1
        return int(repeat)

    def __get_cell_repeat(self, cell):
        """Return number of columns this cell spans by repetition.

        A missing ``number-columns-repeated`` attribute counts as 1.
        """
        from odf.namespaces import TABLENS
        repeat = cell.attributes.get((TABLENS, 'number-columns-repeated'))
        if repeat is None:
            return 1
        return int(repeat)

    def __is_empty_row(self, row):
        """Return True when no child of ``row`` has children of its own."""
        return all(len(column.childNodes) == 0 for column in row.childNodes)

    def __get_cell_value(self, cell):
        """Convert an ODF cell into a Python/pandas value.

        The conversion is driven by the cell's ``office:value-type``
        attribute; a cell without that attribute yields ``None``.

        Raises
        ------
        ValueError
            If the value type is not one of the handled types.
        """
        from odf.namespaces import OFFICENS
        cell_type = cell.attributes.get((OFFICENS, 'value-type'))
        if cell_type is None:
            return None
        if cell_type == 'boolean':
            # ODF stores booleans in office:boolean-value as the text
            # 'true' / 'false'; bool() on that text would always be True.
            cell_value = cell.attributes.get((OFFICENS, 'boolean-value'))
            return cell_value == 'true'
        if cell_type in ('float', 'percentage', 'currency'):
            cell_value = cell.attributes.get((OFFICENS, 'value'))
            return float(cell_value)
        if cell_type == 'string':
            # NOTE(review): relies on str() of the odfpy element giving
            # the cell's text content -- confirm against odfpy.
            return str(cell)
        if cell_type == 'date':
            cell_value = cell.attributes.get((OFFICENS, 'date-value'))
            return pandas.Timestamp(cell_value)
        if cell_type == 'time':
            cell_value = cell.attributes.get((OFFICENS, 'time-value'))
            return pandas_isoduration_compatibility(cell_value)
        raise ValueError('Unrecognized type {}'.format(cell_type))
160+
161+
162+
def pandas_isoduration_compatibility(duration):
    """Build a Timedelta from a duration string that may lack a day part.

    LibreOffice writes durations such as PT3H45M0S, i.e. without any
    day component, while the pandas Timedelta parser this was written
    against requires one. A leading 'PT' is therefore rewritten to
    'P0DT' before parsing; any other input is parsed as given.
    Workaround for https://github.com/pandas-dev/pandas/issues/25422
    """
    time_only_prefix = 'PT'
    if not duration.startswith(time_only_prefix):
        return pandas.Timedelta(duration)
    # Splice an explicit zero-day component in front of the time part.
    with_days = 'P0DT' + duration[len(time_only_prefix):]
    return pandas.Timedelta(with_days)
13.3 KB
Binary file not shown.

pandas/tests/io/data/datatypes.ods

10.4 KB
Binary file not shown.

pandas/tests/io/data/headers.ods

8.13 KB
Binary file not shown.
7.35 KB
Binary file not shown.
7.71 KB
Binary file not shown.

pandas/tests/io/data/writertable.odt

10.1 KB
Binary file not shown.

pandas/tests/io/test_opendocument.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
from collections import OrderedDict
2+
import os
3+
from numpy import nan
4+
import pandas
5+
import pandas.util._test_decorators as td
6+
import pandas.util.testing as tm
7+
from pandas import DataFrame, Timestamp, Timedelta
8+
from pandas.io.opendocument import ODFReader
9+
import pytest
10+
11+
12+
@td.skip_if_no('odf')
class TestOpenDocument(object):
    """Checks for ODFReader run against the sample files in io/data."""

    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        # Directory holding the .ods / .odt fixtures used below.
        self.dirpath = datapath("io", "data")

    def get_opendocument(self, filename, *args, **kwargs):
        """
        Open one of the fixture documents with ODFReader

        Parameters
        ----------
        filename : str
            File base name; extra arguments are accepted but not used.

        Returns
        -------

        document : ODFReader object
        """
        return ODFReader(os.path.join(self.dirpath, filename))

    def get_opendocumentdf(self, filename, *args, **kwargs):
        """
        Parse a fixture document and return the result of ``parse``

        Parameters
        ----------
        filename : str
            File base name; remaining arguments go to ``ODFReader.parse``

        Returns
        -------

        df : DataFrame
        """
        reader = self.get_opendocument(filename)
        return reader.parse(*args, **kwargs)

    def test_read_types(self):
        """Each supported ODF value type is converted as expected
        """
        book = self.get_opendocument('datatypes.ods')
        names = book.sheet_names
        assert len(names) == 1
        assert names == ['Sheet1']
        sheet = book.parse('Sheet1', header=None)

        rows = [
            [1.0],
            [1.25],
            ['a'],
            [Timestamp(2003, 1, 2)],
            [False],
            [0.35],
            [Timedelta(hours=3, minutes=45),
             Timedelta(hours=17, minutes=53),
             Timedelta(hours=14, minutes=8)],
            # though what should the value of a hyperlink be?
            ['UBERON:0002101'],
        ]
        expected = DataFrame(rows)
        tm.assert_equal(sheet, expected)

    def test_read_lower_diagonal(self):
        """TextParser failed when given an irregular list of lists

        Make sure we can parse:
        1
        2 3
        4 5 6
        7 8 9 10
        """
        sheet = self.get_opendocumentdf(
            'lowerdiagonal.ods', 'Sheet1', index_col=None, header=None)

        assert sheet.shape == (4, 4)

    def test_read_headers(self):
        """Header row and index column are picked up
        """
        sheet = self.get_opendocumentdf(
            'headers.ods', 'Sheet1', index_col=0)

        expected_data = OrderedDict([
            ("Header", ["Row 1", "Row 2"]),
            ("Column 1", [1.0, 2.0]),
            ("Column 2", [3.0, 4.0]),
            # Empty Column
            ("Column 4", [7.0, 8.0]),
            # Empty Column 2
            ("Column 6", [11.0, 12.0]),
        ])
        expected = DataFrame.from_dict(expected_data)
        expected.set_index("Header", inplace=True)
        populated = ["Column 1", "Column 2", "Column 4", "Column 6"]
        tm.assert_equal(sheet[populated], expected)

        # The two blank header cells must yield all-null columns.
        for blank_name in [None, 'None.1']:
            for entry in sheet[blank_name]:
                assert pandas.isnull(entry)

    def test_read_writer_table(self):
        """ODF reuses the same table tags in Writer and Presentation files

        Test reading a table out of a text document
        """
        table = self.get_opendocumentdf(
            'writertable.odt', 'Table1', index_col=0)

        assert table.shape == (3, 3)
        expected_data = OrderedDict([
            ("Header", ["Row 1", "Row 2", "Row 3"]),
            ("Column 1", [1.0, 2.0, 3.0]),
            ("Unnamed: 2", [nan, nan, nan]),
            ("Column 3", [7.0, 8.0, 9.0]),
        ])
        expected = DataFrame.from_dict(expected_data)
        expected.set_index("Header", inplace=True)
        named = ["Column 1", "Column 3"]
        tm.assert_equal(table[named], expected[named])

        # make sure pandas gives a name to the unnamed column
        for position in range(3):
            assert pandas.isnull(table["Unnamed: 2"][position])

    def test_blank_row_repeat(self):
        """A sheet containing repeated blank rows is still readable"""
        table = self.get_opendocumentdf(
            'blank-row-repeat.ods', 'Biosamples')

        assert table.shape == (8, 9)
        assert table['biosample_accession'][7] == 9.0

    def test_runlengthencoding(self):
        """Calc will use repeat when adjacent columns have the same value.
        """
        sheet = self.get_opendocumentdf(
            'runlengthencoding.ods', 'Sheet1', header=None)
        assert sheet.shape == (5, 3)
        # check by column, not by row.
        expected_columns = {
            0: [1.0, 1.0, 2.0, 2.0, 2.0],
            1: [1.0, 2.0, 2.0, 2.0, 2.0],
            2: [1.0, 2.0, 2.0, 2.0, 2.0],
        }
        for column in (0, 1, 2):
            assert list(sheet[column]) == expected_columns[column]

0 commit comments

Comments
 (0)