
io.html.read_html: support XPath expressions for table selection #5416


Closed · wants to merge 6 commits
2 changes: 2 additions & 0 deletions doc/source/release.rst
@@ -52,6 +52,8 @@ pandas 0.13.0
 New features
 ~~~~~~~~~~~~

+- ``read_html()`` now accepts an ``xpath`` string argument representing an
+  XPath expression used for selecting tables to be read (:issue:`5416`)
 - ``plot(kind='kde')`` now accepts the optional parameters ``bw_method`` and
   ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set
   the bandwidth, and to gkde.evaluate() to specify the indices at which it
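For illustration, a minimal sketch of the new argument described in this release note (the file name is hypothetical; assumes lxml is installed, since only the lxml flavor implements ``xpath``):

```python
import pandas as pd

# Select only the tables whose class attribute is "dataframe"; when
# ``xpath`` is given, the ``match`` regex is not used for selection.
dfs = pd.read_html('stats.html', xpath="//table[@class='dataframe']")
```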
74 changes: 52 additions & 22 deletions pandas/io/html.py
@@ -165,13 +165,15 @@ class _HtmlFrameParser(object):
     See each method's respective documentation for details on their
     functionality.
     """
-    def __init__(self, io, match, attrs):
+    def __init__(self, io, match, attrs, xpath):
         self.io = io
         self.match = match
         self.attrs = attrs
+        self.xpath = xpath

     def parse_tables(self):
-        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+        tables = self._parse_tables(self._build_doc(), self.match, self.attrs,
+                                    self.xpath)
         return (self._build_table(table) for table in tables)

     def _parse_raw_data(self, rows):
@@ -227,7 +229,7 @@ def _parse_td(self, obj):
         """
         raise NotImplementedError

-    def _parse_tables(self, doc, match, attrs):
+    def _parse_tables(self, doc, match, attrs, xpath):
        """Return all tables from the parsed DOM.

        Parameters
@@ -242,6 +244,9 @@ def _parse_tables(self, doc, match, attrs):
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

+        xpath : str or None
+            An XPath style string used to filter for tables to be returned.
+
        Raises
        ------
        ValueError
@@ -393,7 +398,7 @@ def _parse_tbody(self, table):
     def _parse_tfoot(self, table):
         return table.find_all('tfoot')

-    def _parse_tables(self, doc, match, attrs):
+    def _parse_tables(self, doc, match, attrs, xpath):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)

@@ -481,24 +486,36 @@ def _parse_tr(self, table):
         expr = './/tr[normalize-space()]'
         return table.xpath(expr)

-    def _parse_tables(self, doc, match, kwargs):
-        pattern = match.pattern
-
-        # 1. check all descendants for the given pattern and only search tables
-        # 2. go up the tree until we find a table
-        query = '//table//*[re:test(text(), %r)]/ancestor::table'
-        xpath_expr = u(query) % pattern
-
-        # if any table attributes were given build an xpath expression to
-        # search for them
-        if kwargs:
-            xpath_expr += _build_xpath_expr(kwargs)
-
-        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
-
-        if not tables:
-            raise ValueError("No tables found matching regex %r" % pattern)
-        return tables
+    def _parse_tables(self, doc, match, kwargs, xpath):
+        if xpath:
+            xpath_expr = xpath
+            tables = doc.xpath(xpath_expr)
+
+            if not all(table.tag == 'table' for table in tables):
+                raise ValueError("XPath expression %r matched non-table "
+                                 "elements" % xpath)
+
+            if not tables:
+                raise ValueError("No tables found using XPath expression %r"
+                                 % xpath)
+            return tables
+        else:
+            pattern = match.pattern
+
+            # 1. check all descendants for the given pattern and only search tables
+            # 2. go up the tree until we find a table
+            query = '//table//*[re:test(text(), %r)]/ancestor::table'
+            xpath_expr = u(query) % pattern
+
+            # if any table attributes were given build an xpath expression to
+            # search for them
+            if kwargs:
+                xpath_expr += _build_xpath_expr(kwargs)
+
+            tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
+
+            if not tables:
+                raise ValueError("No tables found matching regex %r" % pattern)
+            return tables

     def _build_doc(self):
         """

Review comment from a member, attached to the ``else:`` branch above: "I think it would be nice to dump the below into a function...but not necessary for this PR."
@@ -688,15 +705,22 @@ def _validate_flavor(flavor):


 def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-           parse_dates, tupleize_cols, thousands, attrs):
+           parse_dates, tupleize_cols, thousands, attrs, xpath):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here

+    if xpath and not _HAS_LXML:
+        raise ValueError("XPath table selection needs the lxml module, "
+                         "please install it.")
+
     # hack around python 3 deleting the exception variable
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs)
+        if xpath and flav in ('bs4', 'html5lib'):
+            raise NotImplementedError
+
+        p = parser(io, compiled_match, attrs, xpath)

         try:
             tables = p.parse_tables()
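A short sketch of the dispatch rules added here: ``xpath`` requires lxml to be importable, and combining it with a Beautiful Soup based flavor raises (file name hypothetical):

```python
import pandas as pd

try:
    pd.read_html('stats.html', flavor='bs4',
                 xpath="//table[@class='dataframe']")
except NotImplementedError:
    # XPath-based selection is only wired up for the lxml parser.
    pass
```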
@@ -714,7 +738,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,

 def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, infer_types=None, attrs=None, parse_dates=False,
-              tupleize_cols=False, thousands=','):
+              tupleize_cols=False, thousands=',', xpath=None):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

     Parameters
@@ -795,6 +819,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
     thousands : str, optional
         Separator to use to parse thousands. Defaults to ``','``.

+    xpath : str or None, optional
+        If not ``None``, try to identify the set of tables to be read by an
+        XPath string; takes precedence over ``match``. Defaults to ``None``.
+        Note: this functionality is not (yet) available with the Beautiful
+        Soup parser (``flavor='bs4'``).
+
     Returns
     -------
     dfs : list of DataFrames
@@ -840,4 +870,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         raise ValueError('cannot skip rows starting from the end of the '
                          'data (you passed a negative value)')
     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-                  parse_dates, tupleize_cols, thousands, attrs)
+                  parse_dates, tupleize_cols, thousands, attrs, xpath)
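As the docstring says, ``xpath`` takes precedence over ``match`` when both are supplied; a sketch with a hypothetical URL and table id:

```python
import pandas as pd

# Selection is driven entirely by the XPath expression here; the ``match``
# regex is not consulted by the xpath branch of _parse_tables.
dfs = pd.read_html('http://example.com/population.html',
                   match='Population', xpath="//table[@id='totals']")
```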
84 changes: 84 additions & 0 deletions pandas/io/tests/test_html.py
@@ -18,6 +18,11 @@
 from numpy.random import rand
 from numpy.testing.decorators import slow

+try:
+    from lxml.etree import XPathEvalError
+except ImportError:
+    pass
+
 from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
                     date_range, Series)
 from pandas.compat import map, zip, StringIO, string_types
@@ -581,12 +586,28 @@ def test_parse_dates_combine(self):
         newdf = DataFrame({'datetime': raw_dates})
         tm.assert_frame_equal(newdf, res[0])

+    def test_xpath_bs4_not_implemented(self):
+        with open(self.spam_data) as f:
+            with self.assertRaises(NotImplementedError):
+                self.read_html(f, flavor='bs4',
+                               xpath="//div[@class='garbage']/table")
+

 class TestReadHtmlLxml(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         _skip_if_no('lxml')

+    def setup_data(self):
+        self.valid_data = os.path.join(DATA_PATH, 'valid_markup.html')
+
+    def setup_flavor(self):
+        self.flavor = 'lxml'
+
+    def setUp(self):
+        self.setup_data()
+        self.setup_flavor()
+
     def read_html(self, *args, **kwargs):
         self.flavor = ['lxml']
         kwargs['flavor'] = kwargs.get('flavor', self.flavor)
@@ -630,6 +651,69 @@ def test_parse_dates_combine(self):
         newdf = DataFrame({'datetime': raw_dates})
         tm.assert_frame_equal(newdf, res[0])

+    def test_attrs_file_like(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f, attrs={'class': 'dataframe'})
+
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    def test_match_no_match(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f, match='supercalifragilistic')
+
+    def test_xpath_file_like(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f, xpath="//table[@class='dataframe']")
+
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    @slow
+    def test_xpath_file_url(self):
+        url = self.valid_data
+        dfs = self.read_html(file_path_to_url(url),
+                             xpath="//*[@class='dataframe']")
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    def test_xpath_direct_ref(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f,
+                                 xpath="//html/body/table[@class='dataframe']"
+                                       "[last()]")
+        assert dfs[0].shape == (2, 3)
+
+    def test_xpath_match_multiple(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f, xpath="//*[@class='dataframe']")
+
+        assert len(dfs) == 2
+
+    def test_xpath_match_none(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f, xpath="//div[@class='garbage']/table")
+
+    def test_xpath_not_all_tables(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f, xpath="//tr")
+
+    def test_invalid_xpath(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(XPathEvalError):
+                self.read_html(f, xpath="//div[@@class=garbage]/table")
+

 def test_invalid_flavor():
     url = 'google.com'