
io.html.read_html: support XPath expressions for table selection #5416


Closed · wants to merge 6 commits
2 changes: 2 additions & 0 deletions doc/source/release.rst
@@ -52,6 +52,8 @@ pandas 0.13.0
 New features
 ~~~~~~~~~~~~

+- ``read_html()`` now accepts an ``xpath`` string argument representing an
+  XPath expression used for selecting tables to be read (:issue:`5416`)
 - ``plot(kind='kde')`` now accepts the optional parameters ``bw_method`` and
   ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set
   the bandwidth, and to gkde.evaluate() to specify the indices at which it
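For illustration, a minimal sketch of the new argument described in this release note (the file name is hypothetical; assumes lxml is installed, since only the lxml flavor implements ``xpath``):

```python
import pandas as pd

# Select only the tables whose class attribute is "dataframe"; when
# ``xpath`` is given, the ``match`` regex is not used for selection.
dfs = pd.read_html('stats.html', xpath="//table[@class='dataframe']")
```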
74 changes: 52 additions & 22 deletions pandas/io/html.py
@@ -165,13 +165,15 @@ class _HtmlFrameParser(object):
     See each method's respective documentation for details on their
     functionality.
     """
-    def __init__(self, io, match, attrs):
+    def __init__(self, io, match, attrs, xpath):
         self.io = io
         self.match = match
         self.attrs = attrs
+        self.xpath = xpath

     def parse_tables(self):
-        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+        tables = self._parse_tables(self._build_doc(), self.match, self.attrs,
+                                    self.xpath)
         return (self._build_table(table) for table in tables)

     def _parse_raw_data(self, rows):
@@ -227,7 +229,7 @@ def _parse_td(self, obj):
         """
         raise NotImplementedError

-    def _parse_tables(self, doc, match, attrs):
+    def _parse_tables(self, doc, match, attrs, xpath):
        """Return all tables from the parsed DOM.

        Parameters
@@ -242,6 +244,9 @@ def _parse_tables(self, doc, match, attrs):
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

+        xpath : str or None
+            An XPath style string used to filter for tables to be returned.
+
        Raises
        ------
        ValueError
@@ -393,7 +398,7 @@ def _parse_tbody(self, table):
     def _parse_tfoot(self, table):
         return table.find_all('tfoot')

-    def _parse_tables(self, doc, match, attrs):
+    def _parse_tables(self, doc, match, attrs, xpath):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)

@@ -481,24 +486,36 @@ def _parse_tr(self, table):
         expr = './/tr[normalize-space()]'
         return table.xpath(expr)

-    def _parse_tables(self, doc, match, kwargs):
-        pattern = match.pattern
-
-        # 1. check all descendants for the given pattern and only search tables
-        # 2. go up the tree until we find a table
-        query = '//table//*[re:test(text(), %r)]/ancestor::table'
-        xpath_expr = u(query) % pattern
-
-        # if any table attributes were given build an xpath expression to
-        # search for them
-        if kwargs:
-            xpath_expr += _build_xpath_expr(kwargs)
-
-        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
-
-        if not tables:
-            raise ValueError("No tables found matching regex %r" % pattern)
-        return tables
+    def _parse_tables(self, doc, match, kwargs, xpath):
+        if xpath:
+            xpath_expr = xpath
+            tables = doc.xpath(xpath_expr)
+
+            if not all(table.tag == 'table' for table in tables):
+                raise ValueError("XPath expression %r matched non-table "
+                                 "elements" % xpath)
+
+            if not tables:
+                raise ValueError("No tables found using XPath expression %r"
+                                 % xpath)
+            return tables
+        else:
+            pattern = match.pattern
+
+            # 1. check all descendants for the given pattern and only search tables
+            # 2. go up the tree until we find a table
+            query = '//table//*[re:test(text(), %r)]/ancestor::table'
+            xpath_expr = u(query) % pattern
+
+            # if any table attributes were given build an xpath expression to
+            # search for them
+            if kwargs:
+                xpath_expr += _build_xpath_expr(kwargs)
+
+            tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
+
+            if not tables:
+                raise ValueError("No tables found matching regex %r" % pattern)
+            return tables

     def _build_doc(self):
         """

Review comment from a member, attached to the ``else:`` branch above: "I think it would be nice to dump the below into a function...but not necessary for this PR."
@@ -688,15 +705,22 @@ def _validate_flavor(flavor):


 def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-           parse_dates, tupleize_cols, thousands, attrs):
+           parse_dates, tupleize_cols, thousands, attrs, xpath):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here

+    if xpath and not _HAS_LXML:
+        raise ValueError("XPath table selection needs the lxml module, "
+                         "please install it.")
+
     # hack around python 3 deleting the exception variable
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs)
+        if xpath and flav in ('bs4', 'html5lib'):
+            raise NotImplementedError
+
+        p = parser(io, compiled_match, attrs, xpath)

         try:
             tables = p.parse_tables()
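A short sketch of the dispatch rules added here: ``xpath`` requires lxml to be importable, and combining it with a Beautiful Soup based flavor raises (file name hypothetical):

```python
import pandas as pd

try:
    pd.read_html('stats.html', flavor='bs4',
                 xpath="//table[@class='dataframe']")
except NotImplementedError:
    # XPath-based selection is only wired up for the lxml parser.
    pass
```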
@@ -714,7 +738,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,

 def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, infer_types=None, attrs=None, parse_dates=False,
-              tupleize_cols=False, thousands=','):
+              tupleize_cols=False, thousands=',', xpath=None):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.

     Parameters
@@ -795,6 +819,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
     thousands : str, optional
         Separator to use to parse thousands. Defaults to ``','``.

+    xpath : str or None, optional
+        If not ``None``, try to identify the set of tables to be read by an
+        XPath string; takes precedence over ``match``. Defaults to ``None``.
+        Note: this functionality is not (yet) available with the Beautiful
+        Soup parser (``flavor='bs4'``).
+
     Returns
     -------
     dfs : list of DataFrames
@@ -840,4 +870,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         raise ValueError('cannot skip rows starting from the end of the '
                          'data (you passed a negative value)')
     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-                  parse_dates, tupleize_cols, thousands, attrs)
+                  parse_dates, tupleize_cols, thousands, attrs, xpath)
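As the docstring says, ``xpath`` takes precedence over ``match`` when both are supplied; a sketch with a hypothetical URL and table id:

```python
import pandas as pd

# Selection is driven entirely by the XPath expression here; the ``match``
# regex is not consulted by the xpath branch of _parse_tables.
dfs = pd.read_html('http://example.com/population.html',
                   match='Population', xpath="//table[@id='totals']")
```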
84 changes: 84 additions & 0 deletions pandas/io/tests/test_html.py
@@ -18,6 +18,11 @@
 from numpy.random import rand
 from numpy.testing.decorators import slow

+try:
+    from lxml.etree import XPathEvalError
+except ImportError:
+    pass
+
 from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
                     date_range, Series)
 from pandas.compat import map, zip, StringIO, string_types
@@ -581,12 +586,28 @@ def test_parse_dates_combine(self):
         newdf = DataFrame({'datetime': raw_dates})
         tm.assert_frame_equal(newdf, res[0])

+    def test_xpath_bs4_not_implemented(self):
+        with open(self.spam_data) as f:
+            with self.assertRaises(NotImplementedError):
+                self.read_html(f, flavor='bs4',
+                               xpath="//div[@class='garbage']/table")
+

 class TestReadHtmlLxml(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         _skip_if_no('lxml')

+    def setup_data(self):
+        self.valid_data = os.path.join(DATA_PATH, 'valid_markup.html')
+
+    def setup_flavor(self):
+        self.flavor = 'lxml'
+
+    def setUp(self):
+        self.setup_data()
+        self.setup_flavor()
+
     def read_html(self, *args, **kwargs):
         self.flavor = ['lxml']
         kwargs['flavor'] = kwargs.get('flavor', self.flavor)
@@ -630,6 +651,69 @@ def test_parse_dates_combine(self):
         newdf = DataFrame({'datetime': raw_dates})
         tm.assert_frame_equal(newdf, res[0])

+    def test_attrs_file_like(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f, attrs={'class': 'dataframe'})
+
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    def test_match_no_match(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f, match='supercalifragilistic')
+
+    def test_xpath_file_like(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f, xpath="//table[@class='dataframe']")
+
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    @slow
+    def test_xpath_file_url(self):
+        url = self.valid_data
+        dfs = self.read_html(file_path_to_url(url),
+                             xpath="//*[@class='dataframe']")
+        tm.assert_isinstance(dfs, list)
+        for df in dfs:
+            tm.assert_isinstance(df, DataFrame)
+
+    def test_xpath_direct_ref(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f,
+                                 xpath="//html/body/table[@class='dataframe']"
+                                       "[last()]")
+        assert dfs[0].shape == (2, 3)
+
+    def test_xpath_match_multiple(self):
+        with open(self.valid_data) as f:
+            dfs = self.read_html(f, xpath="//*[@class='dataframe']")
+
+        assert len(dfs) == 2
+
+    def test_xpath_match_none(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f, xpath="//div[@class='garbage']/table")
+
+    def test_xpath_not_all_tables(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(ValueError):
+                self.read_html(f, xpath="//tr")
+
+    def test_invalid_xpath(self):
+        with open(self.valid_data) as f:
+            with self.assertRaises(XPathEvalError):
+                self.read_html(f, xpath="//div[@@class=garbage]/table")
+

 def test_invalid_flavor():
     url = 'google.com'