From 3e5879440a6b513233f71f62de9234f67125dee7 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Thu, 14 Jun 2018 13:09:44 -0400
Subject: [PATCH 01/12] Handle colspan and rowspan

This is essentially a rebased and squashed #17054 (mad props to @jowens
for doing all the hard thinking). My tweaks:

* test_computer_sales_page (see #17074) no longer tests for ParserError,
  because the ParserError was a bug caused by missing colspan support.
  Now, test that MultiIndex works as expected.
* I respectfully removed the fill_rowspan argument from #17073. Instead,
  the virtual cells created by rowspan/colspan are always copies of the
  real cells' text. This prevents _infer_columns() from naming virtual
  cells as "Unnamed: ..."
* I removed a small layer of abstraction to respect #20891 (multiple
  <tbody> support), which was implemented after @jowens' pull request.
  Now _HtmlFrameParser has _parse_thead_tr, _parse_tbody_tr and
  _parse_tfoot_tr, each returning a list of <tr>s. That let me remove
  _parse_tr, Making All The Tests Pass.
* That caused a snowball effect. lxml does not fix malformed <thead>, as
  tested by spam.html. The previous hacky workaround was in
  _parse_raw_thead, but the new _parse_thead_tr signature returns nodes
  instead of text. The new hacky solution: return the <thead> itself,
  pretending it's a <tr>. This works in all the tests. A better solution
  is to use html5lib with lxml; but that might belong in a separate pull
  request.
---
 doc/source/whatsnew/v0.24.0.txt |   3 +-
 pandas/io/html.py               | 368 ++++++++++++++++++--------------
 pandas/tests/io/test_html.py    | 210 ++++++++++++++++--
 3 files changed, 405 insertions(+), 176 deletions(-)
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index abf574ae109fd..0f0ad3452e934 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -24,6 +24,7 @@ Other Enhancements
   <https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-5-0>`__.
   (:issue:`21627`)
 - New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
+- :func:`read_html` handles colspan and rowspan attributes and attempts to infer a header if the header is not explicitly specified (:issue:`17054`)
 -
 
 .. _whatsnew_0240.api_breaking:
@@ -223,7 +224,7 @@ MultiIndex
 I/O
 ^^^
 
--
+- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 -
 -
 
diff --git a/pandas/io/html.py b/pandas/io/html.py
index 8fd876e85889f..6e774f1846b99 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -10,13 +10,11 @@
 
 from distutils.version import LooseVersion
 
-import numpy as np
-
 from pandas.core.dtypes.common import is_list_like
 from pandas.errors import EmptyDataError
 from pandas.io.common import _is_url, urlopen, _validate_header_arg
 from pandas.io.parsers import TextParser
-from pandas.compat import (lrange, lmap, u, string_types, iteritems,
+from pandas.compat import (lrange, lmap, lfilter, u, string_types, iteritems,
                            raise_with_traceback, binary_type)
 from pandas import Series
 import pandas.core.common as com
@@ -193,11 +191,11 @@ class _HtmlFrameParser(object):
         * :func:`_build_doc`
         * :func:`_text_getter`
         * :func:`_parse_td`
+        * :func:`_parse_thead_tr`
+        * :func:`_parse_tbody_tr`
+        * :func:`_parse_tfoot_tr`
         * :func:`_parse_tables`
-        * :func:`_parse_tr`
-        * :func:`_parse_thead`
-        * :func:`_parse_tbody`
-        * :func:`_parse_tfoot`
+        * :func:`_equals_tag`
     See each method's respective documentation for details on their
     functionality.
     """
@@ -210,32 +208,14 @@ def __init__(self, io, match, attrs, encoding, displayed_only):
         self.displayed_only = displayed_only
 
     def parse_tables(self):
-        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
-        return (self._build_table(table) for table in tables)
-
-    def _parse_raw_data(self, rows):
-        """Parse the raw data into a list of lists.
-
-        Parameters
-        ----------
-        rows : iterable of node-like
-            A list of row elements.
-
-        text_getter : callable
-            A callable that gets the text from an individual node. This must be
-            defined by subclasses.
-
-        column_finder : callable
-            A callable that takes a row node as input and returns a list of the
-            column node in that row. This must be defined by subclasses.
+        """Parse and return all tables from the DOM.
 
         Returns
         -------
-        data : list of list of strings
+        tables : list of parsed (header, body, footer) tuples from tables
         """
-        data = [[_remove_whitespace(self._text_getter(col)) for col in
-                 self._parse_td(row)] for row in rows]
-        return data
+        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+        return (self._parse_thead_tbody_tfoot(table) for table in tables)
 
     def _text_getter(self, obj):
         """Return the text of an individual DOM node.
@@ -257,7 +237,7 @@ def _parse_td(self, obj):
 
         Parameters
         ----------
-        obj : node-like
+        obj : an HTML row element
 
         Returns
         -------
@@ -266,90 +246,88 @@ def _parse_td(self, obj):
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tables(self, doc, match, attrs):
-        """Return all tables from the parsed DOM.
+    def _parse_thead_tr(self, table):
+        """Return the list of thead row elements from the parsed table element.
 
         Parameters
         ----------
-        doc : tree-like
-            The DOM from which to parse the table element.
-
-        match : str or regular expression
-            The text to search for in the DOM tree.
-
-        attrs : dict
-            A dictionary of table attributes that can be used to disambiguate
-            multiple tables on a page.
-
-        Raises
-        ------
-        ValueError
-            * If `match` does not match any text in the document.
+        table : a table element that contains zero or more thead elements.
 
         Returns
         -------
-        tables : list of node-like
-            A list of <table> elements to be parsed into raw data.
+        rows : list of <tr> row elements of a table
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tr(self, table):
-        """Return the list of row elements from the parsed table element.
+    def _parse_tbody_tr(self, table):
+        """Return the list of tbody row elements from the parsed table element.
+
+        HTML5 table bodies consist of either 0 or more <tbody> elements (which
+        only contain <tr> elements) or 0 or more <tr> elements. This method
+        checks for both structures.
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        table : a table element that contains row elements.
 
         Returns
         -------
-        rows : list of node-like
-            A list row elements of a table, usually <tr> or <th> elements.
+        rows : list of <tr> row elements of a table
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_thead(self, table):
-        """Return the header of a table.
+    def _parse_tfoot_tr(self, table):
+        """Return the list of tfoot row elements from the parsed table element.
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        table : a table element that contains row elements.
 
         Returns
         -------
-        thead : node-like
-            A <thead>...</thead> element.
+        rows : list of <tr> row elements of a table
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tbody(self, table):
-        """Return the list of tbody elements from the parsed table element.
+    def _parse_tables(self, doc, match, attrs):
+        """Return all tables from the parsed DOM.
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        doc : the DOM from which to parse the table element.
+
+        match : str or regular expression
+            The text to search for in the DOM tree.
+
+        attrs : dict
+            A dictionary of table attributes that can be used to disambiguate
+            multiple tables on a page.
+
+        Raises
+        ------
+        ValueError : `match` does not match any text in the document.
 
         Returns
         -------
-        tbodys : list of node-like
-            A list of <tbody>...</tbody> elements
+        tables : list of HTML <table> elements to be parsed into raw data.
         """
         raise com.AbstractMethodError(self)
 
-    def _parse_tfoot(self, table):
-        """Return the footer of the table if any.
+    def _equals_tag(self, obj, tag):
+        """Return whether an individual DOM node matches a tag
 
         Parameters
         ----------
-        table : node-like
-            A table element that contains row elements.
+        obj : node-like
+            A DOM node.
+
+        tag : str
+            Tag name to be checked for equality
 
         Returns
         -------
-        tfoot : node-like
-            A <tfoot>...</tfoot> element.
+        is_tag_equal : boolean
+            boolean indicating if the object is equal to tag 'tag'
         """
         raise com.AbstractMethodError(self)
 
@@ -358,47 +336,115 @@ def _build_doc(self):
 
         Returns
         -------
-        obj : tree-like
+        obj : the DOM from which to parse the table element.
         """
         raise com.AbstractMethodError(self)
 
-    def _build_table(self, table):
-        header = self._parse_raw_thead(table)
-        body = self._parse_raw_tbody(table)
-        footer = self._parse_raw_tfoot(table)
-        return header, body, footer
+    def _parse_thead_tbody_tfoot(self, table_html):
+        """Given a table, return parsed header, body, and foot.
+           Header and body are lists-of-lists. Top level list is a list of
+           rows. Each row is a list of parsed elements.
 
-    def _parse_raw_thead(self, table):
-        thead = self._parse_thead(table)
-        res = []
-        if thead:
-            trs = self._parse_tr(thead[0])
-            for tr in trs:
-                cols = lmap(self._text_getter, self._parse_td(tr))
-                if any(col != '' for col in cols):
-                    res.append(cols)
-        return res
+           Logic: Use <thead>, <tbody>, <tfoot> elements to identify
+                  header, body, and footer, otherwise:
+                  - Put all rows into body
+                  - Move rows from top of body to header only if
+                    all elements inside row are <th>
+                  - Move rows from bottom of body to footer only if
+                    all elements inside row are <th>
 
-    def _parse_raw_tfoot(self, table):
-        tfoot = self._parse_tfoot(table)
-        res = []
-        if tfoot:
-            res = lmap(self._text_getter, self._parse_td(tfoot[0]))
-        return np.atleast_1d(
-            np.array(res).squeeze()) if res and len(res) == 1 else res
+        Parameters
+        ----------
+        table_html : a single HTML table element.
 
-    def _parse_raw_tbody(self, table):
-        tbodies = self._parse_tbody(table)
+        Returns
+        -------
+        tuple of (header, body, footer)
+        header : list of rows, each of which is a list of parsed
+                 header elements
+        body : list of rows, each of which is a list of parsed body elements
+        footer : list of rows, each of which is a list of parsed
+                 footer elements
+        """
 
-        raw_data = []
+        header_rows = self._parse_thead_tr(table_html)
+        body_rows = self._parse_tbody_tr(table_html)
+        footer_rows = self._parse_tfoot_tr(table_html)
 
-        if tbodies:
-            for tbody in tbodies:
-                raw_data.extend(self._parse_tr(tbody))
-        else:
-            raw_data.extend(self._parse_tr(table))
+        if not header_rows:
+            # The table has no <thead>. Treat first all-<th> rows as headers.
+            while body_rows and all(self._equals_tag(t, 'th') for t in
+                                    self._parse_td(body_rows[0])):
+                # this row should be a header row, move it from body to header
+                header_rows.append(body_rows.pop(0))
+
+        if not footer_rows:
+            # The table has no <tfoot>. Treat last all-<th> rows as footers.
+            while body_rows and all(self._equals_tag(t, 'th') for t in
+                                    self._parse_td(body_rows[-1])):
+                # this row should be a footer row, move it from body to footer
+                footer_rows.insert(0, body_rows.pop())
+
+        header = self._expand_colspan_rowspan(header_rows)
+        body = self._expand_colspan_rowspan(body_rows)
+        footer = self._expand_colspan_rowspan(footer_rows)
+
+        return header, body, footer
+
+    def _expand_colspan_rowspan(self, rows):
+        """Given a list of <tr>s, return a list of text rows that copy cell
+           text across rowspans/colspans.
 
-        return self._parse_raw_data(raw_data)
+        Parameters
+        ----------
+        rows : list of <tr>s
+
+        Returns
+        -------
+        res : list of rows, each of which is a list of str in that row
+        """
+
+        res = []
+        saved_span = []
+        for row in rows:
+            extracted_row = self._parse_td(row)
+            cols_text = [_remove_whitespace(
+                self._text_getter(col)) for col in extracted_row]
+            col_colspans = [int(col.get('colspan', 1))
+                            for col in extracted_row]
+            col_rowspans = [int(col.get('rowspan', 1))
+                            for col in extracted_row]
+            # expand cols using col_colspans
+            # maybe this can be done with a list comprehension, dunno
+            cols = list(zip(
+                list(com.flatten(
+                    lmap(lambda text_nc: [text_nc[0]] * text_nc[1],
+                         list(zip(cols_text, col_colspans))))),
+                list(com.flatten(
+                    lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0],
+                         list(zip(col_colspans, col_rowspans))))))
+            )
+            # cols is now a list of (text, number of rows)
+            # now insert any previous rowspans
+            for (col, (text, nr)) in saved_span:
+                cols.insert(col, (text, nr))
+
+            # save next saved_span
+            def advance_item_to_next_row(item):
+                (col, (text, nr)) = item
+                if nr == 1:
+                    return None
+                else:
+                    return (col, (text, nr - 1))
+            saved_span = lfilter(lambda i: i is not None,
+                                 lmap(advance_item_to_next_row,
+                                      list(enumerate(cols))))
+            cols = [text for (text, nr) in cols]
+            # generate cols with text only
+            if any([col != '' for col in cols]):
+                res.append(cols)
+
+        return res
 
     def _handle_hidden_tables(self, tbl_list, attr_name):
         """Returns list of tables, potentially removing hidden elements
@@ -442,27 +488,6 @@ def __init__(self, *args, **kwargs):
         from bs4 import SoupStrainer
         self._strainer = SoupStrainer('table')
 
-    def _text_getter(self, obj):
-        return obj.text
-
-    def _parse_td(self, row):
-        return row.find_all(('td', 'th'))
-
-    def _parse_tr(self, element):
-        return element.find_all('tr')
-
-    def _parse_th(self, element):
-        return element.find_all('th')
-
-    def _parse_thead(self, table):
-        return table.find_all('thead')
-
-    def _parse_tbody(self, table):
-        return table.find_all('tbody')
-
-    def _parse_tfoot(self, table):
-        return table.find_all('tfoot')
-
     def _parse_tables(self, doc, match, attrs):
         element_name = self._strainer.name
         tables = doc.find_all(element_name, attrs=attrs)
@@ -490,6 +515,27 @@ def _parse_tables(self, doc, match, attrs):
                              .format(patt=match.pattern))
         return result
 
+    def _text_getter(self, obj):
+        return obj.text
+
+    def _equals_tag(self, obj, tag):
+        return obj.name == tag
+
+    def _parse_td(self, row):
+        return row.find_all(('td', 'th'), recursive=False)
+
+    def _parse_thead_tr(self, table):
+        return table.select('thead tr')
+
+    def _parse_tbody_tr(self, table):
+        from_tbody = table.select('tbody tr')
+        from_root = table.find_all('tr', recursive=False)
+        # HTML spec: at most one of these lists has content
+        return from_tbody + from_root
+
+    def _parse_tfoot_tr(self, table):
+        return table.select('tfoot tr')
+
     def _setup_build_doc(self):
         raw_text = _read(self.io)
         if not raw_text:
@@ -554,10 +600,9 @@ def _text_getter(self, obj):
         return obj.text_content()
 
     def _parse_td(self, row):
-        return row.xpath('.//td|.//th')
-
-    def _parse_tr(self, table):
-        return table.xpath('.//tr')
+        # Look for direct children only: the "row" element here may be a
+        # <thead> or <tfoot> (see _parse_thead_tr).
+        return row.xpath('./td|./th')
 
     def _parse_tables(self, doc, match, kwargs):
         pattern = match.pattern
@@ -590,6 +635,12 @@ def _parse_tables(self, doc, match, kwargs):
                              .format(patt=pattern))
         return tables
 
+    def _equals_tag(self, obj, tag):
+        return obj.tag == tag
+
+    def _contains_tag(self, obj, tag):
+        return obj.find(tag) is not None
+
     def _build_doc(self):
         """
         Raises
@@ -637,41 +688,30 @@ def _build_doc(self):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
         return r
 
-    def _parse_tbody(self, table):
-        return table.xpath('.//tbody')
+    def _parse_thead_tr(self, table):
+        rows = []
 
-    def _parse_thead(self, table):
-        return table.xpath('.//thead')
+        for thead in table.xpath('.//thead'):
+            rows.extend(thead.xpath('./tr'))
 
-    def _parse_tfoot(self, table):
-        return table.xpath('.//tfoot')
+            # lxml does not clean up the clearly-erroneous
+            # <thead><th>foo</th><th>bar</th></thead>.
+            elements_at_root = thead.xpath('./td|./th')
+            if elements_at_root:
+                # Pass the entire <thead> as a row. _parse_td() will interpret
+                # it correctly.
+                rows.append(thead)
 
-    def _parse_raw_thead(self, table):
-        expr = './/thead'
-        thead = table.xpath(expr)
-        res = []
-        if thead:
-            # Grab any directly descending table headers first
-            ths = thead[0].xpath('./th')
-            if ths:
-                cols = [_remove_whitespace(x.text_content()) for x in ths]
-                if any(col != '' for col in cols):
-                    res.append(cols)
-            else:
-                trs = self._parse_tr(thead[0])
+        return rows
 
-                for tr in trs:
-                    cols = [_remove_whitespace(x.text_content()) for x in
-                            self._parse_td(tr)]
+    def _parse_tbody_tr(self, table):
+        from_tbody = table.xpath('.//tbody//tr')
+        from_root = table.xpath('./tr')
+        # HTML spec: at most one of these lists has content
+        return from_tbody + from_root
 
-                    if any(col != '' for col in cols):
-                        res.append(cols)
-        return res
-
-    def _parse_raw_tfoot(self, table):
-        expr = './/tfoot//th|//tfoot//td'
-        return [_remove_whitespace(x.text_content()) for x in
-                table.xpath(expr)]
+    def _parse_tfoot_tr(self, table):
+        return table.xpath('.//tfoot//tr')
 
 
 def _expand_elements(body):
@@ -695,7 +735,7 @@ def _data_to_frame(**kwargs):
             header = 0 if rows == [0] else rows
 
     if foot:
-        body += [foot]
+        body += foot
 
     # fill out elements of body that are "ragged"
     _expand_elements(body)
@@ -953,7 +993,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
 
     This function searches for ``<table>`` elements and only for ``<tr>``
     and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
-    element in the table. ``<td>`` stands for "table data".
+    element in the table. ``<td>`` stands for "table data". This function
+    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
+    If the table has a ``<thead>`` element, it is used to construct
+    the header; otherwise the function attempts to find the header within
+    the body (by putting rows with only ``<th>`` elements into the header).
+
+        .. versionadded:: 0.24.0
 
     Similar to :func:`~pandas.read_csv` the `header` argument is applied
     **after** `skiprows` is applied.
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 9c6a8de7ed446..b8f520ee17d72 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -18,7 +18,6 @@
 from pandas.io.common import URLError, file_path_to_url
 import pandas.io.html
 from pandas.io.html import read_html
-from pandas._libs.parsers import ParserError
 
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
@@ -129,16 +128,7 @@ def test_banklist(self):
 
         assert_framelist_equal(df1, df2)
 
-    def test_spam_no_types(self):
-
-        # infer_types removed in #10892
-        df1 = self.read_html(self.spam_data, '.*Water.*')
-        df2 = self.read_html(self.spam_data, 'Unit')
-        assert_framelist_equal(df1, df2)
-        assert df1[0].iloc[0, 0] == 'Proximates'
-        assert df1[0].columns[0] == 'Nutrient'
-
-    def test_spam_with_types(self):
+    def test_spam(self):
         df1 = self.read_html(self.spam_data, '.*Water.*')
         df2 = self.read_html(self.spam_data, 'Unit')
         assert_framelist_equal(df1, df2)
@@ -372,7 +362,7 @@ def test_thousands_macau_stats(self, datapath):
                              attrs={'class': 'style1'})
         df = dfs[all_non_nan_table_index]
 
-        assert not any(s.isna().any() for _, s in df.iteritems())
+        assert not any(s.isnull().any() for _, s in df.iteritems())
 
     @pytest.mark.slow
     def test_thousands_macau_index_col(self, datapath):
@@ -381,7 +371,7 @@ def test_thousands_macau_index_col(self, datapath):
         dfs = self.read_html(macau_data, index_col=0, header=0)
         df = dfs[all_non_nan_table_index]
 
-        assert not any(s.isna().any() for _, s in df.iteritems())
+        assert not any(s.isnull().any() for _, s in df.iteritems())
 
     def test_empty_tables(self):
         """
@@ -461,6 +451,44 @@ def test_header_and_one_column(self):
         result = self.read_html(data)[0]
         tm.assert_frame_equal(result, expected)
 
+    def test_thead_without_tr(self):
+        """
+        Ensure parser adds <tr> within <thead> on malformed HTML.
+        """
+        data1 = StringIO('''<table>
+            <thead>
+                <tr>
+                    <th>Country</th>
+                    <th>Municipality</th>
+                    <th>Year</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>Ukraine</td>
+                    <th>Odessa</th>
+                    <td>1944</td>
+                </tr>
+            </tbody>
+        </table>''')
+        data2 = StringIO('''<table>
+            <thead>
+                <th>Country</th>
+                <th>Municipality</th>
+                <th>Year</th>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>Ukraine</td>
+                    <th>Odessa</th>
+                    <td>1944</td>
+                </tr>
+            </tbody>
+        </table>''')
+        res1 = self.read_html(data1)
+        res2 = self.read_html(data2, header=0)
+        assert_framelist_equal(res1, res2)
+
     def test_tfoot_read(self):
         """
         Make sure that read_html reads tfoot, containing td or th.
@@ -592,7 +620,7 @@ def test_gold_canyon(self):
                             attrs={'id': 'table'})[0]
         assert gc in df.to_string()
 
-    def test_different_number_of_rows(self):
+    def test_different_number_of_cols(self):
         expected = """<table border="1" class="dataframe">
                         <thead>
                             <tr style="text-align: right;">
@@ -654,6 +682,160 @@ def test_different_number_of_rows(self):
         res = self.read_html(out, index_col=0)[0]
         tm.assert_frame_equal(expected, res)
 
+    def test_colspan_rowspan_are_1(self):
+        # GH17054
+        expected = """<table>
+                        <thead>
+                            <tr>
+                            <th>X</th>
+                            <th>Y</th>
+                            <th>Z</th>
+                            <th>W</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                        </tbody>
+                    </table>"""
+        out = """<table>
+                   <thead>
+                       <tr>
+                       <th colspan="1">X</th>
+                       <th>Y</th>
+                       <th rowspan="1">Z</th>
+                       <th>W</th>
+                       </tr>
+                   </thead>
+                   <tbody>
+                   </tbody>
+               </table>"""
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_colspan_rowspan_are_more_than_1(self):
+        # GH17054
+        expected = """<table>
+                        <thead>
+                            <tr>
+                            <th>X</th>
+                            <th>X</th>
+                            <th>Y</th>
+                            <th>Z</th>
+                            <th>W</th>
+                            </tr>
+                            <tr>
+                            <th>1</th>
+                            <th>2</th>
+                            <th>2</th>
+                            <th>Z</th>
+                            <th>3</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                        </tbody>
+                    </table>"""
+        out = """<table>
+                   <thead>
+                       <tr>
+                       <th colspan="2">X</th>
+                       <th>Y</th>
+                       <th rowspan="2">Z</th>
+                       <th>W</th>
+                       </tr>
+                       <tr>
+                       <th>1</th>
+                       <th colspan="2">2</th>
+                       <th>3</th>
+                       </tr>
+                   </thead>
+                   <tbody>
+                   </tbody>
+               </table>"""
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_tbody_colspan_rowspan_copy_values(self):
+        # GH17054
+        expected = """<table>
+                        <tbody>
+                            <tr>
+                            <td>1</td>
+                            <td>1</td>
+                            <td>2</td>
+                            <td>3</td>
+                            <td>4</td>
+                            </tr>
+                            <tr>
+                            <td>5</td>
+                            <td>6</td>
+                            <td>6</td>
+                            <td>3</td>
+                            <td>7</td>
+                            </tr>
+                        </tbody>
+                    </table>"""
+        out = """<table>
+                   <tbody>
+                       <tr>
+                       <td colspan="2">1</td>
+                       <td>2</td>
+                       <td rowspan="2">3</td>
+                       <td>4</td>
+                       </tr>
+                       <tr>
+                       <td>5</td>
+                       <td colspan="2">6</td>
+                       <td>7</td>
+                       </tr>
+                   </tbody>
+               </table>"""
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_header_should_be_inferred_from_th_elements(self):
+        # GH17054
+        expected = """<table>
+                        <thead>
+                            <tr>
+                            <th>X</th>
+                            <th>X</th>
+                            <th>Y</th>
+                            <th>Z</th>
+                            <th>W</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                            <td>1</td>
+                            <td>2</td>
+                            <td>3</td>
+                            <td>4</td>
+                            <td>5</td>
+                        </tbody>
+                    </table>"""
+        out = """<table>
+                            <tr>
+                            <th>X</th>
+                            <th>X</th>
+                            <th>Y</th>
+                            <th>Z</th>
+                            <th>W</th>
+                            </tr>
+                            <tr>
+                            <td>1</td>
+                            <td>2</td>
+                            <td>3</td>
+                            <td>4</td>
+                            <td>5</td>
+                    </table>"""
+        expected = self.read_html(expected)[0]  # header is explicit
+        res = self.read_html(out)[0]            # infer header
+        tm.assert_frame_equal(expected, res)
+        res2 = self.read_html(out, header=0)[0]  # manually set header
+        tm.assert_frame_equal(expected, res2)
+
     def test_parse_dates_list(self):
         df = DataFrame({'date': date_range('1/1/2001', periods=10)})
         expected = df.to_html()

From f89b32a1ed1b770f5bc5f5cad8bd1dee75107f88 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Fri, 15 Jun 2018 11:30:34 -0400
Subject: [PATCH 02/12] Fixes after code review -- thanks, @WillAyd

---
 pandas/io/html.py            | 224 ++++++++++++++++++++------------
 pandas/tests/io/test_html.py | 244 ++++++++++++++++++++++-------------
 2 files changed, 295 insertions(+), 173 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index 6e774f1846b99..d97a79cd51f08 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -14,7 +14,7 @@
 from pandas.errors import EmptyDataError
 from pandas.io.common import _is_url, urlopen, _validate_header_arg
 from pandas.io.parsers import TextParser
-from pandas.compat import (lrange, lmap, lfilter, u, string_types, iteritems,
+from pandas.compat import (lrange, lmap, u, string_types, iteritems,
                            raise_with_traceback, binary_type)
 from pandas import Series
 import pandas.core.common as com
@@ -189,6 +189,7 @@ class _HtmlFrameParser(object):
     -----
     To subclass this class effectively you must override the following methods:
         * :func:`_build_doc`
+        * :func:`_attr_getter`
         * :func:`_text_getter`
         * :func:`_parse_td`
         * :func:`_parse_thead_tr`
@@ -208,7 +209,8 @@ def __init__(self, io, match, attrs, encoding, displayed_only):
         self.displayed_only = displayed_only
 
     def parse_tables(self):
-        """Parse and return all tables from the DOM.
+        """
+        Parse and return all tables from the DOM.
 
         Returns
         -------
@@ -217,8 +219,28 @@ def parse_tables(self):
         tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
         return (self._parse_thead_tbody_tfoot(table) for table in tables)
 
+    def _attr_getter(self, obj, attr):
+        """
+        Return the attribute value of an individual DOM node.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        attr : str or unicode
+            The attribute, such as "colspan"
+
+        Returns
+        -------
+        text : str or unicode
+            The attribute value.
+        """
+        raise com.AbstractMethodError(self)
+
     def _text_getter(self, obj):
-        """Return the text of an individual DOM node.
+        """
+        Return the text of an individual DOM node.
 
         Parameters
         ----------
@@ -237,7 +259,8 @@ def _parse_td(self, obj):
 
         Parameters
         ----------
-        obj : an HTML row element
+        obj : node-like
+            A DOM <tr> node.
 
         Returns
         -------
@@ -247,7 +270,8 @@ def _parse_td(self, obj):
         raise com.AbstractMethodError(self)
 
     def _parse_thead_tr(self, table):
-        """Return the list of thead row elements from the parsed table element.
+        """
+        Return the list of thead row elements from the parsed table element.
 
         Parameters
         ----------
@@ -260,7 +284,8 @@ def _parse_thead_tr(self, table):
         raise com.AbstractMethodError(self)
 
     def _parse_tbody_tr(self, table):
-        """Return the list of tbody row elements from the parsed table element.
+        """
+        Return the list of tbody row elements from the parsed table element.
 
         HTML5 table bodies consist of either 0 or more <tbody> elements (which
         only contain <tr> elements) or 0 or more <tr> elements. This method
@@ -277,7 +302,8 @@ def _parse_tbody_tr(self, table):
         raise com.AbstractMethodError(self)
 
     def _parse_tfoot_tr(self, table):
-        """Return the list of tfoot row elements from the parsed table element.
+        """
+        Return the list of tfoot row elements from the parsed table element.
 
         Parameters
         ----------
@@ -290,7 +316,8 @@ def _parse_tfoot_tr(self, table):
         raise com.AbstractMethodError(self)
 
     def _parse_tables(self, doc, match, attrs):
-        """Return all tables from the parsed DOM.
+        """
+        Return all tables from the parsed DOM.
 
         Parameters
         ----------
@@ -314,7 +341,8 @@ def _parse_tables(self, doc, match, attrs):
         raise com.AbstractMethodError(self)
 
     def _equals_tag(self, obj, tag):
-        """Return whether an individual DOM node matches a tag
+        """
+        Return whether an individual DOM node matches a tag
 
         Parameters
         ----------
@@ -332,7 +360,8 @@ def _equals_tag(self, obj, tag):
         raise com.AbstractMethodError(self)
 
     def _build_doc(self):
-        """Return a tree-like object that can be used to iterate over the DOM.
+        """
+        Return a tree-like object that can be used to iterate over the DOM.
 
         Returns
         -------
@@ -341,48 +370,49 @@ def _build_doc(self):
         raise com.AbstractMethodError(self)
 
     def _parse_thead_tbody_tfoot(self, table_html):
-        """Given a table, return parsed header, body, and foot.
-           Header and body are lists-of-lists. Top level list is a list of
-           rows. Each row is a list of parsed elements.
-
-           Logic: Use <thead>, <tbody>, <tfoot> elements to identify
-                  header, body, and footer, otherwise:
-                  - Put all rows into body
-                  - Move rows from top of body to header only if
-                    all elements inside row are <th>
-                  - Move rows from bottom of body to footer only if
-                    all elements inside row are <th>
+        """
+        Given a table, return parsed header, body, and foot.
+
+        Header and body are lists-of-lists. Top level list is a list of
+        rows. Each row is a list of str text.
+
+        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
+               header, body, and footer, otherwise:
+               - Put all rows into body
+               - Move rows from top of body to header only if
+                 all elements inside row are <th>
+               - Move rows from bottom of body to footer only if
+                 all elements inside row are <th>
 
         Parameters
         ----------
-        table_html : a single HTML table element.
+        table_html : node-like
 
         Returns
         -------
         tuple of (header, body, footer)
-        header : list of rows, each of which is a list of parsed
-                 header elements
-        body : list of rows, each of which is a list of parsed body elements
-        footer : list of rows, each of which is a list of parsed
-                 footer elements
         """
 
         header_rows = self._parse_thead_tr(table_html)
         body_rows = self._parse_tbody_tr(table_html)
         footer_rows = self._parse_tfoot_tr(table_html)
 
+        def row_is_all_th(row):
+            return all(self._equals_tag(t, 'th') for t in
+                       self._parse_td(row))
+
         if not header_rows:
-            # The table has no <thead>. Treat first all-<th> rows as headers.
-            while body_rows and all(self._equals_tag(t, 'th') for t in
-                                    self._parse_td(body_rows[0])):
-                # this row should be a header row, move it from body to header
+            # The table has no <thead>. Move the top all-<th> rows from the
+            # <tbody> to the <thead>. (This is a common case because many
+            # tables in the wild have no <thead> or <tfoot>
+            while body_rows and row_is_all_th(body_rows[0]):
                 header_rows.append(body_rows.pop(0))
 
         if not footer_rows:
             # The table has no <tfoot>. Treat last all-<th> rows as footers.
-            while body_rows and all(self._equals_tag(t, 'th') for t in
-                                    self._parse_td(body_rows[-1])):
-                # this row should be a footer row, move it from body to footer
+            while body_rows and row_is_all_th(body_rows[-1]):
+                # .insert(), not .append(): we're moving "bottom of <tbody>" to
+                # "top of <tfoot>"
                 footer_rows.insert(0, body_rows.pop())
 
         header = self._expand_colspan_rowspan(header_rows)
@@ -392,8 +422,9 @@ def _parse_thead_tbody_tfoot(self, table_html):
         return header, body, footer
 
     def _expand_colspan_rowspan(self, rows):
-        """Given a list of <tr>s, return a list of text rows that copy cell
-           text across rowspans/colspans.
+        """
+        Given a list of <tr>s, return a list of text rows that copy cell
+        text across rowspans/colspans.
 
         Parameters
         ----------
@@ -404,50 +435,69 @@ def _expand_colspan_rowspan(self, rows):
         res : list of rows, each of which is a list of str in that row
         """
 
-        res = []
-        saved_span = []
-        for row in rows:
-            extracted_row = self._parse_td(row)
-            cols_text = [_remove_whitespace(
-                self._text_getter(col)) for col in extracted_row]
-            col_colspans = [int(col.get('colspan', 1))
-                            for col in extracted_row]
-            col_rowspans = [int(col.get('rowspan', 1))
-                            for col in extracted_row]
-            # expand cols using col_colspans
-            # maybe this can be done with a list comprehension, dunno
-            cols = list(zip(
-                list(com.flatten(
-                    lmap(lambda text_nc: [text_nc[0]] * text_nc[1],
-                         list(zip(cols_text, col_colspans))))),
-                list(com.flatten(
-                    lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0],
-                         list(zip(col_colspans, col_rowspans))))))
-            )
-            # cols is now a list of (text, number of rows)
-            # now insert any previous rowspans
-            for (col, (text, nr)) in saved_span:
-                cols.insert(col, (text, nr))
-
-            # save next saved_span
-            def advance_item_to_next_row(item):
-                (col, (text, nr)) = item
-                if nr == 1:
-                    return None
-                else:
-                    return (col, (text, nr - 1))
-            saved_span = lfilter(lambda i: i is not None,
-                                 lmap(advance_item_to_next_row,
-                                      list(enumerate(cols))))
-            cols = [text for (text, nr) in cols]
-            # generate cols with text only
-            if any([col != '' for col in cols]):
-                res.append(cols)
-
-        return res
+        all_texts = []  # list of rows, each a list of str
+        remainder = []  # list of (index, text, nrows)
+
+        for tr in rows:
+            texts = []  # the output for this row
+            next_remainder = []
+
+            index = 0
+            tds = self._parse_td(tr)
+            for td in tds:
+                # Append texts from previous rows with rowspan>1 that come
+                # before this <td>
+                while remainder and remainder[0][0] <= index:
+                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
+                    texts.append(prev_text)
+                    if prev_rowspan > 1:
+                        next_remainder.append((prev_i, prev_text,
+                                               prev_rowspan - 1))
+                    index += 1
+
+                # Append the text from this <td>, colspan times
+                text = _remove_whitespace(self._text_getter(td))
+                rowspan = int(self._attr_getter(td, 'rowspan') or 1)
+                colspan = int(self._attr_getter(td, 'colspan') or 1)
+
+                for _ in range(colspan):
+                    texts.append(text)
+                    if rowspan > 1:
+                        next_remainder.append((index, text, rowspan - 1))
+                    index += 1
+
+            # Append texts from previous rows at the final position
+            for prev_i, prev_text, prev_rowspan in remainder:
+                texts.append(prev_text)
+                if prev_rowspan > 1:
+                    next_remainder.append((prev_i, prev_text,
+                                           prev_rowspan - 1))
+
+            all_texts.append(texts)
+            remainder = next_remainder
+
+        # Append rows that only appear because the previous row had non-1
+        # rowspan
+        while remainder:
+            next_remainder = []
+            texts = []
+            for prev_i, prev_text, prev_rowspan in remainder:
+                texts.append(prev_text)
+                if prev_rowspan > 1:
+                    next_remainder.append((prev_i, prev_text,
+                                           prev_rowspan - 1))
+            all_texts.append(texts)
+            remainder = next_remainder
+
+        # ignore all-empty-text rows
+        no_empty = [row for row in all_texts
+                    if any(text for text in row)]
+
+        return no_empty
 
     def _handle_hidden_tables(self, tbl_list, attr_name):
-        """Returns list of tables, potentially removing hidden elements
+        """
+        Return list of tables, potentially removing hidden elements
 
         Parameters
         ----------
@@ -515,6 +565,9 @@ def _parse_tables(self, doc, match, attrs):
                              .format(patt=match.pattern))
         return result
 
+    def _attr_getter(self, obj, attr):
+        return obj.get(attr)
+
     def _text_getter(self, obj):
         return obj.text
 
@@ -596,11 +649,14 @@ class _LxmlFrameParser(_HtmlFrameParser):
     def __init__(self, *args, **kwargs):
         super(_LxmlFrameParser, self).__init__(*args, **kwargs)
 
+    def _attr_getter(self, obj, attr):
+        return obj.get(attr)
+
     def _text_getter(self, obj):
         return obj.text_content()
 
     def _parse_td(self, row):
-        # Look for direct descendents only: the "row" element here may be a
+        # Look for direct children only: the "row" element here may be a
         # <thead> or <tfoot> (see _parse_thead_tr).
         return row.xpath('./td|./th')
 
@@ -694,12 +750,14 @@ def _parse_thead_tr(self, table):
         for thead in table.xpath('.//thead'):
             rows.extend(thead.xpath('./tr'))
 
-            # lxml does not clean up the clearly-erroneous
-            # <thead><th>foo</th><th>bar</th></thead>.
+            # HACK: lxml does not clean up the clearly-erroneous
+            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
+            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
+            # children as though it's a <tr>.
+            #
+            # Better solution would be to use html5lib.
             elements_at_root = thead.xpath('./td|./th')
             if elements_at_root:
-                # Pass the entire <thead> as a row. _parse_td() will interpret
-                # it correctly.
                 rows.append(thead)
 
         return rows
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index b8f520ee17d72..ace96c526c86f 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -362,7 +362,7 @@ def test_thousands_macau_stats(self, datapath):
                              attrs={'class': 'style1'})
         df = dfs[all_non_nan_table_index]
 
-        assert not any(s.isnull().any() for _, s in df.iteritems())
+        assert not any(s.isna().any() for _, s in df.iteritems())
 
     @pytest.mark.slow
     def test_thousands_macau_index_col(self, datapath):
@@ -371,7 +371,7 @@ def test_thousands_macau_index_col(self, datapath):
         dfs = self.read_html(macau_data, index_col=0, header=0)
         df = dfs[all_non_nan_table_index]
 
-        assert not any(s.isnull().any() for _, s in df.iteritems())
+        assert not any(s.isna().any() for _, s in df.iteritems())
 
     def test_empty_tables(self):
         """
@@ -685,29 +685,21 @@ def test_different_number_of_cols(self):
     def test_colspan_rowspan_are_1(self):
         # GH17054
         expected = """<table>
-                        <thead>
-                            <tr>
-                            <th>X</th>
-                            <th>Y</th>
-                            <th>Z</th>
-                            <th>W</th>
-                            </tr>
-                        </thead>
-                        <tbody>
-                        </tbody>
-                    </table>"""
+                        <tr>
+                          <th>X</th>
+                          <th>Y</th>
+                          <th>Z</th>
+                          <th>W</th>
+                        </tr>
+                      </table>"""
         out = """<table>
-                   <thead>
-                       <tr>
-                       <th colspan="1">X</th>
-                       <th>Y</th>
-                       <th rowspan="1">Z</th>
-                       <th>W</th>
-                       </tr>
-                   </thead>
-                   <tbody>
-                   </tbody>
-               </table>"""
+                   <tr>
+                     <th colspan="1">X</th>
+                     <th>Y</th>
+                     <th rowspan="1">Z</th>
+                     <th>W</th>
+                   </tr>
+                 </table>"""
         expected = self.read_html(expected)[0]
         res = self.read_html(out)[0]
         tm.assert_frame_equal(expected, res)
@@ -715,42 +707,34 @@ def test_colspan_rowspan_are_1(self):
     def test_colspan_rowspan_are_more_than_1(self):
         # GH17054
         expected = """<table>
-                        <thead>
-                            <tr>
-                            <th>X</th>
-                            <th>X</th>
-                            <th>Y</th>
-                            <th>Z</th>
-                            <th>W</th>
-                            </tr>
-                            <tr>
-                            <th>1</th>
-                            <th>2</th>
-                            <th>2</th>
-                            <th>Z</th>
-                            <th>3</th>
-                            </tr>
-                        </thead>
-                        <tbody>
-                        </tbody>
-                    </table>"""
+                        <tr>
+                          <td>X</td>
+                          <td>X</td>
+                          <td>Y</td>
+                          <td>Z</td>
+                          <td>W</td>
+                        </tr>
+                        <tr>
+                          <td>1</td>
+                          <td>2</td>
+                          <td>2</td>
+                          <td>Z</td>
+                          <td>3</td>
+                        </tr>
+                      </table>"""
         out = """<table>
-                   <thead>
-                       <tr>
-                       <th colspan="2">X</th>
-                       <th>Y</th>
-                       <th rowspan="2">Z</th>
-                       <th>W</th>
-                       </tr>
-                       <tr>
-                       <th>1</th>
-                       <th colspan="2">2</th>
-                       <th>3</th>
-                       </tr>
-                   </thead>
-                   <tbody>
-                   </tbody>
-               </table>"""
+                   <tr>
+                     <td colspan="2">X</td>
+                     <td>Y</td>
+                     <td rowspan="2">Z</td>
+                     <td>W</td>
+                   </tr>
+                   <tr>
+                     <td>1</td>
+                     <td colspan="2">2</td>
+                     <td>3</td>
+                   </tr>
+                 </table>"""
         expected = self.read_html(expected)[0]
         res = self.read_html(out)[0]
         tm.assert_frame_equal(expected, res)
@@ -758,40 +742,120 @@ def test_colspan_rowspan_are_more_than_1(self):
     def test_tbody_colspan_rowspan_copy_values(self):
         # GH17054
         expected = """<table>
-                        <tbody>
-                            <tr>
-                            <td>1</td>
-                            <td>1</td>
-                            <td>2</td>
-                            <td>3</td>
-                            <td>4</td>
-                            </tr>
-                            <tr>
-                            <td>5</td>
-                            <td>6</td>
-                            <td>6</td>
-                            <td>3</td>
-                            <td>7</td>
-                            </tr>
-                        </tbody>
-                    </table>"""
+                        <tr>
+                          <td>1</td>
+                          <td>1</td>
+                          <td>2</td>
+                          <td>3</td>
+                          <td>4</td>
+                        </tr>
+                        <tr>
+                          <td>5</td>
+                          <td>6</td>
+                          <td>6</td>
+                          <td>3</td>
+                          <td>7</td>
+                        </tr>
+                      </table>"""
         out = """<table>
-                   <tbody>
-                       <tr>
-                       <td colspan="2">1</td>
-                       <td>2</td>
-                       <td rowspan="2">3</td>
-                       <td>4</td>
-                       </tr>
-                       <tr>
-                       <td>5</td>
-                       <td colspan="2">6</td>
-                       <td>7</td>
-                       </tr>
-                   </tbody>
-               </table>"""
+                   <tr>
+                     <td colspan="2">1</td>
+                     <td>2</td>
+                     <td rowspan="2">3</td>
+                     <td>4</td>
+                   </tr>
+                   <tr>
+                     <td>5</td>
+                     <td colspan="2">6</td>
+                     <td>7</td>
+                   </tr>
+                 </table>"""
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_colspan_rowspan_both_not_1(self):
+        # GH17054
+        expected = """<table>
+                        <tr>
+                          <td>a</td>
+                          <td>b</td>
+                          <td>b</td>
+                          <td>b</td>
+                          <td>c</td>
+                        </tr>
+                        <tr>
+                          <td>a</td>
+                          <td>b</td>
+                          <td>b</td>
+                          <td>b</td>
+                          <td>d</td>
+                        </tr>
+                      </table>"""
+        out = """<table>
+                   <tr>
+                     <td rowspan="2">a</td>
+                     <td rowspan="2" colspan="3">b</td>
+                     <td>c</td>
+                   </tr>
+                   <tr>
+                     <td>d</td>
+                   </tr>
+                 </table>"""
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_rowspan_at_end_of_row(self):
+        # GH17054
+        expected = """<table>
+                        <tr>
+                          <td>a</td>
+                          <td>b</td>
+                        </tr>
+                        <tr>
+                          <td>c</td>
+                          <td>b</td>
+                        </tr>
+                      </table>"""
+        out = """<table>
+                   <tr>
+                     <td>a</td>
+                     <td rowspan="2">b</td>
+                   </tr>
+                   <tr>
+                     <td>c</td>
+                   </tr>
+                 </table>"""
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_rowspan_only_rows(self):
+        # GH17054
+        expected = """<table>
+                        <tr>
+                          <td>a</td>
+                          <td>b</td>
+                        </tr>
+                        <tr>
+                          <td>a</td>
+                          <td>b</td>
+                        </tr>
+                        <tr>
+                          <td>a</td>
+                          <td>b</td>
+                        </tr>
+                      </table>"""
+        out = """<table>
+                   <tr>
+                     <td rowspan="3">a</td>
+                     <td rowspan="3">b</td>
+                   </tr>
+                 </table>"""
         expected = self.read_html(expected)[0]
         res = self.read_html(out)[0]
+        print(res)
         tm.assert_frame_equal(expected, res)
 
     def test_header_should_be_inferred_from_th_elements(self):

From 34f87cb9338385f91d3df204b666e7dd5b8cfb65 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Fri, 15 Jun 2018 11:38:32 -0400
Subject: [PATCH 03/12] Docstring tweaks

---
 pandas/io/html.py | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index d97a79cd51f08..e02c39c7ff04c 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -214,7 +214,7 @@ def parse_tables(self):
 
         Returns
         -------
-        tables : list of parsed (header, body, footer) tuples from tables
+        list of parsed (header, body, footer) tuples from tables.
         """
         tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
         return (self._parse_thead_tbody_tfoot(table) for table in tables)
@@ -233,7 +233,7 @@ def _attr_getter(self, obj, attr):
 
         Returns
         -------
-        text : str or unicode
+        str or unicode
             The attribute value.
         """
         raise com.AbstractMethodError(self)
@@ -264,7 +264,7 @@ def _parse_td(self, obj):
 
         Returns
         -------
-        columns : list of node-like
+        list of node-like
             These are the elements of each row, i.e., the columns.
         """
         raise com.AbstractMethodError(self)
@@ -279,7 +279,8 @@ def _parse_thead_tr(self, table):
 
         Returns
         -------
-        rows : list of <tr> row elements of a table
+        list of node-like
+            These are the <tr> row elements of a table.
         """
         raise com.AbstractMethodError(self)
 
@@ -297,7 +298,8 @@ def _parse_tbody_tr(self, table):
 
         Returns
         -------
-        rows : list of <tr> row elements of a table
+        list of node-like
+            These are the <tr> row elements of a table.
         """
         raise com.AbstractMethodError(self)
 
@@ -311,7 +313,8 @@ def _parse_tfoot_tr(self, table):
 
         Returns
         -------
-        rows : list of <tr> row elements of a table
+        list of node-like
+            These are the <tr> row elements of a table.
         """
         raise com.AbstractMethodError(self)
 
@@ -336,7 +339,8 @@ def _parse_tables(self, doc, match, attrs):
 
         Returns
         -------
-        tables : list of HTML <table> elements to be parsed into raw data.
+        list of node-like
+            HTML <table> elements to be parsed into raw data.
         """
         raise com.AbstractMethodError(self)
 
@@ -354,8 +358,8 @@ def _equals_tag(self, obj, tag):
 
         Returns
         -------
-        is_tag_equal : boolean
-            boolean indicating if the object is equal to tag 'tag'
+        boolean
+            Whether the object is equal to tag 'tag'
         """
         raise com.AbstractMethodError(self)
 
@@ -365,7 +369,8 @@ def _build_doc(self):
 
         Returns
         -------
-        obj : the DOM from which to parse the table element.
+        node-like
+            The DOM from which to parse the table element.
         """
         raise com.AbstractMethodError(self)
 
@@ -390,7 +395,7 @@ def _parse_thead_tbody_tfoot(self, table_html):
 
         Returns
         -------
-        tuple of (header, body, footer)
+        tuple of (header, body, footer), each a list of list-of-text rows.
         """
 
         header_rows = self._parse_thead_tr(table_html)
@@ -432,7 +437,8 @@ def _expand_colspan_rowspan(self, rows):
 
         Returns
         -------
-        res : list of rows, each of which is a list of str in that row
+        list of list
+            Each returned row is a list of str text.
         """
 
         all_texts = []  # list of rows, each a list of str
@@ -501,14 +507,14 @@ def _handle_hidden_tables(self, tbl_list, attr_name):
 
         Parameters
         ----------
-        tbl_list : list of Tag or list of Element
+        tbl_list : list of node-like
             Type of list elements will vary depending upon parser used
         attr_name : str
             Name of the accessor for retrieving HTML attributes
 
         Returns
         -------
-        list of Tag or list of Element
+        list of node-like
             Return type matches `tbl_list`
         """
         if not self.displayed_only:

From 582c86b8667a851db706a702d51b6904044ff688 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Fri, 15 Jun 2018 12:46:25 -0400
Subject: [PATCH 04/12] Details -- thanks, @jreback

---
 doc/source/whatsnew/v0.24.0.txt | 2 +-
 pandas/io/html.py               | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 0f0ad3452e934..499a48faec325 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -24,7 +24,7 @@ Other Enhancements
   <https://pandas-gbq.readthedocs.io/en/latest/changelog.html#changelog-0-5-0>`__.
   (:issue:`21627`)
 - New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`)
-- :func:`read_html` handles colspan and rowspan arguments and attempts to infer a header if the header is not explicitly specified (:issue:`17054`)
+- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
 -
 
 .. _whatsnew_0240.api_breaking:
diff --git a/pandas/io/html.py b/pandas/io/html.py
index e02c39c7ff04c..15f37007e22c8 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -378,6 +378,8 @@ def _parse_thead_tbody_tfoot(self, table_html):
         """
         Given a table, return parsed header, body, and foot.
 
+        Notes
+        -----
         Header and body are lists-of-lists. Top level list is a list of
         rows. Each row is a list of str text.
 
@@ -700,9 +702,6 @@ def _parse_tables(self, doc, match, kwargs):
     def _equals_tag(self, obj, tag):
         return obj.tag == tag
 
-    def _contains_tag(self, obj, tag):
-        return obj.find(tag) is not None
-
     def _build_doc(self):
         """
         Raises

From d2f0b83fccddd78a8214053b12c826f045167a8d Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Fri, 15 Jun 2018 15:57:02 -0400
Subject: [PATCH 05/12] Tweak comments

---
 pandas/io/html.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index 15f37007e22c8..fcb01ebdd06d1 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -354,12 +354,12 @@ def _equals_tag(self, obj, tag):
             A DOM node.
 
         tag : str
-            Tag name to be checked for equality
+            Tag name to be checked for equality.
 
         Returns
         -------
         boolean
-            Whether the object is equal to tag 'tag'
+            Whether `obj`'s tag name is `tag`
         """
         raise com.AbstractMethodError(self)
 
@@ -430,12 +430,17 @@ def row_is_all_th(row):
 
     def _expand_colspan_rowspan(self, rows):
         """
-        Given a list of <tr>s, return a list of text rows that copy cell
-        text across rowspans/colspans.
+        Given a list of <tr>s, return a list of text rows.
+
+        Notes
+        -----
+        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
+        to subsequent cells.
 
         Parameters
         ----------
-        rows : list of <tr>s
+        rows : list of node-like
+            List of <tr>s
 
         Returns
         -------

From 74c23842db970f98d3139c3930e98b301aea2229 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Fri, 15 Jun 2018 18:52:04 -0400
Subject: [PATCH 06/12] Address latest review comments from @WillAyd

Mostly involved reformatting test_html.py
---
 pandas/io/html.py            |  20 +-
 pandas/tests/io/test_html.py | 594 +++++++++++++++++++----------------
 2 files changed, 322 insertions(+), 292 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index fcb01ebdd06d1..466aa7444f638 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -236,7 +236,8 @@ def _attr_getter(self, obj, attr):
         str or unicode
             The attribute value.
         """
-        raise com.AbstractMethodError(self)
+        # Both lxml and BeautifulSoup have the same implementation:
+        return obj.get(attr)
 
     def _text_getter(self, obj):
         """
@@ -409,19 +410,12 @@ def row_is_all_th(row):
                        self._parse_td(row))
 
         if not header_rows:
-            # The table has no <thead>. Move the top all-<th> rows from the
-            # <tbody> to the <thead>. (This is a common case because many
+            # The table has no <thead>. Move the top all-<th> rows from
+            # body_rows to header_rows. (This is a common case because many
             # tables in the wild have no <thead> or <tfoot>
             while body_rows and row_is_all_th(body_rows[0]):
                 header_rows.append(body_rows.pop(0))
 
-        if not footer_rows:
-            # The table has no <tfoot>. Treat last all-<th> rows as footers.
-            while body_rows and row_is_all_th(body_rows[-1]):
-                # .insert(), not .append(): we're moving "bottom of <tbody>" to
-                # "top of <tfoot>"
-                footer_rows.insert(0, body_rows.pop())
-
         header = self._expand_colspan_rowspan(header_rows)
         body = self._expand_colspan_rowspan(body_rows)
         footer = self._expand_colspan_rowspan(footer_rows)
@@ -578,9 +572,6 @@ def _parse_tables(self, doc, match, attrs):
                              .format(patt=match.pattern))
         return result
 
-    def _attr_getter(self, obj, attr):
-        return obj.get(attr)
-
     def _text_getter(self, obj):
         return obj.text
 
@@ -662,9 +653,6 @@ class _LxmlFrameParser(_HtmlFrameParser):
     def __init__(self, *args, **kwargs):
         super(_LxmlFrameParser, self).__init__(*args, **kwargs)
 
-    def _attr_getter(self, obj, attr):
-        return obj.get(attr)
-
     def _text_getter(self, obj):
         return obj.text_content()
 
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index ace96c526c86f..b2a72f015f369 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -391,18 +391,21 @@ def test_empty_tables(self):
                 </tr>
             </tbody>
         </table>'''
+
         data2 = data1 + '''<table>
             <tbody>
             </tbody>
         </table>'''
-        res1 = self.read_html(StringIO(data1))
-        res2 = self.read_html(StringIO(data2))
-        assert_framelist_equal(res1, res2)
+
+        expected = self.read_html(data1)
+        result = self.read_html(data2)
+
+        assert_framelist_equal(result, expected)
 
     def test_multiple_tbody(self):
         # GH-20690
         # Read all tbody tags within a single table.
-        data = '''<table>
+        result = self.read_html('''<table>
             <thead>
                 <tr>
                     <th>A</th>
@@ -421,9 +424,10 @@ def test_multiple_tbody(self):
                     <td>4</td>
                 </tr>
             </tbody>
-        </table>'''
+        </table>''')[0]
+
         expected = DataFrame({'A': [1, 3], 'B': [2, 4]})
-        result = self.read_html(StringIO(data))[0]
+
         tm.assert_frame_equal(result, expected)
 
     def test_header_and_one_column(self):
@@ -431,9 +435,7 @@ def test_header_and_one_column(self):
         Don't fail with bs4 when there is a header and only one column
         as described in issue #9178
         """
-        data = StringIO('''<html>
-            <body>
-             <table>
+        result = self.read_html('''<table>
                 <thead>
                     <tr>
                         <th>Header</th>
@@ -444,18 +446,17 @@ def test_header_and_one_column(self):
                         <td>first</td>
                     </tr>
                 </tbody>
-            </table>
-            </body>
-        </html>''')
+            </table>''')[0]
+
         expected = DataFrame(data={'Header': 'first'}, index=[0])
-        result = self.read_html(data)[0]
+
         tm.assert_frame_equal(result, expected)
 
     def test_thead_without_tr(self):
         """
         Ensure parser adds <tr> within <thead> on malformed HTML.
         """
-        data1 = StringIO('''<table>
+        expected = self.read_html('''<table>
             <thead>
                 <tr>
                     <th>Country</th>
@@ -470,8 +471,9 @@ def test_thead_without_tr(self):
                     <td>1944</td>
                 </tr>
             </tbody>
-        </table>''')
-        data2 = StringIO('''<table>
+        </table>''')[0]
+
+        result = self.read_html('''<table>
             <thead>
                 <th>Country</th>
                 <th>Municipality</th>
@@ -484,10 +486,9 @@ def test_thead_without_tr(self):
                     <td>1944</td>
                 </tr>
             </tbody>
-        </table>''')
-        res1 = self.read_html(data1)
-        res2 = self.read_html(data2, header=0)
-        assert_framelist_equal(res1, res2)
+        </table>''')[0]
+
+        tm.assert_frame_equal(result, expected)
 
     def test_tfoot_read(self):
         """
@@ -512,18 +513,21 @@ def test_tfoot_read(self):
             </tfoot>
         </table>'''
 
+        expected1 = DataFrame({'A': ['bodyA'], 'B': ['bodyB']})
+        expected2 = DataFrame({'A': ['bodyA', 'footA'],
+                               'B': ['bodyB', 'footB']})
+
         data1 = data_template.format(footer="")
         data2 = data_template.format(
             footer="<tr><td>footA</td><th>footB</th></tr>")
 
-        d1 = {'A': ['bodyA'], 'B': ['bodyB']}
-        d2 = {'A': ['bodyA', 'footA'], 'B': ['bodyB', 'footB']}
+        result1 = self.read_html(data1)[0]
+        result2 = self.read_html(data2)[0]
 
-        tm.assert_frame_equal(self.read_html(data1)[0], DataFrame(d1))
-        tm.assert_frame_equal(self.read_html(data2)[0], DataFrame(d2))
+        tm.assert_frame_equal(result1, expected1)
+        tm.assert_frame_equal(result2, expected2)
 
     def test_countries_municipalities(self):
-        # GH5048
         data1 = StringIO('''<table>
             <thead>
                 <tr>
@@ -540,6 +544,7 @@ def test_countries_municipalities(self):
                 </tr>
             </tbody>
         </table>''')
+
         data2 = StringIO('''
         <table>
             <tbody>
@@ -555,20 +560,22 @@ def test_countries_municipalities(self):
                 </tr>
             </tbody>
         </table>''')
-        res1 = self.read_html(data1)
-        res2 = self.read_html(data2, header=0)
-        assert_framelist_equal(res1, res2)
+
+        expected = self.read_html(data1)[0]
+        result = self.read_html(data2, header=0)[0]  # GH5048
+
+        tm.assert_frame_equal(result, expected)
 
     def test_nyse_wsj_commas_table(self, datapath):
         data = datapath('io', 'data', 'nyse_wsj.html')
         df = self.read_html(data, index_col=0, header=0,
                             attrs={'class': 'mdcTable'})[0]
 
-        columns = Index(['Issue(Roll over for charts and headlines)',
-                         'Volume', 'Price', 'Chg', '% Chg'])
+        expected = Index(['Issue(Roll over for charts and headlines)',
+                          'Volume', 'Price', 'Chg', '% Chg'])
         nrows = 100
         assert df.shape[0] == nrows
-        tm.assert_index_equal(df.columns, columns)
+        tm.assert_index_equal(df.columns, expected)
 
     @pytest.mark.slow
     def test_banklist_header(self, datapath):
@@ -621,7 +628,7 @@ def test_gold_canyon(self):
         assert gc in df.to_string()
 
     def test_different_number_of_cols(self):
-        expected = """<table border="1" class="dataframe">
+        expected = self.read_html("""<table>
                         <thead>
                             <tr style="text-align: right;">
                             <th></th>
@@ -650,8 +657,9 @@ def test_different_number_of_cols(self):
                             <td> 0.222</td>
                             </tr>
                         </tbody>
-                    </table>"""
-        out = """<table border="1" class="dataframe">
+                    </table>""", index_col=0)[0]
+
+        result = self.read_html("""<table>
                     <thead>
                         <tr style="text-align: right;">
                         <th></th>
@@ -677,228 +685,254 @@ def test_different_number_of_cols(self):
                         <td> 0.222</td>
                         </tr>
                     </tbody>
-                 </table>"""
-        expected = self.read_html(expected, index_col=0)[0]
-        res = self.read_html(out, index_col=0)[0]
-        tm.assert_frame_equal(expected, res)
+                 </table>""", index_col=0)[0]
+
+        tm.assert_frame_equal(result, expected)
 
     def test_colspan_rowspan_are_1(self):
         # GH17054
-        expected = """<table>
-                        <tr>
-                          <th>X</th>
-                          <th>Y</th>
-                          <th>Z</th>
-                          <th>W</th>
-                        </tr>
-                      </table>"""
-        out = """<table>
-                   <tr>
-                     <th colspan="1">X</th>
-                     <th>Y</th>
-                     <th rowspan="1">Z</th>
-                     <th>W</th>
-                   </tr>
-                 </table>"""
-        expected = self.read_html(expected)[0]
-        res = self.read_html(out)[0]
-        tm.assert_frame_equal(expected, res)
+        expected = self.read_html(
+            """<table>
+                 <tr>
+                   <th>X</th>
+                   <th>Y</th>
+                   <th>Z</th>
+                   <th>W</th>
+                 </tr>
+               </table>""")[0]
+
+        result = self.read_html(
+            """<table>
+                 <tr>
+                   <th colspan="1">X</th>
+                   <th>Y</th>
+                   <th rowspan="1">Z</th>
+                   <th>W</th>
+                 </tr>
+               </table>""")[0]
+
+        tm.assert_frame_equal(result, expected)
 
     def test_colspan_rowspan_are_more_than_1(self):
         # GH17054
-        expected = """<table>
-                        <tr>
-                          <td>X</td>
-                          <td>X</td>
-                          <td>Y</td>
-                          <td>Z</td>
-                          <td>W</td>
-                        </tr>
-                        <tr>
-                          <td>1</td>
-                          <td>2</td>
-                          <td>2</td>
-                          <td>Z</td>
-                          <td>3</td>
-                        </tr>
-                      </table>"""
-        out = """<table>
-                   <tr>
-                     <td colspan="2">X</td>
-                     <td>Y</td>
-                     <td rowspan="2">Z</td>
-                     <td>W</td>
-                   </tr>
-                   <tr>
-                     <td>1</td>
-                     <td colspan="2">2</td>
-                     <td>3</td>
-                   </tr>
-                 </table>"""
-        expected = self.read_html(expected)[0]
-        res = self.read_html(out)[0]
-        tm.assert_frame_equal(expected, res)
+        expected = self.read_html(
+            """<table>
+                 <tr>
+                   <td>X</td>
+                   <td>X</td>
+                   <td>Y</td>
+                   <td>Z</td>
+                   <td>W</td>
+                 </tr>
+                 <tr>
+                   <td>1</td>
+                   <td>2</td>
+                   <td>2</td>
+                   <td>Z</td>
+                   <td>3</td>
+                 </tr>
+               </table>""")[0]
+
+        result = self.read_html(
+            """<table>
+                 <tr>
+                   <td colspan="2">X</td>
+                   <td>Y</td>
+                   <td rowspan="2">Z</td>
+                   <td>W</td>
+                 </tr>
+                 <tr>
+                   <td>1</td>
+                   <td colspan="2">2</td>
+                   <td>3</td>
+                 </tr>
+               </table>""")[0]
+
+        tm.assert_frame_equal(result, expected)
 
     def test_tbody_colspan_rowspan_copy_values(self):
         # GH17054
-        expected = """<table>
-                        <tr>
-                          <td>1</td>
-                          <td>1</td>
-                          <td>2</td>
-                          <td>3</td>
-                          <td>4</td>
-                        </tr>
-                        <tr>
-                          <td>5</td>
-                          <td>6</td>
-                          <td>6</td>
-                          <td>3</td>
-                          <td>7</td>
-                        </tr>
-                      </table>"""
-        out = """<table>
-                   <tr>
-                     <td colspan="2">1</td>
-                     <td>2</td>
-                     <td rowspan="2">3</td>
-                     <td>4</td>
-                   </tr>
-                   <tr>
-                     <td>5</td>
-                     <td colspan="2">6</td>
-                     <td>7</td>
-                   </tr>
-                 </table>"""
-        expected = self.read_html(expected)[0]
-        res = self.read_html(out)[0]
-        tm.assert_frame_equal(expected, res)
+        expected = self.read_html(
+            """<table>
+                 <tr>
+                   <td>1</td>
+                   <td>1</td>
+                   <td>2</td>
+                   <td>3</td>
+                   <td>4</td>
+                 </tr>
+                 <tr>
+                   <td>5</td>
+                   <td>6</td>
+                   <td>6</td>
+                   <td>3</td>
+                   <td>7</td>
+                 </tr>
+               </table>""")[0]
+
+        result = self.read_html(
+            """<table>
+                 <tr>
+                   <td colspan="2">1</td>
+                   <td>2</td>
+                   <td rowspan="2">3</td>
+                   <td>4</td>
+                 </tr>
+                 <tr>
+                   <td>5</td>
+                   <td colspan="2">6</td>
+                   <td>7</td>
+                 </tr>
+               </table>""")[0]
+
+        tm.assert_frame_equal(result, expected)
 
     def test_colspan_rowspan_both_not_1(self):
         # GH17054
-        expected = """<table>
-                        <tr>
-                          <td>a</td>
-                          <td>b</td>
-                          <td>b</td>
-                          <td>b</td>
-                          <td>c</td>
-                        </tr>
-                        <tr>
-                          <td>a</td>
-                          <td>b</td>
-                          <td>b</td>
-                          <td>b</td>
-                          <td>d</td>
-                        </tr>
-                      </table>"""
-        out = """<table>
-                   <tr>
-                     <td rowspan="2">a</td>
-                     <td rowspan="2" colspan="3">b</td>
-                     <td>c</td>
-                   </tr>
-                   <tr>
-                     <td>d</td>
-                   </tr>
-                 </table>"""
-        expected = self.read_html(expected)[0]
-        res = self.read_html(out)[0]
-        tm.assert_frame_equal(expected, res)
+        expected = self.read_html(
+            """<table>
+                 <tr>
+                   <td>a</td>
+                   <td>b</td>
+                   <td>b</td>
+                   <td>b</td>
+                   <td>c</td>
+                 </tr>
+                 <tr>
+                   <td>a</td>
+                   <td>b</td>
+                   <td>b</td>
+                   <td>b</td>
+                   <td>d</td>
+                 </tr>
+               </table>""")[0]
+
+        result = self.read_html(
+            """<table>
+                 <tr>
+                   <td rowspan="2">a</td>
+                   <td rowspan="2" colspan="3">b</td>
+                   <td>c</td>
+                 </tr>
+                 <tr>
+                   <td>d</td>
+                 </tr>
+               </table>""")[0]
+
+        tm.assert_frame_equal(result, expected)
 
     def test_rowspan_at_end_of_row(self):
         # GH17054
-        expected = """<table>
-                        <tr>
-                          <td>a</td>
-                          <td>b</td>
-                        </tr>
-                        <tr>
-                          <td>c</td>
-                          <td>b</td>
-                        </tr>
-                      </table>"""
-        out = """<table>
+        expected = read_html(
+            """<table>
+                 <tr>
+                   <td>a</td>
+                   <td>b</td>
+                 </tr>
+                 <tr>
+                   <td>c</td>
+                   <td>b</td>
+                 </tr>
+               </table>""")[0]
+
+        result = read_html(
+            """<table>
+                 <tr>
+                   <td>a</td>
+                   <td rowspan="2">b</td>
+                 </tr>
+                 <tr>
+                   <td>c</td>
+                 </tr>
+               </table>""")[0]
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_rowspan_only_rows(self):
+        # GH17054
+        expected = self.read_html(
+            """<table>
+                 <tr>
+                   <td>a</td>
+                   <td>b</td>
+                 </tr>
+                 <tr>
+                   <td>a</td>
+                   <td>b</td>
+                 </tr>
+                 <tr>
+                   <td>a</td>
+                   <td>b</td>
+                 </tr>
+               </table>""")[0]
+
+        result = read_html(
+            """<table>
+                 <tr>
+                   <td rowspan="3">a</td>
+                   <td rowspan="3">b</td>
+                 </tr>
+               </table>""")[0]
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_header_inferred_from_th_elements(self):
+        # GH17054
+        expected = read_html(
+            """<table>
+                 <thead>
                    <tr>
-                     <td>a</td>
-                     <td rowspan="2">b</td>
+                     <th>X</th>
+                     <th>X</th>
+                     <th>Y</th>
+                     <th>Z</th>
+                     <th>W</th>
                    </tr>
                    <tr>
-                     <td>c</td>
+                     <th>a</th>
+                     <th>b</th>
+                     <th>a</th>
+                     <th>a</th>
+                     <th>a</th>
                    </tr>
-                 </table>"""
-        expected = self.read_html(expected)[0]
-        res = self.read_html(out)[0]
-        tm.assert_frame_equal(expected, res)
-
-    def test_rowspan_only_rows(self):
-        # GH17054
-        expected = """<table>
-                        <tr>
-                          <td>a</td>
-                          <td>b</td>
-                        </tr>
-                        <tr>
-                          <td>a</td>
-                          <td>b</td>
-                        </tr>
-                        <tr>
-                          <td>a</td>
-                          <td>b</td>
-                        </tr>
-                      </table>"""
-        out = """<table>
+                 </thead>
+                 <tbody>
                    <tr>
-                     <td rowspan="3">a</td>
-                     <td rowspan="3">b</td>
+                     <td>1</td>
+                     <td>2</td>
+                     <td>3</td>
+                     <td>4</td>
+                     <td>5</td>
                    </tr>
-                 </table>"""
-        expected = self.read_html(expected)[0]
-        res = self.read_html(out)[0]
-        print(res)
-        tm.assert_frame_equal(expected, res)
+                 </tbody>
+               </table>""")[0]
+
+        result = read_html(
+            """<table>
+                 <tr>
+                   <th>X</th>
+                   <th>X</th>
+                   <th>Y</th>
+                   <th>Z</th>
+                   <th>W</th>
+                 </tr>
+                 <tr>
+                   <th>a</th>
+                   <th>b</th>
+                   <th>a</th>
+                   <th>a</th>
+                   <th>a</th>
+                 </tr>
+                 <tr>
+                   <td>1</td>
+                   <td>2</td>
+                   <td>3</td>
+                   <td>4</td>
+                   <td>5</td>
+                 </tr>
+               </table>""")[0]
 
-    def test_header_should_be_inferred_from_th_elements(self):
-        # GH17054
-        expected = """<table>
-                        <thead>
-                            <tr>
-                            <th>X</th>
-                            <th>X</th>
-                            <th>Y</th>
-                            <th>Z</th>
-                            <th>W</th>
-                            </tr>
-                        </thead>
-                        <tbody>
-                            <tr>
-                            <td>1</td>
-                            <td>2</td>
-                            <td>3</td>
-                            <td>4</td>
-                            <td>5</td>
-                        </tbody>
-                    </table>"""
-        out = """<table>
-                            <tr>
-                            <th>X</th>
-                            <th>X</th>
-                            <th>Y</th>
-                            <th>Z</th>
-                            <th>W</th>
-                            </tr>
-                            <tr>
-                            <td>1</td>
-                            <td>2</td>
-                            <td>3</td>
-                            <td>4</td>
-                            <td>5</td>
-                    </table>"""
-        expected = self.read_html(expected)[0]  # header is explicit
-        res = self.read_html(out)[0]            # infer header
-        tm.assert_frame_equal(expected, res)
-        res2 = self.read_html(out, header=0)[0]  # manually set header
-        tm.assert_frame_equal(expected, res2)
+        tm.assert_frame_equal(result, expected)
 
     def test_parse_dates_list(self):
         df = DataFrame({'date': date_range('1/1/2001', periods=10)})
@@ -936,9 +970,8 @@ def test_wikipedia_states_table(self, datapath):
         assert result['sq mi'].dtype == np.dtype('float64')
 
     def test_decimal_rows(self):
-
         # GH 12907
-        data = StringIO('''<html>
+        result = self.read_html('''<html>
             <body>
              <table>
                 <thead>
@@ -953,9 +986,10 @@ def test_decimal_rows(self):
                 </tbody>
             </table>
             </body>
-        </html>''')
+        </html>''', decimal='#')[0]
+
         expected = DataFrame(data={'Header': 1100.101}, index=[0])
-        result = self.read_html(data, decimal='#')[0]
+
         assert result['Header'].dtype == np.dtype('float64')
         tm.assert_frame_equal(result, expected)
 
@@ -963,53 +997,61 @@ def test_bool_header_arg(self):
         # GH 6114
         for arg in [True, False]:
             with pytest.raises(TypeError):
-                read_html(self.spam_data, header=arg)
+                self.read_html(self.spam_data, header=arg)
 
     def test_converters(self):
         # GH 13461
-        html_data = """<table>
-                        <thead>
-                            <th>a</th>
-                            </tr>
-                        </thead>
-                        <tbody>
-                            <tr>
-                            <td> 0.763</td>
-                            </tr>
-                            <tr>
-                            <td> 0.244</td>
-                            </tr>
-                        </tbody>
-                    </table>"""
+        result = self.read_html(
+            """<table>
+                 <thead>
+                   <tr>
+                     <th>a</th>
+                    </tr>
+                 </thead>
+                 <tbody>
+                   <tr>
+                     <td> 0.763</td>
+                   </tr>
+                   <tr>
+                     <td> 0.244</td>
+                   </tr>
+                 </tbody>
+               </table>""",
+            converters={'a': str}
+        )[0]
 
-        expected_df = DataFrame({'a': ['0.763', '0.244']})
-        html_df = read_html(html_data, converters={'a': str})[0]
-        tm.assert_frame_equal(expected_df, html_df)
+        expected = DataFrame({'a': ['0.763', '0.244']})
+
+        tm.assert_frame_equal(result, expected)
 
     def test_na_values(self):
         # GH 13461
-        html_data = """<table>
-                        <thead>
-                            <th>a</th>
-                            </tr>
-                        </thead>
-                        <tbody>
-                            <tr>
-                            <td> 0.763</td>
-                            </tr>
-                            <tr>
-                            <td> 0.244</td>
-                            </tr>
-                        </tbody>
-                    </table>"""
+        result = self.read_html(
+            """<table>
+                 <thead>
+                   <tr>
+                     <th>a</th>
+                   </tr>
+                 </thead>
+                 <tbody>
+                   <tr>
+                     <td> 0.763</td>
+                   </tr>
+                   <tr>
+                     <td> 0.244</td>
+                   </tr>
+                 </tbody>
+               </table>""",
+            na_values=[0.244])[0]
 
-        expected_df = DataFrame({'a': [0.763, np.nan]})
-        html_df = read_html(html_data, na_values=[0.244])[0]
-        tm.assert_frame_equal(expected_df, html_df)
+        expected = DataFrame({'a': [0.763, np.nan]})
+
+        tm.assert_frame_equal(result, expected)
 
     def test_keep_default_na(self):
         html_data = """<table>
                         <thead>
+                            <tr>
                             <th>a</th>
                             </tr>
                         </thead>
@@ -1024,11 +1066,11 @@ def test_keep_default_na(self):
                     </table>"""
 
         expected_df = DataFrame({'a': ['N/A', 'NA']})
-        html_df = read_html(html_data, keep_default_na=False)[0]
+        html_df = self.read_html(html_data, keep_default_na=False)[0]
         tm.assert_frame_equal(expected_df, html_df)
 
         expected_df = DataFrame({'a': [np.nan, np.nan]})
-        html_df = read_html(html_data, keep_default_na=True)[0]
+        html_df = self.read_html(html_data, keep_default_na=True)[0]
         tm.assert_frame_equal(expected_df, html_df)
 
     def test_multiple_header_rows(self):
@@ -1040,7 +1082,7 @@ def test_multiple_header_rows(self):
                                ["Name", "Unnamed: 1_level_1",
                                 "Unnamed: 2_level_1"]]
         html = expected_df.to_html(index=False)
-        html_df = read_html(html, )[0]
+        html_df = self.read_html(html, )[0]
         tm.assert_frame_equal(expected_df, html_df)
 
     def test_works_on_valid_markup(self, datapath):

From ad6e869b2459d605288ba996a105e88d65c1cfa0 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Thu, 21 Jun 2018 09:54:25 -0400
Subject: [PATCH 07/12] Clean up html tests

---
 pandas/tests/io/test_html.py | 445 +++++++++++++----------------------
 1 file changed, 166 insertions(+), 279 deletions(-)

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index b2a72f015f369..fca872e459892 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -377,30 +377,28 @@ def test_empty_tables(self):
         """
         Make sure that read_html ignores empty tables.
         """
-        data1 = '''<table>
-            <thead>
-                <tr>
-                    <th>A</th>
-                    <th>B</th>
-                </tr>
-            </thead>
-            <tbody>
-                <tr>
-                    <td>1</td>
-                    <td>2</td>
-                </tr>
-            </tbody>
-        </table>'''
-
-        data2 = data1 + '''<table>
-            <tbody>
-            </tbody>
-        </table>'''
-
-        expected = self.read_html(data1)
-        result = self.read_html(data2)
+        result = self.read_html('''
+            <table>
+                <thead>
+                    <tr>
+                        <th>A</th>
+                        <th>B</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>1</td>
+                        <td>2</td>
+                    </tr>
+                </tbody>
+            </table>
+            <table>
+                <tbody>
+                </tbody>
+            </table>
+        ''')
 
-        assert_framelist_equal(result, expected)
+        assert len(result) == 1
 
     def test_multiple_tbody(self):
         # GH-20690
@@ -456,7 +454,7 @@ def test_thead_without_tr(self):
         """
         Ensure parser adds <tr> within <thead> on malformed HTML.
         """
-        expected = self.read_html('''<table>
+        result = self.read_html('''<table>
             <thead>
                 <tr>
                     <th>Country</th>
@@ -473,20 +471,11 @@ def test_thead_without_tr(self):
             </tbody>
         </table>''')[0]
 
-        result = self.read_html('''<table>
-            <thead>
-                <th>Country</th>
-                <th>Municipality</th>
-                <th>Year</th>
-            </thead>
-            <tbody>
-                <tr>
-                    <td>Ukraine</td>
-                    <th>Odessa</th>
-                    <td>1944</td>
-                </tr>
-            </tbody>
-        </table>''')[0]
+        expected = DataFrame(data={
+            'Country': ['Ukraine'],
+            'Municipality': ['Odessa'],
+            'Year': [1944],
+        })
 
         tm.assert_frame_equal(result, expected)
 
@@ -527,42 +516,23 @@ def test_tfoot_read(self):
         tm.assert_frame_equal(result1, expected1)
         tm.assert_frame_equal(result2, expected2)
 
-    def test_countries_municipalities(self):
-        data1 = StringIO('''<table>
-            <thead>
-                <tr>
-                    <th>Country</th>
-                    <th>Municipality</th>
-                    <th>Year</th>
-                </tr>
-            </thead>
-            <tbody>
-                <tr>
-                    <td>Ukraine</td>
-                    <th>Odessa</th>
-                    <td>1944</td>
-                </tr>
-            </tbody>
-        </table>''')
-
-        data2 = StringIO('''
-        <table>
-            <tbody>
+    def test_parse_header_of_non_string_column(self):
+        # GH5048: if header is specified explicitly, an int column should be
+        # parsed as int while its header is parsed as str
+        result = self.read_html('''
+            <table>
                 <tr>
-                    <th>Country</th>
-                    <th>Municipality</th>
-                    <th>Year</th>
+                    <td>S</td>
+                    <td>I</td>
                 </tr>
                 <tr>
-                    <td>Ukraine</td>
-                    <th>Odessa</th>
+                    <td>text</td>
                     <td>1944</td>
                 </tr>
-            </tbody>
-        </table>''')
+            </table>
+        ''', header=0)[0]
 
-        expected = self.read_html(data1)[0]
-        result = self.read_html(data2, header=0)[0]  # GH5048
+        expected = DataFrame(data={'S': ['text'], 'I': [1944]})
 
         tm.assert_frame_equal(result, expected)
 
@@ -689,248 +659,165 @@ def test_different_number_of_cols(self):
 
         tm.assert_frame_equal(result, expected)
 
-    def test_colspan_rowspan_are_1(self):
+    def test_colspan_rowspan_1(self):
         # GH17054
-        expected = self.read_html(
-            """<table>
-                 <tr>
-                   <th>X</th>
-                   <th>Y</th>
-                   <th>Z</th>
-                   <th>W</th>
-                 </tr>
-               </table>""")[0]
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th colspan="1">B</th>
+                    <th rowspan="1">C</th>
+                </tr>
+                <tr>
+                    <td>a</td>
+                    <td>b</td>
+                    <td>c</td>
+                </tr>
+            </table>
+        """)[0]
 
-        result = self.read_html(
-            """<table>
-                 <tr>
-                   <th colspan="1">X</th>
-                   <th>Y</th>
-                   <th rowspan="1">Z</th>
-                   <th>W</th>
-                 </tr>
-               </table>""")[0]
+        expected = DataFrame(data={
+            'A': ['a'],
+            'B': ['b'],
+            'C': ['c'],
+        })
 
         tm.assert_frame_equal(result, expected)
 
-    def test_colspan_rowspan_are_more_than_1(self):
+    def test_colspan_rowspan_copy_values(self):
         # GH17054
-        expected = self.read_html(
-            """<table>
-                 <tr>
-                   <td>X</td>
-                   <td>X</td>
-                   <td>Y</td>
-                   <td>Z</td>
-                   <td>W</td>
-                 </tr>
-                 <tr>
-                   <td>1</td>
-                   <td>2</td>
-                   <td>2</td>
-                   <td>Z</td>
-                   <td>3</td>
-                 </tr>
-               </table>""")[0]
-
-        result = self.read_html(
-            """<table>
-                 <tr>
-                   <td colspan="2">X</td>
-                   <td>Y</td>
-                   <td rowspan="2">Z</td>
-                   <td>W</td>
-                 </tr>
-                 <tr>
-                   <td>1</td>
-                   <td colspan="2">2</td>
-                   <td>3</td>
-                 </tr>
-               </table>""")[0]
 
-        tm.assert_frame_equal(result, expected)
+        # In ASCII, with lowercase letters being copies:
+        #
+        # X x Y Z W
+        # A B b z C
 
-    def test_tbody_colspan_rowspan_copy_values(self):
-        # GH17054
-        expected = self.read_html(
-            """<table>
-                 <tr>
-                   <td>1</td>
-                   <td>1</td>
-                   <td>2</td>
-                   <td>3</td>
-                   <td>4</td>
-                 </tr>
-                 <tr>
-                   <td>5</td>
-                   <td>6</td>
-                   <td>6</td>
-                   <td>3</td>
-                   <td>7</td>
-                 </tr>
-               </table>""")[0]
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <td colspan="2">X</td>
+                    <td>Y</td>
+                    <td rowspan="2">Z</td>
+                    <td>W</td>
+                </tr>
+                <tr>
+                    <td>A</td>
+                    <td colspan="2">B</td>
+                    <td>C</td>
+                </tr>
+            </table>
+        """, header=0)[0]
 
-        result = self.read_html(
-            """<table>
-                 <tr>
-                   <td colspan="2">1</td>
-                   <td>2</td>
-                   <td rowspan="2">3</td>
-                   <td>4</td>
-                 </tr>
-                 <tr>
-                   <td>5</td>
-                   <td colspan="2">6</td>
-                   <td>7</td>
-                 </tr>
-               </table>""")[0]
+        expected = DataFrame(data={
+            'X': ['A'],
+            'X.1': ['B'],
+            'Y': ['B'],
+            'Z': ['Z'],
+            'W': ['C'],
+        })
 
         tm.assert_frame_equal(result, expected)
 
     def test_colspan_rowspan_both_not_1(self):
         # GH17054
-        expected = self.read_html(
-            """<table>
-                 <tr>
-                   <td>a</td>
-                   <td>b</td>
-                   <td>b</td>
-                   <td>b</td>
-                   <td>c</td>
-                 </tr>
-                 <tr>
-                   <td>a</td>
-                   <td>b</td>
-                   <td>b</td>
-                   <td>b</td>
-                   <td>d</td>
-                 </tr>
-               </table>""")[0]
 
-        result = self.read_html(
-            """<table>
-                 <tr>
-                   <td rowspan="2">a</td>
-                   <td rowspan="2" colspan="3">b</td>
-                   <td>c</td>
-                 </tr>
-                 <tr>
-                   <td>d</td>
-                 </tr>
-               </table>""")[0]
+        # In ASCII, with lowercase letters being copies:
+        #
+        # A B b b C
+        # a b b b D
+
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <td rowspan="2">A</td>
+                    <td rowspan="2" colspan="3">B</td>
+                    <td>C</td>
+                </tr>
+                <tr>
+                    <td>D</td>
+                </tr>
+            </table>
+        """, header=0)[0]
+
+        expected = DataFrame(data={
+            'A': ['A'],
+            'B': ['B'],
+            'B.1': ['B'],
+            'B.2': ['B'],
+            'C': ['D'],
+        })
 
         tm.assert_frame_equal(result, expected)
 
     def test_rowspan_at_end_of_row(self):
         # GH17054
-        expected = read_html(
-            """<table>
-                 <tr>
-                   <td>a</td>
-                   <td>b</td>
-                 </tr>
-                 <tr>
-                   <td>c</td>
-                   <td>b</td>
-                 </tr>
-               </table>""")[0]
-
-        result = read_html(
-            """<table>
-                 <tr>
-                   <td>a</td>
-                   <td rowspan="2">b</td>
-                 </tr>
-                 <tr>
-                   <td>c</td>
-                 </tr>
-               </table>""")[0]
+
+        # In ASCII, with lowercase letters being copies:
+        #
+        # A B
+        # C b
+
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <td>A</td>
+                    <td rowspan="2">B</td>
+                </tr>
+                <tr>
+                    <td>C</td>
+                </tr>
+            </table>
+        """, header=0)[0]
+
+        expected = DataFrame(data={
+            'A': ['C'],
+            'B': ['B']
+        })
 
         tm.assert_frame_equal(result, expected)
 
     def test_rowspan_only_rows(self):
         # GH17054
-        expected = self.read_html(
-            """<table>
-                 <tr>
-                   <td>a</td>
-                   <td>b</td>
-                 </tr>
-                 <tr>
-                   <td>a</td>
-                   <td>b</td>
-                 </tr>
-                 <tr>
-                   <td>a</td>
-                   <td>b</td>
-                 </tr>
-               </table>""")[0]
-
-        result = read_html(
-            """<table>
-                 <tr>
-                   <td rowspan="3">a</td>
-                   <td rowspan="3">b</td>
-                 </tr>
-               </table>""")[0]
+
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <td rowspan="3">A</td>
+                    <td rowspan="3">B</td>
+                </tr>
+            </table>
+        """, header=0)[0]
+
+        expected = DataFrame(data={
+            'A': ['A', 'A'],
+            'B': ['B', 'B'],
+        })
 
         tm.assert_frame_equal(result, expected)
 
     def test_header_inferred_from_th_elements(self):
         # GH17054
-        expected = read_html(
-            """<table>
-                 <thead>
-                   <tr>
-                     <th>X</th>
-                     <th>X</th>
-                     <th>Y</th>
-                     <th>Z</th>
-                     <th>W</th>
-                   </tr>
-                   <tr>
-                     <th>a</th>
-                     <th>b</th>
-                     <th>a</th>
-                     <th>a</th>
-                     <th>a</th>
-                   </tr>
-                 </thead>
-                 <tbody>
-                   <tr>
-                     <td>1</td>
-                     <td>2</td>
-                     <td>3</td>
-                     <td>4</td>
-                     <td>5</td>
-                   </tr>
-                 </tbody>
-               </table>""")[0]
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <th>a</th>
+                    <th>b</th>
+                </tr>
+                <tr>
+                    <td>1</td>
+                    <td>2</td>
+                </tr>
+            </table>
+        """)[0]
 
-        result = read_html(
-            """<table>
-                 <tr>
-                   <th>X</th>
-                   <th>X</th>
-                   <th>Y</th>
-                   <th>Z</th>
-                   <th>W</th>
-                 </tr>
-                 <tr>
-                   <th>a</th>
-                   <th>b</th>
-                   <th>a</th>
-                   <th>a</th>
-                   <th>a</th>
-                 </tr>
-                 <tr>
-                   <td>1</td>
-                   <td>2</td>
-                   <td>3</td>
-                   <td>4</td>
-                   <td>5</td>
-                 </tr>
-               </table>""")[0]
+        expected = DataFrame(data={
+            ('A', 'a'): [1],
+            ('B', 'b'): [2],
+        })
 
         tm.assert_frame_equal(result, expected)
 

From 6fa04896dd763d86f98bc1efd3adb54e646de441 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Thu, 21 Jun 2018 12:07:07 -0400
Subject: [PATCH 08/12] Do not nix rows of empty cells

... but _ignore_ empty rows when inferring columns. This changes the
behavior of test_spam_header, which previously ignored an empty row when
the user explicitly stated the row number to use as header.
---
 pandas/io/html.py            |  18 ++---
 pandas/tests/io/test_html.py | 125 +++++++++++++++++++++++------------
 2 files changed, 92 insertions(+), 51 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index 466aa7444f638..18588123fdae8 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -496,11 +496,7 @@ def _expand_colspan_rowspan(self, rows):
             all_texts.append(texts)
             remainder = next_remainder
 
-        # ignore all-empty-text rows
-        no_empty = [row for row in all_texts
-                    if any(text for text in row)]
-
-        return no_empty
+        return all_texts
 
     def _handle_hidden_tables(self, tbl_list, attr_name):
         """
@@ -785,10 +781,16 @@ def _data_to_frame(**kwargs):
     header = kwargs.pop('header')
     kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
     if head:
-        rows = lrange(len(head))
         body = head + body
-        if header is None:  # special case when a table has <th> elements
-            header = 0 if rows == [0] else rows
+
+        # Infer header when there is a <thead> or top <th>-only rows
+        if header is None:
+            if len(head) == 1:
+                header = 0
+            else:
+                # ignore all-empty-text rows
+                header = [i for i, row in enumerate(head)
+                          if any(text for text in row)]
 
     if foot:
         body += foot
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index fca872e459892..b78c4f27d8c3f 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -15,6 +15,7 @@
                     date_range, Series)
 from pandas.compat import (map, zip, StringIO, BytesIO,
                            is_platform_windows, PY3, reload)
+from pandas.errors import ParserError
 from pandas.io.common import URLError, file_path_to_url
 import pandas.io.html
 from pandas.io.html import read_html
@@ -147,7 +148,7 @@ def test_banklist_no_match(self):
             assert isinstance(df, DataFrame)
 
     def test_spam_header(self):
-        df = self.read_html(self.spam_data, '.*Water.*', header=1)[0]
+        df = self.read_html(self.spam_data, '.*Water.*', header=2)[0]
         assert df.columns[0] == 'Proximates'
         assert not df.empty
 
@@ -424,7 +425,7 @@ def test_multiple_tbody(self):
             </tbody>
         </table>''')[0]
 
-        expected = DataFrame({'A': [1, 3], 'B': [2, 4]})
+        expected = DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B'])
 
         tm.assert_frame_equal(result, expected)
 
@@ -471,11 +472,8 @@ def test_thead_without_tr(self):
             </tbody>
         </table>''')[0]
 
-        expected = DataFrame(data={
-            'Country': ['Ukraine'],
-            'Municipality': ['Odessa'],
-            'Year': [1944],
-        })
+        expected = DataFrame(data=[['Ukraine', 'Odessa', 1944]],
+                             columns=['Country', 'Municipality', 'Year'])
 
         tm.assert_frame_equal(result, expected)
 
@@ -502,9 +500,10 @@ def test_tfoot_read(self):
             </tfoot>
         </table>'''
 
-        expected1 = DataFrame({'A': ['bodyA'], 'B': ['bodyB']})
-        expected2 = DataFrame({'A': ['bodyA', 'footA'],
-                               'B': ['bodyB', 'footB']})
+        expected1 = DataFrame(data=[['bodyA', 'bodyB']], columns=['A', 'B'])
+
+        expected2 = DataFrame(data=[['bodyA', 'bodyB'], ['footA', 'footB']],
+                              columns=['A', 'B'])
 
         data1 = data_template.format(footer="")
         data2 = data_template.format(
@@ -532,7 +531,7 @@ def test_parse_header_of_non_string_column(self):
             </table>
         ''', header=0)[0]
 
-        expected = DataFrame(data={'S': ['text'], 'I': [1944]})
+        expected = DataFrame([['text', 1944]], columns=('S', 'I'))
 
         tm.assert_frame_equal(result, expected)
 
@@ -676,11 +675,7 @@ def test_colspan_rowspan_1(self):
             </table>
         """)[0]
 
-        expected = DataFrame(data={
-            'A': ['a'],
-            'B': ['b'],
-            'C': ['c'],
-        })
+        expected = DataFrame([['a', 'b', 'c']], columns=['A', 'B', 'C'])
 
         tm.assert_frame_equal(result, expected)
 
@@ -708,13 +703,8 @@ def test_colspan_rowspan_copy_values(self):
             </table>
         """, header=0)[0]
 
-        expected = DataFrame(data={
-            'X': ['A'],
-            'X.1': ['B'],
-            'Y': ['B'],
-            'Z': ['Z'],
-            'W': ['C'],
-        })
+        expected = DataFrame(data=[['A', 'B', 'B', 'Z', 'C']],
+                             columns=['X', 'X.1', 'Y', 'Z', 'W'])
 
         tm.assert_frame_equal(result, expected)
 
@@ -739,13 +729,8 @@ def test_colspan_rowspan_both_not_1(self):
             </table>
         """, header=0)[0]
 
-        expected = DataFrame(data={
-            'A': ['A'],
-            'B': ['B'],
-            'B.1': ['B'],
-            'B.2': ['B'],
-            'C': ['D'],
-        })
+        expected = DataFrame(data=[['A', 'B', 'B', 'B', 'D']],
+                             columns=['A', 'B', 'B.1', 'B.2', 'C'])
 
         tm.assert_frame_equal(result, expected)
 
@@ -769,10 +754,7 @@ def test_rowspan_at_end_of_row(self):
             </table>
         """, header=0)[0]
 
-        expected = DataFrame(data={
-            'A': ['C'],
-            'B': ['B']
-        })
+        expected = DataFrame(data=[['C', 'B']], columns=['A', 'B'])
 
         tm.assert_frame_equal(result, expected)
 
@@ -788,14 +770,12 @@ def test_rowspan_only_rows(self):
             </table>
         """, header=0)[0]
 
-        expected = DataFrame(data={
-            'A': ['A', 'A'],
-            'B': ['B', 'B'],
-        })
+        expected = DataFrame(data=[['A', 'B'], ['A', 'B']],
+                             columns=['A', 'B'])
 
         tm.assert_frame_equal(result, expected)
 
-    def test_header_inferred_from_th_elements(self):
+    def test_header_inferred_from_rows_with_only_th(self):
         # GH17054
         result = self.read_html("""
             <table>
@@ -814,10 +794,9 @@ def test_header_inferred_from_th_elements(self):
             </table>
         """)[0]
 
-        expected = DataFrame(data={
-            ('A', 'a'): [1],
-            ('B', 'b'): [2],
-        })
+        columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
+                             labels=[[0, 1], [0, 1]])
+        expected = DataFrame(data=[[1, 2]], columns=columns)
 
         tm.assert_frame_equal(result, expected)
 
@@ -856,6 +835,23 @@ def test_wikipedia_states_table(self, datapath):
         result = self.read_html(data, 'Arizona', header=1)[0]
         assert result['sq mi'].dtype == np.dtype('float64')
 
+    def test_parser_error_on_empty_header_row(self):
+        with tm.assert_raises_regex(ParserError,
+                                    r"Passed header=\[0,1\] are "
+                                    r"too many rows for this "
+                                    r"multi_index of columns"):
+            self.read_html("""
+                <table>
+                    <thead>
+                        <tr><th></th><th></tr>
+                        <tr><th>A</th><th>B</th></tr>
+                    </thead>
+                    <tbody>
+                        <tr><td>a</td><td>b</td></tr>
+                    </tbody>
+                </table>
+            """, header=[0, 1])
+
     def test_decimal_rows(self):
         # GH 12907
         result = self.read_html('''<html>
@@ -960,6 +956,49 @@ def test_keep_default_na(self):
         html_df = self.read_html(html_data, keep_default_na=True)[0]
         tm.assert_frame_equal(expected_df, html_df)
 
+    def test_preserve_empty_rows(self):
+        result = self.read_html("""
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <td>a</td>
+                    <td>b</td>
+                </tr>
+                <tr>
+                    <td></td>
+                    <td></td>
+                </tr>
+            </table>
+        """)[0]
+
+        expected = DataFrame(data=[['a', 'b'], [np.nan, np.nan]],
+                             columns=['A', 'B'])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_ignore_empty_rows_when_inferring_header(self):
+        result = self.read_html("""
+            <table>
+                <thead>
+                    <tr><th></th><th></tr>
+                    <tr><th>A</th><th>B</th></tr>
+                    <tr><th>a</th><th>b</th></tr>
+                </thead>
+                <tbody>
+                    <tr><td>1</td><td>2</td></tr>
+                </tbody>
+            </table>
+        """)[0]
+
+        columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']],
+                             labels=[[0, 1], [0, 1]])
+        expected = DataFrame(data=[[1, 2]], columns=columns)
+
+        tm.assert_frame_equal(result, expected)
+
     def test_multiple_header_rows(self):
         # Issue #13434
         expected_df = DataFrame(data=[("Hillary", 68, "D"),

From d4f4bb1ade73278d97daf1fee08575227dbd8fb1 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Tue, 3 Jul 2018 17:59:09 -0400
Subject: [PATCH 09/12] Comments: Notes after Returns

---
 pandas/io/html.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index 18588123fdae8..45fe3b017e4f6 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -379,6 +379,14 @@ def _parse_thead_tbody_tfoot(self, table_html):
         """
         Given a table, return parsed header, body, and foot.
 
+        Parameters
+        ----------
+        table_html : node-like
+
+        Returns
+        -------
+        tuple of (header, body, footer), each a list of list-of-text rows.
+
         Notes
         -----
         Header and body are lists-of-lists. Top level list is a list of
@@ -391,14 +399,6 @@ def _parse_thead_tbody_tfoot(self, table_html):
                  all elements inside row are <th>
                - Move rows from bottom of body to footer only if
                  all elements inside row are <th>
-
-        Parameters
-        ----------
-        table_html : node-like
-
-        Returns
-        -------
-        tuple of (header, body, footer), each a list of list-of-text rows.
         """
 
         header_rows = self._parse_thead_tr(table_html)
@@ -426,11 +426,6 @@ def _expand_colspan_rowspan(self, rows):
         """
         Given a list of <tr>s, return a list of text rows.
 
-        Notes
-        -----
-        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
-        to subsequent cells.
-
         Parameters
         ----------
         rows : list of node-like
@@ -440,6 +435,11 @@ def _expand_colspan_rowspan(self, rows):
         -------
         list of list
             Each returned row is a list of str text.
+
+        Notes
+        -----
+        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
+        to subsequent cells.
         """
 
         all_texts = []  # list of rows, each a list of str

From e296bd1a8e1051c8078ea2e387470b0f6b795525 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Wed, 4 Jul 2018 13:36:12 -0400
Subject: [PATCH 10/12] Document read_html changes in whatsnew

---
 doc/source/whatsnew/v0.24.0.txt | 114 ++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 83100dc634661..5aac64eb7f976 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -168,6 +168,120 @@ Current Behavior:
     ...
     OverflowError: Trying to coerce negative values to unsigned integers
 
+read_html Incompatibilities
+---------------------------
+
+:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes.
+Now it understands them, treating them as a sequence of cells with the same
+value.
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [1]: pd.read_html("""
+          <table>
+            <thead>
+              <tr>
+                <th>A</th><th>B</th><th>C</th>
+              </tr>
+            </thead>
+            <tbody>
+              <tr>
+                <td colspan="2">1</td><td>2</td>
+              </tr>
+            </tbody>
+          </table>
+        """)
+    Out [1]:
+    [   A  B   C
+    0  1  2 NaN]
+
+Current Behavior:
+
+.. code-block:: ipython
+
+    In [1]: pd.read_html("""
+          <table>
+            <thead>
+              <tr>
+                <th>A</th><th>B</th><th>C</th>
+              </tr>
+            </thead>
+            <tbody>
+              <tr>
+                <td colspan="2">1</td><td>2</td>
+              </tr>
+            </tbody>
+          </table>
+        """)
+    Out [1]:
+    [   A  B  C
+    0  1  2  2]
+
+Calls that relied on the previous behavior will need to be changed.
+
+Also, :func:`read_html` previously ignored some ``<tr>`` elements when called
+with ``header=`` or ``skiprows=`` on some unusual HTML tables.
+(:issue:`21641`)
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [1]: pd.read_html("""
+          <table>
+            <thead>
+              <tr>
+                <!-- empty header row, was ignored -->
+                <th></th><th></th><th></th>
+              </tr>
+              <tr>
+                <th>A</th><th>B</th><th>C</th>
+              </tr>
+            </thead>
+            <tbody>
+              <tr>
+                <td>1</td><td>2</td><td>3</td>
+              </tr>
+            </tbody>
+          </table>
+        """, header=2)
+    Out [1]:
+    [Empty DataFrame
+    Columns: [1, 2, 3]
+    Index: []]
+
+Current Behavior:
+
+.. code-block:: ipython
+
+    In [1]: pd.read_html("""
+          <table>
+            <thead>
+              <tr>
+                <!-- empty header row, was ignored -->
+                <th></th><th></th><th></th>
+              </tr>
+              <tr>
+                <th>A</th><th>B</th><th>C</th>
+              </tr>
+            </thead>
+            <tbody>
+              <tr>
+                <td>1</td><td>2</td><td>3</td>
+              </tr>
+            </tbody>
+          </table>
+        """, header=2)
+    Out [1]:
+    [   A  B  C
+    0  1  2  3]
+
+Previously, the workaround was to write ``header=0`` instead of ``header=1``
+for this example table. Now, that workaround must be removed. This should not
+affect many users, since most HTML tables do not have empty header rows.
+
 - :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`)
 -
 -

From 95ce9934a4ca8a7b5504f1a68f14c0b60ea21c50 Mon Sep 17 00:00:00 2001
From: Adam Hooper <adam@adamhooper.com>
Date: Wed, 4 Jul 2018 16:29:51 -0400
Subject: [PATCH 11/12] Improve whatsnew with ipython

---
 doc/source/whatsnew/v0.24.0.txt | 122 ++++++--------------------------
 1 file changed, 23 insertions(+), 99 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index bc33c466f3529..db3598ce2a181 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -174,119 +174,43 @@ Current Behavior:
     ...
     OverflowError: Trying to coerce negative values to unsigned integers
 
-read_html Incompatibilities
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
+read_html Enhancements
+^^^^^^^^^^^^^^^^^^^^^^
 
 :func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes.
-Now it understands them, treating them as a sequence of cells with the same
+Now it understands them, treating them as sequences of cells with the same
 value. (:issue:`17054`)
 
-Previous Behavior:
-
-.. code-block:: ipython
-
-    In [1]: pd.read_html("""
-          <table>
-            <thead>
-              <tr>
-                <th>A</th><th>B</th><th>C</th>
-              </tr>
-            </thead>
-            <tbody>
-              <tr>
-                <td colspan="2">1</td><td>2</td>
-              </tr>
-            </tbody>
-          </table>
-        """)
-    Out [1]:
-    [   A  B   C
-    0  1  2 NaN]
-
-Current Behavior:
-
-.. code-block:: ipython
+.. ipython:: python
 
-    In [1]: pd.read_html("""
-          <table>
-            <thead>
-              <tr>
-                <th>A</th><th>B</th><th>C</th>
-              </tr>
-            </thead>
-            <tbody>
-              <tr>
-                <td colspan="2">1</td><td>2</td>
-              </tr>
-            </tbody>
-          </table>
-        """)
-    Out [1]:
-    [   A  B  C
-    0  1  2  2]
-
-Calls that relied on the previous behavior will need to be changed.
-
-Also, :func:`read_html` previously ignored some ``<tr>`` elements when called
-with ``header=`` or ``skiprows=`` on some unusual HTML tables.
-(:issue:`21641`)
+    result = pd.read_html("""
+      <table>
+        <thead>
+          <tr>
+            <th>A</th><th>B</th><th>C</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td colspan="2">1</td><td>2</td>
+          </tr>
+        </tbody>
+      </table>""")
 
 Previous Behavior:
 
 .. code-block:: ipython
 
-    In [1]: pd.read_html("""
-          <table>
-            <thead>
-              <tr>
-                <!-- empty header row, was ignored -->
-                <th></th><th></th><th></th>
-              </tr>
-              <tr>
-                <th>A</th><th>B</th><th>C</th>
-              </tr>
-            </thead>
-            <tbody>
-              <tr>
-                <td>1</td><td>2</td><td>3</td>
-              </tr>
-            </tbody>
-          </table>
-        """, header=2)
-    Out [1]:
-    [Empty DataFrame
-    Columns: [1, 2, 3]
-    Index: []]
+    In [13]: result
+    Out [13]:
+    [   A  B   C
+     0  1  2 NaN]
 
 Current Behavior:
 
-.. code-block:: ipython
+.. ipython:: python
 
-    In [1]: pd.read_html("""
-          <table>
-            <thead>
-              <tr>
-                <!-- empty header row, was ignored -->
-                <th></th><th></th><th></th>
-              </tr>
-              <tr>
-                <th>A</th><th>B</th><th>C</th>
-              </tr>
-            </thead>
-            <tbody>
-              <tr>
-                <td>1</td><td>2</td><td>3</td>
-              </tr>
-            </tbody>
-          </table>
-        """, header=2)
-    Out [1]:
-    [   A  B  C
-    0  1  2  3]
-
-Previously, the workaround was to write ``header=0`` instead of ``header=1``
-for this example table. Now, that workaround must be removed. This should not
-affect many users, since most HTML tables do not have empty header rows.
+    result
 
 Datetimelike API Changes
 ^^^^^^^^^^^^^^^^^^^^^^^^

From 5fd863bb3611093aefcda7e0f16573d77a3190d4 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Thu, 5 Jul 2018 12:40:28 -0500
Subject: [PATCH 12/12] fixup whatsnew

---
 doc/source/whatsnew/v0.24.0.txt | 80 +++++++++++++++++----------------
 1 file changed, 41 insertions(+), 39 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index db3598ce2a181..d0b8f00150099 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -10,7 +10,7 @@ New features
 
 - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
 
-.. _whatsnew_0240.enhancements.extension_array_operators
+.. _whatsnew_0240.enhancements.extension_array_operators:
 
 ``ExtensionArray`` operator support
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -26,6 +26,46 @@ See the :ref:`ExtensionArray Operator Support
 <extending.extension.operator>` documentation section for details on both
 ways of adding operator support.
 
+.. _whatsnew_0240.enhancements.read_html:
+
+``read_html`` Enhancements
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes.
+Now it understands them, treating them as sequences of cells with the same
+value. (:issue:`17054`)
+
+.. ipython:: python
+
+    result = pd.read_html("""
+      <table>
+        <thead>
+          <tr>
+            <th>A</th><th>B</th><th>C</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td colspan="2">1</td><td>2</td>
+          </tr>
+        </tbody>
+      </table>""")
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [13]: result
+    Out [13]:
+    [   A  B   C
+     0  1  2 NaN]
+
+Current Behavior:
+
+.. ipython:: python
+
+    result
+
 .. _whatsnew_0240.enhancements.other:
 
 Other Enhancements
@@ -174,44 +214,6 @@ Current Behavior:
     ...
     OverflowError: Trying to coerce negative values to unsigned integers
 
-read_html Enhancements
-^^^^^^^^^^^^^^^^^^^^^^
-
-:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes.
-Now it understands them, treating them as sequences of cells with the same
-value. (:issue:`17054`)
-
-.. ipython:: python
-
-    result = pd.read_html("""
-      <table>
-        <thead>
-          <tr>
-            <th>A</th><th>B</th><th>C</th>
-          </tr>
-        </thead>
-        <tbody>
-          <tr>
-            <td colspan="2">1</td><td>2</td>
-          </tr>
-        </tbody>
-      </table>""")
-
-Previous Behavior:
-
-.. code-block:: ipython
-
-    In [13]: result
-    Out [13]:
-    [   A  B   C
-     0  1  2 NaN]
-
-Current Behavior:
-
-.. ipython:: python
-
-    result
-
 Datetimelike API Changes
 ^^^^^^^^^^^^^^^^^^^^^^^^
 

+ - Move rows from bottom of body to footer only if + all elements inside row are	- def _parse_raw_tfoot(self, table): - tfoot = self._parse_tfoot(table) - res = [] - if tfoot: - res = lmap(self._text_getter, self._parse_td(tfoot[0])) - return np.atleast_1d( - np.array(res).squeeze()) if res and len(res) == 1 else res + Parameters + ---------- + table_html : a single HTML table element. - def _parse_raw_tbody(self, table): - tbodies = self._parse_tbody(table) + Returns + ------- + tuple of (header, body, footer) + header : list of rows, each of which is a list of parsed + header elements + body : list of rows, each of which is a list of parsed body elements + footer : list of rows, each of which is a list of parsed + footer elements + """ - raw_data = [] + header_rows = self._parse_thead_tr(table_html) + body_rows = self._parse_tbody_tr(table_html) + footer_rows = self._parse_tfoot_tr(table_html) - if tbodies: - for tbody in tbodies: - raw_data.extend(self._parse_tr(tbody)) - else: - raw_data.extend(self._parse_tr(table)) + if not header_rows: + # The table has no
rows as headers. + while body_rows and all(self._equals_tag(t, 'th') for t in + self._parse_td(body_rows[0])): + # this row should be a header row, move it from body to header + header_rows.append(body_rows.pop(0)) + + if not footer_rows: + # The table has no
rows as footers. + while body_rows and all(self._equals_tag(t, 'th') for t in + self._parse_td(body_rows[-1])): + # this row should be a footer row, move it from body to footer + footer_rows.insert(0, body_rows.pop()) + + header = self._expand_colspan_rowspan(header_rows) + body = self._expand_colspan_rowspan(body_rows) + footer = self._expand_colspan_rowspan(footer_rows) + + return header, body, footer + + def _expand_colspan_rowspan(self, rows): + """Given a list of
foo	bar
`` elements into the header). + + .. versionadded:: 0.21.0 Similar to :func:`~pandas.read_csv` the `header` argument is applied after `skiprows` is applied. diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 9c6a8de7ed446..b8f520ee17d72 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -18,7 +18,6 @@ from pandas.io.common import URLError, file_path_to_url import pandas.io.html from pandas.io.html import read_html -from pandas._libs.parsers import ParserError import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -129,16 +128,7 @@ def test_banklist(self): assert_framelist_equal(df1, df2) - def test_spam_no_types(self): - - # infer_types removed in #10892 - df1 = self.read_html(self.spam_data, '.Water.') - df2 = self.read_html(self.spam_data, 'Unit') - assert_framelist_equal(df1, df2) - assert df1[0].iloc[0, 0] == 'Proximates' - assert df1[0].columns[0] == 'Nutrient' - - def test_spam_with_types(self): + def test_spam(self): df1 = self.read_html(self.spam_data, '.Water.') df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) @@ -372,7 +362,7 @@ def test_thousands_macau_stats(self, datapath): attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isnull().any() for _, s in df.iteritems()) @pytest.mark.slow def test_thousands_macau_index_col(self, datapath): @@ -381,7 +371,7 @@ def test_thousands_macau_index_col(self, datapath): dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isnull().any() for _, s in df.iteritems()) def test_empty_tables(self): """ @@ -461,6 +451,44 @@ def test_header_and_one_column(self): result = self.read_html(data)[0] tm.assert_frame_equal(result, expected) + def test_thead_without_tr(self): + """ + Ensure parser adds
`` rows and ``	`` elements within each ``
`` - element in the table. ``	`` stands for "table data". + element in the table. ``	`` stands for "table data". This function + attempts to properly handle ``colspan`` and ``rowspan`` attributes. + If the function has a ``