diff --git a/pandas/io/html.py b/pandas/io/html.py
index 3c38dae91eb89..0ee523f699159 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -16,11 +16,11 @@
from pandas.io.common import (EmptyDataError, _is_url, urlopen,
parse_url, _validate_header_arg)
from pandas.io.parsers import TextParser
-from pandas.compat import (lrange, lmap, u, string_types, iteritems,
+from pandas.compat import (lrange, lmap, lfilter, u, string_types, iteritems,
raise_with_traceback, binary_type)
from pandas import Series
-from pandas.core.common import AbstractMethodError
-from pandas.formats.printing import pprint_thing
+from pandas.core.common import (AbstractMethodError, flatten)
+from pandas.io.formats.printing import pprint_thing
_IMPORTS = False
_HAS_BS4 = False
@@ -175,13 +175,15 @@ class _HtmlFrameParser(object):
-----
To subclass this class effectively you must override the following methods:
* :func:`_build_doc`
- * :func:`_text_getter`
- * :func:`_parse_td`
* :func:`_parse_tables`
- * :func:`_parse_tr`
- * :func:`_parse_thead`
- * :func:`_parse_tbody`
- * :func:`_parse_tfoot`
+ * :func:`_text_getter`
+ * :func:`_equals_tag`
+ * :func:`_has_tag`
+ * :func:`_extract_td`
+ * :func:`_extract_tr`
+ * :func:`_extract_thead`
+ * :func:`_extract_tbody`
+ * :func:`_extract_tfoot`
See each method's respective documentation for details on their
functionality.
"""
@@ -196,29 +198,32 @@ def parse_tables(self):
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
return (self._build_table(table) for table in tables)
- def _parse_raw_data(self, rows):
- """Parse the raw data into a list of lists.
+ def _parse_tables(self, doc, match, attrs):
+ """Return all tables from the parsed DOM.
Parameters
----------
- rows : iterable of node-like
- A list of row elements.
+ doc : tree-like
+ The DOM from which to parse the table element.
- text_getter : callable
- A callable that gets the text from an individual node. This must be
- defined by subclasses.
+ match : str or regular expression
+ The text to search for in the DOM tree.
- column_finder : callable
- A callable that takes a row node as input and returns a list of the
- column node in that row. This must be defined by subclasses.
+ attrs : dict
+ A dictionary of table attributes that can be used to disambiguate
+ multiple tables on a page.
+
+ Raises
+ ------
+ ValueError
+ * If `match` does not match any text in the document.
Returns
-------
- data : list of list of strings
+ tables : list of node-like
+ A list of <table> elements to be
+ parsed into raw data.
"""
- data = [[_remove_whitespace(self._text_getter(col)) for col in
- self._parse_td(row)] for row in rows]
- return data
+ raise AbstractMethodError(self)
def _text_getter(self, obj):
"""Return the text of an individual DOM node.
@@ -235,48 +240,58 @@ def _text_getter(self, obj):
"""
raise AbstractMethodError(self)
- def _parse_td(self, obj):
- """Return the td elements from a row element.
+ def _equals_tag(self, obj, tag):
+ """Returns whether an individual DOM node matches a tag
Parameters
----------
obj : node-like
+ A DOM node.
+
+ tag : string
+ Tag to be checked for equality
Returns
-------
- columns : list of node-like
- These are the elements of each row, i.e., the columns.
+ boolean
+ Does the object match tag 'tag'?
"""
raise AbstractMethodError(self)
- def _parse_tables(self, doc, match, attrs):
- """Return all tables from the parsed DOM.
+ def _contains_tag(self, obj, tag):
+ """Returns whether an individual DOM node has a particular tag
+ contained within it
Parameters
----------
- doc : tree-like
- The DOM from which to parse the table element.
+ obj : node-like
+ A DOM node.
- match : str or regular expression
- The text to search for in the DOM tree.
+ tag : string
+ Tag to be found in this DOM
- attrs : dict
- A dictionary of table attributes that can be used to disambiguate
- mutliple tables on a page.
+ Returns
+ -------
+ boolean
+ Does the object contain tag 'tag'?
+ """
+ raise AbstractMethodError(self)
- Raises
- ------
- ValueError
- * If `match` does not match any text in the document.
+ def _extract_td(self, obj):
+ """Return the td elements from a row element.
+
+ Parameters
+ ----------
+ obj : node-like
Returns
-------
- tables : list of node-like
- A list of elements to be parsed into raw data.
+ columns : list of node-like
+ These are the elements of each row, i.e., the columns.
"""
raise AbstractMethodError(self)
- def _parse_tr(self, table):
+ def _extract_tr(self, table):
"""Return the list of row elements from the parsed table element.
Parameters
@@ -291,7 +306,7 @@ def _parse_tr(self, table):
"""
raise AbstractMethodError(self)
- def _parse_thead(self, table):
+ def _extract_thead(self, table):
"""Return the header of a table.
Parameters
@@ -306,7 +321,7 @@ def _parse_thead(self, table):
"""
raise AbstractMethodError(self)
- def _parse_tbody(self, table):
+ def _extract_tbody(self, table):
"""Return the body of the table.
Parameters
@@ -321,7 +336,7 @@ def _parse_tbody(self, table):
"""
raise AbstractMethodError(self)
- def _parse_tfoot(self, table):
+ def _extract_tfoot(self, table):
"""Return the footer of the table if any.
Parameters
@@ -345,37 +360,149 @@ def _build_doc(self):
"""
raise AbstractMethodError(self)
- def _build_table(self, table):
- header = self._parse_raw_thead(table)
- body = self._parse_raw_tbody(table)
- footer = self._parse_raw_tfoot(table)
+ def _build_table(self, table_html):
+ header, body, footer = self._parse_raw_thead_tbody_tfoot(table_html)
+ # the above "footer" actually produces a footer. The below "footer"
+ # rarely does. The below "footer" is the legacy behavior and so I'm
+ # leaving it for the time being.
+ footer = self._parse_raw_tfoot(table_html)
return header, body, footer
- def _parse_raw_thead(self, table):
- thead = self._parse_thead(table)
- res = []
- if thead:
- res = lmap(self._text_getter, self._parse_th(thead[0]))
- return np.atleast_1d(
- np.array(res).squeeze()) if res and len(res) == 1 else res
+    def _parse_raw_thead_tbody_tfoot(self, table_html):
+        """Given a table, return parsed header, body, and foot.
+
+        Header and body are lists-of-lists. Top level list is a list of
+        rows. Each row is a list of parsed elements.
+
+        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
+               header, body, and footer, otherwise:
+               - Put all rows into body
+               - Move rows from top of body to header only if
+                 all elements inside row are <th>
+               - Move rows from bottom of body to footer only if
+                 all elements inside row are <th>
+
+        Parameters
+        ----------
+        table_html : node-like
+            A single table element
+
+        Returns
+        -------
+        header, body, footer
+        header : list of list of node-like
+            List of rows, each of which is a list of parsed header elements
+        body : list of list of node-like
+            List of rows, each of which is a list of parsed body elements
+        footer : list of list of node-like
+            List of rows, each of which is a list of parsed footer elements
+        """
+        header_rows = []
+        body_rows = []
+        footer_rows = []
+        # first, are there thead and tbody elements in the table?
+        if (self._contains_tag(table_html, 'thead') and
+                self._contains_tag(table_html, 'tbody')):
+            header_rows = self._extract_tr(self._extract_thead(table_html)[0])
+            body_rows = self._extract_tr(self._extract_tbody(table_html)[0])
+            if self._contains_tag(table_html, 'tfoot'):
+                footer_rows = self._extract_tr(
+                    self._extract_tfoot(table_html)[0])
+        else:
+            # otherwise we need to split the body into header/body/foot
+            body_rows = self._extract_tr(table_html)
+            if not body_rows:
+                # empty table, just return nothing
+                return [], [], []
+            # splitting criterion: if all tags within a row are th, it's part
+            # of the header/footer.  Guard on ``body_rows`` so a table made
+            # entirely of <th> rows cannot index an emptied list.
+            while body_rows and all(self._equals_tag(t, 'th') for t in
+                                    self._extract_td(body_rows[0])):
+                # this row should be a header row, move it from body to header
+                header_rows.append(body_rows.pop(0))
+            while body_rows and all(self._equals_tag(t, 'th') for t in
+                                    self._extract_td(body_rows[-1])):
+                # this row should be a footer row, move it from body to footer
+                footer_rows.insert(0, body_rows.pop())
+        header = self._expand_colspan_rowspan(header_rows, fill_rowspan=False)
+        body = self._expand_colspan_rowspan(body_rows, fill_rowspan=True)
+        footer = self._expand_colspan_rowspan(footer_rows, fill_rowspan=False)
+        # the below line is lifted from _parse_raw_tfoot. Not sure what it
+        # does.
+        footer = np.atleast_1d(np.array(footer).squeeze(
+        )) if footer and len(footer) == 1 else footer
+        return header, body, footer
+
+    def _expand_colspan_rowspan(self, rows, fill_rowspan=True):
+        """Given a list of rows, return a list of rows that properly handle
+        colspan/rowspan
+
+        Discussion on behavior of fill_rowspan in #17073
+
+        Parameters
+        ----------
+        rows : list of list of node-like
+            List of rows, each of which is a list of nodes
+
+        fill_rowspan : boolean
+            Should a rowspan fill every item in the rowspan (True) or only the
+            bottommost element (False)?
+
+        Returns
+        -------
+        res : list of list of node-like
+            List of rows, each of which is a list of nodes, respecting
+            colspan/rowspan
+        """
+        res = []
+        saved_span = []
+        for row in rows:
+            extracted_row = self._extract_td(row)
+            cols_text = [_remove_whitespace(self._text_getter(col))
+                         for col in extracted_row]
+            col_colspans = [int(col.get('colspan', 1))
+                            for col in extracted_row]
+            col_rowspans = [int(col.get('rowspan', 1))
+                            for col in extracted_row]
+            # expand cols using col_colspans: repeat each cell's text
+            # ``colspan`` times, carrying its rowspan count with it.
+            # NOTE: must be a ``list`` (not the Python 3 ``zip`` iterator)
+            # because saved rowspan cells are inserted into it below; the
+            # lambdas subscript their argument because tuple-parameter
+            # unpacking (``lambda (a, b): ...``) is Python-2-only syntax.
+            cols = list(zip(
+                list(flatten(
+                    lmap(lambda text_nc: [text_nc[0]] * text_nc[1],
+                         zip(cols_text, col_colspans)))),
+                list(flatten(
+                    lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0],
+                         zip(col_colspans, col_rowspans))))
+            ))
+            # cols is now a list of (text, number of rows)
+            # now insert any previous rowspans
+            for (col, (text, nr)) in saved_span:
+                cols.insert(col, (text, nr))
+
+            # save next saved_span
+            def advance_item_to_next_row(item):
+                (col, (text, nr)) = item
+                if nr == 1:
+                    return None
+                else:
+                    # only keep the text around if fill_rowspan is set
+                    return (col, (text if fill_rowspan else '', nr - 1))
+            saved_span = lfilter(lambda i: i is not None,
+                                 lmap(advance_item_to_next_row,
+                                      list(enumerate(cols))))
+            cols = [text for (text, nr) in cols]
+            # generate cols with text only
+            if any(col != '' for col in cols):
+                res.append(cols)
+        return res
+
+ def _parse_raw_tfoot(self, table_html):
+ tfoot = self._extract_tfoot(table_html)
res = []
if tfoot:
- res = lmap(self._text_getter, self._parse_td(tfoot[0]))
+ res = lmap(self._text_getter, self._extract_td(tfoot[0]))
return np.atleast_1d(
np.array(res).squeeze()) if res and len(res) == 1 else res
- def _parse_raw_tbody(self, table):
- tbody = self._parse_tbody(table)
-
- try:
- res = self._parse_tr(tbody[0])
- except IndexError:
- res = self._parse_tr(table)
- return self._parse_raw_data(res)
-
class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
"""HTML to DataFrame parser that uses BeautifulSoup under the hood.
@@ -397,27 +524,6 @@ def __init__(self, *args, **kwargs):
from bs4 import SoupStrainer
self._strainer = SoupStrainer('table')
- def _text_getter(self, obj):
- return obj.text
-
- def _parse_td(self, row):
- return row.find_all(('td', 'th'))
-
- def _parse_tr(self, element):
- return element.find_all('tr')
-
- def _parse_th(self, element):
- return element.find_all('th')
-
- def _parse_thead(self, table):
- return table.find_all('thead')
-
- def _parse_tbody(self, table):
- return table.find_all('tbody')
-
- def _parse_tfoot(self, table):
- return table.find_all('tfoot')
-
def _parse_tables(self, doc, match, attrs):
element_name = self._strainer.name
tables = doc.find_all(element_name, attrs=attrs)
@@ -439,6 +545,33 @@ def _parse_tables(self, doc, match, attrs):
match.pattern)
return result
+ def _text_getter(self, obj):
+ return obj.text
+
+ def _equals_tag(self, obj, tag):
+ return obj.name == tag
+
+ def _contains_tag(self, obj, tag):
+ return obj.find(tag) is not None
+
+ def _extract_td(self, row):
+ return row.find_all(('td', 'th'))
+
+ def _extract_tr(self, element):
+ return element.find_all('tr')
+
+ def _extract_th(self, element):
+ return element.find_all('th')
+
+ def _extract_thead(self, table):
+ return table.find_all('thead')
+
+ def _extract_tbody(self, table):
+ return table.find_all('tbody')
+
+ def _extract_tfoot(self, table):
+ return table.find_all('tfoot')
+
def _setup_build_doc(self):
raw_text = _read(self.io)
if not raw_text:
@@ -498,16 +631,6 @@ class _LxmlFrameParser(_HtmlFrameParser):
def __init__(self, *args, **kwargs):
super(_LxmlFrameParser, self).__init__(*args, **kwargs)
- def _text_getter(self, obj):
- return obj.text_content()
-
- def _parse_td(self, row):
- return row.xpath('.//td|.//th')
-
- def _parse_tr(self, table):
- expr = './/tr[normalize-space()]'
- return table.xpath(expr)
-
def _parse_tables(self, doc, match, kwargs):
pattern = match.pattern
@@ -527,6 +650,22 @@ def _parse_tables(self, doc, match, kwargs):
raise ValueError("No tables found matching regex %r" % pattern)
return tables
+ def _equals_tag(self, obj, tag):
+ return obj.tag == tag
+
+ def _contains_tag(self, obj, tag):
+ return obj.find(tag) is not None
+
+ def _text_getter(self, obj):
+ return obj.text_content()
+
+ def _extract_td(self, row):
+ return row.xpath('.//td|.//th')
+
+ def _extract_tr(self, table):
+ expr = './/tr[normalize-space()]'
+ return table.xpath(expr)
+
def _build_doc(self):
"""
Raises
@@ -581,19 +720,27 @@ def _build_doc(self):
raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
return r
- def _parse_tbody(self, table):
+ def _extract_tbody(self, table):
return table.xpath('.//tbody')
- def _parse_thead(self, table):
+ def _extract_thead(self, table):
return table.xpath('.//thead')
- def _parse_tfoot(self, table):
+ def _extract_tfoot(self, table):
return table.xpath('.//tfoot')
def _parse_raw_thead(self, table):
- expr = './/thead//th'
- return [_remove_whitespace(x.text_content()) for x in
- table.xpath(expr)]
+ expr = './/thead'
+ thead = table.xpath(expr)
+ res = []
+ if thead:
+ trs = self._extract_tr(thead[0])
+ for tr in trs:
+ cols = [_remove_whitespace(x.text_content()) for x in
+ self._extract_td(tr)]
+ if any([col != '' for col in cols]):
+ res.append(cols)
+ return res
def _parse_raw_tfoot(self, table):
expr = './/tfoot//th|//tfoot//td'
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index f4eec864da572..e91fbebf02108 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -633,6 +633,118 @@ def test_different_number_of_rows(self):
res = self.read_html(out, index_col=0)[0]
tm.assert_frame_equal(expected, res)
+    def test_colspan_rowspan_are_1(self):
+        # a colspan/rowspan of 1 must parse exactly like no attribute at all
+        expected = """
+        <table>
+            <tr>
+                <th>X</th>
+                <th>Y</th>
+                <th>Z</th>
+                <th>W</th>
+            </tr>
+        </table>
+        """
+        out = """
+        <table>
+            <tr>
+                <th colspan="1">X</th>
+                <th rowspan="1">Y</th>
+                <th rowspan="1" colspan="1">Z</th>
+                <th>W</th>
+            </tr>
+        </table>
+        """
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_colspan_rowspan_are_more_than_1(self):
+        # a colspan > 1 repeats the cell; a header rowspan leaves the
+        # continuation cell empty (fill_rowspan=False for headers)
+        expected = """
+        <table>
+            <tr>
+                <th>X</th>
+                <th>X</th>
+                <th>Y</th>
+                <th>Z</th>
+                <th>W</th>
+            </tr>
+            <tr>
+                <td>1</td>
+                <td>2</td>
+                <td>2</td>
+                <td></td>
+                <td>3</td>
+            </tr>
+        </table>
+        """
+        out = """
+        <table>
+            <tr>
+                <th colspan="2">X</th>
+                <th>Y</th>
+                <th rowspan="2">Z</th>
+                <th>W</th>
+            </tr>
+            <tr>
+                <td>1</td>
+                <td colspan="2">2</td>
+                <td>3</td>
+            </tr>
+        </table>
+        """
+        expected = self.read_html(expected)[0]
+        res = self.read_html(out)[0]
+        tm.assert_frame_equal(expected, res)
+
+    def test_header_should_be_inferred_from_th_elements(self):
+        # an all-<th> first row must be treated as a header even without
+        # an explicit <thead> element
+        expected = """
+        <table>
+            <thead>
+                <tr>
+                    <th>X</th>
+                    <th>X</th>
+                    <th>Y</th>
+                    <th>Z</th>
+                    <th>W</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>1</td>
+                    <td>2</td>
+                    <td>3</td>
+                    <td>4</td>
+                    <td>5</td>
+                </tr>
+            </tbody>
+        </table>
+        """
+        out = """
+        <table>
+            <tr>
+                <th>X</th>
+                <th>X</th>
+                <th>Y</th>
+                <th>Z</th>
+                <th>W</th>
+            </tr>
+            <tr>
+                <td>1</td>
+                <td>2</td>
+                <td>3</td>
+                <td>4</td>
+                <td>5</td>
+            </tr>
+        </table>
+        """
+        expected = self.read_html(expected)[0]  # header is explicit
+        res = self.read_html(out)[0]  # infer header
+        tm.assert_frame_equal(expected, res)
+        res2 = self.read_html(out, header=0)[0]  # manually set header
+        tm.assert_frame_equal(expected, res2)
+
def test_parse_dates_list(self):
df = DataFrame({'date': date_range('1/1/2001', periods=10)})
expected = df.to_html()
@@ -650,13 +762,6 @@ def test_parse_dates_combine(self):
newdf = DataFrame({'datetime': raw_dates})
tm.assert_frame_equal(newdf, res[0])
- def test_computer_sales_page(self):
- data = os.path.join(DATA_PATH, 'computer_sales_page.html')
- with tm.assertRaisesRegexp(ParserError, r"Passed header=\[0,1\] are "
- "too many rows for this multi_index "
- "of columns"):
- self.read_html(data, header=[0, 1])
-
def test_wikipedia_states_table(self):
data = os.path.join(DATA_PATH, 'wikipedia_states.html')
assert os.path.isfile(data), '%r is not a file' % data
@@ -872,8 +977,9 @@ def test_computer_sales_page(self):
def test_invalid_flavor():
url = 'google.com'
- with tm.assertRaises(ValueError):
- read_html(url, 'google', flavor='not a* valid**++ flaver')
+ with pytest.raises(ValueError):
+ read_html(url, 'google', flavor='not a* valid**++ flavor')
+ # pandas/tests/io/test_html.py
def get_elements_from_file(url, element='table'):
@@ -922,3 +1028,33 @@ def test_same_ordering():
if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)
+
+
+class ErrorThread(threading.Thread):
+    """Thread that captures any exception raised by its target in
+    ``self.err`` so the spawning test can assert on it afterwards."""
+
+    def run(self):
+        try:
+            super(ErrorThread, self).run()
+        except Exception as e:
+            # stash the exception instead of letting it die on this thread
+            self.err = e
+        else:
+            # NOTE: ``err`` is only set once the thread has actually run
+            self.err = None
+
+
+@pytest.mark.slow
+def test_importcheck_thread_safety():
+    # see gh-16928
+
+    # force the import check by reinitialising global vars in html.py
+    reload(pandas.io.html)
+
+    filename = os.path.join(DATA_PATH, 'valid_markup.html')
+    helper_thread1 = ErrorThread(target=read_html, args=(filename,))
+    helper_thread2 = ErrorThread(target=read_html, args=(filename,))
+
+    helper_thread1.start()
+    helper_thread2.start()
+
+    # join() blocks until completion; avoids the CPU-burning busy-wait of
+    # polling is_alive() in a tight loop
+    helper_thread1.join()
+    helper_thread2.join()
+    assert None is helper_thread1.err is helper_thread2.err
|