From 3e5879440a6b513233f71f62de9234f67125dee7 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Thu, 14 Jun 2018 13:09:44 -0400 Subject: [PATCH 01/12] Handle colspan and rowspan This is essentially a rebased and squashed #17054 (mad props to @jowens for doing all the hard thinking). My tweaks: * test_computer_sales_page (see #17074) no longer tests for ParserError, because the ParserError was a bug caused by missing colspan support. Now, test that MultiIndex works as expected. * I respectfully removed the fill_rowspan argument from #17073. Instead, the virtual cells created by rowspan/colspan are always copies of the real cells' text. This prevents _infer_columns() from naming virtual cells as "Unnamed: ..." * I removed a small layer of abstraction to respect #20891 (multiple <tbody> support), which was implemented after @jowens' pull request. Now _HtmlFrameParser has _parse_thead_trs, _parse_tbody_trs and _parse_tfoot_trs, each returning a list of <tr>s. That let me remove _parse_tr, Making All The Tests Pass. * That caused a snowball effect. lxml does not fix malformed <thead>, as tested by spam.html. The previous hacky workaround was in _parse_raw_thead, but the new _parse_thead_trs signature returns nodes instead of text. The new hacky solution: return the <thead> itself, pretending it's a <tr>. This works in all the tests. A better solution is to use html5lib with lxml; but that might belong in a separate pull request. --- doc/source/whatsnew/v0.24.0.txt | 3 +- pandas/io/html.py | 368 ++++++++++++++++++-------------- pandas/tests/io/test_html.py | 210 ++++++++++++++++-- 3 files changed, 405 insertions(+), 176 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index abf574ae109fd..0f0ad3452e934 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -24,6 +24,7 @@ Other Enhancements `__. 
(:issue:`21627`) - New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) +- :func:`read_html` handles colspan and rowspan arguments and attempts to infer a header if the header is not explicitly specified (:issue:`17054`) - .. _whatsnew_0240.api_breaking: @@ -223,7 +224,7 @@ MultiIndex I/O ^^^ -- +- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) - - diff --git a/pandas/io/html.py b/pandas/io/html.py index 8fd876e85889f..6e774f1846b99 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -10,13 +10,11 @@ from distutils.version import LooseVersion -import numpy as np - from pandas.core.dtypes.common import is_list_like from pandas.errors import EmptyDataError from pandas.io.common import _is_url, urlopen, _validate_header_arg from pandas.io.parsers import TextParser -from pandas.compat import (lrange, lmap, u, string_types, iteritems, +from pandas.compat import (lrange, lmap, lfilter, u, string_types, iteritems, raise_with_traceback, binary_type) from pandas import Series import pandas.core.common as com @@ -193,11 +191,11 @@ class _HtmlFrameParser(object): * :func:`_build_doc` * :func:`_text_getter` * :func:`_parse_td` + * :func:`_parse_thead_tr` + * :func:`_parse_tbody_tr` + * :func:`_parse_tfoot_tr` * :func:`_parse_tables` - * :func:`_parse_tr` - * :func:`_parse_thead` - * :func:`_parse_tbody` - * :func:`_parse_tfoot` + * :func:`_equals_tag` See each method's respective documentation for details on their functionality. 
""" @@ -210,32 +208,14 @@ def __init__(self, io, match, attrs, encoding, displayed_only): self.displayed_only = displayed_only def parse_tables(self): - tables = self._parse_tables(self._build_doc(), self.match, self.attrs) - return (self._build_table(table) for table in tables) - - def _parse_raw_data(self, rows): - """Parse the raw data into a list of lists. - - Parameters - ---------- - rows : iterable of node-like - A list of row elements. - - text_getter : callable - A callable that gets the text from an individual node. This must be - defined by subclasses. - - column_finder : callable - A callable that takes a row node as input and returns a list of the - column node in that row. This must be defined by subclasses. + """Parse and return all tables from the DOM. Returns ------- - data : list of list of strings + tables : list of parsed (header, body, footer) tuples from tables """ - data = [[_remove_whitespace(self._text_getter(col)) for col in - self._parse_td(row)] for row in rows] - return data + tables = self._parse_tables(self._build_doc(), self.match, self.attrs) + return (self._parse_thead_tbody_tfoot(table) for table in tables) def _text_getter(self, obj): """Return the text of an individual DOM node. @@ -257,7 +237,7 @@ def _parse_td(self, obj): Parameters ---------- - obj : node-like + obj : an HTML row element Returns ------- @@ -266,90 +246,88 @@ def _parse_td(self, obj): """ raise com.AbstractMethodError(self) - def _parse_tables(self, doc, match, attrs): - """Return all tables from the parsed DOM. + def _parse_thead_tr(self, table): + """Return the list of thead row elements from the parsed table element. Parameters ---------- - doc : tree-like - The DOM from which to parse the table element. - - match : str or regular expression - The text to search for in the DOM tree. - - attrs : dict - A dictionary of table attributes that can be used to disambiguate - multiple tables on a page. 
- - Raises - ------ - ValueError - * If `match` does not match any text in the document. + table : a table element that contains zero or more thead elements. Returns ------- - tables : list of node-like - A list of elements to be parsed into raw data. + rows : list of row elements of a table """ raise com.AbstractMethodError(self) - def _parse_tr(self, table): - """Return the list of row elements from the parsed table element. + def _parse_tbody_tr(self, table): + """Return the list of tbody row elements from the parsed table element. + + HTML5 table bodies consist of either 0 or more elements (which + only contain elements) or 0 or more elements. This method + checks for both structures. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - rows : list of node-like - A list row elements of a table, usually or row elements of a table """ raise com.AbstractMethodError(self) - def _parse_thead(self, table): - """Return the header of a table. + def _parse_tfoot_tr(self, table): + """Return the list of tfoot row elements from the parsed table element. Parameters ---------- - table : node-like - A table element that contains row elements. + table : a table element that contains row elements. Returns ------- - thead : node-like - A ... element. + rows : list of row elements of a table """ raise com.AbstractMethodError(self) - def _parse_tbody(self, table): - """Return the list of tbody elements from the parsed table element. + def _parse_tables(self, doc, match, attrs): + """Return all tables from the parsed DOM. Parameters ---------- - table : node-like - A table element that contains row elements. + doc : the DOM from which to parse the table element. + + match : str or regular expression + The text to search for in the DOM tree. + + attrs : dict + A dictionary of table attributes that can be used to disambiguate + multiple tables on a page. 
+ + Raises + ------ + ValueError : `match` does not match any text in the document. Returns ------- - tbodys : list of node-like - A list of ... elements + tables : list of HTML
elements. + rows : list of
elements to be parsed into raw data. """ raise com.AbstractMethodError(self) - def _parse_tfoot(self, table): - """Return the footer of the table if any. + def _equals_tag(self, obj, tag): + """Return whether an individual DOM node matches a tag Parameters ---------- - table : node-like - A table element that contains row elements. + obj : node-like + A DOM node. + + tag : str + Tag name to be checked for equality Returns ------- - tfoot : node-like - A ... element. + is_tag_equal : boolean + boolean indicating if the object is equal to tag 'tag' """ raise com.AbstractMethodError(self) @@ -358,47 +336,115 @@ def _build_doc(self): Returns ------- - obj : tree-like + obj : the DOM from which to parse the table element. """ raise com.AbstractMethodError(self) - def _build_table(self, table): - header = self._parse_raw_thead(table) - body = self._parse_raw_tbody(table) - footer = self._parse_raw_tfoot(table) - return header, body, footer + def _parse_thead_tbody_tfoot(self, table_html): + """Given a table, return parsed header, body, and foot. + Header and body are lists-of-lists. Top level list is a list of + rows. Each row is a list of parsed elements. - def _parse_raw_thead(self, table): - thead = self._parse_thead(table) - res = [] - if thead: - trs = self._parse_tr(thead[0]) - for tr in trs: - cols = lmap(self._text_getter, self._parse_td(tr)) - if any(col != '' for col in cols): - res.append(cols) - return res + Logic: Use , , elements to identify + header, body, and footer, otherwise: + - Put all rows into body + - Move rows from top of body to header only if + all elements inside row are . Treat first all-. Treat last all-s, return a list of text rows that copy cell + text across rowspans/colspans. 
- return self._parse_raw_data(raw_data) + Parameters + ---------- + rows : list of s + + Returns + ------- + res : list of rows, each of which is a list of str in that row + """ + + res = [] + saved_span = [] + for row in rows: + extracted_row = self._parse_td(row) + cols_text = [_remove_whitespace( + self._text_getter(col)) for col in extracted_row] + col_colspans = [int(col.get('colspan', 1)) + for col in extracted_row] + col_rowspans = [int(col.get('rowspan', 1)) + for col in extracted_row] + # expand cols using col_colspans + # maybe this can be done with a list comprehension, dunno + cols = list(zip( + list(com.flatten( + lmap(lambda text_nc: [text_nc[0]] * text_nc[1], + list(zip(cols_text, col_colspans))))), + list(com.flatten( + lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0], + list(zip(col_colspans, col_rowspans)))))) + ) + # cols is now a list of (text, number of rows) + # now insert any previous rowspans + for (col, (text, nr)) in saved_span: + cols.insert(col, (text, nr)) + + # save next saved_span + def advance_item_to_next_row(item): + (col, (text, nr)) = item + if nr == 1: + return None + else: + return (col, (text, nr - 1)) + saved_span = lfilter(lambda i: i is not None, + lmap(advance_item_to_next_row, + list(enumerate(cols)))) + cols = [text for (text, nr) in cols] + # generate cols with text only + if any([col != '' for col in cols]): + res.append(cols) + + return res def _handle_hidden_tables(self, tbl_list, attr_name): """Returns list of tables, potentially removing hidden elements @@ -442,27 +488,6 @@ def __init__(self, *args, **kwargs): from bs4 import SoupStrainer self._strainer = SoupStrainer('table') - def _text_getter(self, obj): - return obj.text - - def _parse_td(self, row): - return row.find_all(('td', 'th')) - - def _parse_tr(self, element): - return element.find_all('tr') - - def _parse_th(self, element): - return element.find_all('th') - - def _parse_thead(self, table): - return table.find_all('thead') - - def _parse_tbody(self, table): - 
return table.find_all('tbody') - - def _parse_tfoot(self, table): - return table.find_all('tfoot') - def _parse_tables(self, doc, match, attrs): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) @@ -490,6 +515,27 @@ def _parse_tables(self, doc, match, attrs): .format(patt=match.pattern)) return result + def _text_getter(self, obj): + return obj.text + + def _equals_tag(self, obj, tag): + return obj.name == tag + + def _parse_td(self, row): + return row.find_all(('td', 'th'), recursive=False) + + def _parse_thead_tr(self, table): + return table.select('thead tr') + + def _parse_tbody_tr(self, table): + from_tbody = table.select('tbody tr') + from_root = table.find_all('tr', recursive=False) + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.select('tfoot tr') + def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: @@ -554,10 +600,9 @@ def _text_getter(self, obj): return obj.text_content() def _parse_td(self, row): - return row.xpath('.//td|.//th') - - def _parse_tr(self, table): - return table.xpath('.//tr') + # Look for direct descendents only: the "row" element here may be a + # or (see _parse_thead_tr). 
+ return row.xpath('./td|./th') def _parse_tables(self, doc, match, kwargs): pattern = match.pattern @@ -590,6 +635,12 @@ def _parse_tables(self, doc, match, kwargs): .format(patt=pattern)) return tables + def _equals_tag(self, obj, tag): + return obj.tag == tag + + def _contains_tag(self, obj, tag): + return obj.find(tag) is not None + def _build_doc(self): """ Raises @@ -637,41 +688,30 @@ def _build_doc(self): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) return r - def _parse_tbody(self, table): - return table.xpath('.//tbody') + def _parse_thead_tr(self, table): + rows = [] - def _parse_thead(self, table): - return table.xpath('.//thead') + for thead in table.xpath('.//thead'): + rows.extend(thead.xpath('./tr')) - def _parse_tfoot(self, table): - return table.xpath('.//tfoot') + # lxml does not clean up the clearly-erroneous + # . + elements_at_root = thead.xpath('./td|./th') + if elements_at_root: + # Pass the entire as a row. _parse_td() will interpret + # it correctly. 
+ rows.append(thead) - def _parse_raw_thead(self, table): - expr = './/thead' - thead = table.xpath(expr) - res = [] - if thead: - # Grab any directly descending table headers first - ths = thead[0].xpath('./th') - if ths: - cols = [_remove_whitespace(x.text_content()) for x in ths] - if any(col != '' for col in cols): - res.append(cols) - else: - trs = self._parse_tr(thead[0]) + return rows - for tr in trs: - cols = [_remove_whitespace(x.text_content()) for x in - self._parse_td(tr)] + def _parse_tbody_tr(self, table): + from_tbody = table.xpath('.//tbody//tr') + from_root = table.xpath('./tr') + # HTML spec: at most one of these lists has content + return from_tbody + from_root - if any(col != '' for col in cols): - res.append(cols) - return res - - def _parse_raw_tfoot(self, table): - expr = './/tfoot//th|//tfoot//td' - return [_remove_whitespace(x.text_content()) for x in - table.xpath(expr)] + def _parse_tfoot_tr(self, table): + return table.xpath('.//tfoot//tr') def _expand_elements(body): @@ -695,7 +735,7 @@ def _data_to_frame(**kwargs): header = 0 if rows == [0] else rows if foot: - body += [foot] + body += foot # fill out elements of body that are "ragged" _expand_elements(body) @@ -953,7 +993,13 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, This function searches for ``
+ - Move rows from bottom of body to footer only if + all elements inside row are - def _parse_raw_tfoot(self, table): - tfoot = self._parse_tfoot(table) - res = [] - if tfoot: - res = lmap(self._text_getter, self._parse_td(tfoot[0])) - return np.atleast_1d( - np.array(res).squeeze()) if res and len(res) == 1 else res + Parameters + ---------- + table_html : a single HTML table element. - def _parse_raw_tbody(self, table): - tbodies = self._parse_tbody(table) + Returns + ------- + tuple of (header, body, footer) + header : list of rows, each of which is a list of parsed + header elements + body : list of rows, each of which is a list of parsed body elements + footer : list of rows, each of which is a list of parsed + footer elements + """ - raw_data = [] + header_rows = self._parse_thead_tr(table_html) + body_rows = self._parse_tbody_tr(table_html) + footer_rows = self._parse_tfoot_tr(table_html) - if tbodies: - for tbody in tbodies: - raw_data.extend(self._parse_tr(tbody)) - else: - raw_data.extend(self._parse_tr(table)) + if not header_rows: + # The table has no
rows as headers. + while body_rows and all(self._equals_tag(t, 'th') for t in + self._parse_td(body_rows[0])): + # this row should be a header row, move it from body to header + header_rows.append(body_rows.pop(0)) + + if not footer_rows: + # The table has no
rows as footers. + while body_rows and all(self._equals_tag(t, 'th') for t in + self._parse_td(body_rows[-1])): + # this row should be a footer row, move it from body to footer + footer_rows.insert(0, body_rows.pop()) + + header = self._expand_colspan_rowspan(header_rows) + body = self._expand_colspan_rowspan(body_rows) + footer = self._expand_colspan_rowspan(footer_rows) + + return header, body, footer + + def _expand_colspan_rowspan(self, rows): + """Given a list of
foobar
`` elements and only for ```` and ```` or ```` argument, it is used to construct + the header, otherwise the function attempts to find the header within + the body (by putting rows with only `` within on malformed HTML. + """ + data1 = StringIO('''
`` rows and ```` elements within each ``
`` - element in the table. ```` stands for "table data". + element in the table. ```` stands for "table data". This function + attempts to properly handle ``colspan`` and ``rowspan`` attributes. + If the function has a ``
`` elements into the header). + + .. versionadded:: 0.21.0 Similar to :func:`~pandas.read_csv` the `header` argument is applied **after** `skiprows` is applied. diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 9c6a8de7ed446..b8f520ee17d72 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -18,7 +18,6 @@ from pandas.io.common import URLError, file_path_to_url import pandas.io.html from pandas.io.html import read_html -from pandas._libs.parsers import ParserError import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -129,16 +128,7 @@ def test_banklist(self): assert_framelist_equal(df1, df2) - def test_spam_no_types(self): - - # infer_types removed in #10892 - df1 = self.read_html(self.spam_data, '.*Water.*') - df2 = self.read_html(self.spam_data, 'Unit') - assert_framelist_equal(df1, df2) - assert df1[0].iloc[0, 0] == 'Proximates' - assert df1[0].columns[0] == 'Nutrient' - - def test_spam_with_types(self): + def test_spam(self): df1 = self.read_html(self.spam_data, '.*Water.*') df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) @@ -372,7 +362,7 @@ def test_thousands_macau_stats(self, datapath): attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isnull().any() for _, s in df.iteritems()) @pytest.mark.slow def test_thousands_macau_index_col(self, datapath): @@ -381,7 +371,7 @@ def test_thousands_macau_index_col(self, datapath): dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] - assert not any(s.isna().any() for _, s in df.iteritems()) + assert not any(s.isnull().any() for _, s in df.iteritems()) def test_empty_tables(self): """ @@ -461,6 +451,44 @@ def test_header_and_one_column(self): result = self.read_html(data)[0] tm.assert_frame_equal(result, expected) + def test_thead_without_tr(self): + """ + Ensure parser adds
+ + + + + + + + + + + + + + +
CountryMunicipalityYear
UkraineOdessa1944
''') + data2 = StringIO(''' + + + + + + + + + + + + +
CountryMunicipalityYear
UkraineOdessa1944
''') + res1 = self.read_html(data1) + res2 = self.read_html(data2, header=0) + assert_framelist_equal(res1, res2) + def test_tfoot_read(self): """ Make sure that read_html reads tfoot, containing td or th. @@ -592,7 +620,7 @@ def test_gold_canyon(self): attrs={'id': 'table'})[0] assert gc in df.to_string() - def test_different_number_of_rows(self): + def test_different_number_of_cols(self): expected = """ @@ -654,6 +682,160 @@ def test_different_number_of_rows(self): res = self.read_html(out, index_col=0)[0] tm.assert_frame_equal(expected, res) + def test_colspan_rowspan_are_1(self): + # GH17054 + expected = """
+ + + + + + + + + + +
XYZW
""" + out = """ + + + + + + + + + + +
XYZW
""" + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_colspan_rowspan_are_more_than_1(self): + # GH17054 + expected = """ + + + + + + + + + + + + + + + + + + +
XXYZW
122Z3
""" + out = """ + + + + + + + + + + + + + + + +
XYZW
123
""" + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_tbody_colspan_rowspan_copy_values(self): + # GH17054 + expected = """ + + + + + + + + + + + + + + + + +
11234
56637
""" + out = """ + + + + + + + + + + + + + +
1234
567
""" + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_header_should_be_inferred_from_th_elements(self): + # GH17054 + expected = """ + + + + + + + + + + + + + + + + + +
XXYZW
12345
""" + out = """ + + + + + + + + + + + + + +
XXYZW
12345
""" + expected = self.read_html(expected)[0] # header is explicit + res = self.read_html(out)[0] # infer header + tm.assert_frame_equal(expected, res) + res2 = self.read_html(out, header=0)[0] # manually set header + tm.assert_frame_equal(expected, res2) + def test_parse_dates_list(self): df = DataFrame({'date': date_range('1/1/2001', periods=10)}) expected = df.to_html() From f89b32a1ed1b770f5bc5f5cad8bd1dee75107f88 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Fri, 15 Jun 2018 11:30:34 -0400 Subject: [PATCH 02/12] Fixes after code review -- thanks, @WillAyd --- pandas/io/html.py | 224 ++++++++++++++++++++------------ pandas/tests/io/test_html.py | 244 ++++++++++++++++++++++------------- 2 files changed, 295 insertions(+), 173 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6e774f1846b99..d97a79cd51f08 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -14,7 +14,7 @@ from pandas.errors import EmptyDataError from pandas.io.common import _is_url, urlopen, _validate_header_arg from pandas.io.parsers import TextParser -from pandas.compat import (lrange, lmap, lfilter, u, string_types, iteritems, +from pandas.compat import (lrange, lmap, u, string_types, iteritems, raise_with_traceback, binary_type) from pandas import Series import pandas.core.common as com @@ -189,6 +189,7 @@ class _HtmlFrameParser(object): ----- To subclass this class effectively you must override the following methods: * :func:`_build_doc` + * :func:`_attr_getter` * :func:`_text_getter` * :func:`_parse_td` * :func:`_parse_thead_tr` @@ -208,7 +209,8 @@ def __init__(self, io, match, attrs, encoding, displayed_only): self.displayed_only = displayed_only def parse_tables(self): - """Parse and return all tables from the DOM. + """ + Parse and return all tables from the DOM. 
Returns ------- @@ -217,8 +219,28 @@ def parse_tables(self): tables = self._parse_tables(self._build_doc(), self.match, self.attrs) return (self._parse_thead_tbody_tfoot(table) for table in tables) + def _attr_getter(self, obj, attr): + """ + Return the attribute value of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + attr : str or unicode + The attribute, such as "colspan" + + Returns + ------- + text : str or unicode + The attribute value. + """ + raise com.AbstractMethodError(self) + def _text_getter(self, obj): - """Return the text of an individual DOM node. + """ + Return the text of an individual DOM node. Parameters ---------- @@ -237,7 +259,8 @@ def _parse_td(self, obj): Parameters ---------- - obj : an HTML row element + obj : node-like + A DOM node. Returns ------- @@ -247,7 +270,8 @@ def _parse_td(self, obj): raise com.AbstractMethodError(self) def _parse_thead_tr(self, table): - """Return the list of thead row elements from the parsed table element. + """ + Return the list of thead row elements from the parsed table element. Parameters ---------- @@ -260,7 +284,8 @@ def _parse_thead_tr(self, table): raise com.AbstractMethodError(self) def _parse_tbody_tr(self, table): - """Return the list of tbody row elements from the parsed table element. + """ + Return the list of tbody row elements from the parsed table element. HTML5 table bodies consist of either 0 or more elements (which only contain elements) or 0 or more elements. This method @@ -277,7 +302,8 @@ def _parse_tbody_tr(self, table): raise com.AbstractMethodError(self) def _parse_tfoot_tr(self, table): - """Return the list of tfoot row elements from the parsed table element. + """ + Return the list of tfoot row elements from the parsed table element. Parameters ---------- @@ -290,7 +316,8 @@ def _parse_tfoot_tr(self, table): raise com.AbstractMethodError(self) def _parse_tables(self, doc, match, attrs): - """Return all tables from the parsed DOM. 
+ """ + Return all tables from the parsed DOM. Parameters ---------- @@ -314,7 +341,8 @@ def _parse_tables(self, doc, match, attrs): raise com.AbstractMethodError(self) def _equals_tag(self, obj, tag): - """Return whether an individual DOM node matches a tag + """ + Return whether an individual DOM node matches a tag Parameters ---------- @@ -332,7 +360,8 @@ def _equals_tag(self, obj, tag): raise com.AbstractMethodError(self) def _build_doc(self): - """Return a tree-like object that can be used to iterate over the DOM. + """ + Return a tree-like object that can be used to iterate over the DOM. Returns ------- @@ -341,48 +370,49 @@ def _build_doc(self): raise com.AbstractMethodError(self) def _parse_thead_tbody_tfoot(self, table_html): - """Given a table, return parsed header, body, and foot. - Header and body are lists-of-lists. Top level list is a list of - rows. Each row is a list of parsed elements. - - Logic: Use , , elements to identify - header, body, and footer, otherwise: - - Put all rows into body - - Move rows from top of body to header only if - all elements inside row are - - Move rows from bottom of body to footer only if - all elements inside row are + """ + Given a table, return parsed header, body, and foot. + + Header and body are lists-of-lists. Top level list is a list of + rows. Each row is a list of str text. + + Logic: Use , , elements to identify + header, body, and footer, otherwise: + - Put all rows into body + - Move rows from top of body to header only if + all elements inside row are + - Move rows from bottom of body to footer only if + all elements inside row are Parameters ---------- - table_html : a single HTML table element. 
+ table_html : node-like Returns ------- tuple of (header, body, footer) - header : list of rows, each of which is a list of parsed - header elements - body : list of rows, each of which is a list of parsed body elements - footer : list of rows, each of which is a list of parsed - footer elements """ header_rows = self._parse_thead_tr(table_html) body_rows = self._parse_tbody_tr(table_html) footer_rows = self._parse_tfoot_tr(table_html) + def row_is_all_th(row): + return all(self._equals_tag(t, 'th') for t in + self._parse_td(row)) + if not header_rows: - # The table has no . Treat first all- rows as headers. - while body_rows and all(self._equals_tag(t, 'th') for t in - self._parse_td(body_rows[0])): - # this row should be a header row, move it from body to header + # The table has no . Move the top all- rows from the + # to the . (This is a common case because many + # tables in the wild have no or + while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) if not footer_rows: # The table has no . Treat last all- rows as footers. - while body_rows and all(self._equals_tag(t, 'th') for t in - self._parse_td(body_rows[-1])): - # this row should be a footer row, move it from body to footer + while body_rows and row_is_all_th(body_rows[-1]): + # .insert(), not .append(): we're moving "bottom of " to + # "top of " footer_rows.insert(0, body_rows.pop()) header = self._expand_colspan_rowspan(header_rows) @@ -392,8 +422,9 @@ def _parse_thead_tbody_tfoot(self, table_html): return header, body, footer def _expand_colspan_rowspan(self, rows): - """Given a list of s, return a list of text rows that copy cell - text across rowspans/colspans. + """ + Given a list of s, return a list of text rows that copy cell + text across rowspans/colspans. 
Parameters ---------- @@ -404,50 +435,69 @@ def _expand_colspan_rowspan(self, rows): res : list of rows, each of which is a list of str in that row """ - res = [] - saved_span = [] - for row in rows: - extracted_row = self._parse_td(row) - cols_text = [_remove_whitespace( - self._text_getter(col)) for col in extracted_row] - col_colspans = [int(col.get('colspan', 1)) - for col in extracted_row] - col_rowspans = [int(col.get('rowspan', 1)) - for col in extracted_row] - # expand cols using col_colspans - # maybe this can be done with a list comprehension, dunno - cols = list(zip( - list(com.flatten( - lmap(lambda text_nc: [text_nc[0]] * text_nc[1], - list(zip(cols_text, col_colspans))))), - list(com.flatten( - lmap(lambda nc_nr: [nc_nr[1]] * nc_nr[0], - list(zip(col_colspans, col_rowspans)))))) - ) - # cols is now a list of (text, number of rows) - # now insert any previous rowspans - for (col, (text, nr)) in saved_span: - cols.insert(col, (text, nr)) - - # save next saved_span - def advance_item_to_next_row(item): - (col, (text, nr)) = item - if nr == 1: - return None - else: - return (col, (text, nr - 1)) - saved_span = lfilter(lambda i: i is not None, - lmap(advance_item_to_next_row, - list(enumerate(cols)))) - cols = [text for (text, nr) in cols] - # generate cols with text only - if any([col != '' for col in cols]): - res.append(cols) - - return res + all_texts = [] # list of rows, each a list of str + remainder = [] # list of (index, text, nrows) + + for tr in rows: + texts = [] # the output for this row + next_remainder = [] + + index = 0 + tds = self._parse_td(tr) + for td in tds: + # Append texts from previous rows with rowspan>1 that come + # before this + while remainder and remainder[0][0] <= index: + prev_i, prev_text, prev_rowspan = remainder.pop(0) + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, + prev_rowspan - 1)) + index += 1 + + # Append the text from this , colspan times + text = 
_remove_whitespace(self._text_getter(td)) + rowspan = int(self._attr_getter(td, 'rowspan') or 1) + colspan = int(self._attr_getter(td, 'colspan') or 1) + + for _ in range(colspan): + texts.append(text) + if rowspan > 1: + next_remainder.append((index, text, rowspan - 1)) + index += 1 + + # Append texts from previous rows at the final position + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, + prev_rowspan - 1)) + + all_texts.append(texts) + remainder = next_remainder + + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, + prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder + + # ignore all-empty-text rows + no_empty = [row for row in all_texts + if any(text for text in row)] + + return no_empty def _handle_hidden_tables(self, tbl_list, attr_name): - """Returns list of tables, potentially removing hidden elements + """ + Return list of tables, potentially removing hidden elements Parameters ---------- @@ -515,6 +565,9 @@ def _parse_tables(self, doc, match, attrs): .format(patt=match.pattern)) return result + def _attr_getter(self, obj, attr): + return obj.get(attr) + def _text_getter(self, obj): return obj.text @@ -596,11 +649,14 @@ class _LxmlFrameParser(_HtmlFrameParser): def __init__(self, *args, **kwargs): super(_LxmlFrameParser, self).__init__(*args, **kwargs) + def _attr_getter(self, obj, attr): + return obj.get(attr) + def _text_getter(self, obj): return obj.text_content() def _parse_td(self, row): - # Look for direct descendents only: the "row" element here may be a + # Look for direct children only: the "row" element here may be a # or (see _parse_thead_tr). 
return row.xpath('./td|./th') @@ -694,12 +750,14 @@ def _parse_thead_tr(self, table): for thead in table.xpath('.//thead'): rows.extend(thead.xpath('./tr')) - # lxml does not clean up the clearly-erroneous - # foobar. + # HACK: lxml does not clean up the clearly-erroneous + # foobar. (Missing ). Add + # the and _pretend_ it's a ; _parse_td() will find its + # children as though it's a . + # + # Better solution would be to use html5lib. elements_at_root = thead.xpath('./td|./th') if elements_at_root: - # Pass the entire as a row. _parse_td() will interpret - # it correctly. rows.append(thead) return rows diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index b8f520ee17d72..ace96c526c86f 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -362,7 +362,7 @@ def test_thousands_macau_stats(self, datapath): attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] - assert not any(s.isnull().any() for _, s in df.iteritems()) + assert not any(s.isna().any() for _, s in df.iteritems()) @pytest.mark.slow def test_thousands_macau_index_col(self, datapath): @@ -371,7 +371,7 @@ def test_thousands_macau_index_col(self, datapath): dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] - assert not any(s.isnull().any() for _, s in df.iteritems()) + assert not any(s.isna().any() for _, s in df.iteritems()) def test_empty_tables(self): """ @@ -685,29 +685,21 @@ def test_different_number_of_cols(self): def test_colspan_rowspan_are_1(self): # GH17054 expected = """ - - - - - - - - - - -
XYZW
""" + + X + Y + Z + W + + """ out = """ - - - - - - - - - - -
XYZW
""" + + X + Y + Z + W + + """ expected = self.read_html(expected)[0] res = self.read_html(out)[0] tm.assert_frame_equal(expected, res) @@ -715,42 +707,34 @@ def test_colspan_rowspan_are_1(self): def test_colspan_rowspan_are_more_than_1(self): # GH17054 expected = """ - - - - - - - - - - - - - - - - - - -
XXYZW
122Z3
""" + + X + X + Y + Z + W + + + 1 + 2 + 2 + Z + 3 + + """ out = """ - - - - - - - - - - - - - - - -
XYZW
123
""" + + X + Y + Z + W + + + 1 + 2 + 3 + + """ expected = self.read_html(expected)[0] res = self.read_html(out)[0] tm.assert_frame_equal(expected, res) @@ -758,40 +742,120 @@ def test_colspan_rowspan_are_more_than_1(self): def test_tbody_colspan_rowspan_copy_values(self): # GH17054 expected = """ - - - - - - - - - - - - - - - - -
11234
56637
""" + + 1 + 1 + 2 + 3 + 4 + + + 5 + 6 + 6 + 3 + 7 + + """ out = """ - - - - - - - - - - - - - -
1234
567
""" + + 1 + 2 + 3 + 4 + + + 5 + 6 + 7 + + """ + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_colspan_rowspan_both_not_1(self): + # GH17054 + expected = """ + + + + + + + + + + + + + + +
abbbc
abbbd
""" + out = """ + + + + + + + + +
abc
d
""" + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_rowspan_at_end_of_row(self): + # GH17054 + expected = """ + + + + + + + + +
ab
cb
""" + out = """ + + + + + + + +
ab
c
""" + expected = self.read_html(expected)[0] + res = self.read_html(out)[0] + tm.assert_frame_equal(expected, res) + + def test_rowspan_only_rows(self): + # GH17054 + expected = """ + + + + + + + + + + + + +
ab
ab
ab
""" + out = """ + + + + +
ab
""" expected = self.read_html(expected)[0] res = self.read_html(out)[0] + print(res) tm.assert_frame_equal(expected, res) def test_header_should_be_inferred_from_th_elements(self): From 34f87cb9338385f91d3df204b666e7dd5b8cfb65 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Fri, 15 Jun 2018 11:38:32 -0400 Subject: [PATCH 03/12] Docstring tweaks --- pandas/io/html.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index d97a79cd51f08..e02c39c7ff04c 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -214,7 +214,7 @@ def parse_tables(self): Returns ------- - tables : list of parsed (header, body, footer) tuples from tables + list of parsed (header, body, footer) tuples from tables. """ tables = self._parse_tables(self._build_doc(), self.match, self.attrs) return (self._parse_thead_tbody_tfoot(table) for table in tables) @@ -233,7 +233,7 @@ def _attr_getter(self, obj, attr): Returns ------- - text : str or unicode + str or unicode The attribute value. """ raise com.AbstractMethodError(self) @@ -264,7 +264,7 @@ def _parse_td(self, obj): Returns ------- - columns : list of node-like + list of node-like These are the elements of each row, i.e., the columns. """ raise com.AbstractMethodError(self) @@ -279,7 +279,8 @@ def _parse_thead_tr(self, table): Returns ------- - rows : list of row elements of a table + list of node-like + These are the row elements of a table. """ raise com.AbstractMethodError(self) @@ -297,7 +298,8 @@ def _parse_tbody_tr(self, table): Returns ------- - rows : list of row elements of a table + list of node-like + These are the row elements of a table. """ raise com.AbstractMethodError(self) @@ -311,7 +313,8 @@ def _parse_tfoot_tr(self, table): Returns ------- - rows : list of row elements of a table + list of node-like + These are the row elements of a table. 
""" raise com.AbstractMethodError(self) @@ -336,7 +339,8 @@ def _parse_tables(self, doc, match, attrs): Returns ------- - tables : list of HTML elements to be parsed into raw data. + list of node-like + HTML
elements to be parsed into raw data. """ raise com.AbstractMethodError(self) @@ -354,8 +358,8 @@ def _equals_tag(self, obj, tag): Returns ------- - is_tag_equal : boolean - boolean indicating if the object is equal to tag 'tag' + boolean + Whether the object is equal to tag 'tag' """ raise com.AbstractMethodError(self) @@ -365,7 +369,8 @@ def _build_doc(self): Returns ------- - obj : the DOM from which to parse the table element. + node-like + The DOM from which to parse the table element. """ raise com.AbstractMethodError(self) @@ -390,7 +395,7 @@ def _parse_thead_tbody_tfoot(self, table_html): Returns ------- - tuple of (header, body, footer) + tuple of (header, body, footer), each a list of list-of-text rows. """ header_rows = self._parse_thead_tr(table_html) @@ -432,7 +437,8 @@ def _expand_colspan_rowspan(self, rows): Returns ------- - res : list of rows, each of which is a list of str in that row + list of list + Each returned row is a list of str text. """ all_texts = [] # list of rows, each a list of str @@ -501,14 +507,14 @@ def _handle_hidden_tables(self, tbl_list, attr_name): Parameters ---------- - tbl_list : list of Tag or list of Element + tbl_list : list of node-like Type of list elements will vary depending upon parser used attr_name : str Name of the accessor for retrieving HTML attributes Returns ------- - list of Tag or list of Element + list of node-like Return type matches `tbl_list` """ if not self.displayed_only: From 582c86b8667a851db706a702d51b6904044ff688 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Fri, 15 Jun 2018 12:46:25 -0400 Subject: [PATCH 04/12] Details -- thanks, @jreback --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/io/html.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 0f0ad3452e934..499a48faec325 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -24,7 +24,7 @@ Other Enhancements `__. 
(:issue:`21627`) - New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) -- :func:`read_html` handles colspan and rowspan arguments and attempts to infer a header if the header is not explicitly specified (:issue:`17054`) +- :func:`read_html` copies cell data across ``colspan``s and ``rowspan``s, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) - .. _whatsnew_0240.api_breaking: diff --git a/pandas/io/html.py b/pandas/io/html.py index e02c39c7ff04c..15f37007e22c8 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -378,6 +378,8 @@ def _parse_thead_tbody_tfoot(self, table_html): """ Given a table, return parsed header, body, and foot. + Notes + ----- Header and body are lists-of-lists. Top level list is a list of rows. Each row is a list of str text. @@ -700,9 +702,6 @@ def _parse_tables(self, doc, match, kwargs): def _equals_tag(self, obj, tag): return obj.tag == tag - def _contains_tag(self, obj, tag): - return obj.find(tag) is not None - def _build_doc(self): """ Raises From d2f0b83fccddd78a8214053b12c826f045167a8d Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Fri, 15 Jun 2018 15:57:02 -0400 Subject: [PATCH 05/12] Tweak comments --- pandas/io/html.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 15f37007e22c8..fcb01ebdd06d1 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -354,12 +354,12 @@ def _equals_tag(self, obj, tag): A DOM node. tag : str - Tag name to be checked for equality + Tag name to be checked for equality. 
Returns ------- boolean - Whether the object is equal to tag 'tag' + Whether `obj`'s tag name is `tag` """ raise com.AbstractMethodError(self) @@ -430,12 +430,17 @@ def row_is_all_th(row): def _expand_colspan_rowspan(self, rows): """ - Given a list of s, return a list of text rows that copy cell - text across rowspans/colspans. + Given a list of s, return a list of text rows. + + Notes + ----- + Any cell with ``rowspan`` or ``colspan`` will have its contents copied + to subsequent cells. Parameters ---------- - rows : list of s + rows : list of node-like + List of s Returns ------- From 74c23842db970f98d3139c3930e98b301aea2229 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Fri, 15 Jun 2018 18:52:04 -0400 Subject: [PATCH 06/12] Address latest review comments from @WillAyd Mostly involved reformatting test_html.py --- pandas/io/html.py | 20 +- pandas/tests/io/test_html.py | 594 +++++++++++++++++++---------------- 2 files changed, 322 insertions(+), 292 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index fcb01ebdd06d1..466aa7444f638 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -236,7 +236,8 @@ def _attr_getter(self, obj, attr): str or unicode The attribute value. """ - raise com.AbstractMethodError(self) + # Both lxml and BeautifulSoup have the same implementation: + return obj.get(attr) def _text_getter(self, obj): """ @@ -409,19 +410,12 @@ def row_is_all_th(row): self._parse_td(row)) if not header_rows: - # The table has no . Move the top all- to the . (This is a common case because many + # The table has no . Move the top all- or while body_rows and row_is_all_th(body_rows[0]): header_rows.append(body_rows.pop(0)) - if not footer_rows: - # The table has no . 
Treat last all-" to - # "top of " - footer_rows.insert(0, body_rows.pop()) - header = self._expand_colspan_rowspan(header_rows) body = self._expand_colspan_rowspan(body_rows) footer = self._expand_colspan_rowspan(footer_rows) @@ -578,9 +572,6 @@ def _parse_tables(self, doc, match, attrs): .format(patt=match.pattern)) return result - def _attr_getter(self, obj, attr): - return obj.get(attr) - def _text_getter(self, obj): return obj.text @@ -662,9 +653,6 @@ class _LxmlFrameParser(_HtmlFrameParser): def __init__(self, *args, **kwargs): super(_LxmlFrameParser, self).__init__(*args, **kwargs) - def _attr_getter(self, obj, attr): - return obj.get(attr) - def _text_getter(self, obj): return obj.text_content() diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index ace96c526c86f..b2a72f015f369 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -391,18 +391,21 @@ def test_empty_tables(self):
rows from the - #
rows from + # body_rows to header_rows. (This is a common case because many # tables in the wild have no
rows as footers. - while body_rows and row_is_all_th(body_rows[-1]): - # .insert(), not .append(): we're moving "bottom of
''' + data2 = data1 + '''
''' - res1 = self.read_html(StringIO(data1)) - res2 = self.read_html(StringIO(data2)) - assert_framelist_equal(res1, res2) + + expected = self.read_html(data1) + result = self.read_html(data2) + + assert_framelist_equal(result, expected) def test_multiple_tbody(self): # GH-20690 # Read all tbody tags within a single table. - data = ''' + result = self.read_html('''
@@ -421,9 +424,10 @@ def test_multiple_tbody(self): -
A4
''' + ''')[0] + expected = DataFrame({'A': [1, 3], 'B': [2, 4]}) - result = self.read_html(StringIO(data))[0] + tm.assert_frame_equal(result, expected) def test_header_and_one_column(self): @@ -431,9 +435,7 @@ def test_header_and_one_column(self): Don't fail with bs4 when there is a header and only one column as described in issue #9178 """ - data = StringIO(''' - - + result = self.read_html('''
@@ -444,18 +446,17 @@ def test_header_and_one_column(self): -
Headerfirst
- - ''') + ''')[0] + expected = DataFrame(data={'Header': 'first'}, index=[0]) - result = self.read_html(data)[0] + tm.assert_frame_equal(result, expected) def test_thead_without_tr(self): """ Ensure parser adds within on malformed HTML. """ - data1 = StringIO(''' + expected = self.read_html('''
@@ -470,8 +471,9 @@ def test_thead_without_tr(self): -
Country1944
''') - data2 = StringIO(''' +
''')[0] + + result = self.read_html(''' @@ -484,10 +486,9 @@ def test_thead_without_tr(self): -
Country Municipality1944
''') - res1 = self.read_html(data1) - res2 = self.read_html(data2, header=0) - assert_framelist_equal(res1, res2) + ''')[0] + + tm.assert_frame_equal(result, expected) def test_tfoot_read(self): """ @@ -512,18 +513,21 @@ def test_tfoot_read(self): ''' + expected1 = DataFrame({'A': ['bodyA'], 'B': ['bodyB']}) + expected2 = DataFrame({'A': ['bodyA', 'footA'], + 'B': ['bodyB', 'footB']}) + data1 = data_template.format(footer="") data2 = data_template.format( footer="footAfootB") - d1 = {'A': ['bodyA'], 'B': ['bodyB']} - d2 = {'A': ['bodyA', 'footA'], 'B': ['bodyB', 'footB']} + result1 = self.read_html(data1)[0] + result2 = self.read_html(data2)[0] - tm.assert_frame_equal(self.read_html(data1)[0], DataFrame(d1)) - tm.assert_frame_equal(self.read_html(data2)[0], DataFrame(d2)) + tm.assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result2, expected2) def test_countries_municipalities(self): - # GH5048 data1 = StringIO(''' @@ -540,6 +544,7 @@ def test_countries_municipalities(self):
''') + data2 = StringIO(''' @@ -555,20 +560,22 @@ def test_countries_municipalities(self):
''') - res1 = self.read_html(data1) - res2 = self.read_html(data2, header=0) - assert_framelist_equal(res1, res2) + + expected = self.read_html(data1)[0] + result = self.read_html(data2, header=0)[0] # GH5048 + + tm.assert_frame_equal(result, expected) def test_nyse_wsj_commas_table(self, datapath): data = datapath('io', 'data', 'nyse_wsj.html') df = self.read_html(data, index_col=0, header=0, attrs={'class': 'mdcTable'})[0] - columns = Index(['Issue(Roll over for charts and headlines)', - 'Volume', 'Price', 'Chg', '% Chg']) + expected = Index(['Issue(Roll over for charts and headlines)', + 'Volume', 'Price', 'Chg', '% Chg']) nrows = 100 assert df.shape[0] == nrows - tm.assert_index_equal(df.columns, columns) + tm.assert_index_equal(df.columns, expected) @pytest.mark.slow def test_banklist_header(self, datapath): @@ -621,7 +628,7 @@ def test_gold_canyon(self): assert gc in df.to_string() def test_different_number_of_cols(self): - expected = """ + expected = self.read_html("""
@@ -650,8 +657,9 @@ def test_different_number_of_cols(self): -
0.222
""" - out = """ +
""", index_col=0)[0] + + result = self.read_html(""" @@ -677,228 +685,254 @@ def test_different_number_of_cols(self): -
0.222
""" - expected = self.read_html(expected, index_col=0)[0] - res = self.read_html(out, index_col=0)[0] - tm.assert_frame_equal(expected, res) + """, index_col=0)[0] + + tm.assert_frame_equal(result, expected) def test_colspan_rowspan_are_1(self): # GH17054 - expected = """ - - - - - - -
XYZW
""" - out = """ - - - - - - -
XYZW
""" - expected = self.read_html(expected)[0] - res = self.read_html(out)[0] - tm.assert_frame_equal(expected, res) + expected = self.read_html( + """ + + + + + + +
XYZW
""")[0] + + result = self.read_html( + """ + + + + + + +
XYZW
""")[0] + + tm.assert_frame_equal(result, expected) def test_colspan_rowspan_are_more_than_1(self): # GH17054 - expected = """ - - - - - - - - - - - - - - -
XXYZW
122Z3
""" - out = """ - - - - - - - - - - - -
XYZW
123
""" - expected = self.read_html(expected)[0] - res = self.read_html(out)[0] - tm.assert_frame_equal(expected, res) + expected = self.read_html( + """ + + + + + + + + + + + + + + +
XXYZW
122Z3
""")[0] + + result = self.read_html( + """ + + + + + + + + + + + +
XYZW
123
""")[0] + + tm.assert_frame_equal(result, expected) def test_tbody_colspan_rowspan_copy_values(self): # GH17054 - expected = """ - - - - - - - - - - - - - - -
11234
56637
""" - out = """ - - - - - - - - - - - -
1234
567
""" - expected = self.read_html(expected)[0] - res = self.read_html(out)[0] - tm.assert_frame_equal(expected, res) + expected = self.read_html( + """ + + + + + + + + + + + + + + +
11234
56637
""")[0] + + result = self.read_html( + """ + + + + + + + + + + + +
1234
567
""")[0] + + tm.assert_frame_equal(result, expected) def test_colspan_rowspan_both_not_1(self): # GH17054 - expected = """ - - - - - - - - - - - - - - -
abbbc
abbbd
""" - out = """ - - - - - - - - -
abc
d
""" - expected = self.read_html(expected)[0] - res = self.read_html(out)[0] - tm.assert_frame_equal(expected, res) + expected = self.read_html( + """ + + + + + + + + + + + + + + +
abbbc
abbbd
""")[0] + + result = self.read_html( + """ + + + + + + + + +
abc
d
""")[0] + + tm.assert_frame_equal(result, expected) def test_rowspan_at_end_of_row(self): # GH17054 - expected = """ - - - - - - - - -
ab
cb
""" - out = """ + expected = read_html( + """
+ + + + + + + + +
ab
cb
""")[0] + + result = read_html( + """ + + + + + + + +
ab
c
""")[0] + + tm.assert_frame_equal(result, expected) + + def test_rowspan_only_rows(self): + # GH17054 + expected = self.read_html( + """ + + + + + + + + + + + + +
ab
ab
ab
""")[0] + + result = read_html( + """ + + + + +
ab
""")[0] + + tm.assert_frame_equal(result, expected) + + def test_header_inferred_from_th_elements(self): + # GH17054 + expected = read_html( + """ + - - + + + + + - + + + + + -
abXXYZW
cabaaa
""" - expected = self.read_html(expected)[0] - res = self.read_html(out)[0] - tm.assert_frame_equal(expected, res) - - def test_rowspan_only_rows(self): - # GH17054 - expected = """ - - - - - - - - - - - - -
ab
ab
ab
""" - out = """ + + - - + + + + + -
ab12345
""" - expected = self.read_html(expected)[0] - res = self.read_html(out)[0] - print(res) - tm.assert_frame_equal(expected, res) + + """)[0] + + result = read_html( + """ + + + + + + + + + + + + + + + + + + + + + +
XXYZW
abaaa
12345
""")[0] - def test_header_should_be_inferred_from_th_elements(self): - # GH17054 - expected = """ - - - - - - - - - - - - - - - - - -
XXYZW
12345
""" - out = """ - - - - - - - - - - - - - -
XXYZW
12345
""" - expected = self.read_html(expected)[0] # header is explicit - res = self.read_html(out)[0] # infer header - tm.assert_frame_equal(expected, res) - res2 = self.read_html(out, header=0)[0] # manually set header - tm.assert_frame_equal(expected, res2) + tm.assert_frame_equal(result, expected) def test_parse_dates_list(self): df = DataFrame({'date': date_range('1/1/2001', periods=10)}) @@ -936,9 +970,8 @@ def test_wikipedia_states_table(self, datapath): assert result['sq mi'].dtype == np.dtype('float64') def test_decimal_rows(self): - # GH 12907 - data = StringIO(''' + result = self.read_html(''' @@ -953,9 +986,10 @@ def test_decimal_rows(self):
- ''') + ''', decimal='#')[0] + expected = DataFrame(data={'Header': 1100.101}, index=[0]) - result = self.read_html(data, decimal='#')[0] + assert result['Header'].dtype == np.dtype('float64') tm.assert_frame_equal(result, expected) @@ -963,53 +997,61 @@ def test_bool_header_arg(self): # GH 6114 for arg in [True, False]: with pytest.raises(TypeError): - read_html(self.spam_data, header=arg) + self.read_html(self.spam_data, header=arg) def test_converters(self): # GH 13461 - html_data = """ - - - - - - - - - - - - -
a
0.763
0.244
""" + result = self.read_html( + """ + + + + + + + + + + + + + +
a
0.763
0.244
""", + converters={'a': str} + )[0] - expected_df = DataFrame({'a': ['0.763', '0.244']}) - html_df = read_html(html_data, converters={'a': str})[0] - tm.assert_frame_equal(expected_df, html_df) + expected = DataFrame({'a': ['0.763', '0.244']}) + + tm.assert_frame_equal(result, expected) def test_na_values(self): # GH 13461 - html_data = """ - - - - - - - - - - - - -
a
0.763
0.244
""" + result = self.read_html( + """ + + + + + + + + + + + + + +
a
0.763
0.244
""", + na_values=[0.244])[0] - expected_df = DataFrame({'a': [0.763, np.nan]}) - html_df = read_html(html_data, na_values=[0.244])[0] - tm.assert_frame_equal(expected_df, html_df) + expected = DataFrame({'a': [0.763, np.nan]}) + + tm.assert_frame_equal(result, expected) def test_keep_default_na(self): html_data = """ + @@ -1024,11 +1066,11 @@ def test_keep_default_na(self):
a
""" expected_df = DataFrame({'a': ['N/A', 'NA']}) - html_df = read_html(html_data, keep_default_na=False)[0] + html_df = self.read_html(html_data, keep_default_na=False)[0] tm.assert_frame_equal(expected_df, html_df) expected_df = DataFrame({'a': [np.nan, np.nan]}) - html_df = read_html(html_data, keep_default_na=True)[0] + html_df = self.read_html(html_data, keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) def test_multiple_header_rows(self): @@ -1040,7 +1082,7 @@ def test_multiple_header_rows(self): ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"]] html = expected_df.to_html(index=False) - html_df = read_html(html, )[0] + html_df = self.read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) def test_works_on_valid_markup(self, datapath): From ad6e869b2459d605288ba996a105e88d65c1cfa0 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Thu, 21 Jun 2018 09:54:25 -0400 Subject: [PATCH 07/12] Clean up html tests --- pandas/tests/io/test_html.py | 445 +++++++++++++---------------------- 1 file changed, 166 insertions(+), 279 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index b2a72f015f369..fca872e459892 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -377,30 +377,28 @@ def test_empty_tables(self): """ Make sure that read_html ignores empty tables. """ - data1 = ''' - - - - - - - - - - - - -
AB
12
''' - - data2 = data1 + ''' - - -
''' - - expected = self.read_html(data1) - result = self.read_html(data2) + result = self.read_html(''' + + + + + + + + + + + + + +
AB
12
+ + + +
+ ''') - assert_framelist_equal(result, expected) + assert len(result) == 1 def test_multiple_tbody(self): # GH-20690 @@ -456,7 +454,7 @@ def test_thead_without_tr(self): """ Ensure parser adds within on malformed HTML. """ - expected = self.read_html(''' + result = self.read_html('''
@@ -473,20 +471,11 @@ def test_thead_without_tr(self):
Country
''')[0] - result = self.read_html(''' - - - - - - - - - - - - -
CountryMunicipalityYear
UkraineOdessa1944
''')[0] + expected = DataFrame(data={ + 'Country': ['Ukraine'], + 'Municipality': ['Odessa'], + 'Year': [1944], + }) tm.assert_frame_equal(result, expected) @@ -527,42 +516,23 @@ def test_tfoot_read(self): tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) - def test_countries_municipalities(self): - data1 = StringIO(''' - - - - - - - - - - - - - - -
CountryMunicipalityYear
UkraineOdessa1944
''') - - data2 = StringIO(''' - - + def test_parse_header_of_non_string_column(self): + # GH5048: if header is specified explicitly, an int column should be + # parsed as int while its header is parsed as str + result = self.read_html(''' +
- - - + + - - + - -
CountryMunicipalityYearSI
UkraineOdessatext 1944
''') + + ''', header=0)[0] - expected = self.read_html(data1)[0] - result = self.read_html(data2, header=0)[0] # GH5048 + expected = DataFrame(data={'S': ['text'], 'I': [1944]}) tm.assert_frame_equal(result, expected) @@ -689,248 +659,165 @@ def test_different_number_of_cols(self): tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_are_1(self): + def test_colspan_rowspan_1(self): # GH17054 - expected = self.read_html( - """ - - - - - - -
XYZW
""")[0] + result = self.read_html(""" + + + + + + + + + + + +
ABC
abc
+ """)[0] - result = self.read_html( - """ - - - - - - -
XYZW
""")[0] + expected = DataFrame(data={ + 'A': ['a'], + 'B': ['b'], + 'C': ['c'], + }) tm.assert_frame_equal(result, expected) - def test_colspan_rowspan_are_more_than_1(self): + def test_colspan_rowspan_copy_values(self): # GH17054 - expected = self.read_html( - """ - - - - - - - - - - - - - - -
XXYZW
122Z3
""")[0] - - result = self.read_html( - """ - - - - - - - - - - - -
XYZW
123
""")[0] - tm.assert_frame_equal(result, expected) + # In ASCII, with lowercase letters being copies: + # + # X x Y Z W + # A B b z C - def test_tbody_colspan_rowspan_copy_values(self): - # GH17054 - expected = self.read_html( - """ - - - - - - - - - - - - - - -
11234
56637
""")[0] + result = self.read_html(""" + + + + + + + + + + + + +
XYZW
ABC
+ """, header=0)[0] - result = self.read_html( - """ - - - - - - - - - - - -
1234
567
""")[0] + expected = DataFrame(data={ + 'X': ['A'], + 'X.1': ['B'], + 'Y': ['B'], + 'Z': ['Z'], + 'W': ['C'], + }) tm.assert_frame_equal(result, expected) def test_colspan_rowspan_both_not_1(self): # GH17054 - expected = self.read_html( - """ - - - - - - - - - - - - - - -
abbbc
abbbd
""")[0] - result = self.read_html( - """ - - - - - - - - -
abc
d
""")[0] + # In ASCII, with lowercase letters being copies: + # + # A B b b C + # a b b b D + + result = self.read_html(""" + + + + + + + + + +
ABC
D
+ """, header=0)[0] + + expected = DataFrame(data={ + 'A': ['A'], + 'B': ['B'], + 'B.1': ['B'], + 'B.2': ['B'], + 'C': ['D'], + }) tm.assert_frame_equal(result, expected) def test_rowspan_at_end_of_row(self): # GH17054 - expected = read_html( - """ - - - - - - - - -
ab
cb
""")[0] - - result = read_html( - """ - - - - - - - -
ab
c
""")[0] + + # In ASCII, with lowercase letters being copies: + # + # A B + # C b + + result = self.read_html(""" + + + + + + + + +
AB
C
+ """, header=0)[0] + + expected = DataFrame(data={ + 'A': ['C'], + 'B': ['B'] + }) tm.assert_frame_equal(result, expected) def test_rowspan_only_rows(self): # GH17054 - expected = self.read_html( - """ - - - - - - - - - - - - -
ab
ab
ab
""")[0] - - result = read_html( - """ - - - - -
ab
""")[0] + + result = self.read_html(""" + + + + + +
AB
+ """, header=0)[0] + + expected = DataFrame(data={ + 'A': ['A', 'A'], + 'B': ['B', 'B'], + }) tm.assert_frame_equal(result, expected) def test_header_inferred_from_th_elements(self): # GH17054 - expected = read_html( - """ - - - - - - - - - - - - - - - - - - - - - - - - - -
XXYZW
abaaa
12345
""")[0] + result = self.read_html(""" + + + + + + + + + + + + + +
AB
ab
12
+ """)[0] - result = read_html( - """ - - - - - - - - - - - - - - - - - - - - - -
XXYZW
abaaa
12345
""")[0] + expected = DataFrame(data={ + ('A', 'a'): [1], + ('B', 'b'): [2], + }) tm.assert_frame_equal(result, expected) From 6fa04896dd763d86f98bc1efd3adb54e646de441 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Thu, 21 Jun 2018 12:07:07 -0400 Subject: [PATCH 08/12] Do not nix rows of empty ... but _ignore_ empty rows when inferring columns. This changes the behavior of test_spam_header, which previously ignored an empty row when the user explicitly stated the row number to use as header. --- pandas/io/html.py | 18 ++--- pandas/tests/io/test_html.py | 125 +++++++++++++++++++++++------------ 2 files changed, 92 insertions(+), 51 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 466aa7444f638..18588123fdae8 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -496,11 +496,7 @@ def _expand_colspan_rowspan(self, rows): all_texts.append(texts) remainder = next_remainder - # ignore all-empty-text rows - no_empty = [row for row in all_texts - if any(text for text in row)] - - return no_empty + return all_texts def _handle_hidden_tables(self, tbl_list, attr_name): """ @@ -785,10 +781,16 @@ def _data_to_frame(**kwargs): header = kwargs.pop('header') kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) if head: - rows = lrange(len(head)) body = head + body - if header is None: # special case when a table has elements - header = 0 if rows == [0] else rows + + # Infer header when there is a or top -only rows + if header is None: + if len(head) == 1: + header = 0 + else: + # ignore all-empty-text rows + header = [i for i, row in enumerate(head) + if any(text for text in row)] if foot: body += foot diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index fca872e459892..b78c4f27d8c3f 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -15,6 +15,7 @@ date_range, Series) from pandas.compat import (map, zip, StringIO, BytesIO, is_platform_windows, PY3, reload) +from pandas.errors import ParserError from 
pandas.io.common import URLError, file_path_to_url import pandas.io.html from pandas.io.html import read_html @@ -147,7 +148,7 @@ def test_banklist_no_match(self): assert isinstance(df, DataFrame) def test_spam_header(self): - df = self.read_html(self.spam_data, '.*Water.*', header=1)[0] + df = self.read_html(self.spam_data, '.*Water.*', header=2)[0] assert df.columns[0] == 'Proximates' assert not df.empty @@ -424,7 +425,7 @@ def test_multiple_tbody(self): ''')[0] - expected = DataFrame({'A': [1, 3], 'B': [2, 4]}) + expected = DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B']) tm.assert_frame_equal(result, expected) @@ -471,11 +472,8 @@ def test_thead_without_tr(self): ''')[0] - expected = DataFrame(data={ - 'Country': ['Ukraine'], - 'Municipality': ['Odessa'], - 'Year': [1944], - }) + expected = DataFrame(data=[['Ukraine', 'Odessa', 1944]], + columns=['Country', 'Municipality', 'Year']) tm.assert_frame_equal(result, expected) @@ -502,9 +500,10 @@ def test_tfoot_read(self): ''' - expected1 = DataFrame({'A': ['bodyA'], 'B': ['bodyB']}) - expected2 = DataFrame({'A': ['bodyA', 'footA'], - 'B': ['bodyB', 'footB']}) + expected1 = DataFrame(data=[['bodyA', 'bodyB']], columns=['A', 'B']) + + expected2 = DataFrame(data=[['bodyA', 'bodyB'], ['footA', 'footB']], + columns=['A', 'B']) data1 = data_template.format(footer="") data2 = data_template.format( @@ -532,7 +531,7 @@ def test_parse_header_of_non_string_column(self): ''', header=0)[0] - expected = DataFrame(data={'S': ['text'], 'I': [1944]}) + expected = DataFrame([['text', 1944]], columns=('S', 'I')) tm.assert_frame_equal(result, expected) @@ -676,11 +675,7 @@ def test_colspan_rowspan_1(self): """)[0] - expected = DataFrame(data={ - 'A': ['a'], - 'B': ['b'], - 'C': ['c'], - }) + expected = DataFrame([['a', 'b', 'c']], columns=['A', 'B', 'C']) tm.assert_frame_equal(result, expected) @@ -708,13 +703,8 @@ def test_colspan_rowspan_copy_values(self): """, header=0)[0] - expected = DataFrame(data={ - 'X': ['A'], - 'X.1': 
['B'], - 'Y': ['B'], - 'Z': ['Z'], - 'W': ['C'], - }) + expected = DataFrame(data=[['A', 'B', 'B', 'Z', 'C']], + columns=['X', 'X.1', 'Y', 'Z', 'W']) tm.assert_frame_equal(result, expected) @@ -739,13 +729,8 @@ def test_colspan_rowspan_both_not_1(self): """, header=0)[0] - expected = DataFrame(data={ - 'A': ['A'], - 'B': ['B'], - 'B.1': ['B'], - 'B.2': ['B'], - 'C': ['D'], - }) + expected = DataFrame(data=[['A', 'B', 'B', 'B', 'D']], + columns=['A', 'B', 'B.1', 'B.2', 'C']) tm.assert_frame_equal(result, expected) @@ -769,10 +754,7 @@ def test_rowspan_at_end_of_row(self): """, header=0)[0] - expected = DataFrame(data={ - 'A': ['C'], - 'B': ['B'] - }) + expected = DataFrame(data=[['C', 'B']], columns=['A', 'B']) tm.assert_frame_equal(result, expected) @@ -788,14 +770,12 @@ def test_rowspan_only_rows(self): """, header=0)[0] - expected = DataFrame(data={ - 'A': ['A', 'A'], - 'B': ['B', 'B'], - }) + expected = DataFrame(data=[['A', 'B'], ['A', 'B']], + columns=['A', 'B']) tm.assert_frame_equal(result, expected) - def test_header_inferred_from_th_elements(self): + def test_header_inferred_from_rows_with_only_th(self): # GH17054 result = self.read_html(""" @@ -814,10 +794,9 @@ def test_header_inferred_from_th_elements(self):
""")[0] - expected = DataFrame(data={ - ('A', 'a'): [1], - ('B', 'b'): [2], - }) + columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']], + labels=[[0, 1], [0, 1]]) + expected = DataFrame(data=[[1, 2]], columns=columns) tm.assert_frame_equal(result, expected) @@ -856,6 +835,23 @@ def test_wikipedia_states_table(self, datapath): result = self.read_html(data, 'Arizona', header=1)[0] assert result['sq mi'].dtype == np.dtype('float64') + def test_parser_error_on_empty_header_row(self): + with tm.assert_raises_regex(ParserError, + r"Passed header=\[0,1\] are " + r"too many rows for this " + r"multi_index of columns"): + self.read_html(""" + + + + + + + + +
AB
ab
+ """, header=[0, 1]) + def test_decimal_rows(self): # GH 12907 result = self.read_html(''' @@ -960,6 +956,49 @@ def test_keep_default_na(self): html_df = self.read_html(html_data, keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) + def test_preserve_empty_rows(self): + result = self.read_html(""" + + + + + + + + + + + + + +
AB
ab
+ """)[0] + + expected = DataFrame(data=[['a', 'b'], [np.nan, np.nan]], + columns=['A', 'B']) + + tm.assert_frame_equal(result, expected) + + def test_ignore_empty_rows_when_inferring_header(self): + result = self.read_html(""" + + + + + + + + + +
AB
ab
12
+ """)[0] + + columns = MultiIndex(levels=[['A', 'B'], ['a', 'b']], + labels=[[0, 1], [0, 1]]) + expected = DataFrame(data=[[1, 2]], columns=columns) + + tm.assert_frame_equal(result, expected) + def test_multiple_header_rows(self): # Issue #13434 expected_df = DataFrame(data=[("Hillary", 68, "D"), From d4f4bb1ade73278d97daf1fee08575227dbd8fb1 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Tue, 3 Jul 2018 17:59:09 -0400 Subject: [PATCH 09/12] Comments: Notes after Returns --- pandas/io/html.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 18588123fdae8..45fe3b017e4f6 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -379,6 +379,14 @@ def _parse_thead_tbody_tfoot(self, table_html): """ Given a table, return parsed header, body, and foot. + Parameters + ---------- + table_html : node-like + + Returns + ------- + tuple of (header, body, footer), each a list of list-of-text rows. + Notes ----- Header and body are lists-of-lists. Top level list is a list of @@ -391,14 +399,6 @@ def _parse_thead_tbody_tfoot(self, table_html): all elements inside row are <th> - Move rows from bottom of body to footer only if all elements inside row are <th> - - Parameters - ---------- - table_html : node-like - - Returns - ------- - tuple of (header, body, footer), each a list of list-of-text rows. """ header_rows = self._parse_thead_tr(table_html) @@ -426,11 +426,6 @@ def _expand_colspan_rowspan(self, rows): """ Given a list of <tr>s, return a list of text rows. - Notes - ----- - Any cell with ``rowspan`` or ``colspan`` will have its contents copied - to subsequent cells. - Parameters ---------- rows : list of node-like @@ -440,6 +435,11 @@ def _expand_colspan_rowspan(self, rows): ------- list of list Each returned row is a list of str text. + + Notes + ----- + Any cell with ``rowspan`` or ``colspan`` will have its contents copied + to subsequent cells.
""" all_texts = [] # list of rows, each a list of str From e296bd1a8e1051c8078ea2e387470b0f6b795525 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Wed, 4 Jul 2018 13:36:12 -0400 Subject: [PATCH 10/12] Document read_html changes in whatsnew --- doc/source/whatsnew/v0.24.0.txt | 114 ++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 83100dc634661..5aac64eb7f976 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -168,6 +168,120 @@ Current Behavior: ... OverflowError: Trying to coerce negative values to unsigned integers +read_html Incompatibilities +--------------------------- + +:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. +Now it understands them, treating them as a sequence of cells with the same +value. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pd.read_html(""" + + + + + + + + + + + +
ABC
12
+ """) + Out [1]: + [ A B C + 0 1 2 NaN] + +Current Behavior: + +.. code-block:: ipython + + In [1]: pd.read_html(""" + + + + + + + + + + + +
ABC
12
+ """) + Out [1]: + [ A B C + 0 1 2 2] + +Calls that relied on the previous behavior will need to be changed. + +Also, :func:`read_html` previously ignored some ``<tr>`` elements when called +with ``header=`` or ``skiprows=`` on some unusual HTML tables. +(:issue:`21641`) + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pd.read_html(""" + + + + + + + + + + + + + + + +
ABC
123
+ """, header=2) + Out [1]: + [Empty DataFrame + Columns: [1, 2, 3] + Index: []] + +Current Behavior: + +.. code-block:: ipython + + In [1]: pd.read_html(""" + + + + + + + + + + + + + + + +
ABC
123
+ """, header=2) + Out [1]: + [ A B C + 0 1 2 3] + +Previously, the workaround was to write ``header=0`` instead of ``header=1`` +for this example table. Now, that workaround must be removed. This should not +affect many users, since most HTML tables do not have empty header rows. + - :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) - - From 95ce9934a4ca8a7b5504f1a68f14c0b60ea21c50 Mon Sep 17 00:00:00 2001 From: Adam Hooper Date: Wed, 4 Jul 2018 16:29:51 -0400 Subject: [PATCH 11/12] Improve whatsnew with ipython --- doc/source/whatsnew/v0.24.0.txt | 122 ++++++-------------------------- 1 file changed, 23 insertions(+), 99 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index bc33c466f3529..db3598ce2a181 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -174,119 +174,43 @@ Current Behavior: ... OverflowError: Trying to coerce negative values to unsigned integers -read_html Incompatibilities -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +read_html Enhancements +^^^^^^^^^^^^^^^^^^^^^^ :func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. -Now it understands them, treating them as a sequence of cells with the same +Now it understands them, treating them as sequences of cells with the same value. (:issue:`17054`) -Previous Behavior: - -.. code-block:: ipython - - In [1]: pd.read_html(""" - - - - - - - - - - - -
ABC
12
- """) - Out [1]: - [ A B C - 0 1 2 NaN] - -Current Behavior: - -.. code-block:: ipython +.. ipython:: python - In [1]: pd.read_html(""" - - - - - - - - - - - -
ABC
12
- """) - Out [1]: - [ A B C - 0 1 2 2] - -Calls that relied on the previous behavior will need to be changed. - -Also, :func:`read_html` previously ignored some ``<tr>`` elements when called -with ``header=`` or ``skiprows=`` on some unusual HTML tables. -(:issue:`21641`) + result = pd.read_html(""" + + + + + + + + + + + +
ABC
12
""") Previous Behavior: .. code-block:: ipython - In [1]: pd.read_html(""" - - - - - - - - - - - - - - - -
ABC
123
- """, header=2) - Out [1]: - [Empty DataFrame - Columns: [1, 2, 3] - Index: []] + In [13]: result + Out [13]: + [ A B C + 0 1 2 NaN] Current Behavior: -.. code-block:: ipython +.. ipython:: python - In [1]: pd.read_html(""" - - - - - - - - - - - - - - - -
ABC
123
- """, header=2) - Out [1]: - [ A B C - 0 1 2 3] - -Previously, the workaround was to write ``header=0`` instead of ``header=1`` -for this example table. Now, that workaround must be removed. This should not -affect many users, since most HTML tables do not have empty header rows. + result Datetimelike API Changes ^^^^^^^^^^^^^^^^^^^^^^^^ From 5fd863bb3611093aefcda7e0f16573d77a3190d4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 5 Jul 2018 12:40:28 -0500 Subject: [PATCH 12/12] fixup whatsnew --- doc/source/whatsnew/v0.24.0.txt | 80 +++++++++++++++++---------------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index db3598ce2a181..d0b8f00150099 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -10,7 +10,7 @@ New features - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) -.. _whatsnew_0240.enhancements.extension_array_operators +.. _whatsnew_0240.enhancements.extension_array_operators: ``ExtensionArray`` operator support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -26,6 +26,46 @@ See the :ref:`ExtensionArray Operator Support ` documentation section for details on both ways of adding operator support. +.. _whatsnew_0240.enhancements.read_html: + +``read_html`` Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. +Now it understands them, treating them as sequences of cells with the same +value. (:issue:`17054`) + +.. ipython:: python + + result = pd.read_html(""" + + + + + + + + + + + +
ABC
12
""") + +Previous Behavior: + +.. code-block:: ipython + + In [13]: result + Out [13]: + [ A B C + 0 1 2 NaN] + +Current Behavior: + +.. ipython:: python + + result + .. _whatsnew_0240.enhancements.other: Other Enhancements @@ -174,44 +214,6 @@ Current Behavior: ... OverflowError: Trying to coerce negative values to unsigned integers -read_html Enhancements -^^^^^^^^^^^^^^^^^^^^^^ - -:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. -Now it understands them, treating them as sequences of cells with the same -value. (:issue:`17054`) - -.. ipython:: python - - result = pd.read_html(""" - - - - - - - - - - - -
ABC
12
""") - -Previous Behavior: - -.. code-block:: ipython - - In [13]: result - Out [13]: - [ A B C - 0 1 2 NaN] - -Current Behavior: - -.. ipython:: python - - result - Datetimelike API Changes ^^^^^^^^^^^^^^^^^^^^^^^^