From 0d4b418ddf1c8e4c00c63bbdd719781a7f507052 Mon Sep 17 00:00:00 2001 From: Liam Date: Sat, 28 Oct 2017 18:22:00 -0500 Subject: [PATCH 1/4] io.html.read_html(): rewind seekable io objects when parsers fail If lxml reads to the end of a file and then raises an error, bs4/html5lib does not rewind the file before trying to parse it again, and raises `ValueError: No text parsed from document`. This patch fixes the issue by rewinding the file object whenever a parser fails. If the object is file-like but not seekable, we raise an error notifying the user and asking them to try a different flavor. --- pandas/io/html.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/io/html.py b/pandas/io/html.py index 6f98683a1bff1..e1636d8007345 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -742,6 +742,18 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs): try: tables = p.parse_tables() except Exception as caught: + # if `io` is an io-like object, check if it's seekable + # and try to rewind it before trying the next parser + if hasattr(io, 'seekable') and io.seekable(): + io.seek(0) + elif hasattr(io, 'seekable') and not io.seekable(): + # if we couldn't rewind it, let the user know + raise ValueError('The flavor {} failed to parse your input. ' + 'Since you passed a non-rewindable file ' + 'object, we can\'t rewind it to try ' + 'another parser. 
Try read_html() with a ' + 'different flavor.'.format(flav)) + retained = caught else: break From 9e813173336127dca19611f9d2d22c0c99b48e52 Mon Sep 17 00:00:00 2001 From: Liam Date: Sat, 28 Oct 2017 19:00:49 -0500 Subject: [PATCH 2/4] test: read_html() parse error interaction with unseekable IO --- pandas/tests/io/test_html.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 8dfae2733ef20..c481870d5edd0 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -967,3 +967,27 @@ def test_importcheck_thread_safety(): while helper_thread1.is_alive() or helper_thread2.is_alive(): pass assert None is helper_thread1.err is helper_thread2.err + + +def test_parse_failure_unseekable(): + # Issue #17975 + _skip_if_no('lxml') + + class UnseekableStringIO(StringIO): + def seekable(self): + return False + + good = UnseekableStringIO(''' +
spam
eggs
''') + bad = UnseekableStringIO(''' +
spameggs
''') + + assert read_html(good) + assert read_html(bad, flavor='bs4') + + bad.seek(0) + + with pytest.raises(ValueError, + match='passed a non-rewindable file object'): + read_html(bad) + From 58b46474f7fa2afa31c4d0a7dbf33c52c1758d4c Mon Sep 17 00:00:00 2001 From: Liam Date: Sat, 28 Oct 2017 19:55:02 -0500 Subject: [PATCH 3/4] test: read_html() rewinding after parse errors --- pandas/tests/io/test_html.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index c481870d5edd0..399cac905967e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -991,3 +991,29 @@ def seekable(self): match='passed a non-rewindable file object'): read_html(bad) + +def test_parse_failure_rewinds(): + # Issue #17975 + _skip_if_no('lxml') + + class MockFile(object): + def __init__(self, data): + self.data = data + self.at_end = False + + def read(self, size=None): + data = '' if self.at_end else self.data + self.at_end = True + return data + + def seek(self, offset): + self.at_end = False + + def seekable(self): + return True + + good = MockFile('
spam
eggs
') + bad = MockFile('
spameggs
') + + assert read_html(good) + assert read_html(bad) From a85cd4217cc18e5e684ae9e8f75a2acaa948e739 Mon Sep 17 00:00:00 2001 From: Liam Date: Sat, 28 Oct 2017 20:07:08 -0500 Subject: [PATCH 4/4] doc: whatsnew for read_html() rewinding fixes --- doc/source/whatsnew/v0.22.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index cbd094ec4ef49..1a08d48f3948b 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -103,7 +103,7 @@ Indexing I/O ^^^ -- +- :func:`read_html` now rewinds seekable IO objects after a parse failure, before attempting to parse with a new parser. If a parser raises an error and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) - -