# Patch summary (text before the first `diff --git` header is descriptive only).
#
# Files touched by this patch:
#   * doc/source/whatsnew/v0.22.0.txt  - fills the empty I/O bullet with a
#     release note for `read_html` referencing issue 17975.
#   * pandas/io/html.py                - extends the `except Exception as caught:`
#     branch inside `_parse`.
#   * pandas/tests/io/test_html.py     - appends two tests after
#     `test_importcheck_thread_safety`.
#
# pandas/io/html.py, `_parse`:
#   When `p.parse_tables()` raises, the handler now inspects `io`:
#     - `io` has a `seekable` attribute and `io.seekable()` is true:
#       `io.seek(0)` is called, then the exception is stored in `retained`
#       exactly as before.
#     - `io` has a `seekable` attribute and `io.seekable()` is false:
#       a `ValueError` is raised whose message names the failing flavor
#       (`flav`) and suggests calling `read_html()` with a different flavor.
#     - `io` has no `seekable` attribute: neither branch runs and the
#       pre-existing `retained = caught` path is unchanged.
#
# pandas/tests/io/test_html.py:
#   * `test_parse_failure_unseekable` wraps its input in a `StringIO` subclass
#     whose `seekable()` returns False. It asserts that `read_html(good)` and
#     `read_html(bad, flavor='bs4')` are truthy, then calls `bad.seek(0)` and
#     expects `read_html(bad)` to raise `ValueError` matching
#     'passed a non-rewindable file object'.
#   * `test_parse_failure_rewinds` uses a `MockFile` whose `read()` returns its
#     data once and then '' until `seek()` resets the `at_end` flag, and whose
#     `seekable()` returns True. It asserts that `read_html` is truthy for both
#     the `good` and the `bad` input.
#   Both tests call `_skip_if_no('lxml')` first.
#
# NOTE(review): in this copy the patch's line structure looks collapsed (most of
#   the diff sits on one physical line) - confirm against the upstream patch
#   before trying to apply it.
# NOTE(review): the string literals passed to `UnseekableStringIO(...)` and
#   `MockFile(...)` contain only `spam` / `eggs` / `spameggs`; presumably the
#   original HTML markup was stripped during extraction - verify against the
#   upstream test file.
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index cbd094ec4ef49..1a08d48f3948b 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -103,7 +103,7 @@ Indexing I/O ^^^ -- +- :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) - - diff --git a/pandas/io/html.py b/pandas/io/html.py index 6f98683a1bff1..e1636d8007345 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -742,6 +742,18 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs): try: tables = p.parse_tables() except Exception as caught: + # if `io` is an io-like object, check if it's seekable + # and try to rewind it before trying the next parser + if hasattr(io, 'seekable') and io.seekable(): + io.seek(0) + elif hasattr(io, 'seekable') and not io.seekable(): + # if we couldn't rewind it, let the user know + raise ValueError('The flavor {} failed to parse your input. ' + 'Since you passed a non-rewindable file ' + 'object, we can\'t rewind it to try ' + 'another parser. Try read_html() with a ' + 'different flavor.'.format(flav)) + retained = caught else: break diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 8dfae2733ef20..399cac905967e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -967,3 +967,53 @@ def test_importcheck_thread_safety(): while helper_thread1.is_alive() or helper_thread2.is_alive(): pass assert None is helper_thread1.err is helper_thread2.err + + +def test_parse_failure_unseekable(): + # Issue #17975 + _skip_if_no('lxml') + + class UnseekableStringIO(StringIO): + def seekable(self): + return False + + good = UnseekableStringIO(''' +
spam
eggs
''') + bad = UnseekableStringIO(''' +
spameggs
''') + + assert read_html(good) + assert read_html(bad, flavor='bs4') + + bad.seek(0) + + with pytest.raises(ValueError, + match='passed a non-rewindable file object'): + read_html(bad) + + +def test_parse_failure_rewinds(): + # Issue #17975 + _skip_if_no('lxml') + + class MockFile(object): + def __init__(self, data): + self.data = data + self.at_end = False + + def read(self, size=None): + data = '' if self.at_end else self.data + self.at_end = True + return data + + def seek(self, offset): + self.at_end = False + + def seekable(self): + return True + + good = MockFile('
spam
eggs
') + bad = MockFile('
spameggs
') + + assert read_html(good) + assert read_html(bad)